diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/cache.json b/cache.json
new file mode 100644
index 0000000..26034a7
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-04-08T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05729v1","updated":"2024-04-08T17:59:46Z","published":"2024-04-08T17:59:46Z","title":"Finding Visual Task Vectors","summary":" Visual Prompting is a technique for teaching models to perform a visual task\nvia in-context examples, without any additional training. In this work, we\nanalyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find\ntask vectors, activations that encode task-specific information. Equipped with\nthis insight, we demonstrate that it is possible to identify the task vectors\nand use them to guide the network towards performing different tasks without\nproviding any input-output examples. To find task vectors, we compute the\naverage intermediate activations per task and use the REINFORCE algorithm to\nsearch for the subset of task vectors. The resulting task vectors guide the\nmodel towards performing a task better than the original model without the need\nfor input-output examples.\n","authors":["Alberto Hojel","Yutong Bai","Trevor Darrell","Amir Globerson","Amir Bar"],"pdf_url":"https://arxiv.org/pdf/2404.05729v1.pdf","comment":"https://github.com/alhojel/visual_task_vectors"},{"id":"http://arxiv.org/abs/2404.05726v1","updated":"2024-04-08T17:59:24Z","published":"2024-04-08T17:59:24Z","title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video\n Understanding","summary":" With the success of large language models (LLMs), integrating the vision\nmodel into LLMs to build vision-language foundation models has gained much more\ninterest recently. However, existing LLM-based large multimodal models (e.g.,\nVideo-LLaMA, VideoChat) can only take in a limited number of frames for short\nvideo understanding. In this study, we mainly focus on designing an efficient\nand effective model for long-term video understanding. Instead of trying to\nprocess more frames simultaneously like most existing work, we propose to\nprocess videos in an online manner and store past video information in a memory\nbank. This allows our model to reference historical video content for long-term\nanalysis without exceeding LLMs' context length constraints or GPU memory\nlimits. Our memory bank can be seamlessly integrated into current multimodal\nLLMs in an off-the-shelf manner. We conduct extensive experiments on various\nvideo understanding tasks, such as long-video understanding, video question\nanswering, and video captioning, and our model can achieve state-of-the-art\nperformances across multiple datasets. Code available at\nhttps://boheumd.github.io/MA-LMM/.\n","authors":["Bo He","Hengduo Li","Young Kyun Jang","Menglin Jia","Xuefei Cao","Ashish Shah","Abhinav Shrivastava","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.05726v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05719v1","updated":"2024-04-08T17:55:44Z","published":"2024-04-08T17:55:44Z","title":"Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have been\nnoteworthy, yet, these general-domain MLLMs often fall short in their ability\nto comprehend and interact effectively with user interface (UI) screens. 
In\nthis paper, we present Ferret-UI, a new MLLM tailored for enhanced\nunderstanding of mobile UI screens, equipped with referring, grounding, and\nreasoning capabilities. Given that UI screens typically exhibit a more\nelongated aspect ratio and contain smaller objects of interest (e.g., icons,\ntexts) than natural images, we incorporate \"any resolution\" on top of Ferret to\nmagnify details and leverage enhanced visual features. Specifically, each\nscreen is divided into 2 sub-images based on the original aspect ratio (i.e.,\nhorizontal division for portrait screens and vertical division for landscape\nscreens). Both sub-images are encoded separately before being sent to LLMs. We\nmeticulously gather training samples from an extensive range of elementary UI\ntasks, such as icon recognition, find text, and widget listing. These samples\nare formatted for instruction-following with region annotations to facilitate\nprecise referring and grounding. To augment the model's reasoning ability, we\nfurther compile a dataset for advanced tasks, including detailed description,\nperception/interaction conversations, and function inference. After training on\nthe curated datasets, Ferret-UI exhibits outstanding comprehension of UI\nscreens and the capability to execute open-ended instructions. For model\nevaluation, we establish a comprehensive benchmark encompassing all the\naforementioned tasks. Ferret-UI excels not only beyond most open-source UI\nMLLMs, but also surpasses GPT-4V on all the elementary UI tasks.\n","authors":["Keen You","Haotian Zhang","Eldon Schoop","Floris Weers","Amanda Swearngin","Jeffrey Nichols","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2404.05719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05717v1","updated":"2024-04-08T17:52:29Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. 
SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Yilin Wang","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v1.pdf","comment":"18 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.04071v4","updated":"2024-04-08T17:49:58Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves\ncompetitive performance over single-step non-adversarial generation. Our code\nis available at https://github.com/DJ-LYH/EC-VAE.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v4.pdf","comment":"Revision. Code is available at https://github.com/DJ-LYH/EC-VAE"},{"id":"http://arxiv.org/abs/2404.05705v1","updated":"2024-04-08T17:42:08Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. 
Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v1.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2404.05693v1","updated":"2024-04-08T17:18:30Z","published":"2024-04-08T17:18:30Z","title":"Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic\n Segmentation for Satellite Imagery","summary":" Satellite imagery is crucial for tasks like environmental monitoring and\nurban planning. Typically, it relies on semantic segmentation or Land Use Land\nCover (LULC) classification to categorize each pixel. Despite the advancements\nbrought about by Deep Neural Networks (DNNs), their performance in segmentation\ntasks is hindered by challenges such as limited availability of labeled data,\nclass imbalance and the inherent variability and complexity of satellite\nimages. In order to mitigate those issues, our study explores the effectiveness\nof a Cut-and-Paste augmentation technique for semantic segmentation in\nsatellite images. We adapt this augmentation, which usually requires labeled\ninstances, to the case of semantic segmentation. By leveraging the connected\ncomponents in the semantic segmentation labels, we extract instances that are\nthen randomly pasted during training. Using the DynamicEarthNet dataset and a\nU-Net model for evaluation, we found that this augmentation significantly\nenhances the mIoU score on the test set from 37.9 to 44.1. This finding\nhighlights the potential of the Cut-and-Paste augmentation to improve the\ngeneralization capabilities of semantic segmentation models in satellite\nimagery.\n","authors":["Ionut M. Motoi","Leonardo Saraceni","Daniele Nardi","Thomas A. Ciarfuglia"],"pdf_url":"https://arxiv.org/pdf/2404.05693v1.pdf","comment":"Accepted for publication in IEEE 2024 International Geoscience &\n Remote Sensing Symposium (IGARSS 2024)"},{"id":"http://arxiv.org/abs/2404.05687v1","updated":"2024-04-08T17:10:45Z","published":"2024-04-08T17:10:45Z","title":"Retrieval-Augmented Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVD) has been studied with Vision-Language\nModels (VLMs) to detect novel objects beyond the pre-trained categories.\nPrevious approaches improve the generalization ability to expand the knowledge\nof the detector, using 'positive' pseudo-labels with additional 'class' names,\ne.g., sock, iPod, and alligator. To extend the previous methods in two aspects,\nwe propose Retrieval-Augmented Losses and visual Features (RALF). Our method\nretrieves related 'negative' classes and augments loss functions. Also, visual\nfeatures are augmented with 'verbalized concepts' of classes, e.g., worn on the\nfeet, handheld music player, and sharp teeth. Specifically, RALF consists of\ntwo modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual\nFeatures (RAF). RAL constitutes two losses reflecting the semantic similarity\nwith negative vocabularies. In addition, RAF augments visual features with the\nverbalized concepts from a large language model (LLM). Our experiments\ndemonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. 
We\nachieve improvement up to 3.4 box AP$_{50}^{\\text{N}}$ on novel categories of\nthe COCO dataset and 3.6 mask AP$_{\\text{r}}$ gains on the LVIS dataset. Code\nis available at https://github.com/mlvlab/RALF .\n","authors":["Jooyeon Kim","Eulrang Cho","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05687v1.pdf","comment":"Accepted paper at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05680v1","updated":"2024-04-08T16:58:31Z","published":"2024-04-08T16:58:31Z","title":"SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane\n Representation","summary":" While recent advances in 3D-aware Generative Adversarial Networks (GANs) have\naided the development of near-frontal view human face synthesis, the challenge\nof comprehensively synthesizing a full 3D head viewable from all angles still\npersists. Although PanoHead proves the possibilities of using a large-scale\ndataset with images of both frontal and back views for full-head synthesis, it\noften causes artifacts for back views. Based on our in-depth analysis, we found\nthe reasons are mainly twofold. First, from network architecture perspective,\nwe found each plane in the utilized tri-plane/tri-grid representation space\ntends to confuse the features from both sides, causing \"mirroring\" artifacts\n(e.g., the glasses appear in the back). Second, from data supervision aspect,\nwe found that existing discriminator training in 3D GANs mainly focuses on the\nquality of the rendered image itself, and does not care much about its\nplausibility with the perspective from which it was rendered. This makes it\npossible to generate \"face\" in non-frontal views, due to its easiness to fool\nthe discriminator. In response, we propose SphereHead, a novel tri-plane\nrepresentation in the spherical coordinate system that fits the human head's\ngeometric characteristics and efficiently mitigates many of the generated\nartifacts. We further introduce a view-image consistency loss for the\ndiscriminator to emphasize the correspondence of the camera parameters and the\nimages. The combination of these efforts results in visually superior outcomes\nwith significantly fewer artifacts. Our code and dataset are publicly available\nat https://lhyfst.github.io/spherehead.\n","authors":["Heyuan Li","Ce Chen","Tianhao Shi","Yuda Qiu","Sizhe An","Guanying Chen","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.05680v1.pdf","comment":"project page: https://lhyfst.github.io/spherehead"},{"id":"http://arxiv.org/abs/2312.07425v2","updated":"2024-04-08T16:56:17Z","published":"2023-12-12T16:48:53Z","title":"Deep Internal Learning: Deep Learning from a Single Input","summary":" Deep learning, in general, focuses on training a neural network from large\nlabeled datasets. Yet, in many cases there is value in training a network just\nfrom the input at hand. This is particularly relevant in many signal and image\nprocessing problems where training data is scarce and diversity is large on the\none hand, and on the other, there is a lot of structure in the data that can be\nexploited. Using this information is the key to deep internal-learning\nstrategies, which may involve training a network from scratch using a single\ninput or adapting an already trained network to a provided input example at\ninference time. This survey paper aims at covering deep internal-learning\ntechniques that have been proposed in the past few years for these two\nimportant directions. 
While our main focus will be on image processing\nproblems, most of the approaches that we survey are derived for general signals\n(vectors with recurring patterns that can be distinguished from noise) and are\ntherefore applicable to other modalities.\n","authors":["Tom Tirer","Raja Giryes","Se Young Chun","Yonina C. Eldar"],"pdf_url":"https://arxiv.org/pdf/2312.07425v2.pdf","comment":"Accepted to IEEE Signal Processing Magazine"},{"id":"http://arxiv.org/abs/2404.05675v1","updated":"2024-04-08T16:56:05Z","published":"2024-04-08T16:56:05Z","title":"Normalizing Flows on the Product Space of SO(3) Manifolds for\n Probabilistic Human Pose Modeling","summary":" Normalizing flows have proven their efficacy for density estimation in\nEuclidean space, but their application to rotational representations, crucial\nin various domains such as robotics or human pose modeling, remains\nunderexplored. Probabilistic models of the human pose can benefit from\napproaches that rigorously consider the rotational nature of human joints. For\nthis purpose, we introduce HuProSO3, a normalizing flow model that operates on\na high-dimensional product space of SO(3) manifolds, modeling the joint\ndistribution for human joints with three degrees of freedom. HuProSO3's\nadvantage over state-of-the-art approaches is demonstrated through its superior\nmodeling accuracy in three different applications and its capability to\nevaluate the exact likelihood. This work not only addresses the technical\nchallenge of learning densities on SO(3) manifolds, but it also has broader\nimplications for domains where the probabilistic regression of correlated 3D\nrotations is of importance.\n","authors":["Olaf Dünkel","Tim Salzmann","Florian Pfaff"],"pdf_url":"https://arxiv.org/pdf/2404.05675v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05674v1","updated":"2024-04-08T16:55:49Z","published":"2024-04-08T16:55:49Z","title":"MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation","summary":" In this paper, we present MoMA: an open-vocabulary, training-free\npersonalized image model that boasts flexible zero-shot capabilities. As\nfoundational text-to-image models rapidly evolve, the demand for robust\nimage-to-image translation grows. Addressing this need, MoMA specializes in\nsubject-driven personalized image generation. Utilizing an open-source,\nMultimodal Large Language Model (MLLM), we train MoMA to serve a dual role as\nboth a feature extractor and a generator. This approach effectively synergizes\nreference image and text prompt information to produce valuable image features,\nfacilitating an image diffusion model. To better leverage the generated\nfeatures, we further introduce a novel self-attention shortcut method that\nefficiently transfers image features to an image diffusion model, improving the\nresemblance of the target object in generated images. Remarkably, as a\ntuning-free plug-and-play module, our model requires only a single reference\nimage and outperforms existing methods in generating images with high detail\nfidelity, enhanced identity-preservation and prompt faithfulness. 
Our work is\nopen-source, thereby providing universal access to these advancements.\n","authors":["Kunpeng Song","Yizhe Zhu","Bingchen Liu","Qing Yan","Ahmed Elgammal","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05673v1","updated":"2024-04-08T16:55:39Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. The code will be\nreleased at https://github.com/baoxiaoyi/CoReS.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05669v1","updated":"2024-04-08T16:52:21Z","published":"2024-04-08T16:52:21Z","title":"NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for\n Document Enhancement","summary":" Real-world documents may suffer various forms of degradation, often resulting\nin lower accuracy in optical character recognition (OCR) systems. Therefore, a\ncrucial preprocessing step is essential to eliminate noise while preserving\ntext and key features of documents. In this paper, we propose NAF-DPM, a novel\ngenerative framework based on a diffusion probabilistic model (DPM) designed to\nrestore the original quality of degraded documents. While DPMs are recognized\nfor their high-quality generated images, they are also known for their large\ninference time. To mitigate this problem we provide the DPM with an efficient\nnonlinear activation-free (NAF) network and we employ as a sampler a fast\nsolver of ordinary differential equations, which can converge in a few\niterations. To better preserve text characters, we introduce an additional\ndifferentiable module based on convolutional recurrent neural networks,\nsimulating the behavior of an OCR system during training. Experiments conducted\non various datasets showcase the superiority of our approach, achieving\nstate-of-the-art performance in terms of pixel-level and perceptual similarity\nmetrics. Furthermore, the results demonstrate a notable character error\nreduction made by OCR systems when transcribing real-world document images\nenhanced by our framework. 
Code and pre-trained models are available at\nhttps://github.com/ispamm/NAF-DPM.\n","authors":["Giordano Cicchetti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2404.05669v1.pdf","comment":"Under review at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2404.05667v1","updated":"2024-04-08T16:51:33Z","published":"2024-04-08T16:51:33Z","title":"AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic\n Segmentation","summary":" A serious issue that harms the performance of zero-shot visual recognition is\nnamed objective misalignment, i.e., the learning objective prioritizes\nimproving the recognition accuracy of seen classes rather than unseen classes,\nwhile the latter is the true target to pursue. This issue becomes more\nsignificant in zero-shot image segmentation because the stronger (i.e.,\npixel-level) supervision brings a larger gap between seen and unseen classes.\nTo mitigate it, we propose a novel architecture named AlignZeg, which embodies\na comprehensive improvement of the segmentation pipeline, including proposal\nextraction, classification, and correction, to better fit the goal of zero-shot\nsegmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a\nmutual interaction between mask queries and visual features, facilitating\ndetailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced\nProposal Classification. AlignZeg introduces synthetic data and incorporates\nmultiple background prototypes to allocate a more generalizable feature space.\n(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a\nclass indicator to find potential unseen class proposals followed by a\nprediction postprocess to correct the prediction bias. Experiments demonstrate\nthat AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an\naverage 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in\nidentifying unseen classes, and we further validate that the improvement comes\nfrom alleviating the objective misalignment issue.\n","authors":["Jiannan Ge","Lingxi Xie","Hongtao Xie","Pandeng Li","Xiaopeng Zhang","Yongdong Zhang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05666v1","updated":"2024-04-08T16:51:19Z","published":"2024-04-08T16:51:19Z","title":"YaART: Yet Another ART Rendering Technology","summary":" In the rapidly progressing field of generative models, the development of\nefficient and high-fidelity text-to-image diffusion systems represents a\nsignificant frontier. This study introduces YaART, a novel production-grade\ntext-to-image cascaded diffusion model aligned to human preferences using\nReinforcement Learning from Human Feedback (RLHF). During the development of\nYaART, we especially focus on the choices of the model and training dataset\nsizes, the aspects that were not systematically investigated for text-to-image\ncascaded diffusion models before. 
In particular, we comprehensively analyze how\nthese choices affect both the efficiency of the training process and the\nquality of the generated images, which are highly important in practice.\nFurthermore, we demonstrate that models trained on smaller datasets of\nhigher-quality images can successfully compete with those trained on larger\ndatasets, establishing a more efficient scenario of diffusion models training.\nFrom the quality perspective, YaART is consistently preferred by users over\nmany existing state-of-the-art models.\n","authors":["Sergey Kastryulin","Artem Konev","Alexander Shishenya","Eugene Lyapustin","Artem Khurshudov","Alexander Tselousov","Nikita Vinokurov","Denis Kuznedelev","Alexander Markovich","Grigoriy Livshits","Alexey Kirillov","Anastasiia Tabisheva","Liubov Chubarova","Marina Kaminskaia","Alexander Ustyuzhanin","Artemii Shvetsov","Daniil Shlenskii","Valerii Startsev","Dmitrii Kornilov","Mikhail Romanov","Artem Babenko","Sergei Ovcharenko","Valentin Khrulkov"],"pdf_url":"https://arxiv.org/pdf/2404.05666v1.pdf","comment":"Prompts and additional information are available on the project page,\n see https://ya.ru/ai/art/paper-yaart-v1"},{"id":"http://arxiv.org/abs/2404.05662v1","updated":"2024-04-08T16:46:25Z","published":"2024-04-08T16:46:25Z","title":"BinaryDM: Towards Accurate Binarization of Diffusion Model","summary":" With the advancement of diffusion models (DMs) and the substantially\nincreased computational requirements, quantization emerges as a practical\nsolution to obtain compact and efficient low-bit DMs. However, the highly\ndiscrete representation leads to severe accuracy degradation, hindering the\nquantization of diffusion models to ultra-low bit-widths. In this paper, we\npropose BinaryDM, a novel accurate quantization-aware training approach to push\nthe weights of diffusion models towards the limit of 1-bit. Firstly, we present\na Learnable Multi-basis Binarizer (LMB) to recover the representations\ngenerated by the binarized DM, which improves the information in details of\nrepresentations crucial to the DM. Secondly, a Low-rank Representation\nMimicking (LRM) is applied to enhance the binarization-aware optimization of\nthe DM, alleviating the optimization direction ambiguity caused by fine-grained\nalignment. Moreover, a progressive initialization strategy is applied to\ntraining DMs to avoid convergence difficulties. Comprehensive experiments\ndemonstrate that BinaryDM achieves significant accuracy and efficiency gains\ncompared to SOTA quantization methods of DMs under ultra-low bit-widths. As the\nfirst binarization method for diffusion models, BinaryDM achieves impressive\n16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit\nactivation, showcasing its substantial advantages and potential for deploying\nDMs on resource-limited scenarios.\n","authors":["Xingyu Zheng","Haotong Qin","Xudong Ma","Mingyuan Zhang","Haojie Hao","Jiakai Wang","Zixiang Zhao","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05662v1.pdf","comment":"The code will soon be available at\n https://github.com/Xingyu-Zheng/BinaryDM"},{"id":"http://arxiv.org/abs/2404.05661v1","updated":"2024-04-08T16:46:07Z","published":"2024-04-08T16:46:07Z","title":"Automatic Controllable Colorization via Imagination","summary":" We propose a framework for automatic colorization that allows for iterative\nediting and modifications. 
The core of our framework lies in an imagination\nmodule: by understanding the content within a grayscale image, we utilize a\npre-trained image generation model to generate multiple images that contain the\nsame content. These images serve as references for coloring, mimicking the\nprocess of human experts. As the synthesized images can be imperfect or\ndifferent from the original grayscale image, we propose a Reference Refinement\nModule to select the optimal reference composition. Unlike most previous\nend-to-end automatic colorization algorithms, our framework allows for\niterative and localized modifications of the colorization results because we\nexplicitly model the coloring samples. Extensive experiments demonstrate the\nsuperiority of our framework over existing automatic colorization algorithms in\neditability and flexibility. Project page:\nhttps://xy-cong.github.io/imagine-colorization.\n","authors":["Xiaoyan Cong","Yue Wu","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.05661v1.pdf","comment":"CVPR 2024. Project page:\n https://xy-cong.github.io/imagine-colorization"},{"id":"http://arxiv.org/abs/2404.05657v1","updated":"2024-04-08T16:40:15Z","published":"2024-04-08T16:40:15Z","title":"MLP Can Be A Good Transformer Learner","summary":" Self-attention mechanism is the key of the Transformer but often criticized\nfor its computation demands. Previous token pruning works motivate their\nmethods from the view of computation redundancy but still need to load the full\nnetwork and require same memory costs. This paper introduces a novel strategy\nthat simplifies vision transformers and reduces computational load through the\nselective removal of non-essential attention layers, guided by entropy\nconsiderations. We identify that regarding the attention layer in bottom\nblocks, their subsequent MLP layers, i.e. two feed-forward layers, can elicit\nthe same entropy quantity. Meanwhile, the accompanied MLPs are under-exploited\nsince they exhibit smaller feature entropy compared to those MLPs in the top\nblocks. Therefore, we propose to integrate the uninformative attention layers\ninto their subsequent counterparts by degenerating them into identical mapping,\nyielding only MLP in certain transformer blocks. Experimental results on\nImageNet-1k show that the proposed method can remove 40% attention layer of\nDeiT-B, improving throughput and memory bound without performance compromise.\nCode is available at https://github.com/sihaoevery/lambda_vit.\n","authors":["Sihao Lin","Pumeng Lyu","Dongrui Liu","Tao Tang","Xiaodan Liang","Andy Song","Xiaojun Chang"],"pdf_url":"https://arxiv.org/pdf/2404.05657v1.pdf","comment":"efficient transformer"},{"id":"http://arxiv.org/abs/2404.05641v1","updated":"2024-04-08T16:21:22Z","published":"2024-04-08T16:21:22Z","title":"3D-COCO: extension of MS-COCO dataset for image detection and 3D\n reconstruction modules","summary":" We introduce 3D-COCO, an extension of the original MS-COCO dataset providing\n3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve\ncomputer vision tasks such as 3D reconstruction or image detection configurable\nwith textual, 2D image, and 3D CAD model queries. We complete the existing\nMS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By\nusing an IoU-based method, we match each MS-COCO annotation with the best 3D\nmodels to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a\npremiere that should pave the way for new research on 3D-related topics. 
The\ndataset and its source codes is available at\nhttps://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/\n","authors":["Maxence Bideaux","Alice Phe","Mohamed Chaouch","Bertrand Luvison","Quoc-Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2404.05641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06908v2","updated":"2024-04-08T16:16:56Z","published":"2024-03-11T17:00:27Z","title":"FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization","summary":" 3D Gaussian splatting has achieved very impressive performance in real-time\nnovel view synthesis. However, it often suffers from over-reconstruction during\nGaussian densification where high-variance image regions are covered by a few\nlarge Gaussians only, leading to blur and artifacts in the rendered images. We\ndesign a progressive frequency regularization (FreGS) technique to tackle the\nover-reconstruction issue within the frequency space. Specifically, FreGS\nperforms coarse-to-fine Gaussian densification by exploiting low-to-high\nfrequency components that can be easily extracted with low-pass and high-pass\nfilters in the Fourier space. By minimizing the discrepancy between the\nfrequency spectrum of the rendered image and the corresponding ground truth, it\nachieves high-quality Gaussian densification and alleviates the\nover-reconstruction of Gaussian splatting effectively. Experiments over\nmultiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and\nDeep Blending) show that FreGS achieves superior novel view synthesis and\noutperforms the state-of-the-art consistently.\n","authors":["Jiahui Zhang","Fangneng Zhan","Muyu Xu","Shijian Lu","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.06908v2.pdf","comment":"Accepted by CVPR 2024. Project website:\n https://rogeraigc.github.io/FreGS-Page/"},{"id":"http://arxiv.org/abs/2403.15238v2","updated":"2024-04-08T16:14:45Z","published":"2024-03-22T14:32:02Z","title":"WEEP: A method for spatial interpretation of weakly supervised CNN\n models in computational pathology","summary":" Deep learning enables the modelling of high-resolution histopathology\nwhole-slide images (WSI). Weakly supervised learning of tile-level data is\ntypically applied for tasks where labels only exist on the patient or WSI level\n(e.g. patient outcomes or histological grading). In this context, there is a\nneed for improved spatial interpretability of predictions from such models. We\npropose a novel method, Wsi rEgion sElection aPproach (WEEP), for model\ninterpretation. It provides a principled yet straightforward way to establish\nthe spatial area of WSI required for assigning a particular prediction label.\nWe demonstrate WEEP on a binary classification task in the area of breast\ncancer computational pathology. WEEP is easy to implement, is directly\nconnected to the model-based decision process, and offers information relevant\nto both research and diagnostic applications.\n","authors":["Abhinav Sharma","Bojing Liu","Mattias Rantalainen"],"pdf_url":"https://arxiv.org/pdf/2403.15238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05626v1","updated":"2024-04-08T15:59:29Z","published":"2024-04-08T15:59:29Z","title":"Learning a Category-level Object Pose Estimator without Pose Annotations","summary":" 3D object pose estimation is a challenging task. Previous works always\nrequire thousands of object images with annotated poses for learning the 3D\npose correspondence, which is laborious and time-consuming for labeling. 
In\nthis paper, we propose to learn a category-level 3D object pose estimator\nwithout pose annotations. Instead of using manually annotated images, we\nleverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under\ncontrolled pose differences and propose to learn our object pose estimator with\nthose images. Directly using the original diffusion model leads to images with\nnoisy poses and artifacts. To tackle this issue, firstly, we exploit an image\nencoder, which is learned from a specially designed contrastive pose learning,\nto filter the unreasonable details and extract image feature maps.\nAdditionally, we propose a novel learning strategy that allows the model to\nlearn object poses from those generated image sets without knowing the\nalignment of their canonical poses. Experimental results show that our method\nhas the capability of category-level object pose estimation from a single shot\nsetting (as pose definition), while significantly outperforming other\nstate-of-the-art methods on the few-shot category-level object pose estimation\nbenchmarks.\n","authors":["Fengrui Tian","Yaoyao Liu","Adam Kortylewski","Yueqi Duan","Shaoyi Du","Alan Yuille","Angtian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05321v3","updated":"2024-04-08T15:59:11Z","published":"2022-09-12T15:26:13Z","title":"Deep Feature Statistics Mapping for Generalized Screen Content Image\n Quality Assessment","summary":" The statistical regularities of natural images, referred to as natural scene\nstatistics, play an important role in no-reference image quality assessment.\nHowever, it has been widely acknowledged that screen content images (SCIs),\nwhich are typically computer generated, do not hold such statistics. Here we\nmake the first attempt to learn the statistics of SCIs, based upon which the\nquality of SCIs can be effectively determined. The underlying mechanism of the\nproposed approach is based upon the mild assumption that the SCIs, which are\nnot physically acquired, still obey certain statistics that could be understood\nin a learning fashion. We empirically show that the statistics deviation could\nbe effectively leveraged in quality assessment, and the proposed method is\nsuperior when evaluated in different settings. Extensive experimental results\ndemonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA)\nmodel delivers promising performance compared with existing NR-IQA models and\nshows a high generalization capability in the cross-dataset settings. The\nimplementation of our method is publicly available at\nhttps://github.com/Baoliang93/DFSS-IQA.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2209.05321v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14466v2","updated":"2024-04-08T15:51:37Z","published":"2022-07-29T04:10:22Z","title":"Towards Domain-agnostic Depth Completion","summary":" Existing depth completion methods are often targeted at a specific sparse\ndepth type and generalize poorly across task domains. We present a method to\ncomplete sparse/semi-dense, noisy, and potentially low-resolution depth maps\nobtained by various range sensors, including those in modern mobile phones, or\nby multi-view reconstruction algorithms. Our method leverages a data-driven\nprior in the form of a single image depth prediction network trained on\nlarge-scale datasets, the output of which is used as an input to our model. 
We\npropose an effective training scheme where we simulate various sparsity\npatterns in typical task domains. In addition, we design two new benchmarks to\nevaluate the generalizability and the robustness of depth completion methods.\nOur simple method shows superior cross-domain generalization ability against\nstate-of-the-art depth completion methods, introducing a practical solution to\nhigh-quality depth capture on a mobile device. The code is available at:\nhttps://github.com/YvanYin/FillDepth.\n","authors":["Guangkai Xu","Wei Yin","Jianming Zhang","Oliver Wang","Simon Niklaus","Simon Chen","Jia-Wang Bian"],"pdf_url":"https://arxiv.org/pdf/2207.14466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05621v1","updated":"2024-04-08T15:51:21Z","published":"2024-04-08T15:51:21Z","title":"MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning","summary":" While excellent in transfer learning, Vision-Language models (VLMs) come with\nhigh computational costs due to their large number of parameters. To address\nthis issue, removing parameters via model pruning is a viable solution.\nHowever, existing techniques for VLMs are task-specific, and thus require\npruning the network from scratch for each new task of interest. In this work,\nwe explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP).\nGiven a pretrained VLM, the goal is to find a unique pruned counterpart\ntransferable to multiple unknown downstream tasks. In this challenging setting,\nthe transferable representations already encoded in the pretrained model are a\nkey aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a\nfirst, gradient-free, pruning framework for TA-VLP where: (i) the importance of\na parameter is expressed in terms of its magnitude and its information flow, by\nincorporating the saliency of the neurons it connects; and (ii) pruning is\ndriven by the emergent (multimodal) distribution of the VLM parameters after\npretraining. We benchmark eight state-of-the-art pruning algorithms in the\ncontext of TA-VLP, experimenting with two VLMs, three vision-language tasks,\nand three pruning ratios. Our experimental results show that MULTIFLOW\noutperforms recent sophisticated, combinatorial competitors in the vast\nmajority of the cases, paving the way towards addressing TA-VLP. The code is\npublicly available at https://github.com/FarinaMatteo/multiflow.\n","authors":["Matteo Farina","Massimiliano Mancini","Elia Cunegatti","Gaowen Liu","Giovanni Iacca","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05621v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2207.12080v4","updated":"2024-04-08T15:50:13Z","published":"2022-07-25T11:57:01Z","title":"Intention-Conditioned Long-Term Human Egocentric Action Forecasting","summary":" To anticipate how a human would act in the future, it is essential to\nunderstand the human intention since it guides the human towards a certain\ngoal. In this paper, we propose a hierarchical architecture which assumes a\nsequence of human action (low-level) can be driven from the human intention\n(high-level). Based on this, we deal with Long-Term Action Anticipation task in\negocentric videos. Our framework first extracts two level of human information\nover the N observed videos human actions through a Hierarchical Multi-task MLP\nMixer (H3M). 
Then, we condition the uncertainty of the future through an\nIntention-Conditioned Variational Auto-Encoder (I-CVAE) that generates K stable\npredictions of the next Z=20 actions that the observed human might perform. By\nleveraging human intention as high-level information, we claim that our model\nis able to anticipate more time-consistent actions in the long-term, thus\nimproving the results over baseline methods in EGO4D Challenge. This work\nranked first in both CVPR@2022 and ECVV@2022 EGO4D LTA Challenge by providing\nmore plausible anticipated sequences, improving the anticipation of nouns and\noverall actions. Webpage: https://evm7.github.io/icvae-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2207.12080v4.pdf","comment":"Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in\n WACV2023. Webpage: https://evm7.github.io/icvae-page/"},{"id":"http://arxiv.org/abs/2302.08274v3","updated":"2024-04-08T15:48:50Z","published":"2023-02-16T13:06:39Z","title":"Robust Human Motion Forecasting using Transformer-based Model","summary":" Comprehending human motion is a fundamental challenge for developing\nHuman-Robot Collaborative applications. Computer vision researchers have\naddressed this field by only focusing on reducing error in predictions, but not\ntaking into account the requirements to facilitate its implementation in\nrobots. In this paper, we propose a new model based on Transformer that\nsimultaneously deals with the real time 3D human motion forecasting in the\nshort and long term. Our 2-Channel Transformer (2CH-TR) is able to efficiently\nexploit the spatio-temporal information of a shortly observed sequence (400ms)\nand generates a competitive accuracy against the current state-of-the-art.\n2CH-TR stands out for the efficient performance of the Transformer, being\nlighter and faster than its competitors. In addition, our model is tested in\nconditions where the human motion is severely occluded, demonstrating its\nrobustness in reconstructing and predicting 3D human motion in a highly noisy\nenvironment. Our experiment results show that the proposed 2CH-TR outperforms\nthe ST-Transformer, which is another state-of-the-art model based on the\nTransformer, in terms of reconstruction and prediction under the same\nconditions of input prefix. Our model reduces in 8.89% the mean squared error\nof ST-Transformer in short-term prediction, and 2.57% in long-term prediction\nin Human3.6M dataset with 400ms input prefix. Webpage:\nhttps://evm7.github.io/2CHTR-page/\n","authors":["Esteve Valls Mascaro","Shuo Ma","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2302.08274v3.pdf","comment":"Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/"},{"id":"http://arxiv.org/abs/2308.07301v2","updated":"2024-04-08T15:47:20Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains comparable or better performance than the state-of-the-art in\neach field. 
Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://evm7.github.io/UNIMASKM-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v2.pdf","comment":"Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/"},{"id":"http://arxiv.org/abs/2309.16524v2","updated":"2024-04-08T15:46:09Z","published":"2023-09-28T15:34:49Z","title":"HOI4ABOT: Human-Object Interaction Anticipation for Human Intention\n Reading Collaborative roBOTs","summary":" Robots are becoming increasingly integrated into our lives, assisting us in\nvarious tasks. To ensure effective collaboration between humans and robots, it\nis essential that they understand our intentions and anticipate our actions. In\nthis paper, we propose a Human-Object Interaction (HOI) anticipation framework\nfor collaborative robots. We propose an efficient and robust transformer-based\nmodel to detect and anticipate HOIs from videos. This enhanced anticipation\nempowers robots to proactively assist humans, resulting in more efficient and\nintuitive collaborations. Our model outperforms state-of-the-art results in HOI\ndetection and anticipation in VidHOI dataset with an increase of 1.76% and\n1.04% in mAP respectively while being 15.4 times faster. We showcase the\neffectiveness of our approach through experimental results in a real robot,\ndemonstrating that the robot's ability to anticipate HOIs is key for better\nHuman-Robot Interaction. More information can be found on our project webpage:\nhttps://evm7.github.io/HOI4ABOT_page/\n","authors":["Esteve Valls Mascaro","Daniel Sliwowski","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.16524v2.pdf","comment":"Proceedings in Conference on Robot Learning 2023. Webpage:\n https://evm7.github.io/HOI4ABOT_page/"},{"id":"http://arxiv.org/abs/2402.04768v2","updated":"2024-04-08T15:43:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich facilitates us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robot in the motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. 
Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v2.pdf","comment":"Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/"},{"id":"http://arxiv.org/abs/2404.05607v1","updated":"2024-04-08T15:29:46Z","published":"2024-04-08T15:29:46Z","title":"A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion","summary":" Nowadays, the family of Stable Diffusion (SD) models has gained prominence\nfor its high quality outputs and scalability. This has also raised security\nconcerns on social media, as malicious users can create and disseminate harmful\ncontent. Existing approaches involve training components or entire SDs to embed\na watermark in generated images for traceability and responsibility\nattribution. However, in the era of AI-generated content (AIGC), the rapid\niteration of SDs renders retraining with watermark models costly. To address\nthis, we propose a training-free plug-and-play watermark framework for SDs.\nWithout modifying any components of SDs, we embed diverse watermarks in the\nlatent space, adapting to the denoising process. Our experimental findings\nreveal that our method effectively harmonizes image quality and watermark\ninvisibility. Furthermore, it performs robustly under various attacks. We also\nhave validated that our method is generalized to multiple versions of SDs, even\nwithout retraining the watermark model.\n","authors":["Guokai Zhang","Lanjun Wang","Yuting Su","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05606v1","updated":"2024-04-08T15:25:50Z","published":"2024-04-08T15:25:50Z","title":"Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view\n Reconstruction","summary":" Face meshes in consistent topology serve as the foundation for many\nface-related applications, such as 3DMM constrained face reconstruction and\nexpression retargeting. Traditional methods commonly acquire topology uniformed\nface meshes by two separate steps: multi-view stereo (MVS) to reconstruct\nshapes followed by non-rigid registration to align topology, but struggles with\nhandling noise and non-lambertian surfaces. Recently neural volume rendering\ntechniques have been rapidly evolved and shown great advantages in 3D\nreconstruction or novel view synthesis. Our goal is to leverage the superiority\nof neural volume rendering into multi-view reconstruction of face mesh with\nconsistent topology. 
We propose a mesh volume rendering method that enables\ndirectly optimizing mesh geometry while preserving topology, and learning\nimplicit features to model complex facial appearance from multi-view images.\nThe key innovation lies in spreading sparse mesh features into the surrounding\nspace to simulate radiance field required for volume rendering, which\nfacilitates backpropagation of gradients from images to mesh geometry and\nimplicit appearance features. Our proposed feature spreading module exhibits\ndeformation invariance, enabling photorealistic rendering seamlessly after mesh\nediting. We conduct experiments on multi-view face image dataset to evaluate\nthe reconstruction and implement an application for photorealistic rendering of\nanimated face mesh.\n","authors":["Yating Wang","Ran Yi","Ke Fan","Jinkun Hao","Jiangbo Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05603v1","updated":"2024-04-08T15:22:38Z","published":"2024-04-08T15:22:38Z","title":"Self-Explainable Affordance Learning with Embodied Caption","summary":" In the field of visual affordance learning, previous methods mainly used\nabundant images or videos that delineate human behavior patterns to identify\naction possibility regions for object manipulation, with a variety of\napplications in robotic tasks. However, they encounter a main challenge of\naction ambiguity, illustrated by the vagueness like whether to beat or carry a\ndrum, and the complexities involved in processing intricate scenes. Moreover,\nit is important for human intervention to rectify robot errors in time. To\naddress these issues, we introduce Self-Explainable Affordance learning (SEA)\nwith embodied caption. This innovation enables robots to articulate their\nintentions and bridge the gap between explainable vision-language caption and\nvisual affordance learning. Due to a lack of appropriate dataset, we unveil a\npioneering dataset and metrics tailored for this task, which integrates images,\nheatmaps, and embodied captions. Furthermore, we propose a novel model to\neffectively combine affordance grounding with self-explanation in a simple but\nefficient manner. Extensive quantitative and qualitative experiments\ndemonstrate our method's effectiveness.\n","authors":["Zhipeng Zhang","Zhimin Wei","Guolei Sun","Peng Wang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00722v3","updated":"2024-04-08T15:15:56Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. 
However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. Experiment results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v3.pdf","comment":"NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2404.05595v1","updated":"2024-04-08T15:14:20Z","published":"2024-04-08T15:14:20Z","title":"UniFL: Improve Stable Diffusion via Unified Feedback Learning","summary":" Diffusion models have revolutionized the field of image generation, leading\nto the proliferation of high-quality models and diverse downstream\napplications. However, despite these significant advancements, the current\ncompetitive solutions still suffer from several limitations, including inferior\nvisual quality, a lack of aesthetic appeal, and inefficient inference, without\na comprehensive solution in sight. To address these challenges, we present\nUniFL, a unified framework that leverages feedback learning to enhance\ndiffusion models comprehensively. UniFL stands out as a universal, effective,\nand generalizable solution applicable to various diffusion models, such as\nSD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual\nfeedback learning, which enhances visual quality; decoupled feedback learning,\nwhich improves aesthetic appeal; and adversarial feedback learning, which\noptimizes inference speed. In-depth experiments and extensive user studies\nvalidate the superior performance of our proposed method in enhancing both the\nquality of generated models and their acceleration. For instance, UniFL\nsurpasses ImageReward by 17% user preference in terms of generation quality and\noutperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we\nhave verified the efficacy of our approach in downstream tasks, including Lora,\nControlNet, and AnimateDiff.\n","authors":["Jiacheng Zhang","Jie Wu","Yuxi Ren","Xin Xia","Huafeng Kuang","Pan Xie","Jiashi Li","Xuefeng Xiao","Weilin Huang","Min Zheng","Lean Fu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2404.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05584v1","updated":"2024-04-08T14:59:53Z","published":"2024-04-08T14:59:53Z","title":"Neural Cellular Automata for Lightweight, Robust and Explainable\n Classification of White Blood Cell Images","summary":" Diagnosis of hematological malignancies depends on accurate identification of\nwhite blood cells in peripheral blood smears. Deep learning techniques are\nemerging as a viable solution to scale and optimize this process by automatic\nidentification of cells in laboratories. However, these techniques face several\nchallenges such as limited generalizability, sensitivity to domain shifts and\nlack of explainability. Here, we are introducing a novel approach based on\nneural cellular automata (NCA) for white blood cell classification. 
We test our\napproach on three datasets of white blood cell images and show that we achieve\ncompetitive performance compared to conventional methods. Our NCA-based method\nis significantly smaller in terms of parameters and exhibits robustness to\ndomain shifts. Furthermore, the architecture is inherently explainable,\nproviding insights into the decision process for each classification, helping\nexperts understand and validate model predictions. Results demonstrate that NCA\nnot only can be used for image classification, but also addresses key challenges\nof conventional methods, indicating a high potential for applicability in\nclinical practice.\n","authors":["Michael Deutges","Ario Sadafi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05583v1","updated":"2024-04-08T14:58:52Z","published":"2024-04-08T14:58:52Z","title":"Towards More General Video-based Deepfake Detection through Facial\n Feature Guided Adaptation for Foundation Model","summary":" With the rise of deep learning, generative models have enabled the creation\nof highly realistic synthetic images, presenting challenges due to their\npotential misuse. While research in Deepfake detection has grown rapidly in\nresponse, many detection methods struggle with unseen Deepfakes generated by\nnew synthesis techniques. To address this generalisation challenge, we propose\na novel Deepfake detection approach by adapting the rich information encoded inside\nFoundation Models, specifically using\nthe image encoder from CLIP, which has demonstrated strong zero-shot capability\nfor downstream tasks. Inspired by recent advances in parameter-efficient\nfine-tuning, we propose a novel side-network-based decoder to extract spatial\nand temporal cues from the given video clip, with the proposed Facial\nComponent Guidance (FCG) to encourage the spatial features to include\nfeatures of key facial parts for more robust and general Deepfake detection.\nThrough extensive cross-dataset evaluations, our approach exhibits superior\neffectiveness in identifying unseen Deepfake samples, achieving notable\nperformance improvements even with limited training samples and\nmanipulation types. Our model secures an average performance enhancement of\n0.9% AUROC in cross-dataset assessments compared with state-of-the-art\nmethods, establishing a significant lead with a 4.4% improvement\non the challenging DFDC dataset.\n","authors":["Yue-Hua Han","Tai-Ming Huang","Shu-Tzu Lo","Po-Han Huang","Kai-Lung Hua","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05580v1","updated":"2024-04-08T14:56:26Z","published":"2024-04-08T14:56:26Z","title":"Responsible Visual Editing","summary":" With recent advancements in visual synthesis, there is a growing risk of\nencountering images with detrimental effects, such as hate, discrimination, or\nprivacy violations. The research on transforming harmful images into\nresponsible ones remains unexplored. In this paper, we formulate a new task,\nresponsible visual editing, which entails modifying specific concepts within an\nimage to render it more responsible while minimizing changes. However, the\nconcept that needs to be edited is often abstract, making it challenging to\nlocate what needs to be modified and plan how to modify it. 
To tackle these\nchallenges, we propose a Cognitive Editor (CoEditor) that harnesses the large\nmultimodal model through a two-stage cognitive process: (1) a perceptual\ncognitive process to focus on what needs to be modified and (2) a behavioral\ncognitive process to strategize how to modify. To mitigate the negative\nimplications of harmful images on research, we create a transparent and public\ndataset, AltBear, which expresses harmful information using teddy bears instead\nof humans. Experiments demonstrate that CoEditor can effectively comprehend\nabstract concepts within complex scenes and significantly surpass the\nperformance of baseline models for responsible visual editing. We find that the\nAltBear dataset corresponds well to the harmful content found in real images,\noffering a consistent experimental evaluation, thereby providing a safer\nbenchmark for future research. Moreover, CoEditor also shows great results in\ngeneral editing. We release our code and dataset at\nhttps://github.com/kodenii/Responsible-Visual-Editing.\n","authors":["Minheng Ni","Yeli Shen","Lei Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05580v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05579v1","updated":"2024-04-08T14:55:35Z","published":"2024-04-08T14:55:35Z","title":"Robust Data Pruning: Uncovering and Overcoming Implicit Bias","summary":" In the era of exceptionally data-hungry models, careful selection of the\ntraining data is essential to mitigate the extensive costs of deep learning.\nData pruning offers a solution by removing redundant or uninformative samples\nfrom the dataset, which yields faster convergence and improved neural scaling\nlaws. However, little is known about its impact on classification bias of the\ntrained models. We conduct the first systematic study of this effect and reveal\nthat existing data pruning algorithms can produce highly biased classifiers. At\nthe same time, we argue that random data pruning with appropriate class ratios\nhas potential to improve the worst-class performance. We propose a\n\"fairness-aware\" approach to pruning and empirically demonstrate its\nperformance on standard computer vision benchmarks. In sharp contrast to\nexisting algorithms, our proposed method continues improving robustness at a\ntolerable drop of average performance as we prune more from the datasets. We\npresent theoretical analysis of the classification risk in a mixture of\nGaussians to further motivate our algorithm and support our findings.\n","authors":["Artem Vysogorets","Kartik Ahuja","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2404.05579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05578v1","updated":"2024-04-08T14:54:54Z","published":"2024-04-08T14:54:54Z","title":"Social-MAE: Social Masked Autoencoder for Multi-person Motion\n Representation Learning","summary":" For a complete comprehension of multi-person scenes, it is essential to go\nbeyond basic tasks like detection and tracking. Higher-level tasks, such as\nunderstanding the interactions and social activities among individuals, are\nalso crucial. Progress towards models that can fully understand scenes\ninvolving multiple people is hindered by a lack of sufficient annotated data\nfor such high-level tasks. To address this challenge, we introduce Social-MAE,\na simple yet effective transformer-based masked autoencoder framework for\nmulti-person human motion data. 
The framework uses masked modeling to pre-train\nthe encoder to reconstruct masked human joint trajectories, enabling it to\nlearn generalizable and data efficient representations of motion in human\ncrowded scenes. Social-MAE comprises a transformer as the MAE encoder and a\nlighter-weight transformer as the MAE decoder which operates on multi-person\njoints' trajectory in the frequency domain. After the reconstruction task, the\nMAE decoder is replaced with a task-specific decoder and the model is\nfine-tuned end-to-end for a variety of high-level social tasks. Our proposed\nmodel combined with our pre-training approach achieves the state-of-the-art\nresults on various high-level social tasks, including multi-person pose\nforecasting, social grouping, and social action understanding. These\nimprovements are demonstrated across four popular multi-person datasets\nencompassing both human 2D and 3D body pose.\n","authors":["Mahsa Ehsanpour","Ian Reid","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.05578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16741v2","updated":"2024-04-08T14:42:15Z","published":"2024-01-30T04:39:32Z","title":"MESA: Matching Everything by Segmenting Anything","summary":" Feature matching is a crucial task in the field of computer vision, which\ninvolves finding correspondences between images. Previous studies achieve\nremarkable performance using learning-based feature comparison. However, the\npervasive presence of matching redundancy between images gives rise to\nunnecessary and error-prone computations in these methods, imposing limitations\non their accuracy. To address this issue, we propose MESA, a novel approach to\nestablish precise area (or region) matches for efficient matching redundancy\nreduction. MESA first leverages the advanced image understanding capability of\nSAM, a state-of-the-art foundation model for image segmentation, to obtain\nimage areas with implicit semantic. Then, a multi-relational graph is proposed\nto model the spatial structure of these areas and construct their scale\nhierarchy. Based on graphical models derived from the graph, the area matching\nis reformulated as an energy minimization task and effectively resolved.\nExtensive experiments demonstrate that MESA yields substantial precision\nimprovement for multiple point matchers in indoor and outdoor downstream tasks,\ne.g. +13.61% for DKM in indoor pose estimation.\n","authors":["Yesheng Zhang","Xu Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.16741v2.pdf","comment":"CVPR24"},{"id":"http://arxiv.org/abs/2312.01068v2","updated":"2024-04-08T14:33:12Z","published":"2023-12-02T08:34:22Z","title":"DPHMs: Diffusion Parametric Head Models for Depth-based Tracking","summary":" We introduce Diffusion Parametric Head Models (DPHMs), a generative model\nthat enables robust volumetric head reconstruction and tracking from monocular\ndepth sequences. While recent volumetric head models, such as NPHMs, can now\nexcel in representing high-fidelity head geometries, tracking and\nreconstructing heads from real-world single-view depth sequences remains very\nchallenging, as the fitting to partial and noisy observations is\nunderconstrained. To tackle these challenges, we propose a latent\ndiffusion-based prior to regularize volumetric head reconstruction and\ntracking. This prior-based regularizer effectively constrains the identity and\nexpression codes to lie on the underlying latent manifold which represents\nplausible head shapes. 
To evaluate the effectiveness of the diffusion-based\nprior, we collect a dataset of monocular Kinect sequences consisting of various\ncomplex facial expression motions and rapid transitions. We compare our method\nto state-of-the-art tracking methods and demonstrate improved head identity\nreconstruction as well as robust expression tracking.\n","authors":["Jiapeng Tang","Angela Dai","Yinyu Nie","Lev Markhasin","Justus Thies","Matthias Niessner"],"pdf_url":"https://arxiv.org/pdf/2312.01068v2.pdf","comment":"CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/"},{"id":"http://arxiv.org/abs/2404.05559v1","updated":"2024-04-08T14:30:42Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.06206v2","updated":"2024-04-08T14:26:52Z","published":"2023-07-12T14:52:21Z","title":"SepVAE: a contrastive VAE to separate pathological patterns from healthy\n ones","summary":" Contrastive Analysis VAE (CA-VAEs) is a family of Variational auto-encoders\n(VAEs) that aims at separating the common factors of variation between a\nbackground dataset (BG) (i.e., healthy subjects) and a target dataset (TG)\n(i.e., patients) from the ones that only exist in the target dataset. To do so,\nthese methods separate the latent space into a set of salient features (i.e.,\nproper to the target dataset) and a set of common features (i.e., exist in both\ndatasets). Currently, all models fail to prevent the sharing of information\nbetween latent spaces effectively and to capture all salient factors of\nvariation. To this end, we introduce two crucial regularization losses: a\ndisentangling term between common and salient representations and a\nclassification term between background and target samples in the salient space.\nWe show a better performance than previous CA-VAEs methods on three medical\napplications and a natural images dataset (CelebA). 
Code and datasets are\navailable on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae.\n","authors":["Robin Louiset","Edouard Duchesnay","Antoine Grigis","Benoit Dufumier","Pietro Gori"],"pdf_url":"https://arxiv.org/pdf/2307.06206v2.pdf","comment":"Workshop on Interpretable ML in Healthcare at International\n Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2308.16018v4","updated":"2024-04-08T14:09:27Z","published":"2023-08-30T13:20:54Z","title":"SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for\n Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, previous GCN-based methods rely on\nelaborate human priors excessively and construct complex feature aggregation\nmechanisms, which limits the generalizability and effectiveness of networks. To\nsolve these problems, we propose a novel Spatial Topology Gating Unit (STGU),\nan MLP-based variant without extra priors, to capture the co-occurrence\ntopology features that encode the spatial dependency across all joints. In\nSTGU, to learn the point-wise topology features, a new gate-based feature\ninteraction mechanism is introduced to activate the features point-to-point by\nthe attention map generated from the input sample. Based on the STGU, we\npropose the first MLP-based model, SiT-MLP, for skeleton-based action\nrecognition in this work. Compared with previous methods on three large-scale\ndatasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP\nreduces the parameters significantly with favorable results. The code will be\navailable at https://github.com/BUPTSJZhang/SiT?MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v4.pdf","comment":"Accepted by IEEE TCSVT 2024"},{"id":"http://arxiv.org/abs/2312.07526v2","updated":"2024-04-08T13:40:43Z","published":"2023-12-12T18:55:29Z","title":"RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose\n Estimation","summary":" Real-time multi-person pose estimation presents significant challenges in\nbalancing speed and precision. While two-stage top-down methods slow down as\nthe number of people in the image increases, existing one-stage methods often\nfail to simultaneously deliver high accuracy and real-time performance. This\npaper introduces RTMO, a one-stage pose estimation framework that seamlessly\nintegrates coordinate classification by representing keypoints using dual 1-D\nheatmaps within the YOLO architecture, achieving accuracy comparable to\ntop-down methods while maintaining high speed. We propose a dynamic coordinate\nclassifier and a tailored loss function for heatmap learning, specifically\ndesigned to address the incompatibilities between coordinate classification and\ndense prediction models. RTMO outperforms state-of-the-art one-stage pose\nestimators, achieving 1.1% higher AP on COCO while operating about 9 times\nfaster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on\nCOCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and\naccuracy. The code and models are available at\nhttps://github.com/open-mmlab/mmpose/tree/main/projects/rtmo.\n","authors":["Peng Lu","Tao Jiang","Yining Li","Xiangtai Li","Kai Chen","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07526v2.pdf","comment":"Accepted at CVPR 2024. 
Project page:\n https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo"},{"id":"http://arxiv.org/abs/2404.05519v1","updated":"2024-04-08T13:40:01Z","published":"2024-04-08T13:40:01Z","title":"Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot\n Editing of Text-to-Video Diffusion Models","summary":" With recent advances in image and video diffusion models for content\ncreation, a plethora of techniques have been proposed for customizing their\ngenerated content. In particular, manipulating the cross-attention layers of\nText-to-Image (T2I) diffusion models has shown great promise in controlling the\nshape and location of objects in the scene. Transferring image-editing\ntechniques to the video domain, however, is extremely challenging as object\nmotion and temporal consistency are difficult to capture accurately. In this\nwork, we take a first look at the role of cross-attention in Text-to-Video\n(T2V) diffusion models for zero-shot video editing. While one-shot models have\nshown potential in controlling motion and camera movement, we demonstrate\nzero-shot control over object shape, position and movement in T2V models. We\nshow that despite the limitations of current T2V models, cross-attention\nguidance can be a promising approach for editing videos.\n","authors":["Saman Motamed","Wouter Van Gansbeke","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05519v1.pdf","comment":"Generative Models for Computer Vision CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.05518v1","updated":"2024-04-08T13:39:12Z","published":"2024-04-08T13:39:12Z","title":"DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker","summary":" Accurately distinguishing each object is a fundamental goal of Multi-object\ntracking (MOT) algorithms. However, achieving this goal remains\nchallenging, primarily due to: (i) For crowded scenes with occluded objects,\nthe high overlap of object bounding boxes leads to confusion among closely\nlocated objects. Nevertheless, humans naturally perceive the depth of elements\nin a scene when observing 2D videos. Inspired by this, even though the bounding\nboxes of objects are close on the camera plane, we can differentiate them in\nthe depth dimension, thereby establishing a 3D perception of the objects. (ii)\nFor videos with rapidly irregular camera motion, abrupt changes in object\npositions can result in ID switches. However, if the camera pose is known, we\ncan compensate for the errors in linear motion models. In this paper, we\npropose \\textit{DepthMOT}, which achieves: (i) detecting and estimating the scene\ndepth map \\textit{end-to-end}, (ii) compensating for the irregular camera motion by\ncamera pose estimation. Extensive experiments demonstrate the superior\nperformance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The code will be\navailable at \\url{https://github.com/JackWoo0831/DepthMOT}.\n","authors":["Jiapeng Wu","Yichen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05512v1","updated":"2024-04-08T13:35:14Z","published":"2024-04-08T13:35:14Z","title":"Impact of LiDAR visualisations on semantic segmentation of\n archaeological objects","summary":" Deep learning methods in LiDAR-based archaeological research often leverage\nvisualisation techniques derived from Digital Elevation Models to enhance\ncharacteristics of archaeological objects present in the images. 
This paper\ninvestigates the impact of visualisations on deep learning performance through\na comprehensive testing framework. The study involves the use of eight semantic\nsegmentation models to evaluate seven diverse visualisations across two study\nareas, encompassing five archaeological classes. Experimental results reveal\nthat the choice of appropriate visualisations can influence performance by up\nto 8%. Yet, pinpointing one visualisation that outperforms the others in\nsegmenting all archaeological classes proves challenging. The observed\nperformance variation, reaching up to 25% across different model\nconfigurations, underscores the importance of thoughtfully selecting model\nconfigurations and LiDAR visualisations for successfully segmenting\narchaeological objects.\n","authors":["Raveerat Jaturapitpornchai","Giulio Poggi","Gregory Sech","Ziga Kokalj","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05512v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2404.05505v1","updated":"2024-04-08T13:27:07Z","published":"2024-04-08T13:27:07Z","title":"Taming Transformers for Realistic Lidar Point Cloud Generation","summary":" Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the\nLidar point cloud generation task, benefiting from their stable training and\niterative refinement during sampling. However, DMs often fail to realistically\nmodel Lidar raydrop noise due to their inherent denoising process. To retain\nthe strength of iterative sampling while enhancing the generation of raydrop\nnoise, we introduce LidarGRIT, a generative model that uses auto-regressive\ntransformers to iteratively sample the range images in the latent space rather\nthan image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode\nrange images and raydrop masks. Our results show that LidarGRIT achieves\nsuperior performance compared to SOTA models on KITTI-360 and KITTI odometry\ndatasets. Code available at:https://github.com/hamedhaghighi/LidarGRIT.\n","authors":["Hamed Haghighi","Amir Samadi","Mehrdad Dianati","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.05505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08077v2","updated":"2024-04-08T13:23:47Z","published":"2023-11-14T11:05:08Z","title":"Zero-Shot Segmentation of Eye Features Using the Segment Anything Model\n (SAM)","summary":" The advent of foundation models signals a new era in artificial intelligence.\nThe Segment Anything Model (SAM) is the first foundation model for image\nsegmentation. In this study, we evaluate SAM's ability to segment features from\neye images recorded in virtual reality setups. The increasing requirement for\nannotated eye-image datasets presents a significant opportunity for SAM to\nredefine the landscape of data annotation in gaze estimation. Our investigation\ncenters on SAM's zero-shot learning abilities and the effectiveness of prompts\nlike bounding boxes or point clicks. Our results are consistent with studies in\nother domains, demonstrating that SAM's segmentation effectiveness can be\non-par with specialized models depending on the feature, with prompts improving\nits performance, evidenced by an IoU of 93.34% for pupil segmentation in one\ndataset. 
Foundation models like SAM could revolutionize gaze estimation by\nenabling quick and easy image segmentation, reducing reliance on specialized\nmodels and extensive manual annotation.\n","authors":["Virmarie Maquiling","Sean Anthony Byrne","Diederick C. Niehorster","Marcus Nyström","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2311.08077v2.pdf","comment":"14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on\n Eye Tracking Research & Applications"},{"id":"http://arxiv.org/abs/2311.16728v2","updated":"2024-04-08T13:17:05Z","published":"2023-11-28T12:19:00Z","title":"Photo-SLAM: Real-time Simultaneous Localization and Photorealistic\n Mapping for Monocular, Stereo, and RGB-D Cameras","summary":" The integration of neural rendering and the SLAM system recently showed\npromising results in joint localization and photorealistic view reconstruction.\nHowever, existing methods, fully relying on implicit representations, are so\nresource-hungry that they cannot run on portable devices, which deviates from\nthe original intention of SLAM. In this paper, we present Photo-SLAM, a novel\nSLAM framework with a hyper primitives map. Specifically, we simultaneously\nexploit explicit geometric features for localization and learn implicit\nphotometric features to represent the texture information of the observed\nenvironment. In addition to actively densifying hyper primitives based on\ngeometric features, we further introduce a Gaussian-Pyramid-based training\nmethod to progressively learn multi-level features, enhancing photorealistic\nmapping performance. The extensive experiments with monocular, stereo, and\nRGB-D datasets prove that our proposed system Photo-SLAM significantly\noutperforms current state-of-the-art SLAM systems for online photorealistic\nmapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times\nfaster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time\nspeed using an embedded platform such as Jetson AGX Orin, showing the potential\nof robotics applications.\n","authors":["Huajian Huang","Longwei Li","Hui Cheng","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.16728v2.pdf","comment":"CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project\n Page: https://huajianup.github.io/research/Photo-SLAM/"},{"id":"http://arxiv.org/abs/2311.17389v2","updated":"2024-04-08T13:15:03Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n with Cross-device Queries","summary":" Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. 
We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v2.pdf","comment":"CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/"},{"id":"http://arxiv.org/abs/2404.05490v1","updated":"2024-04-08T13:11:57Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v2","updated":"2024-04-08T13:05:11Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. 
We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v3","updated":"2024-04-08T12:51:35Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose, to our knowledge, the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v3.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2306.14227v2","updated":"2024-04-08T12:50:51Z","published":"2023-06-25T12:15:44Z","title":"A ground-based dataset and a diffusion model for on-orbit low-light\n image enhancement","summary":" On-orbit service is important for maintaining the sustainability of the space\nenvironment. A space-based visible camera is an economical and lightweight sensor\nfor situation awareness during on-orbit service. However, it can be easily\naffected by the low illumination environment. Recently, deep learning has\nachieved remarkable success in image enhancement of natural images, but is seldom\napplied in space due to the data bottleneck. In this article, we first propose\na dataset of the Beidou Navigation Satellite for on-orbit low-light image\nenhancement (LLIE). In the automatic data collection scheme, we focus on\nreducing the domain gap and improving the diversity of the dataset. We collect\nhardware in-the-loop images based on a robotic simulation testbed imitating\nspace lighting conditions. To evenly sample poses of different orientations and\ndistances without collision, a collision-free working space and a pose-stratified\nsampling scheme are proposed. Afterwards, a novel diffusion model is proposed. 
To\nenhance the image contrast without over-exposure and blurring details, we\ndesign a fused attention module to highlight the structure and dark regions. Finally,\nwe compare our method with previous methods using our dataset, which indicates\nthat our method has a better capacity for on-orbit LLIE.\n","authors":["Yiman Zhu","Lu Wang","Jingyi Yuan","Yu Guo"],"pdf_url":"https://arxiv.org/pdf/2306.14227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05468v1","updated":"2024-04-08T12:46:39Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the differences in\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallows direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v1.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2404.05466v1","updated":"2024-04-08T12:44:24Z","published":"2024-04-08T12:44:24Z","title":"Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder","summary":" Automatic lip-reading (ALR) aims to automatically transcribe spoken content\nfrom a speaker's silent lip motion captured in video. Current mainstream\nlip-reading approaches only use a single visual encoder to model input videos\nof a single scale. In this paper, we propose to enhance lip-reading by\nincorporating multi-scale video data and multi-encoder. Specifically, we first\npropose a novel multi-scale lip extraction algorithm based on the size of the\nspeaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip\nfeatures at different scales. For the multi-encoder, in addition to the\nmainstream Transformer and Conformer, we also incorporate the recently proposed\nBranchformer and EBranchformer as visual encoders. In the experiments, we\nexplore the influence of different video data scales and encoders on ALR system\nperformance and fuse the texts transcribed by all ALR systems using recognizer\noutput voting error reduction (ROVER). 
Finally, our proposed approach placed\nsecond in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in\ncharacter error rate (CER) compared to the official baseline on the evaluation\nset.\n","authors":["He Wang","Pengcheng Guo","Xucheng Wan","Huan Zhou","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.05466v1.pdf","comment":"6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR"},{"id":"http://arxiv.org/abs/2404.05465v1","updated":"2024-04-08T12:43:32Z","published":"2024-04-08T12:43:32Z","title":"HAMMR: HierArchical MultiModal React agents for generic VQA","summary":" Combining Large Language Models (LLMs) with external specialized tools\n(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual\nQuestion Answering (VQA). While this approach was demonstrated to work well\nwhen optimized and evaluated for each individual benchmark, in practice it is\ncrucial for the next generation of real-world AI systems to handle a broad\nrange of multimodal problems. Therefore we pose the VQA problem from a unified\nperspective and evaluate a single system on a varied suite of VQA tasks\nincluding counting, spatial reasoning, OCR-based reasoning, visual pointing,\nexternal knowledge, and more. In this setting, we demonstrate that naively\napplying the LLM+tools approach using the combined set of all tools leads to\npoor results. This motivates us to introduce HAMMR: HierArchical MultiModal\nReact. We start from a multimodal ReAct-based system and make it hierarchical\nby enabling our HAMMR agents to call upon other specialized agents. This\nenhances the compositionality of the LLM+tools approach, which we show to be\ncritical for obtaining high accuracy on generic VQA. Concretely, on our generic\nVQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%.\nAdditionally, HAMMR achieves state-of-the-art results on this task,\noutperforming the generic standalone PaLI-X VQA model by 5.0%.\n","authors":["Lluis Castrejon","Thomas Mensink","Howard Zhou","Vittorio Ferrari","Andre Araujo","Jasper Uijlings"],"pdf_url":"https://arxiv.org/pdf/2404.05465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05447v1","updated":"2024-04-08T12:29:46Z","published":"2024-04-08T12:29:46Z","title":"Pansharpening of PRISMA products for archaeological prospection","summary":" Hyperspectral data recorded from satellite platforms are often ill-suited for\ngeo-archaeological prospection due to low spatial resolution. The established\npotential of hyperspectral data from airborne sensors in identifying\narchaeological features has, on the other side, generated increased interest in\nenhancing hyperspectral data to achieve higher spatial resolution. This\nimprovement is crucial for detecting traces linked to sub-surface\ngeo-archaeological features and can make satellite hyperspectral acquisitions\nmore suitable for archaeological research. This research assesses the usability\nof pansharpened PRISMA satellite products in geo-archaeological prospections.\nThree pan-sharpening methods (GSA, MTF-GLP and HySure) are compared\nquantitatively and qualitatively and tested over the archaeological landscape\nof Aquileia (Italy). 
The results suggest that the application of pansharpening\ntechniques makes hyperspectral satellite imagery highly suitable, under certain\nconditions, to the identification of sub-surface archaeological features of\nsmall and large size.\n","authors":["Gregory Sech","Giulio Poggi","Marina Ljubenovic","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05447v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2301.07409v2","updated":"2024-04-08T12:25:10Z","published":"2023-01-18T10:13:29Z","title":"Representing Noisy Image Without Denoising","summary":" A long-standing topic in artificial intelligence is the effective recognition\nof patterns from noisy images. In this regard, the recent data-driven paradigm\nconsiders 1) improving the representation robustness by adding noisy samples in\ntraining phase (i.e., data augmentation) or 2) pre-processing the noisy image\nby learning to solve the inverse problem (i.e., image denoising). However, such\nmethods generally exhibit inefficient process and unstable result, limiting\ntheir practical applications. In this paper, we explore a non-learning paradigm\nthat aims to derive robust representation directly from noisy images, without\nthe denoising as pre-processing. Here, the noise-robust representation is\ndesigned as Fractional-order Moments in Radon space (FMR), with also beneficial\nproperties of orthogonality and rotation invariance. Unlike earlier\ninteger-order methods, our work is a more generic design taking such classical\nmethods as special cases, and the introduced fractional-order parameter offers\ntime-frequency analysis capability that is not available in classical methods.\nFormally, both implicit and explicit paths for constructing the FMR are\ndiscussed in detail. Extensive simulation experiments and an image security\napplication are provided to demonstrate the uniqueness and usefulness of our\nFMR, especially for noise robustness, rotation invariance, and time-frequency\ndiscriminability.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Tao Xiang","Xiaochun Cao","Yong Xiang"],"pdf_url":"https://arxiv.org/pdf/2301.07409v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.05439v1","updated":"2024-04-08T12:18:01Z","published":"2024-04-08T12:18:01Z","title":"Action-conditioned video data improves predictability","summary":" Long-term video generation and prediction remain challenging tasks in\ncomputer vision, particularly in partially observable scenarios where cameras\nare mounted on moving platforms. The interaction between observed image frames\nand the motion of the recording agent introduces additional complexities. To\naddress these issues, we introduce the Action-Conditioned Video Generation\n(ACVG) framework, a novel approach that investigates the relationship between\nactions and generated image frames through a deep dual Generator-Actor\narchitecture. ACVG generates video sequences conditioned on the actions of\nrobots, enabling exploration and analysis of how vision and action mutually\ninfluence one another in dynamic environments. 
We evaluate the framework's\neffectiveness on an indoor robot motion dataset which consists of sequences of\nimage frames along with the sequences of actions taken by the robotic agent,\nconducting a comprehensive empirical study comparing ACVG to other\nstate-of-the-art frameworks along with a detailed ablation study.\n","authors":["Meenakshi Sarkar","Debasish Ghose"],"pdf_url":"https://arxiv.org/pdf/2404.05439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05297v2","updated":"2024-04-08T12:17:24Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v2.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2305.10874v3","updated":"2024-04-08T12:17:01Z","published":"2023-05-18T11:06:15Z","title":"Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation","summary":" With the explosive popularity of AI-generated content (AIGC), video\ngeneration has recently received a lot of attention. Generating videos guided\nby text instructions poses significant challenges, such as modeling the complex\nrelationship between space and time, and the lack of large-scale text-video\npaired data. Existing text-video datasets suffer from limitations in both\ncontent quality and scale, or they are not open-source, rendering them\ninaccessible for study and use. For model design, previous approaches extend\npretrained text-to-image generation models by adding temporal 1D\nconvolution/attention modules for video generation. However, these approaches\noverlook the importance of jointly modeling space and time, inevitably leading\nto temporal distortions and misalignment between texts and videos. In this\npaper, we propose a novel approach that strengthens the interaction between\nspatial and temporal perceptions. In particular, we utilize a swapped\ncross-attention mechanism in 3D windows that alternates the ``query'' role\nbetween spatial and temporal blocks, enabling mutual reinforcement for each\nother. Moreover, to fully unlock model capabilities for high-quality video\ngeneration and promote the development of the field, we curate a large-scale\nand open-source video dataset called HD-VG-130M. 
This dataset comprises 130\nmillion text-video pairs from the open-domain, ensuring high-definition,\nwidescreen and watermark-free characters. A smaller-scale yet more meticulously\ncleaned subset further enhances the data quality, aiding models in achieving\nsuperior performance. Experimental quantitative and qualitative results\ndemonstrate the superiority of our approach in terms of per-frame quality,\ntemporal correlation, and text-video alignment, with clear margins.\n","authors":["Wenjing Wang","Huan Yang","Zixi Tuo","Huiguo He","Junchen Zhu","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05426v1","updated":"2024-04-08T11:54:49Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05937v3","updated":"2024-04-08T11:46:07Z","published":"2024-02-08T18:59:53Z","title":"InstaGen: Enhancing Object Detection by Training on Synthetic Dataset","summary":" In this paper, we present a novel paradigm to enhance the ability of object\ndetector, e.g., expanding categories or improving detection performance, by\ntraining on synthetic dataset generated from diffusion models. Specifically, we\nintegrate an instance-level grounding head into a pre-trained, generative\ndiffusion model, to augment it with the ability of localising instances in the\ngenerated images. The grounding head is trained to align the text embedding of\ncategory names with the regional visual feature of the diffusion model, using\nsupervision from an off-the-shelf object detector, and a novel self-training\nscheme on (novel) categories not covered by the detector. 
We conduct thorough\nexperiments to show that, this enhanced version of diffusion model, termed as\nInstaGen, can serve as a data synthesizer, to enhance object detectors by\ntraining on its generated samples, demonstrating superior performance over\nexisting state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse\n(+1.2 to 5.2 AP) scenarios. Project page with code:\nhttps://fcjian.github.io/InstaGen.\n","authors":["Chengjian Feng","Yujie Zhong","Zequn Jie","Weidi Xie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2402.05937v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05414v1","updated":"2024-04-08T11:32:26Z","published":"2024-04-08T11:32:26Z","title":"Two Hands Are Better Than One: Resolving Hand to Hand Intersections via\n Occupancy Networks","summary":" 3D hand pose estimation from images has seen considerable interest from the\nliterature, with new methods improving overall 3D accuracy. One current\nchallenge is to address hand-to-hand interaction where self-occlusions and\nfinger articulation pose a significant problem to estimation. Little work has\napplied physical constraints that minimize the hand intersections that occur as\na result of noisy estimation. This work addresses the intersection of hands by\nexploiting an occupancy network that represents the hand's volume as a\ncontinuous manifold. This allows us to model the probability distribution of\npoints being inside a hand. We designed an intersection loss function to\nminimize the likelihood of hand-to-point intersections. Moreover, we propose a\nnew hand mesh parameterization that is superior to the commonly used MANO model\nin many respects including lower mesh complexity, underlying 3D skeleton\nextraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the\nmodels trained using our intersection loss achieve better results than the\nstate-of-the-art by significantly decreasing the number of hand intersections\nwhile lowering the mean per-joint positional error. Additionally, we\ndemonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE\ndatasets and show reduced hand-to-hand intersections for complex domains such\nas sign-language pose estimation.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2404.05414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06704v3","updated":"2024-04-08T11:24:30Z","published":"2023-12-10T11:45:45Z","title":"SIFU: Side-view Conditioned Implicit Function for Real-world Usable\n Clothed Human Reconstruction","summary":" Creating high-quality 3D models of clothed humans from single images for\nreal-world applications is crucial. Despite recent advancements, accurately\nreconstructing humans in complex poses or with loose clothing from in-the-wild\nimages, along with predicting textures for unseen areas, remains a significant\nchallenge. A key limitation of previous methods is their insufficient prior\nguidance in transitioning from 2D to 3D and in texture prediction. In response,\nwe introduce SIFU (Side-view Conditioned Implicit Function for Real-world\nUsable Clothed Human Reconstruction), a novel approach combining a Side-view\nDecoupling Transformer with a 3D Consistent Texture Refinement pipeline.SIFU\nemploys a cross-attention mechanism within the transformer, using SMPL-X\nnormals as queries to effectively decouple side-view features in the process of\nmapping 2D features to 3D. 
This method not only improves the precision of the\n3D models but also their robustness, especially when SMPL-X estimates are not\nperfect. Our texture refinement process leverages text-to-image diffusion-based\nprior to generate realistic and consistent textures for invisible views.\nThrough extensive experiments, SIFU surpasses SOTA methods in both geometry and\ntexture reconstruction, showcasing enhanced robustness in complex scenarios and\nachieving an unprecedented Chamfer and P2S measurement. Our approach extends to\npractical applications such as 3D printing and scene building, demonstrating\nits broad utility in real-world scenarios. Project page\nhttps://river-zhang.github.io/SIFU-projectpage/ .\n","authors":["Zechuan Zhang","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.06704v3.pdf","comment":"Accepted by CVPR 2024; Project page\n https://river-zhang.github.io/SIFU-projectpage/"},{"id":"http://arxiv.org/abs/2303.13514v3","updated":"2024-04-08T11:22:05Z","published":"2023-03-23T17:59:35Z","title":"SAOR: Single-View Articulated Object Reconstruction","summary":" We introduce SAOR, a novel approach for estimating the 3D shape, texture, and\nviewpoint of an articulated object from a single image captured in the wild.\nUnlike prior approaches that rely on pre-defined category-specific 3D templates\nor tailored 3D skeletons, SAOR learns to articulate shapes from single-view\nimage collections with a skeleton-free part-based model without requiring any\n3D object shape priors. To prevent ill-posed solutions, we propose a\ncross-instance consistency loss that exploits disentangled object shape\ndeformation and articulation. This is helped by a new silhouette-based sampling\nmechanism to enhance viewpoint diversity during training. Our method only\nrequires estimated object silhouettes and relative depth maps from\noff-the-shelf pre-trained networks during training. At inference time, given a\nsingle-view image, it efficiently outputs an explicit mesh representation. We\nobtain improved qualitative and quantitative results on challenging quadruped\nanimals compared to relevant existing work.\n","authors":["Mehmet Aygün","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2303.13514v3.pdf","comment":"Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor"},{"id":"http://arxiv.org/abs/2404.05409v1","updated":"2024-04-08T11:20:28Z","published":"2024-04-08T11:20:28Z","title":"Anatomical Conditioning for Contrastive Unpaired Image-to-Image\n Translation of Optical Coherence Tomography Images","summary":" For a unified analysis of medical images from different modalities, data\nharmonization using image-to-image (I2I) translation is desired. We study this\nproblem employing an optical coherence tomography (OCT) data set of\nSpectralis-OCT and Home-OCT images. I2I translation is challenging because the\nimages are unpaired, and a bijective mapping does not exist due to the\ninformation discrepancy between both domains. This problem has been addressed\nby the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it\nreduces semantic consistency. To restore the semantic consistency, we support\nthe style decoder using an additional segmentation decoder. Our approach\nincreases the similarity between the style-translated images and the target\ndistribution. Importantly, we improve the segmentation of biomarkers in\nHome-OCT images in an unsupervised domain adaptation scenario. 
Our data\nharmonization approach provides potential for the monitoring of diseases, e.g.,\nage related macular disease, using different OCT devices.\n","authors":["Marc S. Seibel","Hristina Uzunova","Timo Kepp","Heinz Handels"],"pdf_url":"https://arxiv.org/pdf/2404.05409v1.pdf","comment":"Accepted at ISBI 2024"},{"id":"http://arxiv.org/abs/2311.10605v2","updated":"2024-04-08T10:59:06Z","published":"2023-11-17T16:01:06Z","title":"CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification","summary":" Person re-identification (re-ID) is a challenging task that aims to learn\ndiscriminative features for person retrieval. In person re-ID, Jaccard distance\nis a widely used distance metric, especially in re-ranking and clustering\nscenarios. However, we discover that camera variation has a significant\nnegative impact on the reliability of Jaccard distance. In particular, Jaccard\ndistance calculates the distance based on the overlap of relevant neighbors.\nDue to camera variation, intra-camera samples dominate the relevant neighbors,\nwhich reduces the reliability of the neighbors by introducing intra-camera\nnegative samples and excluding inter-camera positive samples. To overcome this\nproblem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that\nleverages camera information to enhance the reliability of Jaccard distance.\nSpecifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to\nfind k-reciprocal nearest neighbors on the intra-camera and inter-camera\nranking lists, which improves the reliability of relevant neighbors and\nguarantees the contribution of inter-camera samples in the overlap. Moreover,\nwe propose a camera-aware local query expansion (CLQE) to mine reliable samples\nin relevant neighbors by exploiting camera variation as a strong constraint and\nassign these samples higher weights in overlap, further improving the\nreliability. Our CA-Jaccard distance is simple yet effective and can serve as a\ngeneral distance metric for person re-ID methods with high reliability and low\ncomputational cost. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Yiyu Chen","Zheyi Fan","Zhaoru Chen","Yixuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.10605v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04190v4","updated":"2024-04-08T10:57:42Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor\nand time-intensive process. Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages the SegmentAnything to precisely\ndemarcate individual organoids. Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. 
The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoids morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v4.pdf","comment":"Replace Figure 4 with the correct version. The original version is\n wrong due to a column name mismatch"},{"id":"http://arxiv.org/abs/2404.05393v1","updated":"2024-04-08T10:52:29Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05392v1","updated":"2024-04-08T10:51:29Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. 
Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15288v2","updated":"2024-04-08T10:48:22Z","published":"2023-12-23T16:05:47Z","title":"Understanding normalization in contrastive representation learning and\n out-of-distribution detection","summary":" Contrastive representation learning has emerged as an outstanding approach\nfor anomaly detection. In this work, we explore the $\\ell_2$-norm of\ncontrastive features and its applications in out-of-distribution detection. We\npropose a simple method based on contrastive learning, which incorporates\nout-of-distribution data by discriminating against normal samples in the\ncontrastive layer space. Our approach can be applied flexibly as an outlier\nexposure (OE) approach, where the out-of-distribution data is a huge collective\nof random images, or as a fully self-supervised learning approach, where the\nout-of-distribution data is self-generated by applying distribution-shifting\ntransformations. The ability to incorporate additional out-of-distribution\nsamples enables a feasible solution for datasets where AD methods based on\ncontrastive learning generally underperform, such as aerial images or\nmicroscopy images. Furthermore, the high-quality features learned through\ncontrastive learning consistently enhance performance in OE scenarios, even\nwhen the available out-of-distribution dataset is not diverse enough. Our\nextensive experiments demonstrate the superiority of our proposed method under\nvarious scenarios, including unimodal and multimodal settings, with various\nimage datasets.\n","authors":["Tai Le-Gia","Jaehyun Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.15288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05384v1","updated":"2024-04-08T10:45:29Z","published":"2024-04-08T10:45:29Z","title":"Rethinking the Spatial Inconsistency in Classifier-Free Diffusion\n Guidance","summary":" Classifier-Free Guidance (CFG) has been widely used in text-to-image\ndiffusion models, where the CFG scale is introduced to control the strength of\ntext guidance on the whole image space. However, we argue that a global CFG\nscale results in spatial inconsistency on varying semantic strengths and\nsuboptimal image quality. To address this problem, we present a novel approach,\nSemantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance\ndegrees for different semantic units in text-to-image diffusion models.\nSpecifically, we first design a training-free semantic segmentation method to\npartition the latent image into relatively independent semantic regions at each\ndenoising step. In particular, the cross-attention map in the denoising U-net\nbackbone is renormalized for assigning each patch to the corresponding token,\nwhile the self-attention map is used to complete the semantic regions. Then, to\nbalance the amplification of diverse semantic units, we adaptively adjust the\nCFG scales across different semantic regions to rescale the text guidance\ndegrees into a uniform level. Finally, extensive experiments demonstrate the\nsuperiority of S-CFG over the original CFG strategy on various text-to-image\ndiffusion models, without requiring any extra training cost. 
Our code is\navailable at https://github.com/SmilesDZgk/S-CFG.\n","authors":["Dazhong Shen","Guanglu Song","Zeyue Xue","Fu-Yun Wang","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05384v1.pdf","comment":"accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2305.15873v2","updated":"2024-04-08T10:28:38Z","published":"2023-05-25T09:09:32Z","title":"Confronting Ambiguity in 6D Object Pose Estimation via Score-Based\n Diffusion on SE(3)","summary":" Addressing pose ambiguity in 6D object pose estimation from single RGB images\npresents a significant challenge, particularly due to object symmetries or\nocclusions. In response, we introduce a novel score-based diffusion method\napplied to the $SE(3)$ group, marking the first application of diffusion models\nto $SE(3)$ within the image domain, specifically tailored for pose estimation\ntasks. Extensive evaluations demonstrate the method's efficacy in handling pose\nambiguity, mitigating perspective-induced ambiguity, and showcasing the\nrobustness of our surrogate Stein score formulation on $SE(3)$. This\nformulation not only improves the convergence of the denoising process but also\nenhances computational efficiency. Thus, we pioneer a promising strategy for 6D\nobject pose estimation.\n","authors":["Tsu-Ching Hsiao","Hao-Wei Chen","Hsuan-Kung Yang","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2305.15873v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05366v1","updated":"2024-04-08T10:05:24Z","published":"2024-04-08T10:05:24Z","title":"CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery","summary":" In Generalized Category Discovery (GCD), we cluster unlabeled samples of\nknown and novel classes, leveraging a training dataset of known classes. A\nsalient challenge arises due to domain shifts between these datasets. To\naddress this, we present a novel setting: Across Domain Generalized Category\nDiscovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)\nas a remedy. CDAD-NET is architected to synchronize potential known class\nsamples across both the labeled (source) and unlabeled (target) datasets, while\nemphasizing the distinct categorization of the target data. To facilitate this,\nwe propose an entropy-driven adversarial learning strategy that accounts for\nthe distance distributions of target samples relative to source-domain class\nprototypes. Parallelly, the discriminative nature of the shared space is upheld\nthrough a fusion of three metric learning objectives. In the source domain, our\nfocus is on refining the proximity between samples and their affiliated class\nprototypes, while in the target domain, we integrate a neighborhood-centric\ncontrastive learning mechanism, enriched with an adept neighbors mining\napproach. 
To further accentuate the nuanced feature interrelation among\nsemantically aligned images, we champion the concept of conditional image\ninpainting, underscoring the premise that semantically analogous images prove\nmore efficacious to the task than their disjointed counterparts.\nExperimentally, CDAD-NET eclipses existing literature with a performance\nincrement of 8-15% on three AD-GCD benchmarks we present.\n","authors":["Sai Bhargav Rongali","Sarthak Mehrotra","Ankit Jha","Mohamad Hassan N C","Shirsha Bose","Tanisha Gupta","Mainak Singha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.05366v1.pdf","comment":"Accepted in L3D-IVU, CVPR Workshop, 2024"},{"id":"http://arxiv.org/abs/2308.13888v3","updated":"2024-04-08T10:04:29Z","published":"2023-08-26T14:12:19Z","title":"Neural Implicit Morphing of Face Images","summary":" Face morphing is a problem in computer graphics with numerous artistic and\nforensic applications. It is challenging due to variations in pose, lighting,\ngender, and ethnicity. This task consists of a warping for feature alignment\nand a blending for a seamless transition between the warped images. We propose\nto leverage coord-based neural networks to represent such warpings and\nblendings of face images. During training, we exploit the smoothness and\nflexibility of such networks by combining energy functionals employed in\nclassical approaches without discretizations. Additionally, our method is\ntime-dependent, allowing a continuous warping/blending of the images. During\nmorphing inference, we need both direct and inverse transformations of the\ntime-dependent warping. The first (second) is responsible for warping the\ntarget (source) image into the source (target) image. Our neural warping stores\nthose maps in a single network dismissing the need for inverting them. The\nresults of our experiments indicate that our method is competitive with both\nclassical and generative models under the lens of image quality and\nface-morphing detectors. Aesthetically, the resulting images present a seamless\nblending of diverse faces not yet usual in the literature.\n","authors":["Guilherme Schardong","Tiago Novello","Hallison Paz","Iurii Medvedev","Vinícius da Silva","Luiz Velho","Nuno Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2308.13888v3.pdf","comment":"14 pages, 20 figures, accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05362v1","updated":"2024-04-08T09:54:28Z","published":"2024-04-08T09:54:28Z","title":"Multi-head Attention-based Deep Multiple Instance Learning","summary":" This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple\nInstance Learning model, designed for weakly supervised Whole Slide Images\n(WSIs) classification in digital pathology. Inspired by the multi-head\nattention mechanism of the Transformer, MAD-MIL simplifies model complexity\nwhile achieving competitive results against advanced models like CLAM and\nDS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16,\nTCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL.\nThis demonstrates enhanced information diversity, interpretability, and\nefficiency in slide representation. The model's effectiveness, coupled with\nfewer trainable parameters and lower computational complexity makes it a\npromising solution for automated pathology workflows. 
Our code is available at\nhttps://github.com/tueimage/MAD-MIL.\n","authors":["Hassan Keshvarikhojasteh","Josien Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2404.05362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01585v3","updated":"2024-04-08T09:53:27Z","published":"2023-02-03T07:35:53Z","title":"SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation","summary":" Aerial image segmentation is the basis for applications such as automatically\ncreating maps or tracking deforestation. In true orthophotos, which are often\nused in these applications, many objects and regions can be approximated well\nby polygons. However, this fact is rarely exploited by state-of-the-art\nsemantic segmentation models. Instead, most models allow unnecessary degrees of\nfreedom in their predictions by allowing arbitrary region shapes. We therefore\npresent a refinement of our deep learning model which predicts binary space\npartitioning trees, an efficient polygon representation. The refinements\ninclude a new feature decoder architecture and a new differentiable BSP tree\nrenderer which both avoid vanishing gradients. Additionally, we designed a\nnovel loss function specifically designed to improve the spatial partitioning\ndefined by the predicted trees. Furthermore, our expanded model can predict\nmultiple trees at once and thus can predict class-specific segmentations. As an\nadditional contribution, we investigate the impact of a non-optimal training\nprocess in comparison to an optimized training process. While model\narchitectures optimized for aerial images, such as PFNet or our own model, show\nan advantage under non-optimal conditions, this advantage disappears under\noptimal training conditions. Despite this observation, our model still makes\nbetter predictions for small rectangular objects, e.g., cars.\n","authors":["Daniel Gritzner","Jörn Ostermann"],"pdf_url":"https://arxiv.org/pdf/2302.01585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05357v1","updated":"2024-04-08T09:48:02Z","published":"2024-04-08T09:48:02Z","title":"CNN-based Game State Detection for a Foosball Table","summary":" The automation of games using Deep Reinforcement Learning Strategies (DRL) is\na well-known challenge in AI research. While for feature extraction in a video\ngame typically the whole image is used, this is hardly practical for many real\nworld games. Instead, using a smaller game state reducing the dimension of the\nparameter space to include essential parameters only seems to be a promising\napproach. In the game of Foosball, a compact and comprehensive game state\ndescription consists of the positional shifts and rotations of the figures and\nthe position of the ball over time. In particular, velocities and accelerations\ncan be derived from consecutive time samples of the game state. In this paper,\na figure detection system to determine the game state in Foosball is presented.\nWe capture a dataset containing the rotations of the rods which were measured\nusing accelerometers and the positional shifts were derived using traditional\nComputer Vision techniques (in a laboratory setting). This dataset is utilized\nto train Convolutional Neural Network (CNN) based end-to-end regression models\nto predict the rotations and shifts of each rod. We present an evaluation of\nour system using different state-of-the-art CNNs as base architectures for the\nregression model. We show that our system is able to predict the game state\nwith high accuracy. 
By providing data for both black and white teams, the\npresented system is intended to provide the required data for future\ndevelopments of Imitation Learning techniques w.r.t. to observing human\nplayers.\n","authors":["David Hagens","Jan Knaup","Elke Hergenröther","Andreas Weinmann"],"pdf_url":"https://arxiv.org/pdf/2404.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05348v1","updated":"2024-04-08T09:33:40Z","published":"2024-04-08T09:33:40Z","title":"Iterative Refinement Strategy for Automated Data Labeling: Facial\n Landmark Diagnosis in Medical Imaging","summary":" Automated data labeling techniques are crucial for accelerating the\ndevelopment of deep learning models, particularly in complex medical imaging\napplications. However, ensuring accuracy and efficiency remains challenging.\nThis paper presents iterative refinement strategies for automated data labeling\nin facial landmark diagnosis to enhance accuracy and efficiency for deep\nlearning models in medical applications, including dermatology, plastic\nsurgery, and ophthalmology. Leveraging feedback mechanisms and advanced\nalgorithms, our approach iteratively refines initial labels, reducing reliance\non manual intervention while improving label quality. Through empirical\nevaluation and case studies, we demonstrate the effectiveness of our proposed\nstrategies in deep learning tasks across medical imaging domains. Our results\nhighlight the importance of iterative refinement in automated data labeling to\nenhance the capabilities of deep learning systems in medical imaging\napplications.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13263v2","updated":"2024-04-08T09:31:33Z","published":"2023-06-23T02:19:52Z","title":"Synthetic data shuffling accelerates the convergence of federated\n learning under data heterogeneity","summary":" In federated learning, data heterogeneity is a critical challenge. A\nstraightforward solution is to shuffle the clients' data to homogenize the\ndistribution. However, this may violate data access rights, and how and when\nshuffling can accelerate the convergence of a federated optimization algorithm\nis not theoretically well understood. In this paper, we establish a precise and\nquantifiable correspondence between data heterogeneity and parameters in the\nconvergence rate when a fraction of data is shuffled across clients. We prove\nthat shuffling can quadratically reduce the gradient dissimilarity with respect\nto the shuffling percentage, accelerating convergence. Inspired by the theory,\nwe propose a practical approach that addresses the data access rights issue by\nshuffling locally generated synthetic data. The experimental results show that\nshuffling synthetic data improves the performance of multiple existing\nfederated learning algorithms by a large margin.\n","authors":["Bo Li","Yasin Esfandiari","Mikkel N. Schmidt","Tommy S. Alstrøm","Sebastian U. 
Stich"],"pdf_url":"https://arxiv.org/pdf/2306.13263v2.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2404.05341v1","updated":"2024-04-08T09:27:42Z","published":"2024-04-08T09:27:42Z","title":"Comparative Analysis of Image Enhancement Techniques for Brain Tumor\n Segmentation: Contrast, Histogram, and Hybrid Approaches","summary":" This study systematically investigates the impact of image enhancement\ntechniques on Convolutional Neural Network (CNN)-based Brain Tumor\nSegmentation, focusing on Histogram Equalization (HE), Contrast Limited\nAdaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing\nthe U-Net architecture on a dataset of 3064 Brain MRI images, the research\ndelves into preprocessing steps, including resizing and enhancement, to\noptimize segmentation accuracy. A detailed analysis of the CNN-based U-Net\narchitecture, training, and validation processes is provided. The comparative\nanalysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals\nthat the hybrid approach CLAHE-HE consistently outperforms others. Results\nhighlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing,\nand validation, respectively) and robust segmentation overlap, with Jaccard\nvalues of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and\n0.9932 for the same phases, emphasizing its potential in neuro-oncological\napplications. The study concludes with a call for refinement in segmentation\nmethodologies to further enhance diagnostic precision and treatment planning in\nneuro-oncology.\n","authors":["Shoffan Saifullah","Andri Pranolo","Rafał Dreżewski"],"pdf_url":"https://arxiv.org/pdf/2404.05341v1.pdf","comment":"9 Pages, & Figures, 2 Tables, International Conference on Computer\n Science Electronics and Information (ICCSEI 2023)"},{"id":"http://arxiv.org/abs/2404.05331v1","updated":"2024-04-08T09:18:32Z","published":"2024-04-08T09:18:32Z","title":"Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask\n Prompt","summary":" Text-to-image generation has witnessed great progress, especially with the\nrecent advancements in diffusion models. Since texts cannot provide detailed\nconditions like object appearance, reference images are usually leveraged for\nthe control of objects in the generated images. However, existing methods still\nsuffer limited accuracy when the relationship between the foreground and\nbackground is complicated. To address this issue, we develop a framework termed\nMask-ControlNet by introducing an additional mask prompt. Specifically, we\nfirst employ large vision models to obtain masks to segment the objects of\ninterest in the reference image. Then, the object images are employed as\nadditional prompts to facilitate the diffusion model to better understand the\nrelationship between foreground and background regions during image generation.\nExperiments show that the mask prompts enhance the controllability of the\ndiffusion model to maintain higher fidelity to the reference image while\nachieving better image quality. 
Comparison with previous text-to-image\ngeneration methods demonstrates our method's superior quantitative and\nqualitative performance on the benchmark datasets.\n","authors":["Zhiqi Huang","Huixin Xiong","Haoyu Wang","Longguang Wang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v1","updated":"2024-04-08T09:08:43Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.12017v2","updated":"2024-04-08T09:02:40Z","published":"2023-03-21T16:54:01Z","title":"Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR\n Fusion","summary":" In this paper, we study the problem of jointly estimating the optical flow\nand scene flow from synchronized 2D and 3D data. Previous methods either employ\na complex pipeline that splits the joint task into independent stages, or fuse\n2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such\none-size-fits-all approaches suffer from a dilemma of failing to fully utilize\nthe characteristic of each modality or to maximize the inter-modality\ncomplementarity. To address the problem, we propose a novel end-to-end\nframework, which consists of 2D and 3D branches with multiple bidirectional\nfusion connections between them in specific layers. Different from previous\nwork, we apply a point-based 3D branch to extract the LiDAR features, as it\npreserves the geometric structure of point clouds. To fuse dense image features\nand sparse point features, we propose a learnable operator named bidirectional\ncamera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the\nbidirectional fusion pipeline, one based on the pyramidal coarse-to-fine\narchitecture (dubbed CamLiPWC), and the other one based on the recurrent\nall-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC\nand CamLiRAFT surpass all existing methods and achieve up to a 47.9\\% reduction\nin 3D end-point-error from the best published result. Our best-performing\nmodel, CamLiRAFT, achieves an error of 4.26\\% on the KITTI Scene Flow\nbenchmark, ranking 1st among all submissions with much fewer parameters.\nBesides, our methods have strong generalization performance and the ability to\nhandle non-rigid motion. 
Code is available at\nhttps://github.com/MCG-NJU/CamLiFlow.\n","authors":["Haisong Liu","Tao Lu","Yihui Xu","Jia Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12017v2.pdf","comment":"Accepted to TPAMI 2023"},{"id":"http://arxiv.org/abs/2404.05309v1","updated":"2024-04-08T08:57:32Z","published":"2024-04-08T08:57:32Z","title":"CLIPping the Limits: Finding the Sweet Spot for Relevant Images in\n Automated Driving Systems Perception Testing","summary":" Perception systems, especially cameras, are the eyes of automated driving\nsystems. Ensuring that they function reliably and robustly is therefore an\nimportant building block in the automation of vehicles. There are various\napproaches to test the perception of automated driving systems. Ultimately,\nhowever, it always comes down to the investigation of the behavior of\nperception systems under specific input data. Camera images are a crucial part\nof the input data. Image data sets are therefore collected for the testing of\nautomated driving systems, but it is non-trivial to find specific images in\nthese data sets. Thanks to recent developments in neural networks, there are\nnow methods for sorting the images in a data set according to their similarity\nto a prompt in natural language. In order to further automate the provision of\nsearch results, we make a contribution by automating the threshold definition\nin these sorted results and returning only the images relevant to the prompt as\na result. Our focus is on preventing false positives and false negatives\nequally. It is also important that our method is robust and in the case that\nour assumptions are not fulfilled, we provide a fallback solution.\n","authors":["Philipp Rigoll","Laurenz Adolph","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2404.05309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05307v1","updated":"2024-04-08T08:53:54Z","published":"2024-04-08T08:53:54Z","title":"Human Detection from 4D Radar Data in Low-Visibility Field Conditions","summary":" Autonomous driving technology is increasingly being used on public roads and\nin industrial settings such as mines. While it is essential to detect\npedestrians, vehicles, or other obstacles, adverse field conditions negatively\naffect the performance of classical sensors such as cameras or lidars. Radar,\non the other hand, is a promising modality that is less affected by, e.g.,\ndust, smoke, water mist or fog. In particular, modern 4D imaging radars provide\ntarget responses across the range, vertical angle, horizontal angle and Doppler\nvelocity dimensions. We propose TMVA4D, a CNN architecture that leverages this\n4D radar modality for semantic segmentation. The CNN is trained to distinguish\nbetween the background and person classes based on a series of 2D projections\nof the 4D radar data that include the elevation, azimuth, range, and Doppler\nvelocity dimensions. We also outline the process of compiling a novel dataset\nconsisting of data collected in industrial settings with a car-mounted 4D radar\nand describe how the ground-truth labels were generated from reference thermal\nimages. 
Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an\nmDice score of 86.1%, evaluated on the two classes background and person.\n","authors":["Mikael Skog","Oleksandr Kotlyar","Vladimír Kubelka","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2404.05307v1.pdf","comment":"Submitted to Radar in Robotics workshop at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05300v1","updated":"2024-04-08T08:42:47Z","published":"2024-04-08T08:42:47Z","title":"Texture Classification Network Integrating Adaptive Wavelet Transform","summary":" Graves' disease is a common condition that is diagnosed clinically by\ndetermining the smoothness of the thyroid texture and its morphology in\nultrasound images. Currently, the most widely used approach for the automated\ndiagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for\nboth feature extraction and classification. However, these methods demonstrate\nlimited efficacy in capturing texture features. Given the high capacity of\nwavelets in describing texture features, this research integrates learnable\nwavelet modules utilizing the Lifting Scheme into CNNs and incorporates a\nparallel wavelet branch into the ResNet18 model to enhance texture feature\nextraction. Our model can analyze texture features in spatial and frequency\ndomains simultaneously, leading to optimized classification accuracy. We\nconducted experiments on collected ultrasound datasets and publicly available\nnatural image texture datasets; our proposed network achieved 97.27% accuracy\nand 95.60% recall on ultrasound datasets, 60.765% accuracy on natural image\ntexture datasets, surpassing the accuracy of ResNet and confirming the\neffectiveness of our approach.\n","authors":["Su-Xi Yu","Jing-Yuan He","Yi Wang","Yu-Jiao Cai","Jun Yang","Bo Lin","Wei-Bin Yang","Jian Ruan"],"pdf_url":"https://arxiv.org/pdf/2404.05300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05290v1","updated":"2024-04-08T08:28:19Z","published":"2024-04-08T08:28:19Z","title":"MindSet: Vision. A toolbox for testing DNNs on key psychological\n experiments","summary":" Multiple benchmarks have been developed to assess the alignment between deep\nneural networks (DNNs) and human vision. In almost all cases these benchmarks\nare observational in the sense they are composed of behavioural and brain\nresponses to naturalistic images that have not been manipulated to test\nhypotheses regarding how DNNs or humans perceive and identify objects. Here we\nintroduce the toolbox MindSet: Vision, consisting of a collection of image\ndatasets and related scripts designed to test DNNs on 30 psychological\nfindings. In all experimental conditions, the stimuli are systematically\nmanipulated to test specific hypotheses regarding human visual perception and\nobject recognition. In addition to providing pre-generated datasets of images,\nwe provide code to regenerate these datasets, offering many configurable\nparameters which greatly extend the dataset versatility for different research\ncontexts, and code to facilitate the testing of DNNs on these image datasets\nusing three different methods (similarity judgments, out-of-distribution\nclassification, and decoder method), accessible at\nhttps://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of\nthese methods as an example of how the toolbox can be used.\n","authors":["Valerio Biscione","Dong Yin","Gaurav Malhotra","Marin Dujmovic","Milton L. Montero","Guillermo Puebla","Federico Adolfi","Rachel F. Heaton","John E. 
Hummel","Benjamin D. Evans","Karim Habashy","Jeffrey S. Bowers"],"pdf_url":"https://arxiv.org/pdf/2404.05290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05285v1","updated":"2024-04-08T08:20:53Z","published":"2024-04-08T08:20:53Z","title":"Detecting Every Object from Events","summary":" Object detection is critical in autonomous driving, and it is more practical\nyet challenging to localize objects of unknown categories: an endeavour known\nas Class-Agnostic Object Detection (CAOD). Existing studies on CAOD\npredominantly rely on ordinary cameras, but these frame-based sensors usually\nhave high latency and limited dynamic range, leading to safety risks in\nreal-world scenarios. In this study, we turn to a new modality enabled by the\nso-called event camera, featured by its sub-millisecond latency and high\ndynamic range, for robust CAOD. We propose Detecting Every Object in Events\n(DEOE), an approach tailored for achieving high-speed, class-agnostic\nopen-world object detection in event-based vision. Built upon the fast\nevent-based backbone: recurrent vision transformer, we jointly consider the\nspatial and temporal consistencies to identify potential objects. The\ndiscovered potential objects are assimilated as soft positive samples to avoid\nbeing suppressed as background. Moreover, we introduce a disentangled\nobjectness head to separate the foreground-background classification and novel\nobject discovery tasks, enhancing the model's generalization in localizing\nnovel objects while maintaining a strong ability to filter out the background.\nExtensive experiments confirm the superiority of our proposed DEOE in\ncomparison with three strong baseline methods that integrate the\nstate-of-the-art event-based object detector with advancements in RGB-based\nCAOD. Our code is available at https://github.com/Hatins/DEOE.\n","authors":["Haitian Zhang","Chang Xu","Xinya Wang","Bingde Liu","Guang Hua","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v3","updated":"2024-04-08T08:18:33Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. 
Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v3.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.05280v1","updated":"2024-04-08T08:11:56Z","published":"2024-04-08T08:11:56Z","title":"MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues","summary":" 3D object detection based on roadside cameras is an additional way for\nautonomous driving to alleviate the challenges of occlusion and short\nperception range from vehicle cameras. Previous methods for roadside 3D object\ndetection mainly focus on modeling the depth or height of objects, neglecting\nthe stationary of cameras and the characteristic of inter-frame consistency. In\nthis work, we propose a novel framework, namely MOSE, for MOnocular 3D object\ndetection with Scene cuEs. The scene cues are the frame-invariant\nscene-specific features, which are crucial for object localization and can be\nintuitively regarded as the height between the surface of the real road and the\nvirtual ground plane. In the proposed framework, a scene cue bank is designed\nto aggregate scene cues from multiple frames of the same scene with a carefully\ndesigned extrinsic augmentation strategy. Then, a transformer-based decoder\nlifts the aggregated scene cues as well as the 3D position embeddings for 3D\nobject location, which boosts generalization ability in heterologous scenes.\nThe extensive experiment results on two public benchmarks demonstrate the\nstate-of-the-art performance of the proposed method, which surpasses the\nexisting methods by a large margin.\n","authors":["Xiahan Chen","Mingjian Chen","Sanli Tang","Yi Niu","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00916v2","updated":"2024-04-08T08:08:43Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on the curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. 
Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.05274v1","updated":"2024-04-08T08:04:44Z","published":"2024-04-08T08:04:44Z","title":"Deep Optics for Video Snapshot Compressive Imaging","summary":" Video snapshot compressive imaging (SCI) aims to capture a sequence of video\nframes with only a single shot of a 2D detector, whose backbones rest in\noptical modulation patterns (also known as masks) and a computational\nreconstruction algorithm. Advanced deep learning algorithms and mature hardware\nare putting video SCI into practical applications. Yet, there are two clouds in\nthe sunshine of SCI: i) low dynamic range as a victim of high temporal\nmultiplexing, and ii) existing deep learning algorithms' degradation on real\nsystem. To address these challenges, this paper presents a deep optics\nframework to jointly optimize masks and a reconstruction network. Specifically,\nwe first propose a new type of structural mask to realize motion-aware and\nfull-dynamic-range measurement. Considering the motion awareness property in\nmeasurement domain, we develop an efficient network for video SCI\nreconstruction using Transformer to capture long-term temporal dependencies,\ndubbed Res2former. Moreover, sensor response is introduced into the forward\nmodel of video SCI to guarantee end-to-end model training close to real system.\nFinally, we implement the learned structural masks on a digital micro-mirror\ndevice. Experimental results on synthetic and real data validate the\neffectiveness of the proposed framework. We believe this is a milestone for\nreal-world video SCI. The source code and data are available at\nhttps://github.com/pwangcs/DeepOpticsSCI.\n","authors":["Ping Wang","Lishun Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05274v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2404.05268v1","updated":"2024-04-08T07:59:04Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. 
Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05264v1","updated":"2024-04-08T07:54:18Z","published":"2024-04-08T07:54:18Z","title":"Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in\n Multimodal Large Language Model Security","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities\nthat increasingly influence various aspects of our daily lives, constantly\ndefining the new boundary of Artificial General Intelligence (AGI). Image\nmodalities, enriched with profound semantic information and a more continuous\nmathematical nature compared to other modalities, greatly enhance the\nfunctionalities of MLLMs when integrated. However, this integration serves as a\ndouble-edged sword, providing attackers with expansive vulnerabilities to\nexploit for highly covert and harmful attacks. The pursuit of reliable AI\nsystems like powerful MLLMs has emerged as a pivotal area of contemporary\nresearch. In this paper, we endeavor to demonstrate the multifaceted risks\nassociated with the incorporation of image modalities into MLLMs. Initially, we\ndelineate the foundational components and training processes of MLLMs.\nSubsequently, we construct a threat model, outlining the security\nvulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing\nscholarly discourses on MLLMs' attack and defense mechanisms, culminating in\nsuggestions for future research on MLLM security. Through this\ncomprehensive analysis, we aim to deepen the academic understanding of MLLM\nsecurity challenges and propel forward the development of trustworthy MLLM\nsystems.\n","authors":["Yihe Fan","Yuxin Cao","Ziyu Zhao","Ziyao Liu","Shaofeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05264v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.00936v3","updated":"2024-04-08T07:52:38Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. 
The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v3.pdf","comment":"36 pages ,10 figures"},{"id":"http://arxiv.org/abs/2309.03467v2","updated":"2024-04-08T07:49:47Z","published":"2023-09-07T03:22:59Z","title":"Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree\n Image Generation","summary":" A 360-degree (omni-directional) image provides an all-encompassing spherical\nview of a scene. Recently, there has been an increasing interest in\nsynthesising 360-degree images from conventional narrow field of view (NFoV)\nimages captured by digital cameras and smartphones, for providing immersive\nexperiences in various scenarios such as virtual reality. Yet, existing methods\ntypically fall short in synthesizing intricate visual details or ensure the\ngenerated images align consistently with user-provided prompts. In this study,\nautoregressive omni-aware generative network (AOG-Net) is proposed for\n360-degree image generation by out-painting an incomplete 360-degree image\nprogressively with NFoV and text guidances joinly or individually. This\nautoregressive scheme not only allows for deriving finer-grained and\ntext-consistent patterns by dynamically generating and adjusting the process\nbut also offers users greater flexibility to edit their conditions throughout\nthe generation process. A global-local conditioning mechanism is devised to\ncomprehensively formulate the outpainting guidance in each autoregressive step.\nText guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and\nfurther formulated with cross-attention based transformers into a global stream\nand a local stream into a conditioned generative backbone model. As AOG-Net is\ncompatible to leverage large-scale models for the conditional encoder and the\ngenerative prior, it enables the generation to use extensive open-vocabulary\ntext guidances. Comprehensive experiments on two commonly used 360-degree image\ndatasets for both indoor and outdoor settings demonstrate the state-of-the-art\nperformance of our proposed method. Our code will be made publicly available.\n","authors":["Zhuqiang Lu","Kun Hu","Chaoyue Wang","Lei Bai","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03467v2.pdf","comment":"Accepted by AAAI 24"},{"id":"http://arxiv.org/abs/2404.05258v1","updated":"2024-04-08T07:47:28Z","published":"2024-04-08T07:47:28Z","title":"Unsupervised Band Selection Using Fused HSI and LiDAR Attention\n Integrating With Autoencoder","summary":" Band selection in hyperspectral imaging (HSI) is critical for optimising data\nprocessing and enhancing analytical accuracy. Traditional approaches have\npredominantly concentrated on analysing spectral and pixel characteristics\nwithin individual bands independently. These approaches overlook the potential\nbenefits of integrating multiple data sources, such as Light Detection and\nRanging (LiDAR), and is further challenged by the limited availability of\nlabeled data in HSI processing, which represents a significant obstacle. To\naddress these challenges, this paper introduces a novel unsupervised band\nselection framework that incorporates attention mechanisms and an Autoencoder\nfor reconstruction-based band selection. 
Our methodology distinctively\nintegrates HSI with LiDAR data through an attention score, using a\nconvolutional Autoencoder to process the combined feature mask. This fusion\neffectively captures essential spatial and spectral features and reduces\nredundancy in hyperspectral datasets. A comprehensive comparative analysis of\nour innovative fused band selection approach is performed against existing\nunsupervised band selection and fusion models. We used data sets such as\nHouston 2013, Trento, and MUUFLE for our experiments. The results demonstrate\nthat our method achieves superior classification accuracy and significantly\noutperforms existing models. This enhancement in HSI band selection,\nfacilitated by the incorporation of LiDAR features, underscores the\nconsiderable advantages of integrating features from different sources.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.05258v1.pdf","comment":"13 pages, 13figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.05256v1","updated":"2024-04-08T07:43:23Z","published":"2024-04-08T07:43:23Z","title":"Text-to-Image Synthesis for Any Artistic Styles: Advancements in\n Personalized Artistic Image Generation via Subdivision and Dual Binding","summary":" Recent advancements in text-to-image models, such as Stable Diffusion, have\ndemonstrated their ability to synthesize visual images through natural language\nprompts. One approach of personalizing text-to-image models, exemplified by\nDreamBooth, fine-tunes the pre-trained model by binding unique text identifiers\nwith a few images of a specific subject. Although existing fine-tuning methods\nhave demonstrated competence in rendering images according to the styles of\nfamous painters, it is still challenging to learn to produce images\nencapsulating distinct art styles due to abstract and broad visual perceptions\nof stylistic attributes such as lines, shapes, textures, and colors. In this\npaper, we introduce a new method, Single-StyleForge, for personalization. It\nfine-tunes pre-trained text-to-image diffusion models to generate diverse\nimages in specified styles from text prompts. By using around 15-20 images of\nthe target style, the approach establishes a foundational binding of a unique\ntoken identifier with a broad range of the target style. It also utilizes\nauxiliary images to strengthen this binding, resulting in offering specific\nguidance on representing elements such as persons in a target style-consistent\nmanner. In addition, we present ways to improve the quality of style and\ntext-image alignment through a method called Multi-StyleForge, which inherits\nthe strategy used in StyleForge and learns tokens in multiple. 
Experimental\nevaluation conducted on six distinct artistic styles demonstrates substantial\nimprovements in both the quality of generated images and the perceptual\nfidelity metrics, such as FID, KID, and CLIP scores.\n","authors":["Junseo Park","Beomseok Ko","Hyeryung Jang"],"pdf_url":"https://arxiv.org/pdf/2404.05256v1.pdf","comment":"20 pages, 12 figuers"},{"id":"http://arxiv.org/abs/2404.05253v1","updated":"2024-04-08T07:34:39Z","published":"2024-04-08T07:34:39Z","title":"CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) aims to improve low-illumination images.\nHowever, existing methods face two challenges: (1) uncertainty in restoration\nfrom diverse brightness degradations; (2) loss of texture and color information\ncaused by noise suppression and light enhancement. In this paper, we propose a\nnovel enhancement approach, CodeEnhance, by leveraging quantized priors and\nimage refinement to address these challenges. In particular, we reframe LLIE as\nlearning an image-to-code mapping from low-light images to discrete codebook,\nwhich has been learned from high-quality images. To enhance this process, a\nSemantic Embedding Module (SEM) is introduced to integrate semantic information\nwith low-level features, and a Codebook Shift (CS) mechanism, designed to adapt\nthe pre-learned codebook to better suit the distinct characteristics of our\nlow-light dataset. Additionally, we present an Interactive Feature\nTransformation (IFT) module to refine texture and color information during\nimage reconstruction, allowing for interactive enhancement based on user\npreferences. Extensive experiments on both real-world and synthetic benchmarks\ndemonstrate that the incorporation of prior knowledge and controllable\ninformation transfer significantly enhances LLIE performance in terms of\nquality and fidelity. The proposed CodeEnhance exhibits superior robustness to\nvarious degradations, including uneven illumination, noise, and color\ndistortion.\n","authors":["Xu Wu","XianXu Hou","Zhihui Lai","Jie Zhou","Ya-nan Zhang","Witold Pedrycz","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.05253v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.03203v3","updated":"2024-04-08T07:19:52Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. 
This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05238v1","updated":"2024-04-08T07:09:15Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve a human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for explaining the input features that\nare important to AI's decisions. It is an interesting but unexplored question\nwhether allowing users to edit the importance scores of input features at test\ntime would improve the human-AI team's accuracy on downstream tasks. In this\npaper, we address this question by taking CHM-Corr, a state-of-the-art,\nante-hoc explanation method \\cite{taesiri2022visual} that first predicts\npatch-wise correspondences between the input and the training-set images, and\nthen uses them to make classification decisions. We build an interactive\ninterface on top of CHM-Corr, enabling users to directly edit the initial\nfeature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,\nusers gain insights into if, when, and how the model changes its outputs,\nenhancing understanding beyond static explanations. Our user study with 18\nmachine learning researchers who performed $\\sim$1,400 decisions shows that our\ninteractive approach does not improve user accuracy on CUB-200 bird image\nclassification over static explanations. This challenges the belief that\ninteractivity inherently boosts XAI\neffectiveness~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. Our work contributes to the field by\nopen-sourcing an interactive tool for manipulating model attention, and it lays\nthe groundwork for future research to enable effective human-AI interaction in\ncomputer vision. We release code and data on\n\\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our\ninterface are available \\href{http://137.184.82.109:7080/}{here}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. 
Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v1.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2312.07246v2","updated":"2024-04-08T07:07:02Z","published":"2023-12-12T13:22:44Z","title":"Unifying Correspondence, Pose and NeRF for Pose-Free Novel View\n Synthesis from Stereo Pairs","summary":" This work delves into the task of pose-free novel view synthesis from stereo\npairs, a challenging and pioneering task in 3D vision. Our innovative\nframework, unlike any before, seamlessly integrates 2D correspondence matching,\ncamera pose estimation, and NeRF rendering, fostering a synergistic enhancement\nof these tasks. We achieve this through designing an architecture that utilizes\na shared representation, which serves as a foundation for enhanced 3D geometry\nunderstanding. Capitalizing on the inherent interplay between the tasks, our\nunified framework is trained end-to-end with the proposed training strategy to\nimprove overall model accuracy. Through extensive evaluations across diverse\nindoor and outdoor scenes from two real-world datasets, we demonstrate that our\napproach achieves substantial improvement over previous methodologies,\nespecially in scenarios characterized by extreme viewpoint changes and the\nabsence of accurate camera poses.\n","authors":["Sunghwan Hong","Jaewoo Jung","Heeseong Shin","Jiaolong Yang","Seungryong Kim","Chong Luo"],"pdf_url":"https://arxiv.org/pdf/2312.07246v2.pdf","comment":"Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera\n ready version (Highlight)"},{"id":"http://arxiv.org/abs/2404.05236v1","updated":"2024-04-08T07:01:42Z","published":"2024-04-08T07:01:42Z","title":"Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation","summary":" Recently, a surge of 3D style transfer methods has been proposed that\nleverage the scene reconstruction power of a pre-trained neural radiance field\n(NeRF). To successfully stylize a scene this way, one must first reconstruct a\nphoto-realistic radiance field from collected images of the scene. However,\nwhen only sparse input views are available, pre-trained few-shot NeRFs often\nsuffer from high-frequency artifacts, which are generated as a by-product of\nhigh-frequency details for improving reconstruction quality. Is it possible to\ngenerate more faithful stylized scenes from sparse inputs by directly\noptimizing encoding-based scene representation with target style? In this\npaper, we consider the stylization of sparse-view scenes in terms of\ndisentangling content semantics and style textures. We propose a coarse-to-fine\nsparse-view scene stylization framework, where a novel hierarchical\nencoding-based neural representation is designed to generate high-quality\nstylized scenes directly from implicit scene representations. We also propose a\nnew optimization strategy with content strength annealing to achieve realistic\nstylization and better content preservation. Extensive experiments demonstrate\nthat our method can achieve high-quality stylization of sparse-view scenes and\noutperforms fine-tuning-based baselines in terms of stylization quality and\nefficiency.\n","authors":["Y. Wang","A. Gao","Y. Gong","Y. 
Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.05236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05231v1","updated":"2024-04-08T06:53:30Z","published":"2024-04-08T06:53:30Z","title":"PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly\n Detection","summary":" The vision-language model has brought great improvement to few-shot\nindustrial anomaly detection, which usually needs to design of hundreds of\nprompts through prompt engineering. For automated scenarios, we first use\nconventional prompt learning with many-class paradigm as the baseline to\nautomatically learn prompts but found that it can not work well in one-class\nanomaly detection. To address the above problem, this paper proposes a\none-class prompt learning method for few-shot anomaly detection, termed\nPromptAD. First, we propose semantic concatenation which can transpose normal\nprompts into anomaly prompts by concatenating normal prompts with anomaly\nsuffixes, thus constructing a large number of negative samples used to guide\nprompt learning in one-class setting. Furthermore, to mitigate the training\nchallenge caused by the absence of anomaly images, we introduce the concept of\nexplicit anomaly margin, which is used to explicitly control the margin between\nnormal prompt features and anomaly prompt features through a hyper-parameter.\nFor image-level/pixel-level anomaly detection, PromptAD achieves first place in\n11/12 few-shot settings on MVTec and VisA.\n","authors":["Xiaofan Li","Zhizhong Zhang","Xin Tan","Chengwei Chen","Yanyun Qu","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05231v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05225v1","updated":"2024-04-08T06:40:28Z","published":"2024-04-08T06:40:28Z","title":"LayoutLLM: Layout Instruction Tuning with Large Language Models for\n Document Understanding","summary":" Recently, leveraging large language models (LLMs) or multimodal large\nlanguage models (MLLMs) for document understanding has been proven very\npromising. However, previous works that employ LLMs/MLLMs for document\nunderstanding have not fully explored and utilized the document layout\ninformation, which is vital for precise document understanding. In this paper,\nwe propose LayoutLLM, an LLM/MLLM based method for document understanding. The\ncore of LayoutLLM is a layout instruction tuning strategy, which is specially\ndesigned to enhance the comprehension and utilization of document layouts. The\nproposed layout instruction tuning strategy consists of two components:\nLayout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture\nthe characteristics of document layout in Layout-aware Pre-training, three\ngroups of pre-training tasks, corresponding to document-level, region-level and\nsegment-level information, are introduced. Furthermore, a novel module called\nlayout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on\nregions relevant to the question and generate accurate answers. LayoutCoT is\neffective for boosting the performance of document understanding. Meanwhile, it\nbrings a certain degree of interpretability, which could facilitate manual\ninspection and correction. Experiments on standard benchmarks show that the\nproposed LayoutLLM significantly outperforms existing methods that adopt\nopen-source 7B LLMs/MLLMs for document understanding. 
The training data of the\nLayoutLLM is publicly available at\nhttps://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM\n","authors":["Chuwei Luo","Yufan Shen","Zhaoqing Zhu","Qi Zheng","Zhi Yu","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2404.05225v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05220v1","updated":"2024-04-08T06:32:11Z","published":"2024-04-08T06:32:11Z","title":"StylizedGS: Controllable Stylization for 3D Gaussian Splatting","summary":" With the rapid development of XR, 3D generation and editing are becoming more\nand more important, among which, stylization is an important tool of 3D\nappearance editing. It can achieve consistent 3D artistic stylization given a\nsingle reference style image and thus is a user-friendly editing way. However,\nrecent NeRF-based 3D stylization methods face efficiency issues that affect the\nactual user experience and the implicit nature limits its ability to transfer\nthe geometric pattern styles. Additionally, the ability for artists to exert\nflexible control over stylized scenes is considered highly desirable, fostering\nan environment conducive to creative exploration. In this paper, we introduce\nStylizedGS, a 3D neural style transfer framework with adaptable control over\nperceptual factors based on 3D Gaussian Splatting (3DGS) representation. The\n3DGS brings the benefits of high efficiency. We propose a GS filter to\neliminate floaters in the reconstruction which affects the stylization effects\nbefore stylization. Then the nearest neighbor-based style loss is introduced to\nachieve stylization by fine-tuning the geometry and color parameters of 3DGS,\nwhile a depth preservation loss with other regularizations is proposed to\nprevent the tampering of geometry content. Moreover, facilitated by specially\ndesigned losses, StylizedGS enables users to control color, stylized scale and\nregions during the stylization to possess customized capabilities. Our method\ncan attain high-quality stylization results characterized by faithful\nbrushstrokes and geometric consistency with flexible controls. Extensive\nexperiments across various scenes and styles demonstrate the effectiveness and\nefficiency of our method concerning both stylization quality and inference FPS.\n","authors":["Dingxi Zhang","Zhuoxun Chen","Yu-Jie Yuan","Fang-Lue Zhang","Zhenliang He","Shiguang Shan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.05220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v2","updated":"2024-04-08T06:28:13Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks and etc. 
Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2312.17118v3","updated":"2024-04-08T06:23:12Z","published":"2023-12-28T16:54:53Z","title":"Fully Sparse 3D Occupancy Prediction","summary":" Occupancy prediction plays a pivotal role in autonomous driving. Previous\nmethods typically construct dense 3D volumes, neglecting the inherent sparsity\nof the scene and suffering high computational costs. To bridge the gap, we\nintroduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc\ninitially reconstructs a sparse 3D representation from visual inputs and\nsubsequently predicts semantic/instance occupancy from the 3D sparse\nrepresentation by sparse queries. A mask-guided sparse sampling is designed to\nenable sparse queries to interact with 2D features in a fully sparse manner,\nthereby circumventing costly dense features or global attention. Additionally,\nwe design a thoughtful ray-based evaluation metric, namely RayIoU, to solve the\ninconsistency penalty along depths raised in traditional voxel-level mIoU\ncriteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of\n34.0, while maintaining a real-time inference speed of 17.3 FPS, with 7 history\nframes inputs. By incorporating more preceding frames to 15, SparseOcc\ncontinuously improves its performance to 35.1 RayIoU without whistles and\nbells. Code is available at https://github.com/MCG-NJU/SparseOcc.\n","authors":["Haisong Liu","Yang Chen","Haiguang Wang","Zetong Yang","Tianyu Li","Jia Zeng","Li Chen","Hongyang Li","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.17118v3.pdf","comment":"Add new metric: RayIoU"},{"id":"http://arxiv.org/abs/2404.05218v1","updated":"2024-04-08T06:15:13Z","published":"2024-04-08T06:15:13Z","title":"Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware\n Trajectory Conditioning","summary":" Human pose forecasting garners attention for its diverse applications.\nHowever, challenges in modeling the multi-modal nature of human motion and\nintricate interactions among agents persist, particularly with longer\ntimescales and more agents. In this paper, we propose an interaction-aware\ntrajectory-conditioned long-term multi-agent human pose forecasting model,\nutilizing a coarse-to-fine prediction approach: multi-modal global trajectories\nare initially forecasted, followed by respective local pose forecasts\nconditioned on each mode. In doing so, our Trajectory2Pose model introduces a\ngraph-based agent-wise interaction module for a reciprocal forecast of local\nmotion-conditioned global trajectory and trajectory-conditioned local pose. Our\nmodel effectively handles the multi-modality of human motion and the complexity\nof long-term multi-agent interactions, improving performance in complex\nenvironments. 
Furthermore, we address the lack of long-term (6s+) multi-agent\n(5+) datasets by constructing a new dataset from real-world images and 2D\nannotations, enabling a comprehensive evaluation of our proposed model.\nState-of-the-art prediction performance on both complex and simpler datasets\nconfirms the generalized effectiveness of our method. The code is available at\nhttps://github.com/Jaewoo97/T2P.\n","authors":["Jaewoo Jeong","Daehee Park","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2404.05218v1.pdf","comment":"2024 CVPR Highlight"},{"id":"http://arxiv.org/abs/2404.02135v3","updated":"2024-04-08T06:11:48Z","published":"2024-04-02T17:48:46Z","title":"Enhancing Ship Classification in Optical Satellite Imagery: Integrating\n Convolutional Block Attention Module with ResNet for Improved Performance","summary":" This study presents an advanced Convolutional Neural Network (CNN)\narchitecture for ship classification from optical satellite imagery,\nsignificantly enhancing performance through the integration of the\nConvolutional Block Attention Module (CBAM) and additional architectural\ninnovations. Building upon the foundational ResNet50 model, we first\nincorporated a standard CBAM to direct the model's focus towards more\ninformative features, achieving an accuracy of 87% compared to the baseline\nResNet50's 85%. Further augmentations involved multi-scale feature integration,\ndepthwise separable convolutions, and dilated convolutions, culminating in the\nEnhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable\naccuracy of 95%, with precision, recall, and f1-scores all witnessing\nsubstantial improvements across various ship classes. The bulk carrier and oil\ntanker classes, in particular, showcased nearly perfect precision and recall\nrates, underscoring the model's enhanced capability in accurately identifying\nand classifying ships. Attention heatmap analyses further validated the\nimproved model's efficacy, revealing a more focused attention on relevant ship\nfeatures, regardless of background complexities. These findings underscore the\npotential of integrating attention mechanisms and architectural innovations in\nCNNs for high-resolution satellite imagery classification. The study navigates\nthrough the challenges of class imbalance and computational costs, proposing\nfuture directions towards scalability and adaptability in new or rare ship type\nrecognition. This research lays a groundwork for the application of advanced\ndeep learning techniques in the domain of remote sensing, offering insights\ninto scalable and efficient satellite image classification.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Junseob Shin","Hyerin Cha","Yeom Hyeok","Seung Won Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02135v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05215v1","updated":"2024-04-08T06:07:32Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. 
To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v1.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.05212v1","updated":"2024-04-08T05:58:07Z","published":"2024-04-08T05:58:07Z","title":"DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage\n CJK Character Generation","summary":" Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,\nhas profound influence on society and culture. The typesetting of CJK languages\ncarries a wide range of requirements due to the complexity of their scripts and\nunique literary traditions. A critical aspect of this typesetting process is\nthat CJK fonts need to provide a set of consistent-looking glyphs for\napproximately one hundred thousand characters. However, creating such a font is\ninherently labor-intensive and expensive, which significantly hampers the\ndevelopment of new CJK fonts for typesetting, historical, aesthetic, or\nartistic purposes.\n To bridge this gap, we are motivated by recent advancements in\ndiffusion-based generative models and propose a novel diffusion method for\ngenerating glyphs in a targeted style from a \\emph{single} conditioned,\nstandard glyph form. Our experiments show that our method is capable of\ngenerating fonts of both printed and hand-written styles, the latter of which\npresents a greater challenge. Moreover, our approach shows remarkable zero-shot\ngeneralization capabilities for non-CJK but Chinese-inspired scripts. We also\nshow our method facilitates smooth style interpolation and generates bitmap\nimages suitable for vectorization, which is crucial in the font creation\nprocess. In summary, our proposed method opens the door to high-quality,\ngenerative model-assisted font creation for CJK characters, for both\ntypesetting and artistic endeavors.\n","authors":["Yingtao Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05211v1","updated":"2024-04-08T05:50:46Z","published":"2024-04-08T05:50:46Z","title":"Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image\n Clustering","summary":" Hyperspectral image (HSI) clustering is a challenging task due to its high\ncomplexity. 
Despite subspace clustering shows impressive performance for HSI,\ntraditional methods tend to ignore the global-local interaction in HSI data. In\nthis study, we proposed a multi-level graph subspace contrastive learning\n(MLGSC) for HSI clustering. The model is divided into the following main parts.\nGraph convolution subspace construction: utilizing spectral and texture\nfeautures to construct two graph convolution views. Local-global graph\nrepresentation: local graph representations were obtained by step-by-step\nconvolutions and a more representative global graph representation was obtained\nusing an attention-based pooling strategy. Multi-level graph subspace\ncontrastive learning: multi-level contrastive learning was conducted to obtain\nlocal-global joint graph representations, to improve the consistency of the\npositive samples between views, and to obtain more robust graph embeddings.\nSpecifically, graph-level contrastive learning is used to better learn global\nrepresentations of HSI data. Node-level intra-view and inter-view contrastive\nlearning is designed to learn joint representations of local regions of HSI.\nThe proposed model is evaluated on four popular HSI datasets: Indian Pines,\nPavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,\n99.96%, 92.28%, and 95.73%, which significantly outperforms the current\nstate-of-the-art clustering methods.\n","authors":["Jingxin Wang","Renxiang Guan","Kainan Gao","Zihao Li","Hao Li","Xianju Li","Chang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05211v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.05210v1","updated":"2024-04-08T05:45:03Z","published":"2024-04-08T05:45:03Z","title":"Bidirectional Long-Range Parser for Sequential Data Understanding","summary":" The transformer is a powerful data modelling framework responsible for\nremarkable performance on a wide range of tasks. However, they are limited in\nterms of scalability as it is suboptimal and inefficient to process\nlong-sequence data. To this purpose we introduce BLRP (Bidirectional Long-Range\nParser), a novel and versatile attention mechanism designed to increase\nperformance and efficiency on long-sequence tasks. It leverages short and long\nrange heuristics in the form of a local sliding window approach combined with a\nglobal bidirectional latent space synthesis technique. We show the benefits and\nversatility of our approach on vision and language domains by demonstrating\ncompetitive results against state-of-the-art methods on the Long-Range-Arena\nand CIFAR benchmarks together with ablations demonstrating the computational\nefficiency.\n","authors":["George Leotescu","Daniel Voinea","Alin-Ionut Popa"],"pdf_url":"https://arxiv.org/pdf/2404.05210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05207v1","updated":"2024-04-08T05:23:12Z","published":"2024-04-08T05:23:12Z","title":"iVPT: Improving Task-relevant Information Sharing in Visual Prompt\n Tuning by Cross-layer Dynamic Connection","summary":" Recent progress has shown great potential of visual prompt tuning (VPT) when\nadapting pre-trained vision transformers to various downstream tasks. However,\nmost existing solutions independently optimize prompts at each layer, thereby\nneglecting the usage of task-relevant information encoded in prompt tokens\nacross layers. Additionally, existing prompt structures are prone to\ninterference from task-irrelevant noise in input images, which can do harm to\nthe sharing of task-relevant information. 
In this paper, we propose a novel VPT\napproach, \\textbf{iVPT}. It innovatively incorporates a cross-layer dynamic\nconnection (CDC) for input prompt tokens from adjacent layers, enabling\neffective sharing of task-relevant information. Furthermore, we design a\ndynamic aggregation (DA) module that facilitates selective sharing of\ninformation between layers. The combination of CDC and DA enhances the\nflexibility of the attention process within the VPT framework. Building upon\nthese foundations, iVPT introduces an attentive reinforcement (AR) mechanism,\nby automatically identifying salient image tokens, which are further enhanced\nby prompt tokens in an additive manner. Extensive experiments on 24 image\nclassification and semantic segmentation benchmarks clearly demonstrate the\nadvantage of the proposed iVPT, compared to the state-of-the-art counterparts.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05206v1","updated":"2024-04-08T05:19:28Z","published":"2024-04-08T05:19:28Z","title":"SoundingActions: Learning How Actions Sound from Narrated Egocentric\n Videos","summary":" We propose a novel self-supervised embedding to learn how actions sound from\nnarrated in-the-wild egocentric videos. Whereas existing methods rely on\ncurated data with known audio-visual correspondence, our multimodal\ncontrastive-consensus coding (MC3) embedding reinforces the associations\nbetween audio, language, and vision when all modality pairs agree, while\ndiminishing those associations when any one pair does not. We show our approach\ncan successfully discover how the long tail of human actions sound from\negocentric video, outperforming an array of recent multimodal embedding\ntechniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal\ntasks.\n","authors":["Changan Chen","Kumar Ashutosh","Rohit Girdhar","David Harwath","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2404.05206v1.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://vision.cs.utexas.edu/projects/soundingactions"},{"id":"http://arxiv.org/abs/2404.05205v1","updated":"2024-04-08T05:18:39Z","published":"2024-04-08T05:18:39Z","title":"A secure and private ensemble matcher using multi-vault obfuscated\n templates","summary":" Given the irrevocability of biometric samples and mounting privacy concerns,\nbiometric template security and secure matching are among the essential\nfeatures of any well-designed modern biometric system. In this paper, we\npropose an obfuscation method that hides the biometric template information\nwith just enough chaff. The main idea is to reduce the number of chaff points\nto a practical level by creating n sub-templates from the original template and\nhiding each sub-template with m chaff points. During verification, s closest\nvectors to the biometric query are retrieved from each vault and then combined\nto generate hash values that are compared with the stored hash value. We\ndemonstrate the effectiveness of synthetic facial images, generated by a\nGenerative Adversarial Network (GAN), as ``random chaff points'' within a\nsecure-vault authorization system. This approach safeguards user identities\nduring training and deployment. We tested our protocol using the AT&T, GT, and\nLFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and\n0.90, respectively. 
These numbers were close to those of the unprotected\ntemplates, showing that our method does not adversely affect accuracy.\n","authors":["Babak Poorebrahim Gilkalaye","Shubhabrata Mukherjee","Reza Derakhshani"],"pdf_url":"https://arxiv.org/pdf/2404.05205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11825v2","updated":"2024-04-08T05:11:47Z","published":"2023-11-20T15:03:56Z","title":"Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning","summary":" In this work, we use multi-view aerial images to reconstruct the geometry,\nlighting, and material of facades using neural signed distance fields (SDFs).\nWithout the requirement of complex equipment, our method only takes simple RGB\nimages captured by a drone as inputs to enable physically based and\nphotorealistic novel-view rendering, relighting, and editing. However, a\nreal-world facade usually has complex appearances ranging from diffuse rocks\nwith subtle details to large-area glass windows with specular reflections,\nmaking it hard to attend to everything. As a result, previous methods can\npreserve the geometry details but fail to reconstruct smooth glass windows or\nverse vise. In order to address this challenge, we introduce three spatial- and\nsemantic-adaptive optimization strategies, including a semantic regularization\napproach based on zero-shot segmentation techniques to improve material\nconsistency, a frequency-aware geometry regularization to balance surface\nsmoothness and details in different surfaces, and a visibility probe-based\nscheme to enable efficient modeling of the local lighting in large-scale\noutdoor environments. In addition, we capture a real-world facade aerial 3D\nscanning image set and corresponding point clouds for training and\nbenchmarking. The experiment demonstrates the superior quality of our method on\nfacade holistic inverse rendering, novel view synthesis, and scene editing\ncompared to state-of-the-art baselines.\n","authors":["Zixuan Xie","Rengan Xie","Rong Li","Kai Huang","Pengju Qiao","Jingsen Zhu","Xu Yin","Qi Ye","Wei Hua","Yuchi Huo","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2311.11825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01518v3","updated":"2024-04-08T05:09:19Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v3.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.05196v1","updated":"2024-04-08T04:53:29Z","published":"2024-04-08T04:53:29Z","title":"HSViT: Horizontally Scalable Vision Transformer","summary":" While the Vision Transformer (ViT) architecture gains prominence in computer\nvision and attracts significant attention from multimedia communities, its\ndeficiency in prior knowledge (inductive bias) regarding shift, scale, and\nrotational invariance necessitates pre-training on large-scale datasets.\nFurthermore, the growing layers and parameters in both ViT and convolutional\nneural networks (CNNs) impede their applicability to mobile multimedia\nservices, primarily owing to the constrained computational resources on edge\ndevices. To mitigate the aforementioned challenges, this paper introduces a\nnovel horizontally scalable vision transformer (HSViT). Specifically, a novel\nimage-level feature embedding allows ViT to better leverage the inductive bias\ninherent in the convolutional layers. Based on this, an innovative horizontally\nscalable architecture is designed, which reduces the number of layers and\nparameters of the models while facilitating collaborative training and\ninference of ViT models across multiple nodes. The experimental results depict\nthat, without pre-training on large-scale datasets, HSViT achieves up to 10%\nhigher top-1 accuracy than state-of-the-art schemes, ascertaining its superior\npreservation of inductive bias. The code is available at\nhttps://github.com/xuchenhao001/HSViT.\n","authors":["Chenhao Xu","Chang-Tsun Li","Chee Peng Lim","Douglas Creighton"],"pdf_url":"https://arxiv.org/pdf/2404.05196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05187v1","updated":"2024-04-08T04:27:36Z","published":"2024-04-08T04:27:36Z","title":"LGSDF: Continual Global Learning of Signed Distance Fields Aided by\n Local Updating","summary":" Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves\ntraining a neural network to regress the signed distance from any point to the\nnearest obstacle, which has the advantages of lightweight storage and\ncontinuous querying. However, existing algorithms usually rely on conflicting\nraw observations as training data, resulting in poor map performance. In this\npaper, we propose LGSDF, an ESDF continual Global learning algorithm aided by\nLocal updating. At the front end, axis-aligned grids are dynamically updated by\npre-processed sensor observations, where incremental fusion alleviates\nestimation error caused by limited viewing directions. At the back end, a\nrandomly initialized implicit ESDF neural network performs continual\nself-supervised learning guided by these grids to generate smooth and\ncontinuous maps. The results on multiple scenes show that LGSDF can construct\nmore accurate ESDF maps and meshes compared with SOTA (State Of The Art)\nexplicit and implicit mapping algorithms. 
The source code of LGSDF is publicly\navailable at https://github.com/BIT-DYN/LGSDF.\n","authors":["Yufeng Yue","Yinan Deng","Jiahui Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05183v1","updated":"2024-04-08T04:17:27Z","published":"2024-04-08T04:17:27Z","title":"Progressive Alignment with VLM-LLM Feature to Augment Defect\n Classification for the ASE Dataset","summary":" Traditional defect classification approaches are facing with two barriers.\n(1) Insufficient training data and unstable data quality. Collecting sufficient\ndefective sample is expensive and time-costing, consequently leading to dataset\nvariance. It introduces the difficulty on recognition and learning. (2)\nOver-dependence on visual modality. When the image pattern and texture is\nmonotonic for all defect classes in a given dataset, the performance of\nconventional AOI system cannot be guaranteed. In scenarios where image quality\nis compromised due to mechanical failures or when defect information is\ninherently difficult to discern, the performance of deep models cannot be\nguaranteed. A main question is, \"how to solve those two problems when they\noccur at the same time?\" The feasible strategy is to explore another feature\nwithin dataset and combine an eminent vision-language model (VLM) and\nLarge-Language model (LLM) with their astonishing zero-shot capability. In this\nwork, we propose the special ASE dataset, including rich data description\nrecorded on image, for defect classification, but the defect feature is uneasy\nto learn directly. Secondly, We present the prompting for VLM-LLM against\ndefect classification with the proposed ASE dataset to activate extra-modality\nfeature from images to enhance performance. Then, We design the novel\nprogressive feature alignment (PFA) block to refine image-text feature to\nalleviate the difficulty of alignment under few-shot scenario. Finally, the\nproposed Cross-modality attention fusion (CMAF) module can effectively fuse\ndifferent modality feature. Experiment results have demonstrated our method's\neffectiveness over several defect classification methods for the ASE dataset.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Chun-Hung Sun","Kuang-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2404.05183v1.pdf","comment":"MULA 2024"},{"id":"http://arxiv.org/abs/2404.05181v1","updated":"2024-04-08T04:13:35Z","published":"2024-04-08T04:13:35Z","title":"Adaptive Learning for Multi-view Stereo Reconstruction","summary":" Deep learning has recently demonstrated its excellent performance on the task\nof multi-view stereo (MVS). However, loss functions applied for deep MVS are\nrarely studied. In this paper, we first analyze existing loss functions'\nproperties for deep depth based MVS approaches. Regression based loss leads to\ninaccurate continuous results by computing mathematical expectation, while\nclassification based loss outputs discretized depth values. To this end, we\nthen propose a novel loss function, named adaptive Wasserstein loss, which is\nable to narrow down the difference between the true and predicted probability\ndistributions of depth. Besides, a simple but effective offset module is\nintroduced to better achieve sub-pixel prediction accuracy. 
Extensive\nexperiments on different benchmarks, including DTU, Tanks and Temples and\nBlendedMVS, show that the proposed method with the adaptive Wasserstein loss\nand the offset module achieves state-of-the-art performance.\n","authors":["Qinglu Min","Jie Zhao","Zhihao Zhang","Chen Min"],"pdf_url":"https://arxiv.org/pdf/2404.05181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05180v1","updated":"2024-04-08T04:10:50Z","published":"2024-04-08T04:10:50Z","title":"GloSoFarID: Global multispectral dataset for Solar Farm IDentification\n in satellite imagery","summary":" Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal\nsolution in the global pursuit of clean and renewable energy. This technology\naddresses the urgent need for sustainable energy alternatives by converting\nsolar power into electricity without greenhouse gas emissions. It not only\ncurtails global carbon emissions but also reduces reliance on finite,\nnon-renewable energy sources. In this context, monitoring solar panel farms\nbecomes essential for understanding and facilitating the worldwide shift toward\nclean energy. This study contributes to this effort by developing the first\ncomprehensive global dataset of multispectral satellite imagery of solar panel\nfarms. This dataset is intended to form the basis for training robust machine\nlearning models, which can accurately map and analyze the expansion and\ndistribution of solar panel farms globally. The insights gained from this\nendeavor will be instrumental in guiding informed decision-making for a\nsustainable energy future. https://github.com/yzyly1992/GloSoFarID\n","authors":["Zhiyuan Yang","Ryan Rad"],"pdf_url":"https://arxiv.org/pdf/2404.05180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05169v1","updated":"2024-04-08T03:33:01Z","published":"2024-04-08T03:33:01Z","title":"QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease\n Diagnosis","summary":" Due to the complexity of medical image acquisition and the difficulty of\nannotation, medical image datasets inevitably contain noise. Noisy data with\nwrong labels affects the robustness and generalization ability of deep neural\nnetworks. Previous noise learning methods mainly considered noise arising from\nimages being mislabeled, i.e. label noise, assuming that all mislabeled images\nare of high image quality. However, medical images are prone to suffering\nextreme quality issues, i.e. data noise, where discriminative visual features\nare missing for disease diagnosis. In this paper, we propose a noise learning\nframework, termed as QMix, that learns a robust disease diagnosis model under\nmixed noise. QMix alternates between sample separation and quality-aware\nsemisupervised training in each training epoch. In the sample separation phase,\nwe design a joint uncertainty-loss criterion to effectively separate (1)\ncorrectly labeled images; (2) mislabeled images with high quality and (3)\nmislabeled images with low quality. In the semi-supervised training phase, we\ntrain a disease diagnosis model to learn robust feature representation from the\nseparated samples. Specifically, we devise a sample-reweighing loss to mitigate\nthe effect of mislabeled images with low quality during training. Meanwhile, a\ncontrastive enhancement loss is proposed to further distinguish mislabeled\nimages with low quality from correctly labeled images. 
QMix achieved\nstate-of-the-art disease diagnosis performance on five public retinal image\ndatasets and exhibited substantial improvement on robustness against mixed\nnoise.\n","authors":["Junlin Hou","Jilan Xu","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05163v1","updated":"2024-04-08T03:06:19Z","published":"2024-04-08T03:06:19Z","title":"Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular\n Videos","summary":" In this work, we pioneer Semantic Flow, a neural semantic representation of\ndynamic scenes from monocular videos. In contrast to previous NeRF methods that\nreconstruct dynamic scenes from the colors and volume densities of individual\npoints, Semantic Flow learns semantics from continuous flows that contain rich\n3D motion information. As there is 2D-to-3D ambiguity problem in the viewing\ndirection when extracting 3D flow features from 2D video frames, we consider\nthe volume densities as opacity priors that describe the contributions of flow\nfeatures to the semantics on the frames. More specifically, we first learn a\nflow network to predict flows in the dynamic scene, and propose a flow feature\naggregation module to extract flow features from video frames. Then, we propose\na flow attention module to extract motion information from flow features, which\nis followed by a semantic network to output semantic logits of flows. We\nintegrate the logits with volume densities in the viewing direction to\nsupervise the flow features with semantic labels on video frames. Experimental\nresults show that our model is able to learn from multiple dynamic scenes and\nsupports a series of new tasks such as instance-level scene editing, semantic\ncompletions, dynamic scene tracking and semantic adaption on novel scenes.\nCodes are available at https://github.com/tianfr/Semantic-Flow/.\n","authors":["Fengrui Tian","Yueqi Duan","Angtian Wang","Jianfei Guo","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2404.05163v1.pdf","comment":"Accepted by ICLR 2024, Codes are available at\n https://github.com/tianfr/Semantic-Flow/"},{"id":"http://arxiv.org/abs/2311.08393v3","updated":"2024-04-08T02:57:55Z","published":"2023-11-14T18:53:28Z","title":"MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable\n Trajectory Generation","summary":" The learn-from-observation (LfO) paradigm is a human-inspired mode for a\nrobot to learn to perform a task simply by watching it being performed. LfO can\nfacilitate robot integration on factory floors by minimizing disruption and\nreducing tedious programming. A key component of the LfO pipeline is a\ntransformation of the depth camera frames to the corresponding task state and\naction pairs, which are then relayed to learning techniques such as imitation\nor inverse reinforcement learning for understanding the task parameters. While\nseveral existing computer vision models analyze videos for activity\nrecognition, SA-Net specifically targets robotic LfO from RGB-D data. However,\nSA-Net and many other models analyze frame data captured from a single\nviewpoint. Their analysis is therefore highly sensitive to occlusions of the\nobserved task, which are frequent in deployments. An obvious way of reducing\nocclusions is to simultaneously observe the task from multiple viewpoints and\nsynchronously fuse the multiple streams in the model. 
Toward this, we present\nmulti-view SA-Net, which generalizes the SA-Net model to allow the perception\nof multiple viewpoints of the task activity, integrate them, and better\nrecognize the state and action in each frame. Performance evaluations on two\ndistinct domains establish that MVSA-Net recognizes the state-action pairs\nunder occlusion more accurately compared to single-view MVSA-Net and other\nbaselines. Our ablation studies further evaluate its performance under\ndifferent ambient conditions and establish the contribution of the architecture\ncomponents. As such, MVSA-Net offers a significantly more robust and deployable\nstate-action trajectory generation compared to previous methods.\n","authors":["Ehsan Asali","Prashant Doshi","Jin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.08393v3.pdf","comment":"Presented at Deployable AI Workshop at AAAI-2024 and 'Towards\n Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023"},{"id":"http://arxiv.org/abs/2403.05805v2","updated":"2024-04-08T02:47:54Z","published":"2024-03-09T05:50:32Z","title":"And Then the Hammer Broke: Reflections on Machine Ethics from Feminist\n Philosophy of Science","summary":" Vision is an important metaphor in ethical and political questions of\nknowledge. The feminist philosopher Donna Haraway points out the ``perverse''\nnature of an intrusive, alienating, all-seeing vision (to which we might cry\nout ``stop looking at me!''), but also encourages us to embrace the embodied\nnature of sight and its promises for genuinely situated knowledge. Current\ntechnologies of machine vision -- surveillance cameras, drones (for war or\nrecreation), iPhone cameras -- are usually construed as instances of the former\nrather than the latter, and for good reasons. However, although in no way\nattempting to diminish the real suffering these technologies have brought about\nin the world, I make the case for understanding technologies of computer vision\nas material instances of embodied seeing and situated knowing. Furthermore,\nborrowing from Iris Murdoch's concept of moral vision, I suggest that these\ntechnologies direct our labor towards self-reflection in ethically significant\nways. My approach draws upon paradigms in computer vision research,\nphenomenology, and feminist epistemology. Ultimately, this essay is an argument\nfor directing more philosophical attention from merely criticizing technologies\nof vision as ethically deficient towards embracing them as complex,\nmethodologically and epistemologically important objects.\n","authors":["Andre Ye"],"pdf_url":"https://arxiv.org/pdf/2403.05805v2.pdf","comment":"Pacific University Philosophy Conference"},{"id":"http://arxiv.org/abs/2403.03954v3","updated":"2024-04-08T02:46:38Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. 
In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v3.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2404.00989v2","updated":"2024-04-08T02:37:25Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v2.pdf","comment":"CVPR 2024 (Oral Presentation), Project page:\n https://x360dataset.github.io/"},{"id":"http://arxiv.org/abs/2402.07819v2","updated":"2024-04-08T02:36:23Z","published":"2024-02-12T17:24:35Z","title":"A Benchmark Grocery Dataset of Realworld Point Clouds From Single View","summary":" Fine-grained grocery object recognition is an important computer vision\nproblem with broad applications in automatic checkout, in-store robotic\nnavigation, and assistive technologies for the visually impaired. Existing\ndatasets on groceries are mainly 2D images. Models trained on these datasets\nare limited to learning features from the regular 2D grids. 
While portable 3D\nsensors such as Kinect were commonly available for mobile phones, sensors such\nas LiDAR and TrueDepth, have recently been integrated into mobile phones.\nDespite the availability of mobile 3D sensors, there are currently no dedicated\nreal-world large-scale benchmark 3D datasets for grocery. In addition, existing\n3D datasets lack fine-grained grocery categories and have limited training\nsamples. Furthermore, collecting data by going around the object versus the\ntraditional photo capture makes data collection cumbersome. Thus, we introduce\na large-scale grocery dataset called 3DGrocery100. It constitutes 100 classes,\nwith a total of 87,898 3D point clouds created from 10,755 RGB-D single-view\nimages. We benchmark our dataset on six recent state-of-the-art 3D point cloud\nclassification models. Additionally, we also benchmark the dataset on few-shot\nand continual learning point cloud classification tasks. Project Page:\nhttps://bigdatavision.org/3DGrocery100/.\n","authors":["Shivanand Venkanna Sheshappanavar","Tejas Anvekar","Shivanand Kundargi","Yufan Wang","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2402.07819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02241v2","updated":"2024-04-08T02:06:37Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05145v1","updated":"2024-04-08T02:02:15Z","published":"2024-04-08T02:02:15Z","title":"UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic\n Segmentation in Adverse Weather","summary":" LiDAR semantic segmentation (LSS) is a critical task in autonomous driving\nand has achieved promising progress. 
However, prior LSS methods are\nconventionally investigated and evaluated on datasets within the same domain in\nclear weather. The robustness of LSS models in unseen scenes and all weather\nconditions is crucial for ensuring safety and reliability in real applications.\nTo this end, we propose UniMix, a universal method that enhances the\nadaptability and generalizability of LSS models. UniMix first leverages\nphysically valid adverse weather simulation to construct a Bridge Domain, which\nserves to bridge the domain gap between the clear weather scenes and the\nadverse weather scenes. Then, a Universal Mixing operator is defined regarding\nspatial, intensity, and semantic distributions to create the intermediate\ndomain with mixed samples from given domains. Integrating the proposed two\ntechniques into a teacher-student framework, UniMix efficiently mitigates the\ndomain gap and enables LSS models to learn weather-robust and domain-invariant\nrepresentations. We devote UniMix to two main setups: 1) unsupervised domain\nadaption, adapting the model from the clear weather source domain to the\nadverse weather target domain; 2) domain generalization, learning a model that\ngeneralizes well to unseen scenes in adverse weather. Extensive experiments\nvalidate the effectiveness of UniMix across different tasks and datasets, all\nachieving superior performance over state-of-the-art methods. The code will be\nreleased.\n","authors":["Haimei Zhao","Jing Zhang","Zhuo Chen","Shanshan Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.05145v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05144v1","updated":"2024-04-08T01:55:28Z","published":"2024-04-08T01:55:28Z","title":"Enhancing Clinical Efficiency through LLM: Discharge Note Generation for\n Cardiac Patients","summary":" Medical documentation, including discharge notes, is crucial for ensuring\npatient care quality, continuity, and effective medical communication. However,\nthe manual creation of these documents is not only time-consuming but also\nprone to inconsistencies and potential errors. The automation of this\ndocumentation process using artificial intelligence (AI) represents a promising\narea of innovation in healthcare. This study directly addresses the\ninefficiencies and inaccuracies in creating discharge notes manually,\nparticularly for cardiac patients, by employing AI techniques, specifically\nlarge language model (LLM). Utilizing a substantial dataset from a cardiology\ncenter, encompassing wide-ranging medical records and physician assessments,\nour research evaluates the capability of LLM to enhance the documentation\nprocess. Among the various models assessed, Mistral-7B distinguished itself by\naccurately generating discharge notes that significantly improve both\ndocumentation efficiency and the continuity of care for patients. These notes\nunderwent rigorous qualitative evaluation by medical expert, receiving high\nmarks for their clinical relevance, completeness, readability, and contribution\nto informed decision-making and care planning. Coupled with quantitative\nanalyses, these results confirm Mistral-7B's efficacy in distilling complex\nmedical information into concise, coherent summaries. Overall, our findings\nilluminate the considerable promise of specialized LLM, such as Mistral-7B, in\nrefining healthcare documentation workflows and advancing patient care. 
This\nstudy lays the groundwork for further integrating advanced AI technologies in\nhealthcare, demonstrating their potential to revolutionize patient\ndocumentation and support better care outcomes.\n","authors":["HyoJe Jung","Yunha Kim","Heejung Choi","Hyeram Seo","Minkyoung Kim","JiYe Han","Gaeun Kee","Seohyun Park","Soyoung Ko","Byeolhee Kim","Suyeon Kim","Tae Joon Jun","Young-Hak Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05144v1.pdf","comment":"10 pages, 1 figure, 3 tables, conference"},{"id":"http://arxiv.org/abs/2404.05139v1","updated":"2024-04-08T01:38:43Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v1.pdf","comment":"Accepted by ICRA 2022. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.05136v1","updated":"2024-04-08T01:29:10Z","published":"2024-04-08T01:29:10Z","title":"Self-Supervised Multi-Object Tracking with Path Consistency","summary":" In this paper, we propose a novel concept of path consistency to learn robust\nobject matching without using manual object identity supervision. Our key idea\nis that, to track a object through frames, we can obtain multiple different\nassociation results from a model by varying the frames it can observe, i.e.,\nskipping frames in observation. As the differences in observations do not alter\nthe identities of objects, the obtained association results should be\nconsistent. Based on this rationale, we generate multiple observation paths,\neach specifying a different set of frames to be skipped, and formulate the Path\nConsistency Loss that enforces the association results are consistent across\ndifferent observation paths. We use the proposed loss to train our object\nmatching model with only self-supervision. 
By extensive experiments on three\ntracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method\noutperforms existing unsupervised methods with consistent margins on various\nevaluation metrics, and even achieves performance close to supervised methods.\n","authors":["Zijia Lu","Bing Shuai","Yanbei Chen","Zhenlin Xu","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05136v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05129v1","updated":"2024-04-08T01:14:09Z","published":"2024-04-08T01:14:09Z","title":"Image-based Agarwood Resinous Area Segmentation using Deep Learning","summary":" The manual extraction method of Agarwood resinous compound is laborious work,\nrequires skilled workers, and is subject to human errors. Commercial Agarwood\nindustries have been actively exploring using Computer Numerical Control (CNC)\nmachines to replace human effort for this particular task. The CNC machine\naccepts a G-code script produced from a binary image in which the wood region\nthat needs to be chiselled off is marked with (0, 0, 0) as its RGB value.\nRather than requiring a human expert to perform the region marking, we propose\nusing a Deep learning image segmentation method instead. Our setup involves a\ncamera that captures the cross-section image and then passes the image file to\na computer. The computer performs the automated image segmentation and feeds\nthe CNC machine with a G-code script. In this article, we report the initial\nsegmentation results achieved using a state-of-the-art Deep learning\nsegmentation method and discuss potential improvements to refine the\nsegmentation accuracy.\n","authors":["Irwandi Hipiny","Johari Abdullah","Noor Alamshah Bolhassan"],"pdf_url":"https://arxiv.org/pdf/2404.05129v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2207.01200v4","updated":"2024-04-08T01:11:22Z","published":"2022-07-04T05:03:10Z","title":"S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation","summary":" Deep learning has become a powerful tool for Mars exploration. Mars terrain\nsemantic segmentation is an important Martian vision task, which is the base of\nrover autonomous planning and safe driving. However, there is a lack of\nsufficient detailed and high-confidence data annotations, which are exactly\nrequired by most deep learning methods to obtain a good model. To address this\nproblem, we propose our solution from the perspective of joint data and method\ndesign. We first present a newdataset S5Mars for Semi-SuperviSed learning on\nMars Semantic Segmentation, which contains 6K high-resolution images and is\nsparsely annotated based on confidence, ensuring the high quality of labels.\nThen to learn from this sparse data, we propose a semi-supervised learning\n(SSL) framework for Mars image semantic segmentation, to learn representations\nfrom limited labeled data. Different from the existing SSL methods which are\nmostly targeted at the Earth image data, our method takes into account Mars\ndata characteristics. Specifically, we first investigate the impact of current\nwidely used natural image augmentations on Mars images. Based on the analysis,\nwe then proposed two novel and effective augmentations for SSL of Mars\nsegmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost\nthe model performance. 
Meanwhile, to fully leverage the unlabeled data, we\nintroduce a soft-to-hard consistency learning strategy, learning from different\ntargets based on prediction confidence. Experimental results show that our\nmethod can outperform state-of-the-art SSL approaches remarkably. Our proposed\ndataset is available at https://jhang2020.github.io/S5Mars.github.io/.\n","authors":["Jiahang Zhang","Lilang Lin","Zejia Fan","Wenjing Wang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2207.01200v4.pdf","comment":"IEEE TGRS 2024"},{"id":"http://arxiv.org/abs/2404.05128v1","updated":"2024-04-08T01:08:41Z","published":"2024-04-08T01:08:41Z","title":"Improving Deep Learning Predictions with Simulated Images, and Vice\n Versa","summary":" Artificial neural networks are often used to identify features of crop\nplants. However, training their models requires many annotated images, which\ncan be expensive and time-consuming to acquire. Procedural models of plants,\nsuch as those developed with Lindenmayer-systems (L-systems) can be created to\nproduce visually realistic simulations, and hence images of plant simulations,\nwhere annotations are implicitly known. These synthetic images can either\naugment or completely replace real images in training neural networks for\nphenotyping tasks. In this paper, we systematically vary amounts of real and\nsynthetic images used for training in both maize and canola to better\nunderstand situations where synthetic images generated from L-systems can help\nprediction on real images. This work also explores the degree to which realism\nin the synthetic images improves prediction. Furthermore, we see how neural\nnetwork predictions can be used to help calibrate L-systems themselves,\ncreating a feedback loop.\n","authors":["Nazifa Azam Khan","Mikolaj Cieslak","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2404.05128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v2","updated":"2024-04-08T01:05:57Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05111v1","updated":"2024-04-08T00:13:05Z","published":"2024-04-08T00:13:05Z","title":"Class Similarity Transition: Decoupling Class Similarities and Imbalance\n from Generalized Few-shot Segmentation","summary":" In Generalized Few-shot Segmentation (GFSS), a model is trained with a large\ncorpus of base class samples and then adapted on limited samples of novel\nclasses. This paper focuses on the relevance between base and novel classes,\nand improves GFSS in two aspects: 1) mining the similarity between base and\nnovel classes to promote the learning of novel classes, and 2) mitigating the\nclass imbalance issue caused by the volume difference between the support set\nand the training set. Specifically, we first propose a similarity transition\nmatrix to guide the learning of novel classes with base class knowledge. Then,\nwe leverage the Label-Distribution-Aware Margin (LDAM) loss and Transductive\nInference to the GFSS task to address the problem of class imbalance as well as\noverfitting the support set. In addition, by extending the probability\ntransition matrix, the proposed method can mitigate the catastrophic forgetting\nof base classes when learning novel classes. With a simple training phase, our\nproposed method can be applied to any segmentation network trained on base\nclasses. We validated our methods on the adapted version of OpenEarthMap.\nCompared to existing GFSS baselines, our method excels them all from 3% to 7%\nand ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at\nthe completion of this paper. Code:\nhttps://github.com/earth-insights/ClassTrans\n","authors":["Shihong Wang","Ruixun Liu","Kaiyu Li","Jiawei Jiang","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2404.05111v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.09250v2","updated":"2024-04-08T22:40:01Z","published":"2023-12-14T18:59:36Z","title":"Single Mesh Diffusion Models with Field Latents for Texture Generation","summary":" We introduce a framework for intrinsic latent diffusion models operating\ndirectly on the surfaces of 3D shapes, with the goal of synthesizing\nhigh-quality textures. Our approach is underpinned by two contributions: field\nlatents, a latent representation encoding textures as discrete vector fields on\nthe mesh vertices, and field latent diffusion models, which learn to denoise a\ndiffusion process in the learned latent space on the surface. We consider a\nsingle-textured-mesh paradigm, where our models are trained to generate\nvariations of a given texture on a mesh. We show the synthesized textures are\nof superior fidelity compared those from existing single-textured-mesh\ngenerative models. Our models can also be adapted for user-controlled editing\ntasks such as inpainting and label-guided generation. The efficacy of our\napproach is due in part to the equivariance of our proposed framework under\nisometries, allowing our models to seamlessly reproduce details across locally\nsimilar regions and opening the door to a notion of generative texture\ntransfer.\n","authors":["Thomas W. Mitchel","Carlos Esteves","Ameesh Makadia"],"pdf_url":"https://arxiv.org/pdf/2312.09250v2.pdf","comment":"CVPR 2024. 
Code and additional visualizations available:\n https://single-mesh-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2311.12539v2","updated":"2024-04-08T22:19:23Z","published":"2023-11-21T11:33:15Z","title":"GMISeg: General Medical Image Segmentation without Re-Training","summary":" Although deep learning models have become the main method for medical image\nsegmentation, they often cannot be extended to unknown segmentation tasks\ninvolving new anatomical structures, image shapes, or labels. For new\nsegmentation tasks, researchers often have to retrain or fine-tune the model,\nwhich is time-consuming and poses a significant obstacle to clinical\nresearchers, who often lack the resources and professional knowledge to train\nneural networks. Therefore, we proposed a general method that can solve unknown\nmedical image segmentation tasks without requiring additional training. Given\nan example set of images and prompts for defining new segmentation tasks,\nGMISeg applies a novel low-rank fine-tuning strategy based on the proposed\napproach to the SAM (Segment Anything Model) image encoder, and works with the\nprompt encoder and mask decoder to fine-tune the labeled dataset without the\nneed for additional training. To achieve generalization of new tasks, we used\nmedical image datasets with different imaging modes for different parts. We\ntrained and generalized GMISeg on a different set of anatomical and imaging\nmodes using cardiac images on other site datasets. We have demonstrated that\nGMISeg outperforms the latest methods on unknown tasks and have conducted a\ncomprehensive analysis and summary of the important performance of the proposed\nmethod.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11470v2","updated":"2024-04-08T22:01:32Z","published":"2022-10-20T17:59:54Z","title":"i-MAE: Are Latent Representations in Masked Autoencoders Linearly\n Separable?","summary":" Masked image modeling (MIM) has been recognized as a strong self-supervised\npre-training approach in the vision domain. However, the mechanism and\nproperties of the learned representations by such a scheme, as well as how to\nfurther enhance the representations are so far not well-explored. In this\npaper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework\nto enhance the representation capability from two aspects: (1) employing a\ntwo-way image reconstruction and a latent feature reconstruction with\ndistillation loss to learn better features; (2) proposing a semantics-enhanced\nsampling strategy to boost the learned semantics in MAE. Upon the proposed\ni-MAE architecture, we can address two critical questions to explore the\nbehaviors of the learned representations in MAE: (1) Whether the separability\nof latent representations in Masked Autoencoders is helpful for model\nperformance? We study it by forcing the input as a mixture of two images\ninstead of one. (2) Whether we can enhance the representations in the latent\nfeature space by controlling the degree of semantics during sampling on Masked\nAutoencoders? To this end, we propose a sampling strategy within a mini-batch\nbased on the semantics of training samples to examine this aspect. Extensive\nexperiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to\nverify the observations we discovered. 
Furthermore, in addition to\nqualitatively analyzing the characteristics of the latent representations, we\nexamine the existence of linear separability and the degree of semantics in the\nlatent space by proposing two evaluation schemes. The surprising and consistent\nresults demonstrate that i-MAE is a superior framework design for understanding\nMAE frameworks, as well as achieving better representational ability. Code is\navailable at https://github.com/vision-learning-acceleration-lab/i-mae.\n","authors":["Kevin Zhang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2210.11470v2.pdf","comment":"Project page: https://zhiqiangshen.com/projects/i-mae/"},{"id":"http://arxiv.org/abs/2404.03392v2","updated":"2024-04-08T21:26:47Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2404.03392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05872v1","updated":"2024-04-08T21:09:59Z","published":"2024-04-08T21:09:59Z","title":"TabConv: Low-Computation CNN Inference via Table Lookups","summary":" Convolutional Neural Networks (CNNs) have demonstrated remarkable ability\nthroughout the field of computer vision. However, CNN inference requires a\nlarge number of arithmetic operations, making them expensive to deploy in\nhardware. Current approaches alleviate this issue by developing\nhardware-supported, algorithmic processes to simplify spatial convolution\nfunctions. However, these methods still heavily rely on matrix multiplication,\nleading to significant computational overhead. To bridge the gap between\nhardware, algorithmic acceleration, and approximate matrix multiplication, we\npropose TabConv, a novel, table-based approximation for convolution to\nsignificantly reduce arithmetic operations during inference. Additionally, we\nintroduce a priority masking technique based on cosine similarity to select\nlayers for table-based approximation, thereby maintaining the model\nperformance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34,\nand NetworkInNetwork (NIN). 
TabConv preserves over 93% of the original model's\nperformance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for\nResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for\nResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving\nlow-computation inference.\n","authors":["Neelesh Gupta","Narayanan Kannan","Pengmiao Zhang","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.05872v1.pdf","comment":"8 pages, Accepted at CF '24"},{"id":"http://arxiv.org/abs/2404.05862v1","updated":"2024-04-08T20:51:30Z","published":"2024-04-08T20:51:30Z","title":"Towards Improved Semiconductor Defect Inspection for high-NA EUVL based\n on SEMI-SuperYOLO-NAS","summary":" Due to potential pitch reduction, the semiconductor industry is adopting\nHigh-NA EUVL technology. However, its low depth of focus presents challenges\nfor High Volume Manufacturing. To address this, suppliers are exploring thinner\nphotoresists and new underlayers/hardmasks. These may suffer from poor SNR,\ncomplicating defect detection. Vision-based ML algorithms offer a promising\nsolution for semiconductor defect inspection. However, developing a robust ML\nmodel across various image resolutions without explicit training remains a\nchallenge for nano-scale defect inspection. This research's goal is to propose\na scale-invariant ADCD framework capable to upscale images, addressing this\nissue. We propose an improvised ADCD framework as SEMI-SuperYOLO-NAS, which\nbuilds upon the baseline YOLO-NAS architecture. This framework integrates a SR\nassisted branch to aid in learning HR features by the defect detection\nbackbone, particularly for detecting nano-scale defect instances from LR\nimages. Additionally, the SR-assisted branch can recursively generate upscaled\nimages from their corresponding downscaled counterparts, enabling defect\ndetection inference across various image resolutions without requiring explicit\ntraining. Moreover, we investigate improved data augmentation strategy aimed at\ngenerating diverse and realistic training datasets to enhance model\nperformance. We have evaluated our proposed approach using two original FAB\ndatasets obtained from two distinct processes and captured using two different\nimaging tools. Finally, we demonstrate zero-shot inference for our model on a\nnew, originating from a process condition distinct from the training dataset\nand possessing different Pitch characteristics. Experimental validation\ndemonstrates that our proposed ADCD framework aids in increasing the throughput\nof imaging tools for defect inspection by reducing the required image pixel\nresolutions.\n","authors":["Ying-Lin Chen","Jacob Deforce","Vic De Ridder","Bappaditya Dey","Victor Blanco","Sandip Halder","Philippe Leray"],"pdf_url":"https://arxiv.org/pdf/2404.05862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05849v1","updated":"2024-04-08T20:31:27Z","published":"2024-04-08T20:31:27Z","title":"Localizing Moments of Actions in Untrimmed Videos of Infants with Autism\n Spectrum Disorder","summary":" Autism Spectrum Disorder (ASD) presents significant challenges in early\ndiagnosis and intervention, impacting children and their families. With\nprevalence rates rising, there is a critical need for accessible and efficient\nscreening tools. 
Leveraging machine learning (ML) techniques, in particular\nTemporal Action Localization (TAL), holds promise for automating ASD screening.\nThis paper introduces a self-attention based TAL model designed to identify\nASD-related behaviors in infant videos. Unlike existing methods, our approach\nsimplifies complex modeling and emphasizes efficiency, which is essential for\npractical deployment in real-world scenarios. Importantly, this work\nunderscores the importance of developing computer vision methods capable of\noperating in naturilistic environments with little equipment control,\naddressing key challenges in ASD screening. This study is the first to conduct\nend-to-end temporal action localization in untrimmed videos of infants with\nASD, offering promising avenues for early intervention and support. We report\nbaseline results of behavior detection using our TAL model. We achieve 70%\naccuracy for look face, 79% accuracy for look object, 72% for smile and 65% for\nvocalization.\n","authors":["Halil Ismail Helvaci","Sen-ching Samson Cheung","Chen-Nee Chuah","Sally Ozonoff"],"pdf_url":"https://arxiv.org/pdf/2404.05849v1.pdf","comment":"7 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.05828v1","updated":"2024-04-08T19:46:20Z","published":"2024-04-08T19:46:20Z","title":"Privacy-Preserving Deep Learning Using Deformable Operators for Secure\n Task Learning","summary":" In the era of cloud computing and data-driven applications, it is crucial to\nprotect sensitive information to maintain data privacy, ensuring truly reliable\nsystems. As a result, preserving privacy in deep learning systems has become a\ncritical concern. Existing methods for privacy preservation rely on image\nencryption or perceptual transformation approaches. However, they often suffer\nfrom reduced task performance and high computational costs. To address these\nchallenges, we propose a novel Privacy-Preserving framework that uses a set of\ndeformable operators for secure task learning. Our method involves shuffling\npixels during the analog-to-digital conversion process to generate visually\nprotected data. Those are then fed into a well-known network enhanced with\ndeformable operators. Using our approach, users can achieve equivalent\nperformance to original images without additional training using a secret key.\nMoreover, our method enables access control against unauthorized users.\nExperimental results demonstrate the efficacy of our approach, showcasing its\npotential in cloud-based scenarios and privacy-sensitive applications.\n","authors":["Fabian Perez","Jhon Lopez","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2404.05828v1.pdf","comment":"copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2403.04932v2","updated":"2024-04-08T19:45:32Z","published":"2024-03-07T22:39:02Z","title":"Divide and Conquer: High-Resolution Industrial Anomaly Detection via\n Memory Efficient Tiled Ensemble","summary":" Industrial anomaly detection is an important task within computer vision with\na wide range of practical use cases. 
The small size of anomalous regions in\nmany real-world datasets necessitates processing the images at a high\nresolution. This frequently poses significant challenges concerning memory\nconsumption during the model training and inference stages, leaving some\nexisting methods impractical for widespread adoption. To overcome this\nchallenge, we present the tiled ensemble approach, which reduces memory\nconsumption by dividing the input images into a grid of tiles and training a\ndedicated model for each tile location. The tiled ensemble is compatible with\nany existing anomaly detection model without the need for any modification of\nthe underlying architecture. By introducing overlapping tiles, we utilize the\nbenefits of traditional stacking ensembles, leading to further improvements in\nanomaly detection capabilities beyond high resolution alone. We perform a\ncomprehensive analysis using diverse underlying architectures, including Padim,\nPatchCore, FastFlow, and Reverse Distillation, on two standard anomaly\ndetection datasets: MVTec and VisA. Our method demonstrates a notable\nimprovement across setups while remaining within GPU memory constraints,\nconsuming only as much GPU memory as a single model needs to process a single\ntile.\n","authors":["Blaž Rolih","Dick Ameln","Ashwin Vaidya","Samet Akcay"],"pdf_url":"https://arxiv.org/pdf/2403.04932v2.pdf","comment":"To appear at CVPR 24 Visual Anomaly Detection Workshop. Research\n conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023\n page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl"},{"id":"http://arxiv.org/abs/2401.00896v2","updated":"2024-04-08T18:40:31Z","published":"2023-12-31T10:51:52Z","title":"TrailBlazer: Trajectory Control for Diffusion-Based Video Generation","summary":" Within recent approaches to text-to-video (T2V) generation, achieving\ncontrollability in the synthesized video is often a challenge. Typically, this\nissue is addressed by providing low-level per-frame guidance in the form of\nedge maps, depth maps, or an existing video to be altered. However, the process\nof obtaining such guidance can be labor-intensive. This paper focuses on\nenhancing controllability in video synthesis by employing straightforward\nbounding boxes to guide the subject in various ways, all without the need for\nneural network training, finetuning, optimization at inference time, or the use\nof pre-existing videos. Our algorithm, TrailBlazer, is constructed upon a\npre-trained (T2V) model, and easy to implement. The subject is directed by a\nbounding box through the proposed spatial and temporal attention map editing.\nMoreover, we introduce the concept of keyframing, allowing the subject\ntrajectory and overall appearance to be guided by both a moving bounding box\nand corresponding prompts, without the need to provide a detailed mask. The\nmethod is efficient, with negligible additional computation relative to the\nunderlying pre-trained model. Despite the simplicity of the bounding box\nguidance, the resulting motion is surprisingly natural, with emergent effects\nincluding perspective and movement toward the virtual camera as the box size\nincreases.\n","authors":["Wan-Duo Kurt Ma","J. P. Lewis","W. 
Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2401.00896v2.pdf","comment":"14 pages, 18 figures, Project Page:\n https://hohonu-vicml.github.io/Trailblazer.Page/"},{"id":"http://arxiv.org/abs/2404.05814v1","updated":"2024-04-08T18:36:18Z","published":"2024-04-08T18:36:18Z","title":"Towards Explainable Automated Neuroanatomy","summary":" We present a novel method for quantifying the microscopic structure of brain\ntissue. It is based on the automated recognition of interpretable features\nobtained by analyzing the shapes of cells. This contrasts with prevailing\nmethods of brain anatomical analysis in two ways. First, contemporary methods\nuse gray-scale values derived from smoothed version of the anatomical images,\nwhich dissipated valuable information from the texture of the images. Second,\ncontemporary analysis uses the output of black-box Convolutional Neural\nNetworks, while our system makes decisions based on interpretable features\nobtained by analyzing the shapes of individual cells. An important benefit of\nthis open-box approach is that the anatomist can understand and correct the\ndecisions made by the computer. Our proposed system can accurately localize and\nidentify existing brain structures. This can be used to align and coregistar\nbrains and will facilitate connectomic studies for reverse engineering of brain\ncircuitry.\n","authors":["Kui Qian","Litao Qiao","Beth Friedman","Edward O'Donnell","David Kleinfeld","Yoav Freund"],"pdf_url":"https://arxiv.org/pdf/2404.05814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05802v1","updated":"2024-04-08T18:05:24Z","published":"2024-04-08T18:05:24Z","title":"BatSort: Enhanced Battery Classification with Transfer Learning for\n Battery Sorting and Recycling","summary":" Battery recycling is a critical process for minimizing environmental harm and\nresource waste for used batteries. However, it is challenging, largely because\nsorting batteries is costly and hardly automated to group batteries based on\nbattery types. In this paper, we introduce a machine learning-based approach\nfor battery-type classification and address the daunting problem of data\nscarcity for the application. We propose BatSort which applies transfer\nlearning to utilize the existing knowledge optimized with large-scale datasets\nand customizes ResNet to be specialized for classifying battery types. We\ncollected our in-house battery-type dataset of small-scale to guide the\nknowledge transfer as a case study and evaluate the system performance. We\nconducted an experimental study and the results show that BatSort can achieve\noutstanding accuracy of 92.1% on average and up to 96.2% and the performance is\nstable for battery-type classification. Our solution helps realize fast and\nautomated battery sorting with minimized cost and can be transferred to related\nindustry applications with insufficient data.\n","authors":["Yunyi Zhao","Wei Zhang","Erhai Hu","Qingyu Yan","Cheng Xiang","King Jet Tseng","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2404.05802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05783v1","updated":"2024-04-08T17:53:21Z","published":"2024-04-08T17:53:21Z","title":"Responsible Generative AI: What to Generate and What Not","summary":" In recent years, generative AI (GenAI), like large language models and\ntext-to-image models, has received significant attention across various\ndomains. However, ensuring the responsible generation of content by these\nmodels is crucial for their real-world applicability. 
This raises an\ninteresting question: \\textit{What should responsible GenAI generate, and what\nshould it not?} To answer the question, this paper investigates the practical\nresponsible requirements of both textual and visual generative models,\noutlining five key considerations: generating truthful content, avoiding toxic\ncontent, refusing harmful instruction, leaking no training data-related\ncontent, and ensuring generated content identifiable. Specifically, we review\nrecent advancements and challenges in addressing these requirements. Besides,\nwe discuss and emphasize the importance of responsible GenAI across healthcare,\neducation, finance, and artificial general intelligence domains. Through a\nunified perspective on both textual and visual generative models, this paper\naims to provide insights into practical safety-related issues and further\nbenefit the community in building responsible GenAI.\n","authors":["Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.05783v1.pdf","comment":"74 pages, 10 figures"},{"id":"http://arxiv.org/abs/2205.10793v2","updated":"2024-04-08T16:59:24Z","published":"2022-05-22T10:26:54Z","title":"Knowledge Distillation via the Target-aware Transformer","summary":" Knowledge distillation becomes a de facto standard to improve the performance\nof small neural networks. Most of the previous works propose to regress the\nrepresentational features from the teacher to the student in a one-to-one\nspatial matching fashion. However, people tend to overlook the fact that, due\nto the architecture differences, the semantic information on the same spatial\nlocation usually vary. This greatly undermines the underlying assumption of the\none-to-one distillation approach. To this end, we propose a novel one-to-all\nspatial matching knowledge distillation approach. Specifically, we allow each\npixel of the teacher feature to be distilled to all spatial locations of the\nstudent features given its similarity, which is generated from a target-aware\ntransformer. Our approach surpasses the state-of-the-art methods by a\nsignificant margin on various computer vision benchmarks, such as ImageNet,\nPascal VOC and COCOStuff10k. Code is available at\nhttps://github.com/sihaoevery/TaT.\n","authors":["Sihao Lin","Hongwei Xie","Bing Wang","Kaicheng Yu","Xiaojun Chang","Xiaodan Liang","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2205.10793v2.pdf","comment":"CVPR2022(Oral)"},{"id":"http://arxiv.org/abs/2303.17546v3","updated":"2024-04-08T16:49:16Z","published":"2023-03-30T17:13:56Z","title":"PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor","summary":" Generative image editing has recently witnessed extremely fast-paced growth.\nSome works use high-level conditioning such as text, while others use low-level\nconditioning. Nevertheless, most of them lack fine-grained control over the\nproperties of the different objects present in the image, i.e. object-level\nimage editing. In this work, we tackle the task by perceiving the images as an\namalgamation of various objects and aim to control the properties of each\nobject in a fine-grained manner. Out of these properties, we identify structure\nand appearance as the most intuitive to understand and useful for editing\npurposes. We propose PAIR Diffusion, a generic framework that can enable a\ndiffusion model to control the structure and appearance properties of each\nobject in the image. We show that having control over the properties of each\nobject in an image leads to comprehensive editing capabilities. 
Our framework\nallows for various object-level editing operations on real images such as\nreference image-based appearance editing, free-form shape editing, adding\nobjects, and variations. Thanks to our design, we do not require any inversion\nstep. Additionally, we propose multimodal classifier-free guidance which\nenables editing images using both reference images and text when using our\napproach with foundational diffusion models. We validate the above claims by\nextensively evaluating our framework on both unconditional and foundational\ndiffusion models. Please refer to\nhttps://vidit98.github.io/publication/conference-paper/pair_diff.html for code\nand model release.\n","authors":["Vidit Goel","Elia Peruzzo","Yifan Jiang","Dejia Xu","Xingqian Xu","Nicu Sebe","Trevor Darrell","Zhangyang Wang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2303.17546v3.pdf","comment":"Accepted in CVPR 2024, Project page\n https://vidit98.github.io/publication/conference-paper/pair_diff.html"},{"id":"http://arxiv.org/abs/2312.03048v2","updated":"2024-04-08T08:59:24Z","published":"2023-12-05T18:34:12Z","title":"DGInStyle: Domain-Generalizable Semantic Segmentation with Image\n Diffusion Models and Stylized Semantic Control","summary":" Large, pretrained latent diffusion models (LDMs) have demonstrated an\nextraordinary ability to generate creative content, specialize to user data\nthrough few-shot fine-tuning, and condition their output on other modalities,\nsuch as semantic maps. However, are they usable as large-scale data generators,\ne.g., to improve tasks in the perception stack, like semantic segmentation? We\ninvestigate this question in the context of autonomous driving, and answer it\nwith a resounding \"yes\". We propose an efficient data generation pipeline\ntermed DGInStyle. First, we examine the problem of specializing a pretrained\nLDM to semantically-controlled generation within a narrow domain. Second, we\npropose a Style Swap technique to endow the rich generative prior with the\nlearned semantic control. Third, we design a Multi-resolution Latent Fusion\ntechnique to overcome the bias of LDMs towards dominant objects. Using\nDGInStyle, we generate a diverse dataset of street scenes, train a\ndomain-agnostic semantic segmentation model on it, and evaluate the model on\nmultiple popular autonomous driving datasets. Our approach consistently\nincreases the performance of several domain generalization methods compared to\nthe previous state-of-the-art methods. Source code and dataset are available at\nhttps://dginstyle.github.io.\n","authors":["Yuru Jia","Lukas Hoyer","Shengyu Huang","Tianfu Wang","Luc Van Gool","Konrad Schindler","Anton Obukhov"],"pdf_url":"https://arxiv.org/pdf/2312.03048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05776v1","updated":"2024-04-08T06:47:03Z","published":"2024-04-08T06:47:03Z","title":"Forecasting Electric Vehicle Battery Output Voltage: A Predictive\n Modeling Approach","summary":" The battery management system plays a vital role in ensuring the safety and\ndependability of electric and hybrid vehicles. It is responsible for various\nfunctions, including state evaluation, monitoring, charge control, and cell\nbalancing, all integrated within the BMS. Nonetheless, due to the uncertainties\nsurrounding battery performance, implementing these functionalities poses\nsignificant challenges. 
In this study, we explore the latest approaches for\nassessing battery states, highlight notable advancements in battery management\nsystems (BMS), address existing issues with current BMS technology, and put\nforth possible solutions for predicting battery charging voltage.\n","authors":["Narayana Darapaneni","Ashish K","Ullas M S","Anwesh Reddy Paduri"],"pdf_url":"https://arxiv.org/pdf/2404.05776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v2","updated":"2024-04-08T21:14:43Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v2.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.07236v1","updated":"2024-04-08T08:50:09Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. 
The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Yui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v1.pdf","comment":"40 pages"}]},"2024-04-07T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05107v1","updated":"2024-04-07T23:31:37Z","published":"2024-04-07T23:31:37Z","title":"Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by\n Unsupervised Learning","summary":" The reconstruction of human visual inputs from brain activity, particularly\nthrough functional Magnetic Resonance Imaging (fMRI), holds promising avenues\nfor unraveling the mechanisms of the human visual system. Despite the\nsignificant strides made by deep learning methods in improving the quality and\ninterpretability of visual reconstruction, there remains a substantial demand\nfor high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The\nchallenge arises in integrating diverse smaller 3-Tesla datasets or\naccommodating new subjects with brief and low-quality fMRI scans. In response\nto these constraints, we propose a novel framework that generates enhanced 3T\nfMRI data through an unsupervised Generative Adversarial Network (GAN),\nleveraging unpaired training across two distinct fMRI datasets in 7T and 3T,\nrespectively. This approach aims to overcome the limitations of the scarcity of\nhigh-quality 7-Tesla data and the challenges associated with brief and\nlow-quality scans in 3-Tesla experiments. In this paper, we demonstrate the\nreconstruction capabilities of the enhanced 3T fMRI data, highlighting its\nproficiency in generating superior input visual images compared to\ndata-intensive methods trained and tested on a single subject.\n","authors":["Yujian Xiong","Wenhui Zhu","Zhong-Lin Lu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05107v1.pdf","comment":"Accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2307.05845v5","updated":"2024-04-07T23:27:06Z","published":"2023-07-11T23:36:49Z","title":"PIGEON: Predicting Image Geolocations","summary":" Planet-scale image geolocalization remains a challenging problem due to the\ndiversity of images originating from anywhere in the world. Although approaches\nbased on vision transformers have made significant progress in geolocalization\naccuracy, success in prior literature is constrained to narrow distributions of\nimages of landmarks, and performance has not generalized to unseen places. We\npresent a new geolocalization system that combines semantic geocell creation,\nmulti-task contrastive pretraining, and a novel loss function. Additionally,\nour work is the first to perform retrieval over location clusters for guess\nrefinements. We train two models for evaluations on street-level data and\ngeneral-purpose image geolocalization; the first model, PIGEON, is trained on\ndata from the game of Geoguessr and is capable of placing over 40% of its\nguesses within 25 kilometers of the target location globally. 
We also develop a\nbot and deploy PIGEON in a blind experiment against humans, ranking in the top\n0.01% of players. We further challenge one of the world's foremost professional\nGeoguessr players to a series of six matches with millions of viewers, winning\nall six games. Our second model, PIGEOTTO, differs in that it is trained on a\ndataset of images from Flickr and Wikipedia, achieving state-of-the-art results\non a wide range of image geolocalization benchmarks, outperforming the previous\nSOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8\npercentage points on the country level. Our findings suggest that PIGEOTTO is\nthe first image geolocalization model that effectively generalizes to unseen\nplaces and that our approach can pave the way for highly accurate, planet-scale\nimage geolocalization systems. Our code is available on GitHub.\n","authors":["Lukas Haas","Michal Skreta","Silas Alberti","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2307.05845v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.05105v1","updated":"2024-04-07T23:10:26Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for\n Deformable 3D Image Registration","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing, and a fine-grained feature extraction module is proposed\nfor high-dimensional feature learning. We validate VMambaMorph using a public\nbenchmark brain MR-CT registration dataset, comparing its performance against\ncurrent state-of-the-art methods. The results indicate that VMambaMorph\nachieves competitive registration quality. The code for VMambaMorph is\navailable on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05102v1","updated":"2024-04-07T22:58:18Z","published":"2024-04-07T22:58:18Z","title":"LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance\n Volumetric Medical Image Segmentation","summary":" As a result of the rise of Transformer architectures in medical image\nanalysis, specifically in the domain of medical image segmentation, a multitude\nof hybrid models have been created that merge the advantages of Convolutional\nNeural Networks (CNNs) and Transformers. These hybrid models have achieved\nnotable success by significantly improving segmentation accuracy. 
Yet, this\nprogress often comes at the cost of increased model complexity, both in terms\nof parameters and computational demand. Moreover, many of these models fail to\nconsider the crucial interplay between spatial and channel features, which\ncould further refine and improve segmentation outcomes. To address this, we\nintroduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric\nmedical image segmentation. LHU-Net is meticulously designed to prioritize\nspatial feature analysis in its initial layers before shifting focus to\nchannel-based features in its deeper layers, ensuring a comprehensive feature\nextraction process. Rigorous evaluation across five benchmark datasets -\nSynapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior\nperformance, showcasing its dual capacity for efficiency and accuracy. Notably,\nLHU-Net sets new performance benchmarks, such as attaining a Dice score of\n92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and\nquartering the computational load compared to existing state-of-the-art models.\nAchieved without any reliance on pre-training, additional data, or model\nensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art\nperformance across all evaluated datasets, utilizing fewer than 11 million\nparameters. This achievement highlights that balancing computational efficiency\nwith high accuracy in medical image segmentation is feasible. Our\nimplementation of LHU-Net is freely accessible to the research community on\nGitHub.\n","authors":["Yousef Sadegheih","Afshin Bozorgpour","Pratibha Kumari","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2404.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04001v4","updated":"2024-04-07T22:46:13Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v4.pdf","comment":"Accepted by IEEE Open Journal of Signal Processing. 
15 pages, 3\n figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.02634v2","updated":"2024-04-07T22:18:52Z","published":"2024-01-05T04:53:33Z","title":"AG-ReID.v2: Bridging Aerial and Ground Views for Person\n Re-identification","summary":" Aerial-ground person re-identification (Re-ID) presents unique challenges in\ncomputer vision, stemming from the distinct differences in viewpoints, poses,\nand resolutions between high-altitude aerial and ground-based cameras. Existing\nresearch predominantly focuses on ground-to-ground matching, with aerial\nmatching less explored due to a dearth of comprehensive datasets. To address\nthis, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID\nin mixed aerial and ground scenarios. This dataset comprises 100,502 images of\n1,615 unique individuals, each annotated with matching IDs and 15 soft\nattribute labels. Data were collected from diverse perspectives using a UAV,\nstationary CCTV, and smart glasses-integrated camera, providing a rich variety\nof intra-identity variations. Additionally, we have developed an explainable\nattention network tailored for this dataset. This network features a\nthree-stream architecture that efficiently processes pairwise image distances,\nemphasizes key top-down features, and adapts to variations in appearance due to\naltitude differences. Comparative evaluations demonstrate the superiority of\nour approach over existing baselines. We plan to release the dataset and\nalgorithm source code publicly, aiming to advance research in this specialized\nfield of computer vision. For access, please visit\nhttps://github.com/huynguyen792/AG-ReID.v2.\n","authors":["Huy Nguyen","Kien Nguyen","Sridha Sridharan","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2401.02634v2.pdf","comment":"13 pages, Accepted by TIFS 2023"},{"id":"http://arxiv.org/abs/2404.05083v1","updated":"2024-04-07T21:46:47Z","published":"2024-04-07T21:46:47Z","title":"HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large\n Foundation Models","summary":" While recent progress in video-text retrieval has been driven by the\nexploration of powerful model architectures and training strategies, the\nrepresentation learning ability of video-text retrieval models is still limited\ndue to low-quality and scarce training data annotations. To address this issue,\nwe present a novel video-text learning paradigm, HaVTR, which augments video\nand text data to learn more generalized features. Specifically, we first adopt\na simple augmentation method, which generates self-similar data by randomly\nduplicating or dropping subwords and frames. In addition, inspired by the\nrecent advancement in visual and language generative models, we propose a more\npowerful augmentation method through textual paraphrasing and video stylization\nusing large language models (LLMs) and visual generative models (VGMs).\nFurther, to bring richer information into video and text, we propose a\nhallucination-based augmentation method, where we use LLMs and VGMs to generate\nand add new relevant information to the original data. 
Benefiting from the\nenriched data, extensive experiments on several video-text retrieval benchmarks\ndemonstrate the superiority of HaVTR over existing methods.\n","authors":["Yimu Wang","Shuai Yuan","Xiangru Jian","Wei Pang","Mushi Wang","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2404.05083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06886v2","updated":"2024-04-07T21:41:05Z","published":"2023-12-11T23:20:31Z","title":"Relightful Harmonization: Lighting-aware Portrait Background Replacement","summary":" Portrait harmonization aims to composite a subject into a new background,\nadjusting its lighting and color to ensure harmony with the background scene.\nExisting harmonization techniques often only focus on adjusting the global\ncolor and brightness of the foreground and ignore crucial illumination cues\nfrom the background such as apparent lighting direction, leading to unrealistic\ncompositions. We introduce Relightful Harmonization, a lighting-aware diffusion\nmodel designed to seamlessly harmonize sophisticated lighting effect for the\nforeground portrait using any background image. Our approach unfolds in three\nstages. First, we introduce a lighting representation module that allows our\ndiffusion model to encode lighting information from target image background.\nSecond, we introduce an alignment network that aligns lighting features learned\nfrom image background with lighting features learned from panorama environment\nmaps, which is a complete representation for scene illumination. Last, to\nfurther boost the photorealism of the proposed method, we introduce a novel\ndata simulation pipeline that generates synthetic training pairs from a diverse\nrange of natural images, which are used to refine the model. Our method\noutperforms existing benchmarks in visual fidelity and lighting coherence,\nshowing superior generalization in real-world testing scenarios, highlighting\nits versatility and practicality.\n","authors":["Mengwei Ren","Wei Xiong","Jae Shin Yoon","Zhixin Shu","Jianming Zhang","HyunJoon Jung","Guido Gerig","He Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.06886v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2404.05072v1","updated":"2024-04-07T21:00:14Z","published":"2024-04-07T21:00:14Z","title":"Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind","summary":" As humans move around, performing their daily tasks, they are able to recall\nwhere they have positioned objects in their environment, even if these objects\nare currently out of sight. In this paper, we aim to mimic this spatial\ncognition ability. We thus formulate the task of Out of Sight, Not Out of Mind\n- 3D tracking active objects using observations captured through an egocentric\ncamera. We introduce Lift, Match and Keep (LMK), a method which lifts partial\n2D observations to 3D world coordinates, matches them over time using visual\nappearance, 3D location and interactions to form object tracks, and keeps these\nobject tracks even when they go out-of-view of the camera - hence keeping in\nmind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS.\nOur results demonstrate that spatial cognition is critical for correctly\nlocating objects over short and long time scales. E.g., for one long egocentric\nvideo, we estimate the 3D location of 50 active objects. 
Of these, 60% can be\ncorrectly positioned in 3D after 2 minutes of leaving the camera view.\n","authors":["Chiara Plizzari","Shubham Goel","Toby Perrett","Jacob Chalk","Angjoo Kanazawa","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05072v1.pdf","comment":"21 pages including references and appendix. Project Webpage:\n http://dimadamen.github.io/OSNOM/"},{"id":"http://arxiv.org/abs/2404.05069v1","updated":"2024-04-07T20:39:31Z","published":"2024-04-07T20:39:31Z","title":"AirShot: Efficient Few-Shot Detection for Autonomous Exploration","summary":" Few-shot object detection has drawn increasing attention in the field of\nrobotic exploration, where robots are required to find unseen objects with a\nfew online provided examples. Despite recent efforts have been made to yield\nonline processing capabilities, slow inference speeds of low-powered robots\nfail to meet the demands of real-time detection-making them impractical for\nautonomous exploration. Existing methods still face performance and efficiency\nchallenges, mainly due to unreliable features and exhaustive class loops. In\nthis work, we propose a new paradigm AirShot, and discover that, by fully\nexploiting the valuable correlation map, AirShot can result in a more robust\nand faster few-shot object detection system, which is more applicable to\nrobotics community. The core module Top Prediction Filter (TPF) can operate on\nmulti-scale correlation maps in both the training and inference stages. During\ntraining, TPF supervises the generation of a more representative correlation\nmap, while during inference, it reduces looping iterations by selecting\ntop-ranked classes, thus cutting down on computational costs with better\nperformance. Surprisingly, this dual functionality exhibits general\neffectiveness and efficiency on various off-the-shelf models. Exhaustive\nexperiments on COCO2017, VOC2014, and SubT datasets demonstrate that TPF can\nsignificantly boost the efficacy and efficiency of most off-the-shelf models,\nachieving up to 36.4% precision improvements along with 56.3% faster inference\nspeed. Code and Data are at: https://github.com/ImNotPrepared/AirShot.\n","authors":["Zihan Wang","Bowen Li","Chen Wang","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2404.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17328v3","updated":"2024-04-07T20:20:09Z","published":"2023-05-27T02:08:51Z","title":"Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention\n Graph in Pre-Trained Transformers","summary":" Deployment of Transformer models on edge devices is becoming increasingly\nchallenging due to the exponentially growing inference cost that scales\nquadratically with the number of tokens in the input sequence. Token pruning is\nan emerging solution to address this challenge due to its ease of deployment on\nvarious Transformer backbones. However, most token pruning methods require\ncomputationally expensive fine-tuning, which is undesirable in many edge\ndeployment cases. In this work, we propose Zero-TPrune, the first zero-shot\nmethod that considers both the importance and similarity of tokens in\nperforming token pruning. It leverages the attention graph of pre-trained\nTransformer models to produce an importance distribution for tokens via our\nproposed Weighted Page Rank (WPR) algorithm. This distribution further guides\ntoken partitioning for efficient similarity-based pruning. 
Due to the\nelimination of the fine-tuning overhead, Zero-TPrune can prune large models at\nnegligible computational cost, switch between different pruning configurations\nat no computational cost, and perform hyperparameter tuning efficiently. We\nevaluate the performance of Zero-TPrune on vision tasks by applying it to\nvarious vision Transformer backbones and testing them on ImageNet. Without any\nfine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and improves\nits throughput by 45.3% with only 0.4% accuracy loss. Compared with\nstate-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only\neliminates the need for fine-tuning after pruning but also does so with only\n0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning\nmethods, Zero-TPrune reduces accuracy loss by up to 49% with similar FLOPs\nbudgets. Project webpage: https://jha-lab.github.io/zerotprune.\n","authors":["Hongjie Wang","Bhishma Dedhia","Niraj K. Jha"],"pdf_url":"https://arxiv.org/pdf/2305.17328v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2404.05063v1","updated":"2024-04-07T20:19:04Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. 
Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Peng Liu","Zhen Wang","Lei Wang","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05061v1","updated":"2024-04-07T20:15:40Z","published":"2024-04-07T20:15:40Z","title":"Automated Prediction of Breast Cancer Response to Neoadjuvant\n Chemotherapy from DWI Data","summary":" Effective surgical planning for breast cancer hinges on accurately predicting\npathological complete response (pCR) to neoadjuvant chemotherapy (NAC).\nDiffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach\nfor early pCR assessment. However, most machine-learning models require manual\ntumor segmentation, a cumbersome and error-prone task. We propose a deep\nlearning model employing \"Size-Adaptive Lesion Weighting\" for automatic DWI\ntumor segmentation to enhance pCR prediction accuracy. Despite\nhistopathological changes during NAC complicating DWI image segmentation, our\nmodel demonstrates robust performance. Utilizing the BMMR2 challenge dataset,\nit matches human experts in pCR prediction pre-NAC with an area under the curve\n(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with\nan AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant\nadvancement in automating breast cancer treatment planning, enabling more\nreliable pCR predictions without manual segmentation.\n","authors":["Shir Nitzan","Maya Gilad","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.05061v1.pdf","comment":"Accepted for presentation at the IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2401.04244v2","updated":"2024-04-07T20:13:45Z","published":"2024-01-08T21:35:05Z","title":"Spatio-Temporal Turbulence Mitigation: A Translational Perspective","summary":" Recovering images distorted by atmospheric turbulence is a challenging\ninverse problem due to the stochastic nature of turbulence. Although numerous\nturbulence mitigation (TM) algorithms have been proposed, their efficiency and\ngeneralization to real-world dynamic scenarios remain severely limited.\nBuilding upon the intuitions of classical TM algorithms, we present the Deep\nAtmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major\nchallenges when transitioning from classical to deep learning approaches. By\ncarefully integrating the merits of classical multi-frame TM methods into a\ndeep network structure, we demonstrate that DATUM can efficiently perform\nlong-range temporal aggregation using a recurrent fashion, while deformable\nattention and temporal-channel attention seamlessly facilitate pixel\nregistration and lucky imaging. With additional supervision, tilt and blur\ndegradation can be jointly mitigated. These inductive biases empower DATUM to\nsignificantly outperform existing methods while delivering a tenfold increase\nin processing speed. A large-scale training dataset, ATSyn, is presented as a\nco-invention to enable generalization in real turbulence. Our code and datasets\nare available at https://xg416.github.io/DATUM.\n","authors":["Xingguang Zhang","Nicholas Chimitt","Yiheng Chi","Zhiyuan Mao","Stanley H. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2401.04244v2.pdf","comment":"Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/"},{"id":"http://arxiv.org/abs/2312.15719v2","updated":"2024-04-07T19:59:00Z","published":"2023-12-25T13:12:36Z","title":"Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric\n Videos","summary":" We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the\nreconstruction of frames during which the hand is stably holding the object. We\nfirst develop the stable grasp definition based on the intuition that the\nin-contact area between the hand and object should remain stable. By analysing\nthe 3D ARCTIC dataset, we identify stable grasp durations and showcase that\nobjects in stable grasps move within a single degree of freedom (1-DoF). We\nthereby propose a method to jointly optimise all frames within a stable grasp,\nminimising object motions to a latent 1-DoF. Finally, we extend the knowledge\nto in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed\nEPIC-Grasps dataset includes 390 object instances of 9 categories, featuring\nstable grasps from videos of daily interactions in 141 environments. Without 3D\nground truth, we use stable contact areas and 2D projection masks to assess the\nHO-SGR task in the wild. We evaluate relevant methods and our approach\npreserves significantly higher stable contact area, on both EPIC-Grasps and\nstable grasp sub-sequences from the ARCTIC dataset.\n","authors":["Zhifan Zhu","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2312.15719v2.pdf","comment":"webpage: https://zhifanzhu.github.io/getagrip"},{"id":"http://arxiv.org/abs/2404.05052v1","updated":"2024-04-07T19:23:28Z","published":"2024-04-07T19:23:28Z","title":"Facial Affective Behavior Analysis with Instruction Tuning","summary":" Facial affective behavior analysis (FABA) is crucial for understanding human\nmental states from images. However, traditional approaches primarily deploy\nmodels to discriminate among discrete emotion categories, and lack the fine\ngranularity and reasoning capability for complex facial behaviors. The advent\nof Multi-modal Large Language Models (MLLMs) has been proven successful in\ngeneral visual understanding tasks. However, directly harnessing MLLMs for FABA\nis challenging due to the scarcity of datasets and benchmarks, neglecting\nfacial prior knowledge, and low training efficiency. To address these\nchallenges, we introduce (i) an instruction-following dataset for two FABA\ntasks, e.g., emotion and action unit recognition, (ii) a benchmark FABA-Bench\nwith a new metric considering both recognition and generation ability, and\n(iii) a new MLLM \"EmoLA\" as a strong baseline to the community. Our initiative\non the dataset and benchmarks reveal the nature and rationale of facial\naffective behaviors, i.e., fine-grained facial movement, interpretability, and\nreasoning. Moreover, to build an effective and efficient FABA MLLM, we\nintroduce a facial prior expert module with face structure knowledge and a\nlow-rank adaptation module into pre-trained MLLM. We conduct extensive\nexperiments on FABA-Bench and four commonly-used FABA datasets. The results\ndemonstrate that the proposed facial prior expert can boost the performance and\nEmoLA achieves the best results on our FABA-Bench. 
On commonly-used FABA\ndatasets, EmoLA is competitive rivaling task-specific state-of-the-art models.\n","authors":["Yifan Li","Anh Dao","Wentao Bao","Zhen Tan","Tianlong Chen","Huan Liu","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.05052v1.pdf","comment":"V1.0"},{"id":"http://arxiv.org/abs/2404.05049v1","updated":"2024-04-07T19:10:02Z","published":"2024-04-07T19:10:02Z","title":"PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated\n Segmentation Learning","summary":" Automatic License Plate Recognition (ALPR) is an integral component of an\nintelligent transport system with extensive applications in secure\ntransportation, vehicle-to-vehicle communication, stolen vehicles detection,\ntraffic violations, and traffic flow management. The existing license plate\ndetection system focuses on one-shot learners or pre-trained models that\noperate with a geometric bounding box, limiting the model's performance.\nFurthermore, continuous video data streams uploaded to the central server\nresult in network and complexity issues. To combat this, PlateSegFL was\nintroduced, which implements U-Net-based segmentation along with Federated\nLearning (FL). U-Net is well-suited for multi-class image segmentation tasks\nbecause it can analyze a large number of classes and generate a pixel-level\nsegmentation map for each class. Federated Learning is used to reduce the\nquantity of data required while safeguarding the user's privacy. Different\ncomputing platforms, such as mobile phones, are able to collaborate on the\ndevelopment of a standard prediction model where it makes efficient use of\none's time; incorporates more diverse data; delivers projections in real-time;\nand requires no physical effort from the user; resulting around 95% F1 score.\n","authors":["Md. Shahriar Rahman Anuvab","Mishkat Sultana","Md. Atif Hossain","Shashwata Das","Suvarthi Chowdhury","Rafeed Rahman","Dibyo Fabian Dofadar","Shahriar Rahman Rana"],"pdf_url":"https://arxiv.org/pdf/2404.05049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05046v1","updated":"2024-04-07T19:00:45Z","published":"2024-04-07T19:00:45Z","title":"FGAIF: Aligning Large Vision-Language Models with Fine-grained AI\n Feedback","summary":" Large Vision-Language Models (LVLMs) have demonstrated proficiency in\ntackling a variety of visual-language tasks. However, current LVLMs suffer from\nmisalignment between text and image modalities which causes three kinds of\nhallucination problems, i.e., object existence, object attribute, and object\nrelationship. To tackle this issue, existing methods mainly utilize\nReinforcement Learning (RL) to align modalities in LVLMs. However, they still\nsuffer from three main limitations: (1) General feedback can not indicate the\nhallucination type contained in the response; (2) Sparse rewards only give the\nsequence-level reward for the whole response; and (3)Annotation cost is\ntime-consuming and labor-intensive. To handle these limitations, we propose an\ninnovative method to align modalities in LVLMs through Fine-Grained Artificial\nIntelligence Feedback (FGAIF), which mainly consists of three steps: AI-based\nFeedback Collection, Fine-grained Reward Model Training, and Reinforcement\nLearning with Fine-grained Reward. Specifically, We first utilize AI tools to\npredict the types of hallucination for each segment in the response and obtain\na collection of fine-grained feedback. 
Then, based on the collected reward\ndata, three specialized reward models are trained to produce dense rewards.\nFinally, a novel fine-grained feedback module is integrated into the Proximal\nPolicy Optimization (PPO) algorithm. Extensive experiments are conducted on\nhallucination and general benchmarks, demonstrating the superior performance of\nour proposed method. Notably, compared with previous models trained with the\nRL-based aligning method, our proposed method is effective even with fewer\nparameters.\n","authors":["Liqiang Jing","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2404.05046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14435v6","updated":"2024-04-07T18:04:04Z","published":"2023-06-26T06:04:09Z","title":"DragDiffusion: Harnessing Diffusion Models for Interactive Point-based\n Image Editing","summary":" Accurate and controllable image editing is a challenging task that has\nattracted significant attention recently. Notably, DragGAN is an interactive\npoint-based image editing framework that achieves impressive editing results\nwith pixel-level precision. However, due to its reliance on generative\nadversarial networks (GANs), its generality is limited by the capacity of\npretrained GAN models. In this work, we extend this editing framework to\ndiffusion models and propose a novel approach DragDiffusion. By harnessing\nlarge-scale pretrained diffusion models, we greatly enhance the applicability\nof interactive point-based editing on both real and diffusion-generated images.\nOur approach involves optimizing the diffusion latents to achieve precise\nspatial control. The supervision signal of this optimization process is from\nthe diffusion model's UNet features, which are known to contain rich semantic\nand geometric information. Moreover, we introduce two additional techniques,\nnamely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity\nof the original image. Lastly, we present a challenging benchmark dataset\ncalled DragBench -- the first benchmark to evaluate the performance of\ninteractive point-based image editing methods. Experiments across a wide range\nof challenging cases (e.g., images with multiple objects, diverse object\ncategories, various styles, etc.) demonstrate the versatility and generality of\nDragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion.\n","authors":["Yujun Shi","Chuhui Xue","Jun Hao Liew","Jiachun Pan","Hanshu Yan","Wenqing Zhang","Vincent Y. F. Tan","Song Bai"],"pdf_url":"https://arxiv.org/pdf/2306.14435v6.pdf","comment":"Code is released at https://github.com/Yujun-Shi/DragDiffusion"},{"id":"http://arxiv.org/abs/2404.05029v1","updated":"2024-04-07T17:51:53Z","published":"2024-04-07T17:51:53Z","title":"LOGO: A Long-Form Video Dataset for Group Action Quality Assessment","summary":" Action quality assessment (AQA) has become an emerging topic since it can be\nextensively applied in numerous scenarios. However, most existing methods and\ndatasets focus on single-person short-sequence scenes, hindering the\napplication of AQA in more complex situations. To address this issue, we\nconstruct a new multi-person long-form video dataset for action quality\nassessment named LOGO. Distinguished in scenario complexity, our dataset\ncontains 200 videos from 26 artistic swimming events with 8 athletes in each\nsample along with an average duration of 204.2 seconds. 
As for richness in\nannotations, LOGO includes formation labels to depict group information of\nmultiple athletes and detailed annotations on action procedures. Furthermore,\nwe propose a simple yet effective method to model relations among athletes and\nreason about the potential temporal logic in long-form videos. Specifically, we\ndesign a group-aware attention module, which can be easily plugged into\nexisting AQA methods, to enrich the clip-wise representations based on\ncontextual group information. To benchmark LOGO, we systematically conduct\ninvestigations on the performance of several popular methods in AQA and action\nsegmentation. The results reveal the challenges our dataset brings. Extensive\nexperiments also show that our approach achieves state-of-the-art on the LOGO\ndataset. The dataset and code will be released at\n\\url{https://github.com/shiyi-zh0408/LOGO }.\n","authors":["Shiyi Zhang","Wenxun Dai","Sujia Wang","Xiangwei Shen","Jiwen Lu","Jie Zhou","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05029v1.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2404.05024v1","updated":"2024-04-07T17:31:53Z","published":"2024-04-07T17:31:53Z","title":"PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a\n Mobile Robot","summary":" The study of non-line-of-sight (NLOS) imaging is growing due to its many\npotential applications, including rescue operations and pedestrian detection by\nself-driving cars. However, implementing NLOS imaging on a moving camera\nremains an open area of research. Existing NLOS imaging methods rely on\ntime-resolved detectors and laser configurations that require precise optical\nalignment, making it difficult to deploy them in dynamic environments. This\nwork proposes a data-driven approach to NLOS imaging, PathFinder, that can be\nused with a standard RGB camera mounted on a small, power-constrained mobile\nrobot, such as an aerial drone. Our experimental pipeline is designed to\naccurately estimate the 2D trajectory of a person who moves in a\nManhattan-world environment while remaining hidden from the camera's\nfield-of-view. We introduce a novel approach to process a sequence of dynamic\nsuccessive frames in a line-of-sight (LOS) video using an attention-based\nneural network that performs inference in real-time. The method also includes a\npreprocessing selection metric that analyzes images from a moving camera which\ncontain multiple vertical planar surfaces, such as walls and building facades,\nand extracts planes that return maximum NLOS information. We validate the\napproach on in-the-wild scenes using a drone for video capture, thus\ndemonstrating low-cost NLOS imaging in dynamic capture environments.\n","authors":["Shenbagaraj Kannapiran","Sreenithy Chandran","Suren Jayasuriya","Spring Berman"],"pdf_url":"https://arxiv.org/pdf/2404.05024v1.pdf","comment":"First two authors have equal contribution"},{"id":"http://arxiv.org/abs/2404.05023v1","updated":"2024-04-07T17:30:57Z","published":"2024-04-07T17:30:57Z","title":"Scalable and Efficient Hierarchical Visual Topological Mapping","summary":" Hierarchical topological representations can significantly reduce search\ntimes within mapping and localization algorithms. Although recent research has\nshown the potential for such approaches, limited consideration has been given\nto the suitability and comparative performance of different global feature\nrepresentations within this context. 
In this work, we evaluate state-of-the-art\nhand-crafted and learned global descriptors using a hierarchical topological\nmapping technique on benchmark datasets and present results of a comprehensive\nevaluation of the impact of the global descriptor used. Although learned\ndescriptors have been incorporated into place recognition methods to improve\nretrieval accuracy and enhance overall recall, the problem of scalability and\nefficiency when applied to longer trajectories has not been adequately\naddressed in a majority of research studies. Based on our empirical analysis of\nmultiple runs, we identify that continuity and distinctiveness are crucial\ncharacteristics for an optimal global descriptor that enable efficient and\nscalable hierarchical mapping, and present a methodology for quantifying and\ncontrasting these characteristics across different global descriptors. Our\nstudy demonstrates that the use of global descriptors based on an unsupervised\nlearned Variational Autoencoder (VAE) excels in these characteristics and\nachieves significantly lower runtime. It runs on a consumer grade desktop, up\nto 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x\nfaster than the hand-crafted descriptor, PHOG, on the longest track evaluated\n(St Lucia, 17.6 km), without sacrificing overall recall performance.\n","authors":["Saravanabalagi Ramachandran","Jonathan Horgan","Ganesh Sistu","John McDonald"],"pdf_url":"https://arxiv.org/pdf/2404.05023v1.pdf","comment":"Published in the 21st International Conference on Advanced Robotics\n (ICAR 2023)"},{"id":"http://arxiv.org/abs/2404.05022v1","updated":"2024-04-07T17:25:52Z","published":"2024-04-07T17:25:52Z","title":"DinoBloom: A Foundation Model for Generalizable Cell Embeddings in\n Hematology","summary":" In hematology, computational models offer significant potential to improve\ndiagnostic accuracy, streamline workflows, and reduce the tedious work of\nanalyzing single cells in peripheral blood or bone marrow smears. However,\nclinical adoption of computational models has been hampered by the lack of\ngeneralization due to large batch effects, small dataset sizes, and poor\nperformance in transfer learning from natural images. To address these\nchallenges, we introduce DinoBloom, the first foundation model for single cell\nimages in hematology, utilizing a tailored DINOv2 pipeline. Our model is built\nupon an extensive collection of 13 diverse, publicly available datasets of\nperipheral blood and bone marrow smears, the most substantial open-source\ncohort in hematology so far, comprising over 380,000 white blood cell images.\nTo assess its generalization capability, we evaluate it on an external dataset\nwith a challenging domain shift. We show that our model outperforms existing\nmedical and non-medical vision models in (i) linear probing and k-nearest\nneighbor evaluations for cell-type classification on blood and bone marrow\nsmears and (ii) weakly supervised multiple instance learning for acute myeloid\nleukemia subtyping by a large margin. A family of four DinoBloom models (small,\nbase, large, and giant) can be adapted for a wide range of downstream\napplications, be a strong baseline for classification problems, and facilitate\nthe assessment of batch effects in new datasets. All models are available at\ngithub.com/marrlab/DinoBloom.\n","authors":["Valentin Koch","Sophia J. 
Wagner","Salome Kazeminia","Ece Sancar","Matthias Hehr","Julia Schnabel","Tingying Peng","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16671v4","updated":"2024-04-07T17:22:46Z","published":"2023-09-28T17:59:56Z","title":"Demystifying CLIP Data","summary":" Contrastive Language-Image Pre-training (CLIP) is an approach that has\nadvanced research and applications in computer vision, fueling modern\nrecognition systems and generative models. We believe that the main ingredient\nto the success of CLIP is its data and not the model architecture or\npre-training objective. However, CLIP only provides very limited information\nabout its data and how it has been collected, leading to works that aim to\nreproduce CLIP's data by filtering with its model parameters. In this work, we\nintend to reveal CLIP's data curation approach and in our pursuit of making it\nopen to the community introduce Metadata-Curated Language-Image Pre-training\n(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's\nconcepts) and yields a balanced subset over the metadata distribution. Our\nexperimental study rigorously isolates the model and training settings,\nconcentrating solely on data. MetaCLIP applied to CommonCrawl with 400M\nimage-text data pairs outperforms CLIP's data on multiple standard benchmarks.\nIn zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy,\nsurpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining\nthe same training budget, attains 72.4%. Our observations hold across various\nmodel sizes, exemplified by ViT-H achieving 80.5%, without any\nbells-and-whistles. Curation code and training data distribution on metadata is\nmade available at https://github.com/facebookresearch/MetaCLIP.\n","authors":["Hu Xu","Saining Xie","Xiaoqing Ellen Tan","Po-Yao Huang","Russell Howes","Vasu Sharma","Shang-Wen Li","Gargi Ghosh","Luke Zettlemoyer","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.16671v4.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by\n other authors"},{"id":"http://arxiv.org/abs/2112.04731v5","updated":"2024-04-07T17:09:58Z","published":"2021-12-09T07:20:32Z","title":"Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class\n Incremental Learning","summary":" Class Incremental Learning (CIL) aims at learning a multi-class classifier in\na phase-by-phase manner, in which only data of a subset of the classes are\nprovided at each phase. Previous works mainly focus on mitigating forgetting in\nphases after the initial one. However, we find that improving CIL at its\ninitial phase is also a promising direction. Specifically, we experimentally\nshow that directly encouraging CIL Learner at the initial phase to output\nsimilar representations as the model jointly trained on all classes can greatly\nboost the CIL performance. Motivated by this, we study the difference between a\nna\\\"ively-trained initial-phase model and the oracle model. Specifically, since\none major difference between these two models is the number of training\nclasses, we investigate how such difference affects the model representations.\nWe find that, with fewer training classes, the data representations of each\nclass lie in a long and narrow region; with more training classes, the\nrepresentations of each class scatter more uniformly. 
Inspired by this\nobservation, we propose Class-wise Decorrelation (CwD) that effectively\nregularizes representations of each class to scatter more uniformly, thus\nmimicking the model jointly trained with all classes (i.e., the oracle model).\nOur CwD is simple to implement and easy to plug into existing methods.\nExtensive experiments on various benchmark datasets show that CwD consistently\nand significantly improves the performance of existing state-of-the-art methods\nby around 1\\% to 3\\%. Code will be released.\n","authors":["Yujun Shi","Kuangqi Zhou","Jian Liang","Zihang Jiang","Jiashi Feng","Philip Torr","Song Bai","Vincent Y. F. Tan"],"pdf_url":"https://arxiv.org/pdf/2112.04731v5.pdf","comment":"CVPR 2022 Camera-Ready Version"},{"id":"http://arxiv.org/abs/2404.05016v1","updated":"2024-04-07T17:06:22Z","published":"2024-04-07T17:06:22Z","title":"Hyperbolic Learning with Synthetic Captions for Open-World Detection","summary":" Open-world detection poses significant challenges, as it requires the\ndetection of any object using either object class labels or free-form texts.\nExisting related works often use large-scale manual annotated caption datasets\nfor training, which are extremely expensive to collect. Instead, we propose to\ntransfer knowledge from vision-language models (VLMs) to enrich the\nopen-vocabulary descriptions automatically. Specifically, we bootstrap dense\nsynthetic captions using pre-trained VLMs to provide rich descriptions on\ndifferent regions in images, and incorporate these captions to train a novel\ndetector that generalizes to novel concepts. To mitigate the noise caused by\nhallucination in synthetic captions, we also propose a novel hyperbolic\nvision-language learning approach to impose a hierarchy between visual and\ncaption embeddings. We call our detector ``HyperLearner''. We conduct extensive\nexperiments on a wide variety of open-world detection benchmarks (COCO, LVIS,\nObject Detection in the Wild, RefCOCO) and our results show that our model\nconsistently outperforms existing state-of-the-art methods, such as GLIP,\nGLIPv2 and Grounding DINO, when using the same backbone.\n","authors":["Fanjie Kong","Yanbei Chen","Jiarui Cai","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05016v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.05713v3","updated":"2024-04-07T16:59:41Z","published":"2024-02-08T14:40:32Z","title":"Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on\n Vulnerable Patient Populations","summary":" The proliferation of artificial intelligence (AI) in radiology has shed light\non the risk of deep learning (DL) models exacerbating clinical biases towards\nvulnerable patient populations. While prior literature has focused on\nquantifying biases exhibited by trained DL models, demographically targeted\nadversarial bias attacks on DL models and its implication in the clinical\nenvironment remains an underexplored field of research in medical imaging. In\nthis work, we demonstrate that demographically targeted label poisoning attacks\ncan introduce undetectable underdiagnosis bias in DL models. Our results across\nmultiple performance metrics and demographic groups like sex, age, and their\nintersectional subgroups show that adversarial bias attacks demonstrate\nhigh-selectivity for bias in the targeted group by degrading group model\nperformance without impacting overall model performance. 
Furthermore, our\nresults indicate that adversarial bias attacks result in biased DL models that\npropagate prediction bias even when evaluated with external datasets.\n","authors":["Pranav Kulkarni","Andrew Chan","Nithya Navarathna","Skylar Chan","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2402.05713v3.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05014v1","updated":"2024-04-07T16:49:07Z","published":"2024-04-07T16:49:07Z","title":"MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators","summary":" Recent advances in Text-to-Video generation (T2V) have achieved remarkable\nsuccess in synthesizing high-quality general videos from textual descriptions.\nA largely overlooked problem in T2V is that existing models have not adequately\nencoded physical knowledge of the real world, thus generated videos tend to\nhave limited motion and poor variations. In this paper, we propose\n\\textbf{MagicTime}, a metamorphic time-lapse video generation model, which\nlearns real-world physics knowledge from time-lapse videos and implements\nmetamorphic generation. First, we design a MagicAdapter scheme to decouple\nspatial and temporal training, encode more physical knowledge from metamorphic\nvideos, and transform pre-trained T2V models to generate metamorphic videos.\nSecond, we introduce a Dynamic Frames Extraction strategy to adapt to\nmetamorphic time-lapse videos, which have a wider variation range and cover\ndramatic object metamorphic processes, thus embodying more physical knowledge\nthan general videos. Finally, we introduce a Magic Text-Encoder to improve the\nunderstanding of metamorphic video prompts. Furthermore, we create a time-lapse\nvideo-text dataset called \\textbf{ChronoMagic}, specifically curated to unlock\nthe metamorphic video generation ability. Extensive experiments demonstrate the\nsuperiority and effectiveness of MagicTime for generating high-quality and\ndynamic metamorphic videos, suggesting time-lapse video generation is a\npromising path toward building metamorphic simulators of the physical world.\n","authors":["Shenghai Yuan","Jinfa Huang","Yujun Shi","Yongqi Xu","Ruijie Zhu","Bin Lin","Xinhua Cheng","Li Yuan","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.05014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18172v2","updated":"2024-04-07T16:43:51Z","published":"2024-02-28T09:02:33Z","title":"NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative\n Learning for Dynamic Driving Scenes","summary":" In real-world environments, outdoor imaging systems are often affected by\ndisturbances such as rain degradation. Especially, in nighttime driving scenes,\ninsufficient and uneven lighting shrouds the scenes in darkness, resulting\ndegradation of both the image quality and visibility. Particularly, in the\nfield of autonomous driving, the visual perception ability of RGB sensors\nexperiences a sharp decline in such harsh scenarios. Additionally, driving\nassistance systems suffer from reduced capabilities in capturing and discerning\nthe surrounding environment, posing a threat to driving safety. Single-view\ninformation captured by single-modal sensors cannot comprehensively depict the\nentire scene. To address these challenges, we developed an image de-raining\nframework tailored for rainy nighttime driving scenes. 
It aims to remove rain\nartifacts, enrich scene representation, and restore useful information.\nSpecifically, we introduce cooperative learning between visible and infrared\nimages captured by different sensors. By cross-view fusion of these\nmulti-source data, the scene within the images gains richer texture details and\nenhanced contrast. We constructed an information cleaning module called\nCleanNet as the first stage of our framework. Moreover, we designed an\ninformation fusion module called FusionNet as the second stage to fuse the\nclean visible images with infrared images. Using this stage-by-stage learning\nstrategy, we obtain de-rained fusion images with higher quality and better\nvisual perception. Extensive experiments demonstrate the effectiveness of our\nproposed Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in\nlow-light rainy environments. The proposed approach addresses the gap in the\nutilization of existing rain removal algorithms in specific low-light\nconditions.\n","authors":["Cidan Shi","Lihuang Fang","Han Wu","Xiaoyu Xian","Yukai Shi","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.18172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12588v3","updated":"2024-04-07T16:05:03Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricker","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05003v1","updated":"2024-04-07T15:58:25Z","published":"2024-04-07T15:58:25Z","title":"Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across\n Skin Tones","summary":" Remote photoplethysmography (rPPG) emerges as a promising method for\nnon-invasive, convenient measurement of vital signs, utilizing the widespread\npresence of cameras. Despite advancements, existing datasets fall short in\nterms of size and diversity, limiting comprehensive evaluation under diverse\nconditions. This paper presents an in-depth analysis of the VitalVideo dataset,\nthe largest real-world rPPG dataset to date, encompassing 893 subjects and 6\nFitzpatrick skin tones. 
Our experimentation with six unsupervised methods and\nthree supervised models demonstrates that datasets comprising a few hundred\nsubjects(i.e., 300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are\nsufficient for effective rPPG model training. Our findings highlight the\nimportance of diversity and consistency in skin tones for precise performance\nevaluation across different datasets.\n","authors":["Jiankai Tang","Xinyi Li","Jiacheng Liu","Xiyuxing Zhang","Zeyu Wang","Yuntao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05003v1.pdf","comment":"11 pages, 5 figures, CHI24 Workshop PhysioCHI"},{"id":"http://arxiv.org/abs/2404.05001v1","updated":"2024-04-07T15:53:21Z","published":"2024-04-07T15:53:21Z","title":"Dual-Scale Transformer for Large-Scale Single-Pixel Imaging","summary":" Single-pixel imaging (SPI) is a potential computational imaging technique\nwhich produces image by solving an illposed reconstruction problem from few\nmeasurements captured by a single-pixel detector. Deep learning has achieved\nimpressive success on SPI reconstruction. However, previous poor reconstruction\nperformance and impractical imaging model limit its real-world applications. In\nthis paper, we propose a deep unfolding network with hybrid-attention\nTransformer on Kronecker SPI model, dubbed HATNet, to improve the imaging\nquality of real SPI cameras. Specifically, we unfold the computation graph of\nthe iterative shrinkagethresholding algorithm (ISTA) into two alternative\nmodules: efficient tensor gradient descent and hybrid-attention multiscale\ndenoising. By virtue of Kronecker SPI, the gradient descent module can avoid\nhigh computational overheads rooted in previous gradient descent modules based\non vectorized SPI. The denoising module is an encoder-decoder architecture\npowered by dual-scale spatial attention for high- and low-frequency aggregation\nand channel attention for global information recalibration. Moreover, we build\na SPI prototype to verify the effectiveness of the proposed method. Extensive\nexperiments on synthetic and real data demonstrate that our method achieves the\nstate-of-the-art performance. The source code and pre-trained models are\navailable at https://github.com/Gang-Qu/HATNet-SPI.\n","authors":["Gang Qu","Ping Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05001v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04998v1","updated":"2024-04-07T15:48:33Z","published":"2024-04-07T15:48:33Z","title":"Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval","summary":" Deep quantization methods have shown high efficiency on large-scale image\nretrieval. However, current models heavily rely on ground-truth information,\nhindering the application of quantization in label-hungry scenarios. A more\nrealistic demand is to learn from inexhaustible uploaded images that are\nassociated with informal tags provided by amateur users. Though such sketchy\ntags do not obviously reveal the labels, they actually contain useful semantic\ninformation for supervising deep quantization. To this end, we propose\nWeakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first\nwork to learn deep quantization from weakly tagged images. Specifically, 1) we\nuse word embeddings to represent the tags and enhance their semantic\ninformation based on a tag correlation graph. 
2) To better preserve semantic\ninformation in quantization codes and reduce quantization error, we jointly\nlearn semantics-preserving embeddings and supervised quantizer on hypersphere\nby employing a well-designed fusion layer and tailor-made loss functions.\nExtensive experiments show that WSDHQ can achieve state-of-art performance on\nweakly-supervised compact coding. Code is available at\nhttps://github.com/gimpong/AAAI21-WSDHQ.\n","authors":["Jinpeng Wang","Bin Chen","Qiang Zhang","Zaiqiao Meng","Shangsong Liang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2404.04998v1.pdf","comment":"In proceedings of AAAI 2021. Code and data are available"},{"id":"http://arxiv.org/abs/2404.04996v1","updated":"2024-04-07T15:34:40Z","published":"2024-04-07T15:34:40Z","title":"Fantastic Animals and Where to Find Them: Segment Any Marine Animal with\n Dual SAM","summary":" As an important pillar of underwater intelligence, Marine Animal Segmentation\n(MAS) involves segmenting animals within marine environments. Previous methods\ndon't excel in extracting long-range contextual features and overlook the\nconnectivity between discrete pixels. Recently, Segment Anything Model (SAM)\noffers a universal framework for general segmentation tasks. Unfortunately,\ntrained with natural images, SAM does not obtain the prior knowledge from\nmarine images. In addition, the single-position prompt of SAM is very\ninsufficient for prior guidance. To address these issues, we propose a novel\nfeature learning framework, named Dual-SAM for high-performance MAS. To this\nend, we first introduce a dual structure with SAM's paradigm to enhance feature\nlearning of marine images. Then, we propose a Multi-level Coupled Prompt (MCP)\nstrategy to instruct comprehensive underwater prior information, and enhance\nthe multi-level features of SAM's encoder with adapters. Subsequently, we\ndesign a Dilated Fusion Attention Module (DFAM) to progressively integrate\nmulti-level features from SAM's encoder. Finally, instead of directly\npredicting the masks of marine animals, we propose a Criss-Cross Connectivity\nPrediction (C$^3$P) paradigm to capture the inter-connectivity between discrete\npixels. With dual decoders, it generates pseudo-labels and achieves mutual\nsupervision for complementary feature representations, resulting in\nconsiderable improvements over previous techniques. Extensive experiments\nverify that our proposed method achieves state-of-the-art performances on five\nwidely-used MAS datasets. The code is available at\nhttps://github.com/Drchip61/Dual_SAM.\n","authors":["Pingping Zhang","Tianyu Yan","Yang Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04996v1.pdf","comment":"Accepted by CVPR2024 as Poster(Highlight)"},{"id":"http://arxiv.org/abs/2404.04992v1","updated":"2024-04-07T15:27:35Z","published":"2024-04-07T15:27:35Z","title":"Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning","summary":" Recognizing various surgical tools, actions and phases from surgery videos is\nan important problem in computer vision with exciting clinical applications.\nExisting deep-learning-based methods for this problem either process each\nsurgical video as a series of independent images without considering their\ndependence, or rely on complicated deep learning models to count for dependence\nof video frames. 
In this study, we revealed from exploratory data analysis that\nsurgical videos enjoy relatively simple semantic structure, where the presence\nof surgical phases and tools can be well modeled by a compact hidden Markov\nmodel (HMM). Based on this observation, we propose an HMM-stabilized deep\nlearning method for tool presence detection. A wide range of experiments\nconfirm that the proposed approaches achieve better performance with lower\ntraining and running costs, and support more flexible ways to construct and\nutilize training data in scenarios where not all surgery videos of interest are\nextensively labelled. These results suggest that popular deep learning\napproaches with over-complicated model structures may suffer from inefficient\nutilization of data, and integrating ingredients of deep learning and\nstatistical learning wisely may lead to more powerful algorithms that enjoy\ncompetitive performance, transparent interpretation and convenient model\ntraining simultaneously.\n","authors":["Haifeng Wang","Hao Xu","Jun Wang","Jian Zhou","Ke Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04986v1","updated":"2024-04-07T15:06:48Z","published":"2024-04-07T15:06:48Z","title":"Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video\n Anomaly Detection","summary":" We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection,\na novel video anomaly detection methodology that combines pseudo-anomalies,\ndynamic anomaly weighting, and a distinction loss function to improve detection\naccuracy. By training on pseudo-anomalies, our approach adapts to the\nvariability of normal and anomalous behaviors without fixed anomaly thresholds.\nOur model showcases superior performance on the Ped2, Avenue and ShanghaiTech\ndatasets, where individual models are tailored for each scene. These\nachievements highlight DDL's effectiveness in advancing anomaly detection,\noffering a scalable and adaptable solution for video surveillance challenges.\n","authors":["Demetris Lappas","Vasileios Argyriou","Dimitrios Makris"],"pdf_url":"https://arxiv.org/pdf/2404.04986v1.pdf","comment":"To be published in the CVPR2024 Workshop"},{"id":"http://arxiv.org/abs/2404.00521v3","updated":"2024-04-07T15:04:47Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n continuity constrAIned Normalization","summary":" Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. 
Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets. Code:\nhttps://github.com/MaxwellYaoNi/CHAIN\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v3.pdf","comment":"Accepted by CVPR2024. 26 pages full version. Code:\n https://github.com/MaxwellYaoNi/CHAIN"},{"id":"http://arxiv.org/abs/2404.04983v1","updated":"2024-04-07T15:03:46Z","published":"2024-04-07T15:03:46Z","title":"Primary liver cancer classification from routine tumour biopsy using\n weakly supervised deep learning","summary":" The diagnosis of primary liver cancers (PLCs) can be challenging, especially\non biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We\nautomatically classified PLCs on routine-stained biopsies using a weakly\nsupervised learning method. Weak tumour/non-tumour annotations served as labels\nfor training a Resnet18 neural network, and the network's last convolutional\nlayer was used to extract new tumour tile features. Without knowledge of the\nprecise labels of the malignancies, we then applied an unsupervised clustering\nalgorithm. Our model identified specific features of hepatocellular carcinoma\n(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features\nof cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a\nslide could facilitate the diagnosis of primary liver cancers, particularly\ncHCC-CCA.\n Method and results: 166 PLC biopsies were divided into training, internal and\nexternal validation sets: 90, 29 and 47 samples. Two liver pathologists\nreviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI).\nAfter annotating the tumour/non-tumour areas, 256x256 pixel tiles were\nextracted from the WSIs and used to train a ResNet18. The network was used to\nextract new tile features. An unsupervised clustering algorithm was then\napplied to the new tile features. In a two-cluster model, Clusters 0 and 1\ncontained mainly HCC and iCCA histological features. The diagnostic agreement\nbetween the pathological diagnosis and the model predictions in the internal\nand external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78%\n(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a\nhighly variable proportion of tiles from each cluster (Cluster 0: 5-97%;\nCluster 1: 2-94%).\n","authors":["Aurélie Beaufrère","Nora Ouzir","Paul Emile Zafar","Astrid Laurent-Bellue","Miguel Albuquerque","Gwladys Lubuela","Jules Grégory","Catherine Guettier","Kévin Mondet","Jean-Christophe Pesquet","Valérie Paradis"],"pdf_url":"https://arxiv.org/pdf/2404.04983v1.pdf","comment":"https://www.sciencedirect.com/science/article/pii/S2589555924000090"},{"id":"http://arxiv.org/abs/2311.15879v2","updated":"2024-04-07T14:43:38Z","published":"2023-11-27T14:51:37Z","title":"EVCap: Retrieval-Augmented Image Captioning with External Visual-Name\n Memory for Open-World Comprehension","summary":" Large language models (LLMs)-based image captioning has the capability of\ndescribing objects not explicitly observed in training data; yet novel objects\noccur frequently, necessitating the requirement of sustaining up-to-date object\nknowledge for open-world comprehension. 
Instead of relying on large amounts of\ndata and/or scaling up network parameters, we introduce a highly effective\nretrieval-augmented image captioning method that prompts LLMs with object names\nretrieved from External Visual--name memory (EVCap). We build ever-changing\nobject knowledge memory using objects' visuals and names, enabling us to (i)\nupdate the memory at a minimal cost and (ii) effortlessly augment LLMs with\nretrieved object names by utilizing a lightweight and fast-to-train model. Our\nmodel, which was trained only on the COCO dataset, can adapt to out-of-domain\nwithout requiring additional fine-tuning or re-training. Our experiments\nconducted on benchmarks and synthetic commonsense-violating data show that\nEVCap, with only 3.97M trainable parameters, exhibits superior performance\ncompared to other methods based on frozen pre-trained LLMs. Its performance is\nalso competitive to specialist SOTAs that require extensive training.\n","authors":["Jiaxuan Li","Duc Minh Vo","Akihiro Sugimoto","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2311.15879v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04971v1","updated":"2024-04-07T14:21:37Z","published":"2024-04-07T14:21:37Z","title":"FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation\n for 3D Medical Image Segmentation","summary":" Adapting a medical image segmentation model to a new domain is important for\nimproving its cross-domain transferability, and due to the expensive annotation\nprocess, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled\nimages are needed for the adaptation. Existing UDA methods are mainly based on\nimage or feature alignment with adversarial training for regularization, and\nthey are limited by insufficient supervision in the target domain. In this\npaper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for\n3D medical image segmentation. It first uses cross-domain data augmentation to\ntranslate labeled images in the source domain to a dual-domain training set\nconsisting of a pseudo source-domain set and a pseudo target-domain set. To\nleverage the dual-domain augmented images to train a pseudo label generator,\ndomain-specific batch normalization layers are used to deal with the domain\nshift while learning the domain-invariant structure features, generating\nhigh-quality pseudo labels for target-domain images. We then combine labeled\nsource-domain images and target-domain images with pseudo labels to train a\nfinal segmentor, where image-level weighting based on uncertainty estimation\nand pixel-level weighting based on dual-domain consensus are proposed to\nmitigate the adverse effect of noisy pseudo labels. 
Experiments on three public\nmulti-modal datasets for Vestibular Schwannoma, brain tumor and whole heart\nsegmentation show that our method surpassed ten state-of-the-art UDA methods,\nand it even achieved better results than fully supervised learning in the\ntarget domain in some cases.\n","authors":["Jianghao Wu","Dong Guo","Guotai Wang","Qiang Yue","Huijun Yu","Kang Li","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04971v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.06462v2","updated":"2024-04-07T14:05:53Z","published":"2023-12-11T15:51:38Z","title":"Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for\n Audio-Visual Segmentation","summary":" Recently, an audio-visual segmentation (AVS) task has been introduced, aiming\nto group pixels with sounding objects within a given video. This task\nnecessitates a first-ever audio-driven pixel-level understanding of the scene,\nposing significant challenges. In this paper, we propose an innovative\naudio-visual transformer framework, termed COMBO, an acronym for COoperation of\nMulti-order Bilateral relatiOns. For the first time, our framework explores\nthree types of bilateral entanglements within AVS: pixel entanglement, modality\nentanglement, and temporal entanglement. Regarding pixel entanglement, we\nemploy a Siam-Encoder Module (SEM) that leverages prior knowledge to generate\nmore precise visual features from the foundational model. For modality\nentanglement, we design a Bilateral-Fusion Module (BFM), enabling COMBO to\nalign corresponding visual and auditory signals bi-directionally. As for\ntemporal entanglement, we introduce an innovative adaptive inter-frame\nconsistency loss according to the inherent rules of temporal. Comprehensive\nexperiments and ablation studies on AVSBench-object (84.7 mIoU on S4, 59.2 mIou\non MS3) and AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that\nCOMBO surpasses previous state-of-the-art methods. Code and more results will\nbe publicly available at https://yannqi.github.io/AVS-COMBO/.\n","authors":["Qi Yang","Xing Nie","Tong Li","Pengfei Gao","Ying Guo","Cheng Zhen","Pengfei Yan","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.06462v2.pdf","comment":"CVPR 2024 Highlight. 13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.04960v1","updated":"2024-04-07T13:40:29Z","published":"2024-04-07T13:40:29Z","title":"PairAug: What Can Augmented Image-Text Pairs Do for Radiology?","summary":" Current vision-language pre-training (VLP) methodologies predominantly depend\non paired image-text datasets, a resource that is challenging to acquire in\nradiology due to privacy considerations and labelling complexities. Data\naugmentation provides a practical solution to overcome the issue of data\nscarcity, however, most augmentation methods exhibit a limited focus,\nprioritising either image or text augmentation exclusively. Acknowledging this\nlimitation, our objective is to devise a framework capable of concurrently\naugmenting medical image and text data. We design a Pairwise Augmentation\n(PairAug) approach that contains an Inter-patient Augmentation (InterAug)\nbranch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the\nInterAug branch of our approach generates radiology images using synthesised\nyet plausible reports derived from a Large Language Model (LLM). The generated\npairs can be considered a collection of new patient cases since they are\nartificially created and may not exist in the original dataset. 
In contrast,\nthe IntraAug branch uses newly generated reports to manipulate images. This\nprocess allows us to create new paired data for each individual with diverse\nmedical conditions. Our extensive experiments on various downstream tasks\ncovering medical image classification zero-shot and fine-tuning analysis\ndemonstrate that our PairAug, concurrently expanding both image and text data,\nsubstantially outperforms image-/text-only expansion baselines and advanced\nmedical VLP baselines. Our code is released at\n\\url{https://github.com/YtongXie/PairAug}.\n","authors":["Yutong Xie","Qi Chen","Sinuo Wang","Minh-Son To","Iris Lee","Ee Win Khoo","Kerolos Hendy","Daniel Koh","Yong Xia","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04960v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2311.16514v2","updated":"2024-04-07T13:33:56Z","published":"2023-11-27T13:14:06Z","title":"Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation :\n A Unified Approach","summary":" Video Anomaly Detection (VAD) is an open-set recognition task, which is\nusually formulated as a one-class classification (OCC) problem, where training\ndata is comprised of videos with normal instances while test data contains both\nnormal and anomalous instances. Recent works have investigated the creation of\npseudo-anomalies (PAs) using only the normal data and making strong assumptions\nabout real-world anomalies with regards to abnormality of objects and speed of\nmotion to inject prior information about anomalies in an autoencoder (AE) based\nreconstruction model during training. This work proposes a novel method for\ngenerating generic spatio-temporal PAs by inpainting a masked out region of an\nimage using a pre-trained Latent Diffusion Model and further perturbing the\noptical flow using mixup to emulate spatio-temporal distortions in the data. In\naddition, we present a simple unified framework to detect real-world anomalies\nunder the OCC setting by learning three types of anomaly indicators, namely\nreconstruction quality, temporal irregularity and semantic inconsistency.\nExtensive experiments on four VAD benchmark datasets namely Ped2, Avenue,\nShanghaiTech and UBnormal demonstrate that our method performs on par with\nother existing state-of-the-art PAs generation and reconstruction based methods\nunder the OCC setting. Our analysis also examines the transferability and\ngeneralisation of PAs across these datasets, offering valuable insights by\nidentifying real-world anomalies through PAs.\n","authors":["Ayush K. Rai","Tarun Krishna","Feiyan Hu","Alexandru Drimbarean","Kevin McGuinness","Alan F. Smeaton","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2311.16514v2.pdf","comment":"Accepted in CVPRW 2024 - VAND Workshop"},{"id":"http://arxiv.org/abs/2404.04956v1","updated":"2024-04-07T13:30:10Z","published":"2024-04-07T13:30:10Z","title":"Gaussian Shading: Provable Performance-Lossless Image Watermarking for\n Diffusion Models","summary":" Ethical concerns surrounding copyright protection and inappropriate content\ngeneration pose challenges for the practical implementation of diffusion\nmodels. One effective solution involves watermarking the generated images.\nHowever, existing methods often compromise the model performance or require\nadditional training, which is undesirable for operators and users. 
To address\nthis issue, we propose Gaussian Shading, a diffusion model watermarking\ntechnique that is both performance-lossless and training-free, while serving\nthe dual purpose of copyright protection and tracing of offending content. Our\nwatermark embedding is free of model parameter modifications and thus is\nplug-and-play. We map the watermark to latent representations following a\nstandard Gaussian distribution, which is indistinguishable from latent\nrepresentations obtained from the non-watermarked diffusion model. Therefore we\ncan achieve watermark embedding with lossless performance, for which we also\nprovide theoretical proof. Furthermore, since the watermark is intricately\nlinked with image semantics, it exhibits resilience to lossy processing and\nerasure attempts. The watermark can be extracted by Denoising Diffusion\nImplicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian\nShading on multiple versions of Stable Diffusion, and the results demonstrate\nthat Gaussian Shading not only is performance-lossless but also outperforms\nexisting methods in terms of robustness.\n","authors":["Zijin Yang","Kai Zeng","Kejiang Chen","Han Fang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04956v1.pdf","comment":"17 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04953v1","updated":"2024-04-07T13:17:47Z","published":"2024-04-07T13:17:47Z","title":"High-Discriminative Attribute Feature Learning for Generalized Zero-Shot\n Learning","summary":" Zero-shot learning(ZSL) aims to recognize new classes without prior exposure\nto their samples, relying on semantic knowledge from observed classes. However,\ncurrent attention-based models may overlook the transferability of visual\nfeatures and the distinctiveness of attribute localization when learning\nregional features in images. Additionally, they often overlook shared\nattributes among different objects. Highly discriminative attribute features\nare crucial for identifying and distinguishing unseen classes. To address these\nissues, we propose an innovative approach called High-Discriminative Attribute\nFeature Learning for Generalized Zero-Shot Learning (HDAFL). HDAFL optimizes\nvisual features by learning attribute features to obtain discriminative visual\nembeddings. Specifically, HDAFL utilizes multiple convolutional kernels to\nautomatically learn discriminative regions highly correlated with attributes in\nimages, eliminating irrelevant interference in image features. Furthermore, we\nintroduce a Transformer-based attribute discrimination encoder to enhance the\ndiscriminative capability among attributes. Simultaneously, the method employs\ncontrastive loss to alleviate dataset biases and enhance the transferability of\nvisual features, facilitating better semantic transfer between seen and unseen\nclasses. Experimental results demonstrate the effectiveness of HDAFL across\nthree widely used datasets.\n","authors":["Yu Lei","Guoshuai Sheng","Fangfang Li","Quanxue Gao","Cheng Deng","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2404.04953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09434v2","updated":"2024-04-07T13:05:24Z","published":"2024-03-14T14:25:10Z","title":"Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D\n Gaussians","summary":" Reconstructing and simulating elastic objects from visual observations is\ncrucial for applications in computer vision and robotics. 
Existing methods,\nsuch as 3D Gaussians, model 3D appearance and geometry, but lack the ability to\nestimate physical properties for objects and simulate them. The core challenge\nlies in integrating an expressive yet efficient physical dynamics model. We\npropose Spring-Gaus, a 3D physical object representation for reconstructing and\nsimulating elastic objects from videos of the object from multiple viewpoints.\nIn particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian\nkernels, enabling the reconstruction of the visual appearance, shape, and\nphysical dynamics of the object. Our approach enables future prediction and\nsimulation under various initial states and environmental properties. We\nevaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating\naccurate reconstruction and simulation of elastic objects. Project page:\nhttps://zlicheng.com/spring_gaus.\n","authors":["Licheng Zhong","Hong-Xing Yu","Jiajun Wu","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2403.09434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05964v2","updated":"2024-04-07T13:03:58Z","published":"2024-02-05T12:16:28Z","title":"A Survey on Transformer Compression","summary":" Transformer plays a vital role in the realms of natural language processing\n(NLP) and computer vision (CV), specially for constructing large language\nmodels (LLM) and large vision models (LVM). Model compression methods reduce\nthe memory and computational cost of Transformer, which is a necessary step to\nimplement large language/vision models on practical devices. Given the unique\narchitecture of Transformer, featuring alternative attention and feedforward\nneural network (FFN) modules, specific compression techniques are usually\nrequired. The efficiency of these compression methods is also paramount, as\nretraining large models on the entire training dataset is usually impractical.\nThis survey provides a comprehensive review of recent compression methods, with\na specific focus on their application to Transformer-based models. The\ncompression methods are primarily categorized into pruning, quantization,\nknowledge distillation, and efficient architecture design (Mamba, RetNet, RWKV,\netc.). In each category, we discuss compression methods for both language and\nvision tasks, highlighting common underlying principles. Finally, we delve into\nthe relation between various compression methods, and discuss further\ndirections in this domain.\n","authors":["Yehui Tang","Yunhe Wang","Jianyuan Guo","Zhijun Tu","Kai Han","Hailin Hu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.05964v2.pdf","comment":"Model Compression, Transformer, Large Language Model, Large Vision\n Model, LLM"},{"id":"http://arxiv.org/abs/2404.04946v1","updated":"2024-04-07T12:57:41Z","published":"2024-04-07T12:57:41Z","title":"AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via\n Subject Alignment","summary":" Recent video editing advancements rely on accurate pose sequences to animate\nsubjects. However, these efforts are not suitable for cross-species animation\ndue to pose misalignment between species (for example, the poses of a cat\ndiffers greatly from that of a pig due to differences in body structure). In\nthis paper, we present AnimateZoo, a zero-shot diffusion-based video generator\nto address this challenging cross-species animation issue, aiming to accurately\nproduce animal animations while preserving the background. 
The key technique\nused in our AnimateZoo is subject alignment, which includes two steps. First,\nwe improve appearance feature extraction by integrating a Laplacian detail\nbooster and a prompt-tuning identity extractor. These components are\nspecifically designed to capture essential appearance information, including\nidentity and fine details. Second, we align shape features and address\nconflicts from differing subjects by introducing a scale-information remover.\nThis ensures accurate cross-species animation. Moreover, we introduce two\nhigh-quality animal video datasets featuring a wide variety of species. Trained\non these extensive datasets, our model is capable of generating videos\ncharacterized by accurate movements, consistent appearance, and high-fidelity\nframes, without the need for the pre-inference fine-tuning that prior arts\nrequired. Extensive experiments showcase the outstanding performance of our\nmethod in cross-species action following tasks, demonstrating exceptional shape\nadaptation capability. The project page is available at\nhttps://justinxu0.github.io/AnimateZoo/.\n","authors":["Yuanfeng Xu","Yuhao Chen","Zhongzhan Huang","Zijian He","Guangrun Wang","Philip Torr","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04946v1.pdf","comment":"Technical report,15 pages"},{"id":"http://arxiv.org/abs/2404.03043v2","updated":"2024-04-07T12:37:04Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but, are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value ${\\sigma}$\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius ${\\rho}$ and an orientation angle ${\\theta}$, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two ${\\theta}$ parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n${\\theta}$ vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. 
Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based ${\\theta}$ initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Aicha Baya Goumeidane","Djemel Ziou"],"pdf_url":"https://arxiv.org/pdf/2404.03043v2.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2305.13799v2","updated":"2024-04-07T12:33:08Z","published":"2023-05-23T08:13:09Z","title":"UPNet: Uncertainty-based Picking Deep Learning Network for Robust First\n Break Picking","summary":" In seismic exploration, first break (FB) picking is a crucial aspect in the\ndetermination of subsurface velocity models, significantly influencing the\nplacement of wells. Many deep neural networks (DNNs)-based automatic picking\nmethods have been proposed to accelerate this processing. Significantly, the\nsegmentation-based DNN methods provide a segmentation map and then estimate FB\nfrom the map using a picking threshold. However, the uncertainty of the results\npicked by DNNs still needs to be analyzed. Thus, the automatic picking methods\napplied in field datasets can not ensure robustness, especially in the case of\na low signal-to-noise ratio (SNR). In this paper, we introduce uncertainty\nquantification into the FB picking task and propose a novel uncertainty-based\npicking deep learning network called UPNet. UPNet not only estimates the\nuncertainty of network output but also can filter the pickings with low\nconfidence. Many experiments evaluate that UPNet exhibits higher accuracy and\nrobustness than the deterministic DNN-based model, achieving State-of-the-Art\n(SOTA) performance in field surveys. In addition, we verify that the\nmeasurement uncertainty is meaningful, which can provide a reference for human\ndecision-making.\n","authors":["Hongtao Wang","Jiangshe Zhang","Xiaoli Wei","Li Long","Chunxia Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04936v1","updated":"2024-04-07T12:17:40Z","published":"2024-04-07T12:17:40Z","title":"Bootstrapping Chest CT Image Understanding by Distilling Knowledge from\n X-ray Expert Models","summary":" Radiologists highly desire fully automated versatile AI for medical imaging\ninterpretation. However, the lack of extensively annotated large-scale\nmulti-disease datasets has hindered the achievement of this goal. In this\npaper, we explore the feasibility of leveraging language as a naturally\nhigh-quality supervision for chest CT imaging. In light of the limited\navailability of image-report pairs, we bootstrap the understanding of 3D chest\nCT images by distilling chest-related diagnostic knowledge from an extensively\npre-trained 2D X-ray expert model. Specifically, we propose a language-guided\nretrieval method to match each 3D CT image with its semantically closest 2D\nX-ray image, and perform pair-wise and semantic relation knowledge\ndistillation. Subsequently, we use contrastive learning to align images and\nreports within the same patient while distinguishing them from the other\npatients. However, the challenge arises when patients have similar semantic\ndiagnoses, such as healthy patients, potentially confusing if treated as\nnegatives. We introduce a robust contrastive learning that identifies and\ncorrects these false negatives. 
We train our model with over 12,000 pairs of\nchest CT images and radiology reports. Extensive experiments across multiple\nscenarios, including zero-shot learning, report generation, and fine-tuning\nprocesses, demonstrate the model's feasibility in interpreting chest CT images.\n","authors":["Weiwei Cao","Jianpeng Zhang","Yingda Xia","Tony C. W. Mok","Zi Li","Xianghua Ye","Le Lu","Jian Zheng","Yuxing Tang","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04936v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04935v1","updated":"2024-04-07T12:15:53Z","published":"2024-04-07T12:15:53Z","title":"Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis\n Through Self-Supervised Learning","summary":" The electrocardiogram (ECG) is an essential tool for diagnosing heart\ndisease, with computer-aided systems improving diagnostic accuracy and reducing\nhealthcare costs. Despite advancements, existing systems often miss rare\ncardiac anomalies that could be precursors to serious, life-threatening issues\nor alterations in the cardiac macro/microstructure. We address this gap by\nfocusing on self-supervised anomaly detection (AD), training exclusively on\nnormal ECGs to recognize deviations indicating anomalies. We introduce a novel\nself-supervised learning framework for ECG AD, utilizing a vast dataset of\nnormal ECGs to autonomously detect and localize cardiac anomalies. It proposes\na novel masking and restoration technique alongside a multi-scale\ncross-attention module, enhancing the model's ability to integrate global and\nlocal signal features. The framework emphasizes accurate localization of\nanomalies within ECG signals, ensuring the method's clinical relevance and\nreliability. To reduce the impact of individual variability, the approach\nfurther incorporates crucial patient-specific information from ECG reports,\nsuch as age and gender, thus enabling accurate identification of a broad\nspectrum of cardiac anomalies, including rare ones. Utilizing an extensive\ndataset of 478,803 ECG graphic reports from real-world clinical practice, our\nmethod has demonstrated exceptional effectiveness in AD across all tested\nconditions, regardless of their frequency of occurrence, significantly\noutperforming existing models. It achieved superior performance metrics,\nincluding an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%,\na specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of\n90%. It has also demonstrated robust localization capabilities, with an AUROC\nof 76.5% and a Dice coefficient of 65.3% for anomaly localization.\n","authors":["Aofan Jiang","Chaoqin Huang","Qing Cao","Yuchen Xu","Zi Zeng","Kang Chen","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04933v1","updated":"2024-04-07T12:14:42Z","published":"2024-04-07T12:14:42Z","title":"UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection","summary":" Temporal Action Detection (TAD) focuses on detecting pre-defined actions,\nwhile Moment Retrieval (MR) aims to identify the events described by open-ended\nnatural language within untrimmed videos. Despite that they focus on different\nevents, we observe they have a significant connection. For instance, most\ndescriptions in MR involve multiple actions from TAD. In this paper, we aim to\ninvestigate the potential synergy between TAD and MR. 
Firstly, we propose a\nunified architecture, termed Unified Moment Detection (UniMD), for both TAD and\nMR. It transforms the inputs of the two tasks, namely actions for TAD or events\nfor MR, into a common embedding space, and utilizes two novel query-dependent\ndecoders to generate a uniform output of classification score and temporal\nsegments. Secondly, we explore the efficacy of two task fusion learning\napproaches, pre-training and co-training, in order to enhance the mutual\nbenefits between TAD and MR. Extensive experiments demonstrate that the\nproposed task fusion learning scheme enables the two tasks to help each other\nand outperform the separately trained counterparts. Impressively, UniMD\nachieves state-of-the-art results on three paired datasets Ego4D, Charades-STA,\nand ActivityNet. Our code will be released at\nhttps://github.com/yingsen1/UniMD.\n","authors":["Yingsen Zeng","Yujie Zhong","Chengjian Feng","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04933v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2402.13185v4","updated":"2024-04-07T12:11:28Z","published":"2024-02-20T17:52:12Z","title":"UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance\n Editing","summary":" Recent advances in text-guided video editing have showcased promising results\nin appearance editing (e.g., stylization). However, video motion editing in the\ntemporal dimension (e.g., from eating to waving), which distinguishes video\nediting from image editing, is underexplored. In this work, we present UniEdit,\na tuning-free framework that supports both video motion and appearance editing\nby harnessing the power of a pre-trained text-to-video generator within an\ninversion-then-generation framework. To realize motion editing while preserving\nsource video content, based on the insights that temporal and spatial\nself-attention layers encode inter-frame and intra-frame dependency\nrespectively, we introduce auxiliary motion-reference and reconstruction\nbranches to produce text-guided motion and source features respectively. The\nobtained features are then injected into the main editing path via temporal and\nspatial self-attention layers. Extensive experiments demonstrate that UniEdit\ncovers video motion editing and various appearance editing scenarios, and\nsurpasses the state-of-the-art methods. Our code will be publicly available.\n","authors":["Jianhong Bai","Tianyu He","Yuchi Wang","Junliang Guo","Haoji Hu","Zuozhu Liu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2402.13185v4.pdf","comment":"Project page: https://jianhongbai.github.io/UniEdit/"},{"id":"http://arxiv.org/abs/2305.13600v2","updated":"2024-04-07T11:50:34Z","published":"2023-05-23T02:02:36Z","title":"SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person\n Re-Identification with Clothes Change","summary":" In this paper, we address a highly challenging yet critical task:\nunsupervised long-term person re-identification with clothes change. Existing\nunsupervised person re-id methods are mainly designed for short-term scenarios\nand usually rely on RGB cues so that fail to perceive feature patterns that are\nindependent of the clothes. To crack this bottleneck, we propose a\nsilhouette-driven contrastive learning (SiCL) method, which is designed to\nlearn cross-clothes invariance by integrating both the RGB cues and the\nsilhouette information within a contrastive learning framework. 
To our\nknowledge, this is the first tailor-made framework for unsupervised long-term\nclothes change \\reid{}, with superior performance on six benchmark datasets. We\nconduct extensive experiments to evaluate our proposed SiCL compared to the\nstate-of-the-art unsupervised person reid methods across all the representative\ndatasets. Experimental results demonstrate that our proposed SiCL significantly\noutperforms other unsupervised re-id methods.\n","authors":["Mingkun Li","Peng Xu","Chun-Guang Li","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2305.13600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04924v1","updated":"2024-04-07T11:48:07Z","published":"2024-04-07T11:48:07Z","title":"GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing\n Sparsity, Trained from Scratch on Small Datasets","summary":" Vision Transformers (ViTs) have achieved impressive results in large-scale\nimage classification. However, when training from scratch on small datasets,\nthere is still a significant performance gap between ViTs and Convolutional\nNeural Networks (CNNs), which is attributed to the lack of inductive bias. To\naddress this issue, we propose a Graph-based Vision Transformer (GvT) that\nutilizes graph convolutional projection and graph-pooling. In each block,\nqueries and keys are calculated through graph convolutional projection based on\nthe spatial adjacency matrix, while dot-product attention is used in another\ngraph convolution to generate values. When using more attention heads, the\nqueries and keys become lower-dimensional, making their dot product an\nuninformative matching function. To overcome this low-rank bottleneck in\nattention heads, we employ talking-heads technology based on bilinear pooled\nfeatures and sparse selection of attention tensors. This allows interaction\namong filtered attention scores and enables each attention mechanism to depend\non all queries and keys. Additionally, we apply graph-pooling between two\nintermediate blocks to reduce the number of tokens and aggregate semantic\ninformation more effectively. Our experimental results show that GvT produces\ncomparable or superior outcomes to deep convolutional networks and surpasses\nvision transformers without pre-training on large datasets. The code for our\nproposed model is publicly available on the website.\n","authors":["Dongjing Shan","guiqiang chen"],"pdf_url":"https://arxiv.org/pdf/2404.04924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12488v2","updated":"2024-04-07T11:38:48Z","published":"2024-03-19T06:54:33Z","title":"DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of\n MLLM","summary":" We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot\nobject detection ability of multimodal large language models (MLLMs), such as\nGPT-4V and Gemini. Our approach consists of a detection prompting toolkit\ninspired by high-precision detection priors and a new Chain-of-Thought to\nimplement these prompts. Specifically, the prompts in the toolkit are designed\nto guide the MLLM to focus on regional information (e.g., zooming in), read\ncoordinates according to measure standards (e.g., overlaying rulers and\ncompasses), and infer from the contextual information (e.g., overlaying scene\ngraphs). Building upon these tools, the new detection chain-of-thought can\nautomatically decompose the task into simple subtasks, diagnose the\npredictions, and plan for progressive box refinements. 
The effectiveness of our\nframework is demonstrated across a spectrum of detection tasks, especially hard\ncases. Compared to existing state-of-the-art methods, GPT-4V with our\nDetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS\nCOCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val\nset for zero-shot referring expression comprehension, +14.5% AP on D-cube\ndescribe object detection FULL setting.\n","authors":["Yixuan Wu","Yizhou Wang","Shixiang Tang","Wenhao Wu","Tong He","Wanli Ouyang","Jian Wu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04922v1","updated":"2024-04-07T11:25:04Z","published":"2024-04-07T11:25:04Z","title":"Efficient Learnable Collaborative Attention for Single Image\n Super-Resolution","summary":" Non-Local Attention (NLA) is a powerful technique for capturing long-range\nfeature correlations in deep single image super-resolution (SR). However, NLA\nsuffers from high computational complexity and memory consumption, as it\nrequires aggregating all non-local feature information for each query response\nand recalculating the similarity weight distribution for different abstraction\nlevels of features. To address these challenges, we propose a novel Learnable\nCollaborative Attention (LCoA) that introduces inductive bias into non-local\nmodeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP)\nand Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to\ndynamically adjust the sparse attention pattern of deep features, which reduces\nthe number of non-local modeling rounds compared with existing sparse\nsolutions. CoA leverages the sparse attention pattern and weights learned by\nLSP, and co-optimizes the similarity matrix across different abstraction\nlevels, which avoids redundant similarity matrix calculations. The experimental\nresults show that our LCoA can reduce the non-local modeling time by about 83%\nin the inference stage. In addition, we integrate our LCoA into a deep\nLearnable Collaborative Attention Network (LCoAN), which achieves competitive\nperformance in terms of inference time, memory consumption, and reconstruction\nquality compared with other state-of-the-art SR methods.\n","authors":["Yigang Zhao Chaowei Zheng","Jiannan Su"," GuangyongChen"," MinGan"],"pdf_url":"https://arxiv.org/pdf/2404.04922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16499v2","updated":"2024-04-07T11:16:15Z","published":"2024-03-25T07:34:06Z","title":"Self-Supervised Learning for Medical Image Data with Anatomy-Oriented\n Imaging Planes","summary":" Self-supervised learning has emerged as a powerful tool for pretraining deep\nnetworks on unlabeled data, prior to transfer learning of target tasks with\nlimited annotation. The relevance between the pretraining pretext and target\ntasks is crucial to the success of transfer learning. Various pretext tasks\nhave been proposed to utilize properties of medical image data (e.g., three\ndimensionality), which are more relevant to medical image analysis than generic\nones for natural images. However, previous work rarely paid attention to data\nwith anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance\nimaging views. As these imaging planes are defined according to the anatomy of\nthe imaged organ, pretext tasks effectively exploiting this information can\npretrain the networks to gain knowledge on the organ of interest. 
In this work,\nwe propose two complementary pretext tasks for this group of medical image data\nbased on the spatial relationship of the imaging planes. The first is to learn\nthe relative orientation between the imaging planes and implemented as\nregressing their intersecting lines. The second exploits parallel imaging\nplanes to regress their relative slice locations within a stack. Both pretext\ntasks are conceptually straightforward and easy to implement, and can be\ncombined in multitask learning for better representation learning. Thorough\nexperiments on two anatomical structures (heart and knee) and representative\ntarget tasks (semantic segmentation and classification) demonstrate that the\nproposed pretext tasks are effective in pretraining deep networks for\nremarkably boosted performance on the target tasks, and superior to other\nrecent approaches.\n","authors":["Tianwei Zhang","Dong Wei","Mengmeng Zhu","Shi Gu","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.16499v2.pdf","comment":"Medical Image Analysis"},{"id":"http://arxiv.org/abs/2404.04916v1","updated":"2024-04-07T10:57:54Z","published":"2024-04-07T10:57:54Z","title":"Correcting Diffusion-Based Perceptual Image Compression with Privileged\n End-to-End Decoder","summary":" The images produced by diffusion models can attain excellent perceptual\nquality. However, it is challenging for diffusion models to guarantee\ndistortion, hence the integration of diffusion models and image compression\nmodels still needs more comprehensive explorations. This paper presents a\ndiffusion-based image compression method that employs a privileged end-to-end\ndecoder model as correction, which achieves better perceptual quality while\nguaranteeing the distortion to an extent. We build a diffusion model and design\na novel paradigm that combines the diffusion model and an end-to-end decoder,\nand the latter is responsible for transmitting the privileged information\nextracted at the encoder side. Specifically, we theoretically analyze the\nreconstruction process of the diffusion models at the encoder side with the\noriginal images being visible. Based on the analysis, we introduce an\nend-to-end convolutional decoder to provide a better approximation of the score\nfunction $\\nabla_{\\mathbf{x}_t}\\log p(\\mathbf{x}_t)$ at the encoder side and\neffectively transmit the combination. Experiments demonstrate the superiority\nof our method in both distortion and perception compared with previous\nperceptual compression methods.\n","authors":["Yiyang Ma","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04913v1","updated":"2024-04-07T10:49:59Z","published":"2024-04-07T10:49:59Z","title":"CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality\n Novel-view Synthesis","summary":" Neural Radiance Fields (NeRF) have achieved huge success in effectively\ncapturing and representing 3D objects and scenes. However, several factors have\nimpeded its further proliferation as next-generation 3D media. To establish a\nubiquitous presence in everyday media formats, such as images and videos, it is\nimperative to devise a solution that effectively fulfills three key objectives:\nfast encoding and decoding time, compact model sizes, and high-quality\nrenderings. Despite significant advancements, a comprehensive algorithm that\nadequately addresses all objectives has yet to be fully realized. 
In this work,\nwe present CodecNeRF, a neural codec for NeRF representations, consisting of a\nnovel encoder and decoder architecture that can generate a NeRF representation\nin a single forward pass. Furthermore, inspired by the recent\nparameter-efficient finetuning approaches, we develop a novel finetuning method\nto efficiently adapt the generated NeRF representations to a new test instance,\nleading to high-quality image renderings and compact code sizes. The proposed\nCodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF,\nachieved unprecedented compression performance of more than 150x and 20x\nreduction in encoding time while maintaining (or improving) the image quality\non widely used 3D object datasets, such as ShapeNet and Objaverse.\n","authors":["Gyeongjin Kang","Younggeun Lee","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2404.04913v1.pdf","comment":"34 pages, 22 figures, Project page:\n https://gynjn.github.io/Codec-NeRF/"},{"id":"http://arxiv.org/abs/2404.04910v1","updated":"2024-04-07T10:39:04Z","published":"2024-04-07T10:39:04Z","title":"MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D\n Object Detection","summary":" Monocular 3D object detection (Mono3D) is an indispensable research topic in\nautonomous driving, thanks to the cost-effective monocular camera sensors and\nits wide range of applications. Since the image perspective has depth\nambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and\nreconstructing 3D object information from a single image. Previous methods\nattempted to transfer 3D information directly from the LiDAR-based teacher to\nthe camera-based student. However, a considerable gap in feature representation\nmakes direct cross-modal distillation inefficient, resulting in a significant\nperformance deterioration between the LiDAR-based teacher and the camera-based\nstudent. To address this issue, we propose the Teaching Assistant Knowledge\nDistillation (MonoTAKD) to break down the learning objective by integrating\nintra-modal distillation with cross-modal residual distillation. In particular,\nwe employ a strong camera-based teaching assistant model to distill powerful\nvisual knowledge effectively through intra-modal distillation. Subsequently, we\nintroduce the cross-modal residual distillation to transfer the 3D spatial\ncues. By acquiring both visual knowledge and 3D spatial cues, the predictions\nof our approach are rigorously evaluated on the KITTI 3D object detection\nbenchmark and achieve state-of-the-art performance in Mono3D.\n","authors":["Hou-I Liu","Christine Wu","Jen-Hao Cheng","Wenhao Chai","Shian-Yun Wang","Gaowen Liu","Jenq-Neng Hwang","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.04910v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.04908v1","updated":"2024-04-07T10:28:01Z","published":"2024-04-07T10:28:01Z","title":"Dual-Camera Smooth Zoom on Mobile Phones","summary":" When zooming between dual cameras on a mobile, noticeable jumps in geometric\ncontent and image color occur in the preview, inevitably affecting the user's\nzoom experience. In this work, we introduce a new task, ie, dual-camera smooth\nzoom (DCSZ) to achieve a smooth zoom preview. 
The frame interpolation (FI)\ntechnique is a potential solution but struggles with ground-truth collection.\nTo address the issue, we suggest a data factory solution where continuous\nvirtual cameras are assembled to generate DCSZ data by rendering reconstructed\n3D models of the scene. In particular, we propose a novel dual-camera smooth\nzoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is\nintroduced to construct a specific 3D model for each virtual camera. With the\nproposed data factory, we construct a synthetic dataset for DCSZ, and we\nutilize it to fine-tune FI models. In addition, we collect real-world dual-zoom\nimages without ground-truth for evaluation. Extensive experiments are conducted\nwith multiple FI methods. The results show that the fine-tuned FI models\nachieve a significant performance improvement over the original ones on DCSZ\ntask. The datasets, codes, and pre-trained models will be publicly available.\n","authors":["Renlong Wu","Zhilu Zhang","Yu Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04908v1.pdf","comment":"24"},{"id":"http://arxiv.org/abs/2403.16834v2","updated":"2024-04-07T09:56:54Z","published":"2024-03-25T14:57:29Z","title":"From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual\n Prompt Learning and Knowledge Distillation","summary":" Due to the complementary nature of visible light and thermal infrared\nmodalities, object tracking based on the fusion of visible light images and\nthermal images (referred to as RGB-T tracking) has received increasing\nattention from researchers in recent years. How to achieve more comprehensive\nfusion of information from the two modalities at a lower cost has been an issue\nthat researchers have been exploring. Inspired by visual prompt learning, we\ndesigned a novel two-stream RGB-T tracking architecture based on cross-modal\nmutual prompt learning, and used this model as a teacher to guide a one-stream\nstudent model for rapid learning through knowledge distillation techniques.\nExtensive experiments have shown that, compared to similar RGB-T trackers, our\ndesigned teacher model achieved the highest precision rate, while the student\nmodel, with comparable precision rate to the teacher model, realized an\ninference speed more than three times faster than the teacher model.(Codes will\nbe available if accepted.)\n","authors":["Yang Luo","Xiqing Guo","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2403.16834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11700v4","updated":"2024-04-07T09:17:34Z","published":"2023-11-20T12:08:23Z","title":"GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting","summary":" In this paper, we introduce \\textbf{GS-SLAM} that first utilizes 3D Gaussian\nrepresentation in the Simultaneous Localization and Mapping (SLAM) system. It\nfacilitates a better balance between efficiency and accuracy. Compared to\nrecent SLAM methods employing neural implicit representations, our method\nutilizes a real-time differentiable splatting rendering pipeline that offers\nsignificant speedup to map optimization and RGB-D rendering. Specifically, we\npropose an adaptive expansion strategy that adds new or deletes noisy 3D\nGaussians in order to efficiently reconstruct new observed scene geometry and\nimprove the mapping of previously observed areas. This strategy is essential to\nextend 3D Gaussian representation to reconstruct the whole scene rather than\nsynthesize a static object in existing methods. 
Moreover, in the pose tracking\nprocess, an effective coarse-to-fine technique is designed to select reliable\n3D Gaussian representations to optimize camera pose, resulting in runtime\nreduction and robust estimation. Our method achieves competitive performance\ncompared with existing state-of-the-art real-time methods on the Replica,\nTUM-RGBD datasets. Project page: https://gs-slam.github.io/.\n","authors":["Chi Yan","Delin Qu","Dan Xu","Bin Zhao","Zhigang Wang","Dong Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11700v4.pdf","comment":"Accepted to CVPR 2024(highlight). Project Page:\n https://gs-slam.github.io/"},{"id":"http://arxiv.org/abs/2404.04891v1","updated":"2024-04-07T09:17:00Z","published":"2024-04-07T09:17:00Z","title":"DL-EWF: Deep Learning Empowering Women's Fashion with\n Grounded-Segment-Anything Segmentation for Body Shape Classification","summary":" The global fashion industry plays a pivotal role in the global economy, and\naddressing fundamental issues within the industry is crucial for developing\ninnovative solutions. One of the most pressing challenges in the fashion\nindustry is the mismatch between body shapes and the garments of individuals\nthey purchase. This issue is particularly prevalent among individuals with\nnon-ideal body shapes, exacerbating the challenges faced. Considering\ninter-individual variability in body shapes is essential for designing and\nproducing garments that are widely accepted by consumers. Traditional methods\nfor determining human body shape are limited due to their low accuracy, high\ncosts, and time-consuming nature. New approaches, utilizing digital imaging and\ndeep neural networks (DNN), have been introduced to identify human body shape.\nIn this study, the Style4BodyShape dataset is used for classifying body shapes\ninto five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and\nApple. In this paper, the body shape segmentation of a person is extracted from\nthe image, disregarding the surroundings and background. Then, Various\npre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19, and\nInception v3, are used to classify the segmentation results. Among these\npre-trained models, the Inception V3 model demonstrates superior performance\nregarding f1-score evaluation metric and accuracy compared to the other models.\n","authors":["Fatemeh Asghari","Mohammad Reza Soheili","Faezeh Gholamrezaie"],"pdf_url":"https://arxiv.org/pdf/2404.04891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04890v1","updated":"2024-04-07T09:15:45Z","published":"2024-04-07T09:15:45Z","title":"A Unified Diffusion Framework for Scene-aware Human Motion Estimation\n from Sparse Signals","summary":" Estimating full-body human motion via sparse tracking signals from\nhead-mounted displays and hand controllers in 3D scenes is crucial to\napplications in AR/VR. One of the biggest challenges to this task is the\none-to-many mapping from sparse observations to dense full-body motions, which\nendowed inherent ambiguities. To help resolve this ambiguous problem, we\nintroduce a new framework to combine rich contextual information provided by\nscenes to benefit full-body motion tracking from sparse observations. 
To\nestimate plausible human motions given sparse tracking signals and 3D scenes,\nwe develop $\\text{S}^2$Fusion, a unified framework fusing \\underline{S}cene and\nsparse \\underline{S}ignals with a conditional dif\\underline{Fusion} model.\n$\\text{S}^2$Fusion first extracts the spatial-temporal relations residing in\nthe sparse signals via a periodic autoencoder, and then produces time-alignment\nfeature embedding as additional inputs. Subsequently, by drawing initial noisy\nmotion from a pre-trained prior, $\\text{S}^2$Fusion utilizes conditional\ndiffusion to fuse scene geometry and sparse tracking signals to generate\nfull-body scene-aware motions. The sampling procedure of $\\text{S}^2$Fusion is\nfurther guided by a specially designed scene-penetration loss and\nphase-matching loss, which effectively regularizes the motion of the lower body\neven in the absence of any tracking signals, making the generated motion much\nmore plausible and coherent. Extensive experimental results have demonstrated\nthat our $\\text{S}^2$Fusion outperforms the state-of-the-art in terms of\nestimation quality and smoothness.\n","authors":["Jiangnan Tang","Jingya Wang","Kaiyang Ji","Lan Xu","Jingyi Yu","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04887v1","updated":"2024-04-07T09:08:14Z","published":"2024-04-07T09:08:14Z","title":"A Clinical-oriented Multi-level Contrastive Learning Method for Disease\n Diagnosis in Low-quality Medical Images","summary":" Representation learning offers a conduit to elucidate distinctive features\nwithin the latent space and interpret the deep models. However, the randomness\nof lesion distribution and the complexity of low-quality factors in medical\nimages pose great challenges for models to extract key lesion features. Disease\ndiagnosis methods guided by contrastive learning (CL) have shown significant\nadvantages in lesion feature representation. Nevertheless, the effectiveness of\nCL is highly dependent on the quality of the positive and negative sample\npairs. In this work, we propose a clinical-oriented multi-level CL framework\nthat aims to enhance the model's capacity to extract lesion features and\ndiscriminate between lesion and low-quality factors, thereby enabling more\naccurate disease diagnosis from low-quality medical images. Specifically, we\nfirst construct multi-level positive and negative pairs to enhance the model's\ncomprehensive recognition capability of lesion features by integrating\ninformation from different levels and qualities of medical images. Moreover, to\nimprove the quality of the learned lesion embeddings, we introduce a dynamic\nhard sample mining method based on self-paced learning. The proposed CL\nframework is validated on two public medical image datasets, EyeQ and Chest\nX-ray, demonstrating superior performance compared to other state-of-the-art\ndisease diagnostic methods.\n","authors":["Qingshan Hou","Shuai Cheng","Peng Cao","Jinzhu Yang","Xiaoli Liu","Osmar R. Zaiane","Yih Chung Tham"],"pdf_url":"https://arxiv.org/pdf/2404.04887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04884v1","updated":"2024-04-07T09:05:04Z","published":"2024-04-07T09:05:04Z","title":"LRNet: Change detection of high-resolution remote sensing imagery via\n strategy of localization-then-refinement","summary":" Change detection, as a research hotspot in the field of remote sensing, has\nwitnessed continuous development and progress. 
However, the discrimination of\nboundary details remains a significant bottleneck due to the complexity of\nsurrounding elements between change areas and backgrounds. Discriminating the\nboundaries of large change areas results in misalignment, while connecting\nboundaries occurs for small change targets. To address the above issues, a\nnovel network based on the localization-then-refinement strategy is proposed in\nthis paper, namely LRNet. LRNet consists of two stages: localization and\nrefinement. In the localization stage, a three-branch encoder simultaneously\nextracts original image features and their differential features for\ninteractive localization of the position of each change area. To minimize\ninformation loss during feature extraction, learnable optimal pooling (LOP) is\nproposed to replace the widely used max-pooling. Additionally, this process is\ntrainable and contributes to the overall optimization of the network. To\neffectively interact features from different branches and accurately locate\nchange areas of various sizes, change alignment attention (C2A) and\nhierarchical change alignment module (HCA) are proposed. In the refinement\nstage, the localization results from the localization stage are corrected by\nconstraining the change areas and change edges through the edge-area alignment\nmodule (E2A). Subsequently, the decoder, combined with the difference features\nstrengthened by C2A in the localization phase, refines change areas of\ndifferent sizes, ultimately achieving accurate boundary discrimination of\nchange areas. The proposed LRNet outperforms 13 other state-of-the-art methods\nin terms of comprehensive evaluation metrics and provides the most precise\nboundary discrimination results on the LEVIR-CD and WHU-CD datasets.\n","authors":["Huan Zhong","Chen Wu","Ziqi Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.04884v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04883v1","updated":"2024-04-07T09:01:50Z","published":"2024-04-07T09:01:50Z","title":"Mixture of Low-rank Experts for Transferable AI-Generated Image\n Detection","summary":" Generative models have shown a giant leap in synthesizing photo-realistic\nimages with minimal expertise, sparking concerns about the authenticity of\nonline information. This study aims to develop a universal AI-generated image\ndetector capable of identifying images from diverse sources. Existing methods\nstruggle to generalize across unseen generative models when provided with\nlimited sample sources. Inspired by the zero-shot transferability of\npre-trained vision-language models, we seek to harness the nontrivial\nvisual-world knowledge and descriptive proficiency of CLIP-ViT to generalize\nover unknown domains. This paper presents a novel parameter-efficient\nfine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's\npotential while preserving knowledge and expanding capacity for transferable\ndetection. We adapt only the MLP layers of deeper ViT blocks via an integration\nof shared and separate LoRAs within an MoE-based structure. Extensive\nexperiments on public benchmarks show that our method achieves superiority over\nstate-of-the-art approaches in cross-generator generalization and robustness to\nperturbations. Remarkably, our best-performing ViT-L/14 variant requires\ntraining only 0.08% of its parameters to surpass the leading baseline by +3.64%\nmAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. 
This\neven outperforms the baseline with just 0.28% of the training data. Our code\nand pre-trained models will be available at\nhttps://github.com/zhliuworks/CLIPMoLE.\n","authors":["Zihan Liu","Hanyi Wang","Yaoyu Kang","Shilin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04880v1","updated":"2024-04-07T08:51:31Z","published":"2024-04-07T08:51:31Z","title":"GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric\n Reconstruction Using Gaussian Splatting and NeRF","summary":" We introduce a novel large-scale scene reconstruction benchmark that utilizes\nnewly developed 3D representation approaches: Gaussian Splatting and Neural\nRadiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2\nencompasses over 6.5 square kilometers and features a comprehensive RGB dataset\ncoupled with LiDAR ground truth. This dataset offers a unique blend of urban\nand academic environments for advanced spatial analysis, covering more than 6.5\nkm2. We also provide detailed supplementary information on data collection\nprotocols. Furthermore, we present an easy-to-follow pipeline to align the\nCOLMAP sparse point cloud with the detailed LiDAR dataset. Our evaluation of\nU-Scene, which includes a detailed analysis across various novel viewpoints\nusing image-based metrics such as SSIM, LPIPS, and PSNR, shows contradictory\nresults when applying geometric-based metrics, such as Chamfer distance. This\nleads to doubts about the reliability of current image-based measurement\nmatrices and geometric extraction methods on Gaussian Splatting. We also make\nthe dataset available on the following anonymous project page\n","authors":["Butian Xiong","Nanjun Zheng","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.04880v1.pdf","comment":"8 pages(No reference) 6 figures 4 tabs"},{"id":"http://arxiv.org/abs/2404.04878v1","updated":"2024-04-07T08:48:01Z","published":"2024-04-07T08:48:01Z","title":"CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale\n Volumetric Super-Resolution of Medical Data","summary":" In the realm of medical 3D data, such as CT and MRI images, prevalent\nanisotropic resolution is characterized by high intra-slice but diminished\ninter-slice resolution. The lowered resolution between adjacent slices poses\nchallenges, hindering optimal viewing experiences and impeding the development\nof robust downstream analysis algorithms. Various volumetric super-resolution\nalgorithms aim to surmount these challenges, enhancing inter-slice resolution\nand overall 3D medical imaging quality. However, existing approaches confront\ninherent challenges: 1) often tailored to specific upsampling factors, lacking\nflexibility for diverse clinical scenarios; 2) newly generated slices\nfrequently suffer from over-smoothing, degrading fine details, and leading to\ninter-slice inconsistency. In response, this study presents CycleINR, a novel\nenhanced Implicit Neural Representation model for 3D medical data volumetric\nsuper-resolution. Leveraging the continuity of the learned implicit function,\nthe CycleINR model can achieve results with arbitrary up-sampling rates,\neliminating the need for separate training. Additionally, we enhance the grid\nsampling in CycleINR with a local attention mechanism and mitigate\nover-smoothing by integrating cycle-consistent loss. We introduce a new metric,\nSlice-wise Noise Level Inconsistency (SNLI), to quantitatively assess\ninter-slice noise level inconsistency. 
The effectiveness of our approach is\ndemonstrated through image quality evaluations on an in-house dataset and a\ndownstream task analysis on the Medical Segmentation Decathlon liver tumor\ndataset.\n","authors":["Wei Fang","Yuxing Tang","Heng Guo","Mingze Yuan","Tony C. W. Mok","Ke Yan","Jiawen Yao","Xin Chen","Zaiyi Liu","Le Lu","Ling Zhang","Minfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04878v1.pdf","comment":"CVPR accepted paper"},{"id":"http://arxiv.org/abs/2404.04876v1","updated":"2024-04-07T08:46:06Z","published":"2024-04-07T08:46:06Z","title":"HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and\n Low-Frequency Information of Parametric Models","summary":" Reconstructing 3D clothed human involves creating a detailed geometry of\nindividuals in clothing, with applications ranging from virtual try-on, movies,\nto games. To enable practical and widespread applications, recent advances\npropose to generate a clothed human from an RGB image. However, they struggle\nto reconstruct detailed and robust avatars simultaneously. We empirically find\nthat the high-frequency (HF) and low-frequency (LF) information from a\nparametric model has the potential to enhance geometry details and improve\nrobustness to noise, respectively. Based on this, we propose HiLo, namely\nclothed human reconstruction with high- and low-frequency information, which\ncontains two components. 1) To recover detailed geometry using HF information,\nwe propose a progressive HF Signed Distance Function to enhance the detailed 3D\ngeometry of a clothed human. We analyze that our progressive learning manner\nalleviates large gradients that hinder model convergence. 2) To achieve robust\nreconstruction against inaccurate estimation of the parametric model by using\nLF information, we propose a spatial interaction implicit function. This\nfunction effectively exploits the complementary spatial information from a\nlow-resolution voxel grid of the parametric model. Experimental results\ndemonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and\n9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets,\nrespectively. Additionally, HiLo demonstrates robustness to noise from the\nparametric model, challenging poses, and various clothing styles.\n","authors":["Yifan Yang","Dong Liu","Shuhai Zhang","Zeshuai Deng","Zixiong Huang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04876v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.04875v1","updated":"2024-04-07T08:42:38Z","published":"2024-04-07T08:42:38Z","title":"NeRF2Points: Large-Scale Point Cloud Generation From Street Views'\n Radiance Field Optimization","summary":" Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology\nfor the photorealistic rendering of objects and environments, enabling the\nsynthesis of novel viewpoints with remarkable fidelity. This is accomplished\nthrough the strategic utilization of object-centric camera poses characterized\nby significant inter-frame overlap. This paper explores a compelling,\nalternative utility of NeRF: the derivation of point clouds from aggregated\nurban landscape imagery. The transmutation of street-view data into point\nclouds is fraught with complexities, attributable to a nexus of interdependent\nvariables. First, high-quality point cloud generation hinges on precise camera\nposes, yet many datasets suffer from inaccuracies in pose metadata. 
Also, the\nstandard approach of NeRF is ill-suited for the distinct characteristics of\nstreet-view data from autonomous vehicles in vast, open settings. Autonomous\nvehicle cameras often record with limited overlap, leading to blurring,\nartifacts, and compromised pavement representation in NeRF-based point clouds.\nIn this paper, we present NeRF2Points, a tailored NeRF variant for urban point\ncloud synthesis, notable for its high-quality output from RGB inputs alone. Our\npaper is supported by a bespoke, high-resolution 20-kilometer urban street\ndataset, designed for point cloud generation and evaluation. NeRF2Points\nadeptly navigates the inherent challenges of NeRF-based point cloud synthesis\nthrough the implementation of the following strategic innovations: (1)\nIntegration of Weighted Iterative Geometric Optimization (WIGO) and Structure\nfrom Motion (SfM) for enhanced camera pose accuracy, elevating street-view data\nprecision. (2) Layered Perception and Integrated Modeling (LPiM) is designed\nfor distinct radiance field modeling in urban environments, resulting in\ncoherent point cloud representations.\n","authors":["Peng Tu","Xun Zhou","Mingming Wang","Xiaojun Yang","Bo Peng","Ping Chen","Xiu Su","Yawen Huang","Yefeng Zheng","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04875v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2404.04871v1","updated":"2024-04-07T08:32:16Z","published":"2024-04-07T08:32:16Z","title":"Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels","summary":" In the realm of continual learning, the presence of noisy labels within data\nstreams represents a notable obstacle to model reliability and fairness. We\nfocus on the data stream scenario outlined in pertinent literature,\ncharacterized by fuzzy task boundaries and noisy labels. To address this\nchallenge, we introduce a novel and intuitive sampling method called Noisy Test\nDebiasing (NTD) to mitigate noisy labels in evolving data streams and establish\na fair and robust continual learning algorithm. NTD is straightforward to\nimplement, making it feasible across various scenarios. Our experiments\nbenchmark four datasets, including two synthetic noise datasets (CIFAR10 and\nCIFAR100) and real-world noise datasets (mini-WebVision and Food-101N). The\nresults validate the efficacy of NTD for online continual learning in scenarios\nwith noisy labels in data streams. Compared to the previous leading approach,\nNTD achieves a training speedup enhancement over two times while maintaining or\nsurpassing accuracy levels. Moreover, NTD utilizes less than one-fifth of the\nGPU memory resources compared to previous leading methods.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04865v1","updated":"2024-04-07T08:17:48Z","published":"2024-04-07T08:17:48Z","title":"On the Learnability of Out-of-distribution Detection","summary":" Supervised learning aims to train a classifier under the assumption that\ntraining and test data are from the same distribution. To ease the above\nassumption, researchers have studied a more realistic setting:\nout-of-distribution (OOD) detection, where test data may come from classes that\nare unknown during training (i.e., OOD data). Due to the unavailability and\ndiversity of OOD data, good generalization ability is crucial for effective OOD\ndetection algorithms, and corresponding learning theory is still an open\nproblem. 
To study the generalization of OOD detection, this paper investigates\nthe probably approximately correct (PAC) learning theory of OOD detection that\nfits the commonly used evaluation metrics in the literature. First, we find a\nnecessary condition for the learnability of OOD detection. Then, using this\ncondition, we prove several impossibility theorems for the learnability of OOD\ndetection under some scenarios. Although the impossibility theorems are\nfrustrating, we find that some conditions of these impossibility theorems may\nnot hold in some practical scenarios. Based on this observation, we next give\nseveral necessary and sufficient conditions to characterize the learnability of\nOOD detection in some practical scenarios. Lastly, we offer theoretical support\nfor representative OOD detection works based on our OOD theory.\n","authors":["Zhen Fang","Yixuan Li","Feng Liu","Bo Han","Jie Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04865v1.pdf","comment":"Accepted by JMLR in 7th of April, 2024. This is a journal extension\n of the previous NeurIPS 2022 Outstanding Paper \"Is Out-of-distribution\n Detection Learnable?\" [arXiv:2210.14707]"},{"id":"http://arxiv.org/abs/2308.06791v5","updated":"2024-04-07T08:13:38Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection\n Features and Variable Receptive Field Voxel Features","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data is a\nformidable challenge. To address this problem, a typical class of approaches\ntransforms the point cloud cast into a regular data representation (voxels or\nprojection maps). Then, it performs feature extraction with convolutional\nneural networks. However, such methods often result in a certain degree of\ninformation loss due to down-sampling or over-compression of feature\ninformation. This paper proposes a multi-modal point cloud feature fusion\nmethod for projection features and variable receptive field voxel features\n(PV-SSD) based on projection and variable voxelization to solve the information\nloss problem. We design a two-branch feature extraction structure with a 2D\nconvolutional neural network to extract the point cloud's projection features\nin bird's-eye view to focus on the correlation between local features. A voxel\nfeature extraction branch is used to extract local fine-grained features.\nMeanwhile, we propose a voxel feature extraction method with variable sensory\nfields to reduce the information loss of voxel branches due to downsampling. It\navoids missing critical point information by selecting more useful feature\npoints based on feature point weights for the detection task. In addition, we\npropose a multi-modal feature fusion module for point clouds. To validate the\neffectiveness of our method, we tested it on the KITTI dataset and ONCE\ndataset.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan","Peng Liao"],"pdf_url":"https://arxiv.org/pdf/2308.06791v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04860v1","updated":"2024-04-07T08:07:14Z","published":"2024-04-07T08:07:14Z","title":"ByteEdit: Boost, Comply and Accelerate Generative Image Editing","summary":" Recent advancements in diffusion-based generative image editing have sparked\na profound revolution, reshaping the landscape of image outpainting and\ninpainting tasks. 
Despite these strides, the field grapples with inherent\nchallenges, including: i) inferior quality; ii) poor consistency; iii)\ninsufficient instrcution adherence; iv) suboptimal generation efficiency. To\naddress these obstacles, we present ByteEdit, an innovative feedback learning\nframework meticulously designed to Boost, Comply, and Accelerate Generative\nImage Editing tasks. ByteEdit seamlessly integrates image reward models\ndedicated to enhancing aesthetics and image-text alignment, while also\nintroducing a dense, pixel-level reward model tailored to foster coherence in\nthe output. Furthermore, we propose a pioneering adversarial and progressive\nfeedback learning strategy to expedite the model's inference speed. Through\nextensive large-scale user evaluations, we demonstrate that ByteEdit surpasses\nleading generative image editing products, including Adobe, Canva, and MeiTu,\nin both generation quality and consistency. ByteEdit-Outpainting exhibits a\nremarkable enhancement of 388% and 135% in quality and consistency,\nrespectively, when compared to the baseline model. Experiments also verfied\nthat our acceleration models maintains excellent performance results in terms\nof quality and consistency.\n","authors":["Yuxi Ren","Jie Wu","Yanzuo Lu","Huafeng Kuang","Xin Xia","Xionghui Wang","Qianqian Wang","Yixing Zhu","Pan Xie","Shiyin Wang","Xuefeng Xiao","Yitong Wang","Min Zheng","Lean Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04856v1","updated":"2024-04-07T08:03:42Z","published":"2024-04-07T08:03:42Z","title":"Msmsfnet: a multi-stream and multi-scale fusion net for edge detection","summary":" Edge detection is a long standing problem in computer vision. Recent deep\nlearning based algorithms achieve state of-the-art performance in publicly\navailable datasets. Despite the efficiency of these algorithms, their\nperformance, however, relies heavily on the pretrained weights of the backbone\nnetwork on the ImageNet dataset. This limits heavily the design space of deep\nlearning based edge detectors. Whenever we want to devise a new model, we have\nto train this new model on the ImageNet dataset first, and then fine tune the\nmodel using the edge detection datasets. The comparison would be unfair\notherwise. However, it is usually not feasible for many researchers to train a\nmodel on the ImageNet dataset due to the limited computation resources. In this\nwork, we study the performance that can be achieved by state-of-the-art deep\nlearning based edge detectors in publicly available datasets when they are\ntrained from scratch, and devise a new network architecture, the multi-stream\nand multi scale fusion net (msmsfnet), for edge detection. We show in our\nexperiments that by training all models from scratch to ensure the fairness of\ncomparison, out model outperforms state-of-the art deep learning based edge\ndetectors in three publicly available datasets.\n","authors":["Chenguang Liu","Chisheng Wang","Feifei Dong","Xin Su","Chuanhua Zhu","Dejin Zhang","Qingquan Li"],"pdf_url":"https://arxiv.org/pdf/2404.04856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00292v2","updated":"2024-04-07T07:55:51Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. 
Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v2.pdf","comment":"Accepted by CVPR 2024, Fig.3 revised"},{"id":"http://arxiv.org/abs/2306.08498v2","updated":"2024-04-07T07:50:37Z","published":"2023-06-14T13:27:28Z","title":"Extending CLIP's Image-Text Alignment to Referring Image Segmentation","summary":" Referring Image Segmentation (RIS) is a cross-modal task that aims to segment\nan instance described by a natural language expression. Recent methods leverage\nlarge-scale pretrained unimodal models as backbones along with fusion\ntechniques for joint reasoning across modalities. However, the inherent\ncross-modal nature of RIS raises questions about the effectiveness of unimodal\nbackbones. We propose RISCLIP, a novel framework that effectively leverages the\ncross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between\nimage and text features, we capitalize on this starting point and introduce\nsimple but strong modules that enhance unimodal feature extraction and leverage\nrich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP\nexhibits outstanding results on all three major RIS benchmarks and also\noutperforms previous CLIP-based methods, demonstrating the efficacy of our\nstrategy in extending CLIP's image-text alignment to RIS.\n","authors":["Seoyeon Kim","Minguk Kang","Dongwon Kim","Jaesik Park","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2306.08498v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.04848v1","updated":"2024-04-07T07:42:04Z","published":"2024-04-07T07:42:04Z","title":"Task-Aware Encoder Control for Deep Video Compression","summary":" Prior research on deep video compression (DVC) for machine tasks typically\nnecessitates training a unique codec for each specific task, mandating a\ndedicated decoder per task. In contrast, traditional video codecs employ a\nflexible encoder controller, enabling the adaptation of a single codec to\ndifferent tasks through mechanisms like mode prediction. Drawing inspiration\nfrom this, we introduce an innovative encoder controller for deep video\ncompression for machines. 
This controller features a mode prediction and a\nGroup of Pictures (GoP) selection module. Our approach centralizes control at\nthe encoding stage, allowing for adaptable encoder adjustments across different\ntasks, such as detection and tracking, while maintaining compatibility with a\nstandard pre-trained DVC decoder. Empirical evidence demonstrates that our\nmethod is applicable across multiple tasks with various existing pre-trained\nDVCs. Moreover, extensive experiments demonstrate that our method outperforms\nprevious DVC by about 25% bitrate for different tasks, with only one\npre-trained decoder.\n","authors":["Xingtong Ge","Jixiang Luo","Xinjie Zhang","Tongda Xu","Guo Lu","Dailan He","Jing Geng","Yan Wang","Jun Zhang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2404.04848v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12434v3","updated":"2024-04-07T07:37:59Z","published":"2024-03-19T04:47:56Z","title":"Human Mesh Recovery from Arbitrary Multi-view Images","summary":" Human mesh recovery from arbitrary multi-view images involves two\ncharacteristics: the arbitrary camera poses and arbitrary number of camera\nviews. Because of the variability, designing a unified framework to tackle this\ntask is challenging. The challenges can be summarized as the dilemma of being\nable to simultaneously estimate arbitrary camera poses and recover human mesh\nfrom arbitrary multi-view images while maintaining flexibility. To solve this\ndilemma, we propose a divide and conquer framework for Unified Human Mesh\nRecovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR\nconsists of a decoupled structure and two main components: camera and body\ndecoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion\n(AVF). As camera poses and human body mesh are independent of each other, CBD\nsplits the estimation of them into two sub-tasks for two individual\nsub-networks (ie, CPE and AVF) to handle respectively, thus the two sub-tasks\nare disentangled. In CPE, since each camera pose is unrelated to the others, we\nadopt a shared MLP to process all views in a parallel way. In AVF, in order to\nfuse multi-view information and make the fusion operation independent of the\nnumber of views, we introduce a transformer decoder with a SMPL parameters\nquery token to extract cross-view features for mesh recovery. To demonstrate\nthe efficacy and flexibility of the proposed framework and effect of each\ncomponent, we conduct extensive experiments on three public datasets:\nHuman3.6M, MPI-INF-3DHP, and TotalCapture.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12434v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11863v2","updated":"2024-04-07T07:37:15Z","published":"2023-11-20T15:59:41Z","title":"GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene\n Understanding","summary":" Applying NeRF to downstream perception tasks for scene understanding and\nrepresentation is becoming increasingly popular. Most existing methods treat\nsemantic prediction as an additional rendering task, \\textit{i.e.}, the \"label\nrendering\" task, to build semantic NeRFs. However, by rendering\nsemantic/instance labels per pixel without considering the contextual\ninformation of the rendered image, these methods usually suffer from unclear\nboundary segmentation and abnormal segmentation of pixels within an object. 
To\nsolve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel\npipeline that makes the widely used segmentation model and NeRF work compatibly\nunder a unified framework, for facilitating context-aware 3D scene perception.\nTo accomplish this goal, we introduce transformers to aggregate radiance as\nwell as semantic embedding fields jointly for novel views and facilitate the\njoint volumetric rendering of both fields. In addition, we propose two\nself-distillation mechanisms, i.e., the Semantic Distill Loss and the\nDepth-Guided Semantic Distill Loss, to enhance the discrimination and quality\nof the semantic field and the maintenance of geometric consistency. In\nevaluation, we conduct experimental comparisons under two perception tasks\n(\\textit{i.e.} semantic and instance segmentation) using both synthetic and\nreal-world datasets. Notably, our method outperforms SOTA approaches by 6.94\\%,\n11.76\\%, and 8.47\\% on generalized semantic segmentation, finetuning semantic\nsegmentation, and instance segmentation, respectively.\n","authors":["Hao Li","Dingwen Zhang","Yalun Dai","Nian Liu","Lechao Cheng","Jingfeng Li","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.11863v2.pdf","comment":"CVPR 2024 (Highlight). Project Page:\n https://lifuguan.github.io/gpnerf-pages/"},{"id":"http://arxiv.org/abs/2404.03654v2","updated":"2024-04-07T07:20:31Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to\nsingle task. 
Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v2.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE"},{"id":"http://arxiv.org/abs/2305.03238v4","updated":"2024-04-07T07:07:49Z","published":"2023-05-05T01:40:00Z","title":"Reduction of Class Activation Uncertainty with Background Information","summary":" Multitask learning is a popular approach to training high-performing neural\nnetworks with improved generalization. In this paper, we propose a background\nclass to achieve improved generalization at a lower computation compared to\nmultitask learning to help researchers and organizations with limited\ncomputation power. We also present a methodology for selecting background\nimages and discuss potential future improvements. We apply our approach to\nseveral datasets and achieve improved generalization with much lower\ncomputation. Through the class activation mappings (CAMs) of the trained\nmodels, we observed the tendency towards looking at a bigger picture with the\nproposed model training methodology. Applying the vision transformer with the\nproposed background class, we receive state-of-the-art (SOTA) performance on\nSTL-10, Caltech-101, and CINIC-10 datasets. Example scripts are available in\nthe 'CAM' folder of the following GitHub Repository: github.com/dipuk0506/UQ\n","authors":["H M Dipu Kabir"],"pdf_url":"https://arxiv.org/pdf/2305.03238v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04833v1","updated":"2024-04-07T06:56:51Z","published":"2024-04-07T06:56:51Z","title":"ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion\n Model","summary":" With the development of the large-scale diffusion model, Artificial\nIntelligence Generated Content (AIGC) techniques are popular recently. However,\nhow to truly make it serve our daily lives remains an open question. To this\nend, in this paper, we focus on employing AIGC techniques in one filed of\nE-commerce marketing, i.e., generating hyper-realistic advertising images for\ndisplaying user-specified shoes by human. Specifically, we propose a\nshoe-wearing system, called Shoe-Model, to generate plausible images of human\nlegs interacting with the given shoes. It consists of three modules: (1) shoe\nwearable-area detection module (WD), (2) leg-pose synthesis module (LpS) and\nthe final (3) shoe-wearing image generation module (SW). Them three are\nperformed in ordered stages. Compared to baselines, our ShoeModel is shown to\ngeneralize better to different type of shoes and has ability of keeping the\nID-consistency of the given shoes, as well as automatically producing\nreasonable interactions with human. Extensive experiments show the\neffectiveness of our proposed shoe-wearing system. Figure 1 shows the input and\noutput examples of our ShoeModel.\n","authors":["Binghui Chen","Wenyu Li","Yifeng Geng","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04833v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.10891v2","updated":"2024-04-07T06:52:21Z","published":"2024-01-19T18:59:52Z","title":"Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data","summary":" This work presents Depth Anything, a highly practical solution for robust\nmonocular depth estimation. Without pursuing novel technical modules, we aim to\nbuild a simple yet powerful foundation model dealing with any images under any\ncircumstances. 
To this end, we scale up the dataset by designing a data engine\nto collect and automatically annotate large-scale unlabeled data (~62M), which\nsignificantly enlarges the data coverage and thus is able to reduce the\ngeneralization error. We investigate two simple yet effective strategies that\nmake data scaling-up promising. First, a more challenging optimization target\nis created by leveraging data augmentation tools. It compels the model to\nactively seek extra visual knowledge and acquire robust representations.\nSecond, an auxiliary supervision is developed to enforce the model to inherit\nrich semantic priors from pre-trained encoders. We evaluate its zero-shot\ncapabilities extensively, including six public datasets and randomly captured\nphotos. It demonstrates impressive generalization ability. Further, through\nfine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs\nare set. Our better depth model also results in a better depth-conditioned\nControlNet. Our models are released at\nhttps://github.com/LiheYoung/Depth-Anything.\n","authors":["Lihe Yang","Bingyi Kang","Zilong Huang","Xiaogang Xu","Jiashi Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10891v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://depth-anything.github.io"},{"id":"http://arxiv.org/abs/2212.12857v2","updated":"2024-04-07T06:34:37Z","published":"2022-12-25T05:24:08Z","title":"StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language\n Recognition","summary":" The goal of sign language recognition (SLR) is to help those who are hard of\nhearing or deaf overcome the communication barrier. Most existing approaches\ncan be typically divided into two lines, i.e., Skeleton-based and RGB-based\nmethods, but both the two lines of methods have their limitations.\nSkeleton-based methods do not consider facial expressions, while RGB-based\napproaches usually ignore the fine-grained hand structure. To overcome both\nlimitations, we propose a new framework called Spatial-temporal Part-aware\nnetwork~(StepNet), based on RGB parts. As its name suggests, it is made up of\ntwo modules: Part-level Spatial Modeling and Part-level Temporal Modeling.\nPart-level Spatial Modeling, in particular, automatically captures the\nappearance-based properties, such as hands and faces, in the feature space\nwithout the use of any keypoint-level annotations. On the other hand,\nPart-level Temporal Modeling implicitly mines the long-short term context to\ncapture the relevant attributes over time. Extensive experiments demonstrate\nthat our StepNet, thanks to spatial-temporal modules, achieves competitive\nTop-1 Per-instance accuracy on three commonly-used SLR benchmarks, i.e., 56.89%\non WLASL, 77.2% on NMFs-CSL, and 77.1% on BOBSL. Additionally, the proposed\nmethod is compatible with the optical flow input and can produce superior\nperformance if fused. For those who are hard of hearing, we hope that our work\ncan act as a preliminary step.\n","authors":["Xiaolong Shen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2212.12857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01134v2","updated":"2024-04-07T06:30:39Z","published":"2024-02-02T04:17:02Z","title":"DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping","summary":" Automated Aerial Triangulation (AAT), aiming to restore image pose and\nreconstruct sparse points simultaneously, plays a pivotal role in earth\nobservation. 
With its rich research heritage spanning several decades in\nphotogrammetry, AAT has evolved into a fundamental process widely applied in\nlarge-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its\nadvancements, classic AAT methods still face challenges like low efficiency and\nlimited robustness. This paper introduces DeepAAT, a deep learning network\ndesigned specifically for AAT of UAV imagery. DeepAAT considers both spatial\nand spectral characteristics of imagery, enhancing its capability to resolve\nerroneous matching pairs and accurately predict image poses. DeepAAT marks a\nsignificant leap in AAT's efficiency, ensuring thorough scene coverage and\nprecision. Its processing speed outpaces incremental AAT methods by hundreds of\ntimes and global AAT methods by tens of times while maintaining a comparable\nlevel of reconstruction accuracy. Additionally, DeepAAT's scene clustering and\nmerging strategy facilitate rapid localization and pose determination for\nlarge-scale UAV images, even under constrained computing resources. The\nexperimental results demonstrate DeepAAT's substantial improvements over\nconventional AAT methods, highlighting its potential in the efficiency and\naccuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry\nsociety, the code of DeepAAT will be released at:\nhttps://github.com/WHU-USI3DV/DeepAAT.\n","authors":["Zequan Chen","Jianping Li","Qusheng Li","Bisheng Yang","Zhen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.01134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04828v1","updated":"2024-04-07T06:28:53Z","published":"2024-04-07T06:28:53Z","title":"Strictly-ID-Preserved and Controllable Accessory Advertising Image\n Generation","summary":" Customized generative text-to-image models have the ability to produce images\nthat closely resemble a given subject. However, in the context of generating\nadvertising images for e-commerce scenarios, it is crucial that the generated\nsubject's identity aligns perfectly with the product being advertised. In order\nto address the need for strictly-ID preserved advertising image generation, we\nhave developed a Control-Net based customized image generation pipeline and\nhave taken earring model advertising as an example. Our approach facilitates a\nseamless interaction between the earrings and the model's face, while ensuring\nthat the identity of the earrings remains intact. Furthermore, to achieve a\ndiverse and controllable display, we have proposed a multi-branch\ncross-attention architecture, which allows for control over the scale, pose,\nand appearance of the model, going beyond the limitations of text prompts. Our\nmethod manages to achieve fine-grained control of the generated model's face,\nresulting in controllable and captivating advertising effects.\n","authors":["Youze Xue","Binghui Chen","Yifeng Geng","Xuansong Xie","Jiansheng Chen","Hongbing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04828v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2310.08370v2","updated":"2024-04-07T06:21:21Z","published":"2023-10-12T14:39:58Z","title":"UniPAD: A Universal Pre-training Paradigm for Autonomous Driving","summary":" In the context of autonomous driving, the significance of effective feature\nlearning is widely acknowledged. While conventional 3D self-supervised\npre-training methods have shown widespread success, most methods follow the\nideas originally designed for 2D images. 
In this paper, we present UniPAD, a\nnovel self-supervised learning paradigm applying 3D volumetric differentiable\nrendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction\nof continuous 3D shape structures and the intricate appearance characteristics\nof their 2D projections. The flexibility of our method enables seamless\nintegration into both 2D and 3D frameworks, enabling a more holistic\ncomprehension of the scenes. We manifest the feasibility and effectiveness of\nUniPAD by conducting extensive experiments on various downstream 3D tasks. Our\nmethod significantly improves lidar-, camera-, and lidar-camera-based baseline\nby 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline\nachieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic\nsegmentation on the nuScenes validation set, achieving state-of-the-art results\nin comparison with previous methods. The code will be available at\nhttps://github.com/Nightmare-n/UniPAD.\n","authors":["Honghui Yang","Sha Zhang","Di Huang","Xiaoyang Wu","Haoyi Zhu","Tong He","Shixiang Tang","Hengshuang Zhao","Qibo Qiu","Binbin Lin","Xiaofei He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08370v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.04823v1","updated":"2024-04-07T06:17:10Z","published":"2024-04-07T06:17:10Z","title":"3D Building Reconstruction from Monocular Remote Sensing Images with\n Multi-level Supervisions","summary":" 3D building reconstruction from monocular remote sensing images is an\nimportant and challenging research problem that has received increasing\nattention in recent years, owing to its low cost of data acquisition and\navailability for large-scale applications. However, existing methods rely on\nexpensive 3D-annotated samples for fully-supervised training, restricting their\napplication to large-scale cross-city scenarios. In this work, we propose\nMLS-BRN, a multi-level supervised building reconstruction network that can\nflexibly utilize training samples with different annotation levels to achieve\nbetter reconstruction results in an end-to-end manner. To alleviate the demand\non full 3D supervision, we design two new modules, Pseudo Building Bbox\nCalculator and Roof-Offset guided Footprint Extractor, as well as new tasks and\ntraining strategies for different types of samples. Experimental results on\nseveral public and new datasets demonstrate that our proposed MLS-BRN achieves\ncompetitive performance using much fewer 3D-annotated samples, and\nsignificantly improves the footprint extraction and 3D reconstruction\nperformance compared with current state-of-the-art. The code and datasets of\nthis work will be released at https://github.com/opendatalab/MLS-BRN.git.\n","authors":["Weijia Li","Haote Yang","Zhenghao Hu","Juepeng Zheng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.04823v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01133v2","updated":"2024-04-07T06:17:07Z","published":"2024-04-01T14:24:40Z","title":"CityGaussian: Real-time High-quality Large-Scale Scene Rendering with\n Gaussians","summary":" The advancement of real-time 3D scene reconstruction and novel view synthesis\nhas been significantly propelled by 3D Gaussian Splatting (3DGS). However,\neffectively training large-scale 3DGS and rendering it in real-time across\nvarious scales remains challenging. 
This paper introduces CityGaussian\n(CityGS), which employs a novel divide-and-conquer training approach and\nLevel-of-Detail (LoD) strategy for efficient large-scale 3DGS training and\nrendering. Specifically, the global scene prior and adaptive training data\nselection enables efficient training and seamless fusion. Based on fused\nGaussian primitives, we generate different detail levels through compression,\nand realize fast rendering across various scales through the proposed\nblock-wise detail levels selection and aggregation strategy. Extensive\nexperimental results on large-scale scenes demonstrate that our approach\nattains state-of-theart rendering quality, enabling consistent real-time\nrendering of largescale scenes across vastly different scales. Our project page\nis available at https://dekuliutesla.github.io/citygs/.\n","authors":["Yang Liu","He Guan","Chuanchen Luo","Lue Fan","Junran Peng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01133v2.pdf","comment":"Project Page: https://dekuliutesla.github.io/citygs/"},{"id":"http://arxiv.org/abs/2404.04819v1","updated":"2024-04-07T06:01:49Z","published":"2024-04-07T06:01:49Z","title":"Joint Reconstruction of 3D Human and Object via Contact-Based Refinement\n Transformer","summary":" Human-object contact serves as a strong cue to understand how humans\nphysically interact with objects. Nevertheless, it is not widely explored to\nutilize human-object contact information for the joint reconstruction of 3D\nhuman and object from a single image. In this work, we present a novel joint 3D\nhuman-object reconstruction method (CONTHO) that effectively exploits contact\ninformation between humans and objects. There are two core designs in our\nsystem: 1) 3D-guided contact estimation and 2) contact-based 3D human and\nobject refinement. First, for accurate human-object contact estimation, CONTHO\ninitially reconstructs 3D humans and objects and utilizes them as explicit 3D\nguidance for contact estimation. Second, to refine the initial reconstructions\nof 3D human and object, we propose a novel contact-based refinement Transformer\nthat effectively aggregates human features and object features based on the\nestimated human-object contact. The proposed contact-based refinement prevents\nthe learning of erroneous correlation between human and object, which enables\naccurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art\nperformance in both human-object contact estimation and joint reconstruction of\n3D human and object. The code is publicly available at\nhttps://github.com/dqj5182/CONTHO_RELEASE.\n","authors":["Hyeongjin Nam","Daniel Sungho Jung","Gyeongsik Moon","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.04819v1.pdf","comment":"Published at CVPR 2024, 19 pages including the supplementary material"},{"id":"http://arxiv.org/abs/2305.00510v3","updated":"2024-04-07T05:59:05Z","published":"2023-04-30T15:38:36Z","title":"Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and\n Collaborating Virtual Architecture by Deep Learning in the Metaverse","summary":" 3D shape generation techniques leveraging deep learning have garnered\nsignificant interest from both the computer vision and architectural design\ncommunities, promising to enrich the content of the future metaverse. However,\nresearch on virtual architectural design remains limited, particularly\nregarding human-AI collaboration and deep learning-assisted design. 
We first\nilluminate the principles, generation techniques, and current literature of\nvirtual architecture, focusing on challenges such as datasets, multimodality,\ndesign intuition, and generative frameworks. In our survey, we reviewed 187\nrelated articles (80.7\\% of articles published between 2018 and 2022) covering\narchitectural research, virtual environments, and technical approaches. This\nsurvey investigates the latest approaches to 3D object generation with deep\ngenerative models (DGMs) and summarizes four characteristics of deep-learning\ngeneration approaches for virtual architecture. According to our analysis of\nthe survey, we expound on four research agendas, including agency,\ncommunication, user consideration, and integrating tools, and highlight three\nimportant enablers of ubiquitous interaction with immersive systems in deep\nlearning-assisted architectural generation. Our work contributes to fostering\nunderstanding between designers and deep learning techniques, broadening access\nto human-AI collaboration. We advocate for interdisciplinary efforts to address\nthis timely research topic, facilitating content designing and generation in\nthe metaverse.\n","authors":["Anqi Wang","Jiahua Dong","Lik-Hang Lee","Jiachuan Shen","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2305.00510v3.pdf","comment":"37 pages, 9 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. 
The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS"},{"id":"http://arxiv.org/abs/2303.04989v3","updated":"2024-04-07T05:50:18Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial\n Oriented Object Detection","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Yushi Chen","Xue Yang","Qingyun Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v3.pdf","comment":"15 pages, 13 figures, 13 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2404.01959v2","updated":"2024-04-07T05:26:08Z","published":"2024-04-02T13:54:22Z","title":"Bi-LORA: A Vision-Language Approach for Synthetic Image Detection","summary":" Advancements in deep image synthesis techniques, such as generative\nadversarial networks (GANs) and diffusion models (DMs), have ushered in an era\nof generating highly realistic images. While this technological progress has\ncaptured significant interest, it has also raised concerns about the potential\ndifficulty in distinguishing real images from their synthetic counterparts.\nThis paper takes inspiration from the potent convergence capabilities between\nvision and language, coupled with the zero-shot nature of vision-language\nmodels (VLMs). We introduce an innovative method called Bi-LORA that leverages\nVLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance\nthe precision of synthetic image detection for unseen model-generated images.\nThe pivotal conceptual shift in our methodology revolves around reframing\nbinary classification as an image captioning task, leveraging the distinctive\ncapabilities of cutting-edge VLM, notably bootstrapping language image\npre-training (BLIP2). 
Rigorous and comprehensive experiments are conducted to\nvalidate the effectiveness of our proposed approach, particularly in detecting\nunseen diffusion-generated images from unknown diffusion-based generative\nmodels during training, showcasing robustness to noise, and demonstrating\ngeneralization capabilities to GANs. The obtained results showcase an\nimpressive average accuracy of 93.41% in synthetic image detection on unseen\ngeneration models. The code and models associated with this research can be\npublicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hessen Bougueffa Eutamene","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.01959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04808v1","updated":"2024-04-07T04:56:58Z","published":"2024-04-07T04:56:58Z","title":"MemFlow: Optical Flow Estimation and Prediction with Memory","summary":" Optical flow is a classical task that is important to the vision community.\nClassical optical flow estimation uses two frames as input, whilst some recent\nmethods consider multiple frames to explicitly model long-range information.\nThe former ones limit their ability to fully leverage temporal coherence along\nthe video sequence; and the latter ones incur heavy computational overhead,\ntypically not possible for real-time flow estimation. Some multi-frame-based\napproaches even necessitate unseen future frames for current estimation,\ncompromising real-time applicability in safety-critical scenarios. To this end,\nwe present MemFlow, a real-time method for optical flow estimation and\nprediction with memory. Our method enables memory read-out and update modules\nfor aggregating historical motion information in real-time. Furthermore, we\nintegrate resolution-adaptive re-scaling to accommodate diverse video\nresolutions. Besides, our approach seamlessly extends to the future prediction\nof optical flow based on past observations. Leveraging effective historical\nmotion aggregation, our method outperforms VideoFlow with fewer parameters and\nfaster inference speed on Sintel and KITTI-15 datasets in terms of\ngeneralization performance. At the time of submission, MemFlow also leads in\nperformance on the 1080p Spring dataset. Codes and models will be available at:\nhttps://dqiaole.github.io/MemFlow/.\n","authors":["Qiaole Dong","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04808v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04807v1","updated":"2024-04-07T04:55:58Z","published":"2024-04-07T04:55:58Z","title":"D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive\n Segmentation","summary":" We investigated domain adaptive semantic segmentation in foggy weather\nscenarios, which aims to enhance the utilization of unlabeled foggy data and\nimprove the model's adaptability to foggy conditions. Current methods rely on\nclear images as references, jointly learning defogging and segmentation for\nfoggy images. Despite making some progress, there are still two main drawbacks:\n(1) the coupling of segmentation and defogging feature representations,\nresulting in a decrease in semantic representation capability, and (2) the\nfailure to leverage real fog priors in unlabeled foggy data, leading to\ninsufficient model generalization ability. 
To address these issues, we propose\na novel training framework, Decouple Defogging and Semantic learning, called\nD2SL, aiming to alleviate the adverse impact of defogging tasks on the final\nsegmentation task. In this framework, we introduce a domain-consistent transfer\nstrategy to establish a connection between defogging and segmentation tasks.\nFurthermore, we design a real fog transfer strategy to improve defogging\neffects by fully leveraging the fog priors from real foggy images. Our approach\nenhances the semantic representations required for segmentation during the\ndefogging learning process and maximizes the representation capability of fog\ninvariance by effectively utilizing real fog data. Comprehensive experiments\nvalidate the effectiveness of the proposed method.\n","authors":["Xuan Sun","Zhanfu An","Yuyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01843v2","updated":"2024-04-07T04:17:32Z","published":"2024-04-02T11:03:24Z","title":"Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation","summary":" Recently, image-to-3D approaches have achieved significant results with a\nnatural image as input. However, it is not always possible to access these\nenriched color input samples in practical applications, where only sketches are\navailable. Existing sketch-to-3D researches suffer from limitations in broad\napplications due to the challenges of lacking color information and multi-view\ncontent. To overcome them, this paper proposes a novel generation paradigm\nSketch3D to generate realistic 3D assets with shape aligned with the input\nsketch and color matching the textual description. Concretely, Sketch3D first\ninstantiates the given sketch in the reference image through the\nshape-preserving generation process. Second, the reference image is leveraged\nto deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance\nimages are generated based on the renderings of the 3D Gaussians. Finally,\nthree strategies are designed to optimize 3D Gaussians, i.e., structural\noptimization via a distribution transfer mechanism, color optimization with a\nstraightforward MSE loss and sketch similarity optimization with a CLIP-based\ngeometric similarity loss. Extensive visual comparisons and quantitative\nanalysis illustrate the advantage of our Sketch3D in generating realistic 3D\nassets while preserving consistency with the input.\n","authors":["Wangguandong Zheng","Haifeng Xia","Rui Chen","Ming Shao","Siyu Xia","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.01843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04804v1","updated":"2024-04-07T04:10:06Z","published":"2024-04-07T04:10:06Z","title":"Light the Night: A Multi-Condition Diffusion Framework for Unpaired\n Low-Light Enhancement in Autonomous Driving","summary":" Vision-centric perception systems for autonomous driving have gained\nconsiderable attention recently due to their cost-effectiveness and\nscalability, especially compared to LiDAR-based systems. However, these systems\noften struggle in low-light conditions, potentially compromising their\nperformance and safety. To address this, our paper introduces LightDiff, a\ndomain-tailored framework designed to enhance the low-light image quality for\nautonomous driving applications. Specifically, we employ a multi-condition\ncontrolled diffusion model. LightDiff works without any human-collected paired\ndata, leveraging a dynamic data degradation process instead. 
It incorporates a\nnovel multi-condition adapter that adaptively controls the input weights from\ndifferent modalities, including depth maps, RGB images, and text captions, to\neffectively illuminate dark scenes while maintaining context consistency.\nFurthermore, to align the enhanced images with the detection model's knowledge,\nLightDiff employs perception-specific scores as rewards to guide the diffusion\ntraining process through reinforcement learning. Extensive experiments on the\nnuScenes datasets demonstrate that LightDiff can significantly improve the\nperformance of several state-of-the-art 3D detectors in night-time conditions\nwhile achieving high visual quality scores, highlighting its potential to\nsafeguard autonomous driving.\n","authors":["Jinlong Li","Baolu Li","Zhengzhong Tu","Xinyu Liu","Qing Guo","Felix Juefei-Xu","Runsheng Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04804v1.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2306.02416v3","updated":"2024-04-07T03:53:33Z","published":"2023-06-04T17:39:08Z","title":"Training Like a Medical Resident: Context-Prior Learning Toward\n Universal Medical Image Segmentation","summary":" A major focus of clinical imaging workflow is disease diagnosis and\nmanagement, leading to medical imaging datasets strongly tied to specific\nclinical objectives. This scenario has led to the prevailing practice of\ndeveloping task-specific segmentation models, without gaining insights from\nwidespread imaging cohorts. Inspired by the training program of medical\nradiology residents, we propose a shift towards universal medical image\nsegmentation, a paradigm aiming to build medical image understanding foundation\nmodels by leveraging the diversity and commonality across clinical targets,\nbody regions, and imaging modalities. Towards this goal, we develop Hermes, a\nnovel context-prior learning approach to address the challenges of data\nheterogeneity and annotation differences in medical image segmentation. In a\nlarge collection of eleven diverse datasets (2,438 3D images) across five\nmodalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we\ndemonstrate the merit of the universal paradigm over the traditional paradigm\non addressing multiple tasks within a single model. By exploiting the synergy\nacross tasks, Hermes achieves state-of-the-art performance on all testing\ndatasets and shows superior model scalability. Results on two additional\ndatasets reveals Hermes' strong performance for transfer learning, incremental\nlearning, and generalization to downstream tasks. Hermes's learned priors\ndemonstrate an appealing trait to reflect the intricate relations among tasks\nand modalities, which aligns with the established anatomical and imaging\nprinciples in radiology. The code is available:\nhttps://github.com/yhygao/universal-medical-image-segmentation.\n","authors":["Yunhe Gao","Zhuowei Li","Di Liu","Mu Zhou","Shaoting Zhang","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2306.02416v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.08129v3","updated":"2024-04-07T03:53:29Z","published":"2023-10-12T08:36:25Z","title":"Tailored Visions: Enhancing Text-to-Image Generation with Personalized\n Prompt Rewriting","summary":" Despite significant progress in the field, it is still challenging to create\npersonalized visual representations that align closely with the desires and\npreferences of individual users. 
This process requires users to articulate\ntheir ideas in words that are both comprehensible to the models and accurately\ncapture their vision, posing difficulties for many users. In this paper, we\ntackle this challenge by leveraging historical user interactions with the\nsystem to enhance user prompts. We propose a novel approach that involves\nrewriting user prompts based on a newly collected large-scale text-to-image\ndataset with over 300k prompts from 3115 users. Our rewriting model enhances\nthe expressiveness and alignment of user prompts with their intended visual\noutputs. Experimental results demonstrate the superiority of our methods over\nbaseline approaches, as evidenced in our new offline evaluation method and\nonline tests. Our code and dataset are available at\nhttps://github.com/zzjchen/Tailored-Visions.\n","authors":["Zijie Chen","Lichao Zhang","Fangsheng Weng","Lili Pan","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2310.08129v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19098v2","updated":"2024-04-07T03:49:39Z","published":"2024-03-28T02:22:28Z","title":"GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving","summary":" Modeling complicated interactions among the ego-vehicle, road agents, and map\nelements has been a crucial part for safety-critical autonomous driving.\nPrevious works on end-to-end autonomous driving rely on the attention mechanism\nfor handling heterogeneous interactions, which fails to capture the geometric\npriors and is also computationally intensive. In this paper, we propose the\nInteraction Scene Graph (ISG) as a unified method to model the interactions\namong the ego-vehicle, road agents, and map elements. With the representation\nof the ISG, the driving agents aggregate essential information from the most\ninfluential elements, including the road agents with potential collisions and\nthe map elements to follow. Since a mass of unnecessary interactions are\nomitted, the more efficient scene-graph-based framework is able to focus on\nindispensable connections and leads to better performance. We evaluate the\nproposed method for end-to-end autonomous driving on the nuScenes dataset.\nCompared with strong baselines, our method significantly outperforms in the\nfull-stack driving tasks, including perception, prediction, and planning. Code\nwill be released at https://github.com/zhangyp15/GraphAD.\n","authors":["Yunpeng Zhang","Deheng Qian","Ding Li","Yifeng Pan","Yong Chen","Zhenbao Liang","Zhiyao Zhang","Shurui Zhang","Hongxu Li","Maolei Fu","Yun Ye","Zhujin Liang","Yi Shan","Dalong Du"],"pdf_url":"https://arxiv.org/pdf/2403.19098v2.pdf","comment":"project page: https://github.com/zhangyp15/GraphAD"},{"id":"http://arxiv.org/abs/2401.01207v2","updated":"2024-04-07T03:44:59Z","published":"2024-01-02T13:28:39Z","title":"Towards a Simultaneous and Granular Identity-Expression Control in\n Personalized Face Generation","summary":" In human-centric content generation, the pre-trained text-to-image models\nstruggle to produce user-wanted portrait images, which retain the identity of\nindividuals while exhibiting diverse expressions. This paper introduces our\nefforts towards personalized face generation. To this end, we propose a novel\nmulti-modal face generation framework, capable of simultaneous\nidentity-expression control and more fine-grained expression synthesis. Our\nexpression control is so sophisticated that it can be specialized by the\nfine-grained emotional vocabulary. 
We devise a novel diffusion model that can\nundertake the task of simultaneously face swapping and reenactment. Due to the\nentanglement of identity and expression, it's nontrivial to separately and\nprecisely control them in one framework, thus has not been explored yet. To\novercome this, we propose several innovative designs in the conditional\ndiffusion model, including balancing identity and expression encoder, improved\nmidpoint sampling, and explicitly background conditioning. Extensive\nexperiments have demonstrated the controllability and scalability of the\nproposed framework, in comparison with state-of-the-art text-to-image, face\nswapping, and face reenactment methods.\n","authors":["Renshuai Liu","Bowen Ma","Wei Zhang","Zhipeng Hu","Changjie Fan","Tangjie Lv","Yu Ding","Xuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.01207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04800v1","updated":"2024-04-07T03:41:45Z","published":"2024-04-07T03:41:45Z","title":"Coordinated Sparse Recovery of Label Noise","summary":" Label noise is a common issue in real-world datasets that inevitably impacts\nthe generalization of models. This study focuses on robust classification tasks\nwhere the label noise is instance-dependent. Estimating the transition matrix\naccurately in this task is challenging, and methods based on sample selection\noften exhibit confirmation bias to varying degrees. Sparse over-parameterized\ntraining (SOP) has been theoretically effective in estimating and recovering\nlabel noise, offering a novel solution for noise-label learning. However, this\nstudy empirically observes and verifies a technical flaw of SOP: the lack of\ncoordination between model predictions and noise recovery leads to increased\ngeneralization error. To address this, we propose a method called Coordinated\nSparse Recovery (CSR). CSR introduces a collaboration matrix and confidence\nweights to coordinate model predictions and noise recovery, reducing error\nleakage. Based on CSR, this study designs a joint sample selection strategy and\nconstructs a comprehensive and powerful learning framework called CSR+. CSR+\nsignificantly reduces confirmation bias, especially for datasets with more\nclasses and a high proportion of instance-specific noise. Experimental results\non simulated and real-world noisy datasets demonstrate that both CSR and CSR+\nachieve outstanding performance compared to methods at the same level.\n","authors":["Yukun Yang","Naihao Wang","Haixin Yang","Ruirui Li"],"pdf_url":"https://arxiv.org/pdf/2404.04800v1.pdf","comment":"Pre-print prior to submission to journal"},{"id":"http://arxiv.org/abs/2404.04799v1","updated":"2024-04-07T03:37:29Z","published":"2024-04-07T03:37:29Z","title":"Few-Shot Object Detection: Research Advances and Challenges","summary":" Object detection as a subfield within computer vision has achieved remarkable\nprogress, which aims to accurately identify and locate a specific object from\nimages or videos. Such methods rely on large-scale labeled training samples for\neach object category to ensure accurate detection, but obtaining extensive\nannotated data is a labor-intensive and expensive process in many real-world\nscenarios. 
To tackle this challenge, researchers have explored few-shot object\ndetection (FSOD) that combines few-shot learning and object detection\ntechniques to rapidly adapt to novel objects with limited annotated samples.\nThis paper presents a comprehensive survey to review the significant\nadvancements in the field of FSOD in recent years and summarize the existing\nchallenges and solutions. Specifically, we first introduce the background and\ndefinition of FSOD to emphasize potential value in advancing the field of\ncomputer vision. We then propose a novel FSOD taxonomy method and survey the\nplentifully remarkable FSOD algorithms based on this fact to report a\ncomprehensive overview that facilitates a deeper understanding of the FSOD\nproblem and the development of innovative solutions. Finally, we discuss the\nadvantages and limitations of these algorithms to summarize the challenges,\npotential research direction, and development trend of object detection in the\ndata scarcity scenario.\n","authors":["Zhimeng Xin","Shiming Chen","Tianxu Wu","Yuanjie Shao","Weiping Ding","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2404.04799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18331v2","updated":"2024-04-07T02:51:02Z","published":"2024-02-28T13:50:46Z","title":"FineDiffusion: Scaling up Diffusion Models for Fine-grained Image\n Generation with 10,000 Classes","summary":" The class-conditional image generation based on diffusion models is renowned\nfor generating high-quality and diverse images. However, most prior efforts\nfocus on generating images for general categories, e.g., 1000 classes in\nImageNet-1k. A more challenging task, large-scale fine-grained image\ngeneration, remains the boundary to explore. In this work, we present a\nparameter-efficient strategy, called FineDiffusion, to fine-tune large\npre-trained diffusion models scaling to large-scale fine-grained image\ngeneration with 10,000 categories. FineDiffusion significantly accelerates\ntraining and reduces storage overhead by only fine-tuning tiered class\nembedder, bias terms, and normalization layers' parameters. To further improve\nthe image generation quality of fine-grained categories, we propose a novel\nsampling method for fine-grained image generation, which utilizes\nsuperclass-conditioned guidance, specifically tailored for fine-grained\ncategories, to replace the conventional classifier-free guidance sampling.\nCompared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x\ntraining speed-up and requires storing merely 1.77% of the total model\nparameters, while achieving state-of-the-art FID of 9.776 on image generation\nof 10,000 classes. Extensive qualitative and quantitative experiments\ndemonstrate the superiority of our method compared to other parameter-efficient\nfine-tuning methods. The code and more generated results are available at our\nproject website: https://finediffusion.github.io/.\n","authors":["Ziying Pan","Kun Wang","Gang Li","Feihong He","Xiwang Li","Yongxuan Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17228v2","updated":"2024-04-07T02:43:54Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. 
However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.11502v3","updated":"2024-04-07T02:42:27Z","published":"2024-02-18T08:21:05Z","title":"GenAD: Generative End-to-End Autonomous Driving","summary":" Directly producing planning results from raw sensors has been a long-desired\nsolution for autonomous driving and has attracted increasing attention\nrecently. Most existing end-to-end autonomous driving methods factorize this\nproblem into perception, motion prediction, and planning. However, we argue\nthat the conventional progressive pipeline still cannot comprehensively model\nthe entire traffic evolution process, e.g., the future interaction between the\nego car and other traffic participants and the structural trajectory prior. In\nthis paper, we explore a new paradigm for end-to-end autonomous driving, where\nthe key is to predict how the ego car and the surroundings evolve given past\nscenes. We propose GenAD, a generative framework that casts autonomous driving\ninto a generative modeling problem. We propose an instance-centric scene\ntokenizer that first transforms the surrounding scenes into map-aware instance\ntokens. We then employ a variational autoencoder to learn the future trajectory\ndistribution in a structural latent space for trajectory prior modeling. We\nfurther adopt a temporal model to capture the agent and ego movements in the\nlatent space to generate more effective future trajectories. GenAD finally\nsimultaneously performs motion prediction and planning by sampling\ndistributions in the learned structural latent space conditioned on the\ninstance tokens and using the learned temporal model to generate futures.\nExtensive experiments on the widely used nuScenes benchmark show that the\nproposed GenAD achieves state-of-the-art performance on vision-centric\nend-to-end autonomous driving with high efficiency. 
Code:\nhttps://github.com/wzzheng/GenAD.\n","authors":["Wenzhao Zheng","Ruiqi Song","Xianda Guo","Chenming Zhang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2402.11502v3.pdf","comment":"Code is available at: https://github.com/wzzheng/GenAD"},{"id":"http://arxiv.org/abs/2309.16496v3","updated":"2024-04-07T02:39:31Z","published":"2023-09-28T15:03:44Z","title":"CCEdit: Creative and Controllable Video Editing via Diffusion Models","summary":" In this paper, we present CCEdit, a versatile generative video editing\nframework based on diffusion models. Our approach employs a novel trident\nnetwork structure that separates structure and appearance control, ensuring\nprecise and creative editing capabilities. Utilizing the foundational\nControlNet architecture, we maintain the structural integrity of the video\nduring editing. The incorporation of an additional appearance branch enables\nusers to exert fine-grained control over the edited key frame. These two side\nbranches seamlessly integrate into the main branch, which is constructed upon\nexisting text-to-image (T2I) generation models, through learnable temporal\nlayers. The versatility of our framework is demonstrated through a diverse\nrange of choices in both structure representations and personalized T2I models,\nas well as the option to provide the edited key frame. To facilitate\ncomprehensive evaluation, we introduce the BalanceCC benchmark dataset,\ncomprising 100 videos and 4 target prompts for each video. Our extensive user\nstudies compare CCEdit with eight state-of-the-art video editing methods. The\noutcomes demonstrate CCEdit's substantial superiority over all other methods.\n","authors":["Ruoyu Feng","Wenming Weng","Yanhui Wang","Yuhui Yuan","Jianmin Bao","Chong Luo","Zhibo Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2309.16496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12850v2","updated":"2024-04-07T02:18:23Z","published":"2023-10-19T14:04:53Z","title":"PrivImage: Differentially Private Synthetic Image Generation using\n Diffusion Models with Semantic-Aware Pretraining","summary":" Differential Privacy (DP) image data synthesis, which leverages the DP\ntechnique to generate synthetic data to replace the sensitive data, allowing\norganizations to share and utilize synthetic images without privacy concerns.\nPrevious methods incorporate the advanced techniques of generative models and\npre-training on a public dataset to produce exceptional DP image data, but\nsuffer from problems of unstable training and massive computational resource\ndemands. This paper proposes a novel DP image synthesis method, termed\nPRIVIMAGE, which meticulously selects pre-training data, promoting the\nefficient creation of DP datasets with high fidelity and utility. PRIVIMAGE\nfirst establishes a semantic query function using a public dataset. Then, this\nfunction assists in querying the semantic distribution of the sensitive\ndataset, facilitating the selection of data from the public dataset with\nanalogous semantics for pre-training. Finally, we pre-train an image generative\nmodel using the selected data and then fine-tune this model on the sensitive\ndataset using Differentially Private Stochastic Gradient Descent (DP-SGD).\nPRIVIMAGE allows us to train a lightly parameterized generative model, reducing\nthe noise in the gradient during DP-SGD training and enhancing training\nstability. 
Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the\npublic dataset for pre-training and 7.6% of the parameters in the generative\nmodel compared to the state-of-the-art method, whereas achieves superior\nsynthetic performance and conserves more computational resources. On average,\nPRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy\nthan the state-of-the-art method. The replication package and datasets can be\naccessed online.\n","authors":["Kecen Li","Chen Gong","Zhixiang Li","Yuzhong Zhao","Xinwen Hou","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12850v2.pdf","comment":"Accepted at USENIX Security 2024"},{"id":"http://arxiv.org/abs/2404.04785v1","updated":"2024-04-07T02:15:43Z","published":"2024-04-07T02:15:43Z","title":"Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution","summary":" Recently, diffusion models (DM) have been applied in magnetic resonance\nimaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive\nperformance, especially with regard to detailed reconstruction. However, the\ncurrent DM-based SR reconstruction methods still face the following issues: (1)\nThey require a large number of iterations to reconstruct the final image, which\nis inefficient and consumes a significant amount of computational resources.\n(2) The results reconstructed by these methods are often misaligned with the\nreal high-resolution images, leading to remarkable distortion in the\nreconstructed MR images. To address the aforementioned issues, we propose an\nefficient diffusion model for multi-contrast MRI SR, named as DiffMSR.\nSpecifically, we apply DM in a highly compact low-dimensional latent space to\ngenerate prior knowledge with high-frequency detail information. The highly\ncompact latent space ensures that DM requires only a few simple iterations to\nproduce accurate prior knowledge. In addition, we design the Prior-Guide Large\nWindow Transformer (PLWformer) as the decoder for DM, which can extend the\nreceptive field while fully utilizing the prior knowledge generated by DM to\nensure that the reconstructed MR image remains undistorted. Extensive\nexperiments on public and clinical datasets demonstrate that our DiffMSR\noutperforms state-of-the-art methods.\n","authors":["Guangyuan Li","Chen Rao","Juncheng Mo","Zhanjie Zhang","Wei Xing","Lei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.04785v1.pdf","comment":"14 pages, 12 figures, Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.00674v2","updated":"2024-04-07T01:56:15Z","published":"2024-03-31T12:45:23Z","title":"Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated\n Objects","summary":" We present Knowledge NeRF to synthesize novel views for dynamic scenes.\nReconstructing dynamic 3D scenes from few sparse views and rendering them from\narbitrary perspectives is a challenging problem with applications in various\ndomains. Previous dynamic NeRF methods learn the deformation of articulated\nobjects from monocular videos. However, qualities of their reconstructed scenes\nare limited. To clearly reconstruct dynamic scenes, we propose a new framework\nby considering two frames at a time.We pretrain a NeRF model for an articulated\nobject.When articulated objects moves, Knowledge NeRF learns to generate novel\nviews at the new state by incorporating past knowledge in the pretrained NeRF\nmodel with minimal observations in the present state. 
We propose a projection\nmodule to adapt NeRF for dynamic scenes, learning the correspondence between\npretrained knowledge base and current states. Experimental results demonstrate\nthe effectiveness of our method in reconstructing dynamic 3D scenes with 5\ninput images in one state. Knowledge NeRF is a new pipeline and promising\nsolution for novel view synthesis in dynamic articulated objects. The data and\nimplementation are publicly available at\nhttps://github.com/RussRobin/Knowledge_NeRF.\n","authors":["Wenxiao Cai","Xinyue Lei","Xinyu He","Junming Leo Chen","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02449v2","updated":"2024-04-07T01:55:40Z","published":"2024-03-04T20:05:28Z","title":"Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging","summary":" High dynamic range (HDR) imaging involves capturing a series of frames of the\nsame scene, each with different exposure settings, to broaden the dynamic range\nof light. This can be achieved through burst capturing or using staggered HDR\nsensors that capture long and short exposures simultaneously in the camera\nimage signal processor (ISP). Within camera ISP pipeline, illuminant estimation\nis a crucial step aiming to estimate the color of the global illuminant in the\nscene. This estimation is used in camera ISP white-balance module to remove\nundesirable color cast in the final image. Despite the multiple frames captured\nin the HDR pipeline, conventional illuminant estimation methods often rely only\non a single frame of the scene. In this paper, we explore leveraging\ninformation from frames captured with different exposure times. Specifically,\nwe introduce a simple feature extracted from dual-exposure images to guide\nilluminant estimators, referred to as the dual-exposure feature (DEF). To\nvalidate the efficiency of DEF, we employed two illuminant estimators using the\nproposed DEF: 1) a multilayer perceptron network (MLP), referred to as\nexposure-based MLP (EMLP), and 2) a modified version of the convolutional color\nconstancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC\nachieve promising results, in some cases surpassing prior methods that require\nhundreds of thousands or millions of parameters, with only a few hundred\nparameters for EMLP and a few thousand parameters for ECCC.\n","authors":["Mahmoud Afifi","Zhenhua Hu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2403.02449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20210v3","updated":"2024-04-07T01:25:09Z","published":"2023-10-31T06:19:09Z","title":"UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale\n Transformer","summary":" Underwater images often exhibit poor quality, distorted color balance and low\ncontrast due to the complex and intricate interplay of light, water, and\nobjects. Despite the significant contributions of previous underwater\nenhancement techniques, there exist several problems that demand further\nimprovement: (i) The current deep learning methods rely on Convolutional Neural\nNetworks (CNNs) that lack the multi-scale enhancement, and global perception\nfield is also limited. (ii) The scarcity of paired real-world underwater\ndatasets poses a significant challenge, and the utilization of synthetic image\npairs could lead to overfitting. 
To address the aforementioned problems, this\npaper introduces a Multi-scale Transformer-based Network called UWFormer for\nenhancing images at multiple frequencies via semi-supervised learning, in which\nwe propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale\nFusion Feed-forward Network for low-frequency enhancement. Besides, we\nintroduce a special underwater semi-supervised training strategy, where we\npropose a Subaqueous Perceptual Loss function to generate reliable pseudo\nlabels. Experiments using full-reference and non-reference underwater\nbenchmarks demonstrate that our method outperforms state-of-the-art methods in\nterms of both quantity and visual quality.\n","authors":["Weiwen Chen","Yingtie Lei","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2310.20210v3.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.04763v1","updated":"2024-04-07T00:28:13Z","published":"2024-04-07T00:28:13Z","title":"GenEARL: A Training-Free Generative Framework for Multimodal Event\n Argument Role Labeling","summary":" Multimodal event argument role labeling (EARL), a task that assigns a role\nfor each event participant (object) in an image is a complex challenge. It\nrequires reasoning over the entire image, the depicted event, and the\ninteractions between various objects participating in the event. Existing\nmodels heavily rely on high-quality event-annotated training data to understand\nthe event semantics and structures, and they fail to generalize to new event\ntypes and domains. In this paper, we propose GenEARL, a training-free\ngenerative framework that harness the power of the modern generative models to\nunderstand event task descriptions given image contexts to perform the EARL\ntask. Specifically, GenEARL comprises two stages of generative prompting with a\nfrozen vision-language model (VLM) and a frozen large language model (LLM).\nFirst, a generative VLM learns the semantics of the event argument roles and\ngenerates event-centric object descriptions based on the image. Subsequently, a\nLLM is prompted with the generated object descriptions with a predefined\ntemplate for EARL (i.e., assign an object with an event argument role). We show\nthat GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4%\nand 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets,\nrespectively. In addition, we outperform CLIP-Event by 22% precision on M2E2\ndataset. The framework also allows flexible adaptation and generalization to\nunseen domains.\n","authors":["Hritik Bansal","Po-Nien Kung","P. Jeffrey Brantingham","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.04763v1.pdf","comment":"20 pages, 15 Figures, 13 figures"},{"id":"http://arxiv.org/abs/2404.06332v1","updated":"2024-04-07T12:42:02Z","published":"2024-04-07T12:42:02Z","title":"X-VARS: Introducing Explainability in Football Refereeing with\n Multi-Modal Large Language Model","summary":" The rapid advancement of artificial intelligence has led to significant\nimprovements in automated decision-making. However, the increased performance\nof models often comes at the cost of explainability and transparency of their\ndecision-making processes. In this paper, we investigate the capabilities of\nlarge language models to explain decisions, using football refereeing as a\ntesting ground, given its decision complexity and subjectivity. 
We introduce\nthe Explainable Video Assistant Referee System, X-VARS, a multi-modal large\nlanguage model designed for understanding football videos from the point of\nview of a referee. X-VARS can perform a multitude of tasks, including video\ndescription, question answering, action recognition, and conducting meaningful\nconversations based on video content and in accordance with the Laws of the\nGame for football referees. We validate X-VARS on our novel dataset,\nSoccerNet-XFoul, which consists of more than 22k video-question-answer triplets\nannotated by over 70 experienced football referees. Our experiments and human\nstudy illustrate the impressive capabilities of X-VARS in interpreting complex\nfootball clips. Furthermore, we highlight the potential of X-VARS to reach\nhuman performance and support football referees in the future.\n","authors":["Jan Held","Hani Itani","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.06332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS. 
arXiv admin note: substantial text overlap with\n arXiv:2312.11816"}]},"2024-04-09T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.06512v1","updated":"2024-04-09T17:59:32Z","published":"2024-04-09T17:59:32Z","title":"InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model\n Handling Resolutions from 336 Pixels to 4K HD","summary":" The Large Vision-Language Model (LVLM) field has seen significant\nadvancements, yet its progression has been hindered by challenges in\ncomprehending fine-grained visual content due to limited resolution. Recent\nefforts have aimed to enhance the high-resolution understanding capabilities of\nLVLMs, yet they remain capped at approximately 1500 x 1500 pixels and\nconstrained to a relatively narrow resolution range. This paper represents\nInternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM\nresolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently,\nconsidering the ultra-high resolution may not be necessary in all scenarios, it\nsupports a wide range of diverse resolutions from 336 pixels to 4K standard,\nsignificantly broadening its scope of applicability. Specifically, this\nresearch advances the patch division paradigm by introducing a novel extension:\ndynamic resolution with automatic patch configuration. It maintains the\ntraining image aspect ratios while automatically varying patch counts and\nconfiguring layouts based on a pre-trained Vision Transformer (ViT) (336 x\n336), leading to dynamic training resolution from 336 pixels to 4K standard.\nOur research demonstrates that scaling training resolution up to 4K HD leads to\nconsistent performance enhancements without hitting the ceiling of potential\nimprovements. InternLM-XComposer2-4KHD shows superb capability that matches or\neven surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The\nInternLM-XComposer2-4KHD model series with 7B parameters are publicly available\nat https://github.com/InternLM/InternLM-XComposer.\n","authors":["Xiaoyi Dong","Pan Zhang","Yuhang Zang","Yuhang Cao","Bin Wang","Linke Ouyang","Songyang Zhang","Haodong Duan","Wenwei Zhang","Yining Li","Hang Yan","Yang Gao","Zhe Chen","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Kai Chen","Conghui He","Xingcheng Zhang","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06512v1.pdf","comment":"Code and models are publicly available at\n https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2404.06511v1","updated":"2024-04-09T17:59:31Z","published":"2024-04-09T17:59:31Z","title":"MoReVQA: Exploring Modular Reasoning Models for Video Question Answering","summary":" This paper addresses the task of video question answering (videoQA) via a\ndecomposed multi-stage, modular reasoning framework. Previous modular methods\nhave shown promise with a single planning stage ungrounded in visual content.\nHowever, through a simple and effective baseline, we find that such systems can\nlead to brittle behavior in practice for challenging videoQA settings. Thus,\nunlike traditional single-stage planning methods, we propose a multi-stage\nsystem consisting of an event parser, a grounding stage, and a final reasoning\nstage in conjunction with an external memory. All stages are training-free, and\nperformed using few-shot prompting of large models, creating interpretable\nintermediate outputs at each stage. 
By decomposing the underlying planning and\ntask complexity, our method, MoReVQA, improves over prior work on standard\nvideoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with\nstate-of-the-art results, and extensions to related tasks (grounded videoQA,\nparagraph captioning).\n","authors":["Juhong Min","Shyamal Buch","Arsha Nagrani","Minsu Cho","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.06511v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06510v1","updated":"2024-04-09T17:59:04Z","published":"2024-04-09T17:59:04Z","title":"Can Feedback Enhance Semantic Grounding in Large Vision-Language Models?","summary":" Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often\ninvolves collecting domain-specific training data, refining the network\narchitectures, or modifying the training recipes. In this work, we venture into\nan orthogonal direction and explore whether VLMs can improve their semantic\ngrounding by \"receiving\" feedback, without requiring in-domain data,\nfine-tuning, or modifications to the network architectures. We systematically\nanalyze this hypothesis using a feedback mechanism composed of a binary signal.\nWe find that if prompted appropriately, VLMs can utilize feedback both in a\nsingle step and iteratively, showcasing the potential of feedback as an\nalternative technique to improve grounding in internet-scale VLMs. Furthermore,\nVLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we\nfind that this issue can be mitigated via a binary verification mechanism.\nFinally, we explore the potential and limitations of amalgamating these\nfindings and applying them iteratively to automatically enhance VLMs' grounding\nperformance, showing grounding accuracy consistently improves using automated\nfeedback across all models in all settings investigated. Overall, our iterative\nframework improves semantic grounding in VLMs by more than 15 accuracy points\nunder noise-free feedback and up to 5 accuracy points under a simple automated\nbinary verification mechanism. The project website is hosted at\nhttps://andrewliao11.github.io/vlms_feedback\n","authors":["Yuan-Hong Liao","Rafid Mahmood","Sanja Fidler","David Acuna"],"pdf_url":"https://arxiv.org/pdf/2404.06510v1.pdf","comment":"31 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.06507v1","updated":"2024-04-09T17:55:41Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). 
Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v3","updated":"2024-04-09T17:54:12Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects so cannot be directly used for this task. To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagate it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated dataset containing abundant entity relationships. Experiments\ndemonstrate that our visual grounding performance increase of up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v3.pdf","comment":"CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC"},{"id":"http://arxiv.org/abs/2212.08731v3","updated":"2024-04-09T17:52:49Z","published":"2022-12-16T22:03:37Z","title":"Multi-person 3D pose estimation from unlabelled data","summary":" Its numerous applications make multi-human 3D pose estimation a remarkably\nimpactful area of research. Nevertheless, assuming a multiple-view system\ncomposed of several regular RGB cameras, 3D multi-pose estimation presents\nseveral challenges. First of all, each person must be uniquely identified in\nthe different views to separate the 2D information provided by the cameras.\nSecondly, the 3D pose estimation process from the multi-view 2D information of\neach person must be robust against noise and potential occlusions in the\nscenario. In this work, we address these two challenges with the help of deep\nlearning. Specifically, we present a model based on Graph Neural Networks\ncapable of predicting the cross-view correspondence of the people in the\nscenario along with a Multilayer Perceptron that takes the 2D points to yield\nthe 3D poses of each person. These two models are trained in a self-supervised\nmanner, thus avoiding the need for large datasets with 3D annotations.\n","authors":["Daniel Rodriguez-Criado","Pilar Bachiller","George Vogiatzis","Luis J. 
Manso"],"pdf_url":"https://arxiv.org/pdf/2212.08731v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06493v1","updated":"2024-04-09T17:48:52Z","published":"2024-04-09T17:48:52Z","title":"Flying With Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v1.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2303.12054v4","updated":"2024-04-09T17:44:24Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. 
Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06483v1","updated":"2024-04-09T17:34:19Z","published":"2024-04-09T17:34:19Z","title":"RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length\n Videos","summary":" Remote photoplethysmography (rPPG) is a non-contact method for detecting\nphysiological signals from facial videos, holding great potential in various\napplications such as healthcare, affective computing, and anti-spoofing.\nExisting deep learning methods struggle to address two core issues of rPPG\nsimultaneously: extracting weak rPPG signals from video segments with large\nspatiotemporal redundancy and understanding the periodic patterns of rPPG among\nlong contexts. This represents a trade-off between computational complexity and\nthe ability to capture long-range dependencies, posing a challenge for rPPG\nthat is suitable for deployment on mobile devices. Based on the in-depth\nexploration of Mamba's comprehension of spatial and temporal information, this\npaper introduces RhythmMamba, an end-to-end Mamba-based method that employs\nmulti-temporal Mamba to constrain both periodic patterns and short-term trends,\ncoupled with frequency domain feed-forward to enable Mamba to robustly\nunderstand the quasi-periodic patterns of rPPG. Extensive experiments show that\nRhythmMamba achieves state-of-the-art performance with reduced parameters and\nlower computational complexity. The proposed RhythmMamba can be applied to\nvideo segments of any length without performance degradation. The codes are\navailable at https://github.com/zizheng-guo/RhythmMamba.\n","authors":["Bochao Zou","Zizheng Guo","Xiaocheng Hu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.12788"},{"id":"http://arxiv.org/abs/2404.06479v1","updated":"2024-04-09T17:30:18Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. 
It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v1.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06470v1","updated":"2024-04-09T17:17:48Z","published":"2024-04-09T17:17:48Z","title":"Learning State-Invariant Representations of Objects from Image\n Collections with State, Pose, and Viewpoint Changes","summary":" We add one more invariance - state invariance - to the more commonly used\nother invariances for learning object representations for recognition and\nretrieval. By state invariance, we mean robust with respect to changes in the\nstructural form of the object, such as when an umbrella is folded, or when an\nitem of clothing is tossed on the floor. Since humans generally have no\ndifficulty in recognizing objects despite such state changes, we are naturally\nfaced with the question of whether it is possible to devise a neural\narchitecture with similar abilities. To that end, we present a novel dataset,\nObjectsWithStateChange, that captures state and pose variations in the object\nimages recorded from arbitrary viewpoints. We believe that this dataset will\nfacilitate research in fine-grained object recognition and retrieval of objects\nthat are capable of state changes. The goal of such research would be to train\nmodels capable of generating object embeddings that remain invariant to state\nchanges while also staying invariant to transformations induced by changes in\nviewpoint, pose, illumination, etc. To demonstrate the usefulness of the\nObjectsWithStateChange dataset, we also propose a curriculum learning strategy\nthat uses the similarity relationships in the learned embedding space after\neach epoch to guide the training process. The model learns discriminative\nfeatures by comparing visually similar objects within and across different\ncategories, encouraging it to differentiate between objects that may be\nchallenging to distinguish due to changes in their state. 
We believe that this\nstrategy enhances the model's ability to capture discriminative features for\nfine-grained tasks that may involve objects with state changes, leading to\nperformance improvements on object-level tasks not only on our new dataset, but\nalso on two other challenging multi-view datasets such as ModelNet40 and\nObjectPI.\n","authors":["Rohan Sarkar","Avinash Kak"],"pdf_url":"https://arxiv.org/pdf/2404.06470v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2304.06140v3","updated":"2024-04-09T17:09:03Z","published":"2023-04-12T19:47:13Z","title":"An Edit Friendly DDPM Noise Space: Inversion and Manipulations","summary":" Denoising diffusion probabilistic models (DDPMs) employ a sequence of white\nGaussian noise samples to generate an image. In analogy with GANs, those noise\nmaps could be considered as the latent code associated with the generated\nimage. However, this native noise space does not possess a convenient\nstructure, and is thus challenging to work with in editing tasks. Here, we\npropose an alternative latent noise space for DDPM that enables a wide range of\nediting operations via simple means, and present an inversion method for\nextracting these edit-friendly noise maps for any given image (real or\nsynthetically generated). As opposed to the native DDPM noise space, the\nedit-friendly noise maps do not have a standard normal distribution and are not\nstatistically independent across timesteps. However, they allow perfect\nreconstruction of any desired image, and simple transformations on them\ntranslate into meaningful manipulations of the output image (e.g. shifting,\ncolor edits). Moreover, in text-conditional models, fixing those noise maps\nwhile changing the text prompt, modifies semantics while retaining structure.\nWe illustrate how this property enables text-based editing of real images via\nthe diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM\ninversion). We also show how it can be used within existing diffusion-based\nediting methods to improve their quality and diversity. Webpage:\nhttps://inbarhub.github.io/DDPM_inversion\n","authors":["Inbar Huberman-Spiegelglas","Vladimir Kulikov","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2304.06140v3.pdf","comment":"CVPR 2024. Code and examples are available at\n https://github.com/inbarhub/DDPM_inversion"},{"id":"http://arxiv.org/abs/2404.06455v1","updated":"2024-04-09T16:55:23Z","published":"2024-04-09T16:55:23Z","title":"A comparative analysis of deep learning models for lung segmentation on\n X-ray images","summary":" Robust and highly accurate lung segmentation in X-rays is crucial in medical\nimaging. This study evaluates deep learning solutions for this task, ranking\nexisting methods and analyzing their performance under diverse image\nmodifications. Out of 61 analyzed papers, only nine offered implementation or\npre-trained models, enabling assessment of three prominent methods: Lung VAE,\nTransResUNet, and CE-Net. 
The analysis revealed that CE-Net performs best,\ndemonstrating the highest values in dice similarity coefficient and\nintersection over union metric.\n","authors":["Weronika Hryniewska-Guzik","Jakub Bilski","Bartosz Chrostowski","Jakub Drak Sbahi","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.06455v1.pdf","comment":"published at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2024"},{"id":"http://arxiv.org/abs/2404.06453v1","updated":"2024-04-09T16:54:19Z","published":"2024-04-09T16:54:19Z","title":"PURE: Turning Polysemantic Neurons Into Pure Features by Identifying\n Relevant Circuits","summary":" The field of mechanistic interpretability aims to study the role of\nindividual neurons in Deep Neural Networks. Single neurons, however, have the\ncapability to act polysemantically and encode for multiple (unrelated)\nfeatures, which renders their interpretation difficult. We present a method for\ndisentangling polysemanticity of any Deep Neural Network by decomposing a\npolysemantic neuron into multiple monosemantic \"virtual\" neurons. This is\nachieved by identifying the relevant sub-graph (\"circuit\") for each \"pure\"\nfeature. We demonstrate how our approach allows us to find and disentangle\nvarious polysemantic units of ResNet models trained on ImageNet. While\nevaluating feature visualizations using CLIP, our method effectively\ndisentangles representations, improving upon methods based on neuron\nactivations. Our code is available at https://github.com/maxdreyer/PURE.\n","authors":["Maximilian Dreyer","Erblina Purelku","Johanna Vielhaben","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.06453v1.pdf","comment":"14 pages (4 pages manuscript, 2 pages references, 8 pages appendix)"},{"id":"http://arxiv.org/abs/2404.06451v1","updated":"2024-04-09T16:53:43Z","published":"2024-04-09T16:53:43Z","title":"SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions","summary":" Human visual imagination usually begins with analogies or rough sketches. For\nexample, given an image with a girl playing guitar before a building, one may\nanalogously imagine how it seems like if Iron Man playing guitar before Pyramid\nin Egypt. Nonetheless, visual condition may not be precisely aligned with the\nimaginary result indicated by text prompt, and existing layout-controllable\ntext-to-image (T2I) generation models is prone to producing degraded generated\nresults with obvious artifacts. To address this issue, we present a novel T2I\ngeneration method dubbed SmartControl, which is designed to modify the rough\nvisual conditions for adapting to text prompt. The key idea of our SmartControl\nis to relax the visual condition on the areas that are conflicted with text\nprompts. In specific, a Control Scale Predictor (CSP) is designed to identify\nthe conflict regions and predict the local control scales, while a dataset with\ntext prompts and rough visual conditions is constructed for training CSP. It is\nworth noting that, even with a limited number (e.g., 1,000~2,000) of training\nsamples, our SmartControl can generalize well to unseen objects. Extensive\nexperiments on four typical visual condition types clearly show the efficacy of\nour SmartControl against state-of-the-arts. 
Source code, pre-trained models,\nand datasets are available at https://github.com/liuxiaoyu1104/SmartControl.\n","authors":["Xiaoyu Liu","Yuxiang Wei","Ming Liu","Xianhui Lin","Peiran Ren","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06447v1","updated":"2024-04-09T16:49:42Z","published":"2024-04-09T16:49:42Z","title":"The Central Spanning Tree Problem","summary":" Spanning trees are an important primitive in many data analysis tasks, when a\ndata set needs to be summarized in terms of its \"skeleton\", or when a\ntree-shaped graph over all observations is required for downstream processing.\nPopular definitions of spanning trees include the minimum spanning tree and the\noptimum distance spanning tree, a.k.a. the minimum routing cost tree. When\nsearching for the shortest spanning tree but admitting additional branching\npoints, even shorter spanning trees can be realized: Steiner trees.\nUnfortunately, both minimum spanning and Steiner trees are not robust with\nrespect to noise in the observations; that is, small perturbations of the\noriginal data set often lead to drastic changes in the associated spanning\ntrees. In response, we make two contributions when the data lies in a Euclidean\nspace: on the theoretical side, we introduce a new optimization problem, the\n\"(branched) central spanning tree\", which subsumes all previously mentioned\ndefinitions as special cases. On the practical side, we show empirically that\nthe (branched) central spanning tree is more robust to noise in the data, and\nas such is better suited to summarize a data set in terms of its skeleton. We\nalso propose a heuristic to address the NP-hard optimization problem, and\nillustrate its use on single cell RNA expression data from biology and 3D point\nclouds of plants.\n","authors":["Enrique Fita Sanmartín","Christoph Schnörr","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.06447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06443v1","updated":"2024-04-09T16:45:34Z","published":"2024-04-09T16:45:34Z","title":"Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial\n Action Units Recognition","summary":" Human facial action units (AUs) are mutually related in a hierarchical\nmanner, as not only they are associated with each other in both spatial and\ntemporal domains but also AUs located in the same/close facial regions show\nstronger relationships than those of different facial regions. While none of\nexisting approach thoroughly model such hierarchical inter-dependencies among\nAUs, this paper proposes to comprehensively model multi-scale AU-related\ndynamic and hierarchical spatio-temporal relationship among AUs for their\noccurrences recognition. Specifically, we first propose a novel multi-scale\ntemporal differencing network with an adaptive weighting block to explicitly\ncapture facial dynamics across frames at different spatial scales, which\nspecifically considers the heterogeneity of range and magnitude in different\nAUs' activation. Then, a two-stage strategy is introduced to hierarchically\nmodel the relationship among AUs based on their spatial distribution (i.e.,\nlocal and cross-region AU relationship modelling). Experimental results\nachieved on BP4D and DISFA show that our approach is the new state-of-the-art\nin the field of AU occurrence recognition. 
Our code is publicly available at\nhttps://github.com/CVI-SZU/MDHR.\n","authors":["Zihan Wang","Siyang Song","Cheng Luo","Songhe Deng","Weicheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.06443v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.06442v1","updated":"2024-04-09T16:42:54Z","published":"2024-04-09T16:42:54Z","title":"QueSTMaps: Queryable Semantic Topological Maps for 3D Scene\n Understanding","summary":" Understanding the structural organisation of 3D indoor scenes in terms of\nrooms is often accomplished via floorplan extraction. Robotic tasks such as\nplanning and navigation require a semantic understanding of the scene as well.\nThis is typically achieved via object-level semantic segmentation. However,\nsuch methods struggle to segment out topological regions like \"kitchen\" in the\nscene. In this work, we introduce a two-step pipeline. First, we extract a\ntopological map, i.e., floorplan of the indoor scene using a novel\nmulti-channel occupancy representation. Then, we generate CLIP-aligned features\nand semantic labels for every room instance based on the objects it contains\nusing a self-attention transformer. Our language-topology alignment supports\nnatural language querying, e.g., a \"place to cook\" locates the \"kitchen\". We\noutperform the current state-of-the-art on room segmentation by ~20% and room\nclassification by ~12%. Our detailed qualitative analysis and ablation studies\nprovide insights into the problem of joint structural and semantic 3D scene\nunderstanding.\n","authors":["Yash Mehan","Kumaraditya Gupta","Rohit Jayanti","Anirudh Govil","Sourav Garg","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12962v2","updated":"2024-04-09T16:39:00Z","published":"2021-10-25T13:56:00Z","title":"Event Data Association via Robust Model Fitting for Event-based Object\n Tracking","summary":" Event-based approaches, which are based on bio-inspired asynchronous event\ncameras, have achieved promising performance on various computer vision tasks.\nHowever, the study of the fundamental event data association problem is still\nin its infancy. In this paper, we propose a novel Event Data Association\n(called EDA) approach to explicitly address the event association and fusion\nproblem. The proposed EDA seeks for event trajectories that best fit the event\ndata, in order to perform unifying data association and information fusion. In\nEDA, we first asynchronously fuse the event data based on its information\nentropy. Then, we introduce a deterministic model hypothesis generation\nstrategy, which effectively generates model hypotheses from the fused events,\nto represent the corresponding event trajectories. After that, we present a\ntwo-stage weighting algorithm, which robustly weighs and selects true models\nfrom the generated model hypotheses, through multi-structural geometric model\nfitting. Meanwhile, we also propose an adaptive model selection strategy to\nautomatically determine the number of the true models. Finally, we use the\nselected true models to associate and fuse the event data, without being\naffected by sensor noise and irrelevant structures. We evaluate the performance\nof the proposed EDA on the object tracking task. 
The experimental results show\nthe effectiveness of EDA under challenging scenarios, such as high speed,\nmotion blur, and high dynamic range conditions.\n","authors":["Haosheng Chen","Shuyuan Lin","Yan Yan","Hanzi Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2110.12962v2.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.02408v2","updated":"2024-04-09T16:35:41Z","published":"2024-03-04T19:06:13Z","title":"A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement","summary":" Distortions caused by low-light conditions are not only visually unpleasant\nbut also degrade the performance of computer vision tasks. The restoration and\nenhancement have proven to be highly beneficial. However, there are only a\nlimited number of enhancement methods explicitly designed for videos acquired\nin low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet)\nmodel using a Swin Transformer as a backbone to capture low light video\nfeatures and exploit their spatio-temporal correlations. The STA-SUNet model is\ntrained on a novel, fully registered dataset (BVI), which comprises dynamic\nscenes captured under varying light conditions. It is further analysed\ncomparatively against various other models over three test datasets. The model\ndemonstrates superior adaptivity across all datasets, obtaining the highest\nPSNR and SSIM values. It is particularly effective in extreme low-light\nconditions, yielding fairly good visualisation results.\n","authors":["Ruirui Lin","Nantheera Anantrasirichai","Alexandra Malyugina","David Bull"],"pdf_url":"https://arxiv.org/pdf/2403.02408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03108v3","updated":"2024-04-09T16:31:33Z","published":"2023-07-06T16:27:39Z","title":"DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion\n Models","summary":" Recent text-to-image diffusion models have shown surprising performance in\ngenerating high-quality images. However, concerns have arisen regarding the\nunauthorized data usage during the training or fine-tuning process. One example\nis when a model trainer collects a set of images created by a particular artist\nand attempts to train a model capable of generating similar images without\nobtaining permission and giving credit to the artist. To address this issue, we\npropose a method for detecting such unauthorized data usage by planting the\ninjected memorization into the text-to-image diffusion models trained on the\nprotected dataset. Specifically, we modify the protected images by adding\nunique contents on these images using stealthy image warping functions that are\nnearly imperceptible to humans but can be captured and memorized by diffusion\nmodels. By analyzing whether the model has memorized the injected content\n(i.e., whether the generated images are processed by the injected\npost-processing function), we can detect models that had illegally utilized the\nunauthorized data. Experiments on Stable Diffusion and VQ Diffusion with\ndifferent model training or fine-tuning methods (i.e, LoRA, DreamBooth, and\nstandard training) demonstrate the effectiveness of our proposed method in\ndetecting unauthorized data usages. Code:\nhttps://github.com/ZhentingWang/DIAGNOSIS.\n","authors":["Zhenting Wang","Chen Chen","Lingjuan Lyu","Dimitris N. 
Metaxas","Shiqing Ma"],"pdf_url":"https://arxiv.org/pdf/2307.03108v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06437v1","updated":"2024-04-09T16:28:54Z","published":"2024-04-09T16:28:54Z","title":"Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks","summary":" With climate change expected to exacerbate fire weather conditions, the\naccurate anticipation of wildfires on a global scale becomes increasingly\ncrucial for disaster mitigation. In this study, we utilize SeasFire, a\ncomprehensive global wildfire dataset with climate, vegetation, oceanic\nindices, and human-related variables, to enable seasonal wildfire forecasting\nwith machine learning. For the predictive analysis, we train deep learning\nmodels with different architectures that capture the spatio-temporal context\nleading to wildfires. Our investigation focuses on assessing the effectiveness\nof these models in predicting the presence of burned areas at varying\nforecasting time horizons globally, extending up to six months into the future,\nand on how different spatial or/and temporal context affects the performance of\nthe models. Our findings demonstrate the great potential of deep learning\nmodels in seasonal fire forecasting; longer input time-series leads to more\nrobust predictions across varying forecasting horizons, while integrating\nspatial information to capture wildfire spatio-temporal dynamics boosts\nperformance. Finally, our results hint that in order to enhance performance at\nlonger forecasting horizons, a larger receptive field spatially needs to be\nconsidered.\n","authors":["Dimitrios Michail","Lefki-Ioanna Panagiotou","Charalampos Davalas","Ioannis Prapas","Spyros Kondylatos","Nikolaos Ioannis Bountos","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2404.06437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06430v1","updated":"2024-04-09T16:23:01Z","published":"2024-04-09T16:23:01Z","title":"pfl-research: simulation framework for accelerating research in Private\n Federated Learning","summary":" Federated learning (FL) is an emerging machine learning (ML) training\nparadigm where clients own their data and collaborate to train a global model,\nwithout revealing any data to the server and other participants. Researchers\ncommonly perform experiments in a simulation environment to quickly iterate on\nideas. However, existing open-source tools do not offer the efficiency required\nto simulate FL on larger and more realistic FL datasets. We introduce\npfl-research, a fast, modular, and easy-to-use Python framework for simulating\nFL. It supports TensorFlow, PyTorch, and non-neural network models, and is\ntightly integrated with state-of-the-art privacy algorithms. We study the speed\nof open-source FL frameworks and show that pfl-research is 7-72$\\times$ faster\nthan alternative open-source frameworks on common cross-device setups. Such\nspeedup will significantly boost the productivity of the FL research community\nand enable testing hypotheses on realistic FL datasets that were previously too\nresource intensive. We release a suite of benchmarks that evaluates an\nalgorithm's overall performance on a diverse set of realistic scenarios. 
The\ncode is available on GitHub at https://github.com/apple/pfl-research.\n","authors":["Filip Granqvist","Congzheng Song","Áine Cahill","Rogier van Dalen","Martin Pelikan","Yi Sheng Chan","Xiaojun Feng","Natarajan Krishnaswami","Vojta Jina","Mona Chitnis"],"pdf_url":"https://arxiv.org/pdf/2404.06430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06429v1","updated":"2024-04-09T16:20:03Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Mutli-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ncreation has made significant progress recently. One promising solution\ninvolves the fine-tuning of pre-trained 2D diffusion models to harness their\ncapacity for producing multi-view images, which are then lifted into accurate\n3D models via methods like fast-NeRFs or large reconstruction models. However,\nas inconsistency still exists and limited generated resolution, the generation\nresults of such methods still lack intricate textures and complex geometries.\nTo solve this problem, we propose Magic-Boost, a multi-view conditioned\ndiffusion model that significantly refines coarse generative results through a\nbrief period of SDS optimization ($\\sim15$min). Compared to the previous text\nor single image based diffusion models, Magic-Boost exhibits a robust\ncapability to generate images with high consistency from pseudo synthesized\nmulti-view images. It provides precise SDS guidance that well aligns with the\nidentity of the input images, enriching the local detail in both geometry and\ntexture of the initial generative results. Extensive experiments show\nMagic-Boost greatly enhances the coarse inputs and generates high-quality 3D\nassets with rich geometric and textural details. (Project Page:\nhttps://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06425v1","updated":"2024-04-09T16:15:03Z","published":"2024-04-09T16:15:03Z","title":"ZeST: Zero-Shot Material Transfer from a Single Image","summary":" We propose ZeST, a method for zero-shot material transfer to an object in the\ninput image given a material exemplar image. ZeST leverages existing diffusion\nadapters to extract implicit material representation from the exemplar image.\nThis representation is used to transfer the material using pre-trained\ninpainting diffusion model on the object in the input image using depth\nestimates as geometry cue and grayscale object shading as illumination cues.\nThe method works on real images without any training resulting a zero-shot\napproach. Both qualitative and quantitative results on real and synthetic\ndatasets demonstrate that ZeST outputs photorealistic images with transferred\nmaterials. We also show the application of ZeST to perform multiple edits and\nrobust material assignment under different illuminations. 
Project Page:\nhttps://ttchengab.github.io/zest\n","authors":["Ta-Ying Cheng","Prafull Sharma","Andrew Markham","Niki Trigoni","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.06425v1.pdf","comment":"Project Page: https://ttchengab.github.io/zest"},{"id":"http://arxiv.org/abs/2404.06406v1","updated":"2024-04-09T15:54:03Z","published":"2024-04-09T15:54:03Z","title":"Emergent Dynamics in Neural Cellular Automata","summary":" Neural Cellular Automata (NCA) models are trainable variations of traditional\nCellular Automata (CA). Emergent motion in the patterns created by NCA has been\nsuccessfully applied to synthesize dynamic textures. However, the conditions\nrequired for an NCA to display dynamic patterns remain unexplored. Here, we\ninvestigate the relationship between the NCA architecture and the emergent\ndynamics of the trained models. Specifically, we vary the number of channels in\nthe cell state and the number of hidden neurons in the MultiLayer Perceptron\n(MLP), and draw a relationship between the combination of these two variables\nand the motion strength between successive frames. Our analysis reveals that\nthe disparity and proportionality between these two variables have a strong\ncorrelation with the emergent dynamics in the NCA output. We thus propose a\ndesign principle for creating dynamic NCA.\n","authors":["Yitao Xu","Ehsan Pajouheshgar","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06406v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2312.09168v3","updated":"2024-04-09T15:47:56Z","published":"2023-12-14T17:34:53Z","title":"DiffusionLight: Light Probes for Free by Painting a Chrome Ball","summary":" We present a simple yet effective technique to estimate lighting in a single\ninput image. Current techniques rely heavily on HDR panorama datasets to train\nneural networks to regress an input with limited field-of-view to a full\nenvironment map. However, these approaches often struggle with real-world,\nuncontrolled settings due to the limited diversity and size of their datasets.\nTo address this problem, we leverage diffusion models trained on billions of\nstandard images to render a chrome ball into the input image. Despite its\nsimplicity, this task remains challenging: the diffusion models often insert\nincorrect or inconsistent objects and cannot readily generate images in HDR\nformat. Our research uncovers a surprising relationship between the appearance\nof chrome balls and the initial diffusion noise map, which we utilize to\nconsistently generate high-quality chrome balls. We further fine-tune an LDR\ndiffusion model (Stable Diffusion XL) with LoRA, enabling it to perform\nexposure bracketing for HDR light estimation. Our method produces convincing\nlight estimates across diverse settings and demonstrates superior\ngeneralization to in-the-wild scenarios.\n","authors":["Pakkapon Phongthawee","Worameth Chinchuthakun","Nontaphat Sinsunthithet","Amit Raj","Varun Jampani","Pramook Khungurn","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2312.09168v3.pdf","comment":"CVPR 2024 Oral. For more information and code, please visit our\n website https://diffusionlight.github.io/"},{"id":"http://arxiv.org/abs/2204.03330v2","updated":"2024-04-09T15:44:05Z","published":"2022-04-07T09:56:36Z","title":"Learning Local and Global Temporal Contexts for Video Semantic\n Segmentation","summary":" Contextual information plays a core role for video semantic segmentation\n(VSS). 
This paper summarizes contexts for VSS in two-fold: local temporal\ncontexts (LTC) which define the contexts from neighboring frames, and global\ntemporal contexts (GTC) which represent the contexts from the whole video. As\nfor LTC, it includes static and motional contexts, corresponding to static and\nmoving content in neighboring frames, respectively. Previously, both static and\nmotional contexts have been studied. However, there is no research about\nsimultaneously learning static and motional contexts (highly complementary).\nHence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a\nunified presentation of LTC. CFFM contains two parts: Coarse-to-Fine Feature\nAssembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static\nand motional contexts, and CFM mines useful information from nearby frames to\nenhance target features. To further exploit more temporal contexts, we propose\nCFFM++ by additionally learning GTC from the whole video. Specifically, we\nuniformly sample certain frames from the video and extract global contextual\nprototypes by k-means. The information within those prototypes is mined by CFM\nto refine target features. Experimental results on popular benchmarks\ndemonstrate that CFFM and CFFM++ perform favorably against state-of-the-art\nmethods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM\n","authors":["Guolei Sun","Yun Liu","Henghui Ding","Min Wu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2204.03330v2.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in CVPR\n 2022"},{"id":"http://arxiv.org/abs/2401.16110v2","updated":"2024-04-09T15:33:10Z","published":"2024-01-29T12:31:13Z","title":"SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D\n Object Detection","summary":" Roadside perception can greatly increase the safety of autonomous vehicles by\nextending their perception ability beyond the visual range and addressing blind\nspots. However, current state-of-the-art vision-based roadside detection\nmethods possess high accuracy on labeled scenes but have inferior performance\non new scenes. This is because roadside cameras remain stationary after\ninstallation and can only collect data from a single scene, resulting in the\nalgorithm overfitting these roadside backgrounds and camera poses. To address\nthis issue, in this paper, we propose an innovative Scenario Generalization\nFramework for Vision-based Roadside 3D Object Detection, dubbed SGV3D.\nSpecifically, we employ a Background-suppressed Module (BSM) to mitigate\nbackground overfitting in vision-centric pipelines by attenuating background\nfeatures during the 2D to bird's-eye-view projection. Furthermore, by\nintroducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled\nimages from new scenes, diverse instance foregrounds with varying camera poses\nare generated, addressing the risk of overfitting specific camera poses. We\nevaluate our method on two large-scale roadside benchmarks. Our method\nsurpasses all previous methods by a significant margin in new scenes, including\n+42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to\nBEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D\nheterologous benchmark, we achieve notable gains of 14.48% for car and 12.41%\nfor large vehicle. We aspire to contribute insights on the exploration of\nroadside perception techniques, emphasizing their capability for scenario\ngeneralization. 
The code will be available at\nhttps://github.com/yanglei18/SGV3D\n","authors":["Lei Yang","Xinyu Zhang","Jun Li","Li Wang","Chuang Zhang","Li Ju","Zhiwei Li","Yang Shen"],"pdf_url":"https://arxiv.org/pdf/2401.16110v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.06389v1","updated":"2024-04-09T15:31:48Z","published":"2024-04-09T15:31:48Z","title":"Raster Forge: Interactive Raster Manipulation Library and GUI for Python","summary":" Raster Forge is a Python library and graphical user interface for raster data\nmanipulation and analysis. The tool is focused on remote sensing applications,\nparticularly in wildfire management. It allows users to import, visualize, and\nprocess raster layers for tasks such as image compositing or topographical\nanalysis. For wildfire management, it generates fuel maps using predefined\nmodels. Its impact extends from disaster management to hydrological modeling,\nagriculture, and environmental monitoring. Raster Forge can be a valuable asset\nfor geoscientists and researchers who rely on raster data analysis, enhancing\ngeospatial data processing and visualization across various disciplines.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.06389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20330v2","updated":"2024-04-09T15:17:50Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 24% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. 
We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v2.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2403.04198v2","updated":"2024-04-09T15:07:08Z","published":"2024-03-07T03:59:47Z","title":"CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors\n Object Detection from Multi-view Images","summary":" This paper introduces CN-RMA, a novel approach for 3D indoor object detection\nfrom multi-view images. We observe the key challenge as the ambiguity of image\nand 3D correspondence without explicit geometry to provide occlusion\ninformation. To address this issue, CN-RMA leverages the synergy of 3D\nreconstruction networks and 3D object detection networks, where the\nreconstruction network provides a rough Truncated Signed Distance Function\n(TSDF) and guides image features to vote to 3D space correctly in an end-to-end\nmanner. Specifically, we associate weights to sampled points of each ray\nthrough ray marching, representing the contribution of a pixel in an image to\ncorresponding 3D locations. Such weights are determined by the predicted signed\ndistances so that image features vote only to regions near the reconstructed\nsurface. Our method achieves state-of-the-art performance in 3D object\ndetection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the\nScanNet and ARKitScenes datasets. The code and models are released at\nhttps://github.com/SerCharles/CN-RMA.\n","authors":["Guanlin Shen","Jingwei Huang","Zhihua Hu","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.04198v2.pdf","comment":"CVPR2024 poster paper, 8 pages of main part, and 4 pages of\n supplementary material"},{"id":"http://arxiv.org/abs/2311.06798v2","updated":"2024-04-09T15:07:02Z","published":"2023-11-12T10:21:04Z","title":"MetaMix: Meta-state Precision Searcher for Mixed-precision Activation\n Quantization","summary":" Mixed-precision quantization of efficient networks often suffer from\nactivation instability encountered in the exploration of bit selections. To\naddress this problem, we propose a novel method called MetaMix which consists\nof bit selection and weight training phases. The bit selection phase iterates\ntwo steps, (1) the mixed-precision-aware weight update, and (2) the bit-search\ntraining with the fixed mixed-precision-aware weights, both of which combined\nreduce activation instability in mixed-precision quantization and contribute to\nfast and high-quality bit selection. The weight training phase exploits the\nweights and step sizes trained in the bit selection phase and fine-tunes them\nthereby offering fast training. Our experiments with efficient and\nhard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet\nshow that our proposed method pushes the boundary of mixed-precision\nquantization, in terms of accuracy vs. operations, by outperforming both mixed-\nand single-precision SOTA methods.\n","authors":["Han-Byul Kim","Joo Hyung Lee","Sungjoo Yoo","Hong-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06798v2.pdf","comment":"Proc. 
The 38th Annual AAAI Conference on Artificial Intelligence\n (AAAI)"},{"id":"http://arxiv.org/abs/2404.06369v1","updated":"2024-04-09T15:05:48Z","published":"2024-04-09T15:05:48Z","title":"VISION2UI: A Real-World Dataset with Layout for Code Generation from UI\n Designs","summary":" Automatically generating UI code from webpage design visions can\nsignificantly alleviate the burden of developers, enabling beginner developers\nor designers to directly generate Web pages from design diagrams. Currently,\nprior research has accomplished the objective of generating UI code from\nrudimentary design visions or sketches through designing deep neural networks.\nInspired by the groundbreaking advancements achieved by Multimodal Large\nLanguage Models (MLLMs), the automatic generation of UI code from high-fidelity\ndesign images is now emerging as a viable possibility. Nevertheless, our\ninvestigation reveals that existing MLLMs are hampered by the scarcity of\nauthentic, high-quality, and large-scale datasets, leading to unsatisfactory\nperformance in automated UI code generation. To mitigate this gap, we present a\nnovel dataset, termed VISION2UI, extracted from real-world scenarios, augmented\nwith comprehensive layout information, tailored specifically for finetuning\nMLLMs in UI code generation. Specifically, this dataset is derived through a\nseries of operations, encompassing collecting, cleaning, and filtering of the\nopen-source Common Crawl dataset. In order to uphold its quality, a neural\nscorer trained on labeled samples is utilized to refine the data, retaining\nhigher-quality instances. Ultimately, this process yields a dataset comprising\n2,000 (Much more is coming soon) parallel samples encompassing design visions\nand UI code. The dataset is available at\nhttps://huggingface.co/datasets/xcodemind/vision2ui.\n","authors":["Yi Gui","Zhen Li","Yao Wan","Yemin Shi","Hongyu Zhang","Yi Su","Shaoling Dong","Xing Zhou","Wenbin Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06365v1","updated":"2024-04-09T15:02:01Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. 
The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Jie Ou","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06362v1","updated":"2024-04-09T14:56:34Z","published":"2024-04-09T14:56:34Z","title":"Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot\n Medical Image Segmentation","summary":" The Segment Anything Model (SAM) and CLIP are remarkable vision foundation\nmodels (VFMs). SAM, a prompt driven segmentation model, excels in segmentation\ntasks across diverse domains, while CLIP is renowned for its zero shot\nrecognition capabilities. However, their unified potential has not yet been\nexplored in medical image segmentation. To adapt SAM to medical imaging,\nexisting methods primarily rely on tuning strategies that require extensive\ndata or prior prompts tailored to the specific task, making it particularly\nchallenging when only a limited number of data samples are available. This work\npresents an in depth exploration of integrating SAM and CLIP into a unified\nframework for medical image segmentation. Specifically, we propose a simple\nunified framework, SaLIP, for organ segmentation. Initially, SAM is used for\npart based segmentation within the image, followed by CLIP to retrieve the mask\ncorresponding to the region of interest (ROI) from the pool of SAM generated\nmasks. Finally, SAM is prompted by the retrieved ROI to segment a specific\norgan. Thus, SaLIP is training and fine tuning free and does not rely on domain\nexpertise or labeled data for prompt engineering. Our method shows substantial\nenhancements in zero shot segmentation, showcasing notable improvements in DICE\nscores across diverse segmentation tasks like brain (63.46%), lung (50.11%),\nand fetal head (30.82%), when compared to un prompted SAM. Code and text\nprompts will be available online.\n","authors":["Sidra Aleem","Fangyijie Wang","Mayug Maniparambil","Eric Arazo","Julia Dietlmeier","Kathleen Curran","Noel E. O'Connor","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2404.06362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06353v1","updated":"2024-04-09T14:44:12Z","published":"2024-04-09T14:44:12Z","title":"High Noise Scheduling is a Must","summary":" Consistency models possess high capabilities for image generation, advancing\nsampling steps to a single step through their advanced techniques. Current\nadvancements move one step forward consistency training techniques and\neliminates the limitation of distillation training. Even though the proposed\ncurriculum and noise scheduling in improved training techniques yield better\nresults than basic consistency models, it lacks well balanced noise\ndistribution and its consistency between curriculum. In this study, it is\ninvestigated the balance between high and low noise levels in noise\ndistribution and offered polynomial noise distribution to maintain the\nstability. This proposed polynomial noise distribution is also supported with a\npredefined Karras noises to prevent unique noise levels arises with Karras\nnoise generation algorithm. Furthermore, by elimination of learned noisy steps\nwith a curriculum based on sinusoidal function increase the performance of the\nmodel in denoising. 
To make a fair comparison with the latest released\nconsistency model training techniques, experiments are conducted with same\nhyper-parameters except curriculum and noise distribution. The models utilized\nduring experiments are determined with low depth to prove the robustness of our\nproposed technique. The results show that the polynomial noise distribution\noutperforms the model trained with log-normal noise distribution, yielding a\n33.54 FID score after 100,000 training steps with constant discretization\nsteps. Additionally, the implementation of a sinusoidal-based curriculum\nenhances denoising performance, resulting in a FID score of 30.48.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Jie Zhang","Ge Wang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06352v1","updated":"2024-04-09T14:43:19Z","published":"2024-04-09T14:43:19Z","title":"DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View\n Segmentation with Occlusion Reasoning","summary":" Semantic segmentation is an effective way to perform scene understanding.\nRecently, segmentation in 3D Bird's Eye View (BEV) space has become popular as\nits directly used by drive policy. However, there is limited work on BEV\nsegmentation for surround-view fisheye cameras, commonly used in commercial\nvehicles. As this task has no real-world public dataset and existing synthetic\ndatasets do not handle amodal regions due to occlusion, we create a synthetic\ndataset using the Cognata simulator comprising diverse road types, weather, and\nlighting conditions. We generalize the BEV segmentation to work with any camera\nmodel; this is useful for mixing diverse cameras. We implement a baseline by\napplying cylindrical rectification on the fisheye images and using a standard\nLSS-based BEV segmentation model. We demonstrate that we can achieve better\nperformance without undistortion, which has the adverse effects of increased\nruntime due to pre-processing, reduced field-of-view, and resampling artifacts.\nFurther, we introduce a distortion-aware learnable BEV pooling strategy that is\nmore effective for the fisheye cameras. We extend the model with an occlusion\nreasoning module, which is critical for estimating in BEV space. Qualitative\nperformance of DaF-BEVSeg is showcased in the video at\nhttps://streamable.com/ge4v51.\n","authors":["Senthil Yogamani","David Unger","Venkatraman Narayanan","Varun Ravi Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.06352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06351v1","updated":"2024-04-09T14:42:31Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. 
Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v1.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.06350v1","updated":"2024-04-09T14:40:54Z","published":"2024-04-09T14:40:54Z","title":"Rolling Shutter Correction with Intermediate Distortion Flow Estimation","summary":" This paper proposes to correct the rolling shutter (RS) distorted images by\nestimating the distortion flow from the global shutter (GS) to RS directly.\nExisting methods usually perform correction using the undistortion flow from\nthe RS to GS. They initially predict the flow from consecutive RS frames,\nsubsequently rescaling it as the displacement fields from the RS frame to the\nunderlying GS image using time-dependent scaling factors. Following this,\nRS-aware forward warping is employed to convert the RS image into its GS\ncounterpart. Nevertheless, this strategy is prone to two shortcomings. First,\nthe undistortion flow estimation is rendered inaccurate by merely linear\nscaling the flow, due to the complex non-linear motion nature. Second, RS-aware\nforward warping often results in unavoidable artifacts. To address these\nlimitations, we introduce a new framework that directly estimates the\ndistortion flow and rectifies the RS image with the backward warping operation.\nMore specifically, we first propose a global correlation-based flow attention\nmechanism to estimate the initial distortion flow and GS feature jointly, which\nare then refined by the following coarse-to-fine decoder layers. Additionally,\na multi-distortion flow prediction strategy is integrated to mitigate the issue\nof inaccurate flow estimation further. Experimental results validate the\neffectiveness of the proposed method, which outperforms state-of-the-art\napproaches on various benchmarks while maintaining high efficiency. The project\nis available at \\url{https://github.com/ljzycmd/DFRSC}.\n","authors":["Mingdeng Cao","Sidi Yang","Yujiu Yang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06350v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.20035v2","updated":"2024-04-09T14:29:10Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. 
And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06337v1","updated":"2024-04-09T14:22:50Z","published":"2024-04-09T14:22:50Z","title":"Matching 2D Images in 3D: Metric Relative Pose from Metric\n Correspondences","summary":" Given two images, we can estimate the relative camera pose between them by\nestablishing image-to-image correspondences. Usually, correspondences are\n2D-to-2D and the pose we estimate is defined only up to scale. Some\napplications, aiming at instant augmented reality anywhere, require\nscale-metric pose estimates, and hence, they rely on external depth estimators\nto recover the scale. We present MicKey, a keypoint matching pipeline that is\nable to predict metric correspondences in 3D camera space. By learning to match\n3D coordinates across images, we are able to infer the metric relative pose\nwithout depth measurements. Depth measurements are also not required for\ntraining, nor are scene reconstructions or image overlap information. MicKey is\nsupervised only by pairs of images and their relative poses. MicKey achieves\nstate-of-the-art performance on the Map-Free Relocalisation benchmark while\nrequiring less supervision than competing approaches.\n","authors":["Axel Barroso-Laguna","Sowmya Munukutla","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.06337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v4","updated":"2024-04-09T14:15:32Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. 
However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v4.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2402.18078v2","updated":"2024-04-09T14:12:02Z","published":"2024-02-28T06:07:07Z","title":"Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis","summary":" Diffusion model is a promising approach to image generation and has been\nemployed for Pose-Guided Person Image Synthesis (PGPIS) with competitive\nperformance. While existing methods simply align the person appearance to the\ntarget pose, they are prone to overfitting due to the lack of a high-level\nsemantic understanding on the source person image. In this paper, we propose a\nnovel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence\nof image-caption pairs and textual prompts, we develop a novel training\nparadigm purely based on images to control the generation process of a\npre-trained text-to-image diffusion model. A perception-refined decoder is\ndesigned to progressively refine a set of learnable queries and extract\nsemantic understanding of person images as a coarse-grained prompt. This allows\nfor the decoupling of fine-grained appearance and pose information controls at\ndifferent stages, and thus circumventing the potential overfitting problem. To\ngenerate more realistic texture details, a hybrid-granularity attention module\nis proposed to encode multi-scale fine-grained appearance features as bias\nterms to augment the coarse-grained prompt. Both quantitative and qualitative\nexperimental results on the DeepFashion benchmark demonstrate the superiority\nof our method over the state of the arts for PGPIS. 
Code is available at\nhttps://github.com/YanzuoLu/CFLD.\n","authors":["Yanzuo Lu","Manlin Zhang","Andy J Ma","Xiaohua Xie","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18078v2.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.01558v2","updated":"2024-04-09T13:59:18Z","published":"2024-01-03T06:18:30Z","title":"One-Step Late Fusion Multi-view Clustering with Compressed Subspace","summary":" Late fusion multi-view clustering (LFMVC) has become a rapidly growing class\nof methods in the multi-view clustering (MVC) field, owing to its excellent\ncomputational speed and clustering performance. One bottleneck faced by\nexisting late fusion methods is that they are usually aligned to the average\nkernel function, which makes the clustering performance highly dependent on the\nquality of datasets. Another problem is that they require subsequent k-means\nclustering after obtaining the consensus partition matrix to get the final\ndiscrete labels, and the resulting separation of the label learning and cluster\nstructure optimization processes limits the integrity of these models. To\naddress the above issues, we propose an integrated framework named One-Step\nLate Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS).\nSpecifically, we use the consensus subspace to align the partition matrix while\noptimizing the partition fusion, and utilize the fused partition matrix to\nguide the learning of discrete labels. A six-step iterative optimization\napproach with verified convergence is proposed. Sufficient experiments on\nmultiple datasets validate the effectiveness and efficiency of our proposed\nmethod.\n","authors":["Qiyuan Ou","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.01558v2.pdf","comment":"Accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2403.17881v2","updated":"2024-04-09T13:56:06Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as foreign detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. 
We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05334v3","updated":"2024-04-09T13:54:48Z","published":"2023-09-11T09:32:45Z","title":"MultIOD: Rehearsal-free Multihead Incremental Object Detector","summary":" Class-Incremental learning (CIL) refers to the ability of artificial agents\nto integrate new classes as they appear in a stream. It is particularly\ninteresting in evolving environments where agents have limited access to memory\nand computational resources. The main challenge of incremental learning is\ncatastrophic forgetting, the inability of neural networks to retain past\nknowledge when learning a new one. Unfortunately, most existing\nclass-incremental methods for object detection are applied to two-stage\nalgorithms such as Faster-RCNN, and rely on rehearsal memory to retain past\nknowledge. We argue that those are not suitable in resource-limited\nenvironments, and more effort should be dedicated to anchor-free and\nrehearsal-free object detection. In this paper, we propose MultIOD, a\nclass-incremental object detector based on CenterNet. Our contributions are:\n(1) we propose a multihead feature pyramid and multihead detection architecture\nto efficiently separate class representations, (2) we employ transfer learning\nbetween classes learned initially and those learned incrementally to tackle\ncatastrophic forgetting, and (3) we use a class-wise non-max-suppression as a\npost-processing technique to remove redundant boxes. Results show that our\nmethod outperforms state-of-the-art methods on two Pascal VOC datasets, while\nonly saving the model in its current state, contrary to other\ndistillation-based counterparts.\n","authors":["Eden Belouadah","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2309.05334v3.pdf","comment":"Accepted at the archival track of the Workshop on Continual Learning\n in Computer Vision (CVPR 2024)"},{"id":"http://arxiv.org/abs/2401.17053v3","updated":"2024-04-09T13:47:18Z","published":"2024-01-30T14:34:19Z","title":"BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane\n Extrapolation","summary":" We present BlockFusion, a diffusion-based model that generates 3D scenes as\nunit blocks and seamlessly incorporates new blocks to extend the scene.\nBlockFusion is trained using datasets of 3D blocks that are randomly cropped\nfrom complete 3D scene meshes. Through per-block fitting, all training blocks\nare converted into the hybrid neural fields: with a tri-plane containing the\ngeometry features, followed by a Multi-layer Perceptron (MLP) for decoding the\nsigned distance values. A variational auto-encoder is employed to compress the\ntri-planes into the latent tri-plane space, on which the denoising diffusion\nprocess is performed. Diffusion applied to the latent representations allows\nfor high-quality and diverse 3D scene generation. To expand a scene during\ngeneration, one needs only to append empty blocks to overlap with the current\nscene and extrapolate existing latent tri-planes to populate new blocks. The\nextrapolation is done by conditioning the generation process with the feature\nsamples from the overlapping tri-planes during the denoising iterations. 
Latent\ntri-plane extrapolation produces semantically and geometrically meaningful\ntransitions that harmoniously blend with the existing scene. A 2D layout\nconditioning mechanism is used to control the placement and arrangement of\nscene elements. Experimental results indicate that BlockFusion is capable of\ngenerating diverse, geometrically consistent and unbounded large 3D scenes with\nunprecedented high-quality shapes in both indoor and outdoor scenarios.\n","authors":["Zhennan Wu","Yang Li","Han Yan","Taizhang Shang","Weixuan Sun","Senbo Wang","Ruikai Cui","Weizhe Liu","Hiroyuki Sato","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2401.17053v3.pdf","comment":"Video: https://www.youtube.com/watch?v=PxIBtd6G0mA"},{"id":"http://arxiv.org/abs/2403.03309v4","updated":"2024-04-09T13:44:54Z","published":"2024-03-05T20:21:49Z","title":"Learning Zero-Shot Material States Segmentation, by Implanting Natural\n Image Patterns in Synthetic Data","summary":" Visual understanding and segmentation of materials and their states is\nfundamental to understanding the physical world. The myriad textures, shapes,\nand often blurry boundaries formed by materials make this task particularly\nhard to generalize. Whether it's identifying wet regions of a surface, minerals\nin rocks, infected regions in plants, or pollution in water, each material\nstate has its own unique form. For neural nets to learn general class-agnostic\nmaterial segmentation, it is necessary to first collect and annotate data that\ncaptures this complexity. Collecting and manually annotating real-world images\nis limited by the cost and precision of manual labor. In contrast, synthetic\nCGI data is highly accurate and almost cost-free, but fails to replicate the\nvast diversity of the material world. This work offers a method to bridge this\ncrucial gap by implanting patterns extracted from real-world images in\nsynthetic data. Hence, patterns automatically collected from natural images are\nused to map materials into synthetic scenes. This unsupervised approach allows\nthe generated data to capture the vast complexity of the real world while\nmaintaining the precision and scale of synthetic data. We also present the\nfirst general benchmark for zero-shot material state segmentation. The\nbenchmark contains a wide range of real-world images of material states, like\nfood, rocks, construction, plants, liquids, and many others, each in various\nstates (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The\nannotation includes both partial similarity between regions with similar but\nnot identical materials, and hard segmentation of only points in the exact same\nmaterial state. We show that net trains on MatSeg significantly outperform\nexisting state-of-the-art methods on this task. The dataset, code, and trained\nmodel are available\n","authors":["Sagi Eppel","Jolina Li","Manuel Drehwald","Alan Aspuru-Guzik"],"pdf_url":"https://arxiv.org/pdf/2403.03309v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18171v5","updated":"2024-04-09T13:42:07Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. 
However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v5.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.06309v1","updated":"2024-04-09T13:39:37Z","published":"2024-04-09T13:39:37Z","title":"Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large\n Multi-Modal Models","summary":" Audio-visual zero-shot learning methods commonly build on features extracted\nfrom pre-trained models, e.g. video or audio classification models. However,\nexisting benchmarks predate the popularization of large multi-modal models,\nsuch as CLIP and CLAP. In this work, we explore such large pre-trained models\nto obtain features, i.e. CLIP for visual features, and CLAP for audio features.\nFurthermore, the CLIP and CLAP text encoders provide class label embeddings\nwhich are combined to boost the performance of the system. We propose a simple\nyet effective model that only relies on feed-forward neural networks,\nexploiting the strong generalization capabilities of the new audio, visual and\ntextual features. Our framework achieves state-of-the-art performance on\nVGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and\ndata available at: https://github.com/dkurzend/ClipClap-GZSL.\n","authors":["David Kurzendörfer","Otniel-Bogdan Mercea","A. Sophia Koepke","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2404.06309v1.pdf","comment":"CVPRw 2024 (L3D-IVU)"},{"id":"http://arxiv.org/abs/2309.14265v2","updated":"2024-04-09T13:33:30Z","published":"2023-09-25T16:23:49Z","title":"Industrial Application of 6D Pose Estimation for Robotic Manipulation in\n Automotive Internal Logistics","summary":" Despite the advances in robotics a large proportion of the of parts handling\ntasks in the automotive industry's internal logistics are not automated but\nstill performed by humans. A key component to competitively automate these\nprocesses is a 6D pose estimation that can handle a large number of different\nparts, is adaptable to new parts with little manual effort, and is sufficiently\naccurate and robust with respect to industry requirements. In this context, the\nquestion arises as to the current status quo with respect to these measures. 
To\naddress this we built a representative 6D pose estimation pipeline with\nstate-of-the-art components from economically scalable real to synthetic data\ngeneration to pose estimators and evaluated it on automotive parts with regards\nto a realistic sequencing process. We found that using the data generation\napproaches, the performance of the trained 6D pose estimators is promising,\nbut does not meet industry requirements. We reveal that the reason for this is\nthe inability of the estimators to provide reliable uncertainties for their\nposes, rather than their ability to provide sufficiently accurate poses. In\nthis context we further analyzed how RGB- and RGB-D-based approaches compare\nagainst this background and show that they are differently vulnerable to the\ndomain gap induced by synthetic data.\n","authors":["Philipp Quentin","Dino Knoll","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2309.14265v2.pdf","comment":"Accepted for publication at IEEE International Conference on\n Automation Science and Engineering (CASE 2023)"},{"id":"http://arxiv.org/abs/2212.04227v2","updated":"2024-04-09T13:30:15Z","published":"2022-12-08T12:20:35Z","title":"Self-training via Metric Learning for Source-Free Domain Adaptation of\n Semantic Segmentation","summary":" Unsupervised source-free domain adaptation methods aim to train a model for\nthe target domain utilizing a pretrained source-domain model and unlabeled\ntarget-domain data, particularly when accessibility to source data is\nrestricted due to intellectual property or privacy concerns. Traditional\nmethods usually use self-training with pseudo-labeling, which is often\nsubjected to thresholding based on prediction confidence. However, such\nthresholding limits the effectiveness of self-training due to insufficient\nsupervision. This issue becomes more severe in a source-free setting, where\nsupervision comes solely from the predictions of the pre-trained source model.\nIn this study, we propose a novel approach by incorporating a mean-teacher\nmodel, wherein the student network is trained using all predictions from the\nteacher network. Instead of employing thresholding on predictions, we introduce\na method to weight the gradients calculated from pseudo-labels based on the\nreliability of the teacher's predictions. To assess reliability, we introduce a\nnovel approach using proxy-based metric learning. Our method is evaluated in\nsynthetic-to-real and cross-city scenarios, demonstrating superior performance\ncompared to existing state-of-the-art methods.\n","authors":["Ibrahim Batuhan Akkaya","Ugur Halici"],"pdf_url":"https://arxiv.org/pdf/2212.04227v2.pdf","comment":"This paper is under consideration at Computer Vision and Image\n Understanding"},{"id":"http://arxiv.org/abs/2404.06294v1","updated":"2024-04-09T13:19:43Z","published":"2024-04-09T13:19:43Z","title":"Fortifying Fully Convolutional Generative Adversarial Networks for Image\n Super-Resolution Using Divergence Measures","summary":" Super-Resolution (SR) is a time-hallowed image processing problem that aims\nto improve the quality of a Low-Resolution (LR) sample up to the standard of\nits High-Resolution (HR) counterpart. We aim to address this by introducing\nSuper-Resolution Generator (SuRGe), a fully-convolutional Generative\nAdversarial Network (GAN)-based architecture for SR. 
We show that distinct\nconvolutional features obtained at increasing depths of a GAN generator can be\noptimally combined by a set of learnable convex weights to improve the quality\nof generated SR samples. In the process, we employ the Jensen-Shannon and the\nGromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of\ndistributions to further aid the generator of SuRGe to better exploit the\navailable information in an attempt to improve SR. Moreover, we train the\ndiscriminator of SuRGe with the Wasserstein loss with gradient penalty, to\nprimarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN\nworkflow tailor-made for super-resolution, offers improved performance while\nmaintaining low inference time. The efficacy of SuRGe is substantiated by its\nsuperior performance compared to 18 state-of-the-art contenders on 10 benchmark\ndatasets.\n","authors":["Arkaprabha Basu","Kushal Bose","Sankha Subhra Mullick","Anish Chakrabarty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2404.06294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02730v3","updated":"2024-04-09T13:18:22Z","published":"2023-07-06T02:30:56Z","title":"Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of\n Figure Skating","summary":" The fine-grained action analysis of the existing action datasets is\nchallenged by insufficient action categories, low fine granularities, limited\nmodalities, and tasks. In this paper, we propose a Multi-modality and\nMulti-task dataset of Figure Skating (MMFS) which was collected from the World\nFigure Skating Championships. MMFS, which possesses action recognition and\naction quality assessment, captures RGB, skeleton, and is collected the score\nof actions from 11671 clips with 256 categories including spatial and temporal\nlabels. The key contributions of our dataset fall into three aspects as\nfollows. (1) Independently spatial and temporal categories are first proposed\nto further explore fine-grained action recognition and quality assessment. (2)\nMMFS first introduces the skeleton modality for complex fine-grained action\nquality assessment. (3) Our multi-modality and multi-task dataset encourage\nmore action analysis models. To benchmark our dataset, we adopt RGB-based and\nskeleton-based baseline methods for action recognition and action quality\nassessment.\n","authors":["Sheng-Lan Liu","Yu-Ning Ding","Gang Yan","Si-Fan Zhang","Jin-Rong Zhang","Wen-Yue Chen","Xue-Hai Xu"],"pdf_url":"https://arxiv.org/pdf/2307.02730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06287v1","updated":"2024-04-09T13:13:24Z","published":"2024-04-09T13:13:24Z","title":"Counterfactual Reasoning for Multi-Label Image Classification via\n Patching-Based Training","summary":" The key to multi-label image classification (MLC) is to improve model\nperformance by leveraging label correlations. Unfortunately, it has been shown\nthat overemphasizing co-occurrence relationships can cause the overfitting\nissue of the model, ultimately leading to performance degradation. In this\npaper, we provide a causal inference framework to show that the correlative\nfeatures caused by the target object and its co-occurring objects can be\nregarded as a mediator, which has both positive and negative impacts on model\npredictions. 
On the positive side, the mediator enhances the recognition\nperformance of the model by capturing co-occurrence relationships; on the\nnegative side, it has the harmful causal effect that causes the model to make\nan incorrect prediction for the target object, even when only co-occurring\nobjects are present in an image. To address this problem, we propose a\ncounterfactual reasoning method to measure the total direct effect, achieved by\nenhancing the direct effect caused only by the target object. Due to the\nunknown location of the target object, we propose patching-based training and\ninference to accomplish this goal, which divides an image into multiple patches\nand identifies the pivot patch that contains the target object. Experimental\nresults on multiple benchmark datasets with diverse configurations validate\nthat the proposed method can achieve state-of-the-art performance.\n","authors":["Ming-Kun Xie","Jia-Hao Xiao","Pei Peng","Gang Niu","Masashi Sugiyama","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06279v1","updated":"2024-04-09T13:02:33Z","published":"2024-04-09T13:02:33Z","title":"NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural\n Cellular Automata","summary":" Neural Cellular Automata (NCA) is a class of Cellular Automata where the\nupdate rule is parameterized by a neural network that can be trained using\ngradient descent. In this paper, we focus on NCA models used for texture\nsynthesis, where the update rule is inspired by partial differential equations\n(PDEs) describing reaction-diffusion systems. To train the NCA model, the\nspatio-temporal domain is discretized, and Euler integration is used to\nnumerically simulate the PDE. However, whether a trained NCA truly learns the\ncontinuous dynamic described by the corresponding PDE or merely overfits the\ndiscretization used in training remains an open question. We study NCA models\nat the limit where space-time discretization approaches continuity. We find\nthat existing NCA models tend to overfit the training discretization,\nespecially in the proximity of the initial condition, also called \"seed\". To\naddress this, we propose a solution that utilizes uniform noise as the initial\ncondition. We demonstrate the effectiveness of our approach in preserving the\nconsistency of NCA dynamics across a wide range of spatio-temporal\ngranularities. Our improved NCA model enables two new test-time interactions by\nallowing continuous control over the speed of pattern formation and the scale\nof the synthesized patterns. We demonstrate this new NCA feature in our\ninteractive online demo. 
Our work reveals that NCA models can learn continuous\ndynamics and opens new venues for NCA research from a dynamical systems'\nperspective.\n","authors":["Ehsan Pajouheshgar","Yitao Xu","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06279v1.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.06277v1","updated":"2024-04-09T13:01:26Z","published":"2024-04-09T13:01:26Z","title":"Learning Embeddings with Centroid Triplet Loss for Object Identification\n in Robotic Grasping","summary":" Foundation models are a strong trend in deep learning and computer vision.\nThese models serve as a base for applications as they require minor or no\nfurther fine-tuning by developers to integrate into their applications.\nFoundation models for zero-shot object segmentation such as Segment Anything\n(SAM) output segmentation masks from images without any further object\ninformation. When they are followed in a pipeline by an object identification\nmodel, they can perform object detection without training. Here, we focus on\ntraining such an object identification model. A crucial practical aspect for an\nobject identification model is to be flexible in input size. As object\nidentification is an image retrieval problem, a suitable method should handle\nmulti-query multi-gallery situations without constraining the number of input\nimages (e.g. by having fixed-size aggregation layers). The key solution to\ntrain such a model is the centroid triplet loss (CTL), which aggregates image\nfeatures to their centroids. CTL yields high accuracy, avoids misleading\ntraining signals and keeps the model input size flexible. In our experiments,\nwe establish a new state of the art on the ArmBench object identification task,\nwhich shows general applicability of our model. We furthermore demonstrate an\nintegrated unseen object detection pipeline on the challenging HOPE dataset,\nwhich requires fine-grained detection. There, our pipeline matches and\nsurpasses related methods which have been trained on dataset-specific data.\n","authors":["Anas Gouda","Max Schwarz","Christopher Reining","Sven Behnke","Alice Kirchheim"],"pdf_url":"https://arxiv.org/pdf/2404.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v2","updated":"2024-04-09T12:50:16Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. 
The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06273v1","updated":"2024-04-09T12:48:24Z","published":"2024-04-09T12:48:24Z","title":"Robust Confidence Intervals in Stereo Matching using Possibility Theory","summary":" We propose a method for estimating disparity confidence intervals in stereo\nmatching problems. Confidence intervals provide complementary information to\nusual confidence measures. To the best of our knowledge, this is the first\nmethod creating disparity confidence intervals based on the cost volume. This\nmethod relies on possibility distributions to interpret the epistemic\nuncertainty of the cost volume. Our method has the benefit of having a\nwhite-box nature, differing in this respect from current state-of-the-art deep\nneural networks approaches. The accuracy and size of confidence intervals are\nvalidated using the Middlebury stereo datasets as well as a dataset of\nsatellite images. This contribution is freely available on GitHub.\n","authors":["Roman Malinowski","Emmanuelle Sarrazin","Loïc Dumas","Emmanuel Dubois","Sébastien Destercke"],"pdf_url":"https://arxiv.org/pdf/2404.06273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06270v1","updated":"2024-04-09T12:47:30Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussian, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. 
Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.06265v1","updated":"2024-04-09T12:44:34Z","published":"2024-04-09T12:44:34Z","title":"Spatial-Temporal Multi-level Association for Video Object Segmentation","summary":" Existing semi-supervised video object segmentation methods either focus on\ntemporal feature matching or spatial-temporal feature modeling. However, they\ndo not address the issues of sufficient target interaction and efficient\nparallel processing simultaneously, thereby constraining the learning of\ndynamic, target-aware features. To tackle these limitations, this paper\nproposes a spatial-temporal multi-level association framework, which jointly\nassociates reference frame, test frame, and object features to achieve\nsufficient interaction and parallel target ID association with a\nspatial-temporal memory bank for efficient video object segmentation.\nSpecifically, we construct a spatial-temporal multi-level feature association\nmodule to learn better target-aware features, which formulates feature\nextraction and interaction as the efficient operations of object\nself-attention, reference object enhancement, and test reference correlation.\nIn addition, we propose a spatial-temporal memory to assist feature association\nand temporal ID assignment and correlation. We evaluate the proposed method by\nconducting extensive experiments on numerous video object segmentation\ndatasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS\n2018/2019 val. The favorable performance against the state-of-the-art methods\ndemonstrates the effectiveness of our approach. All source code and trained\nmodels will be made publicly available.\n","authors":["Deshui Miao","Xin Li","Zhenyu He","Huchuan Lu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07166v2","updated":"2024-04-09T12:40:18Z","published":"2023-10-11T03:29:13Z","title":"Anchor-based Multi-view Subspace Clustering with Hierarchical Feature\n Descent","summary":" Multi-view clustering has attracted growing attention owing to its\ncapabilities of aggregating information from various sources and its promising\nhorizons in public affairs. Up till now, many advanced approaches have been\nproposed in recent literature. However, there are several ongoing difficulties\nto be tackled. One common dilemma occurs while attempting to align the features\nof different views. {Moreover, due to the fact that many existing multi-view\nclustering algorithms stem from spectral clustering, this results to cubic time\ncomplexity w.r.t. the number of dataset. However, we propose Anchor-based\nMulti-view Subspace Clustering with Hierarchical Feature Descent(MVSC-HFD) to\ntackle the discrepancy among views through hierarchical feature descent and\nproject to a common subspace( STAGE 1), which reveals dependency of different\nviews. 
We further reduce the computational complexity to linear time cost\nthrough a unified sampling strategy in the common subspace( STAGE 2), followed\nby anchor-based subspace clustering to learn the bipartite graph collectively(\nSTAGE 3). }Extensive experimental results on public benchmark datasets\ndemonstrate that our proposed model consistently outperforms the\nstate-of-the-art techniques.\n","authors":["Qiyuan Ou","Siwei Wang","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06261v1","updated":"2024-04-09T12:34:28Z","published":"2024-04-09T12:34:28Z","title":"Playing to Vision Foundation Model's Strengths in Stereo Matching","summary":" Stereo matching has become a key technique for 3D environment perception in\nintelligent vehicles. For a considerable time, convolutional neural networks\n(CNNs) have remained the mainstream choice for feature extraction in this\ndomain. Nonetheless, there is a growing consensus that the existing paradigm\nshould evolve towards vision foundation models (VFM), particularly those\ndeveloped based on vision Transformers (ViTs) and pre-trained through\nself-supervision on extensive, unlabeled datasets. While VFMs are adept at\nextracting informative, general-purpose visual features, specifically for dense\nprediction tasks, their performance often lacks in geometric vision tasks. This\nstudy serves as the first exploration of a viable approach for adapting VFMs to\nstereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon\nthree types of modules: spatial differentiation, patch attention fusion, and\ncross-attention. The first module initializes feature pyramids, while the\nlatter two aggregate stereo and multi-scale contextual information into\nfine-grained features, respectively. ViTAStereo, which combines ViTAS with cost\nvolume-based stereo matching back-end processes, achieves the top rank on the\nKITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by\napproximately 7.9% in terms of the percentage of error pixels, with a tolerance\nof 3 pixels. Additional experiments across diverse scenarios further\ndemonstrate its superior generalizability compared to all other\nstate-of-the-art approaches. We believe this new paradigm will pave the way for\nthe next generation of stereo matching networks.\n","authors":["Chuang-Wei Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06258v1","updated":"2024-04-09T12:32:10Z","published":"2024-04-09T12:32:10Z","title":"Robust feature knowledge distillation for enhanced performance of\n lightweight crack segmentation models","summary":" Vision-based crack detection faces deployment challenges due to the size of\nrobust models and edge device limitations. These can be addressed with\nlightweight models trained with knowledge distillation (KD). However,\nstate-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper\ndevelops Robust Feature Knowledge Distillation (RFKD), a framework to improve\nrobustness while retaining the precision of light models for crack\nsegmentation. RFKD distils knowledge from a teacher model's logit layers and\nintermediate feature maps while leveraging mixed clean and noisy images to\ntransfer robust patterns to the student model, improving its precision,\ngeneralisation, and anti-noise performance. 
To validate the proposed RFKD, a\nlightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M\nparameters, is also designed and used as the student to run the framework. The\nresults show a significant enhancement in noisy images, with RFKD reaching a\n62% enhanced mean Dice score (mDS) compared to SOTA KD methods.\n","authors":["Zhaohui Chen","Elyas Asadi Shamsabadi","Sheng Jiang","Luming Shen","Daniel Dias-da-Costa"],"pdf_url":"https://arxiv.org/pdf/2404.06258v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.06256v1","updated":"2024-04-09T12:29:16Z","published":"2024-04-09T12:29:16Z","title":"Label-Efficient 3D Object Detection For Road-Side Units","summary":" Occlusion presents a significant challenge for safety-critical applications\nsuch as autonomous driving. Collaborative perception has recently attracted a\nlarge research interest thanks to the ability to enhance the perception of\nautonomous vehicles via deep information fusion with intelligent roadside units\n(RSU), thus minimizing the impact of occlusion. While significant advancement\nhas been made, the data-hungry nature of these methods creates a major hurdle\nfor their real-world deployment, particularly due to the need for annotated RSU\ndata. Manually annotating the vast amount of RSU data required for training is\nprohibitively expensive, given the sheer number of intersections and the effort\ninvolved in annotating point clouds. We address this challenge by devising a\nlabel-efficient object detection method for RSU based on unsupervised object\ndiscovery. Our paper introduces two new modules: one for object discovery based\non a spatial-temporal aggregation of point clouds, and another for refinement.\nFurthermore, we demonstrate that fine-tuning on a small portion of annotated\ndata allows our object discovery models to narrow the performance gap with, or\neven surpass, fully supervised models. Extensive experiments are carried out in\nsimulated and real-world datasets to evaluate our method.\n","authors":["Minh-Quan Dao","Holger Caesar","Julie Stephany Berrio","Mao Shan","Stewart Worrall","Vincent Frémont","Ezio Malis"],"pdf_url":"https://arxiv.org/pdf/2404.06256v1.pdf","comment":"IV 2024"},{"id":"http://arxiv.org/abs/2404.06253v1","updated":"2024-04-09T12:25:06Z","published":"2024-04-09T12:25:06Z","title":"From Barlow Twins to Triplet Training: Differentiating Dementia with\n Limited Data","summary":" Differential diagnosis of dementia is challenging due to overlapping\nsymptoms, with structural magnetic resonance imaging (MRI) being the primary\nmethod for diagnosis. Despite the clinical value of computer-aided differential\ndiagnosis, research has been limited, mainly due to the absence of public\ndatasets that contain diverse types of dementia. This leaves researchers with\nsmall in-house datasets that are insufficient for training deep neural networks\n(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI\nscans in training, but small batch sizes for volumetric brain scans make its\napplication challenging. To address these issues, we propose Triplet Training\nfor differential diagnosis with limited target data. It consists of three key\nstages: (i) self-supervised pre-training on unlabeled data with Barlow Twins,\n(ii) self-distillation on task-related data, and (iii) fine-tuning on the\ntarget dataset. Our approach significantly outperforms traditional training\nstrategies, achieving a balanced accuracy of 75.6%. 
We further provide insights\ninto the training process by visualizing changes in the latent space after each\nstep. Finally, we validate the robustness of Triplet Training in terms of its\nindividual components in a comprehensive ablation study. Our code is available\nat https://github.com/ai-med/TripletTraining.\n","authors":["Yitong Li","Tom Nuno Wolf","Sebastian Pölsterl","Igor Yakushev","Dennis M. Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2404.06253v1.pdf","comment":"Accepted for presentation at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.06251v1","updated":"2024-04-09T12:23:30Z","published":"2024-04-09T12:23:30Z","title":"ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation\n Network for Video Colorization","summary":" How to effectively explore spatial-temporal features is important for video\ncolorization. Instead of stacking multiple frames along the temporal dimension\nor recurrently propagating estimated features that will accumulate errors or\ncannot explore information from far-apart frames, we develop a memory-based\nfeature propagation module that can establish reliable connections with\nfeatures from far-apart frames and alleviate the influence of inaccurately\nestimated features. To extract better features from each frame for the\nabove-mentioned feature propagation, we explore the features from\nlarge-pretrained visual models to guide the feature estimation of each frame so\nthat the estimated features can model complex scenarios. In addition, we note\nthat adjacent frames usually contain similar contents. To explore this property\nfor better spatial and temporal feature utilization, we develop a local\nattention module to aggregate the features from adjacent frames in a\nspatial-temporal neighborhood. We formulate our memory-based feature\npropagation module, large-pretrained visual model guided feature estimation\nmodule, and local attention module into an end-to-end trainable network (named\nColorMNet) and show that it performs favorably against state-of-the-art methods\non both the benchmark datasets and real-world scenarios. The source code and\npre-trained models will be available at\n\\url{https://github.com/yyang181/colormnet}.\n","authors":["Yixin Yang","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.06251v1.pdf","comment":"Project website: \\url{https://github.com/yyang181/colormnet}"},{"id":"http://arxiv.org/abs/2404.06247v1","updated":"2024-04-09T12:13:40Z","published":"2024-04-09T12:13:40Z","title":"LRR: Language-Driven Resamplable Continuous Representation against\n Adversarial Tracking Attacks","summary":" Visual object tracking plays a critical role in visual-based autonomous\nsystems, as it aims to estimate the position and size of the object of interest\nwithin a live video. Despite significant progress made in this field,\nstate-of-the-art (SOTA) trackers often fail when faced with adversarial\nperturbations in the incoming frames. This can lead to significant robustness\nand security issues when these trackers are deployed in the real world. To\nachieve high accuracy on both clean and adversarial data, we propose building a\nspatial-temporal continuous representation using the semantic text guidance of\nthe object of interest. This novel continuous representation enables us to\nreconstruct incoming frames to maintain semantic and appearance consistency\nwith the object of interest and its clean counterparts. 
As a result, our\nproposed method successfully defends against different SOTA adversarial\ntracking attacks while maintaining high accuracy on clean data. In particular,\nour method significantly increases tracking accuracy under adversarial attacks\nwith around 90% relative improvement on UAV123, which is even higher than the\naccuracy on clean data.\n","authors":["Jianlang Chen","Xuhong Ren","Qing Guo","Felix Juefei-Xu","Di Lin","Wei Feng","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06246v1","updated":"2024-04-09T12:11:25Z","published":"2024-04-09T12:11:25Z","title":"GHNeRF: Learning Generalizable Human Features with Efficient Neural\n Radiance Fields","summary":" Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising\nresults in 3D scene representations, including 3D human representations.\nHowever, these representations often lack crucial information on the underlying\nhuman pose and structure, which is crucial for AR/VR applications and games. In\nthis paper, we introduce a novel approach, termed GHNeRF, designed to address\nthese limitations by learning 2D/3D joint locations of human subjects with NeRF\nrepresentation. GHNeRF uses a pre-trained 2D encoder streamlined to extract\nessential human features from 2D images, which are then incorporated into the\nNeRF framework in order to encode human biomechanic features. This allows our\nnetwork to simultaneously learn biomechanic features, such as joint locations,\nalong with human geometry and texture. To assess the effectiveness of our\nmethod, we conduct a comprehensive comparison with state-of-the-art human NeRF\ntechniques and joint estimation algorithms. Our results show that GHNeRF can\nachieve state-of-the-art results in near real-time.\n","authors":["Arnab Dey","Di Yang","Rohith Agaram","Antitza Dantcheva","Andrew I. Comport","Srinath Sridhar","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06244v1","updated":"2024-04-09T12:10:54Z","published":"2024-04-09T12:10:54Z","title":"Anchor-based Robust Finetuning of Vision-Language Models","summary":" We aim at finetuning a vision-language model without hurting its\nout-of-distribution (OOD) generalization. We address two types of OOD\ngeneralization, i.e., i) domain shift such as natural to sketch images, and ii)\nzero-shot capability to recognize the category that was not contained in the\nfinetune data. Arguably, the diminished OOD generalization after finetuning\nstems from the excessively simplified finetuning target, which only provides\nthe class information, such as ``a photo of a [CLASS]''. This is distinct from\nthe process in that CLIP was pretrained, where there is abundant text\nsupervision with rich semantic information. Therefore, we propose to compensate\nfor the finetune process using auxiliary supervision with rich semantic\ninformation, which acts as anchors to preserve the OOD generalization.\nSpecifically, two types of anchors are elaborated in our method, including i)\ntext-compensated anchor which uses the images from the finetune set but\nenriches the text supervision from a pretrained captioner, ii) image-text-pair\nanchor which is retrieved from the dataset similar to pretraining data of CLIP\naccording to the downstream task, associating with the original CLIP text with\nrich semantics. 
Those anchors are utilized as auxiliary semantic information to\nmaintain the original feature space of CLIP, thereby preserving the OOD\ngeneralization capabilities. Comprehensive experiments demonstrate that our\nmethod achieves in-distribution performance akin to conventional finetuning\nwhile attaining new state-of-the-art results on domain shift and zero-shot\nlearning benchmarks.\n","authors":["Jinwei Han","Zhiwen Lin","Zhongyisun Sun","Yingguo Gao","Ke Yan","Shouhong Ding","Yuan Gao","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2404.06244v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.06243v1","updated":"2024-04-09T12:09:56Z","published":"2024-04-09T12:09:56Z","title":"ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised\n Action Recognition in Videos","summary":" Human action or activity recognition in videos is a fundamental task in\ncomputer vision with applications in surveillance and monitoring, self-driving\ncars, sports analytics, human-robot interaction and many more. Traditional\nsupervised methods require large annotated datasets for training, which are\nexpensive and time-consuming to acquire. This work proposes a novel approach\nusing Cross-Architecture Pseudo-Labeling with contrastive learning for\nsemi-supervised action recognition. Our framework leverages both labeled and\nunlabelled data to robustly learn action representations in videos, combining\npseudo-labeling with contrastive learning for effective learning from both\ntypes of samples. We introduce a novel cross-architecture approach where 3D\nConvolutional Neural Networks (3D CNNs) and video transformers (VIT) are\nutilised to capture different aspects of action representations; hence we call\nit ActNetFormer. The 3D CNNs excel at capturing spatial features and local\ndependencies in the temporal domain, while VIT excels at capturing long-range\ndependencies across frames. By integrating these complementary architectures\nwithin the ActNetFormer framework, our approach can effectively capture both\nlocal and global contextual information of an action. This comprehensive\nrepresentation learning enables the model to achieve better performance in\nsemi-supervised action recognition tasks by leveraging the strengths of each of\nthese architectures. Experimental results on standard action recognition\ndatasets demonstrate that our approach performs better than the existing\nmethods, achieving state-of-the-art performance with only a fraction of labeled\ndata. The official website of this work is available at:\nhttps://github.com/rana2149/ActNetFormer.\n","authors":["Sharana Dharshikgan Suresh Dass","Hrishav Bakul Barua","Ganesh Krishnasamy","Raveendran Paramesran","Raphael C. -W. Phan"],"pdf_url":"https://arxiv.org/pdf/2404.06243v1.pdf","comment":"Submitted for peer review"},{"id":"http://arxiv.org/abs/2404.06240v1","updated":"2024-04-09T12:06:21Z","published":"2024-04-09T12:06:21Z","title":"Hyperparameter-Free Medical Image Synthesis for Sharing Data and\n Improving Site-Specific Segmentation","summary":" Sharing synthetic medical images is a promising alternative to sharing real\nimages that can improve patient privacy and data security. To get good results,\nexisting methods for medical image synthesis must be manually adjusted when\nthey are applied to unseen data. To remove this manual burden, we introduce a\nHyperparameter-Free distributed learning method for automatic medical image\nSynthesis, Sharing, and Segmentation called HyFree-S3. 
For three diverse\nsegmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of\nHyFree-S3 results in improved performance over training only with site-specific\ndata (in the majority of cases). The hyperparameter-free nature of the method\nshould make data synthesis and sharing easier, potentially leading to an\nincrease in the quantity of available data and consequently the quality of the\nmodels trained that may ultimately be applied in the clinic. Our code is\navailable at https://github.com/AwesomeLemon/HyFree-S3\n","authors":["Alexander Chebykin","Peter A. N. Bosman","Tanja Alderliesten"],"pdf_url":"https://arxiv.org/pdf/2404.06240v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2311.18649v3","updated":"2024-04-09T11:55:20Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge time and\neffort. This limitation severely constrains the potential of semantics in\nFew-Shot Learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on six benchmarks, demonstrating a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks. Code is available at\nhttps://github.com/zhangdoudou123/SemFew.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10974v3","updated":"2024-04-09T11:23:10Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. 
Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v3.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.06219v1","updated":"2024-04-09T11:13:36Z","published":"2024-04-09T11:13:36Z","title":"Automatic Defect Detection in Sewer Network Using Deep Learning Based\n Object Detector","summary":" Maintaining sewer systems in large cities is important, but also time and\neffort consuming, because visual inspections are currently done manually. To\nreduce the amount of aforementioned manual work, defects within sewer pipes\nshould be located and classified automatically. In the past, multiple works\nhave attempted solving this problem using classical image processing, machine\nlearning, or a combination of those. However, each provided solution only focus\non detecting a limited set of defect/structure types, such as fissure, root,\nand/or connection. Furthermore, due to the use of hand-crafted features and\nsmall training datasets, generalization is also problematic. In order to\novercome these deficits, a sizable dataset with 14.7 km of various sewer pipes\nwere annotated by sewer maintenance experts in the scope of this work. On top\nof that, an object detector (EfficientDet-D0) was trained for automatic defect\ndetection. From the result of several expermients, peculiar natures of defects\nin the context of object detection, which greatly effect annotation and\ntraining process, are found and discussed. At the end, the final detector was\nable to detect 83% of defects in the test set; out of the missing 17%, only\n0.77% are very severe defects. This work provides an example of applying deep\nlearning-based object detection into an important but quiet engineering field.\nIt also gives some practical pointers on how to annotate peculiar \"object\",\nsuch as defects.\n","authors":["Bach Ha","Birgit Schalter","Laura White","Joachim Koehler"],"pdf_url":"https://arxiv.org/pdf/2404.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06212v1","updated":"2024-04-09T11:00:19Z","published":"2024-04-09T11:00:19Z","title":"OmniFusion Technical Report","summary":" Last year, multimodal architectures served up a revolution in AI-based\napproaches and solutions, extending the capabilities of large language models\n(LLM). We propose an \\textit{OmniFusion} model based on a pretrained LLM and\nadapters for visual modality. 
We evaluated and compared several architecture\ndesign principles for better text and visual data coupling: MLP and transformer\nadapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their\nfusing approach, image encoding method (whole image or tiles encoding) and two\n7B LLMs (the proprietary one and open-source Mistral). Experiments on 8\nvisual-language benchmarks show the top score for the best OmniFusion setup in\nterms of different VQA tasks in comparison with open-source LLaVA-like\nsolutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We\nalso propose a variety of situations, where OmniFusion provides highly-detailed\nanswers in different domains: housekeeping, sightseeing, culture, medicine,\nhandwritten and scanned equations recognition, etc. Mistral-based OmniFusion\nmodel is an open-source solution with weights, training and inference scripts\navailable at https://github.com/AIRI-Institute/OmniFusion.\n","authors":["Elizaveta Goncharova","Anton Razzhigaev","Matvey Mikhalchuk","Maxim Kurkin","Irina Abdullaeva","Matvey Skripkin","Ivan Oseledets","Denis Dimitrov","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2404.06212v1.pdf","comment":"17 pages, 4 figures, 9 tables, 2 appendices"},{"id":"http://arxiv.org/abs/2404.06211v1","updated":"2024-04-09T11:00:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06207v1","updated":"2024-04-09T10:56:46Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). 
Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v1.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.06202v1","updated":"2024-04-09T10:47:43Z","published":"2024-04-09T10:47:43Z","title":"Automated National Urban Map Extraction","summary":" Developing countries usually lack the proper governance means to generate and\nregularly update a national rooftop map. Using traditional photogrammetry and\nsurveying methods to produce a building map at the federal level is costly and\ntime consuming. Using earth observation and deep learning methods, we can\nbridge this gap and propose an automated pipeline to fetch such national urban\nmaps. This paper aims to exploit the power of fully convolutional neural\nnetworks for multi-class buildings' instance segmentation to leverage high\nobject-wise accuracy results. Buildings' instance segmentation from sub-meter\nhigh-resolution satellite images can be achieved with relatively high\npixel-wise metric scores. We detail all engineering steps to replicate this\nwork and ensure highly accurate results in dense and slum areas witnessed in\nregions that lack proper urban planning in the Global South. We applied a case\nstudy of the proposed pipeline to Lebanon and successfully produced the first\ncomprehensive national building footprint map with approximately 1 Million\nunits with an 84% accuracy. The proposed architecture relies on advanced\naugmentation techniques to overcome dataset scarcity, which is often the case\nin developing countries.\n","authors":["Hasan Nasrallah","Abed Ellatif Samhat","Cristiano Nattero","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2404.06202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v1","updated":"2024-04-09T10:27:22Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. 
In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06181v1","updated":"2024-04-09T10:04:06Z","published":"2024-04-09T10:04:06Z","title":"EPL: Evidential Prototype Learning for Semi-supervised Medical Image\n Segmentation","summary":" Although current semi-supervised medical segmentation methods can achieve\ndecent performance, they are still affected by the uncertainty in unlabeled\ndata and model predictions, and there is currently a lack of effective\nstrategies that can explore the uncertain aspects of both simultaneously. To\naddress the aforementioned issues, we propose Evidential Prototype Learning\n(EPL), which utilizes an extended probabilistic framework to effectively fuse\nvoxel probability predictions from different sources and achieves prototype\nfusion utilization of labeled and unlabeled data under a generalized evidential\nframework, leveraging voxel-level dual uncertainty masking. The uncertainty not\nonly enables the model to self-correct predictions but also improves the guided\nlearning process with pseudo-labels and is able to feed back into the\nconstruction of hidden features. The method proposed in this paper has been\nexperimented on LA, Pancreas-CT and TBAD datasets, achieving the\nstate-of-the-art performance in three different labeled ratios, which strongly\ndemonstrates the effectiveness of our strategy.\n","authors":["Yuanpeng He"],"pdf_url":"https://arxiv.org/pdf/2404.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06180v1","updated":"2024-04-09T10:03:44Z","published":"2024-04-09T10:03:44Z","title":"YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images","summary":" Detecting objects from aerial images poses significant challenges due to the\nfollowing factors: 1) Aerial images typically have very large sizes, generally\nwith millions or even hundreds of millions of pixels, while computational\nresources are limited. 2) Small object size leads to insufficient information\nfor effective detection. 3) Non-uniform object distribution leads to\ncomputational resource wastage. To address these issues, we propose YOLC (You\nOnly Look Clusters), an efficient and effective framework that builds on an\nanchor-free object detector, CenterNet. To overcome the challenges posed by\nlarge-scale images and non-uniform object distribution, we introduce a Local\nScale Module (LSM) that adaptively searches cluster regions for zooming in for\naccurate detection. 
Additionally, we modify the regression loss using Gaussian\nWasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable\nconvolution and refinement methods are employed in the detection head to\nenhance the detection of small objects. We perform extensive experiments on two\naerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the\neffectiveness and superiority of our proposed approach.\n","authors":["Chenguang Liu","Guangshuai Gao","Ziyue Huang","Zhenghui Hu","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06180v1.pdf","comment":"accepted to TITS"},{"id":"http://arxiv.org/abs/2404.06177v1","updated":"2024-04-09T09:58:10Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06173v1","updated":"2024-04-09T09:54:21Z","published":"2024-04-09T09:54:21Z","title":"Improving Interpretable Embeddings for Ad-hoc Video Search with\n Generative Captions and Multi-word Concept Bank","summary":" Aligning a user query and video clips in cross-modal latent space and that\nwith semantic concepts are two mainstream approaches for ad-hoc video search\n(AVS). However, the effectiveness of existing approaches is bottlenecked by the\nsmall sizes of available video-text datasets and the low quality of concept\nbanks, which results in the failures of unseen queries and the\nout-of-vocabulary problem. This paper addresses these two problems by\nconstructing a new dataset and developing a multi-word concept bank.\nSpecifically, capitalizing on a generative model, we construct a new dataset\nconsisting of 7 million generated text and video pairs for pre-training. To\ntackle the out-of-vocabulary problem, we develop a multi-word concept bank\nbased on syntax analysis to enhance the capability of a state-of-the-art\ninterpretable AVS method in modeling relationships between query words. We also\nstudy the impact of current advanced features on the method. 
Experimental\nresults show that the integration of the above-proposed elements doubles the\nR@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP\non the TRECVid AVS query sets for 2016-2023 (eight years) by a margin from 2%\nto 77%, with an average of about 20%.\n","authors":["Jiaxin Wu","Chong-Wah Ngo","Wing-Kwong Chan"],"pdf_url":"https://arxiv.org/pdf/2404.06173v1.pdf","comment":"Accepted in ICMR2024"},{"id":"http://arxiv.org/abs/2403.10376v2","updated":"2024-04-09T09:52:54Z","published":"2024-03-15T15:05:29Z","title":"PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively\n Aggregated Spatio-Temporal Alignment","summary":" Leveraging Transformer attention has led to great advancements in HDR\ndeghosting. However, the intricate nature of self-attention introduces\npractical challenges, as existing state-of-the-art methods often demand\nhigh-end GPUs or exhibit slow inference speeds, especially for high-resolution\nimages like 2K. Striking an optimal balance between performance and latency\nremains a critical concern. In response, this work presents PASTA, a novel\nProgressively Aggregated Spatio-Temporal Alignment framework for HDR\ndeghosting. Our approach achieves effectiveness and efficiency by harnessing\nhierarchical representation during feature disentanglement. Through the\nutilization of diverse granularities within the hierarchical structure, our\nmethod substantially boosts computational speed and optimizes the HDR imaging\nworkflow. In addition, we explore within-scale feature modeling with local and\nglobal attention, gradually merging and refining them in a coarse-to-fine\nfashion. Experimental results showcase PASTA's superiority over current SOTA\nmethods in both visual quality and performance metrics, accompanied by a\nsubstantial 3-fold (x3) increase in inference speed.\n","authors":["Xiaoning Liu","Ao Li","Zongwei Wu","Yapeng Du","Le Zhang","Yulun Zhang","Radu Timofte","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05393v2","updated":"2024-04-09T09:52:32Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% on\nthe NYU dataset. 
Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable decline\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06165v1","updated":"2024-04-09T09:42:18Z","published":"2024-04-09T09:42:18Z","title":"Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data\n for Sensor Fusion Applications","summary":" Radar and camera fusion yields robustness in perception tasks by leveraging\nthe strength of both sensors. The typical extracted radar point cloud is 2D\nwithout height information due to insufficient antennas along the elevation\naxis, which challenges the network performance. This work introduces a\nlearning-based approach to infer the height of radar points associated with 3D\nobjects. A novel robust regression loss is introduced to address the sparse\ntarget challenge. In addition, a multi-task training strategy is employed,\nemphasizing important features. The average radar absolute height error\ndecreases from 1.69 to 0.25 meters compared to the state-of-the-art height\nextension method. The estimated target height values are used to preprocess and\nenrich radar data for downstream perception tasks. Integrating this refined\nradar information further enhances the performance of existing radar camera\nfusion models for object detection and depth estimation tasks.\n","authors":["Huawei Sun","Hao Feng","Gianfranco Mauro","Julius Ott","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2404.06165v1.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2404.06155v1","updated":"2024-04-09T09:28:05Z","published":"2024-04-09T09:28:05Z","title":"Efficient and Robust Point Cloud Registration via Heuristics-guided\n Parameter Search","summary":" Estimating the rigid transformation with 6 degrees of freedom based on a\nputative 3D correspondence set is a crucial procedure in point cloud\nregistration. Existing correspondence identification methods usually lead to\nlarge outlier ratios ($>$ 95 $\\%$ is common), underscoring the significance of\nrobust registration methods. Many researchers turn to parameter search-based\nstrategies (e.g., Branch-and-Bound) for robust registration. Although related\nmethods show high robustness, their efficiency is limited by the\nhigh-dimensional search space. This paper proposes a heuristics-guided\nparameter search strategy to accelerate the search while maintaining high\nrobustness. We first sample some correspondences (i.e., heuristics) and then\njust need to sequentially search the feasible regions that make each sample an\ninlier. Our strategy largely reduces the search space and can guarantee\naccuracy with only a few inlier samples, therefore enjoying an excellent\ntrade-off between efficiency and robustness. Since directly parameterizing the\n6-dimensional nonlinear feasible region for efficient search is intractable, we\nconstruct a three-stage decomposition pipeline to reparameterize the feasible\nregion, resulting in three lower-dimensional sub-problems that are easily\nsolvable via our strategy. 
Besides reducing the searching dimension, our\ndecomposition enables the leverage of 1-dimensional interval stabbing at all\nthree stages for searching acceleration. Moreover, we propose a valid sampling\nstrategy to guarantee our sampling effectiveness, and a compatibility\nverification setup to further accelerate our search. Extensive experiments on\nboth simulated and real-world datasets demonstrate that our approach exhibits\ncomparable robustness with state-of-the-art methods while achieving a\nsignificant efficiency boost.\n","authors":["Tianyu Huang","Haoang Li","Liangzu Peng","Yinlong Liu","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06155v1.pdf","comment":"21 pages, 16 figures. Accepted to IEEE Transactions on Pattern\n Analysis and Machine Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.06154v1","updated":"2024-04-09T09:27:54Z","published":"2024-04-09T09:27:54Z","title":"Concise Plane Arrangements for Low-Poly Surface and Volume Modelling","summary":" Plane arrangements are a useful tool for surface and volume modelling.\nHowever, their main drawback is poor scalability. We introduce two key\nnovelties that enable the construction of plane arrangements for complex\nobjects and entire scenes: an ordering scheme for the plane insertion and the\ndirect use of input points during arrangement construction. Both ingredients\nreduce the number of unwanted splits, resulting in improved scalability of the\nconstruction mechanism by up to two orders of magnitude compared to existing\nalgorithms. We further introduce a remeshing and simplification technique that\nallows us to extract low-polygon surface meshes and lightweight convex\ndecompositions of volumes from the arrangement. We show that our approach leads\nto state-of-the-art results for the aforementioned tasks by comparing it to\nlearning-based and traditional approaches on various different datasets. Our\nimplementation is available at https://github.com/raphaelsulzer/compod .\n","authors":["Raphael Sulzer","Florent Lafarge"],"pdf_url":"https://arxiv.org/pdf/2404.06154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06152v1","updated":"2024-04-09T09:23:04Z","published":"2024-04-09T09:23:04Z","title":"HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields","summary":" In recent advancements in novel view synthesis, generalizable Neural Radiance\nFields (NeRF) based methods applied to human subjects have shown remarkable\nresults in generating novel views from few images. However, this generalization\nability cannot capture the underlying structural features of the skeleton\nshared across all instances. Building upon this, we introduce HFNeRF: a novel\ngeneralizable human feature NeRF aimed at generating human biomechanic features\nusing a pre-trained image encoder. While previous human NeRF methods have shown\npromising results in the generation of photorealistic virtual avatars, such\nmethods lack underlying human structure or biomechanic features such as\nskeleton or joint information that are crucial for downstream applications\nincluding Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D\npre-trained foundation models toward learning human features in 3D using neural\nrendering, and then volume rendering towards generating 2D feature maps. We\nevaluate HFNeRF in the skeleton estimation task by predicting heatmaps as\nfeatures. The proposed method is fully differentiable, allowing to successfully\nlearn color, geometry, and human skeleton in a simultaneous manner. 
This paper\npresents preliminary results of HFNeRF, illustrating its potential in\ngenerating realistic virtual avatars with biomechanic features using NeRF.\n","authors":["Arnab Dey","Di Yang","Antitza Dantcheva","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10634v2","updated":"2024-04-09T09:18:26Z","published":"2023-12-17T07:33:06Z","title":"Anomaly Score: Evaluating Generative Models and Individual Generated\n Images based on Complexity and Vulnerability","summary":" With the advancement of generative models, the assessment of generated images\nbecomes more and more important. Previous methods measure distances between\nfeatures of reference and generated images from trained vision models. In this\npaper, we conduct an extensive investigation into the relationship between the\nrepresentation space and input space around generated images. We first propose\ntwo measures related to the presence of unnatural elements within images:\ncomplexity, which indicates how non-linear the representation space is, and\nvulnerability, which is related to how easily the extracted feature changes by\nadversarial input changes. Based on these, we introduce a new metric to\nevaluating image-generative models called anomaly score (AS). Moreover, we\npropose AS-i (anomaly score for individual images) that can effectively\nevaluate generated images individually. Experimental results demonstrate the\nvalidity of the proposed approach.\n","authors":["Jaehui Hwang","Junghyuk Lee","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2312.10634v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00915v2","updated":"2024-04-09T09:16:29Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v2.pdf","comment":"24 pages, 12 figures. 
Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08801v2","updated":"2024-04-09T09:13:01Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02813v2","updated":"2024-04-09T09:12:58Z","published":"2023-12-05T14:56:55Z","title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis\n via Bridging Image and Video Diffusion Models","summary":" Diffusion models have made tremendous progress in text-driven image and video\ngeneration. Now text-to-image foundation models are widely applied to various\ndownstream image synthesis tasks, such as controllable image generation and\nimage editing, while downstream video synthesis tasks are less explored for\nseveral reasons. First, it requires huge memory and computation overhead to\ntrain a video generation foundation model. Even with video foundation models,\nadditional costly training is still required for downstream video synthesis\ntasks. Second, although some works extend image diffusion models into videos in\na training-free manner, temporal consistency cannot be well preserved. Finally,\nthese adaption methods are specifically designed for one task and fail to\ngeneralize to different tasks. To mitigate these issues, we propose a\ntraining-free general-purpose video synthesis framework, coined as {\\bf\nBIVDiff}, via bridging specific image diffusion models and general\ntext-to-video foundation diffusion models. 
Specifically, we first use a\nspecific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for\nframe-wise video generation, then perform Mixed Inversion on the generated\nvideo, and finally input the inverted latents into the video diffusion models\n(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework\nenables flexible image model selection for different purposes with strong task\ngeneralization and high efficiency. To validate the effectiveness and general\nuse of BIVDiff, we perform a wide range of video synthesis tasks, including\ncontrollable video generation, video editing, video inpainting, and\noutpainting.\n","authors":["Fengyuan Shi","Jiaxi Gu","Hang Xu","Songcen Xu","Wei Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02813v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://bivdiff.github.io;\n GitHub repository: https://github.com/MCG-NJU/BIVDiff"},{"id":"http://arxiv.org/abs/2404.06139v1","updated":"2024-04-09T09:05:23Z","published":"2024-04-09T09:05:23Z","title":"DiffHarmony: Latent Diffusion Model Meets Image Harmonization","summary":" Image harmonization, which involves adjusting the foreground of a composite\nimage to attain a unified visual consistency with the background, can be\nconceptualized as an image-to-image translation task. Diffusion models have\nrecently promoted the rapid development of image-to-image translation tasks .\nHowever, training diffusion models from scratch is computationally intensive.\nFine-tuning pre-trained latent diffusion models entails dealing with the\nreconstruction error induced by the image compression autoencoder, making it\nunsuitable for image generation tasks that involve pixel-level evaluation\nmetrics. To deal with these issues, in this paper, we first adapt a pre-trained\nlatent diffusion model to the image harmonization task to generate the\nharmonious but potentially blurry initial images. Then we implement two\nstrategies: utilizing higher-resolution images during inference and\nincorporating an additional refinement stage, to further enhance the clarity of\nthe initially harmonized images. Extensive experiments on iHarmony4 datasets\ndemonstrate the superiority of our proposed method. The code and model will be\nmade publicly available at https://github.com/nicecv/DiffHarmony .\n","authors":["Pengfei Zhou","Fangxiang Feng","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06139v1.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2404.06135v1","updated":"2024-04-09T09:02:21Z","published":"2024-04-09T09:02:21Z","title":"Mansformer: Efficient Transformer of Mixed Attention for Image\n Deblurring and Beyond","summary":" Transformer has made an enormous success in natural language processing and\nhigh-level vision over the past few years. However, the complexity of\nself-attention is quadratic to the image size, which makes it infeasible for\nhigh-resolution vision tasks. In this paper, we propose the Mansformer, a\nTransformer of mixed attention that combines multiple self-attentions, gate,\nand multi-layer perceptions (MLPs), to explore and employ more possibilities of\nself-attention. Taking efficiency into account, we design four kinds of\nself-attention, whose complexities are all linear. By elaborate adjustment of\nthe tensor shapes and dimensions for the dot product, we split the typical\nself-attention of quadratic complexity into four operations of linear\ncomplexity. 
To adaptively merge these different kinds of self-attention, we\ntake advantage of an architecture similar to Squeeze-and-Excitation Networks.\nFurthermore, we make it to merge the two-staged Transformer design into one\nstage by the proposed gated-dconv MLP. Image deblurring is our main target,\nwhile extensive quantitative and qualitative evaluations show that this method\nperforms favorably against the state-of-the-art methods far more than simply\ndeblurring. The source codes and trained models will be made available to the\npublic.\n","authors":["Pin-Hung Kuo","Jinshan Pan","Shao-Yi Chien","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06128v1","updated":"2024-04-09T08:51:44Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with a 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06124v1","updated":"2024-04-09T08:49:01Z","published":"2024-04-09T08:49:01Z","title":"Hierarchical Insights: Exploiting Structural Similarities for Reliable\n 3D Semantic Segmentation","summary":" Safety-critical applications like autonomous driving call for robust 3D\nenvironment perception algorithms which can withstand highly diverse and\nambiguous surroundings. The predictive performance of any classification model\nstrongly depends on the underlying dataset and the prior knowledge conveyed by\nthe annotated labels. While the labels provide a basis for the learning\nprocess, they usually fail to represent inherent relations between the classes\n- representations, which are a natural element of the human perception system.\nWe propose a training strategy which enables a 3D LiDAR semantic segmentation\nmodel to learn structural relationships between the different classes through\nabstraction. 
We achieve this by implicitly modeling those relationships through\na learning rule for hierarchical multi-label classification (HMC). With a\ndetailed analysis we show, how this training strategy not only improves the\nmodel's confidence calibration, but also preserves additional information for\ndownstream tasks like fusion, prediction and planning.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2404.06124v1.pdf","comment":"submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2404.06119v1","updated":"2024-04-09T08:41:13Z","published":"2024-04-09T08:41:13Z","title":"DreamView: Injecting View-specific Text Guidance into Text-to-3D\n Generation","summary":" Text-to-3D generation, which synthesizes 3D assets according to an overall\ntext description, has significantly progressed. However, a challenge arises\nwhen the specific appearances need customizing at designated viewpoints but\nreferring solely to the overall description for generating 3D objects. For\ninstance, ambiguity easily occurs when producing a T-shirt with distinct\npatterns on its front and back using a single overall text guidance. In this\nwork, we propose DreamView, a text-to-image approach enabling multi-view\ncustomization while maintaining overall consistency by adaptively injecting the\nview-specific and overall text guidance through a collaborative text guidance\ninjection module, which can also be lifted to 3D generation via score\ndistillation sampling. DreamView is trained with large-scale rendered\nmulti-view images and their corresponding view-specific texts to learn to\nbalance the separate content manipulation in each view and the global\nconsistency of the overall object, resulting in a dual achievement of\ncustomization and consistency. Consequently, DreamView empowers artists to\ndesign 3D objects creatively, fostering the creation of more innovative and\ndiverse 3D assets. Code and model will be released at\nhttps://github.com/iSEE-Laboratory/DreamView.\n","authors":["Junkai Yan","Yipeng Gao","Qize Yang","Xihan Wei","Xuansong Xie","Ancong Wu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06109v1","updated":"2024-04-09T08:20:37Z","published":"2024-04-09T08:20:37Z","title":"Revising Densification in Gaussian Splatting","summary":" In this paper, we address the limitations of Adaptive Density Control (ADC)\nin 3D Gaussian Splatting (3DGS), a scene representation method achieving\nhigh-quality, photorealistic results for novel view synthesis. ADC has been\nintroduced for automatic 3D point primitive management, controlling\ndensification and pruning, however, with certain limitations in the\ndensification logic. Our main contribution is a more principled, pixel-error\ndriven formulation for density control in 3DGS, leveraging an auxiliary,\nper-pixel error function as the criterion for densification. We further\nintroduce a mechanism to control the total number of primitives generated per\nscene and correct a bias in the current opacity handling strategy of ADC during\ncloning operations. 
Our approach leads to consistent quality improvements\nacross a variety of benchmark scenes, without sacrificing the method's\nefficiency.\n","authors":["Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.06109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04617v2","updated":"2024-04-09T08:20:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v2.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2401.13961v2","updated":"2024-04-09T08:07:48Z","published":"2024-01-25T05:50:48Z","title":"TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation\n in VEM images","summary":" While imaging techniques at macro and mesoscales have garnered substantial\nattention and resources, microscale VEM imaging, capable of revealing intricate\nvascular details, has lacked the necessary benchmarking infrastructure. In this\npaper, we address a significant gap in the field of neuroimaging by introducing\nthe largest-to-date public benchmark, \\textbf{BvEM}, designed specifically for\ncortical blood vessel segmentation in volume electron microscopy (VEM) images.\nOur BvEM benchmark is based on VEM image volumes from three mammal species:\nadult mouse, macaque, and human. We standardized the resolution, addressed\nimaging variations, and meticulously annotated blood vessels through\nsemi-automatic, manual, and quality control processes, ensuring high-quality 3D\nsegmentation. Furthermore, we developed a zero-shot cortical blood vessel\nsegmentation method named TriSAM, which leverages the powerful segmentation\nmodel SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation,\nTriSAM employs a multi-seed tracking framework, leveraging the reliability of\ncertain image planes for tracking while using others to identify potential\nturning points. This approach effectively achieves long-term 3D blood vessel\nsegmentation without model training or fine-tuning. 
Experimental results show\nthat TriSAM achieved superior performances on the BvEM benchmark across three\nspecies.\n","authors":["Jia Wan","Wanhua Li","Jason Ken Adhinarta","Atmadeep Banerjee","Evelina Sjostedt","Jingpeng Wu","Jeff Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.13961v2.pdf","comment":"BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9"},{"id":"http://arxiv.org/abs/2403.13358v2","updated":"2024-04-09T07:55:41Z","published":"2024-03-20T07:36:43Z","title":"GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped\n Robot","summary":" Multi-task robot learning holds significant importance in tackling diverse\nand complex scenarios. However, current approaches are hindered by performance\nissues and difficulties in collecting training datasets. In this paper, we\npropose GeRM (Generalist Robotic Model). We utilize offline reinforcement\nlearning to optimize data utilization strategies to learn from both\ndemonstrations and sub-optimal data, thus surpassing the limitations of human\ndemonstrations. Thereafter, we employ a transformer-based VLA network to\nprocess multi-modal inputs and output actions. By introducing the\nMixture-of-Experts structure, GeRM allows faster inference speed with higher\nwhole model capacity, and thus resolves the issue of limited RL parameters,\nenhancing model performance in multi-task learning while controlling\ncomputational costs. Through a series of experiments, we demonstrate that GeRM\noutperforms other methods across all tasks, while also validating its\nefficiency in both training and inference processes. Additionally, we uncover\nits potential to acquire emergent skills. Additionally, we contribute the\nQUARD-Auto dataset, collected automatically to support our training approach\nand foster advancements in multi-task quadruped robot learning. This work\npresents a new paradigm for reducing the cost of collecting robot data and\ndriving progress in the multi-task learning community. You can reach our\nproject and video through the link: https://songwxuan.github.io/GeRM/ .\n","authors":["Wenxuan Song","Han Zhao","Pengxiang Ding","Can Cui","Shangke Lyu","Yaning Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05970v3","updated":"2024-04-09T07:49:55Z","published":"2023-03-10T15:01:51Z","title":"Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D\n Perception","summary":" Long-term temporal fusion is a crucial but often overlooked technique in\ncamera-based Bird's-Eye-View (BEV) 3D perception. Existing methods are mostly\nin a parallel manner. While parallel fusion can benefit from long-term\ninformation, it suffers from increasing computational and memory overheads as\nthe fusion window size grows. Alternatively, BEVFormer adopts a recurrent\nfusion pipeline so that history information can be efficiently integrated, yet\nit fails to benefit from longer temporal frames. In this paper, we explore an\nembarrassingly simple long-term recurrent fusion strategy built upon the\nLSS-based methods and find it already able to enjoy the merits from both sides,\ni.e., rich long-term information and efficient fusion pipeline. A temporal\nembedding module is further proposed to improve the model's robustness against\noccasionally missed frames in practical scenarios. We name this simple but\neffective fusing pipeline VideoBEV. 
Experimental results on the nuScenes\nbenchmark show that VideoBEV obtains strong performance on various camera-based\n3D perception tasks, including object detection (55.4\\% mAP and 62.9\\% NDS),\nsegmentation (48.6\\% vehicle mIoU), tracking (54.8\\% AMOTA), and motion\nprediction (0.80m minADE and 0.463 EPA).\n","authors":["Chunrui Han","Jinrong Yang","Jianjian Sun","Zheng Ge","Runpei Dong","Hongyu Zhou","Weixin Mao","Yuang Peng","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.05970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06091v1","updated":"2024-04-09T07:49:30Z","published":"2024-04-09T07:49:30Z","title":"Hash3D: Training-free Acceleration for 3D Generation","summary":" The evolution of 3D generative modeling has been notably propelled by the\nadoption of 2D diffusion models. Despite this progress, the cumbersome\noptimization process per se presents a critical hurdle to efficiency. In this\npaper, we introduce Hash3D, a universal acceleration for 3D generation without\nmodel training. Central to Hash3D is the insight that feature-map redundancy is\nprevalent in images rendered from camera positions and diffusion time-steps in\nclose proximity. By effectively hashing and reusing these feature maps across\nneighboring timesteps and camera angles, Hash3D substantially prevents\nredundant calculations, thus accelerating the diffusion model's inference in 3D\ngeneration tasks. We achieve this through an adaptive grid-based hashing.\nSurprisingly, this feature-sharing mechanism not only speed up the generation\nbut also enhances the smoothness and view consistency of the synthesized 3D\nobjects. Our experiments covering 5 text-to-3D and 3 image-to-3D models,\ndemonstrate Hash3D's versatility to speed up optimization, enhancing efficiency\nby 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian\nsplatting largely speeds up 3D model creation, reducing text-to-3D processing\nto about 10 minutes and image-to-3D conversion to roughly 30 seconds. The\nproject page is at https://adamdad.github.io/hash3D/.\n","authors":["Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06091v1.pdf","comment":"https://adamdad.github.io/hash3D/"},{"id":"http://arxiv.org/abs/2311.17002v3","updated":"2024-04-09T07:46:43Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. 
More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing. Our project page is at\nhttps://ranni-t2i.github.io/Ranni.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05559v2","updated":"2024-04-09T07:43:29Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v2.pdf","comment":"Accepted to CVPR 2024. Project Webpage:\n https://jacobchalk.github.io/TIM-Project"},{"id":"http://arxiv.org/abs/2404.06080v1","updated":"2024-04-09T07:39:21Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. 
In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07937v4","updated":"2024-04-09T07:31:25Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spurred numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wise finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06075v1","updated":"2024-04-09T07:25:30Z","published":"2024-04-09T07:25:30Z","title":"LIPT: Latency-aware Image Processing Transformer","summary":" Transformer is leading a trend in the field of image processing. Despite the\ngreat success that existing lightweight image processing transformers have\nachieved, they are tailored to FLOPs or parameters reduction, rather than\npractical inference acceleration. In this paper, we present a latency-aware\nimage processing transformer, termed LIPT. We devise the low-latency proportion\nLIPT block that substitutes memory-intensive operators with the combination of\nself-attention and convolutions to achieve practical speedup. 
Specifically, we\npropose a novel non-volatile sparse masking self-attention (NVSM-SA) that\nutilizes a pre-computing sparse mask to capture contextual information from a\nlarger window with no extra computation overload. Besides, a high-frequency\nreparameterization module (HRM) is proposed to make LIPT block\nreparameterization friendly, which improves the model's detail reconstruction\ncapability. Extensive experiments on multiple image processing tasks (e.g.,\nimage super-resolution (SR), JPEG artifact reduction, and image denoising)\ndemonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves\nreal-time GPU inference with state-of-the-art performance on multiple image SR\nbenchmarks.\n","authors":["Junbo Qiao","Wei Li","Haizhen Xie","Hanting Chen","Yunshuai Zhou","Zhijun Tu","Jie Hu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03892v2","updated":"2024-04-09T07:21:32Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18201v2","updated":"2024-04-09T07:18:41Z","published":"2024-02-28T09:46:56Z","title":"Learning Invariant Inter-pixel Correlations for Superpixel Generation","summary":" Deep superpixel algorithms have made remarkable strides by substituting\nhand-crafted features with learnable ones. 
Nevertheless, we observe that\nexisting deep superpixel methods, serving as mid-level representation\noperations, remain sensitive to the statistical properties (e.g., color\ndistribution, high-level semantics) embedded within the training dataset.\nConsequently, learnable features exhibit constrained discriminative capability,\nresulting in unsatisfactory pixel grouping performance, particularly in\nuntrainable application scenarios. To address this issue, we propose the\nContent Disentangle Superpixel (CDS) algorithm to selectively separate the\ninvariant inter-pixel correlations and statistical properties, i.e., style\nnoise. Specifically, We first construct auxiliary modalities that are\nhomologous to the original RGB image but have substantial stylistic variations.\nThen, driven by mutual information, we propose the local-grid correlation\nalignment across modalities to reduce the distribution discrepancy of\nadaptively selected features and learn invariant inter-pixel correlations.\nAfterwards, we perform global-style mutual information minimization to enforce\nthe separation of invariant content and train data styles. The experimental\nresults on four benchmark datasets demonstrate the superiority of our approach\nto existing state-of-the-art methods, regarding boundary adherence,\ngeneralization, and efficiency. Code and pre-trained model are available at\nhttps://github.com/rookiie/CDSpixel.\n","authors":["Sen Xu","Shikui Wei","Tao Ruan","Lixin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.18201v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.06065v1","updated":"2024-04-09T07:08:00Z","published":"2024-04-09T07:08:00Z","title":"Unified Entropy Optimization for Open-Set Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims at adapting a model pre-trained on the\nlabeled source domain to the unlabeled target domain. Existing methods usually\nfocus on improving TTA performance under covariate shifts, while neglecting\nsemantic shifts. In this paper, we delve into a realistic open-set TTA setting\nwhere the target domain may contain samples from unknown classes. Many\nstate-of-the-art closed-set TTA methods perform poorly when applied to open-set\nscenarios, which can be attributed to the inaccurate estimation of data\ndistribution and model confidence. To address these issues, we propose a simple\nbut effective framework called unified entropy optimization (UniEnt), which is\ncapable of simultaneously adapting to covariate-shifted in-distribution (csID)\ndata and detecting covariate-shifted out-of-distribution (csOOD) data.\nSpecifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test\ndata, followed by entropy minimization on the pseudo-csID data and entropy\nmaximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to\nalleviate the noise caused by hard data partition leveraging sample-level\nconfidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show\nthe superiority of our framework. 
The code is available at\nhttps://github.com/gaozhengqing/UniEnt\n","authors":["Zhengqing Gao","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06065v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04580v2","updated":"2024-04-09T06:56:02Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v2.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.06057v1","updated":"2024-04-09T06:47:44Z","published":"2024-04-09T06:47:44Z","title":"Unified Multi-modal Diagnostic Framework with Reconstruction\n Pre-training and Heterogeneity-combat Tuning","summary":" Medical multi-modal pre-training has revealed promise in computer-aided\ndiagnosis by leveraging large-scale unlabeled datasets. However, existing\nmethods based on masked autoencoders mainly rely on data-level reconstruction\ntasks, but lack high-level semantic information. 
Furthermore, two significant\nheterogeneity challenges hinder the transfer of pre-trained knowledge to\ndownstream tasks, \\textit{i.e.}, the distribution heterogeneity between\npre-training data and downstream data, and the modality heterogeneity within\ndownstream data. To address these challenges, we propose a Unified Medical\nMulti-modal Diagnostic (UMD) framework with tailored pre-training and\ndownstream tuning strategies. Specifically, to enhance the representation\nabilities of vision and language encoders, we propose the Multi-level\nReconstruction Pre-training (MR-Pretrain) strategy, including a feature-level\nand data-level reconstruction, which guides models to capture the semantic\ninformation from masked inputs of different modalities. Moreover, to tackle two\nkinds of heterogeneities during the downstream tuning, we present the\nheterogeneity-combat downstream tuning strategy, which consists of a\nTask-oriented Distribution Calibration (TD-Calib) and a Gradient-guided\nModality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the\npre-trained model regarding the distribution of downstream datasets, and\nGM-Coord adjusts the gradient weights according to the dynamic optimization\nstatus of different modalities. Extensive experiments on five public medical\ndatasets demonstrate the effectiveness of our UMD framework, which remarkably\noutperforms existing approaches on three kinds of downstream tasks.\n","authors":["Yupei Zhang","Li Pan","Qiushi Yang","Tan Li","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06057v1.pdf","comment":"to be published in IEEE JBHI; Code available at\n https://github.com/helenypzhang/UMD"},{"id":"http://arxiv.org/abs/2404.06050v1","updated":"2024-04-09T06:27:35Z","published":"2024-04-09T06:27:35Z","title":"Incremental Joint Learning of Depth, Pose and Implicit Scene\n Representation on Monocular Camera in Large-scale Scenes","summary":" Dense scene reconstruction for photo-realistic view synthesis has various\napplications, such as VR/AR, autonomous vehicles. However, most existing\nmethods have difficulties in large-scale scenes due to three core challenges:\n\\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get\nin real-world large-scale scenes. \\textit{(b) inaccurate pose estimation.} Most\nexisting approaches rely on accurate pre-estimated camera poses. \\textit{(c)\ninsufficient scene representation capability.} A single global radiance field\nlacks the capacity to effectively scale to large-scale scenes. To this end, we\npropose an incremental joint learning framework, which can achieve accurate\ndepth, pose estimation, and large-scale scene reconstruction. A vision\ntransformer-based network is adopted as the backbone to enhance performance in\nscale information estimation. For pose estimation, a feature-metric bundle\nadjustment (FBA) method is designed for accurate and robust camera tracking in\nlarge-scale scenes. In terms of implicit scene representation, we propose an\nincremental scene representation method to construct the entire large-scale\nscene as multiple local radiance fields to enhance the scalability of 3D scene\nrepresentation. 
Extended experiments have been conducted to demonstrate the\neffectiveness and accuracy of our method in depth estimation, pose estimation,\nand large-scale scene reconstruction.\n","authors":["Tianchen Deng","Nailin Wang","Chongdi Wang","Shenghai Yuan","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04421v2","updated":"2024-04-09T06:23:35Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v2.pdf","comment":"Project Page: https://qingqing-zhao.github.io/PhysAvatar"},{"id":"http://arxiv.org/abs/2404.06044v1","updated":"2024-04-09T06:10:15Z","published":"2024-04-09T06:10:15Z","title":"Object Dynamics Modeling with Hierarchical Point Cloud-based\n Representations","summary":" Modeling object dynamics with a neural network is an important problem with\nnumerous applications. Most recent work has been based on graph neural\nnetworks. However, physics happens in 3D space, where geometric information\npotentially plays an important role in modeling physical phenomena. In this\nwork, we propose a novel U-net architecture based on continuous point\nconvolution which naturally embeds information from 3D coordinates and allows\nfor multi-scale feature representations with established downsampling and\nupsampling procedures. Bottleneck layers in the downsampled point clouds lead\nto better long-range interaction modeling. Besides, the flexibility of point\nconvolutions allows our approach to generalize to sparsely sampled points from\nmesh vertices and dynamically generate features on important interaction points\non mesh faces. 
Experimental results demonstrate that our approach significantly\nimproves the state-of-the-art, especially in scenarios that require accurate\ngravity or collision reasoning.\n","authors":["Chanho Kim","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2404.06044v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.11729v2","updated":"2024-04-09T05:57:18Z","published":"2023-06-20T17:57:23Z","title":"Dense Video Object Captioning from Disjoint Supervision","summary":" We propose a new task and model for dense video object captioning --\ndetecting, tracking and captioning trajectories of objects in a video. This\ntask unifies spatial and temporal localization in video, whilst also requiring\nfine-grained visual understanding that is best described by natural language.\nWe propose a unified model, and demonstrate how our end-to-end approach is more\naccurate and temporally coherent than a multi-stage pipeline combining\nstate-of-the-art detection, tracking, and captioning models. Moreover, we\npropose a training strategy based on a mixture of disjoint tasks, which allows\nus to leverage diverse, large-scale datasets which supervise different parts of\nour model. Although each pretraining task only provides weak supervision, they\nare complementary and, when combined, result in noteworthy zero-shot ability\nand serve as strong initialization for additional finetuning to further improve\naccuracy. We carefully design new metrics capturing all components of our task,\nand show how we can repurpose existing video grounding datasets (e.g. VidSTG\nand VLN) for our new task. We show that our model improves upon a number of\nstrong baselines for this new task. Furthermore, we can apply our model to the\ntask of spatial grounding, outperforming prior state-of-the-art on VidSTG and\nVLN, without explicitly training for it. Code is available at\nhttps://github.com/google-research/scenic/tree/main/scenic/projects/densevoc.\n","authors":["Xingyi Zhou","Anurag Arnab","Chen Sun","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2306.11729v2.pdf","comment":"Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc"},{"id":"http://arxiv.org/abs/2404.06036v1","updated":"2024-04-09T05:49:04Z","published":"2024-04-09T05:49:04Z","title":"Space-Time Video Super-resolution with Neural Operator","summary":" This paper addresses the task of space-time video super-resolution (ST-VSR).\nExisting methods generally suffer from inaccurate motion estimation and motion\ncompensation (MEMC) problems for large motions. Inspired by recent progress in\nphysics-informed neural networks, we model the challenges of MEMC in ST-VSR as\na mapping between two continuous function spaces. Specifically, our approach\ntransforms independent low-resolution representations in the coarse-grained\ncontinuous function space into refined representations with enriched\nspatiotemporal details in the fine-grained continuous function space. To\nachieve efficient and accurate MEMC, we design a Galerkin-type attention\nfunction to perform frame alignment and temporal interpolation. Due to the\nlinear complexity of the Galerkin-type attention mechanism, our model avoids\npatch partitioning and offers global receptive fields, enabling precise\nestimation of large motions. 
The experimental results show that the proposed\nmethod surpasses state-of-the-art techniques in both fixed-size and continuous\nspace-time video super-resolution tasks.\n","authors":["Yuantong Zhang","Hanyou Zheng","Daiqin Yang","Zhenzhong Chen","Haichuan Ma","Wenpeng Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10473v4","updated":"2024-04-09T05:47:57Z","published":"2023-02-21T06:31:53Z","title":"Oriented Object Detection in Optical Remote Sensing Images using Deep\n Learning: A Survey","summary":" Oriented object detection is one of the most fundamental and challenging\ntasks in remote sensing, aiming to locate and classify objects with arbitrary\norientations. Recent years have witnessed remarkable progress in oriented\nobject detection using deep learning techniques. Given the rapid development of\nthis field, this paper aims to provide a comprehensive survey of recent\nadvances in oriented object detection. To be specific, we first review the\ntechnical evolution from horizontal object detection to oriented object\ndetection and summarize the specific challenges, including feature\nmisalignment, spatial misalignment, and periodicity of angle. Subsequently, we\nfurther categorize existing methods into detection framework, oriented bounding\nbox (OBB) regression, and feature representations, and discuss how these\nmethods address the above challenges in detail. In addition, we cover several\npublicly available datasets and performance evaluation protocols. Furthermore,\nwe provide a comprehensive comparison and analysis of state-of-the-art oriented\nobject detection methods. Toward the end of this paper, we discuss several\nfuture directions for oriented object detection.\n","authors":["Kun Wang","Zi Wang","Zhang Li","Ang Su","Xichao Teng","Minhao Liu","Qifeng Yu"],"pdf_url":"https://arxiv.org/pdf/2302.10473v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06668v2","updated":"2024-04-09T05:47:39Z","published":"2024-03-11T12:36:14Z","title":"PeerAiD: Improving Adversarial Distillation from a Specialized Peer\n Tutor","summary":" Adversarial robustness of the neural network is a significant concern when it\nis applied to security-critical domains. In this situation, adversarial\ndistillation is a promising option which aims to distill the robustness of the\nteacher network to improve the robustness of a small student network. Previous\nworks pretrain the teacher network to make it robust to the adversarial\nexamples aimed at itself. However, the adversarial examples are dependent on\nthe parameters of the target network. The fixed teacher network inevitably\ndegrades its robustness against the unseen transferred adversarial examples\nwhich targets the parameters of the student network in the adversarial\ndistillation process. We propose PeerAiD to make a peer network learn the\nadversarial examples of the student network instead of adversarial examples\naimed at itself. PeerAiD is an adversarial distillation that trains the peer\nnetwork and the student network simultaneously in order to make the peer\nnetwork specialized for defending the student network. We observe that such\npeer networks surpass the robustness of pretrained robust teacher network\nagainst student-attacked adversarial samples. 
With this peer network and\nadversarial distillation, PeerAiD achieves significantly higher robustness of\nthe student network with AutoAttack (AA) accuracy up to 1.66%p and improves the\nnatural accuracy of the student network up to 4.72%p with ResNet-18 and\nTinyImageNet dataset.\n","authors":["Jaewon Jung","Hongsun Jang","Jaeyong Song","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06668v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06033v1","updated":"2024-04-09T05:44:00Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06029v1","updated":"2024-04-09T05:30:58Z","published":"2024-04-09T05:30:58Z","title":"Improving Facial Landmark Detection Accuracy and Efficiency with\n Knowledge Distillation","summary":" The domain of computer vision has experienced significant advancements in\nfacial-landmark detection, becoming increasingly essential across various\napplications such as augmented reality, facial recognition, and emotion\nanalysis. Unlike object detection or semantic segmentation, which focus on\nidentifying objects and outlining boundaries, facial-landmark detection aims to\nprecisely locate and track critical facial features. However, deploying deep\nlearning-based facial-landmark detection models on embedded systems with\nlimited computational resources poses challenges due to the complexity of\nfacial features, especially in dynamic settings. Additionally, ensuring\nrobustness across diverse ethnicities and expressions presents further\nobstacles. Existing datasets often lack comprehensive representation of facial\nnuances, particularly within populations like those in Taiwan. This paper\nintroduces a novel approach to address these challenges through the development\nof a knowledge distillation method. By transferring knowledge from larger\nmodels to smaller ones, we aim to create lightweight yet powerful deep learning\nmodels tailored specifically for facial-landmark detection tasks. Our goal is\nto design models capable of accurately locating facial landmarks under varying\nconditions, including diverse expressions, orientations, and lighting\nenvironments. 
The ultimate objective is to achieve high accuracy and real-time\nperformance suitable for deployment on embedded systems. This method was\nsuccessfully implemented and achieved a top 6th place finish out of 165\nparticipants in the IEEE ICME 2024 PAIR competition.\n","authors":["Zong-Wei Hong","Yu-Chen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06029v1.pdf","comment":"technical report. 6th/165 in IEEE ICME 2024 PAIR competition"},{"id":"http://arxiv.org/abs/2404.06025v1","updated":"2024-04-09T05:21:32Z","published":"2024-04-09T05:21:32Z","title":"Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs","summary":" Morphing attacks are an emerging threat to state-of-the-art Face Recognition\n(FR) systems, which aim to create a single image that contains the biometric\ninformation of multiple identities. Diffusion Morphs (DiM) are a recently\nproposed morphing attack that has achieved state-of-the-art performance for\nrepresentation-based morphing attacks. However, none of the existing research\non DiMs has leveraged the iterative nature of DiMs and left the DiM model as a\nblack box, treating it no differently than one would a Generative Adversarial\nNetwork (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on\nthe iterative sampling process of DiM models which searches for an optimal step\nguided by an identity-based heuristic function. We compare our proposed\nalgorithm against ten other state-of-the-art morphing algorithms using the\nopen-source SYN-MAD 2022 competition dataset. We find that our proposed\nalgorithm is unreasonably effective, fooling all of the tested FR systems with\nan MMPMR of 100%, outperforming all other morphing algorithms compared.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06025v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06022v1","updated":"2024-04-09T05:11:28Z","published":"2024-04-09T05:11:28Z","title":"Band-Attention Modulated RetNet for Face Forgery Detection","summary":" The transformer networks are extensively utilized in face forgery detection\ndue to their scalability across large datasets. Despite their success,\ntransformers face challenges in balancing the capture of global context, which\nis crucial for unveiling forgery clues, with computational complexity. To\nmitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a\nlightweight network designed to efficiently process extensive visual contexts\nwhile avoiding catastrophic forgetting. Our approach empowers the target token\nto perceive global information by assigning differential attention levels to\ntokens at varying distances. 
We implement self-attention along both spatial\naxes, thereby maintaining spatial priors and easing the computational\nburden. Moreover, we present the adaptive frequency Band-Attention Modulation\nmechanism, which treats the entire Discrete Cosine Transform spectrogram as a\nseries of frequency bands with learnable weights. Together, BAR-Net achieves\nfavorable performance on several face forgery datasets, outperforming current\nstate-of-the-art methods.\n","authors":["Zhida Zhang","Jie Cao","Wenkui Yang","Qihang Fan","Kai Zhou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2404.06022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v4","updated":"2024-04-09T05:09:56Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v4.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.13980v2","updated":"2024-04-09T04:41:53Z","published":"2023-12-21T16:10:33Z","title":"Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion\n Models with RL Finetuning","summary":" Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT)\nto text-to-image diffusion models, have driven recent breakthroughs in\ntext-to-3D research. However, due to the limited size and quality of existing\n3D datasets, they still suffer from multi-view inconsistencies and Neural\nRadiance Field (NeRF) reconstruction artifacts. 
We argue that multi-view\ndiffusion models can benefit from further Reinforcement Learning Finetuning\n(RLFT), which allows models to learn from the data generated by themselves and\nimprove beyond their dataset limitations during SFT. To this end, we introduce\nCarve3D, an improved RLFT algorithm coupled with a novel Multi-view\nReconstruction Consistency (MRC) metric, to enhance the consistency of\nmulti-view diffusion models. To measure the MRC metric on a set of multi-view\nimages, we compare them with their corresponding NeRF renderings at the same\ncamera viewpoints. The resulting model, which we denote as Carve3DM,\ndemonstrates superior multi-view consistency and NeRF reconstruction quality\nthan existing models. Our results suggest that pairing SFT with Carve3D's RLFT\nis essential for developing multi-view-consistent diffusion models, mirroring\nthe standard Large Language Model (LLM) alignment pipeline. Our code, training\nand testing data, and video results are available at:\nhttps://desaixie.github.io/carve-3d.\n","authors":["Desai Xie","Jiahao Li","Hao Tan","Xin Sun","Zhixin Shu","Yi Zhou","Sai Bi","Sören Pirk","Arie E. Kaufman"],"pdf_url":"https://arxiv.org/pdf/2312.13980v2.pdf","comment":"22 pages, 16 figures. Our code, training and testing data, and video\n results are available at: https://desaixie.github.io/carve-3d. This paper has\n been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024\n camera-ready version"},{"id":"http://arxiv.org/abs/2404.06012v1","updated":"2024-04-09T04:41:05Z","published":"2024-04-09T04:41:05Z","title":"Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data","summary":" The millimeter-wave radar sensor maintains stable performance under adverse\nenvironmental conditions, making it a promising solution for all-weather\nperception tasks, such as outdoor mobile robotics. However, the radar point\nclouds are relatively sparse and contain massive ghost points, which greatly\nlimits the development of mmWave radar technology. In this paper, we propose a\nnovel point cloud super-resolution approach for 3D mmWave radar data, named\nRadar-diffusion. Our approach employs the diffusion model defined by\nmean-reverting stochastic differential equations(SDE). Using our proposed new\nobjective function with supervision from corresponding LiDAR point clouds, our\napproach efficiently handles radar ghost points and enhances the sparse mmWave\nradar point clouds to dense LiDAR-like point clouds. We evaluate our approach\non two different datasets, and the experimental results show that our method\noutperforms the state-of-the-art baseline methods in 3D radar super-resolution\ntasks. 
Furthermore, we demonstrate that our enhanced radar point cloud is\ncapable of downstream radar point-based registration tasks.\n","authors":["Kai Luan","Chenghao Shi","Neng Wang","Yuwei Cheng","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05997v1","updated":"2024-04-09T04:04:50Z","published":"2024-04-09T04:04:50Z","title":"Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis","summary":" The black-box nature of deep learning models has raised concerns about their\ninterpretability for successful deployment in real-world clinical applications.\nTo address the concerns, eXplainable Artificial Intelligence (XAI) aims to\nprovide clear and understandable explanations of the decision-making process.\nIn the medical domain, concepts such as attributes of lesions or abnormalities\nserve as key evidence for deriving diagnostic results. However, existing\nconcept-based models mainly depend on concepts that appear independently and\nrequire fine-grained concept annotations such as bounding boxes. A medical\nimage usually contains multiple concepts and the fine-grained concept\nannotations are difficult to acquire. In this paper, we propose a novel\nConcept-Attention Whitening (CAW) framework for interpretable skin lesion\ndiagnosis. CAW is comprised of a disease diagnosis branch and a concept\nalignment branch. In the former branch, we train the CNN with a CAW layer\ninserted to perform skin lesion diagnosis. The CAW layer decorrelates features\nand aligns image features to conceptual meanings via an orthogonal matrix. In\nthe latter branch, we calculate the orthogonal matrix under the guidance of the\nconcept attention mask. We particularly introduce a weakly-supervised concept\nmask generator that only leverages coarse concept labels for filtering local\nregions that are relevant to certain concepts, improving the optimization of\nthe orthogonal matrix. Extensive experiments on two public skin lesion\ndiagnosis datasets demonstrated that CAW not only enhanced interpretability but\nalso maintained a state-of-the-art diagnostic performance.\n","authors":["Junlin Hou","Jilan Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05981v1","updated":"2024-04-09T03:27:09Z","published":"2024-04-09T03:27:09Z","title":"A Lightweight Measure of Classification Difficulty from Application\n Dataset Characteristics","summary":" Despite accuracy and computation benchmarks being widely available to help\nchoose among neural network models, these are usually trained on datasets with\nmany classes, and do not give a precise idea of performance for applications of\nfew (< 10) classes. The conventional procedure to predict performance is to\ntrain and test repeatedly on the different models and dataset variations of\ninterest. However, this is computationally expensive. We propose an efficient\nclassification difficulty measure that is calculated from the number of classes\nand intra- and inter-class similarity metrics of the dataset. After a single\nstage of training and testing per model family, relative performance for\ndifferent datasets and models of the same family can be predicted by comparing\ndifficulty measures - without further training and testing. We show how this\nmeasure can help a practitioner select a computationally efficient model for a\nsmall dataset 6 to 29x faster than through repeated training and testing. 
We\ngive an example of use of the measure for an industrial application in which\noptions are identified to select a model 42% smaller than the baseline\nYOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements,\n85% smaller.\n","authors":["Bryan Bo Cao","Abhinav Sharma","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2404.05981v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.05980v1","updated":"2024-04-09T03:24:10Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05979v1","updated":"2024-04-09T03:22:36Z","published":"2024-04-09T03:22:36Z","title":"StoryImager: A Unified and Efficient Framework for Coherent Story\n Visualization and Completion","summary":" Story visualization aims to generate a series of realistic and coherent\nimages based on a storyline. Current models adopt a frame-by-frame architecture\nby transforming the pre-trained text-to-image model into an auto-regressive\nmanner. Although these models have shown notable progress, there are still\nthree flaws. 1) The unidirectional generation of auto-regressive manner\nrestricts the usability in many scenarios. 2) The additional introduced story\nhistory encoders bring an extremely high computational cost. 3) The story\nvisualization and continuation models are trained and inferred independently,\nwhich is not user-friendly. To these ends, we propose a bidirectional, unified,\nand efficient framework, namely StoryImager. The StoryImager enhances the\nstoryboard generative ability inherited from the pre-trained text-to-image\nmodel for a bidirectional generation. Specifically, we introduce a Target Frame\nMasking Strategy to extend and unify different story image generation tasks.\nFurthermore, we propose a Frame-Story Cross Attention Module that decomposes\nthe cross attention for local fidelity and global coherence. Moreover, we\ndesign a Contextual Feature Extractor to extract contextual information from\nthe whole storyline. The extensive experimental results demonstrate the\nexcellent performance of our StoryImager. 
The code is available at\nhttps://github.com/tobran/StoryImager.\n","authors":["Ming Tao","Bing-Kun Bao","Hao Tang","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.05979v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.14085v2","updated":"2024-04-09T02:59:41Z","published":"2024-03-21T02:31:17Z","title":"Surface Reconstruction from Point Clouds via Grid-based Intersection\n Prediction","summary":" Surface reconstruction from point clouds is a crucial task in the fields of\ncomputer vision and computer graphics. SDF-based methods excel at\nreconstructing smooth meshes with minimal error and artefacts but struggle with\nrepresenting open surfaces. On the other hand, UDF-based methods can\neffectively represent open surfaces but often introduce noise, leading to\nartefacts in the mesh. In this work, we propose a novel approach that directly\npredicts the intersection points between line segment of point pairs and\nimplicit surfaces. To achieve it, we propose two modules named Relative\nIntersection Module and Sign Module respectively with the feature of point pair\nas input. To preserve the continuity of the surface, we also integrate symmetry\ninto the two modules, which means the position of predicted intersection will\nnot change even if the input order of the point pair changes. This method not\nonly preserves the ability to represent open surfaces but also eliminates most\nartefacts on the mesh. Our approach demonstrates state-of-the-art performance\non three datasets: ShapeNet, MGN, and ScanNet. The code will be made available\nupon acceptance.\n","authors":["Hui Tian","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03394v2","updated":"2024-04-09T02:56:27Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05967v1","updated":"2024-04-09T02:55:12Z","published":"2024-04-09T02:55:12Z","title":"JSTR: Judgment Improves Scene Text Recognition","summary":" In this paper, we present a method for enhancing the accuracy of scene text\nrecognition tasks by judging whether the image and text match each other. 
While\nprevious studies focused on generating the recognition results from input\nimages, our approach also considers the model's misrecognition results to\nunderstand its error tendencies, thus improving the text recognition pipeline.\nThis method boosts text recognition accuracy by providing explicit feedback on\nthe data that the model is likely to misrecognize by predicting correct or\nincorrect between the image and text. The experimental results on publicly\navailable datasets demonstrate that our proposed method outperforms the\nbaseline and state-of-the-art methods in scene text recognition.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2404.05967v1.pdf","comment":"IntelliSys 2024"},{"id":"http://arxiv.org/abs/2404.05960v1","updated":"2024-04-09T02:47:52Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most of 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom the novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit the masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learn target-aware 3D features,\nand extensively capture mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noise point cloud background information. The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) in KITTI, NuScenes, and Waymo\nwhile running at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12554v4","updated":"2024-04-09T02:42:28Z","published":"2023-01-29T22:05:28Z","title":"Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive\n Smoothing","summary":" While prior research has proposed a plethora of methods that build neural\nclassifiers robust against adversarial attacks, practitioners are still\nreluctant to adopt them due to their unacceptably severe clean accuracy\npenalties. This paper significantly alleviates this accuracy-robustness\ntrade-off by mixing the output probabilities of a standard classifier and a\nrobust classifier, where the standard network is optimized for clean accuracy\nand is not robust in general. We show that the robust base classifier's\nconfidence difference for correct and incorrect examples is the key to this\nimprovement. 
In addition to providing intuitions and empirical evidence, we\ntheoretically certify the robustness of the mixed classifier under realistic\nassumptions. Furthermore, we adapt an adversarial input detector into a mixing\nnetwork that adaptively adjusts the mixture of the two base models, further\nreducing the accuracy penalty of achieving robustness. The proposed flexible\nmethod, termed \"adaptive smoothing\", can work in conjunction with existing or\neven future methods that improve clean accuracy, robustness, or adversary\ndetection. Our empirical evaluation considers strong attack methods, including\nAutoAttack and adaptive attack. On the CIFAR-100 dataset, our method achieves\nan 85.21% clean accuracy while maintaining a 38.72% $\\ell_\\infty$-AutoAttacked\n($\\epsilon = 8/255$) accuracy, becoming the second most robust method on the\nRobustBench CIFAR-100 benchmark as of submission, while improving the clean\naccuracy by ten percentage points compared with all listed models. The code\nthat implements our method is available at\nhttps://github.com/Bai-YT/AdaptiveSmoothing.\n","authors":["Yatong Bai","Brendon G. Anderson","Aerin Kim","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2301.12554v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06136v3","updated":"2024-04-09T02:38:16Z","published":"2024-02-09T01:48:44Z","title":"SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor\n Scenes","summary":" We propose SIR, an efficient method to decompose differentiable shadows for\ninverse rendering on indoor scenes using multi-view data, addressing the\nchallenges in accurately decomposing the materials and lighting conditions.\nUnlike previous methods that struggle with shadow fidelity in complex lighting\nenvironments, our approach explicitly learns shadows for enhanced realism in\nmaterial estimation under unknown light positions. Utilizing posed HDR images\nas input, SIR employs an SDF-based neural radiance field for comprehensive\nscene representation. Then, SIR integrates a shadow term with a three-stage\nmaterial estimation approach to improve SVBRDF quality. Specifically, SIR is\ndesigned to learn a differentiable shadow, complemented by BRDF regularization,\nto optimize inverse rendering accuracy. Extensive experiments on both synthetic\nand real-world indoor scenes demonstrate the superior performance of SIR over\nexisting methods in both quantitative metrics and qualitative analysis. The\nsignificant decomposing ability of SIR enables sophisticated editing\ncapabilities like free-view relighting, object insertion, and material\nreplacement. The code and data are available at\nhttps://xiaokangwei.github.io/SIR/.\n","authors":["Xiaokang Wei","Zhuoman Liu","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2402.06136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15033v2","updated":"2024-04-09T02:29:32Z","published":"2024-03-22T08:32:30Z","title":"Toward Tiny and High-quality Facial Makeup with Data Amplify Learning","summary":" Contemporary makeup approaches primarily hinge on unpaired learning\nparadigms, yet they grapple with the challenges of inaccurate supervision\n(e.g., face misalignment) and sophisticated facial prompts (including face\nparsing, and landmark detection). These challenges prohibit low-cost deployment\nof facial makeup models, especially on mobile devices. 
To solve above problems,\nwe propose a brand-new learning paradigm, termed \"Data Amplify Learning (DAL),\"\nalongside a compact makeup model named \"TinyBeauty.\" The core idea of DAL lies\nin employing a Diffusion-based Data Amplifier (DDA) to \"amplify\" limited images\nfor the model training, thereby enabling accurate pixel-to-pixel supervision\nwith merely a handful of annotations. Two pivotal innovations in DDA facilitate\nthe above training approach: (1) A Residual Diffusion Model (RDM) is designed\nto generate high-fidelity detail and circumvent the detail vanishing problem in\nthe vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is\nproposed to achieve precise makeup control and combination while retaining face\nidentity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to\nachieve a state-of-the-art performance without intricate face prompts.\nMeanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on\nthe iPhone 13. Extensive experiments show that DAL can produce highly\ncompetitive makeup models using only 5 image pairs.\n","authors":["Qiaoqiao Jin","Xuanhong Chen","Meiguang Jin","Ying Chen","Rui Shi","Yucheng Zheng","Yupeng Zhu","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03662v2","updated":"2024-04-09T01:43:11Z","published":"2024-03-06T12:31:02Z","title":"Harnessing Meta-Learning for Improving Full-Frame Video Stabilization","summary":" Video stabilization is a longstanding computer vision problem, particularly\npixel-level synthesis solutions for video stabilization which synthesize full\nframes add to the complexity of this task. These techniques aim to stabilize\nvideos by synthesizing full frames while enhancing the stability of the\nconsidered video. This intensifies the complexity of the task due to the\ndistinct mix of unique motion profiles and visual content present in each video\nsequence, making robust generalization with fixed parameters difficult. In our\nstudy, we introduce a novel approach to enhance the performance of pixel-level\nsynthesis solutions for video stabilization by adapting these models to\nindividual input video sequences. The proposed adaptation exploits low-level\nvisual cues accessible during test-time to improve both the stability and\nquality of resulting videos. We highlight the efficacy of our methodology of\n\"test-time adaptation\" through simple fine-tuning of one of these models,\nfollowed by significant stability gain via the integration of meta-learning\ntechniques. Notably, significant improvement is achieved with only a single\nadaptation step. The versatility of the proposed algorithm is demonstrated by\nconsistently improving the performance of various pixel-level synthesis models\nfor video stabilization in real-world scenarios.\n","authors":["Muhammad Kashif Ali","Eun Woo Im","Dongjin Kim","Tae Hyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.03662v2.pdf","comment":"CVPR 2024, Code will be made availble on:\n http://github.com/MKashifAli/MetaVideoStab"},{"id":"http://arxiv.org/abs/2309.13475v3","updated":"2024-04-09T01:26:58Z","published":"2023-09-23T20:33:38Z","title":"Detecting and Mitigating System-Level Anomalies of Vision-Based\n Controllers","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. 
Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade to catastrophic\nsystem failures and compromise system safety. In this work, we introduce a\nrun-time anomaly monitor to detect and mitigate such closed-loop, system-level\nfailures. Specifically, we leverage a reachability-based framework to\nstress-test the vision-based controller offline and mine its system-level\nfailures. This data is then used to train a classifier that is leveraged online\nto flag inputs that might cause system breakdowns. The anomaly detector\nhighlights issues that transcend individual modules and pertain to the safety\nof the overall system. We also design a fallback controller that robustly\nhandles these detected anomalies to preserve system safety. We validate the\nproposed approach on an autonomous aircraft taxiing system that uses a\nvision-based controller for taxiing. Our results show the efficacy of the\nproposed approach in identifying and handling system-level anomalies,\noutperforming methods such as prediction error-based detection, and ensembling,\nthereby enhancing the overall safety and robustness of autonomous systems.\n","authors":["Aryaman Gupta","Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2309.13475v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10240v2","updated":"2024-04-09T01:16:07Z","published":"2023-12-15T22:18:38Z","title":"Rich Human Feedback for Text-to-Image Generation","summary":" Recent Text-to-Image (T2I) generation models such as Stable Diffusion and\nImagen have made significant progress in generating high-resolution images\nbased on text descriptions. However, many generated images still suffer from\nissues such as artifacts/implausibility, misalignment with text descriptions,\nand low aesthetic quality. Inspired by the success of Reinforcement Learning\nwith Human Feedback (RLHF) for large language models, prior works collected\nhuman-provided scores as feedback on generated images and trained a reward\nmodel to improve the T2I generation. In this paper, we enrich the feedback\nsignal by (i) marking image regions that are implausible or misaligned with the\ntext, and (ii) annotating which words in the text prompt are misrepresented or\nmissing on the image. We collect such rich human feedback on 18K generated\nimages (RichHF-18K) and train a multimodal transformer to predict the rich\nfeedback automatically. We show that the predicted rich human feedback can be\nleveraged to improve image generation, for example, by selecting high-quality\ntraining data to finetune and improve the generative models, or by creating\nmasks with predicted heatmaps to inpaint the problematic regions. 
Notably, the\nimprovements generalize to models (Muse) beyond those used to generate the\nimages on which human feedback data were collected (Stable Diffusion variants).\nThe RichHF-18K data set will be released in our GitHub repository:\nhttps://github.com/google-research/google-research/tree/master/richhf_18k.\n","authors":["Youwei Liang","Junfeng He","Gang Li","Peizhao Li","Arseniy Klimovskiy","Nicholas Carolan","Jiao Sun","Jordi Pont-Tuset","Sarah Young","Feng Yang","Junjie Ke","Krishnamurthy Dj Dvijotham","Katie Collins","Yiwen Luo","Yang Li","Kai J Kohlhoff","Deepak Ramachandran","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10240v2.pdf","comment":"CVPR'24"},{"id":"http://arxiv.org/abs/2402.17228v3","updated":"2024-04-09T01:10:15Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13072v2","updated":"2024-04-09T01:09:41Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. 
It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, which underlining its high\nquantitative and clinical precision in both denoising scenario.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05916v1","updated":"2024-04-09T00:30:16Z","published":"2024-04-09T00:30:16Z","title":"Prompt-driven Universal Model for View-Agnostic Echocardiography\n Analysis","summary":" Echocardiography segmentation for cardiac analysis is time-consuming and\nresource-intensive due to the variability in image quality and the necessity to\nprocess scans from various standard views. While current automated segmentation\nmethods in echocardiography show promising performance, they are trained on\nspecific scan views to analyze corresponding data. However, this solution has a\nlimitation as the number of required models increases with the number of\nstandard views. To address this, in this paper, we present a prompt-driven\nuniversal method for view-agnostic echocardiography analysis. Considering the\ndomain shift between standard views, we first introduce a method called prompt\nmatching, aimed at learning prompts specific to different views by matching\nprompts and querying input embeddings using a pre-trained vision model. Then,\nwe utilized a pre-trained medical language model to align textual information\nwith pixel data for accurate segmentation. 
Extensive experiments on three\nstandard views showed that our approach significantly outperforms the\nstate-of-the-art universal methods and achieves comparable or even better\nperformances over the segmentation model trained and tested on same views.\n","authors":["Sekeun Kim","Hui Ren","Peng Guo","Abder-Rahman Ali","Patrick Zhang","Kyungsang Kim","Xiang Li","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05911v1","updated":"2024-04-09T00:05:45Z","published":"2024-04-09T00:05:45Z","title":"LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions\n for Brain Tumor Segmentation","summary":" Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI)\nscans is crucial for prompt and effective treatment. However, this process\nfaces the challenge of precise delineation due to the tumors' complex\nheterogeneity. Moreover, energy sustainability targets and resource\nlimitations, especially in developing countries, require efficient and\naccessible medical imaging solutions. The proposed architecture, a Lightweight\n3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these\nissues. It is specifically designed to reduce computational requirements\nsignificantly while maintaining high segmentation performance. By incorporating\nparallel convolutions, it enhances feature representation by capturing\nmulti-scale information. It further integrates an attention mechanism to refine\nsegmentation through selective feature recalibration. LATUP-Net achieves\npromising segmentation performance: the average Dice scores for the whole\ntumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%,\n83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and\n83.92%, respectively. Hausdorff distance metrics further indicate its improved\nability to delineate tumor boundaries. With its significantly reduced\ncomputational demand using only 3.07 M parameters, about 59 times fewer than\nother state-of-the-art models, and running on a single V100 GPU, LATUP-Net\nstands out as a promising solution for real-world clinical applications,\nparticularly in settings with limited resources. Investigations into the\nmodel's interpretability, utilizing gradient-weighted class activation mapping\nand confusion matrices, reveal that while attention mechanisms enhance the\nsegmentation of small regions, their impact is nuanced. Achieving the most\naccurate tumor delineation requires carefully balancing local and global\nfeatures.\n","authors":["Ebtihal J. Alwadee","Xianfang Sun","Yipeng Qin","Frank C. Langbein"],"pdf_url":"https://arxiv.org/pdf/2404.05911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06657v1","updated":"2024-04-09T23:47:53Z","published":"2024-04-09T23:47:53Z","title":"Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image\n Reconstruction","summary":" Conventional deep learning-based image reconstruction methods require a large\namount of training data which can be hard to obtain in practice. Untrained deep\nlearning methods overcome this limitation by training a network to invert a\nphysical model of the image formation process. Here we present a novel\nuntrained Res-U2Net model for phase retrieval. We use the extracted phase\ninformation to determine changes in an object's surface and generate a mesh\nrepresentation of its 3D structure. 
We compare the performance of Res-U2Net\nphase retrieval against UNet and U2Net using images from the GDXRAY dataset.\n","authors":["Carlos Osorio Quero","Daniel Leykam","Irving Rondon Ojeda"],"pdf_url":"https://arxiv.org/pdf/2404.06657v1.pdf","comment":"16 pages, 8 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2312.00825v2","updated":"2024-04-09T23:28:49Z","published":"2023-11-30T18:32:14Z","title":"SocialCounterfactuals: Probing and Mitigating Intersectional Social\n Biases in Vision-Language Models with Counterfactual Examples","summary":" While vision-language models (VLMs) have achieved remarkable performance\nimprovements recently, there is growing evidence that these models also posses\nharmful biases with respect to social attributes such as gender and race. Prior\nstudies have primarily focused on probing such bias attributes individually\nwhile ignoring biases associated with intersections between social attributes.\nThis could be due to the difficulty of collecting an exhaustive set of\nimage-text pairs for various combinations of social attributes. To address this\nchallenge, we employ text-to-image diffusion models to produce counterfactual\nexamples for probing intersectional social biases at scale. Our approach\nutilizes Stable Diffusion with cross attention control to produce sets of\ncounterfactual image-text pairs that are highly similar in their depiction of a\nsubject (e.g., a given occupation) while differing only in their depiction of\nintersectional social attributes (e.g., race & gender). Through our\nover-generate-then-filter methodology, we produce SocialCounterfactuals, a\nhigh-quality dataset containing 171k image-text pairs for probing\nintersectional biases related to gender, race, and physical characteristics. We\nconduct extensive experiments to demonstrate the usefulness of our generated\ndataset for probing and mitigating intersectional social biases in\nstate-of-the-art VLMs.\n","authors":["Phillip Howard","Avinash Madasu","Tiep Le","Gustavo Lujan Moreno","Anahita Bhiwandiwalla","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2312.00825v2.pdf","comment":"Accepted to CVPR 2024. arXiv admin note: text overlap with\n arXiv:2310.02988"},{"id":"http://arxiv.org/abs/2404.06653v1","updated":"2024-04-09T23:24:19Z","published":"2024-04-09T23:24:19Z","title":"FlameFinder: Illuminating Obscured Fire through Smoke with Attentive\n Deep Metric Learning","summary":" FlameFinder is a deep metric learning (DML) framework designed to accurately\ndetect flames, even when obscured by smoke, using thermal images from\nfirefighter drones during wildfire monitoring. Traditional RGB cameras struggle\nin such conditions, but thermal cameras can capture smoke-obscured flame\nfeatures. However, they lack absolute thermal reference points, leading to\nfalse positives.To address this issue, FlameFinder utilizes paired thermal-RGB\nimages for training. By learning latent flame features from smoke-free samples,\nthe model becomes less biased towards relative thermal gradients. In testing,\nit identifies flames in smoky patches by analyzing their equivalent\nthermal-domain distribution. This method improves performance using both\nsupervised and distance-based clustering metrics.The framework incorporates a\nflame segmentation method and a DML-aided detection framework. This includes\nutilizing center loss (CL), triplet center loss (TCL), and triplet cosine\ncenter loss (TCCL) to identify optimal cluster representatives for\nclassification. 
However, the dominance of center loss over the other losses\nleads to the model missing features sensitive to them. To address this\nlimitation, an attention mechanism is proposed. This mechanism allows for\nnon-uniform feature contribution, amplifying the critical role of cosine and\ntriplet loss in the DML framework. Additionally, it improves interpretability,\nclass discrimination, and decreases intra-class variance. As a result, the\nproposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in\nthe FLAME3 dataset for unobscured flame detection accuracy. Moreover, it\ndemonstrates enhanced class separation in obscured scenarios compared to VGG19,\nResNet18, and three backbone models tailored for flame detection.\n","authors":["Hossein Rajoli","Sahand Khoshdel","Fatemeh Afghah","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06653v1.pdf","comment":"Submitted as a Journal Paper to IEEE Transactions on Geoscience and\n Remote Sensing"},{"id":"http://arxiv.org/abs/2404.05139v2","updated":"2024-04-09T23:17:07Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v2.pdf","comment":"Accepted by ICRA 2024. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.06638v1","updated":"2024-04-09T22:17:20Z","published":"2024-04-09T22:17:20Z","title":"SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron\n Micrograph Segmentation","summary":" Image segmentation is a critical enabler for tasks ranging from medical\ndiagnostics to autonomous driving. However, the correct segmentation semantics\n- where are boundaries located? what segments are logically similar? - change\ndepending on the domain, such that state-of-the-art foundation models can\ngenerate meaningless and incorrect results. Moreover, in certain domains,\nfine-tuning and retraining techniques are infeasible: obtaining labels is\ncostly and time-consuming; domain images (micrographs) can be exponentially\ndiverse; and data sharing (for third-party retraining) is restricted. 
To enable\nrapid adaptation of the best segmentation technology, we propose the concept of\nsemantic boosting: given a zero-shot foundation model, guide its segmentation\nand adjust results to match domain expectations. We apply semantic boosting to\nthe Segment Anything Model (SAM) to obtain microstructure segmentation for\ntransmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and\ntextural features of various intermediate masks to perform mask removal and\nmask merging operations. We demonstrate a zero-shot performance increase of\n(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06%\ndrop in mean false positive masks across images of three difficulty classes\nover vanilla SAM (ViT-L).\n","authors":["Waqwoya Abebe","Jan Strube","Luanzheng Guo","Nathan R. Tallent","Oceane Bel","Steven Spurgeon","Christina Doty","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2404.06638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06637v1","updated":"2024-04-09T22:16:34Z","published":"2024-04-09T22:16:34Z","title":"GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis","summary":" We present GeoSynth, a model for synthesizing satellite images with global\nstyle and image-driven layout control. The global style control is via textual\nprompts or geographic location. These enable the specification of scene\nsemantics or regional appearance respectively, and can be used together. We\ntrain our model on a large dataset of paired satellite imagery, with\nautomatically generated captions, and OpenStreetMap data. We evaluate various\ncombinations of control inputs, including different types of layout controls.\nResults demonstrate that our model can generate diverse, high-quality images\nand exhibits excellent zero-shot generalization. The code and model checkpoints\nare available at https://github.com/mvrl/GeoSynth.\n","authors":["Srikumar Sastry","Subash Khanal","Aayush Dhakal","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2404.06637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05195v2","updated":"2024-04-09T22:14:37Z","published":"2024-02-07T19:07:10Z","title":"$λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion\n Models by Leveraging CLIP Latent Space","summary":" Despite the recent advances in personalized text-to-image (P-T2I) generative\nmodels, it remains challenging to perform finetuning-free multi-subject-driven\nT2I in a resource-efficient manner. Predominantly, contemporary approaches,\ninvolving the training of Hypernetworks and Multimodal Large Language Models\n(MLLMs), require heavy computing resources that range from 600 to 12300 GPU\nhours of training. These subject-driven T2I methods hinge on Latent Diffusion\nModels (LDMs), which facilitate T2I mapping through cross-attention layers.\nWhile LDMs offer distinct advantages, P-T2I methods' reliance on the latent\nspace of these diffusion models significantly escalates resource demands,\nleading to inconsistent results and necessitating numerous iterations for a\nsingle desired image. In this paper, we present $\\lambda$-ECLIPSE, an\nalternative prior-training strategy that works in the latent space of a\npre-trained CLIP model without relying on the diffusion UNet models.\n$\\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast\nand effective multi-subject-driven P-T2I. 
Through extensive experiments, we\nestablish that $\\lambda$-ECLIPSE surpasses existing baselines in composition\nalignment while preserving concept alignment performance, even with\nsignificantly lower resource utilization. $\\lambda$-ECLIPSE performs\nmulti-subject driven P-T2I with just 34M parameters and is trained on a mere 74\nGPU hours. Additionally, $\\lambda$-ECLIPSE demonstrates the unique ability to\nperform multi-concept interpolations.\n","authors":["Maitreya Patel","Sangmin Jung","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.05195v2.pdf","comment":"Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/"},{"id":"http://arxiv.org/abs/2312.04746v2","updated":"2024-04-09T21:48:42Z","published":"2023-12-07T23:16:37Z","title":"Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized\n Narratives from Open-Source Histopathology Videos","summary":" Diagnosis in histopathology requires a global whole slide images (WSIs)\nanalysis, requiring pathologists to compound evidence from different WSI\npatches. The gigapixel scale of WSIs poses a challenge for histopathology\nmulti-modal models. Training multi-model models for histopathology requires\ninstruction tuning datasets, which currently contain information for individual\nimage patches, without a spatial grounding of the concepts within each patch\nand without a wider view of the WSI. Therefore, they lack sufficient diagnostic\ncapacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a\nlarge-scale dataset of 107,131 histopathology-specific instruction\nquestion/answer pairs, grounded within diagnostically relevant image patches\nthat make up the WSI. Our dataset is collected by leveraging educational\nhistopathology videos from YouTube, which provides spatial localization of\nnarrations by automatically extracting the narrators' cursor positions.\nQuilt-Instruct supports contextual reasoning by extracting diagnosis and\nsupporting facts from the entire WSI. Using Quilt-Instruct, we train\nQuilt-LLaVA, which can reason beyond the given single image patch, enabling\ndiagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a\ncomprehensive evaluation dataset created from 985 images and 1283\nhuman-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using\npublic histopathology datasets, where Quilt-LLaVA significantly outperforms\nSOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set\nVQA. Our code, data, and model are publicly accessible at\nquilt-llava.github.io.\n","authors":["Mehmet Saygin Seyfioglu","Wisdom O. Ikezogwo","Fatemeh Ghezloo","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2312.04746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06622v1","updated":"2024-04-09T21:12:31Z","published":"2024-04-09T21:12:31Z","title":"Calibrating Higher-Order Statistics for Few-Shot Class-Incremental\n Learning with Pre-trained Vision Transformers","summary":" Few-shot class-incremental learning (FSCIL) aims to adapt the model to new\nclasses from very few data (5 samples) without forgetting the previously\nlearned classes. Recent works in many-shot CIL (MSCIL) (using all available\ntraining data) exploited pre-trained models to reduce forgetting and achieve\nbetter plasticity. In a similar fashion, we use ViT models pre-trained on\nlarge-scale datasets for few-shot settings, which face the critical issue of\nlow plasticity. 
FSCIL methods start with a many-shot first task to learn a very\ngood feature extractor and then move to the few-shot setting from the second\ntask onwards. While the focus of most recent studies is on how to learn the\nmany-shot first task so that the model generalizes to all future few-shot\ntasks, we explore in this work how to better model the few-shot data using\npre-trained models, irrespective of how the first task is trained. Inspired by\nrecent works in MSCIL, we explore how using higher-order feature statistics can\ninfluence the classification of few-shot classes. We identify the main\nchallenge of obtaining a good covariance matrix from few-shot data and propose\nto calibrate the covariance matrix for new classes based on semantic similarity\nto the many-shot base classes. Using the calibrated feature statistics in\ncombination with existing methods significantly improves few-shot continual\nclassification on several FSCIL benchmarks. Code is available at\nhttps://github.com/dipamgoswami/FSCIL-Calibration.\n","authors":["Dipam Goswami","Bartłomiej Twardowski","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2404.06622v1.pdf","comment":"Accepted at CLVision workshop (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.08092v2","updated":"2024-04-09T20:55:01Z","published":"2024-03-12T22:03:19Z","title":"Mitigating the Impact of Attribute Editing on Face Recognition","summary":" Through a large-scale study over diverse face images, we show that facial\nattribute editing using modern generative AI models can severely degrade\nautomated face recognition systems. This degradation persists even with\nidentity-preserving generative models. To mitigate this issue, we propose two\nnovel techniques for local and global attribute editing. We empirically ablate\ntwenty-six facial semantic, demographic and expression-based attributes that\nhave been edited using state-of-the-art generative models, and evaluate them\nusing ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets.\nFinally, we use LLaVA, an emerging visual question-answering framework for\nattribute prediction to validate our editing techniques. Our methods outperform\nthe current state-of-the-art at facial editing (BLIP, InstantID) while\nimproving identity retention by a significant extent.\n","authors":["Sudipta Banerjee","Sai Pranaswi Mullangi","Shruti Wagle","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2403.08092v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06605v1","updated":"2024-04-09T20:24:29Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. 
The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.56cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v1.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2404.06593v1","updated":"2024-04-09T19:49:01Z","published":"2024-04-09T19:49:01Z","title":"Spatially Optimized Compact Deep Metric Learning Model for Similarity\n Search","summary":" Spatial optimization is often overlooked in many computer vision tasks.\nFilters should be able to recognize the features of an object regardless of\nwhere it is in the image. Similarity search is a crucial task where spatial\nfeatures decide an important output. The capacity of convolution to capture\nvisual patterns across various locations is limited. In contrast to\nconvolution, the involution kernel is dynamically created at each pixel based\non the pixel value and parameters that have been learned. This study\ndemonstrates that utilizing a single layer of involution feature extractor\nalongside a compact convolution model significantly enhances the performance of\nsimilarity search. Additionally, we improve predictions by using the GELU\nactivation function rather than the ReLU. The negligible amount of weight\nparameters in involution with a compact model with better performance makes the\nmodel very useful in real-world implementations. Our proposed model is below 1\nmegabyte in size. We have experimented with our proposed methodology and other\nmodels on CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method\noutperforms across all three datasets.\n","authors":["Md. Farhadul Islam","Md. Tanzim Reza","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Sarah Zabeen","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2404.06593v1.pdf","comment":"5 pages, 3 figures,"},{"id":"http://arxiv.org/abs/2404.06589v1","updated":"2024-04-09T19:33:05Z","published":"2024-04-09T19:33:05Z","title":"Leveraging Latents for Efficient Thermography Classification and\n Segmentation","summary":" Breast cancer is a prominent health concern worldwide, currently being the\nsecondmost common and second-deadliest type of cancer in women. While current\nbreast cancer diagnosis mainly relies on mammography imaging, in recent years\nthe use of thermography for breast cancer imaging has been garnering growing\npopularity. Thermographic imaging relies on infrared cameras to capture\nbody-emitted heat distributions. 
While these heat signatures have proven useful\nfor computer-vision systems for accurate breast cancer segmentation and\nclassification, prior work often relies on handcrafted feature engineering or\ncomplex architectures, potentially limiting the comparability and applicability\nof these methods. In this work, we present a novel algorithm for both breast\ncancer classification and segmentation. Rather than focusing efforts on manual\nfeature and architecture engineering, our algorithm focuses on leveraging an\ninformative, learned feature space, thus making our solution simpler to use and\nextend to other frameworks and downstream tasks, as well as more applicable to\ndata-scarce settings. Our classification produces SOTA results, while we are\nthe first work to produce segmentation regions studied in this paper.\n","authors":["Tamir Shor","Chaim Baskin","Alex Bronstein"],"pdf_url":"https://arxiv.org/pdf/2404.06589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01102v2","updated":"2024-04-09T19:26:36Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette","Ona Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01102v2.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2212.05140v2","updated":"2024-04-09T19:17:07Z","published":"2022-12-09T22:53:40Z","title":"Local Neighborhood Features for 3D Classification","summary":" With advances in deep learning model training strategies, the training of\nPoint cloud classification methods is significantly improving. For example,\nPointNeXt, which adopts prominent training techniques and InvResNet layers into\nPointNet++, achieves over 7% improvement on the real-world ScanObjectNN\ndataset. 
However, most of these models use point coordinates features of\nneighborhood points mapped to higher dimensional space while ignoring the\nneighborhood point features computed before feeding to the network layers. In\nthis paper, we revisit the PointNeXt model to study the usage and benefit of\nsuch neighborhood point features. We train and evaluate PointNeXt on ModelNet40\n(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world\ngrocery dataset, i.e., 3DGrocery100. In addition, we provide an additional\ninference strategy of weight averaging the top two checkpoints of PointNeXt to\nimprove classification accuracy. Together with the abovementioned ideas, we\ngain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model\nwith real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's\nApple10, Fruits, Vegetables, and Packages subsets, respectively. We also\nachieve a comparable 0.2% accuracy gain on ModelNet40.\n","authors":["Shivanand Venkanna Sheshappanavar","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2212.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05490v2","updated":"2024-04-09T18:55:43Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02527v3","updated":"2024-04-09T18:26:27Z","published":"2024-03-04T22:42:17Z","title":"A dataset of over one thousand computed tomography scans of battery\n cells","summary":" Battery technology is increasingly important for global electrification\nefforts. However, batteries are highly sensitive to small manufacturing\nvariations that can induce reliability or safety issues. An important\ntechnology for battery quality control is computed tomography (CT) scanning,\nwhich is widely used for non-destructive 3D inspection across a variety of\nclinical and industrial applications. Historically, however, the utility of CT\nscanning for high-volume manufacturing has been limited by its low throughput\nas well as the difficulty of handling its large file sizes. In this work, we\npresent a dataset of over one thousand CT scans of as-produced commercially\navailable batteries. 
The dataset spans various chemistries (lithium-ion and\nsodium-ion) as well as various battery form factors (cylindrical, pouch, and\nprismatic). We evaluate seven different battery types in total. The\nmanufacturing variability and the presence of battery defects can be observed\nvia this dataset. This dataset may be of interest to scientists and engineers\nworking on battery technology, computer vision, or both.\n","authors":["Amariah Condon","Bailey Buscarino","Eric Moch","William J. Sehnert","Owen Miles","Patrick K. Herring","Peter M. Attia"],"pdf_url":"https://arxiv.org/pdf/2403.02527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08514v2","updated":"2024-04-09T18:23:39Z","published":"2023-12-13T21:02:03Z","title":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for\n Segmentation and Tracking","summary":" Video Object Segmentation (VOS) has emerged as an increasingly important\nproblem with availability of larger datasets and more complex and realistic\nsettings, which involve long videos with global motion (e.g, in egocentric\nsettings), depicting small objects undergoing both rigid and non-rigid\n(including state) deformations. While a number of recent approaches have been\nexplored for this task, these data characteristics still present challenges. In\nthis work we propose a novel, clip-based DETR-style encoder-decoder\narchitecture, which focuses on systematically analyzing and addressing\naforementioned challenges. Specifically, we propose a novel\ntransformation-aware loss that focuses learning on portions of the video where\nan object undergoes significant deformations -- a form of \"soft\" hard examples\nmining. Further, we propose a multiplicative time-coded memory, beyond vanilla\nadditive positional encoding, which helps propagate context across long videos.\nFinally, we incorporate these in our proposed holistic multi-scale video\ntransformer for tracking via multi-scale memory matching and decoding to ensure\nsensitivity and accuracy for long videos and small objects. Our model enables\non-line inference with long videos in a windowed fashion, by breaking the video\ninto clips and propagating context among them. We illustrate that short clip\nlength and longer memory with learned time-coding are important design choices\nfor improved performance. Collectively, these technical contributions enable\nour model to achieve new state-of-the-art (SoTA) performance on two complex\negocentric datasets -- VISOR and VOST, while achieving comparable to SoTA\nresults on the conventional VOS benchmark, DAVIS'17. A series of detailed\nablations validate our design choices as well as provide insights into the\nimportance of parameter choices and their impact on performance.\n","authors":["Raghav Goyal","Wan-Cyuan Fan","Mennatullah Siam","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2312.08514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06559v1","updated":"2024-04-09T18:23:34Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks present an emerging threat to the face recognition\nsystem. On top of that, printing and scanning the morphed images could obscure\nthe artifacts generated during the morphing process, which makes morphed image\ndetection even harder. 
In this work, we investigate the impact that printing\nand scanning has on morphing attacks through a series of heterogeneous tests.\nOur experiments show that we can increase the possibility of a false match by\nup to 5.64% for DiM and 16.00% for StyleGAN2 when providing an image that has\nbeen printed and scanned, regardless it is morphed or bona fide, to a Face\nRecognition (FR) system. Likewise, using Frechet Inception Distance (FID)\nmetric, strictly print-scanned morph attacks performed on average 9.185%\nstronger than non-print-scanned digital morphs.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06542v1","updated":"2024-04-09T18:00:25Z","published":"2024-04-09T18:00:25Z","title":"Training-Free Open-Vocabulary Segmentation with Offline\n Diffusion-Augmented Prototype Generation","summary":" Open-vocabulary semantic segmentation aims at segmenting arbitrary categories\nexpressed in textual form. Previous works have trained over large amounts of\nimage-caption pairs to enforce pixel-level multimodal alignments. However,\ncaptions provide global information about the semantics of a given image but\nlack direct localization of individual concepts. Further, training on\nlarge-scale datasets inevitably brings significant computational costs. In this\npaper, we propose FreeDA, a training-free diffusion-augmented method for\nopen-vocabulary semantic segmentation, which leverages the ability of diffusion\nmodels to visually localize generated concepts and local-global similarities to\nmatch class-agnostic regions with semantic classes. Our approach involves an\noffline stage in which textual-visual reference embeddings are collected,\nstarting from a large set of captions and leveraging visual and semantic\ncontexts. At test time, these are queried to support the visual matching\nprocess, which is carried out by jointly considering class-agnostic regions and\nglobal semantic similarities. Extensive analyses demonstrate that FreeDA\nachieves state-of-the-art performance on five datasets, surpassing previous\nmethods by more than 7.0 average points in terms of mIoU and without requiring\nany training.\n","authors":["Luca Barsellotti","Roberto Amoroso","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.06542v1.pdf","comment":"CVPR 2024. Project page: https://aimagelab.github.io/freeda/"},{"id":"http://arxiv.org/abs/2208.11650v3","updated":"2024-04-09T17:59:34Z","published":"2022-08-24T16:40:27Z","title":"Lane Change Classification and Prediction with Action Recognition\n Networks","summary":" Anticipating lane change intentions of surrounding vehicles is crucial for\nefficient and safe driving decision making in an autonomous driving system.\nPrevious works often adopt physical variables such as driving speed,\nacceleration and so forth for lane change classification. However, physical\nvariables do not contain semantic information. Although 3D CNNs have been\ndeveloping rapidly, the number of methods utilising action recognition models\nand appearance feature for lane change recognition is low, and they all require\nadditional information to pre-process data. In this work, we propose an\nend-to-end framework including two action recognition methods for lane change\nrecognition, using video data collected by cameras. 
Our method achieves the\nbest lane change classification results using only the RGB video data of the\nPREVENTION dataset. Class activation maps demonstrate that action recognition\nmodels can efficiently extract lane change motions. A method to better extract\nmotion clues is also proposed in this paper.\n","authors":["Kai Liang","Jun Wang","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2208.11650v3.pdf","comment":"Accepted to ECCV2022 AVVISION"},{"id":"http://arxiv.org/abs/2404.06486v1","updated":"2024-04-09T17:37:08Z","published":"2024-04-09T17:37:08Z","title":"GO4Align: Group Optimization for Multi-Task Alignment","summary":" This paper proposes \\textit{GO4Align}, a multi-task optimization approach\nthat tackles task imbalance by explicitly aligning the optimization across\ntasks. To achieve this, we design an adaptive group risk minimization strategy,\ncompromising two crucial techniques in implementation: (i) dynamical group\nassignment, which clusters similar tasks based on task interactions; (ii)\nrisk-guided group indicators, which exploit consistent task correlations with\nrisk information from previous iterations. Comprehensive experimental results\non diverse typical benchmarks demonstrate our method's performance superiority\nwith even lower computational costs.\n","authors":["Jiayi Shen","Cheems Wang","Zehao Xiao","Nanne Van Noord","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2404.06486v1.pdf","comment":null}]},"2024-04-10T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07206v1","updated":"2024-04-10T17:59:59Z","published":"2024-04-10T17:59:59Z","title":"GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models","summary":" In this paper, we introduce GoodDrag, a novel approach to improve the\nstability and image quality of drag editing. Unlike existing methods that\nstruggle with accumulated perturbations and often result in distortions,\nGoodDrag introduces an AlDD framework that alternates between drag and\ndenoising operations within the diffusion process, effectively improving the\nfidelity of the result. We also propose an information-preserving motion\nsupervision operation that maintains the original features of the starting\npoint for precise manipulation and artifact reduction. In addition, we\ncontribute to the benchmarking of drag editing by introducing a new dataset,\nDrag100, and developing dedicated quality assessment metrics, Dragging Accuracy\nIndex and Gemini Score, utilizing Large Multimodal Models. Extensive\nexperiments demonstrate that the proposed GoodDrag compares favorably against\nthe state-of-the-art approaches both qualitatively and quantitatively. The\nproject page is https://gooddrag.github.io.\n","authors":["Zewei Zhang","Huan Liu","Jun Chen","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2404.07206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07204v1","updated":"2024-04-10T17:59:45Z","published":"2024-04-10T17:59:45Z","title":"BRAVE: Broadening the visual encoding of vision-language models","summary":" Vision-language models (VLMs) are typically composed of a vision encoder,\ne.g. CLIP, and a language model (LM) that interprets the encoded features to\nsolve downstream tasks. Despite remarkable progress, VLMs are subject to\nseveral shortcomings due to the limited capabilities of vision encoders, e.g.\n\"blindness\" to certain image features, visual hallucination, etc. To address\nthese issues, we study broadening the visual encoding capabilities of VLMs. 
We\nfirst comprehensively benchmark several vision encoders with different\ninductive biases for solving VLM tasks. We observe that there is no single\nencoding configuration that consistently achieves top performance across\ndifferent tasks, and encoders with different biases can perform surprisingly\nsimilarly. Motivated by this, we introduce a method, named BRAVE, that\nconsolidates features from multiple frozen encoders into a more versatile\nrepresentation that can be directly fed as the input to a frozen LM. BRAVE\nachieves state-of-the-art performance on a broad range of captioning and VQA\nbenchmarks and significantly reduces the aforementioned issues of VLMs, while\nrequiring a smaller number of trainable parameters than existing methods and\nhaving a more compressed representation. Our results highlight the potential of\nincorporating different visual biases for a more broad and contextualized\nvisual understanding of VLMs.\n","authors":["Oğuzhan Fatih Kar","Alessio Tonioni","Petra Poklukar","Achin Kulshrestha","Amir Zamir","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.07204v1.pdf","comment":"Project page at https://brave-vlms.epfl.ch/"},{"id":"http://arxiv.org/abs/2404.07202v1","updated":"2024-04-10T17:59:20Z","published":"2024-04-10T17:59:20Z","title":"UMBRAE: Unified Multimodal Decoding of Brain Signals","summary":" We address prevailing challenges of the brain-powered research, departing\nfrom the observation that the literature hardly recover accurate spatial\ninformation and require subject-specific models. To address these challenges,\nwe propose UMBRAE, a unified multimodal decoding of brain signals. First, to\nextract instance-level conceptual and spatial details from neural signals, we\nintroduce an efficient universal brain encoder for multimodal-brain alignment\nand recover object descriptions at multiple levels of granularity from\nsubsequent multimodal large language model (MLLM). Second, we introduce a\ncross-subject training strategy mapping subject-specific features to a common\nfeature space. This allows a model to be trained on multiple subjects without\nextra resources, even yielding superior results compared to subject-specific\nmodels. Further, we demonstrate this supports weakly-supervised adaptation to\nnew subjects, with only a fraction of the total training data. Experiments\ndemonstrate that UMBRAE not only achieves superior results in the newly\nintroduced tasks but also outperforms methods in well established tasks. To\nassess our method, we construct and share with the community a comprehensive\nbrain understanding benchmark BrainHub. Our code and benchmark are available at\nhttps://weihaox.github.io/UMBRAE.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2404.07202v1.pdf","comment":"Project Page: https://weihaox.github.io/UMBRAE"},{"id":"http://arxiv.org/abs/2404.07199v1","updated":"2024-04-10T17:57:41Z","published":"2024-04-10T17:57:41Z","title":"RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth\n Diffusion","summary":" We introduce RealmDreamer, a technique for generation of general\nforward-facing 3D scenes from text descriptions. Our technique optimizes a 3D\nGaussian Splatting representation to match complex text prompts. We initialize\nthese splats by utilizing the state-of-the-art text-to-image generators,\nlifting their samples into 3D, and computing the occlusion volume. 
We then\noptimize this representation across multiple views as a 3D inpainting task with\nimage-conditional diffusion models. To learn correct geometric structure, we\nincorporate a depth diffusion model by conditioning on the samples from the\ninpainting model, giving rich geometric structure. Finally, we finetune the\nmodel using sharpened samples from image generators. Notably, our technique\ndoes not require video or multi-view data and can synthesize a variety of\nhigh-quality 3D scenes in different styles, consisting of multiple objects. Its\ngenerality additionally allows 3D synthesis from a single image.\n","authors":["Jaidev Shriram","Alex Trevithick","Lingjie Liu","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2404.07199v1.pdf","comment":"Project Page: https://realmdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.07191v1","updated":"2024-04-10T17:48:37Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v1.pdf","comment":"Technical report. Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.07188v1","updated":"2024-04-10T17:41:41Z","published":"2024-04-10T17:41:41Z","title":"GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on\n FPGA","summary":" Graph neural networks (GNNs) have recently empowered various novel computer\nvision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN\nlayers or only GNN layers are employed. This paper introduces GCV-Turbo, a\ndomain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV\ntasks. GCV-Turbo consists of two key components: (1) a \\emph{novel} hardware\narchitecture optimized for the computation kernels in both CNNs and GNNs using\nthe same set of computation resources. (2) a PyTorch-compatible compiler that\ntakes a user-defined model as input, performs end-to-end optimization for the\ncomputation graph of a given GNN-based CV task, and produces optimized code for\nhardware execution. The hardware architecture and the compiler work\nsynergistically to support a variety of GNN-based CV tasks. 
We implement\nGCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six\nrepresentative GNN-based CV tasks with diverse input data modalities (e.g.,\nimage, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU)\nimplementations, GCV-Turbo achieves an average latency reduction of\n$68.4\\times$ ($4.1\\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo\nsupports the execution of the standalone CNNs or GNNs, achieving performance\ncomparable to that of state-of-the-art CNN (GNN) accelerators for widely used\nCNN-only (GNN-only) models.\n","authors":["Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.07188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14855v2","updated":"2024-04-10T17:35:16Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v2.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2404.07178v1","updated":"2024-04-10T17:28:16Z","published":"2024-04-10T17:28:16Z","title":"Move Anything with Layered Scene Diffusion","summary":" Diffusion models generate images with an unprecedented level of quality, but\nhow can we freely rearrange image layouts? Recent works generate controllable\nscenes via learning spatially disentangled latent codes, but these methods do\nnot apply to diffusion models due to their fixed forward process. In this work,\nwe propose SceneDiffusion to optimize a layered scene representation during the\ndiffusion sampling process. Our key insight is that spatial disentanglement can\nbe obtained by jointly denoising scene renderings at different spatial layouts.\nOur generated scenes support a wide range of spatial editing operations,\nincluding moving, resizing, cloning, and layer-wise appearance editing\noperations, including object restyling and replacing. 
Moreover, a scene can be\ngenerated conditioned on a reference image, thus enabling object moving for\nin-the-wild images. Notably, this approach is training-free, compatible with\ngeneral text-to-image diffusion models, and responsive in less than a second.\n","authors":["Jiawei Ren","Mengmeng Xu","Jui-Chieh Wu","Ziwei Liu","Tao Xiang","Antoine Toisoul"],"pdf_url":"https://arxiv.org/pdf/2404.07178v1.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2404.07176v1","updated":"2024-04-10T17:25:42Z","published":"2024-04-10T17:25:42Z","title":"Self-supervised Monocular Depth Estimation on Water Scenes via Specular\n Reflection Prior","summary":" Monocular depth estimation from a single image is an ill-posed problem for\ncomputer vision due to insufficient reliable cues as the prior knowledge.\nBesides the inter-frame supervision, namely stereo and adjacent frames,\nextensive prior information is available in the same frame. Reflections from\nspecular surfaces, informative intra-frame priors, enable us to reformulate the\nill-posed depth estimation task as a multi-view synthesis. This paper proposes\nthe first self-supervision for deep-learning depth estimation on water scenes\nvia intra-frame priors, known as reflection supervision and geometrical\nconstraints. In the first stage, a water segmentation network is performed to\nseparate the reflection components from the entire image. Next, we construct a\nself-supervised framework to predict the target appearance from reflections,\nperceived as other perspectives. The photometric re-projection error,\nincorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to\noptimize pose and depth estimation by aligning the transformed virtual depths\nand source ones. As a supplement, the water surface is determined from real and\nvirtual camera positions, which complement the depth of the water area.\nFurthermore, to alleviate these laborious ground truth annotations, we\nintroduce a large-scale water reflection scene (WRS) dataset rendered from\nUnreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility\nof the proposed method compared to state-of-the-art depth estimation\ntechniques.\n","authors":["Zhengyang Lu","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07176v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2212.11120v2","updated":"2024-04-10T17:15:23Z","published":"2022-12-10T07:50:29Z","title":"Deep Learning for Inertial Sensor Alignment","summary":" Accurate alignment of a fixed mobile device equipped with inertial sensors\ninside a moving vehicle is important for navigation, activity recognition, and\nother applications. Accurate estimation of the device mounting angle is\nrequired to rotate the inertial measurement from the sensor frame to the moving\nplatform frame to standardize measurements and improve the performance of the\ntarget task. In this work, a data-driven approach using deep neural networks\n(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped\nwith an inertial measurement unit (IMU) and strapped to a car. The proposed\nmodel uses only the accelerometer and gyroscope readings from an IMU as input\nand, in contrast to existing solutions, does not require global position inputs\nfrom global navigation satellite systems (GNSS). 
To train the model in a\nsupervised manner, IMU data is collected for training and validation with the\nsensor mounted at a known yaw mounting angle, and a range of ground truth\nlabels is generated by applying a random rotation in a bounded range to the\nmeasurements. The trained model is tested on data with real rotations showing\nsimilar performance as with synthetic rotations. The trained model is deployed\non an Android device and evaluated in real-time to test the accuracy of the\nestimated yaw mounting angle. The model is shown to find the mounting angle at\nan accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An\nexperiment is conducted to compare the proposed model with an existing\noff-the-shelf solution.\n","authors":["Maxim Freydin","Niv Sfaradi","Nimrod Segol","Areej Eweida","Barak Or"],"pdf_url":"https://arxiv.org/pdf/2212.11120v2.pdf","comment":"9 Pages, Preprint. Accepted IEEE"},{"id":"http://arxiv.org/abs/2404.07155v1","updated":"2024-04-10T16:44:11Z","published":"2024-04-10T16:44:11Z","title":"Unified Language-driven Zero-shot Domain Adaptation","summary":" This paper introduces Unified Language-driven Zero-shot Domain Adaptation\n(ULDA), a novel task setting that enables a single model to adapt to diverse\ntarget domains without explicit domain-ID knowledge. We identify the\nconstraints in the existing language-driven zero-shot domain adaptation task,\nparticularly the requirement for domain IDs and domain-specific models, which\nmay restrict flexibility and scalability. To overcome these issues, we propose\na new framework for ULDA, consisting of Hierarchical Context Alignment (HCA),\nDomain Consistent Representation Learning (DCRL), and Text-Driven Rectifier\n(TDR). These components work synergistically to align simulated features with\ntarget text across multiple visual levels, retain semantic correlations between\ndifferent regional representations, and rectify biases between simulated and\nreal target visual features, respectively. Our extensive empirical evaluations\ndemonstrate that this framework achieves competitive performance in both\nsettings, surpassing even the model that requires domain-ID, showcasing its\nsuperiority and generalization ability. The proposed method is not only\neffective but also maintains practicality and efficiency, as it does not\nintroduce additional computational costs during inference. Our project page is\nhttps://senqiaoyang.com/project/ULDA .\n","authors":["Senqiao Yang","Zhuotao Tian","Li Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2404.07155v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07153v1","updated":"2024-04-10T16:39:50Z","published":"2024-04-10T16:39:50Z","title":"Lost in Translation: Modern Neural Networks Still Struggle With Small\n Realistic Image Transformations","summary":" Deep neural networks that achieve remarkable performance in image\nclassification have previously been shown to be easily fooled by tiny\ntransformations such as a one pixel translation of the input image. In order to\naddress this problem, two approaches have been proposed in recent years. The\nfirst approach suggests using huge datasets together with data augmentation in\nthe hope that a highly varied training set will teach the network to learn to\nbe invariant. The second approach suggests using architectural modifications\nbased on sampling theory to deal explicitly with image translations. 
In this\npaper, we show that these approaches still fall short in robustly handling\n'natural' image translations that simulate a subtle change in camera\norientation. Our findings reveal that a mere one-pixel translation can result\nin a significant change in the predicted image representation for approximately\n40% of the test images in state-of-the-art models (e.g. open-CLIP trained on\nLAION-2B or DINO-v2) , while models that are explicitly constructed to be\nrobust to cyclic translations can still be fooled with 1 pixel realistic\n(non-cyclic) translations 11% of the time. We present Robust Inference by Crop\nSelection: a simple method that can be proven to achieve any desired level of\nconsistency, although with a modest tradeoff with the model's accuracy.\nImportantly, we demonstrate how employing this method reduces the ability to\nfool state-of-the-art models with a 1 pixel translation to less than 5% while\nsuffering from only a 1% drop in classification accuracy. Additionally, we show\nthat our method can be easy adjusted to deal with circular shifts as well. In\nsuch case we achieve 100% robustness to integer shifts with state-of-the-art\naccuracy, and with no need for any further training.\n","authors":["Ofir Shifman","Yair Weiss"],"pdf_url":"https://arxiv.org/pdf/2404.07153v1.pdf","comment":"14 pages, 6 appendices, 17 figures"},{"id":"http://arxiv.org/abs/2312.00068v2","updated":"2024-04-10T16:04:48Z","published":"2023-11-29T20:59:00Z","title":"GLiDR: Topologically Regularized Graph Generative Network for Sparse\n LiDAR Point Clouds","summary":" Sparse LiDAR point clouds cause severe loss of detail of static structures\nand reduce the density of static points available for navigation. Reduced\ndensity can be detrimental to navigation under several scenarios. We observe\nthat despite high sparsity, in most cases, the global topology of LiDAR\noutlining the static structures can be inferred. We utilize this property to\nobtain a backbone skeleton of a LiDAR scan in the form of a single connected\ncomponent that is a proxy to its global topology. We utilize the backbone to\naugment new points along static structures to overcome sparsity. Newly\nintroduced points could correspond to existing static structures or to static\npoints that were earlier obstructed by dynamic objects. To the best of our\nknowledge, we are the first to use such a strategy for sparse LiDAR point\nclouds. Existing solutions close to our approach fail to identify and preserve\nthe global static LiDAR topology and generate sub-optimal points. We propose\nGLiDR, a Graph Generative network that is topologically regularized using\n0-dimensional Persistent Homology ($\\mathcal{PH}$) constraints. This enables\nGLiDR to introduce newer static points along a topologically consistent global\nstatic LiDAR backbone. GLiDR generates precise static points using $32\\times$\nsparser dynamic scans and performs better than the baselines across three\ndatasets. GLiDR generates a valuable byproduct - an accurate binary\nsegmentation mask of static and dynamic objects that are helpful for navigation\nplanning and safety in constrained environments. The newly introduced static\npoints allow GLiDR to outperform LiDAR-based navigation using SLAM in several\nsettings. 
Source code is available at\n$\\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$.\n","authors":["Prashant Kumar","Kshitij Madhav Bhat","Vedang Bhupesh Shenvi Nadkarni","Prem Kalra"],"pdf_url":"https://arxiv.org/pdf/2312.00068v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)"},{"id":"http://arxiv.org/abs/2404.07124v1","updated":"2024-04-10T16:04:21Z","published":"2024-04-10T16:04:21Z","title":"Measuring proximity to standard planes during fetal brain ultrasound\n scanning","summary":" This paper introduces a novel pipeline designed to bring ultrasound (US)\nplane pose estimation closer to clinical use for more effective navigation to\nthe standard planes (SPs) in the fetal brain. We propose a semi-supervised\nsegmentation model utilizing both labeled SPs and unlabeled 3D US volume\nslices. Our model enables reliable segmentation across a diverse set of fetal\nbrain images. Furthermore, the model incorporates a classification mechanism to\nidentify the fetal brain precisely. Our model not only filters out frames\nlacking the brain but also generates masks for those containing it, enhancing\nthe relevance of plane pose regression in clinical settings. We focus on fetal\nbrain navigation from 2D ultrasound (US) video analysis and combine this model\nwith a US plane pose regression network to provide sensorless proximity\ndetection to SPs and non-SPs planes; we emphasize the importance of proximity\ndetection to SPs for guiding sonographers, offering a substantial advantage\nover traditional methods by allowing earlier and more precise adjustments\nduring scanning. We demonstrate the practical applicability of our approach\nthrough validation on real fetal scan videos obtained from sonographers of\nvarying expertise levels. Our findings demonstrate the potential of our\napproach to complement existing fetal US technologies and advance prenatal\ndiagnostic practices.\n","authors":["Chiara Di Vece","Antonio Cirigliano","Meala Le Lous","Raffaele Napolitano","Anna L. David","Donald Peebles","Pierre Jannin","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.07124v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07122v1","updated":"2024-04-10T16:01:37Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. 
Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v3","updated":"2024-04-10T15:59:31Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.11468v3","updated":"2024-04-10T15:58:09Z","published":"2023-11-13T20:41:48Z","title":"Bias-Reduced Neural Networks for Parameter Estimation in Quantitative\n MRI","summary":" Purpose: To develop neural network (NN)-based quantitative MRI parameter\nestimators with minimal bias and a variance close to the Cram\\'er-Rao bound.\n Theory and Methods: We generalize the mean squared error loss to control the\nbias and variance of the NN's estimates, which involves averaging over multiple\nnoise realizations of the same measurements during training. Bias and variance\nproperties of the resulting NNs are studied for two neuroimaging applications.\n Results: In simulations, the proposed strategy reduces the estimates' bias\nthroughout parameter space and achieves a variance close to the Cram\\'er-Rao\nbound. 
In vivo, we observe good concordance between parameter maps estimated\nwith the proposed NNs and traditional estimators, such as non-linear\nleast-squares fitting, while state-of-the-art NNs show larger deviations.\n Conclusion: The proposed NNs have greatly reduced bias compared to those\ntrained using the mean squared error and offer significantly improved\ncomputational efficiency over traditional estimators with comparable or better\naccuracy.\n","authors":["Andrew Mao","Sebastian Flassbeck","Jakob Assländer"],"pdf_url":"https://arxiv.org/pdf/2312.11468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07112v1","updated":"2024-04-10T15:51:46Z","published":"2024-04-10T15:51:46Z","title":"Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images","summary":" Deep subspace clustering methods are now prominent in clustering, typically\nusing fully connected networks and a self-representation loss function.\nHowever, these methods often struggle with overfitting and lack\ninterpretability. In this paper, we explore an alternative clustering approach\nbased on deep unfolding. By unfolding iterative optimization methods into\nneural networks, this approach offers enhanced interpretability and reliability\ncompared to data-driven deep learning methods, and greater adaptability and\ngeneralization than model-based approaches. Hence, unfolding has become widely\nused in inverse imaging problems, such as image restoration, reconstruction,\nand super-resolution, but has not been sufficiently explored yet in the context\nof clustering. In this work, we introduce an innovative clustering architecture\nfor hyperspectral images (HSI) by unfolding an iterative solver based on the\nAlternating Direction Method of Multipliers (ADMM) for sparse subspace\nclustering. To our knowledge, this is the first attempt to apply unfolding ADMM\nfor computing the self-representation matrix in subspace clustering. Moreover,\nour approach captures well the structural characteristics of HSI data by\nemploying the K nearest neighbors algorithm as part of a structure preservation\nmodule. Experimental evaluation of three established HSI datasets shows clearly\nthe potential of the unfolding approach in HSI clustering and even demonstrates\nsuperior performance compared to state-of-the-art techniques.\n","authors":["Xianlu Li","Nicolas Nadisic","Shaoguang Huang","Aleksandra Pižurica"],"pdf_url":"https://arxiv.org/pdf/2404.07112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07110v1","updated":"2024-04-10T15:47:35Z","published":"2024-04-10T15:47:35Z","title":"Wild Visual Navigation: Fast Traversability Learning via Pre-Trained\n Models and Online Self-Supervision","summary":" Natural environments such as forests and grasslands are challenging for\nrobotic navigation because of the false perception of rigid obstacles from high\ngrass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN),\nan online self-supervised learning system for visual traversability estimation.\nThe system is able to continuously adapt from a short human demonstration in\nthe field, only using onboard sensing and computing. One of the key ideas to\nachieve this is the use of high-dimensional features from pre-trained\nself-supervised models, which implicitly encode semantic information that\nmassively simplifies the learning task. Further, the development of an online\nscheme for supervision generator enables concurrent training and inference of\nthe learned model in the wild. 
We demonstrate our approach through diverse\nreal-world deployments in forests, parks, and grasslands. Our system is able to\nbootstrap the traversable terrain segmentation in less than 5 min of in-field\ntraining time, enabling the robot to navigate in complex, previously unseen\noutdoor terrains. Code: https://bit.ly/498b0CV - Project\npage:https://bit.ly/3M6nMHH\n","authors":["Matías Mattamala","Jonas Frey","Piotr Libera","Nived Chebrolu","Georg Martius","Cesar Cadena","Marco Hutter","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.07110v1.pdf","comment":"Extended version of arXiv:2305.08510"},{"id":"http://arxiv.org/abs/2404.07106v1","updated":"2024-04-10T15:45:03Z","published":"2024-04-10T15:45:03Z","title":"3DMambaComplete: Exploring Structured State Space Model for Point Cloud\n Completion","summary":" Point cloud completion aims to generate a complete and high-fidelity point\ncloud from an initially incomplete and low-quality input. A prevalent strategy\ninvolves leveraging Transformer-based models to encode global features and\nfacilitate the reconstruction process. However, the adoption of pooling\noperations to obtain global feature representations often results in the loss\nof local details within the point cloud. Moreover, the attention mechanism\ninherent in Transformers introduces additional computational complexity,\nrendering it challenging to handle long sequences effectively. To address these\nissues, we propose 3DMambaComplete, a point cloud completion network built on\nthe novel Mamba framework. It comprises three modules: HyperPoint Generation\nencodes point cloud features using Mamba's selection mechanism and predicts a\nset of Hyperpoints. A specific offset is estimated, and the down-sampled points\nbecome HyperPoints. The HyperPoint Spread module disperses these HyperPoints\nacross different spatial locations to avoid concentration. Finally, a\ndeformation method transforms the 2D mesh representation of HyperPoints into a\nfine-grained 3D structure for point cloud reconstruction. Extensive experiments\nconducted on various established benchmarks demonstrate that 3DMambaComplete\nsurpasses state-of-the-art point cloud completion methods, as confirmed by\nqualitative and quantitative analyses.\n","authors":["Yixuan Li","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2404.07106v1.pdf","comment":"10 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07097v1","updated":"2024-04-10T15:37:00Z","published":"2024-04-10T15:37:00Z","title":"Learning Priors for Non Rigid SfM from Casual Videos","summary":" We tackle the long-standing challenge of reconstructing 3D structures and\ncamera positions from videos. The problem is particularly hard when objects are\ntransformed in a non-rigid way. Current approaches to this problem make\nunrealistic assumptions or require a long optimization time.\n We present TracksTo4D, a novel deep learning-based approach that enables\ninferring 3D structure and camera positions from dynamic content originating\nfrom in-the-wild videos using a single feed-forward pass on a sparse point\ntrack matrix. To achieve this, we leverage recent advances in 2D point tracking\nand design an equivariant neural architecture tailored for directly processing\n2D point tracks by leveraging their symmetries. TracksTo4D is trained on a\ndataset of in-the-wild videos utilizing only the 2D point tracks extracted from\nthe videos, without any 3D supervision. 
Our experiments demonstrate that\nTracksTo4D generalizes well to unseen videos of unseen semantic categories at\ninference time, producing equivalent results to state-of-the-art methods while\nsignificantly reducing the runtime compared to other baselines.\n","authors":["Yoni Kasten","Wuyue Lu","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2404.07097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07094v1","updated":"2024-04-10T15:34:10Z","published":"2024-04-10T15:34:10Z","title":"MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation\n from 2D Keypoints","summary":" This paper presents Key2Mesh, a model that takes a set of 2D human pose\nkeypoints as input and estimates the corresponding body mesh. Since this\nprocess does not involve any visual (i.e. RGB image) data, the model can be\ntrained on large-scale motion capture (MoCap) datasets, thereby overcoming the\nscarcity of image datasets with 3D labels. To enable the model's application on\nRGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D\nkeypoints, and then feed these 2D keypoints to Key2Mesh. To improve the\nperformance of our model on RGB images, we apply an adversarial domain\nadaptation (DA) method to bridge the gap between the MoCap and visual domains.\nCrucially, our DA method does not require 3D labels for visual data, which\nenables adaptation to target sets without the need for costly labels. We\nevaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints,\nin the absence of RGB and mesh label pairs. Our results on widely used H3.6M\nand 3DPW datasets show that Key2Mesh sets the new state-of-the-art by\noutperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE\nfor the 3DPW dataset. Thanks to our model's simple architecture, it operates at\nleast 12x faster than the prior state-of-the-art model, LGD. Additional\nqualitative samples and code are available on the project website:\nhttps://key2mesh.github.io/.\n","authors":["Bedirhan Uguz","Ozhan Suat","Batuhan Karagoz","Emre Akbas"],"pdf_url":"https://arxiv.org/pdf/2404.07094v1.pdf","comment":"accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2401.07745v2","updated":"2024-04-10T15:30:23Z","published":"2024-01-15T14:56:15Z","title":"MaskClustering: View Consensus based Mask Graph Clustering for\n Open-Vocabulary 3D Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation is cutting-edge for its ability to\nsegment 3D instances without predefined categories. However, progress in 3D\nlags behind its 2D counterpart due to limited annotated 3D data. To address\nthis, recent works first generate 2D open-vocabulary masks through 2D models\nand then merge them into 3D instances based on metrics calculated between two\nneighboring frames. In contrast to these local metrics, we propose a novel\nmetric, view consensus rate, to enhance the utilization of multi-view\nobservations. The key insight is that two 2D masks should be deemed part of the\nsame 3D instance if a significant number of other 2D masks from different views\ncontain both these two masks. Using this metric as edge weight, we construct a\nglobal mask graph where each mask is a node. Through iterative clustering of\nmasks showing high view consensus, we generate a series of clusters, each\nrepresenting a distinct 3D instance. 
Notably, our model is training-free.\nThrough extensive experiments on publicly available datasets, including\nScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves\nstate-of-the-art performance in open-vocabulary 3D instance segmentation. Our\nproject page is at https://pku-epic.github.io/MaskClustering.\n","authors":["Mi Yan","Jiazhao Zhang","Yan Zhu","He Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02233v2","updated":"2024-04-10T15:22:05Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology to understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amendable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v2.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.10831v3","updated":"2024-04-10T15:19:07Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. 
Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v3.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2402.18320v2","updated":"2024-04-10T15:09:22Z","published":"2024-02-28T13:33:43Z","title":"Location-guided Head Pose Estimation for Fisheye Image","summary":" Camera with a fisheye or ultra-wide lens covers a wide field of view that\ncannot be modeled by the perspective projection. Serious fisheye lens\ndistortion in the peripheral region of the image leads to degraded performance\nof the existing head pose estimation models trained on undistorted images. This\npaper presents a new approach for head pose estimation that uses the knowledge\nof head location in the image to reduce the negative effect of fisheye\ndistortion. We develop an end-to-end convolutional neural network to estimate\nthe head pose with the multi-task learning of head pose and head location. Our\nproposed network estimates the head pose directly from the fisheye image\nwithout the operation of rectification or calibration. We also created a\nfisheye-distorted version of the three popular head pose estimation datasets,\nBIWI, 300W-LP, and AFLW2000 for our experiments. Experiments results show that\nour network remarkably improves the accuracy of head pose estimation compared\nwith other state-of-the-art one-stage and two-stage methods.\n","authors":["Bing Li","Dong Zhang","Cheng Huang","Yun Xian","Ming Li","Dah-Jye Lee"],"pdf_url":"https://arxiv.org/pdf/2402.18320v2.pdf","comment":"Revised Introduction and Related Work; Submitted to lEEE Transactions\n on Cognitive and Developmental Systems for review"},{"id":"http://arxiv.org/abs/2404.07078v1","updated":"2024-04-10T15:09:15Z","published":"2024-04-10T15:09:15Z","title":"VLLMs Provide Better Context for Emotion Understanding Through Common\n Sense Reasoning","summary":" Recognising emotions in context involves identifying the apparent emotions of\nan individual, taking into account contextual cues from the surrounding scene.\nPrevious approaches to this task have involved the design of explicit\nscene-encoding architectures or the incorporation of external scene-related\ninformation, such as captions. However, these methods often utilise limited\ncontextual information or rely on intricate training pipelines. In this work,\nwe leverage the groundbreaking capabilities of Vision-and-Large-Language Models\n(VLLMs) to enhance in-context emotion classification without introducing\ncomplexity to the training process in a two-stage approach. In the first stage,\nwe propose prompting VLLMs to generate descriptions in natural language of the\nsubject's apparent emotion relative to the visual context. In the second stage,\nthe descriptions are used as contextual information and, along with the image\ninput, are used to train a transformer-based architecture that fuses text and\nvisual features before the final classification task. Our experimental results\nshow that the text and image features have complementary information, and our\nfused architecture significantly outperforms the individual modalities without\nany complex training methods. We evaluate our approach on three different\ndatasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or\ncomparable accuracy across all datasets and metrics compared to much more\ncomplex approaches. 
The code will be made publicly available on github:\nhttps://github.com/NickyFot/EmoCommonSense.git\n","authors":["Alexandros Xenos","Niki Maria Foteinopoulou","Ioanna Ntinou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2404.07078v1.pdf","comment":"A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this\n work; 14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07072v1","updated":"2024-04-10T15:02:26Z","published":"2024-04-10T15:02:26Z","title":"Implicit Multi-Spectral Transformer: An Lightweight and Effective\n Visible to Infrared Image Translation Model","summary":" In the field of computer vision, visible light images often exhibit low\ncontrast in low-light conditions, presenting a significant challenge. While\ninfrared imagery provides a potential solution, its utilization entails high\ncosts and practical limitations. Recent advancements in deep learning,\nparticularly the deployment of Generative Adversarial Networks (GANs), have\nfacilitated the transformation of visible light images to infrared images.\nHowever, these methods often experience unstable training phases and may\nproduce suboptimal outputs. To address these issues, we propose a novel\nend-to-end Transformer-based model that efficiently converts visible light\nimages into high-fidelity infrared images. Initially, the Texture Mapping\nModule and Color Perception Adapter collaborate to extract texture and color\nfeatures from the visible light image. The Dynamic Fusion Aggregation Module\nsubsequently integrates these features. Finally, the transformation into an\ninfrared image is refined through the synergistic action of the Color\nPerception Adapter and the Enhanced Perception Attention mechanism.\nComprehensive benchmarking experiments confirm that our model outperforms\nexisting methods, producing infrared images of markedly superior quality, both\nqualitatively and quantitatively. Furthermore, the proposed model enables more\neffective downstream applications for infrared images than other methods.\n","authors":["Yijia Chen","Pinghua Chen","Xiangxin Zhou","Yingtie Lei","Ziyang Zhou","Mingxian Li"],"pdf_url":"https://arxiv.org/pdf/2404.07072v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.07045v1","updated":"2024-04-10T14:35:22Z","published":"2024-04-10T14:35:22Z","title":"Identification of Fine-grained Systematic Errors via Controlled Scene\n Generation","summary":" Many safety-critical applications, especially in autonomous driving, require\nreliable object detectors. They can be very effectively assisted by a method to\nsearch for and identify potential failures and systematic errors before these\ndetectors are deployed. Systematic errors are characterized by combinations of\nattributes such as object location, scale, orientation, and color, as well as\nthe composition of their respective backgrounds. To identify them, one must\nrely on something other than real images from a test set because they do not\naccount for very rare but possible combinations of attributes. To overcome this\nlimitation, we propose a pipeline for generating realistic synthetic scenes\nwith fine-grained control, allowing the creation of complex scenes with\nmultiple objects. Our approach, BEV2EGO, allows for a realistic generation of\nthe complete scene with road-contingent control that maps 2D bird's-eye view\n(BEV) scene configurations to a first-person view (EGO). 
In addition, we\npropose a benchmark for controlled scene generation to select the most\nappropriate generative outpainting model for BEV2EGO. We further use it to\nperform a systematic analysis of multiple state-of-the-art object detection\nmodels and discover differences between them.\n","authors":["Valentyn Boreiko","Matthias Hein","Jan Hendrik Metzen"],"pdf_url":"https://arxiv.org/pdf/2404.07045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07032v1","updated":"2024-04-10T14:25:23Z","published":"2024-04-10T14:25:23Z","title":"An Evidential-enhanced Tri-Branch Consistency Learning Method for\n Semi-supervised Medical Image Segmentation","summary":" Semi-supervised segmentation presents a promising approach for large-scale\nmedical image analysis, effectively reducing annotation burdens while achieving\ncomparable performance. This methodology holds substantial potential for\nstreamlining the segmentation process and enhancing its feasibility within\nclinical settings for translational investigations. While cross-supervised\ntraining, based on distinct co-training sub-networks, has become a prevalent\nparadigm for this task, addressing critical issues such as predication\ndisagreement and label-noise suppression requires further attention and\nprogress in cross-supervised training. In this paper, we introduce an\nEvidential Tri-Branch Consistency learning framework (ETC-Net) for\nsemi-supervised medical image segmentation. ETC-Net employs three branches: an\nevidential conservative branch, an evidential progressive branch, and an\nevidential fusion branch. The first two branches exhibit complementary\ncharacteristics, allowing them to address prediction diversity and enhance\ntraining stability. We also integrate uncertainty estimation from the\nevidential learning into cross-supervised training, mitigating the negative\nimpact of erroneous supervision signals. Additionally, the evidential fusion\nbranch capitalizes on the complementary attributes of the first two branches\nand leverages an evidence-based Dempster-Shafer fusion strategy, supervised by\nmore reliable and accurate pseudo-labels of unlabeled data. Extensive\nexperiments conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that\nETC-Net surpasses other state-of-the-art methods for semi-supervised\nsegmentation. The code will be made available in the near future at\nhttps://github.com/Medsemiseg.\n","authors":["Zhenxi Zhang","Heng Zhou","Xiaoran Shi","Ran Ran","Chunna Tian","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.07032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10166v2","updated":"2024-04-10T14:25:12Z","published":"2024-01-18T17:55:39Z","title":"VMamba: Visual State Space Model","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long\nbeen the predominant backbone networks for visual representation learning.\nWhile ViTs have recently gained prominence over CNNs due to their superior\nfitting capabilities, their scalability is largely constrained by the quadratic\ncomplexity of attention computation. Inspired by the capability of Mamba in\nefficiently modeling long sequences, we propose VMamba, a generic vision\nbackbone model aiming to reduce the computational complexity to linear while\nretaining ViTs' advantageous features. 
To enhance VMamba's adaptability in\nprocessing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D\nselective scanning in 2D image space with global receptive fields.\nAdditionally, we make further improvements in implementation details and\narchitectural designs to enhance VMamba's performance and boost its inference\nspeed. Extensive experimental results demonstrate VMamba's promising\nperformance across various visual perception tasks, highlighting its pronounced\nadvantages in input scaling efficiency compared to existing benchmark models.\nSource code is available at https://github.com/MzeroMiko/VMamba.\n","authors":["Yue Liu","Yunjie Tian","Yuzhong Zhao","Hongtian Yu","Lingxi Xie","Yaowei Wang","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10166v2.pdf","comment":"21 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.07031v1","updated":"2024-04-10T14:24:10Z","published":"2024-04-10T14:24:10Z","title":"ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR\n Domain Modeling","summary":" Every day, countless surgeries are performed worldwide, each within the\ndistinct settings of operating rooms (ORs) that vary not only in their setups\nbut also in the personnel, tools, and equipment used. This inherent diversity\nposes a substantial challenge for achieving a holistic understanding of the OR,\nas it requires models to generalize beyond their initial training datasets. To\nreduce this gap, we introduce ORacle, an advanced vision-language model\ndesigned for holistic OR domain modeling, which incorporates multi-view and\ntemporal capabilities and can leverage external knowledge during inference,\nenabling it to adapt to previously unseen surgical scenarios. This capability\nis further enhanced by our novel data augmentation framework, which\nsignificantly diversifies the training dataset, ensuring ORacle's proficiency\nin applying the provided knowledge effectively. In rigorous testing, in scene\ngraph generation, and downstream tasks on the 4D-OR dataset, ORacle not only\ndemonstrates state-of-the-art performance but does so requiring less data than\nexisting models. Furthermore, its adaptability is displayed through its ability\nto interpret unseen views, actions, and appearances of tools and equipment.\nThis demonstrates ORacle's potential to significantly enhance the scalability\nand affordability of OR domain modeling and opens a pathway for future\nadvancements in surgical data science. We will release our code and data upon\nacceptance.\n","authors":["Ege Özsoy","Chantal Pellegrini","Matthias Keicher","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.07031v1.pdf","comment":"11 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07029v1","updated":"2024-04-10T14:22:16Z","published":"2024-04-10T14:22:16Z","title":"Diffusion-based inpainting of incomplete Euclidean distance matrices of\n trajectories generated by a fractional Brownian motion","summary":" Fractional Brownian trajectories (fBm) feature both randomness and strong\nscale-free correlations, challenging generative models to reproduce the\nintrinsic memory characterizing the underlying process. Here we test a\ndiffusion probabilistic model on a specific dataset of corrupted images\ncorresponding to incomplete Euclidean distance matrices of fBm at various\nmemory exponents $H$. 
Our dataset implies uniqueness of the data imputation in\nthe regime of low missing ratio, where the remaining partial graph is rigid,\nproviding the ground truth for the inpainting. We find that the conditional\ndiffusion generation stably reproduces the statistics of missing\nfBm-distributed distances for different values of $H$ exponent. Furthermore,\nwhile diffusion models have been recently shown to remember samples from the\ntraining database, we show that diffusion-based inpainting behaves\nqualitatively different from the database search with the increasing database\nsize. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for\ncompletion of chromosome distance matrices obtained in single-cell microscopy\nexperiments, showing its superiority over the standard bioinformatics\nalgorithms. Our source code is available on GitHub at\nhttps://github.com/alobashev/diffusion_fbm.\n","authors":["Alexander Lobashev","Kirill Polovnikov"],"pdf_url":"https://arxiv.org/pdf/2404.07029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10144v4","updated":"2024-04-10T13:58:08Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v4.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12256v2","updated":"2024-04-10T13:43:54Z","published":"2023-07-23T08:02:37Z","title":"Building-road Collaborative Extraction from Remotely Sensed Images via\n Cross-Interaction","summary":" Buildings are the basic carrier of social production and human life; roads\nare the links that interconnect social networks. Building and road information\nhas important application value in the frontier fields of regional coordinated\ndevelopment, disaster prevention, auto-driving, etc. Mapping buildings and\nroads from very high-resolution (VHR) remote sensing images have become a hot\nresearch topic. However, the existing methods often ignore the strong spatial\ncorrelation between roads and buildings and extract them in isolation. 
To fully\nutilize the complementary advantages between buildings and roads, we propose a\nbuilding-road collaborative extraction method based on multi-task and\ncross-scale feature interaction to improve the accuracy of both tasks in a\ncomplementary way. A multi-task interaction module is proposed to interact\ninformation across tasks and preserve the unique information of each task,\nwhich tackle the seesaw phenomenon in multitask learning. By considering the\nvariation in appearance and structure between buildings and roads, a\ncross-scale interaction module is designed to automatically learn the optimal\nreception field for different tasks. Compared with many existing methods that\ntrain each task individually, the proposed collaborative extraction method can\nutilize the complementary advantages between buildings and roads by the\nproposed inter-task and inter-scale feature interactions, and automatically\nselect the optimal reception field for different tasks. Experiments on a wide\nrange of urban and rural scenarios show that the proposed algorithm can achieve\nbuilding-road extraction with outstanding performance and efficiency.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12256v2.pdf","comment":"IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2312.07937v5","updated":"2024-04-10T13:35:51Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05317v2","updated":"2024-04-10T13:30:09Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. 
By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v2.pdf","comment":"minor fixes (typos, URLs etc.)"},{"id":"http://arxiv.org/abs/2309.06067v6","updated":"2024-04-10T13:17:52Z","published":"2023-09-12T09:07:03Z","title":"Implicit Neural Representation for MRI Parallel Imaging Reconstruction","summary":" Magnetic resonance imaging (MRI) usually faces lengthy acquisition times,\nprompting the exploration of strategies such as parallel imaging (PI) to\nalleviate this problem by periodically skipping specific K-space lines and\nsubsequently reconstructing high-quality images from the undersampled K-space.\nImplicit neural representation (INR) has recently emerged as a promising deep\nlearning technique, characterizing objects as continuous functions of spatial\ncoordinates typically parameterized by a multilayer perceptron (MLP). In this\nstudy, we propose a novel MRI PI reconstruction method that uses INR. Our\napproach represents reconstructed fully-sampled images as functions of voxel\ncoordinates and prior feature vectors from undersampled images, addressing the\ngeneralization challenges of INR. Specifically, we introduce a scale-embedded\nencoder to generate scale-independent, voxel-specific features from MR images\nacross various undersampling scales. These features are then concatenated with\ncoordinate vectors to reconstruct fully-sampled MR images, facilitating\nmultiple-scale reconstructions. To evaluate our method's performance, we\nconducted experiments using publicly available MRI datasets, comparing it with\nalternative reconstruction techniques. Our quantitative assessment demonstrates\nthe superiority of our proposed method.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06067v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12220v2","updated":"2024-04-10T13:15:41Z","published":"2023-07-23T03:55:13Z","title":"Expediting Building Footprint Extraction from High-resolution Remote\n Sensing Images via progressive lenient supervision","summary":" The efficacy of building footprint segmentation from remotely sensed images\nhas been hindered by model transfer effectiveness. Many existing building\nsegmentation methods were developed upon the encoder-decoder architecture of\nU-Net, in which the encoder is finetuned from the newly developed backbone\nnetworks that are pre-trained on ImageNet. However, the heavy computational\nburden of the existing decoder designs hampers the successful transfer of these\nmodern encoder networks to remote sensing tasks. Even the widely-adopted deep\nsupervision strategy fails to mitigate these challenges due to its invalid loss\nin hybrid regions where foreground and background pixels are intermixed. 
In\nthis paper, we conduct a comprehensive evaluation of existing decoder network\ndesigns for building footprint segmentation and propose an efficient framework\ndenoted as BFSeg to enhance learning efficiency and effectiveness.\nSpecifically, a densely-connected coarse-to-fine feature fusion decoder network\nthat facilitates easy and fast feature fusion across scales is proposed.\nMoreover, considering the invalidity of hybrid regions in the down-sampled\nground truth during the deep supervision process, we present a lenient deep\nsupervision and distillation strategy that enables the network to learn proper\nknowledge from deep supervision. Building upon these advancements, we have\ndeveloped a new family of building segmentation networks, which consistently\nsurpass prior works with outstanding performance and efficiency across a wide\nrange of newly developed encoder networks.\n","authors":["Haonan Guo","Bo Du","Chen Wu","Xin Su","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06991v1","updated":"2024-04-10T13:10:52Z","published":"2024-04-10T13:10:52Z","title":"Ray-driven Spectral CT Reconstruction Based on Neural Base-Material\n Fields","summary":" In spectral CT reconstruction, the basis materials decomposition involves\nsolving a large-scale nonlinear system of integral equations, which is highly\nill-posed mathematically. This paper proposes a model that parameterizes the\nattenuation coefficients of the object using a neural field representation,\nthereby avoiding the complex calculations of pixel-driven projection\ncoefficient matrices during the discretization process of line integrals. It\nintroduces a lightweight discretization method for line integrals based on a\nray-driven neural field, enhancing the accuracy of the integral approximation\nduring the discretization process. The basis materials are represented as\ncontinuous vector-valued implicit functions to establish a neural field\nparameterization model for the basis materials. The auto-differentiation\nframework of deep learning is then used to solve the implicit continuous\nfunction of the neural base-material fields. This method is not limited by the\nspatial resolution of reconstructed images, and the network has compact and\nregular properties. Experimental validation shows that our method performs\nexceptionally well in addressing the spectral CT reconstruction. Additionally,\nit fulfils the requirements for the generation of high-resolution\nreconstruction images.\n","authors":["Ligen Shi","Chang Liu","Ping Yang","Jun Qiu","Xing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06991v1.pdf","comment":"14 pages,16 figures"},{"id":"http://arxiv.org/abs/2404.01563v2","updated":"2024-04-10T13:02:59Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. 
In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing pre-learned dose level prior to\ngenerate preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v2.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2404.06033v2","updated":"2024-04-10T12:55:49Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing with self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at\nhttps://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02265v2","updated":"2024-04-10T12:54:12Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. 
Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v2.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2404.06977v1","updated":"2024-04-10T12:45:27Z","published":"2024-04-10T12:45:27Z","title":"Accurate Tennis Court Line Detection on Amateur Recorded Matches","summary":" Typically, tennis court line detection is done by running\nHough-Line-Detection to find straight lines in the image, and then computing a\ntransformation matrix from the detected lines to create the final court\nstructure. We propose numerous improvements and enhancements to this algorithm,\nincluding using pretrained State-of-the-Art shadow-removal and object-detection\nML models to make our line-detection more robust. Compared to the original\nalgorithm, our method can accurately detect lines on amateur, dirty courts.\nWhen combined with a robust ball-tracking system, our method will enable\naccurate, automatic refereeing for amateur and professional tennis matches\nalike.\n","authors":["Sameer Agrawal","Ragoth Sundararajan","Vishak Sagar"],"pdf_url":"https://arxiv.org/pdf/2404.06977v1.pdf","comment":"Accepted to 5th International conference on Image, Video Processing\n and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.06971v1","updated":"2024-04-10T12:31:43Z","published":"2024-04-10T12:31:43Z","title":"TrajPRed: Trajectory Prediction with Region-based Relation Learning","summary":" Forecasting human trajectories in traffic scenes is critical for safety\nwithin mixed or fully autonomous systems. Human future trajectories are driven\nby two major stimuli, social interactions, and stochastic goals. Thus, reliable\nforecasting needs to capture these two stimuli. Edge-based relation modeling\nrepresents social interactions using pairwise correlations from precise\nindividual states. Nevertheless, edge-based relations can be vulnerable under\nperturbations. To alleviate these issues, we propose a region-based relation\nlearning paradigm that models social interactions via region-wise dynamics of\njoint states, i.e., the changes in the density of crowds. In particular,\nregion-wise agent joint information is encoded within convolutional feature\ngrids. Social relations are modeled by relating the temporal changes of local\njoint information from a global perspective. We show that region-based\nrelations are less susceptible to perturbations. In order to account for the\nstochastic individual goals, we exploit a conditional variational autoencoder\nto realize multi-goal estimation and diverse future prediction. Specifically,\nwe perform variational inference via the latent distribution, which is\nconditioned on the correlation between input states and associated target\ngoals. Sampling from the latent distribution enables the framework to reliably\ncapture the stochastic behavior in test data. We integrate multi-goal\nestimation and region-based relation learning to model the two stimuli, social\ninteractions, and stochastic goals, in a prediction framework. We evaluate our\nframework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that\nthe diverse prediction better fits the ground truth when incorporating the\nrelation module. 
Our framework outperforms the state-of-the-art models on SDD\nby $27.61\\%$/$18.20\\%$ of ADE/FDE metrics.\n","authors":["Chen Zhou","Ghassan AlRegib","Armin Parchami","Kunjan Singh"],"pdf_url":"https://arxiv.org/pdf/2404.06971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06963v1","updated":"2024-04-10T12:22:19Z","published":"2024-04-10T12:22:19Z","title":"V-MAD: Video-based Morphing Attack Detection in Operational Scenarios","summary":" In response to the rising threat of the face morphing attack, this paper\nintroduces and explores the potential of Video-based Morphing Attack Detection\n(V-MAD) systems in real-world operational scenarios. While current morphing\nattack detection methods primarily focus on a single or a pair of images, V-MAD\nis based on video sequences, exploiting the video streams often acquired by\nface verification tools available, for instance, at airport gates. Through this\nstudy, we show for the first time the advantages that the availability of\nmultiple probe frames can bring to the morphing attack detection task,\nespecially in scenarios where the quality of probe images is varied and might\nbe affected, for instance, by pose or illumination variations. Experimental\nresults on a real operational database demonstrate that video sequences\nrepresent valuable information for increasing the robustness and performance of\nmorphing attack detection systems.\n","authors":["Guido Borghi","Annalisa Franco","Nicolò Di Domenico","Matteo Ferrara","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.06963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06957v1","updated":"2024-04-10T12:17:25Z","published":"2024-04-10T12:17:25Z","title":"Adversarial purification for no-reference image-quality metrics:\n applicability study and new methods","summary":" Recently, the area of adversarial attacks on image quality metrics has begun\nto be explored, whereas the area of defences remains under-researched. In this\nstudy, we aim to cover that case and check the transferability of adversarial\npurification defences from image classifiers to IQA methods. In this paper, we\napply several widespread attacks on IQA models and examine the success of the\ndefences against them. The purification methodologies covered different\npreprocessing techniques, including geometrical transformations, compression,\ndenoising, and modern neural network-based methods. Also, we address the\nchallenge of assessing the efficacy of a defensive methodology by proposing\nways to estimate output visual quality and the success of neutralizing attacks.\nDefences were tested against attack on three IQA metrics -- Linearity, MetaIQA\nand SPAQ. The code for attacks and defences is available at: (link is hidden\nfor a blind review).\n","authors":["Aleksandr Gushchin","Anna Chistyakova","Vladislav Minashkin","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04350v3","updated":"2024-04-10T11:58:24Z","published":"2024-01-09T04:33:03Z","title":"Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial\n Robustness","summary":" Large-scale pre-trained vision-language models like CLIP have demonstrated\nimpressive performance across various tasks, and exhibit remarkable zero-shot\ngeneralization capability, while they are also vulnerable to imperceptible\nadversarial examples. Existing works typically employ adversarial training\n(fine-tuning) as a defense method against adversarial examples. 
However, direct\napplication to the CLIP model may result in overfitting, compromising the\nmodel's capacity for generalization. In this paper, we propose Pre-trained\nModel Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages\nsupervision from the original pre-trained model by carefully designing an\nauxiliary branch, to enhance the model's zero-shot adversarial robustness.\nSpecifically, PMG-AFT minimizes the distance between the features of\nadversarial examples in the target model and those in the pre-trained model,\naiming to preserve the generalization features already captured by the\npre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate\nthat PMG-AFT significantly outperforms the state-of-the-art method, improving\nthe top-1 robust accuracy by an average of 4.99%. Furthermore, our approach\nconsistently improves clean accuracy by an average of 8.72%. Our code is\navailable at\nhttps://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness.\n","authors":["Sibo Wang","Jie Zhang","Zheng Yuan","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2401.04350v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06275v3","updated":"2024-04-10T11:49:05Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. 
Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06941v1","updated":"2024-04-10T11:47:51Z","published":"2024-04-10T11:47:51Z","title":"Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven\n Approach","summary":" Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark\nmodality for the comprehensive assessment of cardiac function. Nevertheless,\nthe acquisition process of cine CMR is considered as an impediment due to its\nprolonged scanning time. One commonly used strategy to expedite the acquisition\nprocess is through k-space undersampling, though it comes with a drawback of\nintroducing aliasing effects in the reconstructed image. Lately, deep\nlearning-based methods have shown remarkable results over traditional\napproaches in rapidly achieving precise CMR reconstructed images. This study\naims to explore the untapped potential of attention mechanisms incorporated\nwith a deep learning model within the context of the CMR reconstruction\nproblem. We are motivated by the fact that attention has proven beneficial in\ndownstream tasks such as image classification and segmentation, but has not\nbeen systematically analysed in the context of CMR reconstruction. Our primary\ngoal is to identify the strengths and potential limitations of attention\nalgorithms when integrated with a convolutional backbone model such as a U-Net.\nTo achieve this, we benchmark different state-of-the-art spatial and channel\nattention mechanisms on the CMRxRecon dataset and quantitatively evaluate the\nquality of reconstruction using objective metrics. Furthermore, inspired by the\nbest performing attention mechanism, we propose a new, simple yet effective,\nattention pipeline specifically optimised for the task of cardiac image\nreconstruction that outperforms other state-of-the-art attention methods. The\nlayer and model code will be made publicly available.\n","authors":["Anam Hashmi","Julia Dietlmeier","Kathleen M. Curran","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2404.06941v1.pdf","comment":"This paper has been submitted for the 32nd European Signal Processing\n Conference EUSIPCO 2024 in Lyon"},{"id":"http://arxiv.org/abs/2306.10798v3","updated":"2024-04-10T11:42:22Z","published":"2023-06-19T09:38:21Z","title":"ExpPoint-MAE: Better interpretability and performance for\n self-supervised point cloud transformers","summary":" In this paper we delve into the properties of transformers, attained through\nself-supervision, in the point cloud domain. Specifically, we evaluate the\neffectiveness of Masked Autoencoding as a pretraining scheme, and explore\nMomentum Contrast as an alternative. In our study we investigate the impact of\ndata quantity on the learned features, and uncover similarities in the\ntransformer's behavior across domains. Through comprehensive visualiations, we\nobserve that the transformer learns to attend to semantically meaningful\nregions, indicating that pretraining leads to a better understanding of the\nunderlying geometry. Moreover, we examine the finetuning process and its effect\non the learned representations. 
Based on that, we devise an unfreezing strategy\nwhich consistently outperforms our baseline without introducing any other\nmodifications to the model or the training pipeline, and achieve\nstate-of-the-art results in the classification task among transformer models.\n","authors":["Ioannis Romanelis","Vlassis Fotis","Konstantinos Moustakas","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2306.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06936v1","updated":"2024-04-10T11:40:02Z","published":"2024-04-10T11:40:02Z","title":"Efficient and Generic Point Model for Lossless Point Cloud Attribute\n Compression","summary":" The past several years have witnessed the emergence of learned point cloud\ncompression (PCC) techniques. However, current learning-based lossless point\ncloud attribute compression (PCAC) methods either suffer from high\ncomputational complexity or deteriorated compression performance. Moreover, the\nsignificant variations in point cloud scale and sparsity encountered in\nreal-world applications make developing an all-in-one neural model a\nchallenging task. In this paper, we propose PoLoPCAC, an efficient and generic\nlossless PCAC method that achieves high compression efficiency and strong\ngeneralizability simultaneously. We formulate lossless PCAC as the task of\ninferring explicit distributions of attributes from group-wise autoregressive\npriors. A progressive random grouping strategy is first devised to efficiently\nresolve the point cloud into groups, and then the attributes of each group are\nmodeled sequentially from accumulated antecedents. A locality-aware attention\nmechanism is utilized to exploit prior knowledge from context windows in\nparallel. Since our method directly operates on points, it can naturally avoids\ndistortion caused by voxelization, and can be executed on point clouds with\narbitrary scale and density. Experiments show that our method can be instantly\ndeployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying\ncontinuous bit-rate reduction over the latest G-PCCv23 on various datasets\n(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding\ntime than G-PCCv23 on the majority of sequences with a lightweight model size\n(2.6MB), which is highly attractive for practical applications. Dataset, code\nand trained model are available at\nhttps://github.com/I2-Multimedia-Lab/PoLoPCAC.\n","authors":["Kang You","Pan Gao","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06918v1","updated":"2024-04-10T11:10:50Z","published":"2024-04-10T11:10:50Z","title":"HRVDA: High-Resolution Visual Document Assistant","summary":" Leveraging vast training data, multimodal large language models (MLLMs) have\ndemonstrated formidable general visual comprehension capabilities and achieved\nremarkable performance across various tasks. However, their performance in\nvisual document understanding still leaves much room for improvement. This\ndiscrepancy is primarily attributed to the fact that visual document\nunderstanding is a fine-grained prediction task. In natural scenes, MLLMs\ntypically use low-resolution images, leading to a substantial loss of visual\ninformation. Furthermore, general-purpose MLLMs do not excel in handling\ndocument-oriented instructions. In this paper, we propose a High-Resolution\nVisual Document Assistant (HRVDA), which bridges the gap between MLLMs and\nvisual document understanding. 
This model employs a content filtering mechanism\nand an instruction filtering module to separately filter out the\ncontent-agnostic visual tokens and instruction-agnostic visual tokens, thereby\nachieving efficient model training and inference for high-resolution images. In\naddition, we construct a document-oriented visual instruction tuning dataset\nand apply a multi-stage training strategy to enhance the model's document\nmodeling capabilities. Extensive experiments demonstrate that our model\nachieves state-of-the-art performance across multiple document understanding\ndatasets, while maintaining training efficiency and inference speed comparable\nto low-resolution models.\n","authors":["Chaohu Liu","Kun Yin","Haoyu Cao","Xinghua Jiang","Xin Li","Yinsong Liu","Deqiang Jiang","Xing Sun","Linli Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06918v1.pdf","comment":"Accepted to CVPR 2024 main conference"},{"id":"http://arxiv.org/abs/2404.06913v1","updated":"2024-04-10T11:06:29Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/"},{"id":"http://arxiv.org/abs/2306.00977v4","updated":"2024-04-10T10:56:00Z","published":"2023-06-01T17:59:10Z","title":"AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation","summary":" During interactive segmentation, a model and a user work together to\ndelineate objects of interest in a 3D point cloud. In an iterative process, the\nmodel assigns each data point to an object (or the background), while the user\ncorrects errors in the resulting segmentation and feeds them back into the\nmodel. The current best practice formulates the problem as binary\nclassification and segments objects one at a time. The model expects the user\nto provide positive clicks to indicate regions wrongly assigned to the\nbackground and negative clicks on regions wrongly assigned to the object.\nSequentially visiting objects is wasteful since it disregards synergies between\nobjects: a positive click for a given object can, by definition, serve as a\nnegative click for nearby objects. Moreover, a direct competition between\nadjacent objects can speed up the identification of their common boundary. 
We\nintroduce AGILE3D, an efficient, attention-based model that (1) supports\nsimultaneous segmentation of multiple 3D objects, (2) yields more accurate\nsegmentation masks with fewer user clicks, and (3) offers faster inference. Our\ncore idea is to encode user clicks as spatial-temporal queries and enable\nexplicit interactions between click queries as well as between them and the 3D\nscene through a click attention module. Every time new clicks are added, we\nonly need to run a lightweight decoder that produces updated segmentation\nmasks. In experiments with four different 3D point cloud datasets, AGILE3D sets\na new state-of-the-art. Moreover, we also verify its practicality in real-world\nsetups with real user studies.\n","authors":["Yuanwen Yue","Sabarinath Mahadevan","Jonas Schult","Francis Engelmann","Bastian Leibe","Konrad Schindler","Theodora Kontogianni"],"pdf_url":"https://arxiv.org/pdf/2306.00977v4.pdf","comment":"ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D"},{"id":"http://arxiv.org/abs/2404.06903v1","updated":"2024-04-10T10:46:59Z","published":"2024-04-10T10:46:59Z","title":"DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic\n Gaussian Splatting","summary":" The increasing demand for virtual reality applications has highlighted the\nsignificance of crafting immersive 3D assets. We present a text-to-3D\n360$^{\\circ}$ scene generation pipeline that facilitates the creation of\ncomprehensive 360$^{\\circ}$ scenes for in-the-wild environments in a matter of\nminutes. Our approach utilizes the generative power of a 2D diffusion model and\nprompt self-refinement to create a high-quality and globally coherent panoramic\nimage. This image acts as a preliminary \"flat\" (2D) scene representation.\nSubsequently, it is lifted into 3D Gaussians, employing splatting techniques to\nenable real-time exploration. To produce consistent 3D geometry, our pipeline\nconstructs a spatially coherent structure by aligning the 2D monocular depth\ninto a globally optimized point cloud. This point cloud serves as the initial\nstate for the centroids of 3D Gaussians. In order to address invisible issues\ninherent in single-view inputs, we impose semantic and geometric constraints on\nboth synthesized and input camera views as regularizations. These guide the\noptimization of Gaussians, aiding in the reconstruction of unseen regions. In\nsummary, our method offers a globally consistent 3D scene within a\n360$^{\\circ}$ perspective, providing an enhanced immersive experience over\nexisting techniques. 
Project website at: http://dreamscene360.github.io/\n","authors":["Shijie Zhou","Zhiwen Fan","Dejia Xu","Haoran Chang","Pradyumna Chari","Tejas Bharadwaj","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2404.06903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v2","updated":"2024-04-10T10:37:22Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06894v1","updated":"2024-04-10T10:36:15Z","published":"2024-04-10T10:36:15Z","title":"O-TALC: Steps Towards Combating Oversegmentation within Online Action\n Segmentation","summary":" Online temporal action segmentation shows a strong potential to facilitate\nmany HRI tasks where extended human action sequences must be tracked and\nunderstood in real time. Traditional action segmentation approaches, however,\noperate in an offline two stage approach, relying on computationally expensive\nvideo wide features for segmentation, rendering them unsuitable for online HRI\napplications. In order to facilitate online action segmentation on a stream of\nincoming video data, we introduce two methods for improved training and\ninference of backbone action recognition models, allowing them to be deployed\ndirectly for online frame level classification. Firstly, we introduce surround\ndense sampling whilst training to facilitate training vs. inference clip\nmatching and improve segment boundary predictions. Secondly, we introduce an\nOnline Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce\noversegmentation during online inference. As our methods are backbone\ninvariant, they can be deployed with computationally efficient spatio-temporal\naction recognition models capable of operating in real time with a small\nsegmentation latency. We show our method outperforms similar online action\nsegmentation work as well as matches the performance of many offline models\nwith access to full temporal resolution when operating on challenging\nfine-grained datasets.\n","authors":["Matthew Kent Myers","Nick Wright","A. 
Stephen McGough","Nicholas Martin"],"pdf_url":"https://arxiv.org/pdf/2404.06894v1.pdf","comment":"5 pages, 3 figures. Accepted as a short (unindexed) paper at the\n TAHRI conference"},{"id":"http://arxiv.org/abs/2404.06892v1","updated":"2024-04-10T10:34:34Z","published":"2024-04-10T10:34:34Z","title":"SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End\n Autonomous Driving","summary":" End-to-End paradigms use a unified framework to implement multi-tasks in an\nautonomous driving system. Despite simplicity and clarity, the performance of\nend-to-end autonomous driving methods on sub-tasks is still far behind the\nsingle-task methods. Meanwhile, the widely used dense BEV features in previous\nend-to-end methods make it costly to extend to more modalities or tasks. In\nthis paper, we propose a Sparse query-centric paradigm for end-to-end\nAutonomous Driving (SparseAD), where the sparse queries completely represent\nthe whole driving scenario across space, time and tasks without any dense BEV\nrepresentation. Concretely, we design a unified sparse architecture for\nperception tasks including detection, tracking, and online mapping. Moreover,\nwe revisit motion prediction and planning, and devise a more justifiable motion\nplanner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA\nfull-task performance among end-to-end methods and significantly narrows the\nperformance gap between end-to-end paradigms and single-task methods. Codes\nwill be released soon.\n","authors":["Diankun Zhang","Guoan Wang","Runwen Zhu","Jianbo Zhao","Xiwu Chen","Siyu Zhang","Jiahao Gong","Qibin Zhou","Wenyuan Zhang","Ningzi Wang","Feiyang Tan","Hangning Zhou","Ziyao Xu","Haotian Yao","Chi Zhang","Xiaojun Liu","Xiaoguang Di","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2404.06892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06883v1","updated":"2024-04-10T10:13:37Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07887v2","updated":"2024-04-10T10:06:46Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. We show that our approach achieves\ncompetitive results when applied to a range of different sensor types and\nimaging modalities.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03190v5","updated":"2024-04-10T09:51:11Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. 
This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v5.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.06865v1","updated":"2024-04-10T09:45:02Z","published":"2024-04-10T09:45:02Z","title":"Fine color guidance in diffusion models and its application to image\n compression at extremely low bitrates","summary":" This study addresses the challenge of, without training or fine-tuning,\ncontrolling the global color aspect of images generated with a diffusion model.\nWe rewrite the guidance equations to ensure that the outputs are closer to a\nknown color map, and this without hindering the quality of the generation. Our\nmethod leads to new guidance equations. We show in the color guidance context\nthat, the scaling of the guidance should not decrease but remains high\nthroughout the diffusion process. In a second contribution, our guidance is\napplied in a compression framework, we combine both semantic and general color\ninformation on the image to decode the images at low cost. We show that our\nmethod is effective at improving fidelity and realism of compressed images at\nextremely low bit rates, when compared to other classical or more semantic\noriented approaches.\n","authors":["Tom Bordin","Thomas Maugey"],"pdf_url":"https://arxiv.org/pdf/2404.06865v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.06863v1","updated":"2024-04-10T09:40:56Z","published":"2024-04-10T09:40:56Z","title":"RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds","summary":" While deep learning-based methods have demonstrated outstanding results in\nnumerous domains, some important functionalities are missing. Resolution\nscalability is one of them. In this work, we introduce a novel architecture,\ndubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of\npoint clouds. In contrast to existing works, the proposed method does not\nrequire the whole point cloud to be available to start inference. Once a\nlow-resolution version of the input point cloud is available, first semantic\npredictions can be generated in an extremely fast manner. This enables early\ndecision-making in subsequent processing steps. As additional points become\navailable, these are processed in parallel. To improve performance, features\nfrom previously computed scales are employed as prior knowledge at the current\nscale. Our experiments show that RESSCAL3D is 31-62% faster than the\nnon-scalable baseline while keeping a limited impact on performance. 
To the\nbest of our knowledge, the proposed method is the first to propose a\nresolution-scalable approach for 3D semantic segmentation of point clouds based\non deep learning.\n","authors":["Remco Royen","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2404.06863v1.pdf","comment":"Published at 2023 IEEE International Conference on Image Processing\n (ICIP)"},{"id":"http://arxiv.org/abs/2404.06860v1","updated":"2024-04-10T09:35:50Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection plays a crucial role in autonomous driving by extracting\nstructural and traffic information from the road in 3D space to assist the\nself-driving car in rational, safe, and comfortable path planning and motion\ncontrol. Due to the consideration of sensor costs and the advantages of visual\ndata in color information, in practical applications, 3D lane detection based\non monocular vision is one of the important research directions in the field of\nautonomous driving, which has attracted more and more attention in both\nindustry and academia. Unfortunately, recent progress in visual perception\nseems insufficient to develop completely reliable 3D lane detection algorithms,\nwhich also hinders the development of vision-based fully autonomous\nself-driving cars, i.e., achieving level 5 autonomous driving, driving like\nhuman-controlled cars. This is one of the conclusions drawn from this review\npaper: there is still a lot of room for improvement and significant\nimprovements are still needed in the 3D lane detection algorithm for autonomous\ndriving cars using visual sensors. Motivated by this, this review defines,\nanalyzes, and reviews the current achievements in the field of 3D lane\ndetection research, and the vast majority of the current progress relies\nheavily on computationally complex deep learning models. In addition, this\nreview covers the 3D lane detection pipeline, investigates the performance of\nstate-of-the-art algorithms, analyzes the time complexity of cutting-edge\nmodeling choices, and highlights the main achievements and limitations of\ncurrent research efforts. The survey also includes a comprehensive discussion\nof available 3D lane detection datasets and the challenges that researchers\nhave faced but have not yet resolved. Finally, our work outlines future\nresearch directions and welcomes researchers and practitioners to enter this\nexciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06859v1","updated":"2024-04-10T09:35:36Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances \\& New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. 
Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. % part3 We evaluate our proposed\napproach on a challenging benchmark consisting of two datasets, seven tasks,\nand nineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10035v3","updated":"2024-04-10T09:34:03Z","published":"2023-02-20T15:34:03Z","title":"Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey","summary":" With the urgent demand for generalized deep models, many pre-trained big\nmodels are proposed, such as BERT, ViT, GPT, etc. Inspired by the success of\nthese models in single domains (like computer vision and natural language\nprocessing), the multi-modal pre-trained big models have also drawn more and\nmore attention in recent years. In this work, we give a comprehensive survey of\nthese models and hope this paper could provide new insights and helps fresh\nresearchers to track the most cutting-edge works. Specifically, we firstly\nintroduce the background of multi-modal pre-training by reviewing the\nconventional deep learning, pre-training works in natural language process,\ncomputer vision, and speech. Then, we introduce the task definition, key\nchallenges, and advantages of multi-modal pre-training models (MM-PTMs), and\ndiscuss the MM-PTMs with a focus on data, objectives, network architectures,\nand knowledge enhanced pre-training. After that, we introduce the downstream\ntasks used for the validation of large-scale MM-PTMs, including generative,\nclassification, and regression tasks. We also give visualization and analysis\nof the model parameters and results on representative downstream tasks.\nFinally, we point out possible research directions for this topic that may\nbenefit future works. In addition, we maintain a continuously updated paper\nlist for large-scale pre-trained multi-modal big models:\nhttps://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has\nbeen published by the journal Machine Intelligence Research (MIR),\nhttps://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:\n10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 
447-482, 2023.\n","authors":["Xiao Wang","Guangyao Chen","Guangwu Qian","Pengcheng Gao","Xiao-Yong Wei","Yaowei Wang","Yonghong Tian","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2302.10035v3.pdf","comment":"Accepted by Machine Intelligence Research (MIR)"},{"id":"http://arxiv.org/abs/2404.06851v1","updated":"2024-04-10T09:24:54Z","published":"2024-04-10T09:24:54Z","title":"UDiFF: Generating Conditional Unsigned Distance Fields with Optimal\n Wavelet Diffusion","summary":" Diffusion models have shown remarkable results for image generation, editing\nand inpainting. Recent works explore diffusion models for 3D shape generation\nwith neural implicit functions, i.e., signed distance function and occupancy\nfunction. However, they are limited to shapes with closed surfaces, which\nprevents them from generating diverse 3D real-world contents containing open\nsurfaces. In this work, we present UDiFF, a 3D diffusion model for unsigned\ndistance fields (UDFs) which is capable to generate textured 3D shapes with\nopen surfaces from text conditions or unconditionally. Our key idea is to\ngenerate UDFs in spatial-frequency domain with an optimal wavelet\ntransformation, which produces a compact representation space for UDF\ngeneration. Specifically, instead of selecting an appropriate wavelet\ntransformation which requires expensive manual efforts and still leads to large\ninformation loss, we propose a data-driven approach to learn the optimal\nwavelet transformation for UDFs. We evaluate UDiFF to show our advantages by\nnumerical and visual comparisons with the latest methods on widely used\nbenchmarks. Page: https://weiqi-zhang.github.io/UDiFF.\n","authors":["Junsheng Zhou","Weiqi Zhang","Baorui Ma","Kanle Shi","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2404.06851v1.pdf","comment":"To appear at CVPR2024. Project page:\n https://weiqi-zhang.github.io/UDiFF"},{"id":"http://arxiv.org/abs/2404.06842v1","updated":"2024-04-10T09:14:28Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Cha}nnel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in %potential feature channels of the\nreconstruction error map also affect details matching, we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. 
Code is avaliable at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2402.02263v2","updated":"2024-04-10T09:00:44Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06836v1","updated":"2024-04-10T08:54:43Z","published":"2024-04-10T08:54:43Z","title":"O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit\n Representation","summary":" Online construction of open-ended language scenes is crucial for robotic\napplications, where open-vocabulary interactive scene understanding is\nrequired. Recently, neural implicit representation has provided a promising\ndirection for online interactive mapping. However, implementing open-vocabulary\nscene understanding capability into online neural implicit mapping still faces\nthree challenges: lack of local scene updating ability, blurry spatial\nhierarchical semantic segmentation and difficulty in maintaining multi-view\nconsistency. To this end, we proposed O2V-mapping, which utilizes voxel-based\nlanguage and geometric features to create an open-vocabulary field, thus\nallowing for local updates during online training process. Additionally, we\nleverage a foundational model for image segmentation to extract language\nfeatures on object-level entities, achieving clear segmentation boundaries and\nhierarchical semantic features. For the purpose of preserving consistency in 3D\nobject properties across different viewpoints, we propose a spatial adaptive\nvoxel adjustment mechanism and a multi-view weight selection method. 
Extensive\nexperiments on open-vocabulary object localization and semantic segmentation\ndemonstrate that O2V-mapping achieves online construction of language scenes\nwhile enhancing accuracy, outperforming the previous SOTA method.\n","authors":["Muer Tie","Julong Wei","Zhengjun Wang","Ke Wu","Shansuai Yuan","Kaizhao Zhang","Jie Jia","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06835v1","updated":"2024-04-10T08:54:00Z","published":"2024-04-10T08:54:00Z","title":"Tuning-Free Adaptive Style Incorporation for Structure-Consistent\n Text-Driven Style Transfer","summary":" In this work, we target the task of text-driven style transfer in the context\nof text-to-image (T2I) diffusion models. The main challenge is consistent\nstructure preservation while enabling effective style transfer effects. The\npast approaches in this field directly concatenate the content and style\nprompts for a prompt-level style injection, leading to unavoidable structure\ndistortions. In this work, we propose a novel solution to the text-driven style\ntransfer task, namely, Adaptive Style Incorporation~(ASI), to achieve\nfine-grained feature-level style incorporation. It consists of the Siamese\nCross-Attention~(SiCA) to decouple the single-track cross-attention to a\ndual-track structure to obtain separate content and style features, and the\nAdaptive Content-Style Blending (AdaBlending) module to couple the content and\nstyle information from a structure-consistent manner. Experimentally, our\nmethod exhibits much better performance in both structure preservation and\nstylized effects.\n","authors":["Yanqi Ge","Jiaqi Liu","Qingnan Fan","Xi Jiang","Ye Huang","Shuai Qin","Hong Gu","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2404.06835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06832v1","updated":"2024-04-10T08:48:09Z","published":"2024-04-10T08:48:09Z","title":"SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection","summary":" Detecting anomalies in images has become a well-explored problem in both\nacademia and industry. State-of-the-art algorithms are able to detect defects\nin increasingly difficult settings and data modalities. However, most current\nmethods are not suited to address 3D objects captured from differing poses.\nWhile solutions using Neural Radiance Fields (NeRFs) have been proposed, they\nsuffer from excessive computation requirements, which hinder real-world\nusability. For this reason, we propose the novel 3D Gaussian splatting-based\nframework SplatPose which, given multi-view images of a 3D object, accurately\nestimates the pose of unseen views in a differentiable manner, and detects\nanomalies in them. We achieve state-of-the-art results in both training and\ninference speed, and detection performance, even when using less training data\nthan competing methods. 
We thoroughly evaluate our framework using the recently\nproposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly\ndetection (MAD) data set.\n","authors":["Mathis Kruse","Marco Rudolph","Dominik Woiwode","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2404.06832v1.pdf","comment":"Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02668v2","updated":"2024-04-10T08:47:32Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" Context modeling is critical for remote sensing image dense prediction tasks.\nNowadays, the growing size of very-high-resolution (VHR) remote sensing images\nposes challenges in effectively modeling context. While transformer-based\nmodels possess global modeling capabilities, they encounter computational\nchallenges when applied to large VHR images due to their quadratic complexity.\nThe conventional practice of cropping large images into smaller patches results\nin a notable loss of contextual information. To address these issues, we\npropose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR\nremote sensing images. RSM is specifically designed to capture the global\ncontext of remote sensing images with linear complexity, facilitating the\neffective processing of large VHR images. Considering that the land covers in\nremote sensing images are distributed in arbitrary spatial directions due to\ncharacteristics of remote sensing over-head imaging, the RSM incorporates an\nomnidirectional selective scan module to globally model the context of images\nin multiple directions, capturing large spatial features from various\ndirections. Extensive experiments on semantic segmentation and change detection\ntasks across various land covers demonstrate the effectiveness of the proposed\nRSM. We designed simple yet effective models based on RSM, achieving\nstate-of-the-art performance on dense prediction tasks in VHR remote sensing\nimages without fancy training strategies. Leveraging the linear complexity and\nglobal modeling capabilities, RSM achieves better efficiency and accuracy than\ntransformer-based models on large remote sensing images. Interestingly, we also\ndemonstrated that our model generally performs better with a larger image size\non dense prediction tasks. Our code is available at\nhttps://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v2.pdf","comment":"15 pages,8 figures"},{"id":"http://arxiv.org/abs/2312.03502v2","updated":"2024-04-10T08:29:23Z","published":"2023-12-06T13:59:22Z","title":"Improving the Generalization of Segmentation Foundation Model under\n Distribution Shift via Weakly Supervised Adaptation","summary":" The success of large language models has inspired the computer vision\ncommunity to explore image segmentation foundation model that is able to\nzero/few-shot generalize through prompt engineering. Segment-Anything(SAM),\namong others, is the state-of-the-art image segmentation foundation model\ndemonstrating strong zero/few-shot generalization. Despite the success, recent\nstudies reveal the weakness of SAM under strong distribution shift. In\nparticular, SAM performs awkwardly on corrupted natural images, camouflaged\nimages, medical images, etc. 
Motivated by the observations, we aim to develop a\nself-training based strategy to adapt SAM to target distribution. Given the\nunique challenges of large source dataset, high computation cost and incorrect\npseudo label, we propose a weakly supervised self-training architecture with\nanchor regularization and low-rank finetuning to improve the robustness and\ncomputation efficiency of adaptation. We validate the effectiveness on 5 types\nof downstream segmentation tasks including natural clean/corrupted images,\nmedical images, camouflaged images and robotic images. Our proposed method is\ntask-agnostic in nature and outperforms pre-trained SAM and state-of-the-art\ndomain adaptation methods on almost all downstream tasks with the same testing\nprompt inputs.\n","authors":["Haojie Zhang","Yongyi Su","Xun Xu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03502v2.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.10610v4","updated":"2024-04-10T08:16:18Z","published":"2023-08-21T10:20:46Z","title":"Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing\n Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset","summary":" Deep learning-based ear disease diagnosis technology has proven effective and\naffordable. However, due to the lack of ear endoscope datasets with diversity,\nthe practical potential of the deep learning model has not been thoroughly\nstudied. Moreover, existing research failed to achieve a good trade-off between\nmodel inference speed and parameter size, rendering models inapplicable in\nreal-world settings. To address these challenges, we constructed the first\nlarge-scale ear endoscopic dataset comprising eight types of ear diseases and\ndisease-free samples from two institutions. Inspired by ShuffleNetV2, we\nproposed Best-EarNet, an ultrafast and ultralight network enabling real-time\near disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial\nFeature Fusion Module and multi-scale supervision strategy, which facilitates\nthe model focusing on global-local information within feature maps at various\nlevels. Utilizing transfer learning, the accuracy of Best-EarNet with only\n0.77M parameters achieves 95.23% (internal 22,581 images) and 92.14% (external\n1,652 images), respectively. In particular, it achieves an average frame per\nsecond of 80 on the CPU. From the perspective of model practicality, the\nproposed Best-EarNet is superior to state-of-the-art backbone models in ear\nlesion detection tasks. Most importantly, Ear-keeper, an intelligent diagnosis\nsystem based Best-EarNet, was developed successfully and deployed on common\nelectronic devices (smartphone, tablet computer and personal computer). In the\nfuture, Ear-Keeper has the potential to assist the public and healthcare\nproviders in performing comprehensive scanning and diagnosis of the ear canal\nin real-time video, thereby promptly detecting ear lesions.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Fan Zhang","Yunxin Liang","Yan Liu","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v4.pdf","comment":"18 pages,8 figures"},{"id":"http://arxiv.org/abs/2404.06814v1","updated":"2024-04-10T08:02:17Z","published":"2024-04-10T08:02:17Z","title":"Zero-shot Point Cloud Completion Via 2D Priors","summary":" 3D point cloud completion is designed to recover complete shapes from\npartially observed point clouds. 
Conventional completion methods typically\ndepend on extensive point cloud data for training %, with their effectiveness\noften constrained to object categories similar to those seen during training.\nIn contrast, we propose a zero-shot framework aimed at completing partially\nobserved point clouds across any unseen categories. Leveraging point rendering\nvia Gaussian Splatting, we develop techniques of Point Cloud Colorization and\nZero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion\nmodels to infer missing regions. Experimental results on both synthetic and\nreal-world scanned point clouds demonstrate that our approach outperforms\nexisting methods in completing a variety of objects without any requirement for\nspecific training data.\n","authors":["Tianxin Huang","Zhiwen Yan","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.06814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05916v2","updated":"2024-04-10T07:58:44Z","published":"2024-03-09T13:56:25Z","title":"GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual\n Affective Computing","summary":" Multimodal large language models (MLLMs) are designed to process and\nintegrate information from multiple sources, such as text, speech, images, and\nvideos. Despite its success in language understanding, it is critical to\nevaluate the performance of downstream tasks for better human-centric\napplications. This paper assesses the application of MLLMs with 5 crucial\nabilities for affective computing, spanning from visual affective tasks and\nreasoning tasks. The results show that \\gpt has high accuracy in facial action\nunit recognition and micro-expression detection while its general facial\nexpression recognition performance is not accurate. We also highlight the\nchallenges of achieving fine-grained micro-expression recognition and the\npotential for further study and demonstrate the versatility and potential of\n\\gpt for handling advanced tasks in emotion recognition and related fields by\nintegrating with task-related agents for more complex tasks, such as heart rate\nestimation through signal processing. In conclusion, this paper provides\nvaluable insights into the potential applications and challenges of MLLMs in\nhuman-centric computing. Our interesting examples are at\nhttps://github.com/EnVision-Research/GPT4Affectivity.\n","authors":["Hao Lu","Xuesong Niu","Jiyao Wang","Yin Wang","Qingyong Hu","Jiaqi Tang","Yuting Zhang","Kaishen Yuan","Bin Huang","Zitong Yu","Dengbo He","Shuiguang Deng","Hao Chen","Yingcong Chen","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2403.05916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08551v3","updated":"2024-04-10T07:58:04Z","published":"2024-03-13T14:02:54Z","title":"GaussianImage: 1000 FPS Image Representation and Compression by 2D\n Gaussian Splatting","summary":" Implicit neural representations (INRs) recently achieved great success in\nimage representation and compression, offering high visual quality and fast\nrendering speeds with 10-1000 FPS, assuming sufficient GPU resources are\navailable. However, this requirement often hinders their use on low-end devices\nwith limited memory. In response, we propose a groundbreaking paradigm of image\nrepresentation and compression by 2D Gaussian Splatting, named GaussianImage.\nWe first introduce 2D Gaussian to represent the image, where each Gaussian has\n8 parameters including position, covariance and color. 
Subsequently, we unveil\na novel rendering algorithm based on accumulated summation. Remarkably, our\nmethod with a minimum of 3$\\times$ lower GPU memory usage and 5$\\times$ faster\nfitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation\nperformance, but also delivers a faster rendering speed of 1500-2000 FPS\nregardless of parameter size. Furthermore, we integrate existing vector\nquantization technique to build an image codec. Experimental results\ndemonstrate that our codec attains rate-distortion performance comparable to\ncompression-based INRs such as COIN and COIN++, while facilitating decoding\nspeeds of approximately 1000 FPS. Additionally, preliminary proof of concept\nshows that our codec surpasses COIN and COIN++ in performance when using\npartial bits-back coding. Code will be available at\nhttps://github.com/Xinjie-Q/GaussianImage.\n","authors":["Xinjie Zhang","Xingtong Ge","Tongda Xu","Dailan He","Yan Wang","Hongwei Qin","Guo Lu","Jing Geng","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08551v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07354v4","updated":"2024-04-10T07:54:14Z","published":"2024-02-12T01:03:39Z","title":"Re-DiffiNet: Modeling discrepancies in tumor segmentation using\n diffusion models","summary":" Identification of tumor margins is essential for surgical decision-making for\nglioblastoma patients and provides reliable assistance for neurosurgeons.\nDespite improvements in deep learning architectures for tumor segmentation over\nthe years, creating a fully autonomous system suitable for clinical floors\nremains a formidable challenge because the model predictions have not yet\nreached the desired level of accuracy and generalizability for clinical\napplications. Generative modeling techniques have seen significant improvements\nin recent times. Specifically, Generative Adversarial Networks (GANs) and\nDenoising-diffusion-based models (DDPMs) have been used to generate\nhigher-quality images with fewer artifacts and finer attributes. In this work,\nwe introduce a framework called Re-Diffinet for modeling the discrepancy\nbetween the outputs of a segmentation model like U-Net and the ground truth,\nusing DDPMs. By explicitly modeling the discrepancy, the results show an\naverage improvement of 0.55\\% in the Dice score and 16.28\\% in HD95 from\ncross-validation over 5-folds, compared to the state-of-the-art U-Net\nsegmentation model.\n","authors":["Tianyi Ren","Abhishek Sharma","Juampablo Heras Rivera","Harshitha Rebala","Ethan Honey","Agamdeep Chopra","Jacob Ruzevick","Mehmet Kurt"],"pdf_url":"https://arxiv.org/pdf/2402.07354v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05063v2","updated":"2024-04-10T07:44:40Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. 
In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Zhen Wang","Lei Wang","Peng Liu","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06798v1","updated":"2024-04-10T07:41:35Z","published":"2024-04-10T07:41:35Z","title":"MedRG: Medical Report Grounding with Multi-modal Large Language Model","summary":" Medical Report Grounding is pivotal in identifying the most relevant regions\nin medical images based on a given phrase query, a critical aspect in medical\nimage analysis and radiological diagnosis. However, prevailing visual grounding\napproaches necessitate the manual extraction of key phrases from medical\nreports, imposing substantial burdens on both system efficiency and physicians.\nIn this paper, we introduce a novel framework, Medical Report Grounding\n(MedRG), an end-to-end solution for utilizing a multi-modal Large Language\nModel to predict key phrase by incorporating a unique token, BOX, into the\nvocabulary to serve as an embedding for unlocking detection capabilities.\nSubsequently, the vision encoder-decoder jointly decodes the hidden embedding\nand the input medical image, generating the corresponding grounding box. The\nexperimental results validate the effectiveness of MedRG, surpassing the\nperformance of the existing state-of-the-art medical phrase grounding methods.\nThis study represents a pioneering exploration of the medical report grounding\ntask, marking the first-ever endeavor in this domain.\n","authors":["Ke Zou","Yang Bai","Zhihao Chen","Yang Zhou","Yidi Chen","Kai Ren","Meng Wang","Xuedong Yuan","Xiaojing Shen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2404.06798v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15361v2","updated":"2024-04-10T06:46:08Z","published":"2023-11-26T17:27:26Z","title":"Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot\n Interaction","summary":" Hand gestures play a significant role in human interactions where non-verbal\nintentions, thoughts and commands are conveyed. In Human-Robot Interaction\n(HRI), hand gestures offer a similar and efficient medium for conveying clear\nand rapid directives to a robotic agent. However, state-of-the-art vision-based\nmethods for gesture recognition have been shown to be effective only up to a\nuser-camera distance of seven meters. 
Such a short distance range limits\npractical HRI with, for example, service robots, search and rescue robots and\ndrones. In this work, we address the Ultra-Range Gesture Recognition (URGR)\nproblem by aiming for a recognition distance of up to 25 meters and in the\ncontext of HRI. We propose the URGR framework, a novel deep-learning, using\nsolely a simple RGB camera. Gesture inference is based on a single image.\nFirst, a novel super-resolution model termed High-Quality Network (HQ-Net) uses\na set of self-attention and convolutional layers to enhance the low-resolution\nimage of the user. Then, we propose a novel URGR classifier termed Graph Vision\nTransformer (GViT) which takes the enhanced image as input. GViT combines the\nbenefits of a Graph Convolutional Network (GCN) and a modified Vision\nTransformer (ViT). Evaluation of the proposed framework over diverse test data\nyields a high recognition rate of 98.1%. The framework has also exhibited\nsuperior performance compared to human recognition in ultra-range distances.\nWith the framework, we analyze and demonstrate the performance of an autonomous\nquadruped robot directed by human gestures in complex ultra-range indoor and\noutdoor environments, acquiring 96% recognition rate on average.\n","authors":["Eran Bamani","Eden Nissinman","Inbar Meir","Lisa Koenigsberg","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2311.15361v2.pdf","comment":"Engineering Applications of Artificial Intelligence, In press"},{"id":"http://arxiv.org/abs/2404.06780v1","updated":"2024-04-10T06:41:30Z","published":"2024-04-10T06:41:30Z","title":"Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior","summary":" Text-to-3D generation has achieved remarkable success via large-scale\ntext-to-image diffusion models. Nevertheless, there is no paradigm for scaling\nup the methodology to urban scale. Urban scenes, characterized by numerous\nelements, intricate arrangement relationships, and vast scale, present a\nformidable barrier to the interpretability of ambiguous textual descriptions\nfor effective model optimization. In this work, we surmount the limitations by\nintroducing a compositional 3D layout representation into text-to-3D paradigm,\nserving as an additional prior. It comprises a set of semantic primitives with\nsimple geometric structures and explicit arrangement relationships,\ncomplementing textual descriptions and enabling steerable generation. Upon\nthis, we propose two modifications -- (1) We introduce Layout-Guided\nVariational Score Distillation to address model optimization inadequacies. It\nconditions the score distillation sampling process with geometric and semantic\nconstraints of 3D layouts. (2) To handle the unbounded nature of urban scenes,\nwe represent 3D scene with a Scalable Hash Grid structure, incrementally\nadapting to the growing scale of urban scenes. Extensive experiments\nsubstantiate the capability of our framework to scale text-to-3D generation to\nlarge-scale urban scenes that cover over 1000m driving distance for the first\ntime. We also present various scene editing demonstrations, showing the powers\nof steerable urban scene generation. 
Website: https://urbanarchitect.github.io.\n","authors":["Fan Lu","Kwan-Yee Lin","Yan Xu","Hongsheng Li","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06780v1.pdf","comment":"Project page: https://urbanarchitect.github.io/"},{"id":"http://arxiv.org/abs/2404.06779v1","updated":"2024-04-10T06:39:18Z","published":"2024-04-10T06:39:18Z","title":"Efficient and Scalable Chinese Vector Font Generation via Component\n Composition","summary":" Chinese vector font generation is challenging due to the complex structure\nand huge amount of Chinese characters. Recent advances remain limited to\ngenerating a small set of characters with simple structure. In this work, we\nfirst observe that most Chinese characters can be disassembled into\nfrequently-reused components. Therefore, we introduce the first efficient and\nscalable Chinese vector font generation approach via component composition,\nallowing generating numerous vector characters from a small set of components.\nTo achieve this, we collect a large-scale dataset that contains over\n\\textit{90K} Chinese characters with their components and layout information.\nUpon the dataset, we propose a simple yet effective framework based on spatial\ntransformer networks (STN) and multiple losses tailored to font characteristics\nto learn the affine transformation of the components, which can be directly\napplied to the B\\'ezier curves, resulting in Chinese characters in vector\nformat. Our qualitative and quantitative experiments have demonstrated that our\nmethod significantly surpasses the state-of-the-art vector font generation\nmethods in generating large-scale complex Chinese characters in both font\ngeneration and zero-shot font extension.\n","authors":["Jinyu Song","Weitao You","Shuhui Shi","Shuxuan Guo","Lingyun Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06779v1.pdf","comment":"15 pages, 23 figures"},{"id":"http://arxiv.org/abs/2403.19837v3","updated":"2024-04-10T23:47:34Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" The analysis of vision-based deep neural networks (DNNs) is highly desirable\nbut it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we build a map\nbetween the internal representations of a given vision model and a VLM, leading\nto an efficient verification procedure of natural-language properties for\nvision models. 
We demonstrate our techniques on a ResNet-based classifier\ntrained on the RIVAL-10 dataset using CLIP as the multimodal model.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14666v2","updated":"2024-04-10T23:39:38Z","published":"2023-08-24T17:47:32Z","title":"Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body\n with Unknown Mass Distribution","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies may be available when low-dimensional measurements are not. However, the\nhigh-dimensionality of image data precludes the use of classical estimation\ntechniques to learn the dynamics. The usefulness of standard deep learning\nmethods is also limited, because an image of a rigid body reveals nothing about\nthe distribution of mass inside the body, which, together with initial angular\nvelocity, is what determines how the body will rotate. We present a\nphysics-based neural network model to estimate and predict 3D rotational\ndynamics from image sequences. We achieve this using a multi-stage prediction\npipeline that maps individual images to a latent representation homeomorphic to\n$\\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts\nfuture latent states using the Hamiltonian equations of motion. We demonstrate\nthe efficacy of our approach on new rotating rigid-body datasets of sequences\nof synthetic images of rotating objects, including cubes, prisms and\nsatellites, with unknown uniform and non-uniform mass distributions. Our model\noutperforms competing baselines on our datasets, producing better qualitative\npredictions and reducing the error observed for the state-of-the-art\nHamiltonian Generative Network by a factor of 2.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Ehrich Leonard"],"pdf_url":"https://arxiv.org/pdf/2308.14666v2.pdf","comment":"Previously appeared as arXiv:2209.11355v2, which was submitted as a\n replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355"},{"id":"http://arxiv.org/abs/2404.07389v1","updated":"2024-04-10T23:30:54Z","published":"2024-04-10T23:30:54Z","title":"Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image\n Diffusion Models","summary":" Text-to-image diffusion models have shown great success in generating\nhigh-quality text-guided images. Yet, these models may still fail to\nsemantically align generated images with the provided text prompts, leading to\nproblems like incorrect attribute binding and/or catastrophic object neglect.\nGiven the pervasive object-oriented structure underlying text prompts, we\nintroduce a novel object-conditioned Energy-Based Attention Map Alignment\n(EBAMA) method to address the aforementioned problems. We show that an\nobject-centric attribute binding loss naturally emerges by approximately\nmaximizing the log-likelihood of a $z$-parameterized energy-based model with\nthe help of the negative sampling technique. We further propose an\nobject-centric intensity regularizer to prevent excessive shifts of objects\nattention towards their attributes. Extensive qualitative and quantitative\nexperiments, including human evaluation, on several challenging benchmarks\ndemonstrate the superior performance of our method over previous strong\ncounterparts. 
With better aligned attention maps, our approach shows great\npromise in further enhancing the text-controlled image editing ability of\ndiffusion models.\n","authors":["Yasi Zhang","Peiyu Yu","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07377v1","updated":"2024-04-10T22:35:06Z","published":"2024-04-10T22:35:06Z","title":"Deep Generative Sampling in the Dual Divergence Space: A Data-efficient\n & Interpretative Approach for Generative AI","summary":" Building on the remarkable achievements in generative sampling of natural\nimages, we propose an innovative challenge, potentially overly ambitious, which\ninvolves generating samples of entire multivariate time series that resemble\nimages. However, the statistical challenge lies in the small sample size,\nsometimes consisting of a few hundred subjects. This issue is especially\nproblematic for deep generative models that follow the conventional approach of\ngenerating samples from a canonical distribution and then decoding or denoising\nthem to match the true data distribution. In contrast, our method is grounded\nin information theory and aims to implicitly characterize the distribution of\nimages, particularly the (global and local) dependency structure between\npixels. We achieve this by empirically estimating its KL-divergence in the dual\nform with respect to the respective marginal distribution. This enables us to\nperform generative sampling directly in the optimized 1-D dual divergence\nspace. Specifically, in the dual space, training samples representing the data\ndistribution are embedded in the form of various clusters between two end\npoints. In theory, any sample embedded between those two end points is\nin-distribution w.r.t. the data distribution. Our key idea for generating novel\nsamples of images is to interpolate between the clusters via a walk as per\ngradients of the dual function w.r.t. the data dimensions. In addition to the\ndata efficiency gained from direct sampling, we propose an algorithm that\noffers a significant reduction in sample complexity for estimating the\ndivergence of the data distribution with respect to the marginal distribution.\nWe provide strong theoretical guarantees along with an extensive empirical\nevaluation using many real-world datasets from diverse domains, establishing\nthe superiority of our approach w.r.t. state-of-the-art deep learning methods.\n","authors":["Sahil Garg","Anderson Schneider","Anant Raj","Kashif Rasul","Yuriy Nevmyvaka","Sneihil Gopal","Amit Dhurandhar","Guillermo Cecchi","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2404.07377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07374v1","updated":"2024-04-10T22:16:20Z","published":"2024-04-10T22:16:20Z","title":"Improving Multi-Center Generalizability of GAN-Based Fat Suppression\n using Federated Learning","summary":" Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS)\nMRIs from non-FS proton density sequences has the potential to accelerate\nacquisition of knee MRIs. However, GANs trained on single-site data have poor\ngeneralizability to external data. We show that federated learning can improve\nmulti-center generalizability of GANs for synthesizing FS MRIs, while\nfacilitating privacy-preserving multi-institutional collaborations.\n","authors":["Pranav Kulkarni","Adway Kanhere","Harshita Kukreja","Vivian Zhang","Paul H. Yi","Vishwa S. 
Parekh"],"pdf_url":"https://arxiv.org/pdf/2404.07374v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.07356v1","updated":"2024-04-10T21:23:13Z","published":"2024-04-10T21:23:13Z","title":"GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic\n Microplastics Data","summary":" Microplastic particle ingestion or inhalation by humans is a problem of\ngrowing concern. Unfortunately, current research methods that use machine\nlearning to understand their potential harms are obstructed by a lack of\navailable data. Deep learning techniques in particular are challenged by such\ndomains where only small or imbalanced data sets are available. Overcoming this\nchallenge often involves oversampling underrepresented classes or augmenting\nthe existing data to improve model performance. This paper proposes GANsemble:\na two-module framework connecting data augmentation with conditional generative\nadversarial networks (cGANs) to generate class-conditioned synthetic data.\nFirst, the data chooser module automates augmentation strategy selection by\nsearching for the best data augmentation strategy. Next, the cGAN module uses\nthis strategy to train a cGAN for generating enhanced synthetic data. We\nexperiment with the GANsemble framework on a small and imbalanced microplastics\ndata set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines\nfor synthetic microplastics (SYMP) data are established in terms of Frechet\nInception Distance (FID) and Inception Scores (IS). We also provide a synthetic\nmicroplastics filter (SYMP-Filter) algorithm to increase the quality of\ngenerated SYMP. Additionally, we show the best amount of oversampling with\naugmentation to fix class imbalance in small microplastics data sets. To our\nknowledge, this study is the first application of generative AI to\nsynthetically create microplastics data.\n","authors":["Daniel Platnick","Sourena Khanzadeh","Alireza Sadeghian","Richard Anthony Valenzano"],"pdf_url":"https://arxiv.org/pdf/2404.07356v1.pdf","comment":"Accepted to the 37th Canadian Artificial Intelligence Conference\n (2024), 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.16133v2","updated":"2024-04-10T21:19:33Z","published":"2023-09-28T03:30:50Z","title":"Mask4Former: Mask Transformer for 4D Panoptic Segmentation","summary":" Accurately perceiving and tracking instances over time is essential for the\ndecision-making processes of autonomous agents interacting safely in dynamic\nenvironments. With this intention, we propose Mask4Former for the challenging\ntask of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the\nfirst transformer-based approach unifying semantic instance segmentation and\ntracking of sparse and irregular sequences of 3D point clouds into a single\njoint model. Our model directly predicts semantic instances and their temporal\nassociations without relying on hand-crafted non-learned association strategies\nsuch as probabilistic clustering or voting-based center prediction. Instead,\nMask4Former introduces spatio-temporal instance queries that encode the\nsemantic and geometric properties of each semantic tracklet in the sequence. In\nan in-depth study, we find that promoting spatially compact instance\npredictions is critical as spatio-temporal instance queries tend to merge\nmultiple semantically similar instances, even if they are spatially distant. 
To\nthis end, we regress 6-DOF bounding box parameters from spatio-temporal\ninstance queries, which are used as an auxiliary task to foster spatially\ncompact predictions. Mask4Former achieves a new state-of-the-art on the\nSemanticKITTI test set with a score of 68.4 LSTQ.\n","authors":["Kadir Yilmaz","Jonas Schult","Alexey Nekrasov","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2309.16133v2.pdf","comment":"Renamed from MASK4D to Mask4Former. ICRA 2024. Project page:\n https://vision.rwth-aachen.de/Mask4Former"},{"id":"http://arxiv.org/abs/2404.07351v1","updated":"2024-04-10T21:14:33Z","published":"2024-04-10T21:14:33Z","title":"A Transformer-Based Model for the Prediction of Human Gaze Behavior on\n Videos","summary":" Eye-tracking applications that utilize the human gaze in video understanding\ntasks have become increasingly important. To effectively automate the process\nof video analysis based on eye-tracking data, it is important to accurately\nreplicate human gaze behavior. However, this task presents significant\nchallenges due to the inherent complexity and ambiguity of human gaze patterns.\nIn this work, we introduce a novel method for simulating human gaze behavior.\nOur approach uses a transformer-based reinforcement learning algorithm to train\nan agent that acts as a human observer, with the primary role of watching\nvideos and simulating human gaze behavior. We employed an eye-tracking dataset\ngathered from videos generated by the VirtualHome simulator, with a primary\nfocus on activity recognition. Our experimental results demonstrate the\neffectiveness of our gaze prediction method by highlighting its capability to\nreplicate human gaze behavior and its applicability for downstream tasks where\nreal human-gaze is used as input.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07351v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2309.04071v2","updated":"2024-04-10T21:09:15Z","published":"2023-09-08T02:05:03Z","title":"Enhancing Hierarchical Transformers for Whole Brain Segmentation with\n Intracranial Measurements Integration","summary":" Whole brain segmentation with magnetic resonance imaging (MRI) enables the\nnon-invasive measurement of brain regions, including total intracranial volume\n(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain\nsegmentation methodology to incorporate intracranial measurements offers a\nheightened level of comprehensiveness in the analysis of brain structures.\nDespite its potential, the task of generalizing deep learning techniques for\nintracranial measurements faces data availability constraints due to limited\nmanually annotated atlases encompassing whole brain and TICV/PFV labels. In\nthis paper, we enhancing the hierarchical transformer UNesT for whole brain\nsegmentation to achieve segmenting whole brain with 133 classes and TICV/PFV\nsimultaneously. To address the problem of data scarcity, the model is first\npretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.\nThese volumes are processed through a multi-atlas segmentation pipeline for\nlabel generation, while TICV/PFV labels are unavailable. Subsequently, the\nmodel is finetuned with 45 T1w 3D volumes from Open Access Series Imaging\nStudies (OASIS) where both 133 whole brain classes and TICV/PFV labels are\navailable. 
We evaluate our method with Dice similarity coefficients(DSC). We\nshow that our model is able to conduct precise TICV/PFV estimation while\nmaintaining the 132 brain regions performance at a comparable level. Code and\ntrained model are available at:\nhttps://github.com/MASILab/UNesT/tree/main/wholebrainSeg.\n","authors":["Xin Yu","Yucheng Tang","Qi Yang","Ho Hin Lee","Shunxing Bao","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2309.04071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07347v1","updated":"2024-04-10T21:03:23Z","published":"2024-04-10T21:03:23Z","title":"Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on\n Intention","summary":" Humans utilize their gaze to concentrate on essential information while\nperceiving and interpreting intentions in videos. Incorporating human gaze into\ncomputational algorithms can significantly enhance model performance in video\nunderstanding tasks. In this work, we address a challenging and innovative task\nin video understanding: predicting the actions of an agent in a video based on\na partial video. We introduce the Gaze-guided Action Anticipation algorithm,\nwhich establishes a visual-semantic graph from the video input. Our method\nutilizes a Graph Neural Network to recognize the agent's intention and predict\nthe action sequence to fulfill this intention. To assess the efficiency of our\napproach, we collect a dataset containing household activities generated in the\nVirtualHome environment, accompanied by human gaze data of viewing videos. Our\nmethod outperforms state-of-the-art techniques, achieving a 7\\% improvement in\naccuracy for 18-class intention recognition. This highlights the efficiency of\nour method in learning important features from human gaze data.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07347v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2404.07336v1","updated":"2024-04-10T20:32:24Z","published":"2024-04-10T20:32:24Z","title":"PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in\n Viewers' Opinion Scores","summary":" Recent advancements in audio-visual generative modeling have been propelled\nby progress in deep learning and the availability of data-rich benchmarks.\nHowever, the growth is not attributed solely to models and benchmarks.\nUniversally accepted evaluation metrics also play an important role in\nadvancing the field. While there are many metrics available to evaluate audio\nand visual content separately, there is a lack of metrics that offer a\nquantitative and interpretable measure of audio-visual synchronization for\nvideos \"in the wild\". To address this gap, we first created a large scale human\nannotated dataset (100+ hrs) representing nine types of synchronization errors\nin audio-visual content and how human perceive them. We then developed a PEAVS\n(Perceptual Evaluation of Audio-Visual Synchrony) score, a novel automatic\nmetric with a 5-point scale that evaluates the quality of audio-visual\nsynchronization. We validate PEAVS using a newly generated dataset, achieving a\nPearson correlation of 0.79 at the set level and 0.54 at the clip level when\ncompared to human labels. 
In our experiments, we observe a relative gain 50%\nover a natural extension of Fr\\'echet based metrics for Audio-Visual synchrony,\nconfirming PEAVS efficacy in objectively modeling subjective perceptions of\naudio-visual synchronization for videos \"in the wild\".\n","authors":["Lucas Goncalves","Prashant Mathur","Chandrashekhar Lavania","Metehan Cekic","Marcello Federico","Kyu J. Han"],"pdf_url":"https://arxiv.org/pdf/2404.07336v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.19653v2","updated":"2024-04-10T20:03:05Z","published":"2024-03-28T17:59:42Z","title":"Detecting Image Attribution for Text-to-Image Diffusion Models in RGB\n and Beyond","summary":" Modern text-to-image (T2I) diffusion models can generate images with\nremarkable realism and creativity. These advancements have sparked research in\nfake image detection and attribution, yet prior studies have not fully explored\nthe practical and scientific dimensions of this task. In addition to\nattributing images to 12 state-of-the-art T2I generators, we provide extensive\nanalyses on what inference stage hyperparameters and image modifications are\ndiscernible. Our experiments reveal that initialization seeds are highly\ndetectable, along with other subtle variations in the image generation process\nto some extent. We further investigate what visual traces are leveraged in\nimage attribution by perturbing high-frequency details and employing mid-level\nrepresentations of image style and structure. Notably, altering high-frequency\ninformation causes only slight reductions in accuracy, and training an\nattributor on style representations outperforms training on RGB images. Our\nanalyses underscore that fake images are detectable and attributable at various\nlevels of visual granularity than previously explored.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19653v2.pdf","comment":"Code available at https://github.com/k8xu/ImageAttribution"},{"id":"http://arxiv.org/abs/2404.07318v1","updated":"2024-04-10T19:39:43Z","published":"2024-04-10T19:39:43Z","title":"Rethinking Perceptual Metrics for Medical Image Translation","summary":" Modern medical image translation methods use generative models for tasks such\nas the conversion of CT images to MRI. Evaluating these methods typically\nrelies on some chosen downstream task in the target domain, such as\nsegmentation. On the other hand, task-agnostic metrics are attractive, such as\nthe network feature-based perceptual metrics (e.g., FID) that are common to\nimage translation in general computer vision. In this paper, we investigate\nevaluation metrics for medical image translation on two medical image\ntranslation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to\nCT), tested on various state-of-the-art translation methods. We show that\nperceptual metrics do not generally correlate with segmentation metrics due to\nthem extending poorly to the anatomical constraints of this sub-field, with FID\nbeing especially inconsistent. However, we find that the lesser-used\npixel-level SWD metric may be useful for subtle intra-modality translation. Our\nresults demonstrate the need for further research into helpful metrics for\nmedical image translation.\n","authors":["Nicholas Konz","Yuwen Chen","Hanxue Gu","Haoyu Dong","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2404.07318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07306v1","updated":"2024-04-10T18:58:05Z","published":"2024-04-10T18:58:05Z","title":"AI-Guided Defect Detection Techniques to Model Single Crystal Diamond\n Growth","summary":" From a process development perspective, diamond growth via chemical vapor\ndeposition has made significant strides. However, challenges persist in\nachieving high quality and large-area material production. These difficulties\ninclude controlling conditions to maintain uniform growth rates for the entire\ngrowth surface. As growth progresses, various factors or defect states emerge,\naltering the uniform conditions. These changes affect the growth rate and\nresult in the formation of crystalline defects at the microscale. However,\nthere is a distinct lack of methods to identify these defect states and their\ngeometry using images taken during the growth process. This paper details\nseminal work on defect segmentation pipeline using in-situ optical images to\nidentify features that indicate defective states that are visible at the\nmacroscale. Using a semantic segmentation approach as applied in our previous\nwork, these defect states and corresponding derivative features are isolated\nand classified by their pixel masks. Using an annotation focused\nhuman-in-the-loop software architecture to produce training datasets, with\nmodules for selective data labeling using active learning, data augmentations,\nand model-assisted labeling, our approach achieves effective annotation\naccuracy and drastically reduces the time and cost of labeling by orders of\nmagnitude. On the model development front, we found that deep learning-based\nalgorithms are the most efficient. They can accurately learn complex\nrepresentations from feature-rich datasets. Our best-performing model, based on\nthe YOLOV3 and DeeplabV3plus architectures, achieved excellent accuracy for\nspecific features of interest. Specifically, it reached 93.35% accuracy for\ncenter defects, 92.83% for polycrystalline defects, and 91.98% for edge\ndefects.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.07306v1.pdf","comment":"12 pages,4 figures,ACMME 2024"},{"id":"http://arxiv.org/abs/2404.07292v1","updated":"2024-04-10T18:40:23Z","published":"2024-04-10T18:40:23Z","title":"Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers","summary":" Solving image and video jigsaw puzzles poses the challenging task of\nrearranging image fragments or video frames from unordered sequences to restore\nmeaningful images and video sequences. Existing approaches often hinge on\ndiscriminative models tasked with predicting either the absolute positions of\npuzzle elements or the permutation actions applied to the original data.\nUnfortunately, these methods face limitations in effectively solving puzzles\nwith a large number of elements. In this paper, we propose JPDVT, an innovative\napproach that harnesses diffusion transformers to address this challenge.\nSpecifically, we generate positional information for image patches or video\nframes, conditioned on their underlying visual content. This information is\nthen employed to accurately assemble the puzzle pieces in their correct\npositions, even in scenarios involving missing pieces. 
Our method achieves\nstate-of-the-art performance on several datasets.\n","authors":["Jinyang Liu","Wondmgezahu Teshome","Sandesh Ghimire","Mario Sznaier","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2404.07292v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.06287v2","updated":"2024-04-10T18:16:32Z","published":"2024-01-11T23:00:24Z","title":"Hierarchical Augmentation and Distillation for Class Incremental\n Audio-Visual Video Recognition","summary":" Audio-visual video recognition (AVVR) aims to integrate audio and visual\nclues to categorize videos accurately. While existing methods train AVVR models\nusing provided datasets and achieve satisfactory results, they struggle to\nretain historical class knowledge when confronted with new classes in\nreal-world situations. Currently, there are no dedicated methods for addressing\nthis problem, so this paper concentrates on exploring Class Incremental\nAudio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and\nlearned model of past classes contain historical knowledge, the core challenge\nis how to capture past data knowledge and past model knowledge to prevent\ncatastrophic forgetting. We introduce Hierarchical Augmentation and\nDistillation (HAD), which comprises the Hierarchical Augmentation Module (HAM)\nand Hierarchical Distillation Module (HDM) to efficiently utilize the\nhierarchical structure of data and models, respectively. Specifically, HAM\nimplements a novel augmentation strategy, segmental feature augmentation, to\npreserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed\nhierarchical (video-distribution) logical distillation and hierarchical\n(snippet-video) correlative distillation to capture and maintain the\nhierarchical intra-sample knowledge of each data and the hierarchical\ninter-sample knowledge between data, respectively. Evaluations on four\nbenchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed\nHAD effectively captures hierarchical information in both data and models,\nresulting in better preservation of historical class knowledge and improved\nperformance. Furthermore, we provide a theoretical analysis to support the\nnecessity of the segmental feature augmentation strategy.\n","authors":["Yukun Zuo","Hantao Yao","Liansheng Zhuang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.06287v2.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2308.15321v6","updated":"2024-04-10T18:13:00Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir \\textit{exposure bias} problem, described as the input mismatch between\ntraining and sampling, lacks in-depth exploration. In this paper, we\nsystematically investigate the exposure bias problem in diffusion models by\nfirst analytically modelling the sampling distribution, based on which we then\nattribute the prediction error at each sampling step as the root cause of the\nexposure bias issue. Furthermore, we discuss potential solutions to this issue\nand propose an intuitive metric for it. Along with the elucidation of exposure\nbias, we propose a simple, yet effective, training-free method called Epsilon\nScaling to alleviate the exposure bias. 
We show that Epsilon Scaling explicitly\nmoves the sampling trajectory closer to the vector field learned in the\ntraining phase by scaling down the network output, mitigating the input\nmismatch between training and sampling. Experiments on various diffusion\nframeworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our\nmethod. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler,\nobtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code\nis available at \\url{https://github.com/forever208/ADM-ES} and\n\\url{https://github.com/forever208/EDM-ES}.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v6.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06776v1","updated":"2024-04-10T06:35:25Z","published":"2024-04-10T06:35:25Z","title":"Logit Calibration and Feature Contrast for Robust Federated Learning on\n Non-IID Data","summary":" Federated learning (FL) is a privacy-preserving distributed framework for\ncollaborative model training on devices in edge networks. However, challenges\narise due to vulnerability to adversarial examples (AEs) and the\nnon-independent and identically distributed (non-IID) nature of data\ndistribution among devices, hindering the deployment of adversarially robust\nand accurate learning models at the edge. While adversarial training (AT) is\ncommonly acknowledged as an effective defense strategy against adversarial\nattacks in centralized training, we shed light on the adverse effects of\ndirectly applying AT in FL that can severely compromise accuracy, especially in\nnon-IID challenges. Given this limitation, this paper proposes FatCC, which\nincorporates local logit \\underline{C}alibration and global feature\n\\underline{C}ontrast into the vanilla federated adversarial training\n(\\underline{FAT}) process from both logit and feature perspectives. This\napproach can effectively enhance the federated system's robust accuracy (RA)\nand clean accuracy (CA). First, we propose logit calibration, where the logits\nare calibrated during local adversarial updates, thereby improving adversarial\nrobustness. Second, FatCC introduces feature contrast, which involves a global\nalignment term that aligns each local representation with unbiased global\nfeatures, thus further enhancing robustness and accuracy in federated\nadversarial environments. Extensive experiments across multiple datasets\ndemonstrate that FatCC achieves comparable or superior performance gains in\nboth CA and RA compared to other baselines.\n","authors":["Yu Qiao","Chaoning Zhang","Apurba Adhikary","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.06776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v1","updated":"2024-04-10T06:30:08Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a casual mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure to the network training. 
We suggest to reposition the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a casual mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed as\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representation by elevating attention map ranks. iLLaMA\nrivals the performance with its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views to visual model\ndesign in the wave of LLMs. Pre-trained models and codes are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06753v1","updated":"2024-04-10T05:41:05Z","published":"2024-04-10T05:41:05Z","title":"MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D\n Reconstruction of Indoor Scenes from Monocular RGB Views","summary":" Current monocular 3D scene reconstruction (3DR) works are either\nfully-supervised, or not generalizable, or implicit in 3D representation. We\npropose a novel framework - MonoSelfRecon that for the first time achieves\nexplicit 3D mesh reconstruction for generalizable indoor scenes with monocular\nRGB views by purely self-supervision on voxel-SDF (signed distance function).\nMonoSelfRecon follows an Autoencoder-based architecture, decodes voxel-SDF and\na generalizable Neural Radiance Field (NeRF), which is used to guide voxel-SDF\nin self-supervision. We propose novel self-supervised losses, which not only\nsupport pure self-supervision, but can be used together with supervised signals\nto further boost supervised training. Our experiments show that \"MonoSelfRecon\"\ntrained in pure self-supervision outperforms current best self-supervised\nindoor depth estimation models and is comparable to 3DR models trained in fully\nsupervision with depth annotations. MonoSelfRecon is not restricted by specific\nmodel design, which can be used to any models with voxel-SDF for purely\nself-supervised manner.\n","authors":["Runfa Li","Upal Mahbub","Vasudev Bhaskaran","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.06753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06744v1","updated":"2024-04-10T05:10:05Z","published":"2024-04-10T05:10:05Z","title":"YOLO based Ocean Eddy Localization with AWS SageMaker","summary":" Ocean eddies play a significant role both on the sea surface and beneath it,\ncontributing to the sustainability of marine life dependent on oceanic\nbehaviors. Therefore, it is crucial to investigate ocean eddies to monitor\nchanges in the Earth, particularly in the oceans, and their impact on climate.\nThis study aims to pinpoint ocean eddies using AWS cloud services, specifically\nSageMaker. 
The primary objective is to detect small-scale (<20km) ocean eddies\nfrom satellite remote images and assess the feasibility of utilizing SageMaker,\nwhich offers tools for deploying AI applications. Moreover, this research not\nonly explores the deployment of cloud-based services for remote sensing of\nEarth data but also evaluates several YOLO (You Only Look Once) models using\nsingle and multi-GPU-based services in the cloud. Furthermore, this study\nunderscores the potential of these services, their limitations, challenges\nrelated to deployment and resource management, and their user-riendliness for\nEarth science projects.\n","authors":["Seraj Al Mahmud Mostafa","Jinbo Wang","Benjamin Holt","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06744v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.06741v1","updated":"2024-04-10T04:59:51Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" The study of action recognition has attracted considerable attention recently\ndue to its broad applications in multiple areas. However, with the issue of\ndiscontinuous training video, which not only decreases the performance of\naction recognition model, but complicates the data augmentation process as\nwell, still remains under-exploration. In this study, we introduce the 4A\n(Action Animation-based Augmentation Approach), an innovative pipeline for data\naugmentation to address the problem. The main contributions remain in our work\nincludes: (1) we investigate the problem of severe decrease on performance of\naction recognition task training by discontinuous video, and the limitation of\nexisting augmentation methods on solving this problem. (2) we propose a novel\naugmentation pipeline, 4A, to address the problem of discontinuous video for\ntraining, while achieving a smoother and natural-looking action representation\nthan the latest data augmentation methodology. (3) We achieve the same\nperformance with only 10% of the original data for training as with all of the\noriginal data from the real-world dataset, and a better performance on\nIn-the-wild videos, by employing our data augmentation techniques.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02736v4","updated":"2024-04-10T04:51:33Z","published":"2022-11-04T20:22:58Z","title":"Discovering Closed-Loop Failures of Vision-Based Controllers via\n Reachability Analysis","summary":" Machine learning driven image-based controllers allow robotic systems to take\nintelligent actions based on the visual feedback from their environment.\nUnderstanding when these controllers might lead to system safety violations is\nimportant for their integration in safety-critical applications and engineering\ncorrective safety measures for the system. Existing methods leverage\nsimulation-based testing (or falsification) to find the failures of\nvision-based controllers, i.e., the visual inputs that lead to closed-loop\nsafety violations. However, these techniques do not scale well to the scenarios\ninvolving high-dimensional and complex visual inputs, such as RGB images. In\nthis work, we cast the problem of finding closed-loop vision failures as a\nHamilton-Jacobi (HJ) reachability problem. 
Our approach blends simulation-based\nanalysis with HJ reachability methods to compute an approximation of the\nbackward reachable tube (BRT) of the system, i.e., the set of unsafe states for\nthe system under vision-based controllers. Utilizing the BRT, we can tractably\nand systematically find the system states and corresponding visual inputs that\nlead to closed-loop failures. These visual inputs can be subsequently analyzed\nto find the input characteristics that might have caused the failure. Besides\nits scalability to high-dimensional visual inputs, an explicit computation of\nBRT allows the proposed approach to capture non-trivial system failures that\nare difficult to expose via random simulations. We demonstrate our framework on\ntwo case studies involving an RGB image-based neural network controller for (a)\nautonomous indoor navigation, and (b) autonomous aircraft taxiing.\n","authors":["Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2211.02736v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v3","updated":"2024-04-10T04:42:10Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNe: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v3.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.06727v1","updated":"2024-04-10T04:24:42Z","published":"2024-04-10T04:24:42Z","title":"Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural\n Radiance Fields","summary":" We present the Bayesian Neural Radiance Field (NeRF), which explicitly\nquantifies uncertainty in geometric volume structures without the need for\nadditional networks, making it adept for challenging observations and\nuncontrolled images. NeRF diverges from traditional geometric methods by\noffering an enriched scene representation, rendering color and density in 3D\nspace from various viewpoints. However, NeRF encounters limitations in relaxing\nuncertainties by using geometric structure information, leading to inaccuracies\nin interpretation under insufficient real-world observations. 
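As background for the "Discovering Closed-Loop Failures of Vision-Based Controllers" entry above, one standard textbook way to define the backward reachable tube (BRT) of a closed-loop system x' = f(x) with failure set F is via a Hamilton-Jacobi value function. The notation below is generic and may differ from the paper's own formulation.

```latex
% Generic BRT definition over horizon [t, T]; \xi_{x,t}(\tau) is the closed-loop
% trajectory starting from state x at time t, and the failure set is the
% sub-zero level set of a function l(x).
\mathrm{BRT}(t) = \{\, x \;:\; \exists\, \tau \in [t, T],\; \xi_{x,t}(\tau) \in \mathcal{F} \,\},
\qquad \mathcal{F} = \{\, x : l(x) \le 0 \,\}.

% The value function V(x,t) = \min_{\tau \in [t,T]} l\big(\xi_{x,t}(\tau)\big)
% satisfies the Hamilton--Jacobi variational inequality
\min\!\Big\{ \tfrac{\partial V}{\partial t} + \nabla_x V \cdot f(x),\;\; l(x) - V(x,t) \Big\} = 0,
\qquad V(x,T) = l(x),

% and the tube is recovered as the sub-zero level set of V:
\mathrm{BRT}(t) = \{\, x : V(x,t) \le 0 \,\}.
```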
Recent research\nefforts aimed at addressing this issue have primarily relied on empirical\nmethods or auxiliary networks. To fundamentally address this issue, we propose\na series of formulational extensions to NeRF. By introducing generalized\napproximations and defining density-related uncertainty, our method seamlessly\nextends to manage uncertainty not only for RGB but also for depth, without the\nneed for additional networks or empirical assumptions. In experiments we show\nthat our method significantly enhances performance on RGB and depth images in\nthe comprehensive dataset, demonstrating the reliability of the Bayesian NeRF\napproach to quantifying uncertainty based on the geometric structure.\n","authors":["Sibeak Lee","Kyeongsu Kang","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v2","updated":"2024-04-10T04:24:36Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code will be available at\nhttps://github.com/ziplab/LongVLM.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v2","updated":"2024-04-10T04:05:24Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. 
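The LongVLM entry above describes splitting a long video into short-term segments, merging tokens within each segment, and injecting global semantics into every local feature. A minimal sketch of that pipeline follows, with mean pooling standing in for the hierarchical token-merging module; all shapes and the simple additive fusion are assumptions for illustration only.

```python
import torch

def encode_long_video(frame_tokens, num_segments=8):
    """Illustrative sketch of a LongVLM-style encoder (not the authors' code):
    split frame tokens into short-term segments, merge tokens within each
    segment (mean pooling as a stand-in for hierarchical token merging),
    then inject a coarse global feature into every local segment feature."""
    # frame_tokens: (T, N, D) -- T frames, N visual tokens per frame, D channels
    segments = frame_tokens.chunk(num_segments, dim=0)                 # short-term segments
    local = torch.stack([seg.mean(dim=(0, 1)) for seg in segments])    # (S, D), temporal order kept
    global_feat = frame_tokens.mean(dim=(0, 1))                        # global semantics
    return local + global_feat                                         # (S, D) features for the LLM

tokens = torch.randn(64, 256, 768)       # 64 frames, 256 tokens per frame
video_repr = encode_long_video(tokens)   # 8 segment-level features
```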
Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v2","updated":"2024-04-10T04:01:43Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06715v1","updated":"2024-04-10T03:54:53Z","published":"2024-04-10T03:54:53Z","title":"Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR\n Data","summary":" 3D detection is a critical task that enables machines to identify and locate\nobjects in three-dimensional space. 
It has a broad range of applications in\nseveral fields, including autonomous driving, robotics and augmented reality.\nMonocular 3D detection is attractive as it requires only a single camera,\nhowever, it lacks the accuracy and robustness required for real world\napplications. High resolution LiDAR on the other hand, can be expensive and\nlead to interference problems in heavy traffic given their active\ntransmissions. We propose a balanced approach that combines the advantages of\nmonocular and point cloud-based 3D detection. Our method requires only a small\nnumber of 3D points, that can be obtained from a low-cost, low-resolution\nsensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR\nframe in the KITTI dataset. Our method reconstructs a complete 3D point cloud\nfrom this limited 3D information combined with a single image. The\nreconstructed 3D point cloud and corresponding image can be used by any\nmulti-modal off-the-shelf detector for 3D object detection. By using the\nproposed network architecture with an off-the-shelf multi-modal 3D detector,\nthe accuracy of 3D detection improves by 20% compared to the state-of-the-art\nmonocular detection methods and 6% to 9% compare to the baseline multi-modal\nmethods on KITTI and JackRabbot datasets.\n","authors":["Aakash Kumar","Chen Chen","Ajmal Mian","Neils Lobo","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2404.06715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01929v2","updated":"2024-04-10T03:36:33Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and we will implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. 
To mitigate the impact of poor-quality\npseudo-labels on the student model, we will add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06080v2","updated":"2024-04-10T03:35:35Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06038v2","updated":"2024-04-10T03:27:04Z","published":"2023-07-12T09:33:21Z","title":"Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D\n Images","summary":" Accurately recovering the dense 3D mesh of both hands from monocular images\nposes considerable challenges due to occlusions and projection ambiguity. Most\nof the existing methods extract features from color images to estimate the\nroot-aligned hand meshes, which neglect the crucial depth and scale information\nin the real world. Given the noisy sensor measurements with limited resolution,\ndepth-based methods predict 3D keypoints rather than a dense mesh. These\nlimitations motivate us to take advantage of these two complementary inputs to\nacquire dense hand meshes on a real-world scale. In this work, we propose an\nend-to-end framework for recovering dense meshes for both hands, which employ\nsingle-view RGB-D image pairs as input. 
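The few-shot EBUS entry above adds Batch Spectral Regularization (BSR) to the training loss. In the cross-domain few-shot learning literature, BSR is usually the sum of squared singular values of the batch feature matrix; the sketch below assumes that general form, and the exact variant used in the paper may differ.

```python
import torch

def batch_spectral_regularization(features, lam=1e-3):
    """Illustrative BSR-style penalty (general form, not necessarily the paper's
    exact variant): penalize the squared singular values of the batch feature
    matrix to discourage over-specialized feature directions."""
    s = torch.linalg.svdvals(features)   # features: (batch, dim)
    return lam * (s ** 2).sum()

# Possible usage during training (loss names are placeholders):
# loss = cross_entropy(logits, labels) + batch_spectral_regularization(feats)
```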
The primary challenge lies in\neffectively utilizing two different input modalities to mitigate the blurring\neffects in RGB images and noises in depth images. Instead of directly treating\ndepth maps as additional channels for RGB images, we encode the depth\ninformation into the unordered point cloud to preserve more geometric details.\nSpecifically, our framework employs ResNet50 and PointNet++ to derive features\nfrom RGB and point cloud, respectively. Additionally, we introduce a novel\npyramid deep fusion network (PDFNet) to aggregate features at different scales,\nwhich demonstrates superior efficacy compared to previous fusion strategies.\nFurthermore, we employ a GCN-based decoder to process the fused features and\nrecover the corresponding 3D pose and dense mesh. Through comprehensive\nablation experiments, we have not only demonstrated the effectiveness of our\nproposed fusion algorithm but also outperformed the state-of-the-art approaches\non publicly available datasets. To reproduce the results, we will make our\nsource code and models publicly available at\n{https://github.com/zijinxuxu/PDFNet}.\n","authors":["Jinwei Ren","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.06038v2.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.06704v1","updated":"2024-04-10T03:20:33Z","published":"2024-04-10T03:20:33Z","title":"Convolution-based Probability Gradient Loss for Semantic Segmentation","summary":" In this paper, we introduce a novel Convolution-based Probability Gradient\n(CPG) loss for semantic segmentation. It employs convolution kernels similar to\nthe Sobel operator, capable of computing the gradient of pixel intensity in an\nimage. This enables the computation of gradients for both ground-truth and\npredicted category-wise probabilities. It enhances network performance by\nmaximizing the similarity between these two probability gradients. Moreover, to\nspecifically enhance accuracy near the object's boundary, we extract the object\nboundary based on the ground-truth probability gradient and exclusively apply\nthe CPG loss to pixels belonging to boundaries. CPG loss proves to be highly\nconvenient and effective. It establishes pixel relationships through\nconvolution, calculating errors from a distinct dimension compared to\npixel-wise loss functions such as cross-entropy loss. We conduct qualitative\nand quantitative analyses to evaluate the impact of the CPG loss on three\nwell-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and\nLRASPP_MobileNet_V3_Large) across three standard segmentation datasets\n(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results\nconsistently and significantly demonstrate that the CPG loss enhances the mean\nIntersection over Union.\n","authors":["Guohang Shan","Shuangcheng Jia"],"pdf_url":"https://arxiv.org/pdf/2404.06704v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.06700v1","updated":"2024-04-10T03:11:10Z","published":"2024-04-10T03:11:10Z","title":"Scaling Multi-Camera 3D Object Detection through Weak-to-Strong\n Eliciting","summary":" The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by\nbird's-eye view (BEV) representation, signifies a notable progression in 3D\nobject detection. Scaling MC3D-Det training effectively accommodates varied\ncamera parameters and urban landscapes, paving the way for the MC3D-Det\nfoundation model. 
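The Convolution-based Probability Gradient (CPG) loss entry above is concrete enough to sketch: Sobel-like kernels produce per-class spatial gradients of both the predicted and ground-truth probability maps, the loss matches the two, and it is applied only to boundary pixels found from the ground-truth gradient. The boundary threshold and the squared-difference similarity below are assumptions, not the authors' exact choices.

```python
import torch
import torch.nn.functional as F

# Sobel-style kernels for x- and y-gradients, shape (2, 1, 3, 3).
SOBEL = torch.tensor([[[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]],
                      [[[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]]])

def probability_gradient(prob):
    """Per-class spatial gradients of a (B, C, H, W) probability map."""
    b, c, h, w = prob.shape
    grad = F.conv2d(prob.reshape(b * c, 1, h, w), SOBEL, padding=1)  # (B*C, 2, H, W)
    return grad.reshape(b, c, 2, h, w)

def cpg_like_loss(pred_prob, gt_prob, boundary_threshold=0.1):
    """Illustrative CPG-style loss: match predicted and ground-truth probability
    gradients, restricted to boundary pixels located from the GT gradient."""
    g_pred, g_gt = probability_gradient(pred_prob), probability_gradient(gt_prob)
    boundary = g_gt.abs().amax(dim=(1, 2)) > boundary_threshold       # (B, H, W)
    diff = (g_pred - g_gt).pow(2).sum(dim=2)                          # (B, C, H, W)
    mask = boundary.unsqueeze(1).float()
    return (diff * mask).sum() / mask.sum().clamp(min=1.0)
```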
However, the multi-view fusion stage of the MC3D-Det method\nrelies on the ill-posed monocular perception during training rather than\nsurround refinement ability, leading to what we term \"surround refinement\ndegradation\". To this end, our study presents a weak-to-strong eliciting\nframework aimed at enhancing surround refinement while maintaining robust\nmonocular perception. Specifically, our framework employs weakly tuned experts\ntrained on distinct subsets, and each is inherently biased toward specific\ncamera configurations and scenarios. These biased experts can learn the\nperception of monocular degeneration, which can help the multi-view fusion\nstage to enhance surround refinement abilities. Moreover, a composite\ndistillation strategy is proposed to integrate the universal knowledge of 2D\nfoundation models and task-specific information. Finally, for MC3D-Det joint\ntraining, the elaborate dataset merge strategy is designed to solve the problem\nof inconsistent camera numbers and camera parameters. We set up a multiple\ndataset joint training benchmark for MC3D-Det and adequately evaluated existing\nmethods. Further, we demonstrate the proposed framework brings a generalized\nand significant boost over multiple baselines. Our code is at\n\\url{https://github.com/EnVision-Research/Scale-BEV}.\n","authors":["Hao Lu","Jiaqi Tang","Xinli Xu","Xu Cao","Yunpeng Zhang","Guoqing Wang","Dalong Du","Hao Chen","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05645v2","updated":"2024-04-10T03:05:04Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimate yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nto the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on tree or on ground, and finally (ii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. 
Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v2.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2404.06693v1","updated":"2024-04-10T02:47:05Z","published":"2024-04-10T02:47:05Z","title":"Binomial Self-compensation for Motion Error in Dynamic 3D Scanning","summary":" Phase shifting profilometry (PSP) is favored in high-precision 3D scanning\ndue to its high accuracy, robustness, and pixel-wise property. However, a\nfundamental assumption of PSP that the object should remain static is violated\nin dynamic measurement, making PSP susceptible to object moving, resulting in\nripple-like errors in the point clouds. We propose a pixel-wise and frame-wise\nloopable binomial self-compensation (BSC) algorithm to effectively and flexibly\neliminate motion error in the four-step PSP. Our mathematical model\ndemonstrates that by summing successive motion-affected phase frames weighted\nby binomial coefficients, motion error exponentially diminishes as the binomial\norder increases, accomplishing automatic error compensation through the\nmotion-affected phase sequence, without the assistance of any intermediate\nvariable. Extensive experiments show that our BSC outperforms the existing\nmethods in reducing motion error, while achieving a depth map frame rate equal\nto the camera's acquisition rate (90 fps), enabling high-accuracy 3D\nreconstruction with a quasi-single-shot frame rate.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06692v1","updated":"2024-04-10T02:40:17Z","published":"2024-04-10T02:40:17Z","title":"Perception-Oriented Video Frame Interpolation via Asymmetric Blending","summary":" Previous methods for Video Frame Interpolation (VFI) have encountered\nchallenges, notably the manifestation of blur and ghosting effects. These\nissues can be traced back to two pivotal factors: unavoidable motion errors and\nmisalignment in supervision. In practice, motion estimates often prove to be\nerror-prone, resulting in misaligned features. Furthermore, the reconstruction\nloss tends to bring blurry results, particularly in misaligned regions. To\nmitigate these challenges, we propose a new paradigm called PerVFI\n(Perception-oriented Video Frame Interpolation). Our approach incorporates an\nAsymmetric Synergistic Blending module (ASB) that utilizes features from both\nsides to synergistically blend intermediate features. One reference frame\nemphasizes primary content, while the other contributes complementary\ninformation. To impose a stringent constraint on the blending process, we\nintroduce a self-learned sparse quasi-binary mask which effectively mitigates\nghosting and blur artifacts in the output. Additionally, we employ a\nnormalizing flow-based generator and utilize the negative log-likelihood loss\nto learn the conditional distribution of the output, which further facilitates\nthe generation of clear and fine details. Experimental results validate the\nsuperiority of PerVFI, demonstrating significant improvements in perceptual\nquality compared to existing methods. 
Codes are available at\n\\url{https://github.com/mulns/PerVFI}\n","authors":["Guangyang Wu","Xin Tao","Changlin Li","Wenyi Wang","Xiaohong Liu","Qingqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06692v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2210.16101v2","updated":"2024-04-10T02:33:57Z","published":"2022-10-27T13:24:08Z","title":"A Generic Shared Attention Mechanism for Various Backbone Neural\n Networks","summary":" The self-attention mechanism has emerged as a critical component for\nimproving the performance of various backbone neural networks. However, current\nmainstream approaches individually incorporate newly designed self-attention\nmodules (SAMs) into each layer of the network for granted without fully\nexploiting their parameters' potential. This leads to suboptimal performance\nand increased parameter consumption as the network depth increases. To improve\nthis paradigm, in this paper, we first present a counterintuitive but inherent\nphenomenon: SAMs tend to produce strongly correlated attention maps across\ndifferent layers, with an average Pearson correlation coefficient of up to\n0.85. Inspired by this inherent observation, we propose Dense-and-Implicit\nAttention (DIA), which directly shares SAMs across layers and employs a long\nshort-term memory module to calibrate and bridge the highly correlated\nattention maps of different layers, thus improving the parameter utilization\nefficiency of SAMs. This design of DIA is also consistent with the neural\nnetwork's dynamical system perspective. Through extensive experiments, we\ndemonstrate that our simple yet effective DIA can consistently enhance various\nnetwork backbones, including ResNet, Transformer, and UNet, across tasks such\nas image classification, object detection, and image generation using diffusion\nmodels.\n","authors":["Zhongzhan Huang","Senwei Liang","Mingfu Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2210.16101v2.pdf","comment":"Work in progress. arXiv admin note: text overlap with\n arXiv:1905.10671"},{"id":"http://arxiv.org/abs/2404.06493v2","updated":"2024-04-10T02:24:58Z","published":"2024-04-09T17:48:52Z","title":"Flying with Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v2.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2404.06507v2","updated":"2024-04-10T02:23:09Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v2.pdf","comment":"Project page: https://janehwu.github.io/mcc-ho"},{"id":"http://arxiv.org/abs/2311.10568v2","updated":"2024-04-10T02:19:19Z","published":"2023-11-17T15:08:15Z","title":"Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging","summary":" On 3D imaging, light field cameras typically are of single shot, and however,\nthey heavily suffer from low spatial resolution and depth accuracy. In this\npaper, by employing an optical projector to project a group of single\nhigh-frequency phase-shifted sinusoid patterns, we propose a phase guided light\nfield algorithm to significantly improve both the spatial and depth resolutions\nfor off-the-shelf light field cameras. First, for correcting the axial\naberrations caused by the main lens of our light field camera, we propose a\ndeformed cone model to calibrate our structured light field system. Second,\nover wrapped phases computed from patterned images, we propose a stereo\nmatching algorithm, i.e. phase guided sum of absolute difference, to robustly\nobtain the correspondence for each pair of neighbored two lenslets. Finally, by\nintroducing a virtual camera according to the basic geometrical optics of light\nfield imaging, we propose a reorganization strategy to reconstruct 3D point\nclouds with spatial-depth high resolution. 
Experimental results show that,\ncompared with the state-of-the-art active light field methods, the proposed\nreconstructs 3D point clouds with a spatial resolution of 1280$\\times$720 with\nfactors 10$\\times$ increased, while maintaining the same high depth resolution\nand needing merely a single group of high-frequency patterns.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2311.10568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06479v2","updated":"2024-04-10T02:12:27Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v2.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06683v1","updated":"2024-04-10T02:03:14Z","published":"2024-04-10T02:03:14Z","title":"Unsupervised Visible-Infrared ReID via Pseudo-label Correction and\n Modality-level Alignment","summary":" Unsupervised visible-infrared person re-identification (UVI-ReID) has\nrecently gained great attention due to its potential for enhancing human\ndetection in diverse environments without labeling. Previous methods utilize\nintra-modality clustering and cross-modality feature matching to achieve\nUVI-ReID. 
However, there exist two challenges: 1) noisy pseudo labels might be\ngenerated in the clustering process, and 2) the cross-modality feature\nalignment via matching the marginal distribution of visible and infrared\nmodalities may misalign the different identities from two modalities. In this\npaper, we first conduct a theoretic analysis where an interpretable\ngeneralization upper bound is introduced. Based on the analysis, we then\npropose a novel unsupervised cross-modality person re-identification framework\n(PRAISE). Specifically, to address the first challenge, we propose a\npseudo-label correction strategy that utilizes a Beta Mixture Model to predict\nthe probability of mis-clustering based network's memory effect and rectifies\nthe correspondence by adding a perceptual term to contrastive learning. Next,\nwe introduce a modality-level alignment strategy that generates paired\nvisible-infrared latent features and reduces the modality gap by aligning the\nlabeling function of visible and infrared features to learn identity\ndiscriminative and modality-invariant features. Experimental results on two\nbenchmark datasets demonstrate that our method achieves state-of-the-art\nperformance than the unsupervised visible-ReID methods.\n","authors":["Yexin Liu","Weiming Zhang","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06683v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02065v2","updated":"2024-04-10T01:53:17Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label corrections mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperforms state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v2.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2301.04218v4","updated":"2024-04-10T01:11:15Z","published":"2023-01-10T21:50:26Z","title":"Leveraging Diffusion For Strong and High Quality Face Morphing Attacks","summary":" Face morphing attacks seek to deceive a Face Recognition (FR) system by\npresenting a morphed image consisting of the biometric qualities from two\ndifferent identities with the aim of triggering a false acceptance with one of\nthe two identities, thereby presenting a significant threat to biometric\nsystems. The success of a morphing attack is dependent on the ability of the\nmorphed image to represent the biometric characteristics of both identities\nthat were used to create the image. We present a novel morphing attack that\nuses a Diffusion-based architecture to improve the visual fidelity of the image\nand the ability of the morphing attack to represent characteristics from both\nidentities. We demonstrate the effectiveness of the proposed attack by\nevaluating its visual fidelity via the Frechet Inception Distance (FID). Also,\nextensive experiments are conducted to measure the vulnerability of FR systems\nto the proposed attack. The ability of a morphing attack detector to detect the\nproposed attack is measured and compared against two state-of-the-art GAN-based\nmorphing attacks along with two Landmark-based attacks. Additionally, a novel\nmetric to measure the relative strength between different morphing attacks is\nintroduced and evaluated.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2301.04218v4.pdf","comment":"Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.05215v2","updated":"2024-04-10T00:49:11Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. 
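The video gaze estimation entry above personalizes the model with a Gaussian process using only a few labeled samples. One plausible reading, sketched below with scikit-learn, is to fit a GP on the person-specific residual between the generic predictor and the calibration labels; this is an illustration under that assumption, not the paper's actual formulation.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

def personalize_with_gp(base_predict, calib_features, calib_gaze):
    """Illustrative GP-based personalization (not the authors' code): model the
    person-specific residual between the generic gaze predictor and a handful
    of calibration labels, then add that correction at test time."""
    residuals = calib_gaze - base_predict(calib_features)        # (K, 2) yaw/pitch errors
    gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), normalize_y=True)
    gp.fit(calib_features, residuals)                            # K can be as small as 3
    return lambda feats: base_predict(feats) + gp.predict(feats)
```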
The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v2.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.06666v1","updated":"2024-04-10T00:26:08Z","published":"2024-04-10T00:26:08Z","title":"SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models","summary":" Text-to-image (T2I) models, such as Stable Diffusion, have exhibited\nremarkable performance in generating high-quality images from text descriptions\nin recent years. However, text-to-image models may be tricked into generating\nnot-safe-for-work (NSFW) content, particularly in sexual scenarios. Existing\ncountermeasures mostly focus on filtering inappropriate inputs and outputs, or\nsuppressing improper text embeddings, which can block explicit NSFW-related\ncontent (e.g., naked or sexy) but may still be vulnerable to adversarial\nprompts inputs that appear innocent but are ill-intended. In this paper, we\npresent SafeGen, a framework to mitigate unsafe content generation by\ntext-to-image models in a text-agnostic manner. The key idea is to eliminate\nunsafe visual representations from the model regardless of the text input. In\nthis way, the text-to-image model is resistant to adversarial prompts since\nunsafe visual representations are obstructed from within. Extensive experiments\nconducted on four datasets demonstrate SafeGen's effectiveness in mitigating\nunsafe content generation while preserving the high-fidelity of benign images.\nSafeGen outperforms eight state-of-the-art baseline methods and achieves 99.1%\nsexual content removal performance. Furthermore, our constructed benchmark of\nadversarial prompts provides a basis for future development and evaluation of\nanti-NSFW-generation methods.\n","authors":["Xinfeng Li","Yuchen Yang","Jiangyi Deng","Chen Yan","Yanjiao Chen","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v1","updated":"2024-04-10T00:25:09Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. 
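For context on the SLAMS entry above, the linear-Gaussian structure it contrasts against is the textbook Kalman analysis step, where H is a linear observation operator and P, R are Gaussian error covariances. This is the standard formula, not anything specific to SLAMS.

```latex
% Kalman analysis (update) step: forecast state x^f is calibrated by observation y.
x^{a} = x^{f} + K\,\bigl(y - H x^{f}\bigr), \qquad
K = P^{f} H^{\top}\bigl(H P^{f} H^{\top} + R\bigr)^{-1}, \qquad
P^{a} = (I - K H)\,P^{f}.
```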
Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v1.pdf","comment":"Accepted to CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2404.06663v1","updated":"2024-04-10T00:11:03Z","published":"2024-04-10T00:11:03Z","title":"Multi-modal Document Presentation Attack Detection With Forensics Trace\n Disentanglement","summary":" Document Presentation Attack Detection (DPAD) is an important measure in\nprotecting the authenticity of a document image. However, recent DPAD methods\ndemand additional resources, such as manual effort in collecting additional\ndata or knowing the parameters of acquisition devices. This work proposes a\nDPAD method based on multi-modal disentangled traces (MMDT) without the above\ndrawbacks. We first disentangle the recaptured traces by a self-supervised\ndisentanglement and synthesis network to enhance the generalization capacity in\ndocument images with different contents and layouts. Then, unlike the existing\nDPAD approaches that rely only on data in the RGB domain, we propose to\nexplicitly employ the disentangled recaptured traces as new modalities in the\ntransformer backbone through adaptive multi-modal adapters to fuse RGB/trace\nfeatures efficiently. Visualization of the disentangled traces confirms the\neffectiveness of the proposed method in different document contents. Extensive\nexperiments on three benchmark datasets demonstrate the superiority of our MMDT\nmethod on representing forensic traces of recapturing distortion.\n","authors":["Changsheng Chen","Yongyi Deng","Liangwei Lin","Zitong Yu","Zhimao Lai"],"pdf_url":"https://arxiv.org/pdf/2404.06663v1.pdf","comment":"Accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.06661v1","updated":"2024-04-10T00:05:55Z","published":"2024-04-10T00:05:55Z","title":"Efficient Denoising using Score Embedding in Score-based Diffusion\n Models","summary":" It is well known that training a denoising score-based diffusion models\nrequires tens of thousands of epochs and a substantial number of image data to\ntrain the model. In this paper, we propose to increase the efficiency in\ntraining score-based diffusion models. Our method allows us to decrease the\nnumber of epochs needed to train the diffusion model. We accomplish this by\nsolving the log-density Fokker-Planck (FP) Equation numerically to compute the\nscore \\textit{before} training. The pre-computed score is embedded into the\nimage to encourage faster training under slice Wasserstein distance.\nConsequently, it also allows us to decrease the number of images we need to\ntrain the neural network to learn an accurate score. We demonstrate through our\nnumerical experiments the improved performance of our proposed method compared\nto standard score-based diffusion models. Our proposed method achieves a\nsimilar quality to the standard method meaningfully faster.\n","authors":["Andrew S. Na","William Gao","Justin W. 
L. Wan"],"pdf_url":"https://arxiv.org/pdf/2404.06661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08017v1","updated":"2024-04-10T19:16:08Z","published":"2024-04-10T19:16:08Z","title":"AI-Guided Feature Segmentation Techniques to Model Features from Single\n Crystal Diamond Growth","summary":" Process refinement to consistently produce high-quality material over a large\narea of the grown crystal, enabling various applications from optics crystals\nto quantum detectors, has long been a goal for diamond growth. Machine learning\noffers a promising path toward this goal, but faces challenges such as the\ncomplexity of features within datasets, their time-dependency, and the volume\nof data produced per growth run. Accurate spatial feature extraction from image\nto image for real-time monitoring of diamond growth is crucial yet complicated\ndue to the low-volume and high feature complexity nature of the datasets. This\npaper compares various traditional and machine learning-driven approaches for\nfeature extraction in the diamond growth domain, proposing a novel deep\nlearning-driven semantic segmentation approach to isolate and classify accurate\npixel masks of geometric features like diamond, pocket holder, and background,\nalong with their derivative features based on shape and size. Using an\nannotation-focused human-in-the-loop software architecture for training\ndatasets, with modules for selective data labeling using active learning, data\naugmentations, and model-assisted labeling, our approach achieves effective\nannotation accuracy and drastically reduces labeling time and cost. Deep\nlearning algorithms prove highly efficient in accurately learning complex\nrepresentations from datasets with many features. Our top-performing model,\nbased on the DeeplabV3plus architecture, achieves outstanding accuracy in\nclassifying features of interest, with accuracies of 96.31% for pocket holder,\n98.60% for diamond top, and 91.64% for diamond side features.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.08017v1.pdf","comment":"12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text\n overlap with arXiv:2404.07306"},{"id":"http://arxiv.org/abs/2404.08013v1","updated":"2024-04-10T15:37:15Z","published":"2024-04-10T15:37:15Z","title":"Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect\n Communication","summary":" Sharing and joint processing of camera feeds and sensor measurements, known\nas Cooperative Perception (CP), has emerged as a new technique to achieve\nhigher perception qualities. CP can enhance the safety of Autonomous Vehicles\n(AVs) where their individual visual perception quality is compromised by\nadverse weather conditions (haze as foggy weather), low illumination, winding\nroads, and crowded traffic. To cover the limitations of former methods, in this\npaper, we propose a novel approach to realize an optimized CP under constrained\ncommunications. At the core of our approach is recruiting the best helper from\nthe available list of front vehicles to augment the visual range and enhance\nthe Object Detection (OD) accuracy of the ego vehicle. In this two-step\nprocess, we first select the helper vehicles that contribute the most to CP\nbased on their visual range and lowest motion blur. Next, we implement a radio\nblock optimization among the candidate vehicles to further improve\ncommunication efficiency. 
We specifically focus on pedestrian detection as an\nexemplary scenario. To validate our approach, we used the CARLA simulator to\ncreate a dataset of annotated videos for different driving scenarios where\npedestrian detection is challenging for an AV with compromised vision. Our\nresults demonstrate the efficacy of our two-step optimization process in\nimproving the overall performance of cooperative perception in challenging\nscenarios, substantially improving driving safety under adverse conditions.\nFinally, we note that the networking assumptions are adopted from LTE Release\n14 Mode 4 side-link communication, commonly used for Vehicle-to-Vehicle (V2V)\ncommunication. Nonetheless, our method is flexible and applicable to arbitrary\nV2V communications.\n","authors":["Ahmad Sarlak","Hazim Alzorgan","Sayed Pedram Haeri Boroujeni","Abolfazl Razi","Rahul Amin"],"pdf_url":"https://arxiv.org/pdf/2404.08013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08011v1","updated":"2024-04-10T06:30:33Z","published":"2024-04-10T06:30:33Z","title":"An inclusive review on deep learning techniques and their scope in\n handwriting recognition","summary":" Deep learning expresses a category of machine learning algorithms that have\nthe capability to combine raw inputs into intermediate features layers. These\ndeep learning algorithms have demonstrated great results in different fields.\nDeep learning has particularly witnessed for a great achievement of human level\nperformance across a number of domains in computer vision and pattern\nrecognition. For the achievement of state-of-the-art performances in diverse\ndomains, the deep learning used different architectures and these architectures\nused activation functions to perform various computations between hidden and\noutput layers of any architecture. This paper presents a survey on the existing\nstudies of deep learning in handwriting recognition field. Even though the\nrecent progress indicates that the deep learning methods has provided valuable\nmeans for speeding up or proving accurate results in handwriting recognition,\nbut following from the extensive literature survey, the present study finds\nthat the deep learning has yet to revolutionize more and has to resolve many of\nthe most pressing challenges in this field, but promising advances have been\nmade on the prior state of the art. Additionally, an inadequate availability of\nlabelled data to train presents problems in this domain. Nevertheless, the\npresent handwriting recognition survey foresees deep learning enabling changes\nat both bench and bedside with the potential to transform several domains as\nimage processing, speech recognition, computer vision, machine translation,\nrobotics and control, medical imaging, medical information processing,\nbio-informatics, natural language processing, cyber security, and many others.\n","authors":["Sukhdeep Singh","Sudhir Rohilla","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.08011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07351v1","updated":"2024-04-10T21:14:33Z","published":"2024-04-10T21:14:33Z","title":"A Transformer-Based Model for the Prediction of Human Gaze Behavior on\n Videos","summary":" Eye-tracking applications that utilize the human gaze in video understanding\ntasks have become increasingly important. To effectively automate the process\nof video analysis based on eye-tracking data, it is important to accurately\nreplicate human gaze behavior. 
However, this task presents significant\nchallenges due to the inherent complexity and ambiguity of human gaze patterns.\nIn this work, we introduce a novel method for simulating human gaze behavior.\nOur approach uses a transformer-based reinforcement learning algorithm to train\nan agent that acts as a human observer, with the primary role of watching\nvideos and simulating human gaze behavior. We employed an eye-tracking dataset\ngathered from videos generated by the VirtualHome simulator, with a primary\nfocus on activity recognition. Our experimental results demonstrate the\neffectiveness of our gaze prediction method by highlighting its capability to\nreplicate human gaze behavior and its applicability for downstream tasks where\nreal human-gaze is used as input.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2404.07351v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2404.07347v1","updated":"2024-04-10T21:03:23Z","published":"2024-04-10T21:03:23Z","title":"Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on\n Intention","summary":" Humans utilize their gaze to concentrate on essential information while\nperceiving and interpreting intentions in videos. Incorporating human gaze into\ncomputational algorithms can significantly enhance model performance in video\nunderstanding tasks. In this work, we address a challenging and innovative task\nin video understanding: predicting the actions of an agent in a video based on\na partial video. We introduce the Gaze-guided Action Anticipation algorithm,\nwhich establishes a visual-semantic graph from the video input. Our method\nutilizes a Graph Neural Network to recognize the agent's intention and predict\nthe action sequence to fulfill this intention. To assess the efficiency of our\napproach, we collect a dataset containing household activities generated in the\nVirtualHome environment, accompanied by human gaze data of viewing videos. Our\nmethod outperforms state-of-the-art techniques, achieving a 7\\% improvement in\naccuracy for 18-class intention recognition. This highlights the efficiency of\nour method in learning important features from human gaze data.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2404.07347v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"}]},"2024-04-11T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07992v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo","summary":" Matching cost aggregation plays a fundamental role in learning-based\nmulti-view stereo networks. However, directly aggregating adjacent costs can\nlead to suboptimal results due to local geometric inconsistency. Related\nmethods either seek selective aggregation or improve aggregated depth in the 2D\nspace, both are unable to handle geometric inconsistency in the cost volume\neffectively. In this paper, we propose GoMVS to aggregate geometrically\nconsistent costs, yielding better utilization of adjacent geometries. 
More\nspecifically, we correspond and propagate adjacent costs to the reference pixel\nby leveraging the local geometric smoothness in conjunction with surface\nnormals. We achieve this by the geometric consistent propagation (GCP) module.\nIt computes the correspondence from the adjacent depth hypothesis space to the\nreference depth space using surface normals, then uses the correspondence to\npropagate adjacent costs to the reference geometry, followed by a convolution\nfor aggregation. Our method achieves new state-of-the-art performance on DTU,\nTanks & Temple, and ETH3D datasets. Notably, our method ranks 1st on the Tanks\n& Temple Advanced benchmark.\n","authors":["Jiang Wu","Rui Li","Haofei Xu","Wenxun Zhao","Yu Zhu","Jinqiu Sun","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07992v1.pdf","comment":"CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code:\n https://github.com/Wuuu3511/GoMVS"},{"id":"http://arxiv.org/abs/2404.07993v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"Connecting NeRFs, Images, and Text","summary":" Neural Radiance Fields (NeRFs) have emerged as a standard framework for\nrepresenting 3D scenes and objects, introducing a novel data type for\ninformation exchange and storage. Concurrently, significant progress has been\nmade in multimodal representation learning for text and image data. This paper\nexplores a novel research direction that aims to connect the NeRF modality with\nother modalities, similar to established methodologies for images and text. To\nthis end, we propose a simple framework that exploits pre-trained models for\nNeRF representations alongside multimodal models for text and image processing.\nOur framework learns a bidirectional mapping between NeRF embeddings and those\nobtained from corresponding images and text. This mapping unlocks several novel\nand useful applications, including NeRF zero-shot classification and NeRF\nretrieval from images or text.\n","authors":["Francesco Ballerini","Pierluigi Zama Ramirez","Roberto Mirabella","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.07993v1.pdf","comment":"Accepted at CVPRW-INRV 2024"},{"id":"http://arxiv.org/abs/2404.07991v1","updated":"2024-04-11T17:59:57Z","published":"2024-04-11T17:59:57Z","title":"GoMAvatar: Efficient Animatable Human Modeling from Monocular Video\n Using Gaussians-on-Mesh","summary":" We introduce GoMAvatar, a novel approach for real-time, memory-efficient,\nhigh-quality animatable human modeling. GoMAvatar takes as input a single\nmonocular video to create a digital avatar capable of re-articulation in new\nposes and real-time rendering from novel viewpoints, while seamlessly\nintegrating with rasterization-based graphics pipelines. Central to our method\nis the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering\nquality and speed of Gaussian splatting with geometry modeling and\ncompatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and\nvarious YouTube videos. GoMAvatar matches or surpasses current monocular human\nmodeling algorithms in rendering quality and significantly outperforms them in\ncomputational efficiency (43 FPS) while being memory-efficient (3.63 MB per\nsubject).\n","authors":["Jing Wen","Xiaoming Zhao","Zhongzheng Ren","Alexander G. 
Schwing","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07991v1.pdf","comment":"CVPR 2024; project page: https://wenj.github.io/GoMAvatar/"},{"id":"http://arxiv.org/abs/2404.07990v1","updated":"2024-04-11T17:59:56Z","published":"2024-04-11T17:59:56Z","title":"OpenBias: Open-set Bias Detection in Text-to-Image Generative Models","summary":" Text-to-image generative models are becoming increasingly popular and\naccessible to the general public. As these models see large-scale deployments,\nit is necessary to deeply investigate their safety and fairness to not\ndisseminate and perpetuate any kind of biases. However, existing works focus on\ndetecting closed sets of biases defined a priori, limiting the studies to\nwell-known concepts. In this paper, we tackle the challenge of open-set bias\ndetection in text-to-image generative models presenting OpenBias, a new\npipeline that identifies and quantifies the severity of biases agnostically,\nwithout access to any precompiled set. OpenBias has three stages. In the first\nphase, we leverage a Large Language Model (LLM) to propose biases given a set\nof captions. Secondly, the target generative model produces images using the\nsame set of captions. Lastly, a Vision Question Answering model recognizes the\npresence and extent of the previously proposed biases. We study the behavior of\nStable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated\nbefore. Via quantitative experiments, we demonstrate that OpenBias agrees with\ncurrent closed-set bias detection methods and human judgement.\n","authors":["Moreno D'Incà","Elia Peruzzo","Massimiliano Mancini","Dejia Xu","Vidit Goel","Xingqian Xu","Zhangyang Wang","Humphrey Shi","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2404.07990v1.pdf","comment":"CVPR 2024 Highlight - Code:\n https://github.com/Picsart-AI-Research/OpenBias"},{"id":"http://arxiv.org/abs/2404.07989v1","updated":"2024-04-11T17:59:45Z","published":"2024-04-11T17:59:45Z","title":"Any2Point: Empowering Any-modality Large Models for Efficient 3D\n Understanding","summary":" Large foundation models have recently emerged as a prominent focus of\ninterest, attaining superior performance in widespread scenarios. Due to the\nscarcity of 3D data, many efforts have been made to adapt pre-trained\ntransformers from vision to 3D domains. However, such 2D-to-3D approaches are\nstill limited, due to the potential loss of spatial geometries and high\ncomputation cost. More importantly, their frameworks are mainly designed for 2D\nmodels, lacking a general any-to-3D paradigm. In this paper, we introduce\nAny2Point, a parameter-efficient method to empower any-modality large models\n(vision, language, audio) for 3D understanding. Given a frozen transformer from\nany source modality, we propose a 3D-to-any (1D or 2D) virtual projection\nstrategy that correlates the input 3D points to the original 1D or 2D positions\nwithin the source modality. This mechanism enables us to assign each 3D token\nwith a positional encoding paired with the pre-trained model, which avoids 3D\ngeometry loss caused by the true projection and better motivates the\ntransformer for 3D learning with 1D/2D positional priors. Then, within each\ntransformer block, we insert an any-to-3D guided adapter module for\nparameter-efficient fine-tuning. The adapter incorporates prior spatial\nknowledge from the source modality to guide the local feature aggregation of 3D\ntokens, compelling the semantic adaption of any-modality transformers. 
We\nconduct extensive experiments to showcase the effectiveness and efficiency of\nour method. Code and models are released at\nhttps://github.com/Ivan-Tang-3D/Any2Point.\n","authors":["Yiwen Tang","Jiaming Liu","Dong Wang","Zhigang Wang","Shanghang Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07989v1.pdf","comment":"Code and models are released at\n https://github.com/Ivan-Tang-3D/Any2Point"},{"id":"http://arxiv.org/abs/2401.10222v2","updated":"2024-04-11T17:59:42Z","published":"2024-01-18T18:58:54Z","title":"Supervised Fine-tuning in turn Improves Visual Foundation Models","summary":" Image-text training like CLIP has dominated the pretraining of vision\nfoundation models in recent years. Subsequent efforts have been made to\nintroduce region-level visual learning into CLIP's pretraining but face\nscalability challenges due to the lack of large-scale region-level datasets.\nDrawing inspiration from supervised fine-tuning (SFT) in natural language\nprocessing such as instruction tuning, we explore the potential of fine-grained\nSFT in enhancing the generation of vision foundation models after their\npretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash\nthe fine-grained knowledge of vision foundation models. In ViSFT, the vision\nfoundation model is enhanced by performing visual joint learning on some\nin-domain tasks and then tested on out-of-domain benchmarks. With updating\nusing ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over\n4.4B parameters shows improvements across various out-of-domain benchmarks\nincluding vision and vision-linguistic scenarios.\n","authors":["Xiaohu Jiang","Yixiao Ge","Yuying Ge","Dachuan Shi","Chun Yuan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.10222v2.pdf","comment":"23 pages, 3 figures, Project page:\n https://github.com/TencentARC/ViSFT/tree/main"},{"id":"http://arxiv.org/abs/2404.07988v1","updated":"2024-04-11T17:59:40Z","published":"2024-04-11T17:59:40Z","title":"QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous\n Manipulations Transfer","summary":" We explore the dexterous manipulation transfer problem by designing\nsimulators. The task wishes to transfer human manipulations to dexterous robot\nhand simulations and is inherently difficult due to its intricate,\nhighly-constrained, and discontinuous dynamics and the need to control a\ndexterous hand with a DoF to accurately replicate human manipulations. Previous\napproaches that optimize in high-fidelity black-box simulators or a modified\none with relaxed constraints only demonstrate limited capabilities or are\nrestricted by insufficient simulation fidelity. We introduce parameterized\nquasi-physical simulators and a physics curriculum to overcome these\nlimitations. The key ideas are 1) balancing between fidelity and optimizability\nof the simulation via a curriculum of parameterized simulators, and 2) solving\nthe problem in each of the simulators from the curriculum, with properties\nranging from high task optimizability to high fidelity. We successfully enable\na dexterous hand to track complex and diverse manipulations in high-fidelity\nsimulated environments, boosting the success rate by 11\\%+ from the\nbest-performed baseline. 
The project website is available at\nhttps://meowuu7.github.io/QuasiSim/.\n","authors":["Xueyi Liu","Kangbo Lyu","Jieqiong Zhang","Tao Du","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.07988v1.pdf","comment":"Project website: https://meowuu7.github.io/QuasiSim/ Code:\n https://github.com/Meowuu7/QuasiSim Hugging Face Demo:\n https://huggingface.co/spaces/xymeow7/quasi-physical-sims"},{"id":"http://arxiv.org/abs/2404.07987v1","updated":"2024-04-11T17:59:09Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v1.pdf","comment":"Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2404.07985v1","updated":"2024-04-11T17:58:44Z","published":"2024-04-11T17:58:44Z","title":"WaveMo: Learning Wavefront Modulations to See Through Scattering","summary":" Imaging through scattering media is a fundamental and pervasive challenge in\nfields ranging from medical diagnostics to astronomy. A promising strategy to\novercome this challenge is wavefront modulation, which induces measurement\ndiversity during image acquisition. Despite its importance, designing optimal\nwavefront modulations to image through scattering remains under-explored. This\npaper introduces a novel learning-based framework to address the gap. Our\napproach jointly optimizes wavefront modulations and a computationally\nlightweight feedforward \"proxy\" reconstruction network. This network is trained\nto recover scenes obscured by scattering, using measurements that are modified\nby these modulations. The learned modulations produced by our framework\ngeneralize effectively to unseen scattering scenarios and exhibit remarkable\nversatility. 
During deployment, the learned modulations can be decoupled from\nthe proxy network to augment other more computationally expensive restoration\nalgorithms. Through extensive experiments, we demonstrate our approach\nsignificantly advances the state of the art in imaging through scattering\nmedia. Our project webpage is at https://wavemo-2024.github.io/.\n","authors":["Mingyang Xie","Haiyun Guo","Brandon Y. Feng","Lingbo Jin","Ashok Veeraraghavan","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2404.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07984v1","updated":"2024-04-11T17:58:11Z","published":"2024-04-11T17:58:11Z","title":"View Selection for 3D Captioning via Diffusion Ranking","summary":" Scalable annotation approaches are crucial for constructing extensive 3D-text\ndatasets, facilitating a broader range of applications. However, existing\nmethods sometimes lead to the generation of hallucinated captions, compromising\ncaption quality. This paper explores the issue of hallucination in 3D object\ncaptioning, with a focus on Cap3D method, which renders 3D objects into 2D\nviews for captioning using pre-trained models. We pinpoint a major challenge:\ncertain rendered views of 3D objects are atypical, deviating from the training\ndata of standard image captioning models and causing hallucinations. To tackle\nthis, we present DiffuRank, a method that leverages a pre-trained text-to-3D\nmodel to assess the alignment between 3D objects and their 2D rendered views,\nwhere the view with high alignment closely represent the object's\ncharacteristics. By ranking all rendered views and feeding the top-ranked ones\ninto GPT4-Vision, we enhance the accuracy and detail of captions, enabling the\ncorrection of 200k captions in the Cap3D dataset and extending it to 1 million\ncaptions across Objaverse and Objaverse-XL datasets. Additionally, we showcase\nthe adaptability of DiffuRank by applying it to pre-trained text-to-image\nmodels for a Visual Question Answering task, where it outperforms the CLIP\nmodel.\n","authors":["Tiange Luo","Justin Johnson","Honglak Lee"],"pdf_url":"https://arxiv.org/pdf/2404.07984v1.pdf","comment":"Dataset link: https://huggingface.co/datasets/tiange/Cap3D"},{"id":"http://arxiv.org/abs/2404.07983v1","updated":"2024-04-11T17:58:06Z","published":"2024-04-11T17:58:06Z","title":"Two Effects, One Trigger: On the Modality Gap, Object Bias, and\n Information Imbalance in Contrastive Vision-Language Representation Learning","summary":" Contrastive vision-language models like CLIP have gained popularity for their\nversatile applicable learned representations in various downstream tasks.\nDespite their successes in some tasks, like zero-shot image recognition, they\nalso perform surprisingly poor on other tasks, like attribute detection.\nPrevious work has attributed these challenges to the modality gap, a separation\nof image and text in the shared representation space, and a bias towards\nobjects over other factors, such as attributes. In this work we investigate\nboth phenomena. We find that only a few embedding dimensions drive the modality\ngap. Further, we propose a measure for object bias and find that object bias\ndoes not lead to worse performance on other concepts, such as attributes. But\nwhat leads to the emergence of the modality gap and object bias? To answer this\nquestion we carefully designed an experimental setting which allows us to\ncontrol the amount of shared information between the modalities. 
This revealed\nthat the driving factor behind both, the modality gap and the object bias, is\nthe information imbalance between images and captions.\n","authors":["Simon Schrodi","David T. Hoffmann","Max Argus","Volker Fischer","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2404.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07977v1","updated":"2024-04-11T17:57:19Z","published":"2024-04-11T17:57:19Z","title":"Gaga: Group Any Gaussians via 3D-aware Memory Bank","summary":" We introduce Gaga, a framework that reconstructs and segments open-world 3D\nscenes by leveraging inconsistent 2D masks predicted by zero-shot segmentation\nmodels. Contrasted to prior 3D scene segmentation approaches that heavily rely\non video object tracking, Gaga utilizes spatial information and effectively\nassociates object masks across diverse camera poses. By eliminating the\nassumption of continuous view changes in training images, Gaga demonstrates\nrobustness to variations in camera poses, particularly beneficial for sparsely\nsampled images, ensuring precise mask label consistency. Furthermore, Gaga\naccommodates 2D segmentation masks from diverse sources and demonstrates robust\nperformance with different open-world zero-shot segmentation models, enhancing\nits versatility. Extensive qualitative and quantitative evaluations demonstrate\nthat Gaga performs favorably against state-of-the-art methods, emphasizing its\npotential for real-world applications such as scene understanding and\nmanipulation.\n","authors":["Weijie Lyu","Xueting Li","Abhijit Kundu","Yi-Hsuan Tsai","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07977v1.pdf","comment":"Project Page: https://www.gaga.gallery"},{"id":"http://arxiv.org/abs/2404.07976v1","updated":"2024-04-11T17:56:40Z","published":"2024-04-11T17:56:40Z","title":"Self-supervised Dataset Distillation: A Good Compression Is All You Need","summary":" Dataset distillation aims to compress information from a large-scale original\ndataset to a new compact dataset while striving to preserve the utmost degree\nof the original data informational essence. Previous studies have predominantly\nconcentrated on aligning the intermediate statistics between the original and\ndistilled data, such as weight trajectory, features, gradient, BatchNorm, etc.\nIn this work, we consider addressing this task through the new lens of model\ninformativeness in the compression stage on the original dataset pretraining.\nWe observe that with the prior state-of-the-art SRe$^2$L, as model sizes\nincrease, it becomes increasingly challenging for supervised pretrained models\nto recover learned information during data synthesis, as the channel-wise mean\nand variance inside the model are flatting and less informative. We further\nnotice that larger variances in BN statistics from self-supervised models\nenable larger loss signals to update the recovered data by gradients, enjoying\nmore informativeness during synthesis. Building on this observation, we\nintroduce SC-DD, a simple yet effective Self-supervised Compression framework\nfor Dataset Distillation that facilitates diverse information compression and\nrecovery compared to traditional supervised learning schemes, further reaps the\npotential of large pretrained models with enhanced capabilities. Extensive\nexperiments are conducted on CIFAR-100, Tiny-ImageNet and ImageNet-1K datasets\nto demonstrate the superiority of our proposed approach. 
The proposed SC-DD\noutperforms all previous state-of-the-art supervised dataset distillation\nmethods when employing larger models, such as SRe$^2$L, MTT, TESLA, DC, CAFE,\netc., by large margins under the same recovery and post-training budgets. Code\nis available at https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.\n","authors":["Muxin Zhou","Zeyuan Yin","Shitong Shao","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.07976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07973v1","updated":"2024-04-11T17:56:05Z","published":"2024-04-11T17:56:05Z","title":"Ferret-v2: An Improved Baseline for Referring and Grounding with Large\n Language Models","summary":" While Ferret seamlessly integrates regional understanding into the Large\nLanguage Model (LLM) to facilitate its referring and grounding capability, it\nposes certain limitations: constrained by the pre-trained fixed visual encoder\nand failed to perform well on broader tasks. In this work, we unveil Ferret-v2,\na significant upgrade to Ferret, with three key designs. (1) Any resolution\ngrounding and referring: A flexible approach that effortlessly handles higher\nimage resolution, improving the model's ability to process and understand\nimages in greater detail. (2) Multi-granularity visual encoding: By integrating\nthe additional DINOv2 encoder, the model learns better and diverse underlying\ncontexts for global and fine-grained visual information. (3) A three-stage\ntraining paradigm: Besides image-caption alignment, an additional stage is\nproposed for high-resolution dense alignment before the final instruction\ntuning. Experiments show that Ferret-v2 provides substantial improvements over\nFerret and other state-of-the-art methods, thanks to its high-resolution\nscaling and fine-grained visual processing.\n","authors":["Haotian Zhang","Haoxuan You","Philipp Dufter","Bowen Zhang","Chen Chen","Hong-You Chen","Tsu-Jui Fu","William Yang Wang","Shih-Fu Chang","Zhe Gan","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07973v1.pdf","comment":"Preprint. 14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07949v1","updated":"2024-04-11T17:46:14Z","published":"2024-04-11T17:46:14Z","title":"Taming Stable Diffusion for Text to 360° Panorama Image Generation","summary":" Generative models, e.g., Stable Diffusion, have enabled the creation of\nphotorealistic images from text prompts. Yet, the generation of 360-degree\npanorama images from text remains a challenge, particularly due to the dearth\nof paired text-panorama data and the domain gap between panorama and\nperspective images. In this paper, we introduce a novel dual-branch diffusion\nmodel named PanFusion to generate a 360-degree image from a text prompt. We\nleverage the stable diffusion model as one branch to provide prior knowledge in\nnatural image generation and register it to another panorama branch for\nholistic image generation. We propose a unique cross-attention mechanism with\nprojection awareness to minimize distortion during the collaborative denoising\nprocess. Our experiments validate that PanFusion surpasses existing methods\nand, thanks to its dual-branch structure, can integrate additional constraints\nlike room layout for customized panorama outputs. Code is available at\nhttps://chengzhag.github.io/publication/panfusion.\n","authors":["Cheng Zhang","Qianyi Wu","Camilo Cruz Gambardella","Xiaoshui Huang","Dinh Phung","Wanli Ouyang","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2404.07949v1.pdf","comment":"CVPR 2024. 
Project Page:\n https://chengzhag.github.io/publication/panfusion Code:\n https://github.com/chengzhag/PanFusion"},{"id":"http://arxiv.org/abs/2404.07933v1","updated":"2024-04-11T17:30:24Z","published":"2024-04-11T17:30:24Z","title":"Boosting Self-Supervision for Single-View Scene Completion via Knowledge\n Distillation","summary":" Inferring scene geometry from images via Structure from Motion is a\nlong-standing and fundamental problem in computer vision. While classical\napproaches and, more recently, depth map predictions only focus on the visible\nparts of a scene, the task of scene completion aims to reason about geometry\neven in occluded regions. With the popularity of neural radiance fields\n(NeRFs), implicit representations also became popular for scene completion by\npredicting so-called density fields. Unlike explicit approaches. e.g.\nvoxel-based methods, density fields also allow for accurate depth prediction\nand novel-view synthesis via image-based rendering. In this work, we propose to\nfuse the scene reconstruction from multiple images and distill this knowledge\ninto a more accurate single-view scene reconstruction. To this end, we propose\nMulti-View Behind the Scenes (MVBTS) to fuse density fields from multiple posed\nimages, trained fully self-supervised only from image data. Using knowledge\ndistillation, we use MVBTS to train a single-view scene completion network via\ndirect supervision called KDBTS. It achieves state-of-the-art performance on\noccupancy prediction, especially in occluded regions.\n","authors":["Keonhee Han","Dominik Muhle","Felix Wimbauer","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.07933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07932v1","updated":"2024-04-11T17:29:56Z","published":"2024-04-11T17:29:56Z","title":"FusionMamba: Efficient Image Fusion with State Space Model","summary":" Image fusion aims to generate a high-resolution multi/hyper-spectral image by\ncombining a high-resolution image with limited spectral information and a\nlow-resolution image with abundant spectral data. Current deep learning\n(DL)-based methods for image fusion primarily rely on CNNs or Transformers to\nextract features and merge different types of data. While CNNs are efficient,\ntheir receptive fields are limited, restricting their capacity to capture\nglobal context. Conversely, Transformers excel at learning global information\nbut are hindered by their quadratic complexity. Fortunately, recent\nadvancements in the State Space Model (SSM), particularly Mamba, offer a\npromising solution to this issue by enabling global awareness with linear\ncomplexity. However, there have been few attempts to explore the potential of\nSSM in information fusion, which is a crucial ability in domains like image\nfusion. Therefore, we propose FusionMamba, an innovative method for efficient\nimage fusion. Our contributions mainly focus on two aspects. Firstly,\nrecognizing that images from different sources possess distinct properties, we\nincorporate Mamba blocks into two U-shaped networks, presenting a novel\narchitecture that extracts spatial and spectral features in an efficient,\nindependent, and hierarchical manner. Secondly, to effectively combine spatial\nand spectral information, we extend the Mamba block to accommodate dual inputs.\nThis expansion leads to the creation of a new module called the FusionMamba\nblock, which outperforms existing fusion techniques such as concatenation and\ncross-attention. 
To validate FusionMamba's effectiveness, we conduct a series\nof experiments on five datasets related to three image fusion tasks. The\nquantitative and qualitative evaluation results demonstrate that our method\nachieves state-of-the-art (SOTA) performance, underscoring the superiority of\nFusionMamba.\n","authors":["Siran Peng","Xiangyu Zhu","Haoyu Deng","Zhen Lei","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07930v1","updated":"2024-04-11T17:27:39Z","published":"2024-04-11T17:27:39Z","title":"Parameter Hierarchical Optimization for Visible-Infrared Person\n Re-Identification","summary":" Visible-infrared person re-identification (VI-reID) aims at matching\ncross-modality pedestrian images captured by disjoint visible or infrared\ncameras. Existing methods alleviate the cross-modality discrepancies via\ndesigning different kinds of network architectures. Different from available\nmethods, in this paper, we propose a novel parameter optimizing paradigm,\nparameter hierarchical optimization (PHO) method, for the task of VI-ReID. It\nallows part of parameters to be directly optimized without any training, which\nnarrows the search space of parameters and makes the whole network more easier\nto be trained. Specifically, we first divide the parameters into different\ntypes, and then introduce a self-adaptive alignment strategy (SAS) to\nautomatically align the visible and infrared images through transformation.\nConsidering that features in different dimension have varying importance, we\ndevelop an auto-weighted alignment learning (AAL) module that can automatically\nweight features according to their importance. Importantly, in the alignment\nprocess of SAS and AAL, all the parameters are immediately optimized with\noptimization principles rather than training the whole network, which yields a\nbetter parameter training manner. Furthermore, we establish the cross-modality\nconsistent learning (CCL) loss to extract discriminative person representations\nwith translation consistency. We provide both theoretical justification and\nempirical evidence that our proposed PHO method outperform existing VI-reID\napproaches.\n","authors":["Zeng YU","Yunxiao Shi"],"pdf_url":"https://arxiv.org/pdf/2404.07930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07922v1","updated":"2024-04-11T17:09:28Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. Meanwhile there are plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in address this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce LaVy-Bench benchmark designated for evaluating\nMLLMs's understanding on Vietnamese visual language tasks. 
All code and model\nweights are public at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2401.04716v3","updated":"2024-04-11T16:46:52Z","published":"2024-01-09T18:40:52Z","title":"Low-Resource Vision Challenges for Foundation Models","summary":" Low-resource settings are well-established in natural language processing,\nwhere many languages lack sufficient data for deep learning at scale. However,\nlow-resource problems are under-explored in computer vision. In this paper, we\naddress this gap and explore the challenges of low-resource image tasks with\nvision foundation models. We first collect a benchmark of genuinely\nlow-resource image data, covering historic maps, circuit diagrams, and\nmechanical drawings. These low-resource settings all share three challenges:\ndata scarcity, fine-grained differences, and the distribution shift from\nnatural images to the specialized domain of interest. While existing foundation\nmodels have shown impressive generalizability, we find they cannot transfer\nwell to our low-resource tasks. To begin to tackle the challenges of\nlow-resource vision, we introduce one simple baseline per challenge.\nSpecifically, we i) enlarge the data space by generative models, ii) adopt the\nbest sub-kernels to encode local regions for fine-grained difference discovery\nand iii) learn attention for specialized domains. Experiments on our three\nlow-resource tasks demonstrate our proposals already provide a better baseline\nthan transfer learning, data augmentation, and fine-grained methods. This\nhighlights the unique characteristics and challenges of low-resource vision for\nfoundation models that warrant further investigation. Project page:\nhttps://xiaobai1217.github.io/Low-Resource-Vision/.\n","authors":["Yunhua Zhang","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2401.04716v3.pdf","comment":"Accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2401.08739v2","updated":"2024-04-11T16:35:22Z","published":"2024-01-16T18:55:22Z","title":"EgoGen: An Egocentric Synthetic Data Generator","summary":" Understanding the world in first-person view is fundamental in Augmented\nReality (AR). This immersive perspective brings dramatic visual changes and\nunique challenges compared to third-person views. Synthetic data has empowered\nthird-person-view vision models, but its application to embodied egocentric\nperception tasks remains largely unexplored. A critical challenge lies in\nsimulating natural human movements and behaviors that effectively steer the\nembodied cameras to capture a faithful egocentric representation of the 3D\nworld. To address this challenge, we introduce EgoGen, a new synthetic data\ngenerator that can produce accurate and rich ground-truth training data for\negocentric perception tasks. At the heart of EgoGen is a novel human motion\nsynthesis model that directly leverages egocentric visual inputs of a virtual\nhuman to sense the 3D environment. Combined with collision-avoiding motion\nprimitives and a two-stage reinforcement learning approach, our motion\nsynthesis model offers a closed-loop solution where the embodied perception and\nmovement of the virtual human are seamlessly coupled. Compared to previous\nworks, our model eliminates the need for a pre-defined global path, and is\ndirectly applicable to dynamic environments. 
Combined with our easy-to-use and\nscalable data generation pipeline, we demonstrate EgoGen's efficacy in three\ntasks: mapping and localization for head-mounted cameras, egocentric camera\ntracking, and human mesh recovery from egocentric views. EgoGen will be fully\nopen-sourced, offering a practical solution for creating realistic egocentric\ntraining data and aiming to serve as a useful tool for egocentric computer\nvision research. Refer to our project page: https://ego-gen.github.io/.\n","authors":["Gen Li","Kaifeng Zhao","Siwei Zhang","Xiaozhong Lyu","Mihai Dusmanu","Yan Zhang","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08739v2.pdf","comment":"Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page:\n https://ego-gen.github.io/"},{"id":"http://arxiv.org/abs/2404.07887v1","updated":"2024-04-11T16:17:36Z","published":"2024-04-11T16:17:36Z","title":"Context-aware Video Anomaly Detection in Long-Term Datasets","summary":" Video anomaly detection research is generally evaluated on short, isolated\nbenchmark videos only a few minutes long. However, in real-world environments,\nsecurity cameras observe the same scene for months or years at a time, and the\nnotion of anomalous behavior critically depends on context, such as the time of\nday, day of week, or schedule of events. Here, we propose a context-aware video\nanomaly detection algorithm, Trinity, specifically targeted to these scenarios.\nTrinity is especially well-suited to crowded scenes in which individuals cannot\nbe easily tracked, and anomalies are due to speed, direction, or absence of\ngroup motion. Trinity is a contrastive learning framework that aims to learn\nalignments between context, appearance, and motion, and uses alignment quality\nto classify videos as normal or anomalous. We evaluate our algorithm on both\nconventional benchmarks and a public webcam-based dataset we collected that\nspans more than three months of activity.\n","authors":["Zhengye Yang","Richard Radke"],"pdf_url":"https://arxiv.org/pdf/2404.07887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06564v2","updated":"2024-04-11T16:06:39Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\n(Locality-Enhanced State Space) LSS modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. 
Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate state-of-the-art performance,\nsubstantiating the method's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07867v1","updated":"2024-04-11T16:01:00Z","published":"2024-04-11T16:01:00Z","title":"The Power of Properties: Uncovering the Influential Factors in Emotion\n Classification","summary":" Facial expression-based human emotion recognition is a critical research area\nin psychology and medicine. State-of-the-art classification performance is only\nreached by end-to-end trained neural networks. Nevertheless, such black-box\nmodels lack transparency in their decision-making processes, prompting efforts\nto ascertain the rules that underlie classifiers' decisions. Analyzing single\ninputs alone fails to expose systematic learned biases. These biases can be\ncharacterized as facial properties summarizing abstract information like age or\nmedical conditions. Therefore, understanding a model's prediction behavior\nrequires an analysis rooted in causality along such selected properties. We\ndemonstrate that up to 91.25% of classifier output behavior changes are\nstatistically significant concerning basic properties. Among those are age,\ngender, and facial symmetry. Furthermore, the medical usage of surface\nelectromyography significantly influences emotion prediction. We introduce a\nworkflow to evaluate explicit properties and their impact. These insights might\nhelp medical professionals select and apply classifiers regarding their\nspecialized data and properties.\n","authors":["Tim Büchner","Niklas Penzel","Orlando Guntinas-Lichius","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.07867v1.pdf","comment":"8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024"},{"id":"http://arxiv.org/abs/2404.06177v2","updated":"2024-04-11T15:57:52Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. 
The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07855v1","updated":"2024-04-11T15:51:52Z","published":"2024-04-11T15:51:52Z","title":"Resolve Domain Conflicts for Generalizable Remote Physiological\n Measurement","summary":" Remote photoplethysmography (rPPG) technology has become increasingly popular\ndue to its non-invasive monitoring of various physiological indicators, making\nit widely applicable in multimedia interaction, healthcare, and emotion\nanalysis. Existing rPPG methods utilize multiple datasets for training to\nenhance the generalizability of models. However, they often overlook the\nunderlying conflict issues across different datasets, such as (1) label\nconflict resulting from different phase delays between physiological signal\nlabels and face videos at the instance level, and (2) attribute conflict\nstemming from distribution shifts caused by head movements, illumination\nchanges, skin types, etc. To address this, we introduce the DOmain-HArmonious\nframework (DOHA). Specifically, we first propose a harmonious phase strategy to\neliminate uncertain phase delays and preserve the temporal variation of\nphysiological signals. Next, we design a harmonious hyperplane optimization\nthat reduces irrelevant attribute shifts and encourages the model's\noptimization towards a global solution that fits more valid scenarios. Our\nexperiments demonstrate that DOHA significantly improves the performance of\nexisting methods under multiple protocols. Our code is available at\nhttps://github.com/SWY666/rPPG-DOHA.\n","authors":["Weiyu Sun","Xinyu Zhang","Hao Lu","Ying Chen","Yun Ge","Xiaolin Huang","Jie Yuan","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07855v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2404.07850v1","updated":"2024-04-11T15:46:42Z","published":"2024-04-11T15:46:42Z","title":"MindBridge: A Cross-Subject Brain Decoding Framework","summary":" Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli\nfrom acquired brain signals, primarily utilizing functional magnetic resonance\nimaging (fMRI). Currently, brain decoding is confined to a\nper-subject-per-model paradigm, limiting its applicability to the same\nindividual for whom the decoding model is trained. This constraint stems from\nthree key challenges: 1) the inherent variability in input dimensions across\nsubjects due to differences in brain size; 2) the unique intrinsic neural\npatterns, influencing how different individuals perceive and process sensory\ninformation; 3) limited data availability for new subjects in real-world\nscenarios hampers the performance of decoding models. In this paper, we present\na novel approach, MindBridge, that achieves cross-subject brain decoding by\nemploying only one model. Our proposed framework establishes a generic paradigm\ncapable of addressing these challenges by introducing biological-inspired\naggregation function and novel cyclic fMRI reconstruction mechanism for\nsubject-invariant representation learning. Notably, by cycle reconstruction of\nfMRI, MindBridge can enable novel fMRI synthesis, which also can serve as\npseudo data augmentation. 
Within the framework, we also devise a novel\nreset-tuning method for adapting a pretrained model to a new subject.\nExperimental results demonstrate MindBridge's ability to reconstruct images for\nmultiple subjects, which is competitive with dedicated subject-specific models.\nFurthermore, with limited data for a new subject, we achieve a high level of\ndecoding accuracy, surpassing that of subject-specific models. This advancement\nin cross-subject brain decoding suggests promising directions for wider\napplications in neuroscience and indicates potential for more efficient\nutilization of limited fMRI data in real-world scenarios. Project page:\nhttps://littlepure2333.github.io/MindBridge\n","authors":["Shizun Wang","Songhua Liu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07850v1.pdf","comment":"CVPR 2024 highlight. Code is available at\n https://github.com/littlepure2333/MindBridge"},{"id":"http://arxiv.org/abs/2404.07847v1","updated":"2024-04-11T15:42:53Z","published":"2024-04-11T15:42:53Z","title":"Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd\n Counting","summary":" In the field of crowd-counting research, many recent deep learning based\nmethods have demonstrated robust capabilities for accurately estimating crowd\nsizes. However, the enhancement in their performance often arises from an\nincrease in the complexity of the model structure. This paper introduces the\nFuss-Free Network (FFNet), a crowd counting deep learning model that is\ncharacterized by its simplicity and efficiency in terms of its structure. The\nmodel comprises only a backbone of a neural network and a multi-scale feature\nfusion structure. The multi-scale feature fusion structure is a simple\narchitecture consisting of three branches, each only equipped with a focus\ntransition module, and combines the features from these branches through the\nconcatenation operation. Our proposed crowd counting model is trained and\nevaluated on four widely used public datasets, and it achieves accuracy that is\ncomparable to that of existing complex models. The experimental results further\nindicate that excellent performance in crowd counting tasks can also be\nachieved by utilizing a simple, low-parameter, and computationally efficient\nneural network structure.\n","authors":["Lei Chen","Xingen Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07846v1","updated":"2024-04-11T15:39:10Z","published":"2024-04-11T15:39:10Z","title":"TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image\n Denoising","summary":" Blind-spot networks (BSN) have been prevalent network architectures in\nself-supervised image denoising (SSID). Existing BSNs are mostly conducted with\nconvolution layers. Although transformers offer potential solutions to the\nlimitations of convolutions and have demonstrated success in various image\nrestoration tasks, their attention mechanisms may violate the blind-spot\nrequirement, thus restricting their applicability in SSID. In this paper, we\npresent a transformer-based blind-spot network (TBSN) by analyzing and\nredesigning the transformer operators that meet the blind-spot requirement.\nSpecifically, TBSN follows the architectural principles of dilated BSNs, and\nincorporates spatial as well as channel self-attention layers to enhance the\nnetwork capability. 
For spatial self-attention, an elaborate mask is applied to\nthe attention matrix to restrict its receptive field, thus mimicking the\ndilated convolution. For channel self-attention, we observe that it may leak\nthe blind-spot information when the channel number is greater than spatial size\nin the deep layers of multi-scale architectures. To eliminate this effect, we\ndivide the channel into several groups and perform channel attention\nseparately. Furthermore, we introduce a knowledge distillation strategy that\ndistills TBSN into smaller denoisers to improve computational efficiency while\nmaintaining performance. Extensive experiments on real-world image denoising\ndatasets show that TBSN largely extends the receptive field and exhibits\nfavorable performance against state-of-the-art SSID methods. The code and\npre-trained models will be publicly available at\nhttps://github.com/nagejacob/TBSN.\n","authors":["Junyi Li","Zhilu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.07846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08890v3","updated":"2024-04-11T15:34:46Z","published":"2023-02-17T14:19:28Z","title":"Deep Learning for Event-based Vision: A Comprehensive Survey and\n Benchmarks","summary":" Event cameras are bio-inspired sensors that capture the per-pixel intensity\nchanges asynchronously and produce event streams encoding the time, pixel\nposition, and polarity (sign) of the intensity changes. Event cameras possess a\nmyriad of advantages over canonical frame-based cameras, such as high temporal\nresolution, high dynamic range, low latency, etc. Being capable of capturing\ninformation in challenging visual conditions, event cameras have the potential\nto overcome the limitations of frame-based cameras in the computer vision and\nrobotics community. In very recent years, deep learning (DL) has been brought\nto this emerging field and inspired active research endeavors in mining its\npotential. However, there is still a lack of taxonomies in DL techniques for\nevent-based vision. We first scrutinize the typical event representations with\nquality enhancement methods as they play a pivotal role as inputs to the DL\nmodels. We then provide a comprehensive survey of existing DL-based methods by\nstructurally grouping them into two major categories: 1) image/video\nreconstruction and restoration; 2) event-based scene understanding and 3D\nvision. We conduct benchmark experiments for the existing methods in some\nrepresentative research directions, i.e., image reconstruction, deblurring, and\nobject recognition, to identify some critical insights and problems. Finally,\nwe have discussions regarding the challenges and provide new perspectives for\ninspiring more research studies.\n","authors":["Xu Zheng","Yexin Liu","Yunfan Lu","Tongyan Hua","Tianbo Pan","Weiming Zhang","Dacheng Tao","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2302.08890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06842v2","updated":"2024-04-11T15:28:36Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Channel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. 
We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in potential feature channels of the\nreconstruction error map also affect detail matching, so we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07833v1","updated":"2024-04-11T15:18:34Z","published":"2024-04-11T15:18:34Z","title":"Streamlined Photoacoustic Image Processing with Foundation Models: A\n Training-Free Solution","summary":" Foundation models have rapidly evolved and have achieved significant\naccomplishments in computer vision tasks. Specifically, the prompt mechanism\nconveniently allows users to integrate image prior information into the model,\nmaking it possible to apply models without any training. Therefore, we propose\na method based on foundation models and zero training to solve the tasks of\nphotoacoustic (PA) image segmentation. We employed the segment anything model\n(SAM) by setting simple prompts and integrating the model's outputs with prior\nknowledge of the imaged objects to accomplish various tasks, including: (1)\nremoving the skin signal in three-dimensional PA image rendering; (2) dual\nspeed-of-sound reconstruction, and (3) segmentation of finger blood vessels.\nThrough these demonstrations, we have concluded that deep learning can be\ndirectly applied in PA imaging without the requirement for network design and\ntraining. This potentially allows for a hands-on, convenient approach to\nachieving efficient and accurate segmentation of PA images. This letter serves\nas a comprehensive tutorial, facilitating the mastery of the technique through\nthe provision of code and sample datasets.\n","authors":["Handi Deng","Yucheng Zhou","Jiaxuan Xiang","Liujie Gu","Yan Luo","Hai Feng","Mingyuan Liu","Cheng Ma"],"pdf_url":"https://arxiv.org/pdf/2404.07833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07824v1","updated":"2024-04-11T15:09:22Z","published":"2024-04-11T15:09:22Z","title":"Heron-Bench: A Benchmark for Evaluating Vision Language Models in\n Japanese","summary":" Vision Language Models (VLMs) have undergone a rapid evolution, giving rise\nto significant advancements in the realm of multimodal understanding tasks.\nHowever, the majority of these models are trained and evaluated on\nEnglish-centric datasets, leaving a gap in the development and evaluation of\nVLMs for other languages, such as Japanese. This gap can be attributed to the\nlack of methodologies for constructing VLMs and the absence of benchmarks to\naccurately measure their performance. To address this issue, we introduce a\nnovel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of\nVLMs. 
The Japanese Heron-Bench consists of a variety of image-question answer\npairs tailored to the Japanese context. Additionally, we present a baseline\nJapanese VLM that has been trained with Japanese visual instruction tuning\ndatasets. Our Heron-Bench reveals the strengths and limitations of the proposed\nVLM across various ability dimensions. Furthermore, we clarify the capability\ngap between strong closed models like GPT-4V and the baseline model, providing\nvaluable insights for future research in this domain. We release the benchmark\ndataset and training code to facilitate further developments in Japanese VLM\nresearch.\n","authors":["Yuichi Inoue","Kento Sasaki","Yuma Ochi","Kazuki Fujii","Kotaro Tanahashi","Yu Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2404.07824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07821v1","updated":"2024-04-11T15:00:55Z","published":"2024-04-11T15:00:55Z","title":"Sparse Laneformer","summary":" Lane detection is a fundamental task in autonomous driving, and has achieved\ngreat progress as deep learning emerges. Previous anchor-based methods often\ndesign dense anchors, which highly depend on the training dataset and remain\nfixed during inference. We analyze that dense anchors are not necessary for\nlane detection, and propose a transformer-based lane detection framework based\non a sparse anchor mechanism. To this end, we generate sparse anchors with\nposition-aware lane queries and angle queries instead of traditional explicit\nanchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane\nfeatures along the horizontal direction, and adopt Lane-Angle Cross Attention\n(LACA) to perform interactions between lane queries and angle queries. We also\npropose Lane Perceptual Attention (LPA) based on deformable cross attention to\nfurther refine the lane predictions. Our method, named Sparse Laneformer, is\neasy-to-implement and end-to-end trainable. Extensive experiments demonstrate\nthat Sparse Laneformer performs favorably against the state-of-the-art methods,\ne.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score\nwith fewer MACs on CULane with the same ResNet-34 backbone.\n","authors":["Ji Liu","Zifeng Zhang","Mingjie Lu","Hongyang Wei","Dong Li","Yile Xie","Jinzhang Peng","Lu Tian","Ashish Sirasao","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2404.07821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07807v1","updated":"2024-04-11T14:51:12Z","published":"2024-04-11T14:51:12Z","title":"Voice-Assisted Real-Time Traffic Sign Recognition System Using\n Convolutional Neural Network","summary":" Traffic signs are important in communicating information to drivers. Thus,\ncomprehension of traffic signs is essential for road safety and ignorance may\nresult in road accidents. Traffic sign detection has been a research spotlight\nover the past few decades. Real-time and accurate detections are the\npreliminaries of a robust traffic sign detection system which is yet to be\nachieved. This study presents a voice-assisted real-time traffic sign\nrecognition system which is capable of assisting drivers. This system functions\nunder two subsystems. Initially, the detection and recognition of the traffic\nsigns are carried out using a trained Convolutional Neural Network (CNN). After\nrecognizing the specific traffic sign, it is narrated to the driver as a voice\nmessage using a text-to-speech engine. 
An efficient CNN model for a benchmark\ndataset is developed for real-time detection and recognition using Deep\nLearning techniques. The advantage of this system is that even if the driver\nmisses a traffic sign, or does not look at the traffic sign, or is unable to\ncomprehend the sign, the system detects it and narrates it to the driver. A\nsystem of this type is also important in the development of autonomous\nvehicles.\n","authors":["Mayura Manawadu","Udaya Wijenayake"],"pdf_url":"https://arxiv.org/pdf/2404.07807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07794v1","updated":"2024-04-11T14:35:59Z","published":"2024-04-11T14:35:59Z","title":"DGMamba: Domain Generalization via Generalized State Space Model","summary":" Domain generalization~(DG) aims at solving distribution shift problems in\nvarious scenes. Existing approaches are based on Convolution Neural Networks\n(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive\nfields or quadratic complexities issues. Mamba, as an emerging state space\nmodel (SSM), possesses superior linear complexity and global receptive fields.\nDespite this, it can hardly be applied to DG to address distribution shifts,\ndue to the hidden state issues and inappropriate scan mechanisms. In this\npaper, we propose a novel framework for DG, named DGMamba, that excels in\nstrong generalizability toward unseen domains and meanwhile has the advantages\nof global receptive fields, and efficient linear complexity. Our DGMamba\ncompromises two core components: Hidden State Suppressing~(HSS) and\nSemantic-aware Patch refining~(SPR). In particular, HSS is introduced to\nmitigate the influence of hidden states associated with domain-specific\nfeatures during output prediction. SPR strives to encourage the model to\nconcentrate more on objects rather than context, consisting of two designs:\nPrior-Free Scanning~(PFS), and Domain Context Interchange~(DCI). Concretely,\nPFS aims to shuffle the non-semantic patches within images, creating more\nflexible and effective sequences from images, and DCI is designed to regularize\nMamba with the combination of mismatched non-semantic and semantic information\nby fusing patches among domains. Extensive experiments on four commonly used DG\nbenchmarks demonstrate that the proposed DGMamba achieves remarkably superior\nresults to state-of-the-art models. The code will be made publicly available.\n","authors":["Shaocong Long","Qianyu Zhou","Xiangtai Li","Xuequan Lu","Chenhao Ying","Yuan Luo","Lizhuang Ma","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.07794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07790v1","updated":"2024-04-11T14:31:11Z","published":"2024-04-11T14:31:11Z","title":"VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing","summary":" Image dehazing poses significant challenges in environmental perception.\nRecent research mainly focus on deep learning-based methods with single\nmodality, while they may result in severe information loss especially in\ndense-haze scenarios. The infrared image exhibits robustness to the haze,\nhowever, existing methods have primarily treated the infrared modality as\nauxiliary information, failing to fully explore its rich information in\ndehazing. To address this challenge, the key insight of this study is to design\na visible-infrared fusion network for image dehazing. 
In particular, we propose\na multi-scale Deep Structure Feature Extraction (DSFE) module, which\nincorporates the Channel-Pixel Attention Block (CPAB) to restore more spatial\nand marginal information within the deep structural features. Additionally, we\nintroduce an inconsistency weighted fusion strategy to merge the two modalities\nby leveraging the more reliable information. To validate this, we construct a\nvisible-infrared multimodal dataset called AirSim-VID based on the AirSim\nsimulation platform. Extensive experiments performed on challenging real and\nsimulated image datasets demonstrate that VIFNet can outperform many\nstate-of-the-art competing methods. The code and dataset are available at\nhttps://github.com/mengyu212/VIFNet_dehazing.\n","authors":["Meng Yu","Te Cui","Haoyang Lu","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2404.07790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07788v1","updated":"2024-04-11T14:29:30Z","published":"2024-04-11T14:29:30Z","title":"AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene\n Graph Generation","summary":" Scene graph generation (SGG) aims to understand the visual objects and their\nsemantic relationships from one given image. Until now, lots of SGG datasets\nwith the eyelevel view are released but the SGG dataset with the overhead view\nis scarcely studied. By contrast to the object occlusion problem in the\neyelevel view, which impedes the SGG, the overhead view provides a new\nperspective that helps to promote the SGG by providing a clear perception of\nthe spatial relationships of objects in the ground scene. To fill in the gap of\nthe overhead view dataset, this paper constructs and releases an aerial image\nurban scene graph generation (AUG) dataset. Images from the AUG dataset are\ncaptured with the low-attitude overhead view. In the AUG dataset, 25,594\nobjects, 16,970 relationships, and 27,175 attributes are manually annotated. To\navoid the local context being overwhelmed in the complex aerial urban scene,\nthis paper proposes one new locality-preserving graph convolutional network\n(LPG). Different from the traditional graph convolutional network, which has\nthe natural advantage of capturing the global context for SGG, the\nconvolutional layer in the LPG integrates the non-destructive initial features\nof the objects with dynamically updated neighborhood information to preserve\nthe local context under the premise of mining the global context. 
To address\nthe problem that there exists an extra-large number of potential object\nrelationship pairs but only a small part of them is meaningful in AUG, we\npropose the adaptive bounding box scaling factor for potential relationship\ndetection (ABS-PRD) to intelligently prune the meaningless relationship pairs.\nExtensive experiments on the AUG dataset show that our LPG can significantly\noutperform the state-of-the-art methods and the effectiveness of the proposed\nlocality-preserving strategy.\n","authors":["Yansheng Li","Kun Li","Yongjun Zhang","Linlin Wang","Dingwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07785v1","updated":"2024-04-11T14:28:04Z","published":"2024-04-11T14:28:04Z","title":"PRAM: Place Recognition Anywhere Model for Efficient Visual Localization","summary":" Humans localize themselves efficiently in known environments by first\nrecognizing landmarks defined on certain objects and their spatial\nrelationships, and then verifying the location by aligning detailed structures\nof recognized objects with those in the memory. Inspired by this, we propose\nthe place recognition anywhere model (PRAM) to perform visual localization as\nefficiently as humans do. PRAM consists of two main components - recognition\nand registration. In detail, first of all, a self-supervised map-centric\nlandmark definition strategy is adopted, making places in either indoor or\noutdoor scenes act as unique landmarks. Then, sparse keypoints extracted from\nimages, are utilized as the input to a transformer-based deep neural network\nfor landmark recognition; these keypoints enable PRAM to recognize hundreds of\nlandmarks with high time and memory efficiency. Keypoints along with recognized\nlandmark labels are further used for registration between query images and the\n3D landmark map. Different from previous hierarchical methods, PRAM discards\nglobal and local descriptors, and reduces over 90% storage. Since PRAM utilizes\nrecognition and landmark-wise verification to replace global reference search\nand exhaustive matching respectively, it runs 2.4 times faster than prior\nstate-of-the-art approaches. Moreover, PRAM opens new directions for visual\nlocalization including multi-modality localization, map-centric feature\nlearning, and hierarchical scene coordinate regression.\n","authors":["Fei Xue","Ignas Budvytis","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2404.07785v1.pdf","comment":"project page: https://feixue94.github.io/pram-project/"},{"id":"http://arxiv.org/abs/2404.04562v2","updated":"2024-04-11T14:28:00Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling~(SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a \\textbf{single}\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. 
Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.07782v2","updated":"2024-04-11T14:27:27Z","published":"2024-01-15T15:43:56Z","title":"Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in\n Remote Sensing","summary":" Self-supervised learning through masked autoencoders (MAEs) has recently\nattracted great attention for remote sensing (RS) image representation\nlearning, and thus embodies a significant potential for content-based image\nretrieval (CBIR) from ever-growing RS image archives. However, the existing\nstudies on MAEs in RS assume that the considered RS images are acquired by a\nsingle image sensor, and thus are only suitable for uni-modal CBIR problems.\nThe effectiveness of MAEs for cross-sensor CBIR, which aims to search\nsemantically similar images across different image modalities, has not been\nexplored yet. In this paper, we take the first step to explore the\neffectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a\nsystematic overview on the possible adaptations of the vanilla MAE to exploit\nmasked image modeling on multi-sensor RS image archives (denoted as\ncross-sensor masked autoencoders [CSMAEs]). Based on different adjustments\napplied to the vanilla MAE, we introduce different CSMAE models. We also\nprovide an extensive experimental analysis of these CSMAE models. We finally\nderive a guideline to exploit masked image modeling for uni-modal and\ncross-modal CBIR problems in RS. The code of this work is publicly available at\nhttps://github.com/jakhac/CSMAE.\n","authors":["Jakob Hackstein","Gencer Sumbul","Kai Norman Clasen","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2401.07782v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Our code is available at https://github.com/jakhac/CSMAE"},{"id":"http://arxiv.org/abs/2309.09590v2","updated":"2024-04-11T14:24:09Z","published":"2023-09-18T08:54:29Z","title":"An Autonomous Vision-Based Algorithm for Interplanetary Navigation","summary":" The surge of deep-space probes makes it unsustainable to navigate them with\nstandard radiometric tracking. Self-driving interplanetary satellites represent\na solution to this problem. In this work, a full vision-based navigation\nalgorithm is built by combining an orbit determination method with an image\nprocessing pipeline suitable for interplanetary transfers of autonomous\nplatforms. To increase the computational efficiency of the algorithm, a\nnon-dimensional extended Kalman filter is selected as state estimator, fed by\nthe positions of the planets extracted from deep-space images. An enhancement\nof the estimation accuracy is performed by applying an optimal strategy to\nselect the best pair of planets to track. 
Moreover, a novel analytical\nmeasurement model for deep-space navigation is developed providing a\nfirst-order approximation of the light-aberration and light-time effects.\nAlgorithm performance is tested on a high-fidelity, Earth--Mars interplanetary\ntransfer, showing the algorithm applicability for deep-space navigation.\n","authors":["Eleonora Andreis","Paolo Panicucci","Francesco Topputo"],"pdf_url":"https://arxiv.org/pdf/2309.09590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18551v2","updated":"2024-04-11T14:10:43Z","published":"2024-03-27T13:31:39Z","title":"Attention Calibration for Disentangled Text-to-Image Personalization","summary":" Recent thrilling progress in large-scale text-to-image (T2I) models has\nunlocked unprecedented synthesis quality of AI-generated content (AIGC)\nincluding image generation, 3D and video composition. Further, personalized\ntechniques enable appealing customized production of a novel concept given only\nseveral images as reference. However, an intriguing problem persists: Is it\npossible to capture multiple, novel concepts from one single reference image?\nIn this paper, we identify that existing approaches fail to preserve visual\nconsistency with the reference image and eliminate cross-influence from\nconcepts. To alleviate this, we propose an attention calibration mechanism to\nimprove the concept-level understanding of the T2I model. Specifically, we\nfirst introduce new learnable modifiers bound with classes to capture\nattributes of multiple concepts. Then, the classes are separated and\nstrengthened following the activation of the cross-attention operation,\nensuring comprehensive and self-contained concepts. Additionally, we suppress\nthe attention activation of different classes to mitigate mutual influence\namong concepts. Together, our proposed method, dubbed DisenDiff, can learn\ndisentangled multiple concepts from one single image and produce novel\ncustomized images with learned concepts. We demonstrate that our method\noutperforms the current state of the art in both qualitative and quantitative\nevaluations. More importantly, our proposed techniques are compatible with LoRA\nand inpainting pipelines, enabling more interactive experiences.\n","authors":["Yanbing Zhang","Mengping Yang","Qin Zhou","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18551v2.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.07773v1","updated":"2024-04-11T14:08:45Z","published":"2024-04-11T14:08:45Z","title":"ConsistencyDet: Robust Object Detector with Denoising Paradigm of\n Consistency Model","summary":" Object detection, a quintessential task in the realm of perceptual computing,\ncan be tackled using a generative methodology. In the present study, we\nintroduce a novel framework designed to articulate object detection as a\ndenoising diffusion process, which operates on perturbed bounding boxes of\nannotated entities. This framework, termed ConsistencyDet, leverages an\ninnovative denoising concept known as the Consistency Model. The hallmark of\nthis model is its self-consistency feature, which empowers the model to map\ndistorted information from any temporal stage back to its pristine state,\nthereby realizing a ``one-step denoising'' mechanism. Such an attribute\nmarkedly elevates the operational efficiency of the model, setting it apart\nfrom the conventional Diffusion Model. 
Throughout the training phase,\nConsistencyDet initiates the diffusion sequence with noise-infused boxes\nderived from the ground-truth annotations and conditions the model to perform\nthe denoising task. Subsequently, in the inference stage, the model employs a\ndenoising sampling strategy that commences with bounding boxes randomly sampled\nfrom a normal distribution. Through iterative refinement, the model transforms\nan assortment of arbitrarily generated boxes into the definitive detections.\nComprehensive evaluations employing standard benchmarks, such as MS-COCO and\nLVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in\nperformance metrics.\n","authors":["Lifan Jiang","Zhihui Wang","Changmiao Wang","Ming Li","Jiaxu Leng","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07770v1","updated":"2024-04-11T14:07:16Z","published":"2024-04-11T14:07:16Z","title":"Joint Conditional Diffusion Model for Image Restoration with Mixed\n Degradations","summary":" Image restoration is rather challenging in adverse weather conditions,\nespecially when multiple degradations occur simultaneously. Blind image\ndecomposition was proposed to tackle this issue, however, its effectiveness\nheavily relies on the accurate estimation of each component. Although\ndiffusion-based models exhibit strong generative abilities in image restoration\ntasks, they may generate irrelevant contents when the degraded images are\nseverely corrupted. To address these issues, we leverage physical constraints\nto guide the whole restoration process, where a mixed degradation model based\non atmosphere scattering model is constructed. Then we formulate our Joint\nConditional Diffusion Model (JCDM) by incorporating the degraded image and\ndegradation mask to provide precise guidance. To achieve better color and\ndetail recovery results, we further integrate a refinement network to\nreconstruct the restored image, where Uncertainty Estimation Block (UEB) is\nemployed to enhance the features. Extensive experiments performed on both\nmulti-weather and weather-specific datasets demonstrate the superiority of our\nmethod over state-of-the-art competing methods.\n","authors":["Yufeng Yue","Meng Yu","Luojie Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07766v1","updated":"2024-04-11T14:05:37Z","published":"2024-04-11T14:05:37Z","title":"RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric\n Stereo Network","summary":" Predicting accurate normal maps of objects from two-dimensional images in\nregions of complex structure and spatial material variations is challenging\nusing photometric stereo methods due to the influence of surface reflection\nproperties caused by variations in object geometry and surface materials. To\naddress this issue, we propose a photometric stereo network called a RMAFF-PSN\nthat uses residual multiscale attentional feature fusion to handle the\n``difficult'' regions of the object. Unlike previous approaches that only use\nstacked convolutional layers to extract deep features from the input image, our\nmethod integrates feature information from different resolution stages and\nscales of the image. This approach preserves more physical information, such as\ntexture and geometry of the object in complex regions, through shallow-deep\nstage feature extraction, double branching enhancement, and attention\noptimization. 
To test the network structure under real-world conditions, we\npropose a new real dataset called Simple PS data, which contains multiple\nobjects with varying structures and materials. Experimental results on a\npublicly available benchmark dataset demonstrate that our method outperforms\nmost existing calibrated photometric stereo methods for the same number of\ninput images, especially in the case of highly non-convex object structures.\nOur method also obtains good results under sparse lighting conditions.\n","authors":["Kai Luo","Yakun Ju","Lin Qi","Kaixuan Wang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.07766v1.pdf","comment":"17 pages,12 figures"},{"id":"http://arxiv.org/abs/2404.07762v1","updated":"2024-04-11T14:03:16Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07754v1","updated":"2024-04-11T14:00:20Z","published":"2024-04-11T14:00:20Z","title":"Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image\n Models -- Technical Challenges and Implications for Monitoring and\n Verification","summary":" Novel deep-learning (DL) architectures have reached a level where they can\ngenerate digital media, including photorealistic images, that are difficult to\ndistinguish from real data. These technologies have already been used to\ngenerate training data for Machine Learning (ML) models, and large\ntext-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving\nremarkable results in realistic high-resolution image generation. Given these\ndevelopments, issues of data authentication in monitoring and verification\ndeserve a careful and systematic analysis: How realistic are synthetic images?\nHow easily can they be generated? How useful are they for ML researchers, and\nwhat is their potential for Open Science? In this work, we use novel DL models\nto explore how synthetic satellite images can be created using conditioning\nmechanisms. 
We investigate the challenges of synthetic satellite image\ngeneration and evaluate the results based on authenticity and state-of-the-art\nmetrics. Furthermore, we investigate how synthetic data can alleviate the lack\nof data in the context of ML methods for remote-sensing. Finally we discuss\nimplications of synthetic satellite imagery in the context of monitoring and\nverification.\n","authors":["Tuong Vy Nguyen","Alexander Glaser","Felix Biessmann"],"pdf_url":"https://arxiv.org/pdf/2404.07754v1.pdf","comment":"https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models"},{"id":"http://arxiv.org/abs/2404.07748v1","updated":"2024-04-11T13:46:05Z","published":"2024-04-11T13:46:05Z","title":"3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing\n Surfaces","summary":" The surface quality inspection of manufacturing parts based on 3D point cloud\ndata has attracted increasing attention in recent years. The reason is that the\n3D point cloud can capture the entire surface of manufacturing parts, unlike\nthe previous practices that focus on some key product characteristics. However,\nachieving accurate 3D anomaly detection is challenging, due to the complex\nsurfaces of manufacturing parts and the difficulty of collecting sufficient\nanomaly samples. To address these challenges, we propose a novel untrained\nanomaly detection method based on 3D point cloud data for complex manufacturing\nparts, which can achieve accurate anomaly detection in a single sample without\ntraining data. In the proposed framework, we transform an input sample into two\nsets of profiles along different directions. Based on one set of the profiles,\na novel segmentation module is devised to segment the complex surface into\nmultiple basic and simple components. In each component, another set of\nprofiles, which have the nature of similar shapes, can be modeled as a low-rank\nmatrix. Thus, accurate 3D anomaly detection can be achieved by using Robust\nPrincipal Component Analysis (RPCA) on these low-rank matrices. Extensive\nnumerical experiments on different types of parts show that our method achieves\npromising results compared with the benchmark methods.\n","authors":["Xuanming Cao","Chengyu Tao","Juan Du"],"pdf_url":"https://arxiv.org/pdf/2404.07748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05539v2","updated":"2024-04-11T13:39:18Z","published":"2023-11-09T17:34:57Z","title":"A Deep Learning Method for Simultaneous Denoising and Missing Wedge\n Reconstruction in Cryogenic Electron Tomography","summary":" Cryogenic electron tomography is a technique for imaging biological samples\nin 3D. A microscope collects a series of 2D projections of the sample, and the\ngoal is to reconstruct the 3D density of the sample called the tomogram.\nReconstruction is difficult as the 2D projections are noisy and can not be\nrecorded from all directions, resulting in a missing wedge of information.\nTomograms conventionally reconstructed with filtered back-projection suffer\nfrom noise and strong artifacts due to the missing wedge. Here, we propose a\ndeep-learning approach for simultaneous denoising and missing wedge\nreconstruction called DeepDeWedge. The algorithm requires no ground truth data\nand is based on fitting a neural network to the 2D projections using a\nself-supervised loss. 
DeepDeWedge performs better than CryoCARE and IsoNet,\nwhich are state-of-the-art methods for denoising and missing wedge\nreconstruction, and similarly and, in some cases, better than the combination\nof the two methods. At the same time, DeepDeWedge is simpler than this two-step\napproach, as it does denoising and missing wedge reconstruction simultaneously\nrather than sequentially.\n","authors":["Simon Wiedemann","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2311.05539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07739v1","updated":"2024-04-11T13:37:51Z","published":"2024-04-11T13:37:51Z","title":"Exploiting Object-based and Segmentation-based Semantic Features for\n Deep Learning-based Indoor Scene Classification","summary":" Indoor scenes are usually characterized by scattered objects and their\nrelationships, which turns the indoor scene classification task into a\nchallenging computer vision task. Despite the significant performance boost in\nclassification tasks achieved in recent years, provided by the use of\ndeep-learning-based methods, limitations such as inter-category ambiguity and\nintra-category variation have been holding back their performance. To overcome\nsuch issues, gathering semantic information has been shown to be a promising\nsource of information towards a more complete and discriminative feature\nrepresentation of indoor scenes. Therefore, the work described in this paper\nuses both semantic information, obtained from object detection, and semantic\nsegmentation techniques. While object detection techniques provide the 2D\nlocation of objects allowing to obtain spatial distributions between objects,\nsemantic segmentation techniques provide pixel-level information that allows to\nobtain, at a pixel-level, a spatial distribution and shape-related features of\nthe segmentation categories. Hence, a novel approach that uses a semantic\nsegmentation mask to provide Hu-moments-based segmentation categories' shape\ncharacterization, designated by Segmentation-based Hu-Moments Features (SHMFs),\nis proposed. Moreover, a three-main-branch network, designated by\nGOS$^2$F$^2$App, that exploits deep-learning-based global features,\nobject-based features, and semantic segmentation-based features is also\nproposed. GOS$^2$F$^2$App was evaluated in two indoor scene benchmark datasets:\nSUN RGB-D and NYU Depth V2, where, to the best of our knowledge,\nstate-of-the-art results were achieved on both datasets, which present\nevidences of the effectiveness of the proposed approach.\n","authors":["Ricardo Pereira","Luís Garrote","Tiago Barros","Ana Lopes","Urbano J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2404.07739v1.pdf","comment":"This preprint was submitted at IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2404.05392v2","updated":"2024-04-11T13:36:58Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. 
It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets. Code is available at\nhttps://github.com/arturxe2/T-DEED.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07729v1","updated":"2024-04-11T13:19:46Z","published":"2024-04-11T13:19:46Z","title":"Realistic Continual Learning Approach using Pre-trained Models","summary":" Continual learning (CL) is crucial for evaluating adaptability in learning\nsolutions to retain knowledge. Our research addresses the challenge of\ncatastrophic forgetting, where models lose proficiency in previously learned\ntasks as they acquire new ones. While numerous solutions have been proposed,\nexisting experimental setups often rely on idealized class-incremental learning\nscenarios. We introduce Realistic Continual Learning (RealCL), a novel CL\nparadigm where class distributions across tasks are random, departing from\nstructured setups.\n We also present CLARE (Continual Learning Approach with pRE-trained models\nfor RealCL scenarios), a pre-trained model-based solution designed to integrate\nnew knowledge while preserving past learning. Our contributions include\npioneering RealCL as a generalization of traditional CL setups, proposing CLARE\nas an adaptable approach for RealCL tasks, and conducting extensive experiments\ndemonstrating its effectiveness across various RealCL scenarios. Notably, CLARE\noutperforms existing models on RealCL benchmarks, highlighting its versatility\nand robustness in unpredictable learning environments.\n","authors":["Nadia Nasri","Carlos Gutiérrez-Álvarez","Sergio Lafuente-Arroyo","Saturnino Maldonado-Bascón","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2404.07729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07724v1","updated":"2024-04-11T13:16:47Z","published":"2024-04-11T13:16:47Z","title":"Applying Guidance in a Limited Interval Improves Sample and Distribution\n Quality in Diffusion Models","summary":" Guidance is a crucial technique for extracting the best performance out of\nimage-generating diffusion models. Traditionally, a constant guidance weight\nhas been applied throughout the sampling chain of an image. We show that\nguidance is clearly harmful toward the beginning of the chain (high noise\nlevels), largely unnecessary toward the end (low noise levels), and only\nbeneficial in the middle. We thus restrict it to a specific range of noise\nlevels, improving both the inference speed and result quality. This limited\nguidance interval improves the record FID in ImageNet-512 significantly, from\n1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial\nacross different sampler parameters, network architectures, and datasets,\nincluding the large-scale setting of Stable Diffusion XL. 
We thus suggest\nexposing the guidance interval as a hyperparameter in all diffusion models that\nuse guidance.\n","authors":["Tuomas Kynkäänniemi","Miika Aittala","Tero Karras","Samuli Laine","Timo Aila","Jaakko Lehtinen"],"pdf_url":"https://arxiv.org/pdf/2404.07724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03778v2","updated":"2024-04-11T13:12:48Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincar{é} Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16783v2","updated":"2024-04-11T13:07:43Z","published":"2023-03-29T15:19:01Z","title":"Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising\n in Real-World Scenarios","summary":" Self-supervised denoising has attracted widespread attention due to its\nability to train without clean images. However, noise in real-world scenarios\nis often spatially correlated, which causes many self-supervised algorithms\nthat assume pixel-wise independent noise to perform poorly. Recent works have\nattempted to break noise correlation with downsampling or neighborhood masking.\nHowever, denoising on downsampled subgraphs can lead to aliasing effects and\nloss of details due to a lower sampling rate. Furthermore, the neighborhood\nmasking methods either come with high computational complexity or do not\nconsider local spatial preservation during inference. Through the analysis of\nexisting methods, we point out that the key to obtaining high-quality and\ntexture-rich results in real-world self-supervised denoising tasks is to train\nat the original input resolution structure and use asymmetric operations during\ntraining and inference. 
Based on this, we propose Asymmetric Tunable Blind-Spot\nNetwork (AT-BSN), where the blind-spot size can be freely adjusted, thus better\nbalancing noise correlation suppression and image local spatial destruction\nduring training and inference. In addition, we regard the pre-trained AT-BSN as\na meta-teacher network capable of generating various teacher networks by\nsampling different blind-spots. We propose a blind-spot based multi-teacher\ndistillation strategy to distill a lightweight network, significantly improving\nperformance. Experimental results on multiple datasets prove that our method\nachieves state-of-the-art, and is superior to other self-supervised algorithms\nin terms of computational overhead and visual effects.\n","authors":["Shiyan Chen","Jiyuan Zhang","Zhaofei Yu","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2303.16783v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03936v2","updated":"2024-04-11T13:02:58Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS) provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.07713v1","updated":"2024-04-11T12:59:38Z","published":"2024-04-11T12:59:38Z","title":"Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) recognizes the unseen classes by conducting\nvisual-semantic interactions to transfer semantic knowledge from seen classes\nto unseen ones, supported by semantic information (e.g., attributes). 
However,\nexisting ZSL methods simply extract visual features using a pre-trained network\nbackbone (i.e., CNN or ViT), which fail to learn matched visual-semantic\ncorrespondences for representing semantic-related visual features as lacking of\nthe guidance of semantic information, resulting in undesirable visual-semantic\ninteractions. To tackle this issue, we propose a progressive semantic-guided\nvision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly\nconsiders two properties in the whole network: i) discover the semantic-related\nvisual representations explicitly, and ii) discard the semantic-unrelated\nvisual information. Specifically, we first introduce semantic-embedded token\nlearning to improve the visual-semantic correspondences via semantic\nenhancement and discover the semantic-related visual tokens explicitly with\nsemantic-guided token attention. Then, we fuse low semantic-visual\ncorrespondence visual tokens to discard the semantic-unrelated visual\ninformation for visual enhancement. These two operations are integrated into\nvarious encoders to progressively learn semantic-related visual representations\nfor accurate visual-semantic interactions in ZSL. The extensive experiments\nshow that our ZSLViT achieves significant performance gains on three popular\nbenchmark datasets, i.e., CUB, SUN, and AWA2.\n","authors":["Shiming Chen","Wenjin Hou","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.07713v1.pdf","comment":"Accepted to CVPR'24"},{"id":"http://arxiv.org/abs/2404.07711v1","updated":"2024-04-11T12:58:12Z","published":"2024-04-11T12:58:12Z","title":"OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic\n Segmentation of Underground Utilities","summary":" Identifying and classifying underground utilities is an important task for\nefficient and effective urban planning and infrastructure maintenance. We\npresent OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point\ncloud dataset, designed to advance research and development in underground\nutility surveying and mapping. OpenTrench3D covers a completely novel domain\nfor public 3D point cloud datasets and is unique in its focus, scope, and\ncost-effective capturing method. The dataset consists of 310 point clouds\ncollected across 7 distinct areas. These include 5 water utility areas and 2\ndistrict heating utility areas. The inclusion of different geographical areas\nand main utilities (water and district heating utilities) makes OpenTrench3D\nparticularly valuable for inter-domain transfer learning experiments. We\nprovide benchmark results for the dataset using three state-of-the-art semantic\nsegmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are\nconducted by training on data from water areas, fine-tuning on district heating\narea 1 and evaluating on district heating area 2. The dataset is publicly\navailable. With OpenTrench3D, we seek to foster innovation and progress in the\nfield of 3D semantic segmentation in applications related to detection and\ndocumentation of underground utilities as well as in transfer learning methods\nin general.\n","authors":["Lasse H. Hansen","Simon B. Jensen","Mark P. Philipsen","Andreas Møgelmose","Lars Bodum","Thomas B. 
Moeslund"],"pdf_url":"https://arxiv.org/pdf/2404.07711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07705v1","updated":"2024-04-11T12:49:56Z","published":"2024-04-11T12:49:56Z","title":"ViM-UNet: Vision Mamba for Biomedical Segmentation","summary":" CNNs, most notably the UNet, are the default architecture for biomedical\nsegmentation. Transformer-based approaches, such as UNETR, have been proposed\nto replace them, benefiting from a global field of view, but suffering from\nlarger runtimes and higher parameter counts. The recent Vision Mamba\narchitecture offers a compelling alternative to transformers, also providing a\nglobal field of view, but at higher efficiency. Here, we introduce ViM-UNet, a\nnovel segmentation architecture based on it and compare it to UNet and UNETR\nfor two challenging microscopy instance segmentation tasks. We find that it\nperforms similarly or better than UNet, depending on the task, and outperforms\nUNETR while being more efficient. Our code is open source and documented at\nhttps://github.com/constantinpape/torch-em/blob/main/vimunet.md.\n","authors":["Anwai Archit","Constantin Pape"],"pdf_url":"https://arxiv.org/pdf/2404.07705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07698v1","updated":"2024-04-11T12:44:15Z","published":"2024-04-11T12:44:15Z","title":"Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents\n Probability Estimator","summary":" The widespread usage of point clouds (PC) for immersive visual applications\nhas resulted in the use of very heterogeneous receiving conditions and devices,\nnotably in terms of network, hardware, and display capabilities. In this\nscenario, quality scalability, i.e., the ability to reconstruct a signal at\ndifferent qualities by progressively decoding a single bitstream, is a major\nrequirement that has yet to be conveniently addressed, notably in most\nlearning-based PC coding solutions. This paper proposes a quality scalability\nscheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based\nstatic point cloud geometry codecs, which uses a Quality-conditioned Latents\nProbability Estimator (QuLPE) to decode a high-quality version of a PC\nlearning-based representation, based on an available lower quality base layer.\nSQH is integrated in the future JPEG PC coding standard, allowing to create a\nlayered bitstream that can be used to progressively decode the PC geometry with\nincreasing quality and fidelity. Experimental results show that SQH offers the\nquality scalability feature with very limited or no compression performance\npenalty at all when compared with the corresponding non-scalable solution, thus\npreserving the significant compression gains over other state-of-the-art PC\ncodecs.\n","authors":["Daniele Mari","André F. R. Guarda","Nuno M. M. Rodrigues","Simone Milani","Fernando Pereira"],"pdf_url":"https://arxiv.org/pdf/2404.07698v1.pdf","comment":"Submitted at ICIP 2024"},{"id":"http://arxiv.org/abs/2404.07696v1","updated":"2024-04-11T12:42:18Z","published":"2024-04-11T12:42:18Z","title":"Flatness Improves Backbone Generalisation in Few-shot Classification","summary":" Deployment of deep neural networks in real-world settings typically requires\nadaptation to new tasks with few examples. Few-shot classification (FSC)\nprovides a solution to this problem by leveraging pre-trained backbones for\nfast adaptation to new classes. 
Surprisingly, most efforts have only focused on\ndeveloping architectures for easing the adaptation to the target domain without\nconsidering the importance of backbone training for good generalisation. We\nshow that flatness-aware backbone training with vanilla fine-tuning results in\na simpler yet competitive baseline compared to the state-of-the-art. Our\nresults indicate that for in- and cross-domain FSC, backbone training is\ncrucial to achieving good generalisation across different adaptation methods.\nWe advocate more care should be taken when training these models.\n","authors":["Rui Li","Martin Trapp","Marcus Klasson","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2404.07696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07687v1","updated":"2024-04-11T12:26:10Z","published":"2024-04-11T12:26:10Z","title":"Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement\n through Brain-Inspired Skin Tracking","summary":" Heart rate is an important physiological indicator of human health status.\nExisting remote heart rate measurement methods typically involve facial\ndetection followed by signal extraction from the region of interest (ROI).\nThese SOTA methods have three serious problems: (a) inaccuracies even failures\nin detection caused by environmental influences or subject movement; (b)\nfailures for special patients such as infants and burn victims; (c) privacy\nleakage issues resulting from collecting face video. To address these issues,\nwe regard the remote heart rate measurement as the process of analyzing the\nspatiotemporal characteristics of the optical flow signal in the video. We\napply chaos theory to computer vision tasks for the first time, thus designing\na brain-inspired framework. Firstly, using an artificial primary visual cortex\nmodel to extract the skin in the videos, and then calculate heart rate by\ntime-frequency analysis on all pixels. Our method achieves Robust Skin Tracking\nfor Heart Rate measurement, called HR-RST. The experimental results show that\nHR-RST overcomes the difficulty of environmental influences and effectively\ntracks the subject movement. Moreover, the method could extend to other body\nparts. Consequently, the method can be applied to special patients and\neffectively protect individual privacy, offering an innovative solution.\n","authors":["Jie Wang","Jing Lian","Minjie Ma","Junqiang Lei","Chunbiao Li","Bin Li","Jizhao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.07687v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07686v1","updated":"2024-04-11T12:25:54Z","published":"2024-04-11T12:25:54Z","title":"Depth Estimation using Weighted-loss and Transfer Learning","summary":" Depth estimation from 2D images is a common computer vision task that has\napplications in many fields including autonomous vehicles, scene understanding\nand robotics. The accuracy of a supervised depth estimation method mainly\nrelies on the chosen loss function, the model architecture, quality of data and\nperformance metrics. In this study, we propose a simplified and adaptable\napproach to improve depth estimation accuracy using transfer learning and an\noptimized loss function. The optimized loss function is a combination of\nweighted losses to which enhance robustness and generalization: Mean Absolute\nError (MAE), Edge Loss and Structural Similarity Index (SSIM). We use a grid\nsearch and a random search method to find optimized weights for the losses,\nwhich leads to an improved model. 
We explore multiple encoder-decoder-based\nmodels including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for\nthe supervised depth estimation model on NYU Depth Dataset v2. We observe that\nthe EfficientNet model, pre-trained on ImageNet for classification when used as\nan encoder, with a simple upsampling decoder, gives the best results in terms\nof RSME, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a\nqualitative analysis which illustrates that our model produces depth maps that\nclosely resemble ground truth, even in cases where the ground truth is flawed.\nThe results indicate significant improvements in accuracy and robustness, with\nEfficientNet being the most successful architecture.\n","authors":["Muhammad Adeel Hafeez","Michael G. Madden","Ganesh Sistu","Ihsan Ullah"],"pdf_url":"https://arxiv.org/pdf/2404.07686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09657v2","updated":"2024-04-11T12:25:45Z","published":"2022-08-20T10:59:33Z","title":"Is Medieval Distant Viewing Possible? : Extending and Enriching\n Annotation of Legacy Image Collections using Visual Analytics","summary":" Distant viewing approaches have typically used image datasets close to the\ncontemporary image data used to train machine learning models. To work with\nimages from other historical periods requires expert annotated data, and the\nquality of labels is crucial for the quality of results. Especially when\nworking with cultural heritage collections that contain myriad uncertainties,\nannotating data, or re-annotating, legacy data is an arduous task. In this\npaper, we describe working with two pre-annotated sets of medieval manuscript\nimages that exhibit conflicting and overlapping metadata. Since a manual\nreconciliation of the two legacy ontologies would be very expensive, we aim (1)\nto create a more uniform set of descriptive labels to serve as a \"bridge\" in\nthe combined dataset, and (2) to establish a high quality hierarchical\nclassification that can be used as a valuable input for subsequent supervised\nmachine learning. To achieve these goals, we developed visualization and\ninteraction mechanisms, enabling medievalists to combine, regularize and extend\nthe vocabulary used to describe these, and other cognate, image datasets. The\nvisual interfaces provide experts an overview of relationships in the data\ngoing beyond the sum total of the metadata. Word and image embeddings as well\nas co-occurrences of labels across the datasets, enable batch re-annotation of\nimages, recommendation of label candidates and support composing a hierarchical\nclassification of labels.\n","authors":["Christofer Meinecke","Estelle Guéville","David Joseph Wrisley","Stefan Jänicke"],"pdf_url":"https://arxiv.org/pdf/2208.09657v2.pdf","comment":"Revision after DSH Peer Review. 
Paper is now accepted at DSH"},{"id":"http://arxiv.org/abs/2404.07685v1","updated":"2024-04-11T12:24:47Z","published":"2024-04-11T12:24:47Z","title":"Run-time Monitoring of 3D Object Detection in Automated Driving Systems\n Using Early Layer Neural Activation Patterns","summary":" Monitoring the integrity of object detection for errors within the perception\nmodule of automated driving systems (ADS) is paramount for ensuring safety.\nDespite recent advancements in deep neural network (DNN)-based object\ndetectors, their susceptibility to detection errors, particularly in the\nless-explored realm of 3D object detection, remains a significant concern.\nState-of-the-art integrity monitoring (also known as introspection) mechanisms\nin 2D object detection mainly utilise the activation patterns in the final\nlayer of the DNN-based detector's backbone. However, that may not sufficiently\naddress the complexities and sparsity of data in 3D object detection. To this\nend, we conduct, in this article, an extensive investigation into the effects\nof activation patterns extracted from various layers of the backbone network\nfor introspecting the operation of 3D object detectors. Through a comparative\nanalysis using Kitti and NuScenes datasets with PointPillars and CenterPoint\ndetectors, we demonstrate that using earlier layers' activation patterns\nenhances the error detection performance of the integrity monitoring system,\nyet increases computational complexity. To address the real-time operation\nrequirements in ADS, we also introduce a novel introspection method that\ncombines activation patterns from multiple layers of the detector's backbone\nand report its performance.\n","authors":["Hakan Yekta Yatbaz","Mehrdad Dianati","Konstantinos Koufos","Roger Woodman"],"pdf_url":"https://arxiv.org/pdf/2404.07685v1.pdf","comment":"Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains\n (SAIAD)"},{"id":"http://arxiv.org/abs/2404.07676v1","updated":"2024-04-11T12:14:48Z","published":"2024-04-11T12:14:48Z","title":"Model-based Cleaning of the QUILT-1M Pathology Dataset for\n Text-Conditional Image Synthesis","summary":" The QUILT-1M dataset is the first openly available dataset containing images\nharvested from various online sources. While it provides a huge data variety,\nthe image quality and composition is highly heterogeneous, impacting its\nutility for text-conditional image synthesis. We propose an automatic pipeline\nthat provides predictions of the most common impurities within the images,\ne.g., visibility of narrators, desktop environment and pathology software, or\ntext within the image. Additionally, we propose to use semantic alignment\nfiltering of the image-text pairs. Our findings demonstrate that by rigorously\nfiltering the dataset, there is a substantial enhancement of image fidelity in\ntext-to-image tasks.\n","authors":["Marc Aubreville","Jonathan Ganz","Jonas Ammeling","Christopher C. Kaltenecker","Christof A. Bertram"],"pdf_url":"https://arxiv.org/pdf/2404.07676v1.pdf","comment":"4 pages (short paper)"},{"id":"http://arxiv.org/abs/2402.13255v2","updated":"2024-04-11T12:13:27Z","published":"2024-02-20T18:59:57Z","title":"How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey","summary":" Over the past two decades, research in the field of Simultaneous Localization\nand Mapping (SLAM) has undergone a significant evolution, highlighting its\ncritical role in enabling autonomous exploration of unknown environments. 
This\nevolution ranges from hand-crafted methods, through the era of deep learning,\nto more recent developments focused on Neural Radiance Fields (NeRFs) and 3D\nGaussian Splatting (3DGS) representations. Recognizing the growing body of\nresearch and the absence of a comprehensive survey on the topic, this paper\naims to provide the first comprehensive overview of SLAM progress through the\nlens of the latest advancements in radiance fields. It sheds light on the\nbackground, evolutionary path, inherent strengths and limitations, and serves\nas a fundamental reference to highlight the dynamic progress and specific\nchallenges.\n","authors":["Fabio Tosi","Youmin Zhang","Ziren Gong","Erik Sandström","Stefano Mattoccia","Martin R. Oswald","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2402.13255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07671v1","updated":"2024-04-11T12:06:50Z","published":"2024-04-11T12:06:50Z","title":"Deep learning-driven pulmonary arteries and veins segmentation reveals\n demography-associated pulmonary vasculature anatomy","summary":" Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary\ndiseases and surgical planning, and is traditionally achieved by Computed\nTomography Pulmonary Angiography (CTPA). However, concerns regarding adverse\nhealth effects from contrast agents used in CTPA have constrained its clinical\nutility. In contrast, identifying arteries and veins using non-contrast CT, a\nconventional and low-cost clinical examination routine, has long been\nconsidered impossible. Here we propose a High-abundant Pulmonary Artery-vein\nSegmentation (HiPaS) framework achieving accurate artery-vein segmentation on\nboth non-contrast CT and CTPA across various spatial resolutions. HiPaS first\nperforms spatial normalization on raw CT scans via a super-resolution module,\nand then iteratively achieves segmentation results at different branch levels\nby utilizing the low-level vessel segmentation as a prior for high-level vessel\nsegmentation. We trained and validated HiPaS on our established multi-centric\ndataset comprising 1,073 CT volumes with meticulous manual annotation. Both\nquantitative experiments and clinical evaluation demonstrated the superior\nperformance of HiPaS, achieving a dice score of 91.8% and a sensitivity of\n98.0%. Further experiments demonstrated the non-inferiority of HiPaS\nsegmentation on non-contrast CT compared to segmentation on CTPA. Employing\nHiPaS, we have conducted an anatomical study of pulmonary vasculature on 10,613\nparticipants in China (five sites), discovering a new association between\npulmonary vessel abundance and sex and age: vessel abundance is significantly\nhigher in females than in males, and slightly decreases with age, under the\ncontrolling of lung volumes (p < 0.0001). 
HiPaS realizing accurate artery-vein\nsegmentation delineates a promising avenue for clinical diagnosis and\nunderstanding pulmonary physiology in a non-invasive manner.\n","authors":["Yuetan Chu","Gongning Luo","Longxi Zhou","Shaodong Cao","Guolin Ma","Xianglin Meng","Juexiao Zhou","Changchun Yang","Dexuan Xie","Ricardo Henao","Xigang Xiao","Lianming Wu","Zhaowen Qiu","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11111v2","updated":"2024-04-11T12:01:34Z","published":"2024-03-17T06:31:16Z","title":"3D Human Reconstruction in the Wild with Synthetic Data Using Generative\n Models","summary":" In this work, we show that synthetic data created by generative models is\ncomplementary to computer graphics (CG) rendered data for achieving remarkable\ngeneralization performance on diverse real-world scenes for 3D human pose and\nshape estimation (HPS). Specifically, we propose an effective approach based on\nrecent diffusion models, termed HumanWild, which can effortlessly generate\nhuman images and corresponding 3D mesh annotations. We first collect a\nlarge-scale human-centric dataset with comprehensive annotations, e.g., text\ncaptions and surface normal images. Then, we train a customized ControlNet\nmodel upon this dataset to generate diverse human images and initial\nground-truth labels. At the core of this step is that we can easily obtain\nnumerous surface normal images from a 3D human parametric model, e.g., SMPL-X,\nby rendering the 3D mesh onto the image plane. As there exists inevitable noise\nin the initial labels, we then apply an off-the-shelf foundation segmentation\nmodel, i.e., SAM, to filter negative data samples. Our data generation pipeline\nis flexible and customizable to facilitate different real-world tasks, e.g.,\nego-centric scenes and perspective-distortion scenes. The generated dataset\ncomprises 0.79M images with corresponding 3D annotations, covering versatile\nviewpoints, scenes, and human identities. We train various HPS regressors on\ntop of the generated data and evaluate them on a wide range of benchmarks\n(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the\ngenerated data. By exclusively employing generative models, we generate\nlarge-scale in-the-wild human images and high-quality annotations, eliminating\nthe need for real-world data collection.\n","authors":["Yongtao Ge","Wenjia Wang","Yongfan Chen","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11111v2.pdf","comment":"project page: https://yongtaoge.github.io/projects/humanwild"},{"id":"http://arxiv.org/abs/2404.07668v1","updated":"2024-04-11T12:00:13Z","published":"2024-04-11T12:00:13Z","title":"Shape Completion in the Dark: Completing Vertebrae Morphology from 3D\n Ultrasound","summary":" Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free\nnature, is challenging to interpret due to only partially visible organs and a\nlack of complete 3D information. While performing US-based diagnosis or\ninvestigation, medical professionals therefore create a mental map of the 3D\nanatomy. In this work, we aim to replicate this process and enhance the visual\nrepresentation of anatomical structures.\n Methods: We introduce a point-cloud-based probabilistic DL method to complete\noccluded anatomical structures through 3D shape completion and choose US-based\nspine examinations as our application. 
To enable training, we generate\nsynthetic 3D representations of partially occluded spinal views by mimicking US\nphysics and accounting for inherent artifacts.\n Results: The proposed model performs consistently on synthetic and patient\ndata, with mean and median differences of 2.02 and 0.03 in CD, respectively.\nOur ablation study demonstrates the importance of US physics-based data\ngeneration, reflected in the large mean and median difference of 11.8 CD and\n9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks,\nsuch as the spinous process (with reconstruction CD of 4.73) and the facet\njoints (mean distance to GT of 4.96mm) are preserved in the 3D completion.\n Conclusion: Our work establishes the feasibility of 3D shape completion for\nlumbar vertebrae, ensuring the preservation of level-wise characteristics and\nsuccessful generalization from synthetic to real data. The incorporation of US\nphysics contributes to more accurate patient data completions. Notably, our\nmethod preserves essential anatomic landmarks and reconstructs crucial\ninjections sites at their correct locations. The generated data and source code\nwill be made publicly available\n(https://github.com/miruna20/Shape-Completion-in-the-Dark).\n","authors":["Miruna-Alexandra Gafencu","Yordanka Velikova","Mahdi Saleh","Tamas Ungi","Nassir Navab","Thomas Wendler","Mohammad Farid Azampour"],"pdf_url":"https://arxiv.org/pdf/2404.07668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07667v1","updated":"2024-04-11T12:00:06Z","published":"2024-04-11T12:00:06Z","title":"Dealing with Subject Similarity in Differential Morphing Attack\n Detection","summary":" The advent of morphing attacks has posed significant security concerns for\nautomated Face Recognition systems, raising the pressing need for robust and\neffective Morphing Attack Detection (MAD) methods able to effectively address\nthis issue. In this paper, we focus on Differential MAD (D-MAD), where a\ntrusted live capture, usually representing the criminal, is compared with the\ndocument image to classify it as morphed or bona fide. We show these approaches\nbased on identity features are effective when the morphed image and the live\none are sufficiently diverse; unfortunately, the effectiveness is significantly\nreduced when the same approaches are applied to look-alike subjects or in all\nthose cases when the similarity between the two compared images is high (e.g.\ncomparison between the morphed image and the accomplice). Therefore, in this\npaper, we propose ACIdA, a modular D-MAD system, consisting of a module for the\nattempt type classification, and two modules for the identity and artifacts\nanalysis on input images. Successfully addressing this task would allow\nbroadening the D-MAD applications including, for instance, the document\nenrollment stage, which currently relies entirely on human evaluation, thus\nlimiting the possibility of releasing ID documents with manipulated images, as\nwell as the automated gates to detect both accomplices and criminals. 
An\nextensive cross-dataset experimental evaluation conducted on the introduced\nscenario shows that ACIdA achieves state-of-the-art results, outperforming\nliterature competitors, while maintaining good performance in traditional D-MAD\nbenchmarks.\n","authors":["Nicolò Di Domenico","Guido Borghi","Annalisa Franco","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.07667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07664v1","updated":"2024-04-11T11:55:42Z","published":"2024-04-11T11:55:42Z","title":"Finding Dino: A plug-and-play framework for unsupervised detection of\n out-of-distribution objects using prototypes","summary":" Detecting and localising unknown or Out-of-distribution (OOD) objects in any\nscene can be a challenging task in vision. Particularly, in safety-critical\ncases involving autonomous systems like automated vehicles or trains.\nSupervised anomaly segmentation or open-world object detection models depend on\ntraining on exhaustively annotated datasets for every domain and still struggle\nin distinguishing between background and OOD objects. In this work, we present\na plug-and-play generalised framework - PRototype-based zero-shot OOD detection\nWithout Labels (PROWL). It is an inference-based method that does not require\ntraining on the domain dataset and relies on extracting relevant features from\nself-supervised pre-trained models. PROWL can be easily adapted to detect OOD\nobjects in any operational design domain by specifying a list of known classes\nfrom this domain. PROWL, as an unsupervised method, outperforms other\nsupervised methods trained without auxiliary OOD data on the RoadAnomaly and\nRoadObstacle datasets provided in SegmentMeIfYouCan (SMIYC) benchmark. We also\ndemonstrate its suitability for other domains such as rail and maritime scenes.\n","authors":["Poulami Sinhamahapatra","Franziska Schwaiger","Shirsha Bose","Huiyu Wang","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.07664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03122v2","updated":"2024-04-11T11:42:13Z","published":"2024-03-05T17:07:29Z","title":"NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose\n Priors","summary":" Faithfully modeling the space of articulations is a crucial task that allows\nrecovery and generation of realistic poses, and remains a notorious challenge.\nTo this end, we introduce Neural Riemannian Distance Fields (NRDFs),\ndata-driven priors modeling the space of plausible articulations, represented\nas the zero-level-set of a neural field in a high-dimensional\nproduct-quaternion space. To train NRDFs only on positive examples, we\nintroduce a new sampling algorithm, ensuring that the geodesic distances follow\na desired distribution, yielding a principled distance field learning paradigm.\nWe then devise a projection algorithm to map any random pose onto the level-set\nby an adaptive-step Riemannian optimizer, adhering to the product manifold of\njoint rotations at all times. NRDFs can compute the Riemannian gradient via\nbackpropagation and by mathematical analogy, are related to Riemannian flow\nmatching, a recent generative model. We conduct a comprehensive evaluation of\nNRDF against other pose priors in various downstream tasks, i.e., pose\ngeneration, image-based pose estimation, and solving inverse kinematics,\nhighlighting NRDF's superior performance. 
Besides humans, NRDF's versatility\nextends to hand and animal poses, as it can effectively represent any\narticulation.\n","authors":["Yannan He","Garvita Tiwari","Tolga Birdal","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2403.03122v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://virtualhumans.mpi-inf.mpg.de/nrdf"},{"id":"http://arxiv.org/abs/2404.07649v1","updated":"2024-04-11T11:12:06Z","published":"2024-04-11T11:12:06Z","title":"Separated Attention: An Improved Cycle GAN Based Under Water Image\n Enhancement Method","summary":" In this paper we have present an improved Cycle GAN based model for under\nwater image enhancement. We have utilized the cycle consistent learning\ntechnique of the state-of-the-art Cycle GAN model with modification in the loss\nfunction in terms of depth-oriented attention which enhance the contrast of the\noverall image, keeping global content, color, local texture, and style\ninformation intact. We trained the Cycle GAN model with the modified loss\nfunctions on the benchmarked Enhancing Underwater Visual Perception (EUPV)\ndataset a large dataset including paired and unpaired sets of underwater images\n(poor and good quality) taken with seven distinct cameras in a range of\nvisibility situation during research on ocean exploration and human-robot\ncooperation. In addition, we perform qualitative and quantitative evaluation\nwhich supports the given technique applied and provided a better contrast\nenhancement model of underwater imagery. More significantly, the upgraded\nimages provide better results from conventional models and further for under\nwater navigation, pose estimation, saliency prediction, object detection and\ntracking. The results validate the appropriateness of the model for autonomous\nunderwater vehicles (AUV) in visual navigation.\n","authors":["Tashmoy Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.07649v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07645v1","updated":"2024-04-11T11:07:57Z","published":"2024-04-11T11:07:57Z","title":"Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in\n Videos","summary":" Skeleton Action Recognition (SAR) involves identifying human actions using\nskeletal joint coordinates and their interconnections. While plain Transformers\nhave been attempted for this task, they still fall short compared to the\ncurrent leading methods, which are rooted in Graph Convolutional Networks\n(GCNs) due to the absence of structural priors. Recently, a novel selective\nstate space model, Mamba, has surfaced as a compelling alternative to the\nattention mechanism in Transformers, offering efficient modeling of long\nsequences. In this work, to the utmost extent of our awareness, we present the\nfirst SAR framework incorporating Mamba. Each fundamental block of our model\nadopts a novel U-ShiftGCN architecture with Mamba as its core component. The\nencoder segment of the U-ShiftGCN is devised to extract spatial features from\nthe skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial\nfeatures then undergo intermediate temporal modeling facilitated by the Mamba\nblock before progressing to the encoder section, which comprises vanilla\nupsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal\nmodeling unit is employed before the exit of each fundamental block to refine\ntemporal representations. 
This particular integration of downsampling spatial,\nintermediate temporal, upsampling spatial, and ultimate temporal subunits\nyields promising results for skeleton action recognition. We dub the resulting\nmodel \\textbf{Simba}, which attains state-of-the-art performance across three\nwell-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU RGB+D\n120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without\nIntermediate Mamba Block) by itself is capable of performing reasonably well\nand surpasses our baseline.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2404.07645v1.pdf","comment":"20 pages, 6 tables, 1 figure"},{"id":"http://arxiv.org/abs/2404.03425v2","updated":"2024-04-11T10:51:34Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings. Recently, the Mamba architecture,\nbased on state space models, has shown remarkable performance in a series of\nnatural language processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Specifically, we obtained\n83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,\nand WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA\ndataset xBD, we obtained 81.41% overall F1 score. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16074v2","updated":"2024-04-11T10:45:05Z","published":"2023-10-24T15:16:19Z","title":"RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided\n Image Synthesis","summary":" Pose-guided person image synthesis task requires re-rendering a reference\nimage, which should have a photorealistic appearance and flawless pose\ntransfer. 
Since person images are highly structured, existing approaches\nrequire dense connections for complex deformations and occlusions because these\nare generally handled through multi-level warping and masking in latent space.\nThe feature maps generated by convolutional neural networks do not have\nequivariance, and hence multi-level warping is required to perform pose\nalignment. Inspired by the ability of the diffusion model to generate\nphotorealistic images from the given conditional guidance, we propose recurrent\npose alignment to provide pose-aligned texture features as conditional\nguidance. Due to the leakage of the source pose in conditional guidance, we\npropose gradient guidance from pose interaction fields, which output the\ndistance from the valid pose manifold given a predicted pose as input. This\nhelps in learning plausible pose transfer trajectories that result in\nphotorealism and undistorted texture details. Extensive results on two\nlarge-scale benchmarks and a user study demonstrate the ability of our proposed\napproach to generate photorealistic pose transfer under challenging scenarios.\nAdditionally, we demonstrate the efficiency of gradient guidance in pose-guided\nimage generation on the HumanArt dataset with fine-tuned stable diffusion.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16074v2.pdf","comment":"Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7\n figures"},{"id":"http://arxiv.org/abs/2312.01919v2","updated":"2024-04-11T10:38:33Z","published":"2023-12-04T14:23:18Z","title":"COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy\n Prediction","summary":" The autonomous driving community has shown significant interest in 3D\noccupancy prediction, driven by its exceptional geometric perception and\ngeneral object recognition capabilities. To achieve this, current works try to\nconstruct a Tri-Perspective View (TPV) or Occupancy (OCC) representation\nextending from the Bird-Eye-View perception. However, compressed views like TPV\nrepresentation lose 3D geometry information while raw and sparse OCC\nrepresentation requires heavy but redundant computational costs. To address the\nabove limitations, we propose Compact Occupancy TRansformer (COTR), with a\ngeometry-aware occupancy encoder and a semantic-aware group decoder to\nreconstruct a compact 3D OCC representation. The occupancy encoder first\ngenerates a compact geometrical OCC feature through efficient explicit-implicit\nview transformation. Then, the occupancy decoder further enhances the semantic\ndiscriminability of the compact OCC representation by a coarse-to-fine semantic\ngrouping strategy. Empirical experiments show that there are evident\nperformance gains across multiple baselines, e.g., COTR outperforms baselines\nwith a relative improvement of 8%-15%, demonstrating the superiority of our\nmethod.\n","authors":["Qihang Ma","Xin Tan","Yanyun Qu","Lizhuang Ma","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2312.01919v2.pdf","comment":"CVPR2024. Code is available at https://github.com/NotACracker/COTR"},{"id":"http://arxiv.org/abs/2404.07626v1","updated":"2024-04-11T10:26:40Z","published":"2024-04-11T10:26:40Z","title":"Homography Guided Temporal Fusion for Road Line and Marking Segmentation","summary":" Reliable segmentation of road lines and markings is critical to autonomous\ndriving. 
Our work is motivated by the observations that road lines and markings\nare (1) frequently occluded in the presence of moving vehicles, shadow, and\nglare and (2) highly structured with low intra-class shape variance and overall\nhigh appearance consistency. To solve these issues, we propose a Homography\nGuided Fusion (HomoFusion) module to exploit temporally-adjacent video frames\nfor complementary cues facilitating the correct classification of the partially\noccluded road lines or markings. To reduce computational complexity, a novel\nsurface normal estimator is proposed to establish spatial correspondences\nbetween the sampled frames, allowing the HomoFusion module to perform a\npixel-to-pixel attention mechanism in updating the representation of the\noccluded road lines or markings. Experiments on ApolloScape, a large-scale lane\nmark segmentation dataset, and ApolloScape Night with artificial simulated\nnight-time road conditions, demonstrate that our method outperforms other\nexisting SOTA lane mark segmentation models with less than 9\\% of their\nparameters and computational complexity. We show that exploiting available\ncamera intrinsic data and ground plane assumption for cross-frame\ncorrespondence can lead to a light-weight network with significantly improved\nperformances in speed and accuracy. We also prove the versatility of our\nHomoFusion approach by applying it to the problem of water puddle segmentation\nand achieving SOTA performance.\n","authors":["Shan Wang","Chuong Nguyen","Jiawei Liu","Kaihao Zhang","Wenhan Luo","Yanhao Zhang","Sundaram Muthu","Fahira Afzal Maken","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07626v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15855v2","updated":"2024-04-11T10:19:41Z","published":"2023-08-30T08:44:21Z","title":"IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain\n Adaptation in Semantic Segmentation","summary":" Despite recent advances in semantic segmentation, an inevitable challenge is\nthe performance degradation caused by the domain shift in real applications.\nCurrent dominant approach to solve this problem is unsupervised domain\nadaptation (UDA). However, the absence of labeled target data in UDA is overly\nrestrictive and limits performance. To overcome this limitation, a more\npractical scenario called semi-supervised domain adaptation (SSDA) has been\nproposed. Existing SSDA methods are derived from the UDA paradigm and primarily\nfocus on leveraging the unlabeled target data and source data. In this paper,\nwe highlight the significance of exploiting the intra-domain information\nbetween the labeled target data and unlabeled target data. Instead of solely\nusing the scarce labeled target data for supervision, we propose a novel SSDA\nframework that incorporates both Inter and Intra Domain Mixing (IIDM), where\ninter-domain mixing mitigates the source-target domain gap and intra-domain\nmixing enriches the available target domain information, and the network can\ncapture more domain-invariant features. We also explore different domain mixing\nstrategies to better exploit the target domain information. 
Comprehensive\nexperiments conducted on the GTA5 to Cityscapes and SYNTHIA to Cityscapes\nbenchmarks demonstrate the effectiveness of IIDM, surpassing previous methods\nby a large margin.\n","authors":["Weifu Fu","Qiang Nie","Jialin Li","Yuhuan Lin","Kai Wu","Jian Li","Yabiao Wang","Yong Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15855v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07622v1","updated":"2024-04-11T10:16:44Z","published":"2024-04-11T10:16:44Z","title":"Multi-Image Visual Question Answering for Unsupervised Anomaly Detection","summary":" Unsupervised anomaly detection enables the identification of potential\npathological areas by juxtaposing original images with their pseudo-healthy\nreconstructions generated by models trained exclusively on normal images.\nHowever, the clinical interpretation of resultant anomaly maps presents a\nchallenge due to a lack of detailed, understandable explanations. Recent\nadvancements in language models have shown the capability of mimicking\nhuman-like understanding and providing detailed descriptions. This raises an\ninteresting question: \\textit{How can language models be employed to make the\nanomaly maps more explainable?} To the best of our knowledge, we are the first\nto leverage a language model for unsupervised anomaly detection, for which we\nconstruct a dataset with different questions and answers. Additionally, we\npresent a novel multi-image visual question answering framework tailored for\nanomaly detection, incorporating diverse feature fusion strategies to enhance\nvisual knowledge extraction. Our experiments reveal that the framework,\naugmented by our new Knowledge Q-Former module, adeptly answers questions on\nthe anomaly detection dataset. Besides, integrating anomaly maps as inputs\ndistinctly aids in improving the detection of unseen pathologies.\n","authors":["Jun Li","Cosmin I. Bercea","Philip Müller","Lina Felsner","Suhwan Kim","Daniel Rueckert","Benedikt Wiestler","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.07622v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07620v1","updated":"2024-04-11T10:14:56Z","published":"2024-04-11T10:14:56Z","title":"Diffusion Probabilistic Multi-cue Level Set for Reducing Edge\n Uncertainty in Pancreas Segmentation","summary":" Accurately segmenting the pancreas remains a huge challenge. Traditional\nmethods encounter difficulties in semantic localization due to the small volume\nand distorted structure of the pancreas, while deep learning methods encounter\nchallenges in obtaining accurate edges because of low contrast and organ\noverlapping. To overcome these issues, we propose a multi-cue level set method\nbased on the diffusion probabilistic model, namely Diff-mcs. Our method adopts\na coarse-to-fine segmentation strategy. We use the diffusion probabilistic\nmodel in the coarse segmentation stage, with the obtained probability\ndistribution serving as both the initial localization and prior cues for the\nlevel set method. In the fine segmentation stage, we combine the prior cues\nwith grayscale cues and texture cues to refine the edge by maximizing the\ndifference between probability distributions of the cues inside and outside the\nlevel set curve. The method is validated on three public datasets and achieves\nstate-of-the-art performance, which can obtain more accurate segmentation\nresults with lower uncertainty segmentation edges. 
In addition, we conduct\nablation studies and uncertainty analysis to verify that the diffusion\nprobability model provides a more appropriate initialization for the level set\nmethod. Furthermore, when combined with multiple cues, the level set method can\nbetter obtain edges and improve the overall accuracy. Our code is available at\nhttps://github.com/GOUYUEE/Diff-mcs.\n","authors":["Yue Gou","Yuming Xing","Shengzhu Shi","Zhichang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.07620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18956v2","updated":"2024-04-11T10:06:10Z","published":"2024-02-29T08:51:51Z","title":"WWW: A Unified Framework for Explaining What, Where and Why of Neural\n Networks by Interpretation of Neuron Concepts","summary":" Recent advancements in neural networks have showcased their remarkable\ncapabilities across various domains. Despite these successes, the \"black box\"\nproblem still remains. Addressing this, we propose a novel framework, WWW, that\noffers the 'what', 'where', and 'why' of the neural network decisions in\nhuman-understandable terms. Specifically, WWW utilizes adaptive selection for\nconcept discovery, employing adaptive cosine similarity and thresholding\ntechniques to effectively explain 'what'. To address the 'where' and 'why', we\nproposed a novel combination of neuron activation maps (NAMs) with Shapley\nvalues, generating localized concept maps and heatmaps for individual inputs.\nFurthermore, WWW introduces a method for predicting uncertainty, leveraging\nheatmap similarities to estimate 'how' reliable the prediction is. Experimental\nevaluations of WWW demonstrate superior performance in both quantitative and\nqualitative metrics, outperforming existing methods in interpretability. WWW\nprovides a unified solution for explaining 'what', 'where', and 'why',\nintroducing a method for localized explanations from global interpretations and\noffering a plug-and-play solution adaptable to various architectures.\n","authors":["Yong Hyun Ahn","Hyeon Bae Kim","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2402.18956v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01705v2","updated":"2024-04-11T10:05:12Z","published":"2024-04-02T07:38:16Z","title":"Samba: Semantic Segmentation of Remotely Sensed Images with State Space\n Model","summary":" High-resolution remotely sensed images pose a challenge for commonly used\nsemantic segmentation methods such as Convolutional Neural Network (CNN) and\nVision Transformer (ViT). CNN-based methods struggle with handling such\nhigh-resolution images due to their limited receptive field, while ViT faces\nchallenges in handling long sequences. Inspired by Mamba, which adopts a State\nSpace Model (SSM) to efficiently capture global semantic information, we\npropose a semantic segmentation framework for high-resolution remotely sensed\nimages, named Samba. Samba utilizes an encoder-decoder architecture, with Samba\nblocks serving as the encoder for efficient multi-level semantic information\nextraction, and UperNet functioning as the decoder. We evaluate Samba on the\nLoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance\nagainst top-performing CNN and ViT methods. The results reveal that Samba\nachieved unparalleled performance on commonly used remote sensing datasets for\nsemantic segmentation. 
Our proposed Samba demonstrates for the first time the\neffectiveness of SSM in semantic segmentation of remotely sensed images,\nsetting a new benchmark in performance for Mamba-based techniques in this\nspecific application. The source code and baseline implementations are\navailable at https://github.com/zhuqinfeng1999/Samba.\n","authors":["Qinfeng Zhu","Yuanzhi Cai","Yuan Fang","Yihan Yang","Cheng Chen","Lei Fan","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07610v1","updated":"2024-04-11T09:58:23Z","published":"2024-04-11T09:58:23Z","title":"Do You Remember? Dense Video Captioning with Cross-Modal Memory\n Retrieval","summary":" There has been significant attention to the research on dense video\ncaptioning, which aims to automatically localize and caption all events within\nuntrimmed video. Several studies introduce methods by designing dense video\ncaptioning as a multitasking problem of event localization and event captioning\nto consider inter-task relations. However, addressing both tasks using only\nvisual input is challenging due to the lack of semantic content. In this study,\nwe address this by proposing a novel framework inspired by the cognitive\ninformation processing of humans. Our model utilizes external memory to\nincorporate prior knowledge. The memory retrieval method is proposed with\ncross-modal video-to-text matching. To effectively incorporate retrieved text\nfeatures, the versatile encoder and the decoder with visual and textual\ncross-attention modules are designed. Comparative experiments have been\nconducted to show the effectiveness of the proposed method on ActivityNet\nCaptions and YouCook2 datasets. Experimental results show promising performance\nof our model without extensive pretraining from a large video dataset.\n","authors":["Minkuk Kim","Hyeon Bae Kim","Jinyoung Moon","Jinwoo Choi","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07610v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07607v1","updated":"2024-04-11T09:50:05Z","published":"2024-04-11T09:50:05Z","title":"Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning\n and Satellite Imagery","summary":" Despite extensive research into ship detection via remote sensing, no studies\nidentify ship-to-ship transfers in satellite imagery. Given the importance of\ntransshipment in illicit shipping practices, this is a significant gap. In what\nfollows, I train a convolutional neural network to accurately detect 4\ndifferent types of cargo vessel and two different types of Ship-to-Ship\ntransfer in PlanetScope satellite imagery. I then elaborate a pipeline for the\nautomatic detection of suspected illicit ship-to-ship transfers by\ncross-referencing satellite detections with vessel borne GPS data. Finally, I\napply this method to the Kerch Strait between Ukraine and Russia to identify\nover 400 dark transshipment events since 2022.\n","authors":["Ollie Ballinger"],"pdf_url":"https://arxiv.org/pdf/2404.07607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07605v1","updated":"2024-04-11T09:47:52Z","published":"2024-04-11T09:47:52Z","title":"Contrastive-Based Deep Embeddings for Label Noise-Resilient\n Histopathology Image Classification","summary":" Recent advancements in deep learning have proven highly effective in medical\nimage classification, notably within histopathology. 
However, noisy labels\nrepresent a critical challenge in histopathology image classification, where\naccurate annotations are vital for training robust deep learning models.\nIndeed, deep neural networks can easily overfit label noise, leading to severe\ndegradations in model performance. While numerous public pathology foundation\nmodels have emerged recently, none have evaluated their resilience to label\nnoise. Through thorough empirical analyses across multiple datasets, we exhibit\nthe label noise resilience property of embeddings extracted from foundation\nmodels trained in a self-supervised contrastive manner. We demonstrate that\ntraining with such embeddings substantially enhances label noise robustness\nwhen compared to non-contrastive-based ones as well as commonly used\nnoise-resilient methods. Our results unequivocally underline the superiority of\ncontrastive learning in effectively mitigating the label noise challenge. Code\nis publicly available at\nhttps://github.com/LucasDedieu/NoiseResilientHistopathology.\n","authors":["Lucas Dedieu","Nicolas Nerrienet","Adrien Nivaggioli","Clara Simmat","Marceau Clavel","Arnaud Gauthier","Stéphane Sockeel","Rémy Peyret"],"pdf_url":"https://arxiv.org/pdf/2404.07605v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.07603v1","updated":"2024-04-11T09:43:07Z","published":"2024-04-11T09:43:07Z","title":"GLID: Pre-training a Generalist Encoder-Decoder Vision Model","summary":" This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method\nfor better handling various downstream computer vision tasks. While\nself-supervised pre-training approaches, e.g., Masked Autoencoder, have shown\nsuccess in transfer learning, task-specific sub-architectures are still\nrequired to be appended for different downstream tasks, which cannot enjoy the\nbenefits of large-scale pre-training. GLID overcomes this challenge by allowing\nthe pre-trained generalist encoder-decoder to be fine-tuned on various vision\ntasks with minimal task-specific architecture modifications. In the GLID\ntraining scheme, pre-training pretext task and other downstream tasks are\nmodeled as \"query-to-answer\" problems, including the pre-training pretext task\nand other downstream tasks. We pre-train a task-agnostic encoder-decoder with\nquery-mask pairs. During fine-tuning, GLID maintains the pre-trained\nencoder-decoder and queries, only replacing the topmost linear transformation\nlayer with task-specific linear heads. This minimizes the pretrain-finetune\narchitecture inconsistency and enables the pre-trained model to better adapt to\ndownstream tasks. GLID achieves competitive performance on various vision\ntasks, including object detection, image segmentation, pose estimation, and\ndepth estimation, outperforming or matching specialist models such as\nMask2Former, DETR, ViTPose, and BinsFormer.\n","authors":["Jihao Liu","Jinliang Zheng","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.07603v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07602v1","updated":"2024-04-11T09:41:14Z","published":"2024-04-11T09:41:14Z","title":"Attention based End to end network for Offline Writer Identification on\n Word level data","summary":" Writer identification due to its widespread application in various fields has\ngained popularity over the years. 
In scenarios where optimum handwriting\nsamples are available, whether they be in the form of a single line, a\nsentence, or an entire page, writer identification algorithms have demonstrated\nnoteworthy levels of accuracy. However, in scenarios where only a limited\nnumber of handwritten samples are available, particularly in the form of word\nimages, there is a significant scope for improvement.\n In this paper, we propose a writer identification system based on an\nattention-driven Convolutional Neural Network (CNN). The system is trained\nutilizing image segments, known as fragments, extracted from word images,\nemploying a pyramid-based strategy. This methodology enables the system to\ncapture a comprehensive representation of the data, encompassing both\nfine-grained details and coarse features across various levels of abstraction.\nThese extracted fragments serve as the training data for the convolutional\nnetwork, enabling it to learn a more robust representation compared to\ntraditional convolution-based networks trained on word images. Additionally,\nthe paper explores the integration of an attention mechanism to enhance the\nrepresentational power of the learned features. The efficacy of the proposed\nalgorithm is evaluated on three benchmark databases, demonstrating its\nproficiency in writer identification tasks, particularly in scenarios with\nlimited access to handwriting data.\n","authors":["Vineet Kumar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2404.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07600v1","updated":"2024-04-11T09:39:58Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises of an implicit\nlanguage guidance branch and an explicit language guidance branch. The implicit\nbranch employs frozen CLIP image encoder to directly generate implicit text\nembeddings that are fed to diffusion model, without using explicit text\nprompts. The explicit branch utilizes the ground-truth labels of corresponding\nimages as text prompts to condition feature extraction of diffusion model.\nDuring training, we jointly train diffusion model by sharing the model weights\nof these two branches. As a result, implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP has the mIoU score of 55.9% on AD20K\nvalidation set, which outperforms the baseline method VPD by 2.2%. 
For depth\nestimation, our IEDP outperforms the baseline method VPD with a relative gain\nof 10.2%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07594v1","updated":"2024-04-11T09:23:44Z","published":"2024-04-11T09:23:44Z","title":"Weakly-Supervised Learning via Multi-Lateral Decoder Branching for\n Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization","summary":" Although robot-assisted cardiovascular catheterization is commonly performed\nfor intervention of cardiovascular diseases, more studies are needed to support\nthe procedure with automated tool segmentation. This can aid surgeons on tool\ntracking and visualization during intervention. Learning-based segmentation has\nrecently offered state-of-the-art segmentation performances however, generating\nground-truth signals for fully-supervised methods is labor-intensive and time\nconsuming for the interventionists. In this study, a weakly-supervised learning\nmethod with multi-lateral pseudo labeling is proposed for tool segmentation in\ncardiac angiograms. The method includes a modified U-Net model with one encoder\nand multiple lateral-branched decoders that produce pseudo labels as\nsupervision signals under different perturbation. The pseudo labels are\nself-generated through a mixed loss function and shared consistency in the\ndecoders. We trained the model end-to-end with weakly-annotated data obtained\nduring robotic cardiac catheterization. Experiments with the proposed model\nshows weakly annotated data has closer performance to when fully annotated data\nis used. Compared to three existing weakly-supervised methods, our approach\nyielded higher segmentation performance across three different cardiac\nangiogram data. With ablation study, we showed consistent performance under\ndifferent parameters. Thus, we offer a less expensive method for real-time tool\nsegmentation and tracking during robot-assisted cardiac catheterization.\n","authors":["Olatunji Mumini Omisore","Toluwanimi Akinyemi","Anh Nguyen","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07580v1","updated":"2024-04-11T09:13:50Z","published":"2024-04-11T09:13:50Z","title":"Multi-rater Prompting for Ambiguous Medical Image Segmentation","summary":" Multi-rater annotations commonly occur when medical images are independently\nannotated by multiple experts (raters). In this paper, we tackle two challenges\narisen in multi-rater annotations for medical image segmentation (called\nambiguous medical image segmentation): (1) How to train a deep learning model\nwhen a group of raters produces a set of diverse but plausible annotations, and\n(2) how to fine-tune the model efficiently when computation resources are not\navailable for re-training the entire model on a different dataset domain. We\npropose a multi-rater prompt-based approach to address these two challenges\naltogether. Specifically, we introduce a series of rater-aware prompts that can\nbe plugged into the U-Net model for uncertainty estimation to handle\nmulti-annotation cases. During the prompt-based fine-tuning process, only 0.3%\nof learnable parameters are required to be updated comparing to training the\nentire model. 
Further, in order to integrate expert consensus and disagreement,\nwe explore different multi-rater incorporation strategies and design a\nmix-training strategy for comprehensive insight learning. Extensive experiments\nverify the effectiveness of our new approach for ambiguous medical image\nsegmentation on two public datasets while alleviating the heavy burden of model\nre-training.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Hongxia Xu","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07122v2","updated":"2024-04-11T09:10:21Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Hoai Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07564v1","updated":"2024-04-11T08:50:12Z","published":"2024-04-11T08:50:12Z","title":"ObjBlur: A Curriculum Learning Approach With Progressive Object-Level\n Blurring for Improved Layout-to-Image Generation","summary":" We present ObjBlur, a novel curriculum learning approach to improve\nlayout-to-image generation models, where the task is to produce realistic\nimages from layouts composed of boxes and labels. Our method is based on\nprogressive object-level blurring, which effectively stabilizes training and\nenhances the quality of generated images. This curriculum learning strategy\nsystematically applies varying degrees of blurring to individual objects or the\nbackground during training, starting from strong blurring to progressively\ncleaner images. Our findings reveal that this approach yields significant\nperformance improvements, stabilized training, smoother convergence, and\nreduced variance between multiple runs. Moreover, our technique demonstrates\nits versatility by being compatible with generative adversarial networks and\ndiffusion models, underlining its applicability across various generative\nmodeling paradigms. 
With ObjBlur, we reach new state-of-the-art results on the\ncomplex COCO and Visual Genome datasets.\n","authors":["Stanislav Frolov","Brian B. Moser","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2404.07564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06710v2","updated":"2024-04-11T08:40:42Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Cheng","Baixin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07556v1","updated":"2024-04-11T08:36:36Z","published":"2024-04-11T08:36:36Z","title":"Attention-Aware Laparoscopic Image Desmoking Network with Lightness\n Embedding and Hybrid Guided Embedding","summary":" This paper presents a novel method of smoke removal from the laparoscopic\nimages. Due to the heterogeneous nature of surgical smoke, a two-stage network\nis proposed to estimate the smoke distribution and reconstruct a clear,\nsmoke-free surgical scene. The utilization of the lightness channel plays a\npivotal role in providing vital information pertaining to smoke density. The\nreconstruction of smoke-free image is guided by a hybrid embedding, which\ncombines the estimated smoke mask with the initial image. Experimental results\ndemonstrate that the proposed method boasts a Peak Signal to Noise Ratio that\nis $2.79\\%$ higher than the state-of-the-art methods, while also exhibits a\nremarkable $38.2\\%$ reduction in run-time. Overall, the proposed method offers\ncomparable or even superior performance in terms of both smoke removal quality\nand computational efficiency when compared to existing state-of-the-art\nmethods. 
This work will be publicly available on\nhttp://homepage.hit.edu.cn/wpgao\n","authors":["Ziteng Liu","Jiahua Zhu","Bainan Liu","Hao Liu","Wenpeng Gao","Yili Fu"],"pdf_url":"https://arxiv.org/pdf/2404.07556v1.pdf","comment":"ISBI2024"},{"id":"http://arxiv.org/abs/2404.07554v1","updated":"2024-04-11T08:36:13Z","published":"2024-04-11T08:36:13Z","title":"CAT: Contrastive Adapter Training for Personalized Image Generation","summary":" The emergence of various adapters, including Low-Rank Adaptation (LoRA)\napplied from the field of natural language processing, has allowed diffusion\nmodels to personalize image generation at a low cost. However, due to the\nvarious challenges including limited datasets and shortage of regularization\nand computation resources, adapter training often results in unsatisfactory\noutcomes, leading to the corruption of the backbone model's prior knowledge.\nOne of the well known phenomena is the loss of diversity in object generation,\nespecially within the same class which leads to generating almost identical\nobjects with minor variations. This poses challenges in generation\ncapabilities. To solve this issue, we present Contrastive Adapter Training\n(CAT), a simple yet effective strategy to enhance adapter training through the\napplication of CAT loss. Our approach facilitates the preservation of the base\nmodel's original knowledge when the model initiates adapters. Furthermore, we\nintroduce the Knowledge Preservation Score (KPS) to evaluate CAT's ability to\nkeep the former information. We qualitatively and quantitatively compare CAT's\nimprovement. Finally, we mention the possibility of CAT in the aspects of\nmulti-concept adapter and optimization.\n","authors":["Jae Wan Park","Sang Hyun Park","Jun Young Koh","Junha Lee","Min Song"],"pdf_url":"https://arxiv.org/pdf/2404.07554v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.07553v1","updated":"2024-04-11T08:35:24Z","published":"2024-04-11T08:35:24Z","title":"SFSORT: Scene Features-based Simple Online Real-Time Tracker","summary":" This paper introduces SFSORT, the world's fastest multi-object tracking\nsystem based on experiments conducted on MOT Challenge datasets. To achieve an\naccurate and computationally efficient tracker, this paper employs a\ntracking-by-detection method, following the online real-time tracking approach\nestablished in prior literature. By introducing a novel cost function called\nthe Bounding Box Similarity Index, this work eliminates the Kalman Filter,\nleading to reduced computational requirements. Additionally, this paper\ndemonstrates the impact of scene features on enhancing object-track association\nand improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the\nproposed method achieves an HOTA of 61.7\\% with a processing speed of 2242 Hz\non the MOT17 dataset and an HOTA of 60.9\\% with a processing speed of 304 Hz on\nthe MOT20 dataset. The tracker's source code, fine-tuned object detection\nmodel, and tutorials are available at\n\\url{https://github.com/gitmehrdad/SFSORT}.\n","authors":["M. M. Morsali","Z. Sharifi","F. Fallah","S. Hashembeiki","H. Mohammadzade","S. 
Bagheri Shouraki"],"pdf_url":"https://arxiv.org/pdf/2404.07553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07551v1","updated":"2024-04-11T08:34:10Z","published":"2024-04-11T08:34:10Z","title":"Event-Enhanced Snapshot Compressive Videography at 10K FPS","summary":" Video snapshot compressive imaging (SCI) encodes the target dynamic scene\ncompactly into a snapshot and reconstructs its high-speed frame sequence\nafterward, greatly reducing the required data footprint and transmission\nbandwidth as well as enabling high-speed imaging with a low frame rate\nintensity camera. In implementation, high-speed dynamics are encoded via\ntemporally varying patterns, and only frames at corresponding temporal\nintervals can be reconstructed, while the dynamics occurring between\nconsecutive frames are lost. To unlock the potential of conventional snapshot\ncompressive videography, we propose a novel hybrid \"intensity+event\" imaging\nscheme by incorporating an event camera into a video SCI setup. Our proposed\nsystem consists of a dual-path optical setup to record the coded intensity\nmeasurement and intermediate event signals simultaneously, which is compact and\nphoton-efficient by collecting the half photons discarded in conventional video\nSCI. Correspondingly, we developed a dual-branch Transformer utilizing the\nreciprocal relationship between two data modes to decode dense video frames.\nExtensive experiments on both simulated and real-captured data demonstrate our\nsuperiority to state-of-the-art video SCI and video frame interpolation (VFI)\nmethods. Benefiting from the new hybrid design leveraging both intrinsic\nredundancy in videos and the unique feature of event cameras, we achieve\nhigh-quality videography at 0.1ms time intervals with a low-cost CMOS image\nsensor working at 24 FPS.\n","authors":["Bo Zhang","Jinli Suo","Qionghai Dai"],"pdf_url":"https://arxiv.org/pdf/2404.07551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10372v2","updated":"2024-04-11T08:21:09Z","published":"2023-10-16T13:11:35Z","title":"Learning Object Permanence from Videos via Latent Imaginations","summary":" While human infants exhibit knowledge about object permanence from two months\nof age onwards, deep-learning approaches still largely fail to recognize\nobjects' continued existence. We introduce a slot-based autoregressive deep\nlearning system, the looped location and identity tracking model Loci-Looped,\nwhich learns to adaptively fuse latent imaginations with pixel-space\nobservations into consistent latent object-specific what and where encodings\nover time. The novel loop empowers Loci-Looped to learn the physical concepts\nof object permanence, directional inertia, and object solidity through\nobservation alone. As a result, Loci-Looped tracks objects through occlusions,\nanticipates their reappearance, and shows signs of surprise and internal\nrevisions when observing implausible object behavior. Notably, Loci-Looped\noutperforms state-of-the-art baseline models in handling object occlusions and\ntemporary sensory interruptions while exhibiting more compositional,\ninterpretable internal activity patterns. Our work thus introduces the first\nself-supervised interpretable learning model that learns about object\npermanence directly from video data without supervision.\n","authors":["Manuel Traub","Frederic Becker","Sebastian Otte","Martin V. 
Butz"],"pdf_url":"https://arxiv.org/pdf/2310.10372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15011v3","updated":"2024-04-11T08:16:53Z","published":"2023-11-25T12:34:02Z","title":"VSCode: General Visual Salient and Camouflaged Object Detection with 2D\n Prompt Learning","summary":" Salient object detection (SOD) and camouflaged object detection (COD) are\nrelated yet distinct binary mapping tasks. These tasks involve multiple\nmodalities, sharing commonalities and unique cues. Existing research often\nemploys intricate task-specific specialist models, potentially leading to\nredundancy and suboptimal results. We introduce VSCode, a generalist model with\nnovel 2D prompt learning, to jointly address four SOD tasks and three COD\ntasks. We utilize VST as the foundation model and introduce 2D prompts within\nthe encoder-decoder architecture to learn domain and task-specific knowledge on\ntwo separate dimensions. A prompt discrimination loss helps disentangle\npeculiarities to benefit model optimization. VSCode outperforms\nstate-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot\ngeneralization to unseen tasks by combining 2D prompts, such as RGB-D COD.\nSource code has been available at https://github.com/Sssssuperior/VSCode.\n","authors":["Ziyang Luo","Nian Liu","Wangbo Zhao","Xuguang Yang","Dingwen Zhang","Deng-Ping Fan","Fahad Khan","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.15011v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2204.01348v2","updated":"2024-04-11T08:12:50Z","published":"2022-04-04T09:46:30Z","title":"Extended Reality for Mental Health Evaluation -A Scoping Review","summary":" Mental health disorders are the leading cause of health-related problems\nglobally. It is projected that mental health disorders will be the leading\ncause of morbidity among adults as the incidence rates of anxiety and\ndepression grows globally. Recently, extended reality (XR), a general term\ncovering virtual reality (VR), augmented reality (AR) and mixed reality (MR),\nis paving a new way to deliver mental health care. In this paper, we conduct a\nscoping review on the development and application of XR in the area of mental\ndisorders. We performed a scoping database search to identify the relevant\nstudies indexed in Google Scholar, PubMed, and the ACM Digital Library. A\nsearch period between August 2016 and December 2023 was defined to select\narticles related to the usage of VR, AR, and MR in a mental health context. We\nidentified a total of 85 studies from 27 countries across the globe. By\nperforming data analysis, we found that most of the studies focused on\ndeveloped countries such as the US (16.47%) and Germany (12.94%). None of the\nstudies were for African countries. The majority of the articles reported that\nXR techniques led to a significant reduction in symptoms of anxiety or\ndepression. More studies were published in the year 2021, i.e., 31.76% (n =\n31). This could indicate that mental disorder intervention received a higher\nattention when COVID-19 emerged. Most studies (n = 65) focused on a population\nbetween 18 and 65 years old, only a few studies focused on teenagers (n = 2).\nAlso, more studies were done experimentally (n = 67, 78.82%) rather than by\nanalytical and modeling approaches (n = 8, 9.41%). This shows that there is a\nrapid development of XR technology for mental health care. 
Furthermore, these\nstudies showed that XR technology can effectively be used for evaluating mental\ndisorders in similar or better way as the conventional approaches.\n","authors":["Omisore Olatunji","Ifeanyi Odenigbo","Joseph Orji","Amelia Beltran","Nilufar Baghaei","Meier Sandra","Rita Orji"],"pdf_url":"https://arxiv.org/pdf/2204.01348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07545v1","updated":"2024-04-11T08:12:48Z","published":"2024-04-11T08:12:48Z","title":"Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned\n Disparity-Depth Conversion","summary":" Accurate and dense depth estimation with stereo cameras and LiDAR is an\nimportant task for automatic driving and robotic perception. While sparse hints\nfrom LiDAR points have improved cost aggregation in stereo matching, their\neffectiveness is limited by the low density and non-uniform distribution. To\naddress this issue, we propose a novel stereo-LiDAR depth estimation network\nwith Semi-Dense hint Guidance, named SDG-Depth. Our network includes a\ndeformable propagation module for generating a semi-dense hint map and a\nconfidence map by propagating sparse hints using a learned deformable window.\nThese maps then guide cost aggregation in stereo matching. To reduce the\ntriangulation error in depth recovery from disparity, especially in distant\nregions, we introduce a disparity-depth conversion module. Our method is both\naccurate and efficient. The experimental results on benchmark tests show its\nsuperior performance. Our code is available at\nhttps://github.com/SJTU-ViSYS/SDG-Depth.\n","authors":["Ang Li","Anning Hu","Wei Xi","Wenxian Yu","Danping Zou"],"pdf_url":"https://arxiv.org/pdf/2404.07545v1.pdf","comment":"Accepted in ICRA 2024. 8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.07543v1","updated":"2024-04-11T08:11:36Z","published":"2024-04-11T08:11:36Z","title":"Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening","summary":" Currently, machine learning-based methods for remote sensing pansharpening\nhave progressed rapidly. However, existing pansharpening methods often do not\nfully exploit differentiating regional information in non-local spaces, thereby\nlimiting the effectiveness of the methods and resulting in redundant learning\nparameters. In this paper, we introduce a so-called content-adaptive non-local\nconvolution (CANConv), a novel method tailored for remote sensing image\npansharpening. Specifically, CANConv employs adaptive convolution, ensuring\nspatial adaptability, and incorporates non-local self-similarity through the\nsimilarity relationship partition (SRP) and the partition-wise adaptive\nconvolution (PWAC) sub-modules. Furthermore, we also propose a corresponding\nnetwork architecture, called CANNet, which mainly utilizes the multi-scale\nself-similarity. Extensive experiments demonstrate the superior performance of\nCANConv, compared with recent promising fusion methods. Besides, we\nsubstantiate the method's effectiveness through visualization, ablation\nexperiments, and comparison with existing methods on multiple test sets. 
The\nsource code is publicly available at https://github.com/duanyll/CANConv.\n","authors":["Yule Duan","Xiao Wu","Haoyu Deng","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07543v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.11725v2","updated":"2024-04-11T08:11:20Z","published":"2023-10-18T05:44:49Z","title":"VST++: Efficient and Stronger Visual Saliency Transformer","summary":" While previous CNN-based models have exhibited promising results for salient\nobject detection (SOD), their ability to explore global long-range dependencies\nis restricted. Our previous work, the Visual Saliency Transformer (VST),\naddressed this constraint from a transformer-based sequence-to-sequence\nperspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task\ntransformer decoder that concurrently predicts saliency and boundary outcomes\nin a pure transformer architecture. Moreover, we introduced a novel token\nupsampling method called reverse T2T for predicting a high-resolution saliency\nmap effortlessly within transformer-based structures. Building upon the VST\nmodel, we further propose an efficient and stronger VST version in this work,\ni.e. VST++. To mitigate the computational costs of the VST model, we propose a\nSelect-Integrate Attention (SIA) module, partitioning foreground into\nfine-grained segments and aggregating background information into a single\ncoarse-grained token. To incorporate 3D depth information with low cost, we\ndesign a novel depth position encoding method tailored for depth maps.\nFurthermore, we introduce a token-supervised prediction loss to provide\nstraightforward guidance for the task-related tokens. We evaluate our VST++\nmodel across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD\nbenchmark datasets. Experimental results show that our model outperforms\nexisting methods while achieving a 25% reduction in computational costs without\nsignificant performance compromise. The demonstrated strong ability for\ngeneralization, enhanced performance, and heightened efficiency of our VST++\nmodel highlight its potential.\n","authors":["Nian Liu","Ziyang Luo","Ni Zhang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2310.11725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00349v2","updated":"2024-04-11T08:08:10Z","published":"2023-01-01T05:02:46Z","title":"Towards Reliable Medical Image Segmentation by utilizing Evidential\n Calibrated Uncertainty","summary":" Medical image segmentation is critical for disease diagnosis and treatment\nassessment. However, concerns regarding the reliability of segmentation regions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration to accuracy. To address this, we\nintroduce DEviS, an easily implementable foundational model that seamlessly\nintegrates into various medical image segmentation networks. DEviS not only\nenhances the calibration and robustness of baseline segmentation accuracy but\nalso provides high-efficiency uncertainty estimation for reliable predictions.\nBy leveraging subjective logic theory, we explicitly model probability and\nuncertainty for the problem of medical image segmentation. Here, the Dirichlet\ndistribution parameterizes the distribution of probabilities for different\nclasses of the segmentation results. 
To generate calibrated predictions and\nuncertainty, we develop a trainable calibrated uncertainty penalty.\nFurthermore, DEviS incorporates an uncertainty-aware filtering module, which\nutilizes the metric of uncertainty-calibrated error to filter reliable data\nwithin the dataset. We conducted validation studies to assess both the accuracy\nand robustness of DEviS segmentation, along with evaluating the efficiency and\nreliability of uncertainty estimation. These evaluations were performed using\npublicly available datasets including ISIC2018, LiTS2017, and BraTS2019.\nAdditionally, two potential clinical trials are being conducted at Johns\nHopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate their efficacy in\nfiltering high-quality or out-of-distribution data. Our code has been released\nin https://github.com/Cocofeat/DEviS.\n","authors":["Ke Zou","Yidi Chen","Ling Huang","Xuedong Yuan","Xiaojing Shen","Meng Wang","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2301.00349v2.pdf","comment":"34 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.00696v2","updated":"2024-04-11T08:03:25Z","published":"2023-06-01T14:06:48Z","title":"Analyzing the Internals of Neural Radiance Fields","summary":" Modern Neural Radiance Fields (NeRFs) learn a mapping from position to\nvolumetric density leveraging proposal network samplers. In contrast to the\ncoarse-to-fine sampling approach with two NeRFs, this offers significant\npotential for acceleration using lower network capacity. Given that NeRFs\nutilize most of their network capacity to estimate radiance, they could store\nvaluable density information in their parameters or their deep features. To\ninvestigate this proposition, we take one step back and analyze large, trained\nReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation\nvisualization method, we find that trained NeRFs, Mip-NeRFs and proposal\nnetwork samplers map samples with high density to local minima along a ray in\nactivation feature space. We show how these large MLPs can be accelerated by\ntransforming intermediate activations to a weight estimate, without any\nmodifications to the training protocol or the network architecture. With our\napproach, we can reduce the computational requirements of trained NeRFs by up\nto 50% with only a slight hit in rendering quality. Extensive experimental\nevaluation on a variety of datasets and architectures demonstrates the\neffectiveness of our approach. Consequently, our methodology provides valuable\ninsight into the inner workings of NeRFs.\n","authors":["Lukas Radl","Andreas Kurz","Michael Steiner","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2306.00696v2.pdf","comment":"Accepted to CVPRW'24! Project Page:\n https://r4dl.github.io/nerfinternals/"},{"id":"http://arxiv.org/abs/2404.07537v1","updated":"2024-04-11T08:03:23Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. 
In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09107v2","updated":"2024-04-11T07:42:43Z","published":"2024-03-14T05:00:29Z","title":"S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering","summary":" Anchor-based large-scale multi-view clustering has attracted considerable\nattention for its effectiveness in handling massive datasets. However, current\nmethods mainly seek the consensus embedding feature for clustering by exploring\nglobal correlations between anchor graphs or projection matrices.In this paper,\nwe propose a simple yet efficient scalable multi-view tensor clustering\n(S^2MVTC) approach, where our focus is on learning correlations of embedding\nfeatures within and across views. Specifically, we first construct the\nembedding feature tensor by stacking the embedding features of different views\ninto a tensor and rotating it. Additionally, we build a novel tensor\nlow-frequency approximation (TLFA) operator, which incorporates graph\nsimilarity into embedding feature learning, efficiently achieving smooth\nrepresentation of embedding features within different views. Furthermore,\nconsensus constraints are applied to embedding features to ensure inter-view\nsemantic consistency. Experimental results on six large-scale multi-view\ndatasets demonstrate that S^2MVTC significantly outperforms state-of-the-art\nalgorithms in terms of clustering performance and CPU execution time,\nespecially when handling massive data. The code of S^2MVTC is publicly\navailable at https://github.com/longzhen520/S2MVTC.\n","authors":["Zhen Long","Qiyuan Wang","Yazhou Ren","Yipeng Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.09107v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.07520v1","updated":"2024-04-11T07:26:00Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. 
Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33\\% in overall performance, by 1\\% in base-to-novel generalization, and\nby 2.84\\% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v1.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2404.06859v2","updated":"2024-04-11T07:24:59Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances and New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed approach\non a challenging benchmark consisting of two datasets, seven tasks, and\nnineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07518v1","updated":"2024-04-11T07:22:14Z","published":"2024-04-11T07:22:14Z","title":"Remembering Transformer for Continual Learning","summary":" Neural networks encounter the challenge of Catastrophic Forgetting (CF) in\ncontinual learning, where new task knowledge interferes with previously learned\nknowledge. 
We propose Remembering Transformer, inspired by the brain's\nComplementary Learning Systems (CLS), to tackle this issue. Remembering\nTransformer employs a mixture-of-adapters and a generative model-based routing\nmechanism to alleviate CF by dynamically routing task data to relevant\nadapters. Our approach demonstrated a new SOTA performance in various vision\ncontinual learning tasks and great parameter efficiency.\n","authors":["Yuwei Sun","Jun Sakuma","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2404.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16073v2","updated":"2024-04-11T07:20:52Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. To address these challenges, we propose\n\\textsc{FloCoDe}: \\textbf{Flo}w-aware Temporal Consistency and\n\\textbf{Co}rrelation \\textbf{De}biasing with uncertainty attenuation for\nunbiased dynamic scene graphs. \\textsc{FloCoDe} employs feature warping using\nflow to detect temporally consistent objects across frames. To address the\nlong-tail issue of visual relationships, we propose correlation debiasing and a\nlabel correlation-based loss to learn unbiased relation representations for\nlong-tailed classes. Specifically, we propose to incorporate label correlations\nusing contrastive loss to capture commonly co-occurring relations, which aids\nin learning robust representations for long-tailed classes. Further, we adopt\nthe uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. Extensive experimental evaluation shows a\nperformance gain as high as 4.1\\%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v2.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.05426v2","updated":"2024-04-11T07:12:35Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. 
First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07514v1","updated":"2024-04-11T07:11:43Z","published":"2024-04-11T07:11:43Z","title":"Generalization Gap in Data Augmentation: Insights from Illumination","summary":" In the field of computer vision, data augmentation is widely used to enrich\nthe feature complexity of training datasets with deep learning techniques.\nHowever, regarding the generalization capabilities of models, the difference in\nartificial features generated by data augmentation and natural visual features\nhas not been fully revealed. This study focuses on the visual representation\nvariable 'illumination', by simulating its distribution degradation and\nexamining how data augmentation techniques enhance model performance on a\nclassification task. Our goal is to investigate the differences in\ngeneralization between models trained with augmented data and those trained\nunder real-world illumination conditions. Results indicate that after\nundergoing various data augmentation methods, model performance has been\nsignificantly improved. Yet, a noticeable generalization gap still exists after\nutilizing various data augmentation methods, emphasizing the critical role of\nfeature diversity in the training set for enhancing model generalization.\n","authors":["Jianqiang Xiao","Weiwen Guo","Junfeng Liu","Mengze Li"],"pdf_url":"https://arxiv.org/pdf/2404.07514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01446v2","updated":"2024-04-11T06:58:18Z","published":"2024-04-01T19:33:41Z","title":"Finding Regions of Interest in Whole Slide Images Using Multiple\n Instance Learning","summary":" Whole Slide Images (WSI), obtained by high-resolution digital scanning of\nmicroscope slides at multiple scales, are the cornerstone of modern Digital\nPathology. However, they represent a particular challenge to\nAI-based/AI-mediated analysis because pathology labeling is typically done at\nslide-level, instead of tile-level. It is not just that medical diagnostics is\nrecorded at the specimen level, the detection of oncogene mutation is also\nexperimentally obtained, and recorded by initiatives like The Cancer Genome\nAtlas (TCGA), at the slide level. This configures a dual challenge: a)\naccurately predicting the overall cancer phenotype and b) finding out what\ncellular morphologies are associated with it at the tile level. To address\nthese challenges, a weakly supervised Multiple Instance Learning (MIL) approach\nwas explored for two prevalent cancer types, Invasive Breast Carcinoma\n(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was\nexplored for tumor detection at low magnification levels and TP53 mutations at\nvarious levels. 
Our results show that a novel additive implementation of MIL\nmatched the performance of reference implementation (AUC 0.96), and was only\nslightly outperformed by Attention MIL (AUC 0.97). More interestingly from the\nperspective of the molecular pathologist, these different AI architectures\nidentify distinct sensitivities to morphological features (through the\ndetection of Regions of Interest, RoI) at different amplification levels.\nTellingly, TP53 mutation was most sensitive to features at the higher\napplications where cellular morphology is resolved.\n","authors":["Martim Afonso","Praphulla M. S. Bhawsar","Monjoy Saha","Jonas S. Almeida","Arlindo L. Oliveira"],"pdf_url":"https://arxiv.org/pdf/2404.01446v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07507v1","updated":"2024-04-11T06:55:44Z","published":"2024-04-11T06:55:44Z","title":"Learning to Classify New Foods Incrementally Via Compressed Exemplars","summary":" Food image classification systems play a crucial role in health monitoring\nand diet tracking through image-based dietary assessment techniques. However,\nexisting food recognition systems rely on static datasets characterized by a\npre-defined fixed number of food classes. This contrasts drastically with the\nreality of food consumption, which features constantly changing data.\nTherefore, food image classification systems should adapt to and manage data\nthat continuously evolves. This is where continual learning plays an important\nrole. A challenge in continual learning is catastrophic forgetting, where ML\nmodels tend to discard old knowledge upon learning new information. While\nmemory-replay algorithms have shown promise in mitigating this problem by\nstoring old data as exemplars, they are hampered by the limited capacity of\nmemory buffers, leading to an imbalance between new and previously learned\ndata. To address this, our work explores the use of neural image compression to\nextend buffer size and enhance data diversity. We introduced the concept of\ncontinuously learning a neural compression model to adaptively improve the\nquality of compressed data and optimize the bitrates per pixel (bpp) to store\nmore exemplars. Our extensive experiments, including evaluations on\nfood-specific datasets including Food-101 and VFN-74, as well as the general\ndataset ImageNet-100, demonstrate improvements in classification accuracy. This\nprogress is pivotal in advancing more realistic food recognition systems that\nare capable of adapting to continually evolving data. Moreover, the principles\nand methodologies we've developed hold promise for broader applications,\nextending their benefits to other domains of continual machine learning\nsystems.\n","authors":["Justin Yang","Zhihao Duan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.07507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15430v2","updated":"2024-04-11T06:40:12Z","published":"2024-02-23T16:50:07Z","title":"Hierarchical Invariance for Robust and Interpretable Vision Tasks at\n Larger Scales","summary":" Developing robust and interpretable vision systems is a crucial step towards\ntrustworthy artificial intelligence. In this regard, a promising paradigm\nconsiders embedding task-required invariant structures, e.g., geometric\ninvariance, in the fundamental image representation. However, such invariant\nrepresentations typically exhibit limited discriminability, limiting their\napplications in larger-scale trustworthy vision tasks. 
For this open problem,\nwe conduct a systematic investigation of hierarchical invariance, exploring\nthis topic from theoretical, practical, and application perspectives. At the\ntheoretical level, we show how to construct over-complete invariants with a\nConvolutional Neural Networks (CNN)-like hierarchical architecture yet in a\nfully interpretable manner. The general blueprint, specific definitions,\ninvariant properties, and numerical implementations are provided. At the\npractical level, we discuss how to customize this theoretical framework into a\ngiven task. With the over-completeness, discriminative features w.r.t. the task\ncan be adaptively formed in a Neural Architecture Search (NAS)-like manner. We\ndemonstrate the above arguments with accuracy, invariance, and efficiency\nresults on texture, digit, and parasite classification experiments.\nFurthermore, at the application level, our representations are explored in\nreal-world forensics tasks on adversarial perturbations and Artificial\nIntelligence Generated Content (AIGC). Such applications reveal that the\nproposed strategy not only realizes the theoretically promised invariance, but\nalso exhibits competitive discriminability even in the era of deep learning.\nFor robust and interpretable vision tasks at larger scales, hierarchical\ninvariant representation can be considered as an effective alternative to\ntraditional CNN and invariants.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Zhihua Xia","Xiaochun Cao","Jian Weng"],"pdf_url":"https://arxiv.org/pdf/2402.15430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07504v1","updated":"2024-04-11T06:39:53Z","published":"2024-04-11T06:39:53Z","title":"Mitigating Object Dependencies: Improving Point Cloud Self-Supervised\n Learning through Object Exchange","summary":" In the realm of point cloud scene understanding, particularly in indoor\nscenes, objects are arranged following human habits, resulting in objects of\ncertain semantics being closely positioned and displaying notable inter-object\ncorrelations. This can create a tendency for neural networks to exploit these\nstrong dependencies, bypassing the individual object patterns. To address this\nchallenge, we introduce a novel self-supervised learning (SSL) strategy. Our\napproach leverages both object patterns and contextual cues to produce robust\nfeatures. It begins with the formulation of an object-exchanging strategy,\nwhere pairs of objects with comparable sizes are exchanged across different\nscenes, effectively disentangling the strong contextual dependencies.\nSubsequently, we introduce a context-aware feature learning strategy, which\nencodes object patterns without relying on their specific context by\naggregating object features across various scenes. Our extensive experiments\ndemonstrate the superiority of our method over existing SSL techniques, further\nshowing its better robustness to environmental changes. 
Moreover, we showcase\nthe applicability of our approach by transferring pre-trained models to diverse\npoint cloud datasets.\n","authors":["Yanhao Wu","Tong Zhang","Wei Ke","Congpei Qiu","Sabine Susstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2404.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v3","updated":"2024-04-11T06:25:41Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07868v2","updated":"2024-04-11T06:21:29Z","published":"2023-01-19T03:42:56Z","title":"MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval","summary":" State-of-the-art video-text retrieval (VTR) methods typically involve fully\nfine-tuning a pre-trained model (e.g. CLIP) on specific datasets. However, this\ncan result in significant storage costs in practical applications as a separate\nmodel per task must be stored. To address this issue, we present our pioneering\nwork that enables parameter-efficient VTR using a pre-trained model, with only\na small number of tunable parameters during training. Towards this goal, we\npropose a new method dubbed Multimodal Video Adapter (MV-Adapter) for\nefficiently transferring the knowledge in the pre-trained CLIP from image-text\nto video-text. Specifically, MV-Adapter utilizes bottleneck structures in both\nvideo and text branches, along with two novel components. The first is a\nTemporal Adaptation Module that is incorporated in the video branch to\nintroduce global and local temporal contexts. We also train weights\ncalibrations to adjust to dynamic variations across frames. 
The second is Cross\nModality Tying that generates weights for video/text branches through sharing\ncross modality factors, for better aligning between modalities. Thanks to above\ninnovations, MV-Adapter can achieve comparable or better performance than\nstandard full fine-tuning with negligible parameters overhead. Notably,\nMV-Adapter consistently outperforms various competing methods in V2T/T2V tasks\nwith large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC,\nDiDemo, and ActivityNet).\n","authors":["Xiaojie Jin","Bowen Zhang","Weibo Gong","Kai Xu","XueQing Deng","Peng Wang","Zhao Zhang","Xiaohui Shen","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2301.07868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07495v1","updated":"2024-04-11T06:06:56Z","published":"2024-04-11T06:06:56Z","title":"PillarTrack: Redesigning Pillar-based Transformer Network for Single\n Object Tracking on Point Clouds","summary":" LiDAR-based 3D single object tracking (3D SOT) is a critical issue in\nrobotics and autonomous driving. It aims to obtain accurate 3D BBox from the\nsearch area based on similarity or motion. However, existing 3D SOT methods\nusually follow the point-based pipeline, where the sampling operation\ninevitably leads to redundant or lost information, resulting in unexpected\nperformance. To address these issues, we propose PillarTrack, a pillar-based 3D\nsingle object tracking framework. Firstly, we transform sparse point clouds\ninto dense pillars to preserve the local and global geometrics. Secondly, we\nintroduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to\nhelp the feature representation of each pillar. Thirdly, we present an\nefficient Transformer-based backbone from the perspective of modality\ndifferences. Finally, we construct our PillarTrack tracker based above designs.\nExtensive experiments on the KITTI and nuScenes dataset demonstrate the\nsuperiority of our proposed method. Notably, our method achieves\nstate-of-the-art performance on the KITTI and nuScenes dataset and enables\nreal-time tracking speed. We hope our work could encourage the community to\nrethink existing 3D SOT tracker designs.We will open source our code to the\nresearch community in https://github.com/StiphyJay/PillarTrack.\n","authors":["Weisheng Xu","Sifan Zhou","Zhihang Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.07495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07487v1","updated":"2024-04-11T05:51:06Z","published":"2024-04-11T05:51:06Z","title":"Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton\n Action Recognition","summary":" Skeleton-based zero-shot action recognition aims to recognize unknown human\nactions based on the learned priors of the known skeleton-based actions and a\nsemantic descriptor space shared by both known and unknown categories. However,\nprevious works focus on establishing the bridges between the known skeleton\nrepresentation space and semantic descriptions space at the coarse-grained\nlevel for recognizing unknown action categories, ignoring the fine-grained\nalignment of these two spaces, resulting in suboptimal performance in\ndistinguishing high-similarity action categories. 
To address these challenges,\nwe propose a novel method via Side information and dual-prompts learning for\nskeleton-based zero-shot action recognition (STAR) at the fine-grained level.\nSpecifically, 1) we decompose the skeleton into several parts based on its\ntopology structure and introduce the side information concerning multi-part\ndescriptions of human body movements for alignment between the skeleton and the\nsemantic space at the fine-grained level; 2) we design the visual-attribute and\nsemantic-part prompts to improve the intra-class compactness within the\nskeleton space and inter-class separability within the semantic space,\nrespectively, to distinguish the high-similarity actions. Extensive experiments\nshow that our method achieves state-of-the-art performance in ZSL and GZSL\nsettings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.\n","authors":["Yang Chen","Jingcai Guo","Tian He","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07487v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.00644v3","updated":"2024-04-11T05:48:36Z","published":"2024-03-01T16:25:17Z","title":"Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks","summary":" Diffusion models trained on large-scale datasets have achieved remarkable\nprogress in image synthesis. However, due to the randomness in the diffusion\nprocess, they often struggle with handling diverse low-level tasks that require\ndetails preservation. To overcome this limitation, we present a new Diff-Plugin\nframework to enable a single pre-trained diffusion model to generate\nhigh-fidelity results across a variety of low-level tasks. Specifically, we\nfirst propose a lightweight Task-Plugin module with a dual branch design to\nprovide task-specific priors, guiding the diffusion process in preserving image\ncontent. We then propose a Plugin-Selector that can automatically select\ndifferent Task-Plugins based on the text instruction, allowing users to edit\nimages by indicating multiple low-level tasks with natural language. We conduct\nextensive experiments on 8 low-level vision tasks. The results demonstrate the\nsuperiority of Diff-Plugin over existing methods, particularly in real-world\nscenarios. Our ablations further validate that Diff-Plugin is stable,\nschedulable, and supports robust training across different dataset sizes.\n","authors":["Yuhao Liu","Zhanghan Ke","Fang Liu","Nanxuan Zhao","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.00644v3.pdf","comment":"Accepted to CVPR2024. Replaced some celebrity images to avoid\n copyright disputes"},{"id":"http://arxiv.org/abs/2404.06351v2","updated":"2024-04-11T05:17:44Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. 
Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00511v3","updated":"2024-04-11T05:14:35Z","published":"2024-03-31T01:16:02Z","title":"MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in\n Conversations with Multimodal Language Models","summary":" This paper presents our winning submission to Subtask 2 of SemEval 2024 Task\n3 on multimodal emotion cause analysis in conversations. We propose a novel\nMultimodal Emotion Recognition and Multimodal Emotion Cause Extraction\n(MER-MCE) framework that integrates text, audio, and visual modalities using\nspecialized emotion encoders. Our approach sets itself apart from\ntop-performing teams by leveraging modality-specific features for enhanced\nemotion understanding and causality inference. Experimental evaluation\ndemonstrates the advantages of our multimodal approach, with our submission\nachieving a competitive weighted F1 score of 0.3435, ranking third with a\nmargin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team.\nProject: https://github.com/MIPS-COLT/MER-MCE.git\n","authors":["Zebang Cheng","Fuqiang Niu","Yuxiang Lin","Zhi-Qi Cheng","Bowen Zhang","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00511v3.pdf","comment":"Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st &\n 2nd by 0.0339 & 0.0025"},{"id":"http://arxiv.org/abs/2404.07474v1","updated":"2024-04-11T04:58:18Z","published":"2024-04-11T04:58:18Z","title":"G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images","summary":" Novel view synthesis aims to generate new view images of a given view image\ncollection. Recent attempts address this problem relying on 3D geometry priors\n(e.g., shapes, sizes, and positions) learned from multi-view images. However,\nsuch methods encounter the following limitations: 1) they require a set of\nmulti-view images as training data for a specific scene (e.g., face, car or\nchair), which is often unavailable in many real-world scenarios; 2) they fail\nto extract the geometry priors from single-view images due to the lack of\nmulti-view supervision. In this paper, we propose a Geometry-enhanced NeRF\n(G-NeRF), which seeks to enhance the geometry priors by a geometry-guided\nmulti-view synthesis approach, followed by a depth-aware training. 
In the\nsynthesis process, inspired that existing 3D GAN models can unconditionally\nsynthesize high-fidelity multi-view images, we seek to adopt off-the-shelf 3D\nGAN models, such as EG3D, as a free source to provide geometry priors through\nsynthesizing multi-view data. Simultaneously, to further improve the geometry\nquality of the synthetic data, we introduce a truncation method to effectively\nsample latent codes within 3D GAN models. To tackle the absence of multi-view\nsupervision for single-view images, we design the depth-aware training\napproach, incorporating a depth-aware discriminator to guide geometry priors\nthrough depth maps. Experiments demonstrate the effectiveness of our method in\nterms of both qualitative and quantitative results.\n","authors":["Zixiong Huang","Qi Chen","Libo Sun","Yifan Yang","Naizhou Wang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07474v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.07473v1","updated":"2024-04-11T04:54:42Z","published":"2024-04-11T04:54:42Z","title":"LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image\n Segmentation","summary":" In this study, the performance of existing U-shaped neural network\narchitectures was enhanced for medical image segmentation by adding\nTransformer. Although Transformer architectures are powerful at extracting\nglobal information, its ability to capture local information is limited due to\nits high complexity. To address this challenge, we proposed a new lightweight\nU-shaped cascade fusion network (LUCF-Net) for medical image segmentation. It\nutilized an asymmetrical structural design and incorporated both local and\nglobal modules to enhance its capacity for local and global modeling.\nAdditionally, a multi-layer cascade fusion decoding network was designed to\nfurther bolster the network's information fusion capabilities. Validation\nresults achieved on multi-organ datasets in CT format, cardiac segmentation\ndatasets in MRI format, and dermatology datasets in image format demonstrated\nthat the proposed model outperformed other state-of-the-art methods in handling\nlocal-global information, achieving an improvement of 1.54% in Dice coefficient\nand 2.6 mm in Hausdorff distance on multi-organ segmentation. Furthermore, as a\nnetwork that combines Convolutional Neural Network and Transformer\narchitectures, it achieves competitive segmentation performance with only 6.93\nmillion parameters and 6.6 gigabytes of floating point operations, without the\nneed of pre-training. In summary, the proposed method demonstrated enhanced\nperformance while retaining a simpler model design compared to other\nTransformer-based segmentation networks.\n","authors":["Songkai Sun","Qingshan She","Yuliang Ma","Rihui Li","Yingchun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06845v2","updated":"2024-04-11T04:17:13Z","published":"2024-03-11T16:03:35Z","title":"DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video\n Generation","summary":" World models have demonstrated superiority in autonomous driving,\nparticularly in the generation of multi-view driving videos. However,\nsignificant challenges still exist in generating customized driving videos. In\nthis paper, we propose DriveDreamer-2, which builds upon the framework of\nDriveDreamer and incorporates a Large Language Model (LLM) to generate\nuser-defined driving videos. 
Specifically, an LLM interface is initially\nincorporated to convert a user's query into agent trajectories. Subsequently, a\nHDMap, adhering to traffic regulations, is generated based on the trajectories.\nUltimately, we propose the Unified Multi-View Model to enhance temporal and\nspatial coherence in the generated driving videos. DriveDreamer-2 is the first\nworld model to generate customized driving videos, it can generate uncommon\ndriving videos (e.g., vehicles abruptly cut in) in a user-friendly manner.\nBesides, experimental results demonstrate that the generated videos enhance the\ntraining of driving perception methods (e.g., 3D detection and tracking).\nFurthermore, video generation quality of DriveDreamer-2 surpasses other\nstate-of-the-art methods, showcasing FID and FVD scores of 11.2 and 55.7,\nrepresenting relative improvements of 30% and 50%.\n","authors":["Guosheng Zhao","Xiaofeng Wang","Zheng Zhu","Xinze Chen","Guan Huang","Xiaoyi Bao","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06845v2.pdf","comment":"Project Page: https://drivedreamer2.github.io"},{"id":"http://arxiv.org/abs/2404.07467v1","updated":"2024-04-11T04:14:48Z","published":"2024-04-11T04:14:48Z","title":"Trashbusters: Deep Learning Approach for Litter Detection and Tracking","summary":" The illegal disposal of trash is a major public health and environmental\nconcern. Disposing of trash in unplanned places poses serious health and\nenvironmental risks. We should try to restrict public trash cans as much as\npossible. This research focuses on automating the penalization of litterbugs,\naddressing the persistent problem of littering in public places. Traditional\napproaches relying on manual intervention and witness reporting suffer from\ndelays, inaccuracies, and anonymity issues. To overcome these challenges, this\npaper proposes a fully automated system that utilizes surveillance cameras and\nadvanced computer vision algorithms for litter detection, object tracking, and\nface recognition. The system accurately identifies and tracks individuals\nengaged in littering activities, attaches their identities through face\nrecognition, and enables efficient enforcement of anti-littering policies. By\nreducing reliance on manual intervention, minimizing human error, and providing\nprompt identification, the proposed system offers significant advantages in\naddressing littering incidents. The primary contribution of this research lies\nin the implementation of the proposed system, leveraging advanced technologies\nto enhance surveillance operations and automate the penalization of litterbugs.\n","authors":["Kashish Jain","Manthan Juthani","Jash Jain","Anant V. Nimkar"],"pdf_url":"https://arxiv.org/pdf/2404.07467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10974v4","updated":"2024-04-11T04:14:33Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. 
To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v4.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2402.16994v2","updated":"2024-04-11T03:44:49Z","published":"2024-02-26T20:00:57Z","title":"GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis","summary":" We introduce GEM3D -- a new deep, topology-aware generative model of 3D\nshapes. The key ingredient of our method is a neural skeleton-based\nrepresentation encoding information on both shape topology and geometry.\nThrough a denoising diffusion probabilistic model, our method first generates\nskeleton-based representations following the Medial Axis Transform (MAT), then\ngenerates surfaces through a skeleton-driven neural implicit formulation. The\nneural implicit takes into account the topological and geometric information\nstored in the generated skeleton representations to yield surfaces that are\nmore topologically and geometrically accurate compared to previous neural field\nformulations. We discuss applications of our method in shape synthesis and\npoint cloud reconstruction tasks, and evaluate our method both qualitatively\nand quantitatively. We demonstrate significantly more faithful surface\nreconstruction and diverse shape generation results compared to the\nstate-of-the-art, also involving challenging scenarios of reconstructing and\nsynthesizing structurally complex, high-genus shape surfaces from Thingi10K and\nShapeNet.\n","authors":["Dmitry Petrov","Pradyumn Goyal","Vikas Thamizharasan","Vladimir G. Kim","Matheus Gadelha","Melinos Averkiou","Siddhartha Chaudhuri","Evangelos Kalogerakis"],"pdf_url":"https://arxiv.org/pdf/2402.16994v2.pdf","comment":"Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to\n SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to\n sigconf; rearranged figures for readability; added missing citations; fixed\n misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote\n discussion; added categories averages to Tab. 8; added Tab. 
10 with model\n capacities"},{"id":"http://arxiv.org/abs/2404.07449v1","updated":"2024-04-11T03:09:34Z","published":"2024-04-11T03:09:34Z","title":"Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs","summary":" Integration of Large Language Models (LLMs) into visual domain tasks,\nresulting in visual-LLMs (V-LLMs), has enabled exceptional performance in\nvision-language tasks, particularly for visual question answering (VQA).\nHowever, existing V-LLMs (e.g. BLIP-2, LLaVA) demonstrate weak spatial\nreasoning and localization awareness. Despite generating highly descriptive and\nelaborate textual answers, these models fail at simple tasks like\ndistinguishing a left vs right location. In this work, we explore how\nimage-space coordinate based instruction fine-tuning objectives could inject\nspatial awareness into V-LLMs. We discover optimal coordinate representations,\ndata-efficient instruction fine-tuning objectives, and pseudo-data generation\nstrategies that lead to improved spatial awareness in V-LLMs. Additionally, our\nresulting model improves VQA across image and video domains, reduces undesired\nhallucination, and generates better contextual object descriptions. Experiments\nacross 5 vision-language tasks involving 14 different datasets establish the\nclear performance improvements achieved by our proposed framework.\n","authors":["Kanchana Ranasinghe","Satya Narayan Shukla","Omid Poursaeed","Michael S. Ryoo","Tsung-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.07449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07448v1","updated":"2024-04-11T03:08:53Z","published":"2024-04-11T03:08:53Z","title":"Transferable and Principled Efficiency for Open-Vocabulary Segmentation","summary":" Recent success of pre-trained foundation vision-language models makes\nOpen-Vocabulary Segmentation (OVS) possible. Despite the promising performance,\nthis approach introduces heavy computational overheads for two challenges: 1)\nlarge model sizes of the backbone; 2) expensive costs during the fine-tuning.\nThese challenges hinder this OVS strategy from being widely applicable and\naffordable in real-world scenarios. Although traditional methods such as model\ncompression and efficient fine-tuning can address these challenges, they often\nrely on heuristics. This means that their solutions cannot be easily\ntransferred and necessitate re-training on different models, which comes at a\ncost. In the context of efficient OVS, we target achieving performance that is\ncomparable to or even better than prior OVS works based on large\nvision-language foundation models, by utilizing smaller models that incur lower\ntraining costs. The core strategy is to make our efficiency principled and thus\nseamlessly transferable from one OVS framework to others without further\ncustomization. Comprehensive experiments on diverse OVS benchmarks demonstrate\nour superior trade-off between segmentation accuracy and computation costs over\nprevious works. 
Our code is available on https://github.com/Xujxyang/OpenTrans\n","authors":["Jingxuan Xu","Wuyang Chen","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.07448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16923v2","updated":"2024-04-11T03:01:41Z","published":"2024-01-30T11:46:27Z","title":"Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation","summary":" Integrating information from multiple modalities enhances the robustness of\nscene perception systems in autonomous vehicles, providing a more comprehensive\nand reliable sensory framework. However, the modality incompleteness in\nmulti-modal segmentation remains under-explored. In this work, we establish a\ntask called Modality-Incomplete Scene Segmentation (MISS), which encompasses\nboth system-level modality absence and sensor-level modality errors. To avoid\nthe predominant modality reliance in multi-modal fusion, we introduce a\nMissing-aware Modal Switch (MMS) strategy to proactively manage missing\nmodalities during training. Utilizing bit-level batch-wise sampling enhances\nthe model's performance in both complete and incomplete testing scenarios.\nFurthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate\nrepresentative spectral information into a limited number of learnable prompts\nthat maintain robustness against all MISS scenarios. Akin to fine-tuning\neffects but with fewer tunable parameters (1.1%). Extensive experiments prove\nthe efficacy of our proposed approach, showcasing an improvement of 5.84% mIoU\nover the prior state-of-the-art parameter-efficient methods in modality\nmissing. The source code is publicly available at\nhttps://github.com/RuipingL/MISS.\n","authors":["Ruiping Liu","Jiaming Zhang","Kunyu Peng","Yufan Chen","Ke Cao","Junwei Zheng","M. Saquib Sarfraz","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2401.16923v2.pdf","comment":"Accepted to IEEE IV 2024. The source code is publicly available at\n https://github.com/RuipingL/MISS"},{"id":"http://arxiv.org/abs/2404.07445v1","updated":"2024-04-11T03:00:00Z","published":"2024-04-11T03:00:00Z","title":"Multi-view Aggregation Network for Dichotomous Image Segmentation","summary":" Dichotomous Image Segmentation (DIS) has recently emerged towards\nhigh-precision object segmentation from high-resolution natural images.\n When designing an effective DIS model, the main challenge is how to balance\nthe semantic dispersion of high-resolution targets in the small receptive field\nand the loss of high-precision details in the large receptive field. Existing\nmethods rely on tedious multiple encoder-decoder streams and stages to\ngradually complete the global localization and local refinement.\n Human visual system captures regions of interest by observing them from\nmultiple views. Inspired by it, we model DIS as a multi-view object perception\nproblem and provide a parsimonious multi-view aggregation network (MVANet),\nwhich unifies the feature fusion of the distant view and close-up view into a\nsingle stream with one encoder-decoder structure. 
With the help of the proposed\nmulti-view complementary localization and refinement modules, our approach\nestablished long-range, profound visual interactions across multiple views,\nallowing the features of the detailed close-up view to focus on highly slender\nstructures.Experiments on the popular DIS-5K dataset show that our MVANet\nsignificantly outperforms state-of-the-art methods in both accuracy and speed.\nThe source code and datasets will be publicly available at\n\\href{https://github.com/qianyu-dlut/MVANet}{MVANet}.\n","authors":["Qian Yu","Xiaoqi Zhao","Youwei Pang","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.07445v1.pdf","comment":"Accepted by CVPR2024 as Highlight"},{"id":"http://arxiv.org/abs/2310.14576v2","updated":"2024-04-11T02:57:21Z","published":"2023-10-23T05:25:49Z","title":"Tensor Decomposition Based Attention Module for Spiking Neural Networks","summary":" The attention mechanism has been proven to be an effective way to improve\nspiking neural network (SNN). However, based on the fact that the current SNN\ninput data flow is split into tensors to process on GPUs, none of the previous\nworks consider the properties of tensors to implement an attention module. This\ninspires us to rethink current SNN from the perspective of tensor-relevant\ntheories. Using tensor decomposition, we design the \\textit{projected full\nattention} (PFA) module, which demonstrates excellent results with linearly\ngrowing parameters. Specifically, PFA is composed by the \\textit{linear\nprojection of spike tensor} (LPST) module and \\textit{attention map composing}\n(AMC) module. In LPST, we start by compressing the original spike tensor into\nthree projected tensors using a single property-preserving strategy with\nlearnable parameters for each dimension. Then, in AMC, we exploit the inverse\nprocedure of the tensor decomposition process to combine the three tensors into\nthe attention map using a so-called connecting factor. To validate the\neffectiveness of the proposed PFA module, we integrate it into the widely used\nVGG and ResNet architectures for classification tasks. Our method achieves\nstate-of-the-art performance on both static and dynamic benchmark datasets,\nsurpassing the existing SNN models with Transformer-based and CNN-based\nbackbones.\n","authors":["Haoyu Deng","Ruijie Zhu","Xuerui Qiu","Yule Duan","Malu Zhang","Liangjian Deng"],"pdf_url":"https://arxiv.org/pdf/2310.14576v2.pdf","comment":"Accepted by Knowledge-Based Systems"},{"id":"http://arxiv.org/abs/2403.17920v2","updated":"2024-04-11T02:42:59Z","published":"2024-03-26T17:55:11Z","title":"TC4D: Trajectory-Conditioned Text-to-4D Generation","summary":" Recent techniques for text-to-4D generation synthesize dynamic 3D scenes\nusing supervision from pre-trained text-to-video models. However, existing\nrepresentations for motion, such as deformation models or time-dependent neural\nrepresentations, are limited in the amount of motion they can generate-they\ncannot synthesize motion extending far beyond the bounding box used for volume\nrendering. The lack of a more flexible motion model contributes to the gap in\nrealism between 4D generation methods and recent, near-photorealistic video\ngeneration models. Here, we propose TC4D: trajectory-conditioned text-to-4D\ngeneration, which factors motion into global and local components. We represent\nthe global motion of a scene's bounding box using rigid transformation along a\ntrajectory parameterized by a spline. 
We learn local deformations that conform\nto the global trajectory using supervision from a text-to-video model. Our\napproach enables the synthesis of scenes animated along arbitrary trajectories,\ncompositional scene generation, and significant improvements to the realism and\namount of generated motion, which we evaluate qualitatively and through a user\nstudy. Video results can be viewed on our website:\nhttps://sherwinbahmani.github.io/tc4d.\n","authors":["Sherwin Bahmani","Xian Liu","Yifan Wang","Ivan Skorokhodov","Victor Rong","Ziwei Liu","Xihui Liu","Jeong Joon Park","Sergey Tulyakov","Gordon Wetzstein","Andrea Tagliasacchi","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2403.17920v2.pdf","comment":"Project Page: https://sherwinbahmani.github.io/tc4d"},{"id":"http://arxiv.org/abs/2404.07435v1","updated":"2024-04-11T02:29:08Z","published":"2024-04-11T02:29:08Z","title":"Encoding Urban Ecologies: Automated Building Archetype Generation\n through Self-Supervised Learning for Energy Modeling","summary":" As the global population and urbanization expand, the building sector has\nemerged as the predominant energy consumer and carbon emission contributor. The\nneed for innovative Urban Building Energy Modeling grows, yet existing building\narchetypes often fail to capture the unique attributes of local buildings and\nthe nuanced distinctions between different cities, jeopardizing the precision\nof energy modeling. This paper presents an alternative tool employing\nself-supervised learning to distill complex geometric data into representative,\nlocale-specific archetypes. This study attempts to foster a new paradigm of\ninteraction with built environments, incorporating local parameters to conduct\nbespoke energy simulations at the community level. The catered archetypes can\naugment the precision and applicability of energy consumption modeling at\ndifferent scales across diverse building inventories. This tool provides a\npotential solution that encourages the exploration of emerging local ecologies.\nBy integrating building envelope characteristics and cultural granularity into\nthe building archetype generation process, we seek a future where architecture\nand urban design are intricately interwoven with the energy sector in shaping\nour built environments.\n","authors":["Xinwei Zhuang","Zixun Huang","Wentao Zeng","Luisa Caldas"],"pdf_url":"https://arxiv.org/pdf/2404.07435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10853v4","updated":"2024-04-11T01:56:38Z","published":"2023-07-20T13:16:10Z","title":"Exploring Effective Priors and Efficient Models for Weakly-Supervised\n Change Detection","summary":" Weakly-supervised change detection (WSCD) aims to detect pixel-level changes\nwith only image-level annotations. Owing to its label efficiency, WSCD is\ndrawing increasing attention recently. However, current WSCD methods often\nencounter the challenge of change missing and fabricating, i.e., the\ninconsistency between image-level annotations and pixel-level predictions.\nSpecifically, change missing refer to the situation that the WSCD model fails\nto predict any changed pixels, even though the image-level label indicates\nchanged, and vice versa for change fabricating. 
To address this challenge, in\nthis work, we leverage global-scale and local-scale priors in WSCD and propose\ntwo components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.\nThe DP decoder decodes samples with the changed image-level label, skips\nsamples with the unchanged label, and replaces them with an all-unchanged\npixel-level label. The LG constraint is derived from the correspondence between\nchanged representations and image-level labels, penalizing the model when it\nmispredicts the change status. Additionally, we develop TransWCD, a simple yet\npowerful transformer-based model, showcasing the potential of weakly-supervised\nlearning in change detection. By integrating the DP decoder and LG constraint\ninto TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL\nachieve significant +6.33% and +9.55% F1 score improvements over the\nstate-of-the-art methods on the WHU-CD dataset, respectively. Some performance\nmetrics even exceed several fully-supervised change detection (FSCD)\ncompetitors. Code will be available at\nhttps://github.com/zhenghuizhao/TransWCD.\n","authors":["Zhenghui Zhao","Lixiang Ru","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10853v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07424v1","updated":"2024-04-11T01:33:45Z","published":"2024-04-11T01:33:45Z","title":"CopilotCAD: Empowering Radiologists with Report Completion Models and\n Quantitative Evidence from Medical Image Foundation Models","summary":" Computer-aided diagnosis systems hold great promise to aid radiologists and\nclinicians in radiological clinical practice and enhance diagnostic accuracy\nand efficiency. However, the conventional systems primarily focus on delivering\ndiagnostic results through text report generation or medical image\nclassification, positioning them as standalone decision-makers rather than\nhelpers and ignoring radiologists' expertise. This study introduces an\ninnovative paradigm to create an assistive co-pilot system for empowering\nradiologists by leveraging Large Language Models (LLMs) and medical image\nanalysis tools. Specifically, we develop a collaborative framework to integrate\nLLMs and quantitative medical image analysis results generated by foundation\nmodels with radiologists in the loop, achieving efficient and safe generation\nof radiology reports and effective utilization of computational power of AI and\nthe expertise of medical professionals. This approach empowers radiologists to\ngenerate more precise and detailed diagnostic reports, enhancing patient\noutcomes while reducing the burnout of clinicians. Our methodology underscores\nthe potential of AI as a supportive tool in medical diagnostics, promoting a\nharmonious integration of technology and human expertise to advance the field\nof radiology.\n","authors":["Sheng Wang","Tianming Du","Katherine Fischer","Gregory E Tasian","Justin Ziemba","Joanie M Garratt","Hersh Sagreiya","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2404.07424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07410v1","updated":"2024-04-11T00:49:38Z","published":"2024-04-11T00:49:38Z","title":"Improving Shift Invariance in Convolutional Neural Networks with\n Translation Invariant Polyphase Sampling","summary":" Downsampling operators break the shift invariance of convolutional neural\nnetworks (CNNs) and this affects the robustness of features learned by CNNs\nwhen dealing with even small pixel-level shift. 
Through a large-scale\ncorrelation analysis framework, we study shift invariance of CNNs by inspecting\nexisting downsampling operators in terms of their maximum-sampling bias (MSB),\nand find that MSB is negatively correlated with shift invariance. Based on this\ncrucial insight, we propose a learnable pooling operator called Translation\nInvariant Polyphase Sampling (TIPS) and two regularizations on the intermediate\nfeature maps of TIPS to reduce MSB and learn translation-invariant\nrepresentations. TIPS can be integrated into any CNN and can be trained\nend-to-end with marginal computational overhead. Our experiments demonstrate\nthat TIPS results in consistent performance gains in terms of accuracy, shift\nconsistency, and shift fidelity on multiple benchmarks for image classification\nand semantic segmentation compared to previous methods and also leads to\nimprovements in adversarial and distributional robustness. TIPS results in the\nlowest MSB compared to all previous methods, thus explaining our strong\nempirical results.\n","authors":["Sourajit Saha","Tejas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2404.07410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07405v1","updated":"2024-04-11T00:45:10Z","published":"2024-04-11T00:45:10Z","title":"Simplifying Two-Stage Detectors for On-Device Inference in Remote\n Sensing","summary":" Deep learning has been successfully applied to object detection from remotely\nsensed images. Images are typically processed on the ground rather than\non-board due to the computation power of the ground system. Such offloaded\nprocessing causes delays in acquiring target mission information, which hinders\nits application to real-time use cases. For on-device object detection,\nresearches have been conducted on designing efficient detectors or model\ncompression to reduce inference latency. However, highly accurate two-stage\ndetectors still need further exploitation for acceleration. In this paper, we\npropose a model simplification method for two-stage object detectors. Instead\nof constructing a general feature pyramid, we utilize only one feature\nextraction in the two-stage detector. To compensate for the accuracy drop, we\napply a high pass filter to the RPN's score map. Our approach is applicable to\nany two-stage detector using a feature pyramid network. In the experiments with\nstate-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet,\nour method reduced computation costs upto 61.2% with the accuracy loss within\n2.1% on the DOTAv1.5 dataset. Source code will be released.\n","authors":["Jaemin Kang","Hoeseok Yang","Hyungshin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10300v4","updated":"2024-04-11T00:35:04Z","published":"2023-05-17T15:37:47Z","title":"One-Prompt to Segment All Medical Images","summary":" Large foundation models, known for their strong zero-shot generalization,\nhave excelled in visual and language applications. However, applying them to\nmedical image segmentation, a domain with diverse imaging types and target\nlabels, remains an open challenge. Current approaches, such as adapting\ninteractive segmentation models like Segment Anything Model (SAM), require user\nprompts for each sample during inference. Alternatively, transfer learning\nmethods like few/one-shot models demand labeled samples, leading to high costs.\nThis paper introduces a new paradigm toward the universal medical image\nsegmentation, termed 'One-Prompt Segmentation.' 
One-Prompt Segmentation\ncombines the strengths of one-shot and interactive methods. In the inference\nstage, with just \\textbf{one prompted sample}, it can adeptly handle the unseen\ntask in a single forward pass. We train One-Prompt Model on 64 open-source\nmedical datasets, accompanied by the collection of over 3,000 clinician-labeled\nprompts. Tested on 14 previously unseen datasets, the One-Prompt Model\nshowcases superior zero-shot segmentation capabilities, outperforming a wide\nrange of related methods. The code and data is released as\n\\url{https://github.com/KidsWithTokens/one-prompt}.\n","authors":["Junde Wu","Jiayuan Zhu","Yuanpei Liu","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2305.10300v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.12620"},{"id":"http://arxiv.org/abs/2404.07399v1","updated":"2024-04-11T00:23:28Z","published":"2024-04-11T00:23:28Z","title":"Post-hurricane building damage assessment using street-view imagery and\n structured data: A multi-modal deep learning approach","summary":" Accurately assessing building damage is critical for disaster response and\nrecovery. However, many existing models for detecting building damage have poor\nprediction accuracy due to their limited capabilities of identifying detailed,\ncomprehensive structural and/or non-structural damage from the street-view\nimage. Additionally, these models mainly rely on the imagery data for damage\nclassification, failing to account for other critical information, such as wind\nspeed, building characteristics, evacuation zones, and distance of the building\nto the hurricane track. To address these limitations, in this study, we propose\na novel multi-modal (i.e., imagery and structured data) approach for\npost-hurricane building damage classification, named the Multi-Modal Swin\nTransformer (MMST). We empirically train and evaluate the proposed MMST using\ndata collected from the 2022 Hurricane Ian in Florida, USA. Results show that\nMMST outperforms all selected state-of-the-art benchmark models and can achieve\nan accuracy of 92.67%, which are 7.71% improvement in accuracy compared to\nVisual Geometry Group 16 (VGG-16). In addition to the street-view imagery data,\nbuilding value, building age, and wind speed are the most important predictors\nfor damage level classification. The proposed MMST can be deployed to assist in\nrapid damage assessment and guide reconnaissance efforts in future hurricanes.\n","authors":["Zhuoqun Xue","Xiaojian Zhang","David O. Prevatt","Jennifer Bridge","Susu Xu","Xilei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.07399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07395v1","updated":"2024-04-11T00:02:57Z","published":"2024-04-11T00:02:57Z","title":"Global versus Local: Evaluating AlexNet Architectures for Tropical\n Cyclone Intensity Estimation","summary":" Given the destructive impacts of tropical cyclones, it is critical to have a\nreliable system for cyclone intensity detection. Various techniques are\navailable for this purpose, each with differing levels of accuracy. In this\npaper, we introduce two ensemble-based models based on AlexNet architecture to\nestimate tropical cyclone intensity using visible satellite images. The first\nmodel, trained on the entire dataset, is called the global AlexNet model. 
The\nsecond model is a distributed version of AlexNet in which multiple AlexNets are\ntrained separately on subsets of the training data categorized according to the\nSaffir-Simpson wind speed scale prescribed by the meterologists. We evaluated\nthe performance of both models against a deep learning benchmark model called\n\\textit{Deepti} using a publicly available cyclone image dataset. Results\nindicate that both the global model (with a root mean square error (RMSE) of\n9.03 knots) and the distributed model (with a RMSE of 9.3 knots) outperform the\nbenchmark model (with a RMSE of 13.62 knots). We provide a thorough discussion\nof our solution approach, including an explanantion of the AlexNet's\nperformance using gradient class activation maps (grad-CAM). Our proposed\nsolution strategy allows future experimentation with various deep learning\nmodels in both single and multi-channel settings.\n","authors":["Vikas Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.07395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04526v2","updated":"2024-04-11T23:50:32Z","published":"2023-08-08T18:41:38Z","title":"Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours\n Maps","summary":" In this work, we describe a method for large-scale 3D cell-tracking through a\nsegmentation selection approach. The proposed method is effective at tracking\ncells across large microscopy datasets on two fronts: (i) It can solve problems\ncontaining millions of segmentation instances in terabyte-scale 3D+t datasets;\n(ii) It achieves competitive results with or without deep learning, which\nrequires 3D annotated data, that is scarce in the fluorescence microscopy\nfield. The proposed method computes cell tracks and segments using a hierarchy\nof segmentation hypotheses and selects disjoint segments by maximizing the\noverlap between adjacent frames. We show that this method achieves\nstate-of-the-art results in 3D images from the cell tracking challenge and has\na faster integer linear programming formulation. Moreover, our framework is\nflexible and supports segmentations from off-the-shelf cell segmentation models\nand can combine them into an ensemble that improves tracking. The code is\navailable https://github.com/royerlab/ultrack.\n","authors":["Jordão Bragantini","Merlin Lange","Loïc Royer"],"pdf_url":"https://arxiv.org/pdf/2308.04526v2.pdf","comment":"13 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.16400v2","updated":"2024-04-11T23:38:06Z","published":"2024-03-25T03:30:37Z","title":"ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D\n Pose Estimation","summary":" In medical and industrial domains, providing guidance for assembly processes\nis critical to ensure efficiency and safety. Errors in assembly can lead to\nsignificant consequences such as extended surgery times, and prolonged\nmanufacturing or maintenance times in industry. Assembly scenarios can benefit\nfrom in-situ AR visualization to provide guidance, reduce assembly times and\nminimize errors. To enable in-situ visualization 6D pose estimation can be\nleveraged. Existing 6D pose estimation techniques primarily focus on individual\nobjects and static captures. However, assembly scenarios have various dynamics\nincluding occlusion during assembly and dynamics in the assembly objects\nappearance. 
Existing work, combining object detection/6D pose estimation and\nassembly state detection focuses either on pure deep learning-based approaches,\nor limit the assembly state detection to building blocks. To address the\nchallenges of 6D pose estimation in combination with assembly state detection,\nour approach ASDF builds upon the strengths of YOLOv8, a real-time capable\nobject detection framework. We extend this framework, refine the object pose\nand fuse pose knowledge with network-detected pose information. Utilizing our\nlate fusion in our Pose2State module results in refined 6D pose estimation and\nassembly state detection. By combining both pose and state information, our\nPose2State module predicts the final assembly state with precision. Our\nevaluation on our ASDF dataset shows that our Pose2State module leads to an\nimproved assembly state detection and that the improvement of the assembly\nstate further leads to a more robust 6D pose estimation. Moreover, on the GBOT\ndataset, we outperform the pure deep learning-based network, and even\noutperform the hybrid and pure tracking-based approaches.\n","authors":["Hannah Schieber","Shiyu Li","Niklas Corell","Philipp Beckerle","Julian Kreimeier","Daniel Roth"],"pdf_url":"https://arxiv.org/pdf/2403.16400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01492v2","updated":"2024-04-11T23:09:25Z","published":"2024-04-01T21:28:50Z","title":"Modality Translation for Object Detection Adaptation Without Forgetting\n Prior Knowledge","summary":" A common practice in deep learning consists of training large neural networks\non massive datasets to perform accurately for different domains and tasks.\nWhile this methodology may work well in numerous application areas, it only\napplies across modalities due to a larger distribution shift in data captured\nusing different sensors. This paper focuses on the problem of adapting a large\nobject detection model to one or multiple modalities while being efficient. To\ndo so, we propose ModTr as an alternative to the common approach of fine-tuning\nlarge models. ModTr consists of adapting the input with a small transformation\nnetwork trained to minimize the detection loss directly. The original model can\ntherefore work on the translated inputs without any further change or\nfine-tuning to its parameters. Experimental results on translating from IR to\nRGB images on two well-known datasets show that this simple ModTr approach\nprovides detectors that can perform comparably or better than the standard\nfine-tuning without forgetting the original knowledge. This opens the doors to\na more flexible and efficient service-based detection pipeline in which,\ninstead of using a different detector for each modality, a unique and unaltered\nserver is constantly running, where multiple modalities with the corresponding\ntranslations can query it. Code: https://github.com/heitorrapela/ModTr.\n","authors":["Heitor Rapela Medeiros","Masih Aminbeidokhti","Fidel Guerrero Pena","David Latortue","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.01492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12982v2","updated":"2024-04-11T22:47:39Z","published":"2023-10-19T17:59:56Z","title":"Putting the Object Back into Video Object Segmentation","summary":" We present Cutie, a video object segmentation (VOS) network with object-level\nmemory reading, which puts the object representation from memory back into the\nvideo object segmentation result. 
Recent works on VOS employ bottom-up\npixel-level memory reading which struggles due to matching noise, especially in\nthe presence of distractors, resulting in lower performance in more challenging\ndata. In contrast, Cutie performs top-down object-level memory reading by\nadapting a small set of object queries. Via those, it interacts with the\nbottom-up pixel features iteratively with a query-based object transformer (qt,\nhence Cutie). The object queries act as a high-level summary of the target\nobject, while high-resolution feature maps are retained for accurate\nsegmentation. Together with foreground-background masked attention, Cutie\ncleanly separates the semantics of the foreground object from the background.\nOn the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a\nsimilar running time and improves by 4.2 J&F over DeAOT while being three times\nfaster. Code is available at: https://hkchengrex.github.io/Cutie\n","authors":["Ho Kei Cheng","Seoung Wug Oh","Brian Price","Joon-Young Lee","Alexander Schwing"],"pdf_url":"https://arxiv.org/pdf/2310.12982v2.pdf","comment":"CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie"},{"id":"http://arxiv.org/abs/2307.15904v2","updated":"2024-04-11T22:39:15Z","published":"2023-07-29T06:23:51Z","title":"Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images","summary":" We propose a weakly supervised approach for creating maps using free-form\ntextual descriptions. We refer to this work of creating textual maps as\nzero-shot mapping. Prior works have approached mapping tasks by developing\nmodels that predict a fixed set of attributes using overhead imagery. However,\nthese models are very restrictive as they can only solve highly specific tasks\nfor which they were trained. Mapping text, on the other hand, allows us to\nsolve a large variety of mapping problems with minimal restrictions. To achieve\nthis, we train a contrastive learning framework called Sat2Cap on a new\nlarge-scale dataset with 6.1M pairs of overhead and ground-level images. For a\ngiven location and overhead image, our model predicts the expected CLIP\nembeddings of the ground-level scenery. The predicted CLIP embeddings are then\nused to learn about the textual space associated with that location. Sat2Cap is\nalso conditioned on date-time information, allowing it to model temporally\nvarying concepts over a location. Our experimental results demonstrate that our\nmodels successfully capture ground-level concepts and allow large-scale mapping\nof fine-grained textual queries. Our approach does not require any text-labeled\ndata, making the training easily scalable. The code, dataset, and models will\nbe made publicly available.\n","authors":["Aayush Dhakal","Adeel Ahmad","Subash Khanal","Srikumar Sastry","Hannah Kerner","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2307.15904v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.08135v1","updated":"2024-04-11T21:41:55Z","published":"2024-04-11T21:41:55Z","title":"SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning\n Iterations","summary":" Optical flow estimation is crucial to a variety of vision tasks. Despite\nsubstantial recent advancements, achieving real-time on-device optical flow\nestimation remains a complex challenge. First, an optical flow model must be\nsufficiently lightweight to meet computation and memory constraints to ensure\nreal-time performance on devices. 
Second, the necessity for real-time on-device\noperation imposes constraints that weaken the model's capacity to adequately\nhandle ambiguities in flow estimation, thereby intensifying the difficulty of\npreserving flow accuracy. This paper introduces two synergistic techniques,\nSelf-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to\nenhance the capabilities of optical flow models, with a focus on addressing\noptical flow regression ambiguities. These techniques prove particularly\neffective in mitigating error propagation, a prevalent issue in optical flow\nmodels that employ iterative refinement. Notably, these techniques add\nnegligible to zero overhead in model parameters and inference latency, thereby\npreserving real-time on-device efficiency. The effectiveness of our proposed\nSCI and RFL techniques, collectively referred to as SciFlow for brevity, is\ndemonstrated across two distinct lightweight optical flow model architectures\nin our experiments. Remarkably, SciFlow enables substantial reduction in error\nmetrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for\nin-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on\nthe Sintel and KITTI 2015 datasets, respectively.\n","authors":["Jamie Menjay Lin","Jisoo Jeong","Hong Cai","Risheek Garrepalli","Kai Wang","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2404.08135v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08127v1","updated":"2024-04-11T21:07:38Z","published":"2024-04-11T21:07:38Z","title":"Self-Supervised Learning of Color Constancy","summary":" Color constancy (CC) describes the ability of the visual system to perceive\nan object as having a relatively constant color despite changes in lighting\nconditions. While CC and its limitations have been carefully characterized in\nhumans, it is still unclear how the visual system acquires this ability during\ndevelopment. Here, we present a first study showing that CC develops in a\nneural network trained in a self-supervised manner through an invariance\nlearning objective. During learning, objects are presented under changing\nilluminations, while the network aims to map subsequent views of the same\nobject onto close-by latent representations. This gives rise to representations\nthat are largely invariant to the illumination conditions, offering a plausible\nexample of how CC could emerge during human cognitive development via a form of\nself-supervised learning.\n","authors":["Markus R. Ernst","Francisco M. López","Arthur Aubret","Roland W. Fleming","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2404.08127v1.pdf","comment":"7 pages, 5 figures, submitted to the IEEE International Conference on\n Development and Learning (ICDL 2024)"},{"id":"http://arxiv.org/abs/2404.08111v1","updated":"2024-04-11T20:25:26Z","published":"2024-04-11T20:25:26Z","title":"S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for\n Face Video Editing","summary":" Face attribute editing plays a pivotal role in various applications. However,\nexisting methods encounter challenges in achieving high-quality results while\npreserving identity, editing faithfulness, and temporal consistency. These\nchallenges are rooted in issues related to the training pipeline, including\nlimited supervision, architecture design, and optimization strategy. In this\nwork, we introduce S3Editor, a Sparse Semantic-disentangled Self-training\nframework for face video editing. 
S3Editor is a generic solution that\ncomprehensively addresses these challenges with three key contributions.\nFirstly, S3Editor adopts a self-training paradigm to enhance the training\nprocess through semi-supervision. Secondly, we propose a semantic disentangled\narchitecture with a dynamic routing mechanism that accommodates diverse editing\nrequirements. Thirdly, we present a structured sparse optimization schema that\nidentifies and deactivates malicious neurons to further disentangle impacts\nfrom untarget attributes. S3Editor is model-agnostic and compatible with\nvarious editing approaches. Our extensive qualitative and quantitative results\naffirm that our approach significantly enhances identity preservation, editing\nfidelity, as well as temporal consistency.\n","authors":["Guangzhi Wang","Tianyi Chen","Kamran Ghasedi","HsiangTao Wu","Tianyu Ding","Chris Nuesmeyer","Ilya Zharkov","Mohan Kankanhalli","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01001v2","updated":"2024-04-11T20:07:20Z","published":"2023-12-02T02:09:31Z","title":"Learning county from pixels: Corn yield prediction with\n attention-weighted multiple instance learning","summary":" Remote sensing technology has become a promising tool in yield prediction.\nMost prior work employs satellite imagery for county-level corn yield\nprediction by spatially aggregating all pixels within a county into a single\nvalue, potentially overlooking the detailed information and valuable insights\noffered by more granular data. To this end, this research examines each county\nat the pixel level and applies multiple instance learning to leverage detailed\ninformation within a county. In addition, our method addresses the \"mixed\npixel\" issue caused by the inconsistent resolution between feature datasets and\ncrop mask, which may introduce noise into the model and therefore hinder\naccurate yield prediction. Specifically, the attention mechanism is employed to\nautomatically assign weights to different pixels, which can mitigate the\ninfluence of mixed pixels. The experimental results show that the developed\nmodel outperforms four other machine learning models over the past five years\nin the U.S. corn belt and demonstrates its best performance in 2022, achieving\na coefficient of determination (R2) value of 0.84 and a root mean square error\n(RMSE) of 0.83. This paper demonstrates the advantages of our approach from\nboth spatial and temporal perspectives. Furthermore, through an in-depth study\nof the relationship between mixed pixels and attention, it is verified that our\napproach can capture critical feature information while filtering out noise\nfrom mixed pixels.\n","authors":["Xiaoyu Wang","Yuchi Ma","Qunying Huang","Zhengwei Yang","Zhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01001v2.pdf","comment":"I am writing to request the withdrawal of my paper submitted to\n arXiv. Upon further review, I have identified an error in the paper that\n significantly affects the results and conclusions. 
To maintain the integrity\n of the scientific record and prevent the dissemination of incorrect\n information, I believe it is necessary to withdraw the paper from the archive"},{"id":"http://arxiv.org/abs/2307.03798v2","updated":"2024-04-11T19:24:50Z","published":"2023-07-07T18:54:11Z","title":"Fooling Contrastive Language-Image Pre-trained Models with\n CLIPMasterPrints","summary":" Models leveraging both visual and textual data such as Contrastive\nLanguage-Image Pre-training (CLIP), are the backbone of many recent advances in\nartificial intelligence. In this work, we show that despite their versatility,\nsuch models are vulnerable to what we refer to as fooling master images.\nFooling master images are capable of maximizing the confidence score of a CLIP\nmodel for a significant number of widely varying prompts, while being either\nunrecognizable or unrelated to the attacked prompts for humans. The existence\nof such images is problematic as it could be used by bad actors to maliciously\ninterfere with CLIP-trained image retrieval models in production with\ncomparably small effort as a single image can attack many different prompts. We\ndemonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined\nusing stochastic gradient descent, projected gradient descent, or blackbox\noptimization. Contrary to many common adversarial attacks, the blackbox\noptimization approach allows us to mine CLIPMasterPrints even when the weights\nof the model are not accessible. We investigate the properties of the mined\nimages, and find that images trained on a small number of image captions\ngeneralize to a much larger number of semantically related captions. We\nevaluate possible mitigation strategies, where we increase the robustness of\nthe model and introduce an approach to automatically detect CLIPMasterPrints to\nsanitize the input of vulnerable models. Finally, we find that vulnerability to\nCLIPMasterPrints is related to a modality gap in contrastive pre-trained\nmulti-modal networks. Code available at\nhttps://github.com/matfrei/CLIPMasterPrints.\n","authors":["Matthias Freiberger","Peter Kun","Christian Igel","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2307.03798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13004v3","updated":"2024-04-11T19:22:41Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" Utilizing an abstract information processing model based on minimal yet\nrealistic assumptions inspired by biological systems, we study how to achieve\nthe early visual system's two ultimate objectives: efficient information\ntransmission and accurate sensor probability distribution modeling. We prove\nthat optimizing for information transmission does not guarantee optimal\nprobability distribution modeling in general. We illustrate, using a two-pixel\n(2D) system and image patches, that an efficient representation can be realized\nthrough a nonlinear population code driven by two types of biologically\nplausible loss functions that depend solely on output. After unsupervised\nlearning, our abstract information processing model bears remarkable\nresemblances to biological systems, despite not mimicking many features of real\nneurons, such as spiking activity. A preliminary comparison with a contemporary\ndeep learning model suggests that our model offers a significant efficiency\nadvantage. 
Our model provides novel insights into the computational theory of\nearly visual systems as well as a potential new approach to enhance the\nefficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08088v1","updated":"2024-04-11T19:06:36Z","published":"2024-04-11T19:06:36Z","title":"Visual Context-Aware Person Fall Detection","summary":" As the global population ages, the number of fall-related incidents is on the\nrise. Effective fall detection systems, specifically in healthcare sector, are\ncrucial to mitigate the risks associated with such events. This study evaluates\nthe role of visual context, including background objects, on the accuracy of\nfall detection classifiers. We present a segmentation pipeline to\nsemi-automatically separate individuals and objects in images. Well-established\nmodels like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and\nevaluated. During training, pixel-based transformations are applied to\nsegmented objects, and the models are then evaluated on raw images without\nsegmentation. Our findings highlight the significant influence of visual\ncontext on fall detection. The application of Gaussian blur to the image\nbackground notably improves the performance and generalization capabilities of\nall models. Background objects such as beds, chairs, or wheelchairs can\nchallenge fall detection systems, leading to false positive alarms. However, we\ndemonstrate that object-specific contextual transformations during training\neffectively mitigate this challenge. Further analysis using saliency maps\nsupports our observation that visual context is crucial in classification\ntasks. We create both dataset processing API and segmentation pipeline,\navailable at https://github.com/A-NGJ/image-segmentation-cli.\n","authors":["Aleksander Nagaj","Zenjie Li","Dim P. Papadopoulos","Kamal Nasrollahi"],"pdf_url":"https://arxiv.org/pdf/2404.08088v1.pdf","comment":"10 pages, 6 figures, KES IDT-24 conference"},{"id":"http://arxiv.org/abs/2404.03507v2","updated":"2024-04-11T18:54:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. 
Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17205v4","updated":"2024-04-11T18:48:04Z","published":"2023-12-28T18:40:31Z","title":"EFHQ: Multi-purpose ExtremePose-Face-HQ dataset","summary":" The existing facial datasets, while having plentiful images at near frontal\nviews, lack images with extreme head poses, leading to the downgraded\nperformance of deep learning models when dealing with profile or pitched faces.\nThis work aims to address this gap by introducing a novel dataset named Extreme\nPose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k\nhigh-quality images of faces at extreme poses. To produce such a massive\ndataset, we utilize a novel and meticulous dataset processing pipeline to\ncurate two publicly available datasets, VFHQ and CelebV-HQ, which contain many\nhigh-resolution face videos captured in various settings. Our dataset can\ncomplement existing datasets on various facial-related tasks, such as facial\nsynthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation,\nand face reenactment. Specifically, training with EFHQ helps models generalize\nwell across diverse poses, significantly improving performance in scenarios\ninvolving extreme views, confirmed by extensive experiments. Additionally, we\nutilize EFHQ to define a challenging cross-view face verification benchmark, in\nwhich the performance of SOTA face recognition models drops 5-37% compared to\nfrontal-to-frontal scenarios, aiming to stimulate studies on face recognition\nunder severe pose conditions in the wild.\n","authors":["Trung Tuan Dao","Duc Hong Vu","Cuong Pham","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.17205v4.pdf","comment":"Project Page: https://bomcon123456.github.io/efhq/"},{"id":"http://arxiv.org/abs/2404.08081v1","updated":"2024-04-11T18:42:14Z","published":"2024-04-11T18:42:14Z","title":"Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep\n Learning","summary":" Computer vision, particularly vehicle and pedestrian identification is\ncritical to the evolution of autonomous driving, artificial intelligence, and\nvideo surveillance. Current traffic monitoring systems confront major\ndifficulty in recognizing small objects and pedestrians effectively in\nreal-time, posing a serious risk to public safety and contributing to traffic\ninefficiency. Recognizing these difficulties, our project focuses on the\ncreation and validation of an advanced deep-learning framework capable of\nprocessing complex visual input for precise, real-time recognition of cars and\npeople in a variety of environmental situations. On a dataset representing\ncomplicated urban settings, we trained and evaluated different versions of the\nYOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most\neffective, especially in pedestrian recognition, with great precision and\nrobustness. The results, which include Mean Average Precision and recall rates,\ndemonstrate the model's ability to dramatically improve traffic monitoring and\nsafety. 
This study makes an important addition to real-time, reliable detection\nin computer vision, establishing new benchmarks for traffic management systems.\n","authors":["Md Nahid Sadik","Tahmim Hossain","Faisal Sayeed"],"pdf_url":"https://arxiv.org/pdf/2404.08081v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.08079v1","updated":"2024-04-11T18:34:29Z","published":"2024-04-11T18:34:29Z","title":"DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning\n Models","summary":" Recent advances in decentralized deep learning algorithms have demonstrated\ncutting-edge performance on various tasks with large pre-trained models.\nHowever, a pivotal prerequisite for achieving this level of competitiveness is\nthe significant communication and computation overheads when updating these\nmodels, which prohibits the applications of them to real-world scenarios. To\naddress this issue, drawing inspiration from advanced model merging techniques\nwithout requiring additional training, we introduce the Decentralized Iterative\nMerging-And-Training (DIMAT) paradigm--a novel decentralized deep learning\nframework. Within DIMAT, each agent is trained on their local data and\nperiodically merged with their neighboring agents using advanced model merging\ntechniques like activation matching until convergence is achieved. DIMAT\nprovably converges with the best available rate for nonconvex functions with\nvarious first-order methods, while yielding tighter error bounds compared to\nthe popular existing approaches. We conduct a comprehensive empirical analysis\nto validate DIMAT's superiority over baselines across diverse computer vision\ntasks sourced from multiple datasets. Empirical results validate our\ntheoretical claims by showing that DIMAT attains faster and higher initial gain\nin accuracy with independent and identically distributed (IID) and non-IID\ndata, incurring lower communication overhead. This DIMAT paradigm presents a\nnew opportunity for the future decentralized learning, enhancing its\nadaptability to real-world with sparse and light-weight communication and\ncomputation.\n","authors":["Nastaran Saadati","Minh Pham","Nasla Saleem","Joshua R. Waite","Aditya Balu","Zhanhong Jiang","Chinmay Hegde","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2404.08079v1.pdf","comment":"CVPR 2024 accepted paper, 22 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.02059v2","updated":"2024-04-11T18:29:01Z","published":"2024-04-02T15:58:36Z","title":"IISAN: Efficiently Adapting Multimodal Representation for Sequential\n Recommendation with Decoupled PEFT","summary":" Multimodal foundation models are transformative in sequential recommender\nsystems, leveraging powerful representation learning capabilities. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation\nmodels for recommendation tasks, most research prioritizes parameter\nefficiency, often overlooking critical factors like GPU memory efficiency and\ntraining speed. Addressing this gap, our paper introduces IISAN (Intra- and\nInter-modal Side Adapted Network for Multimodal Representation), a simple\nplug-and-play architecture using a Decoupled PEFT structure and exploiting both\nintra- and inter-modal adaptation.\n IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art\nPEFT. More importantly, it significantly reduces GPU memory usage - from 47GB\nto just 3GB for multimodal sequential recommendation tasks. 
Additionally, it\naccelerates training time per epoch from 443s to 22s compared to FFT. This is\nalso a notable improvement over the Adapter and LoRA, which require 37-39 GB\nGPU memory and 350-380 seconds per epoch for training.\n Furthermore, we propose a new composite efficiency metric, TPME\n(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the\nprevalent misconception that \"parameter efficiency represents overall\nefficiency\". TPME provides more comprehensive insights into practical\nefficiency comparisons between different methods. Besides, we give an\naccessible efficiency analysis of all PEFT and FFT approaches, which\ndemonstrate the superiority of IISAN. We release our codes and other materials\nat https://github.com/GAIR-Lab/IISAN.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Jie Wang","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2404.02059v2.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2404.08031v1","updated":"2024-04-11T17:59:52Z","published":"2024-04-11T17:59:52Z","title":"Latent Guard: a Safety Framework for Text-to-image Generation","summary":" With the ability to generate high-quality images, text-to-image (T2I) models\ncan be exploited for creating inappropriate content. To prevent misuse,\nexisting safety measures are either based on text blacklists, which can be\neasily circumvented, or harmful content classification, requiring large\ndatasets for training and offering low flexibility. Hence, we propose Latent\nGuard, a framework designed to improve safety measures in text-to-image\ngeneration. Inspired by blacklist-based approaches, Latent Guard learns a\nlatent space on top of the T2I model's text encoder, where it is possible to\ncheck the presence of harmful concepts in the input text embeddings. Our\nproposed framework is composed of a data generation pipeline specific to the\ntask using large language models, ad-hoc architectural components, and a\ncontrastive learning strategy to benefit from the generated data. The\neffectiveness of our method is verified on three datasets and against four\nbaselines. Code and data will be shared at\nhttps://github.com/rt219/LatentGuard.\n","authors":["Runtao Liu","Ashkan Khakzar","Jindong Gu","Qifeng Chen","Philip Torr","Fabio Pizzati"],"pdf_url":"https://arxiv.org/pdf/2404.08031v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.08030v1","updated":"2024-04-11T17:59:43Z","published":"2024-04-11T17:59:43Z","title":"Rethinking Artistic Copyright Infringements in the Era of Text-to-Image\n Generative Models","summary":" Recent text-to-image generative models such as Stable Diffusion are extremely\nadept at mimicking and generating copyrighted content, raising concerns amongst\nartists that their unique styles may be improperly copied. Understanding how\ngenerative models copy \"artistic style\" is more complex than duplicating a\nsingle image, as style is comprised by a set of elements (or signature) that\nfrequently co-occurs across a body of work, where each individual work may vary\nsignificantly. In our paper, we first reformulate the problem of \"artistic\ncopyright infringement\" to a classification problem over image sets, instead of\nprobing image-wise similarities. 
We then introduce ArtSavant, a practical\n(i.e., efficient and easy to understand) tool to (i) determine the unique style\nof an artist by comparing it to a reference dataset of works from 372 artists\ncurated from WikiArt, and (ii) recognize if the identified style reappears in\ngenerated images. We leverage two complementary methods to perform artistic\nstyle classification over image sets, including TagMatch, which is a novel\ninherently interpretable and attributable method, making it more suitable for\nbroader use by non-technical stakeholders (artists, lawyers, judges, etc).\nLeveraging ArtSavant, we then perform a large-scale empirical study to provide\nquantitative insight into the prevalence of artistic style copying across 3\npopular text-to-image generative models. Namely, amongst a dataset of prolific\nartists (including many famous ones), only 20% of them appear to have their\nstyles at risk of copying via simple prompting of today's popular\ntext-to-image generative models.\n","authors":["Mazda Moayeri","Samyadeep Basu","Sriram Balasubramanian","Priyatham Kattakinda","Atoosa Chengini","Robert Brauneis","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2404.08030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08027v1","updated":"2024-04-11T15:58:12Z","published":"2024-04-11T15:58:12Z","title":"SurvMamba: State Space Model with Multi-grained Multi-modal Interaction\n for Survival Prediction","summary":" Multi-modal learning that combines pathological images with genomic data has\nsignificantly enhanced the accuracy of survival prediction. Nevertheless,\nexisting methods have not fully utilized the inherent hierarchical structure\nwithin both whole slide images (WSIs) and transcriptomic data, from which\nbetter intra-modal representations and inter-modal integration could be\nderived. Moreover, many existing studies attempt to improve multi-modal\nrepresentations through attention mechanisms, which inevitably lead to high\ncomplexity when processing high-dimensional WSIs and transcriptomic data.\nRecently, a structured state space model named Mamba emerged as a promising\napproach for its superior performance in modeling long sequences with low\ncomplexity. In this study, we propose Mamba with multi-grained multi-modal\ninteraction (SurvMamba) for survival prediction. SurvMamba is implemented with\na Hierarchical Interaction Mamba (HIM) module that facilitates efficient\nintra-modal interactions at different granularities, thereby capturing more\ndetailed local features as well as rich global representations. In addition, an\nInteraction Fusion Mamba (IFM) module is used for cascaded inter-modal\ninteractive fusion, yielding more comprehensive features for survival\nprediction. Comprehensive evaluations on five TCGA datasets demonstrate that\nSurvMamba outperforms other existing methods in terms of performance and\ncomputational cost.\n","authors":["Ying Chen","Jiajing Xie","Yuxiang Lin","Yuhang Song","Wenxian Yang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08703v1","updated":"2024-04-11T05:06:51Z","published":"2024-04-11T05:06:51Z","title":"Synthetic Brain Images: Bridging the Gap in Brain Mapping With\n Generative Adversarial Model","summary":" Magnetic Resonance Imaging (MRI) is a vital modality for gaining precise\nanatomical information, and it plays a significant role in medical imaging for\ndiagnosis and therapy planning. 
Image synthesis problems have seen a revolution\nin recent years due to the introduction of deep learning techniques,\nspecifically Generative Adversarial Networks (GANs). This work investigates the\nuse of Deep Convolutional Generative Adversarial Networks (DCGAN) for producing\nhigh-fidelity and realistic MRI image slices. The suggested approach uses a\ndataset with a variety of brain MRI scans to train a DCGAN architecture. While\nthe discriminator network discerns between created and real slices, the\ngenerator network learns to synthesise realistic MRI image slices. The\ngenerator refines its capacity to generate slices that closely mimic real MRI\ndata through an adversarial training approach. The outcomes demonstrate that\nthe DCGAN holds promise for a range of uses in medical imaging research, as they\nshow that it can effectively produce MRI image slices when trained for a\nsufficient number of epochs. This work adds to the expanding corpus of research\non the application of deep learning techniques for medical image synthesis. The\nslices that can be produced have the capability to enhance datasets and\nprovide data augmentation in the training of deep learning models; in addition, a\nnumber of functions are made available to ease MRI data cleaning, along with\nthree ready-to-use, clean datasets covering the major anatomical planes.\n","authors":["Drici Mourad","Kazeem Oluwakemi Oseni"],"pdf_url":"https://arxiv.org/pdf/2404.08703v1.pdf","comment":null}]},"2024-04-12T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.08640v1","updated":"2024-04-12T17:59:47Z","published":"2024-04-12T17:59:47Z","title":"EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams","summary":" Monocular egocentric 3D human motion capture is a challenging and actively\nresearched problem. Existing methods use synchronously operating visual sensors\n(e.g. RGB cameras) and often fail under low lighting and fast motions, which\ncan be restrictive in many applications involving head-mounted devices. In\nresponse to the existing limitations, this paper 1) introduces a new problem,\ni.e., 3D human motion capture from an egocentric monocular event camera with a\nfisheye lens, and 2) proposes the first approach to it called EventEgo3D\n(EE3D). Event streams have high temporal resolution and provide reliable cues\nfor 3D human motion capture under high-speed human motions and rapidly changing\nillumination. The proposed EE3D framework is specifically tailored for learning\nwith event streams in the LNES representation, enabling high 3D reconstruction\naccuracy. We also design a prototype of a mobile head-mounted device with an\nevent camera and record a real dataset with event observations and the\nground-truth 3D human poses (in addition to the synthetic dataset). 
Our EE3D\ndemonstrates robustness and superior 3D accuracy compared to existing solutions\nacross various challenging experiments while supporting real-time 3D pose\nupdate rates of 140Hz.\n","authors":["Christen Millerdurai","Hiroyasu Akada","Jian Wang","Diogo Luvizon","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2404.08640v1.pdf","comment":"14 pages, 11 figures and 6 tables; project page:\n https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.08639v1","updated":"2024-04-12T17:59:40Z","published":"2024-04-12T17:59:40Z","title":"COCONut: Modernizing COCO Segmentation","summary":" In recent decades, the vision community has witnessed remarkable progress in\nvisual recognition, partially owing to advancements in dataset benchmarks.\nNotably, the established COCO benchmark has propelled the development of modern\ndetection and segmentation systems. However, the COCO segmentation benchmark\nhas seen comparatively slow improvement over the last decade. Originally\nequipped with coarse polygon annotations for thing instances, it gradually\nincorporated coarse superpixel annotations for stuff regions, which were\nsubsequently heuristically amalgamated to yield panoptic segmentation\nannotations. These annotations, executed by different groups of raters, have\nresulted not only in coarse segmentation masks but also in inconsistencies\nbetween segmentation types. In this study, we undertake a comprehensive\nreevaluation of the COCO segmentation annotations. By enhancing the annotation\nquality and expanding the dataset to encompass 383K images with more than 5.18M\npanoptic masks, we introduce COCONut, the COCO Next Universal segmenTation\ndataset. COCONut harmonizes segmentation annotations across semantic, instance,\nand panoptic segmentation with meticulously crafted high-quality masks, and\nestablishes a robust benchmark for all segmentation tasks. To our knowledge,\nCOCONut stands as the inaugural large-scale universal segmentation dataset,\nverified by human raters. We anticipate that the release of COCONut will\nsignificantly contribute to the community's ability to assess the progress of\nnovel neural networks.\n","authors":["Xueqing Deng","Qihang Yu","Peng Wang","Xiaohui Shen","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08639v1.pdf","comment":"Accepted at CVPR2024, data available at\n https://xdeng7.github.io/coconut.github.io/"},{"id":"http://arxiv.org/abs/2404.08636v1","updated":"2024-04-12T17:58:04Z","published":"2024-04-12T17:58:04Z","title":"Probing the 3D Awareness of Visual Foundation Models","summary":" Recent advances in large-scale pretraining have yielded visual foundation\nmodels with strong capabilities. Not only can recent models generalize to\narbitrary images for their training task, their intermediate representations\nare useful for other visual tasks such as detection and segmentation. Given\nthat such models can classify, delineate, and localize objects in 2D, we ask\nwhether they also represent their 3D structure? In this work, we analyze the 3D\nawareness of visual foundation models. We posit that 3D awareness implies that\nrepresentations (1) encode the 3D structure of the scene and (2) consistently\nrepresent the surface across views. We conduct a series of experiments using\ntask-specific probes and zero-shot inference procedures on frozen features. Our\nexperiments reveal several limitations of the current models. 
Our code and\nanalysis can be found at https://github.com/mbanani/probe3d.\n","authors":["Mohamed El Banani","Amit Raj","Kevis-Kokitsi Maninis","Abhishek Kar","Yuanzhen Li","Michael Rubinstein","Deqing Sun","Leonidas Guibas","Justin Johnson","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.08636v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://github.com/mbanani/probe3d"},{"id":"http://arxiv.org/abs/2403.15388v4","updated":"2024-04-12T17:34:29Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v4.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2404.08611v1","updated":"2024-04-12T17:20:57Z","published":"2024-04-12T17:20:57Z","title":"Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin\n Lymphoma Patients Using a Longitudinally-Aware Segmentation Network","summary":" $\\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET\nscans for lymphoma patients has proven challenging, as residual disease in\ninterim-therapy scans is often subtle and difficult to detect. Our goal was to\ndevelop a longitudinally-aware segmentation network (LAS-Net) that can quantify\nserial PET/CT images for pediatric Hodgkin lymphoma patients.\n$\\textbf{Materials and Methods}$: This retrospective study included baseline\n(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two\nChildren's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net\nincorporates longitudinal cross-attention, allowing relevant features from PET1\nto inform the analysis of PET2. Model performance was evaluated using Dice\ncoefficients for PET1 and detection F1 scores for PET2. 
Additionally, we\nextracted and compared quantitative PET metrics, including metabolic tumor\nvolume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and\n$\\Delta$SUVmax in PET2, against physician measurements. We quantified their\nagreement using Spearman's $\\rho$ correlations and employed bootstrap\nresampling for statistical analysis. $\\textbf{Results}$: LAS-Net detected\nresidual lymphoma in PET2 with an F1 score of 0.606 (precision/recall:\n0.615/0.600), outperforming all comparator methods (P<0.01). For baseline\nsegmentation, LAS-Net achieved a mean Dice score of 0.772. In PET\nquantification, LAS-Net's measurements of qPET, $\\Delta$SUVmax, MTV and TLG\nwere strongly correlated with physician measurements, with Spearman's $\\rho$ of\n0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a\nslight decrease, in an external testing cohort. $\\textbf{Conclusion}$: LAS-Net\nachieved high performance in quantifying PET metrics across serial scans,\nhighlighting the value of longitudinal awareness in evaluating multi-time-point\nimaging datasets.\n","authors":["Xin Tie","Muheon Shin","Changhee Lee","Scott B. Perlman","Zachary Huemann","Amy J. Weisman","Sharon M. Castellino","Kara M. Kelly","Kathleen M. McCarten","Adina L. Alazraki","Junjie Hu","Steve Y. Cho","Tyler J. Bradshaw"],"pdf_url":"https://arxiv.org/pdf/2404.08611v1.pdf","comment":"6 figures, 4 tables in the main text"},{"id":"http://arxiv.org/abs/2310.16073v3","updated":"2024-04-12T17:04:15Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. To address these challenges, we propose FloCoDe:\nFlow-aware Temporal Consistency and Correlation Debiasing with uncertainty\nattenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping\nusing flow to detect temporally consistent objects across frames. To address\nthe long-tail issue of visual relationships, we propose correlation debiasing\nand a label correlation-based loss to learn unbiased relation representations\nfor long-tailed classes. Specifically, we propose to incorporate label\ncorrelations using contrastive loss to capture commonly co-occurring relations,\nwhich aids in learning robust representations for long-tailed classes. Further,\nwe adopt the uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. 
Extensive experimental evaluation shows a\nperformance gain as high as 4.1%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v3.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.08603v1","updated":"2024-04-12T17:02:56Z","published":"2024-04-12T17:02:56Z","title":"Training-free Boost for Open-Vocabulary Object Detection with Confidence\n Aggregation","summary":" Open-vocabulary object detection (OVOD) aims at localizing and recognizing\nvisual objects from novel classes unseen at the training time. Whereas,\nempirical studies reveal that advanced detectors generally assign lower scores\nto those novel instances, which are inadvertently suppressed during inference\nby commonly adopted greedy strategies like Non-Maximum Suppression (NMS),\nleading to sub-optimal detection performance for novel classes. This paper\nsystematically investigates this problem with the commonly-adopted two-stage\nOVOD paradigm. Specifically, in the region-proposal stage, proposals that\ncontain novel instances showcase lower objectness scores, since they are\ntreated as background proposals during the training phase. Meanwhile, in the\nobject-classification stage, novel objects share lower region-text similarities\n(i.e., classification scores) due to the biased visual-language alignment by\nseen training samples. To alleviate this problem, this paper introduces two\nadvanced measures to adjust confidence scores and conserve erroneously\ndismissed objects: (1) a class-agnostic localization quality estimate via\noverlap degree of region/object proposals, and (2) a text-guided visual\nsimilarity estimate with proxy prototypes for novel classes. Integrated with\nadjusting techniques specifically designed for the region-proposal and\nobject-classification stages, this paper derives the aggregated confidence\nestimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet\nis a generic and training-free post-processing scheme, which consistently\nbolsters open-vocabulary detectors across model scales and architecture\ndesigns. For instance, AggDet receives 3.3% and 1.5% gains on OV-COCO and\nOV-LVIS benchmarks respectively, without any training cost.\n","authors":["Yanhao Zheng","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07520v2","updated":"2024-04-12T17:01:04Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. 
The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33% in overall performance, by 1% in base-to-novel generalization, and by\n2.84% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v2.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2312.03884v2","updated":"2024-04-12T16:47:05Z","published":"2023-12-06T20:22:32Z","title":"WonderJourney: Going from Anywhere to Everywhere","summary":" We introduce WonderJourney, a modularized framework for perpetual 3D scene\ngeneration. Unlike prior work on view generation that focuses on a single type\nof scenes, we start at any user-provided location (by a text description or an\nimage) and generate a journey through a long sequence of diverse yet coherently\nconnected 3D scenes. We leverage an LLM to generate textual descriptions of the\nscenes in this journey, a text-driven point cloud generation pipeline to make a\ncompelling and coherent sequence of 3D scenes, and a large VLM to verify the\ngenerated scenes. We show compelling, diverse visual results across various\nscene types and styles, forming imaginary \"wonderjourneys\". Project website:\nhttps://kovenyu.com/WonderJourney/\n","authors":["Hong-Xing Yu","Haoyi Duan","Junhwa Hur","Kyle Sargent","Michael Rubinstein","William T. Freeman","Forrester Cole","Deqing Sun","Noah Snavely","Jiajun Wu","Charles Herrmann"],"pdf_url":"https://arxiv.org/pdf/2312.03884v2.pdf","comment":"Project website with video results:\n https://kovenyu.com/WonderJourney/"},{"id":"http://arxiv.org/abs/2404.08590v1","updated":"2024-04-12T16:38:48Z","published":"2024-04-12T16:38:48Z","title":"Improving Referring Image Segmentation using Vision-Aware Text Features","summary":" Referring image segmentation is a challenging task that involves generating\npixel-wise segmentation masks based on natural language descriptions. Existing\nmethods have relied mostly on visual features to generate the segmentation\nmasks while treating text features as supporting components. This over-reliance\non visual features can lead to suboptimal results, especially in complex\nscenarios where text prompts are ambiguous or context-dependent. To overcome\nthese challenges, we present a novel framework VATEX to improve referring image\nsegmentation by enhancing object and context understanding with Vision-Aware\nText Feature. 
Our method involves using CLIP to derive a CLIP Prior that\nintegrates an object-centric visual heatmap with text description, which can be\nused as the initial query in DETR-based architecture for the segmentation task.\nFurthermore, by observing that there are multiple ways to describe an instance\nin an image, we enforce feature similarity between text variations referring to\nthe same visual input by two components: a novel Contextual Multimodal Decoder\nthat turns text embeddings into vision-aware text features, and a Meaning\nConsistency Constraint to ensure further the coherent and consistent\ninterpretation of language expressions with the context understanding obtained\nfrom the image. Our method achieves a significant performance improvement on\nthree benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Code is available at:\nhttps://nero1342.github.io/VATEX\\_RIS.\n","authors":["Hai Nguyen-Truong","E-Ro Nguyen","Tuan-Anh Vu","Minh-Triet Tran","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.08590v1.pdf","comment":"30 pages including supplementary"},{"id":"http://arxiv.org/abs/2401.01448v2","updated":"2024-04-12T16:37:46Z","published":"2024-01-02T22:15:20Z","title":"ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label\n Visual Classification","summary":" Multi-label image classification presents a challenging task in many domains,\nincluding computer vision and medical imaging. Recent advancements have\nintroduced graph-based and transformer-based methods to improve performance and\ncapture label dependencies. However, these methods often include complex\nmodules that entail heavy computation and lack interpretability. In this paper,\nwe propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel\nframework to address these challenges in multi-label image classification\ntasks. Our simple yet effective approach employs supervised contrastive\nlearning, in which samples that share enough labels with an anchor image based\non a decision threshold are introduced as a positive set. This structure\ncaptures label dependencies by pulling positive pair embeddings together and\npushing away negative samples that fall below the threshold. We enhance\nrepresentation learning by incorporating a mixture density network into\ncontrastive learning and generating Gaussian mixture distributions to explore\nthe epistemic uncertainty of the feature encoder. We validate the effectiveness\nof our framework through experimentation with datasets from the computer vision\nand medical imaging domains. Our method outperforms the existing\nstate-of-the-art methods while achieving a low computational footprint on both\ndatasets. Visualization analyses also demonstrate that ProbMCL-learned\nclassifiers maintain a meaningful semantic topology.\n","authors":["Ahmad Sajedi","Samir Khaki","Yuri A. Lawryshyn","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2401.01448v2.pdf","comment":"This paper has been accepted for the ICASSP 2024 - 2024 IEEE\n International Conference on Acoustics, Speech and Signal Processing (ICASSP)"},{"id":"http://arxiv.org/abs/2404.08589v1","updated":"2024-04-12T16:35:23Z","published":"2024-04-12T16:35:23Z","title":"Enhancing Visual Question Answering through Question-Driven Image\n Captions as Prompts","summary":" Visual question answering (VQA) is known as an AI-complete task as it\nrequires understanding, reasoning, and inferring about the vision and the\nlanguage content. 
Over the past few years, numerous neural architectures have\nbeen suggested for the VQA problem. However, achieving success in zero-shot VQA\nremains a challenge due to its requirement for advanced generalization and\nreasoning skills. This study explores the impact of incorporating image\ncaptioning as an intermediary process within the VQA pipeline. Specifically, we\nexplore the efficacy of utilizing image captions instead of images and\nleveraging large language models (LLMs) to establish a zero-shot setting. Since\nimage captioning is the most crucial step in this process, we compare the\nimpact of state-of-the-art image captioning models on VQA performance across\nvarious question types in terms of structure and semantics. We propose a\nstraightforward and efficient question-driven image captioning approach within\nthis pipeline to transfer contextual information into the question-answering\n(QA) model. This method involves extracting keywords from the question,\ngenerating a caption for each image-question pair using the keywords, and\nincorporating the question-driven caption into the LLM prompt. We evaluate the\nefficacy of using general-purpose and question-driven image captions in the VQA\npipeline. Our study highlights the potential of employing image captions and\nharnessing the capabilities of LLMs to achieve competitive performance on GQA\nunder the zero-shot setting. Our code is available at\n\\url{https://github.com/ovguyo/captions-in-VQA}.\n","authors":["Övgü Özdemir","Erdem Akagündüz"],"pdf_url":"https://arxiv.org/pdf/2404.08589v1.pdf","comment":"The paper has been accepted for presentation at CVPR 2024 Workshop on\n Prompting in Vision"},{"id":"http://arxiv.org/abs/2404.08585v1","updated":"2024-04-12T16:30:15Z","published":"2024-04-12T16:30:15Z","title":"Advanced wood species identification based on multiple anatomical\n sections and using deep feature transfer and fusion","summary":" In recent years, we have seen many advancements in wood species\nidentification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy,\nand Direct Analysis in Real Time (DART) mass spectrometry complement the\nlong-established wood anatomical assessment of cell and tissue morphology.\nHowever, most of these methods have some limitations such as high costs, the\nneed for skilled experts for data interpretation, and the lack of good datasets\nfor professional reference. Therefore, most of these methods, and certainly the\nwood anatomical assessment, may benefit from tools based on Artificial\nIntelligence. In this paper, we apply two transfer learning techniques with\nConvolutional Neural Networks (CNNs) to a multi-view Congolese wood species\ndataset including sections from different orientations and viewed at different\nmicroscopic magnifications. We explore two feature extraction methods in\ndetail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated\nDeep Activation Maps (RADAM), for efficient and accurate wood species\nidentification. Our results indicate superior accuracy on diverse datasets and\nanatomical sections, surpassing the results of other methods. Our proposal\nrepresents a significant advancement in wood species identification, offering a\nrobust tool to support the conservation of forest ecosystems and promote\nsustainable forestry practices.\n","authors":["Kallil M. Zielinski","Leonardo Scabini","Lucas C. Ribas","Núbia R. da Silva","Hans Beeckman","Jan Verwaeren","Odemir M. 
Bruno","Bernard De Baets"],"pdf_url":"https://arxiv.org/pdf/2404.08585v1.pdf","comment":"33 pages, 7 tables, 9 figures"},{"id":"http://arxiv.org/abs/2404.08584v1","updated":"2024-04-12T16:29:49Z","published":"2024-04-12T16:29:49Z","title":"Pathological Primitive Segmentation Based on Visual Foundation Model\n with Zero-Shot Mask Generation","summary":" Medical image processing usually requires a model trained with carefully\ncrafted datasets due to unique image characteristics and domain-specific\nchallenges, especially in pathology. Primitive detection and segmentation in\ndigitized tissue samples are essential for objective and automated diagnosis\nand prognosis of cancer. SAM (Segment Anything Model) has recently been\ndeveloped to segment general objects from natural images with high accuracy,\nbut it requires human prompts to generate masks. In this work, we present a\nnovel approach that adapts pre-trained natural image encoders of SAM for\ndetection-based region proposals. Regions proposed by a pre-trained encoder are\nsent to cascaded feature propagation layers for projection. Then, local\nsemantic and global context is aggregated from multi-scale for bounding box\nlocalization and classification. Finally, the SAM decoder uses the identified\nbounding boxes as essential prompts to generate a comprehensive primitive\nsegmentation map. The entire base framework, SAM, requires no additional\ntraining or fine-tuning but could produce an end-to-end result for two\nfundamental segmentation tasks in pathology. Our method compares with\nstate-of-the-art models in F1 score for nuclei detection and binary/multiclass\npanoptic(bPQ/mPQ) and mask quality(dice) for segmentation quality on the\nPanNuke dataset while offering end-to-end efficiency. Our model also achieves\nremarkable Average Precision (+4.5%) on the secondary dataset (HuBMAP Kidney)\ncompared to Faster RCNN. The code is publicly available at\nhttps://github.com/learner-codec/autoprom_sam.\n","authors":["Abu Bakor Hayat Arnob","Xiangxue Wang","Yiping Jiao","Xiao Gan","Wenlong Ming","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2404.08584v1.pdf","comment":"2024 IEEE International Symposium on Biomedical Imaging"},{"id":"http://arxiv.org/abs/2404.08582v1","updated":"2024-04-12T16:28:30Z","published":"2024-04-12T16:28:30Z","title":"FashionFail: Addressing Failure Cases in Fashion Object Detection and\n Segmentation","summary":" In the realm of fashion object detection and segmentation for online shopping\nimages, existing state-of-the-art fashion parsing models encounter limitations,\nparticularly when exposed to non-model-worn apparel and close-up shots. To\naddress these failures, we introduce FashionFail; a new fashion dataset with\ne-commerce images for object detection and segmentation. The dataset is\nefficiently curated using our novel annotation tool that leverages recent\nfoundation models. The primary objective of FashionFail is to serve as a test\nbed for evaluating the robustness of models. Our analysis reveals the\nshortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer.\nAdditionally, we propose a baseline approach using naive data augmentation to\nmitigate common failure cases and improve model robustness. Through this work,\nwe aim to inspire and support further research in fashion item detection and\nsegmentation for industrial applications. 
The dataset, annotation tool, code,\nand models are available at \\url{https://rizavelioglu.github.io/fashionfail/}.\n","authors":["Riza Velioglu","Robin Chan","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2404.08582v1.pdf","comment":"to be published in 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2404.08580v1","updated":"2024-04-12T16:23:42Z","published":"2024-04-12T16:23:42Z","title":"Lossy Image Compression with Foundation Diffusion Models","summary":" Incorporating diffusion models in the image compression domain has the\npotential to produce realistic and detailed reconstructions, especially at\nextremely low bitrates. Previous methods focus on using diffusion models as\nexpressive decoders robust to quantization errors in the conditioning signals,\nyet achieving competitive results in this manner requires costly training of\nthe diffusion model and long inference times due to the iterative generative\nprocess. In this work we formulate the removal of quantization error as a\ndenoising task, using diffusion to recover lost information in the transmitted\nimage latent. Our approach allows us to perform less than 10\\% of the full\ndiffusion generative process and requires no architectural changes to the\ndiffusion model, enabling the use of foundation models as a strong prior\nwithout additional fine tuning of the backbone. Our proposed codec outperforms\nprevious methods in quantitative realism metrics, and we verify that our\nreconstructions are qualitatively preferred by end users, even when other\nmethods use twice the bitrate.\n","authors":["Lucas Relic","Roberto Azevedo","Markus Gross","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2404.08580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06994v2","updated":"2024-04-12T16:07:55Z","published":"2024-02-10T17:02:53Z","title":"A Change Detection Reality Check","summary":" In recent years, there has been an explosion of proposed change detection\ndeep learning architectures in the remote sensing literature. These approaches\nclaim to offer state-of-the-art performance on different standard benchmark\ndatasets. However, has the field truly made significant progress? In this paper\nwe perform experiments which conclude a simple U-Net segmentation baseline\nwithout training tricks or complicated architectural changes is still a top\nperformer for the task of change detection.\n","authors":["Isaac Corley","Caleb Robinson","Anthony Ortiz"],"pdf_url":"https://arxiv.org/pdf/2402.06994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08561v1","updated":"2024-04-12T16:00:03Z","published":"2024-04-12T16:00:03Z","title":"IDD-X: A Multi-View Dataset for Ego-relative Important Object\n Localization and Explanation in Dense and Unstructured Traffic","summary":" Intelligent vehicle systems require a deep understanding of the interplay\nbetween road conditions, surrounding entities, and the ego vehicle's driving\nbehavior for safe and efficient navigation. This is particularly critical in\ndeveloping countries where traffic situations are often dense and unstructured\nwith heterogeneous road occupants. Existing datasets, predominantly geared\ntowards structured and sparse traffic scenarios, fall short of capturing the\ncomplexity of driving in such environments. To fill this gap, we present IDD-X,\na large-scale dual-view driving video dataset. 
With 697K bounding boxes, 9K\nimportant object tracks, and 1-12 objects per video, IDD-X offers comprehensive\nego-relative annotations for multiple important road objects covering 10\ncategories and 19 explanation label categories. The dataset also incorporates\nrearview information to provide a more complete representation of the driving\nenvironment. We also introduce custom-designed deep networks aimed at multiple\nimportant object localization and per-object explanation prediction. Overall,\nour dataset and introduced prediction models form the foundation for studying\nhow road conditions and surrounding entities affect driving behavior in complex\ntraffic situations.\n","authors":["Chirag Parikh","Rohit Saluja","C. V. Jawahar","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2404.08561v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.08557v1","updated":"2024-04-12T15:54:48Z","published":"2024-04-12T15:54:48Z","title":"Scalability in Building Component Data Annotation: Enhancing Facade\n Material Classification with Synthetic Data","summary":" Computer vision models trained on Google Street View images can create\nmaterial cadastres. However, current approaches need manually annotated\ndatasets that are difficult to obtain and often have class imbalance. To\naddress these challenges, this paper fine-tuned a Swin Transformer model on a\nsynthetic dataset generated with DALL-E and compared the performance to a\nsimilar manually annotated dataset. Although manual annotation remains the gold\nstandard, the synthetic dataset performance demonstrates a reasonable\nalternative. The findings will ease annotation needed to develop material\ncadastres, offering architects insights into opportunities for material reuse,\nthus contributing to the reduction of demolition waste.\n","authors":["Josie Harrison","Alexander Hollberg","Yinan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08557v1.pdf","comment":"10 pages, 6 figures, submitted to 2024 European Conference of\n Computing in Construction"},{"id":"http://arxiv.org/abs/2310.02557v3","updated":"2024-04-12T15:48:47Z","published":"2023-10-04T03:30:32Z","title":"Generalization in diffusion models arises from geometry-adaptive\n harmonic representations","summary":" Deep neural networks (DNNs) trained for image denoising are able to generate\nhigh-quality samples with score-based reverse diffusion algorithms. These\nimpressive capabilities seem to imply an escape from the curse of\ndimensionality, but recent reports of memorization of the training set raise\nthe question of whether these networks are learning the \"true\" continuous\ndensity of the data. Here, we show that two DNNs trained on non-overlapping\nsubsets of a dataset learn nearly the same score function, and thus the same\ndensity, when the number of training images is large enough. In this regime of\nstrong generalization, diffusion-generated images are distinct from the\ntraining set, and are of high visual quality, suggesting that the inductive\nbiases of the DNNs are well-aligned with the data density. We analyze the\nlearned denoising functions and show that the inductive biases give rise to a\nshrinkage operation in a basis adapted to the underlying image. Examination of\nthese bases reveals oscillating harmonic structures along contours and in\nhomogeneous regions. 
We demonstrate that trained denoisers are inductively\nbiased towards these geometry-adaptive harmonic bases since they arise not only\nwhen the network is trained on photographic images, but also when it is trained\non image classes supported on low-dimensional manifolds for which the harmonic\nbasis is suboptimal. Finally, we show that when trained on regular image\nclasses for which the optimal basis is known to be geometry-adaptive and\nharmonic, the denoising performance of the networks is near-optimal.\n","authors":["Zahra Kadkhodaie","Florentin Guth","Eero P. Simoncelli","Stéphane Mallat"],"pdf_url":"https://arxiv.org/pdf/2310.02557v3.pdf","comment":"Accepted for oral presentation at ICLR, Vienna, May 2024"},{"id":"http://arxiv.org/abs/2404.08549v1","updated":"2024-04-12T15:45:26Z","published":"2024-04-12T15:45:26Z","title":"Benchmarking the Cell Image Segmentation Models Robustness under the\n Microscope Optical Aberrations","summary":" Cell segmentation is essential in biomedical research for analyzing cellular\nmorphology and behavior. Deep learning methods, particularly convolutional\nneural networks (CNNs), have revolutionized cell segmentation by extracting\nintricate features from images. However, the robustness of these methods under\nmicroscope optical aberrations remains a critical challenge. This study\ncomprehensively evaluates the performance of cell instance segmentation models\nunder simulated aberration conditions using the DynamicNuclearNet (DNN) and\nLIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and\nTrefoil, were simulated using Zernike polynomial equations. Various\nsegmentation models, such as Mask R-CNN with different network heads (FPN, C3)\nand backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated\nconditions. Results indicate that FPN combined with SwinS demonstrates superior\nrobustness in handling simple cell images affected by minor aberrations.\nConversely, Cellpose2.0 proves effective for complex cell images under similar\nconditions. Our findings provide insights into selecting appropriate\nsegmentation models based on cell morphology and aberration severity, enhancing\nthe reliability of cell segmentation in biomedical applications. Further\nresearch is warranted to validate these methods with diverse aberration types\nand emerging segmentation models. Overall, this research aims to guide\nresearchers in effectively utilizing cell segmentation models in the presence\nof minor optical aberrations.\n","authors":["Boyuan Peng","Jiaju Chen","Qihui Ye","Minjiang Chen","Peiwu Qin","Chenggang Yan","Dongmei Yu","Zhenglin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08544v1","updated":"2024-04-12T15:37:53Z","published":"2024-04-12T15:37:53Z","title":"Analyzing Decades-Long Environmental Changes in Namibia Using Archival\n Aerial Photography and Deep Learning","summary":" This study explores object detection in historical aerial photographs of\nNamibia to identify long-term environmental changes. Specifically, we aim to\nidentify key objects -- \\textit{Waterholes}, \\textit{Omuti homesteads}, and\n\\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale\naerial imagery from 1943 and 1972. In this work, we propose a workflow for\nanalyzing historical aerial imagery using a deep semantic segmentation model on\nsparse hand-labels. 
To this end, we employ a number of strategies including\nclass-weighting, pseudo-labeling and empirical p-value-based filtering to\nbalance skewed and sparse representations of objects in the ground truth data.\nResults demonstrate the benefits of these different training strategies\nresulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of\ninterest for the 1943 and 1972 imagery, respectively. We also identified that\nthe average size of Waterhole and Big trees increased while the average size of\nOmutis decreased between 1943 and 1972 reflecting some of the local effects of\nthe massive post-Second World War economic, agricultural, demographic, and\nenvironmental changes. This work also highlights the untapped potential of\nhistorical aerial photographs in understanding long-term environmental changes\nbeyond Namibia (and Africa). With the lack of adequate satellite technology in\nthe past, archival aerial photography offers a great alternative to uncover\ndecades-long environmental changes.\n","authors":["Girmaw Abebe Tadesse","Caleb Robinson","Gilles Quentin Hacheme","Akram Zaytar","Rahul Dodhia","Tsering Wangyal Shawa","Juan M. Lavista Ferres","Emmanuel H. Kreike"],"pdf_url":"https://arxiv.org/pdf/2404.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08540v1","updated":"2024-04-12T15:35:20Z","published":"2024-04-12T15:35:20Z","title":"On the Robustness of Language Guidance for Low-Level Vision Tasks:\n Findings from Depth Estimation","summary":" Recent advances in monocular depth estimation have been made by incorporating\nnatural language as additional guidance. Although yielding impressive results,\nthe impact of the language prior, particularly in terms of generalization and\nrobustness, remains unexplored. In this paper, we address this gap by\nquantifying the impact of this prior and introduce methods to benchmark its\neffectiveness across various settings. We generate \"low-level\" sentences that\nconvey object-centric, three-dimensional spatial relationships, incorporate\nthem as additional language priors and evaluate their downstream impact on\ndepth estimation. Our key finding is that current language-guided depth\nestimators perform optimally only with scene-level descriptions and\ncounter-intuitively fare worse with low level descriptions. Despite leveraging\nadditional data, these methods are not robust to directed adversarial attacks\nand decline in performance with an increase in distribution shift. Finally, to\nprovide a foundation for future research, we identify points of failures and\noffer insights to better understand these shortcomings. With an increasing\nnumber of methods using language for depth estimation, our findings highlight\nthe opportunities and pitfalls that require careful consideration for effective\ndeployment in real-world settings\n","authors":["Agneet Chatterjee","Tejas Gokhale","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08540v1.pdf","comment":"Accepted to CVPR 2024. Project webpage:\n https://agneetchatterjee.com/robustness_depth_lang/"},{"id":"http://arxiv.org/abs/2404.08535v1","updated":"2024-04-12T15:30:03Z","published":"2024-04-12T15:30:03Z","title":"Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking","summary":" Contrastive learning has gained widespread adoption for retrieval tasks due\nto its minimal requirement for manual annotations. 
However, popular contrastive\nframeworks typically learn from binary relevance, making them ineffective at\nincorporating direct fine-grained rankings. In this paper, we curate a\nlarge-scale dataset featuring detailed relevance scores for each query-document\npair to facilitate future research and evaluation. Subsequently, we propose\nGeneralized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL),\nwhich is designed to learn from fine-grained rankings beyond binary relevance\nscores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for\nin-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative\nto the CLIP baseline and involving ground truth rankings.\n","authors":["Tianyu Zhu","Myong Chol Jung","Jesse Clark"],"pdf_url":"https://arxiv.org/pdf/2404.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08531v1","updated":"2024-04-12T15:18:25Z","published":"2024-04-12T15:18:25Z","title":"Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly\n Detection","summary":" Weakly supervised video anomaly detection (WSVAD) is a challenging task.\nGenerating fine-grained pseudo-labels from weak labels and then\nself-training a classifier is currently a promising solution. However, existing methods use only\nthe RGB visual modality and neglect the utilization of\ncategory text information, thus limiting the generation of more\naccurate pseudo-labels and affecting the performance of self-training. Inspired\nby the manual labeling process based on the event description, in this paper,\nwe propose a novel pseudo-label generation and self-training framework based on\nText Prompt with Normality Guidance (TPWNG) for WSVAD. Our idea is to transfer\nthe rich language-visual knowledge of the contrastive language-image\npre-training (CLIP) model for aligning the video event description text and\ncorresponding video frames to generate pseudo-labels. Specifically, we first\nfine-tune CLIP for domain adaptation by designing two ranking losses and a\ndistributional inconsistency loss. Further, we propose a learnable text prompt\nmechanism with the assistance of a normality visual prompt to further improve the\nmatching accuracy of video event description text and video frames. Then, we\ndesign a pseudo-label generation module based on the normality guidance to\ninfer reliable frame-level pseudo-labels. Finally, we introduce a temporal\ncontext self-adaptive learning module to learn the temporal dependencies of\ndifferent video events more flexibly and accurately. Extensive experiments show\nthat our method achieves state-of-the-art performance on two benchmark\ndatasets, UCF-Crime and XD-Viole\n","authors":["Zhiwei Yang","Jing Liu","Peng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.08531v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2402.11568v2","updated":"2024-04-12T15:17:17Z","published":"2024-02-18T12:31:29Z","title":"A novel Fourier neural operator framework for classification of\n multi-sized images: Application to three dimensional digital porous media","summary":" Fourier neural operators (FNOs) are invariant with respect to the size of\ninput images, and thus images with any size can be fed into FNO-based\nframeworks without any modification of network architectures, in contrast to\ntraditional convolutional neural networks (CNNs). Leveraging the advantage of\nFNOs, we propose a novel deep-learning framework for classifying images with\nvarying sizes. 
Particularly, we simultaneously train the proposed network on\nmulti-sized images. As a practical application, we consider the problem of\npredicting the label (e.g., permeability) of three-dimensional digital porous\nmedia. To construct the framework, an intuitive approach is to connect FNO\nlayers to a classifier using adaptive max pooling. First, we show that this\napproach is only effective for porous media with fixed sizes, whereas it fails\nfor porous media of varying sizes. To overcome this limitation, we introduce\nour approach: instead of using adaptive max pooling, we use static max pooling\nwith the size of channel width of FNO layers. Since the channel width of the\nFNO layers is independent of input image size, the introduced framework can\nhandle multi-sized images during training. We show the effectiveness of the\nintroduced framework and compare its performance with the intuitive approach\nthrough the example of the classification of three-dimensional digital porous\nmedia of varying sizes.\n","authors":["Ali Kashefi","Tapan Mukerji"],"pdf_url":"https://arxiv.org/pdf/2402.11568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08526v1","updated":"2024-04-12T15:15:39Z","published":"2024-04-12T15:15:39Z","title":"Masked Image Modeling as a Framework for Self-Supervised Learning across\n Eye Movements","summary":" To make sense of their surroundings, intelligent systems must transform\ncomplex sensory inputs to structured codes that are reduced to task-relevant\ninformation such as object category. Biological agents achieve this in a\nlargely autonomous manner, presumably via self-\\allowbreak super-\\allowbreak\nvised learning. Whereas previous attempts to model the underlying mechanisms\nwere largely discriminative in nature, there is ample evidence that the brain\nemploys a generative model of the world. Here, we propose that eye movements,\nin combination with the focused nature of primate vision, constitute a\ngenerative, self-supervised task of predicting and revealing visual\ninformation. We construct a proof-of-principle model starting from the\nframework of masked image modeling (MIM), a common approach in deep\nrepresentation learning. To do so, we analyze how core components of MIM such\nas masking technique and data augmentation influence the formation of\ncategory-specific representations. This allows us not only to better understand\nthe principles behind MIM, but to then reassemble a MIM more in line with the\nfocused nature of biological perception. From a theoretical angle, we find that\nMIM disentangles neurons in latent space, a property that has been suggested to\nstructure visual representations in primates, without explicit regulation.\nTogether with previous findings of invariance learning, this highlights an\ninteresting connection of MIM to latent regularization approaches for\nself-supervised learning. The source code is available under\nhttps://github.com/RobinWeiler/FocusMIM\n","authors":["Robin Weiler","Matthias Brucklacher","Cyriel M. A. Pennartz","Sander M. Bohté"],"pdf_url":"https://arxiv.org/pdf/2404.08526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11868v3","updated":"2024-04-12T15:01:10Z","published":"2024-03-18T15:22:09Z","title":"View-Consistent 3D Editing with Gaussian Splatting","summary":" The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing,\noffering efficient, high-fidelity rendering and enabling precise local\nmanipulations. 
Currently, diffusion-based 2D editing models are harnessed to\nmodify multi-view rendered images, which then guide the editing of 3DGS models.\nHowever, this approach faces a critical issue of multi-view inconsistency,\nwhere the guidance images exhibit significant discrepancies across views,\nleading to mode collapse and visual artifacts of 3DGS. To this end, we\nintroduce View-consistent Editing (VcEdit), a novel framework that seamlessly\nincorporates 3DGS into image editing processes, ensuring multi-view consistency\nin edited guidance images and effectively mitigating mode collapse issues.\nVcEdit employs two innovative consistency modules: the Cross-attention\nConsistency Module and the Editing Consistency Module, both designed to reduce\ninconsistencies in edited images. By incorporating these consistency modules\ninto an iterative pattern, VcEdit proficiently resolves the issue of multi-view\ninconsistency, facilitating high-quality 3DGS editing across a diverse range of\nscenes.\n","authors":["Yuxuan Wang","Xuanyu Yi","Zike Wu","Na Zhao","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11868v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2404.06710v3","updated":"2024-04-12T14:58:21Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Chen","Boxin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08515v1","updated":"2024-04-12T14:54:34Z","published":"2024-04-12T14:54:34Z","title":"ChatGPT and general-purpose AI count fruits in pictures surprisingly\n well","summary":" Object counting is a popular task in deep learning applications in various\ndomains, including agriculture. A conventional deep learning approach requires\na large amount of training data, often a logistic problem in a real-world\napplication. 
To address this issue, we examined how well ChatGPT (GPT4V) and a\ngeneral-purpose AI (foundation model for object counting, T-Rex) can count the\nnumber of fruit bodies (coffee cherries) in 100 images. The foundation model\nwith few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and\n0.900, respectively). ChatGPT also showed some interesting potential,\nespecially when few-shot learning with human feedback was applied (R2 = 0.360\nand 0.460, respectively). Moreover, we examined the time required for\nimplementation as a practical question. Obtaining the results with the\nfoundation model and ChatGPT were much shorter than the YOLOv8 model (0.83 hrs,\n1.75 hrs, and 161 hrs). We interpret these results as two surprises for deep\nlearning users in applied domains: a foundation model with few-shot\ndomain-specific learning can drastically save time and effort compared to the\nconventional approach, and ChatGPT can reveal a relatively good performance.\nBoth approaches do not need coding skills, which can foster AI education and\ndissemination.\n","authors":["Konlavach Mengsuwan","Juan Camilo Rivera Palacio","Masahiro Ryo"],"pdf_url":"https://arxiv.org/pdf/2404.08515v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.08514v1","updated":"2024-04-12T14:54:26Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Datase","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones. The dataset, codes, and\npre-trained models will be publicly available at\nhttps://github.com/ronjonxu/NAID.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.03785v2","updated":"2024-04-12T14:44:04Z","published":"2024-01-08T10:06:52Z","title":"Identifying Important Group of Pixels using Interactions","summary":" To better understand the behavior of image classifiers, it is useful to\nvisualize the contribution of individual pixels to the model prediction. In\nthis study, we propose a method, MoXI ($\\textbf{Mo}$del e$\\textbf{X}$planation\nby $\\textbf{I}$nteractions), that efficiently and accurately identifies a group\nof pixels with high prediction confidence. 
The proposed method employs\ngame-theoretic concepts, Shapley values and interactions, taking into account\nthe effects of individual pixels and the cooperative influence of pixels on\nmodel confidence. Theoretical analysis and experiments demonstrate that our\nmethod better identifies the pixels that are highly contributing to the model\noutputs than widely-used visualization by Grad-CAM, Attention rollout, and\nShapley value. While prior studies have suffered from the exponential\ncomputational cost in the computation of Shapley value and interactions, we\nshow that this can be reduced to quadratic cost for our task. The code is\navailable at https://github.com/KosukeSumiyasu/MoXI.\n","authors":["Kosuke Sumiyasu","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2401.03785v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08506v1","updated":"2024-04-12T14:40:45Z","published":"2024-04-12T14:40:45Z","title":"LaSagnA: Language-based Segmentation Assistant for Complex Queries","summary":" Recent advancements have empowered Large Language Models for Vision (vLLMs)\nto generate detailed perceptual outcomes, including bounding boxes and masks.\nNonetheless, there are two constraints that restrict the further application of\nthese vLLMs: the incapability of handling multiple targets per query and the\nfailure to identify the absence of query objects in the image. In this study,\nwe acknowledge that the main cause of these problems is the insufficient\ncomplexity of training queries. Consequently, we define the general sequence\nformat for complex queries. Then we incorporate a semantic segmentation task in\nthe current pipeline to fulfill the requirements of training data. Furthermore,\nwe present three novel strategies to effectively handle the challenges arising\nfrom the direct integration of the proposed format. The effectiveness of our\nmodel in processing complex queries is validated by the comparable results with\nconventional methods on both close-set and open-set semantic segmentation\ndatasets. Additionally, we outperform a series of vLLMs in reasoning and\nreferring segmentation, showcasing our model's remarkable capabilities. We\nrelease the code at https://github.com/congvvc/LaSagnA.\n","authors":["Cong Wei","Haoxian Tan","Yujie Zhong","Yujiu Yang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08504v1","updated":"2024-04-12T14:34:24Z","published":"2024-04-12T14:34:24Z","title":"3D Human Scan With A Moving Event Camera","summary":" Capturing the 3D human body is one of the important tasks in computer vision\nwith a wide range of applications such as virtual reality and sports analysis.\nHowever, conventional frame cameras are limited by their temporal resolution\nand dynamic range, which imposes constraints in real-world application setups.\nEvent cameras have the advantages of high temporal resolution and high dynamic\nrange (HDR), but the development of event-based methods is necessary to handle\ndata with different characteristics. This paper proposes a novel event-based\nmethod for 3D pose estimation and human mesh recovery. Prior work on\nevent-based human mesh recovery require frames (images) as well as event data.\nThe proposed method solely relies on events; it carves 3D voxels by moving the\nevent camera around a stationary body, reconstructs the human pose and mesh by\nattenuated rays, and fit statistical body models, preserving high-frequency\ndetails. 
The experimental results show that the proposed method outperforms\nconventional frame-based methods in the estimation accuracy of both pose and\nbody mesh. We also demonstrate results in challenging situations where a\nconventional camera has motion blur. This is the first to demonstrate\nevent-only human mesh recovery, and we hope that it is the first step toward\nachieving robust and accurate 3D human body scanning from vision sensors.\n","authors":["Kai Kohyama","Shintaro Shiba","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2404.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14991v2","updated":"2024-04-12T14:21:20Z","published":"2023-12-22T11:56:22Z","title":"FoodLMM: A Versatile Food Assistant using Large Multi-modal Model","summary":" Large Multi-modal Models (LMMs) have made impressive progress in many\nvision-language tasks. Nevertheless, the performance of general LMMs in\nspecific domains is still far from satisfactory. This paper proposes FoodLMM, a\nversatile food assistant based on LMMs with various capabilities, including\nfood recognition, ingredient recognition, recipe generation, nutrition\nestimation, food segmentation and multi-round conversation. To facilitate\nFoodLMM to deal with tasks beyond pure text output, we introduce a series of\nnovel task-specific tokens and heads, enabling the model to predict food\nnutritional values and multiple segmentation masks. We adopt a two-stage\ntraining strategy. In the first stage, we utilize multiple public food\nbenchmarks for multi-task learning by leveraging the instruct-following\nparadigm. In the second stage, we construct a multi-round conversation dataset\nand a reasoning segmentation dataset to fine-tune the model, enabling it to\nconduct professional dialogues and generate segmentation masks based on complex\nreasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art\nresults across several food benchmarks. We will make our code, models and\ndatasets publicly available.\n","authors":["Yuehao Yin","Huiyan Qi","Bin Zhu","Jingjing Chen","Yu-Gang Jiang","Chong-Wah Ngo"],"pdf_url":"https://arxiv.org/pdf/2312.14991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08489v1","updated":"2024-04-12T14:12:03Z","published":"2024-04-12T14:12:03Z","title":"SpectralMamba: Efficient Mamba for Hyperspectral Image Classification","summary":" Recurrent neural networks and Transformers have recently dominated most\napplications in hyperspectral (HS) imaging, owing to their capability to\ncapture long-range dependencies from spectrum sequences. However, despite the\nsuccess of these sequential architectures, the non-ignorable inefficiency\ncaused by either difficulty in parallelization or computationally prohibitive\nattention still hinders their practicality, especially for large-scale\nobservation in remote sensing scenarios. To address this issue, we herein\npropose SpectralMamba -- a novel state space model incorporated efficient deep\nlearning framework for HS image classification. SpectralMamba features the\nsimplified but adequate modeling of HS data dynamics at two levels. First, in\nspatial-spectral space, a dynamical mask is learned by efficient convolutions\nto simultaneously encode spatial regularity and spectral peculiarity, thus\nattenuating the spectral variability and confusion in discriminative\nrepresentation learning. 
Second, the merged spectrum can then be efficiently\noperated in the hidden state space with all parameters learned input-dependent,\nyielding selectively focused responses without reliance on redundant attention\nor imparallelizable recurrence. To explore the room for further computational\ndownsizing, a piece-wise scanning mechanism is employed in-between,\ntransferring approximately continuous spectrum into sequences with squeezed\nlength while maintaining short- and long-term contextual profiles among\nhundreds of bands. Through extensive experiments on four benchmark HS datasets\nacquired by satellite-, aircraft-, and UAV-borne imagers, SpectralMamba\nsurprisingly creates promising win-wins from both performance and efficiency\nperspectives.\n","authors":["Jing Yao","Danfeng Hong","Chenyu Li","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2404.08489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v2","updated":"2024-04-12T13:58:33Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v2.pdf","comment":"Accepted by TPAMI (2024). arXiv admin note: text overlap with\n arXiv:2205.05076"},{"id":"http://arxiv.org/abs/2404.08477v1","updated":"2024-04-12T13:55:05Z","published":"2024-04-12T13:55:05Z","title":"New Efficient Visual OILU Markers","summary":" Basic patterns are the source of a wide range of more or less complex\ngeometric structures. We will exploit such patterns to develop new efficient\nvisual markers. Besides being projective invariants, the proposed markers allow\nproducing rich panel of unique identifiers, highly required for\nresource-intensive navigation and augmented reality applications. 
The spiral\ntopology of our markers permits the validation of an accurate identification\nscheme, which is based on level set methods. The robustness of the markers\nagainst acquisition and geometric distortions is validated by extensive\nexperimental tests.\n","authors":["Youssef Chahir","Messaoud Mostefai","Hamza Saida"],"pdf_url":"https://arxiv.org/pdf/2404.08477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13570v2","updated":"2024-04-12T13:44:44Z","published":"2023-11-22T18:25:51Z","title":"WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space","summary":" Modern learning-based approaches to 3D-aware image synthesis achieve high\nphotorealism and 3D-consistent viewpoint changes for the generated images.\nExisting approaches represent instances in a shared canonical space. However,\nfor in-the-wild datasets a shared canonical system can be difficult to define\nor might not even exist. In this work, we instead model instances in view\nspace, alleviating the need for posed images and learned camera distributions.\nWe find that in this setting, existing GAN-based methods are prone to\ngenerating flat geometry and struggle with distribution coverage. We hence\npropose WildFusion, a new approach to 3D-aware image synthesis based on latent\ndiffusion models (LDMs). We first train an autoencoder that infers a compressed\nlatent representation, which additionally captures the images' underlying 3D\nstructure and enables not only reconstruction but also novel view synthesis. To\nlearn a faithful 3D representation, we leverage cues from monocular depth\nprediction. Then, we train a diffusion model in the 3D-aware latent space,\nthereby enabling synthesis of high-quality 3D-consistent image samples,\noutperforming recent state-of-the-art GAN-based methods. Importantly, our\n3D-aware LDM is trained without any direct supervision from multiview images or\n3D geometry and does not require posed images or learned pose or camera\ndistributions. It directly learns a 3D representation without relying on\ncanonical camera coordinates. This opens up promising research avenues for\nscalable 3D-aware image synthesis and 3D content creation from in-the-wild\nimage data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D\nresults.\n","authors":["Katja Schwarz","Seung Wook Kim","Jun Gao","Sanja Fidler","Andreas Geiger","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2311.13570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08452v1","updated":"2024-04-12T13:02:08Z","published":"2024-04-12T13:02:08Z","title":"MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face\n Forgery Detection","summary":" Deepfakes have recently raised significant trust issues and security concerns\namong the public. Compared to CNN face forgery detectors, ViT-based methods\ntake advantage of the expressivity of transformers, achieving superior\ndetection performance. However, these approaches still exhibit the following\nlimitations: (1). Fully fine-tuning ViT-based models from ImageNet weights\ndemands substantial computational and storage resources; (2). ViT-based methods\nstruggle to capture local forgery clues, leading to model bias and limited\ngeneralizability. To tackle these challenges, this work introduces\nMixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized\nyet parameter-efficient ViT-based approach. 
MoE-FFD only updates lightweight\nLow-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone\nfrozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD\nleverages the expressivity of transformers and local priors of CNNs to\nsimultaneously extract global and local forgery clues. Additionally, novel MoE\nmodules are designed to scale the model's capacity and select optimal forgery\nexperts, further enhancing forgery detection performance. The proposed MoE\nlearning scheme can be seamlessly adapted to various transformer backbones in a\nplug-and-play manner. Extensive experimental results demonstrate that the\nproposed method achieves state-of-the-art face forgery detection performance\nwith reduced parameter overhead. The code will be released upon acceptance.\n","authors":["Chenqi Kong","Anwei Luo","Song Xia","Yi Yu","Haoliang Li","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2404.08452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08450v1","updated":"2024-04-12T13:01:22Z","published":"2024-04-12T13:01:22Z","title":"Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing\n Clues","summary":" Face recognition systems are frequently subjected to a variety of physical\nand digital attacks of different types. Previous methods have achieved\nsatisfactory performance in scenarios that address physical attacks and digital\nattacks, respectively. However, few methods are considered to integrate a model\nthat simultaneously addresses both physical and digital attacks, implying the\nnecessity to develop and maintain multiple models. To jointly detect physical\nand digital attacks within a single model, we propose an innovative approach\nthat can adapt to any network architecture. Our approach mainly contains two\ntypes of data augmentation, which we call Simulated Physical Spoofing Clues\naugmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC).\nSPSC and SDSC augment live samples into simulated attack samples by simulating\nspoofing clues of physical and digital attacks, respectively, which\nsignificantly improve the capability of the model to detect \"unseen\" attack\ntypes. Extensive experiments show that SPSC and SDSC can achieve\nstate-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData\ndataset, respectively. Our method won first place in \"Unified Physical-Digital\nFace Attack Detection\" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our\nfinal submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER,\nrespectively. Our code is available at\nhttps://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge.\n","authors":["Xianhua He","Dashuang Liang","Song Yang","Zhanlong Hao","Hui Ma","Binjie Mao","Xi Li","Yao Wang","Pengfei Yan","Ajian Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08450v1.pdf","comment":"10 pages with 6 figures, Accepted by CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08449v1","updated":"2024-04-12T13:00:06Z","published":"2024-04-12T13:00:06Z","title":"OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering","summary":" Rendering dynamic 3D human from monocular videos is crucial for various\napplications such as virtual reality and digital entertainment. Most methods\nassume the people is in an unobstructed scene, while various objects may cause\nthe occlusion of body parts in real-life scenarios. 
Previous method utilizing\nNeRF for surface rendering to recover the occluded areas, but it requiring more\nthan one day to train and several seconds to render, failing to meet the\nrequirements of real-time interactive applications. To address these issues, we\npropose OccGaussian based on 3D Gaussian Splatting, which can be trained within\n6 minutes and produces high-quality human renderings up to 160 FPS with\noccluded input. OccGaussian initializes 3D Gaussian distributions in the\ncanonical space, and we perform occlusion feature query at occluded regions,\nthe aggregated pixel-align feature is extracted to compensate for the missing\ninformation. Then we use Gaussian Feature MLP to further process the feature\nalong with the occlusion-aware loss functions to better perceive the occluded\narea. Extensive experiments both in simulated and real-world occlusions,\ndemonstrate that our method achieves comparable or even superior performance\ncompared to the state-of-the-art method. And we improving training and\ninference speeds by 250x and 800x, respectively. Our code will be available for\nresearch purposes.\n","authors":["Jingrui Ye","Zongkai Zhang","Yujiao Jiang","Qingmin Liao","Wenming Yang","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2404.08449v1.pdf","comment":"12 April, 2024; originally announced April 2024"},{"id":"http://arxiv.org/abs/2404.08433v1","updated":"2024-04-12T12:30:48Z","published":"2024-04-12T12:30:48Z","title":"MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for\n Dynamic Facial Expression Recognition","summary":" Unlike typical video action recognition, Dynamic Facial Expression\nRecognition (DFER) does not involve distinct moving targets but relies on\nlocalized changes in facial muscles. Addressing this distinctive attribute, we\npropose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our\napproach takes spatial features of different scales extracted by CNN and feeds\nthem into a Multi-scale Embedding Layer (MELayer). The MELayer extracts\nmulti-scale spatial information and encodes these features before sending them\ninto a Temporal Transformer (T-Former). The T-Former simultaneously extracts\ntemporal information while continually integrating multi-scale spatial\ninformation. This process culminates in the generation of multi-scale\nspatio-temporal features that are utilized for the final classification. Our\nmethod achieves state-of-the-art results on two in-the-wild datasets.\nFurthermore, a series of ablation experiments and visualizations provide\nfurther validation of our approach's proficiency in leveraging spatio-temporal\ninformation within DFER.\n","authors":["Linhuang Wang","Xin Kang","Fei Ding","Satoshi Nakagawa","Fuji Ren"],"pdf_url":"https://arxiv.org/pdf/2404.08433v1.pdf","comment":"Accepted to 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2404.08421v1","updated":"2024-04-12T12:10:53Z","published":"2024-04-12T12:10:53Z","title":"Adapting the Segment Anything Model During Usage in Novel Situations","summary":" The interactive segmentation task consists in the creation of object\nsegmentation masks based on user interactions. The most common way to guide a\nmodel towards producing a correct segmentation consists in clicks on the object\nand background. 
The recently published Segment Anything Model (SAM) supports a\ngeneralized version of the interactive segmentation problem and has been\ntrained on an object segmentation dataset which contains 1.1B masks. Though\nbeing trained extensively and with the explicit purpose of serving as a\nfoundation model, we show significant limitations of SAM when being applied for\ninteractive segmentation on novel domains or object types. On the used\ndatasets, SAM displays a failure rate $\\text{FR}_{30}@90$ of up to $72.6 \\%$.\nSince we still want such foundation models to be immediately applicable, we\npresent a framework that can adapt SAM during immediate usage. For this we will\nleverage the user interactions and masks, which are constructed during the\ninteractive segmentation process. We use this information to generate\npseudo-labels, which we use to compute a loss function and optimize a part of\nthe SAM model. The presented method causes a relative reduction of up to $48.1\n\\%$ in the $\\text{FR}_{20}@85$ and $46.6 \\%$ in the $\\text{FR}_{30}@90$\nmetrics.\n","authors":["Robin Schön","Julian Lorenz","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2404.08421v1.pdf","comment":"11 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08419v1","updated":"2024-04-12T12:08:06Z","published":"2024-04-12T12:08:06Z","title":"Direct May Not Be the Best: An Incremental Evolution View of Pose\n Generation","summary":" Pose diversity is an inherent representative characteristic of 2D images. Due\nto the 3D to 2D projection mechanism, there is evident content discrepancy\namong distinct pose images. This is the main obstacle bothering pose\ntransformation related researches. To deal with this challenge, we propose a\nfine-grained incremental evolution centered pose generation framework, rather\nthan traditional direct one-to-one in a rush. Since proposed approach actually\nbypasses the theoretical difficulty of directly modeling dramatic non-linear\nvariation, the incurred content distortion and blurring could be effectively\nconstrained, at the same time the various individual pose details, especially\nclothes texture, could be precisely maintained. In order to systematically\nguide the evolution course, both global and incremental evolution constraints\nare elaborately designed and merged into the overall frame?work. And a novel\ntriple-path knowledge fusion structure is worked out to take full advantage of\nall available valuable knowledge to conduct high-quality pose synthesis. In\naddition, our framework could generate a series of valuable byproducts, namely\nthe various intermediate poses. Extensive experiments have been conducted to\nverify the effectiveness of the proposed approach. Code is available at\nhttps://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.\n","authors":["Yuelong Li","Tengfei Xiao","Lei Geng","Jianming Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06707v2","updated":"2024-04-12T11:56:18Z","published":"2023-04-13T17:56:08Z","title":"Toward Reliable Human Pose Forecasting with Uncertainty","summary":" Recently, there has been an arms race of pose forecasting methods aimed at\nsolving the spatio-temporal task of predicting a sequence of future 3D poses of\na person given a sequence of past observed ones. However, the lack of unified\nbenchmarks and limited uncertainty analysis have hindered progress in the\nfield. 
To address this, we first develop an open-source library for human pose\nforecasting, including multiple models, supporting several datasets, and\nemploying standardized evaluation metrics, with the aim of promoting research\nand moving toward a unified and consistent evaluation. Second, we devise two\ntypes of uncertainty in the problem to increase performance and convey better\ntrust: 1) we propose a method for modeling aleatoric uncertainty by using\nuncertainty priors to inject knowledge about the pattern of uncertainty. This\nfocuses the capacity of the model in the direction of more meaningful\nsupervision while reducing the number of learned parameters and improving\nstability; 2) we introduce a novel approach for quantifying the epistemic\nuncertainty of any model through clustering and measuring the entropy of its\nassignments. Our experiments demonstrate up to $25\\%$ improvements in\nforecasting at short horizons, with no loss on longer horizons on Human3.6M,\nAMSS, and 3DPW datasets, and better performance in uncertainty estimation. The\ncode is available online at https://github.com/vita-epfl/UnPOSed.\n","authors":["Saeed Saadatnejad","Mehrshad Mirmohammadi","Matin Daghyani","Parham Saremi","Yashar Zoroofchi Benisi","Amirhossein Alimohammadi","Zahra Tehraninasab","Taylor Mordan","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2304.06707v2.pdf","comment":"Published in RA-L 2024"},{"id":"http://arxiv.org/abs/2404.08406v1","updated":"2024-04-12T11:33:26Z","published":"2024-04-12T11:33:26Z","title":"MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image\n Fusion","summary":" Multi-modality image fusion (MMIF) aims to integrate complementary\ninformation from different modalities into a single fused image to represent\nthe imaging scene and facilitate downstream visual tasks comprehensively. In\nrecent years, significant progress has been made in MMIF tasks due to advances\nin deep neural networks. However, existing methods cannot effectively and\nefficiently extract modality-specific and modality-fused features constrained\nby the inherent local reductive bias (CNN) or quadratic computational\ncomplexity (Transformers). To overcome this issue, we propose a Mamba-based\nDual-phase Fusion (MambaDFuse) model. Firstly, a dual-level feature extractor\nis designed to capture long-range features from single-modality images by\nextracting low and high-level features from CNN and Mamba blocks. Then, a\ndual-phase feature fusion module is proposed to obtain fusion features that\ncombine complementary information from different modalities. It uses the\nchannel exchange method for shallow fusion and the enhanced Multi-modal Mamba\n(M3) blocks for deep fusion. Finally, the fused image reconstruction module\nutilizes the inverse transformation of the feature extraction to generate the\nfused result. Through extensive experiments, our approach achieves promising\nfusion results in infrared-visible image fusion and medical image fusion.\nAdditionally, in a unified benchmark, MambaDFuse has also demonstrated improved\nperformance in downstream tasks such as object detection. 
Code with checkpoints\nwill be available after the peer-review process.\n","authors":["Zhe Li","Haiwei Pan","Kejia Zhang","Yuhua Wang","Fengming Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08401v1","updated":"2024-04-12T11:15:15Z","published":"2024-04-12T11:15:15Z","title":"No Bells, Just Whistles: Sports Field Registration by Leveraging\n Geometric Properties","summary":" Broadcast sports field registration is traditionally addressed as a\nhomography estimation task, mapping the visible image area to a planar field\nmodel, predominantly focusing on the main camera shot. Addressing the\nshortcomings of previous approaches, we propose a novel calibration pipeline\nenabling camera calibration using a 3D soccer field model and extending the\nprocess to assess the multiple-view nature of broadcast videos. Our approach\nbegins with a keypoint generation pipeline derived from SoccerNet dataset\nannotations, leveraging the geometric properties of the court. Subsequently, we\nexecute classical camera calibration through DLT algorithm in a minimalist\nfashion, without further refinement. Through extensive experimentation on\nreal-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup\n2014 and TS- WorldCup, our method demonstrates superior performance in both\nmultiple- and single-view 3D camera calibration while maintaining competitive\nresults in homography estimation compared to state-of-the-art techniques.\n","authors":["Marc Gutiérrez-Pérez","Antonio Agudo"],"pdf_url":"https://arxiv.org/pdf/2404.08401v1.pdf","comment":"Accepted in CVPRW 2024"},{"id":"http://arxiv.org/abs/2105.03026v2","updated":"2024-04-12T11:14:04Z","published":"2021-05-07T01:32:37Z","title":"Efficient Masked Face Recognition Method during the COVID-19 Pandemic","summary":" The coronavirus disease (COVID-19) is an unparalleled crisis leading to a\nhuge number of casualties and security problems. In order to reduce the spread\nof coronavirus, people often wear masks to protect themselves. This makes face\nrecognition a very difficult task since certain parts of the face are hidden. A\nprimary focus of researchers during the ongoing coronavirus pandemic is to come\nup with suggestions to handle this problem through rapid and efficient\nsolutions. In this paper, we propose a reliable method based on occlusion\nremoval and deep learning-based features in order to address the problem of the\nmasked face recognition process. The first step is to remove the masked face\nregion. Next, we apply three pre-trained deep Convolutional Neural Networks\n(CNN) namely, VGG-16, AlexNet, and ResNet-50, and use them to extract deep\nfeatures from the obtained regions (mostly eyes and forehead regions). 
The\nBag-of-features paradigm is then applied to the feature maps of the last\nconvolutional layer in order to quantize them and to get a slight\nrepresentation comparing to the fully connected layer of classical CNN.\nFinally, Multilayer Perceptron (MLP) is applied for the classification process.\nExperimental results on Real-World-Masked-Face-Dataset show high recognition\nperformance compared to other state-of-the-art methods.\n","authors":["Walid Hariri"],"pdf_url":"https://arxiv.org/pdf/2105.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08399v1","updated":"2024-04-12T11:08:26Z","published":"2024-04-12T11:08:26Z","title":"Mitigating Challenges of the Space Environment for Onboard Artificial\n Intelligence: Design Overview of the Imaging Payload on SpIRIT","summary":" Artificial intelligence (AI) and autonomous edge computing in space are\nemerging areas of interest to augment capabilities of nanosatellites, where\nmodern sensors generate orders of magnitude more data than can typically be\ntransmitted to mission control. Here, we present the hardware and software\ndesign of an onboard AI subsystem hosted on SpIRIT. The system is optimised for\non-board computer vision experiments based on visible light and long wave\ninfrared cameras. This paper highlights the key design choices made to maximise\nthe robustness of the system in harsh space conditions, and their motivation\nrelative to key mission requirements, such as limited compute resources,\nresilience to cosmic radiation, extreme temperature variations, distribution\nshifts, and very low transmission bandwidths. The payload, called Loris,\nconsists of six visible light cameras, three infrared cameras, a camera control\nboard and a Graphics Processing Unit (GPU) system-on-module. Loris enables the\nexecution of AI models with on-orbit fine-tuning as well as a next-generation\nimage compression algorithm, including progressive coding. This innovative\napproach not only enhances the data processing capabilities of nanosatellites\nbut also lays the groundwork for broader applications to remote sensing from\nspace.\n","authors":["Miguel Ortiz del Castillo","Jonathan Morgan","Jack McRobbie","Clint Therakam","Zaher Joukhadar","Robert Mearns","Simon Barraclough","Richard Sinnott","Andrew Woods","Chris Bayliss","Kris Ehinger","Ben Rubinstein","James Bailey","Airlie Chapman","Michele Trenti"],"pdf_url":"https://arxiv.org/pdf/2404.08399v1.pdf","comment":"AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08392v1","updated":"2024-04-12T10:54:11Z","published":"2024-04-12T10:54:11Z","title":"NC-TTT: A Noise Contrastive Approach for Test-Time Training","summary":" Despite their exceptional performance in vision tasks, deep learning models\noften struggle when faced with domain shifts during testing. Test-Time Training\n(TTT) methods have recently gained popularity by their ability to enhance the\nrobustness of models through the addition of an auxiliary objective that is\njointly optimized with the main task. Being strictly unsupervised, this\nauxiliary objective is used at test time to adapt the model without any access\nto labels. In this work, we propose Noise-Contrastive Test-Time Training\n(NC-TTT), a novel unsupervised TTT technique based on the discrimination of\nnoisy feature maps. By learning to classify noisy views of projected feature\nmaps, and then adapting the model accordingly on new domains, classification\nperformance can be recovered by an important margin. 
Experiments on several\npopular test-time adaptation baselines demonstrate the advantages of our method\ncompared to recent approaches for this task. The code can be found\nat:https://github.com/GustavoVargasHakim/NCTTT.git\n","authors":["David Osowiechi","Gustavo A. Vargas Hakim","Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Moslem Yazdanpanah","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2404.08392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04385v2","updated":"2024-04-12T10:15:45Z","published":"2024-03-07T10:25:23Z","title":"Impacts of Color and Texture Distortions on Earth Observation Data in\n Deep Learning","summary":" Land cover classification and change detection are two important applications\nof remote sensing and Earth observation (EO) that have benefited greatly from\nthe advances of deep learning. Convolutional and transformer-based U-net models\nare the state-of-the-art architectures for these tasks, and their performances\nhave been boosted by an increased availability of large-scale annotated EO\ndatasets. However, the influence of different visual characteristics of the\ninput EO data on a model's predictions is not well understood. In this work we\nsystematically examine model sensitivities with respect to several color- and\ntexture-based distortions on the input EO data during inference, given models\nthat have been trained without such distortions. We conduct experiments with\nmultiple state-of-the-art segmentation networks for land cover classification\nand show that they are in general more sensitive to texture than to color\ndistortions. Beyond revealing intriguing characteristics of widely used land\ncover classification models, our results can also be used to guide the\ndevelopment of more robust models within the EO domain.\n","authors":["Martin Willbo","Aleksis Pirinen","John Martinsson","Edvin Listo Zec","Olof Mogren","Mikael Nilsson"],"pdf_url":"https://arxiv.org/pdf/2403.04385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08363v1","updated":"2024-04-12T10:04:03Z","published":"2024-04-12T10:04:03Z","title":"Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering","summary":" We study the problem of self-supervised 3D scene flow estimation from real\nlarge-scale raw point cloud sequences, which is crucial to various tasks like\ntrajectory prediction or instance segmentation. In the absence of ground truth\nscene flow labels, contemporary approaches concentrate on deducing optimizing\nflow across sequential pairs of point clouds by incorporating structure based\nregularization on flow and object rigidity. The rigid objects are estimated by\na variety of 3D spatial clustering methods. While state-of-the-art methods\nsuccessfully capture overall scene motion using the Neural Prior structure,\nthey encounter challenges in discerning multi-object motions. We identified the\nstructural constraints and the use of large and strict rigid clusters as the\nmain pitfall of the current approaches and we propose a novel clustering\napproach that allows for combination of overlapping soft clusters as well as\nnon-overlapping rigid clusters representation. Flow is then jointly estimated\nwith progressively growing non-overlapping rigid clusters together with fixed\nsize overlapping soft clusters. We evaluate our method on multiple datasets\nwith LiDAR point clouds, demonstrating the superior performance over the\nself-supervised baselines reaching new state of the art results. 
Our method\nespecially excels in resolving flow in complicated dynamic scenes with multiple\nindependently moving objects close to each other which includes pedestrians,\ncyclists and other vulnerable road users. Our codes will be publicly available.\n","authors":["Patrik Vacek","David Hurych","Tomáš Svoboda","Karel Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2404.08363v1.pdf","comment":"ECCV submission"},{"id":"http://arxiv.org/abs/2404.08353v1","updated":"2024-04-12T09:44:18Z","published":"2024-04-12T09:44:18Z","title":"TDANet: Target-Directed Attention Network For Object-Goal Visual\n Navigation With Zero-Shot Ability","summary":" The generalization of the end-to-end deep reinforcement learning (DRL) for\nobject-goal visual navigation is a long-standing challenge since object classes\nand placements vary in new test environments. Learning domain-independent\nvisual representation is critical for enabling the trained DRL agent with the\nability to generalize to unseen scenes and objects. In this letter, a\ntarget-directed attention network (TDANet) is proposed to learn the end-to-end\nobject-goal visual navigation policy with zero-shot ability. TDANet features a\nnovel target attention (TA) module that learns both the spatial and semantic\nrelationships among objects to help TDANet focus on the most relevant observed\nobjects to the target. With the Siamese architecture (SA) design, TDANet\ndistinguishes the difference between the current and target states and\ngenerates the domain-independent visual representation. To evaluate the\nnavigation performance of TDANet, extensive experiments are conducted in the\nAI2-THOR embodied AI environment. The simulation results demonstrate a strong\ngeneralization ability of TDANet to unseen scenes and target objects, with\nhigher navigation success rate (SR) and success weighted by length (SPL) than\nother state-of-the-art models.\n","authors":["Shiwei Lian","Feitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16588v2","updated":"2024-04-12T09:38:33Z","published":"2023-09-28T16:45:46Z","title":"Vision Transformers Need Registers","summary":" Transformers have recently emerged as a powerful tool for learning visual\nrepresentations. In this paper, we identify and characterize artifacts in\nfeature maps of both supervised and self-supervised ViT networks. The artifacts\ncorrespond to high-norm tokens appearing during inference primarily in\nlow-informative background areas of images, that are repurposed for internal\ncomputations. We propose a simple yet effective solution based on providing\nadditional tokens to the input sequence of the Vision Transformer to fill that\nrole. 
We show that this solution fixes that problem entirely for both\nsupervised and self-supervised models, sets a new state of the art for\nself-supervised visual models on dense visual prediction tasks, enables object\ndiscovery methods with larger models, and most importantly leads to smoother\nfeature maps and attention maps for downstream visual processing.\n","authors":["Timothée Darcet","Maxime Oquab","Julien Mairal","Piotr Bojanowski"],"pdf_url":"https://arxiv.org/pdf/2309.16588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16254v2","updated":"2024-04-12T09:37:37Z","published":"2023-11-27T19:02:17Z","title":"Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models","summary":" Large-scale vision-and-language models, such as CLIP, are typically trained\non web-scale data, which can introduce inappropriate content and lead to the\ndevelopment of unsafe and biased behavior. This, in turn, hampers their\napplicability in sensitive and trustworthy contexts and could raise significant\nconcerns in their adoption. Our research introduces a novel approach to\nenhancing the safety of vision-and-language models by diminishing their\nsensitivity to NSFW (not safe for work) inputs. In particular, our methodology\nseeks to sever \"toxic\" linguistic and visual concepts, unlearning the linkage\nbetween unsafe linguistic or visual items and unsafe regions of the embedding\nspace. We show how this can be done by fine-tuning a CLIP model on synthetic\ndata obtained from a large language model trained to convert between safe and\nunsafe sentences, and a text-to-image generator. We conduct extensive\nexperiments on the resulting embedding space for cross-modal retrieval,\ntext-to-image, and image-to-text generation, where we show that our model can\nbe remarkably employed with pre-trained generative models. Our source code and\ntrained models are available at: https://github.com/aimagelab/safe-clip.\n","authors":["Samuele Poppi","Tobia Poppi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2311.16254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07236v2","updated":"2024-04-12T09:34:38Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. 
Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Hui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2404.08351v1","updated":"2024-04-12T09:31:55Z","published":"2024-04-12T09:31:55Z","title":"OmniSat: Self-Supervised Modality Fusion for Earth Observation","summary":" The field of Earth Observations (EO) offers a wealth of data from diverse\nsensors, presenting a great opportunity for advancing self-supervised\nmultimodal learning. However, current multimodal EO datasets and models focus\non a single data type, either mono-date images or time series, which limits\ntheir expressivity. We introduce OmniSat, a novel architecture that exploits\nthe spatial alignment between multiple EO modalities to learn expressive\nmultimodal representations without labels. To demonstrate the advantages of\ncombining modalities of different natures, we augment two existing datasets\nwith new modalities. As demonstrated on three downstream tasks: forestry, land\ncover classification, and crop mapping. OmniSat can learn rich representations\nin an unsupervised manner, leading to improved performance in the semi- and\nfully-supervised settings, even when only one modality is available for\ninference. The code and dataset are available at github.com/gastruc/OmniSat.\n","authors":["Guillaume Astruc","Nicolas Gonthier","Clement Mallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2404.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08350v1","updated":"2024-04-12T09:31:11Z","published":"2024-04-12T09:31:11Z","title":"Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI\n Using Neural Implicit k-Space Representation","summary":" Neural implicit k-space representations have shown promising results for\ndynamic MRI at high temporal resolutions. Yet, their exclusive training in\nk-space limits the application of common image regularization methods to\nimprove the final reconstruction. In this work, we introduce the concept of\nparallel imaging-inspired self-consistency (PISCO), which we incorporate as\nnovel self-supervised k-space regularization enforcing a consistent\nneighborhood relationship. At no additional data cost, the proposed\nregularization significantly improves neural implicit k-space reconstructions\non simulated data. Abdominal in-vivo reconstructions using PISCO result in\nenhanced spatio-temporal image quality compared to state-of-the-art methods.\nCode is available at https://github.com/vjspi/PISCO-NIK.\n","authors":["Veronika Spieker","Hannah Eichhorn","Jonathan K. Stelter","Wenqi Huang","Rickmer F. Braren","Daniel Rückert","Francisco Sahli Costabal","Kerstin Hammernik","Claudia Prieto","Dimitrios C. Karampinos","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.08350v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.08347v1","updated":"2024-04-12T09:22:24Z","published":"2024-04-12T09:22:24Z","title":"Learning to Rebalance Multi-Modal Optimization by Adaptively Masking\n Subnetworks","summary":" Multi-modal learning aims to enhance performance by unifying models from\nvarious modalities but often faces the \"modality imbalance\" problem in real\ndata, leading to a bias towards dominant modalities and neglecting others,\nthereby limiting its overall effectiveness. 
To address this challenge, the core\nidea is to balance the optimization of each modality to achieve a joint\noptimum. Existing approaches often employ a modal-level control mechanism for\nadjusting the update of each modal parameter. However, such a global-wise\nupdating mechanism ignores the different importance of each parameter. Inspired\nby subnetwork optimization, we explore a uniform sampling-based optimization\nstrategy and find it more effective than global-wise updating. According to the\nfindings, we further propose a novel importance sampling-based, element-wise\njoint optimization method, called Adaptively Mask Subnetworks Considering Modal\nSignificance(AMSS). Specifically, we incorporate mutual information rates to\ndetermine the modal significance and employ non-uniform adaptive sampling to\nselect foreground subnetworks from each modality for parameter updates, thereby\nrebalancing multi-modal learning. Additionally, we demonstrate the reliability\nof the AMSS strategy through convergence analysis. Building upon theoretical\ninsights, we further enhance the multi-modal mask subnetwork strategy using\nunbiased estimation, referred to as AMSS+. Extensive experiments reveal the\nsuperiority of our approach over comparison methods.\n","authors":["Yang Yang","Hongpeng Pan","Qing-Yuan Jiang","Yi Xu","Jinghui Tang"],"pdf_url":"https://arxiv.org/pdf/2404.08347v1.pdf","comment":"17 pages;6 figures"},{"id":"http://arxiv.org/abs/2308.09372v2","updated":"2024-04-12T09:21:33Z","published":"2023-08-18T08:06:49Z","title":"Which Transformer to Favor: A Comparative Analysis of Efficiency in\n Vision Transformers","summary":" Transformers come with a high computational cost, yet their effectiveness in\naddressing problems in language and vision has sparked extensive research aimed\nat enhancing their efficiency. However, diverse experimental conditions,\nspanning multiple input domains, prevent a fair comparison based solely on\nreported results, posing challenges for model selection. To address this gap in\ncomparability, we design a comprehensive benchmark of more than 30 models for\nimage classification, evaluating key efficiency aspects, including accuracy,\nspeed, and memory usage. This benchmark provides a standardized baseline across\nthe landscape of efficiency-oriented transformers and our framework of\nanalysis, based on Pareto optimality, reveals surprising insights. Despite\nclaims of other models being more efficient, ViT remains Pareto optimal across\nmultiple metrics. We observe that hybrid attention-CNN models exhibit\nremarkable inference memory- and parameter-efficiency. Moreover, our benchmark\nshows that using a larger model in general is more efficient than using higher\nresolution images. Thanks to our holistic evaluation, we provide a centralized\nresource for practitioners and researchers, facilitating informed decisions\nwhen selecting transformers or measuring progress of the development of\nefficient transformers.\n","authors":["Tobias Christian Nauen","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.09372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08341v1","updated":"2024-04-12T09:13:37Z","published":"2024-04-12T09:13:37Z","title":"Counterfactual Explanations for Face Forgery Detection via Adversarial\n Removal of Artifacts","summary":" Highly realistic AI generated face forgeries known as deepfakes have raised\nserious social concerns. 
Although DNN-based face forgery detection models have\nachieved good performance, they are vulnerable to latest generative methods\nthat have less forgery traces and adversarial attacks. This limitation of\ngeneralization and robustness hinders the credibility of detection results and\nrequires more explanations. In this work, we provide counterfactual\nexplanations for face forgery detection from an artifact removal perspective.\nSpecifically, we first invert the forgery images into the StyleGAN latent\nspace, and then adversarially optimize their latent representations with the\ndiscrimination supervision from the target detection model. We verify the\neffectiveness of the proposed explanations from two aspects: (1) Counterfactual\nTrace Visualization: the enhanced forgery images are useful to reveal artifacts\nby visually contrasting the original images and two different visualization\nmethods; (2) Transferable Adversarial Attacks: the adversarial forgery images\ngenerated by attacking the detection model are able to mislead other detection\nmodels, implying the removed artifacts are general. Extensive experiments\ndemonstrate that our method achieves over 90% attack success rate and superior\nattack transferability. Compared with naive adversarial noise methods, our\nmethod adopts both generative and discriminative model priors, and optimize the\nlatent representations in a synthesis-by-analysis way, which forces the search\nof counterfactual explanations on the natural face manifold. Thus, more general\ncounterfactual traces can be found and better adversarial attack\ntransferability can be achieved.\n","authors":["Yang Li","Songlin Yang","Wei Wang","Ziwen He","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.08341v1.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2404.07762v2","updated":"2024-04-12T09:13:29Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. 
Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16794v2","updated":"2024-04-12T09:04:05Z","published":"2023-12-28T02:54:34Z","title":"ZONE: Zero-Shot Instruction-Guided Local Editing","summary":" Recent advances in vision-language models like Stable Diffusion have shown\nremarkable power in creative image synthesis and editing.However, most existing\ntext-to-image editing methods encounter two obstacles: First, the text prompt\nneeds to be carefully crafted to achieve good results, which is not intuitive\nor user-friendly. Second, they are insensitive to local edits and can\nirreversibly affect non-edited regions, leaving obvious editing traces. To\ntackle these problems, we propose a Zero-shot instructiON-guided local image\nEditing approach, termed ZONE. We first convert the editing intent from the\nuser-provided instruction (e.g., \"make his tie blue\") into specific image\nediting regions through InstructPix2Pix. We then propose a Region-IoU scheme\nfor precise image layer extraction from an off-the-shelf segment model. We\nfurther develop an edge smoother based on FFT for seamless blending between the\nlayer and the image.Our method allows for arbitrary manipulation of a specific\nregion with a single instruction while preserving the rest. Extensive\nexperiments demonstrate that our ZONE achieves remarkable local editing results\nand user-friendliness, outperforming state-of-the-art methods. Code is\navailable at https://github.com/lsl001006/ZONE.\n","authors":["Shanglin Li","Bohan Zeng","Yutang Feng","Sicheng Gao","Xuhui Liu","Jiaming Liu","Li Lin","Xu Tang","Yao Hu","Jianzhuang Liu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16794v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06567v2","updated":"2024-04-12T08:52:24Z","published":"2024-03-11T10:06:45Z","title":"Leveraging Foundation Models for Content-Based Medical Image Retrieval\n in Radiology","summary":" Content-based image retrieval (CBIR) has the potential to significantly\nimprove diagnostic aid and medical research in radiology. Current CBIR systems\nface limitations due to their specialization to certain pathologies, limiting\ntheir utility. In response, we propose using vision foundation models as\npowerful and versatile off-the-shelf feature extractors for content-based\nmedical image retrieval. By benchmarking these models on a comprehensive\ndataset of 1.6 million 2D radiological images spanning four modalities and 161\npathologies, we identify weakly-supervised models as superior, achieving a P@1\nof up to 0.594. This performance not only competes with a specialized model but\ndoes so without the need for fine-tuning. Our analysis further explores the\nchallenges in retrieving pathological versus anatomical structures, indicating\nthat accurate retrieval of pathological features presents greater difficulty.\nDespite these challenges, our research underscores the vast potential of\nfoundation models for CBIR in radiology, proposing a shift towards versatile,\ngeneral-purpose medical image retrieval systems that do not require specific\ntuning.\n","authors":["Stefan Denner","David Zimmerer","Dimitrios Bounias","Markus Bujotzek","Shuhan Xiao","Lisa Kausch","Philipp Schader","Tobias Penzkofer","Paul F. 
Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2403.06567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08330v1","updated":"2024-04-12T08:46:53Z","published":"2024-04-12T08:46:53Z","title":"Emerging Property of Masked Token for Effective Pre-training","summary":" Driven by the success of Masked Language Modeling (MLM), the realm of\nself-supervised learning for computer vision has been invigorated by the\ncentral role of Masked Image Modeling (MIM) in driving recent breakthroughs.\nNotwithstanding the achievements of MIM across various downstream tasks, its\noverall efficiency is occasionally hampered by the lengthy duration of the\npre-training phase. This paper presents a perspective that the optimization of\nmasked tokens as a means of addressing the prevailing issue. Initially, we\ndelve into an exploration of the inherent properties that a masked token ought\nto possess. Within the properties, we principally dedicated to articulating and\nemphasizing the `data singularity' attribute inherent in masked tokens. Through\na comprehensive analysis of the heterogeneity between masked tokens and visible\ntokens within pre-trained models, we propose a novel approach termed masked\ntoken optimization (MTO), specifically designed to improve model efficiency\nthrough weight recalibration and the enhancement of the key property of masked\ntokens. The proposed method serves as an adaptable solution that seamlessly\nintegrates into any MIM approach that leverages masked tokens. As a result, MTO\nachieves a considerable improvement in pre-training efficiency, resulting in an\napproximately 50% reduction in pre-training epochs required to attain converged\nperformance of the recent approaches.\n","authors":["Hyesong Choi","Hunsang Lee","Seyoung Joung","Hyejin Park","Jiyeong Kim","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01449v2","updated":"2024-04-12T08:40:55Z","published":"2024-03-03T09:07:16Z","title":"DUFOMap: Efficient Dynamic Awareness Mapping","summary":" The dynamic nature of the real world is one of the main challenges in\nrobotics. The first step in dealing with it is to detect which parts of the\nworld are dynamic. A typical benchmark task is to create a map that contains\nonly the static part of the world to support, for example, localization and\nplanning. Current solutions are often applied in post-processing, where\nparameter tuning allows the user to adjust the setting for a specific dataset.\nIn this paper, we propose DUFOMap, a novel dynamic awareness mapping framework\ndesigned for efficient online processing. Despite having the same parameter\nsettings for all scenarios, it performs better or is on par with\nstate-of-the-art methods. Ray casting is utilized to identify and classify\nfully observed empty regions. Since these regions have been observed empty, it\nfollows that anything inside them at another time must be dynamic. Evaluation\nis carried out in various scenarios, including outdoor environments in KITTI\nand Argoverse 2, open areas on the KTH campus, and with different sensor types.\nDUFOMap outperforms the state of the art in terms of accuracy and computational\nefficiency. The source code, benchmarks, and links to the datasets utilized are\nprovided. 
See https://kth-rpl.github.io/dufomap for more details.\n","authors":["Daniel Duberg","Qingwen Zhang","MingKai Jia","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.01449v2.pdf","comment":"The first two authors hold equal contribution. 8 pages, 7 figures,\n project page https://kth-rpl.github.io/dufomap"},{"id":"http://arxiv.org/abs/2404.08327v1","updated":"2024-04-12T08:38:51Z","published":"2024-04-12T08:38:51Z","title":"Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced\n Pre-training","summary":" In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel\nand cost-effective approach that significantly enhances the pre-training\nperformance of Masked Image Modeling (MIM) approaches by prioritizing token\nsalience. Our method provides robustness against variations in masking ratios,\neffectively mitigating the performance instability issues common in existing\nmethods. This relaxes the sensitivity of MIM-based pre-training to masking\nratios, which in turn allows us to propose an adaptive strategy for `tailored'\nmasking ratios for each data sample, which no existing method can provide.\nToward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that\ndynamically adjusts the proportion of masking for the unique content of each\nimage based on token salience. We show that our method significantly improves\nover the state-of-the-art in mask-based pre-training on the ImageNet-1K\ndataset.\n","authors":["Hyesong Choi","Hyejin Park","Kwang Moo Yi","Sungmin Cha","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.14335v2","updated":"2024-04-12T08:37:47Z","published":"2021-09-29T10:41:41Z","title":"A Systematic Survey of Deep Learning-based Single-Image Super-Resolution","summary":" Single-image super-resolution (SISR) is an important task in image\nprocessing, which aims to enhance the resolution of imaging systems. Recently,\nSISR has made a huge leap and has achieved promising results with the help of\ndeep learning (DL). In this survey, we give an overview of DL-based SISR\nmethods and group them according to their design targets. Specifically, we\nfirst introduce the problem definition, research background, and the\nsignificance of SISR. Secondly, we introduce some related works, including\nbenchmark datasets, upsampling methods, optimization objectives, and image\nquality assessment methods. Thirdly, we provide a detailed investigation of\nSISR and give some domain-specific applications of it. Fourthly, we present the\nreconstruction results of some classic SISR methods to intuitively know their\nperformance. Finally, we discuss some issues that still exist in SISR and\nsummarize some new trends and future directions. This is an exhaustive survey\nof SISR, which can help researchers better understand SISR and inspire more\nexciting research in this field. An investigation project for SISR is provided\nat https://github.com/CV-JunchengLi/SISR-Survey.\n","authors":["Juncheng Li","Zehua Pei","Wenjie Li","Guangwei Gao","Longguang Wang","Yingqian Wang","Tieyong Zeng"],"pdf_url":"https://arxiv.org/pdf/2109.14335v2.pdf","comment":"40 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.07537v2","updated":"2024-04-12T08:18:44Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? 
Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08312v1","updated":"2024-04-12T08:14:17Z","published":"2024-04-12T08:14:17Z","title":"GPN: Generative Point-based NeRF","summary":" Scanning real-life scenes with modern registration devices typically gives\nincomplete point cloud representations, primarily due to the limitations of\npartial scanning, 3D occlusions, and dynamic light conditions. Recent works on\nprocessing incomplete point clouds have always focused on point cloud\ncompletion. However, these approaches do not ensure consistency between the\ncompleted point cloud and the captured images regarding color and geometry. We\npropose using Generative Point-based NeRF (GPN) to reconstruct and repair a\npartial cloud by fully utilizing the scanning images and the corresponding\nreconstructed cloud. The repaired point cloud can achieve multi-view\nconsistency with the captured images at high spatial resolution. For the\nfinetunes of a single scene, we optimize the global latent condition by\nincorporating an Auto-Decoder architecture while retaining multi-view\nconsistency. As a result, the generated point clouds are smooth, plausible, and\ngeometrically consistent with the partial scanning images. 
Extensive\nexperiments on ShapeNet demonstrate that our work achieves performance\ncompetitive with other state-of-the-art point cloud-based neural scene\nrendering and editing methods.\n","authors":["Haipeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v4","updated":"2024-04-12T07:48:45Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17759v4","updated":"2024-04-12T07:44:25Z","published":"2024-01-31T11:36:12Z","title":"Rapid post-disaster infrastructure damage characterisation enabled by\n remote sensing and deep learning technologies -- a tiered approach","summary":" Critical infrastructure, such as transport networks and bridges, is\nsystematically targeted during wars and suffers damage during extensive natural\ndisasters because it is vital for enabling connectivity and transportation of\npeople and goods, and hence, underpins national and international economic\ngrowth. Mass destruction of transport assets, in conjunction with minimal or no\naccessibility in the wake of natural and anthropogenic disasters, prevents us\nfrom delivering rapid recovery and adaptation. As a result, systemic\noperability is drastically reduced, leading to low levels of resilience. Thus,\nthere is a need for rapid assessment of its condition to allow for informed\ndecision-making for restoration prioritisation. A solution to this challenge is\nto use technology that enables stand-off observations. 
Nevertheless, no methods\nexist for automated characterisation of damage at multiple scales, i.e.\nregional (e.g., network), asset (e.g., bridges), and structural (e.g., road\npavement) scales. We propose a methodology based on an integrated, multi-scale\ntiered approach to fill this capability gap. In doing so, we demonstrate how\nautomated damage characterisation can be enabled by fit-for-purpose digital\ntechnologies. Next, the methodology is applied and validated to a case study in\nUkraine that includes 17 bridges, damaged by human targeted interventions. From\nregional to component scale, we deploy technology to integrate assessments\nusing Sentinel-1 SAR images, crowdsourced information, and high-resolution\nimages for deep learning to facilitate automatic damage detection and\ncharacterisation. For the first time, the interferometric coherence difference\nand semantic segmentation of images were deployed in a tiered multi-scale\napproach to improve the reliability of damage characterisations at different\nscales.\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v4.pdf","comment":"43 pages; 20 figures"},{"id":"http://arxiv.org/abs/2310.12877v4","updated":"2024-04-12T07:43:35Z","published":"2023-10-19T16:32:18Z","title":"Perceptual Assessment and Optimization of High Dynamic Range Image\n Rendering","summary":" High dynamic range (HDR) rendering has the ability to faithfully reproduce\nthe wide luminance ranges in natural scenes, but how to accurately assess the\nrendering quality is relatively underexplored. Existing quality models are\nmostly designed for low dynamic range (LDR) images, and do not align well with\nhuman perception of HDR image quality. To fill this gap, we propose a family of\nHDR quality metrics, in which the key step is employing a simple inverse\ndisplay model to decompose an HDR image into a stack of LDR images with varying\nexposures. Subsequently, these decomposed images are assessed through\nwell-established LDR quality metrics. Our HDR quality models present three\ndistinct benefits. First, they directly inherit the recent advancements of LDR\nquality metrics. Second, they do not rely on human perceptual data of HDR image\nquality for re-calibration. Third, they facilitate the alignment and\nprioritization of specific luminance ranges for more accurate and detailed\nquality assessment. Experimental results show that our HDR quality metrics\nconsistently outperform existing models in terms of quality assessment on four\nHDR image quality datasets and perceptual optimization of HDR novel view\nsynthesis.\n","authors":["Peibei Cao","Rafal K. Mantiuk","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2310.12877v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08298v1","updated":"2024-04-12T07:41:17Z","published":"2024-04-12T07:41:17Z","title":"Interference Motion Removal for Doppler Radar Vital Sign Detection Using\n Variational Encoder-Decoder Neural Network","summary":" The treatment of interfering motion contributions remains one of the key\nchallenges in the domain of radar-based vital sign monitoring. Removal of the\ninterference to extract the vital sign contributions is demanding due to\noverlapping Doppler bands, the complex structure of the interference motions\nand significant variations in the power levels of their contributions. 
A novel\napproach to the removal of interference through the use of a probabilistic deep\nlearning model is presented. Results show that a convolutional encoder-decoder\nneural network with a variational objective is capable of learning a meaningful\nrepresentation space of vital sign Doppler-time distributions, facilitating their\nextraction from a mixture signal. The approach is tested on semi-experimental\ndata containing real vital sign signatures and simulated returns from\ninterfering body motions. It is demonstrated that the application of the\nproposed network enhances the extraction of the micro-Doppler frequency\ncorresponding to the respiration rate.\n","authors":["Mikolaj Czerkawski","Christos Ilioudis","Carmine Clemente","Craig Michie","Ivan Andonovic","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08298v1.pdf","comment":"Presented at 2021 IEEE Radar Conference (RadarConf21)"},{"id":"http://arxiv.org/abs/2404.08293v1","updated":"2024-04-12T07:30:52Z","published":"2024-04-12T07:30:52Z","title":"Overcoming Scene Context Constraints for Object Detection in wild using\n Defilters","summary":" This paper focuses on improving object detection performance by addressing\nthe issue of image distortions, commonly encountered in uncontrolled\nacquisition environments. High-level computer vision tasks such as object\ndetection, recognition, and segmentation are particularly sensitive to image\ndistortion. To address this issue, we propose a novel approach employing an\nimage defilter to rectify image distortion prior to object detection. This\nmethod enhances object detection accuracy, as models perform optimally when\ntrained on non-distorted images. Our experiments demonstrate that utilizing\ndefiltered images significantly improves mean average precision compared to\ntraining object detection models on distorted images. Consequently, our\nproposed method offers considerable benefits for real-world applications\nplagued by image distortion. To our knowledge, the contribution lies in\nemploying a distortion-removal paradigm for object detection on images captured\nin natural settings. We achieved improvements of 0.562 and 0.564 in mean\naverage precision on validation and test data.\n","authors":["Vamshi Krishna Kancharla","Neelam sinha"],"pdf_url":"https://arxiv.org/pdf/2404.08293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08292v1","updated":"2024-04-12T07:30:24Z","published":"2024-04-12T07:30:24Z","title":"AdaContour: Adaptive Contour Descriptor with Hierarchical Representation","summary":" Existing angle-based contour descriptors suffer from lossy representation for\nnon-starconvex shapes. By and large, this is the result of the shape being\nregistered with a single global inner center and a set of radii corresponding\nto a polar coordinate parameterization. In this paper, we propose AdaContour,\nan adaptive contour descriptor that uses multiple local representations to\ndesirably characterize complex shapes. After hierarchically encoding object\nshapes in a training set and constructing a contour matrix of all subdivided\nregions, we compute a robust low-rank subspace and approximate each\nlocal contour by linearly combining the shared basis vectors to represent an\nobject. Experiments show that AdaContour is able to represent shapes more\naccurately and robustly than other descriptors while retaining effectiveness.\nWe validate AdaContour by integrating it into off-the-shelf detectors to enable\ninstance segmentation, which demonstrates faithful performance. 
The code is\navailable at https://github.com/tding1/AdaContour.\n","authors":["Tianyu Ding","Jinxin Zhou","Tianyi Chen","Zhihui Zhu","Ilya Zharkov","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08291v1","updated":"2024-04-12T07:30:08Z","published":"2024-04-12T07:30:08Z","title":"On Input Formats for Radar Micro-Doppler Signature Processing by\n Convolutional Neural Networks","summary":" Convolutional neural networks have often been proposed for processing radar\nMicro-Doppler signatures, most commonly with the goal of classifying the\nsignals. The majority of works tend to disregard phase information from the\ncomplex time-frequency representation. Here, the utility of the phase\ninformation, as well as the optimal format of the Doppler-time input for a\nconvolutional neural network, is analysed. It is found that the performance\nachieved by convolutional neural network classifiers is heavily influenced by\nthe type of input representation, even across formats with equivalent\ninformation. Furthermore, it is demonstrated that the phase component of the\nDoppler-time representation contains rich information useful for classification\nand that unwrapping the phase in the temporal dimension can improve the results\ncompared to a magnitude-only solution, improving accuracy from 0.920 to 0.938\non the tested human activity dataset. Further improvement of 0.947 is achieved\nby training a linear classifier on embeddings from multiple-formats.\n","authors":["Mikolaj Czerkawski","Carmine Clemente","Craig Michie","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08291v1.pdf","comment":"Presented at International Conference on Radar Systems (RADAR 2022)"},{"id":"http://arxiv.org/abs/2404.08285v1","updated":"2024-04-12T07:19:16Z","published":"2024-04-12T07:19:16Z","title":"A Survey of Neural Network Robustness Assessment in Image Recognition","summary":" In recent years, there has been significant attention given to the robustness\nassessment of neural networks. Robustness plays a critical role in ensuring\nreliable operation of artificial intelligence (AI) systems in complex and\nuncertain environments. Deep learning's robustness problem is particularly\nsignificant, highlighted by the discovery of adversarial attacks on image\nclassification models. Researchers have dedicated efforts to evaluate\nrobustness in diverse perturbation conditions for image recognition tasks.\nRobustness assessment encompasses two main techniques: robustness verification/\ncertification for deliberate adversarial attacks and robustness testing for\nrandom data corruptions. In this survey, we present a detailed examination of\nboth adversarial robustness (AR) and corruption robustness (CR) in neural\nnetwork assessment. Analyzing current research papers and standards, we provide\nan extensive overview of robustness assessment in image recognition. Three\nessential aspects are analyzed: concepts, metrics, and assessment methods. We\ninvestigate the perturbation metrics and range representations used to measure\nthe degree of perturbations on images, as well as the robustness metrics\nspecifically for the robustness conditions of classification models. 
The\nstrengths and limitations of the existing methods are also discussed, and some\npotential directions for future research are provided.\n","authors":["Jie Wang","Jun Ai","Minyan Lu","Haoran Su","Dan Yu","Yutao Zhang","Junda Zhu","Jingyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08281v1","updated":"2024-04-12T07:13:32Z","published":"2024-04-12T07:13:32Z","title":"Calibration & Reconstruction: Deep Integrated Language for Referring\n Image Segmentation","summary":" Referring image segmentation aims to segment an object referred to by a natural\nlanguage expression from an image. The primary challenge lies in the efficient\npropagation of fine-grained semantic information from textual features to\nvisual features. Many recent works utilize a Transformer to address this\nchallenge. However, conventional transformer decoders can distort linguistic\ninformation with deeper layers, leading to suboptimal results. In this paper,\nwe introduce CRFormer, a model that iteratively calibrates multi-modal features\nin the transformer decoder. We start by generating language queries using\nvision features, emphasizing different aspects of the input language. Then, we\npropose a novel Calibration Decoder (CDec) wherein the multi-modal features can\nbe iteratively calibrated by the input language features. In the Calibration\nDecoder, we use the output of each decoder layer and the original language\nfeatures to generate new queries for continuous calibration, which gradually\nupdates the language features. Based on CDec, we introduce a Language\nReconstruction Module and a reconstruction loss. This module leverages queries\nfrom the final layer of the decoder to reconstruct the input language and\ncompute the reconstruction loss. This can further prevent the language\ninformation from being lost or distorted. Our experiments consistently show the\nsuperior performance of our approach across RefCOCO, RefCOCO+, and G-Ref\ndatasets compared to state-of-the-art methods.\n","authors":["Yichen Yan","Xingjian He","Sihan Chen","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08281v1.pdf","comment":"9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with\n arXiv:2305.14969"},{"id":"http://arxiv.org/abs/2404.08279v1","updated":"2024-04-12T07:08:05Z","published":"2024-04-12T07:08:05Z","title":"Convolutional neural network classification of cancer cytopathology\n images: taking breast cancer as an example","summary":" Breast cancer is a relatively common cancer among gynecological cancers. Its\ndiagnosis often relies on the pathology of cells in the lesion. The\npathological diagnosis of breast cancer not only requires professionals and\ntime, but also sometimes involves subjective judgment. To address the\nchallenges of dependence on pathologists' expertise and the time-consuming\nnature of achieving accurate breast pathological image classification, this\npaper introduces an approach utilizing convolutional neural networks (CNNs) for\nthe rapid categorization of pathological images, aiming to enhance the\nefficiency of breast pathological image detection. The approach enables the\nrapid and automatic classification of pathological images into benign and\nmalignant groups. The methodology involves utilizing a convolutional neural\nnetwork (CNN) model leveraging the Inceptionv3 architecture and transfer\nlearning algorithm for extracting features from pathological images. 
A neural network with fully connected layers and the SoftMax function is then\nutilized for image classification. Additionally, the concept of image partitioning is\nintroduced to handle high-resolution images. To achieve the ultimate\nclassification outcome, the classification probabilities of each image block\nare aggregated using three algorithms: summation, product, and maximum.\nExperimental validation was conducted on the BreaKHis public dataset, resulting\nin accuracy rates surpassing 0.92 across all four magnification coefficients\n(40X, 100X, 200X, and 400X). This demonstrates that the proposed method\neffectively enhances the accuracy in classifying pathological images of breast\ncancer.\n","authors":["MingXuan Xiao","Yufeng Li","Xu Yan","Min Gao","Weimin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02492v3","updated":"2024-04-12T07:06:52Z","published":"2023-10-03T23:44:35Z","title":"FairVision: Equitable Deep Learning for Eye Disease Screening via Fair\n Identity Scaling","summary":" Equity in AI for healthcare is crucial due to its direct impact on human\nwell-being. Despite advancements in 2D medical imaging fairness, the fairness\nof 3D models remains underexplored, hindered by the small sizes of 3D fairness\ndatasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is\ncritical to understand the fairness of these 3D models. To address this\nresearch gap, we conduct the first comprehensive study on the fairness of 3D\nmedical imaging models across multiple protected attributes. Our investigation\nspans both 2D and 3D models and evaluates fairness across five architectures on\nthree common eye diseases, revealing significant biases across race, gender,\nand ethnicity. To alleviate these biases, we propose a novel fair identity\nscaling (FIS) method that improves both overall performance and fairness,\noutperforming various SOTA fairness methods. Moreover, we release\nHarvard-FairVision, the first large-scale medical fairness dataset with 30,000\nsubjects featuring both 2D and 3D imaging data and six demographic identity\nattributes. Harvard-FairVision provides labels for three major eye disorders\naffecting about 380 million people worldwide, serving as a valuable resource\nfor both 2D and 3D fairness learning. Our code and dataset are publicly\naccessible at\n\\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}.\n","authors":["Yan Luo","Muhammad Osama Khan","Yu Tian","Min Shi","Zehao Dou","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08277v1","updated":"2024-04-12T07:04:56Z","published":"2024-04-12T07:04:56Z","title":"FaceFilterSense: A Filter-Resistant Face Recognition and Facial\n Attribute Analysis Framework","summary":" With the advent of social media, fun selfie filters have come into tremendous\nmainstream use affecting the functioning of facial biometric systems as well as\nimage recognition systems. These filters vary from beautification filters and\nAugmented Reality (AR)-based filters to filters that modify facial landmarks.\nHence, there is a need to assess the impact of such filters on the performance\nof existing face recognition systems. 
The limitation associated with existing\nsolutions is that these solutions focus more on the beautification filters.\nHowever, the current AR-based filters and filters which distort facial key\npoints are in vogue recently and make the faces highly unrecognizable even to\nthe naked eye. Also, the filters considered are mostly obsolete with limited\nvariations. To mitigate these limitations, we aim to perform a holistic impact\nanalysis of the latest filters and propose an user recognition model with the\nfiltered images. We have utilized a benchmark dataset for baseline images, and\napplied the latest filters over them to generate a beautified/filtered dataset.\nNext, we have introduced a model FaceFilterNet for beautified user recognition.\nIn this framework, we also utilize our model to comment on various attributes\nof the person including age, gender, and ethnicity. In addition, we have also\npresented a filter-wise impact analysis on face recognition, age estimation,\ngender, and ethnicity prediction. The proposed method affirms the efficacy of\nour dataset with an accuracy of 87.25% and an optimal accuracy for facial\nattribute analysis.\n","authors":["Shubham Tiwari","Yash Sethia","Ritesh Kumar","Ashwani Tanwar","Rudresh Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.08277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08273v1","updated":"2024-04-12T06:52:40Z","published":"2024-04-12T06:52:40Z","title":"Struggle with Adversarial Defense? Try Diffusion","summary":" Adversarial attacks induce misclassification by introducing subtle\nperturbations. Recently, diffusion models are applied to the image classifiers\nto improve adversarial robustness through adversarial training or by purifying\nadversarial noise. However, diffusion-based adversarial training often\nencounters convergence challenges and high computational expenses.\nAdditionally, diffusion-based purification inevitably causes data shift and is\ndeemed susceptible to stronger adaptive attacks. To tackle these issues, we\npropose the Truth Maximization Diffusion Classifier (TMDC), a generative\nBayesian classifier that builds upon pre-trained diffusion models and the\nBayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian\nprinciples, utilizes the conditional likelihood from diffusion models to\ndetermine the class probabilities of input images, thereby insulating against\nthe influences of data shift and the limitations of adversarial training.\nMoreover, to enhance TMDC's resilience against more potent adversarial attacks,\nwe propose an optimization strategy for diffusion classifiers. 
This strategy\ninvolves post-training the diffusion model on perturbed datasets with\nground-truth labels as conditions, guiding the diffusion model to learn the\ndata distribution and maximizing the likelihood under the ground-truth labels.\nThe proposed method achieves state-of-the-art performance on the CIFAR10\ndataset against heavy white-box attacks and strong adaptive attacks.\nSpecifically, TMDC achieves robust accuracies of 82.81% against $l_{\\infty}$\nnorm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded\nperturbations, respectively, with $\\epsilon=0.05$.\n","authors":["Yujie Li","Yanbin Wang","Haitao xu","Bin Liu","Jianguo Sun","Zhenhao Guo","Wenrui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05516v2","updated":"2024-04-12T06:51:06Z","published":"2022-06-11T12:39:37Z","title":"Deep Learning-Based MR Image Re-parameterization","summary":" Magnetic resonance (MR) image re-parameterization refers to the process of\ngenerating via simulations of an MR image with a new set of MRI scanning\nparameters. Different parameter values generate distinct contrast between\ndifferent tissues, helping identify pathologic tissue. Typically, more than one\nscan is required for diagnosis; however, acquiring repeated scans can be\ncostly, time-consuming, and difficult for patients. Thus, using MR image\nre-parameterization to predict and estimate the contrast in these imaging scans\ncan be an effective alternative. In this work, we propose a novel deep learning\n(DL) based convolutional model for MRI re-parameterization. Based on our\npreliminary results, DL-based techniques hold the potential to learn the\nnon-linearities that govern the re-parameterization.\n","authors":["Abhijeet Narang","Abhigyan Raj","Mihaela Pop","Mehran Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2206.05516v2.pdf","comment":"A. Narang, A. Raj, M. Pop and M. Ebrahimi, \"Deep Learning-Based MR\n Image Re-parameterization,\" 2023 Congress in Computer Science, Computer\n Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp.\n 536-541, doi: 10.1109/CSCE60160.2023.00094"},{"id":"http://arxiv.org/abs/2303.03761v2","updated":"2024-04-12T06:42:47Z","published":"2023-03-07T09:56:23Z","title":"Graph Neural Networks in Vision-Language Image Understanding: A Survey","summary":" 2D image understanding is a complex problem within computer vision, but it\nholds the key to providing human-level scene comprehension. It goes further\nthan identifying the objects in an image, and instead, it attempts to\nunderstand the scene. Solutions to this problem form the underpinning of a\nrange of tasks, including image captioning, visual question answering (VQA),\nand image retrieval. Graphs provide a natural way to represent the relational\narrangement between objects in an image, and thus, in recent years graph neural\nnetworks (GNNs) have become a standard component of many 2D image understanding\npipelines, becoming a core architectural component, especially in the VQA group\nof tasks. In this survey, we review this rapidly evolving field and we provide\na taxonomy of graph types used in 2D image understanding approaches, a\ncomprehensive list of the GNN models used in this domain, and a roadmap of\nfuture potential developments. 
To the best of our knowledge, this is the first\ncomprehensive survey that covers image captioning, visual question answering,\nand image retrieval techniques that focus on using GNNs as the main part of\ntheir architecture.\n","authors":["Henry Senior","Gregory Slabaugh","Shanxin Yuan","Luca Rossi"],"pdf_url":"https://arxiv.org/pdf/2303.03761v2.pdf","comment":"20 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.08264v1","updated":"2024-04-12T06:23:48Z","published":"2024-04-12T06:23:48Z","title":"Guided Masked Self-Distillation Modeling for Distributed Multimedia\n Sensor Event Analysis","summary":" Observations with distributed sensors are essential in analyzing a series of\nhuman and machine activities (referred to as 'events' in this paper) in complex\nand extensive real-world environments. This is because the information obtained\nfrom a single sensor is often missing or fragmented in such an environment;\nobservations from multiple locations and modalities should be integrated to\nanalyze events comprehensively. However, a learning method has yet to be\nestablished to extract joint representations that effectively combine such\ndistributed observations. Therefore, we propose Guided Masked sELf-Distillation\nmodeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea\nof Guided-MELD is to learn to supplement the information from the masked sensor\nwith information from other sensors needed to detect the event. Guided-MELD is\nexpected to enable the system to effectively distill the fragmented or\nredundant target event information obtained by the sensors without being overly\ndependent on any specific sensors. To validate the effectiveness of the\nproposed method in novel tasks of distributed multimedia sensor event analysis,\nwe recorded two new datasets that fit the problem setting: MM-Store and\nMM-Office. These datasets consist of human activities in a convenience store\nand an office, recorded using distributed cameras and microphones. Experimental\nresults on these datasets show that the proposed Guided-MELD improves event\ntagging and detection performance and outperforms conventional inter-sensor\nrelationship modeling methods. Furthermore, the proposed method performed\nrobustly even when sensors were reduced.\n","authors":["Masahiro Yasuda","Noboru Harada","Yasunori Ohishi","Shoichiro Saito","Akira Nakayama","Nobutaka Ono"],"pdf_url":"https://arxiv.org/pdf/2404.08264v1.pdf","comment":"13page, 7figure, under review"},{"id":"http://arxiv.org/abs/2312.16837v3","updated":"2024-04-12T06:23:45Z","published":"2023-12-28T05:46:26Z","title":"DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation\n by Combining 3D GANs and Diffusion Priors","summary":" Text-guided domain adaptation and generation of 3D-aware portraits find many\napplications in various fields. However, due to the lack of training data and\nthe challenges in handling the high variety of geometry and appearance, the\nexisting methods for these tasks suffer from issues like inflexibility,\ninstability, and low fidelity. In this paper, we propose a novel framework\nDiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by\ncombining 3D GANs and diffusion priors. Specifically, we integrate the\npre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion\nmodels. The former provides a strong foundation for stable and high-quality\navatar generation from text. 
And the diffusion models in turn offer powerful\npriors and guide the 3D generator finetuning with informative direction to\nachieve flexible and efficient text-guided domain adaptation. To enhance the\ndiversity in domain adaptation and the generation capability in text-to-avatar,\nwe introduce the relative distance loss and case-specific learnable triplane\nrespectively. Besides, we design a progressive texture refinement module to\nimprove the texture quality for both tasks above. Extensive experiments\ndemonstrate that the proposed framework achieves excellent results in both\ndomain adaptation and text-to-avatar tasks, outperforming existing methods in\nterms of generation quality and efficiency. The project homepage is at\nhttps://younglbw.github.io/DiffusionGAN3D-homepage/.\n","authors":["Biwen Lei","Kai Yu","Mengyang Feng","Miaomiao Cui","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2312.16837v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05268v2","updated":"2024-04-12T06:20:49Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08255v1","updated":"2024-04-12T06:09:24Z","published":"2024-04-12T06:09:24Z","title":"Practical Region-level Attack against Segment Anything Models","summary":" Segment Anything Models (SAM) have made significant advancements in image\nsegmentation, allowing users to segment target portions of an image with a\nsingle click (i.e., user prompt). Given its broad applications, the robustness\nof SAM against adversarial attacks is a critical concern. While recent works\nhave explored adversarial attacks against a pre-defined prompt/click, their\nthreat model is not yet realistic: (1) they often assume the user-click\nposition is known to the attacker (point-based attack), and (2) they often\noperate under a white-box setting with limited transferability. 
In this paper,\nwe propose a more practical region-level attack where attackers do not need to\nknow the precise user prompt. The attack remains effective as the user clicks\non any point on the target object in the image, hiding the object from SAM.\nAlso, by adapting a spectrum transformation method, we make the attack more\ntransferable under a black-box setting. Both control experiments and testing\nagainst real-world SAM services confirm its effectiveness.\n","authors":["Yifan Shen","Zhengyuan Li","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08252v1","updated":"2024-04-12T05:43:10Z","published":"2024-04-12T05:43:10Z","title":"MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based\n Monocular Guidance","summary":" The latest regularized Neural Radiance Field (NeRF) approaches produce poor\ngeometry and view extrapolation for multiview stereo (MVS) benchmarks such as\nETH3D. In this paper, we aim to create 3D models that provide accurate geometry\nand view synthesis, partially closing the large geometric performance gap\nbetween NeRF and traditional MVS methods. We propose a patch-based approach\nthat effectively leverages monocular surface normal and relative depth\npredictions. The patch-based ray sampling also enables the appearance\nregularization of normalized cross-correlation (NCC) and structural similarity\n(SSIM) between randomly sampled virtual and training views. We further show\nthat \"density restrictions\" based on sparse structure-from-motion points can\nhelp greatly improve geometric accuracy with a slight drop in novel view\nsynthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x\nthat of FreeNeRF on average F1@2cm for ETH3D MVS benchmark, suggesting a\nfruitful research direction to improve the geometric accuracy of NeRF-based\nmodels, and sheds light on a potential future approach to enable NeRF-based\noptimization to eventually outperform traditional MVS.\n","authors":["Yuqun Wu","Jae Yong Lee","Chuhang Zou","Shenlong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2404.08252v1.pdf","comment":"26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2309.08966v2","updated":"2024-04-12T05:34:02Z","published":"2023-09-16T11:42:41Z","title":"FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering\n and Local to Global Optimization","summary":" Cross-modality point cloud registration is confronted with significant\nchallenges due to inherent differences in modalities between different sensors.\nWe propose a cross-modality point cloud registration framework FF-LOGO: a\ncross-modality point cloud registration method with feature filtering and\nlocal-global optimization. The cross-modality feature correlation filtering\nmodule extracts geometric transformation-invariant features from cross-modality\npoint clouds and achieves point selection by feature matching. We also\nintroduce a cross-modality optimization process, including a local adaptive key\nregion aggregation module and a global modality consistency fusion optimization\nmodule. Experimental results demonstrate that our two-stage optimization\nsignificantly improves the registration accuracy of the feature association and\nselection module. Our method achieves a substantial increase in recall rate\ncompared to the current state-of-the-art methods on the 3DCSR dataset,\nimproving from 40.59% to 75.74%. 
Our code will be available at\nhttps://github.com/wangmohan17/FFLOGO.\n","authors":["Nan Ma","Mohan Wang","Yiheng Han","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.08966v2.pdf","comment":"Accepted by 2024 IEEE International Conference on Robotics and\n Automation (ICRA),7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.15070v3","updated":"2024-04-12T05:26:59Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, a general restoration pipeline that could handle\ndifferent blind image restoration tasks in a unified framework. DiffBIR\ndecouples blind image restoration problem into two stages: 1) degradation\nremoval: removing image-independent content; 2) information regeneration:\ngenerating the lost image content. Each stage is developed independently but\nthey work seamlessly in a cascaded manner. In the first stage, we use\nrestoration modules to remove degradations and obtain high-fidelity restored\nresults. For the second stage, we propose IRControlNet that leverages the\ngenerative ability of latent diffusion models to generate realistic details.\nSpecifically, IRControlNet is trained based on specially produced condition\nimages without distracting noisy content for stable generation performance.\nMoreover, we design a region-adaptive restoration guidance that can modify the\ndenoising process during inference without model re-training, allowing users to\nbalance realness and fidelity through a tunable guidance scale. Extensive\nexperiments have demonstrated DiffBIR's superiority over state-of-the-art\napproaches for blind image super-resolution, blind face restoration and blind\nimage denoising tasks on both synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Bo Dai","Fanghua Yu","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10356v3","updated":"2024-04-12T05:07:28Z","published":"2023-09-19T06:32:19Z","title":"RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene\n Parsing","summary":" The recent advancements in deep convolutional neural networks have shown\nsignificant promise in the domain of road scene parsing. Nevertheless, the\nexisting works focus primarily on freespace detection, with little attention\ngiven to hazardous road defects that could compromise both driving safety and\ncomfort. In this paper, we introduce RoadFormer, a novel Transformer-based\ndata-fusion network developed for road scene parsing. RoadFormer utilizes a\nduplex encoder architecture to extract heterogeneous features from both RGB\nimages and surface normal information. The encoded features are subsequently\nfed into a novel heterogeneous feature synergy block for effective feature\nfusion and recalibration. The pixel decoder then learns multi-scale long-range\ndependencies from the fused and recalibrated heterogeneous features, which are\nsubsequently processed by a Transformer decoder to produce the final semantic\nprediction. Additionally, we release SYN-UDTIRI, the first large-scale road\nscene parsing dataset that contains over 10,407 RGB images, dense depth images,\nand the corresponding pixel-level annotations for both freespace and road\ndefects of different shapes and sizes. 
Extensive experimental evaluations\nconducted on our SYN-UDTIRI dataset, as well as on three public datasets,\nincluding KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer\noutperforms all other state-of-the-art networks for road scene parsing.\nSpecifically, RoadFormer ranks first on the KITTI road benchmark. Our source\ncode, created dataset, and demo video are publicly available at\nmias.group/RoadFormer.\n","authors":["Jiahang Li","Yikang Zhang","Peng Yun","Guangliang Zhou","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2309.10356v3.pdf","comment":"9 pages 7 figures. Accepted by Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2403.14047v2","updated":"2024-04-12T05:07:27Z","published":"2024-03-21T00:09:04Z","title":"Accelerating ViT Inference on FPGA through Static and Dynamic Pruning","summary":" Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various\ncomputer vision tasks. However, their high computational complexity prevents\nthem from being applied to many real-world applications. Weight and token\npruning are two well-known methods for reducing complexity: weight pruning\nreduces the model size and associated computational demands, while token\npruning further dynamically reduces the computation based on the input.\nCombining these two techniques should significantly reduce computation\ncomplexity and model size; however, naively integrating them results in\nirregular computation patterns, leading to significant accuracy drops and\ndifficulties in hardware acceleration.\n Addressing the above challenges, we propose a comprehensive\nalgorithm-hardware codesign for accelerating ViT on FPGA through simultaneous\npruning -combining static weight pruning and dynamic token pruning. For\nalgorithm design, we systematically combine a hardware-aware structured\nblock-pruning method for pruning model parameters and a dynamic token pruning\nmethod for removing unimportant token vectors. Moreover, we design a novel\ntraining algorithm to recover the model's accuracy. For hardware design, we\ndevelop a novel hardware accelerator for executing the pruned model. The\nproposed hardware design employs multi-level parallelism with load balancing\nstrategy to efficiently deal with the irregular computation pattern led by the\ntwo pruning approaches. Moreover, we develop an efficient hardware mechanism\nfor efficiently executing the on-the-fly token pruning.\n","authors":["Dhruv Parikh","Shouyi Li","Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2403.14047v2.pdf","comment":"FCCM 2024"},{"id":"http://arxiv.org/abs/2208.07463v4","updated":"2024-04-12T04:48:48Z","published":"2022-08-15T22:51:23Z","title":"Conv-Adapter: Exploring Parameter Efficient Transfer Learning for\n ConvNets","summary":" While parameter efficient tuning (PET) methods have shown great potential\nwith transformer architecture on Natural Language Processing (NLP) tasks, their\neffectiveness with large-scale ConvNets is still under-studied on Computer\nVision (CV) tasks. This paper proposes Conv-Adapter, a PET module designed for\nConvNets. Conv-Adapter is light-weight, domain-transferable, and\narchitecture-agnostic with generalized performance on different tasks. When\ntransferring on downstream tasks, Conv-Adapter learns tasks-specific feature\nmodulation to the intermediate representations of backbones while keeping the\npre-trained parameters frozen. 
It introduces only a tiny amount of learnable\nparameters, e.g., only 3.5% of the full fine-tuning parameters of ResNet50, and\ncan also be applied to transformer-based backbones. Conv-Adapter outperforms\nprevious PET baseline methods and achieves performance comparable to or\nsurpassing that of full fine-tuning on 23 classification tasks across various domains.\nIt also presents superior performance on few-shot classification with an\naverage margin of 3.39%. Beyond classification, Conv-Adapter can generalize to\ndetection and segmentation tasks with a more than 50% reduction in parameters but\nperformance comparable to traditional full fine-tuning.\n","authors":["Hao Chen","Ran Tao","Han Zhang","Yidong Wang","Xiang Li","Wei Ye","Jindong Wang","Guosheng Hu","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2208.07463v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08238v1","updated":"2024-04-12T04:45:51Z","published":"2024-04-12T04:45:51Z","title":"Simulation of a Vision Correction Display System","summary":" Eyes serve as our primary sensory organs, responsible for processing up to\n80\\% of our sensory input. However, common visual aberrations like myopia and\nhyperopia affect a significant portion of the global population. This paper\nfocuses on simulating a Vision Correction Display (VCD) to enhance the visual\nexperience of individuals with various visual impairments. Utilising Blender,\nwe digitally model the functionality of a VCD in correcting refractive errors\nsuch as myopia and hyperopia. With these simulations we can see potential\nimprovements in visual acuity and comfort. These simulations provide valuable\ninsights for the design and development of future VCD technologies, ultimately\nadvancing accessibility and usability for individuals with visual challenges.\n","authors":["Vidya Sunil","Renu M Rameshan"],"pdf_url":"https://arxiv.org/pdf/2404.08238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08237v1","updated":"2024-04-12T04:44:11Z","published":"2024-04-12T04:44:11Z","title":"IFViT: Interpretable Fixed-Length Representation for Fingerprint\n Matching via Vision Transformer","summary":" Determining dense feature points on fingerprints used in constructing deep\nfixed-length representations for accurate matching, particularly at the pixel\nlevel, is of significant interest. To explore the interpretability of\nfingerprint matching, we propose a multi-stage interpretable fingerprint\nmatching network, namely Interpretable Fixed-length Representation for\nFingerprint Matching via Vision Transformer (IFViT), which consists of two\nprimary modules. The first module, an interpretable dense registration module,\nestablishes a Vision Transformer (ViT)-based Siamese Network to capture\nlong-range dependencies and the global context in fingerprint pairs. It\nprovides interpretable dense pixel-wise correspondences of feature points for\nfingerprint alignment and enhances the interpretability in the subsequent\nmatching stage. The second module takes into account both local and global\nrepresentations of the aligned fingerprint pair to achieve an interpretable\nfixed-length representation extraction and matching. It employs the ViTs\ntrained in the first module with the additional fully connected layer and\nretrains them to simultaneously produce the discriminative fixed-length\nrepresentation and interpretable dense pixel-wise correspondences of feature\npoints. 
Extensive experimental results on diverse publicly available\nfingerprint databases demonstrate that the proposed framework not only exhibits\nsuperior performance on dense registration and matching but also significantly\npromotes the interpretability in deep fixed-length representations-based\nfingerprint matching.\n","authors":["Yuhang Qiu","Honghui Chen","Xingbo Dong","Zheng Lin","Iman Yi Liao","Massimo Tistarelli","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2404.08237v1.pdf","comment":"ready to submit to IEEE Transactions on Information Forensics and\n Security (TIFS)"},{"id":"http://arxiv.org/abs/2302.06874v2","updated":"2024-04-12T04:42:29Z","published":"2023-02-14T07:39:37Z","title":"Robust Representation Learning with Self-Distillation for Domain\n Generalization","summary":" Despite the recent success of deep neural networks, there remains a need for\neffective methods to enhance domain generalization using vision transformers.\nIn this paper, we propose a novel domain generalization technique called Robust\nRepresentation Learning with Self-Distillation (RRLD) comprising i)\nintermediate-block self-distillation and ii) augmentation-guided\nself-distillation to improve the generalization capabilities of\ntransformer-based models on unseen domains. This approach enables the network\nto learn robust and general features that are invariant to different\naugmentations and domain shifts while effectively mitigating overfitting to\nsource domains. To evaluate the effectiveness of our proposed method, we\nperform extensive experiments on PACS and OfficeHome benchmark datasets, as\nwell as an industrial wafer semiconductor defect dataset. The results\ndemonstrate that RRLD achieves robust and accurate generalization performance.\nWe observe an average accuracy improvement in the range of 1.2% to 2.3% over\nthe state-of-the-art on the three datasets.\n","authors":["Ankur Singh","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2302.06874v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.05960v2","updated":"2024-04-12T04:23:12Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most of 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom the novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit the masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learns target-aware 3D features,\nand extensively captures mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noise point cloud background information. 
The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) on KITTI, NuScenes, and Waymo\nwhile running at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08229v1","updated":"2024-04-12T04:08:21Z","published":"2024-04-12T04:08:21Z","title":"Enhancing Traffic Safety with Parallel Dense Video Captioning for\n End-to-End Event Analysis","summary":" This paper introduces our solution for Track 2 in AI City Challenge 2024. The\ntask aims to solve traffic safety description and analysis with the dataset of\nWoven Traffic Safety (WTS), a real-world Pedestrian-Centric Traffic Video\nDataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly\nfocuses on the following points: 1) To solve dense video captioning, we\nleverage the framework of dense video captioning with parallel decoding (PDVC)\nto model visual-language sequences and generate dense captions by chapters for\nvideos. 2) Our work leverages CLIP to extract visual features to more\nefficiently perform cross-modality training between visual and textual\nrepresentations. 3) We conduct domain-specific model adaptation to mitigate the\ndomain shift problem that poses a recognition challenge in video understanding.\n4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer\nfor better understanding WTS videos and more accurate captioning. Our solution\nachieved 6th place in the competition on the test set. The open\nsource code will be available at https://github.com/UCF-SST-Lab/AICity2024CVPRW\n","authors":["Maged Shoman","Dongdong Wang","Armstrong Aboah","Mohamed Abdel-Aty"],"pdf_url":"https://arxiv.org/pdf/2404.08229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08226v1","updated":"2024-04-12T03:43:37Z","published":"2024-04-12T03:43:37Z","title":"Improving Continuous Sign Language Recognition with Adapted Image Models","summary":" The increase of web-scale weakly labelled image-text pairs has greatly\nfacilitated the development of large-scale vision-language models (e.g., CLIP),\nwhich have shown impressive generalization performance over a series of\ndownstream tasks. However, the massive model size and scarcity of available\ndata limit their applications to fine-tune the whole model in downstream tasks.\nBesides, fully fine-tuning the model easily forgets the generic essential\nknowledge acquired in the pretraining stage and overfits the downstream data.\nTo enable high efficiency when adapting these large vision-language models\n(e.g., CLIP) to performing continuous sign language recognition (CSLR) while\npreserving their generalizability, we propose a novel strategy (AdaptSign).\nSpecifically, CLIP is adopted as the visual backbone to extract frame-wise\nfeatures whose parameters are fixed, and a set of learnable modules are\nintroduced to model spatial sign variations or capture temporal sign movements.\nThe introduced additional modules are quite lightweight, incurring only 3.2% extra\ncomputation. The generic knowledge acquired in the\npretraining stage is well-preserved in the frozen CLIP backbone in this\nprocess. 
Extensive experiments show that despite being efficient, AdaptSign is\nable to demonstrate superior performance across a series of CSLR benchmarks\nincluding PHOENIX14, PHOENIX14-T, CSL-Daily and CSL compared to existing\nmethods. Visualizations show that AdaptSign could learn to dynamically pay\nmajor attention to the informative spatial regions and cross-frame trajectories\nin sign videos.\n","authors":["Lianyu Hu","Tongkai Shi","Liqing Gao","Zekang Liu","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2404.08226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04582v2","updated":"2024-04-12T03:33:31Z","published":"2023-10-06T20:48:43Z","title":"Universal Humanoid Motion Representations for Physics-Based Control","summary":" We present a universal motion representation that encompasses a comprehensive\nrange of motor skills for physics-based humanoid control. Due to the high\ndimensionality of humanoids and the inherent difficulties in reinforcement\nlearning, prior methods have focused on learning skill embeddings for a narrow\nrange of movement styles (e.g. locomotion, game characters) from specialized\nmotion datasets. This limited scope hampers their applicability in complex\ntasks. We close this gap by significantly increasing the coverage of our motion\nrepresentation space. To achieve this, we first learn a motion imitator that\ncan imitate all of human motion from a large, unstructured motion dataset. We\nthen create our motion representation by distilling skills directly from the\nimitator. This is achieved by using an encoder-decoder structure with a\nvariational information bottleneck. Additionally, we jointly learn a prior\nconditioned on proprioception (humanoid's own pose and velocities) to improve\nmodel expressiveness and sampling efficiency for downstream tasks. By sampling\nfrom the prior, we can generate long, stable, and diverse human motions. Using\nthis latent space for hierarchical RL, we show that our policies solve tasks\nusing human-like behavior. We demonstrate the effectiveness of our motion\nrepresentation by solving generative tasks (e.g. strike, terrain traversal) and\nmotion tracking using VR controllers.\n","authors":["Zhengyi Luo","Jinkun Cao","Josh Merel","Alexander Winkler","Jing Huang","Kris Kitani","Weipeng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.04582v2.pdf","comment":"ICLR 2024 Spotlight. Project page:\n https://zhengyiluo.github.io/PULSE/"},{"id":"http://arxiv.org/abs/2403.12416v2","updated":"2024-04-12T03:15:26Z","published":"2024-03-19T03:59:14Z","title":"Eye-gaze Guided Multi-modal Alignment Framework for Radiology","summary":" In multi-modal frameworks, the alignment of cross-modal features presents a\nsignificant challenge. The predominant approach in multi-modal pre-training\nemphasizes either global or local alignment between modalities, utilizing\nextensive datasets. This bottom-up driven method often suffers from a lack of\ninterpretability, a critical concern in radiology. Previous studies have\nintegrated high-level labels in medical images or text, but these still rely on\nmanual annotation, a costly and labor-intensive process. Our work introduces a\nnovel approach by using eye-gaze data, collected synchronously by radiologists\nduring diagnostic evaluations. This data, indicating radiologists' focus areas,\nnaturally links chest X-rays to diagnostic texts. 
We propose the Eye-gaze\nGuided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for\nbetter alignment of image and text features, aiming to reduce reliance on\nmanual annotations and thus cut training costs. Our model demonstrates robust\nperformance, outperforming other state-of-the-art methods in zero-shot\nclassification and retrieval tasks. The incorporation of easily-obtained\neye-gaze data during routine radiological diagnoses signifies a step towards\nminimizing manual annotation dependency. Additionally, we explore the impact of\nvarying amounts of eye-gaze data on model performance, highlighting the\nfeasibility and utility of integrating this auxiliary data into multi-modal\npre-training.\n","authors":["Chong Ma","Hanqi Jiang","Wenting Chen","Zihao Wu","Xiaowei Yu","Fang Zeng","Lei Guo","Dajiang Zhu","Tuo Zhang","Dinggang Shen","Tianming Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.12416v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.15036v3","updated":"2024-04-12T03:14:34Z","published":"2023-10-23T15:34:03Z","title":"A Technique for Classifying Static Gestures Using UWB Radar","summary":" Our paper presents a robust framework for UWB-based static gesture\nrecognition, leveraging proprietary UWB radar sensor technology. Extensive data\ncollection efforts were undertaken to compile datasets containing five commonly\nused gestures. Our approach involves a comprehensive data pre-processing\npipeline that encompasses outlier handling, aspect ratio-preserving resizing,\nand false-color image transformation. Both CNN and MobileNet models were\ntrained on the processed images. Remarkably, our best-performing model achieved\nan accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework\nto assess the model's system resource usage and processing times, which\nrevealed low memory utilization and real-time task completion in under one\nsecond. This research marks a significant step towards enhancing static gesture\nrecognition using UWB technology, promising practical applications in various\ndomains.\n","authors":["Abhishek Sebastian","Pragna R"],"pdf_url":"https://arxiv.org/pdf/2310.15036v3.pdf","comment":"This is not a technical research paper, but an excerpt of what was\n applied during a funded project for the promotion of Open Science"},{"id":"http://arxiv.org/abs/2312.17428v2","updated":"2024-04-12T03:06:07Z","published":"2023-12-29T01:42:20Z","title":"ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset","summary":" Change Detection (CD) has been attracting extensive interests with the\navailability of bi-temporal datasets. However, due to the huge cost of\nmulti-temporal images acquisition and labeling, existing change detection\ndatasets are small in quantity, short in temporal, and low in practicability.\nTherefore, a large-scale practical-oriented dataset covering wide temporal\nphases is urgently needed to facilitate the community. To this end, the\nChangeNet dataset is presented especially for multi-temporal change detection,\nalong with the new task of \"Asymmetric Change Detection\". Specifically,\nChangeNet consists of 31,000 multi-temporal images pairs, a wide range of\ncomplex scenes from 100 cities, and 6 pixel-level annotated categories, which\nis far superior to all the existing change detection datasets including\nLEVIR-CD, WHU Building CD, etc.. 
In addition, ChangeNet contains amounts of\nreal-world perspective distortions in different temporal phases on the same\nareas, which is able to promote the practical application of change detection\nalgorithms. The ChangeNet dataset is suitable for both binary change detection\n(BCD) and semantic change detection (SCD) tasks. Accordingly, we benchmark the\nChangeNet dataset on six BCD methods and two SCD methods, and extensive\nexperiments demonstrate its challenges and great significance. The dataset is\navailable at https://github.com/jankyee/ChangeNet.\n","authors":["Deyi Ji","Siqi Gao","Mingyuan Tao","Hongtao Lu","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.17428v2.pdf","comment":"Accepted to ICASSP 2024 Oral/Lecture"},{"id":"http://arxiv.org/abs/2402.09055v2","updated":"2024-04-12T02:51:45Z","published":"2024-02-14T10:05:19Z","title":"Comment-aided Video-Language Alignment via Contrastive Pre-training for\n Short-form Video Humor Detection","summary":" The growing importance of multi-modal humor detection within affective\ncomputing correlates with the expanding influence of short-form video sharing\non social media platforms. In this paper, we propose a novel two-branch\nhierarchical model for short-form video humor detection (SVHD), named\nComment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal\ncontrastive pre-training. Notably, our CVLA not only operates on raw signals\nacross various modal channels but also yields an appropriate multi-modal\nrepresentation by aligning the video and language components within a\nconsistent semantic space. The experimental results on two humor detection\ndatasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically\noutperforms state-of-the-art and several competitive baseline approaches. Our\ndataset, code and model release at https://github.com/yliu-cs/CVLA.\n","authors":["Yang Liu","Tongfei Shen","Dong Zhang","Qingying Sun","Shoushan Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.09055v2.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2403.18554v2","updated":"2024-04-12T02:27:09Z","published":"2024-03-27T13:33:14Z","title":"CosalPure: Learning Concept from Group Images for Robust Co-Saliency\n Detection","summary":" Co-salient object detection (CoSOD) aims to identify the common and salient\n(usually in the foreground) regions across a given group of images. Although\nachieving significant progress, state-of-the-art CoSODs could be easily\naffected by some adversarial perturbations, leading to substantial accuracy\nreduction. The adversarial perturbations can mislead CoSODs but do not change\nthe high-level semantic information (e.g., concept) of the co-salient objects.\nIn this paper, we propose a novel robustness enhancement framework by first\nlearning the concept of the co-salient objects based on the input group images\nand then leveraging this concept to purify adversarial perturbations, which are\nsubsequently fed to CoSODs for robustness enhancement. Specifically, we propose\nCosalPure containing two modules, i.e., group-image concept learning and\nconcept-guided diffusion purification. For the first module, we adopt a\npre-trained text-to-image diffusion model to learn the concept of co-salient\nobjects within group images where the learned concept is robust to adversarial\nexamples. 
For the second module, we map the adversarial image to the latent\nspace and then perform diffusion generation by embedding the learned concept\ninto the noise prediction function as an extra condition. Our method can\neffectively alleviate the influence of the SOTA adversarial attack containing\ndifferent adversarial patterns, including exposure and noise. The extensive\nresults demonstrate that our method could enhance the robustness of CoSODs\nsignificantly.\n","authors":["Jiayi Zhu","Qing Guo","Felix Juefei-Xu","Yihao Huang","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2403.18554v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08201v1","updated":"2024-04-12T02:14:35Z","published":"2024-04-12T02:14:35Z","title":"A Mutual Inclusion Mechanism for Precise Boundary Segmentation in\n Medical Images","summary":" In medical imaging, accurate image segmentation is crucial for quantifying\ndiseases, assessing prognosis, and evaluating treatment outcomes. However,\nexisting methods lack an in-depth integration of global and local features,\nfailing to pay special attention to abnormal regions and boundary details in\nmedical images. To this end, we present a novel deep learning-based approach,\nMIPC-Net, for precise boundary segmentation in medical images. Our approach,\ninspired by radiologists' working patterns, features two distinct modules: (i)\n\\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To\nenhance the precision of boundary segmentation in medical images, we introduce\nthe MIPC module, which enhances the focus on channel information when\nextracting position features and vice versa; (ii) \\textbf{GL-MIPC-Residue}: To\nimprove the restoration of medical images, we propose the GL-MIPC-Residue, a\nglobal residual connection that enhances the integration of the encoder and\ndecoder by filtering out invalid information and restoring the most effective\ninformation lost during the feature extraction process. We evaluate the\nperformance of the proposed model using metrics such as Dice coefficient (DSC)\nand Hausdorff Distance (HD) on three publicly accessible datasets: Synapse,\nISIC2018-Task, and Segpc. Our ablation study shows that each module contributes\nto improving the quality of segmentation results. Furthermore, with the\nassistance of both modules, our approach outperforms state-of-the-art methods\nacross all metrics on the benchmark datasets, notably achieving a 2.23mm\nreduction in HD on the Synapse dataset, strongly evidencing our model's\nenhanced capability for precise image boundary segmentation. Codes will be\navailable at https://github.com/SUN-1024/MIPC-Net.\n","authors":["Yizhi Pan","Junyi Xin","Tianhua Yang","Teeradaj Racharak","Le-Minh Nguyen","Guanqun Sun"],"pdf_url":"https://arxiv.org/pdf/2404.08201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08197v1","updated":"2024-04-12T02:04:34Z","published":"2024-04-12T02:04:34Z","title":"Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and\n Training Strategies","summary":" This paper investigates the performance of the Contrastive Language-Image\nPre-training (CLIP) when scaled down to limited computation budgets. We explore\nCLIP along three dimensions: data, architecture, and training strategies. With\nregards to data, we demonstrate the significance of high-quality training data\nand show that a smaller dataset of high-quality data can outperform a larger\ndataset with lower quality. 
We also examine how model performance varies with\ndifferent dataset sizes, suggesting that smaller ViT models are better suited\nfor smaller datasets, while larger models perform better on larger datasets\nwith fixed compute. Additionally, we provide guidance on when to choose a\nCNN-based architecture or a ViT-based architecture for CLIP training. We\ncompare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data\nAugmentation - and show that the choice of training strategy depends on the\navailable compute resource. Our analysis reveals that CLIP+Data Augmentation\ncan achieve comparable performance to CLIP using only half of the training\ndata. This work provides practical insights into how to effectively train and\ndeploy CLIP models, making them more accessible and affordable for practical\nuse in various applications.\n","authors":["Zichao Li","Cihang Xie","Ekin Dogus Cubuk"],"pdf_url":"https://arxiv.org/pdf/2404.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08195v1","updated":"2024-04-12T01:54:59Z","published":"2024-04-12T01:54:59Z","title":"Tackling Ambiguity from Perspective of Uncertainty Inference and\n Affinity Diversification for Weakly Supervised Semantic Segmentation","summary":" Weakly supervised semantic segmentation (WSSS) with image-level labels\nintends to achieve dense tasks without laborious annotations. However, due to\nthe ambiguous contexts and fuzzy regions, the performance of WSSS, especially\nthe stages of generating Class Activation Maps (CAMs) and refining pseudo\nmasks, widely suffers from ambiguity while being barely noticed by previous\nliterature. In this work, we propose UniA, a unified single-staged WSSS\nframework, to efficiently tackle this issue from the perspective of uncertainty\ninference and affinity diversification, respectively. When activating class\nobjects, we argue that the false activation stems from the bias to the\nambiguous regions during the feature extraction. Therefore, we design a more\nrobust feature representation with a probabilistic Gaussian distribution and\nintroduce the uncertainty estimation to avoid the bias. A distribution loss is\nparticularly proposed to supervise the process, which effectively captures the\nambiguity and models the complex dependencies among features. When refining\npseudo labels, we observe that the affinity from the prevailing refinement\nmethods intends to be similar among ambiguities. To this end, an affinity\ndiversification module is proposed to promote diversity among semantics. A\nmutual complementing refinement is proposed to initially rectify the ambiguous\naffinity with multiple inferred pseudo labels. More importantly, a contrastive\naffinity loss is further designed to diversify the relations among unrelated\nsemantics, which reliably propagates the diversity into the whole feature\nrepresentations and helps generate better pseudo masks. 
Extensive experiments\nare conducted on PASCAL VOC, MS COCO, and medical ACDC datasets, which validate\nthe efficiency of UniA tackling ambiguity and the superiority over recent\nsingle-staged or even most multi-staged competitors.\n","authors":["Zhiwei Yang","Yucong Meng","Kexue Fu","Shuo Wang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2404.08195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08187v1","updated":"2024-04-12T01:36:00Z","published":"2024-04-12T01:36:00Z","title":"Adapting CNNs for Fisheye Cameras without Retraining","summary":" The majority of image processing approaches assume images are in or can be\nrectified to a perspective projection. However, in many applications it is\nbeneficial to use non conventional cameras, such as fisheye cameras, that have\na larger field of view (FOV). The issue arises that these large-FOV images\ncan't be rectified to a perspective projection without significant cropping of\nthe original image. To address this issue we propose Rectified Convolutions\n(RectConv); a new approach for adapting pre-trained convolutional networks to\noperate with new non-perspective images, without any retraining. Replacing the\nconvolutional layers of the network with RectConv layers allows the network to\nsee both rectified patches and the entire FOV. We demonstrate RectConv adapting\nmultiple pre-trained networks to perform segmentation and detection on fisheye\nimagery from two publicly available datasets. Our approach requires no\nadditional data or training, and operates directly on the native image as\ncaptured from the camera. We believe this work is a step toward adapting the\nvast resources available for perspective images to operate across a broad range\nof camera geometries.\n","authors":["Ryan Griffiths","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.08187v1.pdf","comment":"Project page: https://roboticimaging.org/Projects/RectConv/"},{"id":"http://arxiv.org/abs/2404.08184v1","updated":"2024-04-12T01:13:23Z","published":"2024-04-12T01:13:23Z","title":"Measuring Domain Shifts using Deep Learning Remote Photoplethysmography\n Model Similarity","summary":" Domain shift differences between training data for deep learning models and\nthe deployment context can result in severe performance issues for models which\nfail to generalize. We study the domain shift problem under the context of\nremote photoplethysmography (rPPG), a technique for video-based heart rate\ninference. We propose metrics based on model similarity which may be used as a\nmeasure of domain shift, and we demonstrate high correlation between these\nmetrics and empirical performance. One of the proposed metrics with viable\ncorrelations, DS-diff, does not assume access to the ground truth of the target\ndomain, i.e. it may be applied to in-the-wild data. To that end, we investigate\na model selection problem in which ground truth results for the evaluation\ndomain is not known, demonstrating a 13.9% performance improvement over the\naverage case baseline.\n","authors":["Nathan Vance","Patrick Flynn"],"pdf_url":"https://arxiv.org/pdf/2404.08184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08181v1","updated":"2024-04-12T01:08:04Z","published":"2024-04-12T01:08:04Z","title":"Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic\n Segmentation","summary":" Despite the significant progress in deep learning for dense visual\nrecognition problems, such as semantic segmentation, traditional methods are\nconstrained by fixed class sets. 
Meanwhile, vision-language foundation models,\nsuch as CLIP, have showcased remarkable effectiveness in numerous zero-shot\nimage-level tasks, owing to their robust generalizability. Recently, a body of\nwork has investigated utilizing these models in open-vocabulary semantic\nsegmentation (OVSS). However, existing approaches often rely on impractical\nsupervised pre-training or access to additional pre-trained networks. In this\nwork, we propose a strong baseline for training-free OVSS, termed\nNeighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of\nCLIP tailored for this scenario. Our method enforces localization of patches in\nthe self-attention of CLIP's vision transformer which, despite being crucial\nfor dense prediction tasks, has been overlooked in the OVSS literature. By\nincorporating design choices favouring segmentation, our approach significantly\nimproves performance without requiring additional data, auxiliary pre-trained\nnetworks, or extensive hyperparameter tuning, making it highly practical for\nreal-world applications. Experiments are performed on 8 popular semantic\nsegmentation benchmarks, yielding state-of-the-art performance on most\nscenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP .\n","authors":["Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2404.08181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03584v2","updated":"2024-04-12T00:52:35Z","published":"2023-06-06T11:03:05Z","title":"RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion","summary":" Raw depth images captured in indoor scenarios frequently exhibit extensive\nmissing values due to the inherent limitations of the sensors and environments.\nFor example, transparent materials frequently elude detection by depth sensors;\nsurfaces may introduce measurement inaccuracies due to their polished textures,\nextended distances, and oblique incidence angles from the sensor. The presence\nof incomplete depth maps imposes significant challenges for subsequent vision\napplications, prompting the development of numerous depth completion techniques\nto mitigate this problem. Numerous methods excel at reconstructing dense depth\nmaps from sparse samples, but they often falter when faced with extensive\ncontiguous regions of missing depth values, a prevalent and critical challenge\nin indoor environments. To overcome these challenges, we design a novel\ntwo-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB\nand incomplete depth images as input to predict a dense and completed depth\nmap. The first branch employs an encoder-decoder structure, by adhering to the\nManhattan world assumption and utilizing normal maps from RGB-D information as\nguidance, to regress the local dense depth values from the raw depth map. The\nother branch applies an RGB-depth fusion CycleGAN, adept at translating RGB\nimagery into detailed, textured depth maps while ensuring high fidelity through\ncycle consistency. We fuse the two branches via adaptive fusion modules named\nW-AdaIN and train the model with the help of pseudo depth maps. 
Comprehensive\nevaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method\nsignificantly enhances depth completion performance particularly in realistic\nindoor settings.\n","authors":["Haowen Wang","Zhengping Che","Yufan Yang","Mingyuan Wang","Zhiyuan Xu","Xiuquan Qiao","Mengshi Qi","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2306.03584v2.pdf","comment":"Haowen Wang and Zhengping Che are with equal contributions. Paper\n accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence\n (TPAMI). An earlier version has been accepted by CVPR 2022\n (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856"},{"id":"http://arxiv.org/abs/2305.09948v5","updated":"2024-04-12T00:46:26Z","published":"2023-05-17T05:03:46Z","title":"HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic\n Generalization Performance of Human-Object Interaction Detection Models","summary":" Human-Object Interaction (HOI) detection is a task to localize humans and\nobjects in an image and predict the interactions in human-object pairs. In\nreal-world scenarios, HOI detection models need systematic generalization,\ni.e., generalization to novel combinations of objects and interactions, because\nthe train data are expected to cover a limited portion of all possible\ncombinations. To evaluate the systematic generalization performance of HOI\ndetection models, we created two new sets of HOI detection data splits named\nHICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets,\nrespectively. When evaluated on the new data splits, HOI detection models with\nvarious characteristics performed much more poorly than when evaluated on the\noriginal splits. This shows that systematic generalization is a challenging\ngoal in HOI detection. By analyzing the evaluation results, we also gain\ninsights for improving the systematic generalization performance and identify\nfour possible future research directions. We hope that our new data splits and\npresented analysis will encourage further research on systematic generalization\nin HOI detection.\n","authors":["Kentaro Takemoto","Moyuru Yamada","Tomotake Sasaki","Hisanao Akima"],"pdf_url":"https://arxiv.org/pdf/2305.09948v5.pdf","comment":"19 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08853v1","updated":"2024-04-12T23:49:37Z","published":"2024-04-12T23:49:37Z","title":"Uncertainty Quantification in Detecting Choroidal Metastases on MRI via\n Evolutionary Strategies","summary":" Uncertainty quantification plays a vital role in facilitating the practical\nimplementation of AI in radiology by addressing growing concerns around\ntrustworthiness. Given the challenges associated with acquiring large,\nannotated datasets in this field, there is a need for methods that enable\nuncertainty quantification in small data AI approaches tailored to radiology\nimages. In this study, we focused on uncertainty quantification within the\ncontext of the small data evolutionary strategies-based technique of deep\nneuroevolution (DNE). Specifically, we employed DNE to train a simple\nConvolutional Neural Network (CNN) with MRI images of the eyes for binary\nclassification. The goal was to distinguish between normal eyes and those with\nmetastatic tumors called choroidal metastases. 
The training set comprised 18\nimages with choroidal metastases and 18 without tumors, while the testing set\ncontained a tumor-to-normal ratio of 15:15.\n We trained CNN model weights via DNE for approximately 40,000 episodes,\nultimately reaching a convergence of 100% accuracy on the training set. We\nsaved all models that achieved maximal training set accuracy. Then, by applying\nthese models to the testing set, we established an ensemble method for\nuncertainty quantification. The saved set of models produced distributions for\neach testing set image between the two classes of normal and tumor-containing.\nThe relative frequencies permitted uncertainty quantification of model\npredictions. Intriguingly, we found that subjective features appreciated by\nhuman radiologists explained images for which uncertainty was high,\nhighlighting the significance of uncertainty quantification in AI-driven\nradiological analyses.\n","authors":["Bala McRae-Posani","Andrei Holodny","Hrithwik Shalu","Joseph N Stember"],"pdf_url":"https://arxiv.org/pdf/2404.08853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05294v2","updated":"2024-04-12T23:33:27Z","published":"2024-01-10T17:53:59Z","title":"Enhanced Muscle and Fat Segmentation for CT-Based Body Composition\n Analysis: A Comparative Study","summary":" Purpose: Body composition measurements from routine abdominal CT can yield\npersonalized risk assessments for asymptomatic and diseased patients. In\nparticular, attenuation and volume measures of muscle and fat are associated\nwith important clinical outcomes, such as cardiovascular events, fractures, and\ndeath. This study evaluates the reliability of an Internal tool for the\nsegmentation of muscle and fat (subcutaneous and visceral) as compared to the\nwell-established public TotalSegmentator tool.\n Methods: We assessed the tools across 900 CT series from the publicly\navailable SAROS dataset, focusing on muscle, subcutaneous fat, and visceral\nfat. The Dice score was employed to assess accuracy in subcutaneous fat and\nmuscle segmentation. Due to the lack of ground truth segmentations for visceral\nfat, Cohen's Kappa was utilized to assess segmentation agreement between the\ntools.\n Results: Our Internal tool achieved a 3% higher Dice (83.8 vs. 80.8) for\nsubcutaneous fat and a 5% improvement (87.6 vs. 83.2) for muscle segmentation\nrespectively. A Wilcoxon signed-rank test revealed that our results were\nstatistically different with p<0.01. For visceral fat, the Cohen's kappa score\nof 0.856 indicated near-perfect agreement between the two tools. Our internal\ntool also showed very strong correlations for muscle volume (R^2=0.99), muscle\nattenuation (R^2=0.93), and subcutaneous fat volume (R^2=0.99) with a moderate\ncorrelation for subcutaneous fat attenuation (R^2=0.45).\n Conclusion: Our findings indicated that our Internal tool outperformed\nTotalSegmentator in measuring subcutaneous fat and muscle. The high Cohen's\nKappa score for visceral fat suggests a reliable level of agreement between the\ntwo tools. These results demonstrate the potential of our tool in advancing the\naccuracy of body composition analysis.\n","authors":["Benjamin Hou","Tejas Sudharshan Mathai","Jianfei Liu","Christopher Parnell","Ronald M. 
Summers"],"pdf_url":"https://arxiv.org/pdf/2401.05294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05949v3","updated":"2024-04-12T22:30:54Z","published":"2024-03-09T16:02:46Z","title":"General surgery vision transformer: A video pre-trained foundation model\n for general surgery","summary":" The absence of openly accessible data and specialized foundation models is a\nmajor barrier for computational research in surgery. Toward this, (i) we\nopen-source the largest dataset of general surgery videos to-date, consisting\nof 680 hours of surgical videos, including data from robotic and laparoscopic\ntechniques across 28 procedures; (ii) we propose a technique for video\npre-training a general surgery vision transformer (GSViT) on surgical videos\nbased on forward video prediction that can run in real-time for surgical\napplications, toward which we open-source the code and weights of GSViT; (iii)\nwe also release code and weights for procedure-specific fine-tuned versions of\nGSViT across 10 procedures; (iv) we demonstrate the performance of GSViT on the\nCholec80 phase annotation task, displaying improved performance over\nstate-of-the-art single frame predictors.\n","authors":["Samuel Schmidgall","Ji Woong Kim","Jeffrey Jopling","Axel Krieger"],"pdf_url":"https://arxiv.org/pdf/2403.05949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17179v3","updated":"2024-04-12T22:23:32Z","published":"2023-11-28T19:14:40Z","title":"SatCLIP: Global, General-Purpose Location Embeddings with Satellite\n Imagery","summary":" Geographic information is essential for modeling tasks in fields ranging from\necology to epidemiology. However, extracting relevant location characteristics\nfor a given task can be challenging, often requiring expensive data fusion or\ndistillation from massive global imagery datasets. To address this challenge,\nwe introduce Satellite Contrastive Location-Image Pretraining (SatCLIP). This\nglobal, general-purpose geographic location encoder learns an implicit\nrepresentation of locations by matching CNN and ViT inferred visual patterns of\nopenly available satellite imagery with their geographic coordinates. The\nresulting SatCLIP location encoder efficiently summarizes the characteristics\nof any given location for convenient use in downstream tasks. In our\nexperiments, we use SatCLIP embeddings to improve prediction performance on\nnine diverse location-dependent tasks including temperature prediction, animal\nrecognition, and population density estimation. Across tasks, SatCLIP\nconsistently outperforms alternative location encoders and improves geographic\ngeneralization by encoding visual similarities of spatially distant\nenvironments. 
These results demonstrate the potential of vision-location models\nto learn meaningful representations of our planet from the vast, varied, and\nlargely untapped modalities of geospatial data.\n","authors":["Konstantin Klemmer","Esther Rolf","Caleb Robinson","Lester Mackey","Marc Rußwurm"],"pdf_url":"https://arxiv.org/pdf/2311.17179v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12850v3","updated":"2024-04-12T22:08:40Z","published":"2023-10-19T14:04:53Z","title":"PrivImage: Differentially Private Synthetic Image Generation using\n Diffusion Models with Semantic-Aware Pretraining","summary":" Differential Privacy (DP) image data synthesis leverages the DP\ntechnique to generate synthetic data to replace the sensitive data, allowing\norganizations to share and utilize synthetic images without privacy concerns.\nPrevious methods incorporate the advanced techniques of generative models and\npre-training on a public dataset to produce exceptional DP image data, but\nsuffer from problems of unstable training and massive computational resource\ndemands. This paper proposes a novel DP image synthesis method, termed\nPRIVIMAGE, which meticulously selects pre-training data, promoting the\nefficient creation of DP datasets with high fidelity and utility. PRIVIMAGE\nfirst establishes a semantic query function using a public dataset. Then, this\nfunction assists in querying the semantic distribution of the sensitive\ndataset, facilitating the selection of data from the public dataset with\nanalogous semantics for pre-training. Finally, we pre-train an image generative\nmodel using the selected data and then fine-tune this model on the sensitive\ndataset using Differentially Private Stochastic Gradient Descent (DP-SGD).\nPRIVIMAGE allows us to train a lightly parameterized generative model, reducing\nthe noise in the gradient during DP-SGD training and enhancing training\nstability. Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the\npublic dataset for pre-training and 7.6% of the parameters in the generative\nmodel compared to the state-of-the-art method, while achieving superior\nsynthetic performance and conserving more computational resources. On average,\nPRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy\nthan the state-of-the-art method. The replication package and datasets can be\naccessed online.\n","authors":["Kecen Li","Chen Gong","Zhixiang Li","Yuzhong Zhao","Xinwen Hou","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12850v3.pdf","comment":"Accepted at USENIX Security 2024. The first two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2404.08831v1","updated":"2024-04-12T22:05:01Z","published":"2024-04-12T22:05:01Z","title":"Structured Model Pruning for Efficient Inference in Computational\n Pathology","summary":" Recent years have seen significant efforts to adopt Artificial Intelligence\n(AI) in healthcare for various use cases, from computer-aided diagnosis to ICU\ntriage. However, the size of AI models has been rapidly growing due to scaling\nlaws and the success of foundational models, which poses an increasing\nchallenge to leveraging advanced models in practical applications. It is thus\nimperative to develop efficient models, especially for deploying AI solutions\nunder resource constraints or with time sensitivity. One potential solution is\nto perform model compression, a set of techniques that remove less important\nmodel components or reduce parameter precision, to reduce model computation\ndemand. 
In this work, we demonstrate that model pruning, as a model compression\ntechnique, can effectively reduce inference cost for computational and digital\npathology based analysis with a negligible loss of analysis performance. To\nthis end, we develop a methodology for pruning the widely used U-Net-style\narchitectures in biomedical imaging, with which we evaluate multiple pruning\nheuristics on nuclei instance segmentation and classification, and empirically\ndemonstrate that pruning can compress models by at least 70% with a negligible\ndrop in performance.\n","authors":["Mohammed Adnan","Qinle Ba","Nazim Shaikh","Shivam Kalra","Satarupa Mukherjee","Auranuch Lorsakul"],"pdf_url":"https://arxiv.org/pdf/2404.08831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02263v3","updated":"2024-04-12T22:03:06Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08827v1","updated":"2024-04-12T21:56:21Z","published":"2024-04-12T21:56:21Z","title":"\"Don't forget to put the milk back!\" Dataset for Enabling Embodied\n Agents to Detect Anomalous Situations","summary":" Home robots intend to make their users lives easier. Our work assists in this\ngoal by enabling robots to inform their users of dangerous or unsanitary\nanomalies in their home. Some examples of these anomalies include the user\nleaving their milk out, forgetting to turn off the stove, or leaving poison\naccessible to children. To move towards enabling home robots with these\nabilities, we have created a new dataset, which we call SafetyDetect. The\nSafetyDetect dataset consists of 1000 anomalous home scenes, each of which\ncontains unsafe or unsanitary situations for an agent to detect. Our approach\nutilizes large language models (LLMs) alongside both a graph representation of\nthe scene and the relationships between the objects in the scene. 
Our key\ninsight is that this connected scene graph and the object relationships it\nencodes enable the LLM to better reason about the scene -- especially as it\nrelates to detecting dangerous or unsanitary situations. Our most promising\napproach utilizes GPT-4 and pursues a categorization technique where object\nrelations from the scene graph are classified as normal, dangerous, unsanitary,\nor dangerous for children. This method is able to correctly identify over 90%\nof anomalous scenarios in the SafetyDetect Dataset. Additionally, we conduct\nreal world experiments on a ClearPath TurtleBot where we generate a scene graph\nfrom visuals of the real world scene, and run our approach with no\nmodification. This setup resulted in little performance loss. The SafetyDetect\nDataset and code will be released to the public upon this paper's publication.\n","authors":["James F. Mullen Jr","Prasoon Goyal","Robinson Piramuthu","Michael Johnston","Dinesh Manocha","Reza Ghanadan"],"pdf_url":"https://arxiv.org/pdf/2404.08827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08820v1","updated":"2024-04-12T21:30:09Z","published":"2024-04-12T21:30:09Z","title":"Single-image driven 3d viewpoint training data augmentation for\n effective wine label recognition","summary":" Confronting the critical challenge of insufficient training data in the field\nof complex image recognition, this paper introduces a novel 3D viewpoint\naugmentation technique specifically tailored for wine label recognition. This\nmethod enhances deep learning model performance by generating visually\nrealistic training samples from a single real-world wine label image,\novercoming the challenges posed by the intricate combinations of text and\nlogos. Classical Generative Adversarial Network (GAN) methods fall short in\nsynthesizing such intricate content combinations. Our proposed solution\nleverages time-tested computer vision and image processing strategies to expand\nour training dataset, thereby broadening the range of training samples for deep\nlearning applications. This innovative approach to data augmentation\ncircumvents the constraints of limited training resources. Using the augmented\ntraining images through batch-all triplet metric learning on a Vision\nTransformer (ViT) architecture, we can get the most discriminative embedding\nfeatures for every wine label, enabling us to perform one-shot recognition of\nexisting wine labels in the training classes or future newly collected wine\nlabels unavailable in the training set. Experimental results show a significant\nincrease in recognition accuracy over conventional 2D data augmentation\ntechniques.\n","authors":["Yueh-Cheng Huang","Hsin-Yi Chen","Cheng-Jui Hung","Jen-Hui Chuang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.08820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07214v2","updated":"2024-04-12T21:20:37Z","published":"2024-02-20T18:57:34Z","title":"Exploring the Frontier of Vision-Language Models: A Survey of Current\n Methodologies and Future Directions","summary":" The advent of Large Language Models (LLMs) has significantly reshaped the\ntrajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable\nlimitation, as they are primarily adept at processing textual information. To\naddress this constraint, researchers have endeavored to integrate visual\ncapabilities with LLMs, resulting in the emergence of Vision-Language Models\n(VLMs). 
These advanced models are instrumental in tackling more intricate tasks\nsuch as image captioning and visual question answering. In our comprehensive\nsurvey paper, we delve into the key advancements within the realm of VLMs. Our\nclassification organizes VLMs into three distinct categories: models dedicated\nto vision-language understanding, models that process multimodal inputs to\ngenerate unimodal (textual) outputs, and models that both accept and produce\nmultimodal inputs and outputs. This classification is based on their respective\ncapabilities and functionalities in processing and generating various\nmodalities of data. We meticulously dissect each model, offering an extensive\nanalysis of its foundational architecture, training data sources, as well as\nits strengths and limitations wherever possible, providing readers with a\ncomprehensive understanding of its essential components. We also analyze the\nperformance of VLMs on various benchmark datasets. By doing so, we aim to offer\na nuanced understanding of the diverse landscape of VLMs. Additionally, we\nunderscore potential avenues for future research in this dynamic domain,\nanticipating further breakthroughs and advancements.\n","authors":["Akash Ghosh","Arkadeep Acharya","Sriparna Saha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2404.07214v2.pdf","comment":"The most extensive and up-to-date survey on Visual Language Models,\n covering 76 Visual Language Models"},{"id":"http://arxiv.org/abs/2312.01117v2","updated":"2024-04-12T21:19:36Z","published":"2023-12-02T12:23:07Z","title":"Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by\n Factoring the Real World","summary":" To achieve strong real world performance, neural networks must be trained on\nlarge, diverse datasets; however, obtaining and annotating such datasets is\ncostly and time-consuming, particularly for 3D point clouds. In this paper, we\ndescribe Paved2Paradise, a simple, cost-effective approach for generating fully\nlabeled, diverse, and realistic lidar datasets from scratch, all while\nrequiring minimal human annotation. Our key insight is that, by deliberately\ncollecting separate \"background\" and \"object\" datasets (i.e., \"factoring the\nreal world\"), we can intelligently combine them to produce a combinatorially\nlarge and diverse training set. The Paved2Paradise pipeline thus consists of\nfour steps: (1) collecting copious background data, (2) recording individuals\nfrom the desired object class(es) performing different behaviors in an isolated\nenvironment (like a parking lot), (3) bootstrapping labels for the object\ndataset, and (4) generating samples by placing objects at arbitrary locations\nin backgrounds. To demonstrate the utility of Paved2Paradise, we generated\nsynthetic datasets for two tasks: (1) human detection in orchards (a task for\nwhich no public data exists) and (2) pedestrian detection in urban\nenvironments. Qualitatively, we find that a model trained exclusively on\nPaved2Paradise synthetic data is highly effective at detecting humans in\norchards, including when individuals are heavily occluded by tree branches.\nQuantitatively, a model trained on Paved2Paradise data that sources backgrounds\nfrom KITTI performs comparably to a model trained on the actual dataset. These\nresults suggest the Paved2Paradise synthetic data pipeline can help accelerate\npoint cloud model development in sectors where acquiring lidar datasets has\npreviously been cost-prohibitive.\n","authors":["Michael A. 
Alcorn","Noah Schwartz"],"pdf_url":"https://arxiv.org/pdf/2312.01117v2.pdf","comment":"Accepted to the Synthetic Data for Computer Vision workshop at CVPR\n 2024"},{"id":"http://arxiv.org/abs/2404.08814v1","updated":"2024-04-12T21:14:20Z","published":"2024-04-12T21:14:20Z","title":"E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors\n to New Generators Using Limited Data","summary":" As generative AI progresses rapidly, new synthetic image generators continue\nto emerge at a swift pace. Traditional detection methods face two main\nchallenges in adapting to these generators: the forensic traces of synthetic\nimages from new techniques can vastly differ from those learned during\ntraining, and access to data for these new generators is often limited. To\naddress these issues, we introduce the Ensemble of Expert Embedders (E3), a\nnovel continual learning framework for updating synthetic image detectors. E3\nenables the accurate detection of images from newly emerged generators using\nminimal training data. Our approach does this by first employing transfer\nlearning to develop a suite of expert embedders, each specializing in the\nforensic traces of a specific generator. Then, all embeddings are jointly\nanalyzed by an Expert Knowledge Fusion Network to produce accurate and reliable\ndetection decisions. Our experiments demonstrate that E3 outperforms existing\ncontinual learning methods, including those developed specifically for\nsynthetic image detection.\n","authors":["Aref Azizpour","Tai D. Nguyen","Manil Shrestha","Kaidi Xu","Edward Kim","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2404.08814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05695v4","updated":"2024-04-12T21:11:16Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and have been used as strong\npixel-level representation learners. This paper decomposes the interrelation\nbetween the generative capability and representation learning ability inherent\nin diffusion models. We present the masked diffusion model (MDM), a scalable\nself-supervised representation learner for semantic segmentation, substituting\nthe conventional additive Gaussian noise of traditional diffusion with a\nmasking mechanism. Our proposed approach convincingly surpasses prior\nbenchmarks, demonstrating remarkable advancements in both medical and natural\nimage semantic segmentation tasks, particularly in few-shot scenarios.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07381v2","updated":"2024-04-12T20:41:14Z","published":"2023-12-12T15:57:03Z","title":"ScribblePrompt: Fast and Flexible Interactive Segmentation for Any\n Biomedical Image","summary":" Biomedical image segmentation is a crucial part of both scientific research\nand clinical care. With enough labelled data, deep learning models can be\ntrained to accurately automate specific biomedical image segmentation tasks.\nHowever, manually segmenting images to create training data is highly labor\nintensive and requires domain expertise. We present ScribblePrompt, a flexible\nneural network based interactive segmentation tool for biomedical imaging that\nenables human annotators to segment previously unseen structures using\nscribbles, clicks, and bounding boxes. 
Through rigorous quantitative\nexperiments, we demonstrate that given comparable amounts of interaction,\nScribblePrompt produces more accurate segmentations than previous methods on\ndatasets unseen during training. In a user study with domain experts,\nScribblePrompt reduced annotation time by 28% while improving Dice by 15%\ncompared to the next best method. ScribblePrompt's success rests on a set of\ncareful design decisions. These include a training strategy that incorporates\nboth a highly diverse set of images and tasks, novel algorithms for simulated\nuser interactions and labels, and a network that enables fast inference. We\nshowcase ScribblePrompt in an online demo and provide code at\nhttps://scribbleprompt.csail.mit.edu\n","authors":["Hallee E. Wong","Marianne Rakic","John Guttag","Adrian V. Dalca"],"pdf_url":"https://arxiv.org/pdf/2312.07381v2.pdf","comment":"Project Website: https://scribbleprompt.csail.mit.edu Keywords:\n Interactive Segmentation, Medical Imaging, Segment Anything Model, SAM,\n Scribble Annotations, Prompt"},{"id":"http://arxiv.org/abs/2404.08805v1","updated":"2024-04-12T20:39:19Z","published":"2024-04-12T20:39:19Z","title":"Real-time guidewire tracking and segmentation in intraoperative x-ray","summary":" During endovascular interventions, physicians have to perform accurate and\nimmediate operations based on the available real-time information, such as the\nshape and position of guidewires observed on the fluoroscopic images, haptic\ninformation and the patients' physiological signals. For this purpose,\nreal-time and accurate guidewire segmentation and tracking can enhance the\nvisualization of guidewires and provide visual feedback for physicians during\nthe intervention as well as for robot-assisted interventions. Nevertheless,\nthis task often comes with the challenge of elongated deformable structures\nthat present themselves with low contrast in the noisy fluoroscopic image\nsequences. To address these issues, a two-stage deep learning framework for\nreal-time guidewire segmentation and tracking is proposed. In the first stage,\na Yolov5s detector is trained, using the original X-ray images as well as\nsynthetic ones, which is employed to output the bounding boxes of possible\ntarget guidewires. More importantly, a refinement module based on\nspatiotemporal constraints is incorporated to robustly localize the guidewire\nand remove false detections. In the second stage, a novel and efficient network\nis proposed to segment the guidewire in each detected bounding box. The network\ncontains two major modules, namely a hessian-based enhancement embedding module\nand a dual self-attention module. Quantitative and qualitative evaluations on\nclinical intra-operative images demonstrate that the proposed approach\nsignificantly outperforms our baselines as well as the current state of the art\nand, in comparison, shows higher robustness to low quality images.\n","authors":["Baochang Zhang","Mai Bui","Cheng Wang","Felix Bourier","Heribert Schunkert","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.08805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02344v2","updated":"2024-04-12T20:18:00Z","published":"2024-04-02T22:37:34Z","title":"Generative AI-Based Effective Malware Detection for Embedded Computing\n Systems","summary":" One of the pivotal security threats for the embedded computing systems is\nmalicious software a.k.a malware. 
With efficiency and efficacy, Machine\nLearning (ML) has been widely adopted for malware detection in recent times.\nDespite being efficient, the existing techniques require a tremendous number of\nbenign and malware samples for training and modeling an efficient malware\ndetector. Furthermore, such constraints limit the detection of emerging malware\nsamples due to the lack of sufficient malware samples required for efficient\ntraining. To address such concerns, we introduce a code-aware data generation\ntechnique that generates multiple mutated samples of the limitedly seen malware\nby the devices. Loss minimization ensures that the generated samples closely\nmimic the limitedly seen malware and mitigate the impractical samples. Such\ndeveloped malware is further incorporated into the training set to formulate\nthe model that can efficiently detect the emerging malware despite having\nlimited exposure. The experimental results demonstrate that the proposed\ntechnique achieves an accuracy of 90% in detecting limitedly seen malware,\nwhich is approximately 3x more than the accuracy attained by state-of-the-art\ntechniques.\n","authors":["Sreenitha Kasarapu","Sanket Shukla","Rakibul Hassan","Avesta Sasan","Houman Homayoun","Sai Manoj Pudukotai Dinakarrao"],"pdf_url":"https://arxiv.org/pdf/2404.02344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08799v1","updated":"2024-04-12T20:16:03Z","published":"2024-04-12T20:16:03Z","title":"Semantic Approach to Quantifying the Consistency of Diffusion Model\n Image Generation","summary":" In this study, we identify the need for an interpretable, quantitative score\nof the repeatability, or consistency, of image generation in diffusion models.\nWe propose a semantic approach, using a pairwise mean CLIP (Contrastive\nLanguage-Image Pretraining) score as our semantic consistency score. We applied\nthis metric to compare two state-of-the-art open-source image generation\ndiffusion models, Stable Diffusion XL and PixArt-{\\alpha}, and we found\nstatistically significant differences between the semantic consistency scores\nfor the models. Agreement between the Semantic Consistency Score selected model\nand aggregated human annotations was 94%. We also explored the consistency of\nSDXL and a LoRA-fine-tuned version of SDXL and found that the fine-tuned model\nhad significantly higher semantic consistency in generated images. The Semantic\nConsistency Score proposed here offers a measure of image generation alignment,\nfacilitating the evaluation of model architectures for specific tasks and\naiding in informed decision-making regarding model selection.\n","authors":["Brinnae Bent"],"pdf_url":"https://arxiv.org/pdf/2404.08799v1.pdf","comment":"Accepted to 2024 CVPR 3rd Explainable AI for Computer Vision (XAI4CV)\n Workshop"},{"id":"http://arxiv.org/abs/2403.05297v3","updated":"2024-04-12T20:10:29Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). 
For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v3.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2401.12946v6","updated":"2024-04-12T19:48:27Z","published":"2024-01-23T18:07:07Z","title":"Coverage Axis++: Efficient Inner Point Selection for 3D Shape\n Skeletonization","summary":" We introduce Coverage Axis++, a novel and efficient approach to 3D shape\nskeletonization. The current state-of-the-art approaches for this task often\nrely on the watertightness of the input or suffer from substantial\ncomputational costs, thereby limiting their practicality. To address this\nchallenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal\npoints, offering a high-accuracy approximation of the Medial Axis Transform\n(MAT) while significantly mitigating computational intensity for various shape\nrepresentations. We introduce a simple yet effective strategy that considers\nshape coverage, uniformity, and centrality to derive skeletal points. The\nselection procedure enforces consistency with the shape structure while\nfavoring the dominant medial balls, which thus introduces a compact underlying\nshape representation in terms of MAT. As a result, Coverage Axis++ allows for\nskeletonization for various shape representations (e.g., water-tight meshes,\ntriangle soups, point clouds), specification of the number of skeletal points,\nfew hyperparameters, and highly efficient computation with improved\nreconstruction accuracy. Extensive experiments across a wide range of 3D shapes\nvalidate the efficiency and effectiveness of Coverage Axis++. The code will be\npublicly available once the paper is published.\n","authors":["Zimeng Wang","Zhiyang Dou","Rui Xu","Cheng Lin","Yuan Liu","Xiaoxiao Long","Shiqing Xin","Taku Komura","Xiaoming Yuan","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12946v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08788v1","updated":"2024-04-12T19:29:10Z","published":"2024-04-12T19:29:10Z","title":"Detecting AI-Generated Images via CLIP","summary":" As AI-generated image (AIGI) methods become more powerful and accessible, it\nhas become a critical task to determine if an image is real or AI-generated.\nBecause AIGI lack the signatures of photographs and have their own unique\npatterns, new models are needed to determine if an image is AI-generated. In\nthis paper, we investigate the ability of the Contrastive Language-Image\nPre-training (CLIP) architecture, pre-trained on massive internet-scale data\nsets, to perform this differentiation. 
We fine-tune CLIP on real images and\nAIGI from several generative models, enabling CLIP to determine if an image is\nAI-generated and, if so, determine what generation method was used to create\nit. We show that the fine-tuned CLIP architecture is able to differentiate AIGI\nas well or better than models whose architecture is specifically designed to\ndetect AIGI. Our method will significantly increase access to AIGI-detecting\ntools and reduce the negative effects of AIGI on society, as our CLIP\nfine-tuning procedures require no architecture changes from publicly available\nmodel repositories and consume significantly less GPU resources than other AIGI\ndetection models.\n","authors":["A. G. Moskowitz","T. Gaona","J. Peterson"],"pdf_url":"https://arxiv.org/pdf/2404.08788v1.pdf","comment":"submitted for publication in Machine Vision and Applications"},{"id":"http://arxiv.org/abs/2404.08785v1","updated":"2024-04-12T19:13:42Z","published":"2024-04-12T19:13:42Z","title":"Under pressure: learning-based analog gauge reading in the wild","summary":" We propose an interpretable framework for reading analog gauges that is\ndeployable on real world robotic systems. Our framework splits the reading task\ninto distinct steps, such that we can detect potential failures at each step.\nOur system needs no prior knowledge of the type of gauge or the range of the\nscale and is able to extract the units used. We show that our gauge reading\nalgorithm is able to extract readings with a relative reading error of less\nthan 2%.\n","authors":["Maurits Reitsma","Julian Keller","Kenneth Blomqvist","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2404.08785v1.pdf","comment":"7 pages, 8 figures, accepted for presentation at the 2024 IEEE\n International Conference on Robotics and Automation (ICRA) and for inclusion\n in the conference proceedings, finalist for the IEEE ICRA 2024 Best Paper\n Award in Automation, source code\n https://github.com/ethz-asl/analog_gauge_reader, Autonomous Systems Lab, ETH\n Zurich"},{"id":"http://arxiv.org/abs/2404.08778v1","updated":"2024-04-12T19:04:59Z","published":"2024-04-12T19:04:59Z","title":"Towards Sim-to-Real Industrial Parts Classification with Synthetic\n Dataset","summary":" This paper is about effectively utilizing synthetic data for training deep\nneural networks for industrial parts classification, in particular, by taking\ninto account the domain gap against real-world images. To this end, we\nintroduce a synthetic dataset that may serve as a preliminary testbed for the\nSim-to-Real challenge; it contains 17 objects of six industrial use cases,\nincluding isolated and assembled parts. A few subsets of objects exhibit large\nsimilarities in shape and albedo for reflecting challenging cases of industrial\nparts. All the sample images come with and without random backgrounds and\npost-processing for evaluating the importance of domain randomization. We call\nit Synthetic Industrial Parts dataset (SIP-17). We study the usefulness of\nSIP-17 through benchmarking the performance of five state-of-the-art deep\nnetwork models, supervised and self-supervised, trained only on the synthetic\ndata while testing them on real data. By analyzing the results, we deduce some\ninsights on the feasibility and challenges of using synthetic data for\nindustrial parts classification and for further developing larger-scale\nsynthetic datasets. 
Our dataset and code are publicly available.\n","authors":["Xiaomeng Zhu","Talha Bilal","Pär Mårtensson","Lars Hanson","Mårten Björkman","Atsuto Maki"],"pdf_url":"https://arxiv.org/pdf/2404.08778v1.pdf","comment":"Published in 2023 IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW)"},{"id":"http://arxiv.org/abs/2404.08767v1","updated":"2024-04-12T18:45:51Z","published":"2024-04-12T18:45:51Z","title":"LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning","summary":" Understanding human instructions to identify the target objects is vital for\nperception systems. In recent years, the advancements of Large Language Models\n(LLMs) have introduced new possibilities for image segmentation. In this work,\nwe delve into reasoning segmentation, a novel task that enables segmentation\nsystem to reason and interpret implicit user intention via large language model\nreasoning and then segment the corresponding target. Our work on reasoning\nsegmentation contributes on both the methodological design and dataset\nlabeling. For the model, we propose a new framework named LLM-Seg. LLM-Seg\neffectively connects the current foundational Segmentation Anything Model and\nthe LLM by mask proposals selection. For the dataset, we propose an automatic\ndata generation pipeline and construct a new reasoning segmentation dataset\nnamed LLM-Seg40K. Experiments demonstrate that our LLM-Seg exhibits competitive\nperformance compared with existing methods. Furthermore, our proposed pipeline\ncan efficiently produce high-quality reasoning segmentation datasets. The\nLLM-Seg40K dataset, developed through this pipeline, serves as a new benchmark\nfor training and evaluating various reasoning segmentation approaches. Our\ncode, models and dataset are at https://github.com/wangjunchi/LLMSeg.\n","authors":["Junchi Wang","Lei Ke"],"pdf_url":"https://arxiv.org/pdf/2404.08767v1.pdf","comment":"Github: https://github.com/wangjunchi/LLMSeg"},{"id":"http://arxiv.org/abs/2404.08761v1","updated":"2024-04-12T18:37:00Z","published":"2024-04-12T18:37:00Z","title":"`Eyes of a Hawk and Ears of a Fox': Part Prototype Network for\n Generalized Zero-Shot Learning","summary":" Current approaches in Generalized Zero-Shot Learning (GZSL) are built upon\nbase models which consider only a single class attribute vector representation\nover the entire image. This is an oversimplification of the process of novel\ncategory recognition, where different regions of the image may have properties\nfrom different seen classes and thus have different predominant attributes.\nWith this in mind, we take a fundamentally different approach: a pre-trained\nVision-Language detector (VINVL) sensitive to attribute information is employed\nto efficiently obtain region features. A learned function maps the region\nfeatures to region-specific attribute attention used to construct class part\nprototypes. We conduct experiments on a popular GZSL benchmark consisting of\nthe CUB, SUN, and AWA2 datasets where our proposed Part Prototype Network (PPN)\nachieves promising results when compared with other popular base models.\nCorresponding ablation studies and analysis show that our approach is highly\npractical and has a distinct advantage over global attribute attention when\nlocalized proposals are available.\n","authors":["Joshua Feinglass","Jayaraman J. Thiagarajan","Rushil Anirudh","T. S. 
Jayram","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08761v1.pdf","comment":"Accepted to the CVPR 2024 LIMIT Workshop"},{"id":"http://arxiv.org/abs/2312.04552v2","updated":"2024-04-12T18:34:31Z","published":"2023-12-07T18:59:20Z","title":"Generating Illustrated Instructions","summary":" We introduce the new task of generating Illustrated Instructions, i.e.,\nvisual instructions customized to a user's needs. We identify desiderata unique\nto this task, and formalize it through a suite of automatic and human\nevaluation metrics, designed to measure the validity, consistency, and efficacy\nof the generations. We combine the power of large language models (LLMs)\ntogether with strong text-to-image generation diffusion models to propose a\nsimple approach called StackedDiffusion, which generates such illustrated\ninstructions given text as input. The resulting model strongly outperforms\nbaseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases,\nusers even prefer it to human-generated articles. Most notably, it enables\nvarious new and exciting applications far beyond what static articles on the\nweb can provide, such as personalized instructions complete with intermediate\nsteps and pictures in response to a user's individual situation.\n","authors":["Sachit Menon","Ishan Misra","Rohit Girdhar"],"pdf_url":"https://arxiv.org/pdf/2312.04552v2.pdf","comment":"Accepted to CVPR 2024. Project website:\n http://facebookresearch.github.io/IllustratedInstructions. Code reproduction:\n https://github.com/sachit-menon/generating-illustrated-instructions-reproduction"},{"id":"http://arxiv.org/abs/2404.08756v1","updated":"2024-04-12T18:29:10Z","published":"2024-04-12T18:29:10Z","title":"SCOUT+: Towards Practical Task-Driven Drivers' Gaze Prediction","summary":" Accurate prediction of drivers' gaze is an important component of\nvision-based driver monitoring and assistive systems. Of particular interest\nare safety-critical episodes, such as performing maneuvers or crossing\nintersections. In such scenarios, drivers' gaze distribution changes\nsignificantly and becomes difficult to predict, especially if the task and\ncontext information is represented implicitly, as is common in many\nstate-of-the-art models. However, explicit modeling of top-down factors\naffecting drivers' attention often requires additional information and\nannotations that may not be readily available.\n In this paper, we address the challenge of effective modeling of task and\ncontext with common sources of data for use in practical systems. To this end,\nwe introduce SCOUT+, a task- and context-aware model for drivers' gaze\nprediction, which leverages route and map information inferred from commonly\navailable GPS data. We evaluate our model on two datasets, DR(eye)VE and BDD-A,\nand demonstrate that using maps improves results compared to bottom-up models\nand reaches performance comparable to the top-down model SCOUT which relies on\nprivileged ground truth information. Code is available at\nhttps://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. 
Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2404.08756v1.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.08755v1","updated":"2024-04-12T18:28:44Z","published":"2024-04-12T18:28:44Z","title":"Training a Vision Language Model as Smartphone Assistant","summary":" Addressing the challenge of a digital assistant capable of executing a wide\narray of user tasks, our research focuses on the realm of instruction-based\nmobile device control. We leverage recent advancements in large language models\n(LLMs) and present a visual language model (VLM) that can fulfill diverse tasks\non mobile devices. Our model functions by interacting solely with the user\ninterface (UI). It uses the visual input from the device screen and mimics\nhuman-like interactions, encompassing gestures such as tapping and swiping.\nThis generality in the input and output space allows our agent to interact with\nany application on the device. Unlike previous methods, our model operates not\nonly on a single screen image but on vision-language sentences created from\nsequences of past screenshots along with corresponding actions. Evaluating our\nmethod on the challenging Android in the Wild benchmark demonstrates its\npromising efficacy and potential.\n","authors":["Nicolai Dorka","Janusz Marecki","Ammar Anwar"],"pdf_url":"https://arxiv.org/pdf/2404.08755v1.pdf","comment":"ICLR 2024 workshop on Generative Models for Decision Making"},{"id":"http://arxiv.org/abs/2404.08749v1","updated":"2024-04-12T18:23:00Z","published":"2024-04-12T18:23:00Z","title":"Data Limitations for Modeling Top-Down Effects on Drivers' Attention","summary":" Driving is a visuomotor task, i.e., there is a connection between what\ndrivers see and what they do. While some models of drivers' gaze account for\ntop-down effects of drivers' actions, the majority learn only bottom-up\ncorrelations between human gaze and driving footage. The crux of the problem is\nlack of public data with annotations that could be used to train top-down\nmodels and evaluate how well models of any kind capture effects of task on\nattention. As a result, top-down models are trained and evaluated on private\ndata and public benchmarks measure only the overall fit to human data.\n In this paper, we focus on data limitations by examining four large-scale\npublic datasets, DR(eye)VE, BDD-A, MAAD, and LBW, used to train and evaluate\nalgorithms for drivers' gaze prediction. We define a set of driving tasks\n(lateral and longitudinal maneuvers) and context elements (intersections and\nright-of-way) known to affect drivers' attention, augment the datasets with\nannotations based on the said definitions, and analyze the characteristics of\ndata recording and processing pipelines w.r.t. capturing what the drivers see\nand do. In sum, the contributions of this work are: 1) quantifying biases of\nthe public datasets, 2) examining performance of the SOTA bottom-up models on\nsubsets of the data involving non-trivial drivers' actions, 3) linking\nshortcomings of the bottom-up models to data limitations, and 4)\nrecommendations for future data collection and processing. The new annotations\nand code for reproducing the results is available at\nhttps://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. 
Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2404.08749v1.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.08748v1","updated":"2024-04-12T18:21:08Z","published":"2024-04-12T18:21:08Z","title":"Multi-Branch Generative Models for Multichannel Imaging with an\n Application to PET/CT Joint Reconstruction","summary":" This paper presents a proof-of-concept approach for learned synergistic\nreconstruction of medical images using multi-branch generative models.\nLeveraging variational autoencoders (VAEs) and generative adversarial networks\n(GANs), our models learn from pairs of images simultaneously, enabling\neffective denoising and reconstruction. Synergistic image reconstruction is\nachieved by incorporating the trained models in a regularizer that evaluates\nthe distance between the images and the model, in a similar fashion to\nmultichannel dictionary learning (DiL). We demonstrate the efficacy of our\napproach on both Modified National Institute of Standards and Technology\n(MNIST) and positron emission tomography (PET)/computed tomography (CT)\ndatasets, showcasing improved image quality and information sharing between\nmodalities. Despite challenges such as patch decomposition and model\nlimitations, our results underscore the potential of generative models for\nenhancing medical imaging reconstruction.\n","authors":["Noel Jeffrey Pinton","Alexandre Bousse","Catherine Cheze-Le-Rest","Dimitris Visvikis"],"pdf_url":"https://arxiv.org/pdf/2404.08748v1.pdf","comment":"12 pages, 16 figures, submitted to IEEE TRPMS"},{"id":"http://arxiv.org/abs/2310.09275v3","updated":"2024-04-12T18:10:51Z","published":"2023-10-13T17:38:41Z","title":"Understanding and Modeling the Effects of Task and Context on Drivers'\n Gaze Allocation","summary":" To further advance driver monitoring and assistance systems, it is important\nto understand how drivers allocate their attention, in other words, where do\nthey tend to look and why. Traditionally, factors affecting human visual\nattention have been divided into bottom-up (involuntary attraction to salient\nregions) and top-down (driven by the demands of the task being performed).\nAlthough both play a role in directing drivers' gaze, most of the existing\nmodels for drivers' gaze prediction apply techniques developed for bottom-up\nsaliency and do not consider influences of the drivers' actions explicitly.\nLikewise, common driving attention benchmarks lack relevant annotations for\ndrivers' actions and the context in which they are performed. Therefore, to\nenable analysis and modeling of these factors for drivers' gaze prediction, we\npropose the following: 1) we correct the data processing pipeline used in\nDR(eye)VE to reduce noise in the recorded gaze data; 2) we then add per-frame\nlabels for driving task and context; 3) we benchmark a number of baseline and\nSOTA models for saliency and driver gaze prediction and use new annotations to\nanalyze how their performance changes in scenarios involving different tasks;\nand, lastly, 4) we develop a novel model that modulates drivers' gaze\nprediction with explicit action and context information. 
While reducing noise\nin the DR(eye)VE gaze data improves results of all models, we show that using\ntask information in our proposed model boosts performance even further compared\nto bottom-up models on the cleaned up data, both overall (by 24% KLD and 89%\nNSS) and on scenarios that involve performing safety-critical maneuvers and\ncrossing intersections (by up to 10--30% KLD). Extended annotations and code\nare available at https://github.com/ykotseruba/SCOUT.\n","authors":["Iuliia Kotseruba","John K. Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2310.09275v3.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024"},{"id":"http://arxiv.org/abs/2404.10534v1","updated":"2024-04-12T21:41:50Z","published":"2024-04-12T21:41:50Z","title":"Into the Fog: Evaluating Multiple Object Tracking Robustness","summary":" State-of-the-art (SOTA) trackers have shown remarkable Multiple Object\nTracking (MOT) performance when trained and evaluated on current benchmarks.\nHowever, these benchmarks primarily consist of clear scenarios, overlooking\nadverse atmospheric conditions such as fog, haze, smoke and dust. As a result,\nthe robustness of SOTA trackers remains underexplored. To address these\nlimitations, we propose a pipeline for physics-based volumetric fog simulation\nin arbitrary real-world MOT datasets utilizing frame-by-frame monocular depth\nestimation and a fog formation optical model. Moreover, we enhance our\nsimulation by rendering both homogeneous and heterogeneous fog effects. We\npropose to use the dark channel prior method to estimate fog (smoke) color,\nwhich shows promising results even in night and indoor scenes. We present the\nleading tracking benchmark MOTChallenge (MOT17 dataset) overlaid by fog (smoke\nfor indoor scenes) of various intensity levels and conduct a comprehensive\nevaluation of SOTA MOT methods, revealing their limitations under fog and\nfog-similar challenges.\n","authors":["Nadezda Kirillova","M. Jehanzeb Mirza","Horst Possegger","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2404.10534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10540v1","updated":"2024-04-12T20:40:12Z","published":"2024-04-12T20:40:12Z","title":"SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic\n Perception","summary":" Recently, event-based vision sensors have gained attention for autonomous\ndriving applications, as conventional RGB cameras face limitations in handling\nchallenging dynamic conditions. However, the availability of real-world and\nsynthetic event-based vision datasets remains limited. In response to this gap,\nwe present SEVD, a first-of-its-kind multi-view ego and fixed perception\nsynthetic event-based dataset using multiple dynamic vision sensors within the\nCARLA simulator. Data sequences are recorded across diverse lighting (noon,\nnighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy)\nwith domain shifts (discrete and continuous). SEVD spans urban, suburban,\nrural, and highway scenes featuring various classes of objects (car, truck,\nvan, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes\nRGB imagery, depth maps, optical flow, semantic, and instance segmentation,\nfacilitating a comprehensive understanding of the scene. Furthermore, we\nevaluate the dataset using state-of-the-art event-based (RED, RVT) and\nframe-based (YOLOv8) methods for traffic participant detection tasks and\nprovide baseline benchmarks for assessment. 
Additionally, we conduct\nexperiments to assess the synthetic event-based dataset's generalization\ncapabilities. The dataset is available at\nhttps://eventbasedvision.github.io/SEVD\n","authors":["Manideep Reddy Aliminati","Bharatesh Chakravarthi","Aayush Atul Verma","Arpitsinh Vaghela","Hua Wei","Xuesong Zhou","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10540v1.pdf","comment":null}]},"2024-04-15T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.09752v1","updated":"2024-04-15T12:53:48Z","published":"2024-04-15T12:53:48Z","title":"Can We Break Free from Strong Data Augmentations in Self-Supervised\n Learning?","summary":" Self-supervised learning (SSL) has emerged as a promising solution for\naddressing the challenge of limited labeled data in deep neural networks\n(DNNs), offering scalability potential. However, the impact of design\ndependencies within the SSL framework remains insufficiently investigated. In\nthis study, we comprehensively explore SSL behavior across a spectrum of\naugmentations, revealing their crucial role in shaping SSL model performance\nand learning mechanisms. Leveraging these insights, we propose a novel learning\napproach that integrates prior knowledge, with the aim of curtailing the need\nfor extensive data augmentations and thereby amplifying the efficacy of learned\nrepresentations. Notably, our findings underscore that SSL models imbued with\nprior knowledge exhibit reduced texture bias, diminished reliance on shortcuts\nand augmentations, and improved robustness against both natural and adversarial\ncorruptions. These findings not only illuminate a new direction in SSL\nresearch, but also pave the way for enhancing DNN performance while\nconcurrently alleviating the imperative for intensive data augmentation,\nthereby enhancing scalability and real-world problem-solving capabilities.\n","authors":["Shruthi Gowda","Elahe Arani","Bahram Zonooz"],"pdf_url":"https://arxiv.org/pdf/2404.09752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09748v1","updated":"2024-04-15T12:50:44Z","published":"2024-04-15T12:50:44Z","title":"LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted\n Gaussian Primitives","summary":" Large garages are ubiquitous yet intricate scenes in our daily lives, posing\nchallenges characterized by monotonous colors, repetitive patterns, reflective\nsurfaces, and transparent vehicle glass. Conventional Structure from Motion\n(SfM) methods for camera pose estimation and 3D reconstruction fail in these\nenvironments due to poor correspondence construction. To address these\nchallenges, this paper introduces LetsGo, a LiDAR-assisted Gaussian splatting\napproach for large-scale garage modeling and rendering. We develop a handheld\nscanner, Polar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate\naccurate LiDAR and image data scanning. With this Polar device, we present a\nGarageWorld dataset consisting of five expansive garage scenes with diverse\ngeometric structures and will release the dataset to the community for further\nresearch. We demonstrate that the collected LiDAR point cloud by the Polar\ndevice enhances a suite of 3D Gaussian splatting algorithms for garage scene\nmodeling and rendering. We also propose a novel depth regularizer for 3D\nGaussian splatting algorithm training, effectively eliminating floating\nartifacts in rendered images, and a lightweight Level of Detail (LOD) Gaussian\nrenderer for real-time viewing on web-based devices. 
Additionally, we explore a\nhybrid representation that combines the advantages of traditional mesh in\ndepicting simple geometry and colors (e.g., walls and the ground) with modern\n3D Gaussian representations capturing complex details and high-frequency\ntextures. This strategy achieves an optimal balance between memory performance\nand rendering quality. Experimental results on our dataset, along with\nScanNet++ and KITTI-360, demonstrate the superiority of our method in rendering\nquality and resource efficiency.\n","authors":["Jiadi Cui","Junming Cao","Yuhui Zhong","Liao Wang","Fuqiang Zhao","Penghao Wang","Yifan Chen","Zhipeng He","Lan Xu","Yujiao Shi","Yingliang Zhang","Jingyi Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09748v1.pdf","comment":"Project Page: https://jdtsui.github.io/letsgo/"},{"id":"http://arxiv.org/abs/2404.09736v1","updated":"2024-04-15T12:37:26Z","published":"2024-04-15T12:37:26Z","title":"FSRT: Facial Scene Representation Transformer for Face Reenactment from\n Factorized Appearance, Head-pose, and Facial Expression Features","summary":" The task of face reenactment is to transfer the head motion and facial\nexpressions from a driving video to the appearance of a source image, which may\nbe of a different person (cross-reenactment). Most existing methods are\nCNN-based and estimate optical flow from the source image to the current\ndriving frame, which is then inpainted and refined to produce the output\nanimation. We propose a transformer-based encoder for computing a set-latent\nrepresentation of the source image(s). We then predict the output color of a\nquery pixel using a transformer-based decoder, which is conditioned with\nkeypoints and a facial expression vector extracted from the driving frame.\nLatent representations of the source person are learned in a self-supervised\nmanner that factorize their appearance, head pose, and facial expressions.\nThus, they are perfectly suited for cross-reenactment. In contrast to most\nrelated work, our method naturally extends to multiple source images and can\nthus adapt to person-specific facial dynamics. We also propose data\naugmentation and regularization schemes that are necessary to prevent\noverfitting and support generalizability of the learned representations. We\nevaluated our approach in a randomized user study. The results indicate\nsuperior performance compared to the state-of-the-art in terms of motion\ntransfer quality and temporal consistency.\n","authors":["Andre Rochow","Max Schwarz","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2404.09736v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09735v1","updated":"2024-04-15T12:35:10Z","published":"2024-04-15T12:35:10Z","title":"Equipping Diffusion Models with Differentiable Spatial Entropy for\n Low-Light Image Enhancement","summary":" Image restoration, which aims to recover high-quality images from their\ncorrupted counterparts, often faces the challenge of being an ill-posed problem\nthat allows multiple solutions for a single input. 
However, most deep learning\nbased works simply employ l1 loss to train their network in a deterministic\nway, resulting in over-smoothed predictions with inferior perceptual quality.\nIn this work, we propose a novel method that shifts the focus from a\ndeterministic pixel-by-pixel comparison to a statistical perspective,\nemphasizing the learning of distributions rather than individual pixel values.\nThe core idea is to introduce spatial entropy into the loss function to measure\nthe distribution difference between predictions and targets. To make this\nspatial entropy differentiable, we employ kernel density estimation (KDE) to\napproximate the probabilities for specific intensity values of each pixel with\ntheir neighbor areas. Specifically, we equip the entropy with diffusion models\nand aim for superior accuracy and enhanced perceptual quality over l1 based\nnoise matching loss. In the experiments, we evaluate the proposed method for\nlow light enhancement on two datasets and the NTIRE challenge 2024. All these\nresults illustrate the effectiveness of our statistic-based entropy loss. Code\nis available at https://github.com/shermanlian/spatial-entropy-loss.\n","authors":["Wenyi Lian","Wenjing Lian","Ziwei Luo"],"pdf_url":"https://arxiv.org/pdf/2404.09735v1.pdf","comment":"CVPRW 2024, best LPIPS in the NTIRE low light enhancement challenge\n 2024"},{"id":"http://arxiv.org/abs/2404.09732v1","updated":"2024-04-15T12:34:21Z","published":"2024-04-15T12:34:21Z","title":"Photo-Realistic Image Restoration in the Wild with Controlled\n Vision-Language Models","summary":" Though diffusion models have been successfully applied to various image\nrestoration (IR) tasks, their performance is sensitive to the choice of\ntraining datasets. Typically, diffusion models trained in specific datasets\nfail to recover images that have out-of-distribution degradations. To address\nthis problem, this work leverages a capable vision-language model and a\nsynthetic degradation pipeline to learn image restoration in the wild (wild\nIR). More specifically, all low-quality images are simulated with a synthetic\ndegradation pipeline that contains multiple common degradations such as blur,\nresize, noise, and JPEG compression. Then we introduce robust training for a\ndegradation-aware CLIP model to extract enriched image content features to\nassist high-quality image restoration. Our base diffusion model is the image\nrestoration SDE (IR-SDE). Built upon it, we further present a posterior\nsampling strategy for fast noise-free image generation. We evaluate our model\non both synthetic and real-world degradation datasets. Moreover, experiments on\nthe unified image restoration task illustrate that the proposed posterior\nsampling improves image generation quality for various degradations.\n","authors":["Ziwei Luo","Fredrik K. Gustafsson","Zheng Zhao","Jens Sjölund","Thomas B. Schön"],"pdf_url":"https://arxiv.org/pdf/2404.09732v1.pdf","comment":"CVPRW 2024; Code: https://github.com/Algolzw/daclip-uir"},{"id":"http://arxiv.org/abs/2404.06913v2","updated":"2024-04-15T12:27:51Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. 
Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/.\n Fixed some typos in the supplementary material"},{"id":"http://arxiv.org/abs/2402.06611v2","updated":"2024-04-15T12:13:42Z","published":"2024-02-09T18:42:30Z","title":"Image-based Deep Learning for the time-dependent prediction of fresh\n concrete properties","summary":" Increasing the degree of digitisation and automation in the concrete\nproduction process can play a crucial role in reducing the CO$_2$ emissions\nthat are associated with the production of concrete. In this paper, a method is\npresented that makes it possible to predict the properties of fresh concrete\nduring the mixing process based on stereoscopic image sequences of the\nconcretes flow behaviour. A Convolutional Neural Network (CNN) is used for the\nprediction, which receives the images supported by information on the mix\ndesign as input. In addition, the network receives temporal information in the\nform of the time difference between the time at which the images are taken and\nthe time at which the reference values of the concretes are carried out. With\nthis temporal information, the network implicitly learns the time-dependent\nbehaviour of the concretes properties. The network predicts the slump flow\ndiameter, the yield stress and the plastic viscosity. The time-dependent\nprediction potentially opens up the pathway to determine the temporal\ndevelopment of the fresh concrete properties already during mixing. This\nprovides a huge advantage for the concrete industry. As a result,\ncountermeasures can be taken in a timely manner. It is shown that an approach\nbased on depth and optical flow images, supported by information of the mix\ndesign, achieves the best results.\n","authors":["Max Meyer","Amadeus Langer","Max Mehltretter","Dries Beyer","Max Coenen","Tobias Schack","Michael Haist","Christian Heipke"],"pdf_url":"https://arxiv.org/pdf/2402.06611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08212v2","updated":"2024-04-15T12:08:41Z","published":"2024-01-16T08:56:52Z","title":"Human vs. 
LMMs: Exploring the Discrepancy in Emoji Interpretation and\n Usage in Digital Communication","summary":" Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when\nprocessing multimodal information, especially in the context of social media,\nhas garnered immense interest due to its broad potential and far-reaching\nimplications. Emojis, as one of the most unique aspects of digital\ncommunication, are pivotal in enriching and often clarifying the emotional and\ntonal dimensions. Yet, there is a notable gap in understanding how these\nadvanced models, such as GPT-4V, interpret and employ emojis in the nuanced\ncontext of online interaction. This study intends to bridge this gap by\nexamining the behavior of GPT-4V in replicating human-like use of emojis. The\nfindings reveal a discernible discrepancy between human and GPT-4V behaviors,\nlikely due to the subjective nature of human interpretation and the limitations\nof GPT-4V's English-centric training, suggesting cultural biases and inadequate\nrepresentation of non-English cultures.\n","authors":["Hanjia Lyu","Weihong Qi","Zhongyu Wei","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2401.08212v2.pdf","comment":"Accepted for publication in ICWSM 2024"},{"id":"http://arxiv.org/abs/2404.09707v1","updated":"2024-04-15T12:06:00Z","published":"2024-04-15T12:06:00Z","title":"Adaptive Patching for High-resolution Image Segmentation with\n Transformers","summary":" Attention-based models are proliferating in the space of image analytics,\nincluding segmentation. The standard method of feeding images to transformer\nencoders is to divide the images into patches and then feed the patches to the\nmodel as a linear sequence of tokens. For high-resolution images, e.g.\nmicroscopic pathology images, the quadratic compute and memory cost prohibits\nthe use of an attention-based model, if we are to use smaller patch sizes that\nare favorable in segmentation. The solution is to either use custom complex\nmulti-resolution models or approximate attention schemes. We take inspiration\nfrom Adaptive Mesh Refinement (AMR) methods in HPC by adaptively patching the\nimages, as a pre-processing step, based on the image details to reduce the\nnumber of patches being fed to the model, by orders of magnitude. This method\nhas a negligible overhead, and works seamlessly with any attention-based model,\ni.e. it is a pre-processing step that can be adopted by any attention-based\nmodel without friction. We demonstrate superior segmentation quality over SoTA\nsegmentation models for real-world pathology datasets while gaining a geomean\nspeedup of $6.9\\times$ for resolutions up to $64K^2$, on up to $2,048$ GPUs.\n","authors":["Enzhi Zhang","Isaac Lyngaas","Peng Chen","Xiao Wang","Jun Igarashi","Yuankai Huo","Mohamed Wahib","Masaharu Munetomo"],"pdf_url":"https://arxiv.org/pdf/2404.09707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09697v1","updated":"2024-04-15T11:59:19Z","published":"2024-04-15T11:59:19Z","title":"HSIDMamba: Exploring Bidirectional State-Space Models for Hyperspectral\n Denoising","summary":" Effectively discerning spatial-spectral dependencies in HSI denoising is\ncrucial, but prevailing methods using convolution or transformers still face\ncomputational efficiency limitations. Recently, the emerging Selective State\nSpace Model(Mamba) has risen with its nearly linear computational complexity in\nprocessing natural language sequences, which inspired us to explore its\npotential in handling long spectral sequences. 
In this paper, we propose\nHSIDMamba(HSDM), tailored to exploit the linear complexity for effectively\ncapturing spatial-spectral dependencies in HSI denoising. In particular, HSDM\ncomprises multiple Hyperspectral Continuous Scan Blocks, incorporating\nBCSM(Bidirectional Continuous Scanning Mechanism), scale residual, and spectral\nattention mechanisms to enhance the capture of long-range and local\nspatial-spectral information. BCSM strengthens spatial-spectral interactions by\nlinking forward and backward scans and enhancing information from eight\ndirections through SSM, significantly enhancing the perceptual capability of\nHSDM and improving denoising performance more effectively. Extensive\nevaluations against HSI denoising benchmarks validate the superior performance\nof HSDM, achieving state-of-the-art results in performance and surpassing the\nefficiency of the latest transformer architectures by $30\\%$.\n","authors":["Yang Liu","Jiahua Xiao","Yu Guo","Peilin Jiang","Haiwei Yang","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09692v1","updated":"2024-04-15T11:46:24Z","published":"2024-04-15T11:46:24Z","title":"XoFTR: Cross-modal Feature Matching Transformer","summary":" We introduce, XoFTR, a cross-modal cross-view method for local feature\nmatching between thermal infrared (TIR) and visible images. Unlike visible\nimages, TIR images are less susceptible to adverse lighting and weather\nconditions but present difficulties in matching due to significant texture and\nintensity differences. Current hand-crafted and learning-based methods for\nvisible-TIR matching fall short in handling viewpoint, scale, and texture\ndiversities. To address this, XoFTR incorporates masked image modeling\npre-training and fine-tuning with pseudo-thermal image augmentation to handle\nthe modality differences. Additionally, we introduce a refined matching\npipeline that adjusts for scale discrepancies and enhances match reliability\nthrough sub-pixel level refinement. To validate our approach, we collect a\ncomprehensive visible-thermal dataset, and show that our method outperforms\nexisting methods on many benchmarks.\n","authors":["Önder Tuzcuoğlu","Aybora Köksal","Buğra Sofu","Sinan Kalkan","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2404.09692v1.pdf","comment":"CVPR Image Matching Workshop, 2024. 12 pages, 7 figures, 5 tables.\n Codes and dataset are available at https://github.com/OnderT/XoFTR"},{"id":"http://arxiv.org/abs/2404.09690v1","updated":"2024-04-15T11:45:30Z","published":"2024-04-15T11:45:30Z","title":"Harnessing GPT-4V(ision) for Insurance: A Preliminary Exploration","summary":" The emergence of Large Multimodal Models (LMMs) marks a significant milestone\nin the development of artificial intelligence. Insurance, as a vast and complex\ndiscipline, involves a wide variety of data forms in its operational processes,\nincluding text, images, and videos, thereby giving rise to diverse multimodal\ntasks. Despite this, there has been limited systematic exploration of\nmultimodal tasks specific to insurance, nor a thorough investigation into how\nLMMs can address these challenges. In this paper, we explore GPT-4V's\ncapabilities in the insurance domain. We categorize multimodal tasks by\nfocusing primarily on visual aspects based on types of insurance (e.g., auto,\nhousehold/commercial property, health, and agricultural insurance) and\ninsurance stages (e.g., risk assessment, risk monitoring, and claims\nprocessing). 
Our experiment reveals that GPT-4V exhibits remarkable abilities\nin insurance-related tasks, demonstrating not only a robust understanding of\nmultimodal content in the insurance domain but also a comprehensive knowledge\nof insurance scenarios. However, there are notable shortcomings: GPT-4V\nstruggles with detailed risk rating and loss assessment, suffers from\nhallucination in image understanding, and shows variable support for different\nlanguages. Through this work, we aim to bridge the insurance domain with\ncutting-edge LMM technology, facilitate interdisciplinary exchange and\ndevelopment, and provide a foundation for the continued advancement and\nevolution of future research endeavors.\n","authors":["Chenwei Lin","Hanjia Lyu","Jiebo Luo","Xian Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v3","updated":"2024-04-15T11:40:39Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09683v1","updated":"2024-04-15T11:36:31Z","published":"2024-04-15T11:36:31Z","title":"Post-Training Network Compression for 3D Medical Image Segmentation:\n Reducing Computational Efforts via Tucker Decomposition","summary":" We address the computational barrier of deploying advanced deep learning\nsegmentation models in clinical settings by studying the efficacy of network\ncompression through tensor decomposition. We propose a post-training Tucker\nfactorization that enables the decomposition of pre-existing models to reduce\ncomputational requirements without impeding segmentation accuracy. We applied\nTucker decomposition to the convolutional kernels of the TotalSegmentator (TS)\nmodel, an nnU-Net model trained on a comprehensive dataset for automatic\nsegmentation of 117 anatomical structures. Our approach reduced the\nfloating-point operations (FLOPs) and memory required during inference,\noffering an adjustable trade-off between computational efficiency and\nsegmentation quality. 
This study utilized the publicly available TS dataset,\nemploying various downsampling factors to explore the relationship between\nmodel size, inference speed, and segmentation performance. The application of\nTucker decomposition to the TS model substantially reduced the model parameters\nand FLOPs across various compression rates, with limited loss in segmentation\naccuracy. We removed up to 88% of the model's parameters with no significant\nperformance changes in the majority of classes after fine-tuning. Practical\nbenefits varied across different graphics processing unit (GPU) architectures,\nwith more distinct speed-ups on less powerful hardware. Post-hoc network\ncompression via Tucker decomposition presents a viable strategy for reducing\nthe computational demand of medical image segmentation models without\nsubstantially sacrificing accuracy. This approach enables the broader adoption\nof advanced deep learning technologies in clinical practice, offering a way to\nnavigate the constraints of hardware capabilities.\n","authors":["Tobias Weber","Jakob Dexl","David Rügamer","Michael Ingrisch"],"pdf_url":"https://arxiv.org/pdf/2404.09683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00368v2","updated":"2024-04-15T11:18:00Z","published":"2024-03-30T13:41:57Z","title":"Towards Variable and Coordinated Holistic Co-Speech Motion Generation","summary":" This paper addresses the problem of generating lifelike holistic co-speech\nmotions for 3D avatars, focusing on two key aspects: variability and\ncoordination. Variability allows the avatar to exhibit a wide range of motions\neven with similar speech content, while coordination ensures a harmonious\nalignment among facial expressions, hand gestures, and body poses. We aim to\nachieve both with ProbTalk, a unified probabilistic framework designed to\njointly model facial, hand, and body movements in speech. ProbTalk builds on\nthe variational autoencoder (VAE) architecture and incorporates three core\ndesigns. First, we introduce product quantization (PQ) to the VAE, which\nenriches the representation of complex holistic motion. Second, we devise a\nnovel non-autoregressive model that embeds 2D positional encoding into the\nproduct-quantized representation, thereby preserving essential structure\ninformation of the PQ codes. Last, we employ a secondary stage to refine the\npreliminary prediction, further sharpening the high-frequency details. Coupling\nthese three designs enables ProbTalk to generate natural and diverse holistic\nco-speech motions, outperforming several state-of-the-art methods in\nqualitative and quantitative evaluations, particularly in terms of realism. Our\ncode and model will be released for research purposes at\nhttps://feifeifeiliu.github.io/probtalk/.\n","authors":["Yifei Liu","Qiong Cao","Yandong Wen","Huaiguang Jiang","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2404.00368v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00362v2","updated":"2024-04-15T11:03:06Z","published":"2023-12-01T05:59:08Z","title":"Dancing with Still Images: Video Distillation via Static-Dynamic\n Disentanglement","summary":" Recently, dataset distillation has paved the way towards efficient machine\nlearning, especially for image datasets. However, the distillation for videos,\ncharacterized by an exclusive temporal dimension, remains an underexplored\ndomain. In this work, we provide the first systematic study of video\ndistillation and introduce a taxonomy to categorize temporal compression. 
Our\ninvestigation reveals that the temporal information is usually not well learned\nduring distillation, and the temporal dimension of synthetic data contributes\nlittle. The observations motivate our unified framework of disentangling the\ndynamic and static information in the videos. It first distills the videos into\nstill images as static memory and then compensates the dynamic and motion\ninformation with a learnable dynamic memory block. Our method achieves\nstate-of-the-art on video datasets at different scales, with a notably smaller\nmemory storage budget. Our code is available at\nhttps://github.com/yuz1wan/video_distillation.\n","authors":["Ziyu Wang","Yue Xu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2312.00362v2.pdf","comment":"CVPR 2024, project page: https://mvig-rhos.com/video-distill"},{"id":"http://arxiv.org/abs/2404.09666v1","updated":"2024-04-15T10:57:16Z","published":"2024-04-15T10:57:16Z","title":"Deformable MRI Sequence Registration for AI-based Prostate Cancer\n Diagnosis","summary":" The PI-CAI (Prostate Imaging: Cancer AI) challenge led to expert-level\ndiagnostic algorithms for clinically significant prostate cancer detection. The\nalgorithms receive biparametric MRI scans as input, which consist of\nT2-weighted and diffusion-weighted scans. These scans can be misaligned due to\nmultiple factors in the scanning process. Image registration can alleviate this\nissue by predicting the deformation between the sequences. We investigate the\neffect of image registration on the diagnostic performance of AI-based prostate\ncancer diagnosis. First, the image registration algorithm, developed in\nMeVisLab, is analyzed using a dataset with paired lesion annotations. Second,\nthe effect on diagnosis is evaluated by comparing case-level cancer diagnosis\nperformance between using the original dataset, rigidly aligned\ndiffusion-weighted scans, or deformably aligned diffusion-weighted scans. Rigid\nregistration showed no improvement. Deformable registration demonstrated a\nsubstantial improvement in lesion overlap (+10% median Dice score) and a\npositive yet non-significant improvement in diagnostic performance (+0.3%\nAUROC, p=0.18). Our investigation shows that a substantial improvement in\nlesion alignment does not directly lead to a significant improvement in\ndiagnostic performance. Qualitative analysis indicated that jointly developing\nimage registration methods and diagnostic AI algorithms could enhance\ndiagnostic accuracy and patient outcomes.\n","authors":["Alessa Hering","Sarah de Boer","Anindo Saha","Jasper J. Twilt","Derya Yakar","Maarten de Rooij","Henkjan Huisman","Joeran S. Bosma"],"pdf_url":"https://arxiv.org/pdf/2404.09666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08285v2","updated":"2024-04-15T10:50:47Z","published":"2024-04-12T07:19:16Z","title":"A Survey of Neural Network Robustness Assessment in Image Recognition","summary":" In recent years, there has been significant attention given to the robustness\nassessment of neural networks. Robustness plays a critical role in ensuring\nreliable operation of artificial intelligence (AI) systems in complex and\nuncertain environments. Deep learning's robustness problem is particularly\nsignificant, highlighted by the discovery of adversarial attacks on image\nclassification models. 
Researchers have dedicated efforts to evaluate\nrobustness in diverse perturbation conditions for image recognition tasks.\nRobustness assessment encompasses two main techniques: robustness verification/\ncertification for deliberate adversarial attacks and robustness testing for\nrandom data corruptions. In this survey, we present a detailed examination of\nboth adversarial robustness (AR) and corruption robustness (CR) in neural\nnetwork assessment. Analyzing current research papers and standards, we provide\nan extensive overview of robustness assessment in image recognition. Three\nessential aspects are analyzed: concepts, metrics, and assessment methods. We\ninvestigate the perturbation metrics and range representations used to measure\nthe degree of perturbations on images, as well as the robustness metrics\nspecifically for the robustness conditions of classification models. The\nstrengths and limitations of the existing methods are also discussed, and some\npotential directions for future research are provided.\n","authors":["Jie Wang","Jun Ai","Minyan Lu","Haoran Su","Dan Yu","Yutao Zhang","Junda Zhu","Jingyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08285v2.pdf","comment":"Corrected typos and grammatical errors in Section 5"},{"id":"http://arxiv.org/abs/2404.09654v1","updated":"2024-04-15T10:42:22Z","published":"2024-04-15T10:42:22Z","title":"Do LLMs Understand Visual Anomalies? Uncovering LLM Capabilities in\n Zero-shot Anomaly Detection","summary":" Large vision-language models (LVLMs) are markedly proficient in deriving\nvisual representations guided by natural language. Recent explorations have\nutilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by\npairing images with textual descriptions indicative of normal and abnormal\nconditions, referred to as anomaly prompts. However, existing approaches depend\non static anomaly prompts that are prone to cross-semantic ambiguity, and\nprioritize global image-level representations over crucial local pixel-level\nimage-to-text alignment that is necessary for accurate anomaly localization. In\nthis paper, we present ALFA, a training-free approach designed to address these\nchallenges via a unified model. We propose a run-time prompt adaptation\nstrategy, which first generates informative anomaly prompts to leverage the\ncapabilities of a large language model (LLM). This strategy is enhanced by a\ncontextual scoring mechanism for per-image anomaly prompt adaptation and\ncross-semantic ambiguity mitigation. We further introduce a novel fine-grained\naligner to fuse local pixel-level semantics for precise anomaly localization,\nby projecting the image-text alignment from global to local semantic spaces.\nExtensive evaluations on the challenging MVTec and VisA datasets confirm ALFA's\neffectiveness in harnessing the language potential for zero-shot VAD, achieving\nsignificant PRO improvements of 12.1% on MVTec AD and 8.9% on VisA compared to\nstate-of-the-art zero-shot VAD approaches.\n","authors":["Jiaqi Zhu","Shaofeng Cai","Fang Deng","Junran Wu"],"pdf_url":"https://arxiv.org/pdf/2404.09654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03453v2","updated":"2024-04-15T10:28:44Z","published":"2023-09-07T02:28:04Z","title":"SyncDreamer: Generating Multiview-consistent Images from a Single-view\n Image","summary":" In this paper, we present a novel diffusion model called that generates\nmultiview-consistent images from a single-view image. 
Using pretrained\nlarge-scale 2D diffusion models, recent work Zero123 demonstrates the ability\nto generate plausible novel views from a single-view image of an object.\nHowever, maintaining consistency in geometry and colors for the generated\nimages remains a challenge. To address this issue, we propose a synchronized\nmultiview diffusion model that models the joint probability distribution of\nmultiview images, enabling the generation of multiview-consistent images in a\nsingle reverse process. SyncDreamer synchronizes the intermediate states of all\nthe generated images at every step of the reverse process through a 3D-aware\nfeature attention mechanism that correlates the corresponding features across\ndifferent views. Experiments show that SyncDreamer generates images with high\nconsistency across different views, thus making it well-suited for various 3D\ngeneration tasks such as novel-view-synthesis, text-to-3D, and image-to-3D.\n","authors":["Yuan Liu","Cheng Lin","Zijiao Zeng","Xiaoxiao Long","Lingjie Liu","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03453v2.pdf","comment":"ICLR 2024 Spotlight. Project page:\n https://liuyuan-pal.github.io/SyncDreamer/ Code:\n https://github.com/liuyuan-pal/SyncDreamer"},{"id":"http://arxiv.org/abs/2404.09645v1","updated":"2024-04-15T10:24:32Z","published":"2024-04-15T10:24:32Z","title":"Real-world Instance-specific Image Goal Navigation for Service Robots:\n Bridging the Domain Gap with Contrastive Learning","summary":" Improving instance-specific image goal navigation (InstanceImageNav), which\nlocates the identical object in a real-world environment from a query image, is\nessential for robotic systems to assist users in finding desired objects. The\nchallenge lies in the domain gap between low-quality images observed by the\nmoving robot, characterized by motion blur and low-resolution, and high-quality\nquery images provided by the user. Such domain gaps could significantly reduce\nthe task success rate but have not been the focus of previous work. To address\nthis, we propose a novel method called Few-shot Cross-quality Instance-aware\nAdaptation (CrossIA), which employs contrastive learning with an instance\nclassifier to align features between massive low- and few high-quality images.\nThis approach effectively reduces the domain gap by bringing the latent\nrepresentations of cross-quality images closer on an instance basis.\nAdditionally, the system integrates an object image collection with a\npre-trained deblurring model to enhance the observed image quality. Our method\nfine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We\nevaluated our method's effectiveness through an InstanceImageNav task with 20\ndifferent types of instances, where the robot identifies the same instance in a\nreal-world environment as a high-quality query image. Our experiments showed\nthat our method improves the task success rate by up to three times compared to\nthe baseline, a conventional approach based on SuperGlue. These findings\nhighlight the potential of leveraging contrastive learning and image\nenhancement techniques to bridge the domain gap and improve object localization\nin robotic applications. 
The project website is\nhttps://emergentsystemlabstudent.github.io/DomainBridgingNav/.\n","authors":["Taichi Sakaguchi","Akira Taniguchi","Yoshinobu Hagiwara","Lotfi El Hafi","Shoichi Hasegawa","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2404.09645v1.pdf","comment":"See website at\n https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Submitted to\n IROS2024"},{"id":"http://arxiv.org/abs/2404.09640v1","updated":"2024-04-15T10:19:39Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: Comments: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at\nhttps://github.com/JethroJames/CREST.\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v1.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2404.05468v2","updated":"2024-04-15T10:13:25Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. 
Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v2.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2403.16092v2","updated":"2024-04-15T10:06:41Z","published":"2024-03-24T11:09:41Z","title":"Are NeRFs ready for autonomous driving? Towards closing the\n real-to-simulation gap","summary":" Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing\nautonomous driving (AD) research, offering scalable closed-loop simulation and\ndata augmentation capabilities. However, to trust the results achieved in\nsimulation, one needs to ensure that AD systems perceive real and rendered data\nin the same way. Although the performance of rendering methods is increasing,\nmany scenarios will remain inherently challenging to reconstruct faithfully. To\nthis end, we propose a novel perspective for addressing the real-to-simulated\ndata gap. Rather than solely focusing on improving rendering fidelity, we\nexplore simple yet effective methods to enhance perception model robustness to\nNeRF artifacts without compromising performance on real data. Moreover, we\nconduct the first large-scale investigation into the real-to-simulated data gap\nin an AD setting using a state-of-the-art neural rendering technique.\nSpecifically, we evaluate object detectors and an online mapping model on real\nand simulated data, and study the effects of different fine-tuning\nstrategies.Our results show notable improvements in model robustness to\nsimulated data, even improving real-world performance in some cases. Last, we\ndelve into the correlation between the real-to-simulated gap and image\nreconstruction metrics, identifying FID and LPIPS as strong indicators. See\nhttps://research.zenseact.com/publications/closing-real2sim-gap for our project\npage.\n","authors":["Carl Lindström","Georg Hess","Adam Lilja","Maryam Fatemi","Lars Hammarstrand","Christoffer Petersson","Lennart Svensson"],"pdf_url":"https://arxiv.org/pdf/2403.16092v2.pdf","comment":"Accepted at Workshop on Autonomous Driving, CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02244v3","updated":"2024-04-15T10:06:19Z","published":"2023-12-04T12:30:07Z","title":"Geometrically-driven Aggregation for Zero-shot 3D Point Cloud\n Understanding","summary":" Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language\nModels (VLMs). Existing strategies directly map Vision-Language Models from 2D\npixels of rendered or captured views to 3D points, overlooking the inherent and\nexpressible point cloud geometric structure. Geometrically similar or close\nregions can be exploited for bolstering point cloud understanding as they are\nlikely to share semantic information. 
To this end, we introduce the first\ntraining-free aggregation technique that leverages the point cloud's 3D\ngeometric structure to improve the quality of the transferred Vision-Language\nModels. Our approach operates iteratively, performing local-to-global\naggregation based on geometric and semantic point-level reasoning. We benchmark\nour approach on three downstream tasks, including classification, part\nsegmentation, and semantic segmentation, with a variety of datasets\nrepresenting both synthetic/real-world, and indoor/outdoor scenarios. Our\napproach achieves new state-of-the-art results in all benchmarks. Our approach\noperates iteratively, performing local-to-global aggregation based on geometric\nand semantic point-level reasoning. Code and dataset are available at\nhttps://luigiriz.github.io/geoze-website/\n","authors":["Guofeng Mei","Luigi Riz","Yiming Wang","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.02244v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09633v1","updated":"2024-04-15T10:05:36Z","published":"2024-04-15T10:05:36Z","title":"In-Context Translation: Towards Unifying Image Recognition, Processing,\n and Generation","summary":" We propose In-Context Translation (ICT), a general learning framework to\nunify visual recognition (e.g., semantic segmentation), low-level image\nprocessing (e.g., denoising), and conditional image generation (e.g.,\nedge-to-image synthesis). Thanks to unification, ICT significantly reduces the\ninherent inductive bias that comes with designing models for specific tasks,\nand it maximizes mutual enhancement across similar tasks. However, the\nunification across a large number of tasks is non-trivial due to various data\nformats and training pipelines. To this end, ICT introduces two designs.\nFirstly, it standardizes input-output data of different tasks into RGB image\npairs, e.g., semantic segmentation data pairs an RGB image with its\nsegmentation mask in the same RGB format. This turns different tasks into a\ngeneral translation task between two RGB images. Secondly, it standardizes the\ntraining of different tasks into a general in-context learning, where\n\"in-context\" means the input comprises an example input-output pair of the\ntarget task and a query image. The learning objective is to generate the\n\"missing\" data paired with the query. The implicit translation process is thus\nbetween the query and the generated image. In experiments, ICT unifies ten\nvision tasks and showcases impressive performance on their respective\nbenchmarks. Notably, compared to its competitors, e.g., Painter and\nPromptDiffusion, ICT trained on only 4 RTX 3090 GPUs is shown to be more\nefficient and less costly in training.\n","authors":["Han Xue","Qianru Sun","Li Song","Wenjun Zhang","Zhiwu Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09632v1","updated":"2024-04-15T10:04:15Z","published":"2024-04-15T10:04:15Z","title":"Bridging Vision and Language Spaces with Assignment Prediction","summary":" This paper introduces VLAP, a novel approach that bridges pretrained vision\nmodels and large language models (LLMs) to make frozen LLMs understand the\nvisual world. VLAP transforms the embedding space of pretrained vision models\ninto the LLMs' word embedding space using a single linear layer for efficient\nand general-purpose visual and language understanding. Specifically, we harness\nwell-established word embeddings to bridge two modality embedding spaces. 
The\nvisual and text representations are simultaneously assigned to a set of word\nembeddings within pretrained LLMs by formulating the assigning procedure as an\noptimal transport problem. We predict the assignment of one modality from the\nrepresentation of another modality data, enforcing consistent assignments for\npaired multimodal data. This allows vision and language representations to\ncontain the same information, grounding the frozen LLMs' word embedding space\nin visual data. Moreover, a robust semantic taxonomy of LLMs can be preserved\nwith visual data since the LLMs interpret and reason linguistic information\nfrom correlations between word embeddings. Experimental results show that VLAP\nachieves substantial improvements over the previous linear transformation-based\napproaches across a range of vision-language tasks, including image captioning,\nvisual question answering, and cross-modal retrieval. We also demonstrate the\nlearned visual representations hold a semantic taxonomy of LLMs, making visual\nsemantic arithmetic possible.\n","authors":["Jungin Park","Jiyoung Lee","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2404.09632v1.pdf","comment":"ICLR 2024 Camera-ready"},{"id":"http://arxiv.org/abs/2404.09624v1","updated":"2024-04-15T09:56:20Z","published":"2024-04-15T09:56:20Z","title":"AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics\n Perception","summary":" The highly abstract nature of image aesthetics perception (IAP) poses\nsignificant challenge for current multimodal large language models (MLLMs). The\nlack of human-annotated multi-modality aesthetic data further exacerbates this\ndilemma, resulting in MLLMs falling short of aesthetics perception\ncapabilities. To address the above challenge, we first introduce a\ncomprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)\ndataset, which serves as the footstone for building multi-modality aesthetics\nfoundation models. Specifically, to align MLLMs with human aesthetics\nperception, we construct a corpus-rich aesthetic critique database with 21,904\ndiverse-sourced images and 88K human natural language feedbacks, which are\ncollected via progressive questions, ranging from coarse-grained aesthetic\ngrades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle\ndiverse queries, we further prompt GPT to refine the aesthetic critiques and\nassemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT,\nwhich consists of 409K multi-typed instructions to activate stronger aesthetic\ncapabilities. Based on the AesMMIT database, we fine-tune the open-sourced\ngeneral foundation models, achieving multi-modality Aesthetic Expert models,\ndubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert\nmodels deliver significantly better aesthetic perception performances than the\nstate-of-the-art MLLMs, including the most advanced GPT-4V and\nGemini-Pro-Vision. 
Source data will be available at\nhttps://github.com/yipoh/AesExpert.\n","authors":["Yipo Huang","Xiangfei Sheng","Zhichao Yang","Quan Yuan","Zhichao Duan","Pengfei Chen","Leida Li","Weisi Lin","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03778v3","updated":"2024-04-15T09:55:50Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08339v2","updated":"2024-04-15T09:51:15Z","published":"2023-10-12T13:57:32Z","title":"TTK is Getting MPI-Ready","summary":" This system paper documents the technical foundations for the extension of\nthe Topology ToolKit (TTK) to distributed-memory parallelism with the Message\nPassing Interface (MPI). While several recent papers introduced topology-based\napproaches for distributed-memory environments, these were reporting\nexperiments obtained with tailored, mono-algorithm implementations. In\ncontrast, we describe in this paper a versatile approach (supporting both\ntriangulated domains and regular grids) for the support of topological analysis\npipelines, i.e. a sequence of topological algorithms interacting together.\nWhile developing this extension, we faced several algorithmic and software\nengineering challenges, which we document in this paper. We describe an MPI\nextension of TTK's data structure for triangulation representation and\ntraversal, a central component to the global performance and generality of\nTTK's topological implementations. We also introduce an intermediate interface\nbetween TTK and MPI, both at the global pipeline level, and at the fine-grain\nalgorithmic level. 
We provide a taxonomy for the distributed-memory topological\nalgorithms supported by TTK, depending on their communication needs and provide\nexamples of hybrid MPI+thread parallelizations. Performance analyses show that\nparallel efficiencies range from 20% to 80% (depending on the algorithms), and\nthat the MPI-specific preconditioning introduced by our framework induces a\nnegligible computation time overhead. We illustrate the new distributed-memory\ncapabilities of TTK with an example of advanced analysis pipeline, combining\nmultiple algorithms, run on the largest publicly available dataset we have\nfound (120 billion vertices) on a cluster with 64 nodes (for a total of 1536\ncores). Finally, we provide a roadmap for the completion of TTK's MPI\nextension, along with generic recommendations for each algorithm communication\ncategory.\n","authors":["Eve Le Guillou","Michael Will","Pierre Guillou","Jonas Lukasczyk","Pierre Fortin","Christoph Garth","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2310.08339v2.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.09619v1","updated":"2024-04-15T09:47:48Z","published":"2024-04-15T09:47:48Z","title":"UNIAA: A Unified Multi-modal Image Aesthetic Assessment Baseline and\n Benchmark","summary":" As an alternative to expensive expert evaluation, Image Aesthetic Assessment\n(IAA) stands out as a crucial task in computer vision. However, traditional IAA\nmethods are typically constrained to a single data source or task, restricting\nthe universality and broader application. In this work, to better align with\nhuman aesthetics, we propose a Unified Multi-modal Image Aesthetic Assessment\n(UNIAA) framework, including a Multi-modal Large Language Model (MLLM) named\nUNIAA-LLaVA and a comprehensive benchmark named UNIAA-Bench. We choose MLLMs\nwith both visual perception and language ability for IAA and establish a\nlow-cost paradigm for transforming the existing datasets into unified and\nhigh-quality visual instruction tuning data, from which the UNIAA-LLaVA is\ntrained. To further evaluate the IAA capability of MLLMs, we construct the\nUNIAA-Bench, which consists of three aesthetic levels: Perception, Description,\nand Assessment. Extensive experiments validate the effectiveness and\nrationality of UNIAA. UNIAA-LLaVA achieves competitive performance on all\nlevels of UNIAA-Bench, compared with existing MLLMs. Specifically, our model\nperforms better than GPT-4V in aesthetic perception and even approaches the\njunior-level human. We find MLLMs have great potential in IAA, yet there\nremains plenty of room for further improvement. The UNIAA-LLaVA and UNIAA-Bench\nwill be released.\n","authors":["Zhaokun Zhou","Qiulin Wang","Bin Lin","Yiwei Su","Rui Chen","Xin Tao","Amin Zheng","Li Yuan","Pengfei Wan","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09616v1","updated":"2024-04-15T09:40:44Z","published":"2024-04-15T09:40:44Z","title":"A Review and Efficient Implementation of Scene Graph Generation Metrics","summary":" Scene graph generation has emerged as a prominent research field in computer\nvision, witnessing significant advancements in the recent years. However,\ndespite these strides, precise and thorough definitions for the metrics used to\nevaluate scene graph generation models are lacking. In this paper, we address\nthis gap in the literature by providing a review and precise definition of\ncommonly used metrics in scene graph generation. 
Our comprehensive examination\nclarifies the underlying principles of these metrics and can serve as a\nreference or introduction to scene graph metrics.\n Furthermore, to facilitate the usage of these metrics, we introduce a\nstandalone Python package called SGBench that efficiently implements all\ndefined metrics, ensuring their accessibility to the research community.\nAdditionally, we present a scene graph benchmarking web service, that enables\nresearchers to compare scene graph generation methods and increase visibility\nof new methods in a central place.\n All of our code can be found at https://lorjul.github.io/sgbench/.\n","authors":["Julian Lorenz","Robin Schön","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2404.09616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11369v2","updated":"2024-04-15T09:33:21Z","published":"2023-06-20T08:19:51Z","title":"CrossKD: Cross-Head Knowledge Distillation for Object Detection","summary":" Knowledge Distillation (KD) has been validated as an effective model\ncompression technique for learning compact object detectors. Existing\nstate-of-the-art KD methods for object detection are mostly based on feature\nimitation. In this paper, we present a general and effective prediction\nmimicking distillation scheme, called CrossKD, which delivers the intermediate\nfeatures of the student's detection head to the teacher's detection head. The\nresulting cross-head predictions are then forced to mimic the teacher's\npredictions. This manner relieves the student's head from receiving\ncontradictory supervision signals from the annotations and the teacher's\npredictions, greatly improving the student's detection performance. Moreover,\nas mimicking the teacher's predictions is the target of KD, CrossKD offers more\ntask-oriented information in contrast with feature imitation. On MS COCO, with\nonly prediction mimicking losses applied, our CrossKD boosts the average\nprecision of GFL ResNet-50 with 1x training schedule from 40.2 to 43.7,\noutperforming all existing KD methods. In addition, our method also works well\nwhen distilling detectors with heterogeneous backbones. Code is available at\nhttps://github.com/jbwang1997/CrossKD.\n","authors":["Jiabao Wang","Yuming Chen","Zhaohui Zheng","Xiang Li","Ming-Ming Cheng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2306.11369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17648v3","updated":"2024-04-15T09:31:17Z","published":"2023-05-28T06:44:33Z","title":"Z-GMOT: Zero-shot Generic Multiple Object Tracking","summary":" Despite recent significant progress, Multi-Object Tracking (MOT) faces\nlimitations such as reliance on prior knowledge and predefined categories and\nstruggles with unseen objects. To address these issues, Generic Multiple Object\nTracking (GMOT) has emerged as an alternative approach, requiring less prior\ninformation. However, current GMOT methods often rely on initial bounding boxes\nand struggle to handle variations in factors such as viewpoint, lighting,\nocclusion, and scale, among others. Our contributions commence with the\nintroduction of the \\textit{Referring GMOT dataset} a collection of videos,\neach accompanied by detailed textual descriptions of their attributes.\nSubsequently, we propose $\\mathtt{Z-GMOT}$, a cutting-edge tracking solution\ncapable of tracking objects from \\textit{never-seen categories} without the\nneed of initial bounding boxes or predefined categories. 
Within our\n$\\mathtt{Z-GMOT}$ framework, we introduce two novel components: (i)\n$\\mathtt{iGLIP}$, an improved Grounded language-image pretraining, for\naccurately detecting unseen objects with specific characteristics. (ii)\n$\\mathtt{MA-SORT}$, a novel object association approach that adeptly integrates\nmotion and appearance-based matching strategies to tackle the complex task of\ntracking objects with high similarity. Our contributions are benchmarked\nthrough extensive experiments conducted on the Referring GMOT dataset for GMOT\ntask. Additionally, to assess the generalizability of the proposed\n$\\mathtt{Z-GMOT}$, we conduct ablation studies on the DanceTrack and MOT20\ndatasets for the MOT task. Our dataset, code, and models are released at:\nhttps://fsoft-aic.github.io/Z-GMOT.\n","authors":["Kim Hoang Tran","Anh Duy Le Dinh","Tien Phat Nguyen","Thinh Phan","Pha Nguyen","Khoa Luu","Donald Adjeroh","Gianfranco Doretto","Ngan Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2305.17648v3.pdf","comment":"Accepted to NAACL 2024"},{"id":"http://arxiv.org/abs/2307.03992v4","updated":"2024-04-15T09:19:01Z","published":"2023-07-08T14:59:41Z","title":"Stimulating the Diffusion Model for Image Denoising via Adaptive\n Embedding and Ensembling","summary":" Image denoising is a fundamental problem in computational photography, where\nachieving high perception with low distortion is highly demanding. Current\nmethods either struggle with perceptual quality or suffer from significant\ndistortion. Recently, the emerging diffusion model has achieved\nstate-of-the-art performance in various tasks and demonstrates great potential\nfor image denoising. However, stimulating diffusion models for image denoising\nis not straightforward and requires solving several critical problems. For one\nthing, the input inconsistency hinders the connection between diffusion models\nand image denoising. For another, the content inconsistency between the\ngenerated image and the desired denoised image introduces distortion. To tackle\nthese problems, we present a novel strategy called the Diffusion Model for\nImage Denoising (DMID) by understanding and rethinking the diffusion model from\na denoising perspective. Our DMID strategy includes an adaptive embedding\nmethod that embeds the noisy image into a pre-trained unconditional diffusion\nmodel and an adaptive ensembling method that reduces distortion in the denoised\nimage. Our DMID strategy achieves state-of-the-art performance on both\ndistortion-based and perception-based metrics, for both Gaussian and real-world\nimage denoising.The code is available at https://github.com/Li-Tong-621/DMID.\n","authors":["Tong Li","Hansen Feng","Lizhi Wang","Zhiwei Xiong","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2307.03992v4.pdf","comment":"18 pages,15 figures"},{"id":"http://arxiv.org/abs/2404.09601v1","updated":"2024-04-15T09:16:49Z","published":"2024-04-15T09:16:49Z","title":"Reactive Model Correction: Mitigating Harm to Task-Relevant Features via\n Conditional Bias Suppression","summary":" Deep Neural Networks are prone to learning and relying on spurious\ncorrelations in the training data, which, for high-risk applications, can have\nfatal consequences. Various approaches to suppress model reliance on harmful\nfeatures have been proposed that can be applied post-hoc without additional\ntraining. Whereas those methods can be applied with efficiency, they also tend\nto harm model performance by globally shifting the distribution of latent\nfeatures. 
To mitigate unintended overcorrection of model behavior, we propose a\nreactive approach conditioned on model-derived knowledge and eXplainable\nArtificial Intelligence (XAI) insights. While the reactive approach can be\napplied to many post-hoc methods, we demonstrate the incorporation of\nreactivity in particular for P-ClArC (Projective Class Artifact Compensation),\nintroducing a new method called R-ClArC (Reactive Class Artifact Compensation).\nThrough rigorous experiments in controlled settings (FunnyBirds) and with a\nreal-world dataset (ISIC2019), we show that introducing reactivity can minimize\nthe detrimental effect of the applied correction while simultaneously ensuring\nlow reliance on spurious features.\n","authors":["Dilyara Bareeva","Maximilian Dreyer","Frederik Pahde","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.09601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11821v3","updated":"2024-04-15T09:10:56Z","published":"2024-03-18T14:24:20Z","title":"Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality\n Metrics","summary":" Recent advances in text-to-image synthesis enabled through a combination of\nlanguage and vision foundation models have led to a proliferation of the tools\navailable and an increased attention to the field. When conducting\ntext-to-image synthesis, a central goal is to ensure that the content between\ntext and image is aligned. As such, there exist numerous evaluation metrics\nthat aim to mimic human judgement. However, it is often unclear which metric to\nuse for evaluating text-to-image synthesis systems as their evaluation is\nhighly nuanced. In this work, we provide a comprehensive overview of existing\ntext-to-image evaluation metrics. Based on our findings, we propose a new\ntaxonomy for categorizing these metrics. Our taxonomy is grounded in the\nassumption that there are two main quality criteria, namely compositionality\nand generality, which ideally map to human preferences. Ultimately, we derive\nguidelines for practitioners conducting text-to-image evaluation, discuss open\nchallenges of evaluation mechanisms, and surface limitations of current\nmetrics.\n","authors":["Sebastian Hartwig","Dominik Engel","Leon Sick","Hannah Kniesel","Tristan Payer","Poonam Poonam","Michael Glöckler","Alex Bäuerle","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2403.11821v3.pdf","comment":"preprint, 20 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.09591v1","updated":"2024-04-15T09:01:47Z","published":"2024-04-15T09:01:47Z","title":"3D Gaussian Splatting as Markov Chain Monte Carlo","summary":" While 3D Gaussian Splatting has recently become popular for neural rendering,\ncurrent methods rely on carefully engineered cloning and splitting strategies\nfor placing Gaussians, which does not always generalize and may lead to\npoor-quality renderings. In addition, for real-world scenes, they rely on a\ngood initial point cloud to perform well. In this work, we rethink 3D Gaussians\nas random samples drawn from an underlying probability distribution describing\nthe physical representation of the scene -- in other words, Markov Chain Monte\nCarlo (MCMC) samples. Under this view, we show that the 3D Gaussian updates are\nstrikingly similar to a Stochastic Langevin Gradient Descent (SGLD) update. As\nwith MCMC, samples are nothing but past visit locations, adding new Gaussians\nunder our framework can simply be realized without heuristics as placing\nGaussians at existing Gaussian locations. 
To encourage using fewer Gaussians\nfor efficiency, we introduce an L1-regularizer on the Gaussians. On various\nstandard evaluation scenes, we show that our method provides improved rendering\nquality, easy control over the number of Gaussians, and robustness to\ninitialization.\n","authors":["Shakiba Kheradmand","Daniel Rebain","Gopal Sharma","Weiwei Sun","Jeff Tseng","Hossam Isack","Abhishek Kar","Andrea Tagliasacchi","Kwang Moo Yi"],"pdf_url":"https://arxiv.org/pdf/2404.09591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09586v1","updated":"2024-04-15T08:54:33Z","published":"2024-04-15T08:54:33Z","title":"Mitigating the Curse of Dimensionality for Certified Robustness via Dual\n Randomized Smoothing","summary":" Randomized Smoothing (RS) has been proven a promising method for endowing an\narbitrary image classifier with certified robustness. However, the substantial\nuncertainty inherent in the high-dimensional isotropic Gaussian noise imposes\nthe curse of dimensionality on RS. Specifically, the upper bound of ${\\ell_2}$\ncertified robustness radius provided by RS exhibits a diminishing trend with\nthe expansion of the input dimension $d$, proportionally decreasing at a rate\nof $1/\\sqrt{d}$. This paper explores the feasibility of providing ${\\ell_2}$\ncertified robustness for high-dimensional input through the utilization of dual\nsmoothing in the lower-dimensional space. The proposed Dual Randomized\nSmoothing (DRS) down-samples the input image into two sub-images and smooths\nthe two sub-images in lower dimensions. Theoretically, we prove that DRS\nguarantees a tight ${\\ell_2}$ certified robustness radius for the original\ninput and reveal that DRS attains a superior upper bound on the ${\\ell_2}$\nrobustness radius, which decreases proportionally at a rate of $(1/\\sqrt m +\n1/\\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the\ngeneralizability and effectiveness of DRS, which exhibits a notable capability\nto integrate with established methodologies, yielding substantial improvements\nin both accuracy and ${\\ell_2}$ certified robustness baselines of RS on the\nCIFAR-10 and ImageNet datasets. Code is available at\nhttps://github.com/xiasong0501/DRS.\n","authors":["Song Xia","Yu Yi","Xudong Jiang","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.09586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09585v1","updated":"2024-04-15T08:52:51Z","published":"2024-04-15T08:52:51Z","title":"Pseudo-label Learning with Calibrated Confidence Using an Energy-based\n Model","summary":" In pseudo-labeling (PL), which is a type of semi-supervised learning,\npseudo-labels are assigned based on the confidence scores provided by the\nclassifier; therefore, accurate confidence is important for successful PL. In\nthis study, we propose a PL algorithm based on an energy-based model (EBM),\nwhich is referred to as the energy-based PL (EBPL). In EBPL, a neural\nnetwork-based classifier and an EBM are jointly trained by sharing their\nfeature extraction parts. This approach enables the model to learn both the\nclass decision boundary and input data distribution, enhancing confidence\ncalibration during network training. 
The experimental results demonstrate that\nEBPL outperforms the existing PL method in semi-supervised image classification\ntasks, with superior confidence calibration error and recognition accuracy.\n","authors":["Masahito Toba","Seiichi Uchida","Hideaki Hayashi"],"pdf_url":"https://arxiv.org/pdf/2404.09585v1.pdf","comment":"8 pages, 8 figures, Accepted at IJCNN 2024"},{"id":"http://arxiv.org/abs/2311.17955v2","updated":"2024-04-15T08:43:58Z","published":"2023-11-29T08:11:20Z","title":"PEAN: A Diffusion-Based Prior-Enhanced Attention Network for Scene Text\n Image Super-Resolution","summary":" Scene text image super-resolution (STISR) aims at simultaneously increasing\nthe resolution and readability of low-resolution scene text images, thus\nboosting the performance of the downstream recognition task. Two factors in\nscene text images, visual structure and semantic information, affect the\nrecognition performance significantly. To mitigate the effects from these\nfactors, this paper proposes a Prior-Enhanced Attention Network (PEAN).\nSpecifically, an attention-based modulation module is leveraged to understand\nscene text images by neatly perceiving the local and global dependence of\nimages, despite the shape of the text. Meanwhile, a diffusion-based module is\ndeveloped to enhance the text prior, hence offering better guidance for the SR\nnetwork to generate SR images with higher semantic accuracy. Additionally, a\nmulti-task learning paradigm is employed to optimize the network, enabling the\nmodel to generate legible SR images. As a result, PEAN establishes new SOTA\nresults on the TextZoom benchmark. Experiments are also conducted to analyze\nthe importance of the enhanced text prior as a means of improving the\nperformance of the SR network. Code will be made available at\nhttps://github.com/jdfxzzy/PEAN.\n","authors":["Zuoyan Zhao","Hui Xue","Pengfei Fang","Shipeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.17955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07662v3","updated":"2024-04-15T08:37:57Z","published":"2023-03-14T07:07:34Z","title":"Do More With What You Have: Transferring Depth-Scale from Labeled to\n Unlabeled Domains","summary":" Transferring the absolute depth prediction capabilities of an estimator to a\nnew domain is a task with significant real-world applications. This task is\nspecifically challenging when images from the new domain are collected without\nground-truth depth measurements, and possibly with sensors of different\nintrinsics. To overcome such limitations, a recent zero-shot solution was\ntrained on an extensive training dataset and encoded the various camera\nintrinsics. Other solutions generated synthetic data with depth labels that\nmatched the intrinsics of the new target data to enable depth-scale transfer\nbetween the domains.\n In this work we present an alternative solution that can utilize any existing\nsynthetic or real dataset, that has a small number of images annotated with\nground truth depth labels. Specifically, we show that self-supervised depth\nestimators result in up-to-scale predictions that are linearly correlated to\ntheir absolute depth values across the domain, a property that we model in this\nwork using a single scalar. In addition, aligning the field-of-view of two\ndatasets prior to training, results in a common linear relationship for both\ndomains. 
We use this observed property to transfer the depth-scale from source\ndatasets that have absolute depth labels to new target datasets that lack these\nmeasurements, enabling absolute depth predictions in the target domain.\n The suggested method was successfully demonstrated on the KITTI, DDAD and\nnuScenes datasets, while using other existing real or synthetic source\ndatasets, that have a different field-of-view, other image style or structural\ncontent, achieving comparable or better accuracy than other existing methods\nthat do not use target ground-truth depths.\n","authors":["Alexandra Dana","Nadav Carmel","Amit Shomer","Ofer Manela","Tomer Peleg"],"pdf_url":"https://arxiv.org/pdf/2303.07662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09571v1","updated":"2024-04-15T08:32:41Z","published":"2024-04-15T08:32:41Z","title":"MTKD: Multi-Teacher Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) has emerged as a promising technique in deep\nlearning, typically employed to enhance a compact student network through\nlearning from their high-performance but more complex teacher variant. When\napplied in the context of image super-resolution, most KD approaches are\nmodified versions of methods developed for other computer vision tasks, which\nare based on training strategies with a single teacher and simple loss\nfunctions. In this paper, we propose a novel Multi-Teacher Knowledge\nDistillation (MTKD) framework specifically for image super-resolution. It\nexploits the advantages of multiple teachers by combining and enhancing the\noutputs of these teacher models, which then guides the learning process of the\ncompact student network. To achieve more effective learning performance, we\nhave also developed a new wavelet-based loss function for MTKD, which can\nbetter optimize the training process by observing differences in both the\nspatial and frequency domains. We fully evaluate the effectiveness of the\nproposed method by comparing it to five commonly used KD methods for image\nsuper-resolution based on three popular network architectures. The results show\nthat the proposed MTKD method achieves evident improvements in super-resolution\nperformance, up to 0.46dB (based on PSNR), over state-of-the-art KD approaches\nacross different network structures. The source code of MTKD will be made\navailable here for public evaluation.\n","authors":["Yuxuan Jiang","Chen Feng","Fan Zhang","David Bull"],"pdf_url":"https://arxiv.org/pdf/2404.09571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09570v1","updated":"2024-04-15T08:32:18Z","published":"2024-04-15T08:32:18Z","title":"The revenge of BiSeNet: Efficient Multi-Task Image Segmentation","summary":" Recent advancements in image segmentation have focused on enhancing the\nefficiency of the models to meet the demands of real-time applications,\nespecially on edge devices. However, existing research has primarily\nconcentrated on single-task settings, especially on semantic segmentation,\nleading to redundant efforts and specialized architectures for different tasks.\nTo address this limitation, we propose a novel architecture for efficient\nmulti-task image segmentation, capable of handling various segmentation tasks\nwithout sacrificing efficiency or accuracy. We introduce BiSeNetFormer, that\nleverages the efficiency of two-stream semantic segmentation architectures and\nit extends them into a mask classification framework. 
Our approach maintains\nthe efficient spatial and context paths to capture detailed and semantic\ninformation, respectively, while leveraging an efficient transformed-based\nsegmentation head that computes the binary masks and class probabilities. By\nseamlessly supporting multiple tasks, namely semantic and panoptic\nsegmentation, BiSeNetFormer offers a versatile solution for multi-task\nsegmentation. We evaluate our approach on popular datasets, Cityscapes and\nADE20K, demonstrating impressive inference speeds while maintaining competitive\naccuracy compared to state-of-the-art architectures. Our results indicate that\nBiSeNetFormer represents a significant advancement towards fast, efficient, and\nmulti-task segmentation networks, bridging the gap between model efficiency and\ntask adaptability.\n","authors":["Gabriele Rosi","Claudia Cuttano","Niccolò Cavagnero","Giuseppe Averta","Fabio Cermelli"],"pdf_url":"https://arxiv.org/pdf/2404.09570v1.pdf","comment":"Accepted to ECV workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2212.14855v3","updated":"2024-04-15T08:24:42Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v3.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2303.14017v3","updated":"2024-04-15T08:22:49Z","published":"2023-03-24T14:18:40Z","title":"CF-Font: Content Fusion for Few-shot Font Generation","summary":" Content and style disentanglement is an effective way to achieve few-shot\nfont generation. It allows to transfer the style of the font image in a source\ndomain to the style defined with a few reference images in a target domain.\nHowever, the content feature extracted using a representative font might not be\noptimal. 
In light of this, we propose a content fusion module (CFM) to project\nthe content feature into a linear space defined by the content features of\nbasis fonts, which can take the variation of content features caused by\ndifferent fonts into consideration. Our method also allows to optimize the\nstyle representation vector of reference images through a lightweight iterative\nstyle-vector refinement (ISR) strategy. Moreover, we treat the 1D projection of\na character image as a probability distribution and leverage the distance\nbetween two distributions as the reconstruction loss (namely projected\ncharacter loss, PCL). Compared to L2 or L1 reconstruction loss, the\ndistribution distance pays more attention to the global shape of characters. We\nhave evaluated our method on a dataset of 300 fonts with 6.5k characters each.\nExperimental results verify that our method outperforms existing\nstate-of-the-art few-shot font generation methods by a large margin. The source\ncode can be found at https://github.com/wangchi95/CF-Font.\n","authors":["Chi Wang","Min Zhou","Tiezheng Ge","Yuning Jiang","Hujun Bao","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2303.14017v3.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2404.09556v1","updated":"2024-04-15T08:19:08Z","published":"2024-04-15T08:19:08Z","title":"nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image\n Segmentation","summary":" The release of nnU-Net marked a paradigm shift in 3D medical image\nsegmentation, demonstrating that a properly configured U-Net architecture could\nstill achieve state-of-the-art results. Despite this, the pursuit of novel\narchitectures, and the respective claims of superior performance over the U-Net\nbaseline, continued. In this study, we demonstrate that many of these recent\nclaims fail to hold up when scrutinized for common validation shortcomings,\nsuch as the use of inadequate baselines, insufficient datasets, and neglected\ncomputational resources. By meticulously avoiding these pitfalls, we conduct a\nthorough and comprehensive benchmarking of current segmentation methods\nincluding CNN-based, Transformer-based, and Mamba-based approaches. In contrast\nto current beliefs, we find that the recipe for state-of-the-art performance is\n1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2)\nusing the nnU-Net framework, and 3) scaling models to modern hardware\nresources. These results indicate an ongoing innovation bias towards novel\narchitectures in the field and underscore the need for more stringent\nvalidation standards in the quest for scientific progress.\n","authors":["Fabian Isensee","Tassilo Wald","Constantin Ulrich","Michael Baumgartner","Saikat Roy","Klaus Maier-Hein","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2404.09556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09555v1","updated":"2024-04-15T08:18:38Z","published":"2024-04-15T08:18:38Z","title":"AI-KD: Towards Alignment Invariant Face Image Quality Assessment Using\n Knowledge Distillation","summary":" Face Image Quality Assessment (FIQA) techniques have seen steady improvements\nover recent years, but their performance still deteriorates if the input face\nsamples are not properly aligned. This alignment sensitivity comes from the\nfact that most FIQA techniques are trained or designed using a specific face\nalignment procedure. If the alignment technique changes, the performance of\nmost existing FIQA techniques quickly becomes suboptimal. 
To address this\nproblem, we present in this paper a novel knowledge distillation approach,\ntermed AI-KD that can extend on any existing FIQA technique, improving its\nrobustness to alignment variations and, in turn, performance with different\nalignment procedures. To validate the proposed distillation approach, we\nconduct comprehensive experiments on 6 face datasets with 4 recent face\nrecognition models and in comparison to 7 state-of-the-art FIQA techniques. Our\nresults show that AI-KD consistently improves performance of the initial FIQA\ntechniques not only with misaligned samples, but also with properly aligned\nfacial images. Furthermore, it leads to a new state-of-the-art, when used with\na competitive initial FIQA approach. The code for AI-KD is made publicly\navailable from: https://github.com/LSIbabnikz/AI-KD.\n","authors":["Žiga Babnik","Fadi Boutros","Naser Damer","Peter Peer","Vitomir Štruc"],"pdf_url":"https://arxiv.org/pdf/2404.09555v1.pdf","comment":"IEEE International Workshop on Biometrics and Forensics (IWBF) 2024,\n pp. 6"},{"id":"http://arxiv.org/abs/2404.09540v1","updated":"2024-04-15T08:04:44Z","published":"2024-04-15T08:04:44Z","title":"Text-Driven Diverse Facial Texture Generation via Progressive\n Latent-Space Refinement","summary":" Automatic 3D facial texture generation has gained significant interest\nrecently. Existing approaches may not support the traditional physically based\nrendering pipeline or rely on 3D data captured by Light Stage. Our key\ncontribution is a progressive latent space refinement approach that can\nbootstrap from 3D Morphable Models (3DMMs)-based texture maps generated from\nfacial images to generate high-quality and diverse PBR textures, including\nalbedo, normal, and roughness. It starts with enhancing Generative Adversarial\nNetworks (GANs) for text-guided and diverse texture generation. To this end, we\ndesign a self-supervised paradigm to overcome the reliance on ground truth 3D\ntextures and train the generative model with only entangled texture maps.\nBesides, we foster mutual enhancement between GANs and Score Distillation\nSampling (SDS). SDS boosts GANs with more generative modes, while GANs promote\nmore efficient optimization of SDS. Furthermore, we introduce an edge-aware SDS\nfor multi-view consistent facial structure. Experiments demonstrate that our\nmethod outperforms existing 3D texture generation methods regarding\nphoto-realistic quality, diversity, and efficiency.\n","authors":["Chi Wang","Junming Huang","Rong Zhang","Qi Wang","Haotian Yang","Haibin Huang","Chongyang Ma","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05941v2","updated":"2024-04-15T07:59:37Z","published":"2023-12-10T17:07:37Z","title":"ASH: Animatable Gaussian Splats for Efficient and Photoreal Human\n Rendering","summary":" Real-time rendering of photorealistic and controllable human avatars stands\nas a cornerstone in Computer Vision and Graphics. While recent advances in\nneural implicit rendering have unlocked unprecedented photorealism for digital\navatars, real-time performance has mostly been demonstrated for static scenes\nonly. To address this, we propose ASH, an animatable Gaussian splatting\napproach for photorealistic rendering of dynamic humans in real-time. We\nparameterize the clothed human as animatable 3D Gaussians, which can be\nefficiently splatted into image space to generate the final rendering. 
However,\nnaively learning the Gaussian parameters in 3D space poses a severe challenge\nin terms of compute. Instead, we attach the Gaussians onto a deformable\ncharacter model, and learn their parameters in 2D texture space, which allows\nleveraging efficient 2D convolutional architectures that easily scale with the\nrequired number of Gaussians. We benchmark ASH with competing methods on\npose-controllable avatars, demonstrating that our method outperforms existing\nreal-time methods by a large margin and shows comparable or even better results\nthan offline methods.\n","authors":["Haokai Pang","Heming Zhu","Adam Kortylewski","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2312.05941v2.pdf","comment":"For project page, see https://vcai.mpi-inf.mpg.de/projects/ash/"},{"id":"http://arxiv.org/abs/2401.03522v2","updated":"2024-04-15T07:59:03Z","published":"2024-01-07T15:47:19Z","title":"Text-Driven Traffic Anomaly Detection with Temporal High-Frequency\n Modeling in Driving Videos","summary":" Traffic anomaly detection (TAD) in driving videos is critical for ensuring\nthe safety of autonomous driving and advanced driver assistance systems.\nPrevious single-stage TAD methods primarily rely on frame prediction, making\nthem vulnerable to interference from dynamic backgrounds induced by the rapid\nmovement of the dashboard camera. While two-stage TAD methods appear to be a\nnatural solution to mitigate such interference by pre-extracting\nbackground-independent features (such as bounding boxes and optical flow) using\nperceptual algorithms, they are susceptible to the performance of first-stage\nperceptual algorithms and may result in error propagation. In this paper, we\nintroduce TTHF, a novel single-stage method aligning video clips with text\nprompts, offering a new perspective on traffic anomaly detection. Unlike\nprevious approaches, the supervised signal of our method is derived from\nlanguages rather than orthogonal one-hot vectors, providing a more\ncomprehensive representation. Further, concerning visual representation, we\npropose to model the high frequency of driving videos in the temporal domain.\nThis modeling captures the dynamic changes of driving scenes, enhances the\nperception of driving behavior, and significantly improves the detection of\ntraffic anomalies. In addition, to better perceive various types of traffic\nanomalies, we carefully design an attentive anomaly focusing mechanism that\nvisually and linguistically guides the model to adaptively focus on the visual\ncontext of interest, thereby facilitating the detection of traffic anomalies.\nIt is shown that our proposed TTHF achieves promising performance,\noutperforming state-of-the-art competitors by +5.4% AUC on the DoTA dataset and\nachieving high generalization on the DADA dataset.\n","authors":["Rongqin Liang","Yuanman Li","Jiantao Zhou","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2401.03522v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.09533v1","updated":"2024-04-15T07:53:07Z","published":"2024-04-15T07:53:07Z","title":"WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for\n Improved Feature Alignment and Local Information Fusion","summary":" Low-dose computed tomography (LDCT) has become the technology of choice for\ndiagnostic medical imaging, given its lower radiation dose compared to standard\nCT, despite increasing image noise and potentially affecting diagnostic\naccuracy. 
To address this, advanced deep learning-based LDCT denoising\nalgorithms have been developed, primarily using Convolutional Neural Networks\n(CNNs) or Transformer Networks with the Unet architecture. This architecture\nenhances image detail by integrating feature maps from the encoder and decoder\nvia skip connections. However, current methods often overlook enhancements to\nthe Unet architecture itself, focusing instead on optimizing encoder and\ndecoder structures. This approach can be problematic due to the significant\ndifferences in feature map characteristics between the encoder and decoder,\nwhere simple fusion strategies may not effectively reconstruct images.In this\npaper, we introduce WiTUnet, a novel LDCT image denoising method that utilizes\nnested, dense skip pathways instead of traditional skip connections to improve\nfeature integration. WiTUnet also incorporates a windowed Transformer structure\nto process images in smaller, non-overlapping segments, reducing computational\nload. Additionally, the integration of a Local Image Perception Enhancement\n(LiPe) module in both the encoder and decoder replaces the standard multi-layer\nperceptron (MLP) in Transformers, enhancing local feature capture and\nrepresentation. Through extensive experimental comparisons, WiTUnet has\ndemonstrated superior performance over existing methods in key metrics such as\nPeak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean\nSquare Error (RMSE), significantly improving noise removal and image quality.\n","authors":["Bin Wang","Fei Deng","Peifan Jiang","Shuang Wang","Xiao Han","Hongjie Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.09533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09532v1","updated":"2024-04-15T07:51:40Z","published":"2024-04-15T07:51:40Z","title":"TMPQ-DM: Joint Timestep Reduction and Quantization Precision Selection\n for Efficient Diffusion Models","summary":" Diffusion models have emerged as preeminent contenders in the realm of\ngenerative models. Distinguished by their distinctive sequential generative\nprocesses, characterized by hundreds or even thousands of timesteps, diffusion\nmodels progressively reconstruct images from pure Gaussian noise, with each\ntimestep necessitating full inference of the entire model. However, the\nsubstantial computational demands inherent to these models present challenges\nfor deployment, quantization is thus widely used to lower the bit-width for\nreducing the storage and computing overheads. Current quantization\nmethodologies primarily focus on model-side optimization, disregarding the\ntemporal dimension, such as the length of the timestep sequence, thereby\nallowing redundant timesteps to continue consuming computational resources,\nleaving substantial scope for accelerating the generative process. In this\npaper, we introduce TMPQ-DM, which jointly optimizes timestep reduction and\nquantization to achieve a superior performance-efficiency trade-off, addressing\nboth temporal and model optimization aspects. For timestep reduction, we devise\na non-uniform grouping scheme tailored to the non-uniform nature of the\ndenoising process, thereby mitigating the explosive combinations of timesteps.\nIn terms of quantization, we adopt a fine-grained layer-wise approach to\nallocate varying bit-widths to different layers based on their respective\ncontributions to the final generative performance, thus rectifying performance\ndegradation observed in prior studies. 
To expedite the evaluation of\nfine-grained quantization, we further devise a super-network to serve as a\nprecision solver by leveraging shared quantization results. These two design\ncomponents are seamlessly integrated within our framework, enabling rapid joint\nexploration of the exponentially large decision space via a gradient-free\nevolutionary search algorithm.\n","authors":["Haojun Sun","Chen Tang","Zhi Wang","Yuan Meng","Jingyan jiang","Xinzhu Ma","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.09532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09531v1","updated":"2024-04-15T07:51:29Z","published":"2024-04-15T07:51:29Z","title":"Oblique-MERF: Revisiting and Improving MERF for Oblique Photography","summary":" Neural implicit fields have established a new paradigm for scene\nrepresentation, with subsequent work achieving high-quality real-time\nrendering. However, reconstructing 3D scenes from oblique aerial photography\npresents unique challenges, such as varying spatial scale distributions and a\nconstrained range of tilt angles, often resulting in high memory consumption\nand reduced rendering quality at extrapolated viewpoints. In this paper, we\nenhance MERF to accommodate these data characteristics by introducing an\ninnovative adaptive occupancy plane optimized during the volume rendering\nprocess and a smoothness regularization term for view-dependent color to\naddress these issues. Our approach, termed Oblique-MERF, surpasses\nstate-of-the-art real-time methods by approximately 0.7 dB, reduces VRAM usage\nby about 40%, and achieves higher rendering frame rates with more realistic\nrendering outcomes across most viewpoints.\n","authors":["Xiaoyi Zeng","Kaiwen Song","Leyuan Yang","Bailin Deng","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09530v1","updated":"2024-04-15T07:50:15Z","published":"2024-04-15T07:50:15Z","title":"RanLayNet: A Dataset for Document Layout Detection used for Domain\n Adaptation and Generalization","summary":" Large ground-truth datasets and recent advances in deep learning techniques\nhave been useful for layout detection. However, because of the restricted\nlayout diversity of these datasets, training on them requires a sizable number\nof annotated instances, which is both expensive and time-consuming. As a\nresult, differences between the source and target domains may significantly\nimpact how well these models function. To solve this problem, domain adaptation\napproaches have been developed that use a small quantity of labeled data to\nadjust the model to the target domain. In this research, we introduced a\nsynthetic document dataset called RanLayNet, enriched with automatically\nassigned labels denoting spatial positions, ranges, and types of layout\nelements. The primary aim of this endeavor is to develop a versatile dataset\ncapable of training models with robustness and adaptability to diverse document\nformats. Through empirical experimentation, we demonstrate that a deep layout\nidentification model trained on our dataset exhibits enhanced performance\ncompared to a model trained solely on actual documents. Moreover, we conduct a\ncomparative analysis by fine-tuning inference models using both PubLayNet and\nIIIT-AR-13K datasets on the Doclaynet dataset. 
Our findings emphasize that\nmodels enriched with our dataset are optimal for tasks such as achieving 0.398\nand 0.588 mAP95 score in the scientific document domain for the TABLE class.\n","authors":["Avinash Anand","Raj Jaiswal","Mohit Gupta","Siddhesh S Bangar","Pijush Bhuyan","Naman Lal","Rajeev Singh","Ritika Jha","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.09530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09516v1","updated":"2024-04-15T07:24:45Z","published":"2024-04-15T07:24:45Z","title":"State Space Model for New-Generation Network Alternative to\n Transformers: A Survey","summary":" In the post-deep learning era, the Transformer architecture has demonstrated\nits powerful performance across pre-trained big models and various downstream\ntasks. However, the enormous computational demands of this architecture have\ndeterred many researchers. To further reduce the complexity of attention\nmodels, numerous efforts have been made to design more efficient methods. Among\nthem, the State Space Model (SSM), as a possible replacement for the\nself-attention based Transformer model, has drawn more and more attention in\nrecent years. In this paper, we give the first comprehensive review of these\nworks and also provide experimental comparisons and analysis to better\ndemonstrate the features and advantages of SSM. Specifically, we first give a\ndetailed description of principles to help the readers quickly capture the key\nideas of SSM. After that, we dive into the reviews of existing SSMs and their\nvarious applications, including natural language processing, computer vision,\ngraph, multi-modal and multi-media, point cloud/event stream, time series data,\nand other domains. In addition, we give statistical comparisons and analysis of\nthese models and hope it helps the readers to understand the effectiveness of\ndifferent structures on various tasks. Then, we propose possible research\npoints in this direction to better promote the development of the theoretical\nmodel and application of SSM. More related works will be continuously updated\non the following GitHub:\nhttps://github.com/Event-AHU/Mamba_State_Space_Model_Paper_List.\n","authors":["Xiao Wang","Shiao Wang","Yuhe Ding","Yuehang Li","Wentao Wu","Yao Rong","Weizhe Kong","Ju Huang","Shihao Li","Haoxiang Yang","Ziwen Wang","Bo Jiang","Chenglong Li","Yaowei Wang","Yonghong Tian","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2404.09516v1.pdf","comment":"The First review of State Space Model (SSM)/Mamba and their\n applications in artificial intelligence, 33 pages"},{"id":"http://arxiv.org/abs/2404.09515v1","updated":"2024-04-15T07:20:09Z","published":"2024-04-15T07:20:09Z","title":"Deep image learning of quantitative structure-property relationships of\n cooper alloys via feature augmentation on Geodesic curve in shape space","summary":" Understanding how the structure of materials affects their properties is a\ncornerstone of materials science and engineering. However, traditional methods\nhave struggled to accurately describe the quantitative structure-property\nrelationships for complex structures. In our study, we bridge this gap by\nleveraging machine learning to analyze images of materials' microstructures,\nthus offering a novel way to understand and predict the properties of materials\nbased on their microstructures. We introduce a method known as FAGC (Feature\nAugmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr\nalloys. 
This approach utilizes machine learning to examine the shapes within\nimages of the alloys' microstructures and predict their mechanical and\nelectronic properties. This generative FAGC approach can effectively expand the\nrelatively small training datasets due to the limited availability of materials\nimages labeled with quantitative properties. The process begins with extracting\nfeatures from the images using neural networks. These features are then mapped\nonto the Pre-shape space to construct the Geodesic curves. Along these curves,\nnew features are generated, effectively increasing the dataset. Moreover, we\ndesign a pseudo-labeling mechanism for these newly generated features to\nfurther enhance the training dataset. Our FAGC method has shown remarkable\nresults, significantly improving the accuracy of predicting the electronic\nconductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978\nand 0.998, respectively. These outcomes underscore the potential of FAGC to\naddress the challenge of limited image data in materials science, providing a\npowerful tool for establishing detailed and quantitative relationships between\ncomplex microstructures and material properties.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13783v2","updated":"2024-04-15T07:18:45Z","published":"2023-12-21T12:14:31Z","title":"Few Shot Part Segmentation Reveals Compositional Logic for Industrial\n Anomaly Detection","summary":" Logical anomalies (LA) refer to data violating underlying logical constraints\ne.g., the quantity, arrangement, or composition of components within an image.\nDetecting accurately such anomalies requires models to reason about various\ncomponent types through segmentation. However, curation of pixel-level\nannotations for semantic segmentation is both time-consuming and expensive.\nAlthough there are some prior few-shot or unsupervised co-part segmentation\nalgorithms, they often fail on images with industrial object. These images have\ncomponents with similar textures and shapes, and a precise differentiation\nproves challenging. In this study, we introduce a novel component segmentation\nmodel for LA detection that leverages a few labeled samples and unlabeled\nimages sharing logical constraints. To ensure consistent segmentation across\nunlabeled images, we employ a histogram matching loss in conjunction with an\nentropy loss. As segmentation predictions play a crucial role, we propose to\nenhance both local and global sample validity detection by capturing key\naspects from visual semantics via three memory banks: class histograms,\ncomponent composition embeddings and patch-level representations. For effective\nLA detection, we propose an adaptive scaling strategy to standardize anomaly\nscores from different memory banks in inference. Extensive experiments on the\npublic benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA\ndetection vs. 89.6% from competing methods.\n","authors":["Soopil Kim","Sion An","Philip Chikontwe","Myeongkyun Kang","Ehsan Adeli","Kilian M. 
Pohl","Sang Hyun Park"],"pdf_url":"https://arxiv.org/pdf/2312.13783v2.pdf","comment":"Accepted in AAAI2024"},{"id":"http://arxiv.org/abs/2312.01897v2","updated":"2024-04-15T07:15:43Z","published":"2023-12-04T13:51:16Z","title":"Adapting Short-Term Transformers for Action Detection in Untrimmed\n Videos","summary":" Vision Transformer (ViT) has shown high potential in video recognition, owing\nto its flexible design, adaptable self-attention mechanisms, and the efficacy\nof masked pre-training. Yet, it remains unclear how to adapt these pre-trained\nshort-term ViTs for temporal action detection (TAD) in untrimmed videos. The\nexisting works treat them as off-the-shelf feature extractors for each\nshort-trimmed snippet without capturing the fine-grained relation among\ndifferent snippets in a broader temporal context. To mitigate this issue, this\npaper focuses on designing a new mechanism for adapting these pre-trained ViT\nmodels as a unified long-form video transformer to fully unleash its modeling\npower in capturing inter-snippet relation, while still keeping low computation\noverhead and memory consumption for efficient TAD. To this end, we design\neffective cross-snippet propagation modules to gradually exchange short-term\nvideo information among different snippets from two levels. For inner-backbone\ninformation propagation, we introduce a cross-snippet propagation strategy to\nenable multi-snippet temporal feature interaction inside the backbone.For\npost-backbone information propagation, we propose temporal transformer layers\nfor further clip-level modeling. With the plain ViT-B pre-trained with\nVideoMAE, our end-to-end temporal action detector (ViT-TAD) yields a very\ncompetitive performance to previous temporal action detectors, riching up to\n69.5 average mAP on THUMOS14, 37.40 average mAP on ActivityNet-1.3 and 17.20\naverage mAP on FineAction.\n","authors":["Min Yang","Huan Gao","Ping Guo","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01897v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.09512v1","updated":"2024-04-15T07:15:39Z","published":"2024-04-15T07:15:39Z","title":"Magic Clothing: Controllable Garment-Driven Image Synthesis","summary":" We propose Magic Clothing, a latent diffusion model (LDM)-based network\narchitecture for an unexplored garment-driven image synthesis task. Aiming at\ngenerating customized characters wearing the target garments with diverse text\nprompts, the image controllability is the most critical issue, i.e., to\npreserve the garment details and maintain faithfulness to the text prompts. To\nthis end, we introduce a garment extractor to capture the detailed garment\nfeatures, and employ self-attention fusion to incorporate them into the\npretrained LDMs, ensuring that the garment details remain unchanged on the\ntarget character. Then, we leverage the joint classifier-free guidance to\nbalance the control of garment features and text prompts over the generated\nresults. Meanwhile, the proposed garment extractor is a plug-in module\napplicable to various finetuned LDMs, and it can be combined with other\nextensions like ControlNet and IP-Adapter to enhance the diversity and\ncontrollability of the generated characters. Furthermore, we design\nMatched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency\nof the target image to the source garment. 
Extensive experiments demonstrate\nthat our Magic Clothing achieves state-of-the-art results under various\nconditional controls for garment-driven image synthesis. Our source code is\navailable at https://github.com/ShineChen1024/MagicClothing.\n","authors":["Weifeng Chen","Tao Gu","Yuhao Xu","Chengcai Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01238v2","updated":"2024-04-15T07:12:20Z","published":"2024-03-02T15:47:42Z","title":"On the Road to Portability: Compressing End-to-End Motion Planner for\n Autonomous Driving","summary":" End-to-end motion planning models equipped with deep neural networks have\nshown great potential for enabling full autonomous driving. However, the\noversized neural networks render them impractical for deployment on\nresource-constrained systems, which unavoidably requires more computational\ntime and resources during reference.To handle this, knowledge distillation\noffers a promising approach that compresses models by enabling a smaller\nstudent model to learn from a larger teacher model. Nevertheless, how to apply\nknowledge distillation to compress motion planners has not been explored so\nfar. In this paper, we propose PlanKD, the first knowledge distillation\nframework tailored for compressing end-to-end motion planners. First,\nconsidering that driving scenes are inherently complex, often containing\nplanning-irrelevant or even noisy information, transferring such information is\nnot beneficial for the student planner. Thus, we design an information\nbottleneck based strategy to only distill planning-relevant information, rather\nthan transfer all information indiscriminately. Second, different waypoints in\nan output planned trajectory may hold varying degrees of importance for motion\nplanning, where a slight deviation in certain crucial waypoints might lead to a\ncollision. Therefore, we devise a safety-aware waypoint-attentive distillation\nmodule that assigns adaptive weights to different waypoints based on the\nimportance, to encourage the student to accurately mimic more crucial\nwaypoints, thereby improving overall safety. Experiments demonstrate that our\nPlanKD can boost the performance of smaller planners by a large margin, and\nsignificantly reduce their reference time.\n","authors":["Kaituo Feng","Changsheng Li","Dongchun Ren","Ye Yuan","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2403.01238v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.00015v2","updated":"2024-04-15T07:06:54Z","published":"2023-12-28T14:14:31Z","title":"Maintaining User Trust Through Multistage Uncertainty Aware Inference","summary":" This paper describes and evaluates a multistage approach to AI deployment.\nEach stage involves a more accurate method of inference, yet engaging each\ncomes with an increasing cost. In outlining the architecture, we present a\nmethod for quantifying model uncertainty that facilitates confident deferral\ndecisions. The architecture is currently under active deployment to thousands\nof cotton farmers across India. 
The broader idea however is applicable to a\ngrowing sector of AI deployments in challenging low resources settings.\n","authors":["Chandan Agrawal","Ashish Papanai","Jerome White"],"pdf_url":"https://arxiv.org/pdf/2402.00015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09509v1","updated":"2024-04-15T07:05:14Z","published":"2024-04-15T07:05:14Z","title":"Fuse after Align: Improving Face-Voice Association Learning via\n Multimodal Encoder","summary":" Today, there have been many achievements in learning the association between\nvoice and face. However, most previous work models rely on cosine similarity or\nL2 distance to evaluate the likeness of voices and faces following contrastive\nlearning, subsequently applied to retrieval and matching tasks. This method\nonly considers the embeddings as high-dimensional vectors, utilizing a minimal\nscope of available information. This paper introduces a novel framework within\nan unsupervised setting for learning voice-face associations. By employing a\nmultimodal encoder after contrastive learning and addressing the problem\nthrough binary classification, we can learn the implicit information within the\nembeddings in a more effective and varied manner. Furthermore, by introducing\nan effective pair selection method, we enhance the learning outcomes of both\ncontrastive learning and the matching task. Empirical evidence demonstrates\nthat our framework achieves state-of-the-art results in voice-face matching,\nverification, and retrieval tasks, improving verification by approximately 3%,\nmatching by about 2.5%, and retrieval by around 1.3%.\n","authors":["Chong Peng","Liqiang He","Dan Su"],"pdf_url":"https://arxiv.org/pdf/2404.09509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09507v1","updated":"2024-04-15T06:58:09Z","published":"2024-04-15T06:58:09Z","title":"Clothes-Changing Person Re-Identification with Feasibility-Aware\n Intermediary Matching","summary":" Current clothes-changing person re-identification (re-id) approaches usually\nperform retrieval based on clothes-irrelevant features, while neglecting the\npotential of clothes-relevant features. However, we observe that relying solely\non clothes-irrelevant features for clothes-changing re-id is limited, since\nthey often lack adequate identity information and suffer from large intra-class\nvariations. On the contrary, clothes-relevant features can be used to discover\nsame-clothes intermediaries that possess informative identity clues. Based on\nthis observation, we propose a Feasibility-Aware Intermediary Matching (FAIM)\nframework to additionally utilize clothes-relevant features for retrieval.\nFirstly, an Intermediary Matching (IM) module is designed to perform an\nintermediary-assisted matching process. This process involves using\nclothes-relevant features to find informative intermediates, and then using\nclothes-irrelevant features of these intermediates to complete the matching.\nSecondly, in order to reduce the negative effect of low-quality intermediaries,\nan Intermediary-Based Feasibility Weighting (IBFW) module is designed to\nevaluate the feasibility of intermediary matching process by assessing the\nquality of intermediaries. 
Extensive experiments demonstrate that our method\noutperforms state-of-the-art methods on several widely-used clothes-changing\nre-id benchmarks.\n","authors":["Jiahe Zhao","Ruibing Hou","Hong Chang","Xinqian Gu","Bingpeng Ma","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09504v1","updated":"2024-04-15T06:50:58Z","published":"2024-04-15T06:50:58Z","title":"Learning Tracking Representations from Single Point Annotations","summary":" Existing deep trackers are typically trained with largescale video frames\nwith annotated bounding boxes. However, these bounding boxes are expensive and\ntime-consuming to annotate, in particular for large scale datasets. In this\npaper, we propose to learn tracking representations from single point\nannotations (i.e., 4.5x faster to annotate than the traditional bounding box)\nin a weakly supervised manner. Specifically, we propose a soft contrastive\nlearning (SoCL) framework that incorporates target objectness prior into\nend-to-end contrastive learning. Our SoCL consists of adaptive positive and\nnegative sample generation, which is memory-efficient and effective for\nlearning tracking representations. We apply the learned representation of SoCL\nto visual tracking and show that our method can 1) achieve better performance\nthan the fully supervised baseline trained with box annotations under the same\nannotation time cost; 2) achieve comparable performance of the fully supervised\nbaseline by using the same number of training frames and meanwhile reducing\nannotation time cost by 78% and total fees by 85%; 3) be robust to annotation\nnoise.\n","authors":["Qiangqiang Wu","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2404.09504v1.pdf","comment":"Accept to CVPR2024-L3DIVU"},{"id":"http://arxiv.org/abs/2403.13392v2","updated":"2024-04-15T06:46:04Z","published":"2024-03-20T08:33:40Z","title":"Robust image segmentation model based on binary level set","summary":" In order to improve the robustness of traditional image segmentation models\nto noise, this paper models the illumination term in intensity inhomogeneity\nimages. Additionally, to enhance the model's robustness to noisy images, we\nincorporate the binary level set model into the proposed model. Compared to the\ntraditional level set, the binary level set eliminates the need for continuous\nreinitialization. Moreover, by introducing the variational operator GL, our\nmodel demonstrates better capability in segmenting noisy images. Finally, we\nemploy the three-step splitting operator method for solving, and the\neffectiveness of the proposed model is demonstrated on various images.\n","authors":["Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.13392v2.pdf","comment":"SCI"},{"id":"http://arxiv.org/abs/2404.09502v1","updated":"2024-04-15T06:45:06Z","published":"2024-04-15T06:45:06Z","title":"SparseOcc: Rethinking Sparse Latent Representation for Vision-Based\n Semantic Occupancy Prediction","summary":" Vision-based perception for autonomous driving requires an explicit modeling\nof a 3D space, where 2D latent representations are mapped and subsequent 3D\noperators are applied. However, operating on dense latent spaces introduces a\ncubic time and space complexity, which limits scalability in terms of\nperception range or spatial resolution. Existing approaches compress the dense\nrepresentation using projections like Bird's Eye View (BEV) or Tri-Perspective\nView (TPV). 
Although efficient, these projections result in information loss,\nespecially for tasks like semantic occupancy prediction. To address this, we\npropose SparseOcc, an efficient occupancy network inspired by sparse point\ncloud processing. It utilizes a lossless sparse latent representation with\nthree key innovations. Firstly, a 3D sparse diffuser performs latent completion\nusing spatially decomposed 3D sparse convolutional kernels. Secondly, a feature\npyramid and sparse interpolation enhance scales with information from others.\nFinally, the transformer head is redesigned as a sparse variant. SparseOcc\nachieves a remarkable 74.9% reduction on FLOPs over the dense baseline.\nInterestingly, it also improves accuracy, from 12.8% to 14.1% mIOU, which in\npart can be attributed to the sparse representation's ability to avoid\nhallucinations on empty voxels.\n","authors":["Pin Tang","Zhongdao Wang","Guoqing Wang","Jilai Zheng","Xiangxuan Ren","Bailan Feng","Chao Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09502v1.pdf","comment":"10 pages, 4 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09499v1","updated":"2024-04-15T06:38:09Z","published":"2024-04-15T06:38:09Z","title":"Learning Human Motion from Monocular Videos via Cross-Modal Manifold\n Alignment","summary":" Learning 3D human motion from 2D inputs is a fundamental task in the realms\nof computer vision and computer graphics. Many previous methods grapple with\nthis inherently ambiguous task by introducing motion priors into the learning\nprocess. However, these approaches face difficulties in defining the complete\nconfigurations of such priors or training a robust model. In this paper, we\npresent the Video-to-Motion Generator (VTM), which leverages motion priors\nthrough cross-modal latent feature space alignment between 3D human motion and\n2D inputs, namely videos and 2D keypoints. To reduce the complexity of modeling\nmotion priors, we model the motion data separately for the upper and lower body\nparts. Additionally, we align the motion data with a scale-invariant virtual\nskeleton to mitigate the interference of human skeleton variations to the\nmotion priors. Evaluated on AIST++, the VTM showcases state-of-the-art\nperformance in reconstructing 3D human motion from monocular videos. Notably,\nour VTM exhibits the capabilities for generalization to unseen view angles and\nin-the-wild videos.\n","authors":["Shuaiying Hou","Hongyu Tao","Junheng Fang","Changqing Zou","Hujun Bao","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09498v1","updated":"2024-04-15T06:37:21Z","published":"2024-04-15T06:37:21Z","title":"FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion\n with Mamba","summary":" Multi-modal image fusion aims to combine information from different modes to\ncreate a single image with comprehensive information and detailed textures.\nHowever, fusion models based on convolutional neural networks encounter\nlimitations in capturing global image features due to their focus on local\nconvolution operations. Transformer-based models, while excelling in global\nfeature modeling, confront computational challenges stemming from their\nquadratic complexity. 
Recently, the Selective Structured State Space Model has\nexhibited significant potential for long-range dependency modeling with linear\ncomplexity, offering a promising avenue to address the aforementioned dilemma.\nIn this paper, we propose FusionMamba, a novel dynamic feature enhancement\nmethod for multimodal image fusion with Mamba. Specifically, we devise an\nimproved efficient Mamba model for image fusion, integrating efficient visual\nstate space model with dynamic convolution and channel attention. This refined\nmodel not only upholds the performance of Mamba and global modeling capability\nbut also diminishes channel redundancy while enhancing local enhancement\ncapability. Additionally, we devise a dynamic feature fusion module (DFFM)\ncomprising two dynamic feature enhancement modules (DFEM) and a cross modality\nfusion mamba module (CMFM). The former serves for dynamic texture enhancement\nand dynamic difference perception, whereas the latter enhances correlation\nfeatures between modes and suppresses redundant intermodal information.\nFusionMamba has yielded state-of-the-art (SOTA) performance across various\nmultimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), infrared\nand visible image fusion task (IR-VIS) and multimodal biomedical image fusion\ndataset (GFP-PC), which is proved that our model has generalization ability.\nThe code for FusionMamba is available at\nhttps://github.com/millieXie/FusionMamba.\n","authors":["Xinyu Xie","Yawen Cui","Chio-In Ieong","Tao Tan","Xiaozhi Zhang","Xubin Zheng","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03883v2","updated":"2024-04-15T06:34:52Z","published":"2024-04-05T04:11:31Z","title":"LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and\n Image Classification","summary":" The fusion of hyperspectral and LiDAR data has been an active research topic.\nExisting fusion methods have ignored the high-dimensionality and redundancy\nchallenges in hyperspectral images, despite that band selection methods have\nbeen intensively studied for hyperspectral image (HSI) processing. This paper\naddresses this significant gap by introducing a cross-attention mechanism from\nthe transformer architecture for the selection of HSI bands guided by LiDAR\ndata. LiDAR provides high-resolution vertical structural information, which can\nbe useful in distinguishing different types of land cover that may have similar\nspectral signatures but different structural profiles. In our approach, the\nLiDAR data are used as the \"query\" to search and identify the \"key\" from the\nHSI to choose the most pertinent bands for LiDAR. This method ensures that the\nselected HSI bands drastically reduce redundancy and computational requirements\nwhile working optimally with the LiDAR data. Extensive experiments have been\nundertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and\nMUUFL. The results highlight the superiority of the cross-attention mechanism,\nunderlining the enhanced classification accuracy of the identified HSI bands\nwhen fused with the LiDAR features. 
The results also show that the use of fewer\nbands combined with LiDAR surpasses the performance of state-of-the-art fusion\nmodels.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee-Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.03883v2.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.09496v1","updated":"2024-04-15T06:33:32Z","published":"2024-04-15T06:33:32Z","title":"Towards Collaborative Autonomous Driving: Simulation Platform and\n End-to-End System","summary":" Vehicle-to-everything-aided autonomous driving (V2X-AD) has a huge potential\nto provide a safer driving solution. Despite extensive researches in\ntransportation and communication to support V2X-AD, the actual utilization of\nthese infrastructures and communication resources in enhancing driving\nperformances remains largely unexplored. This highlights the necessity of\ncollaborative autonomous driving: a machine learning approach that optimizes\nthe information sharing strategy to improve the driving performance of each\nvehicle. This effort necessitates two key foundations: a platform capable of\ngenerating data to facilitate the training and testing of V2X-AD, and a\ncomprehensive system that integrates full driving-related functionalities with\nmechanisms for information sharing. From the platform perspective, we present\nV2Xverse, a comprehensive simulation platform for collaborative autonomous\ndriving. This platform provides a complete pipeline for collaborative driving.\nFrom the system perspective, we introduce CoDriving, a novel end-to-end\ncollaborative driving system that properly integrates V2X communication over\nthe entire autonomous pipeline, promoting driving with shared perceptual\ninformation. The core idea is a novel driving-oriented communication strategy.\nLeveraging this strategy, CoDriving improves driving performance while\noptimizing communication efficiency. We make comprehensive benchmarks with\nV2Xverse, analyzing both modular performance and closed-loop driving\nperformance. Experimental results show that CoDriving: i) significantly\nimproves the driving score by 62.49% and drastically reduces the pedestrian\ncollision rate by 53.50% compared to the SOTA end-to-end driving method, and\nii) achieves sustaining driving performance superiority over dynamic constraint\ncommunication conditions.\n","authors":["Genjia Liu","Yue Hu","Chenxin Xu","Weibo Mao","Junhao Ge","Zhengxiang Huang","Yifan Lu","Yinda Xu","Junkai Xia","Yafei Wang","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09490v1","updated":"2024-04-15T06:24:56Z","published":"2024-04-15T06:24:56Z","title":"Leveraging Temporal Contextualization for Video Action Recognition","summary":" Pretrained vision-language models have shown effectiveness in video\nunderstanding. However, recent studies have not sufficiently leveraged\nessential temporal information from videos, simply averaging frame-wise\nrepresentations or referencing consecutive frames. We introduce Temporally\nContextualized CLIP (TC-CLIP), a pioneering framework for video understanding\nthat effectively and efficiently leverages comprehensive video information. 
We\npropose Temporal Contextualization (TC), a novel layer-wise temporal\ninformation infusion mechanism for video that extracts core information from\neach frame, interconnects relevant information across the video to summarize\ninto context tokens, and ultimately leverages the context tokens during the\nfeature encoding process. Furthermore, our Video-conditional Prompting (VP)\nmodule manufactures context tokens to generate informative prompts in text\nmodality. We conduct extensive experiments in zero-shot, few-shot,\nbase-to-novel, and fully-supervised action recognition to validate the\nsuperiority of our TC-CLIP. Ablation studies for TC and VP guarantee our design\nchoices. Code is available at https://github.com/naver-ai/tc-clip\n","authors":["Minji Kim","Dongyoon Han","Taekyung Kim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2404.09490v1.pdf","comment":"24 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2404.09486v1","updated":"2024-04-15T06:15:46Z","published":"2024-04-15T06:15:46Z","title":"MMCode: Evaluating Multi-Modal Code Large Language Models with Visually\n Rich Programming Problems","summary":" Programming often involves converting detailed and complex specifications\ninto code, a process during which developers typically utilize visual aids to\nmore effectively convey concepts. While recent developments in Large Multimodal\nModels have demonstrated remarkable abilities in visual reasoning and\nmathematical tasks, there is little work on investigating whether these models\ncan effectively interpret visual elements for code generation. To this end, we\npresent MMCode, the first multi-modal coding dataset for evaluating algorithmic\nproblem-solving skills in visually rich contexts. MMCode contains 3,548\nquestions and 6,620 images collected from real-world programming challenges\nharvested from 10 code competition websites, presenting significant challenges\ndue to the extreme demand for reasoning abilities. Our experiment results show\nthat current state-of-the-art models struggle to solve these problems. The\nresults highlight the lack of powerful vision-code models, and we hope MMCode\ncan serve as an inspiration for future works in this domain. The data and code\nare publicly available at https://github.com/happylkx/MMCode.\n","authors":["Kaixin Li","Yuchen Tian","Qisheng Hu","Ziyang Luo","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.09486v1.pdf","comment":"46 pages, 21 figures and 6 tables"},{"id":"http://arxiv.org/abs/2311.12198v3","updated":"2024-04-15T06:04:55Z","published":"2023-11-20T21:34:52Z","title":"PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics","summary":" We introduce PhysGaussian, a new method that seamlessly integrates physically\ngrounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel\nmotion synthesis. Employing a custom Material Point Method (MPM), our approach\nenriches 3D Gaussian kernels with physically meaningful kinematic deformation\nand mechanical stress attributes, all evolved in line with continuum mechanics\nprinciples. A defining characteristic of our method is the seamless integration\nbetween physical simulation and visual rendering: both components utilize the\nsame 3D Gaussian kernels as their discrete representations. 
This negates the\nnecessity for triangle/tetrahedron meshing, marching cubes, \"cage meshes,\" or\nany other geometry embedding, highlighting the principle of \"what you see is\nwhat you simulate (WS$^2$).\" Our method demonstrates exceptional versatility\nacross a wide variety of materials--including elastic entities, metals,\nnon-Newtonian fluids, and granular materials--showcasing its strong\ncapabilities in creating diverse visual content with novel viewpoints and\nmovements. Our project page is at: https://xpandora.github.io/PhysGaussian/\n","authors":["Tianyi Xie","Zeshun Zong","Yuxing Qiu","Xuan Li","Yutao Feng","Yin Yang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.12198v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09476v1","updated":"2024-04-15T06:02:31Z","published":"2024-04-15T06:02:31Z","title":"FreqMamba: Viewing Mamba from a Frequency Perspective for Image\n Deraining","summary":" Images corrupted by rain streaks often lose vital frequency information for\nperception, and image deraining aims to solve this issue which relies on global\nand local degradation modeling. Recent studies have witnessed the effectiveness\nand efficiency of Mamba for perceiving global and local information based on\nits exploiting local correlation among patches, however, rarely attempts have\nbeen explored to extend it with frequency analysis for image deraining,\nlimiting its ability to perceive global degradation that is relevant to\nfrequency modeling (e.g. Fourier transform). In this paper, we propose\nFreqMamba, an effective and efficient paradigm that leverages the complementary\nbetween Mamba and frequency analysis for image deraining. The core of our\nmethod lies in extending Mamba with frequency analysis from two perspectives:\nextending it with frequency-band for exploiting frequency correlation, and\nconnecting it with Fourier transform for global degradation modeling.\nSpecifically, FreqMamba introduces complementary triple interaction structures\nincluding spatial Mamba, frequency band Mamba, and Fourier global modeling.\nFrequency band Mamba decomposes the image into sub-bands of different\nfrequencies to allow 2D scanning from the frequency dimension. Furthermore,\nleveraging Mamba's unique data-dependent properties, we use rainy images at\ndifferent scales to provide degradation priors to the network, thereby\nfacilitating efficient training. Extensive experiments show that our method\noutperforms state-of-the-art methods both visually and quantitatively.\n","authors":["Zou Zhen","Yu Hu","Zhao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09475v1","updated":"2024-04-15T06:02:09Z","published":"2024-04-15T06:02:09Z","title":"Improving Weakly-Supervised Object Localization Using Adversarial\n Erasing and Pseudo Label","summary":" Weakly-supervised learning approaches have gained significant attention due\nto their ability to reduce the effort required for human annotations in\ntraining neural networks. This paper investigates a framework for\nweakly-supervised object localization, which aims to train a neural network\ncapable of predicting both the object class and its location using only images\nand their image-level class labels. The proposed framework consists of a shared\nfeature extractor, a classifier, and a localizer. The localizer predicts\npixel-level class probabilities, while the classifier predicts the object class\nat the image level. 
Since image-level class labels are insufficient for\ntraining the localizer, weakly-supervised object localization methods often\nencounter challenges in accurately localizing the entire object region. To\naddress this issue, the proposed method incorporates adversarial erasing and\npseudo labels to improve localization accuracy. Specifically, novel losses are\ndesigned to utilize adversarially erased foreground features and adversarially\nerased feature maps, reducing dependence on the most discriminative region.\nAdditionally, the proposed method employs pseudo labels to suppress activation\nvalues in the background while increasing them in the foreground. The proposed\nmethod is applied to two backbone networks (MobileNetV1 and InceptionV3) and is\nevaluated on three publicly available datasets (ILSVRC-2012, CUB-200-2011, and\nPASCAL VOC 2012). The experimental results demonstrate that the proposed method\noutperforms previous state-of-the-art methods across all evaluated metrics.\n","authors":["Byeongkeun Kang","Sinhae Cha","Yeejin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.09475v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.09474v1","updated":"2024-04-15T06:01:48Z","published":"2024-04-15T06:01:48Z","title":"TCCT-Net: Two-Stream Network Architecture for Fast and Efficient\n Engagement Estimation via Behavioral Feature Signals","summary":" Engagement analysis finds various applications in healthcare, education,\nadvertisement, services. Deep Neural Networks, used for analysis, possess\ncomplex architecture and need large amounts of input data, computational power,\ninference time. These constraints challenge embedding systems into devices for\nreal-time use. To address these limitations, we present a novel two-stream\nfeature fusion \"Tensor-Convolution and Convolution-Transformer Network\"\n(TCCT-Net) architecture. To better learn the meaningful patterns in the\ntemporal-spatial domain, we design a \"CT\" stream that integrates a hybrid\nconvolutional-transformer. In parallel, to efficiently extract rich patterns\nfrom the temporal-frequency domain and boost processing speed, we introduce a\n\"TC\" stream that uses Continuous Wavelet Transform (CWT) to represent\ninformation in a 2D tensor form. Evaluated on the EngageNet dataset, the\nproposed method outperforms existing baselines, utilizing only two behavioral\nfeatures (head pose rotations) compared to the 98 used in baseline models.\nFurthermore, comparative analysis shows TCCT-Net's architecture offers an\norder-of-magnitude improvement in inference speed compared to state-of-the-art\nimage-based Recurrent Neural Network (RNN) methods. The code will be released\nat https://github.com/vedernikovphoto/TCCT_Net.\n","authors":["Alexander Vedernikov","Puneet Kumar","Haoyu Chen","Tapio Seppanen","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2404.09474v1.pdf","comment":"Accepted for the CVPR 2024 workshop (ABAW)"},{"id":"http://arxiv.org/abs/2404.09472v1","updated":"2024-04-15T05:53:26Z","published":"2024-04-15T05:53:26Z","title":"Q2A: Querying Implicit Fully Continuous Feature Pyramid to Align\n Features for Medical Image Segmentation","summary":" Recent medical image segmentation methods apply implicit neural\nrepresentation (INR) to the decoder for achieving a continuous coordinate\ndecoding to tackle the drawback of conventional discrete grid-based data\nrepresentations. 
However, the INR-based decoder cannot well handle the feature\nmisalignment problem brought about by the naive latent code acquisition\nstrategy in INR. Although there exist many feature alignment works, they all\nadopt a progressive multi-step aligning paradigm on a discrete feature pyramid,\nwhich is incompatible with the continuous one-step characteristics of INR-based\ndecoder, and thus fails to be the solution. Therefore, we propose Q2A, a novel\none-step query-based aligning paradigm, to solve the feature misalignment\nproblem in the INR-based decoder. Specifically, for each target coordinate, Q2A\nfirst generates several queries depicting the spatial offsets and the cell\nresolutions of the contextual features aligned to the coordinate, then\ncalculates the corresponding aligned features by feeding the queries into a\nnovel implicit fully continuous feature pyramid (FCFP), finally fuses the\naligned features to predict the class distribution. In FCFP, we further propose\na novel universal partition-and-aggregate strategy (P&A) to replace the naive\ninterpolation strategy for latent code acquisition in INR, which mitigates the\ninformation loss problem that occurs when the query cell resolution is\nrelatively large and achieves an effective feature decoding at arbitrary\ncontinuous resolution. We conduct extensive experiments on two medical\ndatasets, i.e. Glas and Synapse, and a universal dataset, i.e. Cityscapes, and\nthey show the superiority of the proposed Q2A.\n","authors":["Jiahao Yu","Li Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09472v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.09469v1","updated":"2024-04-15T05:44:03Z","published":"2024-04-15T05:44:03Z","title":"Virtually Enriched NYU Depth V2 Dataset for Monocular Depth Estimation:\n Do We Need Artificial Augmentation?","summary":" We present ANYU, a new virtually augmented version of the NYU depth v2\ndataset, designed for monocular depth estimation. In contrast to the well-known\napproach where full 3D scenes of a virtual world are utilized to generate\nartificial datasets, ANYU was created by incorporating RGB-D representations of\nvirtual reality objects into the original NYU depth v2 images. We specifically\ndid not match each generated virtual object with an appropriate texture and a\nsuitable location within the real-world image. Instead, an assignment of\ntexture, location, lighting, and other rendering parameters was randomized to\nmaximize a diversity of the training data, and to show that it is randomness\nthat can improve the generalizing ability of a dataset. By conducting extensive\nexperiments with our virtually modified dataset and validating on the original\nNYU depth v2 and iBims-1 benchmarks, we show that ANYU improves the monocular\ndepth estimation performance and generalization of deep neural networks with\nconsiderably different architectures, especially for the current\nstate-of-the-art VPD model. To the best of our knowledge, this is the first\nwork that augments a real-world dataset with randomly generated virtual 3D\nobjects for monocular depth estimation. 
We make our ANYU dataset publicly\navailable in two training configurations with 10% and 100% additional\nsynthetically enriched RGB-D pairs of training images, respectively, for\nefficient training and empirical exploration of virtual augmentation at\nhttps://github.com/ABrain-One/ANYU\n","authors":["Dmitry Ignatov","Andrey Ignatov","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.09469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03124v2","updated":"2024-04-15T05:38:16Z","published":"2024-02-05T15:51:34Z","title":"Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks","summary":" Gradient inversion attacks aim to reconstruct local training data from\nintermediate gradients exposed in the federated learning framework. Despite\nsuccessful attacks, all previous methods, starting from reconstructing a single\ndata point and then relaxing the single-image limit to batch level, are only\ntested under hard label constraints. Even for single-image reconstruction, we\nstill lack an analysis-based algorithm to recover augmented soft labels. In\nthis work, we change the focus from enlarging batchsize to investigating the\nhard label constraints, considering a more realistic circumstance where label\nsmoothing and mixup techniques are used in the training process. In particular,\nwe are the first to initiate a novel algorithm to simultaneously recover the\nground-truth augmented label and the input feature of the last fully-connected\nlayer from single-input gradients, and provide a necessary condition for any\nanalytical-based label recovery methods. Extensive experiments testify to the\nlabel recovery accuracy, as well as the benefits to the following image\nreconstruction. We believe soft labels in classification tasks are worth\nfurther attention in gradient inversion attacks.\n","authors":["Yanbo Wang","Jian Liang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2402.03124v2.pdf","comment":"ICLR2024 poster"},{"id":"http://arxiv.org/abs/2404.09465v1","updated":"2024-04-15T05:29:23Z","published":"2024-04-15T05:29:23Z","title":"PhyScene: Physically Interactable 3D Scene Synthesis for Embodied AI","summary":" With recent developments in Embodied Artificial Intelligence (EAI) research,\nthere has been a growing demand for high-quality, large-scale interactive scene\ngeneration. While prior methods in scene synthesis have prioritized the\nnaturalness and realism of the generated scenes, the physical plausibility and\ninteractivity of scenes have been largely left unexplored. To address this\ndisparity, we introduce PhyScene, a novel method dedicated to generating\ninteractive 3D scenes characterized by realistic layouts, articulated objects,\nand rich physical interactivity tailored for embodied agents. Based on a\nconditional diffusion model for capturing scene layouts, we devise novel\nphysics- and interactivity-based guidance mechanisms that integrate constraints\nfrom object collision, room layout, and object reachability. Through extensive\nexperiments, we demonstrate that PhyScene effectively leverages these guidance\nfunctions for physically interactable scene synthesis, outperforming existing\nstate-of-the-art scene synthesis methods by a large margin. 
Our findings\nsuggest that the scenes generated by PhyScene hold considerable potential for\nfacilitating diverse skill acquisition among agents within interactive\nenvironments, thereby catalyzing further advancements in embodied AI research.\nProject website: http://physcene.github.io.\n","authors":["Yandan Yang","Baoxiong Jia","Peiyuan Zhi","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09465v1.pdf","comment":"Accepted by CVPR 2024, 18 pages"},{"id":"http://arxiv.org/abs/2404.09461v1","updated":"2024-04-15T05:00:40Z","published":"2024-04-15T05:00:40Z","title":"Improved Object-Based Style Transfer with Single Deep Network","summary":" This research paper proposes a novel methodology for image-to-image style\ntransfer on objects utilizing a single deep convolutional neural network. The\nproposed approach leverages the You Only Look Once version 8 (YOLOv8)\nsegmentation model and the backbone neural network of YOLOv8 for style\ntransfer. The primary objective is to enhance the visual appeal of objects in\nimages by seamlessly transferring artistic styles while preserving the original\nobject characteristics. The proposed approach's novelty lies in combining\nsegmentation and style transfer in a single deep convolutional neural network.\nThis approach omits the need for multiple stages or models, thus resulting in\nsimpler training and deployment of the model for practical applications. The\nresults of this approach are shown on two content images by applying different\nstyle images. The paper also demonstrates the ability to apply style transfer\non multiple objects in the same image.\n","authors":["Harshmohan Kulkarni","Om Khare","Ninad Barve","Sunil Mane"],"pdf_url":"https://arxiv.org/pdf/2404.09461v1.pdf","comment":"In Proceedings of the Fourth International Conference on Innovations\n in Computational Intelligence and Computer Vision"},{"id":"http://arxiv.org/abs/2303.09792v3","updated":"2024-04-15T04:58:07Z","published":"2023-03-17T06:26:55Z","title":"Exploring Sparse Visual Prompt for Domain Adaptive Dense Prediction","summary":" The visual prompts have provided an efficient manner in addressing visual\ncross-domain problems. In previous works, Visual Domain Prompt (VDP) first\nintroduces domain prompts to tackle the classification Test-Time Adaptation\n(TTA) problem by warping image-level prompts on the input and fine-tuning\nprompts for each target domain. However, since the image-level prompts mask out\ncontinuous spatial details in the prompt-allocated region, it will suffer from\ninaccurate contextual information and limited domain knowledge extraction,\nparticularly when dealing with dense prediction TTA problems. To overcome these\nchallenges, we propose a novel Sparse Visual Domain Prompts (SVDP) approach,\nwhich holds minimal trainable parameters (e.g., 0.1\\%) in the image-level\nprompt and reserves more spatial information of the input. To better apply SVDP\nin extracting domain-specific knowledge, we introduce the Domain Prompt\nPlacement (DPP) method to adaptively allocates trainable parameters of SVDP on\nthe pixels with large distribution shifts. Furthermore, recognizing that each\ntarget domain sample exhibits a unique domain shift, we design Domain Prompt\nUpdating (DPU) strategy to optimize prompt parameters differently for each\nsample, facilitating efficient adaptation to the target domain. 
Extensive\nexperiments were conducted on widely-used TTA and continual TTA benchmarks, and\nour proposed method achieves state-of-the-art performance in both semantic\nsegmentation and depth estimation tasks.\n","authors":["Senqiao Yang","Jiarui Wu","Jiaming Liu","Xiaoqi Li","Qizhe Zhang","Mingjie Pan","Yulu Gan","Zehui Chen","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.09792v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2404.09458v1","updated":"2024-04-15T04:50:39Z","published":"2024-04-15T04:50:39Z","title":"CompGS: Efficient 3D Scene Representation via Compressed Gaussian\n Splatting","summary":" Gaussian splatting, renowned for its exceptional rendering quality and\nefficiency, has emerged as a prominent technique in 3D scene representation.\nHowever, the substantial data volume of Gaussian splatting impedes its\npractical utility in real-world applications. Herein, we propose an efficient\n3D scene representation, named Compressed Gaussian Splatting (CompGS), which\nharnesses compact Gaussian primitives for faithful 3D scene modeling with a\nremarkably reduced data size. To ensure the compactness of Gaussian primitives,\nwe devise a hybrid primitive structure that captures predictive relationships\nbetween each other. Then, we exploit a small set of anchor primitives for\nprediction, allowing the majority of primitives to be encapsulated into highly\ncompact residual forms. Moreover, we develop a rate-constrained optimization\nscheme to eliminate redundancies within such hybrid primitives, steering our\nCompGS towards an optimal trade-off between bitrate consumption and\nrepresentation efficacy. Experimental results show that the proposed CompGS\nsignificantly outperforms existing methods, achieving superior compactness in\n3D scene representation without compromising model accuracy and rendering\nquality. Our code will be released on GitHub for further research.\n","authors":["Xiangrui Liu","Xinju Wu","Pingping Zhang","Shiqi Wang","Zhu Li","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2404.09458v1.pdf","comment":"Submitted to a conference"},{"id":"http://arxiv.org/abs/2404.09454v1","updated":"2024-04-15T04:43:53Z","published":"2024-04-15T04:43:53Z","title":"Utility-Fairness Trade-Offs and How to Find Them","summary":" When building classification systems with demographic fairness\nconsiderations, there are two objectives to satisfy: 1) maximizing utility for\nthe specific task and 2) ensuring fairness w.r.t. a known demographic\nattribute. These objectives often compete, so optimizing both can lead to a\ntrade-off between utility and fairness. While existing works acknowledge the\ntrade-offs and study their limits, two questions remain unanswered: 1) What are\nthe optimal trade-offs between utility and fairness? and 2) How can we\nnumerically quantify these trade-offs from data for a desired prediction task\nand demographic attribute of interest? This paper addresses these questions. We\nintroduce two utility-fairness trade-offs: the Data-Space and Label-Space\nTrade-off. The trade-offs reveal three regions within the utility-fairness\nplane, delineating what is fully and partially possible and impossible. We\npropose U-FaTE, a method to numerically quantify the trade-offs for a given\nprediction task and group fairness definition from data samples. Based on the\ntrade-offs, we introduce a new scheme for evaluating representations. 
An\nextensive evaluation of fair representation learning methods and\nrepresentations from over 1000 pre-trained models revealed that most current\napproaches are far from the estimated and achievable fairness-utility\ntrade-offs across multiple datasets and prediction tasks.\n","authors":["Sepehr Dehdashtian","Bashir Sadeghi","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2404.09454v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024"},{"id":"http://arxiv.org/abs/2404.05317v3","updated":"2024-04-15T04:37:44Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v3.pdf","comment":"updated section II-C (\"A-Frame\"), updated references"},{"id":"http://arxiv.org/abs/2404.09451v1","updated":"2024-04-15T04:31:24Z","published":"2024-04-15T04:31:24Z","title":"Contrastive Mean-Shift Learning for Generalized Category Discovery","summary":" We address the problem of generalized category discovery (GCD) that aims to\npartition a partially labeled collection of images; only a small part of the\ncollection is labeled and the total number of target classes is unknown. To\naddress this generalized image clustering problem, we revisit the mean-shift\nalgorithm, i.e., a classic, powerful technique for mode seeking, and\nincorporate it into a contrastive learning framework. The proposed method,\ndubbed Contrastive Mean-Shift (CMS) learning, trains an image encoder to\nproduce representations with better clustering properties by an iterative\nprocess of mean shift and contrastive update. Experiments demonstrate that our\nmethod, both in settings with and without the total number of clusters being\nknown, achieves state-of-the-art performance on six public GCD benchmarks\nwithout bells and whistles.\n","authors":["Sua Choi","Dahyun Kang","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.09451v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09447v1","updated":"2024-04-15T04:20:01Z","published":"2024-04-15T04:20:01Z","title":"kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually\n Expanding Large Vocabularies","summary":" Rapid advancements in continual segmentation have yet to bridge the gap of\nscaling to large continually expanding vocabularies under compute-constrained\nscenarios. We discover that traditional continual training leads to\ncatastrophic forgetting under compute constraints, unable to outperform\nzero-shot segmentation methods. We introduce a novel strategy for semantic and\npanoptic segmentation with zero forgetting, capable of adapting to continually\ngrowing vocabularies without the need for retraining or large memory costs. 
Our\ntraining-free approach, kNN-CLIP, leverages a database of instance embeddings\nto enable open-vocabulary segmentation approaches to continually expand their\nvocabulary on any given domain with a single-pass through data, while only\nstoring embeddings minimizing both compute and memory costs. This method\nachieves state-of-the-art mIoU performance across large-vocabulary semantic and\npanoptic segmentation datasets. We hope kNN-CLIP represents a step forward in\nenabling more efficient and adaptable continual segmentation, paving the way\nfor advances in real-world large-vocabulary continual segmentation methods.\n","authors":["Zhongrui Gui","Shuyang Sun","Runjia Li","Jianhao Yuan","Zhaochong An","Karsten Roth","Ameya Prabhu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2404.09447v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.09445v1","updated":"2024-04-15T04:14:42Z","published":"2024-04-15T04:14:42Z","title":"Exploring Text-to-Motion Generation with Human Preference","summary":" This paper presents an exploration of preference learning in text-to-motion\ngeneration. We find that current improvements in text-to-motion generation\nstill rely on datasets requiring expert labelers with motion capture systems.\nInstead, learning from human preference data does not require motion capture\nsystems; a labeler with no expertise simply compares two generated motions.\nThis is particularly efficient because evaluating the model's output is easier\nthan gathering the motion that performs a desired task (e.g. backflip). To\npioneer the exploration of this paradigm, we annotate 3,528 preference pairs\ngenerated by MotionGPT, marking the first effort to investigate various\nalgorithms for learning from preference data. In particular, our exploration\nhighlights important design choices when using preference data. Additionally,\nour experimental results show that preference learning has the potential to\ngreatly improve current text-to-motion generative models. Our code and dataset\nare publicly available at\nhttps://github.com/THU-LYJ-Lab/InstructMotion}{https://github.com/THU-LYJ-Lab/InstructMotion\nto further facilitate research in this area.\n","authors":["Jenny Sheng","Matthieu Lin","Andrew Zhao","Kevin Pruvost","Yu-Hui Wen","Yangguang Li","Gao Huang","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09445v1.pdf","comment":"Accepted to CVPR 2024 HuMoGen Workshop"},{"id":"http://arxiv.org/abs/2402.09055v3","updated":"2024-04-15T03:23:07Z","published":"2024-02-14T10:05:19Z","title":"Comment-aided Video-Language Alignment via Contrastive Pre-training for\n Short-form Video Humor Detection","summary":" The growing importance of multi-modal humor detection within affective\ncomputing correlates with the expanding influence of short-form video sharing\non social media platforms. In this paper, we propose a novel two-branch\nhierarchical model for short-form video humor detection (SVHD), named\nComment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal\ncontrastive pre-training. Notably, our CVLA not only operates on raw signals\nacross various modal channels but also yields an appropriate multi-modal\nrepresentation by aligning the video and language components within a\nconsistent semantic space. The experimental results on two humor detection\ndatasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically\noutperforms state-of-the-art and several competitive baseline approaches. 
Our\ndataset, code and model release at https://github.com/yliu-cs/CVLA.\n","authors":["Yang Liu","Tongfei Shen","Dong Zhang","Qingying Sun","Shoushan Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.09055v3.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2308.06603v3","updated":"2024-04-15T03:20:41Z","published":"2023-08-12T16:14:44Z","title":"LadleNet: A Two-Stage UNet for Infrared Image to Visible Image\n Translation Guided by Semantic Segmentation","summary":" The translation of thermal infrared (TIR) images into visible light (VI)\nimages plays a critical role in enhancing model performance and generalization\ncapability, particularly in various fields such as registration and fusion of\nTIR and VI images. However, current research in this field faces challenges of\ninsufficiently realistic image quality after translation and the difficulty of\nexisting models in adapting to unseen scenarios. In order to develop a more\ngeneralizable image translation architecture, we conducted an analysis of\nexisting translation architectures. By exploring the interpretability of\nintermediate modalities in existing translation architectures, we found that\nthe intermediate modality in the image translation process for street scene\nimages essentially performs semantic segmentation, distinguishing street images\nbased on background and foreground patterns before assigning color information.\nBased on these principles, we propose an improved algorithm based on U-net\ncalled LadleNet. This network utilizes a two-stage U-net concatenation\nstructure, consisting of Handle and Bowl modules. The Handle module is\nresponsible for constructing an abstract semantic space, while the Bowl module\ndecodes the semantic space to obtain the mapped VI image. Due to the\ncharacteristic of semantic segmentation, the Handle module has strong\nextensibility. Therefore, we also propose LadleNet+, which replaces the Handle\nmodule in LadleNet with a pre-trained DeepLabv3+ network, enabling the model to\nhave a more powerful capability in constructing semantic space. The proposed\nmethods were trained and tested on the KAIST dataset, followed by quantitative\nand qualitative analysis. Compared to existing methods, LadleNet and LadleNet+\nachieved an average improvement of 12.4% and 15.2% in SSIM metrics, and 37.9%\nand 50.6% in MS-SSIM metrics, respectively.\n","authors":["Tonghui Zou","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09432v1","updated":"2024-04-15T03:12:17Z","published":"2024-04-15T03:12:17Z","title":"The 8th AI City Challenge","summary":" The eighth AI City Challenge highlighted the convergence of computer vision\nand artificial intelligence in areas like retail, warehouse settings, and\nIntelligent Traffic Systems (ITS), presenting significant research\nopportunities. The 2024 edition featured five tracks, attracting unprecedented\ninterest from 726 teams in 47 countries and regions. Track 1 dealt with\nmulti-target multi-camera (MTMC) people tracking, highlighting significant\nenhancements in camera count, character number, 3D annotation, and camera\nmatrices, alongside new rules for 3D tracking and online tracking algorithm\nencouragement. Track 2 introduced dense video captioning for traffic safety,\nfocusing on pedestrian accidents using multi-camera feeds to improve insights\nfor insurance and prevention. Track 3 required teams to classify driver actions\nin a naturalistic driving analysis. 
Track 4 explored fish-eye camera analytics\nusing the FishEye8K dataset. Track 5 focused on motorcycle helmet rule\nviolation detection. The challenge utilized two leaderboards to showcase\nmethods, with participants setting new benchmarks, some surpassing existing\nstate-of-the-art achievements.\n","authors":["Shuo Wang","David C. Anastasiu","Zheng Tang","Ming-Ching Chang","Yue Yao","Liang Zheng","Mohammed Shaiqur Rahman","Meenakshi S. Arya","Anuj Sharma","Pranamesh Chakraborty","Sanjita Prajapati","Quan Kong","Norimasa Kobori","Munkhjargal Gochoo","Munkh-Erdene Otgonbold","Fady Alnajjar","Ganzorig Batnasan","Ping-Yang Chen","Jun-Wei Hsieh","Xunlei Wu","Sameer Satish Pusegaonkar","Yizhou Wang","Sujit Biswas","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2404.09432v1.pdf","comment":"Summary of the 8th AI City Challenge Workshop in conjunction with\n CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09431v1","updated":"2024-04-15T03:12:12Z","published":"2024-04-15T03:12:12Z","title":"VFMM3D: Releasing the Potential of Image by Vision Foundation Model for\n Monocular 3D Object Detection","summary":" Due to its cost-effectiveness and widespread availability, monocular 3D\nobject detection, which relies solely on a single camera during inference,\nholds significant importance across various applications, including autonomous\ndriving and robotics. Nevertheless, directly predicting the coordinates of\nobjects in 3D space from monocular images poses challenges. Therefore, an\neffective solution involves transforming monocular images into LiDAR-like\nrepresentations and employing a LiDAR-based 3D object detector to predict the\n3D coordinates of objects. The key step in this method is accurately converting\nthe monocular image into a reliable point cloud form. In this paper, we present\nVFMM3D, an innovative approach that leverages the capabilities of Vision\nFoundation Models (VFMs) to accurately transform single-view images into LiDAR\npoint cloud representations. VFMM3D utilizes the Segment Anything Model (SAM)\nand Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data\nenriched with rich foreground information. Specifically, the Depth Anything\nModel (DAM) is employed to generate dense depth maps. Subsequently, the Segment\nAnything Model (SAM) is utilized to differentiate foreground and background\nregions by predicting instance masks. These predicted instance masks and depth\nmaps are then combined and projected into 3D space to generate pseudo-LiDAR\npoints. Finally, any object detectors based on point clouds can be utilized to\npredict the 3D coordinates of objects. Comprehensive experiments are conducted\non the challenging 3D object detection dataset KITTI. Our VFMM3D establishes a\nnew state-of-the-art performance. Additionally, experimental results\ndemonstrate the generality of VFMM3D, showcasing its seamless integration into\nvarious LiDAR-based 3D object detectors.\n","authors":["Bonan Ding","Jin Xie","Jing Nie","Jiale Cao"],"pdf_url":"https://arxiv.org/pdf/2404.09431v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09220v2","updated":"2024-04-15T02:47:01Z","published":"2023-07-18T12:52:49Z","title":"A Survey on Open-Vocabulary Detection and Segmentation: Past, Present,\n and Future","summary":" As the most fundamental scene understanding tasks, object detection and\nsegmentation have made tremendous progress in deep learning era. 
Due to the\nexpensive manual labeling cost, the annotated categories in existing datasets\nare often small-scale and pre-defined, i.e., state-of-the-art fully-supervised\ndetectors and segmentors fail to generalize beyond the closed vocabulary. To\nresolve this limitation, in the last few years, the community has witnessed an\nincreasing attention toward Open-Vocabulary Detection (OVD) and Segmentation\n(OVS). By ``open-vocabulary'', we mean that the models can classify objects\nbeyond pre-defined categories. In this survey, we provide a comprehensive\nreview on recent developments of OVD and OVS. A taxonomy is first developed to\norganize different tasks and methodologies. We find that the permission and\nusage of weak supervision signals can well discriminate different\nmethodologies, including: visual-semantic space mapping, novel visual feature\nsynthesis, region-aware training, pseudo-labeling, knowledge distillation, and\ntransfer learning. The proposed taxonomy is universal across different tasks,\ncovering object detection, semantic/instance/panoptic segmentation, 3D and\nvideo understanding. The main design principles, key challenges, development\nroutes, methodology strengths, and weaknesses are thoroughly analyzed. In\naddition, we benchmark each task along with the vital components of each method\nin appendix and updated online at\nhttps://github.com/seanzhuh/awesome-open-vocabulary-detection-and-segmentation.\nFinally, several promising directions are provided and discussed to stimulate\nfuture research.\n","authors":["Chaoyang Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2307.09220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09426v1","updated":"2024-04-15T02:44:23Z","published":"2024-04-15T02:44:23Z","title":"ViFu: Multiple 360$^\\circ$ Objects Reconstruction with Clean Background\n via Visible Part Fusion","summary":" In this paper, we propose a method to segment and recover a static, clean\nbackground and multiple 360$^\\circ$ objects from observations of scenes at\ndifferent timestamps. Recent works have used neural radiance fields to model 3D\nscenes and improved the quality of novel view synthesis, while few studies have\nfocused on modeling the invisible or occluded parts of the training images.\nThese under-reconstruction parts constrain both scene editing and rendering\nview selection, thereby limiting their utility for synthetic data generation\nfor downstream tasks. Our basic idea is that, by observing the same set of\nobjects in various arrangement, so that parts that are invisible in one scene\nmay become visible in others. By fusing the visible parts from each scene,\nocclusion-free rendering of both background and foreground objects can be\nachieved.\n We decompose the multi-scene fusion task into two main components: (1)\nobjects/background segmentation and alignment, where we leverage point\ncloud-based methods tailored to our novel problem formulation; (2) radiance\nfields fusion, where we introduce visibility field to quantify the visible\ninformation of radiance fields, and propose visibility-aware rendering for the\nfusion of series of scenes, ultimately obtaining clean background and\n360$^\\circ$ object rendering. 
Comprehensive experiments were conducted on\nsynthetic and real datasets, and the results demonstrate the effectiveness of\nour method.\n","authors":["Tianhan Xu","Takuya Ikeda","Koichi Nishiwaki"],"pdf_url":"https://arxiv.org/pdf/2404.09426v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.09425v1","updated":"2024-04-15T02:41:55Z","published":"2024-04-15T02:41:55Z","title":"Super-resolution of biomedical volumes with 2D supervision","summary":" Volumetric biomedical microscopy has the potential to increase the diagnostic\ninformation extracted from clinical tissue specimens and improve the diagnostic\naccuracy of both human pathologists and computational pathology models.\nUnfortunately, barriers to integrating 3-dimensional (3D) volumetric microscopy\ninto clinical medicine include long imaging times, poor depth / z-axis\nresolution, and an insufficient amount of high-quality volumetric data.\nLeveraging the abundance of high-resolution 2D microscopy data, we introduce\nmasked slice diffusion for super-resolution (MSDSR), which exploits the\ninherent equivalence in the data-generating distribution across all spatial\ndimensions of biological specimens. This intrinsic characteristic allows for\nsuper-resolution models trained on high-resolution images from one plane (e.g.,\nXY) to effectively generalize to others (XZ, YZ), overcoming the traditional\ndependency on orientation. We focus on the application of MSDSR to stimulated\nRaman histology (SRH), an optical imaging modality for biological specimen\nanalysis and intraoperative diagnosis, characterized by its rapid acquisition\nof high-resolution 2D images but slow and costly optical z-sectioning. To\nevaluate MSDSR's efficacy, we introduce a new performance metric, SliceFID, and\ndemonstrate MSDSR's superior performance over baseline models through extensive\nevaluations. Our findings reveal that MSDSR not only significantly enhances the\nquality and resolution of 3D volumetric data, but also addresses major\nobstacles hindering the broader application of 3D volumetric microscopy in\nclinical diagnostics and biomedical research.\n","authors":["Cheng Jiang","Alexander Gedeon","Yiwei Lyu","Eric Landgraf","Yufeng Zhang","Xinhai Hou","Akhil Kondepudi","Asadur Chowdury","Honglak Lee","Todd Hollon"],"pdf_url":"https://arxiv.org/pdf/2404.09425v1.pdf","comment":"CVPR Workshop on Computer Vision for Microscopy Image Analysis 2024"},{"id":"http://arxiv.org/abs/2404.07487v2","updated":"2024-04-15T02:25:22Z","published":"2024-04-11T05:51:06Z","title":"Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton\n Action Recognition","summary":" Skeleton-based zero-shot action recognition aims to recognize unknown human\nactions based on the learned priors of the known skeleton-based actions and a\nsemantic descriptor space shared by both known and unknown categories. However,\nprevious works focus on establishing the bridges between the known skeleton\nrepresentation space and semantic descriptions space at the coarse-grained\nlevel for recognizing unknown action categories, ignoring the fine-grained\nalignment of these two spaces, resulting in suboptimal performance in\ndistinguishing high-similarity action categories. 
To address these challenges,\nwe propose a novel method via Side information and dual-prompts learning for\nskeleton-based zero-shot action recognition (STAR) at the fine-grained level.\nSpecifically, 1) we decompose the skeleton into several parts based on its\ntopology structure and introduce the side information concerning multi-part\ndescriptions of human body movements for alignment between the skeleton and the\nsemantic space at the fine-grained level; 2) we design the visual-attribute and\nsemantic-part prompts to improve the intra-class compactness within the\nskeleton space and inter-class separability within the semantic space,\nrespectively, to distinguish the high-similarity actions. Extensive experiments\nshow that our method achieves state-of-the-art performance in ZSL and GZSL\nsettings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.\n","authors":["Yang Chen","Jingcai Guo","Tian He","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07487v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.08449v2","updated":"2024-04-15T02:10:45Z","published":"2024-04-12T13:00:06Z","title":"OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering","summary":" Rendering dynamic 3D human from monocular videos is crucial for various\napplications such as virtual reality and digital entertainment. Most methods\nassume the people is in an unobstructed scene, while various objects may cause\nthe occlusion of body parts in real-life scenarios. Previous method utilizing\nNeRF for surface rendering to recover the occluded areas, but it requiring more\nthan one day to train and several seconds to render, failing to meet the\nrequirements of real-time interactive applications. To address these issues, we\npropose OccGaussian based on 3D Gaussian Splatting, which can be trained within\n6 minutes and produces high-quality human renderings up to 160 FPS with\noccluded input. OccGaussian initializes 3D Gaussian distributions in the\ncanonical space, and we perform occlusion feature query at occluded regions,\nthe aggregated pixel-align feature is extracted to compensate for the missing\ninformation. Then we use Gaussian Feature MLP to further process the feature\nalong with the occlusion-aware loss functions to better perceive the occluded\narea. Extensive experiments both in simulated and real-world occlusions,\ndemonstrate that our method achieves comparable or even superior performance\ncompared to the state-of-the-art method. And we improving training and\ninference speeds by 250x and 800x, respectively. Our code will be available for\nresearch purposes.\n","authors":["Jingrui Ye","Zongkai Zhang","Yujiao Jiang","Qingmin Liao","Wenming Yang","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2404.08449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09415v1","updated":"2024-04-15T02:02:15Z","published":"2024-04-15T02:02:15Z","title":"A Review on Machine Learning Algorithms for Dust Aerosol Detection using\n Satellite Data","summary":" Dust storms are associated with certain respiratory illnesses across\ndifferent areas in the world. Researchers have devoted time and resources to\nstudy the elements surrounding dust storm phenomena. This paper reviews the\nefforts of those who have investigated dust aerosols using sensors onboard of\nsatellites using machine learning-based approaches. We have reviewed the most\ncommon issues revolving dust aerosol modeling using different datasets and\ndifferent sensors from a historical perspective. 
Our findings suggest that\nmulti-spectral approaches based on linear and non-linear combinations of\nspectral bands are some of the most successful for visualization and\nquantitative analysis; however, when researchers have leveraged machine\nlearning, performance has been improved and new opportunities to solve unique\nproblems arise.\n","authors":["Nurul Rafi","Pablo Rivas"],"pdf_url":"https://arxiv.org/pdf/2404.09415v1.pdf","comment":"The 23rd International Conference on Artificial Intelligence (ICAI\n 2021)"},{"id":"http://arxiv.org/abs/2404.09412v1","updated":"2024-04-15T01:58:54Z","published":"2024-04-15T01:58:54Z","title":"DeferredGS: Decoupled and Editable Gaussian Splatting with Deferred\n Shading","summary":" Reconstructing and editing 3D objects and scenes both play crucial roles in\ncomputer graphics and computer vision. Neural radiance fields (NeRFs) can\nachieve realistic reconstruction and editing results but suffer from\ninefficiency in rendering. Gaussian splatting significantly accelerates\nrendering by rasterizing Gaussian ellipsoids. However, Gaussian splatting\nutilizes a single Spherical Harmonic (SH) function to model both texture and\nlighting, limiting independent editing capabilities of these components.\nRecently, attempts have been made to decouple texture and lighting with the\nGaussian splatting representation but may fail to produce plausible geometry\nand decomposition results on reflective scenes. Additionally, the forward\nshading technique they employ introduces noticeable blending artifacts during\nrelighting, as the geometry attributes of Gaussians are optimized under the\noriginal illumination and may not be suitable for novel lighting conditions. To\naddress these issues, we introduce DeferredGS, a method for decoupling and\nediting the Gaussian splatting representation using deferred shading. To\nachieve successful decoupling, we model the illumination with a learnable\nenvironment map and define additional attributes such as texture parameters and\nnormal direction on Gaussians, where the normal is distilled from a jointly\ntrained signed distance function. More importantly, we apply deferred shading,\nresulting in more realistic relighting effects compared to previous methods.\nBoth qualitative and quantitative experiments demonstrate the superior\nperformance of DeferredGS in novel view synthesis and editing tasks.\n","authors":["Tong Wu","Jia-Mu Sun","Yu-Kun Lai","Yuewen Ma","Leif Kobbelt","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.09412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13313v2","updated":"2024-04-15T01:49:23Z","published":"2023-12-20T09:16:47Z","title":"ParamISP: Learned Forward and Inverse ISPs using Camera Parameters","summary":" RAW images are rarely shared mainly due to its excessive data size compared\nto their sRGB counterparts obtained by camera ISPs. Learning the forward and\ninverse processes of camera ISPs has been recently demonstrated, enabling\nphysically-meaningful RAW-level image processing on input sRGB images. However,\nexisting learning-based ISP methods fail to handle the large variations in the\nISP processes with respect to camera parameters such as ISO and exposure time,\nand have limitations when used for various applications. In this paper, we\npropose ParamISP, a learning-based method for forward and inverse conversion\nbetween sRGB and RAW images, that adopts a novel neural-network module to\nutilize camera parameters, which is dubbed as ParamNet. 
Given the camera\nparameters provided in the EXIF data, ParamNet converts them into a feature\nvector to control the ISP networks. Extensive experiments demonstrate that\nParamISP achieve superior RAW and sRGB reconstruction results compared to\nprevious methods and it can be effectively used for a variety of applications\nsuch as deblurring dataset synthesis, raw deblurring, HDR reconstruction, and\ncamera-to-camera transfer.\n","authors":["Woohyeok Kim","Geonu Kim","Junyong Lee","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2312.13313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09406v1","updated":"2024-04-15T01:47:44Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Broad-scale marine surveys performed by underwater vehicles significantly\nincrease the availability of coral reef imagery, however it is costly and\ntime-consuming for domain experts to label images. Point label propagation is\nan approach used to leverage existing image data labeled with sparse point\nlabels. The resulting augmented ground truth generated is then used to train a\nsemantic segmentation model. Here, we first demonstrate that recent advances in\nfoundation models enable generation of multi-species coral augmented ground\ntruth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN),\nwithout the need for any pre-training or custom-designed algorithms. For\nextremely sparsely labeled images, we propose a labeling regime based on\nhuman-in-the-loop principles, resulting in significant improvement in\nannotation efficiency: If only 5 point labels per image are available, our\nproposed human-in-the-loop approach improves on the state-of-the-art by 17.3%\nfor pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point\nlabels per image are available. Even if the human-in-the-loop labeling regime\nis not used, the denoised DINOv2 features with a KNN outperforms the prior\nstate-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points).\nWe also provide a detailed analysis of how point labeling style and the\nquantity of points per image affects the point label propagation quality and\nprovide general recommendations on maximizing point label efficiency.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v1.pdf","comment":"10 pages, 6 figures, an additional 4 pages of supplementary material"},{"id":"http://arxiv.org/abs/2307.11259v2","updated":"2024-04-15T01:31:57Z","published":"2023-07-20T22:35:27Z","title":"Investigating Low Data, Confidence Aware Image Prediction on Smooth\n Repetitive Videos using Gaussian Processes","summary":" The ability to predict future states is crucial to informed decision-making\nwhile interacting with dynamic environments. With cameras providing a prevalent\nand information-rich sensing modality, the problem of predicting future states\nfrom image sequences has garnered a lot of attention. Current state-of-the-art\nmethods typically train large parametric models for their predictions. Though\noften able to predict with accuracy these models often fail to provide\ninterpretable confidence metrics around their predictions. Additionally these\nmethods are reliant on the availability of large training datasets to converge\nto useful solutions. 
In this paper, we focus on the problem of predicting\nfuture images of an image sequence with interpretable confidence bounds from\nvery little training data. To approach this problem, we use non-parametric\nmodels to take a probabilistic approach to image prediction. We generate\nprobability distributions over sequentially predicted images, and propagate\nuncertainty through time to generate a confidence metric for our predictions.\nGaussian Processes are used for their data efficiency and ability to readily\nincorporate new training data online. Our methods predictions are evaluated on\na smooth fluid simulation environment. We showcase the capabilities of our\napproach on real world data by predicting pedestrian flows and weather patterns\nfrom satellite imagery.\n","authors":["Nikhil U. Shinde","Xiao Liang","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2307.11259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09401v1","updated":"2024-04-15T01:27:07Z","published":"2024-04-15T01:27:07Z","title":"Watermark-embedded Adversarial Examples for Copyright Protection against\n Diffusion Models","summary":" Diffusion Models (DMs) have shown remarkable capabilities in various\nimage-generation tasks. However, there are growing concerns that DMs could be\nused to imitate unauthorized creations and thus raise copyright issues. To\naddress this issue, we propose a novel framework that embeds personal\nwatermarks in the generation of adversarial examples. Such examples can force\nDMs to generate images with visible watermarks and prevent DMs from imitating\nunauthorized images. We construct a generator based on conditional adversarial\nnetworks and design three losses (adversarial loss, GAN loss, and perturbation\nloss) to generate adversarial examples that have subtle perturbation but can\neffectively attack DMs to prevent copyright violations. Training a generator\nfor a personal watermark by our method only requires 5-10 samples within 2-3\nminutes, and once the generator is trained, it can generate adversarial\nexamples with that watermark significantly fast (0.2s per image). We conduct\nextensive experiments in various conditional image-generation scenarios.\nCompared to existing methods that generate images with chaotic textures, our\nmethod adds visible watermarks on the generated images, which is a more\nstraightforward way to indicate copyright violations. We also observe that our\nadversarial examples exhibit good transferability across unknown generative\nmodels. Therefore, this work provides a simple yet powerful way to protect\ncopyright from DM-based imitation.\n","authors":["Peifei Zhu","Tsubasa Takahashi","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2404.09401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v3","updated":"2024-04-15T01:15:34Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. 
To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v3.pdf","comment":"Accepted by TPAMI (2024). arXiv admin note: text overlap with\n arXiv:2205.05076"},{"id":"http://arxiv.org/abs/2104.00170v3","updated":"2024-04-15T01:03:11Z","published":"2021-04-01T00:14:45Z","title":"Are Bias Mitigation Techniques for Deep Learning Effective?","summary":" A critical problem in deep learning is that systems learn inappropriate\nbiases, resulting in their inability to perform well on minority groups. This\nhas led to the creation of multiple algorithms that endeavor to mitigate bias.\nHowever, it is not clear how effective these methods are. This is because study\nprotocols differ among papers, systems are tested on datasets that fail to test\nmany forms of bias, and systems have access to hidden knowledge or are tuned\nspecifically to the test set. To address this, we introduce an improved\nevaluation protocol, sensible metrics, and a new dataset, which enables us to\nask and answer critical questions about bias mitigation algorithms. We evaluate\nseven state-of-the-art algorithms using the same network architecture and\nhyperparameter selection policy across three benchmark datasets. We introduce a\nnew dataset called Biased MNIST that enables assessment of robustness to\nmultiple bias sources. We use Biased MNIST and a visual question answering\n(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning\nto the test set distribution, we study robustness across different tuning\ndistributions, which is critical because for many applications the test\ndistribution may not be known during development. We find that algorithms\nexploit hidden biases, are unable to scale to multiple forms of bias, and are\nhighly sensitive to the choice of tuning set. Based on our findings, we implore\nthe community to adopt more rigorous assessment of future bias mitigation\nmethods. 
All data, code, and results are publicly available at:\nhttps://github.com/erobic/bias-mitigators.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2104.00170v3.pdf","comment":"WACV 2022"},{"id":"http://arxiv.org/abs/2404.09389v1","updated":"2024-04-15T00:19:47Z","published":"2024-04-15T00:19:47Z","title":"Masked and Shuffled Blind Spot Denoising for Real-World Images","summary":" We introduce a novel approach to single image denoising based on the Blind\nSpot Denoising principle, which we call MAsked and SHuffled Blind Spot\nDenoising (MASH). We focus on the case of correlated noise, which often plagues\nreal images. MASH is the result of a careful analysis to determine the\nrelationships between the level of blindness (masking) of the input and the\n(unknown) noise correlation. Moreover, we introduce a shuffling technique to\nweaken the local correlation of noise, which in turn yields an additional\ndenoising performance improvement. We evaluate MASH via extensive experiments\non real-world noisy image datasets. We demonstrate on par or better results\ncompared to existing self-supervised denoising methods.\n","authors":["Hamadi Chihaoui","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2404.09389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09387v1","updated":"2024-04-15T00:12:27Z","published":"2024-04-15T00:12:27Z","title":"RankCLIP: Ranking-Consistent Language-Image Pretraining","summary":" Among the ever-evolving development of vision-language models, contrastive\nlanguage-image pretraining (CLIP) has set new benchmarks in many downstream\ntasks such as zero-shot classifications by leveraging self-supervised\ncontrastive learning on large amounts of text-image pairs. However, its\ndependency on rigid one-to-one mappings overlooks the complex and often\nmultifaceted relationships between and within texts and images. To this end, we\nintroduce RankCLIP, a novel pretraining method that extends beyond the rigid\none-to-one matching framework of CLIP and its variants. By leveraging both\nin-modal and cross-modal ranking consistency, RankCLIP improves the alignment\nprocess, enabling it to capture the nuanced many-to-many relationships between\nand within each modality. Through comprehensive experiments, we demonstrate the\nenhanced capability of RankCLIP to effectively improve performance across\nvarious downstream tasks, notably achieving significant gains in zero-shot\nclassifications over state-of-the-art methods, underscoring the potential of\nRankCLIP in further advancing vision-language pretraining.\n","authors":["Yiming Zhang","Zhuokai Zhao","Zhaorun Chen","Zhili Feng","Zenghui Ding","Yining Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09387v1.pdf","comment":"10 pages, 3 figures, 6 tables. Code and model checkpoints are\n available at https://github.com/Jam1ezhang/RankCLIP"},{"id":"http://arxiv.org/abs/2404.08419v2","updated":"2024-04-15T15:30:32Z","published":"2024-04-12T12:08:06Z","title":"Direct May Not Be the Best: An Incremental Evolution View of Pose\n Generation","summary":" Pose diversity is an inherent representative characteristic of 2D images. Due\nto the 3D to 2D projection mechanism, there is evident content discrepancy\namong distinct pose images. This is the main obstacle bothering pose\ntransformation related researches. To deal with this challenge, we propose a\nfine-grained incremental evolution centered pose generation framework, rather\nthan traditional direct one-to-one in a rush. 
Since proposed approach actually\nbypasses the theoretical difficulty of directly modeling dramatic non-linear\nvariation, the incurred content distortion and blurring could be effectively\nconstrained, at the same time the various individual pose details, especially\nclothes texture, could be precisely maintained. In order to systematically\nguide the evolution course, both global and incremental evolution constraints\nare elaborately designed and merged into the overall framework. And a novel\ntriple-path knowledge fusion structure is worked out to take full advantage of\nall available valuable knowledge to conduct high-quality pose synthesis. In\naddition, our framework could generate a series of valuable byproducts, namely\nthe various intermediate poses. Extensive experiments have been conducted to\nverify the effectiveness of the proposed approach. Code is available at\nhttps://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.\n","authors":["Yuelong Li","Tengfei Xiao","Lei Geng","Jianming Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08419v2.pdf","comment":"Accepted at AAAI2024"},{"id":"http://arxiv.org/abs/2404.00722v4","updated":"2024-04-15T17:53:44Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based approaches for low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing non-local information. In the domain of\nsuper-resolution, Swin-transformer-based models have become mainstream due to\ntheir capability of global spatial information modeling and their\nshifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced model\nperformance by expanding the receptive fields or designing meticulous networks,\nyielding commendable results. However, we observed that it is a general\nphenomenon for the feature map intensity to be abruptly suppressed to small\nvalues towards the network's end. This implies an information bottleneck and a\ndiminishment of spatial information, implicitly limiting the model's potential.\nTo address this, we propose the Dense-residual-connected Transformer (DRCT),\naimed at mitigating the loss of spatial information and stabilizing the\ninformation flow through dense-residual connections between layers, thereby\nunleashing the model's potential and saving the model away from information\nbottleneck. Experiment results indicate that our approach surpasses\nstate-of-the-art methods on benchmark datasets and performs commendably at the\nNTIRE-2024 Image Super-Resolution (x4) Challenge. Our source code is available\nat https://github.com/ming053l/DRCT\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v4.pdf","comment":"Camera-ready version, NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2310.17347v3","updated":"2024-04-15T23:52:11Z","published":"2023-10-26T12:27:56Z","title":"CADS: Unleashing the Diversity of Diffusion Models through\n Condition-Annealed Sampling","summary":" While conditional diffusion models are known to have good coverage of the\ndata distribution, they still face limitations in output diversity,\nparticularly when sampled with a high classifier-free guidance scale for\noptimal image quality or when trained on small datasets. 
We attribute this\nproblem to the role of the conditioning signal in inference and offer an\nimproved sampling strategy for diffusion models that can increase generation\ndiversity, especially at high guidance scales, with minimal loss of sample\nquality. Our sampling strategy anneals the conditioning signal by adding\nscheduled, monotonically decreasing Gaussian noise to the conditioning vector\nduring inference to balance diversity and condition alignment. Our\nCondition-Annealed Diffusion Sampler (CADS) can be used with any pretrained\nmodel and sampling algorithm, and we show that it boosts the diversity of\ndiffusion models in various conditional generation tasks. Further, using an\nexisting pretrained diffusion model, CADS achieves a new state-of-the-art FID\nof 1.70 and 2.31 for class-conditional ImageNet generation at 256$\\times$256\nand 512$\\times$512 respectively.\n","authors":["Seyedmorteza Sadat","Jakob Buhmann","Derek Bradley","Otmar Hilliges","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2310.17347v3.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.19481v3","updated":"2024-04-15T23:37:46Z","published":"2024-02-29T18:59:58Z","title":"DistriFusion: Distributed Parallel Inference for High-Resolution\n Diffusion Models","summary":" Diffusion models have achieved great success in synthesizing high-quality\nimages. However, generating high-resolution images with diffusion models is\nstill challenging due to the enormous computational costs, resulting in a\nprohibitive latency for interactive applications. In this paper, we propose\nDistriFusion to tackle this problem by leveraging parallelism across multiple\nGPUs. Our method splits the model input into multiple patches and assigns each\npatch to a GPU. However, naively implementing such an algorithm breaks the\ninteraction between patches and loses fidelity, while incorporating such an\ninteraction will incur tremendous communication overhead. To overcome this\ndilemma, we observe the high similarity between the input from adjacent\ndiffusion steps and propose displaced patch parallelism, which takes advantage\nof the sequential nature of the diffusion process by reusing the pre-computed\nfeature maps from the previous timestep to provide context for the current\nstep. Therefore, our method supports asynchronous communication, which can be\npipelined by computation. Extensive experiments show that our method can be\napplied to recent Stable Diffusion XL with no quality degradation and achieve\nup to a 6.1$\\times$ speedup on eight NVIDIA A100s compared to one. Our code is\npublicly available at https://github.com/mit-han-lab/distrifuser.\n","authors":["Muyang Li","Tianle Cai","Jiaxin Cao","Qinsheng Zhang","Han Cai","Junjie Bai","Yangqing Jia","Ming-Yu Liu","Kai Li","Song Han"],"pdf_url":"https://arxiv.org/pdf/2402.19481v3.pdf","comment":"CVPR 2024 Highlight Code: https://github.com/mit-han-lab/distrifuser\n Website: https://hanlab.mit.edu/projects/distrifusion Blog:\n https://hanlab.mit.edu/blog/distrifusion"},{"id":"http://arxiv.org/abs/2311.13602v4","updated":"2024-04-15T23:29:51Z","published":"2023-11-22T18:59:53Z","title":"Retrieval-Augmented Layout Transformer for Content-Aware Layout\n Generation","summary":" Content-aware graphic layout generation aims to automatically arrange visual\nelements along with a given content, such as an e-commerce product image. 
In\nthis paper, we argue that the current layout generation approaches suffer from\nthe limited training data for the high-dimensional layout structure. We show\nthat a simple retrieval augmentation can significantly improve the generation\nquality. Our model, which is named Retrieval-Augmented Layout Transformer\n(RALF), retrieves nearest neighbor layout examples based on an input image and\nfeeds these results into an autoregressive generator. Our model can apply\nretrieval augmentation to various controllable generation tasks and yield\nhigh-quality layouts within a unified architecture. Our extensive experiments\nshow that RALF successfully generates content-aware layouts in both constrained\nand unconstrained settings and significantly outperforms the baselines.\n","authors":["Daichi Horita","Naoto Inoue","Kotaro Kikuchi","Kota Yamaguchi","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2311.13602v4.pdf","comment":"Accepted to CVPR 2024 (Oral), Project website:\n https://udonda.github.io/RALF/ , GitHub:\n https://github.com/CyberAgentAILab/RALF"},{"id":"http://arxiv.org/abs/2404.10178v1","updated":"2024-04-15T23:23:31Z","published":"2024-04-15T23:23:31Z","title":"CryoMAE: Few-Shot Cryo-EM Particle Picking with Masked Autoencoders","summary":" Cryo-electron microscopy (cryo-EM) emerges as a pivotal technology for\ndetermining the architecture of cells, viruses, and protein assemblies at\nnear-atomic resolution. Traditional particle picking, a key step in cryo-EM,\nstruggles with manual effort and automated methods' sensitivity to low\nsignal-to-noise ratio (SNR) and varied particle orientations. Furthermore,\nexisting neural network (NN)-based approaches often require extensive labeled\ndatasets, limiting their practicality. To overcome these obstacles, we\nintroduce cryoMAE, a novel approach based on few-shot learning that harnesses\nthe capabilities of Masked Autoencoders (MAE) to enable efficient selection of\nsingle particles in cryo-EM images. Contrary to conventional NN-based\ntechniques, cryoMAE requires only a minimal set of positive particle images for\ntraining yet demonstrates high performance in particle detection. Furthermore,\nthe implementation of a self-cross similarity loss ensures distinct features\nfor particle and background regions, thereby enhancing the discrimination\ncapability of cryoMAE. Experiments on large-scale cryo-EM datasets show that\ncryoMAE outperforms existing state-of-the-art (SOTA) methods, improving 3D\nreconstruction resolution by up to 22.4%.\n","authors":["Chentianye Xu","Xueying Zhan","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10175v1","updated":"2024-04-15T23:06:58Z","published":"2024-04-15T23:06:58Z","title":"PD-L1 Classification of Weakly-Labeled Whole Slide Images of Breast\n Cancer","summary":" Specific and effective breast cancer therapy relies on the accurate\nquantification of PD-L1 positivity in tumors, which appears in the form of\nbrown stainings in high resolution whole slide images (WSIs). However, the\nretrieval and extensive labeling of PD-L1 stained WSIs is a time-consuming and\nchallenging task for pathologists, resulting in low reproducibility, especially\nfor borderline images. This study aims to develop and compare models able to\nclassify PD-L1 positivity of breast cancer samples based on WSI analysis,\nrelying only on WSI-level labels. 
The task consists of two phases: identifying\nregions of interest (ROI) and classifying tumors as PD-L1 positive or negative.\nFor the latter, two model categories were developed, with different feature\nextraction methodologies. The first encodes images based on the colour distance\nfrom a base color. The second uses a convolutional autoencoder to obtain\nembeddings of WSI tiles, and aggregates them into a WSI-level embedding. For\nboth model types, features are fed into downstream ML classifiers. Two datasets\nfrom different clinical centers were used in two different training\nconfigurations: (1) training on one dataset and testing on the other; (2)\ncombining the datasets. We also tested the performance with or without human\npreprocessing to remove brown artefacts Colour distance based models achieve\nthe best performances on testing configuration (1) with artefact removal, while\nautoencoder-based models are superior in the remaining cases, which are prone\nto greater data variability.\n","authors":["Giacomo Cignoni","Cristian Scatena","Chiara Frascarelli","Nicola Fusco","Antonio Giuseppe Naccarato","Giuseppe Nicoló Fanelli","Alina Sîrbu"],"pdf_url":"https://arxiv.org/pdf/2404.10175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10172v1","updated":"2024-04-15T23:01:59Z","published":"2024-04-15T23:01:59Z","title":"Forensic Iris Image-Based Post-Mortem Interval Estimation","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup. One factor that may be useful in\nconditioning iris recognition methods is the tissue decomposition level, which\nis correlated with the post-mortem interval (PMI), i.g., the number of hours\nthat have elapsed since death. PMI, however, is not always available, and its\nprecise estimation remains one of the core challenges in forensic examination.\nThis paper presents the first known to us method of PMI estimation directly\nfrom forensic iris images. To assess the feasibility of the iris-based PMI\nestimation, convolutional neural networks-based models (VGG19, DenseNet121,\nResNet152, and Inception_v3) were trained to predict the PMI from (a)\nnear-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris\nimages. Models were evaluated following a 10-fold cross-validation in (S1)\nsample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We\nfound that using the multispectral data offers a spectacularly low mean\nabsolute error (MAE) of approximately 3.5 hours in scenario (S1), a bit worse\nMAE of approximately 17.5 hours in scenario (S2), and an MAE of approximately\n69.0 hours of in the scenario (S3). This suggests that if the environmental\nconditions are favorable (e.g., bodies are kept in low temperatures), forensic\niris images provide features that are indicative of the PMI and can be\nautomatically estimated. The source codes and model weights are made available\nwith the paper.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.10172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10170v1","updated":"2024-04-15T22:49:37Z","published":"2024-04-15T22:49:37Z","title":"High-Resolution Detection of Earth Structural Heterogeneities from\n Seismic Amplitudes using Convolutional Neural Networks with Attention layers","summary":" Earth structural heterogeneities have a remarkable role in the petroleum\neconomy for both exploration and production projects. 
Automatic detection of\ndetailed structural heterogeneities is challenging when considering modern\nmachine learning techniques like deep neural networks. Typically, these\ntechniques can be an excellent tool for assisted interpretation of such\nheterogeneities, but it heavily depends on the amount of data to be trained.\n We propose an efficient and cost-effective architecture for detecting seismic\nstructural heterogeneities using Convolutional Neural Networks (CNNs) combined\nwith Attention layers. The attention mechanism reduces costs and enhances\naccuracy, even in cases with relatively noisy data. Our model has half the\nparameters compared to the state-of-the-art, and it outperforms previous\nmethods in terms of Intersection over Union (IoU) by 0.6% and precision by\n0.4%. By leveraging synthetic data, we apply transfer learning to train and\nfine-tune the model, addressing the challenge of limited annotated data\navailability.\n","authors":["Luiz Schirmer","Guilherme Schardong","Vinícius da Silva","Rogério Santos","Hélio Lopes"],"pdf_url":"https://arxiv.org/pdf/2404.10170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06394v2","updated":"2024-04-15T22:47:09Z","published":"2023-11-10T20:50:36Z","title":"A design of Convolutional Neural Network model for the Diagnosis of the\n COVID-19","summary":" With the spread of COVID-19 around the globe over the past year, the usage of\nartificial intelligence (AI) algorithms and image processing methods to analyze\nthe X-ray images of patients' chest with COVID-19 has become essential. The\nCOVID-19 virus recognition in the lung area of a patient is one of the basic\nand essential needs of clicical centers and hospitals. Most research in this\nfield has been devoted to papers on the basis of deep learning methods\nutilizing CNNs (Convolutional Neural Network), which mainly deal with the\nscreening of sick and healthy people.In this study, a new structure of a\n19-layer CNN has been recommended for accurately recognition of the COVID-19\nfrom the X-ray pictures of chest. The offered CNN is developed to serve as a\nprecise diagnosis system for a three class (viral pneumonia, Normal, COVID) and\na four classclassification (Lung opacity, Normal, COVID-19, and pneumonia). A\ncomparison is conducted among the outcomes of the offered procedure and some\npopular pretrained networks, including Inception, Alexnet, ResNet50,\nSqueezenet, and VGG19 and based on Specificity, Accuracy, Precision,\nSensitivity, Confusion Matrix, and F1-score. The experimental results of the\noffered CNN method specify its dominance over the existing published\nprocedures. This method can be a useful tool for clinicians in deciding\nproperly about COVID-19.\n","authors":["Xinyuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.06394v2.pdf","comment":"Important mistakes. Also, another author has contributed some to the\n revised version. So it is not appropriate for it to be with only my name"},{"id":"http://arxiv.org/abs/2404.10166v1","updated":"2024-04-15T22:32:50Z","published":"2024-04-15T22:32:50Z","title":"Self-Supervised Learning Featuring Small-Scale Image Dataset for\n Treatable Retinal Diseases Classification","summary":" Automated medical diagnosis through image-based neural networks has increased\nin popularity and matured over years. 
Nevertheless, it is confined by the\nscarcity of medical images and the expensive labor annotation costs.\nSelf-Supervised Learning (SSL) is an good alternative to Transfer Learning (TL)\nand is suitable for imbalanced image datasets. In this study, we assess four\npretrained SSL models and two TL models in treatable retinal diseases\nclassification using small-scale Optical Coherence Tomography (OCT) images\nranging from 125 to 4000 with balanced or imbalanced distribution for training.\nThe proposed SSL model achieves the state-of-art accuracy of 98.84% using only\n4,000 training images. Our results suggest the SSL models provide superior\nperformance under both the balanced and imbalanced training scenarios. The SSL\nmodel with MoCo-v2 scheme has consistent good performance under the imbalanced\nscenario and, especially, surpasses the other models when the training set is\nless than 500 images.\n","authors":["Luffina C. Huang","Darren J. Chiu","Manish Mehta"],"pdf_url":"https://arxiv.org/pdf/2404.10166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10163v1","updated":"2024-04-15T22:26:27Z","published":"2024-04-15T22:26:27Z","title":"EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided\n Reinforcement Learning","summary":" From a visual perception perspective, modern graphical user interfaces (GUIs)\ncomprise a complex graphics-rich two-dimensional visuospatial arrangement of\ntext, images, and interactive objects such as buttons and menus. While existing\nmodels can accurately predict regions and objects that are likely to attract\nattention ``on average'', so far there is no scanpath model capable of\npredicting scanpaths for an individual. To close this gap, we introduce\nEyeFormer, which leverages a Transformer architecture as a policy network to\nguide a deep reinforcement learning algorithm that controls gaze locations. Our\nmodel has the unique capability of producing personalized predictions when\ngiven a few user scanpath samples. It can predict full scanpath information,\nincluding fixation positions and duration, across individuals and various\nstimulus types. Additionally, we demonstrate applications in GUI layout\noptimization driven by our model. Our software and models will be publicly\navailable.\n","authors":["Yue Jiang","Zixin Guo","Hamed Rezazadegan Tavakoli","Luis A. Leiva","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.10163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02687v3","updated":"2024-04-15T22:13:39Z","published":"2022-12-06T01:10:31Z","title":"Vision Transformer Computation and Resilience for Dynamic Inference","summary":" State-of-the-art deep learning models for computer vision tasks are based on\nthe transformer architecture and often deployed in real-time applications. In\nthis scenario, the resources available for every inference can vary, so it is\nuseful to be able to dynamically adapt execution to trade accuracy for\nefficiency. To create dynamic models, we leverage the resilience of vision\ntransformers to pruning and switch between different scaled versions of a\nmodel. Surprisingly, we find that most FLOPs are generated by convolutions, not\nattention. These relative FLOP counts are not a good predictor of GPU\nperformance since GPUs have special optimizations for convolutions. Some models\nare fairly resilient and their model execution can be adapted without\nretraining, while all models achieve better accuracy with retraining\nalternative execution paths. 
These insights mean that we can leverage CNN\naccelerators and these alternative execution paths to enable efficient and\ndynamic vision transformer inference. Our analysis shows that leveraging this\ntype of dynamic execution can lead to saving 28\\% of energy with a 1.4\\%\naccuracy drop for SegFormer (63 GFLOPs), with no additional training, and 53\\%\nof energy for ResNet-50 (4 GFLOPs) with a 3.3\\% accuracy drop by switching\nbetween pretrained Once-For-All models.\n","authors":["Kavya Sreedhar","Jason Clemons","Rangharajan Venkatesan","Stephen W. Keckler","Mark Horowitz"],"pdf_url":"https://arxiv.org/pdf/2212.02687v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10157v1","updated":"2024-04-15T22:13:35Z","published":"2024-04-15T22:13:35Z","title":"Salient Object-Aware Background Generation using Text-Guided Diffusion\n Models","summary":" Generating background scenes for salient objects plays a crucial role across\nvarious domains including creative design and e-commerce, as it enhances the\npresentation and context of subjects by integrating them into tailored\nenvironments. Background generation can be framed as a task of text-conditioned\noutpainting, where the goal is to extend image content beyond a salient\nobject's boundaries on a blank background. Although popular diffusion models\nfor text-guided inpainting can also be used for outpainting by mask inversion,\nthey are trained to fill in missing parts of an image rather than to place an\nobject into a scene. Consequently, when used for background creation,\ninpainting models frequently extend the salient object's boundaries and thereby\nchange the object's identity, which is a phenomenon we call \"object expansion.\"\nThis paper introduces a model for adapting inpainting diffusion models to the\nsalient object outpainting task using Stable Diffusion and ControlNet\narchitectures. We present a series of qualitative and quantitative results\nacross models and datasets, including a newly proposed metric to measure object\nexpansion that does not require any human labeling. Compared to Stable\nDiffusion 2.0 Inpainting, our proposed approach reduces object expansion by\n3.6x on average with no degradation in standard visual metrics across multiple\ndatasets.\n","authors":["Amir Erfan Eshratifar","Joao V. B. Soares","Kapil Thadani","Shaunak Mishra","Mikhail Kuznetsov","Yueh-Ning Ku","Paloma de Juan"],"pdf_url":"https://arxiv.org/pdf/2404.10157v1.pdf","comment":"Accepted for publication at CVPR 2024's Generative Models for\n Computer Vision workshop"},{"id":"http://arxiv.org/abs/2404.10156v1","updated":"2024-04-15T22:12:05Z","published":"2024-04-15T22:12:05Z","title":"SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation","summary":" The adoption of Vision Transformers (ViTs) based architectures represents a\nsignificant advancement in 3D Medical Image (MI) segmentation, surpassing\ntraditional Convolutional Neural Network (CNN) models by enhancing global\ncontextual understanding. While this paradigm shift has significantly enhanced\n3D segmentation performance, state-of-the-art architectures require extremely\nlarge and complex architectures with large scale computing resources for\ntraining and deployment. Furthermore, in the context of limited datasets, often\nencountered in medical imaging, larger models can present hurdles in both model\ngeneralization and convergence. 
In response to these challenges and to\ndemonstrate that lightweight models are a valuable area of research in 3D\nmedical imaging, we present SegFormer3D, a hierarchical Transformer that\ncalculates attention across multiscale volumetric features. Additionally,\nSegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate\nlocal and global attention features to produce highly accurate segmentation\nmasks. The proposed memory efficient Transformer preserves the performance\ncharacteristics of a significantly larger model in a compact design.\nSegFormer3D democratizes deep learning for 3D medical image segmentation by\noffering a model with 33x less parameters and a 13x reduction in GFLOPS\ncompared to the current state-of-the-art (SOTA). We benchmark SegFormer3D\nagainst the current SOTA models on three widely used datasets Synapse, BRaTs,\nand ACDC, achieving competitive results. Code:\nhttps://github.com/OSUPCVLab/SegFormer3D.git\n","authors":["Shehan Perera","Pouyan Navard","Alper Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2404.10156v1.pdf","comment":"Accepted at CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.10147v1","updated":"2024-04-15T21:33:45Z","published":"2024-04-15T21:33:45Z","title":"Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban\n Crime Dynamics","summary":" This study addresses the challenge of urban safety in New York City by\nexamining the relationship between the built environment and crime rates using\nmachine learning and a comprehensive dataset of street view im- ages. We aim to\nidentify how urban landscapes correlate with crime statistics, focusing on the\ncharacteristics of street views and their association with crime rates. The\nfindings offer insights for urban planning and crime pre- vention, highlighting\nthe potential of environmental de- sign in enhancing public safety.\n","authors":["Zhixuan Qi","Huaiying Luo","Chen Chi"],"pdf_url":"https://arxiv.org/pdf/2404.10147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10146v1","updated":"2024-04-15T21:30:50Z","published":"2024-04-15T21:30:50Z","title":"Cross-Modal Self-Training: Aligning Images and Pointclouds to Learn\n Classification without Labels","summary":" Large-scale vision 2D vision language models, such as CLIP can be aligned\nwith a 3D encoder to learn generalizable (open-vocabulary) 3D vision models.\nHowever, current methods require supervised pre-training for such alignment,\nand the performance of such 3D zero-shot models remains sub-optimal for\nreal-world adaptation. In this work, we propose an optimization framework:\nCross-MoST: Cross-Modal Self-Training, to improve the label-free classification\nperformance of a zero-shot 3D vision model by simply leveraging unlabeled 3D\ndata and their accompanying 2D views. We propose a student-teacher framework to\nsimultaneously process 2D views and 3D point clouds and generate joint pseudo\nlabels to train a classifier and guide cross-model feature alignment. Thereby\nwe demonstrate that 2D vision language models such as CLIP can be used to\ncomplement 3D representation learning to improve classification performance\nwithout the need for expensive class annotations. 
Using synthetic and\nreal-world 3D datasets, we further demonstrate that Cross-MoST enables\nefficient cross-modal knowledge exchange resulting in both image and point\ncloud modalities learning from each other's rich representations.\n","authors":["Amaya Dharmasiri","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.10146v1.pdf","comment":"To be published in Workshop for Learning 3D with Multi-View\n Supervision (3DMV) at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10141v1","updated":"2024-04-15T21:19:10Z","published":"2024-04-15T21:19:10Z","title":"ANCHOR: LLM-driven News Subject Conditioning for Text-to-Image Synthesis","summary":" Text-to-Image (T2I) Synthesis has made tremendous strides in enhancing\nsynthesized image quality, but current datasets evaluate model performance only\non descriptive, instruction-based prompts. Real-world news image captions take\na more pragmatic approach, providing high-level situational and Named-Entity\n(NE) information and limited physical object descriptions, making them\nabstractive. To evaluate the ability of T2I models to capture intended subjects\nfrom news captions, we introduce the Abstractive News Captions with High-level\ncOntext Representation (ANCHOR) dataset, containing 70K+ samples sourced from 5\ndifferent news media organizations. With Large Language Models (LLM) achieving\nsuccess in language and commonsense reasoning tasks, we explore the ability of\ndifferent LLMs to identify and understand key subjects from abstractive\ncaptions. Our proposed method Subject-Aware Finetuning (SAFE), selects and\nenhances the representation of key subjects in synthesized images by leveraging\nLLM-generated subject weights. It also adapts to the domain distribution of\nnews images and captions through custom Domain Fine-tuning, outperforming\ncurrent T2I baselines on ANCHOR. By launching the ANCHOR dataset, we hope to\nmotivate research in furthering the Natural Language Understanding (NLU)\ncapabilities of T2I models.\n","authors":["Aashish Anantha Ramakrishnan","Sharon X. Huang","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10141v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.06129v2","updated":"2024-04-15T21:10:37Z","published":"2024-01-11T18:59:53Z","title":"Distilling Vision-Language Models on Millions of Videos","summary":" The recent advance in vision-language models is largely attributed to the\nabundance of image-text data. We aim to replicate this success for\nvideo-language models, but there simply is not enough human-curated video-text\ndata available. We thus resort to fine-tuning a video-language model from a\nstrong image-language baseline with synthesized instructional data. The\nresulting video model by video-instruction-tuning (VIIT) is then used to\nauto-label millions of videos to generate high-quality captions. We show the\nadapted video-language model performs well on a wide range of video-language\nbenchmarks. For instance, it surpasses the best prior result on open-ended\nNExT-QA by 2.8%. Besides, our model generates detailed descriptions for\npreviously unseen videos, which provide better textual supervision than\nexisting methods. Experiments show that a video-language dual-encoder model\ncontrastively trained on these auto-generated captions is 3.8% better than the\nstrongest baseline that also leverages vision-language models. Our best model\noutperforms state-of-the-art methods on MSR-VTT zero-shot text-to-video\nretrieval by 6%. 
As a side product, we generate the largest video caption\ndataset to date.\n","authors":["Yue Zhao","Long Zhao","Xingyi Zhou","Jialin Wu","Chun-Te Chu","Hui Miao","Florian Schroff","Hartwig Adam","Ting Liu","Boqing Gong","Philipp Krähenbühl","Liangzhe Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.06129v2.pdf","comment":"CVPR 2024. Project page:\n https://zhaoyue-zephyrus.github.io/video-instruction-tuning"},{"id":"http://arxiv.org/abs/2403.15977v2","updated":"2024-04-15T21:08:05Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v2.pdf","comment":"Accepted for publication at IEEE Transactions on Cognitive and\n Developmental Systems (IEEE TCDS), 18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.10133v1","updated":"2024-04-15T20:48:33Z","published":"2024-04-15T20:48:33Z","title":"WB LUTs: Contrastive Learning for White Balancing Lookup Tables","summary":" Automatic white balancing (AWB), one of the first steps in an integrated\nsignal processing (ISP) pipeline, aims to correct the color cast induced by the\nscene illuminant. An incorrect white balance (WB) setting or AWB failure can\nlead to an undesired blue or red tint in the rendered sRGB image. To address\nthis, recent methods pose the post-capture WB correction problem as an\nimage-to-image translation task and train deep neural networks to learn the\nnecessary color adjustments at a lower resolution. These low resolution outputs\nare post-processed to generate high resolution WB corrected images, forming a\nbottleneck in the end-to-end run time. In this paper we present a 3D Lookup\nTable (LUT) based WB correction model called WB LUTs that can generate high\nresolution outputs in real time. 
We introduce a contrastive learning framework\nwith a novel hard sample mining strategy, which improves the WB correction\nquality of baseline 3D LUTs by 25.5%. Experimental results demonstrate that the\nproposed WB LUTs perform competitively against state-of-the-art models on two\nbenchmark datasets while being 300 times faster using 12.7 times less memory.\nOur model and code are available at https://github.com/skrmanne/3DLUT_sRGB_WB.\n","authors":["Sai Kumar Reddy Manne","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2404.10133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10130v1","updated":"2024-04-15T20:37:52Z","published":"2024-04-15T20:37:52Z","title":"NOISe: Nuclei-Aware Osteoclast Instance Segmentation for Mouse-to-Human\n Domain Transfer","summary":" Osteoclast cell image analysis plays a key role in osteoporosis research, but\nit typically involves extensive manual image processing and hand annotations by\na trained expert. In the last few years, a handful of machine learning\napproaches for osteoclast image analysis have been developed, but none have\naddressed the full instance segmentation task required to produce the same\noutput as that of the human expert led process. Furthermore, none of the prior,\nfully automated algorithms have publicly available code, pretrained models, or\nannotated datasets, inhibiting reproduction and extension of their work. We\npresent a new dataset with ~2*10^5 expert annotated mouse osteoclast masks,\ntogether with a deep learning instance segmentation method which works for both\nin vitro mouse osteoclast cells on plastic tissue culture plates and human\nosteoclast cells on bone chips. To our knowledge, this is the first work to\nautomate the full osteoclast instance segmentation task. Our method achieves a\nperformance of 0.82 mAP_0.5 (mean average precision at intersection-over-union\nthreshold of 0.5) in cross validation for mouse osteoclasts. We present a novel\nnuclei-aware osteoclast instance segmentation training strategy (NOISe) based\non the unique biology of osteoclasts, to improve the model's generalizability\nand boost the mAP_0.5 from 0.60 to 0.82 on human osteoclasts. We publish our\nannotated mouse osteoclast image dataset, instance segmentation models, and\ncode at github.com/michaelwwan/noise to enable reproducibility and to provide a\npublic tool to accelerate osteoporosis research.\n","authors":["Sai Kumar Reddy Manne","Brendan Martin","Tyler Roy","Ryan Neilson","Rebecca Peters","Meghana Chillara","Christine W. Lary","Katherine J. Motyl","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2404.10130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v4","updated":"2024-04-15T20:35:03Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNet: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. 
ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v4.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.10124v1","updated":"2024-04-15T20:21:05Z","published":"2024-04-15T20:21:05Z","title":"Epistemic Uncertainty Quantification For Pre-trained Neural Network","summary":" Epistemic uncertainty quantification (UQ) identifies where models lack\nknowledge. Traditional UQ methods, often based on Bayesian neural networks, are\nnot suitable for pre-trained non-Bayesian models. Our study addresses\nquantifying epistemic uncertainty for any pre-trained model, which does not\nneed the original training data or model modifications and can ensure broad\napplicability regardless of network architectures or training techniques.\nSpecifically, we propose a gradient-based approach to assess epistemic\nuncertainty, analyzing the gradients of outputs relative to model parameters,\nand thereby indicating necessary model adjustments to accurately represent the\ninputs. We first explore theoretical guarantees of gradient-based methods for\nepistemic UQ, questioning the view that this uncertainty is only calculable\nthrough differences between multiple models. We further improve gradient-driven\nUQ by using class-specific weights for integrating gradients and emphasizing\ndistinct contributions from neural network layers. Additionally, we enhance UQ\naccuracy by combining gradient and perturbation methods to refine the\ngradients. We evaluate our approach on out-of-distribution detection,\nuncertainty calibration, and active learning, demonstrating its superiority\nover current state-of-the-art UQ methods for pre-trained models.\n","authors":["Hanjing Wang","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.10124v1.pdf","comment":"Published at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10108v1","updated":"2024-04-15T19:43:16Z","published":"2024-04-15T19:43:16Z","title":"GeoAI Reproducibility and Replicability: a computational and spatial\n perspective","summary":" GeoAI has emerged as an exciting interdisciplinary research area that\ncombines spatial theories and data with cutting-edge AI models to address\ngeospatial problems in a novel, data-driven manner. While GeoAI research has\nflourished in the GIScience literature, its reproducibility and replicability\n(R&R), fundamental principles that determine the reusability, reliability, and\nscientific rigor of research findings, have rarely been discussed. This paper\naims to provide an in-depth analysis of this topic from both computational and\nspatial perspectives. 
We first categorize the major goals for reproducing GeoAI\nresearch, namely, validation (repeatability), learning and adapting the method\nfor solving a similar or new problem (reproducibility), and examining the\ngeneralizability of the research findings (replicability). Each of these goals\nrequires different levels of understanding of GeoAI, as well as different\nmethods to ensure its success. We then discuss the factors that may cause the\nlack of R&R in GeoAI research, with an emphasis on (1) the selection and use of\ntraining data; (2) the uncertainty that resides in the GeoAI model design,\ntraining, deployment, and inference processes; and more importantly (3) the\ninherent spatial heterogeneity of geospatial data and processes. We use a deep\nlearning-based image analysis task as an example to demonstrate the results'\nuncertainty and spatial variance caused by different factors. The findings\nreiterate the importance of knowledge sharing, as well as the generation of a\n\"replicability map\" that incorporates spatial autocorrelation and spatial\nheterogeneity into consideration in quantifying the spatial replicability of\nGeoAI research.\n","authors":["Wenwen Lia","Chia-Yu Hsu","Sizhe Wang","Peter Kedron"],"pdf_url":"https://arxiv.org/pdf/2404.10108v1.pdf","comment":"Accepted by Annals of the American Association of Geographers"},{"id":"http://arxiv.org/abs/2004.05704v3","updated":"2024-04-15T19:09:39Z","published":"2020-04-12T21:45:23Z","title":"Visual Grounding Methods for VQA are Working for the Wrong Reasons!","summary":" Existing Visual Question Answering (VQA) methods tend to exploit dataset\nbiases and spurious statistical correlations, instead of producing right\nanswers for the right reasons. To address this issue, recent bias mitigation\nmethods for VQA propose to incorporate visual cues (e.g., human attention maps)\nto better ground the VQA models, showcasing impressive gains. However, we show\nthat the performance improvements are not a result of improved visual\ngrounding, but a regularization effect which prevents over-fitting to\nlinguistic priors. For instance, we find that it is not actually necessary to\nprovide proper, human-based cues; random, insensible cues also result in\nsimilar improvements. Based on this observation, we propose a simpler\nregularization scheme that does not require any external annotations and yet\nachieves near state-of-the-art performance on VQA-CPv2.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2004.05704v3.pdf","comment":"ACL 2020"},{"id":"http://arxiv.org/abs/2402.10021v2","updated":"2024-04-15T19:07:07Z","published":"2024-02-15T15:39:46Z","title":"SAWEC: Sensing-Assisted Wireless Edge Computing","summary":" Emerging mobile virtual reality (VR) systems will require to continuously\nperform complex computer vision tasks on ultra-high-resolution video frames\nthrough the execution of deep neural networks (DNNs)-based algorithms. Since\nstate-of-the-art DNNs require computational power that is excessive for mobile\ndevices, techniques based on wireless edge computing (WEC) have been recently\nproposed. However, existing WEC methods require the transmission and processing\nof a high amount of video data which may ultimately saturate the wireless link.\nIn this paper, we propose a novel Sensing-Assisted Wireless Edge Computing\n(SAWEC) paradigm to address this issue. 
SAWEC leverages knowledge about the\nphysical environment to reduce the end-to-end latency and overall computational\nburden by transmitting to the edge server only the relevant data for the\ndelivery of the service. Our intuition is that the transmission of the portion\nof the video frames where there are no changes with respect to previous frames\ncan be avoided. Specifically, we leverage wireless sensing techniques to\nestimate the location of objects in the environment and obtain insights about\nthe environment dynamics. Hence, only the part of the frames where any\nenvironmental change is detected is transmitted and processed. We evaluated\nSAWEC by using a 10K 360$^{\\circ}$ with a Wi-Fi 6 sensing system operating at\n160 MHz and performing localization and tracking. We considered instance\nsegmentation and object detection as benchmarking tasks for performance\nevaluation. We carried out experiments in an anechoic chamber and an entrance\nhall with two human subjects in six different setups. Experimental results show\nthat SAWEC reduces both the channel occupation and end-to-end latency by more\nthan 90% while improving the instance segmentation and object detection\nperformance with respect to state-of-the-art WEC approaches.\n","authors":["Khandaker Foysal Haque","Francesca Meneghello","Md. Ebtidaul Karim","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2402.10021v2.pdf","comment":"Submitted to ACM for possible publication"},{"id":"http://arxiv.org/abs/2404.10096v1","updated":"2024-04-15T19:06:58Z","published":"2024-04-15T19:06:58Z","title":"Vision Augmentation Prediction Autoencoder with Attention Design\n (VAPAAD)","summary":" Despite significant advancements in sequence prediction, current methods lack\nattention-based mechanisms for next-frame prediction. Our work introduces\nVAPAAD or Vision Augmentation Prediction Autoencoder with Attention Design, an\ninnovative model that enhances predictive performance by integrating attention\ndesigns, allowing for nuanced understanding and handling of temporal dynamics\nin video sequences. We demonstrate using the famous Moving MNIST dataset the\nrobust performance of the proposed model and potential applicability of such\ndesign in the literature.\n","authors":["Yiqiao Yin"],"pdf_url":"https://arxiv.org/pdf/2404.10096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07329v2","updated":"2024-04-15T18:54:10Z","published":"2024-02-11T23:39:33Z","title":"The Bias of Harmful Label Associations in Vision-Language Models","summary":" Despite the remarkable performance of foundation vision-language models, the\nshared representation space for text and vision can also encode harmful label\nassociations detrimental to fairness. While prior work has uncovered bias in\nvision-language models' (VLMs) classification performance across geography,\nwork has been limited along the important axis of harmful label associations\ndue to a lack of rich, labeled data. In this work, we investigate harmful label\nassociations in the recently released Casual Conversations datasets containing\nmore than 70,000 videos. We study bias in the frequency of harmful label\nassociations across self-provided labels for age, gender, apparent skin tone,\nand physical adornments across several leading VLMs. We find that VLMs are\n$4-7$x more likely to harmfully classify individuals with darker skin tones. We\nalso find scaling transformer encoder model size leads to higher confidence in\nharmful predictions. 
Finally, we find improvements on standard vision tasks\nacross VLMs does not address disparities in harmful label associations.\n","authors":["Caner Hazirbas","Alicia Sun","Yonathan Efroni","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2402.07329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10078v1","updated":"2024-04-15T18:32:52Z","published":"2024-04-15T18:32:52Z","title":"Low-Light Image Enhancement Framework for Improved Object Detection in\n Fisheye Lens Datasets","summary":" This study addresses the evolving challenges in urban traffic monitoring\ndetection systems based on fisheye lens cameras by proposing a framework that\nimproves the efficacy and accuracy of these systems. In the context of urban\ninfrastructure and transportation management, advanced traffic monitoring\nsystems have become critical for managing the complexities of urbanization and\nincreasing vehicle density. Traditional monitoring methods, which rely on\nstatic cameras with narrow fields of view, are ineffective in dynamic urban\nenvironments, necessitating the installation of multiple cameras, which raises\ncosts. Fisheye lenses, which were recently introduced, provide wide and\nomnidirectional coverage in a single frame, making them a transformative\nsolution. However, issues such as distorted views and blurriness arise,\npreventing accurate object detection on these images. Motivated by these\nchallenges, this study proposes a novel approach that combines a\nransformer-based image enhancement framework and ensemble learning technique to\naddress these challenges and improve traffic monitoring accuracy, making\nsignificant contributions to the future of intelligent traffic management\nsystems. Our proposed methodological framework won 5th place in the 2024 AI\nCity Challenge, Track 4, with an F1 score of 0.5965 on experimental validation\ndata. The experimental results demonstrate the effectiveness, efficiency, and\nrobustness of the proposed system. Our code is publicly available at\nhttps://github.com/daitranskku/AIC2024-TRACK4-TEAM15.\n","authors":["Dai Quoc Tran","Armstrong Aboah","Yuntae Jeon","Maged Shoman","Minsoo Park","Seunghee Park"],"pdf_url":"https://arxiv.org/pdf/2404.10078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10073v1","updated":"2024-04-15T18:26:03Z","published":"2024-04-15T18:26:03Z","title":"Explainable Light-Weight Deep Learning Pipeline for Improved Drought\n Stres","summary":" Early identification of drought stress in crops is vital for implementing\neffective mitigation measures and reducing yield loss. Non-invasive imaging\ntechniques hold immense potential by capturing subtle physiological changes in\nplants under water deficit. Sensor based imaging data serves as a rich source\nof information for machine learning and deep learning algorithms, facilitating\nfurther analysis aimed at identifying drought stress. While these approaches\nyield favorable results, real-time field applications requires algorithms\nspecifically designed for the complexities of natural agricultural conditions.\nOur work proposes a novel deep learning framework for classifying drought\nstress in potato crops captured by UAVs in natural settings. The novelty lies\nin the synergistic combination of a pretrained network with carefully designed\ncustom layers. This architecture leverages feature extraction capabilities of\nthe pre-trained network while the custom layers enable targeted dimensionality\nreduction and enhanced regularization, ultimately leading to improved\nperformance. 
A key innovation of our work involves the integration of\nGradient-Class Activation Mapping (Grad-CAM), an explainability technique.\nGrad-CAM sheds light on the internal workings of the deep learning model,\ntypically referred to as a black box. By visualizing the focus areas of the\nmodel within the images, Grad-CAM fosters interpretability and builds trust in\nthe decision-making process of the model. Our proposed framework achieves\nsuperior performance, particularly with the DenseNet121 pre-trained network,\nreaching a precision of 98% to identify the stressed class with an overall\naccuracy of 90%. Comparative analysis of existing state-of-the-art object\ndetection algorithms reveals the superiority of our approach in significantly\nhigher precision and accuracy.\n","authors":["Aswini Kumar Patra","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2404.10073v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.11443v2","updated":"2024-04-15T18:11:29Z","published":"2023-05-19T05:50:24Z","title":"Equivariant Multi-Modality Image Fusion","summary":" Multi-modality image fusion is a technique that combines information from\ndifferent sensors or modalities, enabling the fused image to retain\ncomplementary features from each modality, such as functional highlights and\ntexture details. However, effective training of such fusion models is\nchallenging due to the scarcity of ground truth fusion data. To tackle this\nissue, we propose the Equivariant Multi-Modality imAge fusion (EMMA) paradigm\nfor end-to-end self-supervised learning. Our approach is rooted in the prior\nknowledge that natural imaging responses are equivariant to certain\ntransformations. Consequently, we introduce a novel training paradigm that\nencompasses a fusion module, a pseudo-sensing module, and an equivariant fusion\nmodule. These components enable the net training to follow the principles of\nthe natural sensing-imaging process while satisfying the equivariant imaging\nprior. Extensive experiments confirm that EMMA yields high-quality fusion\nresults for infrared-visible and medical images, concurrently facilitating\ndownstream multi-modal segmentation and detection tasks. The code is available\nat https://github.com/Zhaozixiang1228/MMIF-EMMA.\n","authors":["Zixiang Zhao","Haowen Bai","Jiangshe Zhang","Yulun Zhang","Kai Zhang","Shuang Xu","Dongdong Chen","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2305.11443v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.06627v4","updated":"2024-04-15T18:03:26Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. 
Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10054v1","updated":"2024-04-15T18:00:30Z","published":"2024-04-15T18:00:30Z","title":"AIGeN: An Adversarial Approach for Instruction Generation in VLN","summary":" In the last few years, the research interest in Vision-and-Language\nNavigation (VLN) has grown significantly. VLN is a challenging task that\ninvolves an agent following human instructions and navigating in a previously\nunknown environment to reach a specified goal. Recent work in literature\nfocuses on different ways to augment the available datasets of instructions for\nimproving navigation performance by exploiting synthetic training data. In this\nwork, we propose AIGeN, a novel architecture inspired by Generative Adversarial\nNetworks (GANs) that produces meaningful and well-formed synthetic instructions\nto improve navigation agents' performance. The model is composed of a\nTransformer decoder (GPT-2) and a Transformer encoder (BERT). During the\ntraining phase, the decoder generates sentences for a sequence of images\ndescribing the agent's path to a particular point while the encoder\ndiscriminates between real and fake instructions. Experimentally, we evaluate\nthe quality of the generated instructions and perform extensive ablation\nstudies. Additionally, we generate synthetic instructions for 217K trajectories\nusing AIGeN on Habitat-Matterport 3D Dataset (HM3D) and show an improvement in\nthe performance of an off-the-shelf VLN method. The validation analysis of our\nproposal is conducted on REVERIE and R2R and highlights the promising aspects\nof our proposal, achieving state-of-the-art performance.\n","authors":["Niyati Rawal","Roberto Bigazzi","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.10054v1.pdf","comment":"Accepted to 7th Multimodal Learning and Applications Workshop (MULA\n 2024) at the IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024"},{"id":"http://arxiv.org/abs/2404.09995v1","updated":"2024-04-15T17:59:57Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. 
To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v1.pdf","comment":"Project page: https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2404.09993v1","updated":"2024-04-15T17:59:56Z","published":"2024-04-15T17:59:56Z","title":"No More Ambiguity in 360° Room Layout via Bi-Layout Estimation","summary":" Inherent ambiguity in layout annotations poses significant challenges to\ndeveloping accurate 360{\\deg} room layout estimation models. To address this\nissue, we propose a novel Bi-Layout model capable of predicting two distinct\nlayout types. One stops at ambiguous regions, while the other extends to\nencompass all visible areas. Our model employs two global context embeddings,\nwhere each embedding is designed to capture specific contextual information for\neach layout type. With our novel feature guidance module, the image feature\nretrieves relevant context from these embeddings, generating layout-aware\nfeatures for precise bi-layout predictions. A unique property of our Bi-Layout\nmodel is its ability to inherently detect ambiguous regions by comparing the\ntwo predictions. To circumvent the need for manual correction of ambiguous\nannotations during testing, we also introduce a new metric for disambiguating\nground truth layouts. Our method demonstrates superior performance on benchmark\ndatasets, notably outperforming leading approaches. Specifically, on the\nMatterportLayout dataset, it improves 3DIoU from 81.70% to 82.57% across the\nfull test set and notably from 54.80% to 59.97% in subsets with significant\nambiguity. Project page: https://liagm.github.io/Bi_Layout/\n","authors":["Yu-Ju Tsai","Jin-Cheng Jhang","Jingjing Zheng","Wei Wang","Albert Y. C. Chen","Min Sun","Cheng-Hao Kuo","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.09993v1.pdf","comment":"CVPR 2024, Project page: https://liagm.github.io/Bi_Layout/"},{"id":"http://arxiv.org/abs/2404.09992v1","updated":"2024-04-15T17:59:50Z","published":"2024-04-15T17:59:50Z","title":"MMInA: Benchmarking Multihop Multimodal Internet Agents","summary":" Autonomous embodied agents live on an Internet of multimedia websites. Can\nthey hop around multimodal websites to complete complex user tasks? Existing\nbenchmarks fail to assess them in a realistic, evolving environment for their\nembodiment across websites. To answer this question, we present MMInA, a\nmultihop and multimodal benchmark to evaluate the embodied agents for\ncompositional Internet tasks, with several appealing properties: 1) Evolving\nreal-world multimodal websites. Our benchmark uniquely operates on evolving\nreal-world websites, ensuring a high degree of realism and applicability to\nnatural user tasks. Our data includes 1,050 human-written tasks covering\nvarious domains such as shopping and travel, with each task requiring the agent\nto autonomously extract multimodal information from web pages as observations;\n2) Multihop web browsing. 
Our dataset features naturally compositional tasks\nthat require information from or actions on multiple websites to solve, to\nassess long-range reasoning capabilities on web tasks; 3) Holistic evaluation.\nWe propose a novel protocol for evaluating an agent's progress in completing\nmultihop tasks. We experiment with both standalone (multimodal) language models\nand heuristic-based web agents. Extensive experiments demonstrate that while\nlong-chain multihop web tasks are easy for humans, they remain challenging for\nstate-of-the-art web agents. We identify that agents are more likely to fail on\nthe early hops when solving tasks of more hops, which results in lower task\nsuccess rates. To address this issue, we propose a simple memory augmentation\napproach replaying past action trajectories to reflect. Our method\nsignificantly improved both the single-hop and multihop web browsing abilities\nof agents. See our code and data at https://mmina.cliangyu.com\n","authors":["Ziniu Zhang","Shulin Tian","Liangyu Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09991v1","updated":"2024-04-15T17:59:47Z","published":"2024-04-15T17:59:47Z","title":"EgoPet: Egomotion and Interaction Data from an Animal's Perspective","summary":" Animals perceive the world to plan their actions and interact with other\nagents to accomplish complex tasks, demonstrating capabilities that are still\nunmatched by AI systems. To advance our understanding and reduce the gap\nbetween the capabilities of animals and AI systems, we introduce a dataset of\npet egomotion imagery with diverse examples of simultaneous egomotion and\nmulti-agent interaction. Current video datasets separately contain egomotion\nand interaction examples, but rarely both at the same time. In addition, EgoPet\noffers a radically distinct perspective from existing egocentric datasets of\nhumans or vehicles. We define two in-domain benchmark tasks that capture animal\nbehavior, and a third benchmark to assess the utility of EgoPet as a\npretraining resource to robotic quadruped locomotion, showing that models\ntrained from EgoPet outperform those trained from prior datasets.\n","authors":["Amir Bar","Arya Bakhtiar","Danny Tran","Antonio Loquercio","Jathushan Rajasegaran","Yann LeCun","Amir Globerson","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2404.09991v1.pdf","comment":"https://www.amirbar.net/egopet"},{"id":"http://arxiv.org/abs/2404.09990v1","updated":"2024-04-15T17:59:31Z","published":"2024-04-15T17:59:31Z","title":"HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing","summary":" This study introduces HQ-Edit, a high-quality instruction-based image editing\ndataset with around 200,000 edits. Unlike prior approaches relying on attribute\nguidance or human feedback on building datasets, we devise a scalable data\ncollection pipeline leveraging advanced foundation models, namely GPT-4V and\nDALL-E 3. To ensure its high quality, diverse examples are first collected\nonline, expanded, and then used to create high-quality diptychs featuring input\nand output images with detailed text prompts, followed by precise alignment\nensured through post-processing. In addition, we propose two evaluation\nmetrics, Alignment and Coherence, to quantitatively assess the quality of image\nedit pairs using GPT-4V. 
HQ-Edits high-resolution images, rich in detail and\naccompanied by comprehensive editing prompts, substantially enhance the\ncapabilities of existing image editing models. For example, an HQ-Edit\nfinetuned InstructPix2Pix can attain state-of-the-art image editing\nperformance, even surpassing those models fine-tuned with human-annotated data.\nThe project page is https://thefllood.github.io/HQEdit_web.\n","authors":["Mude Hui","Siwei Yang","Bingchen Zhao","Yichun Shi","Heng Wang","Peng Wang","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2404.09990v1.pdf","comment":"Project Page: https://thefllood.github.io/HQEdit_web"},{"id":"http://arxiv.org/abs/2404.09988v1","updated":"2024-04-15T17:59:04Z","published":"2024-04-15T17:59:04Z","title":"in2IN: Leveraging individual Information to Generate Human INteractions","summary":" Generating human-human motion interactions conditioned on textual\ndescriptions is a very useful application in many areas such as robotics,\ngaming, animation, and the metaverse. Alongside this utility also comes a great\ndifficulty in modeling the highly dimensional inter-personal dynamics. In\naddition, properly capturing the intra-personal diversity of interactions has a\nlot of challenges. Current methods generate interactions with limited diversity\nof intra-person dynamics due to the limitations of the available datasets and\nconditioning strategies. For this, we introduce in2IN, a novel diffusion model\nfor human-human motion generation which is conditioned not only on the textual\ndescription of the overall interaction but also on the individual descriptions\nof the actions performed by each person involved in the interaction. To train\nthis model, we use a large language model to extend the InterHuman dataset with\nindividual descriptions. As a result, in2IN achieves state-of-the-art\nperformance in the InterHuman dataset. Furthermore, in order to increase the\nintra-personal diversity on the existing interaction datasets, we propose\nDualMDM, a model composition technique that combines the motions generated with\nin2IN and the motions generated by a single-person motion prior pre-trained on\nHumanML3D. As a result, DualMDM generates motions with higher individual\ndiversity and improves control over the intra-person dynamics while maintaining\ninter-personal coherence.\n","authors":["Pablo Ruiz Ponce","German Barquero","Cristina Palmero","Sergio Escalera","Jose Garcia-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2404.09988v1.pdf","comment":"Project page: https://pabloruizponce.github.io/in2IN/"},{"id":"http://arxiv.org/abs/2404.09987v1","updated":"2024-04-15T17:58:57Z","published":"2024-04-15T17:58:57Z","title":"OneChart: Purify the Chart Structural Extraction via One Auxiliary Token","summary":" Chart parsing poses a significant challenge due to the diversity of styles,\nvalues, texts, and so forth. Even advanced large vision-language models (LVLMs)\nwith billions of parameters struggle to handle such tasks satisfactorily. To\naddress this, we propose OneChart: a reliable agent specifically devised for\nthe structural extraction of chart information. Similar to popular LVLMs,\nOneChart incorporates an autoregressive main body. Uniquely, to enhance the\nreliability of the numerical parts of the output, we introduce an auxiliary\ntoken placed at the beginning of the total tokens along with an additional\ndecoder. 
The numerically optimized (auxiliary) token allows subsequent tokens\nfor chart parsing to capture enhanced numerical features through causal\nattention. Furthermore, with the aid of the auxiliary token, we have devised a\nself-evaluation mechanism that enables the model to gauge the reliability of\nits chart parsing results by providing confidence scores for the generated\ncontent. Compared to current state-of-the-art (SOTA) chart parsing models,\ne.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average\nPrecision (AP) for chart structural extraction across multiple public\nbenchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart\nparsing agent, it also brings 10%+ accuracy gains for the popular LVLM\n(LLaVA-1.6) in the downstream ChartQA benchmark.\n","authors":["Jinyue Chen","Lingyu Kong","Haoran Wei","Chenglong Liu","Zheng Ge","Liang Zhao","Jianjian Sun","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09987v1.pdf","comment":"14 pages, 9 figures and 6 tables"},{"id":"http://arxiv.org/abs/2404.09979v1","updated":"2024-04-15T17:56:05Z","published":"2024-04-15T17:56:05Z","title":"One-Click Upgrade from 2D to 3D: Sandwiched RGB-D Video Compression for\n Stereoscopic Teleconferencing","summary":" Stereoscopic video conferencing is still challenging due to the need to\ncompress stereo RGB-D video in real-time. Though hardware implementations of\nstandard video codecs such as H.264 / AVC and HEVC are widely available, they\nare not designed for stereoscopic videos and suffer from reduced quality and\nperformance. Specific multiview or 3D extensions of these codecs are complex\nand lack efficient implementations. In this paper, we propose a new approach to\nupgrade a 2D video codec to support stereo RGB-D video compression, by wrapping\nit with a neural pre- and post-processor pair. The neural networks are\nend-to-end trained with an image codec proxy, and shown to work with a more\nsophisticated video codec. We also propose a geometry-aware loss function to\nimprove rendering quality. We train the neural pre- and post-processors on a\nsynthetic 4D people dataset, and evaluate it on both synthetic and\nreal-captured stereo RGB-D videos. Experimental results show that the neural\nnetworks generalize well to unseen data and work out-of-box with various video\ncodecs. Our approach saves about 30% bit-rate compared to a conventional video\ncoding scheme and MV-HEVC at the same level of rendering quality from a novel\nview, without the need of a task-specific hardware upgrade.\n","authors":["Yueyu Hu","Onur G. Guleryuz","Philip A. Chou","Danhang Tang","Jonathan Taylor","Rus Maxham","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09979v1.pdf","comment":"Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for\n Streaming https://ai4streaming-workshop.github.io )"},{"id":"http://arxiv.org/abs/2404.09977v1","updated":"2024-04-15T17:55:56Z","published":"2024-04-15T17:55:56Z","title":"MaxFusion: Plug&Play Multi-Modal Generation in Text-to-Image Diffusion\n Models","summary":" Large diffusion-based Text-to-Image (T2I) models have shown impressive\ngenerative powers for text-to-image generation as well as spatially conditioned\nimage generation. For most applications, we can train the model end-to-end with\npaired data to obtain photorealistic generation quality. However, to add an\nadditional task, one often needs to retrain the model from scratch using paired\ndata across all modalities to retain good generation performance. 
In this\npaper, we tackle this issue and propose a novel strategy to scale a generative\nmodel across new tasks with minimal compute. During our experiments, we\ndiscovered that the variance maps of intermediate feature maps of diffusion\nmodels capture the intensity of conditioning. Utilizing this prior information,\nwe propose MaxFusion, an efficient strategy to scale up text-to-image\ngeneration models to accommodate new modality conditions. Specifically, we\ncombine aligned features of multiple models, hence bringing a compositional\neffect. Our fusion strategy can be integrated into off-the-shelf models to\nenhance their generative prowess.\n","authors":["Nithin Gopalakrishnan Nair","Jeya Maria Jose Valanarasu","Vishal M Patel"],"pdf_url":"https://arxiv.org/pdf/2404.09977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09976v1","updated":"2024-04-15T17:55:43Z","published":"2024-04-15T17:55:43Z","title":"Diffscaler: Enhancing the Generative Prowess of Diffusion Transformers","summary":" Recently, diffusion transformers have gained wide attention with their\nexcellent performance in text-to-image and text-to-video models, emphasizing\nthe need for transformers as backbone for diffusion models. Transformer-based\nmodels have shown better generalization capability compared to CNN-based models\nfor general vision tasks. However, much less has been explored in the existing\nliterature regarding the capabilities of transformer-based diffusion backbones\nand expanding their generative prowess to other datasets. This paper focuses on\nenabling a single pre-trained diffusion transformer model to scale across\nmultiple datasets swiftly, allowing for the completion of diverse generative\ntasks using just one model. To this end, we propose DiffScaler, an efficient\nscaling strategy for diffusion models where we train a minimal amount of\nparameters to adapt to different tasks. In particular, we learn task-specific\ntransformations at each layer by incorporating the ability to utilize the\nlearned subspaces of the pre-trained model, as well as the ability to learn\nadditional task-specific subspaces, which may be absent in the pre-training\ndataset. As these parameters are independent, a single diffusion model with\nthese task-specific parameters can be used to perform multiple tasks\nsimultaneously. Moreover, we find that transformer-based diffusion models\nsignificantly outperform CNN-based diffusion models while performing\nfine-tuning over smaller datasets. We perform experiments on four unconditional\nimage generation datasets. We show that using our proposed method, a single\npre-trained model can scale up to perform these conditional and unconditional\ntasks, respectively, with minimal parameter tuning while performing as close as\nfine-tuning an entire diffusion model for that particular task.\n","authors":["Nithin Gopalakrishnan Nair","Jeya Maria Jose Valanarasu","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.09976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09967v1","updated":"2024-04-15T17:45:36Z","published":"2024-04-15T17:45:36Z","title":"Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse\n Controls to Any Diffusion Model","summary":" ControlNets are widely used for adding spatial control in image generation\nwith different conditions, such as depth maps, canny edges, and human poses.\nHowever, there are several challenges when leveraging the pretrained image\nControlNets for controlled video generation. 
First, pretrained ControlNet\ncannot be directly plugged into new backbone models due to the mismatch of\nfeature spaces, and the cost of training ControlNets for new backbones is a big\nburden. Second, ControlNet features for different frames might not effectively\nhandle the temporal consistency. To address these challenges, we introduce\nCtrl-Adapter, an efficient and versatile framework that adds diverse controls\nto any image/video diffusion models, by adapting pretrained ControlNets (and\nimproving temporal alignment for videos). Ctrl-Adapter provides diverse\ncapabilities including image control, video control, video control with sparse\nframes, multi-condition control, compatibility with different backbones,\nadaptation to unseen control conditions, and video editing. In Ctrl-Adapter, we\ntrain adapter layers that fuse pretrained ControlNet features to different\nimage/video diffusion models, while keeping the parameters of the ControlNets\nand the diffusion models frozen. Ctrl-Adapter consists of temporal and spatial\nmodules so that it can effectively handle the temporal consistency of videos.\nWe also propose latent skipping and inverse timestep sampling for robust\nadaptation and sparse control. Moreover, Ctrl-Adapter enables control from\nmultiple conditions by simply taking the (weighted) average of ControlNet\noutputs. With diverse image/video diffusion backbones (SDXL, Hotshot-XL,\nI2VGen-XL, and SVD), Ctrl-Adapter matches ControlNet for image control and\noutperforms all baselines for video control (achieving the SOTA accuracy on the\nDAVIS 2017 dataset) with significantly lower computational costs (less than 10\nGPU hours).\n","authors":["Han Lin","Jaemin Cho","Abhay Zala","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2404.09967v1.pdf","comment":"First two authors contributed equally; Project page:\n https://ctrl-adapter.github.io/"},{"id":"http://arxiv.org/abs/2404.09964v1","updated":"2024-04-15T17:40:23Z","published":"2024-04-15T17:40:23Z","title":"Design and Analysis of Efficient Attention in Transformers for Social\n Group Activity Recognition","summary":" Social group activity recognition is a challenging task extended from group\nactivity recognition, where social groups must be recognized with their\nactivities and group members. Existing methods tackle this task by leveraging\nregion features of individuals following existing group activity recognition\nmethods. However, the effectiveness of region features is susceptible to person\nlocalization and variable semantics of individual actions. To overcome these\nissues, we propose leveraging attention modules in transformers to generate\nsocial group features. In this method, multiple embeddings are used to\naggregate features for a social group, each of which is assigned to a group\nmember without duplication. Due to this non-duplicated assignment, the number\nof embeddings must be significant to avoid missing group members and thus\nrenders attention in transformers ineffective. To find optimal attention\ndesigns with a large number of embeddings, we explore several design choices of\nqueries for feature aggregation and self-attention modules in transformer\ndecoders. 
Extensive experimental results show that the proposed method achieves\nstate-of-the-art performance and verify that the proposed attention designs are\nhighly effective on social group activity recognition.\n","authors":["Masato Tamura"],"pdf_url":"https://arxiv.org/pdf/2404.09964v1.pdf","comment":"Accepted to IJCV, preprint version"},{"id":"http://arxiv.org/abs/2404.09961v1","updated":"2024-04-15T17:38:47Z","published":"2024-04-15T17:38:47Z","title":"Ti-Patch: Tiled Physical Adversarial Patch for no-reference video\n quality metrics","summary":" Objective no-reference image- and video-quality metrics are crucial in many\ncomputer vision tasks. However, state-of-the-art no-reference metrics have\nbecome learning-based and are vulnerable to adversarial attacks. The\nvulnerability of quality metrics imposes restrictions on using such metrics in\nquality control systems and comparing objective algorithms. Also, using\nvulnerable metrics as a loss for deep learning model training can mislead\ntraining to worsen visual quality. Because of that, quality metrics testing for\nvulnerability is a task of current interest. This paper proposes a new method\nfor testing quality metrics vulnerability in the physical space. To our\nknowledge, quality metrics were not previously tested for vulnerability to this\nattack; they were only tested in the pixel space. We applied a physical\nadversarial Ti-Patch (Tiled Patch) attack to quality metrics and did\nexperiments both in pixel and physical space. We also performed experiments on\nthe implementation of physical adversarial wallpaper. The proposed method can\nbe used as additional quality metrics in vulnerability evaluation,\ncomplementing traditional subjective comparison and vulnerability tests in the\npixel space. We made our code and adversarial videos available on GitHub:\nhttps://github.com/leonenkova/Ti-Patch.\n","authors":["Victoria Leonenkova","Ekaterina Shumitskaya","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.09961v1.pdf","comment":"Accepted to WAIT AINL 2024"},{"id":"http://arxiv.org/abs/2404.09957v1","updated":"2024-04-15T17:31:32Z","published":"2024-04-15T17:31:32Z","title":"How to build the best medical image segmentation algorithm using\n foundation models: a comprehensive empirical study with Segment Anything\n Model","summary":" Automated segmentation is a fundamental medical image analysis task, which\nenjoys significant advances due to the advent of deep learning. While\nfoundation models have been useful in natural language processing and some\nvision tasks for some time, the foundation model developed with image\nsegmentation in mind - Segment Anything Model (SAM) - has been developed only\nrecently and has shown similar promise. However, there are still no systematic\nanalyses or ``best-practice'' guidelines for optimal fine-tuning of SAM for\nmedical image segmentation. This work summarizes existing fine-tuning\nstrategies with various backbone architectures, model components, and\nfine-tuning algorithms across 18 combinations, and evaluates them on 17\ndatasets covering all common radiology modalities. 
Our study reveals that (1)\nfine-tuning SAM leads to slightly better performance than previous segmentation\nmethods, (2) fine-tuning strategies that use parameter-efficient learning in\nboth the encoder and decoder are superior to other strategies, (3) network\narchitecture has a small impact on final performance, (4) further training SAM\nwith self-supervised learning can improve final model performance. We also\ndemonstrate the ineffectiveness of some methods popular in the literature and\nfurther expand our experiments into few-shot and prompt-based settings. Lastly,\nwe released our code and MRI-specific fine-tuned weights, which consistently\nobtained superior performance over the original SAM, at\nhttps://github.com/mazurowski-lab/finetune-SAM.\n","authors":["Hanxue Gu","Haoyu Dong","Jichen Yang","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2404.09957v1.pdf","comment":"Code available at https://github.com/mazurowski-lab/finetune-SAM"},{"id":"http://arxiv.org/abs/2404.10034v1","updated":"2024-04-15T17:25:21Z","published":"2024-04-15T17:25:21Z","title":"Realistic Model Selection for Weakly Supervised Object Localization","summary":" Weakly Supervised Object Localization (WSOL) allows for training deep\nlearning models for classification and localization, using only global\nclass-level labels. The lack of bounding box (bbox) supervision during training\nrepresents a considerable challenge for hyper-parameter search and model\nselection. Earlier WSOL works implicitly observed localization performance over\na test set which leads to biased performance evaluation. More recently, a\nbetter WSOL protocol has been proposed, where a validation set with bbox\nannotations is held out for model selection. Although it does not rely on the\ntest set, this protocol is unrealistic since bboxes are not available in\nreal-world applications, and when available, it is better to use them directly\nto fit model weights. Our initial empirical analysis shows that the\nlocalization performance of a model declines significantly when using only\nimage-class labels for model selection (compared to using bounding-box\nannotations). This suggests that adding bounding-box labels is preferable for\nselecting the best model for localization. In this paper, we introduce a new\nWSOL validation protocol that provides a localization signal without the need\nfor manual bbox annotations. In particular, we leverage noisy pseudo boxes from\nan off-the-shelf ROI proposal generator such as Selective-Search, CLIP, and RPN\npretrained models for model selection. Our experimental results with several\nWSOL methods on ILSVRC and CUB-200-2011 datasets show that our noisy boxes\nallow selecting models with performance close to those selected using ground\ntruth boxes, and better than models selected using only image-class labels.\n","authors":["Shakeeb Murtaza","Soufiane Belharbi","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2404.10034v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.09951v1","updated":"2024-04-15T17:24:57Z","published":"2024-04-15T17:24:57Z","title":"Unifying Global and Local Scene Entities Modelling for Precise Action\n Spotting","summary":" Sports videos pose complex challenges, including cluttered backgrounds,\ncamera angle changes, small action-representing objects, and imbalanced action\nclass distribution. 
Existing methods for detecting actions in sports videos\nheavily rely on global features, utilizing a backbone network as a black box\nthat encompasses the entire spatial frame. However, these approaches tend to\noverlook the nuances of the scene and struggle with detecting actions that\noccupy a small portion of the frame. In particular, they face difficulties when\ndealing with action classes involving small objects, such as balls or\nyellow/red cards in soccer, which only occupy a fraction of the screen space.\nTo address these challenges, we introduce a novel approach that analyzes and\nmodels scene entities using an adaptive attention mechanism. Particularly, our\nmodel disentangles the scene content into the global environment feature and\nlocal relevant scene entities feature. To efficiently extract environmental\nfeatures while considering temporal information with less computational cost,\nwe propose the use of a 2D backbone network with a time-shift mechanism. To\naccurately capture relevant scene entities, we employ a Vision-Language model\nin conjunction with the adaptive attention mechanism. Our model has\ndemonstrated outstanding performance, securing the 1st place in the\nSoccerNet-v2 Action Spotting, FineDiving, and FineGym challenge with a\nsubstantial performance improvement of 1.6, 2.0, and 1.3 points in avg-mAP\ncompared to the runner-up methods. Furthermore, our approach offers\ninterpretability capabilities in contrast to other deep learning models, which\nare often designed as black boxes. Our code and models are released at:\nhttps://github.com/Fsoft-AIC/unifying-global-local-feature.\n","authors":["Kim Hoang Tran","Phuc Vuong Do","Ngoc Quoc Ly","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2404.09951v1.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.09942v1","updated":"2024-04-15T17:11:25Z","published":"2024-04-15T17:11:25Z","title":"Knowledge-enhanced Visual-Language Pretraining for Computational\n Pathology","summary":" In this paper, we consider the problem of visual representation learning for\ncomputational pathology, by exploiting large-scale image-text pairs gathered\nfrom public resources, along with the domain specific knowledge in pathology.\nSpecifically, we make the following contributions: (i) We curate a pathology\nknowledge tree that consists of 50,470 informative attributes for 4,718\ndiseases requiring pathology diagnosis from 32 human tissues. 
To our knowledge,\nthis is the first comprehensive structured pathology knowledge base; (ii) We\ndevelop a knowledge-enhanced visual-language pretraining approach, where we\nfirst project pathology-specific knowledge into latent embedding space via\nlanguage model, and use it to guide the visual representation learning; (iii)\nWe conduct thorough experiments to validate the effectiveness of our proposed\ncomponents, demonstrating significant performance improvement on various\ndownstream tasks, including cross-modal retrieval, zero-shot classification on\npathology patches, and zero-shot tumor subtyping on whole slide images (WSIs).\nAll codes, models and the pathology knowledge tree will be released to the\nresearch community\n","authors":["Xiao Zhou","Xiaoman Zhang","Chaoyi Wu","Ya Zhang","Weidi Xie","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09941v1","updated":"2024-04-15T17:09:53Z","published":"2024-04-15T17:09:53Z","title":"Evolving Interpretable Visual Classifiers with Large Language Models","summary":" Multimodal pre-trained models, such as CLIP, are popular for zero-shot\nclassification due to their open-vocabulary flexibility and high performance.\nHowever, vision-language models, which compute similarity scores between images\nand class labels, are largely black-box, with limited interpretability, risk\nfor bias, and inability to discover new visual concepts not written down.\nMoreover, in practical settings, the vocabulary for class names and attributes\nof specialized concepts will not be known, preventing these methods from\nperforming well on images uncommon in large-scale vision-language datasets. To\naddress these limitations, we present a novel method that discovers\ninterpretable yet discriminative sets of attributes for visual recognition. We\nintroduce an evolutionary search algorithm that uses a large language model and\nits in-context learning abilities to iteratively mutate a concept bottleneck of\nattributes for classification. Our method produces state-of-the-art,\ninterpretable fine-grained classifiers. We outperform the latest baselines by\n18.4% on five fine-grained iNaturalist datasets and by 22.2% on two KikiBouba\ndatasets, despite the baselines having access to privileged information about\nclass names.\n","authors":["Mia Chiquier","Utkarsh Mall","Carl Vondrick"],"pdf_url":"https://arxiv.org/pdf/2404.09941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09940v1","updated":"2024-04-15T17:08:53Z","published":"2024-04-15T17:08:53Z","title":"eMotion-GAN: A Motion-based GAN for Photorealistic and Facial Expression\n Preserving Frontal View Synthesis","summary":" Many existing facial expression recognition (FER) systems encounter\nsubstantial performance degradation when faced with variations in head pose.\nNumerous frontalization methods have been proposed to enhance these systems'\nperformance under such conditions. However, they often introduce undesirable\ndeformations, rendering them less suitable for precise facial expression\nanalysis. In this paper, we present eMotion-GAN, a novel deep learning approach\ndesigned for frontal view synthesis while preserving facial expressions within\nthe motion domain. Considering the motion induced by head variation as noise\nand the motion induced by facial expression as the relevant information, our\nmodel is trained to filter out the noisy motion in order to retain only the\nmotion related to facial expression. 
The filtered motion is then mapped onto a\nneutral frontal face to generate the corresponding expressive frontal face. We\nconducted extensive evaluations using several widely recognized dynamic FER\ndatasets, which encompass sequences exhibiting various degrees of head pose\nvariations in both intensity and orientation. Our results demonstrate the\neffectiveness of our approach in significantly reducing the FER performance gap\nbetween frontal and non-frontal faces. Specifically, we achieved a FER\nimprovement of up to +5\\% for small pose variations and up to +20\\% improvement\nfor larger pose variations. Code available at\n\\url{https://github.com/o-ikne/eMotion-GAN.git}.\n","authors":["Omar Ikne","Benjamin Allaert","Ioan Marius Bilasco","Hazem Wannous"],"pdf_url":"https://arxiv.org/pdf/2404.09940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02692v2","updated":"2024-04-15T17:01:23Z","published":"2023-10-04T10:03:07Z","title":"Clustering-based Image-Text Graph Matching for Domain Generalization","summary":" Learning domain-invariant visual representations is important to train a\nmodel that can generalize well to unseen target task domains. Recent works\ndemonstrate that text descriptions contain high-level class-discriminative\ninformation and such auxiliary semantic cues can be used as effective pivot\nembedding for domain generalization problem. However, they use pivot embedding\nin global manner (i.e., aligning an image embedding with sentence-level text\nembedding), not fully utilizing the semantic cues of given text description. In\nthis work, we advocate for the use of local alignment between image regions and\ncorresponding textual descriptions. To this end, we first represent image and\ntext inputs with graphs. We subsequently cluster nodes in those graphs and\nmatch the graph-based image node features into textual graphs. This matching\nprocess is conducted globally and locally, tightly aligning visual and textual\nsemantic sub-structures. We experiment with large-scale public datasets, such\nas CUB-DG and DomainBed, and our model achieves matched or better\nstate-of-the-art performance on these datasets. Our code will be publicly\navailable upon publication.\n","authors":["Nokyung Park","Daewon Chae","Jeongyong Shim","Sangpil Kim","Eun-Sol Kim","Jinkyu Kim"],"pdf_url":"https://arxiv.org/pdf/2310.02692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09933v1","updated":"2024-04-15T16:59:00Z","published":"2024-04-15T16:59:00Z","title":"HOI-Ref: Hand-Object Interaction Referral in Egocentric Vision","summary":" Large Vision Language Models (VLMs) are now the de facto state-of-the-art for\na number of tasks including visual question answering, recognising objects, and\nspatial referral. In this work, we propose the HOI-Ref task for egocentric\nimages that aims to understand interactions between hands and objects using\nVLMs. To enable HOI-Ref, we curate the HOI-QA dataset that consists of 3.9M\nquestion-answer pairs for training and evaluating VLMs. HOI-QA includes\nquestions relating to locating hands, objects, and critically their\ninteractions (e.g. referring to the object being manipulated by the hand). We\ntrain the first VLM for HOI-Ref on this dataset and call it VLM4HOI. Our\nresults demonstrate that VLMs trained for referral on third person images fail\nto recognise and refer hands and objects in egocentric images. 
When fine-tuned\non our egocentric HOI-QA dataset, performance improves by 27.9% for referring\nhands and objects, and by 26.7% for referring interactions.\n","authors":["Siddhant Bansal","Michael Wray","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.09933v1.pdf","comment":"Project Page: https://sid2697.github.io/hoi-ref/"},{"id":"http://arxiv.org/abs/2404.09931v1","updated":"2024-04-15T16:56:58Z","published":"2024-04-15T16:56:58Z","title":"Zero-shot detection of buildings in mobile LiDAR using Language Vision\n Model","summary":" Recent advances have demonstrated that Language Vision Models (LVMs) surpass\nthe existing State-of-the-Art (SOTA) in two-dimensional (2D) computer vision\ntasks, motivating attempts to apply LVMs to three-dimensional (3D) data. While\nLVMs are efficient and effective in addressing various downstream 2D vision\ntasks without training, they face significant challenges when it comes to point\nclouds, a representative format for representing 3D data. It is more difficult\nto extract features from 3D data and there are challenges due to large data\nsizes and the cost of the collection and labelling, resulting in a notably\nlimited availability of datasets. Moreover, constructing LVMs for point clouds\nis even more challenging due to the requirements for large amounts of data and\ntraining time. To address these issues, our research aims to 1) apply the\nGrounded SAM through Spherical Projection to transfer 3D to 2D, and 2)\nexperiment with synthetic data to evaluate its effectiveness in bridging the\ngap between synthetic and real-world data domains. Our approach exhibited high\nperformance with an accuracy of 0.96, an IoU of 0.85, precision of 0.92, recall\nof 0.91, and an F1 score of 0.92, confirming its potential. However, challenges\nsuch as occlusion problems and pixel-level overlaps of multi-label points\nduring spherical image generation remain to be addressed in future studies.\n","authors":["June Moh Goo","Zichao Zeng","Jan Boehm"],"pdf_url":"https://arxiv.org/pdf/2404.09931v1.pdf","comment":"7 pages, 6 figures, conference"},{"id":"http://arxiv.org/abs/2403.17192v2","updated":"2024-04-15T16:55:38Z","published":"2024-03-25T21:08:26Z","title":"Strategies to Improve Real-World Applicability of Laparoscopic Anatomy\n Segmentation Models","summary":" Accurate identification and localization of anatomical structures of varying\nsize and appearance in laparoscopic imaging are necessary to leverage the\npotential of computer vision techniques for surgical decision support.\nSegmentation performance of such models is traditionally reported using metrics\nof overlap such as IoU. However, imbalanced and unrealistic representation of\nclasses in the training data and suboptimal selection of reported metrics have\nthe potential to skew nominal segmentation performance and thereby ultimately\nlimit clinical translation. In this work, we systematically analyze the impact\nof class characteristics (i.e., organ size differences), training and test data\ncomposition (i.e., representation of positive and negative examples), and\nmodeling parameters (i.e., foreground-to-background class weight) on eight\nsegmentation metrics: accuracy, precision, recall, IoU, F1 score (Dice\nSimilarity Coefficient), specificity, Hausdorff Distance, and Average Symmetric\nSurface Distance. 
Our findings support two adjustments to account for data\nbiases in surgical data science: First, training on datasets that are similar\nto the clinical real-world scenarios in terms of class distribution, and\nsecond, class weight adjustments to optimize segmentation model performance\nwith regard to metrics of particular relevance in the respective clinical\nsetting.\n","authors":["Fiona R. Kolbinger","Jiangpeng He","Jinge Ma","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17192v2.pdf","comment":"14 pages, 5 figures, 4 tables; accepted for the workshop \"Data\n Curation and Augmentation in Medical Imaging\" at CVPR 2024 (archival track)"},{"id":"http://arxiv.org/abs/2404.09921v1","updated":"2024-04-15T16:47:22Z","published":"2024-04-15T16:47:22Z","title":"Zero-shot Building Age Classification from Facade Image Using GPT-4","summary":" A building's age of construction is crucial for supporting many geospatial\napplications. Much current research focuses on estimating building age from\nfacade images using deep learning. However, building an accurate deep learning\nmodel requires a considerable amount of labelled training data, and the trained\nmodels often have geographical constraints. Recently, large pre-trained vision\nlanguage models (VLMs) such as GPT-4 Vision, which demonstrate significant\ngeneralisation capabilities, have emerged as potential training-free tools for\ndealing with specific vision tasks, but their applicability and reliability for\nbuilding information remain unexplored. In this study, a zero-shot building age\nclassifier for facade images is developed using prompts that include logical\ninstructions. Taking London as a test case, we introduce a new dataset,\nFI-London, comprising facade images and building age epochs. Although the\ntraining-free classifier achieved a modest accuracy of 39.69%, the mean\nabsolute error of 0.85 decades indicates that the model can predict building\nage epochs successfully albeit with a small bias. The ensuing discussion\nreveals that the classifier struggles to predict the age of very old buildings\nand is challenged by fine-grained predictions within 2 decades. Overall, the\nclassifier utilising GPT-4 Vision is capable of predicting the rough age epoch\nof a building from a single facade image without any training.\n","authors":["Zichao Zeng","June Moh Goo","Xinglei Wang","Bin Chi","Meihui Wang","Jan Boehm"],"pdf_url":"https://arxiv.org/pdf/2404.09921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09918v1","updated":"2024-04-15T16:45:08Z","published":"2024-04-15T16:45:08Z","title":"EdgeRelight360: Text-Conditioned 360-Degree HDR Image Generation for\n Real-Time On-Device Video Portrait Relighting","summary":" In this paper, we present EdgeRelight360, an approach for real-time video\nportrait relighting on mobile devices, utilizing text-conditioned generation of\n360-degree high dynamic range image (HDRI) maps. Our method proposes a\ndiffusion-based text-to-360-degree image generation in the HDR domain, taking\nadvantage of the HDR10 standard. This technique facilitates the generation of\nhigh-quality, realistic lighting conditions from textual descriptions, offering\nflexibility and control in portrait video relighting task. Unlike the previous\nrelighting frameworks, our proposed system performs video relighting directly\non-device, enabling real-time inference with real 360-degree HDRI maps. 
This\non-device processing ensures both privacy and guarantees low runtime, providing\nan immediate response to changes in lighting conditions or user inputs. Our\napproach paves the way for new possibilities in real-time video applications,\nincluding video conferencing, gaming, and augmented reality, by allowing\ndynamic, text-based control of lighting conditions.\n","authors":["Min-Hui Lin","Mahesh Reddy","Guillaume Berger","Michel Sarkis","Fatih Porikli","Ning Bi"],"pdf_url":"https://arxiv.org/pdf/2404.09918v1.pdf","comment":"Camera-ready version (CVPR workshop - EDGE'24)"},{"id":"http://arxiv.org/abs/2311.01908v3","updated":"2024-04-15T16:43:57Z","published":"2023-11-03T13:38:42Z","title":"LLM-driven Multimodal Target Volume Contouring in Radiation Oncology","summary":" Target volume contouring for radiation therapy is considered significantly\nmore challenging than the normal organ segmentation tasks as it necessitates\nthe utilization of both image and text-based clinical information. Inspired by\nthe recent advancement of large language models (LLMs) that can facilitate the\nintegration of the textual information and images, here we present a novel\nLLM-driven multimodal AI, namely LLMSeg, that utilizes the clinical text\ninformation and is applicable to the challenging task of target volume\ncontouring for radiation therapy, and validate it within the context of breast\ncancer radiation therapy target volume contouring. Using external validation\nand data-insufficient environments, which are attributes highly conducive to\nreal-world applications, we demonstrate that the proposed model exhibits\nmarkedly improved performance compared to conventional unimodal AI models,\nparticularly exhibiting robust generalization performance and data efficiency.\nTo our best knowledge, this is the first LLM-driven multimodal AI model that\nintegrates the clinical text information into target volume delineation for\nradiation oncology.\n","authors":["Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Yeona Cho","Ik Jae Lee","Jin Sung Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.01908v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09917v1","updated":"2024-04-15T16:43:24Z","published":"2024-04-15T16:43:24Z","title":"Evaluating the Explainability of Attributes and Prototypes for a Medical\n Classification Model","summary":" Due to the sensitive nature of medicine, it is particularly important and\nhighly demanded that AI methods are explainable. This need has been recognised\nand there is great research interest in xAI solutions with medical\napplications. However, there is a lack of user-centred evaluation regarding the\nactual impact of the explanations. We evaluate attribute- and prototype-based\nexplanations with the Proto-Caps model. This xAI model reasons the target\nclassification with human-defined visual features of the target object in the\nform of scores and attribute-specific prototypes. The model thus provides a\nmultimodal explanation that is intuitively understandable to humans thanks to\npredefined attributes. A user study involving six radiologists shows that the\nexplanations are subjectively perceived as helpful, as they reflect their\ndecision-making process. The results of the model are considered a second\nopinion that radiologists can discuss using the model's explanations. However,\nit was shown that the inclusion and increased magnitude of model explanations\nobjectively can increase confidence in the model's predictions when the model\nis incorrect. 
We can conclude that attribute scores and visual prototypes\nenhance confidence in the model. However, additional development and repeated\nuser studies are needed to tailor the explanation to the respective use case.\n","authors":["Luisa Gallée","Catharina Silvia Lisson","Christoph Gerhard Lisson","Daniela Drees","Felix Weig","Daniel Vogele","Meinrad Beer","Michael Götz"],"pdf_url":"https://arxiv.org/pdf/2404.09917v1.pdf","comment":"Accepted at The 2nd World Conference on eXplainable Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2403.01505v2","updated":"2024-04-15T16:42:50Z","published":"2024-03-03T13:08:32Z","title":"SCott: Accelerating Diffusion Models with Stochastic Consistency\n Distillation","summary":" The iterative sampling procedure employed by diffusion models (DMs) often\nleads to significant inference latency. To address this, we propose Stochastic\nConsistency Distillation (SCott) to enable accelerated text-to-image\ngeneration, where high-quality generations can be achieved with just 1-2\nsampling steps, and further improvements can be obtained by adding additional\nsteps. In contrast to vanilla consistency distillation (CD) which distills the\nordinary differential equation solvers-based sampling process of a pretrained\nteacher model into a student, SCott explores the possibility and validates the\nefficacy of integrating stochastic differential equation (SDE) solvers into CD\nto fully unleash the potential of the teacher. SCott is augmented with\nelaborate strategies to control the noise strength and sampling process of the\nSDE solver. An adversarial loss is further incorporated to strengthen the\nsample quality with rare sampling steps. Empirically, on the MSCOCO-2017 5K\ndataset with a Stable Diffusion-V1.5 teacher, SCott achieves an FID (Frechet\nInception Distance) of 22.1, surpassing that (23.4) of the 1-step InstaFlow (Liu\net al., 2023) and matching that of 4-step UFOGen (Xue et al., 2023b). Moreover,\nSCott can yield more diverse samples than other consistency models for\nhigh-resolution image generation (Luo et al., 2023a), with up to 16%\nimprovement in a qualified metric. The code and checkpoints are coming soon.\n","authors":["Hongjian Liu","Qingsong Xie","Zhijie Deng","Chen Chen","Shixiang Tang","Fueyang Fu","Zheng-jun Zha","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.01505v2.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2401.17542v2","updated":"2024-04-15T16:33:38Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. 
To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04166v4","updated":"2024-04-15T16:31:05Z","published":"2023-06-07T05:36:45Z","title":"BAA-NGP: Bundle-Adjusting Accelerated Neural Graphics Primitives","summary":" Implicit neural representations have become pivotal in robotic perception,\nenabling robots to comprehend 3D environments from 2D images. Given a set of\ncamera poses and associated images, the models can be trained to synthesize\nnovel, unseen views. To successfully navigate and interact in dynamic settings,\nrobots require the understanding of their spatial surroundings driven by\nunassisted reconstruction of 3D scenes and camera poses from real-time video\nfootage. Existing approaches like COLMAP and bundle-adjusting neural radiance\nfield methods take hours to days to process due to the high computational\ndemands of feature matching, dense point sampling, and training of a\nmulti-layer perceptron structure with a large number of parameters. To address\nthese challenges, we propose a framework called bundle-adjusting accelerated\nneural graphics primitives (BAA-NGP) which leverages accelerated sampling and\nhash encoding to expedite automatic pose refinement/estimation and 3D scene\nreconstruction. Experimental results demonstrate 10 to 20 x speed improvement\ncompared to other bundle-adjusting neural radiance field methods without\nsacrificing the quality of pose estimation. The github repository can be found\nhere https://github.com/IntelLabs/baa-ngp.\n","authors":["Sainan Liu","Shan Lin","Jingpei Lu","Alexey Supikov","Michael Yip"],"pdf_url":"https://arxiv.org/pdf/2306.04166v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09886v1","updated":"2024-04-15T15:54:30Z","published":"2024-04-15T15:54:30Z","title":"ReffAKD: Resource-efficient Autoencoder-based Knowledge Distillation","summary":" In this research, we propose an innovative method to boost Knowledge\nDistillation efficiency without the need for resource-heavy teacher models.\nKnowledge Distillation trains a smaller ``student'' model with guidance from a\nlarger ``teacher'' model, which is computationally costly. However, the main\nbenefit comes from the soft labels provided by the teacher, helping the student\ngrasp nuanced class similarities. In our work, we propose an efficient method\nfor generating these soft labels, thereby eliminating the need for a large\nteacher model. We employ a compact autoencoder to extract essential features\nand calculate similarity scores between different classes. 
Afterward, we apply\nthe softmax function to these similarity scores to obtain a soft probability\nvector. This vector serves as valuable guidance during the training of the\nstudent model. Our extensive experiments on various datasets, including\nCIFAR-100, Tiny Imagenet, and Fashion MNIST, demonstrate the superior resource\nefficiency of our approach compared to traditional knowledge distillation\nmethods that rely on large teacher models. Importantly, our approach\nconsistently achieves similar or even superior performance in terms of model\naccuracy. We also perform a comparative study with various techniques recently\ndeveloped for knowledge distillation showing our approach achieves competitive\nperformance with using significantly less resources. We also show that our\napproach can be easily added to any logit based knowledge distillation method.\nThis research contributes to making knowledge distillation more accessible and\ncost-effective for practical applications, making it a promising avenue for\nimproving the efficiency of model training. The code for this work is available\nat, https://github.com/JEKimLab/ReffAKD.\n","authors":["Divyang Doshi","Jung-Eun Kim"],"pdf_url":"https://arxiv.org/pdf/2404.09886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09884v1","updated":"2024-04-15T15:53:23Z","published":"2024-04-15T15:53:23Z","title":"Map-Relative Pose Regression for Visual Re-Localization","summary":" Pose regression networks predict the camera pose of a query image relative to\na known environment. Within this family of methods, absolute pose regression\n(APR) has recently shown promising accuracy in the range of a few centimeters\nin position error. APR networks encode the scene geometry implicitly in their\nweights. To achieve high accuracy, they require vast amounts of training data\nthat, realistically, can only be created using novel view synthesis in a\ndays-long process. This process has to be repeated for each new scene again and\nagain. We present a new approach to pose regression, map-relative pose\nregression (marepo), that satisfies the data hunger of the pose regression\nnetwork in a scene-agnostic fashion. We condition the pose regressor on a\nscene-specific map representation such that its pose predictions are relative\nto the scene map. This allows us to train the pose regressor across hundreds of\nscenes to learn the generic relation between a scene-specific map\nrepresentation and the camera pose. Our map-relative pose regressor can be\napplied to new map representations immediately or after mere minutes of\nfine-tuning for the highest accuracy. Our approach outperforms previous pose\nregression methods by far on two public datasets, indoor and outdoor. Code is\navailable: https://nianticlabs.github.io/marepo\n","authors":["Shuai Chen","Tommaso Cavallari","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.09884v1.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)\n 2024, Highlight Paper"},{"id":"http://arxiv.org/abs/2308.04466v3","updated":"2024-04-15T15:52:41Z","published":"2023-08-08T05:46:47Z","title":"Backdoor Federated Learning by Poisoning Backdoor-Critical Layers","summary":" Federated learning (FL) has been widely deployed to enable machine learning\ntraining on sensitive data across distributed devices. However, the\ndecentralized learning paradigm and heterogeneity of FL further extend the\nattack surface for backdoor attacks. 
Existing FL attack and defense\nmethodologies typically focus on the whole model. None of them recognizes the\nexistence of backdoor-critical (BC) layers - a small subset of layers that\ndominate the model vulnerabilities. Attacking the BC layers achieves equivalent\neffects as attacking the whole model but at a far smaller chance of being\ndetected by state-of-the-art (SOTA) defenses. This paper proposes a general\nin-situ approach that identifies and verifies BC layers from the perspective of\nattackers. Based on the identified BC layers, we carefully craft a new backdoor\nattack methodology that adaptively seeks a fundamental balance between\nattacking effects and stealthiness under various defense strategies. Extensive\nexperiments show that our BC layer-aware backdoor attacks can successfully\nbackdoor FL under seven SOTA defenses with only 10% malicious clients and\noutperform the latest backdoor attack methods.\n","authors":["Haomin Zhuang","Mingxian Yu","Hao Wang","Yang Hua","Jian Li","Xu Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.04466v3.pdf","comment":"Accepted to ICLR'24"},{"id":"http://arxiv.org/abs/2207.14624v2","updated":"2024-04-15T15:48:43Z","published":"2022-07-29T11:50:35Z","title":"Post-processing of coronary and myocardial spatial data","summary":" Numerical simulations of real-world phenomena are implemented with at least\ntwo parts: the computational scheme and the computational domain. In the\ncontext of hemodynamics, the computational domain of a simulation represents\nthe blood vessel network through which blood flows. Such blood vessel networks\ncan contain millions of individual vessels that are joined together in series\nand parallel to form the network. It is computationally unfeasible to\nexplicitly simulate blood flow in all blood vessels. Here, from imaged data of\na single porcine left coronary arterial tree, we develop a data-pipeline to\nobtain computational domains for hemodynamic simulations from a graph\nrepresenting the coronary vascular tree. Further, we develop a method to\nascertain which subregions of the left ventricle are most likely to be perfused\nvia a given artery using a comparison with the American Heart Association\ndivision of the left ventricle as a sense check.\n","authors":["Jay Aodh Mackenzie","Megan Jeanne Miller","Nicholas Hill","Mette Olufsen"],"pdf_url":"https://arxiv.org/pdf/2207.14624v2.pdf","comment":"21 pages, 22 figures"},{"id":"http://arxiv.org/abs/2404.09872v1","updated":"2024-04-15T15:43:52Z","published":"2024-04-15T15:43:52Z","title":"Conditional Prototype Rectification Prompt Learning","summary":" Pre-trained large-scale vision-language models (VLMs) have acquired profound\nunderstanding of general visual concepts. Recent advancements in efficient\ntransfer learning (ETL) have shown remarkable success in fine-tuning VLMs\nwithin the scenario of limited data, introducing only a few parameters to\nharness task-specific insights from VLMs. Despite significant progress, current\nleading ETL methods tend to overfit the narrow distributions of base classes\nseen during training and encounter two primary challenges: (i) only utilizing\nuni-modal information to model task-specific knowledge; and (ii) using\ncostly and time-consuming methods to supplement knowledge. To address these\nissues, we propose a Conditional Prototype Rectification Prompt Learning (CPR)\nmethod to correct the bias of base examples and augment limited data in an\neffective way. 
Specifically, we alleviate overfitting on base classes from two\naspects. First, each input image acquires knowledge from both textual and\nvisual prototypes, and then generates sample-conditional text tokens. Second,\nwe extract utilizable knowledge from unlabeled data to further refine the\nprototypes. These two strategies mitigate biases stemming from base classes,\nyielding a more effective classifier. Extensive experiments on 11 benchmark\ndatasets show that our CPR achieves state-of-the-art performance on both\nfew-shot classification and base-to-new generalization tasks. Our code is\navailable at \\url{https://github.com/chenhaoxing/CPR}.\n","authors":["Haoxing Chen","Yaohui Li","Zizheng Huang","Yan Hong","Zhuoer Xu","Zhangxuan Gu","Jun Lan","Huijia Zhu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09870v1","updated":"2024-04-15T15:36:38Z","published":"2024-04-15T15:36:38Z","title":"Table tennis ball spin estimation with an event camera","summary":" Spin plays a pivotal role in ball-based sports. Estimating spin becomes a key\nskill due to its impact on the ball's trajectory and bouncing behavior. Spin\ncannot be observed directly, making it inherently challenging to estimate. In\ntable tennis, the combination of high velocity and spin renders traditional low\nframe rate cameras inadequate for quickly and accurately observing the ball's\nlogo to estimate the spin due to the motion blur. Event cameras do not suffer\nas much from motion blur, thanks to their high temporal resolution. Moreover,\nthe sparse nature of the event stream solves communication bandwidth\nlimitations many frame cameras face. To the best of our knowledge, we present\nthe first method for table tennis spin estimation using an event camera. We use\nordinal time surfaces to track the ball and then isolate the events generated\nby the logo on the ball. Optical flow is then estimated from the extracted\nevents to infer the ball's spin. We achieved a spin magnitude mean error of\n$10.7 \\pm 17.3$ rps and a spin axis mean error of $32.9 \\pm 38.2\\deg$ in real\ntime for a flying ball.\n","authors":["Thomas Gossard","Julian Krismer","Andreas Ziegler","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2404.09870v1.pdf","comment":"Accepted to CVsport (CVPRW 2024)"},{"id":"http://arxiv.org/abs/2403.14534v2","updated":"2024-04-15T15:30:31Z","published":"2024-03-21T16:36:40Z","title":"Transfer Learning for Cross-dataset Isolated Sign Language Recognition\n in Under-Resourced Datasets","summary":" Sign language recognition (SLR) has recently achieved a breakthrough in\nperformance thanks to deep neural networks trained on large annotated sign\ndatasets. Of the many different sign languages, these annotated datasets are\nonly available for a select few. Since acquiring gloss-level labels on sign\nlanguage videos is difficult, learning by transferring knowledge from existing\nannotated sources is useful for recognition in under-resourced sign languages.\nThis study provides a publicly available cross-dataset transfer learning\nbenchmark from two existing public Turkish SLR datasets. We use a temporal\ngraph convolution-based sign language recognition approach to evaluate five\nsupervised transfer learning approaches and experiment with closed-set and\npartial-set cross-dataset transfer learning. 
Experiments demonstrate that\nimprovement over finetuning based transfer learning is possible with\nspecialized supervised transfer learning methods.\n","authors":["Ahmet Alp Kindiroglu","Ozgur Kara","Ogulcan Ozdemir","Lale Akarun"],"pdf_url":"https://arxiv.org/pdf/2403.14534v2.pdf","comment":"Accepted to The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition 2024, Code available in\n https://github.com/alpk/tid-supervised-transfer-learning-dataset"},{"id":"http://arxiv.org/abs/2309.11711v2","updated":"2024-04-15T15:26:29Z","published":"2023-09-21T01:31:54Z","title":"MoDA: Leveraging Motion Priors from Videos for Advancing Unsupervised\n Domain Adaptation in Semantic Segmentation","summary":" Unsupervised domain adaptation (UDA) has been a potent technique to handle\nthe lack of annotations in the target domain, particularly in semantic\nsegmentation task. This study introduces a different UDA scenarios where the\ntarget domain contains unlabeled video frames. Drawing upon recent advancements\nof self-supervised learning of the object motion from unlabeled videos with\ngeometric constraint, we design a \\textbf{Mo}tion-guided \\textbf{D}omain\n\\textbf{A}daptive semantic segmentation framework (MoDA). MoDA harnesses the\nself-supervised object motion cues to facilitate cross-domain alignment for\nsegmentation task. First, we present an object discovery module to localize and\nsegment target moving objects using object motion information. Then, we propose\na semantic mining module that takes the object masks to refine the pseudo\nlabels in the target domain. Subsequently, these high-quality pseudo labels are\nused in the self-training loop to bridge the cross-domain gap. On domain\nadaptive video and image segmentation experiments, MoDA shows the effectiveness\nutilizing object motion as guidance for domain alignment compared with optical\nflow information. Moreover, MoDA exhibits versatility as it can complement\nexisting state-of-the-art UDA approaches. Code at\nhttps://github.com/feipanir/MoDA.\n","authors":["Fei Pan","Xu Yin","Seokju Lee","Axi Niu","Sungeui Yoon","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2309.11711v2.pdf","comment":"CVPR 2024 Workshop on Learning with Limited Labelled Data for Image\n and Video Understanding. Best Paper Award"},{"id":"http://arxiv.org/abs/2404.09857v1","updated":"2024-04-15T15:12:53Z","published":"2024-04-15T15:12:53Z","title":"Empowering Embodied Visual Tracking with Visual Foundation Models and\n Offline RL","summary":" Embodied visual tracking is to follow a target object in dynamic 3D\nenvironments using an agent's egocentric vision. This is a vital and\nchallenging skill for embodied agents. However, existing methods suffer from\ninefficient training and poor generalization. In this paper, we propose a novel\nframework that combines visual foundation models (VFM) and offline\nreinforcement learning (offline RL) to empower embodied visual tracking. We use\na pre-trained VFM, such as ``Tracking Anything\", to extract semantic\nsegmentation masks with text prompts. We then train a recurrent policy network\nwith offline RL, e.g., Conservative Q-Learning, to learn from the collected\ndemonstrations without online agent-environment interactions. To further\nimprove the robustness and generalization of the policy network, we also\nintroduce a mask re-targeting mechanism and a multi-level data collection\nstrategy. In this way, we can train a robust tracker within an hour on a\nconsumer-level GPU, e.g., Nvidia RTX 3090. 
Such efficiency is unprecedented for\nRL-based visual tracking methods. We evaluate our tracker on several\nhigh-fidelity environments with challenging situations, such as distraction and\nocclusion. The results show that our agent outperforms state-of-the-art methods\nin terms of sample efficiency, robustness to distractors, and generalization to\nunseen scenarios and targets. We also demonstrate the transferability of the\nlearned tracker from the virtual world to real-world scenarios.\n","authors":["Fangwei Zhong","Kui Wu","Hai Ci","Churan Wang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01964v3","updated":"2024-04-15T15:00:49Z","published":"2023-12-04T15:23:49Z","title":"Semantics-aware Motion Retargeting with Vision-Language Models","summary":" Capturing and preserving motion semantics is essential to motion retargeting\nbetween animation characters. However, most of the previous works neglect the\nsemantic information or rely on human-designed joint-level representations.\nHere, we present a novel Semantics-aware Motion reTargeting (SMT) method with\nthe advantage of vision-language models to extract and maintain meaningful\nmotion semantics. We utilize a differentiable module to render 3D motions. Then\nthe high-level motion semantics are incorporated into the motion retargeting\nprocess by feeding the vision-language model with the rendered images and\naligning the extracted semantic embeddings. To ensure the preservation of\nfine-grained motion details and high-level semantics, we adopt a two-stage\npipeline consisting of skeleton-aware pre-training and fine-tuning with\nsemantics and geometry constraints. Experimental results show the effectiveness\nof the proposed method in producing high-quality motion retargeting results\nwhile accurately preserving motion semantics.\n","authors":["Haodong Zhang","ZhiKe Chen","Haocheng Xu","Lei Hao","Xiaofei Wu","Songcen Xu","Zhensong Zhang","Yue Wang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.01964v3.pdf","comment":"Accepted in CVPR2024"},{"id":"http://arxiv.org/abs/2404.09846v1","updated":"2024-04-15T14:55:43Z","published":"2024-04-15T14:55:43Z","title":"A Diffusion-based Data Generator for Training Object Recognition Models\n in Ultra-Range Distance","summary":" Object recognition, commonly performed by a camera, is a fundamental\nrequirement for robots to complete complex tasks. Some tasks require\nrecognizing objects far from the robot's camera. A challenging example is\nUltra-Range Gesture Recognition (URGR) in human-robot interaction where the\nuser exhibits directive gestures at a distance of up to 25~m from the robot.\nHowever, training a model to recognize hardly visible objects located in\nultra-range requires an exhaustive collection of a significant amount of\nlabeled samples. The generation of synthetic training datasets is a recent\nsolution to the lack of real-world data, while unable to properly replicate the\nrealistic visual characteristics of distant objects in images. In this letter,\nwe propose the Diffusion in Ultra-Range (DUR) framework based on a Diffusion\nmodel to generate labeled images of distant objects in various scenes. The DUR\ngenerator receives a desired distance and class (e.g., gesture) and outputs a\ncorresponding synthetic image. We apply DUR to train a URGR model with\ndirective gestures in which fine details of the gesturing hand are challenging\nto distinguish. 
DUR is compared to other types of generative models showcasing\nsuperiority both in fidelity and in recognition success rate when training a\nURGR model. More importantly, training a DUR model on a limited amount of real\ndata and then using it to generate synthetic data for training a URGR model\noutperforms directly training the URGR model on real data. The synthetic-based\nURGR model is also demonstrated in gesture-based direction of a ground robot.\n","authors":["Eran Bamani","Eden Nissinman","Lisa Koenigsberg","Inbar Meir","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2404.09846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09842v1","updated":"2024-04-15T14:52:02Z","published":"2024-04-15T14:52:02Z","title":"STMixer: A One-Stage Sparse Action Detector","summary":" Traditional video action detectors typically adopt the two-stage pipeline,\nwhere a person detector is first employed to generate actor boxes and then 3D\nRoIAlign is used to extract actor-specific features for classification. This\ndetection paradigm requires multi-stage training and inference, and the feature\nsampling is constrained inside the box, failing to effectively leverage richer\ncontext information outside. Recently, a few query-based action detectors have\nbeen proposed to predict action instances in an end-to-end manner. However,\nthey still lack adaptability in feature sampling and decoding, thus suffering\nfrom the issues of inferior performance or slower convergence. In this paper,\nwe propose two core designs for a more flexible one-stage sparse action\ndetector. First, we present a query-based adaptive feature sampling module,\nwhich endows the detector with the flexibility of mining a group of\ndiscriminative features from the entire spatio-temporal domain. Second, we\ndevise a decoupled feature mixing module, which dynamically attends to and\nmixes video features along the spatial and temporal dimensions respectively for\nbetter feature decoding. Based on these designs, we instantiate two detection\npipelines, that is, STMixer-K for keyframe action detection and STMixer-T for\naction tubelet detection. Without bells and whistles, our STMixer detectors\nobtain state-of-the-art results on five challenging spatio-temporal action\ndetection benchmarks for keyframe action detection or action tube detection.\n","authors":["Tao Wu","Mengqi Cao","Ziteng Gao","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09842v1.pdf","comment":"Extended version of the paper arXiv:2303.15879 presented at CVPR\n 2023. Accepted by TPAMI 2024"},{"id":"http://arxiv.org/abs/2403.12075v2","updated":"2024-04-15T14:41:09Z","published":"2024-02-14T22:21:12Z","title":"Adversarial Nibbler: An Open Red-Teaming Method for Identifying Diverse\n Harms in Text-to-Image Generation","summary":" With the rise of text-to-image (T2I) generative AI models reaching wide\naudiences, it is critical to evaluate model robustness against non-obvious\nattacks to mitigate the generation of offensive images. By focusing on\n``implicitly adversarial'' prompts (those that trigger T2I models to generate\nunsafe images for non-obvious reasons), we isolate a set of difficult safety\nissues that human creativity is well-suited to uncover. To this end, we built\nthe Adversarial Nibbler Challenge, a red-teaming methodology for crowdsourcing\na diverse set of implicitly adversarial prompts. 
We have assembled a suite of\nstate-of-the-art T2I models, employed a simple user interface to identify and\nannotate harms, and engaged diverse populations to capture long-tail safety\nissues that may be overlooked in standard testing. The challenge is run in\nconsecutive rounds to enable a sustained discovery and analysis of safety\npitfalls in T2I models.\n In this paper, we present an in-depth account of our methodology, a\nsystematic study of novel attack strategies and discussion of safety failures\nrevealed by challenge participants. We also release a companion visualization\ntool for easy exploration and derivation of insights from the dataset. The\nfirst challenge round resulted in over 10k prompt-image pairs with machine\nannotations for safety. A subset of 1.5k samples contains rich human\nannotations of harm types and attack styles. We find that 14% of images that\nhumans consider harmful are mislabeled as ``safe'' by machines. We have\nidentified new attack strategies that highlight the complexity of ensuring T2I\nmodel robustness. Our findings emphasize the necessity of continual auditing\nand adaptation as new vulnerabilities emerge. We are confident that this work\nwill enable proactive, iterative safety assessments and promote responsible\ndevelopment of T2I models.\n","authors":["Jessica Quaye","Alicia Parrish","Oana Inel","Charvi Rastogi","Hannah Rose Kirk","Minsuk Kahng","Erin van Liemt","Max Bartolo","Jess Tsang","Justin White","Nathan Clement","Rafael Mosquera","Juan Ciro","Vijay Janapa Reddi","Lora Aroyo"],"pdf_url":"https://arxiv.org/pdf/2403.12075v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2212.00621v2","updated":"2024-04-15T14:39:19Z","published":"2022-12-01T16:15:54Z","title":"CONDA: Continual Unsupervised Domain Adaptation Learning in Visual\n Perception for Self-Driving Cars","summary":" Although unsupervised domain adaptation methods have achieved remarkable\nperformance in semantic scene segmentation in visual perception for\nself-driving cars, these approaches remain impractical in real-world use cases.\nIn practice, the segmentation models may encounter new data that have not been\nseen yet. Also, the previous data training of segmentation models may be\ninaccessible due to privacy problems. Therefore, to address these problems, in\nthis work, we propose a Continual Unsupervised Domain Adaptation (CONDA)\napproach that allows the model to continuously learn and adapt with respect to\nthe presence of the new data. Moreover, our proposed approach is designed\nwithout the requirement of accessing previous training data. To avoid the\ncatastrophic forgetting problem and maintain the performance of the\nsegmentation models, we present a novel Bijective Maximum Likelihood loss to\nimpose the constraint of predicted segmentation distribution shifts. 
The\nexperimental results on the benchmark of continual unsupervised domain\nadaptation have shown the advanced performance of the proposed CONDA method.\n","authors":["Thanh-Dat Truong","Pierce Helton","Ahmed Moustafa","Jackson David Cothren","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2212.00621v2.pdf","comment":"Accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2402.11874v2","updated":"2024-04-15T14:37:57Z","published":"2024-02-19T06:32:23Z","title":"Language-guided Image Reflection Separation","summary":" This paper studies the problem of language-guided reflection separation,\nwhich aims at addressing the ill-posed reflection separation problem by\nintroducing language descriptions to provide layer content. We propose a\nunified framework to solve this problem, which leverages the cross-attention\nmechanism with contrastive learning strategies to construct the correspondence\nbetween language descriptions and image layers. A gated network design and a\nrandomized training strategy are employed to tackle the recognizable layer\nambiguity. The effectiveness of the proposed method is validated by the\nsignificant performance advantage over existing reflection separation methods\non both quantitative and qualitative comparisons.\n","authors":["Haofeng Zhong","Yuchen Hong","Shuchen Weng","Jinxiu Liang","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2402.11874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09833v1","updated":"2024-04-15T14:32:32Z","published":"2024-04-15T14:32:32Z","title":"Video2Game: Real-time, Interactive, Realistic and Browser-Compatible\n Environment from a Single Video","summary":" Creating high-quality and interactive virtual environments, such as games and\nsimulators, often involves complex and costly manual modeling processes. In\nthis paper, we present Video2Game, a novel approach that automatically converts\nvideos of real-world scenes into realistic and interactive game environments.\nAt the heart of our system are three core components:(i) a neural radiance\nfields (NeRF) module that effectively captures the geometry and visual\nappearance of the scene; (ii) a mesh module that distills the knowledge from\nNeRF for faster rendering; and (iii) a physics module that models the\ninteractions and physical dynamics among the objects. By following the\ncarefully designed pipeline, one can construct an interactable and actionable\ndigital replica of the real world. We benchmark our system on both indoor and\nlarge-scale outdoor scenes. We show that we can not only produce\nhighly-realistic renderings in real-time, but also build interactive games on\ntop.\n","authors":["Hongchi Xia","Zhi-Hao Lin","Wei-Chiu Ma","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09833v1.pdf","comment":"CVPR 2024. Project page (with code): https://video2game.github.io/"},{"id":"http://arxiv.org/abs/2404.09831v1","updated":"2024-04-15T14:29:47Z","published":"2024-04-15T14:29:47Z","title":"Digging into contrastive learning for robust depth estimation with\n diffusion models","summary":" Recently, diffusion-based depth estimation methods have drawn widespread\nattention due to their elegant denoising patterns and promising performance.\nHowever, they are typically unreliable under adverse conditions prevalent in\nreal-world scenarios, such as rainy, snowy, etc. In this paper, we propose a\nnovel robust depth estimation method called D4RD, featuring a custom\ncontrastive learning mode tailored for diffusion models to mitigate performance\ndegradation in complex environments. 
Concretely, we integrate the strength of\nknowledge distillation into contrastive learning, building the `trinity'\ncontrastive scheme. This scheme utilizes the sampled noise of the forward\ndiffusion process as a natural reference, guiding the predicted noise in\ndiverse scenes toward a more stable and precise optimum. Moreover, we extend\nnoise-level trinity to encompass more generic feature and image levels,\nestablishing a multi-level contrast to distribute the burden of robust\nperception across the overall network. Before addressing complex scenarios, we\nenhance the stability of the baseline diffusion model with three\nstraightforward yet effective improvements, which facilitate convergence and\nremove depth outliers. Extensive experiments demonstrate that D4RD surpasses\nexisting state-of-the-art solutions on synthetic corruption datasets and\nreal-world weather conditions. The code for D4RD will be made available for\nfurther exploration and adoption.\n","authors":["Jiyuan Wang","Chunyu Lin","Lang Nie","Kang Liao","Shuwei Shao","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09831v1.pdf","comment":"8 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.09828v1","updated":"2024-04-15T14:26:00Z","published":"2024-04-15T14:26:00Z","title":"Interaction as Explanation: A User Interaction-based Method for\n Explaining Image Classification Models","summary":" In computer vision, explainable AI (xAI) methods seek to mitigate the\n'black-box' problem by making the decision-making process of deep learning\nmodels more interpretable and transparent. Traditional xAI methods concentrate\non visualizing input features that influence model predictions, providing\ninsights primarily suited for experts. In this work, we present an\ninteraction-based xAI method that enhances user comprehension of image\nclassification models through their interaction. Thus, we developed a web-based\nprototype allowing users to modify images via painting and erasing, thereby\nobserving changes in classification results. Our approach enables users to\ndiscern critical features influencing the model's decision-making process,\naligning their mental models with the model's logic. Experiments conducted with\nfive images demonstrate the potential of the method to reveal feature\nimportance through user interaction. Our work contributes a novel perspective\nto xAI by centering on end-user engagement and understanding, paving the way\nfor more intuitive and accessible explainability in AI systems.\n","authors":["Hyeonggeun Yun"],"pdf_url":"https://arxiv.org/pdf/2404.09828v1.pdf","comment":"5 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.09826v1","updated":"2024-04-15T14:23:39Z","published":"2024-04-15T14:23:39Z","title":"A Recipe for CAC: Mosaic-based Generalized Loss for Improved\n Class-Agnostic Counting","summary":" Class agnostic counting (CAC) is a vision task that can be used to count the\ntotal occurrence number of any given reference objects in the query image. The\ntask is usually formulated as a density map estimation problem through\nsimilarity computation among a few image samples of the reference object and\nthe query image. In this paper, we point out a severe issue of the existing CAC\nframework: Given a multi-class setting, models don't consider reference images\nand instead blindly match all dominant objects in the query image. Moreover,\nthe current evaluation metrics and dataset cannot be used to faithfully assess\nthe model's generalization performance and robustness. 
To this end, we discover\nthat the combination of mosaic augmentation with generalized loss is essential\nfor addressing the aforementioned issue of CAC models to count objects of\nmajority (i.e. dominant objects) regardless of the references. Furthermore, we\nintroduce a new evaluation protocol and metrics for resolving the problem\nbehind the existing CAC evaluation scheme and better benchmarking CAC models in\na more fair manner. Besides, extensive evaluation results demonstrate that our\nproposed recipe can consistently improve the performance of different CAC\nmodels. The code will be released upon acceptance.\n","authors":["Tsung-Han Chou","Brian Wang","Wei-Chen Chiu","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09819v1","updated":"2024-04-15T14:20:07Z","published":"2024-04-15T14:20:07Z","title":"3D Face Tracking from 2D Video through Iterative Dense UV to Image Flow","summary":" When working with 3D facial data, improving fidelity and avoiding the uncanny\nvalley effect is critically dependent on accurate 3D facial performance\ncapture. Because such methods are expensive and due to the widespread\navailability of 2D videos, recent methods have focused on how to perform\nmonocular 3D face tracking. However, these methods often fall short in\ncapturing precise facial movements due to limitations in their network\narchitecture, training, and evaluation processes. Addressing these challenges,\nwe propose a novel face tracker, FlowFace, that introduces an innovative 2D\nalignment network for dense per-vertex alignment. Unlike prior work, FlowFace\nis trained on high-quality 3D scan annotations rather than weak supervision or\nsynthetic data. Our 3D model fitting module jointly fits a 3D face model from\none or many observations, integrating existing neutral shape priors for\nenhanced identity and expression disentanglement and per-vertex deformations\nfor detailed facial feature reconstruction. Additionally, we propose a novel\nmetric and benchmark for assessing tracking accuracy. Our method exhibits\nsuperior performance on both custom and publicly available benchmarks. We\nfurther validate the effectiveness of our tracker by generating high-quality 3D\ndata from 2D videos, which leads to performance gains on downstream tasks.\n","authors":["Felix Taubner","Prashant Raina","Mathieu Tuli","Eu Wern Teh","Chul Lee","Jinmiao Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09819v1.pdf","comment":"22 pages, 25 figures, to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09809v1","updated":"2024-04-15T14:07:33Z","published":"2024-04-15T14:07:33Z","title":"Neighbour-level Message Interaction Encoding for Improved Representation\n Learning on Graphs","summary":" Message passing has become the dominant framework in graph representation\nlearning. The essential idea of the message-passing framework is to update node\nembeddings based on the information aggregated from local neighbours. However,\nmost existing aggregation methods have not encoded neighbour-level message\ninteractions into the aggregated message, resulting in an information lost in\nembedding generation. And this information lost could be accumulated and become\nmore serious as more layers are added to the graph network model. To address\nthis issue, we propose a neighbour-level message interaction information\nencoding method for improving graph representation learning. 
For messages that\nare aggregated at a node, we explicitly generate an encoding between each\nmessage and the rest messages using an encoding function. Then we aggregate\nthese learned encodings and take the sum of the aggregated encoding and the\naggregated message to update the embedding for the node. By this way,\nneighbour-level message interaction information is integrated into the\ngenerated node embeddings. The proposed encoding method is a generic method\nwhich can be integrated into message-passing graph convolutional networks.\nExtensive experiments are conducted on six popular benchmark datasets across\nfour highly-demanded tasks. The results show that integrating neighbour-level\nmessage interactions achieves improved performance of the base models,\nadvancing the state of the art results for representation learning over graphs.\n","authors":["Haimin Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09809v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.09807v1","updated":"2024-04-15T14:03:31Z","published":"2024-04-15T14:03:31Z","title":"A Universal Protocol to Benchmark Camera Calibration for Sports","summary":" Camera calibration is a crucial component in the realm of sports analytics,\nas it serves as the foundation to extract 3D information out of the broadcast\nimages. Despite the significance of camera calibration research in sports\nanalytics, progress is impeded by outdated benchmarking criteria. Indeed, the\nannotation data and evaluation metrics provided by most currently available\nbenchmarks strongly favor and incite the development of sports field\nregistration methods, i.e. methods estimating homographies that map the sports\nfield plane to the image plane. However, such homography-based methods are\ndoomed to overlook the broader capabilities of camera calibration in bridging\nthe 3D world to the image. In particular, real-world non-planar sports field\nelements (such as goals, corner flags, baskets, ...) and image distortion\ncaused by broadcast camera lenses are out of the scope of sports field\nregistration methods. To overcome these limitations, we designed a new\nbenchmarking protocol, named ProCC, based on two principles: (1) the protocol\nshould be agnostic to the camera model chosen for a camera calibration method,\nand (2) the protocol should fairly evaluate camera calibration methods using\nthe reprojection of arbitrary yet accurately known 3D objects. Indirectly, we\nalso provide insights into the metric used in SoccerNet-calibration, which\nsolely relies on image annotation data of viewed 3D objects as ground truth,\nthus implementing our protocol. With experiments on the World Cup 2014, CARWC,\nand SoccerNet datasets, we show that our benchmarking protocol provides fairer\nevaluations of camera calibration methods. By defining our requirements for\nproper benchmarking, we hope to pave the way for a new stage in camera\ncalibration for sports applications with high accuracy standards.\n","authors":["Floriane Magera","Thomas Hoyoux","Olivier Barnich","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.09807v1.pdf","comment":"12 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.09797v1","updated":"2024-04-15T13:54:35Z","published":"2024-04-15T13:54:35Z","title":"TextCoT: Zoom In for Enhanced Multimodal Text-Rich Image Understanding","summary":" The advent of Large Multimodal Models (LMMs) has sparked a surge in research\naimed at harnessing their remarkable reasoning abilities. 
However, for\nunderstanding text-rich images, challenges persist in fully leveraging the\npotential of LMMs, and existing methods struggle with effectively processing\nhigh-resolution images. In this work, we propose TextCoT, a novel\nChain-of-Thought framework for text-rich image understanding. TextCoT utilizes\nthe captioning ability of LMMs to grasp the global context of the image and the\ngrounding capability to examine local textual regions. This allows for the\nextraction of both global and local visual information, facilitating more\naccurate question-answering. Technically, TextCoT consists of three stages,\nincluding image overview, coarse localization, and fine-grained observation.\nThe image overview stage provides a comprehensive understanding of the global\nscene information, and the coarse localization stage approximates the image\narea containing the answer based on the question asked. Then, integrating the\nobtained global image descriptions, the final stage further examines specific\nregions to provide accurate answers. Our method is free of extra training,\noffering immediate plug-and-play functionality. Extensive experiments are\nconducted on a series of text-rich image question-answering benchmark datasets\nbased on several advanced LMMs, and the results demonstrate the effectiveness\nand strong generalization ability of our method. Code is available at\nhttps://github.com/bzluan/TextCoT.\n","authors":["Bozhi Luan","Hao Feng","Hong Chen","Yonghui Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.09797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02469v2","updated":"2024-04-15T13:51:30Z","published":"2024-03-04T20:29:51Z","title":"Vision-Language Models for Medical Report Generation and Visual Question\n Answering: A Review","summary":" Medical vision-language models (VLMs) combine computer vision (CV) and\nnatural language processing (NLP) to analyze visual and textual medical data.\nOur paper reviews recent advancements in developing VLMs specialized for\nhealthcare, focusing on models designed for medical report generation and\nvisual question answering (VQA). We provide background on NLP and CV,\nexplaining how techniques from both fields are integrated into VLMs to enable\nlearning from multimodal data. Key areas we address include the exploration of\nmedical vision-language datasets, in-depth analyses of architectures and\npre-training strategies employed in recent noteworthy medical VLMs, and\ncomprehensive discussion on evaluation metrics for assessing VLMs' performance\nin medical report generation and VQA. We also highlight current challenges and\npropose future directions, including enhancing clinical validity and addressing\npatient privacy concerns. Overall, our review summarizes recent progress in\ndeveloping VLMs to harness multimodal medical data for improved healthcare\napplications.\n","authors":["Iryna Hartsock","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2403.02469v2.pdf","comment":"43 pages; paper edited and restructured"},{"id":"http://arxiv.org/abs/2402.19159v2","updated":"2024-04-15T13:51:17Z","published":"2024-02-29T13:44:14Z","title":"Trajectory Consistency Distillation: Improved Latent Consistency\n Distillation by Semi-Linear Consistency Function with Trajectory Mapping","summary":" Latent Consistency Model (LCM) extends the Consistency Model to the latent\nspace and leverages the guided consistency distillation technique to achieve\nimpressive performance in accelerating text-to-image synthesis. 
However, we\nobserved that LCM struggles to generate images with both clarity and detailed\nintricacy. Consequently, we introduce Trajectory Consistency Distillation\n(TCD), which encompasses a trajectory consistency function and strategic\nstochastic sampling. The trajectory consistency function diminishes the\nparameterisation and distillation errors by broadening the scope of the\nself-consistency boundary condition with trajectory mapping and endowing the\nTCD with the ability to accurately trace the entire trajectory of the\nProbability Flow ODE in semi-linear form with an Exponential Integrator.\nAdditionally, strategic stochastic sampling provides explicit control of\nstochasticity and circumvents the accumulated errors inherent in multi-step\nconsistency sampling. Experiments demonstrate that TCD not only significantly\nenhances image quality at low NFEs but also yields more detailed results\ncompared to the teacher model at high NFEs.\n","authors":["Jianbin Zheng","Minghui Hu","Zhongyi Fan","Chaoyue Wang","Changxing Ding","Dacheng Tao","Tat-Jen Cham"],"pdf_url":"https://arxiv.org/pdf/2402.19159v2.pdf","comment":"Project Page: https://mhh0318.github.io/tcd"},{"id":"http://arxiv.org/abs/2402.19404v2","updated":"2024-04-15T13:47:31Z","published":"2024-02-29T18:03:00Z","title":"EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image\n Captioning","summary":" News image captioning requires a model to generate an informative caption rich\nin entities, with the news image and the associated news article. Though\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in addressing various vision-language tasks, our research finds\nthat current MLLMs still bear limitations in handling entity information on\nthe news image captioning task. Besides, while MLLMs have the ability to process\nlong inputs, generating high-quality news image captions still requires a\ntrade-off between sufficiency and conciseness of textual input information. To\nexplore the potential of MLLMs and address the problems we discovered, we propose\nan Entity-Aware Multimodal Alignment based approach for news image captioning.\nOur approach first aligns the MLLM through a Balance Training Strategy with two\nextra alignment tasks: an Entity-Aware Sentence Selection task and an Entity\nSelection task, together with the News Image Captioning task, to enhance its\ncapability in handling multimodal entity information. The aligned MLLM then\nutilizes the additional entity-related information it explicitly extracts to\nsupplement its textual input while generating news image captions. Our approach\nachieves better results than all previous models in CIDEr score on the GoodNews\ndataset (72.33 -> 88.39) and the NYTimes800k dataset (70.83 -> 85.61).\n","authors":["Junzhe Zhang","Huixuan Zhang","Xunjian Yin","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2402.19404v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09790v1","updated":"2024-04-15T13:45:48Z","published":"2024-04-15T13:45:48Z","title":"NTIRE 2024 Challenge on Image Super-Resolution ($\\times$4): Methods and\n Results","summary":" This paper reviews the NTIRE 2024 challenge on image super-resolution\n($\\times$4), highlighting the solutions proposed and the outcomes obtained. The\nchallenge involves generating corresponding high-resolution (HR) images,\nmagnified by a factor of four, from low-resolution (LR) inputs using prior\ninformation. The LR images originate from bicubic downsampling degradation. 
The\naim of the challenge is to obtain designs/solutions with the most advanced SR\nperformance, with no constraints on computational resources (e.g., model size\nand FLOPs) or training data. The track of this challenge assesses performance\nwith the PSNR metric on the DIV2K testing dataset. The competition attracted\n199 registrants, with 20 teams submitting valid entries. This collective\nendeavour not only pushes the boundaries of performance in single-image SR but\nalso offers a comprehensive overview of current trends in this field.\n","authors":["Zheng Chen","Zongwei Wu","Eduard Zamfir","Kai Zhang","Yulun Zhang","Radu Timofte","Xiaokang Yang","Hongyuan Yu","Cheng Wan","Yuxin Hong","Zhijuan Huang","Yajun Zou","Yuan Huang","Jiamin Lin","Bingnan Han","Xianyu Guan","Yongsheng Yu","Daoan Zhang","Xuanwu Yin","Kunlong Zuo","Jinhua Hao","Kai Zhao","Kun Yuan","Ming Sun","Chao Zhou","Hongyu An","Xinfeng Zhang","Zhiyuan Song","Ziyue Dong","Qing Zhao","Xiaogang Xu","Pengxu Wei","Zhi-chao Dou","Gui-ling Wang","Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou","Cansu Korkmaz","A. Murat Tekalp","Yubin Wei","Xiaole Yan","Binren Li","Haonan Chen","Siqi Zhang","Sihan Chen","Amogh Joshi","Nikhil Akalwadi","Sampada Malagi","Palani Yashaswini","Chaitra Desai","Ramesh Ashok Tabib","Ujwala Patil","Uma Mudenagudi","Anjali Sarvaiya","Pooja Choksy","Jagrit Joshi","Shubh Kawa","Kishor Upla","Sushrut Patwardhan","Raghavendra Ramachandra","Sadat Hossain","Geongi Park","S. M. Nadim Uddin","Hao Xu","Yanhui Guo","Aman Urumbekov","Xingzhuo Yan","Wei Hao","Minghan Fu","Isaac Orais","Samuel Smith","Ying Liu","Wangwang Jia","Qisheng Xu","Kele Xu","Weijun Yuan","Zhan Li","Wenqin Kuang","Ruijin Guan","Ruting Deng","Zhao Zhang","Bo Wang","Suiyi Zhao","Yan Luo","Yanyan Wei","Asif Hussain Khan","Christian Micheloni","Niki Martinel"],"pdf_url":"https://arxiv.org/pdf/2404.09790v1.pdf","comment":"NTIRE 2024 webpage: https://cvlai.net/ntire/2024. Code:\n https://github.com/zhengchen1999/NTIRE2024_ImageSR_x4"},{"id":"http://arxiv.org/abs/2309.05418v2","updated":"2024-04-15T13:42:13Z","published":"2023-09-11T12:35:17Z","title":"FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based\n Rendering of Dynamic Scenes","summary":" We introduce FlowIBR, a novel approach for efficient monocular novel view\nsynthesis of dynamic scenes. Existing techniques already show impressive\nrendering quality but tend to focus on optimization within a single scene\nwithout leveraging prior knowledge, resulting in long optimization times per\nscene. FlowIBR circumvents this limitation by integrating a neural image-based\nrendering method, pre-trained on a large corpus of widely available static\nscenes, with a per-scene optimized scene flow field. Utilizing this flow field,\nwe bend the camera rays to counteract the scene dynamics, thereby presenting\nthe dynamic scene as if it were static to the rendering network. The proposed\nmethod reduces per-scene optimization time by an order of magnitude, achieving\ncomparable rendering quality to existing methods -- all on a single\nconsumer-grade GPU.\n","authors":["Marcel Büsching","Josef Bengtson","David Nilsson","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2309.05418v2.pdf","comment":"Accepted to CVPR 2024 Workshop on Efficient Deep Learning for\n Computer Vision. 
Project page: https://flowibr.github.io"},{"id":"http://arxiv.org/abs/2404.09778v1","updated":"2024-04-15T13:30:34Z","published":"2024-04-15T13:30:34Z","title":"The Devil is in the Few Shots: Iterative Visual Knowledge Completion for\n Few-shot Learning","summary":" Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot\nlearning performance. Few-shot learning aims to further enhance the transfer\ncapability of CLIP by giving few images in each class, aka 'few shots'. Most\nexisting methods either implicitly learn from the few shots by incorporating\nlearnable prompts or adapters, or explicitly embed them in a cache model for\ninference. However, the narrow distribution of few shots often contains\nincomplete class information, leading to biased visual knowledge with high risk\nof misclassification. To tackle this problem, recent methods propose to\nsupplement visual knowledge by generative models or extra databases, which can\nbe costly and time-consuming. In this paper, we propose an Iterative Visual\nKnowledge CompLetion (KCL) method to complement visual knowledge by properly\ntaking advantages of unlabeled samples without access to any auxiliary or\nsynthetic data. Specifically, KCL first measures the similarities between\nunlabeled samples and each category. Then, the samples with top confidence to\neach category is selected and collected by a designed confidence criterion.\nFinally, the collected samples are treated as labeled ones and added to few\nshots to jointly re-estimate the remaining unlabeled ones. The above procedures\nwill be repeated for a certain number of iterations with more and more samples\nbeing collected until convergence, ensuring a progressive and robust knowledge\ncompletion process. Extensive experiments on 11 benchmark datasets demonstrate\nthe effectiveness and efficiency of KCL as a plug-and-play module under both\nfew-shot and zero-shot learning settings. Code is available at\nhttps://github.com/Mark-Sky/KCL.\n","authors":["Yaohui Li","Qifeng Zhou","Haoxing Chen","Jianbing Zhang","Xinyu Dai","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09778v1.pdf","comment":"26 pages, submitted to ECCV 2024"},{"id":"http://arxiv.org/abs/2303.16611v2","updated":"2024-04-15T13:29:47Z","published":"2023-03-29T11:50:21Z","title":"4D Facial Expression Diffusion Model","summary":" Facial expression generation is one of the most challenging and long-sought\naspects of character animation, with many interesting applications. The\nchallenging task, traditionally having relied heavily on digital craftspersons,\nremains yet to be explored. In this paper, we introduce a generative framework\nfor generating 3D facial expression sequences (i.e. 4D faces) that can be\nconditioned on different inputs to animate an arbitrary 3D face mesh. It is\ncomposed of two tasks: (1) Learning the generative model that is trained over a\nset of 3D landmark sequences, and (2) Generating 3D mesh sequences of an input\nfacial mesh driven by the generated landmark sequences. The generative model is\nbased on a Denoising Diffusion Probabilistic Model (DDPM), which has achieved\nremarkable success in generative tasks of other domains. While it can be\ntrained unconditionally, its reverse process can still be conditioned by\nvarious condition signals. This allows us to efficiently develop several\ndownstream tasks involving various conditional generation, by using expression\nlabels, text, partial sequences, or simply a facial geometry. 
To obtain the\nfull mesh deformation, we then develop a landmark-guided encoder-decoder to\napply the geometrical deformation embedded in landmarks on a given facial mesh.\nExperiments show that our model has learned to generate realistic, quality\nexpressions solely from the dataset of relatively small size, improving over\nthe state-of-the-art methods. Videos and qualitative comparisons with other\nmethods can be found at \\url{https://github.com/ZOUKaifeng/4DFM}.\n","authors":["Kaifeng Zou","Sylvain Faisan","Boyang Yu","Sébastien Valette","Hyewon Seo"],"pdf_url":"https://arxiv.org/pdf/2303.16611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01369v2","updated":"2024-04-15T13:29:32Z","published":"2023-09-04T05:34:19Z","title":"Exploring Limits of Diffusion-Synthetic Training with Weakly Supervised\n Semantic Segmentation","summary":" The advance of generative models for images has inspired various training\ntechniques for image recognition utilizing synthetic images. In semantic\nsegmentation, one promising approach is extracting pseudo-masks from attention\nmaps in text-to-image diffusion models, which enables\nreal-image-and-annotation-free training. However, the pioneering training\nmethod using the diffusion-synthetic images and pseudo-masks, i.e., DiffuMask\nhas limitations in terms of mask quality, scalability, and ranges of applicable\ndomains. To overcome these limitations, this work introduces three techniques\nfor diffusion-synthetic semantic segmentation training. First,\nreliability-aware robust training, originally used in weakly supervised\nlearning, helps segmentation with insufficient synthetic mask quality. %Second,\nlarge-scale pretraining of whole segmentation models, not only backbones, on\nsynthetic ImageNet-1k-class images with pixel-labels benefits downstream\nsegmentation tasks. Second, we introduce prompt augmentation, data augmentation\nto the prompt text set to scale up and diversify training images with a limited\ntext resources. Finally, LoRA-based adaptation of Stable Diffusion enables the\ntransfer to a distant domain, e.g., auto-driving images. Experiments in PASCAL\nVOC, ImageNet-S, and Cityscapes show that our method effectively closes gap\nbetween real and synthetic training in semantic segmentation.\n","authors":["Ryota Yoshihashi","Yuya Otsuka","Kenji Doi","Tomohiro Tanaka","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2309.01369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09774v1","updated":"2024-04-15T13:28:13Z","published":"2024-04-15T13:28:13Z","title":"RandAlign: A Parameter-Free Method for Regularizing Graph Convolutional\n Networks","summary":" Studies continually find that message-passing graph convolutional networks\nsuffer from the over-smoothing issue. Basically, the issue of over-smoothing\nrefers to the phenomenon that the learned embeddings for all nodes can become\nvery similar to one another and therefore are uninformative after repeatedly\napplying message passing iterations. Intuitively, we can expect the generated\nembeddings become smooth asymptotically layerwisely, that is each layer of\ngraph convolution generates a smoothed version of embeddings as compared to\nthat generated by the previous layer. Based on this intuition, we propose\nRandAlign, a stochastic regularization method for graph convolutional networks.\nThe idea of RandAlign is to randomly align the learned embedding for each node\nwith that of the previous layer using randomly interpolation in each graph\nconvolution layer. 
Through alignment, the smoothness of the generated\nembeddings is explicitly reduced. To better maintain the benefit yielded by the\ngraph convolution, in the alignment step we introduce to first scale the\nembedding of the previous layer to the same norm as the generated embedding and\nthen perform random interpolation for aligning the generated embedding.\nRandAlign is a parameter-free method and can be directly applied without\nintroducing additional trainable weights or hyper-parameters. We experimentally\nevaluate RandAlign on different graph domain tasks on seven benchmark datasets.\nThe experimental results show that RandAlign is a general method that improves\nthe generalization performance of various graph convolutional network models\nand also improves the numerical stability of optimization, advancing the state\nof the art performance for graph representation learning.\n","authors":["Haimin Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09774v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2307.08199v3","updated":"2024-04-15T13:25:28Z","published":"2023-07-17T02:03:17Z","title":"Unbiased Image Synthesis via Manifold Guidance in Diffusion Models","summary":" Diffusion Models are a potent class of generative models capable of producing\nhigh-quality images. However, they often inadvertently favor certain data\nattributes, undermining the diversity of generated images. This issue is\nstarkly apparent in skewed datasets like CelebA, where the initial dataset\ndisproportionately favors females over males by 57.9%, this bias amplified in\ngenerated data where female representation outstrips males by 148%. In\nresponse, we propose a plug-and-play method named Manifold Guidance Sampling,\nwhich is also the first unsupervised method to mitigate bias issue in DDPMs.\nLeveraging the inherent structure of the data manifold, this method steers the\nsampling process towards a more uniform distribution, effectively dispersing\nthe clustering of biased data. Without the need for modifying the existing\nmodel or additional training, it significantly mitigates data bias and enhances\nthe quality and unbiasedness of the generated images.\n","authors":["Xingzhe Su","Daixi Jia","Fengge Wu","Junsuo Zhao","Changwen Zheng","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2307.08199v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06819v2","updated":"2024-04-15T13:24:46Z","published":"2023-04-13T21:02:32Z","title":"Modeling Dense Multimodal Interactions Between Biological Pathways and\n Histology for Survival Prediction","summary":" Integrating whole-slide images (WSIs) and bulk transcriptomics for predicting\npatient survival can improve our understanding of patient prognosis. However,\nthis multimodal task is particularly challenging due to the different nature of\nthese data: WSIs represent a very high-dimensional spatial description of a\ntumor, while bulk transcriptomics represent a global description of gene\nexpression levels within that tumor. In this context, our work aims to address\ntwo key challenges: (1) how can we tokenize transcriptomics in a semantically\nmeaningful and interpretable way?, and (2) how can we capture dense multimodal\ninteractions between these two modalities? Specifically, we propose to learn\nbiological pathway tokens from transcriptomics that can encode specific\ncellular functions. 
Together with histology patch tokens that encode the\ndifferent morphological patterns in the WSI, we argue that they form\nappropriate reasoning units for downstream interpretability analyses. We\npropose fusing both modalities using a memory-efficient multimodal Transformer\nthat can model interactions between pathway and histology patch tokens. Our\nproposed model, SURVPATH, achieves state-of-the-art performance when evaluated\nagainst both unimodal and multimodal baselines on five datasets from The Cancer\nGenome Atlas. Our interpretability framework identifies key multimodal\nprognostic factors, and, as such, can provide valuable insights into the\ninteraction between genotype and phenotype, enabling a deeper understanding of\nthe underlying biological mechanisms at play. We make our code public at:\nhttps://github.com/ajv012/SurvPath.\n","authors":["Guillaume Jaume","Anurag Vaidya","Richard Chen","Drew Williamson","Paul Liang","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2304.06819v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09768v1","updated":"2024-04-15T13:13:56Z","published":"2024-04-15T13:13:56Z","title":"Contrastive Pretraining for Visual Concept Explanations of Socioeconomic\n Outcomes","summary":" Predicting socioeconomic indicators from satellite imagery with deep learning\nhas become an increasingly popular research direction. Post-hoc concept-based\nexplanations can be an important step towards broader adoption of these models\nin policy-making as they enable the interpretation of socioeconomic outcomes\nbased on visual concepts that are intuitive to humans. In this paper, we study\nthe interplay between representation learning using an additional task-specific\ncontrastive loss and post-hoc concept explainability for socioeconomic studies.\nOur results on two different geographical locations and tasks indicate that the\ntask-specific pretraining imposes a continuous ordering of the latent space\nembeddings according to the socioeconomic outcomes. This improves the model's\ninterpretability as it enables the latent space of the model to associate urban\nconcepts with continuous intervals of socioeconomic outcomes. Further, we\nillustrate how analyzing the model's conceptual sensitivity for the intervals\nof socioeconomic outcomes can shed light on new insights for urban studies.\n","authors":["Ivica Obadic","Alex Levering","Lars Pennig","Dario Oliveira","Diego Marcos","Xiaoxiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.09768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09761v1","updated":"2024-04-15T13:03:42Z","published":"2024-04-15T13:03:42Z","title":"Deep Learning-Based Segmentation of Tumors in PET/CT Volumes: Benchmark\n of Different Architectures and Training Strategies","summary":" Cancer is one of the leading causes of death globally, and early diagnosis is\ncrucial for patient survival. Deep learning algorithms have great potential for\nautomatic cancer analysis. Artificial intelligence has achieved high\nperformance in recognizing and segmenting single lesions. However, diagnosing\nmultiple lesions remains a challenge. This study examines and compares various\nneural network architectures and training strategies for automatically\nsegmentation of cancer lesions using PET/CT images from the head, neck, and\nwhole body. The authors analyzed datasets from the AutoPET and HECKTOR\nchallenges, exploring popular single-step segmentation architectures and\npresenting a two-step approach. 
The results indicate that the V-Net and nnU-Net\nmodels were the most effective for their respective datasets. The results for\nthe HECKTOR dataset ranged from 0.75 to 0.76 for the aggregated Dice\ncoefficient. Eliminating cancer-free cases from the AutoPET dataset was found\nto improve the performance of most models. In the case of AutoPET data, the\naverage segmentation efficiency after training only on images containing cancer\nlesions increased from 0.55 to 0.66 for the classic Dice coefficient and from\n0.65 to 0.73 for the aggregated Dice coefficient. The research demonstrates the\npotential of artificial intelligence in precise oncological diagnostics and may\ncontribute to the development of more targeted and effective cancer assessment\ntechniques.\n","authors":["Monika Górka","Daniel Jaworek","Marek Wodzinski"],"pdf_url":"https://arxiv.org/pdf/2404.09761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16515v2","updated":"2024-04-15T12:58:26Z","published":"2023-09-28T15:22:02Z","title":"Latent Noise Segmentation: How Neural Noise Leads to the Emergence of\n Segmentation and Grouping","summary":" Humans are able to segment images effortlessly without supervision using\nperceptual grouping. In this work, we propose a counter-intuitive computational\napproach to solving unsupervised perceptual grouping and segmentation: that\nthey arise \\textit{because} of neural noise, rather than in spite of it. We (1)\nmathematically demonstrate that under realistic assumptions, neural noise can\nbe used to separate objects from each other; (2) that adding noise in a DNN\nenables the network to segment images even though it was never trained on any\nsegmentation labels; and (3) that segmenting objects using noise results in\nsegmentation performance that aligns with the perceptual grouping phenomena\nobserved in humans, and is sample-efficient. We introduce the Good Gestalt (GG)\ndatasets -- six datasets designed to specifically test perceptual grouping, and\nshow that our DNN models reproduce many important phenomena in human\nperception, such as illusory contours, closure, continuity, proximity, and\nocclusion. Finally, we (4) show that our model improves performance on our GG\ndatasets compared to other tested unsupervised models by $24.9\\%$. Together,\nour results suggest a novel unsupervised segmentation method requiring few\nassumptions, a new explanation for the formation of perceptual grouping, and a\nnovel potential benefit of neural noise.\n","authors":["Ben Lonnqvist","Zhengqing Wu","Michael H. Herzog"],"pdf_url":"https://arxiv.org/pdf/2309.16515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08570v2","updated":"2024-04-15T12:27:13Z","published":"2024-01-16T18:57:50Z","title":"RoHM: Robust Human Motion Reconstruction via Diffusion","summary":" We propose RoHM, an approach for robust 3D human motion reconstruction from\nmonocular RGB(-D) videos in the presence of noise and occlusions. Most previous\napproaches either train neural networks to directly regress motion in 3D or\nlearn data-driven motion priors and combine them with optimization at test\ntime. The former do not recover globally coherent motion and fail under\nocclusions; the latter are time-consuming, prone to local minima, and require\nmanual tuning. To overcome these shortcomings, we exploit the iterative,\ndenoising nature of diffusion models. 
RoHM is a novel diffusion-based motion\nmodel that, conditioned on noisy and occluded input data, reconstructs\ncomplete, plausible motions in consistent global coordinates. Given the\ncomplexity of the problem -- requiring one to address different tasks\n(denoising and infilling) in different solution spaces (local and global\nmotion) -- we decompose it into two sub-tasks and learn two models, one for\nglobal trajectory and one for local motion. To capture the correlations between\nthe two, we then introduce a novel conditioning module, combining it with an\niterative inference scheme. We apply RoHM to a variety of tasks -- from motion\nreconstruction and denoising to spatial and temporal infilling. Extensive\nexperiments on three popular datasets show that our method outperforms\nstate-of-the-art approaches qualitatively and quantitatively, while being\nfaster at test time. The code is available at\nhttps://sanweiliti.github.io/ROHM/ROHM.html.\n","authors":["Siwei Zhang","Bharat Lal Bhatnagar","Yuanlu Xu","Alexander Winkler","Petr Kadlecek","Siyu Tang","Federica Bogo"],"pdf_url":"https://arxiv.org/pdf/2401.08570v2.pdf","comment":"With the appendix included"},{"id":"http://arxiv.org/abs/2109.14406v2","updated":"2024-04-15T06:19:32Z","published":"2021-09-29T13:10:46Z","title":"Neural Knitworks: Patched Neural Implicit Representation Networks","summary":" Coordinate-based Multilayer Perceptron (MLP) networks, despite being capable\nof learning neural implicit representations, are not performant for internal\nimage synthesis applications. Convolutional Neural Networks (CNNs) are\ntypically used instead for a variety of internal generative tasks, at the cost\nof a larger model. We propose Neural Knitwork, an architecture for neural\nimplicit representation learning of natural images that achieves image\nsynthesis by optimizing the distribution of image patches in an adversarial\nmanner and by enforcing consistency between the patch predictions. To the best\nof our knowledge, this is the first implementation of a coordinate-based MLP\ntailored for synthesis tasks such as image inpainting, super-resolution, and\ndenoising. We demonstrate the utility of the proposed technique by training on\nthese three tasks. The results show that modeling natural images using patches,\nrather than pixels, produces results of higher fidelity. The resulting model\nrequires 80% fewer parameters than alternative CNN-based solutions while\nachieving comparable performance and training time.\n","authors":["Mikolaj Czerkawski","Javier Cardona","Robert Atkinson","Craig Michie","Ivan Andonovic","Carmine Clemente","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2109.14406v2.pdf","comment":"Published in Pattern Recognition"},{"id":"http://arxiv.org/abs/2404.10147v1","updated":"2024-04-15T21:33:45Z","published":"2024-04-15T21:33:45Z","title":"Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban\n Crime Dynamics","summary":" This study addresses the challenge of urban safety in New York City by\nexamining the relationship between the built environment and crime rates using\nmachine learning and a comprehensive dataset of street view images. We aim to\nidentify how urban landscapes correlate with crime statistics, focusing on the\ncharacteristics of street views and their association with crime rates. 
The\nfindings offer insights for urban planning and crime prevention, highlighting\nthe potential of environmental design in enhancing public safety.\n","authors":["Zhixuan Qi","Huaiying Luo","Chen Chi"],"pdf_url":"https://arxiv.org/pdf/2404.10147v1.pdf","comment":null}]},"2024-04-14T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2302.04871v4","updated":"2024-04-14T23:46:05Z","published":"2023-02-09T18:59:56Z","title":"In-N-Out: Faithful 3D GAN Inversion with Volumetric Decomposition for\n Face Editing","summary":" 3D-aware GANs offer new capabilities for view synthesis while preserving the\nediting functionalities of their 2D counterparts. GAN inversion is a crucial\nstep that seeks the latent code to reconstruct input images or videos,\nsubsequently enabling diverse editing tasks through manipulation of this latent\ncode. However, a model pre-trained on a particular dataset (e.g., FFHQ) often\nhas difficulty reconstructing images with out-of-distribution (OOD) objects\nsuch as faces with heavy make-up or occluding objects. We address this issue by\nexplicitly modeling OOD objects from the input in 3D-aware GANs. Our core idea\nis to represent the image using two individual neural radiance fields: one for\nthe in-distribution content and the other for the out-of-distribution object.\nThe final reconstruction is achieved by optimizing the composition of these two\nradiance fields with carefully designed regularization. We demonstrate that our\nexplicit decomposition alleviates the inherent trade-off between reconstruction\nfidelity and editability. We evaluate reconstruction accuracy and editability\nof our method on challenging real face images and videos and showcase favorable\nresults against other baselines.\n","authors":["Yiran Xu","Zhixin Shu","Cameron Smith","Seoung Wug Oh","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2302.04871v4.pdf","comment":"Project page: https://in-n-out-3d.github.io/"},{"id":"http://arxiv.org/abs/2404.09378v1","updated":"2024-04-14T23:30:35Z","published":"2024-04-14T23:30:35Z","title":"Orientation-conditioned Facial Texture Mapping for Video-based Facial\n Remote Photoplethysmography Estimation","summary":" Camera-based remote photoplethysmography (rPPG) enables contactless\nmeasurement of important physiological signals such as pulse rate (PR).\nHowever, dynamic and unconstrained subject motion introduces significant\nvariability into the facial appearance in video, confounding the ability of\nvideo-based methods to accurately extract the rPPG signal. In this study, we\nleverage the 3D facial surface to construct a novel orientation-conditioned\nfacial texture video representation which improves the motion robustness of\nexisting video-based facial rPPG estimation methods. Our proposed method\nachieves a significant 18.2% performance improvement in cross-dataset testing\non MMPD over our baseline using the PhysNet model trained on PURE, highlighting\nthe efficacy and generalization benefits of our designed video representation.\nWe demonstrate significant performance improvements of up to 29.6% in all\ntested motion scenarios in cross-dataset testing on MMPD, even in the presence\nof dynamic and unconstrained subject motion, emphasizing the benefits of\ndisentangling motion through modeling the 3D facial surface for\nmotion-robust facial rPPG estimation. We validate the efficacy of our design\ndecisions and the impact of different video processing steps through an\nablation study. 
Our findings illustrate the potential strengths of exploiting\nthe 3D facial surface as a general strategy for addressing dynamic and\nunconstrained subject motion in videos. The code is available at\nhttps://samcantrill.github.io/orientation-uv-rppg/.\n","authors":["Sam Cantrill","David Ahmedt-Aristizabal","Lars Petersson","Hanna Suominen","Mohammad Ali Armin"],"pdf_url":"https://arxiv.org/pdf/2404.09378v1.pdf","comment":"12 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.09376v1","updated":"2024-04-14T23:17:01Z","published":"2024-04-14T23:17:01Z","title":"\\textit{sweet} -- An Open Source Modular Platform for Contactless Hand\n Vascular Biometric Experiments","summary":" Current finger-vein or palm-vein recognition systems usually require direct\ncontact of the subject with the apparatus. This can be problematic in\nenvironments where hygiene is of primary importance. In this work we present a\ncontactless vascular biometrics sensor platform named \\sweet which can be used\nfor hand vascular biometrics studies (wrist-, palm- and finger-vein) and\nsurface features such as palmprint. It supports several acquisition modalities\nsuch as multi-spectral Near-Infrared (NIR), RGB-color, Stereo Vision (SV) and\nPhotometric Stereo (PS). Using this platform we collect a dataset consisting of\nthe fingers, palm and wrist vascular data of 120 subjects and develop a\npowerful 3D pipeline for the pre-processing of this data. We then present\nbiometric experimental results, focusing on Finger-Vein Recognition (FVR).\nFinally, we discuss fusion of multiple modalities, such palm-vein combined with\npalm-print biometrics. The acquisition software, parts of the hardware design,\nthe new FV dataset, as well as source-code for our experiments are publicly\navailable for research purposes.\n","authors":["David Geissbühler","Sushil Bhattacharjee","Ketan Kotwal","Guillaume Clivaz","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.09376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06741v2","updated":"2024-04-14T22:33:27Z","published":"2023-12-11T18:19:04Z","title":"Gaussian Splatting SLAM","summary":" We present the first application of 3D Gaussian Splatting in monocular SLAM,\nthe most fundamental but the hardest setup for Visual SLAM. Our method, which\nruns live at 3fps, utilises Gaussians as the only 3D representation, unifying\nthe required representation for accurate, efficient tracking, mapping, and\nhigh-quality rendering. Designed for challenging monocular settings, our\napproach is seamlessly extendable to RGB-D SLAM when an external depth sensor\nis available. Several innovations are required to continuously reconstruct 3D\nscenes with high fidelity from a live camera. First, to move beyond the\noriginal 3DGS algorithm, which requires accurate poses from an offline\nStructure from Motion (SfM) system, we formulate camera tracking for 3DGS using\ndirect optimisation against the 3D Gaussians, and show that this enables fast\nand robust tracking with a wide basin of convergence. Second, by utilising the\nexplicit nature of the Gaussians, we introduce geometric verification and\nregularisation to handle the ambiguities occurring in incremental 3D dense\nreconstruction. Finally, we introduce a full SLAM system which not only\nachieves state-of-the-art results in novel view synthesis and trajectory\nestimation but also reconstruction of tiny and even transparent objects.\n","authors":["Hidenobu Matsuki","Riku Murai","Paul H. J. Kelly","Andrew J. 
Davison"],"pdf_url":"https://arxiv.org/pdf/2312.06741v2.pdf","comment":"CVPR2024 Highlight. First two authors contributed equally to this\n work. Project Page: https://rmurai.co.uk/projects/GaussianSplattingSLAM/"},{"id":"http://arxiv.org/abs/2310.08580v2","updated":"2024-04-14T22:23:18Z","published":"2023-10-12T17:59:38Z","title":"OmniControl: Control Any Joint at Any Time for Human Motion Generation","summary":" We present a novel approach named OmniControl for incorporating flexible\nspatial control signals into a text-conditioned human motion generation model\nbased on the diffusion process. Unlike previous methods that can only control\nthe pelvis trajectory, OmniControl can incorporate flexible spatial control\nsignals over different joints at different times with only one model.\nSpecifically, we propose analytic spatial guidance that ensures the generated\nmotion can tightly conform to the input control signals. At the same time,\nrealism guidance is introduced to refine all the joints to generate more\ncoherent motion. Both the spatial and realism guidance are essential and they\nare highly complementary for balancing control accuracy and motion realism. By\ncombining them, OmniControl generates motions that are realistic, coherent, and\nconsistent with the spatial constraints. Experiments on HumanML3D and KIT-ML\ndatasets show that OmniControl not only achieves significant improvement over\nstate-of-the-art methods on pelvis control but also shows promising results\nwhen incorporating the constraints over other joints.\n","authors":["Yiming Xie","Varun Jampani","Lei Zhong","Deqing Sun","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.08580v2.pdf","comment":"ICLR 2024. Project page: https://neu-vi.github.io/omnicontrol/"},{"id":"http://arxiv.org/abs/2404.09359v1","updated":"2024-04-14T21:14:47Z","published":"2024-04-14T21:14:47Z","title":"Exploring Feedback Generation in Automated Skeletal Movement Assessment:\n A Comprehensive Overview","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection from 2D or 3D videos. While the primary objective of\nautomatic assessment tasks is to score movements, the automatic generation of\nfeedback highlighting key movement issues has the potential to significantly\nenhance and accelerate the rehabilitation process. In this study, we explain\nthe types of feedback that can be generated, review existing solutions for\nautomatic feedback generation, and discuss future research directions. To our\nknowledge, this is the first comprehensive review of feedback generation in\nskeletal movement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19001v3","updated":"2024-04-14T21:13:01Z","published":"2024-02-29T09:52:39Z","title":"Analysis of the Two-Step Heterogeneous Transfer Learning for Laryngeal\n Blood Vessel Classification: Issue and Improvement","summary":" Accurate classification of laryngeal vascular as benign or malignant is\ncrucial for early detection of laryngeal cancer. However, organizations with\nlimited access to laryngeal vascular images face challenges due to the lack of\nlarge and homogeneous public datasets for effective learning. 
Distinguished\nfrom the most familiar works, which directly transfer the ImageNet pre-trained\nmodels to the target domain for fine-tuning, this work pioneers exploring\ntwo-step heterogeneous transfer learning (THTL) for laryngeal lesion\nclassification with nine deep-learning models, utilizing the diabetic\nretinopathy color fundus images, semantically non-identical yet vascular\nimages, as the intermediate domain. Attention visualization technique, Layer\nClass Activate Map (LayerCAM), reveals a novel finding that yet the\nintermediate and the target domain both reflect vascular structure to a certain\nextent, the prevalent radial vascular pattern in the intermediate domain\nprevents learning the features of twisted and tangled vessels that distinguish\nthe malignant class in the target domain, summarizes a vital rule for laryngeal\nlesion classification using THTL. To address this, we introduce an enhanced\nfine-tuning strategy in THTL called Step-Wise Fine-Tuning (SWFT) and apply it\nto the ResNet models. SWFT progressively refines model performance by\naccumulating fine-tuning layers from back to front, guided by the visualization\nresults of LayerCAM. Comparison with the original THTL approach shows\nsignificant improvements. For ResNet18, the accuracy and malignant recall\nincreases by 26.1% and 79.8%, respectively, while for ResNet50, these\nindicators improve by 20.4% and 62.2%, respectively.\n","authors":["Xinyi Fang","Xu Yang","Chak Fong Chong","Kei Long Wong","Yapeng Wang","Tiankui Zhang","Sio-Kei Im"],"pdf_url":"https://arxiv.org/pdf/2402.19001v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09349v1","updated":"2024-04-14T20:14:38Z","published":"2024-04-14T20:14:38Z","title":"Adversarial Robustness Limits via Scaling-Law and Human-Alignment\n Studies","summary":" This paper revisits the simple, long-studied, yet still unsolved problem of\nmaking image classifiers robust to imperceptible perturbations. Taking CIFAR10\nas an example, SOTA clean accuracy is about $100$%, but SOTA robustness to\n$\\ell_{\\infty}$-norm bounded perturbations barely exceeds $70$%. To understand\nthis gap, we analyze how model size, dataset size, and synthetic data quality\naffect robustness by developing the first scaling laws for adversarial\ntraining. Our scaling laws reveal inefficiencies in prior art and provide\nactionable feedback to advance the field. For instance, we discovered that SOTA\nmethods diverge notably from compute-optimal setups, using excess compute for\ntheir level of robustness. Leveraging a compute-efficient setup, we surpass the\nprior SOTA with $20$% ($70$%) fewer training (inference) FLOPs. We trained\nvarious compute-efficient models, with our best achieving $74$% AutoAttack\naccuracy ($+3$% gain). However, our scaling laws also predict robustness slowly\ngrows then plateaus at $90$%: dwarfing our new SOTA by scaling is impractical,\nand perfect robustness is impossible. To better understand this predicted\nlimit, we carry out a small-scale human evaluation on the AutoAttack data that\nfools our top-performing model. Concerningly, we estimate that human\nperformance also plateaus near $90$%, which we show to be attributable to\n$\\ell_{\\infty}$-constrained attacks' generation of invalid images not\nconsistent with their original labels. Having characterized limiting\nroadblocks, we outline promising paths for future research.\n","authors":["Brian R. 
Bartoldson","James Diffenderfer","Konstantinos Parasyris","Bhavya Kailkhura"],"pdf_url":"https://arxiv.org/pdf/2404.09349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09342v1","updated":"2024-04-14T19:51:32Z","published":"2024-04-14T19:51:32Z","title":"Face-voice Association in Multilingual Environments (FAME) Challenge\n 2024 Evaluation Plan","summary":" The advancements of technology have led to the use of multimodal systems in\nvarious real-world applications. Among them, the audio-visual systems are one\nof the widely used multimodal systems. In the recent years, associating face\nand voice of a person has gained attention due to presence of unique\ncorrelation between them. The Face-voice Association in Multilingual\nEnvironments (FAME) Challenge 2024 focuses on exploring face-voice association\nunder a unique condition of multilingual scenario. This condition is inspired\nfrom the fact that half of the world's population is bilingual and most often\npeople communicate under multilingual scenario. The challenge uses a dataset\nnamely, Multilingual Audio-Visual (MAV-Celeb) for exploring face-voice\nassociation in multilingual environments. This report provides the details of\nthe challenge, dataset, baselines and task details for the FAME Challenge.\n","authors":["Muhammad Saad Saeed","Shah Nawaz","Muhammad Salman Tahir","Rohan Kumar Das","Muhammad Zaigham Zaheer","Marta Moscati","Markus Schedl","Muhammad Haris Khan","Karthik Nandakumar","Muhammad Haroon Yousaf"],"pdf_url":"https://arxiv.org/pdf/2404.09342v1.pdf","comment":"ACM Multimedia Conference - Grand Challenge"},{"id":"http://arxiv.org/abs/2404.09326v1","updated":"2024-04-14T18:57:38Z","published":"2024-04-14T18:57:38Z","title":"Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision\n Transformers","summary":" Few-shot knowledge distillation recently emerged as a viable approach to\nharness the knowledge of large-scale pre-trained models, using limited data and\ncomputational resources. In this paper, we propose a novel few-shot feature\ndistillation approach for vision transformers. Our approach is based on two key\nsteps. Leveraging the fact that vision transformers have a consistent\ndepth-wise structure, we first copy the weights from intermittent layers of\nexisting pre-trained vision transformers (teachers) into shallower\narchitectures (students), where the intermittence factor controls the\ncomplexity of the student transformer with respect to its teacher. Next, we\nemploy an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge\ninto the student in a few-shot scenario, aiming to recover the information\nprocessing carried out by the skipped teacher layers. We present comprehensive\nexperiments with supervised and self-supervised transformers as teachers, on\nfive data sets from various domains, including natural, medical and satellite\nimages. The empirical results confirm the superiority of our approach over\ncompetitive baselines. 
Moreover, the ablation results demonstrate the\nusefulness of each component of the proposed pipeline.\n","authors":["Diana-Nicoleta Grigore","Mariana-Iuliana Georgescu","Jon Alvarez Justo","Tor Johansen","Andreea Iuliana Ionescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.09326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05105v2","updated":"2024-04-14T18:27:41Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Multi-Modality Deformable Image Registration Framework\n based on Visual State Space Model with Cross-Scan Module","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing. To overcome the complex motion and structure on\nmulti-modality images, we further propose a fine-tune recursive registration\nframework. We validate VMambaMorph using a public benchmark brain MR-CT\nregistration dataset, comparing its performance against current\nstate-of-the-art methods. The results indicate that VMambaMorph achieves\ncompetitive registration quality. The code for VMambaMorph with all baseline\nmethods is available on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16519v2","updated":"2024-04-14T17:56:49Z","published":"2023-12-27T10:57:03Z","title":"Image Restoration by Denoising Diffusion Models with Iteratively\n Preconditioned Guidance","summary":" Training deep neural networks has become a common approach for addressing\nimage restoration problems. An alternative for training a \"task-specific\"\nnetwork for each observation model is to use pretrained deep denoisers for\nimposing only the signal's prior within iterative algorithms, without\nadditional training. Recently, a sampling-based variant of this approach has\nbecome popular with the rise of diffusion/score-based generative models. Using\ndenoisers for general purpose restoration requires guiding the iterations to\nensure agreement of the signal with the observations. In low-noise settings,\nguidance that is based on back-projection (BP) has been shown to be a promising\nstrategy (used recently also under the names \"pseudoinverse\" or\n\"range/null-space\" guidance). However, the presence of noise in the\nobservations hinders the gains from this approach. 
In this paper, we propose a\nnovel guidance technique, based on preconditioning that allows traversing from\nBP-based guidance to least squares based guidance along the restoration scheme.\nThe proposed approach is robust to noise while still having much simpler\nimplementation than alternative methods (e.g., it does not require SVD or a\nlarge number of iterations). We use it within both an optimization scheme and a\nsampling-based scheme, and demonstrate its advantages over existing methods for\nimage deblurring and super-resolution.\n","authors":["Tomer Garber","Tom Tirer"],"pdf_url":"https://arxiv.org/pdf/2312.16519v2.pdf","comment":"CVPR 2024 (camera-ready). Code can be found at:\n https://github.com/tirer-lab/DDPG"},{"id":"http://arxiv.org/abs/2312.05239v3","updated":"2024-04-14T17:39:27Z","published":"2023-12-08T18:44:09Z","title":"SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational\n Score Distillation","summary":" Despite their ability to generate high-resolution and diverse images from\ntext prompts, text-to-image diffusion models often suffer from slow iterative\nsampling processes. Model distillation is one of the most effective directions\nto accelerate these models. However, previous distillation methods fail to\nretain the generation quality while requiring a significant amount of images\nfor training, either from real data or synthetically generated by the teacher\nmodel. In response to this limitation, we present a novel image-free\ndistillation scheme named $\\textbf{SwiftBrush}$. Drawing inspiration from\ntext-to-3D synthesis, in which a 3D neural radiance field that aligns with the\ninput prompt can be obtained from a 2D text-to-image diffusion prior via a\nspecialized loss without the use of any 3D data ground-truth, our approach\nre-purposes that same loss for distilling a pretrained multi-step text-to-image\nmodel to a student network that can generate high-fidelity images with just a\nsingle inference step. In spite of its simplicity, our model stands as one of\nthe first one-step text-to-image generators that can produce images of\ncomparable quality to Stable Diffusion without reliance on any training image\ndata. Remarkably, SwiftBrush achieves an FID score of $\\textbf{16.67}$ and a\nCLIP score of $\\textbf{0.29}$ on the COCO-30K benchmark, achieving competitive\nresults or even substantially surpassing existing state-of-the-art distillation\ntechniques.\n","authors":["Thuan Hoang Nguyen","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.05239v3.pdf","comment":"Accepted to CVPR 2024; Project Page:\n https://thuanz123.github.io/swiftbrush/"},{"id":"http://arxiv.org/abs/2404.09308v1","updated":"2024-04-14T17:33:33Z","published":"2024-04-14T17:33:33Z","title":"In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and\n Action Recognition","summary":" Action recognition is essential for egocentric video understanding, allowing\nautomatic and continuous monitoring of Activities of Daily Living (ADLs)\nwithout user effort. Existing literature focuses on 3D hand pose input, which\nrequires computationally intensive depth estimation networks or wearing an\nuncomfortable depth sensor. In contrast, there has been insufficient research\nin understanding 2D hand pose for egocentric action recognition, despite the\navailability of user-friendly smart glasses in the market capable of capturing\na single RGB image. 
Our study aims to fill this research gap by exploring the\nfield of 2D hand pose estimation for egocentric action recognition, making two\ncontributions. Firstly, we introduce two novel approaches for 2D hand pose\nestimation, namely EffHandNet for single-hand estimation and EffHandEgoNet,\ntailored for an egocentric perspective, capturing interactions between hands\nand objects. Both methods outperform state-of-the-art models on H2O and FPHA\npublic benchmarks. Secondly, we present a robust action recognition\narchitecture from 2D hand and object poses. This method incorporates\nEffHandEgoNet, and a transformer-based action recognition method. Evaluated on\nH2O and FPHA datasets, our architecture has a faster inference time and\nachieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of\nthe art, including 3D-based methods. Our work demonstrates that using 2D\nskeletal data is a robust approach for egocentric action understanding.\nExtensive evaluation and ablation studies show the impact of the hand pose\nestimation approach, and how each input affects the overall performance.\n","authors":["Wiktor Mucha","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2404.09308v1.pdf","comment":"Accepted at: The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition"},{"id":"http://arxiv.org/abs/2309.07849v3","updated":"2024-04-14T17:29:46Z","published":"2023-09-14T16:48:31Z","title":"TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic\n Segmentation","summary":" LiDAR semantic segmentation plays a crucial role in enabling autonomous\ndriving and robots to understand their surroundings accurately and robustly. A\nmultitude of methods exist within this domain, including point-based,\nrange-image-based, polar-coordinate-based, and hybrid strategies. Among these,\nrange-image-based techniques have gained widespread adoption in practical\napplications due to their efficiency. However, they face a significant\nchallenge known as the ``many-to-one'' problem caused by the range image's\nlimited horizontal and vertical angular resolution. As a result, around 20% of\nthe 3D points can be occluded. In this paper, we present TFNet, a\nrange-image-based LiDAR semantic segmentation method that utilizes temporal\ninformation to address this issue. Specifically, we incorporate a temporal\nfusion layer to extract useful information from previous scans and integrate it\nwith the current scan. We then design a max-voting-based post-processing\ntechnique to correct false predictions, particularly those caused by the\n``many-to-one'' issue. We evaluated the approach on two benchmarks and\ndemonstrated that the plug-in post-processing technique is generic and can be\napplied to various networks.\n","authors":["Rong Li","ShiJie Li","Xieyuanli Chen","Teli Ma","Juergen Gall","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2309.07849v3.pdf","comment":"accepted by CVPR2024 Workshop on Autonomous Driving"},{"id":"http://arxiv.org/abs/2404.09301v1","updated":"2024-04-14T16:55:23Z","published":"2024-04-14T16:55:23Z","title":"A Simple Strategy for Body Estimation from Partial-View Images","summary":" Virtual try-on and product personalization have become increasingly important\nin modern online shopping, highlighting the need for accurate body measurement\nestimation. 
Although previous research has advanced in estimating 3D body\nshapes from RGB images, the task is inherently ambiguous as the observed scale\nof human subjects in the images depends on two unknown factors: capture\ndistance and body dimensions. This ambiguity is particularly pronounced in\npartial-view scenarios. To address this challenge, we propose a modular and\nsimple height normalization solution. This solution relocates the subject\nskeleton to the desired position, thereby normalizing the scale and\ndisentangling the relationship between the two variables. Our experimental\nresults demonstrate that integrating this technique into state-of-the-art human\nmesh reconstruction models significantly enhances partial body measurement\nestimation. Additionally, we illustrate the applicability of this approach to\nmulti-view settings, showcasing its versatility.\n","authors":["Yafei Mao","Xuelu Li","Brandon Smith","Jinjin Li","Raja Bala"],"pdf_url":"https://arxiv.org/pdf/2404.09301v1.pdf","comment":"Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design"},{"id":"http://arxiv.org/abs/2404.07191v2","updated":"2024-04-14T16:54:24Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v2.pdf","comment":"Technical report. Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.09293v1","updated":"2024-04-14T16:09:33Z","published":"2024-04-14T16:09:33Z","title":"A Novel State Space Model with Local Enhancement and State Sharing for\n Image Fusion","summary":" In image fusion tasks, images from different sources possess distinct\ncharacteristics. This has driven the development of numerous methods to explore\nbetter ways of fusing them while preserving their respective characteristics.\nMamba, as a state space model, has emerged in the field of natural language\nprocessing. Recently, many studies have attempted to extend Mamba to vision\ntasks. However, due to the nature of images different from casual language\nsequences, the limited state capacity of Mamba weakens its ability to model\nimage information. Additionally, the sequence modeling ability of Mamba is only\ncapable of spatial information and cannot effectively capture the rich spectral\ninformation in images. 
Motivated by these challenges, we customize and improve\nthe vision Mamba network designed for the image fusion task. Specifically, we\npropose the local-enhanced vision Mamba block, dubbed as LEVM. The LEVM block\ncan improve local information perception of the network and simultaneously\nlearn local and global spatial information. Furthermore, we propose the state\nsharing technique to enhance spatial details and integrate spatial and spectral\ninformation. Finally, the overall network is a multi-scale structure based on\nvision Mamba, called LE-Mamba. Extensive experiments show the proposed methods\nachieve state-of-the-art results on multispectral pansharpening and\nmultispectral and hyperspectral image fusion datasets, and demonstrate the\neffectiveness of the proposed approach. Code will be made available.\n","authors":["Zihan Cao","Xiao Wu","Liang-Jian Deng","Yu Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.09293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09292v1","updated":"2024-04-14T15:58:35Z","published":"2024-04-14T15:58:35Z","title":"Bridging Data Islands: Geographic Heterogeneity-Aware Federated Learning\n for Collaborative Remote Sensing Semantic Segmentation","summary":" Remote sensing semantic segmentation (RSS) is an essential task in Earth\nObservation missions. Due to data privacy concerns, high-quality remote sensing\nimages with annotations cannot be well shared among institutions, making it\ndifficult to fully utilize RSS data to train a generalized model. Federated\nLearning (FL), a privacy-preserving collaborative learning technology, is a\npotential solution. However, the current research on how to effectively apply\nFL in RSS is still scarce and requires further investigation. Remote sensing\nimages in various institutions often exhibit strong geographical heterogeneity.\nMore specifically, it is reflected in terms of class-distribution heterogeneity\nand object-appearance heterogeneity. Unfortunately, most existing FL studies\nshow inadequate focus on geographical heterogeneity, thus leading to\nperformance degradation in the global model. Considering the aforementioned\nissues, we propose a novel Geographic Heterogeneity-Aware Federated Learning\n(GeoFed) framework to address privacy-preserving RSS. Through Global Feature\nExtension and Tail Regeneration modules, class-distribution heterogeneity is\nalleviated. Additionally, we design an Essential Feature Mining strategy to\nalleviate object-appearance heterogeneity by constructing essential features.\nExtensive experiments on three datasets (i.e., FBP, CASID, Inria) show that our\nGeoFed consistently outperforms the current state-of-the-art methods. The code\nwill be available publicly.\n","authors":["Jieyi Tan","Yansheng Li","Sergey A. Bartalev","Bo Dang","Wei Chen","Yongjun Zhang","Liangqi Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.09292v1.pdf","comment":"13 pages,9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.09290v1","updated":"2024-04-14T15:50:10Z","published":"2024-04-14T15:50:10Z","title":"RoofDiffusion: Constructing Roofs from Severely Corrupted Point Data via\n Diffusion","summary":" Accurate completion and denoising of roof height maps are crucial to\nreconstructing high-quality 3D buildings. Repairing sparse points can enhance\nlow-cost sensor use and reduce UAV flight overlap. RoofDiffusion is a new\nend-to-end self-supervised diffusion technique for robustly completing, in\nparticular difficult, roof height maps. 
RoofDiffusion leverages\nwidely-available curated footprints and can so handle up to 99\\% point sparsity\nand 80\\% roof area occlusion (regional incompleteness). A variant, No-FP\nRoofDiffusion, simultaneously predicts building footprints and heights. Both\nquantitatively outperform state-of-the-art unguided depth completion and\nrepresentative inpainting methods for Digital Elevation Models (DEM), on both a\nroof-specific benchmark and the BuildingNet dataset. Qualitative assessments\nshow the effectiveness of RoofDiffusion for datasets with real-world scans\nincluding AHN3, Dales3D, and USGS 3DEP LiDAR. Tested with the leading City3D\nalgorithm, preprocessing height maps with RoofDiffusion noticeably improves 3D\nbuilding reconstruction. RoofDiffusion is complemented by a new dataset of 13k\ncomplex roof geometries, focusing on long-tail issues in remote sensing; a\nnovel simulation of tree occlusion; and a wide variety of large-area roof\ncut-outs for data augmentation and benchmarking.\n","authors":["Kyle Shih-Huang Lo","Jörg Peters","Eric Spellman"],"pdf_url":"https://arxiv.org/pdf/2404.09290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09277v1","updated":"2024-04-14T14:58:52Z","published":"2024-04-14T14:58:52Z","title":"SyntStereo2Real: Edge-Aware GAN for Remote Sensing Image-to-Image\n Translation while Maintaining Stereo Constraint","summary":" In the field of remote sensing, the scarcity of stereo-matched and\nparticularly lack of accurate ground truth data often hinders the training of\ndeep neural networks. The use of synthetically generated images as an\nalternative, alleviates this problem but suffers from the problem of domain\ngeneralization. Unifying the capabilities of image-to-image translation and\nstereo-matching presents an effective solution to address the issue of domain\ngeneralization. Current methods involve combining two networks, an unpaired\nimage-to-image translation network and a stereo-matching network, while jointly\noptimizing them. We propose an edge-aware GAN-based network that effectively\ntackles both tasks simultaneously. We obtain edge maps of input images from the\nSobel operator and use it as an additional input to the encoder in the\ngenerator to enforce geometric consistency during translation. We additionally\ninclude a warping loss calculated from the translated images to maintain the\nstereo consistency. We demonstrate that our model produces qualitatively and\nquantitatively superior results than existing models, and its applicability\nextends to diverse domains, including autonomous driving.\n","authors":["Vasudha Venkatesan","Daniel Panangian","Mario Fuentes Reyes","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2404.09277v1.pdf","comment":"Accepted to IEEE Conference on Computer Vision and Pattern\n Recognition Workshop (CVPRW) EarthVision"},{"id":"http://arxiv.org/abs/2304.02649v3","updated":"2024-04-14T14:55:55Z","published":"2023-04-03T20:19:56Z","title":"Specialty-Oriented Generalist Medical AI for Chest CT Screening","summary":" Modern medical records include a vast amount of multimodal free text clinical\ndata and imaging data from radiology, cardiology, and digital pathology. Fully\nmining such big data requires multitasking; otherwise, occult but important\naspects may be overlooked, adversely affecting clinical management and\npopulation healthcare. 
Despite remarkable successes of AI in individual tasks\nwith single-modal data, the progress in developing generalist medical AI\nremains relatively slow to combine multimodal data for multitasks because of\nthe dual challenges of data curation and model architecture. The data challenge\ninvolves querying and curating multimodal structured and unstructured text,\nalphanumeric, and especially 3D tomographic scans on an individual patient\nlevel for real-time decisions and on a scale to estimate population health\nstatistics. The model challenge demands a scalable and adaptable network\narchitecture to integrate multimodal datasets for diverse clinical tasks. Here\nwe propose the first-of-its-kind medical multimodal-multitask foundation model\n(M3FM) with application in lung cancer screening and related tasks. After we\ncurated a comprehensive multimodal multitask dataset consisting of 49 clinical\ndata types including 163,725 chest CT series and 17 medical tasks involved in\nLCS, we develop a multimodal question-answering framework as a unified training\nand inference strategy to synergize multimodal information and perform multiple\ntasks via free-text prompting. M3FM consistently outperforms the\nstate-of-the-art single-modal task-specific models, identifies multimodal data\nelements informative for clinical tasks and flexibly adapts to new tasks with a\nsmall out-of-distribution dataset. As a specialty-oriented generalist medical\nAI model, M3FM paves the way for similar breakthroughs in other areas of\nmedicine, closing the gap between specialists and the generalist.\n","authors":["Chuang Niu","Qing Lyu","Christopher D. Carothers","Parisa Kaviani","Josh Tan","Pingkun Yan","Mannudeep K. Kalra","Christopher T. Whitlow","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2304.02649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01029v3","updated":"2024-04-14T14:53:32Z","published":"2023-04-03T14:28:29Z","title":"Domain Generalization for Crop Segmentation with Standardized Ensemble\n Knowledge Distillation","summary":" In recent years, precision agriculture has gradually oriented farming closer\nto automation processes to support all the activities related to field\nmanagement. Service robotics plays a predominant role in this evolution by\ndeploying autonomous agents that can navigate fields while performing tasks\nsuch as monitoring, spraying, and harvesting without human intervention. To\nexecute these precise actions, mobile robots need a real-time perception system\nthat understands their surroundings and identifies their targets in the wild.\nExisting methods, however, often fall short in generalizing to new crops and\nenvironmental conditions. This limit is critical for practical applications\nwhere labeled samples are rarely available. In this paper, we investigate the\nproblem of crop segmentation and propose a novel approach to enhance domain\ngeneralization using knowledge distillation. In the proposed framework, we\ntransfer knowledge from a standardized ensemble of models individually trained\non source domains to a student model that can adapt to unseen realistic\nscenarios. To support the proposed method, we present a synthetic multi-domain\ndataset for crop segmentation containing plants of variegate species and\ncovering different terrain styles, weather conditions, and light scenarios for\nmore than 70,000 samples. We demonstrate significant improvements in\nperformance over state-of-the-art methods and superior sim-to-real\ngeneralization. 
Our approach provides a promising solution for domain\ngeneralization in crop segmentation and has the potential to enhance a wide\nvariety of agriculture applications.\n","authors":["Simone Angarano","Mauro Martini","Alessandro Navone","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2304.01029v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09275v1","updated":"2024-04-14T14:51:44Z","published":"2024-04-14T14:51:44Z","title":"TrafficVLM: A Controllable Visual Language Model for Traffic Video\n Captioning","summary":" Traffic video description and analysis have received much attention recently\ndue to the growing demand for efficient and reliable urban surveillance\nsystems. Most existing methods only focus on locating traffic event segments,\nwhich severely lack descriptive details related to the behaviour and context of\nall the subjects of interest in the events. In this paper, we present\nTrafficVLM, a novel multi-modal dense video captioning model for vehicle ego\ncamera view. TrafficVLM models traffic video events at different levels of\nanalysis, both spatially and temporally, and generates long fine-grained\ndescriptions for the vehicle and pedestrian at different phases of the event.\nWe also propose a conditional component for TrafficVLM to control the\ngeneration outputs and a multi-task fine-tuning paradigm to enhance\nTrafficVLM's learning capability. Experiments show that TrafficVLM performs\nwell on both vehicle and overhead camera views. Our solution achieved\noutstanding results in Track 2 of the AI City Challenge 2024, ranking us third\nin the challenge standings. Our code is publicly available at\nhttps://github.com/quangminhdinh/TrafficVLM.\n","authors":["Quang Minh Dinh","Minh Khoi Ho","Anh Quan Dang","Hung Phong Tran"],"pdf_url":"https://arxiv.org/pdf/2404.09275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09271v1","updated":"2024-04-14T14:26:33Z","published":"2024-04-14T14:26:33Z","title":"VRS-NeRF: Visual Relocalization with Sparse Neural Radiance Field","summary":" Visual relocalization is a key technique to autonomous driving, robotics, and\nvirtual/augmented reality. After decades of explorations, absolute pose\nregression (APR), scene coordinate regression (SCR), and hierarchical methods\n(HMs) have become the most popular frameworks. However, in spite of high\nefficiency, APRs and SCRs have limited accuracy especially in large-scale\noutdoor scenes; HMs are accurate but need to store a large number of 2D\ndescriptors for matching, resulting in poor efficiency. In this paper, we\npropose an efficient and accurate framework, called VRS-NeRF, for visual\nrelocalization with sparse neural radiance field. Precisely, we introduce an\nexplicit geometric map (EGM) for 3D map representation and an implicit learning\nmap (ILM) for sparse patches rendering. In this localization process, EGP\nprovides priors of spare 2D points and ILM utilizes these sparse points to\nrender patches with sparse NeRFs for matching. This allows us to discard a\nlarge number of 2D descriptors so as to reduce the map size. Moreover,\nrendering patches only for useful points rather than all pixels in the whole\nimage reduces the rendering time significantly. This framework inherits the\naccuracy of HMs and discards their low efficiency. 
Experiments on 7Scenes,\nCambridgeLandmarks, and Aachen datasets show that our method gives much better\naccuracy than APRs and SCRs, and close performance to HMs but is much more\nefficient.\n","authors":["Fei Xue","Ignas Budvytis","Daniel Olmeda Reino","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2404.09271v1.pdf","comment":"source code https://github.com/feixue94/vrs-nerf"},{"id":"http://arxiv.org/abs/2404.09269v1","updated":"2024-04-14T14:24:13Z","published":"2024-04-14T14:24:13Z","title":"PANet: A Physics-guided Parametric Augmentation Net for Image Dehazing\n by Hazing","summary":" Image dehazing faces challenges when dealing with hazy images in real-world\nscenarios. A huge domain gap between synthetic and real-world haze images\ndegrades dehazing performance in practical settings. However, collecting\nreal-world image datasets for training dehazing models is challenging since\nboth hazy and clean pairs must be captured under the same conditions. In this\npaper, we propose a Physics-guided Parametric Augmentation Network (PANet) that\ngenerates photo-realistic hazy and clean training pairs to effectively enhance\nreal-world dehazing performance. PANet comprises a Haze-to-Parameter Mapper\n(HPM) to project hazy images into a parameter space and a Parameter-to-Haze\nMapper (PHM) to map the resampled haze parameters back to hazy images. In the\nparameter space, we can pixel-wisely resample individual haze parameter maps to\ngenerate diverse hazy images with physically-explainable haze conditions unseen\nin the training set. Our experimental results demonstrate that PANet can\naugment diverse realistic hazy images to enrich existing hazy image benchmarks\nso as to effectively boost the performances of state-of-the-art image dehazing\nmodels.\n","authors":["Chih-Ling Chang","Fu-Jen Tsai","Zi-Ling Huang","Lin Gu","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.09269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07564v2","updated":"2024-04-14T14:11:58Z","published":"2024-03-12T11:51:59Z","title":"RSBuilding: Towards General Remote Sensing Image Building Extraction and\n Change Detection with Foundation Model","summary":" The intelligent interpretation of buildings plays a significant role in urban\nplanning and management, macroeconomic analysis, population dynamics, etc.\nRemote sensing image building interpretation primarily encompasses building\nextraction and change detection. However, current methodologies often treat\nthese two tasks as separate entities, thereby failing to leverage shared\nknowledge. Moreover, the complexity and diversity of remote sensing image\nscenes pose additional challenges, as most algorithms are designed to model\nindividual small datasets, thus lacking cross-scene generalization. In this\npaper, we propose a comprehensive remote sensing image building understanding\nmodel, termed RSBuilding, developed from the perspective of the foundation\nmodel. RSBuilding is designed to enhance cross-scene generalization and task\nuniversality. Specifically, we extract image features based on the prior\nknowledge of the foundation model and devise a multi-level feature sampler to\naugment scale information. 
To unify task representation and integrate image\nspatiotemporal clues, we introduce a cross-attention decoder with task prompts.\nAddressing the current shortage of datasets that incorporate annotations for\nboth tasks, we have developed a federated training strategy to facilitate\nsmooth model convergence even when supervision for some tasks is missing,\nthereby bolstering the complementarity of different tasks. Our model was\ntrained on a dataset comprising up to 245,000 images and validated on multiple\nbuilding extraction and change detection datasets. The experimental results\nsubstantiate that RSBuilding can concurrently handle two structurally distinct\ntasks and exhibits robust zero-shot generalization capabilities.\n","authors":["Mingze Wang","Lili Su","Cilin Yan","Sheng Xu","Pengcheng Yuan","Xiaolong Jiang","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.07564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09263v1","updated":"2024-04-14T14:06:42Z","published":"2024-04-14T14:06:42Z","title":"Task-Driven Exploration: Decoupling and Inter-Task Feedback for Joint\n Moment Retrieval and Highlight Detection","summary":" Video moment retrieval and highlight detection are two highly valuable tasks\nin video understanding, but until recently they have been jointly studied.\nAlthough existing studies have made impressive advancement recently, they\npredominantly follow the data-driven bottom-up paradigm. Such paradigm\noverlooks task-specific and inter-task effects, resulting in poor model\nperformance. In this paper, we propose a novel task-driven top-down framework\nTaskWeave for joint moment retrieval and highlight detection. The framework\nintroduces a task-decoupled unit to capture task-specific and common\nrepresentations. To investigate the interplay between the two tasks, we propose\nan inter-task feedback mechanism, which transforms the results of one task as\nguiding masks to assist the other task. Different from existing methods, we\npresent a task-dependent joint loss function to optimize the model.\nComprehensive experiments and in-depth ablation studies on QVHighlights, TVSum,\nand Charades-STA datasets corroborate the effectiveness and flexibility of the\nproposed framework. Codes are available at\nhttps://github.com/EdenGabriel/TaskWeave.\n","authors":["Jin Yang","Ping Wei","Huan Li","Ziyang Ren"],"pdf_url":"https://arxiv.org/pdf/2404.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09259v1","updated":"2024-04-14T13:56:30Z","published":"2024-04-14T13:56:30Z","title":"FedCCL: Federated Dual-Clustered Feature Contrast Under Domain\n Heterogeneity","summary":" Federated learning (FL) facilitates a privacy-preserving neural network\ntraining paradigm through collaboration between edge clients and a central\nserver. One significant challenge is that the distributed data is not\nindependently and identically distributed (non-IID), typically including both\nintra-domain and inter-domain heterogeneity. However, recent research is\nlimited to simply using averaged signals as a form of regularization and only\nfocusing on one aspect of these non-IID challenges. Given these limitations,\nthis paper clarifies these two non-IID challenges and attempts to introduce\ncluster representation to address them from both local and global perspectives.\nSpecifically, we propose a dual-clustered feature contrast-based FL framework\nwith dual focuses. 
First, we employ clustering on the local representations of\neach client, aiming to capture intra-class information based on these local\nclusters at a high level of granularity. Then, we facilitate cross-client\nknowledge sharing by pulling the local representation closer to clusters shared\nby clients with similar semantics while pushing them away from clusters with\ndissimilar semantics. Second, since the sizes of local clusters belonging to\nthe same class may differ for each client, we further utilize clustering on the\nglobal side and conduct averaging to create a consistent global signal for\nguiding each local training in a contrastive manner. Experimental results on\nmultiple datasets demonstrate that our proposal achieves comparable or superior\nperformance gain under intra-domain and inter-domain heterogeneity.\n","authors":["Yu Qiao","Huy Q. Le","Mengchun Zhang","Apurba Adhikary","Chaoning Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.09259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09254v1","updated":"2024-04-14T13:39:02Z","published":"2024-04-14T13:39:02Z","title":"TEXT2TASTE: A Versatile Egocentric Vision System for Intelligent Reading\n Assistance Using Large Language Model","summary":" The ability to read, understand and find important information from written\ntext is a critical skill in our daily lives for our independence, comfort and\nsafety. However, a significant part of our society is affected by partial\nvision impairment, which leads to discomfort and dependency in daily\nactivities. To address the limitations of this part of society, we propose an\nintelligent reading assistant based on smart glasses with embedded RGB cameras\nand a Large Language Model (LLM), whose functionality goes beyond corrective\nlenses. The video recorded from the egocentric perspective of a person wearing\nthe glasses is processed to localise text information using object detection\nand optical character recognition methods. The LLM processes the data and\nallows the user to interact with the text and responds to a given query, thus\nextending the functionality of corrective lenses with the ability to find and\nsummarize knowledge from the text. To evaluate our method, we create a\nchat-based application that allows the user to interact with the system. The\nevaluation is conducted in a real-world setting, such as reading menus in a\nrestaurant, and involves four participants. The results show robust accuracy in\ntext retrieval. The system not only provides accurate meal suggestions but also\nachieves high user satisfaction, highlighting the potential of smart glasses\nand LLMs in assisting people with special needs.\n","authors":["Wiktor Mucha","Florin Cuconasu","Naome A. Etori","Valia Kalokyri","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2404.09254v1.pdf","comment":"Accepted at ICCHP 2024"},{"id":"http://arxiv.org/abs/2312.06709v4","updated":"2024-04-14T13:35:14Z","published":"2023-12-10T17:07:29Z","title":"AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains\n Into One","summary":" A handful of visual foundation models (VFMs) have recently emerged as the\nbackbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are\ntrained with distinct objectives, exhibiting unique characteristics for various\ndownstream tasks. We find that despite their conceptual differences, these\nmodels can be effectively merged into a unified model through multi-teacher\ndistillation. 
We name this approach AM-RADIO (Agglomerative Model -- Reduce All\nDomains Into One). This integrative approach not only surpasses the performance\nof individual teacher models but also amalgamates their distinctive features,\nsuch as zero-shot vision-language comprehension, detailed pixel-level\nunderstanding, and open vocabulary segmentation capabilities. In pursuit of the\nmost hardware-efficient backbone, we evaluated numerous architectures in our\nmulti-teacher distillation pipeline using the same training recipe. This led to\nthe development of a novel architecture (E-RADIO) that exceeds the performance\nof its predecessors and is at least 7x faster than the teacher models. Our\ncomprehensive benchmarking process covers downstream tasks including ImageNet\nclassification, ADE20k semantic segmentation, COCO object detection and\nLLaVa-1.5 framework.\n Code: https://github.com/NVlabs/RADIO\n","authors":["Mike Ranzinger","Greg Heinrich","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2312.06709v4.pdf","comment":"CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper,\n table 1 is now more comprehensive Version 2: Added more acknowledgements and\n updated table 7 with more recent results. Ensured that the link in the\n abstract to our code is working properly Version 3: Fix broken hyperlinks"},{"id":"http://arxiv.org/abs/2404.07766v2","updated":"2024-04-14T13:14:54Z","published":"2024-04-11T14:05:37Z","title":"RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric\n Stereo Network","summary":" Predicting accurate normal maps of objects from two-dimensional images in\nregions of complex structure and spatial material variations is challenging\nusing photometric stereo methods due to the influence of surface reflection\nproperties caused by variations in object geometry and surface materials. To\naddress this issue, we propose a photometric stereo network called a RMAFF-PSN\nthat uses residual multiscale attentional feature fusion to handle the\n``difficult'' regions of the object. Unlike previous approaches that only use\nstacked convolutional layers to extract deep features from the input image, our\nmethod integrates feature information from different resolution stages and\nscales of the image. This approach preserves more physical information, such as\ntexture and geometry of the object in complex regions, through shallow-deep\nstage feature extraction, double branching enhancement, and attention\noptimization. To test the network structure under real-world conditions, we\npropose a new real dataset called Simple PS data, which contains multiple\nobjects with varying structures and materials. Experimental results on a\npublicly available benchmark dataset demonstrate that our method outperforms\nmost existing calibrated photometric stereo methods for the same number of\ninput images, especially in the case of highly non-convex object structures.\nOur method also obtains good results under sparse lighting conditions.\n","authors":["Kai Luo","Yakun Ju","Lin Qi","Kaixuan Wang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.07766v2.pdf","comment":"17 pages,12 figures"},{"id":"http://arxiv.org/abs/2404.09245v1","updated":"2024-04-14T13:14:13Z","published":"2024-04-14T13:14:13Z","title":"Arena: A Patch-of-Interest ViT Inference Acceleration System for\n Edge-Assisted Video Analytics","summary":" The advent of edge computing has made real-time intelligent video analytics\nfeasible. 
Previous works, based on traditional model architecture (e.g., CNN,\nRNN, etc.), employ various strategies to filter out non-region-of-interest\ncontent to minimize bandwidth and computation consumption but show inferior\nperformance in adverse environments. Recently, visual foundation models based\non transformers have shown great performance in adverse environments due to\ntheir amazing generalization capability. However, they require a large amount\nof computation power, which limits their applications in real-time intelligent\nvideo analytics. In this paper, we find visual foundation models like Vision\nTransformer (ViT) also have a dedicated acceleration mechanism for video\nanalytics. To this end, we introduce Arena, an end-to-end edge-assisted video\ninference acceleration system based on ViT. We leverage the capability of ViT\nthat can be accelerated through token pruning by only offloading and feeding\nPatches-of-Interest (PoIs) to the downstream models. Additionally, we employ\nprobability-based patch sampling, which provides a simple but efficient\nmechanism for determining PoIs where the probable locations of objects are in\nsubsequent frames. Through extensive evaluations on public datasets, our\nfindings reveal that Arena can boost inference speeds by up to $1.58\\times$ and\n$1.82\\times$ on average while consuming only 54% and 34% of the bandwidth,\nrespectively, all with high inference accuracy.\n","authors":["Haosong Peng","Wei Feng","Hao Li","Yufeng Zhan","Qihua Zhou","Yuanqing Xia"],"pdf_url":"https://arxiv.org/pdf/2404.09245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12091v4","updated":"2024-04-14T13:03:26Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v4.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2211.08089v4","updated":"2024-04-14T13:02:59Z","published":"2022-11-15T12:15:29Z","title":"DeS3: Adaptive Attention-driven Self and Soft Shadow Removal using ViT\n Similarity","summary":" Removing soft and self shadows that lack clear boundaries from a single image\nis still challenging. Self shadows are shadows that are cast on the object\nitself. 
Most existing methods rely on binary shadow masks, without considering\nthe ambiguous boundaries of soft and self shadows. In this paper, we present\nDeS3, a method that removes hard, soft and self shadows based on adaptive\nattention and ViT similarity. Our novel ViT similarity loss utilizes features\nextracted from a pre-trained Vision Transformer. This loss helps guide the\nreverse sampling towards recovering scene structures. Our adaptive attention is\nable to differentiate shadow regions from the underlying objects, as well as\nshadow regions from the object casting the shadow. This capability enables DeS3\nto better recover the structures of objects even when they are partially\noccluded by shadows. Different from existing methods that rely on constraints\nduring the training phase, we incorporate the ViT similarity during the\nsampling stage. Our method outperforms state-of-the-art methods on the SRD,\nAISTD, LRSS, USR and UIUC datasets, removing hard, soft, and self shadows\nrobustly. Specifically, our method outperforms the SOTA method by 16\\% of the\nRMSE of the whole image on the LRSS dataset. Our data and code is available at:\n\\url{https://github.com/jinyeying/DeS3_Deshadow}\n","authors":["Yeying Jin","Wei Ye","Wenhan Yang","Yuan Yuan","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2211.08089v4.pdf","comment":"Accepted to AAAI2024, diffusion shadow removal,\n \\url{https://github.com/jinyeying/DeS3_Deshadow}"},{"id":"http://arxiv.org/abs/2404.05238v2","updated":"2024-04-14T12:48:55Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for finding how important each input\nfeature is to an AI's decisions. It is an interesting, unexplored question\nwhether allowing users to edit the feature importance at test time would\nimprove a human-AI team's accuracy on downstream tasks. In this paper, we\naddress this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc\nexplainable classifier \\cite{taesiri2022visual} that first predicts patch-wise\ncorrespondences between the input and training-set images, and then base on\nthem to make classification decisions. We build CHM-Corr++, an interactive\ninterface for CHM-Corr, enabling users to edit the feature attribution map\nprovided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users\ncan gain insights into if, when, and how the model changes its outputs,\nimproving their understanding beyond static explanations. However, our user\nstudy with 18 users who performed 1,400 decisions finds no statistical\nsignificance that our interactive approach improves user accuracy on CUB-200\nbird image classification over static explanations. This challenges the\nhypothesis that interactivity can boost human-AI team\naccuracy~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. We open-source CHM-Corr++, an interactive\ntool for editing image classifier attention (see an interactive demo\n\\href{http://137.184.82.109:7080/}{here}). % , and it lays the groundwork for\nfuture research to enable effective human-AI interaction in computer vision. 
We\nrelease code and data on\n\\href{https://github.com/anguyen8/chm-corr-interactive}{github}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v2.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2308.11949v2","updated":"2024-04-14T12:43:56Z","published":"2023-08-23T06:45:11Z","title":"High-quality Image Dehazing with Diffusion Model","summary":" Image dehazing is quite challenging in dense-haze scenarios, where quite less\noriginal information remains in the hazy image. Though previous methods have\nmade marvelous progress, they still suffer from information loss in content and\ncolor in dense-haze scenarios. The recently emerged Denoising Diffusion\nProbabilistic Model (DDPM) exhibits strong generation ability, showing\npotential for solving this problem. However, DDPM fails to consider the physics\nproperty of dehazing task, limiting its information completion capacity. In\nthis work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing\nframework that applies to complex hazy scenarios. Specifically, DehazeDDPM\nworks in two stages. The former stage physically models the dehazing task with\nthe Atmospheric Scattering Model (ASM), pulling the distribution closer to the\nclear data and endowing DehazeDDPM with fog-aware ability. The latter stage\nexploits the strong generation ability of DDPM to compensate for the\nhaze-induced huge information loss, by working in conjunction with the physical\nmodelling. Extensive experiments demonstrate that our method attains\nstate-of-the-art performance on both synthetic and real-world hazy datasets.\n","authors":["Hu Yu","Jie Huang","Kaiwen Zheng","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09231v1","updated":"2024-04-14T12:19:16Z","published":"2024-04-14T12:19:16Z","title":"Tri-modal Confluence with Temporal Dynamics for Scene Graph Generation\n in Operating Rooms","summary":" A comprehensive understanding of surgical scenes allows for monitoring of the\nsurgical process, reducing the occurrence of accidents and enhancing efficiency\nfor medical professionals. Semantic modeling within operating rooms, as a scene\ngraph generation (SGG) task, is challenging since it involves consecutive\nrecognition of subtle surgical actions over prolonged periods. To address this\nchallenge, we propose a Tri-modal (i.e., images, point clouds, and language)\nconfluence with Temporal dynamics framework, termed TriTemp-OR. Diverging from\nprevious approaches that integrated temporal information via memory graphs, our\nmethod embraces two advantages: 1) we directly exploit bi-modal temporal\ninformation from the video streaming for hierarchical feature interaction, and\n2) the prior knowledge from Large Language Models (LLMs) is embedded to\nalleviate the class-imbalance problem in the operating theatre. Specifically,\nour model performs temporal interactions across 2D frames and 3D point clouds,\nincluding a scale-adaptive multi-view temporal interaction (ViewTemp) and a\ngeometric-temporal point aggregation (PointTemp). Furthermore, we transfer\nknowledge from the biomedical LLM, LLaVA-Med, to deepen the comprehension of\nintraoperative relations. The proposed TriTemp-OR enables the aggregation of\ntri-modal features through relation-aware unification to predict relations so\nas to generate scene graphs. 
Experimental results on the 4D-OR benchmark\ndemonstrate the superior performance of our model for long-term OR streaming.\n","authors":["Diandian Guo","Manxi Lin","Jialun Pei","He Tang","Yueming Jin","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2404.09231v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.09227v1","updated":"2024-04-14T12:13:07Z","published":"2024-04-14T12:13:07Z","title":"DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation\n Modeling","summary":" Recent progress in text-to-3D creation has been propelled by integrating the\npotent prior of Diffusion Models from text-to-image generation into the 3D\ndomain. Nevertheless, generating 3D scenes characterized by multiple instances\nand intricate arrangements remains challenging. In this study, we present\nDreamScape, a method for creating highly consistent 3D scenes solely from\ntextual descriptions, leveraging the strong 3D representation capabilities of\nGaussian Splatting and the complex arrangement abilities of large language\nmodels (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene\nrepresentation, consisting of semantic primitives (objects) and their spatial\ntransformations and relationships derived directly from text prompts using\nLLMs. This compositional representation allows for local-to-global optimization\nof the entire scene. A progressive scale control is tailored during local\nobject generation, ensuring that objects of different sizes and densities adapt\nto the scene, which addresses training instability issue arising from simple\nblending in the subsequent global optimization stage. To mitigate potential\nbiases of LLM priors, we model collision relationships between objects at the\nglobal level, enhancing physical correctness and overall realism. Additionally,\nto generate pervasive objects like rain and snow distributed extensively across\nthe scene, we introduce a sparse initialization and densification strategy.\nExperiments demonstrate that DreamScape offers high usability and\ncontrollability, enabling the generation of high-fidelity 3D scenes from only\ntext prompts and achieving state-of-the-art performance compared to other\nmethods.\n","authors":["Xuening Yuan","Hongyu Yang","Yueming Zhao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09226v1","updated":"2024-04-14T12:09:47Z","published":"2024-04-14T12:09:47Z","title":"Breast Cancer Image Classification Method Based on Deep Transfer\n Learning","summary":" To address the issues of limited samples, time-consuming feature design, and\nlow accuracy in detection and classification of breast cancer pathological\nimages, a breast cancer image classification model algorithm combining deep\nlearning and transfer learning is proposed. This algorithm is based on the\nDenseNet structure of deep neural networks, and constructs a network model by\nintroducing attention mechanisms, and trains the enhanced dataset using\nmulti-level transfer learning. 
Experimental results demonstrate that the\nalgorithm achieves an efficiency of over 84.0\\% in the test set, with a\nsignificantly improved classification accuracy compared to previous models,\nmaking it applicable to medical breast cancer detection tasks.\n","authors":["Weimin Wang","Min Gao","Mingxuan Xiao","Xu Yan","Yufeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.09226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09792v2","updated":"2024-04-14T11:57:55Z","published":"2024-03-14T18:24:55Z","title":"Images are Achilles' Heel of Alignment: Exploiting Visual\n Vulnerabilities for Jailbreaking Multimodal Large Language Models","summary":" In this paper, we study the harmlessness alignment problem of multimodal\nlarge language models (MLLMs). We conduct a systematic empirical analysis of\nthe harmlessness performance of representative MLLMs and reveal that the image\ninput poses the alignment vulnerability of MLLMs. Inspired by this, we propose\na novel jailbreak method named HADES, which hides and amplifies the harmfulness\nof the malicious intent within the text input, using meticulously crafted\nimages. Experimental results show that HADES can effectively jailbreak existing\nMLLMs, which achieves an average Attack Success Rate (ASR) of 90.26% for\nLLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data will be publicly\nreleased.\n","authors":["Yifan Li","Hangyu Guo","Kun Zhou","Wayne Xin Zhao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2403.09792v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.09216v1","updated":"2024-04-14T11:01:44Z","published":"2024-04-14T11:01:44Z","title":"DetCLIPv3: Towards Versatile Generative Open-vocabulary Object Detection","summary":" Existing open-vocabulary object detectors typically require a predefined set\nof categories from users, significantly confining their application scenarios.\nIn this paper, we introduce DetCLIPv3, a high-performing detector that excels\nnot only at both open-vocabulary object detection, but also generating\nhierarchical labels for detected objects. DetCLIPv3 is characterized by three\ncore designs: 1. Versatile model architecture: we derive a robust open-set\ndetection framework which is further empowered with generation ability via the\nintegration of a caption head. 2. High information density data: we develop an\nauto-annotation pipeline leveraging visual large language model to refine\ncaptions for large-scale image-text pairs, providing rich, multi-granular\nobject labels to enhance the training. 3. Efficient training strategy: we\nemploy a pre-training stage with low-resolution inputs that enables the object\ncaptioner to efficiently learn a broad spectrum of visual concepts from\nextensive image-text paired data. This is followed by a fine-tuning stage that\nleverages a small number of high-resolution samples to further enhance\ndetection performance. With these effective designs, DetCLIPv3 demonstrates\nsuperior open-vocabulary detection performance, \\eg, our Swin-T backbone model\nachieves a notable 47.0 zero-shot fixed AP on the LVIS minival benchmark,\noutperforming GLIPv2, GroundingDINO, and DetCLIPv2 by 18.0/19.6/6.6 AP,\nrespectively. 
DetCLIPv3 also achieves a state-of-the-art 19.7 AP in dense\ncaptioning task on VG dataset, showcasing its strong generative capability.\n","authors":["Lewei Yao","Renjie Pi","Jianhua Han","Xiaodan Liang","Hang Xu","Wei Zhang","Zhenguo Li","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.09216v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.03425v3","updated":"2024-04-14T10:41:40Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings. Recently, the Mamba architecture,\nbased on state space models, has shown remarkable performance in a series of\nnatural language processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Specifically, we obtained\n83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,\nand WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA\ndataset xBD, we obtained 81.41% overall F1 score. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09210v1","updated":"2024-04-14T10:23:30Z","published":"2024-04-14T10:23:30Z","title":"FedDistill: Global Model Distillation for Local Model De-Biasing in\n Non-IID Federated Learning","summary":" Federated Learning (FL) is a novel approach that allows for collaborative\nmachine learning while preserving data privacy by leveraging models trained on\ndecentralized devices. However, FL faces challenges due to non-uniformly\ndistributed (non-iid) data across clients, which impacts model performance and\nits generalization capabilities. 
To tackle the non-iid issue, recent efforts\nhave utilized the global model as a teaching mechanism for local models.\nHowever, our pilot study shows that their effectiveness is constrained by\nimbalanced data distribution, which induces biases in local models and leads to\na 'local forgetting' phenomenon, where the ability of models to generalize\ndegrades over time, particularly for underrepresented classes. This paper\nintroduces FedDistill, a framework enhancing the knowledge transfer from the\nglobal model to local models, focusing on the issue of imbalanced class\ndistribution. Specifically, FedDistill employs group distillation, segmenting\nclasses based on their frequency in local datasets to facilitate a focused\ndistillation process to classes with fewer samples. Additionally, FedDistill\ndissects the global model into a feature extractor and a classifier. This\nseparation empowers local models with more generalized data representation\ncapabilities and ensures more accurate classification across all classes.\nFedDistill mitigates the adverse effects of data imbalance, ensuring that local\nmodels do not forget underrepresented classes but instead become more adept at\nrecognizing and classifying them accurately. Our comprehensive experiments\ndemonstrate FedDistill's effectiveness, surpassing existing baselines in\naccuracy and convergence speed across several benchmark datasets.\n","authors":["Changlin Song","Divya Saxena","Jiannong Cao","Yuqing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09210v1.pdf","comment":"13 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.03441v5","updated":"2024-04-14T10:13:25Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. 
The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09204v1","updated":"2024-04-14T09:48:37Z","published":"2024-04-14T09:48:37Z","title":"TextHawk: Exploring Efficient Fine-Grained Perception of Multimodal\n Large Language Models","summary":" Multimodal Large Language Models (MLLMs) have shown impressive results on\nvarious multimodal tasks. However, most existing MLLMs are not well suited for\ndocument-oriented tasks, which require fine-grained image perception and\ninformation compression. In this paper, we present TextHawk, a MLLM that is\nspecifically designed for document-oriented tasks, while preserving the general\ncapabilities of MLLMs. TextHawk is aimed to explore efficient fine-grained\nperception by designing four dedicated components. Firstly, a ReSampling and\nReArrangement (ReSA) module is proposed to reduce the redundancy in the\ndocument texts and lower the computational cost of the MLLM. We explore\nencoding the positions of each local feature by presenting Scalable Positional\nEmbeddings (SPEs), which can preserve the scalability of various image sizes. A\nQuery Proposal Network (QPN) is then adopted to initialize the queries\ndynamically among different sub-images. To further enhance the fine-grained\nvisual perceptual ability of the MLLM, we design a Multi-Level Cross-Attention\n(MLCA) mechanism that captures the hierarchical structure and semantic\nrelations of document images. Furthermore, we create a new instruction-tuning\ndataset for document-oriented tasks by enriching the multimodal document data\nwith Gemini Pro. We conduct extensive experiments on both general and\ndocument-oriented MLLM benchmarks, and show that TextHawk outperforms the\nstate-of-the-art methods, demonstrating its effectiveness and superiority in\nfine-grained document perception and general abilities.\n","authors":["Ya-Qi Yu","Minghui Liao","Jihao Wu","Yongxin Liao","Xiaoyu Zheng","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.09204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06075v2","updated":"2024-04-14T09:28:28Z","published":"2024-03-10T03:43:02Z","title":"Multisize Dataset Condensation","summary":" While dataset condensation effectively enhances training efficiency, its\napplication in on-device scenarios brings unique challenges. 1) Due to the\nfluctuating computational resources of these devices, there's a demand for a\nflexible dataset size that diverges from a predefined size. 2) The limited\ncomputational power on devices often prevents additional condensation\noperations. These two challenges connect to the \"subset degradation problem\" in\ntraditional dataset condensation: a subset from a larger condensed dataset is\noften unrepresentative compared to directly condensing the whole dataset to\nthat smaller size. In this paper, we propose Multisize Dataset Condensation\n(MDC) by compressing N condensation processes into a single condensation\nprocess to obtain datasets with multiple sizes. Specifically, we introduce an\n\"adaptive subset loss\" on top of the basic condensation loss to mitigate the\n\"subset degradation problem\". Our MDC method offers several benefits: 1) No\nadditional condensation process is required; 2) reduced storage requirement by\nreusing condensed images. 
Experiments validate our findings on networks\nincluding ConvNet, ResNet and DenseNet, and datasets including SVHN, CIFAR-10,\nCIFAR-100 and ImageNet. For example, we achieved 5.22%-6.40% average accuracy\ngains on condensing CIFAR-10 to ten images per class. Code is available at:\nhttps://github.com/he-y/Multisize-Dataset-Condensation.\n","authors":["Yang He","Lingao Xiao","Joey Tianyi Zhou","Ivor Tsang"],"pdf_url":"https://arxiv.org/pdf/2403.06075v2.pdf","comment":"Accepted by ICLR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.06564v3","updated":"2024-04-14T09:14:23Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\n(Locality-Enhanced State Space) LSS modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate state-of-the-art performance,\nsubstantiating the method's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09193v1","updated":"2024-04-14T09:01:26Z","published":"2024-04-14T09:01:26Z","title":"FaceCat: Enhancing Face Recognition Security with a Unified Generative\n Model Framework","summary":" Face anti-spoofing (FAS) and adversarial detection (FAD) have been regarded\nas critical technologies to ensure the safety of face recognition systems. As a\nconsequence of their limited practicality and generalization, some existing\nmethods aim to devise a framework capable of concurrently detecting both\nthreats to address the challenge. Nevertheless, these methods still encounter\nchallenges of insufficient generalization and suboptimal robustness,\npotentially owing to the inherent drawback of discriminative models. Motivated\nby the rich structural and detailed features of face generative models, we\npropose FaceCat which utilizes the face generative model as a pre-trained model\nto improve the performance of FAS and FAD. Specifically, FaceCat elaborately\ndesigns a hierarchical fusion mechanism to capture rich face semantic features\nof the generative model. These features then serve as a robust foundation for a\nlightweight head, designed to execute FAS and FAD tasks simultaneously. 
As\nrelying solely on single-modality data often leads to suboptimal performance,\nwe further propose a novel text-guided multi-modal alignment strategy that\nutilizes text prompts to enrich feature representation, thereby enhancing\nperformance. For fair evaluations, we build a comprehensive protocol with a\nwide range of 28 attack types to benchmark the performance. Extensive\nexperiments validate the effectiveness of FaceCat generalizes significantly\nbetter and obtains excellent robustness against input transformations.\n","authors":["Jiawei Chen","Xiao Yang","Yinpeng Dong","Hang Su","Jianteng Peng","Zhaoxia Yin"],"pdf_url":"https://arxiv.org/pdf/2404.09193v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06270v2","updated":"2024-04-14T08:40:51Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussian, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.09179v1","updated":"2024-04-14T08:09:33Z","published":"2024-04-14T08:09:33Z","title":"Change Guiding Network: Incorporating Change Prior to Guide Change\n Detection in Remote Sensing Imagery","summary":" The rapid advancement of automated artificial intelligence algorithms and\nremote sensing instruments has benefited change detection (CD) tasks. However,\nthere is still a lot of space to study for precise detection, especially the\nedge integrity and internal holes phenomenon of change features. In order to\nsolve these problems, we design the Change Guiding Network (CGNet), to tackle\nthe insufficient expression problem of change features in the conventional\nU-Net structure adopted in previous methods, which causes inaccurate edge\ndetection and internal holes. 
Change maps from deep features with rich semantic\ninformation are generated and used as prior information to guide multi-scale\nfeature fusion, which can improve the expression ability of change features.\nMeanwhile, we propose a self-attention module named Change Guide Module (CGM),\nwhich can effectively capture the long-distance dependency among pixels and\neffectively overcome the problem of the insufficient receptive field of\ntraditional convolutional neural networks. On four major CD datasets, we verify\nthe usefulness and efficiency of the CGNet, and a large number of experiments\nand ablation studies demonstrate the effectiveness of CGNet. We're going to\nopen-source our code at https://github.com/ChengxiHAN/CGNet-CD.\n","authors":["Chengxi Han","Chen Wu","Haonan Guo","Meiqi Hu","Jiepan Li","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09178v1","updated":"2024-04-14T08:01:27Z","published":"2024-04-14T08:01:27Z","title":"HANet: A Hierarchical Attention Network for Change Detection With\n Bitemporal Very-High-Resolution Remote Sensing Images","summary":" Benefiting from the developments in deep learning technology,\ndeep-learning-based algorithms employing automatic feature extraction have\nachieved remarkable performance on the change detection (CD) task. However, the\nperformance of existing deep-learning-based CD methods is hindered by the\nimbalance between changed and unchanged pixels. To tackle this problem, a\nprogressive foreground-balanced sampling strategy on the basis of not adding\nchange information is proposed in this article to help the model accurately\nlearn the features of the changed pixels during the early training process and\nthereby improve detection performance.Furthermore, we design a discriminative\nSiamese network, hierarchical attention network (HANet), which can integrate\nmultiscale features and refine detailed features. The main part of HANet is the\nHAN module, which is a lightweight and effective self-attention mechanism.\nExtensive experiments and ablation studies on two CDdatasets with extremely\nunbalanced labels validate the effectiveness and efficiency of the proposed\nmethod.\n","authors":["Chengxi Han","Chen Wu","Haonan Guo","Meiqi Hu","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04760v3","updated":"2024-04-14T07:44:19Z","published":"2023-07-10T17:58:17Z","title":"Learning Spatial Features from Audio-Visual Correspondence in Egocentric\n Videos","summary":" We propose a self-supervised method for learning representations based on\nspatial audio-visual correspondences in egocentric videos. Our method uses a\nmasked auto-encoding framework to synthesize masked binaural (multi-channel)\naudio through the synergy of audio and vision, thereby learning useful spatial\nrelationships between the two modalities. We use our pretrained features to\ntackle two downstream video tasks requiring spatial understanding in social\nscenarios: active speaker detection and spatial audio denoising. 
Through\nextensive experiments, we show that our features are generic enough to improve\nover multiple state-of-the-art baselines on both tasks on two challenging\negocentric video datasets that offer binaural audio, EgoCom and EasyCom.\nProject: http://vision.cs.utexas.edu/projects/ego_av_corr.\n","authors":["Sagnik Majumder","Ziad Al-Halah","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.04760v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09172v1","updated":"2024-04-14T07:36:18Z","published":"2024-04-14T07:36:18Z","title":"LoopAnimate: Loopable Salient Object Animation","summary":" Research on diffusion model-based video generation has advanced rapidly.\nHowever, limitations in object fidelity and generation length hinder its\npractical applications. Additionally, specific domains like animated wallpapers\nrequire seamless looping, where the first and last frames of the video match\nseamlessly. To address these challenges, this paper proposes LoopAnimate, a\nnovel method for generating videos with consistent start and end frames. To\nenhance object fidelity, we introduce a framework that decouples multi-level\nimage appearance and textual semantic information. Building upon an\nimage-to-image diffusion model, our approach incorporates both pixel-level and\nfeature-level information from the input image, injecting image appearance and\ntextual semantic embeddings at different positions of the diffusion model.\nExisting UNet-based video generation models require to input the entire videos\nduring training to encode temporal and positional information at once. However,\ndue to limitations in GPU memory, the number of frames is typically restricted\nto 16. To address this, this paper proposes a three-stage training strategy\nwith progressively increasing frame numbers and reducing fine-tuning modules.\nAdditionally, we introduce the Temporal E nhanced Motion Module(TEMM) to extend\nthe capacity for encoding temporal and positional information up to 36 frames.\nThe proposed LoopAnimate, which for the first time extends the single-pass\ngeneration length of UNet-based video generation models to 35 frames while\nmaintaining high-quality video generation. Experiments demonstrate that\nLoopAnimate achieves state-of-the-art performance in both objective metrics,\nsuch as fidelity and temporal consistency, and subjective evaluation results.\n","authors":["Fanyi Wang","Peng Liu","Haotian Hu","Dan Meng","Jingwen Su","Jinjin Xu","Yanhao Zhang","Xiaoming Ren","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19473v3","updated":"2024-04-14T07:01:41Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":" Advancements in model algorithms, the growth of foundational models, and\naccess to high-quality datasets have propelled the evolution of Artificial\nIntelligence Generated Content (AIGC). Despite its notable successes, AIGC\nstill faces hurdles such as updating knowledge, handling long-tail data,\nmitigating data leakage, and managing high training and inference costs.\nRetrieval-Augmented Generation (RAG) has recently emerged as a paradigm to\naddress such challenges. In particular, RAG introduces the information\nretrieval process, which enhances the generation process by retrieving relevant\nobjects from available data stores, leading to higher accuracy and better\nrobustness. 
In this paper, we comprehensively review existing efforts that\nintegrate RAG technique into AIGC scenarios. We first classify RAG foundations\naccording to how the retriever augments the generator, distilling the\nfundamental abstractions of the augmentation methodologies for various\nretrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. We also summarize additional enhancements methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen from another view, we survey on practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research. Github: https://github.com/PKU-DAIR/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Jie Jiang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v3.pdf","comment":"Citing 377 papers, 28 pages, 1 table, 12 figures. Project:\n https://github.com/PKU-DAIR/RAG-Survey"},{"id":"http://arxiv.org/abs/2401.03890v2","updated":"2024-04-14T06:50:24Z","published":"2024-01-08T13:42:59Z","title":"A Survey on 3D Gaussian Splatting","summary":" 3D Gaussian splatting (GS) has recently emerged as a transformative technique\nin the realm of explicit radiance field and computer graphics. This innovative\napproach, characterized by the utilization of millions of learnable 3D\nGaussians, represents a significant departure from mainstream neural radiance\nfield approaches, which predominantly use implicit, coordinate-based models to\nmap spatial coordinates to pixel values. 3D GS, with its explicit scene\nrepresentation and differentiable rendering algorithm, not only promises\nreal-time rendering capability but also introduces unprecedented levels of\neditability. This positions 3D GS as a potential game-changer for the next\ngeneration of 3D reconstruction and representation. In the present paper, we\nprovide the first systematic overview of the recent developments and critical\ncontributions in the domain of 3D GS. We begin with a detailed exploration of\nthe underlying principles and the driving forces behind the emergence of 3D GS,\nlaying the groundwork for understanding its significance. A focal point of our\ndiscussion is the practical applicability of 3D GS. By enabling unprecedented\nrendering speed, 3D GS opens up a plethora of applications, ranging from\nvirtual reality to interactive media and beyond. This is complemented by a\ncomparative analysis of leading 3D GS models, evaluated across various\nbenchmark tasks to highlight their performance and practical utility. The\nsurvey concludes by identifying current challenges and suggesting potential\navenues for future research in this domain. 
Through this survey, we aim to\nprovide a valuable resource for both newcomers and seasoned researchers,\nfostering further exploration and advancement in applicable and explicit\nradiance field representation.\n","authors":["Guikun Chen","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03890v2.pdf","comment":"Ongoing project"},{"id":"http://arxiv.org/abs/2404.09161v1","updated":"2024-04-14T06:46:16Z","published":"2024-04-14T06:46:16Z","title":"Coreset Selection for Object Detection","summary":" Coreset selection is a method for selecting a small, representative subset of\nan entire dataset. It has been primarily researched in image classification,\nassuming there is only one object per image. However, coreset selection for\nobject detection is more challenging as an image can contain multiple objects.\nAs a result, much research has yet to be done on this topic. Therefore, we\nintroduce a new approach, Coreset Selection for Object Detection (CSOD). CSOD\ngenerates imagewise and classwise representative feature vectors for multiple\nobjects of the same class within each image. Subsequently, we adopt submodular\noptimization for considering both representativeness and diversity and utilize\nthe representative vectors in the submodular optimization process to select a\nsubset. When we evaluated CSOD on the Pascal VOC dataset, CSOD outperformed\nrandom selection by +6.4%p in AP$_{50}$ when selecting 200 images.\n","authors":["Hojun Lee","Suyoung Kim","Junhoo Lee","Jaeyoung Yoo","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.09161v1.pdf","comment":"Accepted by CVPR 2024: 1st Workshop on Dataset Distillation for\n Computer Vision"},{"id":"http://arxiv.org/abs/2404.09158v1","updated":"2024-04-14T06:19:46Z","published":"2024-04-14T06:19:46Z","title":"StreakNet-Arch: An Anti-scattering Network-based Architecture for\n Underwater Carrier LiDAR-Radar Imaging","summary":" In this paper, we introduce StreakNet-Arch, a novel signal processing\narchitecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging\nsystems, to address the limitations in scatter suppression and real-time\nimaging. StreakNet-Arch formulates the signal processing as a real-time,\nend-to-end binary classification task, enabling real-time image acquisition. To\nachieve this, we leverage Self-Attention networks and propose a novel Double\nBranch Cross Attention (DBC-Attention) mechanism that surpasses the performance\nof traditional methods. Furthermore, we present a method for embedding\nstreak-tube camera images into attention networks, effectively acting as a\nlearned bandpass filter. To facilitate further research, we contribute a\npublicly available streak-tube camera image dataset. The dataset contains\n2,695,168 real-world underwater 3D point cloud data. These advancements\nsignificantly improve UCLR capabilities, enhancing its performance and\napplicability in underwater imaging tasks. The source code and dataset can be\nfound at https://github.com/BestAnHongjun/StreakNet .\n","authors":["Xuelong Li","Hongjun An","Guangying Li","Xing Wang","Guanghua Cheng","Zhe Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09146v1","updated":"2024-04-14T05:28:46Z","published":"2024-04-14T05:28:46Z","title":"Fusion-Mamba for Cross-modality Object Detection","summary":" Cross-modality fusing complementary information from different modalities\neffectively improves object detection performance, making it more useful and\nrobust for a wider range of applications. 
Existing fusion strategies combine\ndifferent types of images or merge different backbone features through\nelaborated neural network modules. However, these methods neglect that modality\ndisparities affect cross-modality fusion performance, as different modalities\nwith different camera focal lengths, placements, and angles are hardly fused.\nIn this paper, we investigate cross-modality fusion by associating cross-modal\nfeatures in a hidden state space based on an improved Mamba with a gating\nmechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features\ninto a hidden state space for interaction, thereby reducing disparities between\ncross-modal features and enhancing the representation consistency of fused\nfeatures. FMB contains two modules: the State Space Channel Swapping (SSCS)\nmodule facilitates shallow feature fusion, and the Dual State Space Fusion\n(DSSF) enables deep fusion in a hidden state space. Through extensive\nexperiments on public datasets, our proposed approach outperforms the\nstate-of-the-art methods on $m$AP with 5.9% on $M^3FD$ and 4.9% on FLIR-Aligned\ndatasets, demonstrating superior object detection performance. To the best of\nour knowledge, this is the first work to explore the potential of Mamba for\ncross-modal fusion and establish a new baseline for cross-modality object\ndetection.\n","authors":["Wenhao Dong","Haodong Zhu","Shaohui Lin","Xiaoyan Luo","Yunhang Shen","Xuhui Liu","Juan Zhang","Guodong Guo","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17493v3","updated":"2024-04-14T05:20:10Z","published":"2023-05-27T15:10:41Z","title":"The Curse of Recursion: Training on Generated Data Makes Models Forget","summary":" Stable Diffusion revolutionised image creation from descriptive text. GPT-2,\nGPT-3(.5) and GPT-4 demonstrated astonishing performance across a variety of\nlanguage tasks. ChatGPT introduced such language models to the general public.\nIt is now clear that large language models (LLMs) are here to stay, and will\nbring about drastic change in the whole ecosystem of online text and images. In\nthis paper we consider what the future might hold. What will happen to GPT-{n}\nonce LLMs contribute much of the language found online? We find that use of\nmodel-generated content in training causes irreversible defects in the\nresulting models, where tails of the original content distribution disappear.\nWe refer to this effect as Model Collapse and show that it can occur in\nVariational Autoencoders, Gaussian Mixture Models and LLMs. We build\ntheoretical intuition behind the phenomenon and portray its ubiquity amongst\nall learned generative models. We demonstrate that it has to be taken seriously\nif we are to sustain the benefits of training from large-scale data scraped\nfrom the web. 
Indeed, the value of data collected about genuine human\ninteractions with systems will be increasingly valuable in the presence of\ncontent generated by LLMs in data crawled from the Internet.\n","authors":["Ilia Shumailov","Zakhar Shumaylov","Yiren Zhao","Yarin Gal","Nicolas Papernot","Ross Anderson"],"pdf_url":"https://arxiv.org/pdf/2305.17493v3.pdf","comment":"Fixed typos in eqn 4,5"},{"id":"http://arxiv.org/abs/2303.12307v3","updated":"2024-04-14T05:16:49Z","published":"2023-03-22T04:49:23Z","title":"Curvature-Balanced Feature Manifold Learning for Long-Tailed\n Classification","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we systematically propose a series of\ngeometric measurements for perceptual manifolds in deep neural networks, and\nthen explore the effect of the geometric characteristics of perceptual\nmanifolds on classification difficulty and how learning shapes the geometric\ncharacteristics of perceptual manifolds. An unanticipated finding is that the\ncorrelation between the class accuracy and the separation degree of perceptual\nmanifolds gradually decreases during training, while the negative correlation\nwith the curvature gradually increases, implying that curvature imbalance leads\nto model bias. Therefore, we propose curvature regularization to facilitate the\nmodel to learn curvature-balanced and flatter perceptual manifolds. Evaluations\non multiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets. The code and model will be made public.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2303.12307v3.pdf","comment":"20pages, Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2301.00349v3","updated":"2024-04-14T03:59:35Z","published":"2023-01-01T05:02:46Z","title":"Towards Reliable Medical Image Segmentation by utilizing Evidential\n Calibrated Uncertainty","summary":" Medical image segmentation is critical for disease diagnosis and treatment\nassessment. However, concerns regarding the reliability of segmentation regions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration to accuracy. To address this, we\nintroduce DEviS, an easily implementable foundational model that seamlessly\nintegrates into various medical image segmentation networks. DEviS not only\nenhances the calibration and robustness of baseline segmentation accuracy but\nalso provides high-efficiency uncertainty estimation for reliable predictions.\nBy leveraging subjective logic theory, we explicitly model probability and\nuncertainty for the problem of medical image segmentation. Here, the Dirichlet\ndistribution parameterizes the distribution of probabilities for different\nclasses of the segmentation results. 
To generate calibrated predictions and\nuncertainty, we develop a trainable calibrated uncertainty penalty.\nFurthermore, DEviS incorporates an uncertainty-aware filtering module, which\nutilizes the metric of uncertainty-calibrated error to filter reliable data\nwithin the dataset. We conducted validation studies to assess both the accuracy\nand robustness of DEviS segmentation, along with evaluating the efficiency and\nreliability of uncertainty estimation. These evaluations were performed using\npublicly available datasets including ISIC2018, LiTS2017, and BraTS2019.\nAdditionally, two potential clinical trials are being conducted at Johns\nHopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate their efficacy in\nfiltering high-quality or out-of-distribution data. Our code has been released\nin https://github.com/Cocofeat/DEviS.\n","authors":["Ke Zou","Yidi Chen","Ling Huang","Xuedong Yuan","Xiaojing Shen","Meng Wang","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2301.00349v3.pdf","comment":"34 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.09115v1","updated":"2024-04-14T01:51:11Z","published":"2024-04-14T01:51:11Z","title":"GCC: Generative Calibration Clustering","summary":" Deep clustering as an important branch of unsupervised representation\nlearning focuses on embedding semantically similar samples into the identical\nfeature space. This core demand inspires the exploration of contrastive\nlearning and subspace clustering. However, these solutions always rely on the\nbasic assumption that there are sufficient and category-balanced samples for\ngenerating valid high-level representation. This hypothesis actually is too\nstrict to be satisfied for real-world applications. To overcome such a\nchallenge, the natural strategy is utilizing generative models to augment\nconsiderable instances. How to use these novel samples to effectively fulfill\nclustering performance improvement is still difficult and under-explored. In\nthis paper, we propose a novel Generative Calibration Clustering (GCC) method\nto delicately incorporate feature learning and augmentation into clustering\nprocedure. First, we develop a discriminative feature alignment mechanism to\ndiscover intrinsic relationship across real and generated samples. Second, we\ndesign a self-supervised metric learning to generate more reliable cluster\nassignment to boost the conditional diffusion generation. Extensive\nexperimental results on three benchmarks validate the effectiveness and\nadvantage of our proposed method over the state-of-the-art methods.\n","authors":["Haifeng Xia","Hai Huang","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.09115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09111v1","updated":"2024-04-14T01:23:19Z","published":"2024-04-14T01:23:19Z","title":"Exploring Generative AI for Sim2Real in Driving Data Synthesis","summary":" Datasets are essential for training and testing vehicle perception\nalgorithms. However, the collection and annotation of real-world images is\ntime-consuming and expensive. Driving simulators offer a solution by\nautomatically generating various driving scenarios with corresponding\nannotations, but the simulation-to-reality (Sim2Real) domain gap remains a\nchallenge. 
While most of the Generative Artificial Intelligence (AI) follows\nthe de facto Generative Adversarial Nets (GANs)-based methods, the recent\nemerging diffusion probabilistic models have not been fully explored in\nmitigating Sim2Real challenges for driving data synthesis. To explore the\nperformance, this paper applied three different generative AI methods to\nleverage semantic label maps from a driving simulator as a bridge for the\ncreation of realistic datasets. A comparative analysis of these methods is\npresented from the perspective of image quality and perception. New synthetic\ndatasets, which include driving images and auto-generated high-quality\nannotations, are produced with low costs and high scene variability. The\nexperimental results show that although GAN-based methods are adept at\ngenerating high-quality images when provided with manually annotated labels,\nControlNet produces synthetic datasets with fewer artefacts and more structural\nfidelity when using simulator-generated labels. This suggests that the\ndiffusion-based approach may provide improved stability and an alternative\nmethod for addressing Sim2Real challenges.\n","authors":["Haonan Zhao","Yiting Wang","Thomas Bashford-Rogers","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.09111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09105v1","updated":"2024-04-14T00:08:56Z","published":"2024-04-14T00:08:56Z","title":"EGGS: Edge Guided Gaussian Splatting for Radiance Fields","summary":" The Gaussian splatting methods are getting popular. However, their loss\nfunction only contains the $\\ell_1$ norm and the structural similarity between\nthe rendered and input images, without considering the edges in these images.\nIt is well-known that the edges in an image provide important information.\nTherefore, in this paper, we propose an Edge Guided Gaussian Splatting (EGGS)\nmethod that leverages the edges in the input images. More specifically, we give\nthe edge region a higher weight than the flat region. With such edge guidance,\nthe resulting Gaussian particles focus more on the edges instead of the flat\nregions. Moreover, such edge guidance does not crease the computation cost\nduring the training and rendering stage. The experiments confirm that such\nsimple edge-weighted loss function indeed improves about $1\\sim2$ dB on several\ndifference data sets. With simply plugging in the edge guidance, the proposed\nmethod can improve all Gaussian splatting methods in different scenarios, such\nas human head modeling, building 3D reconstruction, etc.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2404.09105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10539v1","updated":"2024-04-14T15:49:02Z","published":"2024-04-14T15:49:02Z","title":"VideoSAGE: Video Summarization with Graph Representation Learning","summary":" We propose a graph-based representation learning framework for video\nsummarization. First, we convert an input video to a graph where nodes\ncorrespond to each of the video frames. Then, we impose sparsity on the graph\nby connecting only those pairs of nodes that are within a specified temporal\ndistance. We then formulate the video summarization task as a binary node\nclassification problem, precisely classifying video frames whether they should\nbelong to the output summary video. 
A graph constructed this way aims to\ncapture long-range interactions among video frames, and the sparsity ensures\nthe model trains without hitting the memory and compute bottleneck. Experiments\non two datasets(SumMe and TVSum) demonstrate the effectiveness of the proposed\nnimble model compared to existing state-of-the-art summarization approaches\nwhile being one order of magnitude more efficient in compute time and memory\n","authors":["Jose M. Rojas Chaves","Subarna Tripathi"],"pdf_url":"https://arxiv.org/pdf/2404.10539v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.07783"}]},"2024-04-13T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.13659v4","updated":"2024-04-13T22:52:19Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra- and\ninter-modal relationships across audio, visual, and text modalities for\ndimensional emotion recognition. In particular, we compute the attention\nweights based on cross-correlation between the joint audio-visual-text feature\nrepresentations and the feature representations of individual modalities to\nsimultaneously capture intra- and intermodal relationships across the\nmodalities. The attended features of the individual modalities are again fed as\ninput to the fusion model in a recursive mechanism to obtain more refined\nfeature representations. We have also explored Temporal Convolutional Networks\n(TCNs) to improve the temporal modeling of the feature representations of\nindividual modalities. Extensive experiments are conducted to evaluate the\nperformance of the proposed fusion model on the challenging Affwild2 dataset.\nBy effectively capturing the synergic intra- and inter-modal relationships\nacross audio, visual, and text modalities, the proposed fusion model achieves a\nConcordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.674 (0.619)\nfor valence and arousal respectively on the validation set(test set). This\nshows a significant improvement over the baseline of 0.240 (0.211) and 0.200\n(0.191) for valence and arousal, respectively, in the validation set (test\nset), achieving second place in the valence-arousal challenge of the 6th\nAffective Behavior Analysis in-the-Wild (ABAW) competition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09081v1","updated":"2024-04-13T21:02:49Z","published":"2024-04-13T21:02:49Z","title":"Probabilistic Directed Distance Fields for Ray-Based Shape\n Representations","summary":" In modern computer vision, the optimal representation of 3D shape continues\nto be task-dependent. One fundamental operation applied to such representations\nis differentiable rendering, as it enables inverse graphics approaches in\nlearning frameworks. Standard explicit shape representations (voxels, point\nclouds, or meshes) are often easily rendered, but can suffer from limited\ngeometric fidelity, among other issues. 
On the other hand, implicit\nrepresentations (occupancy, distance, or radiance fields) preserve greater\nfidelity, but suffer from complex or inefficient rendering processes, limiting\nscalability. In this work, we devise Directed Distance Fields (DDFs), a novel\nneural shape representation that builds upon classical distance fields. The\nfundamental operation in a DDF maps an oriented point (position and direction)\nto surface visibility and depth. This enables efficient differentiable\nrendering, obtaining depth with a single forward pass per pixel, as well as\ndifferential geometric quantity extraction (e.g., surface normals), with only\nadditional backward passes. Using probabilistic DDFs (PDDFs), we show how to\nmodel inherent discontinuities in the underlying field. We then apply DDFs to\nseveral applications, including single-shape fitting, generative modelling, and\nsingle-image 3D reconstruction, showcasing strong performance with simple\narchitectural components via the versatility of our representation. Finally,\nsince the dimensionality of DDFs permits view-dependent geometric artifacts, we\nconduct a theoretical investigation of the constraints necessary for view\nconsistency. We find a small set of field properties that are sufficient to\nguarantee a DDF is consistent, without knowing, for instance, which shape the\nfield is expressing.\n","authors":["Tristan Aumentado-Armstrong","Stavros Tsogkas","Sven Dickinson","Allan Jepson"],"pdf_url":"https://arxiv.org/pdf/2404.09081v1.pdf","comment":"Extension of arXiv:2112.05300"},{"id":"http://arxiv.org/abs/2403.11376v3","updated":"2024-04-13T20:42:17Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). Our observation shows that the\nutilization of amodal features through the amodal-to-visible can confuse the\nvisible features due to the extra information of occluded/hidden segments not\npresented in visible display. Consequently, this compromised quality of visible\nfeatures during the subsequent visible-to-amodal transition. To tackle this\nissue, we introduce ShapeFormer, a decoupled Transformer-based model with a\nvisible-to-amodal transition. It facilitates the explicit relationship between\noutput segmentations and avoids the need for amodal-to-visible transitions.\nShapeFormer comprises three key modules: (i) Visible-Occluding Mask Head for\npredicting visible segmentation with occlusion awareness, (ii) Shape-Prior\nAmodal Mask Head for predicting amodal and occluded masks, and (iii)\nCategory-Specific Shape Prior Retriever aims to provide shape prior knowledge.\nComprehensive experiments and extensive ablation studies across various AIS\nbenchmarks demonstrate the effectiveness of our ShapeFormer. 
The code is\navailable at: https://github.com/UARK-AICV/ShapeFormer\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v3.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.09067v1","updated":"2024-04-13T19:34:14Z","published":"2024-04-13T19:34:14Z","title":"Exploring Explainability in Video Action Recognition","summary":" Image Classification and Video Action Recognition are perhaps the two most\nfoundational tasks in computer vision. Consequently, explaining the inner\nworkings of trained deep neural networks is of prime importance. While numerous\nefforts focus on explaining the decisions of trained deep neural networks in\nimage classification, exploration in the domain of its temporal version, video\naction recognition, has been scant. In this work, we take a deeper look at this\nproblem. We begin by revisiting Grad-CAM, one of the popular feature\nattribution methods for Image Classification, and its extension to Video Action\nRecognition tasks and examine the method's limitations. To address these, we\nintroduce Video-TCAV, by building on TCAV for Image Classification tasks, which\naims to quantify the importance of specific concepts in the decision-making\nprocess of Video Action Recognition models. As the scalable generation of\nconcepts is still an open problem, we propose a machine-assisted approach to\ngenerate spatial and spatiotemporal concepts relevant to Video Action\nRecognition for testing Video-TCAV. We then establish the importance of\ntemporally-varying concepts by demonstrating the superiority of dynamic\nspatiotemporal concepts over trivial spatial concepts. In conclusion, we\nintroduce a framework for investigating hypotheses in action recognition and\nquantitatively testing them, thus advancing research in the explainability of\ndeep neural networks used in video action recognition.\n","authors":["Avinab Saha","Shashank Gupta","Sravan Kumar Ankireddy","Karl Chahine","Joydeep Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.09067v1.pdf","comment":"6 pages, 10 figures, Accepted to the 3rd Explainable AI for Computer\n Vision (XAI4CV) Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05980v2","updated":"2024-04-13T18:10:00Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. 
It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09051v1","updated":"2024-04-13T17:31:11Z","published":"2024-04-13T17:31:11Z","title":"Rethinking Iterative Stereo Matching from Diffusion Bridge Model\n Perspective","summary":" Recently, iteration-based stereo matching has shown great potential. However,\nthese models optimize the disparity map using RNN variants. The discrete\noptimization process poses a challenge of information loss, which restricts the\nlevel of detail that can be expressed in the generated disparity map. In order\nto address these issues, we propose a novel training approach that incorporates\ndiffusion models into the iterative optimization process. We designed a\nTime-based Gated Recurrent Unit (T-GRU) to correlate temporal and disparity\noutputs. Unlike standard recurrent units, we employ Agent Attention to generate\nmore expressive features. We also designed an attention-based context network\nto capture a large amount of contextual information. Experiments on several\npublic benchmarks show that we have achieved competitive stereo matching\nperformance. Our model ranks first in the Scene Flow dataset, achieving over a\n7% improvement compared to competing methods, and requires only 8 iterations to\nachieve state-of-the-art results.\n","authors":["Yuguang Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09051v1.pdf","comment":"tip. arXiv admin note: text overlap with arXiv:2303.06615 by other\n authors"},{"id":"http://arxiv.org/abs/2305.14882v2","updated":"2024-04-13T17:13:55Z","published":"2023-05-24T08:33:15Z","title":"Dynamic Clue Bottlenecks: Towards Interpretable-by-Design Visual\n Question Answering","summary":" Recent advances in multimodal large language models (LLMs) have shown extreme\neffectiveness in visual question answering (VQA). However, the design nature of\nthese end-to-end models prevents them from being interpretable to humans,\nundermining trust and applicability in critical domains. While post-hoc\nrationales offer certain insight into understanding model behavior, these\nexplanations are not guaranteed to be faithful to the model. In this paper, we\naddress these shortcomings by introducing an interpretable by design model that\nfactors model decisions into intermediate human-legible explanations, and\nallows people to easily understand why a model fails or succeeds. We propose\nthe Dynamic Clue Bottleneck Model ( (DCLUB), a method that is designed towards\nan inherently interpretable VQA system. DCLUB provides an explainable\nintermediate space before the VQA decision and is faithful from the beginning,\nwhile maintaining comparable performance to black-box systems. Given a\nquestion, DCLUB first returns a set of visual clues: natural language\nstatements of visually salient evidence from the image, and then generates the\noutput based solely on the visual clues. To supervise and evaluate the\ngeneration of VQA explanations within DCLUB, we collect a dataset of 1.7k\nreasoning-focused questions with visual clues. 
Evaluations show that our\ninherently interpretable system can improve 4.64% over a comparable black-box\nsystem in reasoning-focused questions while preserving 99.43% of performance on\nVQA-v2.\n","authors":["Xingyu Fu","Ben Zhou","Sihao Chen","Mark Yatskar","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2305.14882v2.pdf","comment":"Multimodal, Visual Question Answering, Vision and Language"},{"id":"http://arxiv.org/abs/2212.12043v2","updated":"2024-04-13T17:02:25Z","published":"2022-12-22T21:27:12Z","title":"When are Lemons Purple? The Concept Association Bias of Vision-Language\n Models","summary":" Large-scale vision-language models such as CLIP have shown impressive\nperformance on zero-shot image classification and image-to-text retrieval.\nHowever, such performance does not realize in tasks that require a\nfiner-grained correspondence between vision and language, such as Visual\nQuestion Answering (VQA). As a potential cause of the difficulty of applying\nthese models to VQA and similar tasks, we report an interesting phenomenon of\nvision-language models, which we call the Concept Association Bias (CAB). We\nfind that models with CAB tend to treat input as a bag of concepts and attempt\nto fill in the other missing concept crossmodally, leading to an unexpected\nzero-shot prediction. We demonstrate CAB by showing that CLIP's zero-shot\nclassification performance greatly suffers when there is a strong concept\nassociation between an object (e.g. eggplant) and an attribute (e.g. color\npurple). We also show that the strength of CAB predicts the performance on VQA.\nWe observe that CAB is prevalent in vision-language models trained with\ncontrastive losses, even when autoregressive losses are jointly employed.\nHowever, a model that solely relies on autoregressive loss seems to exhibit\nminimal or no signs of CAB.\n","authors":["Yutaro Yamada","Yingtian Tang","Yoyo Zhang","Ilker Yildirim"],"pdf_url":"https://arxiv.org/pdf/2212.12043v2.pdf","comment":"EMNLP 2023 main"},{"id":"http://arxiv.org/abs/2404.09042v1","updated":"2024-04-13T16:57:37Z","published":"2024-04-13T16:57:37Z","title":"Improving Personalisation in Valence and Arousal Prediction using Data\n Augmentation","summary":" In the field of emotion recognition and Human-Machine Interaction (HMI),\npersonalised approaches have exhibited their efficacy in capturing\nindividual-specific characteristics and enhancing affective prediction\naccuracy. However, personalisation techniques often face the challenge of\nlimited data for target individuals. This paper presents our work on an\nenhanced personalisation strategy, that leverages data augmentation to develop\ntailored models for continuous valence and arousal prediction. Our proposed\napproach, Distance Weighting Augmentation (DWA), employs a weighting-based\naugmentation method that expands a target individual's dataset, leveraging\ndistance metrics to identify similar samples at the segment-level. Experimental\nresults on the MuSe-Personalisation 2023 Challenge dataset demonstrate that our\nmethod significantly improves the performance of features sets which have low\nbaseline performance, on the test set. This improvement in poor-performing\nfeatures comes without sacrificing performance on high-performing features. In\nparticular, our method achieves a maximum combined testing CCC of 0.78,\ncompared to the reported baseline score of 0.76 (reproduced at 0.72). 
It also\nachieved a peak arousal and valence scores of 0.81 and 0.76, compared to\nreproduced baseline scores of 0.76 and 0.67 respectively. Through this work, we\nmake significant contributions to the advancement of personalised affective\ncomputing models, enhancing the practicality and adaptability of data-level\npersonalisation in real world contexts.\n","authors":["Munachiso Nwadike","Jialin Li","Hanan Salam"],"pdf_url":"https://arxiv.org/pdf/2404.09042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09126v2","updated":"2024-04-13T16:43:01Z","published":"2024-01-17T11:02:52Z","title":"Objects With Lighting: A Real-World Dataset for Evaluating\n Reconstruction and Rendering for Object Relighting","summary":" Reconstructing an object from photos and placing it virtually in a new\nenvironment goes beyond the standard novel view synthesis task as the\nappearance of the object has to not only adapt to the novel viewpoint but also\nto the new lighting conditions and yet evaluations of inverse rendering methods\nrely on novel view synthesis data or simplistic synthetic datasets for\nquantitative analysis. This work presents a real-world dataset for measuring\nthe reconstruction and rendering of objects for relighting. To this end, we\ncapture the environment lighting and ground truth images of the same objects in\nmultiple environments allowing to reconstruct the objects from images taken in\none environment and quantify the quality of the rendered views for the unseen\nlighting environments. Further, we introduce a simple baseline composed of\noff-the-shelf methods and test several state-of-the-art methods on the\nrelighting task and show that novel view synthesis is not a reliable proxy to\nmeasure performance. Code and dataset are available at\nhttps://github.com/isl-org/objects-with-lighting .\n","authors":["Benjamin Ummenhofer","Sanskar Agrawal","Rene Sepulveda","Yixing Lao","Kai Zhang","Tianhang Cheng","Stephan Richter","Shenlong Wang","German Ros"],"pdf_url":"https://arxiv.org/pdf/2401.09126v2.pdf","comment":"Accepted at 3DV 2024, Oral presentation. For the project page see\n https://github.com/isl-org/objects-with-lighting"},{"id":"http://arxiv.org/abs/2310.11890v3","updated":"2024-04-13T14:57:15Z","published":"2023-10-18T11:19:32Z","title":"IRAD: Implicit Representation-driven Image Resampling against\n Adversarial Attacks","summary":" We introduce a novel approach to counter adversarial attacks, namely, image\nresampling. Image resampling transforms a discrete image into a new one,\nsimulating the process of scene recapturing or rerendering as specified by a\ngeometrical transformation. The underlying rationale behind our idea is that\nimage resampling can alleviate the influence of adversarial perturbations while\npreserving essential semantic information, thereby conferring an inherent\nadvantage in defending against adversarial attacks. To validate this concept,\nwe present a comprehensive study on leveraging image resampling to defend\nagainst adversarial attacks. We have developed basic resampling methods that\nemploy interpolation strategies and coordinate shifting magnitudes. Our\nanalysis reveals that these basic methods can partially mitigate adversarial\nattacks. However, they come with apparent limitations: the accuracy of clean\nimages noticeably decreases, while the improvement in accuracy on adversarial\nexamples is not substantial. We propose implicit representation-driven image\nresampling (IRAD) to overcome these limitations. 
First, we construct an\nimplicit continuous representation that enables us to represent any input image\nwithin a continuous coordinate space. Second, we introduce SampleNet, which\nautomatically generates pixel-wise shifts for resampling in response to\ndifferent inputs. Furthermore, we can extend our approach to the\nstate-of-the-art diffusion-based method, accelerating it with fewer time steps\nwhile preserving its defense capability. Extensive experiments demonstrate that\nour method significantly enhances the adversarial robustness of diverse deep\nmodels against various attacks while maintaining high accuracy on clean images.\n","authors":["Yue Cao","Tianlin Li","Xiaofeng Cao","Ivor Tsang","Yang Liu","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2310.11890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06791v6","updated":"2024-04-13T14:39:51Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection\n Features and Variable Receptive Field Voxel Features","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data is a\nformidable challenge. To address this problem, a typical class of approaches\ntransforms the point cloud cast into a regular data representation (voxels or\nprojection maps). Then, it performs feature extraction with convolutional\nneural networks. However, such methods often result in a certain degree of\ninformation loss due to down-sampling or over-compression of feature\ninformation. This paper proposes a multi-modal point cloud feature fusion\nmethod for projection features and variable receptive field voxel features\n(PV-SSD) based on projection and variable voxelization to solve the information\nloss problem. We design a two-branch feature extraction structure with a 2D\nconvolutional neural network to extract the point cloud's projection features\nin bird's-eye view to focus on the correlation between local features. A voxel\nfeature extraction branch is used to extract local fine-grained features.\nMeanwhile, we propose a voxel feature extraction method with variable sensory\nfields to reduce the information loss of voxel branches due to downsampling. It\navoids missing critical point information by selecting more useful feature\npoints based on feature point weights for the detection task. In addition, we\npropose a multi-modal feature fusion module for point clouds. To validate the\neffectiveness of our method, we tested it on the KITTI dataset and ONCE\ndataset.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan","Peng Liao"],"pdf_url":"https://arxiv.org/pdf/2308.06791v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09016v1","updated":"2024-04-13T14:08:56Z","published":"2024-04-13T14:08:56Z","title":"Theoretical research on generative diffusion models: an overview","summary":" Generative diffusion models showed high success in many fields with a\npowerful theoretical background. They convert the data distribution to noise\nand remove the noise back to obtain a similar distribution. Many existing\nreviews focused on the specific application areas without concentrating on the\nresearch about the algorithm. Unlike them we investigated the theoretical\ndevelopments of the generative diffusion models. These approaches mainly divide\ninto two: training-based and sampling-based. 
Recognizing this division allowed us to provide a\nclear and understandable categorization for researchers who will make new\ndevelopments in the future.\n","authors":["Melike Nur Yeğin","Mehmet Fatih Amasyalı"],"pdf_url":"https://arxiv.org/pdf/2404.09016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v2","updated":"2024-04-13T13:58:29Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a causal mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure of network training. We suggest repositioning the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a causal mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representations by elevating attention map ranks. iLLaMA\nrivals the performance of its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views on visual model\ndesign in the wave of LLMs. Pre-trained models and codes are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v2.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07922v2","updated":"2024-04-13T13:57:51Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating\nMLLMs' understanding of Vietnamese visual language tasks. 
All code and model\nweights are public at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v2.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2404.09011v1","updated":"2024-04-13T13:41:13Z","published":"2024-04-13T13:41:13Z","title":"PracticalDG: Perturbation Distillation on Vision-Language Models for\n Hybrid Domain Generalization","summary":" Domain Generalization (DG) aims to resolve distribution shifts between source\nand target domains, and current DG methods are default to the setting that data\nfrom source and target domains share identical categories. Nevertheless, there\nexists unseen classes from target domains in practical scenarios. To address\nthis issue, Open Set Domain Generalization (OSDG) has emerged and several\nmethods have been exclusively proposed. However, most existing methods adopt\ncomplex architectures with slight improvement compared with DG methods.\nRecently, vision-language models (VLMs) have been introduced in DG following\nthe fine-tuning paradigm, but consume huge training overhead with large vision\nmodels. Therefore, in this paper, we innovate to transfer knowledge from VLMs\nto lightweight vision models and improve the robustness by introducing\nPerturbation Distillation (PD) from three perspectives, including Score, Class\nand Instance (SCI), named SCI-PD. Moreover, previous methods are oriented by\nthe benchmarks with identical and fixed splits, ignoring the divergence between\nsource domains. These methods are revealed to suffer from sharp performance\ndecay with our proposed new benchmark Hybrid Domain Generalization (HDG) and a\nnovel metric $H^{2}$-CV, which construct various splits to comprehensively\nassess the robustness of algorithms. Extensive experiments demonstrate that our\nmethod outperforms state-of-the-art algorithms on multiple datasets, especially\nimproving the robustness when confronting data scarcity.\n","authors":["Zining Chen","Weiqiu Wang","Zhicheng Zhao","Fei Su","Aidong Men","Hongying Meng"],"pdf_url":"https://arxiv.org/pdf/2404.09011v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.14472v3","updated":"2024-04-13T13:39:50Z","published":"2024-03-21T15:18:30Z","title":"Detoxifying Large Language Models via Knowledge Editing","summary":" This paper investigates using knowledge editing techniques to detoxify Large\nLanguage Models (LLMs). We construct a benchmark, SafeEdit, which covers nine\nunsafe categories with various powerful attack prompts and equips comprehensive\nmetrics for systematic evaluation. We conduct experiments with several\nknowledge editing approaches, indicating that knowledge editing has the\npotential to efficiently detoxify LLMs with limited impact on general\nperformance. Then, we propose a simple yet effective baseline, dubbed\nDetoxifying with Intraoperative Neural Monitoring (DINM), to diminish the\ntoxicity of LLMs within a few tuning steps via only one instance. We further\nprovide an in-depth analysis of the internal mechanism for various detoxifying\napproaches, demonstrating that previous methods like SFT and DPO may merely\nsuppress the activations of toxic parameters, while DINM mitigates the toxicity\nof the toxic parameters to a certain extent, making permanent adjustments. We\nhope that these insights could shed light on future work of developing\ndetoxifying approaches and the underlying knowledge mechanisms of LLMs. 
Code\nand benchmark are available at https://github.com/zjunlp/EasyEdit.\n","authors":["Mengru Wang","Ningyu Zhang","Ziwen Xu","Zekun Xi","Shumin Deng","Yunzhi Yao","Qishen Zhang","Linyi Yang","Jindong Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14472v3.pdf","comment":"Ongoing work. Project website:\n https://zjunlp.github.io/project/SafeEdit Add and update experimental results\n in Tables 1 and 3"},{"id":"http://arxiv.org/abs/2404.09010v1","updated":"2024-04-13T13:39:26Z","published":"2024-04-13T13:39:26Z","title":"MMA-DFER: MultiModal Adaptation of unimodal models for Dynamic Facial\n Expression Recognition in-the-wild","summary":" Dynamic Facial Expression Recognition (DFER) has received significant\ninterest in the recent years dictated by its pivotal role in enabling empathic\nand human-compatible technologies. Achieving robustness towards in-the-wild\ndata in DFER is particularly important for real-world applications. One of the\ndirections aimed at improving such models is multimodal emotion recognition\nbased on audio and video data. Multimodal learning in DFER increases the model\ncapabilities by leveraging richer, complementary data representations. Within\nthe field of multimodal DFER, recent methods have focused on exploiting\nadvances of self-supervised learning (SSL) for pre-training of strong\nmultimodal encoders. Another line of research has focused on adapting\npre-trained static models for DFER. In this work, we propose a different\nperspective on the problem and investigate the advancement of multimodal DFER\nperformance by adapting SSL-pre-trained disjoint unimodal encoders. We identify\nmain challenges associated with this task, namely, intra-modality adaptation,\ncross-modal alignment, and temporal adaptation, and propose solutions to each\nof them. As a result, we demonstrate improvement over current state-of-the-art\non two popular DFER benchmarks, namely DFEW and MFAW.\n","authors":["Kateryna Chumachenko","Alexandros Iosifidis","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2404.09010v1.pdf","comment":"accepted to CVPR 2024 ABAW Workshop"},{"id":"http://arxiv.org/abs/2404.09003v1","updated":"2024-04-13T13:08:57Z","published":"2024-04-13T13:08:57Z","title":"THQA: A Perceptual Quality Assessment Database for Talking Heads","summary":" In the realm of media technology, digital humans have gained prominence due\nto rapid advancements in computer technology. However, the manual modeling and\ncontrol required for the majority of digital humans pose significant obstacles\nto efficient development. The speech-driven methods offer a novel avenue for\nmanipulating the mouth shape and expressions of digital humans. Despite the\nproliferation of driving methods, the quality of many generated talking head\n(TH) videos remains a concern, impacting user visual experiences. To tackle\nthis issue, this paper introduces the Talking Head Quality Assessment (THQA)\ndatabase, featuring 800 TH videos generated through 8 diverse speech-driven\nmethods. Extensive experiments affirm the THQA database's richness in character\nand speech features. Subsequent subjective quality assessment experiments\nanalyze correlations between scoring results and speech-driven methods, ages,\nand genders. In addition, experimental results show that mainstream image and\nvideo quality assessment methods have limitations for the THQA database,\nunderscoring the imperative for further research to enhance TH video quality\nassessment. 
The THQA database is publicly accessible at\nhttps://github.com/zyj-2000/THQA.\n","authors":["Yingjie Zhou","Zicheng Zhang","Wei Sun","Xiaohong Liu","Xiongkuo Min","Zhihua Wang","Xiao-Ping Zhang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09001v1","updated":"2024-04-13T13:03:59Z","published":"2024-04-13T13:03:59Z","title":"Smart Help: Strategic Opponent Modeling for Proactive and Adaptive Robot\n Assistance in Households","summary":" Despite the significant demand for assistive technology among vulnerable\ngroups (e.g., the elderly, children, and the disabled) in daily tasks, research\ninto advanced AI-driven assistive solutions that genuinely accommodate their\ndiverse needs remains sparse. Traditional human-machine interaction tasks often\nrequire machines to simply help without nuanced consideration of human\nabilities and feelings, such as their opportunity for practice and learning,\nsense of self-improvement, and self-esteem. Addressing this gap, we define a\npivotal and novel challenge Smart Help, which aims to provide proactive yet\nadaptive support to human agents with diverse disabilities and dynamic goals in\nvarious tasks and environments. To establish this challenge, we leverage\nAI2-THOR to build a new interactive 3D realistic household environment for the\nSmart Help task. We introduce an innovative opponent modeling module that\nprovides a nuanced understanding of the main agent's capabilities and goals, in\norder to optimize the assisting agent's helping policy. Rigorous experiments\nvalidate the efficacy of our model components and show the superiority of our\nholistic approach against established baselines. Our findings illustrate the\npotential of AI-imbued assistive robots in improving the well-being of\nvulnerable groups.\n","authors":["Zhihao Cao","Zidong Wang","Siwen Xie","Anji Liu","Lifeng Fan"],"pdf_url":"https://arxiv.org/pdf/2404.09001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09000v1","updated":"2024-04-13T13:03:19Z","published":"2024-04-13T13:03:19Z","title":"MaSkel: A Model for Human Whole-body X-rays Generation from Human\n Masking Images","summary":" The human whole-body X-rays could offer a valuable reference for various\napplications, including medical diagnostics, digital animation modeling, and\nergonomic design. The traditional method of obtaining X-ray information\nrequires the use of CT (Computed Tomography) scan machines, which emit\npotentially harmful radiation. Thus it faces a significant limitation for\nrealistic applications because it lacks adaptability and safety. In our work,\nWe proposed a new method to directly generate the 2D human whole-body X-rays\nfrom the human masking images. The predicted images will be similar to the real\nones with the same image style and anatomic structure. We employed a\ndata-driven strategy. By leveraging advanced generative techniques, our model\nMaSkel(Masking image to Skeleton X-rays) could generate a high-quality X-ray\nimage from a human masking image without the need for invasive and harmful\nradiation exposure, which not only provides a new path to generate highly\nanatomic and customized data but also reduces health risks. To our knowledge,\nour model MaSkel is the first work for predicting whole-body X-rays. In this\npaper, we did two parts of the work. 
The first one is to solve the data\nlimitation problem, the diffusion-based techniques are utilized to make a data\naugmentation, which provides two synthetic datasets for preliminary\npretraining. Then we designed a two-stage training strategy to train MaSkel. At\nlast, we make qualitative and quantitative evaluations of the generated X-rays.\nIn addition, we invite some professional doctors to assess our predicted data.\nThese evaluations demonstrate the MaSkel's superior ability to generate\nanatomic X-rays from human masking images. The related code and links of the\ndataset are available at https://github.com/2022yingjie/MaSkel.\n","authors":["Yingjie Xi","Boyuan Cheng","Jingyao Cai","Jian Jun Zhang","Xiaosong Yang"],"pdf_url":"https://arxiv.org/pdf/2404.09000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08995v1","updated":"2024-04-13T12:41:40Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of \\textbf{9.7}$\\%$ within the Stanford Cars dataset and\n\\textbf{12$\\times$} clustering efficiency within the Herbarium 19 dataset. We\nwill make the code and checkpoints publicly available at\n\\url{https://github.com/xjtuYW/PNP.git}.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.08990v1","updated":"2024-04-13T12:28:40Z","published":"2024-04-13T12:28:40Z","title":"A Fourier-enhanced multi-modal 3D small object optical mark recognition\n and positioning method for percutaneous abdominal puncture surgical\n navigation","summary":" Navigation for thoracoabdominal puncture surgery is used to locate the needle\nentry point on the patient's body surface. 
The traditional reflective ball\nnavigation method is difficult to position the needle entry point on the soft,\nirregular, smooth chest and abdomen. Due to the lack of clear characteristic\npoints on the body surface using structured light technology, it is difficult\nto identify and locate arbitrary needle insertion points. Based on the high\nstability and high accuracy requirements of surgical navigation, this paper\nproposed a novel method, a muti-modal 3D small object medical marker detection\nmethod, which identifies the center of a small single ring as the needle\ninsertion point. Moreover, this novel method leverages Fourier transform\nenhancement technology to augment the dataset, enrich image details, and\nenhance the network's capability. The method extracts the Region of Interest\n(ROI) of the feature image from both enhanced and original images, followed by\ngenerating a mask map. Subsequently, the point cloud of the ROI from the depth\nmap is obtained through the registration of ROI point cloud contour fitting. In\naddition, this method employs Tukey loss for optimal precision. The\nexperimental results show this novel method proposed in this paper not only\nachieves high-precision and high-stability positioning, but also enables the\npositioning of any needle insertion point.\n","authors":["Zezhao Guo","Yanzhong Guo","Zhanfang Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.08990v1.pdf","comment":"19 pages, 6 figures,"},{"id":"http://arxiv.org/abs/2401.14387v2","updated":"2024-04-13T12:26:44Z","published":"2024-01-25T18:46:35Z","title":"Inconsistency Masks: Removing the Uncertainty from Input-Pseudo-Label\n Pairs","summary":" Efficiently generating sufficient labeled data remains a major bottleneck in\ndeep learning, particularly for image segmentation tasks where labeling\nrequires significant time and effort. This study tackles this issue in a\nresource-constrained environment, devoid of extensive datasets or pre-existing\nmodels. We introduce Inconsistency Masks (IM), a novel approach that filters\nuncertainty in image-pseudo-label pairs to substantially enhance segmentation\nquality, surpassing traditional semi-supervised learning techniques. Employing\nIM, we achieve strong segmentation results with as little as 10% labeled data,\nacross four diverse datasets and it further benefits from integration with\nother techniques, indicating broad applicability. Notably on the ISIC 2018\ndataset, three of our hybrid approaches even outperform models trained on the\nfully labeled dataset. We also present a detailed comparative analysis of\nprevalent semi-supervised learning strategies, all under uniform starting\nconditions, to underline our approach's effectiveness and robustness. The full\ncode is available at: https://github.com/MichaelVorndran/InconsistencyMasks\n","authors":["Michael R. H. Vorndran","Bernhard F. Roeck"],"pdf_url":"https://arxiv.org/pdf/2401.14387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08981v1","updated":"2024-04-13T12:09:37Z","published":"2024-04-13T12:09:37Z","title":"Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active\n Image Classification","summary":" Deep active learning (AL) seeks to minimize the annotation costs for training\ndeep neural networks. 
BAIT, a recently proposed AL strategy based on the Fisher\nInformation, has demonstrated impressive performance across various datasets.\nHowever, BAIT's high computational and memory requirements hinder its\napplicability on large-scale classification tasks, resulting in current\nresearch neglecting BAIT in their evaluation. This paper introduces two methods\nto enhance BAIT's computational efficiency and scalability. Notably, we\nsignificantly reduce its time complexity by approximating the Fisher\nInformation. In particular, we adapt the original formulation by i) taking the\nexpectation over the most probable classes, and ii) constructing a binary\nclassification task, leading to an alternative likelihood for gradient\ncomputations. Consequently, this allows the efficient use of BAIT on\nlarge-scale datasets, including ImageNet. Our unified and comprehensive\nevaluation across a variety of datasets demonstrates that our approximations\nachieve strong performance with considerably reduced time complexity.\nFurthermore, we provide an extensive open-source toolbox that implements recent\nstate-of-the-art AL strategies, available at\nhttps://github.com/dhuseljic/dal-toolbox.\n","authors":["Denis Huseljic","Paul Hahn","Marek Herde","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.08981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15706v2","updated":"2024-04-13T12:06:35Z","published":"2024-03-23T03:56:31Z","title":"G-ACIL: Analytic Learning for Exemplar-Free Generalized Class\n Incremental Learning","summary":" Class incremental learning (CIL) trains a network on sequential tasks with\nseparated categories but suffers from catastrophic forgetting, where models\nquickly lose previously learned knowledge when acquiring new tasks. The\ngeneralized CIL (GCIL) aims to address the CIL problem in a more real-world\nscenario, where incoming data have mixed data categories and unknown sample\nsize distribution, leading to intensified forgetting. Existing attempts for the\nGCIL either have poor performance, or invade data privacy by saving historical\nexemplars. To address this, in this paper, we propose an exemplar-free\ngeneralized analytic class incremental learning (G-ACIL). The G-ACIL adopts\nanalytic learning (a gradient-free training technique), and delivers an\nanalytical solution (i.e., closed-form) to the GCIL scenario. This solution is\nderived via decomposing the incoming data into exposed and unexposed classes,\nallowing an equivalence between the incremental learning and its joint\ntraining, i.e., the weight-invariant property. Such an equivalence is\ntheoretically validated through matrix analysis tools, and hence contributes\ninterpretability in GCIL. It is also empirically evidenced by experiments on\nvarious datasets and settings of GCIL. The results show that the G-ACIL\nexhibits leading performance with high robustness compared with existing\ncompetitive GCIL methods. Codes will be ready at\n\\url{https://github.com/ZHUANGHP/Analytic-continual-learning}.\n","authors":["Huiping Zhuang","Yizhu Chen","Di Fang","Run He","Kai Tong","Hongxin Wei","Ziqian Zeng","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.15706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08979v1","updated":"2024-04-13T12:06:29Z","published":"2024-04-13T12:06:29Z","title":"BG-YOLO: A Bidirectional-Guided Method for Underwater Object Detection","summary":" Degraded underwater images decrease the accuracy of underwater object\ndetection. 
However, existing methods for underwater image enhancement mainly\nfocus on improving the indicators in visual aspects, which may not benefit the\ntasks of underwater image detection, and may lead to serious degradation in\nperformance. To alleviate this problem, we proposed a bidirectional-guided\nmethod for underwater object detection, referred to as BG-YOLO. In the proposed\nmethod, network is organized by constructing an enhancement branch and a\ndetection branch in a parallel way. The enhancement branch consists of a\ncascade of an image enhancement subnet and an object detection subnet. And the\ndetection branch only consists of a detection subnet. A feature guided module\nconnects the shallow convolution layer of the two branches. When training the\nenhancement branch, the object detection subnet in the enhancement branch\nguides the image enhancement subnet to be optimized towards the direction that\nis most conducive to the detection task. The shallow feature map of the trained\nenhancement branch will be output to the feature guided module, constraining\nthe optimization of detection branch through consistency loss and prompting\ndetection branch to learn more detailed information of the objects. And hence\nthe detection performance will be refined. During the detection tasks, only\ndetection branch will be reserved so that no additional cost of computation\nwill be introduced. Extensive experiments demonstrate that the proposed method\nshows significant improvement in performance of the detector in severely\ndegraded underwater scenes while maintaining a remarkable detection speed.\n","authors":["Jian Zhang","Ruiteng Zhang","Xinyue Yan","Xiting Zhuang","Ruicheng Cao"],"pdf_url":"https://arxiv.org/pdf/2404.08979v1.pdf","comment":"15 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08968v1","updated":"2024-04-13T11:13:56Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although being effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that the explanations lacking insights into the decision processes at\nlow and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that our proposed MCPNet while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. 
Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.11248v3","updated":"2024-04-13T11:11:44Z","published":"2024-02-17T11:03:02Z","title":"CoLLaVO: Crayon Large Language and Vision mOdel","summary":" The remarkable success of Large Language Models (LLMs) and instruction tuning\ndrives the evolution of Vision Language Models (VLMs) towards a versatile\ngeneral-purpose model. Yet, it remains unexplored whether current VLMs\ngenuinely possess quality object-level image understanding capabilities\ndetermined from `what objects are in the image?' or `which object corresponds\nto a specified bounding box?'. Our findings reveal that the image understanding\ncapabilities of current VLMs are strongly correlated with their zero-shot\nperformance on vision language (VL) tasks. This suggests that prioritizing\nbasic image understanding is crucial for VLMs to excel at VL tasks. To enhance\nobject-level image understanding, we propose Crayon Large Language and Vision\nmOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a\nnew visual prompt tuning scheme based on panoptic color maps. Furthermore, we\npresent a learning strategy of Dual QLoRA to preserve object-level image\nunderstanding without forgetting it during visual instruction tuning, thereby\nachieving a significant leap in numerous VL benchmarks in a zero-shot setting.\n","authors":["Byung-Kwan Lee","Beomchan Park","Chae Won Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2402.11248v3.pdf","comment":"Code available: https://github.com/ByungKwanLee/CoLLaVO"},{"id":"http://arxiv.org/abs/2404.08966v1","updated":"2024-04-13T11:07:53Z","published":"2024-04-13T11:07:53Z","title":"LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via\n Eulerian Motion Field","summary":" Cinemagraph is a unique form of visual media that combines elements of still\nphotography and subtle motion to create a captivating experience. However, the\nmajority of videos generated by recent works lack depth information and are\nconfined to the constraints of 2D image space. In this paper, inspired by\nsignificant progress in the field of novel view synthesis (NVS) achieved by 3D\nGaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from\n2D image space to 3D space using 3D Gaussian modeling. To achieve this, we\nfirst employ the 3D-GS method to reconstruct 3D Gaussian point clouds from\nmulti-view images of static scenes,incorporating shape regularization terms to\nprevent blurring or artifacts caused by object deformation. We then adopt an\nautoencoder tailored for 3D Gaussian to project it into feature space. To\nmaintain the local continuity of the scene, we devise SuperGaussian for\nclustering based on the acquired features. By calculating the similarity\nbetween clusters and employing a two-stage estimation method, we derive an\nEulerian motion field to describe velocities across the entire scene. The 3D\nGaussian points then move within the estimated Eulerian motion field. Through\nbidirectional animation techniques, we ultimately generate a 3D Cinemagraph\nthat exhibits natural and seamlessly loopable dynamics. 
Experiment results\nvalidate the effectiveness of our approach, demonstrating high-quality and\nvisually appealing scene generation.\n","authors":["Jiyang Li","Lechao Cheng","Zhangye Wang","Tingting Mu","Jingxuan He"],"pdf_url":"https://arxiv.org/pdf/2404.08966v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.08965v1","updated":"2024-04-13T11:07:10Z","published":"2024-04-13T11:07:10Z","title":"Seeing Text in the Dark: Algorithm and Benchmark","summary":" Localizing text in low-light environments is challenging due to visual\ndegradations. Although a straightforward solution involves a two-stage pipeline\nwith low-light image enhancement (LLE) as the initial step followed by\ndetector, LLE is primarily designed for human vision instead of machine and can\naccumulate errors. In this work, we propose an efficient and effective\nsingle-stage approach for localizing text in dark that circumvents the need for\nLLE. We introduce a constrained learning module as an auxiliary mechanism\nduring the training stage of the text detector. This module is designed to\nguide the text detector in preserving textual spatial features amidst feature\nmap resizing, thus minimizing the loss of spatial information in texts under\nlow-light visual degradations. Specifically, we incorporate spatial\nreconstruction and spatial semantic constraints within this module to ensure\nthe text detector acquires essential positional and contextual range knowledge.\nOur approach enhances the original text detector's ability to identify text's\nlocal topological features using a dynamic snake feature pyramid network and\nadopts a bottom-up contour shaping strategy with a novel rectangular\naccumulation technique for accurate delineation of streamlined text features.\nIn addition, we present a comprehensive low-light dataset for arbitrary-shaped\ntext, encompassing diverse scenes and languages. Notably, our method achieves\nstate-of-the-art results on this low-light dataset and exhibits comparable\nperformance on standard normal light datasets. The code and dataset will be\nreleased.\n","authors":["Chengpei Xu","Hao Fu","Long Ma","Wenjing Jia","Chengqi Zhang","Feng Xia","Xiaoyu Ai","Binghao Li","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08964v1","updated":"2024-04-13T11:06:49Z","published":"2024-04-13T11:06:49Z","title":"Understanding Multimodal Deep Neural Networks: A Concept Selection View","summary":" The multimodal deep neural networks, represented by CLIP, have generated rich\ndownstream applications owing to their excellent performance, thus making\nunderstanding the decision-making process of CLIP an essential research topic.\nDue to the complex structure and the massive pre-training data, it is often\nregarded as a black-box model that is too difficult to understand and\ninterpret. Concept-based models map the black-box visual representations\nextracted by deep neural networks onto a set of human-understandable concepts\nand use the concepts to make predictions, enhancing the transparency of the\ndecision-making process. However, these methods involve the datasets labeled\nwith fine-grained attributes by expert knowledge, which incur high costs and\nintroduce excessive human prior knowledge and bias. In this paper, we observe\nthe long-tail distribution of concepts, based on which we propose a two-stage\nConcept Selection Model (CSM) to mine core concepts without introducing any\nhuman priors. 
The concept greedy rough selection algorithm is applied to\nextract head concepts, and then the concept mask fine selection method performs\nthe extraction of core concepts. Experiments show that our approach achieves\ncomparable performance to end-to-end black-box models, and human evaluation\ndemonstrates that the concepts discovered by our method are interpretable and\ncomprehensible for humans.\n","authors":["Chenming Shang","Hengyuan Zhang","Hao Wen","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07853v2","updated":"2024-04-13T10:56:49Z","published":"2024-01-15T17:28:37Z","title":"VeCAF: Vision-language Collaborative Active Finetuning with Training\n Objective Awareness","summary":" Finetuning a pretrained vision model (PVM) is a common technique for learning\ndownstream vision tasks. However, the conventional finetuning process with\nrandomly sampled data points results in diminished training efficiency. To\naddress this drawback, we propose a novel approach, Vision-language\nCollaborative Active Finetuning (VeCAF). With the emerging availability of\nlabels and natural language annotations of images through web-scale crawling or\ncontrolled generation, VeCAF makes use of these information to perform\nparametric data selection for PVM finetuning. VeCAF incorporates the finetuning\nobjective to select significant data points that effectively guide the PVM\ntowards faster convergence to meet the performance goal. This process is\nassisted by the inherent semantic richness of the text embedding space which we\nuse to augment image features. Furthermore, the flexibility of text-domain\naugmentation allows VeCAF to handle out-of-distribution scenarios without\nexternal data. Extensive experiments show the leading performance and high\ncomputational efficiency of VeCAF that is superior to baselines in both\nin-distribution and out-of-distribution image classification tasks. On\nImageNet, VeCAF uses up to 3.3x less training batches to reach the target\nperformance compared to full finetuning, and achieves an accuracy improvement\nof 2.7% over the state-of-the-art active finetuning method with the same number\nof batches.\n","authors":["Rongyu Zhang","Zefan Cai","Huanrui Yang","Zidong Liu","Denis Gudovskiy","Tomoyuki Okuno","Yohei Nakata","Kurt Keutzer","Baobao Chang","Yuan Du","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07853v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.08958v1","updated":"2024-04-13T10:46:11Z","published":"2024-04-13T10:46:11Z","title":"AMU-Tuning: Effective Logit Bias for CLIP-based Few-shot Learning","summary":" Recently, pre-trained vision-language models (e.g., CLIP) have shown great\npotential in few-shot learning and attracted a lot of research interest.\nAlthough efforts have been made to improve few-shot ability of CLIP, key\nfactors on the effectiveness of existing methods have not been well studied,\nlimiting further exploration of CLIP's potential in few-shot learning. In this\npaper, we first introduce a unified formulation to analyze CLIP-based few-shot\nlearning methods from a perspective of logit bias, which encourages us to learn\nan effective logit bias for further improving performance of CLIP-based\nfew-shot learning methods. To this end, we disassemble three key components\ninvolved in computation of logit bias (i.e., logit features, logit predictor,\nand logit fusion) and empirically analyze the effect on performance of few-shot\nclassification. 
Based on analysis of key components, this paper proposes a\nnovel AMU-Tuning method to learn effective logit bias for CLIP-based few-shot\nclassification. Specifically, our AMU-Tuning predicts logit bias by exploiting\nthe appropriate $\\underline{\\textbf{A}}$uxiliary features, which are fed into\nan efficient feature-initialized linear classifier with\n$\\underline{\\textbf{M}}$ulti-branch training. Finally, an\n$\\underline{\\textbf{U}}$ncertainty-based fusion is developed to incorporate\nlogit bias into CLIP for few-shot classification. The experiments are conducted\non several widely used benchmarks, and the results show AMU-Tuning clearly\noutperforms its counterparts while achieving state-of-the-art performance of\nCLIP-based few-shot learning without bells and whistles.\n","authors":["Yuwei Tang","Zhenyi Lin","Qilong Wang","Pengfei Zhu","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2404.08958v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.05975v2","updated":"2024-04-13T10:45:47Z","published":"2023-12-10T19:33:40Z","title":"FM-G-CAM: A Holistic Approach for Explainable AI in Computer Vision","summary":" Explainability is an aspect of modern AI that is vital for impact and\nusability in the real world. The main objective of this paper is to emphasise\nthe need to understand the predictions of Computer Vision models, specifically\nConvolutional Neural Network (CNN) based models. Existing methods of explaining\nCNN predictions are mostly based on Gradient-weighted Class Activation Maps\n(Grad-CAM) and solely focus on a single target class. We show that from the\npoint of the target class selection, we make an assumption on the prediction\nprocess, hence neglecting a large portion of the predictor CNN model's thinking\nprocess. In this paper, we present an exhaustive methodology called Fused\nMulti-class Gradient-weighted Class Activation Map (FM-G-CAM) that considers\nmultiple top predicted classes, which provides a holistic explanation of the\npredictor CNN's thinking rationale. We also provide a detailed and\ncomprehensive mathematical and algorithmic description of our method.\nFurthermore, along with a concise comparison of existing methods, we compare\nFM-G-CAM with Grad-CAM, highlighting its benefits through real-world practical\nuse cases. Finally, we present an open-source Python library with FM-G-CAM\nimplementation to conveniently generate saliency maps for CNN-based model\npredictions.\n","authors":["Ravidu Suien Rammuni Silva","Jordan J. Bird"],"pdf_url":"https://arxiv.org/pdf/2312.05975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08951v1","updated":"2024-04-13T10:15:51Z","published":"2024-04-13T10:15:51Z","title":"Constructing and Exploring Intermediate Domains in Mixed Domain\n Semi-supervised Medical Image Segmentation","summary":" Both limited annotation and domain shift are prevalent challenges in medical\nimage segmentation. Traditional semi-supervised segmentation and unsupervised\ndomain adaptation methods address one of these issues separately. However, the\ncoexistence of limited annotation and domain shift is quite common, which\nmotivates us to introduce a novel and challenging scenario: Mixed Domain\nSemi-supervised medical image Segmentation (MiDSS). In this scenario, we handle\ndata from multiple medical centers, with limited annotations available for a\nsingle domain and a large amount of unlabeled data from multiple domains. 
We\nfound that the key to solving the problem lies in how to generate reliable\npseudo labels for the unlabeled data in the presence of domain shift with\nlabeled data. To tackle this issue, we employ Unified Copy-Paste (UCP) between\nimages to construct intermediate domains, facilitating the knowledge transfer\nfrom the domain of labeled data to the domains of unlabeled data. To fully\nutilize the information within the intermediate domain, we propose a symmetric\nGuidance training strategy (SymGD), which additionally offers direct guidance\nto unlabeled data by merging pseudo labels from intermediate samples.\nSubsequently, we introduce a Training Process aware Random Amplitude MixUp\n(TP-RAM) to progressively incorporate style-transition components into\nintermediate samples. Compared with existing state-of-the-art approaches, our\nmethod achieves a notable 13.57% improvement in Dice score on Prostate dataset,\nas demonstrated on three public datasets. Our code is available at\nhttps://github.com/MQinghe/MiDSS .\n","authors":["Qinghe Ma","Jian Zhang","Lei Qi","Qian Yu","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.08951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03443v3","updated":"2024-04-13T10:00:28Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v3.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2401.06614v2","updated":"2024-04-13T09:23:21Z","published":"2024-01-12T15:05:08Z","title":"Motion2VecSets: 4D Latent Vector Set Diffusion for Non-rigid Shape\n Reconstruction and Tracking","summary":" We introduce Motion2VecSets, a 4D diffusion model for dynamic surface\nreconstruction from point cloud sequences. While existing state-of-the-art\nmethods have demonstrated success in reconstructing non-rigid objects using\nneural field representations, conventional feed-forward networks encounter\nchallenges with ambiguous observations from noisy, partial, or sparse point\nclouds. To address these challenges, we introduce a diffusion model that\nexplicitly learns the shape and motion distribution of non-rigid objects\nthrough an iterative denoising process of compressed latent representations.\nThe diffusion-based priors enable more plausible and probabilistic\nreconstructions when handling ambiguous inputs. 
We parameterize 4D dynamics\nwith latent sets instead of using global latent codes. This novel 4D\nrepresentation allows us to learn local shape and deformation patterns, leading\nto more accurate non-linear motion capture and significantly improving\ngeneralizability to unseen motions and identities. For more temporally-coherent\nobject tracking, we synchronously denoise deformation latent sets and exchange\ninformation across multiple frames. To avoid computational overhead, we\ndesigned an interleaved space and time attention block to alternately aggregate\ndeformation latents along spatial and temporal domains. Extensive comparisons\nagainst state-of-the-art methods demonstrate the superiority of our\nMotion2VecSets in 4D reconstruction from various imperfect observations. More\ndetailed information can be found at\nhttps://vveicao.github.io/projects/Motion2VecSets/.\n","authors":["Wei Cao","Chang Luo","Biao Zhang","Matthias Nießner","Jiapeng Tang"],"pdf_url":"https://arxiv.org/pdf/2401.06614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08937v1","updated":"2024-04-13T09:17:51Z","published":"2024-04-13T09:17:51Z","title":"ChimpVLM: Ethogram-Enhanced Chimpanzee Behaviour Recognition","summary":" We show that chimpanzee behaviour understanding from camera traps can be\nenhanced by providing visual architectures with access to an embedding of text\ndescriptions that detail species behaviours. In particular, we present a\nvision-language model which employs multi-modal decoding of visual features\nextracted directly from camera trap videos to process query tokens representing\nbehaviours and output class predictions. Query tokens are initialised using a\nstandardised ethogram of chimpanzee behaviour, rather than using random or\nname-based initialisations. In addition, the effect of initialising query\ntokens using a masked language model fine-tuned on a text corpus of known\nbehavioural patterns is explored. We evaluate our system on the PanAf500 and\nPanAf20K datasets and demonstrate the performance benefits of our multi-modal\ndecoding approach and query initialisation strategy on multi-class and\nmulti-label recognition tasks, respectively. Results and ablations corroborate\nperformance improvements. We achieve state-of-the-art performance over vision\nand vision-language models in top-1 accuracy (+6.34%) on PanAf500 and overall\n(+1.1%) and tail-class (+2.26%) mean average precision on PanAf20K. We share\ncomplete source code and network weights for full reproducibility of results\nand easy utilisation.\n","authors":["Otto Brookes","Majid Mirmehdi","Hjalmar Kuhl","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2404.08937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08936v1","updated":"2024-04-13T09:10:33Z","published":"2024-04-13T09:10:33Z","title":"Shifting Spotlight for Co-supervision: A Simple yet Efficient\n Single-branch Network to See Through Camouflage","summary":" Efficient and accurate camouflaged object detection (COD) poses a challenge\nin the field of computer vision. Recent approaches explored the utility of edge\ninformation for network co-supervision, achieving notable advancements.\nHowever, these approaches introduce an extra branch for complex edge\nextraction, complicate the model architecture and increases computational\ndemands. 
Addressing this issue, our work replicates the effect that animal's\ncamouflage can be easily revealed under a shifting spotlight, and leverages it\nfor network co-supervision to form a compact yet efficient single-branch\nnetwork, the Co-Supervised Spotlight Shifting Network (CS$^3$Net). The\nspotlight shifting strategy allows CS$^3$Net to learn additional prior within a\nsingle-branch framework, obviating the need for resource demanding multi-branch\ndesign. To leverage the prior of spotlight shifting co-supervision, we propose\nShadow Refinement Module (SRM) and Projection Aware Attention (PAA) for feature\nrefinement and enhancement. To ensure the continuity of multi-scale features\naggregation, we utilize the Extended Neighbor Connection Decoder (ENCD) for\ngenerating the final predictions. Empirical evaluations on public datasets\nconfirm that our CS$^3$Net offers an optimal balance between efficiency and\nperformance: it accomplishes a 32.13% reduction in Multiply-Accumulate (MACs)\noperations compared to leading efficient COD models, while also delivering\nsuperior performance.\n","authors":["Yang Hu","Jinxia Zhang","Kaihua Zhang","Yin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.08936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09486v3","updated":"2024-04-13T09:00:35Z","published":"2023-12-15T01:52:35Z","title":"Unraveling Batch Normalization for Realistic Test-Time Adaptation","summary":" While recent test-time adaptations exhibit efficacy by adjusting batch\nnormalization to narrow domain disparities, their effectiveness diminishes with\nrealistic mini-batches due to inaccurate target estimation. As previous\nattempts merely introduce source statistics to mitigate this issue, the\nfundamental problem of inaccurate target estimation still persists, leaving the\nintrinsic test-time domain shifts unresolved. This paper delves into the\nproblem of mini-batch degradation. By unraveling batch normalization, we\ndiscover that the inexact target statistics largely stem from the substantially\nreduced class diversity in batch. Drawing upon this insight, we introduce a\nstraightforward tool, Test-time Exponential Moving Average (TEMA), to bridge\nthe class diversity gap between training and testing batches. Importantly, our\nTEMA adaptively extends the scope of typical methods beyond the current batch\nto incorporate a diverse set of class information, which in turn boosts an\naccurate target estimation. Built upon this foundation, we further design a\nnovel layer-wise rectification strategy to consistently promote test-time\nperformance. Our proposed method enjoys a unique advantage as it requires\nneither training nor tuning parameters, offering a truly hassle-free solution.\nIt significantly enhances model robustness against shifted domains and\nmaintains resilience in diverse real-world scenarios with various batch sizes,\nachieving state-of-the-art performance on several major benchmarks. 
Code is\navailable at \\url{https://github.com/kiwi12138/RealisticTTA}.\n","authors":["Zixian Su","Jingwei Guo","Kai Yao","Xi Yang","Qiufeng Wang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.09486v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2404.04880v2","updated":"2024-04-13T08:53:28Z","published":"2024-04-07T08:51:31Z","title":"GauU-Scene V2: Assessing the Reliability of Image-Based Metrics with\n Expansive Lidar Image Dataset Using 3DGS and NeRF","summary":" We introduce a novel, multimodal large-scale scene reconstruction benchmark\nthat utilizes newly developed 3D representation approaches: Gaussian Splatting\nand Neural Radiance Fields (NeRF). Our expansive U-Scene dataset surpasses any\npreviously existing real large-scale outdoor LiDAR and image dataset in both\narea and point count. GauU-Scene encompasses over 6.5 square kilometers and\nfeatures a comprehensive RGB dataset coupled with LiDAR ground truth.\nAdditionally, we are the first to propose a LiDAR and image alignment method\nfor a drone-based dataset. Our assessment of GauU-Scene includes a detailed\nanalysis across various novel viewpoints, employing image-based metrics such as\nSSIM, LPIPS, and PSNR on NeRF and Gaussian Splatting based methods. This\nanalysis reveals contradictory results when applying geometric-based metrics\nlike Chamfer distance. The experimental results on our multimodal dataset\nhighlight the unreliability of current image-based metrics and reveal\nsignificant drawbacks in geometric reconstruction using the current Gaussian\nSplatting-based method, further illustrating the necessity of our dataset for\nassessing geometry reconstruction tasks. We also provide detailed supplementary\ninformation on data collection protocols and make the dataset available on the\nfollowing anonymous project page\n","authors":["Butian Xiong","Nanjun Zheng","Junhua Liu","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.04880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09625v2","updated":"2024-04-13T08:51:33Z","published":"2023-12-15T09:08:14Z","title":"Weakly-Supervised 3D Visual Grounding based on Visual Linguistic\n Alignment","summary":" Learning to ground natural language queries to target objects or regions in\n3D point clouds is quite essential for 3D scene understanding. Nevertheless,\nexisting 3D visual grounding approaches require a substantial number of\nbounding box annotations for text queries, which is time-consuming and\nlabor-intensive to obtain. In this paper, we propose \\textbf{3D-VLA}, a weakly\nsupervised approach for \\textbf{3D} visual grounding based on \\textbf{V}isual\n\\textbf{L}inguistic \\textbf{A}lignment. Our 3D-VLA exploits the superior\nability of current large-scale vision-language models (VLMs) on aligning the\nsemantics between texts and 2D images, as well as the naturally existing\ncorrespondences between 2D images and 3D point clouds, and thus implicitly\nconstructs correspondences between texts and 3D point clouds with no need for\nfine-grained box annotations in the training procedure. During the inference\nstage, the learned text-3D correspondence will help us ground the text queries\nto the 3D target objects even without 2D images. 
To the best of our knowledge,\nthis is the first work to investigate 3D visual grounding in a weakly\nsupervised manner by involving large scale vision-language models, and\nextensive experiments on ReferIt3D and ScanRefer datasets demonstrate that our\n3D-VLA achieves comparable and even superior results over the fully supervised\nmethods.\n","authors":["Xiaoxu Xu","Yitian Yuan","Qiudan Zhang","Wenhui Wu","Zequn Jie","Lin Ma","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.09625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08931v1","updated":"2024-04-13T08:49:17Z","published":"2024-04-13T08:49:17Z","title":"Label-free Anomaly Detection in Aerial Agricultural Images with Masked\n Image Modeling","summary":" Detecting various types of stresses (nutritional, water, nitrogen, etc.) in\nagricultural fields is critical for farmers to ensure maximum productivity.\nHowever, stresses show up in different shapes and sizes across different crop\ntypes and varieties. Hence, this is posed as an anomaly detection task in\nagricultural images. Accurate anomaly detection in agricultural UAV images is\nvital for early identification of field irregularities. Traditional supervised\nlearning faces challenges in adapting to diverse anomalies, necessitating\nextensive annotated data. In this work, we overcome this limitation with\nself-supervised learning using a masked image modeling approach. Masked\nAutoencoders (MAE) extract meaningful normal features from unlabeled image\nsamples which produces high reconstruction error for the abnormal pixels during\nreconstruction. To remove the need of using only ``normal\" data while training,\nwe use an anomaly suppression loss mechanism that effectively minimizes the\nreconstruction of anomalous pixels and allows the model to learn anomalous\nareas without explicitly separating ``normal\" images for training. Evaluation\non the Agriculture-Vision data challenge shows a mIOU score improvement in\ncomparison to prior state of the art in unsupervised and self-supervised\nmethods. A single model generalizes across all the anomaly categories in the\nAgri-Vision Challenge Dataset\n","authors":["Sambal Shikhar","Anupam Sobti"],"pdf_url":"https://arxiv.org/pdf/2404.08931v1.pdf","comment":"The paper has been accepted to CVPR 2024 5th Workshop on Vision for\n Agriculture as an Oral Paper"},{"id":"http://arxiv.org/abs/2403.11134v2","updated":"2024-04-13T08:40:52Z","published":"2024-03-17T07:57:08Z","title":"Recent Advances in 3D Gaussian Splatting","summary":" The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the\nrendering speed of novel view synthesis. Unlike neural implicit representations\nlike Neural Radiance Fields (NeRF) that represent a 3D scene with position and\nviewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of\nGaussian ellipsoids to model the scene so that efficient rendering can be\naccomplished by rasterizing Gaussian ellipsoids into images. Apart from the\nfast rendering speed, the explicit representation of 3D Gaussian Splatting\nfacilitates editing tasks like dynamic reconstruction, geometry editing, and\nphysical simulation. Considering the rapid change and growing number of works\nin this field, we present a literature review of recent 3D Gaussian Splatting\nmethods, which can be roughly classified into 3D reconstruction, 3D editing,\nand other downstream applications by functionality. 
Traditional point-based\nrendering methods and the rendering formulation of 3D Gaussian Splatting are\nalso illustrated for a better understanding of this technique. This survey aims\nto help beginners get into this field quickly and provide experienced\nresearchers with a comprehensive overview, which can stimulate the future\ndevelopment of the 3D Gaussian Splatting representation.\n","authors":["Tong Wu","Yu-Jie Yuan","Ling-Xiao Zhang","Jie Yang","Yan-Pei Cao","Ling-Qi Yan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08928v1","updated":"2024-04-13T08:36:13Z","published":"2024-04-13T08:36:13Z","title":"DeDoDe v2: Analyzing and Improving the DeDoDe Keypoint Detector","summary":" In this paper, we analyze and improve the recently proposed DeDoDe\nkeypoint detector. We focus our analysis on some key issues. First, we find\nthat DeDoDe keypoints tend to cluster together, which we fix by performing\nnon-max suppression on the target distribution of the detector during training.\nSecond, we address issues related to data augmentation. In particular, the\nDeDoDe detector is sensitive to large rotations. We fix this by including\n90-degree rotations as well as horizontal flips. Finally, the decoupled nature\nof the DeDoDe detector makes evaluation of downstream usefulness problematic.\nWe fix this by matching the keypoints with a pretrained dense matcher (RoMa)\nand evaluating two-view pose estimates. We find that the original long training\nis detrimental to performance, and therefore propose a much shorter training\nschedule. We integrate all these improvements into our proposed detector DeDoDe\nv2 and evaluate it with the original DeDoDe descriptor on the MegaDepth-1500\nand IMC2022 benchmarks. Our proposed detector significantly increases pose\nestimation results, notably from 75.9 to 78.3 mAA on the IMC2022 challenge.\nCode and weights are available at https://github.com/Parskatt/DeDoDe\n","authors":["Johan Edstedt","Georg Bökman","Zhenjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.08928v1.pdf","comment":"Accepted to Sixth Workshop on Image Matching - CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08926v1","updated":"2024-04-13T08:27:10Z","published":"2024-04-13T08:27:10Z","title":"Diffusion Models Meet Remote Sensing: Principles, Methods, and\n Perspectives","summary":" As a newly emerging advance in deep generative models, diffusion models have\nachieved state-of-the-art results in many fields, including computer vision,\nnatural language processing, and molecule design. The remote sensing community\nhas also noticed the powerful ability of diffusion models and quickly applied\nthem to a variety of tasks for image processing. Given the rapid increase in\nresearch on diffusion models in the field of remote sensing, it is necessary to\nconduct a comprehensive review of existing diffusion model-based remote sensing\npapers, to help researchers recognize the potential of diffusion models and\nprovide some directions for further exploration. Specifically, this paper first\nintroduces the theoretical background of diffusion models, and then\nsystematically reviews the applications of diffusion models in remote sensing,\nincluding image generation, enhancement, and interpretation. 
Finally, the\nlimitations of existing remote sensing diffusion models and worthy research\ndirections for further exploration are discussed and summarized.\n","authors":["Yidan Liu","Jun Yue","Shaobo Xia","Pedram Ghamisi","Weiying Xie","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.08926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08923v1","updated":"2024-04-13T08:15:57Z","published":"2024-04-13T08:15:57Z","title":"Trustworthy Multimodal Fusion for Sentiment Analysis in Ordinal\n Sentiment Space","summary":" Multimodal video sentiment analysis aims to integrate multiple modal\ninformation to analyze the opinions and attitudes of speakers. Most previous\nwork focuses on exploring the semantic interactions of intra- and\ninter-modality. However, these works ignore the reliability of multimodality,\ni.e., modalities tend to contain noise, semantic ambiguity, missing modalities,\netc. In addition, previous multimodal approaches treat different modalities\nequally, largely ignoring their different contributions. Furthermore, existing\nmultimodal sentiment analysis methods directly regress sentiment scores without\nconsidering ordinal relationships within sentiment categories, with limited\nperformance. To address the aforementioned problems, we propose a trustworthy\nmultimodal sentiment ordinal network (TMSON) to improve performance in\nsentiment analysis. Specifically, we first devise a unimodal feature extractor\nfor each modality to obtain modality-specific features. Then, an uncertainty\ndistribution estimation network is customized, which estimates the unimodal\nuncertainty distributions. Next, Bayesian fusion is performed on the learned\nunimodal distributions to obtain multimodal distributions for sentiment\nprediction. Finally, an ordinal-aware sentiment space is constructed, where\nordinal regression is used to constrain the multimodal distributions. Our\nproposed TMSON outperforms baselines on multimodal sentiment analysis tasks,\nand empirical results demonstrate that TMSON is capable of reducing uncertainty\nto obtain more robust predictions.\n","authors":["Zhuyang Xie","Yan Yang","Jie Wang","Xiaorong Liu","Xiaofan Li"],"pdf_url":"https://arxiv.org/pdf/2404.08923v1.pdf","comment":"14 pages, 9 figures, Accepted by IEEE Transactions on Circuits and\n Systems for Video Technology"},{"id":"http://arxiv.org/abs/2310.03420v2","updated":"2024-04-13T08:07:05Z","published":"2023-10-05T09:57:23Z","title":"FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained\n Diffusion Models and Monocular Depth Estimators","summary":" Matching cross-modality features between images and point clouds is a\nfundamental problem for image-to-point cloud registration. However, due to the\nmodality difference between images and points, it is difficult to learn robust\nand discriminative cross-modality features by existing metric learning methods\nfor feature matching. Instead of applying metric learning on cross-modality\ndata, we propose to unify the modality between images and point clouds by\npretrained large-scale models first, and then establish robust correspondence\nwithin the same modality. We show that the intermediate features, called\ndiffusion features, extracted by depth-to-image diffusion models are\nsemantically consistent between images and point clouds, which enables the\nbuilding of coarse but robust cross-modality correspondences. We further\nextract geometric features on depth maps produced by the monocular depth\nestimator. 
By matching such geometric features, we significantly improve the\naccuracy of the coarse correspondences produced by diffusion features.\nExtensive experiments demonstrate that without any task-specific training,\ndirect utilization of both features produces accurate image-to-point cloud\nregistration. On three public indoor and outdoor benchmarks, the proposed\nmethod averagely achieves a 20.6 percent improvement in Inlier Ratio, a\nthree-fold higher Inlier Number, and a 48.6 percent improvement in Registration\nRecall than existing state-of-the-arts.\n","authors":["Haiping Wang","Yuan Liu","Bing Wang","Yujing Sun","Zhen Dong","Wenping Wang","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2310.03420v2.pdf","comment":"CameraReady version for ICLR 2024. Project Page:\n https://whu-usi3dv.github.io/FreeReg/"},{"id":"http://arxiv.org/abs/2404.08921v1","updated":"2024-04-13T07:50:17Z","published":"2024-04-13T07:50:17Z","title":"PNeRV: Enhancing Spatial Consistency via Pyramidal Neural Representation\n for Videos","summary":" The primary focus of Neural Representation for Videos (NeRV) is to\neffectively model its spatiotemporal consistency. However, current NeRV systems\noften face a significant issue of spatial inconsistency, leading to decreased\nperceptual quality. To address this issue, we introduce the Pyramidal Neural\nRepresentation for Videos (PNeRV), which is built on a multi-scale information\nconnection and comprises a lightweight rescaling operator, Kronecker\nFully-connected layer (KFc), and a Benign Selective Memory (BSM) mechanism. The\nKFc, inspired by the tensor decomposition of the vanilla Fully-connected layer,\nfacilitates low-cost rescaling and global correlation modeling. BSM merges\nhigh-level features with granular ones adaptively. Furthermore, we provide an\nanalysis based on the Universal Approximation Theory of the NeRV system and\nvalidate the effectiveness of the proposed PNeRV.We conducted comprehensive\nexperiments to demonstrate that PNeRV surpasses the performance of contemporary\nNeRV models, achieving the best results in video regression on UVG and DAVIS\nunder various metrics (PSNR, SSIM, LPIPS, and FVD). Compared to vanilla NeRV,\nPNeRV achieves a +4.49 dB gain in PSNR and a 231% increase in FVD on UVG, along\nwith a +3.28 dB PSNR and 634% FVD increase on DAVIS.\n","authors":["Qi Zhao","M. Salman Asif","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03270v4","updated":"2024-04-13T07:33:57Z","published":"2023-10-05T02:51:53Z","title":"EfficientDM: Efficient Quantization-Aware Fine-Tuning of Low-Bit\n Diffusion Models","summary":" Diffusion models have demonstrated remarkable capabilities in image synthesis\nand related generative tasks. Nevertheless, their practicality for real-world\napplications is constrained by substantial computational costs and latency\nissues. Quantization is a dominant way to compress and accelerate diffusion\nmodels, where post-training quantization (PTQ) and quantization-aware training\n(QAT) are two main approaches, each bearing its own properties. While PTQ\nexhibits efficiency in terms of both time and data usage, it may lead to\ndiminished performance in low bit-width. On the other hand, QAT can alleviate\nperformance degradation but comes with substantial demands on computational and\ndata resources. 
In this paper, we introduce a data-free and parameter-efficient\nfine-tuning framework for low-bit diffusion models, dubbed EfficientDM, to\nachieve QAT-level performance with PTQ-like efficiency. Specifically, we\npropose a quantization-aware variant of the low-rank adapter (QALoRA) that can\nbe merged with model weights and jointly quantized to low bit-width. The\nfine-tuning process distills the denoising capabilities of the full-precision\nmodel into its quantized counterpart, eliminating the requirement for training\ndata. We also introduce scale-aware optimization and temporal learned step-size\nquantization to further enhance performance. Extensive experimental results\ndemonstrate that our method significantly outperforms previous PTQ-based\ndiffusion models while maintaining similar time and data efficiency.\nSpecifically, there is only a 0.05 sFID increase when quantizing both weights\nand activations of LDM-4 to 4-bit on ImageNet 256x256. Compared to QAT-based\nmethods, our EfficientDM also boasts a 16.2x faster quantization speed with\ncomparable generation quality. Code is available at\n\\href{https://github.com/ThisisBillhe/EfficientDM}{this url}.\n","authors":["Yefei He","Jing Liu","Weijia Wu","Hong Zhou","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.03270v4.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.08917v1","updated":"2024-04-13T07:30:17Z","published":"2024-04-13T07:30:17Z","title":"MAProtoNet: A Multi-scale Attentive Interpretable Prototypical Part\n Network for 3D Magnetic Resonance Imaging Brain Tumor Classification","summary":" Automated diagnosis with artificial intelligence has emerged as a promising\narea in the realm of medical imaging, while the interpretability of the\nintroduced deep neural networks still remains an urgent concern. Although\ncontemporary works, such as XProtoNet and MProtoNet, have sought to design\ninterpretable prediction models for the issue, the localization precision of\ntheir resulting attribution maps can be further improved. To this end, we\npropose a Multi-scale Attentive Prototypical part Network, termed MAProtoNet,\nto provide more precise maps for attribution. Specifically, we introduce a\nconcise multi-scale module to merge attentive features from quadruplet\nattention layers and produce attribution maps. The proposed quadruplet\nattention layers can enhance the existing online class activation mapping loss\nvia capturing interactions between the spatial and channel dimensions, while the\nmulti-scale module then fuses both fine-grained and coarse-grained information\nfor precise map generation. We also apply a novel multi-scale mapping loss for\nsupervision on the proposed multi-scale module. Compared to existing\ninterpretable prototypical part networks in medical imaging, MAProtoNet can\nachieve state-of-the-art performance in localization on brain tumor\nsegmentation (BraTS) datasets, resulting in approximately 4% overall\nimprovement on activation precision score (with a best score of 85.8%), without\nusing additional annotated labels of segmentation. 
Our code will be released in\nhttps://github.com/TUAT-Novice/maprotonet.\n","authors":["Binghua Li","Jie Mao","Zhe Sun","Chao Li","Qibin Zhao","Toshihisa Tanaka"],"pdf_url":"https://arxiv.org/pdf/2404.08917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08916v1","updated":"2024-04-13T07:30:16Z","published":"2024-04-13T07:30:16Z","title":"Meply: A Large-scale Dataset and Baseline Evaluations for Metastatic\n Perirectal Lymph Node Detection and Segmentation","summary":" Accurate segmentation of metastatic lymph nodes in rectal cancer is crucial\nfor the staging and treatment of rectal cancer. However, existing segmentation\napproaches face challenges due to the absence of pixel-level annotated datasets\ntailored for lymph nodes around the rectum. Additionally, metastatic lymph\nnodes are characterized by their relatively small size, irregular shapes, and\nlower contrast compared to the background, further complicating the\nsegmentation task. To address these challenges, we present the first\nlarge-scale perirectal metastatic lymph node CT image dataset called Meply,\nwhich encompasses pixel-level annotations of 269 patients diagnosed with rectal\ncancer. Furthermore, we introduce a novel lymph-node segmentation model named\nCoSAM. The CoSAM utilizes sequence-based detection to guide the segmentation of\nmetastatic lymph nodes in rectal cancer, contributing to improved localization\nperformance for the segmentation model. It comprises three key components:\nsequence-based detection module, segmentation module, and collaborative\nconvergence unit. To evaluate the effectiveness of CoSAM, we systematically\ncompare its performance with several popular segmentation methods using the\nMeply dataset. Our code and dataset will be publicly available at:\nhttps://github.com/kanydao/CoSAM.\n","authors":["Weidong Guo","Hantao Zhang","Shouhong Wan","Bingbing Zou","Wanqin Wang","Chenyang Qiu","Jun Li","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2404.08916v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.08915v1","updated":"2024-04-13T07:27:06Z","published":"2024-04-13T07:27:06Z","title":"PM2: A New Prompting Multi-modal Model Paradigm for Few-shot Medical\n Image Classification","summary":" Few-shot learning has been successfully applied to medical image\nclassification as only very few medical examples are available for training.\nDue to the challenging problem of limited number of annotated medical images,\nimage representations should not be solely derived from a single image modality\nwhich is insufficient for characterizing concept classes. In this paper, we\npropose a new prompting multi-modal model paradigm on medical image\nclassification based on multi-modal foundation models, called PM2. Besides\nimage modality,PM2 introduces another supplementary text input, known as\nprompt, to further describe corresponding image or concept classes and\nfacilitate few-shot learning across diverse modalities. To better explore the\npotential of prompt engineering, we empirically investigate five distinct\nprompt schemes under the new paradigm. Furthermore, linear probing in\nmulti-modal models acts as a linear classification head taking as input only\nclass token, which ignores completely merits of rich statistics inherent in\nhigh-level visual tokens. Thus, we alternatively perform a linear\nclassification on feature distribution of visual tokens and class token\nsimultaneously. 
To effectively mine such rich statistics, a global covariance\npooling with efficient matrix power normalization is used to aggregate visual\ntokens. Then we study and combine two classification heads. One is shared for the\nclass token of the image from the vision encoder and the prompt representation encoded by\nthe text encoder. The other performs classification on the feature distribution of visual\ntokens from the vision encoder. Extensive experiments on three medical datasets\nshow that our PM2 significantly outperforms counterparts regardless of prompt\nschemes and achieves state-of-the-art performance.\n","authors":["Zhenwei Wang","Qiule Sun","Bingbing Zhang","Pengfei Wang","Jianxin Zhang","Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03331v2","updated":"2024-04-13T07:20:03Z","published":"2023-01-09T13:35:03Z","title":"A Specific Task-oriented Semantic Image Communication System for\n substation patrol inspection","summary":" Intelligent inspection robots are widely used in substation patrol\ninspection, which can help check potential safety hazards by patrolling the\nsubstation and sending back scene images. However, when patrolling some\nmarginal areas with weak signal, the scene images cannot be successfully\ntransmitted to be used for hidden danger elimination, which greatly reduces\nthe quality of the robots' daily work. To solve this problem, a Specific\nTask-oriented Semantic Communication System for Image (STSCI) is designed, which\ninvolves semantic feature extraction, transmission, restoration and\nenhancement to get clearer images sent by intelligent robots under weak\nsignals. Inspired by the fact that only some specific details of the image are needed in\nthe substation patrol inspection task, we propose a new paradigm of semantic\nenhancement in this specific task to ensure the clarity of key semantic\ninformation when facing a lower bit rate or a low signal-to-noise ratio\nsituation. Across reality-based simulations, experiments show our STSCI can\ngenerally surpass traditional image-compression-based, channel-coding-based,\nor other semantic communication systems in the substation patrol inspection task\nwith a lower bit rate even under a low signal-to-noise ratio situation.\n","authors":["Senran Fan","Haotai Liang","Chen Dong","Xiaodong Xu","Geng Liu"],"pdf_url":"https://arxiv.org/pdf/2301.03331v2.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.06061v2","updated":"2024-04-13T06:14:42Z","published":"2023-05-10T11:26:36Z","title":"Visual Tuning","summary":" Fine-tuning visual models has been widely shown to yield promising performance on many\ndownstream visual tasks. With the surprising development of pre-trained visual\nfoundation models, visual tuning jumped out of the standard modus operandi that\nfine-tunes the whole pre-trained model or just the fully connected layer.\nInstead, recent advances can achieve performance superior to full-tuning the\nwhole pre-trained parameters by updating far fewer parameters, enabling edge\ndevices and downstream applications to reuse the increasingly large foundation\nmodels deployed on the cloud. With the aim of helping researchers get the full\npicture and future directions of visual tuning, this survey characterizes a\nlarge and thoughtful selection of recent works, providing a systematic and\ncomprehensive overview of existing work and models. 
Specifically, it provides a\ndetailed background of visual tuning and categorizes recent visual tuning\ntechniques into five groups: prompt tuning, adapter tuning, parameter tuning,\nand remapping tuning. Meanwhile, it offers some exciting research directions\nfor prospective pre-training and various interactions in visual tuning.\n","authors":["Bruce X. B. Yu","Jianlong Chang","Haixin Wang","Lingbo Liu","Shijie Wang","Zhiyu Wang","Junfan Lin","Lingxi Xie","Haojie Li","Zhouchen Lin","Qi Tian","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2305.06061v2.pdf","comment":"37 pages. Accepted to ACM CSUR"},{"id":"http://arxiv.org/abs/2401.03749v2","updated":"2024-04-13T05:56:09Z","published":"2024-01-08T09:20:46Z","title":"The Method of Detecting Flying Birds in Surveillance Video Based on\n Their Characteristics","summary":" Aiming at the characteristics of flying bird objects in surveillance\nvideo, such as indistinct single-frame image features, a generally small size,\nand asymmetric shapes, this paper proposes a Flying Bird Object\nDetection method in Surveillance Video (FBOD-SV). Firstly, a new feature\naggregation module, the Correlation Attention Feature Aggregation\n(Co-Attention-FA) module, is designed to aggregate the features of the flying\nbird object according to the bird object's correlation across multiple consecutive\nframes of images. Secondly, a Flying Bird Object Detection Network (FBOD-Net)\nwith down-sampling and then up-sampling is designed, which uses a large feature\nlayer that fuses fine spatial information and large receptive field information\nto detect special multi-scale (mostly small-scale) bird objects. Finally, the\nSimOTA dynamic label allocation method is applied to One-Category object\ndetection, and the SimOTA-OC dynamic label strategy is proposed to solve the\ndifficult problem of label allocation caused by irregular flying bird objects.\nIn this paper, the algorithm's performance is verified on an experimental dataset\nof surveillance videos of flying bird objects at a traction\nsubstation. The experimental results show that the surveillance video flying\nbird object detection method proposed in this paper effectively improves the\ndetection performance of flying bird objects.\n","authors":["Ziwei Sun","Zexi Hua","Hengchao Li","Yan Li"],"pdf_url":"https://arxiv.org/pdf/2401.03749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10942v4","updated":"2024-04-13T05:52:04Z","published":"2023-10-17T02:38:09Z","title":"UNK-VQA: A Dataset and a Probe into the Abstention Ability of\n Multi-modal Large Models","summary":" Teaching Visual Question Answering (VQA) models to refrain from answering\nunanswerable questions is necessary for building a trustworthy AI system.\nExisting studies, though having explored various aspects of VQA, have somewhat\nignored this particular attribute. This paper aims to bridge the research gap\nby contributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that models do not\nknow. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. Specifically, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By this means, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. 
We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover their\nsignificant limitations when applied to our dataset. Additionally, we also\npropose a straightforward method to tackle these unanswerable questions. This\ndataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems. We have made the dataset\n(https://github.com/guoyang9/UNK-VQA) available to facilitate further\nexploration in this area.\n","authors":["Yangyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17963v2","updated":"2024-04-13T04:16:18Z","published":"2023-11-29T11:30:33Z","title":"M$^{2}$Chat: Empowering VLM for Multimodal LLM Interleaved Text-Image\n Generation","summary":" While current LLM chatbots like GPT-4V bridge the gap between human\ninstructions and visual representations to enable text-image generations, they\nstill lack efficient alignment methods for high-fidelity performance on\nmultiple downstream tasks. In this paper, we propose \\textbf{$M^{2}Chat$}, a\nnovel unified multimodal LLM framework for generating interleaved text-image\nconversation across various scenarios. Specifically, we propose an\n$M^{3}Adapter$ that efficiently integrates granular low-level visual\ninformation and high-level semantic features from multi-modality prompts. Upon\nthe well-aligned fused feature, $M^{3}Adapter$ tailors a learnable gating\nstrategy to balance the model creativity and consistency across various tasks\nadaptively. Moreover, to further enhance the effectiveness of $M^{3}Adapter$\nwhile preserving the coherence of semantic context comprehension, we introduce\na two-stage $M^{3}FT$ fine-tuning strategy. This strategy optimizes disjoint\ngroups of parameters for image-text alignment and visual-instruction\nrespectively. Extensive experiments demonstrate our $M^{2}Chat$ surpasses\nstate-of-the-art counterparts across diverse benchmarks, showcasing its prowess\nin interleaving generation, storytelling, and multimodal dialogue systems. The\ndemo and code are available at\n\\red{https://mattie-e.github.io/M2Chat.github.io}.\n","authors":["Xiaowei Chi","Rongyu Zhang","Zhengkai Jiang","Yijiang Liu","Yatian Wang","Xingqun Qi","Wenhan Luo","Peng Gao","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2311.17963v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08894v1","updated":"2024-04-13T04:01:35Z","published":"2024-04-13T04:01:35Z","title":"HEAT: Head-level Parameter Efficient Adaptation of Vision Transformers\n with Taylor-expansion Importance Scores","summary":" Prior computer vision research extensively explores adapting pre-trained\nvision transformers (ViT) to downstream tasks. However, the substantial number\nof parameters requiring adaptation has led to a focus on Parameter Efficient\nTransfer Learning (PETL) as an approach to efficiently adapt large pre-trained\nmodels by training only a subset of parameters, achieving both parameter and\nstorage efficiency. Although the significantly reduced parameters have shown\npromising performance under transfer learning scenarios, the structural\nredundancy inherent in the model still leaves room for improvement, which\nwarrants further investigation. 
In this paper, we propose Head-level Efficient\nAdaptation with Taylor-expansion importance score (HEAT): a simple method that\nefficiently fine-tuning ViTs at head levels. In particular, the first-order\nTaylor expansion is employed to calculate each head's importance score, termed\nTaylor-expansion Importance Score (TIS), indicating its contribution to\nspecific tasks. Additionally, three strategies for calculating TIS have been\nemployed to maximize the effectiveness of TIS. These strategies calculate TIS\nfrom different perspectives, reflecting varying contributions of parameters.\nBesides ViT, HEAT has also been applied to hierarchical transformers such as\nSwin Transformer, demonstrating its versatility across different transformer\narchitectures. Through extensive experiments, HEAT has demonstrated superior\nperformance over state-of-the-art PETL methods on the VTAB-1K benchmark.\n","authors":["Yibo Zhong","Yao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.08894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v3","updated":"2024-04-13T03:56:11Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks and etc. Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v3.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2404.08892v1","updated":"2024-04-13T03:46:35Z","published":"2024-04-13T03:46:35Z","title":"ChangeAnywhere: Sample Generation for Remote Sensing Change Detection\n via Semantic Latent Diffusion Model","summary":" Remote sensing change detection (CD) is a pivotal technique that pinpoints\nchanges on a global scale based on multi-temporal images. With the recent\nexpansion of deep learning, supervised deep learning-based CD models have shown\nsatisfactory performance. However, CD sample labeling is very time-consuming as\nit is densely labeled and requires expert knowledge. To alleviate this problem,\nwe introduce ChangeAnywhere, a novel CD sample generation method using the\nsemantic latent diffusion model and single-temporal images. Specifically,\nChangeAnywhere leverages the relative ease of acquiring large single-temporal\nsemantic datasets to generate large-scale, diverse, and semantically annotated\nbi-temporal CD datasets. 
ChangeAnywhere captures the two essentials of CD\nsamples, i.e., change implies semantically different, and non-change implies\nreasonable change under the same semantic constraints. We generated\nChangeAnywhere-100K, the largest synthesis CD dataset with 100,000 pairs of CD\nsamples based on the proposed method. The ChangeAnywhere-100K significantly\nimproved both zero-shot and few-shot performance on two CD benchmark datasets\nfor various deep learning-based CD models, as demonstrated by transfer\nexperiments. This paper delineates the enormous potential of ChangeAnywhere for\nCD sample generation and demonstrates the subsequent enhancement of model\nperformance. Therefore, ChangeAnywhere offers a potent tool for remote sensing\nCD. All codes and pre-trained models will be available at\nhttps://github.com/tangkai-RS/ChangeAnywhere.\n","authors":["Kai Tang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08892v1.pdf","comment":"Concise manuscript version of ChangeAnywhere"},{"id":"http://arxiv.org/abs/2209.14145v3","updated":"2024-04-13T03:36:29Z","published":"2022-09-28T14:49:28Z","title":"Multi-scale Attention Network for Single Image Super-Resolution","summary":" ConvNets can compete with transformers in high-level tasks by exploiting\nlarger receptive fields. To unleash the potential of ConvNet in\nsuper-resolution, we propose a multi-scale attention network (MAN), by coupling\nclassical multi-scale mechanism with emerging large kernel attention. In\nparticular, we proposed multi-scale large kernel attention (MLKA) and gated\nspatial attention unit (GSAU). Through our MLKA, we modify large kernel\nattention with multi-scale and gate schemes to obtain the abundant attention\nmap at various granularity levels, thereby aggregating global and local\ninformation and avoiding potential blocking artifacts. In GSAU, we integrate\ngate mechanism and spatial attention to remove the unnecessary linear layer and\naggregate informative spatial context. To confirm the effectiveness of our\ndesigns, we evaluate MAN with multiple complexities by simply stacking\ndifferent numbers of MLKA and GSAU. Experimental results illustrate that our\nMAN can perform on par with SwinIR and achieve varied trade-offs between\nstate-of-the-art performance and computations.\n","authors":["Yan Wang","Yusen Li","Gang Wang","Xiaoguang Liu"],"pdf_url":"https://arxiv.org/pdf/2209.14145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08886v1","updated":"2024-04-13T03:15:56Z","published":"2024-04-13T03:15:56Z","title":"EIVEN: Efficient Implicit Attribute Value Extraction using Multimodal\n LLM","summary":" In e-commerce, accurately extracting product attribute values from multimodal\ndata is crucial for improving user experience and operational efficiency of\nretailers. However, previous approaches to multimodal attribute value\nextraction often struggle with implicit attribute values embedded in images or\ntext, rely heavily on extensive labeled data, and can easily confuse similar\nattribute values. To address these issues, we introduce EIVEN, a data- and\nparameter-efficient generative framework that pioneers the use of multimodal\nLLM for implicit attribute value extraction. EIVEN leverages the rich inherent\nknowledge of a pre-trained LLM and vision encoder to reduce reliance on labeled\ndata. We also introduce a novel Learning-by-Comparison technique to reduce\nmodel confusion by enforcing attribute value comparison and difference\nidentification. 
Additionally, we construct initial open-source datasets for\nmultimodal implicit attribute value extraction. Our extensive experiments\nreveal that EIVEN significantly outperforms existing methods in extracting\nimplicit attribute values while requiring less labeled data.\n","authors":["Henry Peng Zou","Gavin Heqing Yu","Ziwei Fan","Dan Bu","Han Liu","Peng Dai","Dongmei Jia","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2404.08886v1.pdf","comment":"Accepted by NAACL 2024 Industry Track"},{"id":"http://arxiv.org/abs/2401.00094v2","updated":"2024-04-13T02:21:10Z","published":"2023-12-29T23:04:00Z","title":"Generating Enhanced Negatives for Training Language-Based Object\n Detectors","summary":" The recent progress in language-based open-vocabulary object detection can be\nlargely attributed to finding better ways of leveraging large-scale data with\nfree-form text annotations. Training such models with a discriminative\nobjective function has proven successful, but requires good positive and\nnegative samples. However, the free-form nature and the open vocabulary of\nobject descriptions make the space of negatives extremely large. Prior works\nrandomly sample negatives or use rule-based techniques to build them. In\ncontrast, we propose to leverage the vast knowledge built into modern\ngenerative models to automatically build negatives that are more relevant to\nthe original data. Specifically, we use large-language-models to generate\nnegative text descriptions, and text-to-image diffusion models to also generate\ncorresponding negative images. Our experimental analysis confirms the relevance\nof the generated negative data, and its use in language-based detectors\nimproves performance on two complex benchmarks. Code is available at\n\\url{https://github.com/xiaofeng94/Gen-Enhanced-Negs}.\n","authors":["Shiyu Zhao","Long Zhao","Vijay Kumar B. G","Yumin Suh","Dimitris N. Metaxas","Manmohan Chandraker","Samuel Schulter"],"pdf_url":"https://arxiv.org/pdf/2401.00094v2.pdf","comment":"Accepted to CVPR 2024. The supplementary document included"},{"id":"http://arxiv.org/abs/2404.00292v3","updated":"2024-04-13T02:01:50Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. 
Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v3.pdf","comment":"Accepted by CVPR 2024, Fig.3 revised"},{"id":"http://arxiv.org/abs/2308.06412v3","updated":"2024-04-13T01:40:03Z","published":"2023-08-11T23:03:50Z","title":"Taming Self-Training for Open-Vocabulary Object Detection","summary":" Recent studies have shown promising performance in open-vocabulary object\ndetection (OVD) by utilizing pseudo labels (PLs) from pretrained vision and\nlanguage models (VLMs). However, teacher-student self-training, a powerful and\nwidely used paradigm to leverage PLs, is rarely explored for OVD. This work\nidentifies two challenges of using self-training in OVD: noisy PLs from VLMs\nand frequent distribution changes of PLs. To address these challenges, we\npropose SAS-Det that tames self-training for OVD from two key perspectives.\nFirst, we present a split-and-fusion (SAF) head that splits a standard\ndetection into an open-branch and a closed-branch. This design can reduce noisy\nsupervision from pseudo boxes. Moreover, the two branches learn complementary\nknowledge from different training data, significantly enhancing performance\nwhen fused together. Second, in our view, unlike in closed-set tasks, the PL\ndistributions in OVD are solely determined by the teacher model. We introduce a\nperiodic update strategy to decrease the number of updates to the teacher,\nthereby decreasing the frequency of changes in PL distributions, which\nstabilizes the training process. Extensive experiments demonstrate SAS-Det is\nboth efficient and effective. SAS-Det outperforms recent models of the same\nscale by a clear margin and achieves 37.4 AP50 and 29.1 APr on novel categories\nof the COCO and LVIS benchmarks, respectively. Code is available at\n\\url{https://github.com/xiaofeng94/SAS-Det}.\n","authors":["Shiyu Zhao","Samuel Schulter","Long Zhao","Zhixing Zhang","Vijay Kumar B. G","Yumin Suh","Manmohan Chandraker","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2308.06412v3.pdf","comment":"Accepted to CVPR 2024. The supplementary document included"},{"id":"http://arxiv.org/abs/2403.16051v3","updated":"2024-04-13T01:19:39Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. 
Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy with the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v3.pdf","comment":"Accepted by IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR) 2024, 2nd Workshop on Scene Graphs and Graph Representation\n Learning"},{"id":"http://arxiv.org/abs/2211.13854v5","updated":"2024-04-13T00:14:03Z","published":"2022-11-25T01:37:48Z","title":"ComCLIP: Training-Free Compositional Image and Text Matching","summary":" Contrastive Language-Image Pretraining (CLIP) has demonstrated great\nzero-shot performance for matching images and text. However, it is still\nchallenging to adapt vision-lanaguage pretrained models like CLIP to\ncompositional image and text matching -- a more challenging image and text\nmatching task requiring the model understanding of compositional word concepts\nand visual components. Towards better compositional generalization in zero-shot\nimage and text matching, in this paper, we study the problem from a causal\nperspective: the erroneous semantics of individual entities are essentially\nconfounders that cause the matching failure. Therefore, we propose a novel\n\\textbf{\\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP\ndisentangles input images into subjects, objects, and action sub-images and\ncomposes CLIP's vision encoder and text encoder to perform evolving matching\nover compositional text embedding and sub-image embeddings. In this way,\nComCLIP can mitigate spurious correlations introduced by the pretrained CLIP\nmodels and dynamically evaluate the importance of each component. Experiments\non four compositional image-text matching datasets: SVO, ComVG, Winoground, and\nVL-checklist, and two general image-text retrieval datasets: Flick30K, and\nMSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts\nthe \\textbf{\\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even\nwithout further training or fine-tuning. Our codes can be found at\nhttps://github.com/eric-ai-lab/ComCLIP.\n","authors":["Kenan Jiang","Xuehai He","Ruize Xu","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.13854v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08858v1","updated":"2024-04-13T00:13:20Z","published":"2024-04-13T00:13:20Z","title":"A Lightweight Spatiotemporal Network for Online Eye Tracking with Event\n Camera","summary":" Event-based data are commonly encountered in edge computing environments\nwhere efficiency and low latency are critical. To interface with such data and\nleverage their rich temporal features, we propose a causal spatiotemporal\nconvolutional network. 
This solution targets efficient implementation on\nedge-appropriate hardware with limited resources in three ways: 1) deliberately\ntargets a simple architecture and set of operations (convolutions, ReLU\nactivations) 2) can be configured to perform online inference efficiently via\nbuffering of layer outputs 3) can achieve more than 90% activation sparsity\nthrough regularization during training, enabling very significant efficiency\ngains on event-based processors. In addition, we propose a general affine\naugmentation strategy acting directly on the events, which alleviates the\nproblem of dataset scarcity for event-based systems. We apply our model on the\nAIS 2024 event-based eye tracking challenge, reaching a score of 0.9916 p10\naccuracy on the Kaggle private testset.\n","authors":["Yan Ru Pei","Sasskia Brüers","Sébastien Crouzet","Douglas McLelland","Olivier Coenen"],"pdf_url":"https://arxiv.org/pdf/2404.08858v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2207.13316v2","updated":"2024-04-13T07:39:29Z","published":"2022-07-27T06:25:47Z","title":"NICEST: Noisy Label Correction and Training for Robust Scene Graph\n Generation","summary":" Nearly all existing scene graph generation (SGG) models have overlooked the\nground-truth annotation qualities of mainstream SGG datasets, i.e., they\nassume: 1) all the manually annotated positive samples are equally correct; 2)\nall the un-annotated negative samples are absolutely background. In this paper,\nwe argue that neither of the assumptions applies to SGG: there are numerous\nnoisy ground-truth predicate labels that break these two assumptions and harm\nthe training of unbiased SGG models. To this end, we propose a novel NoIsy\nlabel CorrEction and Sample Training strategy for SGG: NICEST. Specifically, it\nconsists of two parts: NICE and NIST, which rule out these noisy label issues\nby generating high-quality samples and the effective training strategy,\nrespectively. NICE first detects noisy samples and then reassigns them more\nhigh-quality soft predicate labels. NIST is a multi-teacher knowledge\ndistillation based training strategy, which enables the model to learn unbiased\nfusion knowledge. And a dynamic trade-off weighting strategy in NIST is\ndesigned to penalize the bias of different teachers. Due to the model-agnostic\nnature of both NICE and NIST, our NICEST can be seamlessly incorporated into\nany SGG architecture to boost its performance on different predicate\ncategories. In addition, to better evaluate the generalization of SGG models,\nwe further propose a new benchmark VG-OOD, by re-organizing the prevalent VG\ndataset and deliberately making the predicate distributions of the training and\ntest sets as different as possible for each subject-object category pair. This\nnew benchmark helps disentangle the influence of subject-object category based\nfrequency biases. Extensive ablations and results on different backbones and\ntasks have attested to the effectiveness and generalization ability of each\ncomponent of NICEST.\n","authors":["Lin Li","Jun Xiao","Hanrong Shi","Hanwang Zhang","Yi Yang","Wei Liu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2207.13316v2.pdf","comment":"Extension of CVPR'22 work (The Devil is in the Labels: Noisy Label\n Correction for Robust Scene Graph Generation). 
arXiv admin note: substantial\n text overlap with arXiv:2206.03014"},{"id":"http://arxiv.org/abs/2404.10790v1","updated":"2024-04-13T01:31:25Z","published":"2024-04-13T01:31:25Z","title":"Multimodal Attack Detection for Action Recognition Models","summary":" Adversarial machine learning attacks on video action recognition models is a\ngrowing research area and many effective attacks were introduced in recent\nyears. These attacks show that action recognition models can be breached in\nmany ways. Hence using these models in practice raises significant security\nconcerns. However, there are very few works which focus on defending against or\ndetecting attacks. In this work, we propose a novel universal detection method\nwhich is compatible with any action recognition model. In our extensive\nexperiments, we show that our method consistently detects various attacks\nagainst different target models with high true positive rates while satisfying\nvery low false positive rates. Tested against four state-of-the-art attacks\ntargeting four action recognition models, the proposed detector achieves an\naverage AUC of 0.911 over 16 test cases while the best performance achieved by\nthe existing detectors is 0.645 average AUC. This 41.2% improvement is enabled\nby the robustness of the proposed detector to varying attack methods and target\nmodels. The lowest AUC achieved by our detector across the 16 test cases is\n0.837 while the competing detector's performance drops as low as 0.211. We also\nshow that the proposed detector is robust to varying attack strengths. In\naddition, we analyze our method's real-time performance with different hardware\nsetups to demonstrate its potential as a practical defense mechanism.\n","authors":["Furkan Mumcu","Yasin Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2404.10790v1.pdf","comment":null}]},"2024-04-16T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.16846v2","updated":"2024-04-16T17:59:53Z","published":"2024-02-26T18:59:33Z","title":"GROUNDHOG: Grounding Large Language Models to Holistic Segmentation","summary":" Most multimodal large language models (MLLMs) learn language-to-object\ngrounding through causal language modeling where grounded objects are captured\nby bounding boxes as sequences of location tokens. This paradigm lacks\npixel-level representations that are important for fine-grained visual\nunderstanding and diagnosis. In this work, we introduce GROUNDHOG, an MLLM\ndeveloped by grounding Large Language Models to holistic segmentation.\nGROUNDHOG incorporates a masked feature extractor and converts extracted\nfeatures into visual entity tokens for the MLLM backbone, which then connects\ngroundable phrases to unified grounding masks by retrieving and merging the\nentity masks. To train GROUNDHOG, we carefully curated M3G2, a grounded visual\ninstruction tuning dataset with Multi-Modal Multi-Grained Grounding, by\nharvesting a collection of segmentation-grounded datasets with rich\nannotations. Our experimental results show that GROUNDHOG achieves superior\nperformance on various language grounding tasks without task-specific\nfine-tuning, and significantly reduces object hallucination. GROUNDHOG also\ndemonstrates better grounding towards complex forms of visual input and\nprovides easy-to-understand diagnosis in failure cases.\n","authors":["Yichi Zhang","Ziqiao Ma","Xiaofeng Gao","Suhaila Shakiah","Qiaozi Gao","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2402.16846v2.pdf","comment":"Accepted to CVPR 2024. 
Website: https://groundhog-mllm.github.io/"},{"id":"http://arxiv.org/abs/2404.10775v1","updated":"2024-04-16T17:59:11Z","published":"2024-04-16T17:59:11Z","title":"COMBO: Compositional World Models for Embodied Multi-Agent Cooperation","summary":" In this paper, we investigate the problem of embodied multi-agent\ncooperation, where decentralized agents must cooperate given only partial\negocentric views of the world. To effectively plan in this setting, in contrast\nto learning world dynamics in a single-agent scenario, we must simulate world\ndynamics conditioned on an arbitrary number of agents' actions given only\npartial egocentric visual observations of the world. To address this issue of\npartial observability, we first train generative models to estimate the overall\nworld state given partial egocentric observations. To enable accurate\nsimulation of multiple sets of actions on this world state, we then propose to\nlearn a compositional world model for multi-agent cooperation by factorizing\nthe naturally composable joint actions of multiple agents and compositionally\ngenerating the video. By leveraging this compositional world model, in\ncombination with Vision Language Models to infer the actions of other agents,\nwe can use a tree search procedure to integrate these modules and facilitate\nonline cooperative planning. To evaluate the efficacy of our methods, we create\ntwo challenging embodied multi-agent long-horizon cooperation tasks using the\nThreeDWorld simulator and conduct experiments with 2-4 agents. The results show\nour compositional world model is effective and the framework enables the\nembodied agents to cooperate efficiently with different agents across various\ntasks and an arbitrary number of agents, showing the promising future of our\nproposed framework. More videos can be found at\nhttps://vis-www.cs.umass.edu/combo/.\n","authors":["Hongxin Zhang","Zeyuan Wang","Qiushi Lyu","Zheyuan Zhang","Sunli Chen","Tianmin Shu","Yilun Du","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2404.10775v1.pdf","comment":"23 pages. The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2404.10772v1","updated":"2024-04-16T17:57:19Z","published":"2024-04-16T17:57:19Z","title":"Gaussian Opacity Fields: Efficient and Compact Surface Reconstruction in\n Unbounded Scenes","summary":" Recently, 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view\nsynthesis results, while allowing the rendering of high-resolution images in\nreal-time. However, leveraging 3D Gaussians for surface reconstruction poses\nsignificant challenges due to the explicit and disconnected nature of 3D\nGaussians. In this work, we present Gaussian Opacity Fields (GOF), a novel\napproach for efficient, high-quality, and compact surface reconstruction in\nunbounded scenes. Our GOF is derived from ray-tracing-based volume rendering of\n3D Gaussians, enabling direct geometry extraction from 3D Gaussians by\nidentifying its levelset, without resorting to Poisson reconstruction or TSDF\nfusion as in previous work. We approximate the surface normal of Gaussians as\nthe normal of the ray-Gaussian intersection plane, enabling the application of\nregularization that significantly enhances geometry. Furthermore, we develop an\nefficient geometry extraction method utilizing marching tetrahedra, where the\ntetrahedral grids are induced from 3D Gaussians and thus adapt to the scene's\ncomplexity. 
Our evaluations reveal that GOF surpasses existing 3DGS-based\nmethods in surface reconstruction and novel view synthesis. Further, it\ncompares favorably to, or even outperforms, neural implicit methods in both\nquality and speed.\n","authors":["Zehao Yu","Torsten Sattler","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2404.10772v1.pdf","comment":"Project page:\n https://niujinshuchong.github.io/gaussian-opacity-fields"},{"id":"http://arxiv.org/abs/2312.13150v2","updated":"2024-04-16T17:56:19Z","published":"2023-12-20T16:14:58Z","title":"Splatter Image: Ultra-Fast Single-View 3D Reconstruction","summary":" We introduce the \\method, an ultra-efficient approach for monocular 3D object\nreconstruction. Splatter Image is based on Gaussian Splatting, which allows\nfast and high-quality reconstruction of 3D scenes from multiple images. We\napply Gaussian Splatting to monocular reconstruction by learning a neural\nnetwork that, at test time, performs reconstruction in a feed-forward manner,\nat 38 FPS. Our main innovation is the surprisingly straightforward design of\nthis network, which, using 2D operators, maps the input image to one 3D\nGaussian per pixel. The resulting set of Gaussians thus has the form an image,\nthe Splatter Image. We further extend the method take several images as input\nvia cross-view attention. Owning to the speed of the renderer (588 FPS), we use\na single GPU for training while generating entire images at each iteration to\noptimize perceptual metrics like LPIPS. On several synthetic, real,\nmulti-category and large-scale benchmark datasets, we achieve better results in\nterms of PSNR, LPIPS, and other metrics while training and evaluating much\nfaster than prior works. Code, models, demo and more results are available at\nhttps://szymanowiczs.github.io/splatter-image.\n","authors":["Stanislaw Szymanowicz","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.13150v2.pdf","comment":"CVPR 2024. Project page:\n https://szymanowiczs.github.io/splatter-image.html . Code:\n https://github.com/szymanowiczs/splatter-image , Demo:\n https://huggingface.co/spaces/szymanowiczs/splatter_image"},{"id":"http://arxiv.org/abs/2312.13752v2","updated":"2024-04-16T17:55:53Z","published":"2023-12-21T11:33:10Z","title":"Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the\n AIIB23 challenge","summary":" Airway-related quantitative imaging biomarkers are crucial for examination,\ndiagnosis, and prognosis in pulmonary diseases. However, the manual delineation\nof airway trees remains prohibitively time-consuming. While significant efforts\nhave been made towards enhancing airway modelling, current public-available\ndatasets concentrate on lung diseases with moderate morphological variations.\nThe intricate honeycombing patterns present in the lung tissues of fibrotic\nlung disease patients exacerbate the challenges, often leading to various\nprediction errors. To address this issue, the 'Airway-Informed Quantitative CT\nImaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was\norganized in conjunction with the official 2023 International Conference on\nMedical Image Computing and Computer Assisted Intervention (MICCAI). The airway\nstructures were meticulously annotated by three experienced radiologists.\nCompetitors were encouraged to develop automatic airway segmentation models\nwith high robustness and generalization abilities, followed by exploring the\nmost correlated QIB of mortality prediction. 
A training set of 120\nhigh-resolution computerised tomography (HRCT) scans were publicly released\nwith expert annotations and mortality status. The online validation set\nincorporated 52 HRCT scans from patients with fibrotic lung disease and the\noffline test set included 140 cases from fibrosis and COVID-19 patients. The\nresults have shown that the capacity of extracting airway trees from patients\nwith fibrotic lung disease could be enhanced by introducing voxel-wise weighted\ngeneral union loss and continuity loss. In addition to the competitive image\nbiomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5,\np<0.0001) was revealed for survival prognostication compared with existing\nclinical measurements, clinician assessment and AI-based biomarkers.\n","authors":["Yang Nan","Xiaodan Xing","Shiyi Wang","Zeyu Tang","Federico N Felder","Sheng Zhang","Roberta Eufrasia Ledda","Xiaoliu Ding","Ruiqi Yu","Weiping Liu","Feng Shi","Tianyang Sun","Zehong Cao","Minghui Zhang","Yun Gu","Hanxiao Zhang","Jian Gao","Pingyu Wang","Wen Tang","Pengxin Yu","Han Kang","Junqiang Chen","Xing Lu","Boyu Zhang","Michail Mamalakis","Francesco Prinzi","Gianluca Carlini","Lisa Cuneo","Abhirup Banerjee","Zhaohu Xing","Lei Zhu","Zacharia Mesbah","Dhruv Jain","Tsiry Mayet","Hongyu Yuan","Qing Lyu","Abdul Qayyum","Moona Mazher","Athol Wells","Simon LF Walsh","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2312.13752v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2404.10766v1","updated":"2024-04-16T17:50:09Z","published":"2024-04-16T17:50:09Z","title":"RapidVol: Rapid Reconstruction of 3D Ultrasound Volumes from Sensorless\n 2D Scans","summary":" Two-dimensional (2D) freehand ultrasonography is one of the most commonly\nused medical imaging modalities, particularly in obstetrics and gynaecology.\nHowever, it only captures 2D cross-sectional views of inherently 3D anatomies,\nlosing valuable contextual information. As an alternative to requiring costly\nand complex 3D ultrasound scanners, 3D volumes can be constructed from 2D scans\nusing machine learning. However this usually requires long computational time.\nHere, we propose RapidVol: a neural representation framework to speed up\nslice-to-volume ultrasound reconstruction. We use tensor-rank decomposition, to\ndecompose the typical 3D volume into sets of tri-planes, and store those\ninstead, as well as a small neural network. A set of 2D ultrasound scans, with\ntheir ground truth (or estimated) 3D position and orientation (pose) is all\nthat is required to form a complete 3D reconstruction. Reconstructions are\nformed from real fetal brain scans, and then evaluated by requesting novel\ncross-sectional views. When compared to prior approaches based on fully\nimplicit representation (e.g. neural radiance fields), our method is over 3x\nquicker, 46% more accurate, and if given inaccurate poses is more robust.\nFurther speed-up is also possible by reconstructing from a structural prior\nrather than from scratch.\n","authors":["Mark C. Eid","Pak-Hei Yeung","Madeleine K. Wyburd","João F. Henriques","Ana I. L. 
Namburete"],"pdf_url":"https://arxiv.org/pdf/2404.10766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10765v1","updated":"2024-04-16T17:50:02Z","published":"2024-04-16T17:50:02Z","title":"RefFusion: Reference Adapted Diffusion Models for 3D Scene Inpainting","summary":" Neural reconstruction approaches are rapidly emerging as the preferred\nrepresentation for 3D scenes, but their limited editability is still posing a\nchallenge. In this work, we propose an approach for 3D scene inpainting -- the\ntask of coherently replacing parts of the reconstructed scene with desired\ncontent. Scene inpainting is an inherently ill-posed task as there exist many\nsolutions that plausibly replace the missing content. A good inpainting method\nshould therefore not only enable high-quality synthesis but also a high degree\nof control. Based on this observation, we focus on enabling explicit control\nover the inpainted content and leverage a reference image as an efficient means\nto achieve this goal. Specifically, we introduce RefFusion, a novel 3D\ninpainting method based on a multi-scale personalization of an image inpainting\ndiffusion model to the given reference view. The personalization effectively\nadapts the prior distribution to the target scene, resulting in a lower\nvariance of score distillation objective and hence significantly sharper\ndetails. Our framework achieves state-of-the-art results for object removal\nwhile maintaining high controllability. We further demonstrate the generality\nof our formulation on other downstream tasks such as object insertion, scene\noutpainting, and sparse view reconstruction.\n","authors":["Ashkan Mirzaei","Riccardo De Lutio","Seung Wook Kim","David Acuna","Jonathan Kelly","Sanja Fidler","Igor Gilitschenski","Zan Gojcic"],"pdf_url":"https://arxiv.org/pdf/2404.10765v1.pdf","comment":"Project page: https://reffusion.github.io"},{"id":"http://arxiv.org/abs/2404.10763v1","updated":"2024-04-16T17:47:16Z","published":"2024-04-16T17:47:16Z","title":"LaDiC: Are Diffusion Models Really Inferior to Autoregressive\n Counterparts for Image-to-Text Generation?","summary":" Diffusion models have exhibited remarkable capabilities in text-to-image\ngeneration. However, their performance in image-to-text generation,\nspecifically image captioning, has lagged behind Auto-Regressive (AR) models,\ncasting doubt on their applicability for such tasks. In this work, we revisit\ndiffusion models, highlighting their capacity for holistic context modeling and\nparallel decoding. With these benefits, diffusion models can alleviate the\ninherent limitations of AR methods, including their slow inference speed, error\npropagation, and unidirectional constraints. Furthermore, we identify the prior\nunderperformance of diffusion models stemming from the absence of an effective\nlatent space for image-text alignment, and the discrepancy between continuous\ndiffusion processes and discrete textual data. In response, we introduce a\nnovel architecture, LaDiC, which utilizes a split BERT to create a dedicated\nlatent space for captions and integrates a regularization module to manage\nvarying text lengths. Our framework also includes a diffuser for semantic\nimage-to-text conversion and a Back&Refine technique to enhance token\ninteractivity during inference. LaDiC achieves state-of-the-art performance for\ndiffusion-based methods on the MS COCO dataset with 38.2 BLEU@4 and 126.2\nCIDEr, demonstrating exceptional performance without pre-training or ancillary\nmodules. 
This indicates strong competitiveness with AR models, revealing the\npreviously untapped potential of diffusion models in image-to-text generation.\n","authors":["Yuchi Wang","Shuhuai Ren","Rundong Gao","Linli Yao","Qingyan Guo","Kaikai An","Jianhong Bai","Xu Sun"],"pdf_url":"https://arxiv.org/pdf/2404.10763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10760v1","updated":"2024-04-16T17:38:26Z","published":"2024-04-16T17:38:26Z","title":"Learning Feature Inversion for Multi-class Anomaly Detection under\n General-purpose COCO-AD Benchmark","summary":" Anomaly detection (AD) is often focused on detecting anomaly areas for\nindustrial quality inspection and medical lesion examination. However, due to\nthe specific scenario targets, the data scale for AD is relatively small, and\nevaluation metrics are still deficient compared to classic vision tasks, such\nas object detection and semantic segmentation. To fill these gaps, this work\nfirst constructs a large-scale and general-purpose COCO-AD dataset by extending\nCOCO to the AD field. This enables fair evaluation and sustainable development\nfor different methods on this challenging benchmark. Moreover, current metrics\nsuch as AU-ROC have nearly reached saturation on simple datasets, which\nprevents a comprehensive evaluation of different methods. Inspired by the\nmetrics in the segmentation field, we further propose several more practical\nthreshold-dependent AD-specific metrics, ie, m$F_1$$^{.2}_{.8}$,\nmAcc$^{.2}_{.8}$, mIoU$^{.2}_{.8}$, and mIoU-max. Motivated by GAN inversion's\nhigh-quality reconstruction capability, we propose a simple but more powerful\nInvAD framework to achieve high-quality feature reconstruction. Our method\nimproves the effectiveness of reconstruction-based methods on popular MVTec AD,\nVisA, and our newly proposed COCO-AD datasets under a multi-class unsupervised\nsetting, where only a single detection model is trained to detect anomalies\nfrom different classes. Extensive ablation experiments have demonstrated the\neffectiveness of each component of our InvAD. Full codes and models are\navailable at https://github.com/zhangzjn/ader.\n","authors":["Jiangning Zhang","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Zhucun Xue","Yong Liu","Guansong Pang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.10760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10758v1","updated":"2024-04-16T17:35:35Z","published":"2024-04-16T17:35:35Z","title":"Watch Your Step: Optimal Retrieval for Continual Learning at Scale","summary":" One of the most widely used approaches in continual learning is referred to\nas replay. Replay methods support interleaved learning by storing past\nexperiences in a replay buffer. Although there are methods for selectively\nconstructing the buffer and reprocessing its contents, there is limited\nexploration of the problem of selectively retrieving samples from the buffer.\nCurrent solutions have been tested in limited settings and, more importantly,\nin isolation. Existing work has also not explored the impact of duplicate\nreplays on performance. In this work, we propose a framework for evaluating\nselective retrieval strategies, categorized by simple, independent class- and\nsample-selective primitives. We evaluated several combinations of existing\nstrategies for selective retrieval and present their performances. Furthermore,\nwe propose a set of strategies to prevent duplicate replays and explore whether\nnew samples with low loss values can be learned without replay. 
In an effort to\nmatch our problem setting to a realistic continual learning pipeline, we\nrestrict our experiments to a setting involving a large, pre-trained, open\nvocabulary object detection model, which is fully fine-tuned on a sequence of\n15 datasets.\n","authors":["Truman Hickok","Dhireesha Kudithipudi"],"pdf_url":"https://arxiv.org/pdf/2404.10758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17484v2","updated":"2024-04-16T16:55:35Z","published":"2024-01-30T22:37:24Z","title":"Pixel to Elevation: Learning to Predict Elevation Maps at Long Range\n using Images for Autonomous Offroad Navigation","summary":" Understanding terrain topology at long-range is crucial for the success of\noff-road robotic missions, especially when navigating at high-speeds. LiDAR\nsensors, which are currently heavily relied upon for geometric mapping, provide\nsparse measurements when mapping at greater distances. To address this\nchallenge, we present a novel learning-based approach capable of predicting\nterrain elevation maps at long-range using only onboard egocentric images in\nreal-time. Our proposed method is comprised of three main elements. First, a\ntransformer-based encoder is introduced that learns cross-view associations\nbetween the egocentric views and prior bird-eye-view elevation map predictions.\nSecond, an orientation-aware positional encoding is proposed to incorporate the\n3D vehicle pose information over complex unstructured terrain with multi-view\nvisual image features. Lastly, a history-augmented learn-able map embedding is\nproposed to achieve better temporal consistency between elevation map\npredictions to facilitate the downstream navigational tasks. We experimentally\nvalidate the applicability of our proposed approach for autonomous offroad\nrobotic navigation in complex and unstructured terrain using real-world offroad\ndriving data. Furthermore, the method is qualitatively and quantitatively\ncompared against the current state-of-the-art methods. Extensive field\nexperiments demonstrate that our method surpasses baseline models in accurately\npredicting terrain elevation while effectively capturing the overall terrain\ntopology at long-ranges. Finally, ablation studies are conducted to highlight\nand understand the effect of key components of the proposed approach and\nvalidate their suitability to improve offroad robotic navigation capabilities.\n","authors":["Chanyoung Chung","Georgios Georgakis","Patrick Spieler","Curtis Padgett","Shehryar Khattak"],"pdf_url":"https://arxiv.org/pdf/2401.17484v2.pdf","comment":"8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2301.13656v3","updated":"2024-04-16T16:52:18Z","published":"2023-01-31T14:18:19Z","title":"A Survey and Benchmark of Automatic Surface Reconstruction from Point\n Clouds","summary":" We present a comprehensive survey and benchmark of both traditional and\nlearning-based methods for surface reconstruction from point clouds. This task\nis particularly challenging for real-world acquisitions due to factors like\nnoise, outliers, non-uniform sampling, and missing data. Traditional approaches\noften simplify the problem by imposing handcrafted priors on either the input\npoint clouds or the resulting surface, a process that can necessitate tedious\nhyperparameter tuning. Conversely, deep learning models have the capability to\ndirectly learn the properties of input point clouds and desired surfaces from\ndata. 
We study the influence of these handcrafted and learned priors on the\nprecision and robustness of surface reconstruction techniques. We evaluate\nvarious time-tested and contemporary methods in a standardized manner. When\nboth trained and evaluated on point clouds with identical characteristics, the\nlearning-based models consistently produce superior surfaces compared to their\ntraditional counterparts$\\unicode{x2013}$even in scenarios involving novel\nshape categories. However, traditional methods demonstrate greater resilience\nto the diverse array of point cloud anomalies commonly found in real-world 3D\nacquisitions. For the benefit of the research community, we make our code and\ndatasets available, inviting further enhancements to learning-based surface\nreconstruction. This can be accessed at\nhttps://github.com/raphaelsulzer/dsr-benchmark .\n","authors":["Raphael Sulzer","Renaud Marlet","Bruno Vallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2301.13656v3.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2404.10718v1","updated":"2024-04-16T16:51:27Z","published":"2024-04-16T16:51:27Z","title":"GazeHTA: End-to-end Gaze Target Detection with Head-Target Association","summary":" We propose an end-to-end approach for gaze target detection: predicting a\nhead-target connection between individuals and the target image regions they\nare looking at. Most of the existing methods use independent components such as\noff-the-shelf head detectors or have problems in establishing associations\nbetween heads and gaze targets. In contrast, we investigate an end-to-end\nmulti-person Gaze target detection framework with Heads and Targets Association\n(GazeHTA), which predicts multiple head-target instances based solely on input\nscene image. GazeHTA addresses challenges in gaze target detection by (1)\nleveraging a pre-trained diffusion model to extract scene features for rich\nsemantic understanding, (2) re-injecting a head feature to enhance the head\npriors for improved head understanding, and (3) learning a connection map as\nthe explicit visual associations between heads and gaze targets. Our extensive\nexperimental results demonstrate that GazeHTA outperforms state-of-the-art gaze\ntarget detection methods and two adapted diffusion-based baselines on two\nstandard datasets.\n","authors":["Zhi-Yi Lin","Jouh Yeong Chew","Jan van Gemert","Xucong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10717v1","updated":"2024-04-16T16:51:12Z","published":"2024-04-16T16:51:12Z","title":"Mixed Prototype Consistency Learning for Semi-supervised Medical Image\n Segmentation","summary":" Recently, prototype learning has emerged in semi-supervised medical image\nsegmentation and achieved remarkable performance. However, the scarcity of\nlabeled data limits the expressiveness of prototypes in previous methods,\npotentially hindering the complete representation of prototypes for class\nembedding. To address this problem, we propose the Mixed Prototype Consistency\nLearning (MPCL) framework, which includes a Mean Teacher and an auxiliary\nnetwork. The Mean Teacher generates prototypes for labeled and unlabeled data,\nwhile the auxiliary network produces additional prototypes for mixed data\nprocessed by CutMix. Through prototype fusion, mixed prototypes provide extra\nsemantic information to both labeled and unlabeled prototypes. 
High-quality\nglobal prototypes for each class are formed by fusing two enhanced prototypes,\noptimizing the distribution of hidden embeddings used in consistency learning.\nExtensive experiments on the left atrium and type B aortic dissection datasets\ndemonstrate MPCL's superiority over previous state-of-the-art approaches,\nconfirming the effectiveness of our framework. The code will be released soon.\n","authors":["Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.10717v1.pdf","comment":"15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.10716v1","updated":"2024-04-16T16:50:35Z","published":"2024-04-16T16:50:35Z","title":"MOWA: Multiple-in-One Image Warping Model","summary":" While recent image warping approaches achieved remarkable success on existing\nbenchmarks, they still require training separate models for each specific task\nand cannot generalize well to different camera models or customized\nmanipulations. To address diverse types of warping in practice, we propose a\nMultiple-in-One image WArping model (named MOWA) in this work. Specifically, we\nmitigate the difficulty of multi-task learning by disentangling the motion\nestimation at both the region level and pixel level. To further enable dynamic\ntask-aware image warping, we introduce a lightweight point-based classifier\nthat predicts the task type, serving as prompts to modulate the feature maps\nfor better estimation. To our knowledge, this is the first work that solves\nmultiple practical warping tasks in one single model. Extensive experiments\ndemonstrate that our MOWA, which is trained on six tasks for multiple-in-one\nimage warping, outperforms state-of-the-art task-specific models across most\ntasks. Moreover, MOWA also exhibits promising potential to generalize into\nunseen scenes, as evidenced by cross-domain and zero-shot evaluations. The code\nwill be made publicly available.\n","authors":["Kang Liao","Zongsheng Yue","Zhonghua Wu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2404.10716v1.pdf","comment":"Project page: https://kangliao929.github.io/projects/mowa/"},{"id":"http://arxiv.org/abs/2404.10714v1","updated":"2024-04-16T16:43:36Z","published":"2024-04-16T16:43:36Z","title":"AV-GAN: Attention-Based Varifocal Generative Adversarial Network for\n Uneven Medical Image Translation","summary":" Different types of staining highlight different structures in organs, thereby\nassisting in diagnosis. However, due to the impossibility of repeated staining,\nwe cannot obtain different types of stained slides of the same tissue area.\nTranslating the slide that is easy to obtain (e.g., H&E) to slides of staining\ntypes difficult to obtain (e.g., MT, PAS) is a promising way to solve this\nproblem. However, some regions are closely connected to other regions, and to\nmaintain this connection, they often have complex structures and are difficult\nto translate, which may lead to wrong translations. In this paper, we propose\nthe Attention-Based Varifocal Generative Adversarial Network (AV-GAN), which\nsolves multiple problems in pathologic image translation tasks, such as uneven\ntranslation difficulty in different regions, mutual interference of multiple\nresolution information, and nuclear deformation. Specifically, we develop an\nAttention-Based Key Region Selection Module, which can attend to regions with\nhigher translation difficulty. We then develop a Varifocal Module to translate\nthese regions at multiple resolutions. 
Experimental results show that our\nproposed AV-GAN outperforms existing image translation methods with two virtual\nkidney tissue staining tasks and improves FID values by 15.9 and 4.16\nrespectively in the H&E-MT and H&E-PAS tasks.\n","authors":["Zexin Li","Yiyang Lin","Zijie Fang","Shuyan Li","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2404.10714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10713v1","updated":"2024-04-16T16:43:14Z","published":"2024-04-16T16:43:14Z","title":"A Plausibility Study of Using Augmented Reality in the\n Ventriculoperitoneal Shunt Operations","summary":" The field of augmented reality (AR) has undergone substantial growth, finding\ndiverse applications in the medical industry. This paper delves into various\ntechniques employed in medical surgeries, scrutinizing factors such as cost,\nimplementation, and accessibility. The focus of this exploration is on AR-based\nsolutions, with a particular emphasis on addressing challenges and proposing an\ninnovative solution for ventriculoperitoneal shunt (VP) operations. The\nproposed solution introduces a novel flow in the pre-surgery phase, aiming to\nsubstantially reduce setup time and operation duration by creating 3D models of\nthe skull and ventricles. Experiments are conducted where the models are\nvisualized on a 3D- printed skull through an AR device, specifically the\nMicrosoft HoloLens 2. The paper then conducts an in-depth analysis of this\nproposed solution, discussing its feasibility, advantages, limitations,and\nfuture implications.\n","authors":["Tandin Dorji","Pakinee Aimmanee","Vich Yindeedej"],"pdf_url":"https://arxiv.org/pdf/2404.10713v1.pdf","comment":"Accepted for the 2024 - 16th International Conference on Knowledge\n and Smart Technology (KST). To be published in IEEEXplore Digital Library\n (#61284), ISBN: 979-8-3503-7073-7"},{"id":"http://arxiv.org/abs/2404.10710v1","updated":"2024-04-16T16:36:50Z","published":"2024-04-16T16:36:50Z","title":"Dual Modalities of Text: Visual and Textual Generative Pre-training","summary":" Harnessing visual texts represents a burgeoning frontier in the evolution of\nlanguage modeling. In this paper, we introduce a novel pre-training framework\nfor a suite of pixel-based autoregressive language models, pre-training on a\ncorpus of over 400 million documents rendered as RGB images. Our approach is\ncharacterized by a dual-modality training regimen, engaging both visual data\nthrough next patch prediction with a regression head and textual data via next\ntoken prediction with a classification head. This study is particularly focused\non investigating the synergistic interplay between visual and textual\nmodalities of language. Our comprehensive evaluation across a diverse array of\nbenchmarks reveals that the confluence of visual and textual data substantially\naugments the efficacy of pixel-based language models. Notably, our findings\nshow that a unidirectional pixel-based model, devoid of textual data during\ntraining, can match the performance levels of advanced bidirectional\npixel-based models on various language understanding benchmarks. This work\nhighlights the considerable untapped potential of integrating visual and\ntextual information for language modeling purposes. 
We will release our code,\ndata, and checkpoints to inspire further research advancement.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.11679v2","updated":"2024-04-16T16:28:42Z","published":"2021-12-22T06:05:02Z","title":"Ghost-dil-NetVLAD: A Lightweight Neural Network for Visual Place\n Recognition","summary":" Visual place recognition (VPR) is a challenging task with the unbalance\nbetween enormous computational cost and high recognition performance. Thanks to\nthe practical feature extraction ability of the lightweight convolution neural\nnetworks (CNNs) and the train-ability of the vector of locally aggregated\ndescriptors (VLAD) layer, we propose a lightweight weakly supervised end-to-end\nneural network consisting of a front-ended perception model called GhostCNN and\na learnable VLAD layer as a back-end. GhostCNN is based on Ghost modules that\nare lightweight CNN-based architectures. They can generate redundant feature\nmaps using linear operations instead of the traditional convolution process,\nmaking a good trade-off between computation resources and recognition accuracy.\nTo enhance our proposed lightweight model further, we add dilated convolutions\nto the Ghost module to get features containing more spatial semantic\ninformation, improving accuracy. Finally, rich experiments conducted on a\ncommonly used public benchmark and our private dataset validate that the\nproposed neural network reduces the FLOPs and parameters of VGG16-NetVLAD by\n99.04% and 80.16%, respectively. Besides, both models achieve similar accuracy.\n","authors":["Qingyuan Gong","Yu Liu","Liqiang Zhang","Renhe Liu"],"pdf_url":"https://arxiv.org/pdf/2112.11679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16278v2","updated":"2024-04-16T16:26:35Z","published":"2023-11-27T19:34:04Z","title":"VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle\n Re-identification","summary":" Vehicle Re-identification (Re-ID) has been broadly studied in the last\ndecade; however, the different camera view angle leading to confused\ndiscrimination in the feature subspace for the vehicles of various poses, is\nstill challenging for the Vehicle Re-ID models in the real world. To promote\nthe Vehicle Re-ID models, this paper proposes to synthesize a large number of\nvehicle images in the target pose, whose idea is to project the vehicles of\ndiverse poses into the unified target pose so as to enhance feature\ndiscrimination. Considering that the paired data of the same vehicles in\ndifferent traffic surveillance cameras might be not available in the real\nworld, we propose the first Pair-flexible Pose Guided Image Synthesis method\nfor Vehicle Re-ID, named as VehicleGAN in this paper, which works for both\nsupervised and unsupervised settings without the knowledge of geometric 3D\nmodels. Because of the feature distribution difference between real and\nsynthetic data, simply training a traditional metric learning based Re-ID model\nwith data-level fusion (i.e., data augmentation) is not satisfactory, therefore\nwe propose a new Joint Metric Learning (JML) via effective feature-level fusion\nfrom both real and synthetic data. 
Intensive experimental results on the public\nVeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our\nproposed VehicleGAN and JML.\n","authors":["Baolu Li","Ping Liu","Lan Fu","Jinlong Li","Jianwu Fang","Zhigang Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.16278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10700v1","updated":"2024-04-16T16:17:48Z","published":"2024-04-16T16:17:48Z","title":"Rawformer: Unpaired Raw-to-Raw Translation for Learnable Camera ISPs","summary":" Modern smartphone camera quality heavily relies on the image signal processor\n(ISP) to enhance captured raw images, utilizing carefully designed modules to\nproduce final output images encoded in a standard color space (e.g., sRGB).\nNeural-based end-to-end learnable ISPs offer promising advancements,\npotentially replacing traditional ISPs with their ability to adapt without\nrequiring extensive tuning for each new camera model, as is often the case for\nnearly every module in traditional ISPs. However, the key challenge with the\nrecent learning-based ISPs is the urge to collect large paired datasets for\neach distinct camera model due to the influence of intrinsic camera\ncharacteristics on the formation of input raw images. This paper tackles this\nchallenge by introducing a novel method for unpaired learning of raw-to-raw\ntranslation across diverse cameras. Specifically, we propose Rawformer, an\nunsupervised Transformer-based encoder-decoder method for raw-to-raw\ntranslation. It accurately maps raw images captured by a certain camera to the\ntarget camera, facilitating the generalization of learnable ISPs to new unseen\ncameras. Our method demonstrates superior performance on real camera datasets,\nachieving higher accuracy compared to previous state-of-the-art techniques, and\npreserving a more robust correlation between the original and translated raw\nimages.\n","authors":["Georgy Perevozchikov","Nancy Mehta","Mahmoud Afifi","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.10700v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.10699v1","updated":"2024-04-16T16:16:40Z","published":"2024-04-16T16:16:40Z","title":"ECLAIR: A High-Fidelity Aerial LiDAR Dataset for Semantic Segmentation","summary":" We introduce ECLAIR (Extended Classification of Lidar for AI Recognition), a\nnew outdoor large-scale aerial LiDAR dataset designed specifically for\nadvancing research in point cloud semantic segmentation. As the most extensive\nand diverse collection of its kind to date, the dataset covers a total area of\n10$km^2$ with close to 600 million points and features eleven distinct object\ncategories. To guarantee the dataset's quality and utility, we have thoroughly\ncurated the point labels through an internal team of experts, ensuring accuracy\nand consistency in semantic labeling. The dataset is engineered to move forward\nthe fields of 3D urban modeling, scene understanding, and utility\ninfrastructure management by presenting new challenges and potential\napplications. 
As a benchmark, we report qualitative and quantitative analysis\nof a voxel-based point cloud segmentation approach based on the Minkowski\nEngine.\n","authors":["Iaroslav Melekhov","Anand Umashankar","Hyeong-Jin Kim","Vladislav Serkov","Dusty Argyle"],"pdf_url":"https://arxiv.org/pdf/2404.10699v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10690v1","updated":"2024-04-16T16:10:23Z","published":"2024-04-16T16:10:23Z","title":"MathWriting: A Dataset For Handwritten Mathematical Expression\n Recognition","summary":" We introduce MathWriting, the largest online handwritten mathematical\nexpression dataset to date. It consists of 230k human-written samples and an\nadditional 400k synthetic ones. MathWriting can also be used for offline HME\nrecognition and is larger than all existing offline HME datasets like\nIM2LATEX-100K. We introduce a benchmark based on MathWriting data in order to\nadvance research on both online and offline HME recognition.\n","authors":["Philippe Gervais","Asya Fadeeva","Andrii Maksai"],"pdf_url":"https://arxiv.org/pdf/2404.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10688v1","updated":"2024-04-16T16:08:59Z","published":"2024-04-16T16:08:59Z","title":"Efficient Conditional Diffusion Model with Probability Flow Sampling for\n Image Super-resolution","summary":" Image super-resolution is a fundamentally ill-posed problem because multiple\nvalid high-resolution images exist for one low-resolution image.\nSuper-resolution methods based on diffusion probabilistic models can deal with\nthe ill-posed nature by learning the distribution of high-resolution images\nconditioned on low-resolution images, avoiding the problem of blurry images in\nPSNR-oriented methods. However, existing diffusion-based super-resolution\nmethods have high time consumption with the use of iterative sampling, while\nthe quality and consistency of generated images are less than ideal due to\nproblems like color shifting. In this paper, we propose Efficient Conditional\nDiffusion Model with Probability Flow Sampling (ECDP) for image\nsuper-resolution. To reduce the time consumption, we design a continuous-time\nconditional diffusion model for image super-resolution, which enables the use\nof probability flow sampling for efficient generation. Additionally, to improve\nthe consistency of generated images, we propose a hybrid parametrization for\nthe denoiser network, which interpolates between the data-predicting\nparametrization and the noise-predicting parametrization for different noise\nscales. Moreover, we design an image quality loss as a complement to the score\nmatching loss of diffusion models, further improving the consistency and\nquality of super-resolution. Extensive experiments on DIV2K, ImageNet, and\nCelebA demonstrate that our method achieves higher super-resolution quality\nthan existing diffusion-based image super-resolution methods while having lower\ntime consumption. Our code is available at https://github.com/Yuan-Yutao/ECDP.\n","authors":["Yutao Yuan","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.10688v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2404.10685v1","updated":"2024-04-16T16:04:38Z","published":"2024-04-16T16:04:38Z","title":"Generating Human Interaction Motions in Scenes with Text Control","summary":" We present TeSMo, a method for text-controlled scene-aware motion generation\nbased on denoising diffusion models. 
Previous text-to-motion methods focus on\ncharacters in isolation without considering scenes due to the limited\navailability of datasets that include motion, text descriptions, and\ninteractive scenes. Our approach begins with pre-training a scene-agnostic\ntext-to-motion diffusion model, emphasizing goal-reaching constraints on\nlarge-scale motion-capture datasets. We then enhance this model with a\nscene-aware component, fine-tuned using data augmented with detailed scene\ninformation, including ground plane and object shapes. To facilitate training,\nwe embed annotated navigation and interaction motions within scenes. The\nproposed method produces realistic and diverse human-object interactions, such\nas navigation and sitting, in different scenes with various object shapes,\norientations, initial body positions, and poses. Extensive experiments\ndemonstrate that our approach surpasses prior techniques in terms of the\nplausibility of human-scene interactions, as well as the realism and variety of\nthe generated motions. Code will be released upon publication of this work at\nhttps://research.nvidia.com/labs/toronto-ai/tesmo.\n","authors":["Hongwei Yi","Justus Thies","Michael J. Black","Xue Bin Peng","Davis Rempe"],"pdf_url":"https://arxiv.org/pdf/2404.10685v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/toronto-ai/tesmo/"},{"id":"http://arxiv.org/abs/2404.10681v1","updated":"2024-04-16T15:58:49Z","published":"2024-04-16T15:58:49Z","title":"StyleCity: Large-Scale 3D Urban Scenes Stylization with Vision-and-Text\n Reference via Progressive Optimization","summary":" Creating large-scale virtual urban scenes with variant styles is inherently\nchallenging. To facilitate prototypes of virtual production and bypass the need\nfor complex materials and lighting setups, we introduce the first\nvision-and-text-driven texture stylization system for large-scale urban scenes,\nStyleCity. Taking an image and text as references, StyleCity stylizes a 3D\ntextured mesh of a large-scale urban scene in a semantics-aware fashion and\ngenerates a harmonic omnidirectional sky background. To achieve that, we\npropose to stylize a neural texture field by transferring 2D vision-and-text\npriors to 3D globally and locally. During 3D stylization, we progressively\nscale the planned training views of the input 3D scene at different levels in\norder to preserve high-quality scene content. We then optimize the scene style\nglobally by adapting the scale of the style image with the scale of the\ntraining views. Moreover, we enhance local semantics consistency by the\nsemantics-aware style loss which is crucial for photo-realistic stylization.\nBesides texture stylization, we further adopt a generative diffusion model to\nsynthesize a style-consistent omnidirectional sky image, which offers a more\nimmersive atmosphere and assists the semantic stylization process. The stylized\nneural texture field can be baked into an arbitrary-resolution texture,\nenabling seamless integration into conventional rendering pipelines and\nsignificantly easing the virtual production prototyping process. 
Extensive\nexperiments demonstrate our stylized scenes' superiority in qualitative and\nquantitative performance and user preferences.\n","authors":["Yingshu Chen","Huajian Huang","Tuan-Anh Vu","Ka Chun Shum","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.10681v1.pdf","comment":"project page: https://chenyingshu.github.io/stylecity3d/"},{"id":"http://arxiv.org/abs/2312.02126v3","updated":"2024-04-16T15:50:56Z","published":"2023-12-04T18:53:24Z","title":"SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM","summary":" Dense simultaneous localization and mapping (SLAM) is crucial for robotics\nand augmented reality applications. However, current methods are often hampered\nby the non-volumetric or implicit way they represent a scene. This work\nintroduces SplaTAM, an approach that, for the first time, leverages explicit\nvolumetric representations, i.e., 3D Gaussians, to enable high-fidelity\nreconstruction from a single unposed RGB-D camera, surpassing the capabilities\nof existing methods. SplaTAM employs a simple online tracking and mapping\nsystem tailored to the underlying Gaussian representation. It utilizes a\nsilhouette mask to elegantly capture the presence of scene density. This\ncombination enables several benefits over prior representations, including fast\nrendering and dense optimization, quickly determining if areas have been\npreviously mapped, and structured map expansion by adding more Gaussians.\nExtensive experiments show that SplaTAM achieves up to 2x superior performance\nin camera pose estimation, map construction, and novel-view synthesis over\nexisting methods, paving the way for more immersive high-fidelity SLAM\napplications.\n","authors":["Nikhil Keetha","Jay Karhade","Krishna Murthy Jatavallabhula","Gengshan Yang","Sebastian Scherer","Deva Ramanan","Jonathon Luiten"],"pdf_url":"https://arxiv.org/pdf/2312.02126v3.pdf","comment":"CVPR 2024. Website: https://spla-tam.github.io/"},{"id":"http://arxiv.org/abs/2404.10667v1","updated":"2024-04-16T15:43:22Z","published":"2024-04-16T15:43:22Z","title":"VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time","summary":" We introduce VASA, a framework for generating lifelike talking faces with\nappealing visual affective skills (VAS) given a single static image and a\nspeech audio clip. Our premiere model, VASA-1, is capable of not only producing\nlip movements that are exquisitely synchronized with the audio, but also\ncapturing a large spectrum of facial nuances and natural head motions that\ncontribute to the perception of authenticity and liveliness. The core\ninnovations include a holistic facial dynamics and head movement generation\nmodel that works in a face latent space, and the development of such an\nexpressive and disentangled face latent space using videos. Through extensive\nexperiments including evaluation on a set of new metrics, we show that our\nmethod significantly outperforms previous methods along various dimensions\ncomprehensively. Our method not only delivers high video quality with realistic\nfacial and head dynamics but also supports the online generation of 512x512\nvideos at up to 40 FPS with negligible starting latency. It paves the way for\nreal-time engagements with lifelike avatars that emulate human conversational\nbehaviors.\n","authors":["Sicheng Xu","Guojun Chen","Yu-Xiao Guo","Jiaolong Yang","Chong Li","Zhenyu Zang","Yizhong Zhang","Xin Tong","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10667v1.pdf","comment":"Tech Report. 
Project webpage:\n https://www.microsoft.com/en-us/research/project/vasa-1/"},{"id":"http://arxiv.org/abs/2404.10664v1","updated":"2024-04-16T15:40:18Z","published":"2024-04-16T15:40:18Z","title":"Assessing The Impact of CNN Auto Encoder-Based Image Denoising on Image\n Classification Tasks","summary":" Images captured from the real world are often affected by different types of\nnoise, which can significantly impact the performance of Computer Vision\nsystems and the quality of visual data. This study presents a novel approach\nfor defect detection in casting product noisy images, specifically focusing on\nsubmersible pump impellers. The methodology involves utilizing deep learning\nmodels such as VGG16, InceptionV3, and other models in both the spatial and\nfrequency domains to identify noise types and defect status. The research\nprocess begins with preprocessing images, followed by applying denoising\ntechniques tailored to specific noise categories. The goal is to enhance the\naccuracy and robustness of defect detection by integrating noise detection and\ndenoising into the classification pipeline. The study achieved remarkable\nresults using VGG16 for noise type classification in the frequency domain,\nachieving an accuracy of over 99%. Removal of salt and pepper noise resulted in\nan average SSIM of 87.9, while Gaussian noise removal had an average SSIM of\n64.0, and periodic noise removal yielded an average SSIM of 81.6. This\ncomprehensive approach showcases the effectiveness of the deep AutoEncoder\nmodel and median filter, for denoising strategies in real-world industrial\napplications. Finally, our study reports significant improvements in binary\nclassification accuracy for defect detection compared to previous methods. For\nthe VGG16 classifier, accuracy increased from 94.6% to 97.0%, demonstrating the\neffectiveness of the proposed noise detection and denoising approach.\nSimilarly, for the InceptionV3 classifier, accuracy improved from 84.7% to\n90.0%, further validating the benefits of integrating noise analysis into the\nclassification pipeline.\n","authors":["Mohsen Hami","Mahdi JameBozorg"],"pdf_url":"https://arxiv.org/pdf/2404.10664v1.pdf","comment":"13 pages, 13 figures, 13th International conference on innovative\n technologies in the field of science, engineering and technology"},{"id":"http://arxiv.org/abs/2404.07922v3","updated":"2024-04-16T15:33:45Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. Meanwhile there are plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in address this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce LaVy-Bench benchmark designated for evaluating\nMLLMs's understanding on Vietnamese visual language tasks. 
Our project is\npublic at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v3.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.10633v1","updated":"2024-04-16T15:04:55Z","published":"2024-04-16T15:04:55Z","title":"Contextrast: Contextual Contrastive Learning for Semantic Segmentation","summary":" Despite great improvements in semantic segmentation, challenges persist\nbecause of the lack of local/global contexts and the relationship between them.\nIn this paper, we propose Contextrast, a contrastive learning-based semantic\nsegmentation method that allows to capture local/global contexts and comprehend\ntheir relationships. Our proposed method comprises two parts: a) contextual\ncontrastive learning (CCL) and b) boundary-aware negative (BANE) sampling.\nContextual contrastive learning obtains local/global context from multi-scale\nfeature aggregation and inter/intra-relationship of features for better\ndiscrimination capabilities. Meanwhile, BANE sampling selects embedding\nfeatures along the boundaries of incorrectly predicted regions to employ them\nas harder negative samples on our contrastive learning, resolving segmentation\nissues along the boundary region by exploiting fine-grained details. We\ndemonstrate that our Contextrast substantially enhances the performance of\nsemantic segmentation networks, outperforming state-of-the-art contrastive\nlearning approaches on diverse public datasets, e.g. Cityscapes, CamVid,\nPASCAL-C, COCO-Stuff, and ADE20K, without an increase in computational cost\nduring inference.\n","authors":["Changki Sung","Wanhee Kim","Jungho An","Wooju Lee","Hyungtae Lim","Hyun Myung"],"pdf_url":"https://arxiv.org/pdf/2404.10633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09172v2","updated":"2024-04-16T14:56:32Z","published":"2024-04-14T07:36:18Z","title":"LoopAnimate: Loopable Salient Object Animation","summary":" Research on diffusion model-based video generation has advanced rapidly.\nHowever, limitations in object fidelity and generation length hinder its\npractical applications. Additionally, specific domains like animated wallpapers\nrequire seamless looping, where the first and last frames of the video match\nseamlessly. To address these challenges, this paper proposes LoopAnimate, a\nnovel method for generating videos with consistent start and end frames. To\nenhance object fidelity, we introduce a framework that decouples multi-level\nimage appearance and textual semantic information. Building upon an\nimage-to-image diffusion model, our approach incorporates both pixel-level and\nfeature-level information from the input image, injecting image appearance and\ntextual semantic embeddings at different positions of the diffusion model.\nExisting UNet-based video generation models require to input the entire videos\nduring training to encode temporal and positional information at once. However,\ndue to limitations in GPU memory, the number of frames is typically restricted\nto 16. 
To address this, this paper proposes a three-stage training strategy\nwith progressively increasing frame numbers and reducing fine-tuning modules.\nAdditionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend\nthe capacity for encoding temporal and positional information up to 36 frames.\nThe proposed LoopAnimate extends, for the first time, the single-pass\ngeneration length of UNet-based video generation models to 35 frames while\nmaintaining high-quality video generation. Experiments demonstrate that\nLoopAnimate achieves state-of-the-art performance in both objective metrics,\nsuch as fidelity and temporal consistency, and subjective evaluation results.\n","authors":["Fanyi Wang","Peng Liu","Haotian Hu","Dan Meng","Jingwen Su","Jinjin Xu","Yanhao Zhang","Xiaoming Ren","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10626v1","updated":"2024-04-16T14:52:15Z","published":"2024-04-16T14:52:15Z","title":"Exploring selective image matching methods for zero-shot and few-sample\n unsupervised domain adaptation of urban canopy prediction","summary":" We explore simple methods for adapting a trained multi-task UNet which\npredicts canopy cover and height to a new geographic setting using remotely\nsensed data without the need of training a domain-adaptive classifier and\nextensive fine-tuning. Extending previous research, we followed a selective\nalignment process to identify similar images in the two geographical domains\nand then tested an array of data-based unsupervised domain adaptation\napproaches in a zero-shot setting as well as with a small amount of\nfine-tuning. We find that the selective aligned data-based image matching\nmethods produce promising results in a zero-shot setting, and even more so with\na small amount of fine-tuning. These methods outperform both an untransformed\nbaseline and a popular data-based image-to-image translation model. The best\nperforming methods were pixel distribution adaptation and Fourier domain\nadaptation on the canopy cover and height tasks respectively.\n","authors":["John Francis","Stephen Law"],"pdf_url":"https://arxiv.org/pdf/2404.10626v1.pdf","comment":"ICLR 2024 Machine Learning for Remote Sensing (ML4RS) Workshop"},{"id":"http://arxiv.org/abs/2404.10625v1","updated":"2024-04-16T14:48:40Z","published":"2024-04-16T14:48:40Z","title":"Gaussian Splatting Decoder for 3D-aware Generative Adversarial Networks","summary":" NeRF-based 3D-aware Generative Adversarial Networks (GANs) like EG3D or\nGIRAFFE have shown very high rendering quality under large representational\nvariety. However, rendering with Neural Radiance Fields poses challenges for 3D\napplications: First, the significant computational demands of NeRF rendering\npreclude its use on low-power devices, such as mobiles and VR/AR headsets.\nSecond, implicit representations based on neural networks are difficult to\nincorporate into explicit 3D scenes, such as VR environments or video games. 3D\nGaussian Splatting (3DGS) overcomes these limitations by providing an explicit\n3D representation that can be rendered efficiently at high frame rates. In this\nwork, we present a novel approach that combines the high rendering quality of\nNeRF-based 3D-aware GANs with the flexibility and computational advantages of\n3DGS. 
By training a decoder that maps implicit NeRF representations to explicit\n3D Gaussian Splatting attributes, we can integrate the representational\ndiversity and quality of 3D GANs into the ecosystem of 3D Gaussian Splatting\nfor the first time. Additionally, our approach allows for a high resolution GAN\ninversion and real-time GAN editing with 3D Gaussian Splatting scenes.\n","authors":["Florian Barthel","Arian Beckmann","Wieland Morgenstern","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2404.10625v1.pdf","comment":"CVPRW"},{"id":"http://arxiv.org/abs/2403.08801v5","updated":"2024-04-16T14:48:34Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14045v2","updated":"2024-04-16T14:45:44Z","published":"2024-02-21T13:06:48Z","title":"A Systematic Review of Low-Rank and Local Low-Rank Matrix Approximation\n in Big Data Medical Imaging","summary":" The large volume and complexity of medical imaging datasets are bottlenecks\nfor storage, transmission, and processing. To tackle these challenges, the\napplication of low-rank matrix approximation (LRMA) and its derivative, local\nLRMA (LLRMA) has demonstrated potential.\n A detailed analysis of the literature identifies LRMA and LLRMA methods\napplied to various imaging modalities, and the challenges and limitations\nassociated with existing LRMA and LLRMA methods are addressed.\n We note a significant shift towards a preference for LLRMA in the medical\nimaging field since 2015, demonstrating its potential and effectiveness in\ncapturing complex structures in medical data compared to LRMA. 
Acknowledging\nthe limitations of shallow similarity methods used with LLRMA, we suggest\nadvanced semantic image segmentation for similarity measure, explaining in\ndetail how it can measure similar patches and their feasibility.\n We note that LRMA and LLRMA are mainly applied to unstructured medical data,\nand we propose extending their application to different medical data types,\nincluding structured and semi-structured. This paper also discusses how LRMA\nand LLRMA can be applied to regular data with missing entries and the impact of\ninaccuracies in predicting missing values and their effects. We discuss the\nimpact of patch size and propose the use of random search (RS) to determine the\noptimal patch size. To enhance feasibility, a hybrid approach using Bayesian\noptimization and RS is proposed, which could improve the application of LRMA\nand LLRMA in medical imaging.\n","authors":["Sisipho Hamlomo","Marcellin Atemkeng","Yusuf Brima","Chuneeta Nunhokee","Jeremy Baxter"],"pdf_url":"https://arxiv.org/pdf/2402.14045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10620v1","updated":"2024-04-16T14:43:33Z","published":"2024-04-16T14:43:33Z","title":"PyTorchGeoNodes: Enabling Differentiable Shape Programs for 3D Shape\n Reconstruction","summary":" We propose PyTorchGeoNodes, a differentiable module for reconstructing 3D\nobjects from images using interpretable shape programs. In comparison to\ntraditional CAD model retrieval methods, the use of shape programs for 3D\nreconstruction allows for reasoning about the semantic properties of\nreconstructed objects, editing, low memory footprint, etc. However, the\nutilization of shape programs for 3D scene understanding has been largely\nneglected in past works. As our main contribution, we enable gradient-based\noptimization by introducing a module that translates shape programs designed in\nBlender, for example, into efficient PyTorch code. We also provide a method\nthat relies on PyTorchGeoNodes and is inspired by Monte Carlo Tree Search\n(MCTS) to jointly optimize discrete and continuous parameters of shape programs\nand reconstruct 3D objects for input scenes. In our experiments, we apply our\nalgorithm to reconstruct 3D objects in the ScanNet dataset and evaluate our\nresults against CAD model retrieval-based reconstructions. Our experiments\nindicate that our reconstructions match well the input scenes while enabling\nsemantic reasoning about reconstructed objects.\n","authors":["Sinisa Stekovic","Stefan Ainetter","Mattia D'Urso","Friedrich Fraundorfer","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2404.10620v1.pdf","comment":"In Submission"},{"id":"http://arxiv.org/abs/2404.10618v1","updated":"2024-04-16T14:42:49Z","published":"2024-04-16T14:42:49Z","title":"Private Attribute Inference from Images with Vision-Language Models","summary":" As large language models (LLMs) become ubiquitous in our daily tasks and\ndigital interactions, associated privacy risks are increasingly in focus. While\nLLM privacy research has primarily focused on the leakage of model training\ndata, it has recently been shown that the increase in models' capabilities has\nenabled LLMs to make accurate privacy-infringing inferences from previously\nunseen texts. With the rise of multimodal vision-language models (VLMs),\ncapable of understanding both images and text, a pertinent question is whether\nsuch results transfer to the previously unexplored domain of benign images\nposted online. 
To investigate the risks associated with the image reasoning\ncapabilities of newly emerging VLMs, we compile an image dataset with\nhuman-annotated labels of the image owner's personal attributes. In order to\nunderstand the additional privacy risk posed by VLMs beyond traditional human\nattribute recognition, our dataset consists of images where the inferable\nprivate attributes do not stem from direct depictions of humans. On this\ndataset, we evaluate the inferential capabilities of 7 state-of-the-art VLMs,\nfinding that they can infer various personal attributes at up to 77.6%\naccuracy. Concerningly, we observe that accuracy scales with the general\ncapabilities of the models, implying that future models can be misused as\nstronger adversaries, establishing an imperative for the development of\nadequate defenses.\n","authors":["Batuhan Tömekçe","Mark Vero","Robin Staab","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2404.10618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10121v3","updated":"2024-04-16T14:35:13Z","published":"2023-11-16T10:45:46Z","title":"Slide-SAM: Medical SAM Meets Sliding Window","summary":" The Segment Anything Model (SAM) has achieved a notable success in\ntwo-dimensional image segmentation in natural images. However, the substantial\ngap between medical and natural images hinders its direct application to\nmedical image segmentation tasks. Particularly in 3D medical images, SAM\nstruggles to learn contextual relationships between slices, limiting its\npractical applicability. Moreover, applying 2D SAM to 3D images requires\nprompting the entire volume, which is time- and label-consuming. To address\nthese problems, we propose Slide-SAM, which treats a stack of three adjacent\nslices as a prediction window. It firstly takes three slices from a 3D volume\nand point- or bounding box prompts on the central slice as inputs to predict\nsegmentation masks for all three slices. Subsequently, the masks of the top and\nbottom slices are then used to generate new prompts for adjacent slices.\nFinally, step-wise prediction can be achieved by sliding the prediction window\nforward or backward through the entire volume. Our model is trained on multiple\npublic and private medical datasets and demonstrates its effectiveness through\nextensive 3D segmentation experiments, with the help of minimal prompts. Code\nis available at \\url{https://github.com/Curli-quan/Slide-SAM}.\n","authors":["Quan Quan","Fenghe Tang","Zikang Xu","Heqin Zhu","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10121v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10603v1","updated":"2024-04-16T14:28:57Z","published":"2024-04-16T14:28:57Z","title":"Enhancing 3D Fidelity of Text-to-3D using Cross-View Correspondences","summary":" Leveraging multi-view diffusion models as priors for 3D optimization has\nalleviated the problem of 3D consistency, e.g., the Janus face problem or the\ncontent drift problem, in zero-shot text-to-3D models. However, the 3D\ngeometric fidelity of the output remains an unresolved issue; albeit the\nrendered 2D views are realistic, the underlying geometry may contain errors\nsuch as unreasonable concavities. In this work, we propose CorrespondentDream,\nan effective method to leverage annotation-free, cross-view correspondences\nyielded from the diffusion U-Net to provide additional 3D prior to the NeRF\noptimization process. 
We find that these correspondences are strongly\nconsistent with human perception, and by adopting it in our loss design, we are\nable to produce NeRF models with geometries that are more coherent with common\nsense, e.g., more smoothed object surface, yielding higher 3D fidelity. We\ndemonstrate the efficacy of our approach through various comparative\nqualitative results and a solid user study.\n","authors":["Seungwook Kim","Kejie Li","Xueqing Deng","Yichun Shi","Minsu Cho","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10603v1.pdf","comment":"25 pages, 22 figures, accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10600v1","updated":"2024-04-16T14:26:55Z","published":"2024-04-16T14:26:55Z","title":"Intra-operative tumour margin evaluation in breast-conserving surgery\n with deep learning","summary":" A positive margin may result in an increased risk of local recurrences after\nbreast retention surgery for any malignant tumour. In order to reduce the\nnumber of positive margins would offer surgeon real-time intra-operative\ninformation on the presence of positive resection margins. This study aims to\ndesign an intra-operative tumour margin evaluation scheme by using specimen\nmammography in breast-conserving surgery. Total of 30 cases were evaluated and\ncompared with the manually determined contours by experienced physicians and\npathology report. The proposed method utilizes image thresholding to extract\nregions of interest and then performs a deep learning model, i.e. SegNet, to\nsegment tumour tissue. The margin width of normal tissues surrounding it is\nevaluated as the result. The desired size of margin around the tumor was set\nfor 10 mm. The smallest average difference to manual sketched margin (6.53 mm\n+- 5.84). In the all case, the SegNet architecture was utilized to obtain\ntissue specimen boundary and tumor contour, respectively. The simulation\nresults indicated that this technology is helpful in discriminating positive\nfrom negative margins in the intra-operative setting. The aim of proposed\nscheme was a potential procedure in the intra-operative measurement system. The\nexperimental results reveal that deep learning techniques can draw results that\nare consistent with pathology reports.\n","authors":["Wei-Chung Shia","Yu-Len Huang","Yi-Chun Chen","Hwa-Koon Wu","Dar-Ren Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10600v1.pdf","comment":"1 pages, 6 figures and 2 tables"},{"id":"http://arxiv.org/abs/2404.10595v1","updated":"2024-04-16T14:20:55Z","published":"2024-04-16T14:20:55Z","title":"Automated Evaluation of Large Vision-Language Models on Self-driving\n Corner Cases","summary":" Large Vision-Language Models (LVLMs), due to the remarkable visual reasoning\nability to understand images and videos, have received widespread attention in\nthe autonomous driving domain, which significantly advances the development of\ninterpretable end-to-end autonomous driving. However, current evaluations of\nLVLMs primarily focus on the multi-faceted capabilities in common scenarios,\nlacking quantifiable and automated assessment in autonomous driving contexts,\nlet alone severe road corner cases that even the state-of-the-art autonomous\ndriving perception systems struggle to handle. In this paper, we propose\nCODA-LM, a novel vision-language benchmark for self-driving, which provides the\nfirst automatic and quantitative evaluation of LVLMs for interpretable\nautonomous driving including general perception, regional perception, and\ndriving suggestions. 
CODA-LM utilizes the texts to describe the road images,\nexploiting powerful text-only large language models (LLMs) without image inputs\nto assess the capabilities of LVLMs in autonomous driving scenarios, which\nreveals stronger alignment with human preferences than LVLM judges. Experiments\ndemonstrate that even the closed-sourced commercial LVLMs like GPT-4V cannot\ndeal with road corner cases well, suggesting that we are still far from a\nstrong LVLM-powered intelligent driving agent, and we hope our CODA-LM can\nbecome the catalyst to promote future development.\n","authors":["Yanze Li","Wenhua Zhang","Kai Chen","Yanxin Liu","Pengxiang Li","Ruiyuan Gao","Lanqing Hong","Meng Tian","Xinhai Zhao","Zhenguo Li","Dit-Yan Yeung","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2404.10595v1.pdf","comment":"Project Page: https://coda-dataset.github.io/coda-lm/"},{"id":"http://arxiv.org/abs/2404.08814v2","updated":"2024-04-16T14:17:51Z","published":"2024-04-12T21:14:20Z","title":"E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors\n to New Generators Using Limited Data","summary":" As generative AI progresses rapidly, new synthetic image generators continue\nto emerge at a swift pace. Traditional detection methods face two main\nchallenges in adapting to these generators: the forensic traces of synthetic\nimages from new techniques can vastly differ from those learned during\ntraining, and access to data for these new generators is often limited. To\naddress these issues, we introduce the Ensemble of Expert Embedders (E3), a\nnovel continual learning framework for updating synthetic image detectors. E3\nenables the accurate detection of images from newly emerged generators using\nminimal training data. Our approach does this by first employing transfer\nlearning to develop a suite of expert embedders, each specializing in the\nforensic traces of a specific generator. Then, all embeddings are jointly\nanalyzed by an Expert Knowledge Fusion Network to produce accurate and reliable\ndetection decisions. Our experiments demonstrate that E3 outperforms existing\ncontinual learning methods, including those developed specifically for\nsynthetic image detection.\n","authors":["Aref Azizpour","Tai D. Nguyen","Manil Shrestha","Kaidi Xu","Edward Kim","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2404.08814v2.pdf","comment":"11 pages, 4 figures, To be published in CVPRWMF24"},{"id":"http://arxiv.org/abs/2403.14421v2","updated":"2024-04-16T14:16:48Z","published":"2024-03-21T14:17:28Z","title":"DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning","summary":" Text-to-image diffusion models have been shown to suffer from sample-level\nmemorization, possibly reproducing near-perfect replica of images that they are\ntrained on, which may be undesirable. To remedy this issue, we develop the\nfirst differentially private (DP) retrieval-augmented generation algorithm that\nis capable of generating high-quality image samples while providing provable\nprivacy guarantees. Specifically, we assume access to a text-to-image diffusion\nmodel trained on a small amount of public data, and design a DP retrieval\nmechanism to augment the text prompt with samples retrieved from a private\nretrieval dataset. 
Our \\emph{differentially private retrieval-augmented\ndiffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to\nadapt to another domain, and can use state-of-the-art generative models to\ngenerate high-quality image samples while satisfying rigorous DP guarantees.\nFor instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a\nprivacy budget of $\\epsilon=10$, while providing a $3.5$ point improvement in\nFID compared to public-only retrieval for up to $10,000$ queries.\n","authors":["Jonathan Lebensold","Maziar Sanjabi","Pietro Astolfi","Adriana Romero-Soriano","Kamalika Chaudhuri","Mike Rabbat","Chuan Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08966v2","updated":"2024-04-16T14:16:40Z","published":"2024-04-13T11:07:53Z","title":"LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via\n Eulerian Motion Field","summary":" Cinemagraph is a unique form of visual media that combines elements of still\nphotography and subtle motion to create a captivating experience. However, the\nmajority of videos generated by recent works lack depth information and are\nconfined to the constraints of 2D image space. In this paper, inspired by\nsignificant progress in the field of novel view synthesis (NVS) achieved by 3D\nGaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from\n2D image space to 3D space using 3D Gaussian modeling. To achieve this, we\nfirst employ the 3D-GS method to reconstruct 3D Gaussian point clouds from\nmulti-view images of static scenes,incorporating shape regularization terms to\nprevent blurring or artifacts caused by object deformation. We then adopt an\nautoencoder tailored for 3D Gaussian to project it into feature space. To\nmaintain the local continuity of the scene, we devise SuperGaussian for\nclustering based on the acquired features. By calculating the similarity\nbetween clusters and employing a two-stage estimation method, we derive an\nEulerian motion field to describe velocities across the entire scene. The 3D\nGaussian points then move within the estimated Eulerian motion field. Through\nbidirectional animation techniques, we ultimately generate a 3D Cinemagraph\nthat exhibits natural and seamlessly loopable dynamics. Experiment results\nvalidate the effectiveness of our approach, demonstrating high-quality and\nvisually appealing scene generation. The project is available at\nhttps://pokerlishao.github.io/LoopGaussian/.\n","authors":["Jiyang Li","Lechao Cheng","Zhangye Wang","Tingting Mu","Jingxuan He"],"pdf_url":"https://arxiv.org/pdf/2404.08966v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2202.13588v3","updated":"2024-04-16T14:15:40Z","published":"2022-02-28T07:44:59Z","title":"Using Multi-scale SwinTransformer-HTC with Data augmentation in CoNIC\n Challenge","summary":" Colorectal cancer is one of the most common cancers worldwide, so early\npathological examination is very important. However, it is time-consuming and\nlabor-intensive to identify the number and type of cells on H&E images in\nclinical. Therefore, automatic segmentation and classification task and\ncounting the cellular composition of H&E images from pathological sections is\nproposed by CoNIC Challenge 2022. We proposed a multi-scale Swin transformer\nwith HTC for this challenge, and also applied the known normalization methods\nto generate more augmentation data. 
Finally, our strategy showed that the\nmulti-scale played a crucial role to identify different scale features and the\naugmentation arose the recognition of model.\n","authors":["Chia-Yen Lee","Hsiang-Chin Chien","Ching-Ping Wang","Hong Yen","Kai-Wen Zhen","Hong-Kun Lin"],"pdf_url":"https://arxiv.org/pdf/2202.13588v3.pdf","comment":"Errors have been identified in the analysis"},{"id":"http://arxiv.org/abs/2404.10588v1","updated":"2024-04-16T14:13:44Z","published":"2024-04-16T14:13:44Z","title":"Do Counterfactual Examples Complicate Adversarial Training?","summary":" We leverage diffusion models to study the robustness-performance tradeoff of\nrobust classifiers. Our approach introduces a simple, pretrained diffusion\nmethod to generate low-norm counterfactual examples (CEs): semantically altered\ndata which results in different true class membership. We report that the\nconfidence and accuracy of robust models on their clean training data are\nassociated with the proximity of the data to their CEs. Moreover, robust models\nperform very poorly when evaluated on the CEs directly, as they become\nincreasingly invariant to the low-norm, semantic changes brought by CEs. The\nresults indicate a significant overlap between non-robust and semantic\nfeatures, countering the common assumption that non-robust features are not\ninterpretable.\n","authors":["Eric Yeats","Cameron Darwin","Eduardo Ortega","Frank Liu","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2404.10588v1.pdf","comment":"Accepted as a short paper to the GCV Workshop at CVPR'24"},{"id":"http://arxiv.org/abs/2404.10584v1","updated":"2024-04-16T14:10:42Z","published":"2024-04-16T14:10:42Z","title":"ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset\n via Beam Splitter Camera Rig","summary":" The fusion of images from dual camera systems featuring a wide-angle and a\ntelephoto camera has become a hotspot problem recently. By integrating\nsimultaneously captured wide-angle and telephoto images from these systems, the\nresulting fused image achieves a wide field of view (FOV) coupled with\nhigh-definition quality. Existing approaches are mostly deep learning methods,\nand predominantly rely on supervised learning, where the training dataset plays\na pivotal role. However, current datasets typically adopt a data synthesis\napproach generate input pairs of wide-angle and telephoto images alongside\nground-truth images. Notably, the wide-angle inputs are synthesized rather than\ncaptured using real wide-angle cameras, and the ground-truth image is captured\nby wide-angle camera whose quality is substantially lower than that of input\ntelephoto images captured by telephoto cameras. To address these limitations,\nwe introduce a novel hardware setup utilizing a beam splitter to simultaneously\ncapture three images, i.e. input pairs and ground-truth images, from two\nauthentic cellphones equipped with wide-angle and telephoto dual cameras.\nSpecifically, the wide-angle and telephoto images captured by cellphone 2 serve\nas the input pair, while the telephoto image captured by cellphone 1, which is\ncalibrated to match the optical path of the wide-angle image from cellphone 2,\nserves as the ground-truth image, maintaining quality on par with the input\ntelephoto image. 
Experiments validate the efficacy of our newly introduced\ndataset, named ReWiTe, significantly enhances the performance of various\nexisting methods for real-world wide-angle and telephoto dual image fusion\ntasks.\n","authors":["Chunli Peng","Xuan Dong","Tiantian Cao","Zhengqing Li","Kun Dong","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2404.10584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15368v3","updated":"2024-04-16T14:08:03Z","published":"2023-03-27T16:35:28Z","title":"2S-UDF: A Novel Two-stage UDF Learning Method for Robust Non-watertight\n Model Reconstruction from Multi-view Images","summary":" Recently, building on the foundation of neural radiance field, various\ntechniques have emerged to learn unsigned distance fields (UDF) to reconstruct\n3D non-watertight models from multi-view images. Yet, a central challenge in\nUDF-based volume rendering is formulating a proper way to convert unsigned\ndistance values into volume density, ensuring that the resulting weight\nfunction remains unbiased and sensitive to occlusions. Falling short on these\nrequirements often results in incorrect topology or large reconstruction errors\nin resulting models. This paper addresses this challenge by presenting a novel\ntwo-stage algorithm, 2S-UDF, for learning a high-quality UDF from multi-view\nimages. Initially, the method applies an easily trainable density function\nthat, while slightly biased and transparent, aids in coarse reconstruction. The\nsubsequent stage then refines the geometry and appearance of the object to\nachieve a high-quality reconstruction by directly adjusting the weight function\nused in volume rendering to ensure that it is unbiased and occlusion-aware.\nDecoupling density and weight in two stages makes our training stable and\nrobust, distinguishing our technique from existing UDF learning approaches.\nEvaluations on the DeepFashion3D, DTU, and BlendedMVS datasets validate the\nrobustness and effectiveness of our proposed approach. In both quantitative\nmetrics and visual quality, the results indicate our superior performance over\nother UDF learning techniques in reconstructing 3D non-watertight models from\nmulti-view images. Our code is available at\nhttps://bitbucket.org/jkdeng/2sudf/.\n","authors":["Junkai Deng","Fei Hou","Xuhui Chen","Wencheng Wang","Ying He"],"pdf_url":"https://arxiv.org/pdf/2303.15368v3.pdf","comment":"accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10575v1","updated":"2024-04-16T13:53:58Z","published":"2024-04-16T13:53:58Z","title":"EMC$^2$: Efficient MCMC Negative Sampling for Contrastive Learning with\n Global Convergence","summary":" A key challenge in contrastive learning is to generate negative samples from\na large sample set to contrast with positive samples, for learning better\nencoding of the data. These negative samples often follow a softmax\ndistribution which are dynamically updated during the training process.\nHowever, sampling from this distribution is non-trivial due to the high\ncomputational costs in computing the partition function. In this paper, we\npropose an Efficient Markov Chain Monte Carlo negative sampling method for\nContrastive learning (EMC$^2$). We follow the global contrastive learning loss\nas introduced in SogCLR, and propose EMC$^2$ which utilizes an adaptive\nMetropolis-Hastings subroutine to generate hardness-aware negative samples in\nan online fashion during the optimization. 
We prove that EMC$^2$ finds an\n$\\mathcal{O}(1/\\sqrt{T})$-stationary point of the global contrastive loss in\n$T$ iterations. Compared to prior works, EMC$^2$ is the first algorithm that\nexhibits global convergence (to stationarity) regardless of the choice of batch\nsize while exhibiting low computation and memory cost. Numerical experiments\nvalidate that EMC$^2$ is effective with small batch training and achieves\ncomparable or better performance than baseline algorithms. We report the\nresults for pre-training image encoders on STL-10 and Imagenet-100.\n","authors":["Chung-Yiu Yau","Hoi-To Wai","Parameswaran Raman","Soumajyoti Sarkar","Mingyi Hong"],"pdf_url":"https://arxiv.org/pdf/2404.10575v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2404.10574v1","updated":"2024-04-16T13:52:00Z","published":"2024-04-16T13:52:00Z","title":"Uncertainty-guided Open-Set Source-Free Unsupervised Domain Adaptation\n with Target-private Class Segregation","summary":" Standard Unsupervised Domain Adaptation (UDA) aims to transfer knowledge from\na labeled source domain to an unlabeled target but usually requires\nsimultaneous access to both source and target data. Moreover, UDA approaches\ncommonly assume that source and target domains share the same labels space.\nYet, these two assumptions are hardly satisfied in real-world scenarios. This\npaper considers the more challenging Source-Free Open-set Domain Adaptation\n(SF-OSDA) setting, where both assumptions are dropped. We propose a novel\napproach for SF-OSDA that exploits the granularity of target-private categories\nby segregating their samples into multiple unknown classes. Starting from an\ninitial clustering-based assignment, our method progressively improves the\nsegregation of target-private samples by refining their pseudo-labels with the\nguide of an uncertainty-based sample selection module. Additionally, we propose\na novel contrastive loss, named NL-InfoNCELoss, that, integrating negative\nlearning into self-supervised contrastive learning, enhances the model\nrobustness to noisy pseudo-labels. Extensive experiments on benchmark datasets\ndemonstrate the superiority of the proposed method over existing approaches,\nestablishing new state-of-the-art performance. Notably, additional analyses\nshow that our method is able to learn the underlying semantics of novel\nclasses, opening the possibility to perform novel class discovery.\n","authors":["Mattia Litrico","Davide Talon","Sebastiano Battiato","Alessio Del Bue","Mario Valerio Giuffrida","Pietro Morerio"],"pdf_url":"https://arxiv.org/pdf/2404.10574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10572v1","updated":"2024-04-16T13:47:27Z","published":"2024-04-16T13:47:27Z","title":"Label merge-and-split: A graph-colouring approach for memory-efficient\n brain parcellation","summary":" Whole brain parcellation requires inferring hundreds of segmentation labels\nin large image volumes and thus presents significant practical challenges for\ndeep learning approaches. We introduce label merge-and-split, a method that\nfirst greatly reduces the effective number of labels required for\nlearning-based whole brain parcellation and then recovers original labels.\nUsing a greedy graph colouring algorithm, our method automatically groups and\nmerges multiple spatially separate labels prior to model training and\ninference. The merged labels may be semantically unrelated. A deep learning\nmodel is trained to predict merged labels. 
At inference time, original labels\nare restored using atlas-based influence regions. In our experiments, the\nproposed approach reduces the number of labels by up to 68% while achieving\nsegmentation accuracy comparable to the baseline method without label merging\nand splitting. Moreover, model training and inference times as well as GPU\nmemory requirements were reduced significantly. The proposed method can be\napplied to all semantic segmentation tasks with a large number of spatially\nseparate classes within an atlas-based prior.\n","authors":["Aaron Kujawa","Reuben Dorent","Sebastien Ourselin","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2404.10572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10571v1","updated":"2024-04-16T13:47:21Z","published":"2024-04-16T13:47:21Z","title":"CMU-Flownet: Exploring Point Cloud Scene Flow Estimation in Occluded\n Scenario","summary":" Occlusions hinder point cloud frame alignment in LiDAR data, a challenge\ninadequately addressed by scene flow models tested mainly on occlusion-free\ndatasets. Attempts to integrate occlusion handling within networks often suffer\naccuracy issues due to two main limitations: a) the inadequate use of occlusion\ninformation, often merging it with flow estimation without an effective\nintegration strategy, and b) reliance on distance-weighted upsampling that\nfalls short in correcting occlusion-related errors. To address these\nchallenges, we introduce the Correlation Matrix Upsampling Flownet\n(CMU-Flownet), incorporating an occlusion estimation module within its cost\nvolume layer, alongside an Occlusion-aware Cost Volume (OCV) mechanism.\nSpecifically, we propose an enhanced upsampling approach that expands the\nsensory field of the sampling process which integrates a Correlation Matrix\ndesigned to evaluate point-level similarity. Meanwhile, our model robustly\nintegrates occlusion data within the context of scene flow, deploying this\ninformation strategically during the refinement phase of the flow estimation.\nThe efficacy of this approach is demonstrated through subsequent experimental\nvalidation. Empirical assessments reveal that CMU-Flownet establishes\nstate-of-the-art performance within the realms of occluded FlyingThings3D and\nKITTI datasets, surpassing previous methodologies across a majority of\nevaluated metrics.\n","authors":["Jingze Chen","Junfeng Yao","Qiqin Lin","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2404.10571v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2308.16215v6","updated":"2024-04-16T13:32:25Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control for Vision Models","summary":" Standardized lossy video coding is at the core of almost all real-world video\nprocessing pipelines. Rate control is used to enable standard codecs to adapt\nto different network bandwidth conditions or storage constraints. However,\nstandard video codecs (e.g., H.264) and their rate control modules aim to\nminimize video distortion w.r.t. human quality assessment. We demonstrate\nempirically that standard-coded videos vastly deteriorate the performance of\ndeep vision models. To overcome the deterioration of vision performance, this\npaper presents the first end-to-end learnable deep video codec control that\nconsiders both bandwidth constraints and downstream deep vision performance,\nwhile adhering to existing standardization. 
We demonstrate that our approach\nbetter preserves downstream deep vision performance than traditional standard\nvideo coding.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Daniel Cremers","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v6.pdf","comment":"Accepted at CVPR 2024 Workshop on AI for Streaming (AIS)"},{"id":"http://arxiv.org/abs/2404.00724v2","updated":"2024-04-16T13:28:22Z","published":"2024-03-31T15:50:52Z","title":"Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic\n Distribution Alignment","summary":" Conventional unsupervised anomaly detection (UAD) methods build separate\nmodels for each object category. Recent studies have proposed to train a\nunified model for multiple classes, namely model-unified UAD. However, such\nmethods still implement the unified model separately on each class during\ninference with respective anomaly decision thresholds, which hinders their\napplication when the image categories are entirely unavailable. In this work,\nwe present a simple yet powerful method to address multi-class anomaly\ndetection without any class information, namely \\textit{absolute-unified} UAD.\nWe target the crux of prior works in this challenging setting: different\nobjects have mismatched anomaly score distributions. We propose Class-Agnostic\nDistribution Alignment (CADA) to align the mismatched score distribution of\neach implicit class without knowing class information, which enables unified\nanomaly detection for all classes and samples. The essence of CADA is to\npredict each class's score distribution of normal samples given any image,\nnormal or anomalous, of this class. As a general component, CADA can activate\nthe potential of nearly all UAD methods under absolute-unified setting. Our\napproach is extensively evaluated under the proposed setting on two popular UAD\nbenchmark datasets, MVTec AD and VisA, where we exceed previous\nstate-of-the-art by a large margin.\n","authors":["Jia Guo","Haonan Han","Shuai Lu","Weihang Zhang","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.00724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00218v3","updated":"2024-04-16T13:22:08Z","published":"2022-11-01T02:00:32Z","title":"Pixel-Wise Contrastive Distillation","summary":" We present a simple but effective pixel-level self-supervised distillation\nframework friendly to dense prediction tasks. Our method, called Pixel-Wise\nContrastive Distillation (PCD), distills knowledge by attracting the\ncorresponding pixels from student's and teacher's output feature maps. PCD\nincludes a novel design called SpatialAdaptor which ``reshapes'' a part of the\nteacher network while preserving the distribution of its output features. Our\nablation experiments suggest that this reshaping behavior enables more\ninformative pixel-to-pixel distillation. Moreover, we utilize a plug-in\nmulti-head self-attention module that explicitly relates the pixels of\nstudent's feature maps to enhance the effective receptive field, leading to a\nmore competitive student. PCD \\textbf{outperforms} previous self-supervised\ndistillation methods on various dense prediction tasks. A backbone of\n\\mbox{ResNet-18-FPN} distilled by PCD achieves $37.4$ AP$^\\text{bbox}$ and\n$34.0$ AP$^\\text{mask}$ on COCO dataset using the detector of \\mbox{Mask\nR-CNN}. 
We hope our study will inspire future research on how to pre-train a\nsmall model friendly to dense prediction tasks in a self-supervised fashion.\n","authors":["Junqiang Huang","Zichao Guo"],"pdf_url":"https://arxiv.org/pdf/2211.00218v3.pdf","comment":"ICCV 2023 camera-ready"},{"id":"http://arxiv.org/abs/2304.08272v4","updated":"2024-04-16T13:20:44Z","published":"2023-04-17T13:33:23Z","title":"About latent roles in forecasting players in team sports","summary":" Forecasting players in sports has grown in popularity due to the potential\nfor a tactical advantage and the applicability of such research to multi-agent\ninteraction systems. Team sports contain a significant social component that\ninfluences interactions between teammates and opponents. However, it still\nneeds to be fully exploited. In this work, we hypothesize that each participant\nhas a specific function in each action and that role-based interaction is\ncritical for predicting players' future moves. We create RolFor, a novel\nend-to-end model for Role-based Forecasting. RolFor uses a new module we\ndeveloped called Ordering Neural Networks (OrderNN) to permute the order of the\nplayers such that each player is assigned to a latent role. The latent role is\nthen modeled with a RoleGCN. Thanks to its graph representation, it provides a\nfully learnable adjacency matrix that captures the relationships between roles\nand is subsequently used to forecast the players' future trajectories.\nExtensive experiments on a challenging NBA basketball dataset back up the\nimportance of roles and justify our goal of modeling them using optimizable\nmodels. When an oracle provides roles, the proposed RolFor compares favorably\nto the current state-of-the-art (it ranks first in terms of ADE and second in\nterms of FDE errors). However, training the end-to-end RolFor incurs the issues\nof differentiability of permutation methods, which we experimentally review.\nFinally, this work restates differentiable ranking as a difficult open problem\nand its great potential in conjunction with graph-based interaction models.\nProject is available at: https://www.pinlab.org/aboutlatentroles\n","authors":["Luca Scofano","Alessio Sampieri","Giuseppe Re","Matteo Almanza","Alessandro Panconesi","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2304.08272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10548v1","updated":"2024-04-16T13:18:02Z","published":"2024-04-16T13:18:02Z","title":"Classification of Prostate Cancer in 3D Magnetic Resonance Imaging Data\n based on Convolutional Neural Networks","summary":" Prostate cancer is a commonly diagnosed cancerous disease among men\nworld-wide. Even with modern technology such as multi-parametric magnetic\nresonance tomography and guided biopsies, the process for diagnosing prostate\ncancer remains time consuming and requires highly trained professionals. In\nthis paper, different convolutional neural networks (CNN) are evaluated on\ntheir abilities to reliably classify whether an MRI sequence contains malignant\nlesions. Implementations of a ResNet, a ConvNet and a ConvNeXt for 3D image\ndata are trained and evaluated. The models are trained using different data\naugmentation techniques, learning rates, and optimizers. The data is taken from\na private dataset, provided by Cantonal Hospital Aarau. 
The best result was\nachieved by a ResNet3D, yielding an average precision score of 0.4583 and AUC\nROC score of 0.6214.\n","authors":["Malte Rippa","Ruben Schulze","Marian Himstedt","Felice Burn"],"pdf_url":"https://arxiv.org/pdf/2404.10548v1.pdf","comment":"Previous version published in Buzug T.M., Handels H., M\\\"uller S.,\n H\\\"ubner C., Mertins A., Rostalski P.: Student Conference Proceedings 2023,\n Infinite Science Publishing, 2023 (ISBN/EAN 978-3-945954-72-0). 7 pages, 2\n figures"},{"id":"http://arxiv.org/abs/2311.15658v2","updated":"2024-04-16T12:58:57Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, there remain challenges related to the ill-posed nature of\nsuch problems, often due to inherent ambiguities in measurements or intrinsic\nsystem symmetries. To address this, drawing inspiration from the human ability\nto resolve visual ambiguities through perceptual biases, here we introduce a\nnovel latent diffusion inverse solver by regularization by texts (TReg).\nSpecifically, TReg applies the textual description of the preconception of the\nsolution during the reverse diffusion sampling, of which the description is\ndynamically reinforced through null-text optimization for adaptive negation.\nOur comprehensive experimental results demonstrate that TReg successfully\nmitigates ambiguity in the inverse problems, enhancing their effectiveness and\naccuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10527v1","updated":"2024-04-16T12:55:15Z","published":"2024-04-16T12:55:15Z","title":"SPVLoc: Semantic Panoramic Viewport Matching for 6D Camera Localization\n in Unseen Environments","summary":" In this paper, we present SPVLoc, a global indoor localization method that\naccurately determines the six-dimensional (6D) camera pose of a query image and\nrequires minimal scene-specific prior knowledge and no scene-specific training.\nOur approach employs a novel matching procedure to localize the perspective\ncamera's viewport, given as an RGB image, within a set of panoramic semantic\nlayout representations of the indoor environment. The panoramas are rendered\nfrom an untextured 3D reference model, which only comprises approximate\nstructural information about room shapes, along with door and window\nannotations. We demonstrate that a straightforward convolutional network\nstructure can successfully achieve image-to-panorama and ultimately\nimage-to-model matching. Through a viewport classification score, we rank\nreference panoramas and select the best match for the query image. Then, a 6D\nrelative pose is estimated between the chosen panorama and query image. Our\nexperiments demonstrate that this approach not only efficiently bridges the\ndomain gap but also generalizes well to previously unseen scenes that are not\npart of the training data. Moreover, it achieves superior localization accuracy\ncompared to the state of the art methods and also estimates more degrees of\nfreedom of the camera pose. 
We will make our source code publicly available at\nhttps://github.com/fraunhoferhhi/spvloc .\n","authors":["Niklas Gard","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2404.10527v1.pdf","comment":"This submission includes the paper and supplementary material. 24\n pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.02155v3","updated":"2024-04-16T12:43:35Z","published":"2023-12-04T18:59:55Z","title":"GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-time Human Novel View Synthesis","summary":" We present a new approach, termed GPS-Gaussian, for synthesizing novel views\nof a character in a real-time manner. The proposed method enables 2K-resolution\nrendering under a sparse-view camera setting. Unlike the original Gaussian\nSplatting or neural implicit rendering methods that necessitate per-subject\noptimizations, we introduce Gaussian parameter maps defined on the source views\nand regress directly Gaussian Splatting properties for instant novel view\nsynthesis without any fine-tuning or optimization. To this end, we train our\nGaussian parameter regression module on a large amount of human scan data,\njointly with a depth estimation module to lift 2D parameter maps to 3D space.\nThe proposed framework is fully differentiable and experiments on several\ndatasets demonstrate that our method outperforms state-of-the-art methods while\nachieving an exceeding rendering speed.\n","authors":["Shunyuan Zheng","Boyao Zhou","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02155v3.pdf","comment":"Accepted by CVPR 2024 (Highlight). Project page:\n https://shunyuanzheng.github.io/GPS-Gaussian"},{"id":"http://arxiv.org/abs/2404.10518v1","updated":"2024-04-16T12:41:25Z","published":"2024-04-16T12:41:25Z","title":"MobileNetV4 - Universal Models for the Mobile Ecosystem","summary":" We present the latest generation of MobileNets, known as MobileNetV4 (MNv4),\nfeaturing universally efficient architecture designs for mobile devices. At its\ncore, we introduce the Universal Inverted Bottleneck (UIB) search block, a\nunified and flexible structure that merges Inverted Bottleneck (IB), ConvNext,\nFeed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant.\nAlongside UIB, we present Mobile MQA, an attention block tailored for mobile\naccelerators, delivering a significant 39% speedup. An optimized neural\narchitecture search (NAS) recipe is also introduced which improves MNv4 search\neffectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe\nresults in a new suite of MNv4 models that are mostly Pareto optimal across\nmobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural\nEngine and Google Pixel EdgeTPU - a characteristic not found in any other\nmodels tested. Finally, to further boost accuracy, we introduce a novel\ndistillation technique. 
Enhanced by this technique, our MNv4-Hybrid-Large model\ndelivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just\n3.8ms.\n","authors":["Danfeng Qin","Chas Leichner","Manolis Delakis","Marco Fornoni","Shixin Luo","Fan Yang","Weijun Wang","Colby Banbury","Chengxi Ye","Berkin Akin","Vaibhav Aggarwal","Tenghui Zhu","Daniele Moro","Andrew Howard"],"pdf_url":"https://arxiv.org/pdf/2404.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14950v2","updated":"2024-04-16T12:40:41Z","published":"2022-11-27T22:01:47Z","title":"Leveraging Image Matching Toward End-to-End Relative Camera Pose\n Regression","summary":" This paper proposes a generalizable, end-to-end deep learning-based method\nfor relative pose regression between two images. Given two images of the same\nscene captured from different viewpoints, our method predicts the relative\nrotation and translation (including direction and scale) between the two\nrespective cameras. Inspired by the classical pipeline, our method leverages\nImage Matching (IM) as a pre-trained task for relative pose regression.\nSpecifically, we use LoFTR, an architecture that utilizes an attention-based\nnetwork pre-trained on Scannet, to extract semi-dense feature maps, which are\nthen warped and fed into a pose regression network. Notably, we use a loss\nfunction that utilizes separate terms to account for the translation direction\nand scale. We believe such a separation is important because translation\ndirection is determined by point correspondences while the scale is inferred\nfrom prior on shape sizes. Our ablations further support this choice. We\nevaluate our method on several datasets and show that it outperforms previous\nend-to-end methods. The method also generalizes well to unseen datasets.\n","authors":["Fadi Khatib","Yuval Margalit","Meirav Galun","Ronen Basri"],"pdf_url":"https://arxiv.org/pdf/2211.14950v2.pdf","comment":"Project webpage: https://fadikhatib.github.io/GRelPose"},{"id":"http://arxiv.org/abs/2404.10501v1","updated":"2024-04-16T12:19:54Z","published":"2024-04-16T12:19:54Z","title":"Self-Supervised Visual Preference Alignment","summary":" This paper makes the first attempt towards unsupervised preference alignment\nin Vision-Language Models (VLMs). We generate chosen and rejected responses\nwith regard to the original and augmented image pairs, and conduct preference\nalignment with direct preference optimization. It is based on a core idea:\nproperly designed augmentation to the image input will induce VLM to generate\nfalse but hard negative responses, which helps the model to learn from and\nproduce more robust and powerful answers. The whole pipeline no longer hinges\non supervision from GPT4 or human involvement during alignment, and is highly\nefficient with few lines of code. With only 8k randomly sampled unsupervised\ndata, it achieves 90\\% relative score to GPT-4 on complex reasoning in\nLLaVA-Bench, and improves LLaVA-7B/13B by 6.7\\%/5.6\\% score on complex\nmulti-modal benchmark MM-Vet. Visualizations shows its improved ability to\nalign with user-intentions. A series of ablations are firmly conducted to\nreveal the latent mechanism of the approach, which also indicates its potential\ntowards further scaling. 
Code will be available.\n","authors":["Ke Zhu","Liang Zhao","Zheng Ge","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10499v1","updated":"2024-04-16T12:18:08Z","published":"2024-04-16T12:18:08Z","title":"Robust Noisy Label Learning via Two-Stream Sample Distillation","summary":" Noisy label learning aims to learn robust networks under the supervision of\nnoisy labels, which plays a critical role in deep learning. Existing work\neither conducts sample selection or label correction to deal with noisy labels\nduring the model training process. In this paper, we design a simple yet\neffective sample selection framework, termed Two-Stream Sample Distillation\n(TSSD), for noisy label learning, which can extract more high-quality samples\nwith clean labels to improve the robustness of network training. Firstly, a\nnovel Parallel Sample Division (PSD) module is designed to generate a certain\ntraining set with sufficient reliable positive and negative samples by jointly\nconsidering the sample structure in feature space and the human prior in loss\nspace. Secondly, a novel Meta Sample Purification (MSP) module is further\ndesigned to mine adequate semi-hard samples from the remaining uncertain\ntraining set by learning a strong meta classifier with extra golden data. As a\nresult, more and more high-quality samples will be distilled from the noisy\ntraining set to train networks robustly in every iteration. Extensive\nexperiments on four benchmark datasets, including CIFAR-10, CIFAR-100,\nTiny-ImageNet, and Clothing-1M, show that our method has achieved\nstate-of-the-art results over its competitors.\n","authors":["Sihan Bai","Sanping Zhou","Zheng Qin","Le Wang","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.10499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10498v1","updated":"2024-04-16T12:12:06Z","published":"2024-04-16T12:12:06Z","title":"LAECIPS: Large Vision Model Assisted Adaptive Edge-Cloud Collaboration\n for IoT-based Perception System","summary":" Recent large vision models (e.g., SAM) enjoy great potential to facilitate\nintelligent perception with high accuracy. Yet, the resource constraints in the\nIoT environment tend to limit such large vision models to be locally deployed,\nincurring considerable inference latency thereby making it difficult to support\nreal-time applications, such as autonomous driving and robotics. Edge-cloud\ncollaboration with large-small model co-inference offers a promising approach\nto achieving high inference accuracy and low latency. However, existing\nedge-cloud collaboration methods are tightly coupled with the model\narchitecture and cannot adapt to the dynamic data drifts in heterogeneous IoT\nenvironments. To address the issues, we propose LAECIPS, a new edge-cloud\ncollaboration framework. In LAECIPS, both the large vision model on the cloud\nand the lightweight model on the edge are plug-and-play. We design an\nedge-cloud collaboration strategy based on hard input mining, optimized for\nboth high accuracy and low latency. We propose to update the edge model and its\ncollaboration strategy with the cloud under the supervision of the large vision\nmodel, so as to adapt to the dynamic IoT data streams. Theoretical analysis of\nLAECIPS proves its feasibility. 
Experiments conducted in a robotic semantic\nsegmentation system using real-world datasets show that LAECIPS outperforms its\nstate-of-the-art competitors in accuracy, latency, and communication overhead\nwhile having better adaptability to dynamic environments.\n","authors":["Shijing Hu","Ruijun Deng","Xin Du","Zhihui Lu","Qiang Duan","Yi He","Shih-Chia Huang","Jie Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v2","updated":"2024-04-16T12:09:24Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v2.pdf","comment":"CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2312.07039v2","updated":"2024-04-16T12:05:55Z","published":"2023-12-12T07:52:33Z","title":"Open-Pose 3D Zero-Shot Learning: Benchmark and Challenges","summary":" With the explosive 3D data growth, the urgency of utilizing zero-shot\nlearning to facilitate data labeling becomes evident. Recently, methods\ntransferring language or language-image pre-training models like Contrastive\nLanguage-Image Pre-training (CLIP) to 3D vision have made significant progress\nin the 3D zero-shot classification task. These methods primarily focus on 3D\nobject classification with an aligned pose; such a setting is, however, rather\nrestrictive, which overlooks the recognition of 3D objects with open poses\ntypically encountered in real-world scenarios, such as an overturned chair or a\nlying teddy bear. 
To this end, we propose a more realistic and challenging\nscenario named open-pose 3D zero-shot classification, focusing on the\nrecognition of 3D objects regardless of their orientation. First, we revisit\nthe current research on 3D zero-shot classification, and propose two benchmark\ndatasets specifically designed for the open-pose setting. We empirically\nvalidate many of the most popular methods in the proposed open-pose benchmark.\nOur investigations reveal that most current 3D zero-shot classification models\nsuffer from poor performance, indicating a substantial exploration room towards\nthe new direction. Furthermore, we study a concise pipeline with an iterative\nangle refinement mechanism that automatically optimizes one ideal angle to\nclassify these open-pose 3D objects. In particular, to make validation more\ncompelling and not just limited to existing CLIP-based methods, we also pioneer\nthe exploration of knowledge transfer based on Diffusion models. While the\nproposed solutions can serve as a new benchmark for open-pose 3D zero-shot\nclassification, we discuss the complexities and challenges of this scenario\nthat remain for further research development. The code is available publicly at\nhttps://github.com/weiguangzhao/Diff-OP3D.\n","authors":["Weiguang Zhao","Guanyu Yang","Rui Zhang","Chenru Jiang","Chaolong Yang","Yuyao Yan","Amir Hussain","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.07039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04016v2","updated":"2024-04-16T12:04:01Z","published":"2023-12-07T03:10:03Z","title":"PartDistill: 3D Shape Part Segmentation by Vision-Language Model\n Distillation","summary":" This paper proposes a cross-modal distillation framework, PartDistill, which\ntransfers 2D knowledge from vision-language models (VLMs) to facilitate 3D\nshape part segmentation. PartDistill addresses three major challenges in this\ntask: the lack of 3D segmentation in invisible or undetected regions in the 2D\nprojections, inconsistent 2D predictions by VLMs, and the lack of knowledge\naccumulation across different 3D shapes. PartDistill consists of a teacher\nnetwork that uses a VLM to make 2D predictions and a student network that\nlearns from the 2D predictions while extracting geometrical features from\nmultiple 3D shapes to carry out 3D part segmentation. A bi-directional\ndistillation, including forward and backward distillations, is carried out\nwithin the framework, where the former forward distills the 2D predictions to\nthe student network, and the latter improves the quality of the 2D predictions,\nwhich subsequently enhances the final 3D segmentation. Moreover, PartDistill\ncan exploit generative models that facilitate effortless 3D shape creation for\ngenerating knowledge sources to be distilled. Through extensive experiments,\nPartDistill boosts the existing methods with substantial margins on widely used\nShapeNetPart and PartNetE datasets, by more than 15% and 12% higher mIoU\nscores, respectively. 
The code for this work is available at\nhttps://github.com/ardianumam/PartDistill.\n","authors":["Ardian Umam","Cheng-Kun Yang","Min-Hung Chen","Jen-Hui Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04016v2.pdf","comment":"CVPR 2024 Accepted"},{"id":"http://arxiv.org/abs/2404.10490v1","updated":"2024-04-16T11:57:03Z","published":"2024-04-16T11:57:03Z","title":"Teaching Chinese Sign Language with Feedback in Mixed Reality","summary":" Traditional sign language teaching methods face challenges such as limited\nfeedback and diverse learning scenarios. Although 2D resources lack real-time\nfeedback, classroom teaching is constrained by a scarcity of teacher. Methods\nbased on VR and AR have relatively primitive interaction feedback mechanisms.\nThis study proposes an innovative teaching model that uses real-time monocular\nvision and mixed reality technology. First, we introduce an improved\nhand-posture reconstruction method to achieve sign language semantic retention\nand real-time feedback. Second, a ternary system evaluation algorithm is\nproposed for a comprehensive assessment, maintaining good consistency with\nexperts in sign language. Furthermore, we use mixed reality technology to\nconstruct a scenario-based 3D sign language classroom and explore the user\nexperience of scenario teaching. Overall, this paper presents a novel teaching\nmethod that provides an immersive learning experience, advanced posture\nreconstruction, and precise feedback, achieving positive feedback on user\nexperience and learning effectiveness.\n","authors":["Hongli Wen","Yang Xu","Lin Li","Xudong Ru"],"pdf_url":"https://arxiv.org/pdf/2404.10490v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.17701v3","updated":"2024-04-16T11:46:39Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. 
We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10484v1","updated":"2024-04-16T11:44:12Z","published":"2024-04-16T11:44:12Z","title":"AbsGS: Recovering Fine Details for 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3D-GS) technique couples 3D Gaussian primitives with\ndifferentiable rasterization to achieve high-quality novel view synthesis\nresults while providing advanced real-time rendering performance. However, due\nto the flaw of its adaptive density control strategy in 3D-GS, it frequently\nsuffers from over-reconstruction issue in intricate scenes containing\nhigh-frequency details, leading to blurry rendered images. The underlying\nreason for the flaw has still been under-explored. In this work, we present a\ncomprehensive analysis of the cause of aforementioned artifacts, namely\ngradient collision, which prevents large Gaussians in over-reconstructed\nregions from splitting. To address this issue, we propose the novel\nhomodirectional view-space positional gradient as the criterion for\ndensification. Our strategy efficiently identifies large Gaussians in\nover-reconstructed regions, and recovers fine details by splitting. We evaluate\nour proposed method on various challenging datasets. The experimental results\nindicate that our approach achieves the best rendering quality with reduced or\nsimilar memory consumption. Our method is easy to implement and can be\nincorporated into a wide variety of most recent Gaussian Splatting-based\nmethods. We will open source our codes upon formal publication. Our project\npage is available at: https://ty424.github.io/AbsGS.github.io/\n","authors":["Zongxin Ye","Wenyu Li","Sidun Liu","Peng Qiao","Yong Dou"],"pdf_url":"https://arxiv.org/pdf/2404.10484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10476v1","updated":"2024-04-16T11:38:44Z","published":"2024-04-16T11:38:44Z","title":"Efficient optimal dispersed Haar-like filters for face detection","summary":" This paper introduces a new dispersed Haar-like filter for efficient face\ndetection. The basic idea for finding the filter is maximising\nbetween-class and minimising within-class variance. The proposed filters can be\nconsidered as an optimal configuration of dispersed Haar-like filters; filters\nwith disjoint black and white parts.\n","authors":["Zeinab Sedaghatjoo","Hossein Hosseinzadeh","Ahmad shirzadi"],"pdf_url":"https://arxiv.org/pdf/2404.10476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02612v2","updated":"2024-04-16T11:35:37Z","published":"2023-11-05T10:01:18Z","title":"GPT-4V-AD: Exploring Grounding Potential of VQA-oriented GPT-4V for\n Zero-shot Anomaly Detection","summary":" Large Multimodal Model (LMM) GPT-4V(ision) endows GPT-4 with visual grounding\ncapabilities, making it possible to handle certain tasks through the Visual\nQuestion Answering (VQA) paradigm. This paper explores the potential of\nVQA-oriented GPT-4V in the recently popular visual Anomaly Detection (AD) and\nis the first to conduct qualitative and quantitative evaluations on the popular\nMVTec AD and VisA datasets. 
Considering that this task requires both\nimage-/pixel-level evaluations, the proposed GPT-4V-AD framework contains three\ncomponents: \\textbf{\\textit{1)}} Granular Region Division, \\textbf{\\textit{2)}}\nPrompt Designing, \\textbf{\\textit{3)}} Text2Segmentation for easy quantitative\nevaluation, and have made some different attempts for comparative analysis. The\nresults show that GPT-4V can achieve certain results in the zero-shot AD task\nthrough a VQA paradigm, such as achieving image-level 77.1/88.0 and pixel-level\n68.0/76.6 AU-ROCs on MVTec AD and VisA datasets, respectively. However, its\nperformance still has a certain gap compared to the state-of-the-art zero-shot\nmethod, \\eg, WinCLIP and CLIP-AD, and further researches are needed. This study\nprovides a baseline reference for the research of VQA-oriented LMM in the\nzero-shot AD task, and we also post several possible future works. Code is\navailable at \\url{https://github.com/zhangzjn/GPT-4V-AD}.\n","authors":["Jiangning Zhang","Haoyang He","Xuhai Chen","Zhucun Xue","Yabiao Wang","Chengjie Wang","Lei Xie","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.02612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10474v1","updated":"2024-04-16T11:29:43Z","published":"2024-04-16T11:29:43Z","title":"Toward a Realistic Benchmark for Out-of-Distribution Detection","summary":" Deep neural networks are increasingly used in a wide range of technologies\nand services, but remain highly susceptible to out-of-distribution (OOD)\nsamples, that is, drawn from a different distribution than the original\ntraining set. A common approach to address this issue is to endow deep neural\nnetworks with the ability to detect OOD samples. Several benchmarks have been\nproposed to design and validate OOD detection techniques. However, many of them\nare based on far-OOD samples drawn from very different distributions, and thus\nlack the complexity needed to capture the nuances of real-world scenarios. In\nthis work, we introduce a comprehensive benchmark for OOD detection, based on\nImageNet and Places365, that assigns individual classes as in-distribution or\nout-of-distribution depending on the semantic similarity with the training set.\nSeveral techniques can be used to determine which classes should be considered\nin-distribution, yielding benchmarks with varying properties. Experimental\nresults on different OOD detection techniques show how their measured efficacy\ndepends on the selected benchmark and how confidence-based techniques may\noutperform classifier-based ones on near-OOD samples.\n","authors":["Pietro Recalcati","Fabio Garcea","Luca Piano","Fabrizio Lamberti","Lia Morra"],"pdf_url":"https://arxiv.org/pdf/2404.10474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11029v4","updated":"2024-04-16T11:24:36Z","published":"2023-06-19T15:46:41Z","title":"RemoteCLIP: A Vision Language Foundation Model for Remote Sensing","summary":" General-purpose foundation models have led to recent breakthroughs in\nartificial intelligence. In remote sensing, self-supervised learning (SSL) and\nMasked Image Modeling (MIM) have been adopted to build foundation models.\nHowever, these models primarily learn low-level features and require annotated\ndata for fine-tuning. Moreover, they are inapplicable for retrieval and\nzero-shot applications due to the lack of language understanding. 
To address\nthese limitations, we propose RemoteCLIP, the first vision-language foundation\nmodel for remote sensing that aims to learn robust visual features with rich\nsemantics and aligned text embeddings for seamless downstream application. To\naddress the scarcity of pre-training data, we leverage data scaling which\nconverts heterogeneous annotations into a unified image-caption data format\nbased on Box-to-Caption (B2C) and Mask-to-Box (M2B) conversion. By further\nincorporating UAV imagery, we produce a 12 $\\times$ larger pretraining dataset\nthan the combination of all available datasets. RemoteCLIP can be applied to a\nvariety of downstream tasks, including zero-shot image classification, linear\nprobing, $\\textit{k}$-NN classification, few-shot classification, image-text\nretrieval, and object counting in remote sensing images. Evaluation on 16\ndatasets, including a newly introduced RemoteCount benchmark to test the object\ncounting ability, shows that RemoteCLIP consistently outperforms baseline\nfoundation models across different model scales. Impressively, RemoteCLIP beats\nthe state-of-the-art method by 9.14% mean recall on the RSITMD dataset and\n8.92% on the RSICD dataset. For zero-shot classification, our RemoteCLIP\noutperforms the CLIP baseline by up to 6.39% average accuracy on 12 downstream\ndatasets. Project website: https://github.com/ChenDelong1999/RemoteCLIP\n","authors":["Fan Liu","Delong Chen","Zhangqingyun Guan","Xiaocong Zhou","Jiale Zhu","Qiaolin Ye","Liyong Fu","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.11029v4.pdf","comment":"Accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS)"},{"id":"http://arxiv.org/abs/2404.10454v1","updated":"2024-04-16T10:50:16Z","published":"2024-04-16T10:50:16Z","title":"A Computer Vision-Based Quality Assessment Technique for the automatic\n control of consumables for analytical laboratories","summary":" The rapid growth of the Industry 4.0 paradigm is increasing the pressure to\ndevelop effective automated monitoring systems. Artificial Intelligence (AI) is\na convenient tool to improve the efficiency of industrial processes while\nreducing errors and waste. In fact, it allows the use of real-time data to\nincrease the effectiveness of monitoring systems, minimize errors, make the\nproduction process more sustainable, and save costs. In this paper, a novel\nautomatic monitoring system is proposed in the context of production process of\nplastic consumables used in analysis laboratories, with the aim to increase the\neffectiveness of the control process currently performed by a human operator.\nIn particular, we considered the problem of classifying the presence or absence\nof a transparent anticoagulant substance inside test tubes. Specifically, a\nhand-designed deep network model is used and compared with some\nstate-of-the-art models for its ability to categorize different images of vials\nthat can be either filled with the anticoagulant or empty. Collected results\nindicate that the proposed approach is competitive with state-of-the-art models\nin terms of accuracy. Furthermore, we increased the complexity of the task by\ntraining the models on the ability to discriminate not only the presence or\nabsence of the anticoagulant inside the vial, but also the size of the test\ntube. The analysis performed in the latter scenario confirms the\ncompetitiveness of our approach. 
Moreover, our model is remarkably superior in\nterms of its generalization ability and requires significantly fewer resources.\nThese results suggest the possibility of successfully implementing such a model\nin the production process of a plastic consumables company.\n","authors":["Meriam Zribi","Paolo Pagliuca","Francesca Pitolli"],"pdf_url":"https://arxiv.org/pdf/2404.10454v1.pdf","comment":"31 pages, 13 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.09342v2","updated":"2024-04-16T10:33:36Z","published":"2024-04-14T19:51:32Z","title":"Face-voice Association in Multilingual Environments (FAME) Challenge\n 2024 Evaluation Plan","summary":" The advancements of technology have led to the use of multimodal systems in\nvarious real-world applications. Among them, the audio-visual systems are one\nof the widely used multimodal systems. In the recent years, associating face\nand voice of a person has gained attention due to presence of unique\ncorrelation between them. The Face-voice Association in Multilingual\nEnvironments (FAME) Challenge 2024 focuses on exploring face-voice association\nunder a unique condition of multilingual scenario. This condition is inspired\nfrom the fact that half of the world's population is bilingual and most often\npeople communicate under multilingual scenario. The challenge uses a dataset\nnamely, Multilingual Audio-Visual (MAV-Celeb) for exploring face-voice\nassociation in multilingual environments. This report provides the details of\nthe challenge, dataset, baselines and task details for the FAME Challenge.\n","authors":["Muhammad Saad Saeed","Shah Nawaz","Muhammad Salman Tahir","Rohan Kumar Das","Muhammad Zaigham Zaheer","Marta Moscati","Markus Schedl","Muhammad Haris Khan","Karthik Nandakumar","Muhammad Haroon Yousaf"],"pdf_url":"https://arxiv.org/pdf/2404.09342v2.pdf","comment":"ACM Multimedia Conference - Grand Challenge"},{"id":"http://arxiv.org/abs/2404.10441v1","updated":"2024-04-16T10:26:57Z","published":"2024-04-16T10:26:57Z","title":"1st Place Solution for ICCV 2023 OmniObject3D Challenge: Sparse-View\n Reconstruction","summary":" In this report, we present the 1st place solution for ICCV 2023 OmniObject3D\nChallenge: Sparse-View Reconstruction. The challenge aims to evaluate\napproaches for novel view synthesis and surface reconstruction using only a few\nposed images of each object. We utilize Pixel-NeRF as the basic model, and\napply depth supervision as well as coarse-to-fine positional encoding. The\nexperiments demonstrate the effectiveness of our approach in improving\nsparse-view reconstruction quality. We ranked first in the final test with a\nPSNR of 25.44614.\n","authors":["Hang Du","Yaping Xue","Weidong Dai","Xuejun Yan","Jingjing Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08504v2","updated":"2024-04-16T10:18:56Z","published":"2024-04-12T14:34:24Z","title":"3D Human Scan With A Moving Event Camera","summary":" Capturing a 3D human body is one of the important tasks in computer vision\nwith a wide range of applications such as virtual reality and sports analysis.\nHowever, conventional frame cameras are limited by their temporal resolution\nand dynamic range, which imposes constraints in real-world application setups.\nEvent cameras have the advantages of high temporal resolution and high dynamic\nrange (HDR), but the development of event-based methods is necessary to handle\ndata with different characteristics. 
This paper proposes a novel event-based\nmethod for 3D pose estimation and human mesh recovery. Prior work on\nevent-based human mesh recovery require frames (images) as well as event data.\nThe proposed method solely relies on events; it carves 3D voxels by moving the\nevent camera around a stationary body, reconstructs the human pose and mesh by\nattenuated rays, and fit statistical body models, preserving high-frequency\ndetails. The experimental results show that the proposed method outperforms\nconventional frame-based methods in the estimation accuracy of both pose and\nbody mesh. We also demonstrate results in challenging situations where a\nconventional camera has motion blur. This is the first to demonstrate\nevent-only human mesh recovery, and we hope that it is the first step toward\nachieving robust and accurate 3D human body scanning from vision sensors.\nhttps://florpeng.github.io/event-based-human-scan/\n","authors":["Kai Kohyama","Shintaro Shiba","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2404.08504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10438v1","updated":"2024-04-16T10:04:38Z","published":"2024-04-16T10:04:38Z","title":"The Unreasonable Effectiveness of Pre-Trained Features for Camera Pose\n Refinement","summary":" Pose refinement is an interesting and practically relevant research\ndirection. Pose refinement can be used to (1) obtain a more accurate pose\nestimate from an initial prior (e.g., from retrieval), (2) as pre-processing,\ni.e., to provide a better starting point to a more expensive pose estimator,\n(3) as post-processing of a more accurate localizer. Existing approaches focus\non learning features / scene representations for the pose refinement task. This\ninvolves training an implicit scene representation or learning features while\noptimizing a camera pose-based loss. A natural question is whether training\nspecific features / representations is truly necessary or whether similar\nresults can be already achieved with more generic features. In this work, we\npresent a simple approach that combines pre-trained features with a particle\nfilter and a renderable representation of the scene. Despite its simplicity, it\nachieves state-of-the-art results, demonstrating that one can easily build a\npose refiner without the need for specific training. The code is at\nhttps://github.com/ga1i13o/mcloc_poseref\n","authors":["Gabriele Trivigno","Carlo Masone","Barbara Caputo","Torsten Sattler"],"pdf_url":"https://arxiv.org/pdf/2404.10438v1.pdf","comment":"Accepted to CVPR2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.05468v3","updated":"2024-04-16T10:02:17Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. 
Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v3.pdf","comment":"Pre-print to be updated. Work in progress"},{"id":"http://arxiv.org/abs/2404.10433v1","updated":"2024-04-16T09:56:08Z","published":"2024-04-16T09:56:08Z","title":"Explainable concept mappings of MRI: Revealing the mechanisms underlying\n deep learning-based brain disease classification","summary":" Motivation. While recent studies show high accuracy in the classification of\nAlzheimer's disease using deep neural networks, the underlying learned concepts\nhave not been investigated.\n Goals. To systematically identify changes in brain regions through concepts\nlearned by the deep neural network for model validation.\n Approach. Using quantitative R2* maps we separated Alzheimer's patients\n(n=117) from normal controls (n=219) by using a convolutional neural network\nand systematically investigated the learned concepts using Concept Relevance\nPropagation and compared these results to a conventional region of\ninterest-based analysis.\n Results. In line with established histological findings and the region of\ninterest-based analyses, highly relevant concepts were primarily found in and\nadjacent to the basal ganglia.\n Impact. The identification of concepts learned by deep neural networks for\ndisease classification enables validation of the models and could potentially\nimprove reliability.\n","authors":["Christian Tinauer","Anna Damulina","Maximilian Sackl","Martin Soellradl","Reduan Achtibat","Maximilian Dreyer","Frederik Pahde","Sebastian Lapuschkin","Reinhold Schmidt","Stefan Ropele","Wojciech Samek","Christian Langkammer"],"pdf_url":"https://arxiv.org/pdf/2404.10433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18975v2","updated":"2024-04-16T09:49:15Z","published":"2024-02-29T09:27:40Z","title":"Theoretically Achieving Continuous Representation of Oriented Bounding\n Boxes","summary":" Considerable efforts have been devoted to Oriented Object Detection (OOD).\nHowever, one lasting issue regarding the discontinuity in Oriented Bounding Box\n(OBB) representation remains unresolved, which is an inherent bottleneck for\nextant OOD methods. This paper endeavors to completely solve this issue in a\ntheoretically guaranteed manner and puts an end to the ad-hoc efforts in this\ndirection. Prior studies typically can only address one of the two cases of\ndiscontinuity: rotation and aspect ratio, and often inadvertently introduce\ndecoding discontinuity, e.g. Decoding Incompleteness (DI) and Decoding\nAmbiguity (DA) as discussed in literature. 
Specifically, we propose a novel\nrepresentation method called Continuous OBB (COBB), which can be readily\nintegrated into existing detectors e.g. Faster-RCNN as a plugin. It can\ntheoretically ensure continuity in bounding box regression which to our best\nknowledge, has not been achieved in literature for rectangle-based object\nrepresentation. For fairness and transparency of experiments, we have developed\na modularized benchmark based on the open-source deep learning framework\nJittor's detection toolbox JDet for OOD evaluation. On the popular DOTA\ndataset, by integrating Faster-RCNN as the same baseline model, our new method\noutperforms the peer method Gliding Vertex by 1.13% mAP50 (relative improvement\n1.54%), and 2.46% mAP75 (relative improvement 5.91%), without any tricks.\n","authors":["Zi-Kai Xiao","Guo-Ye Yang","Xue Yang","Tai-Jiang Mu","Junchi Yan","Shi-min Hu"],"pdf_url":"https://arxiv.org/pdf/2402.18975v2.pdf","comment":"17 pages, 12 tables, 8 figures. Accepted by CVPR'24. Code:\n https://github.com/514flowey/JDet-COBB"},{"id":"http://arxiv.org/abs/2404.10411v1","updated":"2024-04-16T09:28:54Z","published":"2024-04-16T09:28:54Z","title":"Camera clustering for scalable stream-based active distillation","summary":" We present a scalable framework designed to craft efficient lightweight\nmodels for video object detection utilizing self-training and knowledge\ndistillation techniques. We scrutinize methodologies for the ideal selection of\ntraining images from video streams and the efficacy of model sharing across\nnumerous cameras. By advocating for a camera clustering methodology, we aim to\ndiminish the requisite number of models for training while augmenting the\ndistillation dataset. The findings affirm that proper camera clustering notably\namplifies the accuracy of distilled models, eclipsing the methodologies that\nemploy distinct models for each camera or a universal model trained on the\naggregate camera data.\n","authors":["Dani Manjah","Davide Cacciarelli","Christophe De Vleeschouwer","Benoit Macq"],"pdf_url":"https://arxiv.org/pdf/2404.10411v1.pdf","comment":"This manuscript is currently under review at IEEE Transactions on\n Circuits and Systems for Video Technology"},{"id":"http://arxiv.org/abs/2404.10408v1","updated":"2024-04-16T09:19:23Z","published":"2024-04-16T09:19:23Z","title":"Adversarial Identity Injection for Semantic Face Image Synthesis","summary":" Nowadays, deep learning models have reached incredible performance in the\ntask of image generation. Plenty of literature works address the task of face\ngeneration and editing, with human and automatic systems that struggle to\ndistinguish what's real from generated. Whereas most systems reached excellent\nvisual generation quality, they still face difficulties in preserving the\nidentity of the starting input subject. Among all the explored techniques,\nSemantic Image Synthesis (SIS) methods, whose goal is to generate an image\nconditioned on a semantic segmentation mask, are the most promising, even\nthough preserving the perceived identity of the input subject is not their main\nconcern. Therefore, in this paper, we investigate the problem of identity\npreservation in face image generation and present an SIS architecture that\nexploits a cross-attention mechanism to merge identity, style, and semantic\nfeatures to generate faces whose identities are as similar as possible to the\ninput ones. 
Experimental results reveal that the proposed method is not only\nsuitable for preserving the identity but is also effective in the face\nrecognition adversarial attack, i.e. hiding a second identity in the generated\nfaces.\n","authors":["Giuseppe Tarollo","Tomaso Fontanini","Claudio Ferrari","Guido Borghi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2404.10408v1.pdf","comment":"Paper accepted at CVPR 2024 Biometrics Workshop"},{"id":"http://arxiv.org/abs/2404.10407v1","updated":"2024-04-16T09:19:11Z","published":"2024-04-16T09:19:11Z","title":"Comprehensive Survey of Model Compression and Speed up for Vision\n Transformers","summary":" Vision Transformers (ViT) have marked a paradigm shift in computer vision,\noutperforming state-of-the-art models across diverse tasks. However, their\npractical deployment is hampered by high computational and memory demands. This\nstudy addresses the challenge by evaluating four primary model compression\ntechniques: quantization, low-rank approximation, knowledge distillation, and\npruning. We methodically analyze and compare the efficacy of these techniques\nand their combinations in optimizing ViTs for resource-constrained\nenvironments. Our comprehensive experimental evaluation demonstrates that these\nmethods facilitate a balanced compromise between model accuracy and\ncomputational efficiency, paving the way for wider application in edge\ncomputing devices.\n","authors":["Feiyang Chen","Ziqian Luo","Lisang Zhou","Xueting Pan","Ying Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.10407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10405v1","updated":"2024-04-16T09:12:16Z","published":"2024-04-16T09:12:16Z","title":"Integration of Self-Supervised BYOL in Semi-Supervised Medical Image\n Recognition","summary":" Image recognition techniques heavily rely on abundant labeled data,\nparticularly in medical contexts. Addressing the challenges associated with\nobtaining labeled data has led to the prominence of self-supervised learning\nand semi-supervised learning, especially in scenarios with limited annotated\ndata. In this paper, we proposed an innovative approach by integrating\nself-supervised learning into semi-supervised models to enhance medical image\nrecognition. Our methodology commences with pre-training on unlabeled data\nutilizing the BYOL method. Subsequently, we merge pseudo-labeled and labeled\ndatasets to construct a neural network classifier, refining it through\niterative fine-tuning. Experimental results on three different datasets\ndemonstrate that our approach optimally leverages unlabeled data, outperforming\nexisting methods in terms of accuracy for medical image recognition.\n","authors":["Hao Feng","Yuanzhe Jia","Ruijia Xu","Mukesh Prasad","Ali Anaissi","Ali Braytee"],"pdf_url":"https://arxiv.org/pdf/2404.10405v1.pdf","comment":"Accepted by ICCS 2024"},{"id":"http://arxiv.org/abs/2205.10120v7","updated":"2024-04-16T09:03:32Z","published":"2022-05-17T14:00:58Z","title":"Privacy Preserving Image Registration","summary":" Image registration is a key task in medical imaging applications, allowing to\nrepresent medical images in a common spatial reference frame. Current\napproaches to image registration are generally based on the assumption that the\ncontent of the images is usually accessible in clear form, from which the\nspatial transformation is subsequently estimated. 
This common assumption may\nnot be met in practical applications, since the sensitive nature of medical\nimages may ultimately require their analysis under privacy constraints,\npreventing the open sharing of the image content. In this work, we formulate the\nproblem of image registration under a privacy preserving regime, where images\nare assumed to be confidential and cannot be disclosed in clear. We derive our\nprivacy preserving image registration framework by extending classical\nregistration paradigms to account for advanced cryptographic tools, such as\nsecure multi-party computation and homomorphic encryption, that enable the\nexecution of operations without leaking the underlying data. To overcome the\nproblem of performance and scalability of cryptographic tools in high\ndimensions, we propose several techniques to optimize the image registration\noperations by using gradient approximations, and by revisiting the use of\nhomomorphic encryption through packing, to allow the efficient encryption and\nmultiplication of large matrices. We demonstrate our privacy preserving\nframework in linear and non-linear registration problems, evaluating its\naccuracy and scalability with respect to standard, non-private counterparts.\nOur results show that privacy preserving image registration is feasible and can\nbe adopted in sensitive medical imaging applications.\n","authors":["Riccardo Taiello","Melek Önen","Francesco Capano","Olivier Humbert","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2205.10120v7.pdf","comment":"v4 Accepted at Medical Image Computing and Computer Assisted\n Intervention (2022) 130-140"},{"id":"http://arxiv.org/abs/2404.10394v1","updated":"2024-04-16T08:52:42Z","published":"2024-04-16T08:52:42Z","title":"Portrait3D: Text-Guided High-Quality 3D Portrait Generation Using\n Pyramid Representation and GANs Prior","summary":" Existing neural rendering-based text-to-3D-portrait generation methods\ntypically make use of human geometry prior and diffusion models to obtain\nguidance. However, relying solely on geometry information introduces issues\nsuch as the Janus problem, over-saturation, and over-smoothing. We present\nPortrait3D, a novel neural rendering-based framework with a novel joint\ngeometry-appearance prior to achieve text-to-3D-portrait generation that\novercomes the aforementioned issues. To accomplish this, we train a 3D portrait\ngenerator, 3DPortraitGAN-Pyramid, as a robust prior. This generator is capable\nof producing 360{\\deg} canonical 3D portraits, serving as a starting point for\nthe subsequent diffusion-based generation process. To mitigate the \"grid-like\"\nartifact caused by the high-frequency information in the feature-map-based 3D\nrepresentation commonly used by most 3D-aware GANs, we integrate a novel\npyramid tri-grid 3D representation into 3DPortraitGAN-Pyramid. To generate 3D\nportraits from text, we first project a randomly generated image aligned with\nthe given prompt into the pre-trained 3DPortraitGAN-Pyramid's latent space. The\nresulting latent code is then used to synthesize a pyramid tri-grid. Beginning\nwith the obtained pyramid tri-grid, we use score distillation sampling to\ndistill the diffusion model's knowledge into the pyramid tri-grid. Following\nthat, we utilize the diffusion model to refine the rendered images of the 3D\nportrait and then use these refined images as training data to further optimize\nthe pyramid tri-grid, effectively eliminating issues with unrealistic color and\nunnatural artifacts. 
Our experimental results show that Portrait3D can produce\nrealistic, high-quality, and canonical 3D portraits that align with the prompt.\n","authors":["Yiqian Wu","Hao Xu","Xiangjun Tang","Xien Chen","Siyu Tang","Zhebin Zhang","Chen Li","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2404.10394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10387v1","updated":"2024-04-16T08:39:29Z","published":"2024-04-16T08:39:29Z","title":"CNN-based explanation ensembling for dataset, representation and\n explanations evaluation","summary":" Explainable Artificial Intelligence has gained significant attention due to\nthe widespread use of complex deep learning models in high-stake domains such\nas medicine, finance, and autonomous cars. However, different explanations\noften present different aspects of the model's behavior. In this research\nmanuscript, we explore the potential of ensembling explanations generated by\ndeep classification models using convolutional model. Through experimentation\nand analysis, we aim to investigate the implications of combining explanations\nto uncover a more coherent and reliable patterns of the model's behavior,\nleading to the possibility of evaluating the representation learned by the\nmodel. With our method, we can uncover problems of under-representation of\nimages in a certain class. Moreover, we discuss other side benefits like\nfeatures' reduction by replacing the original image with its explanations\nresulting in the removal of some sensitive information. Through the use of\ncarefully selected evaluation metrics from the Quantus library, we demonstrated\nthe method's superior performance in terms of Localisation and Faithfulness,\ncompared to individual explanations.\n","authors":["Weronika Hryniewska-Guzik","Luca Longo","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.10387v1.pdf","comment":"accepted at 2nd World Conference on eXplainable Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2404.10383v1","updated":"2024-04-16T08:25:36Z","published":"2024-04-16T08:25:36Z","title":"Learning to Score Sign Language with Two-stage Method","summary":" Human action recognition and performance assessment have been hot research\ntopics in recent years. Recognition problems have mature solutions in the field\nof sign language, but past research in performance analysis has focused on\ncompetitive sports and medical training, overlooking the scoring assessment\n,which is an important part of sign language teaching digitalization. In this\npaper, we analyze the existing technologies for performance assessment and\nadopt methods that perform well in human pose reconstruction tasks combined\nwith motion rotation embedded expressions, proposing a two-stage sign language\nperformance evaluation pipeline. 
Our analysis shows that choosing\nreconstruction tasks in the first stage can provide more expressive features,\nand using smoothing methods can provide an effective reference for assessment.\nExperiments show that our method provides good score feedback mechanisms and\nhigh consistency with professional assessments compared to end-to-end\nevaluations.\n","authors":["Wen Hongli","Xu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10383v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10378v1","updated":"2024-04-16T08:15:10Z","published":"2024-04-16T08:15:10Z","title":"Second Edition FRCSyn Challenge at CVPR 2024: Face Recognition Challenge\n in the Era of Synthetic Data","summary":" Synthetic data is gaining increasing relevance for training machine learning\nmodels. This is mainly motivated due to several factors such as the lack of\nreal data and intra-class variability, time and errors produced in manual\nlabeling, and in some cases privacy concerns, among others. This paper presents\nan overview of the 2nd edition of the Face Recognition Challenge in the Era of\nSynthetic Data (FRCSyn) organized at CVPR 2024. FRCSyn aims to investigate the\nuse of synthetic data in face recognition to address current technological\nlimitations, including data privacy concerns, demographic biases,\ngeneralization to novel scenarios, and performance constraints in challenging\nsituations such as aging, pose variations, and occlusions. Unlike the 1st\nedition, in which synthetic data from DCFace and GANDiffFace methods was only\nallowed to train face recognition systems, in this 2nd edition we propose new\nsub-tasks that allow participants to explore novel face generative methods. The\noutcomes of the 2nd FRCSyn Challenge, along with the proposed experimental\nprotocol and benchmarking contribute significantly to the application of\nsynthetic data to face recognition.\n","authors":["Ivan DeAndres-Tame","Ruben Tolosana","Pietro Melzi","Ruben Vera-Rodriguez","Minchul Kim","Christian Rathgeb","Xiaoming Liu","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Zhizhou Zhong","Yuge Huang","Yuxi Mi","Shouhong Ding","Shuigeng Zhou","Shuai He","Lingzhi Fu","Heng Cong","Rongyu Zhang","Zhihong Xiao","Evgeny Smirnov","Anton Pimenov","Aleksei Grigorev","Denis Timoshenko","Kaleb Mesfin Asfaw","Cheng Yaw Low","Hao Liu","Chuyi Wang","Qing Zuo","Zhixiang He","Hatef Otroshi Shahreza","Anjith George","Alexander Unnervik","Parsa Rahimi","Sébastien Marcel","Pedro C. Neto","Marco Huber","Jan Niklas Kolf","Naser Damer","Fadi Boutros","Jaime S. Cardoso","Ana F. Sequeira","Andrea Atzori","Gianni Fenu","Mirko Marras","Vitomir Štruc","Jiang Yu","Zhangjie Li","Jichun Li","Weisong Zhao","Zhen Lei","Xiangyu Zhu","Xiao-Yu Zhang","Bernardo Biesseck","Pedro Vidal","Luiz Coelho","Roger Granada","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2404.10378v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2311.10476"},{"id":"http://arxiv.org/abs/2404.10370v1","updated":"2024-04-16T08:08:47Z","published":"2024-04-16T08:08:47Z","title":"Know Yourself Better: Diverse Discriminative Feature Learning Improves\n Open Set Recognition","summary":" Open set recognition (OSR) is a critical aspect of machine learning,\naddressing the challenge of detecting novel classes during inference. 
Within\nthe realm of deep learning, neural classifiers trained on a closed set of data\ntypically struggle to identify novel classes, leading to erroneous predictions.\nTo address this issue, various heuristic methods have been proposed, allowing\nmodels to express uncertainty by stating \"I don't know.\" However, a gap in the\nliterature remains, as there has been limited exploration of the underlying\nmechanisms of these methods. In this paper, we conduct an analysis of open set\nrecognition methods, focusing on the aspect of feature diversity. Our research\nreveals a significant correlation between learning diverse discriminative\nfeatures and enhancing OSR performance. Building on this insight, we propose a\nnovel OSR approach that leverages the advantages of feature diversity. The\nefficacy of our method is substantiated through rigorous evaluation on a\nstandard OSR testbench, demonstrating a substantial improvement over\nstate-of-the-art methods.\n","authors":["Jiawen Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08514v2","updated":"2024-04-16T07:56:01Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Datase","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.12815v2","updated":"2024-04-16T07:51:12Z","published":"2023-09-24T02:57:56Z","title":"Proposing an intelligent mesh smoothing method with graph neural\n networks","summary":" In CFD, mesh smoothing methods are commonly utilized to refine the mesh\nquality to achieve high-precision numerical simulations. Specifically,\noptimization-based smoothing is used for high-quality mesh smoothing, but it\nincurs significant computational overhead. Pioneer works improve its smoothing\nefficiency by adopting supervised learning to learn smoothing methods from\nhigh-quality meshes. However, they pose difficulty in smoothing the mesh nodes\nwith varying degrees and also need data augmentation to address the node input\nsequence problem. Additionally, the required labeled high-quality meshes\nfurther limit the applicability of the proposed method. 
In this paper, we\npresent GMSNet, a lightweight neural network model for intelligent mesh\nsmoothing. GMSNet adopts graph neural networks to extract features of the\nnode's neighbors and output the optimal node position. During smoothing, we\nalso introduce a fault-tolerance mechanism to prevent GMSNet from generating\nnegative volume elements. With a lightweight model, GMSNet can effectively\nsmoothing mesh nodes with varying degrees and remain unaffected by the order of\ninput data. A novel loss function, MetricLoss, is also developed to eliminate\nthe need for high-quality meshes, which provides a stable and rapid convergence\nduring training. We compare GMSNet with commonly used mesh smoothing methods on\ntwo-dimensional triangle meshes. The experimental results show that GMSNet\nachieves outstanding mesh smoothing performances with 5% model parameters of\nthe previous model, and attains 13.56 times faster than optimization-based\nsmoothing.\n","authors":["Zhichao Wang","Xinhai Chen","Junjun Yan","Jie Liu"],"pdf_url":"https://arxiv.org/pdf/2311.12815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16016v3","updated":"2024-04-16T07:47:19Z","published":"2023-06-28T08:44:00Z","title":"Positive Label Is All You Need for Multi-Label Classification","summary":" Multi-label classification (MLC) faces challenges from label noise in\ntraining data due to annotating diverse semantic labels for each image. Current\nmethods mainly target identifying and correcting label mistakes using trained\nMLC models, but still struggle with persistent noisy labels during training,\nresulting in imprecise recognition and reduced performance. Our paper addresses\nlabel noise in MLC by introducing a positive and unlabeled multi-label\nclassification (PU-MLC) method. To counteract noisy labels, we directly discard\nnegative labels, focusing on the abundance of negative labels and the origin of\nmost noisy labels. PU-MLC employs positive-unlabeled learning, training the\nmodel with only positive labels and unlabeled data. The method incorporates\nadaptive re-balance factors and temperature coefficients in the loss function\nto address label distribution imbalance and prevent over-smoothing of\nprobabilities during training. Additionally, we introduce a local-global\nconvolution module to capture both local and global dependencies in the image\nwithout requiring backbone retraining. PU-MLC proves effective on MLC and MLC\nwith partial labels (MLC-PL) tasks, demonstrating significant improvements on\nMS-COCO and PASCAL VOC datasets with fewer annotations. Code is available at:\nhttps://github.com/TAKELAMAG/PU-MLC.\n","authors":["Zhixiang Yuan","Kaixin Zhang","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2306.16016v3.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2404.10358v1","updated":"2024-04-16T07:46:55Z","published":"2024-04-16T07:46:55Z","title":"Improving Bracket Image Restoration and Enhancement with Flow-guided\n Alignment and Enhanced Feature Aggregation","summary":" In this paper, we address the Bracket Image Restoration and Enhancement\n(BracketIRE) task using a novel framework, which requires restoring a\nhigh-quality high dynamic range (HDR) image from a sequence of noisy, blurred,\nand low dynamic range (LDR) multi-exposure RAW inputs. To overcome this\nchallenge, we present the IREANet, which improves the multiple exposure\nalignment and aggregation with a Flow-guide Feature Alignment Module (FFAM) and\nan Enhanced Feature Aggregation Module (EFAM). 
Specifically, the proposed FFAM\nincorporates the inter-frame optical flow as guidance to facilitate the\ndeformable alignment and spatial attention modules for better feature\nalignment. The EFAM further employs the proposed Enhanced Residual Block (ERB)\nas a foundational component, wherein a unidirectional recurrent network\naggregates the aligned temporal features to better reconstruct the results. To\nimprove model generalization and performance, we additionally employ the Bayer\npreserving augmentation (BayerAug) strategy to augment the multi-exposure RAW\ninputs. Our experimental evaluations demonstrate that the proposed IREANet\nshows state-of-the-art performance compared with previous methods.\n","authors":["Wenjie Lin","Zhen Liu","Chengzhi Jiang","Mingyan Han","Ting Jiang","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10357v1","updated":"2024-04-16T07:44:52Z","published":"2024-04-16T07:44:52Z","title":"Optimization of Prompt Learning via Multi-Knowledge Representation for\n Vision-Language Models","summary":" Vision-Language Models (VLMs), such as CLIP, play a foundational role in\nvarious cross-modal applications. To fully leverage VLMs' potential in adapting\nto downstream tasks, context optimization methods like Prompt Tuning are\nessential. However, one key limitation is the lack of diversity in prompt\ntemplates, whether they are hand-crafted or learned through additional modules.\nThis limitation restricts the capabilities of pretrained VLMs and can result in\nincorrect predictions in downstream tasks. To address this challenge, we\npropose Context Optimization with Multi-Knowledge Representation (CoKnow), a\nframework that enhances Prompt Learning for VLMs with rich contextual\nknowledge. To facilitate CoKnow during inference, we trained lightweight\nsemantic knowledge mappers, which are capable of generating Multi-Knowledge\nRepresentation for an input image without requiring additional priors.\nExperimentally, We conducted extensive experiments on 11 publicly available\ndatasets, demonstrating that CoKnow outperforms a series of previous methods.\nWe will make all resources open-source: https://github.com/EMZucas/CoKnow.\n","authors":["Enming Zhang","Bingke zhu","Yingying Chen","Qinghai Miao","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06207v2","updated":"2024-04-16T07:41:49Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. 
Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v2.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.09450v2","updated":"2024-04-16T07:35:41Z","published":"2023-12-22T11:15:16Z","title":"Joining Forces for Pathology Diagnostics with AI Assistance: The EMPAIA\n Initiative","summary":" Over the past decade, artificial intelligence (AI) methods in pathology have\nadvanced substantially. However, integration into routine clinical practice has\nbeen slow due to numerous challenges, including technical and regulatory\nhurdles in translating research results into clinical diagnostic products and\nthe lack of standardized interfaces. The open and vendor-neutral EMPAIA\ninitiative addresses these challenges. Here, we provide an overview of EMPAIA's\nachievements and lessons learned. EMPAIA integrates various stakeholders of the\npathology AI ecosystem, i.e., pathologists, computer scientists, and industry.\nIn close collaboration, we developed technical interoperability standards,\nrecommendations for AI testing and product development, and explainability\nmethods. We implemented the modular and open-source EMPAIA platform and\nsuccessfully integrated 14 AI-based image analysis apps from 8 different\nvendors, demonstrating how different apps can use a single standardized\ninterface. We prioritized requirements and evaluated the use of AI in real\nclinical settings with 14 different pathology laboratories in Europe and Asia.\nIn addition to technical developments, we created a forum for all stakeholders\nto share information and experiences on digital pathology and AI. Commercial,\nclinical, and academic stakeholders can now adopt EMPAIA's common open-source\ninterfaces, providing a unique opportunity for large-scale standardization and\nstreamlining of processes. Further efforts are needed to effectively and\nbroadly establish AI assistance in routine laboratory use. To this end, a\nsustainable infrastructure, the non-profit association EMPAIA International,\nhas been established to continue standardization and support broad\nimplementation and advocacy for an AI-assisted digital pathology future.\n","authors":["Norman Zerbe","Lars Ole Schwen","Christian Geißler","Katja Wiesemann","Tom Bisson","Peter Boor","Rita Carvalho","Michael Franz","Christoph Jansen","Tim-Rasmus Kiehl","Björn Lindequist","Nora Charlotte Pohlan","Sarah Schmell","Klaus Strohmenger","Falk Zakrzewski","Markus Plass","Michael Takla","Tobias Küster","André Homeyer","Peter Hufnagl"],"pdf_url":"https://arxiv.org/pdf/2401.09450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10343v1","updated":"2024-04-16T07:26:20Z","published":"2024-04-16T07:26:20Z","title":"The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report","summary":" This paper provides a comprehensive review of the NTIRE 2024 challenge,\nfocusing on efficient single-image super-resolution (ESR) solutions and their\noutcomes. The task of this challenge is to super-resolve an input image with a\nmagnification factor of x4 based on pairs of low and corresponding\nhigh-resolution images. 
The primary objective is to develop networks that\noptimize various aspects such as runtime, parameters, and FLOPs, while still\nmaintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on\nthe DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In\naddition, this challenge has 4 tracks including the main track (overall\nperformance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3\n(parameters). In the main track, all three metrics (ie runtime, FLOPs, and\nparameter count) were considered. The ranking of the main track is calculated\nbased on a weighted sum-up of the scores of all other sub-tracks. In sub-track\n1, the practical runtime performance of the submissions was evaluated, and the\ncorresponding score was used to determine the ranking. In sub-track 2, the\nnumber of FLOPs was considered. The score calculated based on the corresponding\nFLOPs was used to determine the ranking. In sub-track 3, the number of\nparameters was considered. The score calculated based on the corresponding\nparameters was used to determine the ranking. RLFN is set as the baseline for\nefficiency measurement. The challenge had 262 registered participants, and 34\nteams made valid submissions. They gauge the state-of-the-art in efficient\nsingle-image super-resolution. To facilitate the reproducibility of the\nchallenge and enable other researchers to build upon these findings, the code\nand the pre-trained model of validated solutions are made publicly available at\nhttps://github.com/Amazingren/NTIRE2024_ESR/.\n","authors":["Bin Ren","Yawei Li","Nancy Mehta","Radu Timofte","Hongyuan Yu","Cheng Wan","Yuxin Hong","Bingnan Han","Zhuoyuan Wu","Yajun Zou","Yuqing Liu","Jizhe Li","Keji He","Chao Fan","Heng Zhang","Xiaolin Zhang","Xuanwu Yin","Kunlong Zuo","Bohao Liao","Peizhe Xia","Long Peng","Zhibo Du","Xin Di","Wangkai Li","Yang Wang","Wei Zhai","Renjing Pei","Jiaming Guo","Songcen Xu","Yang Cao","Zhengjun Zha","Yan Wang","Yi Liu","Qing Wang","Gang Zhang","Liou Zhang","Shijie Zhao","Long Sun","Jinshan Pan","Jiangxin Dong","Jinhui Tang","Xin Liu","Min Yan","Qian Wang","Menghan Zhou","Yiqiang Yan","Yixuan Liu","Wensong Chan","Dehua Tang","Dong Zhou","Li Wang","Lu Tian","Barsoum Emad","Bohan Jia","Junbo Qiao","Yunshuai Zhou","Yun Zhang","Wei Li","Shaohui Lin","Shenglong Zhou","Binbin Chen","Jincheng Liao","Suiyi Zhao","Zhao Zhang","Bo Wang","Yan Luo","Yanyan Wei","Feng Li","Mingshen Wang","Yawei Li","Jinhan Guan","Dehua Hu","Jiawei Yu","Qisheng Xu","Tao Sun","Long Lan","Kele Xu","Xin Lin","Jingtong Yue","Lehan Yang","Shiyi Du","Lu Qi","Chao Ren","Zeyu Han","Yuhan Wang","Chaolin Chen","Haobo Li","Mingjun Zheng","Zhongbao Yang","Lianhong Song","Xingzhuo Yan","Minghan Fu","Jingyi Zhang","Baiang Li","Qi Zhu","Xiaogang Xu","Dan Guo","Chunle Guo","Jiadi Chen","Huanhuan Long","Chunjiang Duanmu","Xiaoyan Lei","Jie Liu","Weilin Jia","Weifeng Cao","Wenlong Zhang","Yanyu Mao","Ruilong Guo","Nihao Zhang","Qian Wang","Manoj Pandey","Maksym Chernozhukov","Giang Le","Shuli Cheng","Hongyuan Wang","Ziyan Wei","Qingting Tang","Liejun Wang","Yongming Li","Yanhui Guo","Hao Xu","Akram Khatami-Rizi","Ahmad Mahmoudi-Aznaveh","Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou","Amogh Joshi","Nikhil Akalwadi","Sampada Malagi","Palani Yashaswini","Chaitra Desai","Ramesh Ashok Tabib","Ujwala Patil","Uma Mudenagudi"],"pdf_url":"https://arxiv.org/pdf/2404.10343v1.pdf","comment":"The report paper of NTIRE2024 Efficient Super-resolution, accepted by\n 
CVPRW2024"},{"id":"http://arxiv.org/abs/2404.10342v1","updated":"2024-04-16T07:25:17Z","published":"2024-04-16T07:25:17Z","title":"Referring Flexible Image Restoration","summary":" In reality, images often exhibit multiple degradations, such as rain and fog\nat night (triple degradations). However, in many cases, individuals may not\nwant to remove all degradations, for instance, a blurry lens revealing a\nbeautiful snowy landscape (double degradations). In such scenarios, people may\nonly desire to deblur. These situations and requirements shed light on a new\nchallenge in image restoration, where a model must perceive and remove specific\ndegradation types specified by human commands in images with multiple\ndegradations. We term this task Referring Flexible Image Restoration (RFIR). To\naddress this, we first construct a large-scale synthetic dataset called RFIR,\ncomprising 153,423 samples with the degraded image, text prompt for specific\ndegradation removal and restored image. RFIR consists of five basic degradation\ntypes: blur, rain, haze, low light and snow while six main sub-categories are\nincluded for varying degrees of degradation removal. To tackle the challenge,\nwe propose a novel transformer-based multi-task model named TransRFIR, which\nsimultaneously perceives degradation types in the degraded image and removes\nspecific degradation upon text prompt. TransRFIR is based on two devised\nattention modules, Multi-Head Agent Self-Attention (MHASA) and Multi-Head Agent\nCross Attention (MHACA), where MHASA and MHACA introduce the agent token and\nreach the linear complexity, achieving lower computation cost than vanilla\nself-attention and cross-attention and obtaining competitive performances. Our\nTransRFIR achieves state-of-the-art performances compared with other\ncounterparts and is proven as an effective architecture for image restoration.\nWe release our project at https://github.com/GuanRunwei/FIR-CP.\n","authors":["Runwei Guan","Rongsheng Hu","Zhuhao Zhou","Tianlang Xue","Ka Lok Man","Jeremy Smith","Eng Gee Lim","Weiping Ding","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2404.10342v1.pdf","comment":"15 pages, 19 figures"},{"id":"http://arxiv.org/abs/2404.10335v1","updated":"2024-04-16T07:19:52Z","published":"2024-04-16T07:19:52Z","title":"Efficiently Adversarial Examples Generation for Visual-Language Models\n under Targeted Transfer Scenarios using Diffusion Models","summary":" Targeted transfer-based attacks involving adversarial examples pose a\nsignificant threat to large visual-language models (VLMs). However, the\nstate-of-the-art (SOTA) transfer-based attacks incur high costs due to\nexcessive iteration counts. Furthermore, the generated adversarial examples\nexhibit pronounced adversarial noise and demonstrate limited efficacy in\nevading defense methods such as DiffPure. To address these issues, inspired by\nscore matching, we introduce AdvDiffVLM, which utilizes diffusion models to\ngenerate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM\nemploys Adaptive Ensemble Gradient Estimation to modify the score during the\ndiffusion model's reverse generation process, ensuring the adversarial examples\nproduced contain natural adversarial semantics and thus possess enhanced\ntransferability. Simultaneously, to enhance the quality of adversarial examples\nfurther, we employ the GradCAM-guided Mask method to disperse adversarial\nsemantics throughout the image, rather than concentrating them in a specific\narea. 
Experimental results demonstrate that our method achieves a speedup\nranging from 10X to 30X compared to existing transfer-based attack methods,\nwhile maintaining superior quality of adversarial examples. Additionally, the\ngenerated adversarial examples possess strong transferability and exhibit\nincreased robustness against adversarial defense methods. Notably, AdvDiffVLM\ncan successfully attack commercial VLMs, including GPT-4V, in a black-box\nmanner.\n","authors":["Qi Guo","Shanmin Pang","Xiaojun Jia","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10332v1","updated":"2024-04-16T07:14:32Z","published":"2024-04-16T07:14:32Z","title":"Prescribing the Right Remedy: Mitigating Hallucinations in Large\n Vision-Language Models via Targeted Instruction Tuning","summary":" Despite achieving outstanding performance on various cross-modal tasks,\ncurrent large vision-language models (LVLMs) still suffer from hallucination\nissues, manifesting as inconsistencies between their generated responses and\nthe corresponding images. Prior research has implicated that the low quality of\ninstruction data, particularly the skewed balance between positive and negative\nsamples, is a significant contributor to model hallucinations. Recently,\nresearchers have proposed high-quality instruction datasets, such as\nLRV-Instruction, to mitigate model hallucination. Nonetheless, our\ninvestigation reveals that hallucinatory concepts from different LVLMs exhibit\nspecificity, i.e. the distribution of hallucinatory concepts varies\nsignificantly across models. Existing datasets did not consider the\nhallucination specificity of different models in the design processes, thereby\ndiminishing their efficacy in mitigating model hallucination. In this paper, we\npropose a targeted instruction data generation framework named DFTG that\ntailored to the hallucination specificity of different models. Concretely, DFTG\nconsists of two stages: hallucination diagnosis, which extracts the necessary\ninformation from the model's responses and images for hallucination diagnosis;\nand targeted data generation, which generates targeted instruction data based\non diagnostic results. The experimental results on hallucination benchmarks\ndemonstrate that the targeted instruction data generated by our method are more\neffective in mitigating hallucinations compared to previous datasets.\n","authors":["Rui Hu","Yahan Tu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2404.10332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10322v1","updated":"2024-04-16T07:07:40Z","published":"2024-04-16T07:07:40Z","title":"Domain-Rectifying Adapter for Cross-Domain Few-Shot Segmentation","summary":" Few-shot semantic segmentation (FSS) has achieved great success on segmenting\nobjects of novel classes, supported by only a few annotated samples. However,\nexisting FSS methods often underperform in the presence of domain shifts,\nespecially when encountering new domain styles that are unseen during training.\nIt is suboptimal to directly adapt or generalize the entire model to new\ndomains in the few-shot scenario. Instead, our key idea is to adapt a small\nadapter for rectifying diverse target domain styles to the source domain.\nConsequently, the rectified target domain features can fittingly benefit from\nthe well-optimized source domain segmentation model, which is intently trained\non sufficient source domain data. 
Training domain-rectifying adapter requires\nsufficiently diverse target domains. We thus propose a novel local-global style\nperturbation method to simulate diverse potential target domains by\nperturbating the feature channel statistics of the individual images and\ncollective statistics of the entire source domain, respectively. Additionally,\nwe propose a cyclic domain alignment module to facilitate the adapter\neffectively rectifying domains using a reverse domain rectification\nsupervision. The adapter is trained to rectify the image features from diverse\nsynthesized target domains to align with the source domain. During testing on\ntarget domains, we start by rectifying the image features and then conduct\nfew-shot segmentation on the domain-rectified features. Extensive experiments\ndemonstrate the effectiveness of our method, achieving promising results on\ncross-domain few-shot semantic segmentation tasks. Our code is available at\nhttps://github.com/Matt-Su/DR-Adapter.\n","authors":["Jiapeng Su","Qi Fan","Guangming Lu","Fanglin Chen","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2404.10322v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10319v1","updated":"2024-04-16T06:59:26Z","published":"2024-04-16T06:59:26Z","title":"Application of Deep Learning Methods to Processing of Noisy Medical\n Video Data","summary":" Cells count become a challenging problem when the cells move in a continuous\nstream, and their boundaries are difficult for visual detection. To resolve\nthis problem we modified the training and decision making processes using\ncurriculum learning and multi-view predictions techniques, respectively.\n","authors":["Danil Afonchikov","Elena Kornaeva","Irina Makovik","Alexey Kornaev"],"pdf_url":"https://arxiv.org/pdf/2404.10319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10318v1","updated":"2024-04-16T06:58:30Z","published":"2024-04-16T06:58:30Z","title":"SRGS: Super-Resolution 3D Gaussian Splatting","summary":" Recently, 3D Gaussian Splatting (3DGS) has gained popularity as a novel\nexplicit 3D representation. This approach relies on the representation power of\nGaussian primitives to provide a high-quality rendering. However, primitives\noptimized at low resolution inevitably exhibit sparsity and texture deficiency,\nposing a challenge for achieving high-resolution novel view synthesis (HRNVS).\nTo address this problem, we propose Super-Resolution 3D Gaussian Splatting\n(SRGS) to perform the optimization in a high-resolution (HR) space. The\nsub-pixel constraint is introduced for the increased viewpoints in HR space,\nexploiting the sub-pixel cross-view information of the multiple low-resolution\n(LR) views. The gradient accumulated from more viewpoints will facilitate the\ndensification of primitives. Furthermore, a pre-trained 2D super-resolution\nmodel is integrated with the sub-pixel constraint, enabling these dense\nprimitives to learn faithful texture features. In general, our method focuses\non densification and texture learning to effectively enhance the representation\nability of primitives. Experimentally, our method achieves high rendering\nquality on HRNVS only with LR inputs, outperforming state-of-the-art methods on\nchallenging datasets such as Mip-NeRF 360 and Tanks & Temples. 
Related codes\nwill be released upon acceptance.\n","authors":["Xiang Feng","Yongbo He","Yubo Wang","Yan Yang","Zhenzhong Kuang","Yu Jun","Jianping Fan","Jiajun ding"],"pdf_url":"https://arxiv.org/pdf/2404.10318v1.pdf","comment":"submit ACM MM 2024"},{"id":"http://arxiv.org/abs/2404.10314v1","updated":"2024-04-16T06:40:51Z","published":"2024-04-16T06:40:51Z","title":"Awareness of uncertainty in classification using a multivariate model\n and multi-views","summary":" One of the ways to make artificial intelligence more natural is to give it\nsome room for doubt. Two main questions should be resolved in that way. First,\nhow to train a model to estimate uncertainties of its own predictions? And\nthen, what to do with the uncertain predictions if they appear? First, we\nproposed an uncertainty-aware negative log-likelihood loss for the case of\nN-dimensional multivariate normal distribution with spherical variance matrix\nto the solution of N-classes classification tasks. The loss is similar to the\nheteroscedastic regression loss. The proposed model regularizes uncertain\npredictions, and trains to calculate both the predictions and their uncertainty\nestimations. The model fits well with the label smoothing technique. Second, we\nexpanded the limits of data augmentation at the training and test stages, and\nmade the trained model to give multiple predictions for a given number of\naugmented versions of each test sample. Given the multi-view predictions\ntogether with their uncertainties and confidences, we proposed several methods\nto calculate final predictions, including mode values and bin counts with soft\nand hard weights. For the latter method, we formalized the model tuning task in\nthe form of multimodal optimization with non-differentiable criteria of maximum\naccuracy, and applied particle swarm optimization to solve the tuning task. The\nproposed methodology was tested using CIFAR-10 dataset with clean and noisy\nlabels and demonstrated good results in comparison with other uncertainty\nestimation methods related to sample selection, co-teaching, and label\nsmoothing.\n","authors":["Alexey Kornaev","Elena Kornaeva","Oleg Ivanov","Ilya Pershin","Danis Alukaev"],"pdf_url":"https://arxiv.org/pdf/2404.10314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10312v1","updated":"2024-04-16T06:39:37Z","published":"2024-04-16T06:39:37Z","title":"OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable\n Diffusion Model","summary":" Omnidirectional images (ODIs) are commonly used in real-world visual tasks,\nand high-resolution ODIs help improve the performance of related visual tasks.\nMost existing super-resolution methods for ODIs use end-to-end learning\nstrategies, resulting in inferior realness of generated images and a lack of\neffective out-of-domain generalization capabilities in training methods. Image\ngeneration methods represented by diffusion model provide strong priors for\nvisual tasks and have been proven to be effectively applied to image\nrestoration tasks. Leveraging the image priors of the Stable Diffusion (SD)\nmodel, we achieve omnidirectional image super-resolution with both fidelity and\nrealness, dubbed as OmniSSR. Firstly, we transform the equirectangular\nprojection (ERP) images into tangent projection (TP) images, whose distribution\napproximates the planar image domain. Then, we use SD to iteratively sample\ninitial high-resolution results. 
At each denoising iteration, we further\ncorrect and update the initial results using the proposed Octadecaplex Tangent\nInformation Interaction (OTII) and Gradient Decomposition (GD) technique to\nensure better consistency. Finally, the TP images are transformed back to\nobtain the final high-resolution results. Our method is zero-shot, requiring no\ntraining or fine-tuning. Experiments of our method on two benchmark datasets\ndemonstrate the effectiveness of our proposed method.\n","authors":["Runyi Li","Xuhan Sheng","Weiqi Li","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15406v2","updated":"2024-04-16T06:33:09Z","published":"2023-12-24T04:49:06Z","title":"Objects as volumes: A stochastic geometry view of opaque solids","summary":" We develop a theory for the representation of opaque solids as volumes.\nStarting from a stochastic representation of opaque solids as random indicator\nfunctions, we prove the conditions under which such solids can be modeled using\nexponential volumetric transport. We also derive expressions for the volumetric\nattenuation coefficient as a functional of the probability distributions of the\nunderlying indicator functions. We generalize our theory to account for\nisotropic and anisotropic scattering at different parts of the solid, and for\nrepresentations of opaque solids as stochastic implicit surfaces. We derive our\nvolumetric representation from first principles, which ensures that it\nsatisfies physical constraints such as reciprocity and reversibility. We use\nour theory to explain, compare, and correct previous volumetric\nrepresentations, as well as propose meaningful extensions that lead to improved\nperformance in 3D reconstruction tasks.\n","authors":["Bailey Miller","Hanyu Chen","Alice Lai","Ioannis Gkioulekas"],"pdf_url":"https://arxiv.org/pdf/2312.15406v2.pdf","comment":"project page: https://imaging.cs.cmu.edu/volumetric_opaque_solids"},{"id":"http://arxiv.org/abs/2404.10307v1","updated":"2024-04-16T06:33:08Z","published":"2024-04-16T06:33:08Z","title":"Learnable Prompt for Few-Shot Semantic Segmentation in Remote Sensing\n Domain","summary":" Few-shot segmentation is a task to segment objects or regions of novel\nclasses within an image given only a few annotated examples. In the generalized\nsetting, the task extends to segment both the base and the novel classes. The\nmain challenge is how to train the model such that the addition of novel\nclasses does not hurt the base classes performance, also known as catastrophic\nforgetting. To mitigate this issue, we use SegGPT as our base model and train\nit on the base classes. Then, we use separate learnable prompts to handle\npredictions for each novel class. To handle various object sizes which\ntypically present in remote sensing domain, we perform patch-based prediction.\nTo address the discontinuities along patch boundaries, we propose a\npatch-and-stitch technique by re-framing the problem as an image inpainting\ntask. During inference, we also utilize image similarity search over image\nembeddings for prompt selection and novel class filtering to reduce false\npositive predictions. 
Based on our experiments, our proposed method boosts the\nweighted mIoU of a simple fine-tuned SegGPT from 15.96 to 35.08 on the\nvalidation set of few-shot OpenEarthMap dataset given in the challenge.\n","authors":["Steve Andreas Immanuel","Hagai Raja Sinulingga"],"pdf_url":"https://arxiv.org/pdf/2404.10307v1.pdf","comment":"Accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2303.16242v4","updated":"2024-04-16T06:26:46Z","published":"2023-03-28T18:36:19Z","title":"CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image\n Arbitrary-Scale Super Resolution","summary":" Medical image arbitrary-scale super-resolution (MIASSR) has recently gained\nwidespread attention, aiming to super sample medical volumes at arbitrary\nscales via a single model. However, existing MIASSR methods face two major\nlimitations: (i) reliance on high-resolution (HR) volumes and (ii) limited\ngeneralization ability, which restricts their application in various scenarios.\nTo overcome these limitations, we propose Cube-based Neural Radiance Field\n(CuNeRF), a zero-shot MIASSR framework that can yield medical images at\narbitrary scales and viewpoints in a continuous domain. Unlike existing MIASSR\nmethods that fit the mapping between low-resolution (LR) and HR volumes, CuNeRF\nfocuses on building a coordinate-intensity continuous representation from LR\nvolumes without the need for HR references. This is achieved by the proposed\ndifferentiable modules: including cube-based sampling, isotropic volume\nrendering, and cube-based hierarchical rendering. Through extensive experiments\non magnetic resource imaging (MRI) and computed tomography (CT) modalities, we\ndemonstrate that CuNeRF outperforms state-of-the-art MIASSR methods. CuNeRF\nyields better visual verisimilitude and reduces aliasing artifacts at various\nupsampling factors. Moreover, our CuNeRF does not need any LR-HR training\npairs, which is more flexible and easier to be used than others. Our code is\nreleased at https://github.com/NarcissusEx/CuNeRF.\n","authors":["Zixuan Chen","Jian-Huang Lai","Lingxiao Yang","Xiaohua Xie"],"pdf_url":"https://arxiv.org/pdf/2303.16242v4.pdf","comment":"This paper is accepted by the International Conference on Computer\n Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2404.10305v1","updated":"2024-04-16T06:24:53Z","published":"2024-04-16T06:24:53Z","title":"TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table\n Structure & Content","summary":" The automatic recognition of tabular data in document images presents a\nsignificant challenge due to the diverse range of table styles and complex\nstructures. Tables offer valuable content representation, enhancing the\npredictive capabilities of various systems such as search engines and Knowledge\nGraphs. Addressing the two main problems, namely table detection (TD) and table\nstructure recognition (TSR), has traditionally been approached independently.\nIn this research, we propose an end-to-end pipeline that integrates deep\nlearning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve\ncomprehensive image-based table recognition. This integrated approach\neffectively handles diverse table styles, complex structures, and image\ndistortions, resulting in improved accuracy and efficiency compared to existing\nmethods like Table Transformers. 
Our system achieves simultaneous table\ndetection (TD), table structure recognition (TSR), and table content\nrecognition (TCR), preserving table structures and accurately extracting\ntabular data from document images. The integration of multiple models addresses\nthe intricacies of table recognition, making our approach a promising solution\nfor image-based table understanding, data extraction, and information retrieval\napplications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy\nof 78%, showcasing a remarkable improvement of approximately 25% in the OCR\nAccuracy compared to the previous Table Transformer approach.\n","authors":["Avinash Anand","Raj Jaiswal","Pijush Bhuyan","Mohit Gupta","Siddhesh Bangar","Md. Modassir Imam","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.10305v1.pdf","comment":"8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for\n Information Retrieval"},{"id":"http://arxiv.org/abs/2404.09406v2","updated":"2024-04-16T05:58:39Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Broad-scale marine surveys performed by underwater vehicles significantly\nincrease the availability of coral reef imagery, however it is costly and\ntime-consuming for domain experts to label images. Point label propagation is\nan approach used to leverage existing image data labeled with sparse point\nlabels. The resulting augmented ground truth generated is then used to train a\nsemantic segmentation model. Here, we first demonstrate that recent advances in\nfoundation models enable generation of multi-species coral augmented ground\ntruth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN),\nwithout the need for any pre-training or custom-designed algorithms. For\nextremely sparsely labeled images, we propose a labeling regime based on\nhuman-in-the-loop principles, resulting in significant improvement in\nannotation efficiency: If only 5 point labels per image are available, our\nproposed human-in-the-loop approach improves on the state-of-the-art by 17.3%\nfor pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point\nlabels per image are available. Even if the human-in-the-loop labeling regime\nis not used, the denoised DINOv2 features with a KNN outperforms the prior\nstate-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points).\nWe also provide a detailed analysis of how point labeling style and the\nquantity of points per image affects the point label propagation quality and\nprovide general recommendations on maximizing point label efficiency.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v2.pdf","comment":"Accepted at the CVPR2024 3rd Workshop on Learning with Limited\n Labelled Data for Image and Video Understanding (L3D-IVU), 10 pages, 6\n figures, an additional 4 pages of supplementary material"},{"id":"http://arxiv.org/abs/2404.10292v1","updated":"2024-04-16T05:29:14Z","published":"2024-04-16T05:29:14Z","title":"From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for\n Efficient Text-based Person Search","summary":" In text-based person search endeavors, data generation has emerged as a\nprevailing practice, addressing concerns over privacy preservation and the\narduous task of manual annotation. 
Although the number of synthesized data can\nbe infinite in theory, the scientific conundrum persists that how much\ngenerated data optimally fuels subsequent model training. We observe that only\na subset of the data in these constructed datasets plays a decisive role.\nTherefore, we introduce a new Filtering-WoRA paradigm, which contains a\nfiltering algorithm to identify this crucial data subset and WoRA (Weighted\nLow-Rank Adaptation) learning strategy for light fine-tuning. The filtering\nalgorithm is based on the cross-modality relevance to remove the lots of coarse\nmatching synthesis pairs. As the number of data decreases, we do not need to\nfine-tune the entire model. Therefore, we propose a WoRA learning strategy to\nefficiently update a minimal portion of model parameters. WoRA streamlines the\nlearning process, enabling heightened efficiency in extracting knowledge from\nfewer, yet potent, data instances. Extensive experimentation validates the\nefficacy of pretraining, where our model achieves advanced and efficient\nretrieval performance on challenging real-world benchmarks. Notably, on the\nCUHK-PEDES dataset, we have achieved a competitive mAP of 67.02% while reducing\nmodel training time by 19.82%.\n","authors":["Jintao Sun","Zhedong Zheng","Gangyi Ding"],"pdf_url":"https://arxiv.org/pdf/2404.10292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10290v1","updated":"2024-04-16T05:28:07Z","published":"2024-04-16T05:28:07Z","title":"NeuroMorphix: A Novel Brain MRI Asymmetry-specific Feature Construction\n Approach For Seizure Recurrence Prediction","summary":" Seizure recurrence is an important concern after an initial unprovoked\nseizure; without drug treatment, it occurs within 2 years in 40-50% of cases.\nThe decision to treat currently relies on predictors of seizure recurrence risk\nthat are inaccurate, resulting in unnecessary, possibly harmful, treatment in\nsome patients and potentially preventable seizures in others. Because of the\nlink between brain lesions and seizure recurrence, we developed a recurrence\nprediction tool using machine learning and clinical 3T brain MRI. We developed\nNeuroMorphix, a feature construction approach based on MRI brain anatomy. Each\nof seven NeuroMorphix features measures the absolute or relative difference\nbetween corresponding regions in each cerebral hemisphere. FreeSurfer was used\nto segment brain regions and to generate values for morphometric parameters (8\nfor each cortical region and 5 for each subcortical region). The parameters\nwere then mapped to whole brain NeuroMorphix features, yielding a total of 91\nfeatures per subject. Features were generated for a first seizure patient\ncohort (n = 169) categorised into seizure recurrence and non-recurrence\nsubgroups. State-of-the-art classification algorithms were trained and tested\nusing NeuroMorphix features to predict seizure recurrence. Classification\nmodels using the top 5 features, ranked by sequential forward selection,\ndemonstrated excellent performance in predicting seizure recurrence, with area\nunder the ROC curve of 88-93%, accuracy of 83-89%, and F1 score of 83-90%.\nHighly ranked features aligned with structural alterations known to be\nassociated with epilepsy. 
This study highlights the potential for targeted,\ndata-driven approaches to aid clinical decision-making in brain disorders.\n","authors":["Soumen Ghosh","Viktor Vegh","Shahrzad Moinian","Hamed Moradi","Alice-Ann Sullivan","John Phamnguyen","David Reutens"],"pdf_url":"https://arxiv.org/pdf/2404.10290v1.pdf","comment":"This work has been submitted to the IEEE TMI for possible publication"},{"id":"http://arxiv.org/abs/2404.10282v1","updated":"2024-04-16T04:52:41Z","published":"2024-04-16T04:52:41Z","title":"Tripod: Three Complementary Inductive Biases for Disentangled\n Representation Learning","summary":" Inductive biases are crucial in disentangled representation learning for\nnarrowing down an underspecified solution set. In this work, we consider\nendowing a neural network autoencoder with three select inductive biases from\nthe literature: data compression into a grid-like latent space via\nquantization, collective independence amongst latents, and minimal functional\ninfluence of any latent on how other latents determine data generation. In\nprinciple, these inductive biases are deeply complementary: they most directly\nspecify properties of the latent space, encoder, and decoder, respectively. In\npractice, however, naively combining existing techniques instantiating these\ninductive biases fails to yield significant benefits. To address this, we\npropose adaptations to the three techniques that simplify the learning problem,\nequip key regularization terms with stabilizing invariances, and quash\ndegenerate incentives. The resulting model, Tripod, achieves state-of-the-art\nresults on a suite of four image disentanglement benchmarks. We also verify\nthat Tripod significantly improves upon its naive incarnation and that all\nthree of its \"legs\" are necessary for best performance.\n","authors":["Kyle Hsu","Jubayer Ibn Hamid","Kaylee Burns","Chelsea Finn","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10282v1.pdf","comment":"22 pages, 10 figures, code available at\n https://github.com/kylehkhsu/tripod"},{"id":"http://arxiv.org/abs/2404.10279v1","updated":"2024-04-16T04:44:16Z","published":"2024-04-16T04:44:16Z","title":"EucliDreamer: Fast and High-Quality Texturing for 3D Models with\n Depth-Conditioned Stable Diffusion","summary":" We present EucliDreamer, a simple and effective method to generate textures\nfor 3D models given text prompts and meshes. The texture is parametrized as an\nimplicit function on the 3D surface, which is optimized with the Score\nDistillation Sampling (SDS) process and differentiable rendering. To generate\nhigh-quality textures, we leverage a depth-conditioned Stable Diffusion model\nguided by the depth image rendered from the mesh. We test our approach on 3D\nmodels in Objaverse and conducted a user study, which shows its superior\nquality compared to existing texturing methods like Text2Tex. In addition, our\nmethod converges 2 times faster than DreamFusion. Through text prompting,\ntextures of diverse art styles can be produced. 
We hope Euclidreamer proides a\nviable solution to automate a labor-intensive stage in 3D content creation.\n","authors":["Cindy Le","Congrui Hetang","Chendi Lin","Ang Cao","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2404.10279v1.pdf","comment":"Short version of arXiv:2311.15573"},{"id":"http://arxiv.org/abs/2403.14987v2","updated":"2024-04-16T04:15:32Z","published":"2024-03-22T06:45:45Z","title":"Generative Active Learning for Image Synthesis Personalization","summary":" This paper presents a pilot study that explores the application of active\nlearning, traditionally studied in the context of discriminative models, to\ngenerative models. We specifically focus on image synthesis personalization\ntasks. The primary challenge in conducting active learning on generative models\nlies in the open-ended nature of querying, which differs from the closed form\nof querying in discriminative models that typically target a single concept. We\nintroduce the concept of anchor directions to transform the querying process\ninto a semi-open problem. We propose a direction-based uncertainty sampling\nstrategy to enable generative active learning and tackle the\nexploitation-exploration dilemma. Extensive experiments are conducted to\nvalidate the effectiveness of our approach, demonstrating that an open-source\nmodel can achieve superior performance compared to closed-source models\ndeveloped by large companies, such as Google's StyleDrop. The source code is\navailable at https://github.com/zhangxulu1996/GAL4Personalization.\n","authors":["Xulu Zhang","Wengyu Zhang","Xiao-Yong Wei","Jinlin Wu","Zhaoxiang Zhang","Zhen Lei","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2403.14987v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10272v1","updated":"2024-04-16T04:05:33Z","published":"2024-04-16T04:05:33Z","title":"Plug-and-Play Acceleration of Occupancy Grid-based NeRF Rendering using\n VDB Grid and Hierarchical Ray Traversal","summary":" Transmittance estimators such as Occupancy Grid (OG) can accelerate the\ntraining and rendering of Neural Radiance Field (NeRF) by predicting important\nsamples that contributes much to the generated image. However, OG manages\noccupied regions in the form of the dense binary grid, in which there are many\nblocks with the same values that cause redundant examination of voxels'\nemptiness in ray-tracing. In our work, we introduce two techniques to improve\nthe efficiency of ray-tracing in trained OG without fine-tuning. First, we\nreplace the dense grids with VDB grids to reduce the spatial redundancy.\nSecond, we use hierarchical digital differential analyzer (HDDA) to efficiently\ntrace voxels in the VDB grids. 
Our experiments on NeRF-Synthetic and Mip-NeRF\n360 datasets show that our proposed method successfully accelerates rendering\nNeRF-Synthetic dataset by 12% in average and Mip-NeRF 360 dataset by 4% in\naverage, compared to a fast implementation of OG, NerfAcc, without losing the\nquality of rendered images.\n","authors":["Yoshio Kato","Shuhei Tarashima"],"pdf_url":"https://arxiv.org/pdf/2404.10272v1.pdf","comment":"Short paper for CVPR Neural Rendering Intelligence Workshop 2024.\n Code: https://github.com/Yosshi999/faster-occgrid"},{"id":"http://arxiv.org/abs/2305.00635v2","updated":"2024-04-16T03:46:03Z","published":"2023-05-01T02:51:38Z","title":"Learning Self-Prior for Mesh Inpainting Using Self-Supervised Graph\n Convolutional Networks","summary":" In this paper, we present a self-prior-based mesh inpainting framework that\nrequires only an incomplete mesh as input, without the need for any training\ndatasets. Additionally, our method maintains the polygonal mesh format\nthroughout the inpainting process without converting the shape format to an\nintermediate one, such as a voxel grid, a point cloud, or an implicit function,\nwhich are typically considered easier for deep neural networks to process. To\nachieve this goal, we introduce two graph convolutional networks (GCNs):\nsingle-resolution GCN (SGCN) and multi-resolution GCN (MGCN), both trained in a\nself-supervised manner. Our approach refines a watertight mesh obtained from\nthe initial hole filling to generate a complete output mesh. Specifically, we\ntrain the GCNs to deform an oversmoothed version of the input mesh into the\nexpected complete shape. The deformation is described by vertex displacements,\nand the GCNs are supervised to obtain accurate displacements at vertices in\nreal holes. To this end, we specify several connected regions of the mesh as\nfake holes, thereby generating meshes with various sets of fake holes. The\ncorrect displacements of vertices are known in these fake holes, thus enabling\ntraining GCNs with loss functions that assess the accuracy of vertex\ndisplacements. We demonstrate that our method outperforms traditional\ndataset-independent approaches and exhibits greater robustness compared with\nother deep-learning-based methods for shapes that infrequently appear in shape\ndatasets. Our code and test data are available at\nhttps://github.com/astaka-pe/SeMIGCN.\n","authors":["Shota Hattori","Tatsuya Yatagawa","Yutaka Ohtake","Hiromasa Suzuki"],"pdf_url":"https://arxiv.org/pdf/2305.00635v2.pdf","comment":"18 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.10267v1","updated":"2024-04-16T03:45:45Z","published":"2024-04-16T03:45:45Z","title":"OneActor: Consistent Character Generation via Cluster-Conditioned\n Guidance","summary":" Text-to-image diffusion models benefit artists with high-quality image\ngeneration. Yet its stochastic nature prevent artists from creating consistent\nimages of the same character. Existing methods try to tackle this challenge and\ngenerate consistent content in various ways. However, they either depend on\nexternal data or require expensive tuning of the diffusion model. For this\nissue, we argue that a lightweight but intricate guidance is enough to\nfunction. Aiming at this, we lead the way to formalize the objective of\nconsistent generation, derive a clustering-based score function and propose a\nnovel paradigm, OneActor. 
We design a cluster-conditioned model which\nincorporates posterior samples to guide the denoising trajectories towards the\ntarget cluster. To overcome the overfitting challenge shared by one-shot tuning\npipelines, we devise auxiliary components to simultaneously augment the tuning\nand regulate the inference. This technique is later verified to significantly\nenhance the content diversity of generated images. Comprehensive experiments\nshow that our method outperforms a variety of baselines with satisfactory\ncharacter consistency, superior prompt conformity as well as high image\nquality. And our method is at least 4 times faster than tuning-based baselines.\nFurthermore, to our best knowledge, we first prove that the semantic space has\nthe same interpolation property as the latent space dose. This property can\nserve as another promising tool for fine generation control.\n","authors":["Jiahao Wang","Caixia Yan","Haonan Lin","Weizhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v4","updated":"2024-04-16T03:43:43Z","published":"2023-11-22T03:31:31Z","title":"MetaCloak: Preventing Unauthorized Subject-driven Text-to-image\n Diffusion-based Synthesis via Meta-learning","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v4.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.09640v2","updated":"2024-04-16T03:43:11Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. 
This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: https://github.com/JethroJames/CREST.\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v2.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at:\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2404.09378v2","updated":"2024-04-16T03:39:27Z","published":"2024-04-14T23:30:35Z","title":"Orientation-conditioned Facial Texture Mapping for Video-based Facial\n Remote Photoplethysmography Estimation","summary":" Camera-based remote photoplethysmography (rPPG) enables contactless\nmeasurement of important physiological signals such as pulse rate (PR).\nHowever, dynamic and unconstrained subject motion introduces significant\nvariability into the facial appearance in video, confounding the ability of\nvideo-based methods to accurately extract the rPPG signal. In this study, we\nleverage the 3D facial surface to construct a novel orientation-conditioned\nfacial texture video representation which improves the motion robustness of\nexisting video-based facial rPPG estimation methods. Our proposed method\nachieves a significant 18.2% performance improvement in cross-dataset testing\non MMPD over our baseline using the PhysNet model trained on PURE, highlighting\nthe efficacy and generalization benefits of our designed video representation.\nWe demonstrate significant performance improvements of up to 29.6% in all\ntested motion scenarios in cross-dataset testing on MMPD, even in the presence\nof dynamic and unconstrained subject motion, emphasizing the benefits of\ndisentangling motion through modeling the 3D facial surface for motion robust\nfacial rPPG estimation. We validate the efficacy of our design decisions and\nthe impact of different video processing steps through an ablation study. Our\nfindings illustrate the potential strengths of exploiting the 3D facial surface\nas a general strategy for addressing dynamic and unconstrained subject motion\nin videos. 
The code is available at\nhttps://samcantrill.github.io/orientation-uv-rppg/.\n","authors":["Sam Cantrill","David Ahmedt-Aristizabal","Lars Petersson","Hanna Suominen","Mohammad Ali Armin"],"pdf_url":"https://arxiv.org/pdf/2404.09378v2.pdf","comment":"12 pages, 8 figures, 6 tables; corrected abstract typo"},{"id":"http://arxiv.org/abs/2404.00231v2","updated":"2024-04-16T03:38:31Z","published":"2024-03-30T03:23:52Z","title":"Attention-based Shape-Deformation Networks for Artifact-Free Geometry\n Reconstruction of Lumbar Spine from MR Images","summary":" Lumbar disc degeneration, a progressive structural wear and tear of lumbar\nintervertebral disc, is regarded as an essential role on low back pain, a\nsignificant global health concern. Automated lumbar spine geometry\nreconstruction from MR images will enable fast measurement of medical\nparameters to evaluate the lumbar status, in order to determine a suitable\ntreatment. Existing image segmentation-based techniques often generate\nerroneous segments or unstructured point clouds, unsuitable for medical\nparameter measurement. In this work, we present TransDeformer: a novel\nattention-based deep learning approach that reconstructs the geometry of the\nlumbar spine with high spatial accuracy and mesh correspondence across\npatients, and we also present a variant of TransDeformer for error estimation.\nSpecially, we devise new attention modules with a new attention formula, which\nintegrate image features and tokenized contour features to predict the\ndisplacements of the points on a shape template without the need for image\nsegmentation. The deformed template reveals the lumbar spine geometry in an\nimage. Experiment results show that our TransDeformer generates artifact-free\ngeometry outputs, and its variant predicts the error of a reconstructed\ngeometry. Our code is available at\nhttps://github.com/linchenq/TransDeformer-Mesh.\n","authors":["Linchen Qian","Jiasong Chen","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2404.00231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10263v1","updated":"2024-04-16T03:34:35Z","published":"2024-04-16T03:34:35Z","title":"PreGSU-A Generalized Traffic Scene Understanding Model for Autonomous\n Driving based on Pre-trained Graph Attention Network","summary":" Scene understanding, defined as learning, extraction, and representation of\ninteractions among traffic elements, is one of the critical challenges toward\nhigh-level autonomous driving (AD). Current scene understanding methods mainly\nfocus on one concrete single task, such as trajectory prediction and risk level\nevaluation. Although they perform well on specific metrics, the generalization\nability is insufficient to adapt to the real traffic complexity and downstream\ndemand diversity. In this study, we propose PreGSU, a generalized pre-trained\nscene understanding model based on graph attention network to learn the\nuniversal interaction and reasoning of traffic scenes to support various\ndownstream tasks. After the feature engineering and sub-graph module, all\nelements are embedded as nodes to form a dynamic weighted graph. Then, four\ngraph attention layers are applied to learn the relationships among agents and\nlanes. In the pre-train phase, the understanding model is trained on two\nself-supervised tasks: Virtual Interaction Force (VIF) modeling and Masked Road\nModeling (MRM). 
Based on the artificial potential field theory, VIF modeling\nenables PreGSU to capture the agent-to-agent interactions while MRM extracts\nagent-to-road connections. In the fine-tuning process, the pre-trained\nparameters are loaded to derive detailed understanding outputs. We conduct\nvalidation experiments on two downstream tasks, i.e., trajectory prediction in\nurban scenario, and intention recognition in highway scenario, to verify the\ngeneralized ability and understanding ability. Results show that compared with\nthe baselines, PreGSU achieves better accuracy on both tasks, indicating the\npotential to be generalized to various scenes and targets. Ablation study shows\nthe effectiveness of pre-train task design.\n","authors":["Yuning Wang","Zhiyuan Liu","Haotian Lin","Junkai Jiang","Shaobing Xu","Jianqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10263v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.09301v2","updated":"2024-04-16T03:27:00Z","published":"2024-04-14T16:55:23Z","title":"A Simple Strategy for Body Estimation from Partial-View Images","summary":" Virtual try-on and product personalization have become increasingly important\nin modern online shopping, highlighting the need for accurate body measurement\nestimation. Although previous research has advanced in estimating 3D body\nshapes from RGB images, the task is inherently ambiguous as the observed scale\nof human subjects in the images depends on two unknown factors: capture\ndistance and body dimensions. This ambiguity is particularly pronounced in\npartial-view scenarios. To address this challenge, we propose a modular and\nsimple height normalization solution. This solution relocates the subject\nskeleton to the desired position, thereby normalizing the scale and\ndisentangling the relationship between the two variables. Our experimental\nresults demonstrate that integrating this technique into state-of-the-art human\nmesh reconstruction models significantly enhances partial body measurement\nestimation. Additionally, we illustrate the applicability of this approach to\nmulti-view settings, showcasing its versatility.\n","authors":["Yafei Mao","Xuelu Li","Brandon Smith","Jinjin Li","Raja Bala"],"pdf_url":"https://arxiv.org/pdf/2404.09301v2.pdf","comment":"Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design"},{"id":"http://arxiv.org/abs/2401.15914v2","updated":"2024-04-16T03:25:25Z","published":"2024-01-29T06:57:48Z","title":"Overcoming the Pitfalls of Vision-Language Model Finetuning for OOD\n Generalization","summary":" Existing vision-language models exhibit strong generalization on a variety of\nvisual domains and tasks. However, such models mainly perform zero-shot\nrecognition in a closed-set manner, and thus struggle to handle open-domain\nvisual concepts by design. There are recent finetuning methods, such as prompt\nlearning, that not only study the discrimination between in-distribution (ID)\nand out-of-distribution (OOD) samples, but also show some improvements in both\nID and OOD accuracies. In this paper, we first demonstrate that vision-language\nmodels, after long enough finetuning but without proper regularization, tend to\noverfit the known classes in the given dataset, with degraded performance on\nunknown classes. 
Then we propose a novel approach OGEN to address this pitfall,\nwith the main focus on improving the OOD GENeralization of finetuned models.\nSpecifically, a class-conditional feature generator is introduced to synthesize\nOOD features using just the class name of any unknown class. Such synthesized\nfeatures will provide useful knowledge about unknowns and help regularize the\ndecision boundary between ID and OOD data when optimized jointly. Equally\nimportant is our adaptive self-distillation mechanism to regularize our feature\ngeneration model during joint optimization, i.e., adaptively transferring\nknowledge between model states to further prevent overfitting. Experiments\nvalidate that our method yields convincing gains in OOD generalization\nperformance in different settings. Code: https://github.com/apple/ml-ogen.\n","authors":["Yuhang Zang","Hanlin Goh","Josh Susskind","Chen Huang"],"pdf_url":"https://arxiv.org/pdf/2401.15914v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2312.06797v2","updated":"2024-04-16T03:10:34Z","published":"2023-12-11T19:13:38Z","title":"Improving the Robustness of 3D Human Pose Estimation: A Benchmark and\n Learning from Noisy Input","summary":" Despite the promising performance of current 3D human pose estimation\ntechniques, understanding and enhancing their generalization on challenging\nin-the-wild videos remain an open problem. In this work, we focus on the\nrobustness of 2D-to-3D pose lifters. To this end, we develop two benchmark\ndatasets, namely Human3.6M-C and HumanEva-I-C, to examine the robustness of\nvideo-based 3D pose lifters to a wide range of common video corruptions\nincluding temporary occlusion, motion blur, and pixel-level noise. We observe\nthe poor generalization of state-of-the-art 3D pose lifters in the presence of\ncorruption and establish two techniques to tackle this issue. First, we\nintroduce Temporal Additive Gaussian Noise (TAGN) as a simple yet effective 2D\ninput pose data augmentation. Additionally, to incorporate the confidence\nscores output by the 2D pose detectors, we design a confidence-aware\nconvolution (CA-Conv) block. Extensively tested on corrupted videos, the\nproposed strategies consistently boost the robustness of 3D pose lifters and\nserve as new baselines for future research.\n","authors":["Trung-Hieu Hoang","Mona Zehni","Huy Phan","Duc Minh Vo","Minh N. Do"],"pdf_url":"https://arxiv.org/pdf/2312.06797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17521v2","updated":"2024-04-16T03:02:04Z","published":"2024-02-27T14:05:05Z","title":"AVS-Net: Point Sampling with Adaptive Voxel Size for 3D Scene\n Understanding","summary":" The recent advancements in point cloud learning have enabled intelligent\nvehicles and robots to comprehend 3D environments better. However, processing\nlarge-scale 3D scenes remains a challenging problem, such that efficient\ndownsampling methods play a crucial role in point cloud learning. Existing\ndownsampling methods either require a huge computational burden or sacrifice\nfine-grained geometric information. For such purpose, this paper presents an\nadvanced sampler that achieves both high accuracy and efficiency. The proposed\nmethod utilizes voxel centroid sampling as a foundation but effectively\naddresses the challenges regarding voxel size determination and the\npreservation of critical geometric cues. Specifically, we propose a Voxel\nAdaptation Module that adaptively adjusts voxel sizes with the reference of\npoint-based downsampling ratio. 
This ensures that the sampling results exhibit\na favorable distribution for comprehending various 3D objects or scenes.\nMeanwhile, we introduce a network compatible with arbitrary voxel sizes for\nsampling and feature extraction while maintaining high efficiency. The proposed\napproach is demonstrated with 3D object detection and 3D semantic segmentation.\nCompared to existing state-of-the-art methods, our approach achieves better\naccuracy on outdoor and indoor large-scale datasets, e.g. Waymo and ScanNet,\nwith promising efficiency.\n","authors":["Hongcheng Yang","Dingkang Liang","Dingyuan Zhang","Zhe Liu","Zhikang Zou","Xingyu Jiang","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.17521v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10242v1","updated":"2024-04-16T02:42:06Z","published":"2024-04-16T02:42:06Z","title":"Masked Autoencoders for Microscopy are Scalable Learners of Cellular\n Biology","summary":" Featurizing microscopy images for use in biological research remains a\nsignificant challenge, especially for large-scale experiments spanning millions\nof images. This work explores the scaling properties of weakly supervised\nclassifiers and self-supervised masked autoencoders (MAEs) when training with\nincreasingly larger model backbones and microscopy datasets. Our results show\nthat ViT-based MAEs outperform weakly supervised classifiers on a variety of\ntasks, achieving as much as a 11.5% relative improvement when recalling known\nbiological relationships curated from public databases. Additionally, we\ndevelop a new channel-agnostic MAE architecture (CA-MAE) that allows for\ninputting images of different numbers and orders of channels at inference time.\nWe demonstrate that CA-MAEs effectively generalize by inferring and evaluating\non a microscopy image dataset (JUMP-CP) generated under different experimental\nconditions with a different channel structure than our pretraining data\n(RPI-93M). Our findings motivate continued research into scaling\nself-supervised learning on microscopy data in order to create powerful\nfoundation models of cellular biology that have the potential to catalyze\nadvancements in drug discovery and beyond.\n","authors":["Oren Kraus","Kian Kenyon-Dean","Saber Saberian","Maryam Fallah","Peter McLean","Jess Leung","Vasudev Sharma","Ayla Khan","Jia Balakrishnan","Safiye Celik","Dominique Beaini","Maciej Sypetkowski","Chi Vicky Cheng","Kristen Morse","Maureen Makes","Ben Mabey","Berton Earnshaw"],"pdf_url":"https://arxiv.org/pdf/2404.10242v1.pdf","comment":"CVPR 2024 Highlight. arXiv admin note: text overlap with\n arXiv:2309.16064"},{"id":"http://arxiv.org/abs/2404.10241v1","updated":"2024-04-16T02:40:35Z","published":"2024-04-16T02:40:35Z","title":"Vision-and-Language Navigation via Causal Learning","summary":" In the pursuit of robust and generalizable environment perception and\nlanguage understanding, the ubiquitous challenge of dataset bias continues to\nplague vision-and-language navigation (VLN) agents, hindering their performance\nin unseen environments. This paper introduces the generalized cross-modal\ncausal transformer (GOAT), a pioneering solution rooted in the paradigm of\ncausal inference. By delving into both observable and unobservable confounders\nwithin vision, language, and history, we propose the back-door and front-door\nadjustment causal learning (BACL and FACL) modules to promote unbiased learning\nby comprehensively mitigating potential spurious correlations. 
Additionally, to\ncapture global confounder features, we propose a cross-modal feature pooling\n(CFP) module supervised by contrastive learning, which is also shown to be\neffective in improving cross-modal representations during pre-training.\nExtensive experiments across multiple VLN datasets (R2R, REVERIE, RxR, and\nSOON) underscore the superiority of our proposed method over previous\nstate-of-the-art approaches. Code is available at\nhttps://github.com/CrystalSixone/VLN-GOAT.\n","authors":["Liuyi Wang","Zongtao He","Ronghao Dang","Mengjiao Shen","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08123v3","updated":"2024-04-16T02:37:47Z","published":"2023-07-16T18:42:01Z","title":"Solving Inverse Problems with Latent Diffusion Models via Hard Data\n Consistency","summary":" Diffusion models have recently emerged as powerful generative priors for\nsolving inverse problems. However, training diffusion models in the pixel space\nare both data-intensive and computationally demanding, which restricts their\napplicability as priors for high-dimensional real-world data such as medical\nimages. Latent diffusion models, which operate in a much lower-dimensional\nspace, offer a solution to these challenges. However, incorporating latent\ndiffusion models to solve inverse problems remains a challenging problem due to\nthe nonlinearity of the encoder and decoder. To address these issues, we\npropose \\textit{ReSample}, an algorithm that can solve general inverse problems\nwith pre-trained latent diffusion models. Our algorithm incorporates data\nconsistency by solving an optimization problem during the reverse sampling\nprocess, a concept that we term as hard data consistency. Upon solving this\noptimization problem, we propose a novel resampling scheme to map the\nmeasurement-consistent sample back onto the noisy data manifold and\ntheoretically demonstrate its benefits. Lastly, we apply our algorithm to solve\na wide range of linear and nonlinear inverse problems in both natural and\nmedical images, demonstrating that our approach outperforms existing\nstate-of-the-art approaches, including those based on pixel-space diffusion\nmodels.\n","authors":["Bowen Song","Soo Min Kwon","Zecheng Zhang","Xinyu Hu","Qing Qu","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2307.08123v3.pdf","comment":"27 pages, 20 figures"},{"id":"http://arxiv.org/abs/2404.10237v1","updated":"2024-04-16T02:35:17Z","published":"2024-04-16T02:35:17Z","title":"MoE-TinyMed: Mixture of Experts for Tiny Medical Large Vision-Language\n Models","summary":" Mixture of Expert Tuning (MoE-Tuning) has effectively enhanced the\nperformance of general MLLMs with fewer parameters, yet its application in\nresource-limited medical settings has not been fully explored. To address this\ngap, we developed MoE-TinyMed, a model tailored for medical applications that\nsignificantly lowers parameter demands. In evaluations on the VQA-RAD, SLAKE,\nand Path-VQA datasets, MoE-TinyMed outperformed LLaVA-Med in all Med-VQA closed\nsettings with just 3.6B parameters. 
Additionally, a streamlined version with 2B\nparameters surpassed LLaVA-Med's performance in PathVQA, showcasing its\neffectiveness in resource-limited healthcare settings.\n","authors":["Songtao Jiang","Tuo Zheng","Yan Zhang","Yeying Jin","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10234v1","updated":"2024-04-16T02:29:00Z","published":"2024-04-16T02:29:00Z","title":"Compressible and Searchable: AI-native Multi-Modal Retrieval System with\n Learned Image Compression","summary":" The burgeoning volume of digital content across diverse modalities\nnecessitates efficient storage and retrieval methods. Conventional approaches\nstruggle to cope with the escalating complexity and scale of multimedia data.\nIn this paper, we propose a framework that addresses this challenge by fusing\nAI-native multi-modal search capabilities with neural image compression. First,\nwe analyze the intricate relationship between compressibility and\nsearchability, recognizing the pivotal role each plays in the efficiency of\nstorage and retrieval systems. We then use a simple adapter to bridge the\nfeatures of Learned Image Compression (LIC) and Contrastive Language-Image\nPretraining (CLIP) while retaining semantic fidelity and retrieval of\nmulti-modal data. Experimental evaluations on the Kodak dataset demonstrate the\nefficacy of our approach, showcasing significant enhancements in compression\nefficiency and search accuracy compared to existing methodologies. Our work\nmarks a significant advancement towards scalable and efficient multi-modal\nsearch systems in the era of big data.\n","authors":["Jixiang Luo"],"pdf_url":"https://arxiv.org/pdf/2404.10234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10227v1","updated":"2024-04-16T02:18:18Z","published":"2024-04-16T02:18:18Z","title":"MS-MANO: Enabling Hand Pose Tracking with Biomechanical Constraints","summary":" This work proposes a novel learning framework for visual hand dynamics\nanalysis that takes into account the physiological aspects of hand motion. The\nexisting models, which are simplified joint-actuated systems, often produce\nunnatural motions. To address this, we integrate a musculoskeletal system with\na learnable parametric hand model, MANO, to create a new model, MS-MANO. This\nmodel emulates the dynamics of muscles and tendons to drive the skeletal\nsystem, imposing physiologically realistic constraints on the resulting torque\ntrajectories. We further propose a simulation-in-the-loop pose refinement\nframework, BioPR, that refines the initial estimated pose through a multi-layer\nperceptron (MLP) network. Our evaluation of the accuracy of MS-MANO and the\nefficacy of the BioPR is conducted in two separate parts. The accuracy of\nMS-MANO is compared with MyoSuite, while the efficacy of BioPR is benchmarked\nagainst two large-scale public datasets and two recent state-of-the-art\nmethods. 
The results demonstrate that our approach consistently improves the\nbaseline methods both quantitatively and qualitatively.\n","authors":["Pengfei Xie","Wenqiang Xu","Tutian Tang","Zhenjun Yu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2404.10227v1.pdf","comment":"11 pages, 5 figures; CVPR 2024"},{"id":"http://arxiv.org/abs/2307.03157v2","updated":"2024-04-16T02:12:11Z","published":"2023-07-06T17:32:38Z","title":"Achieving Reliable and Fair Skin Lesion Diagnosis via Unsupervised\n Domain Adaptation","summary":" The development of reliable and fair diagnostic systems is often constrained\nby the scarcity of labeled data. To address this challenge, our work explores\nthe feasibility of unsupervised domain adaptation (UDA) to integrate large\nexternal datasets for developing reliable classifiers. The adoption of UDA with\nmultiple sources can simultaneously enrich the training set and bridge the\ndomain gap between different skin lesion datasets, which vary due to distinct\nacquisition protocols. Particularly, UDA shows practical promise for improving\ndiagnostic reliability when training with a custom skin lesion dataset, where\nonly limited labeled data are available from the target domain. In this study,\nwe investigate three UDA training schemes based on source data utilization:\nsingle-source, combined-source, and multi-source UDA. Our findings demonstrate\nthe effectiveness of applying UDA on multiple sources for binary and\nmulti-class classification. A strong correlation between test error and label\nshift in multi-class tasks has been observed in the experiment. Crucially, our\nstudy shows that UDA can effectively mitigate bias against minority groups and\nenhance fairness in diagnostic systems, while maintaining superior\nclassification performance. This is achieved even without directly implementing\nfairness-focused techniques. This success is potentially attributed to the\nincreased and well-adapted demographic information obtained from multiple\nsources.\n","authors":["Janet Wang","Yunbei Zhang","Zhengming Ding","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2307.03157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10226v1","updated":"2024-04-16T02:11:46Z","published":"2024-04-16T02:11:46Z","title":"Find The Gap: Knowledge Base Reasoning For Visual Question Answering","summary":" We analyze knowledge-based visual question answering, for which given a\nquestion, the models need to ground it into the visual modality and retrieve\nthe relevant knowledge from a given large knowledge base (KB) to be able to\nanswer. Our analysis has two folds, one based on designing neural architectures\nand training them from scratch, and another based on large pre-trained language\nmodels (LLMs). Our research questions are: 1) Can we effectively augment models\nby explicit supervised retrieval of the relevant KB information to solve the\nKB-VQA problem? 2) How do task-specific and LLM-based models perform in the\nintegration of visual and external knowledge, and multi-hop reasoning over both\nsources of information? 3) Is the implicit knowledge of LLMs sufficient for\nKB-VQA and to what extent it can replace the explicit KB? Our results\ndemonstrate the positive impact of empowering task-specific and LLM models with\nsupervised external and visual knowledge retrieval models. 
Our findings show\nthat though LLMs are stronger in 1-hop reasoning, they suffer in 2-hop\nreasoning in comparison with our fine-tuned NN model even if the relevant\ninformation from both modalities is available to the model. Moreover, we\nobserved that LLM models outperform the NN model for KB-related questions which\nconfirms the effectiveness of implicit knowledge in LLMs however, they do not\nalleviate the need for external KB.\n","authors":["Elham J. Barezi","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2404.10226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10220v1","updated":"2024-04-16T02:01:56Z","published":"2024-04-16T02:01:56Z","title":"Closed-Loop Open-Vocabulary Mobile Manipulation with GPT-4V","summary":" Autonomous robot navigation and manipulation in open environments require\nreasoning and replanning with closed-loop feedback. We present COME-robot, the\nfirst closed-loop framework utilizing the GPT-4V vision-language foundation\nmodel for open-ended reasoning and adaptive planning in real-world scenarios.\nWe meticulously construct a library of action primitives for robot exploration,\nnavigation, and manipulation, serving as callable execution modules for GPT-4V\nin task planning. On top of these modules, GPT-4V serves as the brain that can\naccomplish multimodal reasoning, generate action policy with code, verify the\ntask progress, and provide feedback for replanning. Such design enables\nCOME-robot to (i) actively perceive the environments, (ii) perform situated\nreasoning, and (iii) recover from failures. Through comprehensive experiments\ninvolving 8 challenging real-world tabletop and manipulation tasks, COME-robot\ndemonstrates a significant improvement in task success rate (~25%) compared to\nstate-of-the-art baseline methods. We further conduct comprehensive analyses to\nelucidate how COME-robot's design facilitates failure recovery, free-form\ninstruction following, and long-horizon task planning.\n","authors":["Peiyuan Zhi","Zhiyuan Zhang","Muzhi Han","Zeyu Zhang","Zhitian Li","Ziyuan Jiao","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.10220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16654v2","updated":"2024-04-16T01:55:19Z","published":"2023-06-29T03:31:46Z","title":"Self-Supervised MRI Reconstruction with Unrolled Diffusion Models","summary":" Magnetic Resonance Imaging (MRI) produces excellent soft tissue contrast,\nalbeit it is an inherently slow imaging modality. Promising deep learning\nmethods have recently been proposed to reconstruct accelerated MRI scans.\nHowever, existing methods still suffer from various limitations regarding image\nfidelity, contextual sensitivity, and reliance on fully-sampled acquisitions\nfor model training. To comprehensively address these limitations, we propose a\nnovel self-supervised deep reconstruction model, named Self-Supervised\nDiffusion Reconstruction (SSDiffRecon). SSDiffRecon expresses a conditional\ndiffusion process as an unrolled architecture that interleaves cross-attention\ntransformers for reverse diffusion steps with data-consistency blocks for\nphysics-driven processing. Unlike recent diffusion methods for MRI\nreconstruction, a self-supervision strategy is adopted to train SSDiffRecon\nusing only undersampled k-space data. Comprehensive experiments on public brain\nMR datasets demonstrates the superiority of SSDiffRecon against\nstate-of-the-art supervised, and self-supervised baselines in terms of\nreconstruction speed and quality. 
Implementation will be available at\nhttps://github.com/yilmazkorkmaz1/SSDiffRecon.\n","authors":["Yilmaz Korkmaz","Tolga Cukur","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2306.16654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12308v5","updated":"2024-04-16T01:52:00Z","published":"2023-04-24T17:57:15Z","title":"Segment Anything in 3D with Radiance Fields","summary":" The Segment Anything Model (SAM) emerges as a powerful vision foundation\nmodel to generate high-quality 2D segmentation results. This paper aims to\ngeneralize SAM to segment 3D objects. Rather than replicating the data\nacquisition and annotation procedure which is costly in 3D, we design an\nefficient solution, leveraging the radiance field as a cheap and off-the-shelf\nprior that connects multi-view 2D images to the 3D space. We refer to the\nproposed solution as SA3D, short for Segment Anything in 3D. With SA3D, the\nuser is only required to provide a 2D segmentation prompt (e.g., rough points)\nfor the target object in a single view, which is used to generate its\ncorresponding 2D mask with SAM. Next, SA3D alternately performs mask inverse\nrendering and cross-view self-prompting across various views to iteratively\nrefine the 3D mask of the target object. For one view, mask inverse rendering\nprojects the 2D mask obtained by SAM into the 3D space with guidance of the\ndensity distribution learned by the radiance field for 3D mask refinement;\nThen, cross-view self-prompting extracts reliable prompts automatically as the\ninput to SAM from the rendered 2D mask of the inaccurate 3D mask for a new\nview. We show in experiments that SA3D adapts to various scenes and achieves 3D\nsegmentation within seconds. Our research reveals a potential methodology to\nlift the ability of a 2D segmentation model to 3D. Our code is available at\nhttps://github.com/Jumpat/SegmentAnythingin3D.\n","authors":["Jiazhong Cen","Jiemin Fang","Zanwei Zhou","Chen Yang","Lingxi Xie","Xiaopeng Zhang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2304.12308v5.pdf","comment":"Extension version of SA3D (NeurIPS 2023). Project page:\n https://jumpat.github.io/SA3D/"},{"id":"http://arxiv.org/abs/2404.10213v1","updated":"2024-04-16T01:50:10Z","published":"2024-04-16T01:50:10Z","title":"GaitPoint+: A Gait Recognition Network Incorporating Point Cloud\n Analysis and Recycling","summary":" Gait is a behavioral biometric modality that can be used to recognize\nindividuals by the way they walk from a far distance. Most existing gait\nrecognition approaches rely on either silhouettes or skeletons, while their\njoint use is underexplored. Features from silhouettes and skeletons can provide\ncomplementary information for more robust recognition against appearance\nchanges or pose estimation errors. To exploit the benefits of both silhouette\nand skeleton features, we propose a new gait recognition network, referred to\nas the GaitPoint+. Our approach models skeleton key points as a 3D point cloud,\nand employs a computational complexity-conscious 3D point processing approach\nto extract skeleton features, which are then combined with silhouette features\nfor improved accuracy. Since silhouette- or CNN-based methods already require\nconsiderable amount of computational resources, it is preferable that the key\npoint learning module is faster and more lightweight. 
We present a detailed\nanalysis of the utilization of every human key point after the use of\ntraditional max-pooling, and show that while elbow and ankle points are used\nmost commonly, many useful points are discarded by max-pooling. Thus, we\npresent a method to recycle some of the discarded points by a Recycling\nMax-Pooling module, during processing of skeleton point clouds, and achieve\nfurther performance improvement. We provide a comprehensive set of experimental\nresults showing that (i) incorporating skeleton features obtained by a\npoint-based 3D point cloud processing approach boosts the performance of three\ndifferent state-of-the-art silhouette- and CNN-based baselines; (ii) recycling\nthe discarded points increases the accuracy further. Ablation studies are also\nprovided to show the effectiveness and contribution of different components of\nour approach.\n","authors":["Huantao Ren","Jiajing Chen","Senem Velipasalar"],"pdf_url":"https://arxiv.org/pdf/2404.10213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10212v1","updated":"2024-04-16T01:49:35Z","published":"2024-04-16T01:49:35Z","title":"LWIRPOSE: A novel LWIR Thermal Image Dataset and Benchmark","summary":" Human pose estimation faces hurdles in real-world applications due to factors\nlike lighting changes, occlusions, and cluttered environments. We introduce a\nunique RGB-Thermal Nearly Paired and Annotated 2D Pose Dataset, comprising over\n2,400 high-quality LWIR (thermal) images. Each image is meticulously annotated\nwith 2D human poses, offering a valuable resource for researchers and\npractitioners. This dataset, captured from seven actors performing diverse\neveryday activities like sitting, eating, and walking, facilitates pose\nestimation on occlusion and other challenging scenarios. We benchmark\nstate-of-the-art pose estimation methods on the dataset to showcase its\npotential, establishing a strong baseline for future research. Our results\ndemonstrate the dataset's effectiveness in promoting advancements in pose\nestimation for various applications, including surveillance, healthcare, and\nsports analytics. The dataset and code are available at\nhttps://github.com/avinres/LWIRPOSE\n","authors":["Avinash Upadhyay","Bhipanshu Dhupar","Manoj Sharma","Ankit Shukla","Ajith Abraham"],"pdf_url":"https://arxiv.org/pdf/2404.10212v1.pdf","comment":"Submitted in ICIP2024"},{"id":"http://arxiv.org/abs/2404.10210v1","updated":"2024-04-16T01:41:22Z","published":"2024-04-16T01:41:22Z","title":"MK-SGN: A Spiking Graph Convolutional Network with Multimodal Fusion and\n Knowledge Distillation for Skeleton-based Action Recognition","summary":" In recent years, skeleton-based action recognition, leveraging multimodal\nGraph Convolutional Networks (GCN), has achieved remarkable results. However,\ndue to their deep structure and reliance on continuous floating-point\noperations, GCN-based methods are energy-intensive. To address this issue, we\npropose an innovative Spiking Graph Convolutional Network with Multimodal\nFusion and Knowledge Distillation (MK-SGN). By merging the energy efficiency of\nSpiking Neural Network (SNN) with the graph representation capability of GCN,\nthe proposed MK-SGN reduces energy consumption while maintaining recognition\naccuracy. Firstly, we convert GCN into Spiking Graph Convolutional Network\n(SGN) and construct a foundational Base-SGN for skeleton-based action\nrecognition, establishing a new benchmark and paving the way for future\nresearch exploration. 
Secondly, we further propose a Spiking Multimodal Fusion\nmodule (SMF), leveraging mutual information to process multimodal data more\nefficiently. Additionally, we introduce a spiking attention mechanism and\ndesign a Spatio Graph Convolution module with a Spatial Global Spiking\nAttention mechanism (SA-SGC), enhancing feature learning capability.\nFurthermore, we delve into knowledge distillation methods from multimodal GCN\nto SGN and propose a novel, integrated method that simultaneously focuses on\nboth intermediate layer distillation and soft label distillation to improve the\nperformance of SGN. On two challenging datasets for skeleton-based action\nrecognition, MK-SGN outperforms the state-of-the-art GCN-like frameworks in\nreducing computational load and energy consumption. In contrast, typical GCN\nmethods typically consume more than 35mJ per action sample, while MK-SGN\nreduces energy consumption by more than 98%.\n","authors":["Naichuan Zheng","Hailun Xia","Zeyu Liang"],"pdf_url":"https://arxiv.org/pdf/2404.10210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10523v3","updated":"2024-04-16T01:34:34Z","published":"2023-04-20T17:52:58Z","title":"GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape\n Generative Models","summary":" This paper introduces GenCorres, a novel unsupervised joint shape matching\n(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized\ndeformable shape collection while constraining deformations between adjacent\nsynthetic shapes to preserve geometric structures such as local rigidity and\nlocal conformality. GenCorres presents three appealing advantages over existing\nJSM techniques. First, GenCorres performs JSM among a synthetic shape\ncollection whose size is much bigger than the input shapes and fully leverages\nthe datadriven power of JSM. Second, GenCorres unifies consistent shape\nmatching and pairwise matching (i.e., by enforcing deformation priors between\nadjacent synthetic shapes). Third, the generator provides a concise encoding of\nconsistent shape correspondences. However, learning a mesh generator from an\nunorganized shape collection is challenging, requiring a good initialization.\nGenCorres addresses this issue by learning an implicit generator from the input\nshapes, which provides intermediate shapes between two arbitrary shapes. We\nintroduce a novel approach for computing correspondences between adjacent\nimplicit surfaces, which we use to regularize the implicit generator. Synthetic\nshapes of the implicit generator then guide initial fittings (i.e., via\ntemplate-based deformation) for learning the mesh generator. Experimental\nresults show that GenCorres considerably outperforms state-of-the-art JSM\ntechniques. The synthetic shapes of GenCorres also achieve salient performance\ngains against state-of-the-art deformable shape generators.\n","authors":["Haitao Yang","Xiangru Huang","Bo Sun","Chandrajit Bajaj","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2304.10523v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.08197v2","updated":"2024-04-16T01:13:35Z","published":"2024-04-12T02:04:34Z","title":"Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and\n Training Strategies","summary":" This paper investigates the performance of the Contrastive Language-Image\nPre-training (CLIP) when scaled down to limited computation budgets. We explore\nCLIP along three dimensions: data, architecture, and training strategies. 
With\nregards to data, we demonstrate the significance of high-quality training data\nand show that a smaller dataset of high-quality data can outperform a larger\ndataset with lower quality. We also examine how model performance varies with\ndifferent dataset sizes, suggesting that smaller ViT models are better suited\nfor smaller datasets, while larger models perform better on larger datasets\nwith fixed compute. Additionally, we provide guidance on when to choose a\nCNN-based architecture or a ViT-based architecture for CLIP training. We\ncompare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data\nAugmentation - and show that the choice of training strategy depends on the\navailable compute resource. Our analysis reveals that CLIP+Data Augmentation\ncan achieve comparable performance to CLIP using only half of the training\ndata. This work provides practical insights into how to effectively train and\ndeploy CLIP models, making them more accessible and affordable for practical\nuse in various applications.\n","authors":["Zichao Li","Cihang Xie","Ekin Dogus Cubuk"],"pdf_url":"https://arxiv.org/pdf/2404.08197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10193v1","updated":"2024-04-16T00:28:26Z","published":"2024-04-16T00:28:26Z","title":"Consistency and Uncertainty: Identifying Unreliable Responses From\n Black-Box Vision-Language Models for Selective Visual Question Answering","summary":" The goal of selective prediction is to allow an a model to abstain when it\nmay not be able to deliver a reliable prediction, which is important in\nsafety-critical contexts. Existing approaches to selective prediction typically\nrequire access to the internals of a model, require retraining a model or study\nonly unimodal models. However, the most powerful models (e.g. GPT-4) are\ntypically only available as black boxes with inaccessible internals, are not\nretrainable by end-users, and are frequently used for multimodal tasks. We\nstudy the possibility of selective prediction for vision-language models in a\nrealistic, black-box setting. We propose using the principle of\n\\textit{neighborhood consistency} to identify unreliable responses from a\nblack-box vision-language model in question answering tasks. We hypothesize\nthat given only a visual question and model response, the consistency of the\nmodel's responses over the neighborhood of a visual question will indicate\nreliability. It is impossible to directly sample neighbors in feature space in\na black-box setting. Instead, we show that it is possible to use a smaller\nproxy model to approximately sample from the neighborhood. We find that\nneighborhood consistency can be used to identify model responses to visual\nquestions that are likely unreliable, even in adversarial settings or settings\nthat are out-of-distribution to the proxy model.\n","authors":["Zaid Khan","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.10193v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10947v1","updated":"2024-04-16T23:05:17Z","published":"2024-04-16T23:05:17Z","title":"Residual Connections Harm Self-Supervised Abstract Feature Learning","summary":" We demonstrate that adding a weighting factor to decay the strength of\nidentity shortcuts within residual networks substantially improves semantic\nfeature learning in the state-of-the-art self-supervised masked autoencoding\n(MAE) paradigm. 
Our modification to the identity shortcuts within a VIT-B/16\nbackbone of an MAE boosts linear probing accuracy on ImageNet from 67.3% to\n72.3%. This significant gap suggests that, while residual connection structure\nserves an essential role in facilitating gradient propagation, it may have a\nharmful side effect of reducing capacity for abstract learning by virtue of\ninjecting an echo of shallower representations into deeper layers. We\nameliorate this downside via a fixed formula for monotonically decreasing the\ncontribution of identity connections as layer depth increases. Our design\npromotes the gradual development of feature abstractions, without impacting\nnetwork trainability. Analyzing the representations learned by our modified\nresidual networks, we find correlation between low effective feature rank and\ndownstream task performance.\n","authors":["Xiao Zhang","Ruoxi Jiang","William Gao","Rebecca Willett","Michael Maire"],"pdf_url":"https://arxiv.org/pdf/2404.10947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10940v1","updated":"2024-04-16T22:44:29Z","published":"2024-04-16T22:44:29Z","title":"Neuromorphic Vision-based Motion Segmentation with Graph Transformer\n Neural Network","summary":" Moving object segmentation is critical to interpret scene dynamics for\nrobotic navigation systems in challenging environments. Neuromorphic vision\nsensors are tailored for motion perception due to their asynchronous nature,\nhigh temporal resolution, and reduced power consumption. However, their\nunconventional output requires novel perception paradigms to leverage their\nspatially sparse and temporally dense nature. In this work, we propose a novel\nevent-based motion segmentation algorithm using a Graph Transformer Neural\nNetwork, dubbed GTNN. Our proposed algorithm processes event streams as 3D\ngraphs by a series of nonlinear transformations to unveil local and global\nspatiotemporal correlations between events. Based on these correlations, events\nbelonging to moving objects are segmented from the background without prior\nknowledge of the dynamic scene geometry. The algorithm is trained on publicly\navailable datasets including MOD, EV-IMO, and \\textcolor{black}{EV-IMO2} using\nthe proposed training scheme to facilitate efficient training on extensive\ndatasets. Moreover, we introduce the Dynamic Object Mask-aware Event Labeling\n(DOMEL) approach for generating approximate ground-truth labels for event-based\nmotion segmentation datasets. We use DOMEL to label our own recorded Event\ndataset for Motion Segmentation (EMS-DOMEL), which we release to the public for\nfurther research and benchmarking. Rigorous experiments are conducted on\nseveral unseen publicly-available datasets where the results revealed that GTNN\noutperforms state-of-the-art methods in the presence of dynamic background\nvariations, motion patterns, and multiple dynamic objects with varying sizes\nand velocities. 
GTNN achieves significant performance gains with an average\nincrease of 9.4% and 4.5% in terms of motion segmentation accuracy (IoU%) and\ndetection rate (DR%), respectively.\n","authors":["Yusra Alkendi","Rana Azzam","Sajid Javed","Lakmal Seneviratne","Yahya Zweiri"],"pdf_url":"https://arxiv.org/pdf/2404.10940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17121v2","updated":"2024-04-16T22:41:26Z","published":"2023-11-28T13:44:33Z","title":"ScribbleGen: Generative Data Augmentation Improves Scribble-supervised\n Semantic Segmentation","summary":" Recent advances in generative models, such as diffusion models, have made\ngenerating high-quality synthetic images widely accessible. Prior works have\nshown that training on synthetic images improves many perception tasks, such as\nimage classification, object detection, and semantic segmentation. We are the\nfirst to explore generative data augmentations for scribble-supervised semantic\nsegmentation. We propose ScribbleGen, a generative data augmentation method\nthat leverages a ControlNet diffusion model conditioned on semantic scribbles\nto produce high-quality training data. However, naive implementations of\ngenerative data augmentations may inadvertently harm the performance of the\ndownstream segmentor rather than improve it. We leverage classifier-free\ndiffusion guidance to enforce class consistency and introduce encode ratios to\ntrade off data diversity for data realism. Using the guidance scale and encode\nratio, we can generate a spectrum of high-quality training images. We propose\nmultiple augmentation schemes and find that these schemes significantly impact\nmodel performance, especially in the low-data regime. Our framework further\nreduces the gap between the performance of scribble-supervised segmentation and\nthat of fully-supervised segmentation. We also show that our framework\nsignificantly improves segmentation performance on small datasets, even\nsurpassing fully-supervised segmentation. The code is available at\nhttps://github.com/mengtang-lab/scribblegen.\n","authors":["Jacob Schnell","Jieke Wang","Lu Qi","Vincent Tao Hu","Meng Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13938v2","updated":"2024-04-16T22:29:17Z","published":"2023-07-26T03:30:28Z","title":"Improving Semi-Supervised Semantic Segmentation with Dual-Level Siamese\n Structure Network","summary":" Semi-supervised semantic segmentation (SSS) is an important task that\nutilizes both labeled and unlabeled data to reduce expenses on labeling\ntraining examples. However, the effectiveness of SSS algorithms is limited by\nthe difficulty of fully exploiting the potential of unlabeled data. To address\nthis, we propose a dual-level Siamese structure network (DSSN) for pixel-wise\ncontrastive learning. By aligning positive pairs with a pixel-wise contrastive\nloss using strong augmented views in both low-level image space and high-level\nfeature space, the proposed DSSN is designed to maximize the utilization of\navailable unlabeled data. Additionally, we introduce a novel class-aware\npseudo-label selection strategy for weak-to-strong supervision, which addresses\nthe limitations of most existing methods that do not perform selection or apply\na predefined threshold for all classes. Specifically, our strategy selects the\ntop high-confidence prediction of the weak view for each class to generate\npseudo labels that supervise the strong augmented views. 
This strategy is\ncapable of taking into account the class imbalance and improving the\nperformance of long-tailed classes. Our proposed method achieves\nstate-of-the-art results on two datasets, PASCAL VOC 2012 and Cityscapes,\noutperforming other SSS algorithms by a significant margin. The source code is\navailable at https://github.com/kunzhan/DSSN.\n","authors":["Zhibo Tain","Xiaolin Zhang","Peng Zhang","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2307.13938v2.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2402.18771v2","updated":"2024-04-16T22:15:58Z","published":"2024-02-29T00:25:26Z","title":"NARUTO: Neural Active Reconstruction from Uncertain Target Observations","summary":" We present NARUTO, a neural active reconstruction system that combines a\nhybrid neural representation with uncertainty learning, enabling high-fidelity\nsurface reconstruction. Our approach leverages a multi-resolution hash-grid as\nthe mapping backbone, chosen for its exceptional convergence speed and capacity\nto capture high-frequency local features.The centerpiece of our work is the\nincorporation of an uncertainty learning module that dynamically quantifies\nreconstruction uncertainty while actively reconstructing the environment. By\nharnessing learned uncertainty, we propose a novel uncertainty aggregation\nstrategy for goal searching and efficient path planning. Our system\nautonomously explores by targeting uncertain observations and reconstructs\nenvironments with remarkable completeness and fidelity. We also demonstrate the\nutility of this uncertainty-aware approach by enhancing SOTA neural SLAM\nsystems through an active ray sampling strategy. Extensive evaluations of\nNARUTO in various environments, using an indoor scene simulator, confirm its\nsuperior performance and state-of-the-art status in active reconstruction, as\nevidenced by its impressive results on benchmark datasets like Replica and\nMP3D.\n","authors":["Ziyue Feng","Huangying Zhan","Zheng Chen","Qingan Yan","Xiangyu Xu","Changjiang Cai","Bing Li","Qilun Zhu","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2402.18771v2.pdf","comment":"Accepted to CVPR2024. Project page:\n https://oppo-us-research.github.io/NARUTO-website/. Code:\n https://github.com/oppo-us-research/NARUTO"},{"id":"http://arxiv.org/abs/2403.09799v2","updated":"2024-04-16T22:03:16Z","published":"2024-03-14T18:37:46Z","title":"BOP Challenge 2023 on Detection, Segmentation and Pose Estimation of\n Seen and Unseen Rigid Objects","summary":" We present the evaluation methodology, datasets and results of the BOP\nChallenge 2023, the fifth in a series of public competitions organized to\ncapture the state of the art in model-based 6D object pose estimation from an\nRGB/RGB-D image and related tasks. Besides the three tasks from 2022\n(model-based 2D detection, 2D segmentation, and 6D localization of objects seen\nduring training), the 2023 challenge introduced new variants of these tasks\nfocused on objects unseen during training. In the new tasks, methods were\nrequired to learn new objects during a short onboarding stage (max 5 minutes, 1\nGPU) from provided 3D object models. The best 2023 method for 6D localization\nof unseen objects (GenFlow) notably reached the accuracy of the best 2020\nmethod for seen objects (CosyPose), although being noticeably slower. The best\n2023 method for seen objects (GPose) achieved a moderate accuracy improvement\nbut a significant 43% run-time improvement compared to the best 2022\ncounterpart (GDRNPP). 
Since 2017, the accuracy of 6D localization of seen\nobjects has improved by more than 50% (from 56.9 to 85.6 AR_C). The online\nevaluation system stays open and is available at: http://bop.felk.cvut.cz/.\n","authors":["Tomas Hodan","Martin Sundermeyer","Yann Labbe","Van Nguyen Nguyen","Gu Wang","Eric Brachmann","Bertram Drost","Vincent Lepetit","Carsten Rother","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2403.09799v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.13075"},{"id":"http://arxiv.org/abs/2404.10927v1","updated":"2024-04-16T21:57:58Z","published":"2024-04-16T21:57:58Z","title":"A Concise Tiling Strategy for Preserving Spatial Context in Earth\n Observation Imagery","summary":" We propose a new tiling strategy, Flip-n-Slide, which has been developed for\nspecific use with large Earth observation satellite images when the location of\nobjects-of-interest (OoI) is unknown and spatial context can be necessary for\nclass disambiguation. Flip-n-Slide is a concise and minimalistic approach that\nallows OoI to be represented at multiple tile positions and orientations. This\nstrategy introduces multiple views of spatio-contextual information, without\nintroducing redundancies into the training set. By maintaining distinct\ntransformation permutations for each tile overlap, we enhance the\ngeneralizability of the training set without misrepresenting the true data\ndistribution. Our experiments validate the effectiveness of Flip-n-Slide in the\ntask of semantic segmentation, a necessary data product in geophysical studies.\nWe find that Flip-n-Slide outperforms the previous state-of-the-art\naugmentation routines for tiled data in all evaluation metrics. For\nunderrepresented classes, Flip-n-Slide increases precision by as much as 15.8%.\n","authors":["Ellianna Abrahams","Tasha Snow","Matthew R. Siegfried","Fernando Pérez"],"pdf_url":"https://arxiv.org/pdf/2404.10927v1.pdf","comment":"Accepted to the Machine Learning for Remote Sensing (ML4RS) Workshop\n at ICLR 2024"},{"id":"http://arxiv.org/abs/2110.14553v4","updated":"2024-04-16T21:52:28Z","published":"2021-10-27T16:24:39Z","title":"GenURL: A General Framework for Unsupervised Representation Learning","summary":" Unsupervised representation learning (URL), which learns compact embeddings\nof high-dimensional data without supervision, has made remarkable progress\nrecently. However, the development of URLs for different requirements is\nindependent, which limits the generalization of the algorithms, especially\nprohibitive as the number of tasks grows. For example, dimension reduction\nmethods, t-SNE, and UMAP optimize pair-wise data relationships by preserving\nthe global geometric structure, while self-supervised learning, SimCLR, and\nBYOL focus on mining the local statistics of instances under specific\naugmentations. To address this dilemma, we summarize and propose a unified\nsimilarity-based URL framework, GenURL, which can smoothly adapt to various URL\ntasks. In this paper, we regard URL tasks as different implicit constraints on\nthe data geometric structure that help to seek optimal low-dimensional\nrepresentations that boil down to data structural modeling (DSM) and\nlow-dimensional transformation (LDT). Specifically, DMS provides a\nstructure-based submodule to describe the global structures, and LDT learns\ncompact low-dimensional embeddings with given pretext tasks. Moreover, an\nobjective function, General Kullback-Leibler divergence (GKL), is proposed to\nconnect DMS and LDT naturally. 
Comprehensive experiments demonstrate that\nGenURL achieves consistent state-of-the-art performance in self-supervised\nvisual learning, unsupervised knowledge distillation (KD), graph embeddings\n(GE), and dimension reduction.\n","authors":["Siyuan Li","Zicheng Liu","Zelin Zang","Di Wu","Zhiyuan Chen","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2110.14553v4.pdf","comment":"TNNLS 2024 version with 13 pages and 14 figures"},{"id":"http://arxiv.org/abs/2402.01203v2","updated":"2024-04-16T21:44:32Z","published":"2024-02-02T08:13:18Z","title":"Neural Language of Thought Models","summary":" The Language of Thought Hypothesis suggests that human cognition operates on\na structured, language-like system of mental representations. While neural\nlanguage models can naturally benefit from the compositional structure\ninherently and explicitly expressed in language data, learning such\nrepresentations from non-linguistic general observations, like images, remains\na challenge. In this work, we introduce the Neural Language of Thought Model\n(NLoTM), a novel approach for unsupervised learning of LoTH-inspired\nrepresentation and generation. NLoTM comprises two key components: (1) the\nSemantic Vector-Quantized Variational Autoencoder, which learns hierarchical,\ncomposable discrete representations aligned with objects and their properties,\nand (2) the Autoregressive LoT Prior, an autoregressive transformer that learns\nto generate semantic concept tokens compositionally, capturing the underlying\ndata distribution. We evaluate NLoTM on several 2D and 3D image datasets,\ndemonstrating superior performance in downstream tasks, out-of-distribution\ngeneralization, and image generation quality compared to patch-based VQ-VAE and\ncontinuous object-centric representations. Our work presents a significant step\ntowards creating neural networks exhibiting more human-like understanding by\ndeveloping LoT-like representations and offers insights into the intersection\nof cognitive science and machine learning.\n","authors":["Yi-Fu Wu","Minseung Lee","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2402.01203v2.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2310.07687v2","updated":"2024-04-16T21:13:12Z","published":"2023-10-11T17:36:17Z","title":"Orbital Polarimetric Tomography of a Flare Near the Sagittarius A*\n Supermassive Black Hole","summary":" The interaction between the supermassive black hole at the center of the\nMilky Way, Sagittarius A*, and its accretion disk occasionally produces\nhigh-energy flares seen in X-ray, infrared, and radio. One proposed mechanism\nthat produces flares is the formation of compact, bright regions that appear\nwithin the accretion disk and close to the event horizon. Understanding these\nflares provides a window into accretion processes. Although sophisticated\nsimulations predict the formation of these flares, their structure has yet to\nbe recovered by observations. Here we show the first three-dimensional (3D)\nreconstruction of an emission flare recovered from ALMA light curves observed\non April 11, 2017. Our recovery shows compact, bright regions at a distance of\nroughly six times the event horizon. Moreover, it suggests a clockwise rotation\nin a low-inclination orbital plane, consistent with prior studies by GRAVITY\nand EHT. To recover this emission structure, we solve an ill-posed tomography\nproblem by integrating a neural 3D representation with a gravitational model\nfor black holes. 
Although the recovery is subject to, and sometimes sensitive\nto, the model assumptions, under physically motivated choices, our results are\nstable, and our approach is successful on simulated data.\n","authors":["Aviad Levis","Andrew A. Chael","Katherine L. Bouman","Maciek Wielgus","Pratul P. Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2310.07687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16168v2","updated":"2024-04-16T21:05:24Z","published":"2023-12-26T18:56:49Z","title":"Social-Transmotion: Promptable Human Trajectory Prediction","summary":" Accurate human trajectory prediction is crucial for applications such as\nautonomous vehicles, robotics, and surveillance systems. Yet, existing models\noften fail to fully leverage the non-verbal social cues human subconsciously\ncommunicate when navigating the space. To address this, we introduce\nSocial-Transmotion, a generic Transformer-based model that exploits diverse and\nnumerous visual cues to predict human behavior. We translate the idea of a\nprompt from Natural Language Processing (NLP) to the task of human trajectory\nprediction, where a prompt can be a sequence of x-y coordinates on the ground,\nbounding boxes in the image plane, or body pose keypoints in either 2D or 3D.\nThis, in turn, augments trajectory data, leading to enhanced human trajectory\nprediction. Using masking technique, our model exhibits flexibility and\nadaptability by capturing spatiotemporal interactions between agents based on\nthe available visual cues. We delve into the merits of using 2D versus 3D\nposes, and a limited set of poses. Additionally, we investigate the spatial and\ntemporal attention map to identify which keypoints and time-steps in the\nsequence are vital for optimizing human trajectory prediction. Our approach is\nvalidated on multiple datasets, including JTA, JRDB, Pedestrians and Cyclists\nin Road Traffic, and ETH-UCY. The code is publicly available:\nhttps://github.com/vita-epfl/social-transmotion.\n","authors":["Saeed Saadatnejad","Yang Gao","Kaouther Messaoud","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.16168v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2307.03798v3","updated":"2024-04-16T20:57:35Z","published":"2023-07-07T18:54:11Z","title":"Fooling Contrastive Language-Image Pre-trained Models with\n CLIPMasterPrints","summary":" Models leveraging both visual and textual data such as Contrastive\nLanguage-Image Pre-training (CLIP), are the backbone of many recent advances in\nartificial intelligence. In this work, we show that despite their versatility,\nsuch models are vulnerable to what we refer to as fooling master images.\nFooling master images are capable of maximizing the confidence score of a CLIP\nmodel for a significant number of widely varying prompts, while being either\nunrecognizable or unrelated to the attacked prompts for humans. The existence\nof such images is problematic as it could be used by bad actors to maliciously\ninterfere with CLIP-trained image retrieval models in production with\ncomparably small effort as a single image can attack many different prompts. We\ndemonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined\nusing stochastic gradient descent, projected gradient descent, or blackbox\noptimization. Contrary to many common adversarial attacks, the blackbox\noptimization approach allows us to mine CLIPMasterPrints even when the weights\nof the model are not accessible. 
We investigate the properties of the mined\nimages, and find that images trained on a small number of image captions\ngeneralize to a much larger number of semantically related captions. We\nevaluate possible mitigation strategies, where we increase the robustness of\nthe model and introduce an approach to automatically detect CLIPMasterPrints to\nsanitize the input of vulnerable models. Finally, we find that vulnerability to\nCLIPMasterPrints is related to a modality gap in contrastive pre-trained\nmulti-modal networks. Code available at\nhttps://github.com/matfrei/CLIPMasterPrints.\n","authors":["Matthias Freiberger","Peter Kun","Christian Igel","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2307.03798v3.pdf","comment":"This work was supported by a research grant (40575) from VILLUM\n FONDEN"},{"id":"http://arxiv.org/abs/2404.10904v1","updated":"2024-04-16T20:51:36Z","published":"2024-04-16T20:51:36Z","title":"Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression\n Recognition","summary":" Human communication is multi-modal; e.g., face-to-face interaction involves\nauditory signals (speech) and visual signals (face movements and hand\ngestures). Hence, it is essential to exploit multiple modalities when designing\nmachine learning-based facial expression recognition systems. In addition,\ngiven the ever-growing quantities of video data that capture human facial\nexpressions, such systems should utilize raw unlabeled videos without requiring\nexpensive annotations. Therefore, in this work, we employ a multitask\nmulti-modal self-supervised learning method for facial expression recognition\nfrom in-the-wild video data. Our model combines three self-supervised objective\nfunctions: First, a multi-modal contrastive loss, that pulls diverse data\nmodalities of the same video together in the representation space. Second, a\nmulti-modal clustering loss that preserves the semantic structure of input data\nin the representation space. Finally, a multi-modal data reconstruction loss.\nWe conduct a comprehensive study on this multimodal multi-task self-supervised\nlearning method on three facial expression recognition benchmarks. To that end,\nwe examine the performance of learning through different combinations of\nself-supervised tasks on the facial expression recognition downstream task. Our\nmodel ConCluGen outperforms several multi-modal self-supervised and fully\nsupervised baselines on the CMU-MOSEI dataset. Our results generally show that\nmulti-modal self-supervision tasks offer large performance gains for\nchallenging tasks such as facial expression recognition, while also reducing\nthe amount of manual annotations required. 
We release our pre-trained models as\nwell as source code publicly\n","authors":["Marah Halawa","Florian Blume","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2404.10904v1.pdf","comment":"The paper will appear in the CVPR 2024 workshops proceedings"},{"id":"http://arxiv.org/abs/2404.10896v1","updated":"2024-04-16T20:37:54Z","published":"2024-04-16T20:37:54Z","title":"From a Lossless (~1.5:1) Compression Algorithm for Llama2 7B Weights to\n Variable Precision, Variable Range, Compressed Numeric Data Types for CNNs\n and LLMs","summary":" This paper starts with a simple lossless ~1.5:1 compression algorithm for the\nweights of the Large Language Model (LLM) Llama2 7B [1] that can be implemented\nin ~200 LUTs in AMD FPGAs, processing over 800 million bfloat16 numbers per\nsecond. This framework is then extended to variable precision, variable range,\ncompressed numerical data types that are a user defined super set of both\nfloats and posits [2]. The paper then discusses a simple hardware\nimplementation of such format based on ANS (Asymmetrical Numeral Systems) [3]\nthat acts as a bridge between this flexible data format and a computational\nengine while, at the same time, achieving bandwidth reduction. An example of a\ntoken factory using weight compression and sharing is also given.\n","authors":["Vincenzo Liguori"],"pdf_url":"https://arxiv.org/pdf/2404.10896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10894v1","updated":"2024-04-16T20:37:14Z","published":"2024-04-16T20:37:14Z","title":"Semantics-Aware Attention Guidance for Diagnosing Whole Slide Images","summary":" Accurate cancer diagnosis remains a critical challenge in digital pathology,\nlargely due to the gigapixel size and complex spatial relationships present in\nwhole slide images. Traditional multiple instance learning (MIL) methods often\nstruggle with these intricacies, especially in preserving the necessary context\nfor accurate diagnosis. In response, we introduce a novel framework named\nSemantics-Aware Attention Guidance (SAG), which includes 1) a technique for\nconverting diagnostically relevant entities into attention signals, and 2) a\nflexible attention loss that efficiently integrates various semantically\nsignificant information, such as tissue anatomy and cancerous regions. Our\nexperiments on two distinct cancer datasets demonstrate consistent improvements\nin accuracy, precision, and recall with two state-of-the-art baseline models.\nQualitative analysis further reveals that the incorporation of heuristic\nguidance enables the model to focus on regions critical for diagnosis. SAG is\nnot only effective for the models discussed here, but its adaptability extends\nto any attention-based diagnostic model. This opens up exciting possibilities\nfor further improving the accuracy and efficiency of cancer diagnostics.\n","authors":["Kechun Liu","Wenjun Wu","Joann G. Elmore","Linda G. Shapiro"],"pdf_url":"https://arxiv.org/pdf/2404.10894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10892v1","updated":"2024-04-16T20:30:16Z","published":"2024-04-16T20:30:16Z","title":"Automatic classification of prostate MR series type using image content\n and metadata","summary":" With the wealth of medical image data, efficient curation is essential.\nAssigning the sequence type to magnetic resonance images is necessary for\nscientific studies and artificial intelligence-based analysis. However,\nincomplete or missing metadata prevents effective automation. 
We therefore\npropose a deep-learning method for classification of prostate cancer scanning\nsequences based on a combination of image data and DICOM metadata. We\ndemonstrate superior results compared to metadata or image data alone, and make\nour code publicly available at\nhttps://github.com/deepakri201/DICOMScanClassification.\n","authors":["Deepa Krishnaswamy","Bálint Kovács","Stefan Denner","Steve Pieper","David Clunie","Christopher P. Bridge","Tina Kapur","Klaus H. Maier-Hein","Andrey Fedorov"],"pdf_url":"https://arxiv.org/pdf/2404.10892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10880v1","updated":"2024-04-16T19:59:21Z","published":"2024-04-16T19:59:21Z","title":"HumMUSS: Human Motion Understanding using State Space Models","summary":" Understanding human motion from video is essential for a range of\napplications, including pose estimation, mesh recovery and action recognition.\nWhile state-of-the-art methods predominantly rely on transformer-based\narchitectures, these approaches have limitations in practical scenarios.\nTransformers are slower when sequentially predicting on a continuous stream of\nframes in real-time, and do not generalize to new frame rates. In light of\nthese constraints, we propose a novel attention-free spatiotemporal model for\nhuman motion understanding building upon recent advancements in state space\nmodels. Our model not only matches the performance of transformer-based models\nin various motion understanding tasks but also brings added benefits like\nadaptability to different video frame rates and enhanced training speed when\nworking with longer sequence of keypoints. Moreover, the proposed model\nsupports both offline and real-time applications. For real-time sequential\nprediction, our model is both memory efficient and several times faster than\ntransformer-based approaches while maintaining their high accuracy.\n","authors":["Arnab Kumar Mondal","Stefano Alletto","Denis Tome"],"pdf_url":"https://arxiv.org/pdf/2404.10880v1.pdf","comment":"CVPR 24"},{"id":"http://arxiv.org/abs/2404.10865v1","updated":"2024-04-16T19:29:27Z","published":"2024-04-16T19:29:27Z","title":"OSR-ViT: A Simple and Modular Framework for Open-Set Object Detection\n and Discovery","summary":" An object detector's ability to detect and flag \\textit{novel} objects during\nopen-world deployments is critical for many real-world applications.\nUnfortunately, much of the work in open object detection today is disjointed\nand fails to adequately address applications that prioritize unknown object\nrecall \\textit{in addition to} known-class accuracy. To close this gap, we\npresent a new task called Open-Set Object Detection and Discovery (OSODD) and\nas a solution propose the Open-Set Regions with ViT features (OSR-ViT)\ndetection framework. OSR-ViT combines a class-agnostic proposal network with a\npowerful ViT-based classifier. Its modular design simplifies optimization and\nallows users to easily swap proposal solutions and feature extractors to best\nsuit their application. Using our multifaceted evaluation protocol, we show\nthat OSR-ViT obtains performance levels that far exceed state-of-the-art\nsupervised methods. 
Our method also excels in low-data settings, outperforming\nsupervised baselines using a fraction of the training data.\n","authors":["Matthew Inkawhich","Nathan Inkawhich","Hao Yang","Jingyang Zhang","Randolph Linderman","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10865v1.pdf","comment":"28 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.10864v1","updated":"2024-04-16T19:27:21Z","published":"2024-04-16T19:27:21Z","title":"Vocabulary-free Image Classification and Semantic Segmentation","summary":" Large vision-language models revolutionized image classification and semantic\nsegmentation paradigms. However, they typically assume a pre-defined set of\ncategories, or vocabulary, at test time for composing textual prompts. This\nassumption is impractical in scenarios with unknown or evolving semantic\ncontext. Here, we address this issue and introduce the Vocabulary-free Image\nClassification (VIC) task, which aims to assign a class from an unconstrained\nlanguage-induced semantic space to an input image without needing a known\nvocabulary. VIC is challenging due to the vastness of the semantic space, which\ncontains millions of concepts, including fine-grained categories. To address\nVIC, we propose Category Search from External Databases (CaSED), a\ntraining-free method that leverages a pre-trained vision-language model and an\nexternal database. CaSED first extracts the set of candidate categories from\nthe most semantically similar captions in the database and then assigns the\nimage to the best-matching candidate category according to the same\nvision-language model. Furthermore, we demonstrate that CaSED can be applied\nlocally to generate a coarse segmentation mask that classifies image regions,\nintroducing the task of Vocabulary-free Semantic Segmentation. CaSED and its\nvariants outperform other more complex vision-language models, on\nclassification and semantic segmentation benchmarks, while using much fewer\nparameters.\n","authors":["Alessandro Conti","Enrico Fini","Massimiliano Mancini","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.10864v1.pdf","comment":"Under review, 22 pages, 10 figures, code is available at\n https://github.com/altndrr/vicss. arXiv admin note: text overlap with\n arXiv:2306.00917"},{"id":"http://arxiv.org/abs/2208.11050v3","updated":"2024-04-16T19:16:34Z","published":"2022-08-23T15:57:19Z","title":"Tunable Hybrid Proposal Networks for the Open World","summary":" Current state-of-the-art object proposal networks are trained with a\nclosed-world assumption, meaning they learn to only detect objects of the\ntraining classes. These models fail to provide high recall in open-world\nenvironments where important novel objects may be encountered. While a handful\nof recent works attempt to tackle this problem, they fail to consider that the\noptimal behavior of a proposal network can vary significantly depending on the\ndata and application. Our goal is to provide a flexible proposal solution that\ncan be easily tuned to suit a variety of open-world settings. To this end, we\ndesign a Tunable Hybrid Proposal Network (THPN) that leverages an adjustable\nhybrid architecture, a novel self-training procedure, and dynamic loss\ncomponents to optimize the tradeoff between known and unknown object detection\nperformance. To thoroughly evaluate our method, we devise several new\nchallenges which invoke varying degrees of label bias by altering known class\ndiversity and label count. 
We find that in every task, THPN easily outperforms\nexisting baselines (e.g., RPN, OLN). Our method is also highly data efficient,\nsurpassing baseline recall with a fraction of the labeled data.\n","authors":["Matthew Inkawhich","Nathan Inkawhich","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2208.11050v3.pdf","comment":"Published in WACV 2024. 22 pages, 9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2404.10856v1","updated":"2024-04-16T19:10:40Z","published":"2024-04-16T19:10:40Z","title":"UruDendro, a public dataset of cross-section images of Pinus taeda","summary":" The automatic detection of tree-ring boundaries and other anatomical features\nusing image analysis has progressed substantially over the past decade with\nadvances in machine learning and imagery technology, as well as increasing\ndemands from the dendrochronology community. This paper presents a publicly\navailable database of 64 scanned images of transverse sections of commercially\ngrown Pinus taeda trees from northern Uruguay, ranging from 17 to 24 years old.\nThe collection contains several challenging features for automatic ring\ndetection, including illumination and surface preparation variation, fungal\ninfection (blue stains), knot formation, missing cortex or interruptions in\nouter rings, and radial cracking. This dataset can be used to develop and test\nautomatic tree ring detection algorithms. This paper presents to the\ndendrochronology community one such method, Cross-Section Tree-Ring Detection\n(CS-TRD), which identifies and marks complete annual rings in cross-sections\nfor tree species presenting a clear definition between early and latewood. We\ncompare the CS-TRD performance against the ground truth manual delineation of\nall rings over the UruDendro dataset. The CS-TRD software identified rings with\nan average F-score of 89% and RMSE error of 5.27px for the entire database in\nless than 20 seconds per image. Finally, we propose a robust measure of the\nring growth using the \\emph{equivalent radius} of a circle having the same area\nenclosed by the detected tree ring. Overall, this study contributes to the\ndendrochronologist's toolbox of fast and low-cost methods to automatically\ndetect rings in conifer species, particularly for measuring diameter growth\nrates and stem transverse area using entire cross-sections.\n","authors":["Henry Marichal","Diego Passarella","Christine Lucas","Ludmila Profumo","Verónica Casaravilla","María Noel Rocha Galli","Serrana Ambite","Gregory Randall"],"pdf_url":"https://arxiv.org/pdf/2404.10856v1.pdf","comment":"Submitted to Dendrochronologia. arXiv admin note: text overlap with\n arXiv:2305.10809"},{"id":"http://arxiv.org/abs/2402.02286v2","updated":"2024-04-16T19:07:06Z","published":"2024-02-03T22:51:17Z","title":"Multi-Level Feature Aggregation and Recursive Alignment Network for\n Real-Time Semantic Segmentation","summary":" Real-time semantic segmentation is a crucial research for real-world\napplications. However, many methods lay particular emphasis on reducing the\ncomputational complexity and model size, while largely sacrificing the\naccuracy. To tackle this problem, we propose a parallel inference network\ncustomized for semantic segmentation tasks to achieve a good trade-off between\nspeed and accuracy. We employ a shallow backbone to ensure real-time speed, and\npropose three core components to compensate for the reduced model capacity to\nimprove accuracy. 
Specifically, we first design a dual-pyramidal path\narchitecture (Multi-level Feature Aggregation Module, MFAM) to aggregate\nmulti-level features from the encoder to each scale, providing hierarchical\nclues for subsequent spatial alignment and corresponding in-network inference.\nThen, we build Recursive Alignment Module (RAM) by combining the flow-based\nalignment module with recursive upsampling architecture for accurate spatial\nalignment between multi-scale feature maps with half the computational\ncomplexity of the straightforward alignment method. Finally, we perform\nindependent parallel inference on the aligned features to obtain multi-scale\nscores, and adaptively fuse them through an attention-based Adaptive Scores\nFusion Module (ASFM) so that the final prediction can favor objects of multiple\nscales. Our framework shows a better balance between speed and accuracy than\nstate-of-the-art real-time methods on Cityscapes and CamVid datasets. We also\nconducted systematic ablation studies to gain insight into our motivation and\narchitectural design. Code is available at:\nhttps://github.com/Yanhua-Zhang/MFARANet.\n","authors":["Yanhua Zhang","Ke Zhang","Jingyu Wang","Yulin Wu","Wuwei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02286v2.pdf","comment":"15 pages, 9 figures and 12 Tables. Manuscript completed on April 30,\n 2022"},{"id":"http://arxiv.org/abs/2404.10841v1","updated":"2024-04-16T18:38:23Z","published":"2024-04-16T18:38:23Z","title":"Gasformer: A Transformer-based Architecture for Segmenting Methane\n Emissions from Livestock in Optical Gas Imaging","summary":" Methane emissions from livestock, particularly cattle, significantly\ncontribute to climate change. Effective methane emission mitigation strategies\nare crucial as the global population and demand for livestock products\nincrease. We introduce Gasformer, a novel semantic segmentation architecture\nfor detecting low-flow rate methane emissions from livestock, and controlled\nrelease experiments using optical gas imaging. We present two unique datasets\ncaptured with a FLIR GF77 OGI camera. Gasformer leverages a Mix Vision\nTransformer encoder and a Light-Ham decoder to generate multi-scale features\nand refine segmentation maps. Gasformer outperforms other state-of-the-art\nmodels on both datasets, demonstrating its effectiveness in detecting and\nsegmenting methane plumes in controlled and real-world scenarios. On the\nlivestock dataset, Gasformer achieves mIoU of 88.56%, surpassing other\nstate-of-the-art models. Materials are available at:\ngithub.com/toqitahamid/Gasformer.\n","authors":["Toqi Tahamid Sarker","Mohamed G Embaby","Khaled R Ahmed","Amer AbuGhazaleh"],"pdf_url":"https://arxiv.org/pdf/2404.10841v1.pdf","comment":"9 pages, 5 figures, this paper has been submitted and accepted for\n publication at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.10838v1","updated":"2024-04-16T18:22:49Z","published":"2024-04-16T18:22:49Z","title":"Dynamic Self-adaptive Multiscale Distillation from Pre-trained\n Multimodal Large Model for Efficient Cross-modal Representation Learning","summary":" In recent years, pre-trained multimodal large models have attracted\nwidespread attention due to their outstanding performance in various multimodal\napplications. Nonetheless, the extensive computational resources and vast\ndatasets required for their training present significant hurdles for deployment\nin environments with limited computational resources. 
To address this\nchallenge, we propose, for the first time, a novel dynamic self-adaptive multiscale\ndistillation from a pre-trained multimodal large model for efficient cross-modal\nrepresentation learning. Unlike existing distillation\nmethods, our strategy employs a multiscale perspective, enabling the extraction\nof structural knowledge from the pre-trained multimodal large model and\nensuring that the student model inherits a comprehensive and nuanced\nunderstanding of the teacher's knowledge. To optimize each distillation loss in a\nbalanced and efficient manner, we propose a dynamic self-adaptive distillation\nloss balancer, a novel component that eliminates the need for manual loss weight\nadjustments and dynamically balances each loss item during the distillation\nprocess. Our methodology streamlines pre-trained multimodal large models using\nonly their output features and original image-level information, requiring\nminimal computational resources. This efficient approach is suited for various\napplications and allows the deployment of advanced multimodal technologies even\nin resource-limited settings. Extensive experiments have demonstrated that our\nmethod maintains high performance while significantly reducing model complexity\nand training costs. Moreover, our distilled student model utilizes only\nimage-level information to achieve state-of-the-art performance on cross-modal\nretrieval tasks, surpassing previous methods that relied on region-level\ninformation.\n","authors":["Zhengyang Liang","Meiyu Liang","Wei Huang","Yawen Li","Zhe Xue"],"pdf_url":"https://arxiv.org/pdf/2404.10838v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.10836v1","updated":"2024-04-16T18:15:57Z","published":"2024-04-16T18:15:57Z","title":"Semantic-Based Active Perception for Humanoid Visual Tasks with Foveal\n Sensors","summary":" The aim of this work is to establish how accurately a recent semantic-based\nfoveal active perception model is able to complete visual tasks that are\nregularly performed by humans, namely, scene exploration and visual search.\nThis model exploits the ability of current object detectors to localize and\nclassify a large number of object classes and to update a semantic description\nof a scene across multiple fixations. It has been used previously in scene\nexploration tasks. In this paper, we revisit the model and extend its\napplication to visual search tasks. To illustrate the benefits of using\nsemantic information in scene exploration and visual search tasks, we compare\nits performance against traditional saliency-based models. In the task of scene\nexploration, the semantic-based method demonstrates superior performance\ncompared to the traditional saliency-based model in accurately representing the\nsemantic information present in the visual scene. In visual search experiments,\nsearching for instances of a target class in a visual field containing multiple\ndistractors shows superior performance compared to the saliency-driven model\nand a random gaze selection algorithm. 
Our results demonstrate that semantic\ninformation, from the top-down, influences visual exploration and search tasks\nsignificantly, suggesting a potential area of research for integrating it with\ntraditional bottom-up cues.\n","authors":["João Luzio","Alexandre Bernardino","Plinio Moreno"],"pdf_url":"https://arxiv.org/pdf/2404.10836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18635v2","updated":"2024-04-16T18:07:51Z","published":"2023-11-30T15:43:13Z","title":"DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars","summary":" DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person,\noffering intuitive control over both pose and expression. We propose a\ndiffusion-based neural renderer that leverages generic 2D priors to produce\ncompelling images of faces. For coarse guidance of the expression and head\npose, we render a neural parametric head model (NPHM) from the target\nviewpoint, which acts as a proxy geometry of the person. Additionally, to\nenhance the modeling of intricate facial expressions, we condition\nDiffusionAvatars directly on the expression codes obtained from NPHM via\ncross-attention. Finally, to synthesize consistent surface details across\ndifferent viewpoints and expressions, we rig learnable spatial features to the\nhead's surface via TriPlane lookup in NPHM's canonical space. We train\nDiffusionAvatars on RGB videos and corresponding fitted NPHM meshes of a person\nand test the obtained avatars in both self-reenactment and animation scenarios.\nOur experiments demonstrate that DiffusionAvatars generates temporally\nconsistent and visually appealing videos for novel poses and expressions of a\nperson, outperforming existing approaches.\n","authors":["Tobias Kirschstein","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2311.18635v2.pdf","comment":"Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/\n , Video: https://youtu.be/nSjDiiTnp2E"},{"id":"http://arxiv.org/abs/2404.10518v1","updated":"2024-04-16T12:41:25Z","published":"2024-04-16T12:41:25Z","title":"MobileNetV4 -- Universal Models for the Mobile Ecosystem","summary":" We present the latest generation of MobileNets, known as MobileNetV4 (MNv4),\nfeaturing universally efficient architecture designs for mobile devices. At its\ncore, we introduce the Universal Inverted Bottleneck (UIB) search block, a\nunified and flexible structure that merges Inverted Bottleneck (IB), ConvNext,\nFeed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant.\nAlongside UIB, we present Mobile MQA, an attention block tailored for mobile\naccelerators, delivering a significant 39% speedup. An optimized neural\narchitecture search (NAS) recipe is also introduced which improves MNv4 search\neffectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe\nresults in a new suite of MNv4 models that are mostly Pareto optimal across\nmobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural\nEngine and Google Pixel EdgeTPU - a characteristic not found in any other\nmodels tested. Finally, to further boost accuracy, we introduce a novel\ndistillation technique. 
Enhanced by this technique, our MNv4-Hybrid-Large model\ndelivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just\n3.8ms.\n","authors":["Danfeng Qin","Chas Leichner","Manolis Delakis","Marco Fornoni","Shixin Luo","Fan Yang","Weijun Wang","Colby Banbury","Chengxi Ye","Berkin Akin","Vaibhav Aggarwal","Tenghui Zhu","Daniele Moro","Andrew Howard"],"pdf_url":"https://arxiv.org/pdf/2404.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12407v1","updated":"2024-04-16T17:47:45Z","published":"2024-04-16T17:47:45Z","title":"TV100: A TV Series Dataset that Pre-Trained CLIP Has Not Seen","summary":" The era of pre-trained models has ushered in a wealth of new insights for the\nmachine learning community. Among the myriad of questions that arise, one of\nparamount importance is: 'Do pre-trained models possess comprehensive\nknowledge?' This paper seeks to address this crucial inquiry. In line with our\nobjective, we have made publicly available a novel dataset comprised of images\nfrom TV series released post-2021. This dataset holds significant potential for\nuse in various research areas, including the evaluation of incremental\nlearning, novel class discovery, and long-tailed learning, among others.\nProject page: https://tv-100.github.io/\n","authors":["Da-Wei Zhou","Zhi-Hong Qi","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.12407v1.pdf","comment":"Project page: https://tv-100.github.io/"}]},"2024-04-17T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.11615v1","updated":"2024-04-17T17:59:59Z","published":"2024-04-17T17:59:59Z","title":"Factorized Diffusion: Perceptual Illusions by Noise Decomposition","summary":" Given a factorization of an image into a sum of linear components, we present\na zero-shot method to control each individual component through diffusion model\nsampling. For example, we can decompose an image into low and high spatial\nfrequencies and condition these components on different text prompts. This\nproduces hybrid images, which change appearance depending on viewing distance.\nBy decomposing an image into three frequency subbands, we can generate hybrid\nimages with three prompts. We also use a decomposition into grayscale and color\ncomponents to produce images whose appearance changes when they are viewed in\ngrayscale, a phenomena that naturally occurs under dim lighting. And we explore\na decomposition by a motion blur kernel, which produces images that change\nappearance under motion blurring. Our method works by denoising with a\ncomposite noise estimate, built from the components of noise estimates\nconditioned on different prompts. We also show that for certain decompositions,\nour method recovers prior approaches to compositional generation and spatial\ncontrol. Finally, we show that we can extend our approach to generate hybrid\nimages from real images. We do this by holding one component fixed and\ngenerating the remaining components, effectively solving an inverse problem.\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2404.11615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11614v1","updated":"2024-04-17T17:59:55Z","published":"2024-04-17T17:59:55Z","title":"Dynamic Typography: Bringing Words to Life","summary":" Text animation serves as an expressive medium, transforming static\ncommunication into dynamic experiences by infusing words with motion to evoke\nemotions, emphasize meanings, and construct compelling narratives. 
Crafting\nanimations that are semantically aware poses significant challenges, demanding\nexpertise in graphic design and animation. We present an automated text\nanimation scheme, termed \"Dynamic Typography\", which combines two challenging\ntasks. It deforms letters to convey semantic meaning and infuses them with\nvibrant movements based on user prompts. Our technique harnesses vector\ngraphics representations and an end-to-end optimization-based framework. This\nframework employs neural displacement fields to convert letters into base\nshapes and applies per-frame motion, encouraging coherence with the intended\ntextual concept. Shape preservation techniques and perceptual loss\nregularization are employed to maintain legibility and structural integrity\nthroughout the animation process. We demonstrate the generalizability of our\napproach across various text-to-video models and highlight the superiority of\nour end-to-end methodology over baseline methods, which might comprise separate\ntasks. Through quantitative and qualitative evaluations, we demonstrate the\neffectiveness of our framework in generating coherent text animations that\nfaithfully interpret user prompts while maintaining readability. Our code is\navailable at: https://animate-your-word.github.io/demo/.\n","authors":["Zichen Liu","Yihao Meng","Hao Ouyang","Yue Yu","Bolin Zhao","Daniel Cohen-Or","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2404.11614v1.pdf","comment":"Our demo page is available at:\n https://animate-your-word.github.io/demo/"},{"id":"http://arxiv.org/abs/2404.11613v1","updated":"2024-04-17T17:59:53Z","published":"2024-04-17T17:59:53Z","title":"InFusion: Inpainting 3D Gaussians via Learning Depth Completion from\n Diffusion Prior","summary":" 3D Gaussians have recently emerged as an efficient representation for novel\nview synthesis. This work studies its editability with a particular focus on\nthe inpainting task, which aims to supplement an incomplete set of 3D Gaussians\nwith additional points for visually harmonious rendering. Compared to 2D\ninpainting, the crux of inpainting 3D Gaussians is to figure out the\nrendering-relevant properties of the introduced points, whose optimization\nlargely benefits from their initial 3D positions. To this end, we propose to\nguide the point initialization with an image-conditioned depth completion\nmodel, which learns to directly restore the depth map based on the observed\nimage. Such a design allows our model to fill in depth values at an aligned\nscale with the original depth, and also to harness strong generalizability from\nlargescale diffusion prior. Thanks to the more accurate depth completion, our\napproach, dubbed InFusion, surpasses existing alternatives with sufficiently\nbetter fidelity and efficiency under various complex scenarios. 
We further\ndemonstrate the effectiveness of InFusion with several practical applications,\nsuch as inpainting with user-specific texture or with novel object insertion.\n","authors":["Zhiheng Liu","Hao Ouyang","Qiuyu Wang","Ka Leong Cheng","Jie Xiao","Kai Zhu","Nan Xue","Yu Liu","Yujun Shen","Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2404.11613v1.pdf","comment":"Project page: https://johanan528.github.io/Infusion"},{"id":"http://arxiv.org/abs/2311.16278v3","updated":"2024-04-17T17:58:59Z","published":"2023-11-27T19:34:04Z","title":"VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle\n Re-identification","summary":" Vehicle Re-identification (Re-ID) has been broadly studied in the last\ndecade; however, the different camera view angle leading to confused\ndiscrimination in the feature subspace for the vehicles of various poses, is\nstill challenging for the Vehicle Re-ID models in the real world. To promote\nthe Vehicle Re-ID models, this paper proposes to synthesize a large number of\nvehicle images in the target pose, whose idea is to project the vehicles of\ndiverse poses into the unified target pose so as to enhance feature\ndiscrimination. Considering that the paired data of the same vehicles in\ndifferent traffic surveillance cameras might be not available in the real\nworld, we propose the first Pair-flexible Pose Guided Image Synthesis method\nfor Vehicle Re-ID, named as VehicleGAN in this paper, which works for both\nsupervised and unsupervised settings without the knowledge of geometric 3D\nmodels. Because of the feature distribution difference between real and\nsynthetic data, simply training a traditional metric learning based Re-ID model\nwith data-level fusion (i.e., data augmentation) is not satisfactory, therefore\nwe propose a new Joint Metric Learning (JML) via effective feature-level fusion\nfrom both real and synthetic data. Intensive experimental results on the public\nVeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our\nproposed VehicleGAN and JML.\n","authors":["Baolu Li","Ping Liu","Lan Fu","Jinlong Li","Jianwu Fang","Zhigang Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.16278v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11605v1","updated":"2024-04-17T17:54:49Z","published":"2024-04-17T17:54:49Z","title":"VG4D: Vision-Language Model Goes 4D Video Recognition","summary":" Understanding the real world through point cloud video is a crucial aspect of\nrobotics and autonomous driving systems. However, prevailing methods for 4D\npoint cloud recognition have limitations due to sensor resolution, which leads\nto a lack of detailed information. Recent advances have shown that\nVision-Language Models (VLM) pre-trained on web-scale text-image datasets can\nlearn fine-grained visual concepts that can be transferred to various\ndownstream tasks. However, effectively integrating VLM into the domain of 4D\npoint clouds remains an unresolved problem. In this work, we propose the\nVision-Language Models Goes 4D (VG4D) framework to transfer VLM knowledge from\nvisual-text pre-trained models to a 4D point cloud network. Our approach\ninvolves aligning the 4D encoder's representation with a VLM to learn a shared\nvisual and text space from training on large-scale image-text pairs. By\ntransferring the knowledge of the VLM to the 4D encoder and combining the VLM,\nour VG4D achieves improved recognition performance. 
To enhance the 4D encoder,\nwe modernize the classic dynamic point cloud backbone and propose an improved\nversion of PSTNet, im-PSTNet, which can efficiently model point cloud videos.\nExperiments demonstrate that our method achieves state-of-the-art performance\nfor action recognition on both the NTU RGB+D 60 dataset and the NTU RGB+D 120\ndataset. Code is available at \\url{https://github.com/Shark0-0/VG4D}.\n","authors":["Zhichao Deng","Xiangtai Li","Xia Li","Yunhai Tong","Shen Zhao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11605v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2112.06979v2","updated":"2024-04-17T17:50:54Z","published":"2021-12-13T19:25:16Z","title":"The Brain Tumor Sequence Registration (BraTS-Reg) Challenge:\n Establishing Correspondence Between Pre-Operative and Follow-up MRI Scans of\n Diffuse Glioma Patients","summary":" Registration of longitudinal brain MRI scans containing pathologies is\nchallenging due to dramatic changes in tissue appearance. Although there has\nbeen progress in developing general-purpose medical image registration\ntechniques, they have not yet attained the requisite precision and reliability\nfor this task, highlighting its inherent complexity. Here we describe the Brain\nTumor Sequence Registration (BraTS-Reg) challenge, as the first public\nbenchmark environment for deformable registration algorithms focusing on\nestimating correspondences between pre-operative and follow-up scans of the\nsame patient diagnosed with a diffuse brain glioma. The BraTS-Reg data comprise\nde-identified multi-institutional multi-parametric MRI (mpMRI) scans, curated\nfor size and resolution according to a canonical anatomical template, and\ndivided into training, validation, and testing sets. Clinical experts annotated\nground truth (GT) landmark points of anatomical locations distinct across the\ntemporal domain. Quantitative evaluation and ranking were based on the Median\nEuclidean Error (MEE), Robustness, and the determinant of the Jacobian of the\ndisplacement field. The top-ranked methodologies yielded similar performance\nacross all evaluation metrics and shared several methodological commonalities,\nincluding pre-alignment, deep neural networks, inverse consistency analysis,\nand test-time instance optimization per-case basis as a post-processing step.\nThe top-ranked method attained the MEE at or below that of the inter-rater\nvariability for approximately 60% of the evaluated landmarks, underscoring the\nscope for further accuracy and robustness improvements, especially relative to\nhuman experts. The aim of BraTS-Reg is to continue to serve as an active\nresource for research, with the data and online evaluation tools accessible at\nhttps://bratsreg.github.io/.\n","authors":["Bhakti Baheti","Satrajit Chakrabarty","Hamed Akbari","Michel Bilello","Benedikt Wiestler","Julian Schwarting","Evan Calabrese","Jeffrey Rudie","Syed Abidi","Mina Mousa","Javier Villanueva-Meyer","Brandon K. K. Fields","Florian Kofler","Russell Takeshi Shinohara","Juan Eugenio Iglesias","Tony C. W. Mok","Albert C. S. Chung","Marek Wodzinski","Artur Jurgas","Niccolo Marini","Manfredo Atzori","Henning Muller","Christoph Grobroehmer","Hanna Siebert","Lasse Hansen","Mattias P. Heinrich","Luca Canalini","Jan Klein","Annika Gerken","Stefan Heldmann","Alessa Hering","Horst K. Hahn","Mingyuan Meng","Lei Bi","Dagan Feng","Jinman Kim","Ramy A. Zeineldin","Mohamed E. 
Karar","Franziska Mathis-Ullrich","Oliver Burgert","Javid Abderezaei","Aymeric Pionteck","Agamdeep Chopra","Mehmet Kurt","Kewei Yan","Yonghong Yan","Zhe Tang","Jianqiang Ma","Sahar Almahfouz Nasser","Nikhil Cherian Kurian","Mohit Meena","Saqib Shamsi","Amit Sethi","Nicholas J. Tustison","Brian B. Avants","Philip Cook","James C. Gee","Lin Tian","Hastings Greer","Marc Niethammer","Andrew Hoopes","Malte Hoffmann","Adrian V. Dalca","Stergios Christodoulidis","Theo Estiene","Maria Vakalopoulou","Nikos Paragios","Daniel S. Marcus","Christos Davatzikos","Aristeidis Sotiras","Bjoern Menze","Spyridon Bakas","Diana Waldmannstetter"],"pdf_url":"https://arxiv.org/pdf/2112.06979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11599v1","updated":"2024-04-17T17:50:24Z","published":"2024-04-17T17:50:24Z","title":"Variational Bayesian Last Layers","summary":" We introduce a deterministic variational formulation for training Bayesian\nlast layer neural networks. This yields a sampling-free, single-pass model and\nloss that effectively improves uncertainty estimation. Our variational Bayesian\nlast layer (VBLL) can be trained and evaluated with only quadratic complexity\nin last layer width, and is thus (nearly) computationally free to add to\nstandard architectures. We experimentally investigate VBLLs, and show that they\nimprove predictive accuracy, calibration, and out of distribution detection\nover baselines across both regression and classification. Finally, we\ninvestigate combining VBLL layers with variational Bayesian feature learning,\nyielding a lower variance collapsed variational inference method for Bayesian\nneural networks.\n","authors":["James Harrison","John Willes","Jasper Snoek"],"pdf_url":"https://arxiv.org/pdf/2404.11599v1.pdf","comment":"International Conference on Learning Representations (ICLR) 2024"},{"id":"http://arxiv.org/abs/2404.11593v1","updated":"2024-04-17T17:45:08Z","published":"2024-04-17T17:45:08Z","title":"IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under\n Unknown Illumination","summary":" This paper aims to recover object materials from posed images captured under\nan unknown static lighting condition. Recent methods solve this task by\noptimizing material parameters through differentiable physically based\nrendering. However, due to the coupling between object geometry, materials, and\nenvironment lighting, there is inherent ambiguity during the inverse rendering\nprocess, preventing previous methods from obtaining accurate results. To\novercome this ill-posed problem, our key idea is to learn the material prior\nwith a generative model for regularizing the optimization process. We observe\nthat the general rendering equation can be split into diffuse and specular\nshading terms, and thus formulate the material prior as diffusion models of\nalbedo and specular. Thanks to this design, our model can be trained using the\nexisting abundant 3D object data, and naturally acts as a versatile tool to\nresolve the ambiguity when recovering material representations from RGB images.\nIn addition, we develop a coarse-to-fine training strategy that leverages\nestimated materials to guide diffusion models to satisfy multi-view consistent\nconstraints, leading to more stable and accurate results. Extensive experiments\non real-world and synthetic datasets demonstrate that our approach achieves\nstate-of-the-art performance on material recovery. 
The code will be available\nat https://zju3dv.github.io/IntrinsicAnything.\n","authors":["Xi Chen","Sida Peng","Dongchen Yang","Yuan Liu","Bowen Pan","Chengfei Lv","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.11593v1.pdf","comment":"Project page: https://zju3dv.github.io/IntrinsicAnything"},{"id":"http://arxiv.org/abs/2312.02255v2","updated":"2024-04-17T17:44:44Z","published":"2023-12-04T18:56:08Z","title":"Re-Nerfing: Improving Novel Views Synthesis through Novel Views\n Synthesis","summary":" Neural Radiance Fields (NeRFs) have shown remarkable novel view synthesis\ncapabilities even in large-scale, unbounded scenes, albeit requiring hundreds\nof views or introducing artifacts in sparser settings. Their optimization\nsuffers from shape-radiance ambiguities wherever only a small visual overlap is\navailable. This leads to erroneous scene geometry and artifacts. In this paper,\nwe propose Re-Nerfing, a simple and general multi-stage data augmentation\napproach that leverages NeRF's own view synthesis ability to address these\nlimitations. With Re-Nerfing, we enhance the geometric consistency of novel\nviews as follows: First, we train a NeRF with the available views. Then, we use\nthe optimized NeRF to synthesize pseudo-views around the original ones with a\nview selection strategy to improve coverage and preserve view quality. Finally,\nwe train a second NeRF with both the original images and the pseudo views\nmasking out uncertain regions. Extensive experiments applying Re-Nerfing on\nvarious pipelines on the mip-NeRF 360 dataset, including Gaussian Splatting,\nprovide valuable insights into the improvements achievable without external\ndata or supervision, on denser and sparser input scenarios. Project page:\nhttps://renerfing.github.io\n","authors":["Felix Tristram","Stefano Gasperini","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.02255v2.pdf","comment":"Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2310.01040v3","updated":"2024-04-17T17:44:24Z","published":"2023-10-02T09:33:54Z","title":"Segmenting the motion components of a video: A long-term unsupervised\n model","summary":" Human beings have the ability to continuously analyze a video and immediately\nextract the motion components. We want to adopt this paradigm to provide a\ncoherent and stable motion segmentation over the video sequence. In this\nperspective, we propose a novel long-term spatio-temporal model operating in a\ntotally unsupervised way. It takes as input the volume of consecutive optical\nflow (OF) fields, and delivers a volume of segments of coherent motion over the\nvideo. More specifically, we have designed a transformer-based network, where\nwe leverage a mathematically well-founded framework, the Evidence Lower Bound\n(ELBO), to derive the loss function. The loss function combines a flow\nreconstruction term involving spatio-temporal parametric motion models\ncombining, in a novel way, polynomial (quadratic) motion models for the spatial\ndimensions and B-splines for the time dimension of the video sequence, and a\nregularization term enforcing temporal consistency on the segments. We report\nexperiments on four VOS benchmarks, demonstrating competitive quantitative\nresults, while performing motion segmentation on a whole sequence in one go. 
We\nalso highlight, through visual results, the key contributions to temporal\nconsistency brought by our method.\n","authors":["Etienne Meunier","Patrick Bouthemy"],"pdf_url":"https://arxiv.org/pdf/2310.01040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11590v1","updated":"2024-04-17T17:39:59Z","published":"2024-04-17T17:39:59Z","title":"A Subspace-Constrained Tyler's Estimator and its Applications to\n Structure from Motion","summary":" We present the subspace-constrained Tyler's estimator (STE) designed for\nrecovering a low-dimensional subspace within a dataset that may be highly\ncorrupted with outliers. STE is a fusion of Tyler's M-estimator (TME) and a\nvariant of the fast median subspace. Our theoretical analysis suggests that,\nunder a common inlier-outlier model, STE can effectively recover the underlying\nsubspace, even when it contains a smaller fraction of inliers relative to other\nmethods in the field of robust subspace recovery. We apply STE in the context\nof Structure from Motion (SfM) in two ways: for robust estimation of the\nfundamental matrix and for the removal of outlying cameras, enhancing the\nrobustness of the SfM pipeline. Numerical experiments confirm the\nstate-of-the-art performance of our method in these applications. This research\nmakes significant contributions to the field of robust subspace recovery,\nparticularly in the context of computer vision and 3D reconstruction.\n","authors":["Feng Yu","Teng Zhang","Gilad Lerman"],"pdf_url":"https://arxiv.org/pdf/2404.11590v1.pdf","comment":"23 pages, accepted by CVPR 24"},{"id":"http://arxiv.org/abs/2404.11589v1","updated":"2024-04-17T17:38:56Z","published":"2024-04-17T17:38:56Z","title":"Prompt Optimizer of Text-to-Image Diffusion Models for Abstract Concept\n Understanding","summary":" The rapid evolution of text-to-image diffusion models has opened the door of\ngenerative AI, enabling the translation of textual descriptions into visually\ncompelling images with remarkable quality. However, a persistent challenge\nwithin this domain is the optimization of prompts to effectively translate\nabstract concepts into concrete objects. For example, text encoders can hardly\nexpress \"peace\", while they can easily illustrate olive branches and white doves.\nThis paper introduces a novel approach named Prompt Optimizer for Abstract\nConcepts (POAC) specifically designed to enhance the performance of\ntext-to-image diffusion models in interpreting and generating images from\nabstract concepts. We propose a Prompt Language Model (PLM), which is\ninitialized from a pre-trained language model, and then fine-tuned with a\ncurated dataset of abstract concept prompts. The dataset is created with GPT-4\nto extend the abstract concept to a scene and concrete objects. Our framework\nemploys a Reinforcement Learning (RL)-based optimization strategy, focusing on\nthe alignment between the images generated by a stable diffusion model and the\noptimized prompts. Through extensive experiments, we demonstrate that our\nproposed POAC significantly improves the accuracy and aesthetic quality of\ngenerated images, particularly in the description of abstract concepts and\nalignment with optimized prompts. 
We also present a comprehensive analysis of\nour model's performance across diffusion models under different settings,\nshowcasing its versatility and effectiveness in enhancing abstract concept\nrepresentation.\n","authors":["Zezhong Fan","Xiaohan Li","Chenhao Fang","Topojoy Biswas","Kaushiki Nag","Jianpeng Xu","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2404.11589v1.pdf","comment":"WWW 2024 Companion"},{"id":"http://arxiv.org/abs/2206.10177v3","updated":"2024-04-17T17:36:19Z","published":"2022-06-21T08:16:08Z","title":"TCJA-SNN: Temporal-Channel Joint Attention for Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are attracting widespread interest due to\ntheir biological plausibility, energy efficiency, and powerful spatio-temporal\ninformation representation ability. Given the critical role of attention\nmechanisms in enhancing neural network performance, the integration of SNNs and\nattention mechanisms exhibits potential to deliver energy-efficient and\nhigh-performance computing paradigms. We present a novel Temporal-Channel Joint\nAttention mechanism for SNNs, referred to as TCJA-SNN. The proposed TCJA-SNN\nframework can effectively assess the significance of spike sequences from both\nspatial and temporal dimensions. More specifically, our essential technical\ncontribution lies in: 1) We employ the squeeze operation to compress the spike\nstream into an average matrix. Then, we leverage two local attention mechanisms\nbased on efficient 1D convolutions to facilitate comprehensive feature\nextraction at the temporal and channel levels independently. 2) We introduce\nthe Cross Convolutional Fusion (CCF) layer as a novel approach to model the\ninter-dependencies between the temporal and channel scopes. This layer breaks\nthe independence of these two dimensions and enables the interaction between\nfeatures. Experimental results demonstrate that the proposed TCJA-SNN\noutperforms the SOTA by up to 15.7% in accuracy on standard static and neuromorphic\ndatasets, including Fashion-MNIST, CIFAR10-DVS, N-Caltech 101, and DVS128\nGesture. Furthermore, we apply the TCJA-SNN framework to image generation tasks\nby leveraging a variational autoencoder. To the best of our knowledge, this study\nis the first instance where the SNN-attention mechanism has been employed for\nimage classification and generation tasks. Notably, our approach has achieved\nSOTA performance in both domains, establishing a significant advancement in the\nfield. Codes are available at https://github.com/ridgerchu/TCJA.\n","authors":["Rui-Jie Zhu","Malu Zhang","Qihang Zhao","Haoyu Deng","Yule Duan","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2206.10177v3.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2404.11576v1","updated":"2024-04-17T17:19:48Z","published":"2024-04-17T17:19:48Z","title":"State-space Decomposition Model for Video Prediction Considering\n Long-term Motion Trend","summary":" Stochastic video prediction enables the consideration of uncertainty in\nfuture motion, thereby providing a better reflection of the dynamic nature of\nthe environment. Stochastic video prediction methods based on image\nauto-regressive recurrent models need to feed their predictions back into the\nlatent space. Conversely, state-space models, which decouple frame\nsynthesis and temporal prediction, prove to be more efficient. 
However,\ninferring long-term temporal information about motion and generalizing to\ndynamic scenarios under non-stationary assumptions remains an unresolved\nchallenge. In this paper, we propose a state-space decomposition stochastic\nvideo prediction model that decomposes the overall video frame generation into\ndeterministic appearance prediction and stochastic motion prediction. Through\nadaptive decomposition, the model's generalization capability to dynamic\nscenarios is enhanced. In the context of motion prediction, obtaining a prior\non the long-term trend of future motion is crucial. Thus, in the stochastic\nmotion prediction branch, we infer the long-term motion trend from conditional\nframes to guide the generation of future frames that exhibit high consistency\nwith the conditional frames. Experimental results demonstrate that our model\noutperforms baselines on multiple datasets.\n","authors":["Fei Cui","Jiaojiao Fang","Xiaojiang Wu","Zelong Lai","Mengke Yang","Menghan Jia","Guizhong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11569v1","updated":"2024-04-17T17:11:47Z","published":"2024-04-17T17:11:47Z","title":"Simple Image Signal Processing using Global Context Guidance","summary":" In modern smartphone cameras, the Image Signal Processor (ISP) is the core\nelement that converts the RAW readings from the sensor into perceptually\npleasant RGB images for the end users. The ISP is typically proprietary and\nhandcrafted and consists of several blocks such as white balance, color\ncorrection, and tone mapping. Deep learning-based ISPs aim to transform RAW\nimages into DSLR-like RGB images using deep neural networks. However, most\nlearned ISPs are trained using patches (small regions) due to computational\nlimitations. Such methods lack global context, which limits their efficacy on\nfull-resolution images and harms their ability to capture global properties\nsuch as color constancy or illumination. First, we propose a novel module that\ncan be integrated into any neural ISP to capture the global context information\nfrom the full RAW images. Second, we propose an efficient and simple neural ISP\nthat utilizes our proposed module. Our model achieves state-of-the-art results\non different benchmarks using diverse and real smartphone images.\n","authors":["Omar Elezabi","Marcos V. Conde","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.11569v1.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2404.11565v1","updated":"2024-04-17T17:08:05Z","published":"2024-04-17T17:08:05Z","title":"MoA: Mixture-of-Attention for Subject-Context Disentanglement in\n Personalized Image Generation","summary":" We introduce a new architecture for personalization of text-to-image\ndiffusion models, coined Mixture-of-Attention (MoA). Inspired by the\nMixture-of-Experts mechanism utilized in large language models (LLMs), MoA\ndistributes the generation workload between two attention pathways: a\npersonalized branch and a non-personalized prior branch. MoA is designed to\nretain the original model's prior by fixing its attention layers in the prior\nbranch, while minimally intervening in the generation process with the\npersonalized branch that learns to embed subjects in the layout and context\ngenerated by the prior branch. A novel routing mechanism manages the\ndistribution of pixels in each layer across these branches to optimize the\nblend of personalized and generic content creation. 
Once trained, MoA\nfacilitates the creation of high-quality, personalized images featuring\nmultiple subjects with compositions and interactions as diverse as those\ngenerated by the original model. Crucially, MoA enhances the distinction\nbetween the model's pre-existing capability and the newly augmented\npersonalized intervention, thereby offering a more disentangled subject-context\ncontrol that was previously unattainable. Project page:\nhttps://snap-research.github.io/mixture-of-attention\n","authors":[" Kuan-Chieh"," Wang","Daniil Ostashev","Yuwei Fang","Sergey Tulyakov","Kfir Aberman"],"pdf_url":"https://arxiv.org/pdf/2404.11565v1.pdf","comment":"Project Website: https://snap-research.github.io/mixture-of-attention"},{"id":"http://arxiv.org/abs/2404.11554v1","updated":"2024-04-17T16:56:31Z","published":"2024-04-17T16:56:31Z","title":"Predicting Long-horizon Futures by Conditioning on Geometry and Time","summary":" Our work explores the task of generating future sensor observations\nconditioned on the past. We are motivated by `predictive coding' concepts from\nneuroscience as well as robotic applications such as self-driving vehicles.\nPredictive video modeling is challenging because the future may be multi-modal\nand learning at scale remains computationally expensive for video processing.\nTo address both challenges, our key insight is to leverage the large-scale\npretraining of image diffusion models which can handle multi-modality. We\nrepurpose image models for video prediction by conditioning on new frame\ntimestamps. Such models can be trained with videos of both static and dynamic\nscenes. To allow them to be trained with modestly-sized datasets, we introduce\ninvariances by factoring out illumination and texture by forcing the model to\npredict (pseudo) depth, readily obtained for in-the-wild videos via\noff-the-shelf monocular depth networks. In fact, we show that simply modifying\nnetworks to predict grayscale pixels already improves the accuracy of video\nprediction. Given the extra controllability with timestamp conditioning, we\npropose sampling schedules that work better than the traditional autoregressive\nand hierarchical sampling strategies. Motivated by probabilistic metrics from\nthe object forecasting literature, we create a benchmark for video prediction\non a diverse set of videos spanning indoor and outdoor scenes and a large\nvocabulary of objects. Our experiments illustrate the effectiveness of learning\nto condition on timestamps, and show the importance of predicting the future\nwith invariant modalities.\n","authors":["Tarasha Khurana","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2404.11554v1.pdf","comment":"Project page: http://www.cs.cmu.edu/~tkhurana/depthforecasting/"},{"id":"http://arxiv.org/abs/2403.11376v4","updated":"2024-04-17T16:46:02Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). 
Our observation shows that the\nutilization of amodal features through the amodal-to-visible can confuse the\nvisible features due to the extra information of occluded/hidden segments not\npresented in visible display. Consequently, this compromised quality of visible\nfeatures during the subsequent visible-to-amodal transition. To tackle this\nissue, we introduce ShapeFormer, a decoupled Transformer-based model with a\nvisible-to-amodal transition. It facilitates the explicit relationship between\noutput segmentations and avoids the need for amodal-to-visible transitions.\nShapeFormer comprises three key modules: (i) Visible-Occluding Mask Head for\npredicting visible segmentation with occlusion awareness, (ii) Shape-Prior\nAmodal Mask Head for predicting amodal and occluded masks, and (iii)\nCategory-Specific Shape Prior Retriever aims to provide shape prior knowledge.\nComprehensive experiments and extensive ablation studies across various AIS\nbenchmarks demonstrate the effectiveness of our ShapeFormer. The code is\navailable at: \\url{https://github.com/UARK-AICV/ShapeFormer}\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v4.pdf","comment":"Accepted to IJCNN2024"},{"id":"http://arxiv.org/abs/2312.03678v2","updated":"2024-04-17T16:37:44Z","published":"2023-12-06T18:41:01Z","title":"Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching","summary":" Non-isometric shape correspondence remains a fundamental challenge in\ncomputer vision. Traditional methods using Laplace-Beltrami operator (LBO)\neigenmodes face limitations in characterizing high-frequency extrinsic shape\nchanges like bending and creases. We propose a novel approach of combining the\nnon-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell\nhessian with the intrinsic ones of the LBO, creating a hybrid spectral space in\nwhich we construct functional maps. To this end, we present a theoretical\nframework to effectively integrate non-orthogonal basis functions into\ndescriptor- and learning-based functional map methods. Our approach can be\nincorporated easily into existing functional map pipelines across varying\napplications and is able to handle complex deformations beyond isometries. We\nshow extensive evaluations across various supervised and unsupervised settings\nand demonstrate significant improvements. Notably, our approach achieves up to\n15% better mean geodesic error for non-isometric correspondence settings and up\nto 45% improvement in scenarios with topological noise.\n","authors":["Lennart Bastian","Yizheng Xie","Nassir Navab","Zorah Lähner"],"pdf_url":"https://arxiv.org/pdf/2312.03678v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11537v1","updated":"2024-04-17T16:30:56Z","published":"2024-04-17T16:30:56Z","title":"SSDiff: Spatial-spectral Integrated Diffusion Model for Remote Sensing\n Pansharpening","summary":" Pansharpening is a significant image fusion technique that merges the spatial\ncontent and spectral characteristics of remote sensing images to generate\nhigh-resolution multispectral images. Recently, denoising diffusion\nprobabilistic models have been gradually applied to visual tasks, enhancing\ncontrollable image generation through low-rank adaptation (LoRA). 
In this\npaper, we introduce a spatial-spectral integrated diffusion model for the\nremote sensing pansharpening task, called SSDiff, which considers the\npansharpening process as the fusion process of spatial and spectral components\nfrom the perspective of subspace decomposition. Specifically, SSDiff utilizes\nspatial and spectral branches to learn spatial details and spectral features\nseparately, then employs a designed alternating projection fusion module (APFM)\nto accomplish the fusion. Furthermore, we propose a frequency modulation\ninter-branch module (FMIM) to modulate the frequency distribution between\nbranches. The two components of SSDiff can perform favorably against the APFM\nwhen utilizing a LoRA-like branch-wise alternative fine-tuning method. It\nrefines SSDiff to capture component-discriminating features more sufficiently.\nFinally, extensive experiments on four commonly used datasets, i.e.,\nWorldView-3, WorldView-2, GaoFen-2, and QuickBird, demonstrate the superiority\nof SSDiff both visually and quantitatively. The code will be made open source\nafter possible acceptance.\n","authors":["Yu Zhong","Xiao Wu","Liang-Jian Deng","Zihan Cao"],"pdf_url":"https://arxiv.org/pdf/2404.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11525v1","updated":"2024-04-17T16:16:12Z","published":"2024-04-17T16:16:12Z","title":"JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on\n Long-Tailed OCTA","summary":" The oxygen saturation level in the blood (SaO2) is crucial for health,\nparticularly in relation to sleep-related breathing disorders. However,\ncontinuous monitoring of SaO2 is time-consuming and highly variable depending\non patients' conditions. Recently, optical coherence tomography angiography\n(OCTA) has shown promising development in rapidly and effectively screening\neye-related lesions, offering the potential for diagnosing sleep-related\ndisorders. To bridge this gap, our paper presents three key contributions.\nFirstly, we propose JointViT, a novel model based on the Vision Transformer\narchitecture, incorporating a joint loss function for supervision. Secondly, we\nintroduce a balancing augmentation technique during data preprocessing to\nimprove the model's performance, particularly on the long-tail distribution\nwithin the OCTA dataset. Lastly, through comprehensive experiments on the OCTA\ndataset, our proposed method significantly outperforms other state-of-the-art\nmethods, achieving improvements of up to 12.28% in overall accuracy. This\nadvancement lays the groundwork for the future utilization of OCTA in\ndiagnosing sleep-related disorders. See project website\nhttps://steve-zeyu-zhang.github.io/JointViT\n","authors":["Zeyu Zhang","Xuyin Qi","Mingxi Chen","Guangxi Li","Ryan Pham","Ayub Zuhair","Ella Berry","Zhibin Liao","Owen Siggs","Robert Mclaughlin","Jamie Craig","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2404.11525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05889v2","updated":"2024-04-17T16:13:22Z","published":"2023-12-10T13:44:03Z","title":"SuperPrimitive: Scene Reconstruction at a Primitive Level","summary":" Joint camera pose and dense geometry estimation from a set of images or a\nmonocular video remains a challenging problem due to its computational\ncomplexity and inherent visual ambiguities. Most dense incremental\nreconstruction systems operate directly on image pixels and solve for their 3D\npositions using multi-view geometry cues. 
Such pixel-level approaches suffer\nfrom ambiguities or violations of multi-view consistency (e.g. caused by\ntextureless or specular surfaces).\n We address this issue with a new image representation which we call a\nSuperPrimitive. SuperPrimitives are obtained by splitting images into\nsemantically correlated local regions and enhancing them with estimated surface\nnormal directions, both of which are predicted by state-of-the-art single image\nneural networks. This provides a local geometry estimate per SuperPrimitive,\nwhile their relative positions are adjusted based on multi-view observations.\n We demonstrate the versatility of our new representation by addressing three\n3D reconstruction tasks: depth completion, few-view structure from motion, and\nmonocular dense visual odometry.\n","authors":["Kirill Mazur","Gwangbin Bae","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2312.05889v2.pdf","comment":"CVPR2024. Project Page: https://makezur.github.io/SuperPrimitive/"},{"id":"http://arxiv.org/abs/2404.11511v1","updated":"2024-04-17T16:06:29Z","published":"2024-04-17T16:06:29Z","title":"Event Cameras Meet SPADs for High-Speed, Low-Bandwidth Imaging","summary":" Traditional cameras face a trade-off between low-light performance and\nhigh-speed imaging: longer exposure times to capture sufficient light results\nin motion blur, whereas shorter exposures result in Poisson-corrupted noisy\nimages. While burst photography techniques help mitigate this tradeoff,\nconventional cameras are fundamentally limited in their sensor noise\ncharacteristics. Event cameras and single-photon avalanche diode (SPAD) sensors\nhave emerged as promising alternatives to conventional cameras due to their\ndesirable properties. SPADs are capable of single-photon sensitivity with\nmicrosecond temporal resolution, and event cameras can measure brightness\nchanges up to 1 MHz with low bandwidth requirements. We show that these\nproperties are complementary, and can help achieve low-light, high-speed image\nreconstruction with low bandwidth requirements. We introduce a sensor fusion\nframework to combine SPADs with event cameras to improve the reconstruction of\nhigh-speed, low-light scenes while reducing the high bandwidth cost associated\nwith using every SPAD frame. Our evaluation, on both synthetic and real sensor\ndata, demonstrates significant enhancements ( > 5 dB PSNR) in reconstructing\nlow-light scenes at high temporal resolution (100 kHz) compared to conventional\ncameras. Event-SPAD fusion shows great promise for real-world applications,\nsuch as robotics or medical imaging.\n","authors":["Manasi Muglikar","Siddharth Somasundaram","Akshat Dave","Edoardo Charbon","Ramesh Raskar","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2404.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06567v3","updated":"2024-04-17T15:58:36Z","published":"2024-03-11T10:06:45Z","title":"Leveraging Foundation Models for Content-Based Medical Image Retrieval\n in Radiology","summary":" Content-based image retrieval (CBIR) has the potential to significantly\nimprove diagnostic aid and medical research in radiology. Current CBIR systems\nface limitations due to their specialization to certain pathologies, limiting\ntheir utility. In response, we propose using vision foundation models as\npowerful and versatile off-the-shelf feature extractors for content-based\nmedical image retrieval. 
By benchmarking these models on a comprehensive\ndataset of 1.6 million 2D radiological images spanning four modalities and 161\npathologies, we identify weakly-supervised models as superior, achieving a P@1\nof up to 0.594. This performance not only competes with a specialized model but\ndoes so without the need for fine-tuning. Our analysis further explores the\nchallenges in retrieving pathological versus anatomical structures, indicating\nthat accurate retrieval of pathological features presents greater difficulty.\nDespite these challenges, our research underscores the vast potential of\nfoundation models for CBIR in radiology, proposing a shift towards versatile,\ngeneral-purpose medical image retrieval systems that do not require specific\ntuning.\n","authors":["Stefan Denner","David Zimmerer","Dimitrios Bounias","Markus Bujotzek","Shuhan Xiao","Lisa Kausch","Philipp Schader","Tobias Penzkofer","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2403.06567v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11492v1","updated":"2024-04-17T15:47:26Z","published":"2024-04-17T15:47:26Z","title":"arcjetCV: an open-source software to analyze material ablation","summary":" arcjetCV is an open-source Python software designed to automate time-resolved\nmeasurements of heatshield material recession and recession rates from arcjet\ntest video footage. This new automated and accessible capability greatly\nexceeds previous manual extraction methods, enabling rapid and detailed\ncharacterization of material recession for any sample with a profile video.\narcjetCV automates the video segmentation process using machine learning\nmodels, including a one-dimensional (1D) Convolutional Neural Network (CNN) to\ninfer the time-window of interest, a two-dimensional (2D) CNN for image and\nedge segmentation, and a Local Outlier Factor (LOF) for outlier filtering. A\ngraphical user interface (GUI) simplifies the user experience and an\napplication programming interface (API) allows users to call the core functions\nfrom scripts, enabling video batch processing. arcjetCV's capability to measure\ntime-resolved recession in turn enables characterization of non-linear\nprocesses (shrinkage, swelling, melt flows, etc.), contributing to higher\nfidelity validation and improved modeling of heatshield material performance.\nThe source code associated with this article can be found at\nhttps://github.com/magnus-haw/arcjetCV.\n","authors":["Alexandre Quintart","Magnus Haw","Federico Semeraro"],"pdf_url":"https://arxiv.org/pdf/2404.11492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11488v1","updated":"2024-04-17T15:45:49Z","published":"2024-04-17T15:45:49Z","title":"Multi-resolution Rescored ByteTrack for Video Object Detection on\n Ultra-low-power Embedded Systems","summary":" This paper introduces Multi-Resolution Rescored Byte-Track (MR2-ByteTrack), a\nnovel video object detection framework for ultra-low-power embedded processors.\nThis method reduces the average compute load of an off-the-shelf Deep Neural\nNetwork (DNN) based object detector by up to 2.25$\\times$ by alternating the\nprocessing of high-resolution images (320$\\times$320 pixels) with multiple\ndown-sized frames (192$\\times$192 pixels). To tackle the accuracy degradation\ndue to the reduced image input size, MR2-ByteTrack correlates the output\ndetections over time using the ByteTrack tracker and corrects potential\nmisclassification using a novel probabilistic Rescore algorithm. 
By\ninterleaving two down-sized images for every high-resolution one as the input\nof different state-of-the-art DNN object detectors with our MR2-ByteTrack, we\ndemonstrate an average accuracy increase of 2.16% and a latency reduction of\n43% on the GAP9 microcontroller compared to a baseline frame-by-frame inference\nscheme using exclusively full-resolution images. Code available at:\nhttps://github.com/Bomps4/Multi_Resolution_Rescored_ByteTrack\n","authors":["Luca Bompani","Manuele Rusci","Daniele Palossi","Francesco Conti","Luca Benini"],"pdf_url":"https://arxiv.org/pdf/2404.11488v1.pdf","comment":"9 pages, 3 figures Accepted for publication at the Embedded Vision\n Workshop of the Computer Vision and Pattern Recognition conference, Seattle,\n 2024"},{"id":"http://arxiv.org/abs/2404.11475v1","updated":"2024-04-17T15:31:06Z","published":"2024-04-17T15:31:06Z","title":"AdaIR: Exploiting Underlying Similarities of Image Restoration Tasks\n with Adapters","summary":" Existing image restoration approaches typically employ extensive networks\nspecifically trained for designated degradations. Despite being effective, such\nmethods inevitably entail considerable storage costs and computational\noverheads due to the reliance on task-specific networks. In this work, we go\nbeyond this well-established framework and exploit the inherent commonalities\namong image restoration tasks. The primary objective is to identify components\nthat are shareable across restoration tasks and augment the shared components\nwith modules specifically trained for individual tasks. Towards this goal, we\npropose AdaIR, a novel framework that enables low storage cost and efficient\ntraining without sacrificing performance. Specifically, a generic restoration\nnetwork is first constructed through self-supervised pre-training using\nsynthetic degradations. Subsequent to the pre-training phase, adapters are\ntrained to adapt the pre-trained network to specific degradations. AdaIR\nrequires solely the training of lightweight, task-specific modules, ensuring a\nmore efficient storage and training regimen. We have conducted extensive\nexperiments to validate the effectiveness of AdaIR and analyze the influence of\nthe pre-training strategy on discovering shareable components. Extensive\nexperimental results show that AdaIR achieves outstanding results on multi-task\nrestoration while utilizing significantly fewer parameters (1.9 MB) and less\ntraining time (7 hours) for each restoration task. The source codes and trained\nmodels will be released.\n","authors":["Hao-Wei Chen","Yu-Syuan Xu","Kelvin C. K. Chan","Hsien-Kai Kuo","Chun-Yi Lee","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11474v1","updated":"2024-04-17T15:28:53Z","published":"2024-04-17T15:28:53Z","title":"Towards Highly Realistic Artistic Style Transfer via Stable Diffusion\n with Step-aware and Layer-aware Prompt","summary":" Artistic style transfer aims to transfer the learned artistic style onto an\narbitrary content image, generating artistic stylized images. Existing\ngenerative adversarial network-based methods fail to generate highly realistic\nstylized images and always introduce obvious artifacts and disharmonious\npatterns. Recently, large-scale pre-trained diffusion models opened up a new\nway for generating highly realistic artistic stylized images. 
However,\ndiffusion model-based methods generally fail to preserve the content structure\nof input content images well, introducing some undesired content structure and\nstyle patterns. To address the above problems, we propose a novel pre-trained\ndiffusion-based artistic style transfer method, called LSAST, which can\ngenerate highly realistic artistic stylized images while preserving the content\nstructure of input content images well, without bringing obvious artifacts and\ndisharmonious style patterns. Specifically, we introduce a Step-aware and\nLayer-aware Prompt Space, a set of learnable prompts, which can learn the style\ninformation from the collection of artworks and dynamically adjust the input\nimages' content structure and style pattern. To train our prompt space, we\npropose a novel inversion method, called Step-aware and Layer-aware Prompt\nInversion, which allows the prompt space to learn the style information of the\nartworks collection. In addition, we inject a pre-trained conditional branch of\nControlNet into our LSAST, which further improves our framework's ability to\nmaintain content structure. Extensive experiments demonstrate that our proposed\nmethod can generate more highly realistic artistic stylized images than the\nstate-of-the-art artistic style transfer methods.\n","authors":["Zhanjie Zhang","Quanwei Zhang","Huaizhong Lin","Wei Xing","Juncheng Mo","Shuaicheng Huang","Jinheng Xie","Guangyuan Li","Junsheng Luan","Lei Zhao","Dalong Zhang","Lixia Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11474v1.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2303.12054v5","updated":"2024-04-17T15:12:29Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we propose a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. 
Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11461v1","updated":"2024-04-17T15:09:31Z","published":"2024-04-17T15:09:31Z","title":"Using Game Engines and Machine Learning to Create Synthetic Satellite\n Imagery for a Tabletop Verification Exercise","summary":" Satellite imagery is regarded as a great opportunity for citizen-based\nmonitoring of activities of interest. Relevant imagery may however not be\navailable at sufficiently high resolution, quality, or cadence -- let alone be\nuniformly accessible to open-source analysts. This limits an assessment of the\ntrue long-term potential of citizen-based monitoring of nuclear activities\nusing publicly available satellite imagery. In this article, we demonstrate how\nmodern game engines combined with advanced machine-learning techniques can be\nused to generate synthetic imagery of sites of interest with the ability to\nchoose relevant parameters upon request; these include time of day, cloud\ncover, season, or level of activity onsite. At the same time, resolution and\noff-nadir angle can be adjusted to simulate different characteristics of the\nsatellite. While there are several possible use-cases for synthetic imagery,\nhere we focus on its usefulness to support tabletop exercises in which simple\nmonitoring scenarios can be examined to better understand verification\ncapabilities enabled by new satellite constellations and very short revisit\ntimes.\n","authors":["Johannes Hoster","Sara Al-Sayed","Felix Biessmann","Alexander Glaser","Kristian Hildebrand","Igor Moric","Tuong Vy Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.11461v1.pdf","comment":"Annual Meeting of the Institute of Nuclear Materials Management\n (INMM), Vienna"},{"id":"http://arxiv.org/abs/2404.11459v1","updated":"2024-04-17T15:07:06Z","published":"2024-04-17T15:07:06Z","title":"Octopus v3: Technical Report for On-device Sub-billion Multimodal AI\n Agent","summary":" A multimodal AI agent is characterized by its ability to process and learn\nfrom various types of data, including natural language, visual, and audio\ninputs, to inform its actions. Despite advancements in large language models\nthat incorporate visual data, such as GPT-4V, effectively translating\nimage-based data into actionable outcomes for AI agents continues to be\nchallenging. In this paper, we introduce a multimodal model that incorporates\nthe concept of functional token specifically designed for AI agent\napplications. To ensure compatibility with edge devices, our model is optimized\nto a compact size of less than 1B parameters. Like GPT-4, our model can process\nboth English and Chinese. 
We demonstrate that this model is capable of\noperating efficiently on a wide range of edge devices, including as constrained\nas a Raspberry Pi.\n","authors":["Wei Chen","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.11459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05309v2","updated":"2024-04-17T15:04:14Z","published":"2023-02-10T15:12:40Z","title":"The LuViRA Dataset: Measurement Description","summary":" We present a dataset to evaluate localization algorithms, which utilizes\nvision, audio, and radio sensors: the Lund University Vision, Radio, and Audio\n(LuViRA) Dataset. The dataset includes RGB images, corresponding depth maps,\nIMU readings, channel response between a massive MIMO channel sounder and a\nuser equipment, audio recorded by 12 microphones, and 0.5 mm accurate 6DoF pose\nground truth. We synchronize these sensors to make sure that all data are\nrecorded simultaneously. A camera, speaker, and transmit antenna are placed on\ntop of a slowly moving service robot and 88 trajectories are recorded. Each\ntrajectory includes 20 to 50 seconds of recorded sensor data and ground truth\nlabels. The data from different sensors can be used separately or jointly to\nconduct localization tasks and a motion capture system is used to verify the\nresults obtained by the localization algorithms. The main aim of this dataset\nis to enable research on fusing the most commonly used sensors for localization\ntasks. However, the full dataset or some parts of it can also be used for other\nresearch areas such as channel estimation, image classification, etc. Fusing\nsensor data can lead to increased localization accuracy and reliability, as\nwell as decreased latency and power consumption. The created dataset will be\nmade public at a later date.\n","authors":["Ilayda Yaman","Guoda Tian","Martin Larsson","Patrik Persson","Michiel Sandra","Alexander Dürr","Erik Tegler","Nikhil Challa","Henrik Garde","Fredrik Tufvesson","Kalle Åström","Ove Edfors","Steffen Malkowsky","Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2302.05309v2.pdf","comment":"7 pages, 7 figures, Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2305.15964v5","updated":"2024-04-17T15:01:39Z","published":"2023-05-25T12:03:31Z","title":"ChatCAD+: Towards a Universal and Reliable Interactive CAD using LLMs","summary":" The integration of Computer-Aided Diagnosis (CAD) with Large Language Models\n(LLMs) presents a promising frontier in clinical applications, notably in\nautomating diagnostic processes akin to those performed by radiologists and\nproviding consultations similar to a virtual family doctor. Despite the\npromising potential of this integration, current works face at least two\nlimitations: (1) From the perspective of a radiologist, existing studies\ntypically have a restricted scope of applicable imaging domains, failing to\nmeet the diagnostic needs of different patients. Also, the insufficient\ndiagnostic capability of LLMs further undermine the quality and reliability of\nthe generated medical reports. (2) Current LLMs lack the requisite depth in\nmedical expertise, rendering them less effective as virtual family doctors due\nto the potential unreliability of the advice provided during patient\nconsultations. To address these limitations, we introduce ChatCAD+, to be\nuniversal and reliable. Specifically, it is featured by two main modules: (1)\nReliable Report Generation and (2) Reliable Interaction. 
The Reliable Report\nGeneration module is capable of interpreting medical images from diverse\ndomains and generate high-quality medical reports via our proposed hierarchical\nin-context learning. Concurrently, the interaction module leverages up-to-date\ninformation from reputable medical websites to provide reliable medical advice.\nTogether, these designed modules synergize to closely align with the expertise\nof human medical professionals, offering enhanced consistency and reliability\nfor interpretation and advice. The source code is available at\nhttps://github.com/zhaozh10/ChatCAD.\n","authors":["Zihao Zhao","Sheng Wang","Jinchen Gu","Yitao Zhu","Lanzhuju Mei","Zixu Zhuang","Zhiming Cui","Qian Wang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2305.15964v5.pdf","comment":"Authors Zihao Zhao, Sheng Wang, Jinchen Gu, Yitao Zhu contributed\n equally to this work and should be considered co-first authors"},{"id":"http://arxiv.org/abs/2403.18807v4","updated":"2024-04-17T14:59:51Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The project page is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v4.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2402.00186v2","updated":"2024-04-17T14:54:56Z","published":"2024-01-31T21:28:40Z","title":"Distance and Collision Probability Estimation from Gaussian Surface\n Models","summary":" This paper describes continuous-space methodologies to estimate the collision\nprobability, Euclidean distance and gradient between an ellipsoidal robot model\nand an environment surface modeled as a set of Gaussian distributions.\nContinuous-space collision probability estimation is critical for\nuncertainty-aware motion planning. 
Most collision detection and avoidance\napproaches assume the robot is modeled as a sphere, but ellipsoidal\nrepresentations provide tighter approximations and enable navigation in\ncluttered and narrow spaces. State-of-the-art methods derive the Euclidean\ndistance and gradient by processing raw point clouds, which is computationally\nexpensive for large workspaces. Recent advances in Gaussian surface modeling\n(e.g. mixture models, splatting) enable compressed and high-fidelity surface\nrepresentations. Few methods exist to estimate continuous-space occupancy from\nsuch models. They require Gaussians to model free space and are unable to\nestimate the collision probability, Euclidean distance and gradient for an\nellipsoidal robot. The proposed methods bridge this gap by extending prior work\nin ellipsoid-to-ellipsoid Euclidean distance and collision probability\nestimation to Gaussian surface models. A geometric blending approach is also\nproposed to improve collision probability estimation. The approaches are\nevaluated with numerical 2D and 3D experiments using real-world point cloud\ndata. Methods for efficient calculation of these quantities are demonstrated to\nexecute within a few microseconds per ellipsoid pair using a single-thread on\nlow-power CPUs of modern embedded computers\n","authors":["Kshitij Goel","Wennie Tabib"],"pdf_url":"https://arxiv.org/pdf/2402.00186v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11429v1","updated":"2024-04-17T14:34:56Z","published":"2024-04-17T14:34:56Z","title":"CarcassFormer: An End-to-end Transformer-based Framework for\n Simultaneous Localization, Segmentation and Classification of Poultry Carcass\n Defect","summary":" In the food industry, assessing the quality of poultry carcasses during\nprocessing is a crucial step. This study proposes an effective approach for\nautomating the assessment of carcass quality without requiring skilled labor or\ninspector involvement. The proposed system is based on machine learning (ML)\nand computer vision (CV) techniques, enabling automated defect detection and\ncarcass quality assessment. To this end, an end-to-end framework called\nCarcassFormer is introduced. It is built upon a Transformer-based architecture\ndesigned to effectively extract visual representations while simultaneously\ndetecting, segmenting, and classifying poultry carcass defects. Our proposed\nframework is capable of analyzing imperfections resulting from production and\ntransport welfare issues, as well as processing plant stunner, scalder, picker,\nand other equipment malfunctions. To benchmark the framework, a dataset of\n7,321 images was initially acquired, which contained both single and multiple\ncarcasses per image. In this study, the performance of the CarcassFormer system\nis compared with other state-of-the-art (SOTA) approaches for both\nclassification, detection, and segmentation tasks. Through extensive\nquantitative experiments, our framework consistently outperforms existing\nmethods, demonstrating remarkable improvements across various evaluation\nmetrics such as AP, AP@50, and AP@75. Furthermore, the qualitative results\nhighlight the strengths of CarcassFormer in capturing fine details, including\nfeathers, and accurately localizing and segmenting carcasses with high\nprecision. 
To facilitate further research and collaboration, the pre-trained\nmodel and source code of CarcassFormer are available for research purposes at:\n\\url{https://github.com/UARK-AICV/CarcassFormer}.\n","authors":["Minh Tran","Sang Truong","Arthur F. A. Fernandes","Michael T. Kidd","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2404.11429v1.pdf","comment":"Accepted to Poultry Science Journal"},{"id":"http://arxiv.org/abs/2404.11428v1","updated":"2024-04-17T14:34:35Z","published":"2024-04-17T14:34:35Z","title":"Explainable Lung Disease Classification from Chest X-Ray Images\n Utilizing Deep Learning and XAI","summary":" Lung diseases remain a critical global health concern, and it's crucial to\nhave accurate and quick ways to diagnose them. This work focuses on classifying\ndifferent lung diseases into five groups: viral pneumonia, bacterial pneumonia,\nCOVID, tuberculosis, and normal lungs. Employing advanced deep learning\ntechniques, we explore a diverse range of models including CNN, hybrid models,\nensembles, transformers, and Big Transfer. The research encompasses\ncomprehensive methodologies such as hyperparameter tuning, stratified k-fold\ncross-validation, and transfer learning with fine-tuning. Remarkably, our\nfindings reveal that the Xception model, fine-tuned through 5-fold\ncross-validation, achieves the highest accuracy of 96.21\\%. This success shows\nthat our methods work well in accurately identifying different lung diseases.\nThe exploration of explainable artificial intelligence (XAI) methodologies\nfurther enhances our understanding of the decision-making processes employed by\nthese models, contributing to increased trust in their clinical applications.\n","authors":["Tanzina Taher Ifty","Saleh Ahmed Shafin","Shoeb Mohammad Shahriar","Tashfia Towhid"],"pdf_url":"https://arxiv.org/pdf/2404.11428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11426v1","updated":"2024-04-17T14:33:41Z","published":"2024-04-17T14:33:41Z","title":"SPAMming Labels: Efficient Annotations for the Trackers of Tomorrow","summary":" Increasing the annotation efficiency of trajectory annotations from videos\nhas the potential to enable the next generation of data-hungry tracking\nalgorithms to thrive on large-scale datasets. Despite the importance of this\ntask, there are currently very few works exploring how to efficiently label\ntracking datasets comprehensively. In this work, we introduce SPAM, a tracking\ndata engine that provides high-quality labels with minimal human intervention.\nSPAM is built around two key insights: i) most tracking scenarios can be easily\nresolved. To take advantage of this, we utilize a pre-trained model to generate\nhigh-quality pseudo-labels, reserving human involvement for a smaller subset of\nmore difficult instances; ii) handling the spatiotemporal dependencies of track\nannotations across time can be elegantly and efficiently formulated through\ngraphs. Therefore, we use a unified graph formulation to address the annotation\nof both detections and identity association for tracks across time. Based on\nthese insights, SPAM produces high-quality annotations with a fraction of\nground truth labeling cost. We demonstrate that trackers trained on SPAM labels\nachieve comparable performance to those trained on human annotations while\nrequiring only 3-20% of the human labeling effort. Hence, SPAM paves the way\ntowards highly efficient labeling of large-scale tracking datasets. 
Our code\nand models will be available upon acceptance.\n","authors":["Orcun Cetintas","Tim Meinhardt","Guillem Brasó","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2404.11426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11419v1","updated":"2024-04-17T14:23:28Z","published":"2024-04-17T14:23:28Z","title":"SLAIM: Robust Dense Neural SLAM for Online Tracking and Mapping","summary":" We present SLAIM - Simultaneous Localization and Implicit Mapping. We propose\na novel coarse-to-fine tracking model tailored for Neural Radiance Field SLAM\n(NeRF-SLAM) to achieve state-of-the-art tracking performance. Notably, existing\nNeRF-SLAM systems consistently exhibit inferior tracking performance compared\nto traditional SLAM algorithms. NeRF-SLAM methods solve camera tracking via\nimage alignment and photometric bundle-adjustment. Such optimization processes\nare difficult to optimize due to the narrow basin of attraction of the\noptimization loss in image space (local minima) and the lack of initial\ncorrespondences. We mitigate these limitations by implementing a Gaussian\npyramid filter on top of NeRF, facilitating a coarse-to-fine tracking\noptimization strategy. Furthermore, NeRF systems encounter challenges in\nconverging to the right geometry with limited input views. While prior\napproaches use a Signed-Distance Function (SDF)-based NeRF and directly\nsupervise SDF values by approximating ground truth SDF through depth\nmeasurements, this often results in suboptimal geometry. In contrast, our\nmethod employs a volume density representation and introduces a novel KL\nregularizer on the ray termination distribution, constraining scene geometry to\nconsist of empty space and opaque surfaces. Our solution implements both local\nand global bundle-adjustment to produce a robust (coarse-to-fine) and accurate\n(KL regularizer) SLAM solution. We conduct experiments on multiple datasets\n(ScanNet, TUM, Replica) showing state-of-the-art results in tracking and in\nreconstruction accuracy.\n","authors":["Vincent Cartillier","Grant Schindler","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2404.11419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11416v1","updated":"2024-04-17T14:17:05Z","published":"2024-04-17T14:17:05Z","title":"Neural Shrödinger Bridge Matching for Pansharpening","summary":" Recent diffusion probabilistic models (DPM) in the field of pansharpening\nhave been gradually gaining attention and have achieved state-of-the-art (SOTA)\nperformance. In this paper, we identify shortcomings in directly applying DPMs\nto the task of pansharpening as an inverse problem: 1) initiating sampling\ndirectly from Gaussian noise neglects the low-resolution multispectral image\n(LRMS) as a prior; 2) low sampling efficiency often necessitates a higher\nnumber of sampling steps. We first reformulate pansharpening into the\nstochastic differential equation (SDE) form of an inverse problem. Building\nupon this, we propose a Schr\\\"odinger bridge matching method that addresses\nboth issues.\n We design an efficient deep neural network architecture tailored for the\nproposed SB matching.\n In comparison to the well-established DL-regressive-based framework and the\nrecent DPM framework, our method demonstrates SOTA performance with fewer\nsampling steps. 
Moreover, we discuss the relationship between SB matching and\nother methods based on SDEs and ordinary differential equations (ODEs), as well\nas its connection with optimal transport.\n Code will be available.\n","authors":["Zihan Cao","Xiao Wu","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.11416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11401v1","updated":"2024-04-17T14:07:22Z","published":"2024-04-17T14:07:22Z","title":"RainyScape: Unsupervised Rainy Scene Reconstruction using Decoupled\n Neural Rendering","summary":" We propose RainyScape, an unsupervised framework for reconstructing clean\nscenes from a collection of multi-view rainy images. RainyScape consists of two\nmain modules: a neural rendering module and a rain-prediction module that\nincorporates a predictor network and a learnable latent embedding that captures\nthe rain characteristics of the scene. Specifically, based on the spectral bias\nproperty of neural networks, we first optimize the neural rendering pipeline to\nobtain a low-frequency scene representation. Subsequently, we jointly optimize\nthe two modules, driven by the proposed adaptive direction-sensitive\ngradient-based reconstruction loss, which encourages the network to distinguish\nbetween scene details and rain streaks, facilitating the propagation of\ngradients to the relevant components. Extensive experiments on both the classic\nneural radiance field and the recently proposed 3D Gaussian splatting\ndemonstrate the superiority of our method in effectively eliminating rain\nstreaks and rendering clean images, achieving state-of-the-art performance. The\nconstructed high-quality dataset and source code will be publicly available.\n","authors":["Xianqiang Lyu","Hui Liu","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2404.11401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16749v3","updated":"2024-04-17T14:06:28Z","published":"2024-02-26T17:11:11Z","title":"MISC: Ultra-low Bitrate Image Semantic Compression Driven by Large\n Multimodal Model","summary":" With the evolution of storage and communication protocols, ultra-low bitrate\nimage compression has become a highly demanding topic. However, existing\ncompression algorithms must sacrifice either consistency with the ground truth\nor perceptual quality at ultra-low bitrate. In recent years, the rapid\ndevelopment of the Large Multimodal Model (LMM) has made it possible to balance\nthese two goals. To solve this problem, this paper proposes a method called\nMultimodal Image Semantic Compression (MISC), which consists of an LMM encoder\nfor extracting the semantic information of the image, a map encoder to locate\nthe region corresponding to the semantic, an image encoder generates an\nextremely compressed bitstream, and a decoder reconstructs the image based on\nthe above information. Experimental results show that our proposed MISC is\nsuitable for compressing both traditional Natural Sense Images (NSIs) and\nemerging AI-Generated Images (AIGIs) content. It can achieve optimal\nconsistency and perception results while saving 50% bitrate, which has strong\npotential applications in the next generation of storage and communication. 
The\ncode will be released on https://github.com/lcysyzxdxc/MISC.\n","authors":["Chunyi Li","Guo Lu","Donghui Feng","Haoning Wu","Zicheng Zhang","Xiaohong Liu","Guangtao Zhai","Weisi Lin","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.16749v3.pdf","comment":"13 page, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.06722v2","updated":"2024-04-17T13:56:06Z","published":"2023-12-11T03:35:58Z","title":"EgoPlan-Bench: Benchmarking Egocentric Embodied Planning with Multimodal\n Large Language Models","summary":" Multimodal Large Language Models, combining the remarkable reasoning and\ngeneralization capabilities of Large Language Models (LLMs) with the ability to\ncomprehend visual inputs, have opened up new avenues for embodied task\nplanning. Given diverse environmental inputs, including real-time task\nprogress, visual observations, and open-form language instructions, a\nproficient task planner is expected to predict feasible actions, which is a\nfeat inherently achievable by Multimodal Large Language Models (MLLMs). In this\npaper, we aim to quantitatively investigate the potential of MLLMs as embodied\ntask planners in real-world scenarios by introducing a benchmark with human\nannotations named EgoPlan-Bench. Our benchmark is distinguished by realistic\ntasks derived from real-world videos, a diverse set of actions involving\ninteractions with hundreds of different objects, and complex visual\nobservations from varied scenes. We evaluate a wide range of MLLMs, revealing\nthat these models have not yet evolved into embodied planning generalists (even\nGPT-4V). We further construct an instruction-tuning dataset EgoPlan-IT from\nvideos with human-object interactions, to facilitate the learning of high-level\ntask planning in intricate real-world situations. The experiment results\ndemonstrate that the model tuned on EgoPlan-IT not only significantly improves\nperformance on our benchmark, but can also be applied as a task planner for\nguiding embodied agents in simulations.\n","authors":["Yi Chen","Yuying Ge","Yixiao Ge","Mingyu Ding","Bohao Li","Rui Wang","Ruifeng Xu","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.06722v2.pdf","comment":"Project released at: https://github.com/ChenYi99/EgoPlan"},{"id":"http://arxiv.org/abs/2310.20621v2","updated":"2024-04-17T13:41:07Z","published":"2023-10-31T16:54:14Z","title":"Deepfake detection by exploiting surface anomalies: the SurFake approach","summary":" The ever-increasing use of synthetically generated content in different\nsectors of our everyday life, one for all media information, poses a strong\nneed for deepfake detection tools in order to avoid the proliferation of\naltered messages. The process to identify manipulated content, in particular\nimages and videos, is basically performed by looking for the presence of some\ninconsistencies and/or anomalies specifically due to the fake generation\nprocess. Different techniques exist in the scientific literature that exploit\ndiverse ad-hoc features in order to highlight possible modifications. In this\npaper, we propose to investigate how deepfake creation can impact on the\ncharacteristics that the whole scene had at the time of the acquisition. In\nparticular, when an image (video) is captured the overall geometry of the scene\n(e.g. surfaces) and the acquisition process (e.g. 
illumination) determine a\nunivocal environment that is directly represented by the image pixel values;\nall these intrinsic relations are possibly changed by the deepfake generation\nprocess. By resorting to the analysis of the characteristics of the surfaces\ndepicted in the image it is possible to obtain a descriptor usable to train a\nCNN for deepfake detection: we refer to such an approach as SurFake.\nExperimental results carried out on the FF++ dataset for different kinds of\ndeepfake forgeries and diverse deep learning models confirm that such a feature\ncan be adopted to discriminate between pristine and altered images;\nfurthermore, experiments witness that it can also be combined with visual data\nto provide a certain improvement in terms of detection accuracy.\n","authors":["Andrea Ciamarra","Roberto Caldelli","Federico Becattini","Lorenzo Seidenari","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2310.20621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11375v1","updated":"2024-04-17T13:33:09Z","published":"2024-04-17T13:33:09Z","title":"Text-controlled Motion Mamba: Text-Instructed Temporal Grounding of\n Human Motion","summary":" Human motion understanding is a fundamental task with diverse practical\napplications, facilitated by the availability of large-scale motion capture\ndatasets. Recent studies focus on text-motion tasks, such as text-based motion\ngeneration, editing and question answering. In this study, we introduce the\nnovel task of text-based human motion grounding (THMG), aimed at precisely\nlocalizing temporal segments corresponding to given textual descriptions within\nuntrimmed motion sequences. Capturing global temporal information is crucial\nfor the THMG task. However, transformer-based models that rely on global\ntemporal self-attention face challenges when handling long untrimmed sequences\ndue to the quadratic computational cost. We address these challenges by\nproposing Text-controlled Motion Mamba (TM-Mamba), a unified model that\nintegrates temporal global context, language query control, and spatial graph\ntopology with only linear memory cost. The core of the model is a\ntext-controlled selection mechanism which dynamically incorporates global\ntemporal information based on text query. The model is further enhanced to be\ntopology-aware through the integration of relational embeddings. For\nevaluation, we introduce BABEL-Grounding, the first text-motion dataset that\nprovides detailed textual descriptions of human actions along with their\ncorresponding temporal segments. Extensive evaluations demonstrate the\neffectiveness of TM-Mamba on BABEL-Grounding.\n","authors":["Xinghan Wang","Zixi Kang","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2404.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13756v3","updated":"2024-04-17T13:32:15Z","published":"2024-02-21T12:34:31Z","title":"High-throughput Visual Nano-drone to Nano-drone Relative Localization\n using Onboard Fully Convolutional Networks","summary":" Relative drone-to-drone localization is a fundamental building block for any\nswarm operations. We address this task in the context of miniaturized\nnano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to\nnovel use cases enabled by their reduced form factor. The price for their\nversatility comes with limited onboard resources, i.e., sensors, processing\nunits, and memory, which limits the complexity of the onboard algorithms. 
A\ntraditional solution to overcome these limitations is represented by\nlightweight deep learning models directly deployed aboard nano-drones. This\nwork tackles the challenging relative pose estimation between nano-drones using\nonly a gray-scale low-resolution camera and an ultra-low-power System-on-Chip\n(SoC) hosted onboard. We present a vertically integrated system based on a\nnovel vision-based fully convolutional neural network (FCNN), which runs at\n39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8\nSoC. We compare our FCNN against three State-of-the-Art (SoA) systems.\nConsidering the best-performing SoA approach, our model results in an R-squared\nimprovement from 32 to 47% on the horizontal image coordinate and from 18 to\n55% on the vertical image coordinate, on a real-world dataset of 30k images.\nFinally, our in-field tests show a reduction of the average tracking error of\n37% compared to a previous SoA work and an endurance performance up to the\nentire battery lifetime of 4 minutes.\n","authors":["Luca Crupi","Alessandro Giusti","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2402.13756v3.pdf","comment":"ICRA 2024, IEEE Conference"},{"id":"http://arxiv.org/abs/2401.11470v2","updated":"2024-04-17T13:25:38Z","published":"2024-01-21T11:55:42Z","title":"Exploring Missing Modality in Multimodal Egocentric Datasets","summary":" Multimodal video understanding is crucial for analyzing egocentric videos,\nwhere integrating multiple sensory signals significantly enhances action\nrecognition and moment localization. However, practical applications often\ngrapple with incomplete modalities due to factors like privacy concerns,\nefficiency demands, or hardware malfunctions. Addressing this, our study delves\ninto the impact of missing modalities on egocentric action recognition,\nparticularly within transformer-based models. We introduce a novel concept\n-Missing Modality Token (MMT)-to maintain performance even when modalities are\nabsent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and\nEpic-Sounds datasets. Our method mitigates the performance loss, reducing it\nfrom its original $\\sim 30\\%$ drop to only $\\sim 10\\%$ when half of the test\nset is modal-incomplete. Through extensive experimentation, we demonstrate the\nadaptability of MMT to different training scenarios and its superiority in\nhandling missing modalities compared to current methods. Our research\ncontributes a comprehensive analysis and an innovative approach, opening\navenues for more resilient multimodal systems in real-world settings.\n","authors":["Merey Ramazanova","Alejandro Pardo","Humam Alwassel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.11470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10247v2","updated":"2024-04-17T13:25:35Z","published":"2023-03-17T20:54:04Z","title":"Video shutter angle estimation using optical flow and linear blur","summary":" We present a method for estimating the shutter angle, a.k.a. exposure\nfraction - the ratio of the exposure time and the reciprocal of frame rate - of\nvideoclips containing motion. The approach exploits the relation of the\nexposure fraction, optical flow, and linear motion blur. Robustness is achieved\nby selecting image patches where both the optical flow and blur estimates are\nreliable, checking their consistency. The method was evaluated on the publicly\navailable Beam-Splitter Dataset with a range of exposure fractions from 0.015\nto 0.36. 
The best achieved mean absolute error of estimates was 0.039. We\nsuccessfully test the suitability of the method for a forensic application of\ndetection of video tampering by frame removal or insertion.\n","authors":["David Korcak","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2303.10247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11361v1","updated":"2024-04-17T13:18:39Z","published":"2024-04-17T13:18:39Z","title":"Boosting Medical Image Segmentation Performance with Adaptive\n Convolution Layer","summary":" Medical image segmentation plays a vital role in various clinical\napplications, enabling accurate delineation and analysis of anatomical\nstructures or pathological regions. Traditional CNNs have achieved remarkable\nsuccess in this field. However, they often rely on fixed kernel sizes, which\ncan limit their performance and adaptability in medical images where features\nexhibit diverse scales and configurations due to variability in equipment,\ntarget sizes, and expert interpretations.\n In this paper, we propose an adaptive layer placed ahead of leading\ndeep-learning models such as UCTransNet, which dynamically adjusts the kernel\nsize based on the local context of the input image.\n By adaptively capturing and fusing features at multiple scales, our approach\nenhances the network's ability to handle diverse anatomical structures and\nsubtle image details, even for recently performing architectures that\ninternally implement intra-scale modules, such as UCTransNet.\n Extensive experiments are conducted on\n benchmark medical image datasets to evaluate the effectiveness of our\nproposal. It consistently outperforms traditional CNNs with fixed kernel\nsizes with a similar number of parameters, achieving superior segmentation\nAccuracy, Dice, and IoU in popular datasets such as SegPC2021 and ISIC2018. The\nmodel and data are published in the open-source repository, ensuring\ntransparency and reproducibility of our promising results.\n","authors":["Seyed M. R. Modaresi","Aomar Osmani","Mohammadreza Razzazi","Abdelghani Chibani"],"pdf_url":"https://arxiv.org/pdf/2404.11361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11358v1","updated":"2024-04-17T13:14:52Z","published":"2024-04-17T13:14:52Z","title":"DeblurGS: Gaussian Splatting for Camera Motion Blur","summary":" Although significant progress has been made in reconstructing sharp 3D scenes\nfrom motion-blurred images, a transition to real-world applications remains\nchallenging. The primary obstacle stems from the severe blur which leads to\ninaccuracies in the acquisition of initial camera poses through\nStructure-from-Motion, a critical aspect often overlooked by previous\napproaches. To address this challenge, we propose DeblurGS, a method to\noptimize sharp 3D Gaussian Splatting from motion-blurred images, even with the\nnoisy camera pose initialization. We restore a fine-grained sharp scene by\nleveraging the remarkable reconstruction capability of 3D Gaussian Splatting.\nOur approach estimates the 6-Degree-of-Freedom camera motion for each blurry\nobservation and synthesizes corresponding blurry renderings for the\noptimization process. Furthermore, we propose Gaussian Densification Annealing\nstrategy to prevent the generation of inaccurate Gaussians at erroneous\nlocations during the early training stages when camera motion is still\nimprecise. 
Comprehensive experiments demonstrate that our DeblurGS achieves\nstate-of-the-art performance in deblurring and novel view synthesis for\nreal-world and synthetic benchmark datasets, as well as field-captured blurry\nsmartphone videos.\n","authors":["Jeongtaek Oh","Jaeyoung Chung","Dongwoo Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11357v1","updated":"2024-04-17T13:12:14Z","published":"2024-04-17T13:12:14Z","title":"Detector Collapse: Backdooring Object Detection to Catastrophic Overload\n or Blindness","summary":" Object detection tasks, crucial in safety-critical systems like autonomous\ndriving, focus on pinpointing object locations. These detectors are known to be\nsusceptible to backdoor attacks. However, existing backdoor techniques have\nprimarily been adapted from classification tasks, overlooking deeper\nvulnerabilities specific to object detection. This paper is dedicated to\nbridging this gap by introducing Detector Collapse (DC), a brand-new backdoor\nattack paradigm tailored for object detection. DC is designed to instantly\nincapacitate detectors (i.e., severely impairing the detector's performance and\nculminating in a denial-of-service). To this end, we develop two innovative\nattack schemes: Sponge for triggering widespread misidentifications and\nBlinding for rendering objects invisible. Remarkably, we introduce a novel\npoisoning strategy exploiting natural objects, enabling DC to act as a\npractical backdoor in real-world environments. Our experiments on different\ndetectors across several benchmarks show a significant improvement\n($\\sim$10\\%-60\\% absolute and $\\sim$2-7$\\times$ relative) in attack efficacy\nover state-of-the-art attacks.\n","authors":["Hangtao Zhang","Shengshan Hu","Yichen Wang","Leo Yu Zhang","Ziqi Zhou","Xianlong Wang","Yanjun Zhang","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11357v1.pdf","comment":"Accepted by IJCAI-24"},{"id":"http://arxiv.org/abs/2404.11355v1","updated":"2024-04-17T13:09:44Z","published":"2024-04-17T13:09:44Z","title":"Consisaug: A Consistency-based Augmentation for Polyp Detection in\n Endoscopy Image Analysis","summary":" Colorectal cancer (CRC), which frequently originates from initially benign\npolyps, remains a significant contributor to global cancer-related mortality.\nEarly and accurate detection of these polyps via colonoscopy is crucial for CRC\nprevention. However, traditional colonoscopy methods depend heavily on the\noperator's experience, leading to suboptimal polyp detection rates. Besides,\npublic databases are limited in polyp size and shape diversity. To enhance\nthe available data for polyp detection, we introduce Consisaug, an innovative\nand effective methodology to augment data that leverages deep learning. We\nutilize the constraint that when the image is flipped the class label should be\nequal and the bounding boxes should be consistent. We implement our Consisaug on\nfive public polyp datasets and with three backbones, and the results show the\neffectiveness of our method.\n","authors":["Ziyu Zhou","Wenyuan Shen","Chang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11355v1.pdf","comment":"MLMI 2023"},{"id":"http://arxiv.org/abs/2404.11339v1","updated":"2024-04-17T13:00:05Z","published":"2024-04-17T13:00:05Z","title":"Best Practices for a Handwritten Text Recognition System","summary":" Handwritten text recognition has developed rapidly in recent years,\nfollowing the rise of deep learning and its applications. 
Though deep learning\nmethods provide notable boost in performance concerning text recognition,\nnon-trivial deviation in performance can be detected even when small\npre-processing or architectural/optimization elements are changed. This work\nfollows a ``best practice'' rationale; highlight simple yet effective empirical\npractices that can further help training and provide well-performing\nhandwritten text recognition systems. Specifically, we considered three basic\naspects of a deep HTR system and we proposed simple yet effective solutions: 1)\nretain the aspect ratio of the images in the preprocessing step, 2) use\nmax-pooling for converting the 3D feature map of CNN output into a sequence of\nfeatures and 3) assist the training procedure via an additional CTC loss which\nacts as a shortcut on the max-pooled sequential features. Using these proposed\nsimple modifications, one can attain close to state-of-the-art results, while\nconsidering a basic convolutional-recurrent (CNN+LSTM) architecture, for both\nIAM and RIMES datasets. Code is available at\nhttps://github.com/georgeretsi/HTR-best-practices/.\n","authors":["George Retsinas","Giorgos Sfikas","Basilis Gatos","Christophoros Nikou"],"pdf_url":"https://arxiv.org/pdf/2404.11339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11336v1","updated":"2024-04-17T12:53:57Z","published":"2024-04-17T12:53:57Z","title":"Vision-based control for landing an aerial vehicle on a marine vessel","summary":" This work addresses the landing problem of an aerial vehicle, exemplified by\na simple quadrotor, on a moving platform using image-based visual servo\ncontrol. First, the mathematical model of the quadrotor aircraft is introduced,\nfollowed by the design of the inner-loop control. At the second stage, the\nimage features on the textured target plane are exploited to derive a\nvision-based control law. The image of the spherical centroid of a set of\nlandmarks present in the landing target is used as a position measurement,\nwhereas the translational optical flow is used as velocity measurement. The\nkinematics of the vision-based system is expressed in terms of the observable\nfeatures, and the proposed control law guarantees convergence without\nestimating the unknown distance between the vision system and the target, which\nis also guaranteed to remain strictly positive, avoiding undesired collisions.\nThe performance of the proposed control law is evaluated in MATLAB and 3-D\nsimulation software Gazebo. Simulation results for a quadrotor UAV are provided\nfor different velocity profiles of the moving target, showcasing the robustness\nof the proposed controller.\n","authors":["Haohua Dong"],"pdf_url":"https://arxiv.org/pdf/2404.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11335v1","updated":"2024-04-17T12:53:45Z","published":"2024-04-17T12:53:45Z","title":"SoccerNet Game State Reconstruction: End-to-End Athlete Tracking and\n Identification on a Minimap","summary":" Tracking and identifying athletes on the pitch holds a central role in\ncollecting essential insights from the game, such as estimating the total\ndistance covered by players or understanding team tactics. This tracking and\nidentification process is crucial for reconstructing the game state, defined by\nthe athletes' positions and identities on a 2D top-view of the pitch, (i.e. a\nminimap). However, reconstructing the game state from videos captured by a\nsingle camera is challenging. 
It requires understanding the position of the\nathletes and the viewpoint of the camera to localize and identify players\nwithin the field. In this work, we formalize the task of Game State\nReconstruction and introduce SoccerNet-GSR, a novel Game State Reconstruction\ndataset focusing on football videos. SoccerNet-GSR is composed of 200 video\nsequences of 30 seconds, annotated with 9.37 million line points for pitch\nlocalization and camera calibration, as well as over 2.36 million athlete\npositions on the pitch with their respective role, team, and jersey number.\nFurthermore, we introduce GS-HOTA, a novel metric to evaluate game state\nreconstruction methods. Finally, we propose and release an end-to-end baseline\nfor game state reconstruction, bootstrapping the research on this task. Our\nexperiments show that GSR is a challenging novel task, which opens the field\nfor future research. Our dataset and codebase are publicly available at\nhttps://github.com/SoccerNet/sn-gamestate.\n","authors":["Vladimir Somers","Victor Joos","Anthony Cioppa","Silvio Giancola","Seyed Abolfazl Ghasemzadeh","Floriane Magera","Baptiste Standaert","Amir Mohammad Mansourian","Xin Zhou","Shohreh Kasaei","Bernard Ghanem","Alexandre Alahi","Marc Van Droogenbroeck","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2404.11335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11327v1","updated":"2024-04-17T12:39:48Z","published":"2024-04-17T12:39:48Z","title":"Following the Human Thread in Social Navigation","summary":" The success of collaboration between humans and robots in shared environments\nrelies on the robot's real-time adaptation to human motion. Specifically, in\nSocial Navigation, the agent should be close enough to assist but ready to back\nup to let the human move freely, avoiding collisions. Human trajectories emerge\nas crucial cues in Social Navigation, but they are partially observable from\nthe robot's egocentric view and computationally complex to process.\n We propose the first Social Dynamics Adaptation model (SDA) based on the\nrobot's state-action history to infer the social dynamics. We propose a\ntwo-stage Reinforcement Learning framework: the first learns to encode the\nhuman trajectories into social dynamics and learns a motion policy conditioned\non this encoded information, the current status, and the previous action. Here,\nthe trajectories are fully visible, i.e., assumed as privileged information. In\nthe second stage, the trained policy operates without direct access to\ntrajectories. Instead, the model infers the social dynamics solely from the\nhistory of previous actions and statuses in real-time. Tested on the novel\nHabitat 3.0 platform, SDA sets a novel state of the art (SoA) performance in\nfinding and following humans.\n","authors":["Luca Scofano","Alessio Sampieri","Tommaso Campari","Valentino Sacco","Indro Spinelli","Lamberto Ballan","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2404.11327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v1","updated":"2024-04-17T12:38:58Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. 
In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01431v2","updated":"2024-04-17T12:36:06Z","published":"2023-12-03T15:40:10Z","title":"D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for\n Few-shot Action Recognition","summary":" Adapting large pre-trained image models to few-shot action recognition has\nproven to be an effective and efficient strategy for learning robust feature\nextractors, which is essential for few-shot learning. Typical fine-tuning based\nadaptation paradigm is prone to overfitting in the few-shot learning scenarios\nand offers little modeling flexibility for learning temporal features in video\ndata. In this work we present the Disentangled-and-Deformable Spatio-Temporal\nAdapter (D$^2$ST-Adapter), which is a novel adapter tuning framework\nwell-suited for few-shot action recognition due to lightweight design and low\nparameter-learning overhead. It is designed in a dual-pathway architecture to\nencode spatial and temporal features in a disentangled manner. In particular,\nwe devise the anisotropic Deformable Spatio-Temporal Attention module as the\ncore component of D$^2$ST-Adapter, which can be tailored with anisotropic\nsampling densities along spatial and temporal domains to learn spatial and\ntemporal features specifically in corresponding pathways, allowing our\nD$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space\nwhile maintaining a lightweight design. Extensive experiments with\ninstantiations of our method on both pre-trained ResNet and ViT demonstrate the\nsuperiority of our method over state-of-the-art methods for few-shot action\nrecognition. Our method is particularly well-suited to challenging scenarios\nwhere temporal dynamics are critical for action recognition.\n","authors":["Wenjie Pei","Qizhong Tan","Guangming Lu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.01431v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11322v1","updated":"2024-04-17T12:34:49Z","published":"2024-04-17T12:34:49Z","title":"VBR: A Vision Benchmark in Rome","summary":" This paper presents a vision and perception research dataset collected in\nRome, featuring RGB data, 3D point clouds, IMU, and GPS data. We introduce a\nnew benchmark targeting visual odometry and SLAM, to advance the research in\nautonomous robotics and computer vision. This work complements existing\ndatasets by simultaneously addressing several issues, such as environment\ndiversity, motion patterns, and sensor frequency. 
It uses up-to-date devices\nand presents effective procedures to accurately calibrate the intrinsics and\nextrinsics of the sensors while addressing temporal synchronization. During\nrecording, we cover multi-floor buildings, gardens, urban and highway\nscenarios. Combining handheld and car-based data collections, our setup can\nsimulate any robot (quadrupeds, quadrotors, autonomous vehicles). The dataset\nincludes an accurate 6-dof ground truth based on a novel methodology that\nrefines the RTK-GPS estimate with LiDAR point clouds through Bundle Adjustment.\nAll sequences, divided into training and testing splits, are accessible through our\nwebsite.\n","authors":["Leonardo Brizi","Emanuele Giacomini","Luca Di Giammarino","Simone Ferrari","Omar Salem","Lorenzo De Rebotti","Giorgio Grisetti"],"pdf_url":"https://arxiv.org/pdf/2404.11322v1.pdf","comment":"Accepted at IEEE ICRA 2024 Website:\n https://rvp-group.net/datasets/slam.html"},{"id":"http://arxiv.org/abs/2404.11318v1","updated":"2024-04-17T12:32:10Z","published":"2024-04-17T12:32:10Z","title":"Leveraging Fine-Grained Information and Noise Decoupling for Remote\n Sensing Change Detection","summary":" Change detection aims to identify remote sensing object changes by analyzing\ndata between bitemporal image pairs. Due to the large temporal and spatial span\nof data collection in change detection image pairs, there is often a\nsignificant amount of task-specific and task-agnostic noise. Previous effort\nhas focused excessively on denoising, at the cost of losing a great deal of\nfine-grained information. In this paper, we revisit the importance of\nfine-grained features in change detection and propose a series of operations\nfor fine-grained information compensation and noise decoupling (FINO). First,\nthe context is utilized to compensate for the fine-grained information in the\nfeature space. Next, a shape-aware and a brightness-aware module are designed\nto improve the capacity for representation learning. The shape-aware module\nguides the backbone network toward more precise shape estimation and the\nextraction of object shape features. The brightness-aware module learns\nan overall brightness estimation to improve the model's robustness to\ntask-agnostic noise. Finally, a task-specific noise decoupling structure is\ndesigned as a way to improve the model's ability to separate noise interference\nfrom feature similarity. With these training schemes, our proposed method\nachieves new state-of-the-art (SOTA) results on multiple change detection\nbenchmarks. The code will be made available.\n","authors":["Qiangang Du","Jinlong Peng","Changan Wang","Xu Chen","Qingdong He","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11317v1","updated":"2024-04-17T12:30:54Z","published":"2024-04-17T12:30:54Z","title":"Improving Composed Image Retrieval via Contrastive Learning with Scaling\n Positives and Negatives","summary":" The Composed Image Retrieval (CIR) task aims to retrieve target images using\na composed query consisting of a reference image and a modified text. Advanced\nmethods often utilize contrastive learning as the optimization objective, which\nbenefits from adequate positive and negative examples. However, the triplet for\nCIR incurs high manual annotation costs, resulting in limited positive\nexamples. 
Furthermore, existing methods commonly use in-batch negative\nsampling, which reduces the negative number available for the model. To address\nthe problem of lack of positives, we propose a data generation method by\nleveraging a multi-modal large language model to construct triplets for CIR. To\nintroduce more negatives during fine-tuning, we design a two-stage fine-tuning\nframework for CIR, whose second stage introduces plenty of static\nrepresentations of negatives to optimize the representation space rapidly. The\nabove two improvements can be effectively stacked and designed to be\nplug-and-play, easily applied to existing CIR models without changing their\noriginal architectures. Extensive experiments and ablation analysis demonstrate\nthat our method effectively scales positives and negatives and achieves\nstate-of-the-art results on both FashionIQ and CIRR datasets. In addition, our\nmethods also perform well in zero-shot composed image retrieval, providing a\nnew CIR solution for the low-resources scenario.\n","authors":["Zhangchi Feng","Richong Zhang","Zhijie Nie"],"pdf_url":"https://arxiv.org/pdf/2404.11317v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.11930v2","updated":"2024-04-17T12:27:25Z","published":"2023-09-21T09:44:39Z","title":"Bridging the Gap: Learning Pace Synchronization for Open-World\n Semi-Supervised Learning","summary":" In open-world semi-supervised learning, a machine learning model is tasked\nwith uncovering novel categories from unlabeled data while maintaining\nperformance on seen categories from labeled data. The central challenge is the\nsubstantial learning gap between seen and novel categories, as the model learns\nthe former faster due to accurate supervisory information. Moreover, capturing\nthe semantics of unlabeled novel category samples is also challenging due to\nthe missing label information. To address the above issues, we introduce 1) the\nadaptive synchronizing marginal loss which imposes class-specific negative\nmargins to alleviate the model bias towards seen classes, and 2) the\npseudo-label contrastive clustering which exploits pseudo-labels predicted by\nthe model to group unlabeled data from the same category together in the output\nspace. Extensive experiments on benchmark datasets demonstrate that previous\napproaches may significantly hinder novel class learning, whereas our method\nstrikingly balances the learning pace between seen and novel classes, achieving\na remarkable 3% average accuracy increase on the ImageNet dataset. Importantly,\nwe find that fine-tuning the self-supervised pre-trained model significantly\nboosts the performance, which is overlooked in prior literature. Our code is\navailable at https://github.com/yebo0216best/LPS-main.\n","authors":["Bo Ye","Kai Gan","Tong Wei","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.11930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11309v1","updated":"2024-04-17T12:21:57Z","published":"2024-04-17T12:21:57Z","title":"Achieving Rotation Invariance in Convolution Operations: Shifting from\n Data-Driven to Mechanism-Assured","summary":" Achieving rotation invariance in deep neural networks without relying on data\nhas always been a hot research topic. Intrinsic rotation invariance can enhance\nthe model's feature representation capability, enabling better performance in\ntasks such as multi-orientation object recognition and detection. 
Based on\nvarious types of non-learnable operators, including gradient, sort, local\nbinary pattern, maximum, etc., this paper designs a set of new convolution\noperations that are naturally invariant to arbitrary rotations. Unlike most\nprevious studies, these rotation-invariant convolutions (RIConvs) have the same\nnumber of learnable parameters and a similar computational process as\nconventional convolution operations, allowing them to be interchangeable. Using\nthe MNIST-Rot dataset, we first verify the invariance of these RIConvs under\nvarious rotation angles and compare their performance with previous\nrotation-invariant convolutional neural networks (RI-CNNs). Two types of\nRIConvs based on gradient operators achieve state-of-the-art results.\nSubsequently, we combine RIConvs with different types and depths of classic CNN\nbackbones. Using the OuTex_00012, MTARSI, and NWPU-RESISC-45 datasets, we test\ntheir performance on texture recognition, aircraft type recognition, and remote\nsensing image classification tasks. The results show that RIConvs significantly\nimprove the accuracy of these CNN backbones, especially when the training data\nis limited. Furthermore, we find that even with data augmentation, RIConvs can\nfurther enhance model performance.\n","authors":["Hanlin Mo","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.11309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11302v1","updated":"2024-04-17T12:13:18Z","published":"2024-04-17T12:13:18Z","title":"A Semantic Segmentation-guided Approach for Ground-to-Aerial Image\n Matching","summary":" Nowadays, the accurate geo-localization of ground-view images has an important\nrole across domains as diverse as journalism, forensics analysis, transports,\nand Earth Observation. This work addresses the problem of matching a query\nground-view image with the corresponding satellite image without GPS data. This\nis done by comparing the features from a ground-view image and a satellite one,\ninnovatively leveraging the latter's corresponding segmentation mask through a\nthree-stream Siamese-like network. The proposed method, Semantic Align Net\n(SAN), focuses on limited Field-of-View (FoV) and ground panorama images\n(images with a FoV of 360{\\deg}). The novelty lies in the fusion of satellite\nimages in combination with their semantic segmentation masks, aimed at ensuring\nthat the model can extract useful features and focus on the significant parts\nof the images. This work shows how SAN, through semantic analysis of images,\nimproves the performance on the unlabelled CVUSA dataset for all the tested\nFoVs.\n","authors":["Francesco Pro","Nikolaos Dionelis","Luca Maiano","Bertrand Le Saux","Irene Amerini"],"pdf_url":"https://arxiv.org/pdf/2404.11302v1.pdf","comment":"6 pages, 2 figures, 2 tables, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2404.11299v1","updated":"2024-04-17T12:12:48Z","published":"2024-04-17T12:12:48Z","title":"Learning from Unlabelled Data with Transformers: Domain Adaptation for\n Semantic Segmentation of High Resolution Aerial Images","summary":" Data from satellites or aerial vehicles are most of the time unlabelled.\nAnnotating such data accurately is difficult, requires expertise, and is costly\nin terms of time. Even if Earth Observation (EO) data were correctly labelled,\nlabels might change over time. Learning from unlabelled data within a\nsemi-supervised learning framework for segmentation of aerial images is\nchallenging. 
In this paper, we develop a new model for semantic segmentation of\nunlabelled images, the Non-annotated Earth Observation Semantic Segmentation\n(NEOS) model. NEOS performs domain adaptation as the target domain does not\nhave ground truth semantic segmentation masks. The distribution inconsistencies\nbetween the target and source domains are due to differences in acquisition\nscenes, environment conditions, sensors, and times. Our model aligns the\nlearned representations of the different domains to make them coincide. The\nevaluation results show that NEOS is successful and outperforms other models\nfor semantic segmentation of unlabelled data.\n","authors":["Nikolaos Dionelis","Francesco Pro","Luca Maiano","Irene Amerini","Bertrand Le Saux"],"pdf_url":"https://arxiv.org/pdf/2404.11299v1.pdf","comment":"6 pages, 7 figures, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2404.10588v2","updated":"2024-04-17T12:09:17Z","published":"2024-04-16T14:13:44Z","title":"Do Counterfactual Examples Complicate Adversarial Training?","summary":" We leverage diffusion models to study the robustness-performance tradeoff of\nrobust classifiers. Our approach introduces a simple, pretrained diffusion\nmethod to generate low-norm counterfactual examples (CEs): semantically altered\ndata which results in different true class membership. We report that the\nconfidence and accuracy of robust models on their clean training data are\nassociated with the proximity of the data to their CEs. Moreover, robust models\nperform very poorly when evaluated on the CEs directly, as they become\nincreasingly invariant to the low-norm, semantic changes brought by CEs. The\nresults indicate a significant overlap between non-robust and semantic\nfeatures, countering the common assumption that non-robust features are not\ninterpretable.\n","authors":["Eric Yeats","Cameron Darwin","Eduardo Ortega","Frank Liu","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2404.10588v2.pdf","comment":"Accepted as a short paper to the GCV Workshop at CVPR'24"},{"id":"http://arxiv.org/abs/2403.00303v2","updated":"2024-04-17T12:05:28Z","published":"2024-03-01T06:13:53Z","title":"ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text\n Detection and Spotting","summary":" In recent years, text-image joint pre-training techniques have shown\npromising results in various tasks. However, in Optical Character Recognition\n(OCR) tasks, aligning text instances with their corresponding text regions in\nimages poses a challenge, as it requires effective alignment between text and\nOCR-Text (referring to the text in images as OCR-Text to distinguish from the\ntext in natural language) rather than a holistic understanding of the overall\nimage content. In this paper, we propose a new pre-training method called\nOCR-Text Destylization Modeling (ODM) that transfers diverse styles of text\nfound in images to a uniform style based on the text prompt. With ODM, we\nachieve better alignment between text and OCR-Text and enable pre-trained\nmodels to adapt to the complex and diverse styles of scene text detection and\nspotting tasks. Additionally, we have designed a new labeling generation method\nspecifically for ODM and combined it with our proposed Text-Controller module\nto address the challenge of annotation costs in OCR tasks, allowing a larger\namount of unlabeled data to participate in pre-training. 
Extensive experiments\non multiple public datasets demonstrate that our method significantly improves\nperformance and outperforms current pre-training methods in scene text\ndetection and spotting tasks. Code is available at\nhttps://github.com/PriNing/ODM.\n","authors":["Chen Duan","Pei Fu","Shan Guo","Qianyi Jiang","Xiaoming Wei"],"pdf_url":"https://arxiv.org/pdf/2403.00303v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.11291v1","updated":"2024-04-17T11:55:45Z","published":"2024-04-17T11:55:45Z","title":"Closely Interactive Human Reconstruction with Proxemics and\n Physics-Guided Adaption","summary":" Existing multi-person human reconstruction approaches mainly focus on\nrecovering accurate poses or avoiding penetration, but overlook the modeling of\nclose interactions. In this work, we tackle the task of reconstructing closely\ninteractive humans from a monocular video. The main challenge of this task\ncomes from insufficient visual information caused by depth ambiguity and severe\ninter-person occlusion. In view of this, we propose to leverage knowledge from\nproxemic behavior and physics to compensate the lack of visual information.\nThis is based on the observation that human interaction has specific patterns\nfollowing the social proxemics. Specifically, we first design a latent\nrepresentation based on Vector Quantised-Variational AutoEncoder (VQ-VAE) to\nmodel human interaction. A proxemics and physics guided diffusion model is then\nintroduced to denoise the initial distribution. We design the diffusion model\nas dual branch with each branch representing one individual such that the\ninteraction can be modeled via cross attention. With the learned priors of\nVQ-VAE and physical constraint as the additional information, our proposed\napproach is capable of estimating accurate poses that are also proxemics and\nphysics plausible. Experimental results on Hi4D, 3DPW, and CHI3D demonstrate\nthat our method outperforms existing approaches. The code is available at\n\\url{https://github.com/boycehbz/HumanInteraction}.\n","authors":["Buzhen Huang","Chen Li","Chongyang Xu","Liang Pan","Yangang Wang","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11291v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2306.16533v2","updated":"2024-04-17T11:38:12Z","published":"2023-06-28T20:06:36Z","title":"ICSVR: Investigating Compositional and Syntactic Understanding in Video\n Retrieval Models","summary":" Video retrieval (VR) involves retrieving the ground truth video from the\nvideo database given a text caption or vice-versa. The two important components\nof compositionality: objects & attributes and actions are joined using correct\nsyntax to form a proper text query. These components (objects & attributes,\nactions and syntax) each play an important role to help distinguish among\nvideos and retrieve the correct ground truth video. However, it is unclear what\nis the effect of these components on the video retrieval performance. We\ntherefore, conduct a systematic study to evaluate the compositional and\nsyntactic understanding of video retrieval models on standard benchmarks such\nas MSRVTT, MSVD and DIDEMO. The study is performed on two categories of video\nretrieval models: (i) which are pre-trained on video-text pairs and fine-tuned\non downstream video retrieval datasets (Eg. Frozen-in-Time, Violet, MCQ etc.)\n(ii) which adapt pre-trained image-text representations like CLIP for video\nretrieval (Eg. CLIP4Clip, XCLIP, CLIP2Video etc.). 
Our experiments reveal that\nactions and syntax play a minor role compared to objects & attributes in video\nunderstanding. Moreover, video retrieval models that use pre-trained image-text\nrepresentations (CLIP) have better syntactic and compositional understanding as\ncompared to models pre-trained on video-text data. The code is available at\nhttps://github.com/IntelLabs/multimodal_cognitive_ai/tree/main/ICSVR\n","authors":["Avinash Madasu","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2306.16533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11273v1","updated":"2024-04-17T11:25:19Z","published":"2024-04-17T11:25:19Z","title":"Training Transformer Models by Wavelet Losses Improves Quantitative and\n Visual Performance in Single Image Super-Resolution","summary":" Transformer-based models have achieved remarkable results in low-level vision\ntasks including image super-resolution (SR). However, early Transformer-based\napproaches that rely on self-attention within non-overlapping windows encounter\nchallenges in acquiring global information. To activate more input pixels\nglobally, hybrid attention models have been proposed. Moreover, training by\nsolely minimizing pixel-wise RGB losses, such as L1, have been found inadequate\nfor capturing essential high-frequency details. This paper presents two\ncontributions: i) We introduce convolutional non-local sparse attention (NLSA)\nblocks to extend the hybrid transformer architecture in order to further\nenhance its receptive field. ii) We employ wavelet losses to train Transformer\nmodels to improve quantitative and subjective performance. While wavelet losses\nhave been explored previously, showing their power in training\nTransformer-based SR models is novel. Our experimental results demonstrate that\nthe proposed model provides state-of-the-art PSNR results as well as superior\nvisual performance across various benchmark datasets.\n","authors":["Cansu Korkmaz","A. Murat Tekalp"],"pdf_url":"https://arxiv.org/pdf/2404.11273v1.pdf","comment":"total of 10 pages including references, 5 tables and 5 figures,\n accepted for NTIRE 2024 Single Image Super Resolution (x4) challenge"},{"id":"http://arxiv.org/abs/2404.11266v1","updated":"2024-04-17T11:17:12Z","published":"2024-04-17T11:17:12Z","title":"Criteria for Uncertainty-based Corner Cases Detection in Instance\n Segmentation","summary":" The operating environment of a highly automated vehicle is subject to change,\ne.g., weather, illumination, or the scenario containing different objects and\nother participants in which the highly automated vehicle has to navigate its\npassengers safely. These situations must be considered when developing and\nvalidating highly automated driving functions. This already poses a problem for\ntraining and evaluating deep learning models because without the costly\nlabeling of thousands of recordings, not knowing whether the data contains\nrelevant, interesting data for further model training, it is a guess under\nwhich conditions and situations the model performs poorly. For this purpose, we\npresent corner case criteria based on the predictive uncertainty. With our\ncorner case criteria, we are able to detect uncertainty-based corner cases of\nan object instance segmentation model without relying on ground truth (GT)\ndata. We evaluated each corner case criterion using the COCO and the NuImages\ndataset to analyze the potential of our approach. 
We also provide a corner case\ndecision function that allows us to distinguish each object into True Positive\n(TP), localization and/or classification corner case, or False Positive (FP).\nWe also present our first results of an iterative training cycle that\noutperforms the baseline and where the data added to the training dataset is\nselected based on the corner case decision function.\n","authors":["Florian Heidecker","Ahmad El-Khateeb","Maarten Bieshaar","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.11266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11265v1","updated":"2024-04-17T11:15:58Z","published":"2024-04-17T11:15:58Z","title":"The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a\n Clean Model on Poisoned Data","summary":" Recently, backdoor attacks have posed a serious security threat to the\ntraining process of deep neural networks (DNNs). The attacked model behaves\nnormally on benign samples but outputs a specific result when the trigger is\npresent. However, compared with the rocketing progress of backdoor attacks,\nexisting defenses are difficult to deal with these threats effectively or\nrequire benign samples to work, which may be unavailable in real scenarios. In\nthis paper, we find that the poisoned samples and benign samples can be\ndistinguished with prediction entropy. This inspires us to propose a novel\ndual-network training framework: The Victim and The Beneficiary (V&B), which\nexploits a poisoned model to train a clean model without extra benign samples.\nFirstly, we sacrifice the Victim network to be a powerful poisoned sample\ndetector by training on suspicious samples. Secondly, we train the Beneficiary\nnetwork on the credible samples selected by the Victim to inhibit backdoor\ninjection. Thirdly, a semi-supervised suppression strategy is adopted for\nerasing potential backdoors and improving model performance. Furthermore, to\nbetter inhibit missed poisoned samples, we propose a strong data augmentation\nmethod, AttentionMix, which works well with our proposed V&B framework.\nExtensive experiments on two widely used datasets against 6 state-of-the-art\nattacks demonstrate that our framework is effective in preventing backdoor\ninjection and robust to various attacks while maintaining the performance on\nbenign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB.\n","authors":["Zixuan Zhu","Rui Wang","Cong Zou","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2404.11265v1.pdf","comment":"13 pages, 6 figures, published to ICCV"},{"id":"http://arxiv.org/abs/2402.17187v3","updated":"2024-04-17T11:08:02Z","published":"2024-02-27T03:53:27Z","title":"PE-MVCNet: Multi-view and Cross-modal Fusion Network for Pulmonary\n Embolism Prediction","summary":" The early detection of a pulmonary embolism (PE) is critical for enhancing\npatient survival rates. Both image-based and non-image-based features are of\nutmost importance in medical classification tasks. In a clinical setting,\nphysicians tend to rely on the contextual information provided by Electronic\nMedical Records (EMR) to interpret medical imaging. However, very few models\neffectively integrate clinical information with imaging data. To address this\nshortcoming, we suggest a multimodal fusion methodology, termed PE-MVCNet,\nwhich capitalizes on Computed Tomography Pulmonary Angiography imaging and EMR\ndata. 
This method comprises the Image-only module with an integrated multi-view\nblock, the EMR-only module, and the Cross-modal Attention Fusion (CMAF) module.\nThese modules cooperate to extract comprehensive features that subsequently\ngenerate predictions for PE. We conducted experiments using the publicly\naccessible Stanford University Medical Center dataset, achieving an AUROC of\n94.1%, an accuracy rate of 90.2%, and an F1 score of 90.6%. Our proposed model\noutperforms existing methodologies, corroborating that our multimodal fusion\nmodel excels compared to models that use a single data modality. Our source\ncode is available at https://github.com/LeavingStarW/PE-MVCNET.\n","authors":["Zhaoxin Guo","Zhipeng Wang","Ruiquan Ge","Jianxun Yu","Feiwei Qin","Yuan Tian","Yuqing Peng","Yonghong Li","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2402.17187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11256v1","updated":"2024-04-17T11:06:42Z","published":"2024-04-17T11:06:42Z","title":"MMCBE: Multi-modality Dataset for Crop Biomass Estimation and Beyond","summary":" Crop biomass, a critical indicator of plant growth, health, and productivity,\nis invaluable for crop breeding programs and agronomic research. However, the\naccurate and scalable quantification of crop biomass remains inaccessible due\nto limitations in existing measurement methods. One of the obstacles impeding\nthe advancement of current crop biomass prediction methodologies is the\nscarcity of publicly available datasets. Addressing this gap, we introduce a\nnew dataset in this domain, i.e. Multi-modality dataset for crop biomass\nestimation (MMCBE). Comprising 216 sets of multi-view drone images, coupled\nwith LiDAR point clouds, and hand-labelled ground truth, MMCBE represents the\nfirst multi-modality one in the field. This dataset aims to establish benchmark\nmethods for crop biomass quantification and foster the development of\nvision-based approaches. We have rigorously evaluated state-of-the-art crop\nbiomass estimation methods using MMCBE and ventured into additional potential\napplications, such as 3D crop reconstruction from drone imagery and novel-view\nrendering. With this publication, we are making our comprehensive dataset\navailable to the broader community.\n","authors":["Xuesong Li","Zeeshan Hayder","Ali Zia","Connor Cassidy","Shiming Liu","Warwick Stiller","Eric Stone","Warren Conaty","Lars Petersson","Vivien Rolland"],"pdf_url":"https://arxiv.org/pdf/2404.11256v1.pdf","comment":"10 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2305.10300v5","updated":"2024-04-17T11:04:57Z","published":"2023-05-17T15:37:47Z","title":"One-Prompt to Segment All Medical Images","summary":" Large foundation models, known for their strong zero-shot generalization,\nhave excelled in visual and language applications. However, applying them to\nmedical image segmentation, a domain with diverse imaging types and target\nlabels, remains an open challenge. Current approaches, such as adapting\ninteractive segmentation models like Segment Anything Model (SAM), require user\nprompts for each sample during inference. Alternatively, transfer learning\nmethods like few/one-shot models demand labeled samples, leading to high costs.\nThis paper introduces a new paradigm toward the universal medical image\nsegmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation\ncombines the strengths of one-shot and interactive methods. 
In the inference\nstage, with just \\textbf{one prompted sample}, it can adeptly handle the unseen\ntask in a single forward pass. We train One-Prompt Model on 64 open-source\nmedical datasets, accompanied by the collection of over 3,000 clinician-labeled\nprompts. Tested on 14 previously unseen datasets, the One-Prompt Model\nshowcases superior zero-shot segmentation capabilities, outperforming a wide\nrange of related methods. The code and data is released as\nhttps://github.com/KidsWithTokens/one-prompt.\n","authors":["Junde Wu","Jiayuan Zhu","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2305.10300v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.12620"},{"id":"http://arxiv.org/abs/2404.11249v1","updated":"2024-04-17T10:56:06Z","published":"2024-04-17T10:56:06Z","title":"A Progressive Framework of Vision-language Knowledge Distillation and\n Alignment for Multilingual Scene","summary":" Pre-trained vision-language (V-L) models such as CLIP have shown excellent\nperformance in many downstream cross-modal tasks. However, most of them are\nonly applicable to the English context. Subsequent research has focused on this\nproblem and proposed improved models, such as CN-CLIP and AltCLIP, to\nfacilitate their applicability to Chinese and even other languages.\nNevertheless, these models suffer from high latency and a large memory\nfootprint in inference, which limits their further deployment on\nresource-constrained edge devices. In this work, we propose a conceptually\nsimple yet effective multilingual CLIP Compression framework and train a\nlightweight multilingual vision-language model, called DC-CLIP, for both\nChinese and English context. In this framework, we collect high-quality Chinese\nand English text-image pairs and design two training stages, including\nmultilingual vision-language feature distillation and alignment. During the\nfirst stage, lightweight image/text student models are designed to learn robust\nvisual/multilingual textual feature representation ability from corresponding\nteacher models, respectively. Subsequently, the multilingual vision-language\nalignment stage enables effective alignment of visual and multilingual textual\nfeatures to further improve the model's multilingual performance. Comprehensive\nexperiments in zero-shot image classification, conducted based on the ELEVATER\nbenchmark, showcase that DC-CLIP achieves superior performance in the English\ncontext and competitive performance in the Chinese context, even with less\ntraining data, when compared to existing models of similar parameter magnitude.\nThe evaluation demonstrates the effectiveness of our designed training\nmechanism.\n","authors":["Wenbo Zhang","Yifan Zhang","Jianfeng Lin","Binqiang Huang","Jinlu Zhang","Wenhao Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11243v1","updated":"2024-04-17T10:49:00Z","published":"2024-04-17T10:49:00Z","title":"Optical Image-to-Image Translation Using Denoising Diffusion Models:\n Heterogeneous Change Detection as a Use Case","summary":" We introduce an innovative deep learning-based method that uses a denoising\ndiffusion-based model to translate low-resolution images to high-resolution\nones from different optical sensors while preserving the contents and avoiding\nundesired artifacts. The proposed method is trained and tested on a large and\ndiverse data set of paired Sentinel-II and Planet Dove images. 
We show that it\ncan solve serious image generation issues observed when the popular\nclassifier-free guided Denoising Diffusion Implicit Model (DDIM) framework is\nused in the task of Image-to-Image Translation of multi-sensor optical remote\nsensing images and that it can generate large images with highly consistent\npatches, both in colors and in features. Moreover, we demonstrate how our\nmethod improves heterogeneous change detection results in two urban areas:\nBeirut, Lebanon, and Austin, USA. Our contributions are: i) a new training and\ntesting algorithm based on denoising diffusion models for optical image\ntranslation; ii) a comprehensive image quality evaluation and ablation study;\niii) a comparison with the classifier-free guided DDIM framework; and iv)\nchange detection experiments on heterogeneous data.\n","authors":["João Gabriel Vinholi","Marco Chini","Anis Amziane","Renato Machado","Danilo Silva","Patrick Matgen"],"pdf_url":"https://arxiv.org/pdf/2404.11243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14065v7","updated":"2024-04-17T10:42:06Z","published":"2023-09-25T11:57:16Z","title":"AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile\n Platform Real-Time RGB-D Semantic Segmentation","summary":" Understanding indoor scenes is crucial for urban studies. Considering the\ndynamic nature of indoor environments, effective semantic segmentation requires\nboth real-time operation and high accuracy.To address this, we propose\nAsymFormer, a novel network that improves real-time semantic segmentation\naccuracy using RGB-D multi-modal information without substantially increasing\nnetwork complexity. AsymFormer uses an asymmetrical backbone for multimodal\nfeature extraction, reducing redundant parameters by optimizing computational\nresource distribution. To fuse asymmetric multimodal features, a Local\nAttention-Guided Feature Selection (LAFS) module is used to selectively fuse\nfeatures from different modalities by leveraging their dependencies.\nSubsequently, a Cross-Modal Attention-Guided Feature Correlation Embedding\n(CMA) module is introduced to further extract cross-modal representations. The\nAsymFormer demonstrates competitive results with 54.1% mIoU on NYUv2 and 49.1%\nmIoU on SUNRGBD. Notably, AsymFormer achieves an inference speed of 65 FPS (79\nFPS after implementing mixed precision quantization) on RTX3090, demonstrating\nthat AsymFormer can strike a balance between high accuracy and efficiency.\n","authors":["Siqi Du","Weixi Wang","Renzhong Guo","Ruisheng Wang","Yibin Tian","Shengjun Tang"],"pdf_url":"https://arxiv.org/pdf/2309.14065v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11236v1","updated":"2024-04-17T10:38:51Z","published":"2024-04-17T10:38:51Z","title":"ONOT: a High-Quality ICAO-compliant Synthetic Mugshot Dataset","summary":" Nowadays, state-of-the-art AI-based generative models represent a viable\nsolution to overcome privacy issues and biases in the collection of datasets\ncontaining personal information, such as faces. Following this intuition, in\nthis paper we introduce ONOT, a synthetic dataset specifically focused on the\ngeneration of high-quality faces in adherence to the requirements of the\nISO/IEC 39794-5 standards that, following the guidelines of the International\nCivil Aviation Organization (ICAO), defines the interchange formats of face\nimages in electronic Machine-Readable Travel Documents (eMRTD). 
The strictly\ncontrolled and varied mugshot images included in ONOT are useful in research\nfields related to the analysis of face images in eMRTD, such as Morphing Attack\nDetection and Face Quality Assessment. The dataset is publicly released, in\ncombination with the generation procedure details, in order to improve\nreproducibility and enable future extensions.\n","authors":["Nicolò Di Domenico","Guido Borghi","Annalisa Franco","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.11236v1.pdf","comment":"Paper accepted in IEEE FG 2024"},{"id":"http://arxiv.org/abs/2404.11230v1","updated":"2024-04-17T10:26:49Z","published":"2024-04-17T10:26:49Z","title":"Energy-Efficient Uncertainty-Aware Biomass Composition Prediction at the\n Edge","summary":" Clover fixes nitrogen from the atmosphere to the ground, making\ngrass-clover mixtures highly desirable to reduce external nitrogen\nfertilization. Herbage containing clover additionally promotes higher food\nintake, resulting in higher milk production. Herbage probing however remains\nlargely unused as it requires a time-intensive manual laboratory analysis.\nWithout this information, farmers are unable to perform localized clover sowing\nor take targeted fertilization decisions. Deep learning algorithms have been\nproposed with the goal of estimating the dry biomass composition from images of\nthe grass directly in the fields. The energy-intensive nature of deep learning\nhowever limits deployment to practical edge devices such as smartphones. This\npaper proposes to fill this gap by applying filter pruning to reduce the energy\nrequirement of existing deep learning solutions. We report that although pruned\nnetworks are accurate on controlled, high-quality images of the grass, they\nstruggle to generalize to real-world smartphone images that are blurry or taken\nfrom challenging angles. We address this challenge by training filter-pruned\nmodels using a variance attenuation loss so they can predict the uncertainty of\ntheir predictions. When the uncertainty exceeds a threshold, we re-infer using\na more accurate unpruned model. This hybrid approach allows us to reduce energy\nconsumption while retaining a high accuracy. We evaluate our algorithm on two\ndatasets: the GrassClover and the Irish clover, using an NVIDIA Jetson Nano edge\ndevice. We find that we reduce energy consumption with respect to\nstate-of-the-art solutions by 50% on average with only a 4% accuracy loss.\n","authors":["Muhammad Zawish","Paul Albert","Flavio Esposito","Steven Davy","Lizy Abraham"],"pdf_url":"https://arxiv.org/pdf/2404.11230v1.pdf","comment":"The paper has been accepted to CVPR 2024 5th Workshop on Vision for\n Agriculture"},{"id":"http://arxiv.org/abs/2404.11226v1","updated":"2024-04-17T10:20:16Z","published":"2024-04-17T10:20:16Z","title":"Simple In-place Data Augmentation for Surveillance Object Detection","summary":" Motivated by the need to improve model performance in traffic monitoring\ntasks with limited labeled samples, we propose a straightforward augmentation\ntechnique tailored for object detection datasets, specifically designed for\nstationary camera-based applications. Our approach focuses on placing objects\nin the same positions as the originals to ensure its effectiveness. 
By applying\nin-place augmentation on objects from the same camera input image, we address\nthe challenge of overlapping with original and previously selected objects.\nThrough extensive testing on two traffic monitoring datasets, we illustrate the\nefficacy of our augmentation strategy in improving model performance,\nparticularly in scenarios with limited labeled samples and imbalanced class\ndistributions. Notably, our method achieves comparable performance to models\ntrained on the entire dataset while utilizing only 8.5 percent of the original\ndata. Moreover, we report significant improvements, with mAP@.5 increasing from\n0.4798 to 0.5025, and the mAP@.5:.95 rising from 0.29 to 0.3138 on the\nFishEye8K dataset. These results highlight the potential of our augmentation\napproach in enhancing object detection models for traffic monitoring\napplications.\n","authors":["Munkh-Erdene Otgonbold","Ganzorig Batnasan","Munkhjargal Gochoo"],"pdf_url":"https://arxiv.org/pdf/2404.11226v1.pdf","comment":"CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.11214v1","updated":"2024-04-17T09:58:53Z","published":"2024-04-17T09:58:53Z","title":"Feature Corrective Transfer Learning: End-to-End Solutions to Object\n Detection in Non-Ideal Visual Conditions","summary":" A significant challenge in the field of object detection lies in the system's\nperformance under non-ideal imaging conditions, such as rain, fog, low\nillumination, or raw Bayer images that lack ISP processing. Our study\nintroduces \"Feature Corrective Transfer Learning\", a novel approach that\nleverages transfer learning and a bespoke loss function to facilitate the\nend-to-end detection of objects in these challenging scenarios without the need\nto convert non-ideal images into their RGB counterparts. In our methodology, we\ninitially train a comprehensive model on a pristine RGB image dataset.\nSubsequently, non-ideal images are processed by comparing their feature maps\nagainst those from the initial ideal RGB model. This comparison employs the\nExtended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function\ndesigned to quantify similarities and integrate them into the detection loss.\nThis approach refines the model's ability to perform object detection across\nvarying conditions through direct feature map correction, encapsulating the\nessence of Feature Corrective Transfer Learning. Experimental validation on\nvariants of the KITTI dataset demonstrates a significant improvement in mean\nAverage Precision (mAP), resulting in a 3.8-8.1% relative enhancement in\ndetection under non-ideal conditions compared to the baseline model, and a less\nmarginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved\nunder ideal conditions by the standard Faster RCNN algorithm.\n","authors":["Chuheng Wei","Guoyuan Wu","Matthew J. Barth"],"pdf_url":"https://arxiv.org/pdf/2404.11214v1.pdf","comment":"10 pages, 3 figures, accepted by 2024 CVPR UG2 Workshop"},{"id":"http://arxiv.org/abs/2311.10339v2","updated":"2024-04-17T09:50:25Z","published":"2023-11-17T05:49:50Z","title":"A2XP: Towards Private Domain Generalization","summary":" Deep Neural Networks (DNNs) have become pivotal in various fields, especially\nin computer vision, outperforming previous methodologies. A critical challenge\nin their deployment is the bias inherent in data across different domains, such\nas image style and environmental conditions, leading to domain gaps. 
This\nnecessitates techniques for learning general representations from biased\ntraining data, known as domain generalization. This paper presents Attend to\neXpert Prompts (A2XP), a novel approach for domain generalization that\npreserves the privacy and integrity of the network architecture. A2XP consists\nof two phases: Expert Adaptation and Domain Generalization. In the first phase,\nprompts for each source domain are optimized to guide the model towards the\noptimal direction. In the second phase, two embedder networks are trained to\neffectively amalgamate these expert prompts, aiming for an optimal output. Our\nextensive experiments demonstrate that A2XP achieves state-of-the-art results\nover existing non-private domain generalization methods. The experimental\nresults validate that the proposed approach not only tackles the domain\ngeneralization challenge in DNNs but also offers a privacy-preserving,\nefficient solution to the broader field of computer vision.\n","authors":["Geunhyeok Yu","Hyoseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2311.10339v2.pdf","comment":"Accepted to CVPR 2024. Our code is available at\n https://github.com/AIRLABkhu/A2XP"},{"id":"http://arxiv.org/abs/2404.11209v1","updated":"2024-04-17T09:45:43Z","published":"2024-04-17T09:45:43Z","title":"Prompt-Guided Generation of Structured Chest X-Ray Report Using a\n Pre-trained LLM","summary":" Medical report generation automates radiology descriptions from images,\neasing the burden on physicians and minimizing errors. However, current methods\nlack structured outputs and physician interactivity for clear, clinically\nrelevant reports. Our method introduces a prompt-guided approach to generate\nstructured chest X-ray reports using a pre-trained large language model (LLM).\nFirst, we identify anatomical regions in chest X-rays to generate focused\nsentences that center on key visual elements, thereby establishing a structured\nreport foundation with anatomy-based sentences. We also convert the detected\nanatomy into textual prompts conveying anatomical comprehension to the LLM.\nAdditionally, the clinical context prompts guide the LLM to emphasize\ninteractivity and clinical requirements. By integrating anatomy-focused\nsentences and anatomy/clinical prompts, the pre-trained LLM can generate\nstructured chest X-ray reports tailored to prompted anatomical regions and\nclinical contexts. We evaluate using language generation and clinical\neffectiveness metrics, demonstrating strong performance.\n","authors":["Hongzhao Li","Hongyu Wang","Xia Sun","Hua He","Jun Feng"],"pdf_url":"https://arxiv.org/pdf/2404.11209v1.pdf","comment":"Accepted by IEEE Conference on Multimedia Expo 2024"},{"id":"http://arxiv.org/abs/2404.11207v1","updated":"2024-04-17T09:39:07Z","published":"2024-04-17T09:39:07Z","title":"Exploring the Transferability of Visual Prompting for Multimodal Large\n Language Models","summary":" Although Multimodal Large Language Models (MLLMs) have demonstrated promising\nversatile capabilities, their performance is still inferior to specialized\nmodels on downstream tasks, which makes adaptation necessary to enhance their\nutility. However, fine-tuning methods require independent training for every\nmodel, leading to huge computation and memory overheads. In this paper, we\npropose a novel setting where we aim to improve the performance of diverse\nMLLMs with a group of shared parameters optimized for a downstream task. 
To\nachieve this, we propose Transferable Visual Prompting (TVP), a simple and\neffective approach to generate visual prompts that can transfer to different\nmodels and improve their performance on downstream tasks after trained on only\none model. We introduce two strategies to address the issue of cross-model\nfeature corruption of existing visual prompting methods and enhance the\ntransferability of the learned prompts, including 1) Feature Consistency\nAlignment: which imposes constraints to the prompted feature changes to\nmaintain task-agnostic knowledge; 2) Task Semantics Enrichment: which\nencourages the prompted images to contain richer task-specific semantics with\nlanguage guidance. We validate the effectiveness of TVP through extensive\nexperiments with 6 modern MLLMs on a wide variety of tasks ranging from object\nrecognition and counting to multimodal reasoning and hallucination correction.\n","authors":["Yichi Zhang","Yinpeng Dong","Siyuan Zhang","Tianzan Min","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.11207v1.pdf","comment":"Accepted in CVPR 2024 as Poster (Highlight)"},{"id":"http://arxiv.org/abs/2404.11205v1","updated":"2024-04-17T09:37:25Z","published":"2024-04-17T09:37:25Z","title":"Kathakali Hand Gesture Recognition With Minimal Data","summary":" The Indian classical dance-drama Kathakali has a set of hand gestures called\nMudras, which form the fundamental units of all its dance moves and postures.\nRecognizing the depicted mudra becomes one of the first steps in its digital\nprocessing. The work treats the problem as a 24-class classification task and\nproposes a vector-similarity-based approach using pose estimation, eliminating\nthe need for further training or fine-tuning. This approach overcomes the\nchallenge of data scarcity that limits the application of AI in similar\ndomains. The method attains 92% accuracy which is a similar or better\nperformance as other model-training-based works existing in the domain, with\nthe added advantage that the method can still work with data sizes as small as\n1 or 5 samples with a slightly reduced performance. Working with images,\nvideos, and even real-time streams is possible. The system can work with\nhand-cropped or full-body images alike. We have developed and made public a\ndataset for the Kathakali Mudra Recognition as part of this work.\n","authors":["Kavitha Raju","Nandini J. Warrier"],"pdf_url":"https://arxiv.org/pdf/2404.11205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11202v1","updated":"2024-04-17T09:33:31Z","published":"2024-04-17T09:33:31Z","title":"GhostNetV3: Exploring the Training Strategies for Compact Models","summary":" Compact neural networks are specially designed for applications on edge\ndevices with faster inference speed yet modest performance. However, training\nstrategies of compact models are borrowed from that of conventional models at\npresent, which ignores their difference in model capacity and thus may impede\nthe performance of compact models. In this paper, by systematically\ninvestigating the impact of different training ingredients, we introduce a\nstrong training strategy for compact models. We find that the appropriate\ndesigns of re-parameterization and knowledge distillation are crucial for\ntraining high-performance compact models, while some commonly used data\naugmentations for training conventional models, such as Mixup and CutMix, lead\nto worse performance. 
Our experiments on ImageNet-1K dataset demonstrate that\nour specialized training strategy for compact models is applicable to various\narchitectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.\nSpecifically, equipped with our strategy, GhostNetV3 1.3$\\times$ achieves a\ntop-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile\ndevices, surpassing its ordinarily trained counterpart by a large margin.\nMoreover, our observation can also be extended to object detection scenarios.\nPyTorch code and checkpoints can be found at\nhttps://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.\n","authors":["Zhenhua Liu","Zhiwei Hao","Kai Han","Yehui Tang","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02977v2","updated":"2024-04-17T09:09:17Z","published":"2023-10-04T17:12:18Z","title":"T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation","summary":" Recent methods in text-to-3D leverage powerful pretrained diffusion models to\noptimize NeRF. Notably, these methods are able to produce high-quality 3D\nscenes without training on 3D data. Due to the open-ended nature of the task,\nmost studies evaluate their results with subjective case studies and user\nexperiments, thereby presenting a challenge in quantitatively addressing the\nquestion: How has current progress in Text-to-3D gone so far? In this paper, we\nintroduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing\ndiverse text prompts of three increasing complexity levels that are specially\ndesigned for 3D generation. To assess both the subjective quality and the text\nalignment, we propose two automatic metrics based on multi-view images produced\nby the 3D contents. The quality metric combines multi-view text-image scores\nand regional convolution to detect quality and view inconsistency. The\nalignment metric uses multi-view captioning and GPT-4 evaluation to measure\ntext-3D consistency. Both metrics closely correlate with different dimensions\nof human judgments, providing a paradigm for efficiently evaluating text-to-3D\nmodels. The benchmarking results, shown in Fig. 1, reveal performance\ndifferences among an extensive 10 prevalent text-to-3D methods. Our analysis\nfurther highlights the common struggles for current methods on generating\nsurroundings and multi-object scenes, as well as the bottleneck of leveraging\n2D guidance for 3D generation. Our project page is available at:\nhttps://t3bench.com.\n","authors":["Yuze He","Yushi Bai","Matthieu Lin","Wang Zhao","Yubin Hu","Jenny Sheng","Ran Yi","Juanzi Li","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.02977v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.18402v2","updated":"2024-04-17T08:57:35Z","published":"2023-11-30T09:51:53Z","title":"MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition","summary":" Large-scale pre-trained models have demonstrated impressive performance in\nvision and language tasks within open-world scenarios. Due to the lack of\ncomparable pre-trained models for 3D shapes, recent methods utilize\nlanguage-image pre-training to realize zero-shot 3D shape recognition. However,\ndue to the modality gap, pretrained language-image models are not confident\nenough in the generalization to 3D shape recognition. 
Consequently, this paper\naims to improve the confidence with view selection and hierarchical prompts.\nLeveraging the CLIP model as an example, we employ view selection on the vision\nside by identifying views with high prediction confidence from multiple\nrendered views of a 3D shape. On the textual side, the strategy of hierarchical\nprompts is proposed for the first time. The first layer prompts several\nclassification candidates with traditional class-level descriptions, while the\nsecond layer refines the prediction based on function-level descriptions or\nfurther distinctions between the candidates. Remarkably, without the need for\nadditional training, our proposed method achieves impressive zero-shot 3D\nclassification accuracies of 84.44%, 91.51%, and 66.17% on ModelNet40,\nModelNet10, and ShapeNet Core55, respectively. Furthermore, we will make the\ncode publicly available to facilitate reproducibility and further research in\nthis area.\n","authors":["Dan Song","Xinwei Fu","Weizhi Nie","Wenhui Li","Lanjun Wang","You Yang","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10710v2","updated":"2024-04-17T08:44:30Z","published":"2024-04-16T16:36:50Z","title":"Dual Modalities of Text: Visual and Textual Generative Pre-training","summary":" Harnessing visual texts represents a burgeoning frontier in the evolution of\nlanguage modeling. In this paper, we introduce a novel pre-training framework\nfor a suite of pixel-based autoregressive language models, pre-training on a\ncorpus of over 400 million documents rendered as RGB images. Our approach is\ncharacterized by a dual-modality training regimen, engaging both visual data\nthrough next patch prediction with a regression head and textual data via next\ntoken prediction with a classification head. This study is particularly focused\non investigating the synergistic interplay between visual and textual\nmodalities of language. Our comprehensive evaluation across a diverse array of\nbenchmarks reveals that the confluence of visual and textual data substantially\naugments the efficacy of pixel-based language models. Notably, our findings\nshow that a unidirectional pixel-based model, devoid of textual data during\ntraining, can match the performance levels of advanced bidirectional\npixel-based models on various language understanding benchmarks. This work\nhighlights the considerable untapped potential of integrating visual and\ntextual information for language modeling purposes. We will release our code,\ndata, and checkpoints to inspire further research advancement.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00311v3","updated":"2024-04-17T08:40:57Z","published":"2023-12-01T03:05:21Z","title":"3D Face Reconstruction with the Geometric Guidance of Facial Part\n Segmentation","summary":" 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in\nvarious applications. However, existing methods struggle to reconstruct faces\nwith extreme expressions due to deficiencies in supervisory signals, such as\nsparse or inaccurate landmarks. Segmentation information contains effective\ngeometric contexts for face reconstruction. 
Certain attempts intuitively depend\non differentiable renderers to compare the rendered silhouettes of\nreconstruction with segmentation, which is prone to issues like local optima\nand gradient instability. In this paper, we fully utilize the facial part\nsegmentation geometry by introducing Part Re-projection Distance Loss (PRDL).\nSpecifically, PRDL transforms facial part segmentation into 2D points and\nre-projects the reconstruction onto the image plane. Subsequently, by\nintroducing grid anchors and computing different statistical distances from\nthese anchors to the point sets, PRDL establishes geometry descriptors to\noptimize the distribution of the point sets for face reconstruction. PRDL\nexhibits a clear gradient compared to the renderer-based methods and presents\nstate-of-the-art reconstruction performance in extensive quantitative and\nqualitative experiments. Our project is available at\nhttps://github.com/wang-zidu/3DDFA-V3 .\n","authors":["Zidu Wang","Xiangyu Zhu","Tianshuo Zhang","Baiqin Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2312.00311v3.pdf","comment":"CVPR2024 (Highlight)"},{"id":"http://arxiv.org/abs/2312.08555v2","updated":"2024-04-17T08:38:54Z","published":"2023-12-13T23:00:48Z","title":"KDAS: Knowledge Distillation via Attention Supervision Framework for\n Polyp Segmentation","summary":" Polyp segmentation, a contentious issue in medical imaging, has seen numerous\nproposed methods aimed at improving the quality of segmented masks. While\ncurrent state-of-the-art techniques yield impressive results, the size and\ncomputational cost of these models create challenges for practical industry\napplications. To address this challenge, we present KDAS, a Knowledge\nDistillation framework that incorporates attention supervision, and our\nproposed Symmetrical Guiding Module. This framework is designed to facilitate a\ncompact student model with fewer parameters, allowing it to learn the strengths\nof the teacher model and mitigate the inconsistency between teacher features\nand student features, a common challenge in Knowledge Distillation, via the\nSymmetrical Guiding Module. Through extensive experiments, our compact models\ndemonstrate their strength by achieving competitive results with\nstate-of-the-art methods, offering a promising approach to creating compact\nmodels with high accuracy for polyp segmentation and in the medical imaging\nfield. The implementation is available on https://github.com/huyquoctrinh/KDAS.\n","authors":["Quoc-Huy Trinh","Minh-Van Nguyen","Phuoc-Thao Vo Thi"],"pdf_url":"https://arxiv.org/pdf/2312.08555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11161v1","updated":"2024-04-17T08:21:02Z","published":"2024-04-17T08:21:02Z","title":"Pre-processing matters: A segment search method for WSI classification","summary":" Pre-processing for whole slide images can affect classification performance\nboth in the training and inference stages. Our study analyzes the impact of\npre-processing parameters on inference and training across single- and\nmultiple-domain datasets. However, searching for an optimal parameter set is\ntime-consuming. To overcome this, we propose a novel Similarity-based Simulated\nAnnealing approach for fast parameter tuning to enhance inference performance\non single-domain data. Our method demonstrates significant performance\nimprovements in accuracy, which raise accuracy from 0.512 to 0.847 in a single\ndomain. 
We further extend our insight into training performance in multi-domain\ndata by employing a novel Bayesian optimization to search optimal\npre-processing parameters, resulting in a high AUC of 0.967. We highlight that\nbetter pre-processing for WSI can contribute to further accuracy improvement in\nthe histology area.\n","authors":["Jun Wang","Yufei Cui","Yu Mao","Nan Guan","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2404.11161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11159v1","updated":"2024-04-17T08:15:25Z","published":"2024-04-17T08:15:25Z","title":"Deep Portrait Quality Assessment. A NTIRE 2024 Challenge Survey","summary":" This paper reviews the NTIRE 2024 Portrait Quality Assessment Challenge,\nhighlighting the proposed solutions and results. This challenge aims to obtain\nan efficient deep neural network capable of estimating the perceptual quality\nof real portrait photos. The methods must generalize to diverse scenes and\ndiverse lighting conditions (indoor, outdoor, low-light), movement, blur, and\nother challenging conditions. In the challenge, 140 participants registered,\nand 35 submitted results during the challenge period. The performance of the\ntop 5 submissions is reviewed and provided here as a gauge for the current\nstate-of-the-art in Portrait Quality Assessment.\n","authors":["Nicolas Chahine","Marcos V. Conde","Daniela Carfora","Gabriel Pacianotto","Benoit Pochon","Sira Ferradans","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.11159v1.pdf","comment":"CVPRW - NTIRE 2024"},{"id":"http://arxiv.org/abs/2404.11156v1","updated":"2024-04-17T08:09:25Z","published":"2024-04-17T08:09:25Z","title":"Learning SO(3)-Invariant Semantic Correspondence via Local Shape\n Transform","summary":" Establishing accurate 3D correspondences between shapes stands as a pivotal\nchallenge with profound implications for computer vision and robotics. However,\nexisting self-supervised methods for this problem assume perfect input shape\nalignment, restricting their real-world applicability. In this work, we\nintroduce a novel self-supervised Rotation-Invariant 3D correspondence learner\nwith Local Shape Transform, dubbed RIST, that learns to establish dense\ncorrespondences between shapes even under challenging intra-class variations\nand arbitrary orientations. Specifically, RIST learns to dynamically formulate\nan SO(3)-invariant local shape transform for each point, which maps the\nSO(3)-equivariant global shape descriptor of the input shape to a local shape\ndescriptor. These local shape descriptors are provided as inputs to our decoder\nto facilitate point cloud self- and cross-reconstruction. Our proposed\nself-supervised training pipeline encourages semantically corresponding points\nfrom different shapes to be mapped to similar local shape descriptors, enabling\nRIST to establish dense point-wise correspondences. 
RIST demonstrates\nstate-of-the-art performances on 3D part label transfer and semantic keypoint\ntransfer given arbitrarily rotated point cloud pairs, outperforming existing\nmethods by significant margins.\n","authors":["Chunghyun Park","Seungwook Sim","Jaesik Park","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.11156v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11155v1","updated":"2024-04-17T08:08:34Z","published":"2024-04-17T08:08:34Z","title":"HybriMap: Hybrid Clues Utilization for Effective Vectorized HD Map\n Construction","summary":" Constructing vectorized high-definition maps from surround-view cameras has\ngarnered significant attention in recent years. However, the commonly employed\nmulti-stage sequential workflow in prevailing approaches often leads to the\nloss of early-stage information, particularly in perspective-view features.\nUsually, such loss is observed as an instance missing or shape mismatching in\nthe final birds-eye-view predictions. To address this concern, we propose a\nnovel approach, namely \\textbf{HybriMap}, which effectively exploits clues from\nhybrid features to ensure the delivery of valuable information. Specifically,\nwe design the Dual Enhancement Module, to enable both explicit integration and\nimplicit modification under the guidance of hybrid features. Additionally, the\nperspective keypoints are utilized as supervision, further directing the\nfeature enhancement process. Extensive experiments conducted on existing\nbenchmarks have demonstrated the state-of-the-art performance of our proposed\napproach.\n","authors":["Chi Zhang","Qi Song","Feifei Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2404.11155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07773v2","updated":"2024-04-17T08:06:51Z","published":"2024-04-11T14:08:45Z","title":"ConsistencyDet: A Robust Object Detector with a Denoising Paradigm of\n Consistency Model","summary":" Object detection, a quintessential task in the realm of perceptual computing,\ncan be tackled using a generative methodology. In the present study, we\nintroduce a novel framework designed to articulate object detection as a\ndenoising diffusion process, which operates on the perturbed bounding boxes of\nannotated entities. This framework, termed ConsistencyDet, leverages an\ninnovative denoising concept known as the Consistency Model. The hallmark of\nthis model is its self-consistency feature, which empowers the model to map\ndistorted information from any temporal stage back to its pristine state,\nthereby realizing a \"one-step denoising\" mechanism. Such an attribute markedly\nelevates the operational efficiency of the model, setting it apart from the\nconventional Diffusion Model. Throughout the training phase, ConsistencyDet\ninitiates the diffusion sequence with noise-infused boxes derived from the\nground-truth annotations and conditions the model to perform the denoising\ntask. Subsequently, in the inference stage, the model employs a denoising\nsampling strategy that commences with bounding boxes randomly sampled from a\nnormal distribution. Through iterative refinement, the model transforms an\nassortment of arbitrarily generated boxes into definitive detections.\nComprehensive evaluations employing standard benchmarks, such as MS-COCO and\nLVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in\nperformance metrics. 
Our code is available at\nhttps://github.com/Tankowa/ConsistencyDet.\n","authors":["Lifan Jiang","Zhihui Wang","Changmiao Wang","Ming Li","Jiaxu Leng","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11152v1","updated":"2024-04-17T08:05:04Z","published":"2024-04-17T08:05:04Z","title":"Multi-target and multi-stage liver lesion segmentation and detection in\n multi-phase computed tomography scans","summary":" Multi-phase computed tomography (CT) scans use contrast agents to highlight\ndifferent anatomical structures within the body to improve the probability of\nidentifying and detecting anatomical structures of interest and abnormalities\nsuch as liver lesions. Yet, detecting these lesions remains a challenging task\nas these lesions vary significantly in their size, shape, texture, and contrast\nwith respect to surrounding tissue. Therefore, radiologists need to have an\nextensive experience to be able to identify and detect these lesions.\nSegmentation-based neural networks can assist radiologists with this task.\nCurrent state-of-the-art lesion segmentation networks use the encoder-decoder\ndesign paradigm based on the UNet architecture where the multi-phase CT scan\nvolume is fed to the network as a multi-channel input. Although this approach\nutilizes information from all the phases and outperform single-phase\nsegmentation networks, we demonstrate that their performance is not optimal and\ncan be further improved by incorporating the learning from models trained on\neach single-phase individually. Our approach comprises three stages. The first\nstage identifies the regions within the liver where there might be lesions at\nthree different scales (4, 8, and 16 mm). The second stage includes the main\nsegmentation model trained using all the phases as well as a segmentation model\ntrained on each of the phases individually. The third stage uses the\nmulti-phase CT volumes together with the predictions from each of the\nsegmentation models to generate the final segmentation map. Overall, our\napproach improves relative liver lesion segmentation performance by 1.6% while\nreducing performance variability across subjects by 8% when compared to the\ncurrent state-of-the-art models.\n","authors":["Abdullah F. Al-Battal","Soan T. M. Duong","Van Ha Tang","Quang Duc Tran","Steven Q. H. Truong","Chien Phan","Truong Q. Nguyen","Cheolhong An"],"pdf_url":"https://arxiv.org/pdf/2404.11152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11151v1","updated":"2024-04-17T08:01:55Z","published":"2024-04-17T08:01:55Z","title":"REACTO: Reconstructing Articulated Objects from a Single Video","summary":" In this paper, we address the challenge of reconstructing general articulated\n3D objects from a single video. Existing works employing dynamic neural\nradiance fields have advanced the modeling of articulated objects like humans\nand animals from videos, but face challenges with piece-wise rigid general\narticulated objects due to limitations in their deformation models. To tackle\nthis, we propose Quasi-Rigid Blend Skinning, a novel deformation model that\nenhances the rigidity of each part while maintaining flexible deformation of\nthe joints. 
Our primary insight combines three distinct approaches: 1) an\nenhanced bone rigging system for improved component modeling, 2) the use of\nquasi-sparse skinning weights to boost part rigidity and reconstruction\nfidelity, and 3) the application of geodesic point assignment for precise\nmotion and seamless deformation. Our method outperforms previous works in\nproducing higher-fidelity 3D reconstructions of general articulated objects, as\ndemonstrated on both real and synthetic datasets. Project page:\nhttps://chaoyuesong.github.io/REACTO.\n","authors":["Chaoyue Song","Jiacheng Wei","Chuan-Sheng Foo","Guosheng Lin","Fayao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09326v2","updated":"2024-04-17T07:46:28Z","published":"2024-04-14T18:57:38Z","title":"Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision\n Transformers","summary":" Few-shot knowledge distillation recently emerged as a viable approach to\nharness the knowledge of large-scale pre-trained models, using limited data and\ncomputational resources. In this paper, we propose a novel few-shot feature\ndistillation approach for vision transformers. Our approach is based on two key\nsteps. Leveraging the fact that vision transformers have a consistent\ndepth-wise structure, we first copy the weights from intermittent layers of\nexisting pre-trained vision transformers (teachers) into shallower\narchitectures (students), where the intermittence factor controls the\ncomplexity of the student transformer with respect to its teacher. Next, we\nemploy an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge\ninto the student in a few-shot scenario, aiming to recover the information\nprocessing carried out by the skipped teacher layers. We present comprehensive\nexperiments with supervised and self-supervised transformers as teachers, on\nfive data sets from various domains, including natural, medical and satellite\nimages. The empirical results confirm the superiority of our approach over\ncompetitive baselines. Moreover, the ablation results demonstrate the\nusefulness of each component of the proposed pipeline.\n","authors":["Diana-Nicoleta Grigore","Mariana-Iuliana Georgescu","Jon Alvarez Justo","Tor Johansen","Andreea Iuliana Ionescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.09326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03788v2","updated":"2024-04-17T07:41:48Z","published":"2024-01-08T10:08:48Z","title":"Low-light Image Enhancement via CLIP-Fourier Guided Wavelet Diffusion","summary":" Low-light image enhancement techniques have significantly progressed, but\nunstable image quality recovery and unsatisfactory visual perception are still\nsignificant challenges. To solve these problems, we propose a novel and robust\nlow-light image enhancement method via CLIP-Fourier Guided Wavelet Diffusion,\nabbreviated as CFWD. Specifically, CFWD leverages multimodal visual-language\ninformation in the frequency domain space created by multiple wavelet\ntransforms to guide the enhancement process. Multi-scale supervision across\ndifferent modalities facilitates the alignment of image features with semantic\nfeatures during the wavelet diffusion process, effectively bridging the gap\nbetween degraded and normal domains. 
Moreover, to further promote the effective\nrecovery of the image details, we combine the Fourier transform based on the\nwavelet transform and construct a Hybrid High Frequency Perception Module\n(HFPM) with a significant perception of the detailed features. This module\navoids the diversity confusion of the wavelet diffusion process by guiding the\nfine-grained structure recovery of the enhancement results to achieve\nfavourable metric and perceptually oriented enhancement. Extensive quantitative\nand qualitative experiments on publicly available real-world benchmarks show\nthat our approach outperforms existing state-of-the-art methods, achieving\nsignificant progress in image quality and noise suppression. The project code\nis available at https://github.com/hejh8/CFWD.\n","authors":["Minglong Xue","Jinhong He","Wenhai Wang","Mingliang Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.03788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08926v2","updated":"2024-04-17T07:38:32Z","published":"2024-04-13T08:27:10Z","title":"Diffusion Models Meet Remote Sensing: Principles, Methods, and\n Perspectives","summary":" As a newly emerging advance in deep generative models, diffusion models have\nachieved state-of-the-art results in many fields, including computer vision,\nnatural language processing, and molecule design. The remote sensing community\nhas also noticed the powerful ability of diffusion models and quickly applied\nthem to a variety of tasks for image processing. Given the rapid increase in\nresearch on diffusion models in the field of remote sensing, it is necessary to\nconduct a comprehensive review of existing diffusion model-based remote sensing\npapers, to help researchers recognize the potential of diffusion models and\nprovide some directions for further exploration. Specifically, this paper first\nintroduces the theoretical background of diffusion models, and then\nsystematically reviews the applications of diffusion models in remote sensing,\nincluding image generation, enhancement, and interpretation. Finally, the\nlimitations of existing remote sensing diffusion models and worthy research\ndirections for further exploration are discussed and summarized.\n","authors":["Yidan Liu","Jun Yue","Shaobo Xia","Pedram Ghamisi","Weiying Xie","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.08926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11139v1","updated":"2024-04-17T07:34:21Z","published":"2024-04-17T07:34:21Z","title":"GeoReF: Geometric Alignment Across Shape Variation for Category-level\n Object Pose Refinement","summary":" Object pose refinement is essential for robust object pose estimation.\nPrevious work has made significant progress towards instance-level object pose\nrefinement. Yet, category-level pose refinement is a more challenging problem\ndue to large shape variations within a category and the discrepancies between\nthe target object and the shape prior. To address these challenges, we\nintroduce a novel architecture for category-level object pose refinement. Our\napproach integrates an HS-layer and learnable affine transformations, which\naims to enhance the extraction and alignment of geometric information.\nAdditionally, we introduce a cross-cloud transformation mechanism that\nefficiently merges diverse data sources. Finally, we push the limits of our\nmodel by incorporating the shape prior information for translation and size\nerror prediction. We conducted extensive experiments to demonstrate the\neffectiveness of the proposed framework. 
Through extensive quantitative\nexperiments, we demonstrate significant improvement over the baseline method by\na large margin across all metrics.\n","authors":["Linfang Zheng","Tze Ho Elden Tse","Chen Wang","Yinghan Sun","Hua Chen","Ales Leonardis","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11139v1.pdf","comment":"The IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024"},{"id":"http://arxiv.org/abs/2404.11129v1","updated":"2024-04-17T07:20:56Z","published":"2024-04-17T07:20:56Z","title":"Fact :Teaching MLLMs with Faithful, Concise and Transferable Rationales","summary":" The remarkable performance of Multimodal Large Language Models (MLLMs) has\nunequivocally demonstrated their proficient understanding capabilities in\nhandling a wide array of visual tasks. Nevertheless, the opaque nature of their\nblack-box reasoning processes persists as an enigma, rendering them\nuninterpretable and struggling with hallucination. Their ability to execute\nintricate compositional reasoning tasks is also constrained, culminating in a\nstagnation of learning progression for these models. In this work, we introduce\nFact, a novel paradigm designed to generate multimodal rationales that are\nfaithful, concise, and transferable for teaching MLLMs. This paradigm utilizes\nverifiable visual programming to generate executable code guaranteeing\nfaithfulness and precision. Subsequently, through a series of operations\nincluding pruning, merging, and bridging, the rationale enhances its\nconciseness. Furthermore, we filter rationales that can be transferred to\nend-to-end paradigms from programming paradigms to guarantee transferability.\nEmpirical evidence from experiments demonstrates the superiority of our method\nacross models of varying parameter sizes, significantly enhancing their\ncompositional reasoning and generalization ability. Our approach also reduces\nhallucinations owing to its high correlation between images and text.\n","authors":["Minghe Gao","Shuang Chen","Liang Pang","Yuan Yao","Jisheng Dang","Wenqiao Zhang","Juncheng Li","Siliang Tang","Yueting Zhuang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2404.11129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11127v1","updated":"2024-04-17T07:17:47Z","published":"2024-04-17T07:17:47Z","title":"D-Aug: Enhancing Data Augmentation for Dynamic LiDAR Scenes","summary":" Creating large LiDAR datasets with pixel-level labeling poses significant\nchallenges. While numerous data augmentation methods have been developed to\nreduce the reliance on manual labeling, these methods predominantly focus on\nstatic scenes and they overlook the importance of data augmentation for dynamic\nscenes, which is critical for autonomous driving. To address this issue, we\npropose D-Aug, a LiDAR data augmentation method tailored for augmenting dynamic\nscenes. D-Aug extracts objects and inserts them into dynamic scenes,\nconsidering the continuity of these objects across consecutive frames. For\nseamless insertion into dynamic scenes, we propose a reference-guided method\nthat involves dynamic collision detection and rotation alignment. Additionally,\nwe present a pixel-level road identification strategy to efficiently determine\nsuitable insertion positions. We validated our method using the nuScenes\ndataset with various 3D detection and tracking methods. 
Comparative experiments\ndemonstrate the superiority of D-Aug.\n","authors":["Jiaxing Zhao","Peng Zheng","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.11127v1.pdf","comment":"4pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.02562v2","updated":"2024-04-17T07:13:27Z","published":"2024-04-03T08:33:08Z","title":"Representation Alignment Contrastive Regularization for Multi-Object\n Tracking","summary":" Achieving high-performance in multi-object tracking algorithms heavily relies\non modeling spatio-temporal relationships during the data association stage.\nMainstream approaches encompass rule-based and deep learning-based methods for\nspatio-temporal relationship modeling. While the former relies on physical\nmotion laws, offering wider applicability but yielding suboptimal results for\ncomplex object movements, the latter, though achieving high-performance, lacks\ninterpretability and involves complex module designs. This work aims to\nsimplify deep learning-based spatio-temporal relationship models and introduce\ninterpretability into features for data association. Specifically, a\nlightweight single-layer transformer encoder is utilized to model\nspatio-temporal relationships. To make features more interpretative, two\ncontrastive regularization losses based on representation alignment are\nproposed, derived from spatio-temporal consistency rules. By applying weighted\nsummation to affinity matrices, the aligned features can seamlessly integrate\ninto the data association stage of the original tracking workflow. Experimental\nresults showcase that our model enhances the majority of existing tracking\nnetworks' performance without excessive complexity, with minimal increase in\ntraining overhead and nearly negligible computational and storage costs.\n","authors":["Zhonglin Liu","Shujie Chen","Jianfeng Dong","Xun Wang","Di Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.02562v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11120v1","updated":"2024-04-17T07:08:38Z","published":"2024-04-17T07:08:38Z","title":"TiNO-Edit: Timestep and Noise Optimization for Robust Diffusion-Based\n Image Editing","summary":" Despite many attempts to leverage pre-trained text-to-image models (T2I) like\nStable Diffusion (SD) for controllable image editing, producing good\npredictable results remains a challenge. Previous approaches have focused on\neither fine-tuning pre-trained T2I models on specific datasets to generate\ncertain kinds of images (e.g., with a specific object or person), or on\noptimizing the weights, text prompts, and/or learning features for each input\nimage in an attempt to coax the image generator to produce the desired result.\nHowever, these approaches all have shortcomings and fail to produce good\nresults in a predictable and controllable manner. To address this problem, we\npresent TiNO-Edit, an SD-based method that focuses on optimizing the noise\npatterns and diffusion timesteps during editing, something previously\nunexplored in the literature. With this simple change, we are able to generate\nresults that both better align with the original images and reflect the desired\nresult. Furthermore, we propose a set of new loss functions that operate in the\nlatent domain of SD, greatly speeding up the optimization when compared to\nprior approaches, which operate in the pixel domain. Our method can be easily\napplied to variations of SD including Textual Inversion and DreamBooth that\nencode new concepts and incorporate them into the edited results. 
We present a\nhost of image-editing capabilities enabled by our approach. Our code is\npublicly available at https://github.com/SherryXTChen/TiNO-Edit.\n","authors":["Sherry X. Chen","Yaron Vaxman","Elad Ben Baruch","David Asulin","Aviad Moreshet","Kuo-Chin Lien","Misha Sra","Pradeep Sen"],"pdf_url":"https://arxiv.org/pdf/2404.11120v1.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.11118v1","updated":"2024-04-17T07:06:22Z","published":"2024-04-17T07:06:22Z","title":"MHLR: Moving Haar Learning Rate Scheduler for Large-scale Face\n Recognition Training with One GPU","summary":" Face recognition (FR) has seen significant advancements due to the\nutilization of large-scale datasets. Training deep FR models on large-scale\ndatasets with multiple GPUs is now a common practice. In fact, computing power\nhas evolved into a foundational and indispensable resource in the area of deep\nlearning. It is nearly impossible to train a deep FR model without holding\nadequate hardware resources. Recognizing this challenge, some FR approaches\nhave started exploring ways to reduce the time complexity of the\nfully-connected layer in FR models. Unlike other approaches, this paper\nintroduces a simple yet highly effective approach, Moving Haar Learning Rate\n(MHLR) scheduler, for scheduling the learning rate promptly and accurately in\nthe training process. MHLR supports large-scale FR training with only one GPU,\nwhich is able to accelerate the model to 1/4 of its original training time\nwithout sacrificing more than 1% accuracy. More specifically, MHLR only needs\n$30$ hours to train the model ResNet100 on the dataset WebFace12M containing\nmore than 12M face images with 0.6M identities. Extensive experiments validate\nthe efficiency and effectiveness of MHLR.\n","authors":["Xueyuan Gong","Yain-whar Si","Zheng Zhang","Xiaochen Yuan","Ke Wang","Xinyuan Zhang","Cong Lin","Xiaoxiang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11111v1","updated":"2024-04-17T06:57:57Z","published":"2024-04-17T06:57:57Z","title":"CorrNet+: Sign Language Recognition and Translation via Spatial-Temporal\n Correlation","summary":" In sign language, the conveyance of human body trajectories predominantly\nrelies upon the coordinated movements of hands and facial expressions across\nsuccessive frames. Despite the recent advancements of sign language\nunderstanding methods, they often solely focus on individual frames, inevitably\noverlooking the inter-frame correlations that are essential for effectively\nmodeling human body trajectories. To address this limitation, this paper\nintroduces a spatial-temporal correlation network, denoted as CorrNet+, which\nexplicitly identifies body trajectories across multiple frames. In specific,\nCorrNet+ employs a correlation module and an identification module to build\nhuman body trajectories. Afterwards, a temporal attention module is followed to\nadaptively evaluate the contributions of different frames. The resultant\nfeatures offer a holistic perspective on human body movements, facilitating a\ndeeper understanding of sign language. As a unified model, CorrNet+ achieves\nnew state-of-the-art performance on two extensive sign language understanding\ntasks, including continuous sign language recognition (CSLR) and sign language\ntranslation (SLT). 
Especially, CorrNet+ surpasses previous methods equipped\nwith resource-intensive pose-estimation networks or pre-extracted heatmaps for\nhand and facial feature extraction. Compared with CorrNet, CorrNet+ achieves a\nsignificant performance boost across all benchmarks while halving the\ncomputational overhead. A comprehensive comparison with previous\nspatial-temporal reasoning methods verifies the superiority of CorrNet+. Code\nis available at https://github.com/hulianyuyy/CorrNet_Plus.\n","authors":["Lianyu Hu","Wei Feng","Liqing Gao","Zekang Liu","Liang Wan"],"pdf_url":"https://arxiv.org/pdf/2404.11111v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.03202"},{"id":"http://arxiv.org/abs/2404.11108v1","updated":"2024-04-17T06:47:17Z","published":"2024-04-17T06:47:17Z","title":"LADDER: An Efficient Framework for Video Frame Interpolation","summary":" Video Frame Interpolation (VFI) is a crucial technique in various\napplications such as slow-motion generation, frame rate conversion, video frame\nrestoration etc. This paper introduces an efficient video frame interpolation\nframework that aims to strike a favorable balance between efficiency and\nquality. Our framework follows a general paradigm consisting of a flow\nestimator and a refinement module, while incorporating carefully designed\ncomponents. First of all, we adopt depth-wise convolution with large kernels in\nthe flow estimator that simultaneously reduces the parameters and enhances the\nreceptive field for encoding rich context and handling complex motion.\nSecondly, diverging from a common design for the refinement module with a\nUNet-structure (encoder-decoder structure), which we find redundant, our\ndecoder-only refinement module directly enhances the result from coarse to fine\nfeatures, offering a more efficient process. In addition, to address the\nchallenge of handling high-definition frames, we also introduce an innovative\nHD-aware augmentation strategy during training, leading to consistent\nenhancement on HD images. Extensive experiments are conducted on diverse\ndatasets, Vimeo90K, UCF101, Xiph and SNU-FILM. The results demonstrate that our\napproach achieves state-of-the-art performance with clear improvement while\nrequiring much less FLOPs and parameters, reaching to a better spot for\nbalancing efficiency and quality.\n","authors":["Tong Shen","Dong Li","Ziheng Gao","Lu Tian","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2404.11108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11104v1","updated":"2024-04-17T06:40:47Z","published":"2024-04-17T06:40:47Z","title":"Object Remover Performance Evaluation Methods using Class-wise Object\n Removal Images","summary":" Object removal refers to the process of erasing designated objects from an\nimage while preserving the overall appearance, and it is one area where image\ninpainting is widely used in real-world applications. The performance of an\nobject remover is quantitatively evaluated by measuring the quality of object\nremoval results, similar to how the performance of an image inpainter is\ngauged. Current works reporting quantitative performance evaluations utilize\noriginal images as references. In this letter, to validate the current\nevaluation methods cannot properly evaluate the performance of an object\nremover, we create a dataset with object removal ground truth and compare the\nevaluations made by the current methods using original images to those\nutilizing object removal ground truth images. 
The disparities between two\nevaluation sets validate that the current methods are not suitable for\nmeasuring the performance of an object remover. Additionally, we propose new\nevaluation methods tailored to gauge the performance of an object remover. The\nproposed methods evaluate the performance through class-wise object removal\nresults and utilize images without the target class objects as a comparison\nset. We confirm that the proposed methods can make judgments consistent with\nhuman evaluators in the COCO dataset, and that they can produce measurements\naligning with those using object removal ground truth in the self-acquired\ndataset.\n","authors":["Changsuk Oh","Dongseok Shim","Taekbeom Lee","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.11104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11100v1","updated":"2024-04-17T06:36:17Z","published":"2024-04-17T06:36:17Z","title":"Synthesizing Realistic Data for Table Recognition","summary":" To overcome the limitations and challenges of current automatic table data\nannotation methods and random table data synthesis approaches, we propose a\nnovel method for synthesizing annotation data specifically designed for table\nrecognition. This method utilizes the structure and content of existing complex\ntables, facilitating the efficient creation of tables that closely replicate\nthe authentic styles found in the target domain. By leveraging the actual\nstructure and content of tables from Chinese financial announcements, we have\ndeveloped the first extensive table annotation dataset in this domain. We used\nthis dataset to train several recent deep learning-based end-to-end table\nrecognition models. Additionally, we have established the inaugural benchmark\nfor real-world complex tables in the Chinese financial announcement domain,\nusing it to assess the performance of models trained on our synthetic data,\nthereby effectively validating our method's practicality and effectiveness.\nFurthermore, we applied our synthesis method to augment the FinTabNet dataset,\nextracted from English financial announcements, by increasing the proportion of\ntables with multiple spanning cells to introduce greater complexity. Our\nexperiments show that models trained on this augmented dataset achieve\ncomprehensive improvements in performance, especially in the recognition of\ntables with multiple spanning cells.\n","authors":["Qiyu Hou","Jun Wang","Meixuan Qiao","Lujun Tian"],"pdf_url":"https://arxiv.org/pdf/2404.11100v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.11098v1","updated":"2024-04-17T06:32:42Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. 
To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10312v2","updated":"2024-04-17T06:30:00Z","published":"2024-04-16T06:39:37Z","title":"OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable\n Diffusion Model","summary":" Omnidirectional images (ODIs) are commonly used in real-world visual tasks,\nand high-resolution ODIs help improve the performance of related visual tasks.\nMost existing super-resolution methods for ODIs use end-to-end learning\nstrategies, resulting in inferior realness of generated images and a lack of\neffective out-of-domain generalization capabilities in training methods. Image\ngeneration methods represented by diffusion model provide strong priors for\nvisual tasks and have been proven to be effectively applied to image\nrestoration tasks. Leveraging the image priors of the Stable Diffusion (SD)\nmodel, we achieve omnidirectional image super-resolution with both fidelity and\nrealness, dubbed as OmniSSR. Firstly, we transform the equirectangular\nprojection (ERP) images into tangent projection (TP) images, whose distribution\napproximates the planar image domain. Then, we use SD to iteratively sample\ninitial high-resolution results. At each denoising iteration, we further\ncorrect and update the initial results using the proposed Octadecaplex Tangent\nInformation Interaction (OTII) and Gradient Decomposition (GD) technique to\nensure better consistency. Finally, the TP images are transformed back to\nobtain the final high-resolution results. Our method is zero-shot, requiring no\ntraining or fine-tuning. Experiments of our method on two benchmark datasets\ndemonstrate the effectiveness of our proposed method.\n","authors":["Runyi Li","Xuhan Sheng","Weiqi Li","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00425v2","updated":"2024-04-17T06:26:04Z","published":"2023-12-01T08:47:56Z","title":"Retina : Low-Power Eye Tracking with Event Camera and Spiking Hardware","summary":" This paper introduces a neuromorphic methodology for eye tracking, harnessing\npure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework\nintegrates a directly trained Spiking Neuron Network (SNN) regression model and\nleverages a state-of-the-art low power edge neuromorphic processor - Speck,\ncollectively aiming to advance the precision and efficiency of eye-tracking\nsystems. First, we introduce a representative event-based eye-tracking dataset,\n\"Ini-30\", which was collected with two glass-mounted DVS cameras from thirty\nvolunteers. 
Then, an SNN model, based on Integrate And Fire (IAF) neurons, named\n\"Retina\", is described, featuring only 64k parameters (6.63x fewer than the\nlatest) and achieving pupil tracking error of only 3.24 pixels in a 64x64 DVS\ninput. The continuous regression output is obtained by means of convolution\nusing a non-spiking temporal 1D filter slid across the output spiking layer.\nFinally, we evaluate Retina on the neuromorphic processor, showing an\nend-to-end power between 2.89-4.8 mW and a latency of 5.57-8.01 ms dependent on\nthe time window. We also benchmark our model against the latest event-based\neye-tracking method, \"3ET\", which was built upon event frames. Results show\nthat Retina achieves superior precision with 1.24px less pupil centroid error\nand reduced computational complexity with 35 times fewer MAC operations. We\nhope this work will open avenues for further investigation of closed-loop\nneuromorphic solutions and true event-based training pursuing edge performance.\n","authors":["Pietro Bonazzi","Sizhen Bian","Giovanni Lippolis","Yawei Li","Sadique Sheik","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2312.00425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09831v2","updated":"2024-04-17T05:55:33Z","published":"2024-04-15T14:29:47Z","title":"Digging into contrastive learning for robust depth estimation with\n diffusion models","summary":" Recently, diffusion-based depth estimation methods have drawn widespread\nattention due to their elegant denoising patterns and promising performance.\nHowever, they are typically unreliable under adverse conditions prevalent in\nreal-world scenarios, such as rainy, snowy, etc. In this paper, we propose a\nnovel robust depth estimation method called D4RD, featuring a custom\ncontrastive learning mode tailored for diffusion models to mitigate performance\ndegradation in complex environments. Concretely, we integrate the strength of\nknowledge distillation into contrastive learning, building the `trinity'\ncontrastive scheme. This scheme utilizes the sampled noise of the forward\ndiffusion process as a natural reference, guiding the predicted noise in\ndiverse scenes toward a more stable and precise optimum. Moreover, we extend\nnoise-level trinity to encompass more generic feature and image levels,\nestablishing a multi-level contrast to distribute the burden of robust\nperception across the overall network. Before addressing complex scenarios, we\nenhance the stability of the baseline diffusion model with three\nstraightforward yet effective improvements, which facilitate convergence and\nremove depth outliers. Extensive experiments demonstrate that D4RD surpasses\nexisting state-of-the-art solutions on synthetic corruption datasets and\nreal-world weather conditions. The code for D4RD will be made available for\nfurther exploration and adoption.\n","authors":["Jiyuan Wang","Chunyu Lin","Lang Nie","Kang Liao","Shuwei Shao","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.09831v2.pdf","comment":"8 pages,6 figures"},{"id":"http://arxiv.org/abs/2402.19474v3","updated":"2024-04-17T05:55:04Z","published":"2024-02-29T18:59:17Z","title":"The All-Seeing Project V2: Towards General Relation Comprehension of the\n Open World","summary":" We present the All-Seeing Project V2: a new model and dataset designed for\nunderstanding object relations in images. 
Specifically, we propose the\nAll-Seeing Model V2 (ASMv2) that integrates the formulation of text generation,\nobject localization, and relation comprehension into a relation conversation\n(ReC) task. Leveraging this unified task, our model excels not only in\nperceiving and recognizing all objects within the image but also in grasping\nthe intricate relation graph between them, diminishing the relation\nhallucination often encountered by Multi-modal Large Language Models (MLLMs).\nTo facilitate training and evaluation of MLLMs in relation understanding, we\ncreated the first high-quality ReC dataset (AS-V2) which is aligned with the\nformat of standard instruction tuning data. In addition, we design a new\nbenchmark, termed Circular-based Relation Probing Evaluation (CRPE) for\ncomprehensively evaluating the relation comprehension capabilities of MLLMs.\nNotably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware\nbenchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that\nour work can inspire more future research and contribute to the evolution\ntowards artificial general intelligence. Our project is released at\nhttps://github.com/OpenGVLab/all-seeing.\n","authors":["Weiyun Wang","Yiming Ren","Haowen Luo","Tiantong Li","Chenxiang Yan","Zhe Chen","Wenhai Wang","Qingyun Li","Lewei Lu","Xizhou Zhu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2402.19474v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.08968v2","updated":"2024-04-17T05:42:52Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although being effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that the explanations lacking insights into the decision processes at\nlow and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that our proposed MCPNet, while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. 
Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11070v1","updated":"2024-04-17T04:59:36Z","published":"2024-04-17T04:59:36Z","title":"Sky-GVIO: an enhanced GNSS/INS/Vision navigation with FCN-based\n sky-segmentation in urban canyon","summary":" Accurate, continuous, and reliable positioning is a critical component of\nachieving autonomous driving. However, in complex urban canyon environments,\nthe vulnerability of a stand-alone sensor and non-line-of-sight (NLOS) caused\nby high buildings, trees, and elevated structures seriously affect positioning\nresults. To address these challenges, a sky-view images segmentation algorithm\nbased on Fully Convolutional Network (FCN) is proposed for GNSS NLOS detection.\nBuilding upon this, a novel NLOS detection and mitigation algorithm (named\nS-NDM) is extended to the tightly coupled Global Navigation Satellite Systems\n(GNSS), Inertial Measurement Units (IMU), and visual feature system which is\ncalled Sky-GVIO, with the aim of achieving continuous and accurate positioning\nin urban canyon environments. Furthermore, the system harmonizes Single Point\nPositioning (SPP) with Real-Time Kinematic (RTK) methodologies to bolster its\noperational versatility and resilience. In urban canyon environments, the\npositioning performance of S-NDM algorithm proposed in this paper is evaluated\nunder different tightly coupled SPP-related and RTK-related models. The results\nexhibit that Sky-GVIO system achieves meter-level accuracy under SPP mode and\nsub-decimeter precision with RTK, surpassing the performance of GNSS/INS/Vision\nframeworks devoid of S-NDM. Additionally, the sky-view image dataset, inclusive\nof training and evaluation subsets, has been made publicly accessible for\nscholarly exploration at https://github.com/whuwangjr/sky-view-images .\n","authors":["Jingrong Wang","Bo Xu","Ronghe Jin","Shoujian Zhang","Kefu Gao","Jingnan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11064v1","updated":"2024-04-17T04:46:27Z","published":"2024-04-17T04:46:27Z","title":"Rethinking 3D Dense Caption and Visual Grounding in A Unified Framework\n through Prompt-based Localization","summary":" 3D Visual Grounding (3DVG) and 3D Dense Captioning (3DDC) are two crucial\ntasks in various 3D applications, which require both shared and complementary\ninformation in localization and visual-language relationships. Therefore,\nexisting approaches adopt the two-stage \"detect-then-describe/discriminate\"\npipeline, which relies heavily on the performance of the detector, resulting in\nsuboptimal performance. Inspired by DETR, we propose a unified framework,\n3DGCTR, to jointly solve these two distinct but closely related tasks in an\nend-to-end fashion. The key idea is to reconsider the prompt-based localization\nability of the 3DVG model. In this way, the 3DVG model with a well-designed\nprompt as input can assist the 3DDC task by extracting localization information\nfrom the prompt. 
In terms of implementation, we integrate a Lightweight Caption\nHead into the existing 3DVG network with a Caption Text Prompt as a connection,\neffectively harnessing the existing 3DVG model's inherent localization\ncapacity, thereby boosting 3DDC capability. This integration facilitates\nsimultaneous multi-task training on both tasks, mutually enhancing their\nperformance. Extensive experimental results demonstrate the effectiveness of\nthis approach. Specifically, on the ScanRefer dataset, 3DGCTR surpasses the\nstate-of-the-art 3DDC method by 4.3% in CIDEr@0.5IoU in MLE training and\nimproves upon the SOTA 3DVG method by 3.16% in Acc@0.25IoU.\n","authors":["Yongdong Luo","Haojia Lin","Xiawu Zheng","Yigeng Jiang","Fei Chao","Jie Hu","Guannan Jiang","Songan Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.11064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03492v2","updated":"2024-04-17T04:07:47Z","published":"2023-06-06T08:19:30Z","title":"Efficient Anomaly Detection with Budget Annotation Using Semi-Supervised\n Residual Transformer","summary":" Anomaly Detection is challenging as usually only the normal samples are seen\nduring training and the detector needs to discover anomalies on-the-fly. The\nrecently proposed deep-learning-based approaches could somehow alleviate the\nproblem but there is still a long way to go in obtaining an industrial-class\nanomaly detector for real-world applications. On the other hand, in some\nparticular AD tasks, a few anomalous samples are labeled manually for achieving\nhigher accuracy. However, this performance gain is at the cost of considerable\nannotation efforts, which can be intractable in many practical scenarios.\n In this work, the above two problems are addressed in a unified framework.\nFirstly, inspired by the success of the patch-matching-based AD algorithms, we\ntrain a sliding vision transformer over the residuals generated by a novel\nposition-constrained patch-matching. Secondly, the conventional pixel-wise\nsegmentation problem is cast into a block-wise classification problem. Thus the\nsliding transformer can attain even higher accuracy with much less annotation\nlabor. Thirdly, to further reduce the labeling cost, we propose to label the\nanomalous regions using only bounding boxes. The unlabeled regions caused by\nthe weak labels are effectively exploited using a highly-customized\nsemi-supervised learning scheme equipped with two novel data augmentation\nmethods. The proposed method outperforms all the state-of-the-art approaches\nusing all the evaluation metrics in both the unsupervised and supervised\nscenarios. On the popular MVTec-AD dataset, our SemiREST algorithm obtains the\nAverage Precision (AP) of 81.2% in the unsupervised condition and 84.4% AP for\nsupervised anomaly detection. Surprisingly, with the bounding-box-based\nsemi-supervisions, SemiREST still outperforms the SOTA methods with full\nsupervision (83.8% AP) on MVTec-AD.\n","authors":["Hanxi Li","Jingqi Wu","Hao Chen","Mingwen Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2306.03492v2.pdf","comment":"20 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.11054v1","updated":"2024-04-17T03:56:28Z","published":"2024-04-17T03:56:28Z","title":"Multilateral Temporal-view Pyramid Transformer for Video Inpainting\n Detection","summary":" The task of video inpainting detection is to expose the pixel-level inpainted\nregions within a video sequence. Existing methods usually focus on leveraging\nspatial and temporal inconsistencies. 
However, these methods typically employ\nfixed operations to combine spatial and temporal clues, limiting their\napplicability in different scenarios. In this paper, we introduce a novel\nMultilateral Temporal-view Pyramid Transformer ({\\em MumPy}) that collaborates\nspatial-temporal clues flexibly. Our method utilizes a newly designed\nmultilateral temporal-view encoder to extract various collaborations of\nspatial-temporal clues and introduces a deformable window-based temporal-view\ninteraction module to enhance the diversity of these collaborations.\nSubsequently, we develop a multi-pyramid decoder to aggregate the various types\nof features and generate detection maps. By adjusting the contribution strength\nof spatial and temporal clues, our method can effectively identify inpainted\nregions. We validate our method on existing datasets and also introduce a new\nchallenging and large-scale Video Inpainting dataset based on the YouTube-VOS\ndataset, which employs several more recent inpainting methods. The results\ndemonstrate the superiority of our method in both in-domain and cross-domain\nevaluation scenarios.\n","authors":["Ying Zhang","Bo Peng","Jiaran Zhou","Huiyu Zhou","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.11054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11052v1","updated":"2024-04-17T03:51:55Z","published":"2024-04-17T03:51:55Z","title":"Supervised Contrastive Vision Transformer for Breast Histopathological\n Image Classification","summary":" Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer.\nBreast tissue histopathological examination is critical in diagnosing and\nclassifying breast cancer. Although existing methods have shown promising\nresults, there is still room for improvement in the classification accuracy and\ngeneralization of IDC using histopathology images. We present a novel approach,\nSupervised Contrastive Vision Transformer (SupCon-ViT), for improving the\nclassification of invasive ductal carcinoma in terms of accuracy and\ngeneralization by leveraging the inherent strengths and advantages of both\ntransfer learning, i.e., pre-trained vision transformer, and supervised\ncontrastive learning. Our results on a benchmark breast cancer dataset\ndemonstrate that SupCon-Vit achieves state-of-the-art performance in IDC\nclassification, with an F1-score of 0.8188, precision of 0.7692, and\nspecificity of 0.8971, outperforming existing methods. In addition, the\nproposed model demonstrates resilience in scenarios with minimal labeled data,\nmaking it highly efficient in real-world clinical settings where labelled data\nis limited. Our findings suggest that supervised contrastive learning in\nconjunction with pre-trained vision transformers appears to be a viable\nstrategy for an accurate classification of IDC, thus paving the way for a more\nefficient and reliable diagnosis of breast cancer through histopathological\nimage analysis.\n","authors":["Mohammad Shiri","Jiangwen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.11052v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11051v1","updated":"2024-04-17T03:51:24Z","published":"2024-04-17T03:51:24Z","title":"WPS-Dataset: A benchmark for wood plate segmentation in bark removal\n processing","summary":" Using deep learning methods is a promising approach to improving bark removal\nefficiency and enhancing the quality of wood products. 
However, the lack of\npublicly available datasets for wood plate segmentation in bark removal\nprocessing poses challenges for researchers in this field. To address this\nissue, a benchmark for wood plate segmentation in bark removal processing named\nWPS-dataset is proposed in this study, which consists of 4863 images. We\ndesigned an image acquisition device and assembled it on a bark removal\nequipment to capture images in real industrial settings. We evaluated the\nWPS-dataset using six typical segmentation models. The models effectively learn\nand understand the WPS-dataset characteristics during training, resulting in\nhigh performance and accuracy in wood plate segmentation tasks. We believe that\nour dataset can lay a solid foundation for future research in bark removal\nprocessing and contribute to advancements in this field.\n","authors":["Rijun Wang","Guanghao Zhang","Fulong Liang","Bo Wang","Xiangwei Mou","Yesheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11046v1","updated":"2024-04-17T03:42:48Z","published":"2024-04-17T03:42:48Z","title":"Lightweight Unsupervised Federated Learning with Pretrained Vision\n Language Model","summary":" Federated learning aims to tackle the ``isolated data island\" problem, where\nit trains a collective model from physically isolated clients while\nsafeguarding the privacy of users' data. However, supervised federated learning\nnecessitates that each client labels their data for training, which can be both\ntime-consuming and resource-intensive, and may even be impractical for edge\ndevices. Moreover, the training and transmission of deep models present\nchallenges to the computation and communication capabilities of the clients. To\naddress these two inherent challenges in supervised federated learning, we\npropose a novel lightweight unsupervised federated learning approach that\nleverages unlabeled data on each client to perform lightweight model training\nand communication by harnessing pretrained vision-language models, such as\nCLIP. By capitalizing on the zero-shot prediction capability and the\nwell-trained image encoder of the pre-trained CLIP model, we have carefully\ncrafted an efficient and resilient self-training approach. This method refines\nthe initial zero-shot predicted pseudo-labels of unlabeled instances through\nthe sole training of a linear classifier on top of the fixed image encoder.\nAdditionally, to address data heterogeneity within each client, we propose a\nclass-balanced text feature sampling strategy for generating synthetic\ninstances in the feature space to support local training. Experiments are\nconducted on multiple benchmark datasets. The experimental results demonstrate\nthat our proposed method greatly enhances model performance in comparison to\nCLIP's zero-shot predictions and even outperforms supervised federated learning\nbenchmark methods given limited computational and communication overhead.\n","authors":["Hao Yan","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07922v4","updated":"2024-04-17T03:23:33Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. 
While there is a plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. In this paper, we\npioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating\nMLLMs' understanding of Vietnamese visual language tasks. Our project is\npublic at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v4.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2306.08251v3","updated":"2024-04-17T03:14:21Z","published":"2023-06-14T05:34:02Z","title":"GBSD: Generative Bokeh with Stage Diffusion","summary":" The bokeh effect is an artistic technique that blurs out-of-focus areas in a\nphotograph and has gained interest due to recent developments in text-to-image\nsynthesis and the ubiquity of smart-phone cameras and photo-sharing apps. Prior\nwork on rendering bokeh effects has focused on post hoc image manipulation to\nproduce similar blurring effects in existing photographs using classical\ncomputer graphics or neural rendering techniques, but has either depth\ndiscontinuity artifacts or is restricted to reproducing bokeh effects that are\npresent in the training data. More recent diffusion-based models can synthesize\nimages with an artistic style, but either require the generation of\nhigh-dimensional masks, expensive fine-tuning, or affect global image\ncharacteristics. In this paper, we present GBSD, the first generative\ntext-to-image model that synthesizes photorealistic images with a bokeh style.\nMotivated by how image synthesis occurs progressively in diffusion models, our\napproach combines latent diffusion models with a 2-stage conditioning algorithm\nto render bokeh effects on semantically defined objects. Since we can focus the\neffect on objects, this semantic bokeh effect is more versatile than classical\nrendering techniques. We evaluate GBSD both quantitatively and qualitatively\nand demonstrate its ability to be applied in both text-to-image and\nimage-to-image settings.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2306.08251v3.pdf","comment":"Short Version is accepted by International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.03907v2","updated":"2024-04-17T03:14:00Z","published":"2024-01-08T14:10:24Z","title":"RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM","summary":" Multi-modal 3D object detectors are dedicated to exploring secure and\nreliable perception systems for autonomous driving (AD). However, while\nachieving state-of-the-art (SOTA) performance on clean benchmark datasets, they\ntend to overlook the complexity and harsh conditions of real-world\nenvironments. Meanwhile, with the emergence of visual foundation models (VFMs),\nopportunities and challenges are presented for improving the robustness and\ngeneralization of multi-modal 3D object detection in autonomous driving.\nTherefore, we propose RoboFusion, a robust framework that leverages VFMs like\nSAM to tackle out-of-distribution (OOD) noise scenarios. We first adapt the\noriginal SAM for autonomous driving scenarios, named SAM-AD. To align SAM or\nSAM-AD with multi-modal methods, we then introduce AD-FPN for upsampling the\nimage features extracted by SAM. 
We employ wavelet decomposition to denoise the\ndepth-guided images for further noise reduction and weather interference.\nLastly, we employ self-attention mechanisms to adaptively reweight the fused\nfeatures, enhancing informative features while suppressing excess noise. In\nsummary, our RoboFusion gradually reduces noise by leveraging the\ngeneralization and robustness of VFMs, thereby enhancing the resilience of\nmulti-modal 3D object detection. Consequently, our RoboFusion achieves\nstate-of-the-art performance in noisy scenarios, as demonstrated by the KITTI-C\nand nuScenes-C benchmarks.\n","authors":["Ziying Song","Guoxing Zhang","Lin Liu","Lei Yang","Shaoqing Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11031v1","updated":"2024-04-17T03:13:58Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of robots in their applications heavily depends on the\nquality of sensory input. However, designing sensor payloads and their\nparameters for specific robotic tasks is an expensive process that requires\nwell-established sensor knowledge and extensive experiments with physical\nhardware. With cameras playing a pivotal role in robotic perception, we\nintroduce a novel end-to-end optimization approach for co-designing a camera\nwith specific robotic tasks by combining derivative-free and gradient-based\noptimizers. The proposed method leverages recent computer graphics techniques\nand physical camera characteristics to prototype the camera in software,\nsimulate operational environments and tasks for robots, and optimize the camera\ndesign based on the desired tasks in a cost-effective way. We validate the\naccuracy of our camera simulation by comparing it with physical cameras, and\ndemonstrate the design of cameras with stronger performance than common\noff-the-shelf alternatives. Our approach supports the optimization of both\ncontinuous and discrete camera parameters, manufacturing constraints, and can\nbe generalized to a broad range of camera design scenarios including multiple\ncameras and unconventional cameras. This work advances the fully automated\ndesign of cameras for specific robotics tasks.\n","authors":["Chengyang Yan","Donald Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09276v3","updated":"2024-04-17T03:02:38Z","published":"2023-10-13T17:38:45Z","title":"Transformer-based Multimodal Change Detection with Multitask Consistency\n Constraints","summary":" Change detection plays a fundamental role in Earth observation for analyzing\ntemporal iterations over time. However, recent studies have largely neglected\nthe utilization of multimodal data that presents significant practical and\ntechnical advantages compared to single-modal approaches. This research focuses\non leveraging {pre-event} digital surface model (DSM) data and {post-event}\ndigital aerial images captured at different times for detecting change beyond\n2D. We observe that the current change detection methods struggle with the\nmultitask conflicts between semantic and height change detection tasks. To\naddress this challenge, we propose an efficient Transformer-based network that\nlearns shared representation between cross-dimensional inputs through\ncross-attention. {It adopts a consistency constraint to establish the\nmultimodal relationship. 
Initially, pseudo-changes are derived by employing\nheight change thresholding. Subsequently, the $L2$ distance between semantic\nand pseudo-changes within their overlapping regions is minimized. This\nexplicitly endows the height change detection (regression task) and semantic\nchange detection (classification task) with representation consistency.} A\nDSM-to-image multimodal dataset encompassing three cities in the Netherlands\nwas constructed. It lays a new foundation for beyond-2D change detection from\ncross-dimensional inputs. Compared to five state-of-the-art change detection\nmethods, our model demonstrates consistent multitask superiority in terms of\nsemantic and height change detection. Furthermore, the consistency strategy can\nbe seamlessly adapted to the other methods, yielding promising improvements.\n","authors":["Biyuan Liu","Huaixin Chen","Kun Li","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2310.09276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11025v1","updated":"2024-04-17T03:01:47Z","published":"2024-04-17T03:01:47Z","title":"Spatial-Aware Image Retrieval: A Hyperdimensional Computing Approach for\n Efficient Similarity Hashing","summary":" In the face of burgeoning image data, efficiently retrieving similar images\nposes a formidable challenge. Past research has focused on refining hash\nfunctions to distill images into compact indicators of resemblance. Initial\nattempts used shallow models, evolving to attention mechanism-based\narchitectures from Convolutional Neural Networks (CNNs) to advanced models.\nRecognizing limitations in gradient-based models for spatial information\nembedding, we propose an innovative image hashing method, NeuroHash leveraging\nHyperdimensional Computing (HDC). HDC symbolically encodes spatial information\ninto high-dimensional vectors, reshaping image representation. Our approach\ncombines pre-trained large vision models with HDC operations, enabling\nspatially encoded feature representations. Hashing with locality-sensitive\nhashing (LSH) ensures swift and efficient image retrieval. Notably, our\nframework allows dynamic hash manipulation for conditional image retrieval. Our\nwork introduces a transformative image hashing framework enabling spatial-aware\nconditional retrieval. By seamlessly combining DNN-based neural and HDC-based\nsymbolic models, our methodology breaks from traditional training, offering\nflexible and conditional image retrieval. Performance evaluations signify a\nparadigm shift in image-hashing methodologies, demonstrating enhanced retrieval\naccuracy.\n","authors":["Sanggeon Yun","Ryozo Masukawa","SungHeon Jeong","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2404.11025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16627v2","updated":"2024-04-17T02:57:58Z","published":"2024-03-25T11:16:23Z","title":"SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions","summary":" Recent advancements in diffusion models have positioned them at the forefront\nof image generation. Despite their superior performance, diffusion models are\nnot without drawbacks; they are characterized by complex architectures and\nsubstantial computational demands, resulting in significant latency due to\ntheir iterative sampling process. To mitigate these limitations, we introduce a\ndual approach involving model miniaturization and a reduction in sampling\nsteps, aimed at significantly decreasing model latency. 
Our methodology\nleverages knowledge distillation to streamline the U-Net and image decoder\narchitectures, and introduces an innovative one-step DM training technique that\nutilizes feature matching and score distillation. We present two models,\nSDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS\n(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU,\nrespectively. Moreover, our training approach offers promising applications in\nimage-conditioned control, facilitating efficient image-to-image translation.\n","authors":["Yuda Song","Zehao Sun","Xuanwu Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10357v2","updated":"2024-04-17T02:48:49Z","published":"2024-04-16T07:44:52Z","title":"Optimization of Prompt Learning via Multi-Knowledge Representation for\n Vision-Language Models","summary":" Vision-Language Models (VLMs), such as CLIP, play a foundational role in\nvarious cross-modal applications. To fully leverage VLMs' potential in adapting\nto downstream tasks, context optimization methods like Prompt Tuning are\nessential. However, one key limitation is the lack of diversity in prompt\ntemplates, whether they are hand-crafted or learned through additional modules.\nThis limitation restricts the capabilities of pretrained VLMs and can result in\nincorrect predictions in downstream tasks. To address this challenge, we\npropose Context Optimization with Multi-Knowledge Representation (CoKnow), a\nframework that enhances Prompt Learning for VLMs with rich contextual\nknowledge. To facilitate CoKnow during inference, we trained lightweight\nsemantic knowledge mappers, which are capable of generating Multi-Knowledge\nRepresentation for an input image without requiring additional priors.\nExperimentally, we conducted extensive experiments on 11 publicly available\ndatasets, demonstrating that CoKnow outperforms a series of previous methods.\nWe will make all resources open-source: https://github.com/EMZucas/CoKnow.\n","authors":["Enming Zhang","Bingke Zhu","Yingying Chen","Qinghai Miao","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.10357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11016v1","updated":"2024-04-17T02:47:39Z","published":"2024-04-17T02:47:39Z","title":"MaeFuse: Transferring Omni Features with Pretrained Masked Autoencoders\n for Infrared and Visible Image Fusion via Guided Training","summary":" In this research, we introduce MaeFuse, a novel autoencoder model designed\nfor infrared and visible image fusion (IVIF). The existing approaches for image\nfusion often rely on training combined with downstream tasks to obtain\nhigh-level visual information, which is effective in emphasizing target objects\nand delivering impressive results in visual quality and task-specific\napplications. MaeFuse, however, deviates from the norm. Instead of being driven\nby downstream tasks, our model utilizes a pretrained encoder from Masked\nAutoencoders (MAE), which facilitates omni feature extraction for low-level\nreconstruction and high-level vision tasks, to obtain perception-friendly\nfeatures at a low cost. In order to eliminate the domain gap of different\nmodal features and the block effect caused by the MAE encoder, we further\ndevelop a guided training strategy. This strategy is meticulously crafted to\nensure that the fusion layer seamlessly adjusts to the feature space of the\nencoder, gradually enhancing the fusion effect. 
It facilitates the\ncomprehensive integration of feature vectors from both infrared and visible\nmodalities, preserving the rich details inherent in each. MaeFuse not only\nintroduces a novel perspective in the realm of fusion techniques but also\nstands out with impressive performance across various public datasets.\n","authors":["Jiayang Li","Junjun Jiang","Pengwei Liang","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2404.11016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.10971v2","updated":"2024-04-17T02:39:19Z","published":"2022-07-22T09:37:48Z","title":"Kinematics Modeling Network for Video-based Human Pose Estimation","summary":" Estimating human poses from videos is critical in human-computer interaction.\nJoints cooperate rather than move independently during human movement. There\nare both spatial and temporal correlations between joints. Despite the positive\nresults of previous approaches, most focus on modeling the spatial correlation\nbetween joints while only straightforwardly integrating features along the\ntemporal dimension, ignoring the temporal correlation between joints. In this\nwork, we propose a plug-and-play kinematics modeling module (KMM) to explicitly\nmodel temporal correlations between joints across different frames by\ncalculating their temporal similarity. In this way, KMM can capture motion cues\nof the current joint relative to all joints in different time. Besides, we\nformulate video-based human pose estimation as a Markov Decision Process and\ndesign a novel kinematics modeling network (KIMNet) to simulate the Markov\nChain, allowing KIMNet to locate joints recursively. Our approach achieves\nstate-of-the-art results on two challenging benchmarks. In particular, KIMNet\nshows robustness to the occlusion. The code will be released at\nhttps://github.com/YHDang/KIMNet.\n","authors":["Yonghao Dang","Jianqin Yin","Shaojie Zhang","Jiping Liu","Yanzhu Hu"],"pdf_url":"https://arxiv.org/pdf/2207.10971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11008v1","updated":"2024-04-17T02:36:02Z","published":"2024-04-17T02:36:02Z","title":"AKGNet: Attribute Knowledge-Guided Unsupervised Lung-Infected Area\n Segmentation","summary":" Lung-infected area segmentation is crucial for assessing the severity of lung\ndiseases. However, existing image-text multi-modal methods typically rely on\nlabour-intensive annotations for model training, posing challenges regarding\ntime and expertise. To address this issue, we propose a novel attribute\nknowledge-guided framework for unsupervised lung-infected area segmentation\n(AKGNet), which achieves segmentation solely based on image-text data without\nany mask annotation. AKGNet facilitates text attribute knowledge learning,\nattribute-image cross-attention fusion, and high-confidence-based pseudo-label\nexploration simultaneously. It can learn statistical information and capture\nspatial correlations between image and text attributes in the embedding space,\niteratively refining the mask to enhance segmentation. Specifically, we\nintroduce a text attribute knowledge learning module by extracting attribute\nknowledge and incorporating it into feature representations, enabling the model\nto learn statistical information and adapt to different attributes. 
Moreover,\nwe devise an attribute-image cross-attention module by calculating the\ncorrelation between attributes and images in the embedding space to capture\nspatial dependency information, thus selectively focusing on relevant regions\nwhile filtering irrelevant areas. Finally, a self-training mask improvement\nprocess is employed by generating pseudo-labels using high-confidence\npredictions to iteratively enhance the mask and segmentation. Experimental\nresults on a benchmark medical image dataset demonstrate the superior\nperformance of our method compared to state-of-the-art segmentation techniques\nin unsupervised scenarios.\n","authors":["Qing En","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11003v1","updated":"2024-04-17T02:29:44Z","published":"2024-04-17T02:29:44Z","title":"InfoMatch: Entropy Neural Estimation for Semi-Supervised Image\n Classification","summary":" Semi-supervised image classification, leveraging pseudo supervision and\nconsistency regularization, has demonstrated remarkable success. However, the\nongoing challenge lies in fully exploiting the potential of unlabeled data. To\naddress this, we employ information entropy neural estimation to harness the\npotential of unlabeled samples. Inspired by contrastive learning, the entropy\nis estimated by maximizing a lower bound on mutual information across different\naugmented views. Moreover, we theoretically analyze that the information\nentropy of the posterior of an image classifier is approximated by maximizing\nthe likelihood function of the softmax predictions. Guided by these insights,\nwe optimize our model from both perspectives to ensure that the predicted\nprobability distribution closely aligns with the ground-truth distribution.\nGiven the theoretical connection to information entropy, we name our method\n\\textit{InfoMatch}. Through extensive experiments, we show its superior\nperformance.\n","authors":["Qi Han","Zhibo Tian","Chengwei Xia","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.11003v1.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2308.13072v3","updated":"2024-04-17T02:09:54Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. 
Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel, with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, underlining its high\nquantitative and clinical precision in both denoising scenarios.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10992v1","updated":"2024-04-17T02:05:05Z","published":"2024-04-17T02:05:05Z","title":"How to deal with glare for improved perception of Autonomous Vehicles","summary":" Vision sensors are versatile and can capture a wide range of visual cues,\nsuch as color, texture, shape, and depth. This versatility, along with the\nrelatively inexpensive availability of machine vision cameras, played an\nimportant role in adopting vision-based environment perception systems in\nautonomous vehicles (AVs). However, vision-based perception systems can be\neasily affected by glare in the presence of a bright source of light, such as\nthe sun or the headlights of an oncoming vehicle at night, or simply by light\nreflecting off snow or ice-covered surfaces; scenarios encountered frequently\nduring driving. In this paper, we investigate various glare reduction\ntechniques, including the proposed saturated pixel-aware glare reduction\ntechnique for improved performance of the computer vision (CV) tasks employed\nby the perception layer of AVs. We evaluate these glare reduction methods based\non various performance metrics of the CV algorithms used by the perception\nlayer. Specifically, we considered object detection, object recognition, object\ntracking, depth estimation, and lane detection, which are crucial for autonomous\ndriving. The experimental findings validate the efficacy of the proposed glare\nreduction approach, showcasing enhanced performance across diverse perception\ntasks and remarkable resilience against varying levels of glare.\n","authors":["Muhammad Z. Alam","Zeeshan Kaleem","Sousso Kelouwani"],"pdf_url":"https://arxiv.org/pdf/2404.10992v1.pdf","comment":"14 pages, 9 figures, Accepted IEEE TIV"},{"id":"http://arxiv.org/abs/2404.10096v2","updated":"2024-04-17T02:02:33Z","published":"2024-04-15T19:06:58Z","title":"Vision Augmentation Prediction Autoencoder with Attention Design\n (VAPAAD)","summary":" Recent advancements in sequence prediction have significantly improved the\naccuracy of video data interpretation; however, existing models often overlook\nthe potential of attention-based mechanisms for next-frame prediction. This\nstudy introduces the Vision Augmentation Prediction Autoencoder with Attention\nDesign (VAPAAD), an innovative approach that integrates attention mechanisms\ninto sequence prediction, enabling nuanced analysis and understanding of\ntemporal dynamics in video sequences. 
Utilizing the Moving MNIST dataset, we\ndemonstrate VAPAAD's robust performance and superior handling of complex\ntemporal data compared to traditional methods. VAPAAD combines data\naugmentation, ConvLSTM2D layers, and a custom-built self-attention mechanism to\neffectively focus on salient features within a sequence, enhancing predictive\naccuracy and context-aware analysis. This methodology not only adheres to human\ncognitive processes during video interpretation but also addresses limitations\nin conventional models, which often struggle with the variability inherent in\nvideo sequences. The experimental results confirm that VAPAAD outperforms\nexisting models, especially in integrating attention mechanisms, which\nsignificantly improve predictive performance.\n","authors":["Yiqiao Yin"],"pdf_url":"https://arxiv.org/pdf/2404.10096v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.10989v1","updated":"2024-04-17T01:53:03Z","published":"2024-04-17T01:53:03Z","title":"FairSSD: Understanding Bias in Synthetic Speech Detectors","summary":" Methods that can generate synthetic speech which is perceptually\nindistinguishable from speech recorded by a human speaker, are easily\navailable. Several incidents report misuse of synthetic speech generated from\nthese methods to commit fraud. To counter such misuse, many methods have been\nproposed to detect synthetic speech. Some of these detectors are more\ninterpretable, can generalize to detect synthetic speech in the wild and are\nrobust to noise. However, limited work has been done on understanding bias in\nthese detectors. In this work, we examine bias in existing synthetic speech\ndetectors to determine if they will unfairly target a particular gender, age\nand accent group. We also inspect whether these detectors will have a higher\nmisclassification rate for bona fide speech from speech-impaired speakers w.r.t\nfluent speakers. Extensive experiments on 6 existing synthetic speech detectors\nusing more than 0.9 million speech signals demonstrate that most detectors are\ngender, age and accent biased, and future work is needed to ensure fairness. To\nsupport future research, we release our evaluation dataset, models used in our\nstudy and source code at https://gitlab.com/viper-purdue/fairssd.\n","authors":["Amit Kumar Singh Yadav","Kratika Bhagtani","Davide Salvi","Paolo Bestagini","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2404.10989v1.pdf","comment":"Accepted at CVPR 2024 (WMF)"},{"id":"http://arxiv.org/abs/2404.02155v2","updated":"2024-04-17T01:41:59Z","published":"2024-04-02T17:58:57Z","title":"Alpha Invariance: On Inverse Scaling Between Distance and Volume Density\n in Neural Radiance Fields","summary":" Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of\nvolumetric densities in neural radiance fields, i.e., the densities double when\nscene size is halved, and vice versa. We call this property alpha invariance.\nFor NeRFs to better maintain alpha invariance, we recommend 1) parameterizing\nboth distance and volume densities in log space, and 2) a\ndiscretization-agnostic initialization strategy to guarantee high ray\ntransmittance. We revisit a few popular radiance field models and find that\nthese systems use various heuristics to deal with issues arising from scene\nscaling. We test their behaviors and show our recipe to be more robust.\n","authors":["Joshua Ahn","Haochen Wang","Raymond A. 
Yeh","Greg Shakhnarovich"],"pdf_url":"https://arxiv.org/pdf/2404.02155v2.pdf","comment":"CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance"},{"id":"http://arxiv.org/abs/2404.10985v1","updated":"2024-04-17T01:35:52Z","published":"2024-04-17T01:35:52Z","title":"Pixel-Wise Symbol Spotting via Progressive Points Location for Parsing\n CAD Images","summary":" Parsing Computer-Aided Design (CAD) drawings is a fundamental step for CAD\nrevision, semantic-based management, and the generation of 3D prototypes in\nboth the architecture and engineering industries. Labeling symbols from a CAD\ndrawing is a challenging yet notorious task from a practical point of view. In\nthis work, we propose to label and spot symbols from CAD images that are\nconverted from CAD drawings. The advantage of spotting symbols from CAD images\nlies in the low requirement of labelers and the low-cost annotation. However,\npixel-wise spotting symbols from CAD images is challenging work. We propose a\npixel-wise point location via Progressive Gaussian Kernels (PGK) to balance\nbetween training efficiency and location accuracy. Besides, we introduce a\nlocal offset to the heatmap-based point location method. Based on the keypoints\ndetection, we propose a symbol grouping method to redraw the rectangle symbols\nin CAD images. We have released a dataset containing CAD images of equipment\nrooms from telecommunication industrial CAD drawings. Extensive experiments on\nthis real-world dataset show that the proposed method has good generalization\nability.\n","authors":["Junbiao Pang","Zailin Dong","Jiaxin Deng","Mengyuan Zhu","Yunwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10985v1.pdf","comment":"10 pages, 10 figures,6 tables"},{"id":"http://arxiv.org/abs/2404.10980v1","updated":"2024-04-17T01:26:15Z","published":"2024-04-17T01:26:15Z","title":"Hyper Evidential Deep Learning to Quantify Composite Classification\n Uncertainty","summary":" Deep neural networks (DNNs) have been shown to perform well on exclusive,\nmulti-class classification tasks. However, when different classes have similar\nvisual features, it becomes challenging for human annotators to differentiate\nthem. This scenario necessitates the use of composite class labels. In this\npaper, we propose a novel framework called Hyper-Evidential Neural Network\n(HENN) that explicitly models predictive uncertainty due to composite class\nlabels in training data in the context of the belief theory called Subjective\nLogic (SL). By placing a grouped Dirichlet distribution on the class\nprobabilities, we treat predictions of a neural network as parameters of\nhyper-subjective opinions and learn the network that collects both single and\ncomposite evidence leading to these hyper-opinions by a deterministic DNN from\ndata. We introduce a new uncertainty type called vagueness originally designed\nfor hyper-opinions in SL to quantify composite classification uncertainty for\nDNNs. Our results demonstrate that HENN outperforms its state-of-the-art\ncounterparts based on four image datasets. The code and datasets are available\nat: https://github.com/Hugo101/HyperEvidentialNN.\n","authors":["Changbin Li","Kangshuo Li","Yuzhe Ou","Lance M. 
Kaplan","Audun Jøsang","Jin-Hee Cho","Dong Hyun Jeong","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.10980v1.pdf","comment":"In Proceedings of The Twelfth International Conference on Learning\n Representations, ICLR 2024"},{"id":"http://arxiv.org/abs/2404.10978v1","updated":"2024-04-17T01:23:49Z","published":"2024-04-17T01:23:49Z","title":"Leveraging 3D LiDAR Sensors to Enable Enhanced Urban Safety and Public\n Health: Pedestrian Monitoring and Abnormal Activity Detection","summary":" The integration of Light Detection and Ranging (LiDAR) and Internet of Things\n(IoT) technologies offers transformative opportunities for public health\ninformatics in urban safety and pedestrian well-being. This paper proposes a\nnovel framework utilizing these technologies for enhanced 3D object detection\nand activity classification in urban traffic scenarios. By employing elevated\nLiDAR, we obtain detailed 3D point cloud data, enabling precise pedestrian\nactivity monitoring. To overcome urban data scarcity, we create a specialized\ndataset through simulated traffic environments in Blender, facilitating\ntargeted model training. Our approach employs a modified Point\nVoxel-Region-based Convolutional Neural Network (PV-RCNN) for robust 3D\ndetection and PointNet for classifying pedestrian activities, significantly\nbenefiting urban traffic management and public health by offering insights into\npedestrian behavior and promoting safer urban environments. Our dual-model\napproach not only enhances urban traffic management but also contributes\nsignificantly to public health by providing insights into pedestrian behavior\nand promoting safer urban environment.\n","authors":["Nawfal Guefrachi","Jian Shi","Hakim Ghazzai","Ahmad Alsharoa"],"pdf_url":"https://arxiv.org/pdf/2404.10978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02145v2","updated":"2024-04-17T01:10:28Z","published":"2024-04-02T17:57:31Z","title":"Iterated Learning Improves Compositionality in Large Vision-Language\n Models","summary":" A fundamental characteristic common to both human vision and natural language\nis their compositional nature. Yet, despite the performance gains contributed\nby large vision and language pretraining, recent investigations find that\nmost-if not all-our state-of-the-art vision-language models struggle at\ncompositionality. They are unable to distinguish between images of \" a girl in\nwhite facing a man in black\" and \"a girl in black facing a man in white\".\nMoreover, prior work suggests that compositionality doesn't arise with scale:\nlarger model sizes or training data don't help. This paper develops a new\niterated training algorithm that incentivizes compositionality. We draw on\ndecades of cognitive science research that identifies cultural transmission-the\nneed to teach a new generation-as a necessary inductive prior that incentivizes\nhumans to develop compositional languages. Specifically, we reframe\nvision-language contrastive learning as the Lewis Signaling Game between a\nvision agent and a language agent, and operationalize cultural transmission by\niteratively resetting one of the agent's weights during training. After every\niteration, this training paradigm induces representations that become \"easier\nto learn\", a property of compositional languages: e.g. 
our model trained on\nCC3M and CC12M improves standard CLIP by 4.7% and 4.0%, respectively, on the\nSugarCrepe benchmark.\n","authors":["Chenhao Zheng","Jieyu Zhang","Aniruddha Kembhavi","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.02145v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.10383v2","updated":"2024-04-17T01:05:07Z","published":"2024-04-16T08:25:36Z","title":"Learning to Score Sign Language with Two-stage Method","summary":" Human action recognition and performance assessment have been hot research\ntopics in recent years. Recognition problems have mature solutions in the field\nof sign language, but past research in performance analysis has focused on\ncompetitive sports and medical training, overlooking the scoring assessment,\nwhich is an important part of sign language teaching digitalization. In this\npaper, we analyze the existing technologies for performance assessment and\nadopt methods that perform well in human pose reconstruction tasks combined\nwith motion rotation embedded expressions, proposing a two-stage sign language\nperformance evaluation pipeline. Our analysis shows that choosing\nreconstruction tasks in the first stage can provide more expressive features,\nand using smoothing methods can provide an effective reference for assessment.\nExperiments show that our method provides good score feedback mechanisms and\nhigh consistency with professional assessments compared to end-to-end\nevaluations.\n","authors":["Hongli Wen","Yang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10383v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.11700v2","updated":"2024-04-17T01:04:07Z","published":"2023-10-18T04:15:39Z","title":"Runner re-identification from single-view running video in the\n open-world setting","summary":" In many sports, player re-identification is crucial for automatic video\nprocessing and analysis. However, most of the current studies on player\nre-identification in multi- or single-view sports videos focus on\nre-identification in the closed-world setting using labeled image datasets, and\nplayer re-identification in the open-world setting for automatic video analysis\nis not well developed. In this paper, we propose a runner re-identification\nsystem that directly processes single-view video to address the open-world\nsetting. In the open-world setting, we cannot use a labeled dataset and have to\nprocess video directly. The proposed system automatically processes raw video\nas input to identify runners, and it can identify runners even when they are\nframed out multiple times. For the automatic processing, we first detect the\nrunners in the video using the pre-trained YOLOv8 and the fine-tuned\nEfficientNet. We then track the runners using ByteTrack and detect their shoes\nwith the fine-tuned YOLOv8. Finally, we extract the image features of the\nrunners using an unsupervised method with the gated recurrent unit autoencoder\nand global and local features mixing. To improve the accuracy of runner\nre-identification, we use shoe images as local image features and dynamic\nfeatures of running sequence images. We evaluated the system on a running\npractice video dataset and showed that the proposed method identified runners\nwith higher accuracy than some state-of-the-art models in unsupervised\nre-identification. We also showed that our proposed local image feature and\nrunning dynamic feature were effective for runner re-identification. 
Our runner\nre-identification system can be useful for the automatic analysis of running\nvideos.\n","authors":["Tomohiro Suzuki","Kazushi Tsutsui","Kazuya Takeda","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2310.11700v2.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.03557v2","updated":"2024-04-17T01:01:17Z","published":"2024-02-05T22:15:55Z","title":"Robust Analysis of Multi-Task Learning Efficiency: New Benchmarks on\n Light-Weighed Backbones and Effective Measurement of Multi-Task Learning\n Challenges by Feature Disentanglement","summary":" One of the main motivations of MTL is to develop neural networks capable of\ninferring multiple tasks simultaneously. While countless methods have been\nproposed in the past decade investigating robust model architectures and\nefficient training algorithms, there is still lack of understanding of these\nmethods when applied on smaller feature extraction backbones, the\ngeneralizability of the commonly used fast approximation technique of replacing\nparameter-level gradients with feature level gradients, and lack of\ncomprehensive understanding of MTL challenges and how one can efficiently and\neffectively identify the challenges. In this paper, we focus on the\naforementioned efficiency aspects of existing MTL methods. We first carry out\nlarge-scale experiments of the methods with smaller backbones and on a the\nMetaGraspNet dataset as a new test ground. We also compare the existing methods\nwith and without using the fast gradient surrogate and empirically study the\ngeneralizability of this technique. Lastly, we propose Feature Disentanglement\nmeasure as a novel and efficient identifier of the challenges in MTL, and\npropose Ranking Similarity score as an evaluation metric for different\nidentifiers to prove the faithfulness of our method.\n","authors":["Dayou Mao","Yuhao Chen","Yifan Wu","Maximilian Gilles","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2402.03557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10966v1","updated":"2024-04-17T00:21:36Z","published":"2024-04-17T00:21:36Z","title":"Domain-Specific Block Selection and Paired-View Pseudo-Labeling for\n Online Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test\ndomain without access to source data after deployment. Existing approaches\ntypically rely on self-training with pseudo-labels since ground-truth cannot be\nobtained from test data. Although the quality of pseudo labels is important for\nstable and accurate long-term adaptation, it has not been previously addressed.\nIn this work, we propose DPLOT, a simple yet effective TTA framework that\nconsists of two components: (1) domain-specific block selection and (2)\npseudo-label generation using paired-view images. Specifically, we select\nblocks that involve domain-specific feature extraction and train these blocks\nby entropy minimization. After blocks are adjusted for current test domain, we\ngenerate pseudo-labels by averaging given test images and corresponding flipped\ncounterparts. By simply using flip augmentation, we prevent a decrease in the\nquality of the pseudo-labels, which can be caused by the domain gap resulting\nfrom strong augmentation. Our experimental results demonstrate that DPLOT\noutperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C\nbenchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. 
Also,\nwe provide an extensive analysis to demonstrate effectiveness of our framework.\nCode is available at\nhttps://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA.\n","authors":["Yeonguk Yu","Sungho Shin","Seunghyeok Back","Minhwan Ko","Sangjun Noh","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10966v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11803v1","updated":"2024-04-17T23:49:00Z","published":"2024-04-17T23:49:00Z","title":"TempBEV: Improving Learned BEV Encoders with Combined Image and BEV\n Space Temporal Aggregation","summary":" Autonomous driving requires an accurate representation of the environment. A\nstrategy toward high accuracy is to fuse data from several sensors. Learned\nBird's-Eye View (BEV) encoders can achieve this by mapping data from individual\nsensors into one joint latent space. For cost-efficient camera-only systems,\nthis provides an effective mechanism to fuse data from multiple cameras with\ndifferent views. Accuracy can further be improved by aggregating sensor\ninformation over time. This is especially important in monocular camera systems\nto account for the lack of explicit depth and velocity measurements. Thereby,\nthe effectiveness of developed BEV encoders crucially depends on the operators\nused to aggregate temporal information and on the used latent representation\nspaces. We analyze BEV encoders proposed in the literature and compare their\neffectiveness, quantifying the effects of aggregation operators and latent\nrepresentations. While most existing approaches aggregate temporal information\neither in image or in BEV latent space, our analyses and performance\ncomparisons suggest that these latent representations exhibit complementary\nstrengths. Therefore, we develop a novel temporal BEV encoder, TempBEV, which\nintegrates aggregated temporal information from both latent spaces. We consider\nsubsequent image frames as stereo through time and leverage methods from\noptical flow estimation for temporal stereo encoding. Empirical evaluation on\nthe NuScenes dataset shows a significant improvement by TempBEV over the\nbaseline for 3D object detection and BEV segmentation. The ablation uncovers a\nstrong synergy of joint temporal aggregation in the image and BEV latent space.\nThese results indicate the overall effectiveness of our approach and make a\nstrong case for aggregating temporal information in both image and BEV latent\nspaces.\n","authors":["Thomas Monninger","Vandana Dokkadi","Md Zafar Anwar","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2404.11803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11798v1","updated":"2024-04-17T23:33:34Z","published":"2024-04-17T23:33:34Z","title":"Establishing a Baseline for Gaze-driven Authentication Performance in\n VR: A Breadth-First Investigation on a Very Large Dataset","summary":" This paper performs the crucial work of establishing a baseline for\ngaze-driven authentication performance to begin answering fundamental research\nquestions using a very large dataset of gaze recordings from 9202 people with a\nlevel of eye tracking (ET) signal quality equivalent to modern consumer-facing\nvirtual reality (VR) platforms. 
The size of the employed dataset is at least an\norder-of-magnitude larger than any other dataset from previous related work.\nBinocular estimates of the optical and visual axes of the eyes and a minimum\nduration for enrollment and verification are required for our model to achieve\na false rejection rate (FRR) of below 3% at a false acceptance rate (FAR) of 1\nin 50,000. In terms of identification accuracy which decreases with gallery\nsize, we estimate that our model would fall below chance-level accuracy for\ngallery sizes of 148,000 or more. Our major findings indicate that gaze\nauthentication can be as accurate as required by the FIDO standard when driven\nby a state-of-the-art machine learning architecture and a sufficiently large\ntraining dataset.\n","authors":["Dillon Lohr","Michael J. Proulx","Oleg Komogortsev"],"pdf_url":"https://arxiv.org/pdf/2404.11798v1.pdf","comment":"28 pages, 18 figures, 5 tables, includes supplementary material"},{"id":"http://arxiv.org/abs/2404.11797v1","updated":"2024-04-17T23:30:48Z","published":"2024-04-17T23:30:48Z","title":"When are Foundation Models Effective? Understanding the Suitability for\n Pixel-Level Classification Using Multispectral Imagery","summary":" Foundation models, i.e., very large deep learning models, have demonstrated\nimpressive performances in various language and vision tasks that are otherwise\ndifficult to reach using smaller-size models. The major success of GPT-type of\nlanguage models is particularly exciting and raises expectations on the\npotential of foundation models in other domains including satellite remote\nsensing. In this context, great efforts have been made to build foundation\nmodels to test their capabilities in broader applications, and examples include\nPrithvi by NASA-IBM, Segment-Anything-Model, ViT, etc. This leads to an\nimportant question: Are foundation models always a suitable choice for\ndifferent remote sensing tasks, and when or when not? This work aims to enhance\nthe understanding of the status and suitability of foundation models for\npixel-level classification using multispectral imagery at moderate resolution,\nthrough comparisons with traditional machine learning (ML) and regular-size\ndeep learning models. Interestingly, the results reveal that in many scenarios\ntraditional ML models still have similar or better performance compared to\nfoundation models, especially for tasks where texture is less useful for\nclassification. On the other hand, deep learning models did show more promising\nresults for tasks where labels partially depend on texture (e.g., burn scar),\nwhile the difference in performance between foundation models and deep learning\nmodels is not obvious. 
The results conform to our analysis: The suitability\nof foundation models depends on the alignment between the self-supervised\nlearning tasks and the real downstream tasks, and the typical masked\nautoencoder paradigm is not necessarily suitable for many remote sensing\nproblems.\n","authors":["Yiqun Xie","Zhihao Wang","Weiye Chen","Zhili Li","Xiaowei Jia","Yanhua Li","Ruichen Wang","Kangyang Chai","Ruohan Li","Sergii Skakun"],"pdf_url":"https://arxiv.org/pdf/2404.11797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10913v2","updated":"2024-04-17T23:27:02Z","published":"2023-08-20T22:29:16Z","title":"Automated mapping of virtual environments with visual predictive coding","summary":" Humans construct internal cognitive maps of their environment directly from\nsensory inputs without access to a system of explicit coordinates or distance\nmeasurements. While machine learning algorithms like SLAM utilize specialized\nvisual inference procedures to identify visual features and construct spatial\nmaps from visual and odometry data, the general nature of cognitive maps in the\nbrain suggests a unified mapping algorithmic strategy that can generalize to\nauditory, tactile, and linguistic inputs. Here, we demonstrate that predictive\ncoding provides a natural and versatile neural network algorithm for\nconstructing spatial maps using sensory data. We introduce a framework in which\nan agent navigates a virtual environment while engaging in visual predictive\ncoding using a self-attention-equipped convolutional neural network. While\nlearning a next-image prediction task, the agent automatically constructs an\ninternal representation of the environment that quantitatively reflects\ndistances. The internal map enables the agent to pinpoint its location relative\nto landmarks using only visual information. The predictive coding network\ngenerates a vectorized encoding of the environment that supports vector\nnavigation, where individual latent space units delineate localized, overlapping\nneighborhoods in the environment. Broadly, our work introduces predictive\ncoding as a unified algorithmic framework for constructing cognitive maps that\ncan naturally extend to the mapping of auditory, sensorimotor, and linguistic\ninputs.\n","authors":["James Gornet","Matthew Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.10913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11795v1","updated":"2024-04-17T23:10:11Z","published":"2024-04-17T23:10:11Z","title":"Prompt-Driven Feature Diffusion for Open-World Semi-Supervised Learning","summary":" In this paper, we present a novel approach termed Prompt-Driven Feature\nDiffusion (PDFD) within a semi-supervised learning framework for Open World\nSemi-Supervised Learning (OW-SSL). At its core, PDFD deploys an efficient\nfeature-level diffusion model with the guidance of class-specific prompts to\nsupport discriminative feature representation learning and feature generation,\ntackling the challenge of the unavailability of labeled data for unseen\nclasses in OW-SSL. In particular, PDFD utilizes class prototypes as prompts in\nthe diffusion model, leveraging their class-discriminative and semantic\ngeneralization ability to condition and guide the diffusion process across all\nthe seen and unseen classes. Furthermore, PDFD incorporates a class-conditional\nadversarial loss for diffusion model training, ensuring that the features\ngenerated via the diffusion process can be discriminatively aligned with the\nclass-conditional features of the real data.
Additionally, the class prototypes\nof the unseen classes are computed using only unlabeled instances with\nconfident predictions within a semi-supervised learning framework. We conduct\nextensive experiments to evaluate the proposed PDFD. The empirical results show\nthat PDFD exhibits remarkable performance enhancements over many state-of-the-art\nexisting methods.\n","authors":["Marzi Heidari","Hanping Zhang","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03338v2","updated":"2024-04-17T23:02:39Z","published":"2022-12-06T21:42:05Z","title":"Framework-agnostic Semantically-aware Global Reasoning for Segmentation","summary":" Recent advances in pixel-level tasks (e.g. segmentation) illustrate the\nbenefit of long-range interactions between aggregated region-based\nrepresentations that can enhance local features. However, such aggregated\nrepresentations, often in the form of attention, fail to model the underlying\nsemantics of the scene (e.g. individual objects and, by extension, their\ninteractions). In this work, we address the issue by proposing a component that\nlearns to project image features into latent representations and reason between\nthem using a transformer encoder to generate contextualized and\nscene-consistent representations, which are fused with the original image features.\nOur design encourages the latent regions to represent semantic concepts by\nensuring that the activated regions are spatially disjoint and the union of\nsuch regions corresponds to a connected object segment. The proposed semantic\nglobal reasoning (SGR) component is end-to-end trainable and can be easily\nadded to a wide variety of backbones (CNN or transformer-based) and\nsegmentation heads (per-pixel or mask classification) to consistently improve\nthe segmentation results on different datasets. In addition, our latent tokens\nare semantically interpretable and diverse, and provide a rich set of features\nthat can be transferred to downstream tasks like object detection and\nsegmentation, with improved performance. Furthermore, we also propose metrics\nto quantify the semantics of latent tokens at both the class \\& instance levels.\n","authors":["Mir Rayat Imtiaz Hossain","Leonid Sigal","James J. Little"],"pdf_url":"https://arxiv.org/pdf/2212.03338v2.pdf","comment":"Published in WACV 2024"},{"id":"http://arxiv.org/abs/2403.14115v2","updated":"2024-04-17T22:38:14Z","published":"2024-03-21T04:01:26Z","title":"Training point-based deep learning networks for forest segmentation with\n synthetic data","summary":" Remote sensing through unmanned aerial systems (UAS) has been increasing in\nforestry in recent years, along with the use of machine learning for data\nprocessing. Deep learning architectures, extensively applied in natural\nlanguage and image processing, have recently been extended to the point cloud\ndomain. However, the availability of point cloud datasets for training and\ntesting remains limited. Creating forested environment point cloud datasets is\nexpensive, requires high-precision sensors, and is time-consuming as manual\npoint classification is required. Moreover, forest areas could be inaccessible\nor dangerous for humans, further complicating data collection. The question then\narises whether it is possible to use synthetic data to train deep learning\nnetworks without the need to rely on large volumes of real forest data.
To\nanswer this question, we developed a realistic simulator that procedurally\ngenerates synthetic forest scenes. Thanks to this, we have conducted a\ncomparative study of different state-of-the-art point-based deep learning\nnetworks for forest segmentation. Using created datasets, we determined the\nfeasibility of using synthetic data to train deep learning networks to classify\npoint clouds from real forest datasets. Both the simulator and the datasets are\nreleased as part of this work.\n","authors":["Francisco Raverta Capua","Juan Schandin","Pablo De Cristóforis"],"pdf_url":"https://arxiv.org/pdf/2403.14115v2.pdf","comment":"15 pages, 4 figures. Submitted to the International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2404.11778v1","updated":"2024-04-17T22:02:22Z","published":"2024-04-17T22:02:22Z","title":"CU-Mamba: Selective State Space Models with Channel Learning for Image\n Restoration","summary":" Reconstructing degraded images is a critical task in image processing.\nAlthough CNN and Transformer-based models are prevalent in this field, they\nexhibit inherent limitations, such as inadequate long-range dependency modeling\nand high computational costs. To overcome these issues, we introduce the\nChannel-Aware U-Shaped Mamba (CU-Mamba) model, which incorporates a dual State\nSpace Model (SSM) framework into the U-Net architecture. CU-Mamba employs a\nSpatial SSM module for global context encoding and a Channel SSM component to\npreserve channel correlation features, both in linear computational complexity\nrelative to the feature map size. Extensive experimental results validate\nCU-Mamba's superiority over existing state-of-the-art methods, underscoring the\nimportance of integrating both spatial and channel contexts in image\nrestoration.\n","authors":["Rui Deng","Tianpei Gu"],"pdf_url":"https://arxiv.org/pdf/2404.11778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11776v1","updated":"2024-04-17T21:57:29Z","published":"2024-04-17T21:57:29Z","title":"3D object quality prediction for Metal Jet Printer with Multimodal\n thermal encoder","summary":" With the advancements in 3D printing technologies, it is extremely important\nthat the quality of 3D printed objects, and dimensional accuracies should meet\nthe customer's specifications. Various factors during metal printing affect the\nprinted parts' quality, including the power quality, the printing stage\nparameters, the print part's location inside the print bed, the curing stage\nparameters, and the metal sintering process. With the large data gathered from\nHP's MetJet printing process, AI techniques can be used to analyze, learn, and\neffectively infer the printed part quality metrics, as well as assist in\nimproving the print yield. In-situ thermal sensing data captured by\nprinter-installed thermal sensors contains the part thermal signature of fusing\nlayers. Such part thermal signature contains a convoluted impact from various\nfactors. 
In this paper, we use a multimodal thermal encoder network to fuse\ndata of different natures, including the video data, vectorized printer control\ndata, and exact part thermal signatures, with a trained encoder-decoder module.\nWe explored data fusion techniques and the stages at which to fuse the data; the\noptimized end-to-end model architecture shows improved part quality\nprediction accuracy.\n","authors":["Rachel Chen","Wenjia Zheng","Sandeep Jalui","Pavan Suri","Jun Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.11776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11770v1","updated":"2024-04-17T21:53:01Z","published":"2024-04-17T21:53:01Z","title":"Event-Based Eye Tracking. AIS 2024 Challenge Survey","summary":" This survey reviews the AIS 2024 Event-Based Eye Tracking (EET) Challenge.\nThe task of the challenge focuses on processing eye movements recorded with\nevent cameras and predicting the pupil center of the eye. The challenge\nemphasizes efficient eye tracking with event cameras to achieve a good trade-off\nbetween task accuracy and efficiency. During the challenge period, 38 participants\nregistered for the Kaggle competition, and 8 teams submitted a challenge\nfactsheet. The novel and diverse methods from the submitted factsheets are\nreviewed and analyzed in this survey to advance future event-based eye tracking\nresearch.\n","authors":["Zuowen Wang","Chang Gao","Zongwei Wu","Marcos V. Conde","Radu Timofte","Shih-Chii Liu","Qinyu Chen","Zheng-jun Zha","Wei Zhai","Han Han","Bohao Liao","Yuliang Wu","Zengyu Wan","Zhong Wang","Yang Cao","Ganchao Tan","Jinze Chen","Yan Ru Pei","Sasskia Brüers","Sébastien Crouzet","Douglas McLelland","Oliver Coenen","Baoheng Zhang","Yizhao Gao","Jingyuan Li","Hayden Kwok-Hay So","Philippe Bich","Chiara Boretti","Luciano Prono","Mircea Lică","David Dinucu-Jianu","Cătălin Grîu","Xiaopeng Lin","Hongwei Ren","Bojun Cheng","Xinan Zhang","Valentin Vial","Anthony Yezzi","James Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.11770v1.pdf","comment":"Qinyu Chen is the corresponding author"},{"id":"http://arxiv.org/abs/2404.11769v1","updated":"2024-04-17T21:52:21Z","published":"2024-04-17T21:52:21Z","title":"QGen: On the Ability to Generalize in Quantization Aware Training","summary":" Quantization lowers memory usage, computational requirements, and latency by\nutilizing fewer bits to represent model weights and activations. In this work,\nwe investigate the generalization properties of quantized neural networks, a\ncharacteristic that has received little attention despite its implications for\nmodel performance. In particular, we first develop a theoretical model for\nquantization in neural networks and demonstrate how quantization functions as a\nform of regularization. Second, motivated by recent work connecting the\nsharpness of the loss landscape and generalization, we derive an approximate\nbound for the generalization of quantized models conditioned on the amount of\nquantization noise.
We then validate our hypothesis by experimenting with over\n2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on\nconvolutional and transformer-based models.\n","authors":["MohammadHossein AskariHemmat","Ahmadreza Jeddi","Reyhane Askari Hemmat","Ivan Lazarevich","Alexander Hoffman","Sudhakar Sah","Ehsan Saboori","Yvon Savaria","Jean-Pierre David"],"pdf_url":"https://arxiv.org/pdf/2404.11769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11764v1","updated":"2024-04-17T21:47:45Z","published":"2024-04-17T21:47:45Z","title":"Multimodal 3D Object Detection on Unseen Domains","summary":" LiDAR datasets for autonomous driving exhibit biases in properties such as\npoint cloud density, range, and object dimensions. As a result, object\ndetection networks trained and evaluated in different environments often\nexperience performance degradation. Domain adaptation approaches assume access\nto unannotated samples from the test distribution to address this problem.\nHowever, in the real world, the exact conditions of deployment and access to\nsamples representative of the test dataset may be unavailable while training.\nWe argue that the more realistic and challenging formulation is to require\nrobustness in performance to unseen target domains. We propose to address this\nproblem in a two-pronged manner. First, we leverage paired LiDAR-image data\npresent in most autonomous driving datasets to perform multimodal object\ndetection. We suggest that working with multimodal features by leveraging both\nimages and LiDAR point clouds for scene understanding tasks results in object\ndetectors more robust to unseen domain shifts. Second, we train a 3D object\ndetector to learn multimodal object features across different distributions and\npromote feature invariance across these source domains to improve\ngeneralizability to unseen target domains. To this end, we propose\nCLIX$^\\text{3D}$, a multimodal fusion and supervised contrastive learning\nframework for 3D object detection that performs alignment of object features\nfrom same-class samples of different domains while pushing the features from\ndifferent classes apart. We show that CLIX$^\\text{3D}$ yields state-of-the-art\ndomain generalization performance under multiple dataset shifts.\n","authors":["Deepti Hegde","Suhas Lohit","Kuan-Chuan Peng","Michael J. Jones","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.11764v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2404.11762v1","updated":"2024-04-17T21:43:43Z","published":"2024-04-17T21:43:43Z","title":"IrrNet: Advancing Irrigation Mapping with Incremental Patch Size\n Training on Remote Sensing Imagery","summary":" Irrigation mapping plays a crucial role in effective water management,\nessential for preserving both water quality and quantity, and is key to\nmitigating the global issue of water scarcity. The complexity of agricultural\nfields, adorned with diverse irrigation practices, especially when multiple\nsystems coexist in close quarters, poses a unique challenge. This complexity is\nfurther compounded by the nature of Landsat's remote sensing data, where each\npixel is rich with densely packed information, complicating the task of\naccurate irrigation mapping. In this study, we introduce an innovative approach\nthat employs a progressive training method, which strategically increases patch\nsizes throughout the training process, utilizing datasets from Landsat 5 and 7,\nlabeled with the WRLU dataset for precise labeling. 
This initial focus allows\nthe model to capture detailed features, progressively shifting to broader, more\ngeneral features as the patch size enlarges. Remarkably, our method enhances\nthe performance of existing state-of-the-art models by approximately 20%.\nFurthermore, our analysis delves into the significance of incorporating various\nspectral bands into the model, assessing their impact on performance. The\nfindings reveal that additional bands are instrumental in enabling the model to\ndiscern finer details more effectively. This work sets a new standard for\nleveraging remote sensing imagery in irrigation mapping.\n","authors":["Oishee Bintey Hoque","Samarth Swarup","Abhijin Adiga","Sayjro Kossi Nouwakpo","Madhav Marathe"],"pdf_url":"https://arxiv.org/pdf/2404.11762v1.pdf","comment":"Full version of the paper will be appearing in Proceedings of the\n IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n Workshops, 2024"},{"id":"http://arxiv.org/abs/2302.04143v2","updated":"2024-04-17T21:20:14Z","published":"2023-02-08T15:41:21Z","title":"Predicting Thrombectomy Recanalization from CT Imaging Using Deep\n Learning Models","summary":" For acute ischemic stroke (AIS) patients with large vessel occlusions,\nclinicians must decide if the benefit of mechanical thrombectomy (MTB)\noutweighs the risks and potential complications following an invasive\nprocedure. Pre-treatment computed tomography (CT) and angiography (CTA) are\nwidely used to characterize occlusions in the brain vasculature. If a patient\nis deemed eligible, a modified treatment in cerebral ischemia (mTICI) score\nwill be used to grade how well blood flow is reestablished throughout and\nfollowing the MTB procedure. An estimation of the likelihood of successful\nrecanalization can support treatment decision-making. In this study, we\nproposed a fully automated prediction of a patient's recanalization score using\npre-treatment CT and CTA imaging. We designed a spatial cross attention network\n(SCANet) that utilizes vision transformers to localize to pertinent slices and\nbrain regions. Our top model achieved an average cross-validated ROC-AUC of\n77.33 $\\pm$ 3.9\\%. This is a promising result that supports future applications\nof deep learning on CT and CTA for the identification of eligible AIS patients\nfor MTB.\n","authors":["Haoyue Zhang","Jennifer S. Polson","Eric J. Yang","Kambiz Nael","William Speier","Corey W. Arnold"],"pdf_url":"https://arxiv.org/pdf/2302.04143v2.pdf","comment":"Medical Imaging with Deep Learning 2022 accepted short paper Jun 2022"},{"id":"http://arxiv.org/abs/2210.12100v2","updated":"2024-04-17T21:16:56Z","published":"2022-10-21T16:52:16Z","title":"Boomerang: Local sampling on image manifolds using diffusion models","summary":" The inference stage of diffusion models can be seen as running a reverse-time\ndiffusion stochastic differential equation, where samples from a Gaussian\nlatent distribution are transformed into samples from a target distribution\nthat usually reside on a low-dimensional manifold, e.g., an image manifold. The\nintermediate values between the initial latent space and the image manifold can\nbe interpreted as noisy images, with the amount of noise determined by the\nforward diffusion process noise schedule. We utilize this interpretation to\npresent Boomerang, an approach for local sampling of image manifolds. 
As\nimplied by its name, Boomerang local sampling involves adding noise to an input\nimage, moving it closer to the latent space, and then mapping it back to the\nimage manifold through a partial reverse diffusion process. Thus, Boomerang\ngenerates images on the manifold that are ``similar,'' but nonidentical, to the\noriginal input image. We can control the proximity of the generated images to\nthe original by adjusting the amount of noise added. Furthermore, due to the\nstochastic nature of the reverse diffusion process in Boomerang, the generated\nimages display a certain degree of stochasticity, allowing us to obtain local\nsamples from the manifold without encountering any duplicates. Boomerang offers\nthe flexibility to work seamlessly with any pretrained diffusion model, such as\nStable Diffusion, without necessitating any adjustments to the reverse\ndiffusion process. We present three applications for Boomerang. First, we\nprovide a framework for constructing privacy-preserving datasets having\ncontrollable degrees of anonymity. Second, we show that using Boomerang for\ndata augmentation increases generalization performance and outperforms\nstate-of-the-art synthetic data augmentation. Lastly, we introduce a perceptual\nimage enhancement framework, which enables resolution enhancement.\n","authors":["Lorenzo Luzi","Paul M Mayer","Josue Casco-Rodriguez","Ali Siahkoohi","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2210.12100v2.pdf","comment":"Published in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2404.11741v1","updated":"2024-04-17T20:48:19Z","published":"2024-04-17T20:48:19Z","title":"Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT\n Synthesis for Head and Neck Proton Treatment Planning","summary":" In recent advancements in proton therapy, MR-based treatment planning is\ngaining momentum to minimize additional radiation exposure compared to\ntraditional CT-based methods. This transition highlights the critical need for\naccurate MR-to-CT image synthesis, which is essential for precise proton dose\ncalculations. Our research introduces the Diffusion Schr\\\"odinger Bridge Models\n(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns\nthe nonlinear diffusion processes between MR and CT data distributions. This\nmethod improves upon traditional diffusion models by initiating synthesis from\nthe prior distribution rather than the Gaussian distribution, enhancing both\ngeneration quality and efficiency. We validated the effectiveness of DSBM on a\nhead and neck cancer dataset, demonstrating its superiority over traditional\nimage synthesis methods through both image-level and dosimetric-level\nevaluations. The effectiveness of DSBM in MR-based proton treatment planning\nhighlights its potential as a valuable tool in various clinical scenarios.\n","authors":["Muheng Li","Xia Li","Sairos Safai","Damien Weber","Antony Lomax","Ye Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11741v1.pdf","comment":"International Conference on the use of Computers in Radiation therapy\n (ICCR)"},{"id":"http://arxiv.org/abs/2404.11737v1","updated":"2024-04-17T20:41:49Z","published":"2024-04-17T20:41:49Z","title":"Equivariant Spatio-Temporal Self-Supervision for LiDAR Object Detection","summary":" Popular representation learning methods encourage feature invariance under\ntransformations applied at the input. 
However, in 3D perception tasks like\nobject localization and segmentation, outputs are naturally equivariant to some\ntransformations, such as rotation. Using pre-training loss functions that\nencourage equivariance of features under certain transformations provides a\nstrong self-supervision signal while also retaining information of geometric\nrelationships between transformed feature representations. This can enable\nimproved performance in downstream tasks that are equivariant to such\ntransformations. In this paper, we propose a spatio-temporal equivariant\nlearning framework by considering both spatial and temporal augmentations\njointly. Our experiments show that the best performance arises with a\npre-training approach that encourages equivariance to translation, scaling, and\nflip, rotation and scene flow. For spatial augmentations, we find that\ndepending on the transformation, either a contrastive objective or an\nequivariance-by-classification objective yields best results. To leverage\nreal-world object deformations and motion, we consider sequential LiDAR scene\npairs and develop a novel 3D scene flow-based equivariance objective that leads\nto improved performance overall. We show our pre-training method for 3D object\ndetection which outperforms existing equivariant and invariant approaches in\nmany settings.\n","authors":["Deepti Hegde","Suhas Lohit","Kuan-Chuan Peng","Michael J. Jones","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.11737v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2404.11735v1","updated":"2024-04-17T20:37:29Z","published":"2024-04-17T20:37:29Z","title":"Learning with 3D rotations, a hitchhiker's guide to SO(3)","summary":" Many settings in machine learning require the selection of a rotation\nrepresentation. However, choosing a suitable representation from the many\navailable options is challenging. This paper acts as a survey and guide through\nrotation representations. We walk through their properties that harm or benefit\ndeep learning with gradient-based optimization. By consolidating insights from\nrotation-based learning, we provide a comprehensive overview of learning\nfunctions with rotation representations. We provide guidance on selecting\nrepresentations based on whether rotations are in the model's input or output\nand whether the data primarily comprises small angles.\n","authors":["A. René Geist","Jonas Frey","Mikel Zobro","Anna Levina","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2404.11735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11732v1","updated":"2024-04-17T20:35:00Z","published":"2024-04-17T20:35:00Z","title":"Visual Prompting for Generalized Few-shot Segmentation: A Multi-scale\n Approach","summary":" The emergence of attention-based transformer models has led to their\nextensive use in various tasks, due to their superior generalization and\ntransfer properties. Recent research has demonstrated that such models, when\nprompted appropriately, are excellent for few-shot inference. However, such\ntechniques are under-explored for dense prediction tasks like semantic\nsegmentation. In this work, we examine the effectiveness of prompting a\ntransformer-decoder with learned visual prompts for the generalized few-shot\nsegmentation (GFSS) task. Our goal is to achieve strong performance not only on\nnovel categories with limited examples, but also to retain performance on base\ncategories. We propose an approach to learn visual prompts with limited\nexamples. 
These learned visual prompts are used to prompt a multiscale\ntransformer decoder to facilitate accurate dense predictions. Additionally, we\nintroduce a unidirectional causal attention mechanism between the novel\nprompts, learned with limited examples, and the base prompts, learned with\nabundant data. This mechanism enriches the novel prompts without deteriorating\nthe base class performance. Overall, this form of prompting helps us achieve\nstate-of-the-art performance for GFSS on two different benchmark datasets:\nCOCO-$20^i$ and Pascal-$5^i$, without the need for test-time optimization (or\ntransduction). Furthermore, test-time optimization leveraging unlabelled test\ndata can be used to improve the prompts, which we refer to as transductive\nprompt tuning.\n","authors":["Mir Rayat Imtiaz Hossain","Mennatullah Siam","Leonid Sigal","James J. Little"],"pdf_url":"https://arxiv.org/pdf/2404.11732v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.06341v2","updated":"2024-04-17T20:33:56Z","published":"2024-01-12T03:21:02Z","title":"AffordanceLLM: Grounding Affordance from Vision Language Models","summary":" Affordance grounding refers to the task of finding the area of an object with\nwhich one can interact. It is a fundamental but challenging task, as a\nsuccessful solution requires the comprehensive understanding of a scene in\nmultiple aspects including detection, localization, and recognition of objects\nwith their parts, of geo-spatial configuration/layout of the scene, of 3D\nshapes and physics, as well as of the functionality and potential interaction\nof the objects and humans. Much of the knowledge is hidden and beyond the image\ncontent with the supervised labels from a limited training set. In this paper,\nwe make an attempt to improve the generalization capability of the current\naffordance grounding by taking the advantage of the rich world, abstract, and\nhuman-object-interaction knowledge from pretrained large-scale vision language\nmodels. Under the AGD20K benchmark, our proposed model demonstrates a\nsignificant performance gain over the competing methods for in-the-wild object\naffordance grounding. We further demonstrate it can ground affordance for\nobjects from random Internet images, even if both objects and actions are\nunseen during training. Project site: https://jasonqsy.github.io/AffordanceLLM/\n","authors":["Shengyi Qian","Weifeng Chen","Min Bai","Xiong Zhou","Zhuowen Tu","Li Erran Li"],"pdf_url":"https://arxiv.org/pdf/2401.06341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11727v1","updated":"2024-04-17T20:28:15Z","published":"2024-04-17T20:28:15Z","title":"Deep Learning for Video-Based Assessment of Endotracheal Intubation\n Skills","summary":" Endotracheal intubation (ETI) is an emergency procedure performed in civilian\nand combat casualty care settings to establish an airway. Objective and\nautomated assessment of ETI skills is essential for the training and\ncertification of healthcare providers. However, the current approach is based\non manual feedback by an expert, which is subjective, time- and\nresource-intensive, and is prone to poor inter-rater reliability and halo\neffects. This work proposes a framework to evaluate ETI skills using single and\nmulti-view videos. The framework consists of two stages. First, a 2D\nconvolutional autoencoder (AE) and a pre-trained self-supervision network\nextract features from videos. 
Second, a 1D convolutional enhanced with a\ncross-view attention module takes the features from the AE as input and outputs\npredictions for skill evaluation. The ETI datasets were collected in two\nphases. In the first phase, ETI is performed by two subject cohorts: Experts\nand Novices. In the second phase, novice subjects perform ETI under time\npressure, and the outcome is either Successful or Unsuccessful. A third dataset\nof videos from a single head-mounted camera for Experts and Novices is also\nanalyzed. The study achieved an accuracy of 100% in identifying Expert/Novice\ntrials in the initial phase. In the second phase, the model showed 85% accuracy\nin classifying Successful/Unsuccessful procedures. Using head-mounted cameras\nalone, the model showed a 96% accuracy on Expert and Novice classification\nwhile maintaining an accuracy of 85% on classifying successful and\nunsuccessful. In addition, GradCAMs are presented to explain the differences\nbetween Expert and Novice behavior and Successful and Unsuccessful trials. The\napproach offers a reliable and objective method for automated assessment of ETI\nskills.\n","authors":["Jean-Paul Ainam","Erim Yanik","Rahul Rahul","Taylor Kunkes","Lora Cavuoto","Brian Clemency","Kaori Tanaka","Matthew Hackett","Jack Norfleet","Suvranu De"],"pdf_url":"https://arxiv.org/pdf/2404.11727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11725v1","updated":"2024-04-17T20:23:07Z","published":"2024-04-17T20:23:07Z","title":"Postoperative glioblastoma segmentation: Development of a fully\n automated pipeline using deep convolutional neural networks and comparison\n with currently available models","summary":" Accurately assessing tumor removal is paramount in the management of\nglioblastoma. We developed a pipeline using MRI scans and neural networks to\nsegment tumor subregions and the surgical cavity in postoperative images. Our\nmodel excels in accurately classifying the extent of resection, offering a\nvaluable tool for clinicians in assessing treatment effectiveness.\n","authors":["Santiago Cepeda","Roberto Romero","Daniel Garcia-Perez","Guillermo Blasco","Luigi Tommaso Luppino","Samuel Kuttner","Ignacio Arrese","Ole Solheim","Live Eikenes","Anna Karlberg","Angel Perez-Nunez","Trinidad Escudero","Roberto Hornero","Rosario Sarabia"],"pdf_url":"https://arxiv.org/pdf/2404.11725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14176v2","updated":"2024-04-17T20:18:45Z","published":"2023-03-24T17:38:45Z","title":"A Hybrid ANN-SNN Architecture for Low-Power and Low-Latency Visual\n Perception","summary":" Spiking Neural Networks (SNN) are a class of bio-inspired neural networks\nthat promise to bring low-power and low-latency inference to edge devices\nthrough asynchronous and sparse processing. However, being temporal models,\nSNNs depend heavily on expressive states to generate predictions on par with\nclassical artificial neural networks (ANNs). These states converge only after\nlong transient periods, and quickly decay without input data, leading to higher\nlatency, power consumption, and lower accuracy. This work addresses this issue\nby initializing the state with an auxiliary ANN running at a low rate. The SNN\nthen uses the state to generate predictions with high temporal resolution until\nthe next initialization phase. 
Our hybrid ANN-SNN model thus combines the best\nof both worlds: It does not suffer from long state transients and state decay\nthanks to the ANN, and can generate predictions with high temporal resolution,\nlow latency, and low power thanks to the SNN. We show for the task of\nevent-based 2D and 3D human pose estimation that our method consumes 88% less\npower with only a 4% decrease in performance compared to its fully ANN\ncounterparts when run at the same inference rate. Moreover, when compared to\nSNNs, our method achieves a 74% lower error. This research thus provides a new\nunderstanding of how ANNs and SNNs can be used to maximize their respective\nbenefits.\n","authors":["Asude Aydin","Mathias Gehrig","Daniel Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2303.14176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08888v2","updated":"2024-04-17T19:32:47Z","published":"2023-12-13T13:11:44Z","title":"Read Between the Layers: Leveraging Intra-Layer Representations for\n Rehearsal-Free Continual Learning with Pre-Trained Models","summary":" We address the Continual Learning (CL) problem, wherein a model must learn a\nsequence of tasks from non-stationary distributions while preserving prior\nknowledge upon encountering new experiences. With the advancement of foundation\nmodels, CL research has pivoted from the initial learning-from-scratch paradigm\ntowards utilizing generic features from large-scale pre-training. However,\nexisting approaches to CL with pre-trained models primarily focus on separating\nclass-specific features from the final representation layer and neglect the\npotential of intermediate representations to capture low- and mid-level\nfeatures, which are more invariant to domain shifts. In this work, we propose\nLayUP, a new prototype-based approach to continual learning that leverages\nsecond-order feature statistics from multiple intermediate layers of a\npre-trained network. Our method is conceptually simple, does not require access\nto prior data, and works out of the box with any foundation model. LayUP\nsurpasses the state of the art in four of the seven class-incremental learning\nbenchmarks, all three domain-incremental learning benchmarks and in six of the\nseven online continual learning benchmarks, while significantly reducing memory\nand computational requirements compared to existing baselines. Our results\ndemonstrate that fully exhausting the representational capacities of\npre-trained models in CL goes well beyond their final embeddings.\n","authors":["Kyra Ahrens","Hans Hergen Lehmann","Jae Hee Lee","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2312.08888v2.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2402.17177v3","updated":"2024-04-17T18:41:39Z","published":"2024-02-27T03:30:58Z","title":"Sora: A Review on Background, Technology, Limitations, and Opportunities\n of Large Vision Models","summary":" Sora is a text-to-video generative AI model, released by OpenAI in February\n2024. The model is trained to generate videos of realistic or imaginative\nscenes from text instructions and show potential in simulating the physical\nworld. Based on public technical reports and reverse engineering, this paper\npresents a comprehensive review of the model's background, related\ntechnologies, applications, remaining challenges, and future directions of\ntext-to-video AI models. We first trace Sora's development and investigate the\nunderlying technologies used to build this \"world simulator\". 
Then, we describe\nin detail the applications and potential impact of Sora in multiple industries\nranging from film-making and education to marketing. We discuss the main\nchallenges and limitations that need to be addressed to widely deploy Sora,\nsuch as ensuring safe and unbiased video generation. Lastly, we discuss the\nfuture development of Sora and video generation models in general, and how\nadvancements in the field could enable new ways of human-AI interaction,\nboosting productivity and creativity of video generation.\n","authors":["Yixin Liu","Kai Zhang","Yuan Li","Zhiling Yan","Chujie Gao","Ruoxi Chen","Zhengqing Yuan","Yue Huang","Hanchi Sun","Jianfeng Gao","Lifang He","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.17177v3.pdf","comment":"37 pages, 18 figures; GitHub:\n https://github.com/lichao-sun/SoraReview"},{"id":"http://arxiv.org/abs/2404.11683v1","updated":"2024-04-17T18:29:32Z","published":"2024-04-17T18:29:32Z","title":"Unifying Scene Representation and Hand-Eye Calibration with 3D\n Foundation Models","summary":" Representing the environment is a central challenge in robotics, and is\nessential for effective decision-making. Traditionally, before capturing images\nwith a manipulator-mounted camera, users need to calibrate the camera using a\nspecific external marker, such as a checkerboard or AprilTag. However, recent\nadvances in computer vision have led to the development of \\emph{3D foundation\nmodels}. These are large, pre-trained neural networks that can establish fast\nand accurate multi-view correspondences with very few images, even in the\nabsence of rich visual features. This paper advocates for the integration of 3D\nfoundation models into scene representation approaches for robotic systems\nequipped with manipulator-mounted RGB cameras. Specifically, we propose the\nJoint Calibration and Representation (JCR) method. JCR uses RGB images,\ncaptured by a manipulator-mounted camera, to simultaneously construct an\nenvironmental representation and calibrate the camera relative to the robot's\nend-effector, in the absence of specific calibration markers. The resulting 3D\nenvironment representation is aligned with the robot's coordinate frame and\nmaintains physically accurate scales. We demonstrate that JCR can build\neffective scene representations using a low-cost RGB camera attached to a\nmanipulator, without prior calibration.\n","authors":["Weiming Zhi","Haozhan Tang","Tianyi Zhang","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2404.11683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11669v1","updated":"2024-04-17T18:08:00Z","published":"2024-04-17T18:08:00Z","title":"Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis","summary":" Designing a 3D representation of a dynamic scene for fast optimization and\nrendering is a challenging task. While recent explicit representations enable\nfast learning and rendering of dynamic radiance fields, they require a dense\nset of input viewpoints. In this work, we focus on learning a fast\nrepresentation for dynamic radiance fields with sparse input viewpoints.\nHowever, the optimization with sparse input is under-constrained and\nnecessitates the use of motion priors to constrain the learning. Existing fast\ndynamic scene models do not explicitly model the motion, making them difficult\nto be constrained with motion priors. 
We design an explicit motion model as a\nfactorized 4D representation that is fast and can exploit the spatio-temporal\ncorrelation of the motion field. We then introduce reliable flow priors\nincluding a combination of sparse flow priors across cameras and dense flow\npriors within cameras to regularize our motion model. Our model is fast,\ncompact and achieves very good performance on popular multi-view dynamic scene\ndatasets with sparse input viewpoints. The source code for our model can be\nfound on our project page:\nhttps://nagabhushansn95.github.io/publications/2024/RF-DeRF.html.\n","authors":["Nagabhushan Somraj","Kapil Choudhary","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.11669v1.pdf","comment":"Accepted at SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.11667v1","updated":"2024-04-17T18:04:37Z","published":"2024-04-17T18:04:37Z","title":"Deep Dependency Networks and Advanced Inference Schemes for Multi-Label\n Classification","summary":" We present a unified framework called deep dependency networks (DDNs) that\ncombines dependency networks and deep learning architectures for multi-label\nclassification, with a particular emphasis on image and video data. The primary\nadvantage of dependency networks is their ease of training, in contrast to\nother probabilistic graphical models like Markov networks. In particular, when\ncombined with deep learning architectures, they provide an intuitive,\neasy-to-use loss function for multi-label classification. A drawback of DDNs\ncompared to Markov networks is their lack of advanced inference schemes,\nnecessitating the use of Gibbs sampling. To address this challenge, we propose\nnovel inference schemes based on local search and integer linear programming\nfor computing the most likely assignment to the labels given observations. We\nevaluate our novel methods on three video datasets (Charades, TACoS, Wetlab)\nand three image datasets (MS-COCO, PASCAL VOC, NUS-WIDE), comparing their\nperformance with (a) basic neural architectures and (b) neural architectures\ncombined with Markov networks equipped with advanced inference and learning\ntechniques. Our results demonstrate the superiority of our new DDN methods over\nthe two competing approaches.\n","authors":["Shivvrat Arya","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2404.11667v1.pdf","comment":"Will appear in AISTATS 2024. arXiv admin note: substantial text\n overlap with arXiv:2302.00633"},{"id":"http://arxiv.org/abs/2404.12163v1","updated":"2024-04-17T17:38:54Z","published":"2024-04-17T17:38:54Z","title":"Unsupervised Microscopy Video Denoising","summary":" In this paper, we introduce a novel unsupervised network to denoise\nmicroscopy videos featured by image sequences captured by a fixed location\nmicroscopy camera. Specifically, we propose a DeepTemporal Interpolation\nmethod, leveraging a temporal signal filter integrated into the bottom CNN\nlayers, to restore microscopy videos corrupted by unknown noise types. Our\nunsupervised denoising architecture is distinguished by its ability to adapt to\nmultiple noise conditions without the need for pre-existing noise distribution\nknowledge, addressing a significant challenge in real-world medical\napplications. Furthermore, we evaluate our denoising framework using both real\nmicroscopy recordings and simulated data, validating our outperforming video\ndenoising performance across a broad spectrum of noise scenarios. 
Extensive\nexperiments demonstrate that our unsupervised model consistently outperforms\nstate-of-the-art supervised and unsupervised video denoising techniques,\nproving especially effective for microscopy videos.\n","authors":["Mary Aiyetigbo","Alexander Korte","Ethan Anderson","Reda Chalhoub","Peter Kalivas","Feng Luo","Nianyi Li"],"pdf_url":"https://arxiv.org/pdf/2404.12163v1.pdf","comment":"Accepted at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.12142v1","updated":"2024-04-17T16:50:14Z","published":"2024-04-17T16:50:14Z","title":"SDIP: Self-Reinforcement Deep Image Prior Framework for Image Processing","summary":" Deep image prior (DIP) proposed in recent research has revealed the inherent\ntrait of convolutional neural networks (CNN) for capturing substantial\nlow-level image statistics priors. This framework efficiently addresses the\ninverse problems in image processing and has induced extensive applications in\nvarious domains. However, as the whole algorithm is initialized randomly, the\nDIP algorithm often lacks stability. Thus, this method still has space for\nfurther improvement. In this paper, we propose the self-reinforcement deep\nimage prior (SDIP) as an improved version of the original DIP. We observed that\nthe changes in the DIP networks' input and output are highly correlated during\neach iteration. SDIP efficiently utilizes this trait in a reinforcement\nlearning manner, where the current iteration's output is utilized by a steering\nalgorithm to update the network input for the next iteration, guiding the\nalgorithm toward improved results. Experimental results across multiple\napplications demonstrate that our proposed SDIP framework offers improvement\ncompared to the original DIP method and other state-of-the-art methods.\n","authors":["Ziyu Shu","Zhixin Pan"],"pdf_url":"https://arxiv.org/pdf/2404.12142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12144v1","updated":"2024-04-17T12:37:25Z","published":"2024-04-17T12:37:25Z","title":"Mushroom Segmentation and 3D Pose Estimation from Point Clouds using\n Fully Convolutional Geometric Features and Implicit Pose Encoding","summary":" Modern agricultural applications rely more and more on deep learning\nsolutions. However, training well-performing deep networks requires a large\namount of annotated data that may not be available and in the case of 3D\nannotation may not even be feasible for human annotators. In this work, we\ndevelop a deep learning approach to segment mushrooms and estimate their pose\non 3D data, in the form of point clouds acquired by depth sensors. To\ncircumvent the annotation problem, we create a synthetic dataset of mushroom\nscenes, where we are fully aware of 3D information, such as the pose of each\nmushroom. The proposed network has a fully convolutional backbone, that parses\nsparse 3D data, and predicts pose information that implicitly defines both\ninstance segmentation and pose estimation task. We have validated the\neffectiveness of the proposed implicit-based approach for a synthetic test set,\nas well as provided qualitative results for a small set of real acquired point\nclouds with depth sensors. 
Code is publicly available at\nhttps://github.com/georgeretsi/mushroom-pose.\n","authors":["George Retsinas","Niki Efthymiou","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2404.12144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12415v1","updated":"2024-04-17T17:57:20Z","published":"2024-04-17T17:57:20Z","title":"Soil Fertility Prediction Using Combined USB-microscope Based Soil\n Image, Auxiliary Variables, and Portable X-Ray Fluorescence Spectrometry","summary":" This study explored the application of portable X-ray fluorescence (PXRF)\nspectrometry and soil image analysis to rapidly assess soil fertility, focusing\non critical parameters such as available B, organic carbon (OC), available Mn,\navailable S, and the sulfur availability index (SAI). Analyzing 1,133 soil\nsamples from various agro-climatic zones in Eastern India, the research\ncombined color and texture features from microscopic soil images, PXRF data,\nand auxiliary soil variables (AVs) using a Random Forest model. Results\nindicated that integrating image features (IFs) with auxiliary variables (AVs)\nsignificantly enhanced prediction accuracy for available B (R^2 = 0.80) and OC\n(R^2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF data,\nfurther improved predictions for available Mn and SAI with R^2 values of 0.72\nand 0.70, respectively. The study demonstrated how these integrated\ntechnologies have the potential to provide quick and affordable options for\nsoil testing, opening up access to more sophisticated prediction models and a\nbetter comprehension of the fertility and health of the soil. Future research\nshould focus on the application of deep learning models on a larger dataset of\nsoil images, developed using soils from a broader range of agro-climatic zones\nunder field condition.\n","authors":["Shubhadip Dasgupta","Satwik Pate","Divya Rathore","L. G. Divyanth","Ayan Das","Anshuman Nayak","Subhadip Dey","Asim Biswas","David C. Weindorf","Bin Li","Sergio Henrique Godinho Silva","Bruno Teixeira Ribeiro","Sanjay Srivastava","Somsubhra Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2404.12415v1.pdf","comment":"37 pages, 10 figures; manuscript under peer-review for publication in\n the jounral 'Computers and Electronics in Agriculture'"},{"id":"http://arxiv.org/abs/2404.11565v1","updated":"2024-04-17T17:08:05Z","published":"2024-04-17T17:08:05Z","title":"MoA: Mixture-of-Attention for Subject-Context Disentanglement in\n Personalized Image Generation","summary":" We introduce a new architecture for personalization of text-to-image\ndiffusion models, coined Mixture-of-Attention (MoA). Inspired by the\nMixture-of-Experts mechanism utilized in large language models (LLMs), MoA\ndistributes the generation workload between two attention pathways: a\npersonalized branch and a non-personalized prior branch. MoA is designed to\nretain the original model's prior by fixing its attention layers in the prior\nbranch, while minimally intervening in the generation process with the\npersonalized branch that learns to embed subjects in the layout and context\ngenerated by the prior branch. A novel routing mechanism manages the\ndistribution of pixels in each layer across these branches to optimize the\nblend of personalized and generic content creation. Once trained, MoA\nfacilitates the creation of high-quality, personalized images featuring\nmultiple subjects with compositions and interactions as diverse as those\ngenerated by the original model. 
Crucially, MoA enhances the distinction\nbetween the model's pre-existing capability and the newly augmented\npersonalized intervention, thereby offering a more disentangled subject-context\ncontrol that was previously unattainable. Project page:\nhttps://snap-research.github.io/mixture-of-attention\n","authors":["Kuan-Chieh Wang","Daniil Ostashev","Yuwei Fang","Sergey Tulyakov","Kfir Aberman"],"pdf_url":"https://arxiv.org/pdf/2404.11565v1.pdf","comment":"Project Website: https://snap-research.github.io/mixture-of-attention"}]},"2024-04-18T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.12391v1","updated":"2024-04-18T17:59:58Z","published":"2024-04-18T17:59:58Z","title":"On the Content Bias in Fréchet Video Distance","summary":" Fr\\'echet Video Distance (FVD), a prominent metric for evaluating video\ngeneration models, is known to conflict with human perception occasionally. In\nthis paper, we aim to explore the extent of FVD's bias toward per-frame quality\nover temporal realism and identify its sources. We first quantify the FVD's\nsensitivity to the temporal axis by decoupling the frame and motion quality and\nfind that the FVD increases only slightly with large temporal corruption. We\nthen analyze the generated videos and show that via careful sampling from a\nlarge set of generated videos that do not contain motions, one can drastically\ndecrease FVD without improving the temporal quality. Both studies suggest FVD's\nbias towards the quality of individual frames. We further observe that the bias\ncan be attributed to the features extracted from a supervised video classifier\ntrained on the content-biased dataset. We show that FVD with features extracted\nfrom the recent large-scale self-supervised video models is less biased toward\nimage quality. Finally, we revisit a few real-world examples to validate our\nhypothesis.\n","authors":["Songwei Ge","Aniruddha Mahapatra","Gaurav Parmar","Jun-Yan Zhu","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12391v1.pdf","comment":"CVPR 2024. Project webpage: https://content-debiased-fvd.github.io/"},{"id":"http://arxiv.org/abs/2404.01300v2","updated":"2024-04-18T17:59:57Z","published":"2024-04-01T17:59:55Z","title":"NeRF-MAE: Masked AutoEncoders for Self-Supervised 3D Representation\n Learning for Neural Radiance Fields","summary":" Neural fields excel in computer vision and robotics due to their ability to\nunderstand the 3D visual world such as inferring semantics, geometry, and\ndynamics. Given the capabilities of neural fields in densely representing a 3D\nscene from 2D images, we ask the question: Can we scale their self-supervised\npretraining, specifically using masked autoencoders, to generate effective 3D\nrepresentations from posed RGB images. Owing to the astounding success of\nextending transformers to novel data modalities, we employ standard 3D Vision\nTransformers to suit the unique formulation of NeRFs. We leverage NeRF's\nvolumetric grid as a dense input to the transformer, contrasting it with other\n3D representations such as pointclouds where the information density can be\nuneven, and the representation is irregular. Due to the difficulty of applying\nmasked autoencoders to an implicit representation, such as NeRF, we opt for\nextracting an explicit representation that canonicalizes scenes across domains\nby employing the camera trajectory for sampling. 
Our goal is made possible by\nmasking random patches from NeRF's radiance and density grid and employing a\nstandard 3D Swin Transformer to reconstruct the masked patches. In doing so,\nthe model can learn the semantic and spatial structure of complete scenes. We\npretrain this representation at scale on our proposed curated posed-RGB data,\ntotaling over 1.6 million images. Once pretrained, the encoder is used for\neffective 3D transfer learning. Our novel self-supervised pretraining for\nNeRFs, NeRF-MAE, scales remarkably well and improves performance on various\nchallenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining,\nNeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF\nscene understanding baselines on Front3D and ScanNet datasets with an absolute\nperformance improvement of over 20% AP50 and 8% AP25 for 3D object detection.\n","authors":["Muhammad Zubair Irshad","Sergey Zakahrov","Vitor Guizilini","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2404.01300v2.pdf","comment":"29 pages, 13 figures. Project Page: https://nerf-mae.github.io/"},{"id":"http://arxiv.org/abs/2404.12390v1","updated":"2024-04-18T17:59:54Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v1.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/"},{"id":"http://arxiv.org/abs/2404.12388v1","updated":"2024-04-18T17:59:53Z","published":"2024-04-18T17:59:53Z","title":"VideoGigaGAN: Towards Detail-rich Video Super-Resolution","summary":" Video super-resolution (VSR) approaches have shown impressive temporal\nconsistency in upsampled videos. However, these approaches tend to generate\nblurrier results than their image counterparts as they are limited in their\ngenerative capability. This raises a fundamental question: can we extend the\nsuccess of a generative image upsampler to the VSR task while preserving the\ntemporal consistency? 
We introduce VideoGigaGAN, a new generative VSR model\nthat can produce videos with high-frequency details and temporal consistency.\nVideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply\ninflating GigaGAN to a video model by adding temporal modules produces severe\ntemporal flickering. We identify several key issues and propose techniques that\nsignificantly improve the temporal consistency of upsampled videos. Our\nexperiments show that, unlike previous VSR methods, VideoGigaGAN generates\ntemporally consistent videos with more fine-grained appearance details. We\nvalidate the effectiveness of VideoGigaGAN by comparing it with\nstate-of-the-art VSR models on public datasets and showcasing video results\nwith $8\\times$ super-resolution.\n","authors":["Yiran Xu","Taesung Park","Richard Zhang","Yang Zhou","Eli Shechtman","Feng Liu","Jia-Bin Huang","Difan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12388v1.pdf","comment":"project page: https://videogigagan.github.io/"},{"id":"http://arxiv.org/abs/2404.12389v1","updated":"2024-04-18T17:59:53Z","published":"2024-04-18T17:59:53Z","title":"Moving Object Segmentation: All You Need Is SAM (and Flow)","summary":" The objective of this paper is motion segmentation -- discovering and\nsegmenting the moving objects in a video. This is a much studied area with\nnumerous careful,and sometimes complex, approaches and training schemes\nincluding: self-supervised learning, learning from synthetic datasets,\nobject-centric representations, amodal representations, and many more. Our\ninterest in this paper is to determine if the Segment Anything model (SAM) can\ncontribute to this task. We investigate two models for combining SAM with\noptical flow that harness the segmentation power of SAM with the ability of\nflow to discover and group moving objects. In the first model, we adapt SAM to\ntake optical flow, rather than RGB, as an input. In the second, SAM takes RGB\nas an input, and flow is used as a segmentation prompt. These surprisingly\nsimple methods, without any further modifications, outperform all previous\napproaches by a considerable margin in both single and multi-object benchmarks.\nWe also extend these frame-level segmentations to sequence-level segmentations\nthat maintain object identity. Again, this simple model outperforms previous\nmethods on multiple video object segmentation benchmarks.\n","authors":["Junyu Xie","Charig Yang","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.12389v1.pdf","comment":"Project Page: https://www.robots.ox.ac.uk/~vgg/research/flowsam/"},{"id":"http://arxiv.org/abs/2404.12387v1","updated":"2024-04-18T17:59:48Z","published":"2024-04-18T17:59:48Z","title":"Reka Core, Flash, and Edge: A Series of Powerful Multimodal Language\n Models","summary":" We introduce Reka Core, Flash, and Edge, a series of powerful multimodal\nlanguage models trained from scratch by Reka. Reka models are able to process\nand reason with text, images, video, and audio inputs. This technical report\ndiscusses details of training some of these models and provides comprehensive\nevaluation results. We show that Reka Edge and Reka Flash are not only\nstate-of-the-art but also outperform many much larger models, delivering\noutsized values for their respective compute class. Meanwhile, our most capable\nand largest model, Reka Core, approaches the best frontier models on both\nautomatic evaluations and blind human evaluations. On image question answering\nbenchmarks (e.g. 
MMMU, VQAv2), Core performs competitively to GPT4-V.\nMeanwhile, on multimodal chat, Core ranks as the second most preferred model\nunder a blind third-party human evaluation setup, outperforming other models\nsuch as Claude 3 Opus. On text benchmarks, Core not only performs competitively\nto other frontier models on a set of well-established benchmarks (e.g. MMLU,\nGSM8K) but also outperforms GPT4-0613 on human evaluation. On video question\nanswering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped\nin production at http://chat.reka.ai . A showcase of non cherry picked\nqualitative examples can also be found at http://showcase.reka.ai .\n","authors":["Aitor Ormazabal","Che Zheng","Cyprien de Masson d'Autume","Dani Yogatama","Deyu Fu","Donovan Ong","Eric Chen","Eugenie Lamprecht","Hai Pham","Isaac Ong","Kaloyan Aleksiev","Lei Li","Matthew Henderson","Max Bain","Mikel Artetxe","Nishant Relan","Piotr Padlewski","Qi Liu","Ren Chen","Samuel Phua","Yazheng Yang","Yi Tay","Yuqi Wang","Zhongkai Zhu","Zhihui Xie"],"pdf_url":"https://arxiv.org/pdf/2404.12387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12386v1","updated":"2024-04-18T17:59:46Z","published":"2024-04-18T17:59:46Z","title":"SOHES: Self-supervised Open-world Hierarchical Entity Segmentation","summary":" Open-world entity segmentation, as an emerging computer vision task, aims at\nsegmenting entities in images without being restricted by pre-defined classes,\noffering impressive generalization capabilities on unseen images and concepts.\nDespite its promise, existing entity segmentation methods like Segment Anything\nModel (SAM) rely heavily on costly expert annotators. This work presents\nSelf-supervised Open-world Hierarchical Entity Segmentation (SOHES), a novel\napproach that eliminates the need for human annotations. SOHES operates in\nthree phases: self-exploration, self-instruction, and self-correction. Given a\npre-trained self-supervised representation, we produce abundant high-quality\npseudo-labels through visual feature clustering. Then, we train a segmentation\nmodel on the pseudo-labels, and rectify the noises in pseudo-labels via a\nteacher-student mutual-learning procedure. Beyond segmenting entities, SOHES\nalso captures their constituent parts, providing a hierarchical understanding\nof visual entities. Using raw images as the sole training data, our method\nachieves unprecedented performance in self-supervised open-world segmentation,\nmarking a significant milestone towards high-quality open-world entity\nsegmentation in the absence of human-annotated masks. Project page:\nhttps://SOHES.github.io.\n","authors":["Shengcao Cao","Jiuxiang Gu","Jason Kuen","Hao Tan","Ruiyi Zhang","Handong Zhao","Ani Nenkova","Liang-Yan Gui","Tong Sun","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12386v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.12385v1","updated":"2024-04-18T17:59:41Z","published":"2024-04-18T17:59:41Z","title":"MeshLRM: Large Reconstruction Model for High-Quality Mesh","summary":" We propose MeshLRM, a novel LRM-based approach that can reconstruct a\nhigh-quality mesh from merely four input images in less than one second.\nDifferent from previous large reconstruction models (LRMs) that focus on\nNeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction\nand rendering within the LRM framework. 
This allows for end-to-end mesh\nreconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering.\nMoreover, we improve the LRM architecture by simplifying several complex\ndesigns in previous LRMs. MeshLRM's NeRF initialization is sequentially trained\nwith low- and high-resolution images; this new LRM training strategy enables\nsignificantly faster convergence and thereby leads to better quality with less\ncompute. Our approach achieves state-of-the-art mesh reconstruction from\nsparse-view inputs and also allows for many downstream applications, including\ntext-to-3D and single-image-to-3D generation. Project page:\nhttps://sarahweiii.github.io/meshlrm/\n","authors":["Xinyue Wei","Kai Zhang","Sai Bi","Hao Tan","Fujun Luan","Valentin Deschaintre","Kalyan Sunkavalli","Hao Su","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.12385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12383v1","updated":"2024-04-18T17:59:28Z","published":"2024-04-18T17:59:28Z","title":"G-HOP: Generative Hand-Object Prior for Interaction Reconstruction and\n Grasp Synthesis","summary":" We propose G-HOP, a denoising diffusion based generative prior for\nhand-object interactions that allows modeling both the 3D object and a human\nhand, conditioned on the object category. To learn a 3D spatial diffusion model\nthat can capture this joint distribution, we represent the human hand via a\nskeletal distance field to obtain a representation aligned with the (latent)\nsigned distance field for the object. We show that this hand-object prior can\nthen serve as generic guidance to facilitate other tasks like reconstruction\nfrom interaction clip and human grasp synthesis. We believe that our model,\ntrained by aggregating seven diverse real-world interaction datasets spanning\nacross 155 categories, represents a first approach that allows jointly\ngenerating both hand and object. Our empirical evaluations demonstrate the\nbenefit of this joint prior in video-based reconstruction and human grasp\nsynthesis, outperforming current task-specific baselines.\n Project website: https://judyye.github.io/ghop-www\n","authors":["Yufei Ye","Abhinav Gupta","Kris Kitani","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2404.12383v1.pdf","comment":"accepted to CVPR2024; project page at\n https://judyye.github.io/ghop-www"},{"id":"http://arxiv.org/abs/2404.12382v1","updated":"2024-04-18T17:59:27Z","published":"2024-04-18T17:59:27Z","title":"Lazy Diffusion Transformer for Interactive Image Editing","summary":" We introduce a novel diffusion transformer, LazyDiffusion, that generates\npartial image updates efficiently. Our approach targets interactive image\nediting applications in which, starting from a blank canvas or an image, a user\nspecifies a sequence of localized image modifications using binary masks and\ntext prompts. Our generator operates in two phases. First, a context encoder\nprocesses the current canvas and user mask to produce a compact global context\ntailored to the region to generate. Second, conditioned on this context, a\ndiffusion-based transformer decoder synthesizes the masked pixels in a \"lazy\"\nfashion, i.e., it only generates the masked region. This contrasts with\nprevious works that either regenerate the full canvas, wasting time and\ncomputation, or confine processing to a tight rectangular crop around the mask,\nignoring the global image context altogether. Our decoder's runtime scales with\nthe mask size, which is typically small, while our encoder introduces\nnegligible overhead. 
We demonstrate that our approach is competitive with\nstate-of-the-art inpainting methods in terms of quality and fidelity while\nproviding a 10x speedup for typical user interactions, where the editing mask\nrepresents 10% of the image.\n","authors":["Yotam Nitzan","Zongze Wu","Richard Zhang","Eli Shechtman","Daniel Cohen-Or","Taesung Park","Michaël Gharbi"],"pdf_url":"https://arxiv.org/pdf/2404.12382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12378v1","updated":"2024-04-18T17:58:16Z","published":"2024-04-18T17:58:16Z","title":"6Img-to-3D: Few-Image Large-Scale Outdoor Driving Scene Reconstruction","summary":" Current 3D reconstruction techniques struggle to infer unbounded scenes from\na few images faithfully. Specifically, existing methods have high computational\ndemands, require detailed pose information, and cannot reconstruct occluded\nregions reliably. We introduce 6Img-to-3D, an efficient, scalable\ntransformer-based encoder-renderer method for single-shot image to 3D\nreconstruction. Our method outputs a 3D-consistent parameterized triplane from\nonly six outward-facing input images for large-scale, unbounded outdoor driving\nscenarios. We take a step towards resolving existing shortcomings by combining\ncontracted custom cross- and self-attention mechanisms for triplane\nparameterization, differentiable volume rendering, scene contraction, and image\nfeature projection. We showcase that six surround-view vehicle images from a\nsingle timestamp without global pose information are enough to reconstruct\n360$^{\\circ}$ scenes during inference time, taking 395 ms. Our method allows,\nfor example, rendering third-person images and birds-eye views. Our code is\navailable at https://github.com/continental/6Img-to-3D, and more examples can\nbe found at our website here https://6Img-to-3D.GitHub.io/.\n","authors":["Théo Gieruc","Marius Kästingschäfer","Sebastian Bernhard","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2404.12378v1.pdf","comment":"Joint first authorship. Project page: https://6Img-to-3D.GitHub.io/\n Code https://github.com/continental/6Img-to-3D"},{"id":"http://arxiv.org/abs/2404.12379v1","updated":"2024-04-18T17:58:16Z","published":"2024-04-18T17:58:16Z","title":"Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular\n Videos","summary":" Modern 3D engines and graphics pipelines require mesh as a memory-efficient\nrepresentation, which allows efficient rendering, geometry processing, texture\nediting, and many other downstream operations. However, it is still highly\ndifficult to obtain high-quality mesh in terms of structure and detail from\nmonocular visual observations. The problem becomes even more challenging for\ndynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh\n(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh\ngiven a single monocular video. Our work leverages the recent advancement in 3D\nGaussian Splatting to construct the mesh sequence with temporal consistency\nfrom a video. Building on top of this representation, DG-Mesh recovers\nhigh-quality meshes from the Gaussian points and can track the mesh vertices\nover time, which enables applications such as texture editing on dynamic\nobjects. We introduce the Gaussian-Mesh Anchoring, which encourages evenly\ndistributed Gaussians, resulting in better mesh reconstruction through mesh-guided\ndensification and pruning on the deformed Gaussians. 
By applying\ncycle-consistent deformation between the canonical and the deformed space, we\ncan project the anchored Gaussian back to the canonical space and optimize\nGaussians across all time frames. During the evaluation on different datasets,\nDG-Mesh provides significantly better mesh reconstruction and rendering than\nbaselines.\n","authors":["Isabella Liu","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12379v1.pdf","comment":"Project page: https://www.liuisabella.com/DG-Mesh/"},{"id":"http://arxiv.org/abs/2404.12372v1","updated":"2024-04-18T17:53:19Z","published":"2024-04-18T17:53:19Z","title":"MedThink: Explaining Medical Visual Question Answering via Multimodal\n Decision-Making Rationale","summary":" Medical Visual Question Answering (MedVQA), which offers language responses\nto image-based medical inquiries, represents a challenging task and significant\nadvancement in healthcare. It assists medical experts to swiftly interpret\nmedical images, thereby enabling faster and more accurate diagnoses. However,\nthe model interpretability and transparency of existing MedVQA solutions are\noften limited, posing challenges in understanding their decision-making\nprocesses. To address this issue, we devise a semi-automated annotation process\nto streamline data preparation and build new benchmark MedVQA datasets R-RAD\nand R-SLAKE. The R-RAD and R-SLAKE datasets provide intermediate medical\ndecision-making rationales generated by multimodal large language models and\nhuman annotations for question-answering pairs in existing MedVQA datasets,\ni.e., VQA-RAD and SLAKE. Moreover, we design a novel framework which finetunes\nlightweight pretrained generative models by incorporating medical\ndecision-making rationales into the training process. The framework includes\nthree distinct strategies to generate decision outcomes and corresponding\nrationales, thereby clearly showcasing the medical decision-making process\nduring reasoning. Extensive experiments demonstrate that our method can achieve\nan accuracy of 83.5% on R-RAD and 86.3% on R-SLAKE, significantly outperforming\nexisting state-of-the-art baselines. Dataset and code will be released.\n","authors":["Xiaotang Gai","Chenyi Zhou","Jiaxiang Liu","Yang Feng","Jian Wu","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12368v1","updated":"2024-04-18T17:50:23Z","published":"2024-04-18T17:50:23Z","title":"Gradient-Regularized Out-of-Distribution Detection","summary":" One of the challenges for neural networks in real-life applications is the\noverconfident errors these models make when the data is not from the original\ntraining distribution.\n Addressing this issue is known as Out-of-Distribution (OOD) detection.\n Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate\nfor OOD data during training to achieve improved performance.\n However, these methods fail to fully exploit the local information embedded\nin the auxiliary dataset.\n In this work, we propose the idea of leveraging the information embedded in\nthe gradient of the loss function during training to enable the network to not\nonly learn a desired OOD score for each sample but also to exhibit similar\nbehavior in a local neighborhood around each sample.\n We also develop a novel energy-based sampling method to allow the network to\nbe exposed to more informative OOD samples during the training phase. This is\nespecially important when the auxiliary dataset is large. 
We demonstrate the\neffectiveness of our method through extensive experiments on several OOD\nbenchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet\nexperiment.\n We further provide a theoretical analysis through the lens of certified\nrobustness and Lipschitz analysis to showcase the theoretical foundation of our\nwork. We will publicly release our code after the review process.\n","authors":["Sina Sharifi","Taha Entesari","Bardia Safaei","Vishal M. Patel","Mahyar Fazlyab"],"pdf_url":"https://arxiv.org/pdf/2404.12368v1.pdf","comment":"Under review for the 18th European Conference on Computer Vision\n (ECCV) 2024"},{"id":"http://arxiv.org/abs/2404.12359v1","updated":"2024-04-18T17:37:53Z","published":"2024-04-18T17:37:53Z","title":"Inverse Neural Rendering for Explainable Multi-Object Tracking","summary":" Today, most methods for image understanding tasks rely on feed-forward neural\nnetworks. While this approach has allowed for empirical accuracy, efficiency,\nand task adaptation via fine-tuning, it also comes with fundamental\ndisadvantages. Existing networks often struggle to generalize across different\ndatasets, even on the same task. By design, these networks ultimately reason\nabout high-dimensional scene features, which are challenging to analyze. This\nis true especially when attempting to predict 3D information based on 2D\nimages. We propose to recast 3D multi-object tracking from RGB cameras as an\n\\emph{Inverse Rendering (IR)} problem, by optimizing via a differentiable\nrendering pipeline over the latent space of pre-trained 3D object\nrepresentations and retrieve the latents that best represent object instances\nin a given input image. To this end, we optimize an image loss over generative\nlatent spaces that inherently disentangle shape and appearance properties. We\ninvestigate not only an alternate take on tracking but our method also enables\nexamining the generated objects, reasoning about failure situations, and\nresolving ambiguous cases. We validate the generalization and scaling\ncapabilities of our method by learning the generative prior exclusively from\nsynthetic data and assessing camera-based 3D tracking on the nuScenes and Waymo\ndatasets. Both these datasets are completely unseen to our method and do not\nrequire fine-tuning. Videos and code are available at\nhttps://light.princeton.edu/inverse-rendering-tracking/.\n","authors":["Julian Ost","Tanushree Banerjee","Mario Bijelic","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2404.12359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12353v1","updated":"2024-04-18T17:32:46Z","published":"2024-04-18T17:32:46Z","title":"V2Xum-LLM: Cross-Modal Video Summarization with Temporal Prompt\n Instruction Tuning","summary":" Video summarization aims to create short, accurate, and cohesive summaries of\nlonger videos. Despite the existence of various video summarization datasets, a\nnotable limitation is their limited amount of source videos, which hampers the\neffective fine-tuning of advanced large vision-language models (VLMs).\nAdditionally, most existing datasets are created for video-to-video\nsummarization, overlooking the contemporary need for multimodal video content\nsummarization. Recent efforts have been made to expand from unimodal to\nmultimodal video summarization, categorizing the task into three sub-tasks\nbased on the summary's modality: video-to-video (V2V), video-to-text (V2T), and\na combination of video and text summarization (V2VT). 
However, the textual\nsummaries in previous multimodal datasets are inadequate. To address these\nissues, we introduce Instruct-V2Xum, a cross-modal video summarization dataset\nfeaturing 30,000 diverse videos sourced from YouTube, with lengths ranging from\n40 to 940 seconds and an average summarization ratio of 16.39\\%. Each video\nsummary in Instruct-V2Xum is paired with a textual summary that references\nspecific frame indexes, facilitating the generation of aligned video and\ntextual summaries. In addition, we propose a new video summarization framework\nnamed V2Xum-LLM. V2Xum-LLM, specifically V2Xum-LLaMA in this study, is the\nfirst framework that unifies different video summarization tasks into one large\nlanguage model's (LLM) text decoder and achieves task-controllable video\nsummarization with temporal prompts and task instructions. Experiments show\nthat V2Xum-LLaMA outperforms strong baseline models on multiple video\nsummarization tasks. Furthermore, we propose an enhanced evaluation metric for\nV2V and V2VT summarization tasks.\n","authors":["Hang Hua","Yunlong Tang","Chenliang Xu","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12352v1","updated":"2024-04-18T17:32:32Z","published":"2024-04-18T17:32:32Z","title":"Point-In-Context: Understanding Point Cloud via In-Context Learning","summary":" With the emergence of large-scale models trained on diverse datasets,\nin-context learning has emerged as a promising paradigm for multitasking,\nnotably in natural language processing and image processing. However, its\napplication in 3D point cloud tasks remains largely unexplored. In this work,\nwe introduce Point-In-Context (PIC), a novel framework for 3D point cloud\nunderstanding via in-context learning. We address the technical challenge of\neffectively extending masked point modeling to 3D point clouds by introducing a\nJoint Sampling module and proposing a vanilla version of PIC called\nPoint-In-Context-Generalist (PIC-G). PIC-G is designed as a generalist model\nfor various 3D point cloud tasks, with inputs and outputs modeled as\ncoordinates. In this paradigm, the challenging segmentation task is achieved by\nassigning label points with XYZ coordinates for each category; the final\nprediction is then chosen based on the label point closest to the predictions.\nTo break the limitation by the fixed label-coordinate assignment, which has\npoor generalization upon novel classes, we propose two novel training\nstrategies, In-Context Labeling and In-Context Enhancing, forming an extended\nversion of PIC named Point-In-Context-Segmenter (PIC-S), targeting improving\ndynamic context labeling and model training. By utilizing dynamic in-context\nlabels and extra in-context pairs, PIC-S achieves enhanced performance and\ngeneralization capability in and across part segmentation datasets. PIC is a\ngeneral framework so that other tasks or datasets can be seamlessly introduced\ninto our PIC through a unified data format. We conduct extensive experiments to\nvalidate the versatility and adaptability of our proposed methods in handling a\nwide range of tasks and segmenting multi-datasets. Our PIC-S is capable of\ngeneralizing unseen datasets and performing novel part segmentation by\ncustomizing prompts.\n","authors":["Mengyuan Liu","Zhongbin Fang","Xia Li","Joachim M. 
Buhmann","Xiangtai Li","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2404.12352v1.pdf","comment":"Project page: https://fanglaosi.github.io/Point-In-Context_Pages.\n arXiv admin note: text overlap with arXiv:2306.08659"},{"id":"http://arxiv.org/abs/2404.08995v2","updated":"2024-04-18T17:26:30Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of \\textbf{9.7}$\\%$ within the Stanford Cars dataset and\n\\textbf{12$\\times$} clustering efficiency within the Herbarium 19 dataset. We\nwill make the code and checkpoints publicly available at\n\\url{https://github.com/xjtuYW/PNP.git}.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.12347v1","updated":"2024-04-18T17:24:28Z","published":"2024-04-18T17:24:28Z","title":"AniClipart: Clipart Animation with Text-to-Video Priors","summary":" Clipart, a pre-made graphic art form, offers a convenient and efficient way\nof illustrating visual content. Traditional workflows to convert static clipart\nimages into motion sequences are laborious and time-consuming, involving\nnumerous intricate steps like rigging, key animation and in-betweening. Recent\nadvancements in text-to-video generation hold great potential in resolving this\nproblem. Nevertheless, direct application of text-to-video generation models\noften struggles to retain the visual identity of clipart images or generate\ncartoon-style motions, resulting in unsatisfactory animation outcomes. In this\npaper, we introduce AniClipart, a system that transforms static clipart images\ninto high-quality motion sequences guided by text-to-video priors. 
To generate\ncartoon-style and smooth motion, we first define B\\'{e}zier curves over\nkeypoints of the clipart image as a form of motion regularization. We then\nalign the motion trajectories of the keypoints with the provided text prompt by\noptimizing the Video Score Distillation Sampling (VSDS) loss, which encodes\nadequate knowledge of natural motion within a pretrained text-to-video\ndiffusion model. With a differentiable As-Rigid-As-Possible shape deformation\nalgorithm, our method can be end-to-end optimized while maintaining deformation\nrigidity. Experimental results show that the proposed AniClipart consistently\noutperforms existing image-to-video generation models, in terms of text-video\nalignment, visual identity preservation, and motion consistency. Furthermore,\nwe showcase the versatility of AniClipart by adapting it to generate a broader\narray of animation formats, such as layered animation, which allows topological\nchanges.\n","authors":["Ronghuan Wu","Wanchao Su","Kede Ma","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2404.12347v1.pdf","comment":"Project Page: https://aniclipart.github.io/"},{"id":"http://arxiv.org/abs/2404.12341v1","updated":"2024-04-18T17:10:18Z","published":"2024-04-18T17:10:18Z","title":"Measuring Feature Dependency of Neural Networks by Collapsing Feature\n Dimensions in the Data Manifold","summary":" This paper introduces a new technique to measure the feature dependency of\nneural network models. The motivation is to better understand a model by\nquerying whether it is using information from human-understandable features,\ne.g., anatomical shape, volume, or image texture. Our method is based on the\nprinciple that if a model is dependent on a feature, then removal of that\nfeature should significantly harm its performance. A targeted feature is\n\"removed\" by collapsing the dimension in the data distribution that corresponds\nto that feature. We perform this by moving data points along the feature\ndimension to a baseline feature value while staying on the data manifold, as\nestimated by a deep generative model. Then we observe how the model's\nperformance changes on the modified test data set, with the target feature\ndimension removed. We test our method on deep neural network models trained on\nsynthetic image data with known ground truth, an Alzheimer's disease prediction\ntask using MRI and hippocampus segmentations from the OASIS-3 dataset, and a\ncell nuclei classification task using the Lizard dataset.\n","authors":["Yinzhu Jin","Matthew B. Dwyer","P. Thomas Fletcher"],"pdf_url":"https://arxiv.org/pdf/2404.12341v1.pdf","comment":"Accepted and will be pulished in International Symposium on\n Biomedical Imaging (ISBI) 2024"},{"id":"http://arxiv.org/abs/2404.12339v1","updated":"2024-04-18T17:09:10Z","published":"2024-04-18T17:09:10Z","title":"SPOT: Point Cloud Based Stereo Visual Place Recognition for Similar and\n Opposing Viewpoints","summary":" Recognizing places from an opposing viewpoint during a return trip is a\ncommon experience for human drivers. However, the analogous robotics\ncapability, visual place recognition (VPR) with limited field of view cameras\nunder 180 degree rotations, has proven to be challenging to achieve. To address\nthis problem, this paper presents Same Place Opposing Trajectory (SPOT), a\ntechnique for opposing viewpoint VPR that relies exclusively on structure\nestimated through stereo visual odometry (VO). 
The method extends recent\nadvances in lidar descriptors and utilizes a novel double (similar and\nopposing) distance matrix sequence matching method. We evaluate SPOT on a\npublicly available dataset with 6.7-7.6 km routes driven in similar and\nopposing directions under various lighting conditions. The proposed algorithm\ndemonstrates remarkable improvement over the state-of-the-art, achieving up to\n91.7% recall at 100% precision in opposing viewpoint cases, while requiring\nless storage than all baselines tested and running faster than all but one.\nMoreover, the proposed method assumes no a priori knowledge of whether the\nviewpoint is similar or opposing, and also demonstrates competitive performance\nin similar viewpoint cases.\n","authors":["Spencer Carmichael","Rahul Agrawal","Ram Vasudevan","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2404.12339v1.pdf","comment":"Accepted to ICRA 2024, project website:\n https://umautobots.github.io/spot"},{"id":"http://arxiv.org/abs/2309.16208v2","updated":"2024-04-18T17:08:53Z","published":"2023-09-28T07:17:44Z","title":"Low-rank tensor completion via tensor joint rank with logarithmic\n composite norm","summary":" Low-rank tensor completion (LRTC) aims to recover a complete low-rank tensor\nfrom an incomplete observed tensor, attracting extensive attention in various\npractical applications such as image processing and computer vision. However,\ncurrent methods often perform well only when there is a sufficient amount of observed\ninformation, and they perform poorly or may fail when the observed information\nis less than 5\\%. In order to improve the utilization of observed information,\na new method called the tensor joint rank with logarithmic composite norm\n(TJLC) is proposed. This method simultaneously exploits two types of\ntensor low-rank structures, namely tensor Tucker rank and tubal rank, thereby\nenhancing the inherent correlations between known and missing elements. To\naddress the challenge of applying two significantly different tensor ranks\ndirectly to LRTC, a new tensor Logarithmic composite norm is further proposed.\nSubsequently, the TJLC model and algorithm for the LRTC problem are proposed.\nAdditionally, theoretical convergence guarantees for the TJLC method are\nprovided. Experiments on various real datasets demonstrate that the proposed\nmethod outperforms state-of-the-art methods significantly. Particularly, the\nproposed method achieves satisfactory recovery even when the observed\ninformation is as low as 1\\%, and the recovery performance improves\nsignificantly as the observed information increases.\n","authors":["Hongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.16208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12333v1","updated":"2024-04-18T16:59:51Z","published":"2024-04-18T16:59:51Z","title":"Customizing Text-to-Image Diffusion with Camera Viewpoint Control","summary":" Model customization introduces new concepts to existing text-to-image models,\nenabling the generation of the new concept in novel contexts. However, such\nmethods lack accurate camera view control w.r.t. the object, and users must\nresort to prompt engineering (e.g., adding \"top-view\") to achieve coarse view\ncontrol. In this work, we introduce a new task -- enabling explicit control of\ncamera viewpoint for model customization. This allows us to modify object\nproperties amongst various background scenes via text prompts, all while\nincorporating the target camera pose as additional control. 
This new task\npresents significant challenges in merging a 3D representation from the\nmulti-view images of the new concept with a general, 2D text-to-image model. To\nbridge this gap, we propose to condition the 2D diffusion process on rendered,\nview-dependent features of the new object. During training, we jointly adapt\nthe 2D diffusion modules and 3D feature predictions to reconstruct the object's\nappearance and geometry while reducing overfitting to the input multi-view\nimages. Our method outperforms existing image editing and model personalization\nbaselines in preserving the custom object's identity while following the input\ntext prompt and the object's camera pose.\n","authors":["Nupur Kumari","Grace Su","Richard Zhang","Taesung Park","Eli Shechtman","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.12333v1.pdf","comment":"project page: https://customdiffusion360.github.io"},{"id":"http://arxiv.org/abs/2404.12330v1","updated":"2024-04-18T16:58:05Z","published":"2024-04-18T16:58:05Z","title":"A Perspective on Deep Vision Performance with Standard Image and Video\n Codecs","summary":" Resource-constrained hardware, such as edge devices or cell phones, often\nrely on cloud servers to provide the required computational resources for\ninference in deep vision models. However, transferring image and video data\nfrom an edge or mobile device to a cloud server requires coding to deal with\nnetwork constraints. The use of standardized codecs, such as JPEG or H.264, is\nprevalent and required to ensure interoperability. This paper aims to examine\nthe implications of employing standardized codecs within deep vision pipelines.\nWe find that using JPEG and H.264 coding significantly deteriorates the\naccuracy across a broad range of vision tasks and models. For instance, strong\ncompression rates reduce semantic segmentation accuracy by more than 80% in\nmIoU. In contrast to previous findings, our analysis extends beyond image and\naction classification to localization and dense prediction tasks, thus\nproviding a more comprehensive perspective.\n","authors":["Christoph Reich","Oliver Hahn","Daniel Cremers","Stefan Roth","Biplob Debnath"],"pdf_url":"https://arxiv.org/pdf/2404.12330v1.pdf","comment":"Accepted at CVPR 2024 Workshop on AI for Streaming (AIS)"},{"id":"http://arxiv.org/abs/2404.12322v1","updated":"2024-04-18T16:53:08Z","published":"2024-04-18T16:53:08Z","title":"Generalizable Face Landmarking Guided by Conditional Face Warping","summary":" As a significant step for human face modeling, editing, and generation, face\nlandmarking aims at extracting facial keypoints from images. A generalizable\nface landmarker is required in practice because real-world facial images, e.g.,\nthe avatars in animations and games, are often stylized in various ways.\nHowever, achieving generalizable face landmarking is challenging due to the\ndiversity of facial styles and the scarcity of labeled stylized faces. In this\nstudy, we propose a simple but effective paradigm to learn a generalizable face\nlandmarker based on labeled real human faces and unlabeled stylized faces. Our\nmethod learns the face landmarker as the key module of a conditional face\nwarper. Given a pair of real and stylized facial images, the conditional face\nwarper predicts a warping field from the real face to the stylized one, in\nwhich the face landmarker predicts the ending points of the warping field and\nprovides us with high-quality pseudo landmarks for the corresponding stylized\nfacial images. 
Applying an alternating optimization strategy, we learn the face\nlandmarker to minimize $i)$ the discrepancy between the stylized faces and the\nwarped real ones and $ii)$ the prediction errors of both real and pseudo\nlandmarks. Experiments on various datasets show that our method outperforms\nexisting state-of-the-art domain adaptation methods in face landmarking tasks,\nleading to a face landmarker with better generalizability. Code is available at\nhttps://plustwo0.github.io/project-face-landmarker.\n","authors":["Jiayi Liang","Haotian Liu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12322v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.12309v1","updated":"2024-04-18T16:38:02Z","published":"2024-04-18T16:38:02Z","title":"iRAG: An Incremental Retrieval Augmented Generation System for Videos","summary":" Retrieval augmented generation (RAG) systems combine the strengths of\nlanguage generation and information retrieval to power many real-world\napplications like chatbots. Use of RAG for combined understanding of multimodal\ndata such as text, images and videos is appealing but two critical limitations\nexist: one-time, upfront capture of all content in large multimodal data as\ntext descriptions entails high processing times, and not all information in the\nrich multimodal data is typically in the text descriptions. Since the user\nqueries are not known apriori, developing a system for multimodal to text\nconversion and interactive querying of multimodal data is challenging.\n To address these limitations, we propose iRAG, which augments RAG with a\nnovel incremental workflow to enable interactive querying of large corpus of\nmultimodal data. Unlike traditional RAG, iRAG quickly indexes large\nrepositories of multimodal data, and in the incremental workflow, it uses the\nindex to opportunistically extract more details from select portions of the\nmultimodal data to retrieve context relevant to an interactive user query. Such\nan incremental workflow avoids long multimodal to text conversion times,\novercomes information loss issues by doing on-demand query-specific extraction\nof details in multimodal data, and ensures high quality of responses to\ninteractive user queries that are often not known apriori. To the best of our\nknowledge, iRAG is the first system to augment RAG with an incremental workflow\nto support efficient interactive querying of large, real-world multimodal data.\nExperimental results on real-world long videos demonstrate 23x to 25x faster\nvideo to text ingestion, while ensuring that quality of responses to\ninteractive user queries is comparable to responses from a traditional RAG\nwhere all video data is converted to text upfront before any querying.\n","authors":["Md Adnan Arefeen","Biplob Debnath","Md Yusuf Sarwar Uddin","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2404.12309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12295v1","updated":"2024-04-18T16:18:41Z","published":"2024-04-18T16:18:41Z","title":"When Medical Imaging Met Self-Attention: A Love Story That Didn't Quite\n Work Out","summary":" A substantial body of research has focused on developing systems that assist\nmedical professionals during labor-intensive early screening processes, many\nbased on convolutional deep-learning architectures. Recently, multiple studies\nexplored the application of so-called self-attention mechanisms in the vision\ndomain. 
These studies often report empirical improvements over fully\nconvolutional approaches on various datasets and tasks. To evaluate this trend\nfor medical imaging, we extend two widely adopted convolutional architectures\nwith different self-attention variants on two different medical datasets. With\nthis, we aim to specifically evaluate the possible advantages of additional\nself-attention. We compare our models with similarly sized convolutional and\nattention-based baselines and evaluate performance gains statistically.\nAdditionally, we investigate how including such layers changes the features\nlearned by these models during the training. Following a hyperparameter search,\nand contrary to our expectations, we observe no significant improvement in\nbalanced accuracy over fully convolutional models. We also find that important\nfeatures, such as dermoscopic structures in skin lesion images, are still not\nlearned by employing self-attention. Finally, analyzing local explanations, we\nconfirm biased feature usage. We conclude that merely incorporating attention\nis insufficient to surpass the performance of existing fully convolutional\nmethods.\n","authors":["Tristan Piater","Niklas Penzel","Gideon Stein","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.12295v1.pdf","comment":"10 pages, 2 figures, 5 tables, presented at VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.12292v1","updated":"2024-04-18T16:12:38Z","published":"2024-04-18T16:12:38Z","title":"Reducing Bias in Pre-trained Models by Tuning while Penalizing Change","summary":" Deep models trained on large amounts of data often incorporate implicit\nbiases present during training time. If later such a bias is discovered during\ninference or deployment, it is often necessary to acquire new data and retrain\nthe model. This behavior is especially problematic in critical areas such as\nautonomous driving or medical decision-making. In these scenarios, new data is\noften expensive and hard to come by. In this work, we present a method based on\nchange penalization that takes a pre-trained model and adapts the weights to\nmitigate a previously detected bias. We achieve this by tuning a\nzero-initialized copy of a frozen pre-trained network. Our method needs very\nfew, in extreme cases only a single, examples that contradict the bias to\nincrease performance. Additionally, we propose an early stopping criterion to\nmodify baselines and reduce overfitting. We evaluate our approach on a\nwell-known bias in skin lesion classification and three other datasets from the\ndomain shift literature. We find that our approach works especially well with\nvery few images. Simple fine-tuning combined with our early stopping also leads\nto performance benefits for a larger number of tuning samples.\n","authors":["Niklas Penzel","Gideon Stein","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.12292v1.pdf","comment":"12 pages, 12 figures, presented at VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.12285v1","updated":"2024-04-18T16:04:14Z","published":"2024-04-18T16:04:14Z","title":"Performance Evaluation of Segment Anything Model with Variational\n Prompting for Application to Non-Visible Spectrum Imagery","summary":" The Segment Anything Model (SAM) is a deep neural network foundational model\ndesigned to perform instance segmentation which has gained significant\npopularity given its zero-shot segmentation ability. 
SAM operates by generating\nmasks based on various input prompts such as text, bounding boxes, points, or\nmasks, introducing a novel methodology to overcome the constraints posed by\ndataset-specific scarcity. While SAM is trained on an extensive dataset,\ncomprising ~11M images, it mostly consists of natural photographic images with\nonly very limited images from other modalities. Whilst the rapid progress in\nvisual infrared surveillance and X-ray security screening imaging technologies,\ndriven forward by advances in deep learning, has significantly enhanced the\nability to detect, classify and segment objects with high accuracy, it is not\nevident if the SAM zero-shot capabilities can be transferred to such\nmodalities. This work assesses SAM capabilities in segmenting objects of\ninterest in the X-ray/infrared modalities. Our approach reuses the pre-trained\nSAM with three different prompts: bounding box, centroid and random points. We\npresent quantitative/qualitative results to showcase the performance on\nselected datasets. Our results show that SAM can segment objects in the X-ray\nmodality when given a box prompt, but its performance varies for point prompts.\nSpecifically, SAM performs poorly in segmenting slender objects and organic\nmaterials, such as plastic bottles. We find that infrared objects are also\nchallenging to segment with point prompts given the low-contrast nature of this\nmodality. This study shows that while SAM demonstrates outstanding zero-shot\ncapabilities with box prompts, its performance ranges from moderate to poor for\npoint prompts, indicating that special consideration on the cross-modal\ngeneralisation of SAM is needed when considering use on X-ray/infrared imagery.\n","authors":["Yona Falinie A. Gaus","Neelanjan Bhowmik","Brian K. S. Isaac-Medina","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2404.12285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08273v2","updated":"2024-04-18T15:55:56Z","published":"2024-04-12T06:52:40Z","title":"Struggle with Adversarial Defense? Try Diffusion","summary":" Adversarial attacks induce misclassification by introducing subtle\nperturbations. Recently, diffusion models are applied to the image classifiers\nto improve adversarial robustness through adversarial training or by purifying\nadversarial noise. However, diffusion-based adversarial training often\nencounters convergence challenges and high computational expenses.\nAdditionally, diffusion-based purification inevitably causes data shift and is\ndeemed susceptible to stronger adaptive attacks. To tackle these issues, we\npropose the Truth Maximization Diffusion Classifier (TMDC), a generative\nBayesian classifier that builds upon pre-trained diffusion models and the\nBayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian\nprinciples, utilizes the conditional likelihood from diffusion models to\ndetermine the class probabilities of input images, thereby insulating against\nthe influences of data shift and the limitations of adversarial training.\nMoreover, to enhance TMDC's resilience against more potent adversarial attacks,\nwe propose an optimization strategy for diffusion classifiers. 
This strategy\ninvolves post-training the diffusion model on perturbed datasets with\nground-truth labels as conditions, guiding the diffusion model to learn the\ndata distribution and maximizing the likelihood under the ground-truth labels.\nThe proposed method achieves state-of-the-art performance on the CIFAR10\ndataset against heavy white-box attacks and strong adaptive attacks.\nSpecifically, TMDC achieves robust accuracies of 82.81% against $l_{\\infty}$\nnorm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded\nperturbations, respectively, with $\\epsilon=0.05$.\n","authors":["Yujie Li","Yanbin Wang","Haitao Xu","Bin Liu","Jianguo Sun","Zhenhao Guo","Wenrui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04517v4","updated":"2024-04-18T15:50:37Z","published":"2023-01-11T15:31:15Z","title":"A new dataset for measuring the performance of blood vessel segmentation\n methods under distribution shifts","summary":" Creating a dataset for training supervised machine learning algorithms can be\na demanding task. This is especially true for medical image segmentation since\none or more specialists are usually required for image annotation, and creating\nground truth labels for just a single image can take up to several hours. In\naddition, it is paramount that the annotated samples represent well the\ndifferent conditions that might affect the imaged tissues as well as possible\nchanges in the image acquisition process. This can only be achieved by\nconsidering samples that are typical in the dataset as well as atypical, or\neven outlier, samples. We introduce VessMAP, a heterogeneous blood vessel\nsegmentation dataset acquired by carefully sampling relevant images from a\nlarger non-annotated dataset. A methodology was developed to select both\nprototypical and atypical samples from the base dataset, thus defining an\nassorted set of images that can be used for measuring the performance of\nsegmentation algorithms on samples that are highly distinct from each other. To\ndemonstrate the potential of the new dataset, we show that the validation\nperformance of a neural network changes significantly depending on the splits\nused for training the network.\n","authors":["Matheus Viana da Silva","Natália de Carvalho Santos","Julie Ouellette","Baptiste Lacoste","Cesar Henrique Comin"],"pdf_url":"https://arxiv.org/pdf/2301.04517v4.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2310.08475v5","updated":"2024-04-18T15:46:22Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. 
Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v5.pdf","comment":"EMNLP 2023. Add the Exact Match/Accuracy results of Reliability and\n T-Generality"},{"id":"http://arxiv.org/abs/2309.16388v2","updated":"2024-04-18T15:32:30Z","published":"2023-09-28T12:36:12Z","title":"Exposing Image Splicing Traces in Scientific Publications via\n Uncertainty-guided Refinement","summary":" Recently, a surge in scientific publications suspected of image manipulation\nhas led to numerous retractions, bringing the issue of image integrity into\nsharp focus. Although research on forensic detectors for image plagiarism and\nimage synthesis exists, the detection of image splicing traces in scientific\npublications remains unexplored. Compared to image duplication and synthesis,\nimage splicing detection is more challenging due to the lack of reference\nimages and the typically small tampered areas. Furthermore, disruptive factors\nin scientific images, such as artifacts from digital compression, abnormal\npatterns, and noise from physical operations, present misleading features like\nsplicing traces, significantly increasing the difficulty of this task.\nMoreover, the scarcity of high-quality datasets of spliced scientific images\nlimits potential advancements. In this work, we propose an Uncertainty-guided\nRefinement Network (URN) to mitigate the impact of these disruptive factors.\nOur URN can explicitly suppress the propagation of unreliable information flow\ncaused by disruptive factors between regions, thus obtaining robust splicing\nfeatures. Additionally, the URN is designed to concentrate improvements in\nuncertain prediction areas during the decoding phase. We also construct a\ndataset for image splicing detection (SciSp) containing 1,290 spliced images.\nCompared to existing datasets, SciSp includes the largest number of spliced\nimages and the most diverse sources. Comprehensive experiments conducted on\nthree benchmark datasets demonstrate the superiority of our approach. We also\nvalidate the URN's generalisability in resisting cross-dataset domain shifts\nand its robustness against various post-processing techniques, including\nadvanced deep-learning-based inpainting.\n","authors":["Xun Lin","Wenzhong Tang","Haoran Wang","Yizhong Liu","Yakun Ju","Shuai Wang","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15584v3","updated":"2024-04-18T15:29:14Z","published":"2024-02-23T19:51:55Z","title":"State Space Models for Event Cameras","summary":" Today, state-of-the-art deep neural networks that process event-camera data\nfirst convert a temporal window of events into dense, grid-like input\nrepresentations. As such, they exhibit poor generalizability when deployed at\nhigher inference frequencies (i.e., smaller temporal windows) than the ones\nthey were trained on. We address this challenge by introducing state-space\nmodels (SSMs) with learnable timescale parameters to event-based vision. This\ndesign adapts to varying frequencies without the need to retrain the network at\ndifferent frequencies. 
Additionally, we investigate two strategies to\ncounteract aliasing effects when deploying the model at higher frequencies. We\ncomprehensively evaluate our approach against existing methods based on RNN and\nTransformer architectures across various benchmarks, including Gen1 and 1 Mpx\nevent camera datasets. Our results demonstrate that SSM-based models train 33%\nfaster and also exhibit minimal performance degradation when tested at higher\nfrequencies than the training input. Traditional RNN and Transformer models\nexhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.76\nmAP, highlighting the effectiveness of SSMs in event-based vision tasks.\n","authors":["Nikola Zubić","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2402.15584v3.pdf","comment":"18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper"},{"id":"http://arxiv.org/abs/2404.12260v1","updated":"2024-04-18T15:28:34Z","published":"2024-04-18T15:28:34Z","title":"Alleviating Catastrophic Forgetting in Facial Expression Recognition\n with Emotion-Centered Models","summary":" Facial expression recognition is a pivotal component in machine learning,\nfacilitating various applications. However, convolutional neural networks\n(CNNs) are often plagued by catastrophic forgetting, impeding their\nadaptability. The proposed method, emotion-centered generative replay (ECgr),\ntackles this challenge by integrating synthetic images from generative\nadversarial networks. Moreover, ECgr incorporates a quality assurance algorithm\nto ensure the fidelity of generated images. This dual approach enables CNNs to\nretain past knowledge while learning new tasks, enhancing their performance in\nemotion recognition. The experimental results on four diverse facial expression\ndatasets demonstrate that incorporating images generated by our\npseudo-rehearsal method enhances training on the targeted dataset and the\nsource dataset while making the CNN retain previously learned knowledge.\n","authors":["Israel A. Laurensi","Alceu de Souza Britto Jr.","Jean Paul Barddal","Alessandro Lameiras Koerich"],"pdf_url":"https://arxiv.org/pdf/2404.12260v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.12258v1","updated":"2024-04-18T15:25:59Z","published":"2024-04-18T15:25:59Z","title":"DeepLocalization: Using change point detection for Temporal Action\n Localization","summary":" In this study, we introduce DeepLocalization, an innovative framework devised\nfor the real-time localization of actions tailored explicitly for monitoring\ndriver behavior. Utilizing the power of advanced deep learning methodologies,\nour objective is to tackle the critical issue of distracted driving-a\nsignificant factor contributing to road accidents. Our strategy employs a dual\napproach: leveraging Graph-Based Change-Point Detection for pinpointing actions\nin time alongside a Video Large Language Model (Video-LLM) for precisely\ncategorizing activities. Through careful prompt engineering, we customize the\nVideo-LLM to adeptly handle driving activities' nuances, ensuring its\nclassification efficacy even with sparse data. Engineered to be lightweight,\nour framework is optimized for consumer-grade GPUs, making it vastly applicable\nin practical scenarios. We subjected our method to rigorous testing on the\nSynDD2 dataset, a complex benchmark for distracted driving behaviors, where it\ndemonstrated commendable performance-achieving 57.5% accuracy in event\nclassification and 51% in event detection. 
These outcomes underscore the\nsubstantial promise of DeepLocalization in accurately identifying diverse\ndriver behaviors and their temporal occurrences, all within the bounds of\nlimited computational resources.\n","authors":["Mohammed Shaiqur Rahman","Ibne Farabi Shihab","Lynna Chu","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.12258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12257v1","updated":"2024-04-18T15:23:37Z","published":"2024-04-18T15:23:37Z","title":"Food Portion Estimation via 3D Object Scaling","summary":" Image-based methods to analyze food images have alleviated the user burden\nand biases associated with traditional methods. However, accurate portion\nestimation remains a major challenge due to the loss of 3D information in the\n2D representation of foods captured by smartphone cameras or wearable devices.\nIn this paper, we propose a new framework to estimate both food volume and\nenergy from 2D images by leveraging the power of 3D food models and physical\nreference in the eating scene. Our method estimates the pose of the camera and\nthe food object in the input image and recreates the eating occasion by\nrendering an image of a 3D model of the food with the estimated poses. We also\nintroduce a new dataset, SimpleFood45, which contains 2D images of 45 food\nitems and associated annotations including food volume, weight, and energy. Our\nmethod achieves an average error of 31.10 kCal (17.67%) on this dataset,\noutperforming existing portion estimation methods.\n","authors":["Gautham Vinod","Jiangpeng He","Zeman Shao","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.12257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12252v1","updated":"2024-04-18T15:20:59Z","published":"2024-04-18T15:20:59Z","title":"Deep Gaussian mixture model for unsupervised image segmentation","summary":" The recent emergence of deep learning has led to a great deal of work on\ndesigning supervised deep semantic segmentation algorithms. As in many tasks\nsufficient pixel-level labels are very difficult to obtain, we propose a method\nwhich combines a Gaussian mixture model (GMM) with unsupervised deep learning\ntechniques. In the standard GMM the pixel values within each sub-region are\nmodelled by a Gaussian distribution. In order to identify the different\nregions, the parameter vector that minimizes the negative log-likelihood (NLL)\nfunction regarding the GMM has to be approximated. For this task, usually\niterative optimization methods such as the expectation-maximization (EM)\nalgorithm are used. In this paper, we propose to estimate these parameters\ndirectly from the image using a convolutional neural network (CNN). We thus\nchange the iterative procedure in the EM algorithm, replacing the\nexpectation-step by a gradient-step with regard to the network's parameters.\nThis means that the network is trained to minimize the NLL function of the GMM,\nwhich comes with at least two advantages. Firstly, once trained, the network is\nable to predict label probabilities very quickly compared with time-consuming\niterative optimization methods. Secondly, due to the deep image prior, our\nmethod is able to partially overcome one of the main disadvantages of GMM,\nwhich is not taking into account correlation between neighboring pixels, as it\nassumes independence between them. 
We demonstrate the advantages of our method\nin various experiments on the example of myocardial infarct segmentation on\nmulti-sequence MRI images.\n","authors":["Matthias Schwab","Agnes Mayr","Markus Haltmeier"],"pdf_url":"https://arxiv.org/pdf/2404.12252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12251v1","updated":"2024-04-18T15:18:14Z","published":"2024-04-18T15:18:14Z","title":"Dynamic Modality and View Selection for Multimodal Emotion Recognition\n with Missing Modalities","summary":" The study of human emotions, traditionally a cornerstone in fields like\npsychology and neuroscience, has been profoundly impacted by the advent of\nartificial intelligence (AI). Multiple channels, such as speech (voice) and\nfacial expressions (image), are crucial in understanding human emotions.\nHowever, AI's journey in multimodal emotion recognition (MER) is marked by\nsubstantial technical challenges. One significant hurdle is how AI models\nmanage the absence of a particular modality - a frequent occurrence in\nreal-world situations. This study's central focus is assessing the performance\nand resilience of two strategies when confronted with the lack of one modality:\na novel multimodal dynamic modality and view selection and a cross-attention\nmechanism. Results on the RECOLA dataset show that dynamic selection-based\nmethods are a promising approach for MER. In the missing modalities scenarios,\nall dynamic selection-based methods outperformed the baseline. The study\nconcludes by emphasizing the intricate interplay between audio and video\nmodalities in emotion prediction, showcasing the adaptability of dynamic\nselection methods in handling missing modalities.\n","authors":["Luciana Trinkaus Menon","Luiz Carlos Ribeiro Neduziak","Jean Paul Barddal","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr"],"pdf_url":"https://arxiv.org/pdf/2404.12251v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.12246v1","updated":"2024-04-18T15:11:02Z","published":"2024-04-18T15:11:02Z","title":"Blind Localization and Clustering of Anomalies in Textures","summary":" Anomaly detection and localization in images is a growing field in computer\nvision. In this area, a seemingly understudied problem is anomaly clustering,\ni.e., identifying and grouping different types of anomalies in a fully\nunsupervised manner. In this work, we propose a novel method for clustering\nanomalies in largely stationary images (textures) in a blind setting. That is,\nthe input consists of normal and anomalous images without distinction and\nwithout labels. What contributes to the difficulty of the task is that\nanomalous regions are often small and may present only subtle changes in\nappearance, which can be easily overshadowed by the genuine variance in the\ntexture. Moreover, each anomaly type may have a complex appearance\ndistribution. We introduce a novel scheme for solving this task using a\ncombination of blind anomaly localization and contrastive learning. By\nidentifying the anomalous regions with high fidelity, we can restrict our focus\nto those regions of interest; then, contrastive learning is employed to\nincrease the separability of different anomaly types and reduce the intra-class\nvariation. Our experiments show that the proposed solution yields significantly\nbetter results compared to prior work, setting a new state of the art. 
Project\npage: https://reality.tf.fau.de/pub/ardelean2024blind.html.\n","authors":["Andrei-Timotei Ardelean","Tim Weyrich"],"pdf_url":"https://arxiv.org/pdf/2404.12246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11474v4","updated":"2024-04-18T15:10:47Z","published":"2023-05-19T06:55:04Z","title":"Reciprocal Attention Mixing Transformer for Lightweight Image\n Restoration","summary":" Although many recent works have made advancements in the image restoration\n(IR) field, they often suffer from an excessive number of parameters. Another\nissue is that most Transformer-based IR methods focus only on either local or\nglobal features, leading to limited receptive fields or deficient parameter\nissues. To address these problems, we propose a lightweight IR network,\nReciprocal Attention Mixing Transformer (RAMiT). It employs our proposed\ndimensional reciprocal attention mixing Transformer (D-RAMiT) blocks, which\ncompute bi-dimensional (spatial and channel) self-attentions in parallel with\ndifferent numbers of multi-heads. The bi-dimensional attentions help each other\nto complement their counterpart's drawbacks and are then mixed. Additionally,\nwe introduce a hierarchical reciprocal attention mixing (H-RAMi) layer that\ncompensates for pixel-level information losses and utilizes semantic\ninformation while maintaining an efficient hierarchical structure. Furthermore,\nwe revisit and modify MobileNet V1 and V2 to attach efficient convolutions to\nour proposed components. The experimental results demonstrate that RAMiT\nachieves state-of-the-art performance on multiple lightweight IR tasks,\nincluding super-resolution, color denoising, grayscale denoising, low-light\nenhancement, and deraining. Codes are available at\nhttps://github.com/rami0205/RAMiT.\n","authors":["Haram Choi","Cheolwoong Na","Jihyeon Oh","Seungjae Lee","Jinseop Kim","Subeen Choe","Jeongmin Lee","Taehoon Kim","Jihoon Yang"],"pdf_url":"https://arxiv.org/pdf/2305.11474v4.pdf","comment":"CVPR 2024 Workshop - NTIRE. Codes are available at\n https://github.com/rami0205/RAMiT"},{"id":"http://arxiv.org/abs/2404.09683v2","updated":"2024-04-18T14:51:55Z","published":"2024-04-15T11:36:31Z","title":"Post-Training Network Compression for 3D Medical Image Segmentation:\n Reducing Computational Efforts via Tucker Decomposition","summary":" We address the computational barrier of deploying advanced deep learning\nsegmentation models in clinical settings by studying the efficacy of network\ncompression through tensor decomposition. We propose a post-training Tucker\nfactorization that enables the decomposition of pre-existing models to reduce\ncomputational requirements without impeding segmentation accuracy. We applied\nTucker decomposition to the convolutional kernels of the TotalSegmentator (TS)\nmodel, an nnU-Net model trained on a comprehensive dataset for automatic\nsegmentation of 117 anatomical structures. Our approach reduced the\nfloating-point operations (FLOPs) and memory required during inference,\noffering an adjustable trade-off between computational efficiency and\nsegmentation quality. This study utilized the publicly available TS dataset,\nemploying various downsampling factors to explore the relationship between\nmodel size, inference speed, and segmentation performance. The application of\nTucker decomposition to the TS model substantially reduced the model parameters\nand FLOPs across various compression rates, with limited loss in segmentation\naccuracy. 
We removed up to 88% of the model's parameters with no significant\nperformance changes in the majority of classes after fine-tuning. Practical\nbenefits varied across different graphics processing unit (GPU) architectures,\nwith more distinct speed-ups on less powerful hardware. Post-hoc network\ncompression via Tucker decomposition presents a viable strategy for reducing\nthe computational demand of medical image segmentation models without\nsubstantially sacrificing accuracy. This approach enables the broader adoption\nof advanced deep learning technologies in clinical practice, offering a way to\nnavigate the constraints of hardware capabilities.\n","authors":["Tobias Weber","Jakob Dexl","David Rügamer","Michael Ingrisch"],"pdf_url":"https://arxiv.org/pdf/2404.09683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12235v1","updated":"2024-04-18T14:51:42Z","published":"2024-04-18T14:51:42Z","title":"Beyond Average: Individualized Visual Scanpath Prediction","summary":" Understanding how attention varies across individuals has significant\nscientific and societal impacts. However, existing visual scanpath models treat\nattention uniformly, neglecting individual differences. To bridge this gap,\nthis paper focuses on individualized scanpath prediction (ISP), a new attention\nmodeling task that aims to accurately predict how different individuals shift\ntheir attention in diverse visual tasks. It proposes an ISP method featuring\nthree novel technical components: (1) an observer encoder to characterize and\nintegrate an observer's unique attention traits, (2) an observer-centric\nfeature integration approach that holistically combines visual features, task\nguidance, and observer-specific characteristics, and (3) an adaptive fixation\nprioritization mechanism that refines scanpath predictions by dynamically\nprioritizing semantic feature maps based on individual observers' attention\ntraits. These novel components allow scanpath models to effectively address the\nattention variations across different observers. Our method is generally\napplicable to different datasets, model architectures, and visual tasks,\noffering a comprehensive tool for transforming general scanpath models into\nindividualized ones. Comprehensive evaluations using value-based and\nranking-based metrics verify the method's effectiveness and generalizability.\n","authors":["Xianyu Chen","Ming Jiang","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.12235v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.12216v1","updated":"2024-04-18T14:20:30Z","published":"2024-04-18T14:20:30Z","title":"ProTA: Probabilistic Token Aggregation for Text-Video Retrieval","summary":" Text-video retrieval aims to find the most relevant cross-modal samples for a\ngiven query. Recent methods focus on modeling the whole spatial-temporal\nrelations. However, since video clips contain more diverse content than\ncaptions, the model aligning these asymmetric video-text pairs has a high risk\nof retrieving many false positive results. In this paper, we propose\nProbabilistic Token Aggregation (\\textit{ProTA}) to handle cross-modal\ninteraction with content asymmetry. Specifically, we propose dual\npartial-related aggregation to disentangle and re-aggregate token\nrepresentations in both low-dimension and high-dimension spaces. We propose\ntoken-based probabilistic alignment to generate token-level probabilistic\nrepresentation and maintain the feature representation diversity. 
In addition,\nan adaptive contrastive loss is proposed to learn compact cross-modal\ndistribution space. Based on extensive experiments, \\textit{ProTA} achieves\nsignificant improvements on MSR-VTT (50.9%), LSMDC (25.8%), and DiDeMo (47.2%).\n","authors":["Han Fang","Xianghao Zang","Chao Ban","Zerun Feng","Lanxiang Zhou","Zhongjiang He","Yongxiang Li","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12210v1","updated":"2024-04-18T14:14:44Z","published":"2024-04-18T14:14:44Z","title":"Observation, Analysis, and Solution: Exploring Strong Lightweight Vision\n Transformers via Masked Image Modeling Pre-Training","summary":" Masked image modeling (MIM) pre-training for large-scale vision transformers\n(ViTs) in computer vision has enabled promising downstream performance on top\nof the learned self-supervised ViT features. In this paper, we question if the\nextremely simple ViTs' fine-tuning performance with a small-scale architecture\ncan also benefit from this pre-training paradigm, which is considerably less\nstudied yet in contrast to the well-established lightweight architecture design\nmethodology with sophisticated components introduced. By carefully adapting\nvarious typical MIM pre-training methods to this lightweight regime and\ncomparing them with the contrastive learning (CL) pre-training on various\ndownstream image classification and dense prediction tasks, we systematically\nobserve different behaviors between MIM and CL with respect to the downstream\nfine-tuning data scales. Furthermore, we analyze the frozen features under\nlinear probing evaluation and also the layer representation similarities and\nattention maps across the obtained models, which clearly show the inferior\nlearning of MIM pre-training on higher layers, leading to unsatisfactory\nfine-tuning performance on data-insufficient downstream tasks. This finding is\nnaturally a guide to choosing appropriate distillation strategies during\npre-training to solve the above deterioration problem. Extensive experiments on\nvarious vision tasks demonstrate the effectiveness of our\nobservation-analysis-solution flow. In particular, our pre-training with\ndistillation on pure lightweight ViTs with vanilla/hierarchical design\n(5.7M/6.5M) can achieve 79.4%/78.9% top-1 accuracy on ImageNet-1K. It also\nenables SOTA performance on the ADE20K semantic segmentation task (42.8% mIoU)\nand LaSOT visual tracking task (66.1% AUC) in the lightweight regime. The\nlatter even surpasses all the current SOTA lightweight CPU-realtime trackers.\n","authors":["Jin Gao","Shubo Lin","Shaoru Wang","Yutong Kou","Zeming Li","Liang Li","Congxuan Zhang","Xiaoqin Zhang","Yizheng Wang","Weiming Hu"],"pdf_url":"https://arxiv.org/pdf/2404.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12209v1","updated":"2024-04-18T14:14:07Z","published":"2024-04-18T14:14:07Z","title":"Partial-to-Partial Shape Matching with Geometric Consistency","summary":" Finding correspondences between 3D shapes is an important and long-standing\nproblem in computer vision, graphics and beyond. A prominent challenge are\npartial-to-partial shape matching settings, which occur when the shapes to\nmatch are only observed incompletely (e.g. from 3D scanning). Although\npartial-to-partial matching is a highly relevant setting in practice, it is\nrarely explored. 
Our work bridges the gap between existing (rather artificial)\n3D full shape matching and partial-to-partial real-world settings by exploiting\ngeometric consistency as a strong constraint. We demonstrate that it is indeed\npossible to solve this challenging problem in a variety of settings. For the\nfirst time, we achieve geometric consistency for partial-to-partial matching,\nwhich is realized by a novel integer non-linear program formalism building on\ntriangle product spaces, along with a new pruning algorithm based on linear\ninteger programming. Further, we generate a new inter-class dataset for\npartial-to-partial shape-matching. We show that our method outperforms current\nSOTA methods on both an established intra-class dataset and our novel\ninter-class dataset.\n","authors":["Viktoria Ehm","Maolin Gao","Paul Roetzer","Marvin Eisenberger","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2404.12209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12203v1","updated":"2024-04-18T14:07:08Z","published":"2024-04-18T14:07:08Z","title":"GraFIQs: Face Image Quality Assessment Using Gradient Magnitudes","summary":" Face Image Quality Assessment (FIQA) estimates the utility of face images for\nautomated face recognition (FR) systems. We propose in this work a novel\napproach to assess the quality of face images based on inspecting the required\nchanges in the pre-trained FR model weights to minimize differences between\ntesting samples and the distribution of the FR training dataset. To achieve\nthat, we propose quantifying the discrepancy in Batch Normalization statistics\n(BNS), including mean and variance, between those recorded during FR training\nand those obtained by processing testing samples through the pretrained FR\nmodel. We then generate gradient magnitudes of pretrained FR weights by\nbackpropagating the BNS through the pretrained model. The cumulative absolute\nsum of these gradient magnitudes serves as the FIQ for our approach. Through\ncomprehensive experimentation, we demonstrate the effectiveness of our\ntraining-free and quality labeling-free approach, achieving competitive\nperformance to recent state-of-the-art FIQA approaches without relying on\nquality labeling, the need to train regression networks, specialized\narchitectures, or designing and optimizing specific loss functions.\n","authors":["Jan Niklas Kolf","Naser Damer","Fadi Boutros"],"pdf_url":"https://arxiv.org/pdf/2404.12203v1.pdf","comment":"Accepted at CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.12192v1","updated":"2024-04-18T13:56:03Z","published":"2024-04-18T13:56:03Z","title":"Aligning Actions and Walking to LLM-Generated Textual Descriptions","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nvarious domains, including data augmentation and synthetic data generation.\nThis work explores the use of LLMs to generate rich textual descriptions for\nmotion sequences, encompassing both actions and walking patterns. We leverage\nthe expressive power of LLMs to align motion representations with high-level\nlinguistic cues, addressing two distinct tasks: action recognition and\nretrieval of walking sequences based on appearance attributes. For action\nrecognition, we employ LLMs to generate textual descriptions of actions in the\nBABEL-60 dataset, facilitating the alignment of motion sequences with\nlinguistic representations. 
In the domain of gait analysis, we investigate the\nimpact of appearance attributes on walking patterns by generating textual\ndescriptions of motion sequences from the DenseGait dataset using LLMs. These\ndescriptions capture subtle variations in walking styles influenced by factors\nsuch as clothing choices and footwear. Our approach demonstrates the potential\nof LLMs in augmenting structured motion attributes and aligning multi-modal\nrepresentations. The findings contribute to the advancement of comprehensive\nmotion understanding and open up new avenues for leveraging LLMs in multi-modal\nalignment and data augmentation for motion analysis. We make the code publicly\navailable at https://github.com/Radu1999/WalkAndText\n","authors":["Radu Chivereanu","Adrian Cosma","Andy Catruna","Razvan Rughinis","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2404.12192v1.pdf","comment":"Accepted at 2nd Workshop on Learning with Few or without Annotated\n Face, Body and Gesture Data"},{"id":"http://arxiv.org/abs/2404.12183v1","updated":"2024-04-18T13:46:16Z","published":"2024-04-18T13:46:16Z","title":"Gait Recognition from Highly Compressed Videos","summary":" Surveillance footage represents a valuable resource and opportunities for\nconducting gait analysis. However, the typical low quality and high noise\nlevels in such footage can severely impact the accuracy of pose estimation\nalgorithms, which are foundational for reliable gait analysis. Existing\nliterature suggests a direct correlation between the efficacy of pose\nestimation and the subsequent gait analysis results. A common mitigation\nstrategy involves fine-tuning pose estimation models on noisy data to improve\nrobustness. However, this approach may degrade the downstream model's\nperformance on the original high-quality data, leading to a trade-off that is\nundesirable in practice. We propose a processing pipeline that incorporates a\ntask-targeted artifact correction model specifically designed to pre-process\nand enhance surveillance footage before pose estimation. Our artifact\ncorrection model is optimized to work alongside a state-of-the-art pose\nestimation network, HRNet, without requiring repeated fine-tuning of the pose\nestimation model. Furthermore, we propose a simple and robust method for\nobtaining low quality videos that are annotated with poses in an automatic\nmanner with the purpose of training the artifact correction model. We\nsystematically evaluate the performance of our artifact correction model\nagainst a range of noisy surveillance data and demonstrate that our approach\nnot only achieves improved pose estimation on low-quality surveillance footage,\nbut also preserves the integrity of the pose estimation on high resolution\nfootage. Our experiments show a clear enhancement in gait analysis performance,\nsupporting the viability of the proposed method as a superior alternative to\ndirect fine-tuning strategies. 
Our contributions pave the way for more reliable\ngait analysis using surveillance data in real-world applications, regardless of\ndata quality.\n","authors":["Andrei Niculae","Andy Catruna","Adrian Cosma","Daniel Rosner","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2404.12183v1.pdf","comment":"Accepted at 2nd Workshop on Learning with Few or without Annotated\n Face, Body and Gesture Data"},{"id":"http://arxiv.org/abs/2404.10335v2","updated":"2024-04-18T13:34:08Z","published":"2024-04-16T07:19:52Z","title":"Efficiently Adversarial Examples Generation for Visual-Language Models\n under Targeted Transfer Scenarios using Diffusion Models","summary":" Targeted transfer-based attacks involving adversarial examples pose a\nsignificant threat to large visual-language models (VLMs). However, the\nstate-of-the-art (SOTA) transfer-based attacks incur high costs due to\nexcessive iteration counts. Furthermore, the generated adversarial examples\nexhibit pronounced adversarial noise and demonstrate limited efficacy in\nevading defense methods such as DiffPure. To address these issues, inspired by\nscore matching, we introduce AdvDiffVLM, which utilizes diffusion models to\ngenerate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM\nemploys Adaptive Ensemble Gradient Estimation to modify the score during the\ndiffusion model's reverse generation process, ensuring the adversarial examples\nproduced contain natural adversarial semantics and thus possess enhanced\ntransferability. Simultaneously, to enhance the quality of adversarial examples\nfurther, we employ the GradCAM-guided Mask method to disperse adversarial\nsemantics throughout the image, rather than concentrating them in a specific\narea. Experimental results demonstrate that our method achieves a speedup\nranging from 10X to 30X compared to existing transfer-based attack methods,\nwhile maintaining superior quality of adversarial examples. Additionally, the\ngenerated adversarial examples possess strong transferability and exhibit\nincreased robustness against adversarial defense methods. Notably, AdvDiffVLM\ncan successfully attack commercial VLMs, including GPT-4V, in a black-box\nmanner.\n","authors":["Qi Guo","Shanmin Pang","Xiaojun Jia","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02286v3","updated":"2024-04-18T13:33:32Z","published":"2024-02-03T22:51:17Z","title":"Multi-Level Aggregation and Recursive Alignment Architecture for\n Efficient Parallel Inference Segmentation Network","summary":" Real-time semantic segmentation is a crucial research for real-world\napplications. However, many methods lay particular emphasis on reducing the\ncomputational complexity and model size, while largely sacrificing the\naccuracy. To tackle this problem, we propose a parallel inference network\ncustomized for semantic segmentation tasks to achieve a good trade-off between\nspeed and accuracy. We employ a shallow backbone to ensure real-time speed, and\npropose three core components to compensate for the reduced model capacity to\nimprove accuracy. 
Specifically, we first design a dual-pyramidal path\narchitecture (Multi-level Feature Aggregation Module, MFAM) to aggregate\nmulti-level features from the encoder to each scale, providing hierarchical\nclues for subsequent spatial alignment and corresponding in-network inference.\nThen, we build Recursive Alignment Module (RAM) by combining the flow-based\nalignment module with recursive upsampling architecture for accurate spatial\nalignment between multi-scale feature maps with half the computational\ncomplexity of the straightforward alignment method. Finally, we perform\nindependent parallel inference on the aligned features to obtain multi-scale\nscores, and adaptively fuse them through an attention-based Adaptive Scores\nFusion Module (ASFM) so that the final prediction can favor objects of multiple\nscales. Our framework shows a better balance between speed and accuracy than\nstate-of-the-art real-time methods on Cityscapes and CamVid datasets. We also\nconducted systematic ablation studies to gain insight into our motivation and\narchitectural design. Code is available at:\nhttps://github.com/Yanhua-Zhang/MFARANet.\n","authors":["Yanhua Zhang","Ke Zhang","Jingyu Wang","Yulin Wu","Wuwei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02286v3.pdf","comment":"15 pages, 9 figures and 12 Tables. Manuscript completed on April 30,\n 2022"},{"id":"http://arxiv.org/abs/2404.12172v1","updated":"2024-04-18T13:27:29Z","published":"2024-04-18T13:27:29Z","title":"How to Benchmark Vision Foundation Models for Semantic Segmentation?","summary":" Recent vision foundation models (VFMs) have demonstrated proficiency in\nvarious tasks but require supervised fine-tuning to perform the task of\nsemantic segmentation effectively. Benchmarking their performance is essential\nfor selecting current models and guiding future model developments for this\ntask. The lack of a standardized benchmark complicates comparisons. Therefore,\nthe primary objective of this paper is to study how VFMs should be benchmarked\nfor semantic segmentation. To do so, various VFMs are fine-tuned under various\nsettings, and the impact of individual settings on the performance ranking and\ntraining time is assessed. Based on the results, the recommendation is to\nfine-tune the ViT-B variants of VFMs with a 16x16 patch size and a linear\ndecoder, as these settings are representative of using a larger model, more\nadvanced decoder and smaller patch size, while reducing training time by more\nthan 13 times. Using multiple datasets for training and evaluation is also\nrecommended, as the performance ranking across datasets and domain shifts\nvaries. Linear probing, a common practice for some VFMs, is not recommended, as\nit is not representative of end-to-end fine-tuning. The benchmarking setup\nrecommended in this paper enables a performance analysis of VFMs for semantic\nsegmentation. The findings of such an analysis reveal that pretraining with\npromptable segmentation is not beneficial, whereas masked image modeling (MIM)\nwith abstract representations is crucial, even more important than the type of\nsupervision used. 
The code for efficiently fine-tuning VFMs for semantic\nsegmentation can be accessed through the project page at:\nhttps://tue-mps.github.io/benchmark-vfm-ss/.\n","authors":["Tommie Kerssies","Daan de Geus","Gijs Dubbelman"],"pdf_url":"https://arxiv.org/pdf/2404.12172v1.pdf","comment":"CVPR 2024 Workshop Proceedings for the Second Workshop on Foundation\n Models"},{"id":"http://arxiv.org/abs/2404.12168v1","updated":"2024-04-18T13:22:56Z","published":"2024-04-18T13:22:56Z","title":"Real-World Efficient Blind Motion Deblurring via Blur Pixel\n Discretization","summary":" As recent advances in mobile camera technology have enabled the capability to\ncapture high-resolution images, such as 4K images, the demand for an efficient\ndeblurring model handling large motion has increased. In this paper, we\ndiscover that the image residual errors, i.e., blur-sharp pixel differences,\ncan be grouped into some categories according to their motion blur type and how\ncomplex their neighboring pixels are. Inspired by this, we decompose the\ndeblurring (regression) task into blur pixel discretization (pixel-level blur\nclassification) and discrete-to-continuous conversion (regression with blur\nclass map) tasks. Specifically, we generate the discretized image residual\nerrors by identifying the blur pixels and then transform them to a continuous\nform, which is computationally more efficient than naively solving the original\nregression problem with continuous values. Here, we found that the\ndiscretization result, i.e., blur segmentation map, remarkably exhibits visual\nsimilarity with the image residual errors. As a result, our efficient model\nshows comparable performance to state-of-the-art methods in realistic\nbenchmarks, while our method is up to 10 times computationally more efficient.\n","authors":["Insoo Kim","Jae Seok Choi","Geonseok Seo","Kinam Kwon","Jinwoo Shin","Hyong-Euk Lee"],"pdf_url":"https://arxiv.org/pdf/2404.12168v1.pdf","comment":"CVPR2024 Camera-Ready"},{"id":"http://arxiv.org/abs/2311.17116v4","updated":"2024-04-18T13:03:44Z","published":"2023-11-28T12:14:22Z","title":"REF$^2$-NeRF: Reflection and Refraction aware Neural Radiance Field","summary":" Recently, significant progress has been made in the study of methods for 3D\nreconstruction from multiple images using implicit neural representations,\nexemplified by the neural radiance field (NeRF) method. Such methods, which are\nbased on volume rendering, can model various light phenomena, and various\nextended methods have been proposed to accommodate different scenes and\nsituations. However, when handling scenes with multiple glass objects, e.g.,\nobjects in a glass showcase, modeling the target scene accurately has been\nchallenging due to the presence of multiple reflection and refraction effects.\nThus, this paper proposes a NeRF-based modeling method for scenes containing a\nglass case. In the proposed method, refraction and reflection are modeled using\nelements that are dependent and independent of the viewer's perspective. This\napproach allows us to estimate the surfaces where refraction occurs, i.e.,\nglass surfaces, and enables the separation and modeling of both direct and\nreflected light components. The proposed method requires predetermined camera\nposes, but accurately estimating these poses in scenes with glass objects is\ndifficult. Therefore, we used a robotic arm with an attached camera to acquire\nimages with known poses. 
Compared to existing methods, the proposed method\nenables more accurate modeling of both glass refraction and the overall scene.\n","authors":["Wooseok Kim","Taiki Fukiage","Takeshi Oishi"],"pdf_url":"https://arxiv.org/pdf/2311.17116v4.pdf","comment":"10 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.12154v1","updated":"2024-04-18T12:58:55Z","published":"2024-04-18T12:58:55Z","title":"StyleBooth: Image Style Editing with Multimodal Instruction","summary":" Given an original image, image editing aims to generate an image that aligns\nwith the provided instruction. The challenges are to accept multimodal inputs\nas instructions and a scarcity of high-quality training data, including crucial\ntriplets of source/target image pairs and multimodal (text and image)\ninstructions. In this paper, we focus on image style editing and present\nStyleBooth, a method that proposes a comprehensive framework for image editing\nand a feasible strategy for building a high-quality style editing dataset. We\nintegrate encoded textual instruction and image exemplar as a unified condition\nfor the diffusion model, enabling the editing of the original image following\nmultimodal instructions. Furthermore, by iterative style-destyle tuning and\nediting and usability filtering, the StyleBooth dataset provides\ncontent-consistent stylized/plain image pairs in various categories of styles.\nTo show the flexibility of StyleBooth, we conduct experiments on diverse tasks,\nsuch as text-based style editing, exemplar-based style editing and\ncompositional style editing. The results demonstrate that the quality and\nvariety of training data significantly enhance the ability to preserve content\nand improve the overall quality of generated images in editing tasks. Project\npage can be found at https://ali-vilab.github.io/stylebooth-page/.\n","authors":["Zhen Han","Chaojie Mao","Zeyinzi Jiang","Yulin Pan","Jingfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15260v3","updated":"2024-04-18T12:44:56Z","published":"2023-11-26T10:27:22Z","title":"NeuRAD: Neural Rendering for Autonomous Driving","summary":" Neural radiance fields (NeRFs) have gained popularity in the autonomous\ndriving (AD) community. Recent methods show NeRFs' potential for closed-loop\nsimulation, enabling testing of AD systems, and as an advanced training data\naugmentation technique. However, existing methods often require long training\ntimes, dense semantic supervision, or lack generalizability. This, in turn,\nhinders the application of NeRFs for AD at scale. In this paper, we propose\nNeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our\nmethod features simple network design, extensive sensor modeling for both\ncamera and lidar -- including rolling shutter, beam divergence and ray dropping\n-- and is applicable to multiple datasets out of the box. We verify its\nperformance on five popular AD datasets, achieving state-of-the-art performance\nacross the board. To encourage further development, we will openly release the\nNeuRAD source code. 
See https://github.com/georghess/NeuRAD .\n","authors":["Adam Tonderski","Carl Lindström","Georg Hess","William Ljungbergh","Lennart Svensson","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2311.15260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12139v1","updated":"2024-04-18T12:41:33Z","published":"2024-04-18T12:41:33Z","title":"Omniview-Tuning: Boosting Viewpoint Invariance of Vision-Language\n Pre-training Models","summary":" Vision-Language Pre-training (VLP) models like CLIP have achieved remarkable\nsuccess in computer vision and particularly demonstrated superior robustness to\ndistribution shifts of 2D images. However, their robustness under 3D viewpoint\nvariations is still limited, which can hinder the development for real-world\napplications. This paper successfully addresses this concern while keeping\nVLPs' original performance by breaking through two primary obstacles: 1) the\nscarcity of training data and 2) the suboptimal fine-tuning paradigms. To\ncombat data scarcity, we build the Multi-View Caption (MVCap) dataset -- a\ncomprehensive collection of over four million multi-view image-text pairs\nacross more than 100K objects, providing more potential for VLP models to\ndevelop generalizable viewpoint-invariant representations. To address the\nlimitations of existing paradigms in performance trade-offs and training\nefficiency, we design a novel fine-tuning framework named Omniview-Tuning\n(OVT). Specifically, OVT introduces a Cross-Viewpoint Alignment objective\nthrough a minimax-like optimization strategy, which effectively aligns\nrepresentations of identical objects from diverse viewpoints without causing\noverfitting. Additionally, OVT fine-tunes VLP models in a parameter-efficient\nmanner, leading to minimal computational cost. Extensive experiments on various\nVLP models with different architectures validate that OVT significantly\nimproves the models' resilience to viewpoint shifts and keeps the original\nperformance, establishing a pioneering standard for boosting the viewpoint\ninvariance of VLP models.\n","authors":["Shouwei Ruan","Yinpeng Dong","Hanqing Liu","Yao Huang","Hang Su","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2404.12139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.06634v2","updated":"2024-04-18T12:39:15Z","published":"2023-11-11T18:32:06Z","title":"Back to Basics: Fast Denoising Iterative Algorithm","summary":" We introduce Back to Basics (BTB), a fast iterative algorithm for noise\nreduction. Our method is computationally efficient, does not require training\nor ground truth data, and can be applied in the presence of independent noise,\nas well as correlated (coherent) noise, where the noise level is unknown. We\nexamine three study cases: natural image denoising in the presence of additive\nwhite Gaussian noise, Poisson-distributed image denoising, and speckle\nsuppression in optical coherence tomography (OCT). Experimental results\ndemonstrate that the proposed approach can effectively improve image quality,\nin challenging noise settings. 
Theoretical guarantees are provided for\nconvergence stability.\n","authors":["Deborah Pereg"],"pdf_url":"https://arxiv.org/pdf/2311.06634v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12130v1","updated":"2024-04-18T12:31:48Z","published":"2024-04-18T12:31:48Z","title":"One-Shot Sequential Federated Learning for Non-IID Data by Enhancing\n Local Model Diversity","summary":" Traditional federated learning mainly focuses on parallel settings (PFL),\nwhich can suffer significant communication and computation costs. In contrast,\none-shot and sequential federated learning (SFL) have emerged as innovative\nparadigms to alleviate these costs. However, the issue of non-IID (Independent\nand Identically Distributed) data persists as a significant challenge in\none-shot and SFL settings, exacerbated by the restricted communication between\nclients. In this paper, we improve the one-shot sequential federated learning\nfor non-IID data by proposing a local model diversity-enhancing strategy.\nSpecifically, to leverage the potential of local model diversity for improving\nmodel performance, we introduce a local model pool for each client that\ncomprises diverse models generated during local training, and propose two\ndistance measurements to further enhance the model diversity and mitigate the\neffect of non-IID data. Consequently, our proposed framework can improve the\nglobal model performance while maintaining low communication costs. Extensive\nexperiments demonstrate that our method exhibits superior performance to\nexisting one-shot PFL methods and achieves better accuracy compared with\nstate-of-the-art one-shot SFL methods on both label-skew and domain-shift tasks\n(e.g., 6%+ accuracy improvement on the CIFAR-10 dataset).\n","authors":["Naibo Wang","Yuchen Deng","Wenjie Feng","Shichen Fan","Jianwei Yin","See-Kiong Ng"],"pdf_url":"https://arxiv.org/pdf/2404.12130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12120v1","updated":"2024-04-18T12:13:09Z","published":"2024-04-18T12:13:09Z","title":"Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors","summary":" This paper presents RADAR-Robust Adversarial Detection via Adversarial\nRetraining-an approach designed to enhance the robustness of adversarial\ndetectors against adaptive attacks, while maintaining classifier performance.\nAn adaptive attack is one where the attacker is aware of the defenses and\nadapts their strategy accordingly. Our proposed method leverages adversarial\ntraining to reinforce the ability to detect attacks, without compromising clean\naccuracy. During the training phase, we integrate into the dataset adversarial\nexamples, which were optimized to fool both the classifier and the adversarial\ndetector, enabling the adversarial detector to learn and adapt to potential\nattack scenarios. 
Experimental evaluations on the CIFAR-10 and SVHN datasets\ndemonstrate that our proposed algorithm significantly improves a detector's\nability to accurately identify adaptive adversarial attacks -- without\nsacrificing clean accuracy.\n","authors":["Raz Lapid","Almog Dubin","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2404.12120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08182v2","updated":"2024-04-18T11:57:49Z","published":"2023-10-12T10:17:40Z","title":"XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness\n Evaluation","summary":" Despite the promising performance of existing visual models on public\nbenchmarks, the critical assessment of their robustness for real-world\napplications remains an ongoing challenge. To bridge this gap, we propose an\nexplainable visual dataset, XIMAGENET-12, to evaluate the robustness of visual\nmodels. XIMAGENET-12 consists of over 200K images with 15,410 manual semantic\nannotations. Specifically, we deliberately selected 12 categories from\nImageNet, representing objects commonly encountered in practical life. To\nsimulate real-world situations, we incorporated six diverse scenarios, such as\noverexposure, blurring, and color changes, etc. We further develop a\nquantitative criterion for robustness assessment, allowing for a nuanced\nunderstanding of how visual models perform under varying conditions, notably in\nrelation to the background. We make the XIMAGENET-12 dataset and its\ncorresponding code openly accessible at\n\\url{https://sites.google.com/view/ximagenet-12/home}. We expect the\nintroduction of the XIMAGENET-12 dataset will empower researchers to thoroughly\nevaluate the robustness of their visual models under challenging conditions.\n","authors":["Qiang Li","Dan Zhang","Shengzhao Lei","Xun Zhao","Porawit Kamnoedboon","WeiWei Li","Junhao Dong","Shuyan Li"],"pdf_url":"https://arxiv.org/pdf/2310.08182v2.pdf","comment":"Paper accepted by Synthetic Data for Computer Vision Workshop @ IEEE\n CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06211v2","updated":"2024-04-18T11:52:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. 
Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.12104v1","updated":"2024-04-18T11:38:25Z","published":"2024-04-18T11:38:25Z","title":"Ethical-Lens: Curbing Malicious Usages of Open-Source Text-to-Image\n Models","summary":" The burgeoning landscape of text-to-image models, exemplified by innovations\nsuch as Midjourney and DALLE 3, has revolutionized content creation across\ndiverse sectors. However, these advancements bring forth critical ethical\nconcerns, particularly with the misuse of open-source models to generate\ncontent that violates societal norms. Addressing this, we introduce\nEthical-Lens, a framework designed to facilitate the value-aligned usage of\ntext-to-image tools without necessitating internal model revision. Ethical-Lens\nensures value alignment in text-to-image models across toxicity and bias\ndimensions by refining user commands and rectifying model outputs. Systematic\nevaluation metrics, combining GPT4-V, HEIM, and FairFace scores, assess\nalignment capability. Our experiments reveal that Ethical-Lens enhances\nalignment capabilities to levels comparable with or superior to commercial\nmodels like DALLE 3, ensuring user-generated content adheres to ethical\nstandards while maintaining image quality. This study indicates the potential\nof Ethical-Lens to ensure the sustainable development of open-source\ntext-to-image tools and their beneficial integration into society. Our code is\navailable at https://github.com/yuzhu-cai/Ethical-Lens.\n","authors":["Yuzhu Cai","Sheng Yin","Yuxi Wei","Chenxin Xu","Weibo Mao","Felix Juefei-Xu","Siheng Chen","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12104v1.pdf","comment":"42 pages, 17 figures, 29 tables"},{"id":"http://arxiv.org/abs/2404.12103v1","updated":"2024-04-18T11:36:37Z","published":"2024-04-18T11:36:37Z","title":"S3R-Net: A Single-Stage Approach to Self-Supervised Shadow Removal","summary":" In this paper we present S3R-Net, the Self-Supervised Shadow Removal Network.\nThe two-branch WGAN model achieves self-supervision relying on the\nunify-and-adapt phenomenon - it unifies the style of the output data and infers\nits characteristics from a database of unaligned shadow-free reference images.\nThis approach stands in contrast to the large body of supervised frameworks.\nS3R-Net also differentiates itself from the few existing self-supervised models\noperating in a cycle-consistent manner, as it is a non-cyclic, unidirectional\nsolution. The proposed framework achieves comparable numerical scores to recent\nself-supervised shadow removal models while exhibiting superior qualitative\nperformance and keeping the computational cost low.\n","authors":["Nikolina Kubiak","Armin Mustafa","Graeme Phillipson","Stephen Jolly","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2404.12103v1.pdf","comment":"NTIRE workshop @ CVPR 2024. 
Code & models available at\n https://github.com/n-kubiak/S3R-Net"},{"id":"http://arxiv.org/abs/2303.13959v4","updated":"2024-04-18T11:31:00Z","published":"2023-03-24T12:33:44Z","title":"Bridging Stereo Geometry and BEV Representation with Reliable Mutual\n Interaction for Semantic Scene Completion","summary":" 3D semantic scene completion (SSC) is an ill-posed perception task that\nrequires inferring a dense 3D scene from limited observations. Previous\ncamera-based methods struggle to predict accurate semantic scenes due to\ninherent geometric ambiguity and incomplete observations. In this paper, we\nresort to stereo matching technique and bird's-eye-view (BEV) representation\nlearning to address such issues in SSC. Complementary to each other, stereo\nmatching mitigates geometric ambiguity with epipolar constraint while BEV\nrepresentation enhances the hallucination ability for invisible regions with\nglobal semantic context. However, due to the inherent representation gap\nbetween stereo geometry and BEV features, it is non-trivial to bridge them for\ndense prediction task of SSC. Therefore, we further develop a unified\noccupancy-based framework dubbed BRGScene, which effectively bridges these two\nrepresentations with dense 3D volumes for reliable semantic scene completion.\nSpecifically, we design a novel Mutual Interactive Ensemble (MIE) block for\npixel-level reliable aggregation of stereo geometry and BEV features. Within\nthe MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced\nwith confidence re-weighting, is employed to encourage fine-grained interaction\nthrough mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is\nintroduced to facilitate complementary aggregation through channel-wise\nrecalibration and multi-group voting. Our method outperforms all published\ncamera-based methods on SemanticKITTI for semantic scene completion. Our code\nis available on \\url{https://github.com/Arlo0o/StereoScene}.\n","authors":["Bohan Li","Yasheng Sun","Zhujin Liang","Dalong Du","Zhuanghui Zhang","Xiaofeng Wang","Yunnan Wang","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.13959v4.pdf","comment":"IJCAI2024"},{"id":"http://arxiv.org/abs/2404.12091v1","updated":"2024-04-18T11:20:53Z","published":"2024-04-18T11:20:53Z","title":"Harnessing Joint Rain-/Detail-aware Representations to Eliminate\n Intricate Rains","summary":" Recent advances in image deraining have focused on training powerful models\non mixed multiple datasets comprising diverse rain types and backgrounds.\nHowever, this approach tends to overlook the inherent differences among rainy\nimages, leading to suboptimal results. To overcome this limitation, we focus on\naddressing various rainy images by delving into meaningful representations that\nencapsulate both the rain and background components. Leveraging these\nrepresentations as instructive guidance, we put forth a Context-based\nInstance-level Modulation (CoI-M) mechanism adept at efficiently modulating\nCNN- or Transformer-based models. Furthermore, we devise a rain-/detail-aware\ncontrastive learning strategy to help extract joint rain-/detail-aware\nrepresentations. By integrating CoI-M with the rain-/detail-aware Contrastive\nlearning, we develop CoIC, an innovative and potent algorithm tailored for\ntraining models on mixed datasets. 
Moreover, CoIC offers insight into modeling\nrelationships of datasets, quantitatively assessing the impact of rain and\ndetails on restoration, and unveiling distinct behaviors of models given\ndiverse inputs. Extensive experiments validate the efficacy of CoIC in boosting\nthe deraining ability of CNN and Transformer models. CoIC also enhances the\nderaining prowess remarkably when real-world dataset is included.\n","authors":["Wu Ran","Peirong Ma","Zhiquan He","Hao Ren","Hong Lu"],"pdf_url":"https://arxiv.org/pdf/2404.12091v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.12083v1","updated":"2024-04-18T11:09:25Z","published":"2024-04-18T11:09:25Z","title":"MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye\n tracking","summary":" Event-based eye tracking has shown great promise with the high temporal\nresolution and low redundancy provided by the event camera. However, the\ndiversity and abruptness of eye movement patterns, including blinking,\nfixating, saccades, and smooth pursuit, pose significant challenges for eye\nlocalization. To achieve a stable event-based eye-tracking system, this paper\nproposes a bidirectional long-term sequence modeling and time-varying state\nselection mechanism to fully utilize contextual temporal information in\nresponse to the variability of eye movements. Specifically, the MambaPupil\nnetwork is proposed, which consists of the multi-layer convolutional encoder to\nextract features from the event representations, a bidirectional Gated\nRecurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM),\nto selectively capture contextual correlation from the forward and backward\ntemporal relationship. Furthermore, the Bina-rep is utilized as a compact event\nrepresentation, and the tailor-made data augmentation, called as Event-Cutout,\nis proposed to enhance the model's robustness by applying spatial random\nmasking to the event image. The evaluation on the ThreeET-plus benchmark shows\nthe superior performance of the MambaPupil, which secured the 1st place in\nCVPR'2024 AIS Event-based Eye Tracking challenge.\n","authors":["Zhong Wang","Zengyu Wan","Han Han","Bohao Liao","Yuliang Wu","Wei Zhai","Yang Cao","Zheng-jun Zha"],"pdf_url":"https://arxiv.org/pdf/2404.12083v1.pdf","comment":"Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for\n Streaming), top solution of challenge Event-based Eye Tracking, see\n https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024"},{"id":"http://arxiv.org/abs/2404.12081v1","updated":"2024-04-18T11:05:15Z","published":"2024-04-18T11:05:15Z","title":"MaskCD: A Remote Sensing Change Detection Network Based on Mask\n Classification","summary":" Change detection (CD) from remote sensing (RS) images using deep learning has\nbeen widely investigated in the literature. It is typically regarded as a\npixel-wise labeling task that aims to classify each pixel as changed or\nunchanged. Although per-pixel classification networks in encoder-decoder\nstructures have shown dominance, they still suffer from imprecise boundaries\nand incomplete object delineation at various scenes. For high-resolution RS\nimages, partly or totally changed objects are more worthy of attention rather\nthan a single pixel. Therefore, we revisit the CD task from the mask prediction\nand classification perspective and propose MaskCD to detect changed areas by\nadaptively generating categorized masks from input image pairs. 
Specifically,\nit utilizes a cross-level change representation perceiver (CLCRP) to learn\nmultiscale change-aware representations and capture spatiotemporal relations\nfrom encoded features by exploiting deformable multihead self-attention\n(DeformMHSA). Subsequently, a masked-attention-based detection transformers\n(MA-DETR) decoder is developed to accurately locate and identify changed\nobjects based on masked attention and self-attention mechanisms. It\nreconstructs the desired changed objects by decoding the pixel-wise\nrepresentations into learnable mask proposals and making final predictions from\nthese candidates. Experimental results on five benchmark datasets demonstrate\nthe proposed approach outperforms other state-of-the-art models. Codes and\npretrained models are available online (https://github.com/EricYu97/MaskCD).\n","authors":["Weikang Yu","Xiaokang Zhang","Samiran Das","Xiao Xiang Zhu","Pedram Ghamisi"],"pdf_url":"https://arxiv.org/pdf/2404.12081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15663v2","updated":"2024-04-18T10:48:15Z","published":"2024-01-28T13:59:58Z","title":"Low-resolution Prior Equilibrium Network for CT Reconstruction","summary":" The unrolling method has been investigated for learning variational models in\nX-ray computed tomography. However, it has been observed that directly\nunrolling the regularization model through gradient descent does not produce\nsatisfactory results. In this paper, we present a novel deep learning-based CT\nreconstruction model, where the low-resolution image is introduced to obtain an\neffective regularization term for improving the network`s robustness. Our\napproach involves constructing the backbone network architecture by algorithm\nunrolling that is realized using the deep equilibrium architecture. We\ntheoretically discuss the convergence of the proposed low-resolution prior\nequilibrium model and provide the conditions to guarantee convergence.\nExperimental results on both sparse-view and limited-angle reconstruction\nproblems are provided, demonstrating that our end-to-end low-resolution prior\nequilibrium model outperforms other state-of-the-art methods in terms of noise\nreduction, contrast-to-noise ratio, and preservation of edge details.\n","authors":["Yijie Yang","Qifeng Gao","Yuping Duan"],"pdf_url":"https://arxiv.org/pdf/2401.15663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04519v3","updated":"2024-04-18T10:40:35Z","published":"2023-12-07T18:38:39Z","title":"Bootstrapping Autonomous Driving Radars with Self-Supervised Learning","summary":" The perception of autonomous vehicles using radars has attracted increased\nresearch interest due its ability to operate in fog and bad weather. However,\ntraining radar models is hindered by the cost and difficulty of annotating\nlarge-scale radar data. To overcome this bottleneck, we propose a\nself-supervised learning framework to leverage the large amount of unlabeled\nradar data to pre-train radar-only embeddings for self-driving perception\ntasks. The proposed method combines radar-to-radar and radar-to-vision\ncontrastive losses to learn a general representation from unlabeled radar\nheatmaps paired with their corresponding camera images. When used for\ndownstream object detection, we demonstrate that the proposed self-supervision\nframework can improve the accuracy of state-of-the-art supervised baselines by\n$5.8\\%$ in mAP. 
Code is available at \\url{https://github.com/yiduohao/Radical}.\n","authors":["Yiduo Hao","Sohrab Madani","Junfeng Guan","Mohammed Alloulah","Saurabh Gupta","Haitham Hassanieh"],"pdf_url":"https://arxiv.org/pdf/2312.04519v3.pdf","comment":"12 pages, 5 figures, to be published in Proceedings of the IEEE/CVF\n Conference on Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2404.12064v1","updated":"2024-04-18T10:23:10Z","published":"2024-04-18T10:23:10Z","title":"PureForest: A Large-scale Aerial Lidar and Aerial Imagery Dataset for\n Tree Species Classification in Monospecific Forests","summary":" Knowledge of tree species distribution is fundamental to managing forests.\nNew deep learning approaches promise significant accuracy gains for forest\nmapping, and are becoming a critical tool for mapping multiple tree species at\nscale. To advance the field, deep learning researchers need large benchmark\ndatasets with high-quality annotations. To this end, we present the PureForest\ndataset: a large-scale, open, multimodal dataset designed for tree species\nclassification from both Aerial Lidar Scanning (ALS) point clouds and Very High\nResolution (VHR) aerial images. Most current public Lidar datasets for tree\nspecies classification have low diversity as they only span a small area of a\nfew dozen annotated hectares at most. In contrast, PureForest has 18 tree\nspecies grouped into 13 semantic classes, and spans 339 km$^2$ across 449\ndistinct monospecific forests, and is to date the largest and most\ncomprehensive Lidar dataset for the identification of tree species. By making\nPureForest publicly available, we hope to provide a challenging benchmark\ndataset to support the development of deep learning approaches for tree species\nidentification from Lidar and/or aerial imagery. In this data paper, we\ndescribe the annotation workflow, the dataset, the recommended evaluation\nmethodology, and establish a baseline performance from both 3D and 2D\nmodalities.\n","authors":["Charles Gaydon","Floryne Roche"],"pdf_url":"https://arxiv.org/pdf/2404.12064v1.pdf","comment":"14 pages | 5 figures | Dataset is available at\n http://huggingface.co/datasets/IGNF/PureForest"},{"id":"http://arxiv.org/abs/2404.12062v1","updated":"2024-04-18T10:20:37Z","published":"2024-04-18T10:20:37Z","title":"MIDGET: Music Conditioned 3D Dance Generation","summary":" In this paper, we introduce a MusIc conditioned 3D Dance GEneraTion model,\nnamed MIDGET based on Dance motion Vector Quantised Variational AutoEncoder\n(VQ-VAE) model and Motion Generative Pre-Training (GPT) model to generate\nvibrant and highquality dances that match the music rhythm. To tackle\nchallenges in the field, we introduce three new components: 1) a pre-trained\nmemory codebook based on the Motion VQ-VAE model to store different human pose\ncodes, 2) employing Motion GPT model to generate pose codes with music and\nmotion Encoders, 3) a simple framework for music feature extraction. We compare\nwith existing state-of-the-art models and perform ablation experiments on\nAIST++, the largest publicly available music-dance dataset. 
Experiments\ndemonstrate that our proposed framework achieves state-of-the-art performance\non motion quality and its alignment with the music.\n","authors":["Jinwu Wang","Wei Mao","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12062v1.pdf","comment":"12 pages, 6 figures Published in AI 2023: Advances in Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2312.16867v2","updated":"2024-04-18T10:14:31Z","published":"2023-12-28T07:37:11Z","title":"DualFluidNet: an Attention-based Dual-pipeline Network for FLuid\n Simulation","summary":" Fluid motion can be considered as a point cloud transformation when using the\nSPH method. Compared to traditional numerical analysis methods, using machine\nlearning techniques to learn physics simulations can achieve near-accurate\nresults, while significantly increasing efficiency. In this paper, we propose\nan innovative approach for 3D fluid simulations utilizing an Attention-based\nDual-pipeline Network, which employs a dual-pipeline architecture, seamlessly\nintegrated with an Attention-based Feature Fusion Module. Unlike previous\nmethods, which often make difficult trade-offs between global fluid control and\nphysical law constraints, we find a way to achieve a better balance between\nthese two crucial aspects with a well-designed dual-pipeline approach.\nAdditionally, we design a Type-aware Input Module to adaptively recognize\nparticles of different types and perform feature fusion afterward, such that\nfluid-solid coupling issues can be better dealt with. Furthermore, we propose a\nnew dataset, Tank3D, to further explore the network's ability to handle more\ncomplicated scenes. The experiments demonstrate that our approach not only\nattains a quantitative enhancement in various metrics, surpassing the\nstate-of-the-art methods but also signifies a qualitative leap in neural\nnetwork-based simulation by faithfully adhering to the physical laws. Code and\nvideo demonstrations are available at\nhttps://github.com/chenyu-xjtu/DualFluidNet.\n","authors":["Yu Chen","Shuai Zheng","Menglong Jin","Yan Chang","Nianyi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.16867v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.12055v1","updated":"2024-04-18T10:10:56Z","published":"2024-04-18T10:10:56Z","title":"Improving the perception of visual fiducial markers in the field using\n Adaptive Active Exposure Control","summary":" Accurate localization is fundamental for autonomous underwater vehicles\n(AUVs) to carry out precise tasks, such as manipulation and construction.\nVision-based solutions using fiducial marker are promising, but extremely\nchallenging underwater because of harsh lighting condition underwater. This\npaper introduces a gradient-based active camera exposure control method to\ntackle sharp lighting variations during image acquisition, which can establish\nbetter foundation for subsequent image enhancement procedures. Considering a\ntypical scenario for underwater operations where visual tags are used, we\nproposed several experiments comparing our method with other state-of-the-art\nexposure control method including Active Exposure Control (AEC) and\nGradient-based Exposure Control (GEC). Results show a significant improvement\nin the accuracy of robot localization. 
This method is an important component\nthat can be used in visual-based state estimation pipeline to improve the\noverall localization accuracy.\n","authors":["Ziang Ren","Samuel Lensgraf","Alberto Quattrini Li"],"pdf_url":"https://arxiv.org/pdf/2404.12055v1.pdf","comment":"Paper accepted by ISER 2023"},{"id":"http://arxiv.org/abs/2404.09624v2","updated":"2024-04-18T10:10:00Z","published":"2024-04-15T09:56:20Z","title":"AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics\n Perception","summary":" The highly abstract nature of image aesthetics perception (IAP) poses\nsignificant challenge for current multimodal large language models (MLLMs). The\nlack of human-annotated multi-modality aesthetic data further exacerbates this\ndilemma, resulting in MLLMs falling short of aesthetics perception\ncapabilities. To address the above challenge, we first introduce a\ncomprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)\ndataset, which serves as the footstone for building multi-modality aesthetics\nfoundation models. Specifically, to align MLLMs with human aesthetics\nperception, we construct a corpus-rich aesthetic critique database with 21,904\ndiverse-sourced images and 88K human natural language feedbacks, which are\ncollected via progressive questions, ranging from coarse-grained aesthetic\ngrades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle\ndiverse queries, we further prompt GPT to refine the aesthetic critiques and\nassemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT,\nwhich consists of 409K multi-typed instructions to activate stronger aesthetic\ncapabilities. Based on the AesMMIT database, we fine-tune the open-sourced\ngeneral foundation models, achieving multi-modality Aesthetic Expert models,\ndubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert\nmodels deliver significantly better aesthetic perception performances than the\nstate-of-the-art MLLMs, including the most advanced GPT-4V and\nGemini-Pro-Vision. Source data will be available at\nhttps://github.com/yipoh/AesExpert.\n","authors":["Yipo Huang","Xiangfei Sheng","Zhichao Yang","Quan Yuan","Zhichao Duan","Pengfei Chen","Leida Li","Weisi Lin","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12037v1","updated":"2024-04-18T09:44:56Z","published":"2024-04-18T09:44:56Z","title":"Data-free Knowledge Distillation for Fine-grained Visual Categorization","summary":" Data-free knowledge distillation (DFKD) is a promising approach for\naddressing issues related to model compression, security privacy, and\ntransmission restrictions. Although the existing methods exploiting DFKD have\nachieved inspiring achievements in coarse-grained classification, in practical\napplications involving fine-grained classification tasks that require more\ndetailed distinctions between similar categories, sub-optimal results are\nobtained. To address this issue, we propose an approach called DFKD-FGVC that\nextends DFKD to fine-grained visual categorization~(FGVC) tasks. Our approach\nutilizes an adversarial distillation framework with attention generator, mixed\nhigh-order attention distillation, and semantic feature contrast learning.\nSpecifically, we introduce a spatial-wise attention mechanism to the generator\nto synthesize fine-grained images with more details of discriminative parts. 
We\nalso utilize the mixed high-order attention mechanism to capture complex\ninteractions among parts and the subtle differences among discriminative\nfeatures of the fine-grained categories, paying attention to both local\nfeatures and semantic context relationships. Moreover, we leverage the teacher\nand student models of the distillation framework to contrast high-level\nsemantic feature maps in the hyperspace, comparing variances of different\ncategories. We evaluate our approach on three widely-used FGVC benchmarks\n(Aircraft, Cars196, and CUB200) and demonstrate its superior performance.\n","authors":["Renrong Shao","Wei Zhang","Jianhua Yin","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08277v2","updated":"2024-04-18T09:43:26Z","published":"2024-04-12T07:04:56Z","title":"FaceFilterSense: A Filter-Resistant Face Recognition and Facial\n Attribute Analysis Framework","summary":" With the advent of social media, fun selfie filters have come into tremendous\nmainstream use affecting the functioning of facial biometric systems as well as\nimage recognition systems. These filters vary from beautification filters and\nAugmented Reality (AR)-based filters to filters that modify facial landmarks.\nHence, there is a need to assess the impact of such filters on the performance\nof existing face recognition systems. The limitation associated with existing\nsolutions is that these solutions focus more on the beautification filters.\nHowever, the current AR-based filters and filters which distort facial key\npoints are in vogue recently and make the faces highly unrecognizable even to\nthe naked eye. Also, the filters considered are mostly obsolete with limited\nvariations. To mitigate these limitations, we aim to perform a holistic impact\nanalysis of the latest filters and propose an user recognition model with the\nfiltered images. We have utilized a benchmark dataset for baseline images, and\napplied the latest filters over them to generate a beautified/filtered dataset.\nNext, we have introduced a model FaceFilterNet for beautified user recognition.\nIn this framework, we also utilize our model to comment on various attributes\nof the person including age, gender, and ethnicity. In addition, we have also\npresented a filter-wise impact analysis on face recognition, age estimation,\ngender, and ethnicity prediction. The proposed method affirms the efficacy of\nour dataset with an accuracy of 87.25% and an optimal accuracy for facial\nattribute analysis.\n","authors":["Shubham Tiwari","Yash Sethia","Ritesh Kumar","Ashwani Tanwar","Rudresh Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.08277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12031v1","updated":"2024-04-18T09:31:03Z","published":"2024-04-18T09:31:03Z","title":"MLS-Track: Multilevel Semantic Interaction in RMOT","summary":" The new trend in multi-object tracking task is to track objects of interest\nusing natural language. However, the scarcity of paired prompt-instance data\nhinders its progress. To address this challenge, we propose a high-quality yet\nlow-cost data generation method base on Unreal Engine 5 and construct a\nbrand-new benchmark dataset, named Refer-UE-City, which primarily includes\nscenes from intersection surveillance videos, detailing the appearance and\nactions of people and vehicles. Specifically, it provides 14 videos with a\ntotal of 714 expressions, and is comparable in scale to the Refer-KITTI\ndataset. 
Additionally, we propose a multi-level semantic-guided multi-object\nframework called MLS-Track, where the interaction between the model and text is\nenhanced layer by layer through the introduction of Semantic Guidance Module\n(SGM) and Semantic Correlation Branch (SCB). Extensive experiments on\nRefer-UE-City and Refer-KITTI datasets demonstrate the effectiveness of our\nproposed framework and it achieves state-of-the-art performance. Code and\ndatatsets will be available.\n","authors":["Zeliang Ma","Song Yang","Zhe Cui","Zhicheng Zhao","Fei Su","Delong Liu","Jingyu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12031v1.pdf","comment":"17 pages 8 figures"},{"id":"http://arxiv.org/abs/2404.12024v1","updated":"2024-04-18T09:21:16Z","published":"2024-04-18T09:21:16Z","title":"Meta-Auxiliary Learning for Micro-Expression Recognition","summary":" Micro-expressions (MEs) are involuntary movements revealing people's hidden\nfeelings, which has attracted numerous interests for its objectivity in emotion\ndetection. However, despite its wide applications in various scenarios,\nmicro-expression recognition (MER) remains a challenging problem in real life\ndue to three reasons, including (i) data-level: lack of data and imbalanced\nclasses, (ii) feature-level: subtle, rapid changing, and complex features of\nMEs, and (iii) decision-making-level: impact of individual differences. To\naddress these issues, we propose a dual-branch meta-auxiliary learning method,\ncalled LightmanNet, for fast and robust micro-expression recognition.\nSpecifically, LightmanNet learns general MER knowledge from limited data\nthrough a dual-branch bi-level optimization process: (i) In the first level, it\nobtains task-specific MER knowledge by learning in two branches, where the\nfirst branch is for learning MER features via primary MER tasks, while the\nother branch is for guiding the model obtain discriminative features via\nauxiliary tasks, i.e., image alignment between micro-expressions and\nmacro-expressions since their resemblance in both spatial and temporal\nbehavioral patterns. The two branches of learning jointly constrain the model\nof learning meaningful task-specific MER knowledge while avoiding learning\nnoise or superficial connections between MEs and emotions that may damage its\ngeneralization ability. (ii) In the second level, LightmanNet further refines\nthe learned task-specific knowledge, improving model generalization and\nefficiency. Extensive experiments on various benchmark datasets demonstrate the\nsuperior robustness and efficiency of LightmanNet.\n","authors":["Jingyao Wang","Yunhan Tian","Yuxuan Yang","Xiaoxin Chen","Changwen Zheng","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2404.12024v1.pdf","comment":"10 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.12020v1","updated":"2024-04-18T09:16:02Z","published":"2024-04-18T09:16:02Z","title":"Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question\n Answering","summary":" Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning\ntask, demanding intelligent systems to accurately respond to natural language\nqueries based on audio-video input pairs. Nevertheless, prevalent AVQA\napproaches are prone to overlearning dataset biases, resulting in poor\nrobustness. Furthermore, current datasets may not provide a precise diagnostic\nfor these methods. 
To tackle these challenges, firstly, we propose a novel\ndataset, \\textit{MUSIC-AVQA-R}, crafted in two steps: rephrasing questions\nwithin the test split of a public dataset (\\textit{MUSIC-AVQA}) and\nsubsequently introducing distribution shifts to split questions. The former\nleads to a large, diverse test space, while the latter results in a\ncomprehensive robustness evaluation on rare, frequent, and overall questions.\nSecondly, we propose a robust architecture that utilizes a multifaceted cycle\ncollaborative debiasing strategy to overcome bias learning. Experimental\nresults show that this architecture achieves state-of-the-art performance on\nboth datasets, especially obtaining a significant improvement of 9.68\\% on the\nproposed dataset. Extensive ablation experiments are conducted on these two\ndatasets to validate the effectiveness of the debiasing strategy. Additionally,\nwe highlight the limited robustness of existing multi-modal QA methods through\nthe evaluation on our dataset.\n","authors":["Jie Ma","Min Hu","Pinghui Wang","Wangchun Sun","Lingyun Song","Hongbin Pei","Jun Liu","Youtian Du"],"pdf_url":"https://arxiv.org/pdf/2404.12020v1.pdf","comment":"16 pages, 9 figures,5 Tables"},{"id":"http://arxiv.org/abs/2404.12015v1","updated":"2024-04-18T09:06:05Z","published":"2024-04-18T09:06:05Z","title":"What does CLIP know about peeling a banana?","summary":" Humans show an innate capability to identify tools to support specific\nactions. The association between objects parts and the actions they facilitate\nis usually named affordance. Being able to segment objects parts depending on\nthe tasks they afford is crucial to enable intelligent robots to use objects of\ndaily living. Traditional supervised learning methods for affordance\nsegmentation require costly pixel-level annotations, while weakly supervised\napproaches, though less demanding, still rely on object-interaction examples\nand support a closed set of actions. These limitations hinder scalability, may\nintroduce biases, and usually restrict models to a limited set of predefined\nactions. This paper proposes AffordanceCLIP, to overcome these limitations by\nleveraging the implicit affordance knowledge embedded within large pre-trained\nVision-Language models like CLIP. We experimentally demonstrate that CLIP,\nalthough not explicitly trained for affordances detection, retains valuable\ninformation for the task. Our AffordanceCLIP achieves competitive zero-shot\nperformance compared to methods with specialized training, while offering\nseveral advantages: i) it works with any action prompt, not just a predefined\nset; ii) it requires training only a small number of additional parameters\ncompared to existing solutions and iii) eliminates the need for direct\nsupervision on action-object pairs, opening new perspectives for\nfunctionality-based reasoning of models.\n","authors":["Claudia Cuttano","Gabriele Rosi","Gabriele Trivigno","Giuseppe Averta"],"pdf_url":"https://arxiv.org/pdf/2404.12015v1.pdf","comment":"Accepted to MAR Workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2311.09104v2","updated":"2024-04-18T09:03:04Z","published":"2023-11-15T16:51:18Z","title":"Cross-view and Cross-pose Completion for 3D Human Understanding","summary":" Human perception and understanding is a major domain of computer vision\nwhich, like many other vision subdomains recently, stands to gain from the use\nof large models pre-trained on large datasets. 
We hypothesize that the most\ncommon pre-training strategy of relying on general purpose, object-centric\nimage datasets such as ImageNet, is limited by an important domain shift. On\nthe other hand, collecting domain-specific ground truth such as 2D or 3D labels\ndoes not scale well. Therefore, we propose a pre-training approach based on\nself-supervised learning that works on human-centric data using only images.\nOur method uses pairs of images of humans: the first is partially masked and\nthe model is trained to reconstruct the masked parts given the visible ones and\na second image. It relies on both stereoscopic (cross-view) pairs, and temporal\n(cross-pose) pairs taken from videos, in order to learn priors about 3D as well\nas human motion. We pre-train a model for body-centric tasks and one for\nhand-centric tasks. With a generic transformer architecture, these models\noutperform existing self-supervised pre-training methods on a wide set of\nhuman-centric downstream tasks, and obtain state-of-the-art performance for\ninstance when fine-tuning for model-based and model-free human mesh recovery.\n","authors":["Matthieu Armando","Salma Galaaoui","Fabien Baradel","Thomas Lucas","Vincent Leroy","Romain Brégier","Philippe Weinzaepfel","Grégory Rogez"],"pdf_url":"https://arxiv.org/pdf/2311.09104v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2303.12307v4","updated":"2024-04-18T08:54:01Z","published":"2023-03-22T04:49:23Z","title":"Predicting and Enhancing the Fairness of DNNs with the Curvature of\n Perceptual Manifolds","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we first establish a geometric\nperspective for analyzing model fairness and then systematically propose a\nseries of geometric measurements for perceptual manifolds in deep neural\nnetworks. Subsequently, we comprehensively explore the effect of the geometric\ncharacteristics of perceptual manifolds on classification difficulty and how\nlearning shapes the geometric characteristics of perceptual manifolds. An\nunanticipated finding is that the correlation between the class accuracy and\nthe separation degree of perceptual manifolds gradually decreases during\ntraining, while the negative correlation with the curvature gradually\nincreases, implying that curvature imbalance leads to model bias.Building upon\nthese observations, we propose curvature regularization to facilitate the model\nto learn curvature-balanced and flatter perceptual manifolds. Evaluations on\nmultiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. 
Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Maoji Wen","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12307v4.pdf","comment":"17pages, Accepted by CVPR 2023, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2311.09590v2","updated":"2024-04-18T08:49:03Z","published":"2023-11-16T06:02:03Z","title":"MARformer: An Efficient Metal Artifact Reduction Transformer for Dental\n CBCT Images","summary":" Cone Beam Computed Tomography (CBCT) plays a key role in dental diagnosis and\nsurgery. However, the metal teeth implants could bring annoying metal artifacts\nduring the CBCT imaging process, interfering diagnosis and downstream\nprocessing such as tooth segmentation. In this paper, we develop an efficient\nTransformer to perform metal artifacts reduction (MAR) from dental CBCT images.\nThe proposed MAR Transformer (MARformer) reduces computation complexity in the\nmultihead self-attention by a new Dimension-Reduced Self-Attention (DRSA)\nmodule, based on that the CBCT images have globally similar structure. A\nPatch-wise Perceptive Feed Forward Network (P2FFN) is also proposed to perceive\nlocal image information for fine-grained restoration. Experimental results on\nCBCT images with synthetic and real-world metal artifacts show that our\nMARformer is efficient and outperforms previous MAR methods and two restoration\nTransformers.\n","authors":["Yuxuan Shi","Jun Xu","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2311.09590v2.pdf","comment":"under consideration of Computer Vision and Image Understanding\n journal"},{"id":"http://arxiv.org/abs/2404.11998v1","updated":"2024-04-18T08:46:12Z","published":"2024-04-18T08:46:12Z","title":"Curriculum Point Prompting for Weakly-Supervised Referring Image\n Segmentation","summary":" Referring image segmentation (RIS) aims to precisely segment referents in\nimages through corresponding natural language expressions, yet relying on\ncost-intensive mask annotations. Weakly supervised RIS thus learns from\nimage-text pairs to pixel-level semantics, which is challenging for segmenting\nfine-grained masks. A natural approach to enhancing segmentation precision is\nto empower weakly supervised RIS with the image segmentation foundation model\nSAM. Nevertheless, we observe that simply integrating SAM yields limited\nbenefits and can even lead to performance regression due to the inevitable\nnoise issues and challenges in excessive focus on object parts. In this paper,\nwe present an innovative framework, Point PrompTing (PPT), incorporated with\nthe proposed multi-source curriculum learning strategy to address these\nchallenges. Specifically, the core of PPT is a point generator that not only\nharnesses CLIP's text-image alignment capability and SAM's powerful mask\ngeneration ability but also generates negative point prompts to address the\nnoisy and excessive focus issues inherently and effectively. In addition, we\nintroduce a curriculum learning strategy with object-centric images to help PPT\ngradually learn from simpler yet precise semantic alignment to more complex\nRIS. 
Experiments demonstrate that our PPT significantly and consistently\noutperforms prior weakly supervised techniques on mIoU by 11.34%, 14.14%, and\n6.97% across RefCOCO, RefCOCO+, and G-Ref, respectively.\n","authors":["Qiyuan Dai","Sibei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11998v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15182v2","updated":"2024-04-18T08:40:58Z","published":"2024-03-22T13:11:26Z","title":"PDE-CNNs: Axiomatic Derivations and Applications","summary":" PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of\ngeometrically meaningful evolution PDEs as substitutes for the conventional\ncomponents in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer\nparameters, inherent equivariance, better performance, data efficiency, and\ngeometric interpretability.\n In this article we focus on Euclidean equivariant PDE-G-CNNs where the\nfeature maps are two dimensional throughout. We call this variant of the\nframework a PDE-CNN.\n From a machine learning perspective, we list several practically desirable\naxioms and derive from these which PDEs should be used in a PDE-CNN. Here our\napproach to geometric learning via PDEs is inspired by the axioms of classical\nlinear and morphological scale-space theory, which we generalize by introducing\nsemifield-valued signals.\n Furthermore, we experimentally confirm for small networks that PDE-CNNs offer\nfewer parameters, increased performance, and better data efficiency when\ncompared to CNNs. We also investigate what effect the use of different\nsemifields has on the performance of the models.\n","authors":["Gijs Bellaard","Sei Sakata","Bart M. N. Smets","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2403.15182v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04265v5","updated":"2024-04-18T08:33:37Z","published":"2023-12-07T12:43:00Z","title":"Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for\n Domain Generalized Semantic Segmentation","summary":" In this paper, we first assess and harness various Vision Foundation Models\n(VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS).\nDriven by the motivation that Leveraging Stronger pre-trained models and Fewer\ntrainable parameters for Superior generalizability, we introduce a robust\nfine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for\nDGSS. Built upon a set of trainable tokens, each linked to distinct instances,\nRein precisely refines and forwards the feature maps from each layer to the\nnext layer within the backbone. This process produces diverse refinements for\ndifferent categories within a single image. With fewer trainable parameters,\nRein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full\nparameter fine-tuning. 
Extensive experiments across various settings\ndemonstrate that Rein significantly outperforms state-of-the-art methods.\nRemarkably, with just an extra 1% of trainable parameters within the frozen\nbackbone, Rein achieves a mIoU of 78.4% on the Cityscapes, without accessing\nany real urban-scene datasets.Code is available at\nhttps://github.com/w1oves/Rein.git.\n","authors":["Zhixiang Wei","Lin Chen","Yi Jin","Xiaoxiao Ma","Tianle Liu","Pengyang Ling","Ben Wang","Huaian Chen","Jinjin Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.04265v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v4","updated":"2024-04-18T08:29:48Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v4.pdf","comment":"minor fixes/rephrasing"},{"id":"http://arxiv.org/abs/2404.11987v1","updated":"2024-04-18T08:29:29Z","published":"2024-04-18T08:29:29Z","title":"MultiPhys: Multi-Person Physics-aware 3D Motion Estimation","summary":" We introduce MultiPhys, a method designed for recovering multi-person motion\nfrom monocular videos. Our focus lies in capturing coherent spatial placement\nbetween pairs of individuals across varying degrees of engagement. MultiPhys,\nbeing physically aware, exhibits robustness to jittering and occlusions, and\neffectively eliminates penetration issues between the two individuals. We\ndevise a pipeline in which the motion estimated by a kinematic-based method is\nfed into a physics simulator in an autoregressive manner. We introduce distinct\ncomponents that enable our model to harness the simulator's properties without\ncompromising the accuracy of the kinematic estimates. This results in final\nmotion estimates that are both kinematically coherent and physically compliant.\nExtensive evaluations on three challenging datasets characterized by\nsubstantial inter-person interaction show that our method significantly reduces\nerrors associated with penetration and foot skating, while performing\ncompetitively with the state-of-the-art on motion accuracy and smoothness.\nResults and code can be found on our project page\n(http://www.iri.upc.edu/people/nugrinovic/multiphys/).\n","authors":["Nicolas Ugrinovic","Boxiao Pan","Georgios Pavlakos","Despoina Paschalidou","Bokui Shen","Jordi Sanchez-Riera","Francesc Moreno-Noguer","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2404.11987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11981v1","updated":"2024-04-18T08:23:24Z","published":"2024-04-18T08:23:24Z","title":"Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental\n Semantic Segmentation","summary":" Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a\npre-trained segmentation model to segment new classes using cost-effective and\nreadily available image-level labels. 
A prevailing way to solve WILSS is the\ngeneration of seed areas for each new class, serving as a form of pixel-level\nsupervision. However, a scenario usually arises where a pixel is concurrently\npredicted as an old class by the pre-trained segmentation model and a new class\nby the seed areas. Such a scenario becomes particularly problematic in WILSS,\nas the lack of pixel-level annotations on new classes makes it intractable to\nascertain whether the pixel pertains to the new class or not. To surmount this\nissue, we propose an innovative, tendency-driven relationship of mutual\nexclusivity, meticulously tailored to govern the behavior of the seed areas and\nthe predictions generated by the pre-trained segmentation model. This\nrelationship stipulates that predictions for the new and old classes must not\nconflict whilst prioritizing the preservation of predictions for the old\nclasses, which not only addresses the conflicting prediction issue but also\neffectively mitigates the inherent challenge of incremental learning -\ncatastrophic forgetting. Furthermore, under the auspices of this\ntendency-driven mutual exclusivity relationship, we generate pseudo masks for\nthe new classes, allowing for concurrent execution with model parameter\nupdating via the resolution of a bi-level optimization problem. Extensive\nexperiments substantiate the effectiveness of our framework, resulting in the\nestablishment of new benchmarks and paving the way for further research in this\nfield.\n","authors":["Chongjie Si","Xuehui Wang","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.11981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11525v2","updated":"2024-04-18T08:23:05Z","published":"2024-04-17T16:16:12Z","title":"JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on\n Long-Tailed OCTA","summary":" The oxygen saturation level in the blood (SaO2) is crucial for health,\nparticularly in relation to sleep-related breathing disorders. However,\ncontinuous monitoring of SaO2 is time-consuming and highly variable depending\non patients' conditions. Recently, optical coherence tomography angiography\n(OCTA) has shown promising development in rapidly and effectively screening\neye-related lesions, offering the potential for diagnosing sleep-related\ndisorders. To bridge this gap, our paper presents three key contributions.\nFirstly, we propose JointViT, a novel model based on the Vision Transformer\narchitecture, incorporating a joint loss function for supervision. Secondly, we\nintroduce a balancing augmentation technique during data preprocessing to\nimprove the model's performance, particularly on the long-tail distribution\nwithin the OCTA dataset. Lastly, through comprehensive experiments on the OCTA\ndataset, our proposed method significantly outperforms other state-of-the-art\nmethods, achieving improvements of up to 12.28% in overall accuracy. This\nadvancement lays the groundwork for the future utilization of OCTA in\ndiagnosing sleep-related disorders. 
See project website\nhttps://steve-zeyu-zhang.github.io/JointViT\n","authors":["Zeyu Zhang","Xuyin Qi","Mingxi Chen","Guangxi Li","Ryan Pham","Ayub Qassim","Ella Berry","Zhibin Liao","Owen Siggs","Robert Mclaughlin","Jamie Craig","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2404.11525v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11979v1","updated":"2024-04-18T08:16:56Z","published":"2024-04-18T08:16:56Z","title":"MTGA: Multi-view Temporal Granularity aligned Aggregation for\n Event-based Lip-reading","summary":" Lip-reading is to utilize the visual information of the speaker's lip\nmovements to recognize words and sentences. Existing event-based lip-reading\nsolutions integrate different frame rate branches to learn spatio-temporal\nfeatures of varying granularities. However, aggregating events into event\nframes inevitably leads to the loss of fine-grained temporal information within\nframes. To remedy this drawback, we propose a novel framework termed Multi-view\nTemporal Granularity aligned Aggregation (MTGA). Specifically, we first present\na novel event representation method, namely time-segmented voxel graph list,\nwhere the most significant local voxels are temporally connected into a graph\nlist. Then we design a spatio-temporal fusion module based on temporal\ngranularity alignment, where the global spatial features extracted from event\nframes, together with the local relative spatial and temporal features\ncontained in voxel graph list are effectively aligned and integrated. Finally,\nwe design a temporal aggregation module that incorporates positional encoding,\nwhich enables the capture of local absolute spatial and global temporal\ninformation. Experiments demonstrate that our method outperforms both the\nevent-based and video-based lip-reading counterparts. Our code will be publicly\navailable.\n","authors":["Wenhao Zhang","Jun Wang","Yong Luo","Lei Yu","Wei Yu","Zheng He"],"pdf_url":"https://arxiv.org/pdf/2404.11979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06244v2","updated":"2024-04-18T08:08:45Z","published":"2024-02-09T08:33:48Z","title":"Quantifying and Enhancing Multi-modal Robustness with Modality\n Preference","summary":" Multi-modal models have shown a promising capability to effectively integrate\ninformation from various sources, yet meanwhile, they are found vulnerable to\npervasive perturbations, such as uni-modal attacks and missing conditions. To\ncounter these perturbations, robust multi-modal representations are highly\nexpected, which are positioned well away from the discriminative multi-modal\ndecision boundary. In this paper, different from conventional empirical\nstudies, we focus on a commonly used joint multi-modal framework and\ntheoretically discover that larger uni-modal representation margins and more\nreliable integration for modalities are essential components for achieving\nhigher robustness. This discovery can further explain the limitation of\nmulti-modal robustness and the phenomenon that multi-modal models are often\nvulnerable to attacks on the specific modality. 
Moreover, our analysis reveals\nhow the widespread issue, that the model has different preferences for\nmodalities, limits the multi-modal robustness by influencing the essential\ncomponents and could lead to attacks on the specific modality highly effective.\nInspired by our theoretical finding, we introduce a training procedure called\nCertifiable Robust Multi-modal Training (CRMT), which can alleviate this\ninfluence from modality preference and explicitly regulate essential components\nto significantly improve robustness in a certifiable manner. Our method\ndemonstrates substantial improvements in performance and robustness compared\nwith existing methods. Furthermore, our training procedure can be easily\nextended to enhance other robust training strategies, highlighting its\ncredibility and flexibility.\n","authors":["Zequn Yang","Yake Wei","Ce Liang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2402.06244v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2404.11974v1","updated":"2024-04-18T08:05:23Z","published":"2024-04-18T08:05:23Z","title":"Device (In)Dependence of Deep Learning-based Image Age Approximation","summary":" The goal of temporal image forensic is to approximate the age of a digital\nimage relative to images from the same device. Usually, this is based on traces\nleft during the image acquisition pipeline. For example, several methods exist\nthat exploit the presence of in-field sensor defects for this purpose. In\naddition to these 'classical' methods, there is also an approach in which a\nConvolutional Neural Network (CNN) is trained to approximate the image age. One\nadvantage of a CNN is that it independently learns the age features used. This\nwould make it possible to exploit other (different) age traces in addition to\nthe known ones (i.e., in-field sensor defects). In a previous work, we have\nshown that the presence of strong in-field sensor defects is irrelevant for a\nCNN to predict the age class. Based on this observation, the question arises\nhow device (in)dependent the learned features are. In this work, we empirically\nasses this by training a network on images from a single device and then apply\nthe trained model to images from different devices. This evaluation is\nperformed on 14 different devices, including 10 devices from the publicly\navailable 'Northumbria Temporal Image Forensics' database. These 10 different\ndevices are based on five different device pairs (i.e., with the identical\ncamera model).\n","authors":["Robert Jöchl","Andreas Uhl"],"pdf_url":"https://arxiv.org/pdf/2404.11974v1.pdf","comment":"This work was accepted and presented in: 2022 ICPR-Workshop on\n Artificial Intelligence for Multimedia Forensics and Disinformation\n Detection. Montreal, Quebec, Canada. However, due to a technical issue on the\n publishing companies' side, the work does not appear in the workshop\n proceedings"},{"id":"http://arxiv.org/abs/2305.00220v2","updated":"2024-04-18T08:01:26Z","published":"2023-04-29T10:10:25Z","title":"Relaxed forced choice improves performance of visual quality assessment\n methods","summary":" In image quality assessment, a collective visual quality score for an image\nor video is obtained from the individual ratings of many subjects. One commonly\nused format for these experiments is the two-alternative forced choice method.\nTwo stimuli with the same content but differing visual quality are presented\nsequentially or side-by-side. 
Subjects are asked to select the one of better\nquality, and when uncertain, they are required to guess. The relaxed\nalternative forced choice format aims to reduce the cognitive load and the\nnoise in the responses due to the guessing by providing a third response\noption, namely, ``not sure''. This work presents a large and comprehensive\ncrowdsourcing experiment to compare these two response formats: the one with\nthe ``not sure'' option and the one without it. To provide unambiguous ground\ntruth for quality evaluation, subjects were shown pairs of images with\ndiffering numbers of dots and asked each time to choose the one with more dots.\nOur crowdsourcing study involved 254 participants and was conducted using a\nwithin-subject design. Each participant was asked to respond to 40 pair\ncomparisons with and without the ``not sure'' response option and completed a\nquestionnaire to evaluate their cognitive load for each testing condition. The\nexperimental results show that the inclusion of the ``not sure'' response\noption in the forced choice method reduced mental load and led to models with\nbetter data fit and correspondence to ground truth. We also tested for the\nequivalence of the models and found that they were different. The dataset is\navailable at http://database.mmsp-kn.de/cogvqa-database.html.\n","authors":["Mohsen Jenadeleh","Johannes Zagermann","Harald Reiterer","Ulf-Dietrich Reips","Raouf Hamzaoui","Dietmar Saupe"],"pdf_url":"https://arxiv.org/pdf/2305.00220v2.pdf","comment":"6 pages, 3 figures, accepted at the 2023 15th International\n Conference on Quality of Multimedia Experience (QoMEX). Database is publicly\n accessible at http://database.mmsp-kn.de/cogvqa-database.html"},{"id":"http://arxiv.org/abs/2404.11962v1","updated":"2024-04-18T07:48:00Z","published":"2024-04-18T07:48:00Z","title":"©Plug-in Authorization for Human Content Copyright Protection\n in Text-to-Image Model","summary":" This paper addresses the contentious issue of copyright infringement in\nimages generated by text-to-image models, sparking debates among AI developers,\ncontent creators, and legal entities. State-of-the-art models create\nhigh-quality content without crediting original creators, causing concern in\nthe artistic community. To mitigate this, we propose the \\copyright Plug-in\nAuthorization framework, introducing three operations: addition, extraction,\nand combination. Addition involves training a \\copyright plug-in for specific\ncopyright, facilitating proper credit attribution. Extraction allows creators\nto reclaim copyright from infringing models, and combination enables users to\nmerge different \\copyright plug-ins. These operations act as permits,\nincentivizing fair use and providing flexibility in authorization. We present\ninnovative approaches,\"Reverse LoRA\" for extraction and \"EasyMerge\" for\nseamless combination. 
Experiments in artist-style replication and cartoon IP\nrecreation demonstrate \\copyright plug-ins' effectiveness, offering a valuable\nsolution for human copyright protection in the age of generative AIs.\n","authors":["Chao Zhou","Huishuai Zhang","Jiang Bian","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11962v1.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.01188v2","updated":"2024-04-18T07:42:19Z","published":"2023-11-02T12:34:23Z","title":"Terrain-Informed Self-Supervised Learning: Enhancing Building Footprint\n Extraction from LiDAR Data with Limited Annotations","summary":" Estimating building footprint maps from geospatial data is of paramount\nimportance in urban planning, development, disaster management, and various\nother applications. Deep learning methodologies have gained prominence in\nbuilding segmentation maps, offering the promise of precise footprint\nextraction without extensive post-processing. However, these methods face\nchallenges in generalization and label efficiency, particularly in remote\nsensing, where obtaining accurate labels can be both expensive and\ntime-consuming. To address these challenges, we propose terrain-aware\nself-supervised learning, tailored to remote sensing, using digital elevation\nmodels from LiDAR data. We propose to learn a model to differentiate between\nbare Earth and superimposed structures enabling the network to implicitly learn\ndomain-relevant features without the need for extensive pixel-level\nannotations. We test the effectiveness of our approach by evaluating building\nsegmentation performance on test datasets with varying label fractions.\nRemarkably, with only 1% of the labels (equivalent to 25 labeled examples), our\nmethod improves over ImageNet pre-training, showing the advantage of leveraging\nunlabeled data for feature extraction in the domain of remote sensing. The\nperformance improvement is more pronounced in few-shot scenarios and gradually\ncloses the gap with ImageNet pre-training as the label fraction increases. We\ntest on a dataset characterized by substantial distribution shifts and labeling\nerrors to demonstrate the generalizability of our approach. When compared to\nother baselines, including ImageNet pretraining and more complex architectures,\nour approach consistently performs better, demonstrating the efficiency and\neffectiveness of self-supervised terrain-aware feature learning.\n","authors":["Anuja Vats","David Völgyes","Martijn Vermeer","Marius Pedersen","Kiran Raja","Daniele S. M. Fantin","Jacob Alexander Hay"],"pdf_url":"https://arxiv.org/pdf/2311.01188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11459v2","updated":"2024-04-18T07:32:52Z","published":"2024-04-17T15:07:06Z","title":"Octopus v3: Technical Report for On-device Sub-billion Multimodal AI\n Agent","summary":" A multimodal AI agent is characterized by its ability to process and learn\nfrom various types of data, including natural language, visual, and audio\ninputs, to inform its actions. Despite advancements in large language models\nthat incorporate visual data, such as GPT-4V, effectively translating\nimage-based data into actionable outcomes for AI agents continues to be\nchallenging. In this paper, we introduce a multimodal model that incorporates\nthe concept of functional token specifically designed for AI agent\napplications. To ensure compatibility with edge devices, our model is optimized\nto a compact size of less than 1B parameters. 
Like GPT-4, our model can process\nboth English and Chinese. We demonstrate that this model is capable of\noperating efficiently on a wide range of edge devices, including as constrained\nas a Raspberry Pi.\n","authors":["Wei Chen","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.11459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11958v1","updated":"2024-04-18T07:25:59Z","published":"2024-04-18T07:25:59Z","title":"Not All Voxels Are Equal: Hardness-Aware Semantic Scene Completion with\n Self-Distillation","summary":" Semantic scene completion, also known as semantic occupancy prediction, can\nprovide dense geometric and semantic information for autonomous vehicles, which\nattracts the increasing attention of both academia and industry. Unfortunately,\nexisting methods usually formulate this task as a voxel-wise classification\nproblem and treat each voxel equally in 3D space during training. As the hard\nvoxels have not been paid enough attention, the performance in some challenging\nregions is limited. The 3D dense space typically contains a large number of\nempty voxels, which are easy to learn but require amounts of computation due to\nhandling all the voxels uniformly for the existing models. Furthermore, the\nvoxels in the boundary region are more challenging to differentiate than those\nin the interior. In this paper, we propose HASSC approach to train the semantic\nscene completion model with hardness-aware design. The global hardness from the\nnetwork optimization process is defined for dynamical hard voxel selection.\nThen, the local hardness with geometric anisotropy is adopted for voxel-wise\nrefinement. Besides, self-distillation strategy is introduced to make training\nprocess stable and consistent. Extensive experiments show that our HASSC scheme\ncan effectively promote the accuracy of the baseline model without incurring\nthe extra inference cost. Source code is available at:\nhttps://github.com/songw-zju/HASSC.\n","authors":["Song Wang","Jiawei Yu","Wentong Li","Wenyu Liu","Xiaolu Liu","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.11958v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.11957v1","updated":"2024-04-18T07:22:38Z","published":"2024-04-18T07:22:38Z","title":"The devil is in the object boundary: towards annotation-free instance\n segmentation using Foundation Models","summary":" Foundation models, pre-trained on a large amount of data have demonstrated\nimpressive zero-shot capabilities in various downstream tasks. However, in\nobject detection and instance segmentation, two fundamental computer vision\ntasks heavily reliant on extensive human annotations, foundation models such as\nSAM and DINO struggle to achieve satisfactory performance. In this study, we\nreveal that the devil is in the object boundary, \\textit{i.e.}, these\nfoundation models fail to discern boundaries between individual objects. For\nthe first time, we probe that CLIP, which has never accessed any instance-level\nannotations, can provide a highly beneficial and strong instance-level boundary\nprior in the clustering results of its particular intermediate layer. Following\nthis surprising observation, we propose $\\textbf{Zip}$ which $\\textbf{Z}$ips up\nCL$\\textbf{ip}$ and SAM in a novel classification-first-then-discovery\npipeline, enabling annotation-free, complex-scene-capable, open-vocabulary\nobject detection and instance segmentation. 
Our Zip significantly boosts SAM's\nmask AP on COCO dataset by 12.5% and establishes state-of-the-art performance\nin various settings, including training-free, self-training, and\nlabel-efficient finetuning. Furthermore, annotation-free Zip even achieves\ncomparable performance to the best-performing open-vocabulary object detecters\nusing base annotations. Code is released at\nhttps://github.com/ChengShiest/Zip-Your-CLIP\n","authors":["Cheng Shi","Sibei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.11957v1.pdf","comment":"ICLR2024, Code is released at\n https://github.com/ChengShiest/Zip-Your-CLIP"},{"id":"http://arxiv.org/abs/2404.11949v1","updated":"2024-04-18T07:07:38Z","published":"2024-04-18T07:07:38Z","title":"Sketch-guided Image Inpainting with Partial Discrete Diffusion Process","summary":" In this work, we study the task of sketch-guided image inpainting. Unlike the\nwell-explored natural language-guided image inpainting, which excels in\ncapturing semantic details, the relatively less-studied sketch-guided\ninpainting offers greater user control in specifying the object's shape and\npose to be inpainted. As one of the early solutions to this task, we introduce\na novel partial discrete diffusion process (PDDP). The forward pass of the PDDP\ncorrupts the masked regions of the image and the backward pass reconstructs\nthese masked regions conditioned on hand-drawn sketches using our proposed\nsketch-guided bi-directional transformer. The proposed novel transformer module\naccepts two inputs -- the image containing the masked region to be inpainted\nand the query sketch to model the reverse diffusion process. This strategy\neffectively addresses the domain gap between sketches and natural images,\nthereby, enhancing the quality of inpainting results. In the absence of a\nlarge-scale dataset specific to this task, we synthesize a dataset from the\nMS-COCO to train and extensively evaluate our proposed framework against\nvarious competent approaches in the literature. The qualitative and\nquantitative results and user studies establish that the proposed method\ninpaints realistic objects that fit the context in terms of the visual\nappearance of the provided sketch. To aid further research, we have made our\ncode publicly available at https://github.com/vl2g/Sketch-Inpainting .\n","authors":["Nakul Sharma","Aditay Tripathi","Anirban Chakraborty","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.11949v1.pdf","comment":"Accepted to NTIRE Workshop @ CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11947v1","updated":"2024-04-18T06:59:40Z","published":"2024-04-18T06:59:40Z","title":"VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled\n Examples in Semi-supervised Learning","summary":" Despite the progress of Semi-supervised Learning (SSL), existing methods fail\nto utilize unlabeled data effectively and efficiently. Many pseudo-label-based\nmethods select unlabeled examples based on inaccurate confidence scores from\nthe classifier. Most prior work also uses all available unlabeled data without\npruning, making it difficult to handle large amounts of unlabeled data. To\naddress these issues, we propose two methods: Variational Confidence\nCalibration (VCC) and Influence-Function-based Unlabeled Sample Elimination\n(INFUSE). VCC is an universal plugin for SSL confidence calibration, using a\nvariational autoencoder to select more accurate pseudo labels based on three\ntypes of consistency scores. 
INFUSE is a data pruning method that constructs a\ncore dataset of unlabeled examples under SSL. Our methods are effective in\nmultiple datasets and settings, reducing classification errors rates and saving\ntraining time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the\nCIFAR-100 dataset by 1.08% while saving nearly half of the training time.\n","authors":["Shijie Fang","Qianhan Feng","Tong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.11947v1.pdf","comment":"Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng\n contributed equally to this paper"},{"id":"http://arxiv.org/abs/2404.11946v1","updated":"2024-04-18T06:58:02Z","published":"2024-04-18T06:58:02Z","title":"S4TP: Social-Suitable and Safety-Sensitive Trajectory Planning for\n Autonomous Vehicles","summary":" In public roads, autonomous vehicles (AVs) face the challenge of frequent\ninteractions with human-driven vehicles (HDVs), which render uncertain driving\nbehavior due to varying social characteristics among humans. To effectively\nassess the risks prevailing in the vicinity of AVs in social interactive\ntraffic scenarios and achieve safe autonomous driving, this article proposes a\nsocial-suitable and safety-sensitive trajectory planning (S4TP) framework.\nSpecifically, S4TP integrates the Social-Aware Trajectory Prediction (SATP) and\nSocial-Aware Driving Risk Field (SADRF) modules. SATP utilizes Transformers to\neffectively encode the driving scene and incorporates an AV's planned\ntrajectory during the prediction decoding process. SADRF assesses the expected\nsurrounding risk degrees during AVs-HDVs interactions, each with different\nsocial characteristics, visualized as two-dimensional heat maps centered on the\nAV. SADRF models the driving intentions of the surrounding HDVs and predicts\ntrajectories based on the representation of vehicular interactions. S4TP\nemploys an optimization-based approach for motion planning, utilizing the\npredicted HDVs'trajectories as input. With the integration of SADRF, S4TP\nexecutes real-time online optimization of the planned trajectory of AV within\nlowrisk regions, thus improving the safety and the interpretability of the\nplanned trajectory. We have conducted comprehensive tests of the proposed\nmethod using the SMARTS simulator. Experimental results in complex social\nscenarios, such as unprotected left turn intersections, merging, cruising, and\novertaking, validate the superiority of our proposed S4TP in terms of safety\nand rationality. S4TP achieves a pass rate of 100% across all scenarios,\nsurpassing the current state-of-the-art methods Fanta of 98.25% and\nPredictive-Decision of 94.75%.\n","authors":["Xiao Wang","Ke Tang","Xingyuan Dai","Jintao Xu","Quancheng Du","Rui Ai","Yuxiao Wang","Weihao Gu"],"pdf_url":"https://arxiv.org/pdf/2404.11946v1.pdf","comment":"12 pages,4 figures, published to IEEE Transactions on Intelligent\n Vehicles"},{"id":"http://arxiv.org/abs/2401.16158v2","updated":"2024-04-18T06:53:38Z","published":"2024-01-29T13:46:37Z","title":"Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual\n Perception","summary":" Mobile device agent based on Multimodal Large Language Models (MLLM) is\nbecoming a popular application. In this paper, we introduce Mobile-Agent, an\nautonomous multi-modal mobile device agent. Mobile-Agent first leverages visual\nperception tools to accurately identify and locate both the visual and textual\nelements within the app's front-end interface. 
Based on the perceived vision\ncontext, it then autonomously plans and decomposes the complex operation task,\nand navigates the mobile Apps through operations step by step. Different from\nprevious solutions that rely on XML files of Apps or mobile system metadata,\nMobile-Agent allows for greater adaptability across diverse mobile operating\nenvironments in a vision-centric way, thereby eliminating the necessity for\nsystem-specific customizations. To assess the performance of Mobile-Agent, we\nintroduced Mobile-Eval, a benchmark for evaluating mobile device operations.\nBased on Mobile-Eval, we conducted a comprehensive evaluation of Mobile-Agent.\nThe experimental results indicate that Mobile-Agent achieved remarkable\naccuracy and completion rates. Even with challenging instructions, such as\nmulti-app operations, Mobile-Agent can still complete the requirements. Code\nand model will be open-sourced at https://github.com/X-PLUG/MobileAgent.\n","authors":["Junyang Wang","Haiyang Xu","Jiabo Ye","Ming Yan","Weizhou Shen","Ji Zhang","Fei Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2401.16158v2.pdf","comment":"Accepted by ICLR 2024 Workshop in Large Language Model (LLM) Agents"},{"id":"http://arxiv.org/abs/2404.11936v1","updated":"2024-04-18T06:35:37Z","published":"2024-04-18T06:35:37Z","title":"LD-Pruner: Efficient Pruning of Latent Diffusion Models using\n Task-Agnostic Insights","summary":" Latent Diffusion Models (LDMs) have emerged as powerful generative models,\nknown for delivering remarkable results under constrained computational\nresources. However, deploying LDMs on resource-limited devices remains a\ncomplex issue, presenting challenges such as memory consumption and inference\nspeed. To address this issue, we introduce LD-Pruner, a novel\nperformance-preserving structured pruning method for compressing LDMs.\nTraditional pruning methods for deep neural networks are not tailored to the\nunique characteristics of LDMs, such as the high computational cost of training\nand the absence of a fast, straightforward and task-agnostic method for\nevaluating model performance. Our method tackles these challenges by leveraging\nthe latent space during the pruning process, enabling us to effectively\nquantify the impact of pruning on model performance, independently of the task\nat hand. This targeted pruning of components with minimal impact on the output\nallows for faster convergence during training, as the model has less\ninformation to re-learn, thereby addressing the high computational cost of\ntraining. Consequently, our approach achieves a compressed model that offers\nimproved inference speed and reduced parameter count, while maintaining minimal\nperformance degradation. We demonstrate the effectiveness of our approach on\nthree different tasks: text-to-image (T2I) generation, Unconditional Image\nGeneration (UIG) and Unconditional Audio Generation (UAG). Notably, we reduce\nthe inference time of Stable Diffusion (SD) by 34.9% while simultaneously\nimproving its FID by 5.2% on MS-COCO T2I benchmark. 
This work paves the way for\nmore efficient pruning methods for LDMs, enhancing their applicability.\n","authors":["Thibault Castells","Hyoung-Kyu Song","Bo-Kyeong Kim","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2404.11936v1.pdf","comment":"8 pages, accepted to CVPR24 First Workshop on Efficient and On-Device\n Generation (EDGE)"},{"id":"http://arxiv.org/abs/2404.11929v1","updated":"2024-04-18T06:18:48Z","published":"2024-04-18T06:18:48Z","title":"A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine\n Transporter Uptake in Parkinson's Disease","summary":" Dopamine transporter (DAT) imaging is commonly used for monitoring\nParkinson's disease (PD), where striatal DAT uptake amount is computed to\nassess PD severity. However, DAT imaging has a high cost and the risk of\nradiance exposure and is not available in general clinics. Recently, MRI patch\nof the nigral region has been proposed as a safer and easier alternative. This\npaper proposes a symmetric regressor for predicting the DAT uptake amount from\nthe nigral MRI patch. Acknowledging the symmetry between the right and left\nnigrae, the proposed regressor incorporates a paired input-output model that\nsimultaneously predicts the DAT uptake amounts for both the right and left\nstriata. Moreover, it employs a symmetric loss that imposes a constraint on the\ndifference between right-to-left predictions, resembling the high correlation\nin DAT uptake amounts in the two lateral sides. Additionally, we propose a\nsymmetric Monte-Carlo (MC) dropout method for providing a fruitful uncertainty\nestimate of the DAT uptake prediction, which utilizes the above symmetry. We\nevaluated the proposed approach on 734 nigral patches, which demonstrated\nsignificantly improved performance of the symmetric regressor compared with the\nstandard regressors while giving better explainability and feature\nrepresentation. The symmetric MC dropout also gave precise uncertainty ranges\nwith a high probability of including the true DAT uptake amounts within the\nrange.\n","authors":["Walid Abdullah Al","Il Dong Yun","Yun Jung Bae"],"pdf_url":"https://arxiv.org/pdf/2404.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08513v3","updated":"2024-04-18T06:12:57Z","published":"2023-09-15T16:19:09Z","title":"SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient\n Channels","summary":" Pre-trained vision transformers have strong representation benefits to\nvarious downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)\nmethods have been proposed, and their experiments demonstrate that tuning only\n1% of extra parameters could surpass full fine-tuning in low-data resource\nscenarios. However, these methods overlook the task-specific information when\nfine-tuning diverse downstream tasks. In this paper, we propose a simple yet\neffective method called \"Salient Channel Tuning\" (SCT) to leverage the\ntask-specific information by forwarding the model with the task images to\nselect partial channels in a feature map that enables us to tune only 1/8\nchannels leading to significantly lower parameter costs. Experiments outperform\nfull fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark by adding only\n0.11M parameters of the ViT-B, which is 780x fewer than its full fine-tuning\ncounterpart. 
Furthermore, experiments on domain generalization and few-shot\nlearning surpass other PEFT methods with lower parameter costs, demonstrating\nour proposed tuning technique's strong capability and effectiveness in the\nlow-data regime.\n","authors":["Henry Hengyuan Zhao","Pichao Wang","Yuyang Zhao","Hao Luo","Fan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.08513v3.pdf","comment":"This work has been accepted by IJCV2023"},{"id":"http://arxiv.org/abs/2310.05886v2","updated":"2024-04-18T06:11:43Z","published":"2023-10-09T17:28:35Z","title":"Streaming Anchor Loss: Augmenting Supervision with Temporal Significance","summary":" Streaming neural network models for fast frame-wise responses to various\nspeech and sensory signals are widely adopted on resource-constrained\nplatforms. Hence, increasing the learning capacity of such streaming models\n(i.e., by adding more parameters) to improve the predictive power may not be\nviable for real-world tasks. In this work, we propose a new loss, Streaming\nAnchor Loss (SAL), to better utilize the given learning capacity by encouraging\nthe model to learn more from essential frames. More specifically, our SAL and\nits focal variations dynamically modulate the frame-wise cross entropy loss\nbased on the importance of the corresponding frames so that a higher loss\npenalty is assigned for frames within the temporal proximity of semantically\ncritical events. Therefore, our loss ensures that the model training focuses on\npredicting the relatively rare but task-relevant frames. Experimental results\nwith standard lightweight convolutional and recurrent streaming networks on\nthree different speech based detection tasks demonstrate that SAL enables the\nmodel to learn the overall task more effectively with improved accuracy and\nlatency, without any additional data, model parameters, or architectural\nchanges.\n","authors":["Utkarsh Oggy Sarawgi","John Berkowitz","Vineet Garg","Arnav Kundu","Minsik Cho","Sai Srujana Buddi","Saurabh Adya","Ahmed Tewfik"],"pdf_url":"https://arxiv.org/pdf/2310.05886v2.pdf","comment":"Published at IEEE ICASSP 2024, please see\n https://ieeexplore.ieee.org/abstract/document/10447222"},{"id":"http://arxiv.org/abs/2404.11614v2","updated":"2024-04-18T06:06:29Z","published":"2024-04-17T17:59:55Z","title":"Dynamic Typography: Bringing Text to Life via Video Diffusion Prior","summary":" Text animation serves as an expressive medium, transforming static\ncommunication into dynamic experiences by infusing words with motion to evoke\nemotions, emphasize meanings, and construct compelling narratives. Crafting\nanimations that are semantically aware poses significant challenges, demanding\nexpertise in graphic design and animation. We present an automated text\nanimation scheme, termed \"Dynamic Typography\", which combines two challenging\ntasks. It deforms letters to convey semantic meaning and infuses them with\nvibrant movements based on user prompts. Our technique harnesses vector\ngraphics representations and an end-to-end optimization-based framework. This\nframework employs neural displacement fields to convert letters into base\nshapes and applies per-frame motion, encouraging coherence with the intended\ntextual concept. Shape preservation techniques and perceptual loss\nregularization are employed to maintain legibility and structural integrity\nthroughout the animation process. 
We demonstrate the generalizability of our\napproach across various text-to-video models and highlight the superiority of\nour end-to-end methodology over baseline methods, which might comprise separate\ntasks. Through quantitative and qualitative evaluations, we demonstrate the\neffectiveness of our framework in generating coherent text animations that\nfaithfully interpret user prompts while maintaining readability. Our code is\navailable at: https://animate-your-word.github.io/demo/.\n","authors":["Zichen Liu","Yihao Meng","Hao Ouyang","Yue Yu","Bolin Zhao","Daniel Cohen-Or","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2404.11614v2.pdf","comment":"Our demo page is available at:\n https://animate-your-word.github.io/demo/"},{"id":"http://arxiv.org/abs/2404.11925v1","updated":"2024-04-18T06:02:54Z","published":"2024-04-18T06:02:54Z","title":"EdgeFusion: On-Device Text-to-Image Generation","summary":" The intensive computational burden of Stable Diffusion (SD) for text-to-image\ngeneration poses a significant hurdle for its practical application. To tackle\nthis challenge, recent research focuses on methods to reduce sampling steps,\nsuch as Latent Consistency Model (LCM), and on employing architectural\noptimizations, including pruning and knowledge distillation. Diverging from\nexisting approaches, we uniquely start with a compact SD variant, BK-SDM. We\nobserve that directly applying LCM to BK-SDM with commonly used crawled\ndatasets yields unsatisfactory results. It leads us to develop two strategies:\n(1) leveraging high-quality image-text pairs from leading generative models and\n(2) designing an advanced distillation process tailored for LCM. Through our\nthorough exploration of quantization, profiling, and on-device deployment, we\nachieve rapid generation of photo-realistic, text-aligned images in just two\nsteps, with latency under one second on resource-limited edge devices.\n","authors":["Thibault Castells","Hyoung-Kyu Song","Tairen Piao","Shinkook Choi","Bo-Kyeong Kim","Hanyoung Yim","Changgwun Lee","Jae Gon Kim","Tae-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2404.11925v1.pdf","comment":"4 pages, accepted to CVPR24 First Workshop on Efficient and On-Device\n Generation (EDGE)"},{"id":"http://arxiv.org/abs/2403.17924v2","updated":"2024-04-18T05:11:54Z","published":"2024-03-26T17:57:05Z","title":"AID: Attention Interpolation of Text-to-Image Diffusion","summary":" Conditional diffusion models can create unseen images in various settings,\naiding image interpolation. Interpolation in latent spaces is well-studied, but\ninterpolation with specific conditions like text or poses is less understood.\nSimple approaches, such as linear interpolation in the space of conditions,\noften result in images that lack consistency, smoothness, and fidelity. To that\nend, we introduce a novel training-free technique named Attention Interpolation\nvia Diffusion (AID). Our key contributions include 1) proposing an inner/outer\ninterpolated attention layer; 2) fusing the interpolated attention with\nself-attention to boost fidelity; and 3) applying beta distribution to\nselection to increase smoothness. We also present a variant, Prompt-guided\nAttention Interpolation via Diffusion (PAID), that considers interpolation as a\ncondition-dependent generative process. This method enables the creation of new\nimages with greater consistency, smoothness, and efficiency, and offers control\nover the exact path of interpolation. 
Our approach demonstrates effectiveness\nfor conceptual and spatial interpolation. Code and demo are available at\nhttps://github.com/QY-H00/attention-interpolation-diffusion.\n","authors":["Qiyuan He","Jinghao Wang","Ziwei Liu","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.17924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17893v2","updated":"2024-04-18T05:09:04Z","published":"2024-03-26T17:29:26Z","title":"A Survey on 3D Egocentric Human Pose Estimation","summary":" Egocentric human pose estimation aims to estimate human body poses and\ndevelop body representations from a first-person camera perspective. It has\ngained vast popularity in recent years because of its wide range of\napplications in sectors like XR-technologies, human-computer interaction, and\nfitness tracking. However, to the best of our knowledge, there is no systematic\nliterature review based on the proposed solutions regarding egocentric 3D human\npose estimation. To that end, the aim of this survey paper is to provide an\nextensive overview of the current state of egocentric pose estimation research.\nIn this paper, we categorize and discuss the popular datasets and the different\npose estimation models, highlighting the strengths and weaknesses of different\nmethods by comparative analysis. This survey can be a valuable resource for\nboth researchers and practitioners in the field, offering insights into key\nconcepts and cutting-edge solutions in egocentric pose estimation, its\nwide-ranging applications, as well as the open problems with future scope.\n","authors":["Md Mushfiqur Azam","Kevin Desai"],"pdf_url":"https://arxiv.org/pdf/2403.17893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11903v1","updated":"2024-04-18T05:06:12Z","published":"2024-04-18T05:06:12Z","title":"Simultaneous Detection and Interaction Reasoning for Object-Centric\n Action Recognition","summary":" The interactions between human and objects are important for recognizing\nobject-centric actions. Existing methods usually adopt a two-stage pipeline,\nwhere object proposals are first detected using a pretrained detector, and then\nare fed to an action recognition model for extracting video features and\nlearning the object relations for action recognition. However, since the action\nprior is unknown in the object detection stage, important objects could be\neasily overlooked, leading to inferior action recognition performance. In this\npaper, we propose an end-to-end object-centric action recognition framework\nthat simultaneously performs Detection And Interaction Reasoning in one stage.\nParticularly, after extracting video features with a base network, we create\nthree modules for concurrent object detection and interaction reasoning. First,\na Patch-based Object Decoder generates proposals from video patch tokens. Then,\nan Interactive Object Refining and Aggregation identifies important objects for\naction recognition, adjusts proposal scores based on position and appearance,\nand aggregates object-level info into a global video representation. Lastly, an\nObject Relation Modeling module encodes object relations. These three modules\ntogether with the video feature extractor can be trained jointly in an\nend-to-end fashion, thus avoiding the heavy reliance on an off-the-shelf object\ndetector, and reducing the multi-stage training burden. 
We conduct experiments\non two datasets, Something-Else and Ikea-Assembly, to evaluate the performance\nof our proposed approach on conventional, compositional, and few-shot action\nrecognition tasks. Through in-depth experimental analysis, we show the crucial\nrole of interactive objects in learning for action recognition, and we can\noutperform state-of-the-art methods on both datasets.\n","authors":["Xunsong Li","Pengzhan Sun","Yangcen Liu","Lixin Duan","Wen Li"],"pdf_url":"https://arxiv.org/pdf/2404.11903v1.pdf","comment":"12 pages, 5 figures, submitted to IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2404.11897v1","updated":"2024-04-18T04:54:28Z","published":"2024-04-18T04:54:28Z","title":"AG-NeRF: Attention-guided Neural Radiance Fields for Multi-height\n Large-scale Outdoor Scene Rendering","summary":" Existing neural radiance fields (NeRF)-based novel view synthesis methods for\nlarge-scale outdoor scenes are mainly built on a single altitude. Moreover,\nthey often require a priori camera shooting height and scene scope, leading to\ninefficient and impractical applications when camera altitude changes. In this\nwork, we propose an end-to-end framework, termed AG-NeRF, and seek to reduce\nthe training cost of building good reconstructions by synthesizing\nfree-viewpoint images based on varying altitudes of scenes. Specifically, to\ntackle the detail variation problem from low altitude (drone-level) to high\naltitude (satellite-level), a source image selection method and an\nattention-based feature fusion approach are developed to extract and fuse the\nmost relevant features of target view from multi-height images for\nhigh-fidelity rendering. Extensive experiments demonstrate that AG-NeRF\nachieves SOTA performance on 56 Leonard and Transamerica benchmarks and only\nrequires a half hour of training time to reach the competitive PSNR as compared\nto the latest BungeeNeRF.\n","authors":["Jingfeng Guo","Xiaohan Zhang","Baozhu Zhao","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.11897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11895v1","updated":"2024-04-18T04:47:28Z","published":"2024-04-18T04:47:28Z","title":"FreeDiff: Progressive Frequency Truncation for Image Editing with\n Diffusion Models","summary":" Precise image editing with text-to-image models has attracted increasing\ninterest due to their remarkable generative capabilities and user-friendly\nnature. However, such attempts face the pivotal challenge of misalignment\nbetween the intended precise editing target regions and the broader area\nimpacted by the guidance in practice. Despite excellent methods leveraging\nattention mechanisms that have been developed to refine the editing guidance,\nthese approaches necessitate modifications through complex network architecture\nand are limited to specific editing tasks. In this work, we re-examine the\ndiffusion process and misalignment problem from a frequency perspective,\nrevealing that, due to the power law of natural images and the decaying noise\nschedule, the denoising network primarily recovers low-frequency image\ncomponents during the earlier timesteps and thus brings excessive low-frequency\nsignals for editing. Leveraging this insight, we introduce a novel fine-tuning\nfree approach that employs progressive $\\textbf{Fre}$qu$\\textbf{e}$ncy\ntruncation to refine the guidance of $\\textbf{Diff}$usion models for universal\nediting tasks ($\\textbf{FreeDiff}$). 
Our method achieves comparable results\nwith state-of-the-art methods across a variety of editing tasks and on a\ndiverse set of images, highlighting its potential as a versatile tool in image\nediting applications.\n","authors":["Wei Wu","Qingnan Fan","Shuai Qin","Hong Gu","Ruoyu Zhao","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2404.11895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06118v2","updated":"2024-04-18T04:33:53Z","published":"2024-02-09T01:00:14Z","title":"ViGoR: Improving Visual Grounding of Large Vision Language Models with\n Fine-Grained Reward Modeling","summary":" By combining natural language understanding, generation capabilities, and\nbreadth of knowledge of large language models with image perception, recent\nlarge vision language models (LVLMs) have shown unprecedented visual reasoning\ncapabilities. However, the generated text often suffers from inaccurate\ngrounding in the visual input, resulting in errors such as hallucination of\nnonexistent scene elements, missing significant parts of the scene, and\ninferring incorrect attributes of and relationships between objects. To address\nthese issues, we introduce a novel framework, ViGoR(Visual Grounding Through\nFine-Grained Reward Modeling) that utilizes fine-grained reward modeling to\nsignificantly enhance the visual grounding of LVLMs over pre-trained baselines.\nThis improvement is efficiently achieved using much cheaper human evaluations\ninstead of full supervisions, as well as automated methods. We show the\neffectiveness of our approach through a variety of evaluation methods and\nbenchmarks. Additionally, we plan to release our human annotation comprising\napproximately 16,000 images and generated text pairs with fine-grained\nevaluations to contribute to related research in the community.\n","authors":["Siming Yan","Min Bai","Weifeng Chen","Xiong Zhou","Qixing Huang","Li Erran Li"],"pdf_url":"https://arxiv.org/pdf/2402.06118v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.11889v1","updated":"2024-04-18T04:25:56Z","published":"2024-04-18T04:25:56Z","title":"Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement\n from CT Scans","summary":" X-ray images play a vital role in the intraoperative processes due to their\nhigh resolution and fast imaging speed and greatly promote the subsequent\nsegmentation, registration and reconstruction. However, over-dosed X-rays\nsuperimpose potential risks to human health to some extent. Data-driven\nalgorithms from volume scans to X-ray images are restricted by the scarcity of\npaired X-ray and volume data. Existing methods are mainly realized by modelling\nthe whole X-ray imaging procedure. In this study, we propose a learning-based\napproach termed CT2X-GAN to synthesize the X-ray images in an end-to-end manner\nusing the content and style disentanglement from three different image domains.\nOur method decouples the anatomical structure information from CT scans and\nstyle information from unpaired real X-ray images/ digital reconstructed\nradiography (DRR) images via a series of decoupling encoders. Additionally, we\nintroduce a novel consistency regularization term to improve the stylistic\nresemblance between synthesized X-ray images and real X-ray images. Meanwhile,\nwe also impose a supervised process by computing the similarity of computed\nreal DRR and synthesized DRR images. 
We further develop a pose attention module\nto fully strengthen the comprehensive information in the decoupled content code\nfrom CT scans, facilitating high-quality multi-view image synthesis in the\nlower 2D space. Extensive experiments were conducted on the publicly available\nCTSpine1K dataset and achieved 97.8350, 0.0842 and 3.0938 in terms of FID, KID\nand defined user-scored X-ray similarity, respectively. In comparison with\n3D-aware methods ($\\pi$-GAN, EG3D), CT2X-GAN is superior in improving the\nsynthesis quality and realistic to the real X-ray images.\n","authors":["Lixing Tan","Shuang Song","Kangneng Zhou","Chengbo Duan","Lanying Wang","Huayang Ren","Linlin Liu","Wei Zhang","Ruoxiu Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.11889v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.11326v2","updated":"2024-04-18T04:22:07Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v4","updated":"2024-04-18T04:12:32Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We introduce MVDream, a diffusion model that is able to generate consistent\nmulti-view images from a given text prompt. Learning from both 2D and 3D data,\na multi-view diffusion model can achieve the generalizability of 2D diffusion\nmodels and the consistency of 3D renderings. We demonstrate that such a\nmulti-view diffusion model is implicitly a generalizable 3D prior agnostic to\n3D representations. It can be applied to 3D generation via Score Distillation\nSampling, significantly enhancing the consistency and stability of existing\n2D-lifting methods. 
It can also learn new concepts from a few 2D examples, akin\nto DreamBooth, but for 3D generation.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v4.pdf","comment":"Reorganized for arXiv; Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2404.11884v1","updated":"2024-04-18T03:58:27Z","published":"2024-04-18T03:58:27Z","title":"Seeing Motion at Nighttime with an Event Camera","summary":" We focus on a very challenging task: imaging at nighttime dynamic scenes.\nMost previous methods rely on the low-light enhancement of a conventional RGB\ncamera. However, they would inevitably face a dilemma between the long exposure\ntime of nighttime and the motion blur of dynamic scenes. Event cameras react to\ndynamic changes with higher temporal resolution (microsecond) and higher\ndynamic range (120dB), offering an alternative solution. In this work, we\npresent a novel nighttime dynamic imaging method with an event camera.\nSpecifically, we discover that the event at nighttime exhibits temporal\ntrailing characteristics and spatial non-stationary distribution. Consequently,\nwe propose a nighttime event reconstruction network (NER-Net) which mainly\nincludes a learnable event timestamps calibration module (LETC) to align the\ntemporal trailing events and a non-uniform illumination aware module (NIAM) to\nstabilize the spatiotemporal distribution of events. Moreover, we construct a\npaired real low-light event dataset (RLED) through a co-axial imaging system,\nincluding 64,200 spatially and temporally aligned image GTs and low-light\nevents. Extensive experiments demonstrate that the proposed method outperforms\nstate-of-the-art methods in terms of visual quality and generalization ability\non real-world nighttime datasets. The project are available at:\nhttps://github.com/Liu-haoyue/NER-Net.\n","authors":["Haoyue Liu","Shihan Peng","Lin Zhu","Yi Chang","Hanyu Zhou","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2404.11884v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.01858v3","updated":"2024-04-18T03:54:39Z","published":"2024-02-02T19:28:33Z","title":"Explaining latent representations of generative models with large\n multimodal models","summary":" Learning interpretable representations of data generative latent factors is\nan important topic for the development of artificial intelligence. With the\nrise of the large multimodal model, it can align images with text to generate\nanswers. In this work, we propose a framework to comprehensively explain each\nlatent variable in the generative models using a large multimodal model. We\nfurther measure the uncertainty of our generated explanations, quantitatively\nevaluate the performance of explanation generation among multiple large\nmultimodal models, and qualitatively visualize the variations of each latent\nvariable to learn the disentanglement effects of different generative models on\nexplanations. 
Finally, we discuss the explanatory capabilities and limitations\nof state-of-the-art large multimodal models.\n","authors":["Mengdan Zhu","Zhenke Liu","Bo Pan","Abhinav Angirekula","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01858v3.pdf","comment":"ICLR 2024 Workshop on Reliable and Responsible Foundation Models"},{"id":"http://arxiv.org/abs/2404.05673v2","updated":"2024-04-18T03:36:58Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. Project:\nhttps://chain-of-reasoning-and-segmentation.github.io/.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11630v1","updated":"2024-04-18T03:21:28Z","published":"2024-04-18T03:21:28Z","title":"SNP: Structured Neuron-level Pruning to Preserve Attention Scores","summary":" Multi-head self-attention (MSA) is a key component of Vision Transformers\n(ViTs), which have achieved great success in various vision tasks. However,\ntheir high computational cost and memory footprint hinder their deployment on\nresource-constrained devices. Conventional pruning approaches can only compress\nand accelerate the MSA module using head pruning, although the head is not an\natomic unit. To address this issue, we propose a novel graph-aware neuron-level\npruning method, Structured Neuron-level Pruning (SNP). SNP prunes neurons with\nless informative attention scores and eliminates redundancy among heads.\nSpecifically, it prunes graphically connected query and key layers having the\nleast informative attention scores while preserving the overall attention\nscores. Value layers, which can be pruned independently, are pruned to\neliminate inter-head redundancy. Our proposed method effectively compresses and\naccelerates Transformer-based models for both edge devices and server\nprocessors. For instance, the DeiT-Small with SNP runs 3.1$\\times$ faster than\nthe original model and achieves performance that is 21.94\\% faster and 1.12\\%\nhigher than the DeiT-Tiny. Additionally, SNP combine successfully with\nconventional head or block pruning approaches. 
SNP with head pruning could\ncompress the DeiT-Base by 80\\% of the parameters and computational costs and\nachieve 3.85$\\times$ faster inference speed on RTX3090 and 4.93$\\times$ on\nJetson Nano.\n","authors":["Kyunghwan Shim","Jaewoong Yun","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2404.11630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11358v2","updated":"2024-04-18T03:18:36Z","published":"2024-04-17T13:14:52Z","title":"DeblurGS: Gaussian Splatting for Camera Motion Blur","summary":" Although significant progress has been made in reconstructing sharp 3D scenes\nfrom motion-blurred images, a transition to real-world applications remains\nchallenging. The primary obstacle stems from the severe blur which leads to\ninaccuracies in the acquisition of initial camera poses through\nStructure-from-Motion, a critical aspect often overlooked by previous\napproaches. To address this challenge, we propose DeblurGS, a method to\noptimize sharp 3D Gaussian Splatting from motion-blurred images, even with the\nnoisy camera pose initialization. We restore a fine-grained sharp scene by\nleveraging the remarkable reconstruction capability of 3D Gaussian Splatting.\nOur approach estimates the 6-Degree-of-Freedom camera motion for each blurry\nobservation and synthesizes corresponding blurry renderings for the\noptimization process. Furthermore, we propose Gaussian Densification Annealing\nstrategy to prevent the generation of inaccurate Gaussians at erroneous\nlocations during the early training stages when camera motion is still\nimprecise. Comprehensive experiments demonstrate that our DeblurGS achieves\nstate-of-the-art performance in deblurring and novel view synthesis for\nreal-world and synthetic benchmark datasets, as well as field-captured blurry\nsmartphone videos.\n","authors":["Jeongtaek Oh","Jaeyoung Chung","Dongwoo Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.11358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11871v1","updated":"2024-04-18T03:10:04Z","published":"2024-04-18T03:10:04Z","title":"Group-On: Boosting One-Shot Segmentation with Supportive Query","summary":" One-shot semantic segmentation aims to segment query images given only ONE\nannotated support image of the same class. This task is challenging because\ntarget objects in the support and query images can be largely different in\nappearance and pose (i.e., intra-class variation). Prior works suggested that\nincorporating more annotated support images in few-shot settings boosts\nperformances but increases costs due to additional manual labeling. In this\npaper, we propose a novel approach for ONE-shot semantic segmentation, called\nGroup-On, which packs multiple query images in batches for the benefit of\nmutual knowledge support within the same category. Specifically, after coarse\nsegmentation masks of the batch of queries are predicted, query-mask pairs act\nas pseudo support data to enhance mask predictions mutually, under the guidance\nof a simple Group-On Voting module. Comprehensive experiments on three standard\nbenchmarks show that, in the ONE-shot setting, our Group-On approach\nsignificantly outperforms previous works by considerable margins. For example,\non the COCO-20i dataset, we increase mIoU scores by 8.21% and 7.46% on ASNet\nand HSNet baselines, respectively. 
With only one support image, Group-On can be\neven competitive with the counterparts using 5 annotated support images.\n","authors":["Hanjing Zhou","Mingze Yin","JinTai Chen","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.11871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11868v1","updated":"2024-04-18T02:59:48Z","published":"2024-04-18T02:59:48Z","title":"OPTiML: Dense Semantic Invariance Using Optimal Transport for\n Self-Supervised Medical Image Representation","summary":" Self-supervised learning (SSL) has emerged as a promising technique for\nmedical image analysis due to its ability to learn without annotations.\nHowever, despite the promising potential, conventional SSL methods encounter\nlimitations, including challenges in achieving semantic alignment and capturing\nsubtle details. This leads to suboptimal representations, which fail to\naccurately capture the underlying anatomical structures and pathological\ndetails. In response to these constraints, we introduce a novel SSL framework\nOPTiML, employing optimal transport (OT), to capture the dense semantic\ninvariance and fine-grained details, thereby enhancing the overall\neffectiveness of SSL in medical image representation learning. The core idea is\nto integrate OT with a cross-viewpoint semantics infusion module (CV-SIM),\nwhich effectively captures complex, fine-grained details inherent in medical\nimages across different viewpoints. In addition to the CV-SIM module, OPTiML\nimposes the variance and covariance regularizations within OT framework to\nforce the model focus on clinically relevant information while discarding less\ninformative features. Through these, the proposed framework demonstrates its\ncapacity to learn semantically rich representations that can be applied to\nvarious medical imaging tasks. To validate its effectiveness, we conduct\nexperimental studies on three publicly available datasets from chest X-ray\nmodality. Our empirical results reveal OPTiML's superiority over\nstate-of-the-art methods across all evaluated tasks.\n","authors":["Azad Singh","Vandan Gorade","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.11868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11865v1","updated":"2024-04-18T02:43:37Z","published":"2024-04-18T02:43:37Z","title":"From Image to Video, what do we need in multimodal LLMs?","summary":" Multimodal Large Language Models (MLLMs) have demonstrated profound\ncapabilities in understanding multimodal information, covering from Image LLMs\nto the more complex Video LLMs. Numerous studies have illustrated their\nexceptional cross-modal comprehension. Recently, integrating video foundation\nmodels with large language models to build a comprehensive video understanding\nsystem has been proposed to overcome the limitations of specific pre-defined\nvision tasks. However, the current advancements in Video LLMs tend to overlook\nthe foundational contributions of Image LLMs, often opting for more complicated\nstructures and a wide variety of multimodal data for pre-training. This\napproach significantly increases the costs associated with these methods.In\nresponse to these challenges, this work introduces an efficient method that\nstrategically leverages the priors of Image LLMs, facilitating a\nresource-efficient transition from Image to Video LLMs. 
We propose RED-VILLM, a\nResource-Efficient Development pipeline for Video LLMs from Image LLMs, which\nutilizes a temporal adaptation plug-and-play structure within the image fusion\nmodule of Image LLMs. This adaptation extends their understanding capabilities\nto include temporal information, enabling the development of Video LLMs that\nnot only surpass baseline performances but also do so with minimal\ninstructional data and training resources. Our approach highlights the\npotential for a more cost-effective and scalable advancement in multimodal\nmodels, effectively building upon the foundational work of Image LLMs.\n","authors":["Suyuan Huang","Haoxin Zhang","Yan Gao","Yao Hu","Zengchang Qin"],"pdf_url":"https://arxiv.org/pdf/2404.11865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11864v1","updated":"2024-04-18T02:40:31Z","published":"2024-04-18T02:40:31Z","title":"Progressive Multi-modal Conditional Prompt Tuning","summary":" Pre-trained vision-language models (VLMs) have shown remarkable\ngeneralization capabilities via prompting, which leverages VLMs as knowledge\nbases to extract information beneficial for downstream tasks. However, existing\nmethods primarily employ uni-modal prompting, which only engages a uni-modal\nbranch, failing to simultaneously adjust vision-language (V-L) features.\nAdditionally, the one-pass forward pipeline in VLM encoding struggles to align\nV-L features that have a huge gap. Confronting these challenges, we propose a\nnovel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT).\nProMPT exploits a recurrent structure, optimizing and aligning V-L features by\niteratively utilizing image and current encoding information. It comprises an\ninitialization and a multi-modal iterative evolution (MIE) module.\nInitialization is responsible for encoding image and text using a VLM, followed\nby a feature filter that selects text features similar to image. MIE then\nfacilitates multi-modal prompting through class-conditional vision prompting,\ninstance-conditional text prompting, and feature filtering. In each MIE\niteration, vision prompts are obtained from the filtered text features via a\nvision generator, promoting image features to focus more on target object\nduring vision prompting. The encoded image features are fed into a text\ngenerator to produce text prompts that are more robust to class shift. Thus,\nV-L features are progressively aligned, enabling advance from coarse to exact\nclassifications. Extensive experiments are conducted in three settings to\nevaluate the efficacy of ProMPT. The results indicate that ProMPT outperforms\nexisting methods on average across all settings, demonstrating its superior\ngeneralization.\n","authors":["Xiaoyu Qiu","Hao Feng","Yuechen Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.11864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11052v2","updated":"2024-04-18T01:59:27Z","published":"2024-04-17T03:51:55Z","title":"Supervised Contrastive Vision Transformer for Breast Histopathological\n Image Classification","summary":" Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer.\nBreast tissue histopathological examination is critical in diagnosing and\nclassifying breast cancer. Although existing methods have shown promising\nresults, there is still room for improvement in the classification accuracy and\ngeneralization of IDC using histopathology images. 
We present a novel approach,\nSupervised Contrastive Vision Transformer (SupCon-ViT), for improving the\nclassification of invasive ductal carcinoma in terms of accuracy and\ngeneralization by leveraging the inherent strengths and advantages of both\ntransfer learning, i.e., pre-trained vision transformer, and supervised\ncontrastive learning. Our results on a benchmark breast cancer dataset\ndemonstrate that SupCon-Vit achieves state-of-the-art performance in IDC\nclassification, with an F1-score of 0.8188, precision of 0.7692, and\nspecificity of 0.8971, outperforming existing methods. In addition, the\nproposed model demonstrates resilience in scenarios with minimal labeled data,\nmaking it highly efficient in real-world clinical settings where labelled data\nis limited. Our findings suggest that supervised contrastive learning in\nconjunction with pre-trained vision transformers appears to be a viable\nstrategy for an accurate classification of IDC, thus paving the way for a more\nefficient and reliable diagnosis of breast cancer through histopathological\nimage analysis.\n","authors":["Mohammad Shiri","Monalika Padma Reddy","Jiangwen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.11052v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11098v2","updated":"2024-04-18T01:58:07Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11848v1","updated":"2024-04-18T01:55:44Z","published":"2024-04-18T01:55:44Z","title":"Partial Large Kernel CNNs for Efficient Super-Resolution","summary":" Recently, in the super-resolution (SR) domain, transformers have outperformed\nCNNs with fewer FLOPs and fewer parameters since they can deal with long-range\ndependency and adaptively adjust weights based on instance. 
In this paper, we\ndemonstrate that CNNs, although less focused on in the current SR domain,\nsurpass Transformers in direct efficiency measures. By incorporating the\nadvantages of Transformers into CNNs, we aim to achieve both computational\nefficiency and enhanced performance. However, using a large kernel in the SR\ndomain, which mainly processes large images, incurs a large computational\noverhead. To overcome this, we propose novel approaches to employing the large\nkernel, which can reduce latency by 86\\% compared to the naive large kernel,\nand leverage an Element-wise Attention module to imitate instance-dependent\nweights. As a result, we introduce Partial Large Kernel CNNs for Efficient\nSuper-Resolution (PLKSR), which achieves state-of-the-art performance on four\ndatasets at a scale of $\\times$4, with reductions of 68.1\\% in latency and\n80.2\\% in maximum GPU memory occupancy compared to SRFormer-light.\n","authors":["Dongheon Lee","Seokju Yun","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2404.11848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11843v1","updated":"2024-04-18T01:46:31Z","published":"2024-04-18T01:46:31Z","title":"Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using\n hybrid CNN-Transformer Architecture","summary":" Medical imaging has been used for diagnosis of various conditions, making it\none of the most powerful resources for effective patient care. Due to\nwidespread availability, low cost, and low radiation, chest X-ray is one of the\nmost sought after radiology examination for the diagnosis of various thoracic\ndiseases. Due to advancements in medical imaging technologies and increasing\npatient load, current radiology workflow faces various challenges including\nincreasing backlogs, working long hours, and increase in diagnostic errors. An\nautomated computer-aided diagnosis system that can interpret chest X-rays to\naugment radiologists by providing actionable insights has potential to provide\nsecond opinion to radiologists, highlight relevant regions in the image, in\nturn expediting clinical workflow, reducing diagnostic errors, and improving\npatient care. In this study, we applied a novel architecture augmenting the\nDenseNet121 Convolutional Neural Network (CNN) with multi-head self-attention\nmechanism using transformer, namely SA-DenseNet121, that can identify multiple\nthoracic diseases in chest X-rays. We conducted experiments on four of the\nlargest chest X-ray datasets, namely, ChestX-ray14, CheXpert, MIMIC-CXR-JPG,\nand IU-CXR. Experimental results in terms of area under the receiver operating\ncharacteristics (AUC-ROC) shows that augmenting CNN with self-attention has\npotential in diagnosing different thoracic diseases from chest X-rays. The\nproposed methodology has the potential to support the reading workflow, improve\nefficiency, and reduce diagnostic errors.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2404.11843v1.pdf","comment":"24 pages, 13 Figures, 13 Tables. arXiv admin note: text overlap with\n arXiv:1904.09925 by other authors"},{"id":"http://arxiv.org/abs/2401.12451v2","updated":"2024-04-18T01:37:42Z","published":"2024-01-23T02:30:16Z","title":"Methods and strategies for improving the novel view synthesis quality of\n neural radiation field","summary":" Neural Radiation Field (NeRF) technology can learn a 3D implicit model of a\nscene from 2D images and synthesize realistic novel view images. 
This\ntechnology has received widespread attention from the industry and has good\napplication prospects. In response to the problem that the rendering quality of\nNeRF images needs to be improved, many researchers have proposed various\nmethods to improve the rendering quality in the past three years. The latest\nrelevant papers are classified and reviewed, the technical principles behind\nquality improvement are analyzed, and the future evolution direction of quality\nimprovement methods is discussed. This study can help researchers quickly\nunderstand the current state and evolutionary context of technology in this\nfield, which is helpful in inspiring the development of more efficient\nalgorithms and promoting the application of NeRF technology in related fields.\n","authors":["Shun Fang","Ming Cui","Xing Feng","Yanna Lv"],"pdf_url":"https://arxiv.org/pdf/2401.12451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11031v2","updated":"2024-04-18T01:10:44Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of robots in their applications heavily depends on the\nquality of sensory input. However, designing sensor payloads and their\nparameters for specific robotic tasks is an expensive process that requires\nwell-established sensor knowledge and extensive experiments with physical\nhardware. With cameras playing a pivotal role in robotic perception, we\nintroduce a novel end-to-end optimization approach for co-designing a camera\nwith specific robotic tasks by combining derivative-free and gradient-based\noptimizers. The proposed method leverages recent computer graphics techniques\nand physical camera characteristics to prototype the camera in software,\nsimulate operational environments and tasks for robots, and optimize the camera\ndesign based on the desired tasks in a cost-effective way. We validate the\naccuracy of our camera simulation by comparing it with physical cameras, and\ndemonstrate the design of cameras with stronger performance than common\noff-the-shelf alternatives. Our approach supports the optimization of both\ncontinuous and discrete camera parameters, manufacturing constraints, and can\nbe generalized to a broad range of camera design scenarios including multiple\ncameras and unconventional cameras. This work advances the fully automated\ndesign of cameras for specific robotics tasks.\n","authors":["Chengyang Yan","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11824v1","updated":"2024-04-18T01:10:24Z","published":"2024-04-18T01:10:24Z","title":"TextCenGen: Attention-Guided Text-Centric Background Adaptation for\n Text-to-Image Generation","summary":" Recent advancements in Text-to-image (T2I) generation have witnessed a shift\nfrom adapting text to fixed backgrounds to creating images around text.\nTraditional approaches are often limited to generate layouts within static\nimages for effective text placement. Our proposed approach, TextCenGen,\nintroduces a dynamic adaptation of the blank region for text-friendly image\ngeneration, emphasizing text-centric design and visual harmony generation. Our\nmethod employs force-directed attention guidance in T2I models to generate\nimages that strategically reserve whitespace for pre-defined text areas, even\nfor text or icons at the golden ratio. 
Observing how cross-attention maps\naffect object placement, we detect and repel conflicting objects using a\nforce-directed graph approach, combined with a Spatial Excluding\nCross-Attention Constraint for smooth attention in whitespace areas. As a novel\ntask in graphic design, experiments indicate that TextCenGen outperforms\nexisting methods with more harmonious compositions. Furthermore, our method\nsignificantly enhances T2I model outcomes on our specially collected prompt\ndatasets, catering to varied text positions. These results demonstrate the\nefficacy of TextCenGen in creating more harmonious and integrated text-image\ncompositions.\n","authors":["Tianyi Liang","Jiangqi Liu","Sicheng Song","Shiqi Jiang","Yifei Huang","Changbo Wang","Chenhui Li"],"pdf_url":"https://arxiv.org/pdf/2404.11824v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.11819v1","updated":"2024-04-18T00:41:32Z","published":"2024-04-18T00:41:32Z","title":"Utilizing Adversarial Examples for Bias Mitigation and Accuracy\n Enhancement","summary":" We propose a novel approach to mitigate biases in computer vision models by\nutilizing counterfactual generation and fine-tuning. While counterfactuals have\nbeen used to analyze and address biases in DNN models, the counterfactuals\nthemselves are often generated from biased generative models, which can\nintroduce additional biases or spurious correlations. To address this issue, we\npropose using adversarial images, that is images that deceive a deep neural\nnetwork but not humans, as counterfactuals for fair model training.\n Our approach leverages a curriculum learning framework combined with a\nfine-grained adversarial loss to fine-tune the model using adversarial\nexamples. By incorporating adversarial images into the training data, we aim to\nprevent biases from propagating through the pipeline. We validate our approach\nthrough both qualitative and quantitative assessments, demonstrating improved\nbias mitigation and accuracy compared to existing methods. Qualitatively, our\nresults indicate that post-training, the decisions made by the model are less\ndependent on the sensitive attribute and our model better disentangles the\nrelationship between sensitive attributes and classification variables.\n","authors":["Pushkar Shukla","Dhruv Srikanth","Lee Cohen","Matthew Turk"],"pdf_url":"https://arxiv.org/pdf/2404.11819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11812v1","updated":"2024-04-18T00:18:07Z","published":"2024-04-18T00:18:07Z","title":"Cross-model Mutual Learning for Exemplar-based Medical Image\n Segmentation","summary":" Medical image segmentation typically demands extensive dense annotations for\nmodel training, which is both time-consuming and skill-intensive. To mitigate\nthis burden, exemplar-based medical image segmentation methods have been\nintroduced to achieve effective training with only one annotated image. In this\npaper, we introduce a novel Cross-model Mutual learning framework for\nExemplar-based Medical image Segmentation (CMEMS), which leverages two models\nto mutually excavate implicit information from unlabeled data at multiple\ngranularities. CMEMS can eliminate confirmation bias and enable collaborative\ntraining to learn complementary information by enforcing consistency at\ndifferent granularities across models. 
Concretely, cross-model image\nperturbation based mutual learning is devised by using weakly perturbed images\nto generate high-confidence pseudo-labels, supervising predictions of strongly\nperturbed images across models. This approach enables joint pursuit of\nprediction consistency at the image granularity. Moreover, cross-model\nmulti-level feature perturbation based mutual learning is designed by letting\npseudo-labels supervise predictions from perturbed multi-level features with\ndifferent resolutions, which can broaden the perturbation space and enhance the\nrobustness of our framework. CMEMS is jointly trained using exemplar data,\nsynthetic data, and unlabeled data in an end-to-end manner. Experimental\nresults on two medical image datasets indicate that the proposed CMEMS\noutperforms the state-of-the-art segmentation methods with extremely limited\nsupervision.\n","authors":["Qing En","Yuhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.11812v1.pdf","comment":"AISTATS 2024"},{"id":"http://arxiv.org/abs/2107.09847v2","updated":"2024-04-18T08:11:49Z","published":"2021-07-21T02:33:37Z","title":"CogME: A Cognition-Inspired Multi-Dimensional Evaluation Metric for\n Story Understanding","summary":" We introduce CogME, a cognition-inspired, multi-dimensional evaluation metric\ndesigned for AI models focusing on story understanding. CogME is a framework\ngrounded in human thinking strategies and story elements that involve story\nunderstanding. With a specific breakdown of the questions, this approach\nprovides a nuanced assessment revealing not only AI models' particular\nstrengths and weaknesses but also the characteristics of the benchmark dataset.\nOur case study with the DramaQA dataset demonstrates a refined analysis of the\nmodel and the benchmark dataset. We argue the need for metrics based on\nunderstanding the nature of tasks and designed to align closely with human\ncognitive processes. This approach provides insights beyond traditional overall\nscores and paves the way for more sophisticated AI development targeting higher\ncognitive functions.\n","authors":["Minjung Shin","Seongho Choi","Yu-Jung Heo","Minsu Lee","Byoung-Tak Zhang","Jeh-Kwang Ryu"],"pdf_url":"https://arxiv.org/pdf/2107.09847v2.pdf","comment":"9 pages with 4 figures and 3 tables. This work has been accepted for\n presentation at CogSci 2024 and is currently under revision"},{"id":"http://arxiv.org/abs/2404.12547v1","updated":"2024-04-18T23:52:42Z","published":"2024-04-18T23:52:42Z","title":"Does Gaussian Splatting need SFM Initialization?","summary":" 3D Gaussian Splatting has recently been embraced as a versatile and effective\nmethod for scene reconstruction and novel view synthesis, owing to its\nhigh-quality results and compatibility with hardware rasterization. Despite its\nadvantages, Gaussian Splatting's reliance on high-quality point cloud\ninitialization by Structure-from-Motion (SFM) algorithms is a significant\nlimitation to be overcome. To this end, we investigate various initialization\nstrategies for Gaussian Splatting and delve into how volumetric reconstructions\nfrom Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on\nSFM data. 
Our findings demonstrate that random initialization can perform much\nbetter if carefully designed and that by employing a combination of improved\ninitialization strategies and structure distillation from low-cost NeRF models,\nit is possible to achieve equivalent results, or at times even superior, to\nthose obtained from SFM initialization.\n","authors":["Yalda Foroutan","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.12547v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.04725v2","updated":"2024-04-18T23:40:16Z","published":"2023-08-09T06:03:07Z","title":"Self-supervised Learning of Rotation-invariant 3D Point Set Features\n using Transformer and its Self-distillation","summary":" Invariance against rotations of 3D objects is an important property in\nanalyzing 3D point set data. Conventional 3D point set DNNs having rotation\ninvariance typically obtain accurate 3D shape features via supervised learning\nby using labeled 3D point sets as training samples. However, due to the rapid\nincrease in 3D point set data and the high cost of labeling, a framework to\nlearn rotation-invariant 3D shape features from numerous unlabeled 3D point\nsets is required. This paper proposes a novel self-supervised learning\nframework for acquiring accurate and rotation-invariant 3D point set features\nat object-level. Our proposed lightweight DNN architecture decomposes an input\n3D point set into multiple global-scale regions, called tokens, that preserve\nthe spatial layout of partial shapes composing the 3D object. We employ a\nself-attention mechanism to refine the tokens and aggregate them into an\nexpressive rotation-invariant feature per 3D point set. Our DNN is effectively\ntrained by using pseudo-labels generated by a self-distillation framework. To\nfacilitate the learning of accurate features, we propose to combine multi-crop\nand cut-mix data augmentation techniques to diversify 3D point sets for\ntraining. Through a comprehensive evaluation, we empirically demonstrate that,\n(1) existing rotation-invariant DNN architectures designed for supervised\nlearning do not necessarily learn accurate 3D shape features under a\nself-supervised learning scenario, and (2) our proposed algorithm learns\nrotation-invariant 3D point set features that are more accurate than those\nlearned by existing algorithms. Code is available at\nhttps://github.com/takahikof/RIPT_SDMM\n","authors":["Takahiko Furuya","Zhoujie Chen","Ryutarou Ohbuchi","Zhenzhong Kuang"],"pdf_url":"https://arxiv.org/pdf/2308.04725v2.pdf","comment":"Accepted to the CVIU journal"},{"id":"http://arxiv.org/abs/2404.12541v1","updated":"2024-04-18T23:25:27Z","published":"2024-04-18T23:25:27Z","title":"GenVideo: One-shot Target-image and Shape Aware Video Editing using T2I\n Diffusion Models","summary":" Video editing methods based on diffusion models that rely solely on a text\nprompt for the edit are hindered by the limited expressive power of text\nprompts. Thus, incorporating a reference target image as a visual guide becomes\ndesirable for precise control over edit. Also, most existing methods struggle\nto accurately edit a video when the shape and size of the object in the target\nimage differ from the source object. To address these challenges, we propose\n\"GenVideo\" for editing videos leveraging target-image aware T2I models. 
Our\napproach handles edits with target objects of varying shapes and sizes while\nmaintaining the temporal consistency of the edit using our novel target and\nshape aware InvEdit masks. Further, we propose a novel target-image aware\nlatent noise correction strategy during inference to improve the temporal\nconsistency of the edits. Experimental analyses indicate that GenVideo can\neffectively handle edits with objects of varying shapes, where existing\napproaches fail.\n","authors":["Sai Sree Harsha","Ambareesh Revanur","Dhwanit Agarwal","Shradha Agrawal"],"pdf_url":"https://arxiv.org/pdf/2404.12541v1.pdf","comment":"CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.12538v1","updated":"2024-04-18T23:12:46Z","published":"2024-04-18T23:12:46Z","title":"TrACT: A Training Dynamics Aware Contrastive Learning Framework for\n Long-tail Trajectory Prediction","summary":" As a safety critical task, autonomous driving requires accurate predictions\nof road users' future trajectories for safe motion planning, particularly under\nchallenging conditions. Yet, many recent deep learning methods suffer from a\ndegraded performance on the challenging scenarios, mainly because these\nscenarios appear less frequently in the training data. To address such a\nlong-tail issue, existing methods force challenging scenarios closer together\nin the feature space during training to trigger information sharing among them\nfor more robust learning. These methods, however, primarily rely on the motion\npatterns to characterize scenarios, omitting more informative contextual\ninformation, such as interactions and scene layout. We argue that exploiting\nsuch information not only improves prediction accuracy but also scene\ncompliance of the generated trajectories. In this paper, we propose to\nincorporate richer training dynamics information into a prototypical\ncontrastive learning framework. More specifically, we propose a two-stage\nprocess. First, we generate rich contextual features using a baseline\nencoder-decoder framework. These features are split into clusters based on the\nmodel's output errors, using the training dynamics information, and a prototype\nis computed within each cluster. Second, we retrain the model using the\nprototypes in a contrastive learning framework. We conduct empirical\nevaluations of our approach using two large-scale naturalistic datasets and\nshow that our method achieves state-of-the-art performance by improving\naccuracy and scene compliance on the long-tail samples. Furthermore, we perform\nexperiments on a subset of the clusters to highlight the additional benefit of\nour approach in reducing training bias.\n","authors":["Junrui Zhang","Mozhgan Pourkeshavarz","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2404.12538v1.pdf","comment":"2024 IEEE Intelligent Vehicles Symposium (IV)"},{"id":"http://arxiv.org/abs/2404.12526v1","updated":"2024-04-18T22:01:56Z","published":"2024-04-18T22:01:56Z","title":"Adaptive Memory Replay for Continual Learning","summary":" Foundation Models (FMs) have become the hallmark of modern AI, however, these\nmodels are trained on massive data, leading to financially expensive training.\nUpdating FMs as new data becomes available is important, however, can lead to\n`catastrophic forgetting', where models underperform on tasks related to data\nsub-populations observed too long ago. This continual learning (CL) phenomenon\nhas been extensively studied, but primarily in a setting where only a small\namount of past data can be stored. 
We advocate for the paradigm where memory is\nabundant, allowing us to keep all previous data, but computational resources\nare limited. In this setting, traditional replay-based CL approaches are\noutperformed by a simple baseline which replays past data selected uniformly at\nrandom, indicating that this setting necessitates a new approach. We address\nthis by introducing a framework of adaptive memory replay for continual\nlearning, where sampling of past data is phrased as a multi-armed bandit\nproblem. We utilize Bolzmann sampling to derive a method which dynamically\nselects past data for training conditioned on the current task, assuming full\ndata access and emphasizing training efficiency. Through extensive evaluations\non both vision and language pre-training tasks, we demonstrate the\neffectiveness of our approach, which maintains high performance while reducing\nforgetting by up to 10% at no training efficiency cost.\n","authors":["James Seale Smith","Lazar Valkov","Shaunak Halbe","Vyshnavi Gutta","Rogerio Feris","Zsolt Kira","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2404.12526v1.pdf","comment":"CVPR-W 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2404.12524v1","updated":"2024-04-18T21:55:23Z","published":"2024-04-18T21:55:23Z","title":"DoughNet: A Visual Predictive Model for Topological Manipulation of\n Deformable Objects","summary":" Manipulation of elastoplastic objects like dough often involves topological\nchanges such as splitting and merging. The ability to accurately predict these\ntopological changes that a specific action might incur is critical for planning\ninteractions with elastoplastic objects. We present DoughNet, a\nTransformer-based architecture for handling these challenges, consisting of two\ncomponents. First, a denoising autoencoder represents deformable objects of\nvarying topology as sets of latent codes. Second, a visual predictive model\nperforms autoregressive set prediction to determine long-horizon geometrical\ndeformation and topological changes purely in latent space. Given a partial\ninitial state and desired manipulation trajectories, it infers all resulting\nobject geometries and topologies at each step. DoughNet thereby allows to plan\nrobotic manipulation; selecting a suited tool, its pose and opening width to\nrecreate robot- or human-made goals. Our experiments in simulated and real\nenvironments show that DoughNet is able to significantly outperform related\napproaches that consider deformation only as geometrical change.\n","authors":["Dominik Bauer","Zhenjia Xu","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2404.12524v1.pdf","comment":"Under review. 17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2402.14371v2","updated":"2024-04-18T21:29:39Z","published":"2024-02-22T08:21:46Z","title":"HR-APR: APR-agnostic Framework with Uncertainty Estimation and\n Hierarchical Refinement for Camera Relocalisation","summary":" Absolute Pose Regressors (APRs) directly estimate camera poses from monocular\nimages, but their accuracy is unstable for different queries. Uncertainty-aware\nAPRs provide uncertainty information on the estimated pose, alleviating the\nimpact of these unreliable predictions. However, existing uncertainty modelling\ntechniques are often coupled with a specific APR architecture, resulting in\nsuboptimal performance compared to state-of-the-art (SOTA) APR methods. 
This\nwork introduces a novel APR-agnostic framework, HR-APR, that formulates\nuncertainty estimation as cosine similarity estimation between the query and\ndatabase features. It does not rely on or affect APR network architecture,\nwhich is flexible and computationally efficient. In addition, we take advantage\nof the uncertainty for pose refinement to enhance the performance of APR. The\nextensive experiments demonstrate the effectiveness of our framework, reducing\n27.4\\% and 15.2\\% of computational overhead on the 7Scenes and Cambridge\nLandmarks datasets while maintaining the SOTA accuracy in single-image APRs.\n","authors":["Changkun Liu","Shuai Chen","Yukun Zhao","Huajian Huang","Victor Prisacariu","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2402.14371v2.pdf","comment":"Accepted in in 2024 IEEE International Conference on Robotics and\n Automation (ICRA). Code: https://github.com/lck666666/HR-APR"},{"id":"http://arxiv.org/abs/2404.12509v1","updated":"2024-04-18T21:09:34Z","published":"2024-04-18T21:09:34Z","title":"Compositional Neural Textures","summary":" Texture plays a vital role in enhancing visual richness in both real\nphotographs and computer-generated imagery. However, the process of editing\ntextures often involves laborious and repetitive manual adjustments of textons,\nwhich are the small, recurring local patterns that define textures. In this\nwork, we introduce a fully unsupervised approach for representing textures\nusing a compositional neural model that captures individual textons. We\nrepresent each texton as a 2D Gaussian function whose spatial support\napproximates its shape, and an associated feature that encodes its detailed\nappearance. By modeling a texture as a discrete composition of Gaussian\ntextons, the representation offers both expressiveness and ease of editing.\nTextures can be edited by modifying the compositional Gaussians within the\nlatent space, and new textures can be efficiently synthesized by feeding the\nmodified Gaussians through a generator network in a feed-forward manner. This\napproach enables a wide range of applications, including transferring\nappearance from an image texture to another image, diversifying textures,\ntexture interpolation, revealing/modifying texture variations, edit\npropagation, texture animation, and direct texton manipulation. The proposed\napproach contributes to advancing texture analysis, modeling, and editing\ntechniques, and opens up new possibilities for creating visually appealing\nimages with controllable textures.\n","authors":["Peihan Tu","Li-Yi Wei","Matthias Zwicker"],"pdf_url":"https://arxiv.org/pdf/2404.12509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12501v1","updated":"2024-04-18T20:43:33Z","published":"2024-04-18T20:43:33Z","title":"SPIdepth: Strengthened Pose Information for Self-supervised Monocular\n Depth Estimation","summary":" Self-supervised monocular depth estimation has garnered considerable\nattention for its applications in autonomous driving and robotics. While recent\nmethods have made strides in leveraging techniques like the Self Query Layer\n(SQL) to infer depth from motion, they often overlook the potential of\nstrengthening pose information. In this paper, we introduce SPIdepth, a novel\napproach that prioritizes enhancing the pose network for improved depth\nestimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the\nimportance of pose information in capturing fine-grained scene structures. 
By\nenhancing the pose network's capabilities, SPIdepth achieves remarkable\nadvancements in scene understanding and depth estimation. Experimental results\non benchmark datasets such as KITTI and Cityscapes showcase SPIdepth's\nstate-of-the-art performance, surpassing previous methods by significant\nmargins. Notably, SPIdepth's performance exceeds that of unsupervised models\nand, after finetuning on metric data, outperforms all existing methods.\nRemarkably, SPIdepth achieves these results using only a single image for\ninference, surpassing even methods that utilize video sequences for inference,\nthus demonstrating its efficacy and efficiency in real-world applications. Our\napproach represents a significant leap forward in self-supervised monocular\ndepth estimation, underscoring the importance of strengthening pose information\nfor advancing scene understanding in real-world applications.\n","authors":["Mykola Lavreniuk"],"pdf_url":"https://arxiv.org/pdf/2404.12501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12488v1","updated":"2024-04-18T20:03:56Z","published":"2024-04-18T20:03:56Z","title":"Global Counterfactual Directions","summary":" Despite increasing progress in development of methods for generating visual\ncounterfactual explanations, especially with the recent rise of Denoising\nDiffusion Probabilistic Models, previous works consider them as an entirely\nlocal technique. In this work, we take the first step at globalizing them.\nSpecifically, we discover that the latent space of Diffusion Autoencoders\nencodes the inference process of a given classifier in the form of global\ndirections. We propose a novel proxy-based approach that discovers two types of\nthese directions with the use of only single image in an entirely black-box\nmanner. Precisely, g-directions allow for flipping the decision of a given\nclassifier on an entire dataset of images, while h-directions further increase\nthe diversity of explanations. We refer to them in general as Global\nCounterfactual Directions (GCDs). Moreover, we show that GCDs can be naturally\ncombined with Latent Integrated Gradients resulting in a new black-box\nattribution method, while simultaneously enhancing the understanding of\ncounterfactual explanations. We validate our approach on existing benchmarks\nand show that it generalizes to real-world use-cases.\n","authors":["Bartlomiej Sobieski","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.12488v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.12487v1","updated":"2024-04-18T20:02:52Z","published":"2024-04-18T20:02:52Z","title":"Advancing Applications of Satellite Photogrammetry: Novel Approaches for\n Built-up Area Modeling and Natural Environment Monitoring using\n Stereo/Multi-view Satellite Image-derived 3D Data","summary":" With the development of remote sensing technology in recent decades,\nspaceborne sensors with sub-meter and meter spatial resolution (Worldview and\nPlanetScope) have achieved a considerable image quality to generate 3D\ngeospatial data via a stereo matching pipeline. These achievements have\nsignificantly increased the data accessibility in 3D, necessitating adapting\nthese 3D geospatial data to analyze human and natural environments. 
This\ndissertation explores several novel approaches based on stereo and multi-view\nsatellite image-derived 3D geospatial data, to deal with remote sensing\napplication issues for built-up area modeling and natural environment\nmonitoring, including building model 3D reconstruction, glacier dynamics\ntracking, and lake algae monitoring. Specifically, the dissertation introduces\nfour parts of novel approaches that deal with the spatial and temporal\nchallenges with satellite-derived 3D data. The first study advances LoD-2\nbuilding modeling from satellite-derived Orthophoto and DSMs with a novel\napproach employing a model-driven workflow that generates building rectangular\n3D geometry models. Secondly, we further enhanced our building reconstruction\nframework for dense urban areas and non-rectangular purposes, we implemented\ndeep learning for unit-level segmentation and introduced a gradient-based\ncircle reconstruction for circular buildings to develop a polygon composition\ntechnique for advanced building LoD2 reconstruction. Our third study utilizes\nhigh-spatiotemporal resolution PlanetScope satellite imagery for glacier\ntracking at 3D level in mid-latitude regions. Finally, we proposed a term as\n\"Algal Behavior Function\" to refine the quantification of chlorophyll-a\nconcentrations from satellite imagery in water quality monitoring, addressing\nalgae fluctuations and timing discrepancies between satellite observations and\nfield measurements, thus enhancing the precision of underwater algae volume\nestimates. Overall, this dissertation demonstrates the extensive potential of\nsatellite photogrammetry applications in addressing urban and environmental\nchallenges. It further showcases innovative analytical methodologies that\nenhance the applicability of adapting stereo and multi-view very\nhigh-resolution satellite-derived 3D data. (See full abstract in the document)\n","authors":["Shengxi Gui"],"pdf_url":"https://arxiv.org/pdf/2404.12487v1.pdf","comment":"Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State\n University, 2024, offical version is available in OhioLINK"},{"id":"http://arxiv.org/abs/2403.12459v2","updated":"2024-04-18T19:55:22Z","published":"2024-03-19T05:30:50Z","title":"Non-negative Contrastive Learning","summary":" Deep representations have shown promising performance when transferred to\ndownstream tasks in a black-box manner. Yet, their inherent lack of\ninterpretability remains a significant challenge, as these features are often\nopaque to human understanding. In this paper, we propose Non-negative\nContrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization\n(NMF) aimed at deriving interpretable features. The power of NCL lies in its\nenforcement of non-negativity constraints on features, reminiscent of NMF's\ncapability to extract features that align closely with sample clusters. NCL not\nonly aligns mathematically well with an NMF objective but also preserves NMF's\ninterpretability attributes, resulting in a more sparse and disentangled\nrepresentation compared to standard contrastive learning (CL). Theoretically,\nwe establish guarantees on the identifiability and downstream generalization of\nNCL. Empirically, we show that these advantages enable NCL to outperform CL\nsignificantly on feature disentanglement, feature selection, as well as\ndownstream classification tasks. 
At last, we show that NCL can be easily\nextended to other learning scenarios and benefit supervised learning as well.\nCode is available at https://github.com/PKU-ML/non_neg.\n","authors":["Yifei Wang","Qi Zhang","Yaoyu Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12459v2.pdf","comment":"22 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.17986v2","updated":"2024-04-18T19:44:53Z","published":"2024-02-28T02:06:11Z","title":"PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis","summary":" This paper considers the problem of generative novel view synthesis (GNVS),\ngenerating novel, plausible views of a scene given a limited number of known\nviews. Here, we propose a set-based generative model that can simultaneously\ngenerate multiple, self-consistent new views, conditioned on any number of\nviews. Our approach is not limited to generating a single image at a time and\ncan condition on a variable number of views. As a result, when generating a\nlarge number of views, our method is not restricted to a low-order\nautoregressive generation approach and is better able to maintain generated\nimage quality over large sets of images. We evaluate our model on standard NVS\ndatasets and show that it outperforms the state-of-the-art image-based GNVS\nbaselines. Further, we show that the model is capable of generating sets of\nviews that have no natural sequential ordering, like loops and binocular\ntrajectories, and significantly outperforms other methods on such tasks.\n","authors":["Jason J. Yu","Tristan Aumentado-Armstrong","Fereshteh Forghani","Konstantinos G. Derpanis","Marcus A. Brubaker"],"pdf_url":"https://arxiv.org/pdf/2402.17986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07531v2","updated":"2024-04-18T19:43:25Z","published":"2023-12-12T18:57:46Z","title":"WHAM: Reconstructing World-grounded Humans with Accurate 3D Motion","summary":" The estimation of 3D human motion from video has progressed rapidly but\ncurrent methods still have several key limitations. First, most methods\nestimate the human in camera coordinates. Second, prior work on estimating\nhumans in global coordinates often assumes a flat ground plane and produces\nfoot sliding. Third, the most accurate methods rely on computationally\nexpensive optimization pipelines, limiting their use to offline applications.\nFinally, existing video-based methods are surprisingly less accurate than\nsingle-frame methods. We address these limitations with WHAM (World-grounded\nHumans with Accurate Motion), which accurately and efficiently reconstructs 3D\nhuman motion in a global coordinate system from video. WHAM learns to lift 2D\nkeypoint sequences to 3D using motion capture data and fuses this with video\nfeatures, integrating motion context and visual information. WHAM exploits\ncamera angular velocity estimated from a SLAM method together with human motion\nto estimate the body's global trajectory. We combine this with a contact-aware\ntrajectory refinement method that lets WHAM capture human motion in diverse\nconditions, such as climbing stairs. WHAM outperforms all existing 3D human\nmotion recovery methods across multiple in-the-wild benchmarks. Code will be\navailable for research purposes at http://wham.is.tue.mpg.de/\n","authors":["Soyong Shin","Juyong Kim","Eni Halilaj","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2312.07531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16094v2","updated":"2024-04-18T19:38:18Z","published":"2023-11-27T18:59:02Z","title":"Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person\n Images","summary":" Most existing methods for virtual try-on focus on studio person images with a\nlimited range of poses and clean backgrounds. They can achieve plausible\nresults for this studio try-on setting by learning to warp a garment image to\nfit a person's body from paired training data, i.e., garment images paired with\nimages of people wearing the same garment. Such data is often collected from\ncommercial websites, where each garment is demonstrated both by itself and on\nseveral models. By contrast, it is hard to collect paired data for in-the-wild\nscenes, and therefore, virtual try-on for casual images of people with more\ndiverse poses against cluttered backgrounds is rarely studied.\n In this work, we fill the gap by introducing a StreetTryOn benchmark to\nevaluate in-the-wild virtual try-on performance and proposing a novel method\nthat can learn it without paired data, from a set of in-the-wild person images\ndirectly. Our method achieves robust performance across shop and street domains\nusing a novel DensePose warping correction method combined with diffusion-based\nconditional inpainting. Our experiments show competitive performance for\nstandard studio try-on tasks and SOTA performance for street try-on and\ncross-domain try-on tasks.\n","authors":["Aiyu Cui","Jay Mahajan","Viraj Shah","Preeti Gomathinayagam","Chang Liu","Svetlana Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2311.16094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08514v3","updated":"2024-04-18T19:30:49Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Dataset","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones. 
The dataset, codes, and\npre-trained models will be publicly available at\nhttps://github.com/ronjonxu/NAID.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.00815v2","updated":"2024-04-18T19:22:37Z","published":"2024-03-31T22:18:56Z","title":"Towards Realistic Scene Generation with LiDAR Diffusion Models","summary":" Diffusion models (DMs) excel in photo-realistic image synthesis, but their\nadaptation to LiDAR scene generation poses a substantial hurdle. This is\nprimarily because DMs operating in the point space struggle to preserve the\ncurve-like patterns and 3D geometry of LiDAR scenes, which consumes much of\ntheir representation power. In this paper, we propose LiDAR Diffusion Models\n(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to\ncapture the realism of LiDAR scenes by incorporating geometric priors into the\nlearning pipeline. Our method targets three major desiderata: pattern realism,\ngeometry realism, and object realism. Specifically, we introduce curve-wise\ncompression to simulate real-world LiDAR patterns, point-wise coordinate\nsupervision to learn scene geometry, and patch-wise encoding for a full 3D\nobject context. With these three core designs, our method achieves competitive\nperformance on unconditional LiDAR generation in 64-beam scenario and state of\nthe art on conditional LiDAR generation, while maintaining high efficiency\ncompared to point-based DMs (up to 107$\\times$ faster). Furthermore, by\ncompressing LiDAR scenes into a latent space, we enable the controllability of\nDMs with various conditions such as semantic maps, camera views, and text\nprompts.\n","authors":["Haoxi Ran","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00815v2.pdf","comment":"CVPR 2024. Project link: https://lidar-diffusion.github.io"},{"id":"http://arxiv.org/abs/2404.12467v1","updated":"2024-04-18T19:04:27Z","published":"2024-04-18T19:04:27Z","title":"Towards Multi-modal Transformers in Federated Learning","summary":" Multi-modal transformers mark significant progress in different domains, but\nsiloed high-quality data hinders their further improvement. To remedy this,\nfederated learning (FL) has emerged as a promising privacy-preserving paradigm\nfor training models without direct access to the raw data held by different\nclients. Despite its potential, a considerable research direction regarding the\nunpaired uni-modal clients and the transformer architecture in FL remains\nunexplored. To fill this gap, this paper explores a transfer multi-modal\nfederated learning (MFL) scenario within the vision-language domain, where\nclients possess data of various modalities distributed across different\ndatasets. We systematically evaluate the performance of existing methods when a\ntransformer architecture is utilized and introduce a novel framework called\nFederated modality complementary and collaboration (FedCola) by addressing the\nin-modality and cross-modality gaps among clients. 
Through extensive\nexperiments across various FL settings, FedCola demonstrates superior\nperformance over previous approaches, offering new perspectives on future\nfederated training of multi-modal transformers.\n","authors":["Guangyu Sun","Matias Mendieta","Aritra Dutta","Xin Li","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09611v4","updated":"2024-04-18T18:51:04Z","published":"2024-03-14T17:51:32Z","title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training","summary":" In this work, we discuss building performant Multimodal Large Language Models\n(MLLMs). In particular, we study the importance of various architecture\ncomponents and data choices. Through careful and comprehensive ablations of the\nimage encoder, the vision language connector, and various pre-training data\nchoices, we identified several crucial design lessons. For example, we\ndemonstrate that for large-scale multimodal pre-training using a careful mix of\nimage-caption, interleaved image-text, and text-only data is crucial for\nachieving state-of-the-art (SOTA) few-shot results across multiple benchmarks,\ncompared to other published pre-training results. Further, we show that the\nimage encoder together with image resolution and the image token count has\nsubstantial impact, while the vision-language connector design is of\ncomparatively negligible importance. By scaling up the presented recipe, we\nbuild MM1, a family of multimodal models up to 30B parameters, including both\ndense models and mixture-of-experts (MoE) variants, that are SOTA in\npre-training metrics and achieve competitive performance after supervised\nfine-tuning on a range of established multimodal benchmarks. Thanks to\nlarge-scale pre-training, MM1 enjoys appealing properties such as enhanced\nin-context learning, and multi-image reasoning, enabling few-shot\nchain-of-thought prompting.\n","authors":["Brandon McKinzie","Zhe Gan","Jean-Philippe Fauconnier","Sam Dodge","Bowen Zhang","Philipp Dufter","Dhruti Shah","Xianzhi Du","Futang Peng","Floris Weers","Anton Belyi","Haotian Zhang","Karanjeet Singh","Doug Kang","Ankur Jain","Hongyu Hè","Max Schwarzer","Tom Gunter","Xiang Kong","Aonan Zhang","Jianyu Wang","Chong Wang","Nan Du","Tao Lei","Sam Wiseman","Guoli Yin","Mark Lee","Zirui Wang","Ruoming Pang","Peter Grasch","Alexander Toshev","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.09611v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07398v2","updated":"2024-04-18T18:49:38Z","published":"2024-01-15T00:27:41Z","title":"Cross Domain Early Crop Mapping using CropSTGAN","summary":" Driven by abundant satellite imagery, machine learning-based approaches have\nrecently been promoted to generate high-resolution crop cultivation maps to\nsupport many agricultural applications. One of the major challenges faced by\nthese approaches is the limited availability of ground truth labels. In the\nabsence of ground truth, existing work usually adopts the \"direct transfer\nstrategy\" that trains a classifier using historical labels collected from other\nregions and then applies the trained model to the target region. Unfortunately,\nthe spectral features of crops exhibit inter-region and inter-annual\nvariability due to changes in soil composition, climate conditions, and crop\nprogress, the resultant models perform poorly on new and unseen regions or\nyears. 
Despite recent efforts, such as the application of the deep adaptation\nneural network (DANN) model structure in the deep adaptation crop\nclassification network (DACCN), to tackle the above cross-domain challenges,\ntheir effectiveness diminishes significantly when there is a large\ndissimilarity between the source and target regions. This paper introduces the\nCrop Mapping Spectral-temporal Generative Adversarial Neural Network\n(CropSTGAN), a novel solution for cross-domain challenges, that doesn't require\ntarget domain labels. CropSTGAN learns to transform the target domain's\nspectral features to those of the source domain, effectively bridging large\ndissimilarities. Additionally, it employs an identity loss to maintain the\nintrinsic local structure of the data. Comprehensive experiments across various\nregions and years demonstrate the benefits and effectiveness of the proposed\napproach. In experiments, CropSTGAN is benchmarked against various\nstate-of-the-art (SOTA) methods. Notably, CropSTGAN significantly outperforms\nthese methods in scenarios with large data distribution dissimilarities between\nthe target and source domains.\n","authors":["Yiqun Wang","Hui Huang","Radu State"],"pdf_url":"https://arxiv.org/pdf/2401.07398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15969v2","updated":"2024-04-18T18:48:31Z","published":"2024-01-29T08:58:07Z","title":"Routers in Vision Mixture of Experts: An Empirical Study","summary":" Mixture-of-Experts (MoE) models are a promising way to scale up model\ncapacity without significantly increasing computational cost. A key component\nof MoEs is the router, which decides which subset of parameters (experts)\nprocess which feature embeddings (tokens). In this paper, we present a\ncomprehensive study of routers in MoEs for computer vision tasks. We introduce\na unified MoE formulation that subsumes different MoEs with two parametric\nrouting tensors. This formulation covers both sparse MoE, which uses a binary\nor hard assignment between experts and tokens, and soft MoE, which uses a soft\nassignment between experts and weighted combinations of tokens. Routers for\nsparse MoEs can be further grouped into two variants: Token Choice, which\nmatches experts to each token, and Expert Choice, which matches tokens to each\nexpert. We conduct head-to-head experiments with 6 different routers, including\nexisting routers from prior work and new ones we introduce. We show that (i)\nmany routers originally developed for language modeling can be adapted to\nperform strongly in vision tasks, (ii) in sparse MoE, Expert Choice routers\ngenerally outperform Token Choice routers, and (iii) soft MoEs generally\noutperform sparse MoEs with a fixed compute budget. These results provide new\ninsights regarding the crucial role of routers in vision MoE models.\n","authors":["Tianlin Liu","Mathieu Blondel","Carlos Riquelme","Joan Puigcerver"],"pdf_url":"https://arxiv.org/pdf/2401.15969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03452v5","updated":"2024-04-18T18:26:39Z","published":"2024-03-06T04:36:43Z","title":"D4C Glove-train: Solving the RPM and Bongard-logo Problem by\n Circumscribing and Building Distribution for Concepts","summary":" This paper achieves noteworthy progress in the realm of abstract reasoning,\nparticularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo\nchallenges. Initially, we introduce Lico-Net, a novel baseline model that\nresolves RPM problems with remarkable accuracy. 
Leveraging this foundation, we\nadvance with the D3C approach, which advocates representing the underlying\nconcepts in abstract reasoning problems through distributions. This perspective\nenhances the performance of both Lico-Net and a baseline model excelling in\nBongard-Logo tasks. To bolster the computational efficiency of D3C, we present\nthe D3C-cos variant, offering a streamlined yet precise solution. Furthermore,\nwe propose the D2C method, redefining conceptual boundaries within these\ndomains and bridging the divide between high-level abstractions and their\nlower-dimensional counterparts. Finally, we extend our methodology to D4C,\nemploying adversarial techniques to refine conceptual boundaries further and\ndemonstrate substantial improvements in both RPM and Bongard-Logo challenges.\nOverall, our contributions present a fresh outlook and practical advancements\nin the field of abstract reasoning.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03452v5.pdf","comment":"18 pages, 19 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.12450v1","updated":"2024-04-18T18:25:00Z","published":"2024-04-18T18:25:00Z","title":"Enhancing AI Diagnostics: Autonomous Lesion Masking via Semi-Supervised\n Deep Learning","summary":" This study presents an unsupervised domain adaptation method aimed at\nautonomously generating image masks outlining regions of interest (ROIs) for\ndifferentiating breast lesions in breast ultrasound (US) imaging. Our\nsemi-supervised learning approach utilizes a primitive model trained on a small\npublic breast US dataset with true annotations. This model is then iteratively\nrefined for the domain adaptation task, generating pseudo-masks for our\nprivate, unannotated breast US dataset. The dataset, twice the size of the\npublic one, exhibits considerable variability in image acquisition perspectives\nand demographic representation, posing a domain-shift challenge. Unlike typical\ndomain adversarial training, we employ downstream classification outcomes as a\nbenchmark to guide the updating of pseudo-masks in subsequent iterations. We\nfound the classification precision to be highly correlated with the\ncompleteness of the generated ROIs, which promotes the explainability of the\ndeep learning classification model. Preliminary findings demonstrate the\nefficacy and reliability of this approach in streamlining the ROI annotation\nprocess, thereby enhancing the classification and localization of breast\nlesions for more precise and interpretable diagnoses.\n","authors":["Ting-Ruen Wei","Michele Hell","Dang Bich Thuy Le","Aren Vierra","Ran Pang","Mahesh Patel","Young Kang","Yuling Yan"],"pdf_url":"https://arxiv.org/pdf/2404.12450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12440v1","updated":"2024-04-18T18:01:15Z","published":"2024-04-18T18:01:15Z","title":"Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and\n Drawer Manipulation in Point Clouds","summary":" In recent years, modern techniques in deep learning and large-scale datasets\nhave led to impressive progress in 3D instance segmentation, grasp pose\nestimation, and robotics. This allows for accurate detection directly in 3D\nscenes, object- and environment-aware grasp prediction, as well as robust and\nrepeatable robotic manipulation. This work aims to integrate these recent\nmethods into a comprehensive framework for robotic interaction and manipulation\nin human-centric environments. 
Specifically, we leverage 3D reconstructions\nfrom a commodity 3D scanner for open-vocabulary instance segmentation,\nalongside grasp pose estimation, to demonstrate dynamic picking of objects, and\nopening of drawers. We show the performance and robustness of our model in two\nsets of real-world experiments including dynamic object retrieval and drawer\nopening, reporting a 51% and 82% success rate respectively. Code of our\nframework as well as videos are available on: https://spot-compose.github.io/.\n","authors":["Oliver Lemke","Zuria Bauer","René Zurbrügg","Marc Pollefeys","Francis Engelmann","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2404.12440v1.pdf","comment":"Accepted at ICRA 2024 Workshops. Code and videos available at\n https://spot-compose.github.io/"},{"id":"http://arxiv.org/abs/2404.12500v1","updated":"2024-04-18T20:43:08Z","published":"2024-04-18T20:43:08Z","title":"UIClip: A Data-driven Model for Assessing User Interface Design","summary":" User interface (UI) design is a difficult yet important task for ensuring the\nusability, accessibility, and aesthetic qualities of applications. In our\npaper, we develop a machine-learned model, UIClip, for assessing the design\nquality and visual relevance of a UI given its screenshot and natural language\ndescription. To train UIClip, we used a combination of automated crawling,\nsynthetic augmentation, and human ratings to construct a large-scale dataset of\nUIs, collated by description and ranked by design quality. Through training on\nthe dataset, UIClip implicitly learns properties of good and bad designs by i)\nassigning a numerical score that represents a UI design's relevance and quality\nand ii) providing design suggestions. In an evaluation that compared the\noutputs of UIClip and other baselines to UIs rated by 12 human designers, we\nfound that UIClip achieved the highest agreement with ground-truth rankings.\nFinally, we present three example applications that demonstrate how UIClip can\nfacilitate downstream applications that rely on instantaneous assessment of UI\ndesign quality: i) UI code generation, ii) UI design tips generation, and iii)\nquality-aware UI example search.\n","authors":["Jason Wu","Yi-Hao Peng","Amanda Li","Amanda Swearngin","Jeffrey P. Bigham","Jeffrey Nichols"],"pdf_url":"https://arxiv.org/pdf/2404.12500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05809v2","updated":"2024-04-18T09:47:48Z","published":"2024-02-08T16:47:43Z","title":"You Only Need One Color Space: An Efficient Network for Low-light Image\n Enhancement","summary":" Low-Light Image Enhancement (LLIE) task tends to restore the details and\nvisual information from corrupted low-light images. Most existing methods learn\nthe mapping function between low/normal-light images by Deep Neural Networks\n(DNNs) on sRGB and HSV color space. Nevertheless, enhancement involves\namplifying image signals, and applying these color spaces to low-light images\nwith a low signal-to-noise ratio can introduce sensitivity and instability into\nthe enhancement process. Consequently, this results in the presence of color\nartifacts and brightness artifacts in the enhanced images. To alleviate this\nproblem, we propose a novel trainable color space, named\nHorizontal/Vertical-Intensity (HVI). It not only decouples brightness and color\nfrom RGB channels to mitigate the instability during enhancement but also\nadapts to low-light images in different illumination ranges due to the\ntrainable parameters. 
Further, we design a novel Color and Intensity Decoupling\nNetwork (CIDNet) with two branches dedicated to processing the decoupled image\nbrightness and color in the HVI space. Within CIDNet, we introduce the\nLightweight Cross-Attention (LCA) module to facilitate interaction between\nimage structure and content information in both branches, while also\nsuppressing noise in low-light images. Finally, we conducted 22 quantitative\nand qualitative experiments to show that the proposed CIDNet outperforms the\nstate-of-the-art methods on 11 datasets. The code is available at\nhttps://github.com/Fediory/HVI-CIDNet.\n","authors":["Yixu Feng","Cheng Zhang","Pei Wang","Peng Wu","Qingsen Yan","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.05809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15364v1","updated":"2024-04-18T21:04:39Z","published":"2024-04-18T21:04:39Z","title":"MP-DPD: Low-Complexity Mixed-Precision Neural Networks for\n Energy-Efficient Digital Predistortion of Wideband Power Amplifiers","summary":" Digital Pre-Distortion (DPD) enhances signal quality in wideband RF power\namplifiers (PAs). As signal bandwidths expand in modern radio systems, DPD's\nenergy consumption increasingly impacts overall system efficiency. Deep Neural\nNetworks (DNNs) offer promising advancements in DPD, yet their high complexity\nhinders their practical deployment. This paper introduces open-source\nmixed-precision (MP) neural networks that employ quantized low-precision\nfixed-point parameters for energy-efficient DPD. This approach reduces\ncomputational complexity and memory footprint, thereby lowering power\nconsumption without compromising linearization efficacy. Applied to a 160MHz-BW\n1024-QAM OFDM signal from a digital RF PA, MP-DPD gives no performance loss\nagainst 32-bit floating-point precision DPDs, while achieving -43.75 (L)/-45.27\n(R) dBc in Adjacent Channel Power Ratio (ACPR) and -38.72 dB in Error Vector\nMagnitude (EVM). A 16-bit fixed-point-precision MP-DPD enables a 2.8X reduction\nin estimated inference power. The PyTorch learning and testing code is publicly\navailable at \\url{https://github.com/lab-emi/OpenDPD}.\n","authors":["Yizhuo Wu","Ang Li","Mohammadreza Beikmirza","Gagan Deep Singh","Qinyu Chen","Leo C. N. de Vreede","Morteza Alavi","Chang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.15364v1.pdf","comment":"Accepted to IEEE Microwave and Wireless Technology Letters (MWTL)"}]},"2024-04-19T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.13046v1","updated":"2024-04-19T17:59:48Z","published":"2024-04-19T17:59:48Z","title":"MoVA: Adapting Mixture of Vision Experts to Multimodal Context","summary":" As the key component in multimodal large language models (MLLMs), the ability\nof the visual encoder greatly affects MLLM's understanding on diverse image\ncontent. Although some large-scale pretrained vision encoders such as vision\nencoders in CLIP and DINOv2 have brought promising performance, we found that\nthere is still no single vision encoder that can dominate various image content\nunderstanding, e.g., the CLIP vision encoder leads to outstanding results on\ngeneral image understanding but poor performance on document or chart content.\nTo alleviate the bias of CLIP vision encoder, we first delve into the inherent\nbehavior of different pre-trained vision encoders and then propose the MoVA, a\npowerful and novel MLLM, adaptively routing and fusing task-specific vision\nexperts with a coarse-to-fine mechanism. 
In the coarse-grained stage, we design\na context-aware expert routing strategy to dynamically select the most suitable\nvision experts according to the user instruction, input image, and expertise of\nvision experts. This benefits from the powerful model function understanding\nability of the large language model (LLM) equipped with expert-routing low-rank\nadaptation (LoRA). In the fine-grained stage, we elaborately conduct the\nmixture-of-vision-expert adapter (MoV-Adapter) to extract and fuse\ntask-specific knowledge from various experts. This coarse-to-fine paradigm\neffectively leverages representations from experts based on multimodal context\nand model expertise, further enhancing the generalization ability. We conduct\nextensive experiments to evaluate the effectiveness of the proposed approach.\nWithout any bells and whistles, MoVA can achieve significant performance gains\nover current state-of-the-art methods in a wide range of challenging multimodal\nbenchmarks. Codes and models will be available at\nhttps://github.com/TempleX98/MoVA.\n","authors":["Zhuofan Zong","Bingqi Ma","Dazhong Shen","Guanglu Song","Hao Shao","Dongzhi Jiang","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13044v1","updated":"2024-04-19T17:58:04Z","published":"2024-04-19T17:58:04Z","title":"Unified Scene Representation and Reconstruction for 3D Large Language\n Models","summary":" Enabling Large Language Models (LLMs) to interact with 3D environments is\nchallenging. Existing approaches extract point clouds either from ground truth\n(GT) geometry or 3D scenes reconstructed by auxiliary models. Text-image\naligned 2D features from CLIP are then lifted to point clouds, which serve as\ninputs for LLMs. However, this solution lacks the establishment of 3D\npoint-to-point connections, leading to a deficiency of spatial structure\ninformation. Concurrently, the absence of integration and unification between\nthe geometric and semantic representations of the scene culminates in a\ndiminished level of 3D scene understanding. In this paper, we demonstrate the\nimportance of having a unified scene representation and reconstruction\nframework, which is essential for LLMs in 3D scenes. Specifically, we introduce\nUni3DR^2 extracts 3D geometric and semantic aware representation features via\nthe frozen pre-trained 2D foundation models (e.g., CLIP and SAM) and a\nmulti-scale aggregate 3D decoder. Our learned 3D representations not only\ncontribute to the reconstruction process but also provide valuable knowledge\nfor LLMs. Experimental results validate that our Uni3DR^2 yields convincing\ngains over the baseline on the 3D reconstruction dataset ScanNet (increasing\nF-Score by +1.8\\%). When applied to LLMs, our Uni3DR^2-LLM exhibits superior\nperformance over the baseline on the 3D vision-language understanding dataset\nScanQA (increasing BLEU-1 by +4.0\\% and +4.2\\% on the val set and test set,\nrespectively). 
Furthermore, it outperforms the state-of-the-art method that\nuses additional GT point clouds on both ScanQA and 3DMV-VQA.\n","authors":["Tao Chu","Pan Zhang","Xiaoyi Dong","Yuhang Zang","Qiong Liu","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13044v1.pdf","comment":"Project Page: https://chtsy.github.io/uni3drr-page/"},{"id":"http://arxiv.org/abs/2404.13043v1","updated":"2024-04-19T17:57:29Z","published":"2024-04-19T17:57:29Z","title":"Data Alignment for Zero-Shot Concept Generation in Dermatology AI","summary":" AI in dermatology is evolving at a rapid pace but the major limitation to\ntraining trustworthy classifiers is the scarcity of data with ground-truth\nconcept level labels, which are meta-labels semantically meaningful to humans.\nFoundation models like CLIP providing zero-shot capabilities can help alleviate\nthis challenge by leveraging vast amounts of image-caption pairs available on\nthe internet. CLIP can be fine-tuned using domain specific image-caption pairs\nto improve classification performance. However, CLIP's pre-training data is not\nwell-aligned with the medical jargon that clinicians use to perform diagnoses.\nThe development of large language models (LLMs) in recent years has led to the\npossibility of leveraging the expressive nature of these models to generate\nrich text. Our goal is to use these models to generate caption text that aligns\nwell with both the clinical lexicon and with the natural human language used in\nCLIP's pre-training data. Starting with captions used for images in PubMed\narticles, we extend them by passing the raw captions through an LLM fine-tuned\non the field's several textbooks. We find that using captions generated by an\nexpressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept\nclassification performance.\n","authors":["Soham Gadgil","Mahtab Bigverdi"],"pdf_url":"https://arxiv.org/pdf/2404.13043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13040v1","updated":"2024-04-19T17:53:43Z","published":"2024-04-19T17:53:43Z","title":"Analysis of Classifier-Free Guidance Weight Schedulers","summary":" Classifier-Free Guidance (CFG) enhances the quality and condition adherence\nof text-to-image diffusion models. It operates by combining the conditional and\nunconditional predictions using a fixed weight. However, recent works vary the\nweights throughout the diffusion process, reporting superior results but\nwithout providing any rationale or analysis. By conducting comprehensive\nexperiments, this paper provides insights into CFG weight schedulers. Our\nfindings suggest that simple, monotonically increasing weight schedulers\nconsistently lead to improved performances, requiring merely a single line of\ncode. 
In addition, more complex parametrized schedulers can be optimized for\nfurther improvement, but do not generalize across different models and tasks.\n","authors":["Xi Wang","Nicolas Dufour","Nefeli Andreou","Marie-Paule Cani","Victoria Fernandez Abrevaya","David Picard","Vicky Kalogeiton"],"pdf_url":"https://arxiv.org/pdf/2404.13040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13039v1","updated":"2024-04-19T17:51:52Z","published":"2024-04-19T17:51:52Z","title":"LaPA: Latent Prompt Assist Model For Medical Visual Question Answering","summary":" Medical visual question answering (Med-VQA) aims to automate the prediction\nof correct answers for medical images and questions, thereby assisting\nphysicians in reducing repetitive tasks and alleviating their workload.\nExisting approaches primarily focus on pre-training models using additional and\ncomprehensive datasets, followed by fine-tuning to enhance performance in\ndownstream tasks. However, there is also significant value in exploring\nexisting models to extract clinically relevant information. In this paper, we\npropose the Latent Prompt Assist model (LaPA) for medical visual question\nanswering. Firstly, we design a latent prompt generation module to generate the\nlatent prompt with the constraint of the target answer. Subsequently, we\npropose a multi-modal fusion block with latent prompt fusion module that\nutilizes the latent prompt to extract clinical-relevant information from\nuni-modal and multi-modal features. Additionally, we introduce a prior\nknowledge fusion module to integrate the relationship between diseases and\norgans with the clinical-relevant information. Finally, we combine the final\nintegrated information with image-language cross-modal information to predict\nthe final answers. Experimental results on three publicly available Med-VQA\ndatasets demonstrate that LaPA outperforms the state-of-the-art model ARL,\nachieving improvements of 1.83%, 0.63%, and 1.80% on VQA-RAD, SLAKE, and\nVQA-2019, respectively. The code is publicly available at\nhttps://github.com/GaryGuTC/LaPA_model.\n","authors":["Tiancheng Gu","Kaicheng Yang","Dongnan Liu","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2404.13039v1.pdf","comment":"10 pages, 4 figures, Accepted by CVPRW2024"},{"id":"http://arxiv.org/abs/2404.13026v1","updated":"2024-04-19T17:41:05Z","published":"2024-04-19T17:41:05Z","title":"PhysDreamer: Physics-Based Interaction with 3D Objects via Video\n Generation","summary":" Realistic object interactions are crucial for creating immersive virtual\nexperiences, yet synthesizing realistic 3D object dynamics in response to novel\ninteractions remains a significant challenge. Unlike unconditional or\ntext-conditioned dynamics generation, action-conditioned dynamics requires\nperceiving the physical material properties of objects and grounding the 3D\nmotion prediction on these properties, such as object stiffness. However,\nestimating physical material properties is an open problem due to the lack of\nmaterial ground-truth data, as measuring these properties for real objects is\nhighly difficult. We present PhysDreamer, a physics-based approach that endows\nstatic 3D objects with interactive dynamics by leveraging the object dynamics\npriors learned by video generation models. By distilling these priors,\nPhysDreamer enables the synthesis of realistic object responses to novel\ninteractions, such as external forces or agent manipulations. 
We demonstrate\nour approach on diverse examples of elastic objects and evaluate the realism of\nthe synthesized interactions through a user study. PhysDreamer takes a step\ntowards more engaging and realistic virtual experiences by enabling static 3D\nobjects to dynamically respond to interactive stimuli in a physically plausible\nmanner. See our project page at https://physdreamer.github.io/.\n","authors":["Tianyuan Zhang","Hong-Xing Yu","Rundi Wu","Brandon Y. Feng","Changxi Zheng","Noah Snavely","Jiajun Wu","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2404.13026v1.pdf","comment":"Project website at: https://physdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.13024v1","updated":"2024-04-19T17:39:50Z","published":"2024-04-19T17:39:50Z","title":"BANF: Band-limited Neural Fields for Levels of Detail Reconstruction","summary":" Largely due to their implicit nature, neural fields lack a direct mechanism\nfor filtering, as Fourier analysis from discrete signal processing is not\ndirectly applicable to these representations. Effective filtering of neural\nfields is critical to enable level-of-detail processing in downstream\napplications, and support operations that involve sampling the field on regular\ngrids (e.g. marching cubes). Existing methods that attempt to decompose neural\nfields in the frequency domain either resort to heuristics or require extensive\nmodifications to the neural field architecture. We show that via a simple\nmodification, one can obtain neural fields that are low-pass filtered, and in\nturn show how this can be exploited to obtain a frequency decomposition of the\nentire signal. We demonstrate the validity of our technique by investigating\nlevel-of-detail reconstruction, and showing how coarser representations can be\ncomputed effectively.\n","authors":["Ahan Shabanov","Shrisudhan Govindarajan","Cody Reading","Lily Goli","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.13024v1.pdf","comment":"Project Page: https://theialab.github.io/banf"},{"id":"http://arxiv.org/abs/2404.13016v1","updated":"2024-04-19T17:25:43Z","published":"2024-04-19T17:25:43Z","title":"Optimizing Calibration by Gaining Aware of Prediction Correctness","summary":" Model calibration aims to align confidence with prediction correctness. The\nCross-Entropy (CE) loss is widely used for calibrator training, which enforces\nthe model to increase confidence on the ground truth class. However, we find\nthe CE loss has intrinsic limitations. For example, for a narrow\nmisclassification, a calibrator trained by the CE loss often produces high\nconfidence on the wrongly predicted class (e.g., a test sample is wrongly\nclassified and its softmax score on the ground truth class is around 0.4),\nwhich is undesirable. In this paper, we propose a new post-hoc calibration\nobjective derived from the aim of calibration. Intuitively, the proposed\nobjective function asks that the calibrator decrease model confidence on\nwrongly predicted samples and increase confidence on correctly predicted\nsamples. Because a sample itself has insufficient ability to indicate\ncorrectness, we use its transformed versions (e.g., rotated, greyscaled and\ncolor-jittered) during calibrator training. Trained on an in-distribution\nvalidation set and tested with isolated, individual test samples, our method\nachieves competitive calibration performance on both in-distribution and\nout-of-distribution test sets compared with the state of the art. 
Further, our\nanalysis points out the difference between our method and commonly used\nobjectives such as CE loss and mean square error loss, where the latter\nsometimes deviate from the calibration aim.\n","authors":["Yuchi Liu","Lei Wang","Yuli Zou","James Zou","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13013v1","updated":"2024-04-19T17:22:51Z","published":"2024-04-19T17:22:51Z","title":"Groma: Localized Visual Tokenization for Grounding Multimodal Large\n Language Models","summary":" We introduce Groma, a Multimodal Large Language Model (MLLM) with grounded\nand fine-grained visual perception ability. Beyond holistic image\nunderstanding, Groma is adept at region-level tasks such as region captioning\nand visual grounding. Such capabilities are built upon a localized visual\ntokenization mechanism, where an image input is decomposed into regions of\ninterest and subsequently encoded into region tokens. By integrating region\ntokens into user instructions and model responses, we seamlessly enable Groma\nto understand user-specified region inputs and ground its textual output to\nimages. Besides, to enhance the grounded chat ability of Groma, we curate a\nvisually grounded instruction dataset by leveraging the powerful GPT-4V and\nvisual prompting techniques. Compared with MLLMs that rely on the language\nmodel or external module for localization, Groma consistently demonstrates\nsuperior performances in standard referring and grounding benchmarks,\nhighlighting the advantages of embedding localization into image tokenization.\nProject page: https://groma-mllm.github.io/.\n","authors":["Chuofan Ma","Yi Jiang","Jiannan Wu","Zehuan Yuan","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2404.13013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13002v1","updated":"2024-04-19T16:59:04Z","published":"2024-04-19T16:59:04Z","title":"Towards Robust Ferrous Scrap Material Classification with Deep Learning\n and Conformal Prediction","summary":" In the steel production domain, recycling ferrous scrap is essential for\nenvironmental and economic sustainability, as it reduces both energy\nconsumption and greenhouse gas emissions. However, the classification of scrap\nmaterials poses a significant challenge, requiring advancements in automation\ntechnology. Additionally, building trust among human operators is a major\nobstacle. Traditional approaches often fail to quantify uncertainty and lack\nclarity in model decision-making, which complicates acceptance. In this\narticle, we describe how conformal prediction can be employed to quantify\nuncertainty and add robustness in scrap classification. We have adapted the\nSplit Conformal Prediction technique to seamlessly integrate with\nstate-of-the-art computer vision models, such as the Vision Transformer (ViT),\nSwin Transformer, and ResNet-50, while also incorporating Explainable\nArtificial Intelligence (XAI) methods. We evaluate the approach using a\ncomprehensive dataset of 8147 images spanning nine ferrous scrap classes. The\napplication of the Split Conformal Prediction method allowed for the\nquantification of each model's uncertainties, which enhanced the understanding\nof predictions and increased the reliability of the results. Specifically, the\nSwin Transformer model demonstrated more reliable outcomes than the others, as\nevidenced by its smaller average size of prediction sets and achieving an\naverage classification accuracy exceeding 95%. 
Furthermore, the Score-CAM\nmethod proved highly effective in clarifying visual features, significantly\nenhancing the explainability of the classification decisions.\n","authors":["Paulo Henrique dos Santos","Valéria de Carvalho Santos","Eduardo José da Silva Luz"],"pdf_url":"https://arxiv.org/pdf/2404.13002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13000v1","updated":"2024-04-19T16:55:12Z","published":"2024-04-19T16:55:12Z","title":"RadRotator: 3D Rotation of Radiographs with Diffusion Models","summary":" Transforming two-dimensional (2D) images into three-dimensional (3D) volumes\nis a well-known yet challenging problem for the computer vision community. In\nthe medical domain, a few previous studies attempted to convert two or more\ninput radiographs into computed tomography (CT) volumes. Following their\neffort, we introduce a diffusion model-based technology that can rotate the\nanatomical content of any input radiograph in 3D space, potentially enabling\nthe visualization of the entire anatomical content of the radiograph from any\nviewpoint in 3D. Similar to previous studies, we used CT volumes to create\nDigitally Reconstructed Radiographs (DRRs) as the training data for our model.\nHowever, we addressed two significant limitations encountered in previous\nstudies: 1. We utilized conditional diffusion models with classifier-free\nguidance instead of Generative Adversarial Networks (GANs) to achieve higher\nmode coverage and improved output image quality, with the only trade-off being\nslower inference time, which is often less critical in medical applications;\nand 2. We demonstrated that the unreliable output of style transfer deep\nlearning (DL) models, such as Cycle-GAN, to transfer the style of actual\nradiographs to DRRs could be replaced with a simple yet effective training\ntransformation that randomly changes the pixel intensity histograms of the\ninput and ground-truth imaging data during training. This transformation makes\nthe diffusion model agnostic to any distribution variations of the input data\npixel intensity, enabling the reliable training of a DL model on input DRRs and\napplying the exact same model to conventional radiographs (or DRRs) during\ninference.\n","authors":["Pouria Rouzrokh","Bardia Khosravi","Shahriar Faghani","Kellen L. Mulford","Michael J. Taunton","Bradley J. Erickson","Cody C. Wyles"],"pdf_url":"https://arxiv.org/pdf/2404.13000v1.pdf","comment":"Website: https://pouriarouzrokh.github.io/RadRotator Online demo:\n https://huggingface.co/spaces/Pouriarouzrokh/RadRotator Article information:\n 16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.11769v2","updated":"2024-04-19T16:50:05Z","published":"2024-04-17T21:52:21Z","title":"QGen: On the Ability to Generalize in Quantization Aware Training","summary":" Quantization lowers memory usage, computational requirements, and latency by\nutilizing fewer bits to represent model weights and activations. In this work,\nwe investigate the generalization properties of quantized neural networks, a\ncharacteristic that has received little attention despite its implications on\nmodel performance. In particular, first, we develop a theoretical model for\nquantization in neural networks and demonstrate how quantization functions as a\nform of regularization. Second, motivated by recent work connecting the\nsharpness of the loss landscape and generalization, we derive an approximate\nbound for the generalization of quantized models conditioned on the amount of\nquantization noise. 
We then validate our hypothesis by experimenting with over\n2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on\nconvolutional and transformer-based models.\n","authors":["MohammadHossein AskariHemmat","Ahmadreza Jeddi","Reyhane Askari Hemmat","Ivan Lazarevich","Alexander Hoffman","Sudhakar Sah","Ehsan Saboori","Yvon Savaria","Jean-Pierre David"],"pdf_url":"https://arxiv.org/pdf/2404.11769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12986v1","updated":"2024-04-19T16:36:21Z","published":"2024-04-19T16:36:21Z","title":"Nuclei Instance Segmentation of Cryosectioned H&E Stained Histological\n Images using Triple U-Net Architecture","summary":" Nuclei instance segmentation is crucial in oncological diagnosis and cancer\npathology research. H&E stained images are commonly used for medical diagnosis,\nbut pre-processing is necessary before using them for image processing tasks.\nTwo principal pre-processing methods are formalin-fixed paraffin-embedded\nsamples (FFPE) and frozen tissue samples (FS). While FFPE is widely used, it is\ntime-consuming, while FS samples can be processed quickly. Analyzing H&E\nstained images derived from fast sample preparation, staining, and scanning can\npose difficulties due to the swift process, which can result in the degradation\nof image quality. This paper proposes a method that leverages the unique\noptical characteristics of H&E stained images. A three-branch U-Net\narchitecture has been implemented, where each branch contributes to the final\nsegmentation results. The process includes applying watershed algorithm to\nseparate overlapping regions and enhance accuracy. The Triple U-Net\narchitecture comprises an RGB branch, a Hematoxylin branch, and a Segmentation\nbranch. This study focuses on a novel dataset named CryoNuSeg. The results\nobtained through robust experiments outperform the state-of-the-art results\nacross various metrics. The benchmark score for this dataset is AJI 52.5 and PQ\n47.7, achieved through the implementation of U-Net Architecture. However, the\nproposed Triple U-Net architecture achieves an AJI score of 67.41 and PQ of\n50.56. The proposed architecture improves more on AJI than other evaluation\nmetrics, which further justifies the superiority of the Triple U-Net\narchitecture over the baseline U-Net model, as AJI is a more strict evaluation\nmetric. The use of the three-branch U-Net model, followed by watershed\npost-processing, significantly surpasses the benchmark scores, showing\nsubstantial improvement in the AJI score\n","authors":["Zarif Ahmed","Chowdhury Nur E Alam Siddiqi","Fardifa Fathmiul Alam","Tasnim Ahmed","Tareque Mohmud Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2404.12986v1.pdf","comment":"To be published in \"6th IVPR & 11th ICIEV\""},{"id":"http://arxiv.org/abs/2301.00812v5","updated":"2024-04-19T16:10:40Z","published":"2022-12-16T01:04:52Z","title":"One-shot skill assessment in high-stakes domains with limited data via\n meta learning","summary":" Deep Learning (DL) has achieved robust competency assessment in various\nhigh-stakes fields. However, the applicability of DL models is often hampered\nby their substantial data requirements and confinement to specific training\ndomains. This prevents them from transitioning to new tasks where data is\nscarce. Therefore, domain adaptation emerges as a critical element for the\npractical implementation of DL in real-world scenarios. 
Herein, we introduce\nA-VBANet, a novel meta-learning model capable of delivering domain-agnostic\nskill assessment via one-shot learning. Our methodology has been tested by\nassessing surgical skills on five laparoscopic and robotic simulators and\nreal-life laparoscopic cholecystectomy. Our model successfully adapted with\naccuracies up to 99.5% in one-shot and 99.9% in few-shot settings for simulated\ntasks and 89.7% for laparoscopic cholecystectomy. This study marks the first\ninstance of a domain-agnostic methodology for skill assessment in critical\nfields setting a precedent for the broad application of DL across diverse\nreal-life domains with limited data.\n","authors":["Erim Yanik","Steven Schwaitzberg","Gene Yang","Xavier Intes","Jack Norfleet","Matthew Hackett","Suvranu De"],"pdf_url":"https://arxiv.org/pdf/2301.00812v5.pdf","comment":"23 pages (Main Manuscript + Supplementary Materials + Arxiv Logs), 4\n figures (+2 Supplementary Figures), 2 tables (+5 Supplementary Tables)"},{"id":"http://arxiv.org/abs/2404.12973v1","updated":"2024-04-19T16:01:00Z","published":"2024-04-19T16:01:00Z","title":"Cross-modal Diffusion Modelling for Super-resolved Spatial\n Transcriptomics","summary":" The recent advancement of spatial transcriptomics (ST) allows to characterize\nspatial gene expression within tissue for discovery research. However, current\nST platforms suffer from low resolution, hindering in-depth understanding of\nspatial gene expression. Super-resolution approaches promise to enhance ST maps\nby integrating histology images with gene expressions of profiled tissue spots.\nHowever, current super-resolution methods are limited by restoration\nuncertainty and mode collapse. Although diffusion models have shown promise in\ncapturing complex interactions between multi-modal conditions, it remains a\nchallenge to integrate histology images and gene expression for super-resolved\nST maps. This paper proposes a cross-modal conditional diffusion model for\nsuper-resolving ST maps with the guidance of histology images. Specifically, we\ndesign a multi-modal disentangling network with cross-modal adaptive modulation\nto utilize complementary information from histology images and spatial gene\nexpression. Moreover, we propose a dynamic cross-attention modelling strategy\nto extract hierarchical cell-to-tissue information from histology images.\nLastly, we propose a co-expression-based gene-correlation graph network to\nmodel the co-expression relationship of multiple genes. Experiments show that\nour method outperforms other state-of-the-art methods in ST super-resolution on\nthree public datasets.\n","authors":["Xiaofei Wang","Xingxu Huang","Stephen J. Price","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2404.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12966v1","updated":"2024-04-19T15:53:27Z","published":"2024-04-19T15:53:27Z","title":"Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of\n Multi-modal Large Language Models","summary":" Counterfactual reasoning, as a crucial manifestation of human intelligence,\nrefers to making presuppositions based on established facts and extrapolating\npotential outcomes. Existing multimodal large language models (MLLMs) have\nexhibited impressive cognitive and reasoning capabilities, which have been\nexamined across a wide range of Visual Question Answering (VQA) benchmarks.\nNevertheless, how will existing MLLMs perform when faced with counterfactual\nquestions? 
To answer this question, we first curate a novel\n\textbf{C}ounter\textbf{F}actual \textbf{M}ulti\textbf{M}odal reasoning\nbenchmark, abbreviated as \textbf{CFMM}, to systematically assess the\ncounterfactual reasoning capabilities of MLLMs. Our CFMM comprises six\nchallenging tasks, each including hundreds of carefully human-labeled\ncounterfactual questions, to evaluate MLLM's counterfactual reasoning\ncapabilities across diverse aspects. Through experiments, interestingly, we\nfind that existing MLLMs prefer to believe what they see, but ignore the\ncounterfactual presuppositions presented in the question, thereby leading to\ninaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs\non our proposed CFMM. The significant gap between their performance on our CFMM\nand that on several VQA benchmarks indicates that there is still considerable\nroom for improvement in existing MLLMs toward approaching human-level\nintelligence. On the other hand, through boosting MLLMs performances on our\nCFMM in the future, potential avenues toward developing MLLMs with advanced\nintelligence can be explored.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.12966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12958v1","updated":"2024-04-19T15:40:47Z","published":"2024-04-19T15:40:47Z","title":"Improving Pediatric Pneumonia Diagnosis with Adult Chest X-ray Images\n Utilizing Contrastive Learning and Embedding Similarity","summary":" Despite the advancement of deep learning-based computer-aided diagnosis (CAD)\nmethods for pneumonia from adult chest x-ray (CXR) images, the performance of\nCAD methods applied to pediatric images remains suboptimal, mainly due to the\nlack of large-scale annotated pediatric imaging datasets. Establishing a proper\nframework to leverage existing adult large-scale CXR datasets can thus enhance\npediatric pneumonia detection performance. In this paper, we propose a\nthree-branch parallel path learning-based framework that utilizes both adult\nand pediatric datasets to improve the performance of deep learning models on\npediatric test datasets. The paths are trained with pediatric only, adult only,\nand both types of CXRs, respectively. Our proposed framework utilizes the\nmulti-positive contrastive loss to cluster the classwise embeddings and the\nembedding similarity loss among these three parallel paths to make the\nclasswise embeddings as close as possible to reduce the effect of domain shift.\nExperimental evaluations on open-access adult and pediatric CXR datasets show\nthat the proposed method achieves a superior AUROC score of 0.8464 compared to\n0.8348 obtained using the conventional approach of joint training on both\ndatasets. 
The proposed approach thus paves the way for generalized CAD models\nthat are effective for both adult and pediatric age groups.\n","authors":["Mohammad Zunaed","Anwarul Hasan","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2404.12958v1.pdf","comment":"Accepted to International Conference of IEEE Engineering in Medicine\n and Biology Society (EMBC), 2024"},{"id":"http://arxiv.org/abs/2404.04876v2","updated":"2024-04-19T15:33:44Z","published":"2024-04-07T08:46:06Z","title":"HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and\n Low-Frequency Information of Parametric Models","summary":" Reconstructing 3D clothed human involves creating a detailed geometry of\nindividuals in clothing, with applications ranging from virtual try-on, movies,\nto games. To enable practical and widespread applications, recent advances\npropose to generate a clothed human from an RGB image. However, they struggle\nto reconstruct detailed and robust avatars simultaneously. We empirically find\nthat the high-frequency (HF) and low-frequency (LF) information from a\nparametric model has the potential to enhance geometry details and improve\nrobustness to noise, respectively. Based on this, we propose HiLo, namely\nclothed human reconstruction with high- and low-frequency information, which\ncontains two components. 1) To recover detailed geometry using HF information,\nwe propose a progressive HF Signed Distance Function to enhance the detailed 3D\ngeometry of a clothed human. We analyze that our progressive learning manner\nalleviates large gradients that hinder model convergence. 2) To achieve robust\nreconstruction against inaccurate estimation of the parametric model by using\nLF information, we propose a spatial interaction implicit function. This\nfunction effectively exploits the complementary spatial information from a\nlow-resolution voxel grid of the parametric model. Experimental results\ndemonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and\n9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets,\nrespectively. Additionally, HiLo demonstrates robustness to noise from the\nparametric model, challenging poses, and various clothing styles.\n","authors":["Yifan Yang","Dong Liu","Shuhai Zhang","Zeshuai Deng","Zixiong Huang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04876v2.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.12948v1","updated":"2024-04-19T15:26:36Z","published":"2024-04-19T15:26:36Z","title":"Next Generation Loss Function for Image Classification","summary":" Neural networks are trained by minimizing a loss function that defines the\ndiscrepancy between the predicted model output and the target value. The\nselection of the loss function is crucial to achieve task-specific behaviour\nand highly influences the capability of the model. A variety of loss functions\nhave been proposed for a wide range of tasks affecting training and model\nperformance. For classification tasks, the cross entropy is the de-facto\nstandard and usually the first choice. Here, we try to experimentally challenge\nthe well-known loss functions, including cross entropy (CE) loss, by utilizing\nthe genetic programming (GP) approach, a population-based evolutionary\nalgorithm. GP constructs loss functions from a set of operators and leaf nodes\nand these functions are repeatedly recombined and mutated to find an optimal\nstructure. 
Experiments were carried out on different small-sized datasets\nCIFAR-10, CIFAR-100 and Fashion-MNIST using an Inception model. The 5 best\nfunctions found were evaluated for different model architectures on a set of\nstandard datasets ranging from 2 to 102 classes and very different sizes. One\nfunction, denoted as Next Generation Loss (NGL), clearly stood out showing same\nor better performance for all tested datasets compared to CE. To evaluate the\nNGL function on a large-scale dataset, we tested its performance on the\nImagenet-1k dataset where it showed improved top-1 accuracy compared to models\ntrained with identical settings and other losses. Finally, the NGL was trained\non a segmentation downstream task for Pascal VOC 2012 and COCO-Stuff164k\ndatasets improving the underlying model performance.\n","authors":["Shakhnaz Akhmedova","Nils Körber"],"pdf_url":"https://arxiv.org/pdf/2404.12948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00132v3","updated":"2024-04-19T15:23:43Z","published":"2023-09-29T20:48:44Z","title":"QDFormer: Towards Robust Audiovisual Segmentation in Complex\n Environments with Quantization-based Semantic Decomposition","summary":" Audiovisual segmentation (AVS) is a challenging task that aims to segment\nvisual objects in videos according to their associated acoustic cues. With\nmultiple sound sources and background disturbances involved, establishing\nrobust correspondences between audio and visual contents poses unique\nchallenges due to (1) complex entanglement across sound sources and (2)\nfrequent changes in the occurrence of distinct sound events. Assuming sound\nevents occur independently, the multi-source semantic space can be represented\nas the Cartesian product of single-source sub-spaces. We are motivated to\ndecompose the multi-source audio semantics into single-source semantics for\nmore effective interactions with visual content. We propose a semantic\ndecomposition method based on product quantization, where the multi-source\nsemantics can be decomposed and represented by several disentangled and\nnoise-suppressed single-source semantics. Furthermore, we introduce a\nglobal-to-local quantization mechanism, which distills knowledge from stable\nglobal (clip-level) features into local (frame-level) ones, to handle frequent\nchanges in audio semantics. Extensive experiments demonstrate that our\nsemantically decomposed audio representation significantly improves AVS\nperformance, e.g., +21.2% mIoU on the challenging AVS-Semantic benchmark with\nResNet50 backbone. https://github.com/lxa9867/QSD.\n","authors":["Xiang Li","Jinglu Wang","Xiaohao Xu","Xiulian Peng","Rita Singh","Yan Lu","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2310.00132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12942v1","updated":"2024-04-19T15:16:04Z","published":"2024-04-19T15:16:04Z","title":"Purposer: Putting Human Motion Generation in Context","summary":" We present a novel method to generate human motion to populate 3D indoor\nscenes. It can be controlled with various combinations of conditioning signals\nsuch as a path in a scene, target poses, past motions, and scenes represented\nas 3D point clouds. State-of-the-art methods are either models specialized to\none single setting, require vast amounts of high-quality and diverse training\ndata, or are unconditional models that do not integrate scene or other\ncontextual information. As a consequence, they have limited applicability and\nrely on costly training data. 
To address these limitations, we propose a new\nmethod, dubbed Purposer, based on neural discrete representation learning. Our\nmodel is capable of exploiting, in a flexible manner, different types of\ninformation already present in open access large-scale datasets such as AMASS.\nFirst, we encode unconditional human motion into a discrete latent space.\nSecond, an autoregressive generative model, conditioned with key contextual\ninformation, either with prompting or additive tokens, and trained for\nnext-step prediction in this space, synthesizes sequences of latent indices. We\nfurther design a novel conditioning block to handle future conditioning\ninformation in such a causal model by using a network with two branches to\ncompute separate stacks of features. In this manner, Purposer can generate\nrealistic motion sequences in diverse test scenes. Through exhaustive\nevaluation, we demonstrate that our multi-contextual solution outperforms\nexisting specialized approaches for specific contextual information, both in\nterms of quality and diversity. Our model is trained with short sequences, but\na byproduct of being able to use various conditioning signals is that at test\ntime different combinations can be used to chain short sequences together and\ngenerate long motions within a context scene.\n","authors":["Nicolas Ugrinovic","Thomas Lucas","Fabien Baradel","Philippe Weinzaepfel","Gregory Rogez","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2404.12942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12940v1","updated":"2024-04-19T15:10:54Z","published":"2024-04-19T15:10:54Z","title":"Neural Flow Diffusion Models: Learnable Forward Process for Improved\n Diffusion Modelling","summary":" Conventional diffusion models typically rely on a fixed forward process,\nwhich implicitly defines complex marginal distributions over latent variables.\nThis can often complicate the reverse process' task in learning generative\ntrajectories, and results in costly inference for diffusion models. To address\nthese limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel\nframework that enhances diffusion models by supporting a broader range of\nforward processes beyond the fixed linear Gaussian. We also propose a novel\nparameterization technique for learning the forward process. Our framework\nprovides an end-to-end, simulation-free optimization objective, effectively\nminimizing a variational upper bound on the negative log-likelihood.\nExperimental results demonstrate NFDM's strong performance, evidenced by\nstate-of-the-art likelihood estimation. Furthermore, we investigate NFDM's\ncapacity for learning generative dynamics with specific characteristics, such\nas deterministic straight lines trajectories. This exploration underscores\nNFDM's versatility and its potential for a wide range of applications.\n","authors":["Grigory Bartosh","Dmitry Vetrov","Christian A. Naesseth"],"pdf_url":"https://arxiv.org/pdf/2404.12940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12925v1","updated":"2024-04-19T14:52:25Z","published":"2024-04-19T14:52:25Z","title":"A Hybrid Generative and Discriminative PointNet on Unordered Point Sets","summary":" As point cloud provides a natural and flexible representation usable in\nmyriad applications (e.g., robotics and self-driving cars), the ability to\nsynthesize point clouds for analysis becomes crucial. Recently, Xie et al.\npropose a generative model for unordered point sets in the form of an\nenergy-based model (EBM). 
Despite the model achieving an impressive performance\nfor point cloud generation, one separate model needs to be trained for each\ncategory to capture the complex point set distributions. Besides, their method\nis unable to classify point clouds directly and requires additional fine-tuning\nfor classification. One interesting question is: Can we train a single network\nfor a hybrid generative and discriminative model of point clouds? A similar\nquestion has recently been answered in the affirmative for images, introducing\nthe framework of Joint Energy-based Model (JEM), which achieves high\nperformance in image classification and generation simultaneously. This paper\nproposes GDPNet, the first hybrid Generative and Discriminative PointNet that\nextends JEM for point cloud classification and generation. Our GDPNet retains\nstrong discriminative power of modern PointNet classifiers, while generating\npoint cloud samples rivaling state-of-the-art generative approaches.\n","authors":["Yang Ye","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2404.12925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12922v1","updated":"2024-04-19T14:45:27Z","published":"2024-04-19T14:45:27Z","title":"Is Retain Set All You Need in Machine Unlearning? Restoring Performance\n of Unlearned Models with Out-Of-Distribution Images","summary":" In this paper, we introduce Selective-distillation for Class and\nArchitecture-agnostic unleaRning (SCAR), a novel approximate unlearning method.\nSCAR efficiently eliminates specific information while preserving the model's\ntest accuracy without using a retain set, which is a key component in\nstate-of-the-art approximate unlearning algorithms. Our approach utilizes a\nmodified Mahalanobis distance to guide the unlearning of the feature vectors of\nthe instances to be forgotten, aligning them to the nearest wrong class\ndistribution. Moreover, we propose a distillation-trick mechanism that distills\nthe knowledge of the original model into the unlearning model with\nout-of-distribution images for retaining the original model's test performance\nwithout using any retain set. Importantly, we propose a self-forget version of\nSCAR that unlearns without having access to the forget set. We experimentally\nverified the effectiveness of our method, on three public datasets, comparing\nit with state-of-the-art methods. Our method obtains performance higher than\nmethods that operate without the retain set and comparable w.r.t the best\nmethods that rely on the retain set.\n","authors":["Jacopo Bonato","Marco Cotogni","Luigi Sabetta"],"pdf_url":"https://arxiv.org/pdf/2404.12922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12920v1","updated":"2024-04-19T14:43:48Z","published":"2024-04-19T14:43:48Z","title":"Zero-Shot Medical Phrase Grounding with Off-the-shelf Diffusion Models","summary":" Localizing the exact pathological regions in a given medical scan is an\nimportant imaging problem that requires a large amount of bounding box ground\ntruth annotations to be accurately solved. However, there exist alternative,\npotentially weaker, forms of supervision, such as accompanying free-text\nreports, which are readily available. The task of performing localization with\ntextual guidance is commonly referred to as phrase grounding. In this work, we\nuse a publicly available Foundation Model, namely the Latent Diffusion Model,\nto solve this challenging task. 
This choice is supported by the fact that the\nLatent Diffusion Model, despite being generative in nature, contains mechanisms\n(cross-attention) that implicitly align visual and textual features, thus\nleading to intermediate representations that are suitable for the task at hand.\nIn addition, we aim to perform this task in a zero-shot manner, i.e., without\nany further training on target data, meaning that the model's weights remain\nfrozen. To this end, we devise strategies to select features and also refine\nthem via post-processing without extra learnable parameters. We compare our\nproposed method with state-of-the-art approaches which explicitly enforce\nimage-text alignment in a joint embedding space via contrastive learning.\nResults on a popular chest X-ray benchmark indicate that our method is\ncompetitive with SOTA on different types of pathology, and even outperforms them\non average in terms of two metrics (mean IoU and AUC-ROC). Source code will be\nreleased upon acceptance.\n","authors":["Konstantinos Vilouras","Pedro Sanchez","Alison Q. O'Neil","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2404.12920v1.pdf","comment":"8 pages, 3 figures, submitted to IEEE J-BHI Special Issue on\n Foundation Models in Medical Imaging"},{"id":"http://arxiv.org/abs/2404.12917v1","updated":"2024-04-19T14:42:42Z","published":"2024-04-19T14:42:42Z","title":"Zero-Shot Stitching in Reinforcement Learning using Relative\n Representations","summary":" Visual Reinforcement Learning is a popular and powerful framework that takes\nfull advantage of the Deep Learning breakthrough. However, it is also known\nthat variations in the input (e.g., different colors of the panorama due to the\nseason of the year) or the task (e.g., changing the speed limit for a car to\nrespect) could require complete retraining of the agents. In this work, we\nleverage recent developments in unifying latent representations to demonstrate\nthat it is possible to combine the components of an agent, rather than retrain\nit from scratch. We build upon the recent relative representations framework\nand adapt it for Visual RL. This allows us to create completely new agents\ncapable of handling environment-task combinations never seen during training.\nOur work paves the road toward a more accessible and flexible use of\nreinforcement learning.\n","authors":["Antonio Pio Ricciardi","Valentino Maiorca","Luca Moschella","Riccardo Marin","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2404.12917v1.pdf","comment":"13 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.12871v2","updated":"2024-04-19T14:36:15Z","published":"2023-11-18T01:21:38Z","title":"An Embodied Generalist Agent in 3D World","summary":" Leveraging massive knowledge and learning schemes from large language models\n(LLMs), recent machine learning models show notable successes in building\ngeneralist agents that exhibit the capability of general-purpose task solving\nin diverse domains, including natural language processing, computer vision, and\nrobotics. However, a significant challenge remains as these models exhibit\nlimited ability in understanding and interacting with the 3D world. We argue\nthis limitation significantly hinders the current models from performing\nreal-world tasks and further achieving general intelligence. To this end, we\nintroduce an embodied multi-modal and multi-task generalist agent that excels\nin perceiving, grounding, reasoning, planning, and acting in the 3D world. 
Our\nproposed agent, referred to as LEO, is trained with shared LLM-based model\narchitectures, objectives, and weights in two stages: (i) 3D vision-language\nalignment and (ii) 3D vision-language-action instruction tuning. To facilitate\nthe training, we meticulously curate and generate an extensive dataset\ncomprising object-level and scene-level multi-modal tasks with exceeding scale\nand complexity, necessitating a deep understanding of and interaction with the\n3D world. Through rigorous experiments, we demonstrate LEO's remarkable\nproficiency across a wide spectrum of tasks, including 3D captioning, question\nanswering, embodied reasoning, embodied navigation, and robotic manipulation.\nOur ablation results further provide valuable insights for the development of\nfuture embodied generalist agents.\n","authors":["Jiangyong Huang","Silong Yong","Xiaojian Ma","Xiongkun Linghu","Puhao Li","Yan Wang","Qing Li","Song-Chun Zhu","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2311.12871v2.pdf","comment":"The first four authors contribute equally. Project page:\n https://embodied-generalist.github.io"},{"id":"http://arxiv.org/abs/2404.12908v1","updated":"2024-04-19T14:30:41Z","published":"2024-04-19T14:30:41Z","title":"Robust CLIP-Based Detector for Exposing Diffusion Model-Generated Images","summary":" Diffusion models (DMs) have revolutionized image generation, producing\nhigh-quality images with applications spanning various fields. However, their\nability to create hyper-realistic images poses significant challenges in\ndistinguishing between real and synthetic content, raising concerns about\ndigital authenticity and potential misuse in creating deepfakes. This work\nintroduces a robust detection framework that integrates image and text features\nextracted by CLIP model with a Multilayer Perceptron (MLP) classifier. We\npropose a novel loss that can improve the detector's robustness and handle\nimbalanced datasets. Additionally, we flatten the loss landscape during the\nmodel training to improve the detector's generalization capabilities. The\neffectiveness of our method, which outperforms traditional detection\ntechniques, is demonstrated through extensive experiments, underscoring its\npotential to set a new state-of-the-art approach in DM-generated image\ndetection. The code is available at\nhttps://github.com/Purdue-M2/Robust_DM_Generated_Image_Detection.\n","authors":[" Santosh","Li Lin","Irene Amerini","Xin Wang","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2404.12908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03631v3","updated":"2024-04-19T14:29:02Z","published":"2023-12-06T17:28:03Z","title":"Mitigating Open-Vocabulary Caption Hallucinations","summary":" While recent years have seen rapid progress in image-conditioned text\ngeneration, image captioning still suffers from the fundamental issue of\nhallucinations, namely, the generation of spurious details that cannot be\ninferred from the given image. Existing methods largely use closed-vocabulary\nobject lists to mitigate or evaluate hallucinations in image captioning,\nignoring the long-tailed nature of hallucinations that occur in practice. To\nthis end, we propose a framework for addressing hallucinations in image\ncaptioning in the open-vocabulary setting. 
Our framework includes a new\nbenchmark, OpenCHAIR, that leverages generative foundation models to evaluate\nopen-vocabulary object hallucinations for image captioning, surpassing the\npopular and similarly-sized CHAIR benchmark in both diversity and accuracy.\nFurthermore, to mitigate open-vocabulary hallucinations without using a closed\nobject list, we propose MOCHa, an approach harnessing advancements in\nreinforcement learning. Our multi-objective reward function explicitly targets\nthe trade-off between fidelity and adequacy in generations without requiring\nany strong supervision. MOCHa improves a large variety of image captioning\nmodels, as captured by our OpenCHAIR benchmark and other existing metrics. We\nwill release our code and models.\n","authors":["Assaf Ben-Kish","Moran Yanuka","Morris Alper","Raja Giryes","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2312.03631v3.pdf","comment":"Website Link: https://assafbk.github.io/mocha/"},{"id":"http://arxiv.org/abs/2404.11214v2","updated":"2024-04-19T14:26:06Z","published":"2024-04-17T09:58:53Z","title":"Feature Corrective Transfer Learning: End-to-End Solutions to Object\n Detection in Non-Ideal Visual Conditions","summary":" A significant challenge in the field of object detection lies in the system's\nperformance under non-ideal imaging conditions, such as rain, fog, low\nillumination, or raw Bayer images that lack ISP processing. Our study\nintroduces \"Feature Corrective Transfer Learning\", a novel approach that\nleverages transfer learning and a bespoke loss function to facilitate the\nend-to-end detection of objects in these challenging scenarios without the need\nto convert non-ideal images into their RGB counterparts. In our methodology, we\ninitially train a comprehensive model on a pristine RGB image dataset.\nSubsequently, non-ideal images are processed by comparing their feature maps\nagainst those from the initial ideal RGB model. This comparison employs the\nExtended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function\ndesigned to quantify similarities and integrate them into the detection loss.\nThis approach refines the model's ability to perform object detection across\nvarying conditions through direct feature map correction, encapsulating the\nessence of Feature Corrective Transfer Learning. Experimental validation on\nvariants of the KITTI dataset demonstrates a significant improvement in mean\nAverage Precision (mAP), resulting in a 3.8-8.1% relative enhancement in\ndetection under non-ideal conditions compared to the baseline model, and a less\nmarginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved\nunder ideal conditions by the standard Faster RCNN algorithm.\n","authors":["Chuheng Wei","Guoyuan Wu","Matthew J. Barth"],"pdf_url":"https://arxiv.org/pdf/2404.11214v2.pdf","comment":"2024 CVPR UG2+ Workshop"},{"id":"http://arxiv.org/abs/2312.09780v2","updated":"2024-04-19T14:16:46Z","published":"2023-12-15T13:33:09Z","title":"RANRAC: Robust Neural Scene Representations via Random Ray Consensus","summary":" Learning-based scene representations such as neural radiance fields or light\nfield networks, that rely on fitting a scene model to image observations,\ncommonly encounter challenges in the presence of inconsistencies within the\nimages caused by occlusions, inaccurately estimated camera parameters or\neffects like lens flare. 
To address this challenge, we introduce RANdom RAy\nConsensus (RANRAC), an efficient approach to eliminate the effect of\ninconsistent data, thereby taking inspiration from classical RANSAC based\noutlier detection for model fitting. In contrast to the down-weighting of the\neffect of outliers based on robust loss formulations, our approach reliably\ndetects and excludes inconsistent perspectives, resulting in clean images\nwithout floating artifacts. For this purpose, we formulate a fuzzy adaption of\nthe RANSAC paradigm, enabling its application to large scale models. We\ninterpret the minimal number of samples to determine the model parameters as a\ntunable hyperparameter, investigate the generation of hypotheses with\ndata-driven models, and analyze the validation of hypotheses in noisy\nenvironments. We demonstrate the compatibility and potential of our solution\nfor both photo-realistic robust multi-view reconstruction from real-world\nimages based on neural radiance fields and for single-shot reconstruction based\non light-field networks. In particular, the results indicate significant\nimprovements compared to state-of-the-art robust methods for novel-view\nsynthesis on both synthetic and captured scenes with various inconsistencies\nincluding occlusions, noisy camera pose estimates, and unfocused perspectives.\nThe results further indicate significant improvements for single-shot\nreconstruction from occluded images. Project Page:\nhttps://bennobuschmann.com/ranrac/\n","authors":["Benno Buschmann","Andreea Dogaru","Elmar Eisemann","Michael Weinmann","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2312.09780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00639v3","updated":"2024-04-19T14:14:59Z","published":"2023-12-01T14:59:43Z","title":"RefinedFields: Radiance Fields Refinement for Unconstrained Scenes","summary":" Modeling large scenes from unconstrained images has proven to be a major\nchallenge in computer vision. Existing methods tackling in-the-wild scene\nmodeling operate in closed-world settings, where no conditioning on priors\nacquired from real-world images is present. We propose RefinedFields, which is,\nto the best of our knowledge, the first method leveraging pre-trained models to\nimprove in-the-wild scene modeling. We employ pre-trained networks to refine\nK-Planes representations via optimization guidance using an alternating\ntraining procedure. We carry out extensive experiments and verify the merit of\nour method on synthetic data and real tourism photo collections. RefinedFields\nenhances rendered scenes with richer details and improves upon its base\nrepresentation on the task of novel view synthesis in the wild. Our project\npage can be found at https://refinedfields.github.io.\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v3.pdf","comment":"Corrected Table 2, where some comparisons were done among models\n trained at different resolutions"},{"id":"http://arxiv.org/abs/2404.12900v1","updated":"2024-04-19T14:13:46Z","published":"2024-04-19T14:13:46Z","title":"Training-and-prompt-free General Painterly Harmonization Using\n Image-wise Attention Sharing","summary":" Painterly Image Harmonization aims at seamlessly blending disparate visual\nelements within a single coherent image. 
However, previous approaches often\nencounter significant limitations due to training data constraints, the need\nfor time-consuming fine-tuning, or reliance on additional prompts. To surmount\nthese hurdles, we design a Training-and-prompt-Free General Painterly\nHarmonization method using image-wise attention sharing (TF-GPH), which\nintegrates a novel \"share-attention module\". This module redefines the\ntraditional self-attention mechanism by allowing for comprehensive image-wise\nattention, facilitating the use of a state-of-the-art pretrained latent\ndiffusion model without the typical training data limitations. Additionally, we\nfurther introduce a \"similarity reweighting\" mechanism that enhances performance by\neffectively harnessing cross-image information, surpassing the capabilities of\nfine-tuning or prompt-based approaches. At last, we recognize the deficiencies\nin existing benchmarks and propose the \"General Painterly Harmonization\nBenchmark\", which employs range-based evaluation metrics to more accurately\nreflect real-world application. Extensive experiments demonstrate the superior\nefficacy of our method across various benchmarks. The code and web demo are\navailable at https://github.com/BlueDyee/TF-GPH.\n","authors":["Teng-Fang Hsiao","Bo-Kai Ruan","Hong-Han Shuai"],"pdf_url":"https://arxiv.org/pdf/2404.12900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07976v5","updated":"2024-04-19T14:13:26Z","published":"2022-03-15T15:05:40Z","title":"On the Pitfalls of Batch Normalization for End-to-End Video Learning: A\n Study on Surgical Workflow Analysis","summary":" Batch Normalization's (BN) unique property of depending on other samples in a\nbatch is known to cause problems in several tasks, including sequence modeling.\nYet, BN-related issues are hardly studied for long video understanding, despite\nthe ubiquitous use of BN in CNNs (Convolutional Neural Networks) for feature\nextraction. Especially in surgical workflow analysis, where the lack of\npretrained feature extractors has led to complex, multi-stage training\npipelines, limited awareness of BN issues may have hidden the benefits of\ntraining CNNs and temporal models end to end. In this paper, we analyze\npitfalls of BN in video learning, including issues specific to online tasks\nsuch as a 'cheating' effect in anticipation. We observe that BN's properties\ncreate major obstacles for end-to-end learning. However, using BN-free\nbackbones, even simple CNN-LSTMs beat the state of the art\non three surgical workflow benchmarks by utilizing\nadequate end-to-end training strategies which maximize temporal context. We\nconclude that awareness of BN's pitfalls is crucial for effective end-to-end\nlearning in surgical tasks. By reproducing results on natural-video datasets,\nwe hope our insights will benefit other areas of video learning as well. Code\nis available at: \url{https://gitlab.com/nct_tso_public/pitfalls_bn}\n","authors":["Dominik Rivoir","Isabel Funke","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2203.07976v5.pdf","comment":"Accepted at Medical Image Analysis (MedIA). 
Publication link:\n https://www.sciencedirect.com/science/article/pii/S1361841524000513"},{"id":"http://arxiv.org/abs/2211.07440v4","updated":"2024-04-19T14:05:03Z","published":"2022-11-14T15:14:50Z","title":"Leveraging Automatic Personalised Nutrition: Food Image Recognition\n Benchmark and Dataset based on Nutrition Taxonomy","summary":" Maintaining a healthy lifestyle has become increasingly challenging in\ntoday's sedentary society marked by poor eating habits. To address this issue,\nboth national and international organisations have made numerous efforts to\npromote healthier diets and increased physical activity. However, implementing\nthese recommendations in daily life can be difficult, as they are often generic\nand not tailored to individuals. This study presents the AI4Food-NutritionDB\ndatabase, the first nutrition database that incorporates food images and a\nnutrition taxonomy based on recommendations by national and international\nhealth authorities. The database offers a multi-level categorisation,\ncomprising 6 nutritional levels, 19 main categories (e.g., \"Meat\"), 73\nsubcategories (e.g., \"White Meat\"), and 893 specific food products (e.g.,\n\"Chicken\"). The AI4Food-NutritionDB opens the doors to new food computing\napproaches in terms of food intake frequency, quality, and categorisation.\nAlso, we present a standardised experimental protocol and benchmark including\nthree tasks based on the nutrition taxonomy (i.e., category, subcategory, and\nfinal product recognition). These resources are available to the research\ncommunity, including our deep learning models trained on AI4Food-NutritionDB,\nwhich can serve as pre-trained models, achieving accurate recognition results\nfor challenging food image databases.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Julian Fierrez","Ruben Vera-Rodriguez","Isabel Espinosa-Salinas","Gala Freixer","Enrique Carrillo de Santa Pau","Ana Ramírez de Molina","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2211.07440v4.pdf","comment":"12 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.02044v3","updated":"2024-04-19T14:02:26Z","published":"2024-01-04T03:09:39Z","title":"Multi-modal vision-language model for generalizable annotation-free\n pathological lesions localization and clinical diagnosis","summary":" Defining pathologies automatically from medical images aids the understanding\nof the emergence and progression of diseases, and such an ability is crucial in\nclinical diagnostics. However, existing deep learning models heavily rely on\nexpert annotations and lack generalization capabilities in open clinical\nenvironments. In this study, we present a generalizable vision-language\npre-training model for Annotation-Free pathological lesions Localization\n(AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic\nstructure-based contrastive learning, which comprehensively aligns\nmulti-granularity medical concepts from reports with abundant image features,\nto adapt to the diverse expressions of pathologies and unseen pathologies\nwithout the reliance on image annotations from experts. We demonstrate the\nproof of concept on CXR images, with extensive experimental validation across 4\ndistinct external datasets, encompassing 11 types of chest pathologies. 
The\nresults demonstrate that AFLoc surpasses state-of-the-art methods in\npathological lesions localization and disease classification, and even\noutperforms the human benchmark in locating 5 different pathologies.\nAdditionally, we further verify its generalization ability by applying it to\nretinal fundus images. Our approach showcases AFoc versatilities and\nunderscores its suitability for clinical diagnoses in complex clinical\nenvironments.\n","authors":["Hao Yang","Hong-Yu Zhou","Zhihuan Li","Yuanxu Gao","Cheng Li","Weijian Huang","Jiarun Liu","Hairong Zheng","Kang Zhang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.02044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12888v1","updated":"2024-04-19T13:45:14Z","published":"2024-04-19T13:45:14Z","title":"Learn2Talk: 3D Talking Face Learns from 2D Talking Face","summary":" Speech-driven facial animation methods usually contain two main classes, 3D\nand 2D talking face, both of which attract considerable research attention in\nrecent years. However, to the best of our knowledge, the research on 3D talking\nface does not go deeper as 2D talking face, in the aspect of\nlip-synchronization (lip-sync) and speech perception. To mind the gap between\nthe two sub-fields, we propose a learning framework named Learn2Talk, which can\nconstruct a better 3D talking face network by exploiting two expertise points\nfrom the field of 2D talking face. Firstly, inspired by the audio-video sync\nnetwork, a 3D sync-lip expert model is devised for the pursuit of lip-sync\nbetween audio and 3D facial motion. Secondly, a teacher model selected from 2D\ntalking face methods is used to guide the training of the audio-to-3D motions\nregression network to yield more 3D vertex accuracy. Extensive experiments show\nthe advantages of the proposed framework in terms of lip-sync, vertex accuracy\nand speech perception, compared with state-of-the-arts. Finally, we show two\napplications of the proposed framework: audio-visual speech recognition and\nspeech-driven 3D Gaussian Splatting based avatar animation.\n","authors":["Yixiang Zhuang","Baoping Cheng","Yao Cheng","Yuntao Jin","Renshuai Liu","Chengyang Li","Xuan Cheng","Jing Liao","Juncong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.12888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12887v1","updated":"2024-04-19T13:43:14Z","published":"2024-04-19T13:43:14Z","title":"3D Multi-frame Fusion for Video Stabilization","summary":" In this paper, we present RStab, a novel framework for video stabilization\nthat integrates 3D multi-frame fusion through volume rendering. Departing from\nconventional methods, we introduce a 3D multi-frame perspective to generate\nstabilized images, addressing the challenge of full-frame generation while\npreserving structure. The core of our approach lies in Stabilized Rendering\n(SR), a volume rendering module, which extends beyond the image fusion by\nincorporating feature fusion. The core of our RStab framework lies in\nStabilized Rendering (SR), a volume rendering module, fusing multi-frame\ninformation in 3D space. Specifically, SR involves warping features and colors\nfrom multiple frames by projection, fusing them into descriptors to render the\nstabilized image. However, the precision of warped information depends on the\nprojection accuracy, a factor significantly influenced by dynamic regions. 
In\nresponse, we introduce the Adaptive Ray Range (ARR) module to integrate depth\npriors, adaptively defining the sampling range for the projection process.\nAdditionally, we propose Color Correction (CC) assisting geometric constraints\nwith optical flow for accurate color aggregation. Thanks to the three modules,\nour RStab demonstrates superior performance compared with previous stabilizers\nin the field of view (FOV), image quality, and video stability across various\ndatasets.\n","authors":["Zhan Peng","Xinyi Ye","Weiyue Zhao","Tianqi Liu","Huiqiang Sun","Baopu Li","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2404.12887v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.12886v1","updated":"2024-04-19T13:40:25Z","published":"2024-04-19T13:40:25Z","title":"MCM: Multi-condition Motion Synthesis Framework","summary":" Conditional human motion synthesis (HMS) aims to generate human motion\nsequences that conform to specific conditions. Text and audio represent the two\npredominant modalities employed as HMS control conditions. While existing\nresearch has primarily focused on single conditions, the multi-condition human\nmotion synthesis remains underexplored. In this study, we propose a\nmulti-condition HMS framework, termed MCM, based on a dual-branch structure\ncomposed of a main branch and a control branch. This framework effectively\nextends the applicability of the diffusion model, which is initially predicated\nsolely on textual conditions, to auditory conditions. This extension\nencompasses both music-to-dance and co-speech HMS while preserving the\nintrinsic quality of motion and the capabilities for semantic association\ninherent in the original model. Furthermore, we propose the implementation of a\nTransformer-based diffusion model, designated as MWNet, as the main branch.\nThis model adeptly apprehends the spatial intricacies and inter-joint\ncorrelations inherent in motion sequences, facilitated by the integration of\nmulti-wise self-attention modules. Extensive experiments show that our method\nachieves competitive results in single-condition and multi-condition HMS tasks.\n","authors":["Zeyu Ling","Bo Han","Yongkang Wongkan","Han Lin","Mohan Kankanhalli","Weidong Geng"],"pdf_url":"https://arxiv.org/pdf/2404.12886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18453v5","updated":"2024-04-19T13:37:18Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis","summary":" Artificial intelligence (AI) in healthcare, especially in medical imaging,\nfaces challenges due to data scarcity and privacy concerns. Addressing these,\nwe introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI\nsynthesis. This model effectively tackles data scarcity and privacy issues by\nintegrating semantic conditioning. This involves the channel-wise concatenation\nof a conditioning image to the model input, enabling control in image\ngeneration. Med-DDPM demonstrates superior stability and performance compared\nto existing 3D brain imaging synthesis methods. It generates diverse,\nanatomically coherent images with high visual fidelity. In terms of dice score\naccuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the\n0.6531 accuracy of real images, and outperforms baseline models. Combined with\nreal images, it further increases segmentation accuracy to 0.6675, showing the\npotential of our proposed method for data augmentation. 
This model represents\nthe first use of a diffusion model in 3D semantic brain MRI synthesis,\nproducing high-quality images. Its semantic conditioning feature also shows\npotential for image anonymization in biomedical imaging, addressing data and\nprivacy issues. We provide the code and model weights for Med-DDPM on our\nGitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support\nreproducibility.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v5.pdf","comment":"This document is a preprint and has been accepted for publication in\n the IEEE Journal of Biomedical and Health Informatics. The final, published\n version can be accessed using the following DOI: 10.1109/JBHI.2024.3385504.\n Copyright for this article has been transferred to IEEE"},{"id":"http://arxiv.org/abs/2211.11424v2","updated":"2024-04-19T13:31:47Z","published":"2022-11-21T13:10:19Z","title":"Modeling Hierarchical Structural Distance for Unsupervised Domain\n Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to estimate a transferable model\nfor unlabeled target domains by exploiting labeled source data. Optimal\nTransport (OT) based methods have recently been proven to be a promising\nsolution for UDA with a solid theoretical foundation and competitive\nperformance. However, most of these methods solely focus on domain-level OT\nalignment by leveraging the geometry of domains for domain-invariant features\nbased on the global embeddings of images. However, global representations of\nimages may destroy image structure, leading to the loss of local details that\noffer category-discriminative information. This study proposes an end-to-end\nDeep Hierarchical Optimal Transport method (DeepHOT), which aims to learn both\ndomain-invariant and category-discriminative representations by mining\nhierarchical structural relations among domains. The main idea is to\nincorporate a domain-level OT and image-level OT into a unified OT framework,\nhierarchical optimal transport, to model the underlying geometry in both domain\nspace and image space. In DeepHOT framework, an image-level OT serves as the\nground distance metric for the domain-level OT, leading to the hierarchical\nstructural distance. Compared with the ground distance of the conventional\ndomain-level OT, the image-level OT captures structural associations among\nlocal regions of images that are beneficial to classification. In this way,\nDeepHOT, a unified OT framework, not only aligns domains by domain-level OT,\nbut also enhances the discriminative power through image-level OT. Moreover, to\novercome the limitation of high computational complexity, we propose a robust\nand efficient implementation of DeepHOT by approximating origin OT with sliced\nWasserstein distance in image-level OT and accomplishing the mini-batch\nunbalanced domain-level OT.\n","authors":["Yingxue Xu","Guihua Wen","Yang Hu","Pei Yang"],"pdf_url":"https://arxiv.org/pdf/2211.11424v2.pdf","comment":"accepted by TCVST, code: https://github.com/Innse/DeepHOT"},{"id":"http://arxiv.org/abs/2404.12876v1","updated":"2024-04-19T13:25:27Z","published":"2024-04-19T13:25:27Z","title":"A Large-scale Medical Visual Task Adaptation Benchmark","summary":" Visual task adaptation has been demonstrated to be effective in adapting\npre-trained Vision Transformers (ViTs) to general downstream visual tasks using\nspecialized learnable layers or tokens. 
However, there is yet a large-scale\nbenchmark to fully explore the effect of visual task adaptation on the\nrealistic and important medical domain, particularly across diverse medical\nvisual modalities, such as color images, X-ray, and CT. To close this gap, we\npresent Med-VTAB, a large-scale Medical Visual Task Adaptation Benchmark\nconsisting of 1.68 million medical images for diverse organs, modalities, and\nadaptation approaches. Based on Med-VTAB, we explore the scaling law of medical\nprompt tuning concerning tunable parameters and the generalizability of medical\nvisual adaptation using non-medical/medical pre-train weights. Besides, we\nstudy the impact of patient ID out-of-distribution on medical visual\nadaptation, which is a real and challenging scenario. Furthermore, results from\nMed-VTAB indicate that a single pre-trained model falls short in medical task\nadaptation. Therefore, we introduce GMoE-Adapter, a novel method that combines\nmedical and general pre-training weights through a gated mixture-of-experts\nadapter, achieving state-of-the-art results in medical visual task adaptation.\n","authors":["Shentong Mo","Xufang Luo","Yansen Wang","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.12876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06860v2","updated":"2024-04-19T13:18:46Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection is essential in autonomous driving as it extracts\nstructural and traffic information from the road in three-dimensional space,\naiding self-driving cars in logical, safe, and comfortable path planning and\nmotion control. Given the cost of sensors and the advantages of visual data in\ncolor information, 3D lane detection based on monocular vision is an important\nresearch direction in the realm of autonomous driving, increasingly gaining\nattention in both industry and academia. Regrettably, recent advancements in\nvisual perception seem inadequate for the development of fully reliable 3D lane\ndetection algorithms, which also hampers the progress of vision-based fully\nautonomous vehicles. We believe that there is still considerable room for\nimprovement in 3D lane detection algorithms for autonomous vehicles using\nvisual sensors, and significant enhancements are needed. This review looks back\nand analyzes the current state of achievements in the field of 3D lane\ndetection research. It covers all current monocular-based 3D lane detection\nprocesses, discusses the performance of these cutting-edge algorithms, analyzes\nthe time complexity of various algorithms, and highlights the main achievements\nand limitations of ongoing research efforts. The survey also includes a\ncomprehensive discussion of available 3D lane detection datasets and the\nchallenges that researchers face but have not yet resolved. 
Finally, our work\noutlines future research directions and invites researchers and practitioners\nto join this exciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Yuxuan Liu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12867v1","updated":"2024-04-19T13:08:43Z","published":"2024-04-19T13:08:43Z","title":"FipTR: A Simple yet Effective Transformer Framework for Future Instance\n Prediction in Autonomous Driving","summary":" The future instance prediction from a Bird's Eye View(BEV) perspective is a\nvital component in autonomous driving, which involves future instance\nsegmentation and instance motion prediction. Existing methods usually rely on a\nredundant and complex pipeline which requires multiple auxiliary outputs and\npost-processing procedures. Moreover, estimated errors on each of the auxiliary\npredictions will lead to degradation of the prediction performance. In this\npaper, we propose a simple yet effective fully end-to-end framework named\nFuture Instance Prediction Transformer(FipTR), which views the task as BEV\ninstance segmentation and prediction for future frames. We propose to adopt\ninstance queries representing specific traffic participants to directly\nestimate the corresponding future occupied masks, and thus get rid of complex\npost-processing procedures. Besides, we devise a flow-aware BEV predictor for\nfuture BEV feature prediction composed of a flow-aware deformable attention\nthat takes backward flow guiding the offset sampling. A novel future instance\nmatching strategy is also proposed to further improve the temporal coherence.\nExtensive experiments demonstrate the superiority of FipTR and its\neffectiveness under different temporal BEV encoders.\n","authors":["Xingtai Gui","Tengteng Huang","Haonan Shao","Haotian Yao","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12866v1","updated":"2024-04-19T13:05:37Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. 
This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12861v1","updated":"2024-04-19T13:01:30Z","published":"2024-04-19T13:01:30Z","title":"Foundation Model assisted Weakly Supervised LiDAR Semantic Segmentation","summary":" Current point cloud semantic segmentation has achieved great advances when\ngiven sufficient labels. However, the dense annotation of LiDAR point clouds\nremains prohibitively expensive and time-consuming, unable to keep up with the\ncontinuously growing volume of data. In this paper, we propose annotating\nimages with scattered points, followed by utilizing SAM (a Foundation model) to\ngenerate semantic segmentation labels for the images. Finally, by mapping the\nsegmentation labels of the images to the LiDAR space using the intrinsic and\nextrinsic parameters of the camera and LiDAR, we obtain labels for point cloud\nsemantic segmentation, and release Scatter-KITTI and Scatter-nuScenes, which\nare the first works to utilize image segmentation-based SAM for weakly\nsupervised point cloud semantic segmentation. Furthermore, to mitigate the\ninfluence of erroneous pseudo labels obtained from sparse annotations on point\ncloud features, we propose a multi-modal weakly supervised network for LiDAR\nsemantic segmentation, called MM-ScatterNet. This network combines features\nfrom both point cloud and image modalities, enhancing the representation\nlearning of point clouds by introducing consistency constraints between\nmulti-modal features and point cloud features. On the SemanticKITTI dataset, we\nachieve 66\\% of fully supervised performance using only 0.02% of annotated\ndata, and on the NuScenes dataset, we achieve 95% of fully supervised\nperformance using only 0.1% labeled points.\n","authors":["Yilong Chen","Zongyi Xu","xiaoshui Huang","Ruicheng Zhang","Xinqi Jiang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.12861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12856v1","updated":"2024-04-19T12:50:43Z","published":"2024-04-19T12:50:43Z","title":"Language-Driven Active Learning for Diverse Open-Set 3D Object Detection","summary":" Object detection is crucial for ensuring safe autonomous driving. However,\ndata-driven approaches face challenges when encountering minority or novel\nobjects in the 3D driving scene. In this paper, we propose VisLED, a\nlanguage-driven active learning framework for diverse open-set 3D Object\nDetection. Our method leverages active learning techniques to query diverse and\ninformative data samples from an unlabeled pool, enhancing the model's ability\nto detect underrepresented or novel objects. Specifically, we introduce the\nVision-Language Embedding Diversity Querying (VisLED-Querying) algorithm, which\noperates in both open-world exploring and closed-world mining settings. In\nopen-world exploring, VisLED-Querying selects data points most novel relative\nto existing data, while in closed-world mining, it mines new instances of known\nclasses. We evaluate our approach on the nuScenes dataset and demonstrate its\neffectiveness compared to random sampling and entropy-querying methods. 
Our\nresults show that VisLED-Querying consistently outperforms random sampling and\noffers competitive performance compared to entropy-querying despite the\nlatter's model-optimality, highlighting the potential of VisLED for improving\nobject detection in autonomous driving scenarios.\n","authors":["Ross Greer","Bjørk Antoniussen","Andreas Møgelmose","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2404.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11669v2","updated":"2024-04-19T12:46:03Z","published":"2024-04-17T18:08:00Z","title":"Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis","summary":" Designing a 3D representation of a dynamic scene for fast optimization and\nrendering is a challenging task. While recent explicit representations enable\nfast learning and rendering of dynamic radiance fields, they require a dense\nset of input viewpoints. In this work, we focus on learning a fast\nrepresentation for dynamic radiance fields with sparse input viewpoints.\nHowever, the optimization with sparse input is under-constrained and\nnecessitates the use of motion priors to constrain the learning. Existing fast\ndynamic scene models do not explicitly model the motion, making them difficult\nto be constrained with motion priors. We design an explicit motion model as a\nfactorized 4D representation that is fast and can exploit the spatio-temporal\ncorrelation of the motion field. We then introduce reliable flow priors\nincluding a combination of sparse flow priors across cameras and dense flow\npriors within cameras to regularize our motion model. Our model is fast,\ncompact and achieves very good performance on popular multi-view dynamic scene\ndatasets with sparse input viewpoints. The source code for our model can be\nfound on our project page:\nhttps://nagabhushansn95.github.io/publications/2024/RF-DeRF.html.\n","authors":["Nagabhushan Somraj","Kapil Choudhary","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.11669v2.pdf","comment":"Accepted at SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.12852v1","updated":"2024-04-19T12:42:31Z","published":"2024-04-19T12:42:31Z","title":"LSP Framework: A Compensatory Model for Defeating Trigger Reverse\n Engineering via Label Smoothing Poisoning","summary":" Deep neural networks are vulnerable to backdoor attacks. Among the existing\nbackdoor defense methods, trigger reverse engineering based approaches, which\nreconstruct the backdoor triggers via optimizations, are the most versatile and\neffective ones compared to other types of methods. In this paper, we summarize\nand construct a generic paradigm for the typical trigger reverse engineering\nprocess. Based on this paradigm, we propose a new perspective to defeat trigger\nreverse engineering by manipulating the classification confidence of backdoor\nsamples. To determine the specific modifications of classification confidence,\nwe propose a compensatory model to compute the lower bound of the modification.\nWith proper modifications, the backdoor attack can easily bypass the trigger\nreverse engineering based methods. 
To achieve this objective, we propose a\nLabel Smoothing Poisoning (LSP) framework, which leverages label smoothing to\nspecifically manipulate the classification confidences of backdoor samples.\nExtensive experiments demonstrate that the proposed work can defeat the\nstate-of-the-art trigger reverse engineering based methods, and possess good\ncompatibility with a variety of existing backdoor attacks.\n","authors":["Beichen Li","Yuanfang Guo","Heqi Peng","Yangxi Li","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12462v4","updated":"2024-04-19T12:39:09Z","published":"2023-08-23T22:55:45Z","title":"Overcoming Generic Knowledge Loss with Selective Parameter Update","summary":" Foundation models encompass an extensive knowledge base and offer remarkable\ntransferability. However, this knowledge becomes outdated or insufficient over\ntime. The challenge lies in continuously updating foundation models to\naccommodate novel information while retaining their original capabilities.\nLeveraging the fact that foundation models have initial knowledge on various\ntasks and domains, we propose a novel approach that, instead of updating all\nparameters equally, localizes the updates to a sparse set of parameters\nrelevant to the task being learned. We strike a balance between efficiency and\nnew task performance, while maintaining the transferability and\ngeneralizability of foundation models. We extensively evaluate our method on\nfoundational vision-language models with a diverse spectrum of continual\nlearning tasks. Our method achieves improvements on the accuracy of the newly\nlearned tasks up to 7% while preserving the pretraining knowledge with a\nnegligible decrease of 0.9% on a representative control set accuracy.\n","authors":["Wenxuan Zhang","Paul Janson","Rahaf Aljundi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.12462v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04346v2","updated":"2024-04-19T12:30:07Z","published":"2024-04-05T18:33:04Z","title":"Koala: Key frame-conditioned long video-LLM","summary":" Long video question answering is a challenging task that involves recognizing\nshort-term activities and reasoning about their fine-grained relationships.\nState-of-the-art video Large Language Models (vLLMs) hold promise as a viable\nsolution due to their demonstrated emergent capabilities on new tasks. However,\ndespite being trained on millions of short seconds-long videos, vLLMs are\nunable to understand minutes-long videos and accurately answer questions about\nthem. To address this limitation, we propose a lightweight and self-supervised\napproach, Key frame-conditioned long video-LLM (Koala), that introduces\nlearnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to\nlonger videos. Our approach introduces two new tokenizers that condition on\nvisual tokens computed from sparse video key frames for understanding short and\nlong video moments. We train our proposed approach on HowTo100M and demonstrate\nits effectiveness on zero-shot long video understanding benchmarks, where it\noutperforms state-of-the-art large models by 3 - 6% in absolute accuracy across\nall tasks. Surprisingly, we also empirically show that our approach not only\nhelps a pretrained vLLM to understand long videos but also improves its\naccuracy on short-term action recognition.\n","authors":["Reuben Tan","Ximeng Sun","Ping Hu","Jui-hsien Wang","Hanieh Deilamsalehy","Bryan A. 
Plummer","Bryan Russell","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.04346v2.pdf","comment":"Accepted at CVPR 2024 as a poster highlight"},{"id":"http://arxiv.org/abs/2306.08386v2","updated":"2024-04-19T12:29:50Z","published":"2023-06-14T09:21:48Z","title":"Efficient Backdoor Attacks for Deep Neural Networks in Real-world\n Scenarios","summary":" Recent deep neural networks (DNNs) have came to rely on vast amounts of\ntraining data, providing an opportunity for malicious attackers to exploit and\ncontaminate the data to carry out backdoor attacks. However, existing backdoor\nattack methods make unrealistic assumptions, assuming that all training data\ncomes from a single source and that attackers have full access to the training\ndata. In this paper, we introduce a more realistic attack scenario where\nvictims collect data from multiple sources, and attackers cannot access the\ncomplete training data. We refer to this scenario as data-constrained backdoor\nattacks. In such cases, previous attack methods suffer from severe efficiency\ndegradation due to the entanglement between benign and poisoning features\nduring the backdoor injection process. To tackle this problem, we introduce\nthree CLIP-based technologies from two distinct streams: Clean Feature\nSuppression and Poisoning Feature Augmentation.effective solution for\ndata-constrained backdoor attacks. The results demonstrate remarkable\nimprovements, with some settings achieving over 100% improvement compared to\nexisting attacks in data-constrained scenarios. Code is available at\nhttps://github.com/sunh1113/Efficient-backdoor-attacks-for-deep-neural-networks-in-real-world-scenarios\n","authors":["Ziqiang Li","Hong Sun","Pengfei Xia","Heng Li","Beihao Xia","Yi Wu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2306.08386v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.12841v1","updated":"2024-04-19T12:21:27Z","published":"2024-04-19T12:21:27Z","title":"Explainable Deepfake Video Detection using Convolutional Neural Network\n and CapsuleNet","summary":" Deepfake technology, derived from deep learning, seamlessly inserts\nindividuals into digital media, irrespective of their actual participation. Its\nfoundation lies in machine learning and Artificial Intelligence (AI).\nInitially, deepfakes served research, industry, and entertainment. While the\nconcept has existed for decades, recent advancements render deepfakes nearly\nindistinguishable from reality. Accessibility has soared, empowering even\nnovices to create convincing deepfakes. However, this accessibility raises\nsecurity concerns.The primary deepfake creation algorithm, GAN (Generative\nAdversarial Network), employs machine learning to craft realistic images or\nvideos. Our objective is to utilize CNN (Convolutional Neural Network) and\nCapsuleNet with LSTM to differentiate between deepfake-generated frames and\noriginals. Furthermore, we aim to elucidate our model's decision-making process\nthrough Explainable AI, fostering transparent human-AI relationships and\noffering practical examples for real-life scenarios.\n","authors":["Gazi Hasin Ishrak","Zalish Mahmud","MD. 
Zami Al Zunaed Farabe","Tahera Khanom Tinni","Tanzim Reza","Mohammad Zavid Parvez"],"pdf_url":"https://arxiv.org/pdf/2404.12841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12839v1","updated":"2024-04-19T12:20:49Z","published":"2024-04-19T12:20:49Z","title":"ECOR: Explainable CLIP for Object Recognition","summary":" Large Vision Language Models (VLMs), such as CLIP, have significantly\ncontributed to various computer vision tasks, including object recognition and\nobject detection. Their open vocabulary feature enhances their value. However,\ntheir black-box nature and lack of explainability in predictions make them less\ntrustworthy in critical domains. Recently, some work has been done to force\nVLMs to provide reasonable rationales for object recognition, but this often\ncomes at the expense of classification accuracy. In this paper, we first\npropose a mathematical definition of explainability in the object recognition\ntask based on the joint probability distribution of categories and rationales,\nthen leverage this definition to fine-tune CLIP in an explainable manner.\nThrough evaluations of different datasets, our method demonstrates\nstate-of-the-art performance in explainable classification. Notably, it excels\nin zero-shot settings, showcasing its adaptability. This advancement improves\nexplainable object recognition, enhancing trust across diverse applications.\nThe code will be made available online upon publication.\n","authors":["Ali Rasekh","Sepehr Kazemi Ranjbar","Milad Heidari","Wolfgang Nejdl"],"pdf_url":"https://arxiv.org/pdf/2404.12839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12832v1","updated":"2024-04-19T12:09:49Z","published":"2024-04-19T12:09:49Z","title":"COIN: Counterfactual inpainting for weakly supervised semantic\n segmentation for medical images","summary":" Deep learning is dramatically transforming the field of medical imaging and\nradiology, enabling the identification of pathologies in medical images,\nincluding computed tomography (CT) and X-ray scans. However, the performance of\ndeep learning models, particularly in segmentation tasks, is often limited by\nthe need for extensive annotated datasets. To address this challenge, the\ncapabilities of weakly supervised semantic segmentation are explored through\nthe lens of Explainable AI and the generation of counterfactual explanations.\nThe scope of this research is development of a novel counterfactual inpainting\napproach (COIN) that flips the predicted classification label from abnormal to\nnormal by using a generative model. For instance, if the classifier deems an\ninput medical image X as abnormal, indicating the presence of a pathology, the\ngenerative model aims to inpaint the abnormal region, thus reversing the\nclassifier's original prediction label. The approach enables us to produce\nprecise segmentations for pathologies without depending on pre-existing\nsegmentation masks. Crucially, image-level labels are utilized, which are\nsubstantially easier to acquire than creating detailed segmentation masks. The\neffectiveness of the method is demonstrated by segmenting synthetic targets and\nactual kidney tumors from CT images acquired from Tartu University Hospital in\nEstonia. The findings indicate that COIN greatly surpasses established\nattribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an\nalternative counterfactual explanation method introduced by Singla et al. 
This\nevidence suggests that COIN is a promising approach for semantic segmentation\nof tumors in CT images, and presents a step forward in making deep learning\napplications more accessible and effective in healthcare, where annotated data\nis scarce.\n","authors":["Dmytro Shvetsov","Joonas Ariva","Marharyta Domnich","Raul Vicente","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2404.12832v1.pdf","comment":"This work has been accepted to be presented to The 2nd World\n Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19,\n 2024 - Valletta, Malta"},{"id":"http://arxiv.org/abs/2404.12819v1","updated":"2024-04-19T11:56:29Z","published":"2024-04-19T11:56:29Z","title":"Unveiling the Ambiguity in Neural Inverse Rendering: A Parameter\n Compensation Analysis","summary":" Inverse rendering aims to reconstruct the scene properties of objects solely\nfrom multiview images. However, it is an ill-posed problem prone to producing\nambiguous estimations deviating from physically accurate representations. In\nthis paper, we utilize Neural Microfacet Fields (NMF), a state-of-the-art\nneural inverse rendering method to illustrate the inherent ambiguity. We\npropose an evaluation framework to assess the degree of compensation or\ninteraction between the estimated scene properties, aiming to explore the\nmechanisms behind this ill-posed problem and potential mitigation strategies.\nSpecifically, we introduce artificial perturbations to one scene property and\nexamine how adjusting another property can compensate for these perturbations.\nTo facilitate such experiments, we introduce a disentangled NMF where material\nproperties are independent. The experimental findings underscore the intrinsic\nambiguity present in neural inverse rendering and highlight the importance of\nproviding additional guidance through geometry, material, and illumination\npriors.\n","authors":["Georgios Kouros","Minye Wu","Sushruth Nagesh","Xianling Zhang","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2404.12819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12814v1","updated":"2024-04-19T11:49:01Z","published":"2024-04-19T11:49:01Z","title":"Generative Modelling with High-Order Langevin Dynamics","summary":" Diffusion generative modelling (DGM) based on stochastic\n differential equations (SDEs) with\n score matching has achieved unprecedented results in data\n generation.\n In this paper, we propose a novel fast high-quality\n generative modelling method\n based on high-order\n Langevin dynamics (HOLD) with score matching.\n This motive is proved by third-order\n Langevin dynamics. 
By augmenting the\n previous SDEs, e.g.\n variance exploding or variance preserving SDEs\n for single-data variable processes, HOLD can simultaneously\n model position, velocity, and\n acceleration, thereby improving the quality\n and speed of the data\n generation at the same time.\n HOLD is composed of one Ornstein-Uhlenbeck process\n and two Hamiltonians,\n which reduce the mixing time by two orders of magnitude.\n Empirical experiments for unconditional image generation on the\n public data set CIFAR-10 and CelebA-HQ show that the effect is significant in\n both Frechet inception distance (FID) and negative log-likelihood,\n and achieves the\n state-of-the-art FID of 1.85 on CIFAR-10.\n","authors":["Ziqiang Shi","Rujie Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12814v1.pdf","comment":"Some of the results in this paper have been published or accepted at\n conferences such as wacv2024, icassp2024, and icme2024"},{"id":"http://arxiv.org/abs/2404.11981v2","updated":"2024-04-19T11:43:39Z","published":"2024-04-18T08:23:24Z","title":"Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental\n Semantic Segmentation","summary":" Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a\npre-trained segmentation model to segment new classes using cost-effective and\nreadily available image-level labels. A prevailing way to solve WILSS is the\ngeneration of seed areas for each new class, serving as a form of pixel-level\nsupervision. However, a scenario usually arises where a pixel is concurrently\npredicted as an old class by the pre-trained segmentation model and a new class\nby the seed areas. Such a scenario becomes particularly problematic in WILSS,\nas the lack of pixel-level annotations on new classes makes it intractable to\nascertain whether the pixel pertains to the new class or not. To surmount this\nissue, we propose an innovative, tendency-driven relationship of mutual\nexclusivity, meticulously tailored to govern the behavior of the seed areas and\nthe predictions generated by the pre-trained segmentation model. This\nrelationship stipulates that predictions for the new and old classes must not\nconflict whilst prioritizing the preservation of predictions for the old\nclasses, which not only addresses the conflicting prediction issue but also\neffectively mitigates the inherent challenge of incremental learning -\ncatastrophic forgetting. Furthermore, under the auspices of this\ntendency-driven mutual exclusivity relationship, we generate pseudo masks for\nthe new classes, allowing for concurrent execution with model parameter\nupdating via the resolution of a bi-level optimization problem. Extensive\nexperiments substantiate the effectiveness of our framework, resulting in the\nestablishment of new benchmarks and paving the way for further research in this\nfield.\n","authors":["Chongjie Si","Xuehui Wang","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.11981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12804v1","updated":"2024-04-19T11:38:34Z","published":"2024-04-19T11:38:34Z","title":"Linearly-evolved Transformer for Pan-sharpening","summary":" Vision transformer family has dominated the satellite pan-sharpening field\ndriven by the global-wise spatial information modeling mechanism from the core\nself-attention ingredient. The standard modeling rules within these promising\npan-sharpening methods are to roughly stack the transformer variants in a\ncascaded manner. 
Despite the remarkable advancement, their success may be at\nthe huge cost of model parameters and FLOPs, thus preventing its application\nover low-resource satellites.To address this challenge between favorable\nperformance and expensive computation, we tailor an efficient linearly-evolved\ntransformer variant and employ it to construct a lightweight pan-sharpening\nframework. In detail, we deepen into the popular cascaded transformer modeling\nwith cutting-edge methods and develop the alternative 1-order linearly-evolved\ntransformer variant with the 1-dimensional linear convolution chain to achieve\nthe same function. In this way, our proposed method is capable of benefiting\nthe cascaded modeling rule while achieving favorable performance in the\nefficient manner. Extensive experiments over multiple satellite datasets\nsuggest that our proposed method achieves competitive performance against other\nstate-of-the-art with fewer computational resources. Further, the consistently\nfavorable performance has been verified over the hyper-spectral image fusion\ntask. Our main focus is to provide an alternative global modeling framework\nwith an efficient structure. The code will be publicly available.\n","authors":["Junming Hou","Zihan Cao","Naishan Zheng","Xuan Li","Xiaoyu Chen","Xinyang Liu","Xiaofeng Cong","Man Zhou","Danfeng Hong"],"pdf_url":"https://arxiv.org/pdf/2404.12804v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.12803v1","updated":"2024-04-19T11:38:08Z","published":"2024-04-19T11:38:08Z","title":"TextSquare: Scaling up Text-Centric Visual Instruction Tuning","summary":" Text-centric visual question answering (VQA) has made great strides with the\ndevelopment of Multimodal Large Language Models (MLLMs), yet open-source models\nstill fall short of leading models like GPT4V and Gemini, partly due to a lack\nof extensive, high-quality instruction tuning data. To this end, we introduce a\nnew approach for creating a massive, high-quality instruction-tuning dataset,\nSquare-10M, which is generated using closed-source MLLMs. The data construction\nprocess, termed Square, consists of four steps: Self-Questioning, Answering,\nReasoning, and Evaluation. Our experiments with Square-10M led to three key\nfindings: 1) Our model, TextSquare, considerably surpasses open-source previous\nstate-of-the-art Text-centric MLLMs and sets a new standard on OCRBench(62.2%).\nIt even outperforms top-tier models like GPT4V and Gemini in 6 of 10\ntext-centric benchmarks. 2) Additionally, we demonstrate the critical role of\nVQA reasoning data in offering comprehensive contextual insights for specific\nquestions. This not only improves accuracy but also significantly mitigates\nhallucinations. Specifically, TextSquare scores an average of 75.1% across four\ngeneral VQA and hallucination evaluation datasets, outperforming previous\nstate-of-the-art models. 
3) Notably, the phenomenon observed in scaling\ntext-centric VQA datasets reveals a vivid pattern: the exponential increase of\ninstruction tuning data volume is directly proportional to the improvement in\nmodel performance, thereby validating the necessity of the dataset scale and\nthe high quality of Square-10M.\n","authors":["Jingqun Tang","Chunhui Lin","Zhen Zhao","Shu Wei","Binghong Wu","Qi Liu","Hao Feng","Yang Li","Siqi Wang","Lei Liao","Wei Shi","Yuliang Liu","Hao Liu","Yuan Xie","Xiang Bai","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12798v1","updated":"2024-04-19T11:24:34Z","published":"2024-04-19T11:24:34Z","title":"A Point-Based Approach to Efficient LiDAR Multi-Task Perception","summary":" Multi-task networks can potentially improve performance and computational\nefficiency compared to single-task networks, facilitating online deployment.\nHowever, current multi-task architectures in point cloud perception combine\nmultiple task-specific point cloud representations, each requiring a separate\nfeature encoder and making the network structures bulky and slow. We propose\nPAttFormer, an efficient multi-task architecture for joint semantic\nsegmentation and object detection in point clouds that only relies on a\npoint-based representation. The network builds on transformer-based feature\nencoders using neighborhood attention and grid-pooling and a query-based\ndetection decoder using a novel 3D deformable-attention detection head design.\nUnlike other LiDAR-based multi-task architectures, our proposed PAttFormer does\nnot require separate feature encoders for multiple task-specific point cloud\nrepresentations, resulting in a network that is 3x smaller and 1.4x faster\nwhile achieving competitive performance on the nuScenes and KITTI benchmarks\nfor autonomous driving perception. Our extensive evaluations show substantial\ngains from multi-task learning, improving LiDAR semantic segmentation by +1.7%\nin mIou and 3D object detection by +1.7% in mAP on the nuScenes benchmark\ncompared to the single-task models.\n","authors":["Christopher Lang","Alexander Braun","Lars Schillingmann","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2404.12798v1.pdf","comment":"8 pages, 3 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.12794v1","updated":"2024-04-19T11:17:35Z","published":"2024-04-19T11:17:35Z","title":"MambaMOS: LiDAR-based 3D Moving Object Segmentation with Motion-aware\n State Space Model","summary":" LiDAR-based Moving Object Segmentation (MOS) aims to locate and segment\nmoving objects in point clouds of the current scan using motion information\nfrom previous scans. Despite the promising results achieved by previous MOS\nmethods, several key issues, such as the weak coupling of temporal and spatial\ninformation, still need further study. In this paper, we propose a novel\nLiDAR-based 3D Moving Object Segmentation with Motion-aware State Space Model,\ntermed MambaMOS. Firstly, we develop a novel embedding module, the Time Clue\nBootstrapping Embedding (TCBE), to enhance the coupling of temporal and spatial\ninformation in point clouds and alleviate the issue of overlooked temporal\nclues. Secondly, we introduce the Motion-aware State Space Model (MSSM) to\nendow the model with the capacity to understand the temporal correlations of\nthe same object across different time steps. 
Specifically, MSSM emphasizes the\nmotion states of the same object at different time steps through two distinct\ntemporal modeling and correlation steps. We utilize an improved state space\nmodel to represent these motion differences, significantly modeling the motion\nstates. Finally, extensive experiments on the SemanticKITTI-MOS and KITTI-Road\nbenchmarks demonstrate that the proposed MambaMOS achieves state-of-the-art\nperformance. The source code of this work will be made publicly available at\nhttps://github.com/Terminal-K/MambaMOS.\n","authors":["Kang Zeng","Hao Shi","Jiacheng Lin","Siyu Li","Jintao Cheng","Kaiwei Wang","Zhiyong Li","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12794v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/Terminal-K/MambaMOS"},{"id":"http://arxiv.org/abs/2404.12784v1","updated":"2024-04-19T10:47:53Z","published":"2024-04-19T10:47:53Z","title":"Contrastive Gaussian Clustering: Weakly Supervised 3D Scene Segmentation","summary":" We introduce Contrastive Gaussian Clustering, a novel approach capable of\nprovide segmentation masks from any viewpoint and of enabling 3D segmentation\nof the scene. Recent works in novel-view synthesis have shown how to model the\nappearance of a scene via a cloud of 3D Gaussians, and how to generate accurate\nimages from a given viewpoint by projecting on it the Gaussians before $\\alpha$\nblending their color. Following this example, we train a model to include also\na segmentation feature vector for each Gaussian. These can then be used for 3D\nscene segmentation, by clustering Gaussians according to their feature vectors;\nand to generate 2D segmentation masks, by projecting the Gaussians on a plane\nand $\\alpha$ blending over their segmentation features. Using a combination of\ncontrastive learning and spatial regularization, our method can be trained on\ninconsistent 2D segmentation masks, and still learn to generate segmentation\nmasks consistent across all views. Moreover, the resulting model is extremely\naccurate, improving the IoU accuracy of the predicted masks by $+8\\%$ over the\nstate of the art. Code and trained models will be released soon.\n","authors":["Myrna C. Silva","Mahtab Dahaghin","Matteo Toso","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2404.12784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12782v1","updated":"2024-04-19T10:43:25Z","published":"2024-04-19T10:43:25Z","title":"Sentiment-oriented Transformer-based Variational Autoencoder Network for\n Live Video Commenting","summary":" Automatic live video commenting is with increasing attention due to its\nsignificance in narration generation, topic explanation, etc. However, the\ndiverse sentiment consideration of the generated comments is missing from the\ncurrent methods. Sentimental factors are critical in interactive commenting,\nand lack of research so far. Thus, in this paper, we propose a\nSentiment-oriented Transformer-based Variational Autoencoder (So-TVAE) network\nwhich consists of a sentiment-oriented diversity encoder module and a batch\nattention module, to achieve diverse video commenting with multiple sentiments\nand multiple semantics. Specifically, our sentiment-oriented diversity encoder\nelegantly combines VAE and random mask mechanism to achieve semantic diversity\nunder sentiment guidance, which is then fused with cross-modal features to\ngenerate live video comments. 
Furthermore, a batch attention module is also\nproposed in this paper to alleviate the problem of missing sentimental samples,\ncaused by the data imbalance, which is common in live videos as the popularity\nof videos varies. Extensive experiments on Livebot and VideoIC datasets\ndemonstrate that the proposed So-TVAE outperforms the state-of-the-art methods\nin terms of the quality and diversity of generated comments. Related code is\navailable at https://github.com/fufy1024/So-TVAE.\n","authors":["Fengyi Fu","Shancheng Fang","Weidong Chen","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2404.12782v1.pdf","comment":"27 pages, 10 figures, ACM Transactions on Multimedia Computing,\n Communications and Applications, 2024"},{"id":"http://arxiv.org/abs/2404.12777v1","updated":"2024-04-19T10:32:30Z","published":"2024-04-19T10:32:30Z","title":"EfficientGS: Streamlining Gaussian Splatting for Large-Scale\n High-Resolution Scene Representation","summary":" In the domain of 3D scene representation, 3D Gaussian Splatting (3DGS) has\nemerged as a pivotal technology. However, its application to large-scale,\nhigh-resolution scenes (exceeding 4k$\\times$4k pixels) is hindered by the\nexcessive computational requirements for managing a large number of Gaussians.\nAddressing this, we introduce 'EfficientGS', an advanced approach that\noptimizes 3DGS for high-resolution, large-scale scenes. We analyze the\ndensification process in 3DGS and identify areas of Gaussian\nover-proliferation. We propose a selective strategy, limiting Gaussian increase\nto key primitives, thereby enhancing the representational efficiency.\nAdditionally, we develop a pruning mechanism to remove redundant Gaussians,\nthose that are merely auxiliary to adjacent ones. For further enhancement, we\nintegrate a sparse order increment for Spherical Harmonics (SH), designed to\nalleviate storage constraints and reduce training overhead. Our empirical\nevaluations, conducted on a range of datasets including extensive 4K+ aerial\nimages, demonstrate that 'EfficientGS' not only expedites training and\nrendering times but also achieves this with a model size approximately tenfold\nsmaller than conventional 3DGS while maintaining high rendering fidelity.\n","authors":["Wenkai Liu","Tao Guan","Bin Zhu","Lili Ju","Zikai Song","Dan Li","Yuesong Wang","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09325v2","updated":"2024-04-19T10:28:03Z","published":"2022-06-19T04:49:35Z","title":"EATFormer: Improving Vision Transformer Inspired by Evolutionary\n Algorithm","summary":" Motivated by biological evolution, this paper explains the rationality of\nVision Transformer by analogy with the proven practical Evolutionary Algorithm\n(EA) and derives that both have consistent mathematical formulation. Then\ninspired by effective EA variants, we propose a novel pyramid EATFormer\nbackbone that only contains the proposed \\emph{EA-based Transformer} (EAT)\nblock, which consists of three residual parts, i.e., \\emph{Multi-Scale Region\nAggregation} (MSRA), \\emph{Global and Local Interaction} (GLI), and\n\\emph{Feed-Forward Network} (FFN) modules, to model multi-scale, interactive,\nand individual information separately. Moreover, we design a \\emph{Task-Related\nHead} (TRH) docked with transformer backbone to complete final information\nfusion more flexibly and \\emph{improve} a \\emph{Modulated Deformable MSA}\n(MD-MSA) to dynamically model irregular locations. 
Massive quantitative and\nquantitative experiments on image classification, downstream tasks, and\nexplanatory experiments demonstrate the effectiveness and superiority of our\napproach over State-Of-The-Art (SOTA) methods. \\Eg, our Mobile (1.8M), Tiny\n(6.1M), Small (24.3M), and Base (49.0M) models achieve 69.4, 78.4, 83.1, and\n83.9 Top-1 only trained on ImageNet-1K with naive training recipe;\nEATFormer-Tiny/Small/Base armed Mask-R-CNN obtain 45.4/47.4/49.0 box AP and\n41.4/42.9/44.2 mask AP on COCO detection, surpassing contemporary MPViT-T,\nSwin-T, and Swin-S by 0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP separately\nwith less FLOPs; Our EATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K by\nUpernet that exceeds Swin-T/S by 2.8/1.7. Code is available at\n\\url{https://github.com/zhangzjn/EATFormer}.\n","authors":["Jiangning Zhang","Xiangtai Li","Yabiao Wang","Chengjie Wang","Yibo Yang","Yong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2206.09325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12770v1","updated":"2024-04-19T10:21:33Z","published":"2024-04-19T10:21:33Z","title":"Camera Agnostic Two-Head Network for Ego-Lane Inference","summary":" Vision-based ego-lane inference using High-Definition (HD) maps is essential\nin autonomous driving and advanced driver assistance systems. The traditional\napproach necessitates well-calibrated cameras, which confines variation of\ncamera configuration, as the algorithm relies on intrinsic and extrinsic\ncalibration. In this paper, we propose a learning-based ego-lane inference by\ndirectly estimating the ego-lane index from a single image. To enhance robust\nperformance, our model incorporates the two-head structure inferring ego-lane\nin two perspectives simultaneously. Furthermore, we utilize an attention\nmechanism guided by vanishing point-and-line to adapt to changes in viewpoint\nwithout requiring accurate calibration. The high adaptability of our model was\nvalidated in diverse environments, devices, and camera mounting points and\norientations.\n","authors":["Chaehyeon Song","Sungho Yoon","Minhyeok Heo","Ayoung Kim","Sujung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12768v1","updated":"2024-04-19T10:17:10Z","published":"2024-04-19T10:17:10Z","title":"MixLight: Borrowing the Best of both Spherical Harmonics and Gaussian\n Models","summary":" Accurately estimating scene lighting is critical for applications such as\nmixed reality. Existing works estimate illumination by generating illumination\nmaps or regressing illumination parameters. However, the method of generating\nillumination maps has poor generalization performance and parametric models\nsuch as Spherical Harmonic (SH) and Spherical Gaussian (SG) fall short in\ncapturing high-frequency or low-frequency components. This paper presents\nMixLight, a joint model that utilizes the complementary characteristics of SH\nand SG to achieve a more complete illumination representation, which uses SH\nand SG to capture low-frequency ambient and high-frequency light sources\nrespectively. In addition, a special spherical light source sparsemax\n(SLSparsemax) module that refers to the position and brightness relationship\nbetween spherical light sources is designed to improve their sparsity, which is\nsignificant but omitted by prior works. Extensive experiments demonstrate that\nMixLight surpasses state-of-the-art (SOTA) methods on multiple metrics. 
In\naddition, experiments on Web Dataset also show that MixLight as a parametric\nmethod has better generalization performance than non-parametric methods.\n","authors":["Xinlong Ji","Fangneng Zhan","Shijian Lu","Shi-Sheng Huang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2404.12768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12766v1","updated":"2024-04-19T10:10:39Z","published":"2024-04-19T10:10:39Z","title":"Continual Learning on a Diet: Learning from Sparsely Labeled Streams\n Under Constrained Computation","summary":" We propose and study a realistic Continual Learning (CL) setting where\nlearning algorithms are granted a restricted computational budget per time step\nwhile training. We apply this setting to large-scale semi-supervised Continual\nLearning scenarios with sparse label rates. Previous proficient CL methods\nperform very poorly in this challenging setting. Overfitting to the sparse\nlabeled data and insufficient computational budget are the two main culprits\nfor such a poor performance. Our new setting encourages learning methods to\neffectively and efficiently utilize the unlabeled data during training. To that\nend, we propose a simple but highly effective baseline, DietCL, which utilizes\nboth unlabeled and labeled data jointly. DietCL meticulously allocates\ncomputational budget for both types of data. We validate our baseline, at\nscale, on several datasets, e.g., CLOC, ImageNet10K, and CGLM, under constraint\nbudget setups. DietCL outperforms, by a large margin, all existing supervised\nCL algorithms as well as more recent continual semi-supervised methods. Our\nextensive analysis and ablations demonstrate that DietCL is stable under a full\nspectrum of label sparsity, computational budget, and various other ablations.\n","authors":["Wenxuan Zhang","Youssef Mohamed","Bernard Ghanem","Philip H. S. Torr","Adel Bibi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2404.12766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12739v1","updated":"2024-04-19T09:32:16Z","published":"2024-04-19T09:32:16Z","title":"The Solution for the CVPR2024 NICE Image Captioning Challenge","summary":" This report introduces a solution to the Topic 1 Zero-shot Image Captioning\nof 2024 NICE : New frontiers for zero-shot Image Captioning Evaluation. In\ncontrast to NICE 2023 datasets, this challenge involves new annotations by\nhumans with significant differences in caption style and content. Therefore, we\nenhance image captions effectively through retrieval augmentation and caption\ngrading methods. At the data level, we utilize high-quality captions generated\nby image caption models as training data to address the gap in text styles. At\nthe model level, we employ OFA (a large-scale visual-language pre-training\nmodel based on handcrafted templates) to perform the image captioning task.\nSubsequently, we propose caption-level strategy for the high-quality caption\ndata generated by the image caption models and integrate them with retrieval\naugmentation strategy into the template to compel the model to generate higher\nquality, more matching, and semantically enriched captions based on the\nretrieval augmentation prompts. 
Our approach ranks first on the leaderboard,\nachieving a CIDEr score of 234.11 and 1st in all other metrics.\n","authors":["Longfei Huang","Shupeng Zhong","Xiangyu Wu","Ruoxuan Li","Qingguo Chen","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v1","updated":"2024-04-19T09:28:16Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of OCR technology and the expansion of\napplication fields, text recognition in complex scenes has become a key\nchallenge. Factors such as multiple fonts, mixed scenes and complex layouts\nseriously affect the recognition accuracy of traditional OCR models. Although\nOCR models based on deep learning have performed well in specific fields or\nsimilar data sets in recent years, the generalization ability and robustness of\nthe model are still a big challenge when facing complex environments with\nmultiple scenes. Furthermore, training an OCR model from scratch or fine-tuning\nall parameters is very demanding on computing resources and inference time,\nwhich limits the flexibility of its application. This study focuses on a\nfundamental aspect of mixed text recognition in response to the challenges\nmentioned above, which involves effectively fine-tuning the pre-trained basic\nOCR model to demonstrate exceptional performance across various downstream\ntasks. To this end, we propose a parameter-efficient hybrid text recognition\nmethod based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method\nembeds DoRA into the image encoder and LoRA into the internal structure of the\ntext decoder, enabling efficient parameter fine-tuning for downstream tasks.\nExperimental results show that compared to similar parameter adjustment\nmethods, our model DLoRA-TrOCR has the smallest number of parameters and\nperforms better. It can achieve state-of-the-art performance on complex scene\ndata sets involving simultaneous recognition of mixed handwritten, printed and\nstreet view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12730v1","updated":"2024-04-19T09:22:20Z","published":"2024-04-19T09:22:20Z","title":"PATE-TripleGAN: Privacy-Preserving Image Synthesis with Gaussian\n Differential Privacy","summary":" Conditional Generative Adversarial Networks (CGANs) exhibit significant\npotential in supervised learning model training by virtue of their ability to\ngenerate realistic labeled images. However, numerous studies have indicated the\nprivacy leakage risk in CGANs models. The solution DPCGAN, incorporating the\ndifferential privacy framework, faces challenges such as heavy reliance on\nlabeled data for model training and potential disruptions to original gradient\ninformation due to excessive gradient clipping, making it difficult to ensure\nmodel accuracy. To address these challenges, we present a privacy-preserving\ntraining framework called PATE-TripleGAN. This framework incorporates a\nclassifier to pre-classify unlabeled data, establishing a three-party min-max\ngame to reduce dependence on labeled data. Furthermore, we present a hybrid\ngradient desensitization algorithm based on the Private Aggregation of Teacher\nEnsembles (PATE) framework and Differential Private Stochastic Gradient Descent\n(DPSGD) method. 
This algorithm allows the model to retain gradient information\nmore effectively while ensuring privacy protection, thereby enhancing the\nmodel's utility. Privacy analysis and extensive experiments affirm that the\nPATE-TripleGAN model can generate a higher quality labeled image dataset while\nensuring the privacy of the training data.\n","authors":["Zepeng Jiang","Weiwei Ni","Yifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16050v2","updated":"2024-04-19T09:22:06Z","published":"2024-03-24T07:33:08Z","title":"Heterogeneous Federated Learning with Splited Language Model","summary":" Federated Split Learning (FSL) is a promising distributed learning paradigm\nin practice, which gathers the strengths of both Federated Learning (FL) and\nSplit Learning (SL) paradigms, to ensure model privacy while diminishing the\nresource overhead of each client, especially on large transformer models in a\nresource-constrained environment, e.g., Internet of Things (IoT). However,\nalmost all works merely investigate the performance with simple neural network\nmodels in FSL. Despite the minor efforts focusing on incorporating Vision\nTransformers (ViT) as model architectures, they train ViT from scratch, thereby\nleading to enormous training overhead in each device with limited resources.\nTherefore, in this paper, we harness Pre-trained Image Transformers (PITs) as\nthe initial model, coined FedV, to accelerate the training process and improve\nmodel robustness. Furthermore, we propose FedVZ to hinder the gradient\ninversion attack, especially having the capability compatible with black-box\nscenarios, where the gradient information is unavailable. Concretely, FedVZ\napproximates the server gradient by utilizing a zeroth-order (ZO) optimization,\nwhich replaces the backward propagation with just one forward process.\nEmpirically, we are the first to provide a systematic evaluation of FSL methods\nwith PITs in real-world datasets, different partial device participations, and\nheterogeneous data splits. Our experiments verify the effectiveness of our\nalgorithms.\n","authors":["Yifan Shi","Yuhui Zhang","Ziyue Huang","Xiaofeng Yang","Li Shen","Wei Chen","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12725v1","updated":"2024-04-19T09:08:44Z","published":"2024-04-19T09:08:44Z","title":"Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual\n Target Speech Extraction","summary":" The integration of visual cues has revitalized the performance of the target\nspeech extraction task, elevating it to the forefront of the field.\nNevertheless, this multi-modal learning paradigm often encounters the challenge\nof modality imbalance. In audio-visual target speech extraction tasks, the\naudio modality tends to dominate, potentially overshadowing the importance of\nvisual guidance. To tackle this issue, we propose AVSepChain, drawing\ninspiration from the speech chain concept. Our approach partitions the\naudio-visual target speech extraction task into two stages: speech perception\nand speech production. In the speech perception stage, audio serves as the\ndominant modality, while visual information acts as the conditional modality.\nConversely, in the speech production stage, the roles are reversed. This\ntransformation of modality status aims to alleviate the problem of modality\nimbalance. 
Additionally, we introduce a contrastive semantic matching loss to\nensure that the semantic information conveyed by the generated speech aligns\nwith the semantic information conveyed by lip movements during the speech\nproduction stage. Through extensive experiments conducted on multiple benchmark\ndatasets for audio-visual target speech extraction, we showcase the superior\nperformance achieved by our proposed method.\n","authors":["Zhaoxi Mu","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12725v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.12721v1","updated":"2024-04-19T09:01:58Z","published":"2024-04-19T09:01:58Z","title":"Generalized Few-Shot Meets Remote Sensing: Discovering Novel Classes in\n Land Cover Mapping via Hybrid Semantic Segmentation Framework","summary":" Land-cover mapping is one of the vital applications in Earth observation,\naiming at classifying each pixel's land-cover type of remote-sensing images. As\nnatural and human activities change the landscape, the land-cover map needs to\nbe rapidly updated. However, discovering newly appeared land-cover types in\nexisting classification systems is still a non-trivial task hindered by various\nscales of complex land objects and insufficient labeled data over a wide-span\ngeographic area. In this paper, we propose a generalized few-shot\nsegmentation-based framework, named SegLand, to update novel classes in\nhigh-resolution land-cover mapping. Specifically, the proposed framework is\ndesigned in three parts: (a) Data pre-processing: the base training set and the\nfew-shot support sets of novel classes are analyzed and augmented; (b) Hybrid\nsegmentation structure; Multiple base learners and a modified Projection onto\nOrthogonal Prototypes (POP) network are combined to enhance the base-class\nrecognition and to dig novel classes from insufficient labels data; (c)\nUltimate fusion: the semantic segmentation results of the base learners and POP\nnetwork are reasonably fused. The proposed framework has won first place in the\nleaderboard of the OpenEarthMap Land Cover Mapping Few-Shot Challenge.\nExperiments demonstrate the superiority of the framework for automatically\nupdating novel land-cover classes with limited labeled data.\n","authors":["Zhuohong Li","Fangxiao Lu","Jiaqi Zou","Lei Hu","Hongyan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12721v1.pdf","comment":"11 pages, 11 figures, accepted by CVPR 2024 L3D-IVU Workshop"},{"id":"http://arxiv.org/abs/2404.12720v1","updated":"2024-04-19T09:00:05Z","published":"2024-04-19T09:00:05Z","title":"PDF-MVQA: A Dataset for Multimodal Information Retrieval in PDF-based\n Visual Question Answering","summary":" Document Question Answering (QA) presents a challenge in understanding\nvisually-rich documents (VRD), particularly those dominated by lengthy textual\ncontent like research journal articles. Existing studies primarily focus on\nreal-world documents with sparse text, while challenges persist in\ncomprehending the hierarchical semantic relations among multiple pages to\nlocate multimodal components. To address this gap, we propose PDF-MVQA, which\nis tailored for research journal articles, encompassing multiple pages and\nmultimodal information retrieval. 
Unlike traditional machine reading\ncomprehension (MRC) tasks, our approach aims to retrieve entire paragraphs\ncontaining answers or visually rich document entities like tables and figures.\nOur contributions include the introduction of a comprehensive PDF Document VQA\ndataset, allowing the examination of semantically hierarchical layout\nstructures in text-dominant documents. We also present new VRD-QA frameworks\ndesigned to grasp textual contents and relations among document layouts\nsimultaneously, extending page-level understanding to the entire multi-page\ndocument. Through this work, we aim to enhance the capabilities of existing\nvision-and-language models in handling challenges posed by text-dominant\ndocuments in VRD-QA.\n","authors":["Yihao Ding","Kaixuan Ren","Jiabin Huang","Siwen Luo","Soyeon Caren Han"],"pdf_url":"https://arxiv.org/pdf/2404.12720v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2307.01004v2","updated":"2024-04-19T08:59:37Z","published":"2023-07-03T13:40:20Z","title":"Joint Coordinate Regression and Association For Multi-Person Pose\n Estimation, A Pure Neural Network Approach","summary":" We introduce a novel one-stage end-to-end multi-person 2D pose estimation\nalgorithm, known as Joint Coordinate Regression and Association (JCRA), that\nproduces human pose joints and associations without requiring any\npost-processing. The proposed algorithm is fast, accurate, effective, and\nsimple. The one-stage end-to-end network architecture significantly improves\nthe inference speed of JCRA. Meanwhile, we devised a symmetric network\nstructure for both the encoder and decoder, which ensures high accuracy in\nidentifying keypoints. It follows an architecture that directly outputs part\npositions via a transformer network, resulting in a significant improvement in\nperformance. Extensive experiments on the MS COCO and CrowdPose benchmarks\ndemonstrate that JCRA outperforms state-of-the-art approaches in both accuracy\nand efficiency. Moreover, JCRA demonstrates 69.2 mAP and is 78\\% faster at\ninference acceleration than previous state-of-the-art bottom-up algorithms. The\ncode for this algorithm will be publicly available.\n","authors":["Dongyang Yu","Yunshi Xie","Wangpeng An","Li Zhang","Yufeng Yao"],"pdf_url":"https://arxiv.org/pdf/2307.01004v2.pdf","comment":"This paper has been accepted by MMasia 2023 and is an oral\n presentation"},{"id":"http://arxiv.org/abs/2404.12718v1","updated":"2024-04-19T08:58:53Z","published":"2024-04-19T08:58:53Z","title":"Improving Prediction Accuracy of Semantic Segmentation Methods Using\n Convolutional Autoencoder Based Pre-processing Layers","summary":" In this paper, we propose a method to improve prediction accuracy of semantic\nsegmentation methods as follows: (1) construct a neural network that has\npre-processing layers based on a convolutional autoencoder ahead of a semantic\nsegmentation network, and (2) train the entire network initialized by the\nweights of the pre-trained autoencoder. We applied this method to the fully\nconvolutional network (FCN) and experimentally compared its prediction accuracy\non the cityscapes dataset. The Mean IoU of the proposed target model with the\nHe normal initialization is 18.7% higher than that of FCN with the He normal\ninitialization. 
In addition, those of the modified models of the target model\nare significantly higher than that of FCN with the He normal initialization.\nThe accuracy and loss curves during the training showed that these are\nresulting from the improvement of the generalization ability. All of these\nresults provide strong evidence that the proposed method is significantly\neffective in improving the prediction accuracy of FCN. The proposed method has\nthe following features: it is comparatively simple, whereas the effect on\nimproving the generalization ability and prediction accuracy of FCN is\nsignificant; the increase in the number of parameters by using it is very\nsmall, and that in the computation time is substantially large. In principle,\nthe proposed method can be applied to other semantic segmentation methods. For\nsemantic segmentation, at present, there is no effective way to improve the\nprediction accuracy of existing methods. None have published a method which is\nthe same as or similar to our method and none have used such a method in\npractice. Therefore, we believe that our method is useful in practice and\nworthy of being widely known and used.\n","authors":["Hisashi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2404.12718v1.pdf","comment":"13 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2312.04861v2","updated":"2024-04-19T08:55:34Z","published":"2023-12-08T06:31:19Z","title":"Exploring Radar Data Representations in Autonomous Driving: A\n Comprehensive Review","summary":" With the rapid advancements of sensor technology and deep learning,\nautonomous driving systems are providing safe and efficient access to\nintelligent vehicles as well as intelligent transportation. Among these\nequipped sensors, the radar sensor plays a crucial role in providing robust\nperception information in diverse environmental conditions. This review focuses\non exploring different radar data representations utilized in autonomous\ndriving systems. Firstly, we introduce the capabilities and limitations of the\nradar sensor by examining the working principles of radar perception and signal\nprocessing of radar measurements. Then, we delve into the generation process of\nfive radar representations, including the ADC signal, radar tensor, point\ncloud, grid map, and micro-Doppler signature. For each radar representation, we\nexamine the related datasets, methods, advantages and limitations. Furthermore,\nwe discuss the challenges faced in these data representations and propose\npotential research directions. Above all, this comprehensive review offers an\nin-depth insight into how these representations enhance autonomous system\ncapabilities, providing guidance for radar perception researchers. To\nfacilitate retrieval and comparison of different data representations, datasets\nand methods, we provide an interactive website at\nhttps://radar-camera-fusion.github.io/radar.\n","authors":["Shanliang Yao","Runwei Guan","Zitian Peng","Chenhang Xu","Yilu Shi","Weiping Ding","Eng Gee Lim","Yong Yue","Hyungjoon Seo","Ka Lok Man","Jieming Ma","Xiaohui Zhu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2312.04861v2.pdf","comment":"24 pages, 10 figures, 5 tables. 
arXiv admin note: text overlap with\n arXiv:2304.10410"},{"id":"http://arxiv.org/abs/2311.15727v2","updated":"2024-04-19T08:51:58Z","published":"2023-11-27T11:24:25Z","title":"MARIS: Referring Image Segmentation via Mutual-Aware Attention Features","summary":" Referring image segmentation (RIS) aims to segment a particular region based\non a language expression prompt. Existing methods incorporate linguistic\nfeatures into visual features and obtain multi-modal features for mask\ndecoding. However, these methods may segment the visually salient entity\ninstead of the correct referring region, as the multi-modal features are\ndominated by the abundant visual context. In this paper, we propose MARIS, a\nreferring image segmentation method that leverages the Segment Anything Model\n(SAM) and introduces a mutual-aware attention mechanism to enhance the\ncross-modal fusion via two parallel branches. Specifically, our mutual-aware\nattention mechanism consists of Vision-Guided Attention and Language-Guided\nAttention, which bidirectionally model the relationship between visual and\nlinguistic features. Correspondingly, we design a Mask Decoder to enable\nexplicit linguistic guidance for more consistent segmentation with the language\nexpression. To this end, a multi-modal query token is proposed to integrate\nlinguistic information and interact with visual information simultaneously.\nExtensive experiments on three benchmark datasets show that our method\noutperforms the state-of-the-art RIS methods. Our code will be publicly\navailable.\n","authors":["Mengxi Zhang","Yiming Liu","Xiangjun Yin","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12712v1","updated":"2024-04-19T08:46:33Z","published":"2024-04-19T08:46:33Z","title":"uTRAND: Unsupervised Anomaly Detection in Traffic Trajectories","summary":" Deep learning-based approaches have achieved significant improvements on\npublic video anomaly datasets, but often do not perform well in real-world\napplications. This paper addresses two issues: the lack of labeled data and the\ndifficulty of explaining the predictions of a neural network. To this end, we\npresent a framework called uTRAND, that shifts the problem of anomalous\ntrajectory prediction from the pixel space to a semantic-topological domain.\nThe framework detects and tracks all types of traffic agents in bird's-eye-view\nvideos of traffic cameras mounted at an intersection. By conceptualizing the\nintersection as a patch-based graph, it is shown that the framework learns and\nmodels the normal behaviour of traffic agents without costly manual labeling.\nFurthermore, uTRAND allows to formulate simple rules to classify anomalous\ntrajectories in a way suited for human interpretation. We show that uTRAND\noutperforms other state-of-the-art approaches on a dataset of anomalous\ntrajectories collected in a real-world setting, while producing explainable\ndetection results.\n","authors":["Giacomo D'Amicantonio","Egor Bondarau","Peter H. N. de With"],"pdf_url":"https://arxiv.org/pdf/2404.12712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12711v1","updated":"2024-04-19T08:40:52Z","published":"2024-04-19T08:40:52Z","title":"Dynamic Temperature Knowledge Distillation","summary":" Temperature plays a pivotal role in moderating label softness in the realm of\nknowledge distillation (KD). 
Traditional approaches often employ a static\ntemperature throughout the KD process, which fails to address the nuanced\ncomplexities of samples with varying levels of difficulty and overlooks the\ndistinct capabilities of different teacher-student pairings. This leads to a\nless-than-ideal transfer of knowledge. To improve the process of knowledge\npropagation, we propose Dynamic Temperature Knowledge Distillation (DTKD),\nwhich introduces a dynamic, cooperative temperature control for both teacher\nand student models simultaneously within each training iteration. In\nparticular, we propose \"\\textbf{sharpness}\" as a metric to quantify the\nsmoothness of a model's output distribution. By minimizing the sharpness\ndifference between the teacher and the student, we can derive sample-specific\ntemperatures for them respectively. Extensive experiments on CIFAR-100 and\nImageNet-2012 demonstrate that DTKD performs comparably to leading KD\ntechniques, with added robustness in Target Class KD and None-target Class KD\nscenarios. The code is available at https://github.com/JinYu1998/DTKD.\n","authors":["Yukang Wei","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2404.12711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v3","updated":"2024-04-19T08:36:25Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09084v3","updated":"2024-04-19T08:24:06Z","published":"2023-08-17T16:23:52Z","title":"MovePose: A High-performance Human Pose Estimation Algorithm on Mobile\n and Edge Devices","summary":" We present MovePose, an optimized lightweight convolutional neural network\ndesigned specifically for real-time body pose estimation on CPU-based mobile\ndevices. The current solutions do not provide satisfactory accuracy and speed\nfor human posture estimation, and MovePose addresses this gap. It aims to\nmaintain real-time performance while improving the accuracy of human posture\nestimation for mobile devices. 
Our MovePose algorithm has attained a Mean\nAverage Precision (mAP) score of 68.0 on the COCO \\cite{cocodata} validation\ndataset. The MovePose algorithm displayed efficiency with a performance of 69+\nframes per second (fps) when run on an Intel i9-10920x CPU. Additionally, it\nshowcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an\nAndroid phone equipped with a Snapdragon 8 + 4G processor, the fps reached\nabove 11. To enhance accuracy, we incorporated three techniques: deconvolution,\nlarge kernel convolution, and coordinate classification methods. Compared to\nbasic upsampling, deconvolution is trainable, improves model capacity, and\nenhances the receptive field. Large kernel convolution strengthens these\nproperties at a decreased computational cost. In summary, MovePose provides\nhigh accuracy and real-time performance, making it a potential tool for a\nvariety of applications, including those focused on mobile-side human posture\nestimation. The code and models for this algorithm will be made publicly\naccessible.\n","authors":["Dongyang Yu","Haoyue Zhang","Ruisheng Zhao","Guoqi Chen","Wangpeng An","Yanhong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09084v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12702v1","updated":"2024-04-19T08:20:18Z","published":"2024-04-19T08:20:18Z","title":"Modeling Multi-Granularity Context Information Flow for Pavement Crack\n Detection","summary":" Crack detection has become an indispensable, interesting yet challenging task\nin the computer vision community. Specifically, pavement cracks have a highly\ncomplex spatial structure, a low contrasting background and a weak spatial\ncontinuity, posing a significant challenge to an effective crack detection\nmethod. In this paper, we address these problems from a view that utilizes\ncontexts of the cracks and propose an end-to-end deep learning method to model\nthe context information flow. To precisely localize cracks in an image, it is\ncritical to effectively extract and aggregate multi-granularity context,\nincluding the fine-grained local context around the cracks (in spatial-level)\nand the coarse-grained semantics (in segment-level). Concretely, in\nConvolutional Neural Network (CNN), low-level features extracted by the shallow\nlayers represent the local information, while the deep layers extract the\nsemantic features. Additionally, a second main insight in this work is that the\nsemantic context should provide guidance for the local context features. Guided by these\ninsights, we first apply dilated convolution as the\nbackbone feature extractor to model the local context, and then build a context\nguidance module to leverage semantic context to guide local feature extraction\nat multiple stages. To handle label alignment between stages, we apply the\nMultiple Instance Learning (MIL) strategy to align the high-level feature to\nthe low-level ones in the stage-wise context flow. In addition, compared with\nexisting public crack datasets, to the best of our knowledge, we release the largest,\nmost complex and most challenging Bitumen Pavement Crack (BPC) dataset. 
The\nexperimental results on the three crack datasets demonstrate that the proposed\nmethod performs well and outperforms the current state-of-the-art methods.\n","authors":["Junbiao Pang","Baocheng Xiong","Jiaqi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.12702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12694v1","updated":"2024-04-19T07:50:13Z","published":"2024-04-19T07:50:13Z","title":"ESC: Evolutionary Stitched Camera Calibration in the Wild","summary":" This work introduces a novel end-to-end approach for estimating extrinsic\nparameters of cameras in multi-camera setups on real-life sports fields. We\nidentify the source of significant calibration errors in multi-camera\nenvironments and address the limitations of existing calibration methods,\nparticularly the disparity between theoretical models and actual sports field\ncharacteristics. We propose the Evolutionary Stitched Camera calibration (ESC)\nalgorithm to bridge this gap. It consists of image segmentation followed by\nevolutionary optimization of a novel loss function, providing a unified and\naccurate multi-camera calibration solution with high visual fidelity. The\noutcome allows the creation of virtual stitched views from multiple video\nsources, being as important for practical applications as numerical accuracy.\nWe demonstrate the superior performance of our approach compared to\nstate-of-the-art methods across diverse real-life football fields with varying\nphysical characteristics.\n","authors":["Grzegorz Rypeść","Grzegorz Kurzejamski"],"pdf_url":"https://arxiv.org/pdf/2404.12694v1.pdf","comment":"Accepted for IEEE CEC 2024"},{"id":"http://arxiv.org/abs/2404.12693v1","updated":"2024-04-19T07:47:23Z","published":"2024-04-19T07:47:23Z","title":"Improving Chinese Character Representation with Formation Tree","summary":" Learning effective representations for Chinese characters presents unique\nchallenges, primarily due to the vast number of characters and their continuous\ngrowth, which requires models to handle an expanding category space.\nAdditionally, the inherent sparsity of character usage complicates the\ngeneralization of learned representations. Prior research has explored\nradical-based sequences to overcome these issues, achieving progress in\nrecognizing unseen characters. However, these approaches fail to fully exploit\nthe inherent tree structure of such sequences. To address these limitations and\nleverage established data properties, we propose Formation Tree-CLIP (FT-CLIP).\nThis model utilizes formation trees to represent characters and incorporates a\ndedicated tree encoder, significantly improving performance in both seen and\nunseen character recognition tasks. We further introduce masking for both\ncharacter images and tree nodes, enabling efficient and effective training.\nThis approach accelerates training significantly (by a factor of 2 or more)\nwhile enhancing accuracy. 
Extensive experiments show that processing characters\nthrough formation trees aligns better with their inherent properties than\ndirect sequential methods, significantly enhancing the generality and usability\nof the representations.\n","authors":["Yang Hong","Yinfei Li","Xiaojun Qiao","Rui Li","Junsong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00394v2","updated":"2024-04-19T07:43:52Z","published":"2024-03-01T09:29:41Z","title":"List-Mode PET Image Reconstruction Using Dykstra-Like Splitting","summary":" Convergence of the block iterative method in image reconstruction for\npositron emission tomography (PET) requires careful control of relaxation\nparameters, which is a challenging task. The automatic determination of\nrelaxation parameters for list-mode reconstructions also remains challenging.\nTherefore, a different approach would be desirable. In this study, we propose a\nlist-mode maximum likelihood Dykstra-like splitting PET reconstruction\n(LM-MLDS). LM-MLDS converges the list-mode block iterative method by adding the\ndistance from an initial image as a penalty term into an objective function.\nLM-MLDS takes a two-step approach because its performance depends on the\nquality of the initial image. The first step uses a uniform image as the\ninitial image, and then the second step uses a reconstructed image after one\nmain iteration as the initial image. In a simulation study, LM-MLDS provided a\nbetter tradeoff curve between noise and contrast than the other methods. In a\nclinical study, LM-MLDS removed the false hotspots at the edge of the axial\nfield of view and improved the image quality of slices covering the top of the\nhead to the cerebellum. List-mode proximal splitting reconstruction is useful\nnot only for optimizing nondifferential functions but also for converging block\niterative methods without controlling relaxation parameters.\n","authors":["Kibo Ote","Fumio Hashimoto","Yuya Onishi","Yasuomi Ouchi"],"pdf_url":"https://arxiv.org/pdf/2403.00394v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.12680v1","updated":"2024-04-19T07:30:36Z","published":"2024-04-19T07:30:36Z","title":"VoxAtnNet: A 3D Point Clouds Convolutional Neural Network for\n Generalizable Face Presentation Attack Detection","summary":" Facial biometrics are an essential components of smartphones to ensure\nreliable and trustworthy authentication. However, face biometric systems are\nvulnerable to Presentation Attacks (PAs), and the availability of more\nsophisticated presentation attack instruments such as 3D silicone face masks\nwill allow attackers to deceive face recognition systems easily. In this work,\nwe propose a novel Presentation Attack Detection (PAD) algorithm based on 3D\npoint clouds captured using the frontal camera of a smartphone to detect\npresentation attacks. The proposed PAD algorithm, VoxAtnNet, processes 3D point\nclouds to obtain voxelization to preserve the spatial structure. Then, the\nvoxelized 3D samples were trained using the novel convolutional attention\nnetwork to detect PAs on the smartphone. Extensive experiments were carried out\non the newly constructed 3D face point cloud dataset comprising bona fide and\ntwo different 3D PAIs (3D silicone face mask and wrap photo mask), resulting in\n3480 samples. The performance of the proposed method was compared with existing\nmethods to benchmark the detection performance using three different evaluation\nprotocols. 
The experimental results demonstrate the improved performance of the\nproposed method in detecting both known and unknown face presentation attacks.\n","authors":["Raghavendra Ramachandra","Narayan Vetrekar","Sushma Venkatesh","Savita Nageshker","Jag Mohan Singh","R. S. Gad"],"pdf_url":"https://arxiv.org/pdf/2404.12680v1.pdf","comment":"Accepted in 2024 18th International Conference on Automatic Face and\n Gesture Recognition (FG)"},{"id":"http://arxiv.org/abs/2404.12679v1","updated":"2024-04-19T07:26:30Z","published":"2024-04-19T07:26:30Z","title":"MLSD-GAN -- Generating Strong High Quality Face Morphing Attacks using\n Latent Semantic Disentanglement","summary":" Face-morphing attacks are a growing concern for biometric researchers, as\nthey can be used to fool face recognition systems (FRS). These attacks can be\ngenerated at the image level (supervised) or representation level\n(unsupervised). Previous unsupervised morphing attacks have relied on\ngenerative adversarial networks (GANs). More recently, researchers have used\nlinear interpolation of StyleGAN-encoded images to generate morphing attacks.\nIn this paper, we propose a new method for generating high-quality morphing\nattacks using StyleGAN disentanglement. Our approach, called MLSD-GAN,\nspherically interpolates the disentangled latents to produce realistic and\ndiverse morphing attacks. We evaluate the vulnerability of MLSD-GAN on two\ndeep-learning-based FRS techniques. The results show that MLSD-GAN poses a\nsignificant threat to FRS, as it can generate morphing attacks that are highly\neffective at fooling these systems.\n","authors":["Aravinda Reddy PN","Raghavendra Ramachandra","Krothapalli Sreenivasa Rao","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2404.12679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12678v1","updated":"2024-04-19T07:24:32Z","published":"2024-04-19T07:24:32Z","title":"Exploring Interactive Semantic Alignment for Efficient HOI Detection\n with Vision-language Model","summary":" Human-Object Interaction (HOI) detection aims to localize human-object pairs\nand comprehend their interactions. Recently, two-stage transformer-based\nmethods have demonstrated competitive performance. However, these methods\nfrequently focus on object appearance features and ignore global contextual\ninformation. Besides, vision-language model CLIP which effectively aligns\nvisual and text embeddings has shown great potential in zero-shot HOI\ndetection. Based on the former facts, We introduce a novel HOI detector named\nISA-HOI, which extensively leverages knowledge from CLIP, aligning interactive\nsemantics between visual and textual features. We first extract global context\nof image and local features of object to Improve interaction Features in images\n(IF). On the other hand, we propose a Verb Semantic Improvement (VSI) module to\nenhance textual features of verb labels via cross-modal fusion. 
Ultimately, our\nmethod achieves competitive results on the HICO-DET and V-COCO benchmarks with\nmuch fewer training epochs, and outperforms the state-of-the-art under\nzero-shot settings.\n","authors":["Jihao Dong","Renjie Pan","Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12678v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2404.12667v1","updated":"2024-04-19T07:07:36Z","published":"2024-04-19T07:07:36Z","title":"Detecting Out-Of-Distribution Earth Observation Images with Diffusion\n Models","summary":" Earth Observation imagery can capture rare and unusual events, such as\ndisasters and major landscape changes, whose visual appearance contrasts with\nthe usual observations. Deep models trained on common remote sensing data will\noutput drastically different features for these out-of-distribution samples,\ncompared to those closer to their training dataset. Detecting them could\ntherefore help anticipate changes in the observations, either geographical or\nenvironmental. In this work, we show that the reconstruction error of diffusion\nmodels can effectively serve as unsupervised out-of-distribution detectors for\nremote sensing images, using them as a plausibility score. Moreover, we\nintroduce ODEED, a novel reconstruction-based scorer using the probability-flow\nODE of diffusion models. We validate it experimentally on SpaceNet 8 with\nvarious scenarios, such as classical OOD detection with geographical shift and\nnear-OOD setups: pre/post-flood and non-flooded/flooded image recognition. We\nshow that our ODEED scorer significantly outperforms other diffusion-based and\ndiscriminative baselines on the more challenging near-OOD scenarios of flood\nimage detection, where OOD images are close to the distribution tail. We aim to\npave the way towards better use of generative models for anomaly detection in\nremote sensing.\n","authors":["Georges Le Bellier","Nicolas Audebert"],"pdf_url":"https://arxiv.org/pdf/2404.12667v1.pdf","comment":"EARTHVISION 2024 IEEE/CVF CVPR Workshop. Large Scale Computer Vision\n for Remote Sensing Imagery, Jun 2024, Seattle, United States"},{"id":"http://arxiv.org/abs/2403.08511v2","updated":"2024-04-19T06:48:52Z","published":"2024-03-13T13:16:26Z","title":"A Multimodal Fusion Network For Student Emotion Recognition Based on\n Transformer and Tensor Product","summary":" This paper introduces a new multi-modal model based on the Transformer\narchitecture and tensor product fusion strategy, combining BERT's text vectors\nand ViT's image vectors to classify students' psychological conditions, with an\naccuracy of 93.65%. The purpose of the study is to accurately analyze the\nmental health status of students from various data sources. This paper\ndiscusses modal fusion methods, including early, late and intermediate fusion,\nto overcome the challenges of integrating multi-modal information. Ablation\nstudies compare the performance of different models and fusion techniques,\nshowing that the proposed model outperforms existing methods such as CLIP and\nViLBERT in terms of accuracy and inference speed. 
Conclusions indicate that\nwhile this model has significant advantages in emotion recognition, its\npotential to incorporate other data modalities provides areas for future\nresearch.\n","authors":["Ao Xiang","Zongqing Qi","Han Wang","Qin Yang","Danqing Ma"],"pdf_url":"https://arxiv.org/pdf/2403.08511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09530v2","updated":"2024-04-19T06:44:18Z","published":"2024-04-15T07:50:15Z","title":"RanLayNet: A Dataset for Document Layout Detection used for Domain\n Adaptation and Generalization","summary":" Large ground-truth datasets and recent advances in deep learning techniques\nhave been useful for layout detection. However, because of the restricted\nlayout diversity of these datasets, training on them requires a sizable number\nof annotated instances, which is both expensive and time-consuming. As a\nresult, differences between the source and target domains may significantly\nimpact how well these models function. To solve this problem, domain adaptation\napproaches have been developed that use a small quantity of labeled data to\nadjust the model to the target domain. In this research, we introduced a\nsynthetic document dataset called RanLayNet, enriched with automatically\nassigned labels denoting spatial positions, ranges, and types of layout\nelements. The primary aim of this endeavor is to develop a versatile dataset\ncapable of training models with robustness and adaptability to diverse document\nformats. Through empirical experimentation, we demonstrate that a deep layout\nidentification model trained on our dataset exhibits enhanced performance\ncompared to a model trained solely on actual documents. Moreover, we conduct a\ncomparative analysis by fine-tuning inference models using both PubLayNet and\nIIIT-AR-13K datasets on the Doclaynet dataset. Our findings emphasize that\nmodels enriched with our dataset are optimal for tasks such as achieving 0.398\nand 0.588 mAP95 score in the scientific document domain for the TABLE class.\n","authors":["Avinash Anand","Raj Jaiswal","Mohit Gupta","Siddhesh S Bangar","Pijush Bhuyan","Naman Lal","Rajeev Singh","Ritika Jha","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.09530v2.pdf","comment":"8 pages, 6 figures, MMAsia 2023 Proceedings of the 5th ACM\n International Conference on Multimedia in Asia"},{"id":"http://arxiv.org/abs/2404.12652v1","updated":"2024-04-19T06:41:32Z","published":"2024-04-19T06:41:32Z","title":"Pre-trained Vision-Language Models Learn Discoverable Visual Concepts","summary":" Do vision-language models (VLMs) pre-trained to caption an image of a\n\"durian\" learn visual concepts such as \"brown\" (color) and \"spiky\" (texture) at\nthe same time? We aim to answer this question as visual concepts learned \"for\nfree\" would enable wide applications such as neuro-symbolic reasoning or\nhuman-interpretable object classification. We assume that the visual concepts,\nif captured by pre-trained VLMs, can be extracted by their vision-language\ninterface with text-based concept prompts. We observe that recent works\nprompting VLMs with concepts often differ in their strategies to define and\nevaluate the visual concepts, leading to conflicting conclusions. We propose a\nnew concept definition strategy based on two observations: First, certain\nconcept prompts include shortcuts that recognize correct concepts for wrong\nreasons; Second, multimodal information (e.g. 
visual discriminativeness, and\ntextual knowledge) should be leveraged when selecting the concepts. Our\nproposed concept discovery and learning (CDL) framework is thus designed to\nidentify a diverse list of generic visual concepts (e.g. \"spiky\" as opposed to\n\"spiky durian\"), which are ranked and selected based on visual and language\nmutual information. We carefully design quantitative and human evaluations of\nthe discovered concepts on six diverse visual recognition datasets, which\nconfirm that pre-trained VLMs do learn visual concepts that provide accurate\nand thorough descriptions for the recognized objects. All code and models are\npublicly released.\n","authors":["Yuan Zang","Tian Yun","Hao Tan","Trung Bui","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12650v1","updated":"2024-04-19T06:32:21Z","published":"2024-04-19T06:32:21Z","title":"F2FLDM: Latent Diffusion Models with Histopathology Pre-Trained\n Embeddings for Unpaired Frozen Section to FFPE Translation","summary":" The Frozen Section (FS) technique is a rapid and efficient method, taking\nonly 15-30 minutes to prepare slides for pathologists' evaluation during\nsurgery, enabling immediate decisions on further surgical interventions.\nHowever, FS process often introduces artifacts and distortions like folds and\nice-crystal effects. In contrast, these artifacts and distortions are absent in\nthe higher-quality formalin-fixed paraffin-embedded (FFPE) slides, which\nrequire 2-3 days to prepare. While Generative Adversarial Network (GAN)-based\nmethods have been used to translate FS to FFPE images (F2F), they may leave\nmorphological inaccuracies with remaining FS artifacts or introduce new\nartifacts, reducing the quality of these translations for clinical assessments.\nIn this study, we benchmark recent generative models, focusing on GANs and\nLatent Diffusion Models (LDMs), to overcome these limitations. We introduce a\nnovel approach that combines LDMs with Histopathology Pre-Trained Embeddings to\nenhance restoration of FS images. Our framework leverages LDMs conditioned by\nboth text and pre-trained embeddings to learn meaningful features of FS and\nFFPE histopathology images. Through diffusion and denoising techniques, our\napproach not only preserves essential diagnostic attributes like color staining\nand tissue morphology but also proposes an embedding translation mechanism to\nbetter predict the targeted FFPE representation of input FS images. As a\nresult, this work achieves a significant improvement in classification\nperformance, with the Area Under the Curve rising from 81.99% to 94.64%,\naccompanied by an advantageous CaseFD. This work establishes a new benchmark\nfor FS to FFPE image translation quality, promising enhanced reliability and\naccuracy in histopathology FS image analysis. Our work is available at\nhttps://minhmanho.github.io/f2f_ldm/.\n","authors":["Man M. Ho","Shikha Dubey","Yosep Chong","Beatrice Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2404.12650v1.pdf","comment":"Preprint. 
Our work is available at\n https://minhmanho.github.io/f2f_ldm/"},{"id":"http://arxiv.org/abs/2404.10305v2","updated":"2024-04-19T06:23:20Z","published":"2024-04-16T06:24:53Z","title":"TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table\n Structure & Content","summary":" The automatic recognition of tabular data in document images presents a\nsignificant challenge due to the diverse range of table styles and complex\nstructures. Tables offer valuable content representation, enhancing the\npredictive capabilities of various systems such as search engines and Knowledge\nGraphs. Addressing the two main problems, namely table detection (TD) and table\nstructure recognition (TSR), has traditionally been approached independently.\nIn this research, we propose an end-to-end pipeline that integrates deep\nlearning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve\ncomprehensive image-based table recognition. This integrated approach\neffectively handles diverse table styles, complex structures, and image\ndistortions, resulting in improved accuracy and efficiency compared to existing\nmethods like Table Transformers. Our system achieves simultaneous table\ndetection (TD), table structure recognition (TSR), and table content\nrecognition (TCR), preserving table structures and accurately extracting\ntabular data from document images. The integration of multiple models addresses\nthe intricacies of table recognition, making our approach a promising solution\nfor image-based table understanding, data extraction, and information retrieval\napplications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy\nof 78%, showcasing a remarkable improvement of approximately 25% in the OCR\nAccuracy compared to the previous Table Transformer approach.\n","authors":["Avinash Anand","Raj Jaiswal","Pijush Bhuyan","Mohit Gupta","Siddhesh Bangar","Md. Modassir Imam","Rajiv Ratn Shah","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2404.10305v2.pdf","comment":"8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for\n Information Retrieval"},{"id":"http://arxiv.org/abs/2404.06883v2","updated":"2024-04-19T06:07:22Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07918v3","updated":"2024-04-19T05:55:00Z","published":"2023-09-14T17:59:49Z","title":"Unified Human-Scene Interaction via Prompted Chain-of-Contacts","summary":" Human-Scene Interaction (HSI) is a vital component of fields like embodied AI\nand virtual reality. Despite advancements in motion quality and physical\nplausibility, two pivotal factors, versatile interaction control and the\ndevelopment of a user-friendly interface, require further exploration before\nthe practical application of HSI. This paper presents a unified HSI framework,\nUniHSI, which supports unified control of diverse interactions through language\ncommands. This framework is built upon the definition of interaction as Chain\nof Contacts (CoC): steps of human joint-object part pairs, which is inspired by\nthe strong correlation between interaction types and human-object contact\nregions. Based on the definition, UniHSI constitutes a Large Language Model\n(LLM) Planner to translate language prompts into task plans in the form of CoC,\nand a Unified Controller that turns CoC into uniform task execution. To\nfacilitate training and evaluation, we collect a new dataset named ScenePlan\nthat encompasses thousands of task plans generated by LLMs based on diverse\nscenarios. Comprehensive experiments demonstrate the effectiveness of our\nframework in versatile task execution and generalizability to real scanned\nscenes. The project page is at https://github.com/OpenRobotLab/UniHSI .\n","authors":["Zeqi Xiao","Tai Wang","Jingbo Wang","Jinkun Cao","Wenwei Zhang","Bo Dai","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2309.07918v3.pdf","comment":"A unified Human-Scene Interaction framework that supports versatile\n interactions through language commands.Project URL:\n https://xizaoqu.github.io/unihsi/ . Code:\n https://github.com/OpenRobotLab/UniHSI"},{"id":"http://arxiv.org/abs/2404.12642v1","updated":"2024-04-19T05:48:09Z","published":"2024-04-19T05:48:09Z","title":"Cooperative Sentiment Agents for Multimodal Sentiment Analysis","summary":" In this paper, we propose a new Multimodal Representation Learning (MRL)\nmethod for Multimodal Sentiment Analysis (MSA), which facilitates the adaptive\ninteraction between modalities through Cooperative Sentiment Agents, named\nCo-SA. Co-SA comprises two critical components: the Sentiment Agents\nEstablishment (SAE) phase and the Sentiment Agents Cooperation (SAC) phase.\nDuring the SAE phase, each sentiment agent deals with an unimodal signal and\nhighlights explicit dynamic sentiment variations within the modality via the\nModality-Sentiment Disentanglement (MSD) and Deep Phase Space Reconstruction\n(DPSR) modules. Subsequently, in the SAC phase, Co-SA meticulously designs\ntask-specific interaction mechanisms for sentiment agents so that coordinating\nmultimodal signals to learn the joint representation. Specifically, Co-SA\nequips an independent policy model for each sentiment agent that captures\nsignificant properties within the modality. These policies are optimized\nmutually through the unified reward adaptive to downstream tasks. 
Benefitting\nfrom the rewarding mechanism, Co-SA transcends the limitation of pre-defined\nfusion modes and adaptively captures unimodal properties for MRL in the\nmultimodal interaction setting. To demonstrate the effectiveness of Co-SA, we\napply it to address Multimodal Sentiment Analysis (MSA) and Multimodal Emotion\nRecognition (MER) tasks. Our comprehensive experimental results demonstrate\nthat Co-SA excels at discovering diverse cross-modal features, encompassing\nboth common and complementary aspects. The code can be available at\nhttps://github.com/smwanghhh/Co-SA.\n","authors":["Shanmin Wang","Hui Shuai","Qingshan Liu","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03781v3","updated":"2024-04-19T05:45:25Z","published":"2023-12-06T09:39:38Z","title":"Lite-Mind: Towards Efficient and Robust Brain Representation Network","summary":" The limited data availability and the low signal-to-noise ratio of fMRI\nsignals lead to the challenging task of fMRI-to-image retrieval.\nState-of-the-art MindEye remarkably improves fMRI-to-image retrieval\nperformance by leveraging a large model, i.e., a 996M MLP Backbone per subject,\nto align fMRI embeddings to the final hidden layer of CLIP's Vision Transformer\n(ViT). However, significant individual variations exist among subjects, even\nunder identical experimental setups, mandating the training of large\nsubject-specific models. The substantial parameters pose significant challenges\nin deploying fMRI decoding on practical devices. To this end, we propose\nLite-Mind, a lightweight, efficient, and robust brain representation learning\nparadigm based on Discrete Fourier Transform (DFT), which efficiently aligns\nfMRI voxels to fine-grained information of CLIP. We elaborately design a DFT\nbackbone with Spectrum Compression and Frequency Projector modules to learn\ninformative and robust voxel embeddings. Our experiments demonstrate that\nLite-Mind achieves an impressive 94.6% fMRI-to-image retrieval accuracy on the\nNSD dataset for Subject 1, with 98.7% fewer parameters than MindEye. Lite-Mind\nis also proven to be able to be migrated to smaller fMRI datasets and\nestablishes a new state-of-the-art for zero-shot classification on the GOD\ndataset.\n","authors":["Zixuan Gong","Qi Zhang","Guangyin Bao","Lei Zhu","Yu Zhang","Ke Liu","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2312.03781v3.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.12635v1","updated":"2024-04-19T05:32:37Z","published":"2024-04-19T05:32:37Z","title":"AED-PADA:Improving Generalizability of Adversarial Example Detection via\n Principal Adversarial Domain Adaptation","summary":" Adversarial example detection, which can be conveniently applied in many\nscenarios, is important in the area of adversarial defense. Unfortunately,\nexisting detection methods suffer from poor generalization performance, because\ntheir training process usually relies on the examples generated from a single\nknown adversarial attack and there exists a large discrepancy between the\ntraining and unseen testing adversarial examples. To address this issue, we\npropose a novel method, named Adversarial Example Detection via Principal\nAdversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies\nthe Principal Adversarial Domains (PADs), i.e., a combination of features of\nthe adversarial examples from different attacks, which possesses large coverage\nof the entire adversarial feature space. 
Then, we pioneer to exploit\nmulti-source domain adaptation in adversarial example detection with PADs as\nsource domains. Experiments demonstrate the superior generalization ability of\nour proposed AED-PADA. Note that this superiority is particularly achieved in\nchallenging scenarios characterized by employing the minimal magnitude\nconstraint for the perturbations.\n","authors":["Heqi Peng","Yunhong Wang","Ruijie Yang","Beichen Li","Rui Wang","Yuanfang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.12635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12634v1","updated":"2024-04-19T05:31:37Z","published":"2024-04-19T05:31:37Z","title":"Transformer-Based Classification Outcome Prediction for Multimodal\n Stroke Treatment","summary":" This study proposes a multi-modal fusion framework Multitrans based on the\nTransformer architecture and self-attention mechanism. This architecture\ncombines the study of non-contrast computed tomography (NCCT) images and\ndischarge diagnosis reports of patients undergoing stroke treatment, using a\nvariety of methods based on Transformer architecture approach to predicting\nfunctional outcomes of stroke treatment. The results show that the performance\nof single-modal text classification is significantly better than single-modal\nimage classification, but the effect of multi-modal combination is better than\nany single modality. Although the Transformer model only performs worse on\nimaging data, when combined with clinical meta-diagnostic information, both can\nlearn better complementary information and make good contributions to\naccurately predicting stroke treatment effects..\n","authors":["Danqing Ma","Meng Wang","Ao Xiang","Zongqing Qi","Qin Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09401v2","updated":"2024-04-19T05:26:28Z","published":"2024-04-15T01:27:07Z","title":"Watermark-embedded Adversarial Examples for Copyright Protection against\n Diffusion Models","summary":" Diffusion Models (DMs) have shown remarkable capabilities in various\nimage-generation tasks. However, there are growing concerns that DMs could be\nused to imitate unauthorized creations and thus raise copyright issues. To\naddress this issue, we propose a novel framework that embeds personal\nwatermarks in the generation of adversarial examples. Such examples can force\nDMs to generate images with visible watermarks and prevent DMs from imitating\nunauthorized images. We construct a generator based on conditional adversarial\nnetworks and design three losses (adversarial loss, GAN loss, and perturbation\nloss) to generate adversarial examples that have subtle perturbation but can\neffectively attack DMs to prevent copyright violations. Training a generator\nfor a personal watermark by our method only requires 5-10 samples within 2-3\nminutes, and once the generator is trained, it can generate adversarial\nexamples with that watermark significantly fast (0.2s per image). We conduct\nextensive experiments in various conditional image-generation scenarios.\nCompared to existing methods that generate images with chaotic textures, our\nmethod adds visible watermarks on the generated images, which is a more\nstraightforward way to indicate copyright violations. We also observe that our\nadversarial examples exhibit good transferability across unknown generative\nmodels. 
Therefore, this work provides a simple yet powerful way to protect\ncopyright from DM-based imitation.\n","authors":["Peifei Zhu","Tsubasa Takahashi","Hirokatsu Kataoka"],"pdf_url":"https://arxiv.org/pdf/2404.09401v2.pdf","comment":"updated references"},{"id":"http://arxiv.org/abs/2404.12630v1","updated":"2024-04-19T05:12:04Z","published":"2024-04-19T05:12:04Z","title":"MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and\n Semantic Correction","summary":" Decoding natural visual scenes from brain activity has flourished, with\nextensive research in single-subject tasks and, however, less in cross-subject\ntasks. Reconstructing high-quality images in cross-subject tasks is a\nchallenging problem due to profound individual differences between subjects and\nthe scarcity of data annotation. In this work, we proposed MindTuner for\ncross-subject visual decoding, which achieves high-quality and rich-semantic\nreconstructions using only 1 hour of fMRI training data benefiting from the\nphenomena of visual fingerprint in the human visual system and a novel\nfMRI-to-text alignment paradigm. Firstly, we pre-train a multi-subject model\namong 7 subjects and fine-tune it with scarce data on new subjects, where LoRAs\nwith Skip-LoRAs are utilized to learn the visual fingerprint. Then, we take the\nimage modality as the intermediate pivot modality to achieve fMRI-to-text\nalignment, which achieves impressive fMRI-to-text retrieval performance and\ncorrects fMRI-to-image reconstruction with fine-tuned semantics. The results of\nboth qualitative and quantitative analyses demonstrate that MindTuner surpasses\nstate-of-the-art cross-subject visual decoding models on the Natural Scenes\nDataset (NSD), whether using training data of 1 hour or 40 hours.\n","authors":["Zixuan Gong","Qi Zhang","Guangyin Bao","Lei Zhu","Ke Liu","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.12630v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.11911v2","updated":"2024-04-19T05:08:13Z","published":"2023-12-19T07:39:45Z","title":"EVI-SAM: Robust, Real-time, Tightly-coupled Event-Visual-Inertial State\n Estimation and 3D Dense Mapping","summary":" Event cameras are bio-inspired, motion-activated sensors that demonstrate\nsubstantial potential in handling challenging situations, such as motion blur\nand high-dynamic range. In this paper, we proposed EVI-SAM to tackle the\nproblem of 6 DoF pose tracking and 3D reconstruction using monocular event\ncamera. A novel event-based hybrid tracking framework is designed to estimate\nthe pose, leveraging the robustness of feature matching and the precision of\ndirect alignment. Specifically, we develop an event-based 2D-2D alignment to\nconstruct the photometric constraint, and tightly integrate it with the\nevent-based reprojection constraint. The mapping module recovers the dense and\ncolorful depth of the scene through the image-guided event-based mapping\nmethod. Subsequently, the appearance, texture, and surface mesh of the 3D scene\ncan be reconstructed by fusing the dense depth map from multiple viewpoints\nusing truncated signed distance function (TSDF) fusion. To the best of our\nknowledge, this is the first non-learning work to realize event-based dense\nmapping. Numerical evaluations are performed on both publicly available and\nself-collected datasets, which qualitatively and quantitatively demonstrate the\nsuperior performance of our method. 
Our EVI-SAM effectively balances accuracy\nand robustness while maintaining computational efficiency, showcasing superior\npose tracking and dense mapping performance in challenging scenarios. Video\nDemo: https://youtu.be/Nn40U4e5Si8.\n","authors":["Weipeng Guan","Peiyu Chen","Huibin Zhao","Yu Wang","Peng Lu"],"pdf_url":"https://arxiv.org/pdf/2312.11911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12625v1","updated":"2024-04-19T04:51:18Z","published":"2024-04-19T04:51:18Z","title":"SkelFormer: Markerless 3D Pose and Shape Estimation using Skeletal\n Transformers","summary":" We introduce SkelFormer, a novel markerless motion capture pipeline for\nmulti-view human pose and shape estimation. Our method first uses off-the-shelf\n2D keypoint estimators, pre-trained on large-scale in-the-wild data, to obtain\n3D joint positions. Next, we design a regression-based inverse-kinematic\nskeletal transformer that maps the joint positions to pose and shape\nrepresentations from heavily noisy observations. This module integrates prior\nknowledge about pose space and infers the full pose state at runtime.\nSeparating the 3D keypoint detection and inverse-kinematic problems, along with\nthe expressive representations learned by our skeletal transformer, enhance the\ngeneralization of our method to unseen noisy data. We evaluate our method on\nthree public datasets in both in-distribution and out-of-distribution settings\nusing three datasets, and observe strong performance with respect to prior\nworks. Moreover, ablation experiments demonstrate the impact of each of the\nmodules of our architecture. Finally, we study the performance of our method in\ndealing with noise and heavy occlusions and find considerable robustness with\nrespect to other solutions.\n","authors":["Vandad Davoodnia","Saeed Ghorbani","Alexandre Messier","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2404.12625v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.12624v1","updated":"2024-04-19T04:49:28Z","published":"2024-04-19T04:49:28Z","title":"Dragtraffic: A Non-Expert Interactive and Point-Based Controllable\n Traffic Scene Generation Framework","summary":" The evaluation and training of autonomous driving systems require diverse and\nscalable corner cases. However, most existing scene generation methods lack\ncontrollability, accuracy, and versatility, resulting in unsatisfactory\ngeneration results. To address this problem, we propose Dragtraffic, a\ngeneralized, point-based, and controllable traffic scene generation framework\nbased on conditional diffusion. Dragtraffic enables non-experts to generate a\nvariety of realistic driving scenarios for different types of traffic agents\nthrough an adaptive mixture expert architecture. We use a regression model to\nprovide a general initial solution and a refinement process based on the\nconditional diffusion model to ensure diversity. User-customized context is\nintroduced through cross-attention to ensure high controllability. 
Experiments\non a real-world driving dataset show that Dragtraffic outperforms existing\nmethods in terms of authenticity, diversity, and freedom.\n","authors":["Sheng Wang","Ge Sun","Fulong Ma","Tianshuai Hu","Yongkang Song","Lei Zhu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02352v2","updated":"2024-04-19T04:22:04Z","published":"2024-02-04T05:33:04Z","title":"Region-Based Representations Revisited","summary":" We investigate whether region-based representations are effective for\nrecognition. Regions were once a mainstay in recognition approaches, but pixel\nand patch-based features are now used almost exclusively. We show that recent\nclass-agnostic segmenters like SAM can be effectively combined with strong\nunsupervised representations like DINOv2 and used for a wide variety of tasks,\nincluding semantic segmentation, object-based image retrieval, and multi-image\nanalysis. Once the masks and features are extracted, these representations,\neven with linear decoders, enable competitive performance, making them well\nsuited to applications that require custom queries. The compactness of the\nrepresentation also makes it well-suited to video analysis and other problems\nrequiring inference across many images.\n","authors":["Michal Shlapentokh-Rothman","Ansel Blume","Yao Xiao","Yuqun Wu","Sethuraman T V","Heyi Tao","Jae Yong Lee","Wilfredo Torres","Yu-Xiong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2402.02352v2.pdf","comment":"CVPR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2404.12612v1","updated":"2024-04-19T03:51:46Z","published":"2024-04-19T03:51:46Z","title":"SA-Attack: Speed-adaptive stealthy adversarial attack on trajectory\n prediction","summary":" Trajectory prediction is critical for the safe planning and navigation of\nautomated vehicles. The trajectory prediction models based on the neural\nnetworks are vulnerable to adversarial attacks. Previous attack methods have\nachieved high attack success rates but overlook the adaptability to realistic\nscenarios and the concealment of the deceits. To address this problem, we\npropose a speed-adaptive stealthy adversarial attack method named SA-Attack.\nThis method searches the sensitive region of trajectory prediction models and\ngenerates the adversarial trajectories by using the vehicle-following method\nand incorporating information about forthcoming trajectories. Our method has\nthe ability to adapt to different speed scenarios by reconstructing the\ntrajectory from scratch. Fusing future trajectory trends and curvature\nconstraints can guarantee the smoothness of adversarial trajectories, further\nensuring the stealthiness of attacks. The empirical study on the datasets of\nnuScenes and Apolloscape demonstrates the attack performance of our proposed\nmethod. Finally, we also demonstrate the adaptability and stealthiness of\nSA-Attack for different speed scenarios. 
Our code is available at the\nrepository: https://github.com/eclipse-bot/SA-Attack.\n","authors":["Huilin Yin","Jiaxiang Li","Pengju Zhen","Jun Yan"],"pdf_url":"https://arxiv.org/pdf/2404.12612v1.pdf","comment":"This work is published in IEEE IV Symposium"},{"id":"http://arxiv.org/abs/2310.03624v2","updated":"2024-04-19T03:48:13Z","published":"2023-10-05T16:01:29Z","title":"High-Degrees-of-Freedom Dynamic Neural Fields for Robot Self-Modeling\n and Motion Planning","summary":" A robot self-model is a task-agnostic representation of the robot's physical\nmorphology that can be used for motion planning tasks in the absence of a\nclassical geometric kinematic model. In particular, when the latter is hard to\nengineer or the robot's kinematics change unexpectedly, human-free\nself-modeling is a necessary feature of truly autonomous agents. In this work,\nwe leverage neural fields to allow a robot to self-model its kinematics as a\nneural-implicit query model learned only from 2D images annotated with camera\nposes and configurations. This enables significantly greater applicability than\nexisting approaches which have been dependent on depth images or geometry\nknowledge. To this end, alongside a curricular data sampling strategy, we\npropose a new encoder-based neural density field architecture for dynamic\nobject-centric scenes conditioned on high numbers of degrees of freedom (DOFs).\nIn a 7-DOF robot test setup, the learned self-model achieves a Chamfer-L2\ndistance of 2% of the robot's workspace dimension. We demonstrate the\ncapabilities of this model on motion planning tasks as an exemplary downstream\napplication.\n","authors":["Lennart Schulze","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2310.03624v2.pdf","comment":"International Conference on Robotics and Automation (ICRA) 2024; ICCV\n 2023 Workshop on Neural Fields for Autonomous Driving and Robotics (oral)"},{"id":"http://arxiv.org/abs/2404.12611v1","updated":"2024-04-19T03:45:12Z","published":"2024-04-19T03:45:12Z","title":"Rethinking Clothes Changing Person ReID: Conflicts, Synthesis, and\n Optimization","summary":" Clothes-changing person re-identification (CC-ReID) aims to retrieve images\nof the same person wearing different outfits. Mainstream researches focus on\ndesigning advanced model structures and strategies to capture identity\ninformation independent of clothing. However, the same-clothes discrimination\nas the standard ReID learning objective in CC-ReID is persistently ignored in\nprevious researches. In this study, we dive into the relationship between\nstandard and clothes-changing~(CC) learning objectives, and bring the inner\nconflicts between these two objectives to the fore. We try to magnify the\nproportion of CC training pairs by supplementing high-fidelity clothes-varying\nsynthesis, produced by our proposed Clothes-Changing Diffusion model. By\nincorporating the synthetic images into CC-ReID model training, we observe a\nsignificant improvement under CC protocol. However, such improvement sacrifices\nthe performance under the standard protocol, caused by the inner conflict\nbetween standard and CC. For conflict mitigation, we decouple these objectives\nand re-formulate CC-ReID learning as a multi-objective optimization (MOO)\nproblem. By effectively regularizing the gradient curvature across multiple\nobjectives and introducing preference restrictions, our MOO solution surpasses\nthe single-task training paradigm. 
Our framework is model-agnostic, and\ndemonstrates superior performance under both CC and standard ReID protocols.\n","authors":["Junjie Li","Guanshuo Wang","Fufu Yu","Yichao Yan","Qiong Jia","Shouhong Ding","Xingdong Sheng","Yunhui Liu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00567v2","updated":"2024-04-19T03:17:16Z","published":"2024-03-01T14:44:41Z","title":"Flatten Long-Range Loss Landscapes for Cross-Domain Few-Shot Learning","summary":" Cross-domain few-shot learning (CDFSL) aims to acquire knowledge from limited\ntraining data in the target domain by leveraging prior knowledge transferred\nfrom source domains with abundant training samples. CDFSL faces challenges in\ntransferring knowledge across dissimilar domains and fine-tuning models with\nlimited training data. To address these challenges, we initially extend the\nanalysis of loss landscapes from the parameter space to the representation\nspace, which allows us to simultaneously interpret the transferring and\nfine-tuning difficulties of CDFSL models. We observe that sharp minima in the\nloss landscapes of the representation space result in representations that are\nhard to transfer and fine-tune. Moreover, existing flatness-based methods have\nlimited generalization ability due to their short-range flatness. To enhance\nthe transferability and facilitate fine-tuning, we introduce a simple yet\neffective approach to achieve long-range flattening of the minima in the loss\nlandscape. This approach considers representations that are differently\nnormalized as minima in the loss landscape and flattens the high-loss region in\nthe middle by randomly sampling interpolated representations. We implement this\nmethod as a new normalization layer that replaces the original one in both CNNs\nand ViTs. This layer is simple and lightweight, introducing only a minimal\nnumber of additional parameters. Experimental results on 8 datasets demonstrate\nthat our approach outperforms state-of-the-art methods in terms of average\naccuracy. Moreover, our method achieves performance improvements of up to 9\\%\ncompared to the current best approaches on individual datasets. Our code will\nbe released.\n","authors":["Yixiong Zou","Yicong Liu","Yiman Hu","Yuhua Li","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2403.00567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12606v1","updated":"2024-04-19T03:16:08Z","published":"2024-04-19T03:16:08Z","title":"ELEV-VISION-SAM: Integrated Vision Language and Foundation Model for\n Automated Estimation of Building Lowest Floor Elevation","summary":" Street view imagery, aided by advancements in image quality and\naccessibility, has emerged as a valuable resource for urban analytics research.\nRecent studies have explored its potential for estimating lowest floor\nelevation (LFE), offering a scalable alternative to traditional on-site\nmeasurements, crucial for assessing properties' flood risk and damage extent.\nWhile existing methods rely on object detection, the introduction of image\nsegmentation has broadened street view images' utility for LFE estimation,\nalthough challenges still remain in segmentation quality and capability to\ndistinguish front doors from other doors. To address these challenges in LFE\nestimation, this study integrates the Segment Anything model, a segmentation\nfoundation model, with vision language models to conduct text-prompt image\nsegmentation on street view images for LFE estimation. 
By evaluating various\nvision language models, integration methods, and text prompts, we identify the\nmost suitable model for street view image analytics and LFE estimation tasks,\nthereby improving the availability of the current LFE estimation model based on\nimage segmentation from 33% to 56% of properties. Remarkably, our proposed\nmethod significantly enhances the availability of LFE estimation to almost all\nproperties in which the front door is visible in the street view image. Also\nthe findings present the first baseline and comparison of various vision models\nof street view image-based LFE estimation. The model and findings not only\ncontribute to advancing street view image segmentation for urban analytics but\nalso provide a novel approach for image segmentation tasks for other civil\nengineering and infrastructure analytics tasks.\n","authors":["Yu-Hsuan Ho","Longxiang Li","Ali Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2404.12606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12602v1","updated":"2024-04-19T03:12:17Z","published":"2024-04-19T03:12:17Z","title":"A visualization method for data domain changes in CNN networks and the\n optimization method for selecting thresholds in classification tasks","summary":" In recent years, Face Anti-Spoofing (FAS) has played a crucial role in\npreserving the security of face recognition technology. With the rise of\ncounterfeit face generation techniques, the challenge posed by digitally edited\nfaces to face anti-spoofing is escalating. Existing FAS technologies primarily\nfocus on intercepting physically forged faces and lack a robust solution for\ncross-domain FAS challenges. Moreover, determining an appropriate threshold to\nachieve optimal deployment results remains an issue for intra-domain FAS. To\naddress these issues, we propose a visualization method that intuitively\nreflects the training outcomes of models by visualizing the prediction results\non datasets. Additionally, we demonstrate that employing data augmentation\ntechniques, such as downsampling and Gaussian blur, can effectively enhance\nperformance on cross-domain tasks. Building upon our data visualization\napproach, we also introduce a methodology for setting threshold values based on\nthe distribution of the training dataset. Ultimately, our methods secured us\nsecond place in both the Unified Physical-Digital Face Attack Detection\ncompetition and the Snapshot Spectral Imaging Face Anti-spoofing contest. The\ntraining code is available at https://github.com/SeaRecluse/CVPRW2024.\n","authors":["Minzhe Huang","Changwei Nie","Weihong Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.12602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12599v1","updated":"2024-04-19T03:06:50Z","published":"2024-04-19T03:06:50Z","title":"QUTE: Quantifying Uncertainty in TinyML models with Early-exit-assisted\n ensembles","summary":" Existing methods for uncertainty quantification incur massive memory and\ncompute overhead, often requiring multiple models/inferences. Hence they are\nimpractical on ultra-low-power KB-sized TinyML devices. To reduce overhead,\nprior works have proposed the use of early-exit networks as ensembles to\nquantify uncertainty in a single forward-pass. However, they still have a\nprohibitive cost for tinyML. To address these challenges, we propose QUTE, a\nnovel resource-efficient early-exit-assisted ensemble architecture optimized\nfor tinyML models. 
QUTE adds additional output blocks at the final exit of the\nbase network and distills the knowledge of early-exits into these blocks to\ncreate a diverse and lightweight ensemble architecture. Our results show that\nQUTE outperforms popular prior works, and improves the quality of uncertainty\nestimates by 6% with 3.1x lower model size on average compared to the most\nrelevant prior work. Furthermore, we demonstrate that QUTE is also effective in\ndetecting co-variate shifted and out-of-distribution inputs, and shows\ncompetitive performance relative to G-ODIN, a state-of-the-art generalized OOD\ndetector.\n","authors":["Nikhil P Ghanathe","Steve Wilton"],"pdf_url":"https://arxiv.org/pdf/2404.12599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v3","updated":"2024-04-19T03:00:21Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11098v3","updated":"2024-04-19T02:55:54Z","published":"2024-04-17T06:32:42Z","title":"LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing\n Diffusion Models","summary":" In the era of AIGC, the demand for low-budget or even on-device applications\nof diffusion models emerged. In terms of compressing the Stable Diffusion\nmodels (SDMs), several approaches have been proposed, and most of them\nleveraged the handcrafted layer removal methods to obtain smaller U-Nets, along\nwith knowledge distillation to recover the network performance. However, such a\nhandcrafting manner of layer removal is inefficient and lacks scalability and\ngeneralization, and the feature distillation employed in the retraining phase\nfaces an imbalance issue that a few numerically significant feature loss terms\ndominate over others throughout the retraining process. To this end, we\nproposed the layer pruning and normalized distillation for compressing\ndiffusion models (LAPTOP-Diff). 
We, 1) introduced the layer pruning method to\ncompress SDM's U-Net automatically and proposed an effective one-shot pruning\ncriterion whose one-shot performance is guaranteed by its good additivity\nproperty, surpassing other layer pruning and handcrafted layer removal methods,\n2) proposed the normalized feature distillation for retraining, alleviated the\nimbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of\nSDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0%\ndecline in PickScore at a pruning ratio of 50% while the comparative methods'\nminimal PickScore decline is 8.2%. We will release our code.\n","authors":["Dingkun Zhang","Sijia Li","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.11098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12235v2","updated":"2024-04-19T02:42:24Z","published":"2024-04-18T14:51:42Z","title":"Beyond Average: Individualized Visual Scanpath Prediction","summary":" Understanding how attention varies across individuals has significant\nscientific and societal impacts. However, existing visual scanpath models treat\nattention uniformly, neglecting individual differences. To bridge this gap,\nthis paper focuses on individualized scanpath prediction (ISP), a new attention\nmodeling task that aims to accurately predict how different individuals shift\ntheir attention in diverse visual tasks. It proposes an ISP method featuring\nthree novel technical components: (1) an observer encoder to characterize and\nintegrate an observer's unique attention traits, (2) an observer-centric\nfeature integration approach that holistically combines visual features, task\nguidance, and observer-specific characteristics, and (3) an adaptive fixation\nprioritization mechanism that refines scanpath predictions by dynamically\nprioritizing semantic feature maps based on individual observers' attention\ntraits. These novel components allow scanpath models to effectively address the\nattention variations across different observers. Our method is generally\napplicable to different datasets, model architectures, and visual tasks,\noffering a comprehensive tool for transforming general scanpath models into\nindividualized ones. Comprehensive evaluations using value-based and\nranking-based metrics verify the method's effectiveness and generalizability.\n","authors":["Xianyu Chen","Ming Jiang","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.12235v2.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.12588v1","updated":"2024-04-19T02:33:23Z","published":"2024-04-19T02:33:23Z","title":"Cross-Modal Adapter: Parameter-Efficient Transfer Learning Approach for\n Vision-Language Models","summary":" Adapter-based parameter-efficient transfer learning has achieved exciting\nresults in vision-language models. Traditional adapter methods often require\ntraining or fine-tuning, facing challenges such as insufficient samples or\nresource limitations. While some methods overcome the need for training by\nleveraging image modality cache and retrieval, they overlook the text\nmodality's importance and cross-modal cues for the efficient adaptation of\nparameters in visual-language models. This work introduces a cross-modal\nparameter-efficient approach named XMAdapter. XMAdapter establishes cache\nmodels for both text and image modalities. It then leverages retrieval through\nvisual-language bimodal information to gather clues for inference. 
By\ndynamically adjusting the affinity ratio, it achieves cross-modal fusion,\ndecoupling different modal similarities to assess their respective\ncontributions. Additionally, it explores hard samples based on differences in\ncross-modal affinity and enhances model performance through adaptive adjustment\nof sample learning intensity. Extensive experimental results on benchmark\ndatasets demonstrate that XMAdapter outperforms previous adapter-based methods\nsignificantly regarding accuracy, generalization, and efficiency.\n","authors":["Juncheng Yang","Zuchao Li","Shuai Xie","Weiping Zhu","Wei Yu","Shijun Li"],"pdf_url":"https://arxiv.org/pdf/2404.12588v1.pdf","comment":"This paper is accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.09778v2","updated":"2024-04-19T02:19:19Z","published":"2024-04-15T13:30:34Z","title":"The Devil is in the Few Shots: Iterative Visual Knowledge Completion for\n Few-shot Learning","summary":" Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot\nlearning performance. Few-shot learning aims to further enhance the transfer\ncapability of CLIP by giving few images in each class, aka 'few shots'. Most\nexisting methods either implicitly learn from the few shots by incorporating\nlearnable prompts or adapters, or explicitly embed them in a cache model for\ninference. However, the narrow distribution of few shots often contains\nincomplete class information, leading to biased visual knowledge with high risk\nof misclassification. To tackle this problem, recent methods propose to\nsupplement visual knowledge by generative models or extra databases, which can\nbe costly and time-consuming. In this paper, we propose an Iterative Visual\nKnowledge CompLetion (KCL) method to complement visual knowledge by properly\ntaking advantages of unlabeled samples without access to any auxiliary or\nsynthetic data. Specifically, KCL first measures the similarities between\nunlabeled samples and each category. Then, the samples with top confidence to\neach category is selected and collected by a designed confidence criterion.\nFinally, the collected samples are treated as labeled ones and added to few\nshots to jointly re-estimate the remaining unlabeled ones. The above procedures\nwill be repeated for a certain number of iterations with more and more samples\nbeing collected until convergence, ensuring a progressive and robust knowledge\ncompletion process. Extensive experiments on 11 benchmark datasets demonstrate\nthe effectiveness and efficiency of KCL as a plug-and-play module under both\nfew-shot and zero-shot learning settings. Code is available at\nhttps://github.com/Mark-Sky/KCL.\n","authors":["Yaohui Li","Qifeng Zhou","Haoxing Chen","Jianbing Zhang","Xinyu Dai","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16108v4","updated":"2024-04-19T02:05:02Z","published":"2023-09-28T02:20:59Z","title":"Channel Vision Transformers: An Image Is Worth 1 x 16 x 16 Words","summary":" Vision Transformer (ViT) has emerged as a powerful architecture in the realm\nof modern computer vision. However, its application in certain imaging fields,\nsuch as microscopy and satellite imaging, presents unique challenges. In these\ndomains, images often contain multiple channels, each carrying semantically\ndistinct and independent information. Furthermore, the model must demonstrate\nrobustness to sparsity in input channels, as they may not be densely available\nduring training or testing. 
In this paper, we propose a modification to the ViT\narchitecture that enhances reasoning across the input channels and introduce\nHierarchical Channel Sampling (HCS) as an additional regularization technique\nto ensure robustness when only partial channels are presented during test time.\nOur proposed model, ChannelViT, constructs patch tokens independently from each\ninput channel and utilizes a learnable channel embedding that is added to the\npatch tokens, similar to positional embeddings. We evaluate the performance of\nChannelViT on ImageNet, JUMP-CP (microscopy cell imaging), and So2Sat\n(satellite imaging). Our results show that ChannelViT outperforms ViT on\nclassification tasks and generalizes well, even when a subset of input channels\nis used during testing. Across our experiments, HCS proves to be a powerful\nregularizer, independent of the architecture employed, suggesting itself as a\nstraightforward technique for robust ViT training. Lastly, we find that\nChannelViT generalizes effectively even when there is limited access to all\nchannels during training, highlighting its potential for multi-channel imaging\nunder real-world conditions with sparse sensors. Our code is available at\nhttps://github.com/insitro/ChannelViT.\n","authors":["Yujia Bao","Srinivasan Sivanandan","Theofanis Karaletsos"],"pdf_url":"https://arxiv.org/pdf/2309.16108v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11843v2","updated":"2024-04-19T01:45:02Z","published":"2024-04-18T01:46:31Z","title":"Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using\n hybrid CNN-Transformer Architecture","summary":" Medical imaging has been used for diagnosis of various conditions, making it\none of the most powerful resources for effective patient care. Due to\nwidespread availability, low cost, and low radiation, chest X-ray is one of the\nmost sought after radiology examination for the diagnosis of various thoracic\ndiseases. Due to advancements in medical imaging technologies and increasing\npatient load, current radiology workflow faces various challenges including\nincreasing backlogs, working long hours, and increase in diagnostic errors. An\nautomated computer-aided diagnosis system that can interpret chest X-rays to\naugment radiologists by providing actionable insights has potential to provide\nsecond opinion to radiologists, highlight relevant regions in the image, in\nturn expediting clinical workflow, reducing diagnostic errors, and improving\npatient care. In this study, we applied a novel architecture augmenting the\nDenseNet121 Convolutional Neural Network (CNN) with multi-head self-attention\nmechanism using transformer, namely SA-DenseNet121, that can identify multiple\nthoracic diseases in chest X-rays. We conducted experiments on four of the\nlargest chest X-ray datasets, namely, ChestX-ray14, CheXpert, MIMIC-CXR-JPG,\nand IU-CXR. Experimental results in terms of area under the receiver operating\ncharacteristics (AUC-ROC) shows that augmenting CNN with self-attention has\npotential in diagnosing different thoracic diseases from chest X-rays. The\nproposed methodology has the potential to support the reading workflow, improve\nefficiency, and reduce diagnostic errors.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2404.11843v2.pdf","comment":"24 pages, 13 Figures, 13 Tables. This article heavily draws from\n arXiv:1904.09925 where authors originally proposed attention-augmented\n convolutional network. 
arXiv admin note: text overlap with arXiv:1904.09925\n by other authors"},{"id":"http://arxiv.org/abs/2404.09515v2","updated":"2024-04-19T01:43:56Z","published":"2024-04-15T07:20:09Z","title":"Revealing the structure-property relationships of copper alloys with\n FAGC","summary":" Understanding how the structure of materials affects their properties is a\ncornerstone of materials science and engineering. However, traditional methods\nhave struggled to accurately describe the quantitative structure-property\nrelationships for complex structures. In our study, we bridge this gap by\nleveraging machine learning to analyze images of materials' microstructures,\nthus offering a novel way to understand and predict the properties of materials\nbased on their microstructures. We introduce a method known as FAGC (Feature\nAugmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr\nalloys. This approach utilizes machine learning to examine the shapes within\nimages of the alloys' microstructures and predict their mechanical and\nelectronic properties. This generative FAGC approach can effectively expand the\nrelatively small training datasets due to the limited availability of materials\nimages labeled with quantitative properties. The process begins with extracting\nfeatures from the images using neural networks. These features are then mapped\nonto the Pre-shape space to construct the Geodesic curves. Along these curves,\nnew features are generated, effectively increasing the dataset. Moreover, we\ndesign a pseudo-labeling mechanism for these newly generated features to\nfurther enhance the training dataset. Our FAGC method has shown remarkable\nresults, significantly improving the accuracy of predicting the electronic\nconductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978\nand 0.998, respectively. These outcomes underscore the potential of FAGC to\naddress the challenge of limited image data in materials science, providing a\npowerful tool for establishing detailed and quantitative relationships between\ncomplex microstructures and material properties.\n","authors":["Yuexing Han","Guanxin Wan","Tao Han","Bing Wang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10718v2","updated":"2024-04-19T01:19:25Z","published":"2024-04-16T16:51:27Z","title":"GazeHTA: End-to-end Gaze Target Detection with Head-Target Association","summary":" We propose an end-to-end approach for gaze target detection: predicting a\nhead-target connection between individuals and the target image regions they\nare looking at. Most of the existing methods use independent components such as\noff-the-shelf head detectors or have problems in establishing associations\nbetween heads and gaze targets. In contrast, we investigate an end-to-end\nmulti-person Gaze target detection framework with Heads and Targets Association\n(GazeHTA), which predicts multiple head-target instances based solely on input\nscene image. GazeHTA addresses challenges in gaze target detection by (1)\nleveraging a pre-trained diffusion model to extract scene features for rich\nsemantic understanding, (2) re-injecting a head feature to enhance the head\npriors for improved head understanding, and (3) learning a connection map as\nthe explicit visual associations between heads and gaze targets. 
Our extensive\nexperimental results demonstrate that GazeHTA outperforms state-of-the-art gaze\ntarget detection methods and two adapted diffusion-based baselines on two\nstandard datasets.\n","authors":["Zhi-Yi Lin","Jouh Yeong Chew","Jan van Gemert","Xucong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11593v2","updated":"2024-04-19T00:35:35Z","published":"2022-03-22T10:21:11Z","title":"Unified Negative Pair Generation toward Well-discriminative Feature\n Space for Face Recognition","summary":" The goal of face recognition (FR) can be viewed as a pair similarity\noptimization problem, maximizing a similarity set $\\mathcal{S}^p$ over positive\npairs, while minimizing similarity set $\\mathcal{S}^n$ over negative pairs.\nIdeally, it is expected that FR models form a well-discriminative feature space\n(WDFS) that satisfies $\\inf{\\mathcal{S}^p} > \\sup{\\mathcal{S}^n}$. With regard\nto WDFS, the existing deep feature learning paradigms (i.e., metric and\nclassification losses) can be expressed as a unified perspective on different\npair generation (PG) strategies. Unfortunately, in the metric loss (ML), it is\ninfeasible to generate negative pairs taking all classes into account in each\niteration because of the limited mini-batch size. In contrast, in\nclassification loss (CL), it is difficult to generate extremely hard negative\npairs owing to the convergence of the class weight vectors to their center.\nThis leads to a mismatch between the two similarity distributions of the\nsampled pairs and all negative pairs. Thus, this paper proposes a unified\nnegative pair generation (UNPG) by combining two PG strategies (i.e., MLPG and\nCLPG) from a unified perspective to alleviate the mismatch. UNPG introduces\nuseful information about negative pairs using MLPG to overcome the CLPG\ndeficiency. Moreover, it includes filtering the similarities of noisy negative\npairs to guarantee reliable convergence and improved performance. Exhaustive\nexperiments show the superiority of UNPG by achieving state-of-the-art\nperformance across recent loss functions on public benchmark datasets. Our code\nand pretrained models are publicly available.\n","authors":["Junuk Jung","Seonhoon Lee","Heung-Seon Oh","Yongjun Park","Joochan Park","Sungbin Son"],"pdf_url":"https://arxiv.org/pdf/2203.11593v2.pdf","comment":"9 pages, 6 figures, Published at BMVC22"},{"id":"http://arxiv.org/abs/2310.10404v7","updated":"2024-04-19T00:00:45Z","published":"2023-10-16T13:49:46Z","title":"LLM4SGG: Large Language Models for Weakly Supervised Scene Graph\n Generation","summary":" Weakly-Supervised Scene Graph Generation (WSSGG) research has recently\nemerged as an alternative to the fully-supervised approach that heavily relies\non costly annotations. In this regard, studies on WSSGG have utilized image\ncaptions to obtain unlocalized triplets while primarily focusing on grounding\nthe unlocalized triplets over image regions. 
However, they have overlooked the\ntwo issues involved in the triplet formation process from the captions: 1)\nSemantic over-simplification issue arises when extracting triplets from\ncaptions, where fine-grained predicates in captions are undesirably converted\ninto coarse-grained predicates, resulting in a long-tailed predicate\ndistribution, and 2) Low-density scene graph issue arises when aligning the\ntriplets in the caption with entity/predicate classes of interest, where many\ntriplets are discarded and not used in training, leading to insufficient\nsupervision. To tackle the two issues, we propose a new approach, i.e., Large\nLanguage Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two\nissues by leveraging the LLM's in-depth understanding of language and reasoning\nability during the extraction of triplets from captions and alignment of\nentity/predicate classes with target data. To further engage the LLM in these\nprocesses, we adopt the idea of Chain-of-Thought and the in-context few-shot\nlearning strategy. To validate the effectiveness of LLM4SGG, we conduct\nextensive experiments on Visual Genome and GQA datasets, showing significant\nimprovements in both Recall@K and mean Recall@K compared to the\nstate-of-the-art WSSGG methods. A further appeal is that LLM4SGG is\ndata-efficient, enabling effective model training with a small amount of\ntraining images.\n","authors":["Kibum Kim","Kanghoon Yoon","Jaehyeong Jeon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2310.10404v7.pdf","comment":"8 pages; CVPR 2024"},{"id":"http://arxiv.org/abs/2312.07509v2","updated":"2024-04-19T22:38:48Z","published":"2023-12-12T18:43:05Z","title":"PEEKABOO: Interactive Video Generation via Masked-Diffusion","summary":" Modern video generation models like Sora have achieved remarkable success in\nproducing high-quality videos. However, a significant limitation is their\ninability to offer interactive control to users, a feature that promises to\nopen up unprecedented applications and creativity. In this work, we introduce\nthe first solution to equip diffusion-based video generation models with\nspatio-temporal control. We present Peekaboo, a novel masked attention module,\nwhich seamlessly integrates with current video generation models offering\ncontrol without the need for additional training or inference overhead. To\nfacilitate future research, we also introduce a comprehensive benchmark for\ninteractive video generation. This benchmark offers a standardized framework\nfor the community to assess the efficacy of emerging interactive video\ngeneration models. Our extensive qualitative and quantitative assessments\nreveal that Peekaboo achieves up to a 3.8x improvement in mIoU over baseline\nmodels, all while maintaining the same latency. Code and benchmark are\navailable on the webpage.\n","authors":["Yash Jain","Anshul Nasery","Vibhav Vineet","Harkirat Behl"],"pdf_url":"https://arxiv.org/pdf/2312.07509v2.pdf","comment":"Project webpage - https://jinga-lala.github.io/projects/Peekaboo/"},{"id":"http://arxiv.org/abs/2404.13194v1","updated":"2024-04-19T21:54:20Z","published":"2024-04-19T21:54:20Z","title":"Privacy-Preserving Debiasing using Data Augmentation and Machine\n Unlearning","summary":" Data augmentation is widely used to mitigate data bias in the training\ndataset. However, data augmentation exposes machine learning models to privacy\nattacks, such as membership inference attacks. 
In this paper, we propose an\neffective combination of data augmentation and machine unlearning, which can\nreduce data bias while providing a provable defense against known attacks.\nSpecifically, we maintain the fairness of the trained model with\ndiffusion-based data augmentation, and then utilize multi-shard unlearning to\nremove identifying information of original data from the ML model for\nprotection against privacy attacks. Experimental evaluation across diverse\ndatasets demonstrates that our approach can achieve significant improvements in\nbias reduction as well as robustness against state-of-the-art privacy attacks.\n","authors":["Zhixin Pan","Emma Andrews","Laura Chang","Prabhat Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.13194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13185v1","updated":"2024-04-19T21:21:36Z","published":"2024-04-19T21:21:36Z","title":"Unlocking Robust Segmentation Across All Age Groups via Continual\n Learning","summary":" Most deep learning models in medical imaging are trained on adult data with\nunclear performance on pediatric images. In this work, we aim to address this\nchallenge in the context of automated anatomy segmentation in whole-body\nComputed Tomography (CT). We evaluate the performance of CT organ segmentation\nalgorithms trained on adult data when applied to pediatric CT volumes and\nidentify substantial age-dependent underperformance. We subsequently propose\nand evaluate strategies, including data augmentation and continual learning\napproaches, to achieve good segmentation accuracy across all age groups. Our\nbest-performing model, trained using continual learning, achieves high\nsegmentation accuracy on both adult and pediatric data (Dice scores of 0.90 and\n0.84 respectively).\n","authors":["Chih-Ying Liu","Jeya Maria Jose Valanarasu","Camila Gonzalez","Curtis Langlotz","Andrew Ng","Sergios Gatidis"],"pdf_url":"https://arxiv.org/pdf/2404.13185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08876v6","updated":"2024-04-19T21:13:41Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir black-box nature makes uncertainty quantification challenging. We\ninvestigate the effects of presenting conformal prediction sets--a\ndistribution-free class of methods for generating prediction sets with\nspecified coverage--to express uncertainty in AI-advised decision-making.\nThrough a large online experiment, we compare the utility of conformal\nprediction sets to displays of Top-1 and Top-k predictions for AI-advised image\nlabeling. In a pre-registered analysis, we find that the utility of prediction\nsets for accuracy varies with the difficulty of the task: while they result in\naccuracy on par with or less than Top-1 and Top-k displays for easy images,\nprediction sets excel at assisting humans in labeling out-of-distribution (OOD)\nimages, especially when the set size is small. Our results empirically pinpoint\npractical challenges of conformal prediction sets and provide implications on\nhow to incorporate them for real-world decision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v6.pdf","comment":"19 pages, 11 figures, 10 tables. 
Accepted by ACM CHI 2024"},{"id":"http://arxiv.org/abs/2404.10540v2","updated":"2024-04-19T20:15:45Z","published":"2024-04-12T20:40:12Z","title":"SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic\n Perception","summary":" Recently, event-based vision sensors have gained attention for autonomous\ndriving applications, as conventional RGB cameras face limitations in handling\nchallenging dynamic conditions. However, the availability of real-world and\nsynthetic event-based vision datasets remains limited. In response to this gap,\nwe present SEVD, a first-of-its-kind multi-view ego, and fixed perception\nsynthetic event-based dataset using multiple dynamic vision sensors within the\nCARLA simulator. Data sequences are recorded across diverse lighting (noon,\nnighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy)\nwith domain shifts (discrete and continuous). SEVD spans urban, suburban,\nrural, and highway scenes featuring various classes of objects (car, truck,\nvan, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes\nRGB imagery, depth maps, optical flow, semantic, and instance segmentation,\nfacilitating a comprehensive understanding of the scene. Furthermore, we\nevaluate the dataset using state-of-the-art event-based (RED, RVT) and\nframe-based (YOLOv8) methods for traffic participant detection tasks and\nprovide baseline benchmarks for assessment. Additionally, we conduct\nexperiments to assess the synthetic event-based dataset's generalization\ncapabilities. The dataset is available at\nhttps://eventbasedvision.github.io/SEVD\n","authors":["Manideep Reddy Aliminati","Bharatesh Chakravarthi","Aayush Atul Verma","Arpitsinh Vaghela","Hua Wei","Xuesong Zhou","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.10540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13159v1","updated":"2024-04-19T19:55:15Z","published":"2024-04-19T19:55:15Z","title":"Equivariant Imaging for Self-supervised Hyperspectral Image Inpainting","summary":" Hyperspectral imaging (HSI) is a key technology for earth observation,\nsurveillance, medical imaging and diagnostics, astronomy and space exploration.\nThe conventional technology for HSI in remote sensing applications is based on\nthe push-broom scanning approach in which the camera records the spectral image\nof a stripe of the scene at a time, while the image is generated by the\naggregation of measurements through time. In real-world airborne and spaceborne\nHSI instruments, some empty stripes would appear at certain locations, because\nplatforms do not always maintain a constant programmed attitude, or have access\nto accurate digital elevation maps (DEM), and the travelling track is not\nnecessarily aligned with the hyperspectral cameras at all times. This makes the\nenhancement of the acquired HS images from incomplete or corrupted observations\nan essential task. We introduce a novel HSI inpainting algorithm here, called\nHyperspectral Equivariant Imaging (Hyper-EI). Hyper-EI is a self-supervised\nlearning-based method which does not require training on extensive datasets or\naccess to a pre-trained model. 
Experimental results show that the proposed\nmethod achieves state-of-the-art inpainting performance compared to the\nexisting methods.\n","authors":["Shuo Li","Mike Davies","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2404.13159v1.pdf","comment":"5 Pages, 4 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2404.13153v1","updated":"2024-04-19T19:44:24Z","published":"2024-04-19T19:44:24Z","title":"Motion-adaptive Separable Collaborative Filters for Blind Motion\n Deblurring","summary":" Eliminating image blur produced by various kinds of motion has been a\nchallenging problem. Dominant approaches rely heavily on model capacity to\nremove blurring by reconstructing residual from blurry observation in feature\nspace. These practices not only prevent the capture of spatially variable\nmotion in the real world but also ignore the tailored handling of various\nmotions in image space. In this paper, we propose a novel real-world deblurring\nfiltering model called the Motion-adaptive Separable Collaborative (MISC)\nFilter. In particular, we use a motion estimation network to capture motion\ninformation from neighborhoods, thereby adaptively estimating spatially-variant\nmotion flow, mask, kernels, weights, and offsets to obtain the MISC Filter. The\nMISC Filter first aligns the motion-induced blurring patterns to the motion\nmiddle along the predicted flow direction, and then collaboratively filters the\naligned image through the predicted kernels, weights, and offsets to generate\nthe output. This design can handle more generalized and complex motion in a\nspatially differentiated manner. Furthermore, we analyze the relationships\nbetween the motion estimation network and the residual reconstruction network.\nExtensive experiments on four widely used benchmarks demonstrate that our\nmethod provides an effective solution for real-world motion blur removal and\nachieves state-of-the-art performance. Code is available at\nhttps://github.com/ChengxuLiu/MISCFilter\n","authors":["Chengxu Liu","Xuan Wang","Xiangyu Xu","Ruhao Tian","Shuai Li","Xueming Qian","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13153v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13148v1","updated":"2024-04-19T19:25:26Z","published":"2024-04-19T19:25:26Z","title":"BACS: Background Aware Continual Semantic Segmentation","summary":" Semantic segmentation plays a crucial role in enabling comprehensive scene\nunderstanding for robotic systems. However, generating annotations is\nchallenging, requiring labels for every pixel in an image. In scenarios like\nautonomous driving, there's a need to progressively incorporate new classes as\nthe operating environment of the deployed agent becomes more complex. For\nenhanced annotation efficiency, ideally, only pixels belonging to new classes\nwould be annotated. This approach is known as Continual Semantic Segmentation\n(CSS). Besides the common problem of classical catastrophic forgetting in the\ncontinual learning setting, CSS suffers from the inherent ambiguity of the\nbackground, a phenomenon we refer to as the \"background shift'', since pixels\nlabeled as background could correspond to future classes (forward background\nshift) or previous classes (backward background shift). As a result, continual\nlearning approaches tend to fail. 
This paper proposes a Backward Background\nShift Detector (BACS) to detect previously observed classes based on their\ndistance in the latent space from the foreground centroids of previous steps.\nMoreover, we propose a modified version of the cross-entropy loss function,\nincorporating the BACS detector to down-weight background pixels associated\nwith formerly observed classes. To combat catastrophic forgetting, we employ\nmasked feature distillation alongside dark experience replay. Additionally, our\napproach includes a transformer decoder capable of adjusting to new classes\nwithout necessitating an additional classification head. We validate BACS's\nsuperior performance over existing state-of-the-art methods on standard CSS\nbenchmarks.\n","authors":["Mostafa ElAraby","Ali Harakeh","Liam Paull"],"pdf_url":"https://arxiv.org/pdf/2404.13148v1.pdf","comment":"8 pages, 4 figures, CRV 2024"},{"id":"http://arxiv.org/abs/2404.13146v1","updated":"2024-04-19T19:24:20Z","published":"2024-04-19T19:24:20Z","title":"DeepFake-O-Meter v2.0: An Open Platform for DeepFake Detection","summary":" Deepfakes, as AI-generated media, have increasingly threatened media\nintegrity and personal privacy with realistic yet fake digital content. In this\nwork, we introduce an open-source and user-friendly online platform,\nDeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting\nDeepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have\nmade significant upgrades and improvements in platform architecture design,\nincluding user interaction, detector integration, job balancing, and security\nmanagement. The platform aims to offer everyday users a convenient service for\nanalyzing DeepFake media using multiple state-of-the-art detection algorithms.\nIt ensures secure and private delivery of the analysis results. Furthermore, it\nserves as an evaluation and benchmarking platform for researchers in digital\nmedia forensics to compare the performance of multiple algorithms on the same\ninput. We have also conducted detailed usage analysis based on the collected\ndata to gain deeper insights into our platform's statistics. This involves\nanalyzing two-month trends in user activity and evaluating the processing\nefficiency of each detector.\n","authors":["Shuwei Hou","Yan Ju","Chengzhe Sun","Shan Jia","Lipeng Ke","Riky Zhou","Anita Nikolich","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2404.13146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13651v2","updated":"2024-04-19T19:17:03Z","published":"2023-08-25T19:40:56Z","title":"PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained\n Image Classification Accuracy for AIs and Humans","summary":" Nearest neighbors (NN) are traditionally used to compute final decisions,\ne.g., in Support Vector Machines or k-NN classifiers, and to provide users with\nexplanations for the model's decision. In this paper, we show a novel utility\nof nearest neighbors: To improve predictions of a frozen, pretrained classifier\nC. We leverage an image comparator S that (1) compares the input image with NN\nimages from the top-K most probable classes; and (2) uses S's output scores to\nweight the confidence scores of C. 
Our method consistently improves\nfine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120.\nAlso, a human study finds that showing lay users our probable-class nearest\nneighbors (PCNN) improves their decision accuracy over prior work, which shows\nonly the top-1 class examples.\n","authors":["Giang Nguyen","Valerie Chen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.13651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02899v2","updated":"2024-04-19T18:53:41Z","published":"2024-04-03T17:57:15Z","title":"MatAtlas: Text-driven Consistent Geometry Texturing and Material\n Assignment","summary":" We present MatAtlas, a method for consistent text-guided 3D model texturing.\nFollowing recent progress, we leverage a large-scale text-to-image generation\nmodel (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully\ndesign an RGB texturing pipeline that leverages a grid pattern diffusion,\ndriven by depth and edges. By proposing a multi-step texture refinement\nprocess, we significantly improve the quality and 3D consistency of the\ntexturing output. To further address the problem of baked-in lighting, we move\nbeyond RGB colors and pursue assigning parametric materials to the assets.\nGiven the high-quality initial RGB texture, we propose a novel material\nretrieval method capitalizing on Large Language Models (LLMs), enabling\neditability and relightability. We evaluate our method on a wide variety of\ngeometries and show that it significantly outperforms prior art. We\nalso analyze the role of each component through a detailed ablation study.\n","authors":["Duygu Ceylan","Valentin Deschaintre","Thibault Groueix","Rosalie Martin","Chun-Hao Huang","Romain Rouffet","Vladimir Kim","Gaëtan Lassagne"],"pdf_url":"https://arxiv.org/pdf/2404.02899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13134v1","updated":"2024-04-19T18:52:07Z","published":"2024-04-19T18:52:07Z","title":"Deep Learning-based Text-in-Image Watermarking","summary":" In this work, we introduce a novel deep learning-based approach to\ntext-in-image watermarking, a method that embeds and extracts textual\ninformation within images to enhance data security and integrity. Leveraging\nthe capabilities of deep learning, specifically through the use of\nTransformer-based architectures for text processing and Vision Transformers for\nimage feature extraction, our method sets new benchmarks in the domain. The\nproposed method represents the first application of deep learning in\ntext-in-image watermarking that improves adaptivity, allowing the model to\nintelligently adjust to specific image characteristics and emerging threats.\nThrough testing and evaluation, our method has demonstrated superior robustness\ncompared to traditional watermarking techniques, achieving enhanced\nimperceptibility that ensures the watermark remains undetectable across various\nimage contents.\n","authors":["Bishwa Karki","Chun-Hua Tsai","Pei-Chi Huang","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.13134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09673v2","updated":"2024-04-19T18:48:01Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) generates new images by combining the style of\none image with the content of another. 
However, unauthorized NST can exploit\nartwork, raising concerns about artists' rights and motivating the development\nof proactive protection methods. We propose Locally Adaptive Adversarial Color\nAttack (LAACA), empowering artists to protect their artwork from unauthorized\nstyle transfer by processing it before public release. By delving into the\nintricacies of human visual perception and the role of different frequency\ncomponents, our method strategically introduces frequency-adaptive\nperturbations in the image. These perturbations significantly degrade the\ngeneration quality of NST while maintaining an acceptable level of visual\nchange in the original image, ensuring that potential infringers are\ndiscouraged from using the protected artworks because of the poor quality of\nthe resulting NST output. Additionally, existing metrics often overlook the\nimportance of color fidelity in evaluating color-mattered tasks, such as the\nquality of NST-generated images, which is crucial in the context of artistic\nworks. To comprehensively assess color-mattered tasks, we propose the\nAdversarial Color Distance Metric (ACDM), designed to quantify the color\ndifference of images pre- and post-manipulation. Experimental results confirm\nthat attacking NST using LAACA results in visually inferior style transfer, and\nthat the ACDM can efficiently measure color-mattered tasks. By providing artists\nwith a tool to safeguard their intellectual property, our work alleviates the\nsocio-technical challenges posed by the misuse of NST in the art community.\n","authors":["Zhongliang Guo","Junhao Dong","Yifei Qian","Kaixuan Wang","Weiye Li","Ziheng Guo","Yuheng Wang","Yanli Li","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v2.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.13130v1","updated":"2024-04-19T18:34:52Z","published":"2024-04-19T18:34:52Z","title":"On-board classification of underwater images using hybrid\n classical-quantum CNN based method","summary":" Underwater images taken from autonomous underwater vehicles (AUVs) often\nsuffer from low light, high turbidity, poor contrast, motion blur and excessive\nlight scattering, and hence require image enhancement techniques for object\nrecognition. Machine learning methods are being increasingly used for object\nrecognition under such adverse conditions. These enhanced object recognition\nmethods for images taken from AUVs have potential applications in underwater\npipeline and optical fibre surveillance, ocean bed resource extraction, ocean\nfloor mapping, underwater species exploration, etc. While classical machine\nlearning methods are very efficient in terms of accuracy, they require large\ndatasets and high computational time for image classification. In the current\nwork, we use quantum-classical hybrid machine learning methods for real-time\nunderwater object recognition on board an AUV for the first time. We use\nreal-time motion-blurred and low-light images taken from the on-board camera of\nan AUV built in-house and apply existing hybrid machine learning methods for\nobject recognition. Our hybrid methods consist of quantum encoding and\nflattening of classical images using quantum circuits and sending them to\nclassical neural networks for image classification. 
The results of the hybrid\nmethods, obtained using Pennylane-based quantum simulators on a GPU and using\npre-trained models on an on-board NVIDIA GPU chipset, are compared with results\nfrom the corresponding classical machine learning methods. We observe that\nthe hybrid quantum machine learning methods show an efficiency greater than\n65\\%, reduce run-time by one-third, and require 50\\% smaller dataset\nsizes for training the models compared to classical machine learning methods.\nWe hope that our work opens up further possibilities in quantum-enhanced\nreal-time computer vision in autonomous vehicles.\n","authors":["Sreeraj Rajan Warrier","D Sri Harshavardhan Reddy","Sriya Bada","Rohith Achampeta","Sebastian Uppapalli","Jayasri Dontabhaktuni"],"pdf_url":"https://arxiv.org/pdf/2404.13130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14435v1","updated":"2024-04-19T16:40:24Z","published":"2024-04-19T16:40:24Z","title":"FreSeg: Frenet-Frame-based Part Segmentation for 3D Curvilinear\n Structures","summary":" Part segmentation is a crucial task for 3D curvilinear structures like neuron\ndendrites and blood vessels, enabling the analysis of dendritic spines and\naneurysms with scientific and clinical significance. However, their diversely\nwinding morphology poses a generalization challenge to existing deep learning\nmethods, which leads to labor-intensive manual correction. In this work, we\npropose FreSeg, a framework for part segmentation of 3D curvilinear\nstructures. With Frenet-Frame-based point cloud transformation, it enables the\nmodels to learn more generalizable features and achieve significant performance\nimprovements on tasks involving elongated and curvy geometries. We evaluate\nFreSeg on 2 datasets: 1) DenSpineEM, an in-house dataset for dendritic spine\nsegmentation, and 2) IntrA, a public 3D dataset for intracranial aneurysm\nsegmentation. Further, we will release the DenSpineEM dataset, which includes\nroughly 6,000 spines from 69 dendrites from 3 public electron microscopy (EM)\ndatasets, to foster the development of effective dendritic spine instance\nextraction methods and, consequently, large-scale connectivity analysis to\nbetter understand mammalian brains.\n","authors":["Shixuan Gu","Jason Ken Adhinarta","Mikhail Bessmeltsev","Jiancheng Yang","Jessica Zhang","Daniel Berger","Jeff W. Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2404.14435v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.13108v1","updated":"2024-04-19T16:19:30Z","published":"2024-04-19T16:19:30Z","title":"RegWSI: Whole Slide Image Registration using Combined Deep Feature- and\n Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge","summary":" The automatic registration of differently stained whole slide images (WSIs)\nis crucial for improving diagnosis and prognosis by fusing complementary\ninformation emerging from different visible structures. It is also useful to\nquickly transfer annotations between consecutive or restained slides, thus\nsignificantly reducing the annotation time and associated costs. Nevertheless,\nthe slide preparation is different for each stain and the tissue undergoes\ncomplex and large deformations. Therefore, a robust, efficient, and accurate\nregistration method is highly desired by the scientific community and hospitals\nspecializing in digital pathology. 
We propose a two-step hybrid method\nconsisting of (i) a deep learning- and feature-based initial alignment algorithm,\nand (ii) intensity-based nonrigid registration using instance optimization.\nThe proposed method does not require any fine-tuning to a particular dataset\nand can be used directly for any desired tissue type and stain. The method\nscored 1st place in the ACROBAT 2023 challenge. We evaluated the method using\nthree open datasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and performed\nseveral ablation studies concerning the resolution used for registration and the\ninitial alignment robustness and stability. The method achieves the most\naccurate results for the ACROBAT dataset, cell-level registration accuracy\nfor the restained slides from the HyReCo dataset, and is among the best methods\nevaluated on the ANHIR dataset. The method does not require any fine-tuning to\nnew datasets and can be used out-of-the-box for other types of microscopic\nimages. The method is incorporated into the DeeperHistReg framework, allowing\nothers to directly use it to register, transform, and save the WSIs at any\ndesired pyramid level. The proposed method is a significant contribution to\nWSI registration, thus advancing the field of digital pathology.\n","authors":["Marek Wodzinski","Niccolò Marini","Manfredo Atzori","Henning Müller"],"pdf_url":"https://arxiv.org/pdf/2404.13108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14434v1","updated":"2024-04-19T15:25:06Z","published":"2024-04-19T15:25:06Z","title":"DeeperHistReg: Robust Whole Slide Images Registration Framework","summary":" DeeperHistReg is a software framework dedicated to registering whole slide\nimages (WSIs) acquired using multiple stains. It allows one to perform the\npreprocessing, initial alignment, and nonrigid registration of WSIs acquired\nusing multiple stains (e.g. hematoxylin \\& eosin, immunochemistry). The\nframework implements several state-of-the-art registration algorithms and\nprovides an interface to operate on arbitrary resolutions of the WSIs (up to\n200k x 200k). The framework is extensible and new algorithms can be easily\nintegrated by other researchers. The framework is available both as a PyPI\npackage and as a Docker container.\n","authors":["Marek Wodzinski","Niccolò Marini","Manfredo Atzori","Henning Müller"],"pdf_url":"https://arxiv.org/pdf/2404.14434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13106v1","updated":"2024-04-19T14:43:43Z","published":"2024-04-19T14:43:43Z","title":"Automatic Cranial Defect Reconstruction with Self-Supervised Deep\n Deformable Masked Autoencoders","summary":" Thousands of people suffer from cranial injuries every year. They require\npersonalized implants that need to be designed and manufactured before the\nreconstruction surgery. Manual design is expensive and time-consuming,\nmotivating the search for algorithms that automate the process.\nThe problem can be formulated as volumetric shape completion and solved by deep\nneural networks dedicated to supervised image segmentation. However, such an\napproach requires annotating the ground-truth defects, which is costly and\ntime-consuming. Usually, the process is replaced with synthetic defect\ngeneration. However, even the synthetic ground-truth generation is\ntime-consuming and limits the data heterogeneity, and thus the deep models'\ngeneralizability. In our work, we propose a simple alternative approach that\nuses a self-supervised masked autoencoder to solve the problem. 
This approach by\ndesign increases the heterogeneity of the training set and can be seen as a\nform of data augmentation. We compare the proposed method with several\nstate-of-the-art deep neural networks and show both the quantitative and\nqualitative improvement on the SkullBreak and SkullFix datasets. The proposed\nmethod can be used to efficiently reconstruct the cranial defects in real time.\n","authors":["Marek Wodzinski","Daria Hemmerling","Mateusz Daniol"],"pdf_url":"https://arxiv.org/pdf/2404.13106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13105v1","updated":"2024-04-19T13:50:30Z","published":"2024-04-19T13:50:30Z","title":"On-Demand Earth System Data Cubes","summary":" Advancements in Earth system science have seen a surge in diverse datasets.\nEarth System Data Cubes (ESDCs) have been introduced to efficiently handle this\ninflux of high-dimensional data. ESDCs offer a structured, intuitive framework\nfor data analysis, organising information within spatio-temporal grids. The\nstructured nature of ESDCs unlocks significant opportunities for Artificial\nIntelligence (AI) applications. By providing well-organised data, ESDCs are\nideally suited for a wide range of sophisticated AI-driven tasks. An automated\nframework for creating AI-focused ESDCs with minimal user input could\nsignificantly accelerate the generation of task-specific training data. Here we\nintroduce cubo, an open-source Python tool designed for easy generation of\nAI-focused ESDCs. Utilising collections in SpatioTemporal Asset Catalogs (STAC)\nthat are stored as Cloud Optimised GeoTIFFs (COGs), cubo efficiently creates\nESDCs, requiring only central coordinates, spatial resolution, edge size, and\ntime range.\n","authors":["David Montero","César Aybar","Chaonan Ji","Guido Kraemer","Maximilian Söchting","Khalil Teber","Miguel D. Mahecha"],"pdf_url":"https://arxiv.org/pdf/2404.13105v1.pdf","comment":"Accepted at IGARSS24"},{"id":"http://arxiv.org/abs/2404.13103v1","updated":"2024-04-19T11:27:56Z","published":"2024-04-19T11:27:56Z","title":"ToNNO: Tomographic Reconstruction of a Neural Network's Output for\n Weakly Supervised Segmentation of 3D Medical Images","summary":" Annotating lots of 3D medical images for training segmentation models is\ntime-consuming. The goal of weakly supervised semantic segmentation is to train\nsegmentation models without using any ground truth segmentation masks. Our work\naddresses the case where only image-level categorical labels, indicating the\npresence or absence of a particular region of interest (such as tumours or\nlesions), are available. Most existing methods rely on class activation mapping\n(CAM). We propose a novel approach, ToNNO, which is based on the Tomographic\nreconstruction of a Neural Network's Output. Our technique extracts stacks of\nslices with different angles from the input 3D volume, feeds these slices to a\n2D encoder, and applies the inverse Radon transform in order to reconstruct a\n3D heatmap of the encoder's predictions. This generic method allows to perform\ndense prediction tasks on 3D volumes using any 2D image encoder. We apply it to\nweakly supervised medical image segmentation by training the 2D encoder to\noutput high values for slices containing the regions of interest. We test it on\nfour large scale medical image datasets and outperform 2D CAM methods. 
We then\nextend ToNNO by combining tomographic reconstruction with CAM methods,\nproposing Averaged CAM and Tomographic CAM, which obtain even better results.\n","authors":["Marius Schmidt-Mengin","Alexis Benichoux","Shibeshih Belachew","Nikos Komodakis","Nikos Paragios"],"pdf_url":"https://arxiv.org/pdf/2404.13103v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13102v1","updated":"2024-04-19T10:19:18Z","published":"2024-04-19T10:19:18Z","title":"Single-sample image-fusion upsampling of fluorescence lifetime images","summary":" Fluorescence lifetime imaging microscopy (FLIM) provides detailed information\nabout molecular interactions and biological processes. A major bottleneck for\nFLIM is image resolution at high acquisition speeds, due to the engineering and\nsignal-processing limitations of time-resolved imaging technology. Here we\npresent single-sample image-fusion upsampling (SiSIFUS), a data-fusion approach\nto computational FLIM super-resolution that combines measurements from a\nlow-resolution time-resolved detector (that measures photon arrival time) and a\nhigh-resolution camera (that measures intensity only). To solve this otherwise\nill-posed inverse retrieval problem, we introduce statistically informed priors\nthat encode local and global dependencies between the two single-sample\nmeasurements. This bypasses the risk of out-of-distribution hallucination as in\ntraditional data-driven approaches and delivers enhanced images compared for\nexample to standard bilinear interpolation. The general approach laid out by\nSiSIFUS can be applied to other image super-resolution problems where two\ndifferent datasets are available.\n","authors":["Valentin Kapitány","Areeba Fatima","Vytautas Zickus","Jamie Whitelaw","Ewan McGhee","Robert Insall","Laura Machesky","Daniele Faccio"],"pdf_url":"https://arxiv.org/pdf/2404.13102v1.pdf","comment":"18 pages, 11 figures. To be published in Science Advances"},{"id":"http://arxiv.org/abs/2404.13101v1","updated":"2024-04-19T09:52:32Z","published":"2024-04-19T09:52:32Z","title":"DensePANet: An improved generative adversarial network for photoacoustic\n tomography image reconstruction from sparse data","summary":" Image reconstruction is an essential step of every medical imaging method,\nincluding Photoacoustic Tomography (PAT), which is a promising modality of\nimaging, that unites the benefits of both ultrasound and optical imaging\nmethods. Reconstruction of PAT images using conventional methods results in\nrough artifacts, especially when applied directly to sparse PAT data. In recent\nyears, generative adversarial networks (GANs) have shown a powerful performance\nin image generation as well as translation, rendering them a smart choice to be\napplied to reconstruction tasks. In this study, we proposed an end-to-end\nmethod called DensePANet to solve the problem of PAT image reconstruction from\nsparse data. The proposed model employs a novel modification of UNet in its\ngenerator, called FD-UNet++, which considerably improves the reconstruction\nperformance. 
We evaluated the method on various in-vivo and simulated datasets.\nQuantitative and qualitative results show the better performance of our model\nover other prevalent deep learning techniques.\n","authors":["Hesam hakimnejad","Zohreh Azimifar","Narjes Goshtasbi"],"pdf_url":"https://arxiv.org/pdf/2404.13101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13097v1","updated":"2024-04-19T06:52:57Z","published":"2024-04-19T06:52:57Z","title":"DISC: Latent Diffusion Models with Self-Distillation from Separated\n Conditions for Prostate Cancer Grading","summary":" Latent Diffusion Models (LDMs) can generate high-fidelity images from noise,\noffering a promising approach for augmenting histopathology images for training\ncancer grading models. While previous works successfully generated\nhigh-fidelity histopathology images using LDMs, the generation of image tiles\nto improve prostate cancer grading has not yet been explored. Additionally,\nLDMs face challenges in accurately generating admixtures of multiple cancer\ngrades in a tile when conditioned by a tile mask. In this study, we train\nspecific LDMs to generate synthetic tiles that contain multiple Gleason Grades\n(GGs) by leveraging pixel-wise annotations in input tiles. We introduce a novel\nframework named Self-Distillation from Separated Conditions (DISC) that\ngenerates GG patterns guided by GG masks. Finally, we deploy a training\nframework for pixel-level and slide-level prostate cancer grading, where\nsynthetic tiles are effectively utilized to improve the cancer grading\nperformance of existing models. As a result, this work surpasses previous works\nin two domains: 1) our LDMs enhanced with DISC produce more accurate tiles in\nterms of GG patterns, and 2) our training scheme, incorporating synthetic data,\nsignificantly improves the generalization of the baseline model for prostate\ncancer grading, particularly in challenging cases of rare GG5, demonstrating\nthe potential of generative models to enhance cancer grading when data is\nlimited.\n","authors":["Man M. Ho","Elham Ghelichkhan","Yosep Chong","Yufei Zhou","Beatrice Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2404.13097v1.pdf","comment":"Abstract accepted for ISBI 2024. Extended version to be presented at\n SynData4CV @ CVPR 2024. See more at https://minhmanho.github.io/disc/"},{"id":"http://arxiv.org/abs/2404.15367v1","updated":"2024-04-19T13:24:09Z","published":"2024-04-19T13:24:09Z","title":"Leveraging Visibility Graphs for Enhanced Arrhythmia Classification with\n Graph Convolutional Networks","summary":" Arrhythmias, detectable via electrocardiograms (ECGs), pose significant\nhealth risks, emphasizing the need for robust automated identification\ntechniques. Although traditional deep learning methods have shown potential,\nrecent advances in graph-based strategies are aimed at enhancing arrhythmia\ndetection performance. However, effectively representing ECG signals as graphs\nremains a challenge. This study explores graph representations of ECG signals\nusing Visibility Graph (VG) and Vector Visibility Graph (VVG), coupled with\nGraph Convolutional Networks (GCNs) for arrhythmia classification. Through\nexperiments on the MIT-BIH dataset, we investigated various GCN architectures\nand preprocessing parameters. The results reveal that GCNs, when integrated\nwith VG and VVG for signal graph mapping, can classify arrhythmias without the\nneed for preprocessing or noise removal from ECG signals. 
While both VG and VVG\nmethods show promise, VG is notably more efficient. The proposed approach was\ncompetitive compared to baseline methods, although classifying the S class\nremains challenging, especially under the inter-patient paradigm. Computational\ncomplexity, particularly with the VVG method, required data balancing and\nsophisticated implementation strategies. The source code is publicly available\nfor further research and development at\nhttps://github.com/raffoliveira/VG_for_arrhythmia_classification_with_GCN.\n","authors":["Rafael F. Oliveira","Gladston J. P. Moreira","Vander L. S. Freitas","Eduardo J. S. Luz"],"pdf_url":"https://arxiv.org/pdf/2404.15367v1.pdf","comment":null}]},"2024-04-22T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.14412v1","updated":"2024-04-22T17:59:57Z","published":"2024-04-22T17:59:57Z","title":"AutoAD III: The Prequel -- Back to the Pixels","summary":" Generating Audio Description (AD) for movies is a challenging task that\nrequires fine-grained visual understanding and an awareness of the characters\nand their names. Currently, visual language models for AD generation are\nlimited by a lack of suitable training data, and also their evaluation is\nhampered by using performance measures not specialized to the AD domain. In\nthis paper, we make three contributions: (i) We propose two approaches for\nconstructing AD datasets with aligned video data, and build training and\nevaluation datasets using these. These datasets will be publicly released; (ii)\nWe develop a Q-former-based architecture which ingests raw video and generates\nAD, using frozen pre-trained visual encoders and large language models; and\n(iii) We provide new evaluation metrics to benchmark AD quality that are\nwell-matched to human performance. Taken together, we improve the state of the\nart on AD generation.\n","authors":["Tengda Han","Max Bain","Arsha Nagrani","Gül Varol","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.14412v1.pdf","comment":"CVPR2024. 
Project page:\n https://www.robots.ox.ac.uk/~vgg/research/autoad/"},{"id":"http://arxiv.org/abs/2404.14410v1","updated":"2024-04-22T17:59:50Z","published":"2024-04-22T17:59:50Z","title":"Guess The Unseen: Dynamic 3D Scene Reconstruction from Partial 2D\n Glimpses","summary":" In this paper, we present a method to reconstruct the world and multiple\ndynamic humans in 3D from a monocular video input. As a key idea, we represent\nboth the world and multiple humans via the recently emerging 3D Gaussian\nSplatting (3D-GS) representation, enabling to conveniently and efficiently\ncompose and render them together. In particular, we address the scenarios with\nseverely limited and sparse observations in 3D human reconstruction, a common\nchallenge encountered in the real world. To tackle this challenge, we introduce\na novel approach to optimize the 3D-GS representation in a canonical space by\nfusing the sparse cues in the common space, where we leverage a pre-trained 2D\ndiffusion model to synthesize unseen views while keeping the consistency with\nthe observed 2D appearances. We demonstrate our method can reconstruct\nhigh-quality animatable 3D humans in various challenging examples, in the\npresence of occlusion, image crops, few-shot, and extremely sparse\nobservations. After reconstruction, our method is capable of not only rendering\nthe scene in any novel views at arbitrary time instances, but also editing the\n3D scene by removing individual humans or applying different motions for each\nhuman. Through various experiments, we demonstrate the quality and efficiency\nof our methods over alternative existing approaches.\n","authors":["Inhee Lee","Byungjun Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2404.14410v1.pdf","comment":"The project page is available at https://snuvclab.github.io/gtu/"},{"id":"http://arxiv.org/abs/2404.14409v1","updated":"2024-04-22T17:59:36Z","published":"2024-04-22T17:59:36Z","title":"CrossScore: Towards Multi-View Image Evaluation and Scoring","summary":" We introduce a novel cross-reference image quality assessment method that\neffectively fills the gap in the image assessment landscape, complementing the\narray of established evaluation schemes -- ranging from full-reference metrics\nlike SSIM, no-reference metrics such as NIQE, to general-reference metrics\nincluding FID, and Multi-modal-reference metrics, e.g., CLIPScore. Utilising a\nneural network with the cross-attention mechanism and a unique data collection\npipeline from NVS optimisation, our method enables accurate image quality\nassessment without requiring ground truth references. By comparing a query\nimage against multiple views of the same scene, our method addresses the\nlimitations of existing metrics in novel view synthesis (NVS) and similar tasks\nwhere direct reference images are unavailable. 
Experimental results show that\nour method is closely correlated to the full-reference metric SSIM, while not\nrequiring ground truth references.\n","authors":["Zirui Wang","Wenjing Bian","Omkar Parkhi","Yuheng Ren","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2404.14409v1.pdf","comment":"Project page see https://crossscore.active.vision"},{"id":"http://arxiv.org/abs/2404.12379v2","updated":"2024-04-22T17:59:27Z","published":"2024-04-18T17:58:16Z","title":"Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular\n Videos","summary":" Modern 3D engines and graphics pipelines require mesh as a memory-efficient\nrepresentation, which allows efficient rendering, geometry processing, texture\nediting, and many other downstream operations. However, it is still highly\ndifficult to obtain high-quality mesh in terms of structure and detail from\nmonocular visual observations. The problem becomes even more challenging for\ndynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh\n(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh\ngiven a single monocular video. Our work leverages the recent advancement in 3D\nGaussian Splatting to construct the mesh sequence with temporal consistency\nfrom a video. Building on top of this representation, DG-Mesh recovers\nhigh-quality meshes from the Gaussian points and can track the mesh vertices\nover time, which enables applications such as texture editing on dynamic\nobjects. We introduce the Gaussian-Mesh Anchoring, which encourages evenly\ndistributed Gaussians, resulting better mesh reconstruction through mesh-guided\ndensification and pruning on the deformed Gaussians. By applying\ncycle-consistent deformation between the canonical and the deformed space, we\ncan project the anchored Gaussian back to the canonical space and optimize\nGaussians across all time frames. During the evaluation on different datasets,\nDG-Mesh provides significantly better mesh reconstruction and rendering than\nbaselines. Project page: https://www.liuisabella.com/DG-Mesh/\n","authors":["Isabella Liu","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12379v2.pdf","comment":"Project page: https://www.liuisabella.com/DG-Mesh/"},{"id":"http://arxiv.org/abs/2404.14406v1","updated":"2024-04-22T17:59:18Z","published":"2024-04-22T17:59:18Z","title":"Hyp-OC: Hyperbolic One Class Classification for Face Anti-Spoofing","summary":" Face recognition technology has become an integral part of modern security\nsystems and user authentication processes. However, these systems are\nvulnerable to spoofing attacks and can easily be circumvented. Most prior\nresearch in face anti-spoofing (FAS) approaches it as a two-class\nclassification task where models are trained on real samples and known spoof\nattacks and tested for detection performance on unknown spoof attacks. However,\nin practice, FAS should be treated as a one-class classification task where,\nwhile training, one cannot assume any knowledge regarding the spoof samples a\npriori. In this paper, we reformulate the face anti-spoofing task from a\none-class perspective and propose a novel hyperbolic one-class classification\nframework. 
To train our network, we use a pseudo-negative class sampled from\nthe Gaussian distribution with a weighted running mean and propose two novel\nloss functions: (1) Hyp-PC: Hyperbolic Pairwise Confusion loss, and (2) Hyp-CE:\nHyperbolic Cross Entropy loss, which operate in the hyperbolic space.\nAdditionally, we employ Euclidean feature clipping and gradient clipping to\nstabilize the training in the hyperbolic space. To the best of our knowledge,\nthis is the first work extending hyperbolic embeddings for face anti-spoofing\nin a one-class manner. With extensive experiments on five benchmark datasets:\nRose-Youtu, MSU-MFSD, CASIA-MFSD, Idiap Replay-Attack, and OULU-NPU, we\ndemonstrate that our method significantly outperforms the state-of-the-art,\nachieving better spoof detection performance.\n","authors":["Kartik Narayan","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2404.14406v1.pdf","comment":"Accepted in FG2024, Project Page -\n https://kartik-3004.github.io/hyp-oc/"},{"id":"http://arxiv.org/abs/2404.14403v1","updated":"2024-04-22T17:58:36Z","published":"2024-04-22T17:58:36Z","title":"GeoDiffuser: Geometry-Based Image Editing with Diffusion Models","summary":" The success of image generative models has enabled us to build methods that\ncan edit images based on text or other user input. However, these methods are\nbespoke, imprecise, require additional information, or are limited to only 2D\nimage edits. We present GeoDiffuser, a zero-shot optimization-based method that\nunifies common 2D and 3D image-based object editing capabilities into a single\nmethod. Our key insight is to view image editing operations as geometric\ntransformations. We show that these transformations can be directly\nincorporated into the attention layers in diffusion models to implicitly\nperform editing operations. Our training-free optimization method uses an\nobjective function that seeks to preserve object style but generate plausible\nimages, for instance with accurate lighting and shadows. It also inpaints\ndisoccluded parts of the image where the object was originally located. Given a\nnatural image and user input, we segment the foreground object using SAM and\nestimate a corresponding transform which is used by our optimization approach\nfor editing. GeoDiffuser can perform common 2D and 3D edits like object\ntranslation, 3D rotation, and removal. We present quantitative results,\nincluding a perceptual study, that shows how our approach is better than\nexisting methods. Visit https://ivl.cs.brown.edu/research/geodiffuser.html for\nmore information.\n","authors":["Rahul Sajnani","Jeroen Vanbaar","Jie Min","Kapil Katyal","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2404.14403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14396v1","updated":"2024-04-22T17:56:09Z","published":"2024-04-22T17:56:09Z","title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension\n and Generation","summary":" The rapid evolution of multimodal foundation model has demonstrated\nsignificant progresses in vision-language understanding and generation, e.g.,\nour previous work SEED-LLaMA. However, there remains a gap between its\ncapability and the real-world applicability, primarily due to the model's\nlimited capacity to effectively respond to various user instructions and\ninteract with diverse visual data. 
In this work, we focus on bridging this gap\nthrough integrating two enhanced features: (1) comprehending images of\narbitrary sizes and ratios, and (2) enabling multi-granularity image\ngeneration. We present a unified and versatile foundation model, namely,\nSEED-X, which is able to model multi-granularity visual semantics for\ncomprehension and generation tasks. Besides the competitive results on public\nbenchmarks, SEED-X demonstrates its effectiveness in handling real-world\napplications across various domains after instruction tuning. We hope that our\nwork will inspire future research into what can be achieved by versatile\nmultimodal foundation models in real-world applications. The models, codes, and\ndatasets will be released in https://github.com/AILab-CVC/SEED-X.\n","authors":["Yuying Ge","Sijie Zhao","Jinguo Zhu","Yixiao Ge","Kun Yi","Lin Song","Chen Li","Xiaohan Ding","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14396v1.pdf","comment":"Project released at: https://github.com/AILab-CVC/SEED-X"},{"id":"http://arxiv.org/abs/2404.14394v1","updated":"2024-04-22T17:55:11Z","published":"2024-04-22T17:55:11Z","title":"A Multimodal Automated Interpretability Agent","summary":" This paper describes MAIA, a Multimodal Automated Interpretability Agent.\nMAIA is a system that uses neural models to automate neural model understanding\ntasks like feature interpretation and failure mode discovery. It equips a\npre-trained vision-language model with a set of tools that support iterative\nexperimentation on subcomponents of other models to explain their behavior.\nThese include tools commonly used by human interpretability researchers: for\nsynthesizing and editing inputs, computing maximally activating exemplars from\nreal-world datasets, and summarizing and describing experimental results.\nInterpretability experiments proposed by MAIA compose these tools to describe\nand explain system behavior. We evaluate applications of MAIA to computer\nvision models. We first characterize MAIA's ability to describe (neuron-level)\nfeatures in learned representations of images. Across several trained models\nand a novel dataset of synthetic vision neurons with paired ground-truth\ndescriptions, MAIA produces descriptions comparable to those generated by\nexpert human experimenters. We then show that MAIA can aid in two additional\ninterpretability tasks: reducing sensitivity to spurious features, and\nautomatically identifying inputs likely to be mis-classified.\n","authors":["Tamar Rott Shaham","Sarah Schwettmann","Franklin Wang","Achyuta Rajaram","Evan Hernandez","Jacob Andreas","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2404.14394v1.pdf","comment":"25 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.18673v2","updated":"2024-04-22T17:54:17Z","published":"2024-02-28T19:35:30Z","title":"Trends, Applications, and Challenges in Human Attention Modelling","summary":" Human attention modelling has proven, in recent years, to be particularly\nuseful not only for understanding the cognitive processes underlying visual\nexploration, but also for providing support to artificial intelligence models\nthat aim to solve problems in various domains, including image and video\nprocessing, vision-and-language applications, and language modelling. This\nsurvey offers a reasoned overview of recent efforts to integrate human\nattention mechanisms into contemporary deep learning models and discusses\nfuture research directions and challenges. 
For a comprehensive overview on the\nongoing research refer to our dedicated repository available at\nhttps://github.com/aimagelab/awesome-human-visual-attention.\n","authors":["Giuseppe Cartella","Marcella Cornia","Vittorio Cuculo","Alessandro D'Amelio","Dario Zanca","Giuseppe Boccignone","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2402.18673v2.pdf","comment":"Accepted at IJCAI 2024 Survey Track"},{"id":"http://arxiv.org/abs/2404.10108v2","updated":"2024-04-22T17:53:08Z","published":"2024-04-15T19:43:16Z","title":"GeoAI Reproducibility and Replicability: a computational and spatial\n perspective","summary":" GeoAI has emerged as an exciting interdisciplinary research area that\ncombines spatial theories and data with cutting-edge AI models to address\ngeospatial problems in a novel, data-driven manner. While GeoAI research has\nflourished in the GIScience literature, its reproducibility and replicability\n(R&R), fundamental principles that determine the reusability, reliability, and\nscientific rigor of research findings, have rarely been discussed. This paper\naims to provide an in-depth analysis of this topic from both computational and\nspatial perspectives. We first categorize the major goals for reproducing GeoAI\nresearch, namely, validation (repeatability), learning and adapting the method\nfor solving a similar or new problem (reproducibility), and examining the\ngeneralizability of the research findings (replicability). Each of these goals\nrequires different levels of understanding of GeoAI, as well as different\nmethods to ensure its success. We then discuss the factors that may cause the\nlack of R&R in GeoAI research, with an emphasis on (1) the selection and use of\ntraining data; (2) the uncertainty that resides in the GeoAI model design,\ntraining, deployment, and inference processes; and more importantly (3) the\ninherent spatial heterogeneity of geospatial data and processes. We use a deep\nlearning-based image analysis task as an example to demonstrate the results'\nuncertainty and spatial variance caused by different factors. The findings\nreiterate the importance of knowledge sharing, as well as the generation of a\n\"replicability map\" that incorporates spatial autocorrelation and spatial\nheterogeneity into consideration in quantifying the spatial replicability of\nGeoAI research.\n","authors":["Wenwen Li","Chia-Yu Hsu","Sizhe Wang","Peter Kedron"],"pdf_url":"https://arxiv.org/pdf/2404.10108v2.pdf","comment":"Accepted by Annals of the American Association of Geographers"},{"id":"http://arxiv.org/abs/2404.14388v1","updated":"2024-04-22T17:46:29Z","published":"2024-04-22T17:46:29Z","title":"STROOBnet Optimization via GPU-Accelerated Proximal Recurrence\n Strategies","summary":" Spatiotemporal networks' observational capabilities are crucial for accurate\ndata gathering and informed decisions across multiple sectors. This study\nfocuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network\n(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events\nwithin defined geographical regions, enabling efficient monitoring. Using data\nfrom Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New\nOrleans, where RTCC combats rising crime amidst reduced police presence, we\naddress the network's initial observational imbalances. Aiming for uniform\nobservational efficacy, we propose the Proximal Recurrence approach. 
It\noutperformed traditional clustering methods like k-means and DBSCAN by offering\nholistic event frequency and spatial consideration, enhancing observational\ncoverage.\n","authors":["Ted Edward Holmberg","Mahdi Abdelguerfi","Elias Ioup"],"pdf_url":"https://arxiv.org/pdf/2404.14388v1.pdf","comment":"10 pages, 17 figures, 2023 IEEE International Conference on Big Data\n (BigData)"},{"id":"http://arxiv.org/abs/2404.14381v1","updated":"2024-04-22T17:36:03Z","published":"2024-04-22T17:36:03Z","title":"TAVGBench: Benchmarking Text to Audible-Video Generation","summary":" The Text to Audible-Video Generation (TAVG) task involves generating videos\nwith accompanying audio based on text descriptions. Achieving this requires\nskillful alignment of both audio and video elements. To support research in\nthis field, we have developed a comprehensive Text to Audible-Video Generation\nBenchmark (TAVGBench), which contains over 1.7 million clips with a total\nduration of 11.8 thousand hours. We propose an automatic annotation pipeline to\nensure each audible video has detailed descriptions for both its audio and\nvideo contents. We also introduce the Audio-Visual Harmoni score (AVHScore) to\nprovide a quantitative measure of the alignment between the generated audio and\nvideo modalities. Additionally, we present a baseline model for TAVG called\nTAVDiffusion, which uses a two-stream latent diffusion model to provide a\nfundamental starting point for further research in this area. We achieve the\nalignment of audio and video by employing cross-attention and contrastive\nlearning. Through extensive experiments and evaluations on TAVGBench, we\ndemonstrate the effectiveness of our proposed model under both conventional\nmetrics and our proposed metrics.\n","authors":["Yuxin Mao","Xuyang Shen","Jing Zhang","Zhen Qin","Jinxing Zhou","Mochu Xiang","Yiran Zhong","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.14381v1.pdf","comment":"Technical Report. Project\n page:https://github.com/OpenNLPLab/TAVGBench"},{"id":"http://arxiv.org/abs/2404.12547v2","updated":"2024-04-22T17:35:33Z","published":"2024-04-18T23:52:42Z","title":"Does Gaussian Splatting need SFM Initialization?","summary":" 3D Gaussian Splatting has recently been embraced as a versatile and effective\nmethod for scene reconstruction and novel view synthesis, owing to its\nhigh-quality results and compatibility with hardware rasterization. Despite its\nadvantages, Gaussian Splatting's reliance on high-quality point cloud\ninitialization by Structure-from-Motion (SFM) algorithms is a significant\nlimitation to be overcome. To this end, we investigate various initialization\nstrategies for Gaussian Splatting and delve into how volumetric reconstructions\nfrom Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on\nSFM data. 
Our findings demonstrate that random initialization can perform much\nbetter if carefully designed and that by employing a combination of improved\ninitialization strategies and structure distillation from low-cost NeRF models,\nit is possible to achieve equivalent results, or at times even superior, to\nthose obtained from SFM initialization.\n","authors":["Yalda Foroutan","Daniel Rebain","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2404.12547v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.05664v2","updated":"2024-04-22T17:28:30Z","published":"2023-12-09T20:06:29Z","title":"CoGS: Controllable Gaussian Splatting","summary":" Capturing and re-animating the 3D structure of articulated objects present\nsignificant barriers. On one hand, methods requiring extensively calibrated\nmulti-view setups are prohibitively complex and resource-intensive, limiting\ntheir practical applicability. On the other hand, while single-camera Neural\nRadiance Fields (NeRFs) offer a more streamlined approach, they have excessive\ntraining and rendering costs. 3D Gaussian Splatting would be a suitable\nalternative but for two reasons. Firstly, existing methods for 3D dynamic\nGaussians require synchronized multi-view cameras, and secondly, the lack of\ncontrollability in dynamic scenarios. We present CoGS, a method for\nControllable Gaussian Splatting, that enables the direct manipulation of scene\nelements, offering real-time control of dynamic scenes without the prerequisite\nof pre-computing control signals. We evaluated CoGS using both synthetic and\nreal-world datasets that include dynamic objects that differ in degree of\ndifficulty. In our evaluations, CoGS consistently outperformed existing dynamic\nand controllable neural representations in terms of visual fidelity.\n","authors":["Heng Yu","Joel Julin","Zoltán Á. Milacski","Koichiro Niinuma","László A. Jeni"],"pdf_url":"https://arxiv.org/pdf/2312.05664v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14368v1","updated":"2024-04-22T17:20:38Z","published":"2024-04-22T17:20:38Z","title":"Graphic Design with Large Multimodal Model","summary":" In the field of graphic design, automating the integration of design elements\ninto a cohesive multi-layered artwork not only boosts productivity but also\npaves the way for the democratization of graphic design. One existing practice\nis Graphic Layout Generation (GLG), which aims to layout sequential design\nelements. It has been constrained by the necessity for a predefined correct\nsequence of layers, thus limiting creative potential and increasing user\nworkload. In this paper, we present Hierarchical Layout Generation (HLG) as a\nmore flexible and pragmatic setup, which creates graphic composition from\nunordered sets of design elements. To tackle the HLG task, we introduce\nGraphist, the first layout generation model based on large multimodal models.\nGraphist efficiently reframes the HLG as a sequence generation problem,\nutilizing RGB-A images as input, outputs a JSON draft protocol, indicating the\ncoordinates, size, and order of each element. We develop new evaluation metrics\nfor HLG. Graphist outperforms prior arts and establishes a strong baseline for\nthis field. 
Project homepage: https://github.com/graphic-design-ai/graphist\n","authors":["Yutao Cheng","Zhao Zhang","Maoke Yang","Hui Nie","Chunyuan Li","Xinglong Wu","Jie Shao"],"pdf_url":"https://arxiv.org/pdf/2404.14368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01497v2","updated":"2024-04-22T17:13:48Z","published":"2024-03-03T12:17:49Z","title":"Learning A Physical-aware Diffusion Model Based on Transformer for\n Underwater Image Enhancement","summary":" Underwater visuals undergo various complex degradations, inevitably\ninfluencing the efficiency of underwater vision tasks. Recently, diffusion\nmodels were employed to underwater image enhancement (UIE) tasks, and gained\nSOTA performance. However, these methods fail to consider the physical\nproperties and underwater imaging mechanisms in the diffusion process, limiting\ninformation completion capacity of diffusion models. In this paper, we\nintroduce a novel UIE framework, named PA-Diff, designed to exploiting the\nknowledge of physics to guide the diffusion process.\n PA-Diff consists of Physics Prior Generation (PPG) Branch, Implicit Neural\nReconstruction (INR) Branch, and Physics-aware Diffusion Transformer (PDT)\nBranch. Our designed PPG branch aims to produce the prior knowledge of physics.\nWith utilizing the physics prior knowledge to guide the diffusion process, PDT\nbranch can obtain underwater-aware ability and model the complex distribution\nin real-world underwater scenes. INR Branch can learn robust feature\nrepresentations from diverse underwater image via implicit neural\nrepresentation, which reduces the difficulty of restoration for PDT branch.\nExtensive experiments prove that our method achieves best performance on UIE\ntasks.\n","authors":["Chen Zhao","Chenyu Dong","Weiling Cai"],"pdf_url":"https://arxiv.org/pdf/2403.01497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14351v1","updated":"2024-04-22T17:02:33Z","published":"2024-04-22T17:02:33Z","title":"Scene Coordinate Reconstruction: Posing of Image Collections via\n Incremental Learning of a Relocalizer","summary":" We address the task of estimating camera parameters from a set of images\ndepicting a scene. Popular feature-based structure-from-motion (SfM) tools\nsolve this task by incremental reconstruction: they repeat triangulation of\nsparse 3D points and registration of more camera views to the sparse point\ncloud. We re-interpret incremental structure-from-motion as an iterated\napplication and refinement of a visual relocalizer, that is, of a method that\nregisters new views to the current state of the reconstruction. This\nperspective allows us to investigate alternative visual relocalizers that are\nnot rooted in local feature matching. We show that scene coordinate regression,\na learning-based relocalization approach, allows us to build implicit, neural\nscene representations from unposed images. Different from other learning-based\nreconstruction methods, we do not require pose priors nor sequential inputs,\nand we optimize efficiently over thousands of images. Our method, ACE0 (ACE\nZero), estimates camera poses to an accuracy comparable to feature-based SfM,\nas demonstrated by novel view synthesis. 
Project page:\nhttps://nianticlabs.github.io/acezero/\n","authors":["Eric Brachmann","Jamie Wynn","Shuai Chen","Tommaso Cavallari","Áron Monszpart","Daniyar Turmukhambetov","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2404.14351v1.pdf","comment":"Project page: https://nianticlabs.github.io/acezero/"},{"id":"http://arxiv.org/abs/2404.14349v1","updated":"2024-04-22T17:00:57Z","published":"2024-04-22T17:00:57Z","title":"Automatic Discovery of Visual Circuits","summary":" To date, most discoveries of network subcomponents that implement\nhuman-interpretable computations in deep vision models have involved close\nstudy of single units and large amounts of human labor. We explore scalable\nmethods for extracting the subgraph of a vision model's computational graph\nthat underlies recognition of a specific visual concept. We introduce a new\nmethod for identifying these subgraphs: specifying a visual concept using a few\nexamples, and then tracing the interdependence of neuron activations across\nlayers, or their functional connectivity. We find that our approach extracts\ncircuits that causally affect model output, and that editing these circuits can\ndefend large pretrained models from adversarial attacks.\n","authors":["Achyuta Rajaram","Neil Chowdhury","Antonio Torralba","Jacob Andreas","Sarah Schwettmann"],"pdf_url":"https://arxiv.org/pdf/2404.14349v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.14344v1","updated":"2024-04-22T16:59:43Z","published":"2024-04-22T16:59:43Z","title":"On-the-Fly Point Annotation for Fast Medical Video Labeling","summary":" Purpose: In medical research, deep learning models rely on high-quality\nannotated data, a process often laborious and timeconsuming. This is\nparticularly true for detection tasks where bounding box annotations are\nrequired. The need to adjust two corners makes the process inherently\nframe-by-frame. Given the scarcity of experts' time, efficient annotation\nmethods suitable for clinicians are needed. Methods: We propose an on-the-fly\nmethod for live video annotation to enhance the annotation efficiency. In this\napproach, a continuous single-point annotation is maintained by keeping the\ncursor on the object in a live video, mitigating the need for tedious pausing\nand repetitive navigation inherent in traditional annotation methods. This\nnovel annotation paradigm inherits the point annotation's ability to generate\npseudo-labels using a point-to-box teacher model. We empirically evaluate this\napproach by developing a dataset and comparing on-the-fly annotation time\nagainst traditional annotation method. Results: Using our method, annotation\nspeed was 3.2x faster than the traditional annotation technique. We achieved a\nmean improvement of 6.51 +- 0.98 AP@50 over conventional method at equivalent\nannotation budgets on the developed dataset. Conclusion: Without bells and\nwhistles, our approach offers a significant speed-up in annotation tasks. It\ncan be easily implemented on any annotation platform to accelerate the\nintegration of deep learning in video-based medical research.\n","authors":["Meyer Adrien","Mazellier Jean-Paul","Jeremy Dana","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2404.14344v1.pdf","comment":"7 pages, 5 figures. 
Int J CARS (2024)"},{"id":"http://arxiv.org/abs/2404.14343v1","updated":"2024-04-22T16:58:37Z","published":"2024-04-22T16:58:37Z","title":"Heterogeneous Face Recognition Using Domain Invariant Units","summary":" Heterogeneous Face Recognition (HFR) aims to expand the applicability of Face\nRecognition (FR) systems to challenging scenarios, enabling the matching of\nface images across different domains, such as matching thermal images to\nvisible spectra. However, the development of HFR systems is challenging because\nof the significant domain gap between modalities and the lack of availability\nof large-scale paired multi-channel data. In this work, we leverage a\npretrained face recognition model as a teacher network to learn domaininvariant\nnetwork layers called Domain-Invariant Units (DIU) to reduce the domain gap.\nThe proposed DIU can be trained effectively even with a limited amount of\npaired training data, in a contrastive distillation framework. This proposed\napproach has the potential to enhance pretrained models, making them more\nadaptable to a wider range of variations in data. We extensively evaluate our\napproach on multiple challenging benchmarks, demonstrating superior performance\ncompared to state-of-the-art methods.\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.14343v1.pdf","comment":"6 pages, Accepted ICASSP 2024"},{"id":"http://arxiv.org/abs/2206.04406v2","updated":"2024-04-22T16:41:38Z","published":"2022-06-09T10:39:44Z","title":"Unsupervised Learning of the Total Variation Flow","summary":" The total variation (TV) flow generates a scale-space representation of an\nimage based on the TV functional. This gradient flow observes desirable\nfeatures for images, such as sharp edges and enables spectral, scale, and\ntexture analysis. Solving the TV flow is challenging; one reason is the the\nnon-uniqueness of the subgradients. The standard numerical approach for TV flow\nrequires solving multiple non-smooth optimisation problems. Even with\nstate-of-the-art convex optimisation techniques, this is often prohibitively\nexpensive and strongly motivates the use of alternative, faster approaches.\nInspired by and extending the framework of physics-informed neural networks\n(PINNs), we propose the TVflowNET, an unsupervised neural network approach, to\napproximate the solution of the TV flow given an initial image and a time\ninstance. The TVflowNET requires no ground truth data but rather makes use of\nthe PDE for optimisation of the network parameters. We circumvent the\nchallenges related to the non-uniqueness of the subgradients by additionally\nlearning the related diffusivity term. Our approach significantly speeds up the\ncomputation time and we show that the TVflowNET approximates the TV flow\nsolution with high fidelity for different image sizes and image types.\nAdditionally, we give a full comparison of different network architecture\ndesigns as well as training regimes to underscore the effectiveness of our\napproach.\n","authors":["Tamara G. 
Grossmann","Sören Dittmer","Yury Korolev","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2206.04406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14329v1","updated":"2024-04-22T16:40:11Z","published":"2024-04-22T16:40:11Z","title":"X-Ray: A Sequential 3D Representation for Generation","summary":" In this paper, we introduce X-Ray, an innovative approach to 3D generation\nthat employs a new sequential representation, drawing inspiration from the\ndepth-revealing capabilities of X-Ray scans to meticulously capture both the\nexternal and internal features of objects. Central to our method is the\nutilization of ray casting techniques originating from the camera's viewpoint,\nmeticulously recording the geometric and textural details encountered across\nall intersected surfaces. This process efficiently condenses complete objects\nor scenes into a multi-frame format, just like videos. Such a structure ensures\nthe 3D representation is composed solely of critical surface information.\nHighlighting the practicality and adaptability of our X-Ray representation, we\nshowcase its utility in synthesizing 3D objects, employing a network\narchitecture akin to that used in video diffusion models. The outcomes reveal\nour representation's superior performance in enhancing both the accuracy and\nefficiency of 3D synthesis, heralding new directions for ongoing research and\npractical implementations in the field.\n","authors":["Tao Hu","Wenhang Ge","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.14329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14326v1","updated":"2024-04-22T16:38:41Z","published":"2024-04-22T16:38:41Z","title":"Machine Learning Techniques for MRI Data Processing at Expanding Scale","summary":" Imaging sites around the world generate growing amounts of medical scan data\nwith ever more versatile and affordable technology. Large-scale studies acquire\nMRI for tens of thousands of participants, together with metadata ranging from\nlifestyle questionnaires to biochemical assays, genetic analyses and more.\nThese large datasets encode substantial information about human health and hold\nconsiderable potential for machine learning training and analysis. This chapter\nexamines ongoing large-scale studies and the challenge of distribution shifts\nbetween them. Transfer learning for overcoming such shifts is discussed,\ntogether with federated learning for safe access to distributed training data\nsecurely held at multiple institutions. Finally, representation learning is\nreviewed as a methodology for encoding embeddings that express abstract\nrelationships in multi-modal input formats.\n","authors":["Taro Langner"],"pdf_url":"https://arxiv.org/pdf/2404.14326v1.pdf","comment":"Book chapter pre-print"},{"id":"http://arxiv.org/abs/2404.14322v1","updated":"2024-04-22T16:33:06Z","published":"2024-04-22T16:33:06Z","title":"A Novel Approach to Chest X-ray Lung Segmentation Using U-net and\n Modified Convolutional Block Attention Module","summary":" Lung segmentation in chest X-ray images is of paramount importance as it\nplays a crucial role in the diagnosis and treatment of various lung diseases.\nThis paper presents a novel approach for lung segmentation in chest X-ray\nimages by integrating U-net with attention mechanisms. 
The proposed method\nenhances the U-net architecture by incorporating a Convolutional Block\nAttention Module (CBAM), which unifies three distinct attention mechanisms:\nchannel attention, spatial attention, and pixel attention. The channel\nattention mechanism enables the model to concentrate on the most informative\nfeatures across various channels. The spatial attention mechanism enhances the\nmodel's precision in localization by focusing on significant spatial locations.\nLastly, the pixel attention mechanism empowers the model to focus on individual\npixels, further refining the model's focus and thereby improving the accuracy\nof segmentation. The adoption of the proposed CBAM in conjunction with the\nU-net architecture marks a significant advancement in the field of medical\nimaging, with potential implications for improving diagnostic precision and\npatient outcomes. The efficacy of this method is validated against contemporary\nstate-of-the-art techniques, showcasing its superiority in segmentation\nperformance.\n","authors":["Mohammad Ali Labbaf Khaniki","Mohammad Manthouri"],"pdf_url":"https://arxiv.org/pdf/2404.14322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00816v3","updated":"2024-04-22T16:26:37Z","published":"2023-06-01T15:42:06Z","title":"Versatile Backdoor Attack with Visible, Semantic, Sample-Specific, and\n Compatible Triggers","summary":" Deep neural networks (DNNs) can be manipulated to exhibit specific behaviors\nwhen exposed to specific trigger patterns, without affecting their performance\non benign samples, dubbed \\textit{backdoor attack}. Currently, implementing\nbackdoor attacks in physical scenarios still faces significant challenges.\nPhysical attacks are labor-intensive and time-consuming, and the triggers are\nselected in a manual and heuristic way. Moreover, expanding digital attacks to\nphysical scenarios faces many challenges due to their sensitivity to visual\ndistortions and the absence of counterparts in the real world. To address these\nchallenges, we define a novel trigger called the \\textbf{V}isible,\n\\textbf{S}emantic, \\textbf{S}ample-Specific, and \\textbf{C}ompatible (VSSC)\ntrigger, to achieve effective, stealthy and robust simultaneously, which can\nalso be effectively deployed in the physical scenario using corresponding\nobjects. To implement the VSSC trigger, we propose an automated pipeline\ncomprising three modules: a trigger selection module that systematically\nidentifies suitable triggers leveraging large language models, a trigger\ninsertion module that employs generative models to seamlessly integrate\ntriggers into images, and a quality assessment module that ensures the natural\nand successful insertion of triggers through vision-language models. Extensive\nexperimental results and analysis validate the effectiveness, stealthiness, and\nrobustness of the VSSC trigger. It can not only maintain robustness under\nvisual distortions but also demonstrates strong practicality in the physical\nscenario. 
We hope that the proposed VSSC trigger and implementation approach\ncould inspire future studies on designing more practical triggers in backdoor\nattacks.\n","authors":["Ruotong Wang","Hongrui Chen","Zihao Zhu","Li Liu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2306.00816v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06687v2","updated":"2024-04-22T16:18:53Z","published":"2024-03-11T13:04:21Z","title":"Advancing Graph Neural Networks with HL-HGAT: A Hodge-Laplacian and\n Attention Mechanism Approach for Heterogeneous Graph-Structured Data","summary":" Graph neural networks (GNNs) have proven effective in capturing relationships\namong nodes in a graph. This study introduces a novel perspective by\nconsidering a graph as a simplicial complex, encompassing nodes, edges,\ntriangles, and $k$-simplices, enabling the definition of graph-structured data\non any $k$-simplices. Our contribution is the Hodge-Laplacian heterogeneous\ngraph attention network (HL-HGAT), designed to learn heterogeneous signal\nrepresentations across $k$-simplices. The HL-HGAT incorporates three key\ncomponents: HL convolutional filters (HL-filters), simplicial projection (SP),\nand simplicial attention pooling (SAP) operators, applied to $k$-simplices.\nHL-filters leverage the unique topology of $k$-simplices encoded by the\nHodge-Laplacian (HL) operator, operating within the spectral domain of the\n$k$-th HL operator. To address computation challenges, we introduce a\npolynomial approximation for HL-filters, exhibiting spatial localization\nproperties. Additionally, we propose a pooling operator to coarsen\n$k$-simplices, combining features through simplicial attention mechanisms of\nself-attention and cross-attention via transformers and SP operators, capturing\ntopological interconnections across multiple dimensions of simplices. The\nHL-HGAT is comprehensively evaluated across diverse graph applications,\nincluding NP-hard problems, graph multi-label and classification challenges,\nand graph regression tasks in logistics, computer vision, biology, chemistry,\nand neuroscience. The results demonstrate the model's efficacy and versatility\nin handling a wide range of graph-based scenarios.\n","authors":["Jinghan Huang","Qiufeng Chen","Yijun Bian","Pengli Zhu","Nanguang Chen","Moo K. Chung","Anqi Qiu"],"pdf_url":"https://arxiv.org/pdf/2403.06687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11631v2","updated":"2024-04-22T16:18:38Z","published":"2024-02-18T16:17:25Z","title":"Neuromorphic Face Analysis: a Survey","summary":" Neuromorphic sensors, also known as event cameras, are a class of imaging\ndevices mimicking the function of biological visual systems. Unlike traditional\nframe-based cameras, which capture fixed images at discrete intervals,\nneuromorphic sensors continuously generate events that represent changes in\nlight intensity or motion in the visual field with high temporal resolution and\nlow latency. These properties have proven to be interesting in modeling human\nfaces, both from an effectiveness and a privacy-preserving point of view.\nNeuromorphic face analysis however is still a raw and unstructured field of\nresearch, with several attempts at addressing different tasks with no clear\nstandard or benchmark. 
This survey paper presents a comprehensive overview of\ncapabilities, challenges and emerging applications in the domain of\nneuromorphic face analysis, to outline promising directions and open issues.\nAfter discussing the fundamental working principles of neuromorphic vision and\npresenting an in-depth overview of the related research, we explore the current\nstate of available data, standard data representations, emerging challenges,\nand limitations that require further investigation. This paper aims to\nhighlight the recent process in this evolving field to provide to both\nexperienced and newly come researchers an all-encompassing analysis of the\nstate of the art along with its problems and shortcomings.\n","authors":["Federico Becattini","Lorenzo Berlincioni","Luca Cultrera","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2402.11631v2.pdf","comment":"Submitted to Patter Recognition Letters"},{"id":"http://arxiv.org/abs/2404.14309v1","updated":"2024-04-22T16:10:38Z","published":"2024-04-22T16:10:38Z","title":"Towards Better Adversarial Purification via Adversarial Denoising\n Diffusion Training","summary":" Recently, diffusion-based purification (DBP) has emerged as a promising\napproach for defending against adversarial attacks. However, previous studies\nhave used questionable methods to evaluate the robustness of DBP models, their\nexplanations of DBP robustness also lack experimental support. We re-examine\nDBP robustness using precise gradient, and discuss the impact of stochasticity\non DBP robustness. To better explain DBP robustness, we assess DBP robustness\nunder a novel attack setting, Deterministic White-box, and pinpoint\nstochasticity as the main factor in DBP robustness. Our results suggest that\nDBP models rely on stochasticity to evade the most effective attack direction,\nrather than directly countering adversarial perturbations. To improve the\nrobustness of DBP models, we propose Adversarial Denoising Diffusion Training\n(ADDT). This technique uses Classifier-Guided Perturbation Optimization (CGPO)\nto generate adversarial perturbation through guidance from a pre-trained\nclassifier, and uses Rank-Based Gaussian Mapping (RBGM) to convert adversarial\npertubation into a normal Gaussian distribution. Empirical results show that\nADDT improves the robustness of DBP models. Further experiments confirm that\nADDT equips DBP models with the ability to directly counter adversarial\nperturbations.\n","authors":["Yiming Liu","Kezhao Liu","Yao Xiao","Ziyi Dong","Xiaogang Xu","Pengxu Wei","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.14309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14281v1","updated":"2024-04-22T15:29:28Z","published":"2024-04-22T15:29:28Z","title":"Fast and Robust Normal Estimation for Sparse LiDAR Scans","summary":" Light Detection and Ranging (LiDAR) technology has proven to be an important\npart of many robotics systems. Surface normals estimated from LiDAR data are\ncommonly used for a variety of tasks in such systems. As most of the today's\nmechanical LiDAR sensors produce sparse data, estimating normals from a single\nscan in a robust manner poses difficulties.\n In this paper, we address the problem of estimating normals for sparse LiDAR\ndata avoiding the typical issues of smoothing out the normals in high curvature\nareas.\n Mechanical LiDARs rotate a set of rigidly mounted lasers. 
One firing of such\na set of lasers produces an array of points where each point's neighbor is\nknown due to the known firing pattern of the scanner. We use this knowledge to\nconnect these points to their neighbors and label them using the angles of the\nlines connecting them. When estimating normals at these points, we only\nconsider points with the same label as neighbors. This allows us to avoid\nestimating normals in high curvature areas.\n We evaluate our approach on various data, both self-recorded and publicly\navailable, acquired using various sparse LiDAR sensors. We show that using our\nmethod for normal estimation leads to normals that are more robust in areas\nwith high curvature which leads to maps of higher quality. We also show that\nour method only incurs a constant factor runtime overhead with respect to a\nlightweight baseline normal estimation procedure and is therefore suited for\noperation in computationally demanding environments.\n","authors":["Igor Bogoslavskyi","Konstantinos Zampogiannis","Raymond Phan"],"pdf_url":"https://arxiv.org/pdf/2404.14281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14280v1","updated":"2024-04-22T15:29:19Z","published":"2024-04-22T15:29:19Z","title":"RESFM: Robust Equivariant Multiview Structure from Motion","summary":" Multiview Structure from Motion is a fundamental and challenging computer\nvision problem. A recent deep-based approach was proposed utilizing matrix\nequivariant architectures for the simultaneous recovery of camera pose and 3D\nscene structure from large image collections. This work however made the\nunrealistic assumption that the point tracks given as input are clean of\noutliers. Here we propose an architecture suited to dealing with outliers by\nadding an inlier/outlier classifying module that respects the model\nequivariance and by adding a robust bundle adjustment step. Experiments\ndemonstrate that our method can be successfully applied in realistic settings\nthat include large image collections and point tracks extracted with common\nheuristics and include many outliers.\n","authors":["Fadi Khatib","Yoni Kasten","Dror Moran","Meirav Galun","Ronen Basri"],"pdf_url":"https://arxiv.org/pdf/2404.14280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14279v1","updated":"2024-04-22T15:28:42Z","published":"2024-04-22T15:28:42Z","title":"Co-designing a Sub-millisecond Latency Event-based Eye Tracking System\n with Submanifold Sparse CNN","summary":" Eye-tracking technology is integral to numerous consumer electronics\napplications, particularly in the realm of virtual and augmented reality\n(VR/AR). These applications demand solutions that excel in three crucial\naspects: low-latency, low-power consumption, and precision. Yet, achieving\noptimal performance across all these fronts presents a formidable challenge,\nnecessitating a balance between sophisticated algorithms and efficient backend\nhardware implementations. In this study, we tackle this challenge through a\nsynergistic software/hardware co-design of the system with an event camera.\nLeveraging the inherent sparsity of event-based input data, we integrate a\nnovel sparse FPGA dataflow accelerator customized for submanifold sparse\nconvolution neural networks (SCNN). The SCNN implemented on the accelerator can\nefficiently extract the embedding feature vector from each representation of\nevent slices by only processing the non-zero activations. 
Subsequently, these\nvectors undergo further processing by a gated recurrent unit (GRU) and a fully\nconnected layer on the host CPU to generate the eye centers. Deployment and\nevaluation of our system reveal outstanding performance metrics. On the\nEvent-based Eye-Tracking-AIS2024 dataset, our system achieves 81% p5 accuracy,\n99.5% p10 accuracy, and 3.71 Mean Euclidean Distance with 0.7 ms latency while\nonly consuming 2.29 mJ per inference. Notably, our solution opens up\nopportunities for future eye-tracking systems. Code is available at\nhttps://github.com/CASR-HKU/ESDA/tree/eye_tracking.\n","authors":["Baoheng Zhang","Yizhao Gao","Jingyuan Li","Hayden Kwok-Hay So"],"pdf_url":"https://arxiv.org/pdf/2404.14279v1.pdf","comment":"Accepted to CVPR 2024 workshop, AIS: Vision, Graphics, and AI for\n Streaming"},{"id":"http://arxiv.org/abs/2312.13328v2","updated":"2024-04-22T15:05:18Z","published":"2023-12-20T17:18:44Z","title":"NeLF-Pro: Neural Light Field Probes for Multi-Scale Novel View Synthesis","summary":" We present NeLF-Pro, a novel representation to model and reconstruct light\nfields in diverse natural scenes that vary in extent and spatial granularity.\nIn contrast to previous fast reconstruction methods that represent the 3D scene\nglobally, we model the light field of a scene as a set of local light field\nfeature probes, parameterized with position and multi-channel 2D feature maps.\nOur central idea is to bake the scene's light field into spatially varying\nlearnable representations and to query point features by weighted blending of\nprobes close to the camera - allowing for mipmap representation and rendering.\nWe introduce a novel vector-matrix-matrix (VMM) factorization technique that\neffectively represents the light field feature probes as products of core\nfactors (i.e., VM) shared among local feature probes, and a basis factor (i.e.,\nM) - efficiently encoding internal relationships and patterns within the scene.\nExperimentally, we demonstrate that NeLF-Pro significantly boosts the\nperformance of feature grid-based representations, and achieves fast\nreconstruction with better rendering quality while maintaining compact\nmodeling. Project webpage https://sinoyou.github.io/nelf-pro/.\n","authors":["Zinuo You","Andreas Geiger","Anpei Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13328v2.pdf","comment":"CVPR 2024 Conference Paper, Camera Ready Version"},{"id":"http://arxiv.org/abs/2404.14249v1","updated":"2024-04-22T15:01:32Z","published":"2024-04-22T15:01:32Z","title":"CLIP-GS: CLIP-Informed Gaussian Splatting for Real-time and\n View-consistent 3D Semantic Understanding","summary":" The recent 3D Gaussian Splatting (GS) exhibits high-quality and real-time\nsynthesis of novel views in 3D scenes. Currently, it primarily focuses on\ngeometry and appearance modeling, while lacking the semantic understanding of\nscenes. To bridge this gap, we present CLIP-GS, which integrates semantics from\nContrastive Language-Image Pre-Training (CLIP) into Gaussian Splatting to\nefficiently comprehend 3D environments without annotated semantic data. In\nspecific, rather than straightforwardly learning and rendering high-dimensional\nsemantic features of 3D Gaussians, which significantly diminishes the\nefficiency, we propose a Semantic Attribute Compactness (SAC) approach. SAC\nexploits the inherent unified semantics within objects to learn compact yet\neffective semantic representations of 3D Gaussians, enabling highly efficient\nrendering (>100 FPS). 
Additionally, to address the semantic ambiguity, caused\nby utilizing view-inconsistent 2D CLIP semantics to supervise Gaussians, we\nintroduce a 3D Coherent Self-training (3DCS) strategy, resorting to the\nmulti-view consistency originated from the 3D model. 3DCS imposes cross-view\nsemantic consistency constraints by leveraging refined, self-predicted\npseudo-labels derived from the trained 3D Gaussian model, thereby enhancing\nprecise and view-consistent segmentation results. Extensive experiments\ndemonstrate that our method remarkably outperforms existing state-of-the-art\napproaches, achieving improvements of 17.29% and 20.81% in mIoU metric on\nReplica and ScanNet datasets, respectively, while maintaining real-time\nrendering speed. Furthermore, our approach exhibits superior performance even\nwith sparse input data, verifying the robustness of our method.\n","authors":["Guibiao Liao","Jiankun Li","Zhenyu Bao","Xiaoqing Ye","Jingdong Wang","Qing Li","Kanglin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.14249v1.pdf","comment":"https://github.com/gbliao/CLIP-GS"},{"id":"http://arxiv.org/abs/2404.14248v1","updated":"2024-04-22T15:01:12Z","published":"2024-04-22T15:01:12Z","title":"NTIRE 2024 Challenge on Low Light Image Enhancement: Methods and Results","summary":" This paper reviews the NTIRE 2024 low light image enhancement challenge,\nhighlighting the proposed solutions and results. The aim of this challenge is\nto discover an effective network design or solution capable of generating\nbrighter, clearer, and visually appealing results when dealing with a variety\nof conditions, including ultra-high resolution (4K and beyond), non-uniform\nillumination, backlighting, extreme darkness, and night scenes. A notable total\nof 428 participants registered for the challenge, with 22 teams ultimately\nmaking valid submissions. This paper meticulously evaluates the\nstate-of-the-art advancements in enhancing low-light images, reflecting the\nsignificant progress and creativity in this field.\n","authors":["Xiaoning Liu","Zongwei Wu","Ao Li","Florin-Alexandru Vasluianu","Yulun Zhang","Shuhang Gu","Le Zhang","Ce Zhu","Radu Timofte","Zhi Jin","Hongjun Wu","Chenxi Wang","Haitao Ling","Yuanhao Cai","Hao Bian","Yuxin Zheng","Jing Lin","Alan Yuille","Ben Shao","Jin Guo","Tianli Liu","Mohao Wu","Yixu Feng","Shuo Hou","Haotian Lin","Yu Zhu","Peng Wu","Wei Dong","Jinqiu Sun","Yanning Zhang","Qingsen Yan","Wenbin Zou","Weipeng Yang","Yunxiang Li","Qiaomu Wei","Tian Ye","Sixiang Chen","Zhao Zhang","Suiyi Zhao","Bo Wang","Yan Luo","Zhichao Zuo","Mingshen Wang","Junhu Wang","Yanyan Wei","Xiaopeng Sun","Yu Gao","Jiancheng Huang","Hongming Chen","Xiang Chen","Hui Tang","Yuanbin Chen","Yuanbo Zhou","Xinwei Dai","Xintao Qiu","Wei Deng","Qinquan Gao","Tong Tong","Mingjia Li","Jin Hu","Xinyu He","Xiaojie Guo"," Sabarinathan","K Uma","A Sasithradevi","B Sathya Bama","S. Mohamed Mansoor Roomi","V. Srivatsav","Jinjuan Wang","Long Sun","Qiuying Chen","Jiahong Shao","Yizhi Zhang","Marcos V. Conde","Daniel Feijoo","Juan C. 
Benito","Alvaro García","Jaeho Lee","Seongwan Kim","Sharif S M A","Nodirkhuja Khujaev","Roman Tsoy","Ali Murtaza","Uswah Khairuddin","Ahmad 'Athif Mohd Faudzi","Sampada Malagi","Amogh Joshi","Nikhil Akalwadi","Chaitra Desai","Ramesh Ashok Tabib","Uma Mudenagudi","Wenyi Lian","Wenjing Lian","Jagadeesh Kalyanshetti","Vijayalaxmi Ashok Aralikatti","Palani Yashaswini","Nitish Upasi","Dikshit Hegde","Ujwala Patil","Sujata C","Xingzhuo Yan","Wei Hao","Minghan Fu","Pooja choksy","Anjali Sarvaiya","Kishor Upla","Kiran Raja","Hailong Yan","Yunkai Zhang","Baiang Li","Jingyi Zhang","Huan Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14248v1.pdf","comment":"NTIRE 2024 Challenge Report"},{"id":"http://arxiv.org/abs/2404.14247v1","updated":"2024-04-22T15:00:51Z","published":"2024-04-22T15:00:51Z","title":"From Modalities to Styles: Rethinking the Domain Gap in Heterogeneous\n Face Recognition","summary":" Heterogeneous Face Recognition (HFR) focuses on matching faces from different\ndomains, for instance, thermal to visible images, making Face Recognition (FR)\nsystems more versatile for challenging scenarios. However, the domain gap\nbetween these domains and the limited large-scale datasets in the target HFR\nmodalities make it challenging to develop robust HFR models from scratch. In\nour work, we view different modalities as distinct styles and propose a method\nto modulate feature maps of the target modality to address the domain gap. We\npresent a new Conditional Adaptive Instance Modulation (CAIM ) module that\nseamlessly fits into existing FR networks, turning them into HFR-ready systems.\nThe CAIM block modulates intermediate feature maps, efficiently adapting to the\nstyle of the source modality and bridging the domain gap. Our method enables\nend-to-end training using a small set of paired samples. We extensively\nevaluate the proposed approach on various challenging HFR benchmarks, showing\nthat it outperforms state-of-the-art methods. The source code and protocols for\nreproducing the findings will be made publicly available\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2404.14247v1.pdf","comment":"Accepted for publication in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.14241v1","updated":"2024-04-22T14:53:27Z","published":"2024-04-22T14:53:27Z","title":"UrbanCross: Enhancing Satellite Image-Text Retrieval with Cross-Domain\n Adaptation","summary":" Urbanization challenges underscore the necessity for effective satellite\nimage-text retrieval methods to swiftly access specific information enriched\nwith geographic semantics for urban applications. However, existing methods\noften overlook significant domain gaps across diverse urban landscapes,\nprimarily focusing on enhancing retrieval performance within single domains. To\ntackle this issue, we present UrbanCross, a new framework for cross-domain\nsatellite image-text retrieval. UrbanCross leverages a high-quality,\ncross-domain dataset enriched with extensive geo-tags from three countries to\nhighlight domain diversity. It employs the Large Multimodal Model (LMM) for\ntextual refinement and the Segment Anything Model (SAM) for visual\naugmentation, achieving a fine-grained alignment of images, segments and texts,\nyielding a 10% improvement in retrieval performance. Additionally, UrbanCross\nincorporates an adaptive curriculum-based source sampler and a weighted\nadversarial cross-domain fine-tuning module, progressively enhancing\nadaptability across various domains. 
Extensive experiments confirm UrbanCross's\nsuperior efficiency in retrieval and adaptation to new urban environments,\ndemonstrating an average performance increase of 15% over its version without\ndomain adaptation mechanisms, effectively bridging the domain gap.\n","authors":["Siru Zhong","Xixuan Hao","Yibo Yan","Ying Zhang","Yangqiu Song","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2404.14241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18985v2","updated":"2024-04-22T14:49:36Z","published":"2024-03-27T20:07:39Z","title":"Robustness and Visual Explanation for Black Box Image, Video, and ECG\n Signal Classification with Reinforcement Learning","summary":" We present a generic Reinforcement Learning (RL) framework optimized for\ncrafting adversarial attacks on different model types spanning from ECG signal\nanalysis (1D), image classification (2D), and video classification (3D). The\nframework focuses on identifying sensitive regions and inducing\nmisclassifications with minimal distortions and various distortion types. The\nnovel RL method outperforms state-of-the-art methods for all three\napplications, proving its efficiency. Our RL approach produces superior\nlocalization masks, enhancing interpretability for image classification and ECG\nanalysis models. For applications such as ECG analysis, our platform highlights\ncritical ECG segments for clinicians while ensuring resilience against\nprevalent distortions. This comprehensive tool aims to bolster both resilience\nwith adversarial training and transparency across varied applications and data\ntypes.\n","authors":["Soumyendu Sarkar","Ashwin Ramesh Babu","Sajad Mousavi","Vineet Gundecha","Avisek Naug","Sahand Ghorbanpour"],"pdf_url":"https://arxiv.org/pdf/2403.18985v2.pdf","comment":"AAAI Proceedings reference:\n https://ojs.aaai.org/index.php/AAAI/article/view/30579"},{"id":"http://arxiv.org/abs/2404.14239v1","updated":"2024-04-22T14:47:54Z","published":"2024-04-22T14:47:54Z","title":"MultiBooth: Towards Generating All Your Concepts in an Image from Text","summary":" This paper introduces MultiBooth, a novel and efficient technique for\nmulti-concept customization in image generation from text. Despite the\nsignificant advancements in customized generation methods, particularly with\nthe success of diffusion models, existing methods often struggle with\nmulti-concept scenarios due to low concept fidelity and high inference cost.\nMultiBooth addresses these issues by dividing the multi-concept generation\nprocess into two phases: a single-concept learning phase and a multi-concept\nintegration phase. During the single-concept learning phase, we employ a\nmulti-modal image encoder and an efficient concept encoding technique to learn\na concise and discriminative representation for each concept. In the\nmulti-concept integration phase, we use bounding boxes to define the generation\narea for each concept within the cross-attention map. This method enables the\ncreation of individual concepts within their specified regions, thereby\nfacilitating the formation of multi-concept images. 
This strategy not only\nimproves concept fidelity but also reduces additional inference cost.\nMultiBooth surpasses various baselines in both qualitative and quantitative\nevaluations, showcasing its superior performance and computational efficiency.\nProject Page: https://multibooth.github.io/\n","authors":["Chenyang Zhu","Kai Li","Yue Ma","Chunming He","Li Xiu"],"pdf_url":"https://arxiv.org/pdf/2404.14239v1.pdf","comment":"Project Page: https://multibooth.github.io/ . Github Page:\n https://github.com/chenyangzhu1/MultiBooth"},{"id":"http://arxiv.org/abs/2404.14233v1","updated":"2024-04-22T14:46:10Z","published":"2024-04-22T14:46:10Z","title":"Detecting and Mitigating Hallucination in Large Vision Language Models\n via Fine-Grained AI Feedback","summary":" The rapidly developing Large Vision Language Models (LVLMs) have shown\nnotable capabilities on a range of multi-modal tasks, but still face the\nhallucination phenomena where the generated texts do not align with the given\ncontexts, significantly restricting the usages of LVLMs. Most previous work\ndetects and mitigates hallucination at the coarse-grained level or requires\nexpensive annotation (e.g., labeling by proprietary models or human experts).\nTo address these issues, we propose detecting and mitigating hallucinations in\nLVLMs via fine-grained AI feedback. The basic idea is that we generate a\nsmall-size sentence-level hallucination annotation dataset by proprietary\nmodels, whereby we train a hallucination detection model which can perform\nsentence-level hallucination detection, covering primary hallucination types\n(i.e., object, attribute, and relationship). Then, we propose a\ndetect-then-rewrite pipeline to automatically construct preference dataset for\ntraining hallucination mitigating model. Furthermore, we propose\ndifferentiating the severity of hallucinations, and introducing a Hallucination\nSeverity-Aware Direct Preference Optimization (HSA-DPO) for mitigating\nhallucination in LVLMs by incorporating the severity of hallucinations into\npreference learning. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Wenyi Xiao","Ziwei Huang","Leilei Gan","Wanggui He","Haoyuan Li","Zhelun Yu","Hao Jiang","Fei Wu","Linchao Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.14233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16368v2","updated":"2024-04-22T14:44:45Z","published":"2024-02-26T07:45:14Z","title":"SPINEPS -- Automatic Whole Spine Segmentation of T2-weighted MR images\n using a Two-Phase Approach to Multi-class Semantic and Instance Segmentation","summary":" Purpose. To present SPINEPS, an open-source deep learning approach for\nsemantic and instance segmentation of 14 spinal structures (ten vertebra\nsubstructures, intervertebral discs, spinal cord, spinal canal, and sacrum) in\nwhole body T2w MRI.\n Methods. During this HIPPA-compliant, retrospective study, we utilized the\npublic SPIDER dataset (218 subjects, 63% female) and a subset of the German\nNational Cohort (1423 subjects, mean age 53, 49% female) for training and\nevaluation. We combined CT and T2w segmentations to train models that segment\n14 spinal structures in T2w sagittal scans both semantically and instance-wise.\nPerformance evaluation metrics included Dice similarity coefficient, average\nsymmetrical surface distance, panoptic quality, segmentation quality, and\nrecognition quality. Statistical significance was assessed using the Wilcoxon\nsigned-rank test. 
An in-house dataset was used to qualitatively evaluate\nout-of-distribution samples.\n Results. On the public dataset, our approach outperformed the baseline\n(instance-wise vertebra dice score 0.929 vs. 0.907, p-value<0.001). Training on\nauto-generated annotations and evaluating on manually corrected test data from\nthe GNC yielded global dice scores of 0.900 for vertebrae, 0.960 for\nintervertebral discs, and 0.947 for the spinal canal. Incorporating the SPIDER\ndataset during training increased these scores to 0.920, 0.967, 0.958,\nrespectively.\n Conclusions. The proposed segmentation approach offers robust segmentation of\n14 spinal structures in T2w sagittal images, including the spinal cord, spinal\ncanal, intervertebral discs, endplate, sacrum, and vertebrae. The approach\nyields both a semantic and instance mask as output, thus being easy to utilize.\nThis marks the first publicly available algorithm for whole spine segmentation\nin sagittal T2w MR imaging.\n","authors":["Hendrik Möller","Robert Graf","Joachim Schmitt","Benjamin Keinert","Matan Atad","Anjany Sekuboyina","Felix Streckenbach","Hanna Schön","Florian Kofler","Thomas Kroencke","Stefanie Bette","Stefan Willich","Thomas Keil","Thoralf Niendorf","Tobias Pischon","Beate Endemann","Bjoern Menze","Daniel Rueckert","Jan S. Kirschke"],"pdf_url":"https://arxiv.org/pdf/2402.16368v2.pdf","comment":"https://github.com/Hendrik-code/spineps"},{"id":"http://arxiv.org/abs/2404.00257v2","updated":"2024-04-22T14:38:25Z","published":"2024-03-30T06:17:39Z","title":"YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel\n Class Discovery","summary":" Because of its use in practice, open-world object detection (OWOD) has gotten\na lot of attention recently. The challenge is how can a model detect novel\nclasses and then incrementally learn them without forgetting previously known\nclasses. Previous approaches hinge on strongly-supervised or weakly-supervised\nnovel-class data for novel-class detection, which may not apply to real\napplications. We construct a new benchmark that novel classes are only\nencountered at the inference stage. And we propose a new OWOD detector YOLOOC,\nbased on the YOLO architecture yet for the Open-Class setup. We introduce label\nsmoothing to prevent the detector from over-confidently mapping novel classes\nto known classes and to discover novel classes. Extensive experiments conducted\non our more realistic setup demonstrate the effectiveness of our method for\ndiscovering novel classes in our new benchmark.\n","authors":["Qian Wan","Xiang Xiang","Qinhao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.00257v2.pdf","comment":"Withdrawn because it was submitted without consent of the first\n author. In addition, this submission has some errors"},{"id":"http://arxiv.org/abs/2207.04934v3","updated":"2024-04-22T14:11:18Z","published":"2022-07-11T15:15:33Z","title":"Multilevel Geometric Optimization for Regularised Constrained Linear\n Inverse Problems","summary":" We present a geometric multilevel optimization approach that smoothly\nincorporates box constraints. Given a box constrained optimization problem, we\nconsider a hierarchy of models with varying discretization levels. Finer models\nare accurate but expensive to compute, while coarser models are less accurate\nbut cheaper to compute. When working at the fine level, multilevel optimisation\ncomputes the search direction based on a coarser model which speeds up updates\nat the fine level. 
Moreover, exploiting geometry induced by the hierarchy the\nfeasibility of the updates is preserved. In particular, our approach extends\nclassical components of multigrid methods like restriction and prolongation to\nthe Riemannian structure of our constraints.\n","authors":["Sebastian Müller","Stefania Petra","Matthias Zisler"],"pdf_url":"https://arxiv.org/pdf/2207.04934v3.pdf","comment":"25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.14199v1","updated":"2024-04-22T14:09:53Z","published":"2024-04-22T14:09:53Z","title":"Generalizable Neural Human Renderer","summary":" While recent advancements in animatable human rendering have achieved\nremarkable results, they require test-time optimization for each subject which\ncan be a significant limitation for real-world applications. To address this,\nwe tackle the challenging task of learning a Generalizable Neural Human\nRenderer (GNH), a novel method for rendering animatable humans from monocular\nvideo without any test-time optimization. Our core method focuses on\ntransferring appearance information from the input video to the output image\nplane by utilizing explicit body priors and multi-view geometry. To render the\nsubject in the intended pose, we utilize a straightforward CNN-based image\nrenderer, foregoing the more common ray-sampling or rasterizing-based rendering\nmodules. Our GNH achieves remarkable generalizable, photorealistic rendering\nwith unseen subjects with a three-stage process. We quantitatively and\nqualitatively demonstrate that GNH significantly surpasses current\nstate-of-the-art methods, notably achieving a 31.3% improvement in LPIPS.\n","authors":["Mana Masuda","Jinhyung Park","Shun Iwase","Rawal Khirodkar","Kris Kitani"],"pdf_url":"https://arxiv.org/pdf/2404.14199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14198v1","updated":"2024-04-22T14:07:42Z","published":"2024-04-22T14:07:42Z","title":"BCFPL: Binary classification ConvNet based Fast Parking space\n recognition with Low resolution image","summary":" The automobile plays an important role in the economic activities of mankind,\nespecially in the metropolis. Under the circumstances, the demand of quick\nsearch for available parking spaces has become a major concern for the\nautomobile drivers. Meanwhile, the public sense of privacy is also awaking, the\nimage-based parking space recognition methods lack the attention of privacy\nprotection. In this paper, we proposed a binary convolutional neural network\nwith lightweight design structure named BCFPL, which can be used to train with\nlow-resolution parking space images and offer a reasonable recognition result.\nThe images of parking space were collected from various complex environments,\nincluding different weather, occlusion conditions, and various camera angles.\nWe conducted the training and testing progresses among different datasets and\npartial subsets. The experimental results show that the accuracy of BCFPL does\nnot decrease compared with the original resolution image directly, and can\nreach the average level of the existing mainstream method. 
BCFPL also has low\nhardware requirements and fast recognition speed while meeting the privacy\nrequirements, so it has application potential in intelligent city construction\nand automatic driving field.\n","authors":["Shuo Zhang","Xin Chen","Zixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04661v3","updated":"2024-04-22T14:04:55Z","published":"2024-03-07T17:07:51Z","title":"Dynamic Cross Attention for Audio-Visual Person Verification","summary":" Although person or identity verification has been predominantly explored\nusing individual modalities such as face and voice, audio-visual fusion has\nrecently shown immense potential to outperform unimodal approaches. Audio and\nvisual modalities are often expected to pose strong complementary\nrelationships, which plays a crucial role in effective audio-visual fusion.\nHowever, they may not always strongly complement each other, they may also\nexhibit weak complementary relationships, resulting in poor audio-visual\nfeature representations. In this paper, we propose a Dynamic Cross-Attention\n(DCA) model that can dynamically select the cross-attended or unattended\nfeatures on the fly based on the strong or weak complementary relationships,\nrespectively, across audio and visual modalities. In particular, a conditional\ngating layer is designed to evaluate the contribution of the cross-attention\nmechanism and choose cross-attended features only when they exhibit strong\ncomplementary relationships, otherwise unattended features. Extensive\nexperiments are conducted on the Voxceleb1 dataset to demonstrate the\nrobustness of the proposed model. Results indicate that the proposed model\nconsistently improves the performance on multiple variants of cross-attention\nwhile outperforming the state-of-the-art methods.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.04661v3.pdf","comment":"Accepted to FG2024"},{"id":"http://arxiv.org/abs/2404.07600v2","updated":"2024-04-22T13:49:54Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises an implicit language\nguidance branch and an explicit language guidance branch. The implicit branch\nemploys frozen CLIP image encoder to directly generate implicit text embeddings\nthat are fed to diffusion model, without using explicit text prompts. The\nexplicit branch utilizes the ground-truth labels of corresponding images as\ntext prompts to condition feature extraction of diffusion model. During\ntraining, we jointly train diffusion model by sharing the model weights of\nthese two branches. As a result, implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. 
Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP has the mIoU$^\\text{ss}$ score of 55.9% on\nAD20K validation set, which outperforms the baseline method VPD by 2.2%. For\ndepth estimation, our IEDP outperforms the baseline method VPD with a relative\ngain of 11.0%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14177v1","updated":"2024-04-22T13:49:42Z","published":"2024-04-22T13:49:42Z","title":"Face2Face: Label-driven Facial Retouching Restoration","summary":" With the popularity of social media platforms such as Instagram and TikTok,\nand the widespread availability and convenience of retouching tools, an\nincreasing number of individuals are utilizing these tools to beautify their\nfacial photographs. This poses challenges for fields that place high demands on\nthe authenticity of photographs, such as identity verification and social\nmedia. By altering facial images, users can easily create deceptive images,\nleading to the dissemination of false information. This may pose challenges to\nthe reliability of identity verification systems and social media, and even\nlead to online fraud. To address this issue, some work has proposed makeup\nremoval methods, but they still lack the ability to restore images involving\ngeometric deformations caused by retouching. To tackle the problem of facial\nretouching restoration, we propose a framework, dubbed Face2Face, which\nconsists of three components: a facial retouching detector, an image\nrestoration model named FaceR, and a color correction module called\nHierarchical Adaptive Instance Normalization (H-AdaIN). Firstly, the facial\nretouching detector predicts a retouching label containing three integers,\nindicating the retouching methods and their corresponding degrees. Then FaceR\nrestores the retouched image based on the predicted retouching label. Finally,\nH-AdaIN is applied to address the issue of color shift arising from diffusion\nmodels. Extensive experiments demonstrate the effectiveness of our framework\nand each module.\n","authors":["Guanhua Zhao","Yu Gu","Xuhan Sheng","Yujie Hu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13534v2","updated":"2024-04-22T13:30:40Z","published":"2023-12-21T02:28:41Z","title":"SE(3)-Equivariant and Noise-Invariant 3D Rigid Motion Tracking in Brain\n MRI","summary":" Rigid motion tracking is paramount in many medical imaging applications where\nmovements need to be detected, corrected, or accounted for. Modern strategies\nrely on convolutional neural networks (CNN) and pose this problem as rigid\nregistration. Yet, CNNs do not exploit natural symmetries in this task, as they\nare equivariant to translations (their outputs shift with their inputs) but not\nto rotations. Here we propose EquiTrack, the first method that uses recent\nsteerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable\nE-CNNs can extract corresponding features across different poses, testing them\non noisy medical images reveals that they do not have enough learning capacity\nto learn noise invariance. Thus, we introduce a hybrid architecture that pairs\na denoiser with an E-CNN to decouple the processing of anatomically irrelevant\nintensity features from the extraction of equivariant spatial features. Rigid\ntransforms are then estimated in closed-form. 
EquiTrack outperforms\nstate-of-the-art learning and optimisation methods for motion tracking in adult\nbrain MRI and fetal MRI time series. Our code is available at\nhttps://github.com/BBillot/EquiTrack.\n","authors":["Benjamin Billot","Neel Dey","Daniel Moyer","Malte Hoffmann","Esra Abaci Turk","Borjan Gagoski","Ellen Grant","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2312.13534v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.14162v1","updated":"2024-04-22T13:21:09Z","published":"2024-04-22T13:21:09Z","title":"FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on","summary":" Despite their impressive generative performance, latent diffusion model-based\nvirtual try-on (VTON) methods lack faithfulness to crucial details of the\nclothes, such as style, pattern, and text. To alleviate these issues caused by\nthe diffusion stochastic nature and latent supervision, we propose a novel\nFaithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves\nthe conventional latent diffusion process in three major aspects. First, we\npropose incorporating warped clothes as both the starting point and local\ncondition, supplying the model with faithful clothes priors. Second, we\nintroduce a novel clothes flattening network to constrain generated try-on\nimages, providing clothes-consistent faithful supervision. Third, we devise a\nclothes-posterior sampling for faithful inference, further enhancing the model\nperformance over conventional clothes-agnostic Gaussian sampling. Extensive\nexperimental results on the benchmark VITON-HD and Dress Code datasets\ndemonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is\nable to generate photo-realistic try-on images with faithful clothing details.\n","authors":["Chenhui Wang","Tao Chen","Zhihao Chen","Zhizhong Huang","Taoran Jiang","Qi Wang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14162v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2312.02567v2","updated":"2024-04-22T13:11:56Z","published":"2023-12-05T08:32:27Z","title":"Think Twice Before Selection: Federated Evidential Active Learning for\n Medical Image Analysis with Domain Shifts","summary":" Federated learning facilitates the collaborative learning of a global model\nacross multiple distributed medical institutions without centralizing data.\nNevertheless, the expensive cost of annotation on local clients remains an\nobstacle to effectively utilizing local data. To mitigate this issue, federated\nactive learning methods suggest leveraging local and global model predictions\nto select a relatively small amount of informative local data for annotation.\nHowever, existing methods mainly focus on all local data sampled from the same\ndomain, making them unreliable in realistic medical scenarios with domain\nshifts among different clients. In this paper, we make the first attempt to\nassess the informativeness of local data derived from diverse domains and\npropose a novel methodology termed Federated Evidential Active Learning (FEAL)\nto calibrate the data evaluation under domain shift. Specifically, we introduce\na Dirichlet prior distribution in both local and global models to treat the\nprediction as a distribution over the probability simplex and capture both\naleatoric and epistemic uncertainties by using the Dirichlet-based evidential\nmodel. Then we employ the epistemic uncertainty to calibrate the aleatoric\nuncertainty. 
Afterward, we design a diversity relaxation strategy to reduce\ndata redundancy and maintain data diversity. Extensive experiments and analysis\non five real multi-center medical image datasets demonstrate the superiority of\nFEAL over the state-of-the-art active learning methods in federated scenarios\nwith domain shifts. The code will be available at\nhttps://github.com/JiayiChen815/FEAL.\n","authors":["Jiayi Chen","Benteng Ma","Hengfei Cui","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2312.02567v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14135v1","updated":"2024-04-22T12:39:12Z","published":"2024-04-22T12:39:12Z","title":"Text in the Dark: Extremely Low-Light Text Image Enhancement","summary":" Extremely low-light text images are common in natural scenes, making scene\ntext detection and recognition challenging. One solution is to enhance these\nimages using low-light image enhancement methods before text extraction.\nHowever, previous methods often do not try to particularly address the\nsignificance of low-level features, which are crucial for optimal performance\non downstream scene text tasks. Further research is also hindered by the lack\nof extremely low-light text datasets. To address these limitations, we propose\na novel encoder-decoder framework with an edge-aware attention module to focus\non scene text regions during enhancement. Our proposed method uses novel text\ndetection and edge reconstruction losses to emphasize low-level scene text\nfeatures, leading to successful text extraction. Additionally, we present a\nSupervised Deep Curve Estimation (Supervised-DCE) model to synthesize extremely\nlow-light images based on publicly available scene text datasets such as\nICDAR15 (IC15). We also labeled texts in the extremely low-light See In the\nDark (SID) and ordinary LOw-Light (LOL) datasets to allow for objective\nassessment of extremely low-light image enhancement through scene text tasks.\nExtensive experiments show that our model outperforms state-of-the-art methods\nin terms of both image quality and scene text metrics on the widely-used LOL,\nSID, and synthetic IC15 datasets. Code and dataset will be released publicly at\nhttps://github.com/chunchet-ng/Text-in-the-Dark.\n","authors":["Che-Tsung Lin","Chun Chet Ng","Zhi Qin Tan","Wan Jun Nah","Xinyu Wang","Jie Long Kew","Pohao Hsu","Shang Hong Lai","Chee Seng Chan","Christopher Zach"],"pdf_url":"https://arxiv.org/pdf/2404.14135v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2404.14132v1","updated":"2024-04-22T12:33:18Z","published":"2024-04-22T12:33:18Z","title":"CRNet: A Detail-Preserving Network for Unified Image Restoration and\n Enhancement Task","summary":" In real-world scenarios, images captured often suffer from blurring, noise,\nand other forms of image degradation, and due to sensor limitations, people\nusually can only obtain low dynamic range images. To achieve high-quality\nimages, researchers have attempted various image restoration and enhancement\noperations on photographs, including denoising, deblurring, and high dynamic\nrange imaging. However, merely performing a single type of image enhancement\nstill cannot yield satisfactory images. In this paper, to deal with the\nchallenge above, we propose the Composite Refinement Network (CRNet) to address\nthis issue using multiple exposure images. By fully integrating\ninformation-rich multiple exposure inputs, CRNet can perform unified image\nrestoration and enhancement. 
To improve the quality of image details, CRNet\nexplicitly separates and strengthens high and low-frequency information through\npooling layers, using specially designed Multi-Branch Blocks for effective\nfusion of these frequencies. To increase the receptive field and fully\nintegrate input features, CRNet employs the High-Frequency Enhancement Module,\nwhich includes large kernel convolutions and an inverted bottleneck ConvFFN.\nOur model secured third place in the first track of the Bracketing Image\nRestoration and Enhancement Challenge, surpassing previous SOTA models in both\ntesting metrics and visual quality.\n","authors":["Kangzhen Yang","Tao Hu","Kexin Dai","Genggeng Chen","Yu Cao","Wei Dong","Peng Wu","Yanning Zhang","Qingsen Yan"],"pdf_url":"https://arxiv.org/pdf/2404.14132v1.pdf","comment":"This paper is accepted by CVPR2024 Workshop, Code:\n https://github.com/CalvinYang0/CRNet"},{"id":"http://arxiv.org/abs/2404.14117v1","updated":"2024-04-22T12:07:10Z","published":"2024-04-22T12:07:10Z","title":"Hierarchical localization with panoramic views and triplet loss\n functions","summary":" The main objective of this paper is to address the mobile robot localization\nproblem with Triplet Convolutional Neural Networks and test their robustness\nagainst changes of the lighting conditions. We have used omnidirectional images\nfrom real indoor environments captured in dynamic conditions that have been\nconverted to panoramic format. Two approaches are proposed to address\nlocalization by means of triplet neural networks. First, hierarchical\nlocalization, which consists in estimating the robot position in two stages: a\ncoarse localization, which involves a room retrieval task, and a fine\nlocalization is addressed by means of image retrieval in the previously\nselected room. Second, global localization, which consists in estimating the\nposition of the robot inside the entire map in a unique step. Besides, an\nexhaustive study of the loss function influence on the network learning process\nhas been made. The experimental section proves that triplet neural networks are\nan efficient and robust tool to address the localization of mobile robots in\nindoor environments, considering real operation conditions.\n","authors":["Marcos Alfaro","Juan José Cabrera","Luis Miguel Jiménez","Óscar Reinoso","Luis Payá"],"pdf_url":"https://arxiv.org/pdf/2404.14117v1.pdf","comment":"This work has been submitted to the Artificial Intelligence Journal\n (Ed. Elsevier) for possible publication. Copyright may be transferred without\n notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2404.14109v1","updated":"2024-04-22T11:52:40Z","published":"2024-04-22T11:52:40Z","title":"CKD: Contrastive Knowledge Distillation from A Sample-wise Perspective","summary":" In this paper, we present a simple yet effective contrastive knowledge\ndistillation approach, which can be formulated as a sample-wise alignment\nproblem with intra- and inter-sample constraints. Unlike traditional knowledge\ndistillation methods that concentrate on maximizing feature similarities or\npreserving class-wise semantic correlations between teacher and student\nfeatures, our method attempts to recover the \"dark knowledge\" by aligning\nsample-wise teacher and student logits. Specifically, our method first\nminimizes logit differences within the same sample by considering their\nnumerical values, thus preserving intra-sample similarities. 
Next, we bridge\nsemantic disparities by leveraging dissimilarities across different samples.\nNote that constraints on intra-sample similarities and inter-sample\ndissimilarities can be efficiently and effectively reformulated into a\ncontrastive learning framework with newly designed positive and negative pairs.\nThe positive pair consists of the teacher's and student's logits derived from\nan identical sample, while the negative pairs are formed by using logits from\ndifferent samples. With this formulation, our method benefits from the\nsimplicity and efficiency of contrastive learning through the optimization of\nInfoNCE, yielding a run-time complexity that is far less than $O(n^2)$, where\n$n$ represents the total number of training samples. Furthermore, our method\ncan eliminate the need for hyperparameter tuning, particularly related to\ntemperature parameters and large batch sizes. We conduct comprehensive\nexperiments on three datasets including CIFAR-100, ImageNet-1K, and MS COCO.\nExperimental results clearly confirm the effectiveness of the proposed method\non both image classification and object detection tasks. Our source codes will\nbe publicly available at https://github.com/wencheng-zhu/CKD.\n","authors":["Wencheng Zhu","Xin Zhou","Pengfei Zhu","Yu Wang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2404.14109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14099v1","updated":"2024-04-22T11:37:35Z","published":"2024-04-22T11:37:35Z","title":"DynaMMo: Dynamic Model Merging for Efficient Class Incremental Learning\n for Medical Images","summary":" Continual learning, the ability to acquire knowledge from new data while\nretaining previously learned information, is a fundamental challenge in machine\nlearning. Various approaches, including memory replay, knowledge distillation,\nmodel regularization, and dynamic network expansion, have been proposed to\naddress this issue. Thus far, dynamic network expansion methods have achieved\nstate-of-the-art performance at the cost of incurring significant computational\noverhead. This is due to the need for additional model buffers, which makes it\nless feasible in resource-constrained settings, particularly in the medical\ndomain. To overcome this challenge, we propose Dynamic Model Merging, DynaMMo,\na method that merges multiple networks at different stages of model training to\nachieve better computational efficiency. Specifically, we employ lightweight\nlearnable modules for each task and combine them into a unified model to\nminimize computational overhead. DynaMMo achieves this without compromising\nperformance, offering a cost-effective solution for continual learning in\nmedical applications. We evaluate DynaMMo on three publicly available datasets,\ndemonstrating its effectiveness compared to existing approaches. DynaMMo offers\naround 10-fold reduction in GFLOPS with a small drop of 2.76 in average\naccuracy when compared to state-of-the-art dynamic-based approaches. 
The code\nimplementation of this work will be available upon the acceptance of this work\nat https://github.com/BioMedIA-MBZUAI/DynaMMo.\n","authors":["Mohammad Areeb Qazi","Ibrahim Almakky","Anees Ur Rehman Hashmi","Santosh Sanjeev","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2404.14099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02611v2","updated":"2024-04-22T11:15:46Z","published":"2024-03-05T02:59:35Z","title":"A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid\n Transformer and Contrastive Learning","summary":" Defocus blur is a persistent problem in microscope imaging that poses harm to\npathology interpretation and medical intervention in cell microscopy and\nmicroscope surgery. To address this problem, a unified framework including the\nmulti-pyramid transformer (MPT) and extended frequency contrastive\nregularization (EFCR) is proposed to tackle two outstanding challenges in\nmicroscopy deblur: longer attention span and data deficiency. The MPT employs\nan explicit pyramid structure at each network stage that integrates the\ncross-scale window attention (CSWA), the intra-scale channel attention (ISCA),\nand the feature-enhancing feed-forward network (FEFN) to capture long-range\ncross-scale spatial interaction and global channel context. The EFCR addresses\nthe data deficiency problem by exploring latent deblur signals from different\nfrequency bands. It also enables deblur knowledge transfer to learn\ncross-domain information from extra data, improving deblur performance for\nlabeled and unlabeled data. Extensive experiments and downstream task\nvalidation show the framework achieves state-of-the-art performance across\nmultiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur.\n","authors":["Yuelin Zhang","Pengyu Zheng","Wanquan Yan","Chengyu Fang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.02611v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.09359v2","updated":"2024-04-22T10:52:32Z","published":"2024-04-14T21:14:47Z","title":"Exploring Feedback Generation in Automated Skeletal Movement Assessment:\n A Comprehensive Overview","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection and analysis from 2D or 3D videos. While the primary\nobjective of automatic assessment tasks is to score movements, the automatic\ngeneration of feedback highlighting key movement issues has the potential to\nsignificantly enhance and accelerate the rehabilitation process. While numerous\nresearch works exist in the field of automatic movement assessment, only a\nhandful address feedback generation. In this study, we explain the types of\nfeedback that can be generated, review existing solutions for automatic\nfeedback generation, and discuss future research directions. 
To our knowledge,\nthis is the first comprehensive review of feedback generation in skeletal\nmovement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14077v1","updated":"2024-04-22T10:49:46Z","published":"2024-04-22T10:49:46Z","title":"Research on Robot Path Planning Based on Reinforcement Learning","summary":" This project has conducted research on robot path planning based on Visual\nSLAM. The main work of this project is as follows: (1) Construction of Visual\nSLAM system. Research has been conducted on the basic architecture of Visual\nSLAM. A Visual SLAM system is developed based on ORB-SLAM3 system, which can\nconduct dense point cloud mapping. (2) The map suitable for two-dimensional\npath planning is obtained through map conversion. This part converts the dense\npoint cloud map obtained by Visual SLAM system into an octomap and then\nperforms projection transformation to the grid map. The map conversion converts\nthe dense point cloud map containing a large amount of redundant map\ninformation into an extremely lightweight grid map suitable for path planning.\n(3) Research on path planning algorithm based on reinforcement learning. This\nproject has conducted experimental comparisons between the Q-learning\nalgorithm, the DQN algorithm, and the SARSA algorithm, and found that DQN is\nthe algorithm with the fastest convergence and best performance in\nhigh-dimensional complex environments. This project has conducted experimental\nverification of the Visual SLAM system in a simulation environment. The\nexperimental results obtained based on open-source dataset and self-made\ndataset prove the feasibility and effectiveness of the designed Visual SLAM\nsystem. At the same time, this project has also conducted comparative\nexperiments on the three reinforcement learning algorithms under the same\nexperimental condition to obtain the optimal algorithm under the experimental\ncondition.\n","authors":["Wang Ruiqi"],"pdf_url":"https://arxiv.org/pdf/2404.14077v1.pdf","comment":"My undergrad final year project report, 44 pages and 15 figures"},{"id":"http://arxiv.org/abs/2404.14076v1","updated":"2024-04-22T10:45:59Z","published":"2024-04-22T10:45:59Z","title":"Noise contrastive estimation with soft targets for conditional models","summary":" Soft targets combined with the cross-entropy loss have shown to improve\ngeneralization performance of deep neural networks on supervised classification\ntasks. The standard cross-entropy loss however assumes data to be categorically\ndistributed, which may often not be the case in practice. In contrast, InfoNCE\ndoes not rely on such an explicit assumption but instead implicitly estimates\nthe true conditional through negative sampling. Unfortunately, it cannot be\ncombined with soft targets in its standard formulation, hindering its use in\ncombination with sophisticated training strategies. In this paper, we address\nthis limitation by proposing a principled loss function that is compatible with\nprobabilistic targets. Our new soft target InfoNCE loss is conceptually simple,\nefficient to compute, and can be derived within the framework of noise\ncontrastive estimation. Using a toy example, we demonstrate shortcomings of the\ncategorical distribution assumption of cross-entropy, and discuss implications\nof sampling from soft distributions. 
We observe that soft target InfoNCE\nperforms on par with strong soft target cross-entropy baselines and outperforms\nhard target NLL and InfoNCE losses on popular benchmarks, including ImageNet.\nFinally, we provide a simple implementation of our loss, geared towards\nsupervised classification and fully compatible with deep classification model\ntrained with cross-entropy.\n","authors":["Johannes Hugger","Virginie Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2404.14076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17469v2","updated":"2024-04-22T10:40:50Z","published":"2023-06-30T08:34:08Z","title":"Manga109Dialog: A Large-scale Dialogue Dataset for Comics Speaker\n Detection","summary":" The expanding market for e-comics has spurred interest in the development of\nautomated methods to analyze comics. For further understanding of comics, an\nautomated approach is needed to link text in comics to characters speaking the\nwords. Comics speaker detection research has practical applications, such as\nautomatic character assignment for audiobooks, automatic translation according\nto characters' personalities, and inference of character relationships and\nstories.\n To deal with the problem of insufficient speaker-to-text annotations, we\ncreated a new annotation dataset Manga109Dialog based on Manga109.\nManga109Dialog is the world's largest comics speaker annotation dataset,\ncontaining 132,692 speaker-to-text pairs. We further divided our dataset into\ndifferent levels by prediction difficulties to evaluate speaker detection\nmethods more appropriately. Unlike existing methods mainly based on distances,\nwe propose a deep learning-based method using scene graph generation models.\nDue to the unique features of comics, we enhance the performance of our\nproposed model by considering the frame reading order. We conducted experiments\nusing Manga109Dialog and other datasets. Experimental results demonstrate that\nour scene-graph-based approach outperforms existing methods, achieving a\nprediction accuracy of over 75%.\n","authors":["Yingxuan Li","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2306.17469v2.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2403.01644v3","updated":"2024-04-22T10:34:09Z","published":"2024-03-03T23:46:06Z","title":"OccFusion: A Straightforward and Effective Multi-Sensor Fusion Framework\n for 3D Occupancy Prediction","summary":" This paper introduces OccFusion, a straightforward and efficient sensor\nfusion framework for predicting 3D occupancy. A comprehensive understanding of\n3D scenes is crucial in autonomous driving, and recent models for 3D semantic\noccupancy prediction have successfully addressed the challenge of describing\nreal-world objects with varied shapes and classes. However, existing methods\nfor 3D occupancy prediction heavily rely on surround-view camera images, making\nthem susceptible to changes in lighting and weather conditions. By integrating\nfeatures from additional sensors, such as lidar and surround view radars, our\nframework enhances the accuracy and robustness of occupancy prediction,\nresulting in top-tier performance on the nuScenes benchmark. Furthermore,\nextensive experiments conducted on the nuScenes dataset, including challenging\nnight and rainy scenarios, confirm the superior performance of our sensor\nfusion strategy across various perception ranges. 
The code for this framework\nwill be made available at https://github.com/DanielMing123/OCCFusion.\n","authors":["Zhenxing Ming","Julie Stephany Berrio","Mao Shan","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2403.01644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14066v1","updated":"2024-04-22T10:23:59Z","published":"2024-04-22T10:23:59Z","title":"SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval","summary":" The user base of short video apps has experienced unprecedented growth in\nrecent years, resulting in a significant demand for video content analysis. In\nparticular, text-video retrieval, which aims to find the top matching videos\ngiven text descriptions from a vast video corpus, is an essential function, the\nprimary challenge of which is to bridge the modality gap. Nevertheless, most\nexisting approaches treat texts merely as discrete tokens and neglect their\nsyntax structures. Moreover, the abundant spatial and temporal clues in videos\nare often underutilized due to the lack of interaction with text. To address\nthese issues, we argue that using texts as guidance to focus on relevant\ntemporal frames and spatial regions within videos is beneficial. In this paper,\nwe propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method\n(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to\nbridge the modality gap from two perspectives. First, to facilitate a more\nfine-grained integration of visual content, we employ the text syntax\nhierarchy, which reveals the grammatical structure of text descriptions, to\nguide the visual representations. Second, to further enhance the multi-modal\ninteraction and alignment, we also utilize the syntax hierarchy to guide the\nsimilarity calculation. We evaluated our method on four public text-video\nretrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental\nresults and ablation studies confirm the advantages of our proposed method.\n","authors":["Xuzheng Yu","Chen Jiang","Xingning Dong","Tian Gan","Ming Yang","Qingpei Guo"],"pdf_url":"https://arxiv.org/pdf/2404.14066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14064v1","updated":"2024-04-22T10:21:41Z","published":"2024-04-22T10:21:41Z","title":"Multi-view Disentanglement for Reinforcement Learning with Multiple\n Cameras","summary":" The performance of image-based Reinforcement Learning (RL) agents can vary\ndepending on the position of the camera used to capture the images. Training on\nmultiple cameras simultaneously, including a first-person egocentric camera,\ncan leverage information from different camera perspectives to improve the\nperformance of RL. However, hardware constraints may limit the availability of\nmultiple cameras in real-world deployment. Additionally, cameras may become\ndamaged in the real-world preventing access to all cameras that were used\nduring training. To overcome these hardware constraints, we propose Multi-View\nDisentanglement (MVD), which uses multiple cameras to learn a policy that\nachieves zero-shot generalisation to any single camera from the training set.\nOur approach is a self-supervised auxiliary task for RL that learns a\ndisentangled representation from multiple cameras, with a shared representation\nthat is aligned across all cameras to allow generalisation to a single camera,\nand a private representation that is camera-specific. 
We show experimentally\nthat an RL agent trained on a single third-person camera is unable to learn an\noptimal policy in many control tasks; however, our approach, benefiting from\nmultiple cameras during training, is able to solve the task using only the same\nsingle third-person camera.\n","authors":["Mhairi Dunion","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2404.14064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14062v1","updated":"2024-04-22T10:19:16Z","published":"2024-04-22T10:19:16Z","title":"GatedLexiconNet: A Comprehensive End-to-End Handwritten Paragraph Text\n Recognition System","summary":" The Handwritten Text Recognition problem has been a challenge for researchers\nfor the last few decades, especially in the domain of computer vision, a\nsubdomain of pattern recognition. Variability of texts amongst writers,\ncursiveness, and different font styles of handwritten texts with degradation of\nhistorical text images make it a challenging problem. Recognizing scanned\ndocument images in neural network-based systems typically involves a two-step\napproach: segmentation and recognition. However, this method has several\ndrawbacks. These shortcomings encompass challenges in identifying text regions,\nanalyzing layout diversity within pages, and establishing accurate ground truth\nsegmentation. Consequently, these processes are prone to errors, leading to\nbottlenecks in achieving high recognition accuracies. Thus, in this study, we\npresent an end-to-end paragraph recognition system that incorporates internal\nline segmentation and a gated convolutional layer-based encoder. The gating is a\nmechanism that controls the flow of information and allows adaptive\nselection of the more relevant features in handwritten text recognition models.\nThe attention module plays an important role in performing internal line\nsegmentation, allowing the page to be processed line-by-line. During the\ndecoding step, we have integrated a connectionist temporal classification-based\nword beam search decoder as a post-processing step. In this work, we have\nextended the existing LexiconNet by carefully applying and utilizing gated\nconvolutional layers in the existing deep neural network. Our results at line\nand page levels also favour our new GatedLexiconNet. This study reported\ncharacter error rates of 2.27% on IAM, 0.9% on RIMES, and 2.13% on READ-2016, and\nword error rates of 5.73% on IAM, 2.76% on RIMES, and 6.52% on READ-2016\ndatasets.\n","authors":["Lalita Kumari","Sukhdeep Singh","Vaibhav Varish Singh Rathore","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.14062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14055v1","updated":"2024-04-22T10:11:31Z","published":"2024-04-22T10:11:31Z","title":"RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key\n Identification","summary":" We revisit Tree-Ring Watermarking, a recent diffusion model watermarking\nmethod that demonstrates great robustness to various attacks. We conduct an\nin-depth study on it and reveal that the distribution shift unintentionally\nintroduced by the watermarking process, apart from watermark pattern matching,\ncontributes to its exceptional robustness. Our investigation further exposes\ninherent flaws in its original design, particularly in its ability to identify\nmultiple distinct keys, where distribution shift offers no assistance. Based on\nthese findings and analysis, we present RingID for enhanced multi-key\nidentification. 
It consists of a novel multi-channel heterogeneous watermarking\napproach designed to seamlessly amalgamate distinctive advantages from diverse\nwatermarks. Coupled with a series of suggested enhancements, RingID exhibits\nsubstantial advancements in multi-key identification.\n","authors":["Hai Ci","Pei Yang","Yiren Song","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.14055v1.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.08965v2","updated":"2024-04-22T10:08:54Z","published":"2024-04-13T11:07:10Z","title":"Seeing Text in the Dark: Algorithm and Benchmark","summary":" Localizing text in low-light environments is challenging due to visual\ndegradations. Although a straightforward solution involves a two-stage pipeline\nwith low-light image enhancement (LLE) as the initial step followed by a\ndetector, LLE is primarily designed for human vision rather than machine vision and can\naccumulate errors. In this work, we propose an efficient and effective\nsingle-stage approach for localizing text in the dark that circumvents the need for\nLLE. We introduce a constrained learning module as an auxiliary mechanism\nduring the training stage of the text detector. This module is designed to\nguide the text detector in preserving textual spatial features amidst feature\nmap resizing, thus minimizing the loss of spatial information in texts under\nlow-light visual degradations. Specifically, we incorporate spatial\nreconstruction and spatial semantic constraints within this module to ensure\nthe text detector acquires essential positional and contextual range knowledge.\nOur approach enhances the original text detector's ability to identify text's\nlocal topological features using a dynamic snake feature pyramid network and\nadopts a bottom-up contour shaping strategy with a novel rectangular\naccumulation technique for accurate delineation of streamlined text features.\nIn addition, we present a comprehensive low-light dataset for arbitrary-shaped\ntext, encompassing diverse scenes and languages. Notably, our method achieves\nstate-of-the-art results on this low-light dataset and exhibits comparable\nperformance on standard normal light datasets. The code and dataset will be\nreleased.\n","authors":["Chengpei Xu","Hao Fu","Long Ma","Wenjing Jia","Chengqi Zhang","Feng Xia","Xiaoyu Ai","Binghao Li","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13959v5","updated":"2024-04-22T10:00:27Z","published":"2023-03-24T12:33:44Z","title":"Bridging Stereo Geometry and BEV Representation with Reliable Mutual\n Interaction for Semantic Scene Completion","summary":" 3D semantic scene completion (SSC) is an ill-posed perception task that\nrequires inferring a dense 3D scene from limited observations. Previous\ncamera-based methods struggle to predict accurate semantic scenes due to\ninherent geometric ambiguity and incomplete observations. In this paper, we\nresort to the stereo matching technique and bird's-eye-view (BEV) representation\nlearning to address such issues in SSC. Complementary to each other, stereo\nmatching mitigates geometric ambiguity with the epipolar constraint while BEV\nrepresentation enhances the hallucination ability for invisible regions with\nglobal semantic context. However, due to the inherent representation gap\nbetween stereo geometry and BEV features, it is non-trivial to bridge them for\nthe dense prediction task of SSC. 
Therefore, we further develop a unified\noccupancy-based framework dubbed BRGScene, which effectively bridges these two\nrepresentations with dense 3D volumes for reliable semantic scene completion.\nSpecifically, we design a novel Mutual Interactive Ensemble (MIE) block for\npixel-level reliable aggregation of stereo geometry and BEV features. Within\nthe MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced\nwith confidence re-weighting, is employed to encourage fine-grained interaction\nthrough mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is\nintroduced to facilitate complementary aggregation through channel-wise\nrecalibration and multi-group voting. Our method outperforms all published\ncamera-based methods on SemanticKITTI for semantic scene completion. Our code\nis available on \\url{https://github.com/Arlo0o/StereoScene}.\n","authors":["Bohan Li","Yasheng Sun","Zhujin Liang","Dalong Du","Zhuanghui Zhang","Xiaofeng Wang","Yunnan Wang","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.13959v5.pdf","comment":"IJCAI2024 (https://github.com/Arlo0o/StereoScene)"},{"id":"http://arxiv.org/abs/2404.14044v1","updated":"2024-04-22T09:57:53Z","published":"2024-04-22T09:57:53Z","title":"HashPoint: Accelerated Point Searching and Sampling for Neural Rendering","summary":" In this paper, we address the problem of efficient point searching and\nsampling for volume neural rendering. Within this realm, two typical approaches\nare employed: rasterization and ray tracing. The rasterization-based methods\nenable real-time rendering at the cost of increased memory and lower fidelity.\nIn contrast, the ray-tracing-based methods yield superior quality but demand\nlonger rendering time. We solve this problem by our HashPoint method combining\nthese two strategies, leveraging rasterization for efficient point searching\nand sampling, and ray marching for rendering. Our method optimizes point\nsearching by rasterizing points within the camera's view, organizing them in a\nhash table, and facilitating rapid searches. Notably, we accelerate the\nrendering process by adaptive sampling on the primary surface encountered by\nthe ray. Our approach yields substantial speed-up for a range of\nstate-of-the-art ray-tracing-based methods, maintaining equivalent or superior\naccuracy across synthetic and real test datasets. The code will be available at\nhttps://jiahao-ma.github.io/hashpoint/.\n","authors":["Jiahao Ma","Miaomiao Liu","David Ahmedt-Aristizaba","Chuong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.14044v1.pdf","comment":"CVPR2024 Highlight"},{"id":"http://arxiv.org/abs/2404.14042v1","updated":"2024-04-22T09:55:50Z","published":"2024-04-22T09:55:50Z","title":"CloudFort: Enhancing Robustness of 3D Point Cloud Classification Against\n Backdoor Attacks via Spatial Partitioning and Ensemble Prediction","summary":" The increasing adoption of 3D point cloud data in various applications, such\nas autonomous vehicles, robotics, and virtual reality, has brought about\nsignificant advancements in object recognition and scene understanding.\nHowever, this progress is accompanied by new security challenges, particularly\nin the form of backdoor attacks. These attacks involve inserting malicious\ninformation into the training data of machine learning models, potentially\ncompromising the model's behavior. In this paper, we propose CloudFort, a novel\ndefense mechanism designed to enhance the robustness of 3D point cloud\nclassifiers against backdoor attacks. 
CloudFort leverages spatial partitioning\nand ensemble prediction techniques to effectively mitigate the impact of\nbackdoor triggers while preserving the model's performance on clean data. We\nevaluate the effectiveness of CloudFort through extensive experiments,\ndemonstrating its strong resilience against the Point Cloud Backdoor Attack\n(PCBA). Our results show that CloudFort significantly enhances the security of\n3D point cloud classification models without compromising their accuracy on\nbenign samples. Furthermore, we explore the limitations of CloudFort and\ndiscuss potential avenues for future research in the field of 3D point cloud\nsecurity. The proposed defense mechanism represents a significant step towards\nensuring the trustworthiness and reliability of point-cloud-based systems in\nreal-world applications.\n","authors":["Wenhao Lan","Yijun Yang","Haihua Shen","Shan Li"],"pdf_url":"https://arxiv.org/pdf/2404.14042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14040v1","updated":"2024-04-22T09:53:55Z","published":"2024-04-22T09:53:55Z","title":"Surgical-DeSAM: Decoupling SAM for Instrument Segmentation in Robotic\n Surgery","summary":" Purpose: The recent Segment Anything Model (SAM) has demonstrated impressive\nperformance with point, text or bounding box prompts, in various applications.\nHowever, in safety-critical surgical tasks, prompting is not possible due to\n(i) the lack of per-frame prompts for supervised learning, (ii) the\nimpracticality of prompting frame-by-frame in a real-time tracking application, and\n(iii) the expense of annotating prompts for offline applications.\n Methods: We develop Surgical-DeSAM to generate automatic bounding box prompts\nfor decoupling SAM to obtain instrument segmentation in real-time robotic\nsurgery. We utilise a commonly used detection architecture, DETR, and\nfine-tune it to obtain bounding box prompts for the instruments. We then\nemploy decoupling SAM (DeSAM) by replacing the image encoder with the DETR\nencoder and fine-tuning the prompt encoder and mask decoder to obtain instance\nsegmentation for the surgical instruments. To improve detection performance, we\nadopt the Swin-transformer for better feature representation.\n Results: The proposed method has been validated on two publicly available\ndatasets from the MICCAI surgical instruments segmentation challenge EndoVis\n2017 and 2018. The performance of our method is also compared with SOTA\ninstrument segmentation methods and demonstrated significant improvements with\ndice metrics of 89.62 and 90.70 for the EndoVis 2017 and 2018.\n Conclusion: Our extensive experiments and validations demonstrate that\nSurgical-DeSAM enables real-time instrument segmentation without any additional\nprompting and outperforms other SOTA segmentation methods.\n","authors":["Yuyang Sheng","Sophia Bano","Matthew J. Clarkson","Mobarakol Islam"],"pdf_url":"https://arxiv.org/pdf/2404.14040v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.14037v1","updated":"2024-04-22T09:51:43Z","published":"2024-04-22T09:51:43Z","title":"GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian\n Splatting","summary":" Recent works on audio-driven talking head synthesis using Neural Radiance\nFields (NeRF) have achieved impressive results. However, due to inadequate pose\nand expression control caused by NeRF implicit representation, these methods\nstill have some limitations, such as unsynchronized or unnatural lip movements,\nand visual jitter and artifacts. 
In this paper, we propose GaussianTalker, a\nnovel method for audio-driven talking head synthesis based on 3D Gaussian\nSplatting. With the explicit representation property of 3D Gaussians, intuitive\ncontrol of the facial motion is achieved by binding Gaussians to 3D facial\nmodels. GaussianTalker consists of two modules, Speaker-specific Motion\nTranslator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator\nachieves accurate lip movements specific to the target speaker through\nuniversalized audio feature extraction and customized lip motion generation.\nDynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance\nfacial detail representation via a latent pose, delivering stable and realistic\nrendered videos. Extensive experimental results suggest that GaussianTalker\noutperforms existing state-of-the-art methods in talking head synthesis,\ndelivering precise lip synchronization and exceptional visual quality. Our\nmethod achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU,\nsignificantly exceeding the threshold for real-time rendering performance, and\ncan potentially be deployed on other hardware platforms.\n","authors":["Hongyun Yu","Zhan Qu","Qihang Yu","Jianchuan Chen","Zhonghua Jiang","Zhiwen Chen","Shengyu Zhang","Jimin Xu","Fei Wu","Chengfei Lv","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2404.14037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14034v1","updated":"2024-04-22T09:50:12Z","published":"2024-04-22T09:50:12Z","title":"PointDifformer: Robust Point Cloud Registration With Neural Diffusion\n and Transformer","summary":" Point cloud registration is a fundamental technique in 3-D computer vision\nwith applications in graphics, autonomous driving, and robotics. However,\nregistration tasks under challenging conditions, under which noise or\nperturbations are prevalent, can be difficult. We propose a robust point cloud\nregistration approach that leverages graph neural partial differential\nequations (PDEs) and heat kernel signatures. Our method first uses graph neural\nPDE modules to extract high dimensional features from point clouds by\naggregating information from the 3-D point neighborhood, thereby enhancing the\nrobustness of the feature representations. Then, we incorporate heat kernel\nsignatures into an attention mechanism to efficiently obtain corresponding\nkeypoints. Finally, a singular value decomposition (SVD) module with learnable\nweights is used to predict the transformation between two point clouds.\nEmpirical experiments on a 3-D point cloud dataset demonstrate that our\napproach not only achieves state-of-the-art performance for point cloud\nregistration but also exhibits better robustness to additive noise or 3-D shape\nperturbations.\n","authors":["Rui She","Qiyu Kang","Sijie Wang","Wee Peng Tay","Kai Zhao","Yang Song","Tianyu Geng","Yi Xu","Diego Navarro Navarro","Andreas Hartmannsgruber"],"pdf_url":"https://arxiv.org/pdf/2404.14034v1.pdf","comment":"Accepted by IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2404.14032v1","updated":"2024-04-22T09:50:05Z","published":"2024-04-22T09:50:05Z","title":"1st Place Solution to the 1st SkatingVerse Challenge","summary":" This paper presents the winning solution for the 1st SkatingVerse Challenge.\nWe propose a method that involves several steps. To begin, we leverage the DINO\nframework to extract the Region of Interest (ROI) and perform precise cropping\nof the raw video footage. 
Subsequently, we employ three distinct models, namely\nUnmasked Teacher, UniformerV2, and InfoGCN, to capture different aspects of the\ndata. By ensembling the prediction results based on logits, our solution\nattains an impressive leaderboard score of 95.73%.\n","authors":["Tao Sun","Yuanzi Fu","Kaicheng Yang","Jian Wu","Ziyong Feng"],"pdf_url":"https://arxiv.org/pdf/2404.14032v1.pdf","comment":"3 pages, 1st SkatingVerse Challenge, 18th IEEE International\n Conference on Automatic Face and Gesture Recognition workshop"},{"id":"http://arxiv.org/abs/2404.14027v1","updated":"2024-04-22T09:43:03Z","published":"2024-04-22T09:43:03Z","title":"OccFeat: Self-supervised Occupancy Feature Prediction for Pretraining\n BEV Segmentation Networks","summary":" We introduce a self-supervised pretraining method, called OccFeat, for\ncamera-only Bird's-Eye-View (BEV) segmentation networks. With OccFeat, we\npretrain a BEV network via occupancy prediction and feature distillation tasks.\nOccupancy prediction provides a 3D geometric understanding of the scene to the\nmodel. However, the geometry learned is class-agnostic. Hence, we add semantic\ninformation to the model in the 3D space through distillation from a\nself-supervised pretrained image foundation model. Models pretrained with our\nmethod exhibit improved BEV semantic segmentation performance, particularly in\nlow-data scenarios. Moreover, empirical results affirm the efficacy of\nintegrating feature distillation with 3D occupancy prediction in our\npretraining approach.\n","authors":["Sophia Sirko-Galouchenko","Alexandre Boulch","Spyros Gidaris","Andrei Bursuc","Antonin Vobecky","Patrick Pérez","Renaud Marlet"],"pdf_url":"https://arxiv.org/pdf/2404.14027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14025v1","updated":"2024-04-22T09:41:03Z","published":"2024-04-22T09:41:03Z","title":"DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose\n Estimation","summary":" Multi-person pose estimation (MPPE) presents a formidable yet crucial\nchallenge in computer vision. Most existing methods predominantly concentrate\non isolated interaction either between instances or joints, which is inadequate\nfor scenarios demanding concurrent localization of both instances and joints.\nThis paper introduces a novel CNN-based single-stage method, named Dual-path\nHierarchical Relation Network (DHRNet), to extract instance-to-joint and\njoint-to-instance interactions concurrently. Specifically, we design a\ndual-path interaction modeling module (DIM) that strategically organizes\ncross-instance and cross-joint interaction modeling modules in two\ncomplementary orders, enriching interaction information by integrating merits\nfrom different correlation modeling branches. Notably, DHRNet excels in joint\nlocalization by leveraging information from other instances and joints.\nExtensive evaluations on challenging datasets, including COCO, CrowdPose, and\nOCHuman, showcase DHRNet's state-of-the-art performance. 
The code will\nbe released at https://github.com/YHDang/dhrnet-multi-pose-estimation.\n","authors":["Yonghao Dang","Jianqin Yin","Liyuan Liu","Yuan Sun","Yanzhu Hu","Pengxiang Ding"],"pdf_url":"https://arxiv.org/pdf/2404.14025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14022v1","updated":"2024-04-22T09:36:17Z","published":"2024-04-22T09:36:17Z","title":"Collaborative Perception Datasets in Autonomous Driving: A Survey","summary":" This survey offers a comprehensive examination of collaborative perception\ndatasets in the context of Vehicle-to-Infrastructure (V2I), Vehicle-to-Vehicle\n(V2V), and Vehicle-to-Everything (V2X). It highlights the latest developments\nin large-scale benchmarks that accelerate advancements in perception tasks for\nautonomous vehicles. The paper systematically analyzes a variety of datasets,\ncomparing them based on aspects such as diversity, sensor setup, quality,\npublic availability, and their applicability to downstream tasks. It also\nhighlights the key challenges such as domain shift, sensor setup limitations,\nand gaps in dataset diversity and availability. The importance of addressing\nprivacy and security concerns in the development of datasets is emphasized,\nregarding data sharing and dataset creation. The conclusion underscores the\nnecessity for comprehensive, globally accessible datasets and collaborative\nefforts from both technological and research communities to overcome these\nchallenges and fully harness the potential of autonomous driving.\n","authors":["Melih Yazgan","Mythra Varun Akkanapragada","J. Marius Zoellner"],"pdf_url":"https://arxiv.org/pdf/2404.14022v1.pdf","comment":"8 pages,3 figures"},{"id":"http://arxiv.org/abs/2404.14019v1","updated":"2024-04-22T09:33:44Z","published":"2024-04-22T09:33:44Z","title":"A Multimodal Feature Distillation with CNN-Transformer Network for Brain\n Tumor Segmentation with Incomplete Modalities","summary":" Existing brain tumor segmentation methods usually utilize multiple Magnetic\nResonance Imaging (MRI) modalities in brain tumor images for segmentation,\nwhich can achieve better segmentation performance. However, in clinical\napplications, some modalities are missing due to resource constraints, leading\nto severe degradation in the performance of methods applying complete modality\nsegmentation. In this paper, we propose a Multimodal feature distillation with\nConvolutional Neural Network (CNN)-Transformer hybrid network (MCTSeg) for\naccurate brain tumor segmentation with missing modalities. We first design a\nMultimodal Feature Distillation (MFD) module to distill feature-level\nmultimodal knowledge into different unimodality to extract complete modality\ninformation. We further develop a Unimodal Feature Enhancement (UFE) module to\nmodel the relationship between global and local information semantically.\nFinally, we build a Cross-Modal Fusion (CMF) module to explicitly align the\nglobal correlations among different modalities even when some modalities are\nmissing. Complementary features within and across different modalities are\nrefined via the CNN-Transformer hybrid architectures in both the UFE and CMF\nmodules, where local and global dependencies are both captured. Our ablation\nstudy demonstrates the importance of the proposed modules with CNN-Transformer\nnetworks and the convolutional blocks in Transformer for improving the\nperformance of brain tumor segmentation with missing modalities. 
Extensive\nexperiments on the BraTS2018 and BraTS2020 datasets show that the proposed\nMCTSeg framework outperforms the state-of-the-art methods in missing modalities\ncases. Our code is available at: https://github.com/mkang315/MCTSeg.\n","authors":["Ming Kang","Fung Fung Ting","Raphaël C. -W. Phan","Zongyuan Ge","Chee-Ming Ting"],"pdf_url":"https://arxiv.org/pdf/2404.14019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14016v1","updated":"2024-04-22T09:29:14Z","published":"2024-04-22T09:29:14Z","title":"Ungeneralizable Examples","summary":" The training of contemporary deep learning models heavily relies on publicly\navailable data, posing a risk of unauthorized access to online data and raising\nconcerns about data privacy. Current approaches to creating unlearnable data\ninvolve incorporating small, specially designed noises, but these methods\nstrictly limit data usability, overlooking its potential usage in authorized\nscenarios. In this paper, we extend the concept of unlearnable data to\nconditional data learnability and introduce \\textbf{U}n\\textbf{G}eneralizable\n\\textbf{E}xamples (UGEs). UGEs exhibit learnability for authorized users while\nmaintaining unlearnability for potential hackers. The protector defines the\nauthorized network and optimizes UGEs to match the gradients of the original\ndata and its ungeneralizable version, ensuring learnability. To prevent\nunauthorized learning, UGEs are trained by maximizing a designated distance\nloss in a common feature space. Additionally, to further safeguard the\nauthorized side from potential attacks, we introduce additional undistillation\noptimization. Experimental results on multiple datasets and various networks\ndemonstrate that the proposed UGEs framework preserves data usability while\nreducing training performance on hacker networks, even under different types of\nattacks.\n","authors":["Jingwen Ye","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14016v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.14007v1","updated":"2024-04-22T09:16:25Z","published":"2024-04-22T09:16:25Z","title":"Infusion: Preventing Customized Text-to-Image Diffusion from Overfitting","summary":" Text-to-image (T2I) customization aims to create images that embody specific\nvisual concepts delineated in textual descriptions. However, existing works\nstill face a main challenge, concept overfitting. To tackle this challenge, we\nfirst analyze overfitting, categorizing it into concept-agnostic overfitting,\nwhich undermines non-customized concept knowledge, and concept-specific\noverfitting, which is confined to customize on limited modalities, i.e,\nbackgrounds, layouts, styles. To evaluate the overfitting degree, we further\nintroduce two metrics, i.e, Latent Fisher divergence and Wasserstein metric to\nmeasure the distribution changes of non-customized and customized concept\nrespectively. Drawing from the analysis, we propose Infusion, a T2I\ncustomization method that enables the learning of target concepts to avoid\nbeing constrained by limited training modalities, while preserving\nnon-customized knowledge. Remarkably, Infusion achieves this feat with\nremarkable efficiency, requiring a mere 11KB of trained parameters. 
Extensive\nexperiments also demonstrate that our approach outperforms state-of-the-art\nmethods in both single and multi-concept customized generation.\n","authors":["Weili Zeng","Yichao Yan","Qi Zhu","Zhuo Chen","Pengzhi Chu","Weiming Zhao","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.14007v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.14006v1","updated":"2024-04-22T09:16:14Z","published":"2024-04-22T09:16:14Z","title":"Distilled Datamodel with Reverse Gradient Matching","summary":" The proliferation of large-scale AI models trained on extensive datasets has\nrevolutionized machine learning. With these models taking on increasingly\ncentral roles in various applications, the need to understand their behavior\nand enhance interpretability has become paramount. To investigate the impact of\nchanges in training data on a pre-trained model, a common approach is\nleave-one-out retraining. This entails systematically altering the training\ndataset by removing specific samples to observe resulting changes within the\nmodel. However, retraining the model for each altered dataset presents a\nsignificant computational challenge, given the need to perform this operation\nfor every dataset variation. In this paper, we introduce an efficient framework\nfor assessing data impact, comprising offline training and online evaluation\nstages. During the offline training phase, we approximate the influence of\ntraining data on the target model through a distilled synset, formulated as a\nreversed gradient matching problem. For online evaluation, we expedite the\nleave-one-out process using the synset, which is then utilized to compute the\nattribution matrix based on the evaluation objective. Experimental evaluations,\nincluding training data attribution and assessments of data quality,\ndemonstrate that our proposed method achieves comparable model behavior\nevaluation while significantly speeding up the process compared to the direct\nretraining method.\n","authors":["Jingwen Ye","Ruonan Yu","Songhua Liu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14006v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.11120v2","updated":"2024-04-22T09:06:54Z","published":"2024-03-17T07:02:55Z","title":"Unifying Feature and Cost Aggregation with Transformers for Semantic and\n Visual Correspondence","summary":" This paper introduces a Transformer-based integrative feature and cost\naggregation network designed for dense matching tasks. In the context of dense\nmatching, many works benefit from one of two forms of aggregation: feature\naggregation, which pertains to the alignment of similar features, or cost\naggregation, a procedure aimed at instilling coherence in the flow estimates\nacross neighboring pixels. In this work, we first show that feature aggregation\nand cost aggregation exhibit distinct characteristics and reveal the potential\nfor substantial benefits stemming from the judicious use of both aggregation\nprocesses. We then introduce a simple yet effective architecture that harnesses\nself- and cross-attention mechanisms to show that our approach unifies feature\naggregation and cost aggregation and effectively harnesses the strengths of\nboth techniques. Within the proposed attention layers, the features and cost\nvolume both complement each other, and the attention layers are interleaved\nthrough a coarse-to-fine design to further promote accurate correspondence\nestimation. 
Finally at inference, our network produces multi-scale predictions,\ncomputes their confidence scores, and selects the most confident flow for final\nprediction. Our framework is evaluated on standard benchmarks for semantic\nmatching, and also applied to geometric matching, where we show that our\napproach achieves significant improvements compared to existing methods.\n","authors":["Sunghwan Hong","Seokju Cho","Seungryong Kim","Stephen Lin"],"pdf_url":"https://arxiv.org/pdf/2403.11120v2.pdf","comment":"Accepted by ICLR'24"},{"id":"http://arxiv.org/abs/2404.13999v1","updated":"2024-04-22T09:03:21Z","published":"2024-04-22T09:03:21Z","title":"CoFInAl: Enhancing Action Quality Assessment with Coarse-to-Fine\n Instruction Alignment","summary":" Action Quality Assessment (AQA) is pivotal for quantifying actions across\ndomains like sports and medical care. Existing methods often rely on\npre-trained backbones from large-scale action recognition datasets to boost\nperformance on smaller AQA datasets. However, this common strategy yields\nsuboptimal results due to the inherent struggle of these backbones to capture\nthe subtle cues essential for AQA. Moreover, fine-tuning on smaller datasets\nrisks overfitting. To address these issues, we propose Coarse-to-Fine\nInstruction Alignment (CoFInAl). Inspired by recent advances in large language\nmodel tuning, CoFInAl aligns AQA with broader pre-trained tasks by\nreformulating it as a coarse-to-fine classification task. Initially, it learns\ngrade prototypes for coarse assessment and then utilizes fixed sub-grade\nprototypes for fine-grained assessment. This hierarchical approach mirrors the\njudging process, enhancing interpretability within the AQA framework.\nExperimental results on two long-term AQA datasets demonstrate CoFInAl achieves\nstate-of-the-art performance with significant correlation gains of 5.49% and\n3.55% on Rhythmic Gymnastics and Fis-V, respectively. Our code is available at\nhttps://github.com/ZhouKanglei/CoFInAl_AQA.\n","authors":["Kanglei Zhou","Junlin Li","Ruizhi Cai","Liyuan Wang","Xingxing Zhang","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2404.13999v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.13996v1","updated":"2024-04-22T09:01:14Z","published":"2024-04-22T09:01:14Z","title":"Challenges in automatic and selective plant-clearing","summary":" With the advent of multispectral imagery and AI, there have been numerous\nworks on automatic plant segmentation for purposes such as counting, picking,\nhealth monitoring, localized pesticide delivery, etc. In this paper, we tackle\nthe related problem of automatic and selective plant-clearing in a sustainable\nforestry context, where an autonomous machine has to detect and avoid specific\nplants while clearing any weeds which may compete with the species being\ncultivated. Such an autonomous system requires a high level of robustness to\nweather conditions, plant variability, terrain and weeds while remaining cheap\nand easy to maintain. 
We notably discuss the lack of robustness of spectral\nimagery, investigate the impact of the reference database's size and discuss\nissues specific to AI systems operating in uncontrolled environments.\n","authors":["Fabrice Mayran de Chamisso","Loïc Cotten","Valentine Dhers","Thomas Lompech","Florian Seywert","Arnaud Susset"],"pdf_url":"https://arxiv.org/pdf/2404.13996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13993v1","updated":"2024-04-22T08:59:35Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers, which require specific annotations for each\ncomic title, are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these tasks have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13992v1","updated":"2024-04-22T08:58:57Z","published":"2024-04-22T08:58:57Z","title":"Dynamic Proxy Domain Generalizes the Crowd Localization by Better Binary\n Segmentation","summary":" Crowd localization targets predicting the precise location of each instance\nwithin an image. Current advanced methods propose pixel-wise binary\nclassification to tackle the congested prediction, in which pixel-level\nthresholds binarize the prediction confidence of being a pedestrian head.\nSince crowd scenes suffer from extremely varying contents, counts and\nscales, the confidence-threshold learner is fragile and under-generalized\nwhen encountering domain knowledge shift. Moreover, most of the time, the target\ndomain is agnostic during training. Hence, it is imperative to explore how to\nenhance the generalization of the confidence-threshold locator to the latent target\ndomain. In this paper, we propose a Dynamic Proxy Domain (DPD) method to\ngeneralize the learner under domain shift. Concretely, based on a theoretical\nanalysis of the upper bound on the generalization error risk of a binary classifier\non the latent target domain, we propose to introduce a generated proxy domain\nto facilitate generalization. 
Then, based on the theory, we design a DPD\nalgorithm which is composed of a training paradigm and a proxy domain generator\nto enhance the domain generalization of the confidence-threshold learner.\nBesides, we evaluate our method on five kinds of domain shift scenarios,\ndemonstrating its effectiveness in generalizing crowd localization. Our\ncode will be available at https://github.com/zhangda1018/DPD.\n","authors":["Junyu Gao","Da Zhang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.13992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13984v1","updated":"2024-04-22T08:44:34Z","published":"2024-04-22T08:44:34Z","title":"RHanDS: Refining Malformed Hands for Generated Images with Decoupled\n Structure and Style Guidance","summary":" Although diffusion models can generate high-quality human images, their\napplications are limited by the instability in generating hands with correct\nstructures. Some previous works mitigate the problem by considering hand\nstructure yet struggle to maintain style consistency between refined malformed\nhands and other image regions. In this paper, we aim to solve the problem of\ninconsistency regarding hand structure and style. We propose a conditional\ndiffusion-based framework RHanDS to refine the hand region with the help of\ndecoupled structure and style guidance. Specifically, the structure guidance is\nthe hand mesh reconstructed from the malformed hand, serving to correct the\nhand structure. The style guidance is a hand image, e.g., the malformed hand\nitself, and is employed to furnish the style reference for hand refining. In\norder to suppress structure leakage when referencing hand style and\neffectively utilize hand data to improve the capability of the model, we build\na multi-style hand dataset and introduce a two-stage training strategy. In the\nfirst stage, we use paired hand images for training to generate hands with the\nsame style as the reference. In the second stage, various hand images generated\nbased on the human mesh are used for training to enable the model to gain\ncontrol over the hand structure. We evaluate our method and counterparts on the\ntest dataset of the proposed multi-style hand dataset. The experimental results\nshow that RHanDS can effectively refine hands with correct structure and style\ncompared with previous methods. The codes and datasets will be available soon.\n","authors":["Chengrui Wang","Pengfei Liu","Min Zhou","Ming Zeng","Xubin Li","Tiezheng Ge","Bo zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13983v1","updated":"2024-04-22T08:44:10Z","published":"2024-04-22T08:44:10Z","title":"Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph\n Network","summary":" Given a source portrait, the automatic human body reshaping task aims at\nediting it to an aesthetic body shape. As the technology has been widely used\nin media, several methods have been proposed mainly focusing on generating\noptical flow to warp the body shape. However, those previous works only\nconsider the local transformation of different body parts (arms, torso, and\nlegs), ignoring the global affinity, and limiting the capacity to ensure\nconsistency and quality across the entire body. In this paper, we propose a\nnovel Adaptive Affinity-Graph Network (AAGN), which extracts the global\naffinity between different body parts to enhance the quality of the generated\noptical flow. 
Specifically, our AAGN primarily introduces the following\ndesigns: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages\nthe characteristic of a fully connected graph. AAG represents different body\nparts as nodes in an adaptive fully connected graph and captures all the\naffinities between nodes to obtain a global affinity map. This design\nimproves the consistency between body parts. (2) Besides, since\nhigh-frequency details are crucial for photo aesthetics, a Body Shape\nDiscriminator (BSD) is designed to extract information from both the high-frequency\nand spatial domains. Particularly, an SRM filter is utilized to extract\nhigh-frequency details, which are combined with spatial features as input to\nthe BSD. With this design, BSD guides the Flow Generator (FG) to pay attention\nto various fine details rather than rigid pixel-level fitting. Extensive\nexperiments conducted on the BR-5K dataset demonstrate that our framework\nsignificantly enhances the aesthetic appeal of reshaped photos, marginally\nsurpassing all previous work to achieve state-of-the-art in all evaluation\nmetrics.\n","authors":["Qiwen Deng","Yangcen Liu","Wen Li","Guoqing Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13983v1.pdf","comment":"11 pages;"},{"id":"http://arxiv.org/abs/2404.09105v2","updated":"2024-04-22T08:40:43Z","published":"2024-04-14T00:08:56Z","title":"EGGS: Edge Guided Gaussian Splatting for Radiance Fields","summary":" Gaussian splatting methods are becoming popular. However, their loss\nfunction only contains the $\\ell_1$ norm and the structural similarity between\nthe rendered and input images, without considering the edges in these images.\nIt is well-known that the edges in an image provide important information.\nTherefore, in this paper, we propose an Edge Guided Gaussian Splatting (EGGS)\nmethod that leverages the edges in the input images. More specifically, we give\nthe edge region a higher weight than the flat region. With such edge guidance,\nthe resulting Gaussian particles focus more on the edges instead of the flat\nregions. Moreover, such edge guidance does not increase the computation cost\nduring the training and rendering stages. The experiments confirm that such a\nsimple edge-weighted loss function indeed improves results by about $1\\sim2$ dB on several\ndifferent datasets. By simply plugging in the edge guidance, the proposed\nmethod can improve all Gaussian splatting methods in different scenarios, such\nas human head modeling, building 3D reconstruction, etc.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2404.09105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13972v1","updated":"2024-04-22T08:28:41Z","published":"2024-04-22T08:28:41Z","title":"Non-Uniform Exposure Imaging via Neuromorphic Shutter Control","summary":" By leveraging the blur-noise trade-off, imaging with non-uniform exposures\nlargely extends the image acquisition flexibility in harsh environments.\nHowever, the limitation of conventional cameras in perceiving intra-frame\ndynamic information prevents existing methods from being implemented in\nreal-world frame acquisition for real-time adaptive camera shutter control. To\naddress this challenge, we propose a novel Neuromorphic Shutter Control (NSC)\nsystem to avoid motion blur and alleviate instant noise, where the extremely\nlow latency of events is leveraged to monitor the real-time motion and\nfacilitate the scene-adaptive exposure. 
Furthermore, to stabilize the\ninconsistent Signal-to-Noise Ratio (SNR) caused by the non-uniform exposure\ntimes, we propose an event-based image denoising network within a\nself-supervised learning paradigm, i.e., SEID, exploring the statistics of\nimage noises and inter-frame motion information of events to obtain artificial\nsupervision signals for high-quality imaging in real-world scenes. To\nillustrate the effectiveness of the proposed NSC, we implement it in hardware\nby building a hybrid-camera imaging prototype system, with which we collect a\nreal-world dataset containing well-synchronized frames and events in diverse\nscenarios with different target scenes and motion patterns. Experiments on the\nsynthetic and real-world datasets demonstrate the superiority of our method\nover state-of-the-art approaches.\n","authors":["Mingyuan Lin","Jian Liu","Chi Zhang","Zibo Zhao","Chu He","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2404.13972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13953v1","updated":"2024-04-22T07:54:53Z","published":"2024-04-22T07:54:53Z","title":"360VOTS: Visual Object Tracking and Segmentation in Omnidirectional\n Videos","summary":" Visual object tracking and segmentation in omnidirectional videos are\nchallenging due to the wide field-of-view and large spherical distortion\nbrought by 360{\\deg} images. To alleviate these problems, we introduce a novel\nrepresentation, extended bounding field-of-view (eBFoV), for target\nlocalization and use it as the foundation of a general 360 tracking framework\nwhich is applicable for both omnidirectional visual object tracking and\nsegmentation tasks. Building upon our previous work on omnidirectional visual\nobject tracking (360VOT), we propose a comprehensive dataset and benchmark that\nincorporates a new component called omnidirectional video object segmentation\n(360VOS). The 360VOS dataset includes 290 sequences accompanied by dense\npixel-wise masks and covers a broader range of target categories. To support\nboth the development and evaluation of algorithms in this domain, we divide the\ndataset into a training subset with 170 sequences and a testing subset with 120\nsequences. Furthermore, we tailor evaluation metrics for both omnidirectional\ntracking and segmentation to ensure rigorous assessment. Through extensive\nexperiments, we benchmark state-of-the-art approaches and demonstrate the\neffectiveness of our proposed 360 tracking framework and training dataset.\nHomepage: https://360vots.hkustvgd.com/\n","authors":["Yinzhe Xu","Huajian Huang","Yingshu Chen","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.13953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13949v1","updated":"2024-04-22T07:50:24Z","published":"2024-04-22T07:50:24Z","title":"PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for\n RGB-D Cameras with Limited Co-visibility","summary":" RGB-D cameras are crucial in robotic perception, given their ability to\nproduce images augmented with depth data. However, their limited FOV often\nrequires multiple cameras to cover a broader area. In multi-camera RGB-D\nsetups, the goal is typically to reduce camera overlap, optimizing spatial\ncoverage with as few cameras as possible. The extrinsic calibration of these\nsystems introduces additional complexities. Existing methods for extrinsic\ncalibration either necessitate specific tools or highly depend on the accuracy\nof camera motion estimation. 
To address these issues, we present PeLiCal, a\nnovel line-based calibration approach for RGB-D camera systems exhibiting\nlimited overlap. Our method leverages long line features from surroundings, and\nfilters out outliers with a novel convergence voting algorithm, achieving\ntargetless, real-time, and outlier-robust performance compared to existing\nmethods. We open source our implementation on\n\\url{https://github.com/joomeok/PeLiCal.git}.\n","authors":["Jaeho Shin","Seungsang Yun","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.13949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07762v3","updated":"2024-04-22T07:48:26Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/atonderski/neuro-ncap\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v2","updated":"2024-04-22T07:45:18Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of OCR technology and the expansion of\napplication fields, text recognition in complex scenes has become a key\nchallenge. Factors such as multiple fonts, mixed scenes and complex layouts\nseriously affect the recognition accuracy of traditional OCR models. Although\nOCR models based on deep learning have performed well in specific fields or\nsimilar datasets in recent years, the generalization ability and robustness of\nthe model are still a big challenge when facing complex environments with\nmultiple scenes. Furthermore, training an OCR model from scratch or fine-tuning\nall parameters is very demanding on computing resources and inference time,\nwhich limits the flexibility of its application. This study focuses on a\nfundamental aspect of mixed text recognition in response to the challenges\nmentioned above, which involves effectively fine-tuning the pre-trained basic\nOCR model to demonstrate exceptional performance across various downstream\ntasks. To this end, we propose a parameter-efficient mixed text recognition\nmethod based on pre-trained OCR Transformer, namely DLoRA-TrOCR. 
This method\nembeds DoRA into the image encoder and LoRA into the internal structure of the\ntext decoder, enabling efficient parameter fine-tuning for downstream tasks.\nExperimental results show that compared to similar parameter adjustment\nmethods, our model DLoRA-TrOCR has the smallest number of parameters and\nperforms better. It can achieve state-of-the-art performance on complex scene\ndatasets involving simultaneous recognition of mixed handwritten, printed and\nstreet view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13947v1","updated":"2024-04-22T07:44:20Z","published":"2024-04-22T07:44:20Z","title":"Boter: Bootstrapping Knowledge Selection and Question Answering for\n Knowledge-based VQA","summary":" Knowledge-based Visual Question Answering (VQA) requires models to\nincorporate external knowledge to respond to questions about visual content.\nPrevious methods mostly follow the \"retrieve and generate\" paradigm. Initially,\nthey utilize a pre-trained retriever to fetch relevant knowledge documents,\nsubsequently employing them to generate answers. While these methods have\ndemonstrated commendable performance in the task, they possess limitations: (1)\nthey employ an independent retriever to acquire knowledge solely based on the\nsimilarity between the query and knowledge embeddings, without assessing\nwhether the knowledge document is truly conducive to helping answer the\nquestion; (2) they convert the image into text and then conduct retrieval and\nanswering in natural language space, which may not ensure comprehensive\nacquisition of all image information. To address these limitations, we propose\nBoter, a novel framework designed to bootstrap knowledge selection and question\nanswering by leveraging the robust multimodal perception capabilities of the\nMultimodal Large Language Model (MLLM). The framework consists of two modules:\nSelector and Answerer, where both are initialized by the MLLM and\nparameter-efficiently finetuned in a simple cycle: find key knowledge in the\nretrieved knowledge documents using the Selector, and then use them to finetune\nthe Answerer to predict answers; obtain the pseudo-labels of key knowledge\ndocuments based on the predictions of the Answerer and weak supervision labels,\nand then finetune the Selector to select key knowledge; repeat. Our framework\nsignificantly enhances the performance of the baseline on the challenging\nopen-domain Knowledge-based VQA benchmark, OK-VQA, achieving a state-of-the-art\naccuracy of 62.83%.\n","authors":["Dongze Hao","Qunbo Wang","Longteng Guo","Jie Jiang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05817v2","updated":"2024-04-22T07:43:09Z","published":"2024-03-09T06:48:19Z","title":"SAFDNet: A Simple and Effective Network for Fully Sparse 3D Object\n Detection","summary":" LiDAR-based 3D object detection plays an essential role in autonomous\ndriving. Existing high-performing 3D object detectors usually build dense\nfeature maps in the backbone network and prediction head. However, the\ncomputational costs introduced by the dense feature maps grow quadratically as\nthe perception range increases, making these models hard to scale up to\nlong-range detection. 
Some recent works have attempted to construct fully\nsparse detectors to solve this issue; nevertheless, the resulting models either\nrely on a complex multi-stage pipeline or exhibit inferior performance. In this\nwork, we propose SAFDNet, a straightforward yet highly effective architecture,\ntailored for fully sparse 3D object detection. In SAFDNet, an adaptive feature\ndiffusion strategy is designed to address the center feature missing problem.\nWe conducted extensive experiments on Waymo Open, nuScenes, and Argoverse2\ndatasets. SAFDNet performed slightly better than the previous SOTA on the first\ntwo datasets but much better on the last dataset, which features long-range\ndetection, verifying the efficacy of SAFDNet in scenarios where long-range\ndetection is required. Notably, on Argoverse2, SAFDNet surpassed the previous\nbest hybrid detector HEDNet by 2.6% mAP while being 2.1x faster, and yielded\n2.1% mAP gains over the previous best sparse detector FSDv2 while being 1.3x\nfaster. The code will be available at https://github.com/zhanggang001/HEDNet.\n","authors":["Gang Zhang","Junnan Chen","Guohuan Gao","Jianmin Li","Si Liu","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2403.05817v2.pdf","comment":"Accepted by CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.13944v1","updated":"2024-04-22T07:40:53Z","published":"2024-04-22T07:40:53Z","title":"Gorgeous: Create Your Desired Character Facial Makeup from Any Ideas","summary":" Contemporary makeup transfer methods primarily focus on replicating makeup\nfrom one face to another, considerably limiting their use in creating diverse\nand creative character makeup essential for visual storytelling. Such methods\ntypically fail to address the need for uniqueness and contextual relevance,\nspecifically aligning with character and story settings as they depend heavily\non existing facial makeup in reference images. This approach also presents a\nsignificant challenge when attempting to source a perfectly matched facial\nmakeup style, further complicating the creation of makeup designs inspired by\nvarious story elements, such as theme, background, and props that do not\nnecessarily feature faces. To address these limitations, we introduce\n$Gorgeous$, a novel diffusion-based makeup application method that goes beyond\nsimple transfer by innovatively crafting unique and thematic facial makeup.\nUnlike traditional methods, $Gorgeous$ does not require the presence of a face\nin the reference images. Instead, it draws artistic inspiration from a minimal\nset of three to five images, which can be of any type, and transforms these\nelements into practical makeup applications directly on the face. Our\ncomprehensive experiments demonstrate that $Gorgeous$ can effectively generate\ndistinctive character facial makeup inspired by the chosen thematic reference\nimages. 
This approach opens up new possibilities for integrating broader story\nelements into character makeup, thereby enhancing the narrative depth and\nvisual impact in storytelling.\n","authors":["Jia Wei Sii","Chee Seng Chan"],"pdf_url":"https://arxiv.org/pdf/2404.13944v1.pdf","comment":"Project page: https://github.com/JiaWeiSii/gorgeous/"},{"id":"http://arxiv.org/abs/2308.04956v2","updated":"2024-04-22T07:22:39Z","published":"2023-08-09T13:41:30Z","title":"Improved cryo-EM Pose Estimation and 3D Classification through\n Latent-Space Disentanglement","summary":" Due to the extremely low signal-to-noise ratio (SNR) and unknown poses\n(projection angles and image shifts) in cryo-electron microscopy (cryo-EM)\nexperiments, reconstructing 3D volumes from 2D images is very challenging. In\naddition to these challenges, heterogeneous cryo-EM reconstruction requires\nconformational classification. In popular cryo-EM reconstruction algorithms,\nposes and conformation classification labels must be predicted for every input\ncryo-EM image, which can be computationally costly for large datasets. An\nemerging class of methods adopted the amortized inference approach. In these\nmethods, only a subset of the input dataset is needed to train neural networks\nfor the estimation of poses and conformations. Once trained, these neural\nnetworks can make pose/conformation predictions and 3D reconstructions at low\ncost for the entire dataset during inference. Unfortunately, when facing\nheterogeneous reconstruction tasks, it is hard for current\namortized-inference-based methods to effectively estimate the conformational\ndistribution and poses from entangled latent variables. Here, we propose a\nself-supervised variational autoencoder architecture called \"HetACUMN\" based on\namortized inference. We employed an auxiliary conditional pose prediction task\nby inverting the order of encoder-decoder to explicitly enforce the\ndisentanglement of conformation and pose predictions. Results on simulated\ndatasets show that HetACUMN generated more accurate conformational\nclassifications than other amortized or non-amortized methods. Furthermore, we\nshow that HetACUMN is capable of performing heterogeneous 3D reconstructions of\na real experimental dataset.\n","authors":["Weijie Chen","Yuhang Wang","Lin Yao"],"pdf_url":"https://arxiv.org/pdf/2308.04956v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.13929v1","updated":"2024-04-22T07:08:13Z","published":"2024-04-22T07:08:13Z","title":"Exploring Kinetic Curves Features for the Classification of Benign and\n Malignant Breast Lesions in DCE-MRI","summary":" Breast cancer is the most common malignant tumor among women and the second\ncause of cancer-related death. Early diagnosis in clinical practice is crucial\nfor timely treatment and prognosis. Dynamic contrast-enhanced magnetic\nresonance imaging (DCE-MRI) has revealed great usability in the preoperative\ndiagnosis and assessing therapy effects thanks to its capability to reflect the\nmorphology and dynamic characteristics of breast lesions. However, most\nexisting computer-assisted diagnosis algorithms only consider conventional\nradiomic features when classifying benign and malignant lesions in DCE-MRI. In\nthis study, we propose to fully leverage the dynamic characteristics from the\nkinetic curves as well as the radiomic features to boost the classification\naccuracy of benign and malignant breast lesions. 
The proposed method is a fully\nautomated solution by directly analyzing the 3D features from the DCE-MRI. The\nproposed method is evaluated on an in-house dataset including 200 DCE-MRI scans\nwith 298 breast tumors (172 benign and 126 malignant tumors), achieving\nfavorable classification accuracy with an area under curve (AUC) of 0.94. By\nsimultaneously considering the dynamic and radiomic features, it is beneficial\nto effectively distinguish between benign and malignant breast lesions.\n","authors":["Zixian Li","Yuming Zhong","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13929v1.pdf","comment":"6 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2404.13923v1","updated":"2024-04-22T07:00:17Z","published":"2024-04-22T07:00:17Z","title":"MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets","summary":" Driven by powerful image diffusion models, recent research has achieved the\nautomatic creation of 3D objects from textual or visual guidance. By performing\nscore distillation sampling (SDS) iteratively across different views, these\nmethods succeed in lifting 2D generative prior to the 3D space. However, such a\n2D generative image prior bakes the effect of illumination and shadow into the\ntexture. As a result, material maps optimized by SDS inevitably involve\nspurious correlated components. The absence of precise material definition\nmakes it infeasible to relight the generated assets reasonably in novel scenes,\nwhich limits their application in downstream scenarios. In contrast, humans can\neffortlessly circumvent this ambiguity by deducing the material of the object\nfrom its appearance and semantics. Motivated by this insight, we propose\nMaterialSeg3D, a 3D asset material generation framework to infer underlying\nmaterial from the 2D semantic prior. Based on such a prior model, we devise a\nmechanism to parse material in 3D space. We maintain a UV stack, each map of\nwhich is unprojected from a specific viewpoint. After traversing all\nviewpoints, we fuse the stack through a weighted voting scheme and then employ\nregion unification to ensure the coherence of the object parts. To fuel the\nlearning of semantics prior, we collect a material dataset, named Materialized\nIndividual Objects (MIO), which features abundant images, diverse categories,\nand accurate annotations. Extensive quantitative and qualitative experiments\ndemonstrate the effectiveness of our method.\n","authors":["Zeyu Li","Ruitong Gan","Chuanchen Luo","Yuxi Wang","Jiaheng Liu","Ziwei Zhu Man Zhang","Qing Li","Xucheng Yin","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13921v1","updated":"2024-04-22T06:59:03Z","published":"2024-04-22T06:59:03Z","title":"NeRF-DetS: Enhancing Multi-View 3D Object Detection with\n Sampling-adaptive Network of Continuous NeRF-based Representation","summary":" As a preliminary work, NeRF-Det unifies the tasks of novel view synthesis and\n3D perception, demonstrating that perceptual tasks can benefit from novel view\nsynthesis methods like NeRF, significantly improving the performance of indoor\nmulti-view 3D object detection. Using the geometry MLP of NeRF to direct the\nattention of detection head to crucial parts and incorporating self-supervised\nloss from novel view rendering contribute to the achieved improvement. 
To\nbetter leverage the notable advantages of the continuous representation through\nneural rendering in space, we introduce a novel 3D perception network\nstructure, NeRF-DetS. The key component of NeRF-DetS is the Multi-level\nSampling-Adaptive Network, making the sampling process adaptively from coarse\nto fine. Also, we propose a superior multi-view information fusion method,\nknown as Multi-head Weighted Fusion. This fusion approach efficiently addresses\nthe challenge of losing multi-view information when using arithmetic mean,\nwhile keeping low computational costs. NeRF-DetS outperforms competitive\nNeRF-Det on the ScanNetV2 dataset, by achieving +5.02% and +5.92% improvement\nin mAP@.25 and mAP@.50, respectively.\n","authors":["Chi Huang","Xinyang Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.13921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03537v2","updated":"2024-04-22T23:15:32Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v2.pdf","comment":"Accepted as full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2401.08396v3","updated":"2024-04-22T23:04:41Z","published":"2024-01-16T14:41:20Z","title":"Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine","summary":" Recent studies indicate that Generative Pre-trained Transformer 4 with Vision\n(GPT-4V) outperforms human physicians in medical challenge tasks. However,\nthese evaluations primarily focused on the accuracy of multi-choice questions\nalone. Our study extends the current scope by conducting a comprehensive\nanalysis of GPT-4V's rationales of image comprehension, recall of medical\nknowledge, and step-by-step multimodal reasoning when solving New England\nJournal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test\nthe knowledge and diagnostic capabilities of medical professionals. Evaluation\nresults confirmed that GPT-4V performs comparatively to human physicians\nregarding multi-choice accuracy (81.6% vs. 77.8%). 
GPT-4V also performs well in\ncases where physicians incorrectly answer, with over 78% accuracy. However, we\ndiscovered that GPT-4V frequently presents flawed rationales in cases where it\nmakes the correct final choices (35.5%), most prominent in image comprehension\n(27.2%). Regardless of GPT-4V's high accuracy in multi-choice questions, our\nfindings emphasize the necessity for further in-depth evaluations of its\nrationales before integrating such multimodal AI models into clinical\nworkflows.\n","authors":["Qiao Jin","Fangyuan Chen","Yiliang Zhou","Ziyang Xu","Justin M. Cheung","Robert Chen","Ronald M. Summers","Justin F. Rousseau","Peiyun Ni","Marc J Landsman","Sally L. Baxter","Subhi J. Al'Aref","Yijia Li","Alex Chen","Josef A. Brejt","Michael F. Chiang","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08396v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2303.06797v3","updated":"2024-04-22T22:39:12Z","published":"2023-03-13T01:07:32Z","title":"Multichannel Orthogonal Transform-Based Perceptron Layers for Efficient\n ResNets","summary":" In this paper, we propose a set of transform-based neural network layers as\nan alternative to the $3\\times3$ Conv2D layers in Convolutional Neural Networks\n(CNNs). The proposed layers can be implemented based on orthogonal transforms\nsuch as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and\nbiorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of\nthe convolution theorems, convolutional filtering operations are performed in\nthe transform domain using element-wise multiplications. Trainable\nsoft-thresholding layers, that remove noise in the transform domain, bring\nnonlinearity to the transform domain layers. Compared to the Conv2D layer,\nwhich is spatial-agnostic and channel-specific, the proposed layers are\nlocation-specific and channel-specific. Moreover, these proposed layers reduce\nthe number of parameters and multiplications significantly while improving the\naccuracy results of regular ResNets on the ImageNet-1K classification task.\nFurthermore, they can be inserted with a batch normalization layer before the\nglobal average pooling layer in the conventional ResNets as an additional layer\nto improve classification accuracy.\n","authors":["Hongyi Pan","Emadeldeen Hamdan","Xin Zhu","Salih Atici","Ahmet Enis Cetin"],"pdf_url":"https://arxiv.org/pdf/2303.06797v3.pdf","comment":"This work is accepted to IEEE Transactions on Neural Networks and\n Learning Systems. The initial title is \"Orthogonal Transform Domain\n Approaches for the Convolutional Layer\". We changed it to \"Multichannel\n Orthogonal Transform-Based Perceptron Layers for Efficient ResNets\" based on\n reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577"},{"id":"http://arxiv.org/abs/2404.14606v1","updated":"2024-04-22T22:02:19Z","published":"2024-04-22T22:02:19Z","title":"Cross-Task Multi-Branch Vision Transformer for Facial Expression and\n Mask Wearing Classification","summary":" With wearing masks becoming a new cultural norm, facial expression\nrecognition (FER) while taking masks into account has become a significant\nchallenge. In this paper, we propose a unified multi-branch vision transformer\nfor facial expression recognition and mask wearing classification tasks. Our\napproach extracts shared features for both tasks using a dual-branch\narchitecture that obtains multi-scale feature representations. 
Furthermore, we\npropose a cross-task fusion phase that processes tokens for each task with\nseparate branches, while exchanging information using a cross attention module.\nOur proposed framework reduces the overall complexity compared with using\nseparate networks for both tasks by the simple yet effective cross-task fusion\nphase. Extensive experiments demonstrate that our proposed model performs\nbetter than or on par with different state-of-the-art methods on both facial\nexpression recognition and facial mask wearing classification task.\n","authors":["Armando Zhu","Keqin Li","Tong Wu","Peng Zhao","Wenjing Zhou","Bo Hong"],"pdf_url":"https://arxiv.org/pdf/2404.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14588v1","updated":"2024-04-22T21:30:11Z","published":"2024-04-22T21:30:11Z","title":"Brain-Inspired Continual Learning-Robust Feature Distillation and\n Re-Consolidation for Class Incremental Learning","summary":" Artificial intelligence (AI) and neuroscience share a rich history, with\nadvancements in neuroscience shaping the development of AI systems capable of\nhuman-like knowledge retention. Leveraging insights from neuroscience and\nexisting research in adversarial and continual learning, we introduce a novel\nframework comprising two core concepts: feature distillation and\nre-consolidation. Our framework, named Robust Rehearsal, addresses the\nchallenge of catastrophic forgetting inherent in continual learning (CL)\nsystems by distilling and rehearsing robust features. Inspired by the mammalian\nbrain's memory consolidation process, Robust Rehearsal aims to emulate the\nrehearsal of distilled experiences during learning tasks. Additionally, it\nmimics memory re-consolidation, where new experiences influence the integration\nof past experiences to mitigate forgetting. Extensive experiments conducted on\nCIFAR10, CIFAR100, and real-world helicopter attitude datasets showcase the\nsuperior performance of CL models trained with Robust Rehearsal compared to\nbaseline methods. Furthermore, examining different optimization training\nobjectives-joint, continual, and adversarial learning-we highlight the crucial\nrole of feature learning in model performance. This underscores the\nsignificance of rehearsing CL-robust samples in mitigating catastrophic\nforgetting. In conclusion, aligning CL approaches with neuroscience insights\noffers promising solutions to the challenge of catastrophic forgetting, paving\nthe way for more robust and human-like AI systems.\n","authors":["Hikmat Khan","Nidhal Carla Bouaynaya","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2404.14588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12459v3","updated":"2024-04-22T21:28:17Z","published":"2024-03-19T05:30:50Z","title":"Non-negative Contrastive Learning","summary":" Deep representations have shown promising performance when transferred to\ndownstream tasks in a black-box manner. Yet, their inherent lack of\ninterpretability remains a significant challenge, as these features are often\nopaque to human understanding. In this paper, we propose Non-negative\nContrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization\n(NMF) aimed at deriving interpretable features. The power of NCL lies in its\nenforcement of non-negativity constraints on features, reminiscent of NMF's\ncapability to extract features that align closely with sample clusters. 
NCL not\nonly aligns mathematically well with an NMF objective but also preserves NMF's\ninterpretability attributes, resulting in a more sparse and disentangled\nrepresentation compared to standard contrastive learning (CL). Theoretically,\nwe establish guarantees on the identifiability and downstream generalization of\nNCL. Empirically, we show that these advantages enable NCL to outperform CL\nsignificantly on feature disentanglement, feature selection, as well as\ndownstream classification tasks. At last, we show that NCL can be easily\nextended to other learning scenarios and benefit supervised learning as well.\nCode is available at https://github.com/PKU-ML/non_neg.\n","authors":["Yifei Wang","Qi Zhang","Yaoyu Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12459v3.pdf","comment":"22 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.14581v1","updated":"2024-04-22T21:00:13Z","published":"2024-04-22T21:00:13Z","title":"The Adversarial AI-Art: Understanding, Generation, Detection, and\n Benchmarking","summary":" Generative AI models can produce high-quality images based on text prompts.\nThe generated images often appear indistinguishable from images generated by\nconventional optical photography devices or created by human artists (i.e.,\nreal images). While the outstanding performance of such generative models is\ngenerally well received, security concerns arise. For instance, such image\ngenerators could be used to facilitate fraud or scam schemes, generate and\nspread misinformation, or produce fabricated artworks. In this paper, we\npresent a systematic attempt at understanding and detecting AI-generated images\n(AI-art) in adversarial scenarios. First, we collect and share a dataset of\nreal images and their corresponding artificial counterparts generated by four\npopular AI image generators. The dataset, named ARIA, contains over 140K images\nin five categories: artworks (painting), social media images, news photos,\ndisaster scenes, and anime pictures. This dataset can be used as a foundation\nto support future research on adversarial AI-art. Next, we present a user study\nthat employs the ARIA dataset to evaluate if real-world users can distinguish\nwith or without reference images. In a benchmarking study, we further evaluate\nif state-of-the-art open-source and commercial AI image detectors can\neffectively identify the images in the ARIA dataset. Finally, we present a\nResNet-50 classifier and evaluate its accuracy and transferability on the ARIA\ndataset.\n","authors":["Yuying Li","Zeyan Liu","Junyi Zhao","Liangqin Ren","Fengjun Li","Jiebo Luo","Bo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.14581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08974v2","updated":"2024-04-22T20:58:52Z","published":"2024-03-13T21:43:24Z","title":"Representing Anatomical Trees by Denoising Diffusion of Implicit Neural\n Fields","summary":" Anatomical trees play a central role in clinical diagnosis and treatment\nplanning. However, accurately representing anatomical trees is challenging due\nto their varying and complex topology and geometry. Traditional methods for\nrepresenting tree structures, captured using medical imaging, while invaluable\nfor visualizing vascular and bronchial networks, exhibit drawbacks in terms of\nlimited resolution, flexibility, and efficiency. Recently, implicit neural\nrepresentations (INRs) have emerged as a powerful tool for representing shapes\naccurately and efficiently. 
We propose a novel approach for representing\nanatomical trees using INR, while also capturing the distribution of a set of\ntrees via denoising diffusion in the space of INRs. We accurately capture the\nintricate geometries and topologies of anatomical trees at any desired\nresolution. Through extensive qualitative and quantitative evaluation, we\ndemonstrate high-fidelity tree reconstruction with arbitrary resolution yet\ncompact storage, and versatility across anatomical sites and tree complexities.\n","authors":["Ashish Sinha","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2403.08974v2.pdf","comment":"Preprint. In review. Code: https://github.com/sinAshish/TreeDiffusion"},{"id":"http://arxiv.org/abs/2311.00259v2","updated":"2024-04-22T20:43:55Z","published":"2023-11-01T03:15:10Z","title":"Solutions to Elliptic and Parabolic Problems via Finite Difference Based\n Unsupervised Small Linear Convolutional Neural Networks","summary":" In recent years, there has been a growing interest in leveraging deep\nlearning and neural networks to address scientific problems, particularly in\nsolving partial differential equations (PDEs). However, many neural\nnetwork-based methods like PINNs rely on auto differentiation and sampling\ncollocation points, leading to a lack of interpretability and lower accuracy\nthan traditional numerical methods. As a result, we propose a fully\nunsupervised approach, requiring no training data, to estimate finite\ndifference solutions for PDEs directly via small linear convolutional neural\nnetworks. Our proposed approach uses substantially fewer parameters than\nsimilar finite difference-based approaches while also demonstrating comparable\naccuracy to the true solution for several selected elliptic and parabolic\nproblems compared to the finite difference method.\n","authors":["Adrian Celaya","Keegan Kirk","David Fuentes","Beatrice Riviere"],"pdf_url":"https://arxiv.org/pdf/2311.00259v2.pdf","comment":"Submitted to CMA, under review"},{"id":"http://arxiv.org/abs/2312.01117v3","updated":"2024-04-22T20:38:05Z","published":"2023-12-02T12:23:07Z","title":"Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by\n Factoring the Real World","summary":" To achieve strong real world performance, neural networks must be trained on\nlarge, diverse datasets; however, obtaining and annotating such datasets is\ncostly and time-consuming, particularly for 3D point clouds. In this paper, we\ndescribe Paved2Paradise, a simple, cost-effective approach for generating fully\nlabeled, diverse, and realistic lidar datasets from scratch, all while\nrequiring minimal human annotation. Our key insight is that, by deliberately\ncollecting separate \"background\" and \"object\" datasets (i.e., \"factoring the\nreal world\"), we can intelligently combine them to produce a combinatorially\nlarge and diverse training set. The Paved2Paradise pipeline thus consists of\nfour steps: (1) collecting copious background data, (2) recording individuals\nfrom the desired object class(es) performing different behaviors in an isolated\nenvironment (like a parking lot), (3) bootstrapping labels for the object\ndataset, and (4) generating samples by placing objects at arbitrary locations\nin backgrounds. To demonstrate the utility of Paved2Paradise, we generated\nsynthetic datasets for two tasks: (1) human detection in orchards (a task for\nwhich no public data exists) and (2) pedestrian detection in urban\nenvironments. 
Qualitatively, we find that a model trained exclusively on\nPaved2Paradise synthetic data is highly effective at detecting humans in\norchards, including when individuals are heavily occluded by tree branches.\nQuantitatively, a model trained on Paved2Paradise data that sources backgrounds\nfrom KITTI performs comparably to a model trained on the actual dataset. These\nresults suggest the Paved2Paradise synthetic data pipeline can help accelerate\npoint cloud model development in sectors where acquiring lidar datasets has\npreviously been cost-prohibitive.\n","authors":["Michael A. Alcorn","Noah Schwartz"],"pdf_url":"https://arxiv.org/pdf/2312.01117v3.pdf","comment":"Accepted to the Synthetic Data for Computer Vision workshop at CVPR\n 2024"},{"id":"http://arxiv.org/abs/2402.13251v2","updated":"2024-04-22T20:35:38Z","published":"2024-02-20T18:59:00Z","title":"FlashTex: Fast Relightable Mesh Texturing with LightControlNet","summary":" Manually creating textures for 3D meshes is time-consuming, even for expert\nvisual content creators. We propose a fast approach for automatically texturing\nan input 3D mesh based on a user-provided text prompt. Importantly, our\napproach disentangles lighting from surface material/reflectance in the\nresulting texture so that the mesh can be properly relit and rendered in any\nlighting environment. We introduce LightControlNet, a new text-to-image model\nbased on the ControlNet architecture, which allows the specification of the\ndesired lighting as a conditioning image to the model. Our text-to-texture\npipeline then constructs the texture in two stages. The first stage produces a\nsparse set of visually consistent reference views of the mesh using\nLightControlNet. The second stage applies a texture optimization based on Score\nDistillation Sampling (SDS) that works with LightControlNet to increase the\ntexture quality while disentangling surface material from lighting. Our\nalgorithm is significantly faster than previous text-to-texture methods, while\nproducing high-quality and relightable textures.\n","authors":["Kangle Deng","Timothy Omernick","Alexander Weiss","Deva Ramanan","Jun-Yan Zhu","Tinghui Zhou","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2402.13251v2.pdf","comment":"Project page: https://flashtex.github.io/"},{"id":"http://arxiv.org/abs/2404.14568v1","updated":"2024-04-22T20:30:45Z","published":"2024-04-22T20:30:45Z","title":"UVMap-ID: A Controllable and Personalized UV Map Generative Model","summary":" Recently, diffusion models have made significant strides in synthesizing\nrealistic 2D human images based on provided text prompts. Building upon this,\nresearchers have extended 2D text-to-image diffusion models into the 3D domain\nfor generating human textures (UV Maps). However, some important problems about\nUV Map Generative models are still not solved, i.e., how to generate\npersonalized texture maps for any given face image, and how to define and\nevaluate the quality of these generated texture maps. To solve the above\nproblems, we introduce a novel method, UVMap-ID, which is a controllable and\npersonalized UV Map generative model. Unlike traditional large-scale training\nmethods in 2D, we propose to fine-tune a pre-trained text-to-image diffusion\nmodel which is integrated with a face fusion module for achieving ID-driven\ncustomized generation. To support the finetuning strategy, we introduce a\nsmall-scale attribute-balanced training dataset, including high-quality\ntextures with labeled text and Face ID. 
Additionally, we introduce some metrics\nto evaluate the multiple aspects of the textures. Finally, both quantitative\nand qualitative analyses demonstrate the effectiveness of our method in\ncontrollable and personalized UV Map generation. Code is publicly available via\nhttps://github.com/twowwj/UVMap-ID.\n","authors":["Weijie Wang","Jichao Zhang","Chang Liu","Xia Li","Xingqian Xu","Humphrey Shi","Nicu Sebe","Bruno Lepri"],"pdf_url":"https://arxiv.org/pdf/2404.14568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14565v1","updated":"2024-04-22T20:21:32Z","published":"2024-04-22T20:21:32Z","title":"\"Where am I?\" Scene Retrieval with Language","summary":" Natural language interfaces to embodied AI are becoming more ubiquitous in\nour daily lives. This opens further opportunities for language-based\ninteraction with embodied agents, such as a user instructing an agent to\nexecute some task in a specific location. For example, \"put the bowls back in\nthe cupboard next to the fridge\" or \"meet me at the intersection under the red\nsign.\" As such, we need methods that interface between natural language and map\nrepresentations of the environment. To this end, we explore the question of\nwhether we can use an open-set natural language query to identify a scene\nrepresented by a 3D scene graph. We define this task as \"language-based\nscene-retrieval\" and it is closely related to \"coarse-localization,\" but we are\ninstead searching for a match from a collection of disjoint scenes and not\nnecessarily a large-scale continuous map. Therefore, we present\nText2SceneGraphMatcher, a \"scene-retrieval\" pipeline that learns joint\nembeddings between text descriptions and scene graphs to determine if they are\nmatched. The code, trained models, and datasets will be made public.\n","authors":["Jiaqi Chen","Daniel Barath","Iro Armeni","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2404.14565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14560v1","updated":"2024-04-22T20:15:43Z","published":"2024-04-22T20:15:43Z","title":"Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced\n Analysis of Kidney Abnormalities in CT Scan Images using ensemble based\n Machine Learning Approach","summary":" The shortage of nephrologists and the growing public health concern over\nrenal failure have spurred the demand for AI systems capable of autonomously\ndetecting kidney abnormalities. Renal failure, marked by a gradual decline in\nkidney function, can result from factors like cysts, stones, and tumors.\nChronic kidney disease may go unnoticed initially, leading to untreated cases\nuntil they reach an advanced stage. The dataset, comprising 12,427 images from\nmultiple hospitals in Dhaka, was categorized into four groups: cyst, tumor,\nstone, and normal. Our methodology aims to enhance CT scan image quality using\nCropping, Resizing, and CALHE techniques, followed by feature extraction with\nour proposed Adaptive Local Binary Pattern (A-LBP) feature extraction method\ncompared with the state-of-the-art local binary pattern (LBP) method. Our\nproposed features fed into classifiers such as Random Forest, Decision Tree,\nNaive Bayes, K-Nearest Neighbor, and SVM. We explored an ensemble model with\nsoft voting to get a more robust model for our task. 
We achieved the highest accuracy of more\nthan 99% using our feature descriptor and an ensemble of five\nclassifiers (Random Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor,\nSupport Vector Machine) with the soft voting method.\n","authors":["Tahmim Hossain","Faisal Sayed","Solehin Islam"],"pdf_url":"https://arxiv.org/pdf/2404.14560v1.pdf","comment":"17 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2312.09067v2","updated":"2024-04-22T20:06:03Z","published":"2023-12-14T16:04:14Z","title":"Holodeck: Language Guided Generation of 3D Embodied AI Environments","summary":" 3D simulated environments play a critical role in Embodied AI, but their\ncreation requires expertise and extensive manual effort, restricting their\ndiversity and scope. To mitigate this limitation, we present Holodeck, a system\nthat generates 3D environments to match a user-supplied prompt fully\nautomatically. Holodeck can generate diverse scenes, e.g., arcades, spas, and\nmuseums, adjust the designs for styles, and can capture the semantics of\ncomplex queries such as \"apartment for a researcher with a cat\" and \"office of\na professor who is a fan of Star Wars\". Holodeck leverages a large language\nmodel (i.e., GPT-4) for common sense knowledge about what the scene might look\nlike and uses a large collection of 3D assets from Objaverse to populate the\nscene with diverse objects. To address the challenge of positioning objects\ncorrectly, we prompt GPT-4 to generate spatial relational constraints between\nobjects and then optimize the layout to satisfy those constraints. Our\nlarge-scale human evaluation shows that annotators prefer Holodeck over\nmanually designed procedural baselines in residential scenes and that Holodeck\ncan produce high-quality outputs for diverse scene types. We also demonstrate\nan exciting application of Holodeck in Embodied AI, training agents to navigate\nin novel scenes like music rooms and daycares without human-constructed data,\nwhich is a significant step forward in developing general-purpose embodied\nagents.\n","authors":["Yue Yang","Fan-Yun Sun","Luca Weihs","Eli VanderBilt","Alvaro Herrasti","Winson Han","Jiajun Wu","Nick Haber","Ranjay Krishna","Lingjie Liu","Chris Callison-Burch","Mark Yatskar","Aniruddha Kembhavi","Christopher Clark"],"pdf_url":"https://arxiv.org/pdf/2312.09067v2.pdf","comment":"Published in CVPR 2024, 21 pages, 27 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.14542v1","updated":"2024-04-22T19:29:12Z","published":"2024-04-22T19:29:12Z","title":"UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater\n Video Enhancement","summary":" Learning-based underwater image enhancement (UIE) methods have made great\nprogress. However, the lack of large-scale and high-quality paired training\nsamples has become the main bottleneck hindering the development of UIE. The\ninter-frame information in underwater videos can accelerate or optimize the UIE\nprocess. Thus, we constructed the first large-scale high-resolution underwater\nvideo enhancement benchmark (UVEB) to promote the development of underwater\nvision. It contains 1,308 pairs of video sequences and more than 453,000\nhigh-resolution frame pairs, 38\% of which are Ultra-High-Definition (UHD) 4K. UVEB\ncomes from multiple countries, containing various scenes and video degradation\ntypes to adapt to diverse and complex underwater environments. We also propose\nthe first supervised underwater video enhancement method, UVE-Net. 
UVE-Net\nconverts the current frame information into convolutional kernels and passes\nthem to adjacent frames for efficient inter-frame information exchange. By\nfully utilizing the redundant degraded information of underwater videos,\nUVE-Net completes video enhancement better. Experiments show the effective\nnetwork design and good performance of UVE-Net.\n","authors":["Yaofeng Xie","Lingwei Kong","Kai Chen","Ziqiang Zheng","Xiao Yu","Zhibin Yu","Bing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14542v1.pdf","comment":"10 pages,CVPR2024 accept"},{"id":"http://arxiv.org/abs/2403.08755v2","updated":"2024-04-22T19:17:49Z","published":"2024-03-13T17:53:47Z","title":"DAM: Dynamic Adapter Merging for Continual Video QA Learning","summary":" We present a parameter-efficient method for continual video\nquestion-answering (VidQA) learning. Our method, named DAM, uses the proposed\nDynamic Adapter Merging to (i) mitigate catastrophic forgetting, (ii) enable\nefficient adaptation to continually arriving datasets, (iii) handle inputs from\nunknown datasets during inference, and (iv) enable knowledge sharing across\nsimilar dataset domains. Given a set of continually streaming VidQA datasets,\nwe sequentially train dataset-specific adapters for each dataset while freezing\nthe parameters of a large pretrained video-language backbone. During inference,\ngiven a video-question sample from an unknown domain, our method first uses the\nproposed non-parametric router function to compute a probability for each\nadapter, reflecting how relevant that adapter is to the current video-question\ninput instance. Subsequently, the proposed dynamic adapter merging scheme\naggregates all the adapter weights into a new adapter instance tailored for\nthat particular test sample to compute the final VidQA prediction, mitigating\nthe impact of inaccurate router predictions and facilitating knowledge sharing\nacross domains. Our DAM model outperforms prior state-of-the-art continual\nlearning approaches by 9.1% while exhibiting 1.9% less forgetting on 6 VidQA\ndatasets spanning various domains. We further extend DAM to continual image\nclassification and image QA and outperform prior methods by a large margin. The\ncode is publicly available at: https://github.com/klauscc/DAM\n","authors":["Feng Cheng","Ziyang Wang","Yi-Lin Sung","Yan-Bo Lin","Mohit Bansal","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2403.08755v2.pdf","comment":"The first two authors contribute equally"},{"id":"http://arxiv.org/abs/2404.14533v1","updated":"2024-04-22T19:01:18Z","published":"2024-04-22T19:01:18Z","title":"SwinFuSR: an image fusion-inspired model for RGB-guided thermal image\n super-resolution","summary":" Thermal imaging plays a crucial role in various applications, but the\ninherent low resolution of commonly available infrared (IR) cameras limits its\neffectiveness. Conventional super-resolution (SR) methods often struggle with\nthermal images due to their lack of high-frequency details. Guided SR leverages\ninformation from a high-resolution image, typically in the visible spectrum, to\nenhance the reconstruction of a high-res IR image from the low-res input.\nInspired by SwinFusion, we propose SwinFuSR, a guided SR architecture based on\nSwin transformers. In real world scenarios, however, the guiding modality (e.g.\nRBG image) may be missing, so we propose a training method that improves the\nrobustness of the model in this case. 
Our method has few parameters and\noutperforms state of the art models in terms of Peak Signal to Noise Ratio\n(PSNR) and Structural SIMilarity (SSIM). In Track 2 of the PBVS 2024 Thermal\nImage Super-Resolution Challenge, it achieves 3rd place in the PSNR metric. Our\ncode and pretained weights are available at\nhttps://github.com/VisionICLab/SwinFuSR.\n","authors":["Cyprien Arnold","Philippe Jouvet","Lama Seoud"],"pdf_url":"https://arxiv.org/pdf/2404.14533v1.pdf","comment":"Accepted at 20th IEEE Workshop on Perception Beyond the Visible\n Spectrum, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11593v2","updated":"2024-04-22T18:21:24Z","published":"2024-04-17T17:45:08Z","title":"IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under\n Unknown Illumination","summary":" This paper aims to recover object materials from posed images captured under\nan unknown static lighting condition. Recent methods solve this task by\noptimizing material parameters through differentiable physically based\nrendering. However, due to the coupling between object geometry, materials, and\nenvironment lighting, there is inherent ambiguity during the inverse rendering\nprocess, preventing previous methods from obtaining accurate results. To\novercome this ill-posed problem, our key idea is to learn the material prior\nwith a generative model for regularizing the optimization process. We observe\nthat the general rendering equation can be split into diffuse and specular\nshading terms, and thus formulate the material prior as diffusion models of\nalbedo and specular. Thanks to this design, our model can be trained using the\nexisting abundant 3D object data, and naturally acts as a versatile tool to\nresolve the ambiguity when recovering material representations from RGB images.\nIn addition, we develop a coarse-to-fine training strategy that leverages\nestimated materials to guide diffusion models to satisfy multi-view consistent\nconstraints, leading to more stable and accurate results. Extensive experiments\non real-world and synthetic datasets demonstrate that our approach achieves\nstate-of-the-art performance on material recovery. The code will be available\nat https://zju3dv.github.io/IntrinsicAnything.\n","authors":["Xi Chen","Sida Peng","Dongchen Yang","Yuan Liu","Bowen Pan","Chengfei Lv","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.11593v2.pdf","comment":"Project page: https://zju3dv.github.io/IntrinsicAnything"},{"id":"http://arxiv.org/abs/2404.14507v1","updated":"2024-04-22T18:18:41Z","published":"2024-04-22T18:18:41Z","title":"Align Your Steps: Optimizing Sampling Schedules in Diffusion Models","summary":" Diffusion models (DMs) have established themselves as the state-of-the-art\ngenerative modeling approach in the visual domain and beyond. A crucial\ndrawback of DMs is their slow sampling speed, relying on many sequential\nfunction evaluations through large neural networks. Sampling from DMs can be\nseen as solving a differential equation through a discretized set of noise\nlevels known as the sampling schedule. While past works primarily focused on\nderiving efficient solvers, little attention has been given to finding optimal\nsampling schedules, and the entire literature relies on hand-crafted\nheuristics. In this work, for the first time, we propose a general and\nprincipled approach to optimizing the sampling schedules of DMs for\nhigh-quality outputs, called $\\textit{Align Your Steps}$. 
We leverage methods\nfrom stochastic calculus and find optimal schedules specific to different\nsolvers, trained DMs and datasets. We evaluate our novel approach on several\nimage, video as well as 2D toy data synthesis benchmarks, using a variety of\ndifferent samplers, and observe that our optimized schedules outperform\nprevious hand-crafted schedules in almost all experiments. Our method\ndemonstrates the untapped potential of sampling schedule optimization,\nespecially in the few-step synthesis regime.\n","authors":["Amirmojtaba Sabour","Sanja Fidler","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2404.14507v1.pdf","comment":"Project page:\n https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/"},{"id":"http://arxiv.org/abs/2404.14471v1","updated":"2024-04-22T17:55:07Z","published":"2024-04-22T17:55:07Z","title":"Narrative Action Evaluation with Prompt-Guided Multimodal Interaction","summary":" In this paper, we investigate a new problem called narrative action\nevaluation (NAE). NAE aims to generate professional commentary that evaluates\nthe execution of an action. Unlike traditional tasks such as score-based action\nquality assessment and video captioning involving superficial sentences, NAE\nfocuses on creating detailed narratives in natural language. These narratives\nprovide intricate descriptions of actions along with objective evaluations. NAE\nis a more challenging task because it requires both narrative flexibility and\nevaluation rigor. One existing possible solution is to use multi-task learning,\nwhere narrative language and evaluative information are predicted separately.\nHowever, this approach results in reduced performance for individual tasks\nbecause of variations between tasks and differences in modality between\nlanguage information and evaluation information. To address this, we propose a\nprompt-guided multimodal interaction framework. This framework utilizes a pair\nof transformers to facilitate the interaction between different modalities of\ninformation. It also uses prompts to transform the score regression task into a\nvideo-text matching task, thus enabling task interactivity. To support further\nresearch in this field, we re-annotate the MTL-AQA and FineGym datasets with\nhigh-quality and comprehensive action narration. Additionally, we establish\nbenchmarks for NAE. Extensive experiment results prove that our method\noutperforms separate learning methods and naive multi-task learning methods.\nData and code are released at\n\\href{https://github.com/shiyi-zh0408/NAE_CVPR2024 }{here}.\n","authors":["Shiyi Zhang","Sule Bai","Guangyi Chen","Lei Chen","Jiwen Lu","Junle Wang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14471v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13911v1","updated":"2024-04-22T06:43:18Z","published":"2024-04-22T06:43:18Z","title":"Global OpenBuildingMap -- Unveiling the Mystery of Global Buildings","summary":" Understanding how buildings are distributed globally is crucial to revealing\nthe human footprint on our home planet. This built environment affects local\nclimate, land surface albedo, resource distribution, and many other key factors\nthat influence well-being and human health. Despite this, quantitative and\ncomprehensive data on the distribution and properties of buildings worldwide is\nlacking. 
To this end, by using a big data analytics approach and nearly 800,000\nsatellite images, we generated the highest resolution and highest accuracy\nbuilding map ever created: the Global OpenBuildingMap (Global OBM). A joint\nanalysis of building maps and solar potentials indicates that rooftop solar\nenergy can supply the global energy consumption need at a reasonable cost.\nSpecifically, if solar panels were placed on the roofs of all buildings, they\ncould supply 1.1-3.3 times -- depending on the efficiency of the solar device\n-- the global energy consumption in 2020, which is the year with the highest\nconsumption on record. We also identified a clear geospatial correlation\nbetween building areas and key socioeconomic variables, which indicates our\nglobal building map can serve as an important input to modeling global\nsocioeconomic needs and drivers.\n","authors":["Xiao Xiang Zhu","Qingyu Li","Yilei Shi","Yuanyuan Wang","Adam Stewart","Jonathan Prexl"],"pdf_url":"https://arxiv.org/pdf/2404.13911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13904v1","updated":"2024-04-22T06:28:41Z","published":"2024-04-22T06:28:41Z","title":"Deep Regression Representation Learning with Topology","summary":" Most works studying representation learning focus only on classification and\nneglect regression. Yet, the learning objectives and therefore the\nrepresentation topologies of the two tasks are fundamentally different:\nclassification targets class separation, leading to disconnected\nrepresentations, whereas regression requires ordinality with respect to the\ntarget, leading to continuous representations. We thus wonder how the\neffectiveness of a regression representation is influenced by its topology,\nwith evaluation based on the Information Bottleneck (IB) principle.\n The IB principle is an important framework that provides principles for\nlearning effectiveness representations. We establish two connections between it\nand the topology of regression representations. The first connection reveals\nthat a lower intrinsic dimension of the feature space implies a reduced\ncomplexity of the representation Z. This complexity can be quantified as the\nconditional entropy of Z on the target space Y and serves as an upper bound on\nthe generalization error. The second connection suggests learning a feature\nspace that is topologically similar to the target space will better align with\nthe IB principle. Based on these two connections, we introduce PH-Reg, a\nregularizer specific to regression that matches the intrinsic dimension and\ntopology of the feature space with the target space. Experiments on synthetic\nand real-world regression tasks demonstrate the benefits of PH-Reg.\n","authors":["Shihao Zhang","kenji kawaguchi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.13904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08513v4","updated":"2024-04-22T05:52:44Z","published":"2023-09-15T16:19:09Z","title":"SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient\n Channels","summary":" Pre-trained vision transformers have strong representation benefits to\nvarious downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)\nmethods have been proposed, and their experiments demonstrate that tuning only\n1% of extra parameters could surpass full fine-tuning in low-data resource\nscenarios. However, these methods overlook the task-specific information when\nfine-tuning diverse downstream tasks. 
In this paper, we propose a simple yet\neffective method called \"Salient Channel Tuning\" (SCT) to leverage the\ntask-specific information by forwarding the model with the task images to\nselect partial channels in a feature map that enables us to tune only 1/8\nchannels leading to significantly lower parameter costs. Experiments outperform\nfull fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark by adding only\n0.11M parameters of the ViT-B, which is 780x fewer than its full fine-tuning\ncounterpart. Furthermore, experiments on domain generalization and few-shot\nlearning surpass other PEFT methods with lower parameter costs, demonstrating\nour proposed tuning technique's strong capability and effectiveness in the\nlow-data regime.\n","authors":["Henry Hengyuan Zhao","Pichao Wang","Yuyang Zhao","Hao Luo","Fan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.08513v4.pdf","comment":"This work has been accepted by IJCV2023"},{"id":"http://arxiv.org/abs/2106.14490v5","updated":"2024-04-22T05:24:45Z","published":"2021-06-28T09:09:14Z","title":"Making Images Real Again: A Comprehensive Survey on Deep Image\n Composition","summary":" As a common image editing operation, image composition aims to combine the\nforeground from one image and another background image, resulting in a\ncomposite image. However, there are many issues that could make the composite\nimages unrealistic. These issues can be summarized as the inconsistency between\nforeground and background, which includes appearance inconsistency (e.g.,\nincompatible illumination), geometry inconsistency (e.g., unreasonable size),\nand semantic inconsistency (e.g., mismatched semantic context). Image\ncomposition task could be decomposed into multiple sub-tasks, in which each\nsub-task targets at one or more issues. Specifically, object placement aims to\nfind reasonable scale, location, and shape for the foreground. Image blending\naims to address the unnatural boundary between foreground and background. Image\nharmonization aims to adjust the illumination statistics of foreground. Shadow\ngeneration aims to generate plausible shadow for the foreground. These\nsub-tasks can be executed sequentially or parallelly to acquire realistic\ncomposite images. To the best of our knowledge, there is no previous survey on\nimage composition. In this paper, we conduct comprehensive survey over the\nsub-tasks and combinatorial task of image composition. For each one, we\nsummarize the existing methods, available datasets, and common evaluation\nmetrics. Datasets and codes for image composition are summarized at\nhttps://github.com/bcmi/Awesome-Image-Composition. 
We have also contributed the\nfirst image composition toolbox: libcom https://github.com/bcmi/libcom, which\nassembles 10+ image composition related functions (e.g., image blending, image\nharmonization, object placement, shadow generation, generative composition).\nThe ultimate goal of this toolbox is solving all the problems related to image\ncomposition with simple `import libcom'.\n","authors":["Li Niu","Wenyan Cong","Liu Liu","Yan Hong","Bo Zhang","Jing Liang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2106.14490v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13884v1","updated":"2024-04-22T05:12:11Z","published":"2024-04-22T05:12:11Z","title":"MambaUIE&SR: Unraveling the Ocean's Secrets with Only 2.8 FLOPs","summary":" Underwater Image Enhancement (UIE) techniques aim to address the problem of\nunderwater image degradation due to light absorption and scattering. In recent\nyears, both Convolution Neural Network (CNN)-based and Transformer-based\nmethods have been widely explored. In addition, combining CNN and Transformer\ncan effectively combine global and local information for enhancement. However,\nthis approach is still affected by the secondary complexity of the Transformer\nand cannot maximize the performance. Recently, the state-space model (SSM)\nbased architecture Mamba has been proposed, which excels in modeling long\ndistances while maintaining linear complexity. This paper explores the\npotential of this SSM-based model for UIE from both efficiency and\neffectiveness perspectives. However, the performance of directly applying Mamba\nis poor because local fine-grained features, which are crucial for image\nenhancement, cannot be fully utilized. Specifically, we customize the MambaUIE\narchitecture for efficient UIE. Specifically, we introduce visual state space\n(VSS) blocks to capture global contextual information at the macro level while\nmining local information at the micro level. Also, for these two kinds of\ninformation, we propose a Dynamic Interaction Block (DIB) and Spatial\nfeed-forward Network (SGFN) for intra-block feature aggregation. MambaUIE is\nable to efficiently synthesize global and local information and maintains a\nvery small number of parameters with high accuracy. Experiments on UIEB\ndatasets show that our method reduces GFLOPs by 67.4% (2.715G) relative to the\nSOTA method. To the best of our knowledge, this is the first UIE model\nconstructed based on SSM that breaks the limitation of FLOPs on accuracy in\nUIE. The official repository of MambaUIE at\nhttps://github.com/1024AILab/MambaUIE.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.13884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15902v2","updated":"2024-04-22T05:10:57Z","published":"2024-01-29T06:06:45Z","title":"A Concise but High-performing Network for Image Guided Depth Completion\n in Autonomous Driving","summary":" Depth completion is a crucial task in autonomous driving, aiming to convert a\nsparse depth map into a dense depth prediction. Due to its potentially rich\nsemantic information, RGB image is commonly fused to enhance the completion\neffect. Image-guided depth completion involves three key challenges: 1) how to\neffectively fuse the two modalities; 2) how to better recover depth\ninformation; and 3) how to achieve real-time prediction for practical\nautonomous driving. 
To solve the above problems, we propose a concise but\neffective network, named CENet, to achieve high-performance depth completion\nwith a simple and elegant structure. Firstly, we use a fast guidance module to\nfuse the two sensor features, utilizing abundant auxiliary features extracted\nfrom the color space. Unlike other commonly used complicated guidance modules,\nour approach is intuitive and low-cost. In addition, we find and analyze the\noptimization inconsistency problem for observed and unobserved positions, and a\ndecoupled depth prediction head is proposed to alleviate the issue. The\nproposed decoupled head can better output the depth of valid and invalid\npositions with very few extra inference time. Based on the simple structure of\ndual-encoder and single-decoder, our CENet can achieve superior balance between\naccuracy and efficiency. In the KITTI depth completion benchmark, our CENet\nattains competitive performance and inference speed compared with the\nstate-of-the-art methods. To validate the generalization of our method, we also\nevaluate on indoor NYUv2 dataset, and our CENet still achieve impressive\nresults. The code of this work will be available at\nhttps://github.com/lmomoy/CHNet.\n","authors":["Moyun Liu","Bing Chen","Youping Chen","Jingming Xie","Lei Yao","Yang Zhang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.15902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v1","updated":"2024-04-22T05:07:02Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Xinyu Shen","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13874v1","updated":"2024-04-22T04:49:22Z","published":"2024-04-22T04:49:22Z","title":"VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large\n Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) suffer from hallucination issues,\nwherein the models generate plausible-sounding but factually incorrect outputs,\nundermining their reliability. A comprehensive quantitative evaluation is\nnecessary to identify and understand the extent of hallucinations in these\nmodels. However, existing benchmarks are often limited in scope, focusing\nmainly on object hallucinations. 
Furthermore, current evaluation methods\nstruggle to effectively address the subtle semantic distinctions between model\noutputs and reference data, as well as the balance between hallucination and\ninformativeness. To address these issues, we introduce a multi-dimensional\nbenchmark covering objects, attributes, and relations, with challenging images\nselected based on associative biases. Moreover, we propose an large language\nmodel (LLM)-based two-stage evaluation framework that generalizes the popular\nCHAIR metric and incorporates both faithfulness and coverage into the\nevaluation. Experiments on 10 established LVLMs demonstrate that our evaluation\nmetric is more comprehensive and better correlated with humans than existing\nwork when evaluating on our challenging human annotated benchmark dataset. Our\nwork also highlights the critical balance between faithfulness and coverage of\nmodel outputs, and encourages future works to address hallucinations in LVLMs\nwhile keeping their outputs informative.\n","authors":["Haoyi Qiu","Wenbo Hu","Zi-Yi Dou","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13874v1.pdf","comment":"Work in process"},{"id":"http://arxiv.org/abs/2404.13873v1","updated":"2024-04-22T04:47:52Z","published":"2024-04-22T04:47:52Z","title":"Texture-aware and Shape-guided Transformer for Sequential DeepFake\n Detection","summary":" Sequential DeepFake detection is an emerging task that aims to predict the\nmanipulation sequence in order. Existing methods typically formulate it as an\nimage-to-sequence problem, employing conventional Transformer architectures for\ndetection. However, these methods lack dedicated design and consequently result\nin limited performance. In this paper, we propose a novel Texture-aware and\nShape-guided Transformer to enhance detection performance. Our method features\nfour major improvements. Firstly, we describe a texture-aware branch that\neffectively captures subtle manipulation traces with the Diversiform Pixel\nDifference Attention module. Then we introduce a Bidirectional Interaction\nCross-attention module that seeks deep correlations among spatial and\nsequential features, enabling effective modeling of complex manipulation\ntraces. To further enhance the cross-attention, we describe a Shape-guided\nGaussian mapping strategy, providing initial priors of the manipulation shape.\nFinally, observing that the latter manipulation in a sequence may influence\ntraces left in the earlier one, we intriguingly invert the prediction order\nfrom forward to backward, leading to notable gains as expected. Extensive\nexperimental results demonstrate that our method outperforms others by a large\nmargin, highlighting the superiority of our method.\n","authors":["Yunfei Li","Jiaran Zhou","Xin Wang","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13872v1","updated":"2024-04-22T04:41:42Z","published":"2024-04-22T04:41:42Z","title":"FreqBlender: Enhancing DeepFake Detection by Blending Frequency\n Knowledge","summary":" Generating synthetic fake faces, known as pseudo-fake faces, is an effective\nway to improve the generalization of DeepFake detection. Existing methods\ntypically generate these faces by blending real or fake faces in color space.\nWhile these methods have shown promise, they overlook the simulation of\nfrequency distribution in pseudo-fake faces, limiting the learning of generic\nforgery traces in-depth. 
To address this, this paper introduces {\\em\nFreqBlender}, a new method that can generate pseudo-fake faces by blending\nfrequency knowledge. Specifically, we investigate the major frequency\ncomponents and propose a Frequency Parsing Network to adaptively partition\nfrequency components related to forgery traces. Then we blend this frequency\nknowledge from fake faces into real faces to generate pseudo-fake faces. Since\nthere is no ground truth for frequency components, we describe a dedicated\ntraining strategy by leveraging the inner correlations among different\nfrequency knowledge to instruct the learning process. Experimental results\ndemonstrate the effectiveness of our method in enhancing DeepFake detection,\nmaking it a potential plug-and-play strategy for other methods.\n","authors":["Hanzhe Li","Jiaran Zhou","Bin Li","Junyu Dong","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13868v1","updated":"2024-04-22T04:33:40Z","published":"2024-04-22T04:33:40Z","title":"TeamTrack: A Dataset for Multi-Sport Multi-Object Tracking in Full-pitch\n Videos","summary":" Multi-object tracking (MOT) is a critical and challenging task in computer\nvision, particularly in situations involving objects with similar appearances\nbut diverse movements, as seen in team sports. Current methods, largely reliant\non object detection and appearance, often fail to track targets in such complex\nscenarios accurately. This limitation is further exacerbated by the lack of\ncomprehensive and diverse datasets covering the full view of sports pitches.\nAddressing these issues, we introduce TeamTrack, a pioneering benchmark dataset\nspecifically designed for MOT in sports. TeamTrack is an extensive collection\nof full-pitch video data from various sports, including soccer, basketball, and\nhandball. Furthermore, we perform a comprehensive analysis and benchmarking\neffort to underscore TeamTrack's utility and potential impact. Our work\nsignifies a crucial step forward, promising to elevate the precision and\neffectiveness of MOT in complex, dynamic settings such as team sports. The\ndataset, project code and competition is released at:\nhttps://atomscott.github.io/TeamTrack/.\n","authors":["Atom Scott","Ikuma Uchida","Ning Ding","Rikuhei Umemoto","Rory Bunker","Ren Kobayashi","Takeshi Koyama","Masaki Onishi","Yoshinari Kameda","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2404.13868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13866v1","updated":"2024-04-22T04:31:09Z","published":"2024-04-22T04:31:09Z","title":"Plug-and-Play Algorithm Convergence Analysis From The Standpoint of\n Stochastic Differential Equation","summary":" The Plug-and-Play (PnP) algorithm is popular for inverse image\nproblem-solving. However, this algorithm lacks theoretical analysis of its\nconvergence with more advanced plug-in denoisers. We demonstrate that discrete\nPnP iteration can be described by a continuous stochastic differential equation\n(SDE). We can also achieve this transformation through Markov process\nformulation of PnP. Then, we can take a higher standpoint of PnP algorithms\nfrom stochastic differential equations, and give a unified framework for the\nconvergence property of PnP according to the solvability condition of its\ncorresponding SDE. 
We reveal that a much weaker condition, bounded denoiser\nwith Lipschitz continuous measurement function would be enough for its\nconvergence guarantee, instead of previous Lipschitz continuous denoiser\ncondition.\n","authors":["Zhongqi Wang","Bingnan Wang","Maosheng Xiang"],"pdf_url":"https://arxiv.org/pdf/2404.13866v1.pdf","comment":"17pages, Preprint, Under review"},{"id":"http://arxiv.org/abs/2404.13863v1","updated":"2024-04-22T04:25:02Z","published":"2024-04-22T04:25:02Z","title":"PM-VIS: High-Performance Box-Supervised Video Instance Segmentation","summary":" Labeling pixel-wise object masks in videos is a resource-intensive and\nlaborious process. Box-supervised Video Instance Segmentation (VIS) methods\nhave emerged as a viable solution to mitigate the labor-intensive annotation\nprocess. . In practical applications, the two-step approach is not only more\nflexible but also exhibits a higher recognition accuracy. Inspired by the\nrecent success of Segment Anything Model (SAM), we introduce a novel approach\nthat aims at harnessing instance box annotations from multiple perspectives to\ngenerate high-quality instance pseudo masks, thus enriching the information\ncontained in instance annotations. We leverage ground-truth boxes to create\nthree types of pseudo masks using the HQ-SAM model, the box-supervised VIS\nmodel (IDOL-BoxInst), and the VOS model (DeAOT) separately, along with three\ncorresponding optimization mechanisms. Additionally, we introduce two\nground-truth data filtering methods, assisted by high-quality pseudo masks, to\nfurther enhance the training dataset quality and improve the performance of\nfully supervised VIS methods. To fully capitalize on the obtained high-quality\nPseudo Masks, we introduce a novel algorithm, PM-VIS, to integrate mask losses\ninto IDOL-BoxInst. Our PM-VIS model, trained with high-quality pseudo mask\nannotations, demonstrates strong ability in instance mask prediction, achieving\nstate-of-the-art performance on the YouTube-VIS 2019, YouTube-VIS 2021, and\nOVIS validation sets, notably narrowing the gap between box-supervised and\nfully supervised VIS methods.\n","authors":["Zhangjing Yang","Dun Liu","Wensheng Cheng","Jinqiao Wang","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.13863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13862v1","updated":"2024-04-22T04:22:30Z","published":"2024-04-22T04:22:30Z","title":"PGAHum: Prior-Guided Geometry and Appearance Learning for High-Fidelity\n Animatable Human Reconstruction","summary":" Recent techniques on implicit geometry representation learning and neural\nrendering have shown promising results for 3D clothed human reconstruction from\nsparse video inputs. However, it is still challenging to reconstruct detailed\nsurface geometry and even more difficult to synthesize photorealistic novel\nviews with animated human poses. In this work, we introduce PGAHum, a\nprior-guided geometry and appearance learning framework for high-fidelity\nanimatable human reconstruction. We thoroughly exploit 3D human priors in three\nkey modules of PGAHum to achieve high-quality geometry reconstruction with\nintricate details and photorealistic view synthesis on unseen poses. First, a\nprior-based implicit geometry representation of 3D human, which contains a\ndelta SDF predicted by a tri-plane network and a base SDF derived from the\nprior SMPL model, is proposed to model the surface details and the body shape\nin a disentangled manner. 
Second, we introduce a novel prior-guided sampling\nstrategy that fully leverages the prior information of the human pose and body\nto sample the query points within or near the body surface. By avoiding\nunnecessary learning in the empty 3D space, the neural rendering can recover\nmore appearance details. Last, we propose a novel iterative backward\ndeformation strategy to progressively find the correspondence for the query\npoint in observation space. A skinning weights prediction model is learned\nbased on the prior provided by the SMPL model to achieve the iterative backward\nLBS deformation. Extensive quantitative and qualitative comparisons on various\ndatasets are conducted and the results demonstrate the superiority of our\nframework. Ablation studies also verify the effectiveness of each scheme for\ngeometry and appearance learning.\n","authors":["Hao Wang","Qingshan Xu","Hongyuan Chen","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.13862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13859v1","updated":"2024-04-22T04:16:40Z","published":"2024-04-22T04:16:40Z","title":"Unveiling and Mitigating Generalized Biases of DNNs through the\n Intrinsic Dimensions of Perceptual Manifolds","summary":" Building fair deep neural networks (DNNs) is a crucial step towards achieving\ntrustworthy artificial intelligence. Delving into deeper factors that affect\nthe fairness of DNNs is paramount and serves as the foundation for mitigating\nmodel biases. However, current methods are limited in accurately predicting DNN\nbiases, relying solely on the number of training samples and lacking more\nprecise measurement tools. Here, we establish a geometric perspective for\nanalyzing the fairness of DNNs, comprehensively exploring how DNNs internally\nshape the intrinsic geometric characteristics of datasets-the intrinsic\ndimensions (IDs) of perceptual manifolds, and the impact of IDs on the fairness\nof DNNs. Based on multiple findings, we propose Intrinsic Dimension\nRegularization (IDR), which enhances the fairness and performance of models by\npromoting the learning of concise and ID-balanced class perceptual manifolds.\nIn various image recognition benchmark tests, IDR significantly mitigates model\nbias while improving its performance.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13859v1.pdf","comment":"8pages, 6figures, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2403.01693v2","updated":"2024-04-22T03:53:51Z","published":"2024-03-04T03:00:22Z","title":"HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances","summary":" Text-to-image generative models can generate high-quality humans, but realism\nis lost when generating hands. Common artifacts include irregular hand poses,\nshapes, incorrect numbers of fingers, and physically implausible finger\norientations. To generate images with realistic hands, we propose a novel\ndiffusion-based architecture called HanDiffuser that achieves realism by\ninjecting hand embeddings in the generative process. HanDiffuser consists of\ntwo components: a Text-to-Hand-Params diffusion model to generate SMPL-Body and\nMANO-Hand parameters from input text prompts, and a Text-Guided\nHand-Params-to-Image diffusion model to synthesize images by conditioning on\nthe prompts and hand parameters generated by the previous component. 
We\nincorporate multiple aspects of hand representation, including 3D shapes and\njoint-level finger positions, orientations and articulations, for robust\nlearning and reliable performance during inference. We conduct extensive\nquantitative and qualitative experiments and perform user studies to\ndemonstrate the efficacy of our method in generating images with high-quality\nhands.\n","authors":["Supreeth Narasimhaswamy","Uttaran Bhattacharya","Xiang Chen","Ishita Dasgupta","Saayan Mitra","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2403.01693v2.pdf","comment":"Revisions: 1. Added a link to project page in the abstract, 2.\n Updated references and related work, 3. Fixed some grammatical errors"},{"id":"http://arxiv.org/abs/2404.13854v1","updated":"2024-04-22T03:39:03Z","published":"2024-04-22T03:39:03Z","title":"Self-Supervised Monocular Depth Estimation in the Dark: Towards Data\n Distribution Compensation","summary":" Nighttime self-supervised monocular depth estimation has received increasing\nattention in recent years. However, using night images for self-supervision is\nunreliable because the photometric consistency assumption is usually violated\nin the videos taken under complex lighting conditions. Even with domain\nadaptation or photometric loss repair, performance is still limited by the poor\nsupervision of night images on trainable networks. In this paper, we propose a\nself-supervised nighttime monocular depth estimation method that does not use\nany night images during training. Our framework utilizes day images as a stable\nsource for self-supervision and applies physical priors (e.g., wave optics,\nreflection model and read-shot noise model) to compensate for some key\nday-night differences. With day-to-night data distribution compensation, our\nframework can be trained in an efficient one-stage self-supervised manner.\nThough no nighttime images are considered during training, qualitative and\nquantitative results demonstrate that our method achieves SoTA depth estimating\nresults on the challenging nuScenes-Night and RobotCar-Night compared with\nexisting methods.\n","authors":["Haolin Yang","Chaoqiang Zhao","Lu Sheng","Yang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.13854v1.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2311.15145v3","updated":"2024-04-22T03:32:18Z","published":"2023-11-26T00:06:12Z","title":"Choosing Wisely and Learning Deeply: Selective Cross-Modality\n Distillation via CLIP for Domain Generalization","summary":" Domain Generalization (DG), a crucial research area, seeks to train models\nacross multiple domains and test them on unseen ones. In this paper, we\nintroduce a novel approach, namely, Selective Cross-Modality Distillation for\nDomain Generalization (SCMD). SCMD leverages the capabilities of large\nvision-language models, specifically CLIP, to train a more efficient model,\nensuring it acquires robust generalization capabilities across unseen domains.\nOur primary contribution is a unique selection framework strategically designed\nto identify hard-to-learn samples for distillation. In parallel, we introduce a\nnovel cross-modality module that seamlessly combines the projected features of\nthe student model with the text embeddings from CLIP, ensuring the alignment of\nsimilarity distributions. We assess SCMD's performance on various benchmarks,\nwhere it empowers a ResNet50 to deliver state-of-the-art performance,\nsurpassing existing domain generalization methods. 
Furthermore, we provide a\ntheoretical analysis of our selection strategy, offering deeper insight into\nits effectiveness and potential in the field of DG.\n","authors":["Jixuan Leng","Yijiang Li","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13848v1","updated":"2024-04-22T03:15:42Z","published":"2024-04-22T03:15:42Z","title":"DSDRNet: Disentangling Representation and Reconstruct Network for Domain\n Generalization","summary":" Domain generalization faces challenges due to the distribution shift between\ntraining and testing sets, and the presence of unseen target domains. Common\nsolutions include domain alignment, meta-learning, data augmentation, or\nensemble learning, all of which rely on domain labels or domain adversarial\ntechniques. In this paper, we propose a Dual-Stream Separation and\nReconstruction Network, dubbed DSDRNet. It is a disentanglement-reconstruction\napproach that integrates features of both inter-instance and intra-instance\nthrough dual-stream fusion. The method introduces novel supervised signals by\ncombining inter-instance semantic distance and intra-instance similarity.\nIncorporating Adaptive Instance Normalization (AdaIN) into a two-stage cyclic\nreconstruction process enhances self-disentangled reconstruction signals to\nfacilitate model convergence. Extensive experiments on four benchmark datasets\ndemonstrate that DSDRNet outperforms other popular methods in terms of domain\ngeneralization capabilities.\n","authors":["Juncheng Yang","Zuchao Li","Shuai Xie","Wei Yu","Shijun Li"],"pdf_url":"https://arxiv.org/pdf/2404.13848v1.pdf","comment":"This paper is accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.13847v1","updated":"2024-04-22T03:05:32Z","published":"2024-04-22T03:05:32Z","title":"EventLens: Leveraging Event-Aware Pretraining and Cross-modal Linking\n Enhances Visual Commonsense Reasoning","summary":" Visual Commonsense Reasoning (VCR) is a cognitive task, challenging models to\nanswer visual questions requiring human commonsense, and to provide rationales\nexplaining why the answers are correct. With emergence of Large Language Models\n(LLMs), it is natural and imperative to explore their applicability to VCR.\nHowever, VCR task demands more external knowledge to tackle its challenging\nquestions, necessitating special designs to activate LLMs' commonsense\nreasoning abilities. Also, most existing Multimodal LLMs adopted an abstraction\nof entire input image, which makes it difficult to comprehend VCR's unique\nco-reference tags between image regions and text, posing challenges for\nfine-grained alignment. To address these issues, we propose EventLens that\nleverages Event-Aware Pretraining and Cross-modal Linking and EnhanceS VCR.\nFirst, by emulating the cognitive process of human reasoning, an Event-Aware\nPretraining auxiliary task is introduced to better activate LLM's global\ncomprehension of intricate scenarios. Second, during fine-tuning, we further\nutilize reference tags to bridge RoI features with texts, while preserving both\nmodality semantics. Finally, we use instruct-style prompts to narrow the gap\nbetween pretraining and fine-tuning, and task-specific adapters to better\nintegrate LLM's inherent knowledge with new commonsense. 
Experimental results\nshow the effectiveness of our proposed auxiliary task and fine-grained linking\nstrategy.\n","authors":["Mingjie Ma","Zhihuan Yu","Yichao Ma","Guohui Li"],"pdf_url":"https://arxiv.org/pdf/2404.13847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11202v2","updated":"2024-04-22T02:46:44Z","published":"2024-04-17T09:33:31Z","title":"GhostNetV3: Exploring the Training Strategies for Compact Models","summary":" Compact neural networks are specially designed for applications on edge\ndevices with faster inference speed yet modest performance. However, training\nstrategies of compact models are borrowed from that of conventional models at\npresent, which ignores their difference in model capacity and thus may impede\nthe performance of compact models. In this paper, by systematically\ninvestigating the impact of different training ingredients, we introduce a\nstrong training strategy for compact models. We find that the appropriate\ndesigns of re-parameterization and knowledge distillation are crucial for\ntraining high-performance compact models, while some commonly used data\naugmentations for training conventional models, such as Mixup and CutMix, lead\nto worse performance. Our experiments on ImageNet-1K dataset demonstrate that\nour specialized training strategy for compact models is applicable to various\narchitectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.\nSpecifically, equipped with our strategy, GhostNetV3 1.3$\\times$ achieves a\ntop-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile\ndevices, surpassing its ordinarily trained counterpart by a large margin.\nMoreover, our observation can also be extended to object detection scenarios.\nPyTorch code and checkpoints can be found at\nhttps://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.\n","authors":["Zhenhua Liu","Zhiwei Hao","Kai Han","Yehui Tang","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13842v1","updated":"2024-04-22T02:42:32Z","published":"2024-04-22T02:42:32Z","title":"On Support Relations Inference and Scene Hierarchy Graph Construction\n from Point Cloud in Clustered Environments","summary":" Over the years, scene understanding has attracted a growing interest in\ncomputer vision, providing the semantic and physical scene information\nnecessary for robots to complete some particular tasks autonomously. In 3D\nscenes, rich spatial geometric and topological information are often ignored by\nRGB-based approaches for scene understanding. In this study, we develop a\nbottom-up approach for scene understanding that infers support relations\nbetween objects from a point cloud. Our approach utilizes the spatial topology\ninformation of the plane pairs in the scene, consisting of three major steps.\n1) Detection of pairwise spatial configuration: dividing primitive pairs into\nlocal support connection and local inner connection; 2) primitive\nclassification: a combinatorial optimization method applied to classify\nprimitives; and 3) support relations inference and hierarchy graph\nconstruction: bottom-up support relations inference and scene hierarchy graph\nconstruction containing primitive level and object level. Through experiments,\nwe demonstrate that the algorithm achieves excellent performance in primitive\nclassification and support relations inference. 
Additionally, we show that the\nscene hierarchy graph contains rich geometric and topological information of\nobjects, and it possesses great scalability for scene understanding.\n","authors":["Gang Ma","Hui Wei"],"pdf_url":"https://arxiv.org/pdf/2404.13842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13838v1","updated":"2024-04-22T02:34:50Z","published":"2024-04-22T02:34:50Z","title":"C2F-SemiCD: A Coarse-to-Fine Semi-Supervised Change Detection Method\n Based on Consistency Regularization in High-Resolution Remote Sensing Images","summary":" A high-precision feature extraction model is crucial for change detection\n(CD). In the past, many deep learning-based supervised CD methods learned to\nrecognize change feature patterns from a large number of labelled bi-temporal\nimages, whereas labelling bi-temporal remote sensing images is very expensive\nand often time-consuming; therefore, we propose a coarse-to-fine\nsemi-supervised CD method based on consistency regularization (C2F-SemiCD),\nwhich includes a coarse-to-fine CD network with a multiscale attention\nmechanism (C2FNet) and a semi-supervised update method. Among them, the C2FNet\nnetwork gradually completes the extraction of change features from\ncoarse-grained to fine-grained through multiscale feature fusion, channel\nattention mechanism, spatial attention mechanism, global context module,\nfeature refine module, initial aggregation module, and final aggregation\nmodule. The semi-supervised update method uses the mean teacher method. The\nparameters of the student model are updated to the parameters of the teacher\nModel by using the exponential moving average (EMA) method. Through extensive\nexperiments on three datasets and meticulous ablation studies, including\ncrossover experiments across datasets, we verify the significant effectiveness\nand efficiency of the proposed C2F-SemiCD method. The code will be open at:\nhttps://github.com/ChengxiHAN/C2F-SemiCDand-C2FNet.\n","authors":["Chengxi Han","Chen Wu","Meiqi Hu","Jiepan Li","Hongruixuan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06973v2","updated":"2024-04-22T02:13:32Z","published":"2024-03-11T17:55:53Z","title":"Bayesian Diffusion Models for 3D Shape Reconstruction","summary":" We present Bayesian Diffusion Models (BDM), a prediction algorithm that\nperforms effective Bayesian inference by tightly coupling the top-down (prior)\ninformation with the bottom-up (data-driven) procedure via joint diffusion\nprocesses. We show the effectiveness of BDM on the 3D shape reconstruction\ntask. Compared to prototypical deep learning data-driven approaches trained on\npaired (supervised) data-labels (e.g. image-point clouds) datasets, our BDM\nbrings in rich prior information from standalone labels (e.g. point clouds) to\nimprove the bottom-up 3D reconstruction. As opposed to the standard Bayesian\nframeworks where explicit prior and likelihood are required for the inference,\nBDM performs seamless information fusion via coupled diffusion processes with\nlearned gradient computation networks. The specialty of our BDM lies in its\ncapability to engage the active and effective information exchange and fusion\nof the top-down and bottom-up processes where each itself is a diffusion\nprocess. 
We demonstrate state-of-the-art results on both synthetic and\nreal-world benchmarks for 3D shape reconstruction.\n","authors":["Haiyang Xu","Yu Lei","Zeyuan Chen","Xiang Zhang","Yue Zhao","Yilin Wang","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2403.06973v2.pdf","comment":"Accepted to CVPR 2024; Project Page: https://mlpc-ucsd.github.io/BDM/"},{"id":"http://arxiv.org/abs/2404.13830v1","updated":"2024-04-22T02:05:15Z","published":"2024-04-22T02:05:15Z","title":"A Comprehensive Survey and Taxonomy on Point Cloud Registration Based on\n Deep Learning","summary":" Point cloud registration (PCR) involves determining a rigid transformation\nthat aligns one point cloud to another. Despite the plethora of outstanding\ndeep learning (DL)-based registration methods proposed, comprehensive and\nsystematic studies on DL-based PCR techniques are still lacking. In this paper,\nwe present a comprehensive survey and taxonomy of recently proposed PCR\nmethods. Firstly, we conduct a taxonomy of commonly utilized datasets and\nevaluation metrics. Secondly, we classify the existing research into two main\ncategories: supervised and unsupervised registration, providing insights into\nthe core concepts of various influential PCR models. Finally, we highlight open\nchallenges and potential directions for future research. A curated collection\nof valuable resources is made available at https://github.com/yxzhang15/PCR.\n","authors":["Yu-Xin Zhang","Jie Gui","Xiaofeng Cong","Xin Gong","Wenbing Tao"],"pdf_url":"https://arxiv.org/pdf/2404.13830v1.pdf","comment":"This paper is accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2310.19540v2","updated":"2024-04-22T02:03:02Z","published":"2023-10-30T13:47:46Z","title":"IterInv: Iterative Inversion for Pixel-Level T2I Models","summary":" Large-scale text-to-image diffusion models have been a ground-breaking\ndevelopment in generating convincing images following an input text prompt. The\ngoal of image editing research is to give users control over the generated\nimages by modifying the text prompt. Current image editing techniques\npredominantly hinge on DDIM inversion as a prevalent practice rooted in Latent\nDiffusion Models (LDM). However, the large pretrained T2I models working on the\nlatent space suffer from losing details due to the first compression stage with\nan autoencoder mechanism. Instead, other mainstream T2I pipeline working on the\npixel level, such as Imagen and DeepFloyd-IF, circumvents the above problem.\nThey are commonly composed of multiple stages, typically starting with a\ntext-to-image stage and followed by several super-resolution stages. In this\npipeline, the DDIM inversion fails to find the initial noise and generate the\noriginal image given that the super-resolution diffusion models are not\ncompatible with the DDIM technique. According to our experimental findings,\niteratively concatenating the noisy image as the condition is the root of this\nproblem. Based on this observation, we develop an iterative inversion (IterInv)\ntechnique for this category of T2I models and verify IterInv with the\nopen-source DeepFloyd-IF model.Specifically, IterInv employ NTI as the\ninversion and reconstruction of low-resolution image generation. In stages 2\nand 3, we update the latent variance at each timestep to find the deterministic\ninversion trace and promote the reconstruction process. By combining our method\nwith a popular image editing method, we prove the application prospects of\nIterInv. The code will be released upon acceptance. 
The code is available at\n\\url{https://github.com/Tchuanm/IterInv.git}.\n","authors":["Chuanming Tang","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2310.19540v2.pdf","comment":"Accepted paper at ICME 2024"},{"id":"http://arxiv.org/abs/2404.13827v1","updated":"2024-04-22T01:59:48Z","published":"2024-04-22T01:59:48Z","title":"Swap It Like Its Hot: Segmentation-based spoof attacks on eye-tracking\n images","summary":" Video-based eye trackers capture the iris biometric and enable authentication\nto secure user identity. However, biometric authentication is susceptible to\nspoofing another user's identity through physical or digital manipulation. The\ncurrent standard to identify physical spoofing attacks on eye-tracking sensors\nuses liveness detection. Liveness detection classifies gaze data as real or\nfake, which is sufficient to detect physical presentation attacks. However,\nsuch defenses cannot detect a spoofing attack when real eye image inputs are\ndigitally manipulated to swap the iris pattern of another person. We propose\nIrisSwap as a novel attack on gaze-based liveness detection. IrisSwap allows\nattackers to segment and digitally swap in a victim's iris pattern to fool iris\nauthentication. Both offline and online attacks produce gaze data that deceives\nthe current state-of-the-art defense models at rates up to 58% and motivates\nthe need to develop more advanced authentication methods for eye trackers.\n","authors":["Anish S. Narkar","Brendan David-John"],"pdf_url":"https://arxiv.org/pdf/2404.13827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12621v4","updated":"2024-04-22T01:58:18Z","published":"2023-05-22T01:14:30Z","title":"DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images","summary":" In recent years, deep learning (DL) has shown great potential in the field of\ndermatological image analysis. However, existing datasets in this domain have\nsignificant limitations, including a small number of image samples, limited\ndisease conditions, insufficient annotations, and non-standardized image\nacquisitions. To address these shortcomings, we propose a novel framework\ncalled DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured\nmeshes of human subjects using a differentiable renderer and generates 2D\nimages from various camera viewpoints under chosen lighting conditions in\ndiverse background scenes. Our method adheres to top-down rules that constrain\nthe blending and rendering process to create 2D images with skin conditions\nthat mimic in-the-wild acquisitions, ensuring more meaningful results. The\nframework generates photo-realistic 2D dermoscopy images and the corresponding\ndense annotations for semantic segmentation of the skin, skin conditions, body\nparts, bounding boxes around lesions, depth maps, and other 3D scene\nparameters, such as camera position and lighting conditions. DermSynth3D allows\nfor the creation of custom datasets for various dermatology tasks. We\ndemonstrate the effectiveness of data generated using DermSynth3D by training\nDL models on synthetic data and evaluating them on various dermatology tasks\nusing real 2D dermatological images. 
We make our code publicly available at\nhttps://github.com/sfu-mial/DermSynth3D.\n","authors":["Ashish Sinha","Jeremy Kawahara","Arezou Pakzad","Kumar Abhishek","Matthieu Ruthven","Enjie Ghorbel","Anis Kacem","Djamila Aouada","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2305.12621v4.pdf","comment":"Accepted to Medical Image Analysis (MedIA) 2024"},{"id":"http://arxiv.org/abs/2306.16927v2","updated":"2024-04-22T01:46:43Z","published":"2023-06-29T14:17:24Z","title":"End-to-end Autonomous Driving: Challenges and Frontiers","summary":" The autonomous driving community has witnessed a rapid growth in approaches\nthat embrace an end-to-end algorithm framework, utilizing raw sensor input to\ngenerate vehicle motion plans, instead of concentrating on individual tasks\nsuch as detection and motion prediction. End-to-end systems, in comparison to\nmodular pipelines, benefit from joint feature optimization for perception and\nplanning. This field has flourished due to the availability of large-scale\ndatasets, closed-loop evaluation, and the increasing need for autonomous\ndriving algorithms to perform effectively in challenging scenarios. In this\nsurvey, we provide a comprehensive analysis of more than 270 papers, covering\nthe motivation, roadmap, methodology, challenges, and future trends in\nend-to-end autonomous driving. We delve into several critical challenges,\nincluding multi-modality, interpretability, causal confusion, robustness, and\nworld models, amongst others. Additionally, we discuss current advancements in\nfoundation models and visual pre-training, as well as how to incorporate these\ntechniques within the end-to-end driving framework. we maintain an active\nrepository that contains up-to-date literature and open-source projects at\nhttps://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.\n","authors":["Li Chen","Penghao Wu","Kashyap Chitta","Bernhard Jaeger","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.16927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13819v1","updated":"2024-04-22T01:42:45Z","published":"2024-04-22T01:42:45Z","title":"HOIST-Former: Hand-held Objects Identification, Segmentation, and\n Tracking in the Wild","summary":" We address the challenging task of identifying, segmenting, and tracking\nhand-held objects, which is crucial for applications such as human action\nsegmentation and performance evaluation. This task is particularly challenging\ndue to heavy occlusion, rapid motion, and the transitory nature of objects\nbeing hand-held, where an object may be held, released, and subsequently picked\nup again. To tackle these challenges, we have developed a novel\ntransformer-based architecture called HOIST-Former. HOIST-Former is adept at\nspatially and temporally segmenting hands and objects by iteratively pooling\nfeatures from each other, ensuring that the processes of identification,\nsegmentation, and tracking of hand-held objects depend on the hands' positions\nand their contextual appearance. We further refine HOIST-Former with a contact\nloss that focuses on areas where hands are in contact with objects. Moreover,\nwe also contribute an in-the-wild video dataset called HOIST, which comprises\n4,125 videos complete with bounding boxes, segmentation masks, and tracking IDs\nfor hand-held objects. 
Through experiments on the HOIST dataset and two\nadditional public datasets, we demonstrate the efficacy of HOIST-Former in\nsegmenting and tracking hand-held objects.\n","authors":["Supreeth Narasimhaswamy","Huy Anh Nguyen","Lihan Huang","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.13819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05321v4","updated":"2024-04-22T01:39:07Z","published":"2022-09-12T15:26:13Z","title":"Deep Feature Statistics Mapping for Generalized Screen Content Image\n Quality Assessment","summary":" The statistical regularities of natural images, referred to as natural scene\nstatistics, play an important role in no-reference image quality assessment.\nHowever, it has been widely acknowledged that screen content images (SCIs),\nwhich are typically computer generated, do not hold such statistics. Here we\nmake the first attempt to learn the statistics of SCIs, based upon which the\nquality of SCIs can be effectively determined. The underlying mechanism of the\nproposed approach is based upon the mild assumption that the SCIs, which are\nnot physically acquired, still obey certain statistics that could be understood\nin a learning fashion. We empirically show that the statistics deviation could\nbe effectively leveraged in quality assessment, and the proposed method is\nsuperior when evaluated in different settings. Extensive experimental results\ndemonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA)\nmodel delivers promising performance compared with existing NR-IQA models and\nshows a high generalization capability in the cross-dataset settings. The\nimplementation of our method is publicly available at\nhttps://github.com/Baoliang93/DFSS-IQA.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2209.05321v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13816v1","updated":"2024-04-22T01:36:50Z","published":"2024-04-22T01:36:50Z","title":"Neural Radiance Field in Autonomous Driving: A Survey","summary":" Neural Radiance Field (NeRF) has garnered significant attention from both\nacademia and industry due to its intrinsic advantages, particularly its\nimplicit representation and novel view synthesis capabilities. With the rapid\nadvancements in deep learning, a multitude of methods have emerged to explore\nthe potential applications of NeRF in the domain of Autonomous Driving (AD).\nHowever, a conspicuous void is apparent within the current literature. To\nbridge this gap, this paper conducts a comprehensive survey of NeRF's\napplications in the context of AD. Our survey is structured to categorize\nNeRF's applications in Autonomous Driving (AD), specifically encompassing\nperception, 3D reconstruction, simultaneous localization and mapping (SLAM),\nand simulation. We delve into in-depth analysis and summarize the findings for\neach application category, and conclude by providing insights and discussions\non future directions in this field. We hope this paper serves as a\ncomprehensive reference for researchers in this domain. 
To the best of our\nknowledge, this is the first survey specifically focused on the applications of\nNeRF in the Autonomous Driving domain.\n","authors":["Lei He","Leheng Li","Wenchao Sun","Zeyu Han","Yichen Liu","Sifa Zheng","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.13816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12814v2","updated":"2024-04-22T01:14:11Z","published":"2024-04-19T11:49:01Z","title":"Generative Modelling with High-Order Langevin Dynamics","summary":" Diffusion generative modelling (DGM) based on stochastic differential\nequations (SDEs) with score matching has achieved unprecedented results in data\ngeneration. In this paper, we propose a novel fast high-quality generative\nmodelling method based on high-order Langevin dynamics (HOLD) with score\nmatching. This motive is proved by third-order Langevin dynamics. By augmenting\nthe previous SDEs, e.g. variance exploding or variance preserving SDEs for\nsingle-data variable processes, HOLD can simultaneously model position,\nvelocity, and acceleration, thereby improving the quality and speed of the data\ngeneration at the same time. HOLD is composed of one Ornstein-Uhlenbeck process\nand two Hamiltonians, which reduce the mixing time by two orders of magnitude.\nEmpirical experiments for unconditional image generation on the public data set\nCIFAR-10 and CelebA-HQ show that the effect is significant in both Frechet\ninception distance (FID) and negative log-likelihood, and achieves the\nstate-of-the-art FID of 1.85 on CIFAR-10.\n","authors":["Ziqiang Shi","Rujie Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12814v2.pdf","comment":"Some of the results in this paper have been published or accepted at\n conferences such as wacv2024, icassp2024, and icme2024"},{"id":"http://arxiv.org/abs/2404.13807v1","updated":"2024-04-22T00:44:13Z","published":"2024-04-22T00:44:13Z","title":"FaceFolds: Meshed Radiance Manifolds for Efficient Volumetric Rendering\n of Dynamic Faces","summary":" 3D rendering of dynamic face captures is a challenging problem, and it\ndemands improvements on several fronts$\\unicode{x2014}$photorealism,\nefficiency, compatibility, and configurability. We present a novel\nrepresentation that enables high-quality volumetric rendering of an actor's\ndynamic facial performances with minimal compute and memory footprint. It runs\nnatively on commodity graphics soft- and hardware, and allows for a graceful\ntrade-off between quality and efficiency. Our method utilizes recent advances\nin neural rendering, particularly learning discrete radiance manifolds to\nsparsely sample the scene to model volumetric effects. We achieve efficient\nmodeling by learning a single set of manifolds for the entire dynamic sequence,\nwhile implicitly modeling appearance changes as temporal canonical texture. We\nexport a single layered mesh and view-independent RGBA texture video that is\ncompatible with legacy graphics renderers without additional ML integration. We\ndemonstrate our method by rendering dynamic face captures of real actors in a\ngame engine, at comparable photorealism to state-of-the-art neural rendering\ntechniques at previously unseen frame rates.\n","authors":["Safa C. Medin","Gengyan Li","Ruofei Du","Stephan Garbin","Philip Davidson","Gregory W. 
Wornell","Thabo Beeler","Abhimitra Meka"],"pdf_url":"https://arxiv.org/pdf/2404.13807v1.pdf","comment":"In Proceedings of the ACM in Computer Graphics and Interactive\n Techniques, 2024"},{"id":"http://arxiv.org/abs/2312.15320v2","updated":"2024-04-22T00:41:34Z","published":"2023-12-23T18:40:25Z","title":"GestaltMML: Enhancing Rare Genetic Disease Diagnosis through Multimodal\n Machine Learning Combining Facial Images and Clinical Texts","summary":" Individuals with suspected rare genetic disorders often undergo multiple\nclinical evaluations, imaging studies, laboratory tests and genetic tests, to\nfind a possible answer over a prolonged period of time. Addressing this\n\"diagnostic odyssey\" thus has substantial clinical, psychosocial, and economic\nbenefits. Many rare genetic diseases have distinctive facial features, which\ncan be used by artificial intelligence algorithms to facilitate clinical\ndiagnosis, in prioritizing candidate diseases to be further examined by lab\ntests or genetic assays, or in helping the phenotype-driven reinterpretation of\ngenome/exome sequencing data. Existing methods using frontal facial photos were\nbuilt on conventional Convolutional Neural Networks (CNNs), rely exclusively on\nfacial images, and cannot capture non-facial phenotypic traits and demographic\ninformation essential for guiding accurate diagnoses. Here we introduce\nGestaltMML, a multimodal machine learning (MML) approach solely based on the\nTransformer architecture. It integrates facial images, demographic information\n(age, sex, ethnicity), and clinical notes (optionally, a list of Human\nPhenotype Ontology terms) to improve prediction accuracy. Furthermore, we also\nevaluated GestaltMML on a diverse range of datasets, including 528 diseases\nfrom the GestaltMatcher Database, several in-house datasets of\nBeckwith-Wiedemann syndrome (BWS, over-growth syndrome with distinct facial\nfeatures), Sotos syndrome (overgrowth syndrome with overlapping features with\nBWS), NAA10-related neurodevelopmental syndrome, Cornelia de Lange syndrome\n(multiple malformation syndrome), and KBG syndrome (multiple malformation\nsyndrome). Our results suggest that GestaltMML effectively incorporates\nmultiple modalities of data, greatly narrowing candidate genetic diagnoses of\nrare diseases and may facilitate the reinterpretation of genome/exome\nsequencing data.\n","authors":["Da Wu","Jingye Yang","Cong Liu","Tzung-Chien Hsieh","Elaine Marchi","Justin Blair","Peter Krawitz","Chunhua Weng","Wendy Chung","Gholson J. Lyon","Ian D. Krantz","Jennifer M. Kalish","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15320v2.pdf","comment":"Significant revisions"}]},"2024-04-23T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.15276v1","updated":"2024-04-23T17:59:59Z","published":"2024-04-23T17:59:59Z","title":"SMPLer: Taming Transformers for Monocular 3D Human Shape and Pose\n Estimation","summary":" Existing Transformers for monocular 3D human shape and pose estimation\ntypically have a quadratic computation and memory complexity with respect to\nthe feature length, which hinders the exploitation of fine-grained information\nin high-resolution features that is beneficial for accurate reconstruction. In\nthis work, we propose an SMPL-based Transformer framework (SMPLer) to address\nthis issue. 
SMPLer incorporates two key ingredients: a decoupled attention\noperation and an SMPL-based target representation, which allow effective\nutilization of high-resolution features in the Transformer. In addition, based\non these two designs, we also introduce several novel modules including a\nmulti-scale attention and a joint-aware attention to further boost the\nreconstruction performance. Extensive experiments demonstrate the effectiveness\nof SMPLer against existing 3D human shape and pose estimation methods both\nquantitatively and qualitatively. Notably, the proposed algorithm achieves an\nMPJPE of 45.2 mm on the Human3.6M dataset, improving upon Mesh Graphormer by\nmore than 10% with fewer than one-third of the parameters. Code and pretrained\nmodels are available at https://github.com/xuxy09/SMPLer.\n","authors":["Xiangyu Xu","Lijuan Liu","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.15276v1.pdf","comment":"Published at TPAMI 2024"},{"id":"http://arxiv.org/abs/2404.15275v1","updated":"2024-04-23T17:59:43Z","published":"2024-04-23T17:59:43Z","title":"ID-Animator: Zero-Shot Identity-Preserving Human Video Generation","summary":" Generating high fidelity human video with specified identities has attracted\nsignificant attention in the content generation community. However, existing\ntechniques struggle to strike a balance between training efficiency and\nidentity preservation, either requiring tedious case-by-case finetuning or\nusually missing the identity details in video generation process. In this\nstudy, we present ID-Animator, a zero-shot human-video generation approach that\ncan perform personalized video generation given single reference facial image\nwithout further training. ID-Animator inherits existing diffusion-based video\ngeneration backbones with a face adapter to encode the ID-relevant embeddings\nfrom learnable facial latent queries. To facilitate the extraction of identity\ninformation in video generation, we introduce an ID-oriented dataset\nconstruction pipeline, which incorporates decoupled human attribute and action\ncaptioning technique from a constructed facial image pool. Based on this\npipeline, a random face reference training method is further devised to\nprecisely capture the ID-relevant embeddings from reference images, thus\nimproving the fidelity and generalization capacity of our model for ID-specific\nvideo generation. Extensive experiments demonstrate the superiority of\nID-Animator to generate personalized human videos over previous models.\nMoreover, our method is highly compatible with popular pre-trained T2V models\nlike animatediff and various community backbone models, showing high\nextendability in real-world applications for video generation where identity\npreservation is highly desired. Our codes and checkpoints will be released at\nhttps://github.com/ID-Animator/ID-Animator.\n","authors":["Xuanhua He","Quande Liu","Shengju Qian","Xin Wang","Tao Hu","Ke Cao","Keyu Yan","Man Zhou","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15275v1.pdf","comment":"Project Page: https://id-animator.github.io/"},{"id":"http://arxiv.org/abs/2312.07530v2","updated":"2024-04-23T17:59:25Z","published":"2023-12-12T18:57:25Z","title":"Weakly Supervised 3D Object Detection via Multi-Level Visual Guidance","summary":" Weakly supervised 3D object detection aims to learn a 3D detector with lower\nannotation cost, e.g., 2D labels. 
Unlike prior work which still relies on few\naccurate 3D annotations, we propose a framework to study how to leverage\nconstraints between 2D and 3D domains without requiring any 3D labels.\nSpecifically, we employ visual data from three perspectives to establish\nconnections between 2D and 3D domains. First, we design a feature-level\nconstraint to align LiDAR and image features based on object-aware regions.\nSecond, the output-level constraint is developed to enforce the overlap between\n2D and projected 3D box estimations. Finally, the training-level constraint is\nutilized by producing accurate and consistent 3D pseudo-labels that align with\nthe visual data. We conduct extensive experiments on the KITTI dataset to\nvalidate the effectiveness of the proposed three constraints. Without using any\n3D labels, our method achieves favorable performance against state-of-the-art\napproaches and is competitive with the method that uses 500-frame 3D\nannotations. Code and models will be made publicly available at\nhttps://github.com/kuanchihhuang/VG-W3D.\n","authors":["Kuan-Chih Huang","Yi-Hsuan Tsai","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07530v2.pdf","comment":"Project page: https://github.com/kuanchihhuang/VG-W3D"},{"id":"http://arxiv.org/abs/2404.15274v1","updated":"2024-04-23T17:59:12Z","published":"2024-04-23T17:59:12Z","title":"Metric-guided Image Reconstruction Bounds via Conformal Prediction","summary":" Recent advancements in machine learning have led to novel imaging systems and\nalgorithms that address ill-posed problems. Assessing their trustworthiness and\nunderstanding how to deploy them safely at test time remains an important and\nopen problem. We propose a method that leverages conformal prediction to\nretrieve upper/lower bounds and statistical inliers/outliers of reconstructions\nbased on the prediction intervals of downstream metrics. We apply our method to\nsparse-view CT for downstream radiotherapy planning and show 1) that\nmetric-guided bounds have valid coverage for downstream metrics while\nconventional pixel-wise bounds do not and 2) anatomical differences of\nupper/lower bounds between metric-guided and pixel-wise methods. Our work paves\nthe way for more meaningful reconstruction bounds. Code available at\nhttps://github.com/matthewyccheung/conformal-metric\n","authors":["Matt Y Cheung","Tucker J Netherton","Laurence E Court","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2404.15274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15272v1","updated":"2024-04-23T17:59:01Z","published":"2024-04-23T17:59:01Z","title":"CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and\n Radiology Reports for Full-Body Scenarios","summary":" Medical Vision-Language Pretraining (Med-VLP) establishes a connection\nbetween visual content from medical images and the relevant textual\ndescriptions. Existing Med-VLP methods primarily focus on 2D images depicting a\nsingle body part, notably chest X-rays. In this paper, we extend the scope of\nMed-VLP to encompass 3D images, specifically targeting full-body scenarios, by\nusing a multimodal dataset of CT images and reports. Compared with the 2D\ncounterpart, 3D VLP is required to effectively capture essential semantics from\nsignificantly sparser representation in 3D imaging. 
In this paper, we introduce\nCT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method\nthat constructs organ-level image-text pairs to enhance multimodal contrastive\nlearning, aligning grounded visual features with precise diagnostic text.\nAdditionally, we developed an abnormality dictionary to augment contrastive\nlearning with diverse negative samples. Our method, trained on a multimodal CT\ndataset comprising 44,011 organ-level vision-text pairs from 17,702 patients\nacross 104 organs, demonstrates it can identify organs and abnormalities in a\nzero-shot manner using natural languages. The performance of CT-GLIP is\nvalidated on a separate test set of 1,130 patients, focusing on the 16 most\nfrequent abnormalities across 7 organs. The experimental results show our\nmodel's superior performance over the standard CLIP framework across zero-shot\nand fine-tuning scenarios, using both CNN and ViT architectures.\n","authors":["Jingyang Lin","Yingda Xia","Jianpeng Zhang","Ke Yan","Le Lu","Jiebo Luo","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15272v1.pdf","comment":"12 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.15271v1","updated":"2024-04-23T17:58:33Z","published":"2024-04-23T17:58:33Z","title":"Automatic Layout Planning for Visually-Rich Documents with\n Instruction-Following Models","summary":" Recent advancements in instruction-following models have made user\ninteractions with models more user-friendly and efficient, broadening their\napplicability. In graphic design, non-professional users often struggle to\ncreate visually appealing layouts due to limited skills and resources. In this\nwork, we introduce a novel multimodal instruction-following framework for\nlayout planning, allowing users to easily arrange visual elements into tailored\nlayouts by specifying canvas size and design purpose, such as for book covers,\nposters, brochures, or menus. We developed three layout reasoning tasks to\ntrain the model in understanding and executing layout instructions. Experiments\non two benchmarks show that our method not only simplifies the design process\nfor non-professionals but also surpasses the performance of few-shot GPT-4V\nmodels, with mIoU higher by 12% on Crello. This progress highlights the\npotential of multimodal instruction-following models to automate and simplify\nthe design process, providing an approachable solution for a wide range of\ndesign tasks on visually-rich documents.\n","authors":["Wanrong Zhu","Jennifer Healey","Ruiyi Zhang","William Yang Wang","Tong Sun"],"pdf_url":"https://arxiv.org/pdf/2404.15271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15267v1","updated":"2024-04-23T17:56:08Z","published":"2024-04-23T17:56:08Z","title":"From Parts to Whole: A Unified Reference Framework for Controllable\n Human Image Generation","summary":" Recent advancements in controllable human image generation have led to\nzero-shot generation using structural signals (e.g., pose, depth) or facial\nappearance. Yet, generating human images conditioned on multiple parts of human\nappearance remains challenging. Addressing this, we introduce Parts2Whole, a\nnovel framework designed for generating customized portraits from multiple\nreference images, including pose images and various aspects of human\nappearance. 
To achieve this, we first develop a semantic-aware appearance\nencoder to retain details of different human parts, which processes each image\nbased on its textual label to a series of multi-scale feature maps rather than\none image token, preserving the image dimension. Second, our framework supports\nmulti-image conditioned generation through a shared self-attention mechanism\nthat operates across reference and target features during the diffusion\nprocess. We enhance the vanilla attention mechanism by incorporating mask\ninformation from the reference human images, allowing for the precise selection\nof any part. Extensive experiments demonstrate the superiority of our approach\nover existing alternatives, offering advanced capabilities for multi-part\ncontrollable human image customization. See our project page at\nhttps://huanngzh.github.io/Parts2Whole/.\n","authors":["Zehuan Huang","Hongxing Fan","Lipeng Wang","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2404.15267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12060v3","updated":"2024-04-23T17:55:37Z","published":"2023-03-21T17:51:23Z","title":"VideoXum: Cross-modal Visual and Textural Summarization of Videos","summary":" Video summarization aims to distill the most important information from a\nsource video to produce either an abridged clip or a textual narrative.\nTraditionally, different methods have been proposed depending on whether the\noutput is a video or text, thus ignoring the correlation between the two\nsemantically related tasks of visual summarization and textual summarization.\nWe propose a new joint video and text summarization task. The goal is to\ngenerate both a shortened video clip along with the corresponding textual\nsummary from a long video, collectively referred to as a cross-modal summary.\nThe generated shortened video clip and text narratives should be semantically\nwell aligned. To this end, we first build a large-scale human-annotated dataset\n-- VideoXum (X refers to different modalities). The dataset is reannotated\nbased on ActivityNet. After we filter out the videos that do not meet the\nlength requirements, 14,001 long videos remain in our new dataset. Each video\nin our reannotated dataset has human-annotated video summaries and the\ncorresponding narrative summaries. We then design a novel end-to-end model --\nVTSUM-BILP to address the challenges of our proposed task. Moreover, we propose\na new metric called VT-CLIPScore to help evaluate the semantic consistency of\ncross-modality summary. The proposed model achieves promising performance on\nthis new task and establishes a benchmark for future research.\n","authors":["Jingyang Lin","Hang Hua","Ming Chen","Yikang Li","Jenhao Hsiao","Chiuman Ho","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2303.12060v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.15264v1","updated":"2024-04-23T17:55:07Z","published":"2024-04-23T17:55:07Z","title":"TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via\n Gaussian Splatting","summary":" Radiance fields have demonstrated impressive performance in synthesizing\nlifelike 3D talking heads. However, due to the difficulty in fitting steep\nappearance changes, the prevailing paradigm that presents facial motions by\ndirectly modifying point appearance may lead to distortions in dynamic regions.\nTo tackle this challenge, we introduce TalkingGaussian, a deformation-based\nradiance fields framework for high-fidelity talking head synthesis. 
Leveraging\nthe point-based Gaussian Splatting, facial motions can be represented in our\nmethod by applying smooth and continuous deformations to persistent Gaussian\nprimitives, without requiring to learn the difficult appearance change like\nprevious methods. Due to this simplification, precise facial motions can be\nsynthesized while keeping a highly intact facial feature. Under such a\ndeformation paradigm, we further identify a face-mouth motion inconsistency\nthat would affect the learning of detailed speaking motions. To address this\nconflict, we decompose the model into two branches separately for the face and\ninside mouth areas, therefore simplifying the learning tasks to help\nreconstruct more accurate motion and structure of the mouth region. Extensive\nexperiments demonstrate that our method renders high-quality lip-synchronized\ntalking head videos, with better facial fidelity and higher efficiency compared\nwith previous methods.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jin Zheng","Xin Ning","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2404.15264v1.pdf","comment":"Project page: https://fictionarry.github.io/TalkingGaussian/"},{"id":"http://arxiv.org/abs/2404.15263v1","updated":"2024-04-23T17:55:05Z","published":"2024-04-23T17:55:05Z","title":"Multi-Session SLAM with Differentiable Wide-Baseline Pose Optimization","summary":" We introduce a new system for Multi-Session SLAM, which tracks camera motion\nacross multiple disjoint videos under a single global reference. Our approach\ncouples the prediction of optical flow with solver layers to estimate camera\npose. The backbone is trained end-to-end using a novel differentiable solver\nfor wide-baseline two-view pose. The full system can connect disjoint\nsequences, perform visual odometry, and global optimization. Compared to\nexisting approaches, our design is accurate and robust to catastrophic\nfailures. Code is available at github.com/princeton-vl/MultiSlam_DiffPose\n","authors":["Lahav Lipson","Jia Deng"],"pdf_url":"https://arxiv.org/pdf/2404.15263v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15259v1","updated":"2024-04-23T17:46:50Z","published":"2024-04-23T17:46:50Z","title":"FlowMap: High-Quality Camera Poses, Intrinsics, and Depth via Gradient\n Descent","summary":" This paper introduces FlowMap, an end-to-end differentiable method that\nsolves for precise camera poses, camera intrinsics, and per-frame dense depth\nof a video sequence. Our method performs per-video gradient-descent\nminimization of a simple least-squares objective that compares the optical flow\ninduced by depth, intrinsics, and poses against correspondences obtained via\noff-the-shelf optical flow and point tracking. Alongside the use of point\ntracks to encourage long-term geometric consistency, we introduce\ndifferentiable re-parameterizations of depth, intrinsics, and pose that are\namenable to first-order optimization. We empirically show that camera\nparameters and dense depth recovered by our method enable photo-realistic novel\nview synthesis on 360-degree trajectories using Gaussian Splatting. 
Our method\nnot only far outperforms prior gradient-descent based bundle adjustment\nmethods, but surprisingly performs on par with COLMAP, the state-of-the-art SfM\nmethod, on the downstream task of 360-degree novel view synthesis (even though\nour method is purely gradient-descent based, fully differentiable, and presents\na complete departure from conventional SfM).\n","authors":["Cameron Smith","David Charatan","Ayush Tewari","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2404.15259v1.pdf","comment":"Project website: https://cameronosmith.github.io/flowmap/"},{"id":"http://arxiv.org/abs/2404.15256v1","updated":"2024-04-23T17:42:45Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15254v1","updated":"2024-04-23T17:39:27Z","published":"2024-04-23T17:39:27Z","title":"UniMERNet: A Universal Network for Real-World Mathematical Expression\n Recognition","summary":" This paper presents the UniMER dataset to provide the first study on\nMathematical Expression Recognition (MER) towards complex real-world scenarios.\nThe UniMER dataset consists of a large-scale training set UniMER-1M offering an\nunprecedented scale and diversity with one million training instances and a\nmeticulously designed test set UniMER-Test that reflects a diverse range of\nformula distributions prevalent in real-world scenarios. Therefore, the UniMER\ndataset enables the training of a robust and high-accuracy MER model and\ncomprehensive evaluation of model performance. Moreover, we introduce the\nUniversal Mathematical Expression Recognition Network (UniMERNet), an\ninnovative framework designed to enhance MER in practical scenarios. 
UniMERNet\nincorporates a Length-Aware Module to process formulas of varied lengths\nefficiently, thereby enabling the model to handle complex mathematical\nexpressions with greater accuracy. In addition, UniMERNet employs our UniMER-1M\ndata and image augmentation techniques to improve the model's robustness under\ndifferent noise conditions. Our extensive experiments demonstrate that\nUniMERNet outperforms existing MER models, setting a new benchmark in various\nscenarios and ensuring superior recognition quality in real-world applications.\nThe dataset and model are available at\nhttps://github.com/opendatalab/UniMERNet.\n","authors":["Bin Wang","Zhuangcheng Gu","Chao Xu","Bo Zhang","Botian Shi","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.15254v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.15252v1","updated":"2024-04-23T17:39:06Z","published":"2024-04-23T17:39:06Z","title":"Source-free Domain Adaptation for Video Object Detection Under Adverse\n Image Conditions","summary":" When deploying pre-trained video object detectors in real-world scenarios,\nthe domain gap between training and testing data caused by adverse image\nconditions often leads to performance degradation. Addressing this issue\nbecomes particularly challenging when only the pre-trained model and degraded\nvideos are available. Although various source-free domain adaptation (SFDA)\nmethods have been proposed for single-frame object detectors, SFDA for video\nobject detection (VOD) remains unexplored. Moreover, most unsupervised domain\nadaptation works for object detection rely on two-stage detectors, while SFDA\nfor one-stage detectors, which are more vulnerable to fine-tuning, is not well\naddressed in the literature. In this paper, we propose Spatial-Temporal\nAlternate Refinement with Mean Teacher (STAR-MT), a simple yet effective SFDA\nmethod for VOD. Specifically, we aim to improve the performance of the\none-stage VOD method, YOLOV, under adverse image conditions, including noise,\nair turbulence, and haze. Extensive experiments on the ImageNetVOD dataset and\nits degraded versions demonstrate that our method consistently improves video\nobject detection performance in challenging imaging conditions, showcasing its\npotential for real-world applications.\n","authors":["Xingguang Zhang","Chih-Hsien Chou"],"pdf_url":"https://arxiv.org/pdf/2404.15252v1.pdf","comment":"accepted by the UG2+ workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15244v1","updated":"2024-04-23T17:26:34Z","published":"2024-04-23T17:26:34Z","title":"Efficient Transformer Encoders for Mask2Former-style models","summary":" Vision transformer based models bring significant improvements for image\nsegmentation tasks. Although these architectures offer powerful capabilities\nirrespective of specific segmentation tasks, their use of computational\nresources can be taxing on deployed devices. One way to overcome this challenge\nis by adapting the computation level to the specific needs of the input image\nrather than the current one-size-fits-all approach. To this end, we introduce\nECO-M2F or EffiCient TransfOrmer Encoders for Mask2Former-style models. Noting\nthat the encoder module of M2F-style models incur high resource-intensive\ncomputations, ECO-M2F provides a strategy to self-select the number of hidden\nlayers in the encoder, conditioned on the input image. To enable this\nself-selection ability for providing a balance between performance and\ncomputational efficiency, we present a three step recipe. 
The first step is to\ntrain the parent architecture to enable early exiting from the encoder. The\nsecond step is to create an derived dataset of the ideal number of encoder\nlayers required for each training example. The third step is to use the\naforementioned derived dataset to train a gating network that predicts the\nnumber of encoder layers to be used, conditioned on the input image.\nAdditionally, to change the computational-accuracy tradeoff, only steps two and\nthree need to be repeated which significantly reduces retraining time.\nExperiments on the public datasets show that the proposed approach reduces\nexpected encoder computational cost while maintaining performance, adapts to\nvarious user compute resources, is flexible in architecture configurations, and\ncan be extended beyond the segmentation task to object detection.\n","authors":["Manyi Yao","Abhishek Aich","Yumin Suh","Amit Roy-Chowdhury","Christian Shelton","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.15244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15234v1","updated":"2024-04-23T17:10:49Z","published":"2024-04-23T17:10:49Z","title":"Massively Annotated Datasets for Assessment of Synthetic and Real Data\n in Face Recognition","summary":" Face recognition applications have grown in parallel with the size of\ndatasets, complexity of deep learning models and computational power. However,\nwhile deep learning models evolve to become more capable and computational\npower keeps increasing, the datasets available are being retracted and removed\nfrom public access. Privacy and ethical concerns are relevant topics within\nthese domains. Through generative artificial intelligence, researchers have put\nefforts into the development of completely synthetic datasets that can be used\nto train face recognition systems. Nonetheless, the recent advances have not\nbeen sufficient to achieve performance comparable to the state-of-the-art\nmodels trained on real data. To study the drift between the performance of\nmodels trained on real and synthetic datasets, we leverage a massive attribute\nclassifier (MAC) to create annotations for four datasets: two real and two\nsynthetic. From these annotations, we conduct studies on the distribution of\neach attribute within all four datasets. Additionally, we further inspect the\ndifferences between real and synthetic datasets on the attribute set. When\ncomparing through the Kullback-Leibler divergence we have found differences\nbetween real and synthetic samples. Interestingly enough, we have verified that\nwhile real samples suffice to explain the synthetic distribution, the opposite\ncould not be further from being true.\n","authors":["Pedro C. Neto","Rafael M. Mamede","Carolina Albuquerque","Tiago Gonçalves","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2404.15234v1.pdf","comment":"Accepted at FG 2024"},{"id":"http://arxiv.org/abs/2404.15228v1","updated":"2024-04-23T16:59:02Z","published":"2024-04-23T16:59:02Z","title":"Re-Thinking Inverse Graphics With Large Language Models","summary":" Inverse graphics -- the task of inverting an image into physical variables\nthat, when rendered, enable reproduction of the observed scene -- is a\nfundamental challenge in computer vision and graphics. Disentangling an image\ninto its constituent elements, such as the shape, color, and material\nproperties of the objects of the 3D scene that produced it, requires a\ncomprehensive understanding of the environment. 
This requirement limits the\nability of existing carefully engineered approaches to generalize across\ndomains. Inspired by the zero-shot ability of large language models (LLMs) to\ngeneralize to novel contexts, we investigate the possibility of leveraging the\nbroad world knowledge encoded in such models in solving inverse-graphics\nproblems. To this end, we propose the Inverse-Graphics Large Language Model\n(IG-LLM), an inverse-graphics framework centered around an LLM, that\nautoregressively decodes a visual embedding into a structured, compositional\n3D-scene representation. We incorporate a frozen pre-trained visual encoder and\na continuous numeric head to enable end-to-end training. Through our\ninvestigation, we demonstrate the potential of LLMs to facilitate inverse\ngraphics through next-token prediction, without the use of image-space\nsupervision. Our analysis opens up new possibilities for precise spatial\nreasoning about images that exploit the visual knowledge of LLMs. We will\nrelease our code and data to ensure the reproducibility of our investigation\nand to facilitate future research at https://ig-llm.is.tue.mpg.de/\n","authors":["Peter Kulits","Haiwen Feng","Weiyang Liu","Victoria Abrevaya","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2404.15228v1.pdf","comment":"31 pages; project page: https://ig-llm.is.tue.mpg.de/"},{"id":"http://arxiv.org/abs/2404.15224v1","updated":"2024-04-23T16:54:31Z","published":"2024-04-23T16:54:31Z","title":"Deep Models for Multi-View 3D Object Recognition: A Review","summary":" Human decision-making often relies on visual information from multiple\nperspectives or views. In contrast, machine learning-based object recognition\nutilizes information from a single image of the object. However, the\ninformation conveyed by a single image may not be sufficient for accurate\ndecision-making, particularly in complex recognition problems. The utilization\nof multi-view 3D representations for object recognition has thus far\ndemonstrated the most promising results for achieving state-of-the-art\nperformance. This review paper comprehensively covers recent progress in\nmulti-view 3D object recognition methods for 3D classification and retrieval\ntasks. Specifically, we focus on deep learning-based and transformer-based\ntechniques, as they are widely utilized and have achieved state-of-the-art\nperformance. We provide detailed information about existing deep learning-based\nand transformer-based multi-view 3D object recognition models, including the\nmost commonly used 3D datasets, camera configurations and number of views, view\nselection strategies, pre-trained CNN architectures, fusion strategies, and\nrecognition performance on 3D classification and 3D retrieval tasks.\nAdditionally, we examine various computer vision applications that use\nmulti-view classification. 
Finally, we highlight key findings and future\ndirections for developing multi-view 3D object recognition methods to provide\nreaders with a comprehensive understanding of the field.\n","authors":["Mona Alzahrani","Muhammad Usman","Salma Kammoun","Saeed Anwar","Tarek Helmy"],"pdf_url":"https://arxiv.org/pdf/2404.15224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15194v1","updated":"2024-04-23T16:33:28Z","published":"2024-04-23T16:33:28Z","title":"Closed Loop Interactive Embodied Reasoning for Robot Manipulation","summary":" Embodied reasoning systems integrate robotic hardware and cognitive processes\nto perform complex tasks typically in response to a natural language query\nabout a specific physical environment. This usually involves changing the\nbelief about the scene or physically interacting and changing the scene (e.g.\n'Sort the objects from lightest to heaviest'). In order to facilitate the\ndevelopment of such systems we introduce a new simulating environment that\nmakes use of MuJoCo physics engine and high-quality renderer Blender to provide\nrealistic visual observations that are also accurate to the physical state of\nthe scene. Together with the simulator we propose a new benchmark composed of\n10 classes of multi-step reasoning scenarios that require simultaneous visual\nand physical measurements. Finally, we develop a new modular Closed Loop\nInteractive Reasoning (CLIER) approach that takes into account the measurements\nof non-visual object properties, changes in the scene caused by external\ndisturbances as well as uncertain outcomes of robotic actions. We extensively\nevaluate our reasoning approach in simulation and in the real world\nmanipulation tasks with a success rate above 76% and 64%, respectively.\n","authors":["Michal Nazarczuk","Jan Kristof Behrens","Karla Stepanova","Matej Hoffmann","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2404.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15174v1","updated":"2024-04-23T16:14:20Z","published":"2024-04-23T16:14:20Z","title":"Fourier-enhanced Implicit Neural Fusion Network for Multispectral and\n Hyperspectral Image Fusion","summary":" Recently, implicit neural representations (INR) have made significant strides\nin various vision-related domains, providing a novel solution for Multispectral\nand Hyperspectral Image Fusion (MHIF) tasks. However, INR is prone to losing\nhigh-frequency information and is confined to the lack of global perceptual\ncapabilities. To address these issues, this paper introduces a Fourier-enhanced\nImplicit Neural Fusion Network (FeINFN) specifically designed for MHIF task,\ntargeting the following phenomena: The Fourier amplitudes of the HR-HSI latent\ncode and LR-HSI are remarkably similar; however, their phases exhibit different\npatterns. In FeINFN, we innovatively propose a spatial and frequency implicit\nfusion function (Spa-Fre IFF), helping INR capture high-frequency information\nand expanding the receptive field. Besides, a new decoder employing a complex\nGabor wavelet activation function, called Spatial-Frequency Interactive Decoder\n(SFID), is invented to enhance the interaction of INR features. Especially, we\nfurther theoretically prove that the Gabor wavelet activation possesses a\ntime-frequency tightness property that favors learning the optimal bandwidths\nin the decoder. Experiments on two benchmark MHIF datasets verify the\nstate-of-the-art (SOTA) performance of the proposed method, both visually and\nquantitatively. 
Also, ablation studies demonstrate the mentioned contributions.\nThe code will be available on Anonymous GitHub\n(https://anonymous.4open.science/r/FeINFN-15C9/) after possible acceptance.\n","authors":["Yu-Jie Liang","Zihan Cao","Liang-Jian Deng","Xiao Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15163v1","updated":"2024-04-23T16:02:33Z","published":"2024-04-23T16:02:33Z","title":"Adaptive Mixed-Scale Feature Fusion Network for Blind AI-Generated Image\n Quality Assessment","summary":" With the increasing maturity of the text-to-image and image-to-image\ngenerative models, AI-generated images (AGIs) have shown great application\npotential in advertisement, entertainment, education, social media, etc.\nAlthough remarkable advancements have been achieved in generative models, very\nfew efforts have been paid to design relevant quality assessment models. In\nthis paper, we propose a novel blind image quality assessment (IQA) network,\nnamed AMFF-Net, for AGIs. AMFF-Net evaluates AGI quality from three dimensions,\ni.e., \"visual quality\", \"authenticity\", and \"consistency\". Specifically,\ninspired by the characteristics of the human visual system and motivated by the\nobservation that \"visual quality\" and \"authenticity\" are characterized by both\nlocal and global aspects, AMFF-Net scales the image up and down and takes the\nscaled images and original-sized image as the inputs to obtain multi-scale\nfeatures. After that, an Adaptive Feature Fusion (AFF) block is used to\nadaptively fuse the multi-scale features with learnable weights. In addition,\nconsidering the correlation between the image and prompt, AMFF-Net compares the\nsemantic features from text encoder and image encoder to evaluate the\ntext-to-image alignment. We carry out extensive experiments on three AGI\nquality assessment databases, and the experimental results show that our\nAMFF-Net obtains better performance than nine state-of-the-art blind IQA\nmethods. The results of ablation experiments further demonstrate the\neffectiveness of the proposed multi-scale input strategy and AFF block.\n","authors":["Tianwei Zhou","Songbai Tan","Wei Zhou","Yu Luo","Yuan-Gen Wang","Guanghui Yue"],"pdf_url":"https://arxiv.org/pdf/2404.15163v1.pdf","comment":"IEEE Transactions on Broadcasting (TBC)"},{"id":"http://arxiv.org/abs/2404.15161v1","updated":"2024-04-23T16:01:33Z","published":"2024-04-23T16:01:33Z","title":"Combating Missing Modalities in Egocentric Videos at Test Time","summary":" Understanding videos that contain multiple modalities is crucial, especially\nin egocentric videos, where combining various sensory inputs significantly\nimproves tasks like action recognition and moment localization. However,\nreal-world applications often face challenges with incomplete modalities due to\nprivacy concerns, efficiency needs, or hardware issues. Current methods, while\neffective, often necessitate retraining the model entirely to handle missing\nmodalities, making them computationally intensive, particularly with large\ntraining datasets. In this study, we propose a novel approach to address this\nissue at test time without requiring retraining. We frame the problem as a\ntest-time adaptation task, where the model adjusts to the available unlabeled\ndata at test time. 
Our method, MiDl~(Mutual information with\nself-Distillation), encourages the model to be insensitive to the specific\nmodality source present during testing by minimizing the mutual information\nbetween the prediction and the available modality. Additionally, we incorporate\nself-distillation to maintain the model's original performance when both\nmodalities are available. MiDl represents the first self-supervised, online\nsolution for handling missing modalities exclusively at test time. Through\nexperiments with various pretrained models and datasets, MiDl demonstrates\nsubstantial performance improvement without the need for retraining.\n","authors":["Merey Ramazanova","Alejandro Pardo","Bernard Ghanem","Motasem Alfarra"],"pdf_url":"https://arxiv.org/pdf/2404.15161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06623v4","updated":"2024-04-23T15:56:34Z","published":"2023-11-11T17:52:06Z","title":"VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for\n Highway Surveillance through Graph Isomorphism and Transformer","summary":" Enhancing roadway safety has become an essential computer vision focus area\nfor Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle\nTrajectory Prediction (VTP) aims to forecast a vehicle's future positions based\non its past and current movements. VTP is a pivotal element for road safety,\naiding in applications such as traffic management, accident prevention,\nwork-zone safety, and energy optimization. While most works in this field focus\non autonomous driving, with the growing number of surveillance cameras, another\nsub-field emerges for surveillance VTP with its own set of challenges. In this\npaper, we introduce VT-Former, a novel transformer-based VTP approach for\nhighway safety and surveillance. In addition to utilizing transformers to\ncapture long-range temporal patterns, a new Graph Attentive Tokenization (GAT)\nmodule has been proposed to capture intricate social interactions among\nvehicles. This study seeks to explore both the advantages and the limitations\ninherent in combining transformer architecture with graphs for VTP. Our\ninvestigation, conducted across three benchmark datasets from diverse\nsurveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable\nperformance of VT-Former in predicting vehicle trajectories. This study\nunderscores the potential of VT-Former and its architecture, opening new\navenues for future research and exploration.\n","authors":["Armin Danesh Pazho","Ghazal Alinezhad Noghre","Vinit Katariya","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2311.06623v4.pdf","comment":"Completely updated based on the reviews received for the paper"},{"id":"http://arxiv.org/abs/2404.15141v1","updated":"2024-04-23T15:47:58Z","published":"2024-04-23T15:47:58Z","title":"CutDiffusion: A Simple, Fast, Cheap, and Strong Diffusion Extrapolation\n Method","summary":" Transforming large pre-trained low-resolution diffusion models to cater to\nhigher-resolution demands, i.e., diffusion extrapolation, significantly\nimproves diffusion adaptability. We propose tuning-free CutDiffusion, aimed at\nsimplifying and accelerating the diffusion extrapolation process, making it\nmore affordable and improving performance. CutDiffusion abides by the existing\npatch-wise extrapolation but cuts a standard patch diffusion process into an\ninitial phase focused on comprehensive structure denoising and a subsequent\nphase dedicated to specific detail refinement. 
Comprehensive experiments\nhighlight the numerous almighty advantages of CutDiffusion: (1) simple method\nconstruction that enables a concise higher-resolution diffusion process without\nthird-party engagement; (2) fast inference speed achieved through a single-step\nhigher-resolution diffusion process, and fewer inference patches required; (3)\ncheap GPU cost resulting from patch-wise inference and fewer patches during the\ncomprehensive structure denoising; (4) strong generation performance, stemming\nfrom the emphasis on specific detail refinement.\n","authors":["Mingbao Lin","Zhihang Lin","Wengyi Zhan","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15129v1","updated":"2024-04-23T15:29:02Z","published":"2024-04-23T15:29:02Z","title":"Gallbladder Cancer Detection in Ultrasound Images based on YOLO and\n Faster R-CNN","summary":" Medical image analysis is a significant application of artificial\nintelligence for disease diagnosis. A crucial step in this process is the\nidentification of regions of interest within the images. This task can be\nautomated using object detection algorithms. YOLO and Faster R-CNN are renowned\nfor such algorithms, each with its own strengths and weaknesses. This study\naims to explore the advantages of both techniques to select more accurate\nbounding boxes for gallbladder detection from ultrasound images, thereby\nenhancing gallbladder cancer classification. A fusion method that leverages the\nbenefits of both techniques is presented in this study. The proposed method\ndemonstrated superior classification performance, with an accuracy of 92.62%,\ncompared to the individual use of Faster R-CNN and YOLOv8, which yielded\naccuracies of 90.16% and 82.79%, respectively.\n","authors":["Sara Dadjouy","Hedieh Sajedi"],"pdf_url":"https://arxiv.org/pdf/2404.15129v1.pdf","comment":"Published in 2024 10th International Conference on Artificial\n Intelligence and Robotics (QICAR)"},{"id":"http://arxiv.org/abs/2404.15127v1","updated":"2024-04-23T15:27:19Z","published":"2024-04-23T15:27:19Z","title":"MedDr: Diagnosis-Guided Bootstrapping for Large-Scale Medical\n Vision-Language Learning","summary":" The rapid advancement of large-scale vision-language models has showcased\nremarkable capabilities across various tasks. However, the lack of extensive\nand high-quality image-text data in medicine has greatly hindered the\ndevelopment of large-scale medical vision-language models. In this work, we\npresent a diagnosis-guided bootstrapping strategy that exploits both image and\nlabel information to construct vision-language datasets. Based on the\nconstructed dataset, we developed MedDr, a generalist foundation model for\nhealthcare capable of handling diverse medical data modalities, including\nradiology, pathology, dermatology, retinography, and endoscopy. 
Moreover,\nduring inference, we propose a simple but effective retrieval-augmented medical\ndiagnosis strategy, which enhances the model's generalization ability.\nExtensive experiments on visual question answering, medical report generation,\nand medical image diagnosis demonstrate the superiority of our method.\n","authors":["Sunan He","Yuxiang Nie","Zhixuan Chen","Zhiyuan Cai","Hongmei Wang","Shu Yang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15121v1","updated":"2024-04-23T15:20:17Z","published":"2024-04-23T15:20:17Z","title":"Taming Diffusion Probabilistic Models for Character Control","summary":" We present a novel character control framework that effectively utilizes\nmotion diffusion probabilistic models to generate high-quality and diverse\ncharacter animations, responding in real-time to a variety of dynamic\nuser-supplied control signals. At the heart of our method lies a\ntransformer-based Conditional Autoregressive Motion Diffusion Model (CAMDM),\nwhich takes as input the character's historical motion and can generate a range\nof diverse potential future motions conditioned on high-level, coarse user\ncontrol. To meet the demands for diversity, controllability, and computational\nefficiency required by a real-time controller, we incorporate several key\nalgorithmic designs. These include separate condition tokenization,\nclassifier-free guidance on past motion, and heuristic future trajectory\nextension, all designed to address the challenges associated with taming motion\ndiffusion probabilistic models for character control. As a result, our work\nrepresents the first model that enables real-time generation of high-quality,\ndiverse character animations based on user interactive control, supporting\nanimating the character in multiple styles with a single unified model. We\nevaluate our method on a diverse set of locomotion skills, demonstrating the\nmerits of our method over existing character controllers. Project page and\nsource codes: https://aiganimation.github.io/CAMDM/\n","authors":["Rui Chen","Mingyi Shi","Shaoli Huang","Ping Tan","Taku Komura","Xuelin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15121v1.pdf","comment":"Accepted by SIGGRAPH 2024 (Conference Track). Project page and source\n codes: https://aiganimation.github.io/CAMDM/"},{"id":"http://arxiv.org/abs/2308.10680v2","updated":"2024-04-23T15:19:17Z","published":"2023-08-21T12:27:18Z","title":"Co-Speech Gesture Detection through Multi-Phase Sequence Labeling","summary":" Gestures are integral components of face-to-face communication. They unfold\nover time, often following predictable movement phases of preparation, stroke,\nand retraction. Yet, the prevalent approach to automatic gesture detection\ntreats the problem as binary classification, classifying a segment as either\ncontaining a gesture or not, thus failing to capture its inherently sequential\nand contextual nature. To address this, we introduce a novel framework that\nreframes the task as a multi-phase sequence labeling problem rather than binary\nclassification. Our model processes sequences of skeletal movements over time\nwindows, uses Transformer encoders to learn contextual embeddings, and\nleverages Conditional Random Fields to perform sequence labeling. We evaluate\nour proposal on a large dataset of diverse co-speech gestures in task-oriented\nface-to-face dialogues. 
The results consistently demonstrate that our method\nsignificantly outperforms strong baseline models in detecting gesture strokes.\nFurthermore, applying Transformer encoders to learn contextual embeddings from\nmovement sequences substantially improves gesture unit detection. These results\nhighlight our framework's capacity to capture the fine-grained dynamics of\nco-speech gesture phases, paving the way for more nuanced and accurate gesture\ndetection and analysis.\n","authors":["Esam Ghaleb","Ilya Burenko","Marlou Rasenberg","Wim Pouw","Peter Uhrig","Judith Holler","Ivan Toni","Aslı Özyürek","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2308.10680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15100v1","updated":"2024-04-23T14:53:15Z","published":"2024-04-23T14:53:15Z","title":"Multimodal Large Language Model is a Human-Aligned Annotator for\n Text-to-Image Generation","summary":" Recent studies have demonstrated the exceptional potentials of leveraging\nhuman preference datasets to refine text-to-image generative models, enhancing\nthe alignment between generated images and textual prompts. Despite these\nadvances, current human preference datasets are either prohibitively expensive\nto construct or suffer from a lack of diversity in preference dimensions,\nresulting in limited applicability for instruction tuning in open-source\ntext-to-image generative models and hinder further exploration. To address\nthese challenges and promote the alignment of generative models through\ninstruction tuning, we leverage multimodal large language models to create\nVisionPrefer, a high-quality and fine-grained preference dataset that captures\nmultiple preference aspects. We aggregate feedback from AI annotators across\nfour aspects: prompt-following, aesthetic, fidelity, and harmlessness to\nconstruct VisionPrefer. To validate the effectiveness of VisionPrefer, we train\na reward model VP-Score over VisionPrefer to guide the training of\ntext-to-image generative models and the preference prediction accuracy of\nVP-Score is comparable to human annotators. Furthermore, we use two\nreinforcement learning methods to supervised fine-tune generative models to\nevaluate the performance of VisionPrefer, and extensive experimental results\ndemonstrate that VisionPrefer significantly improves text-image alignment in\ncompositional image generation across diverse aspects, e.g., aesthetic, and\ngeneralizes better than previous human-preference metrics across various image\ndistributions. Moreover, VisionPrefer indicates that the integration of\nAI-generated synthetic data as a supervisory signal is a promising avenue for\nachieving improved alignment with human preferences in vision generative\nmodels.\n","authors":["Xun Wu","Shaohan Huang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.15100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00110v2","updated":"2024-04-23T14:49:45Z","published":"2023-11-30T18:19:47Z","title":"CLIP-QDA: An Explainable Concept Bottleneck Model","summary":" In this paper, we introduce an explainable algorithm designed from a\nmulti-modal foundation model, that performs fast and explainable image\nclassification. Drawing inspiration from CLIP-based Concept Bottleneck Models\n(CBMs), our method creates a latent space where each neuron is linked to a\nspecific word. Observing that this latent space can be modeled with simple\ndistributions, we use a Mixture of Gaussians (MoG) formalism to enhance the\ninterpretability of this latent space. 
Then, we introduce CLIP-QDA, a\nclassifier that only uses statistical values to infer labels from the concepts.\nIn addition, this formalism allows for both local and global explanations.\nThese explanations come from the inner design of our architecture, our work is\npart of a new family of greybox models, combining performances of opaque\nfoundation models and the interpretability of transparent models. Our empirical\nfindings show that in instances where the MoG assumption holds, CLIP-QDA\nachieves similar accuracy with state-of-the-art methods CBMs. Our explanations\ncompete with existing XAI methods while being faster to compute.\n","authors":["Rémi Kazmierczak","Eloïse Berthier","Goran Frehse","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2312.00110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13576v2","updated":"2024-04-23T14:37:57Z","published":"2024-02-21T07:16:06Z","title":"Improving Video Corpus Moment Retrieval with Partial Relevance\n Enhancement","summary":" Video Corpus Moment Retrieval (VCMR) is a new video retrieval task aimed at\nretrieving a relevant moment from a large corpus of untrimmed videos using a\ntext query. The relevance between the video and query is partial, mainly\nevident in two aspects:~(1)~Scope: The untrimmed video contains many frames,\nbut not all are relevant to the query. Strong relevance is typically observed\nonly within the relevant moment.~(2)~Modality: The relevance of the query\nvaries with different modalities. Action descriptions align more with visual\nelements, while character conversations are more related to textual\ninformation.Existing methods often treat all video contents equally, leading to\nsub-optimal moment retrieval. We argue that effectively capturing the partial\nrelevance between the query and video is essential for the VCMR task. To this\nend, we propose a Partial Relevance Enhanced Model~(PREM) to improve VCMR. VCMR\ninvolves two sub-tasks: video retrieval and moment localization. To align with\ntheir distinct objectives, we implement specialized partial relevance\nenhancement strategies. For video retrieval, we introduce a multi-modal\ncollaborative video retriever, generating different query representations for\nthe two modalities by modality-specific pooling, ensuring a more effective\nmatch. For moment localization, we propose the focus-then-fuse moment\nlocalizer, utilizing modality-specific gates to capture essential content. We\nalso introduce relevant content-enhanced training methods for both retriever\nand localizer to enhance the ability of model to capture relevant content.\nExperimental results on TVR and DiDeMo datasets show that the proposed model\noutperforms the baselines, achieving a new state-of-the-art of VCMR. The code\nis available at \\url{https://github.com/hdy007007/PREM}.\n","authors":["Danyang Hou","Liang Pang","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.13576v2.pdf","comment":"camera-ready version of ACM ICMR 2024"},{"id":"http://arxiv.org/abs/2404.15082v1","updated":"2024-04-23T14:31:44Z","published":"2024-04-23T14:31:44Z","title":"Harnessing Optical Imaging Limit through Atmospheric Scattering Media","summary":" Recording and identifying faint objects through atmospheric scattering media\nby an optical system are fundamentally interesting and technologically\nimportant. 
In this work, we introduce a comprehensive model that incorporates\ncontributions from target characteristics, atmospheric effects, imaging system,\ndigital processing, and visual perception to assess the ultimate perceptible\nlimit of geometrical imaging, specifically the angular resolution at the\nboundary of visible distance. The model allows to reevaluate the effectiveness\nof conventional imaging recording, processing, and perception and to analyze\nthe limiting factors that constrain image recognition capabilities in\natmospheric media. The simulations were compared with the experimental results\nmeasured in a fog chamber and outdoor settings. The results reveal general good\nagreement between analysis and experimental, pointing out the way to harnessing\nthe physical limit for optical imaging in scattering media. An immediate\napplication of the study is the extension of the image range by an amount of\n1.2 times with noise reduction via multi-frame averaging, hence greatly\nenhancing the capability of optical imaging in the atmosphere.\n","authors":["Libang Chen","Jun Yang","Lingye Chen","Yuyang Shui","Yikun Liu","Jianying Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.15082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15081v1","updated":"2024-04-23T14:31:15Z","published":"2024-04-23T14:31:15Z","title":"Perturbing Attention Gives You More Bang for the Buck: Subtle Imaging\n Perturbations That Efficiently Fool Customized Diffusion Models","summary":" Diffusion models (DMs) embark a new era of generative modeling and offer more\nopportunities for efficient generating high-quality and realistic data samples.\nHowever, their widespread use has also brought forth new challenges in model\nsecurity, which motivates the creation of more effective adversarial attackers\non DMs to understand its vulnerability. We propose CAAT, a simple but generic\nand efficient approach that does not require costly training to effectively\nfool latent diffusion models (LDMs). The approach is based on the observation\nthat cross-attention layers exhibits higher sensitivity to gradient change,\nallowing for leveraging subtle perturbations on published images to\nsignificantly corrupt the generated images. We show that a subtle perturbation\non an image can significantly impact the cross-attention layers, thus changing\nthe mapping between text and image during the fine-tuning of customized\ndiffusion models. Extensive experiments demonstrate that CAAT is compatible\nwith diverse diffusion models and outperforms baseline attack methods in a more\neffective (more noise) and efficient (twice as fast as Anti-DreamBooth and\nMist) manner.\n","authors":["Jingyao Xu","Yuetong Lu","Yandong Li","Siyang Lu","Dongdong Wang","Xiang Wei"],"pdf_url":"https://arxiv.org/pdf/2404.15081v1.pdf","comment":"Published at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15041v1","updated":"2024-04-23T13:43:33Z","published":"2024-04-23T13:43:33Z","title":"LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial\n Expression Recognition","summary":" Semi-supervised learning has emerged as a promising approach to tackle the\nchallenge of label scarcity in facial expression recognition (FER) task.\nHowever, current state-of-the-art methods primarily focus on one side of the\ncoin, i.e., generating high-quality pseudo-labels, while overlooking the other\nside: enhancing expression-relevant representations. 
In this paper, we unveil\nboth sides of the coin by proposing a unified framework termed hierarchicaL\ndEcoupling And Fusing (LEAF) to coordinate expression-relevant representations\nand pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical\nexpression-aware aggregation strategy that operates at three levels: semantic,\ninstance, and category. (1) At the semantic and instance levels, LEAF decouples\nrepresentations into expression-agnostic and expression-relevant components,\nand adaptively fuses them using learnable gating weights. (2) At the category\nlevel, LEAF assigns ambiguous pseudo-labels by decoupling predictions into\npositive and negative parts, and employs a consistency loss to ensure agreement\nbetween two augmented views of the same image. Extensive experiments on\nbenchmark datasets demonstrate that by unveiling and harmonizing both sides of\nthe coin, LEAF outperforms state-of-the-art semi-supervised FER methods,\neffectively leveraging both labeled and unlabeled data. Moreover, the proposed\nexpression-aware aggregation strategy can be seamlessly integrated into\nexisting semi-supervised frameworks, leading to significant performance gains.\n","authors":["Fan Zhang","Zhi-Qi Cheng","Jian Zhao","Xiaojiang Peng","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.15041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15037v1","updated":"2024-04-23T13:42:12Z","published":"2024-04-23T13:42:12Z","title":"DP-Net: Learning Discriminative Parts for image recognition","summary":" This paper presents Discriminative Part Network (DP-Net), a deep architecture\nwith strong interpretation capabilities, which exploits a pretrained\nConvolutional Neural Network (CNN) combined with a part-based recognition\nmodule. This system learns and detects parts in the images that are\ndiscriminative among categories, without the need for fine-tuning the CNN,\nmaking it more scalable than other part-based models. While part-based\napproaches naturally offer interpretable representations, we propose\nexplanations at image and category levels and introduce specific constraints on\nthe part learning process to make them more discrimative.\n","authors":["Ronan Sicre","Hanwei Zhang","Julien Dejasmin","Chiheb Daaloul","Stéphane Ayache","Thierry Artières"],"pdf_url":"https://arxiv.org/pdf/2404.15037v1.pdf","comment":"IEEE ICIP 2023"},{"id":"http://arxiv.org/abs/2402.14327v2","updated":"2024-04-23T13:41:47Z","published":"2024-02-22T06:47:44Z","title":"Subobject-level Image Tokenization","summary":" Transformer-based vision models typically tokenize images into fixed-size\nsquare patches as input units, which lacks the adaptability to image content\nand overlooks the inherent pixel grouping structure. Inspired by the subword\ntokenization widely adopted in language models, we propose an image tokenizer\nat a subobject level, where the subobjects are represented by semantically\nmeaningful image segments obtained by segmentation models (e.g., segment\nanything models). To implement a learning system based on subobject\ntokenization, we first introduced a Direct Segment Anything Model (DirectSAM)\nthat efficiently produces comprehensive segmentation of subobjects, then embed\nsubobjects into compact latent vectors and fed them into a large language model\nfor vision language learning. 
Empirical results demonstrated that our\nsubobject-level tokenization significantly facilitates efficient learning of\ntranslating images into object and attribute descriptions compared to the\ntraditional patch-level tokenization. Codes and models are open-sourced at\nhttps://github.com/ChenDelong1999/subobjects.\n","authors":["Delong Chen","Samuel Cahyawijaya","Jianfeng Liu","Baoyuan Wang","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2402.14327v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2004.05704v4","updated":"2024-04-23T13:38:36Z","published":"2020-04-12T21:45:23Z","title":"Visual Grounding Methods for VQA are Working for the Wrong Reasons!","summary":" Existing Visual Question Answering (VQA) methods tend to exploit dataset\nbiases and spurious statistical correlations, instead of producing right\nanswers for the right reasons. To address this issue, recent bias mitigation\nmethods for VQA propose to incorporate visual cues (e.g., human attention maps)\nto better ground the VQA models, showcasing impressive gains. However, we show\nthat the performance improvements are not a result of improved visual\ngrounding, but a regularization effect which prevents over-fitting to\nlinguistic priors. For instance, we find that it is not actually necessary to\nprovide proper, human-based cues; random, insensible cues also result in\nsimilar improvements. Based on this observation, we propose a simpler\nregularization scheme that does not require any external annotations and yet\nachieves near state-of-the-art performance on VQA-CPv2.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2004.05704v4.pdf","comment":"Published in ACL 2020 under the title \"A negative case analysis of\n visual grounding methods for VQA\""},{"id":"http://arxiv.org/abs/2404.15033v1","updated":"2024-04-23T13:38:01Z","published":"2024-04-23T13:38:01Z","title":"IPAD: Industrial Process Anomaly Detection Dataset","summary":" Video anomaly detection (VAD) is a challenging task aiming to recognize\nanomalies in video frames, and existing large-scale VAD researches primarily\nfocus on road traffic and human activity scenes. In industrial scenes, there\nare often a variety of unpredictable anomalies, and the VAD method can play a\nsignificant role in these scenarios. However, there is a lack of applicable\ndatasets and methods specifically tailored for industrial production scenarios\ndue to concerns regarding privacy and security. To bridge this gap, we propose\na new dataset, IPAD, specifically designed for VAD in industrial scenarios. The\nindustrial processes in our dataset are chosen through on-site factory research\nand discussions with engineers. This dataset covers 16 different industrial\ndevices and contains over 6 hours of both synthetic and real-world video\nfootage. Moreover, we annotate the key feature of the industrial process, ie,\nperiodicity. Based on the proposed dataset, we introduce a period memory module\nand a sliding window inspection mechanism to effectively investigate the\nperiodic information in a basic reconstruction model. Our framework leverages\nLoRA adapter to explore the effective migration of pretrained models, which are\ninitially trained using synthetic data, into real-world scenarios. 
Our proposed\ndataset and method will fill the gap in the field of industrial video anomaly\ndetection and drive the process of video understanding tasks as well as smart\nfactory deployment.\n","authors":["Jinfan Liu","Yichao Yan","Junjie Li","Weiming Zhao","Pengzhi Chu","Xingdong Sheng","Yunhui Liu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.15033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15028v1","updated":"2024-04-23T13:34:52Z","published":"2024-04-23T13:34:52Z","title":"PRISM: A Promptable and Robust Interactive Segmentation Model with\n Visual Prompts","summary":" In this paper, we present PRISM, a Promptable and Robust Interactive\nSegmentation Model, aiming for precise segmentation of 3D medical images. PRISM\naccepts various visual inputs, including points, boxes, and scribbles as sparse\nprompts, as well as masks as dense prompts. Specifically, PRISM is designed\nwith four principles to achieve robustness: (1) Iterative learning. The model\nproduces segmentations by using visual prompts from previous iterations to\nachieve progressive improvement. (2) Confidence learning. PRISM employs\nmultiple segmentation heads per input image, each generating a continuous map\nand a confidence score to optimize predictions. (3) Corrective learning.\nFollowing each segmentation iteration, PRISM employs a shallow corrective\nrefinement network to reassign mislabeled voxels. (4) Hybrid design. PRISM\nintegrates hybrid encoders to better capture both the local and global\ninformation. Comprehensive validation of PRISM is conducted using four public\ndatasets for tumor segmentation in the colon, pancreas, liver, and kidney,\nhighlighting challenges caused by anatomical variations and ambiguous\nboundaries in accurate tumor identification. Compared to state-of-the-art\nmethods, both with and without prompt engineering, PRISM significantly improves\nperformance, achieving results that are close to human levels. The code is\npublicly available at https://github.com/MedICL-VU/PRISM.\n","authors":["Hao Li","Han Liu","Dewei Hu","Jiacheng Wang","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2404.15028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07633v2","updated":"2024-04-23T13:33:26Z","published":"2023-10-11T16:28:24Z","title":"Attention-Map Augmentation for Hypercomplex Breast Cancer Classification","summary":" Breast cancer is the most widespread neoplasm among women and early detection\nof this disease is critical. Deep learning techniques have become of great\ninterest to improve diagnostic performance. However, distinguishing between\nmalignant and benign masses in whole mammograms poses a challenge, as they\nappear nearly identical to an untrained eye, and the region of interest (ROI)\nconstitutes only a small fraction of the entire image. In this paper, we\npropose a framework, parameterized hypercomplex attention maps (PHAM), to\novercome these problems. Specifically, we deploy an augmentation step based on\ncomputing attention maps. Then, the attention maps are used to condition the\nclassification step by constructing a multi-dimensional input comprised of the\noriginal breast cancer image and the corresponding attention map. In this step,\na parameterized hypercomplex neural network (PHNN) is employed to perform\nbreast cancer classification. The framework offers two main advantages. First,\nattention maps provide critical information regarding the ROI and allow the\nneural model to concentrate on it. 
Second, the hypercomplex architecture has\nthe ability to model local relations between input dimensions thanks to\nhypercomplex algebra rules, thus properly exploiting the information provided\nby the attention map. We demonstrate the efficacy of the proposed framework on\nboth mammography images as well as histopathological ones. We surpass\nattention-based state-of-the-art networks and the real-valued counterpart of\nour approach. The code of our work is available at\nhttps://github.com/ispamm/AttentionBCS.\n","authors":["Eleonora Lopez","Filippo Betello","Federico Carmignani","Eleonora Grassucci","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07633v2.pdf","comment":"Published in Elsevier Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2404.15024v1","updated":"2024-04-23T13:32:29Z","published":"2024-04-23T13:32:29Z","title":"A Learning Paradigm for Interpretable Gradients","summary":" This paper studies interpretability of convolutional networks by means of\nsaliency maps. Most approaches based on Class Activation Maps (CAM) combine\ninformation from fully connected layers and gradient through variants of\nbackpropagation. However, it is well understood that gradients are noisy and\nalternatives like guided backpropagation have been proposed to obtain better\nvisualization at inference. In this work, we present a novel training approach\nto improve the quality of gradients for interpretability. In particular, we\nintroduce a regularization loss such that the gradient with respect to the\ninput image obtained by standard backpropagation is similar to the gradient\nobtained by guided backpropagation. We find that the resulting gradient is\nqualitatively less noisy and improves quantitatively the interpretability\nproperties of different networks, using several interpretability methods.\n","authors":["Felipe Torres Figueroa","Hanwei Zhang","Ronan Sicre","Yannis Avrithis","Stephane Ayache"],"pdf_url":"https://arxiv.org/pdf/2404.15024v1.pdf","comment":"VISAPP 2024"},{"id":"http://arxiv.org/abs/2404.15022v1","updated":"2024-04-23T13:31:18Z","published":"2024-04-23T13:31:18Z","title":"A review of deep learning-based information fusion techniques for\n multimodal medical image classification","summary":" Multimodal medical imaging plays a pivotal role in clinical diagnosis and\nresearch, as it combines information from various imaging modalities to provide\na more comprehensive understanding of the underlying pathology. Recently, deep\nlearning-based multimodal fusion techniques have emerged as powerful tools for\nimproving medical image classification. This review offers a thorough analysis\nof the developments in deep learning-based multimodal fusion for medical\nclassification tasks. We explore the complementary relationships among\nprevalent clinical modalities and outline three main fusion schemes for\nmultimodal classification networks: input fusion, intermediate fusion\n(encompassing single-level fusion, hierarchical fusion, and attention-based\nfusion), and output fusion. By evaluating the performance of these fusion\ntechniques, we provide insight into the suitability of different network\narchitectures for various multimodal fusion scenarios and application domains.\nFurthermore, we delve into challenges related to network architecture\nselection, handling incomplete multimodal data management, and the potential\nlimitations of multimodal fusion. 
Finally, we spotlight the promising future of\nTransformer-based multimodal fusion techniques and give recommendations for\nfuture research in this rapidly evolving field.\n","authors":["Yihao Li","Mostafa El Habib Daho","Pierre-Henri Conze","Rachid Zeghlache","Hugo Le Boité","Ramin Tadayoni","Béatrice Cochener","Mathieu Lamard","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2404.15022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13388v2","updated":"2024-04-23T13:25:01Z","published":"2024-04-20T14:15:25Z","title":"Diagnosis of Multiple Fundus Disorders Amidst a Scarcity of Medical\n Experts Via Self-supervised Machine Learning","summary":" Fundus diseases are major causes of visual impairment and blindness\nworldwide, especially in underdeveloped regions, where the shortage of\nophthalmologists hinders timely diagnosis. AI-assisted fundus image analysis\nhas several advantages, such as high accuracy, reduced workload, and improved\naccessibility, but it requires a large amount of expert-annotated data to build\nreliable models. To address this dilemma, we propose a general self-supervised\nmachine learning framework that can handle diverse fundus diseases from\nunlabeled fundus images. Our method's AUC surpasses existing supervised\napproaches by 15.7%, and even exceeds performance of a single human expert.\nFurthermore, our model adapts well to various datasets from different regions,\nraces, and heterogeneous image sources or qualities from multiple cameras or\ndevices. Our method offers a label-free general framework to diagnose fundus\ndiseases, which could potentially benefit telehealth programs for early\nscreening of people at risk of vision loss.\n","authors":["Yong Liu","Mengtian Kang","Shuo Gao","Chi Zhang","Ying Liu","Shiming Li","Yue Qi","Arokia Nathan","Wenjun Xu","Chenyu Tang","Edoardo Occhipinti","Mayinuer Yusufu","Ningli Wang","Weiling Bai","Luigi Occhipinti"],"pdf_url":"https://arxiv.org/pdf/2404.13388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15014v1","updated":"2024-04-23T13:20:09Z","published":"2024-04-23T13:20:09Z","title":"OccGen: Generative Multi-modal 3D Occupancy Prediction for Autonomous\n Driving","summary":" Existing solutions for 3D semantic occupancy prediction typically treat the\ntask as a one-shot 3D voxel-wise segmentation perception problem. These\ndiscriminative methods focus on learning the mapping between the inputs and\noccupancy map in a single step, lacking the ability to gradually refine the\noccupancy map and the reasonable scene imaginative capacity to complete the\nlocal regions somewhere. In this paper, we introduce OccGen, a simple yet\npowerful generative perception model for the task of 3D semantic occupancy\nprediction. OccGen adopts a ''noise-to-occupancy'' generative paradigm,\nprogressively inferring and refining the occupancy map by predicting and\neliminating noise originating from a random Gaussian distribution. OccGen\nconsists of two main components: a conditional encoder that is capable of\nprocessing multi-modal inputs, and a progressive refinement decoder that\napplies diffusion denoising using the multi-modal features as conditions. A key\ninsight of this generative pipeline is that the diffusion denoising process is\nnaturally able to model the coarse-to-fine refinement of the dense 3D occupancy\nmap, therefore producing more detailed predictions. Extensive experiments on\nseveral occupancy benchmarks demonstrate the effectiveness of the proposed\nmethod compared to the state-of-the-art methods. 
For instance, OccGen\nrelatively enhances the mIoU by 9.5%, 6.3%, and 13.3% on nuScenes-Occupancy\ndataset under the muli-modal, LiDAR-only, and camera-only settings,\nrespectively. Moreover, as a generative perception model, OccGen exhibits\ndesirable properties that discriminative models cannot achieve, such as\nproviding uncertainty estimates alongside its multiple-step predictions.\n","authors":["Guoqing Wang","Zhongdao Wang","Pin Tang","Jilai Zheng","Xiangxuan Ren","Bailan Feng","Chao Ma"],"pdf_url":"https://arxiv.org/pdf/2404.15014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15010v1","updated":"2024-04-23T13:15:35Z","published":"2024-04-23T13:15:35Z","title":"X-3D: Explicit 3D Structure Modeling for Point Cloud Recognition","summary":" Numerous prior studies predominantly emphasize constructing relation vectors\nfor individual neighborhood points and generating dynamic kernels for each\nvector and embedding these into high-dimensional spaces to capture implicit\nlocal structures. However, we contend that such implicit high-dimensional\nstructure modeling approch inadequately represents the local geometric\nstructure of point clouds due to the absence of explicit structural\ninformation. Hence, we introduce X-3D, an explicit 3D structure modeling\napproach. X-3D functions by capturing the explicit local structural information\nwithin the input 3D space and employing it to produce dynamic kernels with\nshared weights for all neighborhood points within the current local region.\nThis modeling approach introduces effective geometric prior and significantly\ndiminishes the disparity between the local structure of the embedding space and\nthe original input point cloud, thereby improving the extraction of local\nfeatures. Experiments show that our method can be used on a variety of methods\nand achieves state-of-the-art performance on segmentation, classification,\ndetection tasks with lower extra computational cost, such as \\textbf{90.7\\%} on\nScanObjectNN for classification, \\textbf{79.2\\%} on S3DIS 6 fold and\n\\textbf{74.3\\%} on S3DIS Area 5 for segmentation, \\textbf{76.3\\%} on ScanNetV2\nfor segmentation and \\textbf{64.5\\%} mAP , \\textbf{46.9\\%} mAP on SUN RGB-D and\n\\textbf{69.0\\%} mAP , \\textbf{51.1\\%} mAP on ScanNetV2 . Our code is available\nat\n\\href{https://github.com/sunshuofeng/X-3D}{https://github.com/sunshuofeng/X-3D}.\n","authors":["Shuofeng Sun","Yongming Rao","Jiwen Lu","Haibin Yan"],"pdf_url":"https://arxiv.org/pdf/2404.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15009v1","updated":"2024-04-23T13:15:22Z","published":"2024-04-23T13:15:22Z","title":"The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus\n on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs)","summary":" Pediatric tumors of the central nervous system are the most common cause of\ncancer-related death in children. The five-year survival rate for high-grade\ngliomas in children is less than 20%. Due to their rarity, the diagnosis of\nthese entities is often delayed, their treatment is mainly based on historic\ntreatment concepts, and clinical trials require multi-institutional\ncollaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs\nchallenge, focused on pediatric brain tumors with data acquired across multiple\ninternational consortia dedicated to pediatric neuro-oncology and clinical\ntrials. 
The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together\nclinicians and AI/imaging scientists to lead to faster development of automated\nsegmentation techniques that could benefit clinical trials, and ultimately the\ncare of children with brain tumors.\n","authors":["Anahita Fathi Kazerooni","Nastaran Khalili","Deep Gandhi","Xinyang Liu","Zhifan Jiang","Syed Muhammed Anwar","Jake Albrecht","Maruf Adewole","Udunna Anazodo","Hannah Anderson","Sina Bagheri","Ujjwal Baid","Timothy Bergquist","Austin J. Borja","Evan Calabrese","Verena Chung","Gian-Marco Conte","Farouk Dako","James Eddy","Ivan Ezhov","Ariana Familiar","Keyvan Farahani","Anurag Gottipati","Debanjan Haldar","Shuvanjan Haldar","Juan Eugenio Iglesias","Anastasia Janas","Elaine Johansen","Blaise V Jones","Neda Khalili","Florian Kofler","Dominic LaBella","Hollie Anne Lai","Koen Van Leemput","Hongwei Bran Li","Nazanin Maleki","Aaron S McAllister","Zeke Meier","Bjoern Menze","Ahmed W Moawad","Khanak K Nandolia","Julija Pavaine","Marie Piraud","Tina Poussaint","Sanjay P Prabhu","Zachary Reitman","Andres Rodriguez","Jeffrey D Rudie","Mariana Sanchez-Montano","Ibraheem Salman Shaikh","Lubdha M. Shah","Nakul Sheth","Russel Taki Shinohara","Wenxin Tu","Karthik Viswanathan","Chunhao Wang","Jeffrey B Ware","Benedikt Wiestler","Walter Wiggins","Anna Zapaishchykova","Mariam Aboian","Miriam Bornhorst","Peter de Blank","Michelle Deutsch","Maryam Fouladi","Lindsey Hoffman","Benjamin Kann","Margot Lazow","Leonie Mikael","Ali Nabavizadeh","Roger Packer","Spyridon Bakas","Adam Resnick","Brian Rood","Arastoo Vossough","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2404.15009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15008v1","updated":"2024-04-23T13:15:07Z","published":"2024-04-23T13:15:07Z","title":"External Prompt Features Enhanced Parameter-efficient Fine-tuning for\n Salient Object Detection","summary":" Salient object detection (SOD) aims at finding the most salient objects in\nimages and outputs pixel-level binary masks. Transformer-based methods achieve\npromising performance due to their global semantic understanding, crucial for\nidentifying salient objects. However, these models tend to be large and require\nnumerous training parameters. To better harness the potential of transformers\nfor SOD, we propose a novel parameter-efficient fine-tuning method aimed at\nreducing the number of training parameters while enhancing the salient object\ndetection capability. Our model, termed EXternal Prompt features Enhanced\nadapteR Tuning (ExPert), features an encoder-decoder structure with adapters\nand injectors interspersed between the layers of a frozen transformer encoder.\nThe adapter modules adapt the pre-trained backbone to SOD while the injector\nmodules incorporate external prompt features to enhance the awareness of\nsalient objects. Comprehensive experiments demonstrate the superiority of our\nmethod. Surpassing former state-of-the-art (SOTA) models across five SOD\ndatasets, ExPert achieves 0.215 mean absolute error (MAE) in ECSSD dataset with\n80.2M trained parameters, 21% better than transformer-based SOTA model and 47%\nbetter than CNN-based SOTA model.\n","authors":["Wen Liang","Peipei Ran","Mengchao Bai","Xiao Liu","P. 
Bilha Githinji","Wei Zhao","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2404.15008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14055v2","updated":"2024-04-23T13:04:44Z","published":"2024-04-22T10:11:31Z","title":"RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key\n Identification","summary":" We revisit Tree-Ring Watermarking, a recent diffusion model watermarking\nmethod that demonstrates great robustness to various attacks. We conduct an\nin-depth study on it and reveal that the distribution shift unintentionally\nintroduced by the watermarking process, apart from watermark pattern matching,\ncontributes to its exceptional robustness. Our investigation further exposes\ninherent flaws in its original design, particularly in its ability to identify\nmultiple distinct keys, where distribution shift offers no assistance. Based on\nthese findings and analysis, we present RingID for enhanced multi-key\nidentification. It consists of a novel multi-channel heterogeneous watermarking\napproach designed to seamlessly amalgamate distinctive advantages from diverse\nwatermarks. Coupled with a series of suggested enhancements, RingID exhibits\nsubstantial advancements in multi-key identification. Github Page:\nhttps://github.com/showlab/RingID\n","authors":["Hai Ci","Pei Yang","Yiren Song","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.14055v2.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.13896v2","updated":"2024-04-23T13:02:37Z","published":"2024-04-22T06:07:06Z","title":"CT-NeRF: Incremental Optimizing Neural Radiance Field and Poses with\n Complex Trajectory","summary":" Neural radiance field (NeRF) has achieved impressive results in high-quality\n3D scene reconstruction. However, NeRF heavily relies on precise camera poses.\nWhile recent works like BARF have introduced camera pose optimization within\nNeRF, their applicability is limited to simple trajectory scenes. Existing\nmethods struggle while tackling complex trajectories involving large rotations.\nTo address this limitation, we propose CT-NeRF, an incremental reconstruction\noptimization pipeline using only RGB images without pose and depth input. In\nthis pipeline, we first propose a local-global bundle adjustment under a pose\ngraph connecting neighboring frames to enforce the consistency between poses to\nescape the local minima caused by only pose consistency with the scene\nstructure. Further, we instantiate the consistency between poses as a\nreprojected geometric image distance constraint resulting from pixel-level\ncorrespondences between input image pairs. Through the incremental\nreconstruction, CT-NeRF enables the recovery of both camera poses and scene\nstructure and is capable of handling scenes with complex trajectories. We\nevaluate the performance of CT-NeRF on two real-world datasets, NeRFBuster and\nFree-Dataset, which feature complex trajectories. Results show CT-NeRF\noutperforms existing methods in novel view synthesis and pose estimation\naccuracy.\n","authors":["Yunlong Ran","Yanxu Li","Qi Ye","Yuchi Huo","Zechun Bai","Jiahao Sun","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14996v1","updated":"2024-04-23T12:57:35Z","published":"2024-04-23T12:57:35Z","title":"CA-Stream: Attention-based pooling for interpretable image recognition","summary":" Explanations obtained from transformer-based architectures in the form of raw\nattention, can be seen as a class-agnostic saliency map. 
Additionally,\nattention-based pooling serves as a form of masking the in feature space.\nMotivated by this observation, we design an attention-based pooling mechanism\nintended to replace Global Average Pooling (GAP) at inference. This mechanism,\ncalled Cross-Attention Stream (CA-Stream), comprises a stream of cross\nattention blocks interacting with features at different network depths.\nCA-Stream enhances interpretability in models, while preserving recognition\nperformance.\n","authors":["Felipe Torres","Hanwei Zhang","Ronan Sicre","Stéphane Ayache","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2404.14996v1.pdf","comment":"CVPR XAI4CV workshop 2024"},{"id":"http://arxiv.org/abs/2401.03907v4","updated":"2024-04-23T12:48:23Z","published":"2024-01-08T14:10:24Z","title":"RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM","summary":" Multi-modal 3D object detectors are dedicated to exploring secure and\nreliable perception systems for autonomous driving (AD).Although achieving\nstate-of-the-art (SOTA) performance on clean benchmark datasets, they tend to\noverlook the complexity and harsh conditions of real-world environments. With\nthe emergence of visual foundation models (VFMs), opportunities and challenges\nare presented for improving the robustness and generalization of multi-modal 3D\nobject detection in AD. Therefore, we propose RoboFusion, a robust framework\nthat leverages VFMs like SAM to tackle out-of-distribution (OOD) noise\nscenarios. We first adapt the original SAM for AD scenarios named SAM-AD. To\nalign SAM or SAM-AD with multi-modal methods, we then introduce AD-FPN for\nupsampling the image features extracted by SAM. We employ wavelet decomposition\nto denoise the depth-guided images for further noise reduction and weather\ninterference. At last, we employ self-attention mechanisms to adaptively\nreweight the fused features, enhancing informative features while suppressing\nexcess noise. In summary, RoboFusion significantly reduces noise by leveraging\nthe generalization and robustness of VFMs, thereby enhancing the resilience of\nmulti-modal 3D object detection. Consequently, RoboFusion achieves SOTA\nperformance in noisy scenarios, as demonstrated by the KITTI-C and nuScenes-C\nbenchmarks. Code is available at https://github.com/adept-thu/RoboFusion.\n","authors":["Ziying Song","Guoxing Zhang","Lin Liu","Lei Yang","Shaoqing Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03907v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08671v2","updated":"2024-04-23T12:46:34Z","published":"2024-02-13T18:53:13Z","title":"Are Semi-Dense Detector-Free Methods Good at Matching Local Features?","summary":" Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among\nthe most popular image matching methods. While SDF methods are trained to\nestablish correspondences between two images, their performances are almost\nexclusively evaluated using relative pose estimation metrics. Thus, the link\nbetween their ability to establish correspondences and the quality of the\nresulting estimated pose has thus far received little attention. This paper is\na first attempt to study this link. We start with proposing a novel structured\nattention-based image matching architecture (SAM). 
It allows us to show a\ncounter-intuitive result on two datasets (MegaDepth and HPatches): on the one\nhand SAM either outperforms or is on par with SDF methods in terms of\npose/homography estimation metrics, but on the other hand SDF approaches are\nsignificantly better than SAM in terms of matching accuracy. We then propose to\nlimit the computation of the matching accuracy to textured regions, and show\nthat in this case SAM often surpasses SDF methods. Our findings highlight a\nstrong correlation between the ability to establish accurate correspondences in\ntextured regions and the accuracy of the resulting estimated pose/homography.\nOur code will be made available.\n","authors":["Matthieu Vilain","Rémi Giraud","Hugo Germain","Guillaume Bourmaud"],"pdf_url":"https://arxiv.org/pdf/2402.08671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14985v1","updated":"2024-04-23T12:42:07Z","published":"2024-04-23T12:42:07Z","title":"Other Tokens Matter: Exploring Global and Local Features of Vision\n Transformers for Object Re-Identification","summary":" Object Re-Identification (Re-ID) aims to identify and retrieve specific\nobjects from images captured at different places and times. Recently, object\nRe-ID has achieved great success with the advances of Vision Transformers\n(ViT). However, the effects of the global-local relation have not been fully\nexplored in Transformers for object Re-ID. In this work, we first explore the\ninfluence of global and local features of ViT and then further propose a novel\nGlobal-Local Transformer (GLTrans) for high-performance object Re-ID. We find\nthat the features from last few layers of ViT already have a strong\nrepresentational ability, and the global and local information can mutually\nenhance each other. Based on this fact, we propose a Global Aggregation Encoder\n(GAE) to utilize the class tokens of the last few Transformer layers and learn\ncomprehensive global features effectively. Meanwhile, we propose the Local\nMulti-layer Fusion (LMF) which leverages both the global cues from GAE and\nmulti-layer patch tokens to explore the discriminative local representations.\nExtensive experiments demonstrate that our proposed method achieves superior\nperformance on four object Re-ID benchmarks.\n","authors":["Yingquan Wang","Pingping Zhang","Dong Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.14985v1.pdf","comment":"Accepted by CVIU2024. More modifications may be performed"},{"id":"http://arxiv.org/abs/2404.14979v1","updated":"2024-04-23T12:36:24Z","published":"2024-04-23T12:36:24Z","title":"SGFormer: Spherical Geometry Transformer for 360 Depth Estimation","summary":" Panoramic distortion poses a significant challenge in 360 depth estimation,\nparticularly pronounced at the north and south poles. Existing methods either\nadopt a bi-projection fusion strategy to remove distortions or model long-range\ndependencies to capture global structures, which can result in either unclear\nstructure or insufficient local perception. In this paper, we propose a\nspherical geometry transformer, named SGFormer, to address the above issues,\nwith an innovative step to integrate spherical geometric priors into vision\ntransformers. To this end, we retarget the transformer decoder to a spherical\nprior decoder (termed SPDecoder), which endeavors to uphold the integrity of\nspherical structures during decoding. 
Concretely, we leverage bipolar\nre-projection, circular rotation, and curve local embedding to preserve the\nspherical characteristics of equidistortion, continuity, and surface distance,\nrespectively. Furthermore, we present a query-based global conditional position\nembedding to compensate for spatial structure at varying resolutions. It not\nonly boosts the global perception of spatial position but also sharpens the\ndepth structure across different patches. Finally, we conduct extensive\nexperiments on popular benchmarks, demonstrating our superiority over\nstate-of-the-art solutions.\n","authors":["Junsong Zhang","Zisong Chen","Chunyu Lin","Lang Nie","Zhijie Shen","Junda Huang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.14979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14975v1","updated":"2024-04-23T12:30:17Z","published":"2024-04-23T12:30:17Z","title":"CAGE: Circumplex Affect Guided Expression Inference","summary":" Understanding emotions and expressions is a task of interest across multiple\ndisciplines, especially for improving user experiences. Contrary to the common\nperception, it has been shown that emotions are not discrete entities but\ninstead exist along a continuum. People understand discrete emotions\ndifferently due to a variety of factors, including cultural background,\nindividual experiences, and cognitive biases. Therefore, most approaches to\nexpression understanding, particularly those relying on discrete categories,\nare inherently biased. In this paper, we present a comparative in-depth\nanalysis of two common datasets (AffectNet and EMOTIC) equipped with the\ncomponents of the circumplex model of affect. Further, we propose a model for\nthe prediction of facial expressions tailored for lightweight applications.\nUsing a small-scaled MaxViT-based model architecture, we evaluate the impact of\ndiscrete expression category labels in training with the continuous valence and\narousal labels. We show that considering valence and arousal in addition to\ndiscrete category labels helps to significantly improve expression inference.\nThe proposed model outperforms the current state-of-the-art models on\nAffectNet, establishing it as the best-performing model for inferring valence\nand arousal achieving a 7% lower RMSE. Training scripts and trained weights to\nreproduce our results can be found here:\nhttps://github.com/wagner-niklas/CAGE_expression_inference.\n","authors":["Niklas Wagner","Felix Mätzler","Samed R. Vossberg","Helen Schneider","Svetlana Pavlitska","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2404.14975v1.pdf","comment":"Accepted for publication at ABAW Workshop at CVPR2024"},{"id":"http://arxiv.org/abs/2311.12631v3","updated":"2024-04-23T12:24:58Z","published":"2023-11-21T14:24:37Z","title":"GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via\n Blender-Oriented GPT Planning","summary":" Recent advances in text-to-video generation have harnessed the power of\ndiffusion models to create visually compelling content conditioned on text\nprompts. However, they usually encounter high computational costs and often\nstruggle to produce videos with coherent physical motions. 
To tackle these\nissues, we propose GPT4Motion, a training-free framework that leverages the\nplanning capability of large language models such as GPT, the physical\nsimulation strength of Blender, and the excellent image generation ability of\ntext-to-image diffusion models to enhance the quality of video synthesis.\nSpecifically, GPT4Motion employs GPT-4 to generate a Blender script based on a\nuser textual prompt, which commands Blender's built-in physics engine to craft\nfundamental scene components that encapsulate coherent physical motions across\nframes. Then these components are inputted into Stable Diffusion to generate a\nvideo aligned with the textual prompt. Experimental results on three basic\nphysical motion scenarios, including rigid object drop and collision, cloth\ndraping and swinging, and liquid flow, demonstrate that GPT4Motion can generate\nhigh-quality videos efficiently in maintaining motion coherency and entity\nconsistency. GPT4Motion offers new insights in text-to-video research,\nenhancing its quality and broadening its horizon for further explorations.\n","authors":["Jiaxi Lv","Yi Huang","Mingfu Yan","Jiancheng Huang","Jianzhuang Liu","Yifan Liu","Yafei Wen","Xiaoxin Chen","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12631v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14968v1","updated":"2024-04-23T12:23:42Z","published":"2024-04-23T12:23:42Z","title":"CenterArt: Joint Shape Reconstruction and 6-DoF Grasp Estimation of\n Articulated Objects","summary":" Precisely grasping and reconstructing articulated objects is key to enabling\ngeneral robotic manipulation. In this paper, we propose CenterArt, a novel\napproach for simultaneous 3D shape reconstruction and 6-DoF grasp estimation of\narticulated objects. CenterArt takes RGB-D images of the scene as input and\nfirst predicts the shape and joint codes through an encoder. The decoder then\nleverages these codes to reconstruct 3D shapes and estimate 6-DoF grasp poses\nof the objects. We further develop a mechanism for generating a dataset of\n6-DoF grasp ground truth poses for articulated objects. CenterArt is trained on\nrealistic scenes containing multiple articulated objects with randomized\ndesigns, textures, lighting conditions, and realistic depths. We perform\nextensive experiments demonstrating that CenterArt outperforms existing methods\nin accuracy and robustness.\n","authors":["Sassan Mokhtar","Eugenio Chisari","Nick Heppert","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2404.14968v1.pdf","comment":"4 pages, 2 figures, accepted to the ICRA 2024 Workshop on 3D Visual\n Representations for Robot Manipulation"},{"id":"http://arxiv.org/abs/2404.14967v1","updated":"2024-04-23T12:22:32Z","published":"2024-04-23T12:22:32Z","title":"CoARF: Controllable 3D Artistic Style Transfer for Radiance Fields","summary":" Creating artistic 3D scenes can be time-consuming and requires specialized\nknowledge. To address this, recent works such as ARF, use a radiance\nfield-based approach with style constraints to generate 3D scenes that resemble\na style image provided by the user. However, these methods lack fine-grained\ncontrol over the resulting scenes. In this paper, we introduce Controllable\nArtistic Radiance Fields (CoARF), a novel algorithm for controllable 3D scene\nstylization. CoARF enables style transfer for specified objects, compositional\n3D style transfer and semantic-aware style transfer. 
We achieve controllability\nusing segmentation masks with different label-dependent loss functions. We also\npropose a semantic-aware nearest neighbor matching algorithm to improve the\nstyle transfer quality. Our extensive experiments demonstrate that CoARF\nprovides user-specified controllability of style transfer and superior style\ntransfer quality with more precise feature matching.\n","authors":["Deheng Zhang","Clara Fernandez-Labrador","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2404.14967v1.pdf","comment":"International Conference on 3D Vision 2024"},{"id":"http://arxiv.org/abs/2404.14966v1","updated":"2024-04-23T12:20:27Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformer in multiple areas with only linear\ncomplexity. However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.14956v1","updated":"2024-04-23T12:01:21Z","published":"2024-04-23T12:01:21Z","title":"DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via\n Cross-Task Interactions","summary":" Weakly supervised segmentation methods have gained significant attention due\nto their ability to reduce the reliance on costly pixel-level annotations\nduring model training. However, the current weakly supervised nuclei\nsegmentation approaches typically follow a two-stage pseudo-label generation\nand network training process. The performance of the nuclei segmentation\nheavily relies on the quality of the generated pseudo-labels, thereby limiting\nits effectiveness. This paper introduces a novel domain-adaptive weakly\nsupervised nuclei segmentation framework using cross-task interaction\nstrategies to overcome the challenge of pseudo-label generation. Specifically,\nwe utilize weakly annotated data to train an auxiliary detection task, which\nassists the domain adaptation of the segmentation network. To enhance the\nefficiency of domain adaptation, we design a consistent feature constraint\nmodule integrating prior knowledge from the source domain. 
Furthermore, we\ndevelop pseudo-label optimization and interactive training methods to improve\nthe domain transfer capability. To validate the effectiveness of our proposed\nmethod, we conduct extensive comparative and ablation experiments on six\ndatasets. The results demonstrate the superiority of our approach over existing\nweakly supervised approaches. Remarkably, our method achieves comparable or\neven better performance than fully supervised methods. Our code will be\nreleased in https://github.com/zhangye-zoe/DAWN.\n","authors":["Ye Zhang","Yifeng Wang","Zijie Fang","Hao Bian","Linghan Cai","Ziyue Wang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14956v1.pdf","comment":"13 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.14955v1","updated":"2024-04-23T12:00:20Z","published":"2024-04-23T12:00:20Z","title":"Traditional to Transformers: A Survey on Current Trends and Future\n Prospects for Hyperspectral Image Classification","summary":" Hyperspectral image classification is a challenging task due to the high\ndimensionality and complex nature of hyperspectral data. In recent years, deep\nlearning techniques have emerged as powerful tools for addressing these\nchallenges. This survey provides a comprehensive overview of the current trends\nand future prospects in hyperspectral image classification, focusing on the\nadvancements from deep learning models to the emerging use of transformers. We\nreview the key concepts, methodologies, and state-of-the-art approaches in deep\nlearning for hyperspectral image classification. Additionally, we discuss the\npotential of transformer-based models in this field and highlight the\nadvantages and challenges associated with these approaches. Comprehensive\nexperimental results have been undertaken using three Hyperspectral datasets to\nverify the efficacy of various conventional deep-learning models and\nTransformers. Finally, we outline future research directions and potential\napplications that can further enhance the accuracy and efficiency of\nhyperspectral image classification.\n The Source code is available at\nhttps://github.com/mahmad00/Conventional-to-Transformer-for-Hyperspectral-Image-Classification-Survey-2024.\n","authors":["Muhammad Ahmad","Salvatore Distifano","Manuel Mazzara","Adil Mehmood Khan"],"pdf_url":"https://arxiv.org/pdf/2404.14955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14952v1","updated":"2024-04-23T11:54:05Z","published":"2024-04-23T11:54:05Z","title":"Leveraging Speech for Gesture Detection in Multimodal Communication","summary":" Gestures are inherent to human interaction and often complement speech in\nface-to-face communication, forming a multimodal communication system. An\nimportant task in gesture analysis is detecting a gesture's beginning and end.\nResearch on automatic gesture detection has primarily focused on visual and\nkinematic information to detect a limited set of isolated or silent gestures\nwith low variability, neglecting the integration of speech and vision signals\nto detect gestures that co-occur with speech. This work addresses this gap by\nfocusing on co-speech gesture detection, emphasising the synchrony between\nspeech and co-speech hand gestures. We address three main challenges: the\nvariability of gesture forms, the temporal misalignment between gesture and\nspeech onsets, and differences in sampling rate between modalities. 
We\ninvestigate extended speech time windows and employ separate backbone models\nfor each modality to address the temporal misalignment and sampling rate\ndifferences. We utilize Transformer encoders in cross-modal and early fusion\ntechniques to effectively align and integrate speech and skeletal sequences.\nThe study results show that combining visual and speech information\nsignificantly enhances gesture detection performance. Our findings indicate\nthat expanding the speech buffer beyond visual time segments improves\nperformance and that multimodal integration using cross-modal and early fusion\ntechniques outperforms baseline methods using unimodal and late fusion methods.\nAdditionally, we find a correlation between the models' gesture prediction\nconfidence and low-level speech frequency features potentially associated with\ngestures. Overall, the study provides a better understanding and detection\nmethods for co-speech gestures, facilitating the analysis of multimodal\ncommunication.\n","authors":["Esam Ghaleb","Ilya Burenko","Marlou Rasenberg","Wim Pouw","Ivan Toni","Peter Uhrig","Anna Wilson","Judith Holler","Aslı Özyürek","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2404.14952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14951v1","updated":"2024-04-23T11:53:51Z","published":"2024-04-23T11:53:51Z","title":"Streamlining the Image Stitching Pipeline: Integrating Fusion and\n Rectangling into a Unified Model","summary":" Learning-based image stitching techniques typically involve three distinct\nstages: registration, fusion, and rectangling. These stages are often performed\nsequentially, each trained independently, leading to potential cascading error\npropagation and complex parameter tuning challenges. In rethinking the\nmathematical modeling of the fusion and rectangling stages, we discovered that\nthese processes can be effectively combined into a single, variety-intensity\ninpainting problem. Therefore, we propose the Simple and Robust Stitcher\n(SRStitcher), an efficient training-free image stitching method that merges the\nfusion and rectangling stages into a unified model. By employing the weighted\nmask and large-scale generative model, SRStitcher can solve the fusion and\nrectangling problems in a single inference, without additional training or\nfine-tuning of other models. Our method not only simplifies the stitching\npipeline but also enhances fault tolerance towards misregistration errors.\nExtensive experiments demonstrate that SRStitcher outperforms state-of-the-art\n(SOTA) methods in both quantitative assessments and qualitative evaluations.\nThe code is released at https://github.com/yayoyo66/SRStitcher\n","authors":["Ziqi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.14951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14949v1","updated":"2024-04-23T11:45:32Z","published":"2024-04-23T11:45:32Z","title":"Multi-Modal Prompt Learning on Blind Image Quality Assessment","summary":" Image Quality Assessment (IQA) models benefit significantly from semantic\ninformation, which allows them to treat different types of objects distinctly.\nCurrently, leveraging semantic information to enhance IQA is a crucial research\ndirection. Traditional methods, hindered by a lack of sufficiently annotated\ndata, have employed the CLIP image-text pretraining model as their backbone to\ngain semantic awareness. However, the generalist nature of these pre-trained\nVision-Language (VL) models often renders them suboptimal for IQA-specific\ntasks. 
Recent approaches have attempted to address this mismatch using prompt\ntechnology, but these solutions have shortcomings. Existing prompt-based VL\nmodels overly focus on incremental semantic information from text, neglecting\nthe rich insights available from visual data analysis. This imbalance limits\ntheir performance improvements in IQA tasks. This paper introduces an\ninnovative multi-modal prompt-based methodology for IQA. Our approach employs\ncarefully crafted prompts that synergistically mine incremental semantic\ninformation from both visual and linguistic data. Specifically, in the visual\nbranch, we introduce a multi-layer prompt structure to enhance the VL model's\nadaptability. In the text branch, we deploy a dual-prompt scheme that steers\nthe model to recognize and differentiate between scene category and distortion\ntype, thereby refining the model's capacity to assess image quality. Our\nexperimental findings underscore the effectiveness of our method over existing\nBlind Image Quality Assessment (BIQA) approaches. Notably, it demonstrates\ncompetitive performance across various datasets. Our method achieves Spearman\nRank Correlation Coefficient (SRCC) values of 0.961(surpassing 0.946 in CSIQ)\nand 0.941 (exceeding 0.930 in KADID), illustrating its robustness and accuracy\nin diverse contexts.\n","authors":["Wensheng Pan","Timin Gao","Yan Zhang","Runze Hu","Xiawu Zheng","Enwei Zhang","Yuting Gao","Yutao Liu","Yunhang Shen","Ke Li","Shengchuan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.14949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09158v2","updated":"2024-04-23T11:45:29Z","published":"2024-04-14T06:19:46Z","title":"StreakNet-Arch: An Anti-scattering Network-based Architecture for\n Underwater Carrier LiDAR-Radar Imaging","summary":" In this paper, we introduce StreakNet-Arch, a novel signal processing\narchitecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging\nsystems, to address the limitations in scatter suppression and real-time\nimaging. StreakNet-Arch formulates the signal processing as a real-time,\nend-to-end binary classification task, enabling real-time image acquisition. To\nachieve this, we leverage Self-Attention networks and propose a novel Double\nBranch Cross Attention (DBC-Attention) mechanism that surpasses the performance\nof traditional methods. Furthermore, we present a method for embedding\nstreak-tube camera images into attention networks, effectively acting as a\nlearned bandpass filter. To facilitate further research, we contribute a\npublicly available streak-tube camera image dataset. The dataset contains\n2,695,168 real-world underwater 3D point cloud data. These advancements\nsignificantly improve UCLR capabilities, enhancing its performance and\napplicability in underwater imaging tasks. The source code and dataset can be\nfound at https://github.com/BestAnHongjun/StreakNet .\n","authors":["Xuelong Li","Hongjun An","Guangying Li","Xing Wang","Guanghua Cheng","Zhe Sun"],"pdf_url":"https://arxiv.org/pdf/2404.09158v2.pdf","comment":"Reduce the number of pages to 13"},{"id":"http://arxiv.org/abs/2404.14945v1","updated":"2024-04-23T11:41:19Z","published":"2024-04-23T11:41:19Z","title":"Pyramid Hierarchical Transformer for Hyperspectral Image Classification","summary":" The traditional Transformer model encounters challenges with variable-length\ninput sequences, particularly in Hyperspectral Image Classification (HSIC),\nleading to efficiency and scalability concerns. 
To overcome this, we propose a\npyramid-based hierarchical transformer (PyFormer). This innovative approach\norganizes input data hierarchically into segments, each representing distinct\nabstraction levels, thereby enhancing processing efficiency for lengthy\nsequences. At each level, a dedicated transformer module is applied,\neffectively capturing both local and global context. Spatial and spectral\ninformation flow within the hierarchy facilitates communication and abstraction\npropagation. Integration of outputs from different levels culminates in the\nfinal input representation. Experimental results underscore the superiority of\nthe proposed method over traditional approaches. Additionally, the\nincorporation of disjoint samples augments robustness and reliability, thereby\nhighlighting the potential of our approach in advancing HSIC.\n The source code is available at https://github.com/mahmad00/PyFormer.\n","authors":["Muhammad Ahmad","Muhammad Hassaan Farooq Butt","Manuel Mazzara","Salvatore Distifano"],"pdf_url":"https://arxiv.org/pdf/2404.14945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14944v1","updated":"2024-04-23T11:40:52Z","published":"2024-04-23T11:40:52Z","title":"Importance of Disjoint Sampling in Conventional and Transformer Models\n for Hyperspectral Image Classification","summary":" Disjoint sampling is critical for rigorous and unbiased evaluation of\nstate-of-the-art (SOTA) models. When training, validation, and test sets\noverlap or share data, it introduces a bias that inflates performance metrics\nand prevents accurate assessment of a model's true ability to generalize to new\nexamples. This paper presents an innovative disjoint sampling approach for\ntraining SOTA models on Hyperspectral image classification (HSIC) tasks. By\nseparating training, validation, and test data without overlap, the proposed\nmethod facilitates a fairer evaluation of how well a model can classify pixels\nit was not exposed to during training or validation. Experiments demonstrate\nthe approach significantly improves a model's generalization compared to\nalternatives that include training and validation data in test data. By\neliminating data leakage between sets, disjoint sampling provides reliable\nmetrics for benchmarking progress in HSIC. Researchers can have confidence that\nreported performance truly reflects a model's capabilities for classifying new\nscenes, not just memorized pixels. This rigorous methodology is critical for\nadvancing SOTA models and their real-world application to large-scale land\nmapping with Hyperspectral sensors.\n The source code is available at\nhttps://github.com/mahmad00/Disjoint-Sampling-for-Hyperspectral-Image-Classification.\n","authors":["Muhammad Ahmad","Manuel Mazzara","Salvatore Distifano"],"pdf_url":"https://arxiv.org/pdf/2404.14944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14934v1","updated":"2024-04-23T11:22:59Z","published":"2024-04-23T11:22:59Z","title":"G3R: Generating Rich and Fine-grained mmWave Radar Data from 2D Videos\n for Generalized Gesture Recognition","summary":" Millimeter wave radar is gaining traction recently as a promising modality\nfor enabling pervasive and privacy-preserving gesture recognition. However, the\nlack of rich and fine-grained radar datasets hinders progress in developing\ngeneralized deep learning models for gesture recognition across various user\npostures (e.g., standing, sitting), positions, and scenes. 
To remedy this, we\nresort to designing a software pipeline that exploits wealthy 2D videos to\ngenerate realistic radar data, but it needs to address the challenge of\nsimulating diversified and fine-grained reflection properties of user gestures.\nTo this end, we design G3R with three key components: (i) a gesture reflection\npoint generator expands the arm's skeleton points to form human reflection\npoints; (ii) a signal simulation model simulates the multipath reflection and\nattenuation of radar signals to output the human intensity map; (iii) an\nencoder-decoder model combines a sampling module and a fitting module to\naddress the differences in number and distribution of points between generated\nand real-world radar data for generating realistic radar data. We implement and\nevaluate G3R using 2D videos from public data sources and self-collected\nreal-world radar data, demonstrating its superiority over other\nstate-of-the-art approaches for gesture recognition.\n","authors":["Kaikai Deng","Dong Zhao","Wenxin Zheng","Yue Ling","Kangwen Yin","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.14934v1.pdf","comment":"18 pages, 29 figures"},{"id":"http://arxiv.org/abs/2404.14908v1","updated":"2024-04-23T10:51:15Z","published":"2024-04-23T10:51:15Z","title":"Mining Supervision for Dynamic Regions in Self-Supervised Monocular\n Depth Estimation","summary":" This paper focuses on self-supervised monocular depth estimation in dynamic\nscenes trained on monocular videos. Existing methods jointly estimate\npixel-wise depth and motion, relying mainly on an image reconstruction loss.\nDynamic regions remain a critical challenge for these methods due to the\ninherent ambiguity in depth and motion estimation, resulting in inaccurate\ndepth estimation. This paper proposes a self-supervised training framework\nexploiting pseudo depth labels for dynamic regions from training data. The key\ncontribution of our framework is to decouple depth estimation for static and\ndynamic regions of images in the training data. We start with an unsupervised\ndepth estimation approach, which provides reliable depth estimates for static\nregions and motion cues for dynamic regions and allows us to extract moving\nobject information at the instance level. In the next stage, we use an object\nnetwork to estimate the depth of those moving objects assuming rigid motions.\nThen, we propose a new scale alignment module to address the scale ambiguity\nbetween estimated depths for static and dynamic regions. We can then use the\ndepth labels generated to train an end-to-end depth estimation network and\nimprove its performance. Extensive experiments on the Cityscapes and KITTI\ndatasets show that our self-training strategy consistently outperforms existing\nself/unsupervised depth estimation methods.\n","authors":["Hoang Chuong Nguyen","Tianyu Wang","Jose M. Alvarez","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.14908v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2401.16386v2","updated":"2024-04-23T10:44:01Z","published":"2024-01-29T18:27:52Z","title":"Continual Learning with Pre-Trained Models: A Survey","summary":" Nowadays, real-world applications often face streaming data, which requires\nthe learning system to absorb new knowledge as data evolves. Continual Learning\n(CL) aims to achieve this goal and meanwhile overcome the catastrophic\nforgetting of former knowledge when learning new ones. Typical CL methods build\nthe model from scratch to grow with incoming data. 
However, the advent of the\npre-trained model (PTM) era has sparked immense research interest, particularly\nin leveraging PTMs' robust representational capabilities. This paper presents a\ncomprehensive survey of the latest advancements in PTM-based CL. We categorize\nexisting methodologies into three distinct groups, providing a comparative\nanalysis of their similarities, differences, and respective advantages and\ndisadvantages. Additionally, we offer an empirical study contrasting various\nstate-of-the-art methods to highlight concerns regarding fairness in\ncomparisons. The source code to reproduce these evaluations is available at:\nhttps://github.com/sun-hailong/LAMDA-PILOT\n","authors":["Da-Wei Zhou","Hai-Long Sun","Jingyi Ning","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2401.16386v2.pdf","comment":"Accepted to IJCAI 2024. Code is available at:\n https://github.com/sun-hailong/LAMDA-PILOT"},{"id":"http://arxiv.org/abs/2404.14906v1","updated":"2024-04-23T10:42:24Z","published":"2024-04-23T10:42:24Z","title":"Driver Activity Classification Using Generalizable Representations from\n Vision-Language Models","summary":" Driver activity classification is crucial for ensuring road safety, with\napplications ranging from driver assistance systems to autonomous vehicle\ncontrol transitions. In this paper, we present a novel approach leveraging\ngeneralizable representations from vision-language models for driver activity\nclassification. Our method employs a Semantic Representation Late Fusion Neural\nNetwork (SRLF-Net) to process synchronized video frames from multiple\nperspectives. Each frame is encoded using a pretrained vision-language encoder,\nand the resulting embeddings are fused to generate class probability\npredictions. By leveraging contrastively-learned vision-language\nrepresentations, our approach achieves robust performance across diverse driver\nactivities. We evaluate our method on the Naturalistic Driving Action\nRecognition Dataset, demonstrating strong accuracy across many classes. Our\nresults suggest that vision-language representations offer a promising avenue\nfor driver monitoring systems, providing both accuracy and interpretability\nthrough natural language descriptors.\n","authors":["Ross Greer","Mathias Viborg Andersen","Andreas Møgelmose","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2404.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14890v1","updated":"2024-04-23T10:17:42Z","published":"2024-04-23T10:17:42Z","title":"DENOISER: Rethinking the Robustness for Open-Vocabulary Action\n Recognition","summary":" As one of the fundamental video tasks in computer vision, Open-Vocabulary\nAction Recognition (OVAR) recently gains increasing attention, with the\ndevelopment of vision-language pre-trainings. To enable generalization of\narbitrary classes, existing methods treat class labels as text descriptions,\nthen formulate OVAR as evaluating embedding similarity between visual samples\nand textual classes. However, one crucial issue is completely ignored: the\nclass descriptions given by users may be noisy, e.g., misspellings and typos,\nlimiting the real-world practicality of vanilla OVAR. To fill the research gap,\nthis paper pioneers to evaluate existing methods by simulating multi-level\nnoises of various types, and reveals their poor robustness. To tackle the noisy\nOVAR task, we further propose one novel DENOISER framework, covering two parts:\ngeneration and discrimination. 
Concretely, the generative part denoises noisy\nclass-text names via one decoding process, i.e., propose text candidates, then\nutilize inter-modal and intra-modal information to vote for the best. At the\ndiscriminative part, we use vanilla OVAR models to assign visual samples to\nclass-text names, thus obtaining more semantics. For optimization, we\nalternately iterate between generative and discriminative parts for progressive\nrefinements. The denoised text classes help OVAR models classify visual samples\nmore accurately; in return, classified visual samples help better denoising. On\nthree datasets, we carry out extensive experiments to show our superior\nrobustness, and thorough ablations to dissect the effectiveness of each\ncomponent.\n","authors":["Haozhe Cheng","Cheng Ju","Haicheng Wang","Jinxiang Liu","Mengting Chen","Qiang Hu","Xiaoyun Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14885v1","updated":"2024-04-23T10:13:31Z","published":"2024-04-23T10:13:31Z","title":"Domain adaptive pose estimation via multi-level alignment","summary":" Domain adaptive pose estimation aims to enable deep models trained on source\ndomain (synthesized) datasets to produce similar results on the target domain\n(real-world) datasets. The existing methods have made significant progress by\nconducting image-level or feature-level alignment. However, only aligning at a\nsingle level is not sufficient to fully bridge the domain gap and achieve\nexcellent domain adaptive results. In this paper, we propose a multi-level\ndomain adaptation approach, which aligns different domains at the image,\nfeature, and pose levels. Specifically, we first utilize image style transfer to\nensure that images from the source and target domains have a similar\ndistribution. Subsequently, at the feature level, we employ adversarial\ntraining to make the features from the source and target domains preserve\ndomain-invariant characteristics as much as possible. Finally, at the pose\nlevel, a self-supervised approach is utilized to enable the model to learn\ndiverse knowledge, implicitly addressing the domain gap. Experimental results\ndemonstrate that significant improvement can be achieved by the proposed\nmulti-level alignment method in pose estimation, which outperforms previous\nstate-of-the-art in human pose by up to 2.4% and animal pose estimation by up\nto 3.1% for dogs and 1.4% for sheep.\n","authors":["Yugan Chen","Lin Zhao","Yalong Xu","Honglei Zu","Xiaoqi An","Guangyu Li"],"pdf_url":"https://arxiv.org/pdf/2404.14885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14882v1","updated":"2024-04-23T10:09:32Z","published":"2024-04-23T10:09:32Z","title":"A sensitivity analysis to quantify the impact of neuroimaging\n preprocessing strategies on subsequent statistical analyses","summary":" Even though novel imaging techniques have been successful in studying brain\nstructure and function, the measured biological signals are often contaminated\nby multiple sources of noise, arising due to e.g. head movements of the\nindividual being scanned, limited spatial/temporal resolution, or other issues\nspecific to each imaging technology. Data preprocessing (e.g. denoising) is\ntherefore critical. Preprocessing pipelines have become increasingly complex\nover the years, but also more flexible, and this flexibility can have a\nsignificant impact on the final results and conclusions of a given study. 
This\nlarge parameter space is often referred to as multiverse analyses. Here, we\nprovide conceptual and practical tools for statistical analyses that can\naggregate multiple pipeline results along with a new sensitivity analysis\ntesting for hypotheses across pipelines such as \"no effect across all\npipelines\" or \"at least one pipeline with no effect\". The proposed framework is\ngeneric and can be applied to any multiverse scenario, but we illustrate its\nuse based on positron emission tomography data.\n","authors":["Brize Ozenne","Martin Norgaard","Cyril Pernet","Melanie Ganz"],"pdf_url":"https://arxiv.org/pdf/2404.14882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09965v3","updated":"2024-04-23T10:03:59Z","published":"2023-10-15T21:54:45Z","title":"ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context","summary":" Neural Radiance Fields (NeRFs) have recently emerged as a popular option for\nphoto-realistic object capture due to their ability to faithfully capture\nhigh-fidelity volumetric content even from handheld video input. Although much\nresearch has been devoted to efficient optimization leading to real-time\ntraining and rendering, options for interactive editing NeRFs remain limited.\nWe present a very simple but effective neural network architecture that is fast\nand efficient while maintaining a low memory footprint. This architecture can\nbe incrementally guided through user-friendly image-based edits. Our\nrepresentation allows straightforward object selection via semantic feature\ndistillation at the training stage. More importantly, we propose a local\n3D-aware image context to facilitate view-consistent image editing that can\nthen be distilled into fine-tuned NeRFs, via geometric and appearance\nadjustments. We evaluate our setup on a variety of examples to demonstrate\nappearance and geometric edits and report 10-30x speedup over concurrent work\nfocusing on text-guided NeRF editing. Video results can be seen on our project\nwebpage at https://proteusnerf.github.io.\n","authors":["Binglun Wang","Niladri Shekhar Dutt","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2310.09965v3.pdf","comment":"Accepted at I3D'24 (ACM SIGGRAPH SYMPOSIUM ON INTERACTIVE 3D GRAPHICS\n AND GAMES)"},{"id":"http://arxiv.org/abs/2403.10558v2","updated":"2024-04-23T09:59:42Z","published":"2024-03-14T02:17:57Z","title":"Adaptive Hybrid Masking Strategy for Privacy-Preserving Face Recognition\n Against Model Inversion Attack","summary":" The utilization of personal sensitive data in training face recognition (FR)\nmodels poses significant privacy concerns, as adversaries can employ model\ninversion attacks (MIA) to infer the original training data. Existing defense\nmethods, such as data augmentation and differential privacy, have been employed\nto mitigate this issue. However, these methods often fail to strike an optimal\nbalance between privacy and accuracy. To address this limitation, this paper\nintroduces an adaptive hybrid masking algorithm against MIA. Specifically, face\nimages are masked in the frequency domain using an adaptive MixUp strategy.\nUnlike the traditional MixUp algorithm, which is predominantly used for data\naugmentation, our modified approach incorporates frequency domain mixing.\nPrevious studies have shown that increasing the number of images mixed in MixUp\ncan enhance privacy preservation but at the expense of reduced face recognition\naccuracy. 
To overcome this trade-off, we develop an enhanced adaptive MixUp\nstrategy based on reinforcement learning, which enables us to mix a larger\nnumber of images while maintaining satisfactory recognition accuracy. To\noptimize privacy protection, we propose maximizing the reward function (i.e.,\nthe loss function of the FR system) during the training of the strategy\nnetwork. While the loss function of the FR network is minimized in the phase of\ntraining the FR network. The strategy network and the face recognition network\ncan be viewed as antagonistic entities in the training process, ultimately\nreaching a more balanced trade-off. Experimental results demonstrate that our\nproposed hybrid masking scheme outperforms existing defense algorithms in terms\nof privacy preservation and recognition accuracy against MIA.\n","authors":["Yinggui Wang","Yuanqing Huang","Jianshu Li","Le Yang","Kai Song","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09812v2","updated":"2024-04-23T09:53:42Z","published":"2024-02-15T09:21:16Z","title":"DreamMatcher: Appearance Matching Self-Attention for\n Semantically-Consistent Text-to-Image Personalization","summary":" The objective of text-to-image (T2I) personalization is to customize a\ndiffusion model to a user-provided reference concept, generating diverse images\nof the concept aligned with the target prompts. Conventional methods\nrepresenting the reference concepts using unique text embeddings often fail to\naccurately mimic the appearance of the reference. To address this, one solution\nmay be explicitly conditioning the reference images into the target denoising\nprocess, known as key-value replacement. However, prior works are constrained\nto local editing since they disrupt the structure path of the pre-trained T2I\nmodel. To overcome this, we propose a novel plug-in method, called\nDreamMatcher, which reformulates T2I personalization as semantic matching.\nSpecifically, DreamMatcher replaces the target values with reference values\naligned by semantic matching, while leaving the structure path unchanged to\npreserve the versatile capability of pre-trained T2I models for generating\ndiverse structures. We also introduce a semantic-consistent masking strategy to\nisolate the personalized concept from irrelevant regions introduced by the\ntarget prompts. Compatible with existing T2I models, DreamMatcher shows\nsignificant improvements in complex scenarios. Intensive analyses demonstrate\nthe effectiveness of our approach.\n","authors":["Jisu Nam","Heesu Kim","DongJae Lee","Siyoon Jin","Seungryong Kim","Seunggyu Chang"],"pdf_url":"https://arxiv.org/pdf/2402.09812v2.pdf","comment":"Project page is available at https://ku-cvlab.github.io/DreamMatcher/"},{"id":"http://arxiv.org/abs/2402.15300v2","updated":"2024-04-23T09:32:25Z","published":"2024-02-23T12:57:16Z","title":"Seeing is Believing: Mitigating Hallucination in Large Vision-Language\n Models via CLIP-Guided Decoding","summary":" Large Vision-Language Models (LVLMs) are susceptible to object\nhallucinations, an issue in which their generated text contains non-existent\nobjects, greatly limiting their reliability and practicality. Current\napproaches often rely on the model's token likelihoods or other internal\ninformation, instruction tuning on additional datasets, or incorporating\ncomplex external tools. 
We first perform empirical analysis on sentence-level\nLVLM hallucination, finding that CLIP similarity to the image acts as a\nstronger and more robust indicator of hallucination compared to token\nlikelihoods. Motivated by this, we introduce our CLIP-Guided Decoding (CGD)\napproach, a straightforward but effective training-free approach to reduce\nobject hallucination at decoding time. CGD uses CLIP to guide the model's\ndecoding process by enhancing visual grounding of generated text with the\nimage. Experiments demonstrate that CGD effectively mitigates object\nhallucination across multiple LVLM families while preserving the utility of\ntext generation. Codes are available at\nhttps://github.com/d-ailin/CLIP-Guided-Decoding.\n","authors":["Ailin Deng","Zhirui Chen","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.15300v2.pdf","comment":"Code URL: https://github.com/d-ailin/CLIP-Guided-Decoding"},{"id":"http://arxiv.org/abs/2401.01454v2","updated":"2024-04-23T09:08:11Z","published":"2024-01-02T22:35:33Z","title":"A Survey on Autonomous Driving Datasets: Statistics, Annotation Quality,\n and a Future Outlook","summary":" Autonomous driving has rapidly developed and shown promising performance due\nto recent advances in hardware and deep learning techniques. High-quality\ndatasets are fundamental for developing reliable autonomous driving algorithms.\nPrevious dataset surveys either focused on a limited number or lacked detailed\ninvestigation of dataset characteristics. To this end, we present an exhaustive\nstudy of 265 autonomous driving datasets from multiple perspectives, including\nsensor modalities, data size, tasks, and contextual conditions. We introduce a\nnovel metric to evaluate the impact of datasets, which can also be a guide for\ncreating new datasets. Besides, we analyze the annotation processes, existing\nlabeling tools, and the annotation quality of datasets, showing the importance\nof establishing a standard annotation pipeline. On the other hand, we\nthoroughly analyze the impact of geographical and adversarial environmental\nconditions on the performance of autonomous driving systems. Moreover, we\nexhibit the data distribution of several vital datasets and discuss their pros\nand cons accordingly. Finally, we discuss the current challenges and the\ndevelopment trend of the future autonomous driving datasets.\n","authors":["Mingyu Liu","Ekim Yurtsever","Jonathan Fossaert","Xingcheng Zhou","Walter Zimmer","Yuning Cui","Bare Luka Zagar","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2401.01454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14852v1","updated":"2024-04-23T09:07:04Z","published":"2024-04-23T09:07:04Z","title":"Ultrasound Nodule Segmentation Using Asymmetric Learning with Simple\n Clinical Annotation","summary":" Recent advances in deep learning have greatly facilitated the automated\nsegmentation of ultrasound images, which is essential for nodule morphological\nanalysis. Nevertheless, most existing methods depend on extensive and precise\nannotations by domain experts, which are labor-intensive and time-consuming. In\nthis study, we suggest using simple aspect ratio annotations directly from\nultrasound clinical diagnoses for automated nodule segmentation. 
Especially, an\nasymmetric learning framework is developed by extending the aspect ratio\nannotations with two types of pseudo labels, i.e., conservative labels and\nradical labels, to train two asymmetric segmentation networks simultaneously.\nSubsequently, a conservative-radical-balance strategy (CRBS) strategy is\nproposed to complementally combine radical and conservative labels. An\ninconsistency-aware dynamically mixed pseudo-labels supervision (IDMPS) module\nis introduced to address the challenges of over-segmentation and\nunder-segmentation caused by the two types of labels. To further leverage the\nspatial prior knowledge provided by clinical annotations, we also present a\nnovel loss function namely the clinical anatomy prior loss. Extensive\nexperiments on two clinically collected ultrasound datasets (thyroid and\nbreast) demonstrate the superior performance of our proposed method, which can\nachieve comparable and even better performance than fully supervised methods\nusing ground truth annotations.\n","authors":["Xingyue Zhao","Zhongyu Li","Xiangde Luo","Peiqi Li","Peng Huang","Jianwei Zhu","Yang Liu","Jihua Zhu","Meng Yang","Shi Chang","Jun Dong"],"pdf_url":"https://arxiv.org/pdf/2404.14852v1.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2312.04233v3","updated":"2024-04-23T08:59:25Z","published":"2023-12-07T11:39:11Z","title":"Fine-tuning vision foundation model for crack segmentation in civil\n infrastructures","summary":" Large-scale foundation models have become the mainstream deep learning\nmethod, while in civil engineering, the scale of AI models is strictly limited.\nIn this work, a vision foundation model is introduced for crack segmentation.\nTwo parameter-efficient fine-tuning methods, adapter and low-rank adaptation,\nare adopted to fine-tune the foundation model in semantic segmentation: the\nSegment Anything Model (SAM). The fine-tuned CrackSAM shows excellent\nperformance on different scenes and materials. To test the zero-shot\nperformance of the proposed method, two unique datasets related to road and\nexterior wall cracks are collected, annotated and open-sourced, for a total of\n810 images. Comparative experiments are conducted with twelve mature semantic\nsegmentation models. On datasets with artificial noise and previously unseen\ndatasets, the performance of CrackSAM far exceeds that of all state-of-the-art\nmodels. CrackSAM exhibits remarkable superiority, particularly under\nchallenging conditions such as dim lighting, shadows, road markings,\nconstruction joints, and other interference factors. These cross-scenario\nresults demonstrate the outstanding zero-shot capability of foundation models\nand provide new ideas for developing vision models in civil engineering.\n","authors":["Kang Ge","Chen Wang","Yutao Guo","Yansong Tang","Zhenzhong Hu","Hongbing Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04233v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14837v1","updated":"2024-04-23T08:43:32Z","published":"2024-04-23T08:43:32Z","title":"Ultrasound SAM Adapter: Adapting SAM for Breast Lesion Segmentation in\n Ultrasound Images","summary":" Segment Anything Model (SAM) has recently achieved amazing results in the\nfield of natural image segmentation. However, it is not effective for medical\nimage segmentation, owing to the large domain gap between natural and medical\nimages. In this paper, we mainly focus on ultrasound image segmentation. 
As we\nknow that it is very difficult to train a foundation model for ultrasound image\ndata due to the lack of large-scale annotated ultrasound image data. To address\nthese issues, in this paper, we develop a novel Breast Ultrasound SAM Adapter,\ntermed Breast Ultrasound Segment Anything Model (BUSSAM), which migrates the\nSAM to the field of breast ultrasound image segmentation by using the adapter\ntechnique. To be specific, we first design a novel CNN image encoder, which is\nfully trained on the BUS dataset. Our CNN image encoder is more lightweight,\nand focuses more on features of local receptive field, which provides the\ncomplementary information to the ViT branch in SAM. Then, we design a novel\nCross-Branch Adapter to allow the CNN image encoder to fully interact with the\nViT image encoder in SAM module. Finally, we add both of the Position Adapter\nand the Feature Adapter to the ViT branch to fine-tune the original SAM. The\nexperimental results on AMUBUS and BUSI datasets demonstrate that our proposed\nmodel outperforms other medical image segmentation models significantly. Our\ncode will be available at: https://github.com/bscs12/BUSSAM.\n","authors":["Zhengzheng Tu","Le Gu","Xixi Wang","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14835v1","updated":"2024-04-23T08:41:50Z","published":"2024-04-23T08:41:50Z","title":"Semi-supervised 2D Human Pose Estimation via Adaptive Keypoint Masking","summary":" Human pose estimation is a fundamental and challenging task in computer\nvision. Larger-scale and more accurate keypoint annotations, while helpful for\nimproving the accuracy of supervised pose estimation, are often expensive and\ndifficult to obtain. Semi-supervised pose estimation tries to leverage a large\namount of unlabeled data to improve model performance, which can alleviate the\nproblem of insufficient labeled samples. The latest semi-supervised learning\nusually adopts a strong and weak data augmented teacher-student learning\nframework to deal with the challenge of \"Human postural diversity and its\nlong-tailed distribution\". Appropriate data augmentation method is one of the\nkey factors affecting the accuracy and generalization of semi-supervised\nmodels. Aiming at the problem that the difference of sample learning is not\nconsidered in the fixed keypoint masking augmentation method, this paper\nproposes an adaptive keypoint masking method, which can fully mine the\ninformation in the samples and obtain better estimation performance. In order\nto further improve the generalization and robustness of the model, this paper\nproposes a dual-branch data augmentation scheme, which can perform Mixup on\nsamples and features on the basis of adaptive keypoint masking. 
The\neffectiveness of the proposed method is verified on COCO and MPII,\noutperforming the state-of-the-art semi-supervised pose estimation by 5.2% and\n0.3%, respectively.\n","authors":["Kexin Meng","Ruirui Li","Daguang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14835v1.pdf","comment":"China Multimedia 2023"},{"id":"http://arxiv.org/abs/2403.08216v2","updated":"2024-04-23T08:41:47Z","published":"2024-03-13T03:28:39Z","title":"PaddingFlow: Improving Normalizing Flows with Padding-Dimensional Noise","summary":" Normalizing flow is a generative modeling approach with efficient sampling.\nHowever, Flow-based models suffer two issues: 1) If the target distribution is\nmanifold, due to the mismatch between the dimensions of the latent target\ndistribution and the data distribution, flow-based models might perform badly.\n2) Discrete data might make flow-based models collapse into a degenerate\nmixture of point masses. To sidestep such two issues, we propose PaddingFlow, a\nnovel dequantization method, which improves normalizing flows with\npadding-dimensional noise. To implement PaddingFlow, only the dimension of\nnormalizing flows needs to be modified. Thus, our method is easy to implement\nand computationally cheap. Moreover, the padding-dimensional noise is only\nadded to the padding dimension, which means PaddingFlow can dequantize without\nchanging data distributions. Implementing existing dequantization methods needs\nto change data distributions, which might degrade performance. We validate our\nmethod on the main benchmarks of unconditional density estimation, including\nfive tabular datasets and four image datasets for Variational Autoencoder (VAE)\nmodels, and the Inverse Kinematics (IK) experiments which are conditional\ndensity estimation. The results show that PaddingFlow can perform better in all\nexperiments in this paper, which means PaddingFlow is widely suitable for\nvarious tasks. The code is available at:\nhttps://github.com/AdamQLMeng/PaddingFlow.\n","authors":["Qinglong Meng","Chongkun Xia","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.08216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05180v2","updated":"2024-04-23T08:36:00Z","published":"2023-01-12T18:04:51Z","title":"Effective Decision Boundary Learning for Class Incremental Learning","summary":" Rehearsal approaches in class incremental learning (CIL) suffer from decision\nboundary overfitting to new classes, which is mainly caused by two factors:\ninsufficiency of old classes data for knowledge distillation and imbalanced\ndata learning between the learned and new classes because of the limited\nstorage memory. In this work, we present a simple but effective approach to\ntackle these two factors. First, we employ a re-sampling strategy and Mixup\nKnowledge Distillation (Re-MKD) to improve the performances of KD, which\nwould greatly alleviate the overfitting problem. Specifically, we combine mixup\nand re-sampling strategies to synthesize adequate data used in KD training that\nare more consistent with the latent distribution between the learned and new\nclasses. Second, we propose a novel incremental influence balance (IIB) method\nfor CIL to tackle the classification of imbalanced data by extending the\ninfluence balance method into the CIL setting, which re-weights samples by\ntheir influences to create a proper decision boundary. 
With these two\nimprovements, we present the effective decision boundary learning algorithm\n(EDBL) which improves the performance of KD and deals with the imbalanced data\nlearning simultaneously. Experiments show that the proposed EDBL achieves\nstate-of-the-art performances on several CIL benchmarks.\n","authors":["Chaoyue Ding","Kunchi Li","Jun Wan","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2301.05180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14830v1","updated":"2024-04-23T08:32:38Z","published":"2024-04-23T08:32:38Z","title":"CoProNN: Concept-based Prototypical Nearest Neighbors for Explaining\n Vision Models","summary":" Mounting evidence in explainability for artificial intelligence (XAI)\nresearch suggests that good explanations should be tailored to individual tasks\nand should relate to concepts relevant to the task. However, building task\nspecific explanations is time consuming and requires domain expertise which can\nbe difficult to integrate into generic XAI methods. A promising approach\ntowards designing useful task specific explanations with domain experts is\nbased on compositionality of semantic concepts. Here, we present a novel\napproach that enables domain experts to quickly create concept-based\nexplanations for computer vision tasks intuitively via natural language.\nLeveraging recent progress in deep generative methods we propose to generate\nvisual concept-based prototypes via text-to-image methods. These prototypes are\nthen used to explain predictions of computer vision models via a simple\nk-Nearest-Neighbors routine. The modular design of CoProNN is simple to\nimplement, it is straightforward to adapt to novel tasks and allows for\nreplacing the classification and text-to-image models as more powerful models\nare released. The approach can be evaluated offline against the ground-truth of\npredefined prototypes that can be easily communicated also to domain experts as\nthey are based on visual concepts. We show that our strategy competes very well\nwith other concept-based XAI approaches on coarse grained image classification\ntasks and may even outperform those methods on more demanding fine grained\ntasks. We demonstrate the effectiveness of our method for human-machine\ncollaboration settings in qualitative and quantitative user studies. All code\nand experimental data can be found in our GitHub\n$\\href{https://github.com/TeodorChiaburu/beexplainable}{repository}$.\n","authors":["Teodor Chiaburu","Frank Haußer","Felix Bießmann"],"pdf_url":"https://arxiv.org/pdf/2404.14830v1.pdf","comment":"24 pages, 9 figures, 2 tables, accepted at WCXAI 2024 Valletta"},{"id":"http://arxiv.org/abs/2312.11035v3","updated":"2024-04-23T08:32:03Z","published":"2023-12-18T09:11:28Z","title":"Towards Effective Multi-Moving-Camera Tracking: A New Dataset and\n Lightweight Link Model","summary":" Ensuring driving safety for autonomous vehicles has become increasingly\ncrucial, highlighting the need for systematic tracking of on-road pedestrians.\nMost vehicles are equipped with visual sensors, however, the large-scale visual\ndata has not been well studied yet. Multi-target multi-camera (MTMC) tracking\nsystems are composed of two modules: single-camera tracking (SCT) and\ninter-camera tracking (ICT). To reliably coordinate between them, MTMC tracking\nhas been a very complicated task, while tracking across multiple moving cameras\nmakes it even more challenging. 
In this paper, we focus on multi-target\nmulti-moving-camera (MTMMC) tracking, which is attracting increasing attention\nfrom the research community. Observing there are few datasets for MTMMC\ntracking, we collect a new dataset, called Multi-Moving-Camera Track (MMCT),\nwhich contains sequences under various driving scenarios. To address the common\nproblems of identity switch easily faced by most existing SCT trackers,\nespecially for moving cameras due to ego-motion between the camera and targets,\na lightweight appearance-free global link model, called Linker, is proposed to\nmitigate the identity switch by associating two disjoint tracklets of the same\ntarget into a complete trajectory within the same camera. Incorporated with\nLinker, existing SCT trackers generally obtain a significant improvement.\nMoreover, to alleviate the impact of the image style variations caused by\ndifferent cameras, a color transfer module is effectively incorporated to\nextract cross-camera consistent appearance features for pedestrian association\nacross moving cameras for ICT, resulting in a much improved MTMMC tracking\nsystem, which can constitute a step further towards coordinated mining of\nmultiple moving cameras. The project page is available at\nhttps://dhu-mmct.github.io/.\n","authors":["Yanting Zhang","Shuanghong Wang","Qingxiang Wang","Cairong Yan","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2312.11035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14829v1","updated":"2024-04-23T08:31:55Z","published":"2024-04-23T08:31:55Z","title":"Revisiting Neural Networks for Continual Learning: An Architectural\n Perspective","summary":" Efforts to overcome catastrophic forgetting have primarily centered around\ndeveloping more effective Continual Learning (CL) methods. In contrast, less\nattention was devoted to analyzing the role of network architecture design\n(e.g., network depth, width, and components) in contributing to CL. This paper\nseeks to bridge this gap between network architecture design and CL, and to\npresent a holistic study on the impact of network architectures on CL. This\nwork considers architecture design at the network scaling level, i.e., width\nand depth, and also at the network components, i.e., skip connections, global\npooling layers, and down-sampling. In both cases, we first derive insights\nthrough systematically exploring how architectural designs affect CL. Then,\ngrounded in these insights, we craft a specialized search space for CL and\nfurther propose a simple yet effective ArchCraft method to steer a CL-friendly\narchitecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC.\nExperimental validation across various CL settings and scenarios demonstrates\nthat improved architectures are parameter-efficient, achieving state-of-the-art\nperformance of CL while being 86%, 61%, and 97% more compact in terms of\nparameters than the naive CL architecture in Class IL and Task IL. 
Code is\navailable at https://github.com/byyx666/ArchCraft.\n","authors":["Aojun Lu","Tao Feng","Hangjie Yuan","Xiaotian Song","Yanan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.14829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02772v2","updated":"2024-04-23T08:29:53Z","published":"2023-12-05T14:01:43Z","title":"FG-MDM: Towards Zero-Shot Human Motion Generation via Fine-Grained\n Descriptions","summary":" Recently, significant progress has been made in text-based motion generation,\nenabling the generation of diverse and high-quality human motions that conform\nto textual descriptions. However, generating motions beyond the distribution of\noriginal datasets remains challenging, i.e., zero-shot generation. By adopting\na divide-and-conquer strategy, we propose a new framework named Fine-Grained\nHuman Motion Diffusion Model (FG-MDM) for zero-shot human motion generation.\nSpecifically, we first parse previous vague textual annotations into\nfine-grained descriptions of different body parts by leveraging a large\nlanguage model. We then use these fine-grained descriptions to guide a\ntransformer-based diffusion model, which further adopts a design of part\ntokens. FG-MDM can generate human motions beyond the scope of original datasets\nowing to descriptions that are closer to motion essence. Our experimental\nresults demonstrate the superiority of FG-MDM over previous methods in\nzero-shot settings. We will release our fine-grained textual annotations for\nHumanML3D and KIT.\n","authors":["Xu Shi","Wei Yao","Chuanchen Luo","Junran Peng","Hongwen Zhang","Yunlian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.02772v2.pdf","comment":"Project Page: https://sx0207.github.io/fg-mdm/"},{"id":"http://arxiv.org/abs/2404.14822v1","updated":"2024-04-23T08:19:08Z","published":"2024-04-23T08:19:08Z","title":"CNN2GNN: How to Bridge CNN with GNN","summary":" Although the convolutional neural network (CNN) has achieved excellent\nperformance in vision tasks by extracting the intra-sample representation, it\nwill take a higher training expense because of stacking numerous convolutional\nlayers. Recently, as the bilinear models, graph neural networks (GNN) have\nsucceeded in exploring the underlying topological relationship among the graph\ndata with a few graph neural layers. Unfortunately, it cannot be directly\nutilized on non-graph data due to the lack of graph structure and has high\ninference latency on large-scale scenarios. Inspired by these complementary\nstrengths and weaknesses, \\textit{we discuss a natural question, how to bridge\nthese two heterogeneous networks?} In this paper, we propose a novel CNN2GNN\nframework to unify CNN and GNN together via distillation. Firstly, to break the\nlimitations of GNN, a differentiable sparse graph learning module is designed\nas the head of networks to dynamically learn the graph for inductive learning.\nThen, a response-based distillation is introduced to transfer the knowledge\nfrom CNN to GNN and bridge these two heterogeneous networks. 
Notably, due to\nextracting the intra-sample representation of a single instance and the\ntopological relationship among the datasets simultaneously, the performance of\ndistilled ``boosted'' two-layer GNN on Mini-ImageNet is much higher than CNN\ncontaining dozens of layers such as ResNet152.\n","authors":["Ziheng Jiao","Hongyuan Zhang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.14822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07518v2","updated":"2024-04-23T08:02:23Z","published":"2024-04-11T07:22:14Z","title":"Remembering Transformer for Continual Learning","summary":" Neural networks encounter the challenge of Catastrophic Forgetting (CF) in\ncontinual learning, where new task knowledge interferes with previously learned\nknowledge. We propose Remembering Transformer, inspired by the brain's\nComplementary Learning Systems (CLS), to tackle this issue. Remembering\nTransformer employs a mixture-of-adapters and a generative model-based routing\nmechanism to alleviate CF by dynamically routing task data to relevant\nadapters. Our approach demonstrated a new SOTA performance in various vision\ncontinual learning tasks and great parameter efficiency.\n","authors":["Yuwei Sun","Ippei Fujisawa","Arthur Juliani","Jun Sakuma","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2404.07518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14808v1","updated":"2024-04-23T07:39:09Z","published":"2024-04-23T07:39:09Z","title":"Visual-Augmented Dynamic Semantic Prototype for Generative Zero-Shot\n Learning","summary":" Generative Zero-shot learning (ZSL) learns a generator to synthesize visual\nsamples for unseen classes, which is an effective way to advance ZSL. However,\nexisting generative methods rely on the conditions of Gaussian noise and the\npredefined semantic prototype, which limit the generator only optimized on\nspecific seen classes rather than characterizing each visual instance,\nresulting in poor generalizations (\\textit{e.g.}, overfitting to seen classes).\nTo address this issue, we propose a novel Visual-Augmented Dynamic Semantic\nprototype method (termed VADS) to boost the generator to learn accurate\nsemantic-visual mapping by fully exploiting the visual-augmented knowledge into\nsemantic conditions. In detail, VADS consists of two modules: (1) Visual-aware\nDomain Knowledge Learning module (VDKL) learns the local bias and global prior\nof the visual features (referred to as domain visual knowledge), which replace\npure Gaussian noise to provide richer prior noise information; (2)\nVision-Oriented Semantic Updation module (VOSU) updates the semantic prototype\naccording to the visual representations of the samples. Ultimately, we\nconcatenate their output as a dynamic semantic prototype, which serves as the\ncondition of the generator. 
Extensive experiments demonstrate that our VADS\nachieves superior CZSL and GZSL performances on three prominent datasets and\noutperforms other state-of-the-art methods with averaging increases by 6.4\\%,\n5.9\\% and 4.2\\% on SUN, CUB and AWA2, respectively.\n","authors":["Wenjin Hou","Shiming Chen","Shuhuang Chen","Ziming Hong","Yan Wang","Xuetao Feng","Salman Khan","Fahad Shahbaz Khan","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2404.14808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14807v1","updated":"2024-04-23T07:37:43Z","published":"2024-04-23T07:37:43Z","title":"Reference-Free Multi-Modality Volume Registration of X-Ray Microscopy\n and Light-Sheet Fluorescence Microscopy","summary":" Recently, X-ray microscopy (XRM) and light-sheet fluorescence microscopy\n(LSFM) have emerged as two pivotal imaging tools in preclinical research on\nbone remodeling diseases, offering micrometer-level resolution. Integrating\nthese complementary modalities provides a holistic view of bone\nmicrostructures, facilitating function-oriented volume analysis across\ndifferent disease cycles. However, registering such independently acquired\nlarge-scale volumes is extremely challenging under real and reference-free\nscenarios. This paper presents a fast two-stage pipeline for volume\nregistration of XRM and LSFM. The first stage extracts the surface features and\nemploys two successive point cloud-based methods for coarse alignment. The\nsecond stage fine-tunes the initial alignment using a modified\ncross-correlation method, ensuring precise volumetric registration. Moreover,\nwe propose residual similarity as a novel metric to assess the alignment of two\ncomplementary modalities. The results imply robust gradual improvement across\nthe stages. In the end, all correlating microstructures, particularly lacunae\nin XRM and bone cells in LSFM, are precisely matched, enabling new insights\ninto bone diseases like osteoporosis which are a substantial burden in aging\nsocieties.\n","authors":["Siyuan Mei","Fuxin Fan","Mareike Thies","Mingxuan Gu","Fabian Wagner","Oliver Aust","Ina Erceg","Zeynab Mirzaei","Georgiana Neag","Yipeng Sun","Yixing Huang","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.14807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02238v3","updated":"2024-04-23T07:35:14Z","published":"2023-12-04T09:19:38Z","title":"X-Adapter: Adding Universal Compatibility of Plugins for Upgraded\n Diffusion Model","summary":" We introduce X-Adapter, a universal upgrader to enable the pretrained\nplug-and-play modules (e.g., ControlNet, LoRA) to work directly with the\nupgraded text-to-image diffusion model (e.g., SDXL) without further retraining.\nWe achieve this goal by training an additional network to control the frozen\nupgraded model with the new text-image data pairs. In detail, X-Adapter keeps a\nfrozen copy of the old model to preserve the connectors of different plugins.\nAdditionally, X-Adapter adds trainable mapping layers that bridge the decoders\nfrom models of different versions for feature remapping. The remapped features\nwill be used as guidance for the upgraded model. To enhance the guidance\nability of X-Adapter, we employ a null-text training strategy for the upgraded\nmodel. After training, we also introduce a two-stage denoising strategy to\nalign the initial latents of X-Adapter and the upgraded model. 
Thanks to our\nstrategies, X-Adapter demonstrates universal compatibility with various plugins\nand also enables plugins of different versions to work together, thereby\nexpanding the functionalities of diffusion community. To verify the\neffectiveness of the proposed method, we conduct extensive experiments and the\nresults show that X-Adapter may facilitate wider application in the upgraded\nfoundational diffusion model.\n","authors":["Lingmin Ran","Xiaodong Cun","Jia-Wei Liu","Rui Zhao","Song Zijie","Xintao Wang","Jussi Keppo","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02238v3.pdf","comment":"Project page: https://showlab.github.io/X-Adapter/"},{"id":"http://arxiv.org/abs/2404.14801v1","updated":"2024-04-23T07:31:19Z","published":"2024-04-23T07:31:19Z","title":"DesignProbe: A Graphic Design Benchmark for Multimodal Large Language\n Models","summary":" A well-executed graphic design typically achieves harmony in two levels, from\nthe fine-grained design elements (color, font and layout) to the overall\ndesign. This complexity makes the comprehension of graphic design challenging,\nfor it needs the capability to both recognize the design elements and\nunderstand the design. With the rapid development of Multimodal Large Language\nModels (MLLMs), we establish the DesignProbe, a benchmark to investigate the\ncapability of MLLMs in design. Our benchmark includes eight tasks in total,\nacross both the fine-grained element level and the overall design level. At\ndesign element level, we consider both the attribute recognition and semantic\nunderstanding tasks. At overall design level, we include style and metaphor. 9\nMLLMs are tested and we apply GPT-4 as evaluator. Besides, further experiments\nindicates that refining prompts can enhance the performance of MLLMs. We first\nrewrite the prompts by different LLMs and found increased performances appear\nin those who self-refined by their own LLMs. We then add extra task knowledge\nin two different ways (text descriptions and image examples), finding that\nadding images boost much more performance over texts.\n","authors":["Jieru Lin","Danqing Huang","Tiejun Zhao","Dechen Zhan","Chin-Yew Lin"],"pdf_url":"https://arxiv.org/pdf/2404.14801v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2404.07762v4","updated":"2024-04-23T07:29:18Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. 
This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/atonderski/neuro-ncap\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13558v2","updated":"2024-04-23T07:17:03Z","published":"2024-04-21T07:13:56Z","title":"LASER: Tuning-Free LLM-Driven Attention Control for Efficient\n Text-conditioned Image-to-Animation","summary":" Revolutionary advancements in text-to-image models have unlocked new\ndimensions for sophisticated content creation, e.g., text-conditioned image\nediting, allowing us to edit the diverse images that convey highly complex\nvisual concepts according to the textual guidance. Despite being promising,\nexisting methods focus on texture- or non-rigid-based visual manipulation,\nwhich struggles to produce the fine-grained animation of smooth\ntext-conditioned image morphing without fine-tuning, i.e., due to their highly\nunstructured latent space. In this paper, we introduce a tuning-free LLM-driven\nattention control framework, encapsulated by the progressive process of LLM\nplanning, prompt-Aware editing, StablE animation geneRation, abbreviated as\nLASER. LASER employs a large language model (LLM) to refine coarse descriptions\ninto detailed prompts, guiding pre-trained text-to-image models for subsequent\nimage generation. We manipulate the model's spatial features and self-attention\nmechanisms to maintain animation integrity and enable seamless morphing\ndirectly from text prompts, eliminating the need for additional fine-tuning or\nannotations. Our meticulous control over spatial features and self-attention\nensures structural consistency in the images. This paper presents a novel\nframework integrating LLMs with text-to-image models to create high-quality\nanimations from a single text input. We also propose a Text-conditioned\nImage-to-Animation Benchmark to validate the effectiveness and efficacy of\nLASER. Extensive experiments demonstrate that LASER produces impressive,\nconsistent, and efficient results in animation generation, positioning it as a\npowerful tool for advanced digital content creation.\n","authors":["Haoyu Zheng","Wenqiao Zhang","Yaoke Wang","Hao Zhou","Jiang Liu","Juncheng Li","Zheqi Lv","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.13558v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.08968v3","updated":"2024-04-23T07:13:30Z","published":"2024-04-13T11:13:56Z","title":"MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes","summary":" Recent advancements in post-hoc and inherently interpretable methods have\nmarkedly enhanced the explanations of black box classifier models. These\nmethods operate either through post-analysis or by integrating concept learning\nduring model training. Although being effective in bridging the semantic gap\nbetween a model's latent space and human interpretation, these explanation\nmethods only partially reveal the model's decision-making process. 
The outcome\nis typically limited to high-level semantics derived from the last feature map.\nWe argue that explanations lacking insight into the decision processes at\nlow- and mid-level features are neither fully faithful nor useful. Addressing\nthis gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet),\nan inherently interpretable model. MCPNet autonomously learns meaningful\nconcept prototypes across multiple feature map levels using Centered Kernel\nAlignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so\nwithout reliance on predefined concept labels. Further, we propose a novel\nclassifier paradigm that learns and aligns multi-level concept prototype\ndistributions for classification purposes via Class-aware Concept Distribution\n(CCD) loss. Our experiments reveal that the proposed MCPNet, while being\nadaptable to various model architectures, offers comprehensive multi-level\nexplanations while maintaining classification accuracy. Additionally, its\nconcept distribution-based classification approach shows improved\ngeneralization capabilities in few-shot classification scenarios.\n","authors":["Bor-Shiun Wang","Chien-Yi Wang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2404.08968v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14780v1","updated":"2024-04-23T06:37:54Z","published":"2024-04-23T06:37:54Z","title":"ContextualFusion: Context-Based Multi-Sensor Fusion for 3D Object\n Detection in Adverse Operating Conditions","summary":" The fusion of multimodal sensor data streams such as camera images and lidar\npoint clouds plays an important role in the operation of autonomous vehicles\n(AVs). Robust perception across a range of adverse weather and lighting\nconditions is specifically required for AVs to be deployed widely. While\nmulti-sensor fusion networks have been previously developed for perception in\nsunny and clear weather conditions, these methods show a significant\ndegradation in performance under night-time and poor weather conditions. In\nthis paper, we propose a simple yet effective technique called ContextualFusion\nto incorporate the domain knowledge about cameras and lidars behaving\ndifferently across lighting and weather variations into 3D object detection\nmodels. Specifically, we design a Gated Convolutional Fusion (GatedConv)\napproach for the fusion of sensor streams based on the operational context. To\naid in our evaluation, we use the open-source simulator CARLA to create a\nmultimodal adverse-condition dataset called AdverseOp3D to address the\nshortcomings of existing datasets being biased towards daytime and good-weather\nconditions. Our ContextualFusion approach yields an mAP improvement of 6.2%\nover state-of-the-art methods on our context-balanced synthetic dataset.\nFinally, our method enhances state-of-the-art 3D object detection performance at night\non the real-world NuScenes dataset with a significant mAP improvement of 11.7%.\n","authors":["Shounak Sural","Nishad Sahu","Ragunathan Rajkumar"],"pdf_url":"https://arxiv.org/pdf/2404.14780v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.14768v1","updated":"2024-04-23T06:10:43Z","published":"2024-04-23T06:10:43Z","title":"Enhancing Prompt Following with Visual Control Through Training-Free\n Mask-Guided Diffusion","summary":" Recently, integrating visual controls into text-to-image (T2I) models, such\nas the ControlNet method, has received significant attention for finer control\ncapabilities. 
While various training-free methods make efforts to enhance\nprompt following in T2I models, the issue with visual control is still rarely\nstudied, especially in scenarios where visual controls are misaligned with\ntext prompts. In this paper, we address the challenge of \"Prompt Following\nWith Visual Control\" and propose a training-free approach named Mask-guided\nPrompt Following (MGPF). Object masks are introduced to distinguish aligned and\nmisaligned parts of visual controls and prompts. Meanwhile, a network, dubbed\nMasked ControlNet, is designed to utilize these object masks for object\ngeneration in the misaligned visual control region. Further, to improve\nattribute matching, a simple yet efficient loss is designed to align the\nattention maps of attributes with object regions constrained by ControlNet and\nobject masks. The efficacy and superiority of MGPF are validated through\ncomprehensive quantitative and qualitative experiments.\n","authors":["Hongyu Chen","Yiqi Gao","Min Zhou","Peng Wang","Xubin Li","Tiezheng Ge","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14759v1","updated":"2024-04-23T05:50:02Z","published":"2024-04-23T05:50:02Z","title":"Unified Unsupervised Salient Object Detection via Knowledge Transfer","summary":" Recently, unsupervised salient object detection (USOD) has gained increasing\nattention due to its annotation-free nature. However, current methods mainly\nfocus on specific tasks such as RGB and RGB-D, neglecting the potential for\ntask migration. In this paper, we propose a unified USOD framework for generic\nUSOD tasks. Firstly, we propose a Progressive Curriculum Learning-based\nSaliency Distilling (PCL-SD) mechanism to extract saliency cues from a\npre-trained deep network. This mechanism starts with easy samples and\nprogressively moves towards harder ones, to avoid initial interference caused\nby hard samples. Afterwards, the obtained saliency cues are utilized to train a\nsaliency detector, and we employ a Self-rectify Pseudo-label Refinement (SPR)\nmechanism to improve the quality of pseudo-labels. Finally, an adapter-tuning\nmethod is devised to transfer the acquired saliency knowledge, leveraging\nshared knowledge to attain superior transferring performance on the target\ntasks. Extensive experiments on five representative SOD tasks confirm the\neffectiveness and feasibility of our proposed method. Code and supplementary\nmaterials are available at https://github.com/I2-Multimedia-Lab/A2S-v3.\n","authors":["Yao Yuan","Wutao Liu","Pan Gao","Qun Dai","Jie Qin"],"pdf_url":"https://arxiv.org/pdf/2404.14759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12113v5","updated":"2024-04-23T05:46:01Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Deep learning (DL) has become one of the mainstream and effective methods for\npoint cloud analysis tasks such as detection, segmentation and classification.\nTo reduce overfitting when training DL models and to improve model performance,\nespecially when the amount and/or diversity of training data is limited,\naugmentation is often crucial. 
Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys these methods, categorizing them into a\ntaxonomy framework that comprises basic and specialized point cloud data\naugmentation methods. Through a comprehensive evaluation of these augmentation\nmethods, this article identifies their potentials and limitations, serving as a\nuseful reference for choosing appropriate augmentation methods. In addition,\npotential directions for future research are recommended. This survey\ncontributes to providing a holistic overview of the current state of point\ncloud data augmentation, promoting its wider application and development.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v5.pdf","comment":"Accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2404.14755v1","updated":"2024-04-23T05:36:33Z","published":"2024-04-23T05:36:33Z","title":"SkinGEN: an Explainable Dermatology Diagnosis-to-Generation Framework\n with Interactive Vision-Language Models","summary":" With the continuous advancement of vision language models (VLMs) technology,\nremarkable research achievements have emerged in the dermatology field, the\nfourth most prevalent human disease category. However, despite these\nadvancements, VLM still faces \"hallucination\" in dermatological diagnosis, and\ndue to the inherent complexity of dermatological conditions, existing tools\noffer relatively limited support for user comprehension. We propose SkinGEN, a\ndiagnosis-to-generation framework that leverages the stable diffusion (SD)\nmethod to generate reference demonstrations from diagnosis results provided by\nVLM, thereby enhancing the visual explainability for users. Through extensive\nexperiments with Low-Rank Adaptation (LoRA), we identify optimal strategies for\nskin condition image generation. We conduct a user study with 32 participants\nevaluating both the system performance and explainability. Results demonstrate\nthat SkinGEN significantly improves users' comprehension of VLM predictions and\nfosters increased trust in the diagnostic process. This work paves the way for\nmore transparent and user-centric VLM applications in dermatology and beyond.\n","authors":["Bo Lin","Yingjing Xu","Xuanwen Bao","Zhou Zhao","Zuyong Zhang","Zhouyang Wang","Jie Zhang","Shuiguang Deng","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2404.14755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12734v3","updated":"2024-04-23T05:36:31Z","published":"2024-04-19T09:28:16Z","title":"DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On\n Transformer","summary":" With the continuous development of Optical Character Recognition (OCR) and\nthe expansion of application fields, text recognition in complex scenes has\nbecome a key challenge. Factors such as multiple fonts, mixed scenes and\ncomplex layouts seriously affect the recognition accuracy of traditional OCR\nmodels. Although OCR models based on deep learning have performed well in\nspecific fields or similar datasets in recent years, the generalization ability\nand robustness of the model are still a big challenge when facing complex\nenvironments with multiple scenes. 
Furthermore, training an OCR model from\nscratch or fine-tuning all parameters is very demanding on computing resources\nand inference time, which limits the flexibility of its application. This study\nfocuses on a fundamental aspect of mixed text recognition in response to the\nchallenges mentioned above, which involves effectively fine-tuning the\npre-trained basic OCR model to demonstrate exceptional performance across\nvarious downstream tasks. To this end, we propose a parameter-efficient mixed\ntext recognition method based on pre-trained OCR Transformer, namely\nDLoRA-TrOCR. This method embeds DoRA into the image encoder and LoRA into the\ninternal structure of the text decoder, enabling efficient parameter\nfine-tuning for downstream tasks. Experiments show that compared to similar\nparameter adjustment methods, our model DLoRA-TrOCR has the smallest number of\nparameters and performs better. It can achieve state-of-the-art performance on\ncomplex scene datasets involving simultaneous recognition of mixed handwritten,\nprinted and street view texts.\n","authors":["Da Chang","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2404.12734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17387v2","updated":"2024-04-23T05:34:50Z","published":"2024-03-26T05:12:18Z","title":"Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object\n Detection","summary":" We delve into pseudo-labeling for semi-supervised monocular 3D object\ndetection (SSM3OD) and discover two primary issues: a misalignment between the\nprediction quality of 3D and 2D attributes and the tendency of depth\nsupervision derived from pseudo-labels to be noisy, leading to significant\noptimization conflicts with other reliable forms of supervision. We introduce a\nnovel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach\nfeatures a Decoupled Pseudo-label Generation (DPG) module, designed to\nefficiently generate pseudo-labels by separately processing 2D and 3D\nattributes. This module incorporates a unique homography-based method for\nidentifying dependable pseudo-labels in BEV space, specifically for 3D\nattributes. Additionally, we present a DepthGradient Projection (DGP) module to\nmitigate optimization conflicts caused by noisy depth supervision of\npseudo-labels, effectively decoupling the depth gradient and removing\nconflicting gradients. This dual decoupling strategy-at both the pseudo-label\ngeneration and gradient levels-significantly improves the utilization of\npseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark\ndemonstrate the superiority of our method over existing approaches.\n","authors":["Jiacheng Zhang","Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17387v2.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2404.14750v1","updated":"2024-04-23T05:16:24Z","published":"2024-04-23T05:16:24Z","title":"Grounded Knowledge-Enhanced Medical VLP for Chest X-Ray","summary":" Medical vision-language pre-training has emerged as a promising approach for\nlearning domain-general representations of medical image and text. Current\nalgorithms that exploit the global and local alignment between medical image\nand text could however be marred by the redundant information in medical data.\nTo address this issue, we propose a grounded knowledge-enhanced medical\nvision-language pre-training (GK-MVLP) framework for chest X-ray. 
In this\nframework, medical knowledge is grounded to the appropriate anatomical regions\nby using a transformer-based grounded knowledge-enhanced module for\nfine-grained alignment between anatomical region-level visual features and the\ntextual features of medical knowledge. The performance of GK-MVLP is\ncompetitive with or exceeds the state of the art on downstream chest X-ray\ndisease classification, disease localization, report generation, and medical\nvisual question-answering tasks. Our results show the advantage of\nincorporating a grounding mechanism to remove biases and improve the alignment\nbetween chest X-ray images and radiology reports.\n","authors":["Qiao Deng","Zhongzhen Huang","Yunqi Wang","Zhichuan Wang","Zhao Wang","Xiaofan Zhang","Qi Dou","Yeung Yu Hui","Edward S. Hui"],"pdf_url":"https://arxiv.org/pdf/2404.14750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11318v2","updated":"2024-04-23T05:09:40Z","published":"2024-04-17T12:32:10Z","title":"Leveraging Fine-Grained Information and Noise Decoupling for Remote\n Sensing Change Detection","summary":" Change detection aims to identify remote sensing object changes by analyzing\ndata between bitemporal image pairs. Due to the large temporal and spatial span\nof data collection in change detection image pairs, there is often a\nsignificant amount of task-specific and task-agnostic noise. Previous efforts\nhave focused excessively on denoising, at the cost of a great deal of\nfine-grained information. In this paper, we revisit the importance of\nfine-grained features in change detection and propose a series of operations\nfor fine-grained information compensation and noise decoupling (FINO). First,\nthe context is utilized to compensate for the fine-grained information in the\nfeature space. Next, a shape-aware and a brightness-aware module are designed\nto improve the capacity for representation learning. The shape-aware module\nguides the backbone network toward more precise shape estimation, helping it\nextract object shape features. The brightness-aware module learns\nan overall brightness estimate to improve the model's robustness to\ntask-agnostic noise. Finally, a task-specific noise decoupling structure is\ndesigned as a way to improve the model's ability to separate noise interference\nfrom feature similarity. With these training schemes, our proposed method\nachieves new state-of-the-art (SOTA) results on multiple change detection\nbenchmarks. The code will be made available.\n","authors":["Qiangang Du","Jinlong Peng","Changan Wang","Xu Chen","Qingdong He","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05108v3","updated":"2024-04-23T05:06:10Z","published":"2023-10-08T10:44:05Z","title":"Enhancing Representations through Heterogeneous Self-Supervised Learning","summary":" Incorporating heterogeneous representations from different architectures has\nfacilitated various vision tasks, e.g., some hybrid networks combine\ntransformers and convolutions. However, complementarity between such\nheterogeneous architectures has not been well exploited in self-supervised\nlearning. Thus, we propose Heterogeneous Self-Supervised Learning (HSSL), which\nenforces a base model to learn from an auxiliary head whose architecture is\nheterogeneous from the base model. In this process, HSSL endows the base model\nwith new characteristics through representation learning, without structural\nchanges. 
To comprehensively understand the HSSL, we conduct experiments on\nvarious heterogeneous pairs containing a base model and an auxiliary head. We\ndiscover that the representation quality of the base model moves up as their\narchitecture discrepancy grows. This observation motivates us to propose a\nsearch strategy that quickly determines the most suitable auxiliary head for a\nspecific base model to learn and several simple but effective methods to\nenlarge the model discrepancy. The HSSL is compatible with various\nself-supervised methods, achieving superior performances on various downstream\ntasks, including image classification, semantic segmentation, instance\nsegmentation, and object detection. Our source code will be made publicly\navailable.\n","authors":["Zhong-Yu Li","Bo-Wen Yin","Yongxiang Liu","Li Liu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.05108v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11326v4","updated":"2024-04-23T05:04:23Z","published":"2024-04-17T12:38:58Z","title":"Single-temporal Supervised Remote Change Detection for Domain\n Generalization","summary":" Change detection is widely applied in remote sensing image analysis. Existing\nmethods require training models separately for each dataset, which leads to\npoor domain generalization. Moreover, these methods rely heavily on large\namounts of high-quality pair-labelled data for training, which is expensive and\nimpractical. In this paper, we propose a multimodal contrastive learning\n(ChangeCLIP) based on visual-language pre-training for change detection domain\ngeneralization. Additionally, we propose a dynamic context optimization for\nprompt learning. Meanwhile, to address the data dependency issue of existing\nmethods, we introduce a single-temporal and controllable AI-generated training\nstrategy (SAIN). This allows us to train the model using a large number of\nsingle-temporal images without image pairs in the real world, achieving\nexcellent generalization. Extensive experiments on series of real change\ndetection datasets validate the superiority and strong generalization of\nChangeCLIP, outperforming state-of-the-art change detection methods. Code will\nbe available.\n","authors":["Qiangang Du","Jinlong Peng","Xu Chen","Qingdong He","Liren He","Qiang Nie","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11326v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14747v1","updated":"2024-04-23T04:59:34Z","published":"2024-04-23T04:59:34Z","title":"Differentiable Score-Based Likelihoods: Learning CT Motion Compensation\n From Clean Images","summary":" Motion artifacts can compromise the diagnostic value of computed tomography\n(CT) images. Motion correction approaches require a per-scan estimation of\npatient-specific motion patterns. In this work, we train a score-based model to\nact as a probability density estimator for clean head CT images. Given the\ntrained model, we quantify the deviation of a given motion-affected CT image\nfrom the ideal distribution through likelihood computation. We demonstrate that\nthe likelihood can be utilized as a surrogate metric for motion artifact\nseverity in the CT image facilitating the application of an iterative,\ngradient-based motion compensation algorithm. 
By optimizing the underlying\nmotion parameters to maximize likelihood, our method effectively reduces motion\nartifacts, bringing the image closer to the distribution of motion-free scans.\nOur approach achieves comparable performance to state-of-the-art methods while\neliminating the need for a representative data set of motion-affected samples.\nThis is particularly advantageous in real-world applications, where patient\nmotion patterns may exhibit unforeseen variability, ensuring robustness without\nimplicit assumptions about recoverable motion types.\n","authors":["Mareike Thies","Noah Maul","Siyuan Mei","Laura Pfaff","Nastassia Vysotskaya","Mingxuan Gu","Jonas Utz","Dennis Possart","Lukas Folle","Fabian Wagner","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.14747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04517v2","updated":"2024-04-23T04:54:51Z","published":"2024-04-06T06:15:07Z","title":"Latent-based Diffusion Model for Long-tailed Recognition","summary":" Long-tailed imbalance distribution is a common issue in practical computer\nvision applications. Previous works proposed methods to address this problem,\nwhich can be categorized into several classes: re-sampling, re-weighting,\ntransfer learning, and feature augmentation. In recent years, diffusion models\nhave shown an impressive generation ability in many sub-problems of deep\ncomputer vision. However, its powerful generation has not been explored in\nlong-tailed problems. We propose a new approach, the Latent-based Diffusion\nModel for Long-tailed Recognition (LDMLR), as a feature augmentation method to\ntackle the issue. First, we encode the imbalanced dataset into features using\nthe baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM)\nusing these encoded features to generate pseudo-features. Finally, we train the\nclassifier using the encoded and pseudo-features from the previous two steps.\nThe model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT\ndatasets by using the proposed method.\n","authors":["Pengxiao Han","Changkun Ye","Jieming Zhou","Jing Zhang","Jie Hong","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2404.04517v2.pdf","comment":"8 pages, 3 figures. Accepted by L3DIVU-CVPR2024"},{"id":"http://arxiv.org/abs/2404.14745v1","updated":"2024-04-23T04:54:32Z","published":"2024-04-23T04:54:32Z","title":"TAAT: Think and Act from Arbitrary Texts in Text2Motion","summary":" Text2Motion aims to generate human motions from texts. Existing datasets rely\non the assumption that texts include action labels (such as \"walk, bend, and\npick up\"), which is not flexible for practical scenarios. This paper redefines\nthis problem with a more realistic assumption that the texts are arbitrary.\nSpecifically, arbitrary texts include existing action texts composed of action\nlabels (e.g., A person walks and bends to pick up something), and introduce\nscene texts without explicit action labels (e.g., A person notices his wallet\non the ground ahead).\n To bridge the gaps between this realistic setting and existing datasets, we\nexpand the action texts on the HumanML3D dataset to more scene texts, thereby\ncreating a new HumanML3D++ dataset including arbitrary texts. 
In this\nchallenging dataset, we benchmark existing state-of-the-art methods and propose\na novel two-stage framework to extract action labels from arbitrary texts by\nthe Large Language Model (LLM) and then generate motions from action labels.\nExtensive experiments are conducted under different application scenarios to\nvalidate the effectiveness of the proposed framework on existing and proposed\ndatasets. The results indicate that Text2Motion in this realistic setting is\nvery challenging, fostering new research in this practical direction. Our\ndataset and code will be released.\n","authors":["Runqi Wang","Caoyuan Ma","GuoPeng Li","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13949v2","updated":"2024-04-23T04:48:47Z","published":"2024-04-22T07:50:24Z","title":"PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for\n RGB-D Cameras with Limited Co-visibility","summary":" RGB-D cameras are crucial in robotic perception, given their ability to\nproduce images augmented with depth data. However, their limited FOV often\nrequires multiple cameras to cover a broader area. In multi-camera RGB-D\nsetups, the goal is typically to reduce camera overlap, optimizing spatial\ncoverage with as few cameras as possible. The extrinsic calibration of these\nsystems introduces additional complexities. Existing methods for extrinsic\ncalibration either necessitate specific tools or highly depend on the accuracy\nof camera motion estimation. To address these issues, we present PeLiCal, a\nnovel line-based calibration approach for RGB-D camera systems exhibiting\nlimited overlap. Our method leverages long line features from surroundings, and\nfilters out outliers with a novel convergence voting algorithm, achieving\ntargetless, real-time, and outlier-robust performance compared to existing\nmethods. We open source our implementation on\nhttps://github.com/joomeok/PeLiCal.git.\n","authors":["Jaeho Shin","Seungsang Yun","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2404.13949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14739v1","updated":"2024-04-23T04:45:23Z","published":"2024-04-23T04:45:23Z","title":"BMapOpt: Optimization of Brain Tissue Probability Maps using a\n Differentiable MRI Simulator","summary":" Reconstructing digital brain phantoms in the form of multi-channeled brain\ntissue probability maps for individual subjects is essential for capturing\nbrain anatomical variability, understanding neurological diseases, as well as\nfor testing image processing methods. We demonstrate the first framework that\noptimizes brain tissue probability maps (Gray Matter - GM, White Matter - WM,\nand Cerebrospinal fluid - CSF) with the help of a Physics-based differentiable\nMRI simulator that models the magnetization signal at each voxel in the image.\nGiven an observed $T_1$/$T_2$-weighted MRI scan, the corresponding clinical MRI\nsequence, and the MRI differentiable simulator, we optimize the simulator's\ninput probability maps by back-propagating the L2 loss between the simulator's\noutput and the $T_1$/$T_2$-weighted scan. This approach has the significant\nadvantage of not relying on any training data, and instead uses the strong\ninductive bias of the MRI simulator. 
We test the model on 20 scans from the\nBrainWeb database and demonstrate highly accurate reconstruction of GM, WM,\nand CSF.\n","authors":["Utkarsh Gupta","Emmanouil Nikolakakis","Moritz Zaiss","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.14739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15569v2","updated":"2024-04-23T04:08:54Z","published":"2023-07-28T14:04:54Z","title":"Point Clouds Are Specialized Images: A Knowledge Transfer Approach for\n 3D Understanding","summary":" Self-supervised representation learning (SSRL) has gained increasing\nattention in point cloud understanding, in addressing the challenges posed by\n3D data scarcity and high annotation costs. This paper presents PCExpert, a\nnovel SSRL approach that reinterprets point clouds as \"specialized images\".\nThis conceptual shift allows PCExpert to leverage knowledge derived from the\nlarge-scale image modality in a more direct and deeper manner, via extensively\nsharing the parameters with a pre-trained image encoder in a multi-way\nTransformer architecture. The parameter sharing strategy, combined with a novel\npretext task for pre-training, i.e., transformation estimation, empowers\nPCExpert to outperform the state of the art in a variety of tasks, with a\nremarkable reduction in the number of trainable parameters. Notably, PCExpert's\nperformance under LINEAR fine-tuning (e.g., yielding a 90.02% overall accuracy\non ScanObjectNN) has already approached the results obtained with FULL model\nfine-tuning (92.66%), demonstrating its effective and robust representation\ncapability.\n","authors":["Jiachen Kang","Wenjing Jia","Xiangjian He","Kin Man Lam"],"pdf_url":"https://arxiv.org/pdf/2307.15569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16279v3","updated":"2024-04-23T03:54:27Z","published":"2023-10-25T01:24:12Z","title":"TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer","summary":" Estimating the 6D object pose is an essential task in many applications. Due\nto the lack of depth information, existing RGB-based methods are sensitive to\nocclusion and illumination changes. How to extract and utilize the geometry\nfeatures in depth information is crucial for achieving accurate predictions. To\nthis end, we propose TransPose, a novel 6D pose framework that exploits a\nTransformer Encoder with a geometry-aware module to learn better\npoint cloud feature representations. Specifically, we first uniformly sample the\npoint cloud and extract local geometry features with a designed local feature\nextractor based on a graph convolution network. To improve robustness to\nocclusion, we adopt a Transformer to exchange global information,\nso that each local feature contains global information. Finally, we introduce a\ngeometry-aware module into the Transformer Encoder, which forms an effective\nconstraint for point cloud feature learning and makes the global information\nexchange more tightly coupled with point cloud tasks. 
Extensive experiments\nindicate the effectiveness of TransPose, our pose estimation pipeline achieves\ncompetitive results on three benchmark datasets.\n","authors":["Xiao Lin","Deming Wang","Guangliang Zhou","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16279v3.pdf","comment":"Accepted by NEUROCOMPUTING"},{"id":"http://arxiv.org/abs/2404.14716v1","updated":"2024-04-23T03:42:48Z","published":"2024-04-23T03:42:48Z","title":"Bayesian Example Selection Improves In-Context Learning for Speech,\n Text, and Visual Modalities","summary":" Large language models (LLMs) can adapt to new tasks through in-context\nlearning (ICL) based on a few examples presented in dialogue history without\nany model parameter update. Despite such convenience, the performance of ICL\nheavily depends on the quality of the in-context examples presented, which\nmakes the in-context example selection approach a critical choice. This paper\nproposes a novel Bayesian in-Context example Selection method (ByCS) for ICL.\nExtending the inference probability conditioned on in-context examples based on\nBayes' theorem, ByCS focuses on the inverse inference conditioned on test\ninput. Following the assumption that accurate inverse inference probability\n(likelihood) will result in accurate inference probability (posterior),\nin-context examples are selected based on their inverse inference results.\nDiverse and extensive cross-tasking and cross-modality experiments are\nperformed with speech, text, and image examples. Experimental results show the\nefficacy and robustness of our ByCS method on various models, tasks and\nmodalities.\n","authors":["Siyin Wang","Chao-Han Huck Yang","Ji Wu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14716v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.14715v1","updated":"2024-04-23T03:42:14Z","published":"2024-04-23T03:42:14Z","title":"FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection\n and Correction","summary":" Recent progress in large-scale pre-training has led to the development of\nadvanced vision-language models (VLMs) with remarkable proficiency in\ncomprehending and generating multimodal content. Despite the impressive ability\nto perform complex reasoning for VLMs, current models often struggle to\neffectively and precisely capture the compositional information on both the\nimage and text sides. To address this, we propose FineMatch, a new aspect-based\nfine-grained text and image matching benchmark, focusing on text and image\nmismatch detection and correction. This benchmark introduces a novel task for\nboosting and evaluating the VLMs' compositionality for aspect-based\nfine-grained text and image matching. In this task, models are required to\nidentify mismatched aspect phrases within a caption, determine the aspect's\nclass, and propose corrections for an image-text pair that may contain between\n0 and 3 mismatches. To evaluate the models' performance on this new task, we\npropose a new evaluation metric named ITM-IoU for which our experiments show a\nhigh correlation to human evaluation. In addition, we also provide a\ncomprehensive experimental analysis of existing mainstream VLMs, including\nfully supervised learning and in-context learning settings. We have found that\nmodels trained on FineMatch demonstrate enhanced proficiency in detecting\nfine-grained text and image mismatches. 
Moreover, models (e.g., GPT-4V, Gemini\nPro Vision) with strong abilities to perform multimodal in-context learning are\nnot as skilled at fine-grained compositional image and text matching analysis.\nWith FineMatch, we are able to build a system for text-to-image generation\nhallucination detection and correction.\n","authors":["Hang Hua","Jing Shi","Kushal Kafle","Simon Jenni","Daoan Zhang","John Collomosse","Scott Cohen","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.14715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06375v3","updated":"2024-04-23T03:35:53Z","published":"2024-03-11T01:58:04Z","title":"FlowVQTalker: High-Quality Emotional Talking Face Generation through\n Normalizing Flow and Quantization","summary":" Generating emotional talking faces is a practical yet challenging endeavor.\nTo create a lifelike avatar, we draw upon two critical insights from a human\nperspective: 1) The connection between audio and the non-deterministic facial\ndynamics, encompassing expressions, blinks, poses, should exhibit synchronous\nand one-to-many mapping. 2) Vibrant expressions are often accompanied by\nemotion-aware high-definition (HD) textures and finely detailed teeth. However,\nboth aspects are frequently overlooked by existing methods. To this end, this\npaper proposes using normalizing Flow and Vector-Quantization modeling to\nproduce emotional talking faces that satisfy both insights concurrently\n(FlowVQTalker). Specifically, we develop a flow-based coefficient generator\nthat encodes the dynamics of facial emotion into a multi-emotion-class latent\nspace represented as a mixture distribution. The generation process commences\nwith random sampling from the modeled distribution, guided by the accompanying\naudio, enabling both lip-synchronization and the uncertain nonverbal facial\ncues generation. Furthermore, our designed vector-quantization image generator\ntreats the creation of expressive facial images as a code query task, utilizing\na learned codebook to provide rich, high-quality textures that enhance the\nemotional perception of the results. Extensive experiments are conducted to\nshowcase the effectiveness of our approach.\n","authors":["Shuai Tan","Bin Ji","Ye Pan"],"pdf_url":"https://arxiv.org/pdf/2403.06375v3.pdf","comment":"11 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2404.14709v1","updated":"2024-04-23T03:35:27Z","published":"2024-04-23T03:35:27Z","title":"SC-HVPPNet: Spatial and Channel Hybrid-Attention Video Post-Processing\n Network with CNN and Transformer","summary":" Convolutional Neural Network (CNN) and Transformer have attracted much\nattention recently for video post-processing (VPP). However, the interaction\nbetween CNN and Transformer in existing VPP methods is not fully explored,\nleading to inefficient communication between the local and global extracted\nfeatures. In this paper, we explore the interaction between CNN and Transformer\nin the task of VPP, and propose a novel Spatial and Channel Hybrid-Attention\nVideo Post-Processing Network (SC-HVPPNet), which can cooperatively exploit the\nimage priors in both spatial and channel domains. Specifically, in the spatial\ndomain, a novel spatial attention fusion module is designed, in which two\nattention weights are generated to fuse the local and global representations\ncollaboratively. In the channel domain, a novel channel attention fusion module\nis developed, which can blend the deep representations at the channel dimension\ndynamically. 
Extensive experiments show that SC-HVPPNet notably boosts video\nrestoration quality, with average bitrate savings of 5.29%, 12.42%, and 13.09%\nfor Y, U, and V components in the VTM-11.0-NNVC RA configuration.\n","authors":["Tong Zhang","Wenxue Cui","Shaohui Liu","Feng Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.14709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00391v2","updated":"2024-04-23T03:31:23Z","published":"2023-11-01T09:34:15Z","title":"Fixation-based Self-calibration for Eye Tracking in VR Headsets","summary":" This study proposes a novel self-calibration method for eye tracking in a\nvirtual reality (VR) headset. The proposed method is based on the assumptions\nthat the user's viewpoint can freely move and that the points of regard (PoRs)\nfrom different viewpoints are distributed within a small area on an object\nsurface during visual fixation. In the method, fixations are first detected\nfrom the time-series data of uncalibrated gaze directions using an extension of\nthe I-VDT (velocity and dispersion threshold identification) algorithm to a\nthree-dimensional (3D) scene. Then, the calibration parameters are optimized by\nminimizing the sum of a dispersion metrics of the PoRs. The proposed method can\npotentially identify the optimal calibration parameters representing the\nuser-dependent offset from the optical axis to the visual axis without explicit\nuser calibration, image processing, or marker-substitute objects. For the gaze\ndata of 18 participants walking in two VR environments with many occlusions,\nthe proposed method achieved an accuracy of 2.1$^\\circ$, which was\nsignificantly lower than the average offset. Our method is the first\nself-calibration method with an average error lower than 3$^\\circ$ in 3D\nenvironments. Further, the accuracy of the proposed method can be improved by\nup to 1.2$^\\circ$ by refining the fixation detection or optimization algorithm.\n","authors":["Ryusei Uramune","Sei Ikeda","Hiroki Ishizuka","Osamu Oshiro"],"pdf_url":"https://arxiv.org/pdf/2311.00391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14572v2","updated":"2024-04-23T03:24:02Z","published":"2023-06-26T10:33:45Z","title":"Feature Imitating Networks Enhance The Performance, Reliability And\n Speed Of Deep Learning On Biomedical Image Processing Tasks","summary":" Feature-Imitating-Networks (FINs) are neural networks that are first trained\nto approximate closed-form statistical features (e.g. Entropy), and then\nembedded into other networks to enhance their performance. In this work, we\nperform the first evaluation of FINs for biomedical image processing tasks. We\nbegin by training a set of FINs to imitate six common radiomics features, and\nthen compare the performance of larger networks (with and without embedding the\nFINs) for three experimental tasks: COVID-19 detection from CT scans, brain\ntumor classification from MRI scans, and brain-tumor segmentation from MRI\nscans. We found that models embedded with FINs provided enhanced performance\nfor all three tasks when compared to baseline networks without FINs, even when\nthose baseline networks had more parameters. Additionally, we found that models\nembedded with FINs converged faster and more consistently compared to baseline\nnetworks with similar or greater representational capacity. The results of our\nexperiments provide evidence that FINs may offer state-of-the-art performance\nfor a variety of other biomedical image processing tasks.\n","authors":["Shangyang Min","Hassan B. 
Ebadian","Tuka Alhanai","Mohammad Mahdi Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2306.14572v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00893v2","updated":"2024-04-23T03:23:10Z","published":"2022-06-02T06:46:12Z","title":"Leveraging Systematic Knowledge of 2D Transformations","summary":" The existing deep learning models suffer from out-of-distribution (o.o.d.)\nperformance drop in computer vision tasks. In comparison, humans have a\nremarkable ability to interpret images, even if the scenes in the images are\nrare, thanks to the systematicity of acquired knowledge. This work focuses on\n1) the acquisition of systematic knowledge of 2D transformations, and 2)\narchitectural components that can leverage the learned knowledge in image\nclassification tasks in an o.o.d. setting. With a new training methodology\nbased on synthetic datasets that are constructed under the causal framework,\nthe deep neural networks acquire knowledge from semantically different domains\n(e.g. even from noise), and exhibit certain level of systematicity in parameter\nestimation experiments. Based on this, a novel architecture is devised\nconsisting of a classifier, an estimator and an identifier (abbreviated as\n\"CED\"). By emulating the \"hypothesis-verification\" process in human visual\nperception, CED improves the classification accuracy significantly on test sets\nunder covariate shift.\n","authors":["Jiachen Kang","Wenjing Jia","Xiangjian He"],"pdf_url":"https://arxiv.org/pdf/2206.00893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14705v1","updated":"2024-04-23T03:22:06Z","published":"2024-04-23T03:22:06Z","title":"Think-Program-reCtify: 3D Situated Reasoning with Large Language Models","summary":" This work addresses the 3D situated reasoning task which aims to answer\nquestions given egocentric observations in a 3D environment. The task remains\nchallenging as it requires comprehensive 3D perception and complex reasoning\nskills. End-to-end models trained on supervised data for 3D situated reasoning\nsuffer from data scarcity and generalization ability. Inspired by the recent\nsuccess of leveraging large language models (LLMs) for visual reasoning, we\npropose LLM-TPC, a novel framework that leverages the planning, tool usage, and\nreflection capabilities of LLMs through a ThinkProgram-reCtify loop. The Think\nphase first decomposes the compositional question into a sequence of steps, and\nthen the Program phase grounds each step to a piece of code and calls carefully\ndesigned 3D visual perception modules. Finally, the Rectify phase adjusts the\nplan and code if the program fails to execute. Experiments and analysis on the\nSQA3D benchmark demonstrate the effectiveness, interpretability and robustness\nof our method. Our code is publicly available at\nhttps://qingrongh.github.io/LLM-TPC/.\n","authors":["Qingrong He","Kejun Lin","Shizhe Chen","Anwen Hu","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2404.14705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14704v1","updated":"2024-04-23T03:17:36Z","published":"2024-04-23T03:17:36Z","title":"Unsupervised Domain Adaptation Architecture Search with Self-Training\n for Land Cover Mapping","summary":" Unsupervised domain adaptation (UDA) is a challenging open problem in land\ncover mapping. Previous studies show encouraging progress in addressing\ncross-domain distribution shifts on remote sensing benchmarks for land cover\nmapping. 
The existing works are mainly built on large neural network\narchitectures, which makes them resource-hungry systems, limiting their\npractical impact for many real-world applications in resource-constrained\nenvironments. Thus, we propose a simple yet effective framework to search for\nlightweight neural networks automatically for land cover mapping tasks under\ndomain shifts. This is achieved by integrating Markov random field neural\narchitecture search (MRF-NAS) into a self-training UDA framework to search for\nefficient and effective networks under a limited computation budget. This is\nthe first attempt to combine NAS with self-training UDA as a single framework\nfor land cover mapping. We also investigate two different pseudo-labelling\napproaches (confidence-based and energy-based) in the self-training scheme.\nExperimental results on two recent datasets (OpenEarthMap & FLAIR #1) for\nremote sensing UDA demonstrate satisfactory performance. With less than\n2M parameters and 30.16 GFLOPs, the best-discovered lightweight network reaches\nstate-of-the-art performance on the regional target domain of OpenEarthMap\n(59.38% mIoU) and the considered target domain of FLAIR #1 (51.19% mIoU). The\ncode is at https://github.com/cliffbb/UDA-NAS.\n","authors":["Clifford Broni-Bediako","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.14704v1.pdf","comment":"Accepted at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.13677v2","updated":"2024-04-23T03:12:24Z","published":"2024-04-21T14:36:57Z","title":"A Dataset and Model for Realistic License Plate Deblurring","summary":" Vehicle license plate recognition is a crucial task in intelligent traffic\nmanagement systems. However, the challenge of achieving accurate recognition\npersists due to motion blur from fast-moving vehicles. Despite the widespread\nuse of image synthesis approaches in existing deblurring and recognition\nalgorithms, their effectiveness in real-world scenarios remains unproven. To\naddress this, we introduce the first large-scale license plate deblurring\ndataset named License Plate Blur (LPBlur), captured by a dual-camera system and\nprocessed through a post-processing pipeline to avoid misalignment issues.\nThen, we propose a License Plate Deblurring Generative Adversarial Network\n(LPDGAN) to tackle the license plate deblurring: 1) a Feature Fusion Module to\nintegrate multi-scale latent codes; 2) a Text Reconstruction Module to restore\nstructure through textual modality; 3) a Partition Discriminator Module to\nenhance the model's perception of details in each letter. Extensive experiments\nvalidate the reliability of the LPBlur dataset for both model training and\ntesting, showcasing that our proposed model outperforms other state-of-the-art\nmotion deblurring methods in realistic license plate deblurring scenarios. 
The\ndataset and code are available at https://github.com/haoyGONG/LPDGAN.\n","authors":["Haoyan Gong","Yuzheng Feng","Zhenrong Zhang","Xianxu Hou","Jingxin Liu","Siqi Huang","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13677v2.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.14696v1","updated":"2024-04-23T02:54:12Z","published":"2024-04-23T02:54:12Z","title":"Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty\n Modeling for Universal Multi-Source Domain Adaptation","summary":" Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from\nmultiple labeled source domains to an unlabeled target domain under domain\nshifts (different data distribution) and class shifts (unknown target classes).\nExisting solutions focus on excavating image features to detect unknown\nsamples, ignoring abundant information contained in textual semantics. In this\npaper, we propose an Adaptive Prompt learning with Negative textual semantics\nand uncErtainty modeling method based on Contrastive Language-Image\nPre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we\nutilize the CLIP with adaptive prompts to leverage textual information of class\nsemantics and domain representations, helping the model identify unknown\nsamples and address domain shifts. Additionally, we design a novel global\ninstance-level alignment objective by utilizing negative textual semantics to\nachieve more precise image-text pair alignment. Furthermore, we propose an\nenergy-based uncertainty modeling strategy to enlarge the margin distance\nbetween known and unknown samples. Extensive experiments demonstrate the\nsuperiority of our proposed method.\n","authors":["Yuxiang Yang","Lu Wen","Yuanyuan Xu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14696v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2308.04956v3","updated":"2024-04-23T02:51:28Z","published":"2023-08-09T13:41:30Z","title":"Improved Cryo-EM Pose Estimation and 3D Classification through\n Latent-Space Disentanglement","summary":" Due to the extremely low signal-to-noise ratio (SNR) and unknown poses\n(projection angles and image shifts) in cryo-electron microscopy (cryo-EM)\nexperiments, reconstructing 3D volumes from 2D images is very challenging. In\naddition to these challenges, heterogeneous cryo-EM reconstruction requires\nconformational classification. In popular cryo-EM reconstruction algorithms,\nposes and conformation classification labels must be predicted for every input\ncryo-EM image, which can be computationally costly for large datasets. An\nemerging class of methods adopted the amortized inference approach. In these\nmethods, only a subset of the input dataset is needed to train neural networks\nfor the estimation of poses and conformations. Once trained, these neural\nnetworks can make pose/conformation predictions and 3D reconstructions at low\ncost for the entire dataset during inference. Unfortunately, when facing\nheterogeneous reconstruction tasks, it is hard for current\namortized-inference-based methods to effectively estimate the conformational\ndistribution and poses from entangled latent variables. Here, we propose a\nself-supervised variational autoencoder architecture called \"HetACUMN\" based on\namortized inference. We employed an auxiliary conditional pose prediction task\nby inverting the order of encoder-decoder to explicitly enforce the\ndisentanglement of conformation and pose predictions. 
Results on simulated\ndatasets show that HetACUMN generated more accurate conformational\nclassifications than other amortized or non-amortized methods. Furthermore, we\nshow that HetACUMN is capable of performing heterogeneous 3D reconstructions of\na real experimental dataset.\n","authors":["Weijie Chen","Yuhang Wang","Lin Yao"],"pdf_url":"https://arxiv.org/pdf/2308.04956v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.14693v1","updated":"2024-04-23T02:50:38Z","published":"2024-04-23T02:50:38Z","title":"Double Privacy Guard: Robust Traceable Adversarial Watermarking against\n Face Recognition","summary":" The wide deployment of Face Recognition (FR) systems poses risks of privacy\nleakage. One countermeasure to address this issue is adversarial attacks, which\ndeceive malicious FR searches but simultaneously interfere the normal identity\nverification of trusted authorizers. In this paper, we propose the first Double\nPrivacy Guard (DPG) scheme based on traceable adversarial watermarking. DPG\nemploys a one-time watermark embedding to deceive unauthorized FR models and\nallows authorizers to perform identity verification by extracting the\nwatermark. Specifically, we propose an information-guided adversarial attack\nagainst FR models. The encoder embeds an identity-specific watermark into the\ndeep feature space of the carrier, guiding recognizable features of the image\nto deviate from the source identity. We further adopt a collaborative\nmeta-optimization strategy compatible with sub-tasks, which regularizes the\njoint optimization direction of the encoder and decoder. This strategy enhances\nthe representation of universal carrier features, mitigating multi-objective\noptimization conflicts in watermarking. Experiments confirm that DPG achieves\nsignificant attack success rates and traceability accuracy on state-of-the-art\nFR models, exhibiting remarkable robustness that outperforms the existing\nprivacy protection methods using adversarial attacks and deep watermarking, or\nsimple combinations of the two. Our work potentially opens up new insights into\nproactive protection for FR privacy.\n","authors":["Yunming Zhang","Dengpan Ye","Sipeng Shen","Caiyun Xie","Ziyi Liu","Jiacheng Deng","Long Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13903v2","updated":"2024-04-23T02:33:48Z","published":"2024-04-22T06:25:17Z","title":"Accelerating Image Generation with Sub-path Linear Approximation Model","summary":" Diffusion models have significantly advanced the state of the art in image,\naudio, and video generation tasks. However, their applications in practical\nscenarios are hindered by slow inference speed. Drawing inspiration from the\napproximation strategies utilized in consistency models, we propose the\nSub-path Linear Approximation Model (SLAM), which accelerates diffusion models\nwhile maintaining high-quality image generation. SLAM treats the PF-ODE\ntrajectory as a series of PF-ODE sub-paths divided by sampled points, and\nharnesses sub-path linear (SL) ODEs to form a progressive and continuous error\nestimation along each individual PF-ODE sub-path. The optimization on such\nSL-ODEs allows SLAM to construct denoising mappings with smaller cumulative\napproximated errors. An efficient distillation method is also developed to\nfacilitate the incorporation of more advanced diffusion models, such as latent\ndiffusion models. 
Our extensive experimental results demonstrate that SLAM\nachieves an efficient training regimen, requiring only 6 A100 GPU days to\nproduce a high-quality generative model capable of 2 to 4-step generation with\nhigh performance. Comprehensive evaluations on LAION, MS COCO 2014, and MS COCO\n2017 datasets also illustrate that SLAM surpasses existing acceleration methods\nin few-step generation tasks, achieving state-of-the-art performance both on\nFID and the quality of the generated images.\n","authors":["Chen Xu","Tianhui Song","Weixin Feng","Xubin Li","Tiezheng Ge","Bo Zheng","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14687v1","updated":"2024-04-23T02:32:57Z","published":"2024-04-23T02:32:57Z","title":"Pegasus-v1 Technical Report","summary":" This technical report introduces Pegasus-1, a multimodal language model\nspecialized in video content understanding and interaction through natural\nlanguage. Pegasus-1 is designed to address the unique challenges posed by video\ndata, such as interpreting spatiotemporal information, to offer nuanced video\ncontent comprehension across various lengths. This technical report overviews\nPegasus-1's architecture, training strategies, and its performance in\nbenchmarks on video conversation, zero-shot video question answering, and video\nsummarization. We also explore qualitative characteristics of Pegasus-1 ,\ndemonstrating its capabilities as well as its limitations, in order to provide\nreaders a balanced view of its current state and its future direction.\n","authors":["Raehyuk Jung","Hyojun Go","Jaehyuk Yi","Jiho Jang","Daniel Kim","Jay Suh","Aiden Lee","Cooper Han","Jae Lee","Jeff Kim","Jin-Young Kim","Junwan Kim","Kyle Park","Lucas Lee","Mars Ha","Minjoon Seo","Abraham Jo","Ed Park","Hassan Kianinejad","SJ Kim","Tony Moon","Wade Jeong","Andrei Popescu","Esther Kim","EK Yoon","Genie Heo","Henry Choi","Jenna Kang","Kevin Han","Noah Seo","Sunny Nguyen","Ryan Won","Yeonhoo Park","Anthony Giuliani","Dave Chung","Hans Yoon","James Le","Jenny Ahn","June Lee","Maninder Saini","Meredith Sanders","Soyoung Lee","Sue Kim","Travis Couture"],"pdf_url":"https://arxiv.org/pdf/2404.14687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14678v1","updated":"2024-04-23T02:06:10Z","published":"2024-04-23T02:06:10Z","title":"3DBench: A Scalable 3D Benchmark and Instruction-Tuning Dataset","summary":" Evaluating the performance of Multi-modal Large Language Models (MLLMs),\nintegrating both point cloud and language, presents significant challenges. The\nlack of a comprehensive assessment hampers determining whether these models\ntruly represent advancements, thereby impeding further progress in the field.\nCurrent evaluations heavily rely on classification and caption tasks, falling\nshort in providing a thorough assessment of MLLMs. A pressing need exists for a\nmore sophisticated evaluation method capable of thoroughly analyzing the\nspatial understanding and expressive capabilities of these models. To address\nthese issues, we introduce a scalable 3D benchmark, accompanied by a\nlarge-scale instruction-tuning dataset known as 3DBench, providing an\nextensible platform for a comprehensive evaluation of MLLMs. Specifically, we\nestablish the benchmark that spans a wide range of spatial and semantic scales,\nfrom object-level to scene-level, addressing both perception and planning\ntasks. 
Furthermore, we present a rigorous pipeline for automatically\nconstructing scalable 3D instruction-tuning datasets, covering 10 diverse\nmulti-modal tasks with more than 0.23 million QA pairs generated in total.\nThorough experiments evaluating trending MLLMs, comparisons against existing\ndatasets, and variations of training protocols demonstrate the superiority of\n3DBench, offering valuable insights into current limitations and potential\nresearch directions.\n","authors":["Junjie Zhang","Tianci Hu","Xiaoshui Huang","Yongshun Gong","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.14678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14676v1","updated":"2024-04-23T02:04:53Z","published":"2024-04-23T02:04:53Z","title":"DreamPBR: Text-driven Generation of High-resolution SVBRDF with\n Multi-modal Guidance","summary":" Prior material creation methods had limitations in producing diverse results\nmainly because reconstruction-based methods relied on real-world measurements\nand generation-based methods were trained on relatively small material\ndatasets. To address these challenges, we propose DreamPBR, a novel\ndiffusion-based generative framework designed to create spatially-varying\nappearance properties guided by text and multi-modal controls, providing high\ncontrollability and diversity in material generation. Key to achieving diverse\nand high-quality PBR material generation lies in integrating the capabilities\nof recent large-scale vision-language models trained on billions of text-image\npairs, along with material priors derived from hundreds of PBR material\nsamples. We utilize a novel material Latent Diffusion Model (LDM) to establish\nthe mapping between albedo maps and the corresponding latent space. The latent\nrepresentation is then decoded into full SVBRDF parameter maps using a\nrendering-aware PBR decoder. Our method supports tileable generation through\nconvolution with circular padding. Furthermore, we introduce a multi-modal\nguidance module, which includes pixel-aligned guidance, style image guidance,\nand 3D shape guidance, to enhance the control capabilities of the material LDM.\nWe demonstrate the effectiveness of DreamPBR in material creation, showcasing\nits versatility and user-friendliness on a wide range of controllable\ngeneration and editing applications.\n","authors":["Linxuan Xin","Zheng Zhang","Jinfu Wei","Ge Li","Duan Gao"],"pdf_url":"https://arxiv.org/pdf/2404.14676v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2404.14674v1","updated":"2024-04-23T02:00:58Z","published":"2024-04-23T02:00:58Z","title":"HOIN: High-Order Implicit Neural Representations","summary":" Implicit neural representations (INR) suffer from worsening spectral bias,\nwhich results in overly smooth solutions to the inverse problem. To deal with\nthis problem, we propose a universal framework for processing inverse problems\ncalled \\textbf{High-Order Implicit Neural Representations (HOIN)}. By refining\nthe traditional cascade structure to foster high-order interactions among\nfeatures, HOIN enhances the model's expressive power and mitigates spectral\nbias through its neural tangent kernel's (NTK) strong diagonal properties,\naccelerating and optimizing inverse problem resolution. By analyzing the\nmodel's expression space, high-order derivatives, and the NTK matrix, we\ntheoretically validate the feasibility of HOIN. 
HOIN realizes 1 to 3 dB\nimprovements in most inverse problems, establishing a new state-of-the-art\nrecovery quality and training efficiency, thus providing a new general paradigm\nfor INR and paving the way for it to solve the inverse problem.\n","authors":["Yang Chen","Ruituo Wu","Yipeng Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.14674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14671v1","updated":"2024-04-23T01:55:09Z","published":"2024-04-23T01:55:09Z","title":"LaneCorrect: Self-supervised Lane Detection","summary":" Lane detection has evolved highly functional autonomous driving system to\nunderstand driving scenes even under complex environments. In this paper, we\nwork towards developing a generalized computer vision system able to detect\nlanes without using any annotation. We make the following contributions: (i) We\nillustrate how to perform unsupervised 3D lane segmentation by leveraging the\ndistinctive intensity of lanes on the LiDAR point cloud frames, and then obtain\nthe noisy lane labels in the 2D plane by projecting the 3D points; (ii) We\npropose a novel self-supervised training scheme, dubbed LaneCorrect, that\nautomatically corrects the lane label by learning geometric consistency and\ninstance awareness from the adversarial augmentations; (iii) With the\nself-supervised pre-trained model, we distill to train a student network for\narbitrary target lane (e.g., TuSimple) detection without any human labels; (iv)\nWe thoroughly evaluate our self-supervised method on four major lane detection\nbenchmarks (including TuSimple, CULane, CurveLanes and LLAMAS) and demonstrate\nexcellent performance compared with existing supervised counterpart, whilst\nshowing more effective results on alleviating the domain gap, i.e., training on\nCULane and test on TuSimple.\n","authors":["Ming Nie","Xinyue Cai","Hang Xu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14667v1","updated":"2024-04-23T01:51:58Z","published":"2024-04-23T01:51:58Z","title":"3DFlowRenderer: One-shot Face Re-enactment via Dense 3D Facial Flow\n Estimation","summary":" Performing facial expression transfer under one-shot setting has been\nincreasing in popularity among research community with a focus on precise\ncontrol of expressions. Existing techniques showcase compelling results in\nperceiving expressions, but they lack robustness with extreme head poses. They\nalso struggle to accurately reconstruct background details, thus hindering the\nrealism. In this paper, we propose a novel warping technology which integrates\nthe advantages of both 2D and 3D methods to achieve robust face re-enactment.\nWe generate dense 3D facial flow fields in feature space to warp an input image\nbased on target expressions without depth information. This enables explicit 3D\ngeometric control for re-enacting misaligned source and target faces. We\nregularize the motion estimation capability of the 3D flow prediction network\nthrough proposed \"Cyclic warp loss\" by converting warped 3D features back into\n2D RGB space. To ensure the generation of finer facial region with\nnatural-background, our framework only renders the facial foreground region\nfirst and learns to inpaint the blank area which needs to be filled due to\nsource face translation, thus reconstructing the detailed background without\nany unwanted pixel motion. 
Extensive evaluation reveals that our method\noutperforms state-of-the-art techniques in rendering artifact-free facial\nimages.\n","authors":["Siddharth Nijhawan","Takuya Yashima","Tamaki Kojima"],"pdf_url":"https://arxiv.org/pdf/2404.14667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02162v2","updated":"2024-04-23T01:48:32Z","published":"2023-04-04T23:27:02Z","title":"Learning to Recover Spectral Reflectance from RGB Images","summary":" This paper tackles spectral reflectance recovery (SRR) from RGB images. Since\ncapturing ground-truth spectral reflectance and camera spectral sensitivity are\nchallenging and costly, most existing approaches are trained on synthetic\nimages and utilize the same parameters for all unseen testing images, which are\nsuboptimal especially when the trained models are tested on real images because\nthey never exploit the internal information of the testing images. To address\nthis issue, we adopt a self-supervised meta-auxiliary learning (MAXL) strategy\nthat fine-tunes the well-trained network parameters with each testing image to\ncombine external with internal information. To the best of our knowledge, this\nis the first work that successfully adapts the MAXL strategy to this problem.\nInstead of relying on naive end-to-end training, we also propose a novel\narchitecture that integrates the physical relationship between the spectral\nreflectance and the corresponding RGB images into the network based on our\nmathematical analysis. Besides, since the spectral reflectance of a scene is\nindependent to its illumination while the corresponding RGB images are not, we\nrecover the spectral reflectance of a scene from its RGB images captured under\nmultiple illuminations to further reduce the unknown. Qualitative and\nquantitative evaluations demonstrate the effectiveness of our proposed network\nand of the MAXL. Our code and data are available at\nhttps://github.com/Dong-Huo/SRR-MAXL.\n","authors":["Dong Huo","Jian Wang","Yiming Qian","Yee-Hong Yang"],"pdf_url":"https://arxiv.org/pdf/2304.02162v2.pdf","comment":"IEEE Transactions on Image Processing (TIP), 2024"},{"id":"http://arxiv.org/abs/2404.14661v1","updated":"2024-04-23T01:45:55Z","published":"2024-04-23T01:45:55Z","title":"First Mapping the Canopy Height of Primeval Forests in the Tallest Tree\n Area of Asia","summary":" We have developed the world's first canopy height map of the distribution\narea of world-level giant trees. This mapping is crucial for discovering more\nindividual and community world-level giant trees, and for analyzing and\nquantifying the effectiveness of biodiversity conservation measures in the\nYarlung Tsangpo Grand Canyon (YTGC) National Nature Reserve. We proposed a\nmethod to map the canopy height of the primeval forest within the world-level\ngiant tree distribution area by using a spaceborne LiDAR fusion satellite\nimagery (Global Ecosystem Dynamics Investigation (GEDI), ICESat-2, and\nSentinel-2) driven deep learning modeling. And we customized a pyramid\nreceptive fields depth separable CNN (PRFXception). PRFXception, a CNN\narchitecture specifically customized for mapping primeval forest canopy height\nto infer the canopy height at the footprint level of GEDI and ICESat-2 from\nSentinel-2 optical imagery with a 10-meter spatial resolution. We conducted a\nfield survey of 227 permanent plots using a stratified sampling method and\nmeasured several giant trees using UAV-LS. 
The predicted canopy height was\ncompared with ICESat-2 and GEDI validation data (RMSE =7.56 m, MAE=6.07 m,\nME=-0.98 m, R^2=0.58 m), UAV-LS point clouds (RMSE =5.75 m, MAE =3.72 m, ME =\n0.82 m, R^2= 0.65 m), and ground survey data (RMSE = 6.75 m, MAE = 5.56 m, ME=\n2.14 m, R^2=0.60 m). We mapped the potential distribution map of world-level\ngiant trees and discovered two previously undetected giant tree communities\nwith an 89% probability of having trees 80-100 m tall, potentially taller than\nAsia's tallest tree. This paper provides scientific evidence confirming\nsoutheastern Tibet--northwestern Yunnan as the fourth global distribution\ncenter of world-level giant trees initiatives and promoting the inclusion of\nthe YTGC giant tree distribution area within the scope of China's national park\nconservation.\n","authors":["Guangpeng Fan","Fei Yan","Xiangquan Zeng","Qingtao Xu","Ruoyoulan Wang","Binghong Zhang","Jialing Zhou","Liangliang Nan","Jinhu Wang","Zhiwei Zhang","Jia Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v4","updated":"2024-04-23T01:40:57Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation. Our project page is at\nhttps://adi-t2i.github.io/ADI.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.14657v1","updated":"2024-04-23T01:34:20Z","published":"2024-04-23T01:34:20Z","title":"Progressive Token Length Scaling in Transformer Encoders for Efficient\n Universal Segmentation","summary":" A powerful architecture for universal segmentation relies on transformers\nthat encode multi-scale image features and decode object queries into mask\npredictions. With efficiency being a high priority for scaling such models, we\nobserved that the state-of-the-art method Mask2Former uses ~50% of its compute\nonly on the transformer encoder. 
This is due to the retention of a full-length\ntoken-level representation of all backbone feature scales at each encoder\nlayer. With this observation, we propose a strategy termed PROgressive Token\nLength SCALing for Efficient transformer encoders (PRO-SCALE) that can be\nplugged-in to the Mask2Former-style segmentation architectures to significantly\nreduce the computational cost. The underlying principle of PRO-SCALE is:\nprogressively scale the length of the tokens with the layers of the encoder.\nThis allows PRO-SCALE to reduce computations by a large margin with minimal\nsacrifice in performance (~52% GFLOPs reduction with no drop in performance on\nCOCO dataset). We validate our framework on multiple public benchmarks.\n","authors":["Abhishek Aich","Yumin Suh","Samuel Schulter","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.14657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12368v2","updated":"2024-04-23T01:21:58Z","published":"2024-04-18T17:50:23Z","title":"Gradient-Regularized Out-of-Distribution Detection","summary":" One of the challenges for neural networks in real-life applications is the\noverconfident errors these models make when the data is not from the original\ntraining distribution.\n Addressing this issue is known as Out-of-Distribution (OOD) detection.\n Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate\nfor OOD data during training to achieve improved performance.\n However, these methods fail to fully exploit the local information embedded\nin the auxiliary dataset.\n In this work, we propose the idea of leveraging the information embedded in\nthe gradient of the loss function during training to enable the network to not\nonly learn a desired OOD score for each sample but also to exhibit similar\nbehavior in a local neighborhood around each sample.\n We also develop a novel energy-based sampling method to allow the network to\nbe exposed to more informative OOD samples during the training phase. This is\nespecially important when the auxiliary dataset is large. We demonstrate the\neffectiveness of our method through extensive experiments on several OOD\nbenchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet\nexperiment.\n We further provide a theoretical analysis through the lens of certified\nrobustness and Lipschitz analysis to showcase the theoretical foundation of our\nwork. We will publicly release our code after the review process.\n","authors":["Sina Sharifi","Taha Entesari","Bardia Safaei","Vishal M. Patel","Mahyar Fazlyab"],"pdf_url":"https://arxiv.org/pdf/2404.12368v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.14653v1","updated":"2024-04-23T01:19:19Z","published":"2024-04-23T01:19:19Z","title":"Machine Vision Based Assessment of Fall Color Changes in Apple Trees:\n Exploring Relationship with Leaf Nitrogen Concentration","summary":" Apple trees being deciduous trees, shed leaves each year which is preceded by\nthe change in color of leaves from green to yellow (also known as senescence)\nduring the fall season. The rate and timing of color change are affected by the\nnumber of factors including nitrogen (N) deficiencies. The green color of\nleaves is highly dependent on the chlorophyll content, which in turn depends on\nthe nitrogen concentration in the leaves. The assessment of the leaf color can\ngive vital information on the nutrient status of the tree. 
The use of a machine\nvision based system to capture and quantify these timings and changes in leaf\ncolor can be a great tool for that purpose.\n \\par This study is based on data collected during the fall of 2021 and 2023\nat a commercial orchard using a ground-based stereo-vision sensor for five\nweeks. The point cloud obtained from the sensor was segmented to get just the\ntree in the foreground. The study involved the segmentation of the trees in a\nnatural background using point cloud data and quantification of the color using\na custom-defined metric, \\textit{yellowness index}, varying from $-1$ to $+1$\n($-1$ being completely green and $+1$ being completely yellow), which gives the\nproportion of yellow leaves on a tree. The performance of K-means based\nalgorithm and gradient boosting algorithm were compared for \\textit{yellowness\nindex} calculation. The segmentation method proposed in the study was able to\nestimate the \\textit{yellowness index} on the trees with $R^2 = 0.72$. The\nresults showed that the metric was able to capture the gradual color transition\nfrom green to yellow over the study duration. It was also observed that the\ntrees with lower nitrogen showed the color transition to yellow earlier than\nthe trees with higher nitrogen. The onset of color transition during both years\naligned with the $29^{th}$ week post-full bloom.\n","authors":["Achyut Paudel","Jostan Brown","Priyanka Upadhyaya","Atif Bilal Asad","Safal Kshetri","Manoj Karkee","Joseph R. Davidson","Cindy Grimm","Ashley Thompson"],"pdf_url":"https://arxiv.org/pdf/2404.14653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14634v1","updated":"2024-04-23T00:18:00Z","published":"2024-04-23T00:18:00Z","title":"UPose3D: Uncertainty-Aware 3D Human Pose Estimation with Cross-View and\n Temporal Cues","summary":" We introduce UPose3D, a novel approach for multi-view 3D human pose\nestimation, addressing challenges in accuracy and scalability. Our method\nadvances existing pose estimation frameworks by improving robustness and\nflexibility without requiring direct 3D annotations. At the core of our method,\na pose compiler module refines predictions from a 2D keypoints estimator that\noperates on a single image by leveraging temporal and cross-view information.\nOur novel cross-view fusion strategy is scalable to any number of cameras,\nwhile our synthetic data generation strategy ensures generalization across\ndiverse actors, scenes, and viewpoints. Finally, UPose3D leverages the\nprediction uncertainty of both the 2D keypoint estimator and the pose compiler\nmodule. This provides robustness to outliers and noisy data, resulting in\nstate-of-the-art performance in out-of-distribution settings. In addition, for\nin-distribution settings, UPose3D yields a performance rivaling methods that\nrely on 3D annotated data, while being the state-of-the-art among methods\nrelying only on 2D supervision.\n","authors":["Vandad Davoodnia","Saeed Ghorbani","Marc-André Carbonneau","Alexandre Messier","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2404.14634v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.15564v1","updated":"2024-04-23T23:26:02Z","published":"2024-04-23T23:26:02Z","title":"Guided AbsoluteGrad: Magnitude of Gradients Matters to Explanation's\n Localization and Saliency","summary":" This paper proposes a new gradient-based XAI method called Guided\nAbsoluteGrad for saliency map explanations. 
We utilize both positive and\nnegative gradient magnitudes and employ gradient variance to distinguish the\nimportant areas for noise deduction. We also introduce a novel evaluation\nmetric named ReCover And Predict (RCAP), which considers the Localization and\nVisual Noise Level objectives of the explanations. We propose two propositions\nfor these two objectives and prove the necessity of evaluating them. We\nevaluate Guided AbsoluteGrad with seven gradient-based XAI methods using the\nRCAP metric and other SOTA metrics in three case studies: (1) ImageNet dataset\nwith ResNet50 model; (2) International Skin Imaging Collaboration (ISIC)\ndataset with EfficientNet model; (3) the Places365 dataset with DenseNet161\nmodel. Our method surpasses other gradient-based approaches, showcasing the\nquality of enhanced saliency map explanations through gradient magnitude.\n","authors":["Jun Huang","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.15564v1.pdf","comment":"CAI2024 Camera-ready Submission"},{"id":"http://arxiv.org/abs/2306.16699v3","updated":"2024-04-23T23:20:41Z","published":"2023-06-29T05:49:07Z","title":"Rapid-INR: Storage Efficient CPU-free DNN Training Using Implicit Neural\n Representation","summary":" Implicit Neural Representation (INR) is an innovative approach for\nrepresenting complex shapes or objects without explicitly defining their\ngeometry or surface structure. Instead, INR represents objects as continuous\nfunctions. Previous research has demonstrated the effectiveness of using neural\nnetworks as INR for image compression, showcasing comparable performance to\ntraditional methods such as JPEG. However, INR holds potential for various\napplications beyond image compression. This paper introduces Rapid-INR, a novel\napproach that utilizes INR for encoding and compressing images, thereby\naccelerating neural network training in computer vision tasks. Our methodology\ninvolves storing the whole dataset directly in INR format on a GPU, mitigating\nthe significant data communication overhead between the CPU and GPU during\ntraining. Additionally, the decoding process from INR to RGB format is highly\nparallelized and executed on-the-fly. To further enhance compression, we\npropose iterative and dynamic pruning, as well as layer-wise quantization,\nbuilding upon previous work. We evaluate our framework on the image\nclassification task, utilizing the ResNet-18 backbone network and three\ncommonly used datasets with varying image sizes. Rapid-INR reduces memory\nconsumption to only about 5% of the original dataset size in RGB format and\nachieves a maximum 6$\\times$ speedup over the PyTorch training pipeline, as\nwell as a maximum 1.2x speedup over the DALI training pipeline, with only a\nmarginal decrease in accuracy. Importantly, Rapid-INR can be readily applied to\nother computer vision tasks and backbone networks with reasonable engineering\nefforts. 
Our implementation code is publicly available at\nhttps://github.com/sharc-lab/Rapid-INR.\n","authors":["Hanqiu Chen","Hang Yang","Stephen Fitzmeyer","Cong Hao"],"pdf_url":"https://arxiv.org/pdf/2306.16699v3.pdf","comment":"Accepted by ICCAD 2023"},{"id":"http://arxiv.org/abs/2404.15552v1","updated":"2024-04-23T22:54:51Z","published":"2024-04-23T22:54:51Z","title":"Cross-Temporal Spectrogram Autoencoder (CTSAE): Unsupervised\n Dimensionality Reduction for Clustering Gravitational Wave Glitches","summary":" The advancement of The Laser Interferometer Gravitational-Wave Observatory\n(LIGO) has significantly enhanced the feasibility and reliability of\ngravitational wave detection. However, LIGO's high sensitivity makes it\nsusceptible to transient noises known as glitches, which necessitate effective\ndifferentiation from real gravitational wave signals. Traditional approaches\npredominantly employ fully supervised or semi-supervised algorithms for the\ntask of glitch classification and clustering. In the future task of identifying\nand classifying glitches across main and auxiliary channels, it is impractical\nto build a dataset with manually labeled ground-truth. In addition, the\npatterns of glitches can vary with time, generating new glitches without manual\nlabels. In response to this challenge, we introduce the Cross-Temporal\nSpectrogram Autoencoder (CTSAE), a pioneering unsupervised method for the\ndimensionality reduction and clustering of gravitational wave glitches. CTSAE\nintegrates a novel four-branch autoencoder with a hybrid of Convolutional\nNeural Networks (CNN) and Vision Transformers (ViT). To further extract\nfeatures across multi-branches, we introduce a novel multi-branch fusion method\nusing the CLS (Class) token. Our model, trained and evaluated on the GravitySpy\nO3 dataset on the main channel, demonstrates superior performance in clustering\ntasks when compared to state-of-the-art semi-supervised learning methods. To\nthe best of our knowledge, CTSAE represents the first unsupervised approach\ntailored specifically for clustering LIGO data, marking a significant step\nforward in the field of gravitational wave research. The code of this paper is\navailable at https://github.com/Zod-L/CTSAE\n","authors":["Yi Li","Yunan Wu","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2404.15552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15532v1","updated":"2024-04-23T21:37:22Z","published":"2024-04-23T21:37:22Z","title":"BattleAgent: Multi-modal Dynamic Emulation on Historical Battles to\n Complement Historical Analysis","summary":" This paper presents BattleAgent, an emulation system that combines the Large\nVision-Language Model and Multi-agent System. This novel system aims to\nsimulate complex dynamic interactions among multiple agents, as well as between\nagents and their environments, over a period of time. It emulates both the\ndecision-making processes of leaders and the viewpoints of ordinary\nparticipants, such as soldiers. The emulation showcases the current\ncapabilities of agents, featuring fine-grained multi-modal interactions between\nagents and landscapes. It develops customizable agent structures to meet\nspecific situational requirements, for example, a variety of battle-related\nactivities like scouting and trench digging. 
These components collaborate to\nrecreate historical events in a lively and comprehensive manner while offering\ninsights into the thoughts and feelings of individuals from diverse viewpoints.\nThe technological foundations of BattleAgent establish detailed and immersive\nsettings for historical battles, enabling individual agents to partake in,\nobserve, and dynamically respond to evolving battle scenarios. This methodology\nholds the potential to substantially deepen our understanding of historical\nevents, particularly through individual accounts. Such initiatives can also aid\nhistorical research, as conventional historical narratives often lack\ndocumentation and prioritize the perspectives of decision-makers, thereby\noverlooking the experiences of ordinary individuals. BattelAgent illustrates\nAI's potential to revitalize the human aspect in crucial social events, thereby\nfostering a more nuanced collective understanding and driving the progressive\ndevelopment of human society.\n","authors":["Shuhang Lin","Wenyue Hua","Lingyao Li","Che-Jui Chang","Lizhou Fan","Jianchao Ji","Hang Hua","Mingyu Jin","Jiebo Luo","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15532v1.pdf","comment":"26 pages, 14 figures The data and code for this project are\n accessible at https://github.com/agiresearch/battleagent"},{"id":"http://arxiv.org/abs/2404.15523v1","updated":"2024-04-23T21:11:30Z","published":"2024-04-23T21:11:30Z","title":"Understanding Hyperbolic Metric Learning through Hard Negative Sampling","summary":" In recent years, there has been a growing trend of incorporating hyperbolic\ngeometry methods into computer vision. While these methods have achieved\nstate-of-the-art performance on various metric learning tasks using hyperbolic\ndistance measurements, the underlying theoretical analysis supporting this\nsuperior performance remains under-exploited. In this study, we investigate the\neffects of integrating hyperbolic space into metric learning, particularly when\ntraining with contrastive loss. We identify a need for a comprehensive\ncomparison between Euclidean and hyperbolic spaces regarding the temperature\neffect in the contrastive loss within the existing literature. To address this\ngap, we conduct an extensive investigation to benchmark the results of Vision\nTransformers (ViTs) using a hybrid objective function that combines loss from\nEuclidean and hyperbolic spaces. Additionally, we provide a theoretical\nanalysis of the observed performance improvement. We also reveal that\nhyperbolic metric learning is highly related to hard negative sampling,\nproviding insights for future work. This work will provide valuable data points\nand experience in understanding hyperbolic image embeddings. To shed more light\non problem-solving and encourage further investigation into our approach, our\ncode is available online (https://github.com/YunYunY/HypMix).\n","authors":["Yun Yue","Fangzhou Lin","Guanyi Mou","Ziming Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15523v1.pdf","comment":"published in Proceedings of the IEEE/CVF Winter Conference on\n Applications of Computer Vision. 2024. 
arXiv admin note: text overlap with\n arXiv:2203.10833 by other authors"},{"id":"http://arxiv.org/abs/2404.10156v2","updated":"2024-04-23T21:00:39Z","published":"2024-04-15T22:12:05Z","title":"SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation","summary":" The adoption of Vision Transformers (ViTs) based architectures represents a\nsignificant advancement in 3D Medical Image (MI) segmentation, surpassing\ntraditional Convolutional Neural Network (CNN) models by enhancing global\ncontextual understanding. While this paradigm shift has significantly enhanced\n3D segmentation performance, state-of-the-art architectures require extremely\nlarge and complex architectures with large scale computing resources for\ntraining and deployment. Furthermore, in the context of limited datasets, often\nencountered in medical imaging, larger models can present hurdles in both model\ngeneralization and convergence. In response to these challenges and to\ndemonstrate that lightweight models are a valuable area of research in 3D\nmedical imaging, we present SegFormer3D, a hierarchical Transformer that\ncalculates attention across multiscale volumetric features. Additionally,\nSegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate\nlocal and global attention features to produce highly accurate segmentation\nmasks. The proposed memory efficient Transformer preserves the performance\ncharacteristics of a significantly larger model in a compact design.\nSegFormer3D democratizes deep learning for 3D medical image segmentation by\noffering a model with 33x less parameters and a 13x reduction in GFLOPS\ncompared to the current state-of-the-art (SOTA). We benchmark SegFormer3D\nagainst the current SOTA models on three widely used datasets Synapse, BRaTs,\nand ACDC, achieving competitive results. Code:\nhttps://github.com/OSUPCVLab/SegFormer3D.git\n","authors":["Shehan Perera","Pouyan Navard","Alper Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2404.10156v2.pdf","comment":"Accepted at CVPR Workshop 2024"},{"id":"http://arxiv.org/abs/2404.15516v1","updated":"2024-04-23T21:00:22Z","published":"2024-04-23T21:00:22Z","title":"Visual Delta Generator with Large Multi-modal Models for Semi-supervised\n Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) is a task that retrieves images similar to a\nquery, based on a provided textual modification. Current techniques rely on\nsupervised learning for CIR models using labeled triplets of the reference\nimage, text, target image. These specific triplets are not as commonly\navailable as simple image-text pairs, limiting the widespread use of CIR and\nits scalability. On the other hand, zero-shot CIR can be relatively easily\ntrained with image-caption pairs without considering the image-to-image\nrelation, but this approach tends to yield lower accuracy. We propose a new\nsemi-supervised CIR approach where we search for a reference and its related\ntarget images in auxiliary data and learn our large language model-based Visual\nDelta Generator (VDG) to generate text describing the visual difference (i.e.,\nvisual delta) between the two. VDG, equipped with fluent language knowledge and\nbeing model agnostic, can generate pseudo triplets to boost the performance of\nCIR models. 
Our approach significantly improves the existing supervised\nlearning approaches and achieves state-of-the-art results on the CIR\nbenchmarks.\n","authors":["Young Kyun Jang","Donghyun Kim","Zihang Meng","Dat Huynh","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.15516v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.08939v3","updated":"2024-04-23T20:33:38Z","published":"2023-07-18T03:12:03Z","title":"Runtime Stealthy Perception Attacks against DNN-based Adaptive Cruise\n Control Systems","summary":" Adaptive Cruise Control (ACC) is a widely used driver assistance technology\nfor maintaining the desired speed and safe distance to the leading vehicle.\nThis paper evaluates the security of the deep neural network (DNN) based ACC\nsystems under runtime stealthy perception attacks that strategically inject\nperturbations into camera data to cause forward collisions. We present a\ncontext-aware strategy for the selection of the most critical times for\ntriggering the attacks and a novel optimization-based method for the adaptive\ngeneration of image perturbations at runtime. We evaluate the effectiveness of\nthe proposed attack using an actual vehicle, a publicly available driving\ndataset, and a realistic simulation platform with the control software from a\nproduction ACC system, a physical-world driving simulator, and interventions by\nthe human driver and safety features such as Advanced Emergency Braking System\n(AEBS). Experimental results show that the proposed attack achieves 142.9 times\nhigher success rate in causing hazards and 89.6% higher evasion rate than\nbaselines, while being stealthy and robust to real-world factors and dynamic\nchanges in the environment. This study highlights the role of human drivers and\nbasic safety mechanisms in preventing attacks.\n","authors":["Xugui Zhou","Anqi Chen","Maxfield Kouzel","Haotian Ren","Morgan McCarty","Cristina Nita-Rotaru","Homa Alemzadeh"],"pdf_url":"https://arxiv.org/pdf/2307.08939v3.pdf","comment":"19 pages, 23 figures, 11 tables"},{"id":"http://arxiv.org/abs/2312.00834v2","updated":"2024-04-23T19:36:04Z","published":"2023-11-30T22:58:30Z","title":"AV-RIR: Audio-Visual Room Impulse Response Estimation","summary":" Accurate estimation of Room Impulse Response (RIR), which captures an\nenvironment's acoustic properties, is important for speech processing and AR/VR\napplications. We propose AV-RIR, a novel multi-modal multi-task learning\napproach to accurately estimate the RIR from a given reverberant speech signal\nand the visual cues of its corresponding environment. AV-RIR builds on a novel\nneural codec-based architecture that effectively captures environment geometry\nand materials properties and solves speech dereverberation as an auxiliary task\nby using multi-task learning. We also propose Geo-Mat features that augment\nmaterial information into visual cues and CRIP that improves late reverberation\ncomponents in the estimated RIR via image-to-RIR retrieval by 86%. Empirical\nresults show that AV-RIR quantitatively outperforms previous audio-only and\nvisual-only approaches by achieving 36% - 63% improvement across various\nacoustic metrics in RIR estimation. Additionally, it also achieves higher\npreference scores in human evaluation. As an auxiliary benefit, dereverbed\nspeech from AV-RIR shows competitive performance with the state-of-the-art in\nvarious spoken language processing tasks and outperforms reverberation time\nerror score in the real-world AVSpeech dataset. 
Qualitative examples of both\nsynthesized reverberant speech and enhanced speech can be found at\nhttps://www.youtube.com/watch?v=tTsKhviukAE.\n","authors":["Anton Ratnarajah","Sreyan Ghosh","Sonal Kumar","Purva Chiniya","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.00834v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2312.08555v3","updated":"2024-04-23T19:31:25Z","published":"2023-12-13T23:00:48Z","title":"KDAS: Knowledge Distillation via Attention Supervision Framework for\n Polyp Segmentation","summary":" Polyp segmentation, a contentious issue in medical imaging, has seen numerous\nproposed methods aimed at improving the quality of segmented masks. While\ncurrent state-of-the-art techniques yield impressive results, the size and\ncomputational cost of these models create challenges for practical industry\napplications. To address this challenge, we present KDAS, a Knowledge\nDistillation framework that incorporates attention supervision, and our\nproposed Symmetrical Guiding Module. This framework is designed to facilitate a\ncompact student model with fewer parameters, allowing it to learn the strengths\nof the teacher model and mitigate the inconsistency between teacher features\nand student features, a common challenge in Knowledge Distillation, via the\nSymmetrical Guiding Module. Through extensive experiments, our compact models\ndemonstrate their strength by achieving competitive results with\nstate-of-the-art methods, offering a promising approach to creating compact\nmodels with high accuracy for polyp segmentation and in the medical imaging\nfield. The implementation is available on https://github.com/huyquoctrinh/KDAS.\n","authors":["Quoc-Huy Trinh","Minh-Van Nguyen","Phuoc-Thao Vo Thi"],"pdf_url":"https://arxiv.org/pdf/2312.08555v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08561v2","updated":"2024-04-23T19:19:35Z","published":"2024-04-12T16:00:03Z","title":"IDD-X: A Multi-View Dataset for Ego-relative Important Object\n Localization and Explanation in Dense and Unstructured Traffic","summary":" Intelligent vehicle systems require a deep understanding of the interplay\nbetween road conditions, surrounding entities, and the ego vehicle's driving\nbehavior for safe and efficient navigation. This is particularly critical in\ndeveloping countries where traffic situations are often dense and unstructured\nwith heterogeneous road occupants. Existing datasets, predominantly geared\ntowards structured and sparse traffic scenarios, fall short of capturing the\ncomplexity of driving in such environments. To fill this gap, we present IDD-X,\na large-scale dual-view driving video dataset. With 697K bounding boxes, 9K\nimportant object tracks, and 1-12 objects per video, IDD-X offers comprehensive\nego-relative annotations for multiple important road objects covering 10\ncategories and 19 explanation label categories. The dataset also incorporates\nrearview information to provide a more complete representation of the driving\nenvironment. We also introduce custom-designed deep networks aimed at multiple\nimportant object localization and per-object explanation prediction. Overall,\nour dataset and introduced prediction models form the foundation for studying\nhow road conditions and surrounding entities affect driving behavior in complex\ntraffic situations.\n","authors":["Chirag Parikh","Rohit Saluja","C. V. 
Jawahar","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2404.08561v2.pdf","comment":"Accepted at ICRA 2024; Project page: https://idd-x.github.io/"},{"id":"http://arxiv.org/abs/2404.15451v1","updated":"2024-04-23T18:46:07Z","published":"2024-04-23T18:46:07Z","title":"CFPFormer: Feature-pyramid like Transformer Decoder for Segmentation and\n Detection","summary":" Feature pyramids have been widely adopted in convolutional neural networks\n(CNNs) and transformers for tasks like medical image segmentation and object\ndetection. However, the currently existing models generally focus on the\nEncoder-side Transformer to extract features, from which decoder improvement\ncan bring further potential with well-designed architecture. We propose\nCFPFormer, a novel decoder block that integrates feature pyramids and\ntransformers. Specifically, by leveraging patch embedding, cross-layer feature\nconcatenation, and Gaussian attention mechanisms, CFPFormer enhances feature\nextraction capabilities while promoting generalization across diverse tasks.\nBenefiting from Transformer structure and U-shaped Connections, our introduced\nmodel gains the ability to capture long-range dependencies and effectively\nup-sample feature maps. Our model achieves superior performance in detecting\nsmall objects compared to existing methods. We evaluate CFPFormer on medical\nimage segmentation datasets and object detection benchmarks (VOC 2007, VOC2012,\nMS-COCO), demonstrating its effectiveness and versatility. On the ACDC\nPost-2017-MICCAI-Challenge online test set, our model reaches exceptionally\nimpressive accuracy, and performed well compared with the original decoder\nsetting in Synapse multi-organ segmentation dataset.\n","authors":["Hongyi Cai","Mohammad Mahdinur Rahman","Jingyu Wu","Yulun Deng"],"pdf_url":"https://arxiv.org/pdf/2404.15451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13651v3","updated":"2024-04-23T18:45:54Z","published":"2023-08-25T19:40:56Z","title":"PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained\n Image Classification Accuracy for AIs and Humans","summary":" Nearest neighbors (NN) are traditionally used to compute final decisions,\ne.g., in Support Vector Machines or k-NN classifiers, and to provide users with\nexplanations for the model's decision. In this paper, we show a novel utility\nof nearest neighbors: To improve predictions of a frozen, pretrained classifier\nC. We leverage an image comparator S that (1) compares the input image with NN\nimages from the top-K most probable classes; and (2) uses S's output scores to\nweight the confidence scores of C. 
Our method consistently improves\nfine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120.\nAlso, a human study finds that showing lay users our probable-class nearest\nneighbors (PCNN) improves their decision accuracy over prior work which only\nshows only the top-1 class examples.\n","authors":["Giang Nguyen","Valerie Chen","Mohammad Reza Taesiri","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.13651v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05980v3","updated":"2024-04-23T18:44:45Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing \"image hallucination\"\nand risking misdiagnosis. We hypothesize such hallucinations result from local\nOOD regions in the conditional images. We verify that partitioning the OOD\nregion and conducting separate image generations alleviates hallucinations in\nseveral applications. From this, we propose a training-free diffusion framework\nthat reduces hallucination with multiple Local Diffusion processes. Our\napproach involves OOD estimation followed by two modules: a \"branching\" module\ngenerates locally both within and outside OOD regions, and a \"fusion\" module\nintegrates these predictions into one. Our evaluation shows our method\nmitigates hallucination over baseline models quantitatively and qualitatively,\nreducing misdiagnosis by 40% and 25% in the real-world medical and natural\nimage datasets, respectively. It also demonstrates compatibility with various\npre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15449v1","updated":"2024-04-23T18:41:56Z","published":"2024-04-23T18:41:56Z","title":"ID-Aligner: Enhancing Identity-Preserving Text-to-Image Generation with\n Reward Feedback Learning","summary":" The rapid development of diffusion models has triggered diverse applications.\nIdentity-preserving text-to-image generation (ID-T2I) particularly has received\nsignificant attention due to its wide range of application scenarios like AI\nportrait and advertising. While existing ID-T2I methods have demonstrated\nimpressive results, several key challenges remain: (1) It is hard to maintain\nthe identity characteristics of reference portraits accurately, (2) The\ngenerated images lack aesthetic appeal especially while enforcing identity\nretention, and (3) There is a limitation that cannot be compatible with\nLoRA-based and Adapter-based methods simultaneously. To address these issues,\nwe present \\textbf{ID-Aligner}, a general feedback learning framework to\nenhance ID-T2I performance. 
To resolve identity features lost, we introduce\nidentity consistency reward fine-tuning to utilize the feedback from face\ndetection and recognition models to improve generated identity preservation.\nFurthermore, we propose identity aesthetic reward fine-tuning leveraging\nrewards from human-annotated preference data and automatically constructed\nfeedback on character structure generation to provide aesthetic tuning signals.\nThanks to its universal feedback fine-tuning framework, our method can be\nreadily applied to both LoRA and Adapter models, achieving consistent\nperformance gains. Extensive experiments on SD1.5 and SDXL diffusion models\nvalidate the effectiveness of our approach. \\textbf{Project Page:\n\\url{https://idaligner.github.io/}}\n","authors":["Weifeng Chen","Jiacheng Zhang","Jie Wu","Hefeng Wu","Xuefeng Xiao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.15449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15447v1","updated":"2024-04-23T18:39:57Z","published":"2024-04-23T18:39:57Z","title":"GLoD: Composing Global Contexts and Local Details in Image Generation","summary":" Diffusion models have demonstrated their capability to synthesize\nhigh-quality and diverse images from textual prompts. However, simultaneous\ncontrol over both global contexts (e.g., object layouts and interactions) and\nlocal details (e.g., colors and emotions) still remains a significant\nchallenge. The models often fail to understand complex descriptions involving\nmultiple objects and reflect specified visual attributes to wrong targets or\nignore them. This paper presents Global-Local Diffusion (\\textit{GLoD}), a\nnovel framework which allows simultaneous control over the global contexts and\nthe local details in text-to-image generation without requiring training or\nfine-tuning. It assigns multiple global and local prompts to corresponding\nlayers and composes their noises to guide a denoising process using pre-trained\ndiffusion models. Our framework enables complex global-local compositions,\nconditioning objects in the global prompt with the local prompts while\npreserving other unspecified identities. Our quantitative and qualitative\nevaluations demonstrate that GLoD effectively generates complex images that\nadhere to both user-provided object interactions and object details.\n","authors":["Moyuru Yamada"],"pdf_url":"https://arxiv.org/pdf/2404.15447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15445v1","updated":"2024-04-23T18:37:37Z","published":"2024-04-23T18:37:37Z","title":"Deep multi-prototype capsule networks","summary":" Capsule networks are a type of neural network that identify image parts and\nform the instantiation parameters of a whole hierarchically. The goal behind\nthe network is to perform an inverse computer graphics task, and the network\nparameters are the mapping weights that transform parts into a whole. The\ntrainability of capsule networks in complex data with high intra-class or\nintra-part variation is challenging. This paper presents a multi-prototype\narchitecture for guiding capsule networks to represent the variations in the\nimage parts. To this end, instead of considering a single capsule for each\nclass and part, the proposed method employs several capsules (co-group\ncapsules), capturing multiple prototypes of an object. In the final layer,\nco-group capsules compete, and their soft output is considered the target for a\ncompetitive cross-entropy loss. 
Moreover, in the middle layers, the most active\ncapsules map to the next layer with a shared weight among the co-groups.\nConsequently, due to the reduction in parameters, implicit weight-sharing makes\nit possible to have more deep capsule network layers. The experimental results\non MNIST, SVHN, C-Cube, CEDAR, MCYT, and UTSig datasets reveal that the\nproposed model outperforms others regarding image classification accuracy.\n","authors":["Saeid Abbassi","Kamaledin Ghiasi-Shirazi","Ahad Harati"],"pdf_url":"https://arxiv.org/pdf/2404.15445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15436v1","updated":"2024-04-23T18:26:11Z","published":"2024-04-23T18:26:11Z","title":"Iterative Cluster Harvesting for Wafer Map Defect Patterns","summary":" Unsupervised clustering of wafer map defect patterns is challenging because\nthe appearance of certain defect patterns varies significantly. This includes\nchanging shape, location, density, and rotation of the defect area on the\nwafer. We present a harvesting approach, which can cluster even challenging\ndefect patterns of wafer maps well. Our approach makes use of a well-known,\nthree-step procedure: feature extraction, dimension reduction, and clustering.\nThe novelty in our approach lies in repeating dimensionality reduction and\nclustering iteratively while filtering out one cluster per iteration according\nto its silhouette score. This method leads to an improvement of clustering\nperformance in general and is especially useful for difficult defect patterns.\nThe low computational effort allows for a quick assessment of large datasets\nand can be used to support manual labeling efforts. We benchmark against\nrelated approaches from the literature and show improved results on a\nreal-world industrial dataset.\n","authors":["Alina Pleli","Simon Baeuerle","Michel Janus","Jonas Barth","Ralf Mikut","Hendrik P. A. Lensch"],"pdf_url":"https://arxiv.org/pdf/2404.15436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15406v1","updated":"2024-04-23T18:00:09Z","published":"2024-04-23T18:00:09Z","title":"Wiki-LLaVA: Hierarchical Retrieval-Augmented Generation for Multimodal\n LLMs","summary":" Multimodal LLMs are the natural evolution of LLMs, and enlarge their\ncapabilities so as to work beyond the pure textual modality. As research is\nbeing carried out to design novel architectures and vision-and-language\nadapters, in this paper we concentrate on endowing such models with the\ncapability of answering questions that require external knowledge. Our\napproach, termed Wiki-LLaVA, aims at integrating an external knowledge source\nof multimodal documents, which is accessed through a hierarchical retrieval\npipeline. Relevant passages, using this approach, are retrieved from the\nexternal knowledge source and employed as additional context for the LLM,\naugmenting the effectiveness and precision of generated dialogues. 
We conduct\nextensive experiments on datasets tailored for visual question answering with\nexternal data and demonstrate the appropriateness of our approach.\n","authors":["Davide Caffagni","Federico Cocchi","Nicholas Moratelli","Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.15406v1.pdf","comment":"CVPR 2024 Workshop on What is Next in Multimodal Foundation Models"},{"id":"http://arxiv.org/abs/2311.18836v2","updated":"2024-04-23T17:53:48Z","published":"2023-11-30T18:59:52Z","title":"ChatPose: Chatting about 3D Human Pose","summary":" We introduce ChatPose, a framework employing Large Language Models (LLMs) to\nunderstand and reason about 3D human poses from images or textual descriptions.\nOur work is motivated by the human ability to intuitively understand postures\nfrom a single image or a brief description, a process that intertwines image\ninterpretation, world knowledge, and an understanding of body language.\nTraditional human pose estimation and generation methods often operate in\nisolation, lacking semantic understanding and reasoning abilities. ChatPose\naddresses these limitations by embedding SMPL poses as distinct signal tokens\nwithin a multimodal LLM, enabling the direct generation of 3D body poses from\nboth textual and visual inputs. Leveraging the powerful capabilities of\nmultimodal LLMs, ChatPose unifies classical 3D human pose and generation tasks\nwhile offering user interactions. Additionally, ChatPose empowers LLMs to apply\ntheir extensive world knowledge in reasoning about human poses, leading to two\nadvanced tasks: speculative pose generation and reasoning about pose\nestimation. These tasks involve reasoning about humans to generate 3D poses\nfrom subtle text queries, possibly accompanied by images. We establish\nbenchmarks for these tasks, moving beyond traditional 3D pose generation and\nestimation methods. Our results show that ChatPose outperforms existing\nmultimodal LLMs and task-specific methods on these newly proposed tasks.\nFurthermore, ChatPose's ability to understand and generate 3D human poses based\non complex reasoning opens new directions in human pose analysis.\n","authors":["Yao Feng","Jing Lin","Sai Kumar Dwivedi","Yu Sun","Priyanka Patel","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2311.18836v2.pdf","comment":"Home page: https://yfeng95.github.io/ChatPose/"},{"id":"http://arxiv.org/abs/2404.15394v1","updated":"2024-04-23T17:11:07Z","published":"2024-04-23T17:11:07Z","title":"On Generating Cancelable Biometric Template using Reverse of Boolean XOR","summary":" Cancelable Biometric is repetitive distortion embedded in original Biometric\nimage for keeping it secure from unauthorized access. In this paper, we have\ngenerated Cancelable Biometric templates with Reverse Boolean XOR technique.\nThree different methods have been proposed for generation of Cancelable\nBiometric templates based on Visual Secret Sharing scheme. In each method, one\nSecret image and n-1 Cover images are used as: (M1) One original Biometric\nimage (Secret) with n- 1 randomly chosen Gray Cover images (M2) One original\nSecret image with n-1 Cover images, which are Randomly Permuted version of the\noriginal Secret image (M3) One Secret image with n-1 Cover images, both Secret\nimage and Cover images are Randomly Permuted version of original Biometric\nimage. Experiment works have performed on publicly available ORL Face database\nand IIT Delhi Iris database. 
The performance of the proposed methods is\ncompared in terms of Co-relation Coefficient (Cr), Mean Square Error (MSE),\nMean Absolute Error (MAE), Structural Similarity (SSIM), Peak Signal to Noise\nRatio (PSNR), Number of Pixel Change Rate (NPCR), and Unified Average Changing\nIntensity (UACI). It is found that among the three proposed method, M3\ngenerates good quality Cancelable templates and gives best performance in terms\nof quality. M3 is also better in quantitative terms on ORL dataset while M2 and\nM3 are comparable on IIT Delhi Iris dataset.\n","authors":[" Manisha","Nitin Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.15394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.00170v4","updated":"2024-04-23T13:42:45Z","published":"2021-04-01T00:14:45Z","title":"Are Bias Mitigation Techniques for Deep Learning Effective?","summary":" A critical problem in deep learning is that systems learn inappropriate\nbiases, resulting in their inability to perform well on minority groups. This\nhas led to the creation of multiple algorithms that endeavor to mitigate bias.\nHowever, it is not clear how effective these methods are. This is because study\nprotocols differ among papers, systems are tested on datasets that fail to test\nmany forms of bias, and systems have access to hidden knowledge or are tuned\nspecifically to the test set. To address this, we introduce an improved\nevaluation protocol, sensible metrics, and a new dataset, which enables us to\nask and answer critical questions about bias mitigation algorithms. We evaluate\nseven state-of-the-art algorithms using the same network architecture and\nhyperparameter selection policy across three benchmark datasets. We introduce a\nnew dataset called Biased MNIST that enables assessment of robustness to\nmultiple bias sources. We use Biased MNIST and a visual question answering\n(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning\nto the test set distribution, we study robustness across different tuning\ndistributions, which is critical because for many applications the test\ndistribution may not be known during development. We find that algorithms\nexploit hidden biases, are unable to scale to multiple forms of bias, and are\nhighly sensitive to the choice of tuning set. Based on our findings, we implore\nthe community to adopt more rigorous assessment of future bias mitigation\nmethods. All data, code, and results are publicly available at:\nhttps://github.com/erobic/bias-mitigators.\n","authors":["Robik Shrestha","Kushal Kafle","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2104.00170v4.pdf","comment":"Published in WACV 2022 under the title \"An Investigation of Critical\n Issues in Bias Mitigation Techniques\""},{"id":"http://arxiv.org/abs/2404.15385v1","updated":"2024-04-23T10:59:44Z","published":"2024-04-23T10:59:44Z","title":"Sum of Group Error Differences: A Critical Examination of Bias\n Evaluation in Biometric Verification and a Dual-Metric Measure","summary":" Biometric Verification (BV) systems often exhibit accuracy disparities across\ndifferent demographic groups, leading to biases in BV applications. Assessing\nand quantifying these biases is essential for ensuring the fairness of BV\nsystems. 
However, existing bias evaluation metrics in BV have limitations, such\nas focusing exclusively on match or non-match error rates, overlooking bias on\ndemographic groups with performance levels falling between the best and worst\nperformance levels, and neglecting the magnitude of the bias present.\n This paper presents an in-depth analysis of the limitations of current bias\nevaluation metrics in BV and, through experimental analysis, demonstrates their\ncontextual suitability, merits, and limitations. Additionally, it introduces a\nnovel general-purpose bias evaluation measure for BV, the ``Sum of Group Error\nDifferences (SEDG)''. Our experimental results on controlled synthetic datasets\ndemonstrate the effectiveness of demographic bias quantification when using\nexisting metrics and our own proposed measure. We discuss the applicability of\nthe bias evaluation metrics in a set of simulated demographic bias scenarios\nand provide scenario-based metric recommendations. Our code is publicly\navailable under \\url{https://github.com/alaaobeid/SEDG}.\n","authors":["Alaa Elobaid","Nathan Ramoly","Lara Younes","Symeon Papadopoulos","Eirini Ntoutsi","Ioannis Kompatsiaris"],"pdf_url":"https://arxiv.org/pdf/2404.15385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15383v1","updated":"2024-04-23T10:20:17Z","published":"2024-04-23T10:20:17Z","title":"WANDR: Intention-guided Human Motion Generation","summary":" Synthesizing natural human motions that enable a 3D human avatar to walk and\nreach for arbitrary goals in 3D space remains an unsolved problem with many\napplications. Existing methods (data-driven or using reinforcement learning)\nare limited in terms of generalization and motion naturalness. A primary\nobstacle is the scarcity of training data that combines locomotion with goal\nreaching. To address this, we introduce WANDR, a data-driven model that takes\nan avatar's initial pose and a goal's 3D position and generates natural human\nmotions that place the end effector (wrist) on the goal location. To solve\nthis, we introduce novel intention features that drive rich goal-oriented\nmovement. Intention guides the agent to the goal, and interactively adapts the\ngeneration to novel situations without needing to define sub-goals or the\nentire motion path. Crucially, intention allows training on datasets that have\ngoal-oriented motions as well as those that do not. WANDR is a conditional\nVariational Auto-Encoder (c-VAE), which we train using the AMASS and CIRCLE\ndatasets. We evaluate our method extensively and demonstrate its ability to\ngenerate natural and long-term motions that reach 3D goals and generalize to\nunseen goal locations. Our models and code are available for research purposes\nat wandr.is.tue.mpg.de.\n","authors":["Markos Diomataris","Nikos Athanasiou","Omid Taheri","Xi Wang","Otmar Hilliges","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2404.15383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15378v1","updated":"2024-04-23T03:04:22Z","published":"2024-04-23T03:04:22Z","title":"Hierarchical Hybrid Sliced Wasserstein: A Scalable Metric for\n Heterogeneous Joint Distributions","summary":" Sliced Wasserstein (SW) and Generalized Sliced Wasserstein (GSW) have been\nwidely used in applications due to their computational and statistical\nscalability. However, the SW and the GSW are only defined between distributions\nsupported on a homogeneous domain. 
This limitation prevents their usage in\napplications with heterogeneous joint distributions with marginal distributions\nsupported on multiple different domains. Using SW and GSW directly on the joint\ndomains cannot make a meaningful comparison since their homogeneous slicing\noperator i.e., Radon Transform (RT) and Generalized Radon Transform (GRT) are\nnot expressive enough to capture the structure of the joint supports set. To\naddress the issue, we propose two new slicing operators i.e., Partial\nGeneralized Radon Transform (PGRT) and Hierarchical Hybrid Radon Transform\n(HHRT). In greater detail, PGRT is the generalization of Partial Radon\nTransform (PRT), which transforms a subset of function arguments non-linearly\nwhile HHRT is the composition of PRT and multiple domain-specific PGRT on\nmarginal domain arguments. By using HHRT, we extend the SW into Hierarchical\nHybrid Sliced Wasserstein (H2SW) distance which is designed specifically for\ncomparing heterogeneous joint distributions. We then discuss the topological,\nstatistical, and computational properties of H2SW. Finally, we demonstrate the\nfavorable performance of H2SW in 3D mesh deformation, deep 3D mesh\nautoencoders, and datasets comparison.\n","authors":["Khai Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2404.15378v1.pdf","comment":"24 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.15405v1","updated":"2024-04-23T18:00:03Z","published":"2024-04-23T18:00:03Z","title":"Photometry of Saturated Stars with Machine Learning","summary":" We develop a deep neural network (DNN) to obtain photometry of saturated\nstars in the All-Sky Automated Survey for Supernovae (ASAS-SN). The DNN can\nobtain unbiased photometry for stars from g=4 to 14 mag with a dispersion\n(15%-85% 1sigma range around median) of 0.12 mag for saturated (g<11.5 mag)\nstars. More importantly, the light curve of a non-variable saturated star has a\nmedian dispersion of only 0.037 mag. The DNN light curves are, in many cases,\nspectacularly better than provided by the standard ASAS-SN pipelines. While the\nnetwork was trained on g band data from only one of ASAS-SN's 20 cameras,\ninitial experiments suggest that it can be used for any camera and the older\nASAS-SN V band data as well. The dominant problems seem to be associated with\ncorrectable issues in the ASAS-SN data reduction pipeline for saturated stars\nmore than the DNN itself. The method is publicly available as a light curve\noption on ASAS-SN Sky Patrol v1.0.\n","authors":["Dominek Winecki","Christopher S. Kochanek"],"pdf_url":"https://arxiv.org/pdf/2404.15405v1.pdf","comment":"submitted to ApJ"},{"id":"http://arxiv.org/abs/2404.16885v1","updated":"2024-04-23T23:14:30Z","published":"2024-04-23T23:14:30Z","title":"Adapting an Artificial Intelligence Sexually Transmitted Diseases\n Symptom Checker Tool for Mpox Detection: The HeHealth Experience","summary":" Artificial Intelligence applications have shown promise in the management of\npandemics and have been widely used to assist the identification,\nclassification, and diagnosis of medical images. In response to the global\noutbreak of Monkeypox (Mpox), the HeHealth.ai team leveraged an existing tool\nto screen for sexually transmitted diseases to develop a digital screening test\nfor symptomatic Mpox through AI approaches. Prior to the global outbreak of\nMpox, the team developed a smartphone app, where app users can use their own\nsmartphone cameras to take pictures of their own penises to screen for\nsymptomatic STD. 
The AI model was initially developed using 5000 cases and used\na modified convolutional neural network to output prediction scores across\nvisually diagnosable penis pathologies including Syphilis, Herpes Simplex\nVirus, and Human Papilloma Virus. From June 2022 to October 2022, a total of\nabout 22,000 users downloaded the HeHealth app, and about 21,000 images have\nbeen analyzed using HeHealth AI technology. We then engaged in formative\nresearch, stakeholder engagement, rapid consolidation of images, a validation\nstudy, and implementation of the tool from July 2022. From July 2022 to October\n2022, a total of 1000 Mpox-related images had been used to train the Mpox\nsymptom checker tool. Our digital symptom checker tool showed an accuracy of 87%\nto rule in Mpox and 90% to rule out symptomatic Mpox. Several hurdles\nidentified included issues of data privacy and security for app users, initial\nlack of data to train the AI tool, and the potential generalizability of input\ndata. We offer several suggestions to help others get started on similar\nprojects in emergency situations, including engaging a wide range of\nstakeholders, having a multidisciplinary team, prioritizing pragmatism, as well\nas the concept that big data in fact is made up of small data.\n","authors":["Rayner Kay Jin Tan","Dilruk Perera","Salomi Arasaratnam","Yudara Kularathne"],"pdf_url":"https://arxiv.org/pdf/2404.16885v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.16882v1","updated":"2024-04-23T19:56:11Z","published":"2024-04-23T19:56:11Z","title":"ThermoPore: Predicting Part Porosity Based on Thermal Images Using Deep\n Learning","summary":" We present a deep learning approach for quantifying and localizing ex-situ\nporosity within Laser Powder Bed Fusion fabricated samples utilizing in-situ\nthermal image monitoring data. Our goal is to build the real time porosity map\nof parts based on thermal images acquired during the build. The quantification\ntask builds upon the established Convolutional Neural Network model\narchitecture to predict pore count and the localization task leverages the\nspatial and temporal attention mechanisms of the novel Video Vision Transformer\nmodel to indicate areas of expected porosity. Our model for porosity\nquantification achieved a $R^2$ score of 0.57 and our model for porosity\nlocalization produced an average IoU score of 0.32 and a maximum of 1.0. This\nwork is setting the foundations of part porosity \"Digital Twins\" based on\nadditive manufacturing monitoring data and can be applied downstream to reduce\ntime-intensive post-inspection and testing activities during part qualification\nand certification. In addition, we seek to accelerate the acquisition of\ncrucial insights normally only available through ex-situ part evaluation by\nmeans of machine learning analysis of in-situ process monitoring data.\n","authors":["Peter Myung-Won Pak","Francis Ogoke","Andrew Polonsky","Anthony Garland","Dan S. Bolintineanu","Dan R. Moser","Michael J. 
Heiden","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2404.16882v1.pdf","comment":null}]},"2024-04-21T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.13798v1","updated":"2024-04-21T23:34:45Z","published":"2024-04-21T23:34:45Z","title":"Enforcing Conditional Independence for Fair Representation Learning and\n Causal Image Generation","summary":" Conditional independence (CI) constraints are critical for defining and\nevaluating fairness in machine learning, as well as for learning unconfounded\nor causal representations. Traditional methods for ensuring fairness either\nblindly learn invariant features with respect to a protected variable (e.g.,\nrace when classifying sex from face images) or enforce CI relative to the\nprotected attribute only on the model output (e.g., the sex label). Neither of\nthese methods are effective in enforcing CI in high-dimensional feature spaces.\nIn this paper, we focus on a nascent approach characterizing the CI constraint\nin terms of two Jensen-Shannon divergence terms, and we extend it to\nhigh-dimensional feature spaces using a novel dynamic sampling strategy. In\ndoing so, we introduce a new training paradigm that can be applied to any\nencoder architecture. We are able to enforce conditional independence of the\ndiffusion autoencoder latent representation with respect to any protected\nattribute under the equalized odds constraint and show that this approach\nenables causal image generation with controllable latent spaces. Our\nexperimental results demonstrate that our approach can achieve high accuracy on\ndownstream tasks while upholding equality of odds.\n","authors":["Jensen Hwa","Qingyu Zhao","Aditya Lahiri","Adnan Masood","Babak Salimi","Ehsan Adeli"],"pdf_url":"https://arxiv.org/pdf/2404.13798v1.pdf","comment":"To appear at the 2024 IEEE CVPR Workshop on Fair, Data-Efficient, and\n Trusted Computer Vision"},{"id":"http://arxiv.org/abs/2404.13791v1","updated":"2024-04-21T23:01:08Z","published":"2024-04-21T23:01:08Z","title":"Universal Fingerprint Generation: Controllable Diffusion Model with\n Multimodal Conditions","summary":" The utilization of synthetic data for fingerprint recognition has garnered\nincreased attention due to its potential to alleviate privacy concerns\nsurrounding sensitive biometric data. However, current methods for generating\nfingerprints have limitations in creating impressions of the same finger with\nuseful intra-class variations. To tackle this challenge, we present GenPrint, a\nframework to produce fingerprint images of various types while maintaining\nidentity and offering humanly understandable control over different appearance\nfactors such as fingerprint class, acquisition type, sensor device, and quality\nlevel. Unlike previous fingerprint generation approaches, GenPrint is not\nconfined to replicating style characteristics from the training dataset alone:\nit enables the generation of novel styles from unseen devices without requiring\nadditional fine-tuning. To accomplish these objectives, we developed GenPrint\nusing latent diffusion models with multimodal conditions (text and image) for\nconsistent generation of style and identity. Our experiments leverage a variety\nof publicly available datasets for training and evaluation. Results demonstrate\nthe benefits of GenPrint in terms of identity preservation, explainable\ncontrol, and universality of generated images. 
Importantly, the\nGenPrint-generated images yield comparable or even superior accuracy to models\ntrained solely on real data and further enhances performance when augmenting\nthe diversity of existing real fingerprint datasets.\n","authors":["Steven A. Grosz","Anil K. Jain"],"pdf_url":"https://arxiv.org/pdf/2404.13791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13788v1","updated":"2024-04-21T22:33:57Z","published":"2024-04-21T22:33:57Z","title":"AnyPattern: Towards In-context Image Copy Detection","summary":" This paper explores in-context learning for image copy detection (ICD), i.e.,\nprompting an ICD model to identify replicated images with new tampering\npatterns without the need for additional training. The prompts (or the\ncontexts) are from a small set of image-replica pairs that reflect the new\npatterns and are used at inference time. Such in-context ICD has good realistic\nvalue, because it requires no fine-tuning and thus facilitates fast reaction\nagainst the emergence of unseen patterns. To accommodate the \"seen\n$\\rightarrow$ unseen\" generalization scenario, we construct the first\nlarge-scale pattern dataset named AnyPattern, which has the largest number of\ntamper patterns ($90$ for training and $10$ for testing) among all the existing\nones. We benchmark AnyPattern with popular ICD methods and reveal that existing\nmethods barely generalize to novel tamper patterns. We further propose a simple\nin-context ICD method named ImageStacker. ImageStacker learns to select the\nmost representative image-replica pairs and employs them as the pattern prompts\nin a stacking manner (rather than the popular concatenation manner).\nExperimental results show (1) training with our large-scale dataset\nsubstantially benefits pattern generalization ($+26.66 \\%$ $\\mu AP$), (2) the\nproposed ImageStacker facilitates effective in-context ICD (another round of\n$+16.75 \\%$ $\\mu AP$), and (3) AnyPattern enables in-context ICD, i.e. without\nsuch a large-scale dataset, in-context learning does not emerge even with our\nImageStacker. The project (including the proposed dataset AnyPattern and the\ncode for ImageStacker) is publicly available at https://anypattern.github.io\nunder the MIT Licence.\n","authors":["Wenhao Wang","Yifan Sun","Zhentao Tan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13784v1","updated":"2024-04-21T21:30:17Z","published":"2024-04-21T21:30:17Z","title":"Iteratively Prompting Multimodal LLMs to Reproduce Natural and\n AI-Generated Images","summary":" With the digital imagery landscape rapidly evolving, image stocks and\nAI-generated image marketplaces have become central to visual media.\nTraditional stock images now exist alongside innovative platforms that trade in\nprompts for AI-generated visuals, driven by sophisticated APIs like DALL-E 3\nand Midjourney. This paper studies the possibility of employing multi-modal\nmodels with enhanced visual understanding to mimic the outputs of these\nplatforms, introducing an original attack strategy. Our method leverages\nfine-tuned CLIP models, a multi-label classifier, and the descriptive\ncapabilities of GPT-4V to create prompts that generate images similar to those\navailable in marketplaces and from premium stock image providers, yet at a\nmarkedly lower expense. In presenting this strategy, we aim to spotlight a new\nclass of economic and security considerations within the realm of digital\nimagery. 
Our findings, supported by both automated metrics and human\nassessment, reveal that comparable visual content can be produced for a\nfraction of the prevailing market prices ($0.23 - $0.27 per image), emphasizing\nthe need for awareness and strategic discussions about the integrity of digital\nmedia in an increasingly AI-integrated landscape. Our work also contributes to\nthe field by assembling a dataset consisting of approximately 19 million\nprompt-image pairs generated by the popular Midjourney platform, which we plan\nto release publicly.\n","authors":["Ali Naseh","Katherine Thai","Mohit Iyyer","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2404.13784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13770v1","updated":"2024-04-21T20:45:18Z","published":"2024-04-21T20:45:18Z","title":"EncodeNet: A Framework for Boosting DNN Accuracy with Entropy-driven\n Generalized Converting Autoencoder","summary":" Image classification is a fundamental task in computer vision, and the quest\nto enhance DNN accuracy without inflating model size or latency remains a\npressing concern. We make a couple of advances in this regard, leading to a\nnovel EncodeNet design and training framework. The first advancement involves\nConverting Autoencoders, a novel approach that transforms images into an\neasy-to-classify image of its class. Our prior work that applied the Converting\nAutoencoder and a simple classifier in tandem achieved moderate accuracy over\nsimple datasets, such as MNIST and FMNIST. However, on more complex datasets\nlike CIFAR-10, the Converting Autoencoder has a large reconstruction loss,\nmaking it unsuitable for enhancing DNN accuracy. To address these limitations,\nwe generalize the design of Converting Autoencoders by leveraging a larger\nclass of DNNs, those with architectures comprising feature extraction layers\nfollowed by classification layers. We incorporate a generalized algorithmic\ndesign of the Converting Autoencoder and intraclass clustering to identify\nrepresentative images, leading to optimized image feature learning. Next, we\ndemonstrate the effectiveness of our EncodeNet design and training framework,\nimproving the accuracy of well-trained baseline DNNs while maintaining the\noverall model size. EncodeNet's building blocks comprise the trained encoder\nfrom our generalized Converting Autoencoders transferring knowledge to a\nlightweight classifier network - also extracted from the baseline DNN. Our\nexperimental results demonstrate that EncodeNet improves the accuracy of VGG16\nfrom 92.64% to 94.05% on CIFAR-10 and ResNet20 from 74.56% to 76.04% on\nCIFAR-100. It outperforms state-of-the-art techniques that rely on knowledge\ndistillation and attention mechanisms, delivering higher accuracy for models of\ncomparable size.\n","authors":["Hasanul Mahmud","Kevin Desai","Palden Lama","Sushil K. Prasad"],"pdf_url":"https://arxiv.org/pdf/2404.13770v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.13767v1","updated":"2024-04-21T20:32:02Z","published":"2024-04-21T20:32:02Z","title":"Autonomous Robot for Disaster Mapping and Victim Localization","summary":" In response to the critical need for effective reconnaissance in disaster\nscenarios, this research article presents the design and implementation of a\ncomplete autonomous robot system using the Turtlebot3 with Robotic Operating\nSystem (ROS) Noetic. 
Upon deployment in closed, initially unknown environments,\nthe system aims to generate a comprehensive map and identify any present\n'victims' using AprilTags as stand-ins. We discuss our solution for search and\nrescue missions, while additionally exploring more advanced algorithms to\nimprove search and rescue functionalities. We introduce a Cubature Kalman\nFilter to help reduce the mean squared error [m] for AprilTag localization and\nan information-theoretic exploration algorithm to expedite exploration in\nunknown environments. Just like turtles, our system takes it slow and steady,\nbut when it's time to save the day, it moves at ninja-like speed! Despite\nDonatello's shell, he's no slowpoke - he zips through obstacles with the\nagility of a teenage mutant ninja turtle. So, hang on tight to your shells and\nget ready for a whirlwind of reconnaissance!\n Full pipeline code https://github.com/rzhao5659/MRProject/tree/main\n Exploration code https://github.com/rzhao5659/MRProject/tree/main\n","authors":["Michael Potter","Rahil Bhowal","Richard Zhao","Anuj Patel","Jingming Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.13767v1.pdf","comment":"Class final project for Northeastern University EECE 5550 Mobile\n Robotics Course"},{"id":"http://arxiv.org/abs/2403.20260v2","updated":"2024-04-21T20:29:17Z","published":"2024-03-29T16:08:59Z","title":"Prototype-based Interpretable Breast Cancer Prediction Models: Analysis\n and Challenges","summary":" Deep learning models have achieved high performance in medical applications,\nhowever, their adoption in clinical practice is hindered due to their black-box\nnature. Self-explainable models, like prototype-based models, can be especially\nbeneficial as they are interpretable by design. However, if the learnt\nprototypes are of low quality then the prototype-based models are as good as\nblack-box. Having high quality prototypes is a pre-requisite for a truly\ninterpretable model. In this work, we propose a prototype evaluation framework\nfor coherence (PEF-C) for quantitatively evaluating the quality of the\nprototypes based on domain knowledge. We show the use of PEF-C in the context\nof breast cancer prediction using mammography. Existing works on\nprototype-based models on breast cancer prediction using mammography have\nfocused on improving the classification performance of prototype-based models\ncompared to black-box models and have evaluated prototype quality through\nanecdotal evidence. We are the first to go beyond anecdotal evidence and\nevaluate the quality of the mammography prototypes systematically using our\nPEF-C. Specifically, we apply three state-of-the-art prototype-based models,\nProtoPNet, BRAIxProtoPNet++ and PIP-Net on mammography images for breast cancer\nprediction and evaluate these models w.r.t. i) classification performance, and\nii) quality of the prototypes, on three public datasets. 
Our results show that\nprototype-based models are competitive with black-box models in terms of\nclassification performance, and achieve a higher score in detecting ROIs.\nHowever, the quality of the prototypes are not yet sufficient and can be\nimproved in aspects of relevance, purity and learning a variety of prototypes.\nWe call the XAI community to systematically evaluate the quality of the\nprototypes to check their true usability in high stake decisions and improve\nsuch models further.\n","authors":["Shreyasi Pathak","Jörg Schlötterer","Jeroen Veltman","Jeroen Geerdink","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.20260v2.pdf","comment":"Accepted at World Conference on Explainable Artificial Intelligence;\n 21 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.13766v1","updated":"2024-04-21T20:26:46Z","published":"2024-04-21T20:26:46Z","title":"Object-Attribute Binding in Text-to-Image Generation: Evaluation and\n Control","summary":" Current diffusion models create photorealistic images given a text prompt as\ninput but struggle to correctly bind attributes mentioned in the text to the\nright objects in the image. This is evidenced by our novel image-graph\nalignment model called EPViT (Edge Prediction Vision Transformer) for the\nevaluation of image-text alignment. To alleviate the above problem, we propose\nfocused cross-attention (FCA) that controls the visual attention maps by\nsyntactic constraints found in the input sentence. Additionally, the syntax\nstructure of the prompt helps to disentangle the multimodal CLIP embeddings\nthat are commonly used in T2I generation. The resulting DisCLIP embeddings and\nFCA are easily integrated in state-of-the-art diffusion models without\nadditional training of these models. We show substantial improvements in T2I\ngeneration and especially its attribute-object binding on several\ndatasets.\\footnote{Code and data will be made available upon acceptance.\n","authors":["Maria Mihaela Trusca","Wolf Nuyts","Jonathan Thomm","Robert Honig","Thomas Hofmann","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2404.13766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19966v2","updated":"2024-04-21T20:16:41Z","published":"2024-03-29T04:02:51Z","title":"Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning","summary":" Using single-task deep learning methods to reconstruct Magnetic Resonance\nImaging (MRI) data acquired with different imaging sequences is inherently\nchallenging. The trained deep learning model typically lacks generalizability,\nand the dissimilarity among image datasets with different types of contrast\nleads to suboptimal learning performance. This paper proposes a meta-learning\napproach to efficiently learn image features from multiple MR image datasets.\nOur algorithm can perform multi-task learning to simultaneously reconstruct MR\nimages acquired using different imaging sequences with different image\ncontrasts. 
The experiment results demonstrate the ability of our new\nmeta-learning reconstruction method to successfully reconstruct\nhighly-undersampled k-space data from multiple MRI datasets simultaneously,\noutperforming other compelling reconstruction methods previously developed for\nsingle-task learning.\n","authors":["Wanyu Bian","Albert Jang","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11720v2","updated":"2024-04-21T20:09:17Z","published":"2022-07-24T11:26:53Z","title":"Progressive Feature Learning for Realistic Cloth-Changing Gait\n Recognition","summary":" Gait recognition is instrumental in crime prevention and social security, for\nit can be conducted at a long distance to figure out the identity of persons.\nHowever, existing datasets and methods cannot satisfactorily deal with the most\nchallenging cloth-changing problem in practice. Specifically, the practical\ngait models are usually trained on automatically labeled data, in which the\nsequences' views and cloth conditions of each person have some restrictions. To\nbe concrete, the cross-view sub-dataset only has normal walking condition\nwithout cloth-changing, while the cross-cloth sub-dataset has cloth-changing\nsequences but only in front views. As a result, the cloth-changing accuracy\ncannot meet practical requirements. In this work, we formulate the problem as\nRealistic Cloth-Changing Gait Recognition (abbreviated as RCC-GR) and we\nconstruct two benchmarks: CASIA-BN-RCC and OUMVLP-RCC, to simulate the above\nsetting. Furthermore, we propose a new framework called Progressive Feature\nLearning that can be applied with off-the-shelf backbones to improve their\nperformance in RCC-GR. Specifically, in our framework, we design Progressive\nMapping and Progressive Uncertainty to extract cross-view features and then\nextract cross-cloth features on the basis. In this way, the feature from the\ncross-view sub-dataset can first dominate the feature space and relieve the\nuneven distribution caused by the adverse effect from the cross-cloth\nsub-dataset. The experiments on our benchmarks show that our framework can\neffectively improve recognition performance, especially in the cloth-changing\nconditions.\n","authors":["Xuqian Ren","Saihui Hou","Chunshui Cao","Xu Liu","Yongzhen Huang"],"pdf_url":"https://arxiv.org/pdf/2207.11720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10772v2","updated":"2024-04-21T19:51:26Z","published":"2023-03-19T21:34:20Z","title":"Unsupervised Gait Recognition with Selective Fusion","summary":" Previous gait recognition methods primarily trained on labeled datasets,\nwhich require painful labeling effort. However, using a pre-trained model on a\nnew dataset without fine-tuning can lead to significant performance\ndegradation. So to make the pre-trained gait recognition model able to be\nfine-tuned on unlabeled datasets, we propose a new task: Unsupervised Gait\nRecognition (UGR). We introduce a new cluster-based baseline to solve UGR with\ncluster-level contrastive learning. But we further find more challenges this\ntask meets. First, sequences of the same person in different clothes tend to\ncluster separately due to the significant appearance changes. Second, sequences\ntaken from 0{\\deg} and 180{\\deg} views lack walking postures and do not cluster\nwith sequences taken from other views. 
To address these challenges, we propose\na Selective Fusion method, which includes Selective Cluster Fusion (SCF) and\nSelective Sample Fusion (SSF). With SCF, we merge matched clusters of the same\nperson wearing different clothes by updating the cluster-level memory bank with\na multi-cluster update strategy. And in SSF, we merge sequences taken from\nfront/back views gradually with curriculum learning. Extensive experiments show\nthe effectiveness of our method in improving the rank-1 accuracy under the\nwalking-with-different-coats condition and the front/back view conditions.\n","authors":["Xuqian Ren","Shaopeng Yang","Saihui Hou","Chunshui Cao","Xu Liu","Yongzhen Huang"],"pdf_url":"https://arxiv.org/pdf/2303.10772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13756v1","updated":"2024-04-21T19:42:28Z","published":"2024-04-21T19:42:28Z","title":"BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark","summary":" Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI)\ndata is typically trained and evaluated on private medical data, which makes\ncomparing deep learning approaches difficult. We propose a benchmark\n(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly\navailable MRI datasets. The benchmark consists of four datasets in total, where\ntwo datasets are used for supervised training and evaluation, and two are used\nfor zero-shot evaluation. Additionally, we compare state-of-the-art (SOTA)\napproaches on our benchmark and provide an exhaustive list of available public\nbreast cancer MRI datasets. The source code has been made available at\nhttps://irulenot.github.io/BC_MRI_SEG_Benchmark.\n","authors":["Anthony Bilic","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13745v1","updated":"2024-04-21T19:02:38Z","published":"2024-04-21T19:02:38Z","title":"A Nasal Cytology Dataset for Object Detection and Deep Learning","summary":" Nasal Cytology is a new and efficient clinical technique to diagnose rhinitis\nand allergies that is not yet widespread due to the time-consuming nature of\ncell counting; that is why AI-aided counting could be a turning point for the\ndiffusion of this technique. In this article we present the first dataset of\nrhino-cytological field images: the NCD (Nasal Cytology Dataset), aimed to\ntrain and deploy Object Detection models to support physicians and biologists\nduring clinical practice. The real distribution of the cytotypes populating\nthe nasal mucosa has been replicated by sampling images from slides of clinical\npatients and manually annotating each cell found on them. The corresponding\nobject detection task presents non-trivial issues associated with the strong\nclass imbalance involving the rarest cell types. 
This work contributes to\nsome of the open challenges by presenting a novel machine learning-based approach\nto aid the automated detection and classification of nasal mucosa cells: the\nDETR and YOLO models showed good performance in detecting cells and classifying\nthem correctly, revealing great potential to accelerate the work of rhinology\nexperts.\n","authors":["Mauro Camporeale","Giovanni Dimauro","Matteo Gelardi","Giorgia Iacobellis","Mattia Sebastiano Ladisa","Sergio Latrofa","Nunzia Lomonte"],"pdf_url":"https://arxiv.org/pdf/2404.13745v1.pdf","comment":"Pre Print almost ready to be submitted"},{"id":"http://arxiv.org/abs/2403.06098v2","updated":"2024-04-21T18:42:44Z","published":"2024-03-10T05:40:12Z","title":"VidProM: A Million-scale Real Prompt-Gallery Dataset for Text-to-Video\n Diffusion Models","summary":" The arrival of Sora marks a new era for text-to-video diffusion models,\nbringing significant advancements in video generation and potential\napplications. However, Sora, along with other text-to-video diffusion models,\nis highly reliant on prompts, and there is no publicly available dataset that\nfeatures a study of text-to-video prompts. In this paper, we introduce VidProM,\nthe first large-scale dataset comprising 1.67 Million unique text-to-Video\nPrompts from real users. Additionally, this dataset includes 6.69 million\nvideos generated by four state-of-the-art diffusion models, alongside some\nrelated data. We initially discuss the curation of this large-scale dataset, a\nprocess that is both time-consuming and costly. Subsequently, we underscore the\nneed for a new prompt dataset specifically designed for text-to-video\ngeneration by illustrating how VidProM differs from DiffusionDB, a large-scale\nprompt-gallery dataset for image generation. Our extensive and diverse dataset\nalso opens up many exciting new research areas. For instance, we suggest\nexploring text-to-video prompt engineering, efficient video generation, and\nvideo copy detection for diffusion models to develop better, more efficient,\nand safer models. The project (including the collected dataset VidProM and\nrelated code) is publicly available at https://vidprom.github.io under the\nCC-BY-NC 4.0 License.\n","authors":["Wenhao Wang","Yifan Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.06098v2.pdf","comment":"The project (including the collected dataset VidProM and related\n code) is publicly available at https://vidprom.github.io under the CC-BY-NC\n 4.0 License"},{"id":"http://arxiv.org/abs/2404.14990v1","updated":"2024-04-21T18:32:08Z","published":"2024-04-21T18:32:08Z","title":"Interpreting COVID Lateral Flow Tests' Results with Foundation Models","summary":" Lateral flow tests (LFTs) enable rapid, low-cost testing for health\nconditions including Covid, pregnancy, HIV, and malaria. Automated readers of\nLFT results can yield many benefits including empowering blind people to\nindependently learn about their health and accelerating data entry for\nlarge-scale monitoring (e.g., for pandemics such as Covid) by using only a\nsingle photograph per LFT test. Accordingly, we explore the abilities of modern\nfoundation vision language models (VLMs) in interpreting such tests. To enable\nthis analysis, we first create a new labeled dataset with hierarchical\nsegmentations of each LFT test and its nested test result window. We call this\ndataset LFT-Grounding. Next, we benchmark eight modern VLMs in zero-shot\nsettings for analyzing these images. 
We demonstrate that current VLMs\nfrequently fail to correctly identify the type of LFT test, interpret the test\nresults, locate the nested result window of the LFT tests, and recognize LFT\ntests when they are partially obfuscated. To facilitate community-wide progress\ntowards automated LFT reading, we publicly release our dataset at\nhttps://iamstuti.github.io/lft_grounding_foundation_models/.\n","authors":["Stuti Pandey","Josh Myers-Dean","Jarek Reynolds","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2404.14990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13733v1","updated":"2024-04-21T18:19:27Z","published":"2024-04-21T18:19:27Z","title":"Elucidating the Design Space of Dataset Condensation","summary":" Dataset condensation, a concept within data-centric learning, efficiently\ntransfers critical attributes from an original dataset to a synthetic version,\nmaintaining both diversity and realism. This approach significantly improves\nmodel training efficiency and is adaptable across multiple application areas.\nPrevious methods in dataset condensation have faced challenges: some incur high\ncomputational costs which limit scalability to larger datasets (e.g., MTT,\nDREAM, and TESLA), while others are restricted to less optimal design spaces,\nwhich could hinder potential improvements, especially in smaller datasets\n(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a\ncomprehensive design framework that includes specific, effective strategies\nlike implementing soft category-aware matching and adjusting the learning rate\nschedule. These strategies are grounded in empirical evidence and theoretical\nbacking. Our resulting approach, Elucidate Dataset Condensation (EDC),\nestablishes a benchmark for both small and large-scale dataset condensation. In\nour testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on\nImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a\ncompression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM,\nand RDED by margins of 27.3%, 17.2%, and 6.6%, respectively.\n","authors":["Shitong Shao","Zikai Zhou","Huanran Chen","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.13733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13711v1","updated":"2024-04-21T16:45:35Z","published":"2024-04-21T16:45:35Z","title":"ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis","summary":" Recent advances in generative visual models and neural radiance fields have\ngreatly boosted 3D-aware image synthesis and stylization tasks. However,\nprevious NeRF-based work is limited to single scene stylization; training a\nmodel to generate 3D-aware cartoon faces with arbitrary styles remains\nunsolved. We propose ArtNeRF, a novel face stylization framework derived from\n3D-aware GAN to tackle this problem. In this framework, we utilize an\nexpressive generator to synthesize stylized faces and a triple-branch\ndiscriminator module to improve the visual quality and style consistency of the\ngenerated faces. Specifically, a style encoder based on contrastive learning is\nleveraged to extract robust low-dimensional embeddings of style images,\nempowering the generator with the knowledge of various styles. To smooth the\ntraining process of cross-domain transfer learning, we propose an adaptive\nstyle blending module which helps inject style information and allows users to\nfreely tune the level of stylization. 
We further introduce a neural rendering\nmodule to achieve efficient real-time rendering of images with higher\nresolutions. Extensive experiments demonstrate that ArtNeRF is versatile in\ngenerating high-quality 3D-aware cartoon faces with arbitrary styles.\n","authors":["Zichen Tang","Hongyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13710v1","updated":"2024-04-21T16:44:52Z","published":"2024-04-21T16:44:52Z","title":"SVGEditBench: A Benchmark Dataset for Quantitative Assessment of LLM's\n SVG Editing Capabilities","summary":" Text-to-image models have shown progress in recent years. Along with this\nprogress, generating vector graphics from text has also advanced. SVG is a\npopular format for vector graphics, and SVG represents a scene with XML text.\nTherefore, Large Language Models can directly process SVG code. Taking this\ninto account, we focused on editing SVG with LLMs. For quantitative evaluation\nof LLMs' ability to edit SVG, we propose SVGEditBench. SVGEditBench is a\nbenchmark for assessing the LLMs' ability to edit SVG code. We also show the\nGPT-4 and GPT-3.5 results when evaluated on the proposed benchmark. In the\nexperiments, GPT-4 showed superior performance to GPT-3.5 both quantitatively\nand qualitatively. The dataset is available at\nhttps://github.com/mti-lab/SVGEditBench.\n","authors":["Kunato Nishina","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13710v1.pdf","comment":"Accepted to Workshop on Graphic Design Understanding and Generation\n (GDUG), a CVPR2024 workshop. Dataset: https://github.com/mti-lab/SVGEditBench"},{"id":"http://arxiv.org/abs/2310.18737v2","updated":"2024-04-21T16:43:36Z","published":"2023-10-28T15:42:07Z","title":"Pre-training with Random Orthogonal Projection Image Modeling","summary":" Masked Image Modeling (MIM) is a powerful self-supervised strategy for visual\npre-training without the use of labels. MIM applies random crops to input\nimages, processes them with an encoder, and then recovers the masked inputs\nwith a decoder, which encourages the network to capture and learn structural\ninformation about objects and scenes. The intermediate feature representations\nobtained from MIM are suitable for fine-tuning on downstream tasks. In this\npaper, we propose an Image Modeling framework based on random orthogonal\nprojection instead of binary masking as in MIM. Our proposed Random Orthogonal\nProjection Image Modeling (ROPIM) reduces spatially-wise token information\nunder guaranteed bound on the noise variance and can be considered as masking\nentire spatial image area under locally varying masking degrees. Since ROPIM\nuses a random subspace for the projection that realizes the masking step, the\nreadily available complement of the subspace can be used during unmasking to\npromote recovery of removed information. In this paper, we show that using\nrandom orthogonal projection leads to superior performance compared to\ncrop-based masking. We demonstrate state-of-the-art results on several popular\nbenchmarks.\n","authors":["Maryam Haghighat","Peyman Moghadam","Shaheer Mohamed","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2310.18737v2.pdf","comment":"Published as a conference paper at the International Conference on\n Learning Representations (ICLR) 2024. 
19 pages"},{"id":"http://arxiv.org/abs/2404.13706v1","updated":"2024-04-21T16:35:16Z","published":"2024-04-21T16:35:16Z","title":"Concept Arithmetics for Circumventing Concept Inhibition in Diffusion\n Models","summary":" Motivated by ethical and legal concerns, the scientific community is actively\ndeveloping methods to limit the misuse of Text-to-Image diffusion models for\nreproducing copyrighted, violent, explicit, or personal information in the\ngenerated images. Simultaneously, researchers put these newly developed safety\nmeasures to the test by assuming the role of an adversary to find\nvulnerabilities and backdoors in them. We use compositional property of\ndiffusion models, which allows to leverage multiple prompts in a single image\ngeneration. This property allows us to combine other concepts, that should not\nhave been affected by the inhibition, to reconstruct the vector, responsible\nfor target concept generation, even though the direct computation of this\nvector is no longer accessible. We provide theoretical and empirical evidence\nwhy the proposed attacks are possible and discuss the implications of these\nfindings for safe model deployment. We argue that it is essential to consider\nall possible approaches to image generation with diffusion models that can be\nemployed by an adversary. Our work opens up the discussion about the\nimplications of concept arithmetics and compositional inference for safety\nmechanisms in diffusion models.\n Content Advisory: This paper contains discussions and model-generated content\nthat may be considered offensive. Reader discretion is advised.\n Project page: https://cs-people.bu.edu/vpetsiuk/arc\n","authors":["Vitali Petsiuk","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.13706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13704v1","updated":"2024-04-21T16:29:49Z","published":"2024-04-21T16:29:49Z","title":"PEMMA: Parameter-Efficient Multi-Modal Adaptation for Medical Image\n Segmentation","summary":" Imaging modalities such as Computed Tomography (CT) and Positron Emission\nTomography (PET) are key in cancer detection, inspiring Deep Neural Networks\n(DNN) models that merge these scans for tumor segmentation. When both CT and\nPET scans are available, it is common to combine them as two channels of the\ninput to the segmentation model. However, this method requires both scan types\nduring training and inference, posing a challenge due to the limited\navailability of PET scans, thereby sometimes limiting the process to CT scans\nonly. Hence, there is a need to develop a flexible DNN architecture that can be\ntrained/updated using only CT scans but can effectively utilize PET scans when\nthey become available. In this work, we propose a parameter-efficient\nmulti-modal adaptation (PEMMA) framework for lightweight upgrading of a\ntransformer-based segmentation model trained only on CT scans to also\nincorporate PET scans. The benefits of the proposed approach are two-fold.\nFirstly, we leverage the inherent modularity of the transformer architecture\nand perform low-rank adaptation (LoRA) of the attention weights to achieve\nparameter-efficient adaptation. Secondly, since the PEMMA framework attempts to\nminimize cross modal entanglement, it is possible to subsequently update the\ncombined model using only one modality, without causing catastrophic forgetting\nof the other modality. 
Our proposed method achieves results comparable to the\nperformance of early fusion techniques with just 8% of the trainable\nparameters, especially with a remarkable +28% improvement in the average Dice\nscore on PET scans when trained on a single modality.\n","authors":["Nada Saadi","Numan Saeed","Mohammad Yaqub","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2404.13704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13701v1","updated":"2024-04-21T16:05:38Z","published":"2024-04-21T16:05:38Z","title":"Semantic-Rearrangement-Based Multi-Level Alignment for Domain\n Generalized Segmentation","summary":" Domain generalized semantic segmentation is an essential computer vision\ntask, for which models only leverage source data to learn the capability of\ngeneralized semantic segmentation towards the unseen target domains. Previous\nworks typically address this challenge by global style randomization or feature\nregularization. In this paper, we argue that, given the observation that\ndifferent local semantic regions exhibit different visual characteristics from\nthe source domain to the target domain, methods focusing on global operations\nstruggle to capture such regional discrepancies, thus failing to construct\ndomain-invariant representations that are consistent from the local to the global\nlevel. Therefore, we propose the Semantic-Rearrangement-based Multi-Level\nAlignment (SRMA) to overcome this problem. SRMA first incorporates a Semantic\nRearrangement Module (SRM), which conducts semantic region randomization to\nenhance the diversity of the source domain sufficiently. A Multi-Level\nAlignment module (MLA) is subsequently proposed with the help of such diversity\nto establish the global-regional-local consistent domain-invariant\nrepresentations. By aligning features across randomized samples with\ndomain-neutral knowledge at multiple levels, SRMA provides a more robust way to\nhandle the source-target domain gap. Extensive experiments demonstrate the\nsuperiority of SRMA over the current state-of-the-art works on various\nbenchmarks.\n","authors":["Guanlong Jiao","Chenyangguang Zhang","Haonan Yin","Yu Mo","Biqing Huang","Hui Pan","Yi Luo","Jingxian Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13693v1","updated":"2024-04-21T15:42:56Z","published":"2024-04-21T15:42:56Z","title":"PV-S3: Advancing Automatic Photovoltaic Defect Detection using\n Semi-Supervised Semantic Segmentation of Electroluminescence Images","summary":" Photovoltaic (PV) systems allow us to tap into abundant solar energy;\nhowever, they require regular maintenance for high efficiency and to prevent\ndegradation. Traditional manual health checks, using Electroluminescence (EL)\nimaging, are expensive and logistically challenging, making automated defect\ndetection essential. Current automation approaches require extensive manual\nexpert labeling, which is time-consuming, expensive, and prone to errors. We\npropose PV-S3 (Photovoltaic-Semi Supervised Segmentation), a Semi-Supervised\nLearning approach for semantic segmentation of defects in EL images that\nreduces reliance on extensive labeling. PV-S3 is a deep learning model trained\nusing a few labeled images along with numerous unlabeled images. We introduce a\nnovel Semi Cross-Entropy loss function to train PV-S3, which addresses the\nchallenges specific to automated PV defect detection, such as diverse defect\ntypes and class imbalance. 
We evaluate PV-S3 on multiple datasets and\ndemonstrate its effectiveness and adaptability. With merely 20% labeled\nsamples, we achieve an absolute improvement of 9.7% in IoU, 29.9% in Precision,\n12.75% in Recall, and 20.42% in F1-Score over prior state-of-the-art supervised\nmethod (which uses 100% labeled samples) on UCF-EL dataset (largest dataset\navailable for semantic segmentation of EL images) showing improvement in\nperformance while reducing the annotation costs by 80%.\n","authors":["Abhishek Jha","Yogesh Rawat","Shruti Vyas"],"pdf_url":"https://arxiv.org/pdf/2404.13693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13692v1","updated":"2024-04-21T15:40:41Z","published":"2024-04-21T15:40:41Z","title":"A sustainable development perspective on urban-scale roof greening\n priorities and benefits","summary":" Greenspaces are tightly linked to human well-being. Yet, rapid urbanization\nhas exacerbated greenspace exposure inequality and declining human life\nquality. Roof greening has been recognized as an effective strategy to mitigate\nthese negative impacts. Understanding priorities and benefits is crucial to\npromoting green roofs. Here, using geospatial big data, we conduct an\nurban-scale assessment of roof greening at a single building level in Hong Kong\nfrom a sustainable development perspective. We identify that 85.3\\% of\nbuildings reveal potential and urgent demand for roof greening. We further find\ngreen roofs could increase greenspace exposure by \\textasciitilde61\\% and\nproduce hundreds of millions (HK\\$) in economic benefits annually but play a\nsmall role in urban heat mitigation (\\textasciitilde0.15\\degree{C}) and annual\ncarbon emission offsets (\\textasciitilde0.8\\%). Our study offers a\ncomprehensive assessment of roof greening, which could provide reference for\nsustainable development in cities worldwide, from data utilization to solutions\nand findings.\n","authors":["Jie Shao","Wei Yao","Lei Luo","Linzhou Zeng","Zhiyi He","Puzuo Wang","Huadong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.13692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13691v1","updated":"2024-04-21T15:40:32Z","published":"2024-04-21T15:40:32Z","title":"A Complete System for Automated 3D Semantic-Geometric Mapping of\n Corrosion in Industrial Environments","summary":" Corrosion, a naturally occurring process leading to the deterioration of\nmetallic materials, demands diligent detection for quality control and the\npreservation of metal-based objects, especially within industrial contexts.\nTraditional techniques for corrosion identification, including ultrasonic\ntesting, radio-graphic testing, and magnetic flux leakage, necessitate the\ndeployment of expensive and bulky equipment on-site for effective data\nacquisition. An unexplored alternative involves employing lightweight,\nconventional camera systems, and state-of-the-art computer vision methods for\nits identification.\n In this work, we propose a complete system for semi-automated corrosion\nidentification and mapping in industrial environments. We leverage recent\nadvances in LiDAR-based methods for localization and mapping, with vision-based\nsemantic segmentation deep learning techniques, in order to build\nsemantic-geometric maps of industrial environments. 
Unlike previous corrosion\nidentification systems available in the literature, our multi-modal\nsystem is low-cost, portable, semi-autonomous, and allows large\ndatasets to be collected by untrained personnel.\n A set of experiments in an indoor laboratory environment demonstrates\nquantitatively the high accuracy of the employed LiDAR-based 3D mapping and\nlocalization system, with less than $0.05m$ and $0.02m$ average absolute and\nrelative pose errors. Also, our data-driven semantic segmentation model\nachieves around 70\\% precision when trained with our pixel-wise manually\nannotated dataset.\n","authors":["Rui Pimentel de Figueiredo","Stefan Nordborg Eriksen","Ignacio Rodriguez","Simon Bøgh"],"pdf_url":"https://arxiv.org/pdf/2404.13691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02439v3","updated":"2024-04-21T15:20:15Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving which often\nrequires out-of-box thoughts and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. 
The\ndataset, code, and models will be released online.\nhttps://zhongshsh.github.io/CLoT/.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v3.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.13686v1","updated":"2024-04-21T15:16:05Z","published":"2024-04-21T15:16:05Z","title":"Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image\n Synthesis","summary":" Recently, a series of diffusion-aware distillation algorithms have emerged to\nalleviate the computational overhead associated with the multi-step inference\nprocess of Diffusion Models (DMs). Current distillation techniques often\ndichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii)\nODE Trajectory Reformulation. However, these approaches suffer from severe\nperformance degradation or domain shifts. To address these limitations, we\npropose Hyper-SD, a novel framework that synergistically amalgamates the\nadvantages of ODE Trajectory Preservation and Reformulation, while maintaining\nnear-lossless performance during step compression. Firstly, we introduce\nTrajectory Segmented Consistency Distillation to progressively perform\nconsistent distillation within pre-defined time-step segments, which\nfacilitates the preservation of the original ODE trajectory from a higher-order\nperspective. Secondly, we incorporate human feedback learning to boost the\nperformance of the model in a low-step regime and mitigate the performance loss\nincurred by the distillation process. Thirdly, we integrate score distillation\nto further improve the low-step generation capability of the model and offer\nthe first attempt to leverage a unified LoRA to support the inference process\nat all steps. Extensive experiments and user studies demonstrate that Hyper-SD\nachieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5.\nFor example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and\n+0.51 in Aes Score in the 1-step inference.\n","authors":["Yuxi Ren","Xin Xia","Yanzuo Lu","Jiacheng Zhang","Jie Wu","Pan Xie","Xing Wang","Xuefeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.13686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13680v1","updated":"2024-04-21T14:43:31Z","published":"2024-04-21T14:43:31Z","title":"PoseAnimate: Zero-shot high fidelity pose controllable character\n animation","summary":" Image-to-video (I2V) generation aims to create a video sequence from a single\nimage, which requires high temporal coherence and visual fidelity with the\nsource image. However, existing approaches suffer from character appearance\ninconsistency and poor preservation of fine details. 
Moreover, they require a\nlarge amount of video data for training, which can be computationally\ndemanding. To address these limitations, we propose PoseAnimate, a novel\nzero-shot I2V framework for character animation. PoseAnimate contains three key\ncomponents: 1) Pose-Aware Control Module (PACM) incorporates diverse pose\nsignals into conditional embeddings, to preserve character-independent content\nand maintain precise alignment of actions. 2) Dual Consistency Attention Module\n(DCAM) enhances temporal consistency, and retains character identity and\nintricate background details. 3) Mask-Guided Decoupling Module (MGDM) refines\ndistinct feature perception, improving animation fidelity by decoupling the\ncharacter and background. We also propose a Pose Alignment Transition Algorithm\n(PATA) to ensure smooth action transition. Extensive experiment results\ndemonstrate that our approach outperforms the state-of-the-art training-based\nmethods in terms of character consistency and detail fidelity. Moreover, it\nmaintains a high level of temporal coherence throughout the generated\nanimations.\n","authors":["Bingwen Zhu","Fanyi Wang","Tianyi Lu","Peng Liu","Jingwen Su","Jinxiu Liu","Yanhao Zhang","Zuxuan Wu","Yu-Gang Jiang","Guo-Jun Qi"],"pdf_url":"https://arxiv.org/pdf/2404.13680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13679v1","updated":"2024-04-21T14:42:10Z","published":"2024-04-21T14:42:10Z","title":"GScream: Learning 3D Geometry and Feature Consistent Gaussian Splatting\n for Object Removal","summary":" This paper tackles the intricate challenge of object removal to update the\nradiance field using the 3D Gaussian Splatting. The main challenges of this\ntask lie in the preservation of geometric consistency and the maintenance of\ntexture coherence in the presence of the substantial discrete nature of\nGaussian primitives. We introduce a robust framework specifically designed to\novercome these obstacles. The key insight of our approach is the enhancement of\ninformation exchange among visible and invisible areas, facilitating content\nrestoration in terms of both geometry and texture. Our methodology begins with\noptimizing the positioning of Gaussian primitives to improve geometric\nconsistency across both removed and visible areas, guided by an online\nregistration process informed by monocular depth estimation. Following this, we\nemploy a novel feature propagation mechanism to bolster texture coherence,\nleveraging a cross-attention design that bridges sampling Gaussians from both\nuncertain and certain areas. This innovative approach significantly refines the\ntexture coherence within the final radiance field. Extensive experiments\nvalidate that our method not only elevates the quality of novel view synthesis\nfor scenes undergoing object removal but also showcases notable efficiency\ngains in training and rendering speeds.\n","authors":["Yuxin Wang","Qianyi Wu","Guofeng Zhang","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.13679v1.pdf","comment":"Project Page: https://w-ted.github.io/publications/gscream"},{"id":"http://arxiv.org/abs/2404.13671v1","updated":"2024-04-21T14:22:04Z","published":"2024-04-21T14:22:04Z","title":"FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and\n High-Quality Localization","summary":" Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies\ndirectly without access to any known normal or abnormal samples within the\ntarget item categories. 
Existing approaches typically rely on the robust\ngeneralization capabilities of multimodal pretrained models, computing\nsimilarities between manually crafted textual features representing \"normal\" or\n\"abnormal\" semantics and image features to detect anomalies and localize\nanomalous patches. However, the generic descriptions of \"abnormal\" often fail\nto precisely match diverse types of anomalies across different object\ncategories. Additionally, computing feature similarities for single patches\nstruggles to pinpoint specific locations of anomalies with various sizes and\nscales. To address these issues, we propose a novel ZSAD method called FiLo,\ncomprising two components: adaptively learned Fine-Grained Description (FG-Des)\nand position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces\nfine-grained anomaly descriptions for each category using Large Language Models\n(LLMs) and employs adaptively learned textual templates to enhance the accuracy\nand interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for\npreliminary localization, position-enhanced text prompts, and Multi-scale\nMulti-shape Cross-modal Interaction (MMCI) module, facilitates more accurate\nlocalization of anomalies of different sizes and shapes. Experimental results\non datasets like MVTec and VisA demonstrate that FiLo significantly improves\nthe performance of ZSAD in both detection and localization, achieving\nstate-of-the-art performance with an image-level AUC of 83.9% and a pixel-level\nAUC of 95.9% on the VisA dataset.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Hao Li","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13667v1","updated":"2024-04-21T14:03:34Z","published":"2024-04-21T14:03:34Z","title":"MathNet: A Data-Centric Approach for Printed Mathematical Expression\n Recognition","summary":" Printed mathematical expression recognition (MER) models are usually trained\nand tested using LaTeX-generated mathematical expressions (MEs) as input and\nthe LaTeX source code as ground truth. As the same ME can be generated by\nvarious different LaTeX source codes, this leads to unwanted variations in the\nground truth data that bias test performance results and hinder efficient\nlearning. In addition, the use of only one font to generate the MEs heavily\nlimits the generalization of the reported results to realistic scenarios. We\npropose a data-centric approach to overcome this problem, and present\nconvincing experimental results: Our main contribution is an enhanced LaTeX\nnormalization to map any LaTeX ME to a canonical form. Based on this process,\nwe developed an improved version of the benchmark dataset im2latex-100k,\nfeaturing 30 fonts instead of one. Second, we introduce the real-world dataset\nrealFormula, with MEs extracted from papers. Third, we developed a MER model,\nMathNet, based on a convolutional vision transformer, with superior results on\nall four test sets (im2latex-100k, im2latexv2, realFormula, and InftyMDB-1),\noutperforming the previous state of the art by up to 88.3%.\n","authors":["Felix M. Schmitt-Koopmann","Elaine M. 
Huang","Hans-Peter Hutter","Thilo Stadelmann","Alireza Darvishy"],"pdf_url":"https://arxiv.org/pdf/2404.13667v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.13659v1","updated":"2024-04-21T13:29:42Z","published":"2024-04-21T13:29:42Z","title":"LMFNet: An Efficient Multimodal Fusion Approach for Semantic\n Segmentation in High-Resolution Remote Sensing","summary":" Despite the rapid evolution of semantic segmentation for land cover\nclassification in high-resolution remote sensing imagery, integrating multiple\ndata modalities such as Digital Surface Model (DSM), RGB, and Near-infrared\n(NIR) remains a challenge. Current methods often process only two types of\ndata, missing out on the rich information that additional modalities can\nprovide. Addressing this gap, we propose a novel \\textbf{L}ightweight\n\\textbf{M}ultimodal data \\textbf{F}usion \\textbf{Net}work (LMFNet) to\naccomplish the tasks of fusion and semantic segmentation of multimodal remote\nsensing images. LMFNet uniquely accommodates various data types simultaneously,\nincluding RGB, NirRG, and DSM, through a weight-sharing, multi-branch vision\ntransformer that minimizes parameter count while ensuring robust feature\nextraction. Our proposed multimodal fusion module integrates a\n\\textit{Multimodal Feature Fusion Reconstruction Layer} and \\textit{Multimodal\nFeature Self-Attention Fusion Layer}, which can reconstruct and fuse multimodal\nfeatures. Extensive testing on public datasets such as US3D, ISPRS Potsdam, and\nISPRS Vaihingen demonstrates the effectiveness of LMFNet. Specifically, it\nachieves a mean Intersection over Union ($mIoU$) of 85.09\\% on the US3D\ndataset, marking a significant improvement over existing methods. Compared to\nunimodal approaches, LMFNet shows a 10\\% enhancement in $mIoU$ with only a 0.5M\nincrease in parameter count. Furthermore, against bimodal methods, our approach\nwith trilateral inputs enhances $mIoU$ by 0.46 percentage points.\n","authors":["Tong Wang","Guanzhou Chen","Xiaodong Zhang","Chenxi Liu","Xiaoliang Tan","Jiaqi Wang","Chanjuan He","Wenlin Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13657v1","updated":"2024-04-21T13:25:46Z","published":"2024-04-21T13:25:46Z","title":"MLP: Motion Label Prior for Temporal Sentence Localization in Untrimmed\n 3D Human Motions","summary":" In this paper, we address the unexplored question of temporal sentence\nlocalization in human motions (TSLM), aiming to locate a target moment from a\n3D human motion that semantically corresponds to a text query. Considering that\n3D human motions are captured using specialized motion capture devices, motions\nwith only a few joints lack complex scene information like objects and\nlighting. Due to this character, motion data has low contextual richness and\nsemantic ambiguity between frames, which limits the accuracy of predictions\nmade by current video localization frameworks extended to TSLM to only a rough\nlevel. To refine this, we devise two novel label-prior-assisted training\nschemes: one embed prior knowledge of foreground and background to highlight\nthe localization chances of target moments, and the other forces the originally\nrough predictions to overlap with the more accurate predictions obtained from\nthe flipped start/end prior label sequences during recovery training. We show\nthat injecting label-prior knowledge into the model is crucial for improving\nperformance at high IoU. 
In our constructed TSLM benchmark, our model termed\nMLP achieves a recall of 44.13 at IoU@0.7 on the BABEL dataset and 71.17 on\nHumanML3D (Restore), outperforming prior works. Finally, we showcase the\npotential of our approach in corpus-level moment retrieval. Our source code is\nopenly accessible at https://github.com/eanson023/mlp.\n","authors":["Sheng Yan","Mengyuan Liu","Yong Wang","Yang Liu","Chen Chen","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13657v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13648v1","updated":"2024-04-21T12:50:38Z","published":"2024-04-21T12:50:38Z","title":"Data-independent Module-aware Pruning for Hierarchical Vision\n Transformers","summary":" Hierarchical vision transformers (ViTs) have two advantages over conventional\nViTs. First, hierarchical ViTs achieve linear computational complexity with\nrespect to image size by local self-attention. Second, hierarchical ViTs create\nhierarchical feature maps by merging image patches in deeper layers for dense\nprediction. However, existing pruning methods ignore the unique properties of\nhierarchical ViTs and use the magnitude value as the weight importance. This\napproach leads to two main drawbacks. First, the \"local\" attention weights are\ncompared at a \"global\" level, which may cause some \"locally\" important weights\nto be pruned due to their relatively small magnitude \"globally\". The second\nissue with magnitude pruning is that it fails to consider the distinct weight\ndistributions of the network, which are essential for extracting coarse to\nfine-grained features at various hierarchical levels.\n To solve the aforementioned issues, we have developed a Data-independent\nModule-Aware Pruning method (DIMAP) to compress hierarchical ViTs. To ensure\nthat \"local\" attention weights at different hierarchical levels are compared\nfairly in terms of their contribution, we treat them as a module and examine\ntheir contribution by analyzing their information distortion. Furthermore, we\nintroduce a novel weight metric that is solely based on weights and does not\nrequire input images, thereby eliminating the dependence on the patch merging\nprocess. Our method validates its usefulness and strengths on Swin Transformers\nof different sizes on ImageNet-1k classification. Notably, the top-5 accuracy\ndrop is only 0.07% when we remove 52.5% FLOPs and 52.7% parameters of Swin-B.\nWhen we reduce 33.2% FLOPs and 33.2% parameters of Swin-S, we can even achieve\na 0.8% higher relative top-5 accuracy than the original model. Code is\navailable at: https://github.com/he-y/Data-independent-Module-Aware-Pruning\n","authors":["Yang He","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13648v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2404.13640v1","updated":"2024-04-21T12:33:07Z","published":"2024-04-21T12:33:07Z","title":"Beyond Alignment: Blind Video Face Restoration via Parsing-Guided\n Temporal-Coherent Transformer","summary":" Multiple complex degradations are coupled in low-quality video faces in the\nreal world. Therefore, blind video face restoration is a highly challenging\nill-posed problem, requiring not only hallucinating high-fidelity details but\nalso enhancing temporal coherence across diverse pose variations. 
Restoring\neach frame independently in a naive manner inevitably introduces temporal\nincoherence and artifacts from pose changes and keypoint localization errors.\nTo address this, we propose the first blind video face restoration approach\nwith a novel parsing-guided temporal-coherent transformer (PGTFormer) without\npre-alignment. PGTFormer leverages semantic parsing guidance to select optimal\nface priors for generating temporally coherent artifact-free results.\nSpecifically, we pre-train a temporal-spatial vector quantized auto-encoder on\nhigh-quality video face datasets to extract expressive context-rich priors.\nThen, the temporal parse-guided codebook predictor (TPCP) restores faces in\ndifferent poses based on face parsing context cues without performing face\npre-alignment. This strategy reduces artifacts and mitigates jitter caused by\ncumulative errors from face pre-alignment. Finally, the temporal fidelity\nregulator (TFR) enhances fidelity through temporal feature interaction and\nimproves video temporal consistency. Extensive experiments on face videos show\nthat our method outperforms previous face restoration baselines. The code will\nbe released on\n\\href{https://github.com/kepengxu/PGTFormer}{https://github.com/kepengxu/PGTFormer}.\n","authors":["Kepeng Xu","Li Xu","Gang He","Wenxin Yu","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2404.13640v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2404.13621v1","updated":"2024-04-21T11:21:27Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. Robustness of these\ntechniques, however, remains a concern, particularly in the face of adversarial\nattacks that have been proven to deceive state-of-the-art deep neural networks\nin many domains. Surprisingly, the robustness of scene flow networks against\nsuch attacks has not been thoroughly investigated. To address this problem, the\nproposed approach aims to bridge this gap by introducing adversarial white-box\nattacks specifically tailored for scene flow networks. Experimental results\nshow that the generated adversarial examples obtain up to 33.7 relative\ndegradation in average end-point error on the KITTI and FlyingThings3D\ndatasets. The study also reveals the significant impact that attacks targeting\npoint clouds in only one dimension or color channel have on average end-point\nerror. Analyzing the success and failure of these attacks on the scene flow\nnetworks and their 2D optical flow network variants show a higher vulnerability\nfor the optical flow networks.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13611v1","updated":"2024-04-21T10:41:04Z","published":"2024-04-21T10:41:04Z","title":"Video sentence grounding with temporally global textual knowledge","summary":" Temporal sentence grounding involves the retrieval of a video moment with a\nnatural language query. Many existing works directly incorporate the given\nvideo and temporally localized query for temporal grounding, overlooking the\ninherent domain gap between different modalities. 
In this paper, we utilize\npseudo-query features containing extensive temporally global textual knowledge\nsourced from the same video-query pair, to enhance the bridging of domain gaps\nand attain a heightened level of similarity between multi-modal features.\nSpecifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve\nan improved alignment of visual and comprehensive pseudo-query features within\nthe feature space through contrastive learning. Subsequently, we utilize\nlearnable prompts to encapsulate the knowledge of pseudo-queries, propagating\nthem into the textual encoder and multi-modal fusion module, further enhancing\nthe feature alignment between visual and language for better temporal\ngrounding. Extensive experiments conducted on the Charades-STA and\nActivityNet-Captions datasets demonstrate the effectiveness of our method.\n","authors":["Cai Chen","Runzhong Zhang","Jianjun Gao","Kejun Wu","Kim-Hui Yap","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13605v1","updated":"2024-04-21T10:28:34Z","published":"2024-04-21T10:28:34Z","title":"Turb-Seg-Res: A Segment-then-Restore Pipeline for Dynamic Videos with\n Atmospheric Turbulence","summary":" Tackling image degradation due to atmospheric turbulence, particularly in\ndynamic environment, remains a challenge for long-range imaging systems.\nExisting techniques have been primarily designed for static scenes or scenes\nwith small motion. This paper presents the first segment-then-restore pipeline\nfor restoring the videos of dynamic scenes in turbulent environment. We\nleverage mean optical flow with an unsupervised motion segmentation method to\nseparate dynamic and static scene components prior to restoration. After camera\nshake compensation and segmentation, we introduce foreground/background\nenhancement leveraging the statistics of turbulence strength and a transformer\nmodel trained on a novel noise-based procedural turbulence generator for fast\ndataset augmentation. Benchmarked against existing restoration methods, our\napproach restores most of the geometric distortion and enhances sharpness for\nvideos. We make our code, simulator, and data publicly available to advance the\nfield of video restoration from turbulence: riponcs.github.io/TurbSegRes\n","authors":["Ripon Kumar Saha","Dehao Qin","Nianyi Li","Jinwei Ye","Suren Jayasuriya"],"pdf_url":"https://arxiv.org/pdf/2404.13605v1.pdf","comment":"CVPR 2024 Paper"},{"id":"http://arxiv.org/abs/2404.08544v2","updated":"2024-04-21T10:24:45Z","published":"2024-04-12T15:37:53Z","title":"Analyzing Decades-Long Environmental Changes in Namibia Using Archival\n Aerial Photography and Deep Learning","summary":" This study explores object detection in historical aerial photographs of\nNamibia to identify long-term environmental changes. Specifically, we aim to\nidentify key objects -- Waterholes, Omuti homesteads, and Big trees -- around\nOshikango in Namibia using sub-meter gray-scale aerial imagery from 1943 and\n1972. In this work, we propose a workflow for analyzing historical aerial\nimagery using a deep semantic segmentation model on sparse hand-labels. To this\nend, we employ a number of strategies including class-weighting,\npseudo-labeling and empirical p-value-based filtering to balance skewed and\nsparse representations of objects in the ground truth data. 
Results demonstrate\nthe benefits of these different training strategies resulting in an average\n$F_1=0.661$ and $F_1=0.755$ over the three objects of interest for the 1943 and\n1972 imagery, respectively. We also identified that the average size of\nWaterhole and Big trees increased while the average size of Omuti homesteads\ndecreased between 1943 and 1972 reflecting some of the local effects of the\nmassive post-Second World War economic, agricultural, demographic, and\nenvironmental changes. This work also highlights the untapped potential of\nhistorical aerial photographs in understanding long-term environmental changes\nbeyond Namibia (and Africa). With the lack of adequate satellite technology in\nthe past, archival aerial photography offers a great alternative to uncover\ndecades-long environmental changes.\n","authors":["Girmaw Abebe Tadesse","Caleb Robinson","Gilles Quentin Hacheme","Akram Zaytar","Rahul Dodhia","Tsering Wangyal Shawa","Juan M. Lavista Ferres","Emmanuel H. Kreike"],"pdf_url":"https://arxiv.org/pdf/2404.08544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06629v3","updated":"2024-04-21T10:05:06Z","published":"2023-10-10T13:48:18Z","title":"EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention","summary":" Thanks to the advancement of deep learning technology, vision transformers\nhave demonstrated competitive performance in various computer vision tasks.\nUnfortunately, vision transformers still face some challenges such as high\ncomputational complexity and absence of desirable inductive bias. To alleviate\nthese issues, we propose a novel Bi-Fovea Self-Attention (BFSA) inspired by the\nphysiological structure and visual properties of eagle eyes. This BFSA is used\nto simulate the shallow and deep fovea of eagle vision, prompting the network\nto learn the feature representation of targets from coarse to fine.\nAdditionally, we design a Bionic Eagle Vision (BEV) block based on BFSA. It\ncombines the advantages of convolution and introduces a novel Bi-Fovea\nFeedforward Network (BFFN) to mimic the way the biological visual cortex\nprocesses information hierarchically and in parallel. Furthermore, we develop a\nunified and efficient pyramid backbone network family called Eagle Vision\nTransformers (EViTs) by stacking BEV blocks. Experimental results show that\nEViTs exhibit highly competitive performance in various computer vision tasks\nsuch as image classification, object detection and semantic segmentation.\nEspecially in terms of performance and computational efficiency, EViTs show\nsignificant advantages compared with other counterparts. Code is available at\nhttps://github.com/nkusyl/EViT\n","authors":["Yulong Shi","Mingwei Sun","Yongshuai Wang","Jiahao Ma","Zengqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2310.06629v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.09181v2","updated":"2024-04-21T09:51:58Z","published":"2024-02-14T13:51:56Z","title":"OmniMedVQA: A New Large-Scale Comprehensive Evaluation Benchmark for\n Medical LVLM","summary":" Large Vision-Language Models (LVLMs) have demonstrated remarkable\ncapabilities in various multimodal tasks. However, their potential in the\nmedical domain remains largely unexplored. 
A significant challenge arises from\nthe scarcity of diverse medical images spanning various modalities and\nanatomical regions, which is essential in real-world medical applications. To\nsolve this problem, in this paper, we introduce OmniMedVQA, a novel\ncomprehensive medical Visual Question Answering (VQA) benchmark. This benchmark\nis collected from 73 different medical datasets, including 12 different\nmodalities and covering more than 20 distinct anatomical regions. Importantly,\nall images in this benchmark are sourced from authentic medical scenarios,\nensuring alignment with the requirements of the medical field and suitability\nfor evaluating LVLMs. Through our extensive experiments, we have found that\nexisting LVLMs struggle to address these medical VQA problems effectively.\nMoreover, what surprises us is that medical-specialized LVLMs even exhibit\ninferior performance to those general-domain models, calling for a more\nversatile and robust LVLM in the biomedical field. The evaluation results not\nonly reveal the current limitations of LVLM in understanding real medical\nimages but also highlight our dataset's significance. Our code with dataset are\navailable at https://github.com/OpenGVLab/Multi-Modality-Arena.\n","authors":["Yutao Hu","Tianbin Li","Quanfeng Lu","Wenqi Shao","Junjun He","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2402.09181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13594v1","updated":"2024-04-21T09:23:36Z","published":"2024-04-21T09:23:36Z","title":"Lost in Space: Probing Fine-grained Spatial Understanding in Vision and\n Language Resamplers","summary":" An effective method for combining frozen large language models (LLM) and\nvisual encoders involves a resampler module that creates a `visual prompt'\nwhich is provided to the LLM, along with the textual prompt. While this\napproach has enabled impressive performance across many coarse-grained tasks\nlike image captioning and visual question answering, more fine-grained tasks\nthat require spatial understanding have not been thoroughly examined. In this\npaper, we use \\textit{diagnostic classifiers} to measure the extent to which\nthe visual prompt produced by the resampler encodes spatial information. Our\nresults show that this information is largely absent from the resampler output\nwhen kept frozen during training of the classifiers. However, when the\nresampler and classifier are trained jointly, we observe a significant\nperformance boost. This shows that the compression achieved by the resamplers\ncan in principle encode the requisite spatial information, but that more\nobject-aware objectives are needed at the pretraining stage to facilitate this\ncapability\n","authors":["Georgios Pantazopoulos","Alessandro Suglia","Oliver Lemon","Arash Eshghi"],"pdf_url":"https://arxiv.org/pdf/2404.13594v1.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.13591v1","updated":"2024-04-21T09:15:02Z","published":"2024-04-21T09:15:02Z","title":"MARVEL: Multidimensional Abstraction and Reasoning through Visual\n Evaluation and Learning","summary":" While multi-modal large language models (MLLMs) have shown significant\nprogress on many popular visual reasoning benchmarks, whether they possess\nabstract visual reasoning abilities remains an open question. 
Similar to the\nSudoku puzzles, abstract visual reasoning (AVR) problems require finding\nhigh-level patterns (e.g., repetition constraints) that control the input\nshapes (e.g., digits) in a specific task configuration (e.g., matrix). However,\nexisting AVR benchmarks only considered a limited set of patterns (addition,\nconjunction), input shapes (rectangle, square), and task configurations (3 by 3\nmatrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce\nMARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core\nknowledge patterns, geometric and abstract shapes, and five different task\nconfigurations. To inspect whether the model accuracy is grounded in perception\nand reasoning, MARVEL complements the general AVR question with perception\nquestions in a hierarchical evaluation framework. We conduct comprehensive\nexperiments on MARVEL with nine representative MLLMs in zero-shot and few-shot\nsettings. Our experiments reveal that all models show near-random performance\non the AVR question, with significant performance gaps (40%) compared to humans\nacross all patterns and task configurations. Further analysis of perception\nquestions reveals that MLLMs struggle to comprehend the visual features\n(near-random performance) and even count the panels in the puzzle ( <45%),\nhindering their ability for abstract reasoning. We release our entire code and\ndataset.\n","authors":["Yifan Jiang","Jiarui Zhang","Kexuan Sun","Zhivar Sourati","Kian Ahrabian","Kaixin Ma","Filip Ilievski","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2404.13591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14593v2","updated":"2024-04-21T09:02:36Z","published":"2023-07-27T02:36:13Z","title":"FakeTracer: Catching Face-swap DeepFakes via Implanting Traces in\n Training","summary":" Face-swap DeepFake is an emerging AI-based face forgery technique that can\nreplace the original face in a video with a generated face of the target\nidentity while retaining consistent facial attributes such as expression and\norientation. Due to the high privacy of faces, the misuse of this technique can\nraise severe social concerns, drawing tremendous attention to defend against\nDeepFakes recently. In this paper, we describe a new proactive defense method\ncalled FakeTracer to expose face-swap DeepFakes via implanting traces in\ntraining. Compared to general face-synthesis DeepFake, the face-swap DeepFake\nis more complex as it involves identity change, is subjected to the\nencoding-decoding process, and is trained unsupervised, increasing the\ndifficulty of implanting traces into the training phase. To effectively defend\nagainst face-swap DeepFake, we design two types of traces, sustainable trace\n(STrace) and erasable trace (ETrace), to be added to training faces. During the\ntraining, these manipulated faces affect the learning of the face-swap DeepFake\nmodel, enabling it to generate faces that only contain sustainable traces. In\nlight of these two traces, our method can effectively expose DeepFakes by\nidentifying them. 
Extensive experiments corroborate the efficacy of our method\non defending against face-swap DeepFake.\n","authors":["Pu Sun","Honggang Qi","Yuezun Li","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.14593v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13584v1","updated":"2024-04-21T08:52:22Z","published":"2024-04-21T08:52:22Z","title":"Rethink Arbitrary Style Transfer with Transformer and Contrastive\n Learning","summary":" Arbitrary style transfer holds widespread attention in research and boasts\nnumerous practical applications. The existing methods, which either employ\ncross-attention to incorporate deep style attributes into content attributes or\nuse adaptive normalization to adjust content features, fail to generate\nhigh-quality stylized images. In this paper, we introduce an innovative\ntechnique to improve the quality of stylized images. Firstly, we propose Style\nConsistency Instance Normalization (SCIN), a method to refine the alignment\nbetween content and style features. In addition, we have developed an\nInstance-based Contrastive Learning (ICL) approach designed to understand the\nrelationships among various styles, thereby enhancing the quality of the\nresulting stylized images. Recognizing that VGG networks are more adept at\nextracting classification features and need to be better suited for capturing\nstyle features, we have also introduced the Perception Encoder (PE) to capture\nstyle features. Extensive experiments demonstrate that our proposed method\ngenerates high-quality stylized images and effectively prevents artifacts\ncompared with the existing state-of-the-art methods.\n","authors":["Zhanjie Zhang","Jiakai Sun","Guangyuan Li","Lei Zhao","Quanwei Zhang","Zehua Lan","Haolin Yin","Wei Xing","Huaizhong Lin","Zhiwen Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.13584v1.pdf","comment":"Accepted by CVIU"},{"id":"http://arxiv.org/abs/2404.13579v1","updated":"2024-04-21T08:37:43Z","published":"2024-04-21T08:37:43Z","title":"LTOS: Layout-controllable Text-Object Synthesis via Adaptive\n Cross-attention Fusions","summary":" Controllable text-to-image generation synthesizes visual text and objects in\nimages with certain conditions, which are frequently applied to emoji and\nposter generation. Visual text rendering and layout-to-image generation tasks\nhave been popular in controllable text-to-image generation. However, each of\nthese tasks typically focuses on single modality generation or rendering,\nleaving yet-to-be-bridged gaps between the approaches correspondingly designed\nfor each of the tasks. In this paper, we combine text rendering and\nlayout-to-image generation tasks into a single task: layout-controllable\ntext-object synthesis (LTOS) task, aiming at synthesizing images with object\nand visual text based on predefined object layout and text contents. As\ncompliant datasets are not readily available for our LTOS task, we construct a\nlayout-aware text-object synthesis dataset, containing elaborate well-aligned\nlabels of visual text and object information. Based on the dataset, we propose\na layout-controllable text-object adaptive fusion (TOF) framework, which\ngenerates images with clear, legible visual text and plausible objects. We\nconstruct a visual-text rendering module to synthesize text and employ an\nobject-layout control module to generate objects while integrating the two\nmodules to harmoniously generate and integrate text content and objects in\nimages. 
To better the image-text integration, we propose a self-adaptive\ncross-attention fusion module that helps the image generation to attend more to\nimportant text information. Within such a fusion module, we use a self-adaptive\nlearnable factor to learn to flexibly control the influence of cross-attention\noutputs on image generation. Experimental results show that our method\noutperforms the state-of-the-art in LTOS, text rendering, and layout-to-image\ntasks, enabling harmonious visual text rendering and object generation.\n","authors":["Xiaoran Zhao","Tianhao Wu","Yu Lai","Zhiliang Tian","Zhen Huang","Yahui Liu","Zejiang He","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.13579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12322v2","updated":"2024-04-21T08:37:09Z","published":"2024-04-18T16:53:08Z","title":"Generalizable Face Landmarking Guided by Conditional Face Warping","summary":" As a significant step for human face modeling, editing, and generation, face\nlandmarking aims at extracting facial keypoints from images. A generalizable\nface landmarker is required in practice because real-world facial images, e.g.,\nthe avatars in animations and games, are often stylized in various ways.\nHowever, achieving generalizable face landmarking is challenging due to the\ndiversity of facial styles and the scarcity of labeled stylized faces. In this\nstudy, we propose a simple but effective paradigm to learn a generalizable face\nlandmarker based on labeled real human faces and unlabeled stylized faces. Our\nmethod learns the face landmarker as the key module of a conditional face\nwarper. Given a pair of real and stylized facial images, the conditional face\nwarper predicts a warping field from the real face to the stylized one, in\nwhich the face landmarker predicts the ending points of the warping field and\nprovides us with high-quality pseudo landmarks for the corresponding stylized\nfacial images. Applying an alternating optimization strategy, we learn the face\nlandmarker to minimize $i)$ the discrepancy between the stylized faces and the\nwarped real ones and $ii)$ the prediction errors of both real and pseudo\nlandmarks. Experiments on various datasets show that our method outperforms\nexisting state-of-the-art domain adaptation methods in face landmarking tasks,\nleading to a face landmarker with better generalizability. Code is available at\nhttps://plustwo0.github.io/project-face-landmarker.\n","authors":["Jiayi Liang","Haotian Liu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2404.12322v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13576v1","updated":"2024-04-21T08:28:52Z","published":"2024-04-21T08:28:52Z","title":"I2CANSAY:Inter-Class Analogical Augmentation and Intra-Class\n Significance Analysis for Non-Exemplar Online Task-Free Continual Learning","summary":" Online task-free continual learning (OTFCL) is a more challenging variant of\ncontinual learning which emphasizes the gradual shift of task boundaries and\nlearns in an online mode. Existing methods rely on a memory buffer composed of\nold samples to prevent forgetting. However,the use of memory buffers not only\nraises privacy concerns but also hinders the efficient learning of new samples.\nTo address this problem, we propose a novel framework called I2CANSAY that gets\nrid of the dependence on memory buffers and efficiently learns the knowledge of\nnew data from one-shot samples. Concretely, our framework comprises two main\nmodules. 
Firstly, the Inter-Class Analogical Augmentation (ICAN) module\ngenerates diverse pseudo-features for old classes based on the inter-class\nanalogy of feature distributions for different new classes, serving as a\nsubstitute for the memory buffer. Secondly, the Intra-Class Significance\nAnalysis (ISAY) module analyzes the significance of attributes for each class\nvia its distribution standard deviation, and generates the importance vector as\na correction bias for the linear classifier, thereby enhancing the capability\nof learning from new samples. We run our experiments on four popular image\nclassification datasets: CoRe50, CIFAR-10, CIFAR-100, and CUB-200, and our approach\noutperforms the prior state-of-the-art by a large margin.\n","authors":["Songlin Dong","Yingjie Chen","Yuhang He","Yuhan Jin","Alex C. Kot","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13573v1","updated":"2024-04-21T08:27:20Z","published":"2024-04-21T08:27:20Z","title":"Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text\n Consistency and Domain Distribution Gap","summary":" The recent advancements in Text-to-Video Artificial Intelligence Generated\nContent (AIGC) have been remarkable. Compared with traditional videos, the\nassessment of AIGC videos encounters various challenges: visual inconsistencies\nthat defy common sense, discrepancies between content and the textual prompt,\nand distribution gap between various generative models, etc. Targeting these\nchallenges, in this work, we categorize the assessment of AIGC video quality\ninto three dimensions: visual harmony, video-text consistency, and domain\ndistribution gap. For each dimension, we design specific modules to provide a\ncomprehensive quality assessment of AIGC videos. Furthermore, our research\nidentifies significant variations in visual quality, fluidity, and style among\nvideos generated by different text-to-video models. Predicting the source\ngenerative model can make the AIGC video features more discriminative, which\nenhances the quality assessment performance. The proposed method was used in\nthe third-place winner of the NTIRE 2024 Quality Assessment for AI-Generated\nContent - Track 2 Video, demonstrating its effectiveness.\n","authors":["Bowen Qu","Xiaoyu Liang","Shangkun Sun","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2404.13573v1.pdf","comment":"9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd\n place of NTIRE2024 Quality Assessment for AI-Generated Content - Track 2\n Video)"},{"id":"http://arxiv.org/abs/2404.15190v1","updated":"2024-04-21T08:10:20Z","published":"2024-04-21T08:10:20Z","title":"Socratic Planner: Inquiry-Based Zero-Shot Planning for Embodied\n Instruction Following","summary":" Embodied Instruction Following (EIF) is the task of executing natural\nlanguage instructions by navigating and interacting with objects in 3D\nenvironments. One of the primary challenges in EIF is compositional task\nplanning, which is often addressed with supervised or in-context learning with\nlabeled data. To this end, we introduce the Socratic Planner, the first\nzero-shot planning method that infers without the need for any training data.\nSocratic Planner first decomposes the instructions into substructural\ninformation of the task through self-questioning and answering, translating it\ninto a high-level plan, i.e., a sequence of subgoals. 
Subgoals are executed\nsequentially, with our visually grounded re-planning mechanism adjusting plans\ndynamically through dense visual feedback. We also introduce an evaluation\nmetric of high-level plans, RelaxedHLP, for a more comprehensive evaluation.\nExperiments demonstrate the effectiveness of the Socratic Planner, achieving\ncompetitive performance on both zero-shot and few-shot task planning in the\nALFRED benchmark, particularly excelling in tasks requiring higher-dimensional\ninference. Additionally, precise adjustments to the plan were achieved by\nincorporating environmental visual information.\n","authors":["Suyeon Shin","Sujin jeon","Junghyun Kim","Gi-Cheon Kang","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15190v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.13565v1","updated":"2024-04-21T07:34:44Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13564v1","updated":"2024-04-21T07:26:09Z","published":"2024-04-21T07:26:09Z","title":"Masked Latent Transformer with the Random Masking Ratio to Advance the\n Diagnosis of Dental Fluorosis","summary":" Dental fluorosis is a chronic disease caused by long-term overconsumption of\nfluoride, which leads to changes in the appearance of tooth enamel. It is an\nimportant basis for early non-invasive diagnosis of endemic fluorosis. However,\neven dental professionals may not be able to accurately distinguish dental\nfluorosis and its severity based on tooth images. Currently, there is still a\ngap in research on applying deep learning to diagnosing dental fluorosis.\nTherefore, we construct the first open-source dental fluorosis image dataset\n(DFID), laying the foundation for deep learning research in this field. To\nadvance the diagnosis of dental fluorosis, we propose a pioneering deep\nlearning model called masked latent transformer with the random masking ratio\n(MLTrMR). MLTrMR introduces a mask latent modeling scheme based on Vision\nTransformer to enhance contextual learning of dental fluorosis lesion\ncharacteristics. 
Consisting of a latent embedder, encoder, and decoder, MLTrMR\nemploys the latent embedder to extract latent tokens from the original image,\nwhereas the encoder and decoder comprising the latent transformer (LT) block\nare used to process unmasked tokens and predict masked tokens, respectively. To\nmitigate the lack of inductive bias in Vision Transformer, which may result in\nperformance degradation, the LT block introduces latent tokens to enhance the\nlearning capacity of latent lesion features. Furthermore, we design an\nauxiliary loss function to constrain the parameter update direction of the\nmodel. MLTrMR achieves 80.19% accuracy, 75.79% F1, and 81.28% quadratic\nweighted kappa on DFID, making it state-of-the-art (SOTA).\n","authors":["Yun Wu","Hao Xu","Maohua Gu","Zhongchuan Jiang","Jun Xu","Youliang Tian"],"pdf_url":"https://arxiv.org/pdf/2404.13564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05175v2","updated":"2024-04-21T07:15:01Z","published":"2023-06-08T13:14:35Z","title":"Large-scale Dataset Pruning with Dynamic Uncertainty","summary":" The state of the art of many learning tasks, e.g., image classification, is\nadvanced by collecting larger datasets and then training larger models on them.\nAs the outcome, the increasing computational cost is becoming unaffordable. In\nthis paper, we investigate how to prune the large-scale datasets, and thus\nproduce an informative subset for training sophisticated deep models with\nnegligible performance drop. We propose a simple yet effective dataset pruning\nmethod by exploring both the prediction uncertainty and training dynamics. We\nstudy dataset pruning by measuring the variation of predictions during the\nwhole training process on large-scale datasets, i.e., ImageNet-1K and\nImageNet-21K, and advanced models, i.e., Swin Transformer and ConvNeXt.\nExtensive experimental results indicate that our method outperforms the state\nof the art and achieves 25% lossless pruning ratio on both ImageNet-1K and\nImageNet-21K. The code and pruned datasets are available at\nhttps://github.com/BAAI-DCAI/Dataset-Pruning.\n","authors":["Muyang He","Shuo Yang","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.05175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09069v2","updated":"2024-04-21T07:08:15Z","published":"2023-12-14T16:04:34Z","title":"PI3D: Efficient Text-to-3D Generation with Pseudo-Image Diffusion","summary":" Diffusion models trained on large-scale text-image datasets have demonstrated\na strong capability of controllable high-quality image generation from\narbitrary text prompts. However, the generation quality and generalization\nability of 3D diffusion models is hindered by the scarcity of high-quality and\nlarge-scale 3D datasets. In this paper, we present PI3D, a framework that fully\nleverages the pre-trained text-to-image diffusion models' ability to generate\nhigh-quality 3D shapes from text prompts in minutes. The core idea is to\nconnect the 2D and 3D domains by representing a 3D shape as a set of Pseudo RGB\nImages. We fine-tune an existing text-to-image diffusion model to produce such\npseudo-images using a small number of text-3D pairs. Surprisingly, we find that\nit can already generate meaningful and consistent 3D shapes given complex text\ndescriptions. We further take the generated shapes as the starting point for a\nlightweight iterative refinement using score distillation sampling to achieve\nhigh-quality generation under a low budget. 
PI3D generates a single 3D shape\nfrom text in only 3 minutes and the quality is validated to outperform existing\n3D generative models by a large margin.\n","authors":["Ying-Tian Liu","Yuan-Chen Guo","Guan Luo","Heyi Sun","Wei Yin","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.09069v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13555v1","updated":"2024-04-21T07:03:48Z","published":"2024-04-21T07:03:48Z","title":"Cell Phone Image-Based Persian Rice Detection and Classification Using\n Deep Learning Techniques","summary":" This study introduces an innovative approach to classifying various types of\nPersian rice using image-based deep learning techniques, highlighting the\npractical application of everyday technology in food categorization.\nRecognizing the diversity of Persian rice and its culinary significance, we\nleveraged the capabilities of convolutional neural networks (CNNs),\nspecifically by fine-tuning a ResNet model for accurate identification of\ndifferent rice varieties and employing a U-Net architecture for precise\nsegmentation of rice grains in bulk images. This dual-methodology framework\nallows for both individual grain classification and comprehensive analysis of\nbulk rice samples, addressing two crucial aspects of rice quality assessment.\nUtilizing images captured with consumer-grade cell phones reflects a realistic\nscenario in which individuals can leverage this technology for assistance with\ngrocery shopping and meal preparation. The dataset, comprising various rice\ntypes photographed under natural conditions without professional lighting or\nequipment, presents a challenging yet practical classification problem. Our\nfindings demonstrate the feasibility of using non-professional images for food\nclassification and the potential of deep learning models, like ResNet and\nU-Net, to adapt to the nuances of everyday objects and textures. This study\ncontributes to the field by providing insights into the applicability of\nimage-based deep learning in daily life, specifically for enhancing consumer\nexperiences and knowledge in food selection. Furthermore, it opens avenues for\nextending this approach to other food categories and practical applications,\nemphasizing the role of accessible technology in bridging the gap between\nsophisticated computational methods and everyday tasks.\n","authors":["Mahmood Saeedi kelishami","Amin Saeidi Kelishami","Sajjad Saeedi Kelishami"],"pdf_url":"https://arxiv.org/pdf/2404.13555v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.04727v2","updated":"2024-04-21T06:53:31Z","published":"2024-01-09T18:58:40Z","title":"Revisiting Adversarial Training at Scale","summary":" The machine learning community has witnessed a drastic change in the training\npipeline, pivoted by those ''foundation models'' with unprecedented scales.\nHowever, the field of adversarial training is lagging behind, predominantly\ncentered around small model sizes like ResNet-50, and tiny and low-resolution\ndatasets like CIFAR-10. To bridge this transformation gap, this paper provides\na modern re-examination with adversarial training, investigating its potential\nbenefits when applied at scale. Additionally, we introduce an efficient and\neffective training strategy to enable adversarial training with giant models\nand web-scale data at an affordable computing cost. 
We denote this newly\nintroduced framework as AdvXL.\n Empirical results demonstrate that AdvXL establishes new state-of-the-art\nrobust accuracy records under AutoAttack on ImageNet-1K. For example, by\ntraining on DataComp-1B dataset, our AdvXL empowers a vanilla ViT-g model to\nsubstantially surpass the previous records of $l_{\\infty}$-, $l_{2}$-, and\n$l_{1}$-robust accuracy by margins of 11.4%, 14.2% and 12.9%, respectively.\nThis achievement posits AdvXL as a pioneering approach, charting a new\ntrajectory for the efficient training of robust visual representations at\nsignificantly larger scales. Our code is available at\nhttps://github.com/UCSC-VLAA/AdvXL.\n","authors":["Zeyu Wang","Xianhang Li","Hongru Zhu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2401.04727v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11947v2","updated":"2024-04-21T06:36:08Z","published":"2024-04-18T06:59:40Z","title":"VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled\n Examples in Semi-supervised Learning","summary":" Despite the progress of Semi-supervised Learning (SSL), existing methods fail\nto utilize unlabeled data effectively and efficiently. Many pseudo-label-based\nmethods select unlabeled examples based on inaccurate confidence scores from\nthe classifier. Most prior work also uses all available unlabeled data without\npruning, making it difficult to handle large amounts of unlabeled data. To\naddress these issues, we propose two methods: Variational Confidence\nCalibration (VCC) and Influence-Function-based Unlabeled Sample Elimination\n(INFUSE). VCC is an universal plugin for SSL confidence calibration, using a\nvariational autoencoder to select more accurate pseudo labels based on three\ntypes of consistency scores. INFUSE is a data pruning method that constructs a\ncore dataset of unlabeled examples under SSL. Our methods are effective in\nmultiple datasets and settings, reducing classification errors rates and saving\ntraining time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the\nCIFAR-100 dataset by 1.08% while saving nearly half of the training time.\n","authors":["Shijie Fang","Qianhan Feng","Tong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.11947v2.pdf","comment":"Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng\n contributed equally to this paper. New version, some problems and typos are\n fixed"},{"id":"http://arxiv.org/abs/2404.13550v1","updated":"2024-04-21T06:31:29Z","published":"2024-04-21T06:31:29Z","title":"Pointsoup: High-Performance and Extremely Low-Decoding-Latency Learned\n Geometry Codec for Large-Scale Point Cloud Scenes","summary":" Despite considerable progress being achieved in point cloud geometry\ncompression, there still remains a challenge in effectively compressing\nlarge-scale scenes with sparse surfaces. Another key challenge lies in reducing\ndecoding latency, a crucial requirement in real-world application. In this\npaper, we propose Pointsoup, an efficient learning-based geometry codec that\nattains high-performance and extremely low-decoding-latency simultaneously.\nInspired by conventional Trisoup codec, a point model-based strategy is devised\nto characterize local surfaces. Specifically, skin features are embedded from\nlocal windows via an attention-based encoder, and dilated windows are\nintroduced as cross-scale priors to infer the distribution of quantized\nfeatures in parallel. 
During decoding, features undergo fast refinement,\nfollowed by a folding-based point generator that reconstructs point coordinates\nwith fairly fast speed. Experiments show that Pointsoup achieves\nstate-of-the-art performance on multiple benchmarks with significantly lower\ndecoding complexity, i.e., up to 90$\\sim$160$\\times$ faster than the G-PCCv23\nTrisoup decoder on a comparatively low-end platform (e.g., one RTX 2080Ti).\nFurthermore, it offers variable-rate control with a single neural model\n(2.9MB), which is attractive for industrial practitioners.\n","authors":["Kang You","Kai Liu","Li Yu","Pan Gao","Dandan Ding"],"pdf_url":"https://arxiv.org/pdf/2404.13550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13541v1","updated":"2024-04-21T05:39:44Z","published":"2024-04-21T05:39:44Z","title":"Generalizable Novel-View Synthesis using a Stereo Camera","summary":" In this paper, we propose the first generalizable view synthesis approach\nthat specifically targets multi-view stereo-camera images. Since recent stereo\nmatching has demonstrated accurate geometry prediction, we introduce stereo\nmatching into novel-view synthesis for high-quality geometry reconstruction. To\nthis end, this paper proposes a novel framework, dubbed StereoNeRF, which\nintegrates stereo matching into a NeRF-based generalizable view synthesis\napproach. StereoNeRF is equipped with three key components to effectively\nexploit stereo matching in novel-view synthesis: a stereo feature extractor, a\ndepth-guided plane-sweeping, and a stereo depth loss. Moreover, we propose the\nStereoNVS dataset, the first multi-view dataset of stereo-camera images,\nencompassing a wide variety of both real and synthetic scenes. Our experimental\nresults demonstrate that StereoNeRF surpasses previous approaches in\ngeneralizable view synthesis.\n","authors":["Haechan Lee","Wonjoon Jin","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.13541v1.pdf","comment":"Accepted to CVPR 2024. Project page URL:\n https://jinwonjoon.github.io/stereonerf/"},{"id":"http://arxiv.org/abs/2404.13537v1","updated":"2024-04-21T05:11:37Z","published":"2024-04-21T05:11:37Z","title":"Bracketing Image Restoration and Enhancement with High-Low Frequency\n Decomposition","summary":" In real-world scenarios, due to a series of image degradations, obtaining\nhigh-quality, clear content photos is challenging. While significant progress\nhas been made in synthesizing high-quality images, previous methods for image\nrestoration and enhancement often overlooked the characteristics of different\ndegradations. They applied the same structure to address various types of\ndegradation, resulting in less-than-ideal restoration outcomes. Inspired by the\nnotion that high/low frequency information is applicable to different\ndegradations, we introduce HLNet, a Bracketing Image Restoration and\nEnhancement method based on high-low frequency decomposition. Specifically, we\nemploy two modules for feature extraction: shared weight modules and non-shared\nweight modules. In the shared weight modules, we use SCConv to extract common\nfeatures from different degradations. In the non-shared weight modules, we\nintroduce the High-Low Frequency Decomposition Block (HLFDB), which employs\ndifferent methods to handle high-low frequency information, enabling the model\nto address different degradations more effectively. 
Compared to other networks,\nour method takes into account the characteristics of different degradations,\nthus achieving higher-quality image restoration.\n","authors":["Genggeng Chen","Kexin Dai","Kangzhen Yang","Tao Hu","Xiangyu Chen","Yongqing Yang","Wei Dong","Peng Wu","Yanning Zhang","Qingsen Yan"],"pdf_url":"https://arxiv.org/pdf/2404.13537v1.pdf","comment":"This paper is accepted by CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.13534v1","updated":"2024-04-21T05:09:56Z","published":"2024-04-21T05:09:56Z","title":"Motion-aware Latent Diffusion Models for Video Frame Interpolation","summary":" With the advancement of AIGC, video frame interpolation (VFI) has become a\ncrucial component in existing video generation frameworks, attracting\nwidespread research interest. For the VFI task, the motion estimation between\nneighboring frames plays a crucial role in avoiding motion ambiguity. However,\nexisting VFI methods always struggle to accurately predict the motion\ninformation between consecutive frames, and this imprecise estimation leads to\nblurred and visually incoherent interpolated frames. In this paper, we propose\na novel diffusion framework, motion-aware latent diffusion models (MADiff),\nwhich is specifically designed for the VFI task. By incorporating motion priors\nbetween the conditional neighboring frames with the target interpolated frame\npredicted throughout the diffusion sampling procedure, MADiff progressively\nrefines the intermediate outcomes, culminating in generating both visually\nsmooth and realistic results. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method achieves state-of-the-art performance\nsignificantly outperforming existing approaches, especially under challenging\nscenarios involving dynamic textures with complex motion.\n","authors":["Zhilin Huang","Yijie Yu","Ling Yang","Chujun Qin","Bing Zheng","Xiawu Zheng","Zikun Zhou","Yaowei Wang","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13534v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.09508 by\n other authors"},{"id":"http://arxiv.org/abs/2404.13530v1","updated":"2024-04-21T04:55:13Z","published":"2024-04-21T04:55:13Z","title":"Listen Then See: Video Alignment with Speaker Attention","summary":" Video-based Question Answering (Video QA) is a challenging task and becomes\neven more intricate when addressing Socially Intelligent Question Answering\n(SIQA). SIQA requires context understanding, temporal reasoning, and the\nintegration of multimodal information, but in addition, it requires processing\nnuanced human behavior. Furthermore, the complexities involved are exacerbated\nby the dominance of the primary modality (text) over the others. Thus, there is\na need to help the task's secondary modalities to work in tandem with the\nprimary modality. In this work, we introduce a cross-modal alignment and\nsubsequent representation fusion approach that achieves state-of-the-art\nresults (82.06\\% accuracy) on the Social IQ 2.0 dataset for SIQA. Our approach\nexhibits an improved ability to leverage the video modality by using the audio\nmodality as a bridge with the language modality. 
This leads to enhanced\nperformance by reducing the prevalent issue of language overfitting and\nresultant video modality bypassing encountered by current existing techniques.\nOur code and models are publicly available at\nhttps://github.com/sts-vlcc/sts-vlcc\n","authors":["Aviral Agrawal","Carlos Mateo Samudio Lezcano","Iqui Balam Heredia-Marin","Prabhdeep Singh Sethi"],"pdf_url":"https://arxiv.org/pdf/2404.13530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13521v1","updated":"2024-04-21T04:06:09Z","published":"2024-04-21T04:06:09Z","title":"Graph4GUI: Graph Neural Networks for Representing Graphical User\n Interfaces","summary":" Present-day graphical user interfaces (GUIs) exhibit diverse arrangements of\ntext, graphics, and interactive elements such as buttons and menus, but\nrepresentations of GUIs have not kept up. They do not encapsulate both semantic\nand visuo-spatial relationships among elements. To seize machine learning's\npotential for GUIs more efficiently, Graph4GUI exploits graph neural networks\nto capture individual elements' properties and their semantic-visuo-spatial\nconstraints in a layout. The learned representation demonstrated its\neffectiveness in multiple tasks, especially generating designs in a challenging\nGUI autocompletion task, which involved predicting the positions of remaining\nunplaced elements in a partially completed GUI. The new model's suggestions\nshowed alignment and visual appeal superior to the baseline method and received\nhigher subjective ratings for preference. Furthermore, we demonstrate the\npractical benefits and efficiency advantages designers perceive when utilizing\nour model as an autocompletion plug-in.\n","authors":["Yue Jiang","Changkong Zhou","Vikas Garg","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.13521v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2309.04891v2","updated":"2024-04-21T03:42:47Z","published":"2023-09-09T23:03:50Z","title":"How to Evaluate Semantic Communications for Images with ViTScore Metric?","summary":" Semantic communications (SC) have been expected to be a new paradigm shifting\nto catalyze the next generation communication, whose main concerns shift from\naccurate bit transmission to effective semantic information exchange in\ncommunications. However, the previous and widely-used metrics for images are\nnot applicable to evaluate the image semantic similarity in SC. Classical\nmetrics to measure the similarity between two images usually rely on the pixel\nlevel or the structural level, such as the PSNR and the MS-SSIM.\nStraightforwardly using some tailored metrics based on deep-learning methods in\nCV community, such as the LPIPS, is infeasible for SC. To tackle this, inspired\nby BERTScore in NLP community, we propose a novel metric for evaluating image\nsemantic similarity, named Vision Transformer Score (ViTScore). We prove\ntheoretically that ViTScore has 3 important properties, including symmetry,\nboundedness, and normalization, which make ViTScore convenient and intuitive\nfor image measurement. To evaluate the performance of ViTScore, we compare\nViTScore with 3 typical metrics (PSNR, MS-SSIM, and LPIPS) through 4 classes of\nexperiments: (i) correlation with BERTScore through evaluation of image caption\ndownstream CV task, (ii) evaluation in classical image communications, (iii)\nevaluation in image semantic communication systems, and (iv) evaluation in\nimage semantic communication systems with semantic attack. 
Experimental results\ndemonstrate that ViTScore is robust and efficient in evaluating the semantic\nsimilarity of images. Particularly, ViTScore outperforms the other 3 typical\nmetrics in evaluating the image semantic changes by semantic attack, such as\nimage inverse with Generative Adversarial Networks (GANs). This indicates that\nViTScore is an effective performance metric when deployed in SC scenarios.\n","authors":["Tingting Zhu","Bo Peng","Jifan Liang","Tingchen Han","Hai Wan","Jingqiao Fu","Junjie Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03641v2","updated":"2024-04-21T03:39:55Z","published":"2023-09-07T11:21:10Z","title":"Spiking Structured State Space Model for Monaural Speech Enhancement","summary":" Speech enhancement seeks to extract clean speech from noisy signals.\nTraditional deep learning methods face two challenges: efficiently using\ninformation in long speech sequences and high computational costs. To address\nthese, we introduce the Spiking Structured State Space Model (Spiking-S4). This\napproach merges the energy efficiency of Spiking Neural Networks (SNN) with the\nlong-range sequence modeling capabilities of Structured State Space Models\n(S4), offering a compelling solution. Evaluation on the DNS Challenge and\nVoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial\nNeural Network (ANN) methods but with fewer computational resources, as\nevidenced by reduced parameters and Floating Point Operations (FLOPs).\n","authors":["Yu Du","Xu Liu","Yansong Chua"],"pdf_url":"https://arxiv.org/pdf/2309.03641v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11395v3","updated":"2024-04-21T03:26:27Z","published":"2024-01-21T04:13:58Z","title":"UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with\n Fine-Grained Feature Representation","summary":" 3D open-vocabulary scene understanding aims to recognize arbitrary novel\ncategories beyond the base label space. However, existing works not only fail\nto fully utilize all the available modal information in the 3D domain but also\nlack sufficient granularity in representing the features of each modality. In\nthis paper, we propose a unified multimodal 3D open-vocabulary scene\nunderstanding network, namely UniM-OV3D, which aligns point clouds with image,\nlanguage and depth. To better integrate global and local features of the point\nclouds, we design a hierarchical point cloud feature extraction module that\nlearns comprehensive fine-grained feature representations. Further, to\nfacilitate the learning of coarse-to-fine point-semantic representations from\ncaptions, we propose the utilization of hierarchical 3D caption pairs,\ncapitalizing on geometric constraints across various viewpoints of 3D scenes.\nExtensive experimental results demonstrate the effectiveness and superiority of\nour method in open-vocabulary semantic and instance segmentation, which\nachieves state-of-the-art performance on both indoor and outdoor benchmarks\nsuch as ScanNet, ScanNet200, S3IDS and nuScenes. 
Code is available at\nhttps://github.com/hithqd/UniM-OV3D.\n","authors":["Qingdong He","Jinlong Peng","Zhengkai Jiang","Kai Wu","Xiaozhong Ji","Jiangning Zhang","Yabiao Wang","Chengjie Wang","Mingang Chen","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2401.11395v3.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.10163v2","updated":"2024-04-21T03:17:23Z","published":"2024-04-15T22:26:27Z","title":"EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided\n Reinforcement Learning","summary":" From a visual perception perspective, modern graphical user interfaces (GUIs)\ncomprise a complex graphics-rich two-dimensional visuospatial arrangement of\ntext, images, and interactive objects such as buttons and menus. While existing\nmodels can accurately predict regions and objects that are likely to attract\nattention ``on average'', so far there is no scanpath model capable of\npredicting scanpaths for an individual. To close this gap, we introduce\nEyeFormer, which leverages a Transformer architecture as a policy network to\nguide a deep reinforcement learning algorithm that controls gaze locations. Our\nmodel has the unique capability of producing personalized predictions when\ngiven a few user scanpath samples. It can predict full scanpath information,\nincluding fixation positions and duration, across individuals and various\nstimulus types. Additionally, we demonstrate applications in GUI layout\noptimization driven by our model. Our software and models will be publicly\navailable.\n","authors":["Yue Jiang","Zixin Guo","Hamed Rezazadegan Tavakoli","Luis A. Leiva","Antti Oulasvirta"],"pdf_url":"https://arxiv.org/pdf/2404.10163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09457v3","updated":"2024-04-21T03:06:09Z","published":"2023-10-14T00:32:11Z","title":"UCM-Net: A Lightweight and Efficient Solution for Skin Lesion\n Segmentation using MLP and CNN","summary":" Skin cancer poses a significant public health challenge, necessitating\nefficient diagnostic tools. We introduce UCM-Net, a novel skin lesion\nsegmentation model combining Multi-Layer Perceptrons (MLP) and Convolutional\nNeural Networks (CNN). This lightweight, efficient architecture, deviating from\ntraditional UNet designs, dramatically reduces computational demands, making it\nideal for mobile health applications. Evaluated on PH2, ISIC 2017, and ISIC\n2018 datasets, UCM-Net demonstrates robust performance with fewer than 50KB\nparameters and requires less than 0.05 Giga Operations Per Second (GLOPs).\nMoreover, its minimal memory requirement is just 1.19MB in CPU environment\npositions. It is a potential benchmark for efficiency in skin lesion\nsegmentation, suitable for deployment in resource-constrained settings. In\norder to facilitate accessibility and further research in the field, the\nUCM-Net source code is https://github.com/chunyuyuan/UCM-Net.\n","authors":["Chunyu Yuan","Dongfang Zhao","Sos S. Agaian"],"pdf_url":"https://arxiv.org/pdf/2310.09457v3.pdf","comment":"17 pages, under review"},{"id":"http://arxiv.org/abs/2403.05146v2","updated":"2024-04-21T02:44:55Z","published":"2024-03-08T08:31:46Z","title":"Motion-Guided Dual-Camera Tracker for Low-Cost Skill Evaluation of\n Gastric Endoscopy","summary":" Gastric simulators with objective educational feedback have been proven\nuseful for endoscopy training. Existing electronic simulators with feedback are\nhowever not commonly adopted due to their high cost. 
In this work, a\nmotion-guided dual-camera tracker is proposed to provide reliable endoscope tip\nposition feedback at a low cost inside a mechanical simulator for endoscopy\nskill evaluation, tackling several unique challenges. To address the issue of\nsignificant appearance variation of the endoscope tip while keeping dual-camera\ntracking consistency, the cross-camera mutual template strategy (CMT) is\nproposed to introduce dynamic transient mutual templates to dual-camera\ntracking. To alleviate disturbance from large occlusion and distortion by the\nlight source from the endoscope tip, the Mamba-based motion-guided prediction\nhead (MMH) is presented to aggregate historical motion with visual tracking. It\nis the first application of Mamba for object tracking. The proposed tracker was\nevaluated on datasets captured by low-cost camera pairs during endoscopy\nprocedures performed inside the mechanical simulator. The tracker achieves SOTA\nperformance with robust and consistent tracking on dual cameras. Further\ndownstream evaluation proves that the 3D tip position determined by the\nproposed tracker enables reliable skill differentiation. The code and dataset\nare available at https://github.com/PieceZhang/MotionDCTrack\n","authors":["Yuelin Zhang","Wanquan Yan","Kim Yan","Chun Ping Lam","Yufu Qiu","Pengyu Zheng","Raymond Shing-Yan Tang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.05146v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13505v1","updated":"2024-04-21T02:21:30Z","published":"2024-04-21T02:21:30Z","title":"Dynamic in Static: Hybrid Visual Correspondence for Self-Supervised\n Video Object Segmentation","summary":" Conventional video object segmentation (VOS) methods usually necessitate a\nsubstantial volume of pixel-level annotated video data for fully supervised\nlearning. In this paper, we present HVC, a \\textbf{h}ybrid static-dynamic\n\\textbf{v}isual \\textbf{c}orrespondence framework for self-supervised VOS. HVC\nextracts pseudo-dynamic signals from static images, enabling an efficient and\nscalable VOS model. Our approach utilizes a minimalist fully-convolutional\narchitecture to capture static-dynamic visual correspondence in image-cropped\nviews. To achieve this objective, we present a unified self-supervised approach\nto learn visual representations of static-dynamic feature similarity. Firstly,\nwe establish static correspondence by utilizing a priori coordinate information\nbetween cropped views to guide the formation of consistent static feature\nrepresentations. Subsequently, we devise a concise convolutional layer to\ncapture the forward / backward pseudo-dynamic signals between two views,\nserving as cues for dynamic representations. Finally, we propose a hybrid\nvisual correspondence loss to learn joint static and dynamic consistency\nrepresentations. Our approach, without bells and whistles, necessitates only\none training session using static image data, significantly reducing memory\nconsumption ($\\sim$16GB) and training time ($\\sim$\\textbf{2h}). 
Moreover, HVC\nachieves state-of-the-art performance in several self-supervised VOS benchmarks\nand additional video label propagation tasks.\n","authors":["Gensheng Pei","Yazhou Yao","Jianbo Jiao","Wenguan Wang","Liqiang Nie","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2404.13505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03095v2","updated":"2024-04-21T00:39:30Z","published":"2022-11-30T15:55:40Z","title":"Interpretation of Neural Networks is Susceptible to Universal\n Adversarial Perturbations","summary":" Interpreting neural network classifiers using gradient-based saliency maps\nhas been extensively studied in the deep learning literature. While the\nexisting algorithms manage to achieve satisfactory performance in application\nto standard image recognition datasets, recent works demonstrate the\nvulnerability of widely-used gradient-based interpretation schemes to\nnorm-bounded perturbations adversarially designed for every individual input\nsample. However, such adversarial perturbations are commonly designed using the\nknowledge of an input sample, and hence perform sub-optimally in application to\nan unknown or constantly changing data point. In this paper, we show the\nexistence of a Universal Perturbation for Interpretation (UPI) for standard\nimage datasets, which can alter a gradient-based feature map of neural networks\nover a significant fraction of test samples. To design such a UPI, we propose a\ngradient-based optimization method as well as a principal component analysis\n(PCA)-based approach to compute a UPI which can effectively alter a neural\nnetwork's gradient-based interpretation on different samples. We support the\nproposed UPI approaches by presenting several numerical results of their\nsuccessful applications to standard image datasets.\n","authors":["Haniyeh Ehsani Oskouie","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2212.03095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13493v1","updated":"2024-04-21T00:14:03Z","published":"2024-04-21T00:14:03Z","title":"Authentic Emotion Mapping: Benchmarking Facial Expressions in Real News","summary":" In this paper, we present a novel benchmark for Emotion Recognition using\nfacial landmarks extracted from realistic news videos. Traditional methods\nrelying on RGB images are resource-intensive, whereas our approach with Facial\nLandmark Emotion Recognition (FLER) offers a simplified yet effective\nalternative. By leveraging Graph Neural Networks (GNNs) to analyze the\ngeometric and spatial relationships of facial landmarks, our method enhances\nthe understanding and accuracy of emotion recognition. We discuss the\nadvancements and challenges in deep learning techniques for emotion\nrecognition, particularly focusing on Graph Neural Networks (GNNs) and\nTransformers. Our experimental results demonstrate the viability and potential\nof our dataset as a benchmark, setting a new direction for future research in\nemotion recognition technologies. 
The codes and models are at:\nhttps://github.com/wangzhifengharrison/benchmark_real_news\n","authors":["Qixuan Zhang","Zhifeng Wang","Yang Liu","Zhenyue Qin","Kaihao Zhang","Sabrina Caldwell","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2404.13493v1.pdf","comment":null}]},"2024-04-20T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2205.06265v3","updated":"2024-04-20T23:40:53Z","published":"2022-05-12T17:59:56Z","title":"ELODI: Ensemble Logit Difference Inhibition for Positive-Congruent\n Training","summary":" Negative flips are errors introduced in a classification system when a legacy\nmodel is updated. Existing methods to reduce the negative flip rate (NFR)\neither do so at the expense of overall accuracy by forcing a new model to\nimitate the old models, or use ensembles, which multiply inference cost\nprohibitively. We analyze the role of ensembles in reducing NFR and observe\nthat they remove negative flips that are typically not close to the decision\nboundary, but often exhibit large deviations in the distance among their\nlogits. Based on the observation, we present a method, called Ensemble Logit\nDifference Inhibition (ELODI), to train a classification system that achieves\nparagon performance in both error rate and NFR, at the inference cost of a\nsingle model. The method distills a homogeneous ensemble to a single student\nmodel which is used to update the classification system. ELODI also introduces\na generalized distillation objective, Logit Difference Inhibition (LDI), which\nonly penalizes the logit difference of a subset of classes with the highest\nlogit values. On multiple image classification benchmarks, model updates with\nELODI demonstrate superior accuracy retention and NFR reduction.\n","authors":["Yue Zhao","Yantao Shen","Yuanjun Xiong","Shuo Yang","Wei Xia","Zhuowen Tu","Bernt Schiele","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2205.06265v3.pdf","comment":"Accepted as a Regular Paper in TPAMI. Code is at\n https://github.com/amazon-science/regression-constraint-model-upgrade"},{"id":"http://arxiv.org/abs/2403.16967v3","updated":"2024-04-20T23:22:02Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely Visual Whole-Body Control(VBC), is composed of a low-level\npolicy using all degrees of freedom to track the end-effector manipulator\nposition and a high-level policy proposing the end-effector position based on\nvisual inputs. We train both levels of policies in simulation and perform\nSim2Real transfer for real robot deployment. We perform extensive experiments\nand show significant improvements over baselines in picking up diverse objects\nin different configurations (heights, locations, orientations) and\nenvironments. Project page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Rizhao Qiu","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v3.pdf","comment":"Add more details. 
The first two authors contribute equally. Project\n page: https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2310.10352v3","updated":"2024-04-20T23:18:59Z","published":"2023-10-16T12:42:43Z","title":"Semi-Supervised Crowd Counting with Contextual Modeling: Facilitating\n Holistic Understanding of Crowd Scenes","summary":" To alleviate the heavy annotation burden for training a reliable crowd\ncounting model and thus make the model more practicable and accurate by being\nable to benefit from more data, this paper presents a new semi-supervised\nmethod based on the mean teacher framework. When there is a scarcity of labeled\ndata available, the model is prone to overfit local patches. Within such\ncontexts, the conventional approach of solely improving the accuracy of local\npatch predictions through unlabeled data proves inadequate. Consequently, we\npropose a more nuanced approach: fostering the model's intrinsic 'subitizing'\ncapability. This ability allows the model to accurately estimate the count in\nregions by leveraging its understanding of the crowd scenes, mirroring the\nhuman cognitive process. To achieve this goal, we apply masking on unlabeled\ndata, guiding the model to make predictions for these masked patches based on\nthe holistic cues. Furthermore, to help with feature learning, herein we\nincorporate a fine-grained density classification task. Our method is general\nand applicable to most existing crowd counting methods as it doesn't have\nstrict structural or loss constraints. In addition, we observe that the model\ntrained with our framework exhibits a 'subitizing'-like behavior. It accurately\npredicts low-density regions with only a 'glance', while incorporating local\ndetails to predict high-density regions. Our method achieves the\nstate-of-the-art performance, surpassing previous approaches by a large margin\non challenging benchmarks such as ShanghaiTech A and UCF-QNRF. The code is\navailable at: https://github.com/cha15yq/MRC-Crowd.\n","authors":["Yifei Qian","Xiaopeng Hong","Zhongliang Guo","Ognjen Arandjelović","Carl R. Donovan"],"pdf_url":"https://arxiv.org/pdf/2310.10352v3.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.13484v1","updated":"2024-04-20T23:02:57Z","published":"2024-04-20T23:02:57Z","title":"Joint Quality Assessment and Example-Guided Image Processing by\n Disentangling Picture Appearance from Content","summary":" The deep learning revolution has strongly impacted low-level image processing\ntasks such as style/domain transfer, enhancement/restoration, and visual\nquality assessments. Despite often being treated separately, the aforementioned\ntasks share a common theme of understanding, editing, or enhancing the\nappearance of input images without modifying the underlying content. We\nleverage this observation to develop a novel disentangled representation\nlearning method that decomposes inputs into content and appearance features.\nThe model is trained in a self-supervised manner and we use the learned\nfeatures to develop a new quality prediction model named DisQUE. We demonstrate\nthrough extensive evaluations that DisQUE achieves state-of-the-art accuracy\nacross quality prediction tasks and distortion types. Moreover, we demonstrate\nthat the same features may also be used for image processing tasks such as HDR\ntone mapping, where the desired output characteristics may be tuned using\nexample input-output pairs.\n","authors":["Abhinau K. 
Venkataramanan","Cosmin Stejerean","Ioannis Katsavounidis","Hassene Tmar","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2404.13484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13478v1","updated":"2024-04-20T22:16:56Z","published":"2024-04-20T22:16:56Z","title":"Deep SE(3)-Equivariant Geometric Reasoning for Precise Placement Tasks","summary":" Many robot manipulation tasks can be framed as geometric reasoning tasks,\nwhere an agent must be able to precisely manipulate an object into a position\nthat satisfies the task from a set of initial conditions. Often, task success\nis defined based on the relationship between two objects - for instance,\nhanging a mug on a rack. In such cases, the solution should be equivariant to\nthe initial position of the objects as well as the agent, and invariant to the\npose of the camera. This poses a challenge for learning systems which attempt\nto solve this task by learning directly from high-dimensional demonstrations:\nthe agent must learn to be both equivariant as well as precise, which can be\nchallenging without any inductive biases about the problem. In this work, we\npropose a method for precise relative pose prediction which is provably\nSE(3)-equivariant, can be learned from only a few demonstrations, and can\ngeneralize across variations in a class of objects. We accomplish this by\nfactoring the problem into learning an SE(3) invariant task-specific\nrepresentation of the scene and then interpreting this representation with\nnovel geometric reasoning layers which are provably SE(3) equivariant. We\ndemonstrate that our method can yield substantially more precise placement\npredictions in simulated placement tasks than previous methods trained with the\nsame amount of data, and can accurately represent relative placement\nrelationships data collected from real-world demonstrations. Supplementary\ninformation and videos can be found at\nhttps://sites.google.com/view/reldist-iclr-2023.\n","authors":["Ben Eisner","Yi Yang","Todor Davchev","Mel Vecerik","Jonathan Scholz","David Held"],"pdf_url":"https://arxiv.org/pdf/2404.13478v1.pdf","comment":"Published at International Conference on Representation Learning\n (ICLR 2024)"},{"id":"http://arxiv.org/abs/2404.06605v2","updated":"2024-04-20T22:10:37Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. 
Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.50cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v2.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2312.14494v2","updated":"2024-04-20T22:00:41Z","published":"2023-12-22T07:42:00Z","title":"Revisiting Few-Shot Object Detection with Vision-Language Models","summary":" Few-shot object detection (FSOD) benchmarks have advanced techniques for\ndetecting new categories with limited annotations. Existing benchmarks\nrepurpose well-established datasets like COCO by partitioning categories into\nbase and novel classes for pre-training and fine-tuning respectively. However,\nthese benchmarks do not reflect how FSOD is deployed in practice. Rather than\nonly pre-training on a small number of base categories, we argue that it is\nmore practical to fine-tune a foundation model (e.g., a vision-language model\n(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find\nthat zero-shot inference from VLMs like GroundingDINO significantly outperforms\nthe state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models\ncan still be misaligned to target concepts of interest. For example, trailers\non the web may be different from trailers in the context of autonomous\nvehicles. In this work, we propose Foundational FSOD, a new benchmark protocol\nthat evaluates detectors pre-trained on any external datasets and fine-tuned on\nK-shots per target class. Further, we note that current FSOD benchmarks are\nactually federated datasets containing exhaustive annotations for each category\non a subset of the data. We leverage this insight to propose simple strategies\nfor fine-tuning VLMs with federated losses. We demonstrate the effectiveness of\nour approach on LVIS and nuImages, improving over prior work by 5.9 AP. Our\ncode is available at https://github.com/anishmadan23/foundational_fsod\n","authors":["Anish Madan","Neehar Peri","Shu Kong","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.14494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13474v1","updated":"2024-04-20T21:51:15Z","published":"2024-04-20T21:51:15Z","title":"Composing Pre-Trained Object-Centric Representations for Robotics From\n \"What\" and \"Where\" Foundation Models","summary":" There have recently been large advances both in pre-training visual\nrepresentations for robotic control and segmenting unknown category objects in\ngeneral images. To leverage these for improved robot learning, we propose\n$\\textbf{POCR}$, a new framework for building pre-trained object-centric\nrepresentations for robotic control. Building on theories of \"what-where\"\nrepresentations in psychology and computer vision, we use segmentations from a\npre-trained model to stably locate across timesteps, various entities in the\nscene, capturing \"where\" information. To each such segmented entity, we apply\nother pre-trained models that build vector descriptions suitable for robotic\ncontrol tasks, thus capturing \"what\" the entity is. 
Thus, our pre-trained\nobject-centric representations for control are constructed by appropriately\ncombining the outputs of off-the-shelf pre-trained models, with no new\ntraining. On various simulated and real robotic tasks, we show that imitation\npolicies for robotic manipulators trained on POCR achieve better performance\nand systematic generalization than state of the art pre-trained representations\nfor robotics, as well as prior object-centric representations that are\ntypically trained from scratch.\n","authors":["Junyao Shi","Jianing Qian","Yecheng Jason Ma","Dinesh Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2404.13474v1.pdf","comment":"ICRA 2024. Project website: https://sites.google.com/view/pocr"},{"id":"http://arxiv.org/abs/2312.02914v4","updated":"2024-04-20T21:28:24Z","published":"2023-12-05T17:39:19Z","title":"Unsupervised Video Domain Adaptation with Masked Pre-Training and\n Collaborative Self-Training","summary":" In this work, we tackle the problem of unsupervised domain adaptation (UDA)\nfor video action recognition. Our approach, which we call UNITE, uses an image\nteacher model to adapt a video student model to the target domain. UNITE first\nemploys self-supervised pre-training to promote discriminative feature learning\non target domain videos using a teacher-guided masked distillation objective.\nWe then perform self-training on masked target data, using the video student\nmodel and image teacher model together to generate improved pseudolabels for\nunlabeled target videos. Our self-training process successfully leverages the\nstrengths of both models to achieve strong transfer performance across domains.\nWe evaluate our approach on multiple video domain adaptation benchmarks and\nobserve significant improvements upon previously reported results.\n","authors":["Arun Reddy","William Paul","Corban Rivera","Ketul Shah","Celso M. de Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2312.02914v4.pdf","comment":"Accepted at CVPR 2024. 13 pages, 4 figures. Approved for public\n release: distribution unlimited"},{"id":"http://arxiv.org/abs/2401.17484v3","updated":"2024-04-20T21:14:15Z","published":"2024-01-30T22:37:24Z","title":"Pixel to Elevation: Learning to Predict Elevation Maps at Long Range\n using Images for Autonomous Offroad Navigation","summary":" Understanding terrain topology at long-range is crucial for the success of\noff-road robotic missions, especially when navigating at high-speeds. LiDAR\nsensors, which are currently heavily relied upon for geometric mapping, provide\nsparse measurements when mapping at greater distances. To address this\nchallenge, we present a novel learning-based approach capable of predicting\nterrain elevation maps at long-range using only onboard egocentric images in\nreal-time. Our proposed method is comprised of three main elements. First, a\ntransformer-based encoder is introduced that learns cross-view associations\nbetween the egocentric views and prior bird-eye-view elevation map predictions.\nSecond, an orientation-aware positional encoding is proposed to incorporate the\n3D vehicle pose information over complex unstructured terrain with multi-view\nvisual image features. Lastly, a history-augmented learn-able map embedding is\nproposed to achieve better temporal consistency between elevation map\npredictions to facilitate the downstream navigational tasks. 
We experimentally\nvalidate the applicability of our proposed approach for autonomous offroad\nrobotic navigation in complex and unstructured terrain using real-world offroad\ndriving data. Furthermore, the method is qualitatively and quantitatively\ncompared against the current state-of-the-art methods. Extensive field\nexperiments demonstrate that our method surpasses baseline models in accurately\npredicting terrain elevation while effectively capturing the overall terrain\ntopology at long-ranges. Finally, ablation studies are conducted to highlight\nand understand the effect of key components of the proposed approach and\nvalidate their suitability to improve offroad robotic navigation capabilities.\n","authors":["Chanyoung Chung","Georgios Georgakis","Patrick Spieler","Curtis Padgett","Ali Agha","Shehryar Khattak"],"pdf_url":"https://arxiv.org/pdf/2401.17484v3.pdf","comment":"8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2403.15977v3","updated":"2024-04-20T20:19:11Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v3.pdf","comment":"Accepted version of the article, 18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.17241v2","updated":"2024-04-20T19:30:38Z","published":"2023-11-28T21:31:04Z","title":"End-to-End Temporal Action Detection with 1B Parameters Across 1000\n Frames","summary":" Recently, temporal action detection (TAD) has seen significant performance\nimprovement with end-to-end training. However, due to the memory bottleneck,\nonly models with limited scales and limited data volumes can afford end-to-end\ntraining, which inevitably restricts TAD performance. 
In this paper, we reduce\nthe memory consumption for end-to-end training, and manage to scale up the TAD\nbackbone to 1 billion parameters and the input video to 1,536 frames, leading\nto significant detection performance. The key to our approach lies in our\nproposed temporal-informative adapter (TIA), which is a novel lightweight\nmodule that reduces training memory. Using TIA, we free the humongous backbone\nfrom learning to adapt to the TAD task by only updating the parameters in TIA.\nTIA also leads to better TAD representation by temporally aggregating context\nfrom adjacent frames throughout the backbone. We evaluate our model across four\nrepresentative datasets. Owing to our efficient design, we are able to train\nend-to-end on VideoMAEv2-giant and achieve 75.4% mAP on THUMOS14, being the\nfirst end-to-end model to outperform the best feature-based methods. Code is\navailable at https://github.com/sming256/AdaTAD.\n","authors":["Shuming Liu","Chen-Lin Zhang","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2311.17241v2.pdf","comment":"Accepted to CVPR 2024. Camera-Ready Version"},{"id":"http://arxiv.org/abs/2404.13452v1","updated":"2024-04-20T19:29:51Z","published":"2024-04-20T19:29:51Z","title":"Cut-FUNQUE: An Objective Quality Model for Compressed Tone-Mapped High\n Dynamic Range Videos","summary":" High Dynamic Range (HDR) videos have enjoyed a surge in popularity in recent\nyears due to their ability to represent a wider range of contrast and color\nthan Standard Dynamic Range (SDR) videos. Although HDR video capture has seen\nincreasing popularity because of recent flagship mobile phones such as Apple\niPhones, Google Pixels, and Samsung Galaxy phones, a broad swath of consumers\nstill utilize legacy SDR displays that are unable to display HDR videos. As\nresult, HDR videos must be processed, i.e., tone-mapped, before streaming to a\nlarge section of SDR-capable video consumers. However, server-side tone-mapping\ninvolves automating decisions regarding the choices of tone-mapping operators\n(TMOs) and their parameters to yield high-fidelity outputs. Moreover, these\nchoices must be balanced against the effects of lossy compression, which is\nubiquitous in streaming scenarios. In this work, we develop a novel, efficient\nmodel of objective video quality named Cut-FUNQUE that is able to accurately\npredict the visual quality of tone-mapped and compressed HDR videos. Finally,\nwe evaluate Cut-FUNQUE on a large-scale crowdsourced database of such videos\nand show that it achieves state-of-the-art accuracy.\n","authors":["Abhinau K. Venkataramanan","Cosmin Stejerean","Ioannis Katsavounidis","Hassene Tmar","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2404.13452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13449v1","updated":"2024-04-20T19:17:40Z","published":"2024-04-20T19:17:40Z","title":"SiNC+: Adaptive Camera-Based Vitals with Unsupervised Learning of\n Periodic Signals","summary":" Subtle periodic signals, such as blood volume pulse and respiration, can be\nextracted from RGB video, enabling noncontact health monitoring at low cost.\nAdvancements in remote pulse estimation -- or remote photoplethysmography\n(rPPG) -- are currently driven by deep learning solutions. However, modern\napproaches are trained and evaluated on benchmark datasets with ground truth\nfrom contact-PPG sensors. We present the first non-contrastive unsupervised\nlearning framework for signal regression to mitigate the need for labelled\nvideo data. 
With minimal assumptions of periodicity and finite bandwidth, our\napproach discovers the blood volume pulse directly from unlabelled videos. We\nfind that encouraging sparse power spectra within normal physiological\nbandlimits and variance over batches of power spectra is sufficient for\nlearning visual features of periodic signals. We perform the first experiments\nutilizing unlabelled video data not specifically created for rPPG to train\nrobust pulse rate estimators. Given the limited inductive biases, we\nsuccessfully applied the same approach to camera-based respiration by changing\nthe bandlimits of the target signal. This shows that the approach is general\nenough for unsupervised learning of bandlimited quasi-periodic signals from\ndifferent domains. Furthermore, we show that the framework is effective for\nfinetuning models on unlabelled video from a single subject, allowing for\npersonalized and adaptive signal regressors.\n","authors":["Jeremy Speth","Nathan Vance","Patrick Flynn","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.13449v1.pdf","comment":"Extension of CVPR2023 highlight paper. arXiv admin note: substantial\n text overlap with arXiv:2303.07944"},{"id":"http://arxiv.org/abs/2404.13445v1","updated":"2024-04-20T18:52:51Z","published":"2024-04-20T18:52:51Z","title":"DMesh: A Differentiable Representation for General Meshes","summary":" We present a differentiable representation, DMesh, for general 3D triangular\nmeshes. DMesh considers both the geometry and connectivity information of a\nmesh. In our design, we first get a set of convex tetrahedra that compactly\ntessellates the domain based on Weighted Delaunay Triangulation (WDT), and\nformulate probability of faces to exist on our desired mesh in a differentiable\nmanner based on the WDT. This enables DMesh to represent meshes of various\ntopology in a differentiable way, and allows us to reconstruct the mesh under\nvarious observations, such as point cloud and multi-view images using\ngradient-based optimization. The source code and full paper is available at:\nhttps://sonsang.github.io/dmesh-project.\n","authors":["Sanghyun Son","Matheus Gadelha","Yang Zhou","Zexiang Xu","Ming C. Lin","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13445v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13443v1","updated":"2024-04-20T18:50:57Z","published":"2024-04-20T18:50:57Z","title":"FisheyeDetNet: Object Detection on Fisheye Surround View Camera Systems\n for Automated Driving","summary":" Object detection is a mature problem in autonomous driving with pedestrian\ndetection being one of the first deployed algorithms. It has been\ncomprehensively studied in the literature. However, object detection is\nrelatively less explored for fisheye cameras used for surround-view near field\nsensing. The standard bounding box representation fails in fisheye cameras due\nto heavy radial distortion, particularly in the periphery. To mitigate this, we\nexplore extending the standard object detection output representation of\nbounding box. We design rotated bounding boxes, ellipse, generic polygon as\npolar arc/angle representations and define an instance segmentation mIOU metric\nto analyze these representations. The proposed model FisheyeDetNet with polygon\noutperforms others and achieves a mAP score of 49.5 % on Valeo fisheye\nsurround-view dataset for automated driving applications. This dataset has 60K\nimages captured from 4 surround-view cameras across Europe, North America and\nAsia. 
To the best of our knowledge, this is the first detailed study on object\ndetection on fisheye cameras for autonomous driving scenarios.\n","authors":["Ganesh Sistu","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2404.13443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11156v2","updated":"2024-04-20T18:10:34Z","published":"2024-04-17T08:09:25Z","title":"Learning SO(3)-Invariant Semantic Correspondence via Local Shape\n Transform","summary":" Establishing accurate 3D correspondences between shapes stands as a pivotal\nchallenge with profound implications for computer vision and robotics. However,\nexisting self-supervised methods for this problem assume perfect input shape\nalignment, restricting their real-world applicability. In this work, we\nintroduce a novel self-supervised Rotation-Invariant 3D correspondence learner\nwith Local Shape Transform, dubbed RIST, that learns to establish dense\ncorrespondences between shapes even under challenging intra-class variations\nand arbitrary orientations. Specifically, RIST learns to dynamically formulate\nan SO(3)-invariant local shape transform for each point, which maps the\nSO(3)-equivariant global shape descriptor of the input shape to a local shape\ndescriptor. These local shape descriptors are provided as inputs to our decoder\nto facilitate point cloud self- and cross-reconstruction. Our proposed\nself-supervised training pipeline encourages semantically corresponding points\nfrom different shapes to be mapped to similar local shape descriptors, enabling\nRIST to establish dense point-wise correspondences. RIST demonstrates\nstate-of-the-art performances on 3D part label transfer and semantic keypoint\ntransfer given arbitrarily rotated point cloud pairs, outperforming existing\nmethods by significant margins.\n","authors":["Chunghyun Park","Seungwook Kim","Jaesik Park","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.11156v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13437v1","updated":"2024-04-20T18:06:26Z","published":"2024-04-20T18:06:26Z","title":"High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided\n Neural Surfaces","summary":" In surgical oncology, screening colonoscopy plays a pivotal role in providing\ndiagnostic assistance, such as biopsy, and facilitating surgical navigation,\nparticularly in polyp detection. Computer-assisted endoscopic surgery has\nrecently gained attention and amalgamated various 3D computer vision\ntechniques, including camera localization, depth estimation, surface\nreconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit\nSurfaces (NeuS) have emerged as promising methodologies for deriving accurate\n3D surface models from sets of registered images, addressing the limitations of\nexisting colon reconstruction approaches stemming from constrained camera\nmovement.\n However, the inadequate tissue texture representation and confused scale\nproblem in monocular colonoscopic image reconstruction still impede the\nprogress of the final rendering results. In this paper, we introduce a novel\nmethod for colon section reconstruction by leveraging NeuS applied to\nendoscopic images, supplemented by a single frame of depth map. Notably, we\npioneered the exploration of utilizing only one frame depth map in\nphotorealistic reconstruction and neural rendering applications while this\nsingle depth map can be easily obtainable from other monocular depth estimation\nnetworks with an object scale. 
Through rigorous experimentation and validation\non phantom imagery, our approach demonstrates exceptional accuracy in\ncompletely rendering colon sections, even capturing unseen portions of the\nsurface. This breakthrough opens avenues for achieving stable and consistently\nscaled reconstructions, promising enhanced quality in cancer screening\nprocedures and treatment interventions.\n","authors":["Baoru Huang","Yida Wang","Anh Nguyen","Daniel Elson","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.13437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13434v1","updated":"2024-04-20T17:56:14Z","published":"2024-04-20T17:56:14Z","title":"Nested-TNT: Hierarchical Vision Transformers with Multi-Scale Feature\n Processing","summary":" Transformer has been applied in the field of computer vision due to its\nexcellent performance in natural language processing, surpassing traditional\nconvolutional neural networks and achieving new state-of-the-art. ViT divides\nan image into several local patches, known as \"visual sentences\". However, the\ninformation contained in the image is vast and complex, and focusing only on\nthe features at the \"visual sentence\" level is not enough. The features between\nlocal patches should also be taken into consideration. In order to achieve\nfurther improvement, the TNT model is proposed, whose algorithm further divides\nthe image into smaller patches, namely \"visual words,\" achieving more accurate\nresults. The core of Transformer is the Multi-Head Attention mechanism, and\ntraditional attention mechanisms ignore interactions across different attention\nheads. In order to reduce redundancy and improve utilization, we introduce the\nnested algorithm and apply the Nested-TNT to image classification tasks. The\nexperiment confirms that the proposed model has achieved better classification\nperformance over ViT and TNT, exceeding 2.25%, 1.1% on dataset CIFAR10 and\n2.78%, 0.25% on dataset FLOWERS102 respectively.\n","authors":["Yuang Liu","Zhiheng Qiu","Xiaokai Qin"],"pdf_url":"https://arxiv.org/pdf/2404.13434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13425v1","updated":"2024-04-20T17:19:54Z","published":"2024-04-20T17:19:54Z","title":"AdvLoRA: Adversarial Low-Rank Adaptation of Vision-Language Models","summary":" Vision-Language Models (VLMs) are a significant technique for Artificial\nGeneral Intelligence (AGI). With the fast growth of AGI, the security problem\nbecome one of the most important challenges for VLMs. In this paper, through\nextensive experiments, we demonstrate the vulnerability of the conventional\nadaptation methods for VLMs, which may bring significant security risks. In\naddition, as the size of the VLMs increases, performing conventional\nadversarial adaptation techniques on VLMs results in high computational costs.\nTo solve these problems, we propose a parameter-efficient\n\\underline{Adv}ersarial adaptation method named \\underline{AdvLoRA} by\n\\underline{Lo}w-\\underline{R}ank \\underline{A}daptation. At first, we\ninvestigate and reveal the intrinsic low-rank property during the adversarial\nadaptation for VLMs. Different from LoRA, we improve the efficiency and\nrobustness of adversarial adaptation by designing a novel reparameterizing\nmethod based on parameter clustering and parameter alignment. In addition, an\nadaptive parameter update strategy is proposed to further improve the\nrobustness. 
By these settings, our proposed AdvLoRA alleviates the model\nsecurity and high resource waste problems. Extensive experiments demonstrate\nthe effectiveness and efficiency of the AdvLoRA.\n","authors":["Yuheng Ji","Yue Liu","Zhicheng Zhang","Zhao Zhang","Yuting Zhao","Gang Zhou","Xingwei Zhang","Xinwang Liu","Xiaolong Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09498v2","updated":"2024-04-20T16:42:42Z","published":"2024-04-15T06:37:21Z","title":"FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion\n with Mamba","summary":" Multi-modal image fusion aims to combine information from different modes to\ncreate a single image with comprehensive information and detailed textures.\nHowever, fusion models based on convolutional neural networks encounter\nlimitations in capturing global image features due to their focus on local\nconvolution operations. Transformer-based models, while excelling in global\nfeature modeling, confront computational challenges stemming from their\nquadratic complexity. Recently, the Selective Structured State Space Model has\nexhibited significant potential for long-range dependency modeling with linear\ncomplexity, offering a promising avenue to address the aforementioned dilemma.\nIn this paper, we propose FusionMamba, a novel dynamic feature enhancement\nmethod for multimodal image fusion with Mamba. Specifically, we devise an\nimproved efficient Mamba model for image fusion, integrating efficient visual\nstate space model with dynamic convolution and channel attention. This refined\nmodel not only upholds the performance of Mamba and global modeling capability\nbut also diminishes channel redundancy while enhancing local enhancement\ncapability. Additionally, we devise a dynamic feature fusion module (DFFM)\ncomprising two dynamic feature enhancement modules (DFEM) and a cross modality\nfusion mamba module (CMFM). The former serves for dynamic texture enhancement\nand dynamic difference perception, whereas the latter enhances correlation\nfeatures between modes and suppresses redundant intermodal information.\nFusionMamba has yielded state-of-the-art (SOTA) performance across various\nmultimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), infrared\nand visible image fusion task (IR-VIS) and multimodal biomedical image fusion\ndataset (GFP-PC), which is proved that our model has generalization ability.\nThe code for FusionMamba is available at\nhttps://github.com/millieXie/FusionMamba.\n","authors":["Xinyu Xie","Yawen Cui","Chio-In Ieong","Tao Tan","Xiaozhi Zhang","Xubin Zheng","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13420v1","updated":"2024-04-20T16:36:24Z","published":"2024-04-20T16:36:24Z","title":"NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by\n Enforcing Zero Gaussian Curvature","summary":" Despite recent advances in reconstructing an organic model with the neural\nsigned distance function (SDF), the high-fidelity reconstruction of a CAD model\ndirectly from low-quality unoriented point clouds remains a significant\nchallenge. 
In this paper, we address this challenge based on the prior\nobservation that the surface of a CAD model is generally composed of piecewise\nsurface patches, each approximately developable even around the feature line.\nOur approach, named NeurCADRecon, is self-supervised, and its loss includes a\ndevelopability term to encourage the Gaussian curvature toward 0 while ensuring\nfidelity to the input points. Noticing that the Gaussian curvature is non-zero\nat tip points, we introduce a double-trough curve to tolerate the existence of\nthese tip points. Furthermore, we develop a dynamic sampling strategy to deal\nwith situations where the given points are incomplete or too sparse. Since our\nresulting neural SDFs can clearly manifest sharp feature points/lines, one can\neasily extract the feature-aligned triangle mesh from the SDF and then\ndecompose it into smooth surface patches, greatly reducing the difficulty of\nrecovering the parametric CAD design. A comprehensive comparison with existing\nstate-of-the-art methods shows the significant advantage of our approach in\nreconstructing faithful CAD shapes.\n","authors":["Qiujie Dong","Rui Xu","Pengfei Wang","Shuangmin Chen","Shiqing Xin","Xiaohong Jia","Wenping Wang","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2404.13420v1.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH 2024)"},{"id":"http://arxiv.org/abs/2403.10488v3","updated":"2024-04-20T16:24:44Z","published":"2024-03-15T17:23:38Z","title":"Joint Multimodal Transformer for Emotion Recognition in the Wild","summary":" Multimodal emotion recognition (MMER) systems typically outperform unimodal\nsystems by leveraging the inter- and intra-modal relationships between, e.g.,\nvisual, textual, physiological, and auditory modalities. This paper proposes an\nMMER method that relies on a joint multimodal transformer (JMT) for fusion with\nkey-based cross-attention. This framework can exploit the complementary nature\nof diverse modalities to improve predictive accuracy. Separate backbones\ncapture intra-modal spatiotemporal dependencies within each modality over video\nsequences. Subsequently, our JMT fusion architecture integrates the individual\nmodality embeddings, allowing the model to effectively capture inter- and\nintra-modal relationships. Extensive experiments on two challenging expression\nrecognition tasks -- (1) dimensional emotion recognition on the Affwild2\ndataset (with face and voice) and (2) pain estimation on the Biovid dataset\n(with face and biosensors) -- indicate that our JMT fusion can provide a\ncost-effective solution for MMER. Empirical results show that MMER systems with\nour proposed fusion allow us to outperform relevant baseline and\nstate-of-the-art methods.\n","authors":["Paul Waligora","Haseeb Aslam","Osama Zeeshan","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10488v3.pdf","comment":"10 pages, 4 figures, 6 tables, CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.05238v3","updated":"2024-04-20T16:15:53Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature importance maps \\cite{bansal2020sam}\nhave been established as a common means for finding how important each input\nfeature is to an AI's decisions. 
It is an interesting, unexplored question\nwhether allowing users to edit the feature importance at test time would\nimprove a human-AI team's accuracy on downstream tasks. In this paper, we\naddress this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc\nexplainable classifier \\cite{taesiri2022visual} that first predicts patch-wise\ncorrespondences between the input and training-set images, and then bases on\nthem to make classification decisions. We build CHM-Corr++, an interactive\ninterface for CHM-Corr, enabling users to edit the feature importance map\nprovided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users\ncan gain insights into if, when, and how the model changes its outputs,\nimproving their understanding beyond static explanations. However, our study\nwith 18 expert users who performed 1,400 decisions finds no statistical\nsignificance that our interactive approach improves user accuracy on CUB-200\nbird image classification over static explanations. This challenges the\nhypothesis that interactivity can boost human-AI team accuracy and raises needs\nfor future research. We open-source CHM-Corr++, an interactive tool for editing\nimage classifier attention (see an interactive demo here:\nhttp://137.184.82.109:7080/). We release code and data on github:\nhttps://github.com/anguyen8/chm-corr-interactive.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v3.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2404.13417v1","updated":"2024-04-20T16:11:47Z","published":"2024-04-20T16:11:47Z","title":"Efficient and Concise Explanations for Object Detection with\n Gaussian-Class Activation Mapping Explainer","summary":" To address the challenges of providing quick and plausible explanations in\nExplainable AI (XAI) for object detection models, we introduce the Gaussian\nClass Activation Mapping Explainer (G-CAME). Our method efficiently generates\nconcise saliency maps by utilizing activation maps from selected layers and\napplying a Gaussian kernel to emphasize critical image regions for the\npredicted object. Compared with other Region-based approaches, G-CAME\nsignificantly reduces explanation time to 0.5 seconds without compromising the\nquality. Our evaluation of G-CAME, using Faster-RCNN and YOLOX on the MS-COCO\n2017 dataset, demonstrates its ability to offer highly plausible and faithful\nexplanations, especially in reducing the bias on tiny object detection.\n","authors":["Quoc Khanh Nguyen","Truong Thanh Hung Nguyen","Vo Thanh Khang Nguyen","Van Binh Truong","Tuong Phan","Hung Cao"],"pdf_url":"https://arxiv.org/pdf/2404.13417v1.pdf","comment":"Canadian AI 2024"},{"id":"http://arxiv.org/abs/2401.09630v3","updated":"2024-04-20T16:10:04Z","published":"2024-01-17T22:44:18Z","title":"CT Liver Segmentation via PVT-based Encoding and Refined Decoding","summary":" Accurate liver segmentation from CT scans is essential for effective\ndiagnosis and treatment planning. Computer-aided diagnosis systems promise to\nimprove the precision of liver disease diagnosis, disease progression, and\ntreatment planning. In response to the need, we propose a novel deep learning\napproach, \\textit{\\textbf{PVTFormer}}, that is built upon a pretrained pyramid\nvision transformer (PVT v2) combined with advanced residual upsampling and\ndecoder block. 
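The Gaussian weighting step that G-CAME applies to activation maps can be illustrated generically: build a 2D Gaussian centred on the predicted object and use it to mask a saliency map. The sigma, map resolution, and normalisation below are arbitrary assumptions for illustration, not values from the paper.

```python
import numpy as np

def gaussian_masked_cam(cam: np.ndarray, center_xy, sigma: float = 12.0) -> np.ndarray:
    """Emphasise the region of a class activation map around a predicted
    object centre with a 2D Gaussian window (a schematic G-CAME-style weighting).

    cam:       (H, W) non-negative saliency/activation map.
    center_xy: (cx, cy) predicted object centre in map coordinates.
    """
    h, w = cam.shape
    ys, xs = np.mgrid[0:h, 0:w]
    cx, cy = center_xy
    gauss = np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * sigma ** 2))
    masked = cam * gauss
    return masked / (masked.max() + 1e-8)   # normalise to [0, 1] for visualisation

# Toy example: a random activation map and an object predicted near (40, 25).
cam = np.random.rand(64, 96)
saliency = gaussian_masked_cam(cam, center_xy=(40, 25))
print(saliency.shape, saliency.max())
```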
By integrating a refined feature channel approach with a\nhierarchical decoding strategy, PVTFormer generates high quality segmentation\nmasks by enhancing semantic features. Rigorous evaluation of the proposed\nmethod on Liver Tumor Segmentation Benchmark (LiTS) 2017 demonstrates that our\nproposed architecture not only achieves a high dice coefficient of 86.78\\%,\nmIoU of 78.46\\%, but also obtains a low HD of 3.50. The results underscore\nPVTFormer's efficacy in setting a new benchmark for state-of-the-art liver\nsegmentation methods. The source code of the proposed PVTFormer is available at\n\\url{https://github.com/DebeshJha/PVTFormer}.\n","authors":["Debesh Jha","Nikhil Kumar Tomar","Koushik Biswas","Gorkem Durak","Alpay Medetalibeyoglu","Matthew Antalek","Yury Velichko","Daniela Ladner","Amir Borhani","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2401.09630v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17948v2","updated":"2024-04-20T15:29:00Z","published":"2023-11-29T05:28:05Z","title":"Action-slot: Visual Action-centric Representations for Multi-label\n Atomic Activity Recognition in Traffic Scenes","summary":" In this paper, we study multi-label atomic activity recognition. Despite the\nnotable progress in action recognition, it is still challenging to recognize\natomic activities due to a deficiency in a holistic understanding of both\nmultiple road users' motions and their contextual information. In this paper,\nwe introduce Action-slot, a slot attention-based approach that learns visual\naction-centric representations, capturing both motion and contextual\ninformation. Our key idea is to design action slots that are capable of paying\nattention to regions where atomic activities occur, without the need for\nexplicit perception guidance. To further enhance slot attention, we introduce a\nbackground slot that competes with action slots, aiding the training process in\navoiding unnecessary focus on background regions devoid of activities. Yet, the\nimbalanced class distribution in the existing dataset hampers the assessment of\nrare activities. To address the limitation, we collect a synthetic dataset\ncalled TACO, which is four times larger than OATS and features a balanced\ndistribution of atomic activities. To validate the effectiveness of our method,\nwe conduct comprehensive experiments and ablation studies against various\naction recognition baselines. We also show that the performance of multi-label\natomic activity recognition on real-world datasets can be improved by\npretraining representations on TACO. We will release our source code and\ndataset. See the videos of visualization on the project page:\nhttps://hcis-lab.github.io/Action-slot/\n","authors":["Chi-Hsi Kung","Shu-Wei Lu","Yi-Hsuan Tsai","Yi-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13408v1","updated":"2024-04-20T15:23:15Z","published":"2024-04-20T15:23:15Z","title":"AMMUNet: Multi-Scale Attention Map Merging for Remote Sensing Image\n Segmentation","summary":" The advancement of deep learning has driven notable progress in remote\nsensing semantic segmentation. Attention mechanisms, while enabling global\nmodeling and utilizing contextual information, face challenges of high\ncomputational costs and require window-based operations that weaken capturing\nlong-range dependencies, hindering their effectiveness for remote sensing image\nprocessing. 
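Several segmentation entries in this list (PVTFormer above, AMMUNet just below) report Dice and mIoU. A small reference implementation of the two overlap metrics for binary masks is sketched below; the smoothing constant and the toy masks are arbitrary.

```python
import numpy as np

def dice_and_iou(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7):
    """Dice coefficient and IoU for two binary masks of the same shape."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    union = np.logical_or(pred, target).sum()
    dice = (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
    iou = (inter + eps) / (union + eps)
    return float(dice), float(iou)

# mIoU over a dataset is the mean of per-class IoU values computed this way.
pred = np.zeros((4, 4), dtype=bool); pred[:2, :] = True
gt = np.zeros((4, 4), dtype=bool); gt[:, :2] = True
print(dice_and_iou(pred, gt))  # 4 overlapping pixels -> Dice 0.5, IoU ~0.333
```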
In this letter, we propose AMMUNet, a UNet-based framework that\nemploys multi-scale attention map merging, comprising two key innovations: the\ngranular multi-head self-attention (GMSA) module and the attention map merging\nmechanism (AMMM). GMSA efficiently acquires global information while\nsubstantially mitigating computational costs in contrast to global multi-head\nself-attention mechanism. This is accomplished through the strategic\nutilization of dimension correspondence to align granularity and the reduction\nof relative position bias parameters, thereby optimizing computational\nefficiency. The proposed AMMM effectively combines multi-scale attention maps\ninto a unified representation using a fixed mask template, enabling the\nmodeling of global attention mechanism. Experimental evaluations highlight the\nsuperior performance of our approach, achieving remarkable mean intersection\nover union (mIoU) scores of 75.48\\% on the challenging Vaihingen dataset and an\nexceptional 77.90\\% on the Potsdam dataset, demonstrating the superiority of\nour method in precise remote sensing semantic segmentation. Codes are available\nat https://github.com/interpretty/AMMUNet.\n","authors":["Yang Yang","Shunyi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17132v3","updated":"2024-04-20T15:20:14Z","published":"2023-11-28T18:03:27Z","title":"TransNeXt: Robust Foveal Visual Perception for Vision Transformers","summary":" Due to the depth degradation effect in residual connections, many efficient\nVision Transformers models that rely on stacking layers for information\nexchange often fail to form sufficient information mixing, leading to unnatural\nvisual perception. To address this issue, in this paper, we propose Aggregated\nAttention, a biomimetic design-based token mixer that simulates biological\nfoveal vision and continuous eye movement while enabling each token on the\nfeature map to have a global perception. Furthermore, we incorporate learnable\ntokens that interact with conventional queries and keys, which further\ndiversifies the generation of affinity matrices beyond merely relying on the\nsimilarity between queries and keys. Our approach does not rely on stacking for\ninformation exchange, thus effectively avoiding depth degradation and achieving\nnatural visual perception. Additionally, we propose Convolutional GLU, a\nchannel mixer that bridges the gap between GLU and SE mechanism, which empowers\neach token to have channel attention based on its nearest neighbor image\nfeatures, enhancing local modeling capability and model robustness. We combine\naggregated attention and convolutional GLU to create a new visual backbone\ncalled TransNeXt. Extensive experiments demonstrate that our TransNeXt achieves\nstate-of-the-art performance across multiple model sizes. At a resolution of\n$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing\nConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet\naccuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of\n$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic\nsegmentation mIoU of 54.7.\n","authors":["Dai Shi"],"pdf_url":"https://arxiv.org/pdf/2311.17132v3.pdf","comment":"CVPR 2024 Camera-ready Version. 
Project Page:\n https://github.com/DaiShiResearch/TransNeXt"},{"id":"http://arxiv.org/abs/2404.09640v3","updated":"2024-04-20T15:18:03Z","published":"2024-04-15T10:19:39Z","title":"CREST: Cross-modal Resonance through Evidential Deep Learning for\n Enhanced Zero-Shot Learning","summary":" Zero-shot learning (ZSL) enables the recognition of novel classes by\nleveraging semantic knowledge transfer from known to unknown categories. This\nknowledge, typically encapsulated in attribute descriptions, aids in\nidentifying class-specific visual features, thus facilitating visual-semantic\nalignment and improving ZSL performance. However, real-world challenges such as\ndistribution imbalances and attribute co-occurrence among instances often\nhinder the discernment of local variances in images, a problem exacerbated by\nthe scarcity of fine-grained, region-specific attribute annotations. Moreover,\nthe variability in visual presentation within categories can also skew\nattribute-category associations. In response, we propose a bidirectional\ncross-modal ZSL approach CREST. It begins by extracting representations for\nattribute and visual localization and employs Evidential Deep Learning (EDL) to\nmeasure underlying epistemic uncertainty, thereby enhancing the model's\nresilience against hard negatives. CREST incorporates dual learning pathways,\nfocusing on both visual-category and attribute-category alignments, to ensure\nrobust correlation between latent and observable spaces. Moreover, we introduce\nan uncertainty-informed cross-modal fusion technique to refine visual-attribute\ninference. Extensive experiments demonstrate our model's effectiveness and\nunique explainability across multiple datasets. Our code and data are available\nat: https://github.com/JethroJames/CREST\n","authors":["Haojian Huang","Xiaozhen Qiao","Zhuo Chen","Haodong Chen","Bingyu Li","Zhe Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.09640v3.pdf","comment":"Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at:\n https://github.com/JethroJames/CREST"},{"id":"http://arxiv.org/abs/2212.11152v2","updated":"2024-04-20T15:15:13Z","published":"2022-12-10T13:01:18Z","title":"OpenPack: A Large-scale Dataset for Recognizing Packaging Works in\n IoT-enabled Logistic Environments","summary":" Unlike human daily activities, existing publicly available sensor datasets\nfor work activity recognition in industrial domains are limited by difficulties\nin collecting realistic data as close collaboration with industrial sites is\nrequired. This also limits research on and development of methods for\nindustrial applications. To address these challenges and contribute to research\non machine recognition of work activities in industrial domains, in this study,\nwe introduce a new large-scale dataset for packaging work recognition called\nOpenPack. OpenPack contains 53.8 hours of multimodal sensor data, including\nacceleration data, keypoints, depth images, and readings from IoT-enabled\ndevices (e.g., handheld barcode scanners), collected from 16 distinct subjects\nwith different levels of packaging work experience. We apply state-of-the-art\nhuman activity recognition techniques to the dataset and provide future\ndirections of complex work activity recognition studies in the pervasive\ncomputing community based on the results. We believe that OpenPack will\ncontribute to the sensor-based action/activity recognition community by\nproviding challenging tasks. 
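The Evidential Deep Learning component that CREST uses to measure epistemic uncertainty is, in its standard form, a Dirichlet head over per-class evidence. The sketch below shows only that standard readout (evidence, Dirichlet concentrations, and the K/S uncertainty mass), not CREST's full bidirectional cross-modal pipeline.

```python
import torch
import torch.nn.functional as F

def evidential_head(logits: torch.Tensor):
    """Standard EDL readout: non-negative evidence -> Dirichlet parameters,
    expected class probabilities, and a scalar epistemic-uncertainty mass.

    logits: (B, K) raw network outputs for K classes.
    """
    evidence = F.softplus(logits)           # e_k >= 0
    alpha = evidence + 1.0                  # Dirichlet concentration parameters
    strength = alpha.sum(dim=-1, keepdim=True)
    probs = alpha / strength                # expected probabilities under the Dirichlet
    k = logits.shape[-1]
    uncertainty = k / strength.squeeze(-1)  # u = K / S, larger when evidence is scarce
    return probs, uncertainty

logits = torch.tensor([[4.0, 0.1, 0.1], [0.1, 0.1, 0.1]])
p, u = evidential_head(logits)
print(p, u)  # the low-evidence second row receives the higher uncertainty
```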
The OpenPack dataset is available at\nhttps://open-pack.github.io.\n","authors":["Naoya Yoshimura","Jaime Morales","Takuya Maekawa","Takahiro Hara"],"pdf_url":"https://arxiv.org/pdf/2212.11152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13400v1","updated":"2024-04-20T14:57:31Z","published":"2024-04-20T14:57:31Z","title":"HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual\n Grounding","summary":" Visual grounding, which aims to ground a visual region via natural language,\nis a task that heavily relies on cross-modal alignment. Existing works utilized\nuni-modal pre-trained models to transfer visual/linguistic knowledge separately\nwhile ignoring the multimodal corresponding information. Motivated by recent\nadvancements in contrastive language-image pre-training and low-rank adaptation\n(LoRA) methods, we aim to solve the grounding task based on multimodal\npre-training. However, there exists significant task gaps between pre-training\nand grounding. Therefore, to address these gaps, we propose a concise and\nefficient hierarchical multimodal fine-grained modulation framework, namely\nHiVG. Specifically, HiVG consists of a multi-layer adaptive cross-modal bridge\nand a hierarchical multimodal low-rank adaptation (Hi LoRA) paradigm. The\ncross-modal bridge can address the inconsistency between visual features and\nthose required for grounding, and establish a connection between multi-level\nvisual and text features. Hi LoRA prevents the accumulation of perceptual\nerrors by adapting the cross-modal features from shallow to deep layers in a\nhierarchical manner. Experimental results on five datasets demonstrate the\neffectiveness of our approach and showcase the significant grounding\ncapabilities as well as promising energy efficiency advantages. The project\npage: https://github.com/linhuixiao/HiVG.\n","authors":["Linhui Xiao","Xiaoshan Yang","Fang Peng","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.13400v1.pdf","comment":"The project page: https://github.com/linhuixiao/HiVG"},{"id":"http://arxiv.org/abs/2404.02148v2","updated":"2024-04-20T14:45:54Z","published":"2024-04-02T17:58:03Z","title":"Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of\n Orthogonal Diffusion Models","summary":" Recent advancements in 3D generation are predominantly propelled by\nimprovements in 3D-aware image diffusion models which are pretrained on\nInternet-scale image data and fine-tuned on massive 3D data, offering the\ncapability of producing highly consistent multi-view images. However, due to\nthe scarcity of synchronized multi-view video data, it is impractical to adapt\nthis paradigm to 4D generation directly. Despite that, the available video and\n3D data are adequate for training video and multi-view diffusion models that\ncan provide satisfactory dynamic and geometric priors respectively. In this\npaper, we present Diffusion$^2$, a novel framework for dynamic 3D content\ncreation that leverages the knowledge about geometric consistency and temporal\nsmoothness from these models to directly sample dense multi-view and\nmulti-frame images which can be employed to optimize continuous 4D\nrepresentation. Specifically, we design a simple yet effective denoising\nstrategy via score composition of video and multi-view diffusion models based\non the probability structure of the images to be generated. 
Owing to the high\nparallelism of the image generation and the efficiency of the modern 4D\nreconstruction pipeline, our framework can generate 4D content within few\nminutes. Furthermore, our method circumvents the reliance on 4D data, thereby\nhaving the potential to benefit from the scalability of the foundation video\nand multi-view diffusion models. Extensive experiments demonstrate the efficacy\nof our proposed framework and its capability to flexibly adapt to various types\nof prompts.\n","authors":["Zeyu Yang","Zijie Pan","Chun Gu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.02148v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2304.02970v6","updated":"2024-04-20T14:16:48Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v6.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2312.01431v3","updated":"2024-04-20T14:15:36Z","published":"2023-12-03T15:40:10Z","title":"D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for\n Few-shot Action Recognition","summary":" Adapting large pre-trained image models to few-shot action recognition has\nproven to be an effective and efficient strategy for learning robust feature\nextractors, which is essential for few-shot learning. Typical fine-tuning based\nadaptation paradigm is prone to overfitting in the few-shot learning scenarios\nand offers little modeling flexibility for learning temporal features in video\ndata. In this work we present the Disentangled-and-Deformable Spatio-Temporal\nAdapter (D$^2$ST-Adapter), which is a novel adapter tuning framework\nwell-suited for few-shot action recognition due to lightweight design and low\nparameter-learning overhead. It is designed in a dual-pathway architecture to\nencode spatial and temporal features in a disentangled manner. 
In particular,\nwe devise the anisotropic Deformable Spatio-Temporal Attention module as the\ncore component of D$^2$ST-Adapter, which can be tailored with anisotropic\nsampling densities along spatial and temporal domains to learn spatial and\ntemporal features specifically in corresponding pathways, allowing our\nD$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space\nwhile maintaining a lightweight design. Extensive experiments with\ninstantiations of our method on both pre-trained ResNet and ViT demonstrate the\nsuperiority of our method over state-of-the-art methods for few-shot action\nrecognition. Our method is particularly well-suited to challenging scenarios\nwhere temporal dynamics are critical for action recognition.\n","authors":["Wenjie Pei","Qizhong Tan","Guangming Lu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.01431v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13386v1","updated":"2024-04-20T14:06:04Z","published":"2024-04-20T14:06:04Z","title":"SSVT: Self-Supervised Vision Transformer For Eye Disease Diagnosis Based\n On Fundus Images","summary":" Machine learning-based fundus image diagnosis technologies trigger worldwide\ninterest owing to their benefits such as reducing medical resource power and\nproviding objective evaluation results. However, current methods are commonly\nbased on supervised methods, bringing in a heavy workload to biomedical staff\nand hence suffering in expanding effective databases. To address this issue, in\nthis article, we established a label-free method, named 'SSVT', which can\nautomatically analyze un-labeled fundus images and achieve a high evaluation\naccuracy of 97.0% on four main eye diseases based on six public datasets and\ntwo datasets collected by Beijing Tongren Hospital. The promising results\nshowcased the effectiveness of the proposed unsupervised learning method, and\nthe strong application potential in biomedical resource shortage regions to\nimprove global eye health.\n","authors":["Jiaqi Wang","Mengtian Kang","Yong Liu","Chi Zhang","Ying Liu","Shiming Li","Yue Qi","Wenjun Xu","Chenyu Tang","Edoardo Occhipinti","Mayinuer Yusufu","Ningli Wang","Weiling Bai","Shuo Gao","Luigi G. Occhipinti"],"pdf_url":"https://arxiv.org/pdf/2404.13386v1.pdf","comment":"ISBI 2024"},{"id":"http://arxiv.org/abs/2401.13516v2","updated":"2024-04-20T13:56:32Z","published":"2024-01-24T15:14:05Z","title":"Delocate: Detection and Localization for Deepfake Videos with\n Randomly-Located Tampered Traces","summary":" Deepfake videos are becoming increasingly realistic, showing subtle tampering\ntraces on facial areas that vary between frames. Consequently, many existing\nDeepfake detection methods struggle to detect unknown domain Deepfake videos\nwhile accurately locating the tampered region. To address this limitation, we\npropose Delocate, a novel Deepfake detection model that can both recognize\nand localize unknown domain Deepfake videos. Our method consists of two stages\nnamed recovering and localization. In the recovering stage, the model randomly\nmasks regions of interest (ROIs) and reconstructs real faces without tampering\ntraces, resulting in a relatively good recovery effect for real faces and a poor\nrecovery effect for fake faces. In the localization stage, the output of the\nrecovery phase and the forgery ground truth mask serve as supervision to guide\nthe forgery localization process. 

This process strategically emphasizes the\nrecovery phase of fake faces with poor recovery, facilitating the localization\nof tampered regions. Our extensive experiments on four widely used benchmark\ndatasets demonstrate that Delocate not only excels in localizing tampered areas\nbut also enhances cross-domain detection performance.\n","authors":["Juan Hu","Xin Liao","Difei Gao","Satoshi Tsutsui","Qian Wang","Zheng Qin","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2401.13516v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.09921,\n arXiv:2305.05943"},{"id":"http://arxiv.org/abs/2404.13372v1","updated":"2024-04-20T13:19:08Z","published":"2024-04-20T13:19:08Z","title":"HybridFlow: Infusing Continuity into Masked Codebook for Extreme\n Low-Bitrate Image Compression","summary":" This paper investigates the challenging problem of learned image compression\n(LIC) with extreme low bitrates. Previous LIC methods based on transmitting\nquantized continuous features often yield blurry and noisy reconstruction due\nto the severe quantization loss. Meanwhile, previous LIC methods based on learned\ncodebooks that discretize visual space usually give poor-fidelity\nreconstruction due to the insufficient representation power of limited\ncodewords in capturing faithful details. We propose a novel dual-stream\nframework, HybridFlow, which combines the continuous-feature-based and\ncodebook-based streams to achieve both high perceptual quality and high\nfidelity under extreme low bitrates. The codebook-based stream benefits from\nthe high-quality learned codebook priors to provide high quality and clarity in\nreconstructed images. The continuous feature stream aims at maintaining\nfidelity details. To achieve the ultra low bitrate, a masked token-based\ntransformer is further proposed, where we only transmit a masked portion of\ncodeword indices and recover the missing indices through token generation\nguided by information from the continuous feature stream. We also develop a\nbridging correction network to merge the two streams in pixel decoding for\nfinal image reconstruction, where the continuous stream features rectify biases\nof the codebook-based pixel decoder to impose reconstructed fidelity details.\nExperimental results demonstrate superior performance across several datasets\nunder extremely low bitrates, compared with existing single-stream\ncodebook-based or continuous-feature-based LIC methods.\n","authors":["Lei Lu","Yanyue Xie","Wei Jiang","Wei Wang","Xue Lin","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13370v1","updated":"2024-04-20T13:15:27Z","published":"2024-04-20T13:15:27Z","title":"Movie101v2: Improved Movie Narration Benchmark","summary":" Automatic movie narration aims at creating video-aligned plot descriptions\nto assist visually impaired audiences. It differs from standard video\ncaptioning in that it requires not only describing key visual details but also\ninferring the plots developed across multiple movie shots, thus posing unique\nand ongoing challenges. To advance the development of automatic movie narrating\nsystems, we first revisit the limitations of existing datasets and develop a\nlarge-scale, bilingual movie narration dataset, Movie101v2. 
Second, taking into\naccount the essential difficulties in achieving applicable movie narration, we\nbreak the long-term goal into three progressive stages and tentatively focus on\nthe initial stages featuring understanding within individual clips. We also\nintroduce a new narration assessment to align with our staged task goals.\nThird, using our new dataset, we baseline several leading large vision-language\nmodels, including GPT-4V, and conduct in-depth investigations into the\nchallenges current models face for movie narration generation. Our findings\nreveal that achieving applicable movie narration generation is a fascinating\ngoal that requires thorough research.\n","authors":["Zihao Yue","Yepeng Zhang","Ziheng Wang","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2404.13370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06214v3","updated":"2024-04-20T13:15:21Z","published":"2023-10-10T00:07:25Z","title":"CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding","summary":" 3D visual grounding is the ability to localize objects in 3D scenes\nconditioned by utterances. Most existing methods devote the referring head to\nlocalize the referred object directly, causing failure in complex scenarios. In\naddition, it does not illustrate how and why the network reaches the final\ndecision. In this paper, we address this question Can we design an\ninterpretable 3D visual grounding framework that has the potential to mimic the\nhuman perception system?. To this end, we formulate the 3D visual grounding\nproblem as a sequence-to-sequence Seq2Seq task by first predicting a chain of\nanchors and then the final target. Interpretability not only improves the\noverall performance but also helps us identify failure cases. Following the\nchain of thoughts approach enables us to decompose the referring task into\ninterpretable intermediate steps, boosting the performance and making our\nframework extremely data-efficient. Moreover, our proposed framework can be\neasily integrated into any existing architecture. We validate our approach\nthrough comprehensive experiments on the Nr3D, Sr3D, and Scanrefer benchmarks\nand show consistent performance gains compared to existing methods without\nrequiring manually annotated data. Furthermore, our proposed framework, dubbed\nCoT3DRef, is significantly data-efficient, whereas on the Sr3D dataset, when\ntrained only on 10% of the data, we match the SOTA performance that trained on\nthe entire data. The code is available at\nhttps:eslambakr.github.io/cot3dref.github.io/.\n","authors":["Eslam Mohamed Bakr","Mohamed Ayman","Mahmoud Ahmed","Habib Slim","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2310.06214v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2402.11677v3","updated":"2024-04-20T13:00:25Z","published":"2024-02-18T18:56:13Z","title":"MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of\n LiDAR-Camera Fusion for 3D Object Detection","summary":" Multi-modal 3D object detection models for automated driving have\ndemonstrated exceptional performance on computer vision benchmarks like\nnuScenes. However, their reliance on densely sampled LiDAR point clouds and\nmeticulously calibrated sensor arrays poses challenges for real-world\napplications. Issues such as sensor misalignment, miscalibration, and disparate\nsampling frequencies lead to spatial and temporal misalignment in data from\nLiDAR and cameras. 
Additionally, the integrity of LiDAR and camera data is\noften compromised by adverse environmental conditions such as inclement\nweather, leading to occlusions and noise interference. To address this\nchallenge, we introduce MultiCorrupt, a comprehensive benchmark designed to\nevaluate the robustness of multi-modal 3D object detectors against ten distinct\ntypes of corruptions. We evaluate five state-of-the-art multi-modal detectors\non MultiCorrupt and analyze their performance in terms of their resistance\nability. Our results show that existing methods exhibit varying degrees of\nrobustness depending on the type of corruption and their fusion strategy. We\nprovide insights into which multi-modal design choices make such models robust\nagainst certain perturbations. The dataset generation code and benchmark are\nopen-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt.\n","authors":["Till Beemelmanns","Quan Zhang","Christian Geller","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2402.11677v3.pdf","comment":"Code: https://github.com/ika-rwth-aachen/MultiCorrupt"},{"id":"http://arxiv.org/abs/2307.06737v2","updated":"2024-04-20T11:53:13Z","published":"2023-07-13T13:17:50Z","title":"Improving 2D Human Pose Estimation in Rare Camera Views with Synthetic\n Data","summary":" Methods and datasets for human pose estimation focus predominantly on side-\nand front-view scenarios. We overcome the limitation by leveraging synthetic\ndata and introduce RePoGen (RarE POses GENerator), an SMPL-based method for\ngenerating synthetic humans with comprehensive control over pose and view.\nExperiments on top-view datasets and a new dataset of real images with diverse\nposes show that adding the RePoGen data to the COCO dataset outperforms\nprevious approaches to top- and bottom-view pose estimation without harming\nperformance on common views. An ablation study shows that anatomical\nplausibility, a property prior research focused on, is not a prerequisite for\neffective performance. The introduced dataset and the corresponding code are\navailable on https://mirapurkrabek.github.io/RePoGen-paper/ .\n","authors":["Miroslav Purkrabek","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2307.06737v2.pdf","comment":"https://mirapurkrabek.github.io/RePoGen-paper/"},{"id":"http://arxiv.org/abs/2404.13353v1","updated":"2024-04-20T11:28:14Z","published":"2024-04-20T11:28:14Z","title":"Generating Daylight-driven Architectural Design via Diffusion Models","summary":" In recent years, the rapid development of large-scale models has made new\npossibilities for interdisciplinary fields such as architecture. In this paper,\nwe present a novel daylight-driven AI-aided architectural design method.\nFirstly, we formulate a method for generating massing models, producing\narchitectural massing models using random parameters quickly. Subsequently, we\nintegrate a daylight-driven facade design strategy, accurately determining\nwindow layouts and applying them to the massing models. Finally, we seamlessly\ncombine a large-scale language model with a text-to-image model, enhancing the\nefficiency of generating visual architectural design renderings. 
Experimental\nresults demonstrate that our approach supports architects' creative\ninspirations and pioneers novel avenues for architectural design development.\nProject page: https://zrealli.github.io/DDADesign/.\n","authors":["Pengzhi Li","Baijuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.13353v1.pdf","comment":"Project page: https://zrealli.github.io/DDADesign/"},{"id":"http://arxiv.org/abs/2404.13342v1","updated":"2024-04-20T10:40:12Z","published":"2024-04-20T10:40:12Z","title":"Hyperspectral Anomaly Detection with Self-Supervised Anomaly Prior","summary":" The majority of existing hyperspectral anomaly detection (HAD) methods use\nthe low-rank representation (LRR) model to separate the background and anomaly\ncomponents, where the anomaly component is optimized by handcrafted sparse\npriors (e.g., $\\ell_{2,1}$-norm). However, this may not be ideal since they\noverlook the spatial structure present in anomalies and make the detection\nresult largely dependent on manually set sparsity. To tackle these problems, we\nredefine the optimization criterion for the anomaly component in the LRR model\nwith a self-supervised network called self-supervised anomaly prior (SAP). This\nprior is obtained by the pretext task of self-supervised learning, which is\ncustomized to learn the characteristics of hyperspectral anomalies.\nSpecifically, this pretext task is a classification task to distinguish the\noriginal hyperspectral image (HSI) and the pseudo-anomaly HSI, where the\npseudo-anomaly is generated from the original HSI and designed as a prism with\narbitrary polygon bases and arbitrary spectral bands. In addition, a\ndual-purified strategy is proposed to provide a more refined background\nrepresentation with an enriched background dictionary, facilitating the\nseparation of anomalies from complex backgrounds. Extensive experiments on\nvarious hyperspectral datasets demonstrate that the proposed SAP offers a more\naccurate and interpretable solution than other advanced HAD methods.\n","authors":["Yidan Liu","Weiying Xie","Kai Jiang","Jiaqing Zhang","Yunsong Li","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.13342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13330v1","updated":"2024-04-20T09:27:05Z","published":"2024-04-20T09:27:05Z","title":"SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical\n Instrument Segmentation","summary":" SEGSRNet addresses the challenge of precisely identifying surgical\ninstruments in low-resolution stereo endoscopic images, a common issue in\nmedical imaging and robotic surgery. Our innovative framework enhances image\nclarity and segmentation accuracy by applying state-of-the-art super-resolution\ntechniques before segmentation. This ensures higher-quality inputs for more\nprecise segmentation. SEGSRNet combines advanced feature extraction and\nattention mechanisms with spatial processing to sharpen image details, which is\nsignificant for accurate tool identification in medical images. Our proposed\nmodel outperforms current models including Dice, IoU, PSNR, and SSIM, SEGSRNet\nwhere it produces clearer and more accurate images for stereo endoscopic\nsurgical imaging. 
SEGSRNet can provide image resolution and precise\nsegmentation which can significantly enhance surgical accuracy and patient care\noutcomes.\n","authors":["Mansoor Hayat","Supavadee Aramvith","Titipat Achakulvisut"],"pdf_url":"https://arxiv.org/pdf/2404.13330v1.pdf","comment":"Paper accepted for Presentation in 46th Annual International\n Conference of the IEEE Engineering in Medicine and Biology Society (EMBS),\n Orlando, Florida, USA"},{"id":"http://arxiv.org/abs/2403.17881v3","updated":"2024-04-20T09:06:02Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" Deepfake is a technology dedicated to creating highly realistic facial images\nand videos under specific conditions, which has significant application\npotential in fields such as entertainment, movie production, digital human\ncreation, to name a few. With the advancements in deep learning, techniques\nprimarily represented by Variational Autoencoders and Generative Adversarial\nNetworks have achieved impressive generation results. More recently, the\nemergence of diffusion models with powerful generation capabilities has sparked\na renewed wave of research. In addition to deepfake generation, corresponding\ndetection technologies continuously evolve to regulate the potential misuse of\ndeepfakes, such as for privacy invasion and phishing attacks. This survey\ncomprehensively reviews the latest developments in deepfake generation and\ndetection, summarizing and analyzing current state-of-the-arts in this rapidly\nevolving field. We first unify task definitions, comprehensively introduce\ndatasets and metrics, and discuss developing technologies. Then, we discuss the\ndevelopment of several related sub-fields and focus on researching four\nrepresentative deepfake fields: face swapping, face reenactment, talking face\ngeneration, and facial attribute editing, as well as forgery detection.\nSubsequently, we comprehensively benchmark representative methods on popular\ndatasets for each field, fully evaluating the latest and influential published\nworks. Finally, we analyze challenges and future research directions of the\ndiscussed fields.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v3.pdf","comment":"We closely follow the latest developments in\n https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection"},{"id":"http://arxiv.org/abs/2404.13324v1","updated":"2024-04-20T08:48:37Z","published":"2024-04-20T08:48:37Z","title":"Collaborative Visual Place Recognition through Federated Learning","summary":" Visual Place Recognition (VPR) aims to estimate the location of an image by\ntreating it as a retrieval problem. VPR uses a database of geo-tagged images\nand leverages deep neural networks to extract a global representation, called\ndescriptor, from each image. While the training data for VPR models often\noriginates from diverse, geographically scattered sources (geo-tagged images),\nthe training process itself is typically assumed to be centralized. This\nresearch revisits the task of VPR through the lens of Federated Learning (FL),\naddressing several key challenges associated with this adaptation. VPR data\ninherently lacks well-defined classes, and models are typically trained using\ncontrastive learning, which necessitates a data mining step on a centralized\ndatabase. 
Additionally, client devices in federated systems can be highly\nheterogeneous in terms of their processing capabilities. The proposed FedVPR\nframework not only presents a novel approach for VPR but also introduces a new,\nchallenging, and realistic task for FL research, paving the way to other image\nretrieval tasks in FL.\n","authors":["Mattia Dutto","Gabriele Berton","Debora Caldarola","Eros Fanì","Gabriele Trivigno","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2404.13324v1.pdf","comment":"13 pages, 7 figures, CVPR - The 3rd International Workshop on\n Federated Learning for Computer Vision (FedVision-2024)"},{"id":"http://arxiv.org/abs/2404.13320v1","updated":"2024-04-20T08:28:43Z","published":"2024-04-20T08:28:43Z","title":"Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than\n We Think","summary":" Adversarial examples for diffusion models are widely used as solutions for\nsafety concerns. By adding adversarial perturbations to personal images,\nattackers can not edit or imitate them easily. However, it is essential to note\nthat all these protections target the latent diffusion model (LDMs), the\nadversarial examples for diffusion models in the pixel space (PDMs) are largely\noverlooked. This may mislead us to think that the diffusion models are\nvulnerable to adversarial attacks like most deep models. In this paper, we show\nnovel findings that: even though gradient-based white-box attacks can be used\nto attack the LDMs, they fail to attack PDMs. This finding is supported by\nextensive experiments of almost a wide range of attacking methods on various\nPDMs and LDMs with different model structures, which means diffusion models are\nindeed much more robust against adversarial attacks. We also find that PDMs can\nbe used as an off-the-shelf purifier to effectively remove the adversarial\npatterns that were generated on LDMs to protect the images, which means that\nmost protection methods nowadays, to some extent, cannot protect our images\nfrom malicious attacks. We hope that our insights will inspire the community to\nrethink the adversarial samples for diffusion models as protection methods and\nmove forward to more effective protection. Codes are available in\nhttps://github.com/xavihart/PDM-Pure.\n","authors":["Haotian Xue","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13311v1","updated":"2024-04-20T07:56:21Z","published":"2024-04-20T07:56:21Z","title":"STAT: Towards Generalizable Temporal Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) aims to recognize and\nlocalize action instances with only video-level labels. Despite the significant\nprogress, existing methods suffer from severe performance degradation when\ntransferring to different distributions and thus may hardly adapt to real-world\nscenarios . To address this problem, we propose the Generalizable Temporal\nAction Localization task (GTAL), which focuses on improving the\ngeneralizability of action localization methods. We observed that the\nperformance decline can be primarily attributed to the lack of generalizability\nto different action scales. To address this problem, we propose STAT\n(Self-supervised Temporal Adaptive Teacher), which leverages a teacher-student\nstructure for iterative refinement. Our STAT features a refinement module and\nan alignment module. The former iteratively refines the model's output by\nleveraging contextual information and helps adapt to the target scale. 
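The federated setting that FedVPR adapts Visual Place Recognition to typically revolves around FedAvg-style aggregation of client updates. A minimal, generic weighted averaging of client checkpoints is sketched below; FedVPR's actual aggregation, mining strategy, and handling of heterogeneous clients may differ.

```python
from typing import Dict, List
import torch

def fedavg(client_states: List[Dict[str, torch.Tensor]],
           client_sizes: List[int]) -> Dict[str, torch.Tensor]:
    """Weight each client's parameters by its local dataset size and average them."""
    total = float(sum(client_sizes))
    averaged = {}
    for name in client_states[0]:
        averaged[name] = sum(
            state[name] * (size / total)
            for state, size in zip(client_states, client_sizes)
        )
    return averaged

# Toy round with two clients sharing the same (hypothetical) descriptor-head layout.
head_a = {"fc.weight": torch.ones(4, 8), "fc.bias": torch.zeros(4)}
head_b = {"fc.weight": torch.zeros(4, 8), "fc.bias": torch.ones(4)}
global_head = fedavg([head_a, head_b], client_sizes=[300, 100])
print(global_head["fc.weight"][0, 0], global_head["fc.bias"][0])  # 0.75, 0.25
```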
The\nlatter improves the refinement process by promoting a consensus between student\nand teacher models. We conduct extensive experiments on three datasets,\nTHUMOS14, ActivityNet1.2, and HACS, and the results show that our method\nsignificantly improves over the baseline methods under the cross-distribution\nevaluation setting, even approaching the same-distribution evaluation\nperformance.\n","authors":["Yangcen Liu","Ziyi Liu","Yuanhao Zhai","Wen Li","David Doerman","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.13311v1.pdf","comment":"14 pages, LaTeX;"},{"id":"http://arxiv.org/abs/2404.04848v2","updated":"2024-04-20T07:54:18Z","published":"2024-04-07T07:42:04Z","title":"Task-Aware Encoder Control for Deep Video Compression","summary":" Prior research on deep video compression (DVC) for machine tasks typically\nnecessitates training a unique codec for each specific task, mandating a\ndedicated decoder per task. In contrast, traditional video codecs employ a\nflexible encoder controller, enabling the adaptation of a single codec to\ndifferent tasks through mechanisms like mode prediction. Drawing inspiration\nfrom this, we introduce an innovative encoder controller for deep video\ncompression for machines. This controller features a mode prediction and a\nGroup of Pictures (GoP) selection module. Our approach centralizes control at\nthe encoding stage, allowing for adaptable encoder adjustments across different\ntasks, such as detection and tracking, while maintaining compatibility with a\nstandard pre-trained DVC decoder. Empirical evidence demonstrates that our\nmethod is applicable across multiple tasks with various existing pre-trained\nDVCs. Moreover, extensive experiments demonstrate that our method outperforms\nprevious DVC by about 25% bitrate for different tasks, with only one\npre-trained decoder.\n","authors":["Xingtong Ge","Jixiang Luo","Xinjie Zhang","Tongda Xu","Guo Lu","Dailan He","Jing Geng","Yan Wang","Jun Zhang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2404.04848v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01643v2","updated":"2024-04-20T07:41:32Z","published":"2024-04-02T05:19:27Z","title":"A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection","summary":" Conventional Computed Tomography (CT) imaging recognition faces two\nsignificant challenges: (1) There is often considerable variability in the\nresolution and size of each CT scan, necessitating strict requirements for the\ninput size and adaptability of models. (2) A CT scan contains a large number of\nout-of-distribution (OOD) slices. The crucial features may only be present in\nspecific spatial regions and slices of the entire CT scan. How can we\neffectively figure out where these are located? To deal with this, we introduce\nan enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically\ndesigned for CT scans. It aims to filter out OOD data within the whole CT scan,\nenabling us to select the crucial spatial slices for analysis and reducing\noverall redundancy by 70%. Meanwhile, we propose a Kernel-Density-based slice Sampling\n(KDS) method to improve stability during the training and inference stages,\nthereby speeding up convergence and boosting performance. As a\nresult, the experiments demonstrate the promising performance of our model\nusing a simple EfficientNet-2D (E2D) model, even with only 1% of the training\ndata. 
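Teacher-student refinement loops such as the one in STAT (two entries above) are commonly driven by an exponential-moving-average teacher. The sketch below shows only that standard EMA update with an assumed momentum value, not STAT's refinement and alignment modules.

```python
import copy
import torch
import torch.nn as nn

@torch.no_grad()
def ema_update(teacher: nn.Module, student: nn.Module, momentum: float = 0.999):
    """teacher <- momentum * teacher + (1 - momentum) * student, parameter-wise."""
    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
        t_param.mul_(momentum).add_(s_param, alpha=1.0 - momentum)

student = nn.Linear(16, 4)
teacher = copy.deepcopy(student)   # the teacher starts as a frozen copy of the student
for p in teacher.parameters():
    p.requires_grad = False

# ... one student optimisation step would happen here ...
ema_update(teacher, student)       # the teacher then slowly tracks the student
```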
The efficacy of our approach has been validated on the COVID-19-CT-DB\ndatasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024.\nOur source code is available at https://github.com/ming053l/E2D\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yang Fan Chiang","Yi-Shiuan Chou","Chih-Yu Jiang","Shen-Chieh Tai","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.01643v2.pdf","comment":"Camera-ready version, accepted by DEF-AI-MIA workshop, in conjunted\n with CVPR2024"},{"id":"http://arxiv.org/abs/2404.13306v1","updated":"2024-04-20T07:28:55Z","published":"2024-04-20T07:28:55Z","title":"FakeBench: Uncover the Achilles' Heels of Fake Images with Large\n Multimodal Models","summary":" Recently, fake images generated by artificial intelligence (AI) models have\nbecome indistinguishable from the real, exerting new challenges for fake image\ndetection models. To this extent, simple binary judgments of real or fake seem\nless convincing and credible due to the absence of human-understandable\nexplanations. Fortunately, Large Multimodal Models (LMMs) bring possibilities\nto materialize the judgment process while their performance remains\nundetermined. Therefore, we propose FakeBench, the first-of-a-kind benchmark\ntowards transparent defake, consisting of fake images with human language\ndescriptions on forgery signs. FakeBench gropes for two open questions of LMMs:\n(1) can LMMs distinguish fake images generated by AI, and (2) how do LMMs\ndistinguish fake images? In specific, we construct the FakeClass dataset with\n6k diverse-sourced fake and real images, each equipped with a Question&Answer\npair concerning the authenticity of images, which are utilized to benchmark the\ndetection ability. To examine the reasoning and interpretation abilities of\nLMMs, we present the FakeClue dataset, consisting of 15k pieces of descriptions\non the telltale clues revealing the falsification of fake images. Besides, we\nconstruct the FakeQA to measure the LMMs' open-question answering ability on\nfine-grained authenticity-relevant aspects. Our experimental results discover\nthat current LMMs possess moderate identification ability, preliminary\ninterpretation and reasoning ability, and passable open-question answering\nability for image defake. The FakeBench will be made publicly available soon.\n","authors":["Yixuan Li","Xuelin Liu","Xiaoyang Wang","Shiqi Wang","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2404.13306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13299v1","updated":"2024-04-20T07:05:45Z","published":"2024-04-20T07:05:45Z","title":"PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt\n Condition","summary":" The development of Large Language Models (LLM) and Diffusion Models brings\nthe boom of Artificial Intelligence Generated Content (AIGC). It is essential\nto build an effective quality assessment framework to provide a quantifiable\nevaluation of different images or videos based on the AIGC technologies. The\ncontent generated by AIGC methods is driven by the crafted prompts. Therefore,\nit is intuitive that the prompts can also serve as the foundation of the AIGC\nquality assessment. This study proposes an effective AIGC quality assessment\n(QA) framework. First, we propose a hybrid prompt encoding method based on a\ndual-source CLIP (Contrastive Language-Image Pre-Training) text encoder to\nunderstand and respond to the prompt conditions. 
Second, we propose an\nensemble-based feature mixer module to effectively blend the adapted prompt and\nvision features. The empirical study practices in two datasets: AIGIQA-20K\n(AI-Generated Image Quality Assessment database) and T2VQA-DB (Text-to-Video\nQuality Assessment DataBase), which validates the effectiveness of our proposed\nmethod: Prompt Condition Quality Assessment (PCQA). Our proposed simple and\nfeasible framework may promote research development in the multimodal\ngeneration field.\n","authors":["Xi Fang","Weigang Wang","Xiaoxin Lv","Jun Yan"],"pdf_url":"https://arxiv.org/pdf/2404.13299v1.pdf","comment":"Published in CVPR-2024's NTIRE: New Trends in Image Restoration and\n Enhancement workshop and challenges"},{"id":"http://arxiv.org/abs/2404.13288v1","updated":"2024-04-20T06:25:32Z","published":"2024-04-20T06:25:32Z","title":"PoseINN: Realtime Visual-based Pose Regression and Localization with\n Invertible Neural Networks","summary":" Estimating ego-pose from cameras is an important problem in robotics with\napplications ranging from mobile robotics to augmented reality. While SOTA\nmodels are becoming increasingly accurate, they can still be unwieldy due to\nhigh computational costs. In this paper, we propose to solve the problem by\nusing invertible neural networks (INN) to find the mapping between the latent\nspace of images and poses for a given scene. Our model achieves similar\nperformance to the SOTA while being faster to train and only requiring offline\nrendering of low-resolution synthetic data. By using normalizing flows, the\nproposed method also provides uncertainty estimation for the output. We also\ndemonstrated the efficiency of this method by deploying the model on a mobile\nrobot.\n","authors":["Zirui Zang","Ahmad Amine","Rahul Mangharam"],"pdf_url":"https://arxiv.org/pdf/2404.13288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05060v2","updated":"2024-04-20T06:14:34Z","published":"2023-04-11T08:43:52Z","title":"SPIRiT-Diffusion: Self-Consistency Driven Diffusion Model for\n Accelerated MRI","summary":" Diffusion models have emerged as a leading methodology for image generation\nand have proven successful in the realm of magnetic resonance imaging (MRI)\nreconstruction. However, existing reconstruction methods based on diffusion\nmodels are primarily formulated in the image domain, making the reconstruction\nquality susceptible to inaccuracies in coil sensitivity maps (CSMs). k-space\ninterpolation methods can effectively address this issue but conventional\ndiffusion models are not readily applicable in k-space interpolation. To\novercome this challenge, we introduce a novel approach called SPIRiT-Diffusion,\nwhich is a diffusion model for k-space interpolation inspired by the iterative\nself-consistent SPIRiT method. Specifically, we utilize the iterative solver of\nthe self-consistent term (i.e., k-space physical prior) in SPIRiT to formulate\na novel stochastic differential equation (SDE) governing the diffusion process.\nSubsequently, k-space data can be interpolated by executing the diffusion\nprocess. This innovative approach highlights the optimization model's role in\ndesigning the SDE in diffusion models, enabling the diffusion process to align\nclosely with the physics inherent in the optimization model, a concept referred\nto as model-driven diffusion. We evaluated the proposed SPIRiT-Diffusion method\nusing a 3D joint intracranial and carotid vessel wall imaging dataset. 
The\nresults convincingly demonstrate its superiority over image-domain\nreconstruction methods, achieving high reconstruction quality even at a\nsubstantial acceleration rate of 10.\n","authors":["Zhuo-Xu Cui","Chentao Cao","Yue Wang","Sen Jia","Jing Cheng","Xin Liu","Hairong Zheng","Dong Liang","Yanjie Zhu"],"pdf_url":"https://arxiv.org/pdf/2304.05060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13282v1","updated":"2024-04-20T06:01:09Z","published":"2024-04-20T06:01:09Z","title":"Wills Aligner: A Robust Multi-Subject Brain Representation Learner","summary":" Decoding visual information from human brain activity has seen remarkable\nadvancements in recent research. However, due to the significant variability in\ncortical parcellation and cognition patterns across subjects, current\napproaches personalized deep models for each subject, constraining the\npracticality of this technology in real-world contexts. To tackle the\nchallenges, we introduce Wills Aligner, a robust multi-subject brain\nrepresentation learner. Our Wills Aligner initially aligns different subjects'\nbrains at the anatomical level. Subsequently, it incorporates a mixture of\nbrain experts to learn individual cognition patterns. Additionally, it\ndecouples the multi-subject learning task into a two-stage training, propelling\nthe deep model and its plugin network to learn inter-subject commonality\nknowledge and various cognition patterns, respectively. Wills Aligner enables\nus to overcome anatomical differences and to efficiently leverage a single\nmodel for multi-subject brain representation learning. We meticulously evaluate\nthe performance of our approach across coarse-grained and fine-grained visual\ndecoding tasks. The experimental results demonstrate that our Wills Aligner\nachieves state-of-the-art performance.\n","authors":["Guangyin Bao","Zixuan Gong","Qi Zhang","Jialei Zhou","Wei Fan","Kun Yi","Usman Naseem","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.13282v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.13277v1","updated":"2024-04-20T05:24:06Z","published":"2024-04-20T05:24:06Z","title":"Beyond Score Changes: Adversarial Attack on No-Reference Image Quality\n Assessment from Two Perspectives","summary":" Deep neural networks have demonstrated impressive success in No-Reference\nImage Quality Assessment (NR-IQA). However, recent researches highlight the\nvulnerability of NR-IQA models to subtle adversarial perturbations, leading to\ninconsistencies between model predictions and subjective ratings. Current\nadversarial attacks, however, focus on perturbing predicted scores of\nindividual images, neglecting the crucial aspect of inter-score correlation\nrelationships within an entire image set. Meanwhile, it is important to note\nthat the correlation, like ranking correlation, plays a significant role in\nNR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we\nintroduce a new framework of correlation-error-based attacks that perturb both\nthe correlation within an image set and score changes on individual images. Our\nresearch primarily focuses on ranking-related correlation metrics like\nSpearman's Rank-Order Correlation Coefficient (SROCC) and prediction\nerror-related metrics like Mean Squared Error (MSE). As an instantiation, we\npropose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes\ntarget attack scores for the entire image set and then generates adversarial\nexamples guided by these scores. 
Experimental results demonstrate that our SMA\nmethod not only significantly disrupts the SROCC to negative values but also\nmaintains a considerable change in the scores of individual images. Meanwhile,\nit exhibits state-of-the-art performance across metrics with different\ncategories. Our method provides a new perspective on the robustness of NR-IQA\nmodels.\n","authors":["Chenxi Yang","Yujia Liu","Dingquan Li","Yan Zhong","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.13277v1.pdf","comment":"Submitted to a conference"},{"id":"http://arxiv.org/abs/2404.13273v1","updated":"2024-04-20T05:13:56Z","published":"2024-04-20T05:13:56Z","title":"Multi-feature Reconstruction Network using Crossed-mask Restoration for\n Unsupervised Anomaly Detection","summary":" Unsupervised anomaly detection using only normal samples is of great\nsignificance for quality inspection in industrial manufacturing. Although\nexisting reconstruction-based methods have achieved promising results, they\nstill face two problems: poor distinguishable information in image\nreconstruction and well abnormal regeneration caused by model\nover-generalization ability. To overcome the above issues, we convert the image\nreconstruction into a combination of parallel feature restorations and propose\na multi-feature reconstruction network, MFRNet, using crossed-mask restoration\nin this paper. Specifically, a multi-scale feature aggregator is first\ndeveloped to generate more discriminative hierarchical representations of the\ninput images from a pre-trained model. Subsequently, a crossed-mask generator\nis adopted to randomly cover the extracted feature map, followed by a\nrestoration network based on the transformer structure for high-quality repair\nof the missing regions. Finally, a hybrid loss is equipped to guide model\ntraining and anomaly estimation, which gives consideration to both the pixel\nand structural similarity. Extensive experiments show that our method is highly\ncompetitive with or significantly outperforms other state-of-the-arts on four\npublic available datasets and one self-made dataset.\n","authors":["Junpu Wang","Guili Xu","Chunlei Li","Guangshuai Gao","Yuehua Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.13273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13270v1","updated":"2024-04-20T04:51:59Z","published":"2024-04-20T04:51:59Z","title":"StrideNET: Swin Transformer for Terrain Recognition with Dynamic\n Roughness Extraction","summary":" Advancements in deep learning are revolutionizing the classification of\nremote-sensing images. Transformer-based architectures, utilizing\nself-attention mechanisms, have emerged as alternatives to conventional\nconvolution methods, enabling the capture of long-range dependencies along with\nglobal relationships in the image. Motivated by these advancements, this paper\npresents StrideNET, a novel dual-branch architecture designed for terrain\nrecognition and implicit properties estimation. The terrain recognition branch\nutilizes the Swin Transformer, leveraging its hierarchical representation and\nlow computational cost to efficiently capture both local and global features.\nThe terrain properties branch focuses on the extraction of surface properties\nsuch as roughness and slipperiness using a statistical texture analysis method.\nBy computing surface terrain properties, an enhanced environmental perception\ncan be obtained. The StrideNET model is trained on a dataset comprising four\ntarget terrain classes: Grassy, Marshy, Sandy, and Rocky. 
StrideNET attains\ncompetitive performance compared to contemporary methods. The implications of\nthis work extend to various applications, including environmental monitoring,\nland use and land cover (LULC) classification, disaster response, precision\nagriculture, and much more.\n","authors":["Maitreya Shelare","Neha Shigvan","Atharva Satam","Poonam Sonar"],"pdf_url":"https://arxiv.org/pdf/2404.13270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01123v2","updated":"2024-04-20T04:38:35Z","published":"2024-02-02T03:50:45Z","title":"A Single Simple Patch is All You Need for AI-generated Image Detection","summary":" The recent development of generative models unleashes the potential of\ngenerating hyper-realistic fake images. To prevent the malicious usage of fake\nimages, AI-generated image detection aims to distinguish fake images from real\nimages. However, existing method suffer from severe performance drop when\ndetecting images generated by unseen generators. We find that generative models\ntend to focus on generating the patches with rich textures to make the images\nmore realistic while neglecting the hidden noise caused by camera capture\npresent in simple patches. In this paper, we propose to exploit the noise\npattern of a single simple patch to identify fake images. Furthermore, due to\nthe performance decline when handling low-quality generated images, we\nintroduce an enhancement module and a perception module to remove the\ninterfering information. Extensive experiments demonstrate that our method can\nachieve state-of-the-art performance on public benchmarks.\n","authors":["Jiaxuan Chen","Jieteng Yao","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2402.01123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12216v2","updated":"2024-04-20T04:33:08Z","published":"2024-04-18T14:20:30Z","title":"ProTA: Probabilistic Token Aggregation for Text-Video Retrieval","summary":" Text-video retrieval aims to find the most relevant cross-modal samples for a\ngiven query. Recent methods focus on modeling the whole spatial-temporal\nrelations. However, since video clips contain more diverse content than\ncaptions, the model aligning these asymmetric video-text pairs has a high risk\nof retrieving many false positive results. In this paper, we propose\nProbabilistic Token Aggregation (ProTA) to handle cross-modal interaction with\ncontent asymmetry. Specifically, we propose dual partial-related aggregation to\ndisentangle and re-aggregate token representations in both low-dimension and\nhigh-dimension spaces. We propose token-based probabilistic alignment to\ngenerate token-level probabilistic representation and maintain the feature\nrepresentation diversity. In addition, an adaptive contrastive loss is proposed\nto learn compact cross-modal distribution space. 
Based on extensive\nexperiments, ProTA achieves significant improvements on MSR-VTT (50.9%), LSMDC\n(25.8%), and DiDeMo (47.2%).\n","authors":["Han Fang","Xianghao Zang","Chao Ban","Zerun Feng","Lanxiang Zhou","Zhongjiang He","Yongxiang Li","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13268v1","updated":"2024-04-20T04:30:38Z","published":"2024-04-20T04:30:38Z","title":"Multi-Cell Decoder and Mutual Learning for Table Structure and Character\n Recognition","summary":" Extracting table contents from documents such as scientific papers and\nfinancial reports and converting them into a format that can be processed by\nlarge language models is an important task in knowledge information processing.\nEnd-to-end approaches, which recognize not only table structure but also cell\ncontents, achieved performance comparable to state-of-the-art models using\nexternal character recognition systems, and have potential for further\nimprovements. In addition, these models can now recognize long tables with\nhundreds of cells by introducing local attention. However, the models recognize\ntable structure in one direction from the header to the footer, and cell\ncontent recognition is performed independently for each cell, so there is no\nopportunity to retrieve useful information from the neighbor cells. In this\npaper, we propose a multi-cell content decoder and bidirectional mutual\nlearning mechanism to improve the end-to-end approach. The effectiveness is\ndemonstrated on two large datasets, and the experimental results show\ncomparable performance to state-of-the-art models, even for long tables with\nlarge numbers of cells.\n","authors":["Takaya Kawakatsu"],"pdf_url":"https://arxiv.org/pdf/2404.13268v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.13263v1","updated":"2024-04-20T04:17:34Z","published":"2024-04-20T04:17:34Z","title":"FilterPrompt: Guiding Image Transfer in Diffusion Models","summary":" In controllable generation tasks, flexibly manipulating the generated images\nto attain a desired appearance or structure based on a single input image cue\nremains a critical and longstanding challenge. Achieving this requires the\neffective decoupling of key attributes within the input image data, aiming to\nget representations accurately. Previous research has predominantly\nconcentrated on disentangling image attributes within feature space. However,\nthe complex distribution present in real-world data often makes the application\nof such decoupling algorithms to other datasets challenging. Moreover, the\ngranularity of control over feature encoding frequently fails to meet specific\ntask requirements. Upon scrutinizing the characteristics of various generative\nmodels, we have observed that the input sensitivity and dynamic evolution\nproperties of the diffusion model can be effectively fused with the explicit\ndecomposition operation in pixel space. This integration enables the image\nprocessing operations performed in pixel space for a specific feature\ndistribution of the input image, and can achieve the desired control effect in\nthe generated results. Therefore, we propose FilterPrompt, an approach to\nenhance the model control effect. It can be universally applied to any\ndiffusion model, allowing users to adjust the representation of specific image\nfeatures in accordance with task requirements, thereby facilitating more\nprecise and controllable generation outcomes. 
In particular, our designed\nexperiments demonstrate that the FilterPrompt optimizes feature correlation,\nmitigates content conflicts during the generation process, and enhances the\nmodel's control capability.\n","authors":["Xi Wang","Yichen Peng","Heng Fang","Haoran Xie","Xi Yang","Chuntao Li"],"pdf_url":"https://arxiv.org/pdf/2404.13263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13252v1","updated":"2024-04-20T03:39:54Z","published":"2024-04-20T03:39:54Z","title":"3D-Convolution Guided Spectral-Spatial Transformer for Hyperspectral\n Image Classification","summary":" In recent years, Vision Transformers (ViTs) have shown promising\nclassification performance over Convolutional Neural Networks (CNNs) due to\ntheir self-attention mechanism. Many researchers have incorporated ViTs for\nHyperspectral Image (HSI) classification. HSIs are characterised by narrow\ncontiguous spectral bands, providing rich spectral data. Although ViTs excel\nwith sequential data, they cannot extract spectral-spatial information like\nCNNs. Furthermore, to have high classification performance, there should be a\nstrong interaction between the HSI token and the class (CLS) token. To solve\nthese issues, we propose a 3D-Convolution guided Spectral-Spatial Transformer\n(3D-ConvSST) for HSI classification that utilizes a 3D-Convolution Guided\nResidual Module (CGRM) in-between encoders to \"fuse\" the local spatial and\nspectral information and to enhance the feature propagation. Furthermore, we\nforego the class token and instead apply Global Average Pooling, which\neffectively encodes more discriminative and pertinent high-level features for\nclassification. Extensive experiments have been conducted on three public HSI\ndatasets to show the superiority of the proposed model over state-of-the-art\ntraditional, convolutional, and Transformer models. The code is available at\nhttps://github.com/ShyamVarahagiri/3D-ConvSST.\n","authors":["Shyam Varahagiri","Aryaman Sinha","Shiv Ram Dubey","Satish Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2404.13252v1.pdf","comment":"Accepted in IEEE Conference on Artificial Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.13239v1","updated":"2024-04-20T02:40:49Z","published":"2024-04-20T02:40:49Z","title":"Beyond Pixel-Wise Supervision for Medical Image Segmentation: From\n Traditional Models to Foundation Models","summary":" Medical image segmentation plays an important role in many image-guided\nclinical approaches. However, existing segmentation algorithms mostly rely on\nthe availability of fully annotated images with pixel-wise annotations for\ntraining, which can be both labor-intensive and expertise-demanding, especially\nin the medical imaging domain where only experts can provide reliable and\naccurate annotations. To alleviate this challenge, there has been a growing\nfocus on developing segmentation methods that can train deep models with weak\nannotations, such as image-level, bounding boxes, scribbles, and points. The\nemergence of vision foundation models, notably the Segment Anything Model\n(SAM), has introduced innovative capabilities for segmentation tasks using weak\nannotations for promptable segmentation enabled by large-scale pre-training.\nAdopting foundation models together with traditional learning methods has\nincreasingly gained recent interest research community and shown potential for\nreal-world applications. 
In this paper, we present a comprehensive survey of\nrecent progress on annotation-efficient learning for medical image segmentation\nutilizing weak annotations before and in the era of foundation models.\nFurthermore, we analyze and discuss several challenges of existing approaches,\nwhich we believe will provide valuable guidance for shaping the trajectory of\nfoundational models to further advance the field of medical image segmentation.\n","authors":["Yuyan Shi","Jialu Ma","Jin Yang","Shasha Wang","Yichi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.13239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00717v2","updated":"2024-04-20T02:32:17Z","published":"2024-03-31T15:22:11Z","title":"End-to-End Autonomous Driving through V2X Cooperation","summary":" Cooperatively utilizing both ego-vehicle and infrastructure sensor data via\nV2X communication has emerged as a promising approach for advanced autonomous\ndriving. However, current research mainly focuses on improving individual\nmodules, rather than taking end-to-end learning to optimize final planning\nperformance, resulting in underutilized data potential. In this paper, we\nintroduce UniV2X, a pioneering cooperative autonomous driving framework that\nseamlessly integrates all key driving modules across diverse views into a\nunified network. We propose a sparse-dense hybrid data transmission and fusion\nmechanism for effective vehicle-infrastructure cooperation, offering three\nadvantages: 1) Effective for simultaneously enhancing agent perception, online\nmapping, and occupancy prediction, ultimately improving planning performance.\n2) Transmission-friendly for practical and limited communication conditions. 3)\nReliable data fusion with interpretability of this hybrid data. We implement\nUniV2X, as well as reproducing several benchmark methods, on the challenging\nDAIR-V2X, the real-world cooperative driving dataset. Experimental results\ndemonstrate the effectiveness of UniV2X in significantly enhancing planning\nperformance, as well as all intermediate output performance. Code is at\nhttps://github.com/AIR-THU/UniV2X.\n","authors":["Haibao Yu","Wenxian Yang","Jiaru Zhong","Zhenwei Yang","Siqi Fan","Ping Luo","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2404.00717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13237v1","updated":"2024-04-20T02:25:46Z","published":"2024-04-20T02:25:46Z","title":"PAFedFV: Personalized and Asynchronous Federated Learning for Finger\n Vein Recognition","summary":" With the increasing emphasis on user privacy protection, biometric\nrecognition based on federated learning have become the latest research\nhotspot. However, traditional federated learning methods cannot be directly\napplied to finger vein recognition, due to heterogeneity of data and open-set\nverification. Therefore, only a few application cases have been proposed. And\nthese methods still have two drawbacks. (1) Uniform model results in poor\nperformance in some clients, as the finger vein data is highly heterogeneous\nand non-Independently Identically Distributed (non-IID). (2) On individual\nclient, a large amount of time is underutilized, such as the time to wait for\nreturning model from server. To address those problems, this paper proposes a\nPersonalized and Asynchronous Federated Learning for Finger Vein Recognition\n(PAFedFV) framework. PAFedFV designs personalized model aggregation method to\nsolve the heterogeneity among non-IID data. 
Meanwhile, it employs an\nasynchronized training module for clients to utilize their waiting time.\nFinally, extensive experiments on six finger vein datasets are conducted. Base\non these experiment results, the impact of non-IID finger vein data on\nperformance of federated learning are analyzed, and the superiority of PAFedFV\nin accuracy and robustness are demonstrated.\n","authors":["Hengyu Mu","Jian Guo","Chong Han","Lijuan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.13237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10518v3","updated":"2024-04-20T02:01:11Z","published":"2024-03-15T17:59:33Z","title":"Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation\n Guided by the Characteristic Dance Primitives","summary":" We propose Lodge, a network capable of generating extremely long dance\nsequences conditioned on given music. We design Lodge as a two-stage coarse to\nfine diffusion architecture, and propose the characteristic dance primitives\nthat possess significant expressiveness as intermediate representations between\ntwo diffusion models. The first stage is global diffusion, which focuses on\ncomprehending the coarse-level music-dance correlation and production\ncharacteristic dance primitives. In contrast, the second-stage is the local\ndiffusion, which parallelly generates detailed motion sequences under the\nguidance of the dance primitives and choreographic rules. In addition, we\npropose a Foot Refine Block to optimize the contact between the feet and the\nground, enhancing the physical realism of the motion. Our approach can\nparallelly generate dance sequences of extremely long length, striking a\nbalance between global choreographic patterns and local motion quality and\nexpressiveness. Extensive experiments validate the efficacy of our method.\n","authors":["Ronghui Li","YuXiang Zhang","Yachao Zhang","Hongwen Zhang","Jie Guo","Yan Zhang","Yebin Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.10518v3.pdf","comment":"Accepted by CVPR2024, Project page:\n https://li-ronghui.github.io/lodge"},{"id":"http://arxiv.org/abs/2404.13222v1","updated":"2024-04-20T00:44:40Z","published":"2024-04-20T00:44:40Z","title":"Vim4Path: Self-Supervised Vision Mamba for Histopathology Images","summary":" Representation learning from Gigapixel Whole Slide Images (WSI) poses a\nsignificant challenge in computational pathology due to the complicated nature\nof tissue structures and the scarcity of labeled data. Multi-instance learning\nmethods have addressed this challenge, leveraging image patches to classify\nslides utilizing pretrained models using Self-Supervised Learning (SSL)\napproaches. The performance of both SSL and MIL methods relies on the\narchitecture of the feature encoder. This paper proposes leveraging the Vision\nMamba (Vim) architecture, inspired by state space models, within the DINO\nframework for representation learning in computational pathology. We evaluate\nthe performance of Vim against Vision Transformers (ViT) on the Camelyon16\ndataset for both patch-level and slide-level classification. Our findings\nhighlight Vim's enhanced performance compared to ViT, particularly at smaller\nscales, where Vim achieves an 8.21 increase in ROC AUC for models of similar\nsize. An explainability analysis further highlights Vim's capabilities, which\nreveals that Vim uniquely emulates the pathologist workflow-unlike ViT. 
This\nalignment with human expert analysis highlights Vim's potential in practical\ndiagnostic settings and contributes significantly to developing effective\nrepresentation-learning algorithms in computational pathology. We release the\ncodes and pretrained weights at\n\\url{https://github.com/AtlasAnalyticsLab/Vim4Path}.\n","authors":["Ali Nasiri-Sarvi","Vincent Quoc-Huy Trinh","Hassan Rivaz","Mahdi S. Hosseini"],"pdf_url":"https://arxiv.org/pdf/2404.13222v1.pdf","comment":"Accepted in CVPR2023 (9th Workshop on Computer Vision for Microscopy\n Image Analysis)"},{"id":"http://arxiv.org/abs/2404.14441v1","updated":"2024-04-20T00:21:06Z","published":"2024-04-20T00:21:06Z","title":"Optimizing Contrail Detection: A Deep Learning Approach with\n EfficientNet-b4 Encoding","summary":" In the pursuit of environmental sustainability, the aviation industry faces\nthe challenge of minimizing its ecological footprint. Among the key solutions\nis contrail avoidance, targeting the linear ice-crystal clouds produced by\naircraft exhaust. These contrails exacerbate global warming by trapping\natmospheric heat, necessitating precise segmentation and comprehensive analysis\nof contrail images to gauge their environmental impact. However, this\nsegmentation task is complex due to the varying appearances of contrails under\ndifferent atmospheric conditions and potential misalignment issues in\npredictive modeling. This paper presents an innovative deep-learning approach\nutilizing the efficient net-b4 encoder for feature extraction, seamlessly\nintegrating misalignment correction, soft labeling, and pseudo-labeling\ntechniques to enhance the accuracy and efficiency of contrail detection in\nsatellite imagery. The proposed methodology aims to redefine contrail image\nanalysis and contribute to the objectives of sustainable aviation by providing\na robust framework for precise contrail detection and analysis in satellite\nimagery, thus aiding in the mitigation of aviation's environmental impact.\n","authors":["Qunwei Lin","Qian Leng","Zhicheng Ding","Chao Yan","Xiaonan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.14441v1.pdf","comment":null}]},"2024-04-24T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.01112v4","updated":"2024-04-24T03:14:59Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. 
We compare our method with DSS\nand PointRadiance and achieved higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi","Marie-Julie Rakatosaona","Marco Cannici","Federico Tombari","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2404.01112v4.pdf","comment":"An author was not timely informed before the released submission"},{"id":"http://arxiv.org/abs/2404.01887v3","updated":"2024-04-24T03:13:50Z","published":"2024-04-02T12:26:17Z","title":"3D scene generation from scene graphs and self-attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model, leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments shows self-attention layers leads to sparser (7.9x compared to\nGraphto3D) and more diverse scenes (16%).\n","authors":["Pietro Bonazzi","Mengqi Wang","Diego Martin Arroyo","Fabian Manhardt","Nico Messikomer","Federico Tombari","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2404.01887v3.pdf","comment":"Some authors were not timely informed of the submission"},{"id":"http://arxiv.org/abs/2404.13880v2","updated":"2024-04-24T02:55:29Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15608v1","updated":"2024-04-24T02:51:13Z","published":"2024-04-24T02:51:13Z","title":"Understanding and Improving CNNs with Complex Structure Tensor: A\n Biometrics Study","summary":" Our study provides evidence that CNNs struggle to effectively extract\norientation features. 
We show that the use of Complex Structure Tensor, which\ncontains compact orientation features with certainties, as input to CNNs\nconsistently improves identification accuracy compared to using grayscale\ninputs alone. Experiments also demonstrated that our inputs, which were\nprovided by mini complex conv-nets, combined with reduced CNN sizes,\noutperformed full-fledged, prevailing CNN architectures. This suggests that the\nupfront use of orientation features in CNNs, a strategy seen in mammalian\nvision, not only mitigates their limitations but also enhances their\nexplainability and relevance to thin-clients. Experiments were done on publicly\navailable data sets comprising periocular images for biometric identification\nand verification (Close and Open World) using 6 State of the Art CNN\narchitectures. We reduced SOA Equal Error Rate (EER) on the PolyU dataset by\n5-26% depending on data and scenario.\n","authors":["Kevin Hernandez-Diaz","Josef Bigun","Fernando Alonso-Fernandez"],"pdf_url":"https://arxiv.org/pdf/2404.15608v1.pdf","comment":"preprint manuscript"},{"id":"http://arxiv.org/abs/2402.03833v2","updated":"2024-04-24T02:39:53Z","published":"2024-02-06T09:24:53Z","title":"A Lightweight Randomized Nonlinear Dictionary Learning Method using\n Random Vector Functional Link","summary":" Kernel-based nonlinear dictionary learning methods operate in a feature space\nobtained by an implicit feature map, and they are not independent of\ncomputationally expensive operations like Singular Value Decomposition (SVD).\nThis paper presents an SVD-free lightweight approach to learning a nonlinear\ndictionary using a randomized functional link called a Random Vector Functional\nLink (RVFL). The proposed RVFL-based nonlinear Dictionary Learning (RVFLDL)\nlearns a dictionary as a sparse-to-dense feature map from nonlinear sparse\ncoefficients to the dense input features. Sparse coefficients w.r.t an initial\nrandom dictionary are derived by assuming Horseshoe prior are used as inputs\nmaking it a lightweight network. Training the RVFL-based dictionary is free\nfrom SVD computation as RVFL generates weights from the input to the output\nlayer analytically. Higher-order dependencies between the input sparse\ncoefficients and the dictionary atoms are incorporated into the training\nprocess by nonlinearly transforming the sparse coefficients and adding them as\nenhanced features. Thus the method projects sparse coefficients to a higher\ndimensional space while inducing nonlinearities into the dictionary. For\nclassification using RVFL-net, a classifier matrix is learned as a transform\nthat maps nonlinear sparse coefficients to the labels. The empirical evidence\nof the method illustrated in image classification and reconstruction\napplications shows that RVFLDL is scalable and provides a solution better than\nthose obtained using other nonlinear dictionary learning methods.\n","authors":["G. Madhuri","Atul Negi"],"pdf_url":"https://arxiv.org/pdf/2402.03833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04119v2","updated":"2024-04-24T02:26:33Z","published":"2023-12-07T08:20:07Z","title":"A Multilevel Guidance-Exploration Network and Behavior-Scene Matching\n Method for Human Behavior Anomaly Detection","summary":" Human behavior anomaly detection aims to identify unusual human actions,\nplaying a crucial role in intelligent surveillance and other areas. The current\nmainstream methods still adopt reconstruction or future frame prediction\ntechniques. 
However, reconstructing or predicting low-level pixel features\neasily enables the network to achieve overly strong generalization ability,\nallowing anomalies to be reconstructed or predicted as effectively as normal\ndata. Different from their methods, inspired by the Student-Teacher Network, we\npropose a novel framework called the Multilevel Guidance-Exploration\nNetwork(MGENet), which detects anomalies through the difference in high-level\nrepresentation between the Guidance and Exploration network. Specifically, we\nfirst utilize the pre-trained Normalizing Flow that takes skeletal keypoints as\ninput to guide an RGB encoder, which takes unmasked RGB frames as input, to\nexplore motion latent features. Then, the RGB encoder guides the mask encoder,\nwhich takes masked RGB frames as input, to explore the latent appearance\nfeature. Additionally, we design a Behavior-Scene Matching Module(BSMM) to\ndetect scene-related behavioral anomalies. Extensive experiments demonstrate\nthat our proposed method achieves state-of-the-art performance on ShanghaiTech\nand UBnormal datasets, with AUC of 86.9 % and 73.5 %, respectively. The code\nwill be available on https://github.com/molu-ggg/GENet.\n","authors":["Guoqing Yang","Zhiming Luo","Jianzhe Gao","Yingxin Lai","Kun Yang","Yifan He","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2312.04119v2.pdf","comment":"The experimental methods and results are incorrect and need to be\n revised"},{"id":"http://arxiv.org/abs/2404.15592v1","updated":"2024-04-24T01:54:40Z","published":"2024-04-24T01:54:40Z","title":"ImplicitAVE: An Open-Source Dataset and Multimodal LLMs Benchmark for\n Implicit Attribute Value Extraction","summary":" Existing datasets for attribute value extraction (AVE) predominantly focus on\nexplicit attribute values while neglecting the implicit ones, lack product\nimages, are often not publicly available, and lack an in-depth human inspection\nacross diverse domains. To address these limitations, we present ImplicitAVE,\nthe first, publicly available multimodal dataset for implicit attribute value\nextraction. ImplicitAVE, sourced from the MAVE dataset, is carefully curated\nand expanded to include implicit AVE and multimodality, resulting in a refined\ndataset of 68k training and 1.6k testing data across five domains. We also\nexplore the application of multimodal large language models (MLLMs) to implicit\nAVE, establishing a comprehensive benchmark for MLLMs on the ImplicitAVE\ndataset. Six recent MLLMs with eleven variants are evaluated across diverse\nsettings, revealing that implicit value extraction remains a challenging task\nfor MLLMs. The contributions of this work include the development and release\nof ImplicitAVE, and the exploration and benchmarking of various MLLMs for\nimplicit AVE, providing valuable insights and potential future research\ndirections. Dataset and code are available at\nhttps://github.com/HenryPengZou/ImplicitAVE\n","authors":["Henry Peng Zou","Vinay Samuel","Yue Zhou","Weizhi Zhang","Liancheng Fang","Zihe Song","Philip S. 
Yu","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2404.15592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15591v1","updated":"2024-04-24T01:50:36Z","published":"2024-04-24T01:50:36Z","title":"Domain Adaptation for Learned Image Compression with Supervised Adapters","summary":" In Learned Image Compression (LIC), a model is trained at encoding and\ndecoding images sampled from a source domain, often outperforming traditional\ncodecs on natural images; yet its performance may be far from optimal on images\nsampled from different domains. In this work, we tackle the problem of adapting\na pre-trained model to multiple target domains by plugging into the decoder an\nadapter module for each of them, including the source one. Each adapter\nimproves the decoder performance on a specific domain, without the model\nforgetting about the images seen at training time. A gate network computes the\nweights to optimally blend the contributions from the adapters when the\nbitstream is decoded. We experimentally validate our method over two\nstate-of-the-art pre-trained models, observing improved rate-distortion\nefficiency on the target domains without penalties on the source domain.\nFurthermore, the gate's ability to find similarities with the learned target\ndomains enables better encoding efficiency also for images outside them.\n","authors":["Alberto Presta","Gabriele Spadaro","Enzo Tartaglione","Attilio Fiandrotti","Marco Grangetto"],"pdf_url":"https://arxiv.org/pdf/2404.15591v1.pdf","comment":"10 pages, published to Data compression conference 2024 (DCC2024)"},{"id":"http://arxiv.org/abs/2404.00552v2","updated":"2024-04-24T01:16:17Z","published":"2024-03-31T03:53:45Z","title":"Comparison of Methods in Human Skin Decomposition","summary":" Decomposition of skin pigment plays an important role in medical fields.\nHuman skin can be decomposed into two primitive components, hemoglobin and\nmelanin. It is our goal to apply these results for diagnosis of skin cancer. In\nthis paper, various methods for skin pigment decomposition are reviewed\ncomparatively and the performance of each method is evaluated both\ntheoretically and experimentally. In addition, isometric feature mapping\n(Isomap) is introduced in order to improve the dimensionality reduction\nperformance in context of skin decomposition.\n","authors":["Hao Gong","Michel Desvignes"],"pdf_url":"https://arxiv.org/pdf/2404.00552v2.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.14696v2","updated":"2024-04-24T01:14:46Z","published":"2024-04-23T02:54:12Z","title":"Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty\n Modeling for Universal Multi-Source Domain Adaptation","summary":" Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from\nmultiple labeled source domains to an unlabeled target domain under domain\nshifts (different data distribution) and class shifts (unknown target classes).\nExisting solutions focus on excavating image features to detect unknown\nsamples, ignoring abundant information contained in textual semantics. In this\npaper, we propose an Adaptive Prompt learning with Negative textual semantics\nand uncErtainty modeling method based on Contrastive Language-Image\nPre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we\nutilize the CLIP with adaptive prompts to leverage textual information of class\nsemantics and domain representations, helping the model identify unknown\nsamples and address domain shifts. 
Additionally, we design a novel global\ninstance-level alignment objective by utilizing negative textual semantics to\nachieve more precise image-text pair alignment. Furthermore, we propose an\nenergy-based uncertainty modeling strategy to enlarge the margin distance\nbetween known and unknown samples. Extensive experiments demonstrate the\nsuperiority of our proposed method.\n","authors":["Yuxiang Yang","Lu Wen","Yuanyuan Xu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14696v2.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2404.15580v1","updated":"2024-04-24T01:14:33Z","published":"2024-04-24T01:14:33Z","title":"MiM: Mask in Mask Self-Supervised Pre-Training for 3D Medical Image\n Analysis","summary":" The Vision Transformer (ViT) has demonstrated remarkable performance in\nSelf-Supervised Learning (SSL) for 3D medical image analysis. Mask AutoEncoder\n(MAE) for feature pre-training can further unleash the potential of ViT on\nvarious medical vision tasks. However, due to large spatial sizes with much\nhigher dimensions of 3D medical images, the lack of hierarchical design for MAE\nmay hinder the performance of downstream tasks. In this paper, we propose a\nnovel \\textit{Mask in Mask (MiM)} pre-training framework for 3D medical images,\nwhich aims to advance MAE by learning discriminative representation from\nhierarchical visual tokens across varying scales. We introduce multiple levels\nof granularity for masked inputs from the volume, which are then reconstructed\nsimultaneously ranging at both fine and coarse levels. Additionally, a\ncross-level alignment mechanism is applied to adjacent level volumes to enforce\nanatomical similarity hierarchically. Furthermore, we adopt a hybrid backbone\nto enhance the hierarchical representation learning efficiently during the\npre-training. MiM was pre-trained on a large scale of available 3D volumetric\nimages, \\textit{i.e.,} Computed Tomography (CT) images containing various body\nparts. Extensive experiments on thirteen public datasets demonstrate the\nsuperiority of MiM over other SSL methods in organ/lesion/tumor segmentation\nand disease classification. We further scale up the MiM to large pre-training\ndatasets with more than 10k volumes, showing that large-scale pre-training can\nfurther enhance the performance of downstream tasks. The improvement also\nconcluded that the research community should pay more attention to the scale of\nthe pre-training dataset towards the healthcare foundation model for 3D medical\nimages.\n","authors":["Jiaxin Zhuang","Linshan Wu","Qiong Wang","Varut Vardhanabhuti","Lin Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15580v1.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2310.17994v2","updated":"2024-04-24T01:08:12Z","published":"2023-10-27T09:06:43Z","title":"ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Image","summary":" We introduce a 3D-aware diffusion model, ZeroNVS, for single-image novel view\nsynthesis for in-the-wild scenes. While existing methods are designed for\nsingle objects with masked backgrounds, we propose new techniques to address\nchallenges introduced by in-the-wild multi-object scenes with complex\nbackgrounds. Specifically, we train a generative prior on a mixture of data\nsources that capture object-centric, indoor, and outdoor scenes. To address\nissues from data mixture such as depth-scale ambiguity, we propose a novel\ncamera conditioning parameterization and normalization scheme. 
Further, we\nobserve that Score Distillation Sampling (SDS) tends to truncate the\ndistribution of complex backgrounds during distillation of 360-degree scenes,\nand propose \"SDS anchoring\" to improve the diversity of synthesized novel\nviews. Our model sets a new state-of-the-art result in LPIPS on the DTU dataset\nin the zero-shot setting, even outperforming methods specifically trained on\nDTU. We further adapt the challenging Mip-NeRF 360 dataset as a new benchmark\nfor single-image novel view synthesis, and demonstrate strong performance in\nthis setting. Our code and data are at http://kylesargent.github.io/zeronvs/\n","authors":["Kyle Sargent","Zizhang Li","Tanmay Shah","Charles Herrmann","Hong-Xing Yu","Yunzhi Zhang","Eric Ryan Chan","Dmitry Lagun","Li Fei-Fei","Deqing Sun","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2310.17994v2.pdf","comment":"Accepted to CVPR 2024. 12 pages"},{"id":"http://arxiv.org/abs/2310.13593v2","updated":"2024-04-24T00:59:10Z","published":"2023-10-20T15:42:47Z","title":"Learning with Unmasked Tokens Drives Stronger Vision Learners","summary":" Masked image modeling (MIM) has become a leading self-supervised learning\nstrategy. MIMs such as Masked Autoencoder (MAE) learn strong representations by\nrandomly masking input tokens for the encoder to process, with the decoder\nreconstructing the masked tokens to the input. However, MIM pre-trained\nencoders often exhibit a limited attention span, attributed to MIM's sole focus\non regressing masked tokens only, which may impede the encoder's broader\ncontext learning. To tackle the limitation, we improve MIM by explicitly\nincorporating unmasked tokens into the training process. Specifically, our\nmethod enables the encoder to learn from broader context supervision, allowing\nunmasked tokens to experience broader contexts while the decoder reconstructs\nmasked tokens. Thus, the encoded unmasked tokens are equipped with extensive\ncontextual information, empowering masked tokens to leverage the enhanced\nunmasked tokens for MIM. As a result, our simple remedy trains more\ndiscriminative representations revealed by achieving 84.2% top-1 accuracy with\nViT-B on ImageNet-1K with 0.6%p gain. We attribute the success to the enhanced\npre-training method, as evidenced by the singular value spectrum and attention\nanalyses. Finally, our models achieve significant performance gains at the\ndownstream semantic segmentation and fine-grained visual classification tasks;\nand on diverse robust evaluation metrics. Code is available at\nhttps://github.com/naver-ai/lut\n","authors":["Taekyung Kim","Sanghyuk Chun","Byeongho Heo","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2310.13593v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08965v3","updated":"2024-04-24T00:40:05Z","published":"2024-04-13T11:07:10Z","title":"Seeing Text in the Dark: Algorithm and Benchmark","summary":" Localizing text in low-light environments is challenging due to visual\ndegradations. Although a straightforward solution involves a two-stage pipeline\nwith low-light image enhancement (LLE) as the initial step followed by\ndetector, LLE is primarily designed for human vision instead of machine and can\naccumulate errors. In this work, we propose an efficient and effective\nsingle-stage approach for localizing text in dark that circumvents the need for\nLLE. We introduce a constrained learning module as an auxiliary mechanism\nduring the training stage of the text detector. 
This module is designed to\nguide the text detector in preserving textual spatial features amidst feature\nmap resizing, thus minimizing the loss of spatial information in texts under\nlow-light visual degradations. Specifically, we incorporate spatial\nreconstruction and spatial semantic constraints within this module to ensure\nthe text detector acquires essential positional and contextual range knowledge.\nOur approach enhances the original text detector's ability to identify text's\nlocal topological features using a dynamic snake feature pyramid network and\nadopts a bottom-up contour shaping strategy with a novel rectangular\naccumulation technique for accurate delineation of streamlined text features.\nIn addition, we present a comprehensive low-light dataset for arbitrary-shaped\ntext, encompassing diverse scenes and languages. Notably, our method achieves\nstate-of-the-art results on this low-light dataset and exhibits comparable\nperformance on standard normal light datasets. The code and dataset will be\nreleased.\n","authors":["Chengpei Xu","Hao Fu","Long Ma","Wenjing Jia","Chengqi Zhang","Feng Xia","Xiaoyu Ai","Binghao Li","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08965v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06310v2","updated":"2024-04-24T00:20:44Z","published":"2022-12-13T01:36:56Z","title":"Structure-Guided Image Completion with Image-level and Object-level\n Semantic Discriminators","summary":" Structure-guided image completion aims to inpaint a local region of an image\naccording to an input guidance map from users. While such a task enables many\npractical applications for interactive editing, existing methods often struggle\nto hallucinate realistic object instances in complex natural scenes. Such a\nlimitation is partially due to the lack of semantic-level constraints inside\nthe hole region as well as the lack of a mechanism to enforce realistic object\ngeneration. In this work, we propose a learning paradigm that consists of\nsemantic discriminators and object-level discriminators for improving the\ngeneration of complex semantics and objects. Specifically, the semantic\ndiscriminators leverage pretrained visual features to improve the realism of\nthe generated visual concepts. Moreover, the object-level discriminators take\naligned instances as inputs to enforce the realism of individual objects. Our\nproposed scheme significantly improves the generation quality and achieves\nstate-of-the-art results on various tasks, including segmentation-guided\ncompletion, edge-guided manipulation and panoptically-guided manipulation on\nPlaces2 datasets. Furthermore, our trained model is flexible and can support\nmultiple editing use cases, such as object insertion, replacement, removal and\nstandard inpainting. 
In particular, our trained model combined with a novel\nautomatic image completion pipeline achieves state-of-the-art results on the\nstandard inpainting task.\n","authors":["Haitian Zheng","Zhe Lin","Jingwan Lu","Scott Cohen","Eli Shechtman","Connelly Barnes","Jianming Zhang","Qing Liu","Yuqian Zhou","Sohrab Amirghodsi","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2212.06310v2.pdf","comment":"18 pages, 16 figures"},{"id":"http://arxiv.org/abs/2404.09454v2","updated":"2024-04-24T00:08:42Z","published":"2024-04-15T04:43:53Z","title":"Utility-Fairness Trade-Offs and How to Find Them","summary":" When building classification systems with demographic fairness\nconsiderations, there are two objectives to satisfy: 1) maximizing utility for\nthe specific task and 2) ensuring fairness w.r.t. a known demographic\nattribute. These objectives often compete, so optimizing both can lead to a\ntrade-off between utility and fairness. While existing works acknowledge the\ntrade-offs and study their limits, two questions remain unanswered: 1) What are\nthe optimal trade-offs between utility and fairness? and 2) How can we\nnumerically quantify these trade-offs from data for a desired prediction task\nand demographic attribute of interest? This paper addresses these questions. We\nintroduce two utility-fairness trade-offs: the Data-Space and Label-Space\nTrade-off. The trade-offs reveal three regions within the utility-fairness\nplane, delineating what is fully and partially possible and impossible. We\npropose U-FaTE, a method to numerically quantify the trade-offs for a given\nprediction task and group fairness definition from data samples. Based on the\ntrade-offs, we introduce a new scheme for evaluating representations. An\nextensive evaluation of fair representation learning methods and\nrepresentations from over 1000 pre-trained models revealed that most current\napproaches are far from the estimated and achievable fairness-utility\ntrade-offs across multiple datasets and prediction tasks.\n","authors":["Sepehr Dehdashtian","Bashir Sadeghi","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2404.09454v2.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024"},{"id":"http://arxiv.org/abs/2404.15256v2","updated":"2024-04-24T16:53:56Z","published":"2024-04-23T17:42:45Z","title":"TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and\n Proprioception Estimation","summary":" Legged navigation is typically examined within open-world, off-road, and\nchallenging environments. In these scenarios, estimating external disturbances\nrequires a complex synthesis of multi-modal information. This underlines a\nmajor limitation in existing works that primarily focus on avoiding obstacles.\nIn this work, we propose TOP-Nav, a novel legged navigation framework that\nintegrates a comprehensive path planner with Terrain awareness, Obstacle\navoidance and close-loop Proprioception. TOP-Nav underscores the synergies\nbetween vision and proprioception in both path and motion planning. Within the\npath planner, we present and integrate a terrain estimator that enables the\nrobot to select waypoints on terrains with higher traversability while\neffectively avoiding obstacles. In the motion planning level, we not only\nimplement a locomotion controller to track the navigation commands, but also\nconstruct a proprioception advisor to provide motion evaluations for the path\nplanner. 
Based on the close-loop motion feedback, we make online corrections\nfor the vision-based terrain and obstacle estimations. Consequently, TOP-Nav\nachieves open-world navigation that the robot can handle terrains or\ndisturbances beyond the distribution of prior knowledge and overcomes\nconstraints imposed by visual conditions. Building upon extensive experiments\nconducted in both simulation and real-world environments, TOP-Nav demonstrates\nsuperior performance in open-world navigation compared to existing methods.\n","authors":["Junli Ren","Yikai Liu","Yingru Dai","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.15256v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14956v2","updated":"2024-04-24T06:03:48Z","published":"2024-04-23T12:01:21Z","title":"DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via\n Cross-Task Interactions","summary":" Weakly supervised segmentation methods have gained significant attention due\nto their ability to reduce the reliance on costly pixel-level annotations\nduring model training. However, the current weakly supervised nuclei\nsegmentation approaches typically follow a two-stage pseudo-label generation\nand network training process. The performance of the nuclei segmentation\nheavily relies on the quality of the generated pseudo-labels, thereby limiting\nits effectiveness. This paper introduces a novel domain-adaptive weakly\nsupervised nuclei segmentation framework using cross-task interaction\nstrategies to overcome the challenge of pseudo-label generation. Specifically,\nwe utilize weakly annotated data to train an auxiliary detection task, which\nassists the domain adaptation of the segmentation network. To enhance the\nefficiency of domain adaptation, we design a consistent feature constraint\nmodule integrating prior knowledge from the source domain. Furthermore, we\ndevelop pseudo-label optimization and interactive training methods to improve\nthe domain transfer capability. To validate the effectiveness of our proposed\nmethod, we conduct extensive comparative and ablation experiments on six\ndatasets. The results demonstrate the superiority of our approach over existing\nweakly supervised approaches. Remarkably, our method achieves comparable or\neven better performance than fully supervised methods. Our code will be\nreleased in https://github.com/zhangye-zoe/DAWN.\n","authors":["Ye Zhang","Yifeng Wang","Zijie Fang","Hao Bian","Linghan Cai","Ziyue Wang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.14956v2.pdf","comment":"13 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.14882v2","updated":"2024-04-24T05:56:54Z","published":"2024-04-23T10:09:32Z","title":"A sensitivity analysis to quantify the impact of neuroimaging\n preprocessing strategies on subsequent statistical analyses","summary":" Even though novel imaging techniques have been successful in studying brain\nstructure and function, the measured biological signals are often contaminated\nby multiple sources of noise, arising due to e.g. head movements of the\nindividual being scanned, limited spatial/temporal resolution, or other issues\nspecific to each imaging technology. Data preprocessing (e.g. denoising) is\ntherefore critical. Preprocessing pipelines have become increasingly complex\nover the years, but also more flexible, and this flexibility can have a\nsignificant impact on the final results and conclusions of a given study. This\nlarge parameter space is often referred to as multiverse analyses. 
Here, we\nprovide conceptual and practical tools for statistical analyses that can\naggregate multiple pipeline results along with a new sensitivity analysis\ntesting for hypotheses across pipelines such as \"no effect across all\npipelines\" or \"at least one pipeline with no effect\". The proposed framework is\ngeneric and can be applied to any multiverse scenario, but we illustrate its\nuse based on positron emission tomography data.\n","authors":["Brice Ozenne","Martin Norgaard","Cyril Pernet","Melanie Ganz"],"pdf_url":"https://arxiv.org/pdf/2404.14882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05180v3","updated":"2024-04-24T09:58:04Z","published":"2023-01-12T18:04:51Z","title":"Effective Decision Boundary Learning for Class Incremental Learning","summary":" Rehearsal approaches in class incremental learning (CIL) suffer from decision\nboundary overfitting to new classes, which is mainly caused by two factors:\ninsufficiency of old classes data for knowledge distillation and imbalanced\ndata learning between the learned and new classes because of the limited\nstorage memory. In this work, we present a simple but effective approach to\ntackle these two factors. First, we employ a re-sampling strategy and Mixup\nKnowledge Distillation (Re-MKD) to improve the performances of KD, which\nwould greatly alleviate the overfitting problem. Specifically, we combine mixup\nand re-sampling strategies to synthesize adequate data used in KD training that\nare more consistent with the latent distribution between the learned and new\nclasses. Second, we propose a novel incremental influence balance (IIB) method\nfor CIL to tackle the classification of imbalanced data by extending the\ninfluence balance method into the CIL setting, which re-weights samples by\ntheir influences to create a proper decision boundary. With these two\nimprovements, we present the effective decision boundary learning algorithm\n(EDBL) which improves the performance of KD and deals with the imbalanced data\nlearning simultaneously. Experiments show that the proposed EDBL achieves\nstate-of-the-art performances on several CIL benchmarks.\n","authors":["Chaoyue Ding","Kunchi Li","Jun Wan","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2301.05180v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09359v3","updated":"2024-04-24T15:07:04Z","published":"2024-04-14T21:14:47Z","title":"Exploring Feedback Generation in Automated Skeletal Movement Assessment:\n A Comprehensive Overview","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection and analysis from 2D or 3D videos. While the primary\nobjective of automatic assessment tasks is to score movements, the automatic\ngeneration of feedback highlighting key movement issues has the potential to\nsignificantly enhance and accelerate the rehabilitation process. While numerous\nresearch works exist in the field of automatic movement assessment, only a\nhandful address feedback generation. In this study, we explain the types of\nfeedback that can be generated, review existing solutions for automatic\nfeedback generation, and discuss future research directions. 
To our knowledge,\nthis is the first comprehensive review of feedback generation in skeletal\nmovement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13993v2","updated":"2024-04-24T06:00:47Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers which require specific annotations for each\ncomic title are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these tasks have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13923v2","updated":"2024-04-24T14:21:18Z","published":"2024-04-22T07:00:17Z","title":"MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets","summary":" Driven by powerful image diffusion models, recent research has achieved the\nautomatic creation of 3D objects from textual or visual guidance. By performing\nscore distillation sampling (SDS) iteratively across different views, these\nmethods succeed in lifting 2D generative prior to the 3D space. However, such a\n2D generative image prior bakes the effect of illumination and shadow into the\ntexture. As a result, material maps optimized by SDS inevitably involve\nspurious correlated components. The absence of precise material definition\nmakes it infeasible to relight the generated assets reasonably in novel scenes,\nwhich limits their application in downstream scenarios. In contrast, humans can\neffortlessly circumvent this ambiguity by deducing the material of the object\nfrom its appearance and semantics. Motivated by this insight, we propose\nMaterialSeg3D, a 3D asset material generation framework to infer underlying\nmaterial from the 2D semantic prior. Based on such a prior model, we devise a\nmechanism to parse material in 3D space. We maintain a UV stack, each map of\nwhich is unprojected from a specific viewpoint. After traversing all\nviewpoints, we fuse the stack through a weighted voting scheme and then employ\nregion unification to ensure the coherence of the object parts. 
To fuel the\nlearning of semantics prior, we collect a material dataset, named Materialized\nIndividual Objects (MIO), which features abundant images, diverse categories,\nand accurate annotations. Extensive quantitative and qualitative experiments\ndemonstrate the effectiveness of our method.\n","authors":["Zeyu Li","Ruitong Gan","Chuanchen Luo","Yuxi Wang","Jiaheng Liu","Ziwei Zhu Man Zhang","Qing Li","Xucheng Yin","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13537v2","updated":"2024-04-24T09:51:35Z","published":"2024-04-21T05:11:37Z","title":"Bracketing Image Restoration and Enhancement with High-Low Frequency\n Decomposition","summary":" In real-world scenarios, due to a series of image degradations, obtaining\nhigh-quality, clear content photos is challenging. While significant progress\nhas been made in synthesizing high-quality images, previous methods for image\nrestoration and enhancement often overlooked the characteristics of different\ndegradations. They applied the same structure to address various types of\ndegradation, resulting in less-than-ideal restoration outcomes. Inspired by the\nnotion that high/low frequency information is applicable to different\ndegradations, we introduce HLNet, a Bracketing Image Restoration and\nEnhancement method based on high-low frequency decomposition. Specifically, we\nemploy two modules for feature extraction: shared weight modules and non-shared\nweight modules. In the shared weight modules, we use SCConv to extract common\nfeatures from different degradations. In the non-shared weight modules, we\nintroduce the High-Low Frequency Decomposition Block (HLFDB), which employs\ndifferent methods to handle high-low frequency information, enabling the model\nto address different degradations more effectively. Compared to other networks,\nour method takes into account the characteristics of different degradations,\nthus achieving higher-quality image restoration.\n","authors":["Genggeng Chen","Kexin Dai","Kangzhen Yang","Tao Hu","Xiangyu Chen","Yongqing Yang","Wei Dong","Peng Wu","Yanning Zhang","Qingsen Yan"],"pdf_url":"https://arxiv.org/pdf/2404.13537v2.pdf","comment":"This paper is accepted by CVPR 2024 Workshop, code:\n https://github.com/chengeng0613/HLNet"},{"id":"http://arxiv.org/abs/2404.16255v1","updated":"2024-04-24T23:56:03Z","published":"2024-04-24T23:56:03Z","title":"Enhancing Privacy in Face Analytics Using Fully Homomorphic Encryption","summary":" Modern face recognition systems utilize deep neural networks to extract\nsalient features from a face. These features denote embeddings in latent space\nand are often stored as templates in a face recognition system. These\nembeddings are susceptible to data leakage and, in some cases, can even be used\nto reconstruct the original face image. To prevent compromising identities,\ntemplate protection schemes are commonly employed. However, these schemes may\nstill not prevent the leakage of soft biometric information such as age, gender\nand race. To alleviate this issue, we propose a novel technique that combines\nFully Homomorphic Encryption (FHE) with an existing template protection scheme\nknown as PolyProtect. We show that the embeddings can be compressed and\nencrypted using FHE and transformed into a secure PolyProtect template using\npolynomial transformation, for additional protection. We demonstrate the\nefficacy of the proposed approach through extensive experiments on multiple\ndatasets. 
Our proposed approach ensures irreversibility and unlinkability,\neffectively preventing the leakage of soft biometric attributes from face\nembeddings without compromising recognition accuracy.\n","authors":["Bharat Yalavarthi","Arjun Ramesh Kaushik","Arun Ross","Vishnu Boddeti","Nalini Ratha"],"pdf_url":"https://arxiv.org/pdf/2404.16255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08275v3","updated":"2024-04-24T23:34:58Z","published":"2023-05-14T23:14:09Z","title":"ULIP-2: Towards Scalable Multimodal Pre-training for 3D Understanding","summary":" Recent advancements in multimodal pre-training have shown promising efficacy\nin 3D representation learning by aligning multimodal features across 3D shapes,\ntheir 2D counterparts, and language descriptions. However, the methods used by\nexisting frameworks to curate such multimodal data, in particular language\ndescriptions for 3D shapes, are not scalable, and the collected language\ndescriptions are not diverse. To address this, we introduce ULIP-2, a simple\nyet effective tri-modal pre-training framework that leverages large multimodal\nmodels to automatically generate holistic language descriptions for 3D shapes.\nIt only needs 3D data as input, eliminating the need for any manual 3D\nannotations, and is therefore scalable to large datasets. ULIP-2 is also\nequipped with scaled-up backbones for better multimodal representation\nlearning. We conduct experiments on two large-scale 3D datasets, Objaverse and\nShapeNet, and augment them with tri-modal datasets of 3D point clouds, images,\nand language for training ULIP-2. Experiments show that ULIP-2 demonstrates\nsubstantial benefits in three downstream tasks: zero-shot 3D classification,\nstandard 3D classification with fine-tuning, and 3D captioning (3D-to-language\ngeneration). It achieves a new SOTA of 50.6% (top-1) on Objaverse-LVIS and\n84.7% (top-1) on ModelNet40 in zero-shot classification. In the ScanObjectNN\nbenchmark for standard fine-tuning, ULIP-2 reaches an overall accuracy of 91.5%\nwith a compact model of only 1.4 million parameters. ULIP-2 sheds light on a\nnew paradigm for scalable multimodal 3D representation learning without human\nannotations and shows significant improvements over existing baselines. The\ncode and datasets are released at https://github.com/salesforce/ULIP.\n","authors":["Le Xue","Ning Yu","Shu Zhang","Junnan Li","Roberto Martín-Martín","Jiajun Wu","Caiming Xiong","Ran Xu","Juan Carlos Niebles","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2305.08275v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2305.10722v3","updated":"2024-04-24T23:10:17Z","published":"2023-05-18T05:41:36Z","title":"Discffusion: Discriminative Diffusion Models as Few-shot Vision and\n Language Learners","summary":" Diffusion models, such as Stable Diffusion, have shown incredible performance\non text-to-image generation. Since text-to-image generation often requires\nmodels to generate visual concepts with fine-grained details and attributes\nspecified in text prompts, can we leverage the powerful representations learned\nby pre-trained diffusion models for discriminative tasks such as image-text\nmatching? To answer this question, we propose a novel approach, Discriminative\nStable Diffusion (DSD), which turns pre-trained text-to-image diffusion models\ninto few-shot discriminative learners. 
Our approach mainly uses the\ncross-attention score of a Stable Diffusion model to capture the mutual\ninfluence between visual and textual information and fine-tune the model via\nefficient attention-based prompt learning to perform image-text matching. By\ncomparing DSD with state-of-the-art methods on several benchmark datasets, we\ndemonstrate the potential of using pre-trained diffusion models for\ndiscriminative tasks with superior results on few-shot image-text matching.\n","authors":["Xuehai He","Weixi Feng","Tsu-Jui Fu","Varun Jampani","Arjun Akula","Pradyumna Narayana","Sugato Basu","William Yang Wang","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10722v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02411v2","updated":"2024-04-24T23:03:19Z","published":"2024-03-04T19:08:20Z","title":"NiNformer: A Network in Network Transformer with Token Mixing Generated\n Gating Function","summary":" The Attention mechanism is the main component of the Transformer\narchitecture, and since its introduction, it has led to significant\nadvancements in Deep Learning that span many domains and multiple tasks. The\nAttention Mechanism was utilized in Computer Vision as the Vision Transformer\nViT, and its usage has expanded into many tasks in the vision domain, such as\nclassification, segmentation, object detection, and image generation. While\nthis mechanism is very expressive and capable, it comes with the drawback of\nbeing computationally expensive and requiring datasets of considerable size for\neffective optimization. To address these shortcomings, many designs have been\nproposed in the literature to reduce the computational burden and alleviate the\ndata size requirements. Examples of such attempts in the vision domain are the\nMLP-Mixer, the Conv-Mixer, the Perceiver-IO, and many more. This paper\nintroduces a new computational block as an alternative to the standard ViT\nblock that reduces the compute burdens by replacing the normal Attention layers\nwith a Network in Network structure that enhances the static approach of the\nMLP Mixer with a dynamic system of learning an element-wise gating function by\na token mixing process. Extensive experimentation shows that the proposed\ndesign provides better performance than the baseline architectures on multiple\ndatasets applied in the image classification task of the vision domain.\n","authors":["Abdullah Nazhat Abdullah","Tarkan Aydin"],"pdf_url":"https://arxiv.org/pdf/2403.02411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11385v2","updated":"2024-04-24T22:35:02Z","published":"2023-10-17T16:32:38Z","title":"A voxel-level approach to brain age prediction: A method to assess\n regional brain aging","summary":" Brain aging is a regional phenomenon, a facet that remains relatively\nunder-explored within the realm of brain age prediction research using machine\nlearning methods. Voxel-level predictions can provide localized brain age\nestimates that can provide granular insights into the regional aging processes.\nThis is essential to understand the differences in aging trajectories in\nhealthy versus diseased subjects. In this work, a deep learning-based multitask\nmodel is proposed for voxel-level brain age prediction from T1-weighted\nmagnetic resonance images. The proposed model outperforms the models existing\nin the literature and yields valuable clinical insights when applied to both\nhealthy and diseased populations. 
Regional analysis is performed on the\nvoxel-level brain age predictions to understand aging trajectories of known\nanatomical regions in the brain and show that there exist disparities in\nregional aging trajectories of healthy subjects compared to ones with\nunderlying neurological disorders such as Dementia and more specifically,\nAlzheimer's disease. Our code is available at\nhttps://github.com/nehagianchandani/Voxel-level-brain-age-prediction.\n","authors":["Neha Gianchandani","Mahsa Dibaji","Johanna Ospel","Fernando Vega","Mariana Bento","M. Ethan MacDonald","Roberto Souza"],"pdf_url":"https://arxiv.org/pdf/2310.11385v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:007"},{"id":"http://arxiv.org/abs/2404.13591v2","updated":"2024-04-24T22:32:10Z","published":"2024-04-21T09:15:02Z","title":"MARVEL: Multidimensional Abstraction and Reasoning through Visual\n Evaluation and Learning","summary":" While multi-modal large language models (MLLMs) have shown significant\nprogress on many popular visual reasoning benchmarks, whether they possess\nabstract visual reasoning abilities remains an open question. Similar to the\nSudoku puzzles, abstract visual reasoning (AVR) problems require finding\nhigh-level patterns (e.g., repetition constraints) that control the input\nshapes (e.g., digits) in a specific task configuration (e.g., matrix). However,\nexisting AVR benchmarks only considered a limited set of patterns (addition,\nconjunction), input shapes (rectangle, square), and task configurations (3 by 3\nmatrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce\nMARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core\nknowledge patterns, geometric and abstract shapes, and five different task\nconfigurations. To inspect whether the model accuracy is grounded in perception\nand reasoning, MARVEL complements the general AVR question with perception\nquestions in a hierarchical evaluation framework. We conduct comprehensive\nexperiments on MARVEL with nine representative MLLMs in zero-shot and few-shot\nsettings. Our experiments reveal that all models show near-random performance\non the AVR question, with significant performance gaps (40%) compared to humans\nacross all patterns and task configurations. Further analysis of perception\nquestions reveals that MLLMs struggle to comprehend the visual features\n(near-random performance) and even count the panels in the puzzle ( <45%),\nhindering their ability for abstract reasoning. We release our entire code and\ndataset.\n","authors":["Yifan Jiang","Jiarui Zhang","Kexuan Sun","Zhivar Sourati","Kian Ahrabian","Kaixin Ma","Filip Ilievski","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2404.13591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16223v1","updated":"2024-04-24T21:51:01Z","published":"2024-04-24T21:51:01Z","title":"Deep RAW Image Super-Resolution. A NTIRE 2024 Challenge Survey","summary":" This paper reviews the NTIRE 2024 RAW Image Super-Resolution Challenge,\nhighlighting the proposed solutions and results. New methods for RAW\nSuper-Resolution could be essential in modern Image Signal Processing (ISP)\npipelines, however, this problem is not as explored as in the RGB domain. The\ngoal of this challenge is to upscale RAW Bayer images by 2x, considering\nunknown degradations such as noise and blur. 
In the challenge, a total of 230\nparticipants registered, and 45 submitted results during the challenge period.\nThe performance of the top-5 submissions is reviewed and provided here as a\ngauge for the current state-of-the-art in RAW Image Super-Resolution.\n","authors":["Marcos V. Conde","Florin-Alexandru Vasluianu","Radu Timofte","Jianxing Zhang","Jia Li","Fan Wang","Xiaopeng Li","Zikun Liu","Hyunhee Park","Sejun Song","Changho Kim","Zhijuan Huang","Hongyuan Yu","Cheng Wan","Wending Xiang","Jiamin Lin","Hang Zhong","Qiaosong Zhang","Yue Sun","Xuanwu Yin","Kunlong Zuo","Senyan Xu","Siyuan Jiang","Zhijing Sun","Jiaying Zhu","Liangyan Li","Ke Chen","Yunzhe Li","Yimo Ning","Guanhua Zhao","Jun Chen","Jinyang Yu","Kele Xu","Qisheng Xu","Yong Dou"],"pdf_url":"https://arxiv.org/pdf/2404.16223v1.pdf","comment":"CVPR 2024 - NTIRE Workshop"},{"id":"http://arxiv.org/abs/2404.16222v1","updated":"2024-04-24T21:49:59Z","published":"2024-04-24T21:49:59Z","title":"Step Differences in Instructional Video","summary":" Comparing a user video to a reference how-to video is a key requirement for\nAR/VR technology delivering personalized assistance tailored to the user's\nprogress. However, current approaches for language-based assistance can only\nanswer questions about a single video. We propose an approach that first\nautomatically generates large amounts of visual instruction tuning data\ninvolving pairs of videos from HowTo100M by leveraging existing step\nannotations and accompanying narrations, and then trains a video-conditioned\nlanguage model to jointly reason across multiple raw videos. Our model achieves\nstate-of-the-art performance at identifying differences between video pairs and\nranking videos based on the severity of these differences, and shows promising\nability to perform general reasoning over multiple videos.\n","authors":["Tushar Nagarajan","Lorenzo Torresani"],"pdf_url":"https://arxiv.org/pdf/2404.16222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16221v1","updated":"2024-04-24T21:43:15Z","published":"2024-04-24T21:43:15Z","title":"NeRF-XL: Scaling NeRFs with Multiple GPUs","summary":" We present NeRF-XL, a principled method for distributing Neural Radiance\nFields (NeRFs) across multiple GPUs, thus enabling the training and rendering\nof NeRFs with an arbitrarily large capacity. We begin by revisiting existing\nmulti-GPU approaches, which decompose large scenes into multiple independently\ntrained NeRFs, and identify several fundamental issues with these methods that\nhinder improvements in reconstruction quality as additional computational\nresources (GPUs) are used in training. NeRF-XL remedies these issues and\nenables the training and rendering of NeRFs with an arbitrary number of\nparameters by simply using more hardware. At the core of our method lies a\nnovel distributed training and rendering formulation, which is mathematically\nequivalent to the classic single-GPU case and minimizes communication between\nGPUs. By unlocking NeRFs with arbitrarily large parameter counts, our approach\nis the first to reveal multi-GPU scaling laws for NeRFs, showing improvements\nin reconstruction quality with larger parameter counts and speed improvements\nwith more GPUs. 
We demonstrate the effectiveness of NeRF-XL on a wide variety\nof datasets, including the largest open-source dataset to date, MatrixCity,\ncontaining 258K images covering a 25km^2 city area.\n","authors":["Ruilong Li","Sanja Fidler","Angjoo Kanazawa","Francis Williams"],"pdf_url":"https://arxiv.org/pdf/2404.16221v1.pdf","comment":"Webpage: https://research.nvidia.com/labs/toronto-ai/nerfxl/"},{"id":"http://arxiv.org/abs/2404.16216v1","updated":"2024-04-24T21:30:01Z","published":"2024-04-24T21:30:01Z","title":"ActiveRIR: Active Audio-Visual Exploration for Acoustic Environment\n Modeling","summary":" An environment acoustic model represents how sound is transformed by the\nphysical characteristics of an indoor environment, for any given\nsource/receiver location. Traditional methods for constructing acoustic models\ninvolve expensive and time-consuming collection of large quantities of acoustic\ndata at dense spatial locations in the space, or rely on privileged knowledge\nof scene geometry to intelligently select acoustic data sampling locations. We\npropose active acoustic sampling, a new task for efficiently building an\nenvironment acoustic model of an unmapped environment in which a mobile agent\nequipped with visual and acoustic sensors jointly constructs the environment\nacoustic model and the occupancy map on-the-fly. We introduce ActiveRIR, a\nreinforcement learning (RL) policy that leverages information from audio-visual\nsensor streams to guide agent navigation and determine optimal acoustic data\nsampling positions, yielding a high quality acoustic model of the environment\nfrom a minimal set of acoustic samples. We train our policy with a novel RL\nreward based on information gain in the environment acoustic model. Evaluating\non diverse unseen indoor environments from a state-of-the-art acoustic\nsimulation platform, ActiveRIR outperforms an array of methods--both\ntraditional navigation agents based on spatial novelty and visual exploration\nas well as existing state-of-the-art methods.\n","authors":["Arjun Somayazulu","Sagnik Majumder","Changan Chen","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2404.16216v1.pdf","comment":"Project page: https://vision.cs.utexas.edu/projects/active_rir/"},{"id":"http://arxiv.org/abs/2404.16212v1","updated":"2024-04-24T21:21:50Z","published":"2024-04-24T21:21:50Z","title":"An Analysis of Recent Advances in Deepfake Image Detection in an\n Evolving Threat Landscape","summary":" Deepfake or synthetic images produced using deep generative models pose\nserious risks to online platforms. This has triggered several research efforts\nto accurately detect deepfake images, achieving excellent performance on\npublicly available deepfake datasets. In this work, we study 8 state-of-the-art\ndetectors and argue that they are far from being ready for deployment due to\ntwo recent developments. First, the emergence of lightweight methods to\ncustomize large generative models, can enable an attacker to create many\ncustomized generators (to create deepfakes), thereby substantially increasing\nthe threat surface. We show that existing defenses fail to generalize well to\nsuch \\emph{user-customized generative models} that are publicly available\ntoday. We discuss new machine learning approaches based on content-agnostic\nfeatures, and ensemble modeling to improve generalization performance against\nuser-customized models. 
Second, the emergence of \\textit{vision foundation\nmodels} -- machine learning models trained on broad data that can be easily\nadapted to several downstream tasks -- can be misused by attackers to craft\nadversarial deepfakes that can evade existing defenses. We propose a simple\nadversarial attack that leverages existing foundation models to craft\nadversarial samples \\textit{without adding any adversarial noise}, through\ncareful semantic manipulation of the image content. We highlight the\nvulnerabilities of several defenses against our attack, and explore directions\nleveraging advanced foundation models and adversarial training to defend\nagainst this new threat.\n","authors":["Sifat Muhammad Abdullah","Aravind Cheruvu","Shravya Kanchi","Taejoong Chung","Peng Gao","Murtuza Jadliwala","Bimal Viswanath"],"pdf_url":"https://arxiv.org/pdf/2404.16212v1.pdf","comment":"Accepted to IEEE S&P 2024; 19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2205.06891v5","updated":"2024-04-24T21:13:11Z","published":"2022-05-13T21:07:26Z","title":"Unsupervised Representation Learning for 3D MRI Super Resolution with\n Degradation Adaptation","summary":" High-resolution (HR) magnetic resonance imaging is critical in aiding doctors\nin their diagnoses and image-guided treatments. However, acquiring HR images\ncan be time-consuming and costly. Consequently, deep learning-based\nsuper-resolution reconstruction (SRR) has emerged as a promising solution for\ngenerating super-resolution (SR) images from low-resolution (LR) images.\nUnfortunately, training such neural networks requires aligned authentic HR and\nLR image pairs, which are challenging to obtain due to patient movements during\nand between image acquisitions. While rigid movements of hard tissues can be\ncorrected with image registration, aligning deformed soft tissues is complex,\nmaking it impractical to train neural networks with authentic HR and LR image\npairs. Previous studies have focused on SRR using authentic HR images and\ndown-sampled synthetic LR images. However, the difference in degradation\nrepresentations between synthetic and authentic LR images suppresses the\nquality of SR images reconstructed from authentic LR images. To address this\nissue, we propose a novel Unsupervised Degradation Adaptation Network (UDEAN).\nOur network consists of a degradation learning network and an SRR network. The\ndegradation learning network downsamples the HR images using the degradation\nrepresentation learned from the misaligned or unpaired LR images. The SRR\nnetwork then learns the mapping from the down-sampled HR images to the original\nones. Experimental results show that our method outperforms state-of-the-art\nnetworks and is a promising solution to the challenges in clinical settings.\n","authors":["Jianan Liu","Hao Li","Tao Huang","Euijoon Ahn","Kang Han","Adeel Razi","Wei Xiang","Jinman Kim","David Dagan Feng"],"pdf_url":"https://arxiv.org/pdf/2205.06891v5.pdf","comment":"Accepted by IEEE Transactions on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.16205v1","updated":"2024-04-24T21:02:14Z","published":"2024-04-24T21:02:14Z","title":"AIS 2024 Challenge on Video Quality Assessment of User-Generated\n Content: Methods and Results","summary":" This paper reviews the AIS 2024 Video Quality Assessment (VQA) Challenge,\nfocused on User-Generated Content (UGC). The aim of this challenge is to gather\ndeep learning-based methods capable of estimating the perceptual quality of UGC\nvideos. 
The user-generated videos from the YouTube UGC Dataset include diverse\ncontent (sports, games, lyrics, anime, etc.), quality and resolutions. The\nproposed methods must process 30 FHD frames under 1 second. In the challenge, a\ntotal of 102 participants registered, and 15 submitted code and models. The\nperformance of the top-5 submissions is reviewed and provided here as a survey\nof diverse deep models for efficient video quality assessment of user-generated\ncontent.\n","authors":["Marcos V. Conde","Saman Zadtootaghaj","Nabajeet Barman","Radu Timofte","Chenlong He","Qi Zheng","Ruoxi Zhu","Zhengzhong Tu","Haiqiang Wang","Xiangguang Chen","Wenhui Meng","Xiang Pan","Huiying Shi","Han Zhu","Xiaozhong Xu","Lei Sun","Zhenzhong Chen","Shan Liu","Zicheng Zhang","Haoning Wu","Yingjie Zhou","Chunyi Li","Xiaohong Liu","Weisi Lin","Guangtao Zhai","Wei Sun","Yuqin Cao","Yanwei Jiang","Jun Jia","Zhichao Zhang","Zijian Chen","Weixia Zhang","Xiongkuo Min","Steve Göring","Zihao Qi","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2404.16205v1.pdf","comment":"CVPR 2024 Workshop -- AI for Streaming (AIS) Video Quality Assessment\n Challenge"},{"id":"http://arxiv.org/abs/2404.16193v1","updated":"2024-04-24T20:33:25Z","published":"2024-04-24T20:33:25Z","title":"Improving Multi-label Recognition using Class Co-Occurrence\n Probabilities","summary":" Multi-label Recognition (MLR) involves the identification of multiple objects\nwithin an image. To address the additional complexity of this problem, recent\nworks have leveraged information from vision-language models (VLMs) trained on\nlarge text-images datasets for the task. These methods learn an independent\nclassifier for each object (class), overlooking correlations in their\noccurrences. Such co-occurrences can be captured from the training data as\nconditional probabilities between a pair of classes. We propose a framework to\nextend the independent classifiers by incorporating the co-occurrence\ninformation for object pairs to improve the performance of independent\nclassifiers. We use a Graph Convolutional Network (GCN) to enforce the\nconditional probabilities between classes, by refining the initial estimates\nderived from image and text sources obtained using VLMs. We validate our method\non four MLR datasets, where our approach outperforms all state-of-the-art\nmethods.\n","authors":["Samyak Rawlekar","Shubhang Bhatnagar","Vishnuvardhan Pogunulu Srinivasulu","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2404.16193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16192v1","updated":"2024-04-24T20:31:15Z","published":"2024-04-24T20:31:15Z","title":"Fusion of Domain-Adapted Vision and Language Models for Medical Visual\n Question Answering","summary":" Vision-language models, while effective in general domains and showing strong\nperformance in diverse multi-modal applications like visual question-answering\n(VQA), struggle to maintain the same level of effectiveness in more specialized\ndomains, e.g., medical. 
We propose a medical vision-language model that\nintegrates large vision and language models adapted for the medical domain.\nThis model goes through three stages of parameter-efficient training using\nthree separate biomedical and radiology multi-modal visual and text datasets.\nThe proposed model achieves state-of-the-art performance on the SLAKE 1.0\nmedical VQA (MedVQA) dataset with an overall accuracy of 87.5% and demonstrates\nstrong performance on another MedVQA dataset, VQA-RAD, achieving an overall\naccuracy of 73.2%.\n","authors":["Cuong Nhat Ha","Shima Asaadi","Sanjeev Kumar Karn","Oladimeji Farri","Tobias Heimann","Thomas Runkler"],"pdf_url":"https://arxiv.org/pdf/2404.16192v1.pdf","comment":"Clinical NLP @ NAACL 2024"},{"id":"http://arxiv.org/abs/2404.15009v2","updated":"2024-04-24T20:30:08Z","published":"2024-04-23T13:15:22Z","title":"The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus\n on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs)","summary":" Pediatric tumors of the central nervous system are the most common cause of\ncancer-related death in children. The five-year survival rate for high-grade\ngliomas in children is less than 20%. Due to their rarity, the diagnosis of\nthese entities is often delayed, their treatment is mainly based on historic\ntreatment concepts, and clinical trials require multi-institutional\ncollaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs\nchallenge, focused on pediatric brain tumors with data acquired across multiple\ninternational consortia dedicated to pediatric neuro-oncology and clinical\ntrials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together\nclinicians and AI/imaging scientists to lead to faster development of automated\nsegmentation techniques that could benefit clinical trials, and ultimately the\ncare of children with brain tumors.\n","authors":["Anahita Fathi Kazerooni","Nastaran Khalili","Deep Gandhi","Xinyang Liu","Zhifan Jiang","Syed Muhammed Anwar","Jake Albrecht","Maruf Adewole","Udunna Anazodo","Hannah Anderson","Sina Bagheri","Ujjwal Baid","Timothy Bergquist","Austin J. Borja","Evan Calabrese","Verena Chung","Gian-Marco Conte","Farouk Dako","James Eddy","Ivan Ezhov","Ariana Familiar","Keyvan Farahani","Anurag Gottipati","Debanjan Haldar","Shuvanjan Haldar","Juan Eugenio Iglesias","Anastasia Janas","Elaine Johansen","Blaise V Jones","Neda Khalili","Florian Kofler","Dominic LaBella","Hollie Anne Lai","Koen Van Leemput","Hongwei Bran Li","Nazanin Maleki","Aaron S McAllister","Zeke Meier","Bjoern Menze","Ahmed W Moawad","Khanak K Nandolia","Julija Pavaine","Marie Piraud","Tina Poussaint","Sanjay P Prabhu","Zachary Reitman","Andres Rodriguez","Jeffrey D Rudie","Mariana Sanchez-Montano","Ibraheem Salman Shaikh","Lubdha M. 
Shah","Nakul Sheth","Russel Taki Shinohara","Wenxin Tu","Karthik Viswanathan","Chunhao Wang","Jeffrey B Ware","Benedikt Wiestler","Walter Wiggins","Anna Zapaishchykova","Mariam Aboian","Miriam Bornhorst","Peter de Blank","Michelle Deutsch","Maryam Fouladi","Lindsey Hoffman","Benjamin Kann","Margot Lazow","Leonie Mikael","Ali Nabavizadeh","Roger Packer","Spyridon Bakas","Adam Resnick","Brian Rood","Arastoo Vossough","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2404.15009v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2305.17033"},{"id":"http://arxiv.org/abs/2310.01641v4","updated":"2024-04-24T20:05:04Z","published":"2023-10-02T21:09:43Z","title":"You Only Look at Once for Real-time and Generic Multi-Task","summary":" High precision, lightweight, and real-time responsiveness are three essential\nrequirements for implementing autonomous driving. In this study, we incorporate\nA-YOLOM, an adaptive, real-time, and lightweight multi-task model designed to\nconcurrently address object detection, drivable area segmentation, and lane\nline segmentation tasks. Specifically, we develop an end-to-end multi-task\nmodel with a unified and streamlined segmentation structure. We introduce a\nlearnable parameter that adaptively concatenates features between necks and\nbackbone in segmentation tasks, using the same loss function for all\nsegmentation tasks. This eliminates the need for customizations and enhances\nthe model's generalization capabilities. We also introduce a segmentation head\ncomposed only of a series of convolutional layers, which reduces the number of\nparameters and inference time. We achieve competitive results on the BDD100k\ndataset, particularly in visualization outcomes. The performance results show a\nmAP50 of 81.1% for object detection, a mIoU of 91.0% for drivable area\nsegmentation, and an IoU of 28.8% for lane line segmentation. Additionally, we\nintroduce real-world scenarios to evaluate our model's performance in a real\nscene, which significantly outperforms competitors. This demonstrates that our\nmodel not only exhibits competitive performance but is also more flexible and\nfaster than existing multi-task models. The source codes and pre-trained models\nare released at https://github.com/JiayuanWang-JW/YOLOv8-multi-task\n","authors":["Jiayuan Wang","Q. M. Jonathan Wu","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01641v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16174v1","updated":"2024-04-24T20:04:55Z","published":"2024-04-24T20:04:55Z","title":"MiMICRI: Towards Domain-centered Counterfactual Explanations of\n Cardiovascular Image Classification Models","summary":" The recent prevalence of publicly accessible, large medical imaging datasets\nhas led to a proliferation of artificial intelligence (AI) models for\ncardiovascular image classification and analysis. At the same time, the\npotentially significant impacts of these models have motivated the development\nof a range of explainable AI (XAI) methods that aim to explain model\npredictions given certain image inputs. However, many of these methods are not\ndeveloped or evaluated with domain experts, and explanations are not\ncontextualized in terms of medical expertise or domain knowledge. In this\npaper, we propose a novel framework and python library, MiMICRI, that provides\ndomain-centered counterfactual explanations of cardiovascular image\nclassification models. 
MiMICRI helps users interactively select and replace\nsegments of medical images that correspond to morphological structures. From\nthe counterfactuals generated, users can then assess the influence of each\nsegment on model predictions, and validate the model against known medical\nfacts. We evaluate this library with two medical experts. Our evaluation\ndemonstrates that a domain-centered XAI approach can enhance the\ninterpretability of model explanations, and help experts reason about models in\nterms of relevant domain knowledge. However, concerns were also surfaced about\nthe clinical plausibility of the counterfactuals generated. We conclude with a\ndiscussion on the generalizability and trustworthiness of the MiMICRI\nframework, as well as the implications of our findings on the development of\ndomain-centered XAI methods for model interpretability in healthcare contexts.\n","authors":["Grace Guo","Lifu Deng","Animesh Tandon","Alex Endert","Bum Chul Kwon"],"pdf_url":"https://arxiv.org/pdf/2404.16174v1.pdf","comment":"14 pages, 6 figures, ACM FAccT 2024"},{"id":"http://arxiv.org/abs/2403.06862v2","updated":"2024-04-24T19:36:34Z","published":"2024-03-11T16:15:51Z","title":"Real-Time Simulated Avatar from Head-Mounted Sensors","summary":" We present SimXR, a method for controlling a simulated avatar from\ninformation (headset pose and cameras) obtained from AR / VR headsets. Due to\nthe challenging viewpoint of head-mounted cameras, the human body is often\nclipped out of view, making traditional image-based egocentric pose estimation\nchallenging. On the other hand, headset poses provide valuable information\nabout overall body motion, but lack fine-grained details about the hands and\nfeet. To synergize headset poses with cameras, we control a humanoid to track\nheadset movement while analyzing input images to decide body movement. When\nbody parts are seen, the movements of hands and feet will be guided by the\nimages; when unseen, the laws of physics guide the controller to generate\nplausible motion. We design an end-to-end method that does not rely on any\nintermediate representations and learns to directly map from images and headset\nposes to humanoid control signals. To train our method, we also propose a\nlarge-scale synthetic dataset created using camera configurations compatible\nwith a commercially available VR headset (Quest 2) and show promising results\non real-world captures. To demonstrate the applicability of our framework, we\nalso test it on an AR headset with a forward-facing camera.\n","authors":["Zhengyi Luo","Jinkun Cao","Rawal Khirodkar","Alexander Winkler","Jing Huang","Kris Kitani","Weipeng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.06862v2.pdf","comment":"CVPR 2024 Hightlight. Website: https://www.zhengyiluo.com/SimXR/"},{"id":"http://arxiv.org/abs/2404.03537v3","updated":"2024-04-24T19:36:25Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. 
In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v3.pdf","comment":"Accepted as full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2404.16155v1","updated":"2024-04-24T19:22:45Z","published":"2024-04-24T19:22:45Z","title":"Does SAM dream of EIG? Characterizing Interactive Segmenter Performance\n using Expected Information Gain","summary":" We introduce an assessment procedure for interactive segmentation models.\nBased on concepts from Bayesian Experimental Design, the procedure measures a\nmodel's understanding of point prompts and their correspondence with the\ndesired segmentation mask. We show that Oracle Dice index measurements are\ninsensitive or even misleading in measuring this property. We demonstrate the\nuse of the proposed procedure on three interactive segmentation models and\nsubsets of two large image segmentation datasets.\n","authors":["Kuan-I Chung","Daniel Moyer"],"pdf_url":"https://arxiv.org/pdf/2404.16155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16139v1","updated":"2024-04-24T18:57:30Z","published":"2024-04-24T18:57:30Z","title":"A Survey on Intermediate Fusion Methods for Collaborative Perception\n Categorized by Real World Challenges","summary":" This survey analyzes intermediate fusion methods in collaborative perception\nfor autonomous driving, categorized by real-world challenges. We examine\nvarious methods, detailing their features and the evaluation metrics they\nemploy. The focus is on addressing challenges like transmission efficiency,\nlocalization errors, communication disruptions, and heterogeneity. Moreover, we\nexplore strategies to counter adversarial attacks and defenses, as well as\napproaches to adapt to domain shifts. The objective is to present an overview\nof how intermediate fusion methods effectively meet these diverse challenges,\nhighlighting their role in advancing the field of collaborative perception in\nautonomous driving.\n","authors":["Melih Yazgan","Thomas Graf","Min Liu","J. Marius Zoellner"],"pdf_url":"https://arxiv.org/pdf/2404.16139v1.pdf","comment":"8 pages, 6 tables"},{"id":"http://arxiv.org/abs/2404.16136v1","updated":"2024-04-24T18:49:37Z","published":"2024-04-24T18:49:37Z","title":"3D Human Pose Estimation with Occlusions: Introducing BlendMimic3D\n Dataset and GCN Refinement","summary":" In the field of 3D Human Pose Estimation (HPE), accurately estimating human\npose, especially in scenarios with occlusions, is a significant challenge. 
This\nwork identifies and addresses a gap in the current state of the art in 3D HPE\nconcerning the scarcity of data and strategies for handling occlusions. We\nintroduce our novel BlendMimic3D dataset, designed to mimic real-world\nsituations where occlusions occur for seamless integration in 3D HPE\nalgorithms. Additionally, we propose a 3D pose refinement block, employing a\nGraph Convolutional Network (GCN) to enhance pose representation through a\ngraph model. This GCN block acts as a plug-and-play solution, adaptable to\nvarious 3D HPE frameworks without requiring retraining them. By training the\nGCN with occluded data from BlendMimic3D, we demonstrate significant\nimprovements in resolving occluded poses, with comparable results for\nnon-occluded ones. Project web page is available at\nhttps://blendmimic3d.github.io/BlendMimic3D/.\n","authors":["Filipa Lino","Carlos Santiago","Manuel Marques"],"pdf_url":"https://arxiv.org/pdf/2404.16136v1.pdf","comment":"Accepted at 6th Workshop and Competition on Affective Behavior\n Analysis in-the-wild - CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.16133v1","updated":"2024-04-24T18:40:45Z","published":"2024-04-24T18:40:45Z","title":"Quantitative Characterization of Retinal Features in Translated OCTA","summary":" Purpose: This study explores the feasibility of using generative machine\nlearning (ML) to translate Optical Coherence Tomography (OCT) images into\nOptical Coherence Tomography Angiography (OCTA) images, potentially bypassing\nthe need for specialized OCTA hardware. Methods: The method involved\nimplementing a generative adversarial network framework that includes a 2D\nvascular segmentation model and a 2D OCTA image translation model. The study\nutilizes a public dataset of 500 patients, divided into subsets based on\nresolution and disease status, to validate the quality of TR-OCTA images. The\nvalidation employs several quality and quantitative metrics to compare the\ntranslated images with ground truth OCTAs (GT-OCTA). We then quantitatively\ncharacterize vascular features generated in TR-OCTAs with GT-OCTAs to assess\nthe feasibility of using TR-OCTA for objective disease diagnosis. Result:\nTR-OCTAs showed high image quality in both 3 and 6 mm datasets\n(high-resolution, moderate structural similarity and contrast quality compared\nto GT-OCTAs). There were slight discrepancies in vascular metrics, especially\nin diseased patients. Blood vessel features like tortuosity and vessel\nperimeter index showed a better trend compared to density features which are\naffected by local vascular distortions. Conclusion: This study presents a\npromising solution to the limitations of OCTA adoption in clinical practice by\nusing vascular features from TR-OCTA for disease detection. Translation\nrelevance: This study has the potential to significantly enhance the diagnostic\nprocess for retinal diseases by making detailed vascular imaging more widely\navailable and reducing dependency on costly OCTA equipment.\n","authors":["Rashadul Hasan Badhon","Atalie Carina Thompson","Jennifer I. 
Lim","Theodore Leng","Minhaj Nur Alam"],"pdf_url":"https://arxiv.org/pdf/2404.16133v1.pdf","comment":"The article has been revised and edited"},{"id":"http://arxiv.org/abs/2404.16123v1","updated":"2024-04-24T18:28:17Z","published":"2024-04-24T18:28:17Z","title":"FairDeDup: Detecting and Mitigating Vision-Language Fairness Disparities\n in Semantic Dataset Deduplication","summary":" Recent dataset deduplication techniques have demonstrated that content-aware\ndataset pruning can dramatically reduce the cost of training Vision-Language\nPretrained (VLP) models without significant performance losses compared to\ntraining on the original dataset. These results have been based on pruning\ncommonly used image-caption datasets collected from the web -- datasets that\nare known to harbor harmful social biases that may then be codified in trained\nmodels. In this work, we evaluate how deduplication affects the prevalence of\nthese biases in the resulting trained models and introduce an easy-to-implement\nmodification to the recent SemDeDup algorithm that can reduce the negative\neffects that we observe. When examining CLIP-style models trained on\ndeduplicated variants of LAION-400M, we find our proposed FairDeDup algorithm\nconsistently leads to improved fairness metrics over SemDeDup on the FairFace\nand FACET datasets while maintaining zero-shot performance on CLIP benchmarks.\n","authors":["Eric Slyman","Stefan Lee","Scott Cohen","Kushal Kafle"],"pdf_url":"https://arxiv.org/pdf/2404.16123v1.pdf","comment":"Conference paper at CVPR 2024. 6 pages, 8 figures. Project Page:\n https://ericslyman.com/fairdedup/"},{"id":"http://arxiv.org/abs/2312.04564v2","updated":"2024-04-24T18:19:32Z","published":"2023-12-07T18:59:55Z","title":"EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS","summary":" Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view\nscene synthesis. It addresses the challenges of lengthy training times and slow\nrendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid,\ndifferentiable rasterization of 3D Gaussians, 3D-GS achieves real-time\nrendering and accelerated training. They, however, demand substantial memory\nresources for both training and storage, as they require millions of Gaussians\nin their point cloud representation for each scene. We present a technique\nutilizing quantized embeddings to significantly reduce per-point memory storage\nrequirements and a coarse-to-fine training strategy for a faster and more\nstable optimization of the Gaussian point clouds. Our approach develops a\npruning stage which results in scene representations with fewer Gaussians,\nleading to faster training times and rendering speeds for real-time rendering\nof high resolution scenes. We reduce storage memory by more than an order of\nmagnitude all while preserving the reconstruction quality. We validate the\neffectiveness of our approach on a variety of datasets and scenes preserving\nthe visual quality while consuming 10-20x lesser memory and faster\ntraining/inference speed. 
Project page and code is available\nhttps://efficientgaussian.github.io\n","authors":["Sharath Girish","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04564v2.pdf","comment":"Website: https://efficientgaussian.github.io Code:\n https://github.com/Sharath-girish/efficientgaussian"},{"id":"http://arxiv.org/abs/2403.15360v2","updated":"2024-04-24T18:19:21Z","published":"2024-03-22T17:22:56Z","title":"SiMBA: Simplified Mamba-Based Architecture for Vision and Multivariate\n Time series","summary":" Transformers have widely adopted attention networks for sequence mixing and\nMLPs for channel mixing, playing a pivotal role in achieving breakthroughs\nacross domains. However, recent literature highlights issues with attention\nnetworks, including low inductive bias and quadratic complexity concerning\ninput sequence length. State Space Models (SSMs) like S4 and others (Hippo,\nGlobal Convolutions, liquid S4, LRU, Mega, and Mamba), have emerged to address\nthe above issues to help handle longer sequence lengths. Mamba, while being the\nstate-of-the-art SSM, has a stability issue when scaled to large networks for\ncomputer vision datasets. We propose SiMBA, a new architecture that introduces\nEinstein FFT (EinFFT) for channel modeling by specific eigenvalue computations\nand uses the Mamba block for sequence modeling. Extensive performance studies\nacross image and time-series benchmarks demonstrate that SiMBA outperforms\nexisting SSMs, bridging the performance gap with state-of-the-art transformers.\nNotably, SiMBA establishes itself as the new state-of-the-art SSM on ImageNet\nand transfer learning benchmarks such as Stanford Car and Flower as well as\ntask learning benchmarks as well as seven time series benchmark datasets. The\nproject page is available on this website\n~\\url{https://github.com/badripatro/Simba}.\n","authors":["Badri N. Patro","Vijay S. Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2403.15360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16112v1","updated":"2024-04-24T18:10:31Z","published":"2024-04-24T18:10:31Z","title":"Mamba-360: Survey of State Space Models as Transformer Alternative for\n Long Sequence Modelling: Methods, Applications, and Challenges","summary":" Sequence modeling is a crucial area across various domains, including Natural\nLanguage Processing (NLP), speech recognition, time series forecasting, music\ngeneration, and bioinformatics. Recurrent Neural Networks (RNNs) and Long Short\nTerm Memory Networks (LSTMs) have historically dominated sequence modeling\ntasks like Machine Translation, Named Entity Recognition (NER), etc. However,\nthe advancement of transformers has led to a shift in this paradigm, given\ntheir superior performance. Yet, transformers suffer from $O(N^2)$ attention\ncomplexity and challenges in handling inductive bias. Several variations have\nbeen proposed to address these issues which use spectral networks or\nconvolutions and have performed well on a range of tasks. However, they still\nhave difficulty in dealing with long sequences. State Space Models (SSMs) have\nemerged as promising alternatives for sequence modeling paradigms in this\ncontext, especially with the advent of S4 and its variants, such as S4nd,\nHippo, Hyena, Diagonal State Spaces (DSS), Gated State Spaces (GSS), Linear\nRecurrent Unit (LRU), Liquid-S4, Mamba, etc. In this survey, we categorize the\nfoundational SSMs based on three paradigms namely, Gating architectures,\nStructural architectures, and Recurrent architectures. 
This survey also\nhighlights diverse applications of SSMs across domains such as vision, video,\naudio, speech, language (especially long sequence modeling), medical (including\ngenomics), chemical (like drug design), recommendation systems, and time series\nanalysis, including tabular data. Moreover, we consolidate the performance of\nSSMs on benchmark datasets like Long Range Arena (LRA), WikiText, Glue, Pile,\nImageNet, Kinetics-400, sstv2, as well as video datasets such as Breakfast,\nCOIN, LVU, and various time series datasets. The project page for Mamba-360\nwork is available on this webpage.\\url{https://github.com/badripatro/mamba360}.\n","authors":["Badri Narayana Patro","Vijay Srinivas Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2404.16112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16035v1","updated":"2024-04-24T17:59:53Z","published":"2024-04-24T17:59:53Z","title":"MaGGIe: Masked Guided Gradual Human Instance Matting","summary":" Human matting is a foundation task in image and video processing, where human\nforeground pixels are extracted from the input. Prior works either improve the\naccuracy by additional guidance or improve the temporal consistency of a single\ninstance across frames. We propose a new framework MaGGIe, Masked Guided\nGradual Human Instance Matting, which predicts alpha mattes progressively for\neach human instances while maintaining the computational cost, precision, and\nconsistency. Our method leverages modern architectures, including transformer\nattention and sparse convolution, to output all instance mattes simultaneously\nwithout exploding memory and latency. Although keeping constant inference costs\nin the multiple-instance scenario, our framework achieves robust and versatile\nperformance on our proposed synthesized benchmarks. With the higher quality\nimage and video matting benchmarks, the novel multi-instance synthesis approach\nfrom publicly available sources is introduced to increase the generalization of\nmodels in real-world scenarios.\n","authors":["Chuong Huynh","Seoung Wug Oh","Abhinav Shrivastava","Joon-Young Lee"],"pdf_url":"https://arxiv.org/pdf/2404.16035v1.pdf","comment":"CVPR 2024. Project link: https://maggie-matt.github.io"},{"id":"http://arxiv.org/abs/2404.16033v1","updated":"2024-04-24T17:59:48Z","published":"2024-04-24T17:59:48Z","title":"Cantor: Inspiring Multimodal Chain-of-Thought of MLLM","summary":" With the advent of large language models(LLMs) enhanced by the\nchain-of-thought(CoT) methodology, visual reasoning problem is usually\ndecomposed into manageable sub-tasks and tackled sequentially with various\nexternal tools. However, such a paradigm faces the challenge of the potential\n\"determining hallucinations\" in decision-making due to insufficient visual\ninformation and the limitation of low-level perception tools that fail to\nprovide abstract summaries necessary for comprehensive reasoning. We argue that\nconverging visual context acquisition and logical reasoning is pivotal for\ntackling visual reasoning tasks. This paper delves into the realm of multimodal\nCoT to solve intricate visual reasoning tasks with multimodal large language\nmodels(MLLMs) and their cognitive capability. To this end, we propose an\ninnovative multimodal CoT framework, termed Cantor, characterized by a\nperception-decision architecture. Cantor first acts as a decision generator and\nintegrates visual inputs to analyze the image and problem, ensuring a closer\nalignment with the actual context. 
Furthermore, Cantor leverages the advanced\ncognitive functions of MLLMs to perform as multifaceted experts for deriving\nhigher-level information, enhancing the CoT generation process. Our extensive\nexperiments demonstrate the efficacy of the proposed framework, showing\nsignificant improvements in multimodal CoT performance across two complex\nvisual reasoning datasets, without necessitating fine-tuning or ground-truth\nrationales. Project Page: https://ggg0919.github.io/cantor/ .\n","authors":["Timin Gao","Peixian Chen","Mengdan Zhang","Chaoyou Fu","Yunhang Shen","Yan Zhang","Shengchuan Zhang","Xiawu Zheng","Xing Sun","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.16033v1.pdf","comment":"The project page is available at https://ggg0919.github.io/cantor/"},{"id":"http://arxiv.org/abs/2404.16030v1","updated":"2024-04-24T17:59:24Z","published":"2024-04-24T17:59:24Z","title":"MoDE: CLIP Data Experts via Clustering","summary":" The success of contrastive language-image pretraining (CLIP) relies on the\nsupervision from the pairing between images and captions, which tends to be\nnoisy in web-crawled data. We present Mixture of Data Experts (MoDE) and learn\na system of CLIP data experts via clustering. Each data expert is trained on\none data cluster, being less sensitive to false negative noises in other\nclusters. At inference time, we ensemble their outputs by applying weights\ndetermined through the correlation between task metadata and cluster\nconditions. To estimate the correlation precisely, the samples in one cluster\nshould be semantically similar, but the number of data experts should still be\nreasonable for training and inference. As such, we consider the ontology in\nhuman language and propose to use fine-grained cluster centers to represent\neach data expert at a coarse-grained level. Experimental studies show that four\nCLIP data experts on ViT-B/16 outperform the ViT-L/14 by OpenAI CLIP and\nOpenCLIP on zero-shot image classification but with less ($<$35\\%) training\ncost. Meanwhile, MoDE can train all data expert asynchronously and can flexibly\ninclude new data experts. The code is available at\nhttps://github.com/facebookresearch/MetaCLIP/tree/main/mode.\n","authors":["Jiawei Ma","Po-Yao Huang","Saining Xie","Shang-Wen Li","Luke Zettlemoyer","Shih-Fu Chang","Wen-Tau Yih","Hu Xu"],"pdf_url":"https://arxiv.org/pdf/2404.16030v1.pdf","comment":"IEEE CVPR 2024 Camera Ready. Code Link:\n https://github.com/facebookresearch/MetaCLIP/tree/main/mode"},{"id":"http://arxiv.org/abs/2404.16029v1","updated":"2024-04-24T17:59:11Z","published":"2024-04-24T17:59:11Z","title":"Editable Image Elements for Controllable Synthesis","summary":" Diffusion models have made significant advances in text-guided synthesis\ntasks. However, editing user-provided images remains challenging, as the high\ndimensional noise input space of diffusion models is not naturally suited for\nimage inversion or spatial editing. In this work, we propose an image\nrepresentation that promotes spatial editing of input images using a diffusion\nmodel. Concretely, we learn to encode an input into \"image elements\" that can\nfaithfully reconstruct an input image. These elements can be intuitively edited\nby a user, and are decoded by a diffusion model into realistic images. We show\nthe effectiveness of our representation on various image editing tasks, such as\nobject resizing, rearrangement, dragging, de-occlusion, removal, variation, and\nimage composition. 
Project page:\nhttps://jitengmu.github.io/Editable_Image_Elements/\n","authors":["Jiteng Mu","Michaël Gharbi","Richard Zhang","Eli Shechtman","Nuno Vasconcelos","Xiaolong Wang","Taesung Park"],"pdf_url":"https://arxiv.org/pdf/2404.16029v1.pdf","comment":"Project page: https://jitengmu.github.io/Editable_Image_Elements/"},{"id":"http://arxiv.org/abs/2404.16022v1","updated":"2024-04-24T17:55:33Z","published":"2024-04-24T17:55:33Z","title":"PuLID: Pure and Lightning ID Customization via Contrastive Alignment","summary":" We propose Pure and Lightning ID customization (PuLID), a novel tuning-free\nID customization method for text-to-image generation. By incorporating a\nLightning T2I branch with a standard diffusion one, PuLID introduces both\ncontrastive alignment loss and accurate ID loss, minimizing disruption to the\noriginal model and ensuring high ID fidelity. Experiments show that PuLID\nachieves superior performance in both ID fidelity and editability. Another\nattractive property of PuLID is that the image elements (e.g., background,\nlighting, composition, and style) before and after the ID insertion are kept as\nconsistent as possible. Codes and models will be available at\nhttps://github.com/ToTheBeginning/PuLID\n","authors":["Zinan Guo","Yanze Wu","Zhuowei Chen","Lang Chen","Qian He"],"pdf_url":"https://arxiv.org/pdf/2404.16022v1.pdf","comment":"Tech Report. Codes and models will be available at\n https://github.com/ToTheBeginning/PuLID"},{"id":"http://arxiv.org/abs/2401.17231v2","updated":"2024-04-24T17:55:06Z","published":"2024-01-30T18:18:41Z","title":"Achieving More Human Brain-Like Vision via Human EEG Representational\n Alignment","summary":" Despite advancements in artificial intelligence, object recognition models\nstill lag behind in emulating visual information processing in human brains.\nRecent studies have highlighted the potential of using neural data to mimic\nbrain processing; however, these often rely on invasive neural recordings from\nnon-human subjects, leaving a critical gap in understanding human visual\nperception. Addressing this gap, we present, for the first time,\n'Re(presentational)Al(ignment)net', a vision model aligned with human brain\nactivity based on non-invasive EEG, demonstrating a significantly higher\nsimilarity to human brain representations. Our innovative image-to-brain\nmulti-layer encoding framework advances human neural alignment by optimizing\nmultiple model layers and enabling the model to efficiently learn and mimic\nhuman brain's visual representational patterns across object categories and\ndifferent modalities. Our findings suggest that ReAlnet represents a\nbreakthrough in bridging the gap between artificial and human vision, and\npaving the way for more brain-like artificial intelligence systems.\n","authors":["Zitong Lu","Yile Wang","Julie D. Golomb"],"pdf_url":"https://arxiv.org/pdf/2401.17231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08371v2","updated":"2024-04-24T17:53:08Z","published":"2023-12-13T18:59:13Z","title":"PTT: Point-Trajectory Transformer for Efficient Temporal 3D Object\n Detection","summary":" Recent temporal LiDAR-based 3D object detectors achieve promising performance\nbased on the two-stage proposal-based approach. They generate 3D box candidates\nfrom the first-stage dense detector, followed by different temporal aggregation\nmethods. However, these approaches require per-frame objects or whole point\nclouds, posing challenges related to memory bank utilization. 
Moreover, point\nclouds and trajectory features are combined solely based on concatenation,\nwhich may neglect effective interactions between them. In this paper, we\npropose a point-trajectory transformer with long short-term memory for\nefficient temporal 3D object detection. To this end, we only utilize point\nclouds of current-frame objects and their historical trajectories as input to\nminimize the memory bank storage requirement. Furthermore, we introduce modules\nto encode trajectory features, focusing on long short-term and future-aware\nperspectives, and then effectively aggregate them with point cloud features. We\nconduct extensive experiments on the large-scale Waymo dataset to demonstrate\nthat our approach performs well against state-of-the-art methods. Code and\nmodels will be made publicly available at https://github.com/kuanchihhuang/PTT.\n","authors":["Kuan-Chih Huang","Weijie Lyu","Ming-Hsuan Yang","Yi-Hsuan Tsai"],"pdf_url":"https://arxiv.org/pdf/2312.08371v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://github.com/kuanchihhuang/PTT"},{"id":"http://arxiv.org/abs/2404.16017v1","updated":"2024-04-24T17:50:37Z","published":"2024-04-24T17:50:37Z","title":"RetinaRegNet: A Versatile Approach for Retinal Image Registration","summary":" We introduce the RetinaRegNet model, which can achieve state-of-the-art\nperformance across various retinal image registration tasks. RetinaRegNet does\nnot require training on any retinal images. It begins by establishing point\ncorrespondences between two retinal images using image features derived from\ndiffusion models. This process involves the selection of feature points from\nthe moving image using the SIFT algorithm alongside random point sampling. For\neach selected feature point, a 2D correlation map is computed by assessing the\nsimilarity between the feature vector at that point and the feature vectors of\nall pixels in the fixed image. The pixel with the highest similarity score in\nthe correlation map corresponds to the feature point in the moving image. To\nremove outliers in the estimated point correspondences, we first applied an\ninverse consistency constraint, followed by a transformation-based outlier\ndetector. This method proved to outperform the widely used random sample\nconsensus (RANSAC) outlier detector by a significant margin. To handle large\ndeformations, we utilized a two-stage image registration framework. A\nhomography transformation was used in the first stage and a more accurate\nthird-order polynomial transformation was used in the second stage. The model's\neffectiveness was demonstrated across three retinal image datasets: color\nfundus images, fluorescein angiography images, and laser speckle flowgraphy\nimages. RetinaRegNet outperformed current state-of-the-art methods in all three\ndatasets. It was especially effective for registering image pairs with large\ndisplacement and scaling deformations. This innovation holds promise for\nvarious applications in retinal image analysis. Our code is publicly available\nat https://github.com/mirthAI/RetinaRegNet.\n","authors":["Vishal Balaji Sivaraman","Muhammad Imran","Qingyue Wei","Preethika Muralidharan","Michelle R. Tamplin","Isabella M . Grumbach","Randy H. 
Kardon","Jui-Kai Wang","Yuyin Zhou","Wei Shao"],"pdf_url":"https://arxiv.org/pdf/2404.16017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16006v1","updated":"2024-04-24T17:37:05Z","published":"2024-04-24T17:37:05Z","title":"MMT-Bench: A Comprehensive Multimodal Benchmark for Evaluating Large\n Vision-Language Models Towards Multitask AGI","summary":" Large Vision-Language Models (LVLMs) show significant strides in\ngeneral-purpose multimodal applications such as visual dialogue and embodied\nnavigation. However, existing multimodal evaluation benchmarks cover a limited\nnumber of multimodal tasks testing rudimentary capabilities, falling short in\ntracking LVLM development. In this study, we present MMT-Bench, a comprehensive\nbenchmark designed to assess LVLMs across massive multimodal tasks requiring\nexpert knowledge and deliberate visual recognition, localization, reasoning,\nand planning. MMT-Bench comprises $31,325$ meticulously curated multi-choice\nvisual questions from various multimodal scenarios such as vehicle driving and\nembodied navigation, covering $32$ core meta-tasks and $162$ subtasks in\nmultimodal understanding. Due to its extensive task coverage, MMT-Bench enables\nthe evaluation of LVLMs using a task map, facilitating the discovery of in- and\nout-of-domain tasks. Evaluation results involving $30$ LVLMs such as the\nproprietary GPT-4V, GeminiProVision, and open-sourced InternVL-Chat, underscore\nthe significant challenges posed by MMT-Bench. We anticipate that MMT-Bench\nwill inspire the community to develop next-generation multimodal foundation\nmodels aimed at achieving general-purpose multimodal intelligence.\n","authors":["Kaining Ying","Fanqing Meng","Jin Wang","Zhiqian Li","Han Lin","Yue Yang","Hao Zhang","Wenbo Zhang","Yuqi Lin","Shuo Liu","Jiayi Lei","Quanfeng Lu","Runjian Chen","Peng Xu","Renrui Zhang","Haozhe Zhang","Peng Gao","Yali Wang","Yu Qiao","Ping Luo","Kaipeng Zhang","Wenqi Shao"],"pdf_url":"https://arxiv.org/pdf/2404.16006v1.pdf","comment":"77 pages, 41 figures"},{"id":"http://arxiv.org/abs/2404.16000v1","updated":"2024-04-24T17:27:57Z","published":"2024-04-24T17:27:57Z","title":"A comprehensive and easy-to-use multi-domain multi-task medical imaging\n meta-dataset (MedIMeta)","summary":" While the field of medical image analysis has undergone a transformative\nshift with the integration of machine learning techniques, the main challenge\nof these techniques is often the scarcity of large, diverse, and well-annotated\ndatasets. Medical images vary in format, size, and other parameters and\ntherefore require extensive preprocessing and standardization, for usage in\nmachine learning. Addressing these challenges, we introduce the Medical Imaging\nMeta-Dataset (MedIMeta), a novel multi-domain, multi-task meta-dataset.\nMedIMeta contains 19 medical imaging datasets spanning 10 different domains and\nencompassing 54 distinct medical tasks, all of which are standardized to the\nsame format and readily usable in PyTorch or other ML frameworks. We perform a\ntechnical validation of MedIMeta, demonstrating its utility through fully\nsupervised and cross-domain few-shot learning baselines.\n","authors":["Stefano Woerner","Arthur Jaques","Christian F. 
Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2404.16000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02649v4","updated":"2024-04-24T17:10:34Z","published":"2023-04-03T20:19:56Z","title":"Specialty-Oriented Generalist Medical AI for Chest CT Screening","summary":" Modern medical records include a vast amount of multimodal free text clinical\ndata and imaging data from radiology, cardiology, and digital pathology. Fully\nmining such big data requires multitasking; otherwise, occult but important\naspects may be overlooked, adversely affecting clinical management and\npopulation healthcare. Despite remarkable successes of AI in individual tasks\nwith single-modal data, the progress in developing generalist medical AI\nremains relatively slow to combine multimodal data for multitasks because of\nthe dual challenges of data curation and model architecture. The data challenge\ninvolves querying and curating multimodal structured and unstructured text,\nalphanumeric, and especially 3D tomographic scans on an individual patient\nlevel for real-time decisions and on a scale to estimate population health\nstatistics. The model challenge demands a scalable and adaptable network\narchitecture to integrate multimodal datasets for diverse clinical tasks. Here\nwe propose the first-of-its-kind medical multimodal-multitask foundation model\n(M3FM) with application in lung cancer screening and related tasks. After we\ncurated a comprehensive multimodal multitask dataset consisting of 49 clinical\ndata types including 163,725 chest CT series and 17 medical tasks involved in\nLCS, we develop a multimodal question-answering framework as a unified training\nand inference strategy to synergize multimodal information and perform multiple\ntasks via free-text prompting. M3FM consistently outperforms the\nstate-of-the-art single-modal task-specific models, identifies multimodal data\nelements informative for clinical tasks and flexibly adapts to new tasks with a\nsmall out-of-distribution dataset. As a specialty-oriented generalist medical\nAI model, M3FM paves the way for similar breakthroughs in other areas of\nmedicine, closing the gap between specialists and the generalist.\n","authors":["Chuang Niu","Qing Lyu","Christopher D. Carothers","Parisa Kaviani","Josh Tan","Pingkun Yan","Mannudeep K. Kalra","Christopher T. Whitlow","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2304.02649v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11669v3","updated":"2024-04-24T17:07:43Z","published":"2024-04-17T18:08:00Z","title":"Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis","summary":" Designing a 3D representation of a dynamic scene for fast optimization and\nrendering is a challenging task. While recent explicit representations enable\nfast learning and rendering of dynamic radiance fields, they require a dense\nset of input viewpoints. In this work, we focus on learning a fast\nrepresentation for dynamic radiance fields with sparse input viewpoints.\nHowever, the optimization with sparse input is under-constrained and\nnecessitates the use of motion priors to constrain the learning. Existing fast\ndynamic scene models do not explicitly model the motion, making them difficult\nto be constrained with motion priors. We design an explicit motion model as a\nfactorized 4D representation that is fast and can exploit the spatio-temporal\ncorrelation of the motion field. 
We then introduce reliable flow priors\nincluding a combination of sparse flow priors across cameras and dense flow\npriors within cameras to regularize our motion model. Our model is fast,\ncompact and achieves very good performance on popular multi-view dynamic scene\ndatasets with sparse input viewpoints. The source code for our model can be\nfound on our project page:\nhttps://nagabhushansn95.github.io/publications/2024/RF-DeRF.html.\n","authors":["Nagabhushan Somraj","Kapil Choudhary","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.11669v3.pdf","comment":"Accepted at SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.15992v1","updated":"2024-04-24T17:06:52Z","published":"2024-04-24T17:06:52Z","title":"HDDGAN: A Heterogeneous Dual-Discriminator Generative Adversarial\n Network for Infrared and Visible Image Fusion","summary":" Infrared and visible image fusion (IVIF) aims to preserve thermal radiation\ninformation from infrared images while integrating texture details from visible\nimages, enabling the capture of important features and hidden details of\nsubjects in complex scenes and disturbed environments. Consequently, IVIF\noffers distinct advantages in practical applications such as video\nsurveillance, night navigation, and target recognition. However, prevailing\nmethods often face challenges in simultaneously capturing thermal region\nfeatures and detailed information due to the disparate characteristics of\ninfrared and visible images. Consequently, fusion outcomes frequently entail a\ncompromise between thermal target area information and texture details. In this\nstudy, we introduce a novel heterogeneous dual-discriminator generative\nadversarial network (HDDGAN) to address this issue. Specifically, the generator\nis structured as a multi-scale skip-connected structure, facilitating the\nextraction of essential features from different source images. To enhance the\ninformation representation ability of the fusion result, an attention mechanism\nis employed to construct the information fusion layer within the generator,\nleveraging the disparities between the source images. Moreover, recognizing the\ndistinct learning requirements of information in infrared and visible images,\nwe design two discriminators with differing structures. This approach aims to\nguide the model to learn salient information from infrared images while\nsimultaneously capturing detailed information from visible images. Extensive\nexperiments conducted on various public datasets demonstrate the superiority of\nour proposed HDDGAN over other state-of-the-art (SOTA) algorithms, highlighting\nits enhanced potential for practical applications.\n","authors":["Guosheng Lu","Zile Fang","Chunming He","Zhigang Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.15992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15979v1","updated":"2024-04-24T16:54:39Z","published":"2024-04-24T16:54:39Z","title":"On the Fourier analysis in the SO(3) space : EquiLoPO Network","summary":" Analyzing volumetric data with rotational invariance or equivariance is an\nactive topic in current research. Existing deep-learning approaches utilize\neither group convolutional networks limited to discrete rotations or steerable\nconvolutional networks with constrained filter structures. 
This work proposes a\nnovel equivariant neural network architecture that achieves analytical\nEquivariance to Local Pattern Orientation on the continuous SO(3) group while\nallowing unconstrained trainable filters - EquiLoPO Network. Our key\ninnovations are a group convolutional operation leveraging irreducible\nrepresentations as the Fourier basis and a local activation function in the\nSO(3) space that provides a well-defined mapping from input to output\nfunctions, preserving equivariance. By integrating these operations into a\nResNet-style architecture, we propose a model that overcomes the limitations of\nprior methods. A comprehensive evaluation on diverse 3D medical imaging\ndatasets from MedMNIST3D demonstrates the effectiveness of our approach, which\nconsistently outperforms state of the art. This work suggests the benefits of\ntrue rotational equivariance on SO(3) and flexible unconstrained filters\nenabled by the local activation function, providing a flexible framework for\nequivariant deep learning on volumetric data with potential applications across\ndomains. Our code is publicly available at\n\\url{https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPO/-/tree/main/EquiLoPO}.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2404.15979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17222v2","updated":"2024-04-24T16:38:37Z","published":"2023-03-30T08:36:48Z","title":"LatentForensics: Towards frugal deepfake detection in the StyleGAN\n latent space","summary":" The classification of forged videos has been a challenge for the past few\nyears. Deepfake classifiers can now reliably predict whether or not video\nframes have been tampered with. However, their performance is tied to both the\ndataset used for training and the analyst's computational power. We propose a\ndeepfake detection method that operates in the latent space of a\nstate-of-the-art generative adversarial network (GAN) trained on high-quality\nface images. The proposed method leverages the structure of the latent space of\nStyleGAN to learn a lightweight binary classification model. Experimental\nresults on standard datasets reveal that the proposed approach outperforms\nother state-of-the-art deepfake classification methods, especially in contexts\nwhere the data available to train the models is rare, such as when a new\nmanipulation method is introduced. To the best of our knowledge, this is the\nfirst study showing the interest of the latent space of StyleGAN for deepfake\nclassification. Combined with other recent studies on the interpretation and\nmanipulation of this latent space, we believe that the proposed approach can\nfurther help in developing frugal deepfake classification methods based on\ninterpretable high-level properties of face images.\n","authors":["Matthieu Delmas","Amine Kacete","Stephane Paquelet","Simon Leglaive","Renaud Seguier"],"pdf_url":"https://arxiv.org/pdf/2303.17222v2.pdf","comment":"7 pages, 3 figures, 5 tables, submitted to IPAI 2024"},{"id":"http://arxiv.org/abs/2404.14414v2","updated":"2024-04-24T16:36:47Z","published":"2024-02-28T19:07:49Z","title":"Removing Reflections from RAW Photos","summary":" We describe a system to remove real-world reflections from images for\nconsumer photography. 
Our system operates on linear (RAW) photos, with the\n(optional) addition of a contextual photo looking in the opposite direction,\ne.g., using the selfie camera on a mobile device, which helps disambiguate what\nshould be considered the reflection. The system is trained using synthetic\nmixtures of real-world RAW images, which are combined using a reflection\nsimulation that is photometrically and geometrically accurate. Our system\nconsists of a base model that accepts the captured photo and optional\ncontextual photo as input, and runs at 256p, followed by an up-sampling model\nthat transforms output 256p images to full resolution. The system can produce\nimages for review at 1K in 4.5 to 6.5 seconds on a MacBook or iPhone 14 Pro. We\ntest on RAW photos that were captured in the field and embody typical consumer\nphotographs.\n","authors":["Eric Kee","Adam Pikielny","Kevin Blackburn-Matzen","Marc Levoy"],"pdf_url":"https://arxiv.org/pdf/2404.14414v2.pdf","comment":"14 pages plus 22 pages of supplemental material"},{"id":"http://arxiv.org/abs/2404.15956v1","updated":"2024-04-24T16:23:34Z","published":"2024-04-24T16:23:34Z","title":"A Survey on Visual Mamba","summary":" State space models (SSMs) with selection mechanisms and hardware-aware\narchitectures, namely Mamba, have recently demonstrated significant promise in\nlong-sequence modeling. Since the self-attention mechanism in transformers has\nquadratic complexity with image size and increasing computational demands, the\nresearchers are now exploring how to adapt Mamba for computer vision tasks.\nThis paper is the first comprehensive survey aiming to provide an in-depth\nanalysis of Mamba models in the field of computer vision. It begins by\nexploring the foundational concepts contributing to Mamba's success, including\nthe state space model framework, selection mechanisms, and hardware-aware\ndesign. Next, we review these vision mamba models by categorizing them into\nfoundational ones and enhancing them with techniques such as convolution,\nrecurrence, and attention to improve their sophistication. We further delve\ninto the widespread applications of Mamba in vision tasks, which include their\nuse as a backbone in various levels of vision processing. This encompasses\ngeneral visual tasks, Medical visual tasks (e.g., 2D / 3D segmentation,\nclassification, and image registration, etc.), and Remote Sensing visual tasks.\nWe specially introduce general visual tasks from two levels: High/Mid-level\nvision (e.g., Object detection, Segmentation, Video classification, etc.) and\nLow-level vision (e.g., Image super-resolution, Image restoration, Visual\ngeneration, etc.). We hope this endeavor will spark additional interest within\nthe community to address current challenges and further apply Mamba models in\ncomputer vision.\n","authors":["Hanwei Zhang","Ying Zhu","Dan Wang","Lijun Zhang","Tianxiang Chen","Zi Ye"],"pdf_url":"https://arxiv.org/pdf/2404.15956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15955v1","updated":"2024-04-24T16:19:31Z","published":"2024-04-24T16:19:31Z","title":"Beyond Deepfake Images: Detecting AI-Generated Videos","summary":" Recent advances in generative AI have led to the development of techniques to\ngenerate visually realistic synthetic video. While a number of techniques have\nbeen developed to detect AI-generated synthetic images, in this paper we show\nthat synthetic image detectors are unable to detect synthetic videos. 
We\ndemonstrate that this is because synthetic video generators introduce\nsubstantially different traces than those left by image generators. Despite\nthis, we show that synthetic video traces can be learned, and used to perform\nreliable synthetic video detection or generator source attribution even after\nH.264 re-compression. Furthermore, we demonstrate that while detecting videos\nfrom new generators through zero-shot transferability is challenging, accurate\ndetection of videos from a new generator can be achieved through few-shot\nlearning.\n","authors":["Danial Samadi Vahdati","Tai D. Nguyen","Aref Azizpour","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2404.15955v1.pdf","comment":"To be published in CVPRW24"},{"id":"http://arxiv.org/abs/2404.15946v1","updated":"2024-04-24T16:07:31Z","published":"2024-04-24T16:07:31Z","title":"Mammo-CLIP: Leveraging Contrastive Language-Image Pre-training (CLIP)\n for Enhanced Breast Cancer Diagnosis with Multi-view Mammography","summary":" Although fusion of information from multiple views of mammograms plays an\nimportant role to increase accuracy of breast cancer detection, developing\nmulti-view mammograms-based computer-aided diagnosis (CAD) schemes still faces\nchallenges and no such CAD schemes have been used in clinical practice. To\novercome the challenges, we investigate a new approach based on Contrastive\nLanguage-Image Pre-training (CLIP), which has sparked interest across various\nmedical imaging tasks. By solving the challenges in (1) effectively adapting\nthe single-view CLIP for multi-view feature fusion and (2) efficiently\nfine-tuning this parameter-dense model with limited samples and computational\nresources, we introduce Mammo-CLIP, the first multi-modal framework to process\nmulti-view mammograms and corresponding simple texts. Mammo-CLIP uses an early\nfeature fusion strategy to learn multi-view relationships in four mammograms\nacquired from the CC and MLO views of the left and right breasts. To enhance\nlearning efficiency, plug-and-play adapters are added into CLIP image and text\nencoders for fine-tuning parameters and limiting updates to about 1% of the\nparameters. For framework evaluation, we assembled two datasets\nretrospectively. The first dataset, comprising 470 malignant and 479 benign\ncases, was used for few-shot fine-tuning and internal evaluation of the\nproposed Mammo-CLIP via 5-fold cross-validation. The second dataset, including\n60 malignant and 294 benign cases, was used to test generalizability of\nMammo-CLIP. Study results show that Mammo-CLIP outperforms the state-of-art\ncross-view transformer in AUC (0.841 vs. 0.817, 0.837 vs. 0.807) on both\ndatasets. It also surpasses previous two CLIP-based methods by 20.3% and 14.3%.\nThis study highlights the potential of applying the finetuned vision-language\nmodels for developing next-generation, image-text-based CAD schemes of breast\ncancer.\n","authors":["Xuxin Chen","Yuheng Li","Mingzhe Hu","Ella Salari","Xiaoqian Chen","Richard L. J. Qiu","Bin Zheng","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.15946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01163v2","updated":"2024-04-24T16:01:25Z","published":"2024-01-02T11:46:42Z","title":"NU-Class Net: A Novel Deep Learning-based Approach for Video Quality\n Enhancement","summary":" Video content has experienced a surge in popularity, asserting its dominance\nover internet traffic and Internet of Things (IoT) networks. 
Video compression\nhas long been regarded as the primary means of efficiently managing the\nsubstantial multimedia traffic generated by video-capturing devices.\nNevertheless, video compression algorithms entail significant computational\ndemands in order to achieve substantial compression ratios. This complexity\npresents a formidable challenge when implementing efficient video coding\nstandards in resource-constrained embedded systems, such as IoT edge node\ncameras. To tackle this challenge, this paper introduces NU-Class Net, an\ninnovative deep-learning model designed to mitigate compression artifacts\nstemming from lossy compression codecs. This enhancement significantly elevates\nthe perceptible quality of low-bit-rate videos. By employing the NU-Class Net,\nthe video encoder within the video-capturing node can reduce output quality,\nthereby generating low-bit-rate videos and effectively curtailing both\ncomputation and bandwidth requirements at the edge. On the decoder side, which\nis typically less encumbered by resource limitations, NU-Class Net is applied\nafter the video decoder to compensate for artifacts and approximate the quality\nof the original video. Experimental results affirm the efficacy of the proposed\nmodel in enhancing the perceptible quality of videos, especially those streamed\nat low bit rates.\n","authors":["Parham Zilouchian Moghaddam","Mehdi Modarressi","Mohammad Amin Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2401.01163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05726v2","updated":"2024-04-24T15:38:48Z","published":"2024-04-08T17:59:24Z","title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video\n Understanding","summary":" With the success of large language models (LLMs), integrating the vision\nmodel into LLMs to build vision-language foundation models has gained much more\ninterest recently. However, existing LLM-based large multimodal models (e.g.,\nVideo-LLaMA, VideoChat) can only take in a limited number of frames for short\nvideo understanding. In this study, we mainly focus on designing an efficient\nand effective model for long-term video understanding. Instead of trying to\nprocess more frames simultaneously like most existing work, we propose to\nprocess videos in an online manner and store past video information in a memory\nbank. This allows our model to reference historical video content for long-term\nanalysis without exceeding LLMs' context length constraints or GPU memory\nlimits. Our memory bank can be seamlessly integrated into current multimodal\nLLMs in an off-the-shelf manner. We conduct extensive experiments on various\nvideo understanding tasks, such as long-video understanding, video question\nanswering, and video captioning, and our model can achieve state-of-the-art\nperformances across multiple datasets. Code available at\nhttps://boheumd.github.io/MA-LMM/.\n","authors":["Bo He","Hengduo Li","Young Kyun Jang","Menglin Jia","Xuefei Cao","Ashish Shah","Abhinav Shrivastava","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.05726v2.pdf","comment":"Accepted at CVPR 2024. Project Page https://boheumd.github.io/MA-LMM/"},{"id":"http://arxiv.org/abs/2402.18196v2","updated":"2024-04-24T15:16:56Z","published":"2024-02-28T09:36:22Z","title":"NToP: NeRF-Powered Large-scale Dataset Generation for 2D and 3D Human\n Pose Estimation in Top-View Fisheye Images","summary":" Human pose estimation (HPE) in the top-view using fisheye cameras presents a\npromising and innovative application domain. 
However, the availability of\ndatasets capturing this viewpoint is extremely limited, especially those with\nhigh-quality 2D and 3D keypoint annotations. Addressing this gap, we leverage\nthe capabilities of Neural Radiance Fields (NeRF) technique to establish a\ncomprehensive pipeline for generating human pose datasets from existing 2D and\n3D datasets, specifically tailored for the top-view fisheye perspective.\nThrough this pipeline, we create a novel dataset NToP570K (NeRF-powered\nTop-view human Pose dataset for fisheye cameras with over 570 thousand images),\nand conduct an extensive evaluation of its efficacy in enhancing neural\nnetworks for 2D and 3D top-view human pose estimation. A pretrained ViTPose-B\nmodel achieves an improvement in AP of 33.3 % on our validation set for 2D HPE\nafter finetuning on our training set. A similarly finetuned HybrIK-Transformer\nmodel gains 53.7 mm reduction in PA-MPJPE for 3D HPE on the validation set.\n","authors":["Jingrui Yu","Dipankar Nandi","Roman Seidel","Gangolf Hirtz"],"pdf_url":"https://arxiv.org/pdf/2402.18196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15919v1","updated":"2024-04-24T15:16:06Z","published":"2024-04-24T15:16:06Z","title":"An Element-Wise Weights Aggregation Method for Federated Learning","summary":" Federated learning (FL) is a powerful Machine Learning (ML) paradigm that\nenables distributed clients to collaboratively learn a shared global model\nwhile keeping the data on the original device, thereby preserving privacy. A\ncentral challenge in FL is the effective aggregation of local model weights\nfrom disparate and potentially unbalanced participating clients. Existing\nmethods often treat each client indiscriminately, applying a single proportion\nto the entire local model. However, it is empirically advantageous for each\nweight to be assigned a specific proportion. This paper introduces an\ninnovative Element-Wise Weights Aggregation Method for Federated Learning\n(EWWA-FL) aimed at optimizing learning performance and accelerating convergence\nspeed. Unlike traditional FL approaches, EWWA-FL aggregates local weights to\nthe global model at the level of individual elements, thereby allowing each\nparticipating client to make element-wise contributions to the learning\nprocess. By taking into account the unique dataset characteristics of each\nclient, EWWA-FL enhances the robustness of the global model to different\ndatasets while also achieving rapid convergence. The method is flexible enough\nto employ various weighting strategies. Through comprehensive experiments, we\ndemonstrate the advanced capabilities of EWWA-FL, showing significant\nimprovements in both accuracy and convergence speed across a range of backbones\nand benchmarks.\n","authors":["Yi Hu","Hanchi Ren","Chen Hu","Jingjing Deng","Xianghua Xie"],"pdf_url":"https://arxiv.org/pdf/2404.15919v1.pdf","comment":"2023 IEEE International Conference on Data Mining Workshops (ICDMW)"},{"id":"http://arxiv.org/abs/2404.15918v1","updated":"2024-04-24T15:12:25Z","published":"2024-04-24T15:12:25Z","title":"Perception and Localization of Macular Degeneration Applying\n Convolutional Neural Network, ResNet and Grad-CAM","summary":" A well-known retinal disease that feels blurry visions to the affected\npatients is Macular Degeneration. This research is based on classifying the\nhealthy and macular degeneration fundus with localizing the affected region of\nthe fundus. 
A CNN architecture and CNN with ResNet architecture (ResNet50,\nResNet50v2, ResNet101, ResNet101v2, ResNet152, ResNet152v2) as the backbone are\nused to classify the two types of fundus. The data are split into three\ncategories including (a) Training set is 90% and Testing set is 10% (b)\nTraining set is 80% and Testing set is 20%, (c) Training set is 50% and Testing\nset is 50%. After the training, the best model has been selected from the\nevaluation metrics. Among the models, CNN with backbone of ResNet50 performs\nbest which gives the training accuracy of 98.7\\% for 90\\% train and 10\\% test\ndata split. With this model, we have performed the Grad-CAM visualization to\nget the region of affected area of fundus.\n","authors":["Tahmim Hossain","Sagor Chandro Bakchy"],"pdf_url":"https://arxiv.org/pdf/2404.15918v1.pdf","comment":"12 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.10135v3","updated":"2024-04-24T15:07:25Z","published":"2023-07-19T17:00:45Z","title":"A Hierarchical Architecture for Neural Materials","summary":" Neural reflectance models are capable of reproducing the spatially-varying\nappearance of many real-world materials at different scales. Unfortunately,\nexisting techniques such as NeuMIP have difficulties handling materials with\nstrong shadowing effects or detailed specular highlights. In this paper, we\nintroduce a neural appearance model that offers a new level of accuracy.\nCentral to our model is an inception-based core network structure that captures\nmaterial appearances at multiple scales using parallel-operating kernels and\nensures multi-stage features through specialized convolution layers.\nFurthermore, we encode the inputs into frequency space, introduce a\ngradient-based loss, and employ it adaptive to the progress of the learning\nphase. We demonstrate the effectiveness of our method using a variety of\nsynthetic and real examples.\n","authors":["Bowen Xue","Shuang Zhao","Henrik Wann Jensen","Zahra Montazeri"],"pdf_url":"https://arxiv.org/pdf/2307.10135v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15909v1","updated":"2024-04-24T15:03:53Z","published":"2024-04-24T15:03:53Z","title":"Learning Long-form Video Prior via Generative Pre-Training","summary":" Concepts involved in long-form videos such as people, objects, and their\ninteractions, can be viewed as following an implicit prior. They are notably\ncomplex and continue to pose challenges to be comprehensively learned. In\nrecent years, generative pre-training (GPT) has exhibited versatile capacities\nin modeling any kind of text content even visual locations. Can this manner\nwork for learning long-form video prior? Instead of operating on pixel space,\nit is efficient to employ visual locations like bounding boxes and keypoints to\nrepresent key information in videos, which can be simply discretized and then\ntokenized for consumption by GPT. Due to the scarcity of suitable data, we\ncreate a new dataset called \\textbf{Storyboard20K} from movies to serve as a\nrepresentative. It includes synopses, shot-by-shot keyframes, and fine-grained\nannotations of film sets and characters with consistent IDs, bounding boxes,\nand whole body keypoints. In this way, long-form videos can be represented by a\nset of tokens and be learned via generative pre-training. Experimental results\nvalidate that our approach has great potential for learning long-form video\nprior. 
Code and data will be released at\n\\url{https://github.com/showlab/Long-form-Video-Prior}.\n","authors":["Jinheng Xie","Jiajun Feng","Zhaoxu Tian","Kevin Qinghong Lin","Yawen Huang","Xi Xia","Nanxu Gong","Xu Zuo","Jiaqi Yang","Yefeng Zheng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.15909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15903v1","updated":"2024-04-24T14:57:37Z","published":"2024-04-24T14:57:37Z","title":"Drawing the Line: Deep Segmentation for Extracting Art from Ancient\n Etruscan Mirrors","summary":" Etruscan mirrors constitute a significant category within Etruscan art and,\ntherefore, undergo systematic examinations to obtain insights into ancient\ntimes. A crucial aspect of their analysis involves the labor-intensive task of\nmanually tracing engravings from the backside. Additionally, this task is\ninherently challenging due to the damage these mirrors have sustained,\nintroducing subjectivity into the process. We address these challenges by\nautomating the process through photometric-stereo scanning in conjunction with\ndeep segmentation networks which, however, requires effective usage of the\nlimited data at hand. We accomplish this by incorporating predictions on a\nper-patch level, and various data augmentations, as well as exploring\nself-supervised learning. Compared to our baseline, we improve predictive\nperformance w.r.t. the pseudo-F-Measure by around 16%. When assessing\nperformance on complete mirrors against a human baseline, our approach yields\nquantitative similar performance to a human annotator and significantly\noutperforms existing binarization methods. With our proposed methodology, we\nstreamline the annotation process, enhance its objectivity, and reduce overall\nworkload, offering a valuable contribution to the examination of these\nhistorical artifacts and other non-traditional documents.\n","authors":["Rafael Sterzinger","Simon Brenner","Robert Sablatnig"],"pdf_url":"https://arxiv.org/pdf/2404.15903v1.pdf","comment":"19 pages, accepted at ICDAR2024"},{"id":"http://arxiv.org/abs/2403.19612v3","updated":"2024-04-24T14:26:52Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPO/-/tree/main/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15889v1","updated":"2024-04-24T14:24:57Z","published":"2024-04-24T14:24:57Z","title":"Sketch2Human: Deep Human Generation with Disentangled Geometry and\n Appearance Control","summary":" Geometry- and appearance-controlled full-body human image generation is an\ninteresting but challenging task. Existing solutions are either unconditional\nor dependent on coarse conditions (e.g., pose, text), thus lacking explicit\ngeometry and appearance control of body and garment. Sketching offers such\nediting ability and has been adopted in various sketch-based face generation\nand editing solutions. However, directly adapting sketch-based face generation\nto full-body generation often fails to produce high-fidelity and diverse\nresults due to the high complexity and diversity in the pose, body shape, and\ngarment shape and texture. Recent geometrically controllable diffusion-based\nmethods mainly rely on prompts to generate appearance and it is hard to balance\nthe realism and the faithfulness of their results to the sketch when the input\nis coarse. This work presents Sketch2Human, the first system for controllable\nfull-body human image generation guided by a semantic sketch (for geometry\ncontrol) and a reference image (for appearance control). Our solution is based\non the latent space of StyleGAN-Human with inverted geometry and appearance\nlatent codes as input. Specifically, we present a sketch encoder trained with a\nlarge synthetic dataset sampled from StyleGAN-Human's latent space and directly\nsupervised by sketches rather than real images. Considering the entangled\ninformation of partial geometry and texture in StyleGAN-Human and the absence\nof disentangled datasets, we design a novel training scheme that creates\ngeometry-preserved and appearance-transferred training data to tune a generator\nto achieve disentangled geometry and appearance control. Although our method is\ntrained with synthetic data, it can handle hand-drawn sketches as well.\nQualitative and quantitative evaluations demonstrate the superior performance\nof our method to state-of-the-art methods.\n","authors":["Linzi Qu","Jiaxiang Shang","Hui Ye","Xiaoguang Han","Hongbo Fu"],"pdf_url":"https://arxiv.org/pdf/2404.15889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06279v2","updated":"2024-04-24T14:15:27Z","published":"2024-04-09T13:02:33Z","title":"NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural\n Cellular Automata","summary":" Neural Cellular Automata (NCA) is a class of Cellular Automata where the\nupdate rule is parameterized by a neural network that can be trained using\ngradient descent. In this paper, we focus on NCA models used for texture\nsynthesis, where the update rule is inspired by partial differential equations\n(PDEs) describing reaction-diffusion systems. To train the NCA model, the\nspatio-termporal domain is discretized, and Euler integration is used to\nnumerically simulate the PDE. However, whether a trained NCA truly learns the\ncontinuous dynamic described by the corresponding PDE or merely overfits the\ndiscretization used in training remains an open question. We study NCA models\nat the limit where space-time discretization approaches continuity. 
We find\nthat existing NCA models tend to overfit the training discretization,\nespecially in the proximity of the initial condition, also called \"seed\". To\naddress this, we propose a solution that utilizes uniform noise as the initial\ncondition. We demonstrate the effectiveness of our approach in preserving the\nconsistency of NCA dynamics across a wide range of spatio-temporal\ngranularities. Our improved NCA model enables two new test-time interactions by\nallowing continuous control over the speed of pattern formation and the scale\nof the synthesized patterns. We demonstrate this new NCA feature in our\ninteractive online demo. Our work reveals that NCA models can learn continuous\ndynamics and opens new venues for NCA research from a dynamical systems'\nperspective.\n","authors":["Ehsan Pajouheshgar","Yitao Xu","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06279v2.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.15881v1","updated":"2024-04-24T13:51:56Z","published":"2024-04-24T13:51:56Z","title":"Steal Now and Attack Later: Evaluating Robustness of Object Detection\n against Black-box Adversarial Attacks","summary":" Latency attacks against object detection represent a variant of adversarial\nattacks that aim to inflate the inference time by generating additional ghost\nobjects in a target image. However, generating ghost objects in the black-box\nscenario remains a challenge since information about these unqualified objects\nremains opaque. In this study, we demonstrate the feasibility of generating\nghost objects in adversarial examples by extending the concept of \"steal now,\ndecrypt later\" attacks. These adversarial examples, once produced, can be\nemployed to exploit potential vulnerabilities in the AI service, giving rise to\nsignificant security concerns. The experimental results demonstrate that the\nproposed attack achieves successful attacks across various commonly used models\nand Google Vision API without any prior knowledge about the target model.\nAdditionally, the average cost of each attack is less than \\$ 1 dollars, posing\na significant threat to AI security.\n","authors":["Erh-Chung Chen","Pin-Yu Chen","I-Hsin Chung","Che-Rung Lee"],"pdf_url":"https://arxiv.org/pdf/2404.15881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15879v1","updated":"2024-04-24T13:48:38Z","published":"2024-04-24T13:48:38Z","title":"Revisiting Out-of-Distribution Detection in LiDAR-based 3D Object\n Detection","summary":" LiDAR-based 3D object detection has become an essential part of automated\ndriving due to its ability to localize and classify objects precisely in 3D.\nHowever, object detectors face a critical challenge when dealing with unknown\nforeground objects, particularly those that were not present in their original\ntraining data. These out-of-distribution (OOD) objects can lead to\nmisclassifications, posing a significant risk to the safety and reliability of\nautomated vehicles. Currently, LiDAR-based OOD object detection has not been\nwell studied. We address this problem by generating synthetic training data for\nOOD objects by perturbing known object categories. Our idea is that these\nsynthetic OOD objects produce different responses in the feature map of an\nobject detector compared to in-distribution (ID) objects. We then extract\nfeatures using a pre-trained and fixed object detector and train a simple\nmultilayer perceptron (MLP) to classify each detection as either ID or OOD. 
In\naddition, we propose a new evaluation protocol that allows the use of existing\ndatasets without modifying the point cloud, ensuring a more authentic\nevaluation of real-world scenarios. The effectiveness of our method is\nvalidated through experiments on the newly proposed nuScenes OOD benchmark. The\nsource code is available at https://github.com/uulm-mrm/mmood3d.\n","authors":["Michael Kösel","Marcel Schreiber","Michael Ulrich","Claudius Gläser","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2404.15879v1.pdf","comment":"Accepted for publication at the 2024 35th IEEE Intelligent Vehicles\n Symposium (IV 2024), June 2-5, 2024, in Jeju Island, Korea"},{"id":"http://arxiv.org/abs/2404.16080v1","updated":"2024-04-24T13:23:03Z","published":"2024-04-24T13:23:03Z","title":"Enhancing Diagnosis through AI-driven Analysis of Reflectance Confocal\n Microscopy","summary":" Reflectance Confocal Microscopy (RCM) is a non-invasive imaging technique\nused in biomedical research and clinical dermatology. It provides virtual\nhigh-resolution images of the skin and superficial tissues, reducing the need\nfor physical biopsies. RCM employs a laser light source to illuminate the\ntissue, capturing the reflected light to generate detailed images of\nmicroscopic structures at various depths. Recent studies explored AI and\nmachine learning, particularly CNNs, for analyzing RCM images. Our study\nproposes a segmentation strategy based on textural features to identify\nclinically significant regions, empowering dermatologists in effective image\ninterpretation and boosting diagnostic confidence. This approach promises to\nadvance dermatological diagnosis and treatment.\n","authors":["Hong-Jun Yoon","Chris Keum","Alexander Witkowski","Joanna Ludzik","Tracy Petrie","Heidi A. Hanson","Sancy A. Leachman"],"pdf_url":"https://arxiv.org/pdf/2404.16080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09329v4","updated":"2024-04-24T13:22:08Z","published":"2024-02-14T17:18:15Z","title":"YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture\n Detection","summary":" Wrist trauma and even fractures occur frequently in daily life, particularly\namong children who account for a significant proportion of fracture cases.\nBefore performing surgery, surgeons often request patients to undergo X-ray\nimaging first and prepare for it based on the analysis of the radiologist. With\nthe development of neural networks, You Only Look Once (YOLO) series models\nhave been widely used in fracture detection as computer-assisted diagnosis\n(CAD). In 2023, Ultralytics presented the latest version of the YOLO models,\nwhich has been employed for detecting fractures across various parts of the\nbody. Attention mechanism is one of the hottest methods to improve the model\nperformance. This research work proposes YOLOv8-AM, which incorporates the\nattention mechanism into the original YOLOv8 architecture. Specifically, we\nrespectively employ four attention modules, Convolutional Block Attention\nModule (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention\n(ECA), and Shuffle Attention (SA), to design the improved models and train them\non GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean\nAverage Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock +\nCBAM (ResCBAM) increased from 63.6% to 65.8%, which achieves the\nstate-of-the-art (SOTA) performance. 
Conversely, YOLOv8-AM model incorporating\nGAM obtains the mAP 50 value of 64.2%, which is not a satisfactory enhancement.\nTherefore, we combine ResBlock and GAM, introducing ResGAM to design another\nnew YOLOv8-AM model, whose mAP 50 value is increased to 65.0%. The\nimplementation code for this study is available on GitHub at\nhttps://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.\n","authors":["Chun-Tse Chien","Rui-Yang Ju","Kuang-Yi Chou","Enkaer Xieerke","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2402.09329v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05370v3","updated":"2024-04-24T13:19:55Z","published":"2023-04-11T17:24:31Z","title":"Overload: Latency Attacks on Object Detection for Edge Devices","summary":" Nowadays, the deployment of deep learning-based applications is an essential\ntask owing to the increasing demands on intelligent services. In this paper, we\ninvestigate latency attacks on deep learning applications. Unlike common\nadversarial attacks for misclassification, the goal of latency attacks is to\nincrease the inference time, which may stop applications from responding to the\nrequests within a reasonable time. This kind of attack is ubiquitous for\nvarious applications, and we use object detection to demonstrate how such kind\nof attacks work. We also design a framework named Overload to generate latency\nattacks at scale. Our method is based on a newly formulated optimization\nproblem and a novel technique, called spatial attention. This attack serves to\nescalate the required computing costs during the inference time, consequently\nleading to an extended inference time for object detection. It presents a\nsignificant threat, especially to systems with limited computing resources. We\nconducted experiments using YOLOv5 models on Nvidia NX. Compared to existing\nmethods, our method is simpler and more effective. The experimental results\nshow that with latency attacks, the inference time of a single image can be\nincreased ten times longer in reference to the normal setting. Moreover, our\nfindings pose a potential new threat to all object detection tasks requiring\nnon-maximum suppression (NMS), as our attack is NMS-agnostic.\n","authors":["Erh-Chung Chen","Pin-Yu Chen","I-Hsin Chung","Che-rung Lee"],"pdf_url":"https://arxiv.org/pdf/2304.05370v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00971v2","updated":"2024-04-24T13:01:22Z","published":"2024-02-01T19:40:39Z","title":"FuseFormer: A Transformer for Visual and Thermal Image Fusion","summary":" Due to the lack of a definitive ground truth for the image fusion problem,\nthe loss functions are structured based on evaluation metrics, such as the\nstructural similarity index measure (SSIM). However, in doing so, a bias is\nintroduced toward the SSIM and, consequently, the input visual band image. The\nobjective of this study is to propose a novel methodology for the image fusion\nproblem that mitigates the limitations associated with using classical\nevaluation metrics as loss functions. Our approach integrates a\ntransformer-based multi-scale fusion strategy that adeptly addresses local and\nglobal context information. This integration not only refines the individual\ncomponents of the image fusion process but also significantly enhances the\noverall efficacy of the method. Our proposed method follows a two-stage\ntraining approach, where an auto-encoder is initially trained to extract deep\nfeatures at multiple scales in the first stage. 
For the second stage, we\nintegrate our fusion block and change the loss function as mentioned. The\nmulti-scale features are fused using a combination of Convolutional Neural\nNetworks (CNNs) and Transformers. The CNNs are utilized to capture local\nfeatures, while the Transformer handles the integration of general context\nfeatures. Through extensive experiments on various benchmark datasets, our\nproposed method, along with the novel loss function definition, demonstrates\nsuperior performance compared to other competitive fusion algorithms.\n","authors":["Aytekin Erdogan","Erdem Akagündüz"],"pdf_url":"https://arxiv.org/pdf/2402.00971v2.pdf","comment":"8 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.15851v1","updated":"2024-04-24T12:59:54Z","published":"2024-04-24T12:59:54Z","title":"Porting Large Language Models to Mobile Devices for Question Answering","summary":" Deploying Large Language Models (LLMs) on mobile devices makes all the\ncapabilities of natural language processing available on the device. An\nimportant use case of LLMs is question answering, which can provide accurate\nand contextually relevant answers to a wide array of user queries. We describe\nhow we managed to port state of the art LLMs to mobile devices, enabling them\nto operate natively on the device. We employ the llama.cpp framework, a\nflexible and self-contained C++ framework for LLM inference. We selected a\n6-bit quantized version of the Orca-Mini-3B model with 3 billion parameters and\npresent the correct prompt format for this model. Experimental results show\nthat LLM inference runs in interactive speed on a Galaxy S21 smartphone and\nthat the model delivers high-quality answers to user queries related to\nquestions from different subjects like politics, geography or history.\n","authors":["Hannes Fassold"],"pdf_url":"https://arxiv.org/pdf/2404.15851v1.pdf","comment":"Accepted for ASPAI 2024 Conference"},{"id":"http://arxiv.org/abs/2404.15847v1","updated":"2024-04-24T12:52:43Z","published":"2024-04-24T12:52:43Z","title":"3D Freehand Ultrasound using Visual Inertial and Deep Inertial Odometry\n for Measuring Patellar Tracking","summary":" Patellofemoral joint (PFJ) issues affect one in four people, with 20%\nexperiencing chronic knee pain despite treatment. Poor outcomes and pain after\nknee replacement surgery are often linked to patellar mal-tracking. Traditional\nimaging methods like CT and MRI face challenges, including cost and metal\nartefacts, and there's currently no ideal way to observe joint motion without\nissues such as soft tissue artefacts or radiation exposure. A new system to\nmonitor joint motion could significantly improve understanding of PFJ dynamics,\naiding in better patient care and outcomes. Combining 2D ultrasound with motion\ntracking for 3D reconstruction of the joint using semantic segmentation and\nposition registration can be a solution. However, the need for expensive\nexternal infrastructure to estimate the trajectories of the scanner remains the\nmain limitation to implementing 3D bone reconstruction from handheld ultrasound\nscanning clinically. We proposed the Visual-Inertial Odometry (VIO) and the\ndeep learning-based inertial-only odometry methods as alternatives to motion\ncapture for tracking a handheld ultrasound scanner. The 3D reconstruction\ngenerated by these methods has demonstrated potential for assessing the PFJ and\nfor further measurements from free-hand ultrasound scans. 
The results show that\nthe VIO method performs as well as the motion capture method, with average\nreconstruction errors of 1.25 mm and 1.21 mm, respectively. The VIO method is\nthe first infrastructure-free method for 3D reconstruction of bone from\nwireless handheld ultrasound scanning with an accuracy comparable to methods\nthat require external infrastructure.\n","authors":["Russell Buchanan","S. Jack Tu","Marco Camurri","Stephen J. Mellon","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.15847v1.pdf","comment":"Accepted to IEEE Medical Measurements & Applications (MeMeA) 2024"},{"id":"http://arxiv.org/abs/2404.11864v2","updated":"2024-04-24T12:36:10Z","published":"2024-04-18T02:40:31Z","title":"Progressive Multi-modal Conditional Prompt Tuning","summary":" Pre-trained vision-language models (VLMs) have shown remarkable\ngeneralization capabilities via prompting, which leverages VLMs as knowledge\nbases to extract information beneficial for downstream tasks. However, existing\nmethods primarily employ uni-modal prompting, which only engages a uni-modal\nbranch, failing to simultaneously adjust vision-language (V-L) features.\nAdditionally, the one-pass forward pipeline in VLM encoding struggles to align\nV-L features that have a huge gap. Confronting these challenges, we propose a\nnovel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT).\nProMPT exploits a recurrent structure, optimizing and aligning V-L features by\niteratively utilizing image and current encoding information. It comprises an\ninitialization and a multi-modal iterative evolution (MIE) module.\nInitialization is responsible for encoding images and text using a VLM,\nfollowed by a feature filter that selects text features similar to image. MIE\nthen facilitates multi-modal prompting through class-conditional vision\nprompting, instance-conditional text prompting, and feature filtering. In each\nMIE iteration, vision prompts are obtained from filtered text features via a\nvision generator, promoting image features to focus more on target object\nduring vision prompting. The encoded image features are fed into a text\ngenerator to produce text prompts that are more robust to class shifts. Thus,\nV-L features are progressively aligned, enabling advance from coarse to exact\nprediction. Extensive experiments are conducted in three settings to evaluate\nthe efficacy of ProMPT. The results indicate that ProMPT outperforms existing\nmethods on average across all settings, demonstrating its superior\ngeneralization and robustness. Code is available at\nhttps://github.com/qiuxiaoyu9954/ProMPT.\n","authors":["Xiaoyu Qiu","Hao Feng","Yuechen Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.11864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20210v4","updated":"2024-04-24T12:26:46Z","published":"2023-10-31T06:19:09Z","title":"UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale\n Transformer","summary":" Underwater images often exhibit poor quality, distorted color balance and low\ncontrast due to the complex and intricate interplay of light, water, and\nobjects. Despite the significant contributions of previous underwater\nenhancement techniques, there exist several problems that demand further\nimprovement: (i) The current deep learning methods rely on Convolutional Neural\nNetworks (CNNs) that lack the multi-scale enhancement, and global perception\nfield is also limited. 
(ii) The scarcity of paired real-world underwater\ndatasets poses a significant challenge, and the utilization of synthetic image\npairs could lead to overfitting. To address the aforementioned problems, this\npaper introduces a Multi-scale Transformer-based Network called UWFormer for\nenhancing images at multiple frequencies via semi-supervised learning, in which\nwe propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale\nFusion Feed-forward Network for low-frequency enhancement. Besides, we\nintroduce a special underwater semi-supervised training strategy, where we\npropose a Subaqueous Perceptual Loss function to generate reliable pseudo\nlabels. Experiments using full-reference and non-reference underwater\nbenchmarks demonstrate that our method outperforms state-of-the-art methods in\nterms of both quantity and visual quality.\n","authors":["Weiwen Chen","Yingtie Lei","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2310.20210v4.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2107.13931v2","updated":"2024-04-24T12:20:54Z","published":"2021-07-29T12:30:39Z","title":"Learning Geometry-Guided Depth via Projective Modeling for Monocular 3D\n Object Detection","summary":" As a crucial task of autonomous driving, 3D object detection has made great\nprogress in recent years. However, monocular 3D object detection remains a\nchallenging problem due to the unsatisfactory performance in depth estimation.\nMost existing monocular methods typically directly regress the scene depth\nwhile ignoring important relationships between the depth and various geometric\nelements (e.g. bounding box sizes, 3D object dimensions, and object poses). In\nthis paper, we propose to learn geometry-guided depth estimation with\nprojective modeling to advance monocular 3D object detection. Specifically, a\nprincipled geometry formula with projective modeling of 2D and 3D depth\npredictions in the monocular 3D object detection network is devised. We further\nimplement and embed the proposed formula to enable geometry-aware deep\nrepresentation learning, allowing effective 2D and 3D interactions for boosting\nthe depth estimation. Moreover, we provide a strong baseline through addressing\nsubstantial misalignment between 2D annotation and projected boxes to ensure\nrobust learning with the proposed geometric formula. Experiments on the KITTI\ndataset show that our method remarkably improves the detection performance of\nthe state-of-the-art monocular-based method without extra data by 2.80% on the\nmoderate test setting. The model and code will be released at\nhttps://github.com/YinminZhang/MonoGeo.\n","authors":["Yinmin Zhang","Xinzhu Ma","Shuai Yi","Jun Hou","Zhihui Wang","Wanli Ouyang","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2107.13931v2.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.15817v1","updated":"2024-04-24T11:41:28Z","published":"2024-04-24T11:41:28Z","title":"Vision Transformer-based Adversarial Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) aims to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. 
The most recent UDA\nmethods always resort to adversarial training to yield state-of-the-art results\nand a dominant number of existing UDA methods employ convolutional neural\nnetworks (CNNs) as feature extractors to learn domain invariant features.\nVision transformer (ViT) has attracted tremendous attention since its emergence\nand has been widely used in various computer vision tasks, such as image\nclassification, object detection, and semantic segmentation, yet its potential\nin adversarial domain adaptation has never been investigated. In this paper, we\nfill this gap by employing the ViT as the feature extractor in adversarial\ndomain adaptation. Moreover, we empirically demonstrate that ViT can be a\nplug-and-play component in adversarial domain adaptation, which means directly\nreplacing the CNN-based feature extractor in existing UDA methods with the\nViT-based feature extractor can easily obtain performance improvement. The code\nis available at https://github.com/LluckyYH/VT-ADA.\n","authors":["Yahan Li","Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15817v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.15815v1","updated":"2024-04-24T11:36:37Z","published":"2024-04-24T11:36:37Z","title":"Single-View Scene Point Cloud Human Grasp Generation","summary":" In this work, we explore a novel task of generating human grasps based on\nsingle-view scene point clouds, which more accurately mirrors the typical\nreal-world situation of observing objects from a single viewpoint. Due to the\nincompleteness of object point clouds and the presence of numerous scene\npoints, the generated hand is prone to penetrating into the invisible parts of\nthe object and the model is easily affected by scene points. Thus, we introduce\nS2HGrasp, a framework composed of two key modules: the Global Perception module\nthat globally perceives partial object point clouds, and the DiffuGrasp module\ndesigned to generate high-quality human grasps based on complex inputs that\ninclude scene points. Additionally, we introduce S2HGD dataset, which comprises\napproximately 99,000 single-object single-view scene point clouds of 1,668\nunique objects, each annotated with one human grasp. Our extensive experiments\ndemonstrate that S2HGrasp can not only generate natural human grasps regardless\nof scene points, but also effectively prevent penetration between the hand and\ninvisible parts of the object. Moreover, our model showcases strong\ngeneralization capability when applied to unseen objects. Our code and dataset\nare available at https://github.com/iSEE-Laboratory/S2HGrasp.\n","authors":["Yan-Kang Wang","Chengyi Xing","Yi-Lin Wei","Xiao-Ming Wu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.15815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15812v1","updated":"2024-04-24T11:26:47Z","published":"2024-04-24T11:26:47Z","title":"Facilitating Advanced Sentinel-2 Analysis Through a Simplified\n Computation of Nadir BRDF Adjusted Reflectance","summary":" The Sentinel-2 (S2) mission from the European Space Agency's Copernicus\nprogram provides essential data for Earth surface analysis. Its Level-2A\nproducts deliver high-to-medium resolution (10-60 m) surface reflectance (SR)\ndata through the MultiSpectral Instrument (MSI). To enhance the accuracy and\ncomparability of SR data, adjustments simulating a nadir viewing perspective\nare essential. 
These corrections address the anisotropic nature of SR and the\nvariability in sun and observation angles, ensuring consistent image\ncomparisons over time and under different conditions. The $c$-factor method, a\nsimple yet effective algorithm, adjusts observed S2 SR by using the MODIS BRDF\nmodel to achieve Nadir BRDF Adjusted Reflectance (NBAR). Despite the\nstraightforward application of the $c$-factor to individual images, a cohesive\nPython framework for its application across multiple S2 images and Earth System\nData Cubes (ESDCs) from cloud-stored data has been lacking. Here we introduce\nsen2nbar, a Python package crafted to convert S2 SR data to NBAR, supporting\nboth individual images and ESDCs derived from cloud-stored data. This package\nsimplifies the conversion of S2 SR data to NBAR via a single function,\norganized into modules for efficient process management. By facilitating NBAR\nconversion for both SAFE files and ESDCs from SpatioTemporal Asset Catalogs\n(STAC), sen2nbar is developed as a flexible tool that can handle diverse data\nformat requirements. We anticipate that sen2nbar will considerably contribute\nto the standardization and harmonization of S2 data, offering a robust solution\nfor a diverse range of users across various applications. sen2nbar is an\nopen-source tool available at https://github.com/ESDS-Leipzig/sen2nbar.\n","authors":["David Montero","Miguel D. Mahecha","César Aybar","Clemens Mosig","Sebastian Wieneke"],"pdf_url":"https://arxiv.org/pdf/2404.15812v1.pdf","comment":"Submitted to FOSS4G Europe 2024"},{"id":"http://arxiv.org/abs/2305.10874v4","updated":"2024-04-24T11:22:00Z","published":"2023-05-18T11:06:15Z","title":"Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation","summary":" With the explosive popularity of AI-generated content (AIGC), video\ngeneration has recently received a lot of attention. Generating videos guided\nby text instructions poses significant challenges, such as modeling the complex\nrelationship between space and time, and the lack of large-scale text-video\npaired data. Existing text-video datasets suffer from limitations in both\ncontent quality and scale, or they are not open-source, rendering them\ninaccessible for study and use. For model design, previous approaches extend\npretrained text-to-image generation models by adding temporal 1D\nconvolution/attention modules for video generation. However, these approaches\noverlook the importance of jointly modeling space and time, inevitably leading\nto temporal distortions and misalignment between texts and videos. In this\npaper, we propose a novel approach that strengthens the interaction between\nspatial and temporal perceptions. In particular, we utilize a swapped\ncross-attention mechanism in 3D windows that alternates the \"query\" role\nbetween spatial and temporal blocks, enabling mutual reinforcement for each\nother. Moreover, to fully unlock model capabilities for high-quality video\ngeneration and promote the development of the field, we curate a large-scale\nand open-source video dataset called HD-VG-130M. This dataset comprises 130\nmillion text-video pairs from the open-domain, ensuring high-definition,\nwidescreen and watermark-free characters. A smaller-scale yet more meticulously\ncleaned subset further enhances the data quality, aiding models in achieving\nsuperior performance. 
Experimental quantitative and qualitative results\ndemonstrate the superiority of our approach in terms of per-frame quality,\ntemporal correlation, and text-video alignment, with clear margins.\n","authors":["Wenjing Wang","Huan Yang","Zixi Tuo","Huiguo He","Junchen Zhu","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10874v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13277v2","updated":"2024-04-24T11:06:06Z","published":"2024-04-20T05:24:06Z","title":"Beyond Score Changes: Adversarial Attack on No-Reference Image Quality\n Assessment from Two Perspectives","summary":" Deep neural networks have demonstrated impressive success in No-Reference\nImage Quality Assessment (NR-IQA). However, recent researches highlight the\nvulnerability of NR-IQA models to subtle adversarial perturbations, leading to\ninconsistencies between model predictions and subjective ratings. Current\nadversarial attacks, however, focus on perturbing predicted scores of\nindividual images, neglecting the crucial aspect of inter-score correlation\nrelationships within an entire image set. Meanwhile, it is important to note\nthat the correlation, like ranking correlation, plays a significant role in\nNR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we\nintroduce a new framework of correlation-error-based attacks that perturb both\nthe correlation within an image set and score changes on individual images. Our\nresearch primarily focuses on ranking-related correlation metrics like\nSpearman's Rank-Order Correlation Coefficient (SROCC) and prediction\nerror-related metrics like Mean Squared Error (MSE). As an instantiation, we\npropose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes\ntarget attack scores for the entire image set and then generates adversarial\nexamples guided by these scores. Experimental results demonstrate that our SMA\nmethod not only significantly disrupts the SROCC to negative values but also\nmaintains a considerable change in the scores of individual images. Meanwhile,\nit exhibits state-of-the-art performance across metrics with different\ncategories. Our method provides a new perspective on the robustness of NR-IQA\nmodels.\n","authors":["Chenxi Yang","Yujia Liu","Dingquan Li","Yan Zhong","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.13277v2.pdf","comment":"Submitted to a conference"},{"id":"http://arxiv.org/abs/2404.15802v1","updated":"2024-04-24T11:02:13Z","published":"2024-04-24T11:02:13Z","title":"Raformer: Redundancy-Aware Transformer for Video Wire Inpainting","summary":" Video Wire Inpainting (VWI) is a prominent application in video inpainting,\naimed at flawlessly removing wires in films or TV series, offering significant\ntime and labor savings compared to manual frame-by-frame removal. However, wire\nremoval poses greater challenges due to the wires being longer and slimmer than\nobjects typically targeted in general video inpainting tasks, and often\nintersecting with people and background objects irregularly, which adds\ncomplexity to the inpainting process. Recognizing the limitations posed by\nexisting video wire datasets, which are characterized by their small size, poor\nquality, and limited variety of scenes, we introduce a new VWI dataset with a\nnovel mask generation strategy, namely Wire Removal Video Dataset 2 (WRV2) and\nPseudo Wire-Shaped (PWS) Masks. 
WRV2 dataset comprises over 4,000 videos with\nan average length of 80 frames, designed to facilitate the development and\nefficacy of inpainting models. Building upon this, our research proposes the\nRedundancy-Aware Transformer (Raformer) method that addresses the unique\nchallenges of wire removal in video inpainting. Unlike conventional approaches\nthat indiscriminately process all frame patches, Raformer employs a novel\nstrategy to selectively bypass redundant parts, such as static background\nsegments devoid of valuable information for inpainting. At the core of Raformer\nis the Redundancy-Aware Attention (RAA) module, which isolates and accentuates\nessential content through a coarse-grained, window-based attention mechanism.\nThis is complemented by a Soft Feature Alignment (SFA) module, which refines\nthese features and achieves end-to-end feature alignment. Extensive experiments\non both the traditional video inpainting datasets and our proposed WRV2 dataset\ndemonstrate that Raformer outperforms other state-of-the-art methods.\n","authors":["Zhong Ji","Yimu Su","Yan Zhang","Jiacheng Hou","Yanwei Pang","Jungong Han"],"pdf_url":"https://arxiv.org/pdf/2404.15802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08555v3","updated":"2024-04-24T10:50:24Z","published":"2023-01-19T11:02:44Z","title":"Hybrid Open-set Segmentation with Synthetic Negative Data","summary":" Open-set segmentation can be conceived by complementing closed-set\nclassification with anomaly detection. Many of the existing dense anomaly\ndetectors operate through generative modelling of regular data or by\ndiscriminating with respect to negative data. These two approaches optimize\ndifferent objectives and therefore exhibit different failure modes.\nConsequently, we propose a novel anomaly score that fuses generative and\ndiscriminative cues. Our score can be implemented by upgrading any closed-set\nsegmentation model with dense estimates of dataset posterior and unnormalized\ndata likelihood. The resulting dense hybrid open-set models require negative\ntraining images that can be sampled from an auxiliary negative dataset, from a\njointly trained generative model, or from a mixture of both sources. We\nevaluate our contributions on benchmarks for dense anomaly detection and\nopen-set segmentation. The experiments reveal strong open-set performance in\nspite of negligible computational overhead.\n","authors":["Matej Grcić","Siniša Šegvić"],"pdf_url":"https://arxiv.org/pdf/2301.08555v3.pdf","comment":"Published in IEEE TPAMI"},{"id":"http://arxiv.org/abs/2312.09222v2","updated":"2024-04-24T10:34:45Z","published":"2023-12-14T18:52:52Z","title":"Mosaic-SDF for 3D Generative Models","summary":" Current diffusion or flow-based generative models for 3D shapes divide to\ntwo: distilling pre-trained 2D image diffusion models, and training directly on\n3D shapes. When training a diffusion or flow models on 3D shapes a crucial\ndesign choice is the shape representation. An effective shape representation\nneeds to adhere three design principles: it should allow an efficient\nconversion of large 3D datasets to the representation form; it should provide a\ngood tradeoff of approximation power versus number of parameters; and it should\nhave a simple tensorial form that is compatible with existing powerful neural\narchitectures. While standard 3D shape representations such as volumetric grids\nand point clouds do not adhere to all these principles simultaneously, we\nadvocate in this paper a new representation that does. 
We introduce Mosaic-SDF\n(M-SDF): a simple 3D shape representation that approximates the Signed Distance\nFunction (SDF) of a given shape by using a set of local grids spread near the\nshape's boundary. The M-SDF representation is fast to compute for each shape\nindividually making it readily parallelizable; it is parameter efficient as it\nonly covers the space around the shape's boundary; and it has a simple matrix\nform, compatible with Transformer-based architectures. We demonstrate the\nefficacy of the M-SDF representation by using it to train a 3D generative flow\nmodel including class-conditioned generation with the 3D Warehouse dataset, and\ntext-to-3D generation using a dataset of about 600k caption-shape pairs.\n","authors":["Lior Yariv","Omri Puny","Natalia Neverova","Oran Gafni","Yaron Lipman"],"pdf_url":"https://arxiv.org/pdf/2312.09222v2.pdf","comment":"More results and details can be found at\n https://lioryariv.github.io/msdf"},{"id":"http://arxiv.org/abs/2404.12966v2","updated":"2024-04-24T10:33:26Z","published":"2024-04-19T15:53:27Z","title":"Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of\n Multi-modal Large Language Models","summary":" Counterfactual reasoning, as a crucial manifestation of human intelligence,\nrefers to making presuppositions based on established facts and extrapolating\npotential outcomes. Existing multimodal large language models (MLLMs) have\nexhibited impressive cognitive and reasoning capabilities, which have been\nexamined across a wide range of Visual Question Answering (VQA) benchmarks.\nNevertheless, how will existing MLLMs perform when faced with counterfactual\nquestions? To answer this question, we first curate a novel\n\\textbf{C}ounter\\textbf{F}actual \\textbf{M}ulti\\textbf{M}odal reasoning\nbenchmark, abbreviated as \\textbf{CFMM}, to systematically assess the\ncounterfactual reasoning capabilities of MLLMs. Our CFMM comprises six\nchallenging tasks, each including hundreds of carefully human-labeled\ncounterfactual questions, to evaluate MLLM's counterfactual reasoning\ncapabilities across diverse aspects. Through experiments, interestingly, we\nfind that existing MLLMs prefer to believe what they see, but ignore the\ncounterfactual presuppositions presented in the question, thereby leading to\ninaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs\non our proposed CFMM. The significant gap between their performance on our CFMM\nand that on several VQA benchmarks indicates that there is still considerable\nroom for improvement in existing MLLMs toward approaching human-level\nintelligence. On the other hand, through boosting MLLMs performances on our\nCFMM in the future, potential avenues toward developing MLLMs with advanced\nintelligence can be explored.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.12966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15790v1","updated":"2024-04-24T10:30:42Z","published":"2024-04-24T10:30:42Z","title":"Leveraging Large Language Models for Multimodal Search","summary":" Multimodal search has become increasingly important in providing users with a\nnatural and effective way to ex-press their search intentions. Images offer\nfine-grained details of the desired products, while text allows for easily\nincorporating search modifications. However, some existing multimodal search\nsystems are unreliable and fail to address simple queries. 
The problem becomes\nharder with the large variability of natural language text queries, which may\ncontain ambiguous, implicit, and irrelevant in-formation. Addressing these\nissues may require systems with enhanced matching capabilities, reasoning\nabilities, and context-aware query parsing and rewriting. This paper introduces\na novel multimodal search model that achieves a new performance milestone on\nthe Fashion200K dataset. Additionally, we propose a novel search interface\nintegrating Large Language Models (LLMs) to facilitate natural language\ninteraction. This interface routes queries to search systems while\nconversationally engaging with users and considering previous searches. When\ncoupled with our multimodal search model, it heralds a new era of shopping\nassistants capable of offering human-like interaction and enhancing the overall\nsearch experience.\n","authors":["Oriol Barbany","Michael Huang","Xinliang Zhu","Arnab Dhua"],"pdf_url":"https://arxiv.org/pdf/2404.15790v1.pdf","comment":"Published at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.15789v1","updated":"2024-04-24T10:28:54Z","published":"2024-04-24T10:28:54Z","title":"MotionMaster: Training-free Camera Motion Transfer For Video Generation","summary":" The emergence of diffusion models has greatly propelled the progress in image\nand video generation. Recently, some efforts have been made in controllable\nvideo generation, including text-to-video generation and video motion control,\namong which camera motion control is an important topic. However, existing\ncamera motion control methods rely on training a temporal camera module, and\nnecessitate substantial computation resources due to the large amount of\nparameters in video generation models. Moreover, existing methods pre-define\ncamera motion types during training, which limits their flexibility in camera\ncontrol. Therefore, to reduce training costs and achieve flexible camera\ncontrol, we propose COMD, a novel training-free video motion transfer model,\nwhich disentangles camera motions and object motions in source videos and\ntransfers the extracted camera motions to new videos. We first propose a\none-shot camera motion disentanglement method to extract camera motion from a\nsingle source video, which separates the moving objects from the background and\nestimates the camera motion in the moving objects region based on the motion in\nthe background by solving a Poisson equation. Furthermore, we propose a\nfew-shot camera motion disentanglement method to extract the common camera\nmotion from multiple videos with similar camera motions, which employs a\nwindow-based clustering technique to extract the common features in temporal\nattention maps of multiple videos. Finally, we propose a motion combination\nmethod to combine different types of camera motions together, enabling our\nmodel a more controllable and flexible camera control. 
Extensive experiments\ndemonstrate that our training-free approach can effectively decouple\ncamera-object motion and apply the decoupled camera motion to a wide range of\ncontrollable video generation tasks, achieving flexible and diverse camera\nmotion control.\n","authors":["Teng Hu","Jiangning Zhang","Ran Yi","Yating Wang","Hongrui Huang","Jieyu Weng","Yabiao Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.15789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15786v1","updated":"2024-04-24T10:19:25Z","published":"2024-04-24T10:19:25Z","title":"Rethinking Model Prototyping through the MedMNIST+ Dataset Collection","summary":" The integration of deep learning based systems in clinical practice is often\nimpeded by challenges rooted in limited and heterogeneous medical datasets. In\naddition, prioritization of marginal performance improvements on a few,\nnarrowly scoped benchmarks over clinical applicability has slowed down\nmeaningful algorithmic progress. This trend often results in excessive\nfine-tuning of existing methods to achieve state-of-the-art performance on\nselected datasets rather than fostering clinically relevant innovations. In\nresponse, this work presents a comprehensive benchmark for the MedMNIST+\ndatabase to diversify the evaluation landscape and conduct a thorough analysis\nof common convolutional neural networks (CNNs) and Transformer-based\narchitectures, for medical image classification. Our evaluation encompasses\nvarious medical datasets, training methodologies, and input resolutions, aiming\nto reassess the strengths and limitations of widely used model variants. Our\nfindings suggest that computationally efficient training schemes and modern\nfoundation models hold promise in bridging the gap between expensive end-to-end\ntraining and more resource-refined approaches. Additionally, contrary to\nprevailing assumptions, we observe that higher resolutions may not consistently\nimprove performance beyond a certain threshold, advocating for the use of lower\nresolutions, particularly in prototyping stages, to expedite processing.\nNotably, our analysis reaffirms the competitiveness of convolutional models\ncompared to ViT-based architectures emphasizing the importance of comprehending\nthe intrinsic capabilities of different model architectures. Moreover, we hope\nthat our standardized evaluation framework will help enhance transparency,\nreproducibility, and comparability on the MedMNIST+ dataset collection as well\nas future research within the field. Code will be released soon.\n","authors":["Sebastian Doerrich","Francesco Di Salvo","Julius Brockmann","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2404.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15785v1","updated":"2024-04-24T10:17:13Z","published":"2024-04-24T10:17:13Z","title":"Seeing Beyond Classes: Zero-Shot Grounded Situation Recognition via\n Language Explainer","summary":" Benefiting from strong generalization ability, pre-trained vision language\nmodels (VLMs), e.g., CLIP, have been widely utilized in zero-shot scene\nunderstanding. Unlike simple recognition tasks, grounded situation recognition\n(GSR) requires the model not only to classify salient activity (verb) in the\nimage, but also to detect all semantic roles that participate in the action.\nThis complex task usually involves three steps: verb recognition, semantic role\ngrounding, and noun recognition. 
Directly employing class-based prompts with\nVLMs and grounding models for this task suffers from several limitations, e.g.,\nit struggles to distinguish ambiguous verb concepts, accurately localize roles\nwith fixed verb-centric template1 input, and achieve context-aware noun\npredictions. In this paper, we argue that these limitations stem from the\nmode's poor understanding of verb/noun classes. To this end, we introduce a new\napproach for zero-shot GSR via Language EXplainer (LEX), which significantly\nboosts the model's comprehensive capabilities through three explainers: 1) verb\nexplainer, which generates general verb-centric descriptions to enhance the\ndiscriminability of different verb classes; 2) grounding explainer, which\nrephrases verb-centric templates for clearer understanding, thereby enhancing\nprecise semantic role localization; and 3) noun explainer, which creates\nscene-specific noun descriptions to ensure context-aware noun recognition. By\nequipping each step of the GSR process with an auxiliary explainer, LEX\nfacilitates complex scene understanding in real-world scenarios. Our extensive\nvalidations on the SWiG dataset demonstrate LEX's effectiveness and\ninteroperability in zero-shot GSR.\n","authors":["Jiaming Lei","Lin Li","Chunping Wang","Jun Xiao","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11573v2","updated":"2024-04-24T10:09:46Z","published":"2023-08-22T17:23:00Z","title":"G3Reg: Pyramid Graph-based Global Registration using Gaussian Ellipsoid\n Model","summary":" This study introduces a novel framework, G3Reg, for fast and robust global\nregistration of LiDAR point clouds. In contrast to conventional complex\nkeypoints and descriptors, we extract fundamental geometric primitives,\nincluding planes, clusters, and lines (PCL) from the raw point cloud to obtain\nlow-level semantic segments. Each segment is represented as a unified Gaussian\nEllipsoid Model (GEM), using a probability ellipsoid to ensure the ground truth\ncenters are encompassed with a certain degree of probability. Utilizing these\nGEMs, we present a distrust-and-verify scheme based on a Pyramid Compatibility\nGraph for Global Registration (PAGOR). Specifically, we establish an upper\nbound, which can be traversed based on the confidence level for compatibility\ntesting to construct the pyramid graph. Then, we solve multiple maximum cliques\n(MAC) for each level of the pyramid graph, thus generating the corresponding\ntransformation candidates. In the verification phase, we adopt a precise and\nefficient metric for point cloud alignment quality, founded on geometric\nprimitives, to identify the optimal candidate. The algorithm's performance is\nvalidated on three publicly available datasets and a self-collected\nmulti-session dataset. Parameter settings remained unchanged during the\nexperiment evaluations. 
The results exhibit superior robustness and real-time\nperformance of the G3Reg framework compared to state-of-the-art methods.\nFurthermore, we demonstrate the potential for integrating individual GEM and\nPAGOR components into other registration frameworks to enhance their efficacy.\nCode: https://github.com/HKUST-Aerial-Robotics/G3Reg\n","authors":["Zhijian Qiao","Zehuan Yu","Binqian Jiang","Huan Yin","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2308.11573v2.pdf","comment":"Accepted to 2024 IEEE Transactions on Automation Science and\n Engineering (IEEE TASE)"},{"id":"http://arxiv.org/abs/2404.15781v1","updated":"2024-04-24T10:03:37Z","published":"2024-04-24T10:03:37Z","title":"Real-Time Compressed Sensing for Joint Hyperspectral Image Transmission\n and Restoration for CubeSat","summary":" This paper addresses the challenges associated with hyperspectral image (HSI)\nreconstruction from miniaturized satellites, which often suffer from stripe\neffects and are computationally resource-limited. We propose a Real-Time\nCompressed Sensing (RTCS) network designed to be lightweight and require only\nrelatively few training samples for efficient and robust HSI reconstruction in\nthe presence of the stripe effect and under noisy transmission conditions. The\nRTCS network features a simplified architecture that reduces the required\ntraining samples and allows for easy implementation on integer-8-based\nencoders, facilitating rapid compressed sensing for stripe-like HSI, which\nexactly matches the moderate design of miniaturized satellites on push broom\nscanning mechanism. This contrasts optimization-based models that demand\nhigh-precision floating-point operations, making them difficult to deploy on\nedge devices. Our encoder employs an integer-8-compatible linear projection for\nstripe-like HSI data transmission, ensuring real-time compressed sensing.\nFurthermore, based on the novel two-streamed architecture, an efficient HSI\nrestoration decoder is proposed for the receiver side, allowing for edge-device\nreconstruction without needing a sophisticated central server. This is\nparticularly crucial as an increasing number of miniaturized satellites\nnecessitates significant computing resources on the ground station. Extensive\nexperiments validate the superior performance of our approach, offering new and\nvital capabilities for existing miniaturized satellite systems.\n","authors":["Chih-Chung Hsu","Chih-Yu Jian","Eng-Shen Tu","Chia-Ming Lee","Guan-Lin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15781v1.pdf","comment":"Accepted by TGRS 2024"},{"id":"http://arxiv.org/abs/2404.15774v1","updated":"2024-04-24T09:52:36Z","published":"2024-04-24T09:52:36Z","title":"Toward Physics-Aware Deep Learning Architectures for LiDAR Intensity\n Simulation","summary":" Autonomous vehicles (AVs) heavily rely on LiDAR perception for environment\nunderstanding and navigation. LiDAR intensity provides valuable information\nabout the reflected laser signals and plays a crucial role in enhancing the\nperception capabilities of AVs. However, accurately simulating LiDAR intensity\nremains a challenge due to the unavailability of material properties of the\nobjects in the environment, and complex interactions between the laser beam and\nthe environment. The proposed method aims to improve the accuracy of intensity\nsimulation by incorporating physics-based modalities within the deep learning\nframework. 
One of the key entities that captures the interaction between the\nlaser beam and the objects is the angle of incidence. In this work we\ndemonstrate that the addition of the LiDAR incidence angle as a separate input\nto the deep neural networks significantly enhances the results. We present a\ncomparative study between two prominent deep learning architectures: U-NET a\nConvolutional Neural Network (CNN), and Pix2Pix a Generative Adversarial\nNetwork (GAN). We implemented these two architectures for the intensity\nprediction task and used SemanticKITTI and VoxelScape datasets for experiments.\nThe comparative analysis reveals that both architectures benefit from the\nincidence angle as an additional input. Moreover, the Pix2Pix architecture\noutperforms U-NET, especially when the incidence angle is incorporated.\n","authors":["Vivek Anand","Bharat Lohani","Gaurav Pandey","Rakesh Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.15774v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.15771v1","updated":"2024-04-24T09:45:12Z","published":"2024-04-24T09:45:12Z","title":"DVF: Advancing Robust and Accurate Fine-Grained Image Retrieval with\n Retrieval Guidelines","summary":" Fine-grained image retrieval (FGIR) is to learn visual representations that\ndistinguish visually similar objects while maintaining generalization. Existing\nmethods propose to generate discriminative features, but rarely consider the\nparticularity of the FGIR task itself. This paper presents a meticulous\nanalysis leading to the proposal of practical guidelines to identify\nsubcategory-specific discrepancies and generate discriminative features to\ndesign effective FGIR models. These guidelines include emphasizing the object\n(G1), highlighting subcategory-specific discrepancies (G2), and employing\neffective training strategy (G3). Following G1 and G2, we design a novel Dual\nVisual Filtering mechanism for the plain visual transformer, denoted as DVF, to\ncapture subcategory-specific discrepancies. Specifically, the dual visual\nfiltering mechanism comprises an object-oriented module and a semantic-oriented\nmodule. These components serve to magnify objects and identify discriminative\nregions, respectively. Following G3, we implement a discriminative model\ntraining strategy to improve the discriminability and generalization ability of\nDVF. Extensive analysis and ablation studies confirm the efficacy of our\nproposed guidelines. Without bells and whistles, the proposed DVF achieves\nstate-of-the-art performance on three widely-used fine-grained datasets in\nclosed-set and open-set settings.\n","authors":["Xin Jiang","Hao Tang","Rui Yan","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2404.15771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15770v1","updated":"2024-04-24T09:44:44Z","published":"2024-04-24T09:44:44Z","title":"ChEX: Interactive Localization and Region Description in Chest X-rays","summary":" Report generation models offer fine-grained textual interpretations of\nmedical images like chest X-rays, yet they often lack interactivity (i.e. the\nability to steer the generation process through user queries) and localized\ninterpretability (i.e. visually grounding their predictions), which we deem\nessential for future adoption in clinical practice. While there have been\nefforts to tackle these issues, they are either limited in their interactivity\nby not supporting textual queries or fail to also offer localized\ninterpretability. 
Therefore, we propose a novel multitask architecture and\ntraining paradigm integrating textual prompts and bounding boxes for diverse\naspects like anatomical regions and pathologies. We call this approach the\nChest X-Ray Explainer (ChEX). Evaluations across a heterogeneous set of 9 chest\nX-ray tasks, including localized image interpretation and report generation,\nshowcase its competitiveness with SOTA models while additional analysis\ndemonstrates ChEX's interactive capabilities.\n","authors":["Philip Müller","Georgios Kaissis","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2404.15770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15765v1","updated":"2024-04-24T09:37:22Z","published":"2024-04-24T09:37:22Z","title":"3D Face Morphing Attack Generation using Non-Rigid Registration","summary":" Face Recognition Systems (FRS) are widely used in commercial environments,\nsuch as e-commerce and e-banking, owing to their high accuracy in real-world\nconditions. However, these systems are vulnerable to facial morphing attacks,\nwhich are generated by blending face color images of different subjects. This\npaper presents a new method for generating 3D face morphs from two bona fide\npoint clouds. The proposed method first selects bona fide point clouds with\nneutral expressions. The two input point clouds were then registered using a\nBayesian Coherent Point Drift (BCPD) without optimization, and the geometry and\ncolor of the registered point clouds were averaged to generate a face morphing\npoint cloud. The proposed method generates 388 face-morphing point clouds from\n200 bona fide subjects. The effectiveness of the method was demonstrated\nthrough extensive vulnerability experiments, achieving a Generalized Morphing\nAttack Potential (G-MAP) of 97.93%, which is superior to the existing\nstate-of-the-art (SOTA) with a G-MAP of 81.61%.\n","authors":["Jag Mohan Singh","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2404.15765v1.pdf","comment":"Accepted to 2024 18th International Conference on Automatic Face and\n Gesture Recognition (FG) as short paper"},{"id":"http://arxiv.org/abs/2403.20168v2","updated":"2024-04-24T09:31:11Z","published":"2024-03-29T13:35:37Z","title":"Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image\n Translation","summary":" Multi-modal brain images from MRI scans are widely used in clinical diagnosis\nto provide complementary information from different modalities. However,\nobtaining fully paired multi-modal images in practice is challenging due to\nvarious factors, such as time, cost, and artifacts, resulting in\nmodality-missing brain images. To address this problem, unsupervised\nmulti-modal brain image translation has been extensively studied. Existing\nmethods suffer from the problem of brain tumor deformation during translation,\nas they fail to focus on the tumor areas when translating the whole images. In\nthis paper, we propose an unsupervised tumor-aware distillation teacher-student\nnetwork called UTAD-Net, which is capable of perceiving and translating tumor\nareas precisely. Specifically, our model consists of two parts: a teacher\nnetwork and a student network. The teacher network learns an end-to-end mapping\nfrom source to target modality using unpaired images and corresponding tumor\nmasks first. Then, the translation knowledge is distilled into the student\nnetwork, enabling it to generate more realistic tumor areas and whole images\nwithout masks. 
Experiments show that our model achieves competitive performance\non both quantitative and qualitative evaluations of image quality compared with\nstate-of-the-art methods. Furthermore, we demonstrate the effectiveness of the\ngenerated images on downstream segmentation tasks. Our code is available at\nhttps://github.com/scut-HC/UTAD-Net.\n","authors":["Chuan Huang","Jia Wei","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2403.20168v2.pdf","comment":"8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.20035v3","updated":"2024-04-24T09:17:06Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15743v1","updated":"2024-04-24T09:02:24Z","published":"2024-04-24T09:02:24Z","title":"SRAGAN: Saliency Regularized and Attended Generative Adversarial Network\n for Chinese Ink-wash Painting Generation","summary":" This paper handles the problem of converting real pictures into traditional\nChinese ink-wash paintings, i.e., Chinese ink-wash painting style transfer.\nThough this problem could be realized by a wide range of image-to-image\ntranslation models, a notable issue with all these methods is that the original\nimage content details could be easily erased or corrupted due to transfer of\nink-wash style elements. To solve or ameliorate this issue, we propose to\nincorporate saliency detection into the unpaired image-to-image translation\nframework to regularize content information of the generated paintings. 
The\nsaliency map is utilized for content regularization from two aspects, both\nexplicitly and implicitly: (\\romannumeral1) we propose saliency IOU (SIOU) loss\nto explicitly regularize saliency consistency before and after stylization;\n(\\romannumeral2) we propose saliency adaptive normalization (SANorm) which\nimplicitly enhances content integrity of the generated paintings by injecting\nsaliency information to the generator network to guide painting generation.\nBesides, we also propose saliency attended discriminator network which\nharnesses saliency mask to focus generative adversarial attention onto salient\nimage regions, it contributes to producing finer ink-wash stylization effect\nfor salient objects of images. Qualitative and quantitative experiments\nconsistently demonstrate superiority of our model over related advanced methods\nfor Chinese ink-wash painting style transfer.\n","authors":["Xiang Gao","Yuqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15743v1.pdf","comment":"25 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.15734v1","updated":"2024-04-24T08:46:25Z","published":"2024-04-24T08:46:25Z","title":"Fine-grained Spatial-temporal MLP Architecture for Metro\n Origin-Destination Prediction","summary":" Accurate prediction of metro traffic is crucial for optimizing metro\nscheduling and enhancing overall transport efficiency. Analyzing fine-grained\nand comprehensive relations among stations effectively is imperative for metro\nOrigin-Destination (OD) prediction. However, existing metro OD models either\nmix information from multiple OD pairs from the station's perspective or\nexclusively focus on a subset of OD pairs. These approaches may overlook\nfine-grained relations among OD pairs, leading to difficulties in predicting\npotential anomalous conditions. To address these challenges, we analyze traffic\nvariations from the perspective of all OD pairs and propose a fine-grained\nspatial-temporal MLP architecture for metro OD prediction, namely ODMixer.\nSpecifically, our ODMixer has double-branch structure and involves the Channel\nMixer, the Multi-view Mixer, and the Bidirectional Trend Learner. The Channel\nMixer aims to capture short-term temporal relations among OD pairs, the\nMulti-view Mixer concentrates on capturing relations from both origin and\ndestination perspectives. To model long-term temporal relations, we introduce\nthe Bidirectional Trend Learner. Extensive experiments on two large-scale metro\nOD prediction datasets HZMOD and SHMO demonstrate the advantages of our\nODMixer. The code will be available.\n","authors":["Yang Liu","Binglin Chen","Yongsen Zheng","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.15734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15721v1","updated":"2024-04-24T08:15:36Z","published":"2024-04-24T08:15:36Z","title":"SPARO: Selective Attention for Robust and Compositional Transformer\n Encodings for Vision","summary":" Selective attention helps us focus on task-relevant aspects in the constant\nflood of our sensory input. This constraint in our perception allows us to\nrobustly generalize under distractions and to new compositions of perceivable\nconcepts. 
Transformers employ a similar notion of attention in their\narchitecture, but representation learning models with transformer backbones\nlike CLIP and DINO often fail to demonstrate robustness and compositionality.\nWe highlight a missing architectural prior: unlike human perception,\ntransformer encodings do not separately attend over individual concepts. In\nresponse, we propose SPARO, a read-out mechanism that partitions encodings into\nseparately-attended slots, each produced by a single attention head. Using\nSPARO with CLIP imparts an inductive bias that the vision and text modalities\nare different views of a shared compositional world with the same corresponding\nconcepts. Using SPARO, we demonstrate improvements on downstream recognition,\nrobustness, retrieval, and compositionality benchmarks with CLIP (up to +14%\nfor ImageNet, +4% for SugarCrepe), and on nearest neighbors and linear probe\nfor ImageNet with DINO (+3% each). We also showcase a powerful ability to\nintervene and select individual SPARO concepts to further improve downstream\ntask performance (up from +4% to +9% for SugarCrepe) and use this ability to\nstudy the robustness of SPARO's representation structure. Finally, we provide\ninsights through ablation experiments and visualization of learned concepts.\n","authors":["Ankit Vani","Bac Nguyen","Samuel Lavoie","Ranjay Krishna","Aaron Courville"],"pdf_url":"https://arxiv.org/pdf/2404.15721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15718v1","updated":"2024-04-24T08:11:18Z","published":"2024-04-24T08:11:18Z","title":"Mitigating False Predictions In Unreasonable Body Regions","summary":" Despite considerable strides in developing deep learning models for 3D\nmedical image segmentation, the challenge of effectively generalizing across\ndiverse image distributions persists. While domain generalization is\nacknowledged as vital for robust application in clinical settings, the\nchallenges stemming from training with a limited Field of View (FOV) remain\nunaddressed. This limitation leads to false predictions when applied to body\nregions beyond the FOV of the training data. In response to this problem, we\npropose a novel loss function that penalizes predictions in implausible body\nregions, applicable in both single-dataset and multi-dataset training schemes.\nIt is realized with a Body Part Regression model that generates axial slice\npositional scores. Through comprehensive evaluation using a test set featuring\nvarying FOVs, our approach demonstrates remarkable improvements in\ngeneralization capabilities. It effectively mitigates false positive tumor\npredictions up to 85% and significantly enhances overall segmentation\nperformance.\n","authors":["Constantin Ulrich","Catherine Knobloch","Julius C. Holzschuh","Tassilo Wald","Maximilian R. Rokuss","Maximilian Zenk","Maximilian Fischer","Michael Baumgartner","Fabian Isensee","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2404.15718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15714v1","updated":"2024-04-24T08:07:16Z","published":"2024-04-24T08:07:16Z","title":"Ada-DF: An Adaptive Label Distribution Fusion Network For Facial\n Expression Recognition","summary":" Facial expression recognition (FER) plays a significant role in our daily\nlife. However, annotation ambiguity in the datasets could greatly hinder the\nperformance. In this paper, we address FER task via label distribution learning\nparadigm, and develop a dual-branch Adaptive Distribution Fusion (Ada-DF)\nframework. 
One auxiliary branch is constructed to obtain the label\ndistributions of samples. The class distributions of emotions are then computed\nthrough the label distributions of each emotion. Finally, those two\ndistributions are adaptively fused according to the attention weights to train\nthe target branch. Extensive experiments are conducted on three real-world\ndatasets, RAF-DB, AffectNet and SFEW, where our Ada-DF shows advantages over\nthe state-of-the-art works.\n","authors":["Shu Liu","Yan Xu","Tongming Wan","Xiaoyan Kui"],"pdf_url":"https://arxiv.org/pdf/2404.15714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15709v1","updated":"2024-04-24T07:58:28Z","published":"2024-04-24T07:58:28Z","title":"ViViDex: Learning Vision-based Dexterous Manipulation from Human Videos","summary":" In this work, we aim to learn a unified vision-based policy for a\nmulti-fingered robot hand to manipulate different objects in diverse poses.\nThough prior work has demonstrated that human videos can benefit policy\nlearning, performance improvement has been limited by physically implausible\ntrajectories extracted from videos. Moreover, reliance on privileged object\ninformation such as ground-truth object states further limits the applicability\nin realistic scenarios. To address these limitations, we propose a new\nframework ViViDex to improve vision-based policy learning from human videos. It\nfirst uses reinforcement learning with trajectory guided rewards to train\nstate-based policies for each video, obtaining both visually natural and\nphysically plausible trajectories from the video. We then rollout successful\nepisodes from state-based policies and train a unified visual policy without\nusing any privileged information. A coordinate transformation method is\nproposed to significantly boost the performance. We evaluate our method on\nthree dexterous manipulation tasks and demonstrate a large improvement over\nstate-of-the-art algorithms.\n","authors":["Zerui Chen","Shizhe Chen","Cordelia Schmid","Ivan Laptev"],"pdf_url":"https://arxiv.org/pdf/2404.15709v1.pdf","comment":"Project Page: https://zerchen.github.io/projects/vividex.html"},{"id":"http://arxiv.org/abs/2311.12268v2","updated":"2024-04-24T07:57:40Z","published":"2023-11-21T01:18:23Z","title":"Boosting Audio-visual Zero-shot Learning with Large Language Models","summary":" Audio-visual zero-shot learning aims to recognize unseen classes based on\npaired audio-visual sequences. Recent methods mainly focus on learning\nmulti-modal features aligned with class names to enhance the generalization\nability to unseen categories. However, these approaches ignore the obscure\nevent concepts in class names and may inevitably introduce complex network\nstructures with difficult training objectives. In this paper, we introduce a\nstraightforward yet efficient framework called KnowleDge-Augmented audio-visual\nlearning (KDA), which aids the model in more effectively learning novel event\ncontent by leveraging an external knowledge base. Specifically, we first\npropose to utilize the knowledge contained in large language models (LLMs) to\ngenerate numerous descriptive sentences that include important distinguishing\naudio-visual features of event classes, which helps to better understand unseen\ncategories. Furthermore, we propose a knowledge-aware adaptive margin loss to\nhelp distinguish similar events, further improving the generalization ability\ntowards unseen classes. 
Extensive experimental results demonstrate that our\nproposed KDA can outperform state-of-the-art methods on three popular\naudio-visual zero-shot learning datasets.Our code will be avaliable at\n\\url{https://github.com/chenhaoxing/KDA}.\n","authors":["Haoxing Chen","Yaohui Li","Yan Hong","Zizheng Huang","Zhuoer Xu","Zhangxuan Gu","Jun Lan","Huijia Zhu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15707v1","updated":"2024-04-24T07:56:28Z","published":"2024-04-24T07:56:28Z","title":"ESR-NeRF: Emissive Source Reconstruction Using LDR Multi-view Images","summary":" Existing NeRF-based inverse rendering methods suppose that scenes are\nexclusively illuminated by distant light sources, neglecting the potential\ninfluence of emissive sources within a scene. In this work, we confront this\nlimitation using LDR multi-view images captured with emissive sources turned on\nand off. Two key issues must be addressed: 1) ambiguity arising from the\nlimited dynamic range along with unknown lighting details, and 2) the expensive\ncomputational cost in volume rendering to backtrace the paths leading to final\nobject colors. We present a novel approach, ESR-NeRF, leveraging neural\nnetworks as learnable functions to represent ray-traced fields. By training\nnetworks to satisfy light transport segments, we regulate outgoing radiances,\nprogressively identifying emissive sources while being aware of reflection\nareas. The results on scenes encompassing emissive sources with various\nproperties demonstrate the superiority of ESR-NeRF in qualitative and\nquantitative ways. Our approach also extends its applicability to the scenes\ndevoid of emissive sources, achieving lower CD metrics on the DTU dataset.\n","authors":["Jinseo Jeong","Junseo Koo","Qimeng Zhang","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2404.15707v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15700v1","updated":"2024-04-24T07:38:14Z","published":"2024-04-24T07:38:14Z","title":"MAS-SAM: Segment Any Marine Animal with Aggregated Features","summary":" Recently, Segment Anything Model (SAM) shows exceptional performance in\ngenerating high-quality object masks and achieving zero-shot image\nsegmentation. However, as a versatile vision model, SAM is primarily trained\nwith large-scale natural light images. In underwater scenes, it exhibits\nsubstantial performance degradation due to the light scattering and absorption.\nMeanwhile, the simplicity of the SAM's decoder might lead to the loss of\nfine-grained object details. To address the above issues, we propose a novel\nfeature learning framework named MAS-SAM for marine animal segmentation, which\ninvolves integrating effective adapters into the SAM's encoder and constructing\na pyramidal decoder. More specifically, we first build a new SAM's encoder with\neffective adapters for underwater scenes. Then, we introduce a Hypermap\nExtraction Module (HEM) to generate multi-scale features for a comprehensive\nguidance. Finally, we propose a Progressive Prediction Decoder (PPD) to\naggregate the multi-scale features and predict the final segmentation results.\nWhen grafting with the Fusion Attention Module (FAM), our method enables to\nextract richer marine information from global contextual cues to fine-grained\nlocal details. Extensive experiments on four public MAS datasets demonstrate\nthat our MAS-SAM can obtain better results than other typical segmentation\nmethods. 
The source code is available at https://github.com/Drchip61/MAS-SAM.\n","authors":["Tianyu Yan","Zifu Wan","Xinhao Deng","Pingping Zhang","Yang Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.15700v1.pdf","comment":"Accepted by IJCAI2024. More modifications may be performed"},{"id":"http://arxiv.org/abs/2404.15697v1","updated":"2024-04-24T07:25:36Z","published":"2024-04-24T07:25:36Z","title":"DeepFeatureX Net: Deep Features eXtractors based Network for\n discriminating synthetic from real images","summary":" Deepfakes, synthetic images generated by deep learning algorithms, represent\none of the biggest challenges in the field of Digital Forensics. The scientific\ncommunity is working to develop approaches that can discriminate the origin of\ndigital images (real or AI-generated). However, these methodologies face the\nchallenge of generalization, that is, the ability to discern the nature of an\nimage even if it is generated by an architecture not seen during training. This\nusually leads to a drop in performance. In this context, we propose a novel\napproach based on three blocks called Base Models, each of which is responsible\nfor extracting the discriminative features of a specific image class (Diffusion\nModel-generated, GAN-generated, or real) as it is trained by exploiting\ndeliberately unbalanced datasets. The features extracted from each block are\nthen concatenated and processed to discriminate the origin of the input image.\nExperimental results showed that this approach not only demonstrates good\nrobust capabilities to JPEG compression but also outperforms state-of-the-art\nmethods in several generalization tests. Code, models and dataset are available\nat https://github.com/opontorno/block-based_deepfake-detection.\n","authors":["Orazio Pontorno","Luca Guarnera","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2404.15697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00906v3","updated":"2024-04-24T07:15:16Z","published":"2024-04-01T04:21:01Z","title":"From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with\n Vision-Language Models","summary":" Scene graph generation (SGG) aims to parse a visual scene into an\nintermediate graph representation for downstream reasoning tasks. Despite\nrecent advancements, existing methods struggle to generate scene graphs with\nnovel visual relation concepts. To address this challenge, we introduce a new\nopen-vocabulary SGG framework based on sequence generation. Our framework\nleverages vision-language pre-trained models (VLM) by incorporating an\nimage-to-graph generation paradigm. Specifically, we generate scene graph\nsequences via image-to-text generation with VLM and then construct scene graphs\nfrom these sequences. By doing so, we harness the strong capabilities of VLM\nfor open-vocabulary SGG and seamlessly integrate explicit relational modeling\nfor enhancing the VL tasks. 
Experimental results demonstrate that our design\nnot only achieves superior performance with an open vocabulary but also\nenhances downstream vision-language task performance through explicit relation\nmodeling knowledge.\n","authors":["Rongjie Li","Songyang Zhang","Dahua Lin","Kai Chen","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00906v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06731v4","updated":"2024-04-24T07:05:11Z","published":"2023-12-11T09:44:41Z","title":"Genixer: Empowering Multimodal Large Language Models as a Powerful Data\n Generator","summary":" Instruction tuning data is essential for training the Multimodal Large\nLanguage Models (MLLMs). However, the creation of high-quality instruction\ntuning data presents significant challenges. Asking the human to label the\ninstruction tuning data is label-intensive and time-consuming. Some works\nprompted to GPT-4 for data generation were not only costly but also lacked\nsatisfactory performance in complex tasks (i.e., grounding-based reasoning\ntasks). To address the challenges of data creation, we are the first to explore\nthe potential of empowering MLLMs with the ability to generate\ninstruction-tuning data by following user instructions. Specifically, we\ndeveloped an innovative data generation pipeline Genixer to generate various\nhigh-quality instruction tuning data, including nine representative tasks,\ne.g., Common VQA, REC, REG, and PointQ. Genixer provides a unified solution for\ndata generation with four key steps: (i) instruction data collection, (ii)\ninstruction template design, (iii) empowering MLLM, and (iv) data generation\nand filtering. To validate the effectiveness of generated data, we conducted\nthe human evaluation and user preference study to assess the quality of\ngenerated data. Subsequently, we generated two instruction-tuning datasets for\nthe training of two representative MLLMs, LLaVA1.5 and Shikra, and noted\nconsistent improvements across various VQA tasks and multimodal benchmarks. For\ninstance, performance on the VizWiz benchmark improved from 50.0% to 53.8%, and\non ScienceQA, it increased from 66.8% to 69.7%, reconfirming the quality of the\ngenerated instruction tuning data. The data, code, and models will be released.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.06731v4.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.15683v1","updated":"2024-04-24T06:35:56Z","published":"2024-04-24T06:35:56Z","title":"AnoFPDM: Anomaly Segmentation with Forward Process of Diffusion Models\n for Brain MRI","summary":" Weakly-supervised diffusion models (DM) in anomaly segmentation, leveraging\nimage-level labels, have attracted significant attention for their superior\nperformance compared to unsupervised methods. It eliminates the need for\npixel-level labels in training, offering a more cost-effective alternative to\nsupervised methods. However, existing methods are not fully weakly-supervised\nbecause they heavily rely on costly pixel-level labels for hyperparameter\ntuning in inference. To tackle this challenge, we introduce Anomaly\nSegmentation with Forward Process of Diffusion Models (AnoFPDM), a fully\nweakly-supervised framework that operates without the need for pixel-level\nlabels. Leveraging the unguided forward process as a reference, we identify\nsuitable hyperparameters, i.e., noise scale and threshold, for each input\nimage. 
We aggregate anomaly maps from each step in the forward process,\nenhancing the signal strength of anomalous regions. Remarkably, our proposed\nmethod outperforms recent state-of-the-art weakly-supervised approaches, even\nwithout utilizing pixel-level labels.\n","authors":["Yiming Che","Fazle Rafsani","Jay Shah","Md Mahfuzur Rahman Siddiquee","Teresa Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15677v1","updated":"2024-04-24T06:15:31Z","published":"2024-04-24T06:15:31Z","title":"CharacterFactory: Sampling Consistent Characters with GANs for Diffusion\n Models","summary":" Recent advances in text-to-image models have opened new frontiers in\nhuman-centric generation. However, these models cannot be directly employed to\ngenerate images with consistent newly coined identities. In this work, we\npropose CharacterFactory, a framework that allows sampling new characters with\nconsistent identities in the latent space of GANs for diffusion models. More\nspecifically, we consider the word embeddings of celeb names as ground truths\nfor the identity-consistent generation task and train a GAN model to learn the\nmapping from a latent space to the celeb embedding space. In addition, we\ndesign a context-consistent loss to ensure that the generated identity\nembeddings can produce identity-consistent images in various contexts.\nRemarkably, the whole model only takes 10 minutes for training, and can sample\ninfinite characters end-to-end during inference. Extensive experiments\ndemonstrate excellent performance of the proposed CharacterFactory on character\ncreation in terms of identity consistency and editability. Furthermore, the\ngenerated characters can be seamlessly combined with the off-the-shelf\nimage/video/3D diffusion models. We believe that the proposed CharacterFactory\nis an important step for identity-consistent character generation. Project page\nis available at: https://qinghew.github.io/CharacterFactory/.\n","authors":["Qinghe Wang","Baolu Li","Xiaomin Li","Bing Cao","Liqian Ma","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2404.15677v1.pdf","comment":"Code will be released very soon:\n https://github.com/qinghew/CharacterFactory"},{"id":"http://arxiv.org/abs/2404.11868v2","updated":"2024-04-24T06:05:33Z","published":"2024-04-18T02:59:48Z","title":"OPTiML: Dense Semantic Invariance Using Optimal Transport for\n Self-Supervised Medical Image Representation","summary":" Self-supervised learning (SSL) has emerged as a promising technique for\nmedical image analysis due to its ability to learn without annotations.\nHowever, despite the promising potential, conventional SSL methods encounter\nlimitations, including challenges in achieving semantic alignment and capturing\nsubtle details. This leads to suboptimal representations, which fail to\naccurately capture the underlying anatomical structures and pathological\ndetails. In response to these constraints, we introduce a novel SSL framework\nOPTiML, employing optimal transport (OT), to capture the dense semantic\ninvariance and fine-grained details, thereby enhancing the overall\neffectiveness of SSL in medical image representation learning. The core idea is\nto integrate OT with a cross-viewpoint semantics infusion module (CV-SIM),\nwhich effectively captures complex, fine-grained details inherent in medical\nimages across different viewpoints. 
In addition to the CV-SIM module, OPTiML\nimposes the variance and covariance regularizations within OT framework to\nforce the model focus on clinically relevant information while discarding less\ninformative features. Through these, the proposed framework demonstrates its\ncapacity to learn semantically rich representations that can be applied to\nvarious medical imaging tasks. To validate its effectiveness, we conduct\nexperimental studies on three publicly available datasets from chest X-ray\nmodality. Our empirical results reveal OPTiML's superiority over\nstate-of-the-art methods across all evaluated tasks.\n","authors":["Azad Singh","Vandan Gorade","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2404.11868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15672v1","updated":"2024-04-24T06:02:59Z","published":"2024-04-24T06:02:59Z","title":"Representing Part-Whole Hierarchies in Foundation Models by Learning\n Localizability, Composability, and Decomposability from Anatomy via\n Self-Supervision","summary":" Humans effortlessly interpret images by parsing them into part-whole\nhierarchies; deep learning excels in learning multi-level feature spaces, but\nthey often lack explicit coding of part-whole relations, a prominent property\nof medical imaging. To overcome this limitation, we introduce Adam-v2, a new\nself-supervised learning framework extending Adam [79] by explicitly\nincorporating part-whole hierarchies into its learning objectives through three\nkey branches: (1) Localizability, acquiring discriminative representations to\ndistinguish different anatomical patterns; (2) Composability, learning each\nanatomical structure in a parts-to-whole manner; and (3) Decomposability,\ncomprehending each anatomical structure in a whole-to-parts manner.\nExperimental results across 10 tasks, compared to 11 baselines in zero-shot,\nfew-shot transfer, and full fine-tuning settings, showcase Adam-v2's superior\nperformance over large-scale medical models and existing SSL methods across\ndiverse downstream tasks. The higher generality and robustness of Adam-v2's\nrepresentations originate from its explicit construction of hierarchies for\ndistinct anatomical structures from unlabeled medical images. Adam-v2 preserves\na semantic balance of anatomical diversity and harmony in its embedding,\nyielding representations that are both generic and semantically meaningful, yet\noverlooked in existing SSL methods. All code and pretrained models are\navailable at https://github.com/JLiangLab/Eden.\n","authors":["Mohammad Reza Hosseinzadeh Taher","Michael B. Gotway","Jianming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.15672v1.pdf","comment":"Accepted at CVPR 2024 [main conference]"},{"id":"http://arxiv.org/abs/2404.15661v1","updated":"2024-04-24T05:37:17Z","published":"2024-04-24T05:37:17Z","title":"CWF: Consolidating Weak Features in High-quality Mesh Simplification","summary":" In mesh simplification, common requirements like accuracy, triangle quality,\nand feature alignment are often considered as a trade-off. Existing algorithms\nconcentrate on just one or a few specific aspects of these requirements. For\nexample, the well-known Quadric Error Metrics (QEM) approach prioritizes\naccuracy and can preserve strong feature lines/points as well but falls short\nin ensuring high triangle quality and may degrade weak features that are not as\ndistinctive as strong ones. In this paper, we propose a smooth functional that\nsimultaneously considers all of these requirements. 
The functional comprises a\nnormal anisotropy term and a Centroidal Voronoi Tessellation (CVT) energy term,\nwith the variables being a set of movable points lying on the surface. The\nformer inherits the spirit of QEM but operates in a continuous setting, while\nthe latter encourages even point distribution, allowing various surface\nmetrics. We further introduce a decaying weight to automatically balance the\ntwo terms. We selected 100 CAD models from the ABC dataset, along with 21\norganic models, to compare the existing mesh simplification algorithms with\nours. Experimental results reveal an important observation: the introduction of\na decaying weight effectively reduces the conflict between the two terms and\nenables the alignment of weak features. This distinctive feature sets our\napproach apart from most existing mesh simplification methods and demonstrates\nsignificant potential in shape understanding.\n","authors":["Rui Xu","Longdu Liu","Ningna Wang","Shuangmin Chen","Shiqing Xin","Xiaohu Guo","Zichun Zhong","Taku Komura","Wenping Wang","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2404.15661v1.pdf","comment":"14 pages, 22 figures"},{"id":"http://arxiv.org/abs/2212.01742v2","updated":"2024-04-24T05:30:46Z","published":"2022-12-04T04:19:36Z","title":"Lightweight Facial Attractiveness Prediction Using Dual Label\n Distribution","summary":" Facial attractiveness prediction (FAP) aims to assess facial attractiveness\nautomatically based on human aesthetic perception. Previous methods using deep\nconvolutional neural networks have improved the performance, but their\nlarge-scale models have led to a deficiency in flexibility. In addition, most\nmethods fail to take full advantage of the dataset. In this paper, we present a\nnovel end-to-end FAP approach that integrates dual label distribution and\nlightweight design. The manual ratings, attractiveness score, and standard\ndeviation are aggregated explicitly to construct a dual-label distribution to\nmake the best use of the dataset, including the attractiveness distribution and\nthe rating distribution. Such distributions, as well as the attractiveness\nscore, are optimized under a joint learning framework based on the label\ndistribution learning (LDL) paradigm. The data processing is simplified to a\nminimum for a lightweight design, and MobileNetV2 is selected as our backbone.\nExtensive experiments are conducted on two benchmark datasets, where our\napproach achieves promising results and succeeds in balancing performance and\nefficiency. Ablation studies demonstrate that our delicately designed learning\nmodules are indispensable and correlated. Additionally, the visualization\nindicates that our approach can perceive facial attractiveness and capture\nattractive facial regions to facilitate semantic predictions. The code is\navailable at https://github.com/enquan/2D_FAP.\n","authors":["Shu Liu","Enquan Huang","Ziyu Zhou","Yan Xu","Xiaoyan Kui","Tao Lei","Hongying Meng"],"pdf_url":"https://arxiv.org/pdf/2212.01742v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15655v1","updated":"2024-04-24T05:20:42Z","published":"2024-04-24T05:20:42Z","title":"Multi-Modal Proxy Learning Towards Personalized Visual Multiple\n Clustering","summary":" Multiple clustering has gained significant attention in recent years due to\nits potential to reveal multiple hidden structures of data from different\nperspectives. 
The advent of deep multiple clustering techniques has notably\nadvanced the performance by uncovering complex patterns and relationships\nwithin large datasets. However, a major challenge arises as users often do not\nneed all the clusterings that algorithms generate, and figuring out the one\nneeded requires a substantial understanding of each clustering result.\nTraditionally, aligning a user's brief keyword of interest with the\ncorresponding vision components was challenging, but the emergence of\nmulti-modal and large language models (LLMs) has begun to bridge this gap. In\nresponse, given unlabeled target visual data, we propose Multi-MaP, a novel\nmethod employing a multi-modal proxy learning process. It leverages CLIP\nencoders to extract coherent text and image embeddings, with GPT-4 integrating\nusers' interests to formulate effective textual contexts. Moreover, reference\nword constraint and concept-level constraint are designed to learn the optimal\ntext proxy according to the user's interest. Multi-MaP not only adeptly\ncaptures a user's interest via a keyword but also facilitates identifying\nrelevant clusterings. Our extensive experiments show that Multi-MaP\nconsistently outperforms state-of-the-art methods in all benchmark\nmulti-clustering vision tasks. Our code is available at\nhttps://github.com/Alexander-Yao/Multi-MaP.\n","authors":["Jiawei Yao","Qi Qian","Juhua Hu"],"pdf_url":"https://arxiv.org/pdf/2404.15655v1.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://github.com/Alexander-Yao/Multi-MaP"},{"id":"http://arxiv.org/abs/2404.15653v1","updated":"2024-04-24T05:13:28Z","published":"2024-04-24T05:13:28Z","title":"CatLIP: CLIP-level Visual Recognition Accuracy with 2.7x Faster\n Pre-training on Web-scale Image-Text Data","summary":" Contrastive learning has emerged as a transformative method for learning\neffective visual representations through the alignment of image and text\nembeddings. However, pairwise similarity computation in contrastive loss\nbetween image and text pairs poses computational challenges. This paper\npresents a novel weakly supervised pre-training of vision models on web-scale\nimage-text data. The proposed method reframes pre-training on image-text data\nas a classification task. Consequently, it eliminates the need for pairwise\nsimilarity computations in contrastive loss, achieving a remarkable $2.7\\times$\nacceleration in training speed compared to contrastive learning on web-scale\ndata. Through extensive experiments spanning diverse vision tasks, including\ndetection and segmentation, we demonstrate that the proposed method maintains\nhigh representation quality. Our source code along with pre-trained model\nweights and training recipes is available at\n\\url{https://github.com/apple/corenet}.\n","authors":["Sachin Mehta","Maxwell Horton","Fartash Faghri","Mohammad Hossein Sekhavat","Mahyar Najibi","Mehrdad Farajtabar","Oncel Tuzel","Mohammad Rastegari"],"pdf_url":"https://arxiv.org/pdf/2404.15653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15644v1","updated":"2024-04-24T04:50:50Z","published":"2024-04-24T04:50:50Z","title":"Building-PCC: Building Point Cloud Completion Benchmarks","summary":" With the rapid advancement of 3D sensing technologies, obtaining 3D shape\ninformation of objects has become increasingly convenient. Lidar technology,\nwith its capability to accurately capture the 3D information of objects at long\ndistances, has been widely applied in the collection of 3D data in urban\nscenes. 
However, the collected point cloud data often exhibit incompleteness\ndue to factors such as occlusion, signal absorption, and specular reflection.\nThis paper explores the application of point cloud completion technologies in\nprocessing these incomplete data and establishes a new real-world benchmark\nBuilding-PCC dataset, to evaluate the performance of existing deep learning\nmethods in the task of urban building point cloud completion. Through a\ncomprehensive evaluation of different methods, we analyze the key challenges\nfaced in building point cloud completion, aiming to promote innovation in the\nfield of 3D geoinformation applications. Our source code is available at\nhttps://github.com/tudelft3d/Building-PCC-Building-Point-Cloud-Completion-Benchmarks.git.\n","authors":["Weixiao Gao","Ravi Peters","Jantien Stoter"],"pdf_url":"https://arxiv.org/pdf/2404.15644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04744v3","updated":"2024-04-24T04:31:29Z","published":"2023-06-07T19:44:14Z","title":"WOUAF: Weight Modulation for User Attribution and Fingerprinting in\n Text-to-Image Diffusion Models","summary":" The rapid advancement of generative models, facilitating the creation of\nhyper-realistic images from textual descriptions, has concurrently escalated\ncritical societal concerns such as misinformation. Although providing some\nmitigation, traditional fingerprinting mechanisms fall short in attributing\nresponsibility for the malicious use of synthetic images. This paper introduces\na novel approach to model fingerprinting that assigns responsibility for the\ngenerated images, thereby serving as a potential countermeasure to model\nmisuse. Our method modifies generative models based on each user's unique\ndigital fingerprint, imprinting a unique identifier onto the resultant content\nthat can be traced back to the user. This approach, incorporating fine-tuning\ninto Text-to-Image (T2I) tasks using the Stable Diffusion Model, demonstrates\nnear-perfect attribution accuracy with a minimal impact on output quality.\nThrough extensive evaluation, we show that our method outperforms baseline\nmethods with an average improvement of 11\\% in handling image post-processes.\nOur method presents a promising and novel avenue for accountable model\ndistribution and responsible use. Our code is available in\n\\url{https://github.com/kylemin/WOUAF}.\n","authors":["Changhoon Kim","Kyle Min","Maitreya Patel","Sheng Cheng","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2306.04744v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15638v1","updated":"2024-04-24T04:20:22Z","published":"2024-04-24T04:20:22Z","title":"PriorNet: A Novel Lightweight Network with Multidimensional Interactive\n Attention for Efficient Image Dehazing","summary":" Hazy images degrade visual quality, and dehazing is a crucial prerequisite\nfor subsequent processing tasks. Most current dehazing methods rely on neural\nnetworks and face challenges such as high computational parameter pressure and\nweak generalization capabilities. This paper introduces PriorNet--a novel,\nlightweight, and highly applicable dehazing network designed to significantly\nimprove the clarity and visual quality of hazy images while avoiding excessive\ndetail extraction issues. 
The core of PriorNet is the original\nMulti-Dimensional Interactive Attention (MIA) mechanism, which effectively\ncaptures a wide range of haze characteristics, substantially reducing the\ncomputational load and generalization difficulties associated with complex\nsystems. By utilizing a uniform convolutional kernel size and incorporating\nskip connections, we have streamlined the feature extraction process.\nSimplifying the number of layers and architecture not only enhances dehazing\nefficiency but also facilitates easier deployment on edge devices. Extensive\ntesting across multiple datasets has demonstrated PriorNet's exceptional\nperformance in dehazing and clarity restoration, maintaining image detail and\ncolor fidelity in single-image dehazing tasks. Notably, with a model size of\njust 18Kb, PriorNet showcases superior dehazing generalization capabilities\ncompared to other methods. Our research makes a significant contribution to\nadvancing image dehazing technology, providing new perspectives and tools for\nthe field and related domains, particularly emphasizing the importance of\nimproving universality and deployability.\n","authors":["Yutong Chen","Zhang Wen","Chao Wang","Lei Gong","Zhongchao Yi"],"pdf_url":"https://arxiv.org/pdf/2404.15638v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.15635v1","updated":"2024-04-24T04:10:05Z","published":"2024-04-24T04:10:05Z","title":"A Real-time Evaluation Framework for Pedestrian's Potential Risk at\n Non-Signalized Intersections Based on Predicted Post-Encroachment Time","summary":" Addressing pedestrian safety at intersections is one of the paramount\nconcerns in the field of transportation research, driven by the urgency of\nreducing traffic-related injuries and fatalities. With advances in computer\nvision technologies and predictive models, the pursuit of developing real-time\nproactive protection systems is increasingly recognized as vital to improving\npedestrian safety at intersections. The core of these protection systems lies\nin the prediction-based evaluation of pedestrian's potential risks, which plays\na significant role in preventing the occurrence of accidents. The major\nchallenges in the current prediction-based potential risk evaluation research\ncan be summarized into three aspects: the inadequate progress in creating a\nreal-time framework for the evaluation of pedestrian's potential risks, the\nabsence of accurate and explainable safety indicators that can represent the\npotential risk, and the lack of tailor-made evaluation criteria specifically\nfor each category of pedestrians. To address these research challenges, in this\nstudy, a framework with computer vision technologies and predictive models is\ndeveloped to evaluate the potential risk of pedestrians in real time. Integral\nto this framework is a novel surrogate safety measure, the Predicted\nPost-Encroachment Time (P-PET), derived from deep learning models capable to\npredict the arrival time of pedestrians and vehicles at intersections. To\nfurther improve the effectiveness and reliability of pedestrian risk\nevaluation, we classify pedestrians into distinct categories and apply specific\nevaluation criteria for each group. 
The results demonstrate the framework's\nability to effectively identify potential risks through the use of P-PET,\nindicating its feasibility for real-time applications and its improved\nperformance in risk evaluation across different categories of pedestrians.\n","authors":["Tengfeng Lin","Zhixiong Jin","Seongjin Choi","Hwasoo Yeo"],"pdf_url":"https://arxiv.org/pdf/2404.15635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08420v3","updated":"2024-04-24T03:35:09Z","published":"2023-10-12T15:39:54Z","title":"Visual Attention Prompted Prediction and Learning","summary":" Visual explanation (attention)-guided learning uses not only labels but also\nexplanations to guide model reasoning process. While visual attention-guided\nlearning has shown promising results, it requires a large number of explanation\nannotations that are time-consuming to prepare. However, in many real-world\nsituations, it is usually desired to prompt the model with visual attention\nwithout model retraining. For example, when doing AI-assisted cancer\nclassification on a medical image, users (e.g., clinicians) can provide the AI\nmodel with visual attention prompt on which areas are indispensable and which\nare precluded. Despite its promising objectives, achieving visual\nattention-prompted prediction presents several major challenges: 1) How can the\nvisual prompt be effectively integrated into the model's reasoning process? 2)\nHow should the model handle samples that lack visual prompts? 3) What is the\nimpact on the model's performance when a visual prompt is imperfect? This paper\nintroduces a novel framework for attention-prompted prediction and learning,\nutilizing visual prompts to steer the model's reasoning process. To improve\nperformance in non-prompted situations and align it with prompted scenarios, we\npropose a co-training approach for both non-prompted and prompted models,\nensuring they share similar parameters and activations. Additionally, for\ninstances where the visual prompt does not encompass the entire input image, we\nhave developed innovative attention prompt refinement methods. These methods\ninterpolate the incomplete prompts while maintaining alignment with the model's\nexplanations. Extensive experiments on four datasets demonstrate the\neffectiveness of our proposed framework in enhancing predictions for samples\nboth with and without prompt.\n","authors":["Yifei Zhang","Siyi Gu","Bo Pan","Guangji Bai","Meikang Qiu","Xiaofeng Yang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06670v2","updated":"2024-04-24T03:28:44Z","published":"2023-03-12T14:24:10Z","title":"Extending global-local view alignment for self-supervised learning with\n remote sensing imagery","summary":" Since large number of high-quality remote sensing images are readily\naccessible, exploiting the corpus of images with less manual annotation draws\nincreasing attention. Self-supervised models acquire general feature\nrepresentations by formulating a pretext task that generates pseudo-labels for\nmassive unlabeled data to provide supervision for training. 
While prior studies\nhave explored multiple self-supervised learning techniques in remote sensing\ndomain, pretext tasks based on local-global view alignment remain\nunderexplored, despite achieving state-of-the-art results on natural imagery.\nInspired by DINO, which employs an effective representation learning structure\nwith knowledge distillation based on global-local view alignment, we formulate\ntwo pretext tasks for self-supervised learning on remote sensing imagery\n(SSLRS). Using these tasks, we explore the effectiveness of positive temporal\ncontrast as well as multi-sized views on SSLRS. We extend DINO and propose\nDINO-MC which uses local views of various sized crops instead of a single fixed\nsize in order to alleviate the limited variation in object size observed in\nremote sensing imagery. Our experiments demonstrate that even when pre-trained\non only 10% of the dataset, DINO-MC performs on par or better than existing\nstate-of-the-art SSLRS methods on multiple remote sensing tasks, while using\nless computational resources. All codes, models, and results are released at\nhttps://github.com/WennyXY/DINO-MC.\n","authors":["Xinye Wanyan","Sachith Seneviratne","Shuchang Shen","Michael Kirley"],"pdf_url":"https://arxiv.org/pdf/2303.06670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.12668v2","updated":"2024-04-24T16:17:13Z","published":"2021-04-26T15:53:03Z","title":"Appearance-based Gaze Estimation With Deep Learning: A Review and\n Benchmark","summary":" Human gaze provides valuable information on human focus and intentions,\nmaking it a crucial area of research. Recently, deep learning has\nrevolutionized appearance-based gaze estimation. However, due to the unique\nfeatures of gaze estimation research, such as the unfair comparison between 2D\ngaze positions and 3D gaze vectors and the different pre-processing and\npost-processing methods, there is a lack of a definitive guideline for\ndeveloping deep learning-based gaze estimation algorithms. In this paper, we\npresent a systematic review of the appearance-based gaze estimation methods\nusing deep learning. Firstly, we survey the existing gaze estimation algorithms\nalong the typical gaze estimation pipeline: deep feature extraction, deep\nlearning model design, personal calibration and platforms. Secondly, to fairly\ncompare the performance of different approaches, we summarize the data\npre-processing and post-processing methods, including face/eye detection, data\nrectification, 2D/3D gaze conversion and gaze origin conversion. Finally, we\nset up a comprehensive benchmark for deep learning-based gaze estimation. We\ncharacterize all the public datasets and provide the source code of typical\ngaze estimation algorithms. This paper serves not only as a reference to\ndevelop deep learning-based gaze estimation methods, but also a guideline for\nfuture gaze estimation research. The project web page can be found at\nhttps://phi-ai.buaa.edu.cn/Gazehub.\n","authors":["Yihua Cheng","Haofei Wang","Yiwei Bao","Feng Lu"],"pdf_url":"https://arxiv.org/pdf/2104.12668v2.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2008.10796v5","updated":"2024-04-24T11:46:09Z","published":"2020-08-25T03:30:53Z","title":"Deep Variational Network Toward Blind Image Restoration","summary":" Blind image restoration (IR) is a common yet challenging problem in computer\nvision. 
Classical model-based methods and recent deep learning (DL)-based\nmethods represent two different methodologies for this problem, each with their\nown merits and drawbacks. In this paper, we propose a novel blind image\nrestoration method, aiming to integrate both the advantages of them.\nSpecifically, we construct a general Bayesian generative model for the blind\nIR, which explicitly depicts the degradation process. In this proposed model, a\npixel-wise non-i.i.d. Gaussian distribution is employed to fit the image noise.\nIt is with more flexibility than the simple i.i.d. Gaussian or Laplacian\ndistributions as adopted in most of conventional methods, so as to handle more\ncomplicated noise types contained in the image degradation. To solve the model,\nwe design a variational inference algorithm where all the expected posteriori\ndistributions are parameterized as deep neural networks to increase their model\ncapability. Notably, such an inference algorithm induces a unified framework to\njointly deal with the tasks of degradation estimation and image restoration.\nFurther, the degradation information estimated in the former task is utilized\nto guide the latter IR process. Experiments on two typical blind IR tasks,\nnamely image denoising and super-resolution, demonstrate that the proposed\nmethod achieves superior performance over current state-of-the-arts.\n","authors":["Zongsheng Yue","Hongwei Yong","Qian Zhao","Lei Zhang","Deyu Meng","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2008.10796v5.pdf","comment":"Accepted by TPAMI@2024. Code: https://github.com/zsyOAOA/VIRNet"}]},"2024-04-25T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.16831v1","updated":"2024-04-25T17:59:59Z","published":"2024-04-25T17:59:59Z","title":"The Third Monocular Depth Estimation Challenge","summary":" This paper discusses the results of the third edition of the Monocular Depth\nEstimation Challenge (MDEC). The challenge focuses on zero-shot generalization\nto the challenging SYNS-Patches dataset, featuring complex scenes in natural\nand indoor settings. As with the previous edition, methods can use any form of\nsupervision, i.e. supervised or self-supervised. The challenge received a total\nof 19 submissions outperforming the baseline on the test set: 10 among them\nsubmitted a report describing their approach, highlighting a diffused use of\nfoundational models such as Depth Anything at the core of their method. The\nchallenge winners drastically improved 3D F-Score performance, from 17.51% to\n23.72%.\n","authors":["Jaime Spencer","Fabio Tosi","Matteo Poggi","Ripudaman Singh Arora","Chris Russell","Simon Hadfield","Richard Bowden","GuangYuan Zhou","ZhengXin Li","Qiang Rao","YiPing Bao","Xiao Liu","Dohyeong Kim","Jinseong Kim","Myunghyun Kim","Mykola Lavreniuk","Rui Li","Qing Mao","Jiang Wu","Yu Zhu","Jinqiu Sun","Yanning Zhang","Suraj Patni","Aradhye Agarwal","Chetan Arora","Pihai Sun","Kui Jiang","Gang Wu","Jian Liu","Xianming Liu","Junjun Jiang","Xidan Zhang","Jianing Wei","Fangjun Wang","Zhiming Tan","Jiabao Wang","Albert Luginov","Muhammad Shahzad","Seyed Hosseini","Aleksander Trajcevski","James H. 
Elder"],"pdf_url":"https://arxiv.org/pdf/2404.16831v1.pdf","comment":"To appear in CVPRW2024"},{"id":"http://arxiv.org/abs/2404.16829v1","updated":"2024-04-25T17:59:58Z","published":"2024-04-25T17:59:58Z","title":"Make-it-Real: Unleashing Large Multimodal Model's Ability for Painting\n 3D Objects with Realistic Materials","summary":" Physically realistic materials are pivotal in augmenting the realism of 3D\nassets across various applications and lighting conditions. However, existing\n3D assets and generative models often lack authentic material properties.\nManual assignment of materials using graphic software is a tedious and\ntime-consuming task. In this paper, we exploit advancements in Multimodal Large\nLanguage Models (MLLMs), particularly GPT-4V, to present a novel approach,\nMake-it-Real: 1) We demonstrate that GPT-4V can effectively recognize and\ndescribe materials, allowing the construction of a detailed material library.\n2) Utilizing a combination of visual cues and hierarchical text prompts, GPT-4V\nprecisely identifies and aligns materials with the corresponding components of\n3D objects. 3) The correctly matched materials are then meticulously applied as\nreference for the new SVBRDF material generation according to the original\ndiffuse map, significantly enhancing their visual authenticity. Make-it-Real\noffers a streamlined integration into the 3D content creation workflow,\nshowcasing its utility as an essential tool for developers of 3D assets.\n","authors":["Ye Fang","Zeyi Sun","Tong Wu","Jiaqi Wang","Ziwei Liu","Gordon Wetzstein","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2404.16829v1.pdf","comment":"Project Page: https://sunzey.github.io/Make-it-Real/"},{"id":"http://arxiv.org/abs/2404.16828v1","updated":"2024-04-25T17:59:56Z","published":"2024-04-25T17:59:56Z","title":"Made to Order: Discovering monotonic temporal changes via\n self-supervised video ordering","summary":" Our objective is to discover and localize monotonic temporal changes in a\nsequence of images. To achieve this, we exploit a simple proxy task of ordering\na shuffled image sequence, with `time' serving as a supervisory signal since\nonly changes that are monotonic with time can give rise to the correct\nordering. We also introduce a flexible transformer-based model for\ngeneral-purpose ordering of image sequences of arbitrary length with built-in\nattribution maps. After training, the model successfully discovers and\nlocalizes monotonic changes while ignoring cyclic and stochastic ones. We\ndemonstrate applications of the model in multiple video settings covering\ndifferent scene and object types, discovering both object-level and\nenvironmental changes in unseen sequences. We also demonstrate that the\nattention-based attribution maps function as effective prompts for segmenting\nthe changing regions, and that the learned representations can be used for\ndownstream applications. 
Finally, we show that the model achieves the state of\nthe art on standard benchmarks for ordering a set of images.\n","authors":["Charig Yang","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.16828v1.pdf","comment":"Project page: https://charigyang.github.io/order/"},{"id":"http://arxiv.org/abs/2404.16825v1","updated":"2024-04-25T17:59:46Z","published":"2024-04-25T17:59:46Z","title":"ResVR: Joint Rescaling and Viewport Rendering of Omnidirectional Images","summary":" With the advent of virtual reality technology, omnidirectional image (ODI)\nrescaling techniques are increasingly embraced for reducing transmitted and\nstored file sizes while preserving high image quality. Despite this progress,\ncurrent ODI rescaling methods predominantly focus on enhancing the quality of\nimages in equirectangular projection (ERP) format, which overlooks the fact\nthat the content viewed on head mounted displays (HMDs) is actually a rendered\nviewport instead of an ERP image. In this work, we emphasize that focusing\nsolely on ERP quality results in inferior viewport visual experiences for\nusers. Thus, we propose ResVR, which is the first comprehensive framework for\nthe joint Rescaling and Viewport Rendering of ODIs. ResVR allows obtaining LR\nERP images for transmission while rendering high-quality viewports for users to\nwatch on HMDs. In our ResVR, a novel discrete pixel sampling strategy is\ndeveloped to tackle the complex mapping between the viewport and ERP, enabling\nend-to-end training of ResVR pipeline. Furthermore, a spherical pixel shape\nrepresentation technique is innovatively derived from spherical differentiation\nto significantly improve the visual quality of rendered viewports. Extensive\nexperiments demonstrate that our ResVR outperforms existing methods in viewport\nrendering tasks across different fields of view, resolutions, and view\ndirections while keeping a low transmission overhead.\n","authors":["Weiqi Li","Shijie Zhao","Bin Chen","Xinhua Cheng","Junlin Li","Li Zhang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16824v1","updated":"2024-04-25T17:59:45Z","published":"2024-04-25T17:59:45Z","title":"V2A-Mark: Versatile Deep Visual-Audio Watermarking for Manipulation\n Localization and Copyright Protection","summary":" AI-generated video has revolutionized short video production, filmmaking, and\npersonalized media, making video local editing an essential tool. However, this\nprogress also blurs the line between reality and fiction, posing challenges in\nmultimedia forensics. To solve this urgent issue, V2A-Mark is proposed to\naddress the limitations of current video tampering forensics, such as poor\ngeneralizability, singular function, and single modality focus. Combining the\nfragility of video-into-video steganography with deep robust watermarking, our\nmethod can embed invisible visual-audio localization watermarks and copyright\nwatermarks into the original video frames and audio, enabling precise\nmanipulation localization and copyright protection. We also design a temporal\nalignment and fusion module and degradation prompt learning to enhance the\nlocalization accuracy and decoding robustness. Meanwhile, we introduce a\nsample-level audio localization method and a cross-modal copyright extraction\nmechanism to couple the information of audio and video frames. 
The\neffectiveness of V2A-Mark has been verified on a visual-audio tampering\ndataset, emphasizing its superiority in localization precision and copyright\naccuracy, crucial for the sustainable development of video editing in the AIGC\nvideo era.\n","authors":["Xuanyu Zhang","Youmin Xu","Runyi Li","Jiwen Yu","Weiqi Li","Zhipei Xu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16823v1","updated":"2024-04-25T17:59:41Z","published":"2024-04-25T17:59:41Z","title":"Learning Visuotactile Skills with Two Multifingered Hands","summary":" Aiming to replicate human-like dexterity, perceptual experiences, and motion\npatterns, we explore learning from human demonstrations using a bimanual system\nwith multifingered hands and visuotactile data. Two significant challenges\nexist: the lack of an affordable and accessible teleoperation system suitable\nfor a dual-arm setup with multifingered hands, and the scarcity of\nmultifingered hand hardware equipped with touch sensing. To tackle the first\nchallenge, we develop HATO, a low-cost hands-arms teleoperation system that\nleverages off-the-shelf electronics, complemented with a software suite that\nenables efficient data collection; the comprehensive software suite also\nsupports multimodal data processing, scalable policy learning, and smooth\npolicy deployment. To tackle the latter challenge, we introduce a novel\nhardware adaptation by repurposing two prosthetic hands equipped with touch\nsensors for research. Using visuotactile data collected from our system, we\nlearn skills to complete long-horizon, high-precision tasks which are difficult\nto achieve without multifingered dexterity and touch feedback. Furthermore, we\nempirically investigate the effects of dataset size, sensing modality, and\nvisual input preprocessing on policy learning. Our results mark a promising\nstep forward in bimanual multifingered manipulation from visuotactile data.\nVideos, code, and datasets can be found at https://toruowo.github.io/hato/ .\n","authors":["Toru Lin","Yu Zhang","Qiyang Li","Haozhi Qi","Brent Yi","Sergey Levine","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.16823v1.pdf","comment":"Code and Project Website: https://toruowo.github.io/hato/"},{"id":"http://arxiv.org/abs/2404.16821v1","updated":"2024-04-25T17:59:19Z","published":"2024-04-25T17:59:19Z","title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal\n Models with Open-Source Suites","summary":" In this report, we introduce InternVL 1.5, an open-source multimodal large\nlanguage model (MLLM) to bridge the capability gap between open-source and\nproprietary commercial models in multimodal understanding. We introduce three\nsimple improvements: (1) Strong Vision Encoder: we explored a continuous\nlearning strategy for the large-scale vision foundation model -- InternViT-6B,\nboosting its visual understanding capabilities, and making it can be\ntransferred and reused in different LLMs. (2) Dynamic High-Resolution: we\ndivide images into tiles ranging from 1 to 40 of 448$\\times$448 pixels\naccording to the aspect ratio and resolution of the input images, which\nsupports up to 4K resolution input. 
(3) High-Quality Bilingual Dataset: we\ncarefully collected a high-quality bilingual dataset that covers common scenes,\ndocument images, and annotated them with English and Chinese question-answer\npairs, significantly enhancing performance in OCR- and Chinese-related tasks.\nWe evaluate InternVL 1.5 through a series of benchmarks and comparative\nstudies. Compared to both open-source and proprietary models, InternVL 1.5\nshows competitive performance, achieving state-of-the-art results in 8 of 18\nbenchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.\n","authors":["Zhe Chen","Weiyun Wang","Hao Tian","Shenglong Ye","Zhangwei Gao","Erfei Cui","Wenwen Tong","Kongzhi Hu","Jiapeng Luo","Zheng Ma","Ji Ma","Jiaqi Wang","Xiaoyi Dong","Hang Yan","Hewei Guo","Conghui He","Zhenjiang Jin","Chao Xu","Bin Wang","Xingjian Wei","Wei Li","Wenjian Zhang","Lewei Lu","Xizhou Zhu","Tong Lu","Dahua Lin","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2404.16821v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.16820v1","updated":"2024-04-25T17:58:43Z","published":"2024-04-25T17:58:43Z","title":"Revisiting Text-to-Image Evaluation with Gecko: On Metrics, Prompts, and\n Human Ratings","summary":" While text-to-image (T2I) generative models have become ubiquitous, they do\nnot necessarily generate images that align with a given prompt. While previous\nwork has evaluated T2I alignment by proposing metrics, benchmarks, and\ntemplates for collecting human judgements, the quality of these components is\nnot systematically measured. Human-rated prompt sets are generally small and\nthe reliability of the ratings -- and thereby the prompt set used to compare\nmodels -- is not evaluated. We address this gap by performing an extensive\nstudy evaluating auto-eval metrics and human templates. We provide three main\ncontributions: (1) We introduce a comprehensive skills-based benchmark that can\ndiscriminate models across different human templates. This skills-based\nbenchmark categorises prompts into sub-skills, allowing a practitioner to\npinpoint not only which skills are challenging, but at what level of complexity\na skill becomes challenging. (2) We gather human ratings across four templates\nand four T2I models for a total of >100K annotations. This allows us to\nunderstand where differences arise due to inherent ambiguity in the prompt and\nwhere they arise due to differences in metric and model quality. (3) Finally,\nwe introduce a new QA-based auto-eval metric that is better correlated with\nhuman ratings than existing metrics for our new dataset, across different human\ntemplates, and on TIFA160.\n","authors":["Olivia Wiles","Chuhan Zhang","Isabela Albuquerque","Ivana Kajić","Su Wang","Emanuele Bugliarello","Yasumasa Onoe","Chris Knutsen","Cyrus Rashtchian","Jordi Pont-Tuset","Aida Nematzadeh"],"pdf_url":"https://arxiv.org/pdf/2404.16820v1.pdf","comment":"Data and code will be released at:\n https://github.com/google-deepmind/gecko_benchmark_t2i"},{"id":"http://arxiv.org/abs/2404.16818v1","updated":"2024-04-25T17:58:09Z","published":"2024-04-25T17:58:09Z","title":"Boosting Unsupervised Semantic Segmentation with Principal Mask\n Proposals","summary":" Unsupervised semantic segmentation aims to automatically partition images\ninto semantically meaningful regions by identifying global categories within an\nimage corpus without any form of annotation. 
Building upon recent advances in\nself-supervised representation learning, we focus on how to leverage these\nlarge pre-trained models for the downstream task of unsupervised segmentation.\nWe present PriMaPs - Principal Mask Proposals - decomposing images into\nsemantically meaningful masks based on their feature representation. This\nallows us to realize unsupervised semantic segmentation by fitting class\nprototypes to PriMaPs with a stochastic expectation-maximization algorithm,\nPriMaPs-EM. Despite its conceptual simplicity, PriMaPs-EM leads to competitive\nresults across various pre-trained backbone models, including DINO and DINOv2,\nand across datasets, such as Cityscapes, COCO-Stuff, and Potsdam-3.\nImportantly, PriMaPs-EM is able to boost results when applied orthogonally to\ncurrent state-of-the-art unsupervised semantic segmentation pipelines.\n","authors":["Oliver Hahn","Nikita Araslanov","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2404.16818v1.pdf","comment":"Code: https://github.com/visinf/primaps"},{"id":"http://arxiv.org/abs/2404.16814v1","updated":"2024-04-25T17:56:45Z","published":"2024-04-25T17:56:45Z","title":"Meta-Transfer Derm-Diagnosis: Exploring Few-Shot Learning and Transfer\n Learning for Skin Disease Classification in Long-Tail Distribution","summary":" Addressing the challenges of rare diseases is difficult, especially with the\nlimited number of reference images and a small patient population. This is more\nevident in rare skin diseases, where we encounter long-tailed data\ndistributions that make it difficult to develop unbiased and broadly effective\nmodels. The diverse ways in which image datasets are gathered and their\ndistinct purposes also add to these challenges. Our study conducts a detailed\nexamination of the benefits and drawbacks of episodic and conventional training\nmethodologies, adopting a few-shot learning approach alongside transfer\nlearning. We evaluated our models using the ISIC2018, Derm7pt, and SD-198\ndatasets. With minimal labeled examples, our models showed substantial\ninformation gains and better performance compared to previously trained models.\nOur research emphasizes the improved ability to represent features in\nDenseNet121 and MobileNetV2 models, achieved by using pre-trained models on\nImageNet to increase similarities within classes. Moreover, our experiments,\nranging from 2-way to 5-way classifications with up to 10 examples, showed a\ngrowing success rate for traditional transfer learning methods as the number of\nexamples increased. The addition of data augmentation techniques significantly\nimproved our transfer learning based model performance, leading to higher\nperformances than existing methods, especially in the SD-198 and ISIC2018\ndatasets. All source code related to this work will be made publicly available\nsoon at the provided URL.\n","authors":["Zeynep Özdemir","Hacer Yalim Keles","Ömer Özgür Tanrıöver"],"pdf_url":"https://arxiv.org/pdf/2404.16814v1.pdf","comment":"17 pages, 5 figures, 6 tables, submitted to IEEE Journal of\n Biomedical and Health Informatics"},{"id":"http://arxiv.org/abs/2404.16804v1","updated":"2024-04-25T17:51:10Z","published":"2024-04-25T17:51:10Z","title":"AAPL: Adding Attributes to Prompt Learning for Vision-Language Models","summary":" Recent advances in large pre-trained vision-language models have demonstrated\nremarkable performance on zero-shot downstream tasks. 
Building upon this,\nrecent studies, such as CoOp and CoCoOp, have proposed the use of prompt\nlearning, where context within a prompt is replaced with learnable vectors,\nleading to significant improvements over manually crafted prompts. However, the\nperformance improvement for unseen classes is still marginal, and to tackle\nthis problem, data augmentation has been frequently used in traditional\nzero-shot learning techniques. Through our experiments, we have identified\nimportant issues in CoOp and CoCoOp: the context learned through traditional\nimage augmentation is biased toward seen classes, negatively impacting\ngeneralization to unseen classes. To address this problem, we propose\nadversarial token embedding to disentangle low-level visual augmentation\nfeatures from high-level class information when inducing bias in learnable\nprompts. Through our novel mechanism called \"Adding Attributes to Prompt\nLearning\", AAPL, we guide the learnable context to effectively extract text\nfeatures by focusing on high-level features for unseen classes. We have\nconducted experiments across 11 datasets, and overall, AAPL shows favorable\nperformances compared to the existing methods in few-shot learning, zero-shot\nlearning, cross-dataset, and domain generalization tasks.\n","authors":["Gahyeon Kim","Sohee Kim","Seokju Lee"],"pdf_url":"https://arxiv.org/pdf/2404.16804v1.pdf","comment":"Accepted to CVPR 2024 Workshop on Prompting in Vision, Project Page:\n https://github.com/Gahyeonkim09/AAPL"},{"id":"http://arxiv.org/abs/2403.08733v3","updated":"2024-04-25T17:50:07Z","published":"2024-03-13T17:35:28Z","title":"GaussCtrl: Multi-View Consistent Text-Driven 3D Gaussian Splatting\n Editing","summary":" We propose GaussCtrl, a text-driven method to edit a 3D scene reconstructed\nby the 3D Gaussian Splatting (3DGS).\n Our method first renders a collection of images by using the 3DGS and edits\nthem by using a pre-trained 2D diffusion model (ControlNet) based on the input\nprompt, which is then used to optimise the 3D model.\n Our key contribution is multi-view consistent editing, which enables editing\nall images together instead of iteratively editing one image while updating the\n3D model as in previous works.\n It leads to faster editing as well as higher visual quality.\n This is achieved by the two terms:\n (a) depth-conditioned editing that enforces geometric consistency across\nmulti-view images by leveraging naturally consistent depth maps.\n (b) attention-based latent code alignment that unifies the appearance of\nedited images by conditioning their editing to several reference views through\nself and cross-view attention between images' latent representations.\n Experiments demonstrate that our method achieves faster editing and better\nvisual results than previous state-of-the-art methods.\n","authors":["Jing Wu","Jia-Wang Bian","Xinghui Li","Guangrun Wang","Ian Reid","Philip Torr","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2403.08733v3.pdf","comment":"Our Project Website: https://gaussctrl.active.vision/"},{"id":"http://arxiv.org/abs/2404.16790v1","updated":"2024-04-25T17:39:35Z","published":"2024-04-25T17:39:35Z","title":"SEED-Bench-2-Plus: Benchmarking Multimodal Large Language Models with\n Text-Rich Visual Comprehension","summary":" Comprehending text-rich visual content is paramount for the practical\napplication of Multimodal Large Language Models (MLLMs), since text-rich\nscenarios are ubiquitous in the real world, which are characterized by the\npresence of 
extensive texts embedded within images. Recently, the advent of\nMLLMs with impressive versatility has raised the bar for what we can expect\nfrom MLLMs. However, their proficiency in text-rich scenarios has yet to be\ncomprehensively and objectively assessed, since current MLLM benchmarks\nprimarily focus on evaluating general visual comprehension. In this work, we\nintroduce SEED-Bench-2-Plus, a benchmark specifically designed for evaluating\n\\textbf{text-rich visual comprehension} of MLLMs. Our benchmark comprises 2.3K\nmultiple-choice questions with precise human annotations, spanning three broad\ncategories: Charts, Maps, and Webs, each of which covers a wide spectrum of\ntext-rich scenarios in the real world. These categories, due to their inherent\ncomplexity and diversity, effectively simulate real-world text-rich\nenvironments. We further conduct a thorough evaluation involving 34 prominent\nMLLMs (including GPT-4V, Gemini-Pro-Vision and Claude-3-Opus) and emphasize the\ncurrent limitations of MLLMs in text-rich visual comprehension. We hope that\nour work can serve as a valuable addition to existing MLLM benchmarks,\nproviding insightful observations and inspiring further research in the area of\ntext-rich visual comprehension with MLLMs. The dataset and evaluation code can\nbe accessed at https://github.com/AILab-CVC/SEED-Bench.\n","authors":["Bohao Li","Yuying Ge","Yi Chen","Yixiao Ge","Ruimao Zhang","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.16790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16781v1","updated":"2024-04-25T17:30:38Z","published":"2024-04-25T17:30:38Z","title":"Registration by Regression (RbR): a framework for interpretable and\n flexible atlas registration","summary":" In human neuroimaging studies, atlas registration enables mapping MRI scans\nto a common coordinate frame, which is necessary to aggregate data from\nmultiple subjects. Machine learning registration methods have achieved\nexcellent speed and accuracy but lack interpretability. More recently,\nkeypoint-based methods have been proposed to tackle this issue, but their\naccuracy is still subpar, particularly when fitting nonlinear transforms. Here\nwe propose Registration by Regression (RbR), a novel atlas registration\nframework that is highly robust and flexible, conceptually simple, and can be\ntrained with cheaply obtained data. RbR predicts the (x,y,z) atlas coordinates\nfor every voxel of the input scan (i.e., every voxel is a keypoint), and then\nuses closed-form expressions to quickly fit transforms using a wide array of\npossible deformation models, including affine and nonlinear (e.g., Bspline,\nDemons, invertible diffeomorphic models, etc.). Robustness is provided by the\nlarge number of voxels informing the registration and can be further increased\nby robust estimators like RANSAC. 
Experiments on independent public datasets\nshow that RbR yields more accurate registration than competing keypoint\napproaches, while providing full control of the deformation model.\n","authors":["Karthik Gopinath","Xiaoling Hu","Malte Hoffmann","Oula Puonti","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2404.16781v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/1902.00615v4","updated":"2024-04-25T17:29:45Z","published":"2019-02-02T01:52:53Z","title":"Confidence-Triggered Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Panfeng Li","Qikai Yang","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v4.pdf","comment":"To appear in 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.16773v1","updated":"2024-04-25T17:24:35Z","published":"2024-04-25T17:24:35Z","title":"ConKeD++ -- Improving descriptor learning for retinal image\n registration: A comprehensive study of contrastive losses","summary":" Self-supervised contrastive learning has emerged as one of the most\nsuccessful deep learning paradigms. In this regard, it has seen extensive use\nin image registration and, more recently, in the particular field of medical\nimage registration. In this work, we propose to test, extend, and improve a\nstate-of-the-art framework for color fundus image registration, ConKeD. Using\nthe ConKeD framework we test multiple loss functions, adapting them to the\nframework and the application domain. Furthermore, we evaluate our models using\nthe standardized benchmark dataset FIRE as well as several datasets that have\nnever been used before for color fundus registration, for which we are\nreleasing the pairing data as well as a standardized evaluation approach. Our\nwork demonstrates state-of-the-art performance across all datasets and metrics,\ndemonstrating several advantages over current SOTA color fundus registration\nmethods.\n","authors":["David Rivas-Villar","Álvaro S. Hervella","José Rouco","Jorge Novo"],"pdf_url":"https://arxiv.org/pdf/2404.16773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16771v1","updated":"2024-04-25T17:23:43Z","published":"2024-04-25T17:23:43Z","title":"ConsistentID: Portrait Generation with Multimodal Fine-Grained Identity\n Preserving","summary":" Diffusion-based technologies have made significant strides, particularly in\npersonalized and customized facial generation. 
However, existing methods face\nchallenges in achieving high-fidelity and detailed identity (ID) consistency,\nprimarily due to insufficient fine-grained control over facial areas and the\nlack of a comprehensive strategy for ID preservation by fully considering\nintricate facial details and the overall face. To address these limitations, we\nintroduce ConsistentID, an innovative method crafted for\ndiverse identity-preserving portrait generation under fine-grained multimodal\nfacial prompts, utilizing only a single reference image. ConsistentID comprises\ntwo key components: a multimodal facial prompt generator that combines facial\nfeatures, corresponding facial descriptions and the overall facial context to\nenhance precision in facial details, and an ID-preservation network optimized\nthrough the facial attention localization strategy, aimed at preserving ID\nconsistency in facial regions. Together, these components significantly enhance\nthe accuracy of ID preservation by introducing fine-grained multimodal ID\ninformation from facial regions. To facilitate training of ConsistentID, we\npresent a fine-grained portrait dataset, FGID, with over 500,000 facial images,\noffering greater diversity and comprehensiveness than existing public facial\ndatasets such as LAION-Face, CelebA, FFHQ, and SFHQ. Experimental results\nsubstantiate that our ConsistentID achieves exceptional precision and diversity\nin personalized facial generation, surpassing existing methods on the MyStyle\ndataset. Furthermore, while ConsistentID introduces more multimodal ID\ninformation, it maintains a fast inference speed during generation.\n","authors":["Jiehui Huang","Xiao Dong","Wenhui Song","Hanhui Li","Jun Zhou","Yuhao Cheng","Shutao Liao","Long Chen","Yiqiang Yan","Shengcai Liao","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2404.16771v1.pdf","comment":"Project page: https://ssugarwh.github.io/consistentid.github.io/"},{"id":"http://arxiv.org/abs/2404.16767v1","updated":"2024-04-25T17:20:45Z","published":"2024-04-25T17:20:45Z","title":"REBEL: Reinforcement Learning via Regressing Relative Rewards","summary":" While originally developed for continuous control problems, Proximal Policy\nOptimization (PPO) has emerged as the work-horse of a variety of reinforcement\nlearning (RL) applications including the fine-tuning of generative models.\nUnfortunately, PPO requires multiple heuristics to enable stable convergence\n(e.g. value networks, clipping) and is notorious for its sensitivity to the\nprecise implementation of these components. In response, we take a step back\nand ask what a minimalist RL algorithm for the era of generative models would\nlook like. We propose REBEL, an algorithm that cleanly reduces the problem of\npolicy optimization to regressing the relative rewards via a direct policy\nparameterization between two completions to a prompt, enabling strikingly\nlightweight implementation. In theory, we prove that fundamental RL algorithms\nlike Natural Policy Gradient can be seen as variants of REBEL, which allows us\nto match the strongest known theoretical guarantees in terms of convergence and\nsample complexity in the RL literature. REBEL can also cleanly incorporate\noffline data and handle the intransitive preferences we frequently see in\npractice. 
Empirically, we find that REBEL provides a unified approach to\nlanguage modeling and image generation with stronger or similar performance as\nPPO and DPO, all while being simpler to implement and more computationally\ntractable than PPO.\n","authors":["Zhaolin Gao","Jonathan D. Chang","Wenhao Zhan","Owen Oertell","Gokul Swamy","Kianté Brantley","Thorsten Joachims","J. Andrew Bagnell","Jason D. Lee","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.16767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16754v1","updated":"2024-04-25T17:11:37Z","published":"2024-04-25T17:11:37Z","title":"RadGenome-Chest CT: A Grounded Vision-Language Dataset for Chest CT\n Analysis","summary":" Developing generalist foundation model has recently attracted tremendous\nattention among researchers in the field of AI for Medicine (AI4Medicine). A\npivotal insight in developing these models is their reliance on dataset\nscaling, which emphasizes the requirements on developing open-source medical\nimage datasets that incorporate diverse supervision signals across various\nimaging modalities. In this paper, we introduce RadGenome-Chest CT, a\ncomprehensive, large-scale, region-guided 3D chest CT interpretation dataset\nbased on CT-RATE. Specifically, we leverage the latest powerful universal\nsegmentation and large language models, to extend the original datasets (over\n25,692 non-contrast 3D chest CT volume and reports from 20,000 patients) from\nthe following aspects: (i) organ-level segmentation masks covering 197\ncategories, which provide intermediate reasoning visual clues for\ninterpretation; (ii) 665 K multi-granularity grounded reports, where each\nsentence of the report is linked to the corresponding anatomical region of CT\nvolume in the form of a segmentation mask; (iii) 1.3 M grounded VQA pairs,\nwhere questions and answers are all linked with reference segmentation masks,\nenabling models to associate visual evidence with textual explanations. All\ngrounded reports and VQA pairs in the validation set have gone through manual\nverification to ensure dataset quality. We believe that RadGenome-Chest CT can\nsignificantly advance the development of multimodal medical foundation models,\nby training to generate texts based on given segmentation regions, which is\nunattainable with previous relevant datasets. We will release all segmentation\nmasks, grounded reports, and VQA pairs to facilitate further research and\ndevelopment in this field.\n","authors":["Xiaoman Zhang","Chaoyi Wu","Ziheng Zhao","Jiayu Lei","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16752v1","updated":"2024-04-25T17:09:14Z","published":"2024-04-25T17:09:14Z","title":"TokenHMR: Advancing Human Mesh Recovery with a Tokenized Pose\n Representation","summary":" We address the problem of regressing 3D human pose and shape from a single\nimage, with a focus on 3D accuracy. The current best methods leverage large\ndatasets of 3D pseudo-ground-truth (p-GT) and 2D keypoints, leading to robust\nperformance. With such methods, we observe a paradoxical decline in 3D pose\naccuracy with increasing 2D accuracy. This is caused by biases in the p-GT and\nthe use of an approximate camera projection model. We quantify the error\ninduced by current camera models and show that fitting 2D keypoints and p-GT\naccurately causes incorrect 3D poses. Our analysis defines the invalid\ndistances within which minimizing 2D and p-GT losses is detrimental. 
We use\nthis to formulate a new loss, Threshold-Adaptive Loss Scaling (TALS), that\npenalizes gross 2D and p-GT losses but not smaller ones. With such a loss,\nthere are many 3D poses that could equally explain the 2D evidence. To reduce\nthis ambiguity, we need a prior over valid human poses, but such priors can\nintroduce unwanted bias. To address this, we exploit a tokenized representation\nof human pose and reformulate the problem as token prediction. This restricts\nthe estimated poses to the space of valid poses, effectively providing a\nuniform prior. Extensive experiments on the EMDB and 3DPW datasets show that\nour reformulated keypoint loss and tokenization allow us to train on\nin-the-wild data while improving 3D accuracy over the state-of-the-art. Our\nmodels and code are available for research at https://tokenhmr.is.tue.mpg.de.\n","authors":["Sai Kumar Dwivedi","Yu Sun","Priyanka Patel","Yao Feng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2404.16752v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16748v1","updated":"2024-04-25T17:05:38Z","published":"2024-04-25T17:05:38Z","title":"TELA: Text to Layer-wise 3D Clothed Human Generation","summary":" This paper addresses the task of 3D clothed human generation from textual\ndescriptions. Previous works usually encode the human body and clothes as a\nholistic model and generate the whole model in a single-stage optimization,\nwhich makes them struggle with clothing editing and meanwhile lose fine-grained\ncontrol over the whole generation process. To solve this, we propose a\nlayer-wise clothed human representation combined with a progressive\noptimization strategy, which produces clothing-disentangled 3D human models\nwhile providing control capacity for the generation process. The basic idea is\nprogressively generating a minimal-clothed human body and layer-wise clothes.\nDuring clothing generation, a novel stratified compositional rendering method\nis proposed to fuse multi-layer human models, and a new loss function is\nutilized to help decouple the clothing model from the human body. The proposed\nmethod achieves high-quality disentanglement, which thereby provides an\neffective way for 3D garment generation. Extensive experiments demonstrate that\nour approach achieves state-of-the-art 3D clothed human generation while also\nsupporting cloth editing applications such as virtual try-on. Project page:\nhttp://jtdong.com/tela_layer/\n","authors":["Junting Dong","Qi Fang","Zehuan Huang","Xudong Xu","Jingbo Wang","Sida Peng","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2404.16748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00281v3","updated":"2024-04-25T16:55:46Z","published":"2024-02-01T02:13:49Z","title":"Guided Interpretable Facial Expression Recognition via Spatial Action\n Unit Cues","summary":" Although state-of-the-art classifiers for facial expression recognition (FER)\ncan achieve a high level of accuracy, they lack interpretability, an important\nfeature for end-users. Experts typically associate spatial action units (AUs)\nfrom a codebook to facial regions for the visual interpretation of expressions.\nIn this paper, the same expert steps are followed. A new learning strategy is\nproposed to explicitly incorporate AU cues into classifier training, allowing\ndeep interpretable models to be trained. 
During training, this AU codebook is used,\nalong with the input image expression label and facial landmarks, to construct\nan AU heatmap that indicates the most discriminative image regions of interest\nw.r.t. the facial expression. This valuable spatial cue is leveraged to train a\ndeep interpretable classifier for FER. This is achieved by constraining the\nspatial layer features of a classifier to be correlated with AU heatmaps.\nUsing a composite loss, the classifier is trained to correctly classify an\nimage while yielding interpretable visual layer-wise attention correlated with\nAU maps, simulating the expert decision process. Our strategy only relies on\nimage class expression for supervision, without additional manual annotations.\nOur new strategy is generic, and can be applied to any deep CNN- or\ntransformer-based classifier without requiring any architectural change or\nsignificant additional training time. Our extensive evaluation on two public\nbenchmark datasets, RAF-DB and AffectNet, shows that our proposed strategy can\nimprove layer-wise interpretability without degrading classification\nperformance. In addition, we explore a common type of interpretable classifiers\nthat rely on class activation mapping (CAM) methods, and show that our approach\ncan also improve CAM interpretability.\n","authors":["Soufiane Belharbi","Marco Pedersoli","Alessandro Lameiras Koerich","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2402.00281v3.pdf","comment":"15 pages, 11 figures, 3 tables, International Conference on Automatic\n Face and Gesture Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.16739v1","updated":"2024-04-25T16:49:10Z","published":"2024-04-25T16:49:10Z","title":"CBRW: A Novel Approach for Cancelable Biometric Template Generation\n based on Random Walk","summary":" Cancelable Biometric is a challenging research field in which the security of\nan original biometric image is ensured by transforming the original biometric\ninto another irreversible domain. Several approaches have been suggested in the\nliterature for generating cancelable biometric templates. In this paper, two\nnovel and simple cancelable biometric template generation methods based on\nRandom Walk (CBRW) have been proposed. By employing random walk and other steps\ngiven in the two proposed algorithms, viz. CBRW-BitXOR and CBRW-BitCMP, the\noriginal biometric is transformed into a cancelable template. The performance\nof the proposed methods is compared with other state-of-the-art methods.\nExperiments have been performed on eight publicly available gray and color\ndatasets, i.e. CP (ear) (gray and color), UTIRIS (iris) (gray and color), ORL\n(face) (gray), IIT Delhi (iris) (gray and color), and AR (face) (color).\nPerformance of the generated templates is measured in terms of Correlation\nCoefficient (Cr), Root Mean Square Error (RMSE), Peak Signal to Noise Ratio\n(PSNR), Structural Similarity (SSIM), Mean Absolute Error (MAE), Number of\nPixel Change Rate (NPCR), and Unified Average Changing Intensity (UACI). The\nexperimental results show that the proposed methods are superior\nto other state-of-the-art methods in both qualitative and quantitative\nanalysis. 
Furthermore, CBRW performs better on both gray and color\nimages.\n","authors":["Nitin Kumar"," Manisha"],"pdf_url":"https://arxiv.org/pdf/2404.16739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16718v1","updated":"2024-04-25T16:30:30Z","published":"2024-04-25T16:30:30Z","title":"Features Fusion for Dual-View Mammography Mass Detection","summary":" Detection of malignant lesions on mammography images is extremely important\nfor early breast cancer diagnosis. In clinical practice, images are acquired\nfrom two different angles, and radiologists can fully utilize information from\nboth views, simultaneously locating the same lesion. However, for automatic\ndetection approaches, such information fusion remains a challenge. In this\npaper, we propose a new model called MAMM-Net, which allows the processing of\nboth mammography views simultaneously by sharing information not only on an\nobject level, as seen in existing works, but also on a feature level.\nMAMM-Net's key component is the Fusion Layer, based on deformable attention and\ndesigned to increase detection precision while keeping high recall. Our\nexperiments show superior performance on the public DDSM dataset compared to\nthe previous state-of-the-art model, while introducing new helpful features\nsuch as pixel-level lesion annotation and classification of lesion\nmalignancy.\n","authors":["Arina Varlamova","Valery Belotsky","Grigory Novikov","Anton Konushin","Evgeny Sidorov"],"pdf_url":"https://arxiv.org/pdf/2404.16718v1.pdf","comment":"Accepted at ISBI 2024 (21st IEEE International Symposium on\n Biomedical Imaging)"},{"id":"http://arxiv.org/abs/2404.16717v1","updated":"2024-04-25T16:29:06Z","published":"2024-04-25T16:29:06Z","title":"Embracing Diversity: Interpretable Zero-shot classification beyond one\n vector per class","summary":" Vision-language models enable open-world classification of objects without\nthe need for any retraining. While this zero-shot paradigm marks a significant\nadvance, even today's best models exhibit skewed performance when objects are\ndissimilar from their typical depiction. Real-world objects such as pears\nappear in a variety of forms -- from diced to whole, on a table or in a bowl --\nyet standard VLM classifiers map all instances of a class to a single\nvector based on the class label. We argue that to represent this rich\ndiversity within a class, zero-shot classification should move beyond a single\nvector. We propose a method to encode and account for diversity within a class\nusing inferred attributes, still in the zero-shot setting without retraining.\nWe find our method consistently outperforms standard zero-shot classification\nover a large suite of datasets encompassing hierarchies, diverse object states,\nand real-world geographic diversity, as well as finer-grained datasets where\nintra-class diversity may be less prevalent. Importantly, our method is\ninherently interpretable, offering faithful explanations for each inference to\nfacilitate model debugging and enhance transparency. We also find our method\nscales efficiently to a large number of attributes to account for diversity --\nleading to more accurate predictions for atypical instances. Finally, we\ncharacterize a principled trade-off between overall and worst-class accuracy,\nwhich can be tuned via a hyperparameter of our method. 
We hope this work spurs\nfurther research into the promise of zero-shot classification beyond a single\nclass vector for capturing diversity in the world, and building transparent AI\nsystems without compromising performance.\n","authors":["Mazda Moayeri","Michael Rabbat","Mark Ibrahim","Diane Bouchacourt"],"pdf_url":"https://arxiv.org/pdf/2404.16717v1.pdf","comment":"Accepted to FAccT 2024"},{"id":"http://arxiv.org/abs/2404.16708v1","updated":"2024-04-25T16:13:59Z","published":"2024-04-25T16:13:59Z","title":"Multi-view Cardiac Image Segmentation via Trans-Dimensional Priors","summary":" We propose a novel multi-stage trans-dimensional architecture for multi-view\ncardiac image segmentation. Our method exploits the relationship between\nlong-axis (2D) and short-axis (3D) magnetic resonance (MR) images to perform a\nsequential 3D-to-2D-to-3D segmentation, segmenting the long-axis and short-axis\nimages. In the first stage, 3D segmentation is performed using the short-axis\nimage, and the prediction is transformed to the long-axis view and used as a\nsegmentation prior in the next stage. In the second step, the heart region is\nlocalized and cropped around the segmentation prior using a Heart Localization\nand Cropping (HLC) module, focusing the subsequent model on the heart region of\nthe image, where a 2D segmentation is performed. Similarly, we transform the\nlong-axis prediction to the short-axis view, localize and crop the heart region\nand again perform a 3D segmentation to refine the initial short-axis\nsegmentation. We evaluate our proposed method on the Multi-Disease, Multi-View\n& Multi-Center Right Ventricular Segmentation in Cardiac MRI (M&Ms-2) dataset,\nwhere our method outperforms state-of-the-art methods in segmenting cardiac\nregions of interest in both short-axis and long-axis images. The pre-trained\nmodels, source code, and implementation details will be publicly available.\n","authors":["Abbas Khan","Muhammad Asad","Martin Benning","Caroline Roney","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.16708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16139v2","updated":"2024-04-25T16:11:40Z","published":"2023-10-24T19:27:35Z","title":"Pix2HDR -- A pixel-wise acquisition and deep learning-based synthesis\n approach for high-speed HDR videos","summary":" Accurately capturing dynamic scenes with wide-ranging motion and light\nintensity is crucial for many vision applications. However, acquiring\nhigh-speed high dynamic range (HDR) video is challenging because the camera's\nframe rate restricts its dynamic range. Existing methods sacrifice speed to\nacquire multi-exposure frames. Yet, misaligned motion in these frames can still\npose complications for HDR fusion algorithms, resulting in artifacts. Instead\nof frame-based exposures, we sample the videos using individual pixels at\nvarying exposures and phase offsets. Implemented on a monochrome pixel-wise\nprogrammable image sensor, our sampling pattern simultaneously captures fast\nmotion at a high dynamic range. We then transform pixel-wise outputs into an\nHDR video using end-to-end learned weights from deep neural networks, achieving\nhigh spatiotemporal resolution with minimized motion blurring. We demonstrate\naliasing-free HDR video acquisition at 1000 FPS, resolving fast motion under\nlow-light conditions and against bright backgrounds - both challenging\nconditions for conventional cameras. 
By combining the versatility of pixel-wise\nsampling patterns with the strength of deep neural networks at decoding complex\nscenes, our method greatly enhances the vision system's adaptability and\nperformance in dynamic conditions.\n","authors":["Caixin Wang","Jie Zhang","Matthew A. Wilson","Ralph Etienne-Cummings"],"pdf_url":"https://arxiv.org/pdf/2310.16139v2.pdf","comment":"17 pages, 18 figures"},{"id":"http://arxiv.org/abs/2404.14560v2","updated":"2024-04-25T15:57:09Z","published":"2024-04-22T20:15:43Z","title":"Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced\n Analysis of Kidney Abnormalities in CT Scan Images using ensemble based\n Machine Learning Approach","summary":" The shortage of nephrologists and the growing public health concern over\nrenal failure have spurred the demand for AI systems capable of autonomously\ndetecting kidney abnormalities. Renal failure, marked by a gradual decline in\nkidney function, can result from factors like cysts, stones, and tumors.\nChronic kidney disease may go unnoticed initially, leading to untreated cases\nuntil they reach an advanced stage. The dataset, comprising 12,427 images from\nmultiple hospitals in Dhaka, was categorized into four groups: cyst, tumor,\nstone, and normal. Our methodology aims to enhance CT scan image quality using\nCropping, Resizing, and CALHE techniques, followed by feature extraction with\nour proposed Adaptive Local Binary Pattern (A-LBP) feature extraction method\ncompared with the state-of-the-art local binary pattern (LBP) method. Our\nproposed features fed into classifiers such as Random Forest, Decision Tree,\nNaive Bayes, K-Nearest Neighbor, and SVM. We explored an ensemble model with\nsoft voting to get a more robust model for our task. We got the highest of more\nthan 99% in accuracy using our feature descriptor and ensembling five\nclassifiers (Random Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor,\nSupport Vector Machine) with the soft voting method.\n","authors":["Tahmim Hossain","Faisal Sayed","Solehin Islam"],"pdf_url":"https://arxiv.org/pdf/2404.14560v2.pdf","comment":"17 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.16687v1","updated":"2024-04-25T15:36:18Z","published":"2024-04-25T15:36:18Z","title":"NTIRE 2024 Quality Assessment of AI-Generated Content Challenge","summary":" This paper reports on the NTIRE 2024 Quality Assessment of AI-Generated\nContent Challenge, which will be held in conjunction with the New Trends in\nImage Restoration and Enhancement Workshop (NTIRE) at CVPR 2024. This challenge\nis to address a major challenge in the field of image and video processing,\nnamely, Image Quality Assessment (IQA) and Video Quality Assessment (VQA) for\nAI-Generated Content (AIGC). The challenge is divided into the image track and\nthe video track. The image track uses the AIGIQA-20K, which contains 20,000\nAI-Generated Images (AIGIs) generated by 15 popular generative models. The\nimage track has a total of 318 registered participants. A total of 1,646\nsubmissions are received in the development phase, and 221 submissions are\nreceived in the test phase. Finally, 16 participating teams submitted their\nmodels and fact sheets. The video track uses the T2VQA-DB, which contains\n10,000 AI-Generated Videos (AIGVs) generated by 9 popular Text-to-Video (T2V)\nmodels. A total of 196 participants have registered in the video track. A total\nof 991 submissions are received in the development phase, and 185 submissions\nare received in the test phase. 
Finally, 12 participating teams submitted their\nmodels and fact sheets. Some methods have achieved better results than baseline\nmethods, and the winning methods in both tracks have demonstrated superior\nprediction performance on AIGC.\n","authors":["Xiaohong Liu","Xiongkuo Min","Guangtao Zhai","Chunyi Li","Tengchuan Kou","Wei Sun","Haoning Wu","Yixuan Gao","Yuqin Cao","Zicheng Zhang","Xiele Wu","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2404.16687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16685v1","updated":"2024-04-25T15:33:23Z","published":"2024-04-25T15:33:23Z","title":"Multi-scale HSV Color Feature Embedding for High-fidelity NIR-to-RGB\n Spectrum Translation","summary":" The NIR-to-RGB spectral domain translation is a formidable task due to the\ninherent spectral mapping ambiguities within NIR inputs and RGB outputs. Thus,\nexisting methods fail to reconcile the tension between maintaining texture\ndetail fidelity and achieving diverse color variations. In this paper, we\npropose a Multi-scale HSV Color Feature Embedding Network (MCFNet) that\ndecomposes the mapping process into three sub-tasks, including NIR texture\nmaintenance, coarse geometry reconstruction, and RGB color prediction. Thus, we\npropose three key modules for each corresponding sub-task: the Texture\nPreserving Block (TPB), the HSV Color Feature Embedding Module (HSV-CFEM), and\nthe Geometry Reconstruction Module (GRM). These modules contribute to our\nMCFNet methodically tackling spectral translation through a series of\nescalating resolutions, progressively enriching images with color and texture\nfidelity in a scale-coherent fashion. The proposed MCFNet demonstrates\nsubstantial performance gains over the NIR image colorization task. Code is\nreleased at: https://github.com/AlexYangxx/MCFNet.\n","authors":["Huiyu Zhai","Mo Chen","Xingxing Yang","Gusheng Kang"],"pdf_url":"https://arxiv.org/pdf/2404.16685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16678v1","updated":"2024-04-25T15:28:22Z","published":"2024-04-25T15:28:22Z","title":"Multimodal Semantic-Aware Automatic Colorization with Diffusion Prior","summary":" Colorizing grayscale images offers an engaging visual experience. Existing\nautomatic colorization methods often fail to generate satisfactory results due\nto incorrect semantic colors and unsaturated colors. In this work, we propose\nan automatic colorization pipeline to overcome these challenges. We leverage\nthe extraordinary generative ability of the diffusion prior to synthesize color\nwith plausible semantics. To overcome the artifacts introduced by the diffusion\nprior, we apply the luminance conditional guidance. Moreover, we adopt\nmultimodal high-level semantic priors to help the model understand the image\ncontent and deliver saturated colors. Besides, a luminance-aware decoder is\ndesigned to restore details and enhance overall visual quality. 
The proposed\npipeline synthesizes saturated colors while maintaining plausible semantics.\nExperiments indicate that our proposed method considers both diversity and\nfidelity, surpassing previous methods in terms of perceptual realism and gain\nmost human preference.\n","authors":["Han Wang","Xinning Chai","Yiwen Wang","Yuhong Zhang","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2404.16678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16122v2","updated":"2024-04-25T15:27:59Z","published":"2023-06-28T11:47:08Z","title":"Semantic Positive Pairs for Enhancing Visual Representation Learning of\n Instance Discrimination methods","summary":" Self-supervised learning algorithms (SSL) based on instance discrimination\nhave shown promising results, performing competitively or even outperforming\nsupervised learning counterparts in some downstream tasks. Such approaches\nemploy data augmentation to create two views of the same instance (i.e.,\npositive pairs) and encourage the model to learn good representations by\nattracting these views closer in the embedding space without collapsing to the\ntrivial solution. However, data augmentation is limited in representing\npositive pairs, and the repulsion process between the instances during\ncontrastive learning may discard important features for instances that have\nsimilar categories. To address this issue, we propose an approach to identify\nthose images with similar semantic content and treat them as positive\ninstances, thereby reducing the chance of discarding important features during\nrepresentation learning and increasing the richness of the latent\nrepresentation. Our approach is generic and could work with any self-supervised\ninstance discrimination frameworks such as MoCo and SimSiam. To evaluate our\nmethod, we run experiments on three benchmark datasets: ImageNet, STL-10 and\nCIFAR-10 with different instance discrimination SSL approaches. The\nexperimental results show that our approach consistently outperforms the\nbaseline methods across all three datasets; for instance, we improve upon the\nvanilla MoCo-v2 by 4.1% on ImageNet under a linear evaluation protocol over 800\nepochs. We also report results on semi-supervised learning, transfer learning\non downstream tasks, and object detection.\n","authors":["Mohammad Alkhalefi","Georgios Leontidis","Mingjun Zhong"],"pdf_url":"https://arxiv.org/pdf/2306.16122v2.pdf","comment":"17 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2306.08832v4","updated":"2024-04-25T15:24:11Z","published":"2023-06-15T03:26:28Z","title":"Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to\n Enhance Visio-Linguistic Compositional Understanding","summary":" Vision-Language Models (VLMs), such as CLIP, exhibit strong image-text\ncomprehension abilities, facilitating advances in several downstream tasks such\nas zero-shot image classification, image-text retrieval, and text-to-image\ngeneration. However, the compositional reasoning abilities of existing VLMs\nremains subpar. The root of this limitation lies in the inadequate alignment\nbetween the images and captions in the pretraining datasets. Additionally, the\ncurrent contrastive learning objective fails to focus on fine-grained grounding\ncomponents like relations, actions, and attributes, resulting in \"bag-of-words\"\nrepresentations. We introduce a simple and effective method to improve\ncompositional reasoning in VLMs. 
Our method better leverages available datasets\nby refining and expanding the standard image-text contrastive learning\nframework. Our approach does not require specific annotations and does not\nincur extra parameters. When integrated with CLIP, our technique yields notable\nimprovement over state-of-the-art baselines across five vision-language\ncompositional benchmarks. We open-source our code at\nhttps://github.com/lezhang7/Enhance-FineGrained.\n","authors":["Le Zhang","Rabiul Awal","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2306.08832v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16670v1","updated":"2024-04-25T15:15:36Z","published":"2024-04-25T15:15:36Z","title":"EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning","summary":" Visual Instruction Tuning represents a novel learning paradigm involving the\nfine-tuning of pre-trained language models using task-specific instructions.\nThis paradigm shows promising zero-shot results in various natural language\nprocessing tasks but is still unexplored in vision emotion understanding. In\nthis work, we focus on enhancing the model's proficiency in understanding and\nadhering to instructions related to emotional contexts. Initially, we identify\nkey visual clues critical to visual emotion recognition. Subsequently, we\nintroduce a novel GPT-assisted pipeline for generating emotion visual\ninstruction data, effectively addressing the scarcity of annotated instruction\ndata in this domain. Expanding on the groundwork established by InstructBLIP,\nour proposed EmoVIT architecture incorporates emotion-specific instruction\ndata, leveraging the powerful capabilities of Large Language Models to enhance\nperformance. Through extensive experiments, our model showcases its proficiency\nin emotion classification, adeptness in affective reasoning, and competence in\ncomprehending humor. The comparative analysis provides a robust benchmark for\nEmotion Visual Instruction Tuning in the era of LLMs, providing valuable\ninsights and opening avenues for future exploration in this domain. Our code is\navailable at \\url{https://github.com/aimmemotion/EmoVIT}.\n","authors":["Hongxia Xie","Chu-Jun Peng","Yu-Wen Tseng","Hung-Jen Chen","Chan-Feng Hsu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.16670v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16666v1","updated":"2024-04-25T15:06:58Z","published":"2024-04-25T15:06:58Z","title":"PhyRecon: Physically Plausible Neural Scene Reconstruction","summary":" While neural implicit representations have gained popularity in multi-view 3D\nreconstruction, previous work struggles to yield physically plausible results,\nthereby limiting their applications in physics-demanding domains like embodied\nAI and robotics. The lack of plausibility originates from both the absence of\nphysics modeling in the existing pipeline and their inability to recover\nintricate geometrical structures. In this paper, we introduce PhyRecon, which\nstands as the first approach to harness both differentiable rendering and\ndifferentiable physics simulation to learn implicit surface representations.\nOur framework proposes a novel differentiable particle-based physical simulator\nseamlessly integrated with the neural implicit representation. 
At its core is\nan efficient transformation between SDF-based implicit representation and\nexplicit surface points by our proposed algorithm, Surface Points Marching\nCubes (SP-MC), enabling differentiable learning with both rendering and\nphysical losses. Moreover, we model both rendering and physical uncertainty to\nidentify and compensate for the inconsistent and inaccurate monocular geometric\npriors. The physical uncertainty additionally enables a physics-guided pixel\nsampling to enhance the learning of slender structures. By amalgamating these\ntechniques, our model facilitates efficient joint modeling with appearance,\ngeometry, and physics. Extensive experiments demonstrate that PhyRecon\nsignificantly outperforms all state-of-the-art methods in terms of\nreconstruction quality. Our reconstruction results also yield superior physical\nstability, verified by Isaac Gym, with at least a 40% improvement across all\ndatasets, opening broader avenues for future physics-based applications.\n","authors":["Junfeng Ni","Yixin Chen","Bohan Jing","Nan Jiang","Bin Wang","Bo Dai","Yixin Zhu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.16666v1.pdf","comment":"project page: https://phyrecon.github.io/"},{"id":"http://arxiv.org/abs/2404.16637v1","updated":"2024-04-25T14:24:41Z","published":"2024-04-25T14:24:41Z","title":"Zero-Shot Distillation for Image Encoders: How to Make Effective Use of\n Synthetic Data","summary":" Multi-modal foundation models such as CLIP have showcased impressive\nzero-shot capabilities. However, their applicability in resource-constrained\nenvironments is limited due to their large number of parameters and high\ninference time. While existing approaches have scaled down the entire CLIP\narchitecture, we focus on training smaller variants of the image encoder, which\nsuffices for efficient zero-shot classification. The use of synthetic data has\nshown promise in distilling representations from larger teachers, resulting in\nstrong few-shot and linear probe performance. However, we find that this\napproach surprisingly fails in true zero-shot settings when using contrastive\nlosses. We identify the exploitation of spurious features as being responsible\nfor poor generalization between synthetic and real data. However, by using the\nimage feature-based L2 distillation loss, we mitigate these problems and train\nstudents that achieve zero-shot performance which on four domain-specific\ndatasets is on-par with a ViT-B/32 teacher model trained on DataCompXL, while\nfeaturing up to 92% fewer parameters.\n","authors":["Niclas Popp","Jan Hendrik Metzen","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2404.16637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16635v1","updated":"2024-04-25T14:23:24Z","published":"2024-04-25T14:23:24Z","title":"TinyChart: Efficient Chart Understanding with Visual Token Merging and\n Program-of-Thoughts Learning","summary":" Charts are important for presenting and explaining complex data\nrelationships. Recently, multimodal large language models (MLLMs) have shown\nremarkable capabilities in various chart understanding tasks. However, the\nsheer size of these models in terms of parameters and computational\nrequirements limits their use in resource-constrained environments. In this\npaper, we present TinyChart, an efficient MLLM for chart understanding with\nonly 3B parameters. 
TinyChart overcomes two key challenges in efficient chart\nunderstanding: (1) reduce the burden of learning numerical computations through\na Program-of-Thoughts (PoT) learning strategy, which trains the model to\ngenerate Python programs for numerical calculations, and (2) reduce lengthy\nvision feature sequences produced by the vision transformer for high-resolution\nimages through a Vision Token Merging module, which gradually merges most\nsimilar vision tokens. Extensive experiments demonstrate that our 3B TinyChart\nachieves SOTA performance on a variety of chart understanding benchmarks\nincluding ChartQA, Chart-to-Text, Chart-to-Table, OpenCQA, and ChartX. It\noutperforms several chart understanding MLLM with up to 13B parameters such as\nChartLlama and ChartAst, and close-sourced general-purpose MLLM GPT-4V on\nChartQA. It also demonstrates its superior efficiency with higher throughput\nduring inference due to a smaller model scale and more efficient vision\nencoding. Our code and model are available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/TinyChart.\n","authors":["Liang Zhang","Anwen Hu","Haiyang Xu","Ming Yan","Yichen Xu","Qin Jin","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2404.16635v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.16633v1","updated":"2024-04-25T14:22:44Z","published":"2024-04-25T14:22:44Z","title":"Self-Balanced R-CNN for Instance Segmentation","summary":" Current state-of-the-art two-stage models on instance segmentation task\nsuffer from several types of imbalances. In this paper, we address the\nIntersection over the Union (IoU) distribution imbalance of positive input\nRegions of Interest (RoIs) during the training of the second stage. Our\nSelf-Balanced R-CNN (SBR-CNN), an evolved version of the Hybrid Task Cascade\n(HTC) model, brings brand new loop mechanisms of bounding box and mask\nrefinements. With an improved Generic RoI Extraction (GRoIE), we also address\nthe feature-level imbalance at the Feature Pyramid Network (FPN) level,\noriginated by a non-uniform integration between low- and high-level features\nfrom the backbone layers. In addition, the redesign of the architecture heads\ntoward a fully convolutional approach with FCC further reduces the number of\nparameters and obtains more clues to the connection between the task to solve\nand the layers used. Moreover, our SBR-CNN model shows the same or even better\nimprovements if adopted in conjunction with other state-of-the-art models. In\nfact, with a lightweight ResNet-50 as backbone, evaluated on COCO minival 2017\ndataset, our model reaches 45.3% and 41.5% AP for object detection and instance\nsegmentation, with 12 epochs and without extra tricks. The code is available at\nhttps://github.com/IMPLabUniPr/mmdetection/tree/sbr_cnn\n","authors":["Leonardo Rossi","Akbar Karimi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2404.16633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16622v1","updated":"2024-04-25T14:07:52Z","published":"2024-04-25T14:07:52Z","title":"DAVE -- A Detect-and-Verify Paradigm for Low-Shot Counting","summary":" Low-shot counters estimate the number of objects corresponding to a selected\ncategory, based on only few or no exemplars annotated in the image. The current\nstate-of-the-art estimates the total counts as the sum over the object location\ndensity map, but does not provide individual object locations and sizes, which\nare crucial for many applications. 
This is addressed by detection-based\ncounters, which, however, fall behind in total count accuracy. Furthermore,\nboth approaches tend to overestimate the counts in the presence of other object\nclasses due to many false positives. We propose DAVE, a low-shot counter based\non a detect-and-verify paradigm, that avoids the aforementioned issues by first\ngenerating a high-recall detection set and then verifying the detections to\nidentify and remove the outliers. This jointly increases the recall and\nprecision, leading to accurate counts. DAVE outperforms the top density-based\ncounters by ~20% in the total count MAE, outperforms the most recent\ndetection-based counter by ~20% in detection quality, and sets a new\nstate-of-the-art in zero-shot as well as text-prompt-based counting.\n","authors":["Jer Pelhan","Alan Lukežič","Vitjan Zavrtanik","Matej Kristan"],"pdf_url":"https://arxiv.org/pdf/2404.16622v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.16617v1","updated":"2024-04-25T13:56:54Z","published":"2024-04-25T13:56:54Z","title":"Denoising: from classical methods to deep CNNs","summary":" This paper aims to explore the evolution of image denoising in a\npedagogical way. We briefly review classical methods such as Fourier analysis\nand wavelet bases, highlighting the challenges they faced until the emergence\nof neural networks, notably the U-Net, in the 2010s. The remarkable performance\nof these networks has been demonstrated in studies such as Kadkhodaie et al.\n(2024). They exhibit adaptability to various image types, including those with\nfixed regularity, facial images, and bedroom scenes, achieving optimal results\nwith a bias towards geometry-adaptive harmonic bases. The introduction of score\ndiffusion has played a crucial role in image generation. In this context,\ndenoising becomes essential as it facilitates the estimation of probability\ndensity scores. We discuss the prerequisites for genuine learning of\nprobability densities, offering insights that extend from mathematical research\nto the implications of universal structures.\n","authors":["Jean-Eric Campagne"],"pdf_url":"https://arxiv.org/pdf/2404.16617v1.pdf","comment":"33 pages, 33 figures"},{"id":"http://arxiv.org/abs/2404.16612v1","updated":"2024-04-25T13:51:38Z","published":"2024-04-25T13:51:38Z","title":"MuseumMaker: Continual Style Customization without Catastrophic\n Forgetting","summary":" Pre-trained large text-to-image (T2I) models with an appropriate text prompt\nhave attracted growing interest in the customized image generation field.\nHowever, the catastrophic forgetting issue makes it hard to continually\nsynthesize new user-provided styles while retaining satisfying results for\npreviously learned styles. In this paper, we propose MuseumMaker, a method that\nenables the synthesis of images by following a set of customized styles in a\nnever-ending manner, gradually accumulating these creative artistic works as a\nMuseum. When facing a new customization style, we develop a style distillation\nloss module to transfer the style of the whole dataset into the generation of\nimages. It can minimize the learning biases caused by the content of images and\naddress the catastrophic overfitting issue induced by few-shot images. To deal\nwith catastrophic forgetting amongst past learned styles, we devise a dual\nregularization for the shared-LoRA module to optimize the direction of model\nupdates, which can regularize the diffusion model from both the weight and feature\naspects. 
Meanwhile, a unique token embedding corresponding to\nthis new style is learned by a task-wise token learning module, which can\npreserve historical knowledge from past styles under the limitation of LoRA\nparameter quantity. As any new user-provided style comes in, our MuseumMaker can\ncapture the nuances of the new styles while maintaining the details of learned\nstyles. Experimental results on diverse style datasets validate the\neffectiveness of our proposed MuseumMaker method, showcasing its robustness and\nversatility across various scenarios.\n","authors":["Chenxi Liu","Gan Sun","Wenqi Liang","Jiahua Dong","Can Qin","Yang Cong"],"pdf_url":"https://arxiv.org/pdf/2404.16612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16609v1","updated":"2024-04-25T13:49:42Z","published":"2024-04-25T13:49:42Z","title":"SFMViT: SlowFast Meet ViT in Chaotic World","summary":" Spatiotemporal action localization in chaotic scenes is a\nchallenging task toward advanced video understanding. Paving the way with\nhigh-quality video feature extraction and enhancing the precision of\ndetector-predicted anchors can effectively improve model performance. To this\nend, we propose a high-performance dual-stream spatiotemporal feature\nextraction network, SFMViT, with an anchor pruning strategy. The backbone of our\nSFMViT is composed of ViT and SlowFast with prior knowledge of spatiotemporal\naction localization, which fully utilizes ViT's excellent global feature\nextraction capabilities and SlowFast's spatiotemporal sequence modeling\ncapabilities. Secondly, we introduce a confidence maximum heap to prune the\nanchors detected in each frame and filter out the effective\nanchors. These designs enable our SFMViT to achieve an mAP of 26.62% on the\nChaotic World dataset, far exceeding existing models. Code is available at\nhttps://github.com/jfightyr/SlowFast-Meet-ViT.\n","authors":["Jiaying Lin","Jiajun Wen","Mengyuan Liu","Jinfu Liu","Baiqiao Yin","Yue Li"],"pdf_url":"https://arxiv.org/pdf/2404.16609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16581v1","updated":"2024-04-25T12:55:58Z","published":"2024-04-25T12:55:58Z","title":"AudioScenic: Audio-Driven Video Scene Editing","summary":" Audio-driven visual scene editing endeavors to manipulate the visual\nbackground while leaving the foreground content unchanged, according to the\ngiven audio signals. Unlike current efforts focusing primarily on image\nediting, audio-driven video scene editing has not been extensively addressed.\nIn this paper, we introduce AudioScenic, an audio-driven framework designed for\nvideo scene editing. AudioScenic integrates audio semantics into the visual\nscene through a temporal-aware audio semantic injection process. As our focus\nis on background editing, we further introduce a SceneMasker module, which\nmaintains the integrity of the foreground content during the editing process.\nAudioScenic exploits the inherent properties of audio, namely, audio magnitude\nand frequency, to guide the editing process, aiming to control the temporal\ndynamics and enhance the temporal consistency. First, we present an audio\nMagnitude Modulator module that adjusts the temporal dynamics of the scene in\nresponse to changes in audio magnitude, enhancing the visual dynamics. Second,\nthe audio Frequency Fuser module is designed to ensure temporal consistency by\naligning the frequency of the audio with the dynamics of the video scenes, thus\nimproving the overall temporal coherence of the edited videos. 
These integrated\nfeatures enable AudioScenic to not only enhance visual diversity but also\nmaintain temporal consistency throughout the video. We present a new metric\nnamed temporal score for more comprehensive validation of temporal consistency.\nWe demonstrate substantial advancements of AudioScenic over competing methods\non DAVIS and Audioset datasets.\n","authors":["Kaixin Shen","Ruijie Quan","Linchao Zhu","Jun Xiao","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.16581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15891v2","updated":"2024-04-25T12:52:37Z","published":"2024-04-24T14:29:26Z","title":"OMEGAS: Object Mesh Extraction from Large Scenes Guided by Gaussian\n Segmentation","summary":" Recent advancements in 3D reconstruction technologies have paved the way for\nhigh-quality and real-time rendering of complex 3D scenes. Despite these\nachievements, a notable challenge persists: it is difficult to precisely\nreconstruct specific objects from large scenes. Current scene reconstruction\ntechniques frequently result in the loss of object detail textures and are\nunable to reconstruct object portions that are occluded or unseen in views. To\naddress this challenge, we delve into the meticulous 3D reconstruction of\nspecific objects within large scenes and propose a framework termed OMEGAS:\nObject Mesh Extraction from Large Scenes Guided by GAussian Segmentation.\nOMEGAS employs a multi-step approach, grounded in several excellent\noff-the-shelf methodologies. Specifically, initially, we utilize the Segment\nAnything Model (SAM) to guide the segmentation of 3D Gaussian Splatting (3DGS),\nthereby creating a basic 3DGS model of the target object. Then, we leverage\nlarge-scale diffusion priors to further refine the details of the 3DGS model,\nespecially aimed at addressing invisible or occluded object portions from the\noriginal scene views. Subsequently, by re-rendering the 3DGS model onto the\nscene views, we achieve accurate object segmentation and effectively remove the\nbackground. Finally, these target-only images are used to improve the 3DGS\nmodel further and extract the definitive 3D object mesh by the SuGaR model. In\nvarious scenarios, our experiments demonstrate that OMEGAS significantly\nsurpasses existing scene reconstruction methods. Our project page is at:\nhttps://github.com/CrystalWlz/OMEGAS\n","authors":["Lizhi Wang","Feng Zhou","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2404.15891v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2311.17061 by other authors"},{"id":"http://arxiv.org/abs/2404.16578v1","updated":"2024-04-25T12:46:23Z","published":"2024-04-25T12:46:23Z","title":"Road Surface Friction Estimation for Winter Conditions Utilising General\n Visual Features","summary":" In below freezing winter conditions, road surface friction can greatly vary\nbased on the mixture of snow, ice, and water on the road. Friction between the\nroad and vehicle tyres is a critical parameter defining vehicle dynamics, and\ntherefore road surface friction information is essential to acquire for several\nintelligent transportation applications, such as safe control of automated\nvehicles or alerting drivers of slippery road conditions. This paper explores\ncomputer vision-based evaluation of road surface friction from roadside\ncameras. Previous studies have extensively investigated the application of\nconvolutional neural networks for the task of evaluating the road surface\ncondition from images. 
Here, we propose a hybrid deep learning architecture,\nWCamNet, consisting of a pretrained visual transformer model and convolutional\nblocks. The motivation of the architecture is to combine general visual\nfeatures provided by the transformer model, as well as finetuned feature\nextraction properties of the convolutional blocks. To benchmark the approach,\nan extensive dataset was gathered from national Finnish road infrastructure\nnetwork of roadside cameras and optical road surface friction sensors. Acquired\nresults highlight that the proposed WCamNet outperforms previous approaches in\nthe task of predicting the road surface friction from the roadside camera\nimages.\n","authors":["Risto Ojala","Eerik Alamikkotervo"],"pdf_url":"https://arxiv.org/pdf/2404.16578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16573v1","updated":"2024-04-25T12:35:27Z","published":"2024-04-25T12:35:27Z","title":"Multi-Scale Representations by Varying Window Attention for Semantic\n Segmentation","summary":" Multi-scale learning is central to semantic segmentation. We visualize the\neffective receptive field (ERF) of canonical multi-scale representations and\npoint out two risks in learning them: scale inadequacy and field inactivation.\nA novel multi-scale learner, varying window attention (VWA), is presented to\naddress these issues. VWA leverages the local window attention (LWA) and\ndisentangles LWA into the query window and context window, allowing the\ncontext's scale to vary for the query to learn representations at multiple\nscales. However, varying the context to large-scale windows (enlarging ratio R)\ncan significantly increase the memory footprint and computation cost (R^2 times\nlarger than LWA). We propose a simple but professional re-scaling strategy to\nzero the extra induced cost without compromising performance. Consequently, VWA\nuses the same cost as LWA to overcome the receptive limitation of the local\nwindow. Furthermore, depending on VWA and employing various MLPs, we introduce\na multi-scale decoder (MSD), VWFormer, to improve multi-scale representations\nfor semantic segmentation. VWFormer achieves efficiency competitive with the\nmost compute-friendly MSDs, like FPN and MLP decoder, but performs much better\nthan any MSDs. For instance, using nearly half of UPerNet's computation,\nVWFormer outperforms it by 1.0%-2.5% mIoU on ADE20K. With little extra\noverhead, ~10G FLOPs, Mask2Former armed with VWFormer improves by 1.0%-1.3%.\n","authors":["Haotian Yan","Ming Wu","Chuang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16573v1.pdf","comment":"ICLR2024 Poster"},{"id":"http://arxiv.org/abs/2404.16571v1","updated":"2024-04-25T12:34:23Z","published":"2024-04-25T12:34:23Z","title":"MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth\n Estimation of Endoscopic Images","summary":" Photometric constraint is indispensable for self-supervised monocular depth\nestimation. It involves warping a source image onto a target view using\nestimated depth&pose, and then minimizing the difference between the warped and\ntarget images. However, the endoscopic built-in light causes significant\nbrightness fluctuations, and thus makes the photometric constraint unreliable.\nPrevious efforts only mitigate this relying on extra models to calibrate image\nbrightness. In this paper, we propose MonoPCC to address the brightness\ninconsistency radically by reshaping the photometric constraint into a cycle\nform. 
Instead of only warping the source image, MonoPCC constructs a closed\nloop consisting of two opposite forward-backward warping paths: from target to\nsource and then back to target. Thus, the target image finally receives an\nimage cycle-warped from itself, which naturally makes the constraint invariant\nto brightness changes. Moreover, MonoPCC transplants the source image's\nphase-frequency into the intermediate warped image to avoid structure loss, and\nalso stabilizes the training via an exponential moving average (EMA) strategy\nto avoid frequent changes in the forward warping. The comprehensive and\nextensive experimental results on three datasets demonstrate that our proposed\nMonoPCC shows great robustness to the brightness inconsistency, and exceeds\nother state-of-the-art methods by reducing the absolute relative error by at least\n7.27%.\n","authors":["Zhiwei Wang","Ying Zhou","Shiquan He","Ting Li","Yitong Zhang","Xinxia Feng","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.16571v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.16561v1","updated":"2024-04-25T12:18:04Z","published":"2024-04-25T12:18:04Z","title":"Research on geometric figure classification algorithm based on Deep\n Learning","summary":" In recent years, with the rapid development of computer information\ntechnology, the development of artificial intelligence has been accelerating.\nThe traditional geometry recognition technology is relatively backward and the\nrecognition rate is low. In the face of a massive information database, the\ntraditional algorithm model inevitably has the problems of low recognition\naccuracy and poor performance. Deep learning theory has gradually become a very\nimportant part of machine learning. The implementation of convolutional neural\nnetworks (CNNs) reduces the difficulty of the graphics generation algorithm. In this\npaper, using the advantages of the LeNet-5 architecture, namely weight sharing and feature\nextraction and classification, the proposed geometric pattern recognition\nalgorithm model is faster on the training data set. By constructing the shared\nfeature parameters of the algorithm model, the cross-entropy loss function is\nused in the recognition process to improve the generalization of the model and\nimprove the average recognition accuracy on the test data set.\n","authors":["Ruiyang Wang","Haonan Wang","Junfeng Sun","Mingjia Zhao","Meng Liu"],"pdf_url":"https://arxiv.org/pdf/2404.16561v1.pdf","comment":"6 pages,9 figures"},{"id":"http://arxiv.org/abs/2404.16558v1","updated":"2024-04-25T12:15:11Z","published":"2024-04-25T12:15:11Z","title":"DeepKalPose: An Enhanced Deep-Learning Kalman Filter for Temporally\n Consistent Monocular Vehicle Pose Estimation","summary":" This paper presents DeepKalPose, a novel approach for enhancing temporal\nconsistency in monocular vehicle pose estimation applied to video through a\ndeep-learning-based Kalman Filter. By integrating a Bi-directional Kalman\nfilter strategy utilizing forward and backward time-series processing, combined\nwith a learnable motion model to represent complex motion patterns, our method\nsignificantly improves pose accuracy and robustness across various conditions,\nparticularly for occluded or distant vehicles. 
Experimental validation on the\nKITTI dataset confirms that DeepKalPose outperforms existing methods in both\npose accuracy and temporal consistency.\n","authors":["Leandro Di Bella","Yangxintong Lyu","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2404.16558v1.pdf","comment":"4 pages, 3 Figures, published to IET Electronic Letters"},{"id":"http://arxiv.org/abs/2404.16557v1","updated":"2024-04-25T12:11:38Z","published":"2024-04-25T12:11:38Z","title":"Energy-Latency Manipulation of Multi-modal Large Language Models via\n Verbose Samples","summary":" Despite the exceptional performance of multi-modal large language models\n(MLLMs), their deployment requires substantial computational resources. Once\nmalicious users induce high energy consumption and latency time (energy-latency\ncost), it will exhaust computational resources and harm availability of\nservice. In this paper, we investigate this vulnerability for MLLMs,\nparticularly image-based and video-based ones, and aim to induce high\nenergy-latency cost during inference by crafting an imperceptible perturbation.\nWe find that high energy-latency cost can be manipulated by maximizing the\nlength of generated sequences, which motivates us to propose verbose samples,\nincluding verbose images and videos. Concretely, two modality non-specific\nlosses are proposed, including a loss to delay end-of-sequence (EOS) token and\nan uncertainty loss to increase the uncertainty over each generated token. In\naddition, improving diversity is important to encourage longer responses by\nincreasing the complexity, which inspires the following modality specific loss.\nFor verbose images, a token diversity loss is proposed to promote diverse\nhidden states. For verbose videos, a frame feature diversity loss is proposed\nto increase the feature diversity among frames. To balance these losses, we\npropose a temporal weight adjustment algorithm. Experiments demonstrate that\nour verbose samples can largely extend the length of generated sequences.\n","authors":["Kuofeng Gao","Jindong Gu","Yang Bai","Shu-Tao Xia","Philip Torr","Wei Liu","Zhifeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.16557v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.11170"},{"id":"http://arxiv.org/abs/2404.16556v1","updated":"2024-04-25T12:11:28Z","published":"2024-04-25T12:11:28Z","title":"Conditional Distribution Modelling for Few-Shot Image Synthesis with\n Diffusion Models","summary":" Few-shot image synthesis entails generating diverse and realistic images of\nnovel categories using only a few example images. While multiple recent efforts\nin this direction have achieved impressive results, the existing approaches are\ndependent only upon the few novel samples available at test time in order to\ngenerate new images, which restricts the diversity of the generated images. To\novercome this limitation, we propose Conditional Distribution Modelling (CDM)\n-- a framework which effectively utilizes Diffusion models for few-shot image\ngeneration. By modelling the distribution of the latent space used to condition\na Diffusion process, CDM leverages the learnt statistics of the training data\nto get a better approximation of the unseen class distribution, thereby\nremoving the bias arising due to limited number of few shot samples.\nSimultaneously, we devise a novel inversion based optimization strategy that\nfurther improves the approximated unseen class distribution, and ensures the\nfidelity of the generated samples to the unseen class. 
The experimental results\non four benchmark datasets demonstrate the effectiveness of our proposed CDM\nfor few-shot generation.\n","authors":["Parul Gupta","Munawar Hayat","Abhinav Dhall","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2404.16556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16552v1","updated":"2024-04-25T12:09:16Z","published":"2024-04-25T12:09:16Z","title":"Efficient Solution of Point-Line Absolute Pose","summary":" We revisit certain problems of pose estimation based on 3D--2D\ncorrespondences between features which may be points or lines. Specifically, we\naddress the two previously-studied minimal problems of estimating camera\nextrinsics from $p \\in \\{ 1, 2 \\}$ point--point correspondences and $l=3-p$\nline--line correspondences. To the best of our knowledge, all of the\npreviously-known practical solutions to these problems required computing the\nroots of degree $\\ge 4$ (univariate) polynomials when $p=2$, or degree $\\ge 8$\npolynomials when $p=1.$ We describe and implement two elementary solutions\nwhich reduce the degrees of the needed polynomials from $4$ to $2$ and from $8$\nto $4$, respectively. We show experimentally that the resulting solvers are\nnumerically stable and fast: when compared to the previous state-of-the art, we\nmay obtain nearly an order of magnitude speedup. The code is available at\n\\url{https://github.com/petrhruby97/efficient\\_absolute}\n","authors":["Petr Hruby","Timothy Duff","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2404.16552v1.pdf","comment":"CVPR 2024, 11 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.16548v1","updated":"2024-04-25T12:04:31Z","published":"2024-04-25T12:04:31Z","title":"Cross-Domain Spatial Matching for Camera and Radar Sensor Data Fusion in\n Autonomous Vehicle Perception System","summary":" In this paper, we propose a novel approach to address the problem of camera\nand radar sensor fusion for 3D object detection in autonomous vehicle\nperception systems. Our approach builds on recent advances in deep learning and\nleverages the strengths of both sensors to improve object detection\nperformance. Precisely, we extract 2D features from camera images using a\nstate-of-the-art deep learning architecture and then apply a novel Cross-Domain\nSpatial Matching (CDSM) transformation method to convert these features into 3D\nspace. We then fuse them with extracted radar data using a complementary fusion\nstrategy to produce a final 3D object representation. To demonstrate the\neffectiveness of our approach, we evaluate it on the NuScenes dataset. We\ncompare our approach to both single-sensor performance and current\nstate-of-the-art fusion methods. 
Our results show that the proposed approach\nachieves superior performance over single-sensor solutions and could directly\ncompete with other top-level fusion methods.\n","authors":["Daniel Dworak","Mateusz Komorkiewicz","Paweł Skruch","Jerzy Baranowski"],"pdf_url":"https://arxiv.org/pdf/2404.16548v1.pdf","comment":"12 pages including highlights and graphical abstract, submitted to\n Expert Systems with Applications journal"},{"id":"http://arxiv.org/abs/2404.16538v1","updated":"2024-04-25T11:53:36Z","published":"2024-04-25T11:53:36Z","title":"OpenDlign: Enhancing Open-World 3D Learning with Depth-Aligned Images","summary":" Recent advances in Vision and Language Models (VLMs) have improved open-world\n3D representation, facilitating 3D zero-shot capability in unseen categories.\nExisting open-world methods pre-train an extra 3D encoder to align features\nfrom 3D data (e.g., depth maps or point clouds) with CAD-rendered images and\ncorresponding texts. However, the limited color and texture variations in CAD\nimages can compromise the alignment robustness. Furthermore, the volume\ndiscrepancy between pre-training datasets of the 3D encoder and VLM leads to\nsub-optimal 2D to 3D knowledge transfer. To overcome these issues, we propose\nOpenDlign, a novel framework for learning open-world 3D representations, that\nleverages depth-aligned images generated from point cloud-projected depth maps.\nUnlike CAD-rendered images, our generated images provide rich, realistic color\nand texture diversity while preserving geometric and semantic consistency with\nthe depth maps. OpenDlign also optimizes depth map projection and integrates\ndepth-specific text prompts, improving 2D VLM knowledge adaptation for 3D\nlearning efficient fine-tuning. Experimental results show that OpenDlign\nsignificantly outperforms existing benchmarks in zero-shot and few-shot 3D\ntasks, exceeding prior scores by 8.0% on ModelNet40 and 16.4% on OmniObject3D\nwith just 6 million tuned parameters. Moreover, integrating generated\ndepth-aligned images into existing 3D learning pipelines consistently improves\ntheir performance.\n","authors":["Ye Mao","Junpeng Jing","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2404.16538v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.16536v1","updated":"2024-04-25T11:50:47Z","published":"2024-04-25T11:50:47Z","title":"3D Face Modeling via Weakly-supervised Disentanglement Network joint\n Identity-consistency Prior","summary":" Generative 3D face models featuring disentangled controlling factors hold\nimmense potential for diverse applications in computer vision and computer\ngraphics. However, previous 3D face modeling methods face a challenge as they\ndemand specific labels to effectively disentangle these factors. This becomes\nparticularly problematic when integrating multiple 3D face datasets to improve\nthe generalization of the model. Addressing this issue, this paper introduces a\nWeakly-Supervised Disentanglement Framework, denoted as WSDF, to facilitate the\ntraining of controllable 3D face models without an overly stringent labeling\nrequirement. Adhering to the paradigm of Variational Autoencoders (VAEs), the\nproposed model achieves disentanglement of identity and expression controlling\nfactors through a two-branch encoder equipped with dedicated\nidentity-consistency prior. It then faithfully re-entangles these factors via a\ntensor-based combination mechanism. 
Notably, the introduction of the Neutral\nBank allows precise acquisition of subject-specific information using only\nidentity labels, thereby averting degeneration due to insufficient supervision.\nAdditionally, the framework incorporates a label-free second-order loss\nfunction for the expression factor to regulate deformation space and eliminate\nextraneous information, resulting in enhanced disentanglement. Extensive\nexperiments have been conducted to substantiate the superior performance of\nWSDF. Our code is available at https://github.com/liguohao96/WSDF.\n","authors":["Guohao Li","Hongyu Yang","Di Huang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16529v1","updated":"2024-04-25T11:42:32Z","published":"2024-04-25T11:42:32Z","title":"Vision-based robot manipulation of transparent liquid containers in a\n laboratory setting","summary":" Laboratory processes involving small volumes of solutions and active\ningredients are often performed manually due to challenges in automation, such\nas high initial costs, semi-structured environments and protocol variability.\nIn this work, we develop a flexible and cost-effective approach to address this\ngap by introducing a vision-based system for liquid volume estimation and a\nsimulation-driven pouring method particularly designed for containers with\nsmall openings. We evaluate both components individually, followed by an\napplied real-world integration of cell culture automation using a UR5 robotic\narm. Our work is fully reproducible: we share our code at\n\url{https://github.com/DaniSchober/LabLiquidVision} and the newly introduced\ndataset LabLiquidVolume is available at\nhttps://data.dtu.dk/articles/dataset/LabLiquidVision/25103102.\n","authors":["Daniel Schober","Ronja Güldenring","James Love","Lazaros Nalpantidis"],"pdf_url":"https://arxiv.org/pdf/2404.16529v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.16510v1","updated":"2024-04-25T11:06:57Z","published":"2024-04-25T11:06:57Z","title":"Interactive3D: Create What You Want by Interactive 3D Generation","summary":" 3D object generation has undergone significant advancements, yielding\nhigh-quality results. However, existing methods fall short of achieving precise user control,\noften yielding results that do not align with user expectations, thus limiting\ntheir applicability. User-envisioned 3D object generation faces significant\nchallenges in realizing its concepts using current generative models due to\nlimited interaction capabilities. Existing methods mainly offer two approaches:\n(i) interpreting textual instructions with constrained controllability, or (ii)\nreconstructing 3D objects from 2D images. Both of them limit customization to\nthe confines of the 2D reference and potentially introduce undesirable\nartifacts during the 3D lifting process, restricting the scope for direct and\nversatile 3D modifications. In this work, we introduce Interactive3D, an\ninnovative framework for interactive 3D generation that grants users precise\ncontrol over the generative process through extensive 3D interaction\ncapabilities. Interactive3D is constructed in two cascading stages, utilizing\ndistinct 3D representations. 
The first stage employs Gaussian Splatting for\ndirect user interaction, allowing modifications and guidance of the generative\ndirection at any intermediate step through (i) Adding and Removing components,\n(ii) Deformable and Rigid Dragging, (iii) Geometric Transformations, and (iv)\nSemantic Editing. Subsequently, the Gaussian splats are transformed into\nInstantNGP. We introduce a novel (v) Interactive Hash Refinement module to\nfurther add details and extract the geometry in the second stage. Our\nexperiments demonstrate that Interactive3D markedly improves the\ncontrollability and quality of 3D generation. Our project webpage is available\nat \\url{https://interactive-3d.github.io/}.\n","authors":["Shaocong Dong","Lihe Ding","Zhanpeng Huang","Zibin Wang","Tianfan Xue","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.16510v1.pdf","comment":"project page: https://interactive-3d.github.io/"},{"id":"http://arxiv.org/abs/2404.16507v1","updated":"2024-04-25T11:01:40Z","published":"2024-04-25T11:01:40Z","title":"Semantic-aware Next-Best-View for Multi-DoFs Mobile System in\n Search-and-Acquisition based Visual Perception","summary":" Efficient visual perception using mobile systems is crucial, particularly in\nunknown environments such as search and rescue operations, where swift and\ncomprehensive perception of objects of interest is essential. In such\nreal-world applications, objects of interest are often situated in complex\nenvironments, making the selection of the 'Next Best' view based solely on\nmaximizing visibility gain suboptimal. Semantics, providing a higher-level\ninterpretation of perception, should significantly contribute to the selection\nof the next viewpoint for various perception tasks. In this study, we formulate\na novel information gain that integrates both visibility gain and semantic gain\nin a unified form to select the semantic-aware Next-Best-View. Additionally, we\ndesign an adaptive strategy with termination criterion to support a two-stage\nsearch-and-acquisition manoeuvre on multiple objects of interest aided by a\nmulti-degree-of-freedoms (Multi-DoFs) mobile system. Several semantically\nrelevant reconstruction metrics, including perspective directivity and region\nof interest (ROI)-to-full reconstruction volume ratio, are introduced to\nevaluate the performance of the proposed approach. Simulation experiments\ndemonstrate the advantages of the proposed approach over existing methods,\nachieving improvements of up to 27.13% for the ROI-to-full reconstruction\nvolume ratio and a 0.88234 average perspective directivity. Furthermore, the\nplanned motion trajectory exhibits better perceiving coverage toward the\ntarget.\n","authors":["Xiaotong Yu","Chang-Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.16507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16501v1","updated":"2024-04-25T10:52:08Z","published":"2024-04-25T10:52:08Z","title":"360SFUDA++: Towards Source-free UDA for Panoramic Segmentation by\n Learning Reliable Category Prototypes","summary":" In this paper, we address the challenging source-free unsupervised domain\nadaptation (SFUDA) for pinhole-to-panoramic semantic segmentation, given only a\npinhole image pre-trained model (i.e., source) and unlabeled panoramic images\n(i.e., target). 
Tackling this problem is non-trivial due to three critical\nchallenges: 1) semantic mismatches from the distinct Field-of-View (FoV)\nbetween domains, 2) style discrepancies inherent in the UDA problem, and 3)\ninevitable distortion of the panoramic images. To tackle these problems, we\npropose 360SFUDA++ that effectively extracts knowledge from the source pinhole\nmodel with only unlabeled panoramic images and transfers the reliable knowledge\nto the target panoramic domain. Specifically, we first utilize Tangent\nProjection (TP) as it has less distortion and meanwhile slits the\nequirectangular projection (ERP) to patches with fixed FoV projection (FFP) to\nmimic the pinhole images. Both projections are shown effective in extracting\nknowledge from the source model. However, as the distinct projections make it\nless possible to directly transfer knowledge between domains, we then propose\nReliable Panoramic Prototype Adaptation Module (RP2AM) to transfer knowledge at\nboth prediction and prototype levels. RP$^2$AM selects the confident knowledge\nand integrates panoramic prototypes for reliable knowledge adaptation.\nMoreover, we introduce Cross-projection Dual Attention Module (CDAM), which\nbetter aligns the spatial and channel characteristics across projections at the\nfeature level between domains. Both knowledge extraction and transfer processes\nare synchronously updated to reach the best performance. Extensive experiments\non the synthetic and real-world benchmarks, including outdoor and indoor\nscenarios, demonstrate that our 360SFUDA++ achieves significantly better\nperformance than prior SFUDA methods.\n","authors":["Xu Zheng","Pengyuan Zhou","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16501v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.12505"},{"id":"http://arxiv.org/abs/2404.16493v1","updated":"2024-04-25T10:38:33Z","published":"2024-04-25T10:38:33Z","title":"Commonsense Prototype for Outdoor Unsupervised 3D Object Detection","summary":" The prevalent approaches of unsupervised 3D object detection follow\ncluster-based pseudo-label generation and iterative self-training processes.\nHowever, the challenge arises due to the sparsity of LiDAR scans, which leads\nto pseudo-labels with erroneous size and position, resulting in subpar\ndetection performance. To tackle this problem, this paper introduces a\nCommonsense Prototype-based Detector, termed CPD, for unsupervised 3D object\ndetection. CPD first constructs Commonsense Prototype (CProto) characterized by\nhigh-quality bounding box and dense points, based on commonsense intuition.\nSubsequently, CPD refines the low-quality pseudo-labels by leveraging the size\nprior from CProto. Furthermore, CPD enhances the detection accuracy of sparsely\nscanned objects by the geometric knowledge from CProto. CPD outperforms\nstate-of-the-art unsupervised 3D detectors on Waymo Open Dataset (WOD),\nPandaSet, and KITTI datasets by a large margin. Besides, by training CPD on WOD\nand testing on KITTI, CPD attains 90.85% and 81.01% 3D Average Precision on\neasy and moderate car classes, respectively. These achievements position CPD in\nclose proximity to fully supervised detectors, highlighting the significance of\nour method. 
The code will be available at https://github.com/hailanyi/CPD.\n","authors":["Hai Wu","Shijia Zhao","Xun Huang","Chenglu Wen","Xin Li","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16493v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16012v2","updated":"2024-04-25T10:25:11Z","published":"2024-04-24T17:45:24Z","title":"GaussianTalker: Real-Time High-Fidelity Talking Head Synthesis with\n Audio-Driven 3D Gaussian Splatting","summary":" We propose GaussianTalker, a novel framework for real-time generation of\npose-controllable talking heads. It leverages the fast rendering capabilities\nof 3D Gaussian Splatting (3DGS) while addressing the challenges of directly\ncontrolling 3DGS with speech audio. GaussianTalker constructs a canonical 3DGS\nrepresentation of the head and deforms it in sync with the audio. A key insight\nis to encode the 3D Gaussian attributes into a shared implicit feature\nrepresentation, where it is merged with audio features to manipulate each\nGaussian attribute. This design exploits the spatial-aware features and\nenforces interactions between neighboring points. The feature embeddings are\nthen fed to a spatial-audio attention module, which predicts frame-wise offsets\nfor the attributes of each Gaussian. It is more stable than previous\nconcatenation or multiplication approaches for manipulating the numerous\nGaussians and their intricate parameters. Experimental results showcase\nGaussianTalker's superiority in facial fidelity, lip synchronization accuracy,\nand rendering speed compared to previous methods. Specifically, GaussianTalker\nachieves a remarkable rendering speed up to 120 FPS, surpassing previous\nbenchmarks. Our code is made available at\nhttps://github.com/KU-CVLAB/GaussianTalker/ .\n","authors":["Kyusun Cho","Joungbin Lee","Heeji Yoon","Yeobin Hong","Jaehoon Ko","Sangjun Ahn","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2404.16012v2.pdf","comment":"Project Page: https://ku-cvlab.github.io/GaussianTalker"},{"id":"http://arxiv.org/abs/2404.16484v1","updated":"2024-04-25T10:12:42Z","published":"2024-04-25T10:12:42Z","title":"Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024\n Challenge Survey","summary":" This paper introduces a novel benchmark as part of the AIS 2024 Real-Time\nImage Super-Resolution (RTSR) Challenge, which aims to upscale compressed\nimages from 540p to 4K resolution (4x factor) in real-time on commercial GPUs.\nFor this, we use a diverse test set containing a variety of 4K images ranging\nfrom digital art to gaming and photography. The images are compressed using the\nmodern AVIF codec, instead of JPEG. All the proposed methods improve PSNR\nfidelity over Lanczos interpolation, and process images under 10ms. Out of the\n160 participants, 25 teams submitted their code and models. The solutions\npresent novel designs tailored for memory-efficiency and runtime on edge\ndevices. This survey describes the best solutions for real-time SR of\ncompressed high-resolution images.\n","authors":["Marcos V. 
Conde","Zhijun Lei","Wen Li","Cosmin Stejerean","Ioannis Katsavounidis","Radu Timofte","Kihwan Yoon","Ganzorig Gankhuyag","Jiangtao Lv","Long Sun","Jinshan Pan","Jiangxin Dong","Jinhui Tang","Zhiyuan Li","Hao Wei","Chenyang Ge","Dongyang Zhang","Tianle Liu","Huaian Chen","Yi Jin","Menghan Zhou","Yiqiang Yan","Si Gao","Biao Wu","Shaoli Liu","Chengjian Zheng","Diankai Zhang","Ning Wang","Xintao Qiu","Yuanbo Zhou","Kongxian Wu","Xinwei Dai","Hui Tang","Wei Deng","Qingquan Gao","Tong Tong","Jae-Hyeon Lee","Ui-Jin Choi","Min Yan","Xin Liu","Qian Wang","Xiaoqian Ye","Zhan Du","Tiansen Zhang","Long Peng","Jiaming Guo","Xin Di","Bohao Liao","Zhibo Du","Peize Xia","Renjing Pei","Yang Wang","Yang Cao","Zhengjun Zha","Bingnan Han","Hongyuan Yu","Zhuoyuan Wu","Cheng Wan","Yuqing Liu","Haodong Yu","Jizhe Li","Zhijuan Huang","Yuan Huang","Yajun Zou","Xianyu Guan","Qi Jia","Heng Zhang","Xuanwu Yin","Kunlong Zuo","Hyeon-Cheol Moon","Tae-hyun Jeong","Yoonmo Yang","Jae-Gon Kim","Jinwoo Jeong","Sunjei Kim"],"pdf_url":"https://arxiv.org/pdf/2404.16484v1.pdf","comment":"CVPR 2024, AI for Streaming (AIS) Workshop"},{"id":"http://arxiv.org/abs/2404.16482v1","updated":"2024-04-25T10:10:48Z","published":"2024-04-25T10:10:48Z","title":"CoCoG: Controllable Visual Stimuli Generation based on Human Concept\n Representations","summary":" A central question for cognitive science is to understand how humans process\nvisual objects, i.e, to uncover human low-dimensional concept representation\nspace from high-dimensional visual stimuli. Generating visual stimuli with\ncontrolling concepts is the key. However, there are currently no generative\nmodels in AI to solve this problem. Here, we present the Concept based\nControllable Generation (CoCoG) framework. CoCoG consists of two components, a\nsimple yet efficient AI agent for extracting interpretable concept and\npredicting human decision-making in visual similarity judgment tasks, and a\nconditional generation model for generating visual stimuli given the concepts.\nWe quantify the performance of CoCoG from two aspects, the human behavior\nprediction accuracy and the controllable generation ability. The experiments\nwith CoCoG indicate that 1) the reliable concept embeddings in CoCoG allows to\npredict human behavior with 64.07\\% accuracy in the THINGS-similarity dataset;\n2) CoCoG can generate diverse objects through the control of concepts; 3) CoCoG\ncan manipulate human similarity judgment behavior by intervening key concepts.\nCoCoG offers visual objects with controlling concepts to advance our\nunderstanding of causality in human cognition. The code of CoCoG is available\nat \\url{https://github.com/ncclab-sustech/CoCoG}.\n","authors":["Chen Wei","Jiachen Zou","Dietmar Heinke","Quanying Liu"],"pdf_url":"https://arxiv.org/pdf/2404.16482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16474v1","updated":"2024-04-25T09:57:52Z","published":"2024-04-25T09:57:52Z","title":"DiffSeg: A Segmentation Model for Skin Lesions Based on Diffusion\n Difference","summary":" Weakly supervised medical image segmentation (MIS) using generative models is\ncrucial for clinical diagnosis. However, the accuracy of the segmentation\nresults is often limited by insufficient supervision and the complex nature of\nmedical imaging. Existing models also only provide a single outcome, which does\nnot allow for the measurement of uncertainty. 
In this paper, we introduce\nDiffSeg, a segmentation model for skin lesions based on diffusion difference\nwhich exploits diffusion model principles to ex-tract noise-based features from\nimages with diverse semantic information. By discerning difference between\nthese noise features, the model identifies diseased areas. Moreover, its\nmulti-output capability mimics doctors' annotation behavior, facilitating the\nvisualization of segmentation result consistency and ambiguity. Additionally,\nit quantifies output uncertainty using Generalized Energy Distance (GED),\naiding interpretability and decision-making for physicians. Finally, the model\nintegrates outputs through the Dense Conditional Random Field (DenseCRF)\nalgorithm to refine the segmentation boundaries by considering inter-pixel\ncorrelations, which improves the accuracy and optimizes the segmentation\nresults. We demonstrate the effectiveness of DiffSeg on the ISIC 2018 Challenge\ndataset, outperforming state-of-the-art U-Net-based methods.\n","authors":["Zhihao Shuai","Yinan Chen","Shunqiang Mao","Yihan Zho","Xiaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16471v1","updated":"2024-04-25T09:55:35Z","published":"2024-04-25T09:55:35Z","title":"COBRA -- COnfidence score Based on shape Regression Analysis for\n method-independent quality assessment of object pose estimation from single\n images","summary":" We present a generic algorithm for scoring pose estimation methods that rely\non single image semantic analysis. The algorithm employs a lightweight putative\nshape representation using a combination of multiple Gaussian Processes. Each\nGaussian Process (GP) yields distance normal distributions from multiple\nreference points in the object's coordinate system to its surface, thus\nproviding a geometric evaluation framework for scoring predicted poses. Our\nconfidence measure comprises the average mixture probability of pixel\nback-projections onto the shape template. In the reported experiments, we\ncompare the accuracy of our GP based representation of objects versus the\nactual geometric models and demonstrate the ability of our method to capture\nthe influence of outliers as opposed to the corresponding intrinsic measures\nthat ship with the segmentation and pose estimation methods.\n","authors":["Panagiotis Sapoutzoglou","Georgios Giapitzakis Tzintanos","George Terzakis","Maria Pateraki"],"pdf_url":"https://arxiv.org/pdf/2404.16471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00994v3","updated":"2024-04-25T09:53:33Z","published":"2023-08-02T07:59:25Z","title":"SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems","summary":" Data imbalance in training data often leads to biased predictions from\ntrained models, which in turn causes ethical and social issues. A\nstraightforward solution is to carefully curate training data, but given the\nenormous scale of modern neural networks, this is prohibitively labor-intensive\nand thus impractical. Inspired by recent developments in generative models,\nthis paper explores the potential of synthetic data to address the data\nimbalance problem. To be specific, our method, dubbed SYNAuG, leverages\nsynthetic data to equalize the unbalanced distribution of training data. 
Our\nexperiments demonstrate that, although a domain gap between real and synthetic\ndata exists, training with SYNAuG followed by fine-tuning with a few real\nsamples allows to achieve impressive performance on diverse tasks with\ndifferent data imbalance issues, surpassing existing task-specific methods for\nthe same purpose.\n","authors":["Moon Ye-Bin","Nam Hyeon-Woo","Wonseok Choi","Nayeong Kim","Suha Kwak","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.00994v3.pdf","comment":"The paper is under consideration at Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2404.16456v1","updated":"2024-04-25T09:35:09Z","published":"2024-04-25T09:35:09Z","title":"Correlation-Decoupled Knowledge Distillation for Multimodal Sentiment\n Analysis with Incomplete Modalities","summary":" Multimodal sentiment analysis (MSA) aims to understand human sentiment\nthrough multimodal data. Most MSA efforts are based on the assumption of\nmodality completeness. However, in real-world applications, some practical\nfactors cause uncertain modality missingness, which drastically degrades the\nmodel's performance. To this end, we propose a Correlation-decoupled Knowledge\nDistillation (CorrKD) framework for the MSA task under uncertain missing\nmodalities. Specifically, we present a sample-level contrastive distillation\nmechanism that transfers comprehensive knowledge containing cross-sample\ncorrelations to reconstruct missing semantics. Moreover, a category-guided\nprototype distillation mechanism is introduced to capture cross-category\ncorrelations using category prototypes to align feature distributions and\ngenerate favorable joint representations. Eventually, we design a\nresponse-disentangled consistency distillation strategy to optimize the\nsentiment decision boundaries of the student network through response\ndisentanglement and mutual information maximization. Comprehensive experiments\non three datasets indicate that our framework can achieve favorable\nimprovements compared with several baselines.\n","authors":["Mingcheng Li","Dingkang Yang","Xiao Zhao","Shuaibing Wang","Yan Wang","Kun Yang","Mingyang Sun","Dongliang Kou","Ziyun Qian","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16452v1","updated":"2024-04-25T09:32:34Z","published":"2024-04-25T09:32:34Z","title":"PAD: Patch-Agnostic Defense against Adversarial Patch Attacks","summary":" Adversarial patch attacks present a significant threat to real-world object\ndetectors due to their practical feasibility. Existing defense methods, which\nrely on attack data or prior knowledge, struggle to effectively address a wide\nrange of adversarial patches. In this paper, we show two inherent\ncharacteristics of adversarial patches, semantic independence and spatial\nheterogeneity, independent of their appearance, shape, size, quantity, and\nlocation. Semantic independence indicates that adversarial patches operate\nautonomously within their semantic context, while spatial heterogeneity\nmanifests as distinct image quality of the patch area that differs from\noriginal clean image due to the independent generation process. Based on these\nobservations, we propose PAD, a novel adversarial patch localization and\nremoval method that does not require prior knowledge or additional training.\nPAD offers patch-agnostic defense against various adversarial patches,\ncompatible with any pre-trained object detectors. 
Our comprehensive digital and\nphysical experiments involving diverse patch types, such as localized noise,\nprintable, and naturalistic patches, exhibit notable improvements over\nstate-of-the-art works. Our code is available at\nhttps://github.com/Lihua-Jing/PAD.\n","authors":["Lihua Jing","Rui Wang","Wenqi Ren","Xin Dong","Cong Zou"],"pdf_url":"https://arxiv.org/pdf/2404.16452v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16451v1","updated":"2024-04-25T09:30:38Z","published":"2024-04-25T09:30:38Z","title":"Latent Modulated Function for Computational Optimal Continuous Image\n Representation","summary":" The recent work Local Implicit Image Function (LIIF) and subsequent Implicit\nNeural Representation (INR) based works have achieved remarkable success in\nArbitrary-Scale Super-Resolution (ASSR) by using MLP to decode Low-Resolution\n(LR) features. However, these continuous image representations typically\nimplement decoding in High-Resolution (HR) High-Dimensional (HD) space, leading\nto a quadratic increase in computational cost and seriously hindering the\npractical applications of ASSR. To tackle this problem, we propose a novel\nLatent Modulated Function (LMF), which decouples the HR-HD decoding process\ninto shared latent decoding in LR-HD space and independent rendering in HR\nLow-Dimensional (LD) space, thereby realizing the first computational optimal\nparadigm of continuous image representation. Specifically, LMF utilizes an HD\nMLP in latent space to generate latent modulations of each LR feature vector.\nThis enables a modulated LD MLP in render space to quickly adapt to any input\nfeature vector and perform rendering at arbitrary resolution. Furthermore, we\nleverage the positive correlation between modulation intensity and input image\ncomplexity to design a Controllable Multi-Scale Rendering (CMSR) algorithm,\noffering the flexibility to adjust the decoding efficiency based on the\nrendering precision. Extensive experiments demonstrate that converting existing\nINR-based ASSR methods to LMF can reduce the computational cost by up to 99.9%,\naccelerate inference by up to 57 times, and save up to 76% of parameters, while\nmaintaining competitive performance. The code is available at\nhttps://github.com/HeZongyao/LMF.\n","authors":["Zongyao He","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2404.16451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14199v2","updated":"2024-04-25T09:25:24Z","published":"2023-11-23T20:52:44Z","title":"A Systematic Review of Deep Learning-based Research on Radiology Report\n Generation","summary":" Radiology report generation (RRG) aims to automatically generate free-text\ndescriptions from clinical radiographs, e.g., chest X-Ray images. RRG plays an\nessential role in promoting clinical automation and presents significant help\nto provide practical assistance for inexperienced doctors and alleviate\nradiologists' workloads. Therefore, consider these meaningful potentials,\nresearch on RRG is experiencing explosive growth in the past half-decade,\nespecially with the rapid development of deep learning approaches. Existing\nstudies perform RRG from the perspective of enhancing different modalities,\nprovide insights on optimizing the report generation process with elaborated\nfeatures from both visual and textual information, and further facilitate RRG\nwith the cross-modal interactions among them. 
In this paper, we present a\ncomprehensive review of deep learning-based RRG from various perspectives.\nSpecifically, we firstly cover pivotal RRG approaches based on the\ntask-specific features of radiographs, reports, and the cross-modal relations\nbetween them, and then illustrate the benchmark datasets conventionally used\nfor this task with evaluation metrics, subsequently analyze the performance of\ndifferent approaches and finally offer our summary on the challenges and the\ntrends in future directions. Overall, the goal of this paper is to serve as a\ntool for understanding existing literature and inspiring potential valuable\nresearch in the field of RRG.\n","authors":["Chang Liu","Yuanhe Tian","Yan Song"],"pdf_url":"https://arxiv.org/pdf/2311.14199v2.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.16432v1","updated":"2024-04-25T09:07:19Z","published":"2024-04-25T09:07:19Z","title":"Point-JEPA: A Joint Embedding Predictive Architecture for\n Self-Supervised Learning on Point Cloud","summary":" Recent advancements in self-supervised learning in the point cloud domain\nhave demonstrated significant potential. However, these methods often suffer\nfrom drawbacks, including lengthy pre-training time, the necessity of\nreconstruction in the input space, or the necessity of additional modalities.\nIn order to address these issues, we introduce Point-JEPA, a joint embedding\npredictive architecture designed specifically for point cloud data. To this\nend, we introduce a sequencer that orders point cloud tokens to efficiently\ncompute and utilize tokens proximity based on their indices during target and\ncontext selection. The sequencer also allows shared computations of the tokens\nproximity between context and target selection, further improving the\nefficiency. Experimentally, our method achieves competitive results with\nstate-of-the-art methods while avoiding the reconstruction in the input space\nor additional modality.\n","authors":["Ayumu Saito","Jiju Poovvancheri"],"pdf_url":"https://arxiv.org/pdf/2404.16432v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.16429v1","updated":"2024-04-25T09:02:11Z","published":"2024-04-25T09:02:11Z","title":"Depth Supervised Neural Surface Reconstruction from Airborne Imagery","summary":" While originally developed for novel view synthesis, Neural Radiance Fields\n(NeRFs) have recently emerged as an alternative to multi-view stereo (MVS).\nTriggered by a manifold of research activities, promising results have been\ngained especially for texture-less, transparent, and reflecting surfaces, while\nsuch scenarios remain challenging for traditional MVS-based approaches.\nHowever, most of these investigations focus on close-range scenarios, with\nstudies for airborne scenarios still missing. For this task, NeRFs face\npotential difficulties at areas of low image redundancy and weak data evidence,\nas often found in street canyons, facades or building shadows. Furthermore,\ntraining such networks is computationally expensive. Thus, the aim of our work\nis twofold: First, we investigate the applicability of NeRFs for aerial image\nblocks representing different characteristics like nadir-only, oblique and\nhigh-resolution imagery. Second, during these investigations we demonstrate the\nbenefit of integrating depth priors from tie-point measures, which are provided\nduring presupposed Bundle Block Adjustment. 
Our work is based on the\nstate-of-the-art framework VolSDF, which models 3D scenes by signed distance\nfunctions (SDFs), since this is more applicable for surface reconstruction\ncompared to the standard volumetric representation in vanilla NeRFs. For\nevaluation, the NeRF-based reconstructions are compared to results of a\npublicly available benchmark dataset for airborne images.\n","authors":["Vincent Hackstein","Paul Fauth-Mayer","Matthias Rothermel","Norbert Haala"],"pdf_url":"https://arxiv.org/pdf/2404.16429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07592v2","updated":"2024-04-25T08:56:49Z","published":"2024-03-12T12:25:38Z","title":"Accurate Spatial Gene Expression Prediction by integrating\n Multi-resolution features","summary":" Recent advancements in Spatial Transcriptomics (ST) technology have\nfacilitated detailed gene expression analysis within tissue contexts. However,\nthe high costs and methodological limitations of ST necessitate a more robust\npredictive model. In response, this paper introduces TRIPLEX, a novel deep\nlearning framework designed to predict spatial gene expression from Whole Slide\nImages (WSIs). TRIPLEX uniquely harnesses multi-resolution features, capturing\ncellular morphology at individual spots, the local context around these spots,\nand the global tissue organization. By integrating these features through an\neffective fusion strategy, TRIPLEX achieves accurate gene expression\nprediction. Our comprehensive benchmark study, conducted on three public ST\ndatasets and supplemented with Visium data from 10X Genomics, demonstrates that\nTRIPLEX outperforms current state-of-the-art models in Mean Squared Error\n(MSE), Mean Absolute Error (MAE), and Pearson Correlation Coefficient (PCC).\nThe model's predictions align closely with ground truth gene expression\nprofiles and tumor annotations, underscoring TRIPLEX's potential in advancing\ncancer diagnosis and treatment.\n","authors":["Youngmin Chung","Ji Hun Ha","Kyeong Chan Im","Joo Sang Lee"],"pdf_url":"https://arxiv.org/pdf/2403.07592v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2309.02961v2","updated":"2024-04-25T08:54:21Z","published":"2023-09-06T12:57:00Z","title":"LuViRA Dataset Validation and Discussion: Comparing Vision, Radio, and\n Audio Sensors for Indoor Localization","summary":" We present a unique comparative analysis, and evaluation of vision, radio,\nand audio based localization algorithms. We create the first baseline for the\naforementioned sensors using the recently published Lund University Vision,\nRadio, and Audio (LuViRA) dataset, where all the sensors are synchronized and\nmeasured in the same environment. Some of the challenges of using each specific\nsensor for indoor localization tasks are highlighted. Each sensor is paired\nwith a current state-of-the-art localization algorithm and evaluated for\ndifferent aspects: localization accuracy, reliability and sensitivity to\nenvironment changes, calibration requirements, and potential system complexity.\nSpecifically, the evaluation covers the ORB-SLAM3 algorithm for vision-based\nlocalization with an RGB-D camera, a machine-learning algorithm for radio-based\nlocalization with massive MIMO technology, and the SFS2 algorithm for\naudio-based localization with distributed microphones. 
The results can serve as\na guideline and basis for further development of robust and high-precision\nmulti-sensory localization systems, e.g., through sensor fusion, context, and\nenvironment-aware adaptation.\n","authors":["Ilayda Yaman","Guoda Tian","Erik Tegler","Jens Gulin","Nikhil Challa","Fredrik Tufvesson","Ove Edfors","Kalle Astrom","Steffen Malkowsky","Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02961v2.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.16423v1","updated":"2024-04-25T08:53:23Z","published":"2024-04-25T08:53:23Z","title":"Neural Assembler: Learning to Generate Fine-Grained Robotic Assembly\n Instructions from Multi-View Images","summary":" Image-guided object assembly represents a burgeoning research topic in\ncomputer vision. This paper introduces a novel task: translating multi-view\nimages of a structural 3D model (for example, one constructed with building\nblocks drawn from a 3D-object library) into a detailed sequence of assembly\ninstructions executable by a robotic arm. Fed with multi-view images of the\ntarget 3D model for replication, the model designed for this task must address\nseveral sub-tasks, including recognizing individual components used in\nconstructing the 3D model, estimating the geometric pose of each component, and\ndeducing a feasible assembly order adhering to physical rules. Establishing\naccurate 2D-3D correspondence between multi-view images and 3D objects is\ntechnically challenging. To tackle this, we propose an end-to-end model known\nas the Neural Assembler. This model learns an object graph where each vertex\nrepresents recognized components from the images, and the edges specify the\ntopology of the 3D model, enabling the derivation of an assembly plan. We\nestablish benchmarks for this task and conduct comprehensive empirical\nevaluations of Neural Assembler and alternative solutions. Our experiments\nclearly demonstrate the superiority of Neural Assembler.\n","authors":["Hongyu Yan","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2404.16423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16422v1","updated":"2024-04-25T08:52:25Z","published":"2024-04-25T08:52:25Z","title":"Robust Fine-tuning for Pre-trained 3D Point Cloud Models","summary":" This paper presents a robust fine-tuning method designed for pre-trained 3D\npoint cloud models, to enhance feature robustness in downstream fine-tuned\nmodels. We highlight the limitations of current fine-tuning methods and the\nchallenges of learning robust models. The proposed method, named Weight-Space\nEnsembles for Fine-Tuning then Linear Probing (WiSE-FT-LP), integrates the\noriginal pre-training and fine-tuning models through weight space integration\nfollowed by Linear Probing. This approach significantly enhances the\nperformance of downstream fine-tuned models under distribution shifts,\nimproving feature robustness while maintaining high performance on the target\ndistribution. We apply this robust fine-tuning method to mainstream 3D point\ncloud pre-trained models and evaluate the quality of model parameters and the\ndegradation of downstream task performance. 
Experimental results demonstrate\nthe effectiveness of WiSE-FT-LP in enhancing model robustness, effectively\nbalancing downstream task performance and model feature robustness without\naltering the model structures.\n","authors":["Zhibo Zhang","Ximing Yang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2404.16422v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.16421v1","updated":"2024-04-25T08:51:59Z","published":"2024-04-25T08:51:59Z","title":"SynCellFactory: Generative Data Augmentation for Cell Tracking","summary":" Cell tracking remains a pivotal yet challenging task in biomedical research.\nThe full potential of deep learning for this purpose is often untapped due to\nthe limited availability of comprehensive and varied training data sets. In\nthis paper, we present SynCellFactory, a generative cell video augmentation. At\nthe heart of SynCellFactory lies the ControlNet architecture, which has been\nfine-tuned to synthesize cell imagery with photorealistic accuracy in style and\nmotion patterns. This technique enables the creation of synthetic yet realistic\ncell videos that mirror the complexity of authentic microscopy time-lapses. Our\nexperiments demonstrate that SynCellFactory boosts the performance of\nwell-established deep learning models for cell tracking, particularly when\noriginal training data is sparse.\n","authors":["Moritz Sturm","Lorenzo Cerrone","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.16421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16416v1","updated":"2024-04-25T08:49:08Z","published":"2024-04-25T08:49:08Z","title":"Learning Discriminative Spatio-temporal Representations for\n Semi-supervised Action Recognition","summary":" Semi-supervised action recognition aims to improve spatio-temporal reasoning\nability with a few labeled data in conjunction with a large amount of unlabeled\ndata. Albeit recent advancements, existing powerful methods are still prone to\nmaking ambiguous predictions under scarce labeled data, embodied as the\nlimitation of distinguishing different actions with similar spatio-temporal\ninformation. In this paper, we approach this problem by empowering the model\ntwo aspects of capability, namely discriminative spatial modeling and temporal\nstructure modeling for learning discriminative spatio-temporal representations.\nSpecifically, we propose an Adaptive Contrastive Learning~(ACL) strategy. It\nassesses the confidence of all unlabeled samples by the class prototypes of the\nlabeled data, and adaptively selects positive-negative samples from a\npseudo-labeled sample bank to construct contrastive learning. Additionally, we\nintroduce a Multi-scale Temporal Learning~(MTL) strategy. It could highlight\ninformative semantics from long-term clips and integrate them into the\nshort-term clip while suppressing noisy information. Afterwards, both of these\ntwo new techniques are integrated in a unified framework to encourage the model\nto make accurate predictions. 
Extensive experiments on UCF101, HMDB51 and\nKinetics400 show the superiority of our method over prior state-of-the-art\napproaches.\n","authors":["Yu Wang","Sanping Zhou","Kun Xia","Le Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16416v1.pdf","comment":"10 pages, 6 figures, 6 tables, 56 conferences"},{"id":"http://arxiv.org/abs/2404.16409v1","updated":"2024-04-25T08:36:09Z","published":"2024-04-25T08:36:09Z","title":"Cross-sensor super-resolution of irregularly sampled Sentinel-2 time\n series","summary":" Satellite imaging generally presents a trade-off between the frequency of\nacquisitions and the spatial resolution of the images. Super-resolution is\noften advanced as a way to get the best of both worlds. In this work, we\ninvestigate multi-image super-resolution of satellite image time series, i.e.\nhow multiple images of the same area acquired at different dates can help\nreconstruct a higher resolution observation. In particular, we extend\nstate-of-the-art deep single and multi-image super-resolution algorithms, such\nas SRDiff and HighRes-net, to deal with irregularly sampled Sentinel-2 time\nseries. We introduce BreizhSR, a new dataset for 4x super-resolution of\nSentinel-2 time series using very high-resolution SPOT-6 imagery of Brittany, a\nFrench region. We show that using multiple images significantly improves\nsuper-resolution performance, and that a well-designed temporal positional\nencoding allows us to perform super-resolution for different times of the\nseries. In addition, we observe a trade-off between spectral fidelity and\nperceptual quality of the reconstructed HR images, questioning future\ndirections for super-resolution of Earth Observation data.\n","authors":["Aimi Okabayashi","Nicolas Audebert","Simon Donike","Charlotte Pelletier"],"pdf_url":"https://arxiv.org/pdf/2404.16409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08903v3","updated":"2024-04-25T08:31:00Z","published":"2024-01-17T01:10:17Z","title":"Rethinking Impersonation and Dodging Attacks on Face Recognition Systems","summary":" Face Recognition (FR) systems can be easily deceived by adversarial examples\nthat manipulate benign face images through imperceptible perturbations.\nAdversarial attacks on FR encompass two types: impersonation (targeted) attacks\nand dodging (untargeted) attacks. Previous methods often achieve a successful\nimpersonation attack on FR; However, it does not necessarily guarantee a\nsuccessful dodging attack on FR in the black-box setting. In this paper, our\nkey insight is that the generation of adversarial examples should perform both\nimpersonation and dodging attacks simultaneously. To this end, we propose a\nnovel attack method termed as Adversarial Pruning (Adv-Pruning), to fine-tune\nexisting adversarial examples to enhance their dodging capabilities while\npreserving their impersonation capabilities. Adv-Pruning consists of Priming,\nPruning, and Restoration stages. Concretely, we propose Adversarial Priority\nQuantification to measure the region-wise priority of original adversarial\nperturbations, identifying and releasing those with minimal impact on absolute\nmodel output variances. Then, Biased Gradient Adaptation is presented to adapt\nthe adversarial examples to traverse the decision boundaries of both the\nattacker and victim by adding perturbations favoring dodging attacks on the\nvacated regions, preserving the prioritized features of the original\nperturbations while boosting dodging performance. 
As a result, we can maintain\nthe impersonation capabilities of original adversarial examples while\neffectively enhancing dodging capabilities. Comprehensive experiments\ndemonstrate the superiority of our method compared with state-of-the-art\nadversarial attacks.\n","authors":["Fengfan Zhou","Qianyu Zhou","Bangjie Yin","Hui Zheng","Xuequan Lu","Lizhuang Ma","Hefei Ling"],"pdf_url":"https://arxiv.org/pdf/2401.08903v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15719v2","updated":"2024-04-25T08:27:34Z","published":"2024-04-24T08:11:50Z","title":"HDBN: A Novel Hybrid Dual-branch Network for Robust Skeleton-based\n Action Recognition","summary":" Skeleton-based action recognition has gained considerable traction thanks to\nits utilization of succinct and robust skeletal representations. Nonetheless,\ncurrent methodologies often lean towards utilizing a solitary backbone to model\nskeleton modality, which can be limited by inherent flaws in the network\nbackbone. To address this and fully leverage the complementary characteristics\nof various network architectures, we propose a novel Hybrid Dual-Branch Network\n(HDBN) for robust skeleton-based action recognition, which benefits from the\ngraph convolutional network's proficiency in handling graph-structured data and\nthe powerful modeling capabilities of Transformers for global information. In\ndetail, our proposed HDBN is divided into two trunk branches: MixGCN and\nMixFormer. The two branches utilize GCNs and Transformers to model both 2D and\n3D skeletal modalities respectively. Our proposed HDBN emerged as one of the\ntop solutions in the Multi-Modal Video Reasoning and Analyzing Competition\n(MMVRAC) of 2024 ICME Grand Challenge, achieving accuracies of 47.95% and\n75.36% on two benchmarks of the UAV-Human dataset by outperforming most\nexisting methods. Our code will be publicly available at:\nhttps://github.com/liujf69/ICMEW2024-Track10.\n","authors":["Jinfu Liu","Baiqiao Yin","Jiaying Lin","Jiajun Wen","Yue Li","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.15719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16398v1","updated":"2024-04-25T08:18:18Z","published":"2024-04-25T08:18:18Z","title":"Revisiting Relevance Feedback for CLIP-based Interactive Image Retrieval","summary":" Many image retrieval studies use metric learning to train an image encoder.\nHowever, metric learning cannot handle differences in users' preferences, and\nrequires data to train an image encoder. To overcome these limitations, we\nrevisit relevance feedback, a classic technique for interactive retrieval\nsystems, and propose an interactive CLIP-based image retrieval system with\nrelevance feedback. Our retrieval system first executes the retrieval, collects\neach user's unique preferences through binary feedback, and returns images the\nuser prefers. Even when users have various preferences, our retrieval system\nlearns each user's preference through the feedback and adapts to the\npreference. Moreover, our retrieval system leverages CLIP's zero-shot\ntransferability and achieves high accuracy without training. We empirically\nshow that our retrieval system competes well with state-of-the-art metric\nlearning in category-based image retrieval, despite not training image encoders\nspecifically for each dataset. Furthermore, we set up two additional\nexperimental settings where users have various preferences: one-label-based\nimage retrieval and conditioned image retrieval. 
In both cases, our retrieval\nsystem effectively adapts to each user's preferences, resulting in improved\naccuracy compared to image retrieval without feedback. Overall, our work\nhighlights the potential benefits of integrating CLIP with classic relevance\nfeedback techniques to enhance image retrieval.\n","authors":["Ryoya Nara","Yu-Chieh Lin","Yuji Nozawa","Youyang Ng","Goh Itoh","Osamu Torii","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.16398v1.pdf","comment":"20 pages, 8 sugures"},{"id":"http://arxiv.org/abs/2402.07635v2","updated":"2024-04-25T08:15:56Z","published":"2024-02-12T13:19:08Z","title":"Collaborative Semantic Occupancy Prediction with Hybrid Feature Fusion\n in Connected Automated Vehicles","summary":" Collaborative perception in automated vehicles leverages the exchange of\ninformation between agents, aiming to elevate perception results. Previous\ncamera-based collaborative 3D perception methods typically employ 3D bounding\nboxes or bird's eye views as representations of the environment. However, these\napproaches fall short in offering a comprehensive 3D environmental prediction.\nTo bridge this gap, we introduce the first method for collaborative 3D semantic\noccupancy prediction. Particularly, it improves local 3D semantic occupancy\npredictions by hybrid fusion of (i) semantic and occupancy task features, and\n(ii) compressed orthogonal attention features shared between vehicles.\nAdditionally, due to the lack of a collaborative perception dataset designed\nfor semantic occupancy prediction, we augment a current collaborative\nperception dataset to include 3D collaborative semantic occupancy labels for a\nmore robust evaluation. The experimental findings highlight that: (i) our\ncollaborative semantic occupancy predictions excel above the results from\nsingle vehicles by over 30%, and (ii) models anchored on semantic occupancy\noutpace state-of-the-art collaborative 3D detection techniques in subsequent\nperception applications, showcasing enhanced accuracy and enriched\nsemantic-awareness in road environments.\n","authors":["Rui Song","Chenwei Liang","Hu Cao","Zhiran Yan","Walter Zimmer","Markus Gross","Andreas Festag","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2402.07635v2.pdf","comment":"Accepted by CVPR2024. Website link:\n https://rruisong.github.io/publications/CoHFF"},{"id":"http://arxiv.org/abs/2404.16397v1","updated":"2024-04-25T08:15:37Z","published":"2024-04-25T08:15:37Z","title":"Deep Learning-based Prediction of Breast Cancer Tumor and Immune\n Phenotypes from Histopathology","summary":" The interactions between tumor cells and the tumor microenvironment (TME)\ndictate therapeutic efficacy of radiation and many systemic therapies in breast\ncancer. However, to date, there is not a widely available method to\nreproducibly measure tumor and immune phenotypes for each patient's tumor.\nGiven this unmet clinical need, we applied multiple instance learning (MIL)\nalgorithms to assess activity of ten biologically relevant pathways from the\nhematoxylin and eosin (H&E) slide of primary breast tumors. We employed\ndifferent feature extraction approaches and state-of-the-art model\narchitectures. Using binary classification, our models attained area under the\nreceiver operating characteristic (AUROC) scores above 0.70 for nearly all gene\nexpression pathways and on some cases, exceeded 0.80. Attention maps suggest\nthat our trained models recognize biologically relevant spatial patterns of\ncell sub-populations from H&E. 
These efforts represent a first step towards\ndeveloping computational H&E biomarkers that reflect facets of the TME and hold\npromise for augmenting precision oncology.\n","authors":["Tiago Gonçalves","Dagoberto Pulido-Arias","Julian Willett","Katharina V. Hoebel","Mason Cleveland","Syed Rakin Ahmed","Elizabeth Gerstner","Jayashree Kalpathy-Cramer","Jaime S. Cardoso","Christopher P. Bridge","Albert E. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.16397v1.pdf","comment":"Paper accepted at the First Workshop on Imageomics\n (Imageomics-AAAI-24) - Discovering Biological Knowledge from Images using AI\n (https://sites.google.com/vt.edu/imageomics-aaai-24/home), held as part of\n the 38th Annual AAAI Conference on Artificial Intelligence\n (https://aaai.org/aaai-conference/)"},{"id":"http://arxiv.org/abs/2404.16386v1","updated":"2024-04-25T07:55:47Z","published":"2024-04-25T07:55:47Z","title":"Promoting CNNs with Cross-Architecture Knowledge Distillation for\n Efficient Monocular Depth Estimation","summary":" Recently, the performance of monocular depth estimation (MDE) has been\nsignificantly boosted with the integration of transformer models. However, the\ntransformer models are usually computationally-expensive, and their\neffectiveness in light-weight models are limited compared to convolutions. This\nlimitation hinders their deployment on resource-limited devices. In this paper,\nwe propose a cross-architecture knowledge distillation method for MDE, dubbed\nDisDepth, to enhance efficient CNN models with the supervision of\nstate-of-the-art transformer models. Concretely, we first build a simple\nframework of convolution-based MDE, which is then enhanced with a novel\nlocal-global convolution module to capture both local and global information in\nthe image. To effectively distill valuable information from the transformer\nteacher and bridge the gap between convolution and transformer features, we\nintroduce a method to acclimate the teacher with a ghost decoder. The ghost\ndecoder is a copy of the student's decoder, and adapting the teacher with the\nghost decoder aligns the features to be student-friendly while preserving their\noriginal performance. Furthermore, we propose an attentive knowledge\ndistillation loss that adaptively identifies features valuable for depth\nestimation. This loss guides the student to focus more on attentive regions,\nimproving its performance. Extensive experiments on KITTI and NYU Depth V2\ndatasets demonstrate the effectiveness of DisDepth. Our method achieves\nsignificant improvements on various efficient backbones, showcasing its\npotential for efficient monocular depth estimation.\n","authors":["Zhimeng Zheng","Tao Huang","Gongsheng Li","Zuyi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16385v1","updated":"2024-04-25T07:51:26Z","published":"2024-04-25T07:51:26Z","title":"Efficiency in Focus: LayerNorm as a Catalyst for Fine-tuning Medical\n Visual Language Pre-trained Models","summary":" In the realm of Medical Visual Language Models (Med-VLMs), the quest for\nuniversal efficient fine-tuning mechanisms remains paramount, especially given\nresearchers in interdisciplinary fields are often extremely short of training\nresources, yet largely unexplored. Given the unique challenges in the medical\ndomain, such as limited data scope and significant domain-specific\nrequirements, evaluating and adapting Parameter-Efficient Fine-Tuning (PEFT)\nmethods specifically for Med-VLMs is essential. 
Most of the current PEFT\nmethods on Med-VLMs have yet to be comprehensively investigated but mainly\nfocus on adding some components to the model's structure or input. However,\nfine-tuning intrinsic model components often yields better generality and\nconsistency, and its impact on the ultimate performance of Med-VLMs has been\nwidely overlooked and remains understudied. In this paper, we endeavour to\nexplore an alternative to traditional PEFT methods, especially the impact of\nfine-tuning LayerNorm layers, FFNs and Attention layers on the Med-VLMs. Our\ncomprehensive studies span both small-scale and large-scale Med-VLMs,\nevaluating their performance under various fine-tuning paradigms across tasks\nsuch as Medical Visual Question Answering and Medical Imaging Report\nGeneration. The findings reveal unique insights into the effects of intrinsic\nparameter fine-tuning methods on fine-tuning Med-VLMs to downstream tasks and\nexpose fine-tuning solely the LayerNorm layers not only surpasses the\nefficiency of traditional PEFT methods but also retains the model's accuracy\nand generalization capabilities across a spectrum of medical downstream tasks.\nThe experiments show LayerNorm fine-tuning's superior adaptability and\nscalability, particularly in the context of large-scale Med-VLMs.\n","authors":["Jiawei Chen","Dingkang Yang","Yue Jiang","Mingcheng Li","Jinjie Wei","Xiaolu Hou","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16380v1","updated":"2024-04-25T07:42:48Z","published":"2024-04-25T07:42:48Z","title":"Efficient Higher-order Convolution for Small Kernels in Deep Learning","summary":" Deep convolutional neural networks (DCNNs) are a class of artificial neural\nnetworks, primarily for computer vision tasks such as segmentation and\nclassification. Many nonlinear operations, such as activation functions and\npooling strategies, are used in DCNNs to enhance their ability to process\ndifferent signals with different tasks. Conceptional convolution, a linear\nfilter, is the essential component of DCNNs while nonlinear convolution is\ngenerally implemented as higher-order Volterra filters, However, for Volterra\nfiltering, significant memory and computational costs pose a primary limitation\nfor its widespread application in DCNN applications. In this study, we propose\na novel method to perform higher-order Volterra filtering with lower memory and\ncomputation cost in forward and backward pass in DCNN training. The proposed\nmethod demonstrates computational advantages compared with conventional\nVolterra filter implementation. Furthermore, based on the proposed method, a\nnew attention module called Higher-order Local Attention Block (HLA) is\nproposed and tested on CIFAR-100 dataset, which shows competitive improvement\nfor classification task. Source code is available at:\nhttps://github.com/WinterWen666/Efficient-High-Order-Volterra-Convolution.git\n","authors":["Zuocheng Wen","Lingzhong Guo"],"pdf_url":"https://arxiv.org/pdf/2404.16380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14885v2","updated":"2024-04-25T07:38:25Z","published":"2024-04-23T10:13:31Z","title":"Domain adaptive pose estimation via multi-level alignment","summary":" Domain adaptive pose estimation aims to enable deep models trained on source\ndomain (synthesized) datasets produce similar results on the target domain\n(real-world) datasets. 
The existing methods have made significant progress by\nconducting image-level or feature-level alignment. However, only aligning at a\nsingle level is not sufficient to fully bridge the domain gap and achieve\nexcellent domain adaptive results. In this paper, we propose a multi-level\ndomain adaptation approach, which aligns different domains at the image,\nfeature, and pose levels. Specifically, we first utilize image style transfer to\nensure that images from the source and target domains have a similar\ndistribution. Subsequently, at the feature level, we employ adversarial\ntraining to make the features from the source and target domains preserve\ndomain-invariant characteristics as much as possible. Finally, at the pose\nlevel, a self-supervised approach is utilized to enable the model to learn\ndiverse knowledge, implicitly addressing the domain gap. Experimental results\ndemonstrate that significant improvement can be achieved by the proposed\nmulti-level alignment method in pose estimation, which outperforms previous\nstate-of-the-art in human pose by up to 2.4% and animal pose estimation by up\nto 3.1% for dogs and 1.4% for sheep.\n","authors":["Yugan Chen","Lin Zhao","Yalong Xu","Honglei Zu","Xiaoqi An","Guangyu Li"],"pdf_url":"https://arxiv.org/pdf/2404.14885v2.pdf","comment":"accepted to icme2024"},{"id":"http://arxiv.org/abs/2404.00680v2","updated":"2024-04-25T07:35:16Z","published":"2024-03-31T13:12:41Z","title":"Learning to Rank Patches for Unbiased Image Redundancy Reduction","summary":" Images suffer from heavy spatial redundancy because pixels in neighboring\nregions are spatially correlated. Existing approaches strive to overcome this\nlimitation by reducing less meaningful image regions. However, current leading\nmethods rely on supervisory signals. They may compel models to preserve content\nthat aligns with labeled categories and discard content belonging to unlabeled\ncategories. This categorical inductive bias makes these methods less effective\nin real-world scenarios. To address this issue, we propose a self-supervised\nframework for image redundancy reduction called Learning to Rank Patches\n(LTRP). We observe that image reconstruction of masked image modeling models is\nsensitive to the removal of visible patches when the masking ratio is high\n(e.g., 90\\%). Building upon it, we implement LTRP via two steps: inferring the\nsemantic density score of each patch by quantifying variation between\nreconstructions with and without this patch, and learning to rank the patches\nwith the pseudo score. The entire process is self-supervised, thus getting out\nof the dilemma of categorical inductive bias. We design extensive experiments\non different datasets and tasks. The results demonstrate that LTRP outperforms\nboth supervised and other self-supervised methods due to the fair assessment of\nimage content.\n","authors":["Yang Luo","Zhineng Chen","Peng Zhou","Zuxuan Wu","Xieping Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.00680v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16375v1","updated":"2024-04-25T07:29:17Z","published":"2024-04-25T07:29:17Z","title":"List Items One by One: A New Data Source and Learning Paradigm for\n Multimodal LLMs","summary":" Set-of-Mark (SoM) Prompting unleashes the visual grounding capability of\nGPT-4V, by enabling the model to associate visual objects with tags inserted on\nthe image. These tags, marked with alphanumerics, can be indexed via text\ntokens for easy reference. 
Despite the extraordinary performance from GPT-4V,\nwe observe that other Multimodal Large Language Models (MLLMs) struggle to\nunderstand these visual tags. To promote the learning of SoM prompting for\nopen-source models, we propose a new learning paradigm: \"list items one by\none,\" which asks the model to enumerate and describe all visual tags placed on\nthe image following the alphanumeric orders of tags. By integrating our curated\ndataset with other visual instruction tuning datasets, we are able to equip\nexisting MLLMs with the SoM prompting ability. Furthermore, we evaluate our\nfinetuned SoM models on five MLLM benchmarks. We find that this new dataset,\neven in a relatively small size (10k-30k images with tags), significantly\nenhances visual reasoning capabilities and reduces hallucinations for MLLMs.\nPerhaps surprisingly, these improvements persist even when the visual tags are\nomitted from input images during inference. This suggests the potential of\n\"list items one by one\" as a new paradigm for training MLLMs, which strengthens\nthe object-text alignment through the use of visual tags in the training stage.\nFinally, we conduct analyses by probing trained models to understand the\nworking mechanism of SoM. Our code and data are available at\n\\url{https://github.com/zzxslp/SoM-LLaVA}.\n","authors":["An Yan","Zhengyuan Yang","Junda Wu","Wanrong Zhu","Jianwei Yang","Linjie Li","Kevin Lin","Jianfeng Wang","Julian McAuley","Jianfeng Gao","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16375v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.16371v1","updated":"2024-04-25T07:21:14Z","published":"2024-04-25T07:21:14Z","title":"Multimodal Information Interaction for Medical Image Segmentation","summary":" The use of multimodal data in assisted diagnosis and segmentation has emerged\nas a prominent area of interest in current research. However, one of the\nprimary challenges is how to effectively fuse multimodal features. Most of the\ncurrent approaches focus on the integration of multimodal features while\nignoring the correlation and consistency between different modal features,\nleading to the inclusion of potentially irrelevant information. To address this\nissue, we introduce an innovative Multimodal Information Cross Transformer\n(MicFormer), which employs a dual-stream architecture to simultaneously extract\nfeatures from each modality. Leveraging the Cross Transformer, it queries\nfeatures from one modality and retrieves corresponding responses from another,\nfacilitating effective communication between bimodal features. Additionally, we\nincorporate a deformable Transformer architecture to expand the search space.\nWe conducted experiments on the MM-WHS dataset, and in the CT-MRI multimodal\nimage segmentation task, we successfully improved the whole-heart segmentation\nDICE score to 85.57 and MIoU to 75.51. Compared to other multimodal\nsegmentation techniques, our method outperforms by margins of 2.83 and 4.23,\nrespectively. This demonstrates the efficacy of MicFormer in integrating\nrelevant information between different modalities in multimodal tasks. These\nfindings hold significant implications for multimodal image tasks, and we\nbelieve that MicFormer possesses extensive potential for broader applications\nacross various domains. 
Access to our method is available at\nhttps://github.com/fxxJuses/MICFormer\n","authors":["Xinxin Fan","Lin Liu","Haoran Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06209v2","updated":"2024-04-25T07:12:39Z","published":"2024-01-11T18:58:36Z","title":"Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs","summary":" Is vision good enough for language? Recent advancements in multimodal models\nprimarily stem from the powerful reasoning abilities of large language models\n(LLMs). However, the visual component typically depends only on the\ninstance-level contrastive language-image pre-training (CLIP). Our research\nreveals that the visual capabilities in recent multimodal LLMs (MLLMs) still\nexhibit systematic shortcomings. To understand the roots of these errors, we\nexplore the gap between the visual embedding space of CLIP and vision-only\nself-supervised learning. We identify ''CLIP-blind pairs'' - images that CLIP\nperceives as similar despite their clear visual differences. With these pairs,\nwe construct the Multimodal Visual Patterns (MMVP) benchmark. MMVP exposes\nareas where state-of-the-art systems, including GPT-4V, struggle with\nstraightforward questions across nine basic visual patterns, often providing\nincorrect answers and hallucinated explanations. We further evaluate various\nCLIP-based vision-and-language models and found a notable correlation between\nvisual patterns that challenge CLIP models and those problematic for multimodal\nLLMs. As an initial effort to address these issues, we propose a Mixture of\nFeatures (MoF) approach, demonstrating that integrating vision self-supervised\nlearning features with MLLMs can significantly enhance their visual grounding\ncapabilities. Together, our research suggests visual representation learning\nremains an open challenge, and accurate visual grounding is crucial for future\nsuccessful multimodal systems.\n","authors":["Shengbang Tong","Zhuang Liu","Yuexiang Zhai","Yi Ma","Yann LeCun","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2401.06209v2.pdf","comment":"Project page: https://tsb0601.github.io/mmvp_blog/"},{"id":"http://arxiv.org/abs/2404.14829v2","updated":"2024-04-25T07:04:03Z","published":"2024-04-23T08:31:55Z","title":"Revisiting Neural Networks for Continual Learning: An Architectural\n Perspective","summary":" Efforts to overcome catastrophic forgetting have primarily centered around\ndeveloping more effective Continual Learning (CL) methods. In contrast, less\nattention was devoted to analyzing the role of network architecture design\n(e.g., network depth, width, and components) in contributing to CL. This paper\nseeks to bridge this gap between network architecture design and CL, and to\npresent a holistic study on the impact of network architectures on CL. This\nwork considers architecture design at the network scaling level, i.e., width\nand depth, and also at the network components, i.e., skip connections, global\npooling layers, and down-sampling. In both cases, we first derive insights\nthrough systematically exploring how architectural designs affect CL. 
Then,\ngrounded in these insights, we craft a specialized search space for CL and\nfurther propose a simple yet effective ArchCraft method to steer a CL-friendly\narchitecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC.\nExperimental validation across various CL settings and scenarios demonstrates\nthat improved architectures are parameter-efficient, achieving state-of-the-art\nperformance of CL while being 86%, 61%, and 97% more compact in terms of\nparameters than the naive CL architecture in Task IL and Class IL. Code is\navailable at https://github.com/byyx666/ArchCraft.\n","authors":["Aojun Lu","Tao Feng","Hangjie Yuan","Xiaotian Song","Yanan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.14829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14828v3","updated":"2024-04-25T06:54:35Z","published":"2024-01-26T12:57:05Z","title":"TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And\n Image-Prompts","summary":" Text-driven 3D scene editing has gained significant attention owing to its\nconvenience and user-friendliness. However, existing methods still lack\naccurate control of the specified appearance and location of the editing result\ndue to the inherent limitations of the text description. To this end, we\npropose a 3D scene editing framework, TIPEditor, that accepts both text and\nimage prompts and a 3D bounding box to specify the editing region. With the\nimage prompt, users can conveniently specify the detailed appearance/style of\nthe target content in complement to the text description, enabling accurate\ncontrol of the appearance. Specifically, TIP-Editor employs a stepwise 2D\npersonalization strategy to better learn the representation of the existing\nscene and the reference image, in which a localization loss is proposed to\nencourage correct object placement as specified by the bounding box.\nAdditionally, TIPEditor utilizes explicit and flexible 3D Gaussian splatting as\nthe 3D representation to facilitate local editing while keeping the background\nunchanged. Extensive experiments have demonstrated that TIP-Editor conducts\naccurate editing following the text and image prompts in the specified bounding\nbox region, consistently outperforming the baselines in editing quality, and\nthe alignment to the prompts, qualitatively and quantitatively.\n","authors":["Jingyu Zhuang","Di Kang","Yan-Pei Cao","Guanbin Li","Liang Lin","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.14828v3.pdf","comment":"Accpeted by Siggraph 2024 & ACM Transactions on Graphics"},{"id":"http://arxiv.org/abs/2404.05212v2","updated":"2024-04-25T06:53:06Z","published":"2024-04-08T05:58:07Z","title":"DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage\n CJK Character Generation","summary":" Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,\nhave profound influence on society and culture. The typesetting of CJK\nlanguages carries a wide range of requirements due to the complexity of their\nscripts and unique literary traditions. A critical aspect of this typesetting\nprocess is that CJK fonts need to provide a set of consistent-looking glyphs\nfor approximately one hundred thousand characters. However, creating such a\nfont is inherently labor-intensive and expensive, which significantly hampers\nthe development of new CJK fonts for typesetting, historical, aesthetic, or\nartistic purposes. 
To bridge this gap, we are motivated by recent advancements\nin diffusion-based generative models and propose a novel diffusion method for\ngenerating glyphs in a targeted style from a single conditioned, standard glyph\nform. Our experiments show that our method is capable of generating fonts of\nboth printed and hand-written styles, the latter of which presents a greater\nchallenge. Moreover, our approach shows remarkable zero-shot generalization\ncapabilities for non-CJK but Chinese-inspired scripts. We also show our method\nfacilitates smooth style interpolation and generates bitmap images suitable for\nvectorization, which is crucial in the font creation process. In summary, our\nproposed method opens the door to high-quality, generative model-assisted font\ncreation for CJK characters, for both typesetting and artistic endeavors.\n","authors":["Yingtao Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05212v2.pdf","comment":"Accepted in 15th International Conference on Computational\n Creativity, ICCC'24"},{"id":"http://arxiv.org/abs/2404.16359v1","updated":"2024-04-25T06:41:58Z","published":"2024-04-25T06:41:58Z","title":"An Improved Graph Pooling Network for Skeleton-Based Action Recognition","summary":" Pooling is a crucial operation in computer vision, yet the unique structure\nof skeletons hinders the application of existing pooling strategies to skeleton\ngraph modelling. In this paper, we propose an Improved Graph Pooling Network,\nreferred to as IGPN. The main innovations include: Our method incorporates a\nregion-awareness pooling strategy based on structural partitioning. The\ncorrelation matrix of the original feature is used to adaptively adjust the\nweight of information in different regions of the newly generated features,\nresulting in more flexible and effective processing. To prevent the\nirreversible loss of discriminative information, we propose a cross fusion\nmodule and an information supplement module to provide block-level and\ninput-level information respectively. As a plug-and-play structure, the\nproposed operation can be seamlessly combined with existing GCN-based models.\nWe conducted extensive evaluations on several challenging benchmarks, and the\nexperimental results indicate the effectiveness of our proposed solutions. For\nexample, in the cross-subject evaluation of the NTU-RGB+D 60 dataset, IGPN\nachieves a significant improvement in accuracy compared to the baseline while\nreducing Flops by nearly 70%; a heavier version has also been introduced to\nfurther boost accuracy.\n","authors":["Cong Wu","Xiao-Jun Wu","Tianyang Xu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2404.16359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09987v2","updated":"2024-04-25T06:31:01Z","published":"2024-04-15T17:58:57Z","title":"OneChart: Purify the Chart Structural Extraction via One Auxiliary Token","summary":" Chart parsing poses a significant challenge due to the diversity of styles,\nvalues, texts, and so forth. Even advanced large vision-language models (LVLMs)\nwith billions of parameters struggle to handle such tasks satisfactorily. To\naddress this, we propose OneChart: a reliable agent specifically devised for\nthe structural extraction of chart information. Similar to popular LVLMs,\nOneChart incorporates an autoregressive main body. Uniquely, to enhance the\nreliability of the numerical parts of the output, we introduce an auxiliary\ntoken placed at the beginning of the total tokens along with an additional\ndecoder. 
The numerically optimized (auxiliary) token allows subsequent tokens\nfor chart parsing to capture enhanced numerical features through causal\nattention. Furthermore, with the aid of the auxiliary token, we have devised a\nself-evaluation mechanism that enables the model to gauge the reliability of\nits chart parsing results by providing confidence scores for the generated\ncontent. Compared to current state-of-the-art (SOTA) chart parsing models,\ne.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average\nPrecision (AP) for chart structural extraction across multiple public\nbenchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart\nparsing agent, it also brings 10%+ accuracy gains for the popular LVLM\n(LLaVA-1.6) in the downstream ChartQA benchmark.\n","authors":["Jinyue Chen","Lingyu Kong","Haoran Wei","Chenglong Liu","Zheng Ge","Liang Zhao","Jianjian Sun","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09987v2.pdf","comment":"14 pages, 9 figures and 6 tables"},{"id":"http://arxiv.org/abs/2404.15736v2","updated":"2024-04-25T06:04:16Z","published":"2024-04-24T08:50:45Z","title":"What Makes Multimodal In-Context Learning Work?","summary":" Large Language Models have demonstrated remarkable performance across various\ntasks, exhibiting the capacity to swiftly acquire new skills, such as through\nIn-Context Learning (ICL) with minimal demonstration examples. In this work, we\npresent a comprehensive framework for investigating Multimodal ICL (M-ICL) in\nthe context of Large Multimodal Models. We consider the best open-source\nmultimodal models (e.g., IDEFICS, OpenFlamingo) and a wide range of multimodal\ntasks. Our study unveils several noteworthy findings: (1) M-ICL primarily\nrelies on text-driven mechanisms, showing little to no influence from the image\nmodality. (2) When used with advanced-ICL strategy (like RICES), M-ICL is not\nbetter than a simple strategy based on majority voting over context examples.\nMoreover, we identify several biases and limitations of M-ICL that warrant\nconsideration prior to deployment. Code available at\nhttps://gitlab.com/folbaeni/multimodal-icl\n","authors":["Folco Bertini Baldassini","Mustafa Shukor","Matthieu Cord","Laure Soulier","Benjamin Piwowarski"],"pdf_url":"https://arxiv.org/pdf/2404.15736v2.pdf","comment":"20 pages, 16 figures. Accepted to CVPR 2024 Workshop on Prompting in\n Vision. Project page: https://folbaeni.gitlab.io/multimodal-icl"},{"id":"http://arxiv.org/abs/2404.16348v1","updated":"2024-04-25T05:59:42Z","published":"2024-04-25T05:59:42Z","title":"Dual Expert Distillation Network for Generalized Zero-Shot Learning","summary":" Zero-shot learning has consistently yielded remarkable progress via modeling\nnuanced one-to-one visual-attribute correlation. Existing studies resort to\nrefining a uniform mapping function to align and correlate the sample regions\nand subattributes, ignoring two crucial issues: 1) the inherent asymmetry of\nattributes; and 2) the unutilized channel information. This paper addresses\nthese issues by introducing a simple yet effective approach, dubbed Dual Expert\nDistillation Network (DEDN), where two experts are dedicated to coarse- and\nfine-grained visual-attribute modeling, respectively. 
Concretely, one coarse\nexpert, namely cExp, has a complete perceptual scope to coordinate\nvisual-attribute similarity metrics across dimensions, and moreover, another\nfine expert, namely fExp, consists of multiple specialized subnetworks, each\ncorresponds to an exclusive set of attributes. Two experts cooperatively\ndistill from each other to reach a mutual agreement during training. Meanwhile,\nwe further equip DEDN with a newly designed backbone network, i.e., Dual\nAttention Network (DAN), which incorporates both region and channel attention\ninformation to fully exploit and leverage visual semantic knowledge.\nExperiments on various benchmark datasets indicate a new state-of-the-art.\n","authors":["Zhijie Rao","Jingcai Guo","Xiaocheng Lu","Jingming Liang","Jie Zhang","Haozhao Wang","Kang Wei","Xiaofeng Cao"],"pdf_url":"https://arxiv.org/pdf/2404.16348v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.16346v1","updated":"2024-04-25T05:42:41Z","published":"2024-04-25T05:42:41Z","title":"Light-weight Retinal Layer Segmentation with Global Reasoning","summary":" Automatic retinal layer segmentation with medical images, such as optical\ncoherence tomography (OCT) images, serves as an important tool for diagnosing\nophthalmic diseases. However, it is challenging to achieve accurate\nsegmentation due to low contrast and blood flow noises presented in the images.\nIn addition, the algorithm should be light-weight to be deployed for practical\nclinical applications. Therefore, it is desired to design a light-weight\nnetwork with high performance for retinal layer segmentation. In this paper, we\npropose LightReSeg for retinal layer segmentation which can be applied to OCT\nimages. Specifically, our approach follows an encoder-decoder structure, where\nthe encoder part employs multi-scale feature extraction and a Transformer block\nfor fully exploiting the semantic information of feature maps at all scales and\nmaking the features have better global reasoning capabilities, while the\ndecoder part, we design a multi-scale asymmetric attention (MAA) module for\npreserving the semantic information at each encoder scale. The experiments show\nthat our approach achieves a better segmentation performance compared to the\ncurrent state-of-the-art method TransUnet with 105.7M parameters on both our\ncollected dataset and two other public datasets, with only 3.3M parameters.\n","authors":["Xiang He","Weiye Song","Yiming Wang","Fabio Poiesi","Ji Yi","Manishi Desai","Quanqing Xu","Kongzheng Yang","Yi Wan"],"pdf_url":"https://arxiv.org/pdf/2404.16346v1.pdf","comment":"IEEE Transactions on Instrumentation & Measurement"},{"id":"http://arxiv.org/abs/2404.15882v2","updated":"2024-04-25T05:38:52Z","published":"2024-04-24T13:59:19Z","title":"Unexplored Faces of Robustness and Out-of-Distribution: Covariate Shifts\n in Environment and Sensor Domains","summary":" Computer vision applications predict on digital images acquired by a camera\nfrom physical scenes through light. However, conventional robustness benchmarks\nrely on perturbations in digitized images, diverging from distribution shifts\noccurring in the image acquisition process. To bridge this gap, we introduce a\nnew distribution shift dataset, ImageNet-ES, comprising variations in\nenvironmental and camera sensor factors by directly capturing 202k images with\na real camera in a controllable testbed. With the new dataset, we evaluate\nout-of-distribution (OOD) detection and model robustness. 
We find that existing\nOOD detection methods do not cope with the covariate shifts in ImageNet-ES,\nimplying that the definition and detection of OOD should be revisited to\nembrace real-world distribution shifts. We also observe that the model becomes\nmore robust in both ImageNet-C and -ES by learning environment and sensor\nvariations in addition to existing digital augmentations. Lastly, our results\nsuggest that effective shift mitigation via camera sensor control can\nsignificantly improve performance without increasing model size. With these\nfindings, our benchmark may aid future research on robustness, OOD, and camera\nsensor control for computer vision. Our code and dataset are available at\nhttps://github.com/Edw2n/ImageNet-ES.\n","authors":["Eunsu Baek","Keondo Park","Jiyoon Kim","Hyung-Sin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.15882v2.pdf","comment":"Published as a conference paper at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16339v1","updated":"2024-04-25T05:07:50Z","published":"2024-04-25T05:07:50Z","title":"Training-Free Unsupervised Prompt for Vision-Language Models","summary":" Prompt learning has become the most effective paradigm for adapting large\npre-trained vision-language models (VLMs) to downstream tasks. Recently,\nunsupervised prompt tuning methods, such as UPL and POUF, directly leverage\npseudo-labels as supervisory information to fine-tune additional adaptation\nmodules on unlabeled data. However, inaccurate pseudo labels easily misguide\nthe tuning process and result in poor representation capabilities. In light of\nthis, we propose Training-Free Unsupervised Prompts (TFUP), which maximally\npreserves the inherent representation capabilities and enhances them with a\nresidual connection to similarity-based prediction probabilities in a\ntraining-free and labeling-free manner. Specifically, we integrate both\ninstance confidence and prototype scores to select representative samples,\nwhich are used to customize a reliable Feature Cache Model (FCM) for\ntraining-free inference. Then, we design a Multi-level Similarity Measure (MSM)\nthat considers both feature-level and semantic-level similarities to calculate\nthe distance between each test image and the cached sample as the weight of the\ncorresponding cached label to generate similarity-based prediction\nprobabilities. In this way, TFUP achieves surprising performance, even\nsurpassing the training-base method on multiple classification datasets. Based\non our TFUP, we propose a training-based approach (TFUP-T) to further boost the\nadaptation performance. In addition to the standard cross-entropy loss, TFUP-T\nadopts an additional marginal distribution entropy loss to constrain the model\nfrom a global perspective. Our TFUP-T achieves new state-of-the-art\nclassification performance compared to unsupervised and few-shot adaptation\napproaches on multiple benchmarks. 
In particular, TFUP-T improves the\nclassification accuracy of POUF by 3.3% on the most challenging Domain-Net\ndataset.\n","authors":["Sifan Long","Linbin Wang","Zhen Zhao","Zichang Tan","Yiming Wu","Shengsheng Wang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16336v1","updated":"2024-04-25T04:53:43Z","published":"2024-04-25T04:53:43Z","title":"FedStyle: Style-Based Federated Learning Crowdsourcing Framework for Art\n Commissions","summary":" The unique artistic style is crucial to artists' occupational\ncompetitiveness, yet prevailing Art Commission Platforms rarely support\nstyle-based retrieval. Meanwhile, the fast-growing generative AI techniques\naggravate artists' concerns about releasing personal artworks to public\nplatforms. To achieve artistic style-based retrieval without exposing personal\nartworks, we propose FedStyle, a style-based federated learning crowdsourcing\nframework. It allows artists to train local style models and share model\nparameters rather than artworks for collaboration. However, most artists\npossess a unique artistic style, resulting in severe model drift among them.\nFedStyle addresses such extreme data heterogeneity by having artists learn\ntheir abstract style representations and align with the server, rather than\nmerely aggregating model parameters lacking semantics. Besides, we introduce\ncontrastive learning to meticulously construct the style representation space,\npulling artworks with similar styles closer and keeping different ones apart in\nthe embedding space. Extensive experiments on the proposed datasets demonstrate\nthe superiority of FedStyle.\n","authors":["Changjuan Ran","Yeting Guo","Fang Liu","Shenglan Cui","Yunfan Ye"],"pdf_url":"https://arxiv.org/pdf/2404.16336v1.pdf","comment":"Accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.16331v1","updated":"2024-04-25T04:37:35Z","published":"2024-04-25T04:37:35Z","title":"IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced\n Learning Tasks","summary":" Model Weight Averaging (MWA) is a technique that seeks to enhance model's\nperformance by averaging the weights of multiple trained models. This paper\nfirst empirically finds that 1) the vanilla MWA can benefit the\nclass-imbalanced learning, and 2) performing model averaging in the early\nepochs of training yields a greater performance improvement than doing that in\nlater epochs. Inspired by these two observations, in this paper we propose a\nnovel MWA technique for class-imbalanced learning tasks named Iterative Model\nWeight Averaging (IMWA). Specifically, IMWA divides the entire training stage\ninto multiple episodes. Within each episode, multiple models are concurrently\ntrained from the same initialized model weight, and subsequently averaged into\na singular model. Then, the weight of this average model serves as a fresh\ninitialization for the ensuing episode, thus establishing an iterative learning\nparadigm. Compared to vanilla MWA, IMWA achieves higher performance\nimprovements with the same computational cost. Moreover, IMWA can further\nenhance the performance of those methods employing EMA strategy, demonstrating\nthat IMWA and EMA can complement each other. 
Extensive experiments on various\nclass-imbalanced learning tasks, i.e., class-imbalanced image classification,\nsemi-supervised class-imbalanced image classification and semi-supervised\nobject detection tasks showcase the effectiveness of our IMWA.\n","authors":["Zitong Huang","Ze Chen","Bowen Dong","Chaoqi Liang","Erjin Zhou","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.16331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16325v1","updated":"2024-04-25T04:21:57Z","published":"2024-04-25T04:21:57Z","title":"Semantic Segmentation Refiner for Ultrasound Applications with Zero-Shot\n Foundation Models","summary":" Despite the remarkable success of deep learning in medical imaging analysis,\nmedical image segmentation remains challenging due to the scarcity of\nhigh-quality labeled images for supervision. Further, the significant domain\ngap between natural and medical images in general and ultrasound images in\nparticular hinders fine-tuning models trained on natural images to the task at\nhand. In this work, we address the performance degradation of segmentation\nmodels in low-data regimes and propose a prompt-less segmentation method\nharnessing the ability of segmentation foundation models to segment abstract\nshapes. We do that via our novel prompt point generation algorithm which uses\ncoarse semantic segmentation masks as input and a zero-shot prompt-able\nfoundation model as an optimization target. We demonstrate our method on a\nsegmentation findings task (pathologic anomalies) in ultrasound images. Our\nmethod's advantages are brought to light in varying degrees of low-data regime\nexperiments on a small-scale musculoskeletal ultrasound images dataset,\nyielding a larger performance gain as the training set size decreases.\n","authors":["Hedda Cohen Indelman","Elay Dahan","Angeles M. Perez-Agosto","Carmit Shiran","Doron Shaked","Nati Daniel"],"pdf_url":"https://arxiv.org/pdf/2404.16325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16323v1","updated":"2024-04-25T04:18:59Z","published":"2024-04-25T04:18:59Z","title":"DIG3D: Marrying Gaussian Splatting with Deformable Transformer for\n Single Image 3D Reconstruction","summary":" In this paper, we study the problem of 3D reconstruction from a single-view\nRGB image and propose a novel approach called DIG3D for 3D object\nreconstruction and novel view synthesis. Our method utilizes an encoder-decoder\nframework which generates 3D Gaussians in decoder with the guidance of\ndepth-aware image features from encoder. In particular, we introduce the use of\ndeformable transformer, allowing efficient and effective decoding through 3D\nreference point and multi-layer refinement adaptations. By harnessing the\nbenefits of 3D Gaussians, our approach offers an efficient and accurate\nsolution for 3D reconstruction from single-view images. We evaluate our method\non the ShapeNet SRN dataset, getting PSNR of 24.21 and 24.98 in car and chair\ndataset, respectively. 
The results outperform the recent method by around\n2.25%, demonstrating the effectiveness of our method in achieving superior\nresults.\n","authors":["Jiamin Wu","Kenkun Liu","Han Gao","Xiaoke Jiang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13516v3","updated":"2024-04-25T03:57:03Z","published":"2024-01-24T15:14:05Z","title":"Delocate: Detection and Localization for Deepfake Videos with\n Randomly-Located Tampered Traces","summary":" Deepfake videos are becoming increasingly realistic, showing subtle tampering\ntraces on facial areas that vary between frames. Consequently, many existing\nDeepfake detection methods struggle to detect unknown domain Deepfake videos\nwhile accurately locating the tampered region. To address this limitation, we\npropose Delocate, a novel Deepfake detection model that can both recognize\nand localize unknown domain Deepfake videos. Our method consists of two stages\nnamed recovering and localization. In the recovering stage, the model randomly\nmasks regions of interest (ROIs) and reconstructs real faces without tampering\ntraces, resulting in a relatively good recovery effect for real faces and a poor\nrecovery effect for fake faces. In the localization stage, the output of the\nrecovery phase and the forgery ground truth mask serve as supervision to guide\nthe forgery localization process. This process strategically emphasizes the\nrecovery phase of fake faces with poor recovery, facilitating the localization\nof tampered regions. Our extensive experiments on four widely used benchmark\ndatasets demonstrate that Delocate not only excels in localizing tampered areas\nbut also enhances cross-domain detection performance.\n","authors":["Juan Hu","Xin Liao","Difei Gao","Satoshi Tsutsui","Qian Wang","Zheng Qin","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2401.13516v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.09921,\n arXiv:2305.05943"},{"id":"http://arxiv.org/abs/2312.04484v2","updated":"2024-04-25T03:38:39Z","published":"2023-12-07T17:59:53Z","title":"FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation","summary":" LiDAR segmentation has become a crucial component in advanced autonomous\ndriving systems. Recent range-view LiDAR segmentation approaches show promise\nfor real-time processing. However, they inevitably suffer from corrupted\ncontextual information and rely heavily on post-processing techniques for\nprediction refinement. In this work, we propose FRNet, a simple yet powerful\nmethod aimed at restoring the contextual information of range image pixels\nusing corresponding frustum LiDAR points. Firstly, a frustum feature encoder\nmodule is used to extract per-point features within the frustum region, which\npreserves scene consistency and is crucial for point-level predictions. Next, a\nfrustum-point fusion module is introduced to update per-point features\nhierarchically, enabling each point to extract more surrounding information via\nthe frustum features. Finally, a head fusion module is used to fuse features at\ndifferent levels for final semantic prediction. Extensive experiments conducted\non four popular LiDAR segmentation benchmarks under various task setups\ndemonstrate the superiority of FRNet. Notably, FRNet achieves 73.3% and 82.5%\nmIoU scores on the testing sets of SemanticKITTI and nuScenes. While achieving\ncompetitive performance, FRNet operates 5 times faster than state-of-the-art\napproaches. 
Such high efficiency opens up new possibilities for more scalable\nLiDAR segmentation. The code has been made publicly available at\nhttps://github.com/Xiangxu-0103/FRNet.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04484v2.pdf","comment":"Preprint; 16 pages, 8 figures, 10 tables; Code at\n https://github.com/Xiangxu-0103/FRNet"},{"id":"http://arxiv.org/abs/2403.18442v2","updated":"2024-04-25T03:34:01Z","published":"2024-03-27T10:50:24Z","title":"Backpropagation-free Network for 3D Test-time Adaptation","summary":" Real-world systems often encounter new data over time, which leads to\nexperiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods\ntend to apply computationally heavy and memory-intensive backpropagation-based\napproaches to handle this. Here, we propose a novel method that uses a\nbackpropagation-free approach for TTA for the specific case of 3D data. Our\nmodel uses a two-stream architecture to maintain knowledge about the source\ndomain as well as complementary target-domain-specific information. The\nbackpropagation-free property of our model helps address the well-known\nforgetting problem and mitigates the error accumulation issue. The proposed\nmethod also eliminates the need for the usually noisy process of\npseudo-labeling and reliance on costly self-supervised training. Moreover, our\nmethod leverages subspace learning, effectively reducing the distribution\nvariance between the two domains. Furthermore, the source-domain-specific and\nthe target-domain-specific streams are aligned using a novel entropy-based\nadaptive fusion strategy. Extensive experiments on popular benchmarks\ndemonstrate the effectiveness of our method. The code will be available at\n\\url{https://github.com/abie-e/BFTT3D}.\n","authors":["Yanshuo Wang","Ali Cheraghian","Zeeshan Hayder","Jie Hong","Sameera Ramasinghe","Shafin Rahman","David Ahmedt-Aristizabal","Xuesong Li","Lars Petersson","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2403.18442v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.11003v2","updated":"2024-04-25T03:25:43Z","published":"2024-04-17T02:29:44Z","title":"InfoMatch: Entropy Neural Estimation for Semi-Supervised Image\n Classification","summary":" Semi-supervised image classification, leveraging pseudo supervision and\nconsistency regularization, has demonstrated remarkable success. However, the\nongoing challenge lies in fully exploiting the potential of unlabeled data. To\naddress this, we employ information entropy neural estimation to utilize the\npotential of unlabeled samples. Inspired by contrastive learning, the entropy\nis estimated by maximizing a lower bound on mutual information across different\naugmented views. Moreover, we theoretically analyze that the information\nentropy of the posterior of an image classifier is approximated by maximizing\nthe likelihood function of the softmax predictions. Guided by these insights,\nwe optimize our model from both perspectives to ensure that the predicted\nprobability distribution closely aligns with the ground-truth distribution.\nGiven the theoretical connection to information entropy, we name our method\nInfoMatch. Through extensive experiments, we show its superior performance. 
The\nsource code is available at https://github.com/kunzhan/InfoMatch.\n","authors":["Qi Han","Zhibo Tian","Chengwei Xia","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.11003v2.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.13016v2","updated":"2024-04-25T03:25:25Z","published":"2024-04-19T17:25:43Z","title":"Optimizing Calibration by Gaining Aware of Prediction Correctness","summary":" Model calibration aims to align confidence with prediction correctness. The\nCross-Entropy (CE) loss is widely used for calibrator training, which enforces\nthe model to increase confidence on the ground truth class. However, we find\nthe CE loss has intrinsic limitations. For example, for a narrow\nmisclassification, a calibrator trained by the CE loss often produces high\nconfidence on the wrongly predicted class (e.g., a test sample is wrongly\nclassified and its softmax score on the ground truth class is around 0.4),\nwhich is undesirable. In this paper, we propose a new post-hoc calibration\nobjective derived from the aim of calibration. Intuitively, the proposed\nobjective function asks that the calibrator decrease model confidence on\nwrongly predicted samples and increase confidence on correctly predicted\nsamples. Because a sample itself has insufficient ability to indicate\ncorrectness, we use its transformed versions (e.g., rotated, greyscaled and\ncolor-jittered) during calibrator training. Trained on an in-distribution\nvalidation set and tested with isolated, individual test samples, our method\nachieves competitive calibration performance on both in-distribution and\nout-of-distribution test sets compared with the state of the art. Further, our\nanalysis points out the difference between our method and commonly used\nobjectives such as CE loss and mean square error loss, where the latter\nsometimes deviates from the calibration aim.\n","authors":["Yuchi Liu","Lei Wang","Yuli Zou","James Zou","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.13016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16307v1","updated":"2024-04-25T03:22:48Z","published":"2024-04-25T03:22:48Z","title":"Boosting Model Resilience via Implicit Adversarial Data Augmentation","summary":" Data augmentation plays a pivotal role in enhancing and diversifying training\ndata. Nonetheless, consistently improving model performance in varied learning\nscenarios, especially those with inherent data biases, remains challenging. To\naddress this, we propose to augment the deep features of samples by\nincorporating their adversarial and anti-adversarial perturbation\ndistributions, enabling adaptive adjustment in the learning difficulty tailored\nto each sample's specific characteristics. We then theoretically reveal that\nour augmentation process approximates the optimization of a surrogate loss\nfunction as the number of augmented copies increases indefinitely. This insight\nleads us to develop a meta-learning-based framework for optimizing classifiers\nwith this novel loss, introducing the effects of augmentation while bypassing\nthe explicit augmentation process. We conduct extensive experiments across four\ncommon biased learning scenarios: long-tail learning, generalized long-tail\nlearning, noisy label learning, and subpopulation shift learning. 
The empirical\nresults demonstrate that our method consistently achieves state-of-the-art\nperformance, highlighting its broad adaptability.\n","authors":["Xiaoling Zhou","Wei Ye","Zhemg Lee","Rui Xie","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16307v1.pdf","comment":"9 pages, 6 figures, accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.16306v1","updated":"2024-04-25T03:21:11Z","published":"2024-04-25T03:21:11Z","title":"TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion\n Models","summary":" Text-conditioned image-to-video generation (TI2V) aims to synthesize a\nrealistic video starting from a given image (e.g., a woman's photo) and a text\ndescription (e.g., \"a woman is drinking water.\"). Existing TI2V frameworks\noften require costly training on video-text datasets and specific model designs\nfor text and image conditioning. In this paper, we propose TI2V-Zero, a\nzero-shot, tuning-free method that empowers a pretrained text-to-video (T2V)\ndiffusion model to be conditioned on a provided image, enabling TI2V generation\nwithout any optimization, fine-tuning, or introducing external modules. Our\napproach leverages a pretrained T2V diffusion foundation model as the\ngenerative prior. To guide video generation with the additional image input, we\npropose a \"repeat-and-slide\" strategy that modulates the reverse denoising\nprocess, allowing the frozen diffusion model to synthesize a video\nframe-by-frame starting from the provided image. To ensure temporal continuity,\nwe employ a DDPM inversion strategy to initialize Gaussian noise for each newly\nsynthesized frame and a resampling technique to help preserve visual details.\nWe conduct comprehensive experiments on both domain-specific and open-domain\ndatasets, where TI2V-Zero consistently outperforms a recent open-domain TI2V\nmodel. Furthermore, we show that TI2V-Zero can seamlessly extend to other tasks\nsuch as video infilling and prediction when provided with more images. Its\nautoregressive design also supports long video generation.\n","authors":["Haomiao Ni","Bernhard Egger","Suhas Lohit","Anoop Cherian","Ye Wang","Toshiaki Koike-Akino","Sharon X. Huang","Tim K. Marks"],"pdf_url":"https://arxiv.org/pdf/2404.16306v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.16304v1","updated":"2024-04-25T03:05:46Z","published":"2024-04-25T03:05:46Z","title":"BezierFormer: A Unified Architecture for 2D and 3D Lane Detection","summary":" Lane detection has made significant progress in recent years, but there is\nnot a unified architecture for its two sub-tasks: 2D lane detection and 3D lane\ndetection. To fill this gap, we introduce B\\'{e}zierFormer, a unified 2D and 3D\nlane detection architecture based on B\\'{e}zier curve lane representation.\nB\\'{e}zierFormer formulates queries as B\\'{e}zier control points and incorporates\na novel B\\'{e}zier curve attention mechanism. This attention mechanism enables\ncomprehensive and accurate feature extraction for slender lane curves via\nsampling and fusing multiple reference points on each curve. In addition, we\npropose a novel Chamfer IoU-based loss which is more suitable for the\nB\\'{e}zier control points regression. 
The state-of-the-art performance of\nB\\'{e}zierFormer on widely-used 2D and 3D lane detection benchmarks verifies\nits effectiveness and suggests the worthiness of further exploration.\n","authors":["Zhiwei Dong","Xi Zhu","Xiya Cao","Ran Ding","Wei Li","Caifa Zhou","Yongliang Wang","Qiangbo Liu"],"pdf_url":"https://arxiv.org/pdf/2404.16304v1.pdf","comment":"ICME 2024, 11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.16161v2","updated":"2024-04-25T03:03:00Z","published":"2023-10-24T20:08:15Z","title":"MyriadAL: Active Few Shot Learning for Histopathology","summary":" Active Learning (AL) and Few Shot Learning (FSL) are two label-efficient\nmethods which have achieved excellent results recently. However, most prior\narts in both learning paradigms fail to explore the wealth of the vast\nunlabelled data. In this study, we address this issue in the scenario where the\nannotation budget is very limited, yet a large amount of unlabelled data for\nthe target task is available. We frame this work in the context of\nhistopathology where labelling is prohibitively expensive. To this end, we\nintroduce an active few shot learning framework, Myriad Active Learning (MAL),\nincluding a contrastive-learning encoder, pseudo-label generation, and novel\nquery sample selection in the loop. Specifically, we propose to massage\nunlabelled data in a self-supervised manner, where the obtained data\nrepresentations and clustering knowledge form the basis to activate the AL\nloop. With feedback from the oracle in each AL cycle, the pseudo-labels of the\nunlabelled data are refined by optimizing a shallow task-specific net on top of\nthe encoder. These updated pseudo-labels serve to inform and improve the active\nlearning query selection process. Furthermore, we introduce a novel recipe to\ncombine existing uncertainty measures and utilize the entire uncertainty list\nto reduce sample redundancy in AL. Extensive experiments on two public\nhistopathology datasets show that MAL has superior test accuracy, macro\nF1-score, and label efficiency compared to prior works, and can achieve a\ncomparable test accuracy to a fully supervised algorithm while labelling only\n5% of the dataset.\n","authors":["Nico Schiavone","Jingyi Wang","Shuangzhi Li","Roger Zemp","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2310.16161v2.pdf","comment":"Accepted to IEEE CAI 2024. 8 pages, 2 figures. Code available at:\n https://github.com/mesophil/MyriadAL"},{"id":"http://arxiv.org/abs/2404.16302v1","updated":"2024-04-25T02:54:11Z","published":"2024-04-25T02:54:11Z","title":"CFMW: Cross-modality Fusion Mamba for Multispectral Object Detection\n under Adverse Weather Conditions","summary":" Cross-modality images that integrate visible-infrared spectra cues can\nprovide richer complementary information for object detection. Despite this,\nexisting visible-infrared object detection methods severely degrade in severe\nweather conditions. This failure stems from the pronounced sensitivity of\nvisible images to environmental perturbations, such as rain, haze, and snow,\nwhich frequently cause false negatives and false positives in detection. To\naddress this issue, we introduce a novel and challenging task, termed\nvisible-infrared object detection under adverse weather conditions. To foster\nthis task, we have constructed a new Severe Weather Visible-Infrared Dataset\n(SWVID) with diverse severe weather scenes. 
Furthermore, we introduce the\nCross-modality Fusion Mamba with Weather-removal (CFMW) to augment detection\naccuracy in adverse weather conditions. Thanks to the proposed Weather Removal\nDiffusion Model (WRDM) and Cross-modality Fusion Mamba (CFM) modules, CFMW is\nable to mine more essential information of pedestrian features in\ncross-modality fusion, thus could transfer to other rarer scenarios with high\nefficiency and has adequate availability on those platforms with low computing\npower. To the best of our knowledge, this is the first study that targeted\nimprovement and integrated both Diffusion and Mamba modules in cross-modality\nobject detection, successfully expanding the practical application of this type\nof model with its higher accuracy and more advanced architecture. Extensive\nexperiments on both well-recognized and self-created datasets conclusively\ndemonstrate that our CFMW achieves state-of-the-art detection performance,\nsurpassing existing benchmarks. The dataset and source code will be made\npublicly available at https://github.com/lhy-zjut/CFMW.\n","authors":["Haoyuan Li","Qi Hu","You Yao","Kailun Yang","Peng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.16302v1.pdf","comment":"The dataset and source code will be made publicly available at\n https://github.com/lhy-zjut/CFMW"},{"id":"http://arxiv.org/abs/2404.16301v1","updated":"2024-04-25T02:51:55Z","published":"2024-04-25T02:51:55Z","title":"Style Adaptation for Domain-adaptive Semantic Segmentation","summary":" Unsupervised Domain Adaptation (UDA) refers to the method that utilizes\nannotated source domain data and unlabeled target domain data to train a model\ncapable of generalizing to the target domain data. Domain discrepancy leads to\na significant decrease in the performance of general network models trained on\nthe source domain data when applied to the target domain. We introduce a\nstraightforward approach to mitigate the domain discrepancy, which necessitates\nno additional parameter calculations and seamlessly integrates with\nself-training-based UDA methods. Through the transfer of the target domain\nstyle to the source domain in the latent feature space, the model is trained to\nprioritize the target domain style during the decision-making process. We\ntackle the problem at both the image-level and shallow feature map level by\ntransferring the style information from the target domain to the source domain\ndata. As a result, we obtain a model that exhibits superior performance on the\ntarget domain. Our method yields remarkable enhancements in the\nstate-of-the-art performance for synthetic-to-real UDA tasks. For example, our\nproposed method attains a noteworthy UDA performance of 76.93 mIoU on the\nGTA->Cityscapes dataset, representing a notable improvement of +1.03 percentage\npoints over the previous state-of-the-art results.\n","authors":["Ting Li","Jianshu Chao","Deyu An"],"pdf_url":"https://arxiv.org/pdf/2404.16301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16300v1","updated":"2024-04-25T02:48:16Z","published":"2024-04-25T02:48:16Z","title":"Reinforcement Learning with Generative Models for Compact Support Sets","summary":" Foundation models contain a wealth of information from their vast number of\ntraining samples. However, most prior arts fail to extract this information in\na precise and efficient way for small sample sizes. 
In this work, we propose a\nframework utilizing reinforcement learning as a control for foundation models,\nallowing for the granular generation of small, focused synthetic support sets\nto augment the performance of neural network models on real data classification\ntasks. We first allow a reinforcement learning agent access to a novel context\nbased dictionary; the agent then uses this dictionary with a novel prompt\nstructure to form and optimize prompts as inputs to generative models,\nreceiving feedback based on a reward function combining the change in\nvalidation accuracy and entropy. A support set is formed this way over several\nexploration steps. Our framework produced excellent results, increasing\nclassification accuracy by significant margins for no additional labelling or\ndata cost.\n","authors":["Nico Schiavone","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2404.16300v1.pdf","comment":"4 pages, 2 figures. Code available at:\n https://github.com/mesophil/deeprl"},{"id":"http://arxiv.org/abs/2404.16296v1","updated":"2024-04-25T02:28:16Z","published":"2024-04-25T02:28:16Z","title":"Research on Splicing Image Detection Algorithms Based on Natural Image\n Statistical Characteristics","summary":" With the development and widespread application of digital image processing\ntechnology, image splicing has become a common method of image manipulation,\nraising numerous security and legal issues. This paper introduces a new\nsplicing image detection algorithm based on the statistical characteristics of\nnatural images, aimed at improving the accuracy and efficiency of splicing\nimage detection. By analyzing the limitations of traditional methods, we have\ndeveloped a detection framework that integrates advanced statistical analysis\ntechniques and machine learning methods. The algorithm has been validated using\nmultiple public datasets, showing high accuracy in detecting spliced edges and\nlocating tampered areas, as well as good robustness. Additionally, we explore\nthe potential applications and challenges faced by the algorithm in real-world\nscenarios. This research not only provides an effective technological means for\nthe field of image tampering detection but also offers new ideas and methods\nfor future related research.\n","authors":["Ao Xiang","Jingyu Zhang","Qin Yang","Liyang Wang","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.16296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16292v1","updated":"2024-04-25T02:23:11Z","published":"2024-04-25T02:23:11Z","title":"One Noise to Rule Them All: Learning a Unified Model of\n Spatially-Varying Noise Patterns","summary":" Procedural noise is a fundamental component of computer graphics pipelines,\noffering a flexible way to generate textures that exhibit \"natural\" random\nvariation. Many different types of noise exist, each produced by a separate\nalgorithm. In this paper, we present a single generative model which can learn\nto generate multiple types of noise as well as blend between them. In addition,\nit is capable of producing spatially-varying noise blends despite not having\naccess to such data for training. These features are enabled by training a\ndenoising diffusion model using a novel combination of data augmentation and\nnetwork conditioning techniques. Like procedural noise generators, the model's\nbehavior is controllable via interpretable parameters and a source of\nrandomness. We use our model to produce a variety of visually compelling noise\ntextures. 
We also present an application of our model to improving inverse\nprocedural material design; using our model in place of fixed-type noise nodes\nin a procedural material graph results in higher-fidelity material\nreconstructions without needing to know the type of noise in advance.\n","authors":["Arman Maesumi","Dylan Hu","Krishi Saripalli","Vladimir G. Kim","Matthew Fisher","Sören Pirk","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2404.16292v1.pdf","comment":"In ACM Transactions on Graphics (Proceedings of SIGGRAPH) 2024, 21\n pages"},{"id":"http://arxiv.org/abs/2404.12390v2","updated":"2024-04-25T01:55:49Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v2.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/"},{"id":"http://arxiv.org/abs/2404.16268v1","updated":"2024-04-25T00:34:52Z","published":"2024-04-25T00:34:52Z","title":"Lacunarity Pooling Layers for Plant Image Classification using Texture\n Analysis","summary":" Pooling layers (e.g., max and average) may overlook important information\nencoded in the spatial arrangement of pixel intensity and/or feature values. We\npropose a novel lacunarity pooling layer that aims to capture the spatial\nheterogeneity of the feature maps by evaluating the variability within local\nwindows. The layer operates at multiple scales, allowing the network to\nadaptively learn hierarchical features. The lacunarity pooling layer can be\nseamlessly integrated into any artificial neural network architecture.\nExperimental results demonstrate the layer's effectiveness in capturing\nintricate spatial patterns, leading to improved feature extraction\ncapabilities. The proposed approach holds promise in various domains,\nespecially in agricultural image analysis tasks. This work contributes to the\nevolving landscape of artificial neural network architectures by introducing a\nnovel pooling layer that enriches the representation of spatial features. 
Our\ncode is publicly available.\n","authors":["Akshatha Mohan","Joshua Peeples"],"pdf_url":"https://arxiv.org/pdf/2404.16268v1.pdf","comment":"9 pages, 7 figures, accepted at 2024 IEEE/CVF Computer Vision and\n Pattern Recognition Vision for Agriculture Workshop"},{"id":"http://arxiv.org/abs/2404.16266v1","updated":"2024-04-25T00:30:03Z","published":"2024-04-25T00:30:03Z","title":"A Multi-objective Optimization Benchmark Test Suite for Real-time\n Semantic Segmentation","summary":" As one of the emerging challenges in Automated Machine Learning, the\nHardware-aware Neural Architecture Search (HW-NAS) tasks can be treated as\nblack-box multi-objective optimization problems (MOPs). An important\napplication of HW-NAS is real-time semantic segmentation, which plays a pivotal\nrole in autonomous driving scenarios. The HW-NAS for real-time semantic\nsegmentation inherently needs to balance multiple optimization objectives,\nincluding model accuracy, inference speed, and hardware-specific\nconsiderations. Despite its importance, benchmarks have yet to be developed to\nframe such a challenging task as multi-objective optimization. To bridge the\ngap, we introduce a tailored streamline to transform the task of HW-NAS for\nreal-time semantic segmentation into standard MOPs. Building upon the\nstreamline, we present a benchmark test suite, CitySeg/MOP, comprising fifteen\nMOPs derived from the Cityscapes dataset. The CitySeg/MOP test suite is\nintegrated into the EvoXBench platform to provide seamless interfaces with\nvarious programming languages (e.g., Python and MATLAB) for instant fitness\nevaluations. We comprehensively assessed the CitySeg/MOP test suite on various\nmulti-objective evolutionary algorithms, showcasing its versatility and\npracticality. Source codes are available at\nhttps://github.com/EMI-Group/evoxbench.\n","authors":["Yifan Zhao","Zhenyu Liang","Zhichao Lu","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.16266v1.pdf","comment":"8 pages, 16 figures, GECCO 2024"},{"id":"http://arxiv.org/abs/2404.17083v1","updated":"2024-04-25T23:42:09Z","published":"2024-04-25T23:42:09Z","title":"Calculation of Femur Caput Collum Diaphyseal angle for X-Rays images\n using Semantic Segmentation","summary":" This paper investigates the use of deep learning approaches to estimate the\nfemur caput-collum-diaphyseal (CCD) angle from X-ray images. The CCD angle is\nan important measurement in the diagnosis of hip problems, and correct\nprediction can help in the planning of surgical procedures. Manual measurement\nof this angle, on the other hand, can be time-intensive and vulnerable to\ninter-observer variability. In this paper, we present a deep-learning algorithm\nthat can reliably estimate the femur CCD angle from X-ray images. To train and\ntest the performance of our model, we employed an X-ray image dataset with\nassociated femur CCD angle measurements. Furthermore, we built a prototype to\ndisplay the resulting predictions and to allow the user to interact with the\npredictions. As this is happening in a sterile setting during surgery, we\nexpanded our interface to the possibility of being used only by voice commands.\n Our results show that our deep learning model predicts the femur CCD angle on\nX-ray images with great accuracy, with a mean absolute error of 4.3 degrees on\nthe left femur and 4.9 degrees on the right femur on the test dataset. 
Our\nresults suggest that deep learning has the potential to give a more efficient\nand accurate technique for predicting the femur CCD angle, which might have\nsubstantial therapeutic implications for the diagnosis and management of hip\nproblems.\n","authors":["Deepak Bhatia","Muhammad Abdullah","Anne Querfurth","Mahdi Mantash"],"pdf_url":"https://arxiv.org/pdf/2404.17083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08876v7","updated":"2024-04-25T23:13:51Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir black-box nature makes uncertainty quantification challenging. We\ninvestigate the presentation of conformal prediction sets--a distribution-free\nclass of methods for generating prediction sets with specified coverage--to\nexpress uncertainty in AI-advised decision-making. Through a large online\nexperiment, we compare the utility of conformal prediction sets to displays of\nTop-1 and Top-k predictions for AI-advised image labeling. In a pre-registered\nanalysis, we find that the utility of prediction sets for accuracy varies with\nthe difficulty of the task: while they result in accuracy on par with or less\nthan Top-1 and Top-k displays for easy images, prediction sets offer some\nadvantage in assisting humans in labeling out-of-distribution (OOD) images in\nthe setting that we studied, especially when the set size is small. Our results\nempirically pinpoint practical challenges of conformal prediction sets and\nprovide implications on how to incorporate them for real-world decision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v7.pdf","comment":"19 pages, 11 figures, 10 tables. Accepted by ACM CHI 2024"},{"id":"http://arxiv.org/abs/2312.00923v2","updated":"2024-04-25T22:36:32Z","published":"2023-12-01T20:52:10Z","title":"Label Delay in Online Continual Learning","summary":" Online continual learning, the process of training models on streaming data,\nhas gained increasing attention in recent years. However, a critical aspect\noften overlooked is the label delay, where new data may not be labeled due to\nslow and costly annotation processes. We introduce a new continual learning\nframework with explicit modeling of the label delay between data and label\nstreams over time steps. In each step, the framework reveals both unlabeled\ndata from the current time step $t$ and labels delayed with $d$ steps, from the\ntime step $t-d$. In our extensive experiments amounting to 1060 GPU days, we\nshow that merely augmenting the computational resources is insufficient to\ntackle this challenge. Our findings underline a notable performance decline\nwhen solely relying on labeled data when the label delay becomes significant.\nMore surprisingly, when using state-of-the-art SSL and TTA techniques to\nutilize the newer, unlabeled data, they fail to surpass the performance of a\nna\\\"ive method that simply trains on the delayed supervised stream. To this\nend, we introduce a simple, efficient baseline that rehearses from the labeled\nmemory samples that are most similar to the new unlabeled samples. This method\nbridges the accuracy gap caused by label delay without significantly increasing\ncomputational complexity. 
We show experimentally that our method is the least\naffected by the label delay factor and in some cases successfully recovers the\naccuracy of the non-delayed counterpart. We conduct various ablations and\nsensitivity experiments, demonstrating the effectiveness of our approach.\n","authors":["Botos Csaba","Wenxuan Zhang","Matthias Müller","Ser-Nam Lim","Mohamed Elhoseiny","Philip Torr","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2312.00923v2.pdf","comment":"17 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.17064v1","updated":"2024-04-25T22:20:17Z","published":"2024-04-25T22:20:17Z","title":"Detection of Peri-Pancreatic Edema using Deep Learning and Radiomics\n Techniques","summary":" Identifying peri-pancreatic edema is a pivotal indicator for identifying\ndisease progression and prognosis, emphasizing the critical need for accurate\ndetection and assessment in pancreatitis diagnosis and management. This study\n\\textit{introduces a novel CT dataset sourced from 255 patients with pancreatic\ndiseases, featuring annotated pancreas segmentation masks and corresponding\ndiagnostic labels for peri-pancreatic edema condition}. With the novel dataset,\nwe first evaluate the efficacy of the \\textit{LinTransUNet} model, a linear\nTransformer based segmentation algorithm, to segment the pancreas accurately\nfrom CT imaging data. Then, we use segmented pancreas regions with two\ndistinctive machine learning classifiers to identify existence of\nperi-pancreatic edema: deep learning-based models and a radiomics-based eXtreme\nGradient Boosting (XGBoost). The LinTransUNet achieved promising results, with\na dice coefficient of 80.85\\%, and mIoU of 68.73\\%. Among the nine benchmarked\nclassification models for peri-pancreatic edema detection, \\textit{Swin-Tiny}\ntransformer model demonstrated the highest recall of $98.85 \\pm 0.42$ and\nprecision of $98.38\\pm 0.17$. Comparatively, the radiomics-based XGBoost model\nachieved an accuracy of $79.61\\pm4.04$ and recall of $91.05\\pm3.28$, showcasing\nits potential as a supplementary diagnostic tool given its rapid processing\nspeed and reduced training time. Our code is available\n\\url{https://github.com/NUBagciLab/Peri-Pancreatic-Edema-Detection}.\n","authors":["Ziliang Hong","Debesh Jha","Koushik Biswas","Zheyuan Zhang","Yury Velichko","Cemal Yazici","Temel Tirkes","Amir Borhani","Baris Turkbey","Alpay Medetalibeyoglu","Gorkem Durak","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2404.17064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17063v1","updated":"2024-04-25T22:17:32Z","published":"2024-04-25T22:17:32Z","title":"WheelPose: Data Synthesis Techniques to Improve Pose Estimation\n Performance on Wheelchair Users","summary":" Existing pose estimation models perform poorly on wheelchair users due to a\nlack of representation in training data. We present a data synthesis pipeline\nto address this disparity in data collection and subsequently improve pose\nestimation performance for wheelchair users. Our configurable pipeline\ngenerates synthetic data of wheelchair users using motion capture data and\nmotion generation outputs simulated in the Unity game engine. We validated our\npipeline by conducting a human evaluation, investigating perceived realism,\ndiversity, and an AI performance evaluation on a set of synthetic datasets from\nour pipeline that synthesized different backgrounds, models, and postures. 
We\nfound our generated datasets were perceived as realistic by human evaluators,\nhad more diversity than existing image datasets, and had improved person\ndetection and pose estimation performance when fine-tuned on existing pose\nestimation models. Through this work, we hope to create a foothold for future\nefforts in tackling the inclusiveness of AI in a data-centric and human-centric\nmanner with the data synthesis techniques demonstrated in this work. Finally,\nfor future works to extend upon, we open source all code in this research and\nprovide a fully configurable Unity Environment used to generate our datasets.\nIn the case of any models we are unable to share due to redistribution and\nlicensing policies, we provide detailed instructions on how to source and\nreplace said models.\n","authors":["William Huang","Sam Ghahremani","Siyou Pei","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17063v1.pdf","comment":"Published for ACM CHI 2024. For source files, see\n https://github.com/hilab-open-source/wheelpose"},{"id":"http://arxiv.org/abs/2404.17041v1","updated":"2024-04-25T21:06:53Z","published":"2024-04-25T21:06:53Z","title":"Nuclei-Location Based Point Set Registration of Multi-Stained Whole\n Slide Images","summary":" Whole Slide Images (WSIs) provide exceptional detail for studying tissue\narchitecture at the cell level. To study tumour microenvironment (TME) with the\ncontext of various protein biomarkers and cell sub-types, analysis and\nregistration of features using multi-stained WSIs is often required.\nMulti-stained WSI pairs normally suffer from rigid and non-rigid deformities in\naddition to slide artefacts and control tissue which present challenges at\nprecise registration. Traditional registration methods mainly focus on global\nrigid/non-rigid registration but struggle with aligning slides with complex\ntissue deformations at the nuclei level. However, nuclei level non-rigid\nregistration is essential for downstream tasks such as cell sub-type analysis\nin the context of protein biomarker signatures. This paper focuses on local\nlevel non-rigid registration using a nuclei-location based point set\nregistration approach for aligning multi-stained WSIs. We exploit the spatial\ndistribution of nuclei that is prominent and consistent (to a large level)\nacross different stains to establish a spatial correspondence. We evaluate our\napproach using the HYRECO dataset consisting of 54 re-stained images of H\\&E\nand PHH3 image pairs. The approach can be extended to other IHC and IF stained\nWSIs considering a good nuclei detection algorithm is accessible. The\nperformance of the model is tested against established registration algorithms\nand is shown to outperform the model for nuclei level registration.\n","authors":["Adith Jeyasangar","Abdullah Alsalemi","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2404.17041v1.pdf","comment":"15 pages, 5 figures, Submitted to Medical Image Understanding and\n Analysis Conference 2024"},{"id":"http://arxiv.org/abs/2402.06497v2","updated":"2024-04-25T20:59:28Z","published":"2024-02-09T16:08:16Z","title":"Iris-SAM: Iris Segmentation Using a Foundation Model","summary":" Iris segmentation is a critical component of an iris biometric system and it\ninvolves extracting the annular iris region from an ocular image. In this work,\nwe develop a pixel-level iris segmentation model from a foundational model,\nviz., Segment Anything Model (SAM), that has been successfully used for\nsegmenting arbitrary objects. 
The primary contribution of this work lies in the\nintegration of different loss functions during the fine-tuning of SAM on ocular\nimages. In particular, the importance of Focal Loss is borne out in the\nfine-tuning process since it strategically addresses the class imbalance\nproblem (i.e., iris versus non-iris pixels). Experiments on ND-IRIS-0405,\nCASIA-Iris-Interval-v3, and IIT-Delhi-Iris datasets convey the efficacy of the\ntrained model for the task of iris segmentation. For instance, on the\nND-IRIS-0405 dataset, an average segmentation accuracy of 99.58% was achieved,\ncompared to the best baseline performance of 89.75%.\n","authors":["Parisa Farmanifard","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2402.06497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17249v3","updated":"2024-04-25T20:56:39Z","published":"2023-03-30T09:29:03Z","title":"Model-agnostic explainable artificial intelligence for object detection\n in image data","summary":" In recent years, deep neural networks have been widely used for building\nhigh-performance Artificial Intelligence (AI) systems for computer vision\napplications. Object detection is a fundamental task in computer vision, which\nhas been greatly progressed through developing large and intricate deep\nlearning models. However, the lack of transparency is a big challenge that may\nnot allow the widespread adoption of these models. Explainable artificial\nintelligence is a field of research where methods are developed to help users\nunderstand the behavior, decision logics, and vulnerabilities of AI systems.\nPreviously, few explanation methods were developed for object detection, based\non the idea of random masks. However, random masks may raise some issues\nregarding the actual importance of pixels within an image. In this paper, we\ndesign and implement a black-box explanation method named Black-box Object\nDetection Explanation by Masking (BODEM) through adopting a hierarchical random\nmasking approach for AI-based object detection systems. We propose a\nhierarchical random masking framework in which coarse-grained masks are used in\nlower levels to find salient regions within an image, and fine-grained mask are\nused to refine the salient regions in higher levels. Experimentations on\nvarious object detection datasets and models showed that BODEM can be\neffectively used to explain the behavior of object detectors. Moreover, our\nmethod outperformed Detector Randomized Input Sampling for Explanation (D-RISE)\nwith respect to different quantitative measures of explanation effectiveness.\nThe experimental results demonstrate that BODEM can be an effective method for\nexplaining and validating object detection systems in black-box testing\nscenarios.\n","authors":["Milad Moradi","Ke Yan","David Colwell","Matthias Samwald","Rhona Asgari"],"pdf_url":"https://arxiv.org/pdf/2303.17249v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17033v1","updated":"2024-04-25T20:47:08Z","published":"2024-04-25T20:47:08Z","title":"Auto-Generating Weak Labels for Real & Synthetic Data to Improve\n Label-Scarce Medical Image Segmentation","summary":" The high cost of creating pixel-by-pixel gold-standard labels, limited expert\navailability, and presence of diverse tasks make it challenging to generate\nsegmentation labels to train deep learning models for medical imaging tasks. 
In\nthis work, we present a new approach to overcome the hurdle of costly medical\nimage labeling by leveraging foundation models like Segment Anything Model\n(SAM) and its medical alternate MedSAM. Our pipeline has the ability to\ngenerate weak labels for any unlabeled medical image and subsequently use it to\naugment label-scarce datasets. We perform this by leveraging a model trained on\na few gold-standard labels and using it to intelligently prompt MedSAM for weak\nlabel generation. This automation eliminates the manual prompting step in\nMedSAM, creating a streamlined process for generating labels for both real and\nsynthetic images, regardless of quantity. We conduct experiments on\nlabel-scarce settings for multiple tasks pertaining to modalities ranging from\nultrasound, dermatology, and X-rays to demonstrate the usefulness of our\npipeline. The code is available at\nhttps://github.com/stanfordmlgroup/Auto-Generate-WLs/.\n","authors":["Tanvi Deshpande","Eva Prakash","Elsie Gyang Ross","Curtis Langlotz","Andrew Ng","Jeya Maria Jose Valanarasu"],"pdf_url":"https://arxiv.org/pdf/2404.17033v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.17031v1","updated":"2024-04-25T20:45:39Z","published":"2024-04-25T20:45:39Z","title":"Motor Focus: Ego-Motion Prediction with All-Pixel Matching","summary":" Motion analysis plays a critical role in various applications, from virtual\nreality and augmented reality to assistive visual navigation. Traditional\nself-driving technologies, while advanced, typically do not translate directly\nto pedestrian applications due to their reliance on extensive sensor arrays and\nnon-feasible computational frameworks. This highlights a significant gap in\napplying these solutions to human users since human navigation introduces\nunique challenges, including the unpredictable nature of human movement,\nlimited processing capabilities of portable devices, and the need for\ndirectional responsiveness due to the limited perception range of humans. In\nthis project, we introduce an image-only method that applies motion analysis\nusing optical flow with ego-motion compensation to predict Motor Focus-where\nand how humans or machines focus their movement intentions. Meanwhile, this\npaper addresses the camera shaking issue in handheld and body-mounted devices\nwhich can severely degrade performance and accuracy, by applying a Gaussian\naggregation to stabilize the predicted motor focus area and enhance the\nprediction accuracy of movement direction. This also provides a robust,\nreal-time solution that adapts to the user's immediate environment.\nFurthermore, in the experiments part, we show the qualitative analysis of motor\nfocus estimation between the conventional dense optical flow-based method and\nthe proposed method. 
In quantitative tests, we show the performance of the\nproposed method on a collected small dataset that is specialized for motor\nfocus estimation tasks.\n","authors":["Hao Wang","Jiayou Qin","Xiwen Chen","Ashish Bastola","John Suchanek","Zihao Gong","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2404.17031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17029v1","updated":"2024-04-25T20:43:32Z","published":"2024-04-25T20:43:32Z","title":"Dr-SAM: An End-to-End Framework for Vascular Segmentation, Diameter\n Estimation, and Anomaly Detection on Angiography Images","summary":" Recent advancements in AI have significantly transformed medical imaging,\nparticularly in angiography, by enhancing diagnostic precision and patient\ncare. However existing works are limited in analyzing the aorta and iliac\narteries, above all for vascular anomaly detection and characterization. To\nclose this gap, we propose Dr-SAM, a comprehensive multi-stage framework for\nvessel segmentation, diameter estimation, and anomaly analysis aiming to\nexamine the peripheral vessels through angiography images. For segmentation we\nintroduce a customized positive/negative point selection mechanism applied on\ntop of the Segment Anything Model (SAM), specifically for medical (Angiography)\nimages. Then we propose a morphological approach to determine the vessel\ndiameters followed by our histogram-driven anomaly detection approach.\nMoreover, we introduce a new benchmark dataset for the comprehensive analysis\nof peripheral vessel angiography images which we hope can boost the upcoming\nresearch in this direction leading to enhanced diagnostic precision and\nultimately better health outcomes for individuals facing vascular issues.\n","authors":["Vazgen Zohranyan","Vagner Navasardyan","Hayk Navasardyan","Jan Borggrefe","Shant Navasardyan"],"pdf_url":"https://arxiv.org/pdf/2404.17029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02897v2","updated":"2024-04-25T20:42:13Z","published":"2024-04-03T17:54:37Z","title":"Deep Image Composition Meets Image Forgery","summary":" Image forgery is a topic that has been studied for many years. Before the\nbreakthrough of deep learning, forged images were detected using handcrafted\nfeatures that did not require training. These traditional methods failed to\nperform satisfactorily even on datasets much worse in quality than real-life\nimage manipulations. Advances in deep learning have impacted image forgery\ndetection as much as they have impacted other areas of computer vision and have\nimproved the state of the art. Deep learning models require large amounts of\nlabeled data for training. In the case of image forgery, labeled data at the\npixel level is a very important factor for the models to learn. None of the\nexisting datasets have sufficient size, realism and pixel-level labeling at the\nsame time. This is due to the high cost of producing and labeling quality\nimages. It can take hours for an image editing expert to manipulate just one\nimage. To bridge this gap, we automate data generation using image composition\ntechniques that are very related to image forgery. Unlike other automated data\ngeneration frameworks, we use state of the art image composition deep learning\nmodels to generate spliced images close to the quality of real-life\nmanipulations. Finally, we test the generated dataset on the SOTA image\nmanipulation detection model and show that its prediction performance is lower\ncompared to existing datasets, i.e. 
we produce realistic images that are more\ndifficult to detect. Dataset will be available at\nhttps://github.com/99eren99/DIS25k .\n","authors":["Eren Tahir","Mert Bal"],"pdf_url":"https://arxiv.org/pdf/2404.02897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15489v2","updated":"2024-04-25T20:17:08Z","published":"2024-01-27T19:44:15Z","title":"Distilling Privileged Multimodal Information for Expression Recognition\n using Optimal Transport","summary":" Deep learning models for multimodal expression recognition have reached\nremarkable performance in controlled laboratory environments because of their\nability to learn complementary and redundant semantic information. However,\nthese models struggle in the wild, mainly because of the unavailability and\nquality of modalities used for training. In practice, only a subset of the\ntraining-time modalities may be available at test time. Learning with\nprivileged information enables models to exploit data from additional\nmodalities that are only available during training. State-of-the-art knowledge\ndistillation (KD) methods have been proposed to distill information from\nmultiple teacher models (each trained on a modality) to a common student model.\nThese privileged KD methods typically utilize point-to-point matching, yet have\nno explicit mechanism to capture the structural information in the teacher\nrepresentation space formed by introducing the privileged modality. Experiments\nwere performed on two challenging problems - pain estimation on the Biovid\ndataset (ordinal classification) and arousal-valance prediction on the Affwild2\ndataset (regression). Results show that our proposed method can outperform\nstate-of-the-art privileged KD methods on these problems. The diversity among\nmodalities and fusion architectures indicates that PKDOT is modality- and\nmodel-agnostic.\n","authors":["Muhammad Haseeb Aslam","Muhammad Osama Zeeshan","Soufiane Belharbi","Marco Pedersoli","Alessandro Koerich","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2401.15489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02416v2","updated":"2024-04-25T19:51:53Z","published":"2024-01-04T18:59:25Z","title":"ODIN: A Single Model for 2D and 3D Segmentation","summary":" State-of-the-art models on contemporary 3D segmentation benchmarks like\nScanNet consume and label dataset-provided 3D point clouds, obtained through\npost processing of sensed multiview RGB-D images. They are typically trained\nin-domain, forego large-scale 2D pre-training and outperform alternatives that\nfeaturize the posed RGB-D multiview images instead. The gap in performance\nbetween methods that consume posed images versus post-processed 3D point clouds\nhas fueled the belief that 2D and 3D perception require distinct model\narchitectures. In this paper, we challenge this view and propose ODIN\n(Omni-Dimensional INstance segmentation), a model that can segment and label\nboth 2D RGB images and 3D point clouds, using a transformer architecture that\nalternates between 2D within-view and 3D cross-view information fusion. Our\nmodel differentiates 2D and 3D feature operations through the positional\nencodings of the tokens involved, which capture pixel coordinates for 2D patch\ntokens and 3D coordinates for 3D feature tokens. ODIN achieves state-of-the-art\nperformance on ScanNet200, Matterport3D and AI2THOR 3D instance segmentation\nbenchmarks, and competitive performance on ScanNet, S3DIS and COCO. 
It\noutperforms all previous works by a wide margin when the sensed 3D point cloud\nis used in place of the point cloud sampled from 3D mesh. When used as the 3D\nperception engine in an instructable embodied agent architecture, it sets a new\nstate-of-the-art on the TEACh action-from-dialogue benchmark. Our code and\ncheckpoints can be found at the project website (https://odin-seg.github.io).\n","authors":["Ayush Jain","Pushkal Katara","Nikolaos Gkanatsios","Adam W. Harley","Gabriel Sarch","Kriti Aggarwal","Vishrav Chaudhary","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2401.02416v2.pdf","comment":"Camera Ready (CVPR 2024, Highlight)"},{"id":"http://arxiv.org/abs/2404.16994v1","updated":"2024-04-25T19:29:55Z","published":"2024-04-25T19:29:55Z","title":"PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video\n Dense Captioning","summary":" Vision-language pre-training has significantly elevated performance across a\nwide range of image-language applications. Yet, the pre-training process for\nvideo-related tasks demands exceptionally large computational and data\nresources, which hinders the progress of video-language models. This paper\ninvestigates a straightforward, highly efficient, and resource-light approach\nto adapting an existing image-language pre-trained model for dense video\nunderstanding. Our preliminary experiments reveal that directly fine-tuning\npre-trained image-language models with multiple frames as inputs on video\ndatasets leads to performance saturation or even a drop. Our further\ninvestigation reveals that it is largely attributed to the bias of learned\nhigh-norm visual features. Motivated by this finding, we propose a simple but\neffective pooling strategy to smooth the feature distribution along the\ntemporal dimension and thus reduce the dominant impacts from the extreme\nfeatures. The new model is termed Pooling LLaVA, or \\nameofmethod{} in short.\n\\nameofmethod{} achieves new state-of-the-art performance on modern benchmark\ndatasets for both video question-answer and captioning tasks. Notably, on the\nrecent popular Video ChatGPT benchmark, PLLaVA achieves a score of 3.48 out of\n5 on average of five evaluated dimensions, exceeding the previous SOTA results\nfrom GPT4V (IG-VLM) by 9\\%. On the latest multi-choice benchmark MVBench,\nPLLaVA achieves 58.1\\% accuracy on average across 20 sub-tasks, 14.5\\% higher\nthan GPT4V (IG-VLM). Code is available at\n\\url{https://github.com/magic-research/PLLaVA}.\n","authors":["Lin Xu","Yilin Zhao","Daquan Zhou","Zhijie Lin","See Kiong Ng","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2404.16994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16972v1","updated":"2024-04-25T18:50:26Z","published":"2024-04-25T18:50:26Z","title":"CriSp: Leveraging Tread Depth Maps for Enhanced Crime-Scene Shoeprint\n Matching","summary":" Shoeprints are a common type of evidence found at crime scenes and are used\nregularly in forensic investigations. However, existing methods cannot\neffectively employ deep learning techniques to match noisy and occluded\ncrime-scene shoeprints to a shoe database due to a lack of training data.\nMoreover, all existing methods match crime-scene shoeprints to clean reference\nprints, yet our analysis shows matching to more informative tread depth maps\nyields better retrieval results. The matching task is further complicated by\nthe necessity to identify similarities only in corresponding regions (heels,\ntoes, etc) of prints and shoe treads. 
To overcome these challenges, we leverage\nshoe tread images from online retailers and utilize an off-the-shelf predictor\nto estimate depth maps and clean prints. Our method, named CriSp, matches\ncrime-scene shoeprints to tread depth maps by training on this data. CriSp\nincorporates data augmentation to simulate crime-scene shoeprints, an encoder\nto learn spatially-aware features, and a masking module to ensure only visible\nregions of crime-scene prints affect retrieval results. To validate our\napproach, we introduce two validation sets by reprocessing existing datasets of\ncrime-scene shoeprints and establish a benchmarking protocol for comparison. On\nthis benchmark, CriSp significantly outperforms state-of-the-art methods in\nboth automated shoeprint matching and image retrieval tailored to this task.\n","authors":["Samia Shafique","Shu Kong","Charless Fowlkes"],"pdf_url":"https://arxiv.org/pdf/2404.16972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16944v1","updated":"2024-04-25T18:00:24Z","published":"2024-04-25T18:00:24Z","title":"Constellation Dataset: Benchmarking High-Altitude Object Detection for\n an Urban Intersection","summary":" We introduce Constellation, a dataset of 13K images suitable for research on\ndetection of objects in dense urban streetscapes observed from high-elevation\ncameras, collected for a variety of temporal conditions. The dataset addresses\nthe need for curated data to explore problems in small object detection\nexemplified by the limited pixel footprint of pedestrians observed tens of\nmeters from above. It enables the testing of object detection models for\nvariations in lighting, building shadows, weather, and scene dynamics. We\nevaluate contemporary object detection architectures on the dataset, observing\nthat state-of-the-art methods have lower performance in detecting small\npedestrians compared to vehicles, corresponding to a 10% difference in average\nprecision (AP). Using structurally similar datasets for pretraining the models\nresults in an increase of 1.8% mean AP (mAP). We further find that\nincorporating domain-specific data augmentations helps improve model\nperformance. Using pseudo-labeled data, obtained from inference outcomes of the\nbest-performing models, improves the performance of the models. Finally,\ncomparing the models trained using the data collected in two different time\nintervals, we find a performance drift in models due to the changes in\nintersection conditions over time. The best-performing model achieves a\npedestrian AP of 92.0% with 11.5 ms inference time on NVIDIA A100 GPUs, and an\nmAP of 95.4%.\n","authors":["Mehmet Kerem Turkcan","Sanjeev Narasimhan","Chengbo Zang","Gyung Hyun Je","Bo Yu","Mahshid Ghasemi","Javad Ghaderi","Gil Zussman","Zoran Kostic"],"pdf_url":"https://arxiv.org/pdf/2404.16944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16917v1","updated":"2024-04-25T16:07:01Z","published":"2024-04-25T16:07:01Z","title":"Grad Queue : A probabilistic framework to reinforce sparse gradients","summary":" Informative gradients are often lost in large batch updates. We propose a\nrobust mechanism to reinforce the sparse components within a random batch of\ndata points. A finite queue of online gradients is used to determine their\nexpected instantaneous statistics. We propose a function to measure the\nscarcity of incoming gradients using these statistics and establish the\ntheoretical ground of this mechanism. 
To minimize conflicting components within\nlarge mini-batches, samples are grouped with aligned objectives by clustering\nbased on inherent feature space. Sparsity is measured for each centroid and\nweighted accordingly. A strong intuitive criterion to squeeze out redundant\ninformation from each cluster is the backbone of the system. It makes rare\ninformation indifferent to aggressive momentum also exhibits superior\nperformance with larger mini-batch horizon. The effective length of the queue\nkept variable to follow the local loss pattern. The contribution of our method\nis to restore intra-mini-batch diversity at the same time widening the optimal\nbatch boundary. Both of these collectively drive it deeper towards the minima.\nOur method has shown superior performance for CIFAR10, MNIST, and Reuters News\ncategory dataset compared to mini-batch gradient descent.\n","authors":["Irfan Mohammad Al Hasib"],"pdf_url":"https://arxiv.org/pdf/2404.16917v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.16897v1","updated":"2024-04-25T06:04:34Z","published":"2024-04-25T06:04:34Z","title":"Exploring Learngene via Stage-wise Weight Sharing for Initializing\n Variable-sized Models","summary":" In practice, we usually need to build variable-sized models adapting for\ndiverse resource constraints in different application scenarios, where weight\ninitialization is an important step prior to training. The Learngene framework,\nintroduced recently, firstly learns one compact part termed as learngene from a\nlarge well-trained model, after which learngene is expanded to initialize\nvariable-sized models. In this paper, we start from analysing the importance of\nguidance for the expansion of well-trained learngene layers, inspiring the\ndesign of a simple but highly effective Learngene approach termed SWS\n(Stage-wise Weight Sharing), where both learngene layers and their learning\nprocess critically contribute to providing knowledge and guidance for\ninitializing models at varying scales. Specifically, to learn learngene layers,\nwe build an auxiliary model comprising multiple stages where the layer weights\nin each stage are shared, after which we train it through distillation.\nSubsequently, we expand these learngene layers containing stage information at\ntheir corresponding stage to initialize models of variable depths. Extensive\nexperiments on ImageNet-1K demonstrate that SWS achieves consistent better\nperformance compared to many models trained from scratch, while reducing around\n6.6x total training costs. In some cases, SWS performs better only after 1\nepoch tuning. When initializing variable-sized models adapting for different\nresource constraints, SWS achieves better results while reducing around 20x\nparameters stored to initialize these models and around 10x pre-training costs,\nin contrast to the pre-training and fine-tuning approach.\n","authors":["Shi-Yu Xia","Wenxuan Zhu","Xu Yang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2404.16897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17608v1","updated":"2024-04-25T22:19:42Z","published":"2024-04-25T22:19:42Z","title":"Synthesizing Audio from Silent Video using Sequence to Sequence Modeling","summary":" Generating audio from a video's visual context has multiple practical\napplications in improving how we interact with audio-visual media - for\nexample, enhancing CCTV footage analysis, restoring historical videos (e.g.,\nsilent movies), and improving video generation models. 
We propose a novel\nmethod to generate audio from video using a sequence-to-sequence model,\nimproving on prior work that used CNNs and WaveNet and faced sound diversity\nand generalization challenges. Our approach employs a 3D Vector Quantized\nVariational Autoencoder (VQ-VAE) to capture the video's spatial and temporal\nstructures, decoding with a custom audio decoder for a broader range of sounds.\nTrained on the Youtube8M dataset segment, focusing on specific domains, our\nmodel aims to enhance applications like CCTV footage analysis, silent movie\nrestoration, and video generation models.\n","authors":["Hugo Garrido-Lestache Belinchon","Helina Mulugeta","Adam Haile"],"pdf_url":"https://arxiv.org/pdf/2404.17608v1.pdf","comment":null}]},"2024-04-26T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.10805v2","updated":"2024-04-26T17:59:51Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" In this work, we introduce the novel concept of visually Connecting Actions\nand Their Effects (CATE) in video understanding. CATE can have applications in\nareas like task planning and learning from demonstration. We identify and\nexplore two different aspects of the concept of CATE: Action Selection and\nEffect-Affinity Assessment, where video understanding models connect actions\nand effects at semantic and fine-grained levels, respectively. We observe that\ndifferent formulations produce representations capturing intuitive action\nproperties. We also design various baseline models for Action Selection and\nEffect-Affinity Assessment. Despite the intuitive nature of the task, we\nobserve that models struggle, and humans outperform them by a large margin. The\nstudy aims to establish a foundation for future efforts, showcasing the\nflexibility and versatility of connecting actions and effects in video\nunderstanding, with the hope of inspiring advanced formulations and models.\n","authors":["Eric Peh","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17571v1","updated":"2024-04-26T17:55:26Z","published":"2024-04-26T17:55:26Z","title":"Tunnel Try-on: Excavating Spatial-temporal Tunnels for High-quality\n Virtual Try-on in Videos","summary":" Video try-on is a challenging task and has not been well tackled in previous\nworks. The main obstacle lies in preserving the details of the clothing and\nmodeling the coherent motions simultaneously. Faced with those difficulties, we\naddress video try-on by proposing a diffusion-based framework named \"Tunnel\nTry-on.\" The core idea is excavating a \"focus tunnel\" in the input video that\ngives close-up shots around the clothing regions. We zoom in on the region in\nthe tunnel to better preserve the fine details of the clothing. To generate\ncoherent motions, we first leverage the Kalman filter to construct smooth crops\nin the focus tunnel and inject the position embedding of the tunnel into\nattention layers to improve the continuity of the generated videos. In\naddition, we develop an environment encoder to extract the context information\noutside the tunnels as supplementary cues. Equipped with these techniques,\nTunnel Try-on keeps the fine details of the clothing and synthesizes stable and\nsmooth videos. 
Demonstrating significant advancements, Tunnel Try-on could be\nregarded as the first attempt toward the commercial-level application of\nvirtual try-on in videos.\n","authors":["Zhengze Xu","Mengting Chen","Zhao Wang","Linyu Xing","Zhonghua Zhai","Nong Sang","Jinsong Lan","Shuai Xiao","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.17571v1.pdf","comment":"Project Page: https://mengtingchen.github.io/tunnel-try-on-page/"},{"id":"http://arxiv.org/abs/2404.17569v1","updated":"2024-04-26T17:54:38Z","published":"2024-04-26T17:54:38Z","title":"MaPa: Text-driven Photorealistic Material Painting for 3D Shapes","summary":" This paper aims to generate materials for 3D meshes from text descriptions.\nUnlike existing methods that synthesize texture maps, we propose to generate\nsegment-wise procedural material graphs as the appearance representation, which\nsupports high-quality rendering and provides substantial flexibility in\nediting. Instead of relying on extensive paired data, i.e., 3D meshes with\nmaterial graphs and corresponding text descriptions, to train a material graph\ngenerative model, we propose to leverage the pre-trained 2D diffusion model as\na bridge to connect the text and material graphs. Specifically, our approach\ndecomposes a shape into a set of segments and designs a segment-controlled\ndiffusion model to synthesize 2D images that are aligned with mesh parts. Based\non generated images, we initialize parameters of material graphs and fine-tune\nthem through the differentiable rendering module to produce materials in\naccordance with the textual description. Extensive experiments demonstrate the\nsuperior performance of our framework in photorealism, resolution, and\neditability over existing methods. Project page:\nhttps://zhanghe3z.github.io/MaPa/\n","authors":["Shangzhan Zhang","Sida Peng","Tao Xu","Yuanbo Yang","Tianrun Chen","Nan Xue","Yujun Shen","Hujun Bao","Ruizhen Hu","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17569v1.pdf","comment":"SIGGRAPH 2024. Project page: https://zhanghe3z.github.io/MaPa/"},{"id":"http://arxiv.org/abs/2404.17565v1","updated":"2024-04-26T17:47:14Z","published":"2024-04-26T17:47:14Z","title":"ChangeBind: A Hybrid Change Encoder for Remote Sensing Change Detection","summary":" Change detection (CD) is a fundamental task in remote sensing (RS) which aims\nto detect the semantic changes between the same geographical regions at\ndifferent time stamps. Existing convolutional neural networks (CNNs) based\napproaches often struggle to capture long-range dependencies. Whereas recent\ntransformer-based methods are prone to the dominant global representation and\nmay limit their capabilities to capture the subtle change regions due to the\ncomplexity of the objects in the scene. To address these limitations, we\npropose an effective Siamese-based framework to encode the semantic changes\noccurring in the bi-temporal RS images. The main focus of our design is to\nintroduce a change encoder that leverages local and global feature\nrepresentations to capture both subtle and large change feature information\nfrom multi-scale features to precisely estimate the change regions. 
Our\nexperimental study on two challenging CD datasets reveals the merits of our\napproach and obtains state-of-the-art performance.\n","authors":["Mubashir Noman","Mustansar Fiaz","Hisham Cholakkal"],"pdf_url":"https://arxiv.org/pdf/2404.17565v1.pdf","comment":"accepted at IGARSS 2024"},{"id":"http://arxiv.org/abs/2304.05370v4","updated":"2024-04-26T17:23:06Z","published":"2023-04-11T17:24:31Z","title":"Overload: Latency Attacks on Object Detection for Edge Devices","summary":" Nowadays, the deployment of deep learning-based applications is an essential\ntask owing to the increasing demands on intelligent services. In this paper, we\ninvestigate latency attacks on deep learning applications. Unlike common\nadversarial attacks for misclassification, the goal of latency attacks is to\nincrease the inference time, which may stop applications from responding to the\nrequests within a reasonable time. This kind of attack is ubiquitous for\nvarious applications, and we use object detection to demonstrate how such kind\nof attacks work. We also design a framework named Overload to generate latency\nattacks at scale. Our method is based on a newly formulated optimization\nproblem and a novel technique, called spatial attention. This attack serves to\nescalate the required computing costs during the inference time, consequently\nleading to an extended inference time for object detection. It presents a\nsignificant threat, especially to systems with limited computing resources. We\nconducted experiments using YOLOv5 models on Nvidia NX. Compared to existing\nmethods, our method is simpler and more effective. The experimental results\nshow that with latency attacks, the inference time of a single image can be\nincreased ten times longer in reference to the normal setting. Moreover, our\nfindings pose a potential new threat to all object detection tasks requiring\nnon-maximum suppression (NMS), as our attack is NMS-agnostic.\n","authors":["Erh-Chung Chen","Pin-Yu Chen","I-Hsin Chung","Che-rung Lee"],"pdf_url":"https://arxiv.org/pdf/2304.05370v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17534v1","updated":"2024-04-26T16:59:26Z","published":"2024-04-26T16:59:26Z","title":"Exploring the Distinctiveness and Fidelity of the Descriptions Generated\n by Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) are gaining traction for their\nremarkable ability to process and integrate visual and textual data. Despite\ntheir popularity, the capacity of LVLMs to generate precise, fine-grained\ntextual descriptions has not been fully explored. This study addresses this gap\nby focusing on \\textit{distinctiveness} and \\textit{fidelity}, assessing how\nmodels like Open-Flamingo, IDEFICS, and MiniGPT-4 can distinguish between\nsimilar objects and accurately describe visual features. We proposed the\nTextual Retrieval-Augmented Classification (TRAC) framework, which, by\nleveraging its generative capabilities, allows us to delve deeper into\nanalyzing fine-grained visual description generation. This research provides\nvaluable insights into the generation quality of LVLMs, enhancing the\nunderstanding of multimodal language models. Notably, MiniGPT-4 stands out for\nits better ability to generate fine-grained descriptions, outperforming the\nother two models in this aspect. 
The code is provided at\n\\url{https://anonymous.4open.science/r/Explore_FGVDs-E277}.\n","authors":["Yuhang Huang","Zihan Wu","Chongyang Gao","Jiawei Peng","Xu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.17534v1.pdf","comment":"11 pages, 9 figures, 6 tables. For associated code, see\n https://anonymous.4open.science/r/Explore_FGVDs-E277"},{"id":"http://arxiv.org/abs/2404.15272v2","updated":"2024-04-26T16:50:20Z","published":"2024-04-23T17:59:01Z","title":"CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and\n Radiology Reports for Full-Body Scenarios","summary":" Medical Vision-Language Pretraining (Med-VLP) establishes a connection\nbetween visual content from medical images and the relevant textual\ndescriptions. Existing Med-VLP methods primarily focus on 2D images depicting a\nsingle body part, notably chest X-rays. In this paper, we extend the scope of\nMed-VLP to encompass 3D images, specifically targeting full-body scenarios, by\nusing a multimodal dataset of CT images and reports. Compared with the 2D\ncounterpart, 3D VLP is required to effectively capture essential semantics from\nsignificantly sparser representation in 3D imaging. In this paper, we introduce\nCT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method\nthat constructs organ-level image-text pairs to enhance multimodal contrastive\nlearning, aligning grounded visual features with precise diagnostic text.\nAdditionally, we developed an abnormality dictionary to augment contrastive\nlearning with diverse contrastive pairs. Our method, trained on a multimodal CT\ndataset comprising 44,011 organ-level vision-text pairs from 17,702 patients\nacross 104 organs, demonstrates it can identify organs and abnormalities in a\nzero-shot manner using natural languages. The performance of CT-GLIP is\nvalidated on a separate test set of 1,130 patients, focusing on the 16 most\nfrequent abnormalities across 7 organs. The experimental results show our\nmodel's superior performance over the standard CLIP framework across zero-shot\nand fine-tuning scenarios, using both CNN and ViT architectures.\n","authors":["Jingyang Lin","Yingda Xia","Jianpeng Zhang","Ke Yan","Le Lu","Jiebo Luo","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15272v2.pdf","comment":"12 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.17528v1","updated":"2024-04-26T16:46:28Z","published":"2024-04-26T16:46:28Z","title":"Geometry-aware Reconstruction and Fusion-refined Rendering for\n Generalizable Neural Radiance Fields","summary":" Generalizable NeRF aims to synthesize novel views for unseen scenes. Common\npractices involve constructing variance-based cost volumes for geometry\nreconstruction and encoding 3D descriptors for decoding novel views. However,\nexisting methods show limited generalization ability in challenging conditions\ndue to inaccurate geometry, sub-optimal descriptors, and decoding strategies.\nWe address these issues point by point. First, we find the variance-based cost\nvolume exhibits failure patterns as the features of pixels corresponding to the\nsame point can be inconsistent across different views due to occlusions or\nreflections. We introduce an Adaptive Cost Aggregation (ACA) approach to\namplify the contribution of consistent pixel pairs and suppress inconsistent\nones. Unlike previous methods that solely fuse 2D features into descriptors,\nour approach introduces a Spatial-View Aggregator (SVA) to incorporate 3D\ncontext into descriptors through spatial and inter-view interaction. 
When\ndecoding the descriptors, we observe the two existing decoding strategies excel\nin different areas, which are complementary. A Consistency-Aware Fusion (CAF)\nstrategy is proposed to leverage the advantages of both. We incorporate the\nabove ACA, SVA, and CAF into a coarse-to-fine framework, termed Geometry-aware\nReconstruction and Fusion-refined Rendering (GeFu). GeFu attains\nstate-of-the-art performance across multiple datasets. Code is available at\nhttps://github.com/TQTQliu/GeFu .\n","authors":["Tianqi Liu","Xinyi Ye","Min Shi","Zihao Huang","Zhiyu Pan","Zhan Peng","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2404.17528v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://gefucvpr24.github.io"},{"id":"http://arxiv.org/abs/2311.12161v3","updated":"2024-04-26T16:43:14Z","published":"2023-11-20T20:27:42Z","title":"ChemScraper: Leveraging PDF Graphics Instructions for Molecular Diagram\n Parsing","summary":" Most molecular diagram parsers recover chemical structure from raster images\n(e.g., PNGs). However, many PDFs include commands giving explicit locations and\nshapes for characters, lines, and polygons. We present a new parser that uses\nthese born-digital PDF primitives as input. The parsing model is fast and\naccurate, and does not require GPUs, Optical Character Recognition (OCR), or\nvectorization. We use the parser to annotate raster images and then train a new\nmulti-task neural network for recognizing molecules in raster images. We\nevaluate our parsers using SMILES and standard benchmarks, along with a novel\nevaluation protocol comparing molecular graphs directly that supports automatic\nerror compilation and reveals errors missed by SMILES-based evaluation.\n","authors":["Ayush Kumar Shah","Bryan Manrique Amador","Abhisek Dey","Ming Creekmore","Blake Ocampo","Scott Denmark","Richard Zanibbi"],"pdf_url":"https://arxiv.org/pdf/2311.12161v3.pdf","comment":"20 pages without references, 12 figures, 4 Tables, submitted to\n International Journal on Document Analysis and Recognition (IJDAR)"},{"id":"http://arxiv.org/abs/2404.17521v1","updated":"2024-04-26T16:40:17Z","published":"2024-04-26T16:40:17Z","title":"Ag2Manip: Learning Novel Manipulation Skills with Agent-Agnostic Visual\n and Action Representations","summary":" Autonomous robotic systems capable of learning novel manipulation tasks are\npoised to transform industries from manufacturing to service automation.\nHowever, modern methods (e.g., VIP and R3M) still face significant hurdles,\nnotably the domain gap among robotic embodiments and the sparsity of successful\ntask executions within specific action spaces, resulting in misaligned and\nambiguous task representations. We introduce Ag2Manip (Agent-Agnostic\nrepresentations for Manipulation), a framework aimed at surmounting these\nchallenges through two key innovations: a novel agent-agnostic visual\nrepresentation derived from human manipulation videos, with the specifics of\nembodiments obscured to enhance generalizability; and an agent-agnostic action\nrepresentation abstracting a robot's kinematics to a universal agent proxy,\nemphasizing crucial interactions between end-effector and object. Ag2Manip's\nempirical validation across simulated benchmarks like FrankaKitchen, ManiSkill,\nand PartManip shows a 325% increase in performance, achieved without\ndomain-specific demonstrations. 
Ablation studies underline the essential\ncontributions of the visual and action representations to this success.\nExtending our evaluations to the real world, Ag2Manip significantly improves\nimitation learning success rates from 50% to 77.5%, demonstrating its\neffectiveness and generalizability across both simulated and physical\nenvironments.\n","authors":["Puhao Li","Tengyu Liu","Yuyang Li","Muzhi Han","Haoran Geng","Shu Wang","Yixin Zhu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.17521v1.pdf","comment":"Project website and open-source code:\n https://xiaoyao-li.github.io/research/ag2manip"},{"id":"http://arxiv.org/abs/2311.13668v3","updated":"2024-04-26T16:29:54Z","published":"2023-11-22T19:45:40Z","title":"MAIRA-1: A specialised large multimodal model for radiology report\n generation","summary":" We present a radiology-specific multimodal model for the task for generating\nradiological reports from chest X-rays (CXRs). Our work builds on the idea that\nlarge language model(s) can be equipped with multimodal capabilities through\nalignment with pre-trained vision encoders. On natural images, this has been\nshown to allow multimodal models to gain image understanding and description\ncapabilities. Our proposed model (MAIRA-1) leverages a CXR-specific image\nencoder in conjunction with a fine-tuned large language model based on\nVicuna-7B, and text-based data augmentation, to produce reports with\nstate-of-the-art quality. In particular, MAIRA-1 significantly improves on the\nradiologist-aligned RadCliQ metric and across all lexical metrics considered.\nManual review of model outputs demonstrates promising fluency and accuracy of\ngenerated reports while uncovering failure modes not captured by existing\nevaluation practices. More information and resources can be found on the\nproject website: https://aka.ms/maira.\n","authors":["Stephanie L. Hyland","Shruthi Bannur","Kenza Bouzid","Daniel C. Castro","Mercy Ranjit","Anton Schwaighofer","Fernando Pérez-García","Valentina Salvatelli","Shaury Srivastav","Anja Thieme","Noel Codella","Matthew P. Lungren","Maria Teodora Wetscherek","Ozan Oktay","Javier Alvarez-Valle"],"pdf_url":"https://arxiv.org/pdf/2311.13668v3.pdf","comment":"18 pages, 9 tables, 5 figures. v2 adds test IDs and image encoder\n citation. v3 fixes error in NPV/specificity"},{"id":"http://arxiv.org/abs/2311.10448v2","updated":"2024-04-26T16:29:27Z","published":"2023-11-17T11:03:13Z","title":"DeepClean: Machine Unlearning on the Cheap by Resetting Privacy\n Sensitive Weights using the Fisher Diagonal","summary":" Machine learning models trained on sensitive or private data can\ninadvertently memorize and leak that information. Machine unlearning seeks to\nretroactively remove such details from model weights to protect privacy. We\ncontribute a lightweight unlearning algorithm that leverages the Fisher\nInformation Matrix (FIM) for selective forgetting. Prior work in this area\nrequires full retraining or large matrix inversions, which are computationally\nexpensive. Our key insight is that the diagonal elements of the FIM, which\nmeasure the sensitivity of log-likelihood to changes in weights, contain\nsufficient information for effective forgetting. Specifically, we compute the\nFIM diagonal over two subsets -- the data to retain and forget -- for all\ntrainable weights. This diagonal representation approximates the complete FIM\nwhile dramatically reducing computation. 
We then use it to selectively update\nweights to maximize forgetting of the sensitive subset while minimizing impact\non the retained subset. Experiments show that our algorithm can successfully\nforget any randomly selected subsets of training data across neural network\narchitectures. By leveraging the FIM diagonal, our approach provides an\ninterpretable, lightweight, and efficient solution for machine unlearning with\npractical privacy benefits.\n","authors":["Jiaeli Shi","Najah Ghalyan","Kostis Gourgoulias","John Buford","Sean Moran"],"pdf_url":"https://arxiv.org/pdf/2311.10448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03659v3","updated":"2024-04-26T16:23:31Z","published":"2023-04-07T14:26:11Z","title":"Probing Conceptual Understanding of Large Visual-Language Models","summary":" In recent years large visual-language (V+L) models have achieved great\nsuccess in various downstream tasks. However, it is not well studied whether\nthese models have a conceptual grasp of the visual content. In this work we\nfocus on conceptual understanding of these large V+L models. To facilitate this\nstudy, we propose novel benchmarking datasets for probing three different\naspects of content understanding, 1) \\textit{relations}, 2)\n\\textit{composition}, and 3) \\textit{context}. Our probes are grounded in\ncognitive science and help determine if a V+L model can, for example, determine\nif snow garnished with a man is implausible, or if it can identify beach\nfurniture by knowing it is located on a beach. We experimented with many recent\nstate-of-the-art V+L models and observe that these models mostly \\textit{fail\nto demonstrate} a conceptual understanding. This study reveals several\ninteresting insights such as that \\textit{cross-attention} helps learning\nconceptual understanding, and that CNNs are better with \\textit{texture and\npatterns}, while Transformers are better at \\textit{color and shape}. We\nfurther utilize some of these insights and investigate a \\textit{simple\nfinetuning technique} that rewards the three conceptual understanding measures\nwith promising initial results. The proposed benchmarks will drive the\ncommunity to delve deeper into conceptual understanding and foster advancements\nin the capabilities of large V+L models. The code and dataset is available at:\n\\url{https://tinyurl.com/vlm-robustness}\n","authors":["Madeline Schiappa","Raiyaan Abdullah","Shehreen Azad","Jared Claypoole","Michael Cogswell","Ajay Divakaran","Yogesh Rawat"],"pdf_url":"https://arxiv.org/pdf/2304.03659v3.pdf","comment":"All code and dataset is available at:\n https://tinyurl.com/vlm-robustness. Accepted in CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.17507v1","updated":"2024-04-26T16:19:55Z","published":"2024-04-26T16:19:55Z","title":"HYPE: Hyperbolic Entailment Filtering for Underspecified Images and\n Texts","summary":" In an era where the volume of data drives the effectiveness of\nself-supervised learning, the specificity and clarity of data semantics play a\ncrucial role in model training. Addressing this, we introduce HYPerbolic\nEntailment filtering (HYPE), a novel methodology designed to meticulously\nextract modality-wise meaningful and well-aligned data from extensive, noisy\nimage-text pair datasets. Our approach leverages hyperbolic embeddings and the\nconcept of entailment cones to evaluate and filter out samples with meaningless\nor underspecified semantics, focusing on enhancing the specificity of each data\nsample. 
HYPE not only demonstrates a significant improvement in filtering\nefficiency but also sets a new state-of-the-art in the DataComp benchmark when\ncombined with existing filtering techniques. This breakthrough showcases the\npotential of HYPE to refine the data selection process, thereby contributing to\nthe development of more accurate and efficient self-supervised learning models.\nAdditionally, the image specificity $\\epsilon_{i}$ can be independently applied\nto induce an image-only dataset from an image-text or image-only data pool for\ntraining image-only self-supervised models and showed superior performance when\ncompared to the dataset induced by CLIP score.\n","authors":["Wonjae Kim","Sanghyuk Chun","Taekyung Kim","Dongyoon Han","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2404.17507v1.pdf","comment":"28pages, 4.5MB"},{"id":"http://arxiv.org/abs/2404.17503v1","updated":"2024-04-26T16:09:42Z","published":"2024-04-26T16:09:42Z","title":"Inhomogeneous illuminated image enhancement under extremely low\n visibility condition","summary":" Imaging through fog significantly impacts fields such as object detection and\nrecognition. In conditions of extremely low visibility, essential image\ninformation can be obscured, rendering standard extraction methods ineffective.\nTraditional digital processing techniques, such as histogram stretching, aim to\nmitigate fog effects by enhancing object light contrast diminished by\natmospheric scattering. However, these methods often experience reduce\neffectiveness under inhomogeneous illumination. This paper introduces a novel\napproach that adaptively filters background illumination under extremely low\nvisibility and preserve only the essential signal information. Additionally, we\nemploy a visual optimization strategy based on image gradients to eliminate\ngrayscale banding. Finally, the image is transformed to achieve high contrast\nand maintain fidelity to the original information through maximum histogram\nequalization. Our proposed method significantly enhances signal clarity in\nconditions of extremely low visibility and outperforms existing algorithms.\n","authors":["Libang Chen","Yikun Liu","Jianying Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01373v2","updated":"2024-04-26T16:07:25Z","published":"2023-12-29T15:47:22Z","title":"Boosting Defect Detection in Manufacturing using Tensor Convolutional\n Neural Networks","summary":" Defect detection is one of the most important yet challenging tasks in the\nquality control stage in the manufacturing sector. In this work, we introduce a\nTensor Convolutional Neural Network (T-CNN) and examine its performance on a\nreal defect detection application in one of the components of the ultrasonic\nsensors produced at Robert Bosch's manufacturing plants. Our quantum-inspired\nT-CNN operates on a reduced model parameter space to substantially improve the\ntraining speed and performance of an equivalent CNN model without sacrificing\naccuracy. More specifically, we demonstrate how T-CNNs are able to reach the\nsame performance as classical CNNs as measured by quality metrics, with up to\nfifteen times fewer parameters and 4% to 19% faster training times. 
Our results\ndemonstrate that the T-CNN greatly outperforms the results of traditional human\nvisual inspection, providing value in a current real application in\nmanufacturing.\n","authors":["Pablo Martin-Ramiro","Unai Sainz de la Maza","Sukhbinder Singh","Roman Orus","Samuel Mugel"],"pdf_url":"https://arxiv.org/pdf/2401.01373v2.pdf","comment":"12 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.17498v1","updated":"2024-04-26T15:56:08Z","published":"2024-04-26T15:56:08Z","title":"Learning text-to-video retrieval from image captioning","summary":" We describe a protocol to study text-to-video retrieval training with\nunlabeled videos, where we assume (i) no access to labels for any videos, i.e.,\nno access to the set of ground-truth captions, but (ii) access to labeled\nimages in the form of text. Using image expert models is a realistic scenario\ngiven that annotating images is cheaper therefore scalable, in contrast to\nexpensive video labeling schemes. Recently, zero-shot image experts such as\nCLIP have established a new strong baseline for video understanding tasks. In\nthis paper, we make use of this progress and instantiate the image experts from\ntwo types of models: a text-to-image retrieval model to provide an initial\nbackbone, and image captioning models to provide supervision signal into\nunlabeled videos. We show that automatically labeling video frames with image\ncaptioning allows text-to-video retrieval training. This process adapts the\nfeatures to the target domain at no manual annotation cost, consequently\noutperforming the strong zero-shot CLIP baseline. During training, we sample\ncaptions from multiple video frames that best match the visual content, and\nperform a temporal pooling over frame representations by scoring frames\naccording to their relevance to each caption. We conduct extensive ablations to\nprovide insights and demonstrate the effectiveness of this simple framework by\noutperforming the CLIP zero-shot baselines on text-to-video retrieval on three\nstandard datasets, namely ActivityNet, MSR-VTT, and MSVD.\n","authors":["Lucas Ventura","Cordelia Schmid","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2404.17498v1.pdf","comment":"A short version of this work appeared at CVPR 2023 Workshops. Project\n page: https://imagine.enpc.fr/~ventural/multicaps/"},{"id":"http://arxiv.org/abs/2403.18360v3","updated":"2024-04-26T15:46:05Z","published":"2024-03-27T08:52:44Z","title":"Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific\n Boundaries for Domain Adaptation","summary":" Most domain adaptation (DA) methods are based on either a convolutional\nneural networks (CNNs) or a vision transformers (ViTs). They align the\ndistribution differences between domains as encoders without considering their\nunique characteristics. For instance, ViT excels in accuracy due to its\nsuperior ability to capture global representations, while CNN has an advantage\nin capturing local representations. This fact has led us to design a hybrid\nmethod to fully take advantage of both ViT and CNN, called Explicitly\nClass-specific Boundaries (ECB). ECB learns CNN on ViT to combine their\ndistinct strengths. In particular, we leverage ViT's properties to explicitly\nfind class-specific decision boundaries by maximizing the discrepancy between\nthe outputs of the two classifiers to detect target samples far from the source\nsupport. 
In contrast, the CNN encoder clusters target features based on the\npreviously defined class-specific boundaries by minimizing the discrepancy\nbetween the probabilities of the two classifiers. Finally, ViT and CNN mutually\nexchange knowledge to improve the quality of pseudo labels and reduce the\nknowledge discrepancies of these models. Compared to conventional DA methods,\nour ECB achieves superior performance, which verifies its effectiveness in this\nhybrid model. The project website can be found\nhttps://dotrannhattuong.github.io/ECB/website.\n","authors":["Ba Hung Ngo","Nhat-Tuong Do-Tran","Tuan-Ngoc Nguyen","Hae-Gon Jeon","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.18360v3.pdf","comment":"Project page: https://dotrannhattuong.github.io/ECB/website, Accepted\n to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.17488v1","updated":"2024-04-26T15:43:24Z","published":"2024-04-26T15:43:24Z","title":"Low Cost Machine Vision for Insect Classification","summary":" Preserving the number and diversity of insects is one of our society's most\nimportant goals in the area of environmental sustainability. A prerequisite for\nthis is a systematic and up-scaled monitoring in order to detect correlations\nand identify countermeasures. Therefore, automatized monitoring using live\ntraps is important, but so far there is no system that provides image data of\nsufficient detailed information for entomological classification.\n In this work, we present an imaging method as part of a multisensor system\ndeveloped as a low-cost, scalable, open-source system that is adaptable to\nclassical trap types. The image quality meets the requirements needed for\nclassification in the taxonomic tree. Therefore, illumination and resolution\nhave been optimized and motion artefacts have been suppressed. The system is\nevaluated exemplarily on a dataset consisting of 16 insect species of the same\nas well as different genus, family and order. We demonstrate that standard\nCNN-architectures like ResNet50 (pretrained on iNaturalist data) or MobileNet\nperform very well for the prediction task after re-training. Smaller custom\nmade CNNs also lead to promising results. Classification accuracy of $>96\\%$\nhas been achieved. Moreover, it was proved that image cropping of insects is\nnecessary for classification of species with high inter-class similarity.\n","authors":["Danja Brandt","Martin Tschaikner","Teodor Chiaburu","Henning Schmidt","Ilona Schrimpf","Alexandra Stadel","Ingeborg E. Beckers","Frank Haußer"],"pdf_url":"https://arxiv.org/pdf/2404.17488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17486v1","updated":"2024-04-26T15:42:24Z","published":"2024-04-26T15:42:24Z","title":"TextGaze: Gaze-Controllable Face Generation with Natural Language","summary":" Generating face image with specific gaze information has attracted\nconsiderable attention. Existing approaches typically input gaze values\ndirectly for face generation, which is unnatural and requires annotated gaze\ndatasets for training, thereby limiting its application. In this paper, we\npresent a novel gaze-controllable face generation task. Our approach inputs\ntextual descriptions that describe human gaze and head behavior and generates\ncorresponding face images. Our work first introduces a text-of-gaze dataset\ncontaining over 90k text descriptions spanning a dense distribution of gaze and\nhead poses. We further propose a gaze-controllable text-to-face method. 
Our\nmethod contains a sketch-conditioned face diffusion module and a model-based\nsketch diffusion module. We define a face sketch based on facial landmarks and\neye segmentation map. The face diffusion module generates face images from the\nface sketch, and the sketch diffusion module employs a 3D face model to\ngenerate face sketch from text description. Experiments on the FFHQ dataset\nshow the effectiveness of our method. We will release our dataset and code for\nfuture research.\n","authors":["Hengfei Wang","Zhongqun Zhang","Yihua Cheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2404.17486v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.17484v1","updated":"2024-04-26T15:37:50Z","published":"2024-04-26T15:37:50Z","title":"Sparse Reconstruction of Optical Doppler Tomography Based on State Space\n Model","summary":" Optical Doppler Tomography (ODT) is a blood flow imaging technique popularly\nused in bioengineering applications. The fundamental unit of ODT is the 1D\nfrequency response along the A-line (depth), named raw A-scan. A 2D ODT image\n(B-scan) is obtained by first sensing raw A-scans along the B-line (width), and\nthen constructing the B-scan from these raw A-scans via magnitude-phase\nanalysis and post-processing. To obtain a high-resolution B-scan with a precise\nflow map, densely sampled A-scans are required in current methods, causing both\ncomputational and storage burdens. To address this issue, in this paper we\npropose a novel sparse reconstruction framework with four main sequential\nsteps: 1) early magnitude-phase fusion that encourages rich interaction of the\ncomplementary information in magnitude and phase, 2) State Space Model\n(SSM)-based representation learning, inspired by recent successes in Mamba and\nVMamba, to naturally capture both the intra-A-scan sequential information and\nbetween-A-scan interactions, 3) an Inception-based Feedforward Network module\n(IncFFN) to further boost the SSM-module, and 4) a B-line Pixel Shuffle (BPS)\nlayer to effectively reconstruct the final results. In the experiments on\nreal-world animal data, our method shows clear effectiveness in reconstruction\naccuracy. As the first application of SSM for image reconstruction tasks, we\nexpect our work to inspire related explorations in not only efficient ODT\nimaging techniques but also generic image enhancement.\n","authors":["Zhenghong Li","Jiaxiang Ren","Wensheng Cheng","Congwu Du","Yingtian Pan","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2404.17484v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.17163v5","updated":"2024-04-26T15:06:10Z","published":"2023-12-28T17:52:09Z","title":"FENet: Focusing Enhanced Network for Lane Detection","summary":" Inspired by human driving focus, this research pioneers networks augmented\nwith Focusing Sampling, Partial Field of View Evaluation, Enhanced FPN\narchitecture and Directional IoU Loss - targeted innovations addressing\nobstacles to precise lane detection for autonomous driving. Experiments\ndemonstrate our Focusing Sampling strategy, emphasizing vital distant details\nunlike uniform approaches, significantly boosts both benchmark and practical\ncurved/distant lane recognition accuracy essential for safety. While FENetV1\nachieves state-of-the-art conventional metric performance via enhancements\nisolating perspective-aware contexts mimicking driver vision, FENetV2 proves\nmost reliable on the proposed Partial Field analysis. 
Hence we specifically\nrecommend V2 for practical lane navigation despite fractional degradation on\nstandard entire-image measures. Future directions include collecting on-road\ndata and integrating complementary dual frameworks to further breakthroughs\nguided by human perception principles. The Code is available at\nhttps://github.com/HanyangZhong/FENet.\n","authors":["Liman Wang","Hanyang Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.17163v5.pdf","comment":"12 pages including appendix. The Code is available at\n https://github.com/HanyangZhong/FENet"},{"id":"http://arxiv.org/abs/2403.04654v3","updated":"2024-04-26T14:50:45Z","published":"2024-03-07T16:57:45Z","title":"Audio-Visual Person Verification based on Recursive Fusion of Joint\n Cross-Attention","summary":" Person or identity verification has been recently gaining a lot of attention\nusing audio-visual fusion as faces and voices share close associations with\neach other. Conventional approaches based on audio-visual fusion rely on\nscore-level or early feature-level fusion techniques. Though existing\napproaches showed improvement over unimodal systems, the potential of\naudio-visual fusion for person verification is not fully exploited. In this\npaper, we have investigated the prospect of effectively capturing both the\nintra- and inter-modal relationships across audio and visual modalities, which\ncan play a crucial role in significantly improving the fusion performance over\nunimodal systems. In particular, we introduce a recursive fusion of a joint\ncross-attentional model, where a joint audio-visual feature representation is\nemployed in the cross-attention framework in a recursive fashion to\nprogressively refine the feature representations that can efficiently capture\nthe intra-and inter-modal relationships. To further enhance the audio-visual\nfeature representations, we have also explored BLSTMs to improve the temporal\nmodeling of audio-visual feature representations. Extensive experiments are\nconducted on the Voxceleb1 dataset to evaluate the proposed model. Results\nindicate that the proposed model shows promising improvement in fusion\nperformance by adeptly capturing the intra-and inter-modal relationships across\naudio and visual modalities.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.04654v3.pdf","comment":"Accepted to FG2024"},{"id":"http://arxiv.org/abs/2312.02246v4","updated":"2024-04-26T14:40:55Z","published":"2023-12-04T14:45:56Z","title":"Conditional Variational Diffusion Models","summary":" Inverse problems aim to determine parameters from observations, a crucial\ntask in engineering and science. Lately, generative models, especially\ndiffusion models, have gained popularity in this area for their ability to\nproduce realistic solutions and their good mathematical properties. Despite\ntheir success, an important drawback of diffusion models is their sensitivity\nto the choice of variance schedule, which controls the dynamics of the\ndiffusion process. Fine-tuning this schedule for specific applications is\ncrucial but time-costly and does not guarantee an optimal result. We propose a\nnovel approach for learning the schedule as part of the training process. Our\nmethod supports probabilistic conditioning on data, provides high-quality\nsolutions, and is flexible, proving able to adapt to different applications\nwith minimum overhead. 
This approach is tested in two unrelated inverse\nproblems: super-resolution microscopy and quantitative phase imaging, yielding\ncomparable or superior results to previous methods and fine-tuned diffusion\nmodels. We conclude that fine-tuning the schedule by experimentation should be\navoided because it can be learned during training in a stable way that yields\nbetter results.\n","authors":["Gabriel della Maggiora","Luis Alberto Croquevielle","Nikita Deshpande","Harry Horsley","Thomas Heinis","Artur Yakimovich"],"pdf_url":"https://arxiv.org/pdf/2312.02246v4.pdf","comment":"Denoising Diffusion Probabilistic Models, Inverse Problems,\n Generative Models, Super Resolution, Phase Quantification, Variational\n Methods"},{"id":"http://arxiv.org/abs/2404.14471v2","updated":"2024-04-26T14:35:32Z","published":"2024-04-22T17:55:07Z","title":"Narrative Action Evaluation with Prompt-Guided Multimodal Interaction","summary":" In this paper, we investigate a new problem called narrative action\nevaluation (NAE). NAE aims to generate professional commentary that evaluates\nthe execution of an action. Unlike traditional tasks such as score-based action\nquality assessment and video captioning involving superficial sentences, NAE\nfocuses on creating detailed narratives in natural language. These narratives\nprovide intricate descriptions of actions along with objective evaluations. NAE\nis a more challenging task because it requires both narrative flexibility and\nevaluation rigor. One existing possible solution is to use multi-task learning,\nwhere narrative language and evaluative information are predicted separately.\nHowever, this approach results in reduced performance for individual tasks\nbecause of variations between tasks and differences in modality between\nlanguage information and evaluation information. To address this, we propose a\nprompt-guided multimodal interaction framework. This framework utilizes a pair\nof transformers to facilitate the interaction between different modalities of\ninformation. It also uses prompts to transform the score regression task into a\nvideo-text matching task, thus enabling task interactivity. To support further\nresearch in this field, we re-annotate the MTL-AQA and FineGym datasets with\nhigh-quality and comprehensive action narration. Additionally, we establish\nbenchmarks for NAE. Extensive experiment results prove that our method\noutperforms separate learning methods and naive multi-task learning methods.\nData and code are released at https://github.com/shiyi-zh0408/NAE_CVPR2024.\n","authors":["Shiyi Zhang","Sule Bai","Guangyi Chen","Lei Chen","Jiwen Lu","Junle Wang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14471v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.17433v1","updated":"2024-04-26T14:20:31Z","published":"2024-04-26T14:20:31Z","title":"PromptCIR: Blind Compressed Image Restoration with Prompt Learning","summary":" Blind Compressed Image Restoration (CIR) has garnered significant attention\ndue to its practical applications. It aims to mitigate compression artifacts\ncaused by unknown quality factors, particularly with JPEG codecs. Existing\nworks on blind CIR often seek assistance from a quality factor prediction\nnetwork to facilitate their network to restore compressed images. However, the\npredicted numerical quality factor lacks spatial information, preventing\nnetwork adaptability toward image contents. 
Recent studies in\nprompt-learning-based image restoration have showcased the potential of prompts\nto generalize across varied degradation types and degrees. This motivated us to\ndesign a prompt-learning-based compressed image restoration network, dubbed\nPromptCIR, which can effectively restore images from various compress levels.\nSpecifically, PromptCIR exploits prompts to encode compression information\nimplicitly, where prompts directly interact with soft weights generated from\nimage features, thus providing dynamic content-aware and distortion-aware\nguidance for the restoration process. The light-weight prompts enable our\nmethod to adapt to different compression levels, while introducing minimal\nparameter overhead. Overall, PromptCIR leverages the powerful transformer-based\nbackbone with the dynamic prompt module to proficiently handle blind CIR tasks,\nwinning first place in the NTIRE 2024 challenge of blind compressed image\nenhancement track. Extensive experiments have validated the effectiveness of\nour proposed PromptCIR. The code is available at\nhttps://github.com/lbc12345/PromptCIR-NTIRE24.\n","authors":["Bingchen Li","Xin Li","Yiting Lu","Ruoyu Feng","Mengxi Guo","Shijie Zhao","Li Zhang","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17433v1.pdf","comment":"Winner of NTIRE 2024 Blind Compressed Image Enhancement Challenge"},{"id":"http://arxiv.org/abs/2306.05272v5","updated":"2024-04-26T14:10:49Z","published":"2023-06-08T15:20:27Z","title":"Image Clustering via the Principle of Rate Reduction in the Age of\n Pretrained Models","summary":" The advent of large pre-trained models has brought about a paradigm shift in\nboth visual representation learning and natural language processing. However,\nclustering unlabeled images, as a fundamental and classic machine learning\nproblem, still lacks an effective solution, particularly for large-scale\ndatasets. In this paper, we propose a novel image clustering pipeline that\nleverages the powerful feature representation of large pre-trained models such\nas CLIP and cluster images effectively and efficiently at scale. We first\ndeveloped a novel algorithm to estimate the number of clusters in a given\ndataset. We then show that the pre-trained features are significantly more\nstructured by further optimizing the rate reduction objective. The resulting\nfeatures may significantly improve the clustering accuracy, e.g., from 57\\% to\n66\\% on ImageNet-1k. Furthermore, by leveraging CLIP's multimodality bridge\nbetween image and text, we develop a simple yet effective self-labeling\nalgorithm that produces meaningful captions for the clusters. Through extensive\nexperiments, we show that our pipeline works well on standard datasets such as\nCIFAR-10, CIFAR-100, and ImageNet-1k. It also extends to datasets that are not\ncurated for clustering, such as LAION-Aesthetics and WikiArts. We released the\ncode in https://github.com/LeslieTrue/CPP.\n","authors":["Tianzhe Chu","Shengbang Tong","Tianjiao Ding","Xili Dai","Benjamin David Haeffele","René Vidal","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2306.05272v5.pdf","comment":"23 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.17427v1","updated":"2024-04-26T14:03:55Z","published":"2024-04-26T14:03:55Z","title":"Cost-Sensitive Uncertainty-Based Failure Recognition for Object\n Detection","summary":" Object detectors in real-world applications often fail to detect objects due\nto varying factors such as weather conditions and noisy input. 
Therefore, a\nprocess that mitigates false detections is crucial for both safety and\naccuracy. While uncertainty-based thresholding shows promise, previous works\ndemonstrate an imperfect correlation between uncertainty and detection errors.\nThis hinders ideal thresholding, prompting us to further investigate the\ncorrelation and associated cost with different types of uncertainty. We\ntherefore propose a cost-sensitive framework for object detection tailored to\nuser-defined budgets on the two types of errors, missing and false detections.\nWe derive minimum thresholding requirements to prevent performance degradation\nand define metrics to assess the applicability of uncertainty for failure\nrecognition. Furthermore, we automate and optimize the thresholding process to\nmaximize the failure recognition rate w.r.t. the specified budget. Evaluation\non three autonomous driving datasets demonstrates that our approach\nsignificantly enhances safety, particularly in challenging scenarios.\nLeveraging localization aleatoric uncertainty and softmax-based entropy only,\nour method boosts the failure recognition rate by 36-60\\% compared to\nconventional approaches. Code is available at\nhttps://mos-ks.github.io/publications.\n","authors":["Moussa Kassem Sbeyti","Michelle Karg","Christian Wirth","Nadja Klein","Sahin Albayrak"],"pdf_url":"https://arxiv.org/pdf/2404.17427v1.pdf","comment":"Accepted with an oral presentation at UAI 2024"},{"id":"http://arxiv.org/abs/2404.17426v1","updated":"2024-04-26T14:03:23Z","published":"2024-04-26T14:03:23Z","title":"One-Shot Image Restoration","summary":" Image restoration, or inverse problems in image processing, has long been an\nextensively studied topic. In recent years supervised learning approaches have\nbecome a popular strategy attempting to tackle this task. Unfortunately, most\nsupervised learning-based methods are highly demanding in terms of\ncomputational resources and training data (sample complexity). In addition,\ntrained models are sensitive to domain changes, such as varying acquisition\nsystems, signal sampling rates, resolution and contrast. In this work, we try\nto answer a fundamental question: Can supervised learning models generalize\nwell solely by learning from one image or even part of an image? If so, then\nwhat is the minimal amount of patches required to achieve acceptable\ngeneralization? To this end, we focus on an efficient patch-based learning\nframework that requires a single image input-output pair for training.\nExperimental results demonstrate the applicability, robustness and\ncomputational efficiency of the proposed approach for supervised image\ndeblurring and super-resolution. Our results showcase significant improvement\nof learning models' sample efficiency, generalization and time complexity, that\ncan hopefully be leveraged for future real-time applications, and applied to\nother signals and modalities.\n","authors":["Deborah Pereg"],"pdf_url":"https://arxiv.org/pdf/2404.17426v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.14267"},{"id":"http://arxiv.org/abs/2404.03537v4","updated":"2024-04-26T14:01:36Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. 
Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v4.pdf","comment":"Accepted as full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2404.17419v1","updated":"2024-04-26T13:55:39Z","published":"2024-04-26T13:55:39Z","title":"Multi-view Image Prompted Multi-view Diffusion for Improved 3D\n Generation","summary":" Using image as prompts for 3D generation demonstrate particularly strong\nperformances compared to using text prompts alone, for images provide a more\nintuitive guidance for the 3D generation process. In this work, we delve into\nthe potential of using multiple image prompts, instead of a single image\nprompt, for 3D generation. Specifically, we build on ImageDream, a novel\nimage-prompt multi-view diffusion model, to support multi-view images as the\ninput prompt. Our method, dubbed MultiImageDream, reveals that transitioning\nfrom a single-image prompt to multiple-image prompts enhances the performance\nof multi-view and 3D object generation according to various quantitative\nevaluation metrics and qualitative assessments. This advancement is achieved\nwithout the necessity of fine-tuning the pre-trained ImageDream multi-view\ndiffusion model.\n","authors":["Seungwook Kim","Yichun Shi","Kejie Li","Minsu Cho","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.17419v1.pdf","comment":"5 pages including references, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.10543v4","updated":"2024-04-26T13:43:04Z","published":"2023-11-17T14:10:55Z","title":"Joint covariance properties under geometric image transformations for\n spatio-temporal receptive fields according to the generalized Gaussian\n derivative model for visual receptive fields","summary":" The influence of natural image transformations on receptive field responses\nis crucial for modelling visual operations in computer vision and biological\nvision. 
In this regard, covariance properties with respect to geometric image\ntransformations in the earliest layers of the visual hierarchy are essential\nfor expressing robust image operations, and for formulating invariant visual\noperations at higher levels.\n This paper defines and proves a set of joint covariance properties under\ncompositions of spatial scaling transformations, spatial affine\ntransformations, Galilean transformations and temporal scaling transformations,\nwhich make it possible to characterize how different types of image\ntransformations interact with each other and the associated spatio-temporal\nreceptive field responses. In this regard, we also extend the notion of\nscale-normalized derivatives to affine-normalized derivatives, to be able to\nobtain true affine-covariant properties of spatial derivatives, that are\ncomputed based on spatial smoothing with affine Gaussian kernels.\n The derived relations show how the parameters of the receptive fields need to\nbe transformed, in order to match the output from spatio-temporal receptive\nfields under composed spatio-temporal image transformations. As a side effect,\nthe presented proof for the joint covariance property over the integrated\ncombination of the different geometric image transformations also provides\nspecific proofs for the individual transformation properties, which have not\npreviously been fully reported in the literature.\n The paper also presents an in-depth theoretical analysis of geometric\ninterpretations of the derived covariance properties, as well as outlines a\nnumber of biological interpretations of these results.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.10543v4.pdf","comment":"38 pages, 13 figures. Note: From version 4, this paper considers a\n different form of joint composition of the geometric image transformations\n than in the earlier versions"},{"id":"http://arxiv.org/abs/2404.15041v2","updated":"2024-04-26T13:38:08Z","published":"2024-04-23T13:43:33Z","title":"LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial\n Expression Recognition","summary":" Semi-supervised learning has emerged as a promising approach to tackle the\nchallenge of label scarcity in facial expression recognition (FER) task.\nHowever, current state-of-the-art methods primarily focus on one side of the\ncoin, i.e., generating high-quality pseudo-labels, while overlooking the other\nside: enhancing expression-relevant representations. In this paper, we unveil\nboth sides of the coin by proposing a unified framework termed hierarchicaL\ndEcoupling And Fusing (LEAF) to coordinate expression-relevant representations\nand pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical\nexpression-aware aggregation strategy that operates at three levels: semantic,\ninstance, and category. (1) At the semantic and instance levels, LEAF decouples\nrepresentations into expression-agnostic and expression-relevant components,\nand adaptively fuses them using learnable gating weights. (2) At the category\nlevel, LEAF assigns ambiguous pseudo-labels by decoupling predictions into\npositive and negative parts, and employs a consistency loss to ensure agreement\nbetween two augmented views of the same image. Extensive experiments on\nbenchmark datasets demonstrate that by unveiling and harmonizing both sides of\nthe coin, LEAF outperforms state-of-the-art semi-supervised FER methods,\neffectively leveraging both labeled and unlabeled data. 
Moreover, the proposed\nexpression-aware aggregation strategy can be seamlessly integrated into\nexisting semi-supervised frameworks, leading to significant performance gains.\nOur code is available at https://anonymous.4open.science/r/LEAF-BC57/.\n","authors":["Fan Zhang","Zhi-Qi Cheng","Jian Zhao","Xiaojiang Peng","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.15041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17400v1","updated":"2024-04-26T13:21:31Z","published":"2024-04-26T13:21:31Z","title":"Spatial-frequency Dual-Domain Feature Fusion Network for Low-Light\n Remote Sensing Image Enhancement","summary":" Low-light remote sensing images generally feature high resolution and high\nspatial complexity, with continuously distributed surface features in space.\nThis continuity in scenes leads to extensive long-range correlations in spatial\ndomains within remote sensing images. Convolutional Neural Networks, which rely\non local correlations for long-distance modeling, struggle to establish\nlong-range correlations in such images. On the other hand, transformer-based\nmethods that focus on global information face high computational complexities\nwhen processing high-resolution remote sensing images. From another\nperspective, Fourier transform can compute global information without\nintroducing a large number of parameters, enabling the network to more\nefficiently capture the overall image structure and establish long-range\ncorrelations. Therefore, we propose a Dual-Domain Feature Fusion Network (DFFN)\nfor low-light remote sensing image enhancement. Specifically, this challenging\ntask of low-light enhancement is divided into two more manageable sub-tasks:\nthe first phase learns amplitude information to restore image brightness, and\nthe second phase learns phase information to refine details. To facilitate\ninformation exchange between the two phases, we designed an information fusion\naffine block that combines data from different phases and scales. Additionally,\nwe have constructed two dark light remote sensing datasets to address the\ncurrent lack of datasets in dark light remote sensing image enhancement.\nExtensive evaluations show that our method outperforms existing\nstate-of-the-art methods. The code is available at\nhttps://github.com/iijjlk/DFFN.\n","authors":["Zishu Yao","Guodong Fan","Jinfu Fan","Min Gan","C. L. Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17400v1.pdf","comment":"14 page"},{"id":"http://arxiv.org/abs/2402.00290v2","updated":"2024-04-26T13:13:52Z","published":"2024-02-01T02:43:20Z","title":"MEIA: Towards Realistic Multimodal Interaction and Manipulation for\n Embodied Robots","summary":" With the surge in the development of large language models, embodied\nintelligence has attracted increasing attention. Nevertheless, prior works on\nembodied intelligence typically encode scene or historical memory in an\nunimodal manner, either visual or linguistic, which complicates the alignment\nof the model's action planning with embodied control. To overcome this\nlimitation, we introduce the Multimodal Embodied Interactive Agent (MEIA),\ncapable of translating high-level tasks expressed in natural language into a\nsequence of executable actions. Specifically, we propose a novel Multimodal\nEnvironment Memory (MEM) module, facilitating the integration of embodied\ncontrol with large models through the visual-language memory of scenes. 
This\ncapability enables MEIA to generate executable action plans based on diverse\nrequirements and the robot's capabilities. Furthermore, we construct an\nembodied question answering dataset based on a dynamic virtual cafe environment\nwith the help of the large language model. In this virtual environment, we\nconduct several experiments, utilizing multiple large models through zero-shot\nlearning, and carefully design scenarios for various situations. The\nexperimental results showcase the promising performance of our MEIA in various\nembodied interactive tasks.\n","authors":["Yang Liu","Xinshuai Song","Kaixuan Jiang","Weixing Chen","Jingzhou Luo","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.00290v2.pdf","comment":"Codes will be available at https://github.com/HCPLab-SYSU/CausalVLR"},{"id":"http://arxiv.org/abs/2404.17381v1","updated":"2024-04-26T12:56:16Z","published":"2024-04-26T12:56:16Z","title":"Frequency-Guided Multi-Level Human Action Anomaly Detection with\n Normalizing Flows","summary":" We introduce the task of human action anomaly detection (HAAD), which aims to\nidentify anomalous motions in an unsupervised manner given only the\npre-determined normal category of training action samples. Compared to prior\nhuman-related anomaly detection tasks which primarily focus on unusual events\nfrom videos, HAAD involves the learning of specific action labels to recognize\nsemantically anomalous human behaviors. To address this task, we propose a\nnormalizing flow (NF)-based detection framework where the sample likelihood is\neffectively leveraged to indicate anomalies. As action anomalies often occur in\nsome specific body parts, in addition to the full-body action feature learning,\nwe incorporate extra encoding streams into our framework for a finer modeling\nof body subsets. Our framework is thus multi-level to jointly discover global\nand local motion anomalies. Furthermore, to show awareness of the potentially\njittery data during recording, we resort to discrete cosine transformation by\nconverting the action samples from the temporal to the frequency domain to\nmitigate the issue of data instability. Extensive experimental results on two\nhuman action datasets demonstrate that our method outperforms the baselines\nformed by adapting state-of-the-art human activity AD approaches to our task of\nHAAD.\n","authors":["Shun Maeda","Chunzhi Gu","Jun Yu","Shogo Tokai","Shangce Gao","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17371v1","updated":"2024-04-26T12:43:19Z","published":"2024-04-26T12:43:19Z","title":"Estimating the Robustness Radius for Randomized Smoothing with\n 100$\\times$ Sample Efficiency","summary":" Randomized smoothing (RS) has successfully been used to improve the\nrobustness of predictions for deep neural networks (DNNs) by adding random\nnoise to create multiple variations of an input, followed by deciding the\nconsensus. To understand if an RS-enabled DNN is effective in the sampled input\ndomains, it is mandatory to sample data points within the operational design\ndomain, acquire the point-wise certificate regarding robustness radius, and\ncompare it with pre-defined acceptance criteria. Consequently, ensuring that a\npoint-wise robustness certificate for any given data point is obtained\nrelatively cost-effectively is crucial. 
This work demonstrates that reducing\nthe number of samples by one or two orders of magnitude can still enable the\ncomputation of a slightly smaller robustness radius (commonly ~20% radius\nreduction) with the same confidence. We provide the mathematical foundation for\nexplaining the phenomenon while experimentally showing promising results on the\nstandard CIFAR-10 and ImageNet datasets.\n","authors":["Emmanouil Seferis","Stefanos Kollias","Chih-Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.17371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17364v1","updated":"2024-04-26T12:27:57Z","published":"2024-04-26T12:27:57Z","title":"MV-VTON: Multi-View Virtual Try-On with Diffusion Models","summary":" The goal of image-based virtual try-on is to generate an image of the target\nperson naturally wearing the given clothing. However, most existing methods\nsolely focus on the frontal try-on using the frontal clothing. When the views\nof the clothing and person are significantly inconsistent, particularly when\nthe person's view is non-frontal, the results are unsatisfactory. To address\nthis challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to\nreconstruct the dressing results of a person from multiple views using the\ngiven clothes. On the one hand, given that single-view clothes provide\ninsufficient information for MV-VTON, we instead employ two images, i.e., the\nfrontal and back views of the clothing, to encompass the complete view as much\nas possible. On the other hand, the diffusion models that have demonstrated\nsuperior abilities are adopted to perform our MV-VTON. In particular, we\npropose a view-adaptive selection method where hard-selection and\nsoft-selection are applied to the global and local clothing feature extraction,\nrespectively. This ensures that the clothing features are roughly fit to the\nperson's view. Subsequently, we suggest a joint attention block to align and\nfuse clothing features with person features. Additionally, we collect a MV-VTON\ndataset, i.e., Multi-View Garment (MVG), in which each person has multiple\nphotos with diverse views and poses. Experiments show that the proposed method\nnot only achieves state-of-the-art results on MV-VTON task using our MVG\ndataset, but also has superiority on frontal-view virtual try-on task using\nVITON-HD and DressCode datasets. Codes and datasets will be publicly released\nat https://github.com/hywang2002/MV-VTON .\n","authors":["Haoyu Wang","Zhilu Zhang","Donglin Di","Shiliang Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.17364v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2211.12371v2","updated":"2024-04-26T12:25:55Z","published":"2022-11-22T16:05:58Z","title":"Gait Recognition in Large-scale Free Environment via Single LiDAR","summary":" Human gait recognition is crucial in multimedia, enabling identification\nthrough walking patterns without direct interaction, enhancing the integration\nacross various media forms in real-world applications like smart homes,\nhealthcare and non-intrusive security. LiDAR's ability to capture depth makes\nit pivotal for robotic perception and holds promise for real-world gait\nrecognition. In this paper, based on a single LiDAR, we present the\nHierarchical Multi-representation Feature Interaction Network (HMRNet) for\nrobust gait recognition. Prevailing LiDAR-based gait datasets primarily derive\nfrom controlled settings with predefined trajectory, remaining a gap with\nreal-world scenarios. 
To facilitate LiDAR-based gait recognition research, we\nintroduce FreeGait, a comprehensive gait dataset from large-scale,\nunconstrained settings, enriched with multi-modal and varied 2D/3D data.\nNotably, our approach achieves state-of-the-art performance on prior dataset\n(SUSTech1K) and on FreeGait. Code and dataset will be released upon publication\nof this paper.\n","authors":["Xiao Han","Yiming Ren","Peishan Cong","Yujing Sun","Jingya Wang","Lan Xu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2211.12371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17360v1","updated":"2024-04-26T12:21:57Z","published":"2024-04-26T12:21:57Z","title":"UniRGB-IR: A Unified Framework for Visible-Infrared Downstream Tasks via\n Adapter Tuning","summary":" Semantic analysis on visible (RGB) and infrared (IR) images has gained\nattention for its ability to be more accurate and robust under low-illumination\nand complex weather conditions. Due to the lack of pre-trained foundation\nmodels on the large-scale infrared image datasets, existing methods prefer to\ndesign task-specific frameworks and directly fine-tune them with pre-trained\nfoundation models on their RGB-IR semantic relevance datasets, which results in\npoor scalability and limited generalization. In this work, we propose a\nscalable and efficient framework called UniRGB-IR to unify RGB-IR downstream\ntasks, in which a novel adapter is developed to efficiently introduce richer\nRGB-IR features into the pre-trained RGB-based foundation model. Specifically,\nour framework consists of a vision transformer (ViT) foundation model, a\nMulti-modal Feature Pool (MFP) module and a Supplementary Feature Injector\n(SFI) module. The MFP and SFI modules cooperate with each other as an adpater\nto effectively complement the ViT features with the contextual multi-scale\nfeatures. During training process, we freeze the entire foundation model to\ninherit prior knowledge and only optimize the MFP and SFI modules. Furthermore,\nto verify the effectiveness of our framework, we utilize the ViT-Base as the\npre-trained foundation model to perform extensive experiments. Experimental\nresults on various RGB-IR downstream tasks demonstrate that our method can\nachieve state-of-the-art performance. The source code and results are available\nat https://github.com/PoTsui99/UniRGB-IR.git.\n","authors":["Maoxun Yuan","Bo Cui","Tianyi Zhao","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2404.17360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17357v1","updated":"2024-04-26T12:13:41Z","published":"2024-04-26T12:13:41Z","title":"Simultaneous Tri-Modal Medical Image Fusion and Super-Resolution using\n Conditional Diffusion Model","summary":" In clinical practice, tri-modal medical image fusion, compared to the\nexisting dual-modal technique, can provide a more comprehensive view of the\nlesions, aiding physicians in evaluating the disease's shape, location, and\nbiological activity. However, due to the limitations of imaging equipment and\nconsiderations for patient safety, the quality of medical images is usually\nlimited, leading to sub-optimal fusion performance, and affecting the depth of\nimage analysis by the physician. Thus, there is an urgent need for a technology\nthat can both enhance image resolution and integrate multi-modal information.\nAlthough current image processing methods can effectively address image fusion\nand super-resolution individually, solving both problems synchronously remains\nextremely challenging. 
In this paper, we propose TFS-Diff, a simultaneously\nrealize tri-modal medical image fusion and super-resolution model. Specially,\nTFS-Diff is based on the diffusion model generation of a random iterative\ndenoising process. We also develop a simple objective function and the proposed\nfusion super-resolution loss, effectively evaluates the uncertainty in the\nfusion and ensures the stability of the optimization process. And the channel\nattention module is proposed to effectively integrate key information from\ndifferent modalities for clinical diagnosis, avoiding information loss caused\nby multiple image processing. Extensive experiments on public Harvard datasets\nshow that TFS-Diff significantly surpass the existing state-of-the-art methods\nin both quantitative and visual evaluations. The source code will be available\nat GitHub.\n","authors":["Yushen Xu","Xiaosong Li","Yuchan Jie","Haishu Tan"],"pdf_url":"https://arxiv.org/pdf/2404.17357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13330v2","updated":"2024-04-26T12:05:20Z","published":"2024-04-20T09:27:05Z","title":"SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical\n Instrument Segmentation","summary":" SEGSRNet addresses the challenge of precisely identifying surgical\ninstruments in low-resolution stereo endoscopic images, a common issue in\nmedical imaging and robotic surgery. Our innovative framework enhances image\nclarity and segmentation accuracy by applying state-of-the-art super-resolution\ntechniques before segmentation. This ensures higher-quality inputs for more\nprecise segmentation. SEGSRNet combines advanced feature extraction and\nattention mechanisms with spatial processing to sharpen image details, which is\nsignificant for accurate tool identification in medical images. Our proposed\nmodel outperforms current models including Dice, IoU, PSNR, and SSIM, SEGSRNet\nwhere it produces clearer and more accurate images for stereo endoscopic\nsurgical imaging. SEGSRNet can provide image resolution and precise\nsegmentation which can significantly enhance surgical accuracy and patient care\noutcomes.\n","authors":["Mansoor Hayat","Supavadee Aramvith","Titipat Achakulvisut"],"pdf_url":"https://arxiv.org/pdf/2404.13330v2.pdf","comment":"Paper accepted for Presentation in 46th Annual International\n Conference of the IEEE Engineering in Medicine and Biology Society (EMBS),\n Orlando, Florida, USA (Camera Ready Version)"},{"id":"http://arxiv.org/abs/2404.17350v1","updated":"2024-04-26T11:57:17Z","published":"2024-04-26T11:57:17Z","title":"On the Road to Clarity: Exploring Explainable AI for World Models in a\n Driver Assistance System","summary":" In Autonomous Driving (AD) transparency and safety are paramount, as mistakes\nare costly. However, neural networks used in AD systems are generally\nconsidered black boxes. As a countermeasure, we have methods of explainable AI\n(XAI), such as feature relevance estimation and dimensionality reduction.\nCoarse graining techniques can also help reduce dimensionality and find\ninterpretable global patterns. A specific coarse graining method is\nRenormalization Groups from statistical physics. It has previously been applied\nto Restricted Boltzmann Machines (RBMs) to interpret unsupervised learning. 
We\nrefine this technique by building a transparent backbone model for\nconvolutional variational autoencoders (VAE) that allows mapping latent values\nto input features and has performance comparable to trained black box VAEs.\nMoreover, we propose a custom feature map visualization technique to analyze\nthe internal convolutional layers in the VAE to explain internal causes of poor\nreconstruction that may lead to dangerous traffic scenarios in AD applications.\nIn a second key contribution, we propose explanation and evaluation techniques\nfor the internal dynamics and feature relevance of prediction networks. We test\na long short-term memory (LSTM) network in the computer vision domain to\nevaluate the predictability and in future applications potentially safety of\nprediction models. We showcase our methods by analyzing a VAE-LSTM world model\nthat predicts pedestrian perception in an urban traffic situation.\n","authors":["Mohamed Roshdi","Julian Petzold","Mostafa Wahby","Hussein Ebrahim","Mladen Berekovic","Heiko Hamann"],"pdf_url":"https://arxiv.org/pdf/2404.17350v1.pdf","comment":"8 pages, 6 figures, to be published in IEEE CAI 2024"},{"id":"http://arxiv.org/abs/2401.13555v2","updated":"2024-04-26T11:50:10Z","published":"2024-01-24T16:13:26Z","title":"Benchmarking the Fairness of Image Upsampling Methods","summary":" Recent years have witnessed a rapid development of deep generative models for\ncreating synthetic media, such as images and videos. While the practical\napplications of these models in everyday tasks are enticing, it is crucial to\nassess the inherent risks regarding their fairness. In this work, we introduce\na comprehensive framework for benchmarking the performance and fairness of\nconditional generative models. We develop a set of\nmetrics$\\unicode{x2013}$inspired by their supervised fairness\ncounterparts$\\unicode{x2013}$to evaluate the models on their fairness and\ndiversity. Focusing on the specific application of image upsampling, we create\na benchmark covering a wide variety of modern upsampling methods. As part of\nthe benchmark, we introduce UnfairFace, a subset of FairFace that replicates\nthe racial distribution of common large-scale face datasets. Our empirical\nstudy highlights the importance of using an unbiased training set and reveals\nvariations in how the algorithms respond to dataset imbalances. Alarmingly, we\nfind that none of the considered methods produces statistically fair and\ndiverse results. All experiments can be reproduced using our provided\nrepository.\n","authors":["Mike Laszkiewicz","Imant Daunhawer","Julia E. Vogt","Asja Fischer","Johannes Lederer"],"pdf_url":"https://arxiv.org/pdf/2401.13555v2.pdf","comment":"This is the author's version of the work. It is posted here for your\n personal use. Not for redistribution. The definitive Version of Record was\n published at the 2024 ACM Conference on Fairness, Accountability, and\n Transparency (FAccT '24)"},{"id":"http://arxiv.org/abs/2404.17340v1","updated":"2024-04-26T11:39:50Z","published":"2024-04-26T11:39:50Z","title":"Masked Two-channel Decoupling Framework for Incomplete Multi-view Weak\n Multi-label Learning","summary":" Multi-view learning has become a popular research topic in recent years, but\nresearch on the cross-application of classic multi-label classification and\nmulti-view learning is still in its early stages. 
In this paper, we focus on\nthe complex yet highly realistic task of incomplete multi-view weak multi-label\nlearning and propose a masked two-channel decoupling framework based on deep\nneural networks to solve this problem. The core innovation of our method lies\nin decoupling the single-channel view-level representation, which is common in\ndeep multi-view learning methods, into a shared representation and a\nview-proprietary representation. We also design a cross-channel contrastive\nloss to enhance the semantic property of the two channels. Additionally, we\nexploit supervised information to design a label-guided graph regularization\nloss, helping the extracted embedding features preserve the geometric structure\namong samples. Inspired by the success of masking mechanisms in image and text\nanalysis, we develop a random fragment masking strategy for vector features to\nimprove the learning ability of encoders. Finally, it is important to emphasize\nthat our model is fully adaptable to arbitrary view and label absences while\nalso performing well on the ideal full data. We have conducted sufficient and\nconvincing experiments to confirm the effectiveness and advancement of our\nmodel.\n","authors":["Chengliang Liu","Jie Wen","Yabo Liu","Chao Huang","Zhihao Wu","Xiaoling Luo","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.17340v1.pdf","comment":"Accepted at NeurIPS 2023. Email: liucl1996@163.com"},{"id":"http://arxiv.org/abs/2404.17335v1","updated":"2024-04-26T11:32:53Z","published":"2024-04-26T11:32:53Z","title":"A Novel Spike Transformer Network for Depth Estimation from Event\n Cameras via Cross-modality Knowledge Distillation","summary":" Depth estimation is crucial for interpreting complex environments, especially\nin areas such as autonomous vehicle navigation and robotics. Nonetheless,\nobtaining accurate depth readings from event camera data remains a formidable\nchallenge. Event cameras operate differently from traditional digital cameras,\ncontinuously capturing data and generating asynchronous binary spikes that\nencode time, location, and light intensity. Yet, the unique sampling mechanisms\nof event cameras render standard image based algorithms inadequate for\nprocessing spike data. This necessitates the development of innovative,\nspike-aware algorithms tailored for event cameras, a task compounded by the\nirregularity, continuity, noise, and spatial and temporal characteristics\ninherent in spiking data.Harnessing the strong generalization capabilities of\ntransformer neural networks for spatiotemporal data, we propose a purely\nspike-driven spike transformer network for depth estimation from spiking camera\ndata. To address performance limitations with Spiking Neural Networks (SNN), we\nintroduce a novel single-stage cross-modality knowledge transfer framework\nleveraging knowledge from a large vision foundational model of artificial\nneural networks (ANN) (DINOv2) to enhance the performance of SNNs with limited\ndata. Our experimental results on both synthetic and real datasets show\nsubstantial improvements over existing models, with notable gains in Absolute\nRelative and Square Relative errors (49% and 39.77% improvements over the\nbenchmark model Spike-T, respectively). 
Besides accuracy, the proposed model\nalso demonstrates reduced power consumptions, a critical factor for practical\napplications.\n","authors":["Xin Zhang","Liangxiu Han","Tam Sobeih","Lianghao Han","Darren Dancey"],"pdf_url":"https://arxiv.org/pdf/2404.17335v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.08931v2","updated":"2024-04-26T11:22:51Z","published":"2023-11-15T13:04:57Z","title":"Structural-Based Uncertainty in Deep Learning Across Anatomical Scales:\n Analysis in White Matter Lesion Segmentation","summary":" This paper explores uncertainty quantification (UQ) as an indicator of the\ntrustworthiness of automated deep-learning (DL) tools in the context of white\nmatter lesion (WML) segmentation from magnetic resonance imaging (MRI) scans of\nmultiple sclerosis (MS) patients. Our study focuses on two principal aspects of\nuncertainty in structured output segmentation tasks. Firstly, we postulate that\na good uncertainty measure should indicate predictions likely to be incorrect\nwith high uncertainty values. Second, we investigate the merit of quantifying\nuncertainty at different anatomical scales (voxel, lesion, or patient). We\nhypothesize that uncertainty at each scale is related to specific types of\nerrors. Our study aims to confirm this relationship by conducting separate\nanalyses for in-domain and out-of-domain settings. Our primary methodological\ncontributions are (i) the development of novel measures for quantifying\nuncertainty at lesion and patient scales, derived from structural prediction\ndiscrepancies, and (ii) the extension of an error retention curve analysis\nframework to facilitate the evaluation of UQ performance at both lesion and\npatient scales. The results from a multi-centric MRI dataset of 334 patients\ndemonstrate that our proposed measures more effectively capture model errors at\nthe lesion and patient scales compared to measures that average voxel-scale\nuncertainty values. We provide the UQ protocols code at\nhttps://github.com/Medical-Image-Analysis-Laboratory/MS_WML_uncs.\n","authors":["Nataliia Molchanova","Vatsal Raina","Andrey Malinin","Francesco La Rosa","Adrien Depeursinge","Mark Gales","Cristina Granziera","Henning Muller","Mara Graziani","Meritxell Bach Cuadra"],"pdf_url":"https://arxiv.org/pdf/2311.08931v2.pdf","comment":"Preprint submitted to the journal"},{"id":"http://arxiv.org/abs/2309.07880v2","updated":"2024-04-26T11:15:57Z","published":"2023-09-14T17:25:25Z","title":"mEBAL2 Database and Benchmark: Image-based Multispectral Eyeblink\n Detection","summary":" This work introduces a new multispectral database and novel approaches for\neyeblink detection in RGB and Near-Infrared (NIR) individual images. Our\ncontributed dataset (mEBAL2, multimodal Eye Blink and Attention Level\nestimation, Version 2) is the largest existing eyeblink database, representing\na great opportunity to improve data-driven multispectral approaches for blink\ndetection and related applications (e.g., attention level estimation and\npresentation attack detection in face biometrics). mEBAL2 includes 21,100 image\nsequences from 180 different students (more than 2 million labeled images in\ntotal) while conducting a number of e-learning tasks of varying difficulty or\ntaking a real course on HTML initiation through the edX MOOC platform. 
mEBAL2\nuses multiple sensors, including two Near-Infrared (NIR) and one RGB camera to\ncapture facial gestures during the execution of the tasks, as well as an\nElectroencephalogram (EEG) band to get the cognitive activity of the user and\nblinking events. Furthermore, this work proposes a Convolutional Neural Network\narchitecture as benchmark for blink detection on mEBAL2 with performances up to\n97%. Different training methodologies are implemented using the RGB spectrum,\nNIR spectrum, and the combination of both to enhance the performance on\nexisting eyeblink detectors. We demonstrate that combining NIR and RGB images\nduring training improves the performance of RGB eyeblink detectors (i.e.,\ndetection based only on a RGB image). Finally, the generalization capacity of\nthe proposed eyeblink detectors is validated in wilder and more challenging\nenvironments like the HUST-LEBW dataset to show the usefulness of mEBAL2 to\ntrain a new generation of data-driven approaches for eyeblink detection.\n","authors":["Roberto Daza","Aythami Morales","Julian Fierrez","Ruben Tolosana","Ruben Vera-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2309.07880v2.pdf","comment":"Published in the journal Pattern Recognition Letters in June 2024.\n Accessible from\n https://www.sciencedirect.com/science/article/pii/S0167865524001120?via%3Dihub"},{"id":"http://arxiv.org/abs/2404.17324v1","updated":"2024-04-26T11:10:24Z","published":"2024-04-26T11:10:24Z","title":"Dense Road Surface Grip Map Prediction from Multimodal Image Data","summary":" Slippery road weather conditions are prevalent in many regions and cause a\nregular risk for traffic. Still, there has been less research on how autonomous\nvehicles could detect slippery driving conditions on the road to drive safely.\nIn this work, we propose a method to predict a dense grip map from the area in\nfront of the car, based on postprocessed multimodal sensor data. We trained a\nconvolutional neural network to predict pixelwise grip values from fused RGB\ncamera, thermal camera, and LiDAR reflectance images, based on weakly\nsupervised ground truth from an optical road weather sensor.\n The experiments show that it is possible to predict dense grip values with\ngood accuracy from the used data modalities as the produced grip map follows\nboth ground truth measurements and local weather conditions, such as snowy\nareas on the road. 
The model using only the RGB camera or LiDAR reflectance\nmodality provided good baseline results for grip prediction accuracy while\nusing models fusing the RGB camera, thermal camera, and LiDAR modalities\nimproved the grip predictions significantly.\n","authors":["Jyri Maanpää","Julius Pesonen","Heikki Hyyti","Iaroslav Melekhov","Juho Kannala","Petri Manninen","Antero Kukko","Juha Hyyppä"],"pdf_url":"https://arxiv.org/pdf/2404.17324v1.pdf","comment":"17 pages, 7 figures (supplementary material 1 page, 1 figure).\n Submitted to 27th International Conference of Pattern Recognition (ICPR 2024)"},{"id":"http://arxiv.org/abs/2302.05309v3","updated":"2024-04-26T11:04:02Z","published":"2023-02-10T15:12:40Z","title":"The LuViRA Dataset: Synchronized Vision, Radio, and Audio Sensors for\n Indoor Localization","summary":" We present a synchronized multisensory dataset for accurate and robust indoor\nlocalization: the Lund University Vision, Radio, and Audio (LuViRA) Dataset.\nThe dataset includes color images, corresponding depth maps, inertial\nmeasurement unit (IMU) readings, channel response between a 5G massive\nmultiple-input and multiple-output (MIMO) testbed and user equipment, audio\nrecorded by 12 microphones, and accurate six degrees of freedom (6DOF) pose\nground truth of 0.5 mm. We synchronize these sensors to ensure that all data is\nrecorded simultaneously. A camera, speaker, and transmit antenna are placed on\ntop of a slowly moving service robot, and 89 trajectories are recorded. Each\ntrajectory includes 20 to 50 seconds of recorded sensor data and ground truth\nlabels. Data from different sensors can be used separately or jointly to\nperform localization tasks, and data from the motion capture (mocap) system is\nused to verify the results obtained by the localization algorithms. The main\naim of this dataset is to enable research on sensor fusion with the most\ncommonly used sensors for localization tasks. Moreover, the full dataset or\nsome parts of it can also be used for other research areas such as channel\nestimation, image classification, etc. Our dataset is available at:\nhttps://github.com/ilaydayaman/LuViRA_Dataset\n","authors":["Ilayda Yaman","Guoda Tian","Martin Larsson","Patrik Persson","Michiel Sandra","Alexander Dürr","Erik Tegler","Nikhil Challa","Henrik Garde","Fredrik Tufvesson","Kalle Åström","Ove Edfors","Steffen Malkowsky","Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2302.05309v3.pdf","comment":"7 pages, 7 figures, Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2304.09571v7","updated":"2024-04-26T10:58:18Z","published":"2023-04-19T11:19:10Z","title":"LLIC: Large Receptive Field Transform Coding with Adaptive Weights for\n Learned Image Compression","summary":" The Effective Receptive field (ERF) plays an important role in transform\ncoding, which determines how much redundancy can be removed at most during\ntransform and how many spatial priors can be utilized to synthesize textures\nduring inverse transform. Existing methods rely on stacks of small kernels,\nwhose ERF remains not large enough instead, or heavy non-local attention\nmechanisms, which limit the potential of high-resolution image coding. To\ntackle this issue, we propose Large Receptive Field Transform Coding with\nAdaptive Weights for Learned Image Compression (LLIC). Specifically, for the\nfirst time in the learned image compression community, we introduce a few large\nkernel-based depth-wise convolutions to reduce more redundancy while\nmaintaining modest complexity. 
Due to the wide range of image diversity, we\nfurther propose a mechanism to augment convolution adaptability through the\nself-conditioned generation of weights. The large kernels cooperate with\nnon-linear embedding and gate mechanisms for better expressiveness and lighter\npoint-wise interactions. Our investigation extends to refined training methods\nthat unlock the full potential of these large kernels. Moreover, to promote\nmore dynamic inter-channel interactions, we introduce an adaptive channel-wise\nbit allocation strategy that autonomously generates channel importance factors\nin a self-conditioned manner. To demonstrate the effectiveness of the proposed\ntransform coding, we align the entropy model to compare with existing transform\nmethods and obtain models LLIC-STF, LLIC-ELIC, LLIC-TCM. Extensive experiments\ndemonstrate our proposed LLIC models have significant improvements over\ncorresponding baselines and reduce BD-Rate by 9.49%, 9.47%, 10.94% on Kodak\nover VTM-17.0 Intra, respectively. Our LLIC models achieve state-of-the-art\nperformances and better trade-offs between performance and complexity.\n","authors":["Wei Jiang","Peirong Ning","Jiayu Yang","Yongqi Zhai","Feng Gao","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09571v7.pdf","comment":"major updates"},{"id":"http://arxiv.org/abs/2404.17310v1","updated":"2024-04-26T10:38:17Z","published":"2024-04-26T10:38:17Z","title":"Image Copy-Move Forgery Detection via Deep PatchMatch and Pairwise\n Ranking Learning","summary":" Recent advances in deep learning algorithms have shown impressive progress in\nimage copy-move forgery detection (CMFD). However, these algorithms lack\ngeneralizability in practical scenarios where the copied regions are not\npresent in the training images, or the cloned regions are part of the\nbackground. Additionally, these algorithms utilize convolution operations to\ndistinguish source and target regions, leading to unsatisfactory results when\nthe target regions blend well with the background. To address these\nlimitations, this study proposes a novel end-to-end CMFD framework that\nintegrates the strengths of conventional and deep learning methods.\nSpecifically, the study develops a deep cross-scale PatchMatch (PM) method that\nis customized for CMFD to locate copy-move regions. Unlike existing deep\nmodels, our approach utilizes features extracted from high-resolution scales to\nseek explicit and reliable point-to-point matching between source and target\nregions. Furthermore, we propose a novel pairwise rank learning framework to\nseparate source and target regions. By leveraging the strong prior of\npoint-to-point matches, the framework can identify subtle differences and\neffectively discriminate between source and target regions, even when the\ntarget regions blend well with the background. Our framework is fully\ndifferentiable and can be trained end-to-end. 
Comprehensive experimental\nresults highlight the remarkable generalizability of our scheme across various\ncopy-move scenarios, significantly outperforming existing methods.\n","authors":["Yuanman Li","Yingjie He","Changsheng Chen","Li Dong","Bin Li","Jiantao Zhou","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2404.17310v1.pdf","comment":"16 pages, 14figures"},{"id":"http://arxiv.org/abs/2404.17302v1","updated":"2024-04-26T10:18:17Z","published":"2024-04-26T10:18:17Z","title":"Part-Guided 3D RL for Sim2Real Articulated Object Manipulation","summary":" Manipulating unseen articulated objects through visual feedback is a critical\nbut challenging task for real robots. Existing learning-based solutions mainly\nfocus on visual affordance learning or other pre-trained visual models to guide\nmanipulation policies, which face challenges for novel instances in real-world\nscenarios. In this paper, we propose a novel part-guided 3D RL framework, which\ncan learn to manipulate articulated objects without demonstrations. We combine\nthe strengths of 2D segmentation and 3D RL to improve the efficiency of RL\npolicy training. To improve the stability of the policy on real robots, we\ndesign a Frame-consistent Uncertainty-aware Sampling (FUS) strategy to get a\ncondensed and hierarchical 3D representation. In addition, a single versatile\nRL policy can be trained on multiple articulated object manipulation tasks\nsimultaneously in simulation and shows great generalizability to novel\ncategories and instances. Experimental results demonstrate the effectiveness of\nour framework in both simulation and real-world settings. Our code is available\nat\nhttps://github.com/THU-VCLab/Part-Guided-3D-RL-for-Sim2Real-Articulated-Object-Manipulation.\n","authors":["Pengwei Xie","Rui Chen","Siang Chen","Yuzhe Qin","Fanbo Xiang","Tianyu Sun","Jing Xu","Guijin Wang","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2404.17302v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2404.13108v2","updated":"2024-04-26T10:10:52Z","published":"2024-04-19T16:19:30Z","title":"RegWSI: Whole Slide Image Registration using Combined Deep Feature- and\n Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge","summary":" The automatic registration of differently stained whole slide images (WSIs)\nis crucial for improving diagnosis and prognosis by fusing complementary\ninformation emerging from different visible structures. It is also useful to\nquickly transfer annotations between consecutive or restained slides, thus\nsignificantly reducing the annotation time and associated costs. Nevertheless,\nthe slide preparation is different for each stain and the tissue undergoes\ncomplex and large deformations. Therefore, a robust, efficient, and accurate\nregistration method is highly desired by the scientific community and hospitals\nspecializing in digital pathology. We propose a two-step hybrid method\nconsisting of (i) deep learning- and feature-based initial alignment algorithm,\nand (ii) intensity-based nonrigid registration using the instance optimization.\nThe proposed method does not require any fine-tuning to a particular dataset\nand can be used directly for any desired tissue type and stain. The method\nscored 1st place in the ACROBAT 2023 challenge. We evaluated using three open\ndatasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and performed several\nablation studies concerning the resolution used for registration and the\ninitial alignment robustness and stability. 
The method achieves the most\naccurate results for the ACROBAT dataset, the cell-level registration accuracy\nfor the restained slides from the HyReCo dataset, and is among the best methods\nevaluated on the ANHIR dataset. The method does not require any fine-tuning to\na new datasets and can be used out-of-the-box for other types of microscopic\nimages. The method is incorporated into the DeeperHistReg framework, allowing\nothers to directly use it to register, transform, and save the WSIs at any\ndesired pyramid level. The proposed method is a significant contribution to the\nWSI registration, thus advancing the field of digital pathology.\n","authors":["Marek Wodzinski","Niccolò Marini","Manfredo Atzori","Henning Müller"],"pdf_url":"https://arxiv.org/pdf/2404.13108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13816v2","updated":"2024-04-26T09:43:37Z","published":"2024-04-22T01:36:50Z","title":"Neural Radiance Field in Autonomous Driving: A Survey","summary":" Neural Radiance Field (NeRF) has garnered significant attention from both\nacademia and industry due to its intrinsic advantages, particularly its\nimplicit representation and novel view synthesis capabilities. With the rapid\nadvancements in deep learning, a multitude of methods have emerged to explore\nthe potential applications of NeRF in the domain of Autonomous Driving (AD).\nHowever, a conspicuous void is apparent within the current literature. To\nbridge this gap, this paper conducts a comprehensive survey of NeRF's\napplications in the context of AD. Our survey is structured to categorize\nNeRF's applications in Autonomous Driving (AD), specifically encompassing\nperception, 3D reconstruction, simultaneous localization and mapping (SLAM),\nand simulation. We delve into in-depth analysis and summarize the findings for\neach application category, and conclude by providing insights and discussions\non future directions in this field. We hope this paper serves as a\ncomprehensive reference for researchers in this domain. To the best of our\nknowledge, this is the first survey specifically focused on the applications of\nNeRF in the Autonomous Driving domain.\n","authors":["Lei He","Leheng Li","Wenchao Sun","Zeyu Han","Yichen Liu","Sifa Zheng","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.13816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10711v3","updated":"2024-04-26T09:38:10Z","published":"2024-01-19T14:21:46Z","title":"Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal\n Models for Video Question Answering","summary":" Video Question Answering (VideoQA) aims to answer natural language questions\nbased on the information observed in videos. Despite the recent success of\nLarge Multimodal Models (LMMs) in image-language understanding and reasoning,\nthey deal with VideoQA insufficiently, by simply taking uniformly sampled\nframes as visual inputs, which ignores question-relevant visual clues.\nMoreover, there are no human annotations for question-critical timestamps in\nexisting VideoQA datasets. In light of this, we propose a novel weakly\nsupervised framework to enforce the LMMs to reason out the answers with\nquestion-critical moments as visual inputs. Specifically, we first fuse the\nquestion and answer pairs as event descriptions to find multiple keyframes as\ntarget moments and pseudo-labels, with the visual-language alignment capability\nof the CLIP models. 
With these pseudo-labeled keyframes as additionally weak\nsupervision, we devise a lightweight Gaussian-based Contrastive Grounding (GCG)\nmodule. GCG learns multiple Gaussian functions to characterize the temporal\nstructure of the video, and sample question-critical frames as positive moments\nto be the visual inputs of LMMs. Extensive experiments on several benchmarks\nverify the effectiveness of our framework, and we achieve substantial\nimprovements compared to previous state-of-the-art methods.\n","authors":["Haibo Wang","Chenghang Lai","Yixuan Sun","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10711v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00923v2","updated":"2024-04-26T09:32:11Z","published":"2023-10-02T06:33:06Z","title":"Lightweight Regression Model with Prediction Interval Estimation for\n Computer Vision-based Winter Road Surface Condition Monitoring","summary":" Winter conditions pose several challenges for automated driving applications.\nA key challenge during winter is accurate assessment of road surface condition,\nas its impact on friction is a critical parameter for safely and reliably\ncontrolling a vehicle. This paper proposes a deep learning regression model,\nSIWNet, capable of estimating road surface friction properties from camera\nimages. SIWNet extends state of the art by including an uncertainty estimation\nmechanism in the architecture. This is achieved by including an additional head\nin the network, which estimates a prediction interval. The prediction interval\nhead is trained with a maximum likelihood loss function. The model was trained\nand tested with the SeeingThroughFog dataset, which features corresponding road\nfriction sensor readings and images from an instrumented vehicle. Acquired\nresults highlight the functionality of the prediction interval estimation of\nSIWNet, while the network also achieved similar point estimate accuracy as the\nprevious state of the art. Furthermore, the SIWNet architecture is several\ntimes more lightweight than the previously applied state-of-the-art model,\nresulting in more practical and efficient deployment.\n","authors":["Risto Ojala","Alvari Seppänen"],"pdf_url":"https://arxiv.org/pdf/2310.00923v2.pdf","comment":"Published in IEEE Transactions on Intelligent Vehicles (2024)"},{"id":"http://arxiv.org/abs/2404.17275v1","updated":"2024-04-26T09:29:55Z","published":"2024-04-26T09:29:55Z","title":"Adversarial Reweighting with $α$-Power Maximization for Domain\n Adaptation","summary":" The practical Domain Adaptation (DA) tasks, e.g., Partial DA (PDA), open-set\nDA, universal DA, and test-time adaptation, have gained increasing attention in\nthe machine learning community. In this paper, we propose a novel approach,\ndubbed Adversarial Reweighting with $\\alpha$-Power Maximization (ARPM), for PDA\nwhere the source domain contains private classes absent in target domain. In\nARPM, we propose a novel adversarial reweighting model that adversarially\nlearns to reweight source domain data to identify source-private class samples\nby assigning smaller weights to them, for mitigating potential negative\ntransfer. Based on the adversarial reweighting, we train the transferable\nrecognition model on the reweighted source distribution to be able to classify\ncommon class data. 
To reduce the prediction uncertainty of the recognition\nmodel on the target domain for PDA, we present an $\\alpha$-power maximization\nmechanism in ARPM, which enriches the family of losses for reducing the\nprediction uncertainty for PDA. Extensive experimental results on five PDA\nbenchmarks, i.e., Office-31, Office-Home, VisDA-2017, ImageNet-Caltech, and\nDomainNet, show that our method is superior to recent PDA methods. Ablation\nstudies also confirm the effectiveness of components in our approach. To\ntheoretically analyze our method, we deduce an upper bound of target domain\nexpected error for PDA, which is approximately minimized in our approach. We\nfurther extend ARPM to open-set DA, universal DA, and test time adaptation, and\nverify the usefulness through experiments.\n","authors":["Xiang Gu","Xi Yu","Yan Yang","Jian Sun","Zongben Xu"],"pdf_url":"https://arxiv.org/pdf/2404.17275v1.pdf","comment":"To appear in IJCV"},{"id":"http://arxiv.org/abs/2404.17273v1","updated":"2024-04-26T09:25:18Z","published":"2024-04-26T09:25:18Z","title":"3SHNet: Boosting Image-Sentence Retrieval via Visual Semantic-Spatial\n Self-Highlighting","summary":" In this paper, we propose a novel visual Semantic-Spatial Self-Highlighting\nNetwork (termed 3SHNet) for high-precision, high-efficiency and\nhigh-generalization image-sentence retrieval. 3SHNet highlights the salient\nidentification of prominent objects and their spatial locations within the\nvisual modality, thus allowing the integration of visual semantics-spatial\ninteractions and maintaining independence between two modalities. This\nintegration effectively combines object regions with the corresponding semantic\nand position layouts derived from segmentation to enhance the visual\nrepresentation. And the modality-independence guarantees efficiency and\ngeneralization. Additionally, 3SHNet utilizes the structured contextual visual\nscene information from segmentation to conduct the local (region-based) or\nglobal (grid-based) guidance and achieve accurate hybrid-level retrieval.\nExtensive experiments conducted on MS-COCO and Flickr30K benchmarks\nsubstantiate the superior performances, inference efficiency and generalization\nof the proposed 3SHNet when juxtaposed with contemporary state-of-the-art\nmethodologies. Specifically, on the larger MS-COCO 5K test set, we achieve\n16.3%, 24.8%, and 18.3% improvements in terms of rSum score, respectively,\ncompared with the state-of-the-art methods using different image\nrepresentations, while maintaining optimal retrieval efficiency. Moreover, our\nperformance on cross-dataset generalization improves by 18.6%. Data and code\nare available at https://github.com/XuriGe1995/3SHNet.\n","authors":["Xuri Ge","Songpei Xu","Fuhai Chen","Jie Wang","Guoxin Wang","Shan An","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2404.17273v1.pdf","comment":"Accepted Information Processing and Management (IP&M), 10 pages, 9\n figures and 8 tables"},{"id":"http://arxiv.org/abs/2404.17255v1","updated":"2024-04-26T08:51:31Z","published":"2024-04-26T08:51:31Z","title":"SDFD: Building a Versatile Synthetic Face Image Dataset with Diverse\n Attributes","summary":" AI systems rely on extensive training on large datasets to address various\ntasks. However, image-based systems, particularly those used for demographic\nattribute prediction, face significant challenges. 
Many current face image\ndatasets primarily focus on demographic factors such as age, gender, and skin\ntone, overlooking other crucial facial attributes like hairstyle and\naccessories. This narrow focus limits the diversity of the data and\nconsequently the robustness of AI systems trained on them. This work aims to\naddress this limitation by proposing a methodology for generating synthetic\nface image datasets that capture a broader spectrum of facial diversity.\nSpecifically, our approach integrates a systematic prompt formulation strategy,\nencompassing not only demographics and biometrics but also non-permanent traits\nlike make-up, hairstyle, and accessories. These prompts guide a\nstate-of-the-art text-to-image model in generating a comprehensive dataset of\nhigh-quality realistic images and can be used as an evaluation set in face\nanalysis systems. Compared to existing datasets, our proposed dataset proves\nequally or more challenging in image classification tasks while being much\nsmaller in size.\n","authors":["Georgia Baltsou","Ioannis Sarridis","Christos Koutlis","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.17255v1.pdf","comment":"2024 18th International Conference on Automatic Face and Gesture\n Recognition (FG)"},{"id":"http://arxiv.org/abs/2404.15956v2","updated":"2024-04-26T08:51:10Z","published":"2024-04-24T16:23:34Z","title":"A Survey on Visual Mamba","summary":" State space models (SSMs) with selection mechanisms and hardware-aware\narchitectures, namely Mamba, have recently demonstrated significant promise in\nlong-sequence modeling. Since the self-attention mechanism in transformers has\nquadratic complexity with image size and increasing computational demands, the\nresearchers are now exploring how to adapt Mamba for computer vision tasks.\nThis paper is the first comprehensive survey aiming to provide an in-depth\nanalysis of Mamba models in the field of computer vision. It begins by\nexploring the foundational concepts contributing to Mamba's success, including\nthe state space model framework, selection mechanisms, and hardware-aware\ndesign. Next, we review these vision mamba models by categorizing them into\nfoundational ones and enhancing them with techniques such as convolution,\nrecurrence, and attention to improve their sophistication. We further delve\ninto the widespread applications of Mamba in vision tasks, which include their\nuse as a backbone in various levels of vision processing. This encompasses\ngeneral visual tasks, Medical visual tasks (e.g., 2D / 3D segmentation,\nclassification, and image registration, etc.), and Remote Sensing visual tasks.\nWe specially introduce general visual tasks from two levels: High/Mid-level\nvision (e.g., Object detection, Segmentation, Video classification, etc.) and\nLow-level vision (e.g., Image super-resolution, Image restoration, Visual\ngeneration, etc.). 
We hope this endeavor will spark additional interest within\nthe community to address current challenges and further apply Mamba models in\ncomputer vision.\n","authors":["Hanwei Zhang","Ying Zhu","Dan Wang","Lijun Zhang","Tianxiang Chen","Zi Ye"],"pdf_url":"https://arxiv.org/pdf/2404.15956v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17254v1","updated":"2024-04-26T08:50:35Z","published":"2024-04-26T08:50:35Z","title":"Trinity Detector:text-assisted and attention mechanisms based spectral\n fusion for diffusion generation image detection","summary":" Artificial Intelligence Generated Content (AIGC) techniques, represented by\ntext-to-image generation, have led to a malicious use of deep forgeries,\nraising concerns about the trustworthiness of multimedia content. Adapting\ntraditional forgery detection methods to diffusion models proves challenging.\nThus, this paper proposes a forgery detection method explicitly designed for\ndiffusion models called Trinity Detector. Trinity Detector incorporates\ncoarse-grained text features through a CLIP encoder, coherently integrating\nthem with fine-grained artifacts in the pixel domain for comprehensive\nmultimodal detection. To heighten sensitivity to diffusion-generated image\nfeatures, a Multi-spectral Channel Attention Fusion Unit (MCAF) is designed,\nextracting spectral inconsistencies through adaptive fusion of diverse\nfrequency bands and further integrating spatial co-occurrence of the two\nmodalities. Extensive experimentation validates that our Trinity Detector\nmethod outperforms several state-of-the-art methods, our performance is\ncompetitive across all datasets and up to 17.6\\% improvement in transferability\nin the diffusion datasets.\n","authors":["Jiawei Song","Dengpan Ye","Yunming Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17252v1","updated":"2024-04-26T08:47:28Z","published":"2024-04-26T08:47:28Z","title":"Comparison of self-supervised in-domain and supervised out-domain\n transfer learning for bird species recognition","summary":" Transferring the weights of a pre-trained model to assist another task has\nbecome a crucial part of modern deep learning, particularly in data-scarce\nscenarios. Pre-training refers to the initial step of training models outside\nthe current task of interest, typically on another dataset. It can be done via\nsupervised models using human-annotated datasets or self-supervised models\ntrained on unlabeled datasets. In both cases, many pre-trained models are\navailable to fine-tune for the task of interest. Interestingly, research has\nshown that pre-trained models from ImageNet can be helpful for audio tasks\ndespite being trained on image datasets. Hence, it's unclear whether in-domain\nmodels would be advantageous compared to competent out-domain models, such as\nconvolutional neural networks from ImageNet. 
Our experiments will demonstrate\nthe usefulness of in-domain models and datasets for bird species recognition by\nleveraging VICReg, a recent and powerful self-supervised method.\n","authors":["Houtan Ghaffari","Paul Devos"],"pdf_url":"https://arxiv.org/pdf/2404.17252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17253v1","updated":"2024-04-26T08:47:28Z","published":"2024-04-26T08:47:28Z","title":"Weakly Supervised Training for Hologram Verification in Identity\n Documents","summary":" We propose a method to remotely verify the authenticity of Optically Variable\nDevices (OVDs), often referred to as ``holograms'', in identity documents. Our\nmethod processes video clips captured with smartphones under common lighting\nconditions, and is evaluated on two public datasets: MIDV-HOLO and MIDV-2020.\nThanks to a weakly-supervised training, we optimize a feature extraction and\ndecision pipeline which achieves a new leading performance on MIDV-HOLO, while\nmaintaining a high recall on documents from MIDV-2020 used as attack samples.\nIt is also the first method, to date, to effectively address the photo\nreplacement attack task, and can be trained on either genuine samples, attack\nsamples, or both for increased performance. By enabling to verify OVD shapes\nand dynamics with very little supervision, this work opens the way towards the\nuse of massive amounts of unlabeled data to build robust remote identity\ndocument verification systems on commodity smartphones. Code is available at\nhttps://github.com/EPITAResearchLab/pouliquen.24.icdar\n","authors":["Glen Pouliquen","Guillaume Chiron","Joseph Chazalon","Thierry Géraud","Ahmad Montaser Awal"],"pdf_url":"https://arxiv.org/pdf/2404.17253v1.pdf","comment":"Accepted at the International Conference on Document Analysis and\n Recognition (ICDAR 2024)"},{"id":"http://arxiv.org/abs/2404.17251v1","updated":"2024-04-26T08:42:59Z","published":"2024-04-26T08:42:59Z","title":"Camera Motion Estimation from RGB-D-Inertial Scene Flow","summary":" In this paper, we introduce a novel formulation for camera motion estimation\nthat integrates RGB-D images and inertial data through scene flow. Our goal is\nto accurately estimate the camera motion in a rigid 3D environment, along with\nthe state of the inertial measurement unit (IMU). Our proposed method offers\nthe flexibility to operate as a multi-frame optimization or to marginalize\nolder data, thus effectively utilizing past measurements. To assess the\nperformance of our method, we conducted evaluations using both synthetic data\nfrom the ICL-NUIM dataset and real data sequences from the OpenLORIS-Scene\ndataset. Our results show that the fusion of these two sensors enhances the\naccuracy of camera motion estimation when compared to using only visual data.\n","authors":["Samuel Cerezo","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2404.17251v1.pdf","comment":"Accepted to CVPR2024 Workshop on Visual Odometry and Computer Vision\n Applications"},{"id":"http://arxiv.org/abs/2404.17245v1","updated":"2024-04-26T08:35:46Z","published":"2024-04-26T08:35:46Z","title":"Parameter Efficient Fine-tuning of Self-supervised ViTs without\n Catastrophic Forgetting","summary":" Artificial neural networks often suffer from catastrophic forgetting, where\nlearning new concepts leads to a complete loss of previously acquired\nknowledge. 
We observe that this issue is particularly magnified in vision\ntransformers (ViTs), where post-pre-training and fine-tuning on new tasks can\nsignificantly degrade the model's original general abilities. For instance, a\nDINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on\nImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming\nthis stability-plasticity dilemma is crucial for enabling ViTs to continuously\nlearn and adapt to new domains while preserving their initial knowledge. In\nthis work, we study two new parameter-efficient fine-tuning strategies:\n(1)~Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal\nthat using either Block Expansion or LoRA on self-supervised pre-trained ViTs\nsurpass fully fine-tuned ViTs in new domains while offering significantly\ngreater parameter efficiency. Notably, we find that Block Expansion experiences\nonly a minimal performance drop in the pre-training domain, thereby effectively\nmitigating catastrophic forgetting in pre-trained ViTs.\n","authors":["Reza Akbarian Bafghi","Nidhin Harilal","Claire Monteleoni","Maziar Raissi"],"pdf_url":"https://arxiv.org/pdf/2404.17245v1.pdf","comment":"Accepted at eLVM Workshop, CVPR, 2024"},{"id":"http://arxiv.org/abs/2404.17243v1","updated":"2024-04-26T08:31:10Z","published":"2024-04-26T08:31:10Z","title":"Binarizing Documents by Leveraging both Space and Frequency","summary":" Document Image Binarization is a well-known problem in Document Analysis and\nComputer Vision, although it is far from being solved. One of the main\nchallenges of this task is that documents generally exhibit degradations and\nacquisition artifacts that can greatly vary throughout the page. Nonetheless,\neven when dealing with a local patch of the document, taking into account the\noverall appearance of a wide portion of the page can ease the prediction by\nenriching it with semantic information on the ink and background conditions. In\nthis respect, approaches able to model both local and global information have\nbeen proven suitable for this task. In particular, recent applications of\nVision Transformer (ViT)-based models, able to model short and long-range\ndependencies via the attention mechanism, have demonstrated their superiority\nover standard Convolution-based models, which instead struggle to model global\ndependencies. In this work, we propose an alternative solution based on the\nrecently introduced Fast Fourier Convolutions, which overcomes the limitation\nof standard convolutions in modeling global information while requiring fewer\nparameters than ViTs. We validate the effectiveness of our approach via\nextensive experimental analysis considering different types of degradations.\n","authors":["Fabio Quattrini","Vittorio Pippi","Silvia Cascianelli","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.17243v1.pdf","comment":"Accepted at ICDAR2024"},{"id":"http://arxiv.org/abs/2403.00642v2","updated":"2024-04-26T08:24:11Z","published":"2024-03-01T16:22:05Z","title":"Rethinking The Uniformity Metric in Self-Supervised Learning","summary":" Uniformity plays an important role in evaluating learned representations,\nproviding insights into self-supervised learning. In our quest for effective\nuniformity metrics, we pinpoint four principled properties that such metrics\nshould possess. 
Namely, an effective uniformity metric should remain invariant\nto instance permutations and sample replications while accurately capturing\nfeature redundancy and dimensional collapse. Surprisingly, we find that the\nuniformity metric proposed by \\citet{Wang2020UnderstandingCR} fails to satisfy\nthe majority of these properties. Specifically, their metric is sensitive to\nsample replications, and can not account for feature redundancy and dimensional\ncollapse correctly. To overcome these limitations, we introduce a new\nuniformity metric based on the Wasserstein distance, which satisfies all the\naforementioned properties. Integrating this new metric in existing\nself-supervised learning methods effectively mitigates dimensional collapse and\nconsistently improves their performance on downstream tasks involving CIFAR-10\nand CIFAR-100 datasets. Code is available at\n\\url{https://github.com/statsle/WassersteinSSL}.\n","authors":["Xianghong Fang","Jian Li","Qiang Sun","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2403.00642v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17235v1","updated":"2024-04-26T08:15:43Z","published":"2024-04-26T08:15:43Z","title":"Optimizing Universal Lesion Segmentation: State Space Model-Guided\n Hierarchical Networks with Feature Importance Adjustment","summary":" Deep learning has revolutionized medical imaging by providing innovative\nsolutions to complex healthcare challenges. Traditional models often struggle\nto dynamically adjust feature importance, resulting in suboptimal\nrepresentation, particularly in tasks like semantic segmentation crucial for\naccurate structure delineation. Moreover, their static nature incurs high\ncomputational costs. To tackle these issues, we introduce Mamba-Ahnet, a novel\nintegration of State Space Model (SSM) and Advanced Hierarchical Network\n(AHNet) within the MAMBA framework, specifically tailored for semantic\nsegmentation in medical imaging.Mamba-Ahnet combines SSM's feature extraction\nand comprehension with AHNet's attention mechanisms and image reconstruction,\naiming to enhance segmentation accuracy and robustness. By dissecting images\ninto patches and refining feature comprehension through self-attention\nmechanisms, the approach significantly improves feature resolution. Integration\nof AHNet into the MAMBA framework further enhances segmentation performance by\nselectively amplifying informative regions and facilitating the learning of\nrich hierarchical representations. Evaluation on the Universal Lesion\nSegmentation dataset demonstrates superior performance compared to\nstate-of-the-art techniques, with notable metrics such as a Dice similarity\ncoefficient of approximately 98% and an Intersection over Union of about 83%.\nThese results underscore the potential of our methodology to enhance diagnostic\naccuracy, treatment planning, and ultimately, patient outcomes in clinical\npractice. By addressing the limitations of traditional models and leveraging\nthe power of deep learning, our approach represents a significant step forward\nin advancing medical imaging technology.\n","authors":["Kazi Shahriar Sanjid","Md. Tanzim Hossain","Md. Shakib Shahariar Junayed","M. 
Monir Uddin"],"pdf_url":"https://arxiv.org/pdf/2404.17235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17230v1","updated":"2024-04-26T08:02:07Z","published":"2024-04-26T08:02:07Z","title":"ObjectAdd: Adding Objects into Image via a Training-Free Diffusion\n Modification Fashion","summary":" We introduce ObjectAdd, a training-free diffusion modification method to add\nuser-expected objects into user-specified area. The motive of ObjectAdd stems\nfrom: first, describing everything in one prompt can be difficult, and second,\nusers often need to add objects into the generated image. To accommodate with\nreal world, our ObjectAdd maintains accurate image consistency after adding\nobjects with technical innovations in: (1) embedding-level concatenation to\nensure correct text embedding coalesce; (2) object-driven layout control with\nlatent and attention injection to ensure objects accessing user-specified area;\n(3) prompted image inpainting in an attention refocusing & object expansion\nfashion to ensure rest of the image stays the same. With a text-prompted image,\nour ObjectAdd allows users to specify a box and an object, and achieves: (1)\nadding object inside the box area; (2) exact content outside the box area; (3)\nflawless fusion between the two areas\n","authors":["Ziyue Zhang","Mingbao Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.17230v1.pdf","comment":"12 pages, submitted to ECCV2024"},{"id":"http://arxiv.org/abs/2308.09375v3","updated":"2024-04-26T07:52:13Z","published":"2023-08-18T08:10:41Z","title":"Image Processing and Machine Learning for Hyperspectral Unmixing: An\n Overview and the HySUPP Python Package","summary":" Spectral pixels are often a mixture of the pure spectra of the materials,\ncalled endmembers, due to the low spatial resolution of hyperspectral sensors,\ndouble scattering, and intimate mixtures of materials in the scenes. Unmixing\nestimates the fractional abundances of the endmembers within the pixel.\nDepending on the prior knowledge of endmembers, linear unmixing can be divided\ninto three main groups: supervised, semi-supervised, and unsupervised (blind)\nlinear unmixing. Advances in Image processing and machine learning\nsubstantially affected unmixing. This paper provides an overview of advanced\nand conventional unmixing approaches. Additionally, we draw a critical\ncomparison between advanced and conventional techniques from the three\ncategories. We compare the performance of the unmixing techniques on three\nsimulated and two real datasets. The experimental results reveal the advantages\nof different unmixing categories for different unmixing scenarios. Moreover, we\nprovide an open-source Python-based package available at\nhttps://github.com/BehnoodRasti/HySUPP to reproduce the results.\n","authors":["Behnood Rasti","Alexandre Zouaoui","Julien Mairal","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2308.09375v3.pdf","comment":"IEEE Transactions on Geoscience and Remote Sensing, 2024"},{"id":"http://arxiv.org/abs/2404.17221v1","updated":"2024-04-26T07:48:00Z","published":"2024-04-26T07:48:00Z","title":"SAGHOG: Self-Supervised Autoencoder for Generating HOG Features for\n Writer Retrieval","summary":" This paper introduces SAGHOG, a self-supervised pretraining strategy for\nwriter retrieval using HOG features of the binarized input image. 
Our\npreprocessing involves the application of the Segment Anything technique to\nextract handwriting from various datasets, ending up with about 24k documents,\nfollowed by training a vision transformer on reconstructing masked patches of\nthe handwriting. SAGHOG is then finetuned by appending NetRVLAD as an encoding\nlayer to the pretrained encoder. Evaluation of our approach on three historical\ndatasets, Historical-WI, HisFrag20, and GRK-Papyri, demonstrates the\neffectiveness of SAGHOG for writer retrieval. Additionally, we provide ablation\nstudies on our architecture and evaluate un- and supervised finetuning.\nNotably, on HisFrag20, SAGHOG outperforms related work with a mAP of 57.2 % - a\nmargin of 11.6 % to the current state of the art, showcasing its robustness on\nchallenging data, and is competitive on even small datasets, e.g. GRK-Papyri,\nwhere we achieve a Top-1 accuracy of 58.0%.\n","authors":["Marco Peer","Florian Kleber","Robert Sablatnig"],"pdf_url":"https://arxiv.org/pdf/2404.17221v1.pdf","comment":"accepted for ICDAR2024"},{"id":"http://arxiv.org/abs/2404.16296v2","updated":"2024-04-26T07:45:58Z","published":"2024-04-25T02:28:16Z","title":"Research on Splicing Image Detection Algorithms Based on Natural Image\n Statistical Characteristics","summary":" With the development and widespread application of digital image processing\ntechnology, image splicing has become a common method of image manipulation,\nraising numerous security and legal issues. This paper introduces a new\nsplicing image detection algorithm based on the statistical characteristics of\nnatural images, aimed at improving the accuracy and efficiency of splicing\nimage detection. By analyzing the limitations of traditional methods, we have\ndeveloped a detection framework that integrates advanced statistical analysis\ntechniques and machine learning methods. The algorithm has been validated using\nmultiple public datasets, showing high accuracy in detecting spliced edges and\nlocating tampered areas, as well as good robustness. Additionally, we explore\nthe potential applications and challenges faced by the algorithm in real-world\nscenarios. This research not only provides an effective technological means for\nthe field of image tampering detection but also offers new ideas and methods\nfor future related research.\n","authors":["Ao Xiang","Jingyu Zhang","Qin Yang","Liyang Wang","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.16296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17215v1","updated":"2024-04-26T07:42:20Z","published":"2024-04-26T07:42:20Z","title":"SLAM for Indoor Mapping of Wide Area Construction Environments","summary":" Simultaneous localization and mapping (SLAM), i.e., the reconstruction of the\nenvironment represented by a (3D) map and the concurrent pose estimation, has\nmade astonishing progress. Meanwhile, large scale applications aiming at the\ndata collection in complex environments like factory halls or construction\nsites are becoming feasible. However, in contrast to small scale scenarios with\nbuilding interiors separated to single rooms, shop floors or construction areas\nrequire measures at larger distances in potentially texture less areas under\ndifficult illumination. Pose estimation is further aggravated since no GNSS\nmeasures are available as it is usual for such indoor applications. In our\nwork, we realize data collection in a large factory hall by a robot system\nequipped with four stereo cameras as well as a 3D laser scanner. 
We apply our\nstate-of-the-art LiDAR and visual SLAM approaches and discuss the respective\npros and cons of the different sensor types for trajectory estimation and dense\nmap generation in such an environment. Additionally, dense and accurate depth\nmaps are generated by 3D Gaussian splatting, which we plan to use in the\ncontext of our project aiming on the automatic construction and site\nmonitoring.\n","authors":["Vincent Ress","Wei Zhang","David Skuddis","Norbert Haala","Uwe Soergel"],"pdf_url":"https://arxiv.org/pdf/2404.17215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17212v1","updated":"2024-04-26T07:40:37Z","published":"2024-04-26T07:40:37Z","title":"Scrutinizing Data from Sky: An Examination of Its Veracity in Area Based\n Traffic Contexts","summary":" Traffic data collection has been an overwhelming task for researchers as well\nas authorities over the years. With the advancement in technology and\nintroduction of various tools for processing and extracting traffic data the\ntask has been made significantly convenient. Data from Sky (DFS) is one such\ntool, based on image processing and artificial intelligence (AI), that provides\noutput for macroscopic as well as microscopic variables of the traffic streams.\nThe company claims to provide 98 to 100 percent accuracy on the data exported\nusing DFS tool. The tool is widely used in developed countries where the\ntraffic is homogenous and has lane-based movements. In this study, authors have\nchecked the veracity of DFS tool in heterogenous and area-based traffic\nmovement that is prevailing in most developing countries. The validation is\ndone using various methods using Classified Volume Count (CVC), Space Mean\nSpeeds (SMS) of individual vehicle classes and microscopic trajectory of probe\nvehicle to verify DFS claim. The error for CVCs for each vehicle class present\nin the traffic stream is estimated. Mean Absolute Percentage Error (MAPE)\nvalues are calculated for average speeds of each vehicle class between manually\nand DFS extracted space mean speeds (SMSs), and the microscopic trajectories\nare validated using a GPS based tracker put on probe vehicles. The results are\nfairly accurate in the case of data taken from a bird eye view with least\nerrors. The other configurations of data collection have some significant\nerrors, that are majorly caused by the varied traffic composition, the view of\ncamera angle, and the direction of traffic.\n","authors":["Yawar Ali","Krishnan K N","Debashis Ray Sarkar","K. Ramachandra Rao","Niladri Chatterjee","Ashish Bhaskar"],"pdf_url":"https://arxiv.org/pdf/2404.17212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18573v2","updated":"2024-04-26T07:38:28Z","published":"2024-02-28T18:59:31Z","title":"UniMODE: Unified Monocular 3D Object Detection","summary":" Realizing unified monocular 3D object detection, including both indoor and\noutdoor scenes, holds great importance in applications like robot navigation.\nHowever, involving various scenarios of data to train models poses challenges\ndue to their significantly different characteristics, e.g., diverse geometry\nproperties and heterogeneous domain distributions. To address these challenges,\nwe build a detector based on the bird's-eye-view (BEV) detection paradigm,\nwhere the explicit feature projection is beneficial to addressing the geometry\nlearning ambiguity when employing multiple scenarios of data to train\ndetectors. 
Then, we split the classical BEV detection architecture into two\nstages and propose an uneven BEV grid design to handle the convergence\ninstability caused by the aforementioned challenges. Moreover, we develop a\nsparse BEV feature projection strategy to reduce computational cost and a\nunified domain alignment method to handle heterogeneous domains. Combining\nthese techniques, a unified detector UniMODE is derived, which surpasses the\nprevious state-of-the-art on the challenging Omni3D dataset (a large-scale\ndataset including both indoor and outdoor scenes) by 4.9% AP_3D, revealing the\nfirst successful generalization of a BEV detector to unified 3D object\ndetection.\n","authors":["Zhuoling Li","Xiaogang Xu","SerNam Lim","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.18573v2.pdf","comment":"This paper has been accepted for publication in CVPR2024"},{"id":"http://arxiv.org/abs/2404.17205v1","updated":"2024-04-26T07:30:32Z","published":"2024-04-26T07:30:32Z","title":"Two in One Go: Single-stage Emotion Recognition with Decoupled\n Subject-context Transformer","summary":" Emotion recognition aims to discern the emotional state of subjects within an\nimage, relying on subject-centric and contextual visual cues. Current\napproaches typically follow a two-stage pipeline: first localize subjects by\noff-the-shelf detectors, then perform emotion classification through the late\nfusion of subject and context features. However, the complicated paradigm\nsuffers from disjoint training stages and limited interaction between\nfine-grained subject-context elements. To address the challenge, we present a\nsingle-stage emotion recognition approach, employing a Decoupled\nSubject-Context Transformer (DSCT), for simultaneous subject localization and\nemotion classification. Rather than compartmentalizing training stages, we\njointly leverage box and emotion signals as supervision to enrich\nsubject-centric feature learning. Furthermore, we introduce DSCT to facilitate\ninteractions between fine-grained subject-context cues in a decouple-then-fuse\nmanner. The decoupled query token--subject queries and context\nqueries--gradually intertwine across layers within DSCT, during which spatial\nand semantic relations are exploited and aggregated. We evaluate our\nsingle-stage framework on two widely used context-aware emotion recognition\ndatasets, CAER-S and EMOTIC. Our approach surpasses two-stage alternatives with\nfewer parameter numbers, achieving a 3.39% accuracy improvement and a 6.46%\naverage precision gain on CAER-S and EMOTIC datasets, respectively.\n","authors":["Xinpeng Li","Teng Wang","Jian Zhao","Shuyi Mao","Jinbao Wang","Feng Zheng","Xiaojiang Peng","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.17205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17202v1","updated":"2024-04-26T07:23:14Z","published":"2024-04-26T07:23:14Z","title":"Self-supervised visual learning in the low-data regime: a comparative\n evaluation","summary":" Self-Supervised Learning (SSL) is a valuable and robust training methodology\nfor contemporary Deep Neural Networks (DNNs), enabling unsupervised pretraining\non a `pretext task' that does not require ground-truth labels/annotation. This\nallows efficient representation learning from massive amounts of unlabeled\ntraining data, which in turn leads to increased accuracy in a `downstream task'\nby exploiting supervised transfer learning. 
Despite the relatively\nstraightforward conceptualization and applicability of SSL, it is not always\nfeasible to collect and/or to utilize very large pretraining datasets,\nespecially when it comes to real-world application settings. In particular, in\ncases of specialized and domain-specific application scenarios, it may not be\nachievable or practical to assemble a relevant image pretraining dataset in the\norder of millions of instances or it could be computationally infeasible to\npretrain at this scale. This motivates an investigation on the effectiveness of\ncommon SSL pretext tasks, when the pretraining dataset is of relatively\nlimited/constrained size. In this context, this work introduces a taxonomy of\nmodern visual SSL methods, accompanied by detailed explanations and insights\nregarding the main categories of approaches, and, subsequently, conducts a\nthorough comparative experimental evaluation in the low-data regime, targeting\nto identify: a) what is learnt via low-data SSL pretraining, and b) how do\ndifferent SSL categories behave in such training scenarios. Interestingly, for\ndomain-specific downstream tasks, in-domain low-data SSL pretraining\noutperforms the common approach of large-scale pretraining on general datasets.\nGrounded on the obtained results, valuable insights are highlighted regarding\nthe performance of each category of SSL methods, which in turn suggest\nstraightforward future research directions in the field.\n","authors":["Sotirios Konstantakos","Despina Ioanna Chalkiadaki","Ioannis Mademlis","Yuki M. Asano","Efstratios Gavves","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.17202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17199v1","updated":"2024-04-26T07:17:09Z","published":"2024-04-26T07:17:09Z","title":"Few-shot Calligraphy Style Learning","summary":" We introduced \"Presidifussion,\" a novel approach to learning and replicating\nthe unique style of calligraphy of President Xu, using a pretrained diffusion\nmodel adapted through a two-stage training process. Initially, our model is\npretrained on a diverse dataset containing works from various calligraphers.\nThis is followed by fine-tuning on a smaller, specialized dataset of President\nXu's calligraphy, comprising just under 200 images. Our method introduces\ninnovative techniques of font image conditioning and stroke information\nconditioning, enabling the model to capture the intricate structural elements\nof Chinese characters. The effectiveness of our approach is demonstrated\nthrough a comparison with traditional methods like zi2zi and CalliGAN, with our\nmodel achieving comparable performance using significantly smaller datasets and\nreduced computational resources. This work not only presents a breakthrough in\nthe digital preservation of calligraphic art but also sets a new standard for\ndata-efficient generative modeling in the domain of cultural heritage\ndigitization.\n","authors":["Fangda Chen","Jiacheng Nie","Lichuan Jiang","Zhuoer Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.17199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10223v3","updated":"2024-04-26T06:57:43Z","published":"2023-05-17T13:56:48Z","title":"NAI$_2$: Learning Noise-Aware Illumination-Interpolator for Unsupervised\n Low-Light Image Enhancement","summary":" Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable\nadvancements in preserving image details and enhancing contrast, achieving\ncommendable results on specific datasets. 
Nevertheless, these approaches\nencounter persistent challenges in efficiently mitigating dynamic noise and\naccommodating diverse low-light scenarios. Insufficient constraints on complex\npixel-wise mapping learning lead to overfitting to specific types of noise and\nartifacts associated with low-light conditions, reducing effectiveness in\nvariable lighting scenarios. To this end, we first propose a method for\nestimating the noise level in low light images in a quick and accurate way.\nThis facilitates precise denoising, prevents over-smoothing, and adapts to\ndynamic noise patterns. Subsequently, we devise a Learnable Illumination\nInterpolator (LII), which employs learnable interpolation operations between\nthe input and unit vector to satisfy general constraints between illumination\nand input. Finally, we introduce a self-regularization loss that incorporates\nintrinsic image properties and essential visual attributes to guide the output\ntowards meeting human visual expectations. Comprehensive experiments validate\nthe competitiveness of our proposed algorithm in both qualitative and\nquantitative assessments. Notably, our noise estimation method, with linear\ntime complexity and suitable for various denoisers, significantly improves both\ndenoising and enhancement performance. Benefiting from this, our approach\nachieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT\ndataset on the LLIE task, even compared to supervised methods.\n","authors":["Xiaofeng Liu","Jiaxin Gao","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10223v3.pdf","comment":"Image processing, low-light image enhancement, noise estimation,\n illumination learning"},{"id":"http://arxiv.org/abs/2404.17186v1","updated":"2024-04-26T06:40:54Z","published":"2024-04-26T06:40:54Z","title":"MCSDNet: Mesoscale Convective System Detection Network via Multi-scale\n Spatiotemporal Information","summary":" The accurate detection of Mesoscale Convective Systems (MCS) is crucial for\nmeteorological monitoring due to their potential to cause significant\ndestruction through severe weather phenomena such as hail, thunderstorms, and\nheavy rainfall. However, the existing methods for MCS detection mostly target\nsingle-frame detection, which just considers the static characteristics and\nignores the temporal evolution in the life cycle of MCS. In this paper, we\npropose a novel encoder-decoder neural network for MCS detection (MCSDNet).\nMCSDNet has a simple architecture and is easy to expand. Different from the\nprevious models, MCSDNet targets multi-frame detection and leverages\nmulti-scale spatiotemporal information for the detection of MCS regions in\nremote sensing imagery (RSI). As far as we know, it is the first work to utilize\nmulti-scale spatiotemporal information to detect MCS regions. Firstly, we\ndesign a multi-scale spatiotemporal information module to extract multi-level\nsemantics from different encoder levels, which enables our model to extract more\ndetailed spatiotemporal features. Secondly, a Spatiotemporal Mix Unit (STMU) is\nintroduced into MCSDNet to capture both intra-frame features and inter-frame\ncorrelations, which is a scalable module and can be replaced by other\nspatiotemporal modules, e.g., CNN, RNN, Transformer and our proposed Dual\nSpatiotemporal Attention (DSTA). This means that future work on\nspatiotemporal modules can be easily integrated into our model. 
Finally, we\npresent MCSRSI, the first publicly available dataset for multi-frame MCS\ndetection based on visible channel images from the FY-4A satellite. We also\nconduct several experiments on MCSRSI and find that our proposed MCSDNet\nachieves the best performance on the MCS detection task when compared to other\nbaseline methods.\n","authors":["Jiajun Liang","Baoquan Zhang","Yunming Ye","Xutao Li","Chuyao Luo","Xukai Fu"],"pdf_url":"https://arxiv.org/pdf/2404.17186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17184v1","updated":"2024-04-26T06:30:47Z","published":"2024-04-26T06:30:47Z","title":"Low-Rank Knowledge Decomposition for Medical Foundation Models","summary":" The popularity of large-scale pre-training has promoted the development of\nmedical foundation models. However, some studies have shown that although\nfoundation models exhibit strong general feature extraction capabilities, their\nperformance on specific tasks is still inferior to task-specific methods. In\nthis paper, we explore a new perspective called ``Knowledge Decomposition'' to\nimprove the performance on specific medical tasks, which deconstructs the\nfoundation model into multiple lightweight expert models, each dedicated to a\nparticular task, with the goal of improving specialization while concurrently\nmitigating resource expenditure. To accomplish the above objective, we design a\nnovel framework named Low-Rank Knowledge Decomposition (LoRKD), which\nexplicitly separates gradients by incorporating low-rank expert modules and the\nefficient knowledge separation convolution. Extensive experimental results\ndemonstrate that the decomposed models perform well in terms of performance and\ntransferability, even surpassing the original foundation models.\n","authors":["Yuhang Zhou","Haolin Li","Siyuan Du","Jiangchao Yao","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.17184v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.17176v1","updated":"2024-04-26T06:17:04Z","published":"2024-04-26T06:17:04Z","title":"MovieChat+: Question-aware Sparse Memory for Long Video Question\n Answering","summary":" Recently, integrating video foundation models and large language models to\nbuild a video understanding system can overcome the limitations of specific\npre-defined vision tasks. Yet, existing methods either employ complex\nspatial-temporal modules or rely heavily on additional perception models to\nextract temporal features for video understanding, and they only perform well\non short videos. For long videos, the computational complexity and memory costs\nassociated with long-term temporal connections are significantly increased,\nposing additional challenges. Taking advantage of the Atkinson-Shiffrin memory\nmodel, with tokens in Transformers being employed as the carriers of memory in\ncombination with our specially designed memory mechanism, we propose MovieChat\nto overcome these challenges. We lift pre-trained multi-modal large language\nmodels for understanding long videos without incorporating additional trainable\ntemporal modules, employing a zero-shot approach. MovieChat achieves\nstate-of-the-art performance in long video understanding, along with the\nreleased MovieChat-1K benchmark with 1K long videos, 2K temporal grounding\nlabels, and 14K manual annotations for validation of the effectiveness of our\nmethod. 
The code along with the dataset can be accessed via the following\nhttps://github.com/rese1f/MovieChat.\n","authors":["Enxin Song","Wenhao Chai","Tian Ye","Jenq-Neng Hwang","Xi Li","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.17176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17173v1","updated":"2024-04-26T06:00:27Z","published":"2024-04-26T06:00:27Z","title":"Exploring Beyond Logits: Hierarchical Dynamic Labeling Based on\n Embeddings for Semi-Supervised Classification","summary":" In semi-supervised learning, methods that rely on confidence learning to\ngenerate pseudo-labels have been widely proposed. However, increasing research\nfinds that when faced with noisy and biased data, the model's representation\nnetwork is more reliable than the classification network. Additionally, label\ngeneration methods based on model predictions often show poor adaptability\nacross different datasets, necessitating customization of the classification\nnetwork. Therefore, we propose a Hierarchical Dynamic Labeling (HDL) algorithm\nthat does not depend on model predictions and utilizes image embeddings to\ngenerate sample labels. We also introduce an adaptive method for selecting\nhyperparameters in HDL, enhancing its versatility. Moreover, HDL can be\ncombined with general image encoders (e.g., CLIP) to serve as a fundamental\ndata processing module. We extract embeddings from datasets with class-balanced\nand long-tailed distributions using pre-trained semi-supervised models.\nSubsequently, samples are re-labeled using HDL, and the re-labeled samples are\nused to further train the semi-supervised models. Experiments demonstrate\nimproved model performance, validating the motivation that representation\nnetworks are more reliable than classifiers or predictors. Our approach has the\npotential to change the paradigm of pseudo-label generation in semi-supervised\nlearning.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Lingling Li","Shuyuan Yang","Xu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11894v2","updated":"2024-04-26T05:54:28Z","published":"2023-12-19T06:38:18Z","title":"3D-LFM: Lifting Foundation Model","summary":" The lifting of 3D structure and camera from 2D landmarks is at the\ncornerstone of the entire discipline of computer vision. Traditional methods\nhave been confined to specific rigid objects, such as those in\nPerspective-n-Point (PnP) problems, but deep learning has expanded our\ncapability to reconstruct a wide range of object classes (e.g. C3DPO and PAUL)\nwith resilience to noise, occlusions, and perspective distortions. All these\ntechniques, however, have been limited by the fundamental need to establish\ncorrespondences across the 3D training data -- significantly limiting their\nutility to applications where one has an abundance of \"in-correspondence\" 3D\ndata. Our approach harnesses the inherent permutation equivariance of\ntransformers to manage varying number of points per 3D data instance,\nwithstands occlusions, and generalizes to unseen categories. We demonstrate\nstate of the art performance across 2D-3D lifting task benchmarks. Since our\napproach can be trained across such a broad class of structures we refer to it\nsimply as a 3D Lifting Foundation Model (3D-LFM) -- the first of its kind.\n","authors":["Mosam Dabhi","Laszlo A. 
Jeni","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2312.11894v2.pdf","comment":"Visit the project page at https://3dlfm.github.io for links to\n additional media, code, and videos. The site also features a custom GPT\n tailored to address queries related to 3D-LFM. Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06738v3","updated":"2024-04-26T05:52:31Z","published":"2023-12-11T17:53:45Z","title":"InstructAny2Pix: Flexible Visual Editing via Multimodal Instruction\n Following","summary":" The ability to provide fine-grained control for generating and editing visual\nimagery has profound implications for computer vision and its applications.\nPrevious works have explored extending controllability in two directions:\ninstruction tuning with text-based prompts and multi-modal conditioning.\nHowever, these works make one or more unnatural assumptions on the number\nand/or type of modality inputs used to express controllability. We propose\nInstructAny2Pix, a flexible multi-modal instruction-following system that\nenables users to edit an input image using instructions involving audio,\nimages, and text. InstructAny2Pix consists of three building blocks that\nfacilitate this capability: a multi-modal encoder that encodes different\nmodalities such as images and audio into a unified latent space, a diffusion\nmodel that learns to decode representations in this latent space into images,\nand a multi-modal LLM that can understand instructions involving multiple\nimages and audio pieces and generate a conditional embedding of the desired\noutput, which can be used by the diffusion decoder. Additionally, to facilitate\ntraining efficiency and improve generation quality, we include an additional\nrefinement prior module that enhances the visual quality of LLM outputs. These\ndesigns are critical to the performance of our system. We demonstrate that our\nsystem can perform a series of novel instruction-guided editing tasks. The code\nis available at https://github.com/jacklishufan/InstructAny2Pix.git\n","authors":["Shufan Li","Harkanwar Singh","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2312.06738v3.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.17170v1","updated":"2024-04-26T05:51:57Z","published":"2024-04-26T05:51:57Z","title":"S-IQA Image Quality Assessment With Compressive Sampling","summary":" No-Reference Image Quality Assessment (IQA) aims at estimating image quality\nin accordance with subjective human perception. However, most existing NR-IQA\nmethods focus on exploring increasingly complex networks or components to\nimprove the final performance. Such practice imposes great limitations and\ncomplexity on IQA methods, especially when they are applied to high-resolution\n(HR) images in the real world. Actually, most images own high spatial\nredundancy, especially for those HR data. To further exploit the characteristic\nand alleviate the issue above, we propose a new framework for Image Quality\nAssessment with compressive Sampling (dubbed S-IQA), which consists of three\ncomponents: (1) The Flexible Sampling Module (FSM) samples the image to obtain\nmeasurements at an arbitrary ratio. (2) Vision Transformer with the Adaptive\nEmbedding Module (AEM) makes measurements of uniform size and extracts deep\nfeatures (3) Dual Branch (DB) allocates weight for every patch and predicts the\nfinal quality score. 
Experiments show that our proposed S-IQA achieves\nstate-of-the-art result on various datasets with less data usage.\n","authors":["Ronghua Liao","Chen Hui","Lang Yuan","Feng Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.17170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10962v2","updated":"2024-04-26T05:48:26Z","published":"2024-03-16T16:17:44Z","title":"Exploiting Topological Priors for Boosting Point Cloud Generation","summary":" This paper presents an innovative enhancement to the Sphere as Prior\nGenerative Adversarial Network (SP-GAN) model, a state-of-the-art GAN designed\nfor point cloud generation. A novel method is introduced for point cloud\ngeneration that elevates the structural integrity and overall quality of the\ngenerated point clouds by incorporating topological priors into the training\nprocess of the generator. Specifically, this work utilizes the K-means\nalgorithm to segment a point cloud from the repository into clusters and\nextract centroids, which are then used as priors in the generation process of\nthe SP-GAN. Furthermore, the discriminator component of the SP-GAN utilizes the\nidentical point cloud that contributed the centroids, ensuring a coherent and\nconsistent learning environment. This strategic use of centroids as intuitive\nguides not only boosts the efficiency of global feature learning but also\nsubstantially improves the structural coherence and fidelity of the generated\npoint clouds. By applying the K-means algorithm to generate centroids as the\nprior, the work intuitively and experimentally demonstrates that such a prior\nenhances the quality of generated point clouds.\n","authors":["Baiyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10962v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2301.13014v2","updated":"2024-04-26T05:27:38Z","published":"2022-12-27T05:28:38Z","title":"Attribute-Guided Multi-Level Attention Network for Fine-Grained Fashion\n Retrieval","summary":" Fine-grained fashion retrieval searches for items that share a similar\nattribute with the query image. Most existing methods use a pre-trained feature\nextractor (e.g., ResNet 50) to capture image representations. However, a\npre-trained feature backbone is typically trained for image classification and\nobject detection, which are fundamentally different tasks from fine-grained\nfashion retrieval. Therefore, existing methods suffer from a feature gap\nproblem when directly using the pre-trained backbone for fine-tuning. To solve\nthis problem, we introduce an attribute-guided multi-level attention network\n(AG-MAN). Specifically, we first enhance the pre-trained feature extractor to\ncapture multi-level image embedding, thereby enriching the low-level features\nwithin these representations. Then, we propose a classification scheme where\nimages with the same attribute, albeit with different values, are categorized\ninto the same class. This can further alleviate the feature gap problem by\nperturbing object-centric feature learning. Moreover, we propose an improved\nattribute-guided attention module for extracting more accurate\nattribute-specific representations. Our model consistently outperforms existing\nattention based methods when assessed on the FashionAI (62.8788% in MAP),\nDeepFashion (8.9804% in MAP), and Zappos50k datasets (93.32% in Prediction\naccuracy). Especially, ours improves the most typical ASENet_V2 model by 2.12%,\n0.31%, and 0.78% points in FashionAI, DeepFashion, and Zappos50k datasets,\nrespectively. 
The source code is available in\nhttps://github.com/Dr-LingXiao/AG-MAN.\n","authors":["Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2301.13014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16573v2","updated":"2024-04-26T05:17:14Z","published":"2024-04-25T12:35:27Z","title":"Multi-Scale Representations by Varying Window Attention for Semantic\n Segmentation","summary":" Multi-scale learning is central to semantic segmentation. We visualize the\neffective receptive field (ERF) of canonical multi-scale representations and\npoint out two risks in learning them: scale inadequacy and field inactivation.\nA novel multi-scale learner, varying window attention (VWA), is presented to\naddress these issues. VWA leverages the local window attention (LWA) and\ndisentangles LWA into the query window and context window, allowing the\ncontext's scale to vary for the query to learn representations at multiple\nscales. However, varying the context to large-scale windows (enlarging ratio R)\ncan significantly increase the memory footprint and computation cost (R^2 times\nlarger than LWA). We propose a simple but professional re-scaling strategy to\nzero the extra induced cost without compromising performance. Consequently, VWA\nuses the same cost as LWA to overcome the receptive limitation of the local\nwindow. Furthermore, depending on VWA and employing various MLPs, we introduce\na multi-scale decoder (MSD), VWFormer, to improve multi-scale representations\nfor semantic segmentation. VWFormer achieves efficiency competitive with the\nmost compute-friendly MSDs, like FPN and MLP decoder, but performs much better\nthan any MSDs. For instance, using nearly half of UPerNet's computation,\nVWFormer outperforms it by 1.0%-2.5% mIoU on ADE20K. With little extra\noverhead, ~10G FLOPs, Mask2Former armed with VWFormer improves by 1.0%-1.3%.\nThe code and models are available at https://github.com/yan-hao-tian/vw\n","authors":["Haotian Yan","Ming Wu","Chuang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16573v2.pdf","comment":"ICLR2024 Poster"},{"id":"http://arxiv.org/abs/2404.17159v1","updated":"2024-04-26T05:06:53Z","published":"2024-04-26T05:06:53Z","title":"Phase-aggregated Dual-branch Network for Efficient Fingerprint Dense\n Registration","summary":" Fingerprint dense registration aims to finely align fingerprint pairs at the\npixel level, thereby reducing intra-class differences caused by distortion.\nUnfortunately, traditional methods exhibited subpar performance when dealing\nwith low-quality fingerprints while suffering from slow inference speed.\nAlthough deep learning based approaches shows significant improvement in these\naspects, their registration accuracy is still unsatisfactory. In this paper, we\npropose a Phase-aggregated Dual-branch Registration Network (PDRNet) to\naggregate the advantages of both types of methods. A dual-branch structure with\nmulti-stage interactions is introduced between correlation information at high\nresolution and texture feature at low resolution, to perceive local fine\ndifferences while ensuring global stability. 
Extensive experiments are\nconducted on more comprehensive databases compared to previous works.\nExperimental results demonstrate that our method reaches the state-of-the-art\nregistration performance in terms of accuracy and robustness, while maintaining\nconsiderable competitiveness in efficiency.\n","authors":["Xiongjun Guan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17152v1","updated":"2024-04-26T04:52:45Z","published":"2024-04-26T04:52:45Z","title":"CSCO: Connectivity Search of Convolutional Operators","summary":" Exploring dense connectivity of convolutional operators establishes critical\n\"synapses\" to communicate feature vectors from different levels and enriches\nthe set of transformations on Computer Vision applications. Yet, even with\nheavy-machinery approaches such as Neural Architecture Search (NAS),\ndiscovering effective connectivity patterns requires tremendous efforts due to\neither constrained connectivity design space or a sub-optimal exploration\nprocess induced by an unconstrained search space. In this paper, we propose\nCSCO, a novel paradigm that fabricates effective connectivity of convolutional\noperators with minimal utilization of existing design motifs and further\nutilizes the discovered wiring to construct high-performing ConvNets. CSCO\nguides the exploration via a neural predictor as a surrogate of the\nground-truth performance. We introduce Graph Isomorphism as data augmentation\nto improve sample efficiency and propose a Metropolis-Hastings Evolutionary\nSearch (MH-ES) to evade locally optimal architectures and advance search\nquality. Results on ImageNet show ~0.6% performance improvement over\nhand-crafted and NAS-crafted dense connectivity. Our code is publicly\navailable.\n","authors":["Tunhou Zhang","Shiyu Li","Hsin-Pai Cheng","Feng Yan","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17152v1.pdf","comment":"To appear on Proceedings of the IEEE/CVF Conference on Computer\n Vision and Pattern Recognition (CVPR) Workshops (2024)"},{"id":"http://arxiv.org/abs/2404.17151v1","updated":"2024-04-26T04:49:42Z","published":"2024-04-26T04:49:42Z","title":"MorphText: Deep Morphology Regularized Arbitrary-shape Scene Text\n Detection","summary":" Bottom-up text detection methods play an important role in arbitrary-shape\nscene text detection but there are two restrictions preventing them from\nachieving their great potential, i.e., 1) the accumulation of false text\nsegment detections, which affects subsequent processing, and 2) the difficulty\nof building reliable connections between text segments. Targeting these two\nproblems, we propose a novel approach, named ``MorphText\", to capture the\nregularity of texts by embedding deep morphology for arbitrary-shape text\ndetection. Towards this end, two deep morphological modules are designed to\nregularize text segments and determine the linkage between them. First, a Deep\nMorphological Opening (DMOP) module is constructed to remove false text segment\ndetections generated in the feature extraction process. Then, a Deep\nMorphological Closing (DMCL) module is proposed to allow text instances of\nvarious shapes to stretch their morphology along their most significant\norientation while deriving their connections. 
Extensive experiments conducted\non four challenging benchmark datasets (CTW1500, Total-Text, MSRA-TD500 and\nICDAR2017) demonstrate that our proposed MorphText outperforms both top-down\nand bottom-up state-of-the-art arbitrary-shape scene text detection approaches.\n","authors":["Chengpei Xu","Wenjing Jia","Ruomei Wang","Xiaonan Luo","Xiangjian He"],"pdf_url":"https://arxiv.org/pdf/2404.17151v1.pdf","comment":"Accepted by Transaction on Multimedia"},{"id":"http://arxiv.org/abs/2404.17149v1","updated":"2024-04-26T04:44:23Z","published":"2024-04-26T04:44:23Z","title":"Pose-Specific 3D Fingerprint Unfolding","summary":" In order to make 3D fingerprints compatible with traditional 2D flat\nfingerprints, a common practice is to unfold the 3D fingerprint into a 2D\nrolled fingerprint, which is then matched with the flat fingerprints by\ntraditional 2D fingerprint recognition algorithms. The problem with this method\nis that there may be large elastic deformation between the unfolded rolled\nfingerprint and flat fingerprint, which affects the recognition rate. In this\npaper, we propose a pose-specific 3D fingerprint unfolding algorithm to unfold\nthe 3D fingerprint using the same pose as the flat fingerprint. Our experiments\nshow that the proposed unfolding algorithm improves the compatibility between\n3D fingerprint and flat fingerprint and thus leads to higher genuine matching\nscores.\n","authors":["Xiongjun Guan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17148v1","updated":"2024-04-26T04:35:42Z","published":"2024-04-26T04:35:42Z","title":"Direct Regression of Distortion Field from a Single Fingerprint Image","summary":" Skin distortion is a long standing challenge in fingerprint matching, which\ncauses false non-matches. Previous studies have shown that the recognition rate\ncan be improved by estimating the distortion field from a distorted fingerprint\nand then rectifying it into a normal fingerprint. However, existing\nrectification methods are based on principal component representation of\ndistortion fields, which is not accurate and are very sensitive to finger pose.\nIn this paper, we propose a rectification method where a self-reference based\nnetwork is utilized to directly estimate the dense distortion field of\ndistorted fingerprint instead of its low dimensional representation. This\nmethod can output accurate distortion fields of distorted fingerprints with\nvarious finger poses. Considering the limited number and variety of distorted\nfingerprints in the existing public dataset, we collected more distorted\nfingerprints with diverse finger poses and distortion patterns as a new\ndatabase. Experimental results demonstrate that our proposed method achieves\nthe state-of-the-art rectification performance in terms of distortion field\nestimation and rectified fingerprint matching.\n","authors":["Xiongjun Guan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v1","updated":"2024-04-26T04:34:45Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. 
Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00260v3","updated":"2024-04-26T03:59:41Z","published":"2023-12-30T15:24:50Z","title":"GazeCLIP: Towards Enhancing Gaze Estimation via Text Guidance","summary":" Over the past decade, visual gaze estimation has garnered increasing\nattention within the research community, owing to its wide-ranging application\nscenarios. While existing estimation approaches have achieved remarkable\nsuccess in enhancing prediction accuracy, they primarily infer gaze from\nsingle-image signals, neglecting the potential benefits of the currently\ndominant text guidance. Notably, visual-language collaboration has been\nextensively explored across various visual tasks, such as image synthesis and\nmanipulation, leveraging the remarkable transferability of large-scale\nContrastive Language-Image Pre-training (CLIP) model. Nevertheless, existing\ngaze estimation approaches overlook the rich semantic cues conveyed by\nlinguistic signals and the priors embedded in CLIP feature space, thereby\nyielding performance setbacks. To address this gap, we delve deeply into the\ntext-eye collaboration protocol and introduce a novel gaze estimation\nframework, named GazeCLIP. Specifically, we intricately design a linguistic\ndescription generator to produce text signals with coarse directional cues.\nAdditionally, a CLIP-based backbone that excels in characterizing text-eye\npairs for gaze estimation is presented. This is followed by the implementation\nof a fine-grained multi-modal fusion module aimed at modeling the\ninterrelationships between heterogeneous inputs. 
Extensive experiments on three\nchallenging datasets demonstrate the superiority of the proposed GazeCLIP which\nachieves the state-of-the-art accuracy.\n","authors":["Jun Wang","Hao Ruan","Mingjie Wang","Chuanghui Zhang","Huachun Li","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.00260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13125v2","updated":"2024-04-26T03:16:47Z","published":"2023-11-22T03:26:07Z","title":"DAE-Net: Deforming Auto-Encoder for fine-grained shape co-segmentation","summary":" We present an unsupervised 3D shape co-segmentation method which learns a set\nof deformable part templates from a shape collection. To accommodate structural\nvariations in the collection, our network composes each shape by a selected\nsubset of template parts which are affine-transformed. To maximize the\nexpressive power of the part templates, we introduce a per-part deformation\nnetwork to enable the modeling of diverse parts with substantial geometry\nvariations, while imposing constraints on the deformation capacity to ensure\nfidelity to the originally represented parts. We also propose a training scheme\nto effectively overcome local minima. Architecturally, our network is a\nbranched autoencoder, with a CNN encoder taking a voxel shape as input and\nproducing per-part transformation matrices, latent codes, and part existence\nscores, and the decoder outputting point occupancies to define the\nreconstruction loss. Our network, coined DAE-Net for Deforming Auto-Encoder,\ncan achieve unsupervised 3D shape co-segmentation that yields fine-grained,\ncompact, and meaningful parts that are consistent across diverse shapes. We\nconduct extensive experiments on the ShapeNet Part dataset, DFAUST, and an\nanimal subset of Objaverse to show superior performance over prior methods.\nCode and data are available at https://github.com/czq142857/DAE-Net.\n","authors":["Zhiqin Chen","Qimin Chen","Hang Zhou","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13125v2.pdf","comment":"SIGGRAPH 2024 conference track"},{"id":"http://arxiv.org/abs/2401.15261v2","updated":"2024-04-26T03:12:35Z","published":"2024-01-27T01:01:58Z","title":"Vanishing-Point-Guided Video Semantic Segmentation of Driving Scenes","summary":" The estimation of implicit cross-frame correspondences and the high\ncomputational cost have long been major challenges in video semantic\nsegmentation (VSS) for driving scenes. Prior works utilize keyframes, feature\npropagation, or cross-frame attention to address these issues. By contrast, we\nare the first to harness vanishing point (VP) priors for more effective\nsegmentation. Intuitively, objects near VPs (i.e., away from the vehicle) are\nless discernible. Moreover, they tend to move radially away from the VP over\ntime in the usual case of a forward-facing camera, a straight road, and linear\nforward motion of the vehicle. Our novel, efficient network for VSS, named\nVPSeg, incorporates two modules that utilize exactly this pair of static and\ndynamic VP priors: sparse-to-dense feature mining (DenseVP) and VP-guided\nmotion fusion (MotionVP). MotionVP employs VP-guided motion estimation to\nestablish explicit correspondences across frames and help attend to the most\nrelevant features from neighboring frames, while DenseVP enhances weak dynamic\nfeatures in distant regions around VPs. These modules operate within a\ncontext-detail framework, which separates contextual features from\nhigh-resolution local features at different input resolutions to reduce\ncomputational costs. 
Contextual and local features are integrated through\ncontextualized motion attention (CMA) for the final prediction. Extensive\nexperiments on two popular driving segmentation benchmarks, Cityscapes and\nACDC, demonstrate that VPSeg outperforms previous SOTA methods, with only\nmodest computational overhead.\n","authors":["Diandian Guo","Deng-Ping Fan","Tongyu Lu","Christos Sakaridis","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2401.15261v2.pdf","comment":"CVPR 2024 highlight"},{"id":"http://arxiv.org/abs/2401.05217v3","updated":"2024-04-26T02:56:53Z","published":"2024-01-10T15:30:19Z","title":"Exploring Vulnerabilities of No-Reference Image Quality Assessment\n Models: A Query-Based Black-Box Method","summary":" No-Reference Image Quality Assessment (NR-IQA) aims to predict image quality\nscores consistent with human perception without relying on pristine reference\nimages, serving as a crucial component in various visual tasks. Ensuring the\nrobustness of NR-IQA methods is vital for reliable comparisons of different\nimage processing techniques and consistent user experiences in recommendations.\nThe attack methods for NR-IQA provide a powerful instrument to test the\nrobustness of NR-IQA. However, current attack methods of NR-IQA heavily rely on\nthe gradient of the NR-IQA model, leading to limitations when the gradient\ninformation is unavailable. In this paper, we present a pioneering query-based\nblack box attack against NR-IQA methods. We propose the concept of score\nboundary and leverage an adaptive iterative approach with multiple score\nboundaries. Meanwhile, the initial attack directions are also designed to\nleverage the characteristics of the Human Visual System (HVS). Experiments show\nour method outperforms all compared state-of-the-art attack methods and is far\nahead of previous black-box methods. The effective NR-IQA model DBCNN suffers a\nSpearman's rank-order correlation coefficient (SROCC) decline of 0.6381\nattacked by our method, revealing the vulnerability of NR-IQA models to\nblack-box attacks. The proposed attack method also provides a potent tool for\nfurther exploration into NR-IQA robustness.\n","authors":["Chenxi Yang","Yujia Liu","Dingquan Li","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.05217v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13711v2","updated":"2024-04-26T02:53:52Z","published":"2024-04-21T16:45:35Z","title":"ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis","summary":" Recent advances in generative visual models and neural radiance fields have\ngreatly boosted 3D-aware image synthesis and stylization tasks. However,\nprevious NeRF-based work is limited to single scene stylization, training a\nmodel to generate 3D-aware cartoon faces with arbitrary styles remains\nunsolved. We propose ArtNeRF, a novel face stylization framework derived from\n3D-aware GAN to tackle this problem. In this framework, we utilize an\nexpressive generator to synthesize stylized faces and a triple-branch\ndiscriminator module to improve the visual quality and style consistency of the\ngenerated faces. Specifically, a style encoder based on contrastive learning is\nleveraged to extract robust low-dimensional embeddings of style images,\nempowering the generator with the knowledge of various styles. To smooth the\ntraining process of cross-domain transfer learning, we propose an adaptive\nstyle blending module which helps inject style information and allows users to\nfreely tune the level of stylization. 
We further introduce a neural rendering\nmodule to achieve efficient real-time rendering of images with higher\nresolutions. Extensive experiments demonstrate that ArtNeRF is versatile in\ngenerating high-quality 3D-aware cartoon faces with arbitrary styles.\n","authors":["Zichen Tang","Hongyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06462v3","updated":"2024-04-26T02:53:13Z","published":"2023-09-12T17:56:06Z","title":"Action Segmentation Using 2D Skeleton Heatmaps and Multi-Modality Fusion","summary":" This paper presents a 2D skeleton-based action segmentation method with\napplications in fine-grained human activity recognition. In contrast with\nstate-of-the-art methods which directly take sequences of 3D skeleton\ncoordinates as inputs and apply Graph Convolutional Networks (GCNs) for\nspatiotemporal feature learning, our main idea is to use sequences of 2D\nskeleton heatmaps as inputs and employ Temporal Convolutional Networks (TCNs)\nto extract spatiotemporal features. Despite lacking 3D information, our\napproach yields comparable/superior performances and better robustness against\nmissing keypoints than previous methods on action segmentation datasets.\nMoreover, we improve the performances further by using both 2D skeleton\nheatmaps and RGB videos as inputs. To our best knowledge, this is the first\nwork to utilize 2D skeleton heatmap inputs and the first work to explore 2D\nskeleton+RGB fusion for action segmentation.\n","authors":["Syed Waleed Hyder","Muhammad Usama","Anas Zafar","Muhammad Naufil","Fawad Javed Fateh","Andrey Konin","M. Zeeshan Zia","Quoc-Huy Tran"],"pdf_url":"https://arxiv.org/pdf/2309.06462v3.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2306.08313v2","updated":"2024-04-26T02:29:42Z","published":"2023-06-14T07:33:04Z","title":"A Proxy Attack-Free Strategy for Practically Improving the Poisoning\n Efficiency in Backdoor Attacks","summary":" Poisoning efficiency plays a critical role in poisoning-based backdoor\nattacks. To evade detection, attackers aim to use the fewest poisoning samples\nwhile achieving the desired attack strength. Although efficient triggers have\nsignificantly improved poisoning efficiency, there is still room for further\nenhancement. Recently, selecting efficient samples has shown promise, but it\noften requires a proxy backdoor injection task to identify an efficient\npoisoning sample set. However, the proxy attack-based approach can lead to\nperformance degradation if the proxy attack settings differ from those used by\nthe actual victims due to the shortcut of backdoor learning. This paper\npresents a Proxy attack-Free Strategy (PFS) designed to identify efficient\npoisoning samples based on individual similarity and ensemble diversity,\neffectively addressing the mentioned concern. The proposed PFS is motivated by\nthe observation that selecting the to-be-poisoned samples with high similarity\nbetween clean samples and their corresponding poisoning samples results in\nsignificantly higher attack success rates compared to using samples with low\nsimilarity. Furthermore, theoretical analyses for this phenomenon are provided\nbased on the theory of active learning and neural tangent kernel. We\ncomprehensively evaluate the proposed strategy across various datasets,\ntriggers, poisoning rates, architectures, and training hyperparameters. 
Our\nexperimental results demonstrate that PFS enhances backdoor attack efficiency,\nwhile also exhibiting a remarkable speed advantage over prior proxy-dependent\nselection methodologies.\n","authors":["Ziqiang Li","Hong Sun","Pengfei Xia","Beihao Xia","Xue Rui","Wei Zhang","Qinglang Guo","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2306.08313v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.17118v1","updated":"2024-04-26T02:23:59Z","published":"2024-04-26T02:23:59Z","title":"Localization of Pallets on Shelves Using Horizontal Plane Projection of\n a 360-degree Image","summary":" In this paper, we propose a method for calculating the three-dimensional (3D)\nposition and orientation of a pallet placed on a shelf on the side of a\nforklift truck using a 360-degree camera. By using a 360-degree camera mounted\non the forklift truck, it is possible to observe both the pallet at the side of\nthe forklift and one several meters ahead. However, the pallet on the obtained\nimage is observed with different distortion depending on its 3D position, so\nthat it is difficult to extract the pallet from the image. To solve this\nproblem, a method [1] has been proposed for detecting a pallet by projecting a\n360-degree image on a vertical plane that coincides with the front of the shelf\nto calculate an image similar to the image seen from the front of the shelf. At\nthe same time as the detection, the approximate position and orientation of the\ndetected pallet can be obtained, but the accuracy is not sufficient for\nautomatic control of the forklift truck. In this paper, we propose a method for\naccurately detecting the yaw angle, which is the angle of the front surface of\nthe pallet in the horizontal plane, by projecting the 360-degree image on a\nhorizontal plane including the boundary line of the front surface of the\ndetected pallet. The position of the pallet is also determined by moving the\nvertical plane having the detected yaw angle back and forth, and finding the\nposition at which the degree of coincidence between the projection image on the\nvertical plane and the actual size of the front surface of the pallet is\nmaximized. Experiments using real images taken in a laboratory and an actual\nwarehouse have confirmed that the proposed method can calculate the position\nand orientation of a pallet within a reasonable calculation time and with the\naccuracy necessary for inserting the fork into the hole in the front of the\npallet.\n","authors":["Yasuyo Kita","Yudai Fujieda","Ichiro Matsuda","Nobuyuki Kita"],"pdf_url":"https://arxiv.org/pdf/2404.17118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18892v2","updated":"2024-04-26T02:16:11Z","published":"2024-02-29T06:31:18Z","title":"Aligning Knowledge Graph with Visual Perception for Object-goal\n Navigation","summary":" Object-goal navigation is a challenging task that requires guiding an agent\nto specific objects based on first-person visual observations. The ability of\nagent to comprehend its surroundings plays a crucial role in achieving\nsuccessful object finding. However, existing knowledge-graph-based navigators\noften rely on discrete categorical one-hot vectors and vote counting strategy\nto construct graph representation of the scenes, which results in misalignment\nwith visual images. To provide more accurate and coherent scene descriptions\nand address this misalignment issue, we propose the Aligning Knowledge Graph\nwith Visual Perception (AKGVP) method for object-goal navigation. 
Technically,\nour approach introduces continuous modeling of the hierarchical scene\narchitecture and leverages visual-language pre-training to align natural\nlanguage description with visual perception. The integration of a continuous\nknowledge graph architecture and multimodal feature alignment empowers the\nnavigator with a remarkable zero-shot navigation capability. We extensively\nevaluate our method using the AI2-THOR simulator and conduct a series of\nexperiments to demonstrate the effectiveness and efficiency of our navigator.\nCode available: https://github.com/nuoxu/AKGVP.\n","authors":["Nuo Xu","Wen Wang","Rong Yang","Mengjie Qin","Zheyuan Lin","Wei Song","Chunlong Zhang","Jason Gu","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2402.18892v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2311.18405v2","updated":"2024-04-26T01:57:00Z","published":"2023-11-30T09:56:17Z","title":"CAT-DM: Controllable Accelerated Virtual Try-on with Diffusion Model","summary":" Generative Adversarial Networks (GANs) dominate the research field in\nimage-based virtual try-on, but have not resolved problems such as unnatural\ndeformation of garments and the blurry generation quality. While the generative\nquality of diffusion models is impressive, achieving controllability poses a\nsignificant challenge when applying it to virtual try-on and multiple denoising\niterations limit its potential for real-time applications. In this paper, we\npropose Controllable Accelerated virtual Try-on with Diffusion Model (CAT-DM).\nTo enhance the controllability, a basic diffusion-based virtual try-on network\nis designed, which utilizes ControlNet to introduce additional control\nconditions and improves the feature extraction of garment images. In terms of\nacceleration, CAT-DM initiates a reverse denoising process with an implicit\ndistribution generated by a pre-trained GAN-based model. Compared with previous\ntry-on methods based on diffusion models, CAT-DM not only retains the pattern\nand texture details of the inshop garment but also reduces the sampling steps\nwithout compromising generation quality. Extensive experiments demonstrate the\nsuperiority of CAT-DM against both GANbased and diffusion-based methods in\nproducing more realistic images and accurately reproducing garment patterns.\n","authors":["Jianhao Zeng","Dan Song","Weizhi Nie","Hongshuo Tian","Tongtong Wang","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12327v2","updated":"2024-04-26T01:50:55Z","published":"2023-11-21T03:40:09Z","title":"Enhancing Visual Grounding and Generalization: A Multi-Task Cycle\n Training Approach for Vision-Language Models","summary":" Visual grounding (VG) occupies a pivotal position in multi-modality\nvision-language models. In this study, we propose ViLaM, a large multi-modality\nmodel, that supports multi-tasks of VG using the cycle training strategy, with\nabundant interaction instructions. The cycle training between referring\nexpression generation (REG) and referring expression comprehension (REC) is\nintroduced. It enhances the consistency between visual location and referring\nexpressions, and addresses the need for high-quality, multi-tasks VG datasets.\nMoreover, multi-tasks of VG are promoted in our model, contributed by the cycle\ntraining strategy. 
The multi-tasks in REC encompass a range of granularities,\nfrom region-level to pixel-level, which include referring bbox detection,\nreferring keypoints detection, and referring image segmentation. In REG,\nreferring region classification determines the fine-grained category of the\ntarget, while referring region captioning generates a comprehensive\ndescription. Meanwhile, all tasks participate in the joint training,\nsynergistically enhancing one another and collectively improving the overall\nperformance of the model. Furthermore, leveraging the capabilities of large\nlanguage models, ViLaM extends a wide range of instructions, thereby\nsignificantly enhancing its generalization and interaction potentials.\nExtensive public datasets corroborate the superior capabilities of our model in\nVG with muti-tasks. Additionally, validating its robust generalization, ViLaM\nis validated under open-set and few-shot scenarios. Especially in the medical\nfield, our model demonstrates cross-domain robust generalization capabilities.\nFurthermore, we contribute a VG dataset, especially with multi-tasks. To\nsupport and encourage the community focused on VG, we have made both the\ndataset and our code public: https://github.com/AnonymGiant/ViLaM.\n","authors":["Xiaoyu Yang","Lijian Xu","Hao Sun","Hongsheng Li","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12327v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2404.17105v1","updated":"2024-04-26T01:45:58Z","published":"2024-04-26T01:45:58Z","title":"Synthesizing Iris Images using Generative Adversarial Networks: Survey\n and Comparative Analysis","summary":" Biometric systems based on iris recognition are currently being used in\nborder control applications and mobile devices. However, research in iris\nrecognition is stymied by various factors such as limited datasets of bonafide\nirides and presentation attack instruments; restricted intra-class variations;\nand privacy concerns. Some of these issues can be mitigated by the use of\nsynthetic iris data. In this paper, we present a comprehensive review of\nstate-of-the-art GAN-based synthetic iris image generation techniques,\nevaluating their strengths and limitations in producing realistic and useful\niris images that can be used for both training and testing iris recognition\nsystems and presentation attack detectors. In this regard, we first survey the\nvarious methods that have been used for synthetic iris generation and\nspecifically consider generators based on StyleGAN, RaSGAN, CIT-GAN, iWarpGAN,\nStarGAN, etc. We then analyze the images generated by these models for realism,\nuniqueness, and biometric utility. This comprehensive analysis highlights the\npros and cons of various GANs in the context of developing robust iris matchers\nand presentation attack detectors.\n","authors":["Shivangi Yadav","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2404.17105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17104v1","updated":"2024-04-26T01:39:31Z","published":"2024-04-26T01:39:31Z","title":"Don't Look at the Camera: Achieving Perceived Eye Contact","summary":" We consider the question of how to best achieve the perception of eye contact\nwhen a person is captured by camera and then rendered on a 2D display. For\nsingle subjects photographed by a camera, conventional wisdom tells us that\nlooking directly into the camera achieves eye contact. Through empirical user\nstudies, we show that it is instead preferable to {\\em look just below the\ncamera lens}. 
We quantitatively assess where subjects should direct their gaze\nrelative to a camera lens to optimize the perception that they are making eye\ncontact.\n","authors":["Alice Gao","Samyukta Jayakumar","Marcello Maniglia","Brian Curless","Ira Kemelmacher-Shlizerman","Aaron R. Seitz","Steven M. Seitz"],"pdf_url":"https://arxiv.org/pdf/2404.17104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09055v3","updated":"2024-04-26T01:29:46Z","published":"2023-07-18T08:11:08Z","title":"Robust Data Clustering with Outliers via Transformed Tensor Low-Rank\n Representation","summary":" Recently, tensor low-rank representation (TLRR) has become a popular tool for\ntensor data recovery and clustering, due to its empirical success and\ntheoretical guarantees. However, existing TLRR methods consider Gaussian or\ngross sparse noise, inevitably leading to performance degradation when the\ntensor data are contaminated by outliers or sample-specific corruptions. This\npaper develops an outlier-robust tensor low-rank representation (OR-TLRR)\nmethod that provides outlier detection and tensor data clustering\nsimultaneously based on the t-SVD framework. For tensor observations with\narbitrary outlier corruptions, OR-TLRR has provable performance guarantee for\nexactly recovering the row space of clean data and detecting outliers under\nmild conditions. Moreover, an extension of OR-TLRR is proposed to handle the\ncase when parts of the data are missing. Finally, extensive experimental\nresults on synthetic and real data demonstrate the effectiveness of the\nproposed algorithms. We release our code at\nhttps://github.com/twugithub/2024-AISTATS-ORTLRR.\n","authors":["Tong Wu"],"pdf_url":"https://arxiv.org/pdf/2307.09055v3.pdf","comment":"AISTATS 2024"},{"id":"http://arxiv.org/abs/2310.02992v3","updated":"2024-04-26T01:24:57Z","published":"2023-10-04T17:28:44Z","title":"Kosmos-G: Generating Images in Context with Multimodal Large Language\n Models","summary":" Recent advancements in subject-driven image generation have made significant\nstrides. However, current methods still fall short in diverse application\nscenarios, as they require test-time tuning and cannot accept interleaved\nmulti-image and text input. These limitations keep them far from the ultimate\ngoal of \"image as a foreign language in image generation.\" This paper presents\nKosmos-G, a model that leverages the advanced multimodal perception\ncapabilities of Multimodal Large Language Models (MLLMs) to tackle the\naforementioned challenge. Our approach aligns the output space of MLLM with\nCLIP using the textual modality as an anchor and performs compositional\ninstruction tuning on curated data. Kosmos-G demonstrates an impressive\ncapability of zero-shot subject-driven generation with interleaved multi-image\nand text input. Notably, the score distillation instruction tuning requires no\nmodifications to the image decoder. This allows for a seamless substitution of\nCLIP and effortless integration with a myriad of U-Net techniques ranging from\nfine-grained controls to personalized image decoder variants. 
We posit Kosmos-G\nas an initial attempt towards the goal of \"image as a foreign language in image\ngeneration.\" The code can be found at https://aka.ms/Kosmos-G\n","authors":["Xichen Pan","Li Dong","Shaohan Huang","Zhiliang Peng","Wenhu Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02992v3.pdf","comment":"Code: https://aka.ms/Kosmos-G Project Page:\n https://xichenpan.github.io/kosmosg"},{"id":"http://arxiv.org/abs/2404.17100v1","updated":"2024-04-26T01:21:08Z","published":"2024-04-26T01:21:08Z","title":"Open-Set Video-based Facial Expression Recognition with Human\n Expression-sensitive Prompting","summary":" In Video-based Facial Expression Recognition (V-FER), models are typically\ntrained on closed-set datasets with a fixed number of known classes. However,\nthese V-FER models cannot deal with unknown classes that are prevalent in\nreal-world scenarios. In this paper, we introduce a challenging Open-set\nVideo-based Facial Expression Recognition (OV-FER) task, aiming at identifying\nnot only known classes but also new, unknown human facial expressions not\nencountered during training. While existing approaches address open-set\nrecognition by leveraging large-scale vision-language models like CLIP to\nidentify unseen classes, we argue that these methods may not adequately capture\nthe nuanced and subtle human expression patterns required by the OV-FER task.\nTo address this limitation, we propose a novel Human Expression-Sensitive\nPrompting (HESP) mechanism to significantly enhance CLIP's ability to model\nvideo-based facial expression details effectively, thereby presenting a new\nCLIP-based OV-FER approach. Our proposed HESP comprises three components: 1) a\ntextual prompting module with learnable prompt representations to complement\nthe original CLIP textual prompts and enhance the textual representations of\nboth known and unknown emotions, 2) a visual prompting module that encodes\ntemporal emotional information from video frames using expression-sensitive\nattention, equipping CLIP with a new visual modeling ability to extract\nemotion-rich information, 3) a delicately designed open-set multi-task learning\nscheme that facilitates prompt learning and encourages interactions between the\ntextual and visual prompting modules. Extensive experiments conducted on four\nOV-FER task settings demonstrate that HESP can significantly boost CLIP's\nperformance (a relative improvement of 17.93% on AUROC and 106.18% on OSCR) and\noutperform other state-of-the-art open-set video understanding methods by a\nlarge margin.\n","authors":["Yuanyuan Liu","Yuxuan Huang","Shuyang Liu","Yibing Zhan","Zijing Chen","Zhe Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06306v3","updated":"2024-04-26T01:14:22Z","published":"2023-06-09T23:51:11Z","title":"DocumentCLIP: Linking Figures and Main Body Text in Reflowed Documents","summary":" Vision-language pretraining models have achieved great success in supporting\nmultimedia applications by understanding the alignments between images and\ntext. While existing vision-language pretraining models primarily focus on\nunderstanding single image associated with a single piece of text, they often\nignore the alignment at the intra-document level, consisting of multiple\nsentences with multiple images. 
In this work, we propose DocumentCLIP, a\nsalience-aware contrastive learning framework to enforce vision-language\npretraining models to comprehend the interaction between images and longer text\nwithin documents. Our model is beneficial for the real-world multimodal\ndocument understanding like news article, magazines, product descriptions,\nwhich contain linguistically and visually richer content. To the best of our\nknowledge, we are the first to explore multimodal intra-document links by\ncontrastive learning. In addition, we collect a large Wikipedia dataset for\npretraining, which provides various topics and structures. Experiments show\nDocumentCLIP not only outperforms the state-of-the-art baselines in the\nsupervised setting, but also achieves the best zero-shot performance in the\nwild after human evaluation. Our code is available at\nhttps://github.com/FuxiaoLiu/DocumentCLIP.\n","authors":["Fuxiao Liu","Hao Tan","Chris Tensmeyer"],"pdf_url":"https://arxiv.org/pdf/2306.06306v3.pdf","comment":"Accepted to ICPRAI 2024"},{"id":"http://arxiv.org/abs/2404.04608v2","updated":"2024-04-26T01:07:26Z","published":"2024-04-06T12:27:21Z","title":"Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal\n Remote Sensing Image Interpretation","summary":" Current remote-sensing interpretation models often focus on a single task\nsuch as detection, segmentation, or caption. However, the task-specific\ndesigned models are unattainable to achieve the comprehensive multi-level\ninterpretation of images. The field also lacks support for multi-task joint\ninterpretation datasets. In this paper, we propose Panoptic Perception, a novel\ntask and a new fine-grained dataset (FineGrip) to achieve a more thorough and\nuniversal interpretation for RSIs. The new task, 1) integrates pixel-level,\ninstance-level, and image-level information for universal image perception, 2)\ncaptures image information from coarse to fine granularity, achieving deeper\nscene understanding and description, and 3) enables various independent tasks\nto complement and enhance each other through multi-task learning. By\nemphasizing multi-task interactions and the consistency of perception results,\nthis task enables the simultaneous processing of fine-grained foreground\ninstance segmentation, background semantic segmentation, and global\nfine-grained image captioning. Concretely, the FineGrip dataset includes 2,649\nremote sensing images, 12,054 fine-grained instance segmentation masks\nbelonging to 20 foreground things categories, 7,599 background semantic masks\nfor 5 stuff classes and 13,245 captioning sentences. Furthermore, we propose a\njoint optimization-based panoptic perception model. Experimental results on\nFineGrip demonstrate the feasibility of the panoptic perception task and the\nbeneficial effect of multi-task joint optimization on individual tasks. 
The\ndataset will be publicly available.\n","authors":["Danpei Zhao","Bo Yuan","Ziqiang Chen","Tian Li","Zhuoran Liu","Wentao Li","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17092v1","updated":"2024-04-26T00:57:06Z","published":"2024-04-26T00:57:06Z","title":"Defending Spiking Neural Networks against Adversarial Attacks through\n Image Purification","summary":" Spiking Neural Networks (SNNs) aim to bridge the gap between neuroscience and\nmachine learning by emulating the structure of the human nervous system.\nHowever, like convolutional neural networks, SNNs are vulnerable to adversarial\nattacks. To tackle the challenge, we propose a biologically inspired\nmethodology to enhance the robustness of SNNs, drawing insights from the visual\nmasking effect and filtering theory. First, an end-to-end SNN-based image\npurification model is proposed to defend against adversarial attacks, including\na noise extraction network and a non-blind denoising network. The former\nnetwork extracts noise features from noisy images, while the latter component\nemploys a residual U-Net structure to reconstruct high-quality noisy images and\ngenerate clean images. Simultaneously, a multi-level firing SNN based on\nSqueeze-and-Excitation Network is introduced to improve the robustness of the\nclassifier. Crucially, the proposed image purification network serves as a\npre-processing module, avoiding modifications to classifiers. Unlike\nadversarial training, our method is highly flexible and can be seamlessly\nintegrated with other defense strategies. Experimental results on various\ndatasets demonstrate that the proposed methodology outperforms state-of-the-art\nbaselines in terms of defense effectiveness, training time, and resource\nconsumption.\n","authors":["Weiran Chen","Qi Sun","Qi Xu"],"pdf_url":"https://arxiv.org/pdf/2404.17092v1.pdf","comment":"8 pages, 5 figures, ECAI2024 under review"},{"id":"http://arxiv.org/abs/2305.08275v4","updated":"2024-04-26T00:26:44Z","published":"2023-05-14T23:14:09Z","title":"ULIP-2: Towards Scalable Multimodal Pre-training for 3D Understanding","summary":" Recent advancements in multimodal pre-training have shown promising efficacy\nin 3D representation learning by aligning multimodal features across 3D shapes,\ntheir 2D counterparts, and language descriptions. However, the methods used by\nexisting frameworks to curate such multimodal data, in particular language\ndescriptions for 3D shapes, are not scalable, and the collected language\ndescriptions are not diverse. To address this, we introduce ULIP-2, a simple\nyet effective tri-modal pre-training framework that leverages large multimodal\nmodels to automatically generate holistic language descriptions for 3D shapes.\nIt only needs 3D data as input, eliminating the need for any manual 3D\nannotations, and is therefore scalable to large datasets. ULIP-2 is also\nequipped with scaled-up backbones for better multimodal representation\nlearning. We conduct experiments on two large-scale 3D datasets, Objaverse and\nShapeNet, and augment them with tri-modal datasets of 3D point clouds, images,\nand language for training ULIP-2. Experiments show that ULIP-2 demonstrates\nsubstantial benefits in three downstream tasks: zero-shot 3D classification,\nstandard 3D classification with fine-tuning, and 3D captioning (3D-to-language\ngeneration). It achieves a new SOTA of 50.6% (top-1) on Objaverse-LVIS and\n84.7% (top-1) on ModelNet40 in zero-shot classification. 
In the ScanObjectNN\nbenchmark for standard fine-tuning, ULIP-2 reaches an overall accuracy of 91.5%\nwith a compact model of only 1.4 million parameters. ULIP-2 sheds light on a\nnew paradigm for scalable multimodal 3D representation learning without human\nannotations and shows significant improvements over existing baselines. The\ncode and datasets are released at https://github.com/salesforce/ULIP.\n","authors":["Le Xue","Ning Yu","Shu Zhang","Artemis Panagopoulou","Junnan Li","Roberto Martín-Martín","Jiajun Wu","Caiming Xiong","Ran Xu","Juan Carlos Niebles","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2305.08275v4.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.11051v2","updated":"2024-04-26T00:21:19Z","published":"2024-04-17T03:51:24Z","title":"WPS-Dataset: A benchmark for wood plate segmentation in bark removal\n processing","summary":" Using deep learning methods is a promising approach to improving bark removal\nefficiency and enhancing the quality of wood products. However, the lack of\npublicly available datasets for wood plate segmentation in bark removal\nprocessing poses challenges for researchers in this field. To address this\nissue, a benchmark for wood plate segmentation in bark removal processing named\nWPS-dataset is proposed in this study, which consists of 4863 images. We\ndesigned an image acquisition device and assembled it on a bark removal\nequipment to capture images in real industrial settings. We evaluated the\nWPS-dataset using six typical segmentation models. The models effectively learn\nand understand the WPS-dataset characteristics during training, resulting in\nhigh performance and accuracy in wood plate segmentation tasks. We believe that\nour dataset can lay a solid foundation for future research in bark removal\nprocessing and contribute to advancements in this field.\n","authors":["Rijun Wang","Guanghao Zhang","Fulong Liang","Bo Wang","Xiangwei Mou","Yesheng Chen","Peng Sun","Canjin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17732v1","updated":"2024-04-26T23:46:10Z","published":"2024-04-26T23:46:10Z","title":"Generative Dataset Distillation: Balancing Global Structure and Local\n Details","summary":" In this paper, we propose a new dataset distillation method that considers\nbalancing global structure and local details when distilling the information\nfrom a large dataset into a generative model. Dataset distillation has been\nproposed to reduce the size of the required dataset when training models. The\nconventional dataset distillation methods face the problem of long redeployment\ntime and poor cross-architecture performance. Moreover, previous methods\nfocused too much on the high-level semantic attributes between the synthetic\ndataset and the original dataset while ignoring the local features such as\ntexture and shape. Based on the above understanding, we propose a new method\nfor distilling the original image dataset into a generative model. Our method\ninvolves using a conditional generative adversarial network to generate the\ndistilled dataset. 
Subsequently, we ensure balancing global structure and local\ndetails in the distillation process, continuously optimizing the generator for\nmore information-dense dataset generation.\n","authors":["Longzhen Li","Guang Li","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2404.17732v1.pdf","comment":"Accepted by the 1st CVPR Workshop on Dataset Distillation"},{"id":"http://arxiv.org/abs/2404.17718v1","updated":"2024-04-26T22:46:17Z","published":"2024-04-26T22:46:17Z","title":"Lessons from Deploying CropFollow++: Under-Canopy Agricultural\n Navigation with Keypoints","summary":" We present a vision-based navigation system for under-canopy agricultural\nrobots using semantic keypoints. Autonomous under-canopy navigation is\nchallenging due to the tight spacing between the crop rows ($\\sim 0.75$ m),\ndegradation in RTK-GPS accuracy due to multipath error, and noise in LiDAR\nmeasurements from the excessive clutter. Our system, CropFollow++, introduces\nmodular and interpretable perception architecture with a learned semantic\nkeypoint representation. We deployed CropFollow++ in multiple under-canopy\ncover crop planting robots on a large scale (25 km in total) in various field\nconditions and we discuss the key lessons learned from this.\n","authors":["Arun N. Sivakumar","Mateus V. Gasparino","Michael McGuire","Vitor A. H. Higuti","M. Ugur Akcal","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2404.17718v1.pdf","comment":"Accepted to the IEEE ICRA Workshop on Field Robotics 2024"},{"id":"http://arxiv.org/abs/2309.08891v2","updated":"2024-04-26T21:59:30Z","published":"2023-09-16T06:06:53Z","title":"V2CE: Video to Continuous Events Simulator","summary":" Dynamic Vision Sensor (DVS)-based solutions have recently garnered\nsignificant interest across various computer vision tasks, offering notable\nbenefits in terms of dynamic range, temporal resolution, and inference speed.\nHowever, as a relatively nascent vision sensor compared to Active Pixel Sensor\n(APS) devices such as RGB cameras, DVS suffers from a dearth of ample labeled\ndatasets. Prior efforts to convert APS data into events often grapple with\nissues such as a considerable domain shift from real events, the absence of\nquantified validation, and layering problems within the time axis. In this\npaper, we present a novel method for video-to-events stream conversion from\nmultiple perspectives, considering the specific characteristics of DVS. A\nseries of carefully designed losses helps enhance the quality of generated\nevent voxels significantly. 
We also propose a novel local dynamic-aware\ntimestamp inference strategy to accurately recover event timestamps from event\nvoxels in a continuous fashion and eliminate the temporal layering problem.\nResults from rigorous validation through quantified metrics at all stages of\nthe pipeline establish our method unquestionably as the current\nstate-of-the-art (SOTA).\n","authors":["Zhongyang Zhang","Shuyang Cui","Kaidong Chai","Haowen Yu","Subhasis Dasgupta","Upal Mahbub","Tauhidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2309.08891v2.pdf","comment":"6 pages, 7 figures, IEEE International Conference on Robotics and\n Automation (ICRA) 2024"},{"id":"http://arxiv.org/abs/2404.17704v1","updated":"2024-04-26T21:30:36Z","published":"2024-04-26T21:30:36Z","title":"SPLICE -- Streamlining Digital Pathology Image Processing","summary":" Digital pathology and the integration of artificial intelligence (AI) models\nhave revolutionized histopathology, opening new opportunities. With the\nincreasing availability of Whole Slide Images (WSIs), there's a growing demand\nfor efficient retrieval, processing, and analysis of relevant images from vast\nbiomedical archives. However, processing WSIs presents challenges due to their\nlarge size and content complexity. Full computer digestion of WSIs is\nimpractical, and processing all patches individually is prohibitively\nexpensive. In this paper, we propose an unsupervised patching algorithm,\nSequential Patching Lattice for Image Classification and Enquiry (SPLICE). This\nnovel approach condenses a histopathology WSI into a compact set of\nrepresentative patches, forming a \"collage\" of WSI while minimizing redundancy.\nSPLICE prioritizes patch quality and uniqueness by sequentially analyzing a WSI\nand selecting non-redundant representative features. We evaluated SPLICE for\nsearch and match applications, demonstrating improved accuracy, reduced\ncomputation time, and storage requirements compared to existing\nstate-of-the-art methods. As an unsupervised method, SPLICE effectively reduces\nstorage requirements for representing tissue images by 50%. This reduction\nenables numerous algorithms in computational pathology to operate much more\nefficiently, paving the way for accelerated adoption of digital pathology.\n","authors":["Areej Alsaafin","Peyman Nejat","Abubakr Shafique","Jibran Khan","Saghir Alfasly","Ghazal Alabtah","H. R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2404.17704v1.pdf","comment":"Under review for publication"},{"id":"http://arxiv.org/abs/2404.16421v2","updated":"2024-04-26T21:23:04Z","published":"2024-04-25T08:51:59Z","title":"SynCellFactory: Generative Data Augmentation for Cell Tracking","summary":" Cell tracking remains a pivotal yet challenging task in biomedical research.\nThe full potential of deep learning for this purpose is often untapped due to\nthe limited availability of comprehensive and varied training data sets. In\nthis paper, we present SynCellFactory, a generative cell video augmentation. At\nthe heart of SynCellFactory lies the ControlNet architecture, which has been\nfine-tuned to synthesize cell imagery with photorealistic accuracy in style and\nmotion patterns. This technique enables the creation of synthetic yet realistic\ncell videos that mirror the complexity of authentic microscopy time-lapses. 
Our\nexperiments demonstrate that SynCellFactory boosts the performance of\nwell-established deep learning models for cell tracking, particularly when\noriginal training data is sparse.\n","authors":["Moritz Sturm","Lorenzo Cerrone","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.16421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17699v1","updated":"2024-04-26T20:55:39Z","published":"2024-04-26T20:55:39Z","title":"Deep Learning for Melt Pool Depth Contour Prediction From Surface\n Thermal Images via Vision Transformers","summary":" Insufficient overlap between the melt pools produced during Laser Powder Bed\nFusion (L-PBF) can lead to lack-of-fusion defects and deteriorated mechanical\nand fatigue performance. In-situ monitoring of the melt pool subsurface\nmorphology requires specialized equipment that may not be readily accessible or\nscalable. Therefore, we introduce a machine learning framework to correlate\nin-situ two-color thermal images observed via high-speed color imaging to the\ntwo-dimensional profile of the melt pool cross-section. Specifically, we employ\na hybrid CNN-Transformer architecture to establish a correlation between single\nbead off-axis thermal image sequences and melt pool cross-section contours\nmeasured via optical microscopy. In this architecture, a ResNet model embeds\nthe spatial information contained within the thermal images to a latent vector,\nwhile a Transformer model correlates the sequence of embedded vectors to\nextract temporal information. Our framework is able to model the curvature of\nthe subsurface melt pool structure, with improved performance in high energy\ndensity regimes compared to analytical melt pool models. The performance of\nthis model is evaluated through dimensional and geometric comparisons to the\ncorresponding experimental melt pool observations.\n","authors":["Francis Ogoke","Peter Myung-Won Pak","Alexander Myers","Guadalupe Quirarte","Jack Beuth","Jonathan Malen","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2404.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17697v1","updated":"2024-04-26T20:54:44Z","published":"2024-04-26T20:54:44Z","title":"Enhancing Track Management Systems with Vehicle-To-Vehicle Enabled\n Sensor Fusion","summary":" In the rapidly advancing landscape of connected and automated vehicles (CAV),\nthe integration of Vehicle-to-Everything (V2X) communication in traditional\nfusion systems presents a promising avenue for enhancing vehicle perception.\nAddressing current limitations with vehicle sensing, this paper proposes a\nnovel Vehicle-to-Vehicle (V2V) enabled track management system that leverages\nthe synergy between V2V signals and detections from radar and camera sensors.\nThe core innovation lies in the creation of independent priority track lists,\nconsisting of fused detections validated through V2V communication. This\napproach enables more flexible and resilient thresholds for track management,\nparticularly in scenarios with numerous occlusions where the tracked objects\nmove outside the field of view of the perception sensors. The proposed system\nconsiders the implications of falsification of V2X signals which is combated\nthrough an initial vehicle identification process using detection from\nperception sensors. Presented are the fusion algorithm, simulated environments,\nand validation mechanisms. 
Experimental results demonstrate the improved\naccuracy and robustness of the proposed system in common driving scenarios,\nhighlighting its potential to advance the reliability and efficiency of\nautonomous vehicles.\n","authors":["Thomas Billington","Ansh Gwash","Aadi Kothari","Lucas Izquierdo","Timothy Talty"],"pdf_url":"https://arxiv.org/pdf/2404.17697v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.05632v3","updated":"2024-04-26T20:35:41Z","published":"2023-12-09T18:40:37Z","title":"Subject-Based Domain Adaptation for Facial Expression Recognition","summary":" Adapting a deep learning model to a specific target individual is a\nchallenging facial expression recognition (FER) task that may be achieved using\nunsupervised domain adaptation (UDA) methods. Although several UDA methods have\nbeen proposed to adapt deep FER models across source and target data sets,\nmultiple subject-specific source domains are needed to accurately represent the\nintra- and inter-person variability in subject-based adaption. This paper\nconsiders the setting where domains correspond to individuals, not entire\ndatasets. Unlike UDA, multi-source domain adaptation (MSDA) methods can\nleverage multiple source datasets to improve the accuracy and robustness of the\ntarget model. However, previous methods for MSDA adapt image classification\nmodels across datasets and do not scale well to a more significant number of\nsource domains. This paper introduces a new MSDA method for subject-based\ndomain adaptation in FER. It efficiently leverages information from multiple\nsource subjects (labeled source domain data) to adapt a deep FER model to a\nsingle target individual (unlabeled target domain data). During adaptation, our\nsubject-based MSDA first computes a between-source discrepancy loss to mitigate\nthe domain shift among data from several source subjects. Then, a new strategy\nis employed to generate augmented confident pseudo-labels for the target\nsubject, allowing a reduction in the domain shift between source and target\nsubjects. Experiments performed on the challenging BioVid heat and pain dataset\nwith 87 subjects and the UNBC-McMaster shoulder pain dataset with 25 subjects\nshow that our subject-based MSDA can outperform state-of-the-art methods yet\nscale well to multiple subject-based source domains.\n","authors":["Muhammad Osama Zeeshan","Muhammad Haseeb Aslam","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2312.05632v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02352v3","updated":"2024-04-26T19:53:22Z","published":"2024-02-04T05:33:04Z","title":"Region-Based Representations Revisited","summary":" We investigate whether region-based representations are effective for\nrecognition. Regions were once a mainstay in recognition approaches, but pixel\nand patch-based features are now used almost exclusively. We show that recent\nclass-agnostic segmenters like SAM can be effectively combined with strong\nunsupervised representations like DINOv2 and used for a wide variety of tasks,\nincluding semantic segmentation, object-based image retrieval, and multi-image\nanalysis. Once the masks and features are extracted, these representations,\neven with linear decoders, enable competitive performance, making them well\nsuited to applications that require custom queries. 
The compactness of the\nrepresentation also makes it well-suited to video analysis and other problems\nrequiring inference across many images.\n","authors":["Michal Shlapentokh-Rothman","Ansel Blume","Yao Xiao","Yuqun Wu","Sethuraman T V","Heyi Tao","Jae Yong Lee","Wilfredo Torres","Yu-Xiong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2402.02352v3.pdf","comment":"CVPR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2404.17672v1","updated":"2024-04-26T19:37:13Z","published":"2024-04-26T19:37:13Z","title":"BlenderAlchemy: Editing 3D Graphics with Vision-Language Models","summary":" Graphics design is important for various applications, including movie\nproduction and game design. To create a high-quality scene, designers usually\nneed to spend hours in software like Blender, in which they might need to\ninterleave and repeat operations, such as connecting material nodes, hundreds\nof times. Moreover, slightly different design goals may require completely\ndifferent sequences, making automation difficult. In this paper, we propose a\nsystem that leverages Vision-Language Models (VLMs), like GPT-4V, to\nintelligently search the design action space to arrive at an answer that can\nsatisfy a user's intent. Specifically, we design a vision-based edit generator\nand state evaluator to work together to find the correct sequence of actions to\nachieve the goal. Inspired by the role of visual imagination in the human\ndesign process, we supplement the visual reasoning capabilities of VLMs with\n\"imagined\" reference images from image-generation models, providing visual\ngrounding of abstract language descriptions. In this paper, we provide\nempirical evidence suggesting our system can produce simple but tedious Blender\nediting sequences for tasks such as editing procedural materials from text\nand/or reference images, as well as adjusting lighting configurations for\nproduct renderings in complex scenes.\n","authors":["Ian Huang","Guandao Yang","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2404.17672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02974v2","updated":"2024-04-26T19:29:58Z","published":"2023-12-05T18:59:16Z","title":"Describing Differences in Image Sets with Natural Language","summary":" How do two sets of images differ? Discerning set-level differences is crucial\nfor understanding model behaviors and analyzing datasets, yet manually sifting\nthrough thousands of images is impractical. To aid in this discovery process,\nwe explore the task of automatically describing the differences between two\n$\\textbf{sets}$ of images, which we term Set Difference Captioning. This task\ntakes in image sets $D_A$ and $D_B$, and outputs a description that is more\noften true on $D_A$ than $D_B$. We outline a two-stage approach that first\nproposes candidate difference descriptions from image sets and then re-ranks\nthe candidates by checking how well they can differentiate the two sets. We\nintroduce VisDiff, which first captions the images and prompts a language model\nto propose candidate descriptions, then re-ranks these descriptions using CLIP.\nTo evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image\nsets with ground truth difference descriptions. We apply VisDiff to various\ndomains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing\nclassification models (e.g., zero-shot CLIP vs. 
supervised ResNet), summarizing\nmodel failure modes (supervised ResNet), characterizing differences between\ngenerative models (e.g., StableDiffusionV1 and V2), and discovering what makes\nimages memorable. Using VisDiff, we are able to find interesting and previously\nunknown differences in datasets and models, demonstrating its utility in\nrevealing nuanced insights.\n","authors":["Lisa Dunlap","Yuhui Zhang","Xiaohan Wang","Ruiqi Zhong","Trevor Darrell","Jacob Steinhardt","Joseph E. Gonzalez","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2312.02974v2.pdf","comment":"CVPR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.17670v1","updated":"2024-04-26T19:27:07Z","published":"2024-04-26T19:27:07Z","title":"Federated Learning for Blind Image Super-Resolution","summary":" Traditional blind image SR methods need to model real-world degradations\nprecisely. Consequently, current research struggles with this dilemma by\nassuming idealized degradations, which leads to limited applicability to actual\nuser data. Moreover, the ideal scenario - training models on data from the\ntargeted user base - presents significant privacy concerns. To address both\nchallenges, we propose to fuse image SR with federated learning, allowing\nreal-world degradations to be directly learned from users without invading\ntheir privacy. Furthermore, it enables optimization across many devices without\ndata centralization. As this fusion is underexplored, we introduce new\nbenchmarks specifically designed to evaluate new SR methods in this federated\nsetting. By doing so, we employ known degradation modeling techniques from SR\nresearch. However, rather than aiming to mirror real degradations, our\nbenchmarks use these degradation models to simulate the variety of degradations\nfound across clients within a distributed user base. This distinction is\ncrucial as it circumvents the need to precisely model real-world degradations,\nwhich limits contemporary blind image SR research. Our proposed benchmarks\ninvestigate blind image SR under new aspects, namely differently distributed\ndegradation types among users and varying user numbers. We believe new methods\ntested within these benchmarks will perform more similarly in an application,\nas the simulated scenario addresses the variety while federated learning\nenables the training on actual degradations.\n","authors":["Brian B. Moser","Ahmed Anwar","Federico Raue","Stanislav Frolov","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2404.17670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09347v2","updated":"2024-04-26T19:19:22Z","published":"2023-03-16T14:27:45Z","title":"CSSL-MHTR: Continual Self-Supervised Learning for Scalable Multi-script\n Handwritten Text Recognition","summary":" Self-supervised learning has recently emerged as a strong alternative in\ndocument analysis. These approaches are now capable of learning high-quality\nimage representations and overcoming the limitations of supervised methods,\nwhich require a large amount of labeled data. However, these methods are unable\nto capture new knowledge in an incremental fashion, where data is presented to\nthe model sequentially, which is closer to the realistic scenario. In this\npaper, we explore the potential of continual self-supervised learning to\nalleviate the catastrophic forgetting problem in handwritten text recognition,\nas an example of sequence recognition. 
Our method consists in adding\nintermediate layers called adapters for each task, and efficiently distilling\nknowledge from the previous model while learning the current task. Our proposed\nframework is efficient in both computation and memory complexity. To\ndemonstrate its effectiveness, we evaluate our method by transferring the\nlearned model to diverse text recognition downstream tasks, including Latin and\nnon-Latin scripts. As far as we know, this is the first application of\ncontinual self-supervised learning for handwritten text recognition. We attain\nstate-of-the-art performance on English, Italian and Russian scripts, whilst\nadding only a few parameters per task. The code and trained models will be\npublicly available.\n","authors":["Marwa Dhiaf","Mohamed Ali Souibgui","Kai Wang","Yuyang Liu","Yousri Kessentini","Alicia Fornés","Ahmed Cheikh Rouhou"],"pdf_url":"https://arxiv.org/pdf/2303.09347v2.pdf","comment":"Due to current company policy constraints, we are compelled to\n withdraw our paper. The organization's guidelines prohibit us from proceeding\n with the publication of this work at this time. We apologize for any\n inconvenience this may cause and appreciate your understanding in this matter"},{"id":"http://arxiv.org/abs/2303.00612v3","updated":"2024-04-26T18:49:56Z","published":"2023-03-01T16:09:11Z","title":"Has the Virtualization of the Face Changed Facial Perception? A Study of\n the Impact of Photo Editing and Augmented Reality on Facial Perception","summary":" Augmented reality and other photo editing filters are popular methods used to\nmodify faces online. Considering the important role of facial perception in\ncommunication, how do we perceive this increasing number of modified faces? In\nthis paper we present the results of six surveys that measure familiarity with\ndifferent styles of facial filters, perceived strangeness of faces edited with\ndifferent filters, and ability to discern whether images are filtered. Our\nresults demonstrate that faces modified with more traditional face filters are\nperceived similarly to unmodified faces, and faces filtered with augmented\nreality filters are perceived differently from unmodified faces. We discuss\npossible explanations for these results, including a societal adjustment to\ntraditional photo editing techniques or the inherent differences in the\ndifferent types of filters. We conclude with a discussion of how to build\nonline spaces more responsibly based on our results.\n","authors":["Louisa Conwill","Sam English Anthony","Walter J. Scheirer"],"pdf_url":"https://arxiv.org/pdf/2303.00612v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v5","updated":"2024-04-26T18:27:56Z","published":"2023-11-22T03:31:31Z","title":"MetaCloak: Preventing Unauthorized Subject-driven Text-to-image\n Diffusion-based Synthesis via Meta-learning","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. 
To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v5.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.17651v1","updated":"2024-04-26T18:16:39Z","published":"2024-04-26T18:16:39Z","title":"Hard ASH: Sparsity and the right optimizer make a continual learner","summary":" In class incremental learning, neural networks typically suffer from\ncatastrophic forgetting. We show that an MLP featuring a sparse activation\nfunction and an adaptive learning rate optimizer can compete with established\nregularization techniques in the Split-MNIST task. We highlight the\neffectiveness of the Adaptive SwisH (ASH) activation function in this context\nand introduce a novel variant, Hard Adaptive SwisH (Hard ASH) to further\nenhance the learning retention.\n","authors":["Santtu Keskinen"],"pdf_url":"https://arxiv.org/pdf/2404.17651v1.pdf","comment":"ICLR 2024 TinyPaper"},{"id":"http://arxiv.org/abs/2305.19480v5","updated":"2024-04-26T18:13:36Z","published":"2023-05-31T01:16:08Z","title":"Learning by Aligning 2D Skeleton Sequences and Multi-Modality Fusion","summary":" This paper presents a self-supervised temporal video alignment framework\nwhich is useful for several fine-grained human activity understanding\napplications. In contrast with the state-of-the-art method of CASA, where\nsequences of 3D skeleton coordinates are taken directly as input, our key idea\nis to use sequences of 2D skeleton heatmaps as input. Unlike CASA which\nperforms self-attention in the temporal domain only, we feed 2D skeleton\nheatmaps to a video transformer which performs self-attention both in the\nspatial and temporal domains for extracting effective spatiotemporal and\ncontextual features. In addition, we introduce simple heatmap augmentation\ntechniques based on 2D skeletons for self-supervised learning. Despite the lack\nof 3D information, our approach achieves not only higher accuracy but also\nbetter robustness against missing and noisy keypoints than CASA. Furthermore,\nextensive evaluations on three public datasets, i.e., Penn Action, IKEA ASM,\nand H2O, demonstrate that our approach outperforms previous methods in\ndifferent fine-grained human activity understanding tasks. Finally, fusing 2D\nskeleton heatmaps with RGB videos yields the state-of-the-art on all metrics\nand datasets. 
To our best knowledge, our work is the first to utilize 2D\nskeleton heatmap inputs and the first to explore multi-modality fusion for\ntemporal video alignment.\n","authors":["Quoc-Huy Tran","Muhammad Ahmed","Murad Popattia","M. Hassan Ahmed","Andrey Konin","M. Zeeshan Zia"],"pdf_url":"https://arxiv.org/pdf/2305.19480v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08698v2","updated":"2024-04-26T18:02:38Z","published":"2024-02-13T02:43:41Z","title":"AMEND: A Mixture of Experts Framework for Long-tailed Trajectory\n Prediction","summary":" Accurate prediction of pedestrians' future motions is critical for\nintelligent driving systems. Developing models for this task requires rich\ndatasets containing diverse sets of samples. However, the existing naturalistic\ntrajectory prediction datasets are generally imbalanced in favor of simpler\nsamples and lack challenging scenarios. Such a long-tail effect causes\nprediction models to underperform on the tail portion of the data distribution\ncontaining safety-critical scenarios. Previous methods tackle the long-tail\nproblem using methods such as contrastive learning and class-conditioned\nhypernetworks. These approaches, however, are not modular and cannot be applied\nto many machine learning architectures. In this work, we propose a modular\nmodel-agnostic framework for trajectory prediction that leverages a specialized\nmixture of experts. In our approach, each expert is trained with a specialized\nskill with respect to a particular part of the data. To produce predictions, we\nutilise a router network that selects the best expert by generating relative\nconfidence scores. We conduct experimentation on common pedestrian trajectory\nprediction datasets and show that our method improves performance on long-tail\nscenarios. We further conduct ablation studies to highlight the contribution of\ndifferent proposed components.\n","authors":["Ray Coden Mercurius","Ehsan Ahmadi","Soheil Mohamad Alizadeh Shabestary","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2402.08698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09278v2","updated":"2024-04-26T17:51:20Z","published":"2023-06-15T16:59:42Z","title":"Robustness Analysis on Foundational Segmentation Models","summary":" Due to the increase in computational resources and accessibility of data, an\nincrease in large, deep learning models trained on copious amounts of\nmulti-modal data using self-supervised or semi-supervised learning have\nemerged. These ``foundation'' models are often adapted to a variety of\ndownstream tasks like classification, object detection, and segmentation with\nlittle-to-no training on the target dataset. In this work, we perform a\nrobustness analysis of Visual Foundation Models (VFMs) for segmentation tasks\nand focus on robustness against real-world distribution shift inspired\nperturbations. We benchmark seven state-of-the-art segmentation architectures\nusing 2 different perturbed datasets, MS COCO-P and ADE20K-P, with 17 different\nperturbations with 5 severity levels each. Our findings reveal several key\ninsights: (1) VFMs exhibit vulnerabilities to compression-induced corruptions,\n(2) despite not outpacing all of unimodal models in robustness, multimodal\nmodels show competitive resilience in zero-shot scenarios, and (3) VFMs\ndemonstrate enhanced robustness for certain object categories. 
These\nobservations suggest that our robustness evaluation framework sets new\nrequirements for foundational models, encouraging further advancements to\nbolster their adaptability and performance. The code and dataset is available\nat: \\url{https://tinyurl.com/fm-robust}.\n","authors":["Madeline Chantry Schiappa","Shehreen Azad","Sachidanand VS","Yunhao Ge","Ondrej Miksik","Yogesh S. Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2306.09278v2.pdf","comment":"This benchmark along with the code and datasets is available at:\n https://tinyurl.com/fm-robust. Accepted at CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.18591v1","updated":"2024-04-26T14:59:42Z","published":"2024-04-26T14:59:42Z","title":"FashionSD-X: Multimodal Fashion Garment Synthesis using Latent Diffusion","summary":" The rapid evolution of the fashion industry increasingly intersects with\ntechnological advancements, particularly through the integration of generative\nAI. This study introduces a novel generative pipeline designed to transform the\nfashion design process by employing latent diffusion models. Utilizing\nControlNet and LoRA fine-tuning, our approach generates high-quality images\nfrom multimodal inputs such as text and sketches. We leverage and enhance\nstate-of-the-art virtual try-on datasets, including Multimodal Dress Code and\nVITON-HD, by integrating sketch data. Our evaluation, utilizing metrics like\nFID, CLIP Score, and KID, demonstrates that our model significantly outperforms\ntraditional stable diffusion models. The results not only highlight the\neffectiveness of our model in generating fashion-appropriate outputs but also\nunderscore the potential of diffusion models in revolutionizing fashion design\nworkflows. This research paves the way for more interactive, personalized, and\ntechnologically enriched methodologies in fashion design and representation,\nbridging the gap between creative vision and practical application.\n","authors":["Abhishek Kumar Singh","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2404.18591v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.17621v1","updated":"2024-04-26T14:25:07Z","published":"2024-04-26T14:25:07Z","title":"Attention-aware non-rigid image registration for accelerated MR imaging","summary":" Accurate motion estimation at high acceleration factors enables rapid\nmotion-compensated reconstruction in Magnetic Resonance Imaging (MRI) without\ncompromising the diagnostic image quality. In this work, we introduce an\nattention-aware deep learning-based framework that can perform non-rigid\npairwise registration for fully sampled and accelerated MRI. We extract local\nvisual representations to build similarity maps between the registered image\npairs at multiple resolution levels and additionally leverage long-range\ncontextual information using a transformer-based module to alleviate\nambiguities in the presence of artifacts caused by undersampling. We combine\nlocal and global dependencies to perform simultaneous coarse and fine motion\nestimation. The proposed method was evaluated on in-house acquired fully\nsampled and accelerated data of 101 patients and 62 healthy subjects undergoing\ncardiac and thoracic MRI. The impact of motion estimation accuracy on the\ndownstream task of motion-compensated reconstruction was analyzed. 
We\ndemonstrate that our model derives reliable and consistent motion fields across\ndifferent sampling trajectories (Cartesian and radial) and acceleration factors\nof up to 16x for cardiac motion and 30x for respiratory motion and achieves\nsuperior image quality in motion-compensated reconstruction qualitatively and\nquantitatively compared to conventional and recent deep learning-based\napproaches. The code is publicly available at\nhttps://github.com/lab-midas/GMARAFT.\n","authors":["Aya Ghoul","Jiazhen Pan","Andreas Lingg","Jens Kübler","Patrick Krumm","Kerstin Hammernik","Daniel Rueckert","Sergios Gatidis","Thomas Küstner"],"pdf_url":"https://arxiv.org/pdf/2404.17621v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.17620v1","updated":"2024-04-26T14:12:37Z","published":"2024-04-26T14:12:37Z","title":"Neural Modes: Self-supervised Learning of Nonlinear Modal Subspaces","summary":" We propose a self-supervised approach for learning physics-based subspaces\nfor real-time simulation. Existing learning-based methods construct subspaces\nby approximating pre-defined simulation data in a purely geometric way.\nHowever, this approach tends to produce high-energy configurations, leads to\nentangled latent space dimensions, and generalizes poorly beyond the training\nset. To overcome these limitations, we propose a self-supervised approach that\ndirectly minimizes the system's mechanical energy during training. We show that\nour method leads to learned subspaces that reflect physical equilibrium\nconstraints, resolve overfitting issues of previous methods, and offer\ninterpretable latent space parameters.\n","authors":["Jiahong Wang","Yinwei Du","Stelian Coros","Bernhard Thomaszewski"],"pdf_url":"https://arxiv.org/pdf/2404.17620v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.17617v1","updated":"2024-04-26T11:47:36Z","published":"2024-04-26T11:47:36Z","title":"Beyond Traditional Threats: A Persistent Backdoor Attack on Federated\n Learning","summary":" Backdoors on federated learning will be diluted by subsequent benign updates.\nThis is reflected in the significant reduction of attack success rate as\niterations increase, ultimately failing. We use a new metric to quantify the\ndegree of this weakened backdoor effect, called attack persistence. Given that\nresearch to improve this performance has not been widely noted,we propose a\nFull Combination Backdoor Attack (FCBA) method. It aggregates more combined\ntrigger information for a more complete backdoor pattern in the global model.\nTrained backdoored global model is more resilient to benign updates, leading to\na higher attack success rate on the test set. We test on three datasets and\nevaluate with two models across various settings. FCBA's persistence\noutperforms SOTA federated learning backdoor attacks. On GTSRB, postattack 120\nrounds, our attack success rate rose over 50% from baseline. The core code of\nour method is available at https://github.com/PhD-TaoLiu/FCBA.\n","authors":["Tao Liu","Yuhang Zhang","Zhu Feng","Zhiqin Yang","Chen Xu","Dapeng Man","Wu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.17617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17610v1","updated":"2024-04-26T05:00:51Z","published":"2024-04-26T05:00:51Z","title":"Regression of Dense Distortion Field from a Single Fingerprint Image","summary":" Skin distortion is a long standing challenge in fingerprint matching, which\ncauses false non-matches. 
Previous studies have shown that the recognition rate\ncan be improved by estimating the distortion field from a distorted fingerprint\nand then rectifying it into a normal fingerprint. However, existing\nrectification methods are based on principal component representation of\ndistortion fields, which is not accurate and are very sensitive to finger pose.\nIn this paper, we propose a rectification method where a self-reference based\nnetwork is utilized to directly estimate the dense distortion field of\ndistorted fingerprint instead of its low dimensional representation. This\nmethod can output accurate distortion fields of distorted fingerprints with\nvarious finger poses and distortion patterns. We conducted experiments on\nFVC2004 DB1\\_A, expanded Tsinghua Distorted Fingerprint database (with\nadditional distorted fingerprints in diverse finger poses and distortion\npatterns) and a latent fingerprint database. Experimental results demonstrate\nthat our proposed method achieves the state-of-the-art rectification\nperformance in terms of distortion field estimation and rectified fingerprint\nmatching.\n","authors":["Xiongjun Guan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17610v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.17148"},{"id":"http://arxiv.org/abs/2008.10796v6","updated":"2024-04-26T15:02:07Z","published":"2020-08-25T03:30:53Z","title":"Deep Variational Network Toward Blind Image Restoration","summary":" Blind image restoration (IR) is a common yet challenging problem in computer\nvision. Classical model-based methods and recent deep learning (DL)-based\nmethods represent two different methodologies for this problem, each with their\nown merits and drawbacks. In this paper, we propose a novel blind image\nrestoration method, aiming to integrate both the advantages of them.\nSpecifically, we construct a general Bayesian generative model for the blind\nIR, which explicitly depicts the degradation process. In this proposed model, a\npixel-wise non-i.i.d. Gaussian distribution is employed to fit the image noise.\nIt is with more flexibility than the simple i.i.d. Gaussian or Laplacian\ndistributions as adopted in most of conventional methods, so as to handle more\ncomplicated noise types contained in the image degradation. To solve the model,\nwe design a variational inference algorithm where all the expected posteriori\ndistributions are parameterized as deep neural networks to increase their model\ncapability. Notably, such an inference algorithm induces a unified framework to\njointly deal with the tasks of degradation estimation and image restoration.\nFurther, the degradation information estimated in the former task is utilized\nto guide the latter IR process. Experiments on two typical blind IR tasks,\nnamely image denoising and super-resolution, demonstrate that the proposed\nmethod achieves superior performance over current state-of-the-arts.\n","authors":["Zongsheng Yue","Hongwei Yong","Qian Zhao","Lei Zhang","Deyu Meng","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2008.10796v6.pdf","comment":"Accepted by TPAMI@2024. 
Code: https://github.com/zsyOAOA/VIRNet"}]},"2024-04-29T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.18930v1","updated":"2024-04-29T17:59:41Z","published":"2024-04-29T17:59:41Z","title":"Hallucination of Multimodal Large Language Models: A Survey","summary":" This survey presents a comprehensive analysis of the phenomenon of\nhallucination in multimodal large language models (MLLMs), also known as Large\nVision-Language Models (LVLMs), which have demonstrated significant\nadvancements and remarkable abilities in multimodal tasks. Despite these\npromising developments, MLLMs often generate outputs that are inconsistent with\nthe visual content, a challenge known as hallucination, which poses substantial\nobstacles to their practical deployment and raises concerns regarding their\nreliability in real-world applications. This problem has attracted increasing\nattention, prompting efforts to detect and mitigate such inaccuracies. We\nreview recent advances in identifying, evaluating, and mitigating these\nhallucinations, offering a detailed overview of the underlying causes,\nevaluation benchmarks, metrics, and strategies developed to address this issue.\nAdditionally, we analyze the current challenges and limitations, formulating\nopen questions that delineate potential pathways for future research. By\ndrawing the granular classification and landscapes of hallucination causes,\nevaluation benchmarks, and mitigation methods, this survey aims to deepen the\nunderstanding of hallucinations in MLLMs and inspire further advancements in\nthe field. Through our thorough and in-depth review, we contribute to the\nongoing dialogue on enhancing the robustness and reliability of MLLMs,\nproviding valuable insights and resources for researchers and practitioners\nalike. Resources are available at:\nhttps://github.com/showlab/Awesome-MLLM-Hallucination.\n","authors":["Zechen Bai","Pichao Wang","Tianjun Xiao","Tong He","Zongbo Han","Zheng Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2404.18930v1.pdf","comment":"140 references"},{"id":"http://arxiv.org/abs/2404.18929v1","updated":"2024-04-29T17:59:30Z","published":"2024-04-29T17:59:30Z","title":"DGE: Direct Gaussian 3D Editing by Consistent Multi-view Editing","summary":" We consider the problem of editing 3D objects and scenes based on open-ended\nlanguage instructions. The established paradigm to solve this problem is to use\na 2D image generator or editor to guide the 3D editing process. However, this\nis often slow as it requires do update a computationally expensive 3D\nrepresentations such as a neural radiance field, and to do so by using\ncontradictory guidance from a 2D model which is inherently not multi-view\nconsistent. We thus introduce the Direct Gaussian Editor (DGE), a method that\naddresses these issues in two ways. First, we modify a given high-quality image\neditor like InstructPix2Pix to be multi-view consistent. We do so by utilizing\na training-free approach which integrates cues from the underlying 3D geometry\nof the scene. Second, given a multi-view consistent edited sequence of images\nof the object, we directly and efficiently optimize the 3D object\nrepresentation, which is based on 3D Gaussian Splatting. 
Because it does not\nrequire to apply edits incrementally and iteratively, DGE is significantly more\nefficient than existing approaches, and comes with other perks such as allowing\nselective editing of parts of the scene.\n","authors":["Minghao Chen","Iro Laina","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2404.18929v1.pdf","comment":"Project Page: https://silent-chen.github.io/DGE/"},{"id":"http://arxiv.org/abs/2404.18928v1","updated":"2024-04-29T17:59:16Z","published":"2024-04-29T17:59:16Z","title":"Stylus: Automatic Adapter Selection for Diffusion Models","summary":" Beyond scaling base models with more data or parameters, fine-tuned adapters\nprovide an alternative way to generate high fidelity, custom images at reduced\ncosts. As such, adapters have been widely adopted by open-source communities,\naccumulating a database of over 100K adapters-most of which are highly\ncustomized with insufficient descriptions. This paper explores the problem of\nmatching the prompt to a set of relevant adapters, built on recent work that\nhighlight the performance gains of composing adapters. We introduce Stylus,\nwhich efficiently selects and automatically composes task-specific adapters\nbased on a prompt's keywords. Stylus outlines a three-stage approach that first\nsummarizes adapters with improved descriptions and embeddings, retrieves\nrelevant adapters, and then further assembles adapters based on prompts'\nkeywords by checking how well they fit the prompt. To evaluate Stylus, we\ndeveloped StylusDocs, a curated dataset featuring 75K adapters with\npre-computed adapter embeddings. In our evaluation on popular Stable Diffusion\ncheckpoints, Stylus achieves greater CLIP-FID Pareto efficiency and is twice as\npreferred, with humans and multimodal models as evaluators, over the base\nmodel. See stylus-diffusion.github.io for more.\n","authors":["Michael Luo","Justin Wong","Brandon Trabucco","Yanping Huang","Joseph E. Gonzalez","Zhifeng Chen","Ruslan Salakhutdinov","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2404.18928v1.pdf","comment":"Project Website: https://stylus-diffusion.github.io"},{"id":"http://arxiv.org/abs/2404.18926v1","updated":"2024-04-29T17:59:11Z","published":"2024-04-29T17:59:11Z","title":"Point Cloud Models Improve Visual Robustness in Robotic Learners","summary":" Visual control policies can encounter significant performance degradation\nwhen visual conditions like lighting or camera position differ from those seen\nduring training -- often exhibiting sharp declines in capability even for minor\ndifferences. In this work, we examine robustness to a suite of these types of\nvisual changes for RGB-D and point cloud based visual control policies. To\nperform these experiments on both model-free and model-based reinforcement\nlearners, we introduce a novel Point Cloud World Model (PCWM) and point cloud\nbased control policies. Our experiments show that policies that explicitly\nencode point clouds are significantly more robust than their RGB-D\ncounterparts. Further, we find our proposed PCWM significantly outperforms\nprior works in terms of sample efficiency during training. Taken together,\nthese results suggest reasoning about the 3D scene through point clouds can\nimprove performance, reduce learning time, and increase robustness for robotic\nlearners. 
Project Webpage: https://pvskand.github.io/projects/PCWM\n","authors":["Skand Peri","Iain Lee","Chanho Kim","Li Fuxin","Tucker Hermans","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.18926v1.pdf","comment":"Accepted at International Conference on Robotics and Automation, 2024"},{"id":"http://arxiv.org/abs/2404.18924v1","updated":"2024-04-29T17:59:02Z","published":"2024-04-29T17:59:02Z","title":"Swin2-MoSE: A New Single Image Super-Resolution Model for Remote Sensing","summary":" Due to the limitations of current optical and sensor technologies and the\nhigh cost of updating them, the spectral and spatial resolution of satellites\nmay not always meet desired requirements. For these reasons, Remote-Sensing\nSingle-Image Super-Resolution (RS-SISR) techniques have gained significant\ninterest. In this paper, we propose Swin2-MoSE model, an enhanced version of\nSwin2SR. Our model introduces MoE-SM, an enhanced Mixture-of-Experts (MoE) to\nreplace the Feed-Forward inside all Transformer block. MoE-SM is designed with\nSmart-Merger, and new layer for merging the output of individual experts, and\nwith a new way to split the work between experts, defining a new per-example\nstrategy instead of the commonly used per-token one. Furthermore, we analyze\nhow positional encodings interact with each other, demonstrating that\nper-channel bias and per-head bias can positively cooperate. Finally, we\npropose to use a combination of Normalized-Cross-Correlation (NCC) and\nStructural Similarity Index Measure (SSIM) losses, to avoid typical MSE loss\nlimitations. Experimental results demonstrate that Swin2-MoSE outperforms SOTA\nby up to 0.377 ~ 0.958 dB (PSNR) on task of 2x, 3x and 4x resolution-upscaling\n(Sen2Venus and OLI2MSI datasets). We show the efficacy of Swin2-MoSE, applying\nit to a semantic segmentation task (SeasoNet dataset). Code and pretrained are\navailable on https://github.com/IMPLabUniPr/swin2-mose/tree/official_code\n","authors":["Leonardo Rossi","Vittorio Bernuzzi","Tomaso Fontanini","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2404.18924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18919v1","updated":"2024-04-29T17:58:14Z","published":"2024-04-29T17:58:14Z","title":"TheaterGen: Character Management with LLM for Consistent Multi-turn\n Image Generation","summary":" Recent advances in diffusion models can generate high-quality and stunning\nimages from text. However, multi-turn image generation, which is of high demand\nin real-world scenarios, still faces challenges in maintaining semantic\nconsistency between images and texts, as well as contextual consistency of the\nsame subject across multiple interactive turns. To address this issue, we\nintroduce TheaterGen, a training-free framework that integrates large language\nmodels (LLMs) and text-to-image (T2I) models to provide the capability of\nmulti-turn image generation. Within this framework, LLMs, acting as a\n\"Screenwriter\", engage in multi-turn interaction, generating and managing a\nstandardized prompt book that encompasses prompts and layout designs for each\ncharacter in the target image. Based on these, Theatergen generate a list of\ncharacter images and extract guidance information, akin to the \"Rehearsal\".\nSubsequently, through incorporating the prompt book and guidance information\ninto the reverse denoising process of T2I diffusion models, Theatergen generate\nthe final image, as conducting the \"Final Performance\". 
With the effective\nmanagement of prompt books and character images, TheaterGen significantly\nimproves semantic and contextual consistency in synthesized images.\nFurthermore, we introduce a dedicated benchmark, CMIGBench (Consistent\nMulti-turn Image Generation Benchmark) with 8000 multi-turn instructions.\nDifferent from previous multi-turn benchmarks, CMIGBench does not define\ncharacters in advance. Both the tasks of story generation and multi-turn\nediting are included on CMIGBench for comprehensive evaluation. Extensive\nexperimental results show that TheaterGen outperforms state-of-the-art methods\nsignificantly. It raises the performance bar of the cutting-edge Mini DALLE 3\nmodel by 21% in average character-character similarity and 19% in average\ntext-image similarity.\n","authors":["Junhao Cheng","Baiqiao Yin","Kaixin Cai","Minbin Huang","Hanhui Li","Yuxin He","Xi Lu","Yue Li","Yifei Li","Yuhao Cheng","Yiqiang Yan","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2404.18919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18259v3","updated":"2024-04-29T17:55:15Z","published":"2023-11-30T05:21:07Z","title":"Ego-Exo4D: Understanding Skilled Human Activity from First- and\n Third-Person Perspectives","summary":" We present Ego-Exo4D, a diverse, large-scale multimodal multiview video\ndataset and benchmark challenge. Ego-Exo4D centers around\nsimultaneously-captured egocentric and exocentric video of skilled human\nactivities (e.g., sports, music, dance, bike repair). 740 participants from 13\ncities worldwide performed these activities in 123 different natural scene\ncontexts, yielding long-form captures from 1 to 42 minutes each and 1,286 hours\nof video combined. The multimodal nature of the dataset is unprecedented: the\nvideo is accompanied by multichannel audio, eye gaze, 3D point clouds, camera\nposes, IMU, and multiple paired language descriptions -- including a novel\n\"expert commentary\" done by coaches and teachers and tailored to the\nskilled-activity domain. To push the frontier of first-person video\nunderstanding of skilled human activity, we also present a suite of benchmark\ntasks and their annotations, including fine-grained activity understanding,\nproficiency estimation, cross-view translation, and 3D hand/body pose. All\nresources are open sourced to fuel new research in the community. 
Project page:\nhttp://ego-exo4d-data.org/\n","authors":["Kristen Grauman","Andrew Westbury","Lorenzo Torresani","Kris Kitani","Jitendra Malik","Triantafyllos Afouras","Kumar Ashutosh","Vijay Baiyya","Siddhant Bansal","Bikram Boote","Eugene Byrne","Zach Chavis","Joya Chen","Feng Cheng","Fu-Jen Chu","Sean Crane","Avijit Dasgupta","Jing Dong","Maria Escobar","Cristhian Forigua","Abrham Gebreselasie","Sanjay Haresh","Jing Huang","Md Mohaiminul Islam","Suyog Jain","Rawal Khirodkar","Devansh Kukreja","Kevin J Liang","Jia-Wei Liu","Sagnik Majumder","Yongsen Mao","Miguel Martin","Effrosyni Mavroudi","Tushar Nagarajan","Francesco Ragusa","Santhosh Kumar Ramakrishnan","Luigi Seminara","Arjun Somayazulu","Yale Song","Shan Su","Zihui Xue","Edward Zhang","Jinxu Zhang","Angela Castillo","Changan Chen","Xinzhu Fu","Ryosuke Furuta","Cristina Gonzalez","Prince Gupta","Jiabo Hu","Yifei Huang","Yiming Huang","Weslie Khoo","Anush Kumar","Robert Kuo","Sach Lakhavani","Miao Liu","Mi Luo","Zhengyi Luo","Brighid Meredith","Austin Miller","Oluwatumininu Oguntola","Xiaqing Pan","Penny Peng","Shraman Pramanick","Merey Ramazanova","Fiona Ryan","Wei Shan","Kiran Somasundaram","Chenan Song","Audrey Southerland","Masatoshi Tateno","Huiyu Wang","Yuchen Wang","Takuma Yagi","Mingfei Yan","Xitong Yang","Zecheng Yu","Shengxin Cindy Zha","Chen Zhao","Ziwei Zhao","Zhifan Zhu","Jeff Zhuo","Pablo Arbelaez","Gedas Bertasius","David Crandall","Dima Damen","Jakob Engel","Giovanni Maria Farinella","Antonino Furnari","Bernard Ghanem","Judy Hoffman","C. V. Jawahar","Richard Newcombe","Hyun Soo Park","James M. Rehg","Yoichi Sato","Manolis Savva","Jianbo Shi","Mike Zheng Shou","Michael Wray"],"pdf_url":"https://arxiv.org/pdf/2311.18259v3.pdf","comment":"updated baseline results and dataset statistics to match the released\n v2 data; added table to appendix comparing stats of Ego-Exo4D alongside other\n datasets"},{"id":"http://arxiv.org/abs/2404.16829v2","updated":"2024-04-29T17:48:37Z","published":"2024-04-25T17:59:58Z","title":"Make-it-Real: Unleashing Large Multimodal Model's Ability for Painting\n 3D Objects with Realistic Materials","summary":" Physically realistic materials are pivotal in augmenting the realism of 3D\nassets across various applications and lighting conditions. However, existing\n3D assets and generative models often lack authentic material properties.\nManual assignment of materials using graphic software is a tedious and\ntime-consuming task. In this paper, we exploit advancements in Multimodal Large\nLanguage Models (MLLMs), particularly GPT-4V, to present a novel approach,\nMake-it-Real: 1) We demonstrate that GPT-4V can effectively recognize and\ndescribe materials, allowing the construction of a detailed material library.\n2) Utilizing a combination of visual cues and hierarchical text prompts, GPT-4V\nprecisely identifies and aligns materials with the corresponding components of\n3D objects. 3) The correctly matched materials are then meticulously applied as\nreference for the new SVBRDF material generation according to the original\ndiffuse map, significantly enhancing their visual authenticity. 
Make-it-Real\noffers a streamlined integration into the 3D content creation workflow,\nshowcasing its utility as an essential tool for developers of 3D assets.\n","authors":["Ye Fang","Zeyi Sun","Tong Wu","Jiaqi Wang","Ziwei Liu","Gordon Wetzstein","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2404.16829v2.pdf","comment":"Project Page: https://sunzey.github.io/Make-it-Real/"},{"id":"http://arxiv.org/abs/2312.17670v3","updated":"2024-04-29T17:45:25Z","published":"2023-12-29T16:37:08Z","title":"Benchmarking the CoW with the TopCoW Challenge: Topology-Aware\n Anatomical Segmentation of the Circle of Willis for CTA and MRA","summary":" The Circle of Willis (CoW) is an important network of arteries connecting\nmajor circulations of the brain. Its vascular architecture is believed to\naffect the risk, severity, and clinical outcome of serious neuro-vascular\ndiseases. However, characterizing the highly variable CoW anatomy is still a\nmanual and time-consuming expert task. The CoW is usually imaged by two\nangiographic imaging modalities, magnetic resonance angiography (MRA) and\ncomputed tomography angiography (CTA), but there exist limited public datasets\nwith annotations on CoW anatomy, especially for CTA. Therefore we organized the\nTopCoW Challenge in 2023 with the release of an annotated CoW dataset. The\nTopCoW dataset was the first public dataset with voxel-level annotations for\nthirteen possible CoW vessel components, enabled by virtual-reality (VR)\ntechnology. It was also the first large dataset with paired MRA and CTA from\nthe same patients. TopCoW challenge formalized the CoW characterization problem\nas a multiclass anatomical segmentation task with an emphasis on topological\nmetrics. We invited submissions worldwide for the CoW segmentation task, which\nattracted over 140 registered participants from four continents. The top\nperforming teams managed to segment many CoW components to Dice scores around\n90%, but with lower scores for communicating arteries and rare variants. There\nwere also topological mistakes for predictions with high Dice scores.\nAdditional topological analysis revealed further areas for improvement in\ndetecting certain CoW components and matching CoW variant topology accurately.\nTopCoW represented a first attempt at benchmarking the CoW anatomical\nsegmentation task for MRA and CTA, both morphologically and topologically.\n","authors":["Kaiyuan Yang","Fabio Musio","Yihui Ma","Norman Juchler","Johannes C. Paetzold","Rami Al-Maskari","Luciano Höher","Hongwei Bran Li","Ibrahim Ethem Hamamci","Anjany Sekuboyina","Suprosanna Shit","Houjing Huang","Chinmay Prabhakar","Ezequiel de la Rosa","Diana Waldmannstetter","Florian Kofler","Fernando Navarro","Martin Menten","Ivan Ezhov","Daniel Rueckert","Iris Vos","Ynte Ruigrok","Birgitta Velthuis","Hugo Kuijf","Julien Hämmerli","Catherine Wurster","Philippe Bijlenga","Laura Westphal","Jeroen Bisschop","Elisa Colombo","Hakim Baazaoui","Andrew Makmur","James Hallinan","Bene Wiestler","Jan S. Kirschke","Roland Wiest","Emmanuel Montagnon","Laurent Letourneau-Guillon","Adrian Galdran","Francesco Galati","Daniele Falcetta","Maria A. Zuluaga","Chaolong Lin","Haoran Zhao","Zehan Zhang","Sinyoung Ra","Jongyun Hwang","Hyunjin Park","Junqiang Chen","Marek Wodzinski","Henning Müller","Pengcheng Shi","Wei Liu","Ting Ma","Cansu Yalçin","Rachika E. 
Hamadache","Joaquim Salvi","Xavier Llado","Uma Maria Lal-Trehan Estrada","Valeriia Abramova","Luca Giancardo","Arnau Oliver","Jialu Liu","Haibin Huang","Yue Cui","Zehang Lin","Yusheng Liu","Shunzhi Zhu","Tatsat R. Patel","Vincent M. Tutino","Maysam Orouskhani","Huayu Wang","Mahmud Mossa-Basha","Chengcheng Zhu","Maximilian R. Rokuss","Yannick Kirchhoff","Nico Disch","Julius Holzschuh","Fabian Isensee","Klaus Maier-Hein","Yuki Sato","Sven Hirsch","Susanne Wegener","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2312.17670v3.pdf","comment":"24 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW\n 2023 Challenge"},{"id":"http://arxiv.org/abs/2312.17247v2","updated":"2024-04-29T17:35:27Z","published":"2023-12-28T18:59:41Z","title":"Amodal Ground Truth and Completion in the Wild","summary":" This paper studies amodal image segmentation: predicting entire object\nsegmentation masks including both visible and invisible (occluded) parts. In\nprevious work, the amodal segmentation ground truth on real images is usually\npredicted by manual annotaton and thus is subjective. In contrast, we use 3D\ndata to establish an automatic pipeline to determine authentic ground truth\namodal masks for partially occluded objects in real images. This pipeline is\nused to construct an amodal completion evaluation benchmark, MP3D-Amodal,\nconsisting of a variety of object categories and labels. To better handle the\namodal completion task in the wild, we explore two architecture variants: a\ntwo-stage model that first infers the occluder, followed by amodal mask\ncompletion; and a one-stage model that exploits the representation power of\nStable Diffusion for amodal segmentation across many categories. Without bells\nand whistles, our method achieves a new state-of-the-art performance on Amodal\nsegmentation datasets that cover a large variety of objects, including COCOA\nand our new MP3D-Amodal dataset. The dataset, model, and code are available at\nhttps://www.robots.ox.ac.uk/~vgg/research/amodal/.\n","authors":["Guanqi Zhan","Chuanxia Zheng","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2312.17247v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.18895v1","updated":"2024-04-29T17:31:00Z","published":"2024-04-29T17:31:00Z","title":"RSCaMa: Remote Sensing Image Change Captioning with State Space Model","summary":" Remote Sensing Image Change Captioning (RSICC) aims to identify surface\nchanges in multi-temporal remote sensing images and describe them in natural\nlanguage. Current methods typically rely on an encoder-decoder architecture and\nfocus on designing a sophisticated neck to process bi-temporal features\nextracted by the backbone. Recently, State Space Models (SSMs), especially\nMamba, have demonstrated outstanding performance in many fields, owing to their\nefficient feature-selective modelling capability. However, their potential in\nthe RSICC task remains unexplored. In this paper, we introduce Mamba into RSICC\nand propose a novel approach called RSCaMa (Remote Sensing Change Captioning\nMamba). 
Specifically, we utilize Siamese backbones to extract bi-temporal\nfeatures, which are then processed through multiple CaMa layers consisting of\nSpatial Difference-guided SSM (SD-SSM) and Temporal Traveling SSM (TT-SSM).\nSD-SSM uses differential features to enhance change perception, while TT-SSM\npromotes bitemporal interactions in a token-wise cross-scanning manner.\nExperimental results validate the effectiveness of CaMa layers and demonstrate\nthe superior performance of RSCaMa, as well as the potential of Mamba in the\nRSICC task. Additionally, we systematically compare the effects of three\nlanguage decoders, including Mamba, GPT-style decoder with causal attention\nmechanism, and Transformer decoder with cross-attention mechanism. This\nprovides valuable insights for future RSICC research. The code will be\navailable at https://github.com/Chen-Yang-Liu/RSCaMa\n","authors":["Chenyang Liu","Keyan Chen","Bowen Chen","Haotian Zhang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2404.18895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18891v1","updated":"2024-04-29T17:27:37Z","published":"2024-04-29T17:27:37Z","title":"IPixMatch: Boost Semi-supervised Semantic Segmentation with Inter-Pixel\n Relation","summary":" The scarcity of labeled data in real-world scenarios is a critical bottleneck\nof deep learning's effectiveness. Semi-supervised semantic segmentation has\nbeen a typical solution to achieve a desirable tradeoff between annotation cost\nand segmentation performance. However, previous approaches, whether based on\nconsistency regularization or self-training, tend to neglect the contextual\nknowledge embedded within inter-pixel relations. This negligence leads to\nsuboptimal performance and limited generalization. In this paper, we propose a\nnovel approach IPixMatch designed to mine the neglected but valuable\nInter-Pixel information for semi-supervised learning. Specifically, IPixMatch\nis constructed as an extension of the standard teacher-student network,\nincorporating additional loss terms to capture inter-pixel relations. It shines\nin low-data regimes by efficiently leveraging the limited labeled data and\nextracting maximum utility from the available unlabeled data. Furthermore,\nIPixMatch can be integrated seamlessly into most teacher-student frameworks\nwithout the need of model modification or adding additional components. Our\nstraightforward IPixMatch method demonstrates consistent performance\nimprovements across various benchmark datasets under different partitioning\nprotocols.\n","authors":["Kebin Wu","Wenbin Li","Xiaofei Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.18891v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.18890v1","updated":"2024-04-29T17:27:08Z","published":"2024-04-29T17:27:08Z","title":"Hide and Seek: How Does Watermarking Impact Face Recognition?","summary":" The recent progress in generative models has revolutionized the synthesis of\nhighly realistic images, including face images. This technological development\nhas undoubtedly helped face recognition, such as training data augmentation for\nhigher recognition accuracy and data privacy. However, it has also introduced\nnovel challenges concerning the responsible use and proper attribution of\ncomputer generated images. We investigate the impact of digital watermarking, a\ntechnique for embedding ownership signatures into images, on the effectiveness\nof face recognition models. 
We propose a comprehensive pipeline that integrates\nface image generation, watermarking, and face recognition to systematically\nexamine this question. The proposed watermarking scheme, based on an\nencoder-decoder architecture, successfully embeds and recovers signatures from\nboth real and synthetic face images while preserving their visual fidelity.\nThrough extensive experiments, we unveil that while watermarking enables robust\nimage attribution, it results in a slight decline in face recognition accuracy,\nparticularly evident for face images with challenging poses and expressions.\nAdditionally, we find that directly training face recognition models on\nwatermarked images offers only a limited alleviation of this performance\ndecline. Our findings underscore the intricate trade off between watermarking\nand face recognition accuracy. This work represents a pivotal step towards the\nresponsible utilization of generative models in face recognition and serves to\ninitiate discussions regarding the broader implications of watermarking in\nbiometrics.\n","authors":["Yuguang Yao","Steven Grosz","Sijia Liu","Anil Jain"],"pdf_url":"https://arxiv.org/pdf/2404.18890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12245v3","updated":"2024-04-29T17:19:45Z","published":"2023-09-21T16:43:29Z","title":"Adaptive Input-image Normalization for Solving the Mode Collapse Problem\n in GAN-based X-ray Images","summary":" Biomedical image datasets can be imbalanced due to the rarity of targeted\ndiseases. Generative Adversarial Networks play a key role in addressing this\nimbalance by enabling the generation of synthetic images to augment datasets.\nIt is important to generate synthetic images that incorporate a diverse range\nof features to accurately represent the distribution of features present in the\ntraining imagery. Furthermore, the absence of diverse features in synthetic\nimages can degrade the performance of machine learning classifiers. The mode\ncollapse problem impacts Generative Adversarial Networks' capacity to generate\ndiversified images. Mode collapse comes in two varieties: intra-class and\ninter-class. In this paper, both varieties of the mode collapse problem are\ninvestigated, and their subsequent impact on the diversity of synthetic X-ray\nimages is evaluated. This work contributes an empirical demonstration of the\nbenefits of integrating the adaptive input-image normalization with the Deep\nConvolutional GAN and Auxiliary Classifier GAN to alleviate the mode collapse\nproblems. Synthetically generated images are utilized for data augmentation and\ntraining a Vision Transformer model. The classification performance of the\nmodel is evaluated using accuracy, recall, and precision scores. Results\ndemonstrate that the DCGAN and the ACGAN with adaptive input-image\nnormalization outperform the DCGAN and ACGAN with un-normalized X-ray images as\nevidenced by the superior diversity scores and classification scores.\n","authors":["Muhammad Muneeb Saad","Mubashir Husain Rehmani","Ruairi O'Reilly"],"pdf_url":"https://arxiv.org/pdf/2309.12245v3.pdf","comment":"Submitted to the Elsevier Journal"},{"id":"http://arxiv.org/abs/2404.18876v1","updated":"2024-04-29T17:10:41Z","published":"2024-04-29T17:10:41Z","title":"A Multilevel Strategy to Improve People Tracking in a Real-World\n Scenario","summary":" The Pal\\'acio do Planalto, office of the President of Brazil, was invaded by\nprotesters on January 8, 2023. 
Surveillance videos taken from inside the\nbuilding were subsequently released by the Brazilian Supreme Court for public\nscrutiny. We used segments of such footage to create the UFPR-Planalto801\ndataset for people tracking and re-identification in a real-world scenario.\nThis dataset consists of more than 500,000 images. This paper presents a\ntracking approach targeting this dataset. The method proposed in this paper\nrelies on the use of known state-of-the-art trackers combined in a multilevel\nhierarchy to correct the ID association over the trajectories. We evaluated our\nmethod using IDF1, MOTA, MOTP and HOTA metrics. The results show improvements\nfor every tracker used in the experiments, with IDF1 score increasing by a\nmargin up to 9.5%.\n","authors":["Cristiano B. de Oliveira","Joao C. Neves","Rafael O. Ribeiro","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2404.18876v1.pdf","comment":"Accepted for presentation at the International Conference on Computer\n Vision Theory and Applications (VISAPP) 2024"},{"id":"http://arxiv.org/abs/2404.18873v1","updated":"2024-04-29T17:06:44Z","published":"2024-04-29T17:06:44Z","title":"OpenStreetView-5M: The Many Roads to Global Visual Geolocation","summary":" Determining the location of an image anywhere on Earth is a complex visual\ntask, which makes it particularly relevant for evaluating computer vision\nalgorithms. Yet, the absence of standard, large-scale, open-access datasets\nwith reliably localizable images has limited its potential. To address this\nissue, we introduce OpenStreetView-5M, a large-scale, open-access dataset\ncomprising over 5.1 million geo-referenced street view images, covering 225\ncountries and territories. In contrast to existing benchmarks, we enforce a\nstrict train/test separation, allowing us to evaluate the relevance of learned\ngeographical features beyond mere memorization. To demonstrate the utility of\nour dataset, we conduct an extensive benchmark of various state-of-the-art\nimage encoders, spatial representations, and training strategies. All\nassociated codes and models can be found at https://github.com/gastruc/osv5m.\n","authors":["Guillaume Astruc","Nicolas Dufour","Ioannis Siglidis","Constantin Aronssohn","Nacim Bouia","Stephanie Fu","Romain Loiseau","Van Nguyen Nguyen","Charles Raude","Elliot Vincent","Lintao XU","Hongyu Zhou","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2404.18873v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.18861v1","updated":"2024-04-29T16:51:30Z","published":"2024-04-29T16:51:30Z","title":"A Survey on Vision Mamba: Models, Applications and Challenges","summary":" Mamba, a recent selective structured state space model, performs excellently\non long sequence modeling tasks. Mamba mitigates the modeling constraints of\nconvolutional neural networks and offers advanced modeling capabilities similar\nto those of Transformers, through global receptive fields and dynamic\nweighting. Crucially, it achieves this without incurring the quadratic\ncomputational complexity typically associated with Transformers. Due to its\nadvantages over the former two mainstream foundation models, Mamba exhibits\ngreat potential to be a visual foundation model. Researchers are actively\napplying Mamba to various computer vision tasks, leading to numerous emerging\nworks. To help keep pace with the rapid advancements in computer vision, this\npaper aims to provide a comprehensive review of visual Mamba approaches. 
This\npaper begins by delineating the formulation of the original Mamba model.\nSubsequently, our review of visual Mamba delves into several representative\nbackbone networks to elucidate the core insights of the visual Mamba. We then\ncategorize related works using different modalities, including image, video,\npoint cloud, multi-modal, and others. Specifically, for image applications, we\nfurther organize them into distinct tasks to facilitate a more structured\ndiscussion. Finally, we discuss the challenges and future research directions\nfor visual Mamba, providing insights for future research in this quickly\nevolving area. A comprehensive list of visual Mamba models reviewed in this\nwork is available at https://github.com/Ruixxxx/Awesome-Vision-Mamba-Models.\n","authors":["Rui Xu","Shu Yang","Yihui Wang","Bo Du","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18849v1","updated":"2024-04-29T16:42:58Z","published":"2024-04-29T16:42:58Z","title":"MiPa: Mixed Patch Infrared-Visible Modality Agnostic Object Detection","summary":" In this paper, we present a different way to use two modalities, in which\neither one modality or the other is seen by a single model. This can be useful\nwhen adapting an unimodal model to leverage more information while respecting a\nlimited computational budget. This would mean having a single model that is\nable to deal with any modalities. To describe this, we coined the term anymodal\nlearning. An example of this, is a use case where, surveillance in a room when\nthe lights are off would be much more valuable using an infrared modality while\na visible one would provide more discriminative information when lights are on.\nThis work investigates how to efficiently leverage visible and infrared/thermal\nmodalities for transformer-based object detection backbone to create an\nanymodal architecture. Our work does not create any inference overhead during\nthe testing while exploring an effective way to exploit the two modalities\nduring the training. To accomplish such a task, we introduce the novel anymodal\ntraining technique: Mixed Patches (MiPa), in conjunction with a patch-wise\ndomain agnostic module, which is responsible of learning the best way to find a\ncommon representation of both modalities. This approach proves to be able to\nbalance modalities by reaching competitive results on individual modality\nbenchmarks with the alternative of using an unimodal architecture on three\ndifferent visible-infrared object detection datasets. Finally, our proposed\nmethod, when used as a regularization for the strongest modality, can beat the\nperformance of multimodal fusion methods while only requiring a single modality\nduring inference. Notably, MiPa became the state-of-the-art on the LLVIP\nvisible/infrared benchmark. Code: https://github.com/heitorrapela/MiPa\n","authors":["Heitor R. Medeiros","David Latortue","Fidel Guerrero Pena","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.18849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18842v1","updated":"2024-04-29T16:30:24Z","published":"2024-04-29T16:30:24Z","title":"VISION: Toward a Standardized Process for Radiology Image Management at\n the National Level","summary":" The compilation and analysis of radiological images poses numerous challenges\nfor researchers. The sheer volume of data as well as the computational needs of\nalgorithms capable of operating on images are extensive. 
Additionally, the\nassembly of these images alone is difficult, as these exams may differ widely\nin terms of clinical context, structured annotation available for model\ntraining, modality, and patient identifiers. In this paper, we describe our\nexperiences and challenges in establishing a trusted collection of radiology\nimages linked to the United States Department of Veterans Affairs (VA)\nelectronic health record database. We also discuss implications in making this\nrepository research-ready for medical investigators. Key insights include\nuncovering the specific procedures required for transferring images from a\nclinical to a research-ready environment, as well as roadblocks and bottlenecks\nin this process that may hinder future efforts at automation.\n","authors":["Kathryn Knight","Ioana Danciu","Olga Ovchinnikova","Jacob Hinkle","Mayanka Chandra Shekar","Debangshu Mukherjee","Eileen McAllister","Caitlin Rizy","Kelly Cho","Amy C. Justice","Joseph Erdos","Peter Kuzmak","Lauren Costa","Yuk-Lam Ho","Reddy Madipadga","Suzanne Tamang","Ian Goethert"],"pdf_url":"https://arxiv.org/pdf/2404.18842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08513v5","updated":"2024-04-29T16:20:23Z","published":"2023-09-15T16:19:09Z","title":"SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient\n Channels","summary":" Pre-trained vision transformers have strong representation benefits to\nvarious downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)\nmethods have been proposed, and their experiments demonstrate that tuning only\n1\\% extra parameters could surpass full fine-tuning in low-data resource\nscenarios. However, these methods overlook the task-specific information when\nfine-tuning diverse downstream tasks. In this paper, we propose a simple yet\neffective method called \"Salient Channel Tuning\" (SCT) to leverage the\ntask-specific information by forwarding the model with the task images to\nselect partial channels in a feature map that enables us to tune only 1/8\nchannels leading to significantly lower parameter costs. Experiments on 19\nvisual transfer learning downstream tasks demonstrate that our SCT outperforms\nfull fine-tuning on 18 out of 19 tasks by adding only 0.11M parameters of the\nViT-B, which is 780$\\times$ fewer than its full fine-tuning counterpart.\nFurthermore, experiments on domain generalization and few-shot classification\nfurther demonstrate the effectiveness and generic of our approach. The code is\navailable at https://github.com/showlab/SCT.\n","authors":["Henry Hengyuan Zhao","Pichao Wang","Yuyang Zhao","Hao Luo","Fan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.08513v5.pdf","comment":"This work has been accepted by IJCV"},{"id":"http://arxiv.org/abs/2404.18831v1","updated":"2024-04-29T16:16:42Z","published":"2024-04-29T16:16:42Z","title":"ConPro: Learning Severity Representation for Medical Images using\n Contrastive Learning and Preference Optimization","summary":" Understanding the severity of conditions shown in images in medical diagnosis\nis crucial, serving as a key guide for clinical assessment, treatment, as well\nas evaluating longitudinal progression. This paper proposes Con- PrO: a novel\nrepresentation learning method for severity assessment in medical images using\nContrastive learningintegrated Preference Optimization. 
Different from\nconventional contrastive learning methods that maximize the distance between\nclasses, ConPrO injects into the latent vector the distance preference\nknowledge between various severity classes and the normal class. We\nsystematically examine the key components of our framework to illuminate how\ncontrastive prediction tasks acquire valuable representations. We show that our\nrepresentation learning framework offers valuable severity ordering in the\nfeature space while outperforming previous state-of-the-art methods on\nclassification tasks. We achieve a 6% and 20% relative improvement compared to\na supervised and a self-supervised baseline, respectively. In addition, we\nderived discussions on severity indicators and related applications of\npreference comparison in the medical domain.\n","authors":["Hong Nguyen","Hoang Nguyen","Melinda Chang","Hieu Pham","Shrikanth Narayanan","Michael Pazzani"],"pdf_url":"https://arxiv.org/pdf/2404.18831v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2404.18825v1","updated":"2024-04-29T16:07:36Z","published":"2024-04-29T16:07:36Z","title":"Harmonic Machine Learning Models are Robust","summary":" We introduce Harmonic Robustness, a powerful and intuitive method to test the\nrobustness of any machine-learning model either during training or in black-box\nreal-time inference monitoring without ground-truth labels. It is based on\nfunctional deviation from the harmonic mean value property, indicating\ninstability and lack of explainability. We show implementation examples in\nlow-dimensional trees and feedforward NNs, where the method reliably identifies\noverfitting, as well as in more complex high-dimensional models such as\nResNet-50 and Vision Transformer where it efficiently measures adversarial\nvulnerability across image classes.\n","authors":["Nicholas S. Kersting","Yi Li","Aman Mohanty","Oyindamola Obisesan","Raphael Okochu"],"pdf_url":"https://arxiv.org/pdf/2404.18825v1.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.18820v1","updated":"2024-04-29T16:02:38Z","published":"2024-04-29T16:02:38Z","title":"Towards Extreme Image Compression with Latent Feature Guidance and\n Diffusion Prior","summary":" Compressing images at extremely low bitrates (below 0.1 bits per pixel (bpp))\nis a significant challenge due to substantial information loss. Existing\nextreme image compression methods generally suffer from heavy compression\nartifacts or low-fidelity reconstructions. To address this problem, we propose\na novel extreme image compression framework that combines compressive VAEs and\npre-trained text-to-image diffusion models in an end-to-end manner.\nSpecifically, we introduce a latent feature-guided compression module based on\ncompressive VAEs. This module compresses images and initially decodes the\ncompressed information into content variables. To enhance the alignment between\ncontent variables and the diffusion space, we introduce external guidance to\nmodulate intermediate feature maps. Subsequently, we develop a conditional\ndiffusion decoding module that leverages pre-trained diffusion models to\nfurther decode these content variables. To preserve the generative capability\nof pre-trained diffusion models, we keep their parameters fixed and use a\ncontrol module to inject content information. We also design a space alignment\nloss to provide sufficient constraints for the latent feature-guided\ncompression module. 
Extensive experiments demonstrate that our method\noutperforms state-of-the-art approaches in terms of both visual performance and\nimage fidelity at extremely low bitrates.\n","authors":["Zhiyuan Li","Yanhui Zhou","Hao Wei","Chenyang Ge","Jingwen Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.18820v1.pdf","comment":"Submitted to IEEE TCSVT"},{"id":"http://arxiv.org/abs/2404.18801v1","updated":"2024-04-29T15:40:40Z","published":"2024-04-29T15:40:40Z","title":"A Partial Replication of MaskFormer in TensorFlow on TPUs for the\n TensorFlow Model Garden","summary":" This paper undertakes the task of replicating the MaskFormer model a\nuniversal image segmentation model originally developed using the PyTorch\nframework, within the TensorFlow ecosystem, specifically optimized for\nexecution on Tensor Processing Units (TPUs). Our implementation exploits the\nmodular constructs available within the TensorFlow Model Garden (TFMG),\nencompassing elements such as the data loader, training orchestrator, and\nvarious architectural components, tailored and adapted to meet the\nspecifications of the MaskFormer model. We address key challenges encountered\nduring the replication, non-convergence issues, slow training, adaptation of\nloss functions, and the integration of TPU-specific functionalities. We verify\nour reproduced implementation and present qualitative results on the COCO\ndataset. Although our implementation meets some of the objectives for\nend-to-end reproducibility, we encountered challenges in replicating the\nPyTorch version of MaskFormer in TensorFlow. This replication process is not\nstraightforward and requires substantial engineering efforts. Specifically, it\nnecessitates the customization of various components within the TFMG, alongside\nthorough verification and hyper-parameter tuning. The replication is available\nat:\nhttps://github.com/PurdueDualityLab/tf-maskformer/tree/main/official/projects/maskformer\n","authors":["Vishal Purohit","Wenxin Jiang","Akshath R. Ravikiran","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2404.18801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16759v2","updated":"2024-04-29T15:21:35Z","published":"2023-11-28T13:02:33Z","title":"Gradient-based Local Next-best-view Planning for Improved Perception of\n Targeted Plant Nodes","summary":" Robots are increasingly used in tomato greenhouses to automate\nlabour-intensive tasks such as selective harvesting and de-leafing. To perform\nthese tasks, robots must be able to accurately and efficiently perceive the\nplant nodes that need to be cut, despite the high levels of occlusion from\nother plant parts. We formulate this problem as a local next-best-view (NBV)\nplanning task where the robot has to plan an efficient set of camera viewpoints\nto overcome occlusion and improve the quality of perception. Our formulation\nfocuses on quickly improving the perception accuracy of a single target node to\nmaximise its chances of being cut. Previous methods of NBV planning mostly\nfocused on global view planning and used random sampling of candidate\nviewpoints for exploration, which could suffer from high computational costs,\nineffective view selection due to poor candidates, or non-smooth trajectories\ndue to inefficient sampling. 
We propose a gradient-based NBV planner using\ndifferential ray sampling, which directly estimates the local gradient\ndirection for viewpoint planning to overcome occlusion and improve perception.\nThrough simulation experiments, we showed that our planner can handle\nocclusions and improve the 3D reconstruction and position estimation of nodes\nequally well as a sampling-based NBV planner, while taking ten times less\ncomputation and generating 28% more efficient trajectories.\n","authors":["Akshay K. Burusa","Eldert J. van Henten","Gert Kootstra"],"pdf_url":"https://arxiv.org/pdf/2311.16759v2.pdf","comment":"This work has been accepted for the 2024 International Conference on\n Robotics and Automation (ICRA 2024)"},{"id":"http://arxiv.org/abs/2307.06281v4","updated":"2024-04-29T15:21:19Z","published":"2023-07-12T16:23:09Z","title":"MMBench: Is Your Multi-modal Model an All-around Player?","summary":" Large vision-language models have recently achieved remarkable progress,\nexhibiting great perception and reasoning abilities concerning visual\ninformation. However, how to effectively evaluate these large vision-language\nmodels remains a major obstacle, hindering future model development.\nTraditional benchmarks like VQAv2 or COCO Caption provide quantitative\nperformance measurements but suffer from a lack of fine-grained ability\nassessment and non-robust evaluation metrics. Recent subjective benchmarks,\nsuch as OwlEval, offer comprehensive evaluations of a model's abilities by\nincorporating human labor, but they are not scalable and display significant\nbias. In response to these challenges, we propose MMBench, a novel\nmulti-modality benchmark. MMBench methodically develops a comprehensive\nevaluation pipeline, primarily comprised of two elements. The first element is\na meticulously curated dataset that surpasses existing similar benchmarks in\nterms of the number and variety of evaluation questions and abilities. The\nsecond element introduces a novel CircularEval strategy and incorporates the\nuse of ChatGPT. This implementation is designed to convert free-form\npredictions into pre-defined choices, thereby facilitating a more robust\nevaluation of the model's predictions. MMBench is a systematically-designed\nobjective benchmark for robustly evaluating the various abilities of\nvision-language models. We hope MMBench will assist the research community in\nbetter evaluating their models and encourage future advancements in this\ndomain. Project page: https://opencompass.org.cn/mmbench.\n","authors":["Yuan Liu","Haodong Duan","Yuanhan Zhang","Bo Li","Songyang Zhang","Wangbo Zhao","Yike Yuan","Jiaqi Wang","Conghui He","Ziwei Liu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2307.06281v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15009v3","updated":"2024-04-29T15:19:07Z","published":"2024-04-23T13:15:22Z","title":"The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus\n on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs)","summary":" Pediatric tumors of the central nervous system are the most common cause of\ncancer-related death in children. The five-year survival rate for high-grade\ngliomas in children is less than 20%. Due to their rarity, the diagnosis of\nthese entities is often delayed, their treatment is mainly based on historic\ntreatment concepts, and clinical trials require multi-institutional\ncollaborations. 
Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs\nchallenge, focused on pediatric brain tumors with data acquired across multiple\ninternational consortia dedicated to pediatric neuro-oncology and clinical\ntrials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together\nclinicians and AI/imaging scientists to lead to faster development of automated\nsegmentation techniques that could benefit clinical trials, and ultimately the\ncare of children with brain tumors.\n","authors":["Anahita Fathi Kazerooni","Nastaran Khalili","Deep Gandhi","Xinyang Liu","Zhifan Jiang","Syed Muhammed Anwar","Jake Albrecht","Maruf Adewole","Udunna Anazodo","Hannah Anderson","Sina Bagheri","Ujjwal Baid","Timothy Bergquist","Austin J. Borja","Evan Calabrese","Verena Chung","Gian-Marco Conte","Farouk Dako","James Eddy","Ivan Ezhov","Ariana Familiar","Keyvan Farahani","Anurag Gottipati","Debanjan Haldar","Shuvanjan Haldar","Juan Eugenio Iglesias","Anastasia Janas","Elaine Johansen","Blaise V Jones","Neda Khalili","Florian Kofler","Dominic LaBella","Hollie Anne Lai","Koen Van Leemput","Hongwei Bran Li","Nazanin Maleki","Aaron S McAllister","Zeke Meier","Bjoern Menze","Ahmed W Moawad","Khanak K Nandolia","Julija Pavaine","Marie Piraud","Tina Poussaint","Sanjay P Prabhu","Zachary Reitman","Andres Rodriguez","Jeffrey D Rudie","Mariana Sanchez-Montano","Ibraheem Salman Shaikh","Lubdha M. Shah","Nakul Sheth","Russel Taki Shinohara","Wenxin Tu","Karthik Viswanathan","Chunhao Wang","Jeffrey B Ware","Benedikt Wiestler","Walter Wiggins","Anna Zapaishchykova","Mariam Aboian","Miriam Bornhorst","Peter de Blank","Michelle Deutsch","Maryam Fouladi","Lindsey Hoffman","Benjamin Kann","Margot Lazow","Leonie Mikael","Ali Nabavizadeh","Roger Packer","Spyridon Bakas","Adam Resnick","Brian Rood","Arastoo Vossough","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2404.15009v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2305.17033"},{"id":"http://arxiv.org/abs/2404.18772v1","updated":"2024-04-29T15:05:42Z","published":"2024-04-29T15:05:42Z","title":"Saliency Suppressed, Semantics Surfaced: Visual Transformations in\n Neural Networks and the Brain","summary":" Deep learning algorithms lack human-interpretable accounts of how they\ntransform raw visual input into a robust semantic understanding, which impedes\ncomparisons between different architectures, training objectives, and the human\nbrain. In this work, we take inspiration from neuroscience and employ\nrepresentational approaches to shed light on how neural networks encode\ninformation at low (visual saliency) and high (semantic similarity) levels of\nabstraction. Moreover, we introduce a custom image dataset where we\nsystematically manipulate salient and semantic information. We find that\nResNets are more sensitive to saliency information than ViTs, when trained with\nobject classification objectives. We uncover that networks suppress saliency in\nearly layers, a process enhanced by natural language supervision (CLIP) in\nResNets. CLIP also enhances semantic encoding in both architectures. 
Finally,\nwe show that semantic encoding is a key factor in aligning AI with human visual\nperception, while saliency suppression is a non-brain-like strategy.\n","authors":["Gustaw Opiełka","Jessica Loke","Steven Scholte"],"pdf_url":"https://arxiv.org/pdf/2404.18772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18763v1","updated":"2024-04-29T15:01:09Z","published":"2024-04-29T15:01:09Z","title":"From Density to Geometry: YOLOv8 Instance Segmentation for Reverse\n Engineering of Optimized Structures","summary":" This paper introduces YOLOv8-TO, a novel approach for reverse engineering of\ntopology-optimized structures into interpretable geometric parameters using the\nYOLOv8 instance segmentation model. Density-based topology optimization methods\nrequire post-processing to convert the optimal density distribution into a\nparametric representation for design exploration and integration with CAD\ntools. Traditional methods such as skeletonization struggle with complex\ngeometries and require manual intervention. YOLOv8-TO addresses these\nchallenges by training a custom YOLOv8 model to automatically detect and\nreconstruct structural components from binary density distributions. The model\nis trained on a diverse dataset of both optimized and random structures\ngenerated using the Moving Morphable Components method. A custom reconstruction\nloss function based on the dice coefficient of the predicted geometry is used\nto train the new regression head of the model via self-supervised learning. The\nmethod is evaluated on test sets generated from different topology optimization\nmethods, including out-of-distribution samples, and compared against a\nskeletonization approach. Results show that YOLOv8-TO significantly outperforms\nskeletonization in reconstructing visually and structurally similar designs.\nThe method showcases an average improvement of 13.84% in the Dice coefficient,\nwith peak enhancements reaching 20.78%. The method demonstrates good\ngeneralization to complex geometries and fast inference times, making it\nsuitable for integration into design workflows using regular workstations.\nLimitations include the sensitivity to non-max suppression thresholds.\nYOLOv8-TO represents a significant advancement in topology optimization\npost-processing, enabling efficient and accurate reverse engineering of\noptimized structures for design exploration and manufacturing.\n","authors":["Thomas Rochefort-Beaudoin","Aurelian Vadean","Sofiane Achiche","Niels Aage"],"pdf_url":"https://arxiv.org/pdf/2404.18763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16681v2","updated":"2024-04-29T14:58:16Z","published":"2023-11-28T10:53:26Z","title":"Understanding the (Extra-)Ordinary: Validating Deep Model Decisions with\n Prototypical Concept-based Explanations","summary":" Ensuring both transparency and safety is critical when deploying Deep Neural\nNetworks (DNNs) in high-risk applications, such as medicine. The field of\nexplainable AI (XAI) has proposed various methods to comprehend the\ndecision-making processes of opaque DNNs. However, only few XAI methods are\nsuitable of ensuring safety in practice as they heavily rely on repeated\nlabor-intensive and possibly biased human assessment. 
In this work, we present\na novel post-hoc concept-based XAI framework that conveys besides instance-wise\n(local) also class-wise (global) decision-making strategies via prototypes.\nWhat sets our approach apart is the combination of local and global strategies,\nenabling a clearer understanding of the (dis-)similarities in model decisions\ncompared to the expected (prototypical) concept use, ultimately reducing the\ndependence on human long-term assessment. Quantifying the deviation from\nprototypical behavior not only allows to associate predictions with specific\nmodel sub-strategies but also to detect outlier behavior. As such, our approach\nconstitutes an intuitive and explainable tool for model validation. We\ndemonstrate the effectiveness of our approach in identifying\nout-of-distribution samples, spurious model behavior and data quality issues\nacross three datasets (ImageNet, CUB-200, and CIFAR-10) utilizing VGG, ResNet,\nand EfficientNet architectures. Code is available on\nhttps://github.com/maxdreyer/pcx.\n","authors":["Maximilian Dreyer","Reduan Achtibat","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2311.16681v2.pdf","comment":"39 pages (8 pages manuscript, 3 pages references, 28 pages appendix)"},{"id":"http://arxiv.org/abs/2404.18760v1","updated":"2024-04-29T14:57:16Z","published":"2024-04-29T14:57:16Z","title":"Flow AM: Generating Point Cloud Global Explanations by Latent Alignment","summary":" Although point cloud models have gained significant improvements in\nprediction accuracy over recent years, their trustworthiness is still not\nsufficiently investigated. In terms of global explainability, Activation\nMaximization (AM) techniques in the image domain are not directly\ntransplantable due to the special structure of the point cloud models. Existing\nstudies exploit generative models to yield global explanations that can be\nperceived by humans. However, the opacity of the generative models themselves\nand the introduction of additional priors call into question the plausibility\nand fidelity of the explanations. In this work, we demonstrate that when the\nclassifier predicts different types of instances, the intermediate layer\nactivations are differently activated, known as activation flows. Based on this\nproperty, we propose an activation flow-based AM method that generates global\nexplanations that can be perceived without incorporating any generative model.\nFurthermore, we reveal that AM based on generative models fails the sanity\nchecks and thus lack of fidelity. Extensive experiments show that our approach\ndramatically enhances the perceptibility of explanations compared to other AM\nmethods that are not based on generative models. Our code is available at:\nhttps://github.com/Explain3D/FlowAM\n","authors":["Hanxiao Tan"],"pdf_url":"https://arxiv.org/pdf/2404.18760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18758v1","updated":"2024-04-29T14:56:11Z","published":"2024-04-29T14:56:11Z","title":"Transitive Vision-Language Prompt Learning for Domain Generalization","summary":" The vision-language pre-training has enabled deep models to make a huge step\nforward in generalizing across unseen domains. The recent learning method based\non the vision-language pre-training model is a great tool for domain\ngeneralization and can solve this problem to a large extent. 
However, there are\nstill some issues that an advancement still suffers from trading-off between\ndomain invariance and class separability, which are crucial in current DG\nproblems. In this paper, we introduce a novel\nprompt learning strategy that leverages deep vision prompts to address domain\ninvariance while utilizing language prompts to ensure class separability,\ncoupled with adaptive weighting mechanisms to balance domain invariance and\nclass separability. Extensive experiments demonstrate that deep vision prompts\neffectively extract domain-invariant features, significantly improving the\ngeneralization ability of deep models and achieving state-of-the-art\nperformance on three datasets.\n","authors":["Liyuan Wang","Yan Jin","Zhen Chen","Jinlin Wu","Mengke Li","Yang Lu","Hanzi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.18758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16994v2","updated":"2024-04-29T14:52:02Z","published":"2024-04-25T19:29:55Z","title":"PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video\n Dense Captioning","summary":" Vision-language pre-training has significantly elevated performance across a\nwide range of image-language applications. Yet, the pre-training process for\nvideo-related tasks demands exceptionally large computational and data\nresources, which hinders the progress of video-language models. This paper\ninvestigates a straight-forward, highly efficient, and resource-light approach\nto adapting an existing image-language pre-trained model for dense video\nunderstanding. Our preliminary experiments reveal that directly fine-tuning\npre-trained image-language models with multiple frames as inputs on video\ndatasets leads to performance saturation or even a drop. Our further\ninvestigation reveals that it is largely attributed to the bias of learned\nhigh-norm visual features. Motivated by this finding, we propose a simple but\neffective pooling strategy to smooth the feature distribution along the\ntemporal dimension and thus reduce the dominant impacts from the extreme\nfeatures. The new model is termed Pooling LLaVA, or PLLaVA in short. PLLaVA\nachieves new state-of-the-art performance on modern benchmark datasets for both\nvideo question-answer and captioning tasks. Notably, on the recent popular\nVideoChatGPT benchmark, PLLaVA achieves a score of 3.48 out of 5 on average of\nfive evaluated dimensions, exceeding the previous SOTA results from GPT4V\n(IG-VLM) by 9%. On the latest multi-choice benchmark MVBench, PLLaVA achieves\n58.1% accuracy on average across 20 sub-tasks, 14.5% higher than GPT4V\n(IG-VLM). Code is available at https://pllava.github.io/\n","authors":["Lin Xu","Yilin Zhao","Daquan Zhou","Zhijie Lin","See Kiong Ng","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2404.16994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18750v1","updated":"2024-04-29T14:49:35Z","published":"2024-04-29T14:49:35Z","title":"Survey on Datasets for Perception in Unstructured Outdoor Environments","summary":" Perception is an essential component of pipelines in field robotics. In this\nsurvey, we quantitatively compare publicly available datasets available in\nunstructured outdoor environments. We focus on datasets for common perception\ntasks in field robotics. Our survey categorizes and compares available research\ndatasets. 
This survey also reports on relevant dataset characteristics to help\npractitioners determine which dataset fits best for their own application. We\nbelieve more consideration should be taken in choosing compatible annotation\npolicies across the datasets in unstructured outdoor environments.\n","authors":["Peter Mortimer","Mirko Maehlisch"],"pdf_url":"https://arxiv.org/pdf/2404.18750v1.pdf","comment":"Accepted to the IEEE ICRA Workshop on Field Robotics 2024"},{"id":"http://arxiv.org/abs/2404.18747v1","updated":"2024-04-29T14:47:32Z","published":"2024-04-29T14:47:32Z","title":"Evaluating the Effectiveness of Video Anomaly Detection in the Wild:\n Online Learning and Inference for Real-world Deployment","summary":" Video Anomaly Detection (VAD) identifies unusual activities in video streams,\na key technology with broad applications ranging from surveillance to\nhealthcare. Tackling VAD in real-life settings poses significant challenges due\nto the dynamic nature of human actions, environmental variations, and domain\nshifts. Many research initiatives neglect these complexities, often\nconcentrating on traditional testing methods that fail to account for\nperformance on unseen datasets, creating a gap between theoretical models and\ntheir real-world utility. Online learning is a potential strategy to mitigate\nthis issue by allowing models to adapt to new information continuously. This\npaper assesses how well current VAD algorithms can adjust to real-life\nconditions through an online learning framework, particularly those based on\npose analysis, for their efficiency and privacy advantages. Our proposed\nframework enables continuous model updates with streaming data from novel\nenvironments, thus mirroring actual world challenges and evaluating the models'\nability to adapt in real-time while maintaining accuracy. We investigate three\nstate-of-the-art models in this setting, focusing on their adaptability across\ndifferent domains. Our findings indicate that, even under the most challenging\nconditions, our online learning approach allows a model to preserve 89.39% of\nits original effectiveness compared to its offline-trained counterpart in a\nspecific target domain.\n","authors":["Shanle Yao","Ghazal Alinezhad Noghre","Armin Danesh Pazho","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2404.18747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18746v1","updated":"2024-04-29T14:46:35Z","published":"2024-04-29T14:46:35Z","title":"Enhancing Interactive Image Retrieval With Query Rewriting Using Large\n Language Models and Vision Language Models","summary":" Image search stands as a pivotal task in multimedia and computer vision,\nfinding applications across diverse domains, ranging from internet search to\nmedical diagnostics. Conventional image search systems operate by accepting\ntextual or visual queries, retrieving the top-relevant candidate results from\nthe database. However, prevalent methods often rely on single-turn procedures,\nintroducing potential inaccuracies and limited recall. These methods also face\nthe challenges, such as vocabulary mismatch and the semantic gap, constraining\ntheir overall effectiveness. To address these issues, we propose an interactive\nimage retrieval system capable of refining queries based on user relevance\nfeedback in a multi-turn setting. This system incorporates a vision language\nmodel (VLM) based image captioner to enhance the quality of text-based queries,\nresulting in more informative queries with each iteration. 
Moreover, we\nintroduce a large language model (LLM) based denoiser to refine text-based\nquery expansions, mitigating inaccuracies in image descriptions generated by\ncaptioning models. To evaluate our system, we curate a new dataset by adapting\nthe MSR-VTT video retrieval dataset to the image retrieval task, offering\nmultiple relevant ground truth images for each query. Through comprehensive\nexperiments, we validate the effectiveness of our proposed system against\nbaseline methods, achieving state-of-the-art performance with a notable 10\\%\nimprovement in terms of recall. Our contributions encompass the development of\nan innovative interactive image retrieval system, the integration of an\nLLM-based denoiser, the curation of a meticulously designed evaluation dataset,\nand thorough experimental validation.\n","authors":["Hongyi Zhu","Jia-Hong Huang","Stevan Rudinac","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2404.18746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00766v3","updated":"2024-04-29T14:43:51Z","published":"2024-01-01T14:14:35Z","title":"Exposure Bracketing is All You Need for Unifying Image Restoration and\n Enhancement Tasks","summary":" It is highly desired but challenging to acquire high-quality photos with\nclear content in low-light environments. Although multi-image processing\nmethods (using burst, dual-exposure, or multi-exposure images) have made\nsignificant progress in addressing this issue, they typically focus on specific\nrestoration or enhancement problems, being insufficient in exploiting\nmulti-image. Motivated by that multi-exposure images are complementary in\ndenoising, deblurring, high dynamic range imaging, and super-resolution, we\npropose to utilize exposure bracketing photography to unify restoration and\nenhancement tasks in this work. Due to the difficulty in collecting real-world\npairs, we suggest a solution that first pre-trains the model with synthetic\npaired data and then adapts it to real-world unlabeled images. In particular, a\ntemporally modulated recurrent network (TMRNet) and self-supervised adaptation\nmethod are proposed. Moreover, we construct a data simulation pipeline to\nsynthesize pairs and collect real-world images from 200 nighttime scenarios.\nExperiments on both datasets show that our method performs favorably against\nthe state-of-the-art multi-image processing ones. The dataset, code, and\npre-trained models are available at https://github.com/cszhilu1998/BracketIRE.\n","authors":["Zhilu Zhang","Shuohao Zhang","Renlong Wu","Zifei Yan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2401.00766v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2306.12189v2","updated":"2024-04-29T14:40:01Z","published":"2023-06-21T11:35:37Z","title":"Annotating Ambiguous Images: General Annotation Strategy for\n High-Quality Data with Real-World Biomedical Validation","summary":" In the field of image classification, existing methods often struggle with\nbiased or ambiguous data, a prevalent issue in real-world scenarios. Current\nstrategies, including semi-supervised learning and class blending, offer\npartial solutions but lack a definitive resolution. Addressing this gap, our\npaper introduces a novel strategy for generating high-quality labels in\nchallenging datasets. Central to our approach is a clearly designed flowchart,\nbased on a broad literature review, which enables the creation of reliable\nlabels. 
We validate our methodology through a rigorous real-world test case in\nthe biomedical field, specifically in deducing height reduction from vertebral\nimaging. Our empirical study, leveraging over 250,000 annotations, demonstrates\nthe effectiveness of our strategies decisions compared to their alternatives.\n","authors":["Lars Schmarje","Vasco Grossmann","Claudius Zelenka","Johannes Brünger","Reinhard Koch"],"pdf_url":"https://arxiv.org/pdf/2306.12189v2.pdf","comment":"Accepted at ICLR 2024, DMLR Workshop"},{"id":"http://arxiv.org/abs/2312.00195v2","updated":"2024-04-29T14:25:42Z","published":"2023-11-30T21:11:20Z","title":"Raising the Bar of AI-generated Image Detection with CLIP","summary":" The aim of this work is to explore the potential of pre-trained\nvision-language models (VLMs) for universal detection of AI-generated images.\nWe develop a lightweight detection strategy based on CLIP features and study\nits performance in a wide variety of challenging scenarios. We find that,\ncontrary to previous beliefs, it is neither necessary nor convenient to use a\nlarge domain-specific dataset for training. On the contrary, by using only a\nhandful of example images from a single generative model, a CLIP-based detector\nexhibits surprising generalization ability and high robustness across different\narchitectures, including recent commercial tools such as Dalle-3, Midjourney\nv5, and Firefly. We match the state-of-the-art (SoTA) on in-distribution data\nand significantly improve upon it in terms of generalization to\nout-of-distribution data (+6% AUC) and robustness to impaired/laundered data\n(+13%). Our project is available at\nhttps://grip-unina.github.io/ClipBased-SyntheticImageDetection/\n","authors":["Davide Cozzolino","Giovanni Poggi","Riccardo Corvi","Matthias Nießner","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2312.00195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18731v1","updated":"2024-04-29T14:17:52Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging, and it is useful\nfor many clinical automation pipelines. Typically, the process involves\nsegmenting the entire volume, which can be unnecessary when the points of\ninterest are limited. In those cases, a classifier could be used instead of\nsegmentation. However, there is an inherent trade-off between the context size\nand the speed of classifiers. To address this issue, we propose a new method\nthat employs a data selection strategy with sparse sampling across a wide field\nof view without image resampling. This sparse sampling strategy makes it\npossible to classify voxels into multiple organs in real time without using\naccelerators. Although our method is an independent classifier, it can generate\nfull segmentation by querying grid locations at any resolution. 
We have\ncompared our method with existing segmentation techniques, demonstrating its\npotential for superior runtime in practical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16348v2","updated":"2024-04-29T14:12:49Z","published":"2024-04-25T05:59:42Z","title":"Dual Expert Distillation Network for Generalized Zero-Shot Learning","summary":" Zero-shot learning has consistently yielded remarkable progress via modeling\nnuanced one-to-one visual-attribute correlation. Existing studies resort to\nrefining a uniform mapping function to align and correlate the sample regions\nand subattributes, ignoring two crucial issues: 1) the inherent asymmetry of\nattributes; and 2) the unutilized channel information. This paper addresses\nthese issues by introducing a simple yet effective approach, dubbed Dual Expert\nDistillation Network (DEDN), where two experts are dedicated to coarse- and\nfine-grained visual-attribute modeling, respectively. Concretely, one coarse\nexpert, namely cExp, has a complete perceptual scope to coordinate\nvisual-attribute similarity metrics across dimensions, and moreover, another\nfine expert, namely fExp, consists of multiple specialized subnetworks, each\ncorresponds to an exclusive set of attributes. Two experts cooperatively\ndistill from each other to reach a mutual agreement during training. Meanwhile,\nwe further equip DEDN with a newly designed backbone network, i.e., Dual\nAttention Network (DAN), which incorporates both region and channel attention\ninformation to fully exploit and leverage visual semantic knowledge.\nExperiments on various benchmark datasets indicate a new state-of-the-art.\n","authors":["Zhijie Rao","Jingcai Guo","Xiaocheng Lu","Jingming Liang","Jie Zhang","Haozhao Wang","Kang Wei","Xiaofeng Cao"],"pdf_url":"https://arxiv.org/pdf/2404.16348v2.pdf","comment":"9 pages, 4 figures; Accepted to IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.18722v1","updated":"2024-04-29T14:11:16Z","published":"2024-04-29T14:11:16Z","title":"Improving Automatic Text Recognition with Language Models in the PyLaia\n Open-Source Library","summary":" PyLaia is one of the most popular open-source software for Automatic Text\nRecognition (ATR), delivering strong performance in terms of speed and\naccuracy. In this paper, we outline our recent contributions to the PyLaia\nlibrary, focusing on the incorporation of reliable confidence scores and the\nintegration of statistical language modeling during decoding. Our\nimplementation provides an easy way to combine PyLaia with n-grams language\nmodels at different levels. One of the highlights of this work is that language\nmodels are completely auto-tuned: they can be built and used easily without any\nexpert knowledge, and without requiring any additional data. To demonstrate the\nsignificance of our contribution, we evaluate PyLaia's performance on twelve\ndatasets, both with and without language modelling. The results show that\ndecoding with small language models improves the Word Error Rate by 13% and the\nCharacter Error Rate by 12% in average. Additionally, we conduct an analysis of\nconfidence scores and highlight the importance of calibration techniques. 
Our\nimplementation is publicly available in the official PyLaia repository at\nhttps://gitlab.teklia.com/atr/pylaia, and twelve open-source models are\nreleased on Hugging Face.\n","authors":["Solène Tarride","Yoann Schneider","Marie Generali-Lince","Mélodie Boillet","Bastien Abadie","Christopher Kermorvant"],"pdf_url":"https://arxiv.org/pdf/2404.18722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18706v1","updated":"2024-04-29T13:57:02Z","published":"2024-04-29T13:57:02Z","title":"The Socface Project: Large-Scale Collection, Processing, and Analysis of\n a Century of French Censuses","summary":" This paper presents a complete processing workflow for extracting information\nfrom French census lists from 1836 to 1936. These lists contain information\nabout individuals living in France and their households. We aim at extracting\nall the information contained in these tables using automatic handwritten table\nrecognition. At the end of the Socface project, in which our work is taking\nplace, the extracted information will be redistributed to the departmental\narchives, and the nominative lists will be freely available to the public,\nallowing anyone to browse hundreds of millions of records. The extracted data\nwill be used by demographers to analyze social change over time, significantly\nimproving our understanding of French economic and social structures. For this\nproject, we developed a complete processing workflow: large-scale data\ncollection from French departmental archives, collaborative annotation of\ndocuments, training of handwritten table text and structure recognition models,\nand mass processing of millions of images. We present the tools we have\ndeveloped to easily collect and process millions of pages. We also show that it\nis possible to process such a wide variety of tables with a single table\nrecognition model that uses the image of the entire page to recognize\ninformation about individuals, categorize them and automatically group them\ninto households. The entire process has been successfully used to process the\ndocuments of a departmental archive, representing more than 450,000 images.\n","authors":["Mélodie Boillet","Solène Tarride","Yoann Schneider","Bastien Abadie","Lionel Kesztenbaum","Christopher Kermorvant"],"pdf_url":"https://arxiv.org/pdf/2404.18706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18699v1","updated":"2024-04-29T13:47:59Z","published":"2024-04-29T13:47:59Z","title":"Convergence Properties of Score-Based Models using Graduated\n Optimisation for Linear Inverse Problems","summary":" The incorporation of generative models as regularisers within variational\nformulations for inverse problems has proven effective across numerous image\nreconstruction tasks. However, the resulting optimisation problem is often\nnon-convex and challenging to solve. In this work, we show that score-based\ngenerative models (SGMs) can be used in a graduated optimisation framework to\nsolve inverse problems. We show that the resulting graduated non-convexity flow\nconverge to stationary points of the original problem and provide a numerical\nconvergence analysis of a 2D toy example. We further provide experiments on\ncomputed tomography image reconstruction, where we show that this framework is\nable to recover high-quality images, independent of the initial value. 
The\nexperiments highlight the potential of using SGMs in graduated optimisation\nframeworks.\n","authors":["Pascal Fernsel","Željko Kereta","Alexander Denker"],"pdf_url":"https://arxiv.org/pdf/2404.18699v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2404.18695v1","updated":"2024-04-29T13:43:49Z","published":"2024-04-29T13:43:49Z","title":"Dual-Modal Prompting for Sketch-Based Image Retrieval","summary":" Sketch-based image retrieval (SBIR) associates hand-drawn sketches with their\ncorresponding realistic images. In this study, we aim to tackle two major\nchallenges of this task simultaneously: i) zero-shot, dealing with unseen\ncategories, and ii) fine-grained, referring to intra-category instance-level\nretrieval. Our key innovation lies in the realization that solely addressing\nthis cross-category and fine-grained recognition task from the generalization\nperspective may be inadequate since the knowledge accumulated from limited seen\ncategories might not be fully valuable or transferable to unseen target\ncategories. Inspired by this, in this work, we propose a dual-modal prompting\nCLIP (DP-CLIP) network, in which an adaptive prompting strategy is designed.\nSpecifically, to facilitate the adaptation of our DP-CLIP toward unpredictable\ntarget categories, we employ a set of images within the target category and the\ntextual category label to respectively construct a set of category-adaptive\nprompt tokens and channel scales. By integrating the generated guidance,\nDP-CLIP could gain valuable category-centric insights, efficiently adapting to\nnovel categories and capturing unique discriminative clues for effective\nretrieval within each target category. With these designs, our DP-CLIP\noutperforms the state-of-the-art fine-grained zero-shot SBIR method by 7.3% in\nAcc.@1 on the Sketchy dataset. Meanwhile, in the other two category-level\nzero-shot SBIR benchmarks, our method also achieves promising performance.\n","authors":["Liying Gao","Bingliang Jiao","Peng Wang","Shizhou Zhang","Hanwang Zhang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19043v2","updated":"2024-04-29T13:08:36Z","published":"2024-03-27T22:36:02Z","title":"Illicit object detection in X-ray images using Vision Transformers","summary":" Illicit object detection is a critical task performed at various\nhigh-security locations, including airports, train stations, subways, and\nports. The continuous and tedious work of examining thousands of X-ray images\nper hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used\nto automate the X-ray image analysis process, improve efficiency and alleviate\nthe security officers' inspection burden. The neural architectures typically\nutilized in relevant literature are Convolutional Neural Networks (CNNs), with\nVision Transformers (ViTs) rarely employed. In order to address this gap, this\npaper conducts a comprehensive evaluation of relevant ViT architectures on\nillicit item detection in X-ray images. This study utilizes both Transformer\nand hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and\nRT-DETR. The results demonstrate the remarkable accuracy of the DINO\nTransformer detector in the low-data regime, the impressive real-time\nperformance of YOLOv8, and the effectiveness of the hybrid NextViT backbone.\n","authors":["Jorgen Cani","Ioannis Mademlis","Adamantia Anna Rebolledo Chrysochoou","Georgios Th. 
Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2403.19043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18669v1","updated":"2024-04-29T12:57:05Z","published":"2024-04-29T12:57:05Z","title":"Bootstrap 3D Reconstructed Scenes from 3D Gaussian Splatting","summary":" Recent developments in neural rendering techniques have greatly enhanced the\nrendering of photo-realistic 3D scenes across both academic and commercial\nfields. The latest method, known as 3D Gaussian Splatting (3D-GS), has set new\nbenchmarks for rendering quality and speed. Nevertheless, the limitations of\n3D-GS become pronounced in synthesizing new viewpoints, especially for views\nthat greatly deviate from those seen during training. Additionally, issues such\nas dilation and aliasing arise when zooming in or out. These challenges can all\nbe traced back to a single underlying issue: insufficient sampling. In our\npaper, we present a bootstrapping method that significantly addresses this\nproblem. This approach employs a diffusion model to enhance the rendering of\nnovel views using trained 3D-GS, thereby streamlining the training process. Our\nresults indicate that bootstrapping effectively reduces artifacts, as well as\nclear enhancements on the evaluation metrics. Furthermore, we show that our\nmethod is versatile and can be easily integrated, allowing various 3D\nreconstruction projects to benefit from our approach.\n","authors":["Yifei Gao","Jie Ou","Lei Wang","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.18669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18665v1","updated":"2024-04-29T12:49:53Z","published":"2024-04-29T12:49:53Z","title":"Leveraging PointNet and PointNet++ for Lyft Point Cloud Classification\n Challenge","summary":" This study investigates the application of PointNet and PointNet++ in the\nclassification of LiDAR-generated point cloud data, a critical component for\nachieving fully autonomous vehicles. Utilizing a modified dataset from the Lyft\n3D Object Detection Challenge, we examine the models' capabilities to handle\ndynamic and complex environments essential for autonomous navigation. Our\nanalysis shows that PointNet and PointNet++ achieved accuracy rates of 79.53%\nand 84.24%, respectively. These results underscore the models' robustness in\ninterpreting intricate environmental data, which is pivotal for the safety and\nefficiency of autonomous vehicles. Moreover, the enhanced detection accuracy,\nparticularly in distinguishing pedestrians from other objects, highlights the\npotential of these models to contribute substantially to the advancement of\nautonomous vehicle technology.\n","authors":["Rajat K. Doshi"],"pdf_url":"https://arxiv.org/pdf/2404.18665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18664v1","updated":"2024-04-29T12:49:30Z","published":"2024-04-29T12:49:30Z","title":"Reading Order Independent Metrics for Information Extraction in\n Handwritten Documents","summary":" Information Extraction processes in handwritten documents tend to rely on\nobtaining an automatic transcription and performing Named Entity Recognition\n(NER) over such transcription. For this reason, in publicly available datasets,\nthe performance of the systems is usually evaluated with metrics particular to\neach dataset. Moreover, most of the metrics employed are sensitive to reading\norder errors. Therefore, they do not reflect the expected final application of\nthe system and introduce biases in more complex documents. 
In this paper, we\npropose and publicly release a set of reading order independent metrics\ntailored to Information Extraction evaluation in handwritten documents. In our\nexperimentation, we perform an in-depth analysis of the behavior of the metrics\nto recommend what we consider to be the minimal set of metrics to evaluate a\ntask correctly.\n","authors":["David Villanova-Aparisi","Solène Tarride","Carlos-D. Martínez-Hinarejos","Verónica Romero","Christopher Kermorvant","Moisés Pastor-Gadea"],"pdf_url":"https://arxiv.org/pdf/2404.18664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18663v1","updated":"2024-04-29T12:48:42Z","published":"2024-04-29T12:48:42Z","title":"Terrain characterisation for online adaptability of automated sonar\n processing: Lessons learnt from operationally applying ATR to sidescan sonar\n in MCM applications","summary":" The performance of Automated Recognition (ATR) algorithms on side-scan sonar\nimagery has shown to degrade rapidly when deployed on non benign environments.\nComplex seafloors and acoustic artefacts constitute distractors in the form of\nstrong textural patterns, creating false detections or preventing detections of\ntrue objects. This paper presents two online seafloor characterisation\ntechniques to improve explainability during Autonomous Underwater Vehicles\n(AUVs) missions. Importantly and as opposed to previous work in the domain,\nthese techniques are not based on a model and require limited input from human\noperators, making it suitable for real-time onboard processing. Both techniques\nrely on an unsupervised machine learning approach to extract terrain features\nwhich relate to the human understanding of terrain complexity. The first\ntechnnique provides a quantitative, application-driven terrain characterisation\nmetric based on the performance of an ATR algorithm. The second method provides\na way to incorporate subject matter expertise and enables contextualisation and\nexplainability in support for scenario-dependent subjective terrain\ncharacterisation. The terrain complexity matches the expectation of seasoned\nusers making this tool desirable and trustworthy in comparison to traditional\nunsupervised approaches. We finally detail an application of these techniques\nto repair a Mine Countermeasures (MCM) mission carried with SeeByte autonomy\nframework Neptune.\n","authors":["Thomas Guerneve","Stephanos Loizou","Andrea Munafo","Pierre-Yves Mignotte"],"pdf_url":"https://arxiv.org/pdf/2404.18663v1.pdf","comment":"Presented at UACE (Underwater Acoustics Conference & Exhibition)\n 2023, Kalamata, Greece"},{"id":"http://arxiv.org/abs/2311.11260v2","updated":"2024-04-29T12:39:37Z","published":"2023-11-19T07:47:11Z","title":"Radarize: Enhancing Radar SLAM with Generalizable Doppler-Based Odometry","summary":" Millimeter-wave (mmWave) radar is increasingly being considered as an\nalternative to optical sensors for robotic primitives like simultaneous\nlocalization and mapping (SLAM). While mmWave radar overcomes some limitations\nof optical sensors, such as occlusions, poor lighting conditions, and privacy\nconcerns, it also faces unique challenges, such as missed obstacles due to\nspecular reflections or fake objects due to multipath. To address these\nchallenges, we propose Radarize, a self-contained SLAM pipeline that uses only\na commodity single-chip mmWave radar. Our radar-native approach uses techniques\nsuch as Doppler shift-based odometry and multipath artifact suppression to\nimprove performance. 
We evaluate our method on a large dataset of 146\ntrajectories spanning 4 buildings and mounted on 3 different platforms,\ntotaling approximately 4.7 Km of travel distance. Our results show that our\nmethod outperforms state-of-the-art radar and radar-inertial approaches by\napproximately 5x in terms of odometry and 8x in terms of end-to-end SLAM, as\nmeasured by absolute trajectory error (ATE), without the need for additional\nsensors such as IMUs or wheel encoders.\n","authors":["Emerson Sie","Xinyu Wu","Heyu Guo","Deepak Vasisht"],"pdf_url":"https://arxiv.org/pdf/2311.11260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12739v2","updated":"2024-04-29T12:36:39Z","published":"2024-04-19T09:32:16Z","title":"The Solution for the CVPR2024 NICE Image Captioning Challenge","summary":" This report introduces a solution to the Topic 1 Zero-shot Image Captioning\nof 2024 NICE : New frontiers for zero-shot Image Captioning Evaluation. In\ncontrast to NICE 2023 datasets, this challenge involves new annotations by\nhumans with significant differences in caption style and content. Therefore, we\nenhance image captions effectively through retrieval augmentation and caption\ngrading methods. At the data level, we utilize high-quality captions generated\nby image caption models as training data to address the gap in text styles. At\nthe model level, we employ OFA (a large-scale visual-language pre-training\nmodel based on handcrafted templates) to perform the image captioning task.\nSubsequently, we propose caption-level strategy for the high-quality caption\ndata generated by the image caption models and integrate them with retrieval\naugmentation strategy into the template to compel the model to generate higher\nquality, more matching, and semantically enriched captions based on the\nretrieval augmentation prompts. Our approach achieves a CIDEr score of 234.11.\n","authors":["Longfei Huang","Shupeng Zhong","Xiangyu Wu","Ruoxuan Li"],"pdf_url":"https://arxiv.org/pdf/2404.12739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18649v1","updated":"2024-04-29T12:32:14Z","published":"2024-04-29T12:32:14Z","title":"Towards Quantitative Evaluation of Explainable AI Methods for Deepfake\n Detection","summary":" In this paper we propose a new framework for evaluating the performance of\nexplanation methods on the decisions of a deepfake detector. This framework\nassesses the ability of an explanation method to spot the regions of a fake\nimage with the biggest influence on the decision of the deepfake detector, by\nexamining the extent to which these regions can be modified through a set of\nadversarial attacks, in order to flip the detector's prediction or reduce its\ninitial prediction; we anticipate a larger drop in deepfake detection accuracy\nand prediction, for methods that spot these regions more accurately. Based on\nthis framework, we conduct a comparative study using a state-of-the-art model\nfor deepfake detection that has been trained on the FaceForensics++ dataset,\nand five explanation methods from the literature. 
The findings of our\nquantitative and qualitative evaluations document the advanced performance of\nthe LIME explanation method against the other compared ones, and indicate this\nmethod as the most appropriate for explaining the decisions of the utilized\ndeepfake detector.\n","authors":["Konstantinos Tsigos","Evlampios Apostolidis","Spyridon Baxevanakis","Symeon Papadopoulos","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2404.18649v1.pdf","comment":"Accepted for publication, 3rd ACM Int. Workshop on Multimedia AI\n against Disinformation (MAD'24) at ACM ICMR'24, June 10, 2024, Phuket,\n Thailand. This is the \"accepted version\""},{"id":"http://arxiv.org/abs/2404.18648v1","updated":"2024-04-29T12:31:38Z","published":"2024-04-29T12:31:38Z","title":"Uncertainty-boosted Robust Video Activity Anticipation","summary":" Video activity anticipation aims to predict what will happen in the future,\nembracing a broad application prospect ranging from robot vision and autonomous\ndriving. Despite the recent progress, the data uncertainty issue, reflected as\nthe content evolution process and dynamic correlation in event labels, has been\nsomehow ignored. This reduces the model generalization ability and deep\nunderstanding on video content, leading to serious error accumulation and\ndegraded performance. In this paper, we address the uncertainty learning\nproblem and propose an uncertainty-boosted robust video activity anticipation\nframework, which generates uncertainty values to indicate the credibility of\nthe anticipation results. The uncertainty value is used to derive a temperature\nparameter in the softmax function to modulate the predicted target activity\ndistribution. To guarantee the distribution adjustment, we construct a\nreasonable target activity label representation by incorporating the activity\nevolution from the temporal class correlation and the semantic relationship.\nMoreover, we quantify the uncertainty into relative values by comparing the\nuncertainty among sample pairs and their temporal-lengths. This relative\nstrategy provides a more accessible way in uncertainty modeling than\nquantifying the absolute uncertainty values on the whole dataset. Experiments\non multiple backbones and benchmarks show our framework achieves promising\nperformance and better robustness/interpretability. Source codes are available\nat https://github.com/qzhb/UbRV2A.\n","authors":["Zhaobo Qi","Shuhui Wang","Weigang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2404.18648v1.pdf","comment":"Accepted by T-PAMI"},{"id":"http://arxiv.org/abs/2403.02090v3","updated":"2024-04-29T12:16:04Z","published":"2024-03-04T14:46:58Z","title":"Modeling Multimodal Social Interactions: New Challenges and Baselines\n with Densely Aligned Representations","summary":" Understanding social interactions involving both verbal and non-verbal cues\nis essential for effectively interpreting social situations. However, most\nprior works on multimodal social cues focus predominantly on single-person\nbehaviors or rely on holistic visual representations that are not aligned to\nutterances in multi-party environments. Consequently, they are limited in\nmodeling the intricate dynamics of multi-party interactions. In this paper, we\nintroduce three new challenging tasks to model the fine-grained dynamics\nbetween multiple people: speaking target identification, pronoun coreference\nresolution, and mentioned player prediction. 
We contribute extensive data\nannotations to curate these new challenges in social deduction game settings.\nFurthermore, we propose a novel multimodal baseline that leverages densely\naligned language-visual representations by synchronizing visual features with\ntheir corresponding utterances. This facilitates concurrently capturing verbal\nand non-verbal cues pertinent to social reasoning. Experiments demonstrate the\neffectiveness of the proposed approach with densely aligned multimodal\nrepresentations in modeling fine-grained social interactions. Project website:\nhttps://sangmin-git.github.io/projects/MMSI.\n","authors":["Sangmin Lee","Bolin Lai","Fiona Ryan","Bikram Boote","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2403.02090v3.pdf","comment":"CVPR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.16612v2","updated":"2024-04-29T12:08:10Z","published":"2024-04-25T13:51:38Z","title":"MuseumMaker: Continual Style Customization without Catastrophic\n Forgetting","summary":" Pre-trained large text-to-image (T2I) models with an appropriate text prompt\nhas attracted growing interests in customized images generation field. However,\ncatastrophic forgetting issue make it hard to continually synthesize new\nuser-provided styles while retaining the satisfying results amongst learned\nstyles. In this paper, we propose MuseumMaker, a method that enables the\nsynthesis of images by following a set of customized styles in a never-end\nmanner, and gradually accumulate these creative artistic works as a Museum.\nWhen facing with a new customization style, we develop a style distillation\nloss module to extract and learn the styles of the training data for new image\ngeneration. It can minimize the learning biases caused by content of new\ntraining images, and address the catastrophic overfitting issue induced by\nfew-shot images. To deal with catastrophic forgetting amongst past learned\nstyles, we devise a dual regularization for shared-LoRA module to optimize the\ndirection of model update, which could regularize the diffusion model from both\nweight and feature aspects, respectively. Meanwhile, to further preserve\nhistorical knowledge from past styles and address the limited representability\nof LoRA, we consider a task-wise token learning module where a unique token\nembedding is learned to denote a new style. As any new user-provided style\ncome, our MuseumMaker can capture the nuances of the new styles while\nmaintaining the details of learned styles. Experimental results on diverse\nstyle datasets validate the effectiveness of our proposed MuseumMaker method,\nshowcasing its robustness and versatility across various scenarios.\n","authors":["Chenxi Liu","Gan Sun","Wenqi Liang","Jiahua Dong","Can Qin","Yang Cong"],"pdf_url":"https://arxiv.org/pdf/2404.16612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18630v1","updated":"2024-04-29T12:06:06Z","published":"2024-04-29T12:06:06Z","title":"4D-DRESS: A 4D Dataset of Real-world Human Clothing with Semantic\n Annotations","summary":" The studies of human clothing for digital avatars have predominantly relied\non synthetic datasets. While easy to collect, synthetic data often fall short\nin realism and fail to capture authentic clothing dynamics. Addressing this\ngap, we introduce 4D-DRESS, the first real-world 4D dataset advancing human\nclothing research with its high-quality 4D textured scans and garment meshes.\n4D-DRESS captures 64 outfits in 520 human motion sequences, amounting to 78k\ntextured scans. 
Creating a real-world clothing dataset is challenging,\nparticularly in annotating and segmenting the extensive and complex 4D human\nscans. To address this, we develop a semi-automatic 4D human parsing pipeline.\nWe efficiently combine a human-in-the-loop process with automation to\naccurately label 4D scans in diverse garments and body movements. Leveraging\nprecise annotations and high-quality garment meshes, we establish several\nbenchmarks for clothing simulation and reconstruction. 4D-DRESS offers\nrealistic and challenging data that complements synthetic sources, paving the\nway for advancements in research of lifelike human clothing. Website:\nhttps://ait.ethz.ch/4d-dress.\n","authors":["Wenbo Wang","Hsuan-I Ho","Chen Guo","Boxiang Rong","Artur Grigorev","Jie Song","Juan Jose Zarate","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2404.18630v1.pdf","comment":"CVPR 2024 paper, 21 figures, 9 tables"},{"id":"http://arxiv.org/abs/2404.18628v1","updated":"2024-04-29T12:02:06Z","published":"2024-04-29T12:02:06Z","title":"Self-Avatar Animation in Virtual Reality: Impact of Motion Signals\n Artifacts on the Full-Body Pose Reconstruction","summary":" Virtual Reality (VR) applications have revolutionized user experiences by\nimmersing individuals in interactive 3D environments. These environments find\napplications in numerous fields, including healthcare, education, or\narchitecture. A significant aspect of VR is the inclusion of self-avatars,\nrepresenting users within the virtual world, which enhances interaction and\nembodiment. However, generating lifelike full-body self-avatar animations\nremains challenging, particularly in consumer-grade VR systems, where\nlower-body tracking is often absent. One method to tackle this problem is by\nproviding an external source of motion information that includes lower body\ninformation such as full Cartesian positions estimated from RGB(D) cameras.\nNevertheless, the limitations of these systems are multiples: the\ndesynchronization between the two motion sources and occlusions are examples of\nsignificant issues that hinder the implementations of such systems. In this\npaper, we aim to measure the impact on the reconstruction of the articulated\nself-avatar's full-body pose of (1) the latency between the VR motion features\nand estimated positions, (2) the data acquisition rate, (3) occlusions, and (4)\nthe inaccuracy of the position estimation algorithm. In addition, we analyze\nthe motion reconstruction errors using ground truth and 3D Cartesian\ncoordinates estimated from \\textit{YOLOv8} pose estimation. These analyzes show\nthat the studied methods are significantly sensitive to any degradation tested,\nespecially regarding the velocity reconstruction error.\n","authors":["Antoine Maiorca","Seyed Abolfazl Ghasemzadeh","Thierry Ravet","François Cresson","Thierry Dutoit","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2404.18628v1.pdf","comment":"8 pages, 5 figures and 1 table"},{"id":"http://arxiv.org/abs/2404.11474v2","updated":"2024-04-29T11:59:10Z","published":"2024-04-17T15:28:53Z","title":"Towards Highly Realistic Artistic Style Transfer via Stable Diffusion\n with Step-aware and Layer-aware Prompt","summary":" Artistic style transfer aims to transfer the learned artistic style onto an\narbitrary content image, generating artistic stylized images. Existing\ngenerative adversarial network-based methods fail to generate highly realistic\nstylized images and always introduce obvious artifacts and disharmonious\npatterns. 
Recently, large-scale pre-trained diffusion models opened up a new\nway for generating highly realistic artistic stylized images. However,\ndiffusion model-based methods generally fail to preserve the content structure\nof input content images well, introducing some undesired content structure and\nstyle patterns. To address the above problems, we propose a novel pre-trained\ndiffusion-based artistic style transfer method, called LSAST, which can\ngenerate highly realistic artistic stylized images while preserving the content\nstructure of input content images well, without bringing obvious artifacts and\ndisharmonious style patterns. Specifically, we introduce a Step-aware and\nLayer-aware Prompt Space, a set of learnable prompts, which can learn the style\ninformation from the collection of artworks and dynamically adjusts the input\nimages' content structure and style pattern. To train our prompt space, we\npropose a novel inversion method, called Step-ware and Layer-aware Prompt\nInversion, which allows the prompt space to learn the style information of the\nartworks collection. In addition, we inject a pre-trained conditional branch of\nControlNet into our LSAST, which further improved our framework's ability to\nmaintain content structure. Extensive experiments demonstrate that our proposed\nmethod can generate more highly realistic artistic stylized images than the\nstate-of-the-art artistic style transfer methods.\n","authors":["Zhanjie Zhang","Quanwei Zhang","Huaizhong Lin","Wei Xing","Juncheng Mo","Shuaicheng Huang","Jinheng Xie","Guangyuan Li","Junsheng Luan","Lei Zhao","Dalong Zhang","Lixia Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11474v2.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2310.20225v2","updated":"2024-04-29T11:53:43Z","published":"2023-10-31T06:56:51Z","title":"A Multi-Modal Foundation Model to Assist People with Blindness and Low\n Vision in Environmental Interaction","summary":" People with blindness and low vision (pBLV) encounter substantial challenges\nwhen it comes to comprehensive scene recognition and precise object\nidentification in unfamiliar environments. Additionally, due to the vision\nloss, pBLV have difficulty in accessing and identifying potential tripping\nhazards on their own. In this paper, we present a pioneering approach that\nleverages a large vision-language model to enhance visual perception for pBLV,\noffering detailed and comprehensive descriptions of the surrounding\nenvironments and providing warnings about the potential risks. Our method\nbegins by leveraging a large image tagging model (i.e., Recognize Anything\n(RAM)) to identify all common objects present in the captured images. The\nrecognition results and user query are then integrated into a prompt, tailored\nspecifically for pBLV using prompt engineering. By combining the prompt and\ninput image, a large vision-language model (i.e., InstructBLIP) generates\ndetailed and comprehensive descriptions of the environment and identifies\npotential risks in the environment by analyzing the environmental objects and\nscenes, relevant to the prompt. We evaluate our approach through experiments\nconducted on both indoor and outdoor datasets. 
Our results demonstrate that our\nmethod is able to recognize objects accurately and provide insightful\ndescriptions and analysis of the environment for pBLV.\n","authors":["Yu Hao","Fan Yang","Hao Huang","Shuaihang Yuan","Sundeep Rangan","John-Ross Rizzo","Yao Wang","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2310.20225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18624v1","updated":"2024-04-29T11:52:20Z","published":"2024-04-29T11:52:20Z","title":"Do Vision & Language Decoders use Images and Text equally? How\n Self-consistent are their Explanations?","summary":" Vision and language models (VLMs) are currently the most generally performant\narchitectures on multimodal tasks. Next to their predictions, they can also\nproduce explanations, either in post-hoc or CoT settings. However, it is not\nclear how much they use the vision and text modalities when generating\npredictions or explanations. In this work, we investigate if VLMs rely on\nmodalities differently when generating explanations as opposed to when they\nprovide answers. We also evaluate the self-consistency of VLM decoders in both\npost-hoc and CoT explanation settings, by extending existing tests and measures\nto VLM decoders. We find that VLMs are less self-consistent than LLMs. The text\ncontributions in VL decoders are much larger than the image contributions\nacross all measured tasks. And the contributions of the image are significantly\nlarger for explanation generations than for answer generation. This difference\nis even larger in CoT compared to the post-hoc explanation setting. We also\nprovide an up-to-date benchmarking of state-of-the-art VL decoders on the VALSE\nbenchmark, which to date focused only on VL encoders. We find that VL decoders\nare still struggling with most phenomena tested by VALSE.\n","authors":["Letitia Parcalabescu","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2404.18624v1.pdf","comment":"27 pages, from which 12 pages contain the text of the main paper. 8\n figures, 11 tables"},{"id":"http://arxiv.org/abs/2404.18620v1","updated":"2024-04-29T11:41:34Z","published":"2024-04-29T11:41:34Z","title":"FlexiFilm: Long Video Generation with Flexible Conditions","summary":" Generating long and consistent videos has emerged as a significant yet\nchallenging problem. While most existing diffusion-based video generation\nmodels, derived from image generation models, demonstrate promising performance\nin generating short videos, their simple conditioning mechanism and sampling\nstrategy-originally designed for image generation-cause severe performance\ndegradation when adapted to long video generation. This results in prominent\ntemporal inconsistency and overexposure. Thus, in this work, we introduce\nFlexiFilm, a new diffusion model tailored for long video generation. Our\nframework incorporates a temporal conditioner to establish a more consistent\nrelationship between generation and multi-modal conditions, and a resampling\nstrategy to tackle overexposure. Empirical results demonstrate FlexiFilm\ngenerates long and consistent videos, each over 30 seconds in length,\noutperforming competitors in qualitative and quantitative analyses. 
Project\npage: https://y-ichen.github.io/FlexiFilm-Page/\n","authors":["Yichen Ouyang","jianhao Yuan","Hao Zhao","Gaoang Wang","Bo zhao"],"pdf_url":"https://arxiv.org/pdf/2404.18620v1.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.18617v1","updated":"2024-04-29T11:40:27Z","published":"2024-04-29T11:40:27Z","title":"CoSense3D: an Agent-based Efficient Learning Framework for Collective\n Perception","summary":" Collective Perception has attracted significant attention in recent years due\nto its advantage for mitigating occlusion and expanding the field-of-view,\nthereby enhancing reliability, efficiency, and, most crucially, decision-making\nsafety. However, developing collective perception models is highly resource\ndemanding due to extensive requirements of processing input data for many\nagents, usually dozens of images and point clouds for a single frame. This not\nonly slows down the model development process for collective perception but\nalso impedes the utilization of larger models. In this paper, we propose an\nagent-based training framework that handles the deep learning modules and agent\ndata separately to have a cleaner data flow structure. This framework not only\nprovides an API for flexibly prototyping the data processing pipeline and\ndefining the gradient calculation for each agent, but also provides the user\ninterface for interactive training, testing and data visualization. Training\nexperiment results of four collective object detection models on the prominent\ncollective perception benchmark OPV2V show that the agent-based training can\nsignificantly reduce the GPU memory consumption and training time while\nretaining inference performance. The framework and model implementations are\navailable at \\url{https://github.com/YuanYunshuang/CoSense3D}\n","authors":["Yunshuang Yuan","Monika Sester"],"pdf_url":"https://arxiv.org/pdf/2404.18617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00848v3","updated":"2024-04-29T11:25:06Z","published":"2023-09-02T07:17:43Z","title":"Bengali Document Layout Analysis -- A YOLOV8 Based Ensembling Approach","summary":" This paper focuses on enhancing Bengali Document Layout Analysis (DLA) using\nthe YOLOv8 model and innovative post-processing techniques. We tackle\nchallenges unique to the complex Bengali script by employing data augmentation\nfor model robustness. After meticulous validation set evaluation, we fine-tune\nour approach on the complete dataset, leading to a two-stage prediction\nstrategy for accurate element segmentation. Our ensemble model, combined with\npost-processing, outperforms individual base architectures, addressing issues\nidentified in the BaDLAD dataset. By leveraging this approach, we aim to\nadvance Bengali document analysis, contributing to improved OCR and document\ncomprehension and BaDLAD serves as a foundational resource for this endeavor,\naiding future research in the field. 
Furthermore, our experiments provided key\ninsights to incorporate new strategies into the established solution.\n","authors":["Nazmus Sakib Ahmed","Saad Sakib Noor","Ashraful Islam Shanto Sikder","Abhijit Paul"],"pdf_url":"https://arxiv.org/pdf/2309.00848v3.pdf","comment":"Need to review and rework this"},{"id":"http://arxiv.org/abs/2404.18604v1","updated":"2024-04-29T11:19:15Z","published":"2024-04-29T11:19:15Z","title":"CSTalk: Correlation Supervised Speech-driven 3D Emotional Facial\n Animation Generation","summary":" Speech-driven 3D facial animation technology has been developed for years,\nbut its practical application still lacks expectations. The main challenges lie\nin data limitations, lip alignment, and the naturalness of facial expressions.\nAlthough lip alignment has seen many related studies, existing methods struggle\nto synthesize natural and realistic expressions, resulting in a mechanical and\nstiff appearance of facial animations. Even with some research extracting\nemotional features from speech, the randomness of facial movements limits the\neffective expression of emotions. To address this issue, this paper proposes a\nmethod called CSTalk (Correlation Supervised) that models the correlations\namong different regions of facial movements and supervises the training of the\ngenerative model to generate realistic expressions that conform to human facial\nmotion patterns. To generate more intricate animations, we employ a rich set of\ncontrol parameters based on the metahuman character model and capture a dataset\nfor five different emotions. We train a generative network using an autoencoder\nstructure and input an emotion embedding vector to achieve the generation of\nuser-control expressions. Experimental results demonstrate that our method\noutperforms existing state-of-the-art methods.\n","authors":["Xiangyu Liang","Wenlin Zhuang","Tianyong Wang","Guangxing Geng","Guangyue Geng","Haifeng Xia","Siyu Xia"],"pdf_url":"https://arxiv.org/pdf/2404.18604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18599v1","updated":"2024-04-29T11:14:11Z","published":"2024-04-29T11:14:11Z","title":"Self-supervised learning for classifying paranasal anomalies in the\n maxillary sinus","summary":" Purpose: Paranasal anomalies, frequently identified in routine radiological\nscreenings, exhibit diverse morphological characteristics. Due to the diversity\nof anomalies, supervised learning methods require large labelled dataset\nexhibiting diverse anomaly morphology. Self-supervised learning (SSL) can be\nused to learn representations from unlabelled data. However, there are no SSL\nmethods designed for the downstream task of classifying paranasal anomalies in\nthe maxillary sinus (MS).\n Methods: Our approach uses a 3D Convolutional Autoencoder (CAE) trained in an\nunsupervised anomaly detection (UAD) framework. Initially, we train the 3D CAE\nto reduce reconstruction errors when reconstructing normal maxillary sinus (MS)\nimage. Then, this CAE is applied to an unlabelled dataset to generate coarse\nanomaly locations by creating residual MS images. Following this, a 3D\nConvolutional Neural Network (CNN) reconstructs these residual images, which\nforms our SSL task. Lastly, we fine-tune the encoder part of the 3D CNN on a\nlabelled dataset of normal and anomalous MS images.\n Results: The proposed SSL technique exhibits superior performance compared to\nexisting generic self-supervised methods, especially in scenarios with limited\nannotated data. 
When trained on just 10% of the annotated dataset, our method\nachieves an Area Under the Precision-Recall Curve (AUPRC) of 0.79 for the\ndownstream classification task. This performance surpasses other methods, with\nBYOL attaining an AUPRC of 0.75, SimSiam at 0.74, SimCLR at 0.73 and Masked\nAutoencoding using SparK at 0.75.\n Conclusion: A self-supervised learning approach that inherently focuses on\nlocalizing paranasal anomalies proves to be advantageous, particularly when the\nsubsequent task involves differentiating normal from anomalous maxillary\nsinuses. Access our code at\nhttps://github.com/mtec-tuhh/self-supervised-paranasal-anomaly\n","authors":["Debayan Bhattacharya","Finn Behrendt","Benjamin Tobias Becker","Lennart Maack","Dirk Beyersdorff","Elina Petersen","Marvin Petersen","Bastian Cheng","Dennis Eggert","Christian Betz","Anna Sophie Hoffmann","Alexander Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2404.18599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18598v1","updated":"2024-04-29T11:13:37Z","published":"2024-04-29T11:13:37Z","title":"Anywhere: A Multi-Agent Framework for Reliable and Diverse\n Foreground-Conditioned Image Inpainting","summary":" Recent advancements in image inpainting, particularly through diffusion\nmodeling, have yielded promising outcomes. However, when tested in scenarios\ninvolving the completion of images based on the foreground objects, current\nmethods that aim to inpaint an image in an end-to-end manner encounter\nchallenges such as \"over-imagination\", inconsistency between foreground and\nbackground, and limited diversity. In response, we introduce Anywhere, a\npioneering multi-agent framework designed to address these issues. Anywhere\nutilizes a sophisticated pipeline framework comprising various agents such as\nVisual Language Model (VLM), Large Language Model (LLM), and image generation\nmodels. This framework consists of three principal components: the prompt\ngeneration module, the image generation module, and the outcome analyzer. The\nprompt generation module conducts a semantic analysis of the input foreground\nimage, leveraging VLM to predict relevant language descriptions and LLM to\nrecommend optimal language prompts. In the image generation module, we employ a\ntext-guided canny-to-image generation model to create a template image based on\nthe edge map of the foreground image and language prompts, and an image refiner\nto produce the outcome by blending the input foreground and the template image.\nThe outcome analyzer employs VLM to evaluate image content rationality,\naesthetic score, and foreground-background relevance, triggering prompt and\nimage regeneration as needed. Extensive experiments demonstrate that our\nAnywhere framework excels in foreground-conditioned image inpainting,\nmitigating \"over-imagination\", resolving foreground-background discrepancies,\nand enhancing diversity. 
It successfully elevates foreground-conditioned image\ninpainting to produce more reliable and diverse results.\n","authors":["Tianyidan Xie","Rui Ma","Qian Wang","Xiaoqian Ye","Feixuan Liu","Ying Tai","Zhenyu Zhang","Zili Yi"],"pdf_url":"https://arxiv.org/pdf/2404.18598v1.pdf","comment":"16 pages, 9 figures, project page:\n https://anywheremultiagent.github.io"},{"id":"http://arxiv.org/abs/2404.09586v2","updated":"2024-04-29T11:10:48Z","published":"2024-04-15T08:54:33Z","title":"Mitigating the Curse of Dimensionality for Certified Robustness via Dual\n Randomized Smoothing","summary":" Randomized Smoothing (RS) has been proven a promising method for endowing an\narbitrary image classifier with certified robustness. However, the substantial\nuncertainty inherent in the high-dimensional isotropic Gaussian noise imposes\nthe curse of dimensionality on RS. Specifically, the upper bound of ${\\ell_2}$\ncertified robustness radius provided by RS exhibits a diminishing trend with\nthe expansion of the input dimension $d$, proportionally decreasing at a rate\nof $1/\\sqrt{d}$. This paper explores the feasibility of providing ${\\ell_2}$\ncertified robustness for high-dimensional input through the utilization of dual\nsmoothing in the lower-dimensional space. The proposed Dual Randomized\nSmoothing (DRS) down-samples the input image into two sub-images and smooths\nthe two sub-images in lower dimensions. Theoretically, we prove that DRS\nguarantees a tight ${\\ell_2}$ certified robustness radius for the original\ninput and reveal that DRS attains a superior upper bound on the ${\\ell_2}$\nrobustness radius, which decreases proportionally at a rate of $(1/\\sqrt m +\n1/\\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the\ngeneralizability and effectiveness of DRS, which exhibits a notable capability\nto integrate with established methodologies, yielding substantial improvements\nin both accuracy and ${\\ell_2}$ certified robustness baselines of RS on the\nCIFAR-10 and ImageNet datasets. Code is available at\nhttps://github.com/xiasong0501/DRS.\n","authors":["Song Xia","Yu Yi","Xudong Jiang","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.09586v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.08748v2","updated":"2024-04-29T11:07:24Z","published":"2024-03-13T17:50:59Z","title":"Real-time 3D semantic occupancy prediction for autonomous vehicles using\n memory-efficient sparse convolution","summary":" In autonomous vehicles, understanding the surrounding 3D environment of the\nego vehicle in real-time is essential. A compact way to represent scenes while\nencoding geometric distances and semantic object information is via 3D semantic\noccupancy maps. State of the art 3D mapping methods leverage transformers with\ncross-attention mechanisms to elevate 2D vision-centric camera features into\nthe 3D domain. However, these methods encounter significant challenges in\nreal-time applications due to their high computational demands during\ninference. This limitation is particularly problematic in autonomous vehicles,\nwhere GPU resources must be shared with other tasks such as localization and\nplanning. In this paper, we introduce an approach that extracts features from\nfront-view 2D camera images and LiDAR scans, then employs a sparse convolution\nnetwork (Minkowski Engine), for 3D semantic occupancy prediction. Given that\noutdoor scenes in autonomous driving scenarios are inherently sparse, the\nutilization of sparse convolution is particularly apt. 
By jointly solving the\nproblems of 3D scene completion of sparse scenes and 3D semantic segmentation,\nwe provide a more efficient learning framework suitable for real-time\napplications in autonomous vehicles. We also demonstrate competitive accuracy\non the nuScenes dataset.\n","authors":["Samuel Sze","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2403.08748v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.03690v2","updated":"2024-04-29T11:03:13Z","published":"2024-02-06T04:25:07Z","title":"3Doodle: Compact Abstraction of Objects with 3D Strokes","summary":" While free-hand sketching has long served as an efficient representation to\nconvey characteristics of an object, they are often subjective, deviating\nsignificantly from realistic representations. Moreover, sketches are not\nconsistent for arbitrary viewpoints, making it hard to catch 3D shapes. We\npropose 3Dooole, generating descriptive and view-consistent sketch images given\nmulti-view images of the target object. Our method is based on the idea that a\nset of 3D strokes can efficiently represent 3D structural information and\nrender view-consistent 2D sketches. We express 2D sketches as a union of\nview-independent and view-dependent components. 3D cubic B ezier curves\nindicate view-independent 3D feature lines, while contours of superquadrics\nexpress a smooth outline of the volume of varying viewpoints. Our pipeline\ndirectly optimizes the parameters of 3D stroke primitives to minimize\nperceptual losses in a fully differentiable manner. The resulting sparse set of\n3D strokes can be rendered as abstract sketches containing essential 3D\ncharacteristic shapes of various objects. We demonstrate that 3Doodle can\nfaithfully express concepts of the original images compared with recent sketch\ngeneration approaches.\n","authors":["Changwoon Choi","Jaeah Lee","Jaesik Park","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2402.03690v2.pdf","comment":"SIGGRAPH 2024 (Transactions on Graphics)"},{"id":"http://arxiv.org/abs/2404.18583v1","updated":"2024-04-29T10:47:37Z","published":"2024-04-29T10:47:37Z","title":"Context Matters: Leveraging Spatiotemporal Metadata for Semi-Supervised\n Learning on Remote Sensing Images","summary":" Remote sensing projects typically generate large amounts of imagery that can\nbe used to train powerful deep neural networks. However, the amount of labeled\nimages is often small, as remote sensing applications generally require expert\nlabelers. Thus, semi-supervised learning (SSL), i.e., learning with a small\npool of labeled and a larger pool of unlabeled data, is particularly useful in\nthis domain. Current SSL approaches generate pseudo-labels from model\npredictions for unlabeled samples. As the quality of these pseudo-labels is\ncrucial for performance, utilizing additional information to improve\npseudo-label quality yields a promising direction. For remote sensing images,\ngeolocation and recording time are generally available and provide a valuable\nsource of information as semantic concepts, such as land cover, are highly\ndependent on spatiotemporal context, e.g., due to seasonal effects and\nvegetation zones. In this paper, we propose to exploit spatiotemporal\nmetainformation in SSL to improve the quality of pseudo-labels and, therefore,\nthe final model performance. 
We show that directly adding the available\nmetadata to the input of the predictor at test time degenerates the prediction\nquality for metadata outside the spatiotemporal distribution of the training\nset. Thus, we propose a teacher-student SSL framework where only the teacher\nnetwork uses metainformation to improve the quality of pseudo-labels on the\ntraining set. Correspondingly, our student network benefits from the improved\npseudo-labels but does not receive metadata as input, making it invariant to\nspatiotemporal shifts at test time. Furthermore, we propose methods for\nencoding and injecting spatiotemporal information into the model and introduce\na novel distillation mechanism to enhance the knowledge transfer between\nteacher and student. Our framework dubbed Spatiotemporal SSL can be easily\ncombined with several stat...\n","authors":["Maximilian Bernhard","Tanveer Hannan","Niklas Strauß","Matthias Schubert"],"pdf_url":"https://arxiv.org/pdf/2404.18583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07596v2","updated":"2024-04-29T09:53:57Z","published":"2024-02-12T11:52:21Z","title":"Sheet Music Transformer: End-To-End Optical Music Recognition Beyond\n Monophonic Transcription","summary":" State-of-the-art end-to-end Optical Music Recognition (OMR) has, to date,\nprimarily been carried out using monophonic transcription techniques to handle\ncomplex score layouts, such as polyphony, often by resorting to simplifications\nor specific adaptations. Despite their efficacy, these approaches imply\nchallenges related to scalability and limitations. This paper presents the\nSheet Music Transformer, the first end-to-end OMR model designed to transcribe\ncomplex musical scores without relying solely on monophonic strategies. Our\nmodel employs a Transformer-based image-to-sequence framework that predicts\nscore transcriptions in a standard digital music encoding format from input\nimages. Our model has been tested on two polyphonic music datasets and has\nproven capable of handling these intricate music structures effectively. The\nexperimental outcomes not only indicate the competence of the model, but also\nshow that it is better than the state-of-the-art methods, thus contributing to\nadvancements in end-to-end OMR transcription.\n","authors":["Antonio Ríos-Vila","Jorge Calvo-Zaragoza","Thierry Paquet"],"pdf_url":"https://arxiv.org/pdf/2402.07596v2.pdf","comment":"Submitted to the International Conference on Document Analysis and\n Recognition 2024"},{"id":"http://arxiv.org/abs/2404.18552v1","updated":"2024-04-29T09:50:16Z","published":"2024-04-29T09:50:16Z","title":"SIDBench: A Python Framework for Reliably Assessing Synthetic Image\n Detection Methods","summary":" The generative AI technology offers an increasing variety of tools for\ngenerating entirely synthetic images that are increasingly indistinguishable\nfrom real ones. Unlike methods that alter portions of an image, the creation of\ncompletely synthetic images presents a unique challenge and several Synthetic\nImage Detection (SID) methods have recently appeared to tackle it. Yet, there\nis often a large gap between experimental results on benchmark datasets and the\nperformance of methods in the wild. To better address the evaluation needs of\nSID and help close this gap, this paper introduces a benchmarking framework\nthat integrates several state-of-the-art SID models. 
Our selection of\nintegrated models was based on the utilization of varied input features, and\ndifferent network architectures, aiming to encompass a broad spectrum of\ntechniques. The framework leverages recent datasets with a diverse set of\ngenerative models, high level of photo-realism and resolution, reflecting the\nrapid improvements in image synthesis technology. Additionally, the framework\nenables the study of how image transformations, common in assets shared online,\nsuch as JPEG compression, affect detection performance. SIDBench is available\non https://github.com/mever-team/sidbench and is designed in a modular manner\nto enable easy inclusion of new datasets and SID models.\n","authors":["Manos Schinas","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.18552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08367v3","updated":"2024-04-29T09:40:38Z","published":"2023-12-13T18:58:15Z","title":"ViLA: Efficient Video-Language Alignment for Video Question Answering","summary":" In this work, we propose an efficient Video-Language Alignment (ViLA)\nnetwork. Our ViLA model addresses both efficient frame sampling and effective\ncross-modal alignment in a unified way. In our ViLA network, we design a new\nlearnable text-guided Frame-Prompter together with a new cross-modal\ndistillation (QFormer-Distiller) module. Pre-trained large image-language\nmodels have shown promising results on problems such as visual question\nanswering (VQA). However, how to efficiently and effectively sample video\nframes when adapting pre-trained large image-language model to video-language\nalignment is still the major challenge. Compared with prior work, our ViLA\nmodel demonstrates the capability of selecting key frames with critical\ncontents, thus improving the video-language alignment accuracy while reducing\nthe inference latency +3.3% on NExT-QA Temporal with 3.0X speed up). Overall,\nour ViLA network outperforms the state-of-the-art methods on the video\nquestion-answering benchmarks: +4.6% on STAR Interaction, +2.2% on STAR average\nwith 3.0X speed up, ours 2-frames out-perform SeViLA 4-frames on the VLEP\ndataset with 4.2X speed-up.\n","authors":["Xijun Wang","Junbang Liang","Chun-Kai Wang","Kenan Deng","Yu Lou","Ming Lin","Shan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08367v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10511v2","updated":"2024-04-29T09:38:20Z","published":"2023-09-19T10:47:32Z","title":"Self2Seg: Single-Image Self-Supervised Joint Segmentation and Denoising","summary":" We develop Self2Seg, a self-supervised method for the joint segmentation and\ndenoising of a single image. To this end, we combine the advantages of\nvariational segmentation with self-supervised deep learning. One major benefit\nof our method lies in the fact, that in contrast to data-driven methods, where\nhuge amounts of labeled samples are necessary, Self2Seg segments an image into\nmeaningful regions without any training database. Moreover, we demonstrate that\nself-supervised denoising itself is significantly improved through the\nregion-specific learning of Self2Seg. Therefore, we introduce a novel\nself-supervised energy functional in which denoising and segmentation are\ncoupled in a way that both tasks benefit from each other. 
We propose a unified\noptimisation strategy and numerically show that for noisy microscopy images our\nproposed joint approach outperforms its sequential counterpart as well as\nalternative methods focused purely on denoising or segmentation.\n","authors":["Nadja Gruber","Johannes Schwab","Noémie Debroux","Nicolas Papadakis","Markus Haltmeier"],"pdf_url":"https://arxiv.org/pdf/2309.10511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18539v1","updated":"2024-04-29T09:27:31Z","published":"2024-04-29T09:27:31Z","title":"Enhancing Boundary Segmentation for Topological Accuracy with\n Skeleton-based Methods","summary":" Topological consistency plays a crucial role in the task of boundary\nsegmentation for reticular images, such as cell membrane segmentation in neuron\nelectron microscopic images, grain boundary segmentation in material\nmicroscopic images and road segmentation in aerial images. In these fields,\ntopological changes in segmentation results have a serious impact on the\ndownstream tasks, which can even exceed the misalignment of the boundary\nitself. To enhance the topology accuracy in segmentation results, we propose\nthe Skea-Topo Aware loss, which is a novel loss function that takes into\naccount the shape of each object and topological significance of the pixels. It\nconsists of two components. First, the skeleton-aware weighted loss improves\nthe segmentation accuracy by better modeling the object geometry with\nskeletons. Second, a boundary rectified term effectively identifies and\nemphasizes topological critical pixels in the prediction errors using both\nforeground and background skeletons in the ground truth and predictions.\nExperiments prove that our method improves topological consistency by up to 7\npoints in VI compared to 13 state-of-art methods, based on objective and\nsubjective assessments across three different boundary segmentation datasets.\nThe code is available at https://github.com/clovermini/Skea_topo.\n","authors":["Chuni Liu","Boyuan Ma","Xiaojuan Ban","Yujie Xie","Hao Wang","Weihua Xue","Jingchao Ma","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2404.18539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17528v2","updated":"2024-04-29T09:26:36Z","published":"2023-11-29T11:01:38Z","title":"HiDiffusion: Unlocking Higher-Resolution Creativity and Efficiency in\n Pretrained Diffusion Models","summary":" Diffusion models have become a mainstream approach for high-resolution image\nsynthesis. However, directly generating higher-resolution images from\npretrained diffusion models will encounter unreasonable object duplication and\nexponentially increase the generation time. In this paper, we discover that\nobject duplication arises from feature duplication in the deep blocks of the\nU-Net. Concurrently, We pinpoint the extended generation times to\nself-attention redundancy in U-Net's top blocks. To address these issues, we\npropose a tuning-free higher-resolution framework named HiDiffusion.\nSpecifically, HiDiffusion contains Resolution-Aware U-Net (RAU-Net) that\ndynamically adjusts the feature map size to resolve object duplication and\nengages Modified Shifted Window Multi-head Self-Attention (MSW-MSA) that\nutilizes optimized window attention to reduce computations. we can integrate\nHiDiffusion into various pretrained diffusion models to scale image generation\nresolutions even to 4096x4096 at 1.5-6x the inference speed of previous\nmethods. 
Extensive experiments demonstrate that our approach can address object\nduplication and heavy computation issues, achieving state-of-the-art\nperformance on higher-resolution image synthesis tasks.\n","authors":["Shen Zhang","Zhaowei Chen","Zhenyu Zhao","Yuhao Chen","Yao Tang","Jiajun Liang"],"pdf_url":"https://arxiv.org/pdf/2311.17528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18532v1","updated":"2024-04-29T09:19:05Z","published":"2024-04-29T09:19:05Z","title":"MileBench: Benchmarking MLLMs in Long Context","summary":" Despite the advancements and impressive performance of Multimodal Large\nLanguage Models (MLLMs) on benchmarks, their effectiveness in real-world,\nlong-context, and multi-image tasks is unclear due to the benchmarks' limited\nscope. Existing benchmarks often focus on single-image and short-text samples,\nand when assessing multi-image tasks, they either limit the image count or\nfocus on specific task (e.g time-series captioning), potentially obscuring the\nperformance challenges of MLLMs. To address these limitations, we introduce\nMileBench, a pioneering benchmark designed to test the MultImodal Long-contExt\ncapabilities of MLLMs. This benchmark comprises not only multimodal long\ncontexts, but also multiple tasks requiring both comprehension and generation.\nWe establish two distinct evaluation sets, diagnostic and realistic, to\nsystematically assess MLLMs' long-context adaptation capacity and their ability\nto complete tasks in long-context scenarios. Our experimental results, obtained\nfrom testing 20 models, revealed that while the closed-source GPT-4(Vision) and\nGemini 1.5 outperform others, most open-source MLLMs struggle in long-context\nsituations. Interestingly, the performance gap tends to widen with an increase\nin the number of images. We strongly encourage an intensification of research\nefforts towards enhancing MLLMs' long-context capabilities, especially in\nscenarios involving multiple images.\n","authors":["Dingjie Song","Shunian Chen","Guiming Hardy Chen","Fei Yu","Xiang Wan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2404.18532v1.pdf","comment":"29 pages, 13 figures, 14 tables"},{"id":"http://arxiv.org/abs/2403.09993v2","updated":"2024-04-29T08:55:45Z","published":"2024-03-15T03:27:39Z","title":"TRG-Net: An Interpretable and Controllable Rain Generator","summary":" Exploring and modeling rain generation mechanism is critical for augmenting\npaired data to ease training of rainy image processing models. Against this\ntask, this study proposes a novel deep learning based rain generator, which\nfully takes the physical generation mechanism underlying rains into\nconsideration and well encodes the learning of the fundamental rain factors\n(i.e., shape, orientation, length, width and sparsity) explicitly into the deep\nnetwork. Its significance lies in that the generator not only elaborately\ndesign essential elements of the rain to simulate expected rains, like\nconventional artificial strategies, but also finely adapt to complicated and\ndiverse practical rainy images, like deep learning methods. By rationally\nadopting filter parameterization technique, we first time achieve a deep\nnetwork that is finely controllable with respect to rain factors and able to\nlearn the distribution of these factors purely from data. 
Our unpaired\ngeneration experiments demonstrate that the rain generated by the proposed rain\ngenerator is not only of higher quality, but also more effective for deraining\nand downstream tasks compared to current state-of-the-art rain generation\nmethods. Besides, the paired data augmentation experiments, including both\nin-distribution and out-of-distribution (OOD), further validate the diversity\nof samples generated by our model for in-distribution deraining and OOD\ngeneralization tasks.\n","authors":["Zhiqiang Pang","Hong Wang","Qi Xie","Deyu Meng","Zongben Xu"],"pdf_url":"https://arxiv.org/pdf/2403.09993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05208v2","updated":"2024-04-29T08:47:02Z","published":"2023-06-08T14:05:06Z","title":"PriSampler: Mitigating Property Inference of Diffusion Models","summary":" Diffusion models have been remarkably successful in data synthesis. However,\nwhen these models are applied to sensitive datasets, such as banking and human\nface data, they might bring up severe privacy concerns. This work\nsystematically presents the first privacy study about property inference\nattacks against diffusion models, where adversaries aim to extract sensitive\nglobal properties of its training set from a diffusion model. Specifically, we\nfocus on the most practical attack scenario: adversaries are restricted to\naccessing only synthetic data. Under this realistic scenario, we conduct a\ncomprehensive evaluation of property inference attacks on various diffusion\nmodels trained on diverse data types, including tabular and image datasets. A\nbroad range of evaluations reveals that diffusion models and their samplers are\nuniversally vulnerable to property inference attacks. In response, we propose a\nnew model-agnostic plug-in method PriSampler to mitigate the risks of the\nproperty inference of diffusion models. PriSampler can be directly applied to\nwell-trained diffusion models and support both stochastic and deterministic\nsampling. Extensive experiments illustrate the effectiveness of our defense,\nand it can lead adversaries to infer the proportion of properties as close as\npredefined values that model owners wish. Notably, PriSampler also shows its\nsignificantly superior performance to diffusion models trained with\ndifferential privacy on both model utility and defense performance. This work\nwill elevate the awareness of preventing property inference attacks and\nencourage privacy-preserving synthetic data release.\n","authors":["Hailong Hu","Jun Pang"],"pdf_url":"https://arxiv.org/pdf/2306.05208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18504v1","updated":"2024-04-29T08:46:43Z","published":"2024-04-29T08:46:43Z","title":"Multisensor Data Fusion for Automatized Insect Monitoring (KInsecta)","summary":" Insect populations are declining globally, making systematic monitoring\nessential for conservation. Most classical methods involve death traps and\ncounter insect conservation. This paper presents a multisensor approach that\nuses AI-based data fusion for insect classification. The system is designed as\nlow-cost setup and consists of a camera module and an optical wing beat sensor\nas well as environmental sensors to measure temperature, irradiance or daytime\nas prior information. The system has been tested in the laboratory and in the\nfield. First tests on a small very unbalanced data set with 7 species show\npromising results for species classification. 
The multisensor system will\nsupport biodiversity and agriculture studies.\n","authors":["Martin Tschaikner","Danja Brandt","Henning Schmidt","Felix Bießmann","Teodor Chiaburu","Ilona Schrimpf","Thomas Schrimpf","Alexandra Stadel","Frank Haußer","Ingeborg Beckers"],"pdf_url":"https://arxiv.org/pdf/2404.18504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01603v2","updated":"2024-04-29T08:32:37Z","published":"2022-10-04T13:27:38Z","title":"Neural-Symbolic Recursive Machine for Systematic Generalization","summary":" Current learning models often struggle with human-like systematic\ngeneralization, particularly in learning compositional rules from limited data\nand extrapolating them to novel combinations. We introduce the Neural-Symbolic\nRecursive Machine (NSR), whose core is a Grounded Symbol System (GSS), allowing\nfor the emergence of combinatorial syntax and semantics directly from training\ndata. The NSR employs a modular design that integrates neural perception,\nsyntactic parsing, and semantic reasoning. These components are synergistically\ntrained through a novel deduction-abduction algorithm. Our findings demonstrate\nthat NSR's design, imbued with the inductive biases of equivariance and\ncompositionality, grants it the expressiveness to adeptly handle diverse\nsequence-to-sequence tasks and achieve unparalleled systematic generalization.\nWe evaluate NSR's efficacy across four challenging benchmarks designed to probe\nsystematic generalization capabilities: SCAN for semantic parsing, PCFG for\nstring manipulation, HINT for arithmetic reasoning, and a compositional machine\ntranslation task. The results affirm NSR's superiority over contemporary neural\nand hybrid models in terms of generalization and transferability.\n","authors":["Qing Li","Yixin Zhu","Yitao Liang","Ying Nian Wu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2210.01603v2.pdf","comment":"ICLR 2024. Project website: https://liqing-ustc.github.io/NSR/"},{"id":"http://arxiv.org/abs/2402.08439v2","updated":"2024-04-29T08:25:20Z","published":"2024-02-13T13:18:18Z","title":"JeFaPaTo -- A joint toolbox for blinking analysis and facial features\n extraction","summary":" Analyzing facial features and expressions is a complex task in computer\nvision. The human face is intricate, with significant shape, texture, and\nappearance variations. In medical contexts, facial structures and movements\nthat differ from the norm are particularly important to study and require\nprecise analysis to understand the underlying conditions. Given that solely the\nfacial muscles, innervated by the facial nerve, are responsible for facial\nexpressions, facial palsy can lead to severe impairments in facial movements.\n One affected area of interest is the subtle movements involved in blinking.\nIt is an intricate spontaneous process that is not yet fully understood and\nneeds high-resolution, time-specific analysis for detailed understanding.\nHowever, a significant challenge is that many computer vision techniques demand\nprogramming skills for automated extraction and analysis, making them less\naccessible to medical professionals who may not have these skills. The Jena\nFacial Palsy Toolbox (JeFaPaTo) has been developed to bridge this gap. It\nutilizes cutting-edge computer vision algorithms and offers a user-friendly\ninterface for those without programming expertise. 
This toolbox makes advanced\nfacial analysis more accessible to medical experts, simplifying integration\ninto their workflow.\n","authors":["Tim Büchner","Oliver Mothes","Orlando Guntinas-Lichius","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2402.08439v2.pdf","comment":"A Preprint - Submitted to the Journal of Open Source Software; 7\n pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.06031v5","updated":"2024-04-29T07:19:31Z","published":"2023-11-10T12:38:16Z","title":"Diagonal Hierarchical Consistency Learning for Semi-supervised Medical\n Image Segmentation","summary":" Medical image segmentation, which is essential for many clinical\napplications, has achieved almost human-level performance via data-driven deep\nlearning technologies. Nevertheless, its performance is predicated upon the\ncostly process of manually annotating a vast amount of medical images. To this\nend, we propose a novel framework for robust semi-supervised medical image\nsegmentation using diagonal hierarchical consistency learning (DiHC-Net).\nFirst, it is composed of multiple sub-models with identical multi-scale\narchitecture but with distinct sub-layers, such as up-sampling and\nnormalisation layers. Second, with mutual consistency, a novel consistency\nregularisation is enforced between one model's intermediate and final\nprediction and soft pseudo labels from other models in a diagonal hierarchical\nfashion. A series of experiments verifies the efficacy of our simple framework,\noutperforming all previous approaches on public benchmark dataset covering\norgan and tumour.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2311.06031v5.pdf","comment":"Accepted to IEEE EMBC 2024 (46th Annual International Conference of\n the IEEE Engineering in Medicine & Biology Society)"},{"id":"http://arxiv.org/abs/2401.12422v2","updated":"2024-04-29T07:14:45Z","published":"2024-01-23T01:11:10Z","title":"InverseMatrixVT3D: An Efficient Projection Matrix-Based Approach for 3D\n Occupancy Prediction","summary":" This paper introduces InverseMatrixVT3D, an efficient method for transforming\nmulti-view image features into 3D feature volumes for 3D semantic occupancy\nprediction. Existing methods for constructing 3D volumes often rely on depth\nestimation, device-specific operators, or transformer queries, which hinders\nthe widespread adoption of 3D occupancy models. In contrast, our approach\nleverages two projection matrices to store the static mapping relationships and\nmatrix multiplications to efficiently generate global Bird's Eye View (BEV)\nfeatures and local 3D feature volumes. Specifically, we achieve this by\nperforming matrix multiplications between multi-view image feature maps and two\nsparse projection matrices. We introduce a sparse matrix handling technique for\nthe projection matrices to optimize GPU memory usage. Moreover, a global-local\nattention fusion module is proposed to integrate the global BEV features with\nthe local 3D feature volumes to obtain the final 3D volume. We also employ a\nmulti-scale supervision mechanism to enhance performance further. Extensive\nexperiments performed on the nuScenes and SemanticKITTI datasets reveal that\nour approach not only stands out for its simplicity and effectiveness but also\nachieves the top performance in detecting vulnerable road users (VRU), crucial\nfor autonomous driving and road safety. 
The code has been made available at:\nhttps://github.com/DanielMing123/InverseMatrixVT3D\n","authors":["Zhenxing Ming","Julie Stephany Berrio","Mao Shan","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2401.12422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17255v2","updated":"2024-04-29T06:55:56Z","published":"2024-04-26T08:51:31Z","title":"SDFD: Building a Versatile Synthetic Face Image Dataset with Diverse\n Attributes","summary":" AI systems rely on extensive training on large datasets to address various\ntasks. However, image-based systems, particularly those used for demographic\nattribute prediction, face significant challenges. Many current face image\ndatasets primarily focus on demographic factors such as age, gender, and skin\ntone, overlooking other crucial facial attributes like hairstyle and\naccessories. This narrow focus limits the diversity of the data and\nconsequently the robustness of AI systems trained on them. This work aims to\naddress this limitation by proposing a methodology for generating synthetic\nface image datasets that capture a broader spectrum of facial diversity.\nSpecifically, our approach integrates a systematic prompt formulation strategy,\nencompassing not only demographics and biometrics but also non-permanent traits\nlike make-up, hairstyle, and accessories. These prompts guide a\nstate-of-the-art text-to-image model in generating a comprehensive dataset of\nhigh-quality realistic images and can be used as an evaluation set in face\nanalysis systems. Compared to existing datasets, our proposed dataset proves\nequally or more challenging in image classification tasks while being much\nsmaller in size.\n","authors":["Georgia Baltsou","Ioannis Sarridis","Christos Koutlis","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.17255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11991v2","updated":"2024-04-29T06:45:43Z","published":"2023-06-21T03:05:25Z","title":"Generalizable Metric Network for Cross-domain Person Re-identification","summary":" Person Re-identification (Re-ID) is a crucial technique for public security\nand has made significant progress in supervised settings. However, the\ncross-domain (i.e., domain generalization) scene presents a challenge in Re-ID\ntasks due to unseen test domains and domain-shift between the training and test\nsets. To tackle this challenge, most existing methods aim to learn\ndomain-invariant or robust features for all domains. In this paper, we observe\nthat the data-distribution gap between the training and test sets is smaller in\nthe sample-pair space than in the sample-instance space. Based on this\nobservation, we propose a Generalizable Metric Network (GMN) to further explore\nsample similarity in the sample-pair space. Specifically, we add a Metric\nNetwork (M-Net) after the main network and train it on positive and negative\nsample-pair features, which is then employed during the test stage.\nAdditionally, we introduce the Dropout-based Perturbation (DP) module to\nenhance the generalization capability of the metric network by enriching the\nsample-pair diversity. Moreover, we develop a Pair-Identity Center (PIC) loss\nto enhance the model's discrimination by ensuring that sample-pair features\nwith the same pair-identity are consistent. 
We validate the effectiveness of\nour proposed method through a lot of experiments on multiple benchmark datasets\nand confirm the value of each module in our GMN.\n","authors":["Lei Qi","Ziang Liu","Yinghuan Shi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2306.11991v2.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2404.18461v1","updated":"2024-04-29T06:44:33Z","published":"2024-04-29T06:44:33Z","title":"Clicks2Line: Using Lines for Interactive Image Segmentation","summary":" For click-based interactive segmentation methods, reducing the number of\nclicks required to obtain a desired segmentation result is essential. Although\nrecent click-based methods yield decent segmentation results, we observe that\nsubstantial amount of clicks are required to segment elongated regions. To\nreduce the amount of user-effort required, we propose using lines instead of\nclicks for such cases. In this paper, an interactive segmentation algorithm\nwhich adaptively adopts either clicks or lines as input is proposed.\nExperimental results demonstrate that using lines can generate better\nsegmentation results than clicks for several cases.\n","authors":["Chaewon Lee","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2404.18461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18459v1","updated":"2024-04-29T06:35:34Z","published":"2024-04-29T06:35:34Z","title":"Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in\n the Wild","summary":" Large language models have evolved data-efficient generalists, benefiting\nfrom the universal language interface and large-scale pre-training. However,\nconstructing a data-efficient generalist for dense visual prediction presents a\ndistinct challenge due to the variation in label structures across different\ntasks. Consequently, generalization to unseen dense prediction tasks in the\nlow-data regime is not straightforward and has received less attention from\nprevious vision generalists. In this study, we explore a universal model that\ncan flexibly adapt to unseen dense label structures with a few examples,\nenabling it to serve as a data-efficient vision generalist in diverse\nreal-world scenarios. To this end, we base our method on a powerful\nmeta-learning framework and explore several axes to improve its performance and\nversatility for real-world problems, such as flexible adaptation mechanisms and\nscalability. We evaluate our model across a spectrum of unseen real-world\nscenarios where low-shot learning is desirable, including video, 3D, medical,\nbiological, and user-interactive tasks. Equipped with a generic architecture\nand an effective adaptation mechanism, our model flexibly adapts to all of\nthese tasks with at most 50 labeled images, showcasing a significant\nadvancement over existing data-efficient generalist approaches. Codes are\navailable at https://github.com/GitGyun/chameleon.\n","authors":["Donggyun Kim","Seongwoong Cho","Semin Kim","Chong Luo","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.18459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18458v1","updated":"2024-04-29T06:32:28Z","published":"2024-04-29T06:32:28Z","title":"Autonomous Quality and Hallucination Assessment for Virtual Tissue\n Staining and Digital Pathology","summary":" Histopathological staining of human tissue is essential in the diagnosis of\nvarious diseases. 
The recent advances in virtual tissue staining technologies\nusing AI alleviate some of the costly and tedious steps involved in the\ntraditional histochemical staining process, permitting multiplexed rapid\nstaining of label-free tissue without using staining reagents, while also\npreserving tissue. However, potential hallucinations and artifacts in these\nvirtually stained tissue images pose concerns, especially for the clinical\nutility of these approaches. Quality assessment of histology images is\ngenerally performed by human experts, which can be subjective and depends on\nthe training level of the expert. Here, we present an autonomous quality and\nhallucination assessment method (termed AQuA), mainly designed for virtual\ntissue staining, while also being applicable to histochemical staining. AQuA\nachieves 99.8% accuracy when detecting acceptable and unacceptable virtually\nstained tissue images without access to ground truth, also presenting an\nagreement of 98.5% with the manual assessments made by board-certified\npathologists. Besides, AQuA achieves super-human performance in identifying\nrealistic-looking, virtually stained hallucinatory images that would normally\nmislead human diagnosticians by deceiving them into diagnosing patients that\nnever existed. We further demonstrate the wide adaptability of AQuA across\nvarious virtually and histochemically stained tissue images and showcase its\nstrong external generalization to detect unseen hallucination patterns of\nvirtual staining network models as well as artifacts observed in the\ntraditional histochemical staining workflow. This framework creates new\nopportunities to enhance the reliability of virtual staining and will provide\nquality assurance for various image generation and transformation tasks in\ndigital pathology and computational imaging.\n","authors":["Luzhe Huang","Yuzhu Li","Nir Pillar","Tal Keidar Haran","William Dean Wallace","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2404.18458v1.pdf","comment":"37 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2208.05853v3","updated":"2024-04-29T06:32:01Z","published":"2022-08-11T14:44:33Z","title":"MultiMatch: Multi-task Learning for Semi-supervised Domain\n Generalization","summary":" Domain generalization (DG) aims at learning a model on source domains to well\ngeneralize on the unseen target domain. Although it has achieved great success,\nmost of existing methods require the label information for all training samples\nin source domains, which is time-consuming and expensive in the real-world\napplication. In this paper, we resort to solving the semi-supervised domain\ngeneralization (SSDG) task, where there are a few label information in each\nsource domain. To address the task, we first analyze the theory of the\nmulti-domain learning, which highlights that 1) mitigating the impact of domain\ngap and 2) exploiting all samples to train the model can effectively reduce the\ngeneralization error in each source domain so as to improve the quality of\npseudo-labels. According to the analysis, we propose MultiMatch, i.e.,\nextending FixMatch to the multi-task learning framework, producing the\nhigh-quality pseudo-label for SSDG. 
To be specific, we consider each training\ndomain as a single task (i.e., local task) and combine all training domains\ntogether (i.e., global task) to train an extra task for the unseen test domain.\nIn the multi-task framework, we utilize the independent BN and classifier for\neach task, which can effectively alleviate the interference from different\ndomains during pseudo-labeling. Also, most of parameters in the framework are\nshared, which can be trained by all training samples sufficiently. Moreover, to\nfurther boost the pseudo-label accuracy and the model's generalization, we fuse\nthe predictions from the global task and local task during training and\ntesting, respectively. A series of experiments validate the effectiveness of\nthe proposed method, and it outperforms the existing semi-supervised methods\nand the SSDG method on several benchmark DG datasets.\n","authors":["Lei Qi","Hongpeng Yang","Yinghuan Shi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2208.05853v3.pdf","comment":"Accepted by ACM TOMM"},{"id":"http://arxiv.org/abs/2404.18454v1","updated":"2024-04-29T06:24:32Z","published":"2024-04-29T06:24:32Z","title":"3D Gaussian Splatting with Deferred Reflection","summary":" The advent of neural and Gaussian-based radiance field methods have achieved\ngreat success in the field of novel view synthesis. However, specular\nreflection remains non-trivial, as the high frequency radiance field is\nnotoriously difficult to fit stably and accurately. We present a deferred\nshading method to effectively render specular reflection with Gaussian\nsplatting. The key challenge comes from the environment map reflection model,\nwhich requires accurate surface normal while simultaneously bottlenecks normal\nestimation with discontinuous gradients. We leverage the per-pixel reflection\ngradients generated by deferred shading to bridge the optimization process of\nneighboring Gaussians, allowing nearly correct normal estimations to gradually\npropagate and eventually spread over all reflective objects. Our method\nsignificantly outperforms state-of-the-art techniques and concurrent work in\nsynthesizing high-quality specular reflection effects, demonstrating a\nconsistent improvement of peak signal-to-noise ratio (PSNR) for both synthetic\nand real-world scenes, while running at a frame rate almost identical to\nvanilla Gaussian splatting.\n","authors":["Keyang Ye","Qiming Hou","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.18454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18448v1","updated":"2024-04-29T06:17:56Z","published":"2024-04-29T06:17:56Z","title":"MFP: Making Full Use of Probability Maps for Interactive Image\n Segmentation","summary":" In recent interactive segmentation algorithms, previous probability maps are\nused as network input to help predictions in the current segmentation round.\nHowever, despite the utilization of previous masks, useful information\ncontained in the probability maps is not well propagated to the current\npredictions. In this paper, to overcome this limitation, we propose a novel and\neffective algorithm for click-based interactive image segmentation, called MFP,\nwhich attempts to make full use of probability maps. We first modulate previous\nprobability maps to enhance their representations of user-specified objects.\nThen, we feed the modulated probability maps as additional input to the\nsegmentation network. 
We implement the proposed MFP algorithm based on the\nResNet-34, HRNet-18, and ViT-B backbones and assess the performance extensively\non various datasets. It is demonstrated that MFP meaningfully outperforms the\nexisting algorithms using identical backbones. The source codes are available\nat \\href{https://github.com/cwlee00/MFP}{https://github.com/cwlee00/MFP}.\n","authors":["Chaewon Lee","Seon-Ho Lee","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2404.18448v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.18439v1","updated":"2024-04-29T05:29:26Z","published":"2024-04-29T05:29:26Z","title":"$ν$-DBA: Neural Implicit Dense Bundle Adjustment Enables Image-Only\n Driving Scene Reconstruction","summary":" The joint optimization of the sensor trajectory and 3D map is a crucial\ncharacteristic of bundle adjustment (BA), essential for autonomous driving.\nThis paper presents $\\nu$-DBA, a novel framework implementing geometric dense\nbundle adjustment (DBA) using 3D neural implicit surfaces for map\nparametrization, which optimizes both the map surface and trajectory poses\nusing geometric error guided by dense optical flow prediction. Additionally, we\nfine-tune the optical flow model with per-scene self-supervision to further\nimprove the quality of the dense mapping. Our experimental results on multiple\ndriving scene datasets demonstrate that our method achieves superior trajectory\noptimization and dense reconstruction accuracy. We also investigate the\ninfluences of photometric error and different neural geometric priors on the\nperformance of surface reconstruction and novel view synthesis. Our method\nstands as a significant step towards leveraging neural implicit representations\nin dense bundle adjustment for more accurate trajectories and detailed\nenvironmental mapping.\n","authors":["Yunxuan Mao","Bingqi Shen","Yifei Yang","Kai Wang","Rong Xiong","Yiyi Liao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2404.18439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18433v1","updated":"2024-04-29T05:17:33Z","published":"2024-04-29T05:17:33Z","title":"ShadowMaskFormer: Mask Augmented Patch Embeddings for Shadow Removal","summary":" Transformer recently emerged as the de facto model for computer vision tasks\nand has also been successfully applied to shadow removal. However, these\nexisting methods heavily rely on intricate modifications to the attention\nmechanisms within the transformer blocks while using a generic patch embedding.\nAs a result, it often leads to complex architectural designs requiring\nadditional computation resources. In this work, we aim to explore the efficacy\nof incorporating shadow information within the early processing stage.\nAccordingly, we propose a transformer-based framework with a novel patch\nembedding that is tailored for shadow removal, dubbed ShadowMaskFormer.\nSpecifically, we present a simple and effective mask-augmented patch embedding\nto integrate shadow information and promote the model's emphasis on acquiring\nknowledge for shadow regions. 
Extensive experiments conducted on the ISTD,\nISTD+, and SRD benchmark datasets demonstrate the efficacy of our method\nagainst state-of-the-art approaches while using fewer model parameters.\n","authors":["Zhuohao Li","Guoyang Xie","Guannan Jiang","Zhichao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.18433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17364v2","updated":"2024-04-29T05:11:20Z","published":"2024-04-26T12:27:57Z","title":"MV-VTON: Multi-View Virtual Try-On with Diffusion Models","summary":" The goal of image-based virtual try-on is to generate an image of the target\nperson naturally wearing the given clothing. However, most existing methods\nsolely focus on the frontal try-on using the frontal clothing. When the views\nof the clothing and person are significantly inconsistent, particularly when\nthe person's view is non-frontal, the results are unsatisfactory. To address\nthis challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to\nreconstruct the dressing results of a person from multiple views using the\ngiven clothes. On the one hand, given that single-view clothes provide\ninsufficient information for MV-VTON, we instead employ two images, i.e., the\nfrontal and back views of the clothing, to encompass the complete view as much\nas possible. On the other hand, the diffusion models that have demonstrated\nsuperior abilities are adopted to perform our MV-VTON. In particular, we\npropose a view-adaptive selection method where hard-selection and\nsoft-selection are applied to the global and local clothing feature extraction,\nrespectively. This ensures that the clothing features are roughly fit to the\nperson's view. Subsequently, we suggest a joint attention block to align and\nfuse clothing features with person features. Additionally, we collect a MV-VTON\ndataset, i.e., Multi-View Garment (MVG), in which each person has multiple\nphotos with diverse views and poses. Experiments show that the proposed method\nnot only achieves state-of-the-art results on MV-VTON task using our MVG\ndataset, but also has superiority on frontal-view virtual try-on task using\nVITON-HD and DressCode datasets. Codes and datasets will be publicly released\nat https://github.com/hywang2002/MV-VTON .\n","authors":["Haoyu Wang","Zhilu Zhang","Donglin Di","Shiliang Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.17364v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.09533v2","updated":"2024-04-29T04:58:20Z","published":"2024-04-15T07:53:07Z","title":"WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for\n Improved Feature Alignment and Local Information Fusion","summary":" Low-dose computed tomography (LDCT) has become the technology of choice for\ndiagnostic medical imaging, given its lower radiation dose compared to standard\nCT, despite increasing image noise and potentially affecting diagnostic\naccuracy. To address this, advanced deep learning-based LDCT denoising\nalgorithms have been developed, primarily using Convolutional Neural Networks\n(CNNs) or Transformer Networks with the Unet architecture. This architecture\nenhances image detail by integrating feature maps from the encoder and decoder\nvia skip connections. However, current methods often overlook enhancements to\nthe Unet architecture itself, focusing instead on optimizing encoder and\ndecoder structures. 
This approach can be problematic due to the significant\ndifferences in feature map characteristics between the encoder and decoder,\nwhere simple fusion strategies may not effectively reconstruct images.In this\npaper, we introduce WiTUnet, a novel LDCT image denoising method that utilizes\nnested, dense skip pathways instead of traditional skip connections to improve\nfeature integration. WiTUnet also incorporates a windowed Transformer structure\nto process images in smaller, non-overlapping segments, reducing computational\nload. Additionally, the integration of a Local Image Perception Enhancement\n(LiPe) module in both the encoder and decoder replaces the standard multi-layer\nperceptron (MLP) in Transformers, enhancing local feature capture and\nrepresentation. Through extensive experimental comparisons, WiTUnet has\ndemonstrated superior performance over existing methods in key metrics such as\nPeak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean\nSquare Error (RMSE), significantly improving noise removal and image quality.\n","authors":["Bin Wang","Fei Deng","Peifan Jiang","Shuang Wang","Xiao Han","Zhixuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18426v1","updated":"2024-04-29T04:56:52Z","published":"2024-04-29T04:56:52Z","title":"Efficient Meta-Learning Enabled Lightweight Multiscale Few-Shot Object\n Detection in Remote Sensing Images","summary":" Presently, the task of few-shot object detection (FSOD) in remote sensing\nimages (RSIs) has become a focal point of attention. Numerous few-shot\ndetectors, particularly those based on two-stage detectors, face challenges\nwhen dealing with the multiscale complexities inherent in RSIs. Moreover, these\ndetectors present impractical characteristics in real-world applications,\nmainly due to their unwieldy model parameters when handling large amount of\ndata. In contrast, we recognize the advantages of one-stage detectors,\nincluding high detection speed and a global receptive field. Consequently, we\nchoose the YOLOv7 one-stage detector as a baseline and subject it to a novel\nmeta-learning training framework. This transformation allows the detector to\nadeptly address FSOD tasks while capitalizing on its inherent advantage of\nlightweight. Additionally, we thoroughly investigate the samples generated by\nthe meta-learning strategy and introduce a novel meta-sampling approach to\nretain samples produced by our designed meta-detection head. Coupled with our\ndevised meta-cross loss, we deliberately utilize ``negative samples\" that are\noften overlooked to extract valuable knowledge from them. This approach serves\nto enhance detection accuracy and efficiently refine the overall meta-learning\nstrategy. To validate the effectiveness of our proposed detector, we conducted\nperformance comparisons with current state-of-the-art detectors using the DIOR\nand NWPU VHR-10.v2 datasets, yielding satisfactory results.\n","authors":["Wenbin Guan","Zijiu Yang","Xiaohong Wu","Liqiong Chen","Feng Huang","Xiaohai He","Honggang Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08403v2","updated":"2024-04-29T04:47:25Z","published":"2023-11-14T18:59:59Z","title":"Instant3D: Instant Text-to-3D Generation","summary":" Text-to-3D generation has attracted much attention from the computer vision\ncommunity. 
Existing methods mainly optimize a neural field from scratch for\neach text prompt, relying on heavy and repetitive training cost which impedes\ntheir practical deployment. In this paper, we propose a novel framework for\nfast text-to-3D generation, dubbed Instant3D. Once trained, Instant3D is able\nto create a 3D object for an unseen text prompt in less than one second with a\nsingle run of a feedforward network. We achieve this remarkable speed by\ndevising a new network that directly constructs a 3D triplane from a text\nprompt. The core innovation of our Instant3D lies in our exploration of\nstrategies to effectively inject text conditions into the network. In\nparticular, we propose to combine three key mechanisms: cross-attention, style\ninjection, and token-to-plane transformation, which collectively ensure precise\nalignment of the output with the input text. Furthermore, we propose a simple\nyet effective activation function, the scaled-sigmoid, to replace the original\nsigmoid function, which speeds up the training convergence by more than ten\ntimes. Finally, to address the Janus (multi-head) problem in 3D generation, we\npropose an adaptive Perp-Neg algorithm that can dynamically adjust its concept\nnegation scales according to the severity of the Janus problem during training,\neffectively reducing the multi-head effect. Extensive experiments on a wide\nvariety of benchmark datasets demonstrate that the proposed algorithm performs\nfavorably against the state-of-the-art methods both qualitatively and\nquantitatively, while achieving significantly better efficiency. The code,\ndata, and models are available at https://github.com/ming1993li/Instant3DCodes.\n","authors":["Ming Li","Pan Zhou","Jia-Wei Liu","Jussi Keppo","Min Lin","Shuicheng Yan","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2311.08403v2.pdf","comment":"Project page: https://ming1993li.github.io/Instant3DProj"},{"id":"http://arxiv.org/abs/2404.18423v1","updated":"2024-04-29T04:47:23Z","published":"2024-04-29T04:47:23Z","title":"Unsupervised Dynamics Prediction with Object-Centric Kinematics","summary":" Human perception involves discerning complex multi-object scenes into\ntime-static object appearance (\\ie, size, shape, color) and time-varying object\nmotion (\\ie, location, velocity, acceleration). This innate ability to\nunconsciously understand the environment is the motivation behind the success\nof dynamics modeling. Object-centric representations have emerged as a\npromising tool for dynamics prediction, yet they primarily focus on the\nobjects' appearance, often overlooking other crucial attributes. In this paper,\nwe propose Object-Centric Kinematics (OCK), a framework for dynamics prediction\nleveraging object-centric representations. Our model utilizes a novel component\nnamed object kinematics, which comprises low-level structured states of\nobjects' position, velocity, and acceleration. The object kinematics are\nobtained via either implicit or explicit approaches, enabling comprehensive\nspatiotemporal object reasoning, and integrated through various transformer\nmechanisms, facilitating effective object-centric dynamics modeling. Our model\ndemonstrates superior performance when handling objects and backgrounds in\ncomplex scenes characterized by a wide range of object attributes and dynamic\nmovements. 
Moreover, our model demonstrates generalization capabilities across\ndiverse synthetic environments, highlighting its potential for broad\napplicability in vision-related tasks.\n","authors":["Yeon-Ji Song","Suhyung Choi","Jaein Kim","Jin-Hwa Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18423v1.pdf","comment":"15 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.18419v1","updated":"2024-04-29T04:32:11Z","published":"2024-04-29T04:32:11Z","title":"Research on Intelligent Aided Diagnosis System of Medical Image Based on\n Computer Deep Learning","summary":" This paper combines Struts and Hibernate two architectures together, using\nDAO (Data Access Object) to store and access data. Then a set of dual-mode\nhumidity medical image library suitable for deep network is established, and a\ndual-mode medical image assisted diagnosis method based on the image is\nproposed. Through the test of various feature extraction methods, the optimal\noperating characteristic under curve product (AUROC) is 0.9985, the recall rate\nis 0.9814, and the accuracy is 0.9833. This method can be applied to clinical\ndiagnosis, and it is a practical method. Any outpatient doctor can register\nquickly through the system, or log in to the platform to upload the image to\nobtain more accurate images. Through the system, each outpatient physician can\nquickly register or log in to the platform for image uploading, thus obtaining\nmore accurate images. The segmentation of images can guide doctors in clinical\ndepartments. Then the image is analyzed to determine the location and nature of\nthe tumor, so as to make targeted treatment.\n","authors":["Jiajie Yuan","Linxiao Wu","Yulu Gong","Zhou Yu","Ziang Liu","Shuyao He"],"pdf_url":"https://arxiv.org/pdf/2404.18419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18416v1","updated":"2024-04-29T04:11:28Z","published":"2024-04-29T04:11:28Z","title":"Capabilities of Gemini Models in Medicine","summary":" Excellence in a wide variety of medical applications poses considerable\nchallenges for AI, requiring advanced reasoning, access to up-to-date medical\nknowledge and understanding of complex multimodal data. Gemini models, with\nstrong general capabilities in multimodal and long-context reasoning, offer\nexciting possibilities in medicine. Building on these core strengths of Gemini,\nwe introduce Med-Gemini, a family of highly capable multimodal models that are\nspecialized in medicine with the ability to seamlessly use web search, and that\ncan be efficiently tailored to novel modalities using custom encoders. We\nevaluate Med-Gemini on 14 medical benchmarks, establishing new state-of-the-art\n(SoTA) performance on 10 of them, and surpass the GPT-4 model family on every\nbenchmark where a direct comparison is viable, often by a wide margin. On the\npopular MedQA (USMLE) benchmark, our best-performing Med-Gemini model achieves\nSoTA performance of 91.1% accuracy, using a novel uncertainty-guided search\nstrategy. On 7 multimodal benchmarks including NEJM Image Challenges and MMMU\n(health & medicine), Med-Gemini improves over GPT-4V by an average relative\nmargin of 44.5%. We demonstrate the effectiveness of Med-Gemini's long-context\ncapabilities through SoTA performance on a needle-in-a-haystack retrieval task\nfrom long de-identified health records and medical video question answering,\nsurpassing prior bespoke methods using only in-context learning. 
Finally,\nMed-Gemini's performance suggests real-world utility by surpassing human\nexperts on tasks such as medical text summarization, alongside demonstrations\nof promising potential for multimodal medical dialogue, medical research and\neducation. Taken together, our results offer compelling evidence for\nMed-Gemini's potential, although further rigorous evaluation will be crucial\nbefore real-world deployment in this safety-critical domain.\n","authors":["Khaled Saab","Tao Tu","Wei-Hung Weng","Ryutaro Tanno","David Stutz","Ellery Wulczyn","Fan Zhang","Tim Strother","Chunjong Park","Elahe Vedadi","Juanma Zambrano Chaves","Szu-Yeu Hu","Mike Schaekermann","Aishwarya Kamath","Yong Cheng","David G. T. Barrett","Cathy Cheung","Basil Mustafa","Anil Palepu","Daniel McDuff","Le Hou","Tomer Golany","Luyang Liu","Jean-baptiste Alayrac","Neil Houlsby","Nenad Tomasev","Jan Freyberg","Charles Lau","Jonas Kemp","Jeremy Lai","Shekoofeh Azizi","Kimberly Kanada","SiWai Man","Kavita Kulkarni","Ruoxi Sun","Siamak Shakeri","Luheng He","Ben Caine","Albert Webson","Natasha Latysheva","Melvin Johnson","Philip Mansfield","Jian Lu","Ehud Rivlin","Jesper Anderson","Bradley Green","Renee Wong","Jonathan Krause","Jonathon Shlens","Ewa Dominowska","S. M. Ali Eslami","Claire Cui","Oriol Vinyals","Koray Kavukcuoglu","James Manyika","Jeff Dean","Demis Hassabis","Yossi Matias","Dale Webster","Joelle Barral","Greg Corrado","Christopher Semturs","S. Sara Mahdavi","Juraj Gottweis","Alan Karthikesalingam","Vivek Natarajan"],"pdf_url":"https://arxiv.org/pdf/2404.18416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18413v1","updated":"2024-04-29T04:01:30Z","published":"2024-04-29T04:01:30Z","title":"3AM: An Ambiguity-Aware Multi-Modal Machine Translation Dataset","summary":" Multimodal machine translation (MMT) is a challenging task that seeks to\nimprove translation quality by incorporating visual information. However,\nrecent studies have indicated that the visual information provided by existing\nMMT datasets is insufficient, causing models to disregard it and overestimate\ntheir capabilities. This issue presents a significant obstacle to the\ndevelopment of MMT research. This paper presents a novel solution to this issue\nby introducing 3AM, an ambiguity-aware MMT dataset comprising 26,000 parallel\nsentence pairs in English and Chinese, each with corresponding images. Our\ndataset is specifically designed to include more ambiguity and a greater\nvariety of both captions and images than other MMT datasets. We utilize a word\nsense disambiguation model to select ambiguous data from vision-and-language\ndatasets, resulting in a more challenging dataset. We further benchmark several\nstate-of-the-art MMT models on our proposed dataset. Experimental results show\nthat MMT models trained on our dataset exhibit a greater ability to exploit\nvisual information than those trained on other MMT datasets. Our work provides\na valuable resource for researchers in the field of multimodal learning and\nencourages further exploration in this area. The data, code and scripts are\nfreely available at https://github.com/MaxyLee/3AM.\n","authors":["Xinyu Ma","Xuebo Liu","Derek F. Wong","Jun Rao","Bei Li","Liang Ding","Lidia S. 
Chao","Dacheng Tao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18411v1","updated":"2024-04-29T04:00:19Z","published":"2024-04-29T04:00:19Z","title":"Multi-modal Perception Dataset of In-water Objects for Autonomous\n Surface Vehicles","summary":" This paper introduces the first publicly accessible multi-modal perception\ndataset for autonomous maritime navigation, focusing on in-water obstacles\nwithin the aquatic environment to enhance situational awareness for Autonomous\nSurface Vehicles (ASVs). This dataset, consisting of diverse objects\nencountered under varying environmental conditions, aims to bridge the research\ngap in marine robotics by providing a multi-modal, annotated, and ego-centric\nperception dataset, for object detection and classification. We also show the\napplicability of the proposed dataset's framework using deep learning-based\nopen-source perception algorithms that have shown success. We expect that our\ndataset will contribute to development of the marine autonomy pipeline and\nmarine (field) robotics. Please note this is a work-in-progress paper about our\non-going research that we plan to release in full via future publication.\n","authors":["Mingi Jeong","Arihant Chadda","Ziang Ren","Luyang Zhao","Haowen Liu","Monika Roznere","Aiwei Zhang","Yitao Jiang","Sabriel Achong","Samuel Lensgraf","Alberto Quattrini Li"],"pdf_url":"https://arxiv.org/pdf/2404.18411v1.pdf","comment":"Accepted to the IEEE ICRA Workshop on Field Robotics 2024"},{"id":"http://arxiv.org/abs/2404.18409v1","updated":"2024-04-29T03:57:43Z","published":"2024-04-29T03:57:43Z","title":"PKU-AIGIQA-4K: A Perceptual Quality Assessment Database for Both\n Text-to-Image and Image-to-Image AI-Generated Images","summary":" In recent years, image generation technology has rapidly advanced, resulting\nin the creation of a vast array of AI-generated images (AIGIs). However, the\nquality of these AIGIs is highly inconsistent, with low-quality AIGIs severely\nimpairing the visual experience of users. Due to the widespread application of\nAIGIs, the AI-generated image quality assessment (AIGIQA), aimed at evaluating\nthe quality of AIGIs from the perspective of human perception, has garnered\nincreasing interest among scholars. Nonetheless, current research has not yet\nfully explored this field. We have observed that existing databases are limited\nto images generated from single scenario settings. Databases such as AGIQA-1K,\nAGIQA-3K, and AIGCIQA2023, for example, only include images generated by\ntext-to-image generative models. This oversight highlights a critical gap in\nthe current research landscape, underscoring the need for dedicated databases\ncatering to image-to-image scenarios, as well as more comprehensive databases\nthat encompass a broader range of AI-generated image scenarios. Addressing\nthese issues, we have established a large scale perceptual quality assessment\ndatabase for both text-to-image and image-to-image AIGIs, named PKU-AIGIQA-4K.\nWe then conduct a well-organized subjective experiment to collect quality\nlabels for AIGIs and perform a comprehensive analysis of the PKU-AIGIQA-4K\ndatabase. Regarding the use of image prompts during the training process, we\npropose three image quality assessment (IQA) methods based on pre-trained\nmodels that include a no-reference method NR-AIGCIQA, a full-reference method\nFR-AIGCIQA, and a partial-reference method PR-AIGCIQA. 
Finally, leveraging the\nPKU-AIGIQA-4K database, we conduct extensive benchmark experiments and compare\nthe performance of the proposed methods and the current IQA methods.\n","authors":["Jiquan Yuan","Fanyi Yang","Jihe Li","Xinyan Cao","Jinming Che","Jinlong Lin","Xixin Cao"],"pdf_url":"https://arxiv.org/pdf/2404.18409v1.pdf","comment":"12 pages. arXiv admin note: substantial text overlap with\n arXiv:2311.15556"},{"id":"http://arxiv.org/abs/2404.18401v1","updated":"2024-04-29T03:36:05Z","published":"2024-04-29T03:36:05Z","title":"Spectral-Spatial Mamba for Hyperspectral Image Classification","summary":" Recently, deep learning models have achieved excellent performance in\nhyperspectral image (HSI) classification. Among the many deep models,\nTransformer has gradually attracted interest for its excellence in modeling the\nlong-range dependencies of spatial-spectral features in HSI. However,\nTransformer has the problem of quadratic computational complexity due to the\nself-attention mechanism, which is heavier than other models and thus has\nlimited adoption in HSI processing. Fortunately, the recently emerging state\nspace model-based Mamba shows great computational efficiency while achieving\nthe modeling power of Transformers. Therefore, in this paper, we make a\npreliminary attempt to apply the Mamba to HSI classification, leading to the\nproposed spectral-spatial Mamba (SS-Mamba). Specifically, the proposed SS-Mamba\nmainly consists of spectral-spatial token generation module and several stacked\nspectral-spatial Mamba blocks. Firstly, the token generation module converts\nany given HSI cube to spatial and spectral tokens as sequences. And then these\ntokens are sent to stacked spectral-spatial mamba blocks (SS-MB). Each SS-MB\nblock consists of two basic mamba blocks and a spectral-spatial feature\nenhancement module. The spatial and spectral tokens are processed separately by\nthe two basic mamba blocks, respectively. Besides, the feature enhancement\nmodule modulates spatial and spectral tokens using HSI sample's center region\ninformation. In this way, the spectral and spatial tokens cooperate with each\nother and achieve information fusion within each block. The experimental\nresults conducted on widely used HSI datasets reveal that the proposed model\nachieves competitive results compared with the state-of-the-art methods. The\nMamba-based method opens a new window for HSI classification.\n","authors":["Lingbo Huang","Yushi Chen","Xin He"],"pdf_url":"https://arxiv.org/pdf/2404.18401v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.15272v3","updated":"2024-04-29T03:25:14Z","published":"2024-04-23T17:59:01Z","title":"CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and\n Radiology Reports for Full-Body Scenarios","summary":" Medical Vision-Language Pretraining (Med-VLP) establishes a connection\nbetween visual content from medical images and the relevant textual\ndescriptions. Existing Med-VLP methods primarily focus on 2D images depicting a\nsingle body part, notably chest X-rays. In this paper, we extend the scope of\nMed-VLP to encompass 3D images, specifically targeting full-body scenarios, by\nusing a multimodal dataset of CT images and reports. Compared with the 2D\ncounterpart, 3D VLP is required to effectively capture essential semantics from\nsignificantly sparser representation in 3D imaging. 
In this paper, we introduce\nCT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method\nthat constructs organ-level image-text pairs to enhance multimodal contrastive\nlearning, aligning grounded visual features with precise diagnostic text.\nAdditionally, we developed an abnormality dictionary to augment contrastive\nlearning with diverse contrastive pairs. Our method, trained on a multimodal CT\ndataset comprising 44,011 organ-level vision-text pairs from 17,702 patients\nacross 104 organs, demonstrates it can identify organs and abnormalities in a\nzero-shot manner using natural languages. The performance of CT-GLIP is\nvalidated on a separate test set of 1,130 patients, focusing on the 16 most\nfrequent abnormalities across 7 organs. The experimental results show our\nmodel's superior performance over the standard CLIP framework across zero-shot\nand fine-tuning scenarios, using both CNN and ViT architectures.\n","authors":["Jingyang Lin","Yingda Xia","Jianpeng Zhang","Ke Yan","Le Lu","Jiebo Luo","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15272v3.pdf","comment":"12 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.06075v2","updated":"2024-04-29T03:21:49Z","published":"2024-04-09T07:25:30Z","title":"LIPT: Latency-aware Image Processing Transformer","summary":" Transformer is leading a trend in the field of image processing. Despite the\ngreat success that existing lightweight image processing transformers have\nachieved, they are tailored to FLOPs or parameters reduction, rather than\npractical inference acceleration. In this paper, we present a latency-aware\nimage processing transformer, termed LIPT. We devise the low-latency proportion\nLIPT block that substitutes memory-intensive operators with the combination of\nself-attention and convolutions to achieve practical speedup. Specifically, we\npropose a novel non-volatile sparse masking self-attention (NVSM-SA) that\nutilizes a pre-computing sparse mask to capture contextual information from a\nlarger window with no extra computation overload. Besides, a high-frequency\nreparameterization module (HRM) is proposed to make LIPT block\nreparameterization friendly, which improves the model's detail reconstruction\ncapability. Extensive experiments on multiple image processing tasks (e.g.,\nimage super-resolution (SR), JPEG artifact reduction, and image denoising)\ndemonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves\nreal-time GPU inference with state-of-the-art performance on multiple image SR\nbenchmarks.\n","authors":["Junbo Qiao","Wei Li","Haizhen Xie","Hanting Chen","Yunshuai Zhou","Zhijun Tu","Jie Hu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18399v1","updated":"2024-04-29T03:21:05Z","published":"2024-04-29T03:21:05Z","title":"Semantic Line Combination Detector","summary":" A novel algorithm, called semantic line combination detector (SLCD), to find\nan optimal combination of semantic lines is proposed in this paper. It\nprocesses all lines in each line combination at once to assess the overall\nharmony of the lines. First, we generate various line combinations from\nreliable lines. Second, we estimate the score of each line combination and\ndetermine the best one. Experimental results demonstrate that the proposed SLCD\noutperforms existing semantic line detectors on various datasets. 
Moreover, it\nis shown that SLCD can be applied effectively to three vision tasks of\nvanishing point detection, symmetry axis detection, and composition-based image\nretrieval. Our codes are available at https://github.com/Jinwon-Ko/SLCD.\n","authors":["Jinwon Ko","Dongkwon Jin","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2404.18399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18397v1","updated":"2024-04-29T03:17:47Z","published":"2024-04-29T03:17:47Z","title":"ViOCRVQA: Novel Benchmark Dataset and Vision Reader for Visual Question\n Answering by Understanding Vietnamese Text in Images","summary":" Optical Character Recognition - Visual Question Answering (OCR-VQA) is the\ntask of answering text information contained in images that have just been\nsignificantly developed in the English language in recent years. However, there\nare limited studies of this task in low-resource languages such as Vietnamese.\nTo this end, we introduce a novel dataset, ViOCRVQA (Vietnamese Optical\nCharacter Recognition - Visual Question Answering dataset), consisting of\n28,000+ images and 120,000+ question-answer pairs. In this dataset, all the\nimages contain text and questions about the information relevant to the text in\nthe images. We deploy ideas from state-of-the-art methods proposed for English\nto conduct experiments on our dataset, revealing the challenges and\ndifficulties inherent in a Vietnamese dataset. Furthermore, we introduce a\nnovel approach, called VisionReader, which achieved 0.4116 in EM and 0.6990 in\nthe F1-score on the test set. Through the results, we found that the OCR system\nplays a very important role in VQA models on the ViOCRVQA dataset. In addition,\nthe objects in the image also play a role in improving model performance. We\nopen access to our dataset at link (https://github.com/qhnhynmm/ViOCRVQA.git)\nfor further research in OCR-VQA task in Vietnamese.\n","authors":["Huy Quang Pham","Thang Kien-Bao Nguyen","Quan Van Nguyen","Dan Quang Tran","Nghia Hieu Nguyen","Kiet Van Nguyen","Ngan Luu-Thuy Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.18397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18394v1","updated":"2024-04-29T03:13:09Z","published":"2024-04-29T03:13:09Z","title":"Reconstructing Satellites in 3D from Amateur Telescope Images","summary":" This paper proposes a framework for the 3D reconstruction of satellites in\nlow-Earth orbit, utilizing videos captured by small amateur telescopes. The\nvideo data obtained from these telescopes differ significantly from data for\nstandard 3D reconstruction tasks, characterized by intense motion blur,\natmospheric turbulence, pervasive background light pollution, extended focal\nlength and constrained observational perspectives. To address these challenges,\nour approach begins with a comprehensive pre-processing workflow that\nencompasses deep learning-based image restoration, feature point extraction and\ncamera pose initialization. We proceed with the application of an improved 3D\nGaussian splatting algorithm for reconstructing the 3D model. Our technique\nsupports simultaneous 3D Gaussian training and pose estimation, enabling the\nrobust generation of intricate 3D point clouds from sparse, noisy data. The\nprocedure is further bolstered by a post-editing phase designed to eliminate\nnoise points inconsistent with our prior knowledge of a satellite's geometric\nconstraints. 
We validate our approach using both synthetic datasets and actual\nobservations of China's Space Station, showcasing its significant advantages\nover existing methods in reconstructing 3D space objects from ground-based\nobservations.\n","authors":["Zhiming Chang","Boyang Liu","Yifei Xia","Youming Guo","Boxin Shi","He Sun"],"pdf_url":"https://arxiv.org/pdf/2404.18394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16556v2","updated":"2024-04-29T03:09:33Z","published":"2024-04-25T12:11:28Z","title":"Conditional Distribution Modelling for Few-Shot Image Synthesis with\n Diffusion Models","summary":" Few-shot image synthesis entails generating diverse and realistic images of\nnovel categories using only a few example images. While multiple recent efforts\nin this direction have achieved impressive results, the existing approaches are\ndependent only upon the few novel samples available at test time in order to\ngenerate new images, which restricts the diversity of the generated images. To\novercome this limitation, we propose Conditional Distribution Modelling (CDM)\n-- a framework which effectively utilizes Diffusion models for few-shot image\ngeneration. By modelling the distribution of the latent space used to condition\na Diffusion process, CDM leverages the learnt statistics of the training data\nto get a better approximation of the unseen class distribution, thereby\nremoving the bias arising due to limited number of few shot samples.\nSimultaneously, we devise a novel inversion based optimization strategy that\nfurther improves the approximated unseen class distribution, and ensures the\nfidelity of the generated samples to the unseen class. The experimental results\non four benchmark datasets demonstrate the effectiveness of our proposed CDM\nfor few-shot generation.\n","authors":["Parul Gupta","Munawar Hayat","Abhinav Dhall","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2404.16556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11459v2","updated":"2024-04-29T02:56:59Z","published":"2023-12-18T18:59:05Z","title":"VolumeDiffusion: Flexible Text-to-3D Generation with Efficient\n Volumetric Encoder","summary":" This paper introduces a pioneering 3D volumetric encoder designed for\ntext-to-3D generation. To scale up the training data for the diffusion model, a\nlightweight network is developed to efficiently acquire feature volumes from\nmulti-view images. The 3D volumes are then trained on a diffusion model for\ntext-to-3D generation using a 3D U-Net. This research further addresses the\nchallenges of inaccurate object captions and high-dimensional feature volumes.\nThe proposed model, trained on the public Objaverse dataset, demonstrates\npromising outcomes in producing diverse and recognizable samples from text\nprompts. Notably, it empowers finer control over object part characteristics\nthrough textual cues, fostering model creativity by seamlessly combining\nmultiple concepts within a single object. This research significantly\ncontributes to the progress of 3D generation by introducing an efficient,\nflexible, and scalable representation methodology. 
Code is available at\nhttps://github.com/checkcrab/VolumeDiffusion.\n","authors":["Zhicong Tang","Shuyang Gu","Chunyu Wang","Ting Zhang","Jianmin Bao","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2312.11459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17205v2","updated":"2024-04-29T02:53:39Z","published":"2024-04-26T07:30:32Z","title":"Two in One Go: Single-stage Emotion Recognition with Decoupled\n Subject-context Transformer","summary":" Emotion recognition aims to discern the emotional state of subjects within an\nimage, relying on subject-centric and contextual visual cues. Current\napproaches typically follow a two-stage pipeline: first localize subjects by\noff-the-shelf detectors, then perform emotion classification through the late\nfusion of subject and context features. However, the complicated paradigm\nsuffers from disjoint training stages and limited interaction between\nfine-grained subject-context elements. To address the challenge, we present a\nsingle-stage emotion recognition approach, employing a Decoupled\nSubject-Context Transformer (DSCT), for simultaneous subject localization and\nemotion classification. Rather than compartmentalizing training stages, we\njointly leverage box and emotion signals as supervision to enrich\nsubject-centric feature learning. Furthermore, we introduce DSCT to facilitate\ninteractions between fine-grained subject-context cues in a decouple-then-fuse\nmanner. The decoupled query token--subject queries and context\nqueries--gradually intertwine across layers within DSCT, during which spatial\nand semantic relations are exploited and aggregated. We evaluate our\nsingle-stage framework on two widely used context-aware emotion recognition\ndatasets, CAER-S and EMOTIC. Our approach surpasses two-stage alternatives with\nfewer parameter numbers, achieving a 3.39% accuracy improvement and a 6.46%\naverage precision gain on CAER-S and EMOTIC datasets, respectively.\n","authors":["Xinpeng Li","Teng Wang","Jian Zhao","Shuyi Mao","Jinbao Wang","Feng Zheng","Xiaojiang Peng","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.17205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18381v1","updated":"2024-04-29T02:33:40Z","published":"2024-04-29T02:33:40Z","title":"Object Registration in Neural Fields","summary":" Neural fields provide a continuous scene representation of 3D geometry and\nappearance in a way which has great promise for robotics applications. One\nfunctionality that unlocks unique use-cases for neural fields in robotics is\nobject 6-DoF registration. In this paper, we provide an expanded analysis of\nthe recent Reg-NF neural field registration method and its use-cases within a\nrobotics context. We showcase the scenario of determining the 6-DoF pose of\nknown objects within a scene using scene and object neural field models. We\nshow how this may be used to better represent objects within imperfectly\nmodelled scenes and generate new scenes by substituting object neural field\nmodels into the scene.\n","authors":["David Hall","Stephen Hausler","Sutharsan Mahendren","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2404.18381v1.pdf","comment":"Accepted to ICRA 2024 RoboNeRF workshop. 5 pages, 10 figures. 
arXiv\n admin note: substantial text overlap with arXiv:2402.09722"},{"id":"http://arxiv.org/abs/2404.16266v2","updated":"2024-04-29T01:39:37Z","published":"2024-04-25T00:30:03Z","title":"A Multi-objective Optimization Benchmark Test Suite for Real-time\n Semantic Segmentation","summary":" As one of the emerging challenges in Automated Machine Learning, the\nHardware-aware Neural Architecture Search (HW-NAS) tasks can be treated as\nblack-box multi-objective optimization problems (MOPs). An important\napplication of HW-NAS is real-time semantic segmentation, which plays a pivotal\nrole in autonomous driving scenarios. The HW-NAS for real-time semantic\nsegmentation inherently needs to balance multiple optimization objectives,\nincluding model accuracy, inference speed, and hardware-specific\nconsiderations. Despite its importance, benchmarks have yet to be developed to\nframe such a challenging task as multi-objective optimization. To bridge the\ngap, we introduce a tailored streamline to transform the task of HW-NAS for\nreal-time semantic segmentation into standard MOPs. Building upon the\nstreamline, we present a benchmark test suite, CitySeg/MOP, comprising fifteen\nMOPs derived from the Cityscapes dataset. The CitySeg/MOP test suite is\nintegrated into the EvoXBench platform to provide seamless interfaces with\nvarious programming languages (e.g., Python and MATLAB) for instant fitness\nevaluations. We comprehensively assessed the CitySeg/MOP test suite on various\nmulti-objective evolutionary algorithms, showcasing its versatility and\npracticality. Source codes are available at\nhttps://github.com/EMI-Group/evoxbench.\n","authors":["Yifan Zhao","Zhenyu Liang","Zhichao Lu","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.16266v2.pdf","comment":"GECCO 2024"},{"id":"http://arxiv.org/abs/2404.18352v1","updated":"2024-04-29T01:19:17Z","published":"2024-04-29T01:19:17Z","title":"Post-hoc and manifold explanations analysis of facial expression data\n based on deep learning","summary":" The complex information processing system of humans generates a lot of\nobjective and subjective evaluations, making the exploration of human cognitive\nproducts of great cutting-edge theoretical value. In recent years, deep\nlearning technologies, which are inspired by biological brain mechanisms, have\nmade significant strides in the application of psychological or cognitive\nscientific research, particularly in the memorization and recognition of facial\ndata. This paper investigates through experimental research how neural networks\nprocess and store facial expression data and associate these data with a range\nof psychological attributes produced by humans. Researchers utilized deep\nlearning model VGG16, demonstrating that neural networks can learn and\nreproduce key features of facial data, thereby storing image memories.\nMoreover, the experimental results reveal the potential of deep learning models\nin understanding human emotions and cognitive processes and establish a\nmanifold visualization interpretation of cognitive products or psychological\nattributes from a non-Euclidean space perspective, offering new insights into\nenhancing the explainability of AI. 
This study not only advances the\napplication of AI technology in the field of psychology but also provides a new\npsychological theoretical understanding the information processing of the AI.\nThe code is available in here: https://github.com/NKUShaw/Psychoinformatics.\n","authors":["Yang Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.18352v1.pdf","comment":"19PAGES"},{"id":"http://arxiv.org/abs/2401.15489v3","updated":"2024-04-29T01:01:35Z","published":"2024-01-27T19:44:15Z","title":"Distilling Privileged Multimodal Information for Expression Recognition\n using Optimal Transport","summary":" Deep learning models for multimodal expression recognition have reached\nremarkable performance in controlled laboratory environments because of their\nability to learn complementary and redundant semantic information. However,\nthese models struggle in the wild, mainly because of the unavailability and\nquality of modalities used for training. In practice, only a subset of the\ntraining-time modalities may be available at test time. Learning with\nprivileged information enables models to exploit data from additional\nmodalities that are only available during training. State-of-the-art knowledge\ndistillation (KD) methods have been proposed to distill information from\nmultiple teacher models (each trained on a modality) to a common student model.\nThese privileged KD methods typically utilize point-to-point matching, yet have\nno explicit mechanism to capture the structural information in the teacher\nrepresentation space formed by introducing the privileged modality. Experiments\nwere performed on two challenging problems - pain estimation on the Biovid\ndataset (ordinal classification) and arousal-valance prediction on the Affwild2\ndataset (regression). Results show that our proposed method can outperform\nstate-of-the-art privileged KD methods on these problems. The diversity among\nmodalities and fusion architectures indicates that PKDOT is modality- and\nmodel-agnostic.\n","authors":["Muhammad Haseeb Aslam","Muhammad Osama Zeeshan","Soufiane Belharbi","Marco Pedersoli","Alessandro Koerich","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2401.15489v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18343v1","updated":"2024-04-29T00:54:38Z","published":"2024-04-29T00:54:38Z","title":"G-Refine: A General Quality Refiner for Text-to-Image Generation","summary":" With the evolution of Text-to-Image (T2I) models, the quality defects of\nAI-Generated Images (AIGIs) pose a significant barrier to their widespread\nadoption. In terms of both perception and alignment, existing models cannot\nalways guarantee high-quality results. To mitigate this limitation, we\nintroduce G-Refine, a general image quality refiner designed to enhance\nlow-quality images without compromising the integrity of high-quality ones. The\nmodel is composed of three interconnected modules: a perception quality\nindicator, an alignment quality indicator, and a general quality enhancement\nmodule. Based on the mechanisms of the Human Visual System (HVS) and syntax\ntrees, the first two indicators can respectively identify the perception and\nalignment deficiencies, and the last module can apply targeted quality\nenhancement accordingly. Extensive experimentation reveals that when compared\nto alternative optimization methods, AIGIs after G-Refine outperform in 10+\nquality metrics across 4 databases. 
This improvement significantly contributes\nto the practical application of contemporary T2I models, paving the way for\ntheir broader adoption. The code will be released on\nhttps://github.com/Q-Future/Q-Refine.\n","authors":["Chunyi Li","Haoning Wu","Hongkun Hao","Zicheng Zhang","Tengchaun Kou","Chaofeng Chen","Lei Bai","Xiaohong Liu","Weisi Lin","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.18343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19149v1","updated":"2024-04-29T23:26:30Z","published":"2024-04-29T23:26:30Z","title":"SAGS: Structure-Aware 3D Gaussian Splatting","summary":" Following the advent of NeRFs, 3D Gaussian Splatting (3D-GS) has paved the\nway to real-time neural rendering overcoming the computational burden of\nvolumetric methods. Following the pioneering work of 3D-GS, several methods\nhave attempted to achieve compressible and high-fidelity performance\nalternatives. However, by employing a geometry-agnostic optimization scheme,\nthese methods neglect the inherent 3D structure of the scene, thereby\nrestricting the expressivity and the quality of the representation, resulting\nin various floating points and artifacts. In this work, we propose a\nstructure-aware Gaussian Splatting method (SAGS) that implicitly encodes the\ngeometry of the scene, which reflects to state-of-the-art rendering performance\nand reduced storage requirements on benchmark novel-view synthesis datasets.\nSAGS is founded on a local-global graph representation that facilitates the\nlearning of complex scenes and enforces meaningful point displacements that\npreserve the scene's geometry. Additionally, we introduce a lightweight version\nof SAGS, using a simple yet effective mid-point interpolation scheme, which\nshowcases a compact representation of the scene with up to 24$\\times$ size\nreduction without the reliance on any compression strategies. Extensive\nexperiments across multiple benchmark datasets demonstrate the superiority of\nSAGS compared to state-of-the-art 3D-GS methods under both rendering quality\nand model size. Besides, we demonstrate that our structure-aware method can\neffectively mitigate floating artifacts and irregular distortions of previous\nmethods while obtaining precise depth maps. Project page\nhttps://eververas.github.io/SAGS/.\n","authors":["Evangelos Ververas","Rolandos Alexandros Potamias","Jifei Song","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2404.19149v1.pdf","comment":"15 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.19148v1","updated":"2024-04-29T23:21:17Z","published":"2024-04-29T23:21:17Z","title":"Enhancing Brazilian Sign Language Recognition through Skeleton Image\n Representation","summary":" Effective communication is paramount for the inclusion of deaf individuals in\nsociety. However, persistent communication barriers due to limited Sign\nLanguage (SL) knowledge hinder their full participation. In this context, Sign\nLanguage Recognition (SLR) systems have been developed to improve communication\nbetween signing and non-signing individuals. In particular, there is the\nproblem of recognizing isolated signs (Isolated Sign Language Recognition,\nISLR) of great relevance in the development of vision-based SL search engines,\nlearning tools, and translation systems. This work proposes an ISLR approach\nwhere body, hands, and facial landmarks are extracted throughout time and\nencoded as 2-D images. 
These images are processed by a convolutional neural\nnetwork, which maps the visual-temporal information into a sign label.\nExperimental results demonstrate that our method surpassed the state-of-the-art\nin terms of performance metrics on two widely recognized datasets in Brazilian\nSign Language (LIBRAS), the primary focus of this study. In addition to being\nmore accurate, our method is more time-efficient and easier to train due to its\nreliance on a simpler network architecture and solely RGB data as input.\n","authors":["Carlos Eduardo G. R. Alves","Francisco de Assis Boldt","Thiago M. Paixão"],"pdf_url":"https://arxiv.org/pdf/2404.19148v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2310.20636v2","updated":"2024-04-29T23:13:32Z","published":"2023-10-31T17:05:02Z","title":"Using Skew to Assess the Quality of GAN-generated Image Features","summary":" The rapid advancement of Generative Adversarial Networks (GANs) necessitates\nthe need to robustly evaluate these models. Among the established evaluation\ncriteria, the Fr\\'{e}chetInception Distance (FID) has been widely adopted due\nto its conceptual simplicity, fast computation time, and strong correlation\nwith human perception. However, FID has inherent limitations, mainly stemming\nfrom its assumption that feature embeddings follow a Gaussian distribution, and\ntherefore can be defined by their first two moments. As this does not hold in\npractice, in this paper we explore the importance of third-moments in image\nfeature data and use this information to define a new measure, which we call\nthe Skew Inception Distance (SID). We prove that SID is a pseudometric on\nprobability distributions, show how it extends FID, and present a practical\nmethod for its computation. Our numerical experiments support that SID either\ntracks with FID or, in some cases, aligns more closely with human perception\nwhen evaluating image features of ImageNet data. Our work also shows that\nprincipal component analysis can be used to speed up the computation time of\nboth FID and SID. Although we focus on using SID on image features for GAN\nevaluation, SID is applicable much more generally, including for the evaluation\nof other generative models.\n","authors":["Lorenzo Luzi","Helen Jenne","Ryan Murray","Carlos Ortiz Marrero"],"pdf_url":"https://arxiv.org/pdf/2310.20636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19134v1","updated":"2024-04-29T22:39:55Z","published":"2024-04-29T22:39:55Z","title":"Evaluating Deep Clustering Algorithms on Non-Categorical 3D CAD Models","summary":" We introduce the first work on benchmarking and evaluating deep clustering\nalgorithms on large-scale non-categorical 3D CAD models. We first propose a\nworkflow to allow expert mechanical engineers to efficiently annotate 252,648\ncarefully sampled pairwise CAD model similarities, from a subset of the ABC\ndataset with 22,968 shapes. Using seven baseline deep clustering methods, we\nthen investigate the fundamental challenges of evaluating clustering methods\nfor non-categorical data. Based on these challenges, we propose a novel and\nviable ensemble-based clustering comparison approach. 
This work is the first to\ndirectly target the underexplored area of deep clustering algorithms for 3D\nshapes, and we believe it will be an important building block to analyze and\nutilize the massive 3D shape collections that are starting to appear in deep\ngeometric computing.\n","authors":["Siyuan Xiang","Chin Tseng","Congcong Wen","Deshana Desai","Yifeng Kou","Binil Starly","Daniele Panozzo","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2404.19134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19132v1","updated":"2024-04-29T22:31:21Z","published":"2024-04-29T22:31:21Z","title":"Integrating Present and Past in Unsupervised Continual Learning","summary":" We formulate a unifying framework for unsupervised continual learning (UCL),\nwhich disentangles learning objectives that are specific to the present and the\npast data, encompassing stability, plasticity, and cross-task consolidation.\nThe framework reveals that many existing UCL approaches overlook cross-task\nconsolidation and try to balance plasticity and stability in a shared embedding\nspace. This results in worse performance due to a lack of within-task data\ndiversity and reduced effectiveness in learning the current task. Our method,\nOsiris, which explicitly optimizes all three objectives on separate embedding\nspaces, achieves state-of-the-art performance on all benchmarks, including two\nnovel benchmarks proposed in this paper featuring semantically structured task\nsequences. Compared to standard benchmarks, these two structured benchmarks\nmore closely resemble visual signals received by humans and animals when\nnavigating real-world environments. Finally, we show some preliminary evidence\nthat continual models can benefit from such realistic learning scenarios.\n","authors":["Yipeng Zhang","Laurent Charlin","Richard Zemel","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2404.19132v1.pdf","comment":"CoLLAs 2024"},{"id":"http://arxiv.org/abs/2404.19128v1","updated":"2024-04-29T22:06:17Z","published":"2024-04-29T22:06:17Z","title":"Q-GroundCAM: Quantifying Grounding in Vision Language Models via GradCAM","summary":" Vision and Language Models (VLMs) continue to demonstrate remarkable\nzero-shot (ZS) performance across various tasks. However, many probing studies\nhave revealed that even the best-performing VLMs struggle to capture aspects of\ncompositional scene understanding, lacking the ability to properly ground and\nlocalize linguistic phrases in images. Recent VLM advancements include scaling\nup both model and dataset sizes, additional training objectives and levels of\nsupervision, and variations in the model architectures. To characterize the\ngrounding ability of VLMs, such as phrase grounding, referring expressions\ncomprehension, and relationship understanding, Pointing Game has been used as\nan evaluation metric for datasets with bounding box annotations. In this paper,\nwe introduce a novel suite of quantitative metrics that utilize GradCAM\nactivations to rigorously evaluate the grounding capabilities of pre-trained\nVLMs like CLIP, BLIP, and ALBEF. These metrics offer an explainable and\nquantifiable approach for a more detailed comparison of the zero-shot\ncapabilities of VLMs and enable measuring models' grounding uncertainty. 
This\ncharacterization reveals interesting tradeoffs between the size of the model,\nthe dataset size, and their performance.\n","authors":["Navid Rajabi","Jana Kosecka"],"pdf_url":"https://arxiv.org/pdf/2404.19128v1.pdf","comment":"Accepted to CVPR 2024, Second Workshop on Foundation Models (WFM)"},{"id":"http://arxiv.org/abs/2404.19126v1","updated":"2024-04-29T22:03:02Z","published":"2024-04-29T22:03:02Z","title":"Compositional Factorization of Visual Scenes with Convolutional Sparse\n Coding and Resonator Networks","summary":" We propose a system for visual scene analysis and recognition based on\nencoding the sparse, latent feature-representation of an image into a\nhigh-dimensional vector that is subsequently factorized to parse scene content.\nThe sparse feature representation is learned from image statistics via\nconvolutional sparse coding, while scene parsing is performed by a resonator\nnetwork. The integration of sparse coding with the resonator network increases\nthe capacity of distributed representations and reduces collisions in the\ncombinatorial search space during factorization. We find that for this problem\nthe resonator network is capable of fast and accurate vector factorization, and\nwe develop a confidence-based metric that assists in tracking the convergence\nof the resonator network.\n","authors":["Christopher J. Kymn","Sonia Mazelet","Annabel Ng","Denis Kleyko","Bruno A. Olshausen"],"pdf_url":"https://arxiv.org/pdf/2404.19126v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.19113v1","updated":"2024-04-29T21:25:59Z","published":"2024-04-29T21:25:59Z","title":"Source-Free Domain Adaptation of Weakly-Supervised Object Localization\n Models for Histology","summary":" Given the emergence of deep learning, digital pathology has gained popularity\nfor cancer diagnosis based on histology images. Deep weakly supervised object\nlocalization (WSOL) models can be trained to classify histology images\naccording to cancer grade and identify regions of interest (ROIs) for\ninterpretation, using inexpensive global image-class annotations. A WSOL model\ninitially trained on some labeled source image data can be adapted using\nunlabeled target data in cases of significant domain shifts caused by\nvariations in staining, scanners, and cancer type. In this paper, we focus on\nsource-free (unsupervised) domain adaptation (SFDA), a challenging problem\nwhere a pre-trained source model is adapted to a new target domain without\nusing any source domain data for privacy and efficiency reasons. SFDA of WSOL\nmodels raises several challenges in histology, most notably because they are\nnot intended to adapt for both classification and localization tasks. In this\npaper, 4 state-of-the-art SFDA methods, each one representative of a main SFDA\nfamily, are compared for WSOL in terms of classification and localization\naccuracy. They are the SFDA-Distribution Estimation, Source HypOthesis\nTransfer, Cross-Domain Contrastive Learning, and Adaptively Domain Statistics\nAlignment. 
Experimental results on the challenging Glas (smaller, breast\ncancer) and Camelyon16 (larger, colon cancer) histology datasets indicate that\nthese SFDA methods typically perform poorly for localization after adaptation\nwhen optimized for classification.\n","authors":["Alexis Guichemerre","Soufiane Belharbi","Tsiry Mayet","Shakeeb Murtaza","Pourya Shamsolmoali","Luke McCaffrey","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2404.19113v1.pdf","comment":"16 pages, 21 figures, 5 tables, CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.19110v1","updated":"2024-04-29T21:23:29Z","published":"2024-04-29T21:23:29Z","title":"EMOPortraits: Emotion-enhanced Multimodal One-shot Head Avatars","summary":" Head avatars animated by visual signals have gained popularity, particularly\nin cross-driving synthesis where the driver differs from the animated\ncharacter, a challenging but highly practical approach. The recently presented\nMegaPortraits model has demonstrated state-of-the-art results in this domain.\nWe conduct a deep examination and evaluation of this model, with a particular\nfocus on its latent space for facial expression descriptors, and uncover\nseveral limitations with its ability to express intense face motions. To\naddress these limitations, we propose substantial changes in both training\npipeline and model architecture, to introduce our EMOPortraits model, where we:\n Enhance the model's capability to faithfully support intense, asymmetric face\nexpressions, setting a new state-of-the-art result in the emotion transfer\ntask, surpassing previous methods in both metrics and quality.\n Incorporate speech-driven mode to our model, achieving top-tier performance\nin audio-driven facial animation, making it possible to drive source identity\nthrough diverse modalities, including visual signal, audio, or a blend of both.\n We propose a novel multi-view video dataset featuring a wide range of intense\nand asymmetric facial expressions, filling the gap with absence of such data in\nexisting datasets.\n","authors":["Nikita Drobyshev","Antoni Bigata Casademunt","Konstantinos Vougioukas","Zoe Landgraf","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2404.19110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19108v1","updated":"2024-04-29T21:19:12Z","published":"2024-04-29T21:19:12Z","title":"Real-Time Convolutional Neural Network-Based Star Detection and\n Centroiding Method for CubeSat Star Tracker","summary":" Star trackers are one of the most accurate celestial sensors used for\nabsolute attitude determination. The devices detect stars in captured images\nand accurately compute their projected centroids on an imaging focal plane with\nsubpixel precision. Traditional algorithms for star detection and centroiding\noften rely on threshold adjustments for star pixel detection and pixel\nbrightness weighting for centroid computation. However, challenges like high\nsensor noise and stray light can compromise algorithm performance. This article\nintroduces a Convolutional Neural Network (CNN)-based approach for star\ndetection and centroiding, tailored to address the issues posed by noisy star\ntracker images in the presence of stray light and other artifacts. Trained\nusing simulated star images overlayed with real sensor noise and stray light,\nthe CNN produces both a binary segmentation map distinguishing star pixels from\nthe background and a distance map indicating each pixel's proximity to the\nnearest star centroid. 
Leveraging this distance information alongside pixel\ncoordinates transforms centroid calculations into a set of trilateration\nproblems solvable via the least squares method. Our method employs efficient\nUNet variants for the underlying CNN architectures, and the variants'\nperformances are evaluated. Comprehensive testing has been undertaken with\nsynthetic image evaluations, hardware-in-the-loop assessments, and night sky\ntests. The tests consistently demonstrated that our method outperforms several\nexisting algorithms in centroiding accuracy and exhibits superior resilience to\nhigh sensor noise and stray light interference. An additional benefit of our\nalgorithms is that they can be executed in real-time on low-power edge AI\nprocessors.\n","authors":["Hongrui Zhao","Michael F. Lembeck","Adrian Zhuang","Riya Shah","Jesse Wei"],"pdf_url":"https://arxiv.org/pdf/2404.19108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12303v2","updated":"2024-04-29T20:35:09Z","published":"2024-02-19T17:27:04Z","title":"UncertaintyTrack: Exploiting Detection and Localization Uncertainty in\n Multi-Object Tracking","summary":" Multi-object tracking (MOT) methods have seen a significant boost in\nperformance recently, due to strong interest from the research community and\nsteadily improving object detection methods. The majority of tracking methods\nfollow the tracking-by-detection (TBD) paradigm, blindly trust the incoming\ndetections with no sense of their associated localization uncertainty. This\nlack of uncertainty awareness poses a problem in safety-critical tasks such as\nautonomous driving where passengers could be put at risk due to erroneous\ndetections that have propagated to downstream tasks, including MOT. While there\nare existing works in probabilistic object detection that predict the\nlocalization uncertainty around the boxes, no work in 2D MOT for autonomous\ndriving has studied whether these estimates are meaningful enough to be\nleveraged effectively in object tracking. We introduce UncertaintyTrack, a\ncollection of extensions that can be applied to multiple TBD trackers to\naccount for localization uncertainty estimates from probabilistic object\ndetectors. Experiments on the Berkeley Deep Drive MOT dataset show that the\ncombination of our method and informative uncertainty estimates reduces the\nnumber of ID switches by around 19\\% and improves mMOTA by 2-3%. The source\ncode is available at https://github.com/TRAILab/UncertaintyTrack\n","authors":["Chang Won Lee","Steven L. Waslander"],"pdf_url":"https://arxiv.org/pdf/2402.12303v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2404.16821v2","updated":"2024-04-29T20:24:30Z","published":"2024-04-25T17:59:19Z","title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal\n Models with Open-Source Suites","summary":" In this report, we introduce InternVL 1.5, an open-source multimodal large\nlanguage model (MLLM) to bridge the capability gap between open-source and\nproprietary commercial models in multimodal understanding. We introduce three\nsimple improvements: (1) Strong Vision Encoder: we explored a continuous\nlearning strategy for the large-scale vision foundation model -- InternViT-6B,\nboosting its visual understanding capabilities, and making it can be\ntransferred and reused in different LLMs. 
(2) Dynamic High-Resolution: we\ndivide images into tiles ranging from 1 to 40 of 448$\\times$448 pixels\naccording to the aspect ratio and resolution of the input images, which\nsupports up to 4K resolution input. (3) High-Quality Bilingual Dataset: we\ncarefully collected a high-quality bilingual dataset that covers common scenes,\ndocument images, and annotated them with English and Chinese question-answer\npairs, significantly enhancing performance in OCR- and Chinese-related tasks.\nWe evaluate InternVL 1.5 through a series of benchmarks and comparative\nstudies. Compared to both open-source and proprietary models, InternVL 1.5\nshows competitive performance, achieving state-of-the-art results in 8 of 18\nbenchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.\n","authors":["Zhe Chen","Weiyun Wang","Hao Tian","Shenglong Ye","Zhangwei Gao","Erfei Cui","Wenwen Tong","Kongzhi Hu","Jiapeng Luo","Zheng Ma","Ji Ma","Jiaqi Wang","Xiaoyi Dong","Hang Yan","Hewei Guo","Conghui He","Botian Shi","Zhenjiang Jin","Chao Xu","Bin Wang","Xingjian Wei","Wei Li","Wenjian Zhang","Bo Zhang","Pinlong Cai","Licheng Wen","Xiangchao Yan","Min Dou","Lewei Lu","Xizhou Zhu","Tong Lu","Dahua Lin","Yu Qiao","Jifeng Dai","Wenhai Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16821v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.19083v1","updated":"2024-04-29T19:52:09Z","published":"2024-04-29T19:52:09Z","title":"Longitudinal Mammogram Risk Prediction","summary":" Breast cancer is one of the leading causes of mortality among women\nworldwide. Early detection and risk assessment play a crucial role in improving\nsurvival rates. Therefore, annual or biennial mammograms are often recommended\nfor screening in high-risk groups. Mammograms are typically interpreted by\nexpert radiologists based on the Breast Imaging Reporting and Data System\n(BI-RADS), which provides a uniform way to describe findings and categorizes\nthem to indicate the level of concern for breast cancer. Recently, machine\nlearning (ML) and computational approaches have been developed to automate and\nimprove the interpretation of mammograms. However, both BI-RADS and the\nML-based methods focus on the analysis of data from the present and sometimes\nthe most recent prior visit. While it is clear that temporal changes in image\nfeatures of the longitudinal scans should carry value for quantifying breast\ncancer risk, no prior work has conducted a systematic study of this. In this\npaper, we extend a state-of-the-art ML model to ingest an arbitrary number of\nlongitudinal mammograms and predict future breast cancer risk. On a large-scale\ndataset, we demonstrate that our model, LoMaR, achieves state-of-the-art\nperformance when presented with only the present mammogram. Furthermore, we use\nLoMaR to characterize the predictive value of prior visits. Our results show\nthat longer histories (e.g., up to four prior annual mammograms) can\nsignificantly boost the accuracy of predicting future breast cancer risk,\nparticularly beyond the short-term. Our code and model weights are available at\nhttps://github.com/batuhankmkaraman/LoMaR.\n","authors":["Batuhan K. Karaman","Katerina Dodelzon","Gozde B. Akar","Mert R. 
Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2404.19083v1.pdf","comment":"Submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2404.19075v1","updated":"2024-04-29T19:41:51Z","published":"2024-04-29T19:41:51Z","title":"Distributed Stochastic Optimization of a Neural Representation Network\n for Time-Space Tomography Reconstruction","summary":" 4D time-space reconstruction of dynamic events or deforming objects using\nX-ray computed tomography (CT) is an extremely ill-posed inverse problem.\nExisting approaches assume that the object remains static for the duration of\nseveral tens or hundreds of X-ray projection measurement images (reconstruction\nof consecutive limited-angle CT scans). However, this is an unrealistic\nassumption for many in-situ experiments that causes spurious artifacts and\ninaccurate morphological reconstructions of the object. To solve this problem,\nwe propose to perform a 4D time-space reconstruction using a distributed\nimplicit neural representation (DINR) network that is trained using a novel\ndistributed stochastic training algorithm. Our DINR network learns to\nreconstruct the object at its output by iterative optimization of its network\nparameters such that the measured projection images best match the output of\nthe CT forward measurement model. We use a continuous time and space forward\nmeasurement model that is a function of the DINR outputs at a sparsely sampled\nset of continuous valued object coordinates. Unlike existing state-of-the-art\nneural representation architectures that forward and back propagate through\ndense voxel grids that sample the object's entire time-space coordinates, we\nonly propagate through the DINR at a small subset of object coordinates in each\niteration resulting in an order-of-magnitude reduction in memory and compute\nfor training. DINR leverages distributed computation across several compute\nnodes and GPUs to produce high-fidelity 4D time-space reconstructions even for\nextremely large CT data sizes. We use both simulated parallel-beam and\nexperimental cone-beam X-ray CT datasets to demonstrate the superior\nperformance of our approach.\n","authors":["K. Aditya Mohan","Massimiliano Ferrucci","Chuck Divin","Garrett A. Stevenson","Hyojin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.19075v1.pdf","comment":"submitted to Nature Machine Intelligence"},{"id":"http://arxiv.org/abs/2404.19066v1","updated":"2024-04-29T19:18:52Z","published":"2024-04-29T19:18:52Z","title":"Revolutionizing Traffic Sign Recognition: Unveiling the Potential of\n Vision Transformers","summary":" This research introduces an innovative method for Traffic Sign Recognition\n(TSR) by leveraging deep learning techniques, with a particular emphasis on\nVision Transformers. TSR holds a vital role in advancing driver assistance\nsystems and autonomous vehicles. Traditional TSR approaches, reliant on manual\nfeature extraction, have proven to be labor-intensive and costly. Moreover,\nmethods based on shape and color have inherent limitations, including\nsusceptibility to various factors and changes in lighting conditions. This\nstudy explores three variants of Vision Transformers (PVT, TNT, LNL) and six\nconvolutional neural networks (AlexNet, ResNet, VGG16, MobileNet, EfficientNet,\nGoogleNet) as baseline models. To address the shortcomings of traditional\nmethods, a novel pyramid EATFormer backbone is proposed, amalgamating\nEvolutionary Algorithms (EAs) with the Transformer architecture. 
The introduced\nEA-based Transformer block captures multi-scale, interactive, and individual\ninformation through its components: Feed-Forward Network, Global and Local\nInteraction, and Multi-Scale Region Aggregation modules. Furthermore, a\nModulated Deformable MSA module is introduced to dynamically model irregular\nlocations. Experimental evaluations on the GTSRB and BelgiumTS datasets\ndemonstrate the efficacy of the proposed approach in enhancing both prediction\nspeed and accuracy. This study concludes that Vision Transformers hold\nsignificant promise in traffic sign classification and contributes a fresh\nalgorithmic framework for TSR. These findings set the stage for the development\nof precise and dependable TSR algorithms, benefiting driver assistance systems\nand autonomous vehicles.\n","authors":["Susano Mingwin","Yulong Shisu","Yongshuai Wanwag","Sunshin Huing"],"pdf_url":"https://arxiv.org/pdf/2404.19066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19065v1","updated":"2024-04-29T19:12:42Z","published":"2024-04-29T19:12:42Z","title":"HELPER-X: A Unified Instructable Embodied Agent to Tackle Four\n Interactive Vision-Language Domains with Memory-Augmented Language Models","summary":" Recent research on instructable agents has used memory-augmented Large\nLanguage Models (LLMs) as task planners, a technique that retrieves\nlanguage-program examples relevant to the input instruction and uses them as\nin-context examples in the LLM prompt to improve the performance of the LLM in\ninferring the correct action and task plans. In this technical report, we\nextend the capabilities of HELPER, by expanding its memory with a wider array\nof examples and prompts, and by integrating additional APIs for asking\nquestions. This simple expansion of HELPER into a shared memory enables the\nagent to work across the domains of executing plans from dialogue, natural\nlanguage instruction following, active question asking, and commonsense room\nreorganization. We evaluate the agent on four diverse interactive\nvisual-language embodied agent benchmarks: ALFRED, TEACh, DialFRED, and the\nTidy Task. HELPER-X achieves few-shot, state-of-the-art performance across\nthese benchmarks using a single agent, without requiring in-domain training,\nand remains competitive with agents that have undergone in-domain training.\n","authors":["Gabriel Sarch","Sahil Somani","Raghav Kapoor","Michael J. Tarr","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2404.19065v1.pdf","comment":"Videos and code https://helper-agent-llm.github.io/"},{"id":"http://arxiv.org/abs/2312.03001v2","updated":"2024-04-29T19:08:15Z","published":"2023-12-03T19:01:50Z","title":"Computer Vision for Increased Operative Efficiency via Identification of\n Instruments in the Neurosurgical Operating Room: A Proof-of-Concept Study","summary":" Objectives Computer vision (CV) is a field of artificial intelligence that\nenables machines to interpret and understand images and videos. CV has the\npotential to be of assistance in the operating room (OR) to track surgical\ninstruments. We built a CV algorithm for identifying surgical instruments in\nthe neurosurgical operating room as a potential solution for surgical\ninstrument tracking and management to decrease surgical waste and opening of\nunnecessary tools. Methods We collected 1660 images of 27 commonly used\nneurosurgical instruments. 
Images were labeled using the VGG Image Annotator\nand split into 80% training and 20% testing sets in order to train a U-Net\nConvolutional Neural Network using 5-fold cross validation. Results Our U-Net\nachieved a tool identification accuracy of 80-100% when distinguishing 25\nclasses of instruments, with 19/25 classes having accuracy over 90%. The model\nperformance was not adequate for sub classifying Adson, Gerald, and Debakey\nforceps, which had accuracies of 60-80%. Conclusions We demonstrated the\nviability of using machine learning to accurately identify surgical\ninstruments. Instrument identification could help optimize surgical tray\npacking, decrease tool usage and waste, decrease incidence of instrument\nmisplacement events, and assist in timing of routine instrument maintenance.\nMore training data will be needed to increase accuracy across all surgical\ninstruments that would appear in a neurosurgical operating room. Such\ntechnology has the potential to be used as a method to be used for proving what\ntools are truly needed in each type of operation allowing surgeons across the\nworld to do more with less.\n","authors":["Tanner J. Zachem","Sully F. Chen","Vishal Venkatraman","David AW Sykes","Ravi Prakash","Koumani W. Ntowe","Mikhail A. Bethell","Samantha Spellicy","Alexander D Suarez","Weston Ross","Patrick J. Codd"],"pdf_url":"https://arxiv.org/pdf/2312.03001v2.pdf","comment":"Data is openly available through The Open Science Framework:\n https://doi.org/10.17605/OSF.IO/BCQK2"},{"id":"http://arxiv.org/abs/2402.13250v4","updated":"2024-04-29T18:51:06Z","published":"2024-02-20T18:58:54Z","title":"Video ReCap: Recursive Captioning of Hour-Long Videos","summary":" Most video captioning models are designed to process short video clips of few\nseconds and output text describing low-level visual concepts (e.g., objects,\nscenes, atomic actions). However, most real-world videos last for minutes or\nhours and have a complex hierarchical structure spanning different temporal\ngranularities. We propose Video ReCap, a recursive video captioning model that\ncan process video inputs of dramatically different lengths (from 1 second to 2\nhours) and output video captions at multiple hierarchy levels. The recursive\nvideo-language architecture exploits the synergy between different video\nhierarchies and can process hour-long videos efficiently. We utilize a\ncurriculum learning training scheme to learn the hierarchical structure of\nvideos, starting from clip-level captions describing atomic actions, then\nfocusing on segment-level descriptions, and concluding with generating\nsummaries for hour-long videos. Furthermore, we introduce Ego4D-HCap dataset by\naugmenting Ego4D with 8,267 manually collected long-range video summaries. Our\nrecursive model can flexibly generate captions at different hierarchy levels\nwhile also being useful for other complex video understanding tasks, such as\nVideoQA on EgoSchema. 
Data, code, and models are available at:\nhttps://sites.google.com/view/vidrecap\n","authors":["Md Mohaiminul Islam","Ngan Ho","Xitong Yang","Tushar Nagarajan","Lorenzo Torresani","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2402.13250v4.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.19043v1","updated":"2024-04-29T18:33:17Z","published":"2024-04-29T18:33:17Z","title":"Improving Interpretability of Deep Active Learning for Flood Inundation\n Mapping Through Class Ambiguity Indices Using Multi-spectral Satellite\n Imagery","summary":" Flood inundation mapping is a critical task for responding to the increasing\nrisk of flooding linked to global warming. Significant advancements of deep\nlearning in recent years have triggered its extensive applications, including\nflood inundation mapping. To cope with the time-consuming and labor-intensive\ndata labeling process in supervised learning, deep active learning strategies\nare one of the feasible approaches. However, there remains limited exploration\ninto the interpretability of how deep active learning strategies operate, with\na specific focus on flood inundation mapping in the field of remote sensing. In\nthis study, we introduce a novel framework of Interpretable Deep Active\nLearning for Flood inundation Mapping (IDAL-FIM), specifically in terms of\nclass ambiguity of multi-spectral satellite images. In the experiments, we\nutilize Sen1Floods11 dataset, and adopt U-Net with MC-dropout. In addition, we\nemploy five acquisition functions, which are the random, K-means, BALD,\nentropy, and margin acquisition functions. Based on the experimental results,\nwe demonstrate that two proposed class ambiguity indices are effective\nvariables to interpret the deep active learning by establishing statistically\nsignificant correlation with the predictive uncertainty of the deep learning\nmodel at the tile level. Then, we illustrate the behaviors of deep active\nlearning through visualizing two-dimensional density plots and providing\ninterpretations regarding the operation of deep active learning, in flood\ninundation mapping.\n","authors":["Hyunho Lee","Wenwen Li"],"pdf_url":"https://arxiv.org/pdf/2404.19043v1.pdf","comment":"46 pages, 11 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.16398v2","updated":"2024-04-29T18:32:34Z","published":"2024-04-25T08:18:18Z","title":"Revisiting Relevance Feedback for CLIP-based Interactive Image Retrieval","summary":" Many image retrieval studies use metric learning to train an image encoder.\nHowever, metric learning cannot handle differences in users' preferences, and\nrequires data to train an image encoder. To overcome these limitations, we\nrevisit relevance feedback, a classic technique for interactive retrieval\nsystems, and propose an interactive CLIP-based image retrieval system with\nrelevance feedback. Our retrieval system first executes the retrieval, collects\neach user's unique preferences through binary feedback, and returns images the\nuser prefers. Even when users have various preferences, our retrieval system\nlearns each user's preference through the feedback and adapts to the\npreference. Moreover, our retrieval system leverages CLIP's zero-shot\ntransferability and achieves high accuracy without training. We empirically\nshow that our retrieval system competes well with state-of-the-art metric\nlearning in category-based image retrieval, despite not training image encoders\nspecifically for each dataset. 
Furthermore, we set up two additional\nexperimental settings where users have various preferences: one-label-based\nimage retrieval and conditioned image retrieval. In both cases, our retrieval\nsystem effectively adapts to each user's preferences, resulting in improved\naccuracy compared to image retrieval without feedback. Overall, our work\nhighlights the potential benefits of integrating CLIP with classic relevance\nfeedback techniques to enhance image retrieval.\n","authors":["Ryoya Nara","Yu-Chieh Lin","Yuji Nozawa","Youyang Ng","Goh Itoh","Osamu Torii","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.16398v2.pdf","comment":"20 pages, 8 sugures"},{"id":"http://arxiv.org/abs/2404.19040v1","updated":"2024-04-29T18:28:36Z","published":"2024-04-29T18:28:36Z","title":"GSTalker: Real-time Audio-Driven Talking Face Generation via Deformable\n Gaussian Splatting","summary":" We present GStalker, a 3D audio-driven talking face generation model with\nGaussian Splatting for both fast training (40 minutes) and real-time rendering\n(125 FPS) with a 3$\\sim$5 minute video for training material, in comparison\nwith previous 2D and 3D NeRF-based modeling frameworks which require hours of\ntraining and seconds of rendering per frame. Specifically, GSTalker learns an\naudio-driven Gaussian deformation field to translate and transform 3D Gaussians\nto synchronize with audio information, in which multi-resolution hashing\ngrid-based tri-plane and temporal smooth module are incorporated to learn\naccurate deformation for fine-grained facial details. In addition, a\npose-conditioned deformation field is designed to model the stabilized torso.\nTo enable efficient optimization of the condition Gaussian deformation field,\nwe initialize 3D Gaussians by learning a coarse static Gaussian representation.\nExtensive experiments in person-specific videos with audio tracks validate that\nGSTalker can generate high-fidelity and audio-lips synchronized results with\nfast training and real-time rendering speed.\n","authors":["Bo Chen","Shoukang Hu","Qi Chen","Chenpeng Du","Ran Yi","Yanmin Qian","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2404.19040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19038v1","updated":"2024-04-29T18:24:55Z","published":"2024-04-29T18:24:55Z","title":"Embedded Representation Learning Network for Animating Styled Video\n Portrait","summary":" The talking head generation recently attracted considerable attention due to\nits widespread application prospects, especially for digital avatars and 3D\nanimation design. Inspired by this practical demand, several works explored\nNeural Radiance Fields (NeRF) to synthesize the talking heads. However, these\nmethods based on NeRF face two challenges: (1) Difficulty in generating\nstyle-controllable talking heads. (2) Displacement artifacts around the neck in\nrendered images. To overcome these two challenges, we propose a novel\ngenerative paradigm \\textit{Embedded Representation Learning Network} (ERLNet)\nwith two learning stages. First, the \\textit{ audio-driven FLAME} (ADF) module\nis constructed to produce facial expression and head pose sequences\nsynchronized with content audio and style video. Second, given the sequence\ndeduced by the ADF, one novel \\textit{dual-branch fusion NeRF} (DBF-NeRF)\nexplores these contents to render the final images. 
Extensive empirical studies\ndemonstrate that the collaboration of these two stages effectively facilitates\nour method to render a more realistic talking head than the existing\nalgorithms.\n","authors":["Tianyong Wang","Xiangyu Liang","Wangguandong Zheng","Dan Niu","Haifeng Xia","Siyu Xia"],"pdf_url":"https://arxiv.org/pdf/2404.19038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19031v1","updated":"2024-04-29T18:16:13Z","published":"2024-04-29T18:16:13Z","title":"Machine Unlearning for Document Classification","summary":" Document understanding models have recently demonstrated remarkable\nperformance by leveraging extensive collections of user documents. However,\nsince documents often contain large amounts of personal data, their usage can\npose a threat to user privacy and weaken the bonds of trust between humans and\nAI services. In response to these concerns, legislation advocating ``the right\nto be forgotten\" has recently been proposed, allowing users to request the\nremoval of private information from computer systems and neural network models.\nA novel approach, known as machine unlearning, has emerged to make AI models\nforget about a particular class of data. In our research, we explore machine\nunlearning for document classification problems, representing, to the best of\nour knowledge, the first investigation into this area. Specifically, we\nconsider a realistic scenario where a remote server houses a well-trained model\nand possesses only a small portion of training data. This setup is designed for\nefficient forgetting manipulation. This work represents a pioneering step\ntowards the development of machine unlearning methods aimed at addressing\nprivacy concerns in document analysis applications. Our code is publicly\navailable at\n\\url{https://github.com/leitro/MachineUnlearning-DocClassification}.\n","authors":["Lei Kang","Mohamed Ali Souibgui","Fei Yang","Lluis Gomez","Ernest Valveny","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2404.19031v1.pdf","comment":"Accepted to ICDAR2024"},{"id":"http://arxiv.org/abs/2404.19026v1","updated":"2024-04-29T18:10:12Z","published":"2024-04-29T18:10:12Z","title":"MeGA: Hybrid Mesh-Gaussian Head Avatar for High-Fidelity Rendering and\n Head Editing","summary":" Creating high-fidelity head avatars from multi-view videos is a core issue\nfor many AR/VR applications. However, existing methods usually struggle to\nobtain high-quality renderings for all different head components simultaneously\nsince they use one single representation to model components with drastically\ndifferent characteristics (e.g., skin vs. hair). In this paper, we propose a\nHybrid Mesh-Gaussian Head Avatar (MeGA) that models different head components\nwith more suitable representations. Specifically, we select an enhanced FLAME\nmesh as our facial representation and predict a UV displacement map to provide\nper-vertex offsets for improved personalized geometric details. To achieve\nphotorealistic renderings, we obtain facial colors using deferred neural\nrendering and disentangle neural textures into three meaningful parts. For hair\nmodeling, we first build a static canonical hair using 3D Gaussian Splatting. A\nrigid transformation and an MLP-based deformation field are further applied to\nhandle complex dynamic expressions. Combined with our occlusion-aware blending,\nMeGA generates higher-fidelity renderings for the whole head and naturally\nsupports more downstream tasks. 
Experiments on the NeRSemble dataset\ndemonstrate the effectiveness of our designs, outperforming previous\nstate-of-the-art methods and supporting various editing functionalities,\nincluding hairstyle alteration and texture editing.\n","authors":["Cong Wang","Di Kang","He-Yi Sun","Shen-Han Qian","Zi-Xuan Wang","Linchao Bao","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.19026v1.pdf","comment":"Project page: https://conallwang.github.io/MeGA_Pages/"},{"id":"http://arxiv.org/abs/2404.19024v1","updated":"2024-04-29T18:07:47Z","published":"2024-04-29T18:07:47Z","title":"Multi-Page Document Visual Question Answering using Self-Attention\n Scoring Mechanism","summary":" Documents are 2-dimensional carriers of written communication, and as such\ntheir interpretation requires a multi-modal approach where textual and visual\ninformation are efficiently combined. Document Visual Question Answering\n(Document VQA), due to this multi-modal nature, has garnered significant\ninterest from both the document understanding and natural language processing\ncommunities. The state-of-the-art single-page Document VQA methods show\nimpressive performance, yet in multi-page scenarios, these methods struggle.\nThey have to concatenate all pages into one large page for processing,\ndemanding substantial GPU resources, even for evaluation. In this work, we\npropose a novel method and efficient training strategy for multi-page Document\nVQA tasks. In particular, we employ a visual-only document representation,\nleveraging the encoder from a document understanding model, Pix2Struct. Our\napproach utilizes a self-attention scoring mechanism to generate relevance\nscores for each document page, enabling the retrieval of pertinent pages. This\nadaptation allows us to extend single-page Document VQA models to multi-page\nscenarios without constraints on the number of pages during evaluation, all\nwith minimal demand for GPU resources. Our extensive experiments demonstrate\nnot only achieving state-of-the-art performance without the need for Optical\nCharacter Recognition (OCR), but also sustained performance in scenarios\nextending to documents of nearly 800 pages compared to a maximum of 20 pages in\nthe MP-DocVQA dataset. Our code is publicly available at\n\\url{https://github.com/leitro/SelfAttnScoring-MPDocVQA}.\n","authors":["Lei Kang","Rubèn Tito","Ernest Valveny","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2404.19024v1.pdf","comment":"Accepted to ICDAR2024"},{"id":"http://arxiv.org/abs/2404.19015v1","updated":"2024-04-29T18:00:25Z","published":"2024-04-29T18:00:25Z","title":"Simple-RF: Regularizing Sparse Input Radiance Fields with Simpler\n Solutions","summary":" Neural Radiance Fields (NeRF) show impressive performance in photo-realistic\nfree-view rendering of scenes. Recent improvements on the NeRF such as TensoRF\nand ZipNeRF employ explicit models for faster optimization and rendering, as\ncompared to the NeRF that employs an implicit representation. However, both\nimplicit and explicit radiance fields require dense sampling of images in the\ngiven scene. Their performance degrades significantly when only a sparse set of\nviews is available. Researchers find that supervising the depth estimated by a\nradiance field helps train it effectively with fewer views. The depth\nsupervision is obtained either using classical approaches or neural networks\npre-trained on a large dataset. While the former may provide only sparse\nsupervision, the latter may suffer from generalization issues. 
As opposed to\nthe earlier approaches, we seek to learn the depth supervision by designing\naugmented models and training them along with the main radiance field. Further,\nwe aim to design a framework of regularizations that can work across different\nimplicit and explicit radiance fields. We observe that certain features of\nthese radiance field models overfit to the observed images in the sparse-input\nscenario. Our key finding is that reducing the capability of the radiance\nfields with respect to positional encoding, the number of decomposed tensor\ncomponents or the size of the hash table, constrains the model to learn simpler\nsolutions, which estimate better depth in certain regions. By designing\naugmented models based on such reduced capabilities, we obtain better depth\nsupervision for the main radiance field. We achieve state-of-the-art\nview-synthesis performance with sparse input views on popular datasets\ncontaining forward-facing and 360$^\\circ$ scenes by employing the above\nregularizations.\n","authors":["Nagabhushan Somraj","Adithyan Karanayil","Sai Harsha Mupparaju","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2404.19015v1.pdf","comment":"The source code for our model can be found on our project page:\n https://nagabhushansn95.github.io/publications/2024/Simple-RF.html. arXiv\n admin note: substantial text overlap with arXiv:2309.03955"},{"id":"http://arxiv.org/abs/2404.18976v1","updated":"2024-04-29T14:45:28Z","published":"2024-04-29T14:45:28Z","title":"Foundations of Multisensory Artificial Intelligence","summary":" Building multisensory AI systems that learn from multiple sensory inputs such\nas text, speech, video, real-world sensors, wearable devices, and medical data\nholds great promise for impact in many scientific areas with practical\nbenefits, such as in supporting human health and well-being, enabling\nmultimedia content processing, and enhancing real-world autonomous agents. By\nsynthesizing a range of theoretical frameworks and application domains, this\nthesis aims to advance the machine learning foundations of multisensory AI. In\nthe first part, we present a theoretical framework formalizing how modalities\ninteract with each other to give rise to new information for a task. These\ninteractions are the basic building blocks in all multimodal problems, and\ntheir quantification enables users to understand their multimodal datasets,\ndesign principled approaches to learn these interactions, and analyze whether\ntheir model has succeeded in learning. In the second part, we study the design\nof practical multimodal foundation models that generalize over many modalities\nand tasks, which presents a step toward grounding large language models to\nreal-world sensory modalities. We introduce MultiBench, a unified large-scale\nbenchmark across a wide range of modalities, tasks, and research areas,\nfollowed by the cross-modal attention and multimodal transformer architectures\nthat now underpin many of today's multimodal foundation models. Scaling these\narchitectures on MultiBench enables the creation of general-purpose\nmultisensory AI systems, and we discuss our collaborative efforts in applying\nthese models for real-world impact in affective computing, mental health,\ncancer prognosis, and robotics. 
Finally, we conclude this thesis by discussing\nhow future work can leverage these ideas toward more general, interactive, and\nsafe multisensory AI.\n","authors":["Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2404.18976v1.pdf","comment":"CMU Machine Learning Department PhD Thesis"},{"id":"http://arxiv.org/abs/2401.13555v3","updated":"2024-04-29T12:39:23Z","published":"2024-01-24T16:13:26Z","title":"Benchmarking the Fairness of Image Upsampling Methods","summary":" Recent years have witnessed a rapid development of deep generative models for\ncreating synthetic media, such as images and videos. While the practical\napplications of these models in everyday tasks are enticing, it is crucial to\nassess the inherent risks regarding their fairness. In this work, we introduce\na comprehensive framework for benchmarking the performance and fairness of\nconditional generative models. We develop a set of\nmetrics$\\unicode{x2013}$inspired by their supervised fairness\ncounterparts$\\unicode{x2013}$to evaluate the models on their fairness and\ndiversity. Focusing on the specific application of image upsampling, we create\na benchmark covering a wide variety of modern upsampling methods. As part of\nthe benchmark, we introduce UnfairFace, a subset of FairFace that replicates\nthe racial distribution of common large-scale face datasets. Our empirical\nstudy highlights the importance of using an unbiased training set and reveals\nvariations in how the algorithms respond to dataset imbalances. Alarmingly, we\nfind that none of the considered methods produces statistically fair and\ndiverse results. All experiments can be reproduced using our provided\nrepository.\n","authors":["Mike Laszkiewicz","Imant Daunhawer","Julia E. Vogt","Asja Fischer","Johannes Lederer"],"pdf_url":"https://arxiv.org/pdf/2401.13555v3.pdf","comment":"This is the author's version of the work. It is posted here for your\n personal use. Not for redistribution. The definitive Version of Record was\n published at the 2024 ACM Conference on Fairness, Accountability, and\n Transparency (FAccT '24)"},{"id":"http://arxiv.org/abs/2404.18962v1","updated":"2024-04-29T05:55:23Z","published":"2024-04-29T05:55:23Z","title":"An Aggregation-Free Federated Learning for Tackling Data Heterogeneity","summary":" The performance of Federated Learning (FL) hinges on the effectiveness of\nutilizing knowledge from distributed datasets. Traditional FL methods adopt an\naggregate-then-adapt framework, where clients update local models based on a\nglobal model aggregated by the server from the previous training round. This\nprocess can cause client drift, especially with significant cross-client data\nheterogeneity, impacting model performance and convergence of the FL algorithm.\nTo address these challenges, we introduce FedAF, a novel aggregation-free FL\nalgorithm. In this framework, clients collaboratively learn condensed data by\nleveraging peer knowledge, the server subsequently trains the global model\nusing the condensed data and soft labels received from the clients. FedAF\ninherently avoids the issue of client drift, enhances the quality of condensed\ndata amid notable data heterogeneity, and improves the global model\nperformance. 
Extensive numerical studies on several popular benchmark datasets\nshow FedAF surpasses various state-of-the-art FL algorithms in handling\nlabel-skew and feature-skew data heterogeneity, leading to superior global\nmodel accuracy and faster convergence.\n","authors":["Yuan Wang","Huazhu Fu","Renuga Kanagavelu","Qingsong Wei","Yong Liu","Rick Siow Mong Goh"],"pdf_url":"https://arxiv.org/pdf/2404.18962v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2007.10689v2","updated":"2024-04-29T05:31:49Z","published":"2020-07-21T10:03:42Z","title":"A Deep Ordinal Distortion Estimation Approach for Distortion\n Rectification","summary":" Distortion is widely existed in the images captured by popular wide-angle\ncameras and fisheye cameras. Despite the long history of distortion\nrectification, accurately estimating the distortion parameters from a single\ndistorted image is still challenging. The main reason is these parameters are\nimplicit to image features, influencing the networks to fully learn the\ndistortion information. In this work, we propose a novel distortion\nrectification approach that can obtain more accurate parameters with higher\nefficiency. Our key insight is that distortion rectification can be cast as a\nproblem of learning an ordinal distortion from a single distorted image. To\nsolve this problem, we design a local-global associated estimation network that\nlearns the ordinal distortion to approximate the realistic distortion\ndistribution. In contrast to the implicit distortion parameters, the proposed\nordinal distortion have more explicit relationship with image features, and\nthus significantly boosts the distortion perception of neural networks.\nConsidering the redundancy of distortion information, our approach only uses a\npart of distorted image for the ordinal distortion estimation, showing\npromising applications in the efficient distortion rectification. To our\nknowledge, we first unify the heterogeneous distortion parameters into a\nlearning-friendly intermediate representation through ordinal distortion,\nbridging the gap between image feature and distortion rectification. The\nexperimental results demonstrate that our approach outperforms the\nstate-of-the-art methods by a significant margin, with approximately 23%\nimprovement on the quantitative evaluation while displaying the best\nperformance on visual appearance. The code is available at\nhttps://github.com/KangLiao929/OrdinalDistortion.\n","authors":["Kang Liao","Chunyu Lin","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2007.10689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18961v1","updated":"2024-04-29T05:23:10Z","published":"2024-04-29T05:23:10Z","title":"Unleashing the Power of Multi-Task Learning: A Comprehensive Survey\n Spanning Traditional, Deep, and Pretrained Foundation Model Eras","summary":" MTL is a learning paradigm that effectively leverages both task-specific and\nshared information to address multiple related tasks simultaneously. In\ncontrast to STL, MTL offers a suite of benefits that enhance both the training\nprocess and the inference efficiency. MTL's key advantages encompass\nstreamlined model architecture, performance enhancement, and cross-domain\ngeneralizability. Over the past twenty years, MTL has become widely recognized\nas a flexible and effective approach in various fields, including CV, NLP,\nrecommendation systems, disease prognosis and diagnosis, and robotics. 
This\nsurvey provides a comprehensive overview of the evolution of MTL, encompassing\nthe technical aspects of cutting-edge methods from traditional approaches to\ndeep learning and the latest trend of pretrained foundation models. Our survey\nmethodically categorizes MTL techniques into five key areas: regularization,\nrelationship learning, feature propagation, optimization, and pre-training.\nThis categorization not only chronologically outlines the development of MTL\nbut also dives into various specialized strategies within each category.\nFurthermore, the survey reveals how the MTL evolves from handling a fixed set\nof tasks to embracing a more flexible approach free from task or modality\nconstraints. It explores the concepts of task-promptable and -agnostic\ntraining, along with the capacity for ZSL, which unleashes the untapped\npotential of this historically coveted learning paradigm. Overall, we hope this\nsurvey provides the research community with a comprehensive overview of the\nadvancements in MTL from its inception in 1997 to the present in 2023. We\naddress present challenges and look ahead to future possibilities, shedding\nlight on the opportunities and potential avenues for MTL research in a broad\nmanner. This project is publicly available at\nhttps://github.com/junfish/Awesome-Multitask-Learning.\n","authors":["Jun Yu","Yutong Dai","Xiaokang Liu","Jin Huang","Yishan Shen","Ke Zhang","Rong Zhou","Eashan Adhikarla","Wenxuan Ye","Yixin Liu","Zhaoming Kong","Kai Zhang","Yilong Yin","Vinod Namboodiri","Brian D. Davison","Jason H. Moore","Yong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18961v1.pdf","comment":"60 figures, 116 pages, 500+ references"},{"id":"http://arxiv.org/abs/2005.06111v9","updated":"2024-04-29T22:16:08Z","published":"2020-05-13T01:51:15Z","title":"Project RISE: Recognizing Industrial Smoke Emissions","summary":" Industrial smoke emissions pose a significant concern to human health. Prior\nworks have shown that using Computer Vision (CV) techniques to identify smoke\nas visual evidence can influence the attitude of regulators and empower\ncitizens to pursue environmental justice. However, existing datasets are not of\nsufficient quality nor quantity to train the robust CV models needed to support\nair quality advocacy. We introduce RISE, the first large-scale video dataset\nfor Recognizing Industrial Smoke Emissions. We adopted a citizen science\napproach to collaborate with local community members to annotate whether a\nvideo clip has smoke emissions. Our dataset contains 12,567 clips from 19\ndistinct views from cameras that monitored three industrial facilities. These\ndaytime clips span 30 days over two years, including all four seasons. We ran\nexperiments using deep neural networks to establish a strong performance\nbaseline and reveal smoke recognition challenges. 
Our survey study discussed\ncommunity feedback, and our data analysis displayed opportunities for\nintegrating citizen scientists and crowd workers into the application of\nArtificial Intelligence for Social Impact.\n","authors":["Yen-Chia Hsu","Ting-Hao 'Kenneth' Huang","Ting-Yao Hu","Paul Dille","Sean Prendi","Ryan Hoffman","Anastasia Tsuhlares","Jessica Pachuta","Randy Sargent","Illah Nourbakhsh"],"pdf_url":"https://arxiv.org/pdf/2005.06111v9.pdf","comment":"Accepted by AAAI 2021"}]},"2024-04-28T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.11671v2","updated":"2024-04-28T22:21:56Z","published":"2024-01-22T03:09:00Z","title":"RTA-Former: Reverse Transformer Attention for Polyp Segmentation","summary":" Polyp segmentation is a key aspect of colorectal cancer prevention, enabling\nearly detection and guiding subsequent treatments. Intelligent diagnostic\ntools, including deep learning solutions, are widely explored to streamline and\npotentially automate this process. However, even with many powerful network\narchitectures, there still comes the problem of producing accurate edge\nsegmentation. In this paper, we introduce a novel network, namely RTA-Former,\nthat employs a transformer model as the encoder backbone and innovatively\nadapts Reverse Attention (RA) with a transformer stage in the decoder for\nenhanced edge segmentation. The results of the experiments illustrate that\nRTA-Former achieves state-of-the-art (SOTA) performance in five polyp\nsegmentation datasets. The strong capability of RTA-Former holds promise in\nimproving the accuracy of Transformer-based polyp segmentation, potentially\nleading to better clinical decisions and patient outcomes. Our code is publicly\navailable on GitHub.\n","authors":["Zhikai Li","Murong Yi","Ali Uneri","Sihan Niu","Craig Jones"],"pdf_url":"https://arxiv.org/pdf/2401.11671v2.pdf","comment":"The paper has been accepted by EMBC 2024"},{"id":"http://arxiv.org/abs/2308.05459v2","updated":"2024-04-28T22:11:48Z","published":"2023-08-10T09:32:20Z","title":"KS-APR: Keyframe Selection for Robust Absolute Pose Regression","summary":" Markerless Mobile Augmented Reality (AR) aims to anchor digital content in\nthe physical world without using specific 2D or 3D objects. Absolute Pose\nRegressors (APR) are end-to-end machine learning solutions that infer the\ndevice's pose from a single monocular image. Thanks to their low computation\ncost, they can be directly executed on the constrained hardware of mobile AR\ndevices. However, APR methods tend to yield significant inaccuracies for input\nimages that are too distant from the training set. This paper introduces\nKS-APR, a pipeline that assesses the reliability of an estimated pose with\nminimal overhead by combining the inference results of the APR and the prior\nimages in the training set. Mobile AR systems tend to rely upon visual-inertial\nodometry to track the relative pose of the device during the experience. As\nsuch, KS-APR favours reliability over frequency, discarding unreliable poses.\nThis pipeline can integrate most existing APR methods to improve accuracy by\nfiltering unreliable images with their pose estimates. We implement the\npipeline on three types of APR models on indoor and outdoor datasets. The\nmedian error on position and orientation is reduced for all models, and the\nproportion of large errors is minimized across datasets. Our method enables\nstate-of-the-art APRs such as DFNetdm to outperform single-image and sequential\nAPR methods. 
These results demonstrate the scalability and effectiveness of\nKS-APR for visual localization tasks that do not require one-shot decisions.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2308.05459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18327v1","updated":"2024-04-28T21:53:42Z","published":"2024-04-28T21:53:42Z","title":"MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion\n Recognition","summary":" This paper presents a novel approach to processing multimodal data for\ndynamic emotion recognition, named as the Multimodal Masked Autoencoder for\nDynamic Emotion Recognition (MultiMAE-DER). The MultiMAE-DER leverages the\nclosely correlated representation information within spatiotemporal sequences\nacross visual and audio modalities. By utilizing a pre-trained masked\nautoencoder model, the MultiMAEDER is accomplished through simple,\nstraightforward finetuning. The performance of the MultiMAE-DER is enhanced by\noptimizing six fusion strategies for multimodal input sequences. These\nstrategies address dynamic feature correlations within cross-domain data across\nspatial, temporal, and spatiotemporal sequences. In comparison to\nstate-of-the-art multimodal supervised learning models for dynamic emotion\nrecognition, MultiMAE-DER enhances the weighted average recall (WAR) by 4.41%\non the RAVDESS dataset and by 2.06% on the CREMAD. Furthermore, when compared\nwith the state-of-the-art model of multimodal self-supervised learning,\nMultiMAE-DER achieves a 1.86% higher WAR on the IEMOCAP dataset.\n","authors":["Peihao Xiang","Chaohao Lin","Kaida Wu","Ou Bai"],"pdf_url":"https://arxiv.org/pdf/2404.18327v1.pdf","comment":"Accepted by ICPRS 2024"},{"id":"http://arxiv.org/abs/2404.18316v1","updated":"2024-04-28T20:57:55Z","published":"2024-04-28T20:57:55Z","title":"Position paper: Do not explain (vision models) without context","summary":" Does the stethoscope in the picture make the adjacent person a doctor or a\npatient? This, of course, depends on the contextual relationship of the two\nobjects. If it is obvious, why don not explanation methods for vision models\nuse contextual information? In this paper, we (1) review the most popular\nmethods of explaining computer vision models by pointing out that they do not\ntake into account context information, (2) provide examples of real-world use\ncases where spatial context plays a significant role, (3) propose new research\ndirections that may lead to better use of context information in explaining\ncomputer vision models, (4) argue that a change in approach to explanations is\nneeded from 'where' to 'how'.\n","authors":["Paulina Tomaszewska","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.18316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10172v2","updated":"2024-04-28T20:15:25Z","published":"2024-04-15T23:01:59Z","title":"Forensic Iris Image-Based Post-Mortem Interval Estimation","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup. One factor that may be useful in\nconditioning iris recognition methods is the tissue decomposition level, which\nis correlated with the post-mortem interval (PMI), i.g., the number of hours\nthat have elapsed since death. PMI, however, is not always available, and its\nprecise estimation remains one of the core challenges in forensic examination.\nThis paper presents the first known to us method of PMI estimation directly\nfrom forensic iris images. 
To assess the feasibility of the iris-based PMI\nestimation, convolutional neural networks-based models (VGG19, DenseNet121,\nResNet152, and Inception_v3) were trained to predict the PMI from (a)\nnear-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris\nimages. Models were evaluated following a 10-fold cross-validation in (S1)\nsample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We\nfound that using the multispectral data offers a spectacularly low mean\nabsolute error (MAE) of approximately 3.5 hours in scenario (S1), a bit worse\nMAE of approximately 17.5 hours in scenario (S2), and an MAE of approximately\n69.0 hours of in the scenario (S3). This suggests that if the environmental\nconditions are favorable (e.g., bodies are kept in low temperatures), forensic\niris images provide features that are indicative of the PMI and can be\nautomatically estimated. The source codes and model weights are made available\nwith the paper.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.10172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09474v2","updated":"2024-04-28T20:05:45Z","published":"2024-02-12T11:04:08Z","title":"Deciphering Heartbeat Signatures: A Vision Transformer Approach to\n Explainable Atrial Fibrillation Detection from ECG Signals","summary":" Remote patient monitoring based on wearable single-lead electrocardiogram\n(ECG) devices has significant potential for enabling the early detection of\nheart disease, especially in combination with artificial intelligence (AI)\napproaches for automated heart disease detection. There have been prior studies\napplying AI approaches based on deep learning for heart disease detection.\nHowever, these models are yet to be widely accepted as a reliable aid for\nclinical diagnostics, in part due to the current black-box perception\nsurrounding many AI algorithms. In particular, there is a need to identify the\nkey features of the ECG signal that contribute toward making an accurate\ndiagnosis, thereby enhancing the interpretability of the model. In the present\nstudy, we develop a vision transformer approach to identify atrial fibrillation\nbased on single-lead ECG data. A residual network (ResNet) approach is also\ndeveloped for comparison with the vision transformer approach. These models are\napplied to the Chapman-Shaoxing dataset to classify atrial fibrillation, as\nwell as another common arrhythmia, sinus bradycardia, and normal sinus rhythm\nheartbeats. The models enable the identification of the key regions of the\nheartbeat that determine the resulting classification, and highlight the\nimportance of P-waves and T-waves, as well as heartbeat duration and signal\namplitude, in distinguishing normal sinus rhythm from atrial fibrillation and\nsinus bradycardia.\n","authors":["Aruna Mohan","Danne Elbers","Or Zilbershot","Fatemeh Afghah","David Vorchheimer"],"pdf_url":"https://arxiv.org/pdf/2402.09474v2.pdf","comment":"Accepted for publication at the 46th Annual International Conference\n of the IEEE Engineering in Medicine and Biology Society, IEEE EMBC 2024"},{"id":"http://arxiv.org/abs/2311.06243v2","updated":"2024-04-28T20:05:02Z","published":"2023-11-10T18:59:54Z","title":"Parameter-Efficient Orthogonal Finetuning via Butterfly Factorization","summary":" Large foundation models are becoming ubiquitous, but training them from\nscratch is prohibitively expensive. 
Thus, efficiently adapting these powerful\nmodels to downstream tasks is increasingly important. In this paper, we study a\nprincipled finetuning paradigm -- Orthogonal Finetuning (OFT) -- for downstream\ntask adaptation. Despite demonstrating good generalizability, OFT still uses a\nfairly large number of trainable parameters due to the high dimensionality of\northogonal matrices. To address this, we start by examining OFT from an\ninformation transmission perspective, and then identify a few key desiderata\nthat enable better parameter-efficiency. Inspired by how the Cooley-Tukey fast\nFourier transform algorithm enables efficient information transmission, we\npropose an efficient orthogonal parameterization using butterfly structures. We\napply this parameterization to OFT, creating a novel parameter-efficient\nfinetuning method, called Orthogonal Butterfly (BOFT). By subsuming OFT as a\nspecial case, BOFT introduces a generalized orthogonal finetuning framework.\nFinally, we conduct an extensive empirical study of adapting large vision\ntransformers, large language models, and text-to-image diffusion models to\nvarious downstream tasks in vision and language.\n","authors":["Weiyang Liu","Zeju Qiu","Yao Feng","Yuliang Xiu","Yuxuan Xue","Longhui Yu","Haiwen Feng","Zhen Liu","Juyeon Heo","Songyou Peng","Yandong Wen","Michael J. Black","Adrian Weller","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2311.06243v2.pdf","comment":"ICLR 2024 (v2: 34 pages, 19 figures)"},{"id":"http://arxiv.org/abs/2404.18291v1","updated":"2024-04-28T19:35:00Z","published":"2024-04-28T19:35:00Z","title":"Panoptic Segmentation and Labelling of Lumbar Spine Vertebrae using\n Modified Attention Unet","summary":" Segmentation and labeling of vertebrae in MRI images of the spine are\ncritical for the diagnosis of illnesses and abnormalities. These steps are\nindispensable as MRI technology provides detailed information about the tissue\nstructure of the spine. Both supervised and unsupervised segmentation methods\nexist, yet acquiring sufficient data remains challenging for achieving high\naccuracy. In this study, we propose an enhancing approach based on modified\nattention U-Net architecture for panoptic segmentation of 3D sliced MRI data of\nthe lumbar spine. Our method achieves an impressive accuracy of 99.5\\% by\nincorporating novel masking logic, thus significantly advancing the\nstate-of-the-art in vertebral segmentation and labeling. This contributes to\nmore precise and reliable diagnosis and treatment planning.\n","authors":["Rikathi Pal","Priya Saha","Somoballi Ghoshal","Amlan Chakrabarti","Susmita Sur-Kolay"],"pdf_url":"https://arxiv.org/pdf/2404.18291v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.18284v1","updated":"2024-04-28T19:02:54Z","published":"2024-04-28T19:02:54Z","title":"S3-SLAM: Sparse Tri-plane Encoding for Neural Implicit SLAM","summary":" With the emergence of Neural Radiance Fields (NeRF), neural implicit\nrepresentations have gained widespread applications across various domains,\nincluding simultaneous localization and mapping. However, current neural\nimplicit SLAM faces a challenging trade-off problem between performance and the\nnumber of parameters. To address this problem, we propose sparse tri-plane\nencoding, which efficiently achieves scene reconstruction at resolutions up to\n512 using only 2~4% of the commonly used tri-plane parameters (reduced from\n100MB to 2~4MB). 
On this basis, we design S3-SLAM to achieve rapid and\nhigh-quality tracking and mapping through sparsifying plane parameters and\nintegrating orthogonal features of tri-plane. Furthermore, we develop\nhierarchical bundle adjustment to achieve globally consistent geometric\nstructures and reconstruct high-resolution appearance. Experimental results\ndemonstrate that our approach achieves competitive tracking and scene\nreconstruction with minimal parameters on three datasets. Source code will soon\nbe available.\n","authors":["Zhiyao Zhang","Yunzhou Zhang","Yanmin Wu","Bin Zhao","Xingshuo Wang","Rui Tian"],"pdf_url":"https://arxiv.org/pdf/2404.18284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18279v1","updated":"2024-04-28T18:51:32Z","published":"2024-04-28T18:51:32Z","title":"Out-of-distribution Detection in Medical Image Analysis: A survey","summary":" Computer-aided diagnostics has benefited from the development of deep\nlearning-based computer vision techniques in these years. Traditional\nsupervised deep learning methods assume that the test sample is drawn from the\nidentical distribution as the training data. However, it is possible to\nencounter out-of-distribution samples in real-world clinical scenarios, which\nmay cause silent failure in deep learning-based medical image analysis tasks.\nRecently, research has explored various out-of-distribution (OOD) detection\nsituations and techniques to enable a trustworthy medical AI system. In this\nsurvey, we systematically review the recent advances in OOD detection in\nmedical image analysis. We first explore several factors that may cause a\ndistributional shift when using a deep-learning-based model in clinic\nscenarios, with three different types of distributional shift well defined on\ntop of these factors. Then a framework is suggested to categorize and feature\nexisting solutions, while the previous studies are reviewed based on the\nmethodology taxonomy. Our discussion also includes evaluation protocols and\nmetrics, as well as the challenge and a research direction lack of exploration.\n","authors":["Zesheng Hong","Yubiao Yue","Yubin Chen","Huanjie Lin","Yuanmei Luo","Mini Han Wang","Weidong Wang","Jialong Xu","Xiaoqi Yang","Zhenzhang Li","Sihong Xie"],"pdf_url":"https://arxiv.org/pdf/2404.18279v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.00372v2","updated":"2024-04-28T18:45:32Z","published":"2024-03-01T08:57:28Z","title":"HyperSDFusion: Bridging Hierarchical Structures in Language and Geometry\n for Enhanced 3D Text2Shape Generation","summary":" 3D shape generation from text is a fundamental task in 3D representation\nlearning. The text-shape pairs exhibit a hierarchical structure, where a\ngeneral text like ``chair\" covers all 3D shapes of the chair, while more\ndetailed prompts refer to more specific shapes. Furthermore, both text and 3D\nshapes are inherently hierarchical structures. However, existing Text2Shape\nmethods, such as SDFusion, do not exploit that. In this work, we propose\nHyperSDFusion, a dual-branch diffusion model that generates 3D shapes from a\ngiven text. Since hyperbolic space is suitable for handling hierarchical data,\nwe propose to learn the hierarchical representations of text and 3D shapes in\nhyperbolic space. First, we introduce a hyperbolic text-image encoder to learn\nthe sequential and multi-modal hierarchical features of text in hyperbolic\nspace. 
In addition, we design a hyperbolic text-graph convolution module to\nlearn the hierarchical features of text in hyperbolic space. In order to fully\nutilize these text features, we introduce a dual-branch structure to embed text\nfeatures in 3D feature space. At last, to endow the generated 3D shapes with a\nhierarchical structure, we devise a hyperbolic hierarchical loss. Our method is\nthe first to explore the hyperbolic hierarchical representation for\ntext-to-shape generation. Experimental results on the existing text-to-shape\npaired dataset, Text2Shape, achieved state-of-the-art results. We release our\nimplementation under HyperSDFusion.github.io.\n","authors":["Zhiying Leng","Tolga Birdal","Xiaohui Liang","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2403.00372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02558v3","updated":"2024-04-28T18:44:33Z","published":"2023-06-05T03:14:54Z","title":"Multi-View Representation is What You Need for Point-Cloud Pre-Training","summary":" A promising direction for pre-training 3D point clouds is to leverage the\nmassive amount of data in 2D, whereas the domain gap between 2D and 3D creates\na fundamental challenge. This paper proposes a novel approach to point-cloud\npre-training that learns 3D representations by leveraging pre-trained 2D\nnetworks. Different from the popular practice of predicting 2D features first\nand then obtaining 3D features through dimensionality lifting, our approach\ndirectly uses a 3D network for feature extraction. We train the 3D feature\nextraction network with the help of the novel 2D knowledge transfer loss, which\nenforces the 2D projections of the 3D feature to be consistent with the output\nof pre-trained 2D networks. To prevent the feature from discarding 3D signals,\nwe introduce the multi-view consistency loss that additionally encourages the\nprojected 2D feature representations to capture pixel-wise correspondences\nacross different views. Such correspondences induce 3D geometry and effectively\nretain 3D features in the projected 2D features. Experimental results\ndemonstrate that our pre-trained model can be successfully transferred to\nvarious downstream tasks, including 3D shape classification, part segmentation,\n3D object detection, and semantic segmentation, achieving state-of-the-art\nperformance.\n","authors":["Siming Yan","Chen Song","Youkang Kong","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2306.02558v3.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2304.06911v2","updated":"2024-04-28T18:36:19Z","published":"2023-04-14T03:25:24Z","title":"3D Feature Prediction for Masked-AutoEncoder-Based Point Cloud\n Pretraining","summary":" Masked autoencoders (MAE) have recently been introduced to 3D self-supervised\npretraining for point clouds due to their great success in NLP and computer\nvision. Unlike MAEs used in the image domain, where the pretext task is to\nrestore features at the masked pixels, such as colors, the existing 3D MAE\nworks reconstruct the missing geometry only, i.e, the location of the masked\npoints. In contrast to previous studies, we advocate that point location\nrecovery is inessential and restoring intrinsic point features is much\nsuperior. To this end, we propose to ignore point position reconstruction and\nrecover high-order features at masked points including surface normals and\nsurface variations, through a novel attention-based decoder which is\nindependent of the encoder design. 
We validate the effectiveness of our pretext\ntask and decoder design using different encoder structures for 3D training and\ndemonstrate the advantages of our pretrained networks on various point cloud\nanalysis tasks.\n","authors":["Siming Yan","Yuqi Yang","Yuxiao Guo","Hao Pan","Peng-shuai Wang","Xin Tong","Yang Liu","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2304.06911v2.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2404.18260v1","updated":"2024-04-28T17:50:58Z","published":"2024-04-28T17:50:58Z","title":"Align, Minimize and Diversify: A Source-Free Unsupervised Domain\n Adaptation Method for Handwritten Text Recognition","summary":" This paper serves to introduce the Align, Minimize and Diversify (AMD)\nmethod, a Source-Free Unsupervised Domain Adaptation approach for Handwritten\nText Recognition (HTR). This framework decouples the adaptation process from\nthe source data, thus not only sidestepping the resource-intensive retraining\nprocess but also making it possible to leverage the wealth of pre-trained\nknowledge encoded in modern Deep Learning architectures. Our method explicitly\neliminates the need to revisit the source data during adaptation by\nincorporating three distinct regularization terms: the Align term, which\nreduces the feature distribution discrepancy between source and target data,\nensuring the transferability of the pre-trained representation; the Minimize\nterm, which encourages the model to make assertive predictions, pushing the\noutputs towards one-hot-like distributions in order to minimize prediction\nuncertainty, and finally, the Diversify term, which safeguards against the\ndegeneracy in predictions by promoting varied and distinctive sequences\nthroughout the target data, preventing informational collapse. Experimental\nresults from several benchmarks demonstrated the effectiveness and robustness\nof AMD, showing it to be competitive and often outperforming DA methods in HTR.\n","authors":["María Alfaro-Contreras","Jorge Calvo-Zaragoza"],"pdf_url":"https://arxiv.org/pdf/2404.18260v1.pdf","comment":"Submitted to ECCV 2024"},{"id":"http://arxiv.org/abs/2404.18253v1","updated":"2024-04-28T17:20:08Z","published":"2024-04-28T17:20:08Z","title":"Efficient Remote Sensing with Harmonized Transfer Learning and Modality\n Alignment","summary":" With the rise of Visual and Language Pretraining (VLP), an increasing number\nof downstream tasks are adopting the paradigm of pretraining followed by\nfine-tuning. Although this paradigm has demonstrated potential in various\nmultimodal downstream tasks, its implementation in the remote sensing domain\nencounters some obstacles. Specifically, the tendency for same-modality\nembeddings to cluster together impedes efficient transfer learning. To tackle\nthis issue, we review the aim of multimodal transfer learning for downstream\ntasks from a unified perspective, and rethink the optimization process based on\nthree distinct objectives. We propose \"Harmonized Transfer Learning and\nModality Alignment (HarMA)\", a method that simultaneously satisfies task\nconstraints, modality alignment, and single-modality uniform alignment, while\nminimizing training overhead through parameter-efficient fine-tuning.\nRemarkably, without the need for external data for training, HarMA achieves\nstate-of-the-art performance in two popular multimodal retrieval tasks in the\nfield of remote sensing. 
Our experiments reveal that HarMA achieves competitive\nand even superior performance to fully fine-tuned models with only minimal\nadjustable parameters. Due to its simplicity, HarMA can be integrated into\nalmost all existing multimodal pretraining models. We hope this method can\nfacilitate the efficient application of large models to a wide range of\ndownstream tasks while significantly reducing the resource consumption. Code is\navailable at https://github.com/seekerhuang/HarMA.\n","authors":["Tengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.18253v1.pdf","comment":"Accepted by the Twelfth International Conference on Learning\n Representations (ICLR) Workshop"},{"id":"http://arxiv.org/abs/2404.18252v1","updated":"2024-04-28T17:18:41Z","published":"2024-04-28T17:18:41Z","title":"Fisher Information Improved Training-Free Conditional Diffusion Model","summary":" Recently, the diffusion model with the training-free methods has succeeded in\nconditional image generation tasks. However, there is an efficiency problem\nbecause it requires calculating the gradient with high computational cost, and\nprevious methods make strong assumptions to solve it, sacrificing\ngeneralization. In this work, we propose the Fisher information guided\ndiffusion model (FIGD). Concretely, we introduce the Fisher information to\nestimate the gradient without making any additional assumptions to reduce\ncomputation cost. Meanwhile, we demonstrate that the Fisher information ensures\nthe generalization of FIGD and provides new insights for training-free methods\nbased on the information theory. The experimental results demonstrate that FIGD\ncould achieve different conditional generations more quickly while maintaining\nhigh quality.\n","authors":["Kaiyu Song","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2404.18252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18246v1","updated":"2024-04-28T16:58:53Z","published":"2024-04-28T16:58:53Z","title":"AdaFSNet: Time Series Classification Based on Convolutional Network with\n a Adaptive and Effective Kernel Size Configuration","summary":" Time series classification is one of the most critical and challenging\nproblems in data mining, existing widely in various fields and holding\nsignificant research importance. Despite extensive research and notable\nachievements with successful real-world applications, addressing the challenge\nof capturing the appropriate receptive field (RF) size from one-dimensional or\nmulti-dimensional time series of varying lengths remains a persistent issue,\nwhich greatly impacts performance and varies considerably across different\ndatasets. In this paper, we propose an Adaptive and Effective Full-Scope\nConvolutional Neural Network (AdaFSNet) to enhance the accuracy of time series\nclassification. This network includes two Dense Blocks. Particularly, it can\ndynamically choose a range of kernel sizes that effectively encompass the\noptimal RF size for various datasets by incorporating multiple prime numbers\ncorresponding to the time series length. We also design a TargetDrop block,\nwhich can reduce redundancy while extracting a more effective RF. To assess the\neffectiveness of the AdaFSNet network, comprehensive experiments were conducted\nusing the UCR and UEA datasets, which include one-dimensional and\nmulti-dimensional time series data, respectively. 
Our model surpassed baseline\nmodels in terms of classification accuracy, underscoring the AdaFSNet network's\nefficiency and effectiveness in handling time series classification tasks.\n","authors":["Haoxiao Wang","Bo Peng","Jianhua Zhang","Xu Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.18246v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.18245v1","updated":"2024-04-28T16:55:44Z","published":"2024-04-28T16:55:44Z","title":"FAD-SAR: A Novel Fishing Activity Detection System via Synthetic\n Aperture Radar Images Based on Deep Learning Method","summary":" Illegal, unreported, and unregulated (IUU) fishing seriously affects various\naspects of human life. However, current methods for detecting and monitoring\nIUU activities at sea have limitations. While Synthetic Aperture Radar (SAR)\ncan complement existing vessel detection systems, extracting useful information\nfrom SAR images using traditional methods, especially for IUU fishing\nidentification, poses challenges. This paper proposes a deep learning-based\nsystem for detecting fishing activities. We implemented this system on the\nxView3 dataset using six classical object detection models: Faster R-CNN,\nCascade R-CNN, SSD, RetinaNet, FSAF, and FCOS. We applied improvement methods\nto enhance the performance of the Faster R-CNN model. Specifically, training\nthe Faster R-CNN model using Online Hard Example Mining (OHEM) strategy\nimproved the Avg-F1 value from 0.212 to 0.216, representing a 1.96%\nimprovement.\n","authors":["Yanbing Bai","Rui-Yang Ju","Siao Li","Zihao Yang","Jinze Yu"],"pdf_url":"https://arxiv.org/pdf/2404.18245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18235v1","updated":"2024-04-28T16:29:22Z","published":"2024-04-28T16:29:22Z","title":"Flood Data Analysis on SpaceNet 8 Using Apache Sedona","summary":" With the escalating frequency of floods posing persistent threats to human\nlife and property, satellite remote sensing has emerged as an indispensable\ntool for monitoring flood hazards. SpaceNet8 offers a unique opportunity to\nleverage cutting-edge artificial intelligence technologies to assess these\nhazards. A significant contribution of this research is its application of\nApache Sedona, an advanced platform specifically designed for the efficient and\ndistributed processing of large-scale geospatial data. This platform aims to\nenhance the efficiency of error analysis, a critical aspect of improving flood\ndamage detection accuracy. Based on Apache Sedona, we introduce a novel\napproach that addresses the challenges associated with inaccuracies in flood\ndamage detection. This approach involves the retrieval of cases from historical\nflood events, the adaptation of these cases to current scenarios, and the\nrevision of the model based on clustering algorithms to refine its performance.\nThrough the replication of both the SpaceNet8 baseline and its top-performing\nmodels, we embark on a comprehensive error analysis. This analysis reveals\nseveral main sources of inaccuracies. To address these issues, we employ data\nvisual interpretation and histogram equalization techniques, resulting in\nsignificant improvements in model metrics. After these enhancements, our\nindicators show a notable improvement, with precision up by 5%, F1 score by\n2.6%, and IoU by 4.5%. This work highlights the importance of advanced\ngeospatial data processing tools, such as Apache Sedona. 
By improving the\naccuracy and efficiency of flood detection, this research contributes to\nsafeguarding public safety and strengthening infrastructure resilience in\nflood-prone areas, making it a valuable addition to the field of remote sensing\nand disaster management.\n","authors":["Yanbing Bai","Zihao Yang","Jinze Yu","Rui-Yang Ju","Bin Yang","Erick Mas","Shunichi Koshimura"],"pdf_url":"https://arxiv.org/pdf/2404.18235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07609v2","updated":"2024-04-28T15:36:40Z","published":"2023-11-11T13:09:11Z","title":"Artificial Intelligence in Assessing Cardiovascular Diseases and Risk\n Factors via Retinal Fundus Images: A Review of the Last Decade","summary":" Background: Cardiovascular diseases (CVDs) are the leading cause of death\nglobally. The use of artificial intelligence (AI) methods - in particular, deep\nlearning (DL) - has been on the rise lately for the analysis of different\nCVD-related topics. The use of fundus images and optical coherence tomography\nangiography (OCTA) in the diagnosis of retinal diseases has also been\nextensively studied. To better understand heart function and anticipate changes\nbased on microvascular characteristics and function, researchers are currently\nexploring the integration of AI with non-invasive retinal scanning. There is\ngreat potential to reduce the number of cardiovascular events and the financial\nstrain on healthcare systems by utilizing AI-assisted early detection and\nprediction of cardiovascular diseases on a large scale. Method: A comprehensive\nsearch was conducted across various databases, including PubMed, Medline,\nGoogle Scholar, Scopus, Web of Sciences, IEEE Xplore, and ACM Digital Library,\nusing specific keywords related to cardiovascular diseases and artificial\nintelligence. Results: The study included 87 English-language publications\nselected for relevance, and additional references were considered. This paper\nprovides an overview of the recent developments and difficulties in using\nartificial intelligence and retinal imaging to diagnose cardiovascular\ndiseases. It provides insights for further exploration in this field.\nConclusion: Researchers are trying to develop precise disease prognosis\npatterns in response to the aging population and the growing global burden of\nCVD. AI and deep learning are revolutionizing healthcare by potentially\ndiagnosing multiple CVDs from a single retinal image. However, swifter adoption\nof these technologies in healthcare systems is required.\n","authors":["Mirsaeed Abdollahi","Ali Jafarizadeh","Amirhosein Ghafouri Asbagh","Navid Sobhi","Keysan Pourmoghtader","Siamak Pedrammehr","Houshyar Asadi","Roohallah Alizadehsani","Ru-San Tan","U. Rajendra Acharya"],"pdf_url":"https://arxiv.org/pdf/2311.07609v2.pdf","comment":"41 pages, 5 figures, 3 tables, 114 references"},{"id":"http://arxiv.org/abs/2309.14162v4","updated":"2024-04-28T15:19:15Z","published":"2023-09-25T14:13:26Z","title":"Data Upcycling Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) compresses deep neural networks by transferring\ntask-related knowledge from cumbersome pre-trained teacher models to compact\nstudent models. However, current KD methods for super-resolution (SR) networks\noverlook the nature of SR task that the outputs of the teacher model are noisy\napproximations to the ground-truth distribution of high-quality images (GT),\nwhich shades the teacher model's knowledge to result in limited KD effects. 
To\nutilize the teacher model beyond the GT upper-bound, we present the Data\nUpcycling Knowledge Distillation (DUKD), to transfer the teacher model's\nknowledge to the student model through the upcycled in-domain data derived from\ntraining data. Besides, we impose label consistency regularization to KD for SR\nby the paired invertible augmentations to improve the student model's\nperformance and robustness. Comprehensive experiments demonstrate that the DUKD\nmethod significantly outperforms previous arts on several SR tasks.\n","authors":["Yun Zhang","Wei Li","Simiao Li","Hanting Chen","Zhijun Tu","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14162v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18213v1","updated":"2024-04-28T15:12:56Z","published":"2024-04-28T15:12:56Z","title":"S$^2$Mamba: A Spatial-spectral State Space Model for Hyperspectral Image\n Classification","summary":" Land cover analysis using hyperspectral images (HSI) remains an open problem\ndue to their low spatial resolution and complex spectral information. Recent\nstudies are primarily dedicated to designing Transformer-based architectures\nfor spatial-spectral long-range dependencies modeling, which is computationally\nexpensive with quadratic complexity. Selective structured state space model\n(Mamba), which is efficient for modeling long-range dependencies with linear\ncomplexity, has recently shown promising progress. However, its potential in\nhyperspectral image processing that requires handling numerous spectral bands\nhas not yet been explored. In this paper, we innovatively propose S$^2$Mamba, a\nspatial-spectral state space model for hyperspectral image classification, to\nexcavate spatial-spectral contextual features, resulting in more efficient and\naccurate land cover analysis. In S$^2$Mamba, two selective structured state\nspace models through different dimensions are designed for feature extraction,\none for spatial, and the other for spectral, along with a spatial-spectral\nmixture gate for optimal fusion. More specifically, S$^2$Mamba first captures\nspatial contextual relations by interacting each pixel with its adjacent\nthrough a Patch Cross Scanning module and then explores semantic information\nfrom continuous spectral bands through a Bi-directional Spectral Scanning\nmodule. Considering the distinct expertise of the two attributes in homogenous\nand complicated texture scenes, we realize the Spatial-spectral Mixture Gate by\na group of learnable matrices, allowing for the adaptive incorporation of\nrepresentations learned across different dimensions. Extensive experiments\nconducted on HSI classification benchmarks demonstrate the superiority and\nprospect of S$^2$Mamba. The code will be available at:\nhttps://github.com/PURE-melo/S2Mamba.\n","authors":["Guanchun Wang","Xiangrong Zhang","Zelin Peng","Tianyang Zhang","Xiuping Jia","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.18213v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.18212v1","updated":"2024-04-28T15:07:53Z","published":"2024-04-28T15:07:53Z","title":"Paint by Inpaint: Learning to Add Image Objects by Removing Them First","summary":" Image editing has advanced significantly with the introduction of\ntext-conditioned diffusion models. Despite this progress, seamlessly adding\nobjects to images based on textual instructions without requiring user-provided\ninput masks remains a challenge. 
We address this by leveraging the insight that\nremoving objects (Inpaint) is significantly simpler than its inverse process of\nadding them (Paint), attributed to the utilization of segmentation mask\ndatasets alongside inpainting models that inpaint within these masks.\nCapitalizing on this realization, by implementing an automated and extensive\npipeline, we curate a filtered large-scale image dataset containing pairs of\nimages and their corresponding object-removed versions. Using these pairs, we\ntrain a diffusion model to inverse the inpainting process, effectively adding\nobjects into images. Unlike other editing datasets, ours features natural\ntarget images instead of synthetic ones; moreover, it maintains consistency\nbetween source and target by construction. Additionally, we utilize a large\nVision-Language Model to provide detailed descriptions of the removed objects\nand a Large Language Model to convert these descriptions into diverse,\nnatural-language instructions. We show that the trained model surpasses\nexisting ones both qualitatively and quantitatively, and release the\nlarge-scale dataset alongside the trained models for the community.\n","authors":["Navve Wasserman","Noam Rotstein","Roy Ganz","Ron Kimmel"],"pdf_url":"https://arxiv.org/pdf/2404.18212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16139v2","updated":"2024-04-28T15:06:51Z","published":"2024-04-24T18:57:30Z","title":"A Survey on Intermediate Fusion Methods for Collaborative Perception\n Categorized by Real World Challenges","summary":" This survey analyzes intermediate fusion methods in collaborative perception\nfor autonomous driving, categorized by real-world challenges. We examine\nvarious methods, detailing their features and the evaluation metrics they\nemploy. The focus is on addressing challenges like transmission efficiency,\nlocalization errors, communication disruptions, and heterogeneity. Moreover, we\nexplore strategies to counter adversarial attacks and defenses, as well as\napproaches to adapt to domain shifts. The objective is to present an overview\nof how intermediate fusion methods effectively meet these diverse challenges,\nhighlighting their role in advancing the field of collaborative perception in\nautonomous driving.\n","authors":["Melih Yazgan","Thomas Graf","Min Liu","Tobias Fleck","J. Marius Zoellner"],"pdf_url":"https://arxiv.org/pdf/2404.16139v2.pdf","comment":"8 pages, 6 tables"},{"id":"http://arxiv.org/abs/2404.18206v1","updated":"2024-04-28T14:58:54Z","published":"2024-04-28T14:58:54Z","title":"Enhancing Action Recognition from Low-Quality Skeleton Data via\n Part-Level Knowledge Distillation","summary":" Skeleton-based action recognition is vital for comprehending human-centric\nvideos and has applications in diverse domains. One of the challenges of\nskeleton-based action recognition is dealing with low-quality data, such as\nskeletons that have missing or inaccurate joints. This paper addresses the\nissue of enhancing action recognition using low-quality skeletons through a\ngeneral knowledge distillation framework. The proposed framework employs a\nteacher-student model setup, where a teacher model trained on high-quality\nskeletons guides the learning of a student model that handles low-quality\nskeletons. To bridge the gap between heterogeneous high-quality and lowquality\nskeletons, we present a novel part-based skeleton matching strategy, which\nexploits shared body parts to facilitate local action pattern learning. 
An\naction-specific part matrix is developed to emphasize critical parts for\ndifferent actions, enabling the student model to distill discriminative\npart-level knowledge. A novel part-level multi-sample contrastive loss achieves\nknowledge transfer from multiple high-quality skeletons to low-quality ones,\nwhich enables the proposed knowledge distillation framework to include training\nlow-quality skeletons that lack corresponding high-quality matches.\nComprehensive experiments conducted on the NTU-RGB+D, Penn Action, and SYSU 3D\nHOI datasets demonstrate the effectiveness of the proposed knowledge\ndistillation framework.\n","authors":["Cuiwei Liu","Youzhi Jiang","Chong Du","Zhaokui Li"],"pdf_url":"https://arxiv.org/pdf/2404.18206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18203v1","updated":"2024-04-28T14:47:09Z","published":"2024-04-28T14:47:09Z","title":"LMM-PCQA: Assisting Point Cloud Quality Assessment with LMM","summary":" Although large multi-modality models (LMMs) have seen extensive exploration\nand application in various quality assessment studies, their integration into\nPoint Cloud Quality Assessment (PCQA) remains unexplored. Given LMMs'\nexceptional performance and robustness in low-level vision and quality\nassessment tasks, this study aims to investigate the feasibility of imparting\nPCQA knowledge to LMMs through text supervision. To achieve this, we transform\nquality labels into textual descriptions during the fine-tuning phase, enabling\nLMMs to derive quality rating logits from 2D projections of point clouds. To\ncompensate for the loss of perception in the 3D domain, structural features are\nextracted as well. These quality logits and structural features are then\ncombined and regressed into quality scores. Our experimental results affirm the\neffectiveness of our approach, showcasing a novel integration of LMMs into PCQA\nthat enhances model understanding and assessment accuracy. We hope our\ncontributions can inspire subsequent investigations into the fusion of LMMs\nwith PCQA, fostering advancements in 3D visual quality analysis and beyond.\n","authors":["Zicheng Zhang","Haoning Wu","Yingjie Zhou","Chunyi Li","Wei Sun","Chaofeng Chen","Xiongkuo Min","Xiaohong Liu","Weisi Lin","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.18203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18199v1","updated":"2024-04-28T14:37:10Z","published":"2024-04-28T14:37:10Z","title":"Rethinking Attention Gated with Hybrid Dual Pyramid Transformer-CNN for\n Generalized Segmentation in Medical Imaging","summary":" Inspired by the success of Transformers in Computer vision, Transformers have\nbeen widely investigated for medical imaging segmentation. However, most of\nTransformer architecture are using the recent transformer architectures as\nencoder or as parallel encoder with the CNN encoder. In this paper, we\nintroduce a novel hybrid CNN-Transformer segmentation architecture\n(PAG-TransYnet) designed for efficiently building a strong CNN-Transformer\nencoder. Our approach exploits attention gates within a Dual Pyramid hybrid\nencoder. 
The contributions of this methodology can be summarized into three key\naspects: (i) the utilization of Pyramid input for highlighting the prominent\nfeatures at different scales, (ii) the incorporation of a PVT transformer to\ncapture long-range dependencies across various resolutions, and (iii) the\nimplementation of a Dual-Attention Gate mechanism for effectively fusing\nprominent features from both CNN and Transformer branches. Through\ncomprehensive evaluation across different segmentation tasks including:\nabdominal multi-organs segmentation, infection segmentation (Covid-19 and Bone\nMetastasis), microscopic tissues segmentation (Gland and Nucleus). The proposed\napproach demonstrates state-of-the-art performance and exhibits remarkable\ngeneralization capabilities. This research represents a significant advancement\ntowards addressing the pressing need for efficient and adaptable segmentation\nsolutions in medical imaging applications.\n","authors":["Fares Bougourzi","Fadi Dornaika","Abdelmalik Taleb-Ahmed","Vinh Truong Hoang"],"pdf_url":"https://arxiv.org/pdf/2404.18199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18198v1","updated":"2024-04-28T14:34:28Z","published":"2024-04-28T14:34:28Z","title":"Permutation-equivariant quantum convolutional neural networks","summary":" The Symmetric group $S_{n}$ manifests itself in large classes of quantum\nsystems as the invariance of certain characteristics of a quantum state with\nrespect to permuting the qubits. The subgroups of $S_{n}$ arise, among many\nother contexts, to describe label symmetry of classical images with respect to\nspatial transformations, e.g. reflection or rotation. Equipped with the\nformalism of geometric quantum machine learning, in this work we propose the\narchitectures of equivariant quantum convolutional neural networks (EQCNNs)\nadherent to $S_{n}$ and its subgroups. We demonstrate that a careful choice of\npixel-to-qubit embedding order can facilitate easy construction of EQCNNs for\nsmall subgroups of $S_{n}$. Our novel EQCNN architecture corresponding to the\nfull permutation group $S_{n}$ is built by applying all possible QCNNs with\nequal probability, which can also be conceptualized as a dropout strategy in\nquantum neural networks. For subgroups of $S_{n}$, our numerical results using\nMNIST datasets show better classification accuracy than non-equivariant QCNNs.\nThe $S_{n}$-equivariant QCNN architecture shows significantly improved training\nand test performance than non-equivariant QCNN for classification of connected\nand non-connected graphs. When trained with sufficiently large number of data,\nthe $S_{n}$-equivariant QCNN shows better average performance compared to\n$S_{n}$-equivariant QNN . These results contribute towards building powerful\nquantum machine learning architectures in permutation-symmetric systems.\n","authors":["Sreetama Das","Filippo Caruso"],"pdf_url":"https://arxiv.org/pdf/2404.18198v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.17590v2","updated":"2024-04-28T13:54:29Z","published":"2023-11-29T12:35:34Z","title":"SyncTalk: The Devil is in the Synchronization for Talking Head Synthesis","summary":" Achieving high synchronization in the synthesis of realistic, speech-driven\ntalking head videos presents a significant challenge. 
Traditional Generative\nAdversarial Networks (GAN) struggle to maintain consistent facial identity,\nwhile Neural Radiance Fields (NeRF) methods, although they can address this\nissue, often produce mismatched lip movements, inadequate facial expressions,\nand unstable head poses. A lifelike talking head requires synchronized\ncoordination of subject identity, lip movements, facial expressions, and head\nposes. The absence of these synchronizations is a fundamental flaw, leading to\nunrealistic and artificial outcomes. To address the critical issue of\nsynchronization, identified as the \"devil\" in creating realistic talking heads,\nwe introduce SyncTalk. This NeRF-based method effectively maintains subject\nidentity, enhancing synchronization and realism in talking head synthesis.\nSyncTalk employs a Face-Sync Controller to align lip movements with speech and\ninnovatively uses a 3D facial blendshape model to capture accurate facial\nexpressions. Our Head-Sync Stabilizer optimizes head poses, achieving more\nnatural head movements. The Portrait-Sync Generator restores hair details and\nblends the generated head with the torso for a seamless visual experience.\nExtensive experiments and user studies demonstrate that SyncTalk outperforms\nstate-of-the-art methods in synchronization and realism. We recommend watching\nthe supplementary video: https://ziqiaopeng.github.io/synctalk\n","authors":["Ziqiao Peng","Wentao Hu","Yue Shi","Xiangyu Zhu","Xiaomei Zhang","Hao Zhao","Jun He","Hongyan Liu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2311.17590v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08995v3","updated":"2024-04-28T13:49:54Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. 
Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of \\textbf{9.7}$\\%$ within the Stanford Cars dataset and\n\\textbf{12$\\times$} clustering efficiency within the Herbarium 19 dataset. We\nwill make the code and checkpoints publicly available at\n\\url{https://github.com/xjtuYW/PNP.git}.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v3.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.18178v1","updated":"2024-04-28T13:18:47Z","published":"2024-04-28T13:18:47Z","title":"Assessing Image Quality Using a Simple Generative Representation","summary":" Perceptual image quality assessment (IQA) is the task of predicting the\nvisual quality of an image as perceived by a human observer. Current\nstate-of-the-art techniques are based on deep representations trained in\ndiscriminative manner. Such representations may ignore visually important\nfeatures, if they are not predictive of class labels. Recent generative models\nsuccessfully learn low-dimensional representations using auto-encoding and have\nbeen argued to preserve better visual features. Here we leverage existing\nauto-encoders and propose VAE-QA, a simple and efficient method for predicting\nimage quality in the presence of a full-reference. We evaluate our approach on\nfour standard benchmarks and find that it significantly improves generalization\nacross datasets, has fewer trainable parameters, a smaller memory footprint and\nfaster run time.\n","authors":["Simon Raviv","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2404.18178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18174v1","updated":"2024-04-28T13:12:49Z","published":"2024-04-28T13:12:49Z","title":"Mamba-FETrack: Frame-Event Tracking via State Space Model","summary":" RGB-Event based tracking is an emerging research topic, focusing on how to\neffectively integrate heterogeneous multi-modal data (synchronized exposure\nvideo frames and asynchronous pulse Event stream). Existing works typically\nemploy Transformer based networks to handle these modalities and achieve decent\naccuracy through input-level or feature-level fusion on multiple datasets.\nHowever, these trackers require significant memory consumption and\ncomputational complexity due to the use of self-attention mechanism. This paper\nproposes a novel RGB-Event tracking framework, Mamba-FETrack, based on the\nState Space Model (SSM) to achieve high-performance tracking while effectively\nreducing computational costs and realizing more efficient tracking.\nSpecifically, we adopt two modality-specific Mamba backbone networks to extract\nthe features of RGB frames and Event streams. Then, we also propose to boost\nthe interactive learning between the RGB and Event features using the Mamba\nnetwork. The fused features will be fed into the tracking head for target\nobject localization. Extensive experiments on FELT and FE108 datasets fully\nvalidated the efficiency and effectiveness of our proposed tracker.\nSpecifically, our Mamba-based tracker achieves 43.5/55.6 on the SR/PR metric,\nwhile the ViT-S based tracker (OSTrack) obtains 40.0/50.9. The GPU memory cost\nof ours and ViT-S based tracker is 13.98GB and 15.44GB, which decreased about\n$9.5\\%$. 
The FLOPs and parameters of ours/ViT-S based OSTrack are 59GB/1076GB\nand 7MB/60MB, which decreased about $94.5\\%$ and $88.3\\%$, respectively. We\nhope this work can bring some new insights to the tracking field and greatly\npromote the application of the Mamba architecture in tracking. The source code\nof this work will be released on\n\\url{https://github.com/Event-AHU/Mamba_FETrack}.\n","authors":["Ju Huang","Shiao Wang","Shuai Wang","Zhe Wu","Xiao Wang","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.18174v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2404.16666v2","updated":"2024-04-28T12:49:32Z","published":"2024-04-25T15:06:58Z","title":"PhyRecon: Physically Plausible Neural Scene Reconstruction","summary":" While neural implicit representations have gained popularity in multi-view 3D\nreconstruction, previous work struggles to yield physically plausible results,\nthereby limiting their applications in physics-demanding domains like embodied\nAI and robotics. The lack of plausibility originates from both the absence of\nphysics modeling in the existing pipeline and their inability to recover\nintricate geometrical structures. In this paper, we introduce PhyRecon, which\nstands as the first approach to harness both differentiable rendering and\ndifferentiable physics simulation to learn implicit surface representations.\nOur framework proposes a novel differentiable particle-based physical simulator\nseamlessly integrated with the neural implicit representation. At its core is\nan efficient transformation between SDF-based implicit representation and\nexplicit surface points by our proposed algorithm, Surface Points Marching\nCubes (SP-MC), enabling differentiable learning with both rendering and\nphysical losses. Moreover, we model both rendering and physical uncertainty to\nidentify and compensate for the inconsistent and inaccurate monocular geometric\npriors. The physical uncertainty additionally enables a physics-guided pixel\nsampling to enhance the learning of slender structures. By amalgamating these\ntechniques, our model facilitates efficient joint modeling with appearance,\ngeometry, and physics. Extensive experiments demonstrate that PhyRecon\nsignificantly outperforms all state-of-the-art methods in terms of\nreconstruction quality. Our reconstruction results also yield superior physical\nstability, verified by Isaac Gym, with at least a 40% improvement across all\ndatasets, opening broader avenues for future physics-based applications.\n","authors":["Junfeng Ni","Yixin Chen","Bohan Jing","Nan Jiang","Bin Wang","Bo Dai","Yixin Zhu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.16666v2.pdf","comment":"project page: https://phyrecon.github.io/"},{"id":"http://arxiv.org/abs/2402.14313v2","updated":"2024-04-28T12:34:51Z","published":"2024-02-22T06:04:49Z","title":"Learning to Kern: Set-wise Estimation of Optimal Letter Space","summary":" Kerning is the task of setting appropriate horizontal spaces for all possible\nletter pairs of a certain font. One of the difficulties of kerning is that the\nappropriate space differs for each letter pair. Therefore, for a total of 52\ncapital and small letters, we need to adjust $52 \\times 52 = 2704$ different\nspaces. Another difficulty is that there is neither a general procedure nor\ncriterion for automatic kerning; therefore, kerning is still done manually or\nwith heuristics. In this paper, we tackle kerning by proposing two\nmachine-learning models, called pairwise and set-wise models. 
The former is a\nsimple deep neural network that estimates the letter space for two given letter\nimages. In contrast, the latter is a transformer-based model that estimates the\nletter spaces for three or more given letter images. For example, the set-wise\nmodel simultaneously estimates 2704 spaces for 52 letter images for a certain\nfont. Among the two models, the set-wise model is not only more efficient but\nalso more accurate because its internal self-attention mechanism allows for\nmore consistent kerning for all letters. Experimental results on about 2500\nGoogle fonts and their quantitative and qualitative analyses show that the\nset-wise model has an average estimation error of only about 5.3 pixels when\nthe average letter space of all fonts and letter pairs is about 115 pixels.\n","authors":["Kei Nakatsuru","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2402.14313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18161v1","updated":"2024-04-28T12:25:09Z","published":"2024-04-28T12:25:09Z","title":"IMEX-Reg: Implicit-Explicit Regularization in the Function Space for\n Continual Learning","summary":" Continual learning (CL) remains one of the long-standing challenges for deep\nneural networks due to catastrophic forgetting of previously acquired\nknowledge. Although rehearsal-based approaches have been fairly successful in\nmitigating catastrophic forgetting, they suffer from overfitting on buffered\nsamples and prior information loss, hindering generalization under low-buffer\nregimes. Inspired by how humans learn using strong inductive biases, we propose\nIMEX-Reg to improve the generalization performance of experience rehearsal in\nCL under low buffer regimes. Specifically, we employ a two-pronged\nimplicit-explicit regularization approach using contrastive representation\nlearning (CRL) and consistency regularization. To further leverage the global\nrelationship between representations learned using CRL, we propose a\nregularization strategy to guide the classifier toward the activation\ncorrelations in the unit hypersphere of the CRL. Our results show that IMEX-Reg\nsignificantly improves generalization performance and outperforms\nrehearsal-based approaches in several CL scenarios. It is also robust to\nnatural and adversarial corruptions with less task-recency bias. Additionally,\nwe provide theoretical insights to support our design decisions further.\n","authors":["Prashant Bhat","Bharath Renjith","Elahe Arani","Bahram Zonooz"],"pdf_url":"https://arxiv.org/pdf/2404.18161v1.pdf","comment":"Published in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2404.18156v1","updated":"2024-04-28T12:13:34Z","published":"2024-04-28T12:13:34Z","title":"Event-based Video Frame Interpolation with Edge Guided Motion Refinement","summary":" Video frame interpolation, the process of synthesizing intermediate frames\nbetween sequential video frames, has made remarkable progress with the use of\nevent cameras. These sensors, with microsecond-level temporal resolution, fill\ninformation gaps between frames by providing precise motion cues. However,\ncontemporary Event-Based Video Frame Interpolation (E-VFI) techniques often\nneglect the fact that event data primarily supply high-confidence features at\nscene edges during multi-modal feature fusion, thereby diminishing the role of\nevent signals in optical flow (OF) estimation and warping refinement. 
To\naddress this overlooked aspect, we introduce an end-to-end E-VFI learning\nmethod (referred to as EGMR) to efficiently utilize edge features from event\nsignals for motion flow and warping enhancement. Our method incorporates an\nEdge Guided Attentive (EGA) module, which rectifies estimated video motion\nthrough attentive aggregation based on the local correlation of multi-modal\nfeatures in a coarse-to-fine strategy. Moreover, given that event data can\nprovide accurate visual references at scene edges between consecutive frames,\nwe introduce a learned visibility map derived from event data to adaptively\nmitigate the occlusion problem in the warping refinement process. Extensive\nexperiments on both synthetic and real datasets show the effectiveness of the\nproposed approach, demonstrating its potential for higher quality video frame\ninterpolation.\n","authors":["Yuhan Liu","Yongjian Deng","Hao Chen","Bochen Xie","Youfu Li","Zhen Yang"],"pdf_url":"https://arxiv.org/pdf/2404.18156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18155v1","updated":"2024-04-28T12:12:08Z","published":"2024-04-28T12:12:08Z","title":"ShapeMoiré: Channel-Wise Shape-Guided Network for Image Demoiréing","summary":" Photographing optoelectronic displays often introduces unwanted moir\\'e\npatterns due to analog signal interference between the pixel grids of the\ndisplay and the camera sensor arrays. This work identifies two problems that\nare largely ignored by existing image demoir\\'eing approaches: 1) moir\\'e\npatterns vary across different channels (RGB); 2) repetitive patterns are\nconstantly observed. However, employing conventional convolutional (CNN) layers\ncannot address these problems. Instead, this paper presents the use of our\nrecently proposed Shape concept. It was originally employed to model consistent\nfeatures from fragmented regions, particularly when identical or similar\nobjects coexist in an RGB-D image. Interestingly, we find that the Shape\ninformation effectively captures the moir\\'e patterns in artifact images.\nMotivated by this discovery, we propose a ShapeMoir\\'e method to aid in image\ndemoir\\'eing. Beyond modeling shape features at the patch-level, we further\nextend this to the global image-level and design a novel Shape-Architecture.\nConsequently, our proposed method, equipped with both ShapeConv and\nShape-Architecture, can be seamlessly integrated into existing approaches\nwithout introducing additional parameters or computation overhead during\ninference. We conduct extensive experiments on four widely used datasets, and\nthe results demonstrate that our ShapeMoir\\'e achieves state-of-the-art\nperformance, particularly in terms of the PSNR metric. We then apply our method\nacross four popular architectures to showcase its generalization capabilities.\nMoreover, our ShapeMoir\\'e is robust and viable under real-world demoir\\'eing\nscenarios involving smartphone photographs.\n","authors":["Jinming Cao","Sicheng Shen","Qiu Zhou","Yifang Yin","Yangyan Li","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2404.18155v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.14829v3","updated":"2024-04-28T12:08:26Z","published":"2024-04-23T08:31:55Z","title":"Revisiting Neural Networks for Continual Learning: An Architectural\n Perspective","summary":" Efforts to overcome catastrophic forgetting have primarily centered around\ndeveloping more effective Continual Learning (CL) methods. 
In contrast, less\nattention was devoted to analyzing the role of network architecture design\n(e.g., network depth, width, and components) in contributing to CL. This paper\nseeks to bridge this gap between network architecture design and CL, and to\npresent a holistic study on the impact of network architectures on CL. This\nwork considers architecture design at the network scaling level, i.e., width\nand depth, and also at the network components, i.e., skip connections, global\npooling layers, and down-sampling. In both cases, we first derive insights\nthrough systematically exploring how architectural designs affect CL. Then,\ngrounded in these insights, we craft a specialized search space for CL and\nfurther propose a simple yet effective ArchCraft method to steer a CL-friendly\narchitecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC.\nExperimental validation across various CL settings and scenarios demonstrates\nthat improved architectures are parameter-efficient, achieving state-of-the-art\nperformance of CL while being 86%, 61%, and 97% more compact in terms of\nparameters than the naive CL architecture in Task IL and Class IL. Code is\navailable at https://github.com/byyx666/ArchCraft.\n","authors":["Aojun Lu","Tao Feng","Hangjie Yuan","Xiaotian Song","Yanan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.14829v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16123v2","updated":"2024-04-28T12:03:38Z","published":"2024-02-25T15:46:33Z","title":"InstructEdit: Instruction-based Knowledge Editing for Large Language\n Models","summary":" Knowledge editing for large language models can offer an efficient solution\nto alter a model's behavior without negatively impacting the overall\nperformance. However, the current approaches encounter issues with limited\ngeneralizability across tasks, necessitating one distinct editor for each task,\nsignificantly hindering the broader applications. To address this, we take the\nfirst step to analyze the multi-task generalization issue in knowledge editing.\nSpecifically, we develop an instruction-based editing technique, termed\nInstructEdit, which facilitates the editor's adaptation to various task\nperformances simultaneously using simple instructions. With only one unified\neditor for each LLM, we empirically demonstrate that InstructEdit can improve\nthe editor's control, leading to an average 14.86% increase in Reliability in\nmulti-task editing setting. Furthermore, experiments involving holdout unseen\ntask illustrate that InstructEdit consistently surpass previous strong\nbaselines. To further investigate the underlying mechanisms of\ninstruction-based knowledge editing, we analyze the principal components of the\nediting gradient directions, which unveils that instructions can help control\noptimization direction with stronger OOD generalization. 
Code and datasets are\navailable in https://github.com/zjunlp/EasyEdit.\n","authors":["Ningyu Zhang","Bozhong Tian","Siyuan Cheng","Xiaozhuan Liang","Yi Hu","Kouying Xue","Yanjie Gou","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16123v2.pdf","comment":"IJCAI 2024; the project website is at\n https://www.zjukg.org/project/InstructEdit/"},{"id":"http://arxiv.org/abs/2404.18152v1","updated":"2024-04-28T12:02:38Z","published":"2024-04-28T12:02:38Z","title":"Masked Attention as a Mechanism for Improving Interpretability of Vision\n Transformers","summary":" Vision Transformers are at the heart of the current surge of interest in\nfoundation models for histopathology. They process images by breaking them into\nsmaller patches following a regular grid, regardless of their content. Yet, not\nall parts of an image are equally relevant for its understanding. This is\nparticularly true in computational pathology where background is completely\nnon-informative and may introduce artefacts that could mislead predictions. To\naddress this issue, we propose a novel method that explicitly masks background\nin Vision Transformers' attention mechanism. This ensures tokens corresponding\nto background patches do not contribute to the final image representation,\nthereby improving model robustness and interpretability. We validate our\napproach using prostate cancer grading from whole-slide images as a case study.\nOur results demonstrate that it achieves comparable performance with plain\nself-attention while providing more accurate and clinically meaningful\nattention heatmaps.\n","authors":["Clément Grisi","Geert Litjens","Jeroen van der Laak"],"pdf_url":"https://arxiv.org/pdf/2404.18152v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.18150v1","updated":"2024-04-28T11:55:50Z","published":"2024-04-28T11:55:50Z","title":"RadSimReal: Bridging the Gap Between Synthetic and Real Data in Radar\n Object Detection With Simulation","summary":" Object detection in radar imagery with neural networks shows great potential\nfor improving autonomous driving. However, obtaining annotated datasets from\nreal radar images, crucial for training these networks, is challenging,\nespecially in scenarios with long-range detection and adverse weather and\nlighting conditions where radar performance excels. To address this challenge,\nwe present RadSimReal, an innovative physical radar simulation capable of\ngenerating synthetic radar images with accompanying annotations for various\nradar types and environmental conditions, all without the need for real data\ncollection. Remarkably, our findings demonstrate that training object detection\nmodels on RadSimReal data and subsequently evaluating them on real-world data\nproduce performance levels comparable to models trained and tested on real data\nfrom the same dataset, and even achieves better performance when testing across\ndifferent real datasets. RadSimReal offers advantages over other physical radar\nsimulations that it does not necessitate knowledge of the radar design details,\nwhich are often not disclosed by radar suppliers, and has faster run-time. 
This\ninnovative tool has the potential to advance the development of computer vision\nalgorithms for radar-based autonomous driving applications.\n","authors":["Oded Bialer","Yuval Haitman"],"pdf_url":"https://arxiv.org/pdf/2404.18150v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.18149v1","updated":"2024-04-28T11:48:13Z","published":"2024-04-28T11:48:13Z","title":"Compressed Deepfake Video Detection Based on 3D Spatiotemporal\n Trajectories","summary":" The misuse of deepfake technology by malicious actors poses a potential\nthreat to nations, societies, and individuals. However, existing methods for\ndetecting deepfakes primarily focus on uncompressed videos, such as noise\ncharacteristics, local textures, or frequency statistics. When applied to\ncompressed videos, these methods experience a decrease in detection performance\nand are less suitable for real-world scenarios. In this paper, we propose a\ndeepfake video detection method based on 3D spatiotemporal trajectories.\nSpecifically, we utilize a robust 3D model to construct spatiotemporal motion\nfeatures, integrating feature details from both 2D and 3D frames to mitigate\nthe influence of large head rotation angles or insufficient lighting within\nframes. Furthermore, we separate facial expressions from head movements and\ndesign a sequential analysis method based on phase space motion trajectories to\nexplore the feature differences between genuine and fake faces in deepfake\nvideos. We conduct extensive experiments to validate the performance of our\nproposed method on several compressed deepfake benchmarks. The robustness of\nthe well-designed features is verified by calculating the consistent\ndistribution of facial landmarks before and after video compression.Our method\nyields satisfactory results and showcases its potential for practical\napplications.\n","authors":["Zongmei Chen","Xin Liao","Xiaoshuai Wu","Yanxiang Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06947v2","updated":"2024-04-28T11:41:58Z","published":"2023-12-12T03:04:08Z","title":"MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing","summary":" 3D-aware portrait editing has a wide range of applications in multiple\nfields. However, current approaches are limited due that they can only perform\nmask-guided or text-based editing. Even by fusing the two procedures into a\nmodel, the editing quality and stability cannot be ensured. To address this\nlimitation, we propose \\textbf{MaTe3D}: mask-guided text-based 3D-aware\nportrait editing. In this framework, first, we introduce a new SDF-based 3D\ngenerator which learns local and global representations with proposed SDF and\ndensity consistency losses. This enhances masked-based editing in local areas;\nsecond, we present a novel distillation strategy: Conditional Distillation on\nGeometry and Texture (CDGT). Compared to exiting distillation strategies, it\nmitigates visual ambiguity and avoids mismatch between texture and geometry,\nthereby producing stable texture and convincing geometry while editing.\nAdditionally, we create the CatMask-HQ dataset, a large-scale high-resolution\ncat face annotation for exploration of model generalization and expansion. We\nperform expensive experiments on both the FFHQ and CatMask-HQ datasets to\ndemonstrate the editing quality and stability of the proposed method. Our\nmethod faithfully generates a 3D-aware edited face image based on a modified\nmask and a text prompt. 
Our code and models will be publicly released.\n","authors":["Kangneng Zhou","Daiheng Gao","Xuan Wang","Jie Zhang","Peng Zhang","Xusen Sun","Longhao Zhang","Shiqi Yang","Bang Zhang","Liefeng Bo","Yaxing Wang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.06947v2.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.18143v1","updated":"2024-04-28T11:24:32Z","published":"2024-04-28T11:24:32Z","title":"Tracking Transforming Objects: A Benchmark","summary":" Tracking transforming objects holds significant importance in various fields\ndue to the dynamic nature of many real-world scenarios. By enabling systems\naccurately represent transforming objects over time, tracking transforming\nobjects facilitates advancements in areas such as autonomous systems,\nhuman-computer interaction, and security applications. Moreover, understanding\nthe behavior of transforming objects provides valuable insights into complex\ninteractions or processes, contributing to the development of intelligent\nsystems capable of robust and adaptive perception in dynamic environments.\nHowever, current research in the field mainly focuses on tracking generic\nobjects. In this study, we bridge this gap by collecting a novel dedicated\nDataset for Tracking Transforming Objects, called DTTO, which contains 100\nsequences, amounting to approximately 9.3K frames. We provide carefully\nhand-annotated bounding boxes for each frame within these sequences, making\nDTTO the pioneering benchmark dedicated to tracking transforming objects. We\nthoroughly evaluate 20 state-of-the-art trackers on the benchmark, aiming to\ncomprehend the performance of existing methods and provide a comparison for\nfuture research on DTTO. With the release of DTTO, our goal is to facilitate\nfurther research and applications related to tracking transforming objects.\n","authors":["You Wu","Yuelong Wang","Yaxin Liao","Fuliang Wu","Hengzhou Ye","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2404.18143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03473v2","updated":"2024-04-28T10:52:32Z","published":"2024-03-06T05:13:28Z","title":"Inverse-Free Fast Natural Gradient Descent Method for Deep Learning","summary":" Second-order optimization techniques have the potential to achieve faster\nconvergence rates compared to first-order methods through the incorporation of\nsecond-order derivatives or statistics. However, their utilization in deep\nlearning is limited due to their computational inefficiency. Various approaches\nhave been proposed to address this issue, primarily centered on minimizing the\nsize of the matrix to be inverted. Nevertheless, the necessity of performing\nthe inverse operation iteratively persists. In this work, we present a fast\nnatural gradient descent (FNGD) method that only requires inversion during the\nfirst epoch. Specifically, it is revealed that natural gradient descent (NGD)\nis essentially a weighted sum of per-sample gradients. Our novel approach\nfurther proposes to share these weighted coefficients across epochs without\naffecting empirical performance. Consequently, FNGD exhibits similarities to\nthe average sum in first-order methods, leading to the computational complexity\nof FNGD being comparable to that of first-order methods. Extensive experiments\non image classification and machine translation tasks demonstrate the\nefficiency of the proposed FNGD. For training ResNet-18 on CIFAR-100, FNGD can\nachieve a speedup of 2.07$\\times$ compared with KFAC. 
For training Transformer\non Multi30K, FNGD outperforms AdamW by 24 BLEU score while requiring almost the\nsame training time.\n","authors":["Xinwei Ou","Ce Zhu","Xiaolin Huang","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2403.03473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18136v1","updated":"2024-04-28T10:16:35Z","published":"2024-04-28T10:16:35Z","title":"SafePaint: Anti-forensic Image Inpainting with Domain Adaptation","summary":" Existing image inpainting methods have achieved remarkable accomplishments in\ngenerating visually appealing results, often accompanied by a trend toward\ncreating more intricate structural textures. However, while these models excel\nat creating more realistic image content, they often leave noticeable traces of\ntampering, posing a significant threat to security. In this work, we take the\nanti-forensic capabilities into consideration, firstly proposing an end-to-end\ntraining framework for anti-forensic image inpainting named SafePaint.\nSpecifically, we innovatively formulated image inpainting as two major tasks:\nsemantically plausible content completion and region-wise optimization. The\nformer is similar to current inpainting methods that aim to restore the missing\nregions of corrupted images. The latter, through domain adaptation, endeavors\nto reconcile the discrepancies between the inpainted region and the unaltered\narea to achieve anti-forensic goals. Through comprehensive theoretical\nanalysis, we validate the effectiveness of domain adaptation for anti-forensic\nperformance. Furthermore, we meticulously crafted a region-wise separated\nattention (RWSA) module, which not only aligns with our objective of\nanti-forensics but also enhances the performance of the model. Extensive\nqualitative and quantitative evaluations show our approach achieves comparable\nresults to existing image inpainting methods while offering anti-forensic\ncapabilities not available in other methods.\n","authors":["Dunyun Chen","Xin Liao","Xiaoshuai Wu","Shiwei Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13788v2","updated":"2024-04-28T10:15:37Z","published":"2024-04-21T22:33:57Z","title":"AnyPattern: Towards In-context Image Copy Detection","summary":" This paper explores in-context learning for image copy detection (ICD), i.e.,\nprompting an ICD model to identify replicated images with new tampering\npatterns without the need for additional training. The prompts (or the\ncontexts) are from a small set of image-replica pairs that reflect the new\npatterns and are used at inference time. Such in-context ICD has good realistic\nvalue, because it requires no fine-tuning and thus facilitates fast reaction\nagainst the emergence of unseen patterns. To accommodate the \"seen\n$\\rightarrow$ unseen\" generalization scenario, we construct the first\nlarge-scale pattern dataset named AnyPattern, which has the largest number of\ntamper patterns ($90$ for training and $10$ for testing) among all the existing\nones. We benchmark AnyPattern with popular ICD methods and reveal that existing\nmethods barely generalize to novel patterns. We further propose a simple\nin-context ICD method named ImageStacker. 
ImageStacker learns to select the\nmost representative image-replica pairs and employs them as the pattern prompts\nin a stacking manner (rather than the popular concatenation manner).\nExperimental results show (1) training with our large-scale dataset\nsubstantially benefits pattern generalization ($+26.66 \\%$ $\\mu AP$), (2) the\nproposed ImageStacker facilitates effective in-context ICD (another round of\n$+16.75 \\%$ $\\mu AP$), and (3) AnyPattern enables in-context ICD, i.e., without\nsuch a large-scale dataset, in-context learning does not emerge even with our\nImageStacker. Beyond the ICD task, we also demonstrate how AnyPattern can\nbenefit artists, i.e., the pattern retrieval method trained on AnyPattern can\nbe generalized to identify style mimicry by text-to-image models. The project\nis publicly available at https://anypattern.github.io.\n","authors":["Wenhao Wang","Yifan Sun","Zhentao Tan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13788v2.pdf","comment":"The project is publicly available at https://anypattern.github.io.\n arXiv admin note: text overlap with arXiv:2403.06098"},{"id":"http://arxiv.org/abs/2401.11824v3","updated":"2024-04-28T09:58:57Z","published":"2024-01-22T10:37:59Z","title":"Rethinking Centered Kernel Alignment in Knowledge Distillation","summary":" Knowledge distillation has emerged as a highly effective method for bridging\nthe representation discrepancy between large-scale models and lightweight\nmodels. Prevalent approaches involve leveraging appropriate metrics to minimize\nthe divergence or distance between the knowledge extracted from the teacher\nmodel and the knowledge learned by the student model. Centered Kernel Alignment\n(CKA) is widely used to measure representation similarity and has been applied\nin several knowledge distillation methods. However, these methods are complex\nand fail to uncover the essence of CKA, thus not answering the question of how\nto use CKA to achieve simple and effective distillation properly. This paper\nfirst provides a theoretical perspective to illustrate the effectiveness of\nCKA, which decouples CKA to the upper bound of Maximum Mean Discrepancy~(MMD)\nand a constant term. Drawing from this, we propose a novel Relation-Centered\nKernel Alignment~(RCKA) framework, which practically establishes a connection\nbetween CKA and MMD. Furthermore, we dynamically customize the application of\nCKA based on the characteristics of each task, with less computational source\nyet comparable performance than the previous methods. The extensive experiments\non the CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves\nstate-of-the-art performance on almost all teacher-student pairs for image\nclassification and object detection, validating the effectiveness of our\napproaches. Our code is available in https://github.com/Klayand/PCKA\n","authors":["Zikai Zhou","Yunhang Shen","Shitong Shao","Linrui Gong","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11824v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11470v2","updated":"2024-04-28T09:29:33Z","published":"2023-07-21T10:10:18Z","title":"Physics-Aware Semi-Supervised Underwater Image Enhancement","summary":" Underwater images normally suffer from degradation due to the transmission\nmedium of water bodies. Both traditional prior-based approaches and deep\nlearning-based methods have been used to address this problem. 
However, the\ninflexible assumption of the former often impairs their effectiveness in\nhandling diverse underwater scenes, while the generalization of the latter to\nunseen images is usually weakened by insufficient data. In this study, we\nleverage both the physics-based underwater Image Formation Model (IFM) and deep\nlearning techniques for Underwater Image Enhancement (UIE). To this end, we\npropose a novel Physics-Aware Dual-Stream Underwater Image Enhancement Network,\ni.e., PA-UIENet, which comprises a Transmission Estimation Steam (T-Stream) and\nan Ambient Light Estimation Stream (A-Stream). This network fulfills the UIE\ntask by explicitly estimating the degradation parameters of the IFM. We also\nadopt an IFM-inspired semi-supervised learning framework, which exploits both\nthe labeled and unlabeled images, to address the issue of insufficient data.\nOur method performs better than, or at least comparably to, eight baselines\nacross five testing sets in the degradation estimation and UIE tasks. This\nshould be due to the fact that it not only can model the degradation but also\ncan learn the characteristics of diverse underwater scenes.\n","authors":["Hao Qi","Xinghui Dong"],"pdf_url":"https://arxiv.org/pdf/2307.11470v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.09353v4","updated":"2024-04-28T09:06:50Z","published":"2024-02-14T17:59:34Z","title":"DoRA: Weight-Decomposed Low-Rank Adaptation","summary":" Among the widely used parameter-efficient finetuning (PEFT) methods, LoRA and\nits variants have gained considerable popularity because of avoiding additional\ninference costs. However, there still often exists an accuracy gap between\nthese methods and full fine-tuning (FT). In this work, we first introduce a\nnovel weight decomposition analysis to investigate the inherent differences\nbetween FT and LoRA. Aiming to resemble the learning capacity of FT from the\nfindings, we propose Weight-Decomposed LowRank Adaptation (DoRA). DoRA\ndecomposes the pre-trained weight into two components, magnitude and direction,\nfor fine-tuning, specifically employing LoRA for directional updates to\nefficiently minimize the number of trainable parameters. By employing DoRA, we\nenhance both the learning capacity and training stability of LoRA while\navoiding any additional inference overhead. DoRA consistently outperforms LoRA\non fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as\ncommonsense reasoning, visual instruction tuning, and image/video-text\nunderstanding. Code available at https://github.com/NVlabs/DoRA.\n","authors":["Shih-Yang Liu","Chien-Yi Wang","Hongxu Yin","Pavlo Molchanov","Yu-Chiang Frank Wang","Kwang-Ting Cheng","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2402.09353v4.pdf","comment":"Code available at https://github.com/NVlabs/DoRA"},{"id":"http://arxiv.org/abs/2402.15659v2","updated":"2024-04-28T09:01:22Z","published":"2024-02-24T00:25:22Z","title":"DeepLight: Reconstructing High-Resolution Observations of Nighttime\n Light With Multi-Modal Remote Sensing Data","summary":" Nighttime light (NTL) remote sensing observation serves as a unique proxy for\nquantitatively assessing progress toward meeting a series of Sustainable\nDevelopment Goals (SDGs), such as poverty estimation, urban sustainable\ndevelopment, and carbon emission. However, existing NTL observations often\nsuffer from pervasive degradation and inconsistency, limiting their utility for\ncomputing the indicators defined by the SDGs. 
In this study, we propose a novel\napproach to reconstruct high-resolution NTL images using multi-modal remote\nsensing data. To support this research endeavor, we introduce DeepLightMD, a\ncomprehensive dataset comprising data from five heterogeneous sensors, offering\nfine spatial resolution and rich spectral information at a national scale.\nAdditionally, we present DeepLightSR, a calibration-aware method for building\nbridges between spatially heterogeneous modality data in the multi-modality\nsuper-resolution. DeepLightSR integrates calibration-aware alignment, an\nauxiliary-to-main multi-modality fusion, and an auxiliary-embedded refinement\nto effectively address spatial heterogeneity, fuse diversely representative\nfeatures, and enhance performance in $8\\times$ super-resolution (SR) tasks.\nExtensive experiments demonstrate the superiority of DeepLightSR over 8\ncompeting methods, as evidenced by improvements in PSNR (2.01 dB $ \\sim $ 13.25\ndB) and PIQE (0.49 $ \\sim $ 9.32). Our findings underscore the practical\nsignificance of our proposed dataset and model in reconstructing\nhigh-resolution NTL data, supporting efficiently and quantitatively assessing\nthe SDG progress.\n","authors":["Lixian Zhang","Runmin Dong","Shuai Yuan","Jinxiao Zhang","Mengxuan Chen","Juepeng Zheng","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2402.15659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18114v1","updated":"2024-04-28T08:44:28Z","published":"2024-04-28T08:44:28Z","title":"Deep Boosting Learning: A Brand-new Cooperative Approach for Image-Text\n Matching","summary":" Image-text matching remains a challenging task due to heterogeneous semantic\ndiversity across modalities and insufficient distance separability within\ntriplets. Different from previous approaches focusing on enhancing multi-modal\nrepresentations or exploiting cross-modal correspondence for more accurate\nretrieval, in this paper we aim to leverage the knowledge transfer between peer\nbranches in a boosting manner to seek a more powerful matching model.\nSpecifically, we propose a brand-new Deep Boosting Learning (DBL) algorithm,\nwhere an anchor branch is first trained to provide insights into the data\nproperties, with a target branch gaining more advanced knowledge to develop\noptimal features and distance metrics. Concretely, an anchor branch initially\nlearns the absolute or relative distance between positive and negative pairs,\nproviding a foundational understanding of the particular network and data\ndistribution. Building upon this knowledge, a target branch is concurrently\ntasked with more adaptive margin constraints to further enlarge the relative\ndistance between matched and unmatched samples. Extensive experiments validate\nthat our DBL can achieve impressive and consistent improvements based on\nvarious recent state-of-the-art models in the image-text matching field, and\noutperform related popular cooperative strategies, e.g., Conventional\nDistillation, Mutual Learning, and Contrastive Learning. Beyond the above, we\nconfirm that DBL can be seamlessly integrated into their training scenarios and\nachieve superior performance under the same computational costs, demonstrating\nthe flexibility and broad applicability of our proposed method. 
Our code is\npublicly available at: https://github.com/Paranioar/DBL.\n","authors":["Haiwen Diao","Ying Zhang","Shang Gao","Xiang Ruan","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.18114v1.pdf","comment":"12 pages, 9 figures, Accepted by TIP2024"},{"id":"http://arxiv.org/abs/2404.14037v2","updated":"2024-04-28T08:39:38Z","published":"2024-04-22T09:51:43Z","title":"GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian\n Splatting","summary":" Recent works on audio-driven talking head synthesis using Neural Radiance\nFields (NeRF) have achieved impressive results. However, due to inadequate pose\nand expression control caused by NeRF implicit representation, these methods\nstill have some limitations, such as unsynchronized or unnatural lip movements,\nand visual jitter and artifacts. In this paper, we propose GaussianTalker, a\nnovel method for audio-driven talking head synthesis based on 3D Gaussian\nSplatting. With the explicit representation property of 3D Gaussians, intuitive\ncontrol of the facial motion is achieved by binding Gaussians to 3D facial\nmodels. GaussianTalker consists of two modules, Speaker-specific Motion\nTranslator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator\nachieves accurate lip movements specific to the target speaker through\nuniversalized audio feature extraction and customized lip motion generation.\nDynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance\nfacial detail representation via a latent pose, delivering stable and realistic\nrendered videos. Extensive experimental results suggest that GaussianTalker\noutperforms existing state-of-the-art methods in talking head synthesis,\ndelivering precise lip synchronization and exceptional visual quality. Our\nmethod achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU,\nsignificantly exceeding the threshold for real-time rendering performance, and\ncan potentially be deployed on other hardware platforms.\n","authors":["Hongyun Yu","Zhan Qu","Qihang Yu","Jianchuan Chen","Zhonghua Jiang","Zhiwen Chen","Shengyu Zhang","Jimin Xu","Fei Wu","Chengfei Lv","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2404.14037v2.pdf","comment":"https://yuhongyun777.github.io/GaussianTalker/"},{"id":"http://arxiv.org/abs/2404.18112v1","updated":"2024-04-28T08:36:32Z","published":"2024-04-28T08:36:32Z","title":"Garbage Segmentation and Attribute Analysis by Robotic Dogs","summary":" Efficient waste management and recycling heavily rely on garbage exploration\nand identification. In this study, we propose GSA2Seg (Garbage Segmentation and\nAttribute Analysis), a novel visual approach that utilizes quadruped robotic\ndogs as autonomous agents to address waste management and recycling challenges\nin diverse indoor and outdoor environments. Equipped with advanced visual\nperception system, including visual sensors and instance segmentators, the\nrobotic dogs adeptly navigate their surroundings, diligently searching for\ncommon garbage items. Inspired by open-vocabulary algorithms, we introduce an\ninnovative method for object attribute analysis. By combining garbage\nsegmentation and attribute analysis techniques, the robotic dogs accurately\ndetermine the state of the trash, including its position and placement\nproperties. This information enhances the robotic arm's grasping capabilities,\nfacilitating successful garbage retrieval. Additionally, we contribute an image\ndataset, named GSA2D, to support evaluation. 
Through extensive experiments on\nGSA2D, this paper provides a comprehensive analysis of GSA2Seg's effectiveness.\nDataset available:\n\\href{https://www.kaggle.com/datasets/hellob/gsa2d-2024}{https://www.kaggle.com/datasets/hellob/gsa2d-2024}.\n","authors":["Nuo Xu","Jianfeng Liao","Qiwei Meng","Wei Song"],"pdf_url":"https://arxiv.org/pdf/2404.18112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13621v2","updated":"2024-04-28T08:05:55Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. Robustness of these\ntechniques, however, remains a concern, particularly in the face of adversarial\nattacks that have been proven to deceive state-of-the-art deep neural networks\nin many domains. Surprisingly, the robustness of scene flow networks against\nsuch attacks has not been thoroughly investigated. To address this problem, the\nproposed approach aims to bridge this gap by introducing adversarial white-box\nattacks specifically tailored for scene flow networks. Experimental results\nshow that the generated adversarial examples obtain up to 33.7 relative\ndegradation in average end-point error on the KITTI and FlyingThings3D\ndatasets. The study also reveals the significant impact that attacks targeting\npoint clouds in only one dimension or color channel have on average end-point\nerror. Analyzing the success and failure of these attacks on the scene flow\nnetworks and their 2D optical flow network variants show a higher vulnerability\nfor the optical flow networks.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15761v2","updated":"2024-04-28T08:04:53Z","published":"2024-02-24T08:20:39Z","title":"Res-VMamba: Fine-Grained Food Category Visual Classification Using\n Selective State Space Models with Deep Residual Learning","summary":" Food classification is the foundation for developing food vision tasks and\nplays a key role in the burgeoning field of computational nutrition. Due to the\ncomplexity of food requiring fine-grained classification, recent academic\nresearch mainly modifies Convolutional Neural Networks (CNNs) and/or Vision\nTransformers (ViTs) to perform food category classification. However, to learn\nfine-grained features, the CNN backbone needs additional structural design,\nwhereas ViT, containing the self-attention module, has increased computational\ncomplexity. In recent months, a new Sequence State Space (S4) model, through a\nSelection mechanism and computation with a Scan (S6), colloquially termed\nMamba, has demonstrated superior performance and computation efficiency\ncompared to the Transformer architecture. The VMamba model, which incorporates\nthe Mamba mechanism into image tasks (such as classification), currently\nestablishes the state-of-the-art (SOTA) on the ImageNet dataset. In this\nresearch, we introduce an academically underestimated food dataset CNFOOD-241,\nand pioneer the integration of a residual learning framework within the VMamba\nmodel to concurrently harness both global and local state features inherent in\nthe original VMamba architectural design. The research results show that VMamba\nsurpasses current SOTA models in fine-grained and food classification. 
The\nproposed Res-VMamba further improves the classification accuracy to 79.54\\%\nwithout pretrained weight. Our findings elucidate that our proposed methodology\nestablishes a new benchmark for SOTA performance in food recognition on the\nCNFOOD-241 dataset. The code can be obtained on GitHub:\nhttps://github.com/ChiShengChen/ResVMamba.\n","authors":["Chi-Sheng Chen","Guan-Ying Chen","Dong Zhou","Di Jiang","Dai-Shi Chen"],"pdf_url":"https://arxiv.org/pdf/2402.15761v2.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.18109v1","updated":"2024-04-28T08:04:04Z","published":"2024-04-28T08:04:04Z","title":"Finding Beautiful and Happy Images for Mental Health and Well-being\n Applications","summary":" This paper explores how artificial intelligence (AI) technology can\ncontribute to achieve progress on good health and well-being, one of the United\nNations' 17 Sustainable Development Goals. It is estimated that one in ten of\nthe global population lived with a mental disorder. Inspired by studies showing\nthat engaging and viewing beautiful natural images can make people feel happier\nand less stressful, lead to higher emotional well-being, and can even have\ntherapeutic values, we explore how AI can help to promote mental health by\ndeveloping automatic algorithms for finding beautiful and happy images. We\nfirst construct a large image database consisting of nearly 20K very high\nresolution colour photographs of natural scenes where each image is labelled\nwith beautifulness and happiness scores by about 10 observers. Statistics of\nthe database shows that there is a good correlation between the beautifulness\nand happiness scores which provides anecdotal evidence to corroborate that\nengaging beautiful natural images can potentially benefit mental well-being.\nBuilding on this unique database, the very first of its kind, we have developed\na deep learning based model for automatically predicting the beautifulness and\nhappiness scores of natural images. Experimental results are presented to show\nthat it is possible to develop AI algorithms to automatically assess an image's\nbeautifulness and happiness values which can in turn be used to develop\napplications for promoting mental health and well-being.\n","authors":["Ruitao Xie","Connor Qiu","Guoping Qiu"],"pdf_url":"https://arxiv.org/pdf/2404.18109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18106v1","updated":"2024-04-28T07:47:52Z","published":"2024-04-28T07:47:52Z","title":"Semi-supervised Text-based Person Search","summary":" Text-based person search (TBPS) aims to retrieve images of a specific person\nfrom a large image gallery based on a natural language description. Existing\nmethods rely on massive annotated image-text data to achieve satisfactory\nperformance in fully-supervised learning. It poses a significant challenge in\npractice, as acquiring person images from surveillance videos is relatively\neasy, while obtaining annotated texts is challenging. The paper undertakes a\npioneering initiative to explore TBPS under the semi-supervised setting, where\nonly a limited number of person images are annotated with textual descriptions\nwhile the majority of images lack annotations. We present a two-stage basic\nsolution based on generation-then-retrieval for semi-supervised TBPS. The\ngeneration stage enriches annotated data by applying an image captioning model\nto generate pseudo-texts for unannotated images. 
Later, the retrieval stage\nperforms fully-supervised retrieval learning using the augmented data.\nSignificantly, considering the noise interference of the pseudo-texts on\nretrieval learning, we propose a noise-robust retrieval framework that enhances\nthe ability of the retrieval model to handle noisy data. The framework\nintegrates two key strategies: Hybrid Patch-Channel Masking (PC-Mask) to refine\nthe model architecture, and Noise-Guided Progressive Training (NP-Train) to\nenhance the training process. PC-Mask performs masking on the input data at\nboth the patch-level and the channel-level to prevent overfitting noisy\nsupervision. NP-Train introduces a progressive training schedule based on the\nnoise level of pseudo-texts to facilitate noise-robust learning. Extensive\nexperiments on multiple TBPS benchmarks show that the proposed framework\nachieves promising performance under the semi-supervised setting.\n","authors":["Daming Gao","Yang Bai","Min Cao","Hao Dou","Mang Ye","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18106v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.18096v1","updated":"2024-04-28T07:01:55Z","published":"2024-04-28T07:01:55Z","title":"Snake with Shifted Window: Learning to Adapt Vessel Pattern for OCTA\n Segmentation","summary":" Segmenting specific targets or structures in optical coherence tomography\nangiography (OCTA) images is fundamental for conducting further pathological\nstudies. The retinal vascular layers are rich and intricate, and such vascular\nwith complex shapes can be captured by the widely-studied OCTA images. In this\npaper, we thus study how to use OCTA images with projection vascular layers to\nsegment retinal structures. To this end, we propose the SSW-OCTA model, which\nintegrates the advantages of deformable convolutions suited for tubular\nstructures and the swin-transformer for global feature extraction, adapting to\nthe characteristics of OCTA modality images. Our model underwent testing and\ncomparison on the OCTA-500 dataset, achieving state-of-the-art performance. The\ncode is available at: https://github.com/ShellRedia/Snake-SWin-OCTA.\n","authors":["Xinrun Chen","Mei Shen","Haojian Ning","Mengzhan Zhang","Chengliang Wang","Shiying Li"],"pdf_url":"https://arxiv.org/pdf/2404.18096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07125v3","updated":"2024-04-28T06:42:58Z","published":"2023-11-13T07:34:53Z","title":"Attention-Challenging Multiple Instance Learning for Whole Slide Image\n Classification","summary":" In the application of Multiple Instance Learning (MIL) methods for Whole\nSlide Image (WSI) classification, attention mechanisms often focus on a subset\nof discriminative instances, which are closely linked to overfitting. To\nmitigate overfitting, we present Attention-Challenging MIL (ACMIL). ACMIL\ncombines two techniques based on separate analyses for attention value\nconcentration. Firstly, UMAP of instance features reveals various patterns\namong discriminative instances, with existing attention mechanisms capturing\nonly some of them. 
To remedy this, we introduce Multiple Branch Attention (MBA)\nto capture more discriminative instances using multiple attention branches.\nSecondly, the examination of the cumulative value of Top-K attention scores\nindicates that a tiny number of instances dominate the majority of attention.\nIn response, we present Stochastic Top-K Instance Masking (STKIM), which masks\nout a portion of instances with Top-K attention values and allocates their\nattention values to the remaining instances. The extensive experimental results\non three WSI datasets with two pre-trained backbones reveal that our ACMIL\noutperforms state-of-the-art methods. Additionally, through heatmap\nvisualization and UMAP visualization, this paper extensively illustrates\nACMIL's effectiveness in suppressing attention value concentration and\novercoming the overfitting challenge. The source code is available at\n\\url{https://github.com/dazhangyu123/ACMIL}.\n","authors":["Yunlong Zhang","Honglin Li","Yuxuan Sun","Sunyi Zheng","Chenglu Zhu","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.07125v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.18083v1","updated":"2024-04-28T06:25:56Z","published":"2024-04-28T06:25:56Z","title":"Online,Target-Free LiDAR-Camera Extrinsic Calibration via Cross-Modal\n Mask Matching","summary":" LiDAR-camera extrinsic calibration (LCEC) is crucial for data fusion in\nintelligent vehicles. Offline, target-based approaches have long been the\npreferred choice in this field. However, they often demonstrate poor\nadaptability to real-world environments. This is largely because extrinsic\nparameters may change significantly due to moderate shocks or during extended\noperations in environments with vibrations. In contrast, online, target-free\napproaches provide greater adaptability yet typically lack robustness,\nprimarily due to the challenges in cross-modal feature matching. Therefore, in\nthis article, we unleash the full potential of large vision models (LVMs),\nwhich are emerging as a significant trend in the fields of computer vision and\nrobotics, especially for embodied artificial intelligence, to achieve robust\nand accurate online, target-free LCEC across a variety of challenging\nscenarios. Our main contributions are threefold: we introduce a novel framework\nknown as MIAS-LCEC, provide an open-source versatile calibration toolbox with\nan interactive visualization interface, and publish three real-world datasets\ncaptured from various indoor and outdoor environments. The cornerstone of our\nframework and toolbox is the cross-modal mask matching (C3M) algorithm,\ndeveloped based on a state-of-the-art (SoTA) LVM and capable of generating\nsufficient and reliable matches. Extensive experiments conducted on these\nreal-world datasets demonstrate the robustness of our approach and its superior\nperformance compared to SoTA methods, particularly for the solid-state LiDARs\nwith super-wide fields of view.\n","authors":["Zhiwei Huang","Yikang Zhang","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.18083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12310v3","updated":"2024-04-28T05:54:15Z","published":"2023-04-24T17:57:43Z","title":"Fully Sparse Fusion for 3D Object Detection","summary":" Currently prevalent multimodal 3D detection methods are built upon\nLiDAR-based detectors that usually use dense Bird's-Eye-View (BEV) feature\nmaps. 
However, the cost of such BEV feature maps is quadratic to the detection\nrange, making it not suitable for long-range detection. Fully sparse\narchitecture is gaining attention as they are highly efficient in long-range\nperception. In this paper, we study how to effectively leverage image modality\nin the emerging fully sparse architecture. Particularly, utilizing instance\nqueries, our framework integrates the well-studied 2D instance segmentation\ninto the LiDAR side, which is parallel to the 3D instance segmentation part in\nthe fully sparse detector. This design achieves a uniform query-based fusion\nframework in both the 2D and 3D sides while maintaining the fully sparse\ncharacteristic. Extensive experiments showcase state-of-the-art results on the\nwidely used nuScenes dataset and the long-range Argoverse 2 dataset. Notably,\nthe inference speed of the proposed method under the long-range LiDAR\nperception setting is 2.7 $\\times$ faster than that of other state-of-the-art\nmultimodal 3D detection methods. Code will be released at\n\\url{https://github.com/BraveGroup/FullySparseFusion}.\n","authors":["Yingyan Li","Lue Fan","Yang Liu","Zehao Huang","Yuntao Chen","Naiyan Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.12310v3.pdf","comment":"TPMAI 2024"},{"id":"http://arxiv.org/abs/2212.09298v3","updated":"2024-04-28T05:23:35Z","published":"2022-12-19T08:31:08Z","title":"From a Bird's Eye View to See: Joint Camera and Subject Registration\n without the Camera Calibration","summary":" We tackle a new problem of multi-view camera and subject registration in the\nbird's eye view (BEV) without pre-given camera calibration. This is a very\nchallenging problem since its only input is several RGB images from different\nfirst-person views (FPVs) for a multi-person scene, without the BEV image and\nthe calibration of the FPVs, while the output is a unified plane with the\nlocalization and orientation of both the subjects and cameras in a BEV. We\npropose an end-to-end framework solving this problem, whose main idea can be\ndivided into following parts: i) creating a view-transform subject detection\nmodule to transform the FPV to a virtual BEV including localization and\norientation of each pedestrian, ii) deriving a geometric transformation based\nmethod to estimate camera localization and view direction, i.e., the camera\nregistration in a unified BEV, iii) making use of spatial and appearance\ninformation to aggregate the subjects into the unified BEV. We collect a new\nlarge-scale synthetic dataset with rich annotations for evaluation. The\nexperimental results show the remarkable effectiveness of our proposed method.\n","authors":["Zekun Qian","Ruize Han","Wei Feng","Feifan Wang","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2212.09298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.04386v3","updated":"2024-04-28T05:13:35Z","published":"2021-12-07T07:46:18Z","title":"Which images to label for few-shot medical landmark detection?","summary":" The success of deep learning methods relies on the availability of\nwell-labeled large-scale datasets. However, for medical images, annotating such\nabundant training data often requires experienced radiologists and consumes\ntheir limited time. 
Few-shot learning is developed to alleviate this burden,\nwhich achieves competitive performances with only several labeled data.\nHowever, a crucial yet previously overlooked problem in few-shot learning is\nabout the selection of template images for annotation before learning, which\naffects the final performance. We herein propose a novel Sample Choosing Policy\n(SCP) to select \"the most worthy\" images for annotation, in the context of\nfew-shot medical landmark detection. SCP consists of three parts: 1)\nSelf-supervised training for building a pre-trained deep model to extract\nfeatures from radiological images, 2) Key Point Proposal for localizing\ninformative patches, and 3) Representative Score Estimation for searching the\nmost representative samples or templates. The advantage of SCP is demonstrated\nby various experiments on three widely-used public datasets. For one-shot\nmedical landmark detection, its use reduces the mean radial errors on\nCephalometric and HandXray datasets by 14.2% (from 3.595mm to 3.083mm) and\n35.5% (4.114mm to 2.653mm), respectively.\n","authors":["Quan Quan","Qingsong Yao","Jun Li","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2112.04386v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18066v1","updated":"2024-04-28T04:32:44Z","published":"2024-04-28T04:32:44Z","title":"Quantized Context Based LIF Neurons for Recurrent Spiking Neural\n Networks in 45nm","summary":" In this study, we propose the first hardware implementation of a\ncontext-based recurrent spiking neural network (RSNN) emphasizing on\nintegrating dual information streams within the neocortical pyramidal neurons\nspecifically Context- Dependent Leaky Integrate and Fire (CLIF) neuron models,\nessential element in RSNN. We present a quantized version of the CLIF neuron\n(qCLIF), developed through a hardware-software codesign approach utilizing the\nsparse activity of RSNN. Implemented in a 45nm technology node, the qCLIF is\ncompact (900um^2) and achieves a high accuracy of 90% despite 8 bit\nquantization on DVS gesture classification dataset. Our analysis spans a\nnetwork configuration from 10 to 200 qCLIF neurons, supporting up to 82k\nsynapses within a 1.86 mm^2 footprint, demonstrating scalability and efficiency\n","authors":["Sai Sukruth Bezugam","Yihao Wu","JaeBum Yoo","Dmitri Strukov","Bongjin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.18066v1.pdf","comment":"7 Pages, 7 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2404.18065v1","updated":"2024-04-28T04:05:10Z","published":"2024-04-28T04:05:10Z","title":"Grounded Compositional and Diverse Text-to-3D with Pretrained Multi-View\n Diffusion Model","summary":" In this paper, we propose an effective two-stage approach named\nGrounded-Dreamer to generate 3D assets that can accurately follow complex,\ncompositional text prompts while achieving high fidelity by using a pre-trained\nmulti-view diffusion model. Multi-view diffusion models, such as MVDream, have\nshown to generate high-fidelity 3D assets using score distillation sampling\n(SDS). However, applied naively, these methods often fail to comprehend\ncompositional text prompts, and may often entirely omit certain subjects or\nparts. To address this issue, we first advocate leveraging text-guided 4-view\nimages as the bottleneck in the text-to-3D pipeline. We then introduce an\nattention refocusing mechanism to encourage text-aligned 4-view image\ngeneration, without the necessity to re-train the multi-view diffusion model or\ncraft a high-quality compositional 3D dataset. 
We further propose a hybrid\noptimization strategy to encourage synergy between the SDS loss and the sparse\nRGB reference images. Our method consistently outperforms previous\nstate-of-the-art (SOTA) methods in generating compositional 3D assets,\nexcelling in both quality and accuracy, and enabling diverse 3D from the same\ntext prompt.\n","authors":["Xiaolong Li","Jiawei Mo","Ying Wang","Chethan Parameshwara","Xiaohan Fei","Ashwin Swaminathan","CJ Taylor","Zhuowen Tu","Paolo Favaro","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.18065v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.18062v1","updated":"2024-04-28T03:47:48Z","published":"2024-04-28T03:47:48Z","title":"Compressed Image Captioning using CNN-based Encoder-Decoder Framework","summary":" In today's world, image processing plays a crucial role across various\nfields, from scientific research to industrial applications. But one\nparticularly exciting application is image captioning. The potential impact of\neffective image captioning is vast. It can significantly boost the accuracy of\nsearch engines, making it easier to find relevant information. Moreover, it can\ngreatly enhance accessibility for visually impaired individuals, providing them\nwith a more immersive experience of digital content. However, despite its\npromise, image captioning presents several challenges. One major hurdle is\nextracting meaningful visual information from images and transforming it into\ncoherent language. This requires bridging the gap between the visual and\nlinguistic domains, a task that demands sophisticated algorithms and models.\nOur project is focused on addressing these challenges by developing an\nautomatic image captioning architecture that combines the strengths of\nconvolutional neural networks (CNNs) and encoder-decoder models. The CNN model\nis used to extract the visual features from images, and later, with the help of\nthe encoder-decoder framework, captions are generated. We also did a\nperformance comparison where we delved into the realm of pre-trained CNN\nmodels, experimenting with multiple architectures to understand their\nperformance variations. In our quest for optimization, we also explored the\nintegration of frequency regularization techniques to compress the \"AlexNet\"\nand \"EfficientNetB0\" model. We aimed to see if this compressed model could\nmaintain its effectiveness in generating image captions while being more\nresource-efficient.\n","authors":["Md Alif Rahman Ridoy","M Mahmud Hasan","Shovon Bhowmick"],"pdf_url":"https://arxiv.org/pdf/2404.18062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18060v1","updated":"2024-04-28T03:28:27Z","published":"2024-04-28T03:28:27Z","title":"Prompt Customization for Continual Learning","summary":" Contemporary continual learning approaches typically select prompts from a\npool, which function as supplementary inputs to a pre-trained model. However,\nthis strategy is hindered by the inherent noise of its selection approach when\nhandling increasing tasks. In response to these challenges, we reformulate the\nprompting approach for continual learning and propose the prompt customization\n(PC) method. PC mainly comprises a prompt generation module (PGM) and a prompt\nmodulation module (PMM). In contrast to conventional methods that employ hard\nprompt selection, PGM assigns different coefficients to prompts from a\nfixed-sized pool of prompts and generates tailored prompts. 
Moreover, PMM\nfurther modulates the prompts by adaptively assigning weights according to the\ncorrelations between input data and corresponding prompts. We evaluate our\nmethod on four benchmark datasets for three diverse settings, including the\nclass, domain, and task-agnostic incremental learning tasks. Experimental\nresults demonstrate consistent improvement (by up to 16.2\\%), yielded by the\nproposed method, over the state-of-the-art (SOTA) techniques.\n","authors":["Yong Dai","Xiaopeng Hong","Yabin Wang","Zhiheng Ma","Dongmei Jiang","Yaowei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.18060v1.pdf","comment":"ACM MM"},{"id":"http://arxiv.org/abs/2404.18058v1","updated":"2024-04-28T03:11:44Z","published":"2024-04-28T03:11:44Z","title":"Joint Reference Frame Synthesis and Post Filter Enhancement for\n Versatile Video Coding","summary":" This paper presents the joint reference frame synthesis (RFS) and\npost-processing filter enhancement (PFE) for Versatile Video Coding (VVC),\naiming to explore the combination of different neural network-based video\ncoding (NNVC) tools to better utilize the hierarchical bi-directional coding\nstructure of VVC. Both RFS and PFE utilize the Space-Time Enhancement Network\n(STENet), which receives two input frames with artifacts and produces two\nenhanced frames with suppressed artifacts, along with an intermediate\nsynthesized frame. STENet comprises two pipelines, the synthesis pipeline and\nthe enhancement pipeline, tailored for different purposes. During RFS, two\nreconstructed frames are sent into STENet's synthesis pipeline to synthesize a\nvirtual reference frame, similar to the current to-be-coded frame. The\nsynthesized frame serves as an additional reference frame inserted into the\nreference picture list (RPL). During PFE, two reconstructed frames are fed into\nSTENet's enhancement pipeline to alleviate their artifacts and distortions,\nresulting in enhanced frames with reduced artifacts and distortions. To reduce\ninference complexity, we propose joint inference of RFS and PFE (JISE),\nachieved through a single execution of STENet. Integrated into the VVC\nreference software VTM-15.0, RFS, PFE, and JISE are coordinated within a novel\nSpace-Time Enhancement Window (STEW) under Random Access (RA) configuration.\nThe proposed method could achieve -7.34%/-17.21%/-16.65% PSNR-based BD-rate on\naverage for three components under RA configuration.\n","authors":["Weijie Bao","Yuantong Zhang","Jianghao Jia","Zhenzhong Chen","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.18058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11876v3","updated":"2024-04-28T02:07:37Z","published":"2023-06-20T20:23:46Z","title":"BMAD: Benchmarks for Medical Anomaly Detection","summary":" Anomaly detection (AD) is a fundamental research problem in machine learning\nand computer vision, with practical applications in industrial inspection,\nvideo surveillance, and medical diagnosis. In medical imaging, AD is especially\nvital for detecting and diagnosing anomalies that may indicate rare diseases or\nconditions. However, there is a lack of a universal and fair benchmark for\nevaluating AD methods on medical images, which hinders the development of more\ngeneralized and robust AD methods in this specific domain. To bridge this gap,\nwe introduce a comprehensive evaluation benchmark for assessing anomaly\ndetection methods on medical images. This benchmark encompasses six reorganized\ndatasets from five medical domains (i.e. 
brain MRI, liver CT, retinal OCT,\nchest X-ray, and digital histopathology) and three key evaluation metrics, and\nincludes a total of fourteen state-of-the-art AD algorithms. This standardized\nand well-curated medical benchmark with the well-structured codebase enables\ncomprehensive comparisons among recently proposed anomaly detection methods. It\nwill facilitate the community to conduct a fair comparison and advance the\nfield of AD on medical imaging. More information on BMAD is available in our\nGitHub repository: https://github.com/DorisBao/BMAD\n","authors":["Jinan Bao","Hanshi Sun","Hanqiu Deng","Yinsheng He","Zhaoxiang Zhang","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2306.11876v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19924v2","updated":"2024-04-28T02:05:03Z","published":"2024-03-29T02:22:54Z","title":"SceneTracker: Long-term Scene Flow Estimation Network","summary":" Considering the complementarity of scene flow estimation in the spatial\ndomain's focusing capability and 3D object tracking in the temporal domain's\ncoherence, this study aims to address a comprehensive new task that can\nsimultaneously capture fine-grained and long-term 3D motion in an online\nmanner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a\nnovel learning-based LSFE network that adopts an iterative approach to\napproximate the optimal trajectory. Besides, it dynamically indexes and\nconstructs appearance and depth correlation features simultaneously and employs\nthe Transformer to explore and utilize long-range connections within and\nbetween trajectories. With detailed experiments, SceneTracker shows superior\ncapabilities in handling 3D spatial occlusion and depth noise interference,\nhighly tailored to the LSFE task's needs. The code for SceneTracker is\navailable at https://github.com/wwsource/SceneTracker.\n","authors":["Bo Wang","Jian Li","Yang Yu","Li Liu","Zhenping Sun","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18033v1","updated":"2024-04-28T00:29:24Z","published":"2024-04-28T00:29:24Z","title":"Exposing Text-Image Inconsistency Using Diffusion Models","summary":" In the battle against widespread online misinformation, a growing problem is\ntext-image inconsistency, where images are misleadingly paired with texts with\ndifferent intent or meaning. Existing classification-based methods for\ntext-image inconsistency can identify contextual inconsistencies but fail to\nprovide explainable justifications for their decisions that humans can\nunderstand. Although more nuanced, human evaluation is impractical at scale and\nsusceptible to errors. To address these limitations, this study introduces\nD-TIIL (Diffusion-based Text-Image Inconsistency Localization), which employs\ntext-to-image diffusion models to localize semantic inconsistencies in text and\nimage pairs. These models, trained on large-scale datasets act as ``omniscient\"\nagents that filter out irrelevant information and incorporate background\nknowledge to identify inconsistencies. In addition, D-TIIL uses text embeddings\nand modified image regions to visualize these inconsistencies. To evaluate\nD-TIIL's efficacy, we introduce a new TIIL dataset containing 14K consistent\nand inconsistent text-image pairs. Unlike existing datasets, TIIL enables\nassessment at the level of individual words and image regions and is carefully\ndesigned to represent various inconsistencies. 
D-TIIL offers a scalable and\nevidence-based approach to identifying and localizing text-image inconsistency,\nproviding a robust framework for future research combating misinformation.\n","authors":["Mingzhen Huang","Shan Jia","Zhou Zhou","Yan Ju","Jialing Cai","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2404.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19527v1","updated":"2024-04-28T09:56:49Z","published":"2024-04-28T09:56:49Z","title":"Revealing the Two Sides of Data Augmentation: An Asymmetric\n Distillation-based Win-Win Solution for Open-Set Recognition","summary":" In this paper, we reveal the two sides of data augmentation: enhancements in\nclosed-set recognition correlate with a significant decrease in open-set\nrecognition. Through empirical investigation, we find that multi-sample-based\naugmentations would contribute to reducing feature discrimination, thereby\ndiminishing the open-set criteria. Although knowledge distillation could impair\nthe feature via imitation, the mixed feature with ambiguous semantics hinders\nthe distillation. To this end, we propose an asymmetric distillation framework\nby feeding teacher model extra raw data to enlarge the benefit of teacher.\nMoreover, a joint mutual information loss and a selective relabel strategy are\nutilized to alleviate the influence of hard mixed samples. Our method\nsuccessfully mitigates the decline in open-set and outperforms SOTAs by 2%~3%\nAUROC on the Tiny-ImageNet dataset and experiments on large-scale dataset\nImageNet-21K demonstrate the generalization of our method.\n","authors":["Yunbing Jia","Xiaoyu Kong","Fan Tang","Yixing Gao","Weiming Dong","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.19527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01587v1","updated":"2024-04-28T19:11:08Z","published":"2024-04-28T19:11:08Z","title":"Improve Academic Query Resolution through BERT-based Question Extraction\n from Images","summary":" Providing fast and accurate resolution to the student's query is an essential\nsolution provided by Edtech organizations. This is generally provided with a\nchat-bot like interface to enable students to ask their doubts easily. One\npreferred format for student queries is images, as it allows students to\ncapture and post questions without typing complex equations and information.\nHowever, this format also presents difficulties, as images may contain multiple\nquestions or textual noise that lowers the accuracy of existing single-query\nanswering solutions. In this paper, we propose a method for extracting\nquestions from text or images using a BERT-based deep learning model and\ncompare it to the other rule-based and layout-based methods. Our method aims to\nimprove the accuracy and efficiency of student query resolution in Edtech\norganizations.\n","authors":["Nidhi Kamal","Saurabh Yadav","Jorawar Singh","Aditi Avasthi"],"pdf_url":"https://arxiv.org/pdf/2405.01587v1.pdf","comment":null}]},"2024-04-27T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.18025v1","updated":"2024-04-27T23:22:39Z","published":"2024-04-27T23:22:39Z","title":"Retrieval Robust to Object Motion Blur","summary":" Moving objects are frequently seen in daily life and usually appear blurred\nin images due to their motion. While general object retrieval is a widely\nexplored area in computer vision, it primarily focuses on sharp and static\nobjects, and retrieval of motion-blurred objects in large image collections\nremains unexplored. 
We propose a method for object retrieval in images that are\naffected by motion blur. The proposed method learns a robust representation\ncapable of matching blurred objects to their deblurred versions and vice versa.\nTo evaluate our approach, we present the first large-scale datasets for blurred\nobject retrieval, featuring images with objects exhibiting varying degrees of\nblur in various poses and scales. We conducted extensive experiments, showing\nthat our method outperforms state-of-the-art retrieval methods on the new\nblur-retrieval datasets, which validates the effectiveness of the proposed\napproach.\n","authors":["Rong Zou","Marc Pollefeys","Denys Rozumnyi"],"pdf_url":"https://arxiv.org/pdf/2404.18025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18020v1","updated":"2024-04-27T22:45:47Z","published":"2024-04-27T22:45:47Z","title":"DM-Align: Leveraging the Power of Natural Language Instructions to Make\n Changes to Images","summary":" Text-based semantic image editing assumes the manipulation of an image using\na natural language instruction. Although recent works are capable of generating\ncreative and qualitative images, the problem is still mostly approached as a\nblack box sensitive to generating unexpected outputs. Therefore, we propose a\nnovel model to enhance the text-based control of an image editor by explicitly\nreasoning about which parts of the image to alter or preserve. It relies on\nword alignments between a description of the original source image and the\ninstruction that reflects the needed updates, and the input image. The proposed\nDiffusion Masking with word Alignments (DM-Align) allows the editing of an\nimage in a transparent and explainable way. It is evaluated on a subset of the\nBison dataset and a self-defined dataset dubbed Dream. When comparing to\nstate-of-the-art baselines, quantitative and qualitative results show that\nDM-Align has superior performance in image editing conditioned on language\ninstructions, well preserves the background of the image and can better cope\nwith long text instructions.\n","authors":["Maria Mihaela Trusca","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2404.18020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18006v1","updated":"2024-04-27T20:54:15Z","published":"2024-04-27T20:54:15Z","title":"FRAME: A Modular Framework for Autonomous Map-merging: Advancements in\n the Field","summary":" In this article, a novel approach for merging 3D point cloud maps in the\ncontext of egocentric multi-robot exploration is presented. Unlike traditional\nmethods, the proposed approach leverages state-of-the-art place recognition and\nlearned descriptors to efficiently detect overlap between maps, eliminating the\nneed for the time-consuming global feature extraction and feature matching\nprocess. The estimated overlapping regions are used to calculate a homogeneous\nrigid transform, which serves as an initial condition for the GICP point cloud\nregistration algorithm to refine the alignment between the maps. The advantages\nof this approach include faster processing time, improved accuracy, and\nincreased robustness in challenging environments. 
Furthermore, the\neffectiveness of the proposed framework is successfully demonstrated through\nmultiple field missions of robot exploration in a variety of different\nunderground environments.\n","authors":["Nikolaos Stathoulopoulos","Björn Lindqvist","Anton Koval","Ali-akbar Agha-mohammadi","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.18006v1.pdf","comment":"28 pages, 24 figures. Submitted to Field Robotics"},{"id":"http://arxiv.org/abs/2404.17993v1","updated":"2024-04-27T19:54:42Z","published":"2024-04-27T19:54:42Z","title":"MinBackProp -- Backpropagating through Minimal Solvers","summary":" We present an approach to backpropagating through minimal problem solvers in\nend-to-end neural network training. Traditional methods relying on manually\nconstructed formulas, finite differences, and autograd are laborious,\napproximate, and unstable for complex minimal problem solvers. We show that\nusing the Implicit function theorem to calculate derivatives to backpropagate\nthrough the solution of a minimal problem solver is simple, fast, and stable.\nWe compare our approach to (i) using the standard autograd on minimal problem\nsolvers and relate it to existing backpropagation formulas through SVD-based\nand Eig-based solvers and (ii) implementing the backprop with an existing\nPyTorch Deep Declarative Networks (DDN) framework. We demonstrate our technique\non a toy example of training outlier-rejection weights for 3D point\nregistration and on a real application of training an outlier-rejection and\nRANSAC sampling network in image matching. Our method provides $100\\%$\nstability and is 10 times faster compared to autograd, which is unstable and\nslow, and compared to DDN, which is stable but also slow.\n","authors":["Diana Sungatullina","Tomas Pajdla"],"pdf_url":"https://arxiv.org/pdf/2404.17993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17978v1","updated":"2024-04-27T18:41:32Z","published":"2024-04-27T18:41:32Z","title":"A Method of Moments Embedding Constraint and its Application to\n Semi-Supervised Learning","summary":" Discriminative deep learning models with a linear+softmax final layer have a\nproblem: the latent space only predicts the conditional probabilities $p(Y|X)$\nbut not the full joint distribution $p(Y,X)$, which necessitates a generative\napproach. The conditional probability cannot detect outliers, causing outlier\nsensitivity in softmax networks. This exacerbates model over-confidence\nimpacting many problems, such as hallucinations, confounding biases, and\ndependence on large datasets. To address this we introduce a novel embedding\nconstraint based on the Method of Moments (MoM). We investigate the use of\npolynomial moments ranging from 1st through 4th order hyper-covariance\nmatrices. Furthermore, we use this embedding constraint to train an\nAxis-Aligned Gaussian Mixture Model (AAGMM) final layer, which learns not only\nthe conditional, but also the joint distribution of the latent space. We apply\nthis method to the domain of semi-supervised image classification by extending\nFlexMatch with our technique. We find our MoM constraint with the AAGMM layer\nis able to match the reported FlexMatch accuracy, while also modeling the joint\ndistribution, thereby reducing outlier sensitivity. We also present a\npreliminary outlier detection strategy based on Mahalanobis distance and\ndiscuss future improvements to this strategy. 
Code is available at:\n\\url{https://github.com/mmajurski/ssl-gmm}\n","authors":["Michael Majurski","Sumeet Menon","Parniyan Farvardin","David Chapman"],"pdf_url":"https://arxiv.org/pdf/2404.17978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17974v1","updated":"2024-04-27T18:24:53Z","published":"2024-04-27T18:24:53Z","title":"HVOFusion: Incremental Mesh Reconstruction Using Hybrid Voxel Octree","summary":" Incremental scene reconstruction is essential to the navigation in robotics.\nMost of the conventional methods typically make use of either TSDF (truncated\nsigned distance functions) volume or neural networks to implicitly represent\nthe surface. Due to the voxel representation or involving with time-consuming\nsampling, they have difficulty in balancing speed, memory storage, and surface\nquality. In this paper, we propose a novel hybrid voxel-octree approach to\neffectively fuse octree with voxel structures so that we can take advantage of\nboth implicit surface and explicit triangular mesh representation. Such sparse\nstructure preserves triangular faces in the leaf nodes and produces partial\nmeshes sequentially for incremental reconstruction. This storage scheme allows\nus to naturally optimize the mesh in explicit 3D space to achieve higher\nsurface quality. We iteratively deform the mesh towards the target and recovers\nvertex colors by optimizing a shading model. Experimental results on several\ndatasets show that our proposed approach is capable of quickly and accurately\nreconstructing a scene with realistic colors.\n","authors":["Shaofan Liu","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.17974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02367v2","updated":"2024-04-27T18:04:11Z","published":"2024-02-04T06:39:01Z","title":"Exploring Intrinsic Properties of Medical Images for Self-Supervised\n Binary Semantic Segmentation","summary":" Recent advancements in self-supervised learning have unlocked the potential\nto harness unlabeled data for auxiliary tasks, facilitating the learning of\nbeneficial priors. This has been particularly advantageous in fields like\nmedical image analysis, where labeled data are scarce. Although effective for\nclassification tasks, this methodology has shown limitations in more complex\napplications, such as medical image segmentation. In this paper, we introduce\nMedical imaging Enhanced with Dynamic Self-Adaptive Semantic Segmentation\n(MedSASS), a dedicated self-supervised framework tailored for medical image\nsegmentation. We evaluate MedSASS against existing state-of-the-art methods\nacross four diverse medical datasets, showcasing its superiority. MedSASS\noutperforms existing CNN-based self-supervised methods by 3.83% and matches the\nperformance of ViT-based methods. Furthermore, when MedSASS is trained\nend-to-end, covering both encoder and decoder, it demonstrates significant\nimprovements of 14.4% for CNNs and 6% for ViT-based architectures compared to\nexisting state-of-the-art self-supervised strategies.\n","authors":["Pranav Singh","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2402.02367v2.pdf","comment":"30 pages, 10 figures, and 10 tables. 
Under Review"},{"id":"http://arxiv.org/abs/2404.17967v1","updated":"2024-04-27T17:56:58Z","published":"2024-04-27T17:56:58Z","title":"SCorP: Statistics-Informed Dense Correspondence Prediction Directly from\n Unsegmented Medical Images","summary":" Statistical shape modeling (SSM) is a powerful computational framework for\nquantifying and analyzing the geometric variability of anatomical structures,\nfacilitating advancements in medical research, diagnostics, and treatment\nplanning. Traditional methods for shape modeling from imaging data demand\nsignificant manual and computational resources. Additionally, these methods\nnecessitate repeating the entire modeling pipeline to derive shape descriptors\n(e.g., surface-based point correspondences) for new data. While deep learning\napproaches have shown promise in streamlining the construction of SSMs on new\ndata, they still rely on traditional techniques to supervise the training of\nthe deep networks. Moreover, the predominant linearity assumption of\ntraditional approaches restricts their efficacy, a limitation also inherited by\ndeep learning models trained using optimized/established correspondences.\nConsequently, representing complex anatomies becomes challenging. To address\nthese limitations, we introduce SCorP, a novel framework capable of predicting\nsurface-based correspondences directly from unsegmented images. By leveraging\nthe shape prior learned directly from surface meshes in an unsupervised manner,\nthe proposed model eliminates the need for an optimized shape model for\ntraining supervision. The strong shape prior acts as a teacher and regularizes\nthe feature learning of the student network to guide it in learning image-based\nfeatures that are predictive of surface correspondences. The proposed model\nstreamlines the training and inference phases by removing the supervision for\nthe correspondence prediction task while alleviating the linearity assumption.\n","authors":["Krithika Iyer","Jadie Adams","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2404.17967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17961v1","updated":"2024-04-27T17:16:45Z","published":"2024-04-27T17:16:45Z","title":"Random Walk on Pixel Manifolds for Anomaly Segmentation of Complex\n Driving Scenes","summary":" In anomaly segmentation for complex driving scenes, state-of-the-art\napproaches utilize anomaly scoring functions to calculate anomaly scores. For\nthese functions, accurately predicting the logits of inlier classes for each\npixel is crucial for precisely inferring the anomaly score. However, in\nreal-world driving scenarios, the diversity of scenes often results in\ndistorted manifolds of pixel embeddings in embedding space. This effect is not\nconducive to directly using the pixel embeddings for the logit prediction\nduring inference, a concern overlooked by existing methods. To address this\nproblem, we propose a novel method called Random Walk on Pixel Manifolds\n(RWPM). RWPM utilizes random walks to reveal the intrinsic relationships among\npixels to refine the pixel embeddings. The refined pixel embeddings alleviate\nthe distortion of manifolds, improving the accuracy of anomaly scores. Our\nextensive experiments show that RWPM consistently improve the performance of\nthe existing anomaly segmentation methods and achieve the best results. 
Code:\n\\url{https://github.com/ZelongZeng/RWPM}.\n","authors":["Zelong Zeng","Kaname Tomite"],"pdf_url":"https://arxiv.org/pdf/2404.17961v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2403.12229v2","updated":"2024-04-27T15:51:37Z","published":"2024-03-18T20:20:13Z","title":"Fusion Transformer with Object Mask Guidance for Image Forgery Analysis","summary":" In this work, we introduce OMG-Fuser, a fusion transformer-based network\ndesigned to extract information from various forensic signals to enable robust\nimage forgery detection and localization. Our approach can operate with an\narbitrary number of forensic signals and leverages object information for their\nanalysis -- unlike previous methods that rely on fusion schemes with few\nsignals and often disregard image semantics. To this end, we design a forensic\nsignal stream composed of a transformer guided by an object attention\nmechanism, associating patches that depict the same objects. In that way, we\nincorporate object-level information from the image. Each forensic signal is\nprocessed by a different stream that adapts to its peculiarities. A token\nfusion transformer efficiently aggregates the outputs of an arbitrary number of\nnetwork streams and generates a fused representation for each image patch. We\nassess two fusion variants on top of the proposed approach: (i) score-level\nfusion that fuses the outputs of multiple image forensics algorithms and (ii)\nfeature-level fusion that fuses low-level forensic traces directly. Both\nvariants exceed state-of-the-art performance on seven datasets for image\nforgery detection and localization, with a relative average improvement of\n12.1% and 20.4% in terms of F1. Our model is robust against traditional and\nnovel forgery attacks and can be expanded with new signals without training\nfrom scratch. Our code is publicly available at:\nhttps://github.com/mever-team/omgfuser\n","authors":["Dimitrios Karageorgiou","Giorgos Kordopatis-Zilos","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2403.12229v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17936v1","updated":"2024-04-27T15:16:34Z","published":"2024-04-27T15:16:34Z","title":"FDCE-Net: Underwater Image Enhancement with Embedding Frequency and Dual\n Color Encoder","summary":" Underwater images often suffer from various issues such as low brightness,\ncolor shift, blurred details, and noise due to light absorption and scattering\ncaused by water and suspended particles. Previous underwater image enhancement\n(UIE) methods have primarily focused on spatial domain enhancement, neglecting\nthe frequency domain information inherent in the images. However, the\ndegradation factors of underwater images are closely intertwined in the spatial\ndomain. Although certain methods focus on enhancing images in the frequency\ndomain, they overlook the inherent relationship between the image degradation\nfactors and the information present in the frequency domain. As a result, these\nmethods frequently enhance certain attributes of the improved image while\ninadequately addressing or even exacerbating other attributes. Moreover, many\nexisting methods heavily rely on prior knowledge to address color shift\nproblems in underwater images, limiting their flexibility and robustness. In\norder to overcome these limitations, we propose the Embedding Frequency and\nDual Color Encoder Network (FDCE-Net) in our paper. 
The FDCE-Net consists of\ntwo main structures: (1) Frequency Spatial Network (FS-Net) aims to achieve\ninitial enhancement by utilizing our designed Frequency Spatial Residual Block\n(FSRB) to decouple image degradation factors in the frequency domain and\nenhance different attributes separately. (2) To tackle the color shift issue,\nwe introduce the Dual-Color Encoder (DCE). The DCE establishes correlations\nbetween color and semantic representations through cross-attention and\nleverages multi-scale image features to guide the optimization of adaptive\ncolor query. The final enhanced images are generated by combining the outputs\nof FS-Net and DCE through a fusion network. These images exhibit rich details,\nclear textures, low noise and natural colors.\n","authors":["Zheng Cheng","Guodong Fan","Jingchun Zhou","Min Gan","C. L. Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17936v1.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.13573v2","updated":"2024-04-27T15:10:55Z","published":"2024-04-21T08:27:20Z","title":"Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text\n Consistency and Domain Distribution Gap","summary":" The recent advancements in Text-to-Video Artificial Intelligence Generated\nContent (AIGC) have been remarkable. Compared with traditional videos, the\nassessment of AIGC videos encounters various challenges: visual inconsistency\nthat defy common sense, discrepancies between content and the textual prompt,\nand distribution gap between various generative models, etc. Target at these\nchallenges, in this work, we categorize the assessment of AIGC video quality\ninto three dimensions: visual harmony, video-text consistency, and domain\ndistribution gap. For each dimension, we design specific modules to provide a\ncomprehensive quality assessment of AIGC videos. Furthermore, our research\nidentifies significant variations in visual quality, fluidity, and style among\nvideos generated by different text-to-video models. Predicting the source\ngenerative model can make the AIGC video features more discriminative, which\nenhances the quality assessment performance. The proposed method was used in\nthe third-place winner of the NTIRE 2024 Quality Assessment for AI-Generated\nContent - Track 2 Video, demonstrating its effectiveness. Code will be\navailable at https://github.com/Coobiw/TriVQA.\n","authors":["Bowen Qu","Xiaoyu Liang","Shangkun Sun","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2404.13573v2.pdf","comment":"9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd\n place winner of NTIRE2024 Quality Assessment for AI-Generated Content - Track\n 2 Video)"},{"id":"http://arxiv.org/abs/2404.17931v1","updated":"2024-04-27T15:04:30Z","published":"2024-04-27T15:04:30Z","title":"Critical Review for One-class Classification: recent advances and the\n reality behind them","summary":" This paper offers a comprehensive review of one-class classification (OCC),\nexamining the technologies and methodologies employed in its implementation. It\ndelves into various approaches utilized for OCC across diverse data types, such\nas feature data, image, video, time series, and others. Through a systematic\nreview, this paper synthesizes prominent strategies used in OCC from its\ninception to its current advancements, with a particular emphasis on the\npromising application. Moreover, the article criticizes the state-of-the-art\n(SOTA) image anomaly detection (AD) algorithms dominating one-class\nexperiments. 
These algorithms include outlier exposure (binary classification)\nand pretrained model (multi-class classification), conflicting with the\nfundamental concept of learning from one class. Our investigation reveals that\nthe top nine algorithms for one-class CIFAR10 benchmark are not OCC. We argue\nthat binary/multi-class classification algorithms should not be compared with\nOCC.\n","authors":["Toshitaka Hayashi","Dalibor Cimr","Hamido Fujita","Richard Cimler"],"pdf_url":"https://arxiv.org/pdf/2404.17931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08420v3","updated":"2024-04-27T15:02:45Z","published":"2023-05-15T08:01:05Z","title":"Exploring Few-Shot Adaptation for Activity Recognition on Diverse\n Domains","summary":" Domain adaptation is essential for activity recognition to ensure accurate\nand robust performance across diverse environments, sensor types, and data\nsources. Unsupervised domain adaptation methods have been extensively studied,\nyet, they require large-scale unlabeled data from the target domain. In this\nwork, we focus on Few-Shot Domain Adaptation for Activity Recognition\n(FSDA-AR), which leverages a very small amount of labeled target videos to\nachieve effective adaptation. This approach is appealing for applications\nbecause it only needs a few or even one labeled example per class in the target\ndomain, ideal for recognizing rare but critical activities. However, the\nexisting FSDA-AR works mostly focus on the domain adaptation on sports videos,\nwhere the domain diversity is limited. We propose a new FSDA-AR benchmark using\nfive established datasets considering the adaptation on more diverse and\nchallenging domains. Our results demonstrate that FSDA-AR performs comparably\nto unsupervised domain adaptation with significantly fewer labeled target\ndomain samples. We further propose a novel approach, RelaMiX, to better\nleverage the few labeled target domain samples as knowledge guidance. RelaMiX\nencompasses a temporal relational attention network with relation dropout,\nalongside a cross-domain information alignment mechanism. Furthermore, it\nintegrates a mechanism for mixing features within a latent space by using the\nfew-shot target domain samples. The proposed RelaMiX solution achieves\nstate-of-the-art performance on all datasets within the FSDA-AR benchmark. To\nencourage future research of few-shot domain adaptation for activity\nrecognition, our code will be publicly available at\nhttps://github.com/KPeng9510/RelaMiX.\n","authors":["Kunyu Peng","Di Wen","David Schneider","Jiaming Zhang","Kailun Yang","M. Saquib Sarfraz","Rainer Stiefelhagen","Alina Roitberg"],"pdf_url":"https://arxiv.org/pdf/2305.08420v3.pdf","comment":"The benchmark and source code will be publicly available at\n https://github.com/KPeng9510/RelaMiX"},{"id":"http://arxiv.org/abs/2404.17930v1","updated":"2024-04-27T15:00:57Z","published":"2024-04-27T15:00:57Z","title":"Multi-Stream Cellular Test-Time Adaptation of Real-Time Models Evolving\n in Dynamic Environments","summary":" In the era of the Internet of Things (IoT), objects connect through a dynamic\nnetwork, empowered by technologies like 5G, enabling real-time data sharing.\nHowever, smart objects, notably autonomous vehicles, face challenges in\ncritical local computations due to limited resources. Lightweight AI models\noffer a solution but struggle with diverse data distributions. 
To address this\nlimitation, we propose a novel Multi-Stream Cellular Test-Time Adaptation\n(MSC-TTA) setup where models adapt on the fly to a dynamic environment divided\ninto cells. Then, we propose a real-time adaptive student-teacher method that\nleverages the multiple streams available in each cell to quickly adapt to\nchanging data distributions. We validate our methodology in the context of\nautonomous vehicles navigating across cells defined based on location and\nweather conditions. To facilitate future benchmarking, we release a new\nmulti-stream large-scale synthetic semantic segmentation dataset, called DADE,\nand show that our multi-stream approach outperforms a single-stream baseline.\nWe believe that our work will open research opportunities in the IoT and 5G\neras, offering solutions for real-time model adaptation.\n","authors":["Benoît Gérin","Anaïs Halin","Anthony Cioppa","Maxim Henry","Bernard Ghanem","Benoît Macq","Christophe De Vleeschouwer","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.17930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17929v1","updated":"2024-04-27T14:43:32Z","published":"2024-04-27T14:43:32Z","title":"Spatio-Temporal Side Tuning Pre-trained Foundation Models for\n Video-based Pedestrian Attribute Recognition","summary":" Existing pedestrian attribute recognition (PAR) algorithms are mainly\ndeveloped based on a static image, however, the performance is unreliable in\nchallenging scenarios, such as heavy occlusion, motion blur, etc. In this work,\nwe propose to understand human attributes using video frames that can fully use\ntemporal information by fine-tuning a pre-trained multi-modal foundation model\nefficiently. Specifically, we formulate the video-based PAR as a\nvision-language fusion problem and adopt a pre-trained foundation model CLIP to\nextract the visual features. More importantly, we propose a novel\nspatiotemporal side-tuning strategy to achieve parameter-efficient optimization\nof the pre-trained vision foundation model. To better utilize the semantic\ninformation, we take the full attribute list that needs to be recognized as\nanother input and transform the attribute words/phrases into the corresponding\nsentence via split, expand, and prompt operations. Then, the text encoder of\nCLIP is utilized for embedding processed attribute descriptions. The averaged\nvisual tokens and text tokens are concatenated and fed into a fusion\nTransformer for multi-modal interactive learning. The enhanced tokens will be\nfed into a classification head for pedestrian attribute prediction. Extensive\nexperiments on two large-scale video-based PAR datasets fully validated the\neffectiveness of our proposed framework. The source code of this paper is\navailable at https://github.com/Event-AHU/OpenPAR.\n","authors":["Xiao Wang","Qian Zhu","Jiandong Jin","Jun Zhu","Futian Wang","Bo Jiang","Yaowei Wang","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.17929v1.pdf","comment":"Parameter Efficient Fine-Tuning Strategy for Video-based Pedestrian\n Attribute Recognition"},{"id":"http://arxiv.org/abs/2404.17926v1","updated":"2024-04-27T14:29:53Z","published":"2024-04-27T14:29:53Z","title":"Pre-training on High Definition X-ray Images: An Experimental Study","summary":" Existing X-ray based pre-trained vision models are usually conducted on a\nrelatively small-scale dataset (less than 500k samples) with limited resolution\n(e.g., 224 $\\times$ 224). 
However, the key to the success of self-supervised\npre-training large models lies in massive training data, and maintaining high\nresolution in the field of X-ray images is the guarantee of effective solutions\nto difficult miscellaneous diseases. In this paper, we address these issues by\nproposing the first high-definition (1280 $\\times$ 1280) X-ray based\npre-trained foundation vision model on our newly collected large-scale dataset\nwhich contains more than 1 million X-ray images. Our model follows the masked\nauto-encoder framework which takes the tokens after mask processing (with a\nhigh rate) is used as input, and the masked image patches are reconstructed by\nthe Transformer encoder-decoder network. More importantly, we introduce a novel\ncontext-aware masking strategy that utilizes the chest contour as a boundary\nfor adaptive masking operations. We validate the effectiveness of our model on\ntwo downstream tasks, including X-ray report generation and disease\nrecognition. Extensive experiments demonstrate that our pre-trained medical\nfoundation vision model achieves comparable or even new state-of-the-art\nperformance on downstream benchmark datasets. The source code and pre-trained\nmodels of this paper will be released on\nhttps://github.com/Event-AHU/Medical_Image_Analysis.\n","authors":["Xiao Wang","Yuehang Li","Wentao Wu","Jiandong Jin","Yao Rong","Bo Jiang","Chuanfu Li","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2404.17926v1.pdf","comment":"Technology Report"},{"id":"http://arxiv.org/abs/2404.15677v2","updated":"2024-04-27T14:24:15Z","published":"2024-04-24T06:15:31Z","title":"CharacterFactory: Sampling Consistent Characters with GANs for Diffusion\n Models","summary":" Recent advances in text-to-image models have opened new frontiers in\nhuman-centric generation. However, these models cannot be directly employed to\ngenerate images with consistent newly coined identities. In this work, we\npropose CharacterFactory, a framework that allows sampling new characters with\nconsistent identities in the latent space of GANs for diffusion models. More\nspecifically, we consider the word embeddings of celeb names as ground truths\nfor the identity-consistent generation task and train a GAN model to learn the\nmapping from a latent space to the celeb embedding space. In addition, we\ndesign a context-consistent loss to ensure that the generated identity\nembeddings can produce identity-consistent images in various contexts.\nRemarkably, the whole model only takes 10 minutes for training, and can sample\ninfinite characters end-to-end during inference. Extensive experiments\ndemonstrate excellent performance of the proposed CharacterFactory on character\ncreation in terms of identity consistency and editability. Furthermore, the\ngenerated characters can be seamlessly combined with the off-the-shelf\nimage/video/3D diffusion models. We believe that the proposed CharacterFactory\nis an important step for identity-consistent character generation. 
Project page\nis available at: https://qinghew.github.io/CharacterFactory/.\n","authors":["Qinghe Wang","Baolu Li","Xiaomin Li","Bing Cao","Liqian Ma","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2404.15677v2.pdf","comment":"Code will be released very soon:\n https://github.com/qinghew/CharacterFactory"},{"id":"http://arxiv.org/abs/2404.17922v1","updated":"2024-04-27T14:20:46Z","published":"2024-04-27T14:20:46Z","title":"Open-Set 3D Semantic Instance Maps for Vision Language Navigation --\n O3D-SIM","summary":" Humans excel at forming mental maps of their surroundings, equipping them to\nunderstand object relationships and navigate based on language queries. Our\nprevious work SI Maps [1] showed that having instance-level information and the\nsemantic understanding of an environment helps significantly improve\nperformance for language-guided tasks. We extend this instance-level approach\nto 3D while increasing the pipeline's robustness and improving quantitative and\nqualitative results. Our method leverages foundational models for object\nrecognition, image segmentation, and feature extraction. We propose a\nrepresentation that results in a 3D point cloud map with instance-level\nembeddings, which bring in the semantic understanding that natural language\ncommands can query. Quantitatively, the work improves upon the success rate of\nlanguage-guided tasks. At the same time, we qualitatively observe the ability\nto identify instances more clearly and leverage the foundational models and\nlanguage and image-aligned embeddings to identify objects that, otherwise, a\nclosed-set approach wouldn't be able to identify.\n","authors":["Laksh Nanwani","Kumaraditya Gupta","Aditya Mathur","Swayam Agrawal","A. H. Abdul Hafez","K. Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.17922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17917v1","updated":"2024-04-27T14:10:09Z","published":"2024-04-27T14:10:09Z","title":"EvaNet: Elevation-Guided Flood Extent Mapping on Earth Imagery","summary":" Accurate and timely mapping of flood extent from high-resolution satellite\nimagery plays a crucial role in disaster management such as damage assessment\nand relief activities. However, current state-of-the-art solutions are based on\nU-Net, which cannot segment the flood pixels accurately due to the ambiguous\npixels (e.g., tree canopies, clouds) that prevent a direct judgement from only\nthe spectral features. Thanks to the digital elevation model (DEM) data readily\navailable from sources such as United States Geological Survey (USGS), this\nwork explores the use of an elevation map to improve flood extent mapping. We\npropose EvaNet, an elevation-guided segmentation model based on the\nencoder-decoder architecture with two novel techniques: (1) a loss function\nencoding the physical law of gravity that if a location is flooded (resp. dry),\nthen its adjacent locations with a lower (resp. higher) elevation must also be\nflooded (resp. dry); (2) a new (de)convolution operation that integrates the\nelevation map by a location sensitive gating mechanism to regulate how much\nspectral features flow through adjacent layers. 
Extensive experiments show that\nEvaNet significantly outperforms the U-Net baselines, and works as a perfect\ndrop-in replacement for U-Net in existing solutions to flood extent mapping.\n","authors":["Mirza Tanzim Sami","Da Yan","Saugat Adhikari","Lyuheng Yuan","Jiao Han","Zhe Jiang","Jalal Khalil","Yang Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17917v1.pdf","comment":"Accepted at the International Joint Conference on Artificial\n Intelligence (IJCAI, 2024)"},{"id":"http://arxiv.org/abs/2404.13443v2","updated":"2024-04-27T14:02:35Z","published":"2024-04-20T18:50:57Z","title":"FisheyeDetNet: 360° Surround view Fisheye Camera based Object\n Detection System for Autonomous Driving","summary":" Object detection is a mature problem in autonomous driving with pedestrian\ndetection being one of the first deployed algorithms. It has been\ncomprehensively studied in the literature. However, object detection is\nrelatively less explored for fisheye cameras used for surround-view near field\nsensing. The standard bounding box representation fails in fisheye cameras due\nto heavy radial distortion, particularly in the periphery. To mitigate this, we\nexplore extending the standard object detection output representation of\nbounding box. We design rotated bounding boxes, ellipse, generic polygon as\npolar arc/angle representations and define an instance segmentation mIOU metric\nto analyze these representations. The proposed model FisheyeDetNet with polygon\noutperforms others and achieves a mAP score of 49.5 % on Valeo fisheye\nsurround-view dataset for automated driving applications. This dataset has 60K\nimages captured from 4 surround-view cameras across Europe, North America and\nAsia. To the best of our knowledge, this is the first detailed study on object\ndetection on fisheye cameras for autonomous driving scenarios.\n","authors":["Ganesh Sistu","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2404.13443v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.05542 by other authors"},{"id":"http://arxiv.org/abs/2404.17910v1","updated":"2024-04-27T13:38:45Z","published":"2024-04-27T13:38:45Z","title":"Reliable Student: Addressing Noise in Semi-Supervised 3D Object\n Detection","summary":" Semi-supervised 3D object detection can benefit from the promising\npseudo-labeling technique when labeled data is limited. However, recent\napproaches have overlooked the impact of noisy pseudo-labels during training,\ndespite efforts to enhance pseudo-label quality through confidence-based\nfiltering. In this paper, we examine the impact of noisy pseudo-labels on\nIoU-based target assignment and propose the Reliable Student framework, which\nincorporates two complementary approaches to mitigate errors. First, it\ninvolves a class-aware target assignment strategy that reduces false negative\nassignments in difficult classes. Second, it includes a reliability weighting\nstrategy that suppresses false positive assignment errors while also addressing\nremaining false negatives from the first step. The reliability weights are\ndetermined by querying the teacher network for confidence scores of the\nstudent-generated proposals. Our work surpasses the previous state-of-the-art\non KITTI 3D object detection benchmark on point clouds in the semi-supervised\nsetting. On 1% labeled data, our approach achieves a 6.2% AP improvement for\nthe pedestrian class, despite having only 37 labeled samples available. 
The\nimprovements become significant for the 2% setting, achieving 6.0% AP and 5.7%\nAP improvements for the pedestrian and cyclist classes, respectively.\n","authors":["Farzad Nozarian","Shashank Agarwal","Farzaneh Rezaeianaran","Danish Shahzad","Atanas Poibrenski","Christian Müller","Philipp Slusallek"],"pdf_url":"https://arxiv.org/pdf/2404.17910v1.pdf","comment":"Accepted at CVPR Workshop L3D-IVU 2023. Code:\n https://github.com/fnozarian/ReliableStudent"},{"id":"http://arxiv.org/abs/2401.16465v2","updated":"2024-04-27T13:32:52Z","published":"2024-01-29T16:24:21Z","title":"DressCode: Autoregressively Sewing and Generating Garments from Text\n Guidance","summary":" Apparel's significant role in human appearance underscores the importance of\ngarment digitalization for digital human creation. Recent advances in 3D\ncontent creation are pivotal for digital human creation. Nonetheless, garment\ngeneration from text guidance is still nascent. We introduce a text-driven 3D\ngarment generation framework, DressCode, which aims to democratize design for\nnovices and offer immense potential in fashion design, virtual try-on, and\ndigital human creation. For our framework, we first introduce SewingGPT, a\nGPT-based architecture integrating cross-attention with text-conditioned\nembedding to generate sewing patterns with text guidance. We also tailored a\npre-trained Stable Diffusion for high-quality, tile-based PBR texture\ngeneration. By leveraging a large language model, our framework generates\nCG-friendly garments through natural language interaction. Our method also\nfacilitates pattern completion and texture editing, streamlining the design\nprocess through user-friendly interaction. This framework fosters innovation by\nallowing creators to freely experiment with designs and incorporate unique\nelements into their work, thereby igniting new ideas and artistic\npossibilities. With comprehensive evaluations and comparisons with other\nstate-of-the-art methods, our method showcases the best quality and alignment\nwith input prompts. User studies further validate our high-quality rendering\nresults, highlighting its practical utility and potential in production\nsettings. Our project page is https://IHe-KaiI.github.io/DressCode/.\n","authors":["Kai He","Kaixin Yao","Qixuan Zhang","Jingyi Yu","Lingjie Liu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2401.16465v2.pdf","comment":"Project page: https://IHe-KaiI.github.io/DressCode/"},{"id":"http://arxiv.org/abs/2404.17900v1","updated":"2024-04-27T13:13:27Z","published":"2024-04-27T13:13:27Z","title":"Unsupervised Anomaly Detection via Masked Diffusion Posterior Sampling","summary":" Reconstruction-based methods have been commonly used for unsupervised anomaly\ndetection, in which a normal image is reconstructed and compared with the given\ntest image to detect and locate anomalies. Recently, diffusion models have\nshown promising applications for anomaly detection due to their powerful\ngenerative ability. However, these models lack strict mathematical support for\nnormal image reconstruction and unexpectedly suffer from low reconstruction\nquality. To address these issues, this paper proposes a novel and\nhighly-interpretable method named Masked Diffusion Posterior Sampling (MDPS).\nIn MDPS, the problem of normal image reconstruction is mathematically modeled\nas multiple diffusion posterior sampling for normal images based on the devised\nmasked noisy observation model and the diffusion-based normal image prior under\nBayesian framework. 
Using a metric designed from pixel-level and\nperceptual-level perspectives, MDPS can effectively compute the difference map\nbetween each normal posterior sample and the given test image. Anomaly scores\nare obtained by averaging all difference maps for multiple posterior samples.\nExhaustive experiments on MVTec and BTAD datasets demonstrate that MDPS can\nachieve state-of-the-art performance in normal image reconstruction quality as\nwell as anomaly detection and localization.\n","authors":["Di Wu","Shicai Fan","Xue Zhou","Li Yu","Yuzhong Deng","Jianxiao Zou","Baihong Lin"],"pdf_url":"https://arxiv.org/pdf/2404.17900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17894v1","updated":"2024-04-27T13:03:57Z","published":"2024-04-27T13:03:57Z","title":"Unpaired Multi-view Clustering via Reliable View Guidance","summary":" This paper focuses on unpaired multi-view clustering (UMC), a challenging\nproblem where paired observed samples are unavailable across multiple views.\nThe goal is to perform effective joint clustering using the unpaired observed\nsamples in all views. In incomplete multi-view clustering, existing methods\ntypically rely on sample pairing between views to capture their complementary.\nHowever, that is not applicable in the case of UMC. Hence, we aim to extract\nthe consistent cluster structure across views. In UMC, two challenging issues\narise: uncertain cluster structure due to lack of label and uncertain pairing\nrelationship due to absence of paired samples. We assume that the view with a\ngood cluster structure is the reliable view, which acts as a supervisor to\nguide the clustering of the other views. With the guidance of reliable views, a\nmore certain cluster structure of these views is obtained while achieving\nalignment between reliable views and other views. Then we propose Reliable view\nGuidance with one reliable view (RG-UMC) and multiple reliable views (RGs-UMC)\nfor UMC. Specifically, we design alignment modules with one reliable view and\nmultiple reliable views, respectively, to adaptively guide the optimization\nprocess. Also, we utilize the compactness module to enhance the relationship of\nsamples within the same cluster. Meanwhile, an orthogonal constraint is applied\nto latent representation to obtain discriminate features. Extensive experiments\nshow that both RG-UMC and RGs-UMC outperform the best state-of-the-art method\nby an average of 24.14\\% and 29.42\\% in NMI, respectively.\n","authors":["Like Xin","Wanqi Yang","Lei Wang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.17894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01673v3","updated":"2024-04-27T13:02:57Z","published":"2024-04-02T06:24:21Z","title":"A Universal Knowledge Embedded Contrastive Learning Framework for\n Hyperspectral Image Classification","summary":" Hyperspectral image (HSI) classification techniques have been intensively\nstudied and a variety of models have been developed. However, these HSI\nclassification models are confined to pocket models and unrealistic ways of\ndataset partitioning. 
The former limits the generalization performance of the\nmodel and the latter is partitioned leading to inflated model evaluation\nmetrics, which results in plummeting model performance in the real world.\nTherefore, we propose a universal knowledge embedded contrastive learning\nframework (KnowCL) for supervised, unsupervised, and semisupervised HSI\nclassification, which largely closes the gap between HSI classification models\nbetween pocket models and standard vision backbones. We present a new HSI\nprocessing pipeline in conjunction with a range of data transformation and\naugmentation techniques that provide diverse data representations and realistic\ndata partitioning. The proposed framework based on this pipeline is compatible\nwith all kinds of backbones and can fully exploit labeled and unlabeled samples\nwith the expected training time. Furthermore, we design a new loss function,\nwhich can adaptively fuse the supervised loss and unsupervised loss, enhancing\nthe learning performance. This proposed new classification paradigm shows great\npotential in exploring for HSI classification technology. The code can be\naccessed at \\url{https://github.com/quanweiliu/KnowCL}.\n","authors":["Quanwei Liu","Yanni Dong","Tao Huang","Lefei Zhang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2404.01673v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17890v1","updated":"2024-04-27T12:55:13Z","published":"2024-04-27T12:55:13Z","title":"DPER: Diffusion Prior Driven Neural Representation for Limited Angle and\n Sparse View CT Reconstruction","summary":" Limited-angle and sparse-view computed tomography (LACT and SVCT) are crucial\nfor expanding the scope of X-ray CT applications. However, they face challenges\ndue to incomplete data acquisition, resulting in diverse artifacts in the\nreconstructed CT images. Emerging implicit neural representation (INR)\ntechniques, such as NeRF, NeAT, and NeRP, have shown promise in\nunder-determined CT imaging reconstruction tasks. However, the unsupervised\nnature of INR architecture imposes limited constraints on the solution space,\nparticularly for the highly ill-posed reconstruction task posed by LACT and\nultra-SVCT. In this study, we introduce the Diffusion Prior Driven Neural\nRepresentation (DPER), an advanced unsupervised framework designed to address\nthe exceptionally ill-posed CT reconstruction inverse problems. DPER adopts the\nHalf Quadratic Splitting (HQS) algorithm to decompose the inverse problem into\ndata fidelity and distribution prior sub-problems. The two sub-problems are\nrespectively addressed by INR reconstruction scheme and pre-trained score-based\ndiffusion model. This combination initially preserves the implicit image local\nconsistency prior from INR. Additionally, it effectively augments the\nfeasibility of the solution space for the inverse problem through the\ngenerative diffusion model, resulting in increased stability and precision in\nthe solutions. We conduct comprehensive experiments to evaluate the performance\nof DPER on LACT and ultra-SVCT reconstruction with two public datasets (AAPM\nand LIDC). The results show that our method outperforms the state-of-the-art\nreconstruction methods on in-domain datasets, while achieving significant\nperformance improvements on out-of-domain datasets.\n","authors":["Chenhe Du","Xiyue Lin","Qing Wu","Xuanyu Tian","Ying Su","Zhe Luo","Hongjiang Wei","S. 
Kevin Zhou","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17890v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.17888v1","updated":"2024-04-27T12:53:50Z","published":"2024-04-27T12:53:50Z","title":"A Hybrid Approach for Document Layout Analysis in Document images","summary":" Document layout analysis involves understanding the arrangement of elements\nwithin a document. This paper navigates the complexities of understanding\nvarious elements within document images, such as text, images, tables, and\nheadings. The approach employs an advanced Transformer-based object detection\nnetwork as an innovative graphical page object detector for identifying tables,\nfigures, and displayed elements. We introduce a query encoding mechanism to\nprovide high-quality object queries for contrastive learning, enhancing\nefficiency in the decoder phase. We also present a hybrid matching scheme that\nintegrates the decoder's original one-to-one matching strategy with the\none-to-many matching strategy during the training phase. This approach aims to\nimprove the model's accuracy and versatility in detecting various graphical\nelements on a page. Our experiments on PubLayNet, DocLayNet, and PubTables\nbenchmarks show that our approach outperforms current state-of-the-art methods.\nIt achieves an average precision of 97.3% on PubLayNet, 81.6% on DocLayNet, and\n98.6 on PubTables, demonstrating its superior performance in layout analysis.\nThese advancements not only enhance the conversion of document images into\neditable and accessible formats but also streamline information retrieval and\ndata extraction processes.\n","authors":["Tahira Shehzadi","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2404.17888v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.17883v1","updated":"2024-04-27T12:42:26Z","published":"2024-04-27T12:42:26Z","title":"Underwater Variable Zoom-Depth-Guided Perception Network for Underwater\n Image Enhancement","summary":" Underwater scenes intrinsically involve degradation problems owing to\nheterogeneous ocean elements. Prevailing underwater image enhancement (UIE)\nmethods stick to straightforward feature modeling to learn the mapping\nfunction, which leads to limited vision gain as it lacks more explicit physical\ncues (e.g., depth). In this work, we investigate injecting the depth prior into\nthe deep UIE model for more precise scene enhancement capability. To this end,\nwe present a novel depth-guided perception UIE framework, dubbed underwater\nvariable zoom (UVZ). Specifically, UVZ resorts to a two-stage pipeline. First,\na depth estimation network is designed to generate critical depth maps,\ncombined with an auxiliary supervision network introduced to suppress\nestimation differences during training. Second, UVZ parses near-far scenarios\nby harnessing the predicted depth maps, enabling local and non-local perceiving\nin different regions. Extensive experiments on five benchmark datasets\ndemonstrate that UVZ achieves superior visual gain and delivers promising\nquantitative metrics. Besides, UVZ is confirmed to exhibit good generalization\nin some visual tasks, especially in unusual lighting conditions. 
The code,\nmodels and results are available at: https://github.com/WindySprint/UVZ.\n","authors":["Zhixiong Huang","Xinying Wang","Jinjiang Li","Shenglan Liu","Lin Feng"],"pdf_url":"https://arxiv.org/pdf/2404.17883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00952v4","updated":"2024-04-27T12:36:50Z","published":"2023-03-02T04:12:53Z","title":"Towards Activated Muscle Group Estimation in the Wild","summary":" In this paper, we tackle the new task of video-based Activated Muscle Group\nEstimation (AMGE) aiming at identifying active muscle regions during physical\nactivity in the wild. To this intent, we provide the MuscleMap dataset\nfeaturing >15K video clips with 135 different activities and 20 labeled muscle\ngroups. This dataset opens the vistas to multiple video-based applications in\nsports and rehabilitation medicine under flexible environment constraints. The\nproposed MuscleMap dataset is constructed with YouTube videos, specifically\ntargeting High-Intensity Interval Training (HIIT) physical exercise in the\nwild. To make the AMGE model applicable in real-life situations, it is crucial\nto ensure that the model can generalize well to numerous types of physical\nactivities not present during training and involving new combinations of\nactivated muscles. To achieve this, our benchmark also covers an evaluation\nsetting where the model is exposed to activity types excluded from the training\nset. Our experiments reveal that the generalizability of existing architectures\nadapted for the AMGE task remains a challenge. Therefore, we also propose a new\napproach, TransM3E, which employs a multi-modality feature fusion mechanism\nbetween both the video transformer model and the skeleton-based graph\nconvolution model with novel cross-modal knowledge distillation executed on\nmulti-classification tokens. The proposed method surpasses all popular video\nclassification models when dealing with both, previously seen and new types of\nphysical activities. The contributed dataset and code will be publicly\navailable at https://github.com/KPeng9510/MuscleMap.\n","authors":["Kunyu Peng","David Schneider","Alina Roitberg","Kailun Yang","Jiaming Zhang","Chen Deng","Kaiyu Zhang","M. Saquib Sarfraz","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2303.00952v4.pdf","comment":"The contributed dataset and code will be publicly available at\n https://github.com/KPeng9510/MuscleMap"},{"id":"http://arxiv.org/abs/2404.17878v1","updated":"2024-04-27T12:29:12Z","published":"2024-04-27T12:29:12Z","title":"Processing HSV Colored Medical Images and Adapting Color Thresholds for\n Computational Image Analysis: a Practical Introduction to an open-source tool","summary":" Background: Using artificial intelligence (AI) techniques for computational\nmedical image analysis has shown promising results. However, colored images are\noften not readily available for AI analysis because of different coloring\nthresholds used across centers and physicians as well as the removal of\nclinical annotations. 
We aimed to develop an open-source tool that can adapt\ndifferent color thresholds of HSV-colored medical images and remove annotations\nwith a simple click.\n Materials and Methods: We built a function using MATLAB and used multi-center\ninternational shear wave elastography data (NCT 02638935) to test the function.\nWe provide step-by-step instructions with accompanying code lines.\n Results: We demonstrate that the newly developed pre-processing function\nsuccessfully removed letters and adapted different color thresholds of\nHSV-colored medical images.\n Conclusion: We developed an open-source tool for removing letters and\nadapting different color thresholds in HSV-colored medical images. We hope this\ncontributes to advancing medical image processing for developing robust\ncomputational imaging algorithms using diverse multi-center big data. The\nopen-source Matlab tool is available at\nhttps://github.com/cailiemed/image-threshold-adapting.\n","authors":["Lie Cai","Andre Pfob"],"pdf_url":"https://arxiv.org/pdf/2404.17878v1.pdf","comment":"An open-source tool that can adapt different color thresholds of\n HSV-colored medical images. The newly developed pre-processing Matlab\n function successfully works on multi-center, international shear wave\n elastography data (NCT 02638935). Step-by-step instructions with accompanying\n code lines were provided, easy to follow and reproduce"},{"id":"http://arxiv.org/abs/2404.17876v1","updated":"2024-04-27T12:19:23Z","published":"2024-04-27T12:19:23Z","title":"DF-SLAM: Neural Feature Rendering Based on Dictionary Factors\n Representation for High-Fidelity Dense Visual SLAM System","summary":" We introduce a high-fidelity neural implicit dense visual Simultaneous\nLocalization and Mapping (SLAM) system, termed DF-SLAM. In our work, we employ\ndictionary factors for scene representation, encoding the geometry and\nappearance information of the scene as a combination of basis and coefficient\nfactors. Compared to neural implicit SLAM methods that directly encode scene\ninformation as features, our method exhibits superior scene detail\nreconstruction capabilities and more efficient memory usage, while our model\nsize is insensitive to the size of the scene map, making our method more\nsuitable for large-scale scenes. Additionally, we employ feature integration\nrendering to accelerate color rendering speed while ensuring color rendering\nquality, further enhancing the real-time performance of our neural SLAM method.\nExtensive experiments on synthetic and real-world datasets demonstrate that our\nmethod is competitive with existing state-of-the-art neural implicit SLAM\nmethods in terms of real-time performance, localization accuracy, and scene\nreconstruction quality. Our source code is available at\nhttps://github.com/funcdecl/DF-SLAM.\n","authors":["Weifeng Wei","Jie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.17876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16831v2","updated":"2024-04-27T12:08:00Z","published":"2024-04-25T17:59:59Z","title":"The Third Monocular Depth Estimation Challenge","summary":" This paper discusses the results of the third edition of the Monocular Depth\nEstimation Challenge (MDEC). The challenge focuses on zero-shot generalization\nto the challenging SYNS-Patches dataset, featuring complex scenes in natural\nand indoor settings. As with the previous edition, methods can use any form of\nsupervision, i.e. supervised or self-supervised. 
The challenge received a total\nof 19 submissions outperforming the baseline on the test set: 10 among them\nsubmitted a report describing their approach, highlighting a diffused use of\nfoundational models such as Depth Anything at the core of their method. The\nchallenge winners drastically improved 3D F-Score performance, from 17.51% to\n23.72%.\n","authors":["Jaime Spencer","Fabio Tosi","Matteo Poggi","Ripudaman Singh Arora","Chris Russell","Simon Hadfield","Richard Bowden","GuangYuan Zhou","ZhengXin Li","Qiang Rao","YiPing Bao","Xiao Liu","Dohyeong Kim","Jinseong Kim","Myunghyun Kim","Mykola Lavreniuk","Rui Li","Qing Mao","Jiang Wu","Yu Zhu","Jinqiu Sun","Yanning Zhang","Suraj Patni","Aradhye Agarwal","Chetan Arora","Pihai Sun","Kui Jiang","Gang Wu","Jian Liu","Xianming Liu","Junjun Jiang","Xidan Zhang","Jianing Wei","Fangjun Wang","Zhiming Tan","Jiabao Wang","Albert Luginov","Muhammad Shahzad","Seyed Hosseini","Aleksander Trajcevski","James H. Elder"],"pdf_url":"https://arxiv.org/pdf/2404.16831v2.pdf","comment":"To appear in CVPRW2024"},{"id":"http://arxiv.org/abs/2404.17867v1","updated":"2024-04-27T11:20:49Z","published":"2024-04-27T11:20:49Z","title":"Are Watermarks Bugs for Deepfake Detectors? Rethinking Proactive\n Forensics","summary":" AI-generated content has accelerated the topic of media synthesis,\nparticularly Deepfake, which can manipulate our portraits for positive or\nmalicious purposes. Before releasing these threatening face images, one\npromising forensics solution is the injection of robust watermarks to track\ntheir own provenance. However, we argue that current watermarking models,\noriginally devised for genuine images, may harm the deployed Deepfake detectors\nwhen directly applied to forged images, since the watermarks are prone to\noverlap with the forgery signals used for detection. To bridge this gap, we\nthus propose AdvMark, on behalf of proactive forensics, to exploit the\nadversarial vulnerability of passive detectors for good. Specifically, AdvMark\nserves as a plug-and-play procedure for fine-tuning any robust watermarking\ninto adversarial watermarking, to enhance the forensic detectability of\nwatermarked images; meanwhile, the watermarks can still be extracted for\nprovenance tracking. Extensive experiments demonstrate the effectiveness of the\nproposed AdvMark, leveraging robust watermarking to fool Deepfake detectors,\nwhich can help improve the accuracy of downstream Deepfake detection without\ntuning the in-the-wild detectors. We believe this work will shed some light on\nthe harmless proactive forensics against Deepfake.\n","authors":["Xiaoshuai Wu","Xin Liao","Bo Ou","Yuling Liu","Zheng Qin"],"pdf_url":"https://arxiv.org/pdf/2404.17867v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.17865v1","updated":"2024-04-27T11:13:55Z","published":"2024-04-27T11:13:55Z","title":"Vision-based Discovery of Nonlinear Dynamics for 3D Moving Target","summary":" Data-driven discovery of governing equations has kindled significant\ninterests in many science and engineering areas. Existing studies primarily\nfocus on uncovering equations that govern nonlinear dynamics based on direct\nmeasurement of the system states (e.g., trajectories). Limited efforts have\nbeen placed on distilling governing laws of dynamics directly from videos for\nmoving targets in a 3D space. 
To this end, we propose a vision-based approach\nto automatically uncover governing equations of nonlinear dynamics for 3D\nmoving targets via raw videos recorded by a set of cameras. The approach is\ncomposed of three key blocks: (1) a target tracking module that extracts plane\npixel motions of the moving target in each video, (2) a Rodrigues' rotation\nformula-based coordinate transformation learning module that reconstructs the\n3D coordinates with respect to a predefined reference point, and (3) a\nspline-enhanced library-based sparse regressor that uncovers the underlying\ngoverning law of dynamics. This framework is capable of effectively handling\nthe challenges associated with measurement data, e.g., noise in the video,\nimprecise tracking of the target that causes data missing, etc. The efficacy of\nour method has been demonstrated through multiple sets of synthetic videos\nconsidering different nonlinear dynamics.\n","authors":["Zitong Zhang","Yang Liu","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.17865v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.02194v2","updated":"2024-04-27T11:01:58Z","published":"2023-08-04T08:20:54Z","title":"Paired Competing Neurons Improving STDP Supervised Local Learning In\n Spiking Neural Networks","summary":" Direct training of Spiking Neural Networks (SNNs) on neuromorphic hardware\nhas the potential to significantly reduce the energy consumption of artificial\nneural network training. SNNs trained with Spike Timing-Dependent Plasticity\n(STDP) benefit from gradient-free and unsupervised local learning, which can be\neasily implemented on ultra-low-power neuromorphic hardware. However,\nclassification tasks cannot be performed solely with unsupervised STDP. In this\npaper, we propose Stabilized Supervised STDP (S2-STDP), a supervised STDP\nlearning rule to train the classification layer of an SNN equipped with\nunsupervised STDP for feature extraction. S2-STDP integrates error-modulated\nweight updates that align neuron spikes with desired timestamps derived from\nthe average firing time within the layer. Then, we introduce a training\narchitecture called Paired Competing Neurons (PCN) to further enhance the\nlearning capabilities of our classification layer trained with S2-STDP. PCN\nassociates each class with paired neurons and encourages neuron specialization\ntoward target or non-target samples through intra-class competition. We\nevaluate our methods on image recognition datasets, including MNIST,\nFashion-MNIST, and CIFAR-10. Results show that our methods outperform\nstate-of-the-art supervised STDP learning rules, for comparable architectures\nand numbers of neurons. Further analysis demonstrates that the use of PCN\nenhances the performance of S2-STDP, regardless of the hyperparameter set and\nwithout introducing any additional hyperparameters.\n","authors":["Gaspard Goupy","Pierre Tirilly","Ioan Marius Bilasco"],"pdf_url":"https://arxiv.org/pdf/2308.02194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17861v1","updated":"2024-04-27T10:40:52Z","published":"2024-04-27T10:40:52Z","title":"BoostRad: Enhancing Object Detection by Boosting Radar Reflections","summary":" Automotive radars have an important role in autonomous driving systems. The\nmain challenge in automotive radar detection is the radar's wide point spread\nfunction (PSF) in the angular domain that causes blurriness and clutter in the\nradar image. 
Numerous studies suggest employing an 'end-to-end' learning\nstrategy using a Deep Neural Network (DNN) to directly detect objects from\nradar images. This approach implicitly addresses the PSF's impact on objects of\ninterest. In this paper, we propose an alternative approach, which we term\n\"Boosting Radar Reflections\" (BoostRad). In BoostRad, a first DNN is trained to\nnarrow the PSF for all the reflection points in the scene. The output of the\nfirst DNN is a boosted reflection image with higher resolution and reduced\nclutter, resulting in a sharper and cleaner image. Subsequently, a second DNN\nis employed to detect objects within the boosted reflection image. We develop a\nnovel method for training the boosting DNN that incorporates domain knowledge\nof radar's PSF characteristics. BoostRad's performance is evaluated using the\nRADDet and CARRADA datasets, revealing its superiority over reference methods.\n","authors":["Yuval Haitman","Oded Bialer"],"pdf_url":"https://arxiv.org/pdf/2404.17861v1.pdf","comment":"WACV2024"},{"id":"http://arxiv.org/abs/2404.17854v1","updated":"2024-04-27T10:18:55Z","published":"2024-04-27T10:18:55Z","title":"GLIMS: Attention-Guided Lightweight Multi-Scale Hybrid Network for\n Volumetric Semantic Segmentation","summary":" Convolutional Neural Networks (CNNs) have become widely adopted for medical\nimage segmentation tasks, demonstrating promising performance. However, the\ninherent inductive biases in convolutional architectures limit their ability to\nmodel long-range dependencies and spatial correlations. While recent\ntransformer-based architectures address these limitations by leveraging\nself-attention mechanisms to encode long-range dependencies and learn\nexpressive representations, they often struggle to extract low-level features\nand are highly dependent on data availability. This motivated us for the\ndevelopment of GLIMS, a data-efficient attention-guided hybrid volumetric\nsegmentation network. GLIMS utilizes Dilated Feature Aggregator Convolutional\nBlocks (DACB) to capture local-global feature correlations efficiently.\nFurthermore, the incorporated Swin Transformer-based bottleneck bridges the\nlocal and global features to improve the robustness of the model. Additionally,\nGLIMS employs an attention-guided segmentation approach through Channel and\nSpatial-Wise Attention Blocks (CSAB) to localize expressive features for\nfine-grained border segmentation. Quantitative and qualitative results on\nglioblastoma and multi-organ CT segmentation tasks demonstrate GLIMS'\neffectiveness in terms of complexity and accuracy. GLIMS demonstrated\noutstanding performance on BraTS2021 and BTCV datasets, surpassing the\nperformance of Swin UNETR. Notably, GLIMS achieved this high performance with a\nsignificantly reduced number of trainable parameters. Specifically, GLIMS has\n47.16M trainable parameters and 72.30G FLOPs, while Swin UNETR has 61.98M\ntrainable parameters and 394.84G FLOPs. 
The code is publicly available on\nhttps://github.com/yaziciz/GLIMS.\n","authors":["Ziya Ata Yazıcı","İlkay Öksüz","Hazım Kemal Ekenel"],"pdf_url":"https://arxiv.org/pdf/2404.17854v1.pdf","comment":"The article was accepted for publication in the Image and Vision\n Computing journal"},{"id":"http://arxiv.org/abs/2404.14542v2","updated":"2024-04-27T09:55:51Z","published":"2024-04-22T19:29:12Z","title":"UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater\n Video Enhancement","summary":" Learning-based underwater image enhancement (UIE) methods have made great\nprogress. However, the lack of large-scale and high-quality paired training\nsamples has become the main bottleneck hindering the development of UIE. The\ninter-frame information in underwater videos can accelerate or optimize the UIE\nprocess. Thus, we constructed the first large-scale high-resolution underwater\nvideo enhancement benchmark (UVEB) to promote the development of underwater\nvision. It contains 1,308 pairs of video sequences and more than 453,000\nhigh-resolution frame pairs, 38\% of which are Ultra-High-Definition (UHD) 4K. UVEB\ncomes from multiple countries, containing various scenes and video degradation\ntypes to adapt to diverse and complex underwater environments. We also propose\nthe first supervised underwater video enhancement method, UVE-Net. UVE-Net\nconverts the current frame information into convolutional kernels and passes\nthem to adjacent frames for efficient inter-frame information exchange. By\nfully utilizing the redundant degraded information of underwater videos,\nUVE-Net completes video enhancement better. Experiments show the effective\nnetwork design and good performance of UVE-Net.\n","authors":["Yaofeng Xie","Lingwei Kong","Kai Chen","Ziqiang Zheng","Xiao Yu","Zhibin Yu","Bing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.14542v2.pdf","comment":"10 pages, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.17845v1","updated":"2024-04-27T09:46:49Z","published":"2024-04-27T09:46:49Z","title":"Instance-free Text to Point Cloud Localization with Relative Position\n Awareness","summary":" Text-to-point-cloud cross-modal localization is an emerging vision-language\ntask critical for future robot-human collaboration. It seeks to localize a\nposition from a city-scale point cloud scene based on a few natural language\ninstructions. In this paper, we address two key limitations of existing\napproaches: 1) their reliance on ground-truth instances as input; and 2) their\nneglect of the relative positions among potential instances. Our proposed model\nfollows a two-stage pipeline, including a coarse stage for text-cell retrieval\nand a fine stage for position estimation. In both stages, we introduce an\ninstance query extractor, in which the cells are encoded by a 3D sparse\nconvolution U-Net to generate the multi-scale point cloud features, and a set\nof queries iteratively attend to these features to represent instances. In the\ncoarse stage, a row-column relative position-aware self-attention (RowColRPA)\nmodule is designed to capture the spatial relations among the instance queries.\nIn the fine stage, a multi-modal relative position-aware cross-attention (RPCA)\nmodule is developed to fuse the text and point cloud features along with\nspatial relations for improving fine position estimation. 
Experiment results on\nthe KITTI360Pose dataset demonstrate that our model achieves competitive\nperformance with the state-of-the-art models without taking ground-truth\ninstances as input.\n","authors":["Lichao Wang","Zhihao Yuan","Jinke Ren","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.17845v1.pdf","comment":"12 pages, 10 figures, conference"},{"id":"http://arxiv.org/abs/2404.16617v2","updated":"2024-04-27T09:29:38Z","published":"2024-04-25T13:56:54Z","title":"Denoising: from classical methods to deep CNNs","summary":" This paper aims to explore the evolution of image denoising in a\npedagological way. We briefly review classical methods such as Fourier analysis\nand wavelet bases, highlighting the challenges they faced until the emergence\nof neural networks, notably the U-Net, in the 2010s. The remarkable performance\nof these networks has been demonstrated in studies such as Kadkhodaie et al.\n(2024). They exhibit adaptability to various image types, including those with\nfixed regularity, facial images, and bedroom scenes, achieving optimal results\nand biased towards geometry-adaptive harmonic basis. The introduction of score\ndiffusion has played a crucial role in image generation. In this context,\ndenoising becomes essential as it facilitates the estimation of probability\ndensity scores. We discuss the prerequisites for genuine learning of\nprobability densities, offering insights that extend from mathematical research\nto the implications of universal structures.\n","authors":["Jean-Eric Campagne"],"pdf_url":"https://arxiv.org/pdf/2404.16617v2.pdf","comment":"This document uses works by authors not yet presented to the\n community and may appear to be original"},{"id":"http://arxiv.org/abs/2404.17837v1","updated":"2024-04-27T09:02:42Z","published":"2024-04-27T09:02:42Z","title":"Hybrid 3D Human Pose Estimation with Monocular Video and Sparse IMUs","summary":" Temporal 3D human pose estimation from monocular videos is a challenging task\nin human-centered computer vision due to the depth ambiguity of 2D-to-3D\nlifting. To improve accuracy and address occlusion issues, inertial sensor has\nbeen introduced to provide complementary source of information. However, it\nremains challenging to integrate heterogeneous sensor data for producing\nphysically rational 3D human poses. In this paper, we propose a novel\nframework, Real-time Optimization and Fusion (RTOF), to address this issue. We\nfirst incorporate sparse inertial orientations into a parametric human skeleton\nto refine 3D poses in kinematics. The poses are then optimized by energy\nfunctions built on both visual and inertial observations to reduce the temporal\njitters. Our framework outputs smooth and biomechanically plausible human\nmotion. Comprehensive experiments with ablation studies demonstrate its\nrationality and efficiency. On Total Capture dataset, the pose estimation error\nis significantly decreased compared to the baseline method.\n","authors":["Yiming Bao","Xu Zhao","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2404.17837v1.pdf","comment":"10 pages, 5 figures, Under Review"},{"id":"http://arxiv.org/abs/2310.10586v2","updated":"2024-04-27T08:41:37Z","published":"2023-10-16T17:05:56Z","title":"VidCoM: Fast Video Comprehension through Large Language Models with\n Multimodal Tools","summary":" Building models that comprehends videos and responds specific user\ninstructions is a practical and challenging topic, as it requires mastery of\nboth vision understanding and knowledge reasoning. 
Compared to language and\nimage modalities, training efficiency remains a serious problem as existing\nstudies train models on massive sparse videos paired with brief descriptions.\nIn this paper, we introduce \\textbf{VidCoM}, a fast adaptive framework that\nleverages Large Language Models (LLMs) to reason about videos using lightweight\nvisual tools. Specifically, we reveal that the key to responding to specific\ninstructions is focusing on relevant video events, and utilize two visual\ntools, structured scene graph generation and descriptive image caption\ngeneration, to gather and represent the event information. Thus, an LLM enriched\nwith world knowledge is adopted as the reasoning agent to achieve the responses\nby performing multiple reasoning steps on specific video events. To address the\ndifficulty of LLMs identifying video events, we further propose an\nInstruction-oriented Video Events Recognition (InsOVER) algorithm. This\nalgorithm locates the corresponding video events based on an efficient\nHungarian matching between decompositions of linguistic instructions and video\nevents, thereby enabling LLMs to interact effectively with extended videos.\nExtensive experiments on two typical video comprehension tasks show that the\nproposed tuning-free framework outperforms pre-trained models including\nFlamingo-80B, achieving state-of-the-art performance. Our source code and\nsystem will be publicly available.\n","authors":["Ji Qi","Kaixuan Ji","Jifan Yu","Duokang Wang","Bin Xu","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2310.10586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16445v3","updated":"2024-04-27T08:41:13Z","published":"2023-11-28T03:00:59Z","title":"CLAP: Isolating Content from Style through Contrastive Learning with\n Augmented Prompts","summary":" Contrastive vision-language models, such as CLIP, have garnered considerable\nattention for various downstream tasks, mainly due to the remarkable ability of\nthe learned features for generalization. However, the features they learned\noften blend content and style information, which somewhat limits their\ngeneralization capabilities under distribution shifts. To address this\nlimitation, we adopt a causal generative perspective for multimodal data and\npropose contrastive learning with data augmentation to disentangle content\nfeatures from the original representations. To achieve this, we begin by\nexploring image augmentation techniques and develop a method to seamlessly\nintegrate them into pre-trained CLIP-like models to extract pure content\nfeatures. Taking a step further, recognizing the inherent semantic richness and\nlogical structure of text data, we explore the use of text augmentation to\nisolate latent content from style features. This enables CLIP-like models'\nencoders to concentrate on latent content information, refining the learned\nrepresentations by pre-trained CLIP-like models. Our extensive experiments\nacross diverse datasets demonstrate significant improvements in zero-shot and\nfew-shot classification tasks, alongside enhanced robustness to various\nperturbations. 
These results underscore the effectiveness of our proposed\nmethods in refining vision-language representations and advancing the\nstate-of-the-art in multimodal learning.\n","authors":["Yichao Cai","Yuhang Liu","Zhen Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2311.16445v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17830v1","updated":"2024-04-27T08:40:33Z","published":"2024-04-27T08:40:33Z","title":"Dynamic Against Dynamic: An Open-set Self-learning Framework","summary":" In open-set recognition, existing methods generally learn statically fixed\ndecision boundaries using known classes to reject unknown classes. Though they\nhave achieved promising results, such decision boundaries are evidently\ninsufficient for universal unknown classes in dynamic and open scenarios as\nthey can potentially appear at any position in the feature space. Moreover,\nthese methods just simply reject unknown class samples during testing without\nany effective utilization for them. In fact, such samples completely can\nconstitute the true instantiated representation of the unknown classes to\nfurther enhance the model's performance. To address these issues, this paper\nproposes a novel dynamic against dynamic idea, i.e., dynamic method against\ndynamic changing open-set world, where an open-set self-learning (OSSL)\nframework is correspondingly developed. OSSL starts with a good closed-set\nclassifier trained by known classes and utilizes available test samples for\nmodel adaptation during testing, thus gaining the adaptability to changing data\ndistributions. In particular, a novel self-matching module is designed for\nOSSL, which can achieve the adaptation in automatically identifying known class\nsamples while rejecting unknown class samples which are further utilized to\nenhance the discriminability of the model as the instantiated representation of\nunknown classes. Our method establishes new performance milestones respectively\nin almost all standard and cross-data benchmarks.\n","authors":["Haifeng Yang","Chuanxing Geng","PongChi Yuen","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17830v1.pdf","comment":"The first two authors contributed equally to this work. Accepted at\n IJCAI2024"},{"id":"http://arxiv.org/abs/2404.03892v3","updated":"2024-04-27T08:24:37Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The Deep learning (DL) models for diagnosing breast cancer from mammographic\nimages often operate as \"black boxes\", making it difficult for healthcare\nprofessionals to trust and understand their decision-making processes. The\nstudy presents an integrated framework combining Convolutional Neural Networks\n(CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced diagnosis\nof breast cancer using the CBIS-DDSM dataset. The methodology encompasses an\nelaborate data preprocessing pipeline and advanced data augmentation techniques\nto counteract dataset limitations and transfer learning using pre-trained\nnetworks such as VGG-16, Inception-V3 and ResNet was employed. A focal point of\nour study is the evaluation of XAI's effectiveness in interpreting model\npredictions, highlighted by utilizing the Hausdorff measure to assess the\nalignment between AI-generated explanations and expert annotations\nquantitatively. 
This approach is critical for XAI in promoting trustworthiness\nand ethical fairness in AI-assisted diagnostics. The findings from our research\nillustrate the effective collaboration between CNNs and XAI in advancing\ndiagnostic methods for breast cancer, thereby facilitating a more seamless\nintegration of advanced AI technologies within clinical settings. By enhancing\nthe interpretability of AI driven decisions, this work lays the groundwork for\nimproved collaboration between AI systems and medical practitioners, ultimately\nenriching patient care. Furthermore, the implications of our research extended\nwell beyond the current methodologies. It encourages further research into how\nto combine multimodal data and improve AI explanations to meet the needs of\nclinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17825v1","updated":"2024-04-27T08:13:13Z","published":"2024-04-27T08:13:13Z","title":"ODCR: Orthogonal Decoupling Contrastive Regularization for Unpaired\n Image Dehazing","summary":" Unpaired image dehazing (UID) holds significant research importance due to\nthe challenges in acquiring haze/clear image pairs with identical backgrounds.\nThis paper proposes a novel method for UID named Orthogonal Decoupling\nContrastive Regularization (ODCR). Our method is grounded in the assumption\nthat an image consists of both haze-related features, which influence the\ndegree of haze, and haze-unrelated features, such as texture and semantic\ninformation. ODCR aims to ensure that the haze-related features of the dehazing\nresult closely resemble those of the clear image, while the haze-unrelated\nfeatures align with the input hazy image. To accomplish the motivation,\nOrthogonal MLPs optimized geometrically on the Stiefel manifold are proposed,\nwhich can project image features into an orthogonal space, thereby reducing the\nrelevance between different features. Furthermore, a task-driven Depth-wise\nFeature Classifier (DWFC) is proposed, which assigns weights to the orthogonal\nfeatures based on the contribution of each channel's feature in predicting\nwhether the feature source is hazy or clear in a self-supervised fashion.\nFinally, a Weighted PatchNCE (WPNCE) loss is introduced to achieve the pulling\nof haze-related features in the output image toward those of clear images,\nwhile bringing haze-unrelated features close to those of the hazy input.\nExtensive experiments demonstrate the superior performance of our ODCR method\non UID.\n","authors":["Zhongze Wang","Haitao Zhao","Jingchao Peng","Lujian Yao","Kaijie Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.17825v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04956v2","updated":"2024-04-27T08:05:29Z","published":"2024-04-07T13:30:10Z","title":"Gaussian Shading: Provable Performance-Lossless Image Watermarking for\n Diffusion Models","summary":" Ethical concerns surrounding copyright protection and inappropriate content\ngeneration pose challenges for the practical implementation of diffusion\nmodels. One effective solution involves watermarking the generated images.\nHowever, existing methods often compromise the model performance or require\nadditional training, which is undesirable for operators and users. 
To address\nthis issue, we propose Gaussian Shading, a diffusion model watermarking\ntechnique that is both performance-lossless and training-free, while serving\nthe dual purpose of copyright protection and tracing of offending content. Our\nwatermark embedding is free of model parameter modifications and thus is\nplug-and-play. We map the watermark to latent representations following a\nstandard Gaussian distribution, which is indistinguishable from latent\nrepresentations obtained from the non-watermarked diffusion model. Therefore we\ncan achieve watermark embedding with lossless performance, for which we also\nprovide theoretical proof. Furthermore, since the watermark is intricately\nlinked with image semantics, it exhibits resilience to lossy processing and\nerasure attempts. The watermark can be extracted by Denoising Diffusion\nImplicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian\nShading on multiple versions of Stable Diffusion, and the results demonstrate\nthat Gaussian Shading not only is performance-lossless but also outperforms\nexisting methods in terms of robustness.\n","authors":["Zijin Yang","Kai Zeng","Kejiang Chen","Han Fang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04956v2.pdf","comment":"17 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07072v2","updated":"2024-04-27T07:39:45Z","published":"2024-04-10T15:02:26Z","title":"Implicit Multi-Spectral Transformer: An Lightweight and Effective\n Visible to Infrared Image Translation Model","summary":" In the field of computer vision, visible light images often exhibit low\ncontrast in low-light conditions, presenting a significant challenge. While\ninfrared imagery provides a potential solution, its utilization entails high\ncosts and practical limitations. Recent advancements in deep learning,\nparticularly the deployment of Generative Adversarial Networks (GANs), have\nfacilitated the transformation of visible light images to infrared images.\nHowever, these methods often experience unstable training phases and may\nproduce suboptimal outputs. To address these issues, we propose a novel\nend-to-end Transformer-based model that efficiently converts visible light\nimages into high-fidelity infrared images. Initially, the Texture Mapping\nModule and Color Perception Adapter collaborate to extract texture and color\nfeatures from the visible light image. The Dynamic Fusion Aggregation Module\nsubsequently integrates these features. Finally, the transformation into an\ninfrared image is refined through the synergistic action of the Color\nPerception Adapter and the Enhanced Perception Attention mechanism.\nComprehensive benchmarking experiments confirm that our model outperforms\nexisting methods, producing infrared images of markedly superior quality, both\nqualitatively and quantitatively. 
Furthermore, the proposed model enables more\neffective downstream applications for infrared images than other methods.\n","authors":["Yijia Chen","Pinghua Chen","Xiangxin Zhou","Yingtie Lei","Ziyang Zhou","Mingxian Li"],"pdf_url":"https://arxiv.org/pdf/2404.07072v2.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2305.18829v5","updated":"2024-04-27T07:16:13Z","published":"2023-05-30T08:23:06Z","title":"UniScene: Multi-Camera Unified Pre-training via 3D Scene Reconstruction\n for Autonomous Driving","summary":" Multi-camera 3D perception has emerged as a prominent research field in\nautonomous driving, offering a viable and cost-effective alternative to\nLiDAR-based solutions. The existing multi-camera algorithms primarily rely on\nmonocular 2D pre-training. However, the monocular 2D pre-training overlooks the\nspatial and temporal correlations among the multi-camera system. To address\nthis limitation, we propose the first multi-camera unified pre-training\nframework, called UniScene, which involves initially reconstructing the 3D\nscene as the foundational stage and subsequently fine-tuning the model on\ndownstream tasks. Specifically, we employ Occupancy as the general\nrepresentation for the 3D scene, enabling the model to grasp geometric priors\nof the surrounding world through pre-training. A significant benefit of\nUniScene is its capability to utilize a considerable volume of unlabeled\nimage-LiDAR pairs for pre-training purposes. The proposed multi-camera unified\npre-training framework demonstrates promising results in key tasks such as\nmulti-camera 3D object detection and surrounding semantic scene completion.\nWhen compared to monocular pre-training methods on the nuScenes dataset,\nUniScene shows a significant improvement of about 2.0% in mAP and 2.0% in NDS\nfor multi-camera 3D object detection, as well as a 3% increase in mIoU for\nsurrounding semantic scene completion. By adopting our unified pre-training\nmethod, a 25% reduction in 3D training annotation costs can be achieved,\noffering significant practical value for the implementation of real-world\nautonomous driving. Codes are publicly available at\nhttps://github.com/chaytonmin/UniScene.\n","authors":["Chen Min","Liang Xiao","Dawei Zhao","Yiming Nie","Bin Dai"],"pdf_url":"https://arxiv.org/pdf/2305.18829v5.pdf","comment":"Accepted by RAL2024"},{"id":"http://arxiv.org/abs/2404.00650v2","updated":"2024-04-27T07:05:43Z","published":"2024-03-31T11:37:43Z","title":"Deep Instruction Tuning for Segment Anything Model","summary":" Recently, Segment Anything Model (SAM) has become a research hotspot in the\nfields of multimedia and computer vision, which exhibits powerful yet versatile\ncapabilities on various (un) conditional image segmentation tasks. Although SAM\ncan support different types of segmentation prompts, we note that, compared to\npoint- and box-guided segmentations, it performs much worse on text-instructed\ntasks, e.g., referring image segmentation (RIS). In this paper, we argue that\ndeep text instruction tuning is key to mitigate such shortcoming caused by the\nshallow fusion scheme in its default light-weight mask decoder. To address this\nissue, we propose two simple yet effective deep instruction tuning (DIT)\nmethods for SAM, one is end-to-end and the other is layer-wise. 
With minimal\nmodifications, DITs can directly transform the image encoder of SAM as a\nstand-alone vision-language learner in contrast to building another deep fusion\nbranch, maximizing the benefit of its superior segmentation capability.\nExtensive experiments on three highly competitive benchmark datasets of RIS\nshow that a simple end-to-end DIT can improve SAM by a large margin, while the\nlayer-wise DIT can further boost the performance to state-of-the-art with much\nless data and training expenditures. Our code is released at:\nhttps://github.com/wysnzzzz/DIT.\n","authors":["Xiaorui Huang","Gen Luo","Chaoyang Zhu","Bo Tong","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.00650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17805v1","updated":"2024-04-27T07:05:41Z","published":"2024-04-27T07:05:41Z","title":"From Optimization to Generalization: Fair Federated Learning against\n Quality Shift via Inter-Client Sharpness Matching","summary":" Due to escalating privacy concerns, federated learning has been recognized as\na vital approach for training deep neural networks with decentralized medical\ndata. In practice, it is challenging to ensure consistent imaging quality\nacross various institutions, often attributed to equipment malfunctions\naffecting a minority of clients. This imbalance in image quality can cause the\nfederated model to develop an inherent bias towards higher-quality images, thus\nposing a severe fairness issue. In this study, we pioneer the identification\nand formulation of this new fairness challenge within the context of the\nimaging quality shift. Traditional methods for promoting fairness in federated\nlearning predominantly focus on balancing empirical risks across diverse client\ndistributions. This strategy primarily facilitates fair optimization across\ndifferent training data distributions, yet neglects the crucial aspect of\ngeneralization. To address this, we introduce a solution termed Federated\nlearning with Inter-client Sharpness Matching (FedISM). FedISM enhances both\nlocal training and global aggregation by incorporating sharpness-awareness,\naiming to harmonize the sharpness levels across clients for fair\ngeneralization. Our empirical evaluations, conducted using the widely-used ICH\nand ISIC 2019 datasets, establish FedISM's superiority over current\nstate-of-the-art federated learning methods in promoting fairness. Code is\navailable at https://github.com/wnn2000/FFL4MIA.\n","authors":["Nannan Wu","Zhuo Kuang","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2404.17805v1.pdf","comment":"This paper is accepted at IJCAI'24 (Main Track)"},{"id":"http://arxiv.org/abs/2404.17793v1","updated":"2024-04-27T06:18:23Z","published":"2024-04-27T06:18:23Z","title":"CLFT: Camera-LiDAR Fusion Transformer for Semantic Segmentation in\n Autonomous Driving","summary":" Critical research about camera-and-LiDAR-based semantic object segmentation\nfor autonomous driving significantly benefited from the recent development of\ndeep learning. Specifically, the vision transformer is the novel ground-breaker\nthat successfully brought the multi-head-attention mechanism to computer vision\napplications. Therefore, we propose a vision-transformer-based network to carry\nout camera-LiDAR fusion for semantic segmentation applied to autonomous\ndriving. 
Our proposal uses the novel progressive-assemble strategy of vision\ntransformers on a double-direction network and then integrates the results in a\ncross-fusion strategy over the transformer decoder layers. Unlike other works\nin the literature, our camera-LiDAR fusion transformers have been evaluated in\nchallenging conditions like rain and low illumination, showing robust\nperformance. The paper reports the segmentation results over the vehicle and\nhuman classes in different modalities: camera-only, LiDAR-only, and\ncamera-LiDAR fusion. We perform coherent controlled benchmark experiments of\nCLFT against other networks that are also designed for semantic segmentation.\nThe experiments aim to evaluate the performance of CLFT independently from two\nperspectives: multimodal sensor fusion and backbone architectures. The\nquantitative assessments show our CLFT networks yield an improvement of up to\n10\\% for challenging dark-wet conditions when compared with a\nFully-Convolutional-Neural-Network-based (FCN) camera-LiDAR fusion neural\nnetwork. Compared to the network with a transformer backbone but single-modality\ninput, the all-around improvement is 5-10\\%.\n","authors":["Junyi Gu","Mauro Bellone","Tomáš Pivoňka","Raivo Sell"],"pdf_url":"https://arxiv.org/pdf/2404.17793v1.pdf","comment":"Submitted to IEEE Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2304.10701v7","updated":"2024-04-27T05:45:34Z","published":"2023-04-21T02:02:02Z","title":"GMValuator: Similarity-based Data Valuation for Generative Models","summary":" Data valuation plays a crucial role in machine learning. Existing data\nvaluation methods have primarily focused on discriminative models, neglecting\ngenerative models that have recently gained considerable attention. The few\nexisting data valuation methods designed for deep generative models\neither concentrate on specific models or lack robustness in their outcomes.\nMoreover, their efficiency remains a notable shortcoming. To bridge the gaps,\nwe formulate the data valuation problem in generative models from a\nsimilarity-matching perspective. Specifically, we introduce Generative Model\nValuator (GMValuator), the first training-free and model-agnostic approach to\nprovide data valuation for generation tasks. It empowers efficient data\nvaluation through our innovative similarity-matching module, calibrates\nbiased contributions by incorporating image quality assessment, and attributes\ncredits to all training samples based on their contributions to the generated\nsamples. Additionally, we introduce four evaluation criteria for assessing data\nvaluation methods in generative models, aligning with principles of\nplausibility and truthfulness. GMValuator is extensively evaluated on various\ndatasets and generative architectures to demonstrate its effectiveness.\n","authors":["Jiaxi Yang","Wenglong Deng","Benlin Liu","Yangsibo Huang","James Zou","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2304.10701v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17774v1","updated":"2024-04-27T04:13:39Z","published":"2024-04-27T04:13:39Z","title":"High-quality Surface Reconstruction using Gaussian Surfels","summary":" We propose a novel point-based representation, Gaussian surfels, to combine\nthe advantages of the flexible optimization procedure in 3D Gaussian points and\nthe surface alignment property of surfels. 
This is achieved by directly setting\nthe z-scale of 3D Gaussian points to 0, effectively flattening the original 3D\nellipsoid into a 2D ellipse. Such a design provides clear guidance to the\noptimizer. By treating the local z-axis as the normal direction, it greatly\nimproves optimization stability and surface alignment. While the derivatives to\nthe local z-axis computed from the covariance matrix are zero in this setting,\nwe design a self-supervised normal-depth consistency loss to remedy this issue.\nMonocular normal priors and foreground masks are incorporated to enhance the\nquality of the reconstruction, mitigating issues related to highlights and\nbackground. We propose a volumetric cutting method to aggregate the information\nof Gaussian surfels so as to remove erroneous points in depth maps generated by\nalpha blending. Finally, we apply screened Poisson reconstruction method to the\nfused depth maps to extract the surface mesh. Experimental results show that\nour method demonstrates superior performance in surface reconstruction compared\nto state-of-the-art neural volume rendering and point-based rendering methods.\n","authors":["Pinxuan Dai","Jiamin Xu","Wenxiang Xie","Xinguo Liu","Huamin Wang","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.17774v1.pdf","comment":"Original version"},{"id":"http://arxiv.org/abs/2404.17773v1","updated":"2024-04-27T04:09:49Z","published":"2024-04-27T04:09:49Z","title":"Compressing Latent Space via Least Volume","summary":" This paper introduces Least Volume-a simple yet effective regularization\ninspired by geometric intuition-that can reduce the necessary number of latent\ndimensions needed by an autoencoder without requiring any prior knowledge of\nthe intrinsic dimensionality of the dataset. We show that the Lipschitz\ncontinuity of the decoder is the key to making it work, provide a proof that\nPCA is just a linear special case of it, and reveal that it has a similar\nPCA-like importance ordering effect when applied to nonlinear models. We\ndemonstrate the intuition behind the regularization on some pedagogical toy\nproblems, and its effectiveness on several benchmark problems, including MNIST,\nCIFAR-10 and CelebA.\n","authors":["Qiuyi Chen","Mark Fuge"],"pdf_url":"https://arxiv.org/pdf/2404.17773v1.pdf","comment":"24 pages, International Conference on Learning Representations 2024"},{"id":"http://arxiv.org/abs/2404.17771v1","updated":"2024-04-27T03:55:53Z","published":"2024-04-27T03:55:53Z","title":"Charaterization of dim light response in DVS pixel: Discontinuity of\n event triggering time","summary":" Dynamic Vision Sensors (DVS) have recently generated great interest because\nof the advantages of wide dynamic range and low latency compared with\nconventional frame-based cameras. However, the complicated behaviors in dim\nlight conditions are still not clear, restricting the applications of DVS. In\nthis paper, we analyze the typical DVS circuit, and find that there exists\ndiscontinuity of event triggering time. In dim light conditions, the\ndiscontinuity becomes prominent. We point out that the discontinuity depends\nexclusively on the changing speed of light intensity. 
Experimental results on\nreal event data validate the analysis and the existence of discontinuity that\nreveals the non-first-order behaviors of DVS in dim light conditions.\n","authors":["Xiao Jiang","Fei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17771v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.17768v1","updated":"2024-04-27T03:30:50Z","published":"2024-04-27T03:30:50Z","title":"Make the Most of Your Data: Changing the Training Data Distribution to\n Improve In-distribution Generalization Performance","summary":" Can we modify the training data distribution to encourage the underlying\noptimization method toward finding solutions with superior generalization\nperformance on in-distribution data? In this work, we approach this question\nfor the first time by comparing the inductive bias of gradient descent (GD)\nwith that of sharpness-aware minimization (SAM). By studying a two-layer CNN,\nwe prove that SAM learns easy and difficult features more uniformly,\nparticularly in early epochs. That is, SAM is less susceptible to simplicity\nbias compared to GD. Based on this observation, we propose USEFUL, an algorithm\nthat clusters examples based on the network output early in training and\nupsamples examples with no easy features to alleviate the pitfalls of the\nsimplicity bias. We show empirically that modifying the training data\ndistribution in this way effectively improves the generalization performance on\nthe original data distribution when training with (S)GD by mimicking the\ntraining dynamics of SAM. Notably, we demonstrate that our method can be\ncombined with SAM and existing data augmentation strategies to achieve, to the\nbest of our knowledge, state-of-the-art performance for training ResNet18 on\nCIFAR10, STL10, CINIC10, Tiny-ImageNet; ResNet34 on CIFAR100; and VGG19 and\nDenseNet121 on CIFAR10.\n","authors":["Dang Nguyen","Paymon Haddad","Eric Gan","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2404.17768v1.pdf","comment":"32 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.17765v1","updated":"2024-04-27T03:07:07Z","published":"2024-04-27T03:07:07Z","title":"RFL-CDNet: Towards Accurate Change Detection via Richer Feature Learning","summary":" Change Detection is a crucial but extremely challenging task of remote\nsensing image analysis, and much progress has been made with the rapid\ndevelopment of deep learning. However, most existing deep learning-based change\ndetection methods mainly focus on intricate feature extraction and multi-scale\nfeature fusion, while ignoring the insufficient utilization of features in the\nintermediate stages, thus resulting in sub-optimal results. To this end, we\npropose a novel framework, named RFL-CDNet, that utilizes richer feature\nlearning to boost change detection performance. Specifically, we first\nintroduce deep multiple supervision to enhance intermediate representations,\nthus unleashing the potential of backbone feature extractor at each stage.\nFurthermore, we design the Coarse-To-Fine Guiding (C2FG) module and the\nLearnable Fusion (LF) module to further improve feature learning and obtain\nmore discriminative feature representations. The C2FG module aims to seamlessly\nintegrate the side prediction from the previous coarse-scale into the current\nfine-scale prediction in a coarse-to-fine manner, while LF module assumes that\nthe contribution of each stage and each spatial location is independent, thus\ndesigning a learnable module to fuse multiple predictions. 
Experiments on\nseveral benchmark datasets show that our proposed RFL-CDNet achieves\nstate-of-the-art performance on WHU cultivated land dataset and CDD dataset,\nand the second-best performance on WHU building dataset. The source code and\nmodels are publicly available at https://github.com/Hhaizee/RFL-CDNet.\n","authors":["Yuhang Gan","Wenjie Xuan","Hang Chen","Juhua Liu","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2404.17765v1.pdf","comment":"Accepted by PR, volume 153"},{"id":"http://arxiv.org/abs/2404.14025v2","updated":"2024-04-27T02:48:39Z","published":"2024-04-22T09:41:03Z","title":"DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose\n Estimation","summary":" Multi-person pose estimation (MPPE) presents a formidable yet crucial\nchallenge in computer vision. Most existing methods predominantly concentrate\non isolated interaction either between instances or joints, which is inadequate\nfor scenarios demanding concurrent localization of both instances and joints.\nThis paper introduces a novel CNN-based single-stage method, named Dual-path\nHierarchical Relation Network (DHRNet), to extract instance-to-joint and\njoint-to-instance interactions concurrently. Specifically, we design a\ndual-path interaction modeling module (DIM) that strategically organizes\ncross-instance and cross-joint interaction modeling modules in two\ncomplementary orders, enriching interaction information by integrating merits\nfrom different correlation modeling branches. Notably, DHRNet excels in joint\nlocalization by leveraging information from other instances and joints.\nExtensive evaluations on challenging datasets, including COCO, CrowdPose, and\nOCHuman datasets, showcase DHRNet's state-of-the-art performance. The code will\nbe released at https://github.com/YHDang/dhrnet-multi-pose-estimation.\n","authors":["Yonghao Dang","Jianqin Yin","Liyuan Liu","Pengxiang Ding","Yuan Sun","Yanzhu Hu"],"pdf_url":"https://arxiv.org/pdf/2404.14025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17762v1","updated":"2024-04-27T02:40:36Z","published":"2024-04-27T02:40:36Z","title":"Large Multi-modality Model Assisted AI-Generated Image Quality\n Assessment","summary":" Traditional deep neural network (DNN)-based image quality assessment (IQA)\nmodels leverage convolutional neural networks (CNN) or Transformer to learn the\nquality-aware feature representation, achieving commendable performance on\nnatural scene images. However, when applied to AI-Generated images (AGIs),\nthese DNN-based IQA models exhibit subpar performance. This situation is\nlargely due to the semantic inaccuracies inherent in certain AGIs caused by\nuncontrollable nature of the generation process. Thus, the capability to\ndiscern semantic content becomes crucial for assessing the quality of AGIs.\nTraditional DNN-based IQA models, constrained by limited parameter complexity\nand training data, struggle to capture complex fine-grained semantic features,\nmaking it challenging to grasp the existence and coherence of semantic content\nof the entire image. To address the shortfall in semantic content perception of\ncurrent IQA models, we introduce a large Multi-modality model Assisted\nAI-Generated Image Quality Assessment (MA-AGIQA) model, which utilizes\nsemantically informed guidance to sense semantic information and extract\nsemantic vectors through carefully designed text prompts. 
Moreover, it employs\na mixture of experts (MoE) structure to dynamically integrate the semantic\ninformation with the quality-aware features extracted by traditional DNN-based\nIQA models. Comprehensive experiments conducted on two AI-generated content\ndatasets, AIGCQA-20k and AGIQA-3k show that MA-AGIQA achieves state-of-the-art\nperformance, and demonstrate its superior generalization capabilities on\nassessing the quality of AGIs. Code is available at\nhttps://github.com/wangpuyi/MA-AGIQA.\n","authors":["Puyi Wang","Wei Sun","Zicheng Zhang","Jun Jia","Yanwei Jiang","Zhichao Zhang","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.17762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02985v2","updated":"2024-04-27T02:38:40Z","published":"2024-02-05T13:16:12Z","title":"Applying Unsupervised Semantic Segmentation to High-Resolution UAV\n Imagery for Enhanced Road Scene Parsing","summary":" There are two challenges presented in parsing road scenes from UAV images:\nthe complexity of processing high-resolution images and the dependency on\nextensive manual annotations required by traditional supervised deep learning\nmethods to train robust and accurate models. In this paper, a novel\nunsupervised road parsing framework that leverages advancements in vision\nlanguage models with fundamental computer vision techniques is introduced to\naddress these critical challenges. Our approach initiates with a vision\nlanguage model that efficiently processes ultra-high resolution images to\nrapidly identify road regions of interest. Subsequent application of the vision\nfoundation model, SAM, generates masks for these regions without requiring\ncategory information. A self-supervised learning network then processes these\nmasked regions to extract feature representations, which are clustered using an\nunsupervised algorithm that assigns unique IDs to each feature cluster. The\nmasked regions are combined with the corresponding IDs to generate initial\npseudo-labels, which initiate an iterative self-training process for regular\nsemantic segmentation. Remarkably, the proposed method achieves a mean\nIntersection over Union (mIoU) of 89.96% on the development dataset without any\nmanual annotation, demonstrating extraordinary flexibility by surpassing the\nlimitations of human-defined categories, and autonomously acquiring knowledge\nof new categories from the dataset itself.\n","authors":["Zihan Ma","Yongshang Li","Ronggui Ma","Chen Liang"],"pdf_url":"https://arxiv.org/pdf/2402.02985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17760v1","updated":"2024-04-27T02:35:15Z","published":"2024-04-27T02:35:15Z","title":"Adversarial Examples: Generation Proposal in the Context of Facial\n Recognition Systems","summary":" In this paper we investigate the vulnerability that facial recognition\nsystems present to adversarial examples by introducing a new methodology from\nthe attacker perspective. The technique is based on the use of the autoencoder\nlatent space, organized with principal component analysis. We intend to analyze\nthe potential to craft adversarial examples suitable for both dodging and\nimpersonation attacks, against state-of-the-art systems. Our initial\nhypothesis, which was not strongly favoured by the results, stated that it\nwould be possible to separate between the \"identity\" and \"facial expression\"\nfeatures to produce high-quality examples. 
Despite the findings not supporting\nit, the results sparked insights into adversarial examples generation and\nopened new research avenues in the area.\n","authors":["Marina Fuster","Ignacio Vidaurreta"],"pdf_url":"https://arxiv.org/pdf/2404.17760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17753v1","updated":"2024-04-27T02:04:36Z","published":"2024-04-27T02:04:36Z","title":"Leveraging Cross-Modal Neighbor Representation for Improved CLIP\n Classification","summary":" CLIP showcases exceptional cross-modal matching capabilities due to its\ntraining on image-text contrastive learning tasks. However, without specific\noptimization for unimodal scenarios, its performance in single-modality feature\nextraction might be suboptimal. Despite this, some studies have directly used\nCLIP's image encoder for tasks like few-shot classification, introducing a\nmisalignment between its pre-training objectives and feature extraction\nmethods. This inconsistency can diminish the quality of the image's feature\nrepresentation, adversely affecting CLIP's effectiveness in target tasks. In\nthis paper, we view text features as precise neighbors of image features in\nCLIP's space and present a novel CrOss-moDal nEighbor Representation(CODER)\nbased on the distance structure between images and their neighbor texts. This\nfeature extraction method aligns better with CLIP's pre-training objectives,\nthereby fully leveraging CLIP's robust cross-modal capabilities. The key to\nconstruct a high-quality CODER lies in how to create a vast amount of\nhigh-quality and diverse texts to match with images. We introduce the Auto Text\nGenerator(ATG) to automatically generate the required texts in a data-free and\ntraining-free manner. We apply CODER to CLIP's zero-shot and few-shot image\nclassification tasks. Experiment results across various datasets and models\nconfirm CODER's effectiveness. Code is available\nat:https://github.com/YCaigogogo/CVPR24-CODER.\n","authors":["Chao Yi","Lu Ren","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2404.17753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00784v2","updated":"2024-04-27T01:53:39Z","published":"2023-12-01T18:59:56Z","title":"ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual\n Prompts","summary":" While existing large vision-language multimodal models focus on whole image\nunderstanding, there is a prominent gap in achieving region-specific\ncomprehension. Current approaches that use textual coordinates or spatial\nencodings often fail to provide a user-friendly interface for visual prompting.\nTo address this challenge, we introduce a novel multimodal model capable of\ndecoding arbitrary visual prompts. This allows users to intuitively mark images\nand interact with the model using natural cues like a \"red bounding box\" or\n\"pointed arrow\". Our simple design directly overlays visual markers onto the\nRGB image, eliminating the need for complex region encodings, yet achieves\nstate-of-the-art performance on region-understanding tasks like Visual7W,\nPointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present\nViP-Bench, a comprehensive benchmark to assess the capability of models in\nunderstanding visual prompts across multiple dimensions, enabling future\nresearch in this domain. Code, data, and model are publicly available.\n","authors":["Mu Cai","Haotian Liu","Dennis Park","Siva Karthik Mustikovela","Gregory P. 
Meyer","Yuning Chai","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.00784v2.pdf","comment":"Accepted to CVPR2024. Project page: https://vip-llava.github.io/"},{"id":"http://arxiv.org/abs/2404.17747v1","updated":"2024-04-27T01:35:21Z","published":"2024-04-27T01:35:21Z","title":"MMA-UNet: A Multi-Modal Asymmetric UNet Architecture for Infrared and\n Visible Image Fusion","summary":" Multi-modal image fusion (MMIF) maps useful information from various\nmodalities into the same representation space, thereby producing an informative\nfused image. However, the existing fusion algorithms tend to symmetrically fuse\nthe multi-modal images, causing the loss of shallow information or bias towards\na single modality in certain regions of the fusion results. In this study, we\nanalyzed the spatial distribution differences of information in different\nmodalities and proved that encoding features within the same network is not\nconducive to achieving simultaneous deep feature space alignment for\nmulti-modal images. To overcome this issue, a Multi-Modal Asymmetric UNet\n(MMA-UNet) was proposed. We separately trained specialized feature encoders for\ndifferent modal and implemented a cross-scale fusion strategy to maintain the\nfeatures from different modalities within the same representation space,\nensuring a balanced information fusion process. Furthermore, extensive fusion\nand downstream task experiments were conducted to demonstrate the efficiency of\nMMA-UNet in fusing infrared and visible image information, producing visually\nnatural and semantically rich fusion results. Its performance surpasses that of\nthe state-of-the-art comparison fusion methods.\n","authors":["Jingxue Huang","Xilai Li","Tianshu Tan","Xiaosong Li","Tao Ye"],"pdf_url":"https://arxiv.org/pdf/2404.17747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17745v1","updated":"2024-04-27T01:22:45Z","published":"2024-04-27T01:22:45Z","title":"An Attention-Based Deep Learning Architecture for Real-Time Monocular\n Visual Odometry: Applications to GPS-free Drone Navigation","summary":" Drones are increasingly used in fields like industry, medicine, research,\ndisaster relief, defense, and security. Technical challenges, such as\nnavigation in GPS-denied environments, hinder further adoption. Research in\nvisual odometry is advancing, potentially solving GPS-free navigation issues.\nTraditional visual odometry methods use geometry-based pipelines which, while\npopular, often suffer from error accumulation and high computational demands.\nRecent studies utilizing deep neural networks (DNNs) have shown improved\nperformance, addressing these drawbacks. Deep visual odometry typically employs\nconvolutional neural networks (CNNs) and sequence modeling networks like\nrecurrent neural networks (RNNs) to interpret scenes and deduce visual odometry\nfrom video sequences. This paper presents a novel real-time monocular visual\nodometry model for drones, using a deep neural architecture with a\nself-attention module. It estimates the ego-motion of a camera on a drone,\nusing consecutive video frames. An inference utility processes the live video\nfeed, employing deep learning to estimate the drone's trajectory. The\narchitecture combines a CNN for image feature extraction and a long short-term\nmemory (LSTM) network with a multi-head attention module for video sequence\nmodeling. 
Tested on two visual odometry datasets, this model converged 48%\nfaster than a previous RNN model and showed a 22% reduction in mean\ntranslational drift and a 12% improvement in mean translational absolute\ntrajectory error, demonstrating enhanced robustness to noise.\n","authors":["Olivier Brochu Dufour","Abolfazl Mohebbi","Sofiane Achiche"],"pdf_url":"https://arxiv.org/pdf/2404.17745v1.pdf","comment":"22 Pages, 3 Tables, 9 Figures"},{"id":"http://arxiv.org/abs/2404.17742v1","updated":"2024-04-27T00:49:39Z","published":"2024-04-27T00:49:39Z","title":"Segmentation Quality and Volumetric Accuracy in Medical Imaging","summary":" Current medical image segmentation relies on the region-based (Dice,\nF1-score) and boundary-based (Hausdorff distance, surface distance) metrics as\nthe de-facto standard. While these metrics are widely used, they lack a unified\ninterpretation, particularly regarding volume agreement. Clinicians often lack\nclear benchmarks to gauge the \"goodness\" of segmentation results based on these\nmetrics. Recognizing the clinical relevance of volumetry, we utilize relative\nvolume prediction error (vpe) to directly assess the accuracy of volume\npredictions derived from segmentation tasks. Our work integrates theoretical\nanalysis and empirical validation across diverse datasets. We delve into the\noften-ambiguous relationship between segmentation quality (measured by Dice)\nand volumetric accuracy in clinical practice. Our findings highlight the\ncritical role of incorporating volumetric prediction accuracy into segmentation\nevaluation. This approach empowers clinicians with a more nuanced understanding\nof segmentation performance, ultimately improving the interpretation and\nutility of these metrics in real-world healthcare settings.\n","authors":["Zheyuan Zhang","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2404.17742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00964v2","updated":"2024-04-27T00:19:58Z","published":"2023-08-02T06:41:19Z","title":"ForensicsForest Family: A Series of Multi-scale Hierarchical Cascade\n Forests for Detecting GAN-generated Faces","summary":" The prominent progress in generative models has significantly improved the\nreality of generated faces, bringing serious concerns to society. Since recent\nGAN-generated faces are highly realistic, the forgery traces have become more\nimperceptible, increasing the forensics challenge. To combat GAN-generated\nfaces, many countermeasures based on Convolutional Neural Networks (CNNs) have\nbeen spawned due to their strong learning ability. In this paper, we rethink\nthis problem and explore a new approach based on forest models instead of CNNs.\nSpecifically, we describe a simple and effective forest-based method set called\n{\\em ForensicsForest Family} to detect GAN-generated faces. The proposed\nForensicsForest family is composed of three variants, which are {\\em\nForensicsForest}, {\\em Hybrid ForensicsForest} and {\\em Divide-and-Conquer\nForensicsForest} respectively. ForensicsForest is a newly proposed Multi-scale\nHierarchical Cascade Forest, which takes semantic, frequency and biology\nfeatures as input, hierarchically cascades different levels of features for\nauthenticity prediction, and then employs a multi-scale ensemble scheme that\ncan comprehensively consider different levels of information to improve the\nperformance further. 
Based on ForensicsForest, we develop Hybrid\nForensicsForest, an extended version that integrates the CNN layers into\nmodels, to further refine the effectiveness of augmented features. Moreover, to\nreduce the memory cost in training, we propose Divide-and-Conquer\nForensicsForest, which can construct a forest model using only a portion of\ntraining samplings. In the training stage, we train several candidate forest\nmodels using the subsets of training samples. Then a ForensicsForest is\nassembled by picking the suitable components from these candidate forest\nmodels...\n","authors":["Jiucui Lu","Jiaran Zhou","Junyu Dong","Bin Li","Siwei Lyu","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2308.00964v2.pdf","comment":"To Appear in IEEE TIFS 2024"},{"id":"http://arxiv.org/abs/2404.17736v1","updated":"2024-04-27T00:12:13Z","published":"2024-04-27T00:12:13Z","title":"Diffusion-Aided Joint Source Channel Coding For High Realism Wireless\n Image Transmission","summary":" Deep learning-based joint source-channel coding (deep JSCC) has been\ndemonstrated as an effective approach for wireless image transmission.\nNevertheless, current research has concentrated on minimizing a standard\ndistortion metric such as Mean Squared Error (MSE), which does not necessarily\nimprove the perceptual quality. To address this issue, we propose DiffJSCC, a\nnovel framework that leverages pre-trained text-to-image diffusion models to\nenhance the realism of images transmitted over the channel. The proposed\nDiffJSCC utilizes prior deep JSCC frameworks to deliver an initial\nreconstructed image at the receiver. Then, the spatial and textual features are\nextracted from the initial reconstruction, which, together with the channel\nstate information (e.g., signal-to-noise ratio, SNR), are passed to a control\nmodule to fine-tune the pre-trained Stable Diffusion model. Extensive\nexperiments on the Kodak dataset reveal that our method significantly surpasses\nboth conventional methods and prior deep JSCC approaches on perceptual metrics\nsuch as LPIPS and FID scores, especially with poor channel conditions and\nlimited bandwidth. Notably, DiffJSCC can achieve highly realistic\nreconstructions for 768x512 pixel Kodak images with only 3072 symbols (<0.008\nsymbols per pixel) under 1dB SNR. Our code will be released in\nhttps://github.com/mingyuyng/DiffJSCC.\n","authors":["Mingyu Yang","Bowen Liu","Boyang Wang","Hun-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2404.17736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01198v3","updated":"2024-04-27T00:05:18Z","published":"2023-06-01T23:23:37Z","title":"Confidence Intervals for Error Rates in 1:1 Matching Tasks: Critical\n Statistical Analysis and Recommendations","summary":" Matching algorithms are commonly used to predict matches between items in a\ncollection. For example, in 1:1 face verification, a matching algorithm\npredicts whether two face images depict the same person. Accurately assessing\nthe uncertainty of the error rates of such algorithms can be challenging when\ndata are dependent and error rates are low, two aspects that have been often\noverlooked in the literature. In this work, we review methods for constructing\nconfidence intervals for error rates in 1:1 matching tasks. We derive and\nexamine the statistical properties of these methods, demonstrating how coverage\nand interval width vary with sample size, error rates, and degree of data\ndependence on both analysis and experiments with synthetic and real-world\ndatasets. 
Based on our findings, we provide recommendations for best practices\nfor constructing confidence intervals for error rates in 1:1 matching tasks.\n","authors":["Riccardo Fogliato","Pratik Patil","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2306.01198v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.08123v4","updated":"2024-04-27T00:00:29Z","published":"2021-09-16T17:21:52Z","title":"Neural Étendue Expander for Ultra-Wide-Angle High-Fidelity\n Holographic Display","summary":" Holographic displays can generate light fields by dynamically modulating the\nwavefront of a coherent beam of light using a spatial light modulator,\npromising rich virtual and augmented reality applications. However, the limited\nspatial resolution of existing dynamic spatial light modulators imposes a tight\nbound on the diffraction angle. As a result, modern holographic displays\npossess low \\'{e}tendue, which is the product of the display area and the\nmaximum solid angle of diffracted light. The low \\'{e}tendue forces a sacrifice\nof either the field-of-view (FOV) or the display size. In this work, we lift\nthis limitation by presenting neural \\'{e}tendue expanders. This new breed of\noptical elements, which is learned from a natural image dataset, enables higher\ndiffraction angles for ultra-wide FOV while maintaining both a compact form\nfactor and the fidelity of displayed contents to human viewers. With neural\n\\'{e}tendue expanders, we experimentally achieve 64$\\times$ \\'{e}tendue\nexpansion of natural images in full color, expanding the FOV by an order of\nmagnitude horizontally and vertically, with high-fidelity reconstruction\nquality (measured in PSNR) over 29 dB on retinal-resolution images.\n","authors":["Ethan Tseng","Grace Kuo","Seung-Hwan Baek","Nathan Matsuda","Andrew Maimone","Florian Schiffers","Praneeth Chakravarthula","Qiang Fu","Wolfgang Heidrich","Douglas Lanman","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2109.08123v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18952v1","updated":"2024-04-27T20:09:40Z","published":"2024-04-27T20:09:40Z","title":"CUE-Net: Violence Detection Video Analytics with Spatial Cropping,\n Enhanced UniformerV2 and Modified Efficient Additive Attention","summary":" In this paper we introduce CUE-Net, a novel architecture designed for\nautomated violence detection in video surveillance. As surveillance systems\nbecome more prevalent due to technological advances and decreasing costs, the\nchallenge of efficiently monitoring vast amounts of video data has intensified.\nCUE-Net addresses this challenge by combining spatial Cropping with an enhanced\nversion of the UniformerV2 architecture, integrating convolutional and\nself-attention mechanisms alongside a novel Modified Efficient Additive\nAttention mechanism (which reduces the quadratic time complexity of\nself-attention) to effectively and efficiently identify violent activities.\nThis approach aims to overcome traditional challenges such as capturing distant\nor partially obscured subjects within video frames. 
By focusing on both local\nand global spatiotemporal features, CUE-Net achieves state-of-the-art\nperformance on the RWF-2000 and RLVS datasets, surpassing existing methods.\n","authors":["Damith Chamalke Senadeera","Xiaoyun Yang","Dimitrios Kollias","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.18952v1.pdf","comment":"To be published in the proceedings of 2024 IEEE/CVF Conference on\n Computer Vision and Pattern Recognition Workshops (CVPRW)"},{"id":"http://arxiv.org/abs/2404.19640v1","updated":"2024-04-27T01:34:46Z","published":"2024-04-27T01:34:46Z","title":"Attacking Bayes: On the Adversarial Robustness of Bayesian Neural\n Networks","summary":" Adversarial examples have been shown to cause neural networks to fail on a\nwide range of vision and language tasks, but recent work has claimed that\nBayesian neural networks (BNNs) are inherently robust to adversarial\nperturbations. In this work, we examine this claim. To study the adversarial\nrobustness of BNNs, we investigate whether it is possible to successfully break\nstate-of-the-art BNN inference methods and prediction pipelines using even\nrelatively unsophisticated attacks for three tasks: (1) label prediction under\nthe posterior predictive mean, (2) adversarial example detection with Bayesian\npredictive uncertainty, and (3) semantic shift detection. We find that BNNs\ntrained with state-of-the-art approximate inference methods, and even BNNs\ntrained with Hamiltonian Monte Carlo, are highly susceptible to adversarial\nattacks. We also identify various conceptual and experimental errors in\nprevious works that claimed inherent adversarial robustness of BNNs and\nconclusively demonstrate that BNNs and uncertainty-aware Bayesian prediction\npipelines are not inherently robust against adversarial attacks.\n","authors":["Yunzhen Feng","Tim G. J. Rudner","Nikolaos Tsilivis","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2404.19640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01583v1","updated":"2024-04-27T20:03:47Z","published":"2024-04-27T20:03:47Z","title":"MediFact at MEDIQA-M3G 2024: Medical Question Answering in Dermatology\n with Multimodal Learning","summary":" The MEDIQA-M3G 2024 challenge necessitates novel solutions for Multilingual &\nMultimodal Medical Answer Generation in dermatology (wai Yim et al., 2024a).\nThis paper addresses the limitations of traditional methods by proposing a\nweakly supervised learning approach for open-ended medical question-answering\n(QA). Our system leverages readily available MEDIQA-M3G images via a\nVGG16-CNN-SVM model, enabling multilingual (English, Chinese, Spanish) learning\nof informative skin condition representations. Using pre-trained QA models, we\nfurther bridge the gap between visual and textual information through\nmultimodal fusion. This approach tackles complex, open-ended questions even\nwithout predefined answer choices. 
We empower the generation of comprehensive\nanswers by feeding the ViT-CLIP model with multiple responses alongside images.\nThis work advances medical QA research, paving the way for clinical decision\nsupport systems and ultimately improving healthcare delivery.\n","authors":["Nadia Saeed"],"pdf_url":"https://arxiv.org/pdf/2405.01583v1.pdf","comment":"7 pages, 3 figures, Clinical NLP 2024 workshop proceedings in Shared\n Task"}]},"2024-04-30T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.19760v1","updated":"2024-04-30T17:59:51Z","published":"2024-04-30T17:59:51Z","title":"Lightplane: Highly-Scalable Components for Neural 3D Fields","summary":" Contemporary 3D research, particularly in reconstruction and generation,\nheavily relies on 2D images for inputs or supervision. However, current designs\nfor these 2D-3D mapping are memory-intensive, posing a significant bottleneck\nfor existing methods and hindering new applications. In response, we propose a\npair of highly scalable components for 3D neural fields: Lightplane Render and\nSplatter, which significantly reduce memory usage in 2D-3D mapping. These\ninnovations enable the processing of vastly more and higher resolution images\nwith small memory and computational costs. We demonstrate their utility in\nvarious applications, from benefiting single-scene optimization with\nimage-level losses to realizing a versatile pipeline for dramatically scaling\n3D reconstruction and generation. Code:\n\\url{https://github.com/facebookresearch/lightplane}.\n","authors":["Ang Cao","Justin Johnson","Andrea Vedaldi","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2404.19760v1.pdf","comment":"Project Page: https://lightplane.github.io/ Code:\n https://github.com/facebookresearch/lightplane"},{"id":"http://arxiv.org/abs/2404.19759v1","updated":"2024-04-30T17:59:47Z","published":"2024-04-30T17:59:47Z","title":"MotionLCM: Real-time Controllable Motion Generation via Latent\n Consistency Model","summary":" This work introduces MotionLCM, extending controllable motion generation to a\nreal-time level. Existing methods for spatial control in text-conditioned\nmotion generation suffer from significant runtime inefficiency. To address this\nissue, we first propose the motion latent consistency model (MotionLCM) for\nmotion generation, building upon the latent diffusion model (MLD). By employing\none-step (or few-step) inference, we further improve the runtime efficiency of\nthe motion latent diffusion model for motion generation. To ensure effective\ncontrollability, we incorporate a motion ControlNet within the latent space of\nMotionLCM and enable explicit control signals (e.g., pelvis trajectory) in the\nvanilla motion space to control the generation process directly, similar to\ncontrolling other latent-free diffusion models for motion generation. By\nemploying these techniques, our approach can generate human motions with text\nand control signals in real-time. 
Experimental results demonstrate the\nremarkable generation and controlling capabilities of MotionLCM while\nmaintaining real-time runtime efficiency.\n","authors":["Wenxun Dai","Ling-Hao Chen","Jingbo Wang","Jinpeng Liu","Bo Dai","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.19759v1.pdf","comment":"MotionLCM project version 1.0"},{"id":"http://arxiv.org/abs/2404.19758v1","updated":"2024-04-30T17:59:40Z","published":"2024-04-30T17:59:40Z","title":"Invisible Stitch: Generating Smooth 3D Scenes with Depth Inpainting","summary":" 3D scene generation has quickly become a challenging new research direction,\nfueled by consistent improvements of 2D generative diffusion models. Most prior\nwork in this area generates scenes by iteratively stitching newly generated\nframes with existing geometry. These works often depend on pre-trained\nmonocular depth estimators to lift the generated images into 3D, fusing them\nwith the existing scene representation. These approaches are then often\nevaluated via a text metric, measuring the similarity between the generated\nimages and a given text prompt. In this work, we make two fundamental\ncontributions to the field of 3D scene generation. First, we note that lifting\nimages to 3D with a monocular depth estimation model is suboptimal as it\nignores the geometry of the existing scene. We thus introduce a novel depth\ncompletion model, trained via teacher distillation and self-training to learn\nthe 3D fusion process, resulting in improved geometric coherence of the scene.\nSecond, we introduce a new benchmarking scheme for scene generation methods\nthat is based on ground truth geometry, and thus measures the quality of the\nstructure of the scene.\n","authors":["Paul Engstler","Andrea Vedaldi","Iro Laina","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2404.19758v1.pdf","comment":"Project page: https://research.paulengstler.com/invisible-stitch/"},{"id":"http://arxiv.org/abs/2404.19753v1","updated":"2024-04-30T17:56:24Z","published":"2024-04-30T17:56:24Z","title":"DOCCI: Descriptions of Connected and Contrasting Images","summary":" Vision-language datasets are vital for both text-to-image (T2I) and\nimage-to-text (I2T) research. However, current datasets lack descriptions with\nfine-grained detail that would allow for richer associations to be learned by\nmodels. To fill the gap, we introduce Descriptions of Connected and Contrasting\nImages (DOCCI), a dataset with long, human-annotated English descriptions for\n15k images that were taken, curated and donated by a single researcher intent\non capturing key challenges such as spatial relations, counting, text\nrendering, world knowledge, and more. We instruct human annotators to create\ncomprehensive descriptions for each image; these average 136 words in length\nand are crafted to clearly distinguish each image from those that are related\nor similar. Each description is highly compositional and typically encompasses\nmultiple challenges. Through both quantitative and qualitative analyses, we\ndemonstrate that DOCCI serves as an effective training resource for\nimage-to-text generation -- a PaLI 5B model finetuned on DOCCI shows equal or\nsuperior results compared to highly-performant larger models like LLaVA-1.5 7B\nand InstructBLIP 7B. 
Furthermore, we show that DOCCI is a useful testbed for\ntext-to-image generation, highlighting the limitations of current text-to-image\nmodels in capturing long descriptions and fine details.\n","authors":["Yasumasa Onoe","Sunayana Rane","Zachary Berger","Yonatan Bitton","Jaemin Cho","Roopal Garg","Alexander Ku","Zarana Parekh","Jordi Pont-Tuset","Garrett Tanzer","Su Wang","Jason Baldridge"],"pdf_url":"https://arxiv.org/pdf/2404.19753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19752v1","updated":"2024-04-30T17:55:27Z","published":"2024-04-30T17:55:27Z","title":"Visual Fact Checker: Enabling High-Fidelity Detailed Caption Generation","summary":" Existing automatic captioning methods for visual content face challenges such\nas lack of detail, content hallucination, and poor instruction following. In\nthis work, we propose VisualFactChecker (VFC), a flexible training-free\npipeline that generates high-fidelity and detailed captions for both 2D images\nand 3D objects. VFC consists of three steps: 1) proposal, where image-to-text\ncaptioning models propose multiple initial captions; 2) verification, where a\nlarge language model (LLM) utilizes tools such as object detection and VQA\nmodels to fact-check proposed captions; 3) captioning, where an LLM generates\nthe final caption by summarizing caption proposals and the fact check\nverification results. In this step, VFC can flexibly generate captions in\nvarious styles following complex instructions. We conduct comprehensive\ncaptioning evaluations using four metrics: 1) CLIP-Score for image-text\nsimilarity; 2) CLIP-Image-Score for measuring the image-image similarity\nbetween the original and the reconstructed image generated by a text-to-image\nmodel using the caption. 3) human study on Amazon Mechanical Turk; 4) GPT-4V\nfor fine-grained evaluation. Evaluation results show that VFC outperforms\nstate-of-the-art open-sourced captioning methods for 2D images on the COCO\ndataset and 3D assets on the Objaverse dataset. Our study demonstrates that by\ncombining open-source models into a pipeline, we can attain captioning\ncapability comparable to proprietary models such as GPT-4V, despite being over\n10x smaller in model size.\n","authors":["Yunhao Ge","Xiaohui Zeng","Jacob Samuel Huffman","Tsung-Yi Lin","Ming-Yu Liu","Yin Cui"],"pdf_url":"https://arxiv.org/pdf/2404.19752v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.19748v1","updated":"2024-04-30T17:52:31Z","published":"2024-04-30T17:52:31Z","title":"Quantifying Nematodes through Images: Datasets, Models, and Baselines of\n Deep Learning","summary":" Every year, plant parasitic nematodes, one of the major groups of plant\npathogens, cause a significant loss of crops worldwide. To mitigate crop yield\nlosses caused by nematodes, an efficient nematode monitoring method is\nessential for plant and crop disease management. In other respects, efficient\nnematode detection contributes to medical research and drug discovery, as\nnematodes are model organisms. With the rapid development of computer\ntechnology, computer vision techniques provide a feasible solution for\nquantifying nematodes or nematode infections. In this paper, we survey and\ncategorise the studies and available datasets on nematode detection through\ndeep-learning models. To stimulate progress in related research, this survey\npresents the potential state-of-the-art object detection models, training\ntechniques, optimisation techniques, and evaluation metrics for deep learning\nbeginners. 
Moreover, seven state-of-the-art object detection models are\nvalidated on three public datasets and the AgriNema dataset for plant parasitic\nnematodes to construct a baseline for nematode detection.\n","authors":["Zhipeng Yuan","Nasamu Musa","Katarzyna Dybal","Matthew Back","Daniel Leybourne","Po Yang"],"pdf_url":"https://arxiv.org/pdf/2404.19748v1.pdf","comment":"The 26th IEEE International Conference on Computational Science and\n Engineering (CSE-2023)"},{"id":"http://arxiv.org/abs/2303.14541v2","updated":"2024-04-30T17:24:00Z","published":"2023-03-25T19:15:16Z","title":"UnScene3D: Unsupervised 3D Instance Segmentation for Indoor Scenes","summary":" 3D instance segmentation is fundamental to geometric understanding of the\nworld around us. Existing methods for instance segmentation of 3D scenes rely\non supervision from expensive, manual 3D annotations. We propose UnScene3D, the\nfirst fully unsupervised 3D learning approach for class-agnostic 3D instance\nsegmentation of indoor scans. UnScene3D first generates pseudo masks by\nleveraging self-supervised color and geometry features to find potential object\nregions. We operate on a basis of geometric oversegmentation, enabling\nefficient representation and learning on high-resolution 3D data. The coarse\nproposals are then refined through self-training our model on its predictions.\nOur approach improves over state-of-the-art unsupervised 3D instance\nsegmentation methods by more than 300% Average Precision score, demonstrating\neffective instance segmentation even in challenging, cluttered 3D scenes.\n","authors":["David Rozenberszki","Or Litany","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2303.14541v2.pdf","comment":"Project page: https://rozdavid.github.io/unscene3d, paper updated\n according to CVPR24 camera ready version"},{"id":"http://arxiv.org/abs/2404.19722v1","updated":"2024-04-30T17:15:42Z","published":"2024-04-30T17:15:42Z","title":"PACER+: On-Demand Pedestrian Animation Controller in Driving Scenarios","summary":" We address the challenge of content diversity and controllability in\npedestrian simulation for driving scenarios. Recent pedestrian animation\nframeworks have a significant limitation wherein they primarily focus on either\nfollowing trajectory [46] or the content of the reference video [57],\nconsequently overlooking the potential diversity of human motion within such\nscenarios. This limitation restricts the ability to generate pedestrian\nbehaviors that exhibit a wider range of variations and realistic motions and\ntherefore restricts its usage to provide rich motion content for other\ncomponents in the driving simulation system, e.g., suddenly changed motion to\nwhich the autonomous vehicle should respond. In our approach, we strive to\nsurpass the limitation by showcasing diverse human motions obtained from\nvarious sources, such as generated human motions, in addition to following the\ngiven trajectory. The fundamental contribution of our framework lies in\ncombining the motion tracking task with trajectory following, which enables the\ntracking of specific motion parts (e.g., upper body) while simultaneously\nfollowing the given trajectory by a single policy. This way, we significantly\nenhance both the diversity of simulated human motion within the given scenario\nand the controllability of the content, including language-based control. 
Our\nframework facilitates the generation of a wide range of human motions,\ncontributing to greater realism and adaptability in pedestrian simulations for\ndriving scenarios. More information is on our project page\nhttps://wangjingbo1219.github.io/papers/CVPR2024_PACER_PLUS/PACERPLUSPage.html .\n","authors":["Jingbo Wang","Zhengyi Luo","Ye Yuan","Yixuan Li","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2404.19722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19706v1","updated":"2024-04-30T16:54:59Z","published":"2024-04-30T16:54:59Z","title":"RTG-SLAM: Real-time 3D Reconstruction at Scale using Gaussian Splatting","summary":" We propose RTG-SLAM, a real-time 3D reconstruction system with an RGBD camera\nfor large-scale environments using Gaussian splatting. RTG-SLAM features a\ncompact Gaussian representation and a highly efficient on-the-fly Gaussian\noptimization scheme. We force each Gaussian to be either opaque or nearly\ntransparent, with the opaque ones fitting the surface and dominant colors, and\ntransparent ones fitting residual colors. By rendering depth in a different way\nfrom color rendering, we let a single opaque Gaussian well fit a local surface\nregion without the need of multiple overlapping Gaussians, hence largely\nreducing the memory and computation cost. For on-the-fly Gaussian optimization,\nwe explicitly add Gaussians for three types of pixels per frame: newly\nobserved, with large color errors and with large depth errors. We also\ncategorize all Gaussians into stable and unstable ones, where the stable\nGaussians are expected to well fit previously observed RGBD images and\notherwise unstable. We only optimize the unstable Gaussians and only render the\npixels occupied by unstable Gaussians. In this way, both the number of\nGaussians to be optimized and pixels to be rendered are largely reduced, and\nthe optimization can be done in real time. We show real-time reconstructions of\na variety of real large scenes. Compared with the state-of-the-art NeRF-based\nRGBD SLAM, our system achieves comparable high-quality reconstruction but with\naround twice the speed and half the memory cost, and shows superior performance\nin the realism of novel view synthesis and camera tracking accuracy.\n","authors":["Zhexi Peng","Tianjia Shao","Yong Liu","Jingke Zhou","Yin Yang","Jingdong Wang","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.19706v1.pdf","comment":"To be published in ACM SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.19702v1","updated":"2024-04-30T16:47:46Z","published":"2024-04-30T16:47:46Z","title":"GS-LRM: Large Reconstruction Model for 3D Gaussian Splatting","summary":" We propose GS-LRM, a scalable large reconstruction model that can predict\nhigh-quality 3D Gaussian primitives from 2-4 posed sparse images in 0.23\nseconds on single A100 GPU. Our model features a very simple transformer-based\narchitecture; we patchify input posed images, pass the concatenated multi-view\nimage tokens through a sequence of transformer blocks, and decode final\nper-pixel Gaussian parameters directly from these tokens for differentiable\nrendering. In contrast to previous LRMs that can only reconstruct objects, by\npredicting per-pixel Gaussians, GS-LRM naturally handles scenes with large\nvariations in scale and complexity. We show that our model can work on both\nobject and scene captures by training it on Objaverse and RealEstate10K\nrespectively. In both scenarios, the models outperform state-of-the-art\nbaselines by a wide margin. 
We also demonstrate applications of our model in\ndownstream 3D generation tasks. Our project webpage is available at:\nhttps://sai-bi.github.io/project/gs-lrm/ .\n","authors":["Kai Zhang","Sai Bi","Hao Tan","Yuanbo Xiangli","Nanxuan Zhao","Kalyan Sunkavalli","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.19702v1.pdf","comment":"Project webpage: https://sai-bi.github.io/project/gs-lrm/"},{"id":"http://arxiv.org/abs/2404.19696v1","updated":"2024-04-30T16:44:18Z","published":"2024-04-30T16:44:18Z","title":"Naturally Supervised 3D Visual Grounding with Language-Regularized\n Concept Learners","summary":" 3D visual grounding is a challenging task that often requires direct and\ndense supervision, notably the semantic label for each object in the scene. In\nthis paper, we instead study the naturally supervised setting that learns from\nonly 3D scene and QA pairs, where prior works underperform. We propose the\nLanguage-Regularized Concept Learner (LARC), which uses constraints from\nlanguage as regularization to significantly improve the accuracy of\nneuro-symbolic concept learners in the naturally supervised setting. Our\napproach is based on two core insights: the first is that language constraints\n(e.g., a word's relation to another) can serve as effective regularization for\nstructured representations in neuro-symbolic models; the second is that we can\nquery large language models to distill such constraints from language\nproperties. We show that LARC improves performance of prior works in naturally\nsupervised 3D visual grounding, and demonstrates a wide range of 3D visual\nreasoning capabilities-from zero-shot composition, to data efficiency and\ntransferability. Our method represents a promising step towards regularizing\nstructured visual reasoning frameworks with language-based priors, for learning\nin settings without dense supervision.\n","authors":["Chun Feng","Joy Hsu","Weiyu Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2404.19696v1.pdf","comment":"CVPR 2024. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2404.19693v1","updated":"2024-04-30T16:37:27Z","published":"2024-04-30T16:37:27Z","title":"SwipeGANSpace: Swipe-to-Compare Image Generation via Efficient Latent\n Space Exploration","summary":" Generating preferred images using generative adversarial networks (GANs) is\nchallenging owing to the high-dimensional nature of latent space. In this\nstudy, we propose a novel approach that uses simple user-swipe interactions to\ngenerate preferred images for users. To effectively explore the latent space\nwith only swipe interactions, we apply principal component analysis to the\nlatent space of the StyleGAN, creating meaningful subspaces. We use a\nmulti-armed bandit algorithm to decide the dimensions to explore, focusing on\nthe preferences of the user. Experiments show that our method is more efficient\nin generating preferred images than the baseline methods. Furthermore, changes\nin preferred images during image generation or the display of entirely\ndifferent image styles were observed to provide new inspirations, subsequently\naltering user preferences. 
This highlights the dynamic nature of user\npreferences, which our proposed approach recognizes and enhances.\n","authors":["Yuto Nakashima","Mingzhe Yang","Yukino Baba"],"pdf_url":"https://arxiv.org/pdf/2404.19693v1.pdf","comment":"11 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.19666v1","updated":"2024-04-30T16:01:14Z","published":"2024-04-30T16:01:14Z","title":"Beyond MOS: Subjective Image Quality Score Preprocessing Method Based on\n Perceptual Similarity","summary":" Image quality assessment often relies on raw opinion scores provided by\nsubjects in subjective experiments, which can be noisy and unreliable. To\naddress this issue, postprocessing procedures such as ITU-R BT.500, ITU-T\nP.910, and ITU-T P.913 have been standardized to clean up the original opinion\nscores. These methods use annotator-based statistical priors, but they do not\ntake into account extensive information about the image itself, which limits\ntheir performance in less annotated scenarios. Generally speaking, image\nquality datasets usually contain similar scenes or distortions, and it is\ninevitable for subjects to compare images with one another when assigning a\nreasonable score. Therefore, in this paper, we propose a Subjective Image Quality\nScore Preprocessing method based on perceptual similarity, termed Perceptual\nSimilarity Subjective Preprocessing (PSP), which exploits the perceptual\nsimilarity between images to alleviate subjective bias in less annotated\nscenarios. Specifically, we model subjective scoring as\na conditional probability model based on perceptual similarity with previously\nscored images, called subconscious reference scoring. The reference images are\nstored in a neighbor dictionary, which is obtained by a normalized vector\ndot-product based nearest neighbor search of the images' perceptual depth\nfeatures. Then the preprocessed score is updated by the exponential moving\naverage (EMA) of the subconscious reference scoring, called similarity\nregularized EMA. Our experiments on multiple datasets (LIVE, TID2013, CID2013)\nshow that this method can effectively remove the bias of the subjective scores.\nAdditionally, experiments show that the preprocessed dataset can substantially\nimprove the performance of downstream IQA tasks.\n","authors":["Lei Wang","Desen Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.19666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06362v2","updated":"2024-04-30T15:58:32Z","published":"2024-04-09T14:56:34Z","title":"Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot\n Medical Image Segmentation","summary":" The Segment Anything Model (SAM) and CLIP are remarkable vision foundation\nmodels (VFMs). SAM, a prompt driven segmentation model, excels in segmentation\ntasks across diverse domains, while CLIP is renowned for its zero shot\nrecognition capabilities. However, their unified potential has not yet been\nexplored in medical image segmentation. To adapt SAM to medical imaging,\nexisting methods primarily rely on tuning strategies that require extensive\ndata or prior prompts tailored to the specific task, making it particularly\nchallenging when only a limited number of data samples are available. This work\npresents an in-depth exploration of integrating SAM and CLIP into a unified\nframework for medical image segmentation. Specifically, we propose a simple\nunified framework, SaLIP, for organ segmentation. 
Initially, SAM is used for\npart based segmentation within the image, followed by CLIP to retrieve the mask\ncorresponding to the region of interest (ROI) from the pool of SAM generated\nmasks. Finally, SAM is prompted by the retrieved ROI to segment a specific\norgan. Thus, SaLIP is training and fine tuning free and does not rely on domain\nexpertise or labeled data for prompt engineering. Our method shows substantial\nenhancements in zero shot segmentation, showcasing notable improvements in DICE\nscores across diverse segmentation tasks like brain (63.46%), lung (50.11%),\nand fetal head (30.82%), when compared to un prompted SAM. Code and text\nprompts are available at: https://github.com/aleemsidra/SaLIP.\n","authors":["Sidra Aleem","Fangyijie Wang","Mayug Maniparambil","Eric Arazo","Julia Dietlmeier","Guenole Silvestre","Kathleen Curran","Noel E. O'Connor","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2404.06362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19656v1","updated":"2024-04-30T15:52:49Z","published":"2024-04-30T15:52:49Z","title":"Towards Scenario- and Capability-Driven Dataset Development and\n Evaluation: An Approach in the Context of Mapless Automated Driving","summary":" The foundational role of datasets in defining the capabilities of deep\nlearning models has led to their rapid proliferation. At the same time,\npublished research focusing on the process of dataset development for\nenvironment perception in automated driving has been scarce, thereby reducing\nthe applicability of openly available datasets and impeding the development of\neffective environment perception systems. Sensor-based, mapless automated\ndriving is one of the contexts where this limitation is evident. While\nleveraging real-time sensor data, instead of pre-defined HD maps promises\nenhanced adaptability and safety by effectively navigating unexpected\nenvironmental changes, it also increases the demands on the scope and\ncomplexity of the information provided by the perception system.\n To address these challenges, we propose a scenario- and capability-based\napproach to dataset development. Grounded in the principles of ISO 21448\n(safety of the intended functionality, SOTIF), extended by ISO/TR 4804, our\napproach facilitates the structured derivation of dataset requirements. This\nnot only aids in the development of meaningful new datasets but also enables\nthe effective comparison of existing ones. Applying this methodology to a broad\nrange of existing lane detection datasets, we identify significant limitations\nin current datasets, particularly in terms of real-world applicability, a lack\nof labeling of critical features, and an absence of comprehensive information\nfor complex driving maneuvers.\n","authors":["Felix Grün","Marcus Nolte","Markus Maurer"],"pdf_url":"https://arxiv.org/pdf/2404.19656v1.pdf","comment":"Accepted to be published at the 2024 35th IEEE Intelligent Vehicles\n Symposium (IV), Jeju Island, Korea, June 2 - 5, 2024"},{"id":"http://arxiv.org/abs/2404.05466v2","updated":"2024-04-30T15:51:21Z","published":"2024-04-08T12:44:24Z","title":"Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder","summary":" Automatic lip-reading (ALR) aims to automatically transcribe spoken content\nfrom a speaker's silent lip motion captured in video. Current mainstream\nlip-reading approaches only use a single visual encoder to model input videos\nof a single scale. 
In this paper, we propose to enhance lip-reading by\nincorporating multi-scale video data and multi-encoder. Specifically, we first\npropose a novel multi-scale lip motion extraction algorithm based on the size\nof the speaker's face and an Enhanced ResNet3D visual front-end (VFE) to\nextract lip features at different scales. For the multi-encoder, in addition to\nthe mainstream Transformer and Conformer, we also incorporate the recently\nproposed Branchformer and E-Branchformer as visual encoders. In the\nexperiments, we explore the influence of different video data scales and\nencoders on ALR system performance and fuse the texts transcribed by all ALR\nsystems using recognizer output voting error reduction (ROVER). Finally, our\nproposed approach placed second in the ICME 2024 ChatCLR Challenge Task 2, with\na 21.52% reduction in character error rate (CER) compared to the official\nbaseline on the evaluation set.\n","authors":["He Wang","Pengcheng Guo","Xucheng Wan","Huan Zhou","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.05466v2.pdf","comment":"6 pages, 3 figures, Accepted at ICMEW 2024"},{"id":"http://arxiv.org/abs/2404.19654v1","updated":"2024-04-30T15:51:05Z","published":"2024-04-30T15:51:05Z","title":"Masked Multi-Query Slot Attention for Unsupervised Object Discovery","summary":" Unsupervised object discovery is becoming an essential line of research for\ntackling recognition problems that require decomposing an image into entities,\nsuch as semantic segmentation and object detection. Recently, object-centric\nmethods that leverage self-supervision have gained popularity, due to their\nsimplicity and adaptability to different settings and conditions. However,\nthose methods do not exploit effective techniques already employed in modern\nself-supervised approaches. In this work, we consider an object-centric\napproach in which DINO ViT features are reconstructed via a set of queried\nrepresentations called slots. Based on that, we propose a masking scheme on\ninput features that selectively disregards the background regions, inducing our\nmodel to focus more on salient objects during the reconstruction phase.\nMoreover, we extend the slot attention to a multi-query approach, allowing the\nmodel to learn multiple sets of slots, producing more stable masks. During\ntraining, these multiple sets of slots are learned independently while, at test\ntime, these sets are merged through Hungarian matching to obtain the final\nslots. Our experimental results and ablations on the PASCAL-VOC 2012 dataset\nshow the importance of each component and highlight how their combination\nconsistently improves object localization. Our source code is available at:\nhttps://github.com/rishavpramanik/maskedmultiqueryslot\n","authors":["Rishav Pramanik","José-Fabian Villa-Vásquez","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.19654v1.pdf","comment":"Paper accepted for presentation at IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.19652v1","updated":"2024-04-30T15:49:03Z","published":"2024-04-30T15:49:03Z","title":"VimTS: A Unified Video and Image Text Spotter for Enhancing the\n Cross-domain Generalization","summary":" Text spotting, a task involving the extraction of textual information from\nimage or video sequences, faces challenges in cross-domain adaption, such as\nimage-to-image and image-to-video generalization. In this paper, we introduce a\nnew method, termed VimTS, which enhances the generalization ability of the\nmodel by achieving better synergy among different tasks. 
Typically, we propose\na Prompt Queries Generation Module and a Tasks-aware Adapter to effectively\nconvert the original single-task model into a multi-task model suitable for\nboth image and video scenarios with minimal additional parameters. The Prompt\nQueries Generation Module facilitates explicit interaction between different\ntasks, while the Tasks-aware Adapter helps the model dynamically learn suitable\nfeatures for each task. Additionally, to further enable the model to learn\ntemporal information at a lower cost, we propose a synthetic video text dataset\n(VTD-368k) by leveraging the Content Deformation Fields (CoDeF) algorithm.\nNotably, our method outperforms the state-of-the-art method by an average of\n2.6% in six cross-domain benchmarks such as TT-to-IC15, CTW1500-to-TT, and\nTT-to-CTW1500. For video-level cross-domain adaption, our method even surpasses\nthe previous end-to-end video spotting method in ICDAR2015 video and DSText v2\nby an average of 5.5% on the MOTA metric, using only image-level data. We\nfurther demonstrate that existing Large Multimodal Models exhibit limitations\nin generating cross-domain scene text spotting, in contrast to our VimTS model\nwhich requires significantly fewer parameters and data. The code and datasets\nwill be made available at the https://VimTextSpotter.github.io.\n","authors":["Yuliang Liu","Mingxin Huang","Hao Yan","Linger Deng","Weijia Wu","Hao Lu","Chunhua Shen","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.19652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19651v1","updated":"2024-04-30T15:49:01Z","published":"2024-04-30T15:49:01Z","title":"Provably Robust Conformal Prediction with Improved Efficiency","summary":" Conformal prediction is a powerful tool to generate uncertainty sets with\nguaranteed coverage using any predictive model, under the assumption that the\ntraining and test data are i.i.d.. Recently, it has been shown that adversarial\nexamples are able to manipulate conformal methods to construct prediction sets\nwith invalid coverage rates, as the i.i.d. assumption is violated. To address\nthis issue, a recent work, Randomized Smoothed Conformal Prediction (RSCP), was\nfirst proposed to certify the robustness of conformal prediction methods to\nadversarial noise. However, RSCP has two major limitations: (i) its robustness\nguarantee is flawed when used in practice and (ii) it tends to produce large\nuncertainty sets. To address these limitations, we first propose a novel\nframework called RSCP+ to provide provable robustness guarantee in evaluation,\nwhich fixes the issues in the original RSCP method. Next, we propose two novel\nmethods, Post-Training Transformation (PTT) and Robust Conformal Training\n(RCT), to effectively reduce prediction set size with little computation\noverhead. Experimental results in CIFAR10, CIFAR100, and ImageNet suggest the\nbaseline method only yields trivial predictions including full label set, while\nour methods could boost the efficiency by up to $4.36\\times$, $5.46\\times$, and\n$16.9\\times$ respectively and provide practical robustness guarantee. 
Our codes\nare available at\nhttps://github.com/Trustworthy-ML-Lab/Provably-Robust-Conformal-Prediction.\n","authors":["Ge Yan","Yaniv Romano","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2404.19651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19644v1","updated":"2024-04-30T15:45:30Z","published":"2024-04-30T15:45:30Z","title":"MetaCoCo: A New Few-Shot Classification Benchmark with Spurious\n Correlation","summary":" Out-of-distribution (OOD) problems in few-shot classification (FSC) occur\nwhen novel classes sampled from testing distributions differ from base classes\ndrawn from training distributions, which considerably degrades the performance\nof deep learning models deployed in real-world applications. Recent studies\nsuggest that the OOD problems in FSC mainly include: (a) cross-domain\nfew-shot classification (CD-FSC) and (b) spurious-correlation few-shot\nclassification (SC-FSC). Specifically, CD-FSC occurs when a classifier learns\nto transfer knowledge from base classes drawn from seen training distributions\nbut recognizes novel classes sampled from unseen testing distributions. In\ncontrast, SC-FSC arises when a classifier relies on non-causal features (or\ncontexts) that happen to be correlated with the labels (or concepts) in base\nclasses but such relationships no longer hold during the model deployment.\nAlthough CD-FSC has been extensively studied, SC-FSC remains understudied due to\nthe lack of corresponding evaluation benchmarks. To this end, we present Meta\nConcept Context (MetaCoCo), a benchmark with spurious-correlation shifts\ncollected from real-world scenarios. Moreover, to quantify the extent of\nspurious-correlation shifts of the presented MetaCoCo, we further propose a\nmetric by using CLIP as a pre-trained vision-language model. Extensive\nexperiments on the proposed benchmark are performed to evaluate the\nstate-of-the-art methods in FSC, cross-domain shifts, and self-supervised\nlearning. The experimental results show that the performance of the existing\nmethods degrades significantly in the presence of spurious-correlation shifts.\nWe open-source all codes of our benchmark and hope that the proposed MetaCoCo\ncan facilitate future research on spurious-correlation shift problems in FSC.\nThe code is available at: https://github.com/remiMZ/MetaCoCo-ICLR24.\n","authors":["Min Zhang","Haoxuan Li","Fei Wu","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2404.19644v1.pdf","comment":"ICLR 24"},{"id":"http://arxiv.org/abs/2404.19639v1","updated":"2024-04-30T15:42:45Z","published":"2024-04-30T15:42:45Z","title":"ESP-Zero: Unsupervised enhancement of zero-shot classification for\n Extremely Sparse Point cloud","summary":" In recent years, zero-shot learning has attracted the focus of many\nresearchers, due to its flexibility and generality. Many approaches have been\nproposed to achieve the zero-shot classification of the point clouds for 3D\nobject understanding, following the schema of CLIP. However, in the real world,\nthe point clouds could be extremely sparse, dramatically limiting the\neffectiveness of the 3D point cloud encoders, and resulting in the misalignment\nof point cloud features and text embeddings. To enable the point cloud encoders\nto fit the extremely sparse point clouds without re-running the pre-training\nprocedure, which could be time-consuming and expensive, in this work we propose an\nunsupervised model adaptation approach to enhance the point cloud encoder for\nthe extremely sparse point clouds. 
We propose a novel fused-cross attention\nlayer that expands the pre-trained self-attention layer with additional\nlearnable tokens and attention blocks, which effectively modifies the point\ncloud features while maintaining the alignment between point cloud features and\ntext embeddings. We also propose a complementary learning-based\nself-distillation schema that encourages the modified features to be pulled\napart from the irrelevant text embeddings without overfitting the feature space\nto the observed text embeddings. Extensive experiments demonstrate that the\nproposed approach effectively increases the zero-shot capability on extremely\nsparse point clouds, and overwhelms other state-of-the-art model adaptation\napproaches.\n","authors":["Jiayi Han","Zidi Cao","Weibo Zheng","Xiangguo Zhou","Xiangjian He","Yuanfang Zhang","Daisen Wei"],"pdf_url":"https://arxiv.org/pdf/2404.19639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18433v2","updated":"2024-04-30T15:42:25Z","published":"2024-04-29T05:17:33Z","title":"ShadowMaskFormer: Mask Augmented Patch Embeddings for Shadow Removal","summary":" Transformer recently emerged as the de facto model for computer vision tasks\nand has also been successfully applied to shadow removal. However, these\nexisting methods heavily rely on intricate modifications to the attention\nmechanisms within the transformer blocks while using a generic patch embedding.\nAs a result, it often leads to complex architectural designs requiring\nadditional computation resources. In this work, we aim to explore the efficacy\nof incorporating shadow information within the early processing stage.\nAccordingly, we propose a transformer-based framework with a novel patch\nembedding that is tailored for shadow removal, dubbed ShadowMaskFormer.\nSpecifically, we present a simple and effective mask-augmented patch embedding\nto integrate shadow information and promote the model's emphasis on acquiring\nknowledge for shadow regions. Extensive experiments conducted on the ISTD,\nISTD+, and SRD benchmark datasets demonstrate the efficacy of our method\nagainst state-of-the-art approaches while using fewer model parameters.\n","authors":["Zhuohao Li","Guoyang Xie","Guannan Jiang","Zhichao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.18433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19622v1","updated":"2024-04-30T15:22:19Z","published":"2024-04-30T15:22:19Z","title":"Fake it to make it: Using synthetic data to remedy the data shortage in\n joint multimodal speech-and-gesture synthesis","summary":" Although humans engaged in face-to-face conversation simultaneously\ncommunicate both verbally and non-verbally, methods for joint and unified\nsynthesis of speech audio and co-speech 3D gesture motion from text are a new\nand emerging field. These technologies hold great promise for more human-like,\nefficient, expressive, and robust synthetic communication, but are currently\nheld back by the lack of suitably large datasets, as existing methods are\ntrained on parallel data from all constituent modalities. Inspired by\nstudent-teacher methods, we propose a straightforward solution to the data\nshortage, by simply synthesising additional training material. Specifically, we\nuse unimodal synthesis models trained on large datasets to create multimodal\n(but synthetic) parallel training data, and then pre-train a joint synthesis\nmodel on that material. 
In addition, we propose a new synthesis architecture\nthat adds better and more controllable prosody modelling to the\nstate-of-the-art method in the field. Our results confirm that pre-training on\nlarge amounts of synthetic data improves the quality of both the speech and the\nmotion synthesised by the multimodal model, with the proposed architecture\nyielding further benefits when pre-trained on the synthetic data. See\nhttps://shivammehta25.github.io/MAGI/ for example output.\n","authors":["Shivam Mehta","Anna Deichler","Jim O'Regan","Birger Moëll","Jonas Beskow","Gustav Eje Henter","Simon Alexanderson"],"pdf_url":"https://arxiv.org/pdf/2404.19622v1.pdf","comment":"13+1 pages, 2 figures, accepted at the Human Motion Generation\n workshop (HuMoGen) at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10853v2","updated":"2024-04-30T15:20:54Z","published":"2024-03-16T08:28:42Z","title":"Just Say the Name: Online Continual Learning with Category Names Only\n via Data Generation","summary":" In real-world scenarios, extensive manual annotation for continual learning\nis impractical due to prohibitive costs. Although prior arts, influenced by\nlarge-scale webly supervised training, suggest leveraging web-scraped data in\ncontinual learning, this poses challenges such as data imbalance, usage\nrestrictions, and privacy concerns. Addressing the risks of continual webly\nsupervised training, we present an online continual learning framework -\nGenerative Name only Continual Learning (G-NoCL). The proposed G-NoCL uses a\nset of generators G along with the learner. When encountering new concepts\n(i.e., classes), G-NoCL employs the novel sample complexity-guided data\nensembling technique DIverSity and COmplexity enhancing ensemBlER (DISCOBER) to\noptimally sample training data from generated data. Through extensive\nexperimentation, we demonstrate superior performance of DISCOBER in G-NoCL\nonline CL benchmarks, covering both In-Distribution (ID) and\nOut-of-Distribution (OOD) generalization evaluations, compared to naive\ngenerator-ensembling, web-supervised, and manually annotated data.\n","authors":["Minhyuk Seo","Diganta Misra","Seongwon Cho","Minjae Lee","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2403.10853v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02145v2","updated":"2024-04-30T15:20:03Z","published":"2023-05-03T14:23:37Z","title":"ProgDTD: Progressive Learned Image Compression with Double-Tail-Drop\n Training","summary":" Progressive compression allows images to start loading as low-resolution\nversions, becoming clearer as more data is received. This increases user\nexperience when, for example, network connections are slow. Today, most\napproaches for image compression, both classical and learned ones, are designed\nto be non-progressive. This paper introduces ProgDTD, a training method that\ntransforms learned, non-progressive image compression approaches into\nprogressive ones. The design of ProgDTD is based on the observation that the\ninformation stored within the bottleneck of a compression model commonly varies\nin importance. To create a progressive compression model, ProgDTD modifies the\ntraining steps to enforce the model to store the data in the bottleneck sorted\nby priority. We achieve progressive compression by transmitting the data in\norder of its sorted index. ProgDTD is designed for CNN-based learned image\ncompression models, does not need additional parameters, and has a customizable\nrange of progressiveness. 
For evaluation, we apply ProgDTDto the hyperprior\nmodel, one of the most common structures in learned image compression. Our\nexperimental results show that ProgDTD performs comparably to its\nnon-progressive counterparts and other state-of-the-art progressive models in\nterms of MS-SSIM and accuracy.\n","authors":["Ali Hojjat","Janek Haberer","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2305.02145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19615v1","updated":"2024-04-30T15:13:57Z","published":"2024-04-30T15:13:57Z","title":"SemiPL: A Semi-supervised Method for Event Sound Source Localization","summary":" In recent years, Event Sound Source Localization has been widely applied in\nvarious fields. Recent works typically relying on the contrastive learning\nframework show impressive performance. However, all work is based on large\nrelatively simple datasets. It's also crucial to understand and analyze human\nbehaviors (actions and interactions of people), voices, and sounds in chaotic\nevents in many applications, e.g., crowd management, and emergency response\nservices. In this paper, we apply the existing model to a more complex dataset,\nexplore the influence of parameters on the model, and propose a semi-supervised\nimprovement method SemiPL. With the increase in data quantity and the influence\nof label quality, self-supervised learning will be an unstoppable trend. The\nexperiment shows that the parameter adjustment will positively affect the\nexisting model. In particular, SSPL achieved an improvement of 12.2% cIoU and\n0.56% AUC in Chaotic World compared to the results provided. The code is\navailable at: https://github.com/ly245422/SSPL\n","authors":["Yue Li","Baiqiao Yin","Jinfu Liu","Jiajun Wen","Jiaying Lin","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.19615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17420v2","updated":"2024-04-30T15:05:34Z","published":"2024-02-27T11:23:39Z","title":"PANDAS: Prototype-based Novel Class Discovery and Detection","summary":" Object detectors are typically trained once and for all on a fixed set of\nclasses. However, this closed-world assumption is unrealistic in practice, as\nnew classes will inevitably emerge after the detector is deployed in the wild.\nIn this work, we look at ways to extend a detector trained for a set of base\nclasses so it can i) spot the presence of novel classes, and ii) automatically\nenrich its repertoire to be able to detect those newly discovered classes\ntogether with the base ones. We propose PANDAS, a method for novel class\ndiscovery and detection. It discovers clusters representing novel classes from\nunlabeled data, and represents old and new classes with prototypes. During\ninference, a distance-based classifier uses these prototypes to assign a label\nto each detected object instance. The simplicity of our method makes it widely\napplicable. We experimentally demonstrate the effectiveness of PANDAS on the\nVOC 2012 and COCO-to-LVIS benchmarks. It performs favorably against the state\nof the art for this task while being computationally more affordable.\n","authors":["Tyler L. Hayes","César R. 
de Souza","Namil Kim","Jiwon Kim","Riccardo Volpi","Diane Larlus"],"pdf_url":"https://arxiv.org/pdf/2402.17420v2.pdf","comment":"Accepted to the Conference on Lifelong Learning Agents (CoLLAs 2024)"},{"id":"http://arxiv.org/abs/2404.19609v1","updated":"2024-04-30T15:03:27Z","published":"2024-04-30T15:03:27Z","title":"Seeing Through the Clouds: Cloud Gap Imputation with Prithvi Foundation\n Model","summary":" Filling cloudy pixels in multispectral satellite imagery is essential for\naccurate data analysis and downstream applications, especially for tasks which\nrequire time series data. To address this issue, we compare the performance of\na foundational Vision Transformer (ViT) model with a baseline Conditional\nGenerative Adversarial Network (CGAN) model for missing value imputation in\ntime series of multispectral satellite imagery. We randomly mask time series of\nsatellite images using real-world cloud masks and train each model to\nreconstruct the missing pixels. The ViT model is fine-tuned from a pretrained\nmodel, while the CGAN is trained from scratch. Using quantitative evaluation\nmetrics such as structural similarity index and mean absolute error as well as\nqualitative visual analysis, we assess imputation accuracy and contextual\npreservation.\n","authors":["Denys Godwin","Hanxi Li","Michael Cecil","Hamed Alemohammad"],"pdf_url":"https://arxiv.org/pdf/2404.19609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07865v2","updated":"2024-04-30T15:00:01Z","published":"2023-12-13T03:04:22Z","title":"SimAC: A Simple Anti-Customization Method for Protecting Face Privacy\n against Text-to-Image Synthesis of Diffusion Models","summary":" Despite the success of diffusion-based customization methods on visual\ncontent creation, increasing concerns have been raised about such techniques\nfrom both privacy and political perspectives. To tackle this issue, several\nanti-customization methods have been proposed in very recent months,\npredominantly grounded in adversarial attacks. Unfortunately, most of these\nmethods adopt straightforward designs, such as end-to-end optimization with a\nfocus on adversarially maximizing the original training loss, thereby\nneglecting nuanced internal properties intrinsic to the diffusion model, and\neven leading to ineffective optimization in some diffusion time steps.In this\npaper, we strive to bridge this gap by undertaking a comprehensive exploration\nof these inherent properties, to boost the performance of current\nanti-customization approaches. Two aspects of properties are investigated: 1)\nWe examine the relationship between time step selection and the model's\nperception in the frequency domain of images and find that lower time steps can\ngive much more contributions to adversarial noises. This inspires us to propose\nan adaptive greedy search for optimal time steps that seamlessly integrates\nwith existing anti-customization methods. 2) We scrutinize the roles of\nfeatures at different layers during denoising and devise a sophisticated\nfeature-based optimization framework for anti-customization.Experiments on\nfacial benchmarks demonstrate that our approach significantly increases\nidentity disruption, thereby protecting user privacy and copyright. 
Our code is\navailable at: https://github.com/somuchtome/SimAC.\n","authors":["Feifei Wang","Zhentao Tan","Tianyi Wei","Yue Wu","Qidong Huang"],"pdf_url":"https://arxiv.org/pdf/2312.07865v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19605v1","updated":"2024-04-30T14:55:57Z","published":"2024-04-30T14:55:57Z","title":"Data-Driven Invertible Neural Surrogates of Atmospheric Transmission","summary":" We present a framework for inferring an atmospheric transmission profile from\na spectral scene. This framework leverages a lightweight, physics-based\nsimulator that is automatically tuned - by virtue of autodifferentiation and\ndifferentiable programming - to construct a surrogate atmospheric profile to\nmodel the observed data. We demonstrate utility of the methodology by (i)\nperforming atmospheric correction, (ii) recasting spectral data between various\nmodalities (e.g. radiance and reflectance at the surface and at the sensor),\nand (iii) inferring atmospheric transmission profiles, such as absorbing bands\nand their relative magnitudes.\n","authors":["James Koch","Brenda Forland","Bruce Bernacki","Timothy Doster","Tegan Emerson"],"pdf_url":"https://arxiv.org/pdf/2404.19605v1.pdf","comment":"Manuscript accepted for presentation and publication at the 2024 IEEE\n International Geoscience and Remote Sensing Symposium (IGARSS)"},{"id":"http://arxiv.org/abs/2404.19604v1","updated":"2024-04-30T14:53:07Z","published":"2024-04-30T14:53:07Z","title":"X-Diffusion: Generating Detailed 3D MRI Volumes From a Single Image\n Using Cross-Sectional Diffusion Models","summary":" In this work, we present X-Diffusion, a cross-sectional diffusion model\ntailored for Magnetic Resonance Imaging (MRI) data. X-Diffusion is capable of\ngenerating the entire MRI volume from just a single MRI slice or optionally\nfrom few multiple slices, setting new benchmarks in the precision of\nsynthesized MRIs from extremely sparse observations. The uniqueness lies in the\nnovel view-conditional training and inference of X-Diffusion on MRI volumes,\nallowing for generalized MRI learning. Our evaluations span both brain tumour\nMRIs from the BRATS dataset and full-body MRIs from the UK Biobank dataset.\nUtilizing the paired pre-registered Dual-energy X-ray Absorptiometry (DXA) and\nMRI modalities in the UK Biobank dataset, X-Diffusion is able to generate\ndetailed 3D MRI volume from a single full-body DXA. Remarkably, the resultant\nMRIs not only stand out in precision on unseen examples (surpassing\nstate-of-the-art results by large margins) but also flawlessly retain essential\nfeatures of the original MRI, including tumour profiles, spine curvature, brain\nvolume, and beyond. Furthermore, the trained X-Diffusion model on the MRI\ndatasets attains a generalization capacity out-of-domain (e.g. generating knee\nMRIs even though it is trained on brains). 
The code is available on the project\nwebsite https://emmanuelleb985.github.io/XDiffusion/ .\n","authors":["Emmanuelle Bourigault","Abdullah Hamdi","Amir Jamaludin"],"pdf_url":"https://arxiv.org/pdf/2404.19604v1.pdf","comment":"preprint, project website:\n https://emmanuelleb985.github.io/XDiffusion/"},{"id":"http://arxiv.org/abs/2311.05524v2","updated":"2024-04-30T14:50:59Z","published":"2023-11-09T17:10:20Z","title":"SeaTurtleID2022: A long-span dataset for reliable sea turtle\n re-identification","summary":" This paper introduces the first public large-scale, long-span dataset with\nsea turtle photographs captured in the wild -- SeaTurtleID2022\n(https://www.kaggle.com/datasets/wildlifedatasets/seaturtleid2022). The dataset\ncontains 8729 photographs of 438 unique individuals collected within 13 years,\nmaking it the longest-spanned dataset for animal re-identification. All\nphotographs include various annotations, e.g., identity, encounter timestamp,\nand body parts segmentation masks. Instead of standard \"random\" splits, the\ndataset allows for two realistic and ecologically motivated splits: (i) a\ntime-aware closed-set with training, validation, and test data from different\ndays/years, and (ii) a time-aware open-set with new unknown individuals in test\nand validation sets. We show that time-aware splits are essential for\nbenchmarking re-identification methods, as random splits lead to performance\noverestimation. Furthermore, a baseline instance segmentation and\nre-identification performance over various body parts is provided. Finally, an\nend-to-end system for sea turtle re-identification is proposed and evaluated.\nThe proposed system based on Hybrid Task Cascade for head instance segmentation\nand ArcFace-trained feature-extractor achieved an accuracy of 86.8%.\n","authors":["Lukáš Adam","Vojtěch Čermák","Kostas Papafitsoros","Lukáš Picek"],"pdf_url":"https://arxiv.org/pdf/2311.05524v2.pdf","comment":"This version is essentially an updated version of the initial\n SeaTurtleID paper (arXiv:2211.10307) and from now on it can be found as a\n replacement of the latter paper. You can also find the published version\n here:\n https://openaccess.thecvf.com/content/WACV2024/html/Adam_SeaTurtleID2022_A_Long-Span_Dataset_for_Reliable_Sea_Turtle_Re-Identification_WACV_2024_paper.html"},{"id":"http://arxiv.org/abs/2404.19598v1","updated":"2024-04-30T14:49:03Z","published":"2024-04-30T14:49:03Z","title":"Artificial Intelligence in Bone Metastasis Analysis: Current\n Advancements, Opportunities and Challenges","summary":" In recent years, Artificial Intelligence (AI) has been widely used in\nmedicine, particularly in the analysis of medical imaging, which has been\ndriven by advances in computer vision and deep learning methods. This is\nparticularly important in overcoming the challenges posed by diseases such as\nBone Metastases (BM), a common and complex malignancy of the bones. Indeed,\nthere have been an increasing interest in developing Machine Learning (ML)\ntechniques into oncologic imaging for BM analysis. In order to provide a\ncomprehensive overview of the current state-of-the-art and advancements for BM\nanalysis using artificial intelligence, this review is conducted with the\naccordance with PRISMA guidelines. Firstly, this review highlights the clinical\nand oncologic perspectives of BM and the used medical imaging modalities, with\ndiscussing their advantages and limitations. 
Then the review focuses on modern\napproaches with considering the main BM analysis tasks, which includes:\nclassification, detection and segmentation. The results analysis show that ML\ntechnologies can achieve promising performance for BM analysis and have\nsignificant potential to improve clinician efficiency and cope with time and\ncost limitations. Furthermore, there are requirements for further research to\nvalidate the clinical performance of ML tools and facilitate their integration\ninto routine clinical practice.\n","authors":["Marwa Afnouch","Fares Bougourzi","Olfa Gaddour","Fadi Dornaika","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.19598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12135v2","updated":"2024-04-30T14:43:21Z","published":"2023-12-19T13:14:52Z","title":"Object Detection for Automated Coronary Artery Using Deep Learning","summary":" In the era of digital medicine, medical imaging serves as a widespread\ntechnique for early disease detection, with a substantial volume of images\nbeing generated and stored daily in electronic patient records. X-ray\nangiography imaging is a standard and one of the most common methods for\nrapidly diagnosing coronary artery diseases. The notable achievements of recent\ndeep learning algorithms align with the increased use of electronic health\nrecords and diagnostic imaging. Deep neural networks, leveraging abundant data,\nadvanced algorithms, and powerful computational capabilities, prove highly\neffective in the analysis and interpretation of images. In this context, Object\ndetection methods have become a promising approach, particularly through\nconvolutional neural networks (CNN), streamlining medical image analysis by\neliminating manual feature extraction. This allows for direct feature\nextraction from images, ensuring high accuracy in results. Therefore, in our\npaper, we utilized the object detection method on X-ray angiography images to\nprecisely identify the location of coronary artery stenosis. As a result, this\nmodel enables automatic and real-time detection of stenosis locations,\nassisting in the crucial and sensitive decision-making process for healthcare\nprofessionals.\n","authors":["Hadis Keshavarz","Hossein Sadr"],"pdf_url":"https://arxiv.org/pdf/2312.12135v2.pdf","comment":"The results in the article need fundamental corrections"},{"id":"http://arxiv.org/abs/2404.19595v1","updated":"2024-04-30T14:42:55Z","published":"2024-04-30T14:42:55Z","title":"Perceptual Constancy Constrained Single Opinion Score Calibration for\n Image Quality Assessment","summary":" In this paper, we propose a highly efficient method to estimate an image's\nmean opinion score (MOS) from a single opinion score (SOS). Assuming that each\nSOS is the observed sample of a normal distribution and the MOS is its unknown\nexpectation, the MOS inference is formulated as a maximum likelihood estimation\nproblem, where the perceptual correlation of pairwise images is considered in\nmodeling the likelihood of SOS. More specifically, by means of the\nquality-aware representations learned from the self-supervised backbone, we\nintroduce a learnable relative quality measure to predict the MOS difference\nbetween two images. Then, the current image's maximum likelihood estimation\ntowards MOS is represented by the sum of another reference image's estimated\nMOS and their relative quality. 
Ideally, no matter which image is selected as\nthe reference, the MOS of the current image should remain unchanged, which is\ntermed perceptual cons tancy constrained calibration (PC3). Finally, we\nalternatively optimize the relative quality measure's parameter and the current\nimage's estimated MOS via backpropagation and Newton's method respectively.\nExperiments show that the proposed method is efficient in calibrating the\nbiased SOS and significantly improves IQA model learning when only SOSs are\navailable.\n","authors":["Lei Wang","Desen Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.19595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16132v2","updated":"2024-04-30T14:37:59Z","published":"2023-06-28T12:01:51Z","title":"Fast and Accurate Unknown Object Instance Segmentation through\n Error-Informed Refinement","summary":" Accurate perception of unknown objects is essential for autonomous robots,\nparticularly when manipulating novel items in unstructured environments.\nHowever, existing unknown object instance segmentation (UOIS) methods often\nhave over-segmentation and under-segmentation problems, resulting in inaccurate\ninstance boundaries and failures in subsequent robotic tasks such as grasping\nand placement. To address this challenge, this article introduces INSTA-BEER, a\nfast and accurate model-agnostic refinement method that enhances the UOIS\nperformance. The model adopts an error-informed refinement approach, which\nfirst predicts pixel-wise errors in the initial segmentation and then refines\nthe segmentation guided by these error estimates. We introduce the quad-metric\nboundary error, which quantifies pixel-wise true positives, true negatives,\nfalse positives, and false negatives at the boundaries of object instances,\neffectively capturing both fine-grained and instance-level segmentation errors.\nAdditionally, the Error Guidance Fusion (EGF) module explicitly integrates\nerror information into the refinement process, further improving segmentation\nquality. In comprehensive evaluations conducted on three widely used benchmark\ndatasets, INSTA-BEER outperformed state-of-the-art models in both accuracy and\ninference time. Moreover, a real-world robotic experiment demonstrated the\npractical applicability of our method in improving the performance of target\nobject grasping tasks in cluttered environments.\n","authors":["Seunghyeok Back","Sangbeom Lee","Kangmin Kim","Joosoon Lee","Sungho Shin","Jemo Maeng","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2306.16132v2.pdf","comment":"8 pages, 5 figures, project website:\n https://sites.google.com/view/insta-beer"},{"id":"http://arxiv.org/abs/2402.03305v2","updated":"2024-04-30T14:32:31Z","published":"2024-02-05T18:58:38Z","title":"Do Diffusion Models Learn Semantically Meaningful and Efficient\n Representations?","summary":" Diffusion models are capable of impressive feats of image generation with\nuncommon juxtapositions such as astronauts riding horses on the moon with\nproperly placed shadows. These outputs indicate the ability to perform\ncompositional generalization, but how do the models do so? We perform\ncontrolled experiments on conditional DDPMs learning to generate 2D spherical\nGaussian bumps centered at specified $x$- and $y$-positions. Our results show\nthat the emergence of semantically meaningful latent representations is key to\nachieving high performance. 
En route to successful performance over learning,\nthe model traverses three distinct phases of latent representations: (phase A)\nno latent structure, (phase B) a 2D manifold of disordered states, and (phase\nC) a 2D ordered manifold. Corresponding to each of these phases, we identify\nqualitatively different generation behaviors: 1) multiple bumps are generated,\n2) one bump is generated but at inaccurate $x$ and $y$ locations, 3) a bump is\ngenerated at the correct $x$ and y location. Furthermore, we show that even\nunder imbalanced datasets where features ($x$- versus $y$-positions) are\nrepresented with skewed frequencies, the learning process for $x$ and $y$ is\ncoupled rather than factorized, demonstrating that simple vanilla-flavored\ndiffusion models cannot learn efficient representations in which localization\nin $x$ and $y$ are factorized into separate 1D tasks. These findings suggest\nthe need for future work to find inductive biases that will push generative\nmodels to discover and exploit factorizable independent structures in their\ninputs, which will be required to vault these models into more data-efficient\nregimes.\n","authors":["Qiyao Liang","Ziming Liu","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2402.03305v2.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.19586v1","updated":"2024-04-30T14:25:32Z","published":"2024-04-30T14:25:32Z","title":"AI techniques for near real-time monitoring of contaminants in coastal\n waters on board future Phisat-2 mission","summary":" Differently from conventional procedures, the proposed solution advocates for\na groundbreaking paradigm in water quality monitoring through the integration\nof satellite Remote Sensing (RS) data, Artificial Intelligence (AI) techniques,\nand onboard processing. The objective is to offer nearly real-time detection of\ncontaminants in coastal waters addressing a significant gap in the existing\nliterature. Moreover, the expected outcomes include substantial advancements in\nenvironmental monitoring, public health protection, and resource conservation.\nThe specific focus of our study is on the estimation of Turbidity and pH\nparameters, for their implications on human and aquatic health. Nevertheless,\nthe designed framework can be extended to include other parameters of interest\nin the water environment and beyond. Originating from our participation in the\nEuropean Space Agency (ESA) OrbitalAI Challenge, this article describes the\ndistinctive opportunities and issues for the contaminants monitoring on the\nPhisat-2 mission. The specific characteristics of this mission, with the tools\nmade available, will be presented, with the methodology proposed by the authors\nfor the onboard monitoring of water contaminants in near real-time. Preliminary\npromising results are discussed and in progress and future work introduced.\n","authors":["Francesca Razzano","Pietro Di Stasio","Francesco Mauro","Gabriele Meoni","Marco Esposito","Gilda Schirinzi","Silvia L. 
Ullo"],"pdf_url":"https://arxiv.org/pdf/2404.19586v1.pdf","comment":"11 pages, 9 figures, submitted to IEEE JSTARS"},{"id":"http://arxiv.org/abs/2404.13680v2","updated":"2024-04-30T14:24:23Z","published":"2024-04-21T14:43:31Z","title":"PoseAnimate: Zero-shot high fidelity pose controllable character\n animation","summary":" Image-to-video(I2V) generation aims to create a video sequence from a single\nimage, which requires high temporal coherence and visual fidelity with the\nsource image.However, existing approaches suffer from character appearance\ninconsistency and poor preservation of fine details. Moreover, they require a\nlarge amount of video data for training, which can be computationally\ndemanding.To address these limitations,we propose PoseAnimate, a novel\nzero-shot I2V framework for character animation.PoseAnimate contains three key\ncomponents: 1) Pose-Aware Control Module (PACM) incorporates diverse pose\nsignals into conditional embeddings, to preserve character-independent content\nand maintain precise alignment of actions.2) Dual Consistency Attention Module\n(DCAM) enhances temporal consistency, and retains character identity and\nintricate background details.3) Mask-Guided Decoupling Module (MGDM) refines\ndistinct feature perception, improving animation fidelity by decoupling the\ncharacter and background.We also propose a Pose Alignment Transition Algorithm\n(PATA) to ensure smooth action transition.Extensive experiment results\ndemonstrate that our approach outperforms the state-of-the-art training-based\nmethods in terms of character consistency and detail fidelity. Moreover, it\nmaintains a high level of temporal coherence throughout the generated\nanimations.\n","authors":["Bingwen Zhu","Fanyi Wang","Tianyi Lu","Peng Liu","Jingwen Su","Jinxiu Liu","Yanhao Zhang","Zuxuan Wu","Yu-Gang Jiang","Guo-Jun Qi"],"pdf_url":"https://arxiv.org/pdf/2404.13680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19579v1","updated":"2024-04-30T14:16:45Z","published":"2024-04-30T14:16:45Z","title":"Automatic Cardiac Pathology Recognition in Echocardiography Images Using\n Higher Order Dynamic Mode Decomposition and a Vision Transformer for Small\n Datasets","summary":" Heart diseases are the main international cause of human defunction.\nAccording to the WHO, nearly 18 million people decease each year because of\nheart diseases. Also considering the increase of medical data, much pressure is\nput on the health industry to develop systems for early and accurate heart\ndisease recognition. In this work, an automatic cardiac pathology recognition\nsystem based on a novel deep learning framework is proposed, which analyses in\nreal-time echocardiography video sequences. The system works in two stages. The\nfirst one transforms the data included in a database of echocardiography\nsequences into a machine-learning-compatible collection of annotated images\nwhich can be used in the training stage of any kind of machine learning-based\nframework, and more specifically with deep learning. This includes the use of\nthe Higher Order Dynamic Mode Decomposition (HODMD) algorithm, for the first\ntime to the authors' knowledge, for both data augmentation and feature\nextraction in the medical field. The second stage is focused on building and\ntraining a Vision Transformer (ViT), barely explored in the related literature.\nThe ViT is adapted for an effective training from scratch, even with small\ndatasets. 
The designed neural network analyses images from an echocardiography\nsequence to predict the heart state. The results obtained show the superiority\nof the proposed system and the efficacy of the HODMD algorithm, even\noutperforming pretrained Convolutional Neural Networks (CNNs), which are so far\nthe method of choice in the literature.\n","authors":["Andrés Bell-Navas","Nourelhouda Groun","María Villalba-Orero","Enrique Lara-Pezzi","Jesús Garicano-Mena","Soledad Le Clainche"],"pdf_url":"https://arxiv.org/pdf/2404.19579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18370v2","updated":"2024-04-30T14:13:48Z","published":"2024-02-27T01:21:37Z","title":"Adversarial Example Soups: Improving Transferability and Stealthiness\n for Free","summary":" Transferable adversarial examples cause practical security risks since they\ncan mislead a target model without knowing its internal knowledge. A\nconventional recipe for maximizing transferability is to keep only the optimal\nadversarial example from all those obtained in the optimization pipeline. In\nthis paper, for the first time, we question this convention and demonstrate\nthat those discarded, sub-optimal adversarial examples can be reused to boost\ntransferability. Specifically, we propose ``Adversarial Example Soups'' (AES),\nwith AES-tune for averaging discarded adversarial examples in hyperparameter\ntuning and AES-rand for stability testing. In addition, our AES is inspired by\n``model soups'', which averages weights of multiple fine-tuned models for\nimproved accuracy without increasing inference time. Extensive experiments\nvalidate the global effectiveness of our AES, boosting 10 state-of-the-art\ntransfer attacks and their combinations by up to 13% against 10 diverse\n(defensive) target models. We also show the possibility of generalizing AES to\nother types, e.g., directly averaging multiple in-the-wild adversarial examples\nthat yield comparable success. A promising byproduct of AES is the improved\nstealthiness of adversarial examples since the perturbation variances are\nnaturally reduced.\n","authors":["Bo Yang","Hengwei Zhang","Jindong Wang","Yulong Yang","Chenhao Lin","Chao Shen","Zhengyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.18370v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.19574v1","updated":"2024-04-30T14:09:14Z","published":"2024-04-30T14:09:14Z","title":"A Spatio-Temporal based Frame Indexing Algorithm for QoS Improvement in\n Live Low-Motion Video Streaming","summary":" Real-time video life streaming of events over a network continued to gain\nmore popularity among the populace. However, there is need to ensure the\njudicious utilization of allocated bandwidth without compromising the Quality\nof Service (QoS) of the system. In this regard, this paper presents an approach\nbased on spatio-temporal frame indexing that detects and eliminate redundancy\nwithin and across captured frame, prior transmission from the server to\nclients. The standard and local low motion videos were the two scenarios\nconsidered in evaluating the performance of the proposed algorithm. Results\nobtained showed that the proposed approach achieved an improvement of 5.13%,\n15.8% and 5%, 15.6% improvement in terms of the buffer size and compression\nratio. 
Though with a tradeoff of the frame-built time, where both the standard\nand local frame indexing outperforms the proposed scheme with 10.8% and 8.71%\nrespectively.\n","authors":["Adewale Emmanuel Adedokun","Muhammed Bashir Abdulrazak","Muyideen Momoh Omuya","Habeeb BelloSalau","Bashir Olaniyi Sadiq"],"pdf_url":"https://arxiv.org/pdf/2404.19574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07355v2","updated":"2024-04-30T14:02:35Z","published":"2023-10-11T10:12:43Z","title":"IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training","summary":" In the field of medical Vision-Language Pre-training (VLP), significant\nefforts have been devoted to deriving text and image features from both\nclinical reports and associated medical images. However, most existing methods\nmay have overlooked the opportunity in leveraging the inherent hierarchical\nstructure of clinical reports, which are generally split into `findings' for\ndescriptive content and `impressions' for conclusive observation. Instead of\nutilizing this rich, structured format, current medical VLP approaches often\nsimplify the report into either a unified entity or fragmented tokens. In this\nwork, we propose a novel clinical prior guided VLP framework named IMITATE to\nlearn the structure information from medical reports with hierarchical\nvision-language alignment. The framework derives multi-level visual features\nfrom the chest X-ray (CXR) images and separately aligns these features with the\ndescriptive and the conclusive text encoded in the hierarchical medical report.\nFurthermore, a new clinical-informed contrastive loss is introduced for\ncross-modal learning, which accounts for clinical prior knowledge in\nformulating sample correlations in contrastive learning. The proposed model,\nIMITATE, outperforms baseline VLP methods across six different datasets,\nspanning five medical imaging downstream tasks. Comprehensive experimental\nresults highlight the advantages of integrating the hierarchical structure of\nmedical reports for vision-language alignment.\n","authors":["Che Liu","Sibo Cheng","Miaojing Shi","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07355v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2307.16670v3","updated":"2024-04-30T13:59:31Z","published":"2023-07-31T13:47:33Z","title":"Conditioning Generative Latent Optimization for Sparse-View CT Image\n Reconstruction","summary":" Computed Tomography (CT) is a prominent example of Imaging Inverse Problem\nhighlighting the unrivaled performances of data-driven methods in degraded\nmeasurements setups like sparse X-ray projections. Although a significant\nproportion of deep learning approaches benefit from large supervised datasets,\nthey cannot generalize to new experimental setups. In contrast, fully\nunsupervised techniques, most notably using score-based generative models, have\nrecently demonstrated similar or better performances compared to supervised\napproaches while being flexible at test time. However, their use cases are\nlimited as they need considerable amounts of training data to have good\ngeneralization properties. Another unsupervised approach taking advantage of\nthe implicit natural bias of deep convolutional networks, Deep Image Prior, has\nrecently been adapted to solve sparse CT by reparameterizing the reconstruction\nproblem. 
Although this methodology does not require any training dataset, it\nenforces a weaker prior on the reconstructions when compared to data-driven\nmethods. To fill the gap between these two strategies, we propose an\nunsupervised conditional approach to the Generative Latent Optimization\nframework (cGLO). Similarly to DIP, without any training dataset, cGLO benefits\nfrom the structural bias of a decoder network. However, the prior is further\nreinforced as the effect of a likelihood objective shared between multiple\nslices being reconstructed simultaneously through the same decoder network. In\naddition, the parameters of the decoder may be initialized on an unsupervised,\nand eventually very small, training dataset to enhance the reconstruction. The\nresulting approach is tested on full-dose sparse-view CT using multiple\ntraining dataset sizes and varying numbers of viewing angles.\n","authors":["Thomas Braure","Delphine Lazaro","David Hateau","Vincent Brandon","Kévin Ginsburger"],"pdf_url":"https://arxiv.org/pdf/2307.16670v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19568v1","updated":"2024-04-30T13:59:13Z","published":"2024-04-30T13:59:13Z","title":"Enhancing Deep Learning Model Explainability in Brain Tumor Datasets\n using Post-Heuristic Approaches","summary":" The application of deep learning models in medical diagnosis has showcased\nconsiderable efficacy in recent years. Nevertheless, a notable limitation\ninvolves the inherent lack of explainability during decision-making processes.\nThis study addresses such a constraint, by enhancing the interpretability\nrobustness. The primary focus is directed towards refining the explanations\ngenerated by the LIME Library and LIME image explainer. This is achieved\nthrouhg post-processing mechanisms, based on scenario-specific rules. Multiple\nexperiments have been conducted using publicly accessible datasets related to\nbrain tumor detection. Our proposed post-heuristic approach demonstrates\nsignificant advancements, yielding more robust and concrete results, in the\ncontext of medical diagnosis.\n","authors":["Konstantinos Pasvantis","Eftychios Protopapadakis"],"pdf_url":"https://arxiv.org/pdf/2404.19568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19567v1","updated":"2024-04-30T13:55:30Z","published":"2024-04-30T13:55:30Z","title":"Causal Perception Inspired Representation Learning for Trustworthy Image\n Quality Assessment","summary":" Despite great success in modeling visual perception, deep neural network\nbased image quality assessment (IQA) still remains unreliable in real-world\napplications due to its vulnerability to adversarial perturbations and the\ninexplicit black-box structure. In this paper, we propose to build a\ntrustworthy IQA model via Causal Perception inspired Representation Learning\n(CPRL), and a score reflection attack method for IQA model. More specifically,\nwe assume that each image is composed of Causal Perception Representation (CPR)\nand non-causal perception representation (N-CPR). CPR serves as the causation\nof the subjective quality label, which is invariant to the imperceptible\nadversarial perturbations. Inversely, N-CPR presents spurious associations with\nthe subjective quality label, which may significantly change with the\nadversarial perturbations. 
To extract the CPR from each input image, we develop\na soft ranking based channel-wise activation function to mediate the causally\nsufficient (beneficial for high prediction accuracy) and necessary (beneficial\nfor high robustness) deep features, and based on intervention employ minimax\ngame to optimize. Experiments on four benchmark databases show that the\nproposed CPRL method outperforms many state-of-the-art adversarial defense\nmethods and provides explicit model interpretation.\n","authors":["Lei Wang","Desen Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.19567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14831v3","updated":"2024-04-30T13:25:57Z","published":"2024-01-26T12:59:26Z","title":"The Machine Vision Iceberg Explained: Advancing Dynamic Testing by\n Considering Holistic Environmental Relations","summary":" Machine Vision (MV) is essential for solving driving automation. This paper\nexamines potential shortcomings in current MV testing strategies for highly\nautomated driving (HAD) systems. We argue for a more comprehensive\nunderstanding of the performance factors that must be considered during the MV\nevaluation process, noting that neglecting these factors can lead to\nsignificant risks. This is not only relevant to MV component testing, but also\nto integration testing. To illustrate this point, we draw an analogy to a ship\nnavigating towards an iceberg to show potential hidden challenges in current MV\ntesting strategies. The main contribution is a novel framework for black-box\ntesting which observes environmental relations. This means it is designed to\nenhance MV assessments by considering the attributes and surroundings of\nrelevant individual objects. The framework provides the identification of seven\ngeneral concerns about the object recognition of MV, which are not addressed\nadequately in established test processes. To detect these deficits based on\ntheir performance factors, we propose the use of a taxonomy called \"granularity\norders\" along with a graphical representation. This allows an identification of\nMV uncertainties across a range of driving scenarios. This approach aims to\nadvance the precision, efficiency, and completeness of testing procedures for\nMV.\n","authors":["Hubert Padusinski","Christian Steinhauser","Thilo Braun","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2401.14831v3.pdf","comment":"Submitted at IEEE ITSC 2024"},{"id":"http://arxiv.org/abs/2404.19542v1","updated":"2024-04-30T13:14:28Z","published":"2024-04-30T13:14:28Z","title":"One-Stage Open-Vocabulary Temporal Action Detection Leveraging Temporal\n Multi-scale and Action Label Features","summary":" Open-vocabulary Temporal Action Detection (Open-vocab TAD) is an advanced\nvideo analysis approach that expands Closed-vocabulary Temporal Action\nDetection (Closed-vocab TAD) capabilities. Closed-vocab TAD is typically\nconfined to localizing and classifying actions based on a predefined set of\ncategories. In contrast, Open-vocab TAD goes further and is not limited to\nthese predefined categories. This is particularly useful in real-world\nscenarios where the variety of actions in videos can be vast and not always\npredictable. The prevalent methods in Open-vocab TAD typically employ a 2-stage\napproach, which involves generating action proposals and then identifying those\nactions. However, errors made during the first stage can adversely affect the\nsubsequent action identification accuracy. 
Additionally, existing studies face\nchallenges in handling actions of different durations owing to the use of fixed\ntemporal processing methods. Therefore, we propose a 1-stage approach\nconsisting of two primary modules: Multi-scale Video Analysis (MVA) and\nVideo-Text Alignment (VTA). The MVA module captures actions at varying temporal\nresolutions, overcoming the challenge of detecting actions with diverse\ndurations. The VTA module leverages the synergy between visual and textual\nmodalities to precisely align video segments with corresponding action labels,\na critical step for accurate action identification in Open-vocab scenarios.\nEvaluations on widely recognized datasets THUMOS14 and ActivityNet-1.3, showed\nthat the proposed method achieved superior results compared to the other\nmethods in both Open-vocab and Closed-vocab settings. This serves as a strong\ndemonstration of the effectiveness of the proposed method in the TAD task.\n","authors":["Trung Thanh Nguyen","Yasutomo Kawanishi","Takahiro Komamizu","Ichiro Ide"],"pdf_url":"https://arxiv.org/pdf/2404.19542v1.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.19541v1","updated":"2024-04-30T13:14:11Z","published":"2024-04-30T13:14:11Z","title":"Ultra Inertial Poser: Scalable Motion Capture and Tracking from Sparse\n Inertial Sensors and Ultra-Wideband Ranging","summary":" While camera-based capture systems remain the gold standard for recording\nhuman motion, learning-based tracking systems based on sparse wearable sensors\nare gaining popularity. Most commonly, they use inertial sensors, whose\npropensity for drift and jitter have so far limited tracking accuracy. In this\npaper, we propose Ultra Inertial Poser, a novel 3D full body pose estimation\nmethod that constrains drift and jitter in inertial tracking via inter-sensor\ndistances. We estimate these distances across sparse sensor setups using a\nlightweight embedded tracker that augments inexpensive off-the-shelf 6D\ninertial measurement units with ultra-wideband radio-based\nranging$-$dynamically and without the need for stationary reference anchors.\nOur method then fuses these inter-sensor distances with the 3D states estimated\nfrom each sensor Our graph-based machine learning model processes the 3D states\nand distances to estimate a person's 3D full body pose and translation. To\ntrain our model, we synthesize inertial measurements and distance estimates\nfrom the motion capture database AMASS. For evaluation, we contribute a novel\nmotion dataset of 10 participants who performed 25 motion types, captured by 6\nwearable IMU+UWB trackers and an optical motion capture system, totaling 200\nminutes of synchronized sensor data (UIP-DB). 
Our extensive experiments show\nstate-of-the-art performance for our method over PIP and TIP, reducing position\nerror from $13.62$ to $10.65cm$ ($22\\%$ better) and lowering jitter from $1.56$\nto $0.055km/s^3$ (a reduction of $97\\%$).\n","authors":["Rayan Armani","Changlin Qian","Jiaxi Jiang","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2404.19541v1.pdf","comment":"Accepted by SIGGRAPH 2024, Code:\n https://github.com/eth-siplab/UltraInertialPoser"},{"id":"http://arxiv.org/abs/2404.19534v1","updated":"2024-04-30T13:11:12Z","published":"2024-04-30T13:11:12Z","title":"MIPI 2024 Challenge on Nighttime Flare Removal: Methods and Results","summary":" The increasing demand for computational photography and imaging on mobile\nplatforms has led to the widespread development and integration of advanced\nimage sensors with novel algorithms in camera systems. However, the scarcity of\nhigh-quality data for research and the rare opportunity for in-depth exchange\nof views from industry and academia constrain the development of mobile\nintelligent photography and imaging (MIPI). Building on the achievements of the\nprevious MIPI Workshops held at ECCV 2022 and CVPR 2023, we introduce our third\nMIPI challenge including three tracks focusing on novel image sensors and\nimaging algorithms. In this paper, we summarize and review the Nighttime Flare\nRemoval track on MIPI 2024. In total, 170 participants were successfully\nregistered, and 14 teams submitted results in the final testing phase. The\ndeveloped solutions in this challenge achieved state-of-the-art performance on\nNighttime Flare Removal. More details of this challenge and the link to the\ndataset can be found at https://mipi-challenge.org/MIPI2024/.\n","authors":["Yuekun Dai","Dafeng Zhang","Xiaoming Li","Zongsheng Yue","Chongyi Li","Shangchen Zhou","Ruicheng Feng","Peiqing Yang","Zhezhu Jin","Guanqun Liu","Chen Change Loy","Lize Zhang","Shuai Liu","Chaoyu Feng","Luyang Wang","Shuan Chen","Guangqi Shao","Xiaotao Wang","Lei Lei","Qirui Yang","Qihua Cheng","Zhiqiang Xu","Yihao Liu","Huanjing Yue","Jingyu Yang","Florin-Alexandru Vasluianu","Zongwei Wu","George Ciubotariu","Radu Timofte","Zhao Zhang","Suiyi Zhao","Bo Wang","Zhichao Zuo","Yanyan Wei","Kuppa Sai Sri Teja","Jayakar Reddy A","Girish Rongali","Kaushik Mitra","Zhihao Ma","Yongxu Liu","Wanying Zhang","Wei Shang","Yuhong He","Long Peng","Zhongxin Yu","Shaofei Luo","Jian Wang","Yuqi Miao","Baiang Li","Gang Wei","Rakshank Verma","Ritik Maheshwari","Rahul Tekchandani","Praful Hambarde","Satya Narayan Tazi","Santosh Kumar Vipparthi","Subrahmanyam Murala","Haopeng Zhang","Yingli Hou","Mingde Yao","Levin M S","Aniruth Sundararajan","Hari Kumar A"],"pdf_url":"https://arxiv.org/pdf/2404.19534v1.pdf","comment":"CVPR 2024 Mobile Intelligent Photography and Imaging (MIPI)\n Workshop--Nighttime Flare Removal Challenge Report. Website:\n https://mipi-challenge.org/MIPI2024/"},{"id":"http://arxiv.org/abs/2404.19531v1","updated":"2024-04-30T13:09:41Z","published":"2024-04-30T13:09:41Z","title":"MoST: Multi-modality Scene Tokenization for Motion Prediction","summary":" Many existing motion prediction approaches rely on symbolic perception\noutputs to generate agent trajectories, such as bounding boxes, road graph\ninformation and traffic lights. 
This symbolic representation is a high-level\nabstraction of the real world, which may render the motion prediction model\nvulnerable to perception errors (e.g., failures in detecting open-vocabulary\nobstacles) while missing salient information from the scene context (e.g., poor\nroad conditions). An alternative paradigm is end-to-end learning from raw\nsensors. However, this approach suffers from the lack of interpretability and\nrequires significantly more training resources. In this work, we propose\ntokenizing the visual world into a compact set of scene elements and then\nleveraging pre-trained image foundation models and LiDAR neural networks to\nencode all the scene elements in an open-vocabulary manner. The image\nfoundation model enables our scene tokens to encode the general knowledge of\nthe open world while the LiDAR neural network encodes geometry information. Our\nproposed representation can efficiently encode the multi-frame multi-modality\nobservations with a few hundred tokens and is compatible with most\ntransformer-based architectures. To evaluate our method, we have augmented\nWaymo Open Motion Dataset with camera embeddings. Experiments over Waymo Open\nMotion Dataset show that our approach leads to significant performance\nimprovements over the state-of-the-art.\n","authors":["Norman Mu","Jingwei Ji","Zhenpei Yang","Nate Harada","Haotian Tang","Kan Chen","Charles R. Qi","Runzhou Ge","Kratarth Goel","Zoey Yang","Scott Ettinger","Rami Al-Rfou","Dragomir Anguelov","Yin Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.19531v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.19525v1","updated":"2024-04-30T12:56:14Z","published":"2024-04-30T12:56:14Z","title":"MicroDreamer: Zero-shot 3D Generation in $\\sim$20 Seconds by Score-based\n Iterative Reconstruction","summary":" Optimization-based approaches, such as score distillation sampling (SDS),\nshow promise in zero-shot 3D generation but suffer from low efficiency,\nprimarily due to the high number of function evaluations (NFEs) required for\neach sample. In this paper, we introduce score-based iterative reconstruction\n(SIR), an efficient and general algorithm for 3D generation with a multi-view\nscore-based diffusion model. Given the images produced by the diffusion model,\nSIR reduces NFEs by repeatedly optimizing 3D parameters, unlike the single\noptimization in SDS, mimicking the 3D reconstruction process. With other\nimprovements including optimization in the pixel space, we present an efficient\napproach called MicroDreamer that generally applies to various 3D\nrepresentations and 3D generation tasks. In particular, retaining a comparable\nperformance, MicroDreamer is 5-20 times faster than SDS in generating neural\nradiance field and takes about 20 seconds to generate meshes from 3D Gaussian\nsplitting on a single A100 GPU, halving the time of the fastest zero-shot\nbaseline, DreamGaussian. Our code is available at\nhttps://github.com/ML-GSAI/MicroDreamer.\n","authors":["Luxi Chen","Zhengyi Wang","Chongxuan Li","Tingting Gao","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.19525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19513v1","updated":"2024-04-30T12:45:41Z","published":"2024-04-30T12:45:41Z","title":"A Smartphone-Based Method for Assessing Tomato Nutrient Status through\n Trichome Density Measurement","summary":" Accurately assessing tomato plant nutrient status is crucial for maintaining\nhigh yields. 
Consequently, accurately identifying fertilizer-induced stress\nthrough the morphological traits of tomato plants has become a critical\nagricultural challenge. Research and development efforts have focused on\ndeveloping noninvasive diagnostic tools for nutrition that leverage a\ncombination of morphological traits and advanced sensor technologies. Given\nthese advancements, detecting fertilizer stress by observing morphological\ntraits near the growth points of tomatoes is still a significant challenge. To\naddress this challenge, we developed a simple and cost-effective\nsmartphone-based method for measuring trichome density. This method involves\ntransferring trichomes from the surface of a leaf onto cellophane tape and\ncapturing images using a smartphone. The images are processed using computer\nvision techniques to calculate the trichome density. To assess the efficacy of\nthis method, we performed experiments on hydroponically grown tomato plants\nsubjected to varying fertilizer concentrations. Our results indicate that our\nnovel method for measuring trichome density accurately reflects fertilizer\nstress in tomato plants. The predictive performance of our model, as evaluated\nby the mean area under the precision recall curve, was 0.824, despite\nvariations in the measurement data caused by differences in optical conditions.\nThis study introduces an innovative approach for designing diagnostic devices\nfor detecting fertilizer stress in plants by considering the surface structures\nof plants. Our proposed method represents a straightforward, efficient, and\neconomical approach for evaluating the nutrient status of tomato plants and has\nthe potential to overcome the limitations of conventional noncontact optical\nmethods.\n","authors":["Sho Ueda","Xujun Ye"],"pdf_url":"https://arxiv.org/pdf/2404.19513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07027v2","updated":"2024-04-30T12:37:13Z","published":"2023-10-10T21:29:41Z","title":"Utilizing Synthetic Data for Medical Vision-Language Pre-training:\n Bypassing the Need for Real Images","summary":" Medical Vision-Language Pre-training (VLP) learns representations jointly\nfrom medical images and paired radiology reports. It typically requires\nlarge-scale paired image-text datasets to achieve effective pre-training for\nboth the image encoder and text encoder. The advent of text-guided generative\nmodels raises a compelling question: Can VLP be implemented solely with\nsynthetic images generated from genuine radiology reports, thereby mitigating\nthe need for extensively pairing and curating image-text datasets? In this\nwork, we scrutinize this very question by examining the feasibility and\neffectiveness of employing synthetic images for medical VLP. We replace real\nmedical images with their synthetic equivalents, generated from authentic\nmedical reports. Utilizing three state-of-the-art VLP algorithms, we\nexclusively train on these synthetic samples. Our empirical evaluation across\nthree subsequent tasks, namely image classification, semantic segmentation and\nobject detection, reveals that the performance achieved through synthetic data\nis on par with or even exceeds that obtained with real images. As a pioneering\ncontribution to this domain, we introduce a large-scale synthetic medical image\ndataset, paired with anonymized real radiology reports. This alleviates the\nneed of sharing medical images, which are not easy to curate and share in\npractice. 
The code and the dataset can be found in\n\\href{https://github.com/cheliu-computation/MedSyn-RepLearn/tree/main}{https://github.com/cheliu-computation/MedSyn-RepLearn/tree/main}.\n","authors":["Che Liu","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07027v2.pdf","comment":"Accepted by CVPR 2024 Workshop Data Curation and Augmentation in\n Enhancing Medical Imaging Applications"},{"id":"http://arxiv.org/abs/2404.19500v1","updated":"2024-04-30T12:37:01Z","published":"2024-04-30T12:37:01Z","title":"Towards Real-world Video Face Restoration: A New Benchmark","summary":" Blind face restoration (BFR) on images has significantly progressed over the\nlast several years, while real-world video face restoration (VFR), which is\nmore challenging for more complex face motions such as moving gaze directions\nand facial orientations involved, remains unsolved. Typical BFR methods are\nevaluated on privately synthesized datasets or self-collected real-world\nlow-quality face images, which are limited in their coverage of real-world\nvideo frames. In this work, we introduced new real-world datasets named FOS\nwith a taxonomy of \"Full, Occluded, and Side\" faces from mainly video frames to\nstudy the applicability of current methods on videos. Compared with existing\ntest datasets, FOS datasets cover more diverse degradations and involve face\nsamples from more complex scenarios, which helps to revisit current face\nrestoration approaches more comprehensively. Given the established datasets, we\nbenchmarked both the state-of-the-art BFR methods and the video super\nresolution (VSR) methods to comprehensively study current approaches,\nidentifying their potential and limitations in VFR tasks. In addition, we\nstudied the effectiveness of the commonly used image quality assessment (IQA)\nmetrics and face IQA (FIQA) metrics by leveraging a subjective user study. With\nextensive experimental results and detailed analysis provided, we gained\ninsights from the successes and failures of both current BFR and VSR methods.\nThese results also pose challenges to current face restoration approaches,\nwhich we hope stimulate future advances in VFR research.\n","authors":["Ziyan Chen","Jingwen He","Xinqi Lin","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2404.19500v1.pdf","comment":"Project page: https://ziyannchen.github.io/projects/VFRxBenchmark/"},{"id":"http://arxiv.org/abs/2303.10802v2","updated":"2024-04-30T12:24:24Z","published":"2023-03-20T00:35:33Z","title":"PASS: Peer-Agreement based Sample Selection for training with Noisy\n Labels","summary":" The prevalence of noisy-label samples poses a significant challenge in deep\nlearning, inducing overfitting effects. This has, therefore, motivated the\nemergence of learning with noisy-label (LNL) techniques that focus on\nseparating noisy- and clean-label samples to apply different learning\nstrategies to each group of samples. Current methodologies often rely on the\nsmall-loss hypothesis or feature-based selection to separate noisy- and\nclean-label samples, yet our empirical observations reveal their limitations,\nespecially for labels with instance dependent noise (IDN). An important\ncharacteristic of IDN is the difficulty to distinguish the clean-label samples\nthat lie near the decision boundary (i.e., the hard samples) from the\nnoisy-label samples. 
We, therefore, propose a new noisy-label detection method,\ntermed Peer-Agreement based Sample Selection (PASS), to address this problem.\nUtilising a trio of classifiers, PASS employs consensus-driven peer-based\nagreement of two models to select the samples to train the remaining model.\nPASS is easily integrated into existing LNL models, enabling the improvement of\nthe detection accuracy of noisy- and clean-label samples, which increases the\nclassification accuracy across various LNL benchmarks.\n","authors":["Arpit Garg","Cuong Nguyen","Rafael Felix","Thanh-Toan Do","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2303.10802v2.pdf","comment":"In Submission"},{"id":"http://arxiv.org/abs/2404.19489v1","updated":"2024-04-30T12:18:47Z","published":"2024-04-30T12:18:47Z","title":"EvGNN: An Event-driven Graph Neural Network Accelerator for Edge Vision","summary":" Edge vision systems combining sensing and embedded processing promise\nlow-latency, decentralized, and energy-efficient solutions that forgo reliance\non the cloud. As opposed to conventional frame-based vision sensors,\nevent-based cameras deliver a microsecond-scale temporal resolution with sparse\ninformation encoding, thereby outlining new opportunities for edge vision\nsystems. However, mainstream algorithms for frame-based vision, which mostly\nrely on convolutional neural networks (CNNs), can hardly exploit the advantages\nof event-based vision as they are typically optimized for dense matrix-vector\nmultiplications. While event-driven graph neural networks (GNNs) have recently\nemerged as a promising solution for sparse event-based vision, their irregular\nstructure is a challenge that currently hinders the design of efficient\nhardware accelerators. In this paper, we propose EvGNN, the first event-driven\nGNN accelerator for low-footprint, ultra-low-latency, and high-accuracy edge\nvision with event-based cameras. It relies on three central ideas: (i) directed\ndynamic graphs exploiting single-hop nodes with edge-free storage, (ii) event\nqueues for the efficient identification of local neighbors within a\nspatiotemporally decoupled search range, and (iii) a novel layer-parallel\nprocessing scheme enabling the low-latency execution of multi-layer GNNs. We\ndeployed EvGNN on a Xilinx KV260 Ultrascale+ MPSoC platform and benchmarked it\non the N-CARS dataset for car recognition, demonstrating a classification\naccuracy of 87.8% and an average latency per event of 16$\\mu$s, thereby\nenabling real-time, microsecond-resolution event-based vision at the edge.\n","authors":["Yufeng Yang","Adrian Kneip","Charlotte Frenkel"],"pdf_url":"https://arxiv.org/pdf/2404.19489v1.pdf","comment":"12 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.06119v2","updated":"2024-04-30T12:05:02Z","published":"2024-03-10T07:31:06Z","title":"CLEAR: Cross-Transformers with Pre-trained Language Model is All you\n need for Person Attribute Recognition and Retrieval","summary":" Person attribute recognition and attribute-based retrieval are two core\nhuman-centric tasks. In the recognition task, the challenge is specifying\nattributes depending on a person's appearance, while the retrieval task\ninvolves searching for matching persons based on attribute queries. There is a\nsignificant relationship between recognition and retrieval tasks. In this\nstudy, we demonstrate that if there is a sufficiently robust network to solve\nperson attribute recognition, it can be adapted to facilitate better\nperformance for the retrieval task. 
Another issue that needs addressing in the\nretrieval task is the modality gap between attribute queries and persons'\nimages. Therefore, in this paper, we present CLEAR, a unified network designed\nto address both tasks. We introduce a robust cross-transformers network to\nhandle person attribute recognition. Additionally, leveraging a pre-trained\nlanguage model, we construct pseudo-descriptions for attribute queries and\nintroduce an effective training strategy to train only a few additional\nparameters for adapters, facilitating the handling of the retrieval task.\nFinally, the unified CLEAR model is evaluated on five benchmarks: PETA, PA100K,\nMarket-1501, RAPv2, and UPAR-2024. Without bells and whistles, CLEAR achieves\nstate-of-the-art performance or competitive results for both tasks,\nsignificantly outperforming other competitors in terms of person retrieval\nperformance on the widely-used Market-1501 dataset.\n","authors":["Doanh C. Bui","Thinh V. Le","Ba Hung Ngo","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.06119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19481v1","updated":"2024-04-30T11:49:29Z","published":"2024-04-30T11:49:29Z","title":"SpecstatOR: Speckle statistics-based iOCT Segmentation Network for\n Ophthalmic Surgery","summary":" This paper presents an innovative approach to intraoperative Optical\nCoherence Tomography (iOCT) image segmentation in ophthalmic surgery,\nleveraging statistical analysis of speckle patterns to incorporate statistical\npathology-specific prior knowledge. Our findings indicate statistically\ndifferent speckle patterns within the retina and between retinal layers and\nsurgical tools, facilitating the segmentation of previously unseen data without\nthe necessity for manual labeling. The research involves fitting various\nstatistical distributions to iOCT data, enabling the differentiation of\ndifferent ocular structures and surgical tools. The proposed segmentation model\naims to refine the statistical findings based on prior tissue understanding to\nleverage statistical and biological knowledge. Incorporating statistical\nparameters, physical analysis of light-tissue interaction, and deep learning\ninformed by biological structures enhance segmentation accuracy, offering\npotential benefits to real-time applications in ophthalmic surgical procedures.\nThe study demonstrates the adaptability and precision of using Gamma\ndistribution parameters and the derived binary maps as sole inputs for\nsegmentation, notably enhancing the model's inference performance on unseen\ndata.\n","authors":["Kristina Mach","Hessam Roodaki","Michael Sommersperger","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.19481v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.19475v1","updated":"2024-04-30T11:43:37Z","published":"2024-04-30T11:43:37Z","title":"TwinDiffusion: Enhancing Coherence and Efficiency in Panoramic Image\n Generation with Diffusion Models","summary":" Diffusion models have emerged as effective tools for generating diverse and\nhigh-quality content. However, their capability in high-resolution image\ngeneration, particularly for panoramic images, still faces challenges such as\nvisible seams and incoherent transitions. 
In this paper, we propose\nTwinDiffusion, an optimized framework designed to address these challenges\nthrough two key innovations: Crop Fusion for quality enhancement and Cross\nSampling for efficiency optimization. We introduce a training-free optimizing\nstage to refine the similarity of the adjacent image areas, as well as an\ninterleaving sampling strategy to yield dynamic patches during the cropping\nprocess. A comprehensive evaluation is conducted to compare TwinDiffusion with\nthe existing methods, considering factors including coherence, fidelity,\ncompatibility, and efficiency. The results demonstrate the superior performance\nof our approach in generating seamless and coherent panoramas, setting a new\nstandard in quality and efficiency for panoramic image generation.\n","authors":["Teng Zhou","Yongchuan Tang"],"pdf_url":"https://arxiv.org/pdf/2404.19475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05058v3","updated":"2024-04-30T11:20:47Z","published":"2023-10-08T07:48:25Z","title":"Learning Separable Hidden Unit Contributions for Speaker-Adaptive\n Lip-Reading","summary":" In this paper, we propose a novel method for speaker adaptation in lip\nreading, motivated by two observations. Firstly, a speaker's own\ncharacteristics can always be portrayed well by his/her few facial images or\neven a single image with shallow networks, while the fine-grained dynamic\nfeatures associated with speech content expressed by the talking face always\nneed deep sequential networks to represent accurately. Therefore, we treat the\nshallow and deep layers differently for speaker adaptive lip reading. Secondly,\nwe observe that a speaker's unique characteristics ( e.g. prominent oral cavity\nand mandible) have varied effects on lip reading performance for different\nwords and pronunciations, necessitating adaptive enhancement or suppression of\nthe features for robust lip reading. Based on these two observations, we\npropose to take advantage of the speaker's own characteristics to automatically\nlearn separable hidden unit contributions with different targets for shallow\nlayers and deep layers respectively. For shallow layers where features related\nto the speaker's characteristics are stronger than the speech content related\nfeatures, we introduce speaker-adaptive features to learn for enhancing the\nspeech content features. For deep layers where both the speaker's features and\nthe speech content features are all expressed well, we introduce the\nspeaker-adaptive features to learn for suppressing the speech content\nirrelevant noise for robust lip reading. Our approach consistently outperforms\nexisting methods, as confirmed by comprehensive analysis and comparison across\ndifferent settings. 
Besides the evaluation on the popular LRW-ID and GRID\ndatasets, we also release a new dataset for evaluation, CAS-VSR-S68h, to\nfurther assess the performance in an extreme setting where just a few speakers\nare available but the speech content covers a large and diversified range.\n","authors":["Songtao Luo","Shuang Yang","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2310.05058v3.pdf","comment":"Accepted to BMVC 2023 20pages"},{"id":"http://arxiv.org/abs/2404.19460v1","updated":"2024-04-30T11:19:05Z","published":"2024-04-30T11:19:05Z","title":"AttackBench: Evaluating Gradient-based Attacks for Adversarial Examples","summary":" Adversarial examples are typically optimized with gradient-based attacks.\nWhile novel attacks are continuously proposed, each is shown to outperform its\npredecessors using different experimental setups, hyperparameter settings, and\nnumber of forward and backward calls to the target models. This provides\noverly-optimistic and even biased evaluations that may unfairly favor one\nparticular attack over the others. In this work, we aim to overcome these\nlimitations by proposing AttackBench, i.e., the first evaluation framework that\nenables a fair comparison among different attacks. To this end, we first\npropose a categorization of gradient-based attacks, identifying their main\ncomponents and differences. We then introduce our framework, which evaluates\ntheir effectiveness and efficiency. We measure these characteristics by (i)\ndefining an optimality metric that quantifies how close an attack is to the\noptimal solution, and (ii) limiting the number of forward and backward queries\nto the model, such that all attacks are compared within a given maximum query\nbudget. Our extensive experimental analysis compares more than 100 attack\nimplementations with a total of over 800 different configurations against\nCIFAR-10 and ImageNet models, highlighting that only very few attacks\noutperform all the competing approaches. Within this analysis, we shed light on\nseveral implementation issues that prevent many attacks from finding better\nsolutions or running at all. We release AttackBench as a publicly available\nbenchmark, aiming to continuously update it to include and evaluate novel\ngradient-based attacks for optimizing adversarial examples.\n","authors":["Antonio Emanuele Cinà","Jérôme Rony","Maura Pintor","Luca Demetrio","Ambra Demontis","Battista Biggio","Ismail Ben Ayed","Fabio Roli"],"pdf_url":"https://arxiv.org/pdf/2404.19460v1.pdf","comment":"https://attackbench.github.io"},{"id":"http://arxiv.org/abs/2404.12083v2","updated":"2024-04-30T11:17:55Z","published":"2024-04-18T11:09:25Z","title":"MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye\n tracking","summary":" Event-based eye tracking has shown great promise with the high temporal\nresolution and low redundancy provided by the event camera. However, the\ndiversity and abruptness of eye movement patterns, including blinking,\nfixating, saccades, and smooth pursuit, pose significant challenges for eye\nlocalization. To achieve a stable event-based eye-tracking system, this paper\nproposes a bidirectional long-term sequence modeling and time-varying state\nselection mechanism to fully utilize contextual temporal information in\nresponse to the variability of eye movements. 
Specifically, the MambaPupil\nnetwork is proposed, which consists of the multi-layer convolutional encoder to\nextract features from the event representations, a bidirectional Gated\nRecurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM),\nto selectively capture contextual correlation from the forward and backward\ntemporal relationship. Furthermore, the Bina-rep is utilized as a compact event\nrepresentation, and the tailor-made data augmentation, called as Event-Cutout,\nis proposed to enhance the model's robustness by applying spatial random\nmasking to the event image. The evaluation on the ThreeET-plus benchmark shows\nthe superior performance of the MambaPupil, which secured the 1st place in\nCVPR'2024 AIS Event-based Eye Tracking challenge.\n","authors":["Zhong Wang","Zengyu Wan","Han Han","Bohao Liao","Yuliang Wu","Wei Zhai","Yang Cao","Zheng-jun Zha"],"pdf_url":"https://arxiv.org/pdf/2404.12083v2.pdf","comment":"Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for\n Streaming), top solution of challenge Event-based Eye Tracking, see\n https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024"},{"id":"http://arxiv.org/abs/2404.19444v1","updated":"2024-04-30T10:48:43Z","published":"2024-04-30T10:48:43Z","title":"AnomalyXFusion: Multi-modal Anomaly Synthesis with Diffusion","summary":" Anomaly synthesis is one of the effective methods to augment abnormal samples\nfor training. However, current anomaly synthesis methods predominantly rely on\ntexture information as input, which limits the fidelity of synthesized abnormal\nsamples. Because texture information is insufficient to correctly depict the\npattern of anomalies, especially for logical anomalies. To surmount this\nobstacle, we present the AnomalyXFusion framework, designed to harness\nmulti-modality information to enhance the quality of synthesized abnormal\nsamples. The AnomalyXFusion framework comprises two distinct yet synergistic\nmodules: the Multi-modal In-Fusion (MIF) module and the Dynamic Dif-Fusion\n(DDF) module. The MIF module refines modality alignment by aggregating and\nintegrating various modality features into a unified embedding space, termed\nX-embedding, which includes image, text, and mask features. Concurrently, the\nDDF module facilitates controlled generation through an adaptive adjustment of\nX-embedding conditioned on the diffusion steps. In addition, to reveal the\nmulti-modality representational power of AnomalyXFusion, we propose a new\ndataset, called MVTec Caption. More precisely, MVTec Caption extends 2.2k\naccurate image-mask-text annotations for the MVTec AD and LOCO datasets.\nComprehensive evaluations demonstrate the effectiveness of AnomalyXFusion,\nespecially regarding the fidelity and diversity for logical anomalies. Project\npage: http:github.com/hujiecpp/MVTec-Caption\n","authors":["Jie Hu","Yawen Huang","Yilin Lu","Guoyang Xie","Guannan Jiang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.19444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06221v4","updated":"2024-04-30T10:39:19Z","published":"2023-05-10T14:54:29Z","title":"Multi-Prompt with Depth Partitioned Cross-Modal Learning","summary":" In recent years, soft prompt learning methods have been proposed to fine-tune\nlarge-scale vision-language pre-trained models for various downstream tasks.\nThese methods typically combine learnable textual tokens with class tokens as\ninput for models with frozen parameters. 
However, they often employ a single\nprompt to describe class contexts, failing to capture categories' diverse\nattributes adequately. This study introduces the Partitioned Multi-modal Prompt\n(PMPO), a multi-modal prompting technique that extends the soft prompt from a\nsingle learnable prompt to multiple prompts. Our method divides the visual\nencoder depths and connects learnable prompts to the separated visual depths,\nenabling different prompts to capture the hierarchical contextual depths of\nvisual representations. Furthermore, to maximize the advantages of multi-prompt\nlearning, we incorporate prior information from manually designed templates and\nlearnable multi-prompts, thus improving the generalization capabilities of our\napproach. We evaluate the effectiveness of our approach on three challenging\ntasks: new class generalization, cross-dataset evaluation, and domain\ngeneralization. For instance, our method achieves a $79.28$ harmonic mean,\naveraged over 11 diverse image recognition datasets ($+7.62$ compared to CoOp),\ndemonstrating significant competitiveness compared to state-of-the-art\nprompting methods.\n","authors":["Yingjie Tian","Yiqi Wang","Xianda Guo","Zheng Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2305.06221v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10731v2","updated":"2024-04-30T10:29:22Z","published":"2024-03-15T23:31:41Z","title":"Giving a Hand to Diffusion Models: a Two-Stage Approach to Improving\n Conditional Human Image Generation","summary":" Recent years have seen significant progress in human image generation,\nparticularly with the advancements in diffusion models. However, existing\ndiffusion methods encounter challenges when producing consistent hand anatomy\nand the generated images often lack precise control over the hand pose. To\naddress this limitation, we introduce a novel approach to pose-conditioned\nhuman image generation, dividing the process into two stages: hand generation\nand subsequent body outpainting around the hands. We propose training the hand\ngenerator in a multi-task setting to produce both hand images and their\ncorresponding segmentation masks, and employ the trained model in the first\nstage of generation. An adapted ControlNet model is then used in the second\nstage to outpaint the body around the generated hands, producing the final\nresult. A novel blending technique is introduced to preserve the hand details\nduring the second stage that combines the results of both stages in a coherent\nway. This involves sequential expansion of the outpainted region while fusing\nthe latent representations, to ensure a seamless and cohesive synthesis of the\nfinal image. Experimental evaluations demonstrate the superiority of our\nproposed method over state-of-the-art techniques, in both pose accuracy and\nimage quality, as validated on the HaGRID dataset. Our approach not only\nenhances the quality of the generated hands but also offers improved control\nover hand pose, advancing the capabilities of pose-conditioned human image\ngeneration. 
The source code of the proposed approach is available at\nhttps://github.com/apelykh/hand-to-diffusion.\n","authors":["Anton Pelykh","Ozge Mercanoglu Sincan","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2403.10731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19486v2","updated":"2024-04-30T10:27:48Z","published":"2023-05-31T01:46:14Z","title":"Instance-dependent Noisy-label Learning with Graphical Model Based\n Noise-rate Estimation","summary":" Deep learning faces a formidable challenge when handling noisy labels, as\nmodels tend to overfit samples affected by label noise. This challenge is\nfurther compounded by the presence of instance-dependent noise (IDN), a\nrealistic form of label noise arising from ambiguous sample information. To\naddress IDN, Label Noise Learning (LNL) incorporates a sample selection stage\nto differentiate clean and noisy-label samples. This stage uses an arbitrary\ncriterion and a pre-defined curriculum that initially selects most samples as\nnoisy and gradually decreases this selection rate during training. Such\ncurriculum is sub-optimal since it does not consider the actual label noise\nrate in the training set. This paper addresses this issue with a new noise-rate\nestimation method that is easily integrated with most state-of-the-art (SOTA)\nLNL methods to produce a more effective curriculum. Synthetic and real-world\nbenchmark results demonstrate that integrating our approach with SOTA LNL\nmethods improves accuracy in most cases.\n","authors":["Arpit Garg","Cuong Nguyen","Rafael Felix","Thanh-Toan Do","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2305.19486v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19427v1","updated":"2024-04-30T10:16:21Z","published":"2024-04-30T10:16:21Z","title":"InstantFamily: Masked Attention for Zero-shot Multi-ID Image Generation","summary":" In the field of personalized image generation, the ability to create images\npreserving concepts has significantly improved. Creating an image that\nnaturally integrates multiple concepts in a cohesive and visually appealing\ncomposition can indeed be challenging. This paper introduces \"InstantFamily,\"\nan approach that employs a novel masked cross-attention mechanism and a\nmultimodal embedding stack to achieve zero-shot multi-ID image generation. Our\nmethod effectively preserves ID as it utilizes global and local features from a\npre-trained face recognition model integrated with text conditions.\nAdditionally, our masked cross-attention mechanism enables the precise control\nof multi-ID and composition in the generated images. We demonstrate the\neffectiveness of InstantFamily through experiments showing its dominance in\ngenerating images with multi-ID, while resolving well-known multi-ID generation\nproblems. Additionally, our model achieves state-of-the-art performance in both\nsingle-ID and multi-ID preservation. Furthermore, our model exhibits remarkable\nscalability with a greater number of ID preservation than it was originally\ntrained with.\n","authors":["Chanran Kim","Jeongin Lee","Shichang Joung","Bongmo Kim","Yeul-Min Baek"],"pdf_url":"https://arxiv.org/pdf/2404.19427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19417v1","updated":"2024-04-30T10:03:26Z","published":"2024-04-30T10:03:26Z","title":"Physical Backdoor: Towards Temperature-based Backdoor Attacks in the\n Physical World","summary":" Backdoor attacks have been well-studied in visible light object detection\n(VLOD) in recent years. 
However, VLOD can not effectively work in dark and\ntemperature-sensitive scenarios. Instead, thermal infrared object detection\n(TIOD) is the most accessible and practical in such environments. In this\npaper, our team is the first to investigate the security vulnerabilities\nassociated with TIOD in the context of backdoor attacks, spanning both the\ndigital and physical realms. We introduce two novel types of backdoor attacks\non TIOD, each offering unique capabilities: Object-affecting Attack and\nRange-affecting Attack. We conduct a comprehensive analysis of key factors\ninfluencing trigger design, which include temperature, size, material, and\nconcealment. These factors, especially temperature, significantly impact the\nefficacy of backdoor attacks on TIOD. A thorough understanding of these factors\nwill serve as a foundation for designing physical triggers and temperature\ncontrolling experiments. Our study includes extensive experiments conducted in\nboth digital and physical environments. In the digital realm, we evaluate our\napproach using benchmark datasets for TIOD, achieving an Attack Success Rate\n(ASR) of up to 98.21%. In the physical realm, we test our approach in two\nreal-world settings: a traffic intersection and a parking lot, using a thermal\ninfrared camera. Here, we attain an ASR of up to 98.38%.\n","authors":["Wen Yin","Jian Lou","Pan Zhou","Yulai Xie","Dan Feng","Yuhua Sun","Tailai Zhang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.19417v1.pdf","comment":"To appear in CVPR 2024.11pages, 8 figures and 4 tables"},{"id":"http://arxiv.org/abs/2402.16594v4","updated":"2024-04-30T09:55:28Z","published":"2024-02-26T14:18:12Z","title":"CURSOR: Scalable Mixed-Order Hypergraph Matching with CUR Decomposition","summary":" To achieve greater accuracy, hypergraph matching algorithms require\nexponential increases in computational resources. Recent kd-tree-based\napproximate nearest neighbor (ANN) methods, despite the sparsity of their\ncompatibility tensor, still require exhaustive calculations for large-scale\ngraph matching. This work utilizes CUR tensor decomposition and introduces a\nnovel cascaded second and third-order hypergraph matching framework (CURSOR)\nfor efficient hypergraph matching. A CUR-based second-order graph matching\nalgorithm is used to provide a rough match, and then the core of CURSOR, a\nfiber-CUR-based tensor generation method, directly calculates entries of the\ncompatibility tensor by leveraging the initial second-order match result. This\nsignificantly decreases the time complexity and tensor density. A probability\nrelaxation labeling (PRL)-based matching algorithm, especially suitable for\nsparse tensors, is developed. Experiment results on large-scale synthetic\ndatasets and widely-adopted benchmark sets demonstrate the superiority of\nCURSOR over existing methods. 
The tensor generation method in CURSOR can be\nintegrated seamlessly into existing hypergraph matching methods to improve\ntheir performance and lower their computational costs.\n","authors":["Qixuan Zheng","Ming Zhang","Hong Yan"],"pdf_url":"https://arxiv.org/pdf/2402.16594v4.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2302.08212v2","updated":"2024-04-30T09:51:32Z","published":"2023-02-16T10:56:00Z","title":"Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality\n Learning","summary":" Visible-infrared person re-identification (VI-ReID) aims to retrieve images\nof the same pedestrian from different modalities, where the challenges lie in\nthe significant modality discrepancy. To alleviate the modality gap, recent\nmethods generate intermediate images by GANs, grayscaling, or mixup strategies.\nHowever, these methods could introduce extra data distribution, and the\nsemantic correspondence between the two modalities is not well learned. In this\npaper, we propose a Patch-Mixed Cross-Modality framework (PMCM), where two\nimages of the same person from two modalities are split into patches and\nstitched into a new one for model learning. A part-alignment loss is introduced\nto regularize representation learning, and a patch-mixed modality learning loss\nis proposed to align between the modalities. In this way, the model learns to\nrecognize a person through patches of different styles, thereby the modality\nsemantic correspondence can be inferred. In addition, with the flexible image\ngeneration strategy, the patch-mixed images freely adjust the ratio of\ndifferent modality patches, which could further alleviate the modality\nimbalance problem. On two VI-ReID datasets, we report new state-of-the-art\nperformance with the proposed method.\n","authors":["Zhihao Qian","Yutian Lin","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2302.08212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19401v1","updated":"2024-04-30T09:47:44Z","published":"2024-04-30T09:47:44Z","title":"UniFS: Universal Few-shot Instance Perception with Point Representations","summary":" Instance perception tasks (object detection, instance segmentation, pose\nestimation, counting) play a key role in industrial applications of visual\nmodels. As supervised learning methods suffer from high labeling cost, few-shot\nlearning methods which effectively learn from a limited number of labeled\nexamples are desired. Existing few-shot learning methods primarily focus on a\nrestricted set of tasks, presumably due to the challenges involved in designing\na generic model capable of representing diverse tasks in a unified manner. In\nthis paper, we propose UniFS, a universal few-shot instance perception model\nthat unifies a wide range of instance perception tasks by reformulating them\ninto a dynamic point representation learning framework. Additionally, we\npropose Structure-Aware Point Learning (SAPL) to exploit the higher-order\nstructural relationship among points to further enhance representation\nlearning. Our approach makes minimal assumptions about the tasks, yet it\nachieves competitive results compared to highly specialized and well optimized\nspecialist models. 
Codes will be released soon.\n","authors":["Sheng Jin","Ruijie Yao","Lumin Xu","Wentao Liu","Chen Qian","Ji Wu","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.19401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19398v1","updated":"2024-04-30T09:45:41Z","published":"2024-04-30T09:45:41Z","title":"3D Gaussian Blendshapes for Head Avatar Animation","summary":" We introduce 3D Gaussian blendshapes for modeling photorealistic head\navatars. Taking a monocular video as input, we learn a base head model of\nneutral expression, along with a group of expression blendshapes, each of which\ncorresponds to a basis expression in classical parametric face models. Both the\nneutral model and expression blendshapes are represented as 3D Gaussians, which\ncontain a few properties to depict the avatar appearance. The avatar model of\nan arbitrary expression can be effectively generated by combining the neutral\nmodel and expression blendshapes through linear blending of Gaussians with the\nexpression coefficients. High-fidelity head avatar animations can be\nsynthesized in real time using Gaussian splatting. Compared to state-of-the-art\nmethods, our Gaussian blendshape representation better captures high-frequency\ndetails exhibited in input video, and achieves superior rendering performance.\n","authors":["Shengjie Ma","Yanlin Weng","Tianjia Shao","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.19398v1.pdf","comment":"ACM SIGGRAPH Conference Proceedings 2024"},{"id":"http://arxiv.org/abs/2404.19394v1","updated":"2024-04-30T09:40:07Z","published":"2024-04-30T09:40:07Z","title":"CLIP-Mamba: CLIP Pretrained Mamba Models with OOD and Hessian Evaluation","summary":" State space models and Mamba-based models have been increasingly applied\nacross various domains, achieving state-of-the-art performance. This technical\nreport introduces the first attempt to train a transferable Mamba model\nutilizing contrastive language-image pretraining (CLIP). We have trained Mamba\nmodels of varying sizes and undertaken comprehensive evaluations of these\nmodels on 26 zero-shot classification datasets and 16 out-of-distribution (OOD)\ndatasets. Our findings reveal that a Mamba model with 67 million parameters is\non par with a 307 million-parameter Vision Transformer (ViT) model in zero-shot\nclassification tasks, highlighting the parameter efficiency of Mamba models. In\ntests of OOD generalization, Mamba-based models exhibit exceptional performance\nin conditions of OOD image contrast or when subjected to high-pass filtering.\nHowever, a Hessian analysis indicates that Mamba models feature a sharper and\nmore non-convex landscape compared to ViT-based models, making them more\nchallenging to train. The source code is available at\nhttps://github.com/raytrun/mamba-clip.\n","authors":["Weiquan Huang","Yifei Shen","Yifan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.19394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15583v3","updated":"2024-04-30T09:40:01Z","published":"2024-01-28T06:41:15Z","title":"SCTransNet: Spatial-channel Cross Transformer Network for Infrared Small\n Target Detection","summary":" Infrared small target detection (IRSTD) has recently benefitted greatly from\nU-shaped neural models. However, largely overlooking effective global\ninformation modeling, existing techniques struggle when the target has high\nsimilarities with the background. 
We present a Spatial-channel Cross\nTransformer Network (SCTransNet) that leverages spatial-channel cross\ntransformer blocks (SCTBs) on top of long-range skip connections to address the\naforementioned challenge. In the proposed SCTBs, the outputs of all encoders\nare interacted with cross transformer to generate mixed features, which are\nredistributed to all decoders to effectively reinforce semantic differences\nbetween the target and clutter at full scales. Specifically, SCTB contains the\nfollowing two key elements: (a) spatial-embedded single-head channel-cross\nattention (SSCA) for exchanging local spatial features and full-level global\nchannel information to eliminate ambiguity among the encoders and facilitate\nhigh-level semantic associations of the images, and (b) a complementary\nfeed-forward network (CFN) for enhancing the feature discriminability via a\nmulti-scale strategy and cross-spatial-channel information interaction to\npromote beneficial information transfer. Our SCTransNet effectively encodes the\nsemantic differences between targets and backgrounds to boost its internal\nrepresentation for detecting small infrared targets accurately. Extensive\nexperiments on three public datasets, NUDT-SIRST, NUAA-SIRST, and IRSTD-1k,\ndemonstrate that the proposed SCTransNet outperforms existing IRSTD methods.\nOur code will be made public at https://github.com/xdFai.\n","authors":["Shuai Yuan","Hanlin Qin","Xiang Yan","Naveed AKhtar","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2401.15583v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19384v1","updated":"2024-04-30T09:20:35Z","published":"2024-04-30T09:20:35Z","title":"Pseudo Label Refinery for Unsupervised Domain Adaptation on\n Cross-dataset 3D Object Detection","summary":" Recent self-training techniques have shown notable improvements in\nunsupervised domain adaptation for 3D object detection (3D UDA). These\ntechniques typically select pseudo labels, i.e., 3D boxes, to supervise models\nfor the target domain. However, this selection process inevitably introduces\nunreliable 3D boxes, in which 3D points cannot be definitively assigned as\nforeground or background. Previous techniques mitigate this by reweighting\nthese boxes as pseudo labels, but these boxes can still poison the training\nprocess. To resolve this problem, in this paper, we propose a novel pseudo\nlabel refinery framework. Specifically, in the selection process, to improve\nthe reliability of pseudo boxes, we propose a complementary augmentation\nstrategy. This strategy involves either removing all points within an\nunreliable box or replacing it with a high-confidence box. Moreover, the point\nnumbers of instances in high-beam datasets are considerably higher than those\nin low-beam datasets, also degrading the quality of pseudo labels during the\ntraining process. We alleviate this issue by generating additional proposals\nand aligning RoI features across different domains. Experimental results\ndemonstrate that our method effectively enhances the quality of pseudo labels\nand consistently surpasses the state-of-the-art methods on six autonomous\ndriving benchmarks. 
Code will be available at\nhttps://github.com/Zhanwei-Z/PERE.\n","authors":["Zhanwei Zhang","Minghao Chen","Shuai Xiao","Liang Peng","Hengjia Li","Binbin Lin","Ping Li","Wenxiao Wang","Boxi Wu","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2404.19384v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05253v2","updated":"2024-04-30T09:18:59Z","published":"2024-04-08T07:34:39Z","title":"CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) aims to improve low-illumination images.\nHowever, existing methods face two challenges: (1) uncertainty in restoration\nfrom diverse brightness degradations; (2) loss of texture and color information\ncaused by noise suppression and light enhancement. In this paper, we propose a\nnovel enhancement approach, CodeEnhance, by leveraging quantized priors and\nimage refinement to address these challenges. In particular, we reframe LLIE as\nlearning an image-to-code mapping from low-light images to discrete codebook,\nwhich has been learned from high-quality images. To enhance this process, a\nSemantic Embedding Module (SEM) is introduced to integrate semantic information\nwith low-level features, and a Codebook Shift (CS) mechanism, designed to adapt\nthe pre-learned codebook to better suit the distinct characteristics of our\nlow-light dataset. Additionally, we present an Interactive Feature\nTransformation (IFT) module to refine texture and color information during\nimage reconstruction, allowing for interactive enhancement based on user\npreferences. Extensive experiments on both real-world and synthetic benchmarks\ndemonstrate that the incorporation of prior knowledge and controllable\ninformation transfer significantly enhances LLIE performance in terms of\nquality and fidelity. The proposed CodeEnhance exhibits superior robustness to\nvarious degradations, including uneven illumination, noise, and color\ndistortion.\n","authors":["Xu Wu","XianXu Hou","Zhihui Lai","Jie Zhou","Ya-nan Zhang","Witold Pedrycz","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.05253v2.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.19383v1","updated":"2024-04-30T09:16:30Z","published":"2024-04-30T09:16:30Z","title":"Cross-Block Fine-Grained Semantic Cascade for Skeleton-Based Sports\n Action Recognition","summary":" Human action video recognition has recently attracted more attention in\napplications such as video security and sports posture correction. Popular\nsolutions, including graph convolutional networks (GCNs) that model the human\nskeleton as a spatiotemporal graph, have proven very effective. GCNs-based\nmethods with stacked blocks usually utilize top-layer semantics for\nclassification/annotation purposes. Although the global features learned\nthrough the procedure are suitable for the general classification, they have\ndifficulty capturing fine-grained action change across adjacent frames --\ndecisive factors in sports actions. In this paper, we propose a novel\n``Cross-block Fine-grained Semantic Cascade (CFSC)'' module to overcome this\nchallenge. In summary, the proposed CFSC progressively integrates shallow\nvisual knowledge into high-level blocks to allow networks to focus on action\ndetails. In particular, the CFSC module utilizes the GCN feature maps produced\nat different levels, as well as aggregated features from proceeding levels to\nconsolidate fine-grained features. 
In addition, a dedicated temporal\nconvolution is applied at each level to learn short-term temporal features,\nwhich will be carried over from shallow to deep layers to maximize the leverage\nof low-level details. This cross-block feature aggregation methodology, capable\nof mitigating the loss of fine-grained information, has resulted in improved\nperformance. Last, FD-7, a new action recognition dataset for fencing sports,\nwas collected and will be made publicly available. Experimental results and\nempirical analysis on public benchmarks (FSD-10) and self-collected (FD-7)\ndemonstrate the advantage of our CFSC module on learning discriminative\npatterns for action classification over others.\n","authors":["Zhendong Liu","Haifeng Xia","Tong Guo","Libo Sun","Ming Shao","Siyu Xia"],"pdf_url":"https://arxiv.org/pdf/2404.19383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19382v1","updated":"2024-04-30T09:14:54Z","published":"2024-04-30T09:14:54Z","title":"Probing Unlearned Diffusion Models: A Transferable Adversarial Attack\n Perspective","summary":" Advanced text-to-image diffusion models raise safety concerns regarding\nidentity privacy violation, copyright infringement, and Not Safe For Work\ncontent generation. Towards this, unlearning methods have been developed to\nerase these involved concepts from diffusion models. However, these unlearning\nmethods only shift the text-to-image mapping and preserve the visual content\nwithin the generative space of diffusion models, leaving a fatal flaw for\nrestoring these erased concepts. This erasure trustworthiness problem needs\nprobe, but previous methods are sub-optimal from two perspectives: (1) Lack of\ntransferability: Some methods operate within a white-box setting, requiring\naccess to the unlearned model. And the learned adversarial input often fails to\ntransfer to other unlearned models for concept restoration; (2) Limited attack:\nThe prompt-level methods struggle to restore narrow concepts from unlearned\nmodels, such as celebrity identity. Therefore, this paper aims to leverage the\ntransferability of the adversarial attack to probe the unlearning robustness\nunder a black-box setting. This challenging scenario assumes that the\nunlearning method is unknown and the unlearned model is inaccessible for\noptimization, requiring the attack to be capable of transferring across\ndifferent unlearned models. Specifically, we employ an adversarial search\nstrategy to search for the adversarial embedding which can transfer across\ndifferent unlearned models. This strategy adopts the original Stable Diffusion\nmodel as a surrogate model to iteratively erase and search for embeddings,\nenabling it to find the embedding that can restore the target concept for\ndifferent unlearning methods. 
Extensive experiments demonstrate the\ntransferability of the searched adversarial embedding across several\nstate-of-the-art unlearning methods and its effectiveness for different levels\nof concepts.\n","authors":["Xiaoxuan Han","Songlin Yang","Wei Wang","Yang Li","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.19382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19379v1","updated":"2024-04-30T09:11:04Z","published":"2024-04-30T09:11:04Z","title":"SemanticFormer: Holistic and Semantic Traffic Scene Representation for\n Trajectory Prediction using Knowledge Graphs","summary":" Trajectory prediction in autonomous driving relies on accurate representation\nof all relevant contexts of the driving scene including traffic participants,\nroad topology, traffic signs as well as their semantic relations to each other.\nDespite increased attention to this issue, most approaches in trajectory\nprediction do not consider all of these factors sufficiently. This paper\ndescribes a method SemanticFormer to predict multimodal trajectories by\nreasoning over a semantic traffic scene graph using a hybrid approach. We\nextract high-level information in the form of semantic meta-paths from a\nknowledge graph which is then processed by a novel pipeline based on multiple\nattention mechanisms to predict accurate trajectories. The proposed\narchitecture comprises a hierarchical heterogeneous graph encoder, which can\ncapture spatio-temporal and relational information across agents and between\nagents and road elements, and a predictor that fuses the different encodings\nand decodes trajectories with probabilities. Finally, a refinement module\nevaluates permitted meta-paths of trajectories and speed profiles to obtain\nfinal predicted trajectories. Evaluation of the nuScenes benchmark demonstrates\nimproved performance compared to the state-of-the-art methods.\n","authors":["Zhigang Sun","Zixu Wang","Lavdim Halilaj","Juergen Luettin"],"pdf_url":"https://arxiv.org/pdf/2404.19379v1.pdf","comment":"8 pages, 6 figures, submitted to RA-L"},{"id":"http://arxiv.org/abs/2401.11824v4","updated":"2024-04-30T09:06:04Z","published":"2024-01-22T10:37:59Z","title":"Rethinking Centered Kernel Alignment in Knowledge Distillation","summary":" Knowledge distillation has emerged as a highly effective method for bridging\nthe representation discrepancy between large-scale models and lightweight\nmodels. Prevalent approaches involve leveraging appropriate metrics to minimize\nthe divergence or distance between the knowledge extracted from the teacher\nmodel and the knowledge learned by the student model. Centered Kernel Alignment\n(CKA) is widely used to measure representation similarity and has been applied\nin several knowledge distillation methods. However, these methods are complex\nand fail to uncover the essence of CKA, thus not answering the question of how\nto use CKA to achieve simple and effective distillation properly. This paper\nfirst provides a theoretical perspective to illustrate the effectiveness of\nCKA, which decouples CKA to the upper bound of Maximum Mean Discrepancy~(MMD)\nand a constant term. Drawing from this, we propose a novel Relation-Centered\nKernel Alignment~(RCKA) framework, which practically establishes a connection\nbetween CKA and MMD. Furthermore, we dynamically customize the application of\nCKA based on the characteristics of each task, with less computational source\nyet comparable performance than the previous methods. 
The extensive experiments\non the CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves\nstate-of-the-art performance on almost all teacher-student pairs for image\nclassification and object detection, validating the effectiveness of our\napproaches. Our code is available in https://github.com/Klayand/PCKA\n","authors":["Zikai Zhou","Yunhang Shen","Shitong Shao","Linrui Gong","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11824v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18723v4","updated":"2024-04-30T08:53:45Z","published":"2023-05-30T04:00:35Z","title":"Towards Accurate Post-training Quantization for Diffusion Models","summary":" In this paper, we propose an accurate data-free post-training quantization\nframework of diffusion models (ADP-DM) for efficient image generation.\nConventional data-free quantization methods learn shared quantization functions\nfor tensor discretization regardless of the generation timesteps, while the\nactivation distribution differs significantly across various timesteps. The\ncalibration images are acquired in random timesteps which fail to provide\nsufficient information for generalizable quantization function learning. Both\nissues cause sizable quantization errors with obvious image generation\nperformance degradation. On the contrary, we design group-wise quantization\nfunctions for activation discretization in different timesteps and sample the\noptimal timestep for informative calibration image generation, so that our\nquantized diffusion model can reduce the discretization errors with negligible\ncomputational overhead. Specifically, we partition the timesteps according to\nthe importance weights of quantization functions in different groups, which are\noptimized by differentiable search algorithms. We also select the optimal\ntimestep for calibration image generation by structural risk minimizing\nprinciple in order to enhance the generalization ability in the deployment of\nquantized diffusion model. Extensive experimental results show that our method\noutperforms the state-of-the-art post-training quantization of diffusion model\nby a sizable margin with similar computational cost.\n","authors":["Changyuan Wang","Ziwei Wang","Xiuwei Xu","Yansong Tang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2305.18723v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19360v1","updated":"2024-04-30T08:45:16Z","published":"2024-04-30T08:45:16Z","title":"Large Language Model Informed Patent Image Retrieval","summary":" In patent prosecution, image-based retrieval systems for identifying\nsimilarities between current patent images and prior art are pivotal to ensure\nthe novelty and non-obviousness of patent applications. Despite their growing\npopularity in recent years, existing attempts, while effective at recognizing\nimages within the same patent, fail to deliver practical value due to their\nlimited generalizability in retrieving relevant prior art. Moreover, this task\ninherently involves the challenges posed by the abstract visual features of\npatent images, the skewed distribution of image classifications, and the\nsemantic information of image descriptions. Therefore, we propose a\nlanguage-informed, distribution-aware multimodal approach to patent image\nfeature learning, which enriches the semantic understanding of patent image by\nintegrating Large Language Models and improves the performance of\nunderrepresented classes with our proposed distribution-aware contrastive\nlosses. 
Extensive experiments on DeepPatent2 dataset show that our proposed\nmethod achieves state-of-the-art or comparable performance in image-based\npatent retrieval with mAP +53.3%, Recall@10 +41.8%, and MRR@10 +51.9%.\nFurthermore, through an in-depth user analysis, we explore our model in aiding\npatent professionals in their image retrieval efforts, highlighting the model's\nreal-world applicability and effectiveness.\n","authors":["Hao-Cheng Lo","Jung-Mei Chu","Jieh Hsiang","Chun-Chieh Cho"],"pdf_url":"https://arxiv.org/pdf/2404.19360v1.pdf","comment":"8 pages. Under review"},{"id":"http://arxiv.org/abs/2402.19404v3","updated":"2024-04-30T08:13:10Z","published":"2024-02-29T18:03:00Z","title":"EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image\n Captioning","summary":" News image captioning requires model to generate an informative caption rich\nin entities, with the news image and the associated news article. Though\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in addressing various vision-language tasks, our research finds\nthat current MLLMs still bear limitations in handling entity information on\nnews image captioning task. Besides, while MLLMs have the ability to process\nlong inputs, generating high-quality news image captions still requires a\ntrade-off between sufficiency and conciseness of textual input information. To\nexplore the potential of MLLMs and address problems we discovered, we propose :\nan Entity-Aware Multimodal Alignment based approach for news image captioning.\nOur approach first aligns the MLLM through Balance Training Strategy with two\nextra alignment tasks: Entity-Aware Sentence Selection task and Entity\nSelection task, together with News Image Captioning task, to enhance its\ncapability in handling multimodal entity information. The aligned MLLM will\nutilizes the additional entity-related information it explicitly extracts to\nsupplement its textual input while generating news image captions. Our approach\nachieves better results than all previous models in CIDEr score on GoodNews\ndataset (72.33 -> 88.39) and NYTimes800k dataset (70.83 -> 85.61).\n","authors":["Junzhe Zhang","Huixuan Zhang","Xunjian Yin","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2402.19404v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19341v1","updated":"2024-04-30T08:06:04Z","published":"2024-04-30T08:06:04Z","title":"Reliable or Deceptive? Investigating Gated Features for Smooth Visual\n Explanations in CNNs","summary":" Deep learning models have achieved remarkable success across diverse domains.\nHowever, the intricate nature of these models often impedes a clear\nunderstanding of their decision-making processes. This is where Explainable AI\n(XAI) becomes indispensable, offering intuitive explanations for model\ndecisions. In this work, we propose a simple yet highly effective approach,\nScoreCAM++, which introduces modifications to enhance the promising ScoreCAM\nmethod for visual explainability. Our proposed approach involves altering the\nnormalization function within the activation layer utilized in ScoreCAM,\nresulting in significantly improved results compared to previous efforts.\nAdditionally, we apply an activation function to the upsampled activation\nlayers to enhance interpretability. This improvement is achieved by selectively\ngating lower-priority values within the activation layer. 
Through extensive\nexperiments and qualitative comparisons, we demonstrate that ScoreCAM++\nconsistently achieves notably superior performance and fairness in interpreting\nthe decision-making process compared to both ScoreCAM and previous methods.\n","authors":["Soham Mitra","Atri Sukul","Swalpa Kumar Roy","Pravendra Singh","Vinay Verma"],"pdf_url":"https://arxiv.org/pdf/2404.19341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19334v1","updated":"2024-04-30T08:00:17Z","published":"2024-04-30T08:00:17Z","title":"Multi-Scale Heterogeneity-Aware Hypergraph Representation for\n Histopathology Whole Slide Images","summary":" Survival prediction is a complex ordinal regression task that aims to predict\nthe survival coefficient ranking among a cohort of patients, typically achieved\nby analyzing patients' whole slide images. Existing deep learning approaches\nmainly adopt multiple instance learning or graph neural networks under weak\nsupervision. Most of them are unable to uncover the diverse interactions\nbetween different types of biological entities(\\textit{e.g.}, cell cluster and\ntissue block) across multiple scales, while such interactions are crucial for\npatient survival prediction. In light of this, we propose a novel multi-scale\nheterogeneity-aware hypergraph representation framework. Specifically, our\nframework first constructs a multi-scale heterogeneity-aware hypergraph and\nassigns each node with its biological entity type. It then mines diverse\ninteractions between nodes on the graph structure to obtain a global\nrepresentation. Experimental results demonstrate that our method outperforms\nstate-of-the-art approaches on three benchmark datasets. Code is publicly\navailable at\n\\href{https://github.com/Hanminghao/H2GT}{https://github.com/Hanminghao/H2GT}.\n","authors":["Minghao Han","Xukun Zhang","Dingkang Yang","Tao Liu","Haopeng Kuang","Jinghui Feng","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.19334v1.pdf","comment":"9 pages, 6 figures, accepted by ICME2024"},{"id":"http://arxiv.org/abs/2404.19330v1","updated":"2024-04-30T07:53:34Z","published":"2024-04-30T07:53:34Z","title":"G2LTraj: A Global-to-Local Generation Approach for Trajectory Prediction","summary":" Predicting future trajectories of traffic agents accurately holds substantial\nimportance in various applications such as autonomous driving. Previous methods\ncommonly infer all future steps of an agent either recursively or\nsimultaneously. However, the recursive strategy suffers from the accumulated\nerror, while the simultaneous strategy overlooks the constraints among future\nsteps, resulting in kinematically infeasible predictions. To address these\nissues, in this paper, we propose G2LTraj, a plug-and-play global-to-local\ngeneration approach for trajectory prediction. Specifically, we generate a\nseries of global key steps that uniformly cover the entire future time range.\nSubsequently, the local intermediate steps between the adjacent key steps are\nrecursively filled in. In this way, we prevent the accumulated error from\npropagating beyond the adjacent key steps. Moreover, to boost the kinematical\nfeasibility, we not only introduce the spatial constraints among key steps but\nalso strengthen the temporal constraints among the intermediate steps. Finally,\nto ensure the optimal granularity of key steps, we design a selectable\ngranularity strategy that caters to each predicted trajectory. 
Our G2LTraj\nsignificantly improves the performance of seven existing trajectory predictors\nacross the ETH, UCY and nuScenes datasets. Experimental results demonstrate its\neffectiveness. Code will be available at https://github.com/Zhanwei-Z/G2LTraj.\n","authors":["Zhanwei Zhang","Zishuo Hua","Minghao Chen","Wei Lu","Binbin Lin","Deng Cai","Wenxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.19330v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2404.19329v1","updated":"2024-04-30T07:52:36Z","published":"2024-04-30T07:52:36Z","title":"End-to-end information extraction in handwritten documents:\n Understanding Paris marriage records from 1880 to 1940","summary":" The EXO-POPP project aims to establish a comprehensive database comprising\n300,000 marriage records from Paris and its suburbs, spanning the years 1880 to\n1940, which are preserved in over 130,000 scans of double pages. Each marriage\nrecord may encompass up to 118 distinct types of information that require\nextraction from plain text. In this paper, we introduce the M-POPP dataset, a\nsubset of the M-POPP database with annotations for full-page text recognition\nand information extraction in both handwritten and printed documents, and which\nis now publicly available. We present a fully end-to-end architecture adapted\nfrom the DAN, designed to perform both handwritten text recognition and\ninformation extraction directly from page images without the need for explicit\nsegmentation. We showcase the information extraction capabilities of this\narchitecture by achieving a new state of the art for full-page Information\nExtraction on Esposalles and we use this architecture as a baseline for the\nM-POPP dataset. We also assess and compare how different encoding strategies\nfor named entities in the text affect the performance of jointly recognizing\nhandwritten text and extracting information, from full pages.\n","authors":["Thomas Constum","Lucas Preel","Théo Larcher","Pierrick Tranouez","Thierry Paquet","Sandra Brée"],"pdf_url":"https://arxiv.org/pdf/2404.19329v1.pdf","comment":"To be published in: International Conference on Document Analysis and\n Recognition - ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.19326v1","updated":"2024-04-30T07:50:29Z","published":"2024-04-30T07:50:29Z","title":"LVOS: A Benchmark for Large-scale Long-term Video Object Segmentation","summary":" Video object segmentation (VOS) aims to distinguish and track target objects\nin a video. Despite the excellent performance achieved by off-the-shell VOS\nmodels, existing VOS benchmarks mainly focus on short-term videos lasting about\n5 seconds, where objects remain visible most of the time. However, these\nbenchmarks poorly represent practical applications, and the absence of\nlong-term datasets restricts further investigation of VOS in realistic\nscenarios. Thus, we propose a novel benchmark named LVOS, comprising 720 videos\nwith 296,401 frames and 407,945 high-quality annotations. Videos in LVOS last\n1.14 minutes on average, approximately 5 times longer than videos in existing\ndatasets. Each video includes various attributes, especially challenges\nderiving from the wild, such as long-term reappearing and cross-temporal\nsimilar objects. Compared to previous benchmarks, our LVOS better reflects VOS\nmodels' performance in real scenarios. Based on LVOS, we evaluate 20 existing\nVOS models under 4 different settings and conduct a comprehensive analysis. 
On\nLVOS, these models suffer a large performance drop, highlighting the challenge\nof achieving precise tracking and segmentation in real-world scenarios.\nAttribute-based analysis indicates that key factor to accuracy decline is the\nincreased video length, emphasizing LVOS's crucial role. We hope our LVOS can\nadvance development of VOS in real scenes. Data and code are available at\nhttps://lingyihongfd.github.io/lvos.github.io/.\n","authors":["Lingyi Hong","Zhongying Liu","Wenchao Chen","Chenzhi Tan","Yuang Feng","Xinyu Zhou","Pinxue Guo","Jinglun Li","Zhaoyu Chen","Shuyong Gao","Wei Zhang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.19326v1.pdf","comment":"LVOS V2"},{"id":"http://arxiv.org/abs/2404.19317v1","updated":"2024-04-30T07:37:48Z","published":"2024-04-30T07:37:48Z","title":"Revisiting N-Gram Models: Their Impact in Modern Neural Networks for\n Handwritten Text Recognition","summary":" In recent advances in automatic text recognition (ATR), deep neural networks\nhave demonstrated the ability to implicitly capture language statistics,\npotentially reducing the need for traditional language models. This study\ndirectly addresses whether explicit language models, specifically n-gram\nmodels, still contribute to the performance of state-of-the-art deep learning\narchitectures in the field of handwriting recognition. We evaluate two\nprominent neural network architectures, PyLaia and DAN, with and without the\nintegration of explicit n-gram language models. Our experiments on three\ndatasets - IAM, RIMES, and NorHand v2 - at both line and page level,\ninvestigate optimal parameters for n-gram models, including their order,\nweight, smoothing methods and tokenization level. The results show that\nincorporating character or subword n-gram models significantly improves the\nperformance of ATR models on all datasets, challenging the notion that deep\nlearning models alone are sufficient for optimal performance. In particular,\nthe combination of DAN with a character language model outperforms current\nbenchmarks, confirming the value of hybrid approaches in modern document\nanalysis systems.\n","authors":["Solène Tarride","Christopher Kermorvant"],"pdf_url":"https://arxiv.org/pdf/2404.19317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19311v1","updated":"2024-04-30T07:30:33Z","published":"2024-04-30T07:30:33Z","title":"A Light-weight Transformer-based Self-supervised Matching Network for\n Heterogeneous Images","summary":" Matching visible and near-infrared (NIR) images remains a significant\nchallenge in remote sensing image fusion. The nonlinear radiometric differences\nbetween heterogeneous remote sensing images make the image matching task even\nmore difficult. Deep learning has gained substantial attention in computer\nvision tasks in recent years. However, many methods rely on supervised learning\nand necessitate large amounts of annotated data. Nevertheless, annotated data\nis frequently limited in the field of remote sensing image matching. To address\nthis challenge, this paper proposes a novel keypoint descriptor approach that\nobtains robust feature descriptors via a self-supervised matching network. A\nlight-weight transformer network, termed as LTFormer, is designed to generate\ndeep-level feature descriptors. Furthermore, we implement an innovative triplet\nloss function, LT Loss, to enhance the matching performance further. 
Our\napproach outperforms conventional hand-crafted local feature descriptors and\nproves equally competitive compared to state-of-the-art deep learning-based\nmethods, even amidst the shortage of annotated data.\n","authors":["Wang Zhang","Tingting Li","Yuntian Zhang","Gensheng Pei","Xiruo Jiang","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2404.19311v1.pdf","comment":"accepted by Information Fusion"},{"id":"http://arxiv.org/abs/2404.08995v4","updated":"2024-04-30T07:13:18Z","published":"2024-04-13T12:41:40Z","title":"Beyond Known Clusters: Probe New Prototypes for Efficient Generalized\n Class Discovery","summary":" Generalized Class Discovery (GCD) aims to dynamically assign labels to\nunlabelled data partially based on knowledge learned from labelled data, where\nthe unlabelled data may come from known or novel classes. The prevailing\napproach generally involves clustering across all data and learning conceptions\nby prototypical contrastive learning. However, existing methods largely hinge\non the performance of clustering algorithms and are thus subject to their\ninherent limitations. Firstly, the estimated cluster number is often smaller\nthan the ground truth, making the existing methods suffer from the lack of\nprototypes for comprehensive conception learning. To address this issue, we\npropose an adaptive probing mechanism that introduces learnable potential\nprototypes to expand cluster prototypes (centers). As there is no ground truth\nfor the potential prototype, we develop a self-supervised prototype learning\nframework to optimize the potential prototype in an end-to-end fashion.\nSecondly, clustering is computationally intensive, and the conventional\nstrategy of clustering both labelled and unlabelled instances exacerbates this\nissue. To counteract this inefficiency, we opt to cluster only the unlabelled\ninstances and subsequently expand the cluster prototypes with our introduced\npotential prototypes to fast explore novel classes. Despite the simplicity of\nour proposed method, extensive empirical analysis on a wide range of datasets\nconfirms that our method consistently delivers state-of-the-art results.\nSpecifically, our method surpasses the nearest competitor by a significant\nmargin of 9.7% within the Stanford Cars dataset and 12x clustering efficiency\nwithin the Herbarium 19 dataset. We will make the code and checkpoints publicly\navailable at https://github.com/xjtuYW/PNP.git.\n","authors":["Ye Wang","Yaxiong Wang","Yujiao Wu","Bingchen Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2404.08995v4.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.19303v1","updated":"2024-04-30T07:07:45Z","published":"2024-04-30T07:07:45Z","title":"Data Set Terminology of Artificial Intelligence in Medicine: A\n Historical Review and Recommendation","summary":" Medicine and artificial intelligence (AI) engineering represent two distinct\nfields each with decades of published history. With such history comes a set of\nterminology that has a specific way in which it is applied. However, when two\ndistinct fields with overlapping terminology start to collaborate,\nmiscommunication and misunderstandings can occur. This narrative review aims to\ngive historical context for these terms, accentuate the importance of clarity\nwhen these terms are used in medical AI contexts, and offer solutions to\nmitigate misunderstandings by readers from either field. 
Through an examination\nof historical documents, including articles, writing guidelines, and textbooks,\nthis review traces the divergent evolution of terms for data sets and their\nimpact. Initially, the discordant interpretations of the word 'validation' in\nmedical and AI contexts are explored. Then the data sets used for AI evaluation\nare classified, namely random splitting, cross-validation, temporal,\ngeographic, internal, and external sets. The accurate and standardized\ndescription of these data sets is crucial for demonstrating the robustness and\ngeneralizability of AI applications in medicine. This review clarifies existing\nliterature to provide a comprehensive understanding of these classifications\nand their implications in AI evaluation. This review then identifies often\nmisunderstood terms and proposes pragmatic solutions to mitigate terminological\nconfusion. Among these solutions are the use of standardized terminology such\nas 'training set,' 'validation (or tuning) set,' and 'test set,' and explicit\ndefinition of data set splitting terminologies in each medical AI research\npublication. This review aspires to enhance the precision of communication in\nmedical AI, thereby fostering more effective and transparent research\nmethodologies in this interdisciplinary field.\n","authors":["Shannon L. Walston","Hiroshi Seki","Hirotaka Takita","Yasuhito Mitsuyama","Shingo Sato","Akifumi Hagiwara","Rintaro Ito","Shouhei Hanaoka","Yukio Miki","Daiju Ueda"],"pdf_url":"https://arxiv.org/pdf/2404.19303v1.pdf","comment":"Totally 20 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.19299v1","updated":"2024-04-30T07:01:05Z","published":"2024-04-30T07:01:05Z","title":"Robust Pedestrian Detection via Constructing Versatile Pedestrian\n Knowledge Bank","summary":" Pedestrian detection is a crucial field of computer vision research which can\nbe adopted in various real-world applications (e.g., self-driving systems).\nHowever, despite noticeable evolution of pedestrian detection, pedestrian\nrepresentations learned within a detection framework are usually limited to\nparticular scene data in which they were trained. Therefore, in this paper, we\npropose a novel approach to construct versatile pedestrian knowledge bank\ncontaining representative pedestrian knowledge which can be applicable to\nvarious detection frameworks and adopted in diverse scenes. We extract\ngeneralized pedestrian knowledge from a large-scale pretrained model, and we\ncurate them by quantizing most representative features and guiding them to be\ndistinguishable from background scenes. Finally, we construct versatile\npedestrian knowledge bank which is composed of such representations, and then\nwe leverage it to complement and enhance pedestrian features within a\npedestrian detection framework. Through comprehensive experiments, we validate\nthe effectiveness of our method, demonstrating its versatility and\noutperforming state-of-the-art detection performances.\n","authors":["Sungjune Park","Hyunjun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2404.19299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01025v3","updated":"2024-04-30T06:55:51Z","published":"2023-11-02T06:38:19Z","title":"Integrating Language-Derived Appearance Elements with Visual Cues in\n Pedestrian Detection","summary":" Large language models (LLMs) have shown their capabilities in understanding\ncontextual and semantic information regarding knowledge of instance\nappearances. 
In this paper, we introduce a novel approach to utilize the\nstrengths of LLMs in understanding contextual appearance variations and to\nleverage this knowledge into a vision model (here, pedestrian detection). While\npedestrian detection is considered one of the crucial tasks directly related to\nour safety (e.g., intelligent driving systems), it is challenging because of\nvarying appearances and poses in diverse scenes. Therefore, we propose to\nformulate language-derived appearance elements and incorporate them with visual\ncues in pedestrian detection. To this end, we establish a description corpus\nthat includes numerous narratives describing various appearances of pedestrians\nand other instances. By feeding them through an LLM, we extract appearance\nknowledge sets that contain the representations of appearance variations.\nSubsequently, we perform a task-prompting process to obtain appearance elements\nwhich are guided representative appearance knowledge relevant to a downstream\npedestrian detection task. The obtained knowledge elements are adaptable to\nvarious detection frameworks, so that we can provide plentiful appearance\ninformation by integrating the language-derived appearance elements with visual\ncues within a detector. Through comprehensive experiments with various\npedestrian detectors, we verify the adaptability and effectiveness of our\nmethod showing noticeable performance gains and achieving state-of-the-art\ndetection performance on two public pedestrian detection benchmarks (i.e.,\nCrowdHuman and WiderPedestrian).\n","authors":["Sungjune Park","Hyunjun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2311.01025v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12106v2","updated":"2024-04-30T06:52:11Z","published":"2023-05-20T05:58:35Z","title":"Human-annotated label noise and their impact on ConvNets for remote\n sensing image scene classification","summary":" Convolutional neural networks (ConvNets) have been successfully applied to\nsatellite image scene classification. Human-labeled training datasets are\nessential for ConvNets to perform accurate classification. Errors in\nhuman-annotated training datasets are unavoidable due to the complexity of\nsatellite images. However, the distribution of real-world human-annotated label\nnoises on remote sensing images and their impact on ConvNets have not been\ninvestigated. To fill this research gap, this study, for the first time,\ncollected real-world labels from 32 participants and explored how their\nannotated label noise affect three representative ConvNets (VGG16, GoogleNet,\nand ResNet-50) for remote sensing image scene classification. We found that:\n(1) human-annotated label noise exhibits significant class and instance\ndependence; (2) an additional 1% of human-annotated label noise in training\ndata leads to 0.5% reduction in the overall accuracy of ConvNets\nclassification; (3) the error pattern of ConvNet predictions was strongly\ncorrelated with that of participant's labels. To uncover the mechanism\nunderlying the impact of human labeling errors on ConvNets, we further compared\nit with three types of simulated label noise: uniform noise, class-dependent\nnoise and instance-dependent noise. 
Our results show that the impact of\nhuman-annotated label noise on ConvNets significantly differs from all three\ntypes of simulated label noise, while both class dependence and instance\ndependence contribute to the impact of human-annotated label noise on ConvNets.\nThese observations necessitate a reevaluation of the handling of noisy labels,\nand we anticipate that our real-world label noise dataset would facilitate the\nfuture development and assessment of label-noise learning algorithms.\n","authors":["Longkang Peng","Tao Wei","Xuehong Chen","Xiaobei Chen","Rui Sun","Luoma Wan","Jin Chen","Xiaolin Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.12106v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.19294v1","updated":"2024-04-30T06:51:30Z","published":"2024-04-30T06:51:30Z","title":"Masked Spatial Propagation Network for Sparsity-Adaptive Depth\n Refinement","summary":" The main function of depth completion is to compensate for an insufficient\nand unpredictable number of sparse depth measurements of hardware sensors.\nHowever, existing research on depth completion assumes that the sparsity -- the\nnumber of points or LiDAR lines -- is fixed for training and testing. Hence,\nthe completion performance drops severely when the number of sparse depths\nchanges significantly. To address this issue, we propose the sparsity-adaptive\ndepth refinement (SDR) framework, which refines monocular depth estimates using\nsparse depth points. For SDR, we propose the masked spatial propagation network\n(MSPN) to perform SDR with a varying number of sparse depths effectively by\ngradually propagating sparse depth information throughout the entire depth map.\nExperimental results demonstrate that MPSN achieves state-of-the-art\nperformance on both SDR and conventional depth completion scenarios.\n","authors":["Jinyoung Jun","Jae-Han Lee","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2404.19294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19289v1","updated":"2024-04-30T06:39:04Z","published":"2024-04-30T06:39:04Z","title":"On Improving the Algorithm-, Model-, and Data- Efficiency of\n Self-Supervised Learning","summary":" Self-supervised learning (SSL) has developed rapidly in recent years.\nHowever, most of the mainstream methods are computationally expensive and rely\non two (or more) augmentations for each image to construct positive pairs.\nMoreover, they mainly focus on large models and large-scale datasets, which\nlack flexibility and feasibility in many practical applications. In this paper,\nwe propose an efficient single-branch SSL method based on non-parametric\ninstance discrimination, aiming to improve the algorithm, model, and data\nefficiency of SSL. By analyzing the gradient formula, we correct the update\nrule of the memory bank with improved performance. We further propose a novel\nself-distillation loss that minimizes the KL divergence between the probability\ndistribution and its square root version. We show that this alleviates the\ninfrequent updating problem in instance discrimination and greatly accelerates\nconvergence. 
We systematically compare the training overhead and performance of\ndifferent methods in different scales of data, and under different backbones.\nExperimental results show that our method outperforms various baselines with\nsignificantly less overhead, and is especially effective for limited amounts of\ndata and small models.\n","authors":["Yun-Hao Cao","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2404.19289v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.11317v6","updated":"2024-04-30T06:36:36Z","published":"2023-11-19T13:07:06Z","title":"Discrete approximations of Gaussian smoothing and Gaussian derivatives","summary":" This paper develops an in-depth treatment concerning the problem of\napproximating the Gaussian smoothing and Gaussian derivative computations in\nscale-space theory for application on discrete data. With close connections to\nprevious axiomatic treatments of continuous and discrete scale-space theory, we\nconsider three main ways discretizing these scale-space operations in terms of\nexplicit discrete convolutions, based on either (i) sampling the Gaussian\nkernels and the Gaussian derivative kernels, (ii) locally integrating the\nGaussian kernels and the Gaussian derivative kernels over each pixel support\nregion and (iii) basing the scale-space analysis on the discrete analogue of\nthe Gaussian kernel, and then computing derivative approximations by applying\nsmall-support central difference operators to the spatially smoothed image\ndata.\n We study the properties of these three main discretization methods both\ntheoretically and experimentally, and characterize their performance by\nquantitative measures, including the results they give rise to with respect to\nthe task of scale selection, investigated for four different use cases, and\nwith emphasis on the behaviour at fine scales. The results show that the\nsampled Gaussian kernels and derivatives as well as the integrated Gaussian\nkernels and derivatives perform very poorly at very fine scales. At very fine\nscales, the discrete analogue of the Gaussian kernel with its corresponding\ndiscrete derivative approximations performs substantially better. The sampled\nGaussian kernel and the sampled Gaussian derivatives do, on the other hand,\nlead to numerically very good approximations of the corresponding continuous\nresults, when the scale parameter is sufficiently large, in the experiments\npresented in the paper, when the scale parameter is greater than a value of\nabout 1, in units of the grid spacing.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.11317v6.pdf","comment":"42 pages, 21 figures"},{"id":"http://arxiv.org/abs/2404.19287v1","updated":"2024-04-30T06:34:21Z","published":"2024-04-30T06:34:21Z","title":"Revisiting the Adversarial Robustness of Vision Language Models: a\n Multimodal Perspective","summary":" Pretrained vision-language models (VLMs) like CLIP have shown impressive\ngeneralization performance across various downstream tasks, yet they remain\nvulnerable to adversarial attacks. While prior research has primarily\nconcentrated on improving the adversarial robustness of image encoders to guard\nagainst attacks on images, the exploration of text-based and multimodal attacks\nhas largely been overlooked. In this work, we initiate the first known and\ncomprehensive effort to study adapting vision-language models for adversarial\nrobustness under the multimodal attack. 
Firstly, we introduce a multimodal\nattack strategy and investigate the impact of different attacks. We then\npropose a multimodal contrastive adversarial training loss, aligning the clean\nand adversarial text embeddings with the adversarial and clean visual features,\nto enhance the adversarial robustness of both image and text encoders of CLIP.\nExtensive experiments on 15 datasets across two tasks demonstrate that our\nmethod significantly improves the adversarial robustness of CLIP.\nInterestingly, we find that the model fine-tuned against multimodal adversarial\nattacks exhibits greater robustness than its counterpart fine-tuned solely\nagainst image-based attacks, even in the context of image attacks, which may\nopen up new possibilities for enhancing the security of VLMs.\n","authors":["Wanqi Zhou","Shuanghao Bai","Qibin Zhao","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.19287v1.pdf","comment":"16 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.14606v2","updated":"2024-04-30T06:34:16Z","published":"2024-04-22T22:02:19Z","title":"Cross-Task Multi-Branch Vision Transformer for Facial Expression and\n Mask Wearing Classification","summary":" With wearing masks becoming a new cultural norm, facial expression\nrecognition (FER) while taking masks into account has become a significant\nchallenge. In this paper, we propose a unified multi-branch vision transformer\nfor facial expression recognition and mask wearing classification tasks. Our\napproach extracts shared features for both tasks using a dual-branch\narchitecture that obtains multi-scale feature representations. Furthermore, we\npropose a cross-task fusion phase that processes tokens for each task with\nseparate branches, while exchanging information using a cross attention module.\nOur proposed framework reduces the overall complexity compared with using\nseparate networks for both tasks by the simple yet effective cross-task fusion\nphase. Extensive experiments demonstrate that our proposed model performs\nbetter than or on par with different state-of-the-art methods on both facial\nexpression recognition and facial mask wearing classification task.\n","authors":["Armando Zhu","Keqin Li","Tong Wu","Peng Zhao","Bo Hong"],"pdf_url":"https://arxiv.org/pdf/2404.14606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19286v1","updated":"2024-04-30T06:33:07Z","published":"2024-04-30T06:33:07Z","title":"Soft Prompt Generation for Domain Generalization","summary":" Large pre-trained vision language models (VLMs) have shown impressive\nzero-shot ability on downstream tasks with manually designed prompt, which are\nnot optimal for specific domains. To further adapt VLMs to downstream tasks,\nsoft prompt is proposed to replace manually designed prompt, which acts as a\nlearning vector that undergoes fine-tuning based on specific domain data. Prior\nprompt learning methods primarily learn a fixed prompt and residuled prompt\nfrom training samples. However, the learned prompts lack diversity and ignore\ninformation about unseen domains, potentially compromising the transferability\nof the prompts. In this paper, we reframe the prompt learning framework from a\ngenerative perspective and propose a simple yet efficient method for the Domain\nGeneralization (DG) task, namely \\textbf{S}oft \\textbf{P}rompt\n\\textbf{G}eneration (SPG). 
To the best of our knowledge, we are the first to\nintroduce the generative model into prompt learning in VLMs and explore its\npotential for producing soft prompts by relying solely on the generative model,\nensuring the diversity of prompts. Specifically, SPG consists of a two-stage\ntraining phase and an inference phase. During the training phase, we introduce\nsoft prompt labels for each domain, aiming to incorporate the generative model\ndomain knowledge. During the inference phase, the generator of the generative\nmodel is employed to obtain instance-specific soft prompts for the unseen\ntarget domain. Extensive experiments on five domain generalization benchmarks\nof three DG tasks demonstrate that our proposed SPG achieves state-of-the-art\nperformance. The code will be available soon.\n","authors":["Shuanghao Bai","Yuedi Zhang","Wanqi Zhou","Zhirong Luan","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.19286v1.pdf","comment":"23 pages, 4 figures"},{"id":"http://arxiv.org/abs/2202.02002v2","updated":"2024-04-30T06:30:18Z","published":"2022-02-04T07:19:09Z","title":"Scaling up Multi-domain Semantic Segmentation with Sentence Embeddings","summary":" We propose an approach to semantic segmentation that achieves\nstate-of-the-art supervised performance when applied in a zero-shot setting. It\nthus achieves results equivalent to those of the supervised methods, on each of\nthe major semantic segmentation datasets, without training on those datasets.\nThis is achieved by replacing each class label with a vector-valued embedding\nof a short paragraph that describes the class. The generality and simplicity of\nthis approach enables merging multiple datasets from different domains, each\nwith varying class labels and semantics. The resulting merged semantic\nsegmentation dataset of over 2 Million images enables training a model that\nachieves performance equal to that of state-of-the-art supervised methods on 7\nbenchmark datasets, despite not using any images therefrom. By fine-tuning the\nmodel on standard semantic segmentation datasets, we also achieve a significant\nimprovement over the state-of-the-art supervised segmentation on NYUD-V2 and\nPASCAL-context at 60% and 65% mIoU, respectively. Based on the closeness of\nlanguage embeddings, our method can even segment unseen labels. Extensive\nexperiments demonstrate strong generalization to unseen image domains and\nunseen labels, and that the method enables impressive performance improvements\nin downstream applications, including depth estimation and instance\nsegmentation.\n","authors":["Wei Yin","Yifan Liu","Chunhua Shen","Baichuan Sun","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2202.02002v2.pdf","comment":"14 pages. Accepted to Int. J. Comp. Vis. (IJCV)"},{"id":"http://arxiv.org/abs/2404.06741v2","updated":"2024-04-30T06:14:23Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" Action recognition, an essential component of computer vision, plays a\npivotal role in multiple applications. Despite significant improvements brought\nby Convolutional Neural Networks (CNNs), these models suffer performance\ndeclines when trained with discontinuous video frames, which is a frequent\nscenario in real-world settings. This decline primarily results from the loss\nof temporal continuity, which is crucial for understanding the semantics of\nhuman actions. 
To overcome this issue, we introduce the 4A (Action\nAnimation-based Augmentation Approach) pipeline, which employs a series of\nsophisticated techniques: starting with 2D human pose estimation from RGB\nvideos, followed by Quaternion-based Graph Convolution Network for joint\norientation and trajectory prediction, and Dynamic Skeletal Interpolation for\ncreating smoother, diversified actions using game engine technology. This\ninnovative approach generates realistic animations in varied game environments,\nviewed from multiple viewpoints. In this way, our method effectively bridges\nthe domain gap between virtual and real-world data. In experimental\nevaluations, the 4A pipeline achieves comparable or even superior performance\nto traditional training approaches using real-world data, while requiring only\n10% of the original data volume. Additionally, our approach demonstrates\nenhanced performance on In-the-wild videos, marking a significant advancement\nin the field of action recognition.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.13414"},{"id":"http://arxiv.org/abs/2212.06278v2","updated":"2024-04-30T06:05:26Z","published":"2022-12-12T23:12:19Z","title":"Efficient Bayesian Uncertainty Estimation for nnU-Net","summary":" The self-configuring nnU-Net has achieved leading performance in a large\nrange of medical image segmentation challenges. It is widely considered as the\nmodel of choice and a strong baseline for medical image segmentation. However,\ndespite its extraordinary performance, nnU-Net does not supply a measure of\nuncertainty to indicate its possible failure. This can be problematic for\nlarge-scale image segmentation applications, where data are heterogeneous and\nnnU-Net may fail without notice. In this work, we introduce a novel method to\nestimate nnU-Net uncertainty for medical image segmentation. We propose a\nhighly effective scheme for posterior sampling of weight space for Bayesian\nuncertainty estimation. Different from previous baseline methods such as Monte\nCarlo Dropout and mean-field Bayesian Neural Networks, our proposed method does\nnot require a variational architecture and keeps the original nnU-Net\narchitecture intact, thereby preserving its excellent performance and ease of\nuse. Additionally, we boost the segmentation performance over the original\nnnU-Net via marginalizing multi-modal posterior models. We applied our method\non the public ACDC and M&M datasets of cardiac MRI and demonstrated improved\nuncertainty estimation over a range of baseline methods. The proposed method\nfurther strengthens nnU-Net for medical image segmentation in terms of both\nsegmentation accuracy and quality control.\n","authors":["Yidong Zhao","Changchun Yang","Artur Schweidtmann","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2212.06278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19279v1","updated":"2024-04-30T06:02:59Z","published":"2024-04-30T06:02:59Z","title":"Quater-GCN: Enhancing 3D Human Pose Estimation with Orientation and\n Semi-supervised Training","summary":" 3D human pose estimation is a vital task in computer vision, involving the\nprediction of human joint positions from images or videos to reconstruct a\nskeleton of a human in three-dimensional space. 
This technology is pivotal in\nvarious fields, including animation, security, human-computer interaction, and\nautomotive safety, where it promotes both technological progress and enhanced\nhuman well-being. The advent of deep learning significantly advances the\nperformance of 3D pose estimation by incorporating temporal information for\npredicting the spatial positions of human joints. However, traditional methods\noften fall short as they primarily focus on the spatial coordinates of joints\nand overlook the orientation and rotation of the connecting bones, which are\ncrucial for a comprehensive understanding of human pose in 3D space. To address\nthese limitations, we introduce Quater-GCN (Q-GCN), a directed graph\nconvolutional network tailored to enhance pose estimation by orientation. Q-GCN\nexcels by not only capturing the spatial dependencies among node joints through\ntheir coordinates but also integrating the dynamic context of bone rotations in\n2D space. This approach enables a more sophisticated representation of human\nposes by also regressing the orientation of each bone in 3D space, moving\nbeyond mere coordinate prediction. Furthermore, we complement our model with a\nsemi-supervised training strategy that leverages unlabeled data, addressing the\nchallenge of limited orientation ground truth data. Through comprehensive\nevaluations, Q-GCN has demonstrated outstanding performance against current\nstate-of-the-art methods.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.19279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00549v2","updated":"2024-04-30T06:01:22Z","published":"2024-03-01T14:18:00Z","title":"Relaxometry Guided Quantitative Cardiac Magnetic Resonance Image\n Reconstruction","summary":" Deep learning-based methods have achieved prestigious performance for\nmagnetic resonance imaging (MRI) reconstruction, enabling fast imaging for many\nclinical applications. Previous methods employ convolutional networks to learn\nthe image prior as the regularization term. In quantitative MRI, the physical\nmodel of nuclear magnetic resonance relaxometry is known, providing additional\nprior knowledge for image reconstruction. However, traditional reconstruction\nnetworks are limited to learning the spatial domain prior knowledge, ignoring\nthe relaxometry prior. Therefore, we propose a relaxometry-guided quantitative\nMRI reconstruction framework to learn the spatial prior from data and the\nrelaxometry prior from MRI physics. Additionally, we also evaluated the\nperformance of two popular reconstruction backbones, namely, recurrent\nvariational networks (RVN) and variational networks (VN) with U- Net.\nExperiments demonstrate that the proposed method achieves highly promising\nresults in quantitative MRI reconstruction.\n","authors":["Yidong Zhao","Yi Zhang","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2403.00549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19277v1","updated":"2024-04-30T05:54:40Z","published":"2024-04-30T05:54:40Z","title":"Bridge to Non-Barrier Communication: Gloss-Prompted Fine-grained Cued\n Speech Gesture Generation with Diffusion Model","summary":" Cued Speech (CS) is an advanced visual phonetic encoding system that\nintegrates lip reading with hand codings, enabling people with hearing\nimpairments to communicate efficiently. CS video generation aims to produce\nspecific lip and gesture movements of CS from audio or text inputs. 
The main\nchallenge is that given limited CS data, we strive to simultaneously generate\nfine-grained hand and finger movements, as well as lip movements, meanwhile the\ntwo kinds of movements need to be asynchronously aligned. Existing CS\ngeneration methods are fragile and prone to poor performance due to\ntemplate-based statistical models and careful hand-crafted pre-processing to\nfit the models. Therefore, we propose a novel Gloss-prompted Diffusion-based CS\nGesture generation framework (called GlossDiff). Specifically, to integrate\nadditional linguistic rules knowledge into the model. we first introduce a\nbridging instruction called \\textbf{Gloss}, which is an automatically generated\ndescriptive text to establish a direct and more delicate semantic connection\nbetween spoken language and CS gestures. Moreover, we first suggest rhythm is\nan important paralinguistic feature for CS to improve the communication\nefficacy. Therefore, we propose a novel Audio-driven Rhythmic Module (ARM) to\nlearn rhythm that matches audio speech. Moreover, in this work, we design,\nrecord, and publish the first Chinese CS dataset with four CS cuers. Extensive\nexperiments demonstrate that our method quantitatively and qualitatively\noutperforms current state-of-the-art (SOTA) methods. We release the code and\ndata at https://glossdiff.github.io/.\n","authors":["Wentao Lei","Li Liu","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2404.19277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19276v1","updated":"2024-04-30T05:51:21Z","published":"2024-04-30T05:51:21Z","title":"C2FDrone: Coarse-to-Fine Drone-to-Drone Detection using Vision\n Transformer Networks","summary":" A vision-based drone-to-drone detection system is crucial for various\napplications like collision avoidance, countering hostile drones, and\nsearch-and-rescue operations. However, detecting drones presents unique\nchallenges, including small object sizes, distortion, occlusion, and real-time\nprocessing requirements. Current methods integrating multi-scale feature fusion\nand temporal information have limitations in handling extreme blur and\nminuscule objects. To address this, we propose a novel coarse-to-fine detection\nstrategy based on vision transformers. We evaluate our approach on three\nchallenging drone-to-drone detection datasets, achieving F1 score enhancements\nof 7%, 3%, and 1% on the FL-Drones, AOT, and NPS-Drones datasets, respectively.\nAdditionally, we demonstrate real-time processing capabilities by deploying our\nmodel on an edge-computing device. Our code will be made publicly available.\n","authors":["Sairam VC Rebbapragada","Pranoy Panda","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2404.19276v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.17771v2","updated":"2024-04-30T05:40:20Z","published":"2024-04-27T03:55:53Z","title":"Characterization of dim light response in DVS pixel: Discontinuity of\n event triggering time","summary":" Dynamic Vision Sensors (DVS) have recently generated great interest because\nof the advantages of wide dynamic range and low latency compared with\nconventional frame-based cameras. However, the complicated behaviors in dim\nlight conditions are still not clear, restricting the applications of DVS. In\nthis paper, we analyze the typical DVS circuit, and find that there exists\ndiscontinuity of event triggering time. In dim light conditions, the\ndiscontinuity becomes prominent. 
We point out that the discontinuity depends\nexclusively on the changing speed of light intensity. Experimental results on\nreal event data validate the analysis and the existence of discontinuity that\nreveals the non-first-order behaviors of DVS in dim light conditions.\n","authors":["Xiao Jiang","Fei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17771v2.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.00372v3","updated":"2024-04-30T05:32:01Z","published":"2024-03-01T08:57:28Z","title":"HyperSDFusion: Bridging Hierarchical Structures in Language and Geometry\n for Enhanced 3D Text2Shape Generation","summary":" 3D shape generation from text is a fundamental task in 3D representation\nlearning. The text-shape pairs exhibit a hierarchical structure, where a\ngeneral text like ``chair\" covers all 3D shapes of the chair, while more\ndetailed prompts refer to more specific shapes. Furthermore, both text and 3D\nshapes are inherently hierarchical structures. However, existing Text2Shape\nmethods, such as SDFusion, do not exploit that. In this work, we propose\nHyperSDFusion, a dual-branch diffusion model that generates 3D shapes from a\ngiven text. Since hyperbolic space is suitable for handling hierarchical data,\nwe propose to learn the hierarchical representations of text and 3D shapes in\nhyperbolic space. First, we introduce a hyperbolic text-image encoder to learn\nthe sequential and multi-modal hierarchical features of text in hyperbolic\nspace. In addition, we design a hyperbolic text-graph convolution module to\nlearn the hierarchical features of text in hyperbolic space. In order to fully\nutilize these text features, we introduce a dual-branch structure to embed text\nfeatures in 3D feature space. At last, to endow the generated 3D shapes with a\nhierarchical structure, we devise a hyperbolic hierarchical loss. Our method is\nthe first to explore the hyperbolic hierarchical representation for\ntext-to-shape generation. Experimental results on the existing text-to-shape\npaired dataset, Text2Shape, achieved state-of-the-art results. We release our\nimplementation under HyperSDFusion.github.io.\n","authors":["Zhiying Leng","Tolga Birdal","Xiaohui Liang","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2403.00372v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19265v1","updated":"2024-04-30T05:11:32Z","published":"2024-04-30T05:11:32Z","title":"Mapping New Realities: Ground Truth Image Creation with Pix2Pix\n Image-to-Image Translation","summary":" Generative Adversarial Networks (GANs) have significantly advanced image\nprocessing, with Pix2Pix being a notable framework for image-to-image\ntranslation. This paper explores a novel application of Pix2Pix to transform\nabstract map images into realistic ground truth images, addressing the scarcity\nof such images crucial for domains like urban planning and autonomous vehicle\ntraining. We detail the Pix2Pix model's utilization for generating\nhigh-fidelity datasets, supported by a dataset of paired map and aerial images,\nand enhanced by a tailored training regimen. 
The results demonstrate the\nmodel's capability to accurately render complex urban features, establishing\nits efficacy and potential for broad real-world applications.\n","authors":["Zhenglin Li","Bo Guan","Yuanzhou Wei","Yiming Zhou","Jingyu Zhang","Jinxin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.19265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19259v1","updated":"2024-04-30T04:53:10Z","published":"2024-04-30T04:53:10Z","title":"DELINE8K: A Synthetic Data Pipeline for the Semantic Segmentation of\n Historical Documents","summary":" Document semantic segmentation is a promising avenue that can facilitate\ndocument analysis tasks, including optical character recognition (OCR), form\nclassification, and document editing. Although several synthetic datasets have\nbeen developed to distinguish handwriting from printed text, they fall short in\nclass variety and document diversity. We demonstrate the limitations of\ntraining on existing datasets when solving the National Archives Form Semantic\nSegmentation dataset (NAFSS), a dataset which we introduce. To address these\nlimitations, we propose the most comprehensive document semantic segmentation\nsynthesis pipeline to date, incorporating preprinted text, handwriting, and\ndocument backgrounds from over 10 sources to create the Document Element Layer\nINtegration Ensemble 8K, or DELINE8K dataset. Our customized dataset exhibits\nsuperior performance on the NAFSS benchmark, demonstrating it as a promising\ntool in further research. The DELINE8K dataset is available at\nhttps://github.com/Tahlor/deline8k.\n","authors":["Taylor Archibald","Tony Martinez"],"pdf_url":"https://arxiv.org/pdf/2404.19259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11793v2","updated":"2024-04-30T04:38:53Z","published":"2023-12-19T02:09:38Z","title":"An Effective Image Copy-Move Forgery Detection Using Entropy Information","summary":" Image forensics has become increasingly crucial in our daily lives. Among\nvarious types of forgeries, copy-move forgery detection has received\nconsiderable attention within the academic community. Keypoint-based\nalgorithms, particularly those based on Scale Invariant Feature Transform, have\nachieved promising outcomes. However, most of keypoint detection algorithms\nfailed to generate sufficient matches when tampered patches were occurred in\nsmooth areas, leading to insufficient matches. Therefore, this paper introduces\nentropy images to determine the coordinates and scales of keypoints based on\nScale Invariant Feature Transform detector, which make the pre-processing more\nsuitable for solving the above problems. 
Furthermore, an overlapped entropy\nlevel clustering algorithm is developed to mitigate the increased matching\ncomplexity caused by the non-ideal distribution of gray values in keypoints.\nExperimental results demonstrate that our algorithm achieves a good balance\nbetween performance and time efficiency.\n","authors":["Li Jiang","Zhaowei Lu"],"pdf_url":"https://arxiv.org/pdf/2312.11793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19250v1","updated":"2024-04-30T04:13:14Z","published":"2024-04-30T04:13:14Z","title":"Enhancing Intrinsic Features for Debiasing via Investigating\n Class-Discerning Common Attributes in Bias-Contrastive Pair","summary":" In the image classification task, deep neural networks frequently rely on\nbias attributes that are spuriously correlated with a target class in the\npresence of dataset bias, resulting in degraded performance when applied to\ndata without bias attributes. The task of debiasing aims to compel classifiers\nto learn intrinsic attributes that inherently define a target class rather than\nfocusing on bias attributes. While recent approaches mainly focus on\nemphasizing the learning of data samples without bias attributes (i.e.,\nbias-conflicting samples) compared to samples with bias attributes (i.e.,\nbias-aligned samples), they fall short of directly guiding models where to\nfocus for learning intrinsic features. To address this limitation, this paper\nproposes a method that provides the model with explicit spatial guidance that\nindicates the region of intrinsic features. We first identify the intrinsic\nfeatures by investigating the class-discerning common features between a\nbias-aligned (BA) sample and a bias-conflicting (BC) sample (i.e.,\nbias-contrastive pair). Next, we enhance the intrinsic features in the BA\nsample that are relatively under-exploited for prediction compared to the BC\nsample. To construct the bias-contrastive pair without using bias information,\nwe introduce a bias-negative score that distinguishes BC samples from BA\nsamples employing a biased model. The experiments demonstrate that our method\nachieves state-of-the-art performance on synthetic and real-world datasets with\nvarious levels of bias severity.\n","authors":["Jeonghoon Park","Chaeyeon Chung","Juyoung Lee","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2404.19250v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.19248v1","updated":"2024-04-30T04:12:36Z","published":"2024-04-30T04:12:36Z","title":"Transition Rate Scheduling for Quantization-Aware Training","summary":" Quantization-aware training (QAT) simulates a quantization process during\ntraining to lower bit-precision of weights/activations. It learns quantized\nweights indirectly by updating latent weights, i.e., full-precision inputs to a\nquantizer, using gradient-based optimizers. We claim that coupling a\nuser-defined learning rate (LR) with these optimizers is sub-optimal for QAT.\nQuantized weights transit discrete levels of a quantizer, only if corresponding\nlatent weights pass transition points, where the quantizer changes discrete\nstates. This suggests that the changes of quantized weights are affected by\nboth the LR for latent weights and their distributions. It is thus difficult to\ncontrol the degree of changes for quantized weights by scheduling the LR\nmanually. We conjecture that the degree of parameter changes in QAT is related\nto the number of quantized weights transiting discrete levels. 
Based on this,\nwe introduce a transition rate (TR) scheduling technique that controls the\nnumber of transitions of quantized weights explicitly. Instead of scheduling a\nLR for latent weights, we schedule a target TR of quantized weights, and update\nthe latent weights with a novel transition-adaptive LR (TALR), enabling\nconsidering the degree of changes for the quantized weights during QAT.\nExperimental results demonstrate the effectiveness of our approach on standard\nbenchmarks.\n","authors":["Junghyup lee","Dohyung Kim","Jeimin Jeon","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2404.19248v1.pdf","comment":"Submitted to IEEE TPAMI on Apr. 03, 2023"},{"id":"http://arxiv.org/abs/2402.12712v3","updated":"2024-04-30T04:11:58Z","published":"2024-02-20T04:25:57Z","title":"MVDiffusion++: A Dense High-resolution Multi-view Diffusion Model for\n Single or Sparse-view 3D Object Reconstruction","summary":" This paper presents a neural architecture MVDiffusion++ for 3D object\nreconstruction that synthesizes dense and high-resolution views of an object\ngiven one or a few images without camera poses. MVDiffusion++ achieves superior\nflexibility and scalability with two surprisingly simple ideas: 1) A\n``pose-free architecture'' where standard self-attention among 2D latent\nfeatures learns 3D consistency across an arbitrary number of conditional and\ngeneration views without explicitly using camera pose information; and 2) A\n``view dropout strategy'' that discards a substantial number of output views\nduring training, which reduces the training-time memory footprint and enables\ndense and high-resolution view synthesis at test time. We use the Objaverse for\ntraining and the Google Scanned Objects for evaluation with standard novel view\nsynthesis and 3D reconstruction metrics, where MVDiffusion++ significantly\noutperforms the current state of the arts. We also demonstrate a text-to-3D\napplication example by combining MVDiffusion++ with a text-to-image generative\nmodel. The project page is at https://mvdiffusion-plusplus.github.io.\n","authors":["Shitao Tang","Jiacheng Chen","Dilin Wang","Chengzhou Tang","Fuyang Zhang","Yuchen Fan","Vikas Chandra","Yasutaka Furukawa","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.12712v3.pdf","comment":"3D generation, project page: https://mvdiffusion-plusplus.github.io/"},{"id":"http://arxiv.org/abs/2404.19247v1","updated":"2024-04-30T04:11:21Z","published":"2024-04-30T04:11:21Z","title":"Improved AutoEncoder with LSTM module and KL divergence","summary":" The task of anomaly detection is to separate anomalous data from normal data\nin the dataset. Models such as deep convolutional autoencoder (CAE) network and\ndeep supporting vector data description (SVDD) model have been universally\nemployed and have demonstrated significant success in detecting anomalies.\nHowever, the over-reconstruction ability of CAE network for anomalous data can\neasily lead to high false negative rate in detecting anomalous data. On the\nother hand, the deep SVDD model has the drawback of feature collapse, which\nleads to a decrease of detection accuracy for anomalies. To address these\nproblems, we propose the Improved AutoEncoder with LSTM module and\nKullback-Leibler divergence (IAE-LSTM-KL) model in this paper. An LSTM network\nis added after the encoder to memorize feature representations of normal data.\nIn the meanwhile, the phenomenon of feature collapse can also be mitigated by\npenalizing the featured input to SVDD module via KL divergence. 
The efficacy of\nthe IAE-LSTM-KL model is validated through experiments on both synthetic and\nreal-world datasets. Experimental results show that IAE-LSTM-KL model yields\nhigher detection accuracy for anomalies. In addition, it is also found that the\nIAE-LSTM-KL model demonstrates enhanced robustness to contaminated outliers in\nthe dataset.\n","authors":["Wei Huang","Bingyang Zhang","Kaituo Zhang","Hua Gao","Rongchun Wan"],"pdf_url":"https://arxiv.org/pdf/2404.19247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10584v2","updated":"2024-04-30T03:59:18Z","published":"2024-04-16T14:10:42Z","title":"ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset\n via Beam Splitter Camera Rig","summary":" The fusion of images from dual camera systems featuring a wide-angle and a\ntelephoto camera has become a hotspot problem recently. By integrating\nsimultaneously captured wide-angle and telephoto images from these systems, the\nresulting fused image achieves a wide field of view (FOV) coupled with\nhigh-definition quality. Existing approaches are mostly deep learning methods,\nand predominantly rely on supervised learning, where the training dataset plays\na pivotal role. However, current datasets typically adopt a data synthesis\napproach generate input pairs of wide-angle and telephoto images alongside\nground-truth images. Notably, the wide-angle inputs are synthesized rather than\ncaptured using real wide-angle cameras, and the ground-truth image is captured\nby wide-angle camera whose quality is substantially lower than that of input\ntelephoto images captured by telephoto cameras. To address these limitations,\nwe introduce a novel hardware setup utilizing a beam splitter to simultaneously\ncapture three images, i.e. input pairs and ground-truth images, from two\nauthentic cellphones equipped with wide-angle and telephoto dual cameras.\nSpecifically, the wide-angle and telephoto images captured by cellphone 2 serve\nas the input pair, while the telephoto image captured by cellphone 1, which is\ncalibrated to match the optical path of the wide-angle image from cellphone 2,\nserves as the ground-truth image, maintaining quality on par with the input\ntelephoto image. Experiments validate the efficacy of our newly introduced\ndataset, named ReWiTe, significantly enhances the performance of various\nexisting methods for real-world wide-angle and telephoto dual image fusion\ntasks.\n","authors":["Chunli Peng","Xuan Dong","Tiantian Cao","Zhengqing Li","Kun Dong","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2404.10584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19242v1","updated":"2024-04-30T03:58:19Z","published":"2024-04-30T03:58:19Z","title":"A Minimal Set of Parameters Based Depth-Dependent Distortion Model and\n Its Calibration Method for Stereo Vision Systems","summary":" Depth position highly affects lens distortion, especially in close-range\nphotography, which limits the measurement accuracy of existing stereo vision\nsystems. Moreover, traditional depth-dependent distortion models and their\ncalibration methods have remained complicated. In this work, we propose a\nminimal set of parameters based depth-dependent distortion model (MDM), which\nconsiders the radial and decentering distortions of the lens to improve the\naccuracy of stereo vision systems and simplify their calibration process. 
In\naddition, we present an easy and flexible calibration method for the MDM of\nstereo vision systems with a commonly used planar pattern, which requires\ncameras to observe the planar pattern in different orientations. The proposed\ntechnique is easy to use and flexible compared with classical calibration\ntechniques for depth-dependent distortion models in which the lens must be\nperpendicular to the planar pattern. The experimental validation of the MDM and\nits calibration method showed that the MDM improved the calibration accuracy by\n56.55% and 74.15% compared with the Li's distortion model and traditional\nBrown's distortion model. Besides, an iteration-based reconstruction method is\nproposed to iteratively estimate the depth information in the MDM during\nthree-dimensional reconstruction. The results showed that the accuracy of the\niteration-based reconstruction method was improved by 9.08% compared with that\nof the non-iteration reconstruction method.\n","authors":["Xin Ma","Puchen Zhu","Xiao Li","Xiaoyin Zheng","Jianshu Zhou","Xuchen Wang","Kwok Wai Samuel Au"],"pdf_url":"https://arxiv.org/pdf/2404.19242v1.pdf","comment":"This paper has been accepted for publication in IEEE Transactions on\n Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2404.12538v2","updated":"2024-04-30T03:46:28Z","published":"2024-04-18T23:12:46Z","title":"TrACT: A Training Dynamics Aware Contrastive Learning Framework for\n Long-tail Trajectory Prediction","summary":" As a safety critical task, autonomous driving requires accurate predictions\nof road users' future trajectories for safe motion planning, particularly under\nchallenging conditions. Yet, many recent deep learning methods suffer from a\ndegraded performance on the challenging scenarios, mainly because these\nscenarios appear less frequently in the training data. To address such a\nlong-tail issue, existing methods force challenging scenarios closer together\nin the feature space during training to trigger information sharing among them\nfor more robust learning. These methods, however, primarily rely on the motion\npatterns to characterize scenarios, omitting more informative contextual\ninformation, such as interactions and scene layout. We argue that exploiting\nsuch information not only improves prediction accuracy but also scene\ncompliance of the generated trajectories. In this paper, we propose to\nincorporate richer training dynamics information into a prototypical\ncontrastive learning framework. More specifically, we propose a two-stage\nprocess. First, we generate rich contextual features using a baseline\nencoder-decoder framework. These features are split into clusters based on the\nmodel's output errors, using the training dynamics information, and a prototype\nis computed within each cluster. Second, we retrain the model using the\nprototypes in a contrastive learning framework. We conduct empirical\nevaluations of our approach using two large-scale naturalistic datasets and\nshow that our method achieves state-of-the-art performance by improving\naccuracy and scene compliance on the long-tail samples. 
Furthermore, we perform\nexperiments on a subset of the clusters to highlight the additional benefit of\nour approach in reducing training bias.\n","authors":["Junrui Zhang","Mozhgan Pourkeshavarz","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2404.12538v2.pdf","comment":"2024 IEEE Intelligent Vehicles Symposium (IV)"},{"id":"http://arxiv.org/abs/2402.06537v2","updated":"2024-04-30T03:44:13Z","published":"2024-02-09T16:51:01Z","title":"Feature Density Estimation for Out-of-Distribution Detection via\n Normalizing Flows","summary":" Out-of-distribution (OOD) detection is a critical task for safe deployment of\nlearning systems in the open world setting. In this work, we investigate the\nuse of feature density estimation via normalizing flows for OOD detection and\npresent a fully unsupervised approach which requires no exposure to OOD data,\navoiding researcher bias in OOD sample selection. This is a post-hoc method\nwhich can be applied to any pretrained model, and involves training a\nlightweight auxiliary normalizing flow model to perform the out-of-distribution\ndetection via density thresholding. Experiments on OOD detection in image\nclassification show strong results for far-OOD data detection with only a\nsingle epoch of flow training, including 98.2% AUROC for ImageNet-1k vs.\nTextures, which exceeds the state of the art by 7.8%. We additionally explore\nthe connection between the feature space distribution of the pretrained model\nand the performance of our method. Finally, we provide insights into training\npitfalls that have plagued normalizing flows for use in OOD detection.\n","authors":["Evan D. Cook","Marc-Antoine Lavoie","Steven L. Waslander"],"pdf_url":"https://arxiv.org/pdf/2402.06537v2.pdf","comment":"Accepted to CRV 2024"},{"id":"http://arxiv.org/abs/2404.19227v1","updated":"2024-04-30T03:13:06Z","published":"2024-04-30T03:13:06Z","title":"Espresso: Robust Concept Filtering in Text-to-Image Models","summary":" Diffusion-based text-to-image (T2I) models generate high-fidelity images for\ngiven textual prompts. They are trained on large datasets scraped from the\nInternet, potentially containing unacceptable concepts (e.g., copyright\ninfringing or unsafe). Retraining T2I models after filtering out unacceptable\nconcepts in the training data is inefficient and degrades utility. Hence, there\nis a need for concept removal techniques (CRTs) which are effective in removing\nunacceptable concepts, utility-preserving on acceptable concepts, and robust\nagainst evasion with adversarial prompts. None of the prior filtering and\nfine-tuning CRTs satisfy all these requirements simultaneously.\n We introduce Espresso, the first robust concept filter based on Contrastive\nLanguage-Image Pre-Training (CLIP). It identifies unacceptable concepts by\nprojecting the generated image's embedding onto the vector connecting\nunacceptable and acceptable concepts in the joint text-image embedding space.\nThis ensures robustness by restricting the adversary to adding noise only along\nthis vector, in the direction of the acceptable concept. Further fine-tuning\nEspresso to separate embeddings of acceptable and unacceptable concepts, while\npreserving their pairing with image embeddings, ensures both effectiveness and\nutility. 
We evaluate Espresso on eleven concepts to show that it is effective\n(~5% CLIP accuracy on unacceptable concepts), utility-preserving (~93%\nnormalized CLIP score on acceptable concepts), and robust (~4% CLIP accuracy on\nadversarial prompts for unacceptable concepts). Finally, we present theoretical\nbounds for the certified robustness of Espresso against adversarial prompts,\nand an empirical analysis.\n","authors":["Anudeep Das","Vasisht Duddu","Rui Zhang","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2404.19227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19221v1","updated":"2024-04-30T02:48:20Z","published":"2024-04-30T02:48:20Z","title":"Transcrib3D: 3D Referring Expression Resolution through Large Language\n Models","summary":" If robots are to work effectively alongside people, they must be able to\ninterpret natural language references to objects in their 3D environment.\nUnderstanding 3D referring expressions is challenging -- it requires the\nability to both parse the 3D structure of the scene and correctly ground\nfree-form language in the presence of distraction and clutter. We introduce\nTranscrib3D, an approach that brings together 3D detection methods and the\nemergent reasoning capabilities of large language models (LLMs). Transcrib3D\nuses text as the unifying medium, which allows us to sidestep the need to learn\nshared representations connecting multi-modal inputs, which would require\nmassive amounts of annotated 3D data. As a demonstration of its effectiveness,\nTranscrib3D achieves state-of-the-art results on 3D reference resolution\nbenchmarks, with a great leap in performance from previous multi-modality\nbaselines. To improve upon zero-shot performance and facilitate local\ndeployment on edge computers and robots, we propose self-correction for\nfine-tuning that trains smaller models, resulting in performance close to that\nof large models. We show that our method enables a real robot to perform\npick-and-place tasks given queries that contain challenging referring\nexpressions. Project site is at https://ripl.github.io/Transcrib3D.\n","authors":["Jiading Fang","Xiangshan Tan","Shengjie Lin","Igor Vasiljevic","Vitor Guizilini","Hongyuan Mei","Rares Ambrus","Gregory Shakhnarovich","Matthew R Walter"],"pdf_url":"https://arxiv.org/pdf/2404.19221v1.pdf","comment":"CORLW 2023"},{"id":"http://arxiv.org/abs/2404.19205v1","updated":"2024-04-30T02:05:18Z","published":"2024-04-30T02:05:18Z","title":"TableVQA-Bench: A Visual Question Answering Benchmark on Multiple Table\n Domains","summary":" In this paper, we establish a benchmark for table visual question answering,\nreferred to as the TableVQA-Bench, derived from pre-existing table\nquestion-answering (QA) and table structure recognition datasets. It is\nimportant to note that existing datasets have not incorporated images or QA\npairs, which are two crucial components of TableVQA. As such, the primary\nobjective of this paper is to obtain these necessary components. Specifically,\nimages are sourced either through the application of a \\textit{stylesheet} or\nby employing the proposed table rendering system. QA pairs are generated by\nexploiting the large language model (LLM) where the input is a text-formatted\ntable. Ultimately, the completed TableVQA-Bench comprises 1,500 QA pairs. We\ncomprehensively compare the performance of various multi-modal large language\nmodels (MLLMs) on TableVQA-Bench. GPT-4V achieves the highest accuracy among\ncommercial and open-sourced MLLMs from our experiments. 
Moreover, we discover\nthat the number of vision queries plays a significant role in TableVQA\nperformance. To further analyze the capabilities of MLLMs in comparison to\ntheir LLM backbones, we investigate by presenting image-formatted tables to\nMLLMs and text-formatted tables to LLMs, respectively. Our findings suggest\nthat processing visual inputs is more challenging than text inputs, as\nevidenced by the lower performance of MLLMs, despite generally requiring higher\ncomputational costs than LLMs. The proposed TableVQA-Bench and evaluation codes\nare available at\n\\href{https://github.com/naver-ai/tablevqabench}{https://github.com/naver-ai/tablevqabench}.\n","authors":["Yoonsik Kim","Moonbin Yim","Ka Yeon Song"],"pdf_url":"https://arxiv.org/pdf/2404.19205v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.19204v1","updated":"2024-04-30T02:04:49Z","published":"2024-04-30T02:04:49Z","title":"NeRF-Insert: 3D Local Editing with Multimodal Control Signals","summary":" We propose NeRF-Insert, a NeRF editing framework that allows users to make\nhigh-quality local edits with a flexible level of control. Unlike previous work\nthat relied on image-to-image models, we cast scene editing as an in-painting\nproblem, which encourages the global structure of the scene to be preserved.\nMoreover, while most existing methods use only textual prompts to condition\nedits, our framework accepts a combination of inputs of different modalities as\nreference. More precisely, a user may provide a combination of textual and\nvisual inputs including images, CAD models, and binary image masks for\nspecifying a 3D region. We use generic image generation models to in-paint the\nscene from multiple viewpoints, and lift the local edits to a 3D-consistent\nNeRF edit. Compared to previous methods, our results show better visual quality\nand also maintain stronger consistency with the original NeRF.\n","authors":["Benet Oriol Sabat","Alessandro Achille","Matthew Trager","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.19204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19201v1","updated":"2024-04-30T01:59:25Z","published":"2024-04-30T01:59:25Z","title":"Global Search Optics: Automatically Exploring Optimal Solutions to\n Compact Computational Imaging Systems","summary":" The popularity of mobile vision creates a demand for advanced compact\ncomputational imaging systems, which call for the development of both a\nlightweight optical system and an effective image reconstruction model.\nRecently, joint design pipelines come to the research forefront, where the two\nsignificant components are simultaneously optimized via data-driven learning to\nrealize the optimal system design. However, the effectiveness of these designs\nlargely depends on the initial setup of the optical system, complicated by a\nnon-convex solution space that impedes reaching a globally optimal solution. In\nthis work, we present Global Search Optics (GSO) to automatically design\ncompact computational imaging systems through two parts: (i) Fused Optimization\nMethod for Automatic Optical Design (OptiFusion), which searches for diverse\ninitial optical systems under certain design specifications; and (ii) Efficient\nPhysic-aware Joint Optimization (EPJO), which conducts parallel joint\noptimization of initial optical systems and image reconstruction networks with\nthe consideration of physical constraints, culminating in the selection of the\noptimal solution. 
Extensive experimental results on the design of three-piece\n(3P) sphere computational imaging systems illustrate that the GSO serves as a\ntransformative end-to-end lens design paradigm for superior global optimal\nstructure searching ability, which provides compact computational imaging\nsystems with higher imaging quality compared to traditional methods. The source\ncode will be made publicly available at https://github.com/wumengshenyou/GSO.\n","authors":["Yao Gao","Qi Jiang","Shaohua Gao","Lei Sun","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.19201v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/wumengshenyou/GSO"},{"id":"http://arxiv.org/abs/2310.02556v2","updated":"2024-04-30T01:58:16Z","published":"2023-10-04T03:30:24Z","title":"NOLA: Compressing LoRA using Linear Combination of Random Basis","summary":" Fine-tuning Large Language Models (LLMs) and storing them for each downstream\ntask or domain is impractical because of the massive model size (e.g., 350GB in\nGPT-3). Current literature, such as LoRA, showcases the potential of low-rank\nmodifications to the original weights of an LLM, enabling efficient adaptation\nand storage for task-specific models. These methods can reduce the number of\nparameters needed to fine-tune an LLM by several orders of magnitude. Yet,\nthese methods face two primary limitations: (1) the parameter count is\nlower-bounded by the rank one decomposition, and (2) the extent of reduction is\nheavily influenced by both the model architecture and the chosen rank. We\nintroduce NOLA, which overcomes the rank one lower bound present in LoRA. It\nachieves this by re-parameterizing the low-rank matrices in LoRA using linear\ncombinations of randomly generated matrices (basis) and optimizing the linear\nmixture coefficients only. This approach allows us to decouple the number of\ntrainable parameters from both the choice of rank and the network architecture.\nWe present adaptation results using GPT-2, LLaMA-2, and ViT in natural language\nand computer vision tasks. NOLA performs as well as LoRA models with much fewer\nnumber of parameters compared to LoRA with rank one, the best compression LoRA\ncan archive. Particularly, on LLaMA-2 70B, our method is almost 20 times more\ncompact than the most compressed LoRA without degradation in accuracy. Our code\nis available here: https://github.com/UCDvision/NOLA\n","authors":["Soroush Abbasi Koohpayegani","KL Navaneet","Parsa Nooralinejad","Soheil Kolouri","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2310.02556v2.pdf","comment":"ICLR 2024. Our code is available here:\n https://github.com/UCDvision/NOLA"},{"id":"http://arxiv.org/abs/2404.17774v2","updated":"2024-04-30T01:53:27Z","published":"2024-04-27T04:13:39Z","title":"High-quality Surface Reconstruction using Gaussian Surfels","summary":" We propose a novel point-based representation, Gaussian surfels, to combine\nthe advantages of the flexible optimization procedure in 3D Gaussian points and\nthe surface alignment property of surfels. This is achieved by directly setting\nthe z-scale of 3D Gaussian points to 0, effectively flattening the original 3D\nellipsoid into a 2D ellipse. Such a design provides clear guidance to the\noptimizer. By treating the local z-axis as the normal direction, it greatly\nimproves optimization stability and surface alignment. 
While the derivatives to\nthe local z-axis computed from the covariance matrix are zero in this setting,\nwe design a self-supervised normal-depth consistency loss to remedy this issue.\nMonocular normal priors and foreground masks are incorporated to enhance the\nquality of the reconstruction, mitigating issues related to highlights and\nbackground. We propose a volumetric cutting method to aggregate the information\nof Gaussian surfels so as to remove erroneous points in depth maps generated by\nalpha blending. Finally, we apply screened Poisson reconstruction method to the\nfused depth maps to extract the surface mesh. Experimental results show that\nour method demonstrates superior performance in surface reconstruction compared\nto state-of-the-art neural volume rendering and point-based rendering methods.\n","authors":["Pinxuan Dai","Jiamin Xu","Wenxiang Xie","Xinguo Liu","Huamin Wang","Weiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2404.17774v2.pdf","comment":"Results added and improved"},{"id":"http://arxiv.org/abs/2302.12172v5","updated":"2024-04-30T00:52:24Z","published":"2023-02-23T17:13:25Z","title":"Vision-Language Generative Model for View-Specific Chest X-ray\n Generation","summary":" Synthetic medical data generation has opened up new possibilities in the\nhealthcare domain, offering a powerful tool for simulating clinical scenarios,\nenhancing diagnostic and treatment quality, gaining granular medical knowledge,\nand accelerating the development of unbiased algorithms. In this context, we\npresent a novel approach called ViewXGen, designed to overcome the limitations\nof existing methods that rely on general domain pipelines using only radiology\nreports to generate frontal-view chest X-rays. Our approach takes into\nconsideration the diverse view positions found in the dataset, enabling the\ngeneration of chest X-rays with specific views, which marks a significant\nadvancement in the field. To achieve this, we introduce a set of specially\ndesigned tokens for each view position, tailoring the generation process to the\nuser's preferences. Furthermore, we leverage multi-view chest X-rays as input,\nincorporating valuable information from different views within the same study.\nThis integration rectifies potential errors and contributes to faithfully\ncapturing abnormal findings in chest X-ray generation. To validate the\neffectiveness of our approach, we conducted statistical analyses, evaluating\nits performance in a clinical efficacy metric on the MIMIC-CXR dataset. Also,\nhuman evaluation demonstrates the remarkable capabilities of ViewXGen,\nparticularly in producing realistic view-specific X-rays that closely resemble\nthe original images.\n","authors":["Hyungyung Lee","Da Young Lee","Wonjae Kim","Jin-Hwa Kim","Tackeun Kim","Jihang Kim","Leonard Sunwoo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2302.12172v5.pdf","comment":"Accepted at CHIL 2024"},{"id":"http://arxiv.org/abs/2404.19174v1","updated":"2024-04-30T00:37:55Z","published":"2024-04-30T00:37:55Z","title":"XFeat: Accelerated Features for Lightweight Image Matching","summary":" We introduce a lightweight and accurate architecture for resource-efficient\nvisual correspondence. Our method, dubbed XFeat (Accelerated Features),\nrevisits fundamental design choices in convolutional neural networks for\ndetecting, extracting, and matching local features. Our new model satisfies a\ncritical need for fast and robust algorithms suitable to resource-limited\ndevices. 
In particular, accurate image matching requires sufficiently large\nimage resolutions - for this reason, we keep the resolution as large as\npossible while limiting the number of channels in the network. Besides, our\nmodel is designed to offer the choice of matching at the sparse or semi-dense\nlevels, each of which may be more suitable for different downstream\napplications, such as visual navigation and augmented reality. Our model is the\nfirst to offer semi-dense matching efficiently, leveraging a novel match\nrefinement module that relies on coarse local descriptors. XFeat is versatile\nand hardware-independent, surpassing current deep learning-based local features\nin speed (up to 5x faster) with comparable or better accuracy, proven in pose\nestimation and visual localization. We showcase it running in real-time on an\ninexpensive laptop CPU without specialized hardware optimizations. Code and\nweights are available at www.verlab.dcc.ufmg.br/descriptors/xfeat_cvpr24.\n","authors":["Guilherme Potje","Felipe Cadar","Andre Araujo","Renato Martins","Erickson R. Nascimento"],"pdf_url":"https://arxiv.org/pdf/2404.19174v1.pdf","comment":"CVPR 2024; Source code available at\n www.verlab.dcc.ufmg.br/descriptors/xfeat_cvpr24"},{"id":"http://arxiv.org/abs/2404.19171v1","updated":"2024-04-30T00:25:44Z","published":"2024-04-30T00:25:44Z","title":"Explicit Correlation Learning for Generalizable Cross-Modal Deepfake\n Detection","summary":" With the rising prevalence of deepfakes, there is a growing interest in\ndeveloping generalizable detection methods for various types of deepfakes.\nWhile effective in their specific modalities, traditional detection methods\nfall short in addressing the generalizability of detection across diverse\ncross-modal deepfakes. This paper aims to explicitly learn potential\ncross-modal correlation to enhance deepfake detection towards various\ngeneration scenarios. Our approach introduces a correlation distillation task,\nwhich models the inherent cross-modal correlation based on content information.\nThis strategy helps to prevent the model from overfitting merely to\naudio-visual synchronization. Additionally, we present the Cross-Modal Deepfake\nDataset (CMDFD), a comprehensive dataset with four generation methods to\nevaluate the detection of diverse cross-modal deepfakes. The experimental\nresults on CMDFD and FakeAVCeleb datasets demonstrate the superior\ngeneralizability of our method over existing state-of-the-art methods. Our code\nand data can be found at\n\\url{https://github.com/ljj898/CMDFD-Dataset-and-Deepfake-Detection}.\n","authors":["Cai Yu","Shan Jia","Xiaomeng Fu","Jin Liu","Jiahe Tian","Jiao Dai","Xi Wang","Siwei Lyu","Jizhong Han"],"pdf_url":"https://arxiv.org/pdf/2404.19171v1.pdf","comment":"accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2404.19168v1","updated":"2024-04-30T00:16:59Z","published":"2024-04-30T00:16:59Z","title":"PEVA-Net: Prompt-Enhanced View Aggregation Network for Zero/Few-Shot\n Multi-View 3D Shape Recognition","summary":" Large vision-language models have impressively promote the performance of 2D\nvisual recognition under zero/few-shot scenarios. In this paper, we focus on\nexploiting the large vision-language model, i.e., CLIP, to address\nzero/few-shot 3D shape recognition based on multi-view representations. 
The key\nchallenge for both tasks is to generate a discriminative descriptor of the 3D\nshape represented by multiple view images under the scenarios of either without\nexplicit training (zero-shot 3D shape recognition) or training with a limited\nnumber of data (few-shot 3D shape recognition). We analyze that both tasks are\nrelevant and can be considered simultaneously. Specifically, leveraging the\ndescriptor which is effective for zero-shot inference to guide the tuning of\nthe aggregated descriptor under the few-shot training can significantly improve\nthe few-shot learning efficacy. Hence, we propose Prompt-Enhanced View\nAggregation Network (PEVA-Net) to simultaneously address zero/few-shot 3D shape\nrecognition. Under the zero-shot scenario, we propose to leverage the prompts\nbuilt up from candidate categories to enhance the aggregation process of\nmultiple view-associated visual features. The resulting aggregated feature\nserves for effective zero-shot recognition of the 3D shapes. Under the few-shot\nscenario, we first exploit a transformer encoder to aggregate the\nview-associated visual features into a global descriptor. To tune the encoder,\ntogether with the main classification loss, we propose a self-distillation\nscheme via a feature distillation loss by treating the zero-shot descriptor as\nthe guidance signal for the few-shot descriptor. This scheme can significantly\nenhance the few-shot learning efficacy.\n","authors":["Dongyun Lin","Yi Cheng","Shangbo Mao","Aiyuan Guo","Yiqun Li"],"pdf_url":"https://arxiv.org/pdf/2404.19168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16043v3","updated":"2024-04-30T00:11:33Z","published":"2023-12-26T13:14:17Z","title":"An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced\n linear classification","summary":" This article presents a new polynomial parameterized sigmoid called SIGTRON,\nwhich is an extended asymmetric sigmoid with Perceptron, and its companion\nconvex model called SIGTRON-imbalanced classification (SIC) model that employs\na virtual SIGTRON-induced convex loss function. In contrast to the conventional\n$\\pi$-weighted cost-sensitive learning model, the SIC model does not have an\nexternal $\\pi$-weight on the loss function but has internal parameters in the\nvirtual SIGTRON-induced loss function. As a consequence, when the given\ntraining dataset is close to the well-balanced condition considering the\n(scale-)class-imbalance ratio, we show that the proposed SIC model is more\nadaptive to variations of the dataset, such as the inconsistency of the\n(scale-)class-imbalance ratio between the training and test datasets. This\nadaptation is justified by a skewed hyperplane equation, created via\nlinearization of the gradient satisfying $\\epsilon$-optimal condition.\n Additionally, we present a quasi-Newton optimization(L-BFGS) framework for\nthe virtual convex loss by developing an interval-based bisection line search.\nEmpirically, we have observed that the proposed approach outperforms (or is\ncomparable to) $\\pi$-weighted convex focal loss and balanced classifier\nLIBLINEAR(logistic regression, SVM, and L2SVM) in terms of test classification\naccuracy with $51$ two-class and $67$ multi-class datasets. 
In binary\nclassification problems, where the scale-class-imbalance ratio of the training\ndataset is not significant but the inconsistency exists, a group of SIC models\nwith the best test accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC\nwith RBF kernel), a well-known kernel-based classifier.\n","authors":["Hyenkyun Woo"],"pdf_url":"https://arxiv.org/pdf/2312.16043v3.pdf","comment":"26 pages, 9 figures, revised version"},{"id":"http://arxiv.org/abs/2403.13315v2","updated":"2024-04-30T23:53:47Z","published":"2024-03-20T05:37:24Z","title":"PuzzleVQA: Diagnosing Multimodal Reasoning Challenges of Language Models\n with Abstract Visual Patterns","summary":" Large multimodal models extend the impressive capabilities of large language\nmodels by integrating multimodal understanding abilities. However, it is not\nclear how they can emulate the general intelligence and reasoning ability of\nhumans. As recognizing patterns and abstracting concepts are key to general\nintelligence, we introduce PuzzleVQA, a collection of puzzles based on abstract\npatterns. With this dataset, we evaluate large multimodal models with abstract\npatterns based on fundamental concepts, including colors, numbers, sizes, and\nshapes. Through our experiments on state-of-the-art large multimodal models, we\nfind that they are not able to generalize well to simple abstract patterns.\nNotably, even GPT-4V cannot solve more than half of the puzzles. To diagnose\nthe reasoning challenges in large multimodal models, we progressively guide the\nmodels with our ground truth reasoning explanations for visual perception,\ninductive reasoning, and deductive reasoning. Our systematic analysis finds\nthat the main bottlenecks of GPT-4V are weaker visual perception and inductive\nreasoning abilities. Through this work, we hope to shed light on the\nlimitations of large multimodal models and how they can better emulate human\ncognitive processes in the future (Our data and code will be released publicly\nat https://github.com/declare-lab/LLM-PuzzleTest).\n","authors":["Yew Ken Chia","Vernon Toh Yan Han","Deepanway Ghosal","Lidong Bing","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2403.13315v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00251v1","updated":"2024-04-30T23:49:26Z","published":"2024-04-30T23:49:26Z","title":"Semantically Consistent Video Inpainting with Conditional Diffusion\n Models","summary":" Current state-of-the-art methods for video inpainting typically rely on\noptical flow or attention-based approaches to inpaint masked regions by\npropagating visual information across frames. While such approaches have led to\nsignificant progress on standard benchmarks, they struggle with tasks that\nrequire the synthesis of novel content that is not present in other frames. In\nthis paper we reframe video inpainting as a conditional generative modeling\nproblem and present a framework for solving such problems with conditional\nvideo diffusion models. 
We highlight the advantages of using a generative\napproach for this task, showing that our method is capable of generating\ndiverse, high-quality inpaintings and synthesizing new content that is\nspatially, temporally, and semantically consistent with the provided context.\n","authors":["Dylan Green","William Harvey","Saeid Naderiparizi","Matthew Niedoba","Yunpeng Liu","Xiaoxuan Liang","Jonathan Lavington","Ke Zhang","Vasileios Lioutas","Setareh Dabiri","Adam Scibior","Berend Zwartsenberg","Frank Wood"],"pdf_url":"https://arxiv.org/pdf/2405.00251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00250v1","updated":"2024-04-30T23:45:16Z","published":"2024-04-30T23:45:16Z","title":"SemVecNet: Generalizable Vector Map Generation for Arbitrary Sensor\n Configurations","summary":" Vector maps are essential in autonomous driving for tasks like localization\nand planning, yet their creation and maintenance are notably costly. While\nrecent advances in online vector map generation for autonomous vehicles are\npromising, current models lack adaptability to different sensor configurations.\nThey tend to overfit to specific sensor poses, leading to decreased performance\nand higher retraining costs. This limitation hampers their practical use in\nreal-world applications. In response to this challenge, we propose a modular\npipeline for vector map generation with improved generalization to sensor\nconfigurations. The pipeline leverages probabilistic semantic mapping to\ngenerate a bird's-eye-view (BEV) semantic map as an intermediate\nrepresentation. This intermediate representation is then converted to a vector\nmap using the MapTRv2 decoder. By adopting a BEV semantic map robust to\ndifferent sensor configurations, our proposed approach significantly improves\nthe generalization performance. We evaluate the model on datasets with sensor\nconfigurations not used during training. Our evaluation sets includes larger\npublic datasets, and smaller scale private data collected on our platform. Our\nmodel generalizes significantly better than the state-of-the-art methods.\n","authors":["Narayanan Elavathur Ranganatha","Hengyuan Zhang","Shashank Venkatramani","Jing-Yan Liao","Henrik I. Christensen"],"pdf_url":"https://arxiv.org/pdf/2405.00250v1.pdf","comment":"8 pages, 6 figures, Accepted to IV 2024"},{"id":"http://arxiv.org/abs/2405.00244v1","updated":"2024-04-30T23:29:26Z","published":"2024-04-30T23:29:26Z","title":"Towards Real-World HDR Video Reconstruction: A Large-Scale Benchmark\n Dataset and A Two-Stage Alignment Network","summary":" As an important and practical way to obtain high dynamic range (HDR) video,\nHDR video reconstruction from sequences with alternating exposures is still\nless explored, mainly due to the lack of large-scale real-world datasets.\nExisting methods are mostly trained on synthetic datasets, which perform poorly\nin real scenes. In this work, to facilitate the development of real-world HDR\nvideo reconstruction, we present Real-HDRV, a large-scale real-world benchmark\ndataset for HDR video reconstruction, featuring various scenes, diverse motion\npatterns, and high-quality labels. Specifically, our dataset contains 500\nLDRs-HDRs video pairs, comprising about 28,000 LDR frames and 4,000 HDR labels,\ncovering daytime, nighttime, indoor, and outdoor scenes. 
To our best knowledge,\nour dataset is the largest real-world HDR video reconstruction dataset.\nCorrespondingly, we propose an end-to-end network for HDR video reconstruction,\nwhere a novel two-stage strategy is designed to perform alignment sequentially.\nSpecifically, the first stage performs global alignment with the adaptively\nestimated global offsets, reducing the difficulty of subsequent alignment. The\nsecond stage implicitly performs local alignment in a coarse-to-fine manner at\nthe feature level using the adaptive separable convolution. Extensive\nexperiments demonstrate that: (1) models trained on our dataset can achieve\nbetter performance on real scenes than those trained on synthetic datasets; (2)\nour method outperforms previous state-of-the-art methods. Our dataset is\navailable at https://github.com/yungsyu99/Real-HDRV.\n","authors":["Yong Shu","Liquan Shen","Xiangyu Hu","Mengyao Li","Zihao Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.00244v1.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2405.00242v1","updated":"2024-04-30T23:18:51Z","published":"2024-04-30T23:18:51Z","title":"Guiding Attention in End-to-End Driving Models","summary":" Vision-based end-to-end driving models trained by imitation learning can lead\nto affordable solutions for autonomous driving. However, training these\nwell-performing models usually requires a huge amount of data, while still\nlacking explicit and intuitive activation maps to reveal the inner workings of\nthese models while driving. In this paper, we study how to guide the attention\nof these models to improve their driving quality and obtain more intuitive\nactivation maps by adding a loss term during training using salient semantic\nmaps. In contrast to previous work, our method does not require these salient\nsemantic maps to be available during testing time, as well as removing the need\nto modify the model's architecture to which it is applied. We perform tests\nusing perfect and noisy salient semantic maps with encouraging results in both,\nthe latter of which is inspired by possible errors encountered with real data.\nUsing CIL++ as a representative state-of-the-art model and the CARLA simulator\nwith its standard benchmarks, we conduct experiments that show the\neffectiveness of our method in training better autonomous driving models,\nespecially when data and computational resources are scarce.\n","authors":["Diego Porres","Yi Xiao","Gabriel Villalonga","Alexandre Levy","Antonio M. López"],"pdf_url":"https://arxiv.org/pdf/2405.00242v1.pdf","comment":"Accepted for publication at the 35th IEEE Intelligent Vehicles\n Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2405.00239v1","updated":"2024-04-30T23:09:54Z","published":"2024-04-30T23:09:54Z","title":"IgCONDA-PET: Implicitly-Guided Counterfactual Diffusion for Detecting\n Anomalies in PET Images","summary":" Minimizing the need for pixel-level annotated data for training PET anomaly\nsegmentation networks is crucial, particularly due to time and cost constraints\nrelated to expert annotations. Current un-/weakly-supervised anomaly detection\nmethods rely on autoencoder or generative adversarial networks trained only on\nhealthy data, although these are more challenging to train. In this work, we\npresent a weakly supervised and Implicitly guided COuNterfactual diffusion\nmodel for Detecting Anomalies in PET images, branded as IgCONDA-PET. The\ntraining is conditioned on image class labels (healthy vs. 
unhealthy) along\nwith implicit guidance to generate counterfactuals for an unhealthy image with\nanomalies. The counterfactual generation process synthesizes the healthy\ncounterpart for a given unhealthy image, and the difference between the two\nfacilitates the identification of anomaly locations. The code is available at:\nhttps://github.com/igcondapet/IgCONDA-PET.git\n","authors":["Shadab Ahamed","Yixi Xu","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2405.00239v1.pdf","comment":"12 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2405.00236v1","updated":"2024-04-30T23:04:36Z","published":"2024-04-30T23:04:36Z","title":"STT: Stateful Tracking with Transformers for Autonomous Driving","summary":" Tracking objects in three-dimensional space is critical for autonomous\ndriving. To ensure safety while driving, the tracker must be able to reliably\ntrack objects across frames and accurately estimate their states such as\nvelocity and acceleration in the present. Existing works frequently focus on\nthe association task while either neglecting the model performance on state\nestimation or deploying complex heuristics to predict the states. In this\npaper, we propose STT, a Stateful Tracking model built with Transformers, that\ncan consistently track objects in the scenes while also predicting their states\naccurately. STT consumes rich appearance, geometry, and motion signals through\nlong term history of detections and is jointly optimized for both data\nassociation and state estimation tasks. Since the standard tracking metrics\nlike MOTA and MOTP do not capture the combined performance of the two tasks in\nthe wider spectrum of object states, we extend them with new metrics called\nS-MOTA and MOTPS that address this limitation. STT achieves competitive\nreal-time performance on the Waymo Open Dataset.\n","authors":["Longlong Jing","Ruichi Yu","Xu Chen","Zhengli Zhao","Shiwei Sheng","Colin Graber","Qi Chen","Qinru Li","Shangxuan Wu","Han Deng","Sangjin Lee","Chris Sweeney","Qiurui He","Wei-Chih Hung","Tong He","Xingyi Zhou","Farshid Moussavi","Zijian Guo","Yin Zhou","Mingxing Tan","Weilong Yang","Congcong Li"],"pdf_url":"https://arxiv.org/pdf/2405.00236v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2405.00228v1","updated":"2024-04-30T22:32:02Z","published":"2024-04-30T22:32:02Z","title":"Synthetic Face Datasets Generation via Latent Space Exploration from\n Brownian Identity Diffusion","summary":" Face Recognition (FR) models are trained on large-scale datasets, which have\nprivacy and ethical concerns. Lately, the use of synthetic data to complement\nor replace genuine data for the training of FR models has been proposed. While\npromising results have been obtained, it still remains unclear if generative\nmodels can yield diverse enough data for such tasks. In this work, we introduce\na new method, inspired by the physical motion of soft particles subjected to\nstochastic Brownian forces, allowing us to sample identities distributions in a\nlatent space under various constraints. With this in hands, we generate several\nface datasets and benchmark them by training FR models, showing that data\ngenerated with our method exceeds the performance of previously GAN-based\ndatasets and achieves competitive performance with state-of-the-art\ndiffusion-based synthetic datasets. 
We also show that this method can be used\nto mitigate leakage from the generator's training set and explore the ability\nof generative models to generate data beyond it.\n","authors":["David Geissbühler","Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2405.00228v1.pdf","comment":"17 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2312.06709v5","updated":"2024-04-30T22:22:03Z","published":"2023-12-10T17:07:29Z","title":"AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains\n Into One","summary":" A handful of visual foundation models (VFMs) have recently emerged as the\nbackbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are\ntrained with distinct objectives, exhibiting unique characteristics for various\ndownstream tasks. We find that despite their conceptual differences, these\nmodels can be effectively merged into a unified model through multi-teacher\ndistillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All\nDomains Into One). This integrative approach not only surpasses the performance\nof individual teacher models but also amalgamates their distinctive features,\nsuch as zero-shot vision-language comprehension, detailed pixel-level\nunderstanding, and open vocabulary segmentation capabilities. In pursuit of the\nmost hardware-efficient backbone, we evaluated numerous architectures in our\nmulti-teacher distillation pipeline using the same training recipe. This led to\nthe development of a novel architecture (E-RADIO) that exceeds the performance\nof its predecessors and is at least 7x faster than the teacher models. Our\ncomprehensive benchmarking process covers downstream tasks including ImageNet\nclassification, ADE20k semantic segmentation, COCO object detection and\nLLaVa-1.5 framework.\n Code: https://github.com/NVlabs/RADIO\n","authors":["Mike Ranzinger","Greg Heinrich","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2312.06709v5.pdf","comment":"CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper,\n table 1 is now more comprehensive Version 2: Added more acknowledgements and\n updated table 7 with more recent results. Ensured that the link in the\n abstract to our code is working properly Version 3: Fix broken hyperlinks"},{"id":"http://arxiv.org/abs/2405.00196v1","updated":"2024-04-30T20:59:53Z","published":"2024-04-30T20:59:53Z","title":"Synthetic Image Verification in the Era of Generative AI: What Works and\n What Isn't There Yet","summary":" In this work we present an overview of approaches for the detection and\nattribution of synthetic images and highlight their strengths and weaknesses.\nWe also point out and discuss hot topics in this field and outline promising\ndirections for future research.\n","authors":["Diangarti Tariang","Riccardo Corvi","Davide Cozzolino","Giovanni Poggi","Koki Nagano","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2405.00196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04801v2","updated":"2024-04-30T20:54:08Z","published":"2024-01-09T19:52:25Z","title":"Refining Remote Photoplethysmography Architectures using CKA and\n Empirical Methods","summary":" Model architecture refinement is a challenging task in deep learning research\nfields such as remote photoplethysmography (rPPG). One architectural\nconsideration, the depth of the model, can have significant consequences on the\nresulting performance. 
In rPPG models that are overprovisioned with more layers\nthan necessary, redundancies exist, the removal of which can result in faster\ntraining and reduced computational load at inference time. With too few layers\nthe models may exhibit sub-optimal error rates. We apply Centered Kernel\nAlignment (CKA) to an array of rPPG architectures of differing depths,\ndemonstrating that shallower models do not learn the same representations as\ndeeper models, and that after a certain depth, redundant layers are added\nwithout significantly increased functionality. An empirical study confirms how\nthe architectural deficiencies discovered using CKA impact performance, and we\nshow how CKA as a diagnostic can be used to refine rPPG architectures.\n","authors":["Nathan Vance","Patrick Flynn"],"pdf_url":"https://arxiv.org/pdf/2401.04801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15378v2","updated":"2024-04-30T20:52:08Z","published":"2024-04-23T03:04:22Z","title":"Hierarchical Hybrid Sliced Wasserstein: A Scalable Metric for\n Heterogeneous Joint Distributions","summary":" Sliced Wasserstein (SW) and Generalized Sliced Wasserstein (GSW) have been\nwidely used in applications due to their computational and statistical\nscalability. However, the SW and the GSW are only defined between distributions\nsupported on a homogeneous domain. This limitation prevents their usage in\napplications with heterogeneous joint distributions with marginal distributions\nsupported on multiple different domains. Using SW and GSW directly on the joint\ndomains cannot make a meaningful comparison since their homogeneous slicing\noperator i.e., Radon Transform (RT) and Generalized Radon Transform (GRT) are\nnot expressive enough to capture the structure of the joint supports set. To\naddress the issue, we propose two new slicing operators i.e., Partial\nGeneralized Radon Transform (PGRT) and Hierarchical Hybrid Radon Transform\n(HHRT). In greater detail, PGRT is the generalization of Partial Radon\nTransform (PRT), which transforms a subset of function arguments non-linearly\nwhile HHRT is the composition of PRT and multiple domain-specific PGRT on\nmarginal domain arguments. By using HHRT, we extend the SW into Hierarchical\nHybrid Sliced Wasserstein (H2SW) distance which is designed specifically for\ncomparing heterogeneous joint distributions. We then discuss the topological,\nstatistical, and computational properties of H2SW. Finally, we demonstrate the\nfavorable performance of H2SW in 3D mesh deformation, deep 3D mesh\nautoencoders, and datasets comparison.\n","authors":["Khai Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2404.15378v2.pdf","comment":"28 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.02176v3","updated":"2024-04-30T20:33:41Z","published":"2023-06-03T19:06:06Z","title":"TransRUPNet for Improved Polyp Segmentation","summary":" Colorectal cancer is among the most common cause of cancer worldwide. Removal\nof precancerous polyps through early detection is essential to prevent them\nfrom progressing to colon cancer. We develop an advanced deep learning-based\narchitecture, Transformer based Residual Upsampling Network (TransRUPNet) for\nautomatic and real-time polyp segmentation. 
The proposed architecture,\nTransRUPNet, is an encoder-decoder network consisting of three encoder and\ndecoder blocks with additional upsampling blocks at the end of the network.\nWith the image size of $256\\times256$, the proposed method achieves an\nexcellent real-time operation speed of 47.07 frames per second with an average\nmean dice coefficient score of 0.7786 and mean Intersection over Union of\n0.7210 on the out-of-distribution polyp datasets. The results on the publicly\navailable PolypGen dataset suggest that TransRUPNet can give real-time feedback\nwhile retaining high accuracy for in-distribution datasets. Furthermore, we\ndemonstrate the generalizability of the proposed method by showing that it\nsignificantly improves performance on out-of-distribution datasets compared to\nthe existing methods. The source code of our network is available at\nhttps://github.com/DebeshJha/TransRUPNet.\n","authors":["Debesh Jha","Nikhil Kumar Tomar","Debayan Bhattacharya","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2306.02176v3.pdf","comment":"Accepted at EMBC 2024"},{"id":"http://arxiv.org/abs/2405.00187v1","updated":"2024-04-30T20:25:57Z","published":"2024-04-30T20:25:57Z","title":"Towards End-to-End Semi-Supervised Table Detection with Semantic Aligned\n Matching Transformer","summary":" Table detection within document images is a crucial task in document\nprocessing, involving the identification and localization of tables. Recent\nstrides in deep learning have substantially improved the accuracy of this task,\nbut it still heavily relies on large labeled datasets for effective training.\nSeveral semi-supervised approaches have emerged to overcome this challenge,\noften employing CNN-based detectors with anchor proposals and post-processing\ntechniques like non-maximal suppression (NMS). However, recent advancements in\nthe field have shifted the focus towards transformer-based techniques,\neliminating the need for NMS and emphasizing object queries and attention\nmechanisms. Previous research has focused on two key areas to improve\ntransformer-based detectors: refining the quality of object queries and\noptimizing attention mechanisms. However, increasing object queries can\nintroduce redundancy, while adjustments to the attention mechanism can increase\ncomplexity. To address these challenges, we introduce a semi-supervised\napproach employing SAM-DETR, a novel approach for precise alignment between\nobject queries and target features. Our approach demonstrates remarkable\nreductions in false positives and substantial enhancements in table detection\nperformance, particularly in complex documents characterized by diverse table\nstructures. This work provides more efficient and accurate table detection in\nsemi-supervised settings.\n","authors":["Tahira Shehzadi","Shalini Sarode","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2405.00187v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2307.15615v3","updated":"2024-04-30T20:13:05Z","published":"2023-07-28T15:22:34Z","title":"A survey on deep learning in medical image registration: new\n technologies, uncertainty, evaluation metrics, and beyond","summary":" Deep learning technologies have dramatically reshaped the field of medical\nimage registration over the past decade. The initial developments, such as\nregression-based and U-Net-based networks, established the foundation for deep\nlearning in image registration. 
Subsequent progress has been made in various\naspects of deep learning-based registration, including similarity measures,\ndeformation regularizations, network architectures, and uncertainty estimation.\nThese advancements have not only enriched the field of image registration but\nhave also facilitated its application in a wide range of tasks, including atlas\nconstruction, multi-atlas segmentation, motion estimation, and 2D-3D\nregistration. In this paper, we present a comprehensive overview of the most\nrecent advancements in deep learning-based image registration. We begin with a\nconcise introduction to the core concepts of deep learning-based image\nregistration. Then, we delve into innovative network architectures, loss\nfunctions specific to registration, and methods for estimating registration\nuncertainty. Additionally, this paper explores appropriate evaluation metrics\nfor assessing the performance of deep learning models in registration tasks.\nFinally, we highlight the practical applications of these novel techniques in\nmedical imaging and discuss the future prospects of deep learning-based image\nregistration.\n","authors":["Junyu Chen","Yihao Liu","Shuwen Wei","Zhangxing Bian","Shalini Subramanian","Aaron Carass","Jerry L. Prince","Yong Du"],"pdf_url":"https://arxiv.org/pdf/2307.15615v3.pdf","comment":"A list of open-sourced code from the papers reviewed has been\n organized and is available at https://bit.ly/3QgFJ9z"},{"id":"http://arxiv.org/abs/2405.00181v1","updated":"2024-04-30T20:11:49Z","published":"2024-04-30T20:11:49Z","title":"Uncovering What, Why and How: A Comprehensive Benchmark for Causation\n Understanding of Video Anomaly","summary":" Video anomaly understanding (VAU) aims to automatically comprehend unusual\noccurrences in videos, thereby enabling various applications such as traffic\nsurveillance and industrial manufacturing. While existing VAU benchmarks\nprimarily concentrate on anomaly detection and localization, our focus is on\nmore practicality, prompting us to raise the following crucial questions: \"what\nanomaly occurred?\", \"why did it happen?\", and \"how severe is this abnormal\nevent?\". In pursuit of these answers, we present a comprehensive benchmark for\nCausation Understanding of Video Anomaly (CUVA). Specifically, each instance of\nthe proposed benchmark involves three sets of human annotations to indicate the\n\"what\", \"why\" and \"how\" of an anomaly, including 1) anomaly type, start and end\ntimes, and event descriptions, 2) natural language explanations for the cause\nof an anomaly, and 3) free text reflecting the effect of the abnormality. In\naddition, we also introduce MMEval, a novel evaluation metric designed to\nbetter align with human preferences for CUVA, facilitating the measurement of\nexisting LLMs in comprehending the underlying cause and corresponding effect of\nvideo anomalies. Finally, we propose a novel prompt-based method that can serve\nas a baseline approach for the challenging CUVA. We conduct extensive\nexperiments to show the superiority of our evaluation metric and the\nprompt-based approach. 
Our code and dataset are available at\nhttps://github.com/fesvhtr/CUVA.\n","authors":["Hang Du","Sicheng Zhang","Binzhu Xie","Guoshun Nan","Jiayang Zhang","Junrui Xu","Hangyu Liu","Sicong Leng","Jiangming Liu","Hehe Fan","Dajiu Huang","Jing Feng","Linli Chen","Can Zhang","Xuhuan Li","Hao Zhang","Jianhang Chen","Qimei Cui","Xiaofeng Tao"],"pdf_url":"https://arxiv.org/pdf/2405.00181v1.pdf","comment":"Codebase: https://github.com/fesvhtr/CUVA"},{"id":"http://arxiv.org/abs/2404.17888v2","updated":"2024-04-30T20:00:36Z","published":"2024-04-27T12:53:50Z","title":"A Hybrid Approach for Document Layout Analysis in Document images","summary":" Document layout analysis involves understanding the arrangement of elements\nwithin a document. This paper navigates the complexities of understanding\nvarious elements within document images, such as text, images, tables, and\nheadings. The approach employs an advanced Transformer-based object detection\nnetwork as an innovative graphical page object detector for identifying tables,\nfigures, and displayed elements. We introduce a query encoding mechanism to\nprovide high-quality object queries for contrastive learning, enhancing\nefficiency in the decoder phase. We also present a hybrid matching scheme that\nintegrates the decoder's original one-to-one matching strategy with the\none-to-many matching strategy during the training phase. This approach aims to\nimprove the model's accuracy and versatility in detecting various graphical\nelements on a page. Our experiments on PubLayNet, DocLayNet, and PubTables\nbenchmarks show that our approach outperforms current state-of-the-art methods.\nIt achieves an average precision of 97.3% on PubLayNet, 81.6% on DocLayNet, and\n98.6 on PubTables, demonstrating its superior performance in layout analysis.\nThese advancements not only enhance the conversion of document images into\neditable and accessible formats but also streamline information retrieval and\ndata extraction processes.\n","authors":["Tahira Shehzadi","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2404.17888v2.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2405.00168v1","updated":"2024-04-30T19:37:58Z","published":"2024-04-30T19:37:58Z","title":"Revisiting RGBT Tracking Benchmarks from the Perspective of Modality\n Validity: A New Benchmark, Problem, and Method","summary":" RGBT tracking draws increasing attention due to its robustness in\nmulti-modality warranting (MMW) scenarios, such as nighttime and bad weather,\nwhere relying on a single sensing modality fails to ensure stable tracking\nresults. However, the existing benchmarks predominantly consist of videos\ncollected in common scenarios where both RGB and thermal infrared (TIR)\ninformation are of sufficient quality. This makes the data unrepresentative of\nsevere imaging conditions, leading to tracking failures in MMW scenarios. To\nbridge this gap, we present a new benchmark, MV-RGBT, captured specifically in\nMMW scenarios. In contrast with the existing datasets, MV-RGBT comprises more\nobject categories and scenes, providing a diverse and challenging benchmark.\nFurthermore, for severe imaging conditions of MMW scenarios, a new problem is\nposed, namely \\textit{when to fuse}, to stimulate the development of fusion\nstrategies for such data. We propose a new method based on a mixture of\nexperts, namely MoETrack, as a baseline fusion strategy. 
In MoETrack, each\nexpert generates independent tracking results along with the corresponding\nconfidence score, which is used to control the fusion process. Extensive\nexperimental results demonstrate the significant potential of MV-RGBT in\nadvancing RGBT tracking and elicit the conclusion that fusion is not always\nbeneficial, especially in MMW scenarios. Significantly, the proposed MoETrack\nmethod achieves new state-of-the-art results not only on MV-RGBT, but also on\nstandard benchmarks, such as RGBT234, LasHeR, and the short-term split of VTUAV\n(VTUAV-ST). More information of MV-RGBT and the source code of MoETrack will be\nreleased at https://github.com/Zhangyong-Tang/MoETrack.\n","authors":["Zhangyong Tang","Tianyang Xu","Zhenhua Feng","Xuefeng Zhu","He Wang","Pengcheng Shao","Chunyang Cheng","Xiao-Jun Wu","Muhammad Awais","Sara Atito","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2405.00168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00156v1","updated":"2024-04-30T19:06:37Z","published":"2024-04-30T19:06:37Z","title":"Expanding the Horizon: Enabling Hybrid Quantum Transfer Learning for\n Long-Tailed Chest X-Ray Classification","summary":" Quantum machine learning (QML) has the potential for improving the\nmulti-label classification of rare, albeit critical, diseases in large-scale\nchest x-ray (CXR) datasets due to theoretical quantum advantages over classical\nmachine learning (CML) in sample efficiency and generalizability. While prior\nliterature has explored QML with CXRs, it has focused on binary classification\ntasks with small datasets due to limited access to quantum hardware and\ncomputationally expensive simulations. To that end, we implemented a Jax-based\nframework that enables the simulation of medium-sized qubit architectures with\nsignificant improvements in wall-clock time over current software offerings. We\nevaluated the performance of our Jax-based framework in terms of efficiency and\nperformance for hybrid quantum transfer learning for long-tailed classification\nacross 8, 14, and 19 disease labels using large-scale CXR datasets. The\nJax-based framework resulted in up to a 58% and 95% speed-up compared to\nPyTorch and TensorFlow implementations, respectively. However, compared to CML,\nQML demonstrated slower convergence and an average AUROC of 0.70, 0.73, and\n0.74 for the classification of 8, 14, and 19 CXR disease labels. In comparison,\nthe CML models had an average AUROC of 0.77, 0.78, and 0.80 respectively. In\nconclusion, our work presents an accessible implementation of hybrid quantum\ntransfer learning for long-tailed CXR classification with a computationally\nefficient Jax-based framework.\n","authors":["Skylar Chan","Pranav Kulkarni","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2405.00156v1.pdf","comment":"11 pages, 13 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.00145v1","updated":"2024-04-30T18:42:18Z","published":"2024-04-30T18:42:18Z","title":"GUing: A Mobile GUI Search Engine using a Vision-Language Model","summary":" App developers use the Graphical User Interface (GUI) of other apps as an\nimportant source of inspiration to design and improve their own apps. In recent\nyears, research suggested various approaches to retrieve GUI designs that fit a\ncertain text query from screenshot datasets acquired through automated GUI\nexploration. 
However, such text-to-GUI retrieval approaches only leverage the\ntextual information of the GUI elements in the screenshots, neglecting visual\ninformation such as icons or background images. In addition, the retrieved\nscreenshots are not steered by app developers and often lack important app\nfeatures, e.g. whose UI pages require user authentication. To overcome these\nlimitations, this paper proposes GUing, a GUI search engine based on a\nvision-language model called UIClip, which we trained specifically for the app\nGUI domain. For this, we first collected app introduction images from Google\nPlay, which usually display the most representative screenshots selected and\noften captioned (i.e. labeled) by app vendors. Then, we developed an automated\npipeline to classify, crop, and extract the captions from these images. This\nfinally results in a large dataset which we share with this paper: including\n303k app screenshots, out of which 135k have captions. We used this dataset to\ntrain a novel vision-language model, which is, to the best of our knowledge,\nthe first of its kind in GUI retrieval. We evaluated our approach on various\ndatasets from related work and in manual experiment. The results demonstrate\nthat our model outperforms previous approaches in text-to-GUI retrieval\nachieving a Recall@10 of up to 0.69 and a HIT@10 of 0.91. We also explored the\nperformance of UIClip for other GUI tasks including GUI classification and\nSketch-to-GUI retrieval with encouraging results.\n","authors":["Jialiang Wei","Anne-Lise Courbis","Thomas Lambolais","Binbin Xu","Pierre Louis Bernard","Gérard Dray","Walid Maalej"],"pdf_url":"https://arxiv.org/pdf/2405.00145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00142v1","updated":"2024-04-30T18:39:41Z","published":"2024-04-30T18:39:41Z","title":"Utilizing Machine Learning and 3D Neuroimaging to Predict Hearing Loss:\n A Comparative Analysis of Dimensionality Reduction and Regression Techniques","summary":" In this project, we have explored machine learning approaches for predicting\nhearing loss thresholds on the brain's gray matter 3D images. We have solved\nthe problem statement in two phases. In the first phase, we used a 3D CNN model\nto reduce high-dimensional input into latent space and decode it into an\noriginal image to represent the input in rich feature space. In the second\nphase, we utilized this model to reduce input into rich features and used these\nfeatures to train standard machine learning models for predicting hearing\nthresholds. We have experimented with autoencoders and variational autoencoders\nin the first phase for dimensionality reduction and explored random forest,\nXGBoost and multi-layer perceptron for regressing the thresholds. We split the\ngiven data set into training and testing sets and achieved an 8.80 range and\n22.57 range for PT500 and PT4000 on the test set, respectively. We got the\nlowest RMSE using multi-layer perceptron among the other models.\n Our approach leverages the unique capabilities of VAEs to capture complex,\nnon-linear relationships within high-dimensional neuroimaging data. We\nrigorously evaluated the models using various metrics, focusing on the root\nmean squared error (RMSE). The results highlight the efficacy of the\nmulti-layer neural network model, which outperformed other techniques in terms\nof accuracy. 
This project advances the application of data mining in medical\ndiagnostics and enhances our understanding of age-related hearing loss through\ninnovative machine-learning frameworks.\n","authors":["Trinath Sai Subhash Reddy Pittala","Uma Maheswara R Meleti","Manasa Thatipamula"],"pdf_url":"https://arxiv.org/pdf/2405.00142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07356v2","updated":"2024-04-30T18:29:23Z","published":"2024-04-10T21:23:13Z","title":"GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic\n Microplastics Data","summary":" Microplastic particle ingestion or inhalation by humans is a problem of\ngrowing concern. Unfortunately, current research methods that use machine\nlearning to understand their potential harms are obstructed by a lack of\navailable data. Deep learning techniques in particular are challenged by such\ndomains where only small or imbalanced data sets are available. Overcoming this\nchallenge often involves oversampling underrepresented classes or augmenting\nthe existing data to improve model performance. This paper proposes GANsemble:\na two-module framework connecting data augmentation with conditional generative\nadversarial networks (cGANs) to generate class-conditioned synthetic data.\nFirst, the data chooser module automates augmentation strategy selection by\nsearching for the best data augmentation strategy. Next, the cGAN module uses\nthis strategy to train a cGAN for generating enhanced synthetic data. We\nexperiment with the GANsemble framework on a small and imbalanced microplastics\ndata set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines\nfor synthetic microplastics (SYMP) data are established in terms of Frechet\nInception Distance (FID) and Inception Scores (IS). We also provide a synthetic\nmicroplastics filter (SYMP-Filter) algorithm to increase the quality of\ngenerated SYMP. Additionally, we show the best amount of oversampling with\naugmentation to fix class imbalance in small microplastics data sets. To our\nknowledge, this study is the first application of generative AI to\nsynthetically create microplastics data.\n","authors":["Daniel Platnick","Sourena Khanzadeh","Alireza Sadeghian","Richard Anthony Valenzano"],"pdf_url":"https://arxiv.org/pdf/2404.07356v2.pdf","comment":"Accepted to the 37th Canadian Artificial Intelligence Conference\n (2024), 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.00130v1","updated":"2024-04-30T18:28:09Z","published":"2024-04-30T18:28:09Z","title":"A Flexible 2.5D Medical Image Segmentation Approach with In-Slice and\n Cross-Slice Attention","summary":" Deep learning has become the de facto method for medical image segmentation,\nwith 3D segmentation models excelling in capturing complex 3D structures and 2D\nmodels offering high computational efficiency. However, segmenting 2.5D images,\nwhich have high in-plane but low through-plane resolution, is a relatively\nunexplored challenge. While applying 2D models to individual slices of a 2.5D\nimage is feasible, it fails to capture the spatial relationships between\nslices. On the other hand, 3D models face challenges such as resolution\ninconsistencies in 2.5D images, along with computational complexity and\nsusceptibility to overfitting when trained with limited data. In this context,\n2.5D models, which capture inter-slice correlations using only 2D neural\nnetworks, emerge as a promising solution due to their reduced computational\ndemand and simplicity in implementation. 
In this paper, we introduce CSA-Net, a\nflexible 2.5D segmentation model capable of processing 2.5D images with an\narbitrary number of slices through an innovative Cross-Slice Attention (CSA)\nmodule. This module uses the cross-slice attention mechanism to effectively\ncapture 3D spatial information by learning long-range dependencies between the\ncenter slice (for segmentation) and its neighboring slices. Moreover, CSA-Net\nutilizes the self-attention mechanism to understand correlations among pixels\nwithin the center slice. We evaluated CSA-Net on three 2.5D segmentation tasks:\n(1) multi-class brain MRI segmentation, (2) binary prostate MRI segmentation,\nand (3) multi-class prostate MRI segmentation. CSA-Net outperformed leading 2D\nand 2.5D segmentation methods across all three tasks, demonstrating its\nefficacy and superiority. Our code is publicly available at\nhttps://github.com/mirthAI/CSA-Net.\n","authors":["Amarjeet Kumar","Hongxu Jiang","Muhammad Imran","Cyndi Valdes","Gabriela Leon","Dahyun Kang","Parvathi Nataraj","Yuyin Zhou","Michael D. Weiss","Wei Shao"],"pdf_url":"https://arxiv.org/pdf/2405.00130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02608v2","updated":"2024-04-30T18:10:32Z","published":"2023-01-06T17:10:32Z","title":"An interpretable machine learning system for colorectal cancer diagnosis\n from pathology slides","summary":" Considering the profound transformation affecting pathology practice, we\naimed to develop a scalable artificial intelligence (AI) system to diagnose\ncolorectal cancer from whole-slide images (WSI). For this, we propose a deep\nlearning (DL) system that learns from weak labels, a sampling strategy that\nreduces the number of training samples by a factor of six without compromising\nperformance, an approach to leverage a small subset of fully annotated samples,\nand a prototype with explainable predictions, active learning features and\nparallelisation. Noting some problems in the literature, this study is\nconducted with one of the largest WSI colorectal samples dataset with\napproximately 10,500 WSIs. Of these samples, 900 are testing samples.\nFurthermore, the robustness of the proposed method is assessed with two\nadditional external datasets (TCGA and PAIP) and a dataset of samples collected\ndirectly from the proposed prototype. Our proposed method predicts, for the\npatch-based tiles, a class based on the severity of the dysplasia and uses that\ninformation to classify the whole slide. It is trained with an interpretable\nmixed-supervision scheme to leverage the domain knowledge introduced by\npathologists through spatial annotations. The mixed-supervision scheme allowed\nfor an intelligent sampling strategy effectively evaluated in several different\nscenarios without compromising the performance. On the internal dataset, the\nmethod shows an accuracy of 93.44% and a sensitivity between positive\n(low-grade and high-grade dysplasia) and non-neoplastic samples of 0.996. On\nthe external test samples varied with TCGA being the most challenging dataset\nwith an overall accuracy of 84.91% and a sensitivity of 0.996.\n","authors":["Pedro C. Neto","Diana Montezuma","Sara P. Oliveira","Domingos Oliveira","João Fraga","Ana Monteiro","João Monteiro","Liliana Ribeiro","Sofia Gonçalves","Stefan Reinhard","Inti Zlobec","Isabel M. Pinto","Jaime S. Cardoso"],"pdf_url":"https://arxiv.org/pdf/2301.02608v2.pdf","comment":"Accepted at npj Precision Oncology. 
Available at:\n https://www.nature.com/articles/s41698-024-00539-4"},{"id":"http://arxiv.org/abs/2405.00117v1","updated":"2024-04-30T18:08:08Z","published":"2024-04-30T18:08:08Z","title":"Training a high-performance retinal foundation model with half-the-data\n and 400 times less compute","summary":" Artificial Intelligence holds tremendous potential in medicine, but is\ntraditionally limited by the lack of massive datasets to train models on.\nFoundation models, pre-trained models that can be adapted to downstream tasks\nwith small datasets, could alleviate this problem. Researchers at Moorfields\nEye Hospital (MEH) proposed RETFound-MEH, a foundation model for retinal\nimaging that was trained on 900,000 images, including private hospital data.\nRecently, data-efficient DERETFound was proposed that provides comparable\nperformance while being trained on only 150,000 images that are all publicly\navailable. However, both these models required very substantial resources to\ntrain initially and are resource-intensive in downstream use. We propose a\nnovel Token Reconstruction objective that we use to train RETFound-Green, a\nretinal foundation model trained using only 75,000 publicly available images\nand 400 times less compute. We estimate the cost of training RETFound-MEH and\nDERETFound at $10,000 and $14,000, respectively, while RETFound-Green could be\ntrained for less than $100, with equally reduced environmental impact.\nRETFound-Green is also far more efficient in downstream use: it can be\ndownloaded 14 times faster, computes vector embeddings 2.7 times faster which\nthen require 2.6 times less storage space. Despite this, RETFound-Green does\nnot perform systematically worse. In fact, it performs best on 14 tasks,\ncompared to six for DERETFound and two for RETFound-MEH. Our results suggest\nthat RETFound-Green is a very efficient, high-performance retinal foundation\nmodel. We anticipate that our Token Reconstruction objective could be scaled up\nfor even higher performance and be applied to other domains beyond retinal\nimaging.\n","authors":["Justin Engelmann","Miguel O. Bernabeu"],"pdf_url":"https://arxiv.org/pdf/2405.00117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18253v2","updated":"2024-04-30T18:02:36Z","published":"2024-04-28T17:20:08Z","title":"Efficient Remote Sensing with Harmonized Transfer Learning and Modality\n Alignment","summary":" With the rise of Visual and Language Pretraining (VLP), an increasing number\nof downstream tasks are adopting the paradigm of pretraining followed by\nfine-tuning. Although this paradigm has demonstrated potential in various\nmultimodal downstream tasks, its implementation in the remote sensing domain\nencounters some obstacles. Specifically, the tendency for same-modality\nembeddings to cluster together impedes efficient transfer learning. To tackle\nthis issue, we review the aim of multimodal transfer learning for downstream\ntasks from a unified perspective, and rethink the optimization process based on\nthree distinct objectives. We propose \"Harmonized Transfer Learning and\nModality Alignment (HarMA)\", a method that simultaneously satisfies task\nconstraints, modality alignment, and single-modality uniform alignment, while\nminimizing training overhead through parameter-efficient fine-tuning.\nRemarkably, without the need for external data for training, HarMA achieves\nstate-of-the-art performance in two popular multimodal retrieval tasks in the\nfield of remote sensing. 
Our experiments reveal that HarMA achieves competitive\nand even superior performance to fully fine-tuned models with only minimal\nadjustable parameters. Due to its simplicity, HarMA can be integrated into\nalmost all existing multimodal pretraining models. We hope this method can\nfacilitate the efficient application of large models to a wide range of\ndownstream tasks while significantly reducing the resource consumption. Code is\navailable at https://github.com/seekerhuang/HarMA.\n","authors":["Tengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.18253v2.pdf","comment":"Accepted by the Twelfth International Conference on Learning\n Representations (ICLR) Workshop"},{"id":"http://arxiv.org/abs/2405.00740v1","updated":"2024-04-30T01:19:18Z","published":"2024-04-30T01:19:18Z","title":"Modeling Caption Diversity in Contrastive Vision-Language Pretraining","summary":" There are a thousand ways to caption an image. Contrastive Language\nPretraining (CLIP) on the other hand, works by mapping an image and its caption\nto a single vector -- limiting how well CLIP-like models can represent the\ndiverse ways to describe an image. In this work, we introduce Llip, Latent\nLanguage Image Pretraining, which models the diversity of captions that could\nmatch an image. Llip's vision encoder outputs a set of visual features that are\nmixed into a final representation by conditioning on information derived from\nthe text. We show that Llip outperforms non-contextualized baselines like CLIP\nand SigLIP on a variety of tasks even with large-scale encoders. Llip improves\nzero-shot classification by an average of 2.9% zero-shot classification\nbenchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot\ntop-1 accuracy of 83.5% on ImageNet outperforming a similarly sized CLIP by\n1.4%. We also demonstrate improvement on zero-shot retrieval on MS-COCO by\n6.0%. We provide a comprehensive analysis of the components introduced by the\nmethod and demonstrate that Llip leads to richer visual representations.\n","authors":["Samuel Lavoie","Polina Kirichenko","Mark Ibrahim","Mahmoud Assran","Andrew Gordon Wildon","Aaron Courville","Nicolas Ballas"],"pdf_url":"https://arxiv.org/pdf/2405.00740v1.pdf","comment":"14 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2405.00739v1","updated":"2024-04-30T01:12:32Z","published":"2024-04-30T01:12:32Z","title":"Why does Knowledge Distillation Work? Rethink its Attention and Fidelity\n Mechanism","summary":" Does Knowledge Distillation (KD) really work? Conventional wisdom viewed it\nas a knowledge transfer procedure where a perfect mimicry of the student to its\nteacher is desired. However, paradoxical studies indicate that closely\nreplicating the teacher's behavior does not consistently improve student\ngeneralization, posing questions on its possible causes. Confronted with this\ngap, we hypothesize that diverse attentions in teachers contribute to better\nstudent generalization at the expense of reduced fidelity in ensemble KD\nsetups. By increasing data augmentation strengths, our key findings reveal a\ndecrease in the Intersection over Union (IoU) of attentions between teacher\nmodels, leading to reduced student overfitting and decreased fidelity. We\npropose this low-fidelity phenomenon as an underlying characteristic rather\nthan a pathology when training KD. 
This suggests that stronger data\naugmentation fosters a broader perspective provided by the divergent teacher\nensemble and lower student-teacher mutual information, benefiting\ngeneralization performance. These insights clarify the mechanism on\nlow-fidelity phenomenon in KD. Thus, we offer new perspectives on optimizing\nstudent model performance, by emphasizing increased diversity in teacher\nattentions and reduced mimicry behavior between teachers and student.\n","authors":["Chenqi Guo","Shiwei Zhong","Xiaofeng Liu","Qianli Feng","Yinglong Ma"],"pdf_url":"https://arxiv.org/pdf/2405.00739v1.pdf","comment":null}]},"2024-05-01T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.00676v1","updated":"2024-05-01T17:59:45Z","published":"2024-05-01T17:59:45Z","title":"Spectrally Pruned Gaussian Fields with Neural Compensation","summary":" Recently, 3D Gaussian Splatting, as a novel 3D representation, has garnered\nattention for its fast rendering speed and high rendering quality. However,\nthis comes with high memory consumption, e.g., a well-trained Gaussian field\nmay utilize three million Gaussian primitives and over 700 MB of memory. We\ncredit this high memory footprint to the lack of consideration for the\nrelationship between primitives. In this paper, we propose a memory-efficient\nGaussian field named SUNDAE with spectral pruning and neural compensation. On\none hand, we construct a graph on the set of Gaussian primitives to model their\nrelationship and design a spectral down-sampling module to prune out primitives\nwhile preserving desired signals. On the other hand, to compensate for the\nquality loss of pruning Gaussians, we exploit a lightweight neural network head\nto mix splatted features, which effectively compensates for quality losses\nwhile capturing the relationship between primitives in its weights. We\ndemonstrate the performance of SUNDAE with extensive results. For example,\nSUNDAE can achieve 26.80 PSNR at 145 FPS using 104 MB memory while the vanilla\nGaussian splatting algorithm achieves 25.60 PSNR at 160 FPS using 523 MB\nmemory, on the Mip-NeRF360 dataset. Codes are publicly available at\nhttps://runyiyang.github.io/projects/SUNDAE/.\n","authors":["Runyi Yang","Zhenxin Zhu","Zhou Jiang","Baijun Ye","Xiaoxue Chen","Yifei Zhang","Yuantao Chen","Jian Zhao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.00676v1.pdf","comment":"Code: https://github.com/RunyiYang/SUNDAE Project page:\n https://runyiyang.github.io/projects/SUNDAE/"},{"id":"http://arxiv.org/abs/2405.00672v1","updated":"2024-05-01T17:57:21Z","published":"2024-05-01T17:57:21Z","title":"TexSliders: Diffusion-Based Texture Editing in CLIP Space","summary":" Generative models have enabled intuitive image creation and manipulation\nusing natural language. In particular, diffusion models have recently shown\nremarkable results for natural image editing. In this work, we propose to apply\ndiffusion techniques to edit textures, a specific class of images that are an\nessential part of 3D content creation pipelines. We analyze existing editing\nmethods and show that they are not directly applicable to textures, since their\ncommon underlying approach, manipulating attention maps, is unsuitable for the\ntexture domain. To address this, we propose a novel approach that instead\nmanipulates CLIP image embeddings to condition the diffusion generation. 
We\ndefine editing directions using simple text prompts (e.g., \"aged wood\" to \"new\nwood\") and map these to CLIP image embedding space using a texture prior, with\na sampling-based approach that gives us identity-preserving directions in CLIP\nspace. To further improve identity preservation, we project these directions to\na CLIP subspace that minimizes identity variations resulting from entangled\ntexture attributes. Our editing pipeline facilitates the creation of arbitrary\nsliders using natural language prompts only, with no ground-truth annotated\ndata necessary.\n","authors":["Julia Guerrero-Viu","Milos Hasan","Arthur Roullier","Midhun Harikumar","Yiwei Hu","Paul Guerrero","Diego Gutierrez","Belen Masia","Valentin Deschaintre"],"pdf_url":"https://arxiv.org/pdf/2405.00672v1.pdf","comment":"SIGGRAPH 2024 Conference Proceedings"},{"id":"http://arxiv.org/abs/2405.00670v1","updated":"2024-05-01T17:57:12Z","published":"2024-05-01T17:57:12Z","title":"Adapting Pretrained Networks for Image Quality Assessment on High\n Dynamic Range Displays","summary":" Conventional image quality metrics (IQMs), such as PSNR and SSIM, are\ndesigned for perceptually uniform gamma-encoded pixel values and cannot be\ndirectly applied to perceptually non-uniform linear high-dynamic-range (HDR)\ncolors. Similarly, most of the available datasets consist of\nstandard-dynamic-range (SDR) images collected in standard and possibly\nuncontrolled viewing conditions. Popular pre-trained neural networks are\nlikewise intended for SDR inputs, restricting their direct application to HDR\ncontent. On the other hand, training HDR models from scratch is challenging due\nto limited available HDR data. In this work, we explore more effective\napproaches for training deep learning-based models for image quality assessment\n(IQA) on HDR data. We leverage networks pre-trained on SDR data (source domain)\nand re-target these models to HDR (target domain) with additional fine-tuning\nand domain adaptation. We validate our methods on the available HDR IQA\ndatasets, demonstrating that models trained with our combined recipe outperform\nprevious baselines, converge much quicker, and reliably generalize to HDR\ninputs.\n","authors":["Andrei Chubarau","Hyunjin Yoo","Tara Akhavan","James Clark"],"pdf_url":"https://arxiv.org/pdf/2405.00670v1.pdf","comment":"7 pages, 3 figures, 3 tables. Submitted to Human Vision and\n Electronic Imaging 2024 (HVEI)"},{"id":"http://arxiv.org/abs/2405.00666v1","updated":"2024-05-01T17:54:05Z","published":"2024-05-01T17:54:05Z","title":"RGB$\\leftrightarrow$X: Image decomposition and synthesis using material-\n and lighting-aware diffusion models","summary":" The three areas of realistic forward rendering, per-pixel inverse rendering,\nand generative image synthesis may seem like separate and unrelated sub-fields\nof graphics and vision. However, recent work has demonstrated improved\nestimation of per-pixel intrinsic channels (albedo, roughness, metallicity)\nbased on a diffusion architecture; we call this the RGB$\\rightarrow$X problem.\nWe further show that the reverse problem of synthesizing realistic images given\nintrinsic channels, X$\\rightarrow$RGB, can also be addressed in a diffusion\nframework.\n Focusing on the image domain of interior scenes, we introduce an improved\ndiffusion model for RGB$\\rightarrow$X, which also estimates lighting, as well\nas the first diffusion X$\\rightarrow$RGB model capable of synthesizing\nrealistic images from (full or partial) intrinsic channels. 
Our\nX$\\rightarrow$RGB model explores a middle ground between traditional rendering\nand generative models: we can specify only certain appearance properties that\nshould be followed, and give freedom to the model to hallucinate a plausible\nversion of the rest.\n This flexibility makes it possible to use a mix of heterogeneous training\ndatasets, which differ in the available channels. We use multiple existing\ndatasets and extend them with our own synthetic and real data, resulting in a\nmodel capable of extracting scene properties better than previous work and of\ngenerating highly realistic images of interior scenes.\n","authors":["Zheng Zeng","Valentin Deschaintre","Iliyan Georgiev","Yannick Hold-Geoffroy","Yiwei Hu","Fujun Luan","Ling-Qi Yan","Miloš Hašan"],"pdf_url":"https://arxiv.org/pdf/2405.00666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00650v1","updated":"2024-05-01T17:27:11Z","published":"2024-05-01T17:27:11Z","title":"Grains of Saliency: Optimizing Saliency-based Training of Biometric\n Attack Detection Models","summary":" Incorporating human-perceptual intelligence into model training has shown to\nincrease the generalization capability of models in several difficult biometric\ntasks, such as presentation attack detection (PAD) and detection of synthetic\nsamples. After the initial collection phase, human visual saliency (e.g.,\neye-tracking data, or handwritten annotations) can be integrated into model\ntraining through attention mechanisms, augmented training samples, or through\nhuman perception-related components of loss functions. Despite their successes,\na vital, but seemingly neglected, aspect of any saliency-based training is the\nlevel of salience granularity (e.g., bounding boxes, single saliency maps, or\nsaliency aggregated from multiple subjects) necessary to find a balance between\nreaping the full benefits of human saliency and the cost of its collection. In\nthis paper, we explore several different levels of salience granularity and\ndemonstrate that increased generalization capabilities of PAD and synthetic\nface detection can be achieved by using simple yet effective saliency\npost-processing techniques across several different CNNs.\n","authors":["Colton R. Crum","Samuel Webster","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2405.00650v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.13646v2","updated":"2024-05-01T17:25:52Z","published":"2023-08-25T19:34:21Z","title":"GRASP: A Rehearsal Policy for Efficient Online Continual Learning","summary":" Continual learning (CL) in deep neural networks (DNNs) involves incrementally\naccumulating knowledge in a DNN from a growing data stream. A major challenge\nin CL is that non-stationary data streams cause catastrophic forgetting of\npreviously learned abilities. A popular solution is rehearsal: storing past\nobservations in a buffer and then sampling the buffer to update the DNN.\nUniform sampling in a class-balanced manner is highly effective, and better\nsample selection policies have been elusive. Here, we propose a new sample\nselection policy called GRASP that selects the most prototypical (easy) samples\nfirst and then gradually selects less prototypical (harder) examples. GRASP has\nlittle additional compute or memory overhead compared to uniform selection,\nenabling it to scale to large datasets. 
Compared to 17 other rehearsal\npolicies, GRASP achieves higher accuracy in CL experiments on ImageNet.\nCompared to uniform balanced sampling, GRASP achieves the same performance with\n40% fewer updates. We also show that GRASP is effective for CL on five text\nclassification datasets.\n","authors":["Md Yousuf Harun","Jhair Gallardo","Junyu Chen","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2308.13646v2.pdf","comment":"Accepted to the Conference on Lifelong Learning Agents (CoLLAs) 2024"},{"id":"http://arxiv.org/abs/2405.00646v1","updated":"2024-05-01T17:21:36Z","published":"2024-05-01T17:21:36Z","title":"Learning to Compose: Improving Object Centric Learning by Injecting\n Compositionality","summary":" Learning compositional representation is a key aspect of object-centric\nlearning as it enables flexible systematic generalization and supports complex\nvisual reasoning. However, most of the existing approaches rely on\nauto-encoding objective, while the compositionality is implicitly imposed by\nthe architectural or algorithmic bias in the encoder. This misalignment between\nauto-encoding objective and learning compositionality often results in failure\nof capturing meaningful object representations. In this study, we propose a\nnovel objective that explicitly encourages compositionality of the\nrepresentations. Built upon the existing object-centric learning framework\n(e.g., slot attention), our method incorporates additional constraints that an\narbitrary mixture of object representations from two images should be valid by\nmaximizing the likelihood of the composite data. We demonstrate that\nincorporating our objective to the existing framework consistently improves the\nobjective-centric learning and enhances the robustness to the architectural\nchoices.\n","authors":["Whie Jung","Jaehoon Yoo","Sungjin Ahn","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2405.00646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18416v2","updated":"2024-05-01T17:12:10Z","published":"2024-04-29T04:11:28Z","title":"Capabilities of Gemini Models in Medicine","summary":" Excellence in a wide variety of medical applications poses considerable\nchallenges for AI, requiring advanced reasoning, access to up-to-date medical\nknowledge and understanding of complex multimodal data. Gemini models, with\nstrong general capabilities in multimodal and long-context reasoning, offer\nexciting possibilities in medicine. Building on these core strengths of Gemini,\nwe introduce Med-Gemini, a family of highly capable multimodal models that are\nspecialized in medicine with the ability to seamlessly use web search, and that\ncan be efficiently tailored to novel modalities using custom encoders. We\nevaluate Med-Gemini on 14 medical benchmarks, establishing new state-of-the-art\n(SoTA) performance on 10 of them, and surpass the GPT-4 model family on every\nbenchmark where a direct comparison is viable, often by a wide margin. On the\npopular MedQA (USMLE) benchmark, our best-performing Med-Gemini model achieves\nSoTA performance of 91.1% accuracy, using a novel uncertainty-guided search\nstrategy. On 7 multimodal benchmarks including NEJM Image Challenges and MMMU\n(health & medicine), Med-Gemini improves over GPT-4V by an average relative\nmargin of 44.5%. 
We demonstrate the effectiveness of Med-Gemini's long-context\ncapabilities through SoTA performance on a needle-in-a-haystack retrieval task\nfrom long de-identified health records and medical video question answering,\nsurpassing prior bespoke methods using only in-context learning. Finally,\nMed-Gemini's performance suggests real-world utility by surpassing human\nexperts on tasks such as medical text summarization, alongside demonstrations\nof promising potential for multimodal medical dialogue, medical research and\neducation. Taken together, our results offer compelling evidence for\nMed-Gemini's potential, although further rigorous evaluation will be crucial\nbefore real-world deployment in this safety-critical domain.\n","authors":["Khaled Saab","Tao Tu","Wei-Hung Weng","Ryutaro Tanno","David Stutz","Ellery Wulczyn","Fan Zhang","Tim Strother","Chunjong Park","Elahe Vedadi","Juanma Zambrano Chaves","Szu-Yeu Hu","Mike Schaekermann","Aishwarya Kamath","Yong Cheng","David G. T. Barrett","Cathy Cheung","Basil Mustafa","Anil Palepu","Daniel McDuff","Le Hou","Tomer Golany","Luyang Liu","Jean-baptiste Alayrac","Neil Houlsby","Nenad Tomasev","Jan Freyberg","Charles Lau","Jonas Kemp","Jeremy Lai","Shekoofeh Azizi","Kimberly Kanada","SiWai Man","Kavita Kulkarni","Ruoxi Sun","Siamak Shakeri","Luheng He","Ben Caine","Albert Webson","Natasha Latysheva","Melvin Johnson","Philip Mansfield","Jian Lu","Ehud Rivlin","Jesper Anderson","Bradley Green","Renee Wong","Jonathan Krause","Jonathon Shlens","Ewa Dominowska","S. M. Ali Eslami","Katherine Chou","Claire Cui","Oriol Vinyals","Koray Kavukcuoglu","James Manyika","Jeff Dean","Demis Hassabis","Yossi Matias","Dale Webster","Joelle Barral","Greg Corrado","Christopher Semturs","S. Sara Mahdavi","Juraj Gottweis","Alan Karthikesalingam","Vivek Natarajan"],"pdf_url":"https://arxiv.org/pdf/2404.18416v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00631v1","updated":"2024-05-01T16:58:22Z","published":"2024-05-01T16:58:22Z","title":"Deep Metric Learning-Based Out-of-Distribution Detection with Synthetic\n Outlier Exposure","summary":" In this paper, we present a novel approach that combines deep metric learning\nand synthetic data generation using diffusion models for out-of-distribution\n(OOD) detection. One popular approach for OOD detection is outlier exposure,\nwhere models are trained using a mixture of in-distribution (ID) samples and\n``seen\" OOD samples. For the OOD samples, the model is trained to minimize the\nKL divergence between the output probability and the uniform distribution while\ncorrectly classifying the in-distribution (ID) data. In this paper, we propose\na label-mixup approach to generate synthetic OOD data using Denoising Diffusion\nProbabilistic Models (DDPMs). Additionally, we explore recent advancements in\nmetric learning to train our models.\n In the experiments, we found that metric learning-based loss functions\nperform better than the softmax. Furthermore, the baseline models (including\nsoftmax, and metric learning) show a significant improvement when trained with\nthe generated OOD data. 
Our approach outperforms strong baselines in\nconventional OOD detection metrics.\n","authors":["Assefa Seyoum Wahd"],"pdf_url":"https://arxiv.org/pdf/2405.00631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00630v1","updated":"2024-05-01T16:55:08Z","published":"2024-05-01T16:55:08Z","title":"Depth Priors in Removal Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have shown impressive results in 3D\nreconstruction and generating novel views. A key challenge within NeRF is the\nediting of reconstructed scenes, such as object removal, which requires\nmaintaining consistency across multiple views and ensuring high-quality\nsynthesised perspectives. Previous studies have incorporated depth priors,\ntypically from LiDAR or sparse depth measurements provided by COLMAP, to\nimprove the performance of object removal in NeRF. However, these methods are\neither costly or time-consuming. In this paper, we propose a novel approach\nthat integrates monocular depth estimates with NeRF-based object removal models\nto significantly reduce time consumption and enhance the robustness and quality\nof scene generation and object removal. We conducted a thorough evaluation of\nCOLMAP's dense depth reconstruction on the KITTI dataset to verify its accuracy\nin depth map generation. Our findings suggest that COLMAP can serve as an\neffective alternative to a ground truth depth map where such information is\nmissing or costly to obtain. Additionally, we integrated various monocular\ndepth estimation methods into the removal NeRF model, i.e., SpinNeRF, to assess\ntheir capacity to improve object removal performance. Our experimental results\nhighlight the potential of monocular depth estimation to substantially improve\nNeRF applications.\n","authors":["Zhihao Guo","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.00630v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2402.04829v2","updated":"2024-05-01T16:50:48Z","published":"2024-02-07T13:25:16Z","title":"NeRF as a Non-Distant Environment Emitter in Physics-based Inverse\n Rendering","summary":" Physics-based inverse rendering enables joint optimization of shape,\nmaterial, and lighting based on captured 2D images. To ensure accurate\nreconstruction, using a light model that closely resembles the captured\nenvironment is essential. Although the widely adopted distant environmental\nlighting model is adequate in many cases, we demonstrate that its inability to\ncapture spatially varying illumination can lead to inaccurate reconstructions\nin many real-world inverse rendering scenarios. To address this limitation, we\nincorporate NeRF as a non-distant environment emitter into the inverse\nrendering pipeline. Additionally, we introduce an emitter importance sampling\ntechnique for NeRF to reduce the rendering variance. Through comparisons on\nboth real and synthetic datasets, our results demonstrate that our NeRF-based\nemitter offers a more precise representation of scene lighting, thereby\nimproving the accuracy of inverse rendering.\n","authors":["Jingwang Ling","Ruihan Yu","Feng Xu","Chun Du","Shuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.04829v2.pdf","comment":"SIGGRAPH 2024. 
Project page and video:\n https://nerfemitterpbir.github.io/"},{"id":"http://arxiv.org/abs/2405.00620v1","updated":"2024-05-01T16:40:15Z","published":"2024-05-01T16:40:15Z","title":"Lane Segmentation Refinement with Diffusion Models","summary":" The lane graph is a key component for building high-definition (HD) maps and\ncrucial for downstream tasks such as autonomous driving or navigation planning.\nPreviously, He et al. (2022) explored the extraction of the lane-level graph\nfrom aerial imagery utilizing a segmentation based approach. However,\nsegmentation networks struggle to achieve perfect segmentation masks resulting\nin inaccurate lane graph extraction. We explore additional enhancements to\nrefine this segmentation-based approach and extend it with a diffusion\nprobabilistic model (DPM) component. This combination further improves the GEO\nF1 and TOPO F1 scores, which are crucial indicators of the quality of a lane\ngraph, in the undirected graph in non-intersection areas. We conduct\nexperiments on a publicly available dataset, demonstrating that our method\noutperforms the previous approach, particularly in enhancing the connectivity\nof such a graph, as measured by the TOPO F1 score. Moreover, we perform\nablation studies on the individual components of our method to understand their\ncontribution and evaluate their effectiveness.\n","authors":["Antonio Ruiz","Andrew Melnik","Dong Wang","Helge Ritter"],"pdf_url":"https://arxiv.org/pdf/2405.00620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00604v1","updated":"2024-05-01T16:17:39Z","published":"2024-05-01T16:17:39Z","title":"A Preprocessing and Evaluation Toolbox for Trajectory Prediction\n Research on the Drone Datasets","summary":" The availability of high-quality datasets is crucial for the development of\nbehavior prediction algorithms in autonomous vehicles. This paper highlights\nthe need for standardizing the use of certain datasets for motion forecasting\nresearch to simplify comparative analysis and proposes a set of tools and\npractices to achieve this. Drawing on extensive experience and a comprehensive\nreview of current literature, we summarize our proposals for preprocessing,\nvisualizing, and evaluation in the form of an open-sourced toolbox designed for\nresearchers working on trajectory prediction problems. The clear specification\nof necessary preprocessing steps and evaluation metrics is intended to\nalleviate development efforts and facilitate the comparison of results across\ndifferent studies. The toolbox is available at:\nhttps://github.com/westny/dronalize.\n","authors":["Theodor Westny","Björn Olofsson","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2405.00604v1.pdf","comment":"https://github.com/westny/dronalize"},{"id":"http://arxiv.org/abs/2404.01094v2","updated":"2024-05-01T16:12:54Z","published":"2024-04-01T12:59:49Z","title":"HairFastGAN: Realistic and Robust Hair Transfer with a Fast\n Encoder-Based Approach","summary":" Our paper addresses the complex task of transferring a hairstyle from a\nreference image to an input photo for virtual hair try-on. This task is\nchallenging due to the need to adapt to various photo poses, the sensitivity of\nhairstyles, and the lack of objective metrics. The current state of the art\nhairstyle transfer methods use an optimization process for different parts of\nthe approach, making them inexcusably slow. 
At the same time, faster\nencoder-based models are of very low quality because they either operate in\nStyleGAN's W+ space or use other low-dimensional image generators.\nAdditionally, both approaches have a problem with hairstyle transfer when the\nsource pose is very different from the target pose, because they either don't\nconsider the pose at all or deal with it inefficiently. In our paper, we\npresent the HairFast model, which uniquely solves these problems and achieves\nhigh resolution, near real-time performance, and superior reconstruction\ncompared to optimization problem-based methods. Our solution includes a new\narchitecture operating in the FS latent space of StyleGAN, an enhanced\ninpainting approach, and improved encoders for better alignment, color\ntransfer, and a new encoder for post-processing. The effectiveness of our\napproach is demonstrated on realism metrics after random hairstyle transfer and\nreconstruction when the original hairstyle is transferred. In the most\ndifficult scenario of transferring both shape and color of a hairstyle from\ndifferent images, our method performs in less than a second on the Nvidia V100.\nOur code is available at https://github.com/AIRI-Institute/HairFastGAN.\n","authors":["Maxim Nikolaev","Mikhail Kuznetsov","Dmitry Vetrov","Aibek Alanov"],"pdf_url":"https://arxiv.org/pdf/2404.01094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05452v3","updated":"2024-05-01T15:58:52Z","published":"2024-03-08T16:57:54Z","title":"The R2D2 deep neural network series paradigm for fast precision imaging\n in radio astronomy","summary":" Radio-interferometric (RI) imaging entails solving high-resolution\nhigh-dynamic range inverse problems from large data volumes. Recent image\nreconstruction techniques grounded in optimization theory have demonstrated\nremarkable capability for imaging precision, well beyond CLEAN's capability.\nThese range from advanced proximal algorithms propelled by handcrafted\nregularization operators, such as the SARA family, to hybrid plug-and-play\n(PnP) algorithms propelled by learned regularization denoisers, such as AIRI.\nOptimization and PnP structures are however highly iterative, which hinders\ntheir ability to handle the extreme data sizes expected from future\ninstruments. To address this scalability challenge, we introduce a novel deep\nlearning approach, dubbed \"Residual-to-Residual DNN series for high-Dynamic\nrange imaging\". R2D2's reconstruction is formed as a series of residual images,\niteratively estimated as outputs of Deep Neural Networks (DNNs) taking the\nprevious iteration's image estimate and associated data residual as inputs. It\nthus takes a hybrid structure between a PnP algorithm and a learned version of\nthe matching pursuit algorithm that underpins CLEAN. We present a comprehensive\nstudy of our approach, featuring its multiple incarnations distinguished by\ntheir DNN architectures. We provide a detailed description of its training\nprocess, targeting a telescope-specific approach. R2D2's capability to deliver\nhigh precision is demonstrated in simulation, across a variety of image and\nobservation settings using the Very Large Array (VLA). 
Its reconstruction speed\nis also demonstrated: with only few iterations required to clean data residuals\nat dynamic ranges up to 100000, R2D2 opens the door to fast precision imaging.\nR2D2 codes are available in the BASPLib library on GitHub.\n","authors":["Amir Aghabiglou","Chung San Chu","Arwa Dabbech","Yves Wiaux"],"pdf_url":"https://arxiv.org/pdf/2403.05452v3.pdf","comment":"Accepted for publication in ApJS"},{"id":"http://arxiv.org/abs/2310.12153v2","updated":"2024-05-01T15:54:34Z","published":"2023-10-18T17:59:45Z","title":"Probabilistic Sampling of Balanced K-Means using Adiabatic Quantum\n Computing","summary":" Adiabatic quantum computing (AQC) is a promising approach for discrete and\noften NP-hard optimization problems. Current AQCs allow to implement problems\nof research interest, which has sparked the development of quantum\nrepresentations for many computer vision tasks. Despite requiring multiple\nmeasurements from the noisy AQC, current approaches only utilize the best\nmeasurement, discarding information contained in the remaining ones. In this\nwork, we explore the potential of using this information for probabilistic\nbalanced k-means clustering. Instead of discarding non-optimal solutions, we\npropose to use them to compute calibrated posterior probabilities with little\nadditional compute cost. This allows us to identify ambiguous solutions and\ndata points, which we demonstrate on a D-Wave AQC on synthetic tasks and real\nvisual data.\n","authors":["Jan-Nico Zaech","Martin Danelljan","Tolga Birdal","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2310.12153v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2405.00588v1","updated":"2024-05-01T15:51:15Z","published":"2024-05-01T15:51:15Z","title":"Are Models Biased on Text without Gender-related Language?","summary":" Gender bias research has been pivotal in revealing undesirable behaviors in\nlarge language models, exposing serious gender stereotypes associated with\noccupations, and emotions. A key observation in prior work is that models\nreinforce stereotypes as a consequence of the gendered correlations that are\npresent in the training data. In this paper, we focus on bias where the effect\nfrom training data is unclear, and instead address the question: Do language\nmodels still exhibit gender bias in non-stereotypical settings? To do so, we\nintroduce UnStereoEval (USE), a novel framework tailored for investigating\ngender bias in stereotype-free scenarios. USE defines a sentence-level score\nbased on pretraining data statistics to determine if the sentence contain\nminimal word-gender associations. To systematically benchmark the fairness of\npopular language models in stereotype-free scenarios, we utilize USE to\nautomatically generate benchmarks without any gender-related language. By\nleveraging USE's sentence-level score, we also repurpose prior gender bias\nbenchmarks (Winobias and Winogender) for non-stereotypical evaluation.\nSurprisingly, we find low fairness across all 28 tested models. Concretely,\nmodels demonstrate fair behavior in only 9%-41% of stereotype-free sentences,\nsuggesting that bias does not solely stem from the presence of gender-related\nwords. These results raise important questions about where underlying model\nbiases come from and highlight the need for more systematic and comprehensive\nbias evaluation. 
We release the full dataset and code at\nhttps://ucinlp.github.io/unstereo-eval.\n","authors":["Catarina G Belém","Preethi Seshadri","Yasaman Razeghi","Sameer Singh"],"pdf_url":"https://arxiv.org/pdf/2405.00588v1.pdf","comment":"In International Conference on Learning Representations 2024"},{"id":"http://arxiv.org/abs/2405.00587v1","updated":"2024-05-01T15:50:16Z","published":"2024-05-01T15:50:16Z","title":"GraCo: Granularity-Controllable Interactive Segmentation","summary":" Interactive Segmentation (IS) segments specific objects or parts in the image\naccording to user input. Current IS pipelines fall into two categories:\nsingle-granularity output and multi-granularity output. The latter aims to\nalleviate the spatial ambiguity present in the former. However, the\nmulti-granularity output pipeline suffers from limited interaction flexibility\nand produces redundant results. In this work, we introduce\nGranularity-Controllable Interactive Segmentation (GraCo), a novel approach\nthat allows precise control of prediction granularity by introducing additional\nparameters to input. This enhances the customization of the interactive system\nand eliminates redundancy while resolving ambiguity. Nevertheless, the\nexorbitant cost of annotating multi-granularity masks and the lack of available\ndatasets with granularity annotations make it difficult for models to acquire\nthe necessary guidance to control output granularity. To address this problem,\nwe design an any-granularity mask generator that exploits the semantic property\nof the pre-trained IS model to automatically generate abundant mask-granularity\npairs without requiring additional manual annotation. Based on these pairs, we\npropose a granularity-controllable learning strategy that efficiently imparts\nthe granularity controllability to the IS model. Extensive experiments on\nintricate scenarios at object and part levels demonstrate that our GraCo has\nsignificant advantages over previous methods. This highlights the potential of\nGraCo to be a flexible annotation tool, capable of adapting to diverse\nsegmentation scenarios. The project page: https://zhao-yian.github.io/GraCo.\n","authors":["Yian Zhao","Kehan Li","Zesen Cheng","Pengchong Qiao","Xiawu Zheng","Rongrong Ji","Chang Liu","Li Yuan","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2405.00587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19242v2","updated":"2024-05-01T15:42:09Z","published":"2024-04-30T03:58:19Z","title":"A Minimal Set of Parameters Based Depth-Dependent Distortion Model and\n Its Calibration Method for Stereo Vision Systems","summary":" Depth position highly affects lens distortion, especially in close-range\nphotography, which limits the measurement accuracy of existing stereo vision\nsystems. Moreover, traditional depth-dependent distortion models and their\ncalibration methods have remained complicated. In this work, we propose a\nminimal set of parameters based depth-dependent distortion model (MDM), which\nconsiders the radial and decentering distortions of the lens to improve the\naccuracy of stereo vision systems and simplify their calibration process. In\naddition, we present an easy and flexible calibration method for the MDM of\nstereo vision systems with a commonly used planar pattern, which requires\ncameras to observe the planar pattern in different orientations. 
The proposed\ntechnique is easy to use and flexible compared with classical calibration\ntechniques for depth-dependent distortion models in which the lens must be\nperpendicular to the planar pattern. The experimental validation of the MDM and\nits calibration method showed that the MDM improved the calibration accuracy by\n56.55% and 74.15% compared with the Li's distortion model and traditional\nBrown's distortion model. Besides, an iteration-based reconstruction method is\nproposed to iteratively estimate the depth information in the MDM during\nthree-dimensional reconstruction. The results showed that the accuracy of the\niteration-based reconstruction method was improved by 9.08% compared with that\nof the non-iteration reconstruction method.\n","authors":["Xin Ma","Puchen Zhu","Xiao Li","Xiaoyin Zheng","Jianshu Zhou","Xuchen Wang","Kwok Wai Samuel Au"],"pdf_url":"https://arxiv.org/pdf/2404.19242v2.pdf","comment":"This paper has been accepted for publication in IEEE Transactions on\n Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2311.13172v2","updated":"2024-05-01T15:27:51Z","published":"2023-11-22T05:31:06Z","title":"Learning to Complement with Multiple Humans","summary":" Real-world image classification tasks tend to be complex, where expert\nlabellers are sometimes unsure about the classes present in the images, leading\nto the issue of learning with noisy labels (LNL). The ill-posedness of the LNL\ntask requires the adoption of strong assumptions or the use of multiple noisy\nlabels per training image, resulting in accurate models that work well in\nisolation but fail to optimise human-AI collaborative classification (HAI-CC).\nUnlike such LNL methods, HAI-CC aims to leverage the synergies between human\nexpertise and AI capabilities but requires clean training labels, limiting its\nreal-world applicability. This paper addresses this gap by introducing the\ninnovative Learning to Complement with Multiple Humans (LECOMH) approach.\nLECOMH is designed to learn from noisy labels without depending on clean\nlabels, simultaneously maximising collaborative accuracy while minimising the\ncost of human collaboration, measured by the number of human expert annotations\nrequired per image. Additionally, new benchmarks featuring multiple noisy\nlabels for both training and testing are proposed to evaluate HAI-CC methods.\nThrough quantitative comparisons on these benchmarks, LECOMH consistently\noutperforms competitive HAI-CC approaches, human labellers, multi-rater\nlearning, and noisy-label learning methods across various datasets, offering a\npromising solution for addressing real-world image classification challenges.\n","authors":["Zheng Zhang","Cuong Nguyen","Kevin Wells","Thanh-Toan Do","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2311.13172v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.00574v1","updated":"2024-05-01T15:25:54Z","published":"2024-05-01T15:25:54Z","title":"EALD-MLLM: Emotion Analysis in Long-sequential and De-identity videos\n with Multi-modal Large Language Model","summary":" Emotion AI is the ability of computers to understand human emotional states.\nExisting works have achieved promising progress, but two limitations remain to\nbe solved: 1) Previous studies have been more focused on short sequential video\nemotion analysis while overlooking long sequential video. However, the emotions\nin short sequential videos only reflect instantaneous emotions, which may be\ndeliberately guided or hidden. 
In contrast, long sequential videos can reveal\nauthentic emotions; 2) Previous studies commonly utilize various signals such\nas facial, speech, and even sensitive biological signals (e.g.,\nelectrocardiogram). However, due to the increasing demand for privacy,\ndeveloping Emotion AI without relying on sensitive signals is becoming\nimportant. To address the aforementioned limitations, in this paper, we\nconstruct a dataset for Emotion Analysis in Long-sequential and De-identity\nvideos called EALD by collecting and processing the sequences of athletes'\npost-match interviews. In addition to providing annotations of the overall\nemotional state of each video, we also provide the Non-Facial Body Language\n(NFBL) annotations for each player. NFBL is an inner-driven emotional\nexpression and can serve as an identity-free clue to understanding the\nemotional state. Moreover, we provide a simple but effective baseline for\nfurther research. More precisely, we evaluate the Multimodal Large Language\nModels (MLLMs) with de-identification signals (e.g., visual, speech, and NFBLs)\nto perform emotion analysis. Our experimental results demonstrate that: 1)\nMLLMs can achieve comparable, even better performance than the supervised\nsingle-modal models, even in a zero-shot scenario; 2) NFBL is an important cue\nin long sequential emotion analysis. EALD will be available on the open-source\nplatform.\n","authors":["Deng Li","Xin Liu","Bohao Xing","Baiqiang Xia","Yuan Zong","Bihan Wen","Heikki Kälviäinen"],"pdf_url":"https://arxiv.org/pdf/2405.00574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19706v2","updated":"2024-05-01T15:25:30Z","published":"2024-04-30T16:54:59Z","title":"RTG-SLAM: Real-time 3D Reconstruction at Scale using Gaussian Splatting","summary":" We present Real-time Gaussian SLAM (RTG-SLAM), a real-time 3D reconstruction\nsystem with an RGBD camera for large-scale environments using Gaussian\nsplatting. The system features a compact Gaussian representation and a highly\nefficient on-the-fly Gaussian optimization scheme. We force each Gaussian to be\neither opaque or nearly transparent, with the opaque ones fitting the surface\nand dominant colors, and transparent ones fitting residual colors. By rendering\ndepth in a different way from color rendering, we let a single opaque Gaussian\nwell fit a local surface region without the need of multiple overlapping\nGaussians, hence largely reducing the memory and computation cost. For\non-the-fly Gaussian optimization, we explicitly add Gaussians for three types\nof pixels per frame: newly observed, with large color errors, and with large\ndepth errors. We also categorize all Gaussians into stable and unstable ones,\nwhere the stable Gaussians are expected to well fit previously observed RGBD\nimages and otherwise unstable. We only optimize the unstable Gaussians and only\nrender the pixels occupied by unstable Gaussians. In this way, both the number\nof Gaussians to be optimized and pixels to be rendered are largely reduced, and\nthe optimization can be done in real time. We show real-time reconstructions of\na variety of large scenes. 
Compared with the state-of-the-art NeRF-based RGBD\nSLAM, our system achieves comparable high-quality reconstruction but with\naround twice the speed and half the memory cost, and shows superior performance\nin the realism of novel view synthesis and camera tracking accuracy.\n","authors":["Zhexi Peng","Tianjia Shao","Yong Liu","Jingke Zhou","Yin Yang","Jingdong Wang","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.19706v2.pdf","comment":"To be published in ACM SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2405.00571v1","updated":"2024-05-01T15:19:54Z","published":"2024-05-01T15:19:54Z","title":"Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed\n Image Retrieval","summary":" Composed Image Retrieval (CIR) is a complex task that retrieves images using\na query, which is configured with an image and a caption that describes desired\nmodifications to that image. Supervised CIR approaches have shown strong\nperformance, but their reliance on expensive manually-annotated datasets\nrestricts their scalability and broader applicability. To address these issues,\nprevious studies have proposed pseudo-word token-based Zero-Shot CIR (ZS-CIR)\nmethods, which utilize a projection module to map images to word tokens.\nHowever, we conjecture that this approach has a downside: the projection module\ndistorts the original image representation and confines the resulting composed\nembeddings to the text-side. In order to resolve this, we introduce a novel\nZS-CIR method that uses Spherical Linear Interpolation (Slerp) to directly\nmerge image and text representations by identifying an intermediate embedding\nof both. Furthermore, we introduce Text-Anchored-Tuning (TAT), a method that\nfine-tunes the image encoder while keeping the text encoder fixed. TAT closes\nthe modality gap between images and text, making the Slerp process much more\neffective. Notably, the TAT method is not only efficient in terms of the scale\nof the training dataset and training time, but it also serves as an excellent\ninitial checkpoint for training supervised CIR models, thereby highlighting its\nwider potential. The integration of the Slerp-based ZS-CIR with a TAT-tuned\nmodel enables our approach to deliver state-of-the-art retrieval performance\nacross CIR benchmarks.\n","authors":["Young Kyun Jang","Dat Huynh","Ashish Shah","Wen-Kai Chen","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2405.00571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11200v2","updated":"2024-05-01T15:17:18Z","published":"2022-09-22T17:42:44Z","title":"Attention is All They Need: Exploring the Media Archaeology of the\n Computer Vision Research Paper","summary":" Research papers, in addition to textual documents, are a designed interface\nthrough which researchers communicate. Recently, rapid growth has transformed\nthat interface in many fields of computing. In this work, we examine the\neffects of this growth from a media archaeology perspective, through the\nchanges to figures and tables in research papers. Specifically, we study these\nchanges in computer vision over the past decade, as the deep learning\nrevolution has driven unprecedented growth in the discipline. We ground our\ninvestigation through interviews with veteran researchers spanning computer\nvision, graphics and visualization. 
Our analysis focuses on the research\nattention economy: how research paper elements contribute towards advertising,\nmeasuring and disseminating an increasingly commodified ``contribution.''\nThrough this work, we seek to motivate future discussion surrounding the design\nof both the research paper itself as well as the larger sociotechnical research\npublishing system, including tools for finding, reading and writing research\npapers.\n","authors":["Samuel Goree","Gabriel Appleby","David Crandall","Norman Su"],"pdf_url":"https://arxiv.org/pdf/2209.11200v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04811v3","updated":"2024-05-01T14:56:23Z","published":"2023-11-08T16:34:18Z","title":"Image-Based Virtual Try-On: A Survey","summary":" Image-based virtual try-on aims to synthesize a naturally dressed person\nimage with a clothing image, which revolutionizes online shopping and inspires\nrelated topics within image generation, showing both research significance and\ncommercial potential. However, there is a gap between current research progress\nand commercial applications and an absence of comprehensive overview of this\nfield to accelerate the development. In this survey, we provide a comprehensive\nanalysis of the state-of-the-art techniques and methodologies in aspects of\npipeline architecture, person representation and key modules such as try-on\nindication, clothing warping and try-on stage. We propose a new semantic\ncriteria with CLIP, and evaluate representative methods with uniformly\nimplemented evaluation metrics on the same dataset. In addition to quantitative\nand qualitative evaluation of current open-source methods, unresolved issues\nare highlighted and future research directions are prospected to identify key\ntrends and inspire further exploration. The uniformly implemented evaluation\nmetrics, dataset and collected methods will be made public available at\nhttps://github.com/little-misfit/Survey-Of-Virtual-Try-On.\n","authors":["Dan Song","Xuanpu Zhang","Juan Zhou","Weizhi Nie","Ruofeng Tong","Mohan Kankanhalli","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2311.04811v3.pdf","comment":"30 pages, 18 figures"},{"id":"http://arxiv.org/abs/2405.00542v1","updated":"2024-05-01T14:27:43Z","published":"2024-05-01T14:27:43Z","title":"UWAFA-GAN: Ultra-Wide-Angle Fluorescein Angiography Transformation via\n Multi-scale Generation and Registration Enhancement","summary":" Fundus photography, in combination with the ultra-wide-angle fundus (UWF)\ntechniques, becomes an indispensable diagnostic tool in clinical settings by\noffering a more comprehensive view of the retina. Nonetheless, UWF fluorescein\nangiography (UWF-FA) necessitates the administration of a fluorescent dye via\ninjection into the patient's hand or elbow unlike UWF scanning laser\nophthalmoscopy (UWF-SLO). To mitigate potential adverse effects associated with\ninjections, researchers have proposed the development of cross-modality medical\nimage generation algorithms capable of converting UWF-SLO images into their\nUWF-FA counterparts. Current image generation techniques applied to fundus\nphotography encounter difficulties in producing high-resolution retinal images,\nparticularly in capturing minute vascular lesions. To address these issues, we\nintroduce a novel conditional generative adversarial network (UWAFA-GAN) to\nsynthesize UWF-FA from UWF-SLO. This approach employs multi-scale generators\nand an attention transmit module to efficiently extract both global structures\nand local lesions. 
Additionally, to counteract the image blurriness issue that\narises from training with misaligned data, a registration module is integrated\nwithin this framework. Our method performs non-trivially on inception scores\nand details generation. Clinical user studies further indicate that the UWF-FA\nimages generated by UWAFA-GAN are clinically comparable to authentic images in\nterms of diagnostic reliability. Empirical evaluations on our proprietary UWF\nimage datasets elucidate that UWAFA-GAN outperforms extant methodologies. The\ncode is accessible at https://github.com/Tinysqua/UWAFA-GAN.\n","authors":["Ruiquan Ge","Zhaojie Fang","Pengxue Wei","Zhanghao Chen","Hongyang Jiang","Ahmed Elazab","Wangting Li","Xiang Wan","Shaochong Zhang","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2405.00542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13253v3","updated":"2024-05-01T14:20:47Z","published":"2022-12-26T18:45:25Z","title":"DSI2I: Dense Style for Unpaired Image-to-Image Translation","summary":" Unpaired exemplar-based image-to-image (UEI2I) translation aims to translate\na source image to a target image domain with the style of a target image\nexemplar, without ground-truth input-translation pairs. Existing UEI2I methods\nrepresent style using one vector per image or rely on semantic supervision to\ndefine one style vector per object. Here, in contrast, we propose to represent\nstyle as a dense feature map, allowing for a finer-grained transfer to the\nsource image without requiring any external semantic information. We then rely\non perceptual and adversarial losses to disentangle our dense style and content\nrepresentations. To stylize the source content with the exemplar style, we\nextract unsupervised cross-domain semantic correspondences and warp the\nexemplar style to the source content. We demonstrate the effectiveness of our\nmethod on four datasets using standard metrics together with a localized style\nmetric we propose, which measures style similarity in a class-wise manner. Our\nresults show that the translations produced by our approach are more diverse,\npreserve the source content better, and are closer to the exemplars when\ncompared to the state-of-the-art methods. Project page:\nhttps://github.com/IVRL/dsi2i\n","authors":["Baran Ozaydin","Tong Zhang","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2212.13253v3.pdf","comment":"To appear on TMLR '24, Reviewed on OpenReview:\n https://openreview.net/forum?id=mrJi5kdKA4"},{"id":"http://arxiv.org/abs/2405.00515v1","updated":"2024-05-01T13:51:39Z","published":"2024-05-01T13:51:39Z","title":"GAD-Generative Learning for HD Map-Free Autonomous Driving","summary":" Deep-learning-based techniques have been widely adopted for autonomous\ndriving software stacks for mass production in recent years, focusing primarily\non perception modules, with some work extending this method to prediction\nmodules. However, the downstream planning and control modules are still\ndesigned with hefty handcrafted rules, dominated by optimization-based methods\nsuch as quadratic programming or model predictive control. This results in a\nperformance bottleneck for autonomous driving systems in that corner cases\nsimply cannot be solved by enumerating hand-crafted rules. We present a\ndeep-learning-based approach that brings prediction, decision, and planning\nmodules together with the attempt to overcome the rule-based methods'\ndeficiency in real-world applications of autonomous driving, especially for\nurban scenes. 
The DNN model we proposed is solely trained with 10 hours of\nhuman driver data, and it supports all mass-production ADAS features available\non the market to date. This method is deployed onto a Jiyue test car with no\nmodification to its factory-ready sensor set and compute platform. the\nfeasibility, usability, and commercial potential are demonstrated in this\narticle.\n","authors":["Weijian Sun","Yanbo Jia","Qi Zeng","Zihao Liu","Jiang Liao","Yue Li","Xianfeng Li","Bolin Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.00515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00514v1","updated":"2024-05-01T13:49:09Z","published":"2024-05-01T13:49:09Z","title":"Get Your Embedding Space in Order: Domain-Adaptive Regression for Forest\n Monitoring","summary":" Image-level regression is an important task in Earth observation, where\nvisual domain and label shifts are a core challenge hampering generalization.\nHowever, cross-domain regression with remote sensing data remains understudied\ndue to the absence of suited datasets. We introduce a new dataset with aerial\nand satellite imagery in five countries with three forest-related regression\ntasks. To match real-world applicative interests, we compare methods through a\nrestrictive setup where no prior on the target domain is available during\ntraining, and models are adapted with limited information during testing.\nBuilding on the assumption that ordered relationships generalize better, we\npropose manifold diffusion for regression as a strong baseline for transduction\nin low-data regimes. Our comparison highlights the comparative advantages of\ninductive and transductive methods in cross-domain regression.\n","authors":["Sizhuo Li","Dimitri Gominski","Martin Brandt","Xiaoye Tong","Philippe Ciais"],"pdf_url":"https://arxiv.org/pdf/2405.00514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16465v3","updated":"2024-05-01T13:48:22Z","published":"2024-01-29T16:24:21Z","title":"DressCode: Autoregressively Sewing and Generating Garments from Text\n Guidance","summary":" Apparel's significant role in human appearance underscores the importance of\ngarment digitalization for digital human creation. Recent advances in 3D\ncontent creation are pivotal for digital human creation. Nonetheless, garment\ngeneration from text guidance is still nascent. We introduce a text-driven 3D\ngarment generation framework, DressCode, which aims to democratize design for\nnovices and offer immense potential in fashion design, virtual try-on, and\ndigital human creation. We first introduce SewingGPT, a GPT-based architecture\nintegrating cross-attention with text-conditioned embedding to generate sewing\npatterns with text guidance. We then tailor a pre-trained Stable Diffusion to\ngenerate tile-based Physically-based Rendering (PBR) textures for the garments.\nBy leveraging a large language model, our framework generates CG-friendly\ngarments through natural language interaction. It also facilitates pattern\ncompletion and texture editing, streamlining the design process through\nuser-friendly interaction. This framework fosters innovation by allowing\ncreators to freely experiment with designs and incorporate unique elements into\ntheir work. With comprehensive evaluations and comparisons with other\nstate-of-the-art methods, our method showcases superior quality and alignment\nwith input prompts. User studies further validate our high-quality rendering\nresults, highlighting its practical utility and potential in production\nsettings. 
Our project page is https://IHe-KaiI.github.io/DressCode/.\n","authors":["Kai He","Kaixin Yao","Qixuan Zhang","Lingjie Liu","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2401.16465v3.pdf","comment":"Project page: https://IHe-KaiI.github.io/DressCode/"},{"id":"http://arxiv.org/abs/2306.10274v3","updated":"2024-05-01T13:38:59Z","published":"2023-06-17T06:38:42Z","title":"Benchmarking Deep Learning Architectures for Urban Vegetation Point\n Cloud Semantic Segmentation from MLS","summary":" Vegetation is crucial for sustainable and resilient cities providing various\necosystem services and well-being of humans. However, vegetation is under\ncritical stress with rapid urbanization and expanding infrastructure\nfootprints. Consequently, mapping of this vegetation is essential in the urban\nenvironment. Recently, deep learning for point cloud semantic segmentation has\nshown significant progress. Advanced models attempt to obtain state-of-the-art\nperformance on benchmark datasets, comprising multiple classes and representing\nreal world scenarios. However, class specific segmentation with respect to\nvegetation points has not been explored. Therefore, selection of a deep\nlearning model for vegetation points segmentation is ambiguous. To address this\nproblem, we provide a comprehensive assessment of point-based deep learning\nmodels for semantic segmentation of vegetation class. We have selected seven\nrepresentative point-based models, namely PointCNN, KPConv (omni-supervised),\nRandLANet, SCFNet, PointNeXt, SPoTr and PointMetaBase. These models are\ninvestigated on three different datasets, specifically Chandigarh, Toronto3D\nand Kerala, which are characterized by diverse nature of vegetation and varying\nscene complexity combined with changing per-point features and class-wise\ncomposition. PointMetaBase and KPConv (omni-supervised) achieve the highest\nmIoU on the Chandigarh (95.24%) and Toronto3D datasets (91.26%), respectively\nwhile PointCNN provides the highest mIoU on the Kerala dataset (85.68%). The\npaper develops a deeper insight, hitherto not reported, into the working of\nthese models for vegetation segmentation and outlines the ingredients that\nshould be included in a model specifically for vegetation segmentation. This\npaper is a step towards the development of a novel architecture for vegetation\npoints segmentation.\n","authors":["Aditya Aditya","Bharat Lohani","Jagannath Aryal","Stephan Winter"],"pdf_url":"https://arxiv.org/pdf/2306.10274v3.pdf","comment":"The paper has been accepted for publication in IEEE Transactions on\n Geoscience and Remote Sensing. DOI: 10.1109/TGRS.2024.3381976"},{"id":"http://arxiv.org/abs/2405.00507v1","updated":"2024-05-01T13:38:03Z","published":"2024-05-01T13:38:03Z","title":"NeRF-Guided Unsupervised Learning of RGB-D Registration","summary":" This paper focuses on training a robust RGB-D registration model without\nground-truth pose supervision. Existing methods usually adopt a pairwise\ntraining strategy based on differentiable rendering, which enforces the\nphotometric and the geometric consistency between the two registered frames as\nsupervision. However, this frame-to-frame framework suffers from poor\nmulti-view consistency due to factors such as lighting changes, geometry\nocclusion and reflective materials. 
In this paper, we present NeRF-UR, a novel\nframe-to-model optimization framework for unsupervised RGB-D registration.\nInstead of frame-to-frame consistency, we leverage the neural radiance field\n(NeRF) as a global model of the scene and use the consistency between the input\nand the NeRF-rerendered frames for pose optimization. This design can\nsignificantly improve the robustness in scenarios with poor multi-view\nconsistency and provides better learning signal for the registration model.\nFurthermore, to bootstrap the NeRF optimization, we create a synthetic dataset,\nSim-RGBD, through a photo-realistic simulator to warm up the registration\nmodel. By first training the registration model on Sim-RGBD and later\nunsupervisedly fine-tuning on real data, our framework enables distilling the\ncapability of feature extraction and registration from simulation to reality.\nOur method outperforms the state-of-the-art counterparts on two popular indoor\nRGB-D datasets, ScanNet and 3DMatch. Code and models will be released for paper\nreproduction.\n","authors":["Zhinan Yu","Zheng Qin","Yijie Tang","Yongjun Wang","Renjiao Yi","Chenyang Zhu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2405.00507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12661v3","updated":"2024-05-01T13:29:25Z","published":"2023-05-22T03:04:22Z","title":"Semantic-guided modeling of spatial relation and object co-occurrence\n for indoor scene recognition","summary":" Exploring the semantic context in scene images is essential for indoor scene\nrecognition. However, due to the diverse intra-class spatial layouts and the\ncoexisting inter-class objects, modeling contextual relationships to adapt\nvarious image characteristics is a great challenge. Existing contextual\nmodeling methods for scene recognition exhibit two limitations: 1) They\ntypically model only one kind of spatial relationship among objects within\nscenes in an artificially predefined manner, with limited exploration of\ndiverse spatial layouts. 2) They often overlook the differences in coexisting\nobjects across different scenes, suppressing scene recognition performance. To\novercome these limitations, we propose SpaCoNet, which simultaneously models\nSpatial relation and Co-occurrence of objects guided by semantic segmentation.\nFirstly, the Semantic Spatial Relation Module (SSRM) is constructed to model\nscene spatial features. With the help of semantic segmentation, this module\ndecouples the spatial information from the scene image and thoroughly explores\nall spatial relationships among objects in an end-to-end manner. Secondly, both\nspatial features from the SSRM and deep features from the Image Feature\nExtraction Module are allocated to each object, so as to distinguish the\ncoexisting object across different scenes. 
Finally, utilizing the\ndiscriminative features above, we design a Global-Local Dependency Module to\nexplore the long-range co-occurrence among objects, and further generate a\nsemantic-guided feature representation for indoor scene recognition.\nExperimental results on three widely used scene datasets demonstrate the\neffectiveness and generality of the proposed method.\n","authors":["Chuanxin Song","Hanbo Wu","Xin Ma"],"pdf_url":"https://arxiv.org/pdf/2305.12661v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13890v2","updated":"2024-05-01T13:18:17Z","published":"2024-03-20T18:01:57Z","title":"Towards Learning Contrast Kinetics with Multi-Condition Latent Diffusion\n Models","summary":" Contrast agents in dynamic contrast enhanced magnetic resonance imaging allow\nto localize tumors and observe their contrast kinetics, which is essential for\ncancer characterization and respective treatment decision-making. However,\ncontrast agent administration is not only associated with adverse health risks,\nbut also restricted for patients during pregnancy, and for those with kidney\nmalfunction, or other adverse reactions. With contrast uptake as key biomarker\nfor lesion malignancy, cancer recurrence risk, and treatment response, it\nbecomes pivotal to reduce the dependency on intravenous contrast agent\nadministration. To this end, we propose a multi-conditional latent diffusion\nmodel capable of acquisition time-conditioned image synthesis of DCE-MRI\ntemporal sequences. To evaluate medical image synthesis, we additionally\npropose and validate the Fr\\'echet radiomics distance as an image quality\nmeasure based on biomarker variability between synthetic and real imaging data.\nOur results demonstrate our method's ability to generate realistic\nmulti-sequence fat-saturated breast DCE-MRI and uncover the emerging potential\nof deep learning based contrast kinetics simulation. We publicly share our\naccessible codebase at https://github.com/RichardObi/ccnet and provide a\nuser-friendly library for Fr\\'echet radiomics distance calculation at\nhttps://pypi.org/project/frd-score.\n","authors":["Richard Osuala","Daniel Lang","Preeti Verma","Smriti Joshi","Apostolia Tsirikoglou","Grzegorz Skorupko","Kaisar Kushibar","Lidia Garrucho","Walter H. L. Pinaya","Oliver Diaz","Julia Schnabel","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2403.13890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10307v4","updated":"2024-05-01T13:16:09Z","published":"2022-11-18T15:46:24Z","title":"SeaTurtleID2022: A long-span dataset for reliable sea turtle\n re-identification","summary":" This paper introduces the first public large-scale, long-span dataset with\nsea turtle photographs captured in the wild --\n\\href{https://www.kaggle.com/datasets/wildlifedatasets/seaturtleid2022}{SeaTurtleID2022}.\nThe dataset contains 8729 photographs of 438 unique individuals collected\nwithin 13 years, making it the longest-spanned dataset for animal\nre-identification. All photographs include various annotations, e.g., identity,\nencounter timestamp, and body parts segmentation masks. Instead of standard\n\"random\" splits, the dataset allows for two realistic and ecologically\nmotivated splits: (i) a \\textit{time-aware closed-set} with training,\nvalidation, and test data from different days/years, and (ii) a\n\\textit{time-aware open-set} with new unknown individuals in test and\nvalidation sets. 
We show that time-aware splits are essential for benchmarking\nre-identification methods, as random splits lead to performance overestimation.\nFurthermore, a baseline instance segmentation and re-identification performance\nover various body parts is provided. Finally, an end-to-end system for sea\nturtle re-identification is proposed and evaluated. The proposed system based\non Hybrid Task Cascade for head instance segmentation and ArcFace-trained\nfeature-extractor achieved an accuracy of 86.8\\%.\n","authors":["Lukáš Adam","Vojtěch Čermák","Kostas Papafitsoros","Lukáš Picek"],"pdf_url":"https://arxiv.org/pdf/2211.10307v4.pdf","comment":"The SeaTurtleID2022 dataset is the latest version of the SeaTurtleID\n dataset which was described in the previous versions of this arXiv\n submission. Notice the change of title in the latest version"},{"id":"http://arxiv.org/abs/2405.00485v1","updated":"2024-05-01T12:49:57Z","published":"2024-05-01T12:49:57Z","title":"The Pyramid of Captions","summary":" We introduce a formal information-theoretic framework for image captioning by\nregarding it as a representation learning task. Our framework defines three key\nobjectives: task sufficiency, minimal redundancy, and human interpretability.\nBuilding upon this foundation, we propose a novel Pyramid of Captions (PoCa)\nmethod, which constructs caption pyramids by generating localized captions for\nzoomed-in image patches and integrating them with global caption information\nusing large language models. This approach leverages intuition that the\ndetailed examination of local patches can reduce error risks and address\ninaccuracies in global captions, either by correcting the hallucination or\nadding missing details. Based on our theoretical framework, we formalize this\nintuition and provide formal proof demonstrating the effectiveness of PoCa\nunder certain assumptions. Empirical tests with various image captioning models\nand large language models show that PoCa consistently yields more informative\nand semantically aligned captions, maintaining brevity and interpretability.\n","authors":["Delong Chen","Samuel Cahyawijaya","Etsuko Ishii","Ho Shu Chan","Yejin Bang","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2405.00485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00483v1","updated":"2024-05-01T12:48:13Z","published":"2024-05-01T12:48:13Z","title":"In Anticipation of Perfect Deepfake: Identity-anchored Artifact-agnostic\n Detection under Rebalanced Deepfake Detection Protocol","summary":" As deep generative models advance, we anticipate deepfakes achieving\n\"perfection\"-generating no discernible artifacts or noise. However, current\ndeepfake detectors, intentionally or inadvertently, rely on such artifacts for\ndetection, as they are exclusive to deepfakes and absent in genuine examples.\nTo bridge this gap, we introduce the Rebalanced Deepfake Detection Protocol\n(RDDP) to stress-test detectors under balanced scenarios where genuine and\nforged examples bear similar artifacts. 
We offer two RDDP variants:\nRDDP-WHITEHAT uses white-hat deepfake algorithms to create 'self-deepfakes,'\ngenuine portrait videos with the resemblance of the underlying identity, yet\ncarry similar artifacts to deepfake videos; RDDP-SURROGATE employs surrogate\nfunctions (e.g., Gaussian noise) to process both genuine and forged examples,\nintroducing equivalent noise, thereby sidestepping the need of deepfake\nalgorithms.\n Towards detecting perfect deepfake videos that aligns with genuine ones, we\npresent ID-Miner, a detector that identifies the puppeteer behind the disguise\nby focusing on motion over artifacts or appearances. As an identity-based\ndetector, it authenticates videos by comparing them with reference footage.\nEquipped with the artifact-agnostic loss at frame-level and the\nidentity-anchored loss at video-level, ID-Miner effectively singles out\nidentity signals amidst distracting variations. Extensive experiments comparing\nID-Miner with 12 baseline detectors under both conventional and RDDP\nevaluations with two deepfake datasets, along with additional qualitative\nstudies, affirm the superiority of our method and the necessity for detectors\ndesigned to counter perfect deepfakes.\n","authors":["Wei-Han Wang","Chin-Yuan Yeh","Hsi-Wen Chen","De-Nian Yang","Ming-Syan Chen"],"pdf_url":"https://arxiv.org/pdf/2405.00483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00479v1","updated":"2024-05-01T12:39:35Z","published":"2024-05-01T12:39:35Z","title":"Enhanced Visual Question Answering: A Comparative Analysis and Textual\n Feature Extraction Via Convolutions","summary":" Visual Question Answering (VQA) has emerged as a highly engaging field in\nrecent years, attracting increasing research efforts aiming to enhance VQA\naccuracy through the deployment of advanced models such as Transformers.\nDespite this growing interest, there has been limited exploration into the\ncomparative analysis and impact of textual modalities within VQA, particularly\nin terms of model complexity and its effect on performance. In this work, we\nconduct a comprehensive comparison between complex textual models that leverage\nlong dependency mechanisms and simpler models focusing on local textual\nfeatures within a well-established VQA framework. Our findings reveal that\nemploying complex textual encoders is not invariably the optimal approach for\nthe VQA-v2 dataset. Motivated by this insight, we introduce an improved model,\nConvGRU, which incorporates convolutional layers to enhance the representation\nof question text. Tested on the VQA-v2 dataset, ConvGRU achieves better\nperformance without substantially increasing parameter complexity.\n","authors":["Zhilin Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.00479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00175v2","updated":"2024-05-01T12:34:53Z","published":"2024-02-29T22:59:27Z","title":"FusionVision: A comprehensive approach of 3D object reconstruction and\n segmentation from RGB-D cameras using YOLO and fast segment anything","summary":" In the realm of computer vision, the integration of advanced techniques into\nthe processing of RGB-D camera inputs poses a significant challenge, given the\ninherent complexities arising from diverse environmental conditions and varying\nobject appearances. Therefore, this paper introduces FusionVision, an\nexhaustive pipeline adapted for the robust 3D segmentation of objects in RGB-D\nimagery. 
Traditional computer vision systems face limitations in simultaneously\ncapturing precise object boundaries and achieving high-precision object\ndetection on depth map as they are mainly proposed for RGB cameras. To address\nthis challenge, FusionVision adopts an integrated approach by merging\nstate-of-the-art object detection techniques, with advanced instance\nsegmentation methods. The integration of these components enables a holistic\n(unified analysis of information obtained from both color \\textit{RGB} and\ndepth \\textit{D} channels) interpretation of RGB-D data, facilitating the\nextraction of comprehensive and accurate object information. The proposed\nFusionVision pipeline employs YOLO for identifying objects within the RGB image\ndomain. Subsequently, FastSAM, an innovative semantic segmentation model, is\napplied to delineate object boundaries, yielding refined segmentation masks.\nThe synergy between these components and their integration into 3D scene\nunderstanding ensures a cohesive fusion of object detection and segmentation,\nenhancing overall precision in 3D object segmentation. The code and pre-trained\nmodels are publicly available at https://github.com/safouaneelg/FusionVision/.\n","authors":["Safouane El Ghazouali","Youssef Mhirit","Ali Oukhrid","Umberto Michelucci","Hichem Nouira"],"pdf_url":"https://arxiv.org/pdf/2403.00175v2.pdf","comment":"14 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2405.00472v1","updated":"2024-05-01T12:15:58Z","published":"2024-05-01T12:15:58Z","title":"DmADs-Net: Dense multiscale attention and depth-supervised network for\n medical image segmentation","summary":" Deep learning has made important contributions to the development of medical\nimage segmentation. Convolutional neural networks, as a crucial branch, have\nattracted strong attention from researchers. Through the tireless efforts of\nnumerous researchers, convolutional neural networks have yielded numerous\noutstanding algorithms for processing medical images. The ideas and\narchitectures of these algorithms have also provided important inspiration for\nthe development of later technologies.Through extensive experimentation, we\nhave found that currently mainstream deep learning algorithms are not always\nable to achieve ideal results when processing complex datasets and different\ntypes of datasets. These networks still have room for improvement in lesion\nlocalization and feature extraction. Therefore, we have created the Dense\nMultiscale Attention and Depth-Supervised Network (DmADs-Net).We use ResNet for\nfeature extraction at different depths and create a Multi-scale Convolutional\nFeature Attention Block to improve the network's attention to weak feature\ninformation. The Local Feature Attention Block is created to enable enhanced\nlocal feature attention for high-level semantic information. In addition, in\nthe feature fusion phase, a Feature Refinement and Fusion Block is created to\nenhance the fusion of different semantic information.We validated the\nperformance of the network using five datasets of varying sizes and types.\nResults from comparative experiments show that DmADs-Net outperformed\nmainstream networks. 
Ablation experiments further demonstrated the\neffectiveness of the created modules and the rationality of the network\narchitecture.\n","authors":["Zhaojin Fu","Zheng Chen","Jinjiang Li","Lu Ren"],"pdf_url":"https://arxiv.org/pdf/2405.00472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00468v1","updated":"2024-05-01T12:08:38Z","published":"2024-05-01T12:08:38Z","title":"Feature-Aware Noise Contrastive Learning For Unsupervised Red Panda\n Re-Identification","summary":" To facilitate the re-identification (Re-ID) of individual animals, existing\nmethods primarily focus on maximizing feature similarity within the same\nindividual and enhancing distinctiveness between different individuals.\nHowever, most of them still rely on supervised learning and require substantial\nlabeled data, which is challenging to obtain. To avoid this issue, we propose a\nFeature-Aware Noise Contrastive Learning (FANCL) method to explore an\nunsupervised learning solution, which is then validated on the task of red\npanda re-ID. FANCL employs a Feature-Aware Noise Addition module to produce\nnoised images that conceal critical features and designs two contrastive\nlearning modules to calculate the losses. Firstly, a feature consistency module\nis designed to bridge the gap between the original and noised features.\nSecondly, the neural networks are trained through a cluster contrastive\nlearning module. Through these more challenging learning tasks, FANCL can\nadaptively extract deeper representations of red pandas. The experimental\nresults on a set of red panda images collected in both indoor and outdoor\nenvironments prove that FANCL outperforms several related state-of-the-art\nunsupervised methods, achieving high performance comparable to supervised\nlearning methods.\n","authors":["Jincheng Zhang","Qijun Zhao","Tie Liu"],"pdf_url":"https://arxiv.org/pdf/2405.00468v1.pdf","comment":"7 pages, 5 figures, IJCNN2024"},{"id":"http://arxiv.org/abs/2405.00466v1","updated":"2024-05-01T12:03:39Z","published":"2024-05-01T12:03:39Z","title":"Lazy Layers to Make Fine-Tuned Diffusion Models More Traceable","summary":" Foundational generative models should be traceable to protect their owners\nand facilitate safety regulation. To achieve this, traditional approaches embed\nidentifiers based on supervisory trigger-response signals, which are commonly\nknown as backdoor watermarks. They are prone to failure when the model is\nfine-tuned with nontrigger data. Our experiments show that this vulnerability\nis due to energetic changes in only a few 'busy' layers during fine-tuning.\nThis yields a novel arbitrary-in-arbitrary-out (AIAO) strategy that makes\nwatermarks resilient to fine-tuning-based removal. The trigger-response pairs\nof AIAO samples across various neural network depths can be used to construct\nwatermarked subpaths, employing Monte Carlo sampling to achieve stable\nverification results. In addition, unlike the existing methods of designing a\nbackdoor for the input/output space of diffusion models, in our method, we\npropose to embed the backdoor into the feature space of sampled subpaths, where\na mask-controlled trigger function is proposed to preserve the generation\nperformance and ensure the invisibility of the embedded backdoor. 
Our empirical\nstudies on the MS-COCO, AFHQ, LSUN, CUB-200, and DreamBooth datasets confirm\nthe robustness of AIAO; while the verification rates of other trigger-based\nmethods fall from ~90% to ~70% after fine-tuning, those of our method remain\nconsistently above 90%.\n","authors":["Haozhe Liu","Wentian Zhang","Bing Li","Bernard Ghanem","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2405.00466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00452v1","updated":"2024-05-01T11:12:08Z","published":"2024-05-01T11:12:08Z","title":"Predictive Accuracy-Based Active Learning for Medical Image Segmentation","summary":" Active learning is considered a viable solution to alleviate the\ncontradiction between the high dependency of deep learning-based segmentation\nmethods on annotated data and the expensive pixel-level annotation cost of\nmedical images. However, most existing methods suffer from unreliable\nuncertainty assessment and the struggle to balance diversity and\ninformativeness, leading to poor performance in segmentation tasks. In\nresponse, we propose an efficient Predictive Accuracy-based Active Learning\n(PAAL) method for medical image segmentation, first introducing predictive\naccuracy to define uncertainty. Specifically, PAAL mainly consists of an\nAccuracy Predictor (AP) and a Weighted Polling Strategy (WPS). The former is an\nattached learnable module that can accurately predict the segmentation accuracy\nof unlabeled samples relative to the target model with the predicted posterior\nprobability. The latter provides an efficient hybrid querying scheme by\ncombining predicted accuracy and feature representation, aiming to ensure the\nuncertainty and diversity of the acquired samples. Extensive experiment results\non multiple datasets demonstrate the superiority of PAAL. PAAL achieves\ncomparable accuracy to fully annotated data while reducing annotation costs by\napproximately 50% to 80%, showcasing significant potential in clinical\napplications. The code is available at https://github.com/shijun18/PAAL-MedSeg.\n","authors":["Jun Shi","Shulan Ruan","Ziqi Zhu","Minfan Zhao","Hong An","Xudong Xue","Bing Yan"],"pdf_url":"https://arxiv.org/pdf/2405.00452v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.00448v1","updated":"2024-05-01T11:04:22Z","published":"2024-05-01T11:04:22Z","title":"MMTryon: Multi-Modal Multi-Reference Control for High-Quality Fashion\n Generation","summary":" This paper introduces MMTryon, a multi-modal multi-reference VIrtual Try-ON\n(VITON) framework, which can generate high-quality compositional try-on results\nby taking as inputs a text instruction and multiple garment images. Our MMTryon\nmainly addresses two problems overlooked in prior literature: 1) Support of\nmultiple try-on items and dressing styleExisting methods are commonly designed\nfor single-item try-on tasks (e.g., upper/lower garments, dresses) and fall\nshort on customizing dressing styles (e.g., zipped/unzipped, tuck-in/tuck-out,\netc.) 2) Segmentation Dependency. They further heavily rely on\ncategory-specific segmentation models to identify the replacement regions, with\nsegmentation errors directly leading to significant artifacts in the try-on\nresults. 
For the first issue, our MMTryon introduces a novel multi-modality and\nmulti-reference attention mechanism to combine the garment information from\nreference images and dressing-style information from text instructions.\nBesides, to remove the segmentation dependency, MMTryon uses a parsing-free\ngarment encoder and leverages a novel scalable data generation pipeline to\nconvert existing VITON datasets to a form that allows MMTryon to be trained\nwithout requiring any explicit segmentation. Extensive experiments on\nhigh-resolution benchmarks and in-the-wild test sets demonstrate MMTryon's\nsuperiority over existing SOTA methods both qualitatively and quantitatively.\nBesides, MMTryon's impressive performance on multi-items and style-controllable\nvirtual try-on scenarios and its ability to try on any outfit in a large\nvariety of scenarios from any source image, opens up a new avenue for future\ninvestigation in the fashion community.\n","authors":["Xujie Zhang","Ente Lin","Xiu Li","Yuxuan Luo","Michael Kampffmeyer","Xin Dong","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.00448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05557v3","updated":"2024-05-01T10:53:54Z","published":"2022-09-12T19:16:48Z","title":"Blurring Diffusion Models","summary":" Recently, Rissanen et al., (2022) have presented a new type of diffusion\nprocess for generative modeling based on heat dissipation, or blurring, as an\nalternative to isotropic Gaussian diffusion. Here, we show that blurring can\nequivalently be defined through a Gaussian diffusion process with non-isotropic\nnoise. In making this connection, we bridge the gap between inverse heat\ndissipation and denoising diffusion, and we shed light on the inductive bias\nthat results from this modeling choice. Finally, we propose a generalized class\nof diffusion models that offers the best of both standard Gaussian denoising\ndiffusion and inverse heat dissipation, which we call Blurring Diffusion\nModels.\n","authors":["Emiel Hoogeboom","Tim Salimans"],"pdf_url":"https://arxiv.org/pdf/2209.05557v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00431v1","updated":"2024-05-01T10:27:22Z","published":"2024-05-01T10:27:22Z","title":"Detail-Enhancing Framework for Reference-Based Image Super-Resolution","summary":" Recent years have witnessed the prosperity of reference-based image\nsuper-resolution (Ref-SR). By importing the high-resolution (HR) reference\nimages into the single image super-resolution (SISR) approach, the ill-posed\nnature of this long-standing field has been alleviated with the assistance of\ntexture transferred from reference images. Although the significant improvement\nin quantitative and qualitative results has verified the superiority of Ref-SR\nmethods, the presence of misalignment before texture transfer indicates room\nfor further performance improvement. Existing methods tend to neglect the\nsignificance of details in the context of comparison, therefore not fully\nleveraging the information contained within low-resolution (LR) images. In this\npaper, we propose a Detail-Enhancing Framework (DEF) for reference-based\nsuper-resolution, which introduces the diffusion model to generate and enhance\nthe underlying detail in LR images. If corresponding parts are present in the\nreference image, our method can facilitate rigorous alignment. In cases where\nthe reference image lacks corresponding parts, it ensures a fundamental\nimprovement while avoiding the influence of the reference image. 
Extensive\nexperiments demonstrate that our proposed method achieves superior visual\nresults while maintaining comparable numerical outcomes.\n","authors":["Zihan Wang","Ziliang Xiong","Hongying Tang","Xiaobing Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.00431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00430v1","updated":"2024-05-01T10:26:08Z","published":"2024-05-01T10:26:08Z","title":"Continuous sPatial-Temporal Deformable Image Registration (CPT-DIR) for\n motion modelling in radiotherapy: beyond classic voxel-based methods","summary":" Background and purpose: Deformable image registration (DIR) is a crucial tool\nin radiotherapy for extracting and modelling organ motion. However, when\nsignificant changes and sliding boundaries are present, it faces compromised\naccuracy and uncertainty, determining the subsequential contour propagation and\ndose accumulation procedures. Materials and methods: We propose an implicit\nneural representation (INR)-based approach modelling motion continuously in\nboth space and time, named Continues-sPatial-Temporal DIR (CPT-DIR). This\nmethod uses a multilayer perception (MLP) network to map 3D coordinate (x,y,z)\nto its corresponding velocity vector (vx,vy,vz). The displacement vectors\n(dx,dy,dz) are then calculated by integrating velocity vectors over time. The\nMLP's parameters can rapidly adapt to new cases without pre-training, enhancing\noptimisation. The DIR's performance was tested on the DIR-Lab dataset of 10\nlung 4DCT cases, using metrics of landmark accuracy (TRE), contour conformity\n(Dice) and image similarity (MAE). Results: The proposed CPT-DIR can reduce\nlandmark TRE from 2.79mm to 0.99mm, outperforming B-splines' results for all\ncases. The MAE of the whole-body region improves from 35.46HU to 28.99HU.\nFurthermore, CPT-DIR surpasses B-splines for accuracy in the sliding boundary\nregion, lowering MAE and increasing Dice coefficients for the ribcage from\n65.65HU and 90.41% to 42.04HU and 90.56%, versus 75.40HU and 89.30% without\nregistration. Meanwhile, CPT-DIR offers significant speed advantages,\ncompleting in under 15 seconds compared to a few minutes with the conventional\nB-splines method. Conclusion: Leveraging the continuous representations, the\nCPT-DIR method significantly enhances registration accuracy, automation and\nspeed, outperforming traditional B-splines in landmark and contour precision,\nparticularly in the challenging areas.\n","authors":["Xia Li","Muheng Li","Antony Lomax","Joachim Buhmann","Ye Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.00430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02877v2","updated":"2024-05-01T10:12:04Z","published":"2024-04-03T17:24:27Z","title":"FlightScope: A Deep Comprehensive Assessment of Aircraft Detection\n Algorithms in Satellite Imagery","summary":" Object detection in remotely sensed satellite pictures is fundamental in many\nfields such as biophysical, and environmental monitoring. While deep learning\nalgorithms are constantly evolving, they have been mostly implemented and\ntested on popular ground-based taken photos. This paper critically evaluates\nand compares a suite of advanced object detection algorithms customized for the\ntask of identifying aircraft within satellite imagery. Using the large\nHRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset,\nthis research encompasses an array of methodologies including YOLO versions 5\nand 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from\nscratch. 
This exhaustive training and validation study reveal YOLOv5 as the\npreeminent model for the specific case of identifying airplanes from remote\nsensing data, showcasing high precision and adaptability across diverse imaging\nconditions. This research highlight the nuanced performance landscapes of these\nalgorithms, with YOLOv5 emerging as a robust solution for aerial object\ndetection, underlining its importance through superior mean average precision,\nRecall, and Intersection over Union scores. The findings described here\nunderscore the fundamental role of algorithm selection aligned with the\nspecific demands of satellite imagery analysis and extend a comprehensive\nframework to evaluate model efficacy. The benchmark toolkit and codes,\navailable via https://github.com/toelt-llc/FlightScope_Bench, aims to further\nexploration and innovation in the realm of remote sensing object detection,\npaving the way for improved analytical methodologies in satellite imagery\napplications.\n","authors":["Safouane El Ghazouali","Arnaud Gucciardi","Nicola Venturi","Michael Rueegsegger","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2404.02877v2.pdf","comment":"15 figures, 4 tables, comprehensive survey, comparative study"},{"id":"http://arxiv.org/abs/2310.07355v3","updated":"2024-05-01T10:06:22Z","published":"2023-10-11T10:12:43Z","title":"IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training","summary":" In the field of medical Vision-Language Pre-training (VLP), significant\nefforts have been devoted to deriving text and image features from both\nclinical reports and associated medical images. However, most existing methods\nmay have overlooked the opportunity in leveraging the inherent hierarchical\nstructure of clinical reports, which are generally split into `findings' for\ndescriptive content and `impressions' for conclusive observation. Instead of\nutilizing this rich, structured format, current medical VLP approaches often\nsimplify the report into either a unified entity or fragmented tokens. In this\nwork, we propose a novel clinical prior guided VLP framework named IMITATE to\nlearn the structure information from medical reports with hierarchical\nvision-language alignment. The framework derives multi-level visual features\nfrom the chest X-ray (CXR) images and separately aligns these features with the\ndescriptive and the conclusive text encoded in the hierarchical medical report.\nFurthermore, a new clinical-informed contrastive loss is introduced for\ncross-modal learning, which accounts for clinical prior knowledge in\nformulating sample correlations in contrastive learning. The proposed model,\nIMITATE, outperforms baseline VLP methods across six different datasets,\nspanning five medical imaging downstream tasks. Comprehensive experimental\nresults highlight the advantages of integrating the hierarchical structure of\nmedical reports for vision-language alignment.\n","authors":["Che Liu","Sibo Cheng","Miaojing Shi","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07355v3.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2405.00420v1","updated":"2024-05-01T09:58:57Z","published":"2024-05-01T09:58:57Z","title":"Self-supervised Pre-training of Text Recognizers","summary":" In this paper, we investigate self-supervised pre-training methods for\ndocument text recognition. Nowadays, large unlabeled datasets can be collected\nfor many research tasks, including text recognition, but it is costly to\nannotate them. 
Therefore, methods utilizing unlabeled data are researched. We\nstudy self-supervised pre-training methods based on masked label prediction\nusing three different approaches -- Feature Quantization, VQ-VAE, and\nPost-Quantized AE. We also investigate joint-embedding approaches with VICReg\nand NT-Xent objectives, for which we propose an image shifting technique to\nprevent model collapse where it relies solely on positional encoding while\ncompletely ignoring the input image. We perform our experiments on historical\nhandwritten (Bentham) and historical printed datasets mainly to investigate the\nbenefits of the self-supervised pre-training techniques with different amounts\nof annotated target domain data. We use transfer learning as strong baselines.\nThe evaluation shows that the self-supervised pre-training on data from the\ntarget domain is very effective, but it struggles to outperform transfer\nlearning from closely related domains. This paper is one of the first\nresearches exploring self-supervised pre-training in document text recognition,\nand we believe that it will become a cornerstone for future research in this\narea. We made our implementation of the investigated methods publicly available\nat https://github.com/DCGM/pero-pretraining.\n","authors":["Martin Kišš","Michal Hradiš"],"pdf_url":"https://arxiv.org/pdf/2405.00420v1.pdf","comment":"18 pages, 6 figures, 4 tables, accepted to ICDAR24"},{"id":"http://arxiv.org/abs/2301.09430v4","updated":"2024-05-01T09:51:07Z","published":"2023-01-23T13:34:01Z","title":"Rethinking Real-world Image Deraining via An Unpaired\n Degradation-Conditioned Diffusion Model","summary":" Recent diffusion models have exhibited great potential in generative modeling\ntasks. Part of their success can be attributed to the ability of training\nstable on huge sets of paired synthetic data. However, adapting these models to\nreal-world image deraining remains difficult for two aspects. First, collecting\na large-scale paired real-world clean/rainy dataset is unavailable while\nregular conditional diffusion models heavily rely on paired data for training.\nSecond, real-world rain usually reflects real-world scenarios with a variety of\nunknown rain degradation types, which poses a significant challenge for the\ngenerative modeling process. To meet these challenges, we propose RainDiff, the\nfirst real-world image deraining paradigm based on diffusion models, serving as\na new standard bar for real-world image deraining. We address the first\nchallenge by introducing a stable and non-adversarial unpaired cycle-consistent\narchitecture that can be trained, end-to-end, with only unpaired data for\nsupervision; and the second challenge by proposing a degradation-conditioned\ndiffusion model that refines the desired output via a diffusive generative\nprocess conditioned by learned priors of multiple rain degradations. 
Extensive\nexperiments confirm the superiority of our RainDiff over existing\nunpaired/semi-supervised methods and show its competitive advantages over\nseveral fully-supervised ones.\n","authors":["Yiyang Shen","Mingqiang Wei","Yongzhen Wang","Xueyang Fu","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2301.09430v4.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2404.17335v2","updated":"2024-05-01T08:54:54Z","published":"2024-04-26T11:32:53Z","title":"A Novel Spike Transformer Network for Depth Estimation from Event\n Cameras via Cross-modality Knowledge Distillation","summary":" Depth estimation is crucial for interpreting complex environments, especially\nin areas such as autonomous vehicle navigation and robotics. Nonetheless,\nobtaining accurate depth readings from event camera data remains a formidable\nchallenge. Event cameras operate differently from traditional digital cameras,\ncontinuously capturing data and generating asynchronous binary spikes that\nencode time, location, and light intensity. Yet, the unique sampling mechanisms\nof event cameras render standard image based algorithms inadequate for\nprocessing spike data. This necessitates the development of innovative,\nspike-aware algorithms tailored for event cameras, a task compounded by the\nirregularity, continuity, noise, and spatial and temporal characteristics\ninherent in spiking data.Harnessing the strong generalization capabilities of\ntransformer neural networks for spatiotemporal data, we propose a purely\nspike-driven spike transformer network for depth estimation from spiking camera\ndata. To address performance limitations with Spiking Neural Networks (SNN), we\nintroduce a novel single-stage cross-modality knowledge transfer framework\nleveraging knowledge from a large vision foundational model of artificial\nneural networks (ANN) (DINOv2) to enhance the performance of SNNs with limited\ndata. Our experimental results on both synthetic and real datasets show\nsubstantial improvements over existing models, with notable gains in Absolute\nRelative and Square Relative errors (49% and 39.77% improvements over the\nbenchmark model Spike-T, respectively). Besides accuracy, the proposed model\nalso demonstrates reduced power consumptions, a critical factor for practical\napplications.\n","authors":["Xin Zhang","Liangxiu Han","Tam Sobeih","Lianghao Han","Darren Dancey"],"pdf_url":"https://arxiv.org/pdf/2404.17335v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.03443v4","updated":"2024-05-01T08:39:50Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. 
We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v4.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2405.00384v1","updated":"2024-05-01T08:30:58Z","published":"2024-05-01T08:30:58Z","title":"Visual and audio scene classification for detecting discrepancies in\n video: a baseline method and experimental protocol","summary":" This paper presents a baseline approach and an experimental protocol for a\nspecific content verification problem: detecting discrepancies between the\naudio and video modalities in multimedia content. We first design and optimize\nan audio-visual scene classifier, to compare with existing classification\nbaselines that use both modalities. Then, by applying this classifier\nseparately to the audio and the visual modality, we can detect scene-class\ninconsistencies between them. To facilitate further research and provide a\ncommon evaluation platform, we introduce an experimental protocol and a\nbenchmark dataset simulating such inconsistencies. Our approach achieves\nstate-of-the-art results in scene classification and promising outcomes in\naudio-visual discrepancies detection, highlighting its potential in content\nverification applications.\n","authors":["Konstantinos Apostolidis","Jakob Abesser","Luca Cuccovillo","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2405.00384v1.pdf","comment":"Accepted for publication, 3rd ACM Int. Workshop on Multimedia AI\n against Disinformation (MAD'24) at ACM ICMR'24, June 10, 2024, Phuket,\n Thailand. This is the \"accepted version\""},{"id":"http://arxiv.org/abs/2405.00378v1","updated":"2024-05-01T08:17:43Z","published":"2024-05-01T08:17:43Z","title":"Adaptive Bidirectional Displacement for Semi-Supervised Medical Image\n Segmentation","summary":" Consistency learning is a central strategy to tackle unlabeled data in\nsemi-supervised medical image segmentation (SSMIS), which enforces the model to\nproduce consistent predictions under the perturbation. However, most current\napproaches solely focus on utilizing a specific single perturbation, which can\nonly cope with limited cases, while employing multiple perturbations\nsimultaneously is hard to guarantee the quality of consistency learning. In\nthis paper, we propose an Adaptive Bidirectional Displacement (ABD) approach to\nsolve the above challenge. Specifically, we first design a bidirectional patch\ndisplacement based on reliable prediction confidence for unlabeled data to\ngenerate new samples, which can effectively suppress uncontrollable regions and\nstill retain the influence of input perturbations. Meanwhile, to enforce the\nmodel to learn the potentially uncontrollable content, a bidirectional\ndisplacement operation with inverse confidence is proposed for the labeled\nimages, which generates samples with more unreliable information to facilitate\nmodel learning. Extensive experiments show that ABD achieves new\nstate-of-the-art performances for SSMIS, significantly improving different\nbaselines. 
Source code is available at https://github.com/chy-upc/ABD.\n","authors":["Hanyang Chi","Jian Pang","Bingfeng Zhang","Weifeng Liu"],"pdf_url":"https://arxiv.org/pdf/2405.00378v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2308.03290v2","updated":"2024-05-01T08:16:21Z","published":"2023-08-07T04:17:19Z","title":"FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization\n Search","summary":" Quantization has become a mainstream compression technique for reducing model\nsize, computational requirements, and energy consumption for modern deep neural\nnetworks (DNNs). With improved numerical support in recent hardware, including\nmultiple variants of integer and floating point, mixed-precision quantization\nhas become necessary to achieve high-quality results with low model cost. Prior\nmixed-precision methods have performed either a post-training quantization\nsearch, which compromises on accuracy, or a differentiable quantization search,\nwhich leads to high memory usage from branching. Therefore, we propose the\nfirst one-shot mixed-precision quantization search that eliminates the need for\nretraining in both integer and low-precision floating point models. We evaluate\nour search (FLIQS) on multiple convolutional and vision transformer networks to\ndiscover Pareto-optimal models. Our approach improves upon uniform precision,\nmanual mixed-precision, and recent integer quantization search methods. With\ninteger models, we increase the accuracy of ResNet-18 on ImageNet by 1.31% and\nResNet-50 by 0.90% with equivalent model cost over previous methods.\nAdditionally, for the first time, we explore a novel mixed-precision\nfloating-point search and improve MobileNetV2 by up to 0.98% compared to prior\nstate-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously search\na joint quantization and neural architecture space and improve the ImageNet\naccuracy by 2.69% with similar model cost on a MobileNetV2 search space.\n","authors":["Jordan Dotzel","Gang Wu","Andrew Li","Muhammad Umar","Yun Ni","Mohamed S. Abdelfattah","Zhiru Zhang","Liqun Cheng","Martin G. Dixon","Norman P. Jouppi","Quoc V. Le","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.03290v2.pdf","comment":"Accepted to AutoML 2024"},{"id":"http://arxiv.org/abs/2311.11210v2","updated":"2024-05-01T08:05:24Z","published":"2023-11-19T03:25:14Z","title":"HiH: A Multi-modal Hierarchy in Hierarchy Network for Unconstrained Gait\n Recognition","summary":" Gait recognition has achieved promising advances in controlled settings, yet\nit significantly struggles in unconstrained environments due to challenges such\nas view changes, occlusions, and varying walking speeds. Additionally, efforts\nto fuse multiple modalities often face limited improvements because of\ncross-modality incompatibility, particularly in outdoor scenarios. To address\nthese issues, we present a multi-modal Hierarchy in Hierarchy network (HiH)\nthat integrates silhouette and pose sequences for robust gait recognition. HiH\nfeatures a main branch that utilizes Hierarchical Gait Decomposer (HGD) modules\nfor depth-wise and intra-module hierarchical examination of general gait\npatterns from silhouette data. This approach captures motion hierarchies from\noverall body dynamics to detailed limb movements, facilitating the\nrepresentation of gait attributes across multiple spatial resolutions.\nComplementing this, an auxiliary branch, based on 2D joint sequences, enriches\nthe spatial and temporal aspects of gait analysis. 
It employs a Deformable\nSpatial Enhancement (DSE) module for pose-guided spatial attention and a\nDeformable Temporal Alignment (DTA) module for aligning motion dynamics through\nlearned temporal offsets. Extensive evaluations across diverse indoor and\noutdoor datasets demonstrate HiH's state-of-the-art performance, affirming a\nwell-balanced trade-off between accuracy and efficiency.\n","authors":["Lei Wang","Bo Liu","Yinchi Ma","Fangfang Liang","Nawei Guo"],"pdf_url":"https://arxiv.org/pdf/2311.11210v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00355v1","updated":"2024-05-01T07:16:49Z","published":"2024-05-01T07:16:49Z","title":"Exploring Self-Supervised Vision Transformers for Deepfake Detection: A\n Comparative Analysis","summary":" This paper investigates the effectiveness of self-supervised pre-trained\ntransformers compared to supervised pre-trained transformers and conventional\nneural networks (ConvNets) for detecting various types of deepfakes. We focus\non their potential for improved generalization, particularly when training data\nis limited. Despite the notable success of large vision-language models\nutilizing transformer architectures in various tasks, including zero-shot and\nfew-shot learning, the deepfake detection community has still shown some\nreluctance to adopt pre-trained vision transformers (ViTs), especially large\nones, as feature extractors. One concern is their perceived excessive capacity,\nwhich often demands extensive data, and the resulting suboptimal generalization\nwhen training or fine-tuning data is small or less diverse. This contrasts\npoorly with ConvNets, which have already established themselves as robust\nfeature extractors. Additionally, training and optimizing transformers from\nscratch requires significant computational resources, making this accessible\nprimarily to large companies and hindering broader investigation within the\nacademic community. Recent advancements in using self-supervised learning (SSL)\nin transformers, such as DINO and its derivatives, have showcased significant\nadaptability across diverse vision tasks and possess explicit semantic\nsegmentation capabilities. By leveraging DINO for deepfake detection with\nmodest training data and implementing partial fine-tuning, we observe\ncomparable adaptability to the task and the natural explainability of the\ndetection result via the attention mechanism. Moreover, partial fine-tuning of\ntransformers for deepfake detection offers a more resource-efficient\nalternative, requiring significantly fewer computational resources.\n","authors":["Huy H. Nguyen","Junichi Yamagishi","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2405.00355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00354v1","updated":"2024-05-01T07:16:03Z","published":"2024-05-01T07:16:03Z","title":"CrossMatch: Enhance Semi-Supervised Medical Image Segmentation with\n Perturbation Strategies and Knowledge Distillation","summary":" Semi-supervised learning for medical image segmentation presents a unique\nchallenge of efficiently using limited labeled data while leveraging abundant\nunlabeled data. Despite advancements, existing methods often do not fully\nexploit the potential of the unlabeled data for enhancing model robustness and\naccuracy. In this paper, we introduce CrossMatch, a novel framework that\nintegrates knowledge distillation with dual perturbation strategies-image-level\nand feature-level-to improve the model's learning from both labeled and\nunlabeled data. 
CrossMatch employs multiple encoders and decoders to generate\ndiverse data streams, which undergo self-knowledge distillation to enhance\nconsistency and reliability of predictions across varied perturbations. Our\nmethod significantly surpasses other state-of-the-art techniques in standard\nbenchmarks by effectively minimizing the gap between training on labeled and\nunlabeled data and improving edge accuracy and generalization in medical image\nsegmentation. The efficacy of CrossMatch is demonstrated through extensive\nexperimental validations, showing remarkable performance improvements without\nincreasing computational costs. Code for this implementation is made available\nat https://github.com/AiEson/CrossMatch.git.\n","authors":["Bin Zhao","Chunshi Wang","Shuxue Ding"],"pdf_url":"https://arxiv.org/pdf/2405.00354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00351v1","updated":"2024-05-01T07:08:24Z","published":"2024-05-01T07:08:24Z","title":"Learning High-Quality Navigation and Zooming on Omnidirectional Images\n in Virtual Reality","summary":" Viewing omnidirectional images (ODIs) in virtual reality (VR) represents a\nnovel form of media that provides immersive experiences for users to navigate\nand interact with digital content. Nonetheless, this sense of immersion can be\ngreatly compromised by a blur effect that masks details and hampers the user's\nability to engage with objects of interest. In this paper, we present a novel\nsystem, called OmniVR, designed to enhance visual clarity during VR navigation.\nOur system enables users to effortlessly locate and zoom in on the objects of\ninterest in VR. It captures user commands for navigation and zoom, converting\nthese inputs into parameters for the Mobius transformation matrix. Leveraging\nthese parameters, the ODI is refined using a learning-based algorithm. The\nresultant ODI is presented within the VR media, effectively reducing blur and\nincreasing user engagement. To verify the effectiveness of our system, we first\nevaluate our algorithm with state-of-the-art methods on public datasets, which\nachieves the best performance. Furthermore, we undertake a comprehensive user\nstudy to evaluate viewer experiences across diverse scenarios and to gather\ntheir qualitative feedback from multiple perspectives. The outcomes reveal that\nour system enhances user engagement by improving the viewers' recognition,\nreducing discomfort, and improving the overall immersive experience. Our system\nmakes the navigation and zoom more user-friendly.\n","authors":["Zidong Cao","Zhan Wang","Yexin Liu","Yan-Pei Cao","Ying Shan","Wei Zeng","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2405.00351v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2403.00549v3","updated":"2024-05-01T06:50:59Z","published":"2024-03-01T14:18:00Z","title":"Relaxometry Guided Quantitative Cardiac Magnetic Resonance Image\n Reconstruction","summary":" Deep learning-based methods have achieved prestigious performance for\nmagnetic resonance imaging (MRI) reconstruction, enabling fast imaging for many\nclinical applications. Previous methods employ convolutional networks to learn\nthe image prior as the regularization term. In quantitative MRI, the physical\nmodel of nuclear magnetic resonance relaxometry is known, providing additional\nprior knowledge for image reconstruction. However, traditional reconstruction\nnetworks are limited to learning the spatial domain prior knowledge, ignoring\nthe relaxometry prior. 
Therefore, we propose a relaxometry-guided quantitative\nMRI reconstruction framework to learn the spatial prior from data and the\nrelaxometry prior from MRI physics. Additionally, we also evaluated the\nperformance of two popular reconstruction backbones, namely, recurrent\nvariational networks (RVN) and variational networks (VN) with U- Net.\nExperiments demonstrate that the proposed method achieves highly promising\nresults in quantitative MRI reconstruction.\n","authors":["Yidong Zhao","Yi Zhang","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2403.00549v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06278v3","updated":"2024-05-01T06:49:03Z","published":"2022-12-12T23:12:19Z","title":"Efficient Bayesian Uncertainty Estimation for nnU-Net","summary":" The self-configuring nnU-Net has achieved leading performance in a large\nrange of medical image segmentation challenges. It is widely considered as the\nmodel of choice and a strong baseline for medical image segmentation. However,\ndespite its extraordinary performance, nnU-Net does not supply a measure of\nuncertainty to indicate its possible failure. This can be problematic for\nlarge-scale image segmentation applications, where data are heterogeneous and\nnnU-Net may fail without notice. In this work, we introduce a novel method to\nestimate nnU-Net uncertainty for medical image segmentation. We propose a\nhighly effective scheme for posterior sampling of weight space for Bayesian\nuncertainty estimation. Different from previous baseline methods such as Monte\nCarlo Dropout and mean-field Bayesian Neural Networks, our proposed method does\nnot require a variational architecture and keeps the original nnU-Net\narchitecture intact, thereby preserving its excellent performance and ease of\nuse. Additionally, we boost the segmentation performance over the original\nnnU-Net via marginalizing multi-modal posterior models. We applied our method\non the public ACDC and M&M datasets of cardiac MRI and demonstrated improved\nuncertainty estimation over a range of baseline methods. The proposed method\nfurther strengthens nnU-Net for medical image segmentation in terms of both\nsegmentation accuracy and quality control.\n","authors":["Yidong Zhao","Changchun Yang","Artur Schweidtmann","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2212.06278v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07330v2","updated":"2024-05-01T06:33:56Z","published":"2024-02-11T23:39:42Z","title":"Expert-Adaptive Medical Image Segmentation","summary":" Medical image segmentation (MIS) plays an instrumental role in medical image\nanalysis, where considerable effort has been devoted to automating the process.\nCurrently, mainstream MIS approaches are based on deep neural networks (DNNs),\nwhich are typically trained on a dataset with annotations produced by certain\nmedical experts. In the medical domain, the annotations generated by different\nexperts can be inherently distinct due to complexity of medical images and\nvariations in expertise and post-segmentation missions. Consequently, the DNN\nmodel trained on the data annotated by some experts may hardly adapt to a new\nexpert. In this work, we evaluate a customised expert-adaptive method,\ncharacterised by multi-expert annotation, multi-task DNN-based model training,\nand lightweight model fine-tuning, to investigate model's adaptivity to a new\nexpert in the situation where the amount and mobility of training images are\nlimited. 
Experiments conducted on brain MRI segmentation tasks with limited\ntraining data demonstrate its effectiveness and the impact of its key\nparameters.\n","authors":["Binyan Hu","A. K. Qin"],"pdf_url":"https://arxiv.org/pdf/2402.07330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00340v1","updated":"2024-05-01T06:26:35Z","published":"2024-05-01T06:26:35Z","title":"NC-SDF: Enhancing Indoor Scene Reconstruction Using Neural SDFs with\n View-Dependent Normal Compensation","summary":" State-of-the-art neural implicit surface representations have achieved\nimpressive results in indoor scene reconstruction by incorporating monocular\ngeometric priors as additional supervision. However, we have observed that\nmulti-view inconsistency between such priors poses a challenge for high-quality\nreconstructions. In response, we present NC-SDF, a neural signed distance field\n(SDF) 3D reconstruction framework with view-dependent normal compensation (NC).\nSpecifically, we integrate view-dependent biases in monocular normal priors\ninto the neural implicit representation of the scene. By adaptively learning\nand correcting the biases, our NC-SDF effectively mitigates the adverse impact\nof inconsistent supervision, enhancing both the global consistency and local\ndetails in the reconstructions. To further refine the details, we introduce an\ninformative pixel sampling strategy to pay more attention to intricate geometry\nwith higher information content. Additionally, we design a hybrid geometry\nmodeling approach to improve the neural implicit representation. Experiments on\nsynthetic and real-world datasets demonstrate that NC-SDF outperforms existing\napproaches in terms of reconstruction quality.\n","authors":["Ziyi Chen","Xiaolong Wu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.00340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00209v2","updated":"2024-05-01T06:14:44Z","published":"2024-03-01T00:59:50Z","title":"ChartReformer: Natural Language-Driven Chart Image Editing","summary":" Chart visualizations are essential for data interpretation and communication;\nhowever, most charts are only accessible in image format and lack the\ncorresponding data tables and supplementary information, making it difficult to\nalter their appearance for different application scenarios. To eliminate the\nneed for original underlying data and information to perform chart editing, we\npropose ChartReformer, a natural language-driven chart image editing solution\nthat directly edits the charts from the input images with the given instruction\nprompts. The key in this method is that we allow the model to comprehend the\nchart and reason over the prompt to generate the corresponding underlying data\ntable and visual attributes for new charts, enabling precise edits.\nAdditionally, to generalize ChartReformer, we define and standardize various\ntypes of chart editing, covering style, layout, format, and data-centric edits.\nThe experiments show promising results for the natural language-driven chart\nimage editing.\n","authors":["Pengyu Yan","Mahesh Bhosale","Jay Lal","Bikhyat Adhikari","David Doermann"],"pdf_url":"https://arxiv.org/pdf/2403.00209v2.pdf","comment":"Published in ICDAR 2024. 
Code and model are available at\n https://github.com/pengyu965/ChartReformer"},{"id":"http://arxiv.org/abs/2402.00724v2","updated":"2024-05-01T05:46:56Z","published":"2024-02-01T16:14:54Z","title":"Automatic Segmentation of the Spinal Cord Nerve Rootlets","summary":" Precise identification of spinal nerve rootlets is relevant to delineate\nspinal levels for the study of functional activity in the spinal cord. The goal\nof this study was to develop an automatic method for the semantic segmentation\nof spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI)\nscans. Images from two open-access MRI datasets were used to train a 3D\nmulti-class convolutional neural network using an active learning approach to\nsegment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal\nlevel. The method was tested on 3T T2-weighted images from datasets unseen\nduring training to assess inter-site, inter-session, and inter-resolution\nvariability. The test Dice score was 0.67 +- 0.16 (mean +- standard deviation\nacross testing images and rootlets levels), suggesting a good performance. The\nmethod also demonstrated low inter-vendor and inter-site variability\n(coefficient of variation <= 1.41 %), as well as low inter-session variability\n(coefficient of variation <= 1.30 %) indicating stable predictions across\ndifferent MRI vendors, sites, and sessions. The proposed methodology is\nopen-source and readily available in the Spinal Cord Toolbox (SCT) v6.2 and\nhigher.\n","authors":["Jan Valosek","Theo Mathieu","Raphaelle Schlienger","Olivia S. Kowalczyk","Julien Cohen-Adad"],"pdf_url":"https://arxiv.org/pdf/2402.00724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09378v3","updated":"2024-05-01T05:18:22Z","published":"2024-04-14T23:30:35Z","title":"Orientation-conditioned Facial Texture Mapping for Video-based Facial\n Remote Photoplethysmography Estimation","summary":" Camera-based remote photoplethysmography (rPPG) enables contactless\nmeasurement of important physiological signals such as pulse rate (PR).\nHowever, dynamic and unconstrained subject motion introduces significant\nvariability into the facial appearance in video, confounding the ability of\nvideo-based methods to accurately extract the rPPG signal. In this study, we\nleverage the 3D facial surface to construct a novel orientation-conditioned\nfacial texture video representation which improves the motion robustness of\nexisting video-based facial rPPG estimation methods. Our proposed method\nachieves a significant 18.2% performance improvement in cross-dataset testing\non MMPD over our baseline using the PhysNet model trained on PURE, highlighting\nthe efficacy and generalization benefits of our designed video representation.\nWe demonstrate significant performance improvements of up to 29.6% in all\ntested motion scenarios in cross-dataset testing on MMPD, even in the presence\nof dynamic and unconstrained subject motion, emphasizing the benefits of\ndisentangling motion through modeling the 3D facial surface for motion robust\nfacial rPPG estimation. We validate the efficacy of our design decisions and\nthe impact of different video processing steps through an ablation study. Our\nfindings illustrate the potential strengths of exploiting the 3D facial surface\nas a general strategy for addressing dynamic and unconstrained subject motion\nin videos. 
The code is available at\nhttps://samcantrill.github.io/orientation-uv-rppg/.\n","authors":["Sam Cantrill","David Ahmedt-Aristizabal","Lars Petersson","Hanna Suominen","Mohammad Ali Armin"],"pdf_url":"https://arxiv.org/pdf/2404.09378v3.pdf","comment":"12 pages, 8 figures, 6 tables; minor corrections"},{"id":"http://arxiv.org/abs/2302.06358v5","updated":"2024-05-01T05:10:50Z","published":"2023-02-13T13:44:52Z","title":"Anticipating Next Active Objects for Egocentric Videos","summary":" This paper addresses the problem of anticipating the next-active-object\nlocation in the future, for a given egocentric video clip where the contact\nmight happen, before any action takes place. The problem is considerably hard,\nas we aim at estimating the position of such objects in a scenario where the\nobserved clip and the action segment are separated by the so-called ``time to\ncontact'' (TTC) segment. Many methods have been proposed to anticipate the\naction of a person based on previous hand movements and interactions with the\nsurroundings. However, there have been no attempts to investigate the next\npossible interactable object, and its future location with respect to the\nfirst-person's motion and the field-of-view drift during the TTC window. We\ndefine this as the task of Anticipating the Next ACTive Object (ANACTO). To\nthis end, we propose a transformer-based self-attention framework to identify\nand locate the next-active-object in an egocentric clip.\n We benchmark our method on three datasets: EpicKitchens-100, EGTEA+ and\nEgo4D. We also provide annotations for the first two datasets. Our approach\nperforms best compared to relevant baseline methods. We also conduct ablation\nstudies to understand the effectiveness of the proposed and baseline methods on\nvarying conditions. Code and ANACTO task annotations will be made available\nupon paper acceptance.\n","authors":["Sanket Thakur","Cigdem Beyan","Pietro Morerio","Vittorio Murino","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2302.06358v5.pdf","comment":"Accepted by IEEE ACCESS, this paper carries the Manuscript DOI:\n 10.1109/ACCESS.2024.3395282. The complete peer-reviewed version is available\n via this DOI, while the arXiv version is a post-author manuscript without\n peer-review"},{"id":"http://arxiv.org/abs/2308.00692v3","updated":"2024-05-01T05:10:13Z","published":"2023-08-01T17:50:17Z","title":"LISA: Reasoning Segmentation via Large Language Model","summary":" Although perception systems have made remarkable advancements in recent\nyears, they still rely on explicit human instruction or pre-defined categories\nto identify the target objects before executing visual recognition tasks. Such\nsystems cannot actively reason and comprehend implicit user intention. In this\nwork, we propose a new segmentation task -- reasoning segmentation. The task is\ndesigned to output a segmentation mask given a complex and implicit query text.\nFurthermore, we establish a benchmark comprising over one thousand\nimage-instruction-mask data samples, incorporating intricate reasoning and\nworld knowledge for evaluation purposes. Finally, we present LISA: large\nLanguage Instructed Segmentation Assistant, which inherits the language\ngeneration capabilities of multimodal Large Language Models (LLMs) while also\npossessing the ability to produce segmentation masks. We expand the original\nvocabulary with a token and propose the embedding-as-mask paradigm to\nunlock the segmentation capability. 
Remarkably, LISA can handle cases involving\ncomplex reasoning and world knowledge. Also, it demonstrates robust zero-shot\ncapability when trained exclusively on reasoning-free datasets. In addition,\nfine-tuning the model with merely 239 reasoning segmentation data samples\nresults in further performance enhancement. Both quantitative and qualitative\nexperiments show our method effectively unlocks new reasoning segmentation\ncapabilities for multimodal LLMs. Code, models, and data are available at\nhttps://github.com/dvlab-research/LISA.\n","authors":["Xin Lai","Zhuotao Tian","Yukang Chen","Yanwei Li","Yuhui Yuan","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2308.00692v3.pdf","comment":"Code, models, and data are available at\n https://github.com/dvlab-research/LISA"},{"id":"http://arxiv.org/abs/2405.00318v1","updated":"2024-05-01T04:51:10Z","published":"2024-05-01T04:51:10Z","title":"Covariant spatio-temporal receptive fields for neuromorphic computing","summary":" Biological nervous systems constitute important sources of inspiration\ntowards computers that are faster, cheaper, and more energy efficient.\nNeuromorphic disciplines view the brain as a coevolved system, simultaneously\noptimizing the hardware and the algorithms running on it. There are clear\nefficiency gains when bringing the computations into a physical substrate, but\nwe presently lack theories to guide efficient implementations. Here, we present\na principled computational model for neuromorphic systems in terms of\nspatio-temporal receptive fields, based on affine Gaussian kernels over space\nand leaky-integrator and leaky integrate-and-fire models over time. Our theory\nis provably covariant to spatial affine and temporal scaling transformations,\nand with close similarities to the visual processing in mammalian brains. We\nuse these spatio-temporal receptive fields as a prior in an event-based vision\ntask, and show that this improves the training of spiking networks, which\notherwise is known as problematic for event-based vision. This work combines\nefforts within scale-space theory and computational neuroscience to identify\ntheoretically well-founded ways to process spatio-temporal signals in\nneuromorphic systems. Our contributions are immediately relevant for signal\nprocessing and event-based vision, and can be extended to other processing\ntasks over space and time, such as memory and control.\n","authors":["Jens Egholm Pedersen","Jörg Conradt","Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2405.00318v1.pdf","comment":"Code available at https://github.com/jegp/nrf"},{"id":"http://arxiv.org/abs/2404.18399v2","updated":"2024-05-01T04:42:39Z","published":"2024-04-29T03:21:05Z","title":"Semantic Line Combination Detector","summary":" A novel algorithm, called semantic line combination detector (SLCD), to find\nan optimal combination of semantic lines is proposed in this paper. It\nprocesses all lines in each line combination at once to assess the overall\nharmony of the lines. First, we generate various line combinations from\nreliable lines. Second, we estimate the score of each line combination and\ndetermine the best one. Experimental results demonstrate that the proposed SLCD\noutperforms existing semantic line detectors on various datasets. Moreover, it\nis shown that SLCD can be applied effectively to three vision tasks of\nvanishing point detection, symmetry axis detection, and composition-based image\nretrieval. 
Our codes are available at https://github.com/Jinwon-Ko/SLCD.\n","authors":["Jinwon Ko","Dongkwon Jin","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2404.18399v2.pdf","comment":"CVPR 2024 accepted"},{"id":"http://arxiv.org/abs/2405.00314v1","updated":"2024-05-01T04:32:07Z","published":"2024-05-01T04:32:07Z","title":"Model Quantization and Hardware Acceleration for Vision Transformers: A\n Comprehensive Survey","summary":" Vision Transformers (ViTs) have recently garnered considerable attention,\nemerging as a promising alternative to convolutional neural networks (CNNs) in\nseveral vision-related applications. However, their large model sizes and high\ncomputational and memory demands hinder deployment, especially on\nresource-constrained devices. This underscores the necessity of\nalgorithm-hardware co-design specific to ViTs, aiming to optimize their\nperformance by tailoring both the algorithmic structure and the underlying\nhardware accelerator to each other's strengths. Model quantization, by\nconverting high-precision numbers to lower-precision, reduces the computational\ndemands and memory needs of ViTs, allowing the creation of hardware\nspecifically optimized for these quantized algorithms, boosting efficiency.\nThis article provides a comprehensive survey of ViTs quantization and its\nhardware acceleration. We first delve into the unique architectural attributes\nof ViTs and their runtime characteristics. Subsequently, we examine the\nfundamental principles of model quantization, followed by a comparative\nanalysis of the state-of-the-art quantization techniques for ViTs.\nAdditionally, we explore the hardware acceleration of quantized ViTs,\nhighlighting the importance of hardware-friendly algorithm design. In\nconclusion, this article will discuss ongoing challenges and future research\npaths. We consistently maintain the related open-source materials at\nhttps://github.com/DD-DuDa/awesome-vit-quantization-acceleration.\n","authors":["Dayou Du","Gu Gong","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2405.00314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00313v1","updated":"2024-05-01T04:30:03Z","published":"2024-05-01T04:30:03Z","title":"Streamlining Image Editing with Layered Diffusion Brushes","summary":" Denoising diffusion models have recently gained prominence as powerful tools\nfor a variety of image generation and manipulation tasks. Building on this, we\npropose a novel tool for real-time editing of images that provides users with\nfine-grained region-targeted supervision in addition to existing prompt-based\ncontrols. Our novel editing technique, termed Layered Diffusion Brushes,\nleverages prompt-guided and region-targeted alteration of intermediate\ndenoising steps, enabling precise modifications while maintaining the integrity\nand context of the input image. We provide an editor based on Layered Diffusion\nBrushes modifications, which incorporates well-known image editing concepts\nsuch as layer masks, visibility toggles, and independent manipulation of\nlayers; regardless of their order. Our system renders a single edit on a\n512x512 image within 140 ms using a high-end consumer GPU, enabling real-time\nfeedback and rapid exploration of candidate edits. We validated our method and\nediting system through a user study involving both natural images (using\ninversion) and generated images, showcasing its usability and effectiveness\ncompared to existing techniques such as InstructPix2Pix and Stable Diffusion\nInpainting for refining images. 
Our approach demonstrates efficacy across a\nrange of tasks, including object attribute adjustments, error correction, and\nsequential prompt-based object placement and manipulation, demonstrating its\nversatility and potential for enhancing creative workflows.\n","authors":["Peyman Gholami","Robert Xiao"],"pdf_url":"https://arxiv.org/pdf/2405.00313v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.00219"},{"id":"http://arxiv.org/abs/2211.05716v2","updated":"2024-05-01T03:31:12Z","published":"2022-11-09T09:38:57Z","title":"Resource-Aware Heterogeneous Federated Learning using Neural\n Architecture Search","summary":" Federated Learning (FL) is extensively used to train AI/ML models in\ndistributed and privacy-preserving settings. Participant edge devices in FL\nsystems typically contain non-independent and identically distributed (Non-IID)\nprivate data and unevenly distributed computational resources. Preserving user\ndata privacy while optimizing AI/ML models in a heterogeneous federated network\nrequires us to address data and system/resource heterogeneity. To address these\nchallenges, we propose Resource-aware Federated Learning (RaFL). RaFL allocates\nresource-aware specialized models to edge devices using Neural Architecture\nSearch (NAS) and allows heterogeneous model architecture deployment by\nknowledge extraction and fusion. Combining NAS and FL enables on-demand\ncustomized model deployment for resource-diverse edge devices. Furthermore, we\npropose a multi-model architecture fusion scheme allowing the aggregation of\nthe distributed learning results. Results demonstrate RaFL's superior resource\nefficiency compared to SoTA.\n","authors":["Sixing Yu","J. Pablo Muñoz","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2211.05716v2.pdf","comment":"Accepted at the 30th International European Conference on Parallel\n and Distributed Computing (Euro-Par 2024)"},{"id":"http://arxiv.org/abs/2405.00293v1","updated":"2024-05-01T03:15:28Z","published":"2024-05-01T03:15:28Z","title":"MoPEFT: A Mixture-of-PEFTs for the Segment Anything Model","summary":" The emergence of foundation models, such as the Segment Anything Model (SAM),\nhas sparked interest in Parameter-Efficient Fine-Tuning (PEFT) methods that\ntailor these large models to application domains outside their training data.\nHowever, different PEFT techniques modify the representation of a model\ndifferently, making it a non-trivial task to select the most appropriate method\nfor the domain of interest. We propose a new framework, Mixture-of-PEFTs\nmethods (MoPEFT), that is inspired by traditional Mixture-of-Experts (MoE)\nmethodologies and is utilized for fine-tuning SAM. Our MoPEFT framework\nincorporates three different PEFT techniques as submodules and dynamically\nlearns to activate the ones that are best suited for a given data-task setup.\nWe test our method on the Segment Anything Model and show that MoPEFT\nconsistently outperforms other fine-tuning methods on the MESS benchmark.\n","authors":["Rajat Sahay","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2405.00293v1.pdf","comment":"Workshop on Foundation Models, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.15789v2","updated":"2024-05-01T02:37:18Z","published":"2024-04-24T10:28:54Z","title":"MotionMaster: Training-free Camera Motion Transfer For Video Generation","summary":" The emergence of diffusion models has greatly propelled the progress in image\nand video generation. 
Recently, some efforts have been made in controllable\nvideo generation, including text-to-video generation and video motion control,\namong which camera motion control is an important topic. However, existing\ncamera motion control methods rely on training a temporal camera module, and\nnecessitate substantial computation resources due to the large amount of\nparameters in video generation models. Moreover, existing methods pre-define\ncamera motion types during training, which limits their flexibility in camera\ncontrol. Therefore, to reduce training costs and achieve flexible camera\ncontrol, we propose COMD, a novel training-free video motion transfer model,\nwhich disentangles camera motions and object motions in source videos and\ntransfers the extracted camera motions to new videos. We first propose a\none-shot camera motion disentanglement method to extract camera motion from a\nsingle source video, which separates the moving objects from the background and\nestimates the camera motion in the moving objects region based on the motion in\nthe background by solving a Poisson equation. Furthermore, we propose a\nfew-shot camera motion disentanglement method to extract the common camera\nmotion from multiple videos with similar camera motions, which employs a\nwindow-based clustering technique to extract the common features in temporal\nattention maps of multiple videos. Finally, we propose a motion combination\nmethod to combine different types of camera motions together, enabling our\nmodel a more controllable and flexible camera control. Extensive experiments\ndemonstrate that our training-free approach can effectively decouple\ncamera-object motion and apply the decoupled camera motion to a wide range of\ncontrollable video generation tasks, achieving flexible and diverse camera\nmotion control.\n","authors":["Teng Hu","Jiangning Zhang","Ran Yi","Yating Wang","Hongrui Huang","Jieyu Weng","Yabiao Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.15789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10182v2","updated":"2024-05-01T02:08:06Z","published":"2023-07-02T11:09:08Z","title":"Enhancing Super-Resolution Networks through Realistic Thick-Slice CT\n Simulation","summary":" Deep learning-based Generative Models have the potential to convert\nlow-resolution CT images into high-resolution counterparts without long\nacquisition times and increased radiation exposure in thin-slice CT imaging.\nHowever, procuring appropriate training data for these Super-Resolution (SR)\nmodels is challenging. Previous SR research has simulated thick-slice CT images\nfrom thin-slice CT images to create training pairs. However, these methods\neither rely on simplistic interpolation techniques that lack realism or\nsinogram reconstruction, which require the release of raw data and complex\nreconstruction algorithms. Thus, we introduce a simple yet realistic method to\ngenerate thick CT images from thin-slice CT images, facilitating the creation\nof training pairs for SR algorithms. The training pairs produced by our method\nclosely resemble real data distributions (PSNR=49.74 vs. 40.66, p$<$0.05). A\nmultivariate Cox regression analysis involving thick slice CT images with lung\nfibrosis revealed that only the radiomics features extracted using our method\ndemonstrated a significant correlation with mortality (HR=1.19 and HR=1.14,\np$<$0.005). 
This paper represents the first to identify and address the\nchallenge of generating appropriate paired training data for Deep\nLearning-based CT SR models, which enhances the efficacy and applicability of\nSR models in real-world scenarios.\n","authors":["Zeyu Tang","Xiaodan Xing","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10182v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2012.12437v2","updated":"2024-05-01T02:07:29Z","published":"2020-12-23T01:24:41Z","title":"Pit30M: A Benchmark for Global Localization in the Age of Self-Driving\n Cars","summary":" We are interested in understanding whether retrieval-based localization\napproaches are good enough in the context of self-driving vehicles. Towards\nthis goal, we introduce Pit30M, a new image and LiDAR dataset with over 30\nmillion frames, which is 10 to 100 times larger than those used in previous\nwork. Pit30M is captured under diverse conditions (i.e., season, weather, time\nof the day, traffic), and provides accurate localization ground truth. We also\nautomatically annotate our dataset with historical weather and astronomical\ndata, as well as with image and LiDAR semantic segmentation as a proxy measure\nfor occlusion. We benchmark multiple existing methods for image and LiDAR\nretrieval and, in the process, introduce a simple, yet effective convolutional\nnetwork-based LiDAR retrieval method that is competitive with the state of the\nart. Our work provides, for the first time, a benchmark for sub-metre\nretrieval-based localization at city scale. The dataset, its Python SDK, as\nwell as more information about the sensors, calibration, and metadata, are\navailable on the project website: https://pit30m.github.io/\n","authors":["Julieta Martinez","Sasha Doubov","Jack Fan","Ioan Andrei Bârsan","Shenlong Wang","Gellért Máttyus","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2012.12437v2.pdf","comment":"Published at IROS 2020"},{"id":"http://arxiv.org/abs/2404.00231v3","updated":"2024-05-01T01:47:43Z","published":"2024-03-30T03:23:52Z","title":"Attention-based Shape-Deformation Networks for Artifact-Free Geometry\n Reconstruction of Lumbar Spine from MR Images","summary":" Lumbar disc degeneration, a progressive structural wear and tear of lumbar\nintervertebral disc, is regarded as an essential role on low back pain, a\nsignificant global health concern. Automated lumbar spine geometry\nreconstruction from MR images will enable fast measurement of medical\nparameters to evaluate the lumbar status, in order to determine a suitable\ntreatment. Existing image segmentation-based techniques often generate\nerroneous segments or unstructured point clouds, unsuitable for medical\nparameter measurement. In this work, we present $\\textit{UNet-DeformSA}$ and\n$\\textit{TransDeformer}$: novel attention-based deep neural networks that\nreconstruct the geometry of the lumbar spine with high spatial accuracy and\nmesh correspondence across patients, and we also present a variant of\n$\\textit{TransDeformer}$ for error estimation. Specially, we devise new\nattention modules with a new attention formula, which integrate image features\nand tokenized contour features to predict the displacements of the points on a\nshape template without the need for image segmentation. The deformed template\nreveals the lumbar spine geometry in an image. 
Experiment results show that our\nnetworks generate artifact-free geometry outputs, and the variant of\n$\\textit{TransDeformer}$ can predict the errors of a reconstructed geometry.\nOur code is available at https://github.com/linchenq/TransDeformer-Mesh.\n","authors":["Linchen Qian","Jiasong Chen","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2404.00231v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02189v5","updated":"2024-05-01T01:32:34Z","published":"2023-11-03T18:44:21Z","title":"FairSeg: A Large-Scale Medical Image Segmentation Dataset for Fairness\n Learning Using Segment Anything Model with Fair Error-Bound Scaling","summary":" Fairness in artificial intelligence models has gained significantly more\nattention in recent years, especially in the area of medicine, as fairness in\nmedical models is critical to people's well-being and lives. High-quality\nmedical fairness datasets are needed to promote fairness learning research.\nExisting medical fairness datasets are all for classification tasks, and no\nfairness datasets are available for medical segmentation, while medical\nsegmentation is an equally important clinical task as classifications, which\ncan provide detailed spatial information on organ abnormalities ready to be\nassessed by clinicians. In this paper, we propose the first fairness dataset\nfor medical segmentation named Harvard-FairSeg with 10,000 subject samples. In\naddition, we propose a fair error-bound scaling approach to reweight the loss\nfunction with the upper error-bound in each identity group, using the segment\nanything model (SAM). We anticipate that the segmentation performance equity\ncan be improved by explicitly tackling the hard cases with high training errors\nin each identity group. To facilitate fair comparisons, we utilize a novel\nequity-scaled segmentation performance metric to compare segmentation metrics\nin the context of fairness, such as the equity-scaled Dice coefficient. Through\ncomprehensive experiments, we demonstrate that our fair error-bound scaling\napproach either has superior or comparable fairness performance to the\nstate-of-the-art fairness learning models. The dataset and code are publicly\naccessible via https://ophai.hms.harvard.edu/datasets/harvard-fairseg10k.\n","authors":["Yu Tian","Min Shi","Yan Luo","Ava Kouhana","Tobias Elze","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2311.02189v5.pdf","comment":"ICLR 2024; Codes available at\n https://github.com/Harvard-Ophthalmology-AI-Lab/FairSeg"},{"id":"http://arxiv.org/abs/2404.19326v2","updated":"2024-05-01T01:30:58Z","published":"2024-04-30T07:50:29Z","title":"LVOS: A Benchmark for Large-scale Long-term Video Object Segmentation","summary":" Video object segmentation (VOS) aims to distinguish and track target objects\nin a video. Despite the excellent performance achieved by off-the-shell VOS\nmodels, existing VOS benchmarks mainly focus on short-term videos lasting about\n5 seconds, where objects remain visible most of the time. However, these\nbenchmarks poorly represent practical applications, and the absence of\nlong-term datasets restricts further investigation of VOS in realistic\nscenarios. Thus, we propose a novel benchmark named LVOS, comprising 720 videos\nwith 296,401 frames and 407,945 high-quality annotations. Videos in LVOS last\n1.14 minutes on average, approximately 5 times longer than videos in existing\ndatasets. 
Each video includes various attributes, especially challenges\nderiving from the wild, such as long-term reappearing and cross-temporal\nsimilar objects. Compared to previous benchmarks, our LVOS better reflects VOS\nmodels' performance in real scenarios. Based on LVOS, we evaluate 20 existing\nVOS models under 4 different settings and conduct a comprehensive analysis. On\nLVOS, these models suffer a large performance drop, highlighting the challenge\nof achieving precise tracking and segmentation in real-world scenarios.\nAttribute-based analysis indicates that key factor to accuracy decline is the\nincreased video length, emphasizing LVOS's crucial role. We hope our LVOS can\nadvance development of VOS in real scenes. Data and code are available at\nhttps://lingyihongfd.github.io/lvos.github.io/.\n","authors":["Lingyi Hong","Zhongying Liu","Wenchao Chen","Chenzhi Tan","Yuang Feng","Xinyu Zhou","Pinxue Guo","Jinglun Li","Zhaoyu Chen","Shuyong Gao","Wei Zhang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.19326v2.pdf","comment":"LVOS V2"},{"id":"http://arxiv.org/abs/2404.19265v2","updated":"2024-05-01T00:51:48Z","published":"2024-04-30T05:11:32Z","title":"Mapping New Realities: Ground Truth Image Creation with Pix2Pix\n Image-to-Image Translation","summary":" Generative Adversarial Networks (GANs) have significantly advanced image\nprocessing, with Pix2Pix being a notable framework for image-to-image\ntranslation. This paper explores a novel application of Pix2Pix to transform\nabstract map images into realistic ground truth images, addressing the scarcity\nof such images crucial for domains like urban planning and autonomous vehicle\ntraining. We detail the Pix2Pix model's utilization for generating\nhigh-fidelity datasets, supported by a dataset of paired map and aerial images,\nand enhanced by a tailored training regimen. The results demonstrate the\nmodel's capability to accurately render complex urban features, establishing\nits efficacy and potential for broad real-world applications.\n","authors":["Zhenglin Li","Bo Guan","Yuanzhou Wei","Yiming Zhou","Jingyu Zhang","Jinxin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.19265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00264v1","updated":"2024-05-01T00:48:55Z","published":"2024-05-01T00:48:55Z","title":"Using Texture to Classify Forests Separately from Vegetation","summary":" Identifying terrain within satellite image data is a key issue in\ngeographical information sciences, with numerous environmental and safety\nimplications. Many techniques exist to derive classifications from spectral\ndata captured by satellites. However, the ability to reliably classify\nvegetation remains a challenge. In particular, no precise methods exist for\nclassifying forest vs. non-forest vegetation in high-level satellite images.\nThis paper provides an initial proposal for a static, algorithmic process to\nidentify forest regions in satellite image data through texture features\ncreated from detected edges and the NDVI ratio captured by Sentinel-2 satellite\nimages. With strong initial results, this paper also identifies the next steps\nto improve the accuracy of the classification and verification processes.\n","authors":["David R. 
Treadwell IV","Derek Jacoby","Will Parkinson","Bruce Maxwell","Yvonne Coady"],"pdf_url":"https://arxiv.org/pdf/2405.00264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00260v1","updated":"2024-05-01T00:30:13Z","published":"2024-05-01T00:30:13Z","title":"CREPE: Coordinate-Aware End-to-End Document Parser","summary":" In this study, we formulate an OCR-free sequence generation model for visual\ndocument understanding (VDU). Our model not only parses text from document\nimages but also extracts the spatial coordinates of the text based on the\nmulti-head architecture. Named as Coordinate-aware End-to-end Document Parser\n(CREPE), our method uniquely integrates these capabilities by introducing a\nspecial token for OCR text, and token-triggered coordinate decoding. We also\nproposed a weakly-supervised framework for cost-efficient training, requiring\nonly parsing annotations without high-cost coordinate annotations. Our\nexperimental evaluations demonstrate CREPE's state-of-the-art performances on\ndocument parsing tasks. Beyond that, CREPE's adaptability is further\nhighlighted by its successful usage in other document understanding tasks such\nas layout analysis, document visual question answering, and so one. CREPE's\nabilities including OCR and semantic parsing not only mitigate error\npropagation issues in existing OCR-dependent methods, it also significantly\nenhance the functionality of sequence generation models, ushering in a new era\nfor document understanding studies.\n","authors":["Yamato Okamoto","Youngmin Baek","Geewook Kim","Ryota Nakao","DongHyun Kim","Moon Bin Yim","Seunghyun Park","Bado Lee"],"pdf_url":"https://arxiv.org/pdf/2405.00260v1.pdf","comment":"Accepted at the International Conference on Document Analysis and\n Recognition (ICDAR 2024) main conference"},{"id":"http://arxiv.org/abs/2405.00256v1","updated":"2024-05-01T00:13:05Z","published":"2024-05-01T00:13:05Z","title":"ASAM: Boosting Segment Anything Model with Adversarial Tuning","summary":" In the evolving landscape of computer vision, foundation models have emerged\nas pivotal tools, exhibiting exceptional adaptability to a myriad of tasks.\nAmong these, the Segment Anything Model (SAM) by Meta AI has distinguished\nitself in image segmentation. However, SAM, like its counterparts, encounters\nlimitations in specific niche applications, prompting a quest for enhancement\nstrategies that do not compromise its inherent capabilities. This paper\nintroduces ASAM, a novel methodology that amplifies SAM's performance through\nadversarial tuning. We harness the potential of natural adversarial examples,\ninspired by their successful implementation in natural language processing. By\nutilizing a stable diffusion model, we augment a subset (1%) of the SA-1B\ndataset, generating adversarial instances that are more representative of\nnatural variations rather than conventional imperceptible perturbations. Our\napproach maintains the photorealism of adversarial examples and ensures\nalignment with original mask annotations, thereby preserving the integrity of\nthe segmentation task. The fine-tuned ASAM demonstrates significant\nimprovements across a diverse range of segmentation tasks without necessitating\nadditional data or architectural modifications. The results of our extensive\nevaluations confirm that ASAM establishes new benchmarks in segmentation tasks,\nthereby contributing to the advancement of foundational models in computer\nvision. 
Our project page is in https://asam2024.github.io/.\n","authors":["Bo Li","Haoke Xiao","Lv Tang"],"pdf_url":"https://arxiv.org/pdf/2405.00256v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.01030v3","updated":"2024-05-01T23:58:22Z","published":"2024-04-01T10:19:05Z","title":"Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and\n Mitigation","summary":" The recent advancement of large and powerful models with Text-to-Image (T2I)\ngeneration abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables\nusers to generate high-quality images from textual prompts. However, it has\nbecome increasingly evident that even simple prompts could cause T2I models to\nexhibit conspicuous social bias in generated images. Such bias might lead to\nboth allocational and representational harms in society, further marginalizing\nminority groups. Noting this problem, a large body of recent works has been\ndedicated to investigating different dimensions of bias in T2I systems.\nHowever, an extensive review of these studies is lacking, hindering a\nsystematic understanding of current progress and research gaps. We present the\nfirst extensive survey on bias in T2I generative models. In this survey, we\nreview prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture.\nSpecifically, we discuss how these works define, evaluate, and mitigate\ndifferent aspects of bias. We found that: (1) while gender and skintone biases\nare widely studied, geo-cultural bias remains under-explored; (2) most works on\ngender and skintone bias investigated occupational association, while other\naspects are less frequently studied; (3) almost all gender bias works overlook\nnon-binary identities in their studies; (4) evaluation datasets and metrics are\nscattered, with no unified framework for measuring biases; and (5) current\nmitigation methods fail to resolve biases comprehensively. Based on current\nlimitations, we point out future research directions that contribute to\nhuman-centric definitions, evaluations, and mitigation of biases. We hope to\nhighlight the importance of studying biases in T2I systems, as well as\nencourage future efforts to holistically understand and tackle biases, building\nfair and trustworthy T2I technologies for everyone.\n","authors":["Yixin Wan","Arjun Subramonian","Anaelia Ovalle","Zongyu Lin","Ashima Suvarna","Christina Chance","Hritik Bansal","Rebecca Pattichis","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2404.01030v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00908v1","updated":"2024-05-01T23:40:12Z","published":"2024-05-01T23:40:12Z","title":"Transformer-Based Self-Supervised Learning for Histopathological\n Classification of Ischemic Stroke Clot Origin","summary":" Background and Purpose: Identifying the thromboembolism source in ischemic\nstroke is crucial for treatment and secondary prevention yet is often\nundetermined. This study describes a self-supervised deep learning approach in\ndigital pathology of emboli for classifying ischemic stroke clot origin from\nhistopathological images. Methods: The dataset included whole slide images\n(WSI) from the STRIP AI Kaggle challenge, consisting of retrieved clots from\nischemic stroke patients following mechanical thrombectomy. Transformer-based\ndeep learning models were developed using transfer learning and self-supervised\npretraining for classifying WSI. 
Customizations included an attention pooling\nlayer, weighted loss function, and threshold optimization. Various model\narchitectures were tested and compared, and model performances were primarily\nevaluated using weighted logarithmic loss. Results: The model achieved a\nlogloss score of 0.662 in cross-validation and 0.659 on the test set. Different\nmodel backbones were compared, with the swin_large_patch4_window12_384 showed\nhigher performance. Thresholding techniques for clot origin classification were\nemployed to balance false positives and negatives. Conclusion: The study\ndemonstrates the extent of efficacy of transformer-based deep learning models\nin identifying ischemic stroke clot origins from histopathological images and\nemphasizes the need for refined modeling techniques specifically adapted to\nthrombi WSI. Further research is needed to improve model performance,\ninterpretability, validate its effectiveness. Future enhancement could include\nintegrating larger patient cohorts, advanced preprocessing strategies, and\nexploring ensemble multimodal methods for enhanced diagnostic accuracy.\n","authors":["K. Yeh","M. S. Jabal","V. Gupta","D. F. Kallmes","W. Brinjikji","B. S. Erdal"],"pdf_url":"https://arxiv.org/pdf/2405.00908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00906v1","updated":"2024-05-01T23:30:12Z","published":"2024-05-01T23:30:12Z","title":"LOTUS: Improving Transformer Efficiency with Sparsity Pruning and Data\n Lottery Tickets","summary":" Vision transformers have revolutionized computer vision, but their\ncomputational demands present challenges for training and deployment. This\npaper introduces LOTUS (LOttery Transformers with Ultra Sparsity), a novel\nmethod that leverages data lottery ticket selection and sparsity pruning to\naccelerate vision transformer training while maintaining accuracy. Our approach\nfocuses on identifying and utilizing the most informative data subsets and\neliminating redundant model parameters to optimize the training process.\nThrough extensive experiments, we demonstrate the effectiveness of LOTUS in\nachieving rapid convergence and high accuracy with significantly reduced\ncomputational requirements. This work highlights the potential of combining\ndata selection and sparsity techniques for efficient vision transformer\ntraining, opening doors for further research and development in this area.\n","authors":["Ojasw Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2405.00906v1.pdf","comment":"3 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.00900v1","updated":"2024-05-01T23:07:12Z","published":"2024-05-01T23:07:12Z","title":"DiL-NeRF: Delving into Lidar for Neural Radiance Field on Street Scenes","summary":" Photorealistic simulation plays a crucial role in applications such as\nautonomous driving, where advances in neural radiance fields (NeRFs) may allow\nbetter scalability through the automatic creation of digital 3D assets.\nHowever, reconstruction quality suffers on street scenes due to largely\ncollinear camera motions and sparser samplings at higher speeds. On the other\nhand, the application often demands rendering from camera views that deviate\nfrom the inputs to accurately simulate behaviors like lane changes. In this\npaper, we propose several insights that allow a better utilization of Lidar\ndata to improve NeRF quality on street scenes. 
First, our framework learns a\ngeometric scene representation from Lidar, which is fused with the implicit\ngrid-based representation for radiance decoding, thereby supplying stronger\ngeometric information offered by explicit point cloud. Second, we put forth a\nrobust occlusion-aware depth supervision scheme, which allows utilizing\ndensified Lidar points by accumulation. Third, we generate augmented training\nviews from Lidar points for further improvement. Our insights translate to\nlargely improved novel view synthesis under real driving scenes.\n","authors":["Shanlin Sun","Bingbing Zhuang","Ziyu Jiang","Buyu Liu","Xiaohui Xie","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2405.00900v1.pdf","comment":"CVPR2024 Highlights"},{"id":"http://arxiv.org/abs/2404.10966v2","updated":"2024-05-01T23:01:13Z","published":"2024-04-17T00:21:36Z","title":"Domain-Specific Block Selection and Paired-View Pseudo-Labeling for\n Online Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test\ndomain without access to source data after deployment. Existing approaches\ntypically rely on self-training with pseudo-labels since ground-truth cannot be\nobtained from test data. Although the quality of pseudo labels is important for\nstable and accurate long-term adaptation, it has not been previously addressed.\nIn this work, we propose DPLOT, a simple yet effective TTA framework that\nconsists of two components: (1) domain-specific block selection and (2)\npseudo-label generation using paired-view images. Specifically, we select\nblocks that involve domain-specific feature extraction and train these blocks\nby entropy minimization. After blocks are adjusted for current test domain, we\ngenerate pseudo-labels by averaging given test images and corresponding flipped\ncounterparts. By simply using flip augmentation, we prevent a decrease in the\nquality of the pseudo-labels, which can be caused by the domain gap resulting\nfrom strong augmentation. Our experimental results demonstrate that DPLOT\noutperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C\nbenchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. Also,\nwe provide an extensive analysis to demonstrate effectiveness of our framework.\nCode is available at\nhttps://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA.\n","authors":["Yeonguk Yu","Sungho Shin","Seunghyeok Back","Minhwan Ko","Sangjun Noh","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10966v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2405.00892v1","updated":"2024-05-01T22:33:45Z","published":"2024-05-01T22:33:45Z","title":"Wake Vision: A Large-scale, Diverse Dataset and Benchmark Suite for\n TinyML Person Detection","summary":" Machine learning applications on extremely low-power devices, commonly\nreferred to as tiny machine learning (TinyML), promises a smarter and more\nconnected world. However, the advancement of current TinyML research is\nhindered by the limited size and quality of pertinent datasets. To address this\nchallenge, we introduce Wake Vision, a large-scale, diverse dataset tailored\nfor person detection -- the canonical task for TinyML visual sensing. Wake\nVision comprises over 6 million images, which is a hundredfold increase\ncompared to the previous standard, and has undergone thorough quality\nfiltering. 
Using Wake Vision for training results in a 2.41\\% increase in\naccuracy compared to the established benchmark. Alongside the dataset, we\nprovide a collection of five detailed benchmark sets that assess model\nperformance on specific segments of the test data, such as varying lighting\nconditions, distances from the camera, and demographic characteristics of\nsubjects. These novel fine-grained benchmarks facilitate the evaluation of\nmodel quality in challenging real-world scenarios that are often ignored when\nfocusing solely on overall accuracy. Through an evaluation of a MobileNetV2\nTinyML model on the benchmarks, we show that the input resolution plays a more\ncrucial role than the model width in detecting distant subjects and that the\nimpact of quantization on model robustness is minimal, thanks to the dataset\nquality. These findings underscore the importance of a detailed evaluation to\nidentify essential factors for model development. The dataset, benchmark suite,\ncode, and models are publicly available under the CC-BY 4.0 license, enabling\ntheir use for commercial use cases.\n","authors":["Colby Banbury","Emil Njor","Matthew Stewart","Pete Warden","Manjunath Kudlur","Nat Jeffries","Xenofon Fafoutis","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2405.00892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10110v3","updated":"2024-05-01T21:54:24Z","published":"2023-05-17T10:18:02Z","title":"Adaptive aggregation of Monte Carlo augmented decomposed filters for\n efficient group-equivariant convolutional neural network","summary":" Group-equivariant convolutional neural networks (G-CNN) heavily rely on\nparameter sharing to increase CNN's data efficiency and performance. However,\nthe parameter-sharing strategy greatly increases the computational burden for\neach added parameter, which hampers its application to deep neural network\nmodels. In this paper, we address these problems by proposing a\nnon-parameter-sharing approach for group equivariant neural networks. The\nproposed methods adaptively aggregate a diverse range of filters by a weighted\nsum of stochastically augmented decomposed filters. We give theoretical proof\nabout how the continuous group convolution can be approximated by our methods.\nOur method applies to both continuous and discrete groups, where the\naugmentation is implemented using Monte Carlo sampling and bootstrap\nresampling, respectively. We demonstrate that our methods serve as an efficient\nextension of standard CNN. Experiments on group equivariance tests show how our\nmethods can achieve superior performance to parameter-sharing group equivariant\nnetworks. Experiments on image classification and image denoising tasks show\nthat in certain scenarios, with a suitable set of filter bases, our method\nhelps improve the performance of standard CNNs and build efficient lightweight\nimage denoising networks. The code will be available at\nhttps://github.com/ZhaoWenzhao/MCG_CNN.\n","authors":["Wenzhao Zhao","Barbara D. Wichtmann","Steffen Albert","Angelika Maurer","Frank G. Zöllner","Ulrike Attenberger","Jürgen Hesser"],"pdf_url":"https://arxiv.org/pdf/2305.10110v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00878v1","updated":"2024-05-01T21:43:57Z","published":"2024-05-01T21:43:57Z","title":"SonicDiffusion: Audio-Driven Image Generation and Editing with\n Pretrained Diffusion Models","summary":" We are witnessing a revolution in conditional image synthesis with the recent\nsuccess of large scale text-to-image generation methods. 
This success also\nopens up new opportunities in controlling the generation and editing process\nusing multi-modal input. While spatial control using cues such as depth,\nsketch, and other images has attracted a lot of research, we argue that another\nequally effective modality is audio since sound and sight are two main\ncomponents of human perception. Hence, we propose a method to enable\naudio-conditioning in large scale image diffusion models. Our method first maps\nfeatures obtained from audio clips to tokens that can be injected into the\ndiffusion model in a fashion similar to text tokens. We introduce additional\naudio-image cross attention layers which we finetune while freezing the weights\nof the original layers of the diffusion model. In addition to audio conditioned\nimage generation, our method can also be utilized in conjunction with diffusion\nbased editing methods to enable audio conditioned image editing. We demonstrate\nour method on a wide range of audio and image datasets. We perform extensive\ncomparisons with recent methods and show favorable performance.\n","authors":["Burak Can Biner","Farrin Marouf Sofian","Umur Berkay Karakaş","Duygu Ceylan","Erkut Erdem","Aykut Erdem"],"pdf_url":"https://arxiv.org/pdf/2405.00878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12388v2","updated":"2024-05-01T21:41:30Z","published":"2024-04-18T17:59:53Z","title":"VideoGigaGAN: Towards Detail-rich Video Super-Resolution","summary":" Video super-resolution (VSR) approaches have shown impressive temporal\nconsistency in upsampled videos. However, these approaches tend to generate\nblurrier results than their image counterparts as they are limited in their\ngenerative capability. This raises a fundamental question: can we extend the\nsuccess of a generative image upsampler to the VSR task while preserving the\ntemporal consistency? We introduce VideoGigaGAN, a new generative VSR model\nthat can produce videos with high-frequency details and temporal consistency.\nVideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply\ninflating GigaGAN to a video model by adding temporal modules produces severe\ntemporal flickering. We identify several key issues and propose techniques that\nsignificantly improve the temporal consistency of upsampled videos. Our\nexperiments show that, unlike previous VSR methods, VideoGigaGAN generates\ntemporally consistent videos with more fine-grained appearance details. We\nvalidate the effectiveness of VideoGigaGAN by comparing it with\nstate-of-the-art VSR models on public datasets and showcasing video results\nwith $8\\times$ super-resolution.\n","authors":["Yiran Xu","Taesung Park","Richard Zhang","Yang Zhou","Eli Shechtman","Feng Liu","Jia-Bin Huang","Difan Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12388v2.pdf","comment":"project page: https://videogigagan.github.io/"},{"id":"http://arxiv.org/abs/2405.00876v1","updated":"2024-05-01T21:35:04Z","published":"2024-05-01T21:35:04Z","title":"Beyond Human Vision: The Role of Large Vision Language Models in\n Microscope Image Analysis","summary":" Vision language models (VLMs) have recently emerged and gained the spotlight\nfor their ability to comprehend the dual modality of image and textual data.\nVLMs such as LLaVA, ChatGPT-4, and Gemini have recently shown impressive\nperformance on tasks such as natural image captioning, visual question\nanswering (VQA), and spatial reasoning. 
Additionally, a universal segmentation\nmodel by Meta AI, Segment Anything Model (SAM) shows unprecedented performance\nat isolating objects from unforeseen images. Since medical experts, biologists,\nand materials scientists routinely examine microscopy or medical images in\nconjunction with textual information in the form of captions, literature, or\nreports, and draw conclusions of great importance and merit, it is indubitably\nessential to test the performance of VLMs and foundation models such as SAM, on\nthese images. In this study, we charge ChatGPT, LLaVA, Gemini, and SAM with\nclassification, segmentation, counting, and VQA tasks on a variety of\nmicroscopy images. We observe that ChatGPT and Gemini are impressively able to\ncomprehend the visual features in microscopy images, while SAM is quite capable\nat isolating artefacts in a general sense. However, the performance is not\nclose to that of a domain expert - the models are readily encumbered by the\nintroduction of impurities, defects, artefact overlaps and diversity present in\nthe images.\n","authors":["Prateek Verma","Minh-Hao Van","Xintao Wu"],"pdf_url":"https://arxiv.org/pdf/2405.00876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00858v1","updated":"2024-05-01T20:47:06Z","published":"2024-05-01T20:47:06Z","title":"Guided Conditional Diffusion Classifier (ConDiff) for Enhanced\n Prediction of Infection in Diabetic Foot Ulcers","summary":" To detect infected wounds in Diabetic Foot Ulcers (DFUs) from photographs,\npreventing severe complications and amputations. Methods: This paper proposes\nthe Guided Conditional Diffusion Classifier (ConDiff), a novel deep-learning\ninfection detection model that combines guided image synthesis with a denoising\ndiffusion model and distance-based classification. The process involves (1)\ngenerating guided conditional synthetic images by injecting Gaussian noise to a\nguide image, followed by denoising the noise-perturbed image through a reverse\ndiffusion process, conditioned on infection status and (2) classifying\ninfections based on the minimum Euclidean distance between synthesized images\nand the original guide image in embedding space. Results: ConDiff demonstrated\nsuperior performance with an accuracy of 83% and an F1-score of 0.858,\noutperforming state-of-the-art models by at least 3%. The use of a triplet loss\nfunction reduces overfitting in the distance-based classifier. Conclusions:\nConDiff not only enhances diagnostic accuracy for DFU infections but also\npioneers the use of generative discriminative models for detailed medical image\nanalysis, offering a promising approach for improving patient outcomes.\n","authors":["Palawat Busaranuvong","Emmanuel Agu","Deepak Kumar","Shefalika Gautam","Reza Saadati Fard","Bengisu Tulu","Diane Strong"],"pdf_url":"https://arxiv.org/pdf/2405.00858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00857v1","updated":"2024-05-01T20:46:04Z","published":"2024-05-01T20:46:04Z","title":"Brighteye: Glaucoma Screening with Color Fundus Photographs based on\n Vision Transformer","summary":" Differences in image quality, lighting conditions, and patient demographics\npose challenges to automated glaucoma detection from color fundus photography.\nBrighteye, a method based on Vision Transformer, is proposed for glaucoma\ndetection and glaucomatous feature classification. Brighteye learns long-range\nrelationships among pixels within large fundus images using a self-attention\nmechanism. 
Prior to being input into Brighteye, the optic disc is localized\nusing YOLOv8, and the region of interest (ROI) around the disc center is\ncropped to ensure alignment with clinical practice. Optic disc detection\nimproves the sensitivity at 95% specificity from 79.20% to 85.70% for glaucoma\ndetection and the Hamming distance from 0.2470 to 0.1250 for glaucomatous\nfeature classification. In the developmental stage of the Justified Referral in\nAI Glaucoma Screening (JustRAIGS) challenge, the overall outcome secured the\nfifth position out of 226 entries.\n","authors":["Hui Lin","Charilaos Apostolidis","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2405.00857v1.pdf","comment":"ISBI 2024, JustRAIGS challenge, glaucoma detection"},{"id":"http://arxiv.org/abs/2402.14095v3","updated":"2024-05-01T18:56:27Z","published":"2024-02-21T19:45:05Z","title":"Zero-shot generalization across architectures for visual classification","summary":" Generalization to unseen data is a key desideratum for deep networks, but its\nrelation to classification accuracy is unclear. Using a minimalist vision\ndataset and a measure of generalizability, we show that popular networks, from\ndeep convolutional networks (CNNs) to transformers, vary in their power to\nextrapolate to unseen classes both across layers and across architectures.\nAccuracy is not a good predictor of generalizability, and generalization varies\nnon-monotonically with layer depth.\n","authors":["Evan Gerritz","Luciano Dyballa","Steven W. Zucker"],"pdf_url":"https://arxiv.org/pdf/2402.14095v3.pdf","comment":"Accepted as a Tiny Paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2404.19227v2","updated":"2024-05-01T18:30:14Z","published":"2024-04-30T03:13:06Z","title":"Espresso: Robust Concept Filtering in Text-to-Image Models","summary":" Diffusion-based text-to-image (T2I) models generate high-fidelity images for\ngiven textual prompts. They are trained on large datasets scraped from the\nInternet, potentially containing unacceptable concepts (e.g., copyright\ninfringing or unsafe). Retraining T2I models after filtering out unacceptable\nconcepts in the training data is inefficient and degrades utility. Hence, there\nis a need for concept removal techniques (CRTs) which are effective in removing\nunacceptable concepts, utility-preserving on acceptable concepts, and robust\nagainst evasion with adversarial prompts. None of the prior filtering and\nfine-tuning CRTs satisfy all these requirements simultaneously.\n We introduce Espresso, the first robust concept filter based on Contrastive\nLanguage-Image Pre-Training (CLIP). It identifies unacceptable concepts by\nprojecting the generated image's embedding onto the vector connecting\nunacceptable and acceptable concepts in the joint text-image embedding space.\nThis ensures robustness by restricting the adversary to adding noise only along\nthis vector, in the direction of the acceptable concept. Further fine-tuning\nEspresso to separate embeddings of acceptable and unacceptable concepts, while\npreserving their pairing with image embeddings, ensures both effectiveness and\nutility. We evaluate Espresso on eleven concepts to show that it is effective\n(~5% CLIP accuracy on unacceptable concepts), utility-preserving (~93%\nnormalized CLIP score on acceptable concepts), and robust (~4% CLIP accuracy on\nadversarial prompts for unacceptable concepts). 
Finally, we present theoretical\nbounds for the certified robustness of Espresso against adversarial prompts,\nand an empirical analysis.\n","authors":["Anudeep Das","Vasisht Duddu","Rui Zhang","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2404.19227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00797v1","updated":"2024-05-01T18:16:55Z","published":"2024-05-01T18:16:55Z","title":"ADM: Accelerated Diffusion Model via Estimated Priors for Robust Motion\n Prediction under Uncertainties","summary":" Motion prediction is a challenging problem in autonomous driving as it\ndemands the system to comprehend stochastic dynamics and the multi-modal nature\nof real-world agent interactions. Diffusion models have recently risen to\nprominence, and have proven particularly effective in pedestrian motion\nprediction tasks. However, the significant time consumption and sensitivity to\nnoise have limited the real-time predictive capability of diffusion models. In\nresponse to these impediments, we propose a novel diffusion-based,\nacceleratable framework that adeptly predicts future trajectories of agents\nwith enhanced resistance to noise. The core idea of our model is to learn a\ncoarse-grained prior distribution of trajectory, which can skip a large number\nof denoise steps. This advancement not only boosts sampling efficiency but also\nmaintains the fidelity of prediction accuracy. Our method meets the rigorous\nreal-time operational standards essential for autonomous vehicles, enabling\nprompt trajectory generation that is vital for secure and efficient navigation.\nThrough extensive experiments, our method speeds up the inference time to 136ms\ncompared to standard diffusion model, and achieves significant improvement in\nmulti-agent motion prediction on the Argoverse 1 motion forecasting dataset.\n","authors":["Jiahui Li","Tianle Shen","Zekai Gu","Jiawei Sun","Chengran Yuan","Yuhang Han","Shuo Sun","Marcelo H. Ang Jr"],"pdf_url":"https://arxiv.org/pdf/2405.00797v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.17183v2","updated":"2024-05-01T18:10:40Z","published":"2023-12-28T18:16:00Z","title":"One Model to Rule them All: Towards Universal Segmentation for Medical\n Images with Text Prompts","summary":" In this study, we focus on building up a model that aims to Segment Anything\nin medical scenarios, driven by Text prompts, termed as SAT. Our main\ncontributions are three folds: (i) for dataset construction, we combine\nmultiple knowledge sources to construct the first multi-modal knowledge tree on\nhuman anatomy, including 6502 anatomical terminologies; Then we build up the\nlargest and most comprehensive segmentation dataset for training, by collecting\nover 22K 3D medical image scans from 72 segmentation datasets with careful\nstandardization on both image scans and label space; (ii) for architecture\ndesign, we formulate a universal segmentation model, that can be prompted by\ninputting medical terminologies in text form. 
We present knowledge-enhanced\nrepresentation learning on the combination of a large number of datasets; (iii)\nfor model evaluation, we train a SAT-Pro with only 447M parameters, to segment\n72 different segmentation datasets with text prompt, resulting in 497 classes.\nWe have thoroughly evaluated the model from three aspects: averaged by body\nregions, averaged by classes, and average by datasets, demonstrating comparable\nperformance to 72 specialist nnU-Nets, i.e., we train nnU-Net models on each\ndataset/subset, resulting in 72 nnU-Nets with around 2.2B parameters for the 72\ndatasets. We will release all the codes, and models in this work.\n","authors":["Ziheng Zhao","Yao Zhang","Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2312.17183v2.pdf","comment":"53 pages"},{"id":"http://arxiv.org/abs/2405.00794v1","updated":"2024-05-01T18:08:51Z","published":"2024-05-01T18:08:51Z","title":"Coherent 3D Portrait Video Reconstruction via Triplane Fusion","summary":" Recent breakthroughs in single-image 3D portrait reconstruction have enabled\ntelepresence systems to stream 3D portrait videos from a single camera in\nreal-time, potentially democratizing telepresence. However, per-frame 3D\nreconstruction exhibits temporal inconsistency and forgets the user's\nappearance. On the other hand, self-reenactment methods can render coherent 3D\nportraits by driving a personalized 3D prior, but fail to faithfully\nreconstruct the user's per-frame appearance (e.g., facial expressions and\nlighting). In this work, we recognize the need to maintain both coherent\nidentity and dynamic per-frame appearance to enable the best possible realism.\nTo this end, we propose a new fusion-based method that fuses a personalized 3D\nsubject prior with per-frame information, producing temporally stable 3D videos\nwith faithful reconstruction of the user's per-frame appearances. Trained only\nusing synthetic data produced by an expression-conditioned 3D GAN, our\nencoder-based method achieves both state-of-the-art 3D reconstruction accuracy\nand temporal consistency on in-studio and in-the-wild datasets.\n","authors":["Shengze Wang","Xueting Li","Chao Liu","Matthew Chan","Michael Stengel","Josef Spjut","Henry Fuchs","Shalini De Mello","Koki Nagano"],"pdf_url":"https://arxiv.org/pdf/2405.00794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00791v1","updated":"2024-05-01T18:07:48Z","published":"2024-05-01T18:07:48Z","title":"Obtaining Favorable Layouts for Multiple Object Generation","summary":" Large-scale text-to-image models that can generate high-quality and diverse\nimages based on textual prompts have shown remarkable success. These models aim\nultimately to create complex scenes, and addressing the challenge of\nmulti-subject generation is a critical step towards this goal. However, the\nexisting state-of-the-art diffusion models face difficulty when generating\nimages that involve multiple subjects. When presented with a prompt containing\nmore than one subject, these models may omit some subjects or merge them\ntogether. To address this challenge, we propose a novel approach based on a\nguiding principle. We allow the diffusion model to initially propose a layout,\nand then we rearrange the layout grid. This is achieved by enforcing\ncross-attention maps (XAMs) to adhere to proposed masks and by migrating pixels\nfrom latent maps to new locations determined by us. 
We introduce new loss terms\naimed at reducing XAM entropy for clearer spatial definition of subjects,\nreduce the overlap between XAMs, and ensure that XAMs align with their\nrespective masks. We contrast our approach with several alternative methods and\nshow that it more faithfully captures the desired concepts across a variety of\ntext prompts.\n","authors":["Barak Battash","Amit Rozner","Lior Wolf","Ofir Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2405.00791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00760v1","updated":"2024-05-01T15:26:14Z","published":"2024-05-01T15:26:14Z","title":"Deep Reward Supervisions for Tuning Text-to-Image Diffusion Models","summary":" Optimizing a text-to-image diffusion model with a given reward function is an\nimportant but underexplored research area. In this study, we propose Deep\nReward Tuning (DRTune), an algorithm that directly supervises the final output\nimage of a text-to-image diffusion model and back-propagates through the\niterative sampling process to the input noise. We find that training earlier\nsteps in the sampling process is crucial for low-level rewards, and deep\nsupervision can be achieved efficiently and effectively by stopping the\ngradient of the denoising network input. DRTune is extensively evaluated on\nvarious reward models. It consistently outperforms other algorithms,\nparticularly for low-level control signals, where all shallow supervision\nmethods fail. Additionally, we fine-tune Stable Diffusion XL 1.0 (SDXL 1.0)\nmodel via DRTune to optimize Human Preference Score v2.1, resulting in the\nFavorable Diffusion XL 1.0 (FDXL 1.0) model. FDXL 1.0 significantly enhances\nimage quality compared to SDXL 1.0 and reaches comparable quality compared with\nMidjourney v5.2.\n","authors":["Xiaoshi Wu","Yiming Hao","Manyuan Zhang","Keqiang Sun","Zhaoyang Huang","Guanglu Song","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2405.00760v1.pdf","comment":"N/A"},{"id":"http://arxiv.org/abs/2405.00754v1","updated":"2024-05-01T07:24:30Z","published":"2024-05-01T07:24:30Z","title":"CLIPArTT: Light-weight Adaptation of CLIP to New Domains at Test Time","summary":" Pre-trained vision-language models (VLMs), exemplified by CLIP, demonstrate\nremarkable adaptability across zero-shot classification tasks without\nadditional training. However, their performance diminishes in the presence of\ndomain shifts. In this study, we introduce CLIP Adaptation duRing Test-Time\n(CLIPArTT), a fully test-time adaptation (TTA) approach for CLIP, which\ninvolves automatic text prompts construction during inference for their use as\ntext supervision. Our method employs a unique, minimally invasive text prompt\ntuning process, wherein multiple predicted classes are aggregated into a single\nnew text prompt, used as pseudo label to re-classify inputs in a transductive\nmanner. Additionally, we pioneer the standardization of TTA benchmarks (e.g.,\nTENT) in the realm of VLMs. Our findings demonstrate that, without requiring\nadditional transformations nor new trainable modules, CLIPArTT enhances\nperformance dynamically across non-corrupted datasets such as CIFAR-10,\ncorrupted datasets like CIFAR-10-C and CIFAR-10.1, alongside synthetic datasets\nsuch as VisDA-C. This research underscores the potential for improving VLMs'\nadaptability through novel test-time strategies, offering insights for robust\nperformance across varied datasets and environments. 
The code can be found at:\nhttps://github.com/dosowiechi/CLIPArTT.git\n","authors":["Gustavo Adolfo Vargas Hakim","David Osowiechi","Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Moslem Yazdanpanah","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2405.00754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00749v1","updated":"2024-05-01T03:37:12Z","published":"2024-05-01T03:37:12Z","title":"More is Better: Deep Domain Adaptation with Multiple Sources","summary":" In many practical applications, it is often difficult and expensive to obtain\nlarge-scale labeled data to train state-of-the-art deep neural networks.\nTherefore, transferring the learned knowledge from a separate, labeled source\ndomain to an unlabeled or sparsely labeled target domain becomes an appealing\nalternative. However, direct transfer often results in significant performance\ndecay due to domain shift. Domain adaptation (DA) aims to address this problem\nby aligning the distributions between the source and target domains.\nMulti-source domain adaptation (MDA) is a powerful and practical extension in\nwhich the labeled data may be collected from multiple sources with different\ndistributions. In this survey, we first define various MDA strategies. Then we\nsystematically summarize and compare modern MDA methods in the deep learning\nera from different perspectives, followed by commonly used datasets and a brief\nbenchmark. Finally, we discuss future research directions for MDA that are\nworth investigating.\n","authors":["Sicheng Zhao","Hui Chen","Hu Huang","Pengfei Xu","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2405.00749v1.pdf","comment":"Accepted by IJCAI 2024. arXiv admin note: text overlap with\n arXiv:2002.12169"},{"id":"http://arxiv.org/abs/2405.02208v1","updated":"2024-05-01T22:28:18Z","published":"2024-05-01T22:28:18Z","title":"Reference-Free Image Quality Metric for Degradation and Reconstruction\n Artifacts","summary":" Image Quality Assessment (IQA) is essential in various Computer Vision tasks\nsuch as image deblurring and super-resolution. However, most IQA methods\nrequire reference images, which are not always available. While there are some\nreference-free IQA metrics, they have limitations in simulating human\nperception and discerning subtle image quality variations. We hypothesize that\nthe JPEG quality factor is representative of image quality measurement, and a\nwell-trained neural network can learn to accurately evaluate image quality\nwithout requiring a clean reference, as it can recognize image degradation\nartifacts based on prior knowledge. Thus, we developed a reference-free quality\nevaluation network, dubbed \"Quality Factor (QF) Predictor\", which does not\nrequire any reference. Our QF Predictor is a lightweight, fully convolutional\nnetwork comprising seven layers. The model is trained in a self-supervised\nmanner: it receives a JPEG compressed image patch with a random QF as input and is\ntrained to accurately predict the corresponding QF. We demonstrate the\nversatility of the model by applying it to various tasks. First, our QF\nPredictor can generalize to measure the severity of various image artifacts,\nsuch as Gaussian Blur and Gaussian noise. 
Second, we show that the QF Predictor\ncan be trained to predict the undersampling rate of images reconstructed from\nMagnetic Resonance Imaging (MRI) data.\n","authors":["Han Cui","Alfredo De Goyeneche","Efrat Shimron","Boyuan Ma","Michael Lustig"],"pdf_url":"https://arxiv.org/pdf/2405.02208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01600v1","updated":"2024-05-01T06:05:13Z","published":"2024-05-01T06:05:13Z","title":"Deep Learning Descriptor Hybridization with Feature Reduction for\n Accurate Cervical Cancer Colposcopy Image Classification","summary":" Cervical cancer stands as a predominant cause of female mortality,\nunderscoring the need for regular screenings to enable early diagnosis and\npreemptive treatment of pre-cancerous conditions. The transformation zone in\nthe cervix, where cellular differentiation occurs, plays a critical role in the\ndetection of abnormalities. Colposcopy has emerged as a pivotal tool in\ncervical cancer prevention since it provides a meticulous examination of\ncervical abnormalities. However, challenges in visual evaluation necessitate\nthe development of Computer Aided Diagnosis (CAD) systems.\n We propose a novel CAD system that combines the strengths of various\ndeep-learning descriptors (ResNet50, ResNet101, and ResNet152) with appropriate\nfeature normalization (min-max) as well as feature reduction technique (LDA).\nThe combination of different descriptors ensures that all the features\n(low-level like edges and colour, high-level like shape and texture) are\ncaptured, feature normalization prevents biased learning, and feature reduction\navoids overfitting. We do experiments on the IARC dataset provided by WHO. The\ndataset is initially segmented and balanced. Our approach achieves exceptional\nperformance in the range of 97%-100% for both the normal-abnormal and the type\nclassification. A competitive approach for type classification on the same\ndataset achieved 81%-91% performance.\n","authors":["Saurabh Saini","Kapil Ahuja","Siddartha Chennareddy","Karthik Boddupalli"],"pdf_url":"https://arxiv.org/pdf/2405.01600v1.pdf","comment":"7 Pages double column, 5 figures, and 5 tables"}]},"2024-05-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.01538v1","updated":"2024-05-02T17:59:57Z","published":"2024-05-02T17:59:57Z","title":"Multi-Space Alignments Towards Universal LiDAR Segmentation","summary":" A unified and versatile LiDAR segmentation model with strong robustness and\ngeneralizability is desirable for safe autonomous driving perception. This work\npresents M3Net, a one-of-a-kind framework for fulfilling multi-task,\nmulti-dataset, multi-modality LiDAR segmentation in a universal manner using\njust a single set of parameters. To better exploit data volume and diversity,\nwe first combine large-scale driving datasets acquired by different types of\nsensors from diverse scenes and then conduct alignments in three spaces, namely\ndata, feature, and label spaces, during the training. As a result, M3Net is\ncapable of taming heterogeneous data for training state-of-the-art LiDAR\nsegmentation models. Extensive experiments on twelve LiDAR segmentation\ndatasets verify our effectiveness. 
Notably, using a shared set of parameters,\nM3Net achieves 75.1%, 83.1%, and 72.4% mIoU scores, respectively, on the\nofficial benchmarks of SemanticKITTI, nuScenes, and Waymo Open.\n","authors":["Youquan Liu","Lingdong Kong","Xiaoyang Wu","Runnan Chen","Xin Li","Liang Pan","Ziwei Liu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2405.01538v1.pdf","comment":"CVPR 2024; 33 pages, 14 figures, 14 tables; Code at\n https://github.com/youquanl/M3Net"},{"id":"http://arxiv.org/abs/2405.01536v1","updated":"2024-05-02T17:59:52Z","published":"2024-05-02T17:59:52Z","title":"Customizing Text-to-Image Models with a Single Image Pair","summary":" Art reinterpretation is the practice of creating a variation of a reference\nwork, making a paired artwork that exhibits a distinct artistic style. We ask\nif such an image pair can be used to customize a generative model to capture\nthe demonstrated stylistic difference. We propose Pair Customization, a new\ncustomization method that learns stylistic difference from a single image pair\nand then applies the acquired style to the generation process. Unlike existing\nmethods that learn to mimic a single concept from a collection of images, our\nmethod captures the stylistic difference between paired images. This allows us\nto apply a stylistic change without overfitting to the specific image content\nin the examples. To address this new task, we employ a joint optimization\nmethod that explicitly separates the style and content into distinct LoRA\nweight spaces. We optimize these style and content weights to reproduce the\nstyle and content images while encouraging their orthogonality. During\ninference, we modify the diffusion process via a new style guidance based on\nour learned weights. Both qualitative and quantitative experiments show that\nour method can effectively learn style while avoiding overfitting to image\ncontent, highlighting the potential of modeling such stylistic differences from\na single image pair.\n","authors":["Maxwell Jones","Sheng-Yu Wang","Nupur Kumari","David Bau","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.01536v1.pdf","comment":"project page: https://paircustomization.github.io/"},{"id":"http://arxiv.org/abs/2405.01534v1","updated":"2024-05-02T17:59:31Z","published":"2024-05-02T17:59:31Z","title":"Plan-Seq-Learn: Language Model Guided RL for Solving Long Horizon\n Robotics Tasks","summary":" Large Language Models (LLMs) have been shown to be capable of performing\nhigh-level planning for long-horizon robotics tasks, yet existing methods\nrequire access to a pre-defined skill library (e.g. picking, placing, pulling,\npushing, navigating). However, LLM planning does not address how to design or\nlearn those behaviors, which remains challenging particularly in long-horizon\nsettings. Furthermore, for many tasks of interest, the robot needs to be able\nto adjust its behavior in a fine-grained manner, requiring the agent to be\ncapable of modifying low-level control actions. Can we instead use the\ninternet-scale knowledge from LLMs for high-level policies, guiding\nreinforcement learning (RL) policies to efficiently solve robotic control tasks\nonline without requiring a pre-determined set of skills? In this paper, we\npropose Plan-Seq-Learn (PSL): a modular approach that uses motion planning to\nbridge the gap between abstract language and learned low-level control for\nsolving long-horizon robotics tasks from scratch. 
We demonstrate that PSL\nachieves state-of-the-art results on over 25 challenging robotics tasks with up\nto 10 stages. PSL solves long-horizon tasks from raw visual input spanning four\nbenchmarks at success rates of over 85%, out-performing language-based,\nclassical, and end-to-end approaches. Video results and code at\nhttps://mihdalal.github.io/planseqlearn/\n","authors":["Murtaza Dalal","Tarun Chiruvolu","Devendra Chaplot","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2405.01534v1.pdf","comment":"Published at ICLR 2024. Website at\n https://mihdalal.github.io/planseqlearn/ 9 pages, 3 figures, 3 tables; 14\n pages appendix (7 additional figures)"},{"id":"http://arxiv.org/abs/2405.01533v1","updated":"2024-05-02T17:59:24Z","published":"2024-05-02T17:59:24Z","title":"OmniDrive: A Holistic LLM-Agent Framework for Autonomous Driving with 3D\n Perception, Reasoning and Planning","summary":" The advances in multimodal large language models (MLLMs) have led to growing\ninterests in LLM-based autonomous driving agents to leverage their strong\nreasoning capabilities. However, capitalizing on MLLMs' strong reasoning\ncapabilities for improved planning behavior is challenging since planning\nrequires full 3D situational awareness beyond 2D reasoning. To address this\nchallenge, our work proposes a holistic framework for strong alignment between\nagent models and 3D driving tasks. Our framework starts with a novel 3D MLLM\narchitecture that uses sparse queries to lift and compress visual\nrepresentations into 3D before feeding them into an LLM. This query-based\nrepresentation allows us to jointly encode dynamic objects and static map\nelements (e.g., traffic lanes), providing a condensed world model for\nperception-action alignment in 3D. We further propose OmniDrive-nuScenes, a new\nvisual question-answering dataset challenging the true 3D situational awareness\nof a model with comprehensive visual question-answering (VQA) tasks, including\nscene description, traffic regulation, 3D grounding, counterfactual reasoning,\ndecision making and planning. Extensive studies show the effectiveness of the\nproposed architecture as well as the importance of the VQA tasks for reasoning\nand planning in complex 3D scenes.\n","authors":["Shihao Wang","Zhiding Yu","Xiaohui Jiang","Shiyi Lan","Min Shi","Nadine Chang","Jan Kautz","Ying Li","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2405.01533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01531v1","updated":"2024-05-02T17:59:01Z","published":"2024-05-02T17:59:01Z","title":"Improving Intervention Efficacy via Concept Realignment in Concept\n Bottleneck Models","summary":" Concept Bottleneck Models (CBMs) ground image classification on\nhuman-understandable concepts to allow for interpretable model decisions.\nCrucially, the CBM design inherently allows for human interventions, in which\nexpert users are given the ability to modify potentially misaligned concept\nchoices to influence the decision behavior of the model in an interpretable\nfashion. However, existing approaches often require numerous human\ninterventions per image to achieve strong performances, posing practical\nchallenges in scenarios where obtaining human feedback is expensive. In this\npaper, we find that this is noticeably driven by an independent treatment of\nconcepts during intervention, wherein a change of one concept does not\ninfluence the use of other ones in the model's final decision. 
To address this\nissue, we introduce a trainable concept intervention realignment module, which\nleverages concept relations to realign concept assignments post-intervention.\nAcross standard, real-world benchmarks, we find that concept realignment can\nsignificantly improve intervention efficacy; significantly reducing the number\nof interventions needed to reach a target classification performance or concept\nprediction accuracy. In addition, it easily integrates into existing\nconcept-based architectures without requiring changes to the models themselves.\nThis reduced cost of human-model collaboration is crucial to enhancing the\nfeasibility of CBMs in resource-constrained environments.\n","authors":["Nishad Singhi","Jae Myung Kim","Karsten Roth","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2405.01531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01527v1","updated":"2024-05-02T17:56:55Z","published":"2024-05-02T17:56:55Z","title":"Track2Act: Predicting Point Tracks from Internet Videos enables Diverse\n Zero-shot Robot Manipulation","summary":" We seek to learn a generalizable goal-conditioned policy that enables\nzero-shot robot manipulation: interacting with unseen objects in novel scenes\nwithout test-time adaptation. While typical approaches rely on a large amount\nof demonstration data for such generalization, we propose an approach that\nleverages web videos to predict plausible interaction plans and learns a\ntask-agnostic transformation to obtain robot actions in the real world. Our\nframework,Track2Act predicts tracks of how points in an image should move in\nfuture time-steps based on a goal, and can be trained with diverse videos on\nthe web including those of humans and robots manipulating everyday objects. We\nuse these 2D track predictions to infer a sequence of rigid transforms of the\nobject to be manipulated, and obtain robot end-effector poses that can be\nexecuted in an open-loop manner. We then refine this open-loop plan by\npredicting residual actions through a closed loop policy trained with a few\nembodiment-specific demonstrations. We show that this approach of combining\nscalably learned track prediction with a residual policy requiring minimal\nin-domain robot-specific data enables zero-shot robot manipulation, and present\na wide array of real-world robot manipulation results across unseen tasks,\nobjects, and scenes. https://homangab.github.io/track2act/\n","authors":["Homanga Bharadhwaj","Roozbeh Mottaghi","Abhinav Gupta","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2405.01527v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2405.01524v1","updated":"2024-05-02T17:54:35Z","published":"2024-05-02T17:54:35Z","title":"A separability-based approach to quantifying generalization: which layer\n is best?","summary":" Generalization to unseen data remains poorly understood for deep learning\nclassification and foundation models. How can one assess the ability of\nnetworks to adapt to new or extended versions of their input space in the\nspirit of few-shot learning, out-of-distribution generalization, and domain\nadaptation? Which layers of a network are likely to generalize best? We provide\na new method for evaluating the capacity of networks to represent a sampled\ndomain, regardless of whether the network has been trained on all classes in\nthe domain. 
Our approach is the following: after fine-tuning state-of-the-art\npre-trained models for visual classification on a particular domain, we assess\ntheir performance on data from related but distinct variations in that domain.\nGeneralization power is quantified as a function of the latent embeddings of\nunseen data from intermediate layers for both unsupervised and supervised\nsettings. Working throughout all stages of the network, we find that (i) high\nclassification accuracy does not imply high generalizability; and (ii) deeper\nlayers in a model do not always generalize the best, which has implications for\npruning. Since the trends observed across datasets are largely consistent, we\nconclude that our approach reveals (a function of) the intrinsic capacity of\nthe different layers of a model to generalize.\n","authors":["Luciano Dyballa","Evan Gerritz","Steven W. Zucker"],"pdf_url":"https://arxiv.org/pdf/2405.01524v1.pdf","comment":"6, pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.01521v1","updated":"2024-05-02T17:50:53Z","published":"2024-05-02T17:50:53Z","title":"Transformer-Aided Semantic Communications","summary":" The transformer structure employed in large language models (LLMs), as a\nspecialized category of deep neural networks (DNNs) featuring attention\nmechanisms, stands out for their ability to identify and highlight the most\nrelevant aspects of input data. Such a capability is particularly beneficial in\naddressing a variety of communication challenges, notably in the realm of\nsemantic communication where proper encoding of the relevant data is critical\nespecially in systems with limited bandwidth. In this work, we employ vision\ntransformers specifically for the purpose of compression and compact\nrepresentation of the input image, with the goal of preserving semantic\ninformation throughout the transmission process. Through the use of the\nattention mechanism inherent in transformers, we create an attention mask. This\nmask effectively prioritizes critical segments of images for transmission,\nensuring that the reconstruction phase focuses on key objects highlighted by\nthe mask. Our methodology significantly improves the quality of semantic\ncommunication and optimizes bandwidth usage by encoding different parts of the\ndata in accordance with their semantic information content, thus enhancing\noverall efficiency. We evaluate the effectiveness of our proposed framework\nusing the TinyImageNet dataset, focusing on both reconstruction quality and\naccuracy. Our evaluation results demonstrate that our framework successfully\npreserves semantic information, even when only a fraction of the encoded data\nis transmitted, according to the intended compression rates.\n","authors":["Matin Mortaheb","Erciyes Karakaya","Mohammad A. Amir Khojastepour","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2405.01521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06027v2","updated":"2024-05-02T17:43:34Z","published":"2023-04-12T17:59:41Z","title":"Continual Diffusion: Continual Customization of Text-to-Image Diffusion\n with C-LoRA","summary":" Recent works demonstrate a remarkable ability to customize text-to-image\ndiffusion models while only providing a few example images. What happens if you\ntry to customize such models using multiple, fine-grained concepts in a\nsequential (i.e., continual) manner? In our work, we show that recent\nstate-of-the-art customization of text-to-image models suffer from catastrophic\nforgetting when new concepts arrive sequentially. 
Specifically, when adding a\nnew concept, the ability to generate high quality images of past, similar\nconcepts degrades. To circumvent this forgetting, we propose a new method,\nC-LoRA, composed of a continually self-regularized low-rank adaptation in cross\nattention layers of the popular Stable Diffusion model. Furthermore, we use\ncustomization prompts which do not include the word of the customized object\n(i.e., \"person\" for a human face dataset) and are initialized as completely\nrandom embeddings. Importantly, our method induces only marginal additional\nparameter costs and requires no storage of user data for replay. We show that\nC-LoRA not only outperforms several baselines for our proposed setting of\ntext-to-image continual customization, which we refer to as Continual\nDiffusion, but that we achieve a new state-of-the-art in the well-established\nrehearsal-free continual learning setting for image classification. The high\nachieving performance of C-LoRA in two separate domains positions it as a\ncompelling solution for a wide range of applications, and we believe it has\nsignificant potential for practical impact. Project page:\nhttps://jamessealesmith.github.io/continual-diffusion/\n","authors":["James Seale Smith","Yen-Chang Hsu","Lingyu Zhang","Ting Hua","Zsolt Kira","Yilin Shen","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2304.06027v2.pdf","comment":"Transactions on Machine Learning Research (TMLR) 2024"},{"id":"http://arxiv.org/abs/2405.01503v1","updated":"2024-05-02T17:33:26Z","published":"2024-05-02T17:33:26Z","title":"PAM-UNet: Shifting Attention on Region of Interest in Medical Images","summary":" Computer-aided segmentation methods can assist medical personnel in improving\ndiagnostic outcomes. While recent advancements like UNet and its variants have\nshown promise, they face a critical challenge: balancing accuracy with\ncomputational efficiency. Shallow encoder architectures in UNets often struggle\nto capture crucial spatial features, leading to inaccurate and sparse\nsegmentation. To address this limitation, we propose a novel\n\\underline{P}rogressive \\underline{A}ttention based \\underline{M}obile\n\\underline{UNet} (\\underline{PAM-UNet}) architecture. The inverted residual\n(IR) blocks in PAM-UNet help maintain a lightweight framework, while layerwise\n\\textit{Progressive Luong Attention} ($\\mathcal{PLA}$) promotes precise\nsegmentation by directing attention toward regions of interest during\nsynthesis. Our approach prioritizes both accuracy and speed, achieving a\ncommendable balance with a mean IoU of 74.65 and a dice score of 82.87, while\nrequiring only 1.32 floating-point operations per second (FLOPS) on the Liver\nTumor Segmentation Benchmark (LiTS) 2017 dataset. These results highlight the\nimportance of developing efficient segmentation models to accelerate the\nadoption of AI in clinical practice.\n","authors":["Abhijit Das","Debesh Jha","Vandan Gorade","Koushik Biswas","Hongyi Pan","Zheyuan Zhang","Daniela P. Ladner","Yury Velichko","Amir Borhani","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2405.01503v1.pdf","comment":"Accepted at 2024 IEEE EMBC"},{"id":"http://arxiv.org/abs/2404.15918v2","updated":"2024-05-02T17:27:42Z","published":"2024-04-24T15:12:25Z","title":"Perception and Localization of Macular Degeneration Applying\n Convolutional Neural Network, ResNet and Grad-CAM","summary":" A well-known retinal disease that sends blurry visions to the affected\npatients is Macular Degeneration. 
This research is based on classifying the\nhealthy and macular degeneration fundus by localizing the affected region of\nthe fundus. A CNN architecture and CNN with ResNet architecture (ResNet50,\nResNet50v2, ResNet101, ResNet101v2, ResNet152, ResNet152v2) as the backbone are\nused to classify the two types of fundus. The data are split into three\ncategories including (a) Training set is 90% and Testing set is 10% (b)\nTraining set is 80% and Testing set is 20%, (c) Training set is 50% and Testing\nset is 50%. After the training, the best model has been selected from the\nevaluation metrics. Among the models, CNN with a backbone of ResNet50 performs\nbest which gives the training accuracy of 98.7% for 90% train and 10% test data\nsplit. With this model, we have performed the Grad-CAM visualization to get the\nregion of the affected area of the fundus.\n","authors":["Tahmim Hossain","Sagor Chandro Bakchy"],"pdf_url":"https://arxiv.org/pdf/2404.15918v2.pdf","comment":"12 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.01496v1","updated":"2024-05-02T17:27:04Z","published":"2024-05-02T17:27:04Z","title":"LocInv: Localization-aware Inversion for Text-Guided Image Editing","summary":" Large-scale Text-to-Image (T2I) diffusion models demonstrate significant\ngeneration capabilities based on textual prompts. Based on the T2I diffusion\nmodels, text-guided image editing research aims to empower users to manipulate\ngenerated images by altering the text prompts. However, existing image editing\ntechniques are prone to editing over unintentional regions that are beyond the\nintended target area, primarily due to inaccuracies in cross-attention maps. To\naddress this problem, we propose Localization-aware Inversion (LocInv), which\nexploits segmentation maps or bounding boxes as extra localization priors to\nrefine the cross-attention maps in the denoising phases of the diffusion\nprocess. Through the dynamic updating of tokens corresponding to noun words in\nthe textual input, we are compelling the cross-attention maps to closely align\nwith the correct noun and adjective words in the text prompt. Based on this\ntechnique, we achieve fine-grained image editing over particular objects while\npreventing undesired changes to other regions. Our method LocInv, based on the\npublicly available Stable Diffusion, is extensively evaluated on a subset of\nthe COCO dataset, and consistently obtains superior results both quantitatively\nand qualitatively. The code will be released at\nhttps://github.com/wangkai930418/DPL\n","authors":["Chuanming Tang","Kai Wang","Fei Yang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2405.01496v1.pdf","comment":"Accepted by CVPR 2024 Workshop AI4CC"},{"id":"http://arxiv.org/abs/2405.01494v1","updated":"2024-05-02T17:26:52Z","published":"2024-05-02T17:26:52Z","title":"Navigating Heterogeneity and Privacy in One-Shot Federated Learning with\n Diffusion Models","summary":" Federated learning (FL) enables multiple clients to train models collectively\nwhile preserving data privacy. However, FL faces challenges in terms of\ncommunication cost and data heterogeneity. One-shot federated learning has\nemerged as a solution by reducing communication rounds, improving efficiency,\nand providing better security against eavesdropping attacks. Nevertheless, data\nheterogeneity remains a significant challenge, impacting performance. 
This work\nexplores the effectiveness of diffusion models in one-shot FL, demonstrating\ntheir applicability in addressing data heterogeneity and improving FL\nperformance. Additionally, we investigate the utility of our diffusion model\napproach, FedDiff, compared to other one-shot FL methods under differential\nprivacy (DP). Furthermore, to improve generated sample quality under DP\nsettings, we propose a pragmatic Fourier Magnitude Filtering (FMF) method,\nenhancing the effectiveness of generated data for global model training.\n","authors":["Matias Mendieta","Guangyu Sun","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13299v2","updated":"2024-05-02T17:20:28Z","published":"2023-12-19T20:18:29Z","title":"Compact 3D Scene Representation via Self-Organizing Gaussian Grids","summary":" 3D Gaussian Splatting has recently emerged as a highly promising technique\nfor modeling of static 3D scenes. In contrast to Neural Radiance Fields, it\nutilizes efficient rasterization allowing for very fast rendering at\nhigh-quality. However, the storage size is significantly higher, which hinders\npractical deployment, e.g. on resource constrained devices. In this paper, we\nintroduce a compact scene representation organizing the parameters of 3D\nGaussian Splatting (3DGS) into a 2D grid with local homogeneity, ensuring a\ndrastic reduction in storage requirements without compromising visual quality\nduring rendering. Central to our idea is the explicit exploitation of\nperceptual redundancies present in natural scenes. In essence, the inherent\nnature of a scene allows for numerous permutations of Gaussian parameters to\nequivalently represent it. To this end, we propose a novel highly parallel\nalgorithm that regularly arranges the high-dimensional Gaussian parameters into\na 2D grid while preserving their neighborhood structure. During training, we\nfurther enforce local smoothness between the sorted parameters in the grid. The\nuncompressed Gaussians use the same structure as 3DGS, ensuring a seamless\nintegration with established renderers. Our method achieves a reduction factor\nof 17x to 42x in size for complex scenes with no increase in training time,\nmarking a substantial leap forward in the domain of 3D scene distribution and\nconsumption. Additional information can be found on our project page:\nhttps://fraunhoferhhi.github.io/Self-Organizing-Gaussians/\n","authors":["Wieland Morgenstern","Florian Barthel","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2312.13299v2.pdf","comment":"Added compression of spherical harmonics, updated compression method\n with improved results (all attributes compressed with JPEG XL now), added\n qualitative comparison of additional scenes, moved compression explanation\n and comparison to main paper, added comparison with \"Making Gaussian Splats\n smaller\""},{"id":"http://arxiv.org/abs/2405.01483v1","updated":"2024-05-02T17:14:57Z","published":"2024-05-02T17:14:57Z","title":"MANTIS: Interleaved Multi-Image Instruction Tuning","summary":" The recent years have witnessed a great array of large multimodal models\n(LMMs) to effectively solve single-image vision language tasks. However, their\nabilities to solve multi-image visual language tasks is yet to be improved. The\nexisting multi-image LMMs (e.g. 
OpenFlamingo, Emu, Idefics, etc) mostly gain\ntheir multi-image ability through pre-training on hundreds of millions of noisy\ninterleaved image-text data from web, which is neither efficient nor effective.\nIn this paper, we aim at building strong multi-image LMMs via instruction\ntuning with academic-level resources. Therefore, we meticulously construct\nMantis-Instruct containing 721K instances from 14 multi-image datasets. We\ndesign Mantis-Instruct to cover different multi-image skills like co-reference,\nreasoning, comparing, temporal understanding. We combine Mantis-Instruct with\nseveral single-image visual-language datasets to train our model Mantis to\nhandle any interleaved image-text inputs. We evaluate the trained Mantis on\nfive multi-image benchmarks and eight single-image benchmarks. Though only\nrequiring academic-level resources (i.e. 36 hours on 16xA100-40G), Mantis-8B\ncan achieve state-of-the-art performance on all the multi-image benchmarks and\nbeats the existing best multi-image LMM Idefics2-8B by an average of 9 absolute\npoints. We observe that Mantis performs equivalently well on the held-in and\nheld-out evaluation benchmarks. We further evaluate Mantis on single-image\nbenchmarks and demonstrate that Mantis can maintain a strong single-image\nperformance on par with CogVLM and Emu2. Our results are particularly\nencouraging as it shows that low-cost instruction tuning is indeed much more\neffective than intensive pre-training in terms of building multi-image LMMs.\n","authors":["Dongfu Jiang","Xuan He","Huaye Zeng","Cong Wei","Max Ku","Qian Liu","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01483v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.01474v1","updated":"2024-05-02T17:07:25Z","published":"2024-05-02T17:07:25Z","title":"V-FLUTE: Visual Figurative Language Understanding with Textual\n Explanations","summary":" Large Vision-Language models (VLMs) have demonstrated strong reasoning\ncapabilities in tasks requiring a fine-grained understanding of literal images\nand text, such as visual question-answering or visual entailment. However,\nthere has been little exploration of these models' capabilities when presented\nwith images and captions containing figurative phenomena such as metaphors or\nhumor, the meaning of which is often implicit. To close this gap, we propose a\nnew task and a high-quality dataset: Visual Figurative Language Understanding\nwith Textual Explanations (V-FLUTE). We frame the visual figurative language\nunderstanding problem as an explainable visual entailment task, where the model\nhas to predict whether the image (premise) entails a claim (hypothesis) and\njustify the predicted label with a textual explanation. Using a human-AI\ncollaboration framework, we build a high-quality dataset, V-FLUTE, that\ncontains 6,027 instances spanning five\ndiverse multimodal figurative phenomena: metaphors, similes, idioms, sarcasm,\nand humor. The figurative phenomena can be present either in the image, the\ncaption, or both. 
We further conduct both automatic and human evaluations to\nassess current VLMs' capabilities in understanding figurative phenomena.\n","authors":["Arkadiy Saakyan","Shreyas Kulkarni","Tuhin Chakrabarty","Smaranda Muresan"],"pdf_url":"https://arxiv.org/pdf/2405.01474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01469v1","updated":"2024-05-02T16:59:10Z","published":"2024-05-02T16:59:10Z","title":"Advancing human-centric AI for robust X-ray analysis through holistic\n self-supervised learning","summary":" AI Foundation models are gaining traction in various applications, including\nmedical fields like radiology. However, medical foundation models are often\ntested on limited tasks, leaving their generalisability and biases unexplored.\nWe present RayDINO, a large visual encoder trained by self-supervision on 873k\nchest X-rays. We compare RayDINO to previous state-of-the-art models across\nnine radiology tasks, from classification and dense segmentation to text\ngeneration, and provide an in depth analysis of population, age and sex biases\nof our model. Our findings suggest that self-supervision allows patient-centric\nAI proving useful in clinical workflows and interpreting X-rays holistically.\nWith RayDINO and small task-specific adapters, we reach state-of-the-art\nresults and improve generalization to unseen populations while mitigating bias,\nillustrating the true promise of foundation models: versatility and robustness.\n","authors":["Théo Moutakanni","Piotr Bojanowski","Guillaume Chassagnon","Céline Hudelot","Armand Joulin","Yann LeCun","Matthew Muckley","Maxime Oquab","Marie-Pierre Revel","Maria Vakalopoulou"],"pdf_url":"https://arxiv.org/pdf/2405.01469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01468v1","updated":"2024-05-02T16:59:05Z","published":"2024-05-02T16:59:05Z","title":"Understanding Retrieval-Augmented Task Adaptation for Vision-Language\n Models","summary":" Pre-trained contrastive vision-language models have demonstrated remarkable\nperformance across a wide range of tasks. However, they often struggle on\nfine-trained datasets with categories not adequately represented during\npre-training, which makes adaptation necessary. Recent works have shown\npromising results by utilizing samples from web-scale databases for\nretrieval-augmented adaptation, especially in low-data regimes. Despite the\nempirical success, understanding how retrieval impacts the adaptation of\nvision-language models remains an open research question. In this work, we\nadopt a reflective perspective by presenting a systematic study to understand\nthe roles of key components in retrieval-augmented adaptation. We unveil new\ninsights on uni-modal and cross-modal retrieval and highlight the critical role\nof logit ensemble for effective adaptation. We further present theoretical\nunderpinnings that directly support our empirical observations.\n","authors":["Yifei Ming","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2405.01468v1.pdf","comment":"The paper is accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.01461v1","updated":"2024-05-02T16:50:41Z","published":"2024-05-02T16:50:41Z","title":"SATO: Stable Text-to-Motion Framework","summary":" Is the Text to Motion model robust? Recent advancements in Text to Motion\nmodels primarily stem from more accurate predictions of specific actions.\nHowever, the text modality typically relies solely on pre-trained Contrastive\nLanguage-Image Pretraining (CLIP) models. 
Our research has uncovered a\nsignificant issue with the text-to-motion model: its predictions often exhibit\ninconsistent outputs, resulting in vastly different or even incorrect poses\nwhen presented with semantically similar or identical text inputs. In this\npaper, we undertake an analysis to elucidate the underlying causes of this\ninstability, establishing a clear link between the unpredictability of model\noutputs and the erratic attention patterns of the text encoder module.\nConsequently, we introduce a formal framework aimed at addressing this issue,\nwhich we term the Stable Text-to-Motion Framework (SATO). SATO consists of\nthree modules, each dedicated to stable attention, stable prediction, and\nmaintaining a balance between accuracy and robustness trade-off. We present a\nmethodology for constructing an SATO that satisfies the stability of attention\nand prediction. To verify the stability of the model, we introduced a new\ntextual synonym perturbation dataset based on HumanML3D and KIT-ML. Results\nshow that SATO is significantly more stable against synonyms and other slight\nperturbations while keeping its high accuracy performance.\n","authors":["Wenshuo Chen","Hongru Xiao","Erhang Zhang","Lijie Hu","Lei Wang","Mengyuan Liu","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01460v1","updated":"2024-05-02T16:49:25Z","published":"2024-05-02T16:49:25Z","title":"Purify Unlearnable Examples via Rate-Constrained Variational\n Autoencoders","summary":" Unlearnable examples (UEs) seek to maximize testing error by making subtle\nmodifications to training examples that are correctly labeled. Defenses against\nthese poisoning attacks can be categorized based on whether specific\ninterventions are adopted during training. The first approach is training-time\ndefense, such as adversarial training, which can mitigate poisoning effects but\nis computationally intensive. The other approach is pre-training purification,\ne.g., image short squeezing, which consists of several simple compressions but\noften encounters challenges in dealing with various UEs. Our work provides a\nnovel disentanglement mechanism to build an efficient pre-training purification\nmethod. Firstly, we uncover rate-constrained variational autoencoders (VAEs),\ndemonstrating a clear tendency to suppress the perturbations in UEs. We\nsubsequently conduct a theoretical analysis for this phenomenon. Building upon\nthese insights, we introduce a disentangle variational autoencoder (D-VAE),\ncapable of disentangling the perturbations with learnable class-wise\nembeddings. Based on this network, a two-stage purification approach is\nnaturally developed. The first stage focuses on roughly eliminating\nperturbations, while the second stage produces refined, poison-free results,\nensuring effectiveness and robustness across various scenarios. Extensive\nexperiments demonstrate the remarkable performance of our method across\nCIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at\nhttps://github.com/yuyi-sd/D-VAE.\n","authors":["Yi Yu","Yufei Wang","Song Xia","Wenhan Yang","Shijian Lu","Yap-Peng Tan","Alex C. 
Kot"],"pdf_url":"https://arxiv.org/pdf/2405.01460v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.01439v1","updated":"2024-05-02T16:26:37Z","published":"2024-05-02T16:26:37Z","title":"Improving Domain Generalization on Gaze Estimation via Branch-out\n Auxiliary Regularization","summary":" Despite remarkable advancements, mainstream gaze estimation techniques,\nparticularly appearance-based methods, often suffer from performance\ndegradation in uncontrolled environments due to variations in illumination and\nindividual facial attributes. Existing domain adaptation strategies, limited by\ntheir need for target domain samples, may fall short in real-world\napplications. This letter introduces Branch-out Auxiliary Regularization (BAR),\nan innovative method designed to boost gaze estimation's generalization\ncapabilities without requiring direct access to target domain data.\nSpecifically, BAR integrates two auxiliary consistency regularization branches:\none that uses augmented samples to counteract environmental variations, and\nanother that aligns gaze directions with positive source domain samples to\nencourage the learning of consistent gaze features. These auxiliary pathways\nstrengthen the core network and are integrated in a smooth, plug-and-play\nmanner, facilitating easy adaptation to various other models. Comprehensive\nexperimental evaluations on four cross-dataset tasks demonstrate the\nsuperiority of our approach.\n","authors":["Ruijie Zhao","Pinyan Tang","Sihui Luo"],"pdf_url":"https://arxiv.org/pdf/2405.01439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19473v4","updated":"2024-05-02T16:25:18Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":" Advancements in model algorithms, the growth of foundational models, and\naccess to high-quality datasets have propelled the evolution of Artificial\nIntelligence Generated Content (AIGC). Despite its notable successes, AIGC\nstill faces hurdles such as updating knowledge, handling long-tail data,\nmitigating data leakage, and managing high training and inference costs.\nRetrieval-Augmented Generation (RAG) has recently emerged as a paradigm to\naddress such challenges. In particular, RAG introduces the information\nretrieval process, which enhances the generation process by retrieving relevant\nobjects from available data stores, leading to higher accuracy and better\nrobustness. In this paper, we comprehensively review existing efforts that\nintegrate RAG technique into AIGC scenarios. We first classify RAG foundations\naccording to how the retriever augments the generator, distilling the\nfundamental abstractions of the augmentation methodologies for various\nretrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. We also summarize additional enhancements methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen from another view, we survey on practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research. 
Github: https://github.com/PKU-DAIR/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Jie Jiang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v4.pdf","comment":"Citing 334 papers, 21 pages, 1 table, 12 figures. Project:\n https://github.com/PKU-DAIR/RAG-Survey"},{"id":"http://arxiv.org/abs/2405.01434v1","updated":"2024-05-02T16:25:16Z","published":"2024-05-02T16:25:16Z","title":"StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video\n Generation","summary":" For recent diffusion-based generative models, maintaining consistent content\nacross a series of generated images, especially those containing subjects and\ncomplex details, presents a significant challenge. In this paper, we propose a\nnew way of self-attention calculation, termed Consistent Self-Attention, that\nsignificantly boosts the consistency between the generated images and augments\nprevalent pretrained diffusion-based text-to-image models in a zero-shot\nmanner. To extend our method to long-range video generation, we further\nintroduce a novel semantic space temporal motion prediction module, named\nSemantic Motion Predictor. It is trained to estimate the motion conditions\nbetween two provided images in the semantic spaces. This module converts the\ngenerated sequence of images into videos with smooth transitions and consistent\nsubjects that are significantly more stable than the modules based on latent\nspaces only, especially in the context of long video generation. By merging\nthese two novel components, our framework, referred to as StoryDiffusion, can\ndescribe a text-based story with consistent images or videos encompassing a\nrich variety of contents. The proposed StoryDiffusion encompasses pioneering\nexplorations in visual story generation with the presentation of images and\nvideos, which we hope could inspire more research from the aspect of\narchitectural modifications. Our code is made publicly available at\nhttps://github.com/HVision-NKU/StoryDiffusion.\n","authors":["Yupeng Zhou","Daquan Zhou","Ming-Ming Cheng","Jiashi Feng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2405.01434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.04382v6","updated":"2024-05-02T16:17:25Z","published":"2022-05-09T15:35:33Z","title":"FlowBot3D: Learning 3D Articulation Flow to Manipulate Articulated\n Objects","summary":" We explore a novel method to perceive and manipulate 3D articulated objects\nthat generalizes to enable a robot to articulate unseen classes of objects. We\npropose a vision-based system that learns to predict the potential motions of\nthe parts of a variety of articulated objects to guide downstream motion\nplanning of the system to articulate the objects. To predict the object\nmotions, we train a neural network to output a dense vector field representing\nthe point-wise motion direction of the points in the point cloud under\narticulation. We then deploy an analytical motion planner based on this vector\nfield to achieve a policy that yields maximum articulation. We train the vision\nsystem entirely in simulation, and we demonstrate the capability of our system\nto generalize to unseen object instances and novel categories in both\nsimulation and the real world, deploying our policy on a Sawyer robot with no\nfinetuning. 
Results show that our system achieves state-of-the-art performance\nin both simulated and real-world experiments.\n","authors":["Ben Eisner","Harry Zhang","David Held"],"pdf_url":"https://arxiv.org/pdf/2205.04382v6.pdf","comment":"Accepted to Robotics Science and Systems (RSS) 2022, Best Paper\n Finalist"},{"id":"http://arxiv.org/abs/2405.01413v1","updated":"2024-05-02T16:04:30Z","published":"2024-05-02T16:04:30Z","title":"MiniGPT-3D: Efficiently Aligning 3D Point Clouds with Large Language\n Models using 2D Priors","summary":" Large 2D vision-language models (2D-LLMs) have gained significant attention\nby bridging Large Language Models (LLMs) with images using a simple projector.\nInspired by their success, large 3D point cloud-language models (3D-LLMs) also\nintegrate point clouds into LLMs. However, directly aligning point clouds with\nLLM requires expensive training costs, typically in hundreds of GPU-hours on\nA100, which hinders the development of 3D-LLMs. In this paper, we introduce\nMiniGPT-3D, an efficient and powerful 3D-LLM that achieves multiple SOTA\nresults while training for only 27 hours on one RTX 3090. Specifically, we\npropose to align 3D point clouds with LLMs using 2D priors from 2D-LLMs, which\ncan leverage the similarity between 2D and 3D visual information. We introduce\na novel four-stage training strategy for modality alignment in a cascaded way,\nand a mixture of query experts module to adaptively aggregate features with\nhigh efficiency. Moreover, we utilize parameter-efficient fine-tuning methods\nLoRA and Norm fine-tuning, resulting in only 47.8M learnable parameters, which\nis up to 260x fewer than existing methods. Extensive experiments show that\nMiniGPT-3D achieves SOTA on 3D object classification and captioning tasks, with\nsignificantly cheaper training costs. Notably, MiniGPT-3D gains an 8.12\nincrease on GPT-4 evaluation score for the challenging object captioning task\ncompared to ShapeLLM-13B, while the latter costs 160 total GPU-hours on 8 A800.\nWe are the first to explore the efficient 3D-LLM, offering new insights to the\ncommunity. Code and weights are available at\nhttps://github.com/TangYuan96/MiniGPT-3D.\n","authors":["Yuan Tang","Xu Han","Xianzhi Li","Qiao Yu","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01413v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2211.09325v3","updated":"2024-05-02T16:04:19Z","published":"2022-11-17T04:06:16Z","title":"TAX-Pose: Task-Specific Cross-Pose Estimation for Robot Manipulation","summary":" How do we imbue robots with the ability to efficiently manipulate unseen\nobjects and transfer relevant skills based on demonstrations? End-to-end\nlearning methods often fail to generalize to novel objects or unseen\nconfigurations. Instead, we focus on the task-specific pose relationship\nbetween relevant parts of interacting objects. We conjecture that this\nrelationship is a generalizable notion of a manipulation task that can transfer\nto new objects in the same category; examples include the relationship between\nthe pose of a pan relative to an oven or the pose of a mug relative to a mug\nrack. We call this task-specific pose relationship \"cross-pose\" and provide a\nmathematical definition of this concept. We propose a vision-based system that\nlearns to estimate the cross-pose between two objects for a given manipulation\ntask using learned cross-object correspondences. 
The estimated cross-pose is\nthen used to guide a downstream motion planner to manipulate the objects into\nthe desired pose relationship (placing a pan into the oven or the mug onto the\nmug rack). We demonstrate our method's capability to generalize to unseen\nobjects, in some cases after training on only 10 demonstrations in the real\nworld. Results show that our system achieves state-of-the-art performance in\nboth simulated and real-world experiments across a number of tasks.\nSupplementary information and videos can be found at\nhttps://sites.google.com/view/tax-pose/home.\n","authors":["Chuer Pan","Brian Okorn","Harry Zhang","Ben Eisner","David Held"],"pdf_url":"https://arxiv.org/pdf/2211.09325v3.pdf","comment":"Conference on Robot Learning (CoRL), 2022. Supplementary material is\n available at https://sites.google.com/view/tax-pose/home"},{"id":"http://arxiv.org/abs/2405.01409v1","updated":"2024-05-02T16:01:58Z","published":"2024-05-02T16:01:58Z","title":"Goal-conditioned reinforcement learning for ultrasound navigation\n guidance","summary":" Transesophageal echocardiography (TEE) plays a pivotal role in cardiology for\ndiagnostic and interventional procedures. However, using it effectively\nrequires extensive training due to the intricate nature of image acquisition\nand interpretation. To enhance the efficiency of novice sonographers and reduce\nvariability in scan acquisitions, we propose a novel ultrasound (US) navigation\nassistance method based on contrastive learning as goal-conditioned\nreinforcement learning (GCRL). We augment the previous framework using a novel\ncontrastive patient batching method (CPB) and a data-augmented contrastive\nloss, both of which we demonstrate are essential to ensure generalization to\nanatomical variations across patients. The proposed framework enables\nnavigation to both standard diagnostic as well as intricate interventional\nviews with a single model. Our method was developed with a large dataset of 789\npatients and obtained an average error of 6.56 mm in position and 9.36 degrees\nin angle on a testing dataset of 140 patients, which is competitive or superior\nto models trained on individual views. Furthermore, we quantitatively validate\nour method's ability to navigate to interventional views such as the Left\nAtrial Appendage (LAA) view used in LAA closure. Our approach holds promise in\nproviding valuable guidance during transesophageal ultrasound examinations,\ncontributing to the advancement of skill acquisition for cardiac ultrasound\npractitioners.\n","authors":["Abdoul Aziz Amadou","Vivek Singh","Florin C. Ghesu","Young-Ho Kim","Laura Stanciulescu","Harshitha P. Sai","Puneet Sharma","Alistair Young","Ronak Rajani","Kawal Rhode"],"pdf_url":"https://arxiv.org/pdf/2405.01409v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2209.10368v4","updated":"2024-05-02T15:46:28Z","published":"2022-09-21T14:03:08Z","title":"USC: Uncompromising Spatial Constraints for Safety-Oriented 3D Object\n Detectors in Autonomous Driving","summary":" We consider the safety-oriented performance of 3D object detectors in\nautonomous driving contexts. Specifically, despite impressive results shown by\nthe mass literature, developers often find it hard to ensure the safe\ndeployment of these learning-based perception models. 
Attributing the challenge\nto the lack of safety-oriented metrics, we hereby present uncompromising\nspatial constraints (USC), which characterize a simple yet important\nlocalization requirement demanding the predictions to fully cover the objects\nwhen seen from the autonomous vehicle. The constraints, as we formulate using\nthe perspective and bird's-eye views, can be naturally reflected by\nquantitative measures, such that having an object detector with a higher score\nimplies a lower risk of collision. Finally, beyond model evaluation, we\nincorporate the quantitative measures into common loss functions to enable\nsafety-oriented fine-tuning for existing models. With experiments using the\nnuScenes dataset and a closed-loop simulation, our work demonstrates such\nconsiderations of safety notions at the perception level not only improve model\nperformances beyond accuracy but also allow for a more direct linkage to actual\nsystem safety.\n","authors":["Brian Hsuan-Cheng Liao","Chih-Hong Cheng","Hasan Esen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2209.10368v4.pdf","comment":"8 pages (IEEE double column format), 7 figures, 2 tables, submitted\n to ITSC 2024"},{"id":"http://arxiv.org/abs/2307.06065v3","updated":"2024-05-02T15:41:11Z","published":"2023-07-12T10:29:40Z","title":"Operational Support Estimator Networks","summary":" In this work, we propose a novel approach called Operational Support\nEstimator Networks (OSENs) for the support estimation task. Support Estimation\n(SE) is defined as finding the locations of non-zero elements in sparse\nsignals. By its very nature, the mapping between the measurement and sparse\nsignal is a non-linear operation. Traditional support estimators rely on\ncomputationally expensive iterative signal recovery techniques to achieve such\nnon-linearity. Contrary to the convolutional layers, the proposed OSEN approach\nconsists of operational layers that can learn such complex non-linearities\nwithout the need for deep networks. In this way, the performance of\nnon-iterative support estimation is greatly improved. Moreover, the operational\nlayers comprise so-called generative super neurons with non-local kernels. The\nkernel location for each neuron/feature map is optimized jointly for the SE\ntask during training. We evaluate the OSENs in three different applications: i.\nsupport estimation from Compressive Sensing (CS) measurements, ii.\nrepresentation-based classification, and iii. learning-aided CS reconstruction\nwhere the output of OSENs is used as prior knowledge to the CS algorithm for\nenhanced reconstruction. Experimental results show that the proposed approach\nachieves computational efficiency and outperforms competing methods, especially\nat low measurement rates by significant margins. The software implementation is\nshared at https://github.com/meteahishali/OSEN.\n","authors":["Mete Ahishali","Mehmet Yamac","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2307.06065v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01373v1","updated":"2024-05-02T15:15:01Z","published":"2024-05-02T15:15:01Z","title":"ATOM: Attention Mixer for Efficient Dataset Distillation","summary":" Recent works in dataset distillation seek to minimize training expenses by\ngenerating a condensed synthetic dataset that encapsulates the information\npresent in a larger real dataset. These approaches ultimately aim to attain\ntest accuracy levels akin to those achieved by models trained on the entirety\nof the original dataset. 
Previous studies in feature and distribution matching\nhave achieved significant results without incurring the costs of bi-level\noptimization in the distillation process. Despite their convincing efficiency,\nmany of these methods suffer from marginal downstream performance improvements,\nlimited distillation of contextual information, and subpar cross-architecture\ngeneralization. To address these challenges in dataset distillation, we propose\nthe ATtentiOn Mixer (ATOM) module to efficiently distill large datasets using a\nmixture of channel and spatial-wise attention in the feature matching process.\nSpatial-wise attention helps guide the learning process based on consistent\nlocalization of classes in their respective images, allowing for distillation\nfrom a broader receptive field. Meanwhile, channel-wise attention captures the\ncontextual information associated with the class itself, thus making the\nsynthetic image more informative for training. By integrating both types of\nattention, our ATOM module demonstrates superior performance across various\ncomputer vision datasets, including CIFAR10/100 and TinyImagenet. Notably, our\nmethod significantly improves performance in scenarios with a low number of\nimages per class, thereby enhancing its potential. Furthermore, we maintain the\nimprovement in cross-architectures and applications such as neural architecture\nsearch.\n","authors":["Samir Khaki","Ahmad Sajedi","Kai Wang","Lucy Z. Liu","Yuri A. Lawryshyn","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2405.01373v1.pdf","comment":"Accepted for an oral presentation in CVPR-DD 2024"},{"id":"http://arxiv.org/abs/2405.01356v1","updated":"2024-05-02T15:03:41Z","published":"2024-05-02T15:03:41Z","title":"Improving Subject-Driven Image Synthesis with Subject-Agnostic Guidance","summary":" In subject-driven text-to-image synthesis, the synthesis process tends to be\nheavily influenced by the reference images provided by users, often overlooking\ncrucial attributes detailed in the text prompt. In this work, we propose\nSubject-Agnostic Guidance (SAG), a simple yet effective solution to remedy the\nproblem. We show that through constructing a subject-agnostic condition and\napplying our proposed dual classifier-free guidance, one could obtain outputs\nconsistent with both the given subject and input text prompts. We validate the\nefficacy of our approach through both optimization-based and encoder-based\nmethods. Additionally, we demonstrate its applicability in second-order\ncustomization methods, where an encoder-based model is fine-tuned with\nDreamBooth. Our approach is conceptually simple and requires only minimal code\nmodifications, but leads to substantial quality improvements, as evidenced by\nour evaluations and user studies.\n","authors":["Kelvin C. K. Chan","Yang Zhao","Xuhui Jia","Ming-Hsuan Yang","Huisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.01356v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2405.01353v1","updated":"2024-05-02T15:01:25Z","published":"2024-05-02T15:01:25Z","title":"Sparse multi-view hand-object reconstruction for unseen environments","summary":" Recent works in hand-object reconstruction mainly focus on the single-view\nand dense multi-view settings. On the one hand, single-view methods can\nleverage learned shape priors to generalise to unseen objects but are prone to\ninaccuracies due to occlusions. 
On the other hand, dense multi-view methods are\nvery accurate but cannot easily adapt to unseen objects without further data\ncollection. In contrast, sparse multi-view methods can take advantage of the\nadditional views to tackle occlusion, while keeping the computational cost low\ncompared to dense multi-view methods. In this paper, we consider the problem of\nhand-object reconstruction with unseen objects in the sparse multi-view\nsetting. Given multiple RGB images of the hand and object captured at the same\ntime, our model SVHO combines the predictions from each view into a unified\nreconstruction without optimisation across views. We train our model on a\nsynthetic hand-object dataset and evaluate directly on a real world recorded\nhand-object dataset with unseen objects. We show that while reconstruction of\nunseen hands and objects from RGB is challenging, additional views can help\nimprove the reconstruction quality.\n","authors":["Yik Lung Pang","Changjae Oh","Andrea Cavallaro"],"pdf_url":"https://arxiv.org/pdf/2405.01353v1.pdf","comment":"Camera-ready version. Paper accepted to CVPRW 2024. 8 pages, 7\n figures, 1 table"},{"id":"http://arxiv.org/abs/2404.17230v2","updated":"2024-05-02T14:57:37Z","published":"2024-04-26T08:02:07Z","title":"ObjectAdd: Adding Objects into Image via a Training-Free Diffusion\n Modification Fashion","summary":" We introduce ObjectAdd, a training-free diffusion modification method to add\nuser-expected objects into user-specified area. The motive of ObjectAdd stems\nfrom: first, describing everything in one prompt can be difficult, and second,\nusers often need to add objects into the generated image. To accommodate with\nreal world, our ObjectAdd maintains accurate image consistency after adding\nobjects with technical innovations in: (1) embedding-level concatenation to\nensure correct text embedding coalesce; (2) object-driven layout control with\nlatent and attention injection to ensure objects accessing user-specified area;\n(3) prompted image inpainting in an attention refocusing & object expansion\nfashion to ensure rest of the image stays the same. With a text-prompted image,\nour ObjectAdd allows users to specify a box and an object, and achieves: (1)\nadding object inside the box area; (2) exact content outside the box area; (3)\nflawless fusion between the two areas\n","authors":["Ziyue Zhang","Mingbao Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.17230v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2405.01337v1","updated":"2024-05-02T14:43:21Z","published":"2024-05-02T14:43:21Z","title":"Multi-view Action Recognition via Directed Gromov-Wasserstein\n Discrepancy","summary":" Action recognition has become one of the popular research topics in computer\nvision. There are various methods based on Convolutional Networks and\nself-attention mechanisms as Transformers to solve both spatial and temporal\ndimensions problems of action recognition tasks that achieve competitive\nperformances. However, these methods lack a guarantee of the correctness of the\naction subject that the models give attention to, i.e., how to ensure an action\nrecognition model focuses on the proper action subject to make a reasonable\naction prediction. 
In this paper, we propose a multi-view attention consistency\nmethod that computes the similarity between two attentions from two different\nviews of the action videos using Directed Gromov-Wasserstein Discrepancy.\nFurthermore, our approach applies the idea of Neural Radiance Field to\nimplicitly render the features from novel views when training on single-view\ndatasets. Therefore, the contributions in this work are three-fold. Firstly, we\nintroduce the multi-view attention consistency to solve the problem of\nreasonable prediction in action recognition. Secondly, we define a new metric\nfor multi-view consistent attention using Directed Gromov-Wasserstein\nDiscrepancy. Thirdly, we built an action recognition model based on Video\nTransformers and Neural Radiance Fields. Compared to the recent action\nrecognition methods, the proposed approach achieves state-of-the-art results on\nthree large-scale datasets, i.e., Jester, Something-Something V2, and\nKinetics-400.\n","authors":["Hoang-Quan Nguyen","Thanh-Dat Truong","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2405.01337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01333v1","updated":"2024-05-02T14:38:18Z","published":"2024-05-02T14:38:18Z","title":"NeRF in Robotics: A Survey","summary":" Meticulous 3D environment representations have been a longstanding goal in\ncomputer vision and robotics fields. The recent emergence of neural implicit\nrepresentations has introduced radical innovation to this field as implicit\nrepresentations enable numerous capabilities. Among these, the Neural Radiance\nField (NeRF) has sparked a trend because of the huge representational\nadvantages, such as simplified mathematical models, compact environment\nstorage, and continuous scene representations. Apart from computer vision, NeRF\nhas also shown tremendous potential in the field of robotics. Thus, we create\nthis survey to provide a comprehensive understanding of NeRF in the field of\nrobotics. By exploring the advantages and limitations of NeRF, as well as its\ncurrent applications and future potential, we hope to shed light on this\npromising area of research. Our survey is divided into two main sections:\n\\textit{The Application of NeRF in Robotics} and \\textit{The Advance of NeRF in\nRobotics}, from the perspective of how NeRF enters the field of robotics. In\nthe first section, we introduce and analyze some works that have been or could\nbe used in the field of robotics from the perception and interaction\nperspectives. In the second section, we show some works related to improving\nNeRF's own properties, which are essential for deploying NeRF in the field of\nrobotics. In the discussion section of the review, we summarize the existing\nchallenges and provide some valuable future research directions for reference.\n","authors":["Guangming Wang","Lei Pan","Songyou Peng","Shaohui Liu","Chenfeng Xu","Yanzi Miao","Wei Zhan","Masayoshi Tomizuka","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.01333v1.pdf","comment":"21 pages, 19 figures"},{"id":"http://arxiv.org/abs/2405.01326v1","updated":"2024-05-02T14:31:47Z","published":"2024-05-02T14:31:47Z","title":"Multi-modal Learnable Queries for Image Aesthetics Assessment","summary":" Image aesthetics assessment (IAA) is attracting wide interest with the\nprevalence of social media. The problem is challenging due to its subjective\nand ambiguous nature. 
Instead of directly extracting aesthetic features solely\nfrom the image, user comments associated with an image could potentially\nprovide complementary knowledge that is useful for IAA. With existing\nlarge-scale pre-trained models demonstrating strong capabilities in extracting\nhigh-quality transferable visual and textual features, learnable queries are\nshown to be effective in extracting useful features from the pre-trained visual\nfeatures. Therefore, in this paper, we propose MMLQ, which utilizes multi-modal\nlearnable queries to extract aesthetics-related features from multi-modal\npre-trained features. Extensive experimental results demonstrate that MMLQ\nachieves new state-of-the-art performance on multi-modal IAA, beating previous\nmethods by 7.7% and 8.3% in terms of SRCC and PLCC, respectively.\n","authors":["Zhiwei Xiong","Yunfan Zhang","Zhiqi Shen","Peiran Ren","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2405.01326v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2405.01311v1","updated":"2024-05-02T14:20:20Z","published":"2024-05-02T14:20:20Z","title":"Imagine the Unseen: Occluded Pedestrian Detection via Adversarial\n Feature Completion","summary":" Pedestrian detection has significantly progressed in recent years, thanks to\nthe development of DNNs. However, detection performance at occluded scenes is\nstill far from satisfactory, as occlusion increases the intra-class variance of\npedestrians, hindering the model from finding an accurate classification\nboundary between pedestrians and background clutters. From the perspective of\nreducing intra-class variance, we propose to complete features for occluded\nregions so as to align the features of pedestrians across different occlusion\npatterns. An important premise for feature completion is to locate occluded\nregions. From our analysis, channel features of different pedestrian proposals\nonly show high correlation values at visible parts and thus feature\ncorrelations can be used to model occlusion patterns. In order to narrow down\nthe gap between completed features and real fully visible ones, we propose an\nadversarial learning method, which completes occluded features with a generator\nsuch that they can hardly be distinguished by the discriminator from real fully\nvisible features. We report experimental results on the CityPersons, Caltech\nand CrowdHuman datasets. On CityPersons, we show significant improvements over\nfive different baseline detectors, especially on the heavy occlusion subset.\nFurthermore, we show that our proposed method FeatComp++ achieves\nstate-of-the-art results on all the above three datasets without relying on\nextra cues.\n","authors":["Shanshan Zhang","Mingqian Ji","Yang Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2405.01311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04930v2","updated":"2024-05-02T13:53:01Z","published":"2024-02-07T14:59:25Z","title":"Blue noise for diffusion models","summary":" Most of the existing diffusion models use Gaussian noise for training and\nsampling across all time steps, which may not optimally account for the\nfrequency contents reconstructed by the denoising network. Despite the diverse\napplications of correlated noise in computer graphics, its potential for\nimproving the training process has been underexplored. In this paper, we\nintroduce a novel and general class of diffusion models taking correlated noise\nwithin and across images into account. 
More specifically, we propose a\ntime-varying noise model to incorporate correlated noise into the training\nprocess, as well as a method for fast generation of correlated noise mask. Our\nmodel is built upon deterministic diffusion models and utilizes blue noise to\nhelp improve the generation quality compared to using Gaussian white (random)\nnoise only. Further, our framework allows introducing correlation across images\nwithin a single mini-batch to improve gradient flow. We perform both\nqualitative and quantitative evaluations on a variety of datasets using our\nmethod, achieving improvements on different tasks over existing deterministic\ndiffusion models in terms of FID metric.\n","authors":["Xingchang Huang","Corentin Salaün","Cristina Vasconcelos","Christian Theobalt","Cengiz Öztireli","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2402.04930v2.pdf","comment":"SIGGRAPH 2024 Conference Proceedings; Project page:\n https://xchhuang.github.io/bndm"},{"id":"http://arxiv.org/abs/2404.04916v2","updated":"2024-05-02T13:37:13Z","published":"2024-04-07T10:57:54Z","title":"Correcting Diffusion-Based Perceptual Image Compression with Privileged\n End-to-End Decoder","summary":" The images produced by diffusion models can attain excellent perceptual\nquality. However, it is challenging for diffusion models to guarantee\ndistortion, hence the integration of diffusion models and image compression\nmodels still needs more comprehensive explorations. This paper presents a\ndiffusion-based image compression method that employs a privileged end-to-end\ndecoder model as correction, which achieves better perceptual quality while\nguaranteeing the distortion to an extent. We build a diffusion model and design\na novel paradigm that combines the diffusion model and an end-to-end decoder,\nand the latter is responsible for transmitting the privileged information\nextracted at the encoder side. Specifically, we theoretically analyze the\nreconstruction process of the diffusion models at the encoder side with the\noriginal images being visible. Based on the analysis, we introduce an\nend-to-end convolutional decoder to provide a better approximation of the score\nfunction $\\nabla_{\\mathbf{x}_t}\\log p(\\mathbf{x}_t)$ at the encoder side and\neffectively transmit the combination. Experiments demonstrate the superiority\nof our method in both distortion and perception compared with previous\nperceptual compression methods.\n","authors":["Yiyang Ma","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04916v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.01273v1","updated":"2024-05-02T13:31:09Z","published":"2024-05-02T13:31:09Z","title":"Towards Inclusive Face Recognition Through Synthetic Ethnicity\n Alteration","summary":" Numerous studies have shown that existing Face Recognition Systems (FRS),\nincluding commercial ones, often exhibit biases toward certain ethnicities due\nto under-represented data. In this work, we explore ethnicity alteration and\nskin tone modification using synthetic face image generation methods to\nincrease the diversity of datasets. We conduct a detailed analysis by first\nconstructing a balanced face image dataset representing three ethnicities:\nAsian, Black, and Indian. We then make use of existing Generative Adversarial\nNetwork-based (GAN) image-to-image translation and manifold learning models to\nalter the ethnicity from one to another. 
A systematic analysis is further\nconducted to assess the suitability of such datasets for FRS by studying the\nrealistic skin-tone representation using Individual Typology Angle (ITA).\nFurther, we also analyze the quality characteristics using existing Face image\nquality assessment (FIQA) approaches. We then provide a holistic FRS\nperformance analysis using four different systems. Our findings pave the way\nfor future research works in (i) developing both specific ethnicity and general\n(any to any) ethnicity alteration models, (ii) expanding such approaches to\ncreate databases with diverse skin tones, (iii) creating datasets representing\nvarious ethnicities which further can help in mitigating bias while addressing\nprivacy concerns.\n","authors":["Praveen Kumar Chandaliya","Kiran Raja","Raghavendra Ramachandra","Zahid Akhtar","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2405.01273v1.pdf","comment":"8 Pages"},{"id":"http://arxiv.org/abs/2309.08152v2","updated":"2024-05-02T13:30:29Z","published":"2023-09-15T04:37:28Z","title":"DA-RAW: Domain Adaptive Object Detection for Real-World Adverse Weather\n Conditions","summary":" Despite the success of deep learning-based object detection methods in recent\nyears, it is still challenging to make the object detector reliable in adverse\nweather conditions such as rain and snow. For the robust performance of object\ndetectors, unsupervised domain adaptation has been utilized to adapt the\ndetection network trained on clear weather images to adverse weather images.\nWhile previous methods do not explicitly address weather corruption during\nadaptation, the domain gap between clear and adverse weather can be decomposed\ninto two factors with distinct characteristics: a style gap and a weather gap.\nIn this paper, we present an unsupervised domain adaptation framework for\nobject detection that can more effectively adapt to real-world environments\nwith adverse weather conditions by addressing these two gaps separately. Our\nmethod resolves the style gap by concentrating on style-related information of\nhigh-level features using an attention module. Using self-supervised\ncontrastive learning, our framework then reduces the weather gap and acquires\ninstance features that are robust to weather corruption. Extensive experiments\ndemonstrate that our method outperforms other methods for object detection in\nadverse weather conditions.\n","authors":["Minsik Jeon","Junwon Seo","Jihong Min"],"pdf_url":"https://arxiv.org/pdf/2309.08152v2.pdf","comment":"Accepted to ICRA 2024. Our project website can be found at\n https://bit.ly/3yccTRa"},{"id":"http://arxiv.org/abs/2305.16965v2","updated":"2024-05-02T13:12:03Z","published":"2023-05-26T14:20:36Z","title":"Accelerating Diffusion Models for Inverse Problems through Shortcut\n Sampling","summary":" Diffusion models have recently demonstrated an impressive ability to address\ninverse problems in an unsupervised manner. While existing methods primarily\nfocus on modifying the posterior sampling process, the potential of the forward\nprocess remains largely unexplored. In this work, we propose Shortcut Sampling\nfor Diffusion(SSD), a novel approach for solving inverse problems in a\nzero-shot manner. Instead of initiating from random noise, the core concept of\nSSD is to find a specific transitional state that bridges the measurement image\ny and the restored image x. By utilizing the shortcut path of \"input -\ntransitional state - output\", SSD can achieve precise restoration with fewer\nsteps. 
To derive the transitional state during the forward process, we\nintroduce Distortion Adaptive Inversion. Moreover, we apply back projection as\nadditional consistency constraints during the generation process.\nExperimentally, we demonstrate SSD's effectiveness on multiple representative\nIR tasks. Our method achieves competitive results with only 30 NFEs compared to\nstate-of-the-art zero-shot methods (100 NFEs) and outperforms them with 100 NFEs\nin certain tasks. Code is available at https://github.com/GongyeLiu/SSD\n","authors":["Gongye Liu","Haoze Sun","Jiayi Li","Fei Yin","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2305.16965v2.pdf","comment":"full version; IJCAI 2024 accepted (main track)"},{"id":"http://arxiv.org/abs/2405.01258v1","updated":"2024-05-02T13:04:26Z","published":"2024-05-02T13:04:26Z","title":"Towards Consistent Object Detection via LiDAR-Camera Synergy","summary":" As human-machine interaction continues to evolve, the capacity for\nenvironmental perception is becoming increasingly crucial. Integrating the two\nmost common types of sensory data, images, and point clouds, can enhance\ndetection accuracy. However, currently, no model exists that can simultaneously\ndetect an object's position in both point clouds and images and ascertain their\ncorresponding relationship. This information is invaluable for human-machine\ninteractions, offering new possibilities for their enhancement. In light of\nthis, this paper introduces an end-to-end Consistency Object Detection (COD)\nalgorithm framework that requires only a single forward inference to\nsimultaneously obtain an object's position in both point clouds and images and\nestablish their correlation. Furthermore, to assess the accuracy of the object\ncorrelation between point clouds and images, this paper proposes a new\nevaluation metric, Consistency Precision (CP). To verify the effectiveness of\nthe proposed framework, an extensive set of experiments has been conducted on\nthe KITTI and DAIR-V2X datasets. The study also explored how the proposed\nconsistency detection method performs on images when the calibration parameters\nbetween images and point clouds are disturbed, compared to existing\npost-processing methods. The experimental results demonstrate that the proposed\nmethod exhibits excellent detection performance and robustness, achieving\nend-to-end consistency detection. The source code will be made publicly\navailable at https://github.com/xifen523/COD.\n","authors":["Kai Luo","Hao Wu","Kefu Yi","Kailun Yang","Wei Hao","Rongdong Hu"],"pdf_url":"https://arxiv.org/pdf/2405.01258v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/xifen523/COD"},{"id":"http://arxiv.org/abs/2405.01230v1","updated":"2024-05-02T12:21:51Z","published":"2024-05-02T12:21:51Z","title":"Evaluation of Video-Based rPPG in Challenging Environments: Artifact\n Mitigation and Network Resilience","summary":" Video-based remote photoplethysmography (rPPG) has emerged as a promising\ntechnology for non-contact vital sign monitoring, especially under controlled\nconditions. However, the accurate measurement of vital signs in real-world\nscenarios faces several challenges, including artifacts induced by videocodecs,\nlow-light noise, degradation, low dynamic range, occlusions, and hardware and\nnetwork constraints. In this article, we systematically and\ncomprehensively investigate these issues, measuring their detrimental effects on\nthe quality of rPPG measurements. 
Additionally, we propose practical strategies\nfor mitigating these challenges to improve the dependability and resilience of\nvideo-based rPPG systems. We detail methods for effective biosignal recovery in\nthe presence of network limitations and present denoising and inpainting\ntechniques aimed at preserving video frame integrity. Through extensive\nevaluations and direct comparisons, we demonstrate the effectiveness of the\napproaches in enhancing rPPG measurements under challenging environments,\ncontributing to the development of more reliable and effective remote vital\nsign monitoring technologies.\n","authors":["Nhi Nguyen","Le Nguyen","Honghan Li","Miguel Bordallo López","Constantino Álvarez Casado"],"pdf_url":"https://arxiv.org/pdf/2405.01230v1.pdf","comment":"22 main article pages with 3 supplementary pages, journal"},{"id":"http://arxiv.org/abs/2405.01228v1","updated":"2024-05-02T12:13:00Z","published":"2024-05-02T12:13:00Z","title":"RaffeSDG: Random Frequency Filtering enabled Single-source Domain\n Generalization for Medical Image Segmentation","summary":" Deep learning models often encounter challenges in making accurate inferences\nwhen there are domain shifts between the source and target data. This issue is\nparticularly pronounced in clinical settings due to the scarcity of annotated\ndata resulting from the professional and private nature of medical data.\nDespite the existence of decent solutions, many of them are hindered in\nclinical settings due to limitations in data collection and computational\ncomplexity. To tackle domain shifts in data-scarce medical scenarios, we\npropose a Random frequency filtering enabled Single-source Domain\nGeneralization algorithm (RaffeSDG), which promises robust out-of-domain\ninference with segmentation models trained on a single-source domain. A\nfilter-based data augmentation strategy is first proposed to promote domain\nvariability within a single-source domain by introducing variations in\nfrequency space and blending homologous samples. Then Gaussian filter-based\nstructural saliency is also leveraged to learn robust representations across\naugmented samples, further facilitating the training of generalizable\nsegmentation models. To validate the effectiveness of RaffeSDG, we conducted\nextensive experiments involving out-of-domain inference on segmentation tasks\nfor three human tissues imaged by four diverse modalities. Through thorough\ninvestigations and comparisons, compelling evidence was observed in these\nexperiments, demonstrating the potential and generalizability of RaffeSDG. The\ncode is available at\nhttps://github.com/liamheng/Non-IID_Medical_Image_Segmentation.\n","authors":["Heng Li","Haojin Li","Jianyu Chen","Zhongxi Qiu","Huazhu Fu","Lidai Wang","Yan Hu","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2405.01228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14014v3","updated":"2024-05-02T12:10:16Z","published":"2023-05-23T12:51:20Z","title":"CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained\n Vision-Language Model","summary":" Pre-trained vision-language models~(VLMs) are the de-facto foundation models\nfor various downstream tasks. However, scene text recognition methods still\nprefer backbones pre-trained on a single modality, namely, the visual modality,\ndespite the potential of VLMs to serve as powerful scene text readers. For\nexample, CLIP can robustly identify regular (horizontal) and irregular\n(rotated, curved, blurred, or occluded) text in images. 
With such merits, we\ntransform CLIP into a scene text reader and introduce CLIP4STR, a simple yet\neffective STR method built upon image and text encoders of CLIP. It has two\nencoder-decoder branches: a visual branch and a cross-modal branch. The visual\nbranch provides an initial prediction based on the visual feature, and the\ncross-modal branch refines this prediction by addressing the discrepancy\nbetween the visual feature and text semantics. To fully leverage the\ncapabilities of both branches, we design a dual predict-and-refine decoding\nscheme for inference. We scale CLIP4STR in terms of the model size,\npre-training data, and training data, achieving state-of-the-art performance on\n11 STR benchmarks. Additionally, a comprehensive empirical study is provided to\nenhance the understanding of the adaptation of CLIP to STR. We believe our\nmethod establishes a simple yet strong baseline for future STR research with\nVLMs.\n","authors":["Shuai Zhao","Ruijie Quan","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.14014v3.pdf","comment":"Preprint. A PyTorch re-implementation is at\n https://github.com/VamosC/CLIP4STR"},{"id":"http://arxiv.org/abs/2405.01217v1","updated":"2024-05-02T11:58:06Z","published":"2024-05-02T11:58:06Z","title":"CromSS: Cross-modal pre-training with noisy labels for remote sensing\n image segmentation","summary":" We study the potential of noisy labels y to pretrain semantic segmentation\nmodels in a multi-modal learning framework for geospatial applications.\nSpecifically, we propose a novel Cross-modal Sample Selection method (CromSS)\nthat utilizes the class distributions P^{(d)}(x,c) over pixels x and classes c\nmodelled by multiple sensors/modalities d of a given geospatial scene.\nConsistency of predictions across sensors $d$ is jointly informed by the\nentropy of P^{(d)}(x,c). Noisy label sampling we determine by the confidence of\neach sensor d in the noisy class label, P^{(d)}(x,c=y(x)). To verify the\nperformance of our approach, we conduct experiments with Sentinel-1 (radar) and\nSentinel-2 (optical) satellite imagery from the globally-sampled SSL4EO-S12\ndataset. We pair those scenes with 9-class noisy labels sourced from the Google\nDynamic World project for pretraining. Transfer learning evaluations\n(downstream task) on the DFC2020 dataset confirm the effectiveness of the\nproposed method for remote sensing image segmentation.\n","authors":["Chenying Liu","Conrad Albrecht","Yi Wang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.01217v1.pdf","comment":"Accepted as an oral presentation by ICLR 2024 ML4RS workshop"},{"id":"http://arxiv.org/abs/2405.01205v1","updated":"2024-05-02T11:48:14Z","published":"2024-05-02T11:48:14Z","title":"Error-Driven Uncertainty Aware Training","summary":" Neural networks are often overconfident about their predictions, which\nundermines their reliability and trustworthiness. In this work, we present a\nnovel technique, named Error-Driven Uncertainty Aware Training (EUAT), which\naims to enhance the ability of neural models to estimate their uncertainty\ncorrectly, namely to be highly uncertain when they output inaccurate\npredictions and low uncertain when their output is accurate. The EUAT approach\noperates during the model's training phase by selectively employing two loss\nfunctions depending on whether the training examples are correctly or\nincorrectly predicted by the model. 
This allows for pursuing the twofold goal\nof i) minimizing model uncertainty for correctly predicted inputs and ii)\nmaximizing uncertainty for mispredicted inputs, while preserving the model's\nmisprediction rate. We evaluate EUAT using diverse neural models and datasets\nin the image recognition domains considering both non-adversarial and\nadversarial settings. The results show that EUAT outperforms existing\napproaches for uncertainty estimation (including other uncertainty-aware\ntraining techniques, calibration, ensembles, and DEUP) by providing uncertainty\nestimates that not only have higher quality when evaluated via statistical\nmetrics (e.g., correlation with residuals) but also when employed to build\nbinary classifiers that decide whether the model's output can be trusted or not\nand under distributional data shifts.\n","authors":["Pedro Mendes","Paolo Romano","David Garlan"],"pdf_url":"https://arxiv.org/pdf/2405.01205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01204v1","updated":"2024-05-02T11:46:12Z","published":"2024-05-02T11:46:12Z","title":"Towards Cross-Scale Attention and Surface Supervision for Fractured Bone\n Segmentation in CT","summary":" Bone segmentation is an essential step for the preoperative planning of\nfracture trauma surgery. The automated segmentation of fractured bone from\ncomputed tomography (CT) scans remains challenging, due to the large\ndifferences of fractures in position and morphology, and also the inherent\nanatomical characteristics of different bone structures. To alleviate these\nissues, we propose a cross-scale attention mechanism as well as a surface\nsupervision strategy for fractured bone segmentation in CT. Specifically, a\ncross-scale attention mechanism is introduced to effectively aggregate the\nfeatures among different scales to provide more powerful fracture\nrepresentation. Moreover, a surface supervision strategy is employed, which\nexplicitly constrains the network to pay more attention to the bone boundary.\nThe efficacy of the proposed method is evaluated on a public dataset containing\nCT scans with hip fractures. The evaluation metrics are Dice similarity\ncoefficient (DSC), average symmetric surface distance (ASSD), and Hausdorff\ndistance (95HD). The proposed method achieves an average DSC of 93.36%, ASSD of\n0.85mm, 95HD of 7.51mm. Our method offers an effective fracture segmentation\napproach for the pelvic CT examinations, and has the potential to be used for\nimproving the segmentation performance of other types of fractures.\n","authors":["Yu Zhou","Xiahao Zou","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2405.01204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01199v1","updated":"2024-05-02T11:40:44Z","published":"2024-05-02T11:40:44Z","title":"Latent Fingerprint Matching via Dense Minutia Descriptor","summary":" Latent fingerprint matching is a daunting task, primarily due to the poor\nquality of latent fingerprints. In this study, we propose a deep-learning based\ndense minutia descriptor (DMD) for latent fingerprint matching. A DMD is\nobtained by extracting the fingerprint patch aligned by its central minutia,\ncapturing detailed minutia information and texture information. Our dense\ndescriptor takes the form of a three-dimensional representation, with two\ndimensions associated with the original image plane and the other dimension\nrepresenting the abstract features. 
Additionally, the extraction process\noutputs the fingerprint segmentation map, ensuring that the descriptor is only\nvalid in the foreground region. The matching between two descriptors occurs in\ntheir overlapping regions, with a score normalization strategy to reduce the\nimpact brought by the differences outside the valid area. Our descriptor\nachieves state-of-the-art performance on several latent fingerprint datasets.\nOverall, our DMD is more representative and interpretable compared to previous\nmethods.\n","authors":["Zhiyu Pan","Yongjie Duan","Xiongjun Guan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.01199v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.01192v1","updated":"2024-05-02T11:33:54Z","published":"2024-05-02T11:33:54Z","title":"Imagine2touch: Predictive Tactile Sensing for Robotic Manipulation using\n Efficient Low-Dimensional Signals","summary":" Humans seemingly incorporate potential touch signals in their perception. Our\ngoal is to equip robots with a similar capability, which we term Imagine2touch.\nImagine2touch aims to predict the expected touch signal based on a visual patch\nrepresenting the area to be touched. We use ReSkin, an inexpensive and compact\ntouch sensor to collect the required dataset through random touching of five\nbasic geometric shapes, and one tool. We train Imagine2touch on two out of\nthose shapes and validate it on the ood. tool. We demonstrate the efficacy of\nImagine2touch through its application to the downstream task of object\nrecognition. In this task, we evaluate Imagine2touch performance in two\nexperiments, together comprising 5 out of training distribution objects.\nImagine2touch achieves an object recognition accuracy of 58% after ten touches\nper object, surpassing a proprioception baseline.\n","authors":["Abdallah Ayad","Adrian Röfer","Nick Heppert","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2405.01192v1.pdf","comment":"3 pages, 3 figures, 2 tables, accepted at ViTac2024 ICRA2024\n Workshop. arXiv admin note: substantial text overlap with arXiv:2403.15107"},{"id":"http://arxiv.org/abs/2311.18576v2","updated":"2024-05-02T11:31:27Z","published":"2023-11-30T14:15:39Z","title":"Fingerprint Matching with Localized Deep Representation","summary":" Compared to minutia-based fingerprint representations, fixed-length\nrepresentations are attractive due to simple and efficient matching. However,\nfixed-length fingerprint representations are limited in accuracy when matching\nfingerprints with different visible areas, which can occur due to different\nfinger poses or acquisition methods. To address this issue, we propose a\nlocalized deep representation of fingerprint, named LDRF. By focusing on the\ndiscriminative characteristics within local regions, LDRF provides a more\nrobust and accurate fixed-length representation for fingerprints with variable\nvisible areas. LDRF can be adapted to retain information within any valid area,\nmaking it highly flexible. The matching scores produced by LDRF also exhibit\nintuitive statistical characteristics, which led us to propose a matching score\nnormalization technique to mitigate the uncertainty in the cases of very small\noverlapping area. With this new technique, we can maintain a high level of\naccuracy and reliability in our fingerprint matching, even as the size of the\ndatabase grows rapidly. 
Our experimental results on 21 datasets containing over\n140K fingerprints of various finger poses and impression types show that LDRF\noutperforms other fixed-length representations and is robust to sensing\ntechnologies and impression types. Besides, the proposed matching score\nnormalization effectively reduces the false match rate (FMR) in large-scale\nidentification experiments comprising over 5.11 million fingerprints.\nSpecifically, this technique results in a reduction of two orders of magnitude\ncompared to matching without matching score normalization and five orders of\nmagnitude compared to prior works.\n","authors":["Yongjie Duan","Zhiyu Pan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v2.pdf","comment":"18 pages, 20 figures"},{"id":"http://arxiv.org/abs/2405.01175v1","updated":"2024-05-02T11:01:31Z","published":"2024-05-02T11:01:31Z","title":"Uncertainty-aware self-training with expectation maximization basis\n transformation","summary":" Self-training is a powerful approach to deep learning. The key process is to\nfind a pseudo-label for modeling. However, previous self-training algorithms\nsuffer from the over-confidence issue brought by the hard labels, even some\nconfidence-related regularizers cannot comprehensively catch the uncertainty.\nTherefore, we propose a new self-training framework to combine uncertainty\ninformation of both model and dataset. Specifically, we propose to use\nExpectation-Maximization (EM) to smooth the labels and comprehensively estimate\nthe uncertainty information. We further design a basis extraction network to\nestimate the initial basis from the dataset. The obtained basis with\nuncertainty can be filtered based on uncertainty information. It can then be\ntransformed into the real hard label to iteratively update the model and basis\nin the retraining process. Experiments on image classification and semantic\nsegmentation show the advantages of our methods among confidence-aware\nself-training algorithms with 1-3 percentage improvement on different datasets.\n","authors":["Zijia Wang","Wenbin Yang","Zhisong Liu","Zhen Jia"],"pdf_url":"https://arxiv.org/pdf/2405.01175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19398v2","updated":"2024-05-02T10:58:57Z","published":"2024-04-30T09:45:41Z","title":"3D Gaussian Blendshapes for Head Avatar Animation","summary":" We introduce 3D Gaussian blendshapes for modeling photorealistic head\navatars. Taking a monocular video as input, we learn a base head model of\nneutral expression, along with a group of expression blendshapes, each of which\ncorresponds to a basis expression in classical parametric face models. Both the\nneutral model and expression blendshapes are represented as 3D Gaussians, which\ncontain a few properties to depict the avatar appearance. The avatar model of\nan arbitrary expression can be effectively generated by combining the neutral\nmodel and expression blendshapes through linear blending of Gaussians with the\nexpression coefficients. High-fidelity head avatar animations can be\nsynthesized in real time using Gaussian splatting. 
Compared to state-of-the-art\nmethods, our Gaussian blendshape representation better captures high-frequency\ndetails exhibited in input video, and achieves superior rendering performance.\n","authors":["Shengjie Ma","Yanlin Weng","Tianjia Shao","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.19398v2.pdf","comment":"ACM SIGGRAPH Conference Proceedings 2024"},{"id":"http://arxiv.org/abs/2405.01170v1","updated":"2024-05-02T10:48:22Z","published":"2024-05-02T10:48:22Z","title":"GroupedMixer: An Entropy Model with Group-wise Token-Mixers for Learned\n Image Compression","summary":" Transformer-based entropy models have gained prominence in recent years due\nto their superior ability to capture long-range dependencies in probability\ndistribution estimation compared to convolution-based methods. However,\nprevious transformer-based entropy models suffer from a sluggish coding process\ndue to pixel-wise autoregression or duplicated computation during inference. In\nthis paper, we propose a novel transformer-based entropy model called\nGroupedMixer, which enjoys both faster coding speed and better compression\nperformance than previous transformer-based methods. Specifically, our approach\nbuilds upon group-wise autoregression by first partitioning the latent\nvariables into groups along spatial-channel dimensions, and then entropy coding\nthe groups with the proposed transformer-based entropy model. The global causal\nself-attention is decomposed into more efficient group-wise interactions,\nimplemented using inner-group and cross-group token-mixers. The inner-group\ntoken-mixer incorporates contextual elements within a group while the\ncross-group token-mixer interacts with previously decoded groups. Alternate\narrangement of two token-mixers enables global contextual reference. To further\nexpedite the network inference, we introduce context cache optimization to\nGroupedMixer, which caches attention activation values in cross-group\ntoken-mixers and avoids complex and duplicated computation. Experimental\nresults demonstrate that the proposed GroupedMixer yields the state-of-the-art\nrate-distortion performance with fast compression speed.\n","authors":["Daxin Li","Yuanchao Bai","Kai Wang","Junjun Jiang","Xianming Liu","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2405.01170v1.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2405.01156v1","updated":"2024-05-02T10:18:22Z","published":"2024-05-02T10:18:22Z","title":"Self-Supervised Learning for Interventional Image Analytics: Towards\n Robust Device Trackers","summary":" An accurate detection and tracking of devices such as guiding catheters in\nlive X-ray image acquisitions is an essential prerequisite for endovascular\ncardiac interventions. This information is leveraged for procedural guidance,\ne.g., directing stent placements. To ensure procedural safety and efficacy,\nthere is a need for high robustness, i.e., no failures during tracking. To achieve\nthat, one needs to efficiently tackle challenges, such as: device obscuration\nby contrast agent or other external devices or wires, changes in field-of-view\nor acquisition angle, as well as the continuous movement due to cardiac and\nrespiratory motion. To overcome the aforementioned challenges, we propose a\nnovel approach to learn spatio-temporal features from a very large data cohort\nof over 16 million interventional X-ray frames using self-supervision for image\nsequence data. 
Our approach is based on a masked image modeling technique that\nleverages frame interpolation based reconstruction to learn fine inter-frame\ntemporal correspondences. The features encoded in the resulting model are\nfine-tuned downstream. Our approach achieves state-of-the-art performance and\nin particular robustness compared to ultra optimized reference solutions (that\nuse multi-stage feature fusion, multi-task and flow regularization). The\nexperiments show that our method achieves 66.31% reduction in maximum tracking\nerror against reference solutions (23.20% when flow regularization is used);\nachieving a success score of 97.95% at a 3x faster inference speed of 42\nframes-per-second (on GPU). The results encourage the use of our approach in\nvarious other tasks within interventional image analytics that require\neffective understanding of spatio-temporal semantics.\n","authors":["Saahil Islam","Venkatesh N. Murthy","Dominik Neumann","Badhan Kumar Das","Puneet Sharma","Andreas Maier","Dorin Comaniciu","Florin C. Ghesu"],"pdf_url":"https://arxiv.org/pdf/2405.01156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01130v1","updated":"2024-05-02T09:44:13Z","published":"2024-05-02T09:44:13Z","title":"Automated Virtual Product Placement and Assessment in Images using\n Diffusion Models","summary":" In Virtual Product Placement (VPP) applications, the discrete integration of\nspecific brand products into images or videos has emerged as a challenging yet\nimportant task. This paper introduces a novel three-stage fully automated VPP\nsystem. In the first stage, a language-guided image segmentation model\nidentifies optimal regions within images for product inpainting. In the second\nstage, Stable Diffusion (SD), fine-tuned with a few example product images, is\nused to inpaint the product into the previously identified candidate regions.\nThe final stage introduces an \"Alignment Module\", which is designed to\neffectively sieve out low-quality images. Comprehensive experiments demonstrate\nthat the Alignment Module ensures the presence of the intended product in every\ngenerated image and enhances the average quality of images by 35%. The results\npresented in this paper demonstrate the effectiveness of the proposed VPP\nsystem, which holds significant potential for transforming the landscape of\nvirtual advertising and marketing strategies.\n","authors":["Mohammad Mahmudul Alam","Negin Sokhandan","Emmett Goodman"],"pdf_url":"https://arxiv.org/pdf/2405.01130v1.pdf","comment":"Accepted at the 6th AI for Content Creation (AI4CC) workshop at CVPR\n 2024"},{"id":"http://arxiv.org/abs/2405.01126v1","updated":"2024-05-02T09:41:31Z","published":"2024-05-02T09:41:31Z","title":"Detecting and clustering swallow events in esophageal long-term\n high-resolution manometry","summary":" High-resolution manometry (HRM) is the gold standard in diagnosing esophageal\nmotility disorders. As HRM is typically conducted under short-term laboratory\nsettings, intermittently occurring disorders are likely to be missed.\nTherefore, long-term (up to 24h) HRM (LTHRM) is used to gain detailed insights\ninto the swallowing behavior. However, analyzing the extensive data from LTHRM\nis challenging and time consuming as medical experts have to analyze the data\nmanually, which is slow and prone to errors. 
To address this challenge, we\npropose a Deep Learning based swallowing detection method to accurately\nidentify swallowing events and secondary non-deglutitive-induced esophageal\nmotility disorders in LTHRM data. We then proceed with clustering the\nidentified swallows into distinct classes, which are analyzed by highly\nexperienced clinicians to validate the different swallowing patterns. We\nevaluate our computational pipeline on a total of 25 LTHRMs, which were\nmeticulously annotated by medical experts. By detecting more than 94% of all\nrelevant swallow events and providing all relevant clusters for a more reliable\ndiagnostic process among experienced clinicians, we are able to demonstrate the\neffectiveness as well as positive clinical impact of our approach to make LTHRM\nfeasible in clinical care.\n","authors":["Alexander Geiger","Lars Wagner","Daniel Rueckert","Dirk Wilhelm","Alissa Jell"],"pdf_url":"https://arxiv.org/pdf/2405.01126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01124v1","updated":"2024-05-02T09:38:07Z","published":"2024-05-02T09:38:07Z","title":"Investigating Self-Supervised Image Denoising with Denaturation","summary":" Self-supervised learning for image denoising problems in the presence of\ndenaturation for noisy data is a crucial approach in machine learning. However,\ntheoretical understanding of the performance of the approach that uses\ndenatured data is lacking. To provide better understanding of the approach, in\nthis paper, we analyze a self-supervised denoising algorithm that uses\ndenatured data in depth through theoretical analysis and numerical experiments.\nThrough the theoretical analysis, we discuss that the algorithm finds desired\nsolutions to the optimization problem with the population risk, while the\nguarantee for the empirical risk depends on the hardness of the denoising task\nin terms of denaturation levels. We also conduct several experiments to\ninvestigate the performance of an extended algorithm in practice. The results\nindicate that the algorithm training with denatured images works, and the\nempirical performance aligns with the theoretical results. These results\nsuggest several insights for further improvement of self-supervised image\ndenoising that uses denatured data in future directions.\n","authors":["Hiroki Waida","Kimihiro Yamazaki","Atsushi Tokuhisa","Mutsuyo Wada","Yuichiro Wada"],"pdf_url":"https://arxiv.org/pdf/2405.01124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02067v3","updated":"2024-05-02T09:22:37Z","published":"2023-10-03T14:09:27Z","title":"Content Bias in Deep Learning Image Age Approximation: A new Approach\n Towards better Explainability","summary":" In the context of temporal image forensics, it is not evident that a neural\nnetwork, trained on images from different time-slots (classes), exploits solely\nimage age related features. Usually, images taken in close temporal proximity\n(e.g., belonging to the same age class) share some common content properties.\nSuch content bias can be exploited by a neural network. In this work, a novel\napproach is proposed that evaluates the influence of image content. This\napproach is verified using synthetic images (where content bias can be ruled\nout) with an age signal embedded. Based on the proposed approach, it is shown\nthat a deep learning approach proposed in the context of age classification is\nmost likely highly dependent on the image content. 
As a possible\ncountermeasure, two different models from the field of image steganalysis,\nalong with three different preprocessing techniques to increase the\nsignal-to-noise ratio (age signal to image content), are evaluated using the\nproposed method.\n","authors":["Robert Jöchl","Andreas Uhl"],"pdf_url":"https://arxiv.org/pdf/2310.02067v3.pdf","comment":"This is a preprint, the paper is currently under consideration at\n Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2405.01113v1","updated":"2024-05-02T09:21:10Z","published":"2024-05-02T09:21:10Z","title":"Domain-Transferred Synthetic Data Generation for Improving Monocular\n Depth Estimation","summary":" A major obstacle to the development of effective monocular depth estimation\nalgorithms is the difficulty in obtaining high-quality depth data that\ncorresponds to collected RGB images. Collecting this data is time-consuming and\ncostly, and even data collected by modern sensors has limited range or\nresolution, and is subject to inconsistencies and noise. To combat this, we\npropose a method of data generation in simulation using 3D synthetic\nenvironments and CycleGAN domain transfer. We compare this method of data\ngeneration to the popular NYUDepth V2 dataset by training a depth estimation\nmodel based on the DenseDepth structure using different training sets of real\nand simulated data. We evaluate the performance of the models on newly\ncollected images and LiDAR depth data from a Husky robot to verify the\ngeneralizability of the approach and show that GAN-transformed data can serve\nas an effective alternative to real-world data, particularly in depth\nestimation.\n","authors":["Seungyeop Lee","Knut Peterson","Solmaz Arezoomandan","Bill Cai","Peihan Li","Lifeng Zhou","David Han"],"pdf_url":"https://arxiv.org/pdf/2405.01113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01112v1","updated":"2024-05-02T09:19:43Z","published":"2024-05-02T09:19:43Z","title":"Sports Analysis and VR Viewing System Based on Player Tracking and Pose\n Estimation with Multimodal and Multiview Sensors","summary":" Sports analysis and viewing play a pivotal role in the current sports domain,\noffering significant value not only to coaches and athletes but also to fans\nand the media. In recent years, the rapid development of virtual reality (VR)\nand augmented reality (AR) technologies have introduced a new platform for\nwatching games. Visualization of sports competitions in VR/AR represents a\nrevolutionary technology, providing audiences with a novel immersive viewing\nexperience. However, there is still a lack of related research in this area. In\nthis work, we present for the first time a comprehensive system for sports\ncompetition analysis and real-time visualization on VR/AR platforms. First, we\nutilize multiview LiDARs and cameras to collect multimodal game data.\nSubsequently, we propose a framework for multi-player tracking and pose\nestimation based on a limited amount of supervised data, which extracts precise\nplayer positions and movements from point clouds and images. Moreover, we\nperform avatar modeling of players to obtain their 3D models. Ultimately, using\nthese 3D player data, we conduct competition analysis and real-time\nvisualization on VR/AR. Extensive quantitative experiments demonstrate the\naccuracy and robustness of our multi-player tracking and pose estimation\nframework. 
The visualization results showcase the immense potential of our\nsports visualization system on the domain of watching games on VR/AR devices.\nThe multimodal competition dataset we collected and all related code will be\nreleased soon.\n","authors":["Wenxuan Guo","Zhiyu Pan","Ziheng Xi","Alapati Tuerxun","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.01112v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.06409"},{"id":"http://arxiv.org/abs/2405.01108v1","updated":"2024-05-02T09:14:59Z","published":"2024-05-02T09:14:59Z","title":"Federated Learning with Heterogeneous Data Handling for Robust Vehicular\n Object Detection","summary":" In the pursuit of refining precise perception models for fully autonomous\ndriving, continual online model training becomes essential. Federated Learning\n(FL) within vehicular networks offers an efficient mechanism for model training\nwhile preserving raw sensory data integrity. Yet, FL struggles with\nnon-identically distributed data (e.g., quantity skew), leading to suboptimal\nconvergence rates during model training. In previous work, we introduced FedLA,\nan innovative Label-Aware aggregation method addressing data heterogeneity in\nFL for generic scenarios.\n In this paper, we introduce FedProx+LA, a novel FL method building upon the\nstate-of-the-art FedProx and FedLA to tackle data heterogeneity, which is\nspecifically tailored for vehicular networks. We evaluate the efficacy of\nFedProx+LA in continuous online object detection model training. Through a\ncomparative analysis against conventional and state-of-the-art methods, our\nfindings reveal the superior convergence rate of FedProx+LA. Notably, if the\nlabel distribution is very heterogeneous, our FedProx+LA approach shows\nsubstantial improvements in detection performance compared to baseline methods,\nalso outperforming our previous FedLA approach. Moreover, both FedLA and\nFedProx+LA increase convergence speed by 30% compared to baseline methods.\n","authors":["Ahmad Khalil","Tizian Dege","Pegah Golchin","Rostyslav Olshevskyi","Antonio Fernandez Anta","Tobias Meuser"],"pdf_url":"https://arxiv.org/pdf/2405.01108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01105v1","updated":"2024-05-02T09:14:38Z","published":"2024-05-02T09:14:38Z","title":"Image segmentation of treated and untreated tumor spheroids by Fully\n Convolutional Networks","summary":" Multicellular tumor spheroids (MCTS) are advanced cell culture systems for\nassessing the impact of combinatorial radio(chemo)therapy. They exhibit\ntherapeutically relevant in-vivo-like characteristics from 3D cell-cell and\ncell-matrix interactions to radial pathophysiological gradients related to\nproliferative activity and nutrient/oxygen supply, altering cellular\nradioresponse. State-of-the-art assays quantify long-term curative endpoints\nbased on collected brightfield image time series from large treated spheroid\npopulations per irradiation dose and treatment arm. Here, spheroid control\nprobabilities are documented analogous to in-vivo tumor control probabilities\nbased on Kaplan-Meier curves. These analyses require laborious spheroid\nsegmentation of up to 100,000 images per treatment arm to extract relevant\nstructural information from the images, e.g., diameter, area, volume and\ncircularity. While several image analysis algorithms are available for spheroid\nsegmentation, they all focus on compact MCTS with clearly distinguishable outer\nrim throughout growth. 
However, treated MCTS may partly be detached and\ndestroyed and are usually obscured by dead cell debris. We successfully train\ntwo Fully Convolutional Networks, UNet and HRNet, and optimize their\nhyperparameters to develop an automatic segmentation for both untreated and\ntreated MCTS. We systematically validate the automatic segmentation on larger,\nindependent data sets of spheroids derived from two human head-and-neck cancer\ncell lines. We find an excellent overlap between manual and automatic\nsegmentation for most images, quantified by Jaccard indices at around 90%. For\nimages with smaller overlap of the segmentations, we demonstrate that this\nerror is comparable to the variations across segmentations from different\nbiological experts, suggesting that these images represent biologically unclear\nor ambiguous cases.\n","authors":["Matthias Streller","Soňa Michlíková","Willy Ciecior","Katharina Lönnecke","Leoni A. Kunz-Schughart","Steffen Lange","Anja Voss-Böhme"],"pdf_url":"https://arxiv.org/pdf/2405.01105v1.pdf","comment":"28 pages, 21 figures"},{"id":"http://arxiv.org/abs/2405.01101v1","updated":"2024-05-02T09:09:48Z","published":"2024-05-02T09:09:48Z","title":"Enhancing Person Re-Identification via Uncertainty Feature Fusion and\n Wise Distance Aggregation","summary":" The quest for robust Person re-identification (Re-ID) systems capable of\naccurately identifying subjects across diverse scenarios remains a formidable\nchallenge in surveillance and security applications. This study presents a\nnovel methodology that significantly enhances Person Re-Identification (Re-ID)\nby integrating Uncertainty Feature Fusion (UFFM) with Wise Distance Aggregation\n(WDA). Tested on benchmark datasets - Market-1501, DukeMTMC-ReID, and MSMT17 -\nour approach demonstrates substantial improvements in Rank-1 accuracy and mean\nAverage Precision (mAP). Specifically, UFFM capitalizes on the power of feature\nsynthesis from multiple images to overcome the limitations imposed by the\nvariability of subject appearances across different views. WDA further refines\nthe process by intelligently aggregating similarity metrics, thereby enhancing\nthe system's ability to discern subtle but critical differences between\nsubjects. The empirical results affirm the superiority of our method over\nexisting approaches, achieving new performance benchmarks across all evaluated\ndatasets. Code is available on Github.\n","authors":["Quang-Huy Che","Le-Chuong Nguyen","Vinh-Tiep Nguyen"],"pdf_url":"https://arxiv.org/pdf/2405.01101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07741v2","updated":"2024-05-02T09:00:21Z","published":"2024-03-12T15:19:25Z","title":"Uncertainty Quantification with Deep Ensembles for 6D Object Pose\n Estimation","summary":" The estimation of 6D object poses is a fundamental task in many computer\nvision applications. Particularly, in high risk scenarios such as human-robot\ninteraction, industrial inspection, and automation, reliable pose estimates are\ncrucial. In the last years, increasingly accurate and robust\ndeep-learning-based approaches for 6D object pose estimation have been\nproposed. Many top-performing methods are not end-to-end trainable but consist\nof multiple stages. In the context of deep uncertainty quantification, deep\nensembles are considered as state of the art since they have been proven to\nproduce well-calibrated and robust uncertainty estimates. However, deep\nensembles can only be applied to methods that can be trained end-to-end. 
In\nthis work, we propose a method to quantify the uncertainty of multi-stage 6D\nobject pose estimation approaches with deep ensembles. For the implementation,\nwe choose SurfEmb as representative, since it is one of the top-performing 6D\nobject pose estimation approaches in the BOP Challenge 2022. We apply\nestablished metrics and concepts for deep uncertainty quantification to\nevaluate the results. Furthermore, we propose a novel uncertainty calibration\nscore for regression tasks to quantify the quality of the estimated\nuncertainty.\n","authors":["Kira Wursthorn","Markus Hillemann","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2403.07741v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.16071v2","updated":"2024-05-02T08:53:35Z","published":"2024-03-24T09:18:21Z","title":"Landmark-Guided Cross-Speaker Lip Reading with Mutual Information\n Regularization","summary":" Lip reading, the process of interpreting silent speech from visual lip\nmovements, has gained rising attention for its wide range of realistic\napplications. Deep learning approaches greatly improve current lip reading\nsystems. However, lip reading in cross-speaker scenarios where the speaker\nidentity changes, poses a challenging problem due to inter-speaker variability.\nA well-trained lip reading system may perform poorly when handling a brand new\nspeaker. To learn a speaker-robust lip reading model, a key insight is to\nreduce visual variations across speakers, avoiding the model overfitting to\nspecific speakers. In this work, in view of both input visual clues and latent\nrepresentations based on a hybrid CTC/attention architecture, we propose to\nexploit the lip landmark-guided fine-grained visual clues instead of\nfrequently-used mouth-cropped images as input features, diminishing\nspeaker-specific appearance characteristics. Furthermore, a max-min mutual\ninformation regularization approach is proposed to capture speaker-insensitive\nlatent representations. Experimental evaluations on public lip reading datasets\ndemonstrate the effectiveness of the proposed approach under the intra-speaker\nand inter-speaker conditions.\n","authors":["Linzhi Wu","Xingyu Zhang","Yakun Zhang","Changyan Zheng","Tiejun Liu","Liang Xie","Ye Yan","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16071v2.pdf","comment":"To appear in LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2405.01095v1","updated":"2024-05-02T08:49:01Z","published":"2024-05-02T08:49:01Z","title":"Transformers Fusion across Disjoint Samples for Hyperspectral Image\n Classification","summary":" 3D Swin Transformer (3D-ST) known for its hierarchical attention and\nwindow-based processing, excels in capturing intricate spatial relationships\nwithin images. Spatial-spectral Transformer (SST), meanwhile, specializes in\nmodeling long-range dependencies through self-attention mechanisms. Therefore,\nthis paper introduces a novel method: an attentional fusion of these two\ntransformers to significantly enhance the classification performance of\nHyperspectral Images (HSIs). What sets this approach apart is its emphasis on\nthe integration of attentional mechanisms from both architectures. This\nintegration not only refines the modeling of spatial and spectral information\nbut also contributes to achieving more precise and accurate classification\nresults. The experimentation and evaluation of benchmark HSI datasets\nunderscore the importance of employing disjoint training, validation, and test\nsamples. 
The results demonstrate the effectiveness of the fusion approach,\nshowcasing its superiority over traditional methods and individual\ntransformers. Incorporating disjoint samples enhances the robustness and\nreliability of the proposed methodology, emphasizing its potential for\nadvancing hyperspectral image classification.\n","authors":["Muhammad Ahmad","Manuel Mazzara","Salvatore Distifano"],"pdf_url":"https://arxiv.org/pdf/2405.01095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17105v5","updated":"2024-05-02T08:44:23Z","published":"2023-09-29T10:06:28Z","title":"Continual Action Assessment via Task-Consistent Score-Discriminative\n Feature Distribution Modeling","summary":" Action Quality Assessment (AQA) is a task that tries to answer how well an\naction is carried out. While remarkable progress has been achieved, existing\nworks on AQA assume that all the training data are visible for training at one\ntime, but do not enable continual learning on assessing new technical actions.\nIn this work, we address such a Continual Learning problem in AQA\n(Continual-AQA), which urges a unified model to learn AQA tasks sequentially\nwithout forgetting. Our idea for modeling Continual-AQA is to sequentially\nlearn a task-consistent score-discriminative feature distribution, in which the\nlatent features express a strong correlation with the score labels regardless\nof the task or action types.From this perspective, we aim to mitigate the\nforgetting in Continual-AQA from two aspects. Firstly, to fuse the features of\nnew and previous data into a score-discriminative distribution, a novel\nFeature-Score Correlation-Aware Rehearsal is proposed to store and reuse data\nfrom previous tasks with limited memory size. Secondly, an Action\nGeneral-Specific Graph is developed to learn and decouple the action-general\nand action-specific knowledge so that the task-consistent score-discriminative\nfeatures can be better extracted across various tasks. Extensive experiments\nare conducted to evaluate the contributions of proposed components. The\ncomparisons with the existing continual learning methods additionally verify\nthe effectiveness and versatility of our approach. Data and code are available\nat https://github.com/iSEE-Laboratory/Continual-AQA.\n","authors":["Yuan-Ming Li","Ling-An Zeng","Jing-Ke Meng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.17105v5.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.01090v1","updated":"2024-05-02T08:43:16Z","published":"2024-05-02T08:43:16Z","title":"Learning Object States from Actions via Large Language Models","summary":" Temporally localizing the presence of object states in videos is crucial in\nunderstanding human activities beyond actions and objects. This task has\nsuffered from a lack of training data due to object states' inherent ambiguity\nand variety. To avoid exhaustive annotation, learning from transcribed\nnarrations in instructional videos would be intriguing. However, object states\nare less described in narrations compared to actions, making them less\neffective. In this work, we propose to extract the object state information\nfrom action information included in narrations, using large language models\n(LLMs). Our observation is that LLMs include world knowledge on the\nrelationship between actions and their resulting object states, and can infer\nthe presence of object states from past action sequences. 
The proposed\nLLM-based framework offers flexibility to generate plausible pseudo-object\nstate labels against arbitrary categories. We evaluate our method with our\nnewly collected Multiple Object States Transition (MOST) dataset including\ndense temporal annotation of 60 object state categories. Our model trained by\nthe generated pseudo-labels demonstrates significant improvement of over 29% in\nmAP against strong zero-shot vision-language models, showing the effectiveness\nof explicitly extracting object state information from actions through LLMs.\n","authors":["Masatoshi Tateno","Takuma Yagi","Ryosuke Furuta","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2405.01090v1.pdf","comment":"19 pages of main content, 24 pages of supplementary material"},{"id":"http://arxiv.org/abs/2403.20183v2","updated":"2024-05-02T08:39:34Z","published":"2024-03-29T13:57:46Z","title":"HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on\n Bidirectional Selective SSM","summary":" Wearable sensor-based human activity recognition (HAR) is a critical research\ndomain in activity perception. However, achieving high efficiency and long\nsequence recognition remains a challenge. Despite the extensive investigation\nof temporal deep learning models, such as CNNs, RNNs, and transformers, their\nextensive parameters often pose significant computational and memory\nconstraints, rendering them less suitable for resource-constrained mobile\nhealth applications. This study introduces HARMamba, an innovative light-weight\nand versatile HAR architecture that combines selective bidirectional SSM and\nhardware-aware design. To optimize real-time resource consumption in practical\nscenarios, HARMamba employs linear recursive mechanisms and parameter\ndiscretization, allowing it to selectively focus on relevant input sequences\nwhile efficiently fusing scan and recompute operations. To address potential\nissues with invalid sensor data, the system processes the data stream through\nindependent channels, dividing each channel into \"patches\" and appending\nclassification token to the end of the sequence. Position embeddings are\nincorporated to represent the sequence order, and the activity categories are\noutput through a classification header. The HARMamba Block serves as the\nfundamental component of the HARMamba architecture, enabling the effective\ncapture of more discriminative activity sequence features. HARMamba outperforms\ncontemporary state-of-the-art frameworks, delivering comparable or better\naccuracy with significantly reducing computational and memory demands. It's\neffectiveness has been extensively validated on public datasets like PAMAP2,\nWISDM, UNIMIB SHAR and UCI, showcasing impressive results.\n","authors":["Shuangjian Li","Tao Zhu","Furong Duan","Liming Chen","Huansheng Ning","Christopher Nugent","Yaping Wan"],"pdf_url":"https://arxiv.org/pdf/2403.20183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01088v1","updated":"2024-05-02T08:33:43Z","published":"2024-05-02T08:33:43Z","title":"Type2Branch: Keystroke Biometrics based on a Dual-branch Architecture\n with Attention Mechanisms and Set2set Loss","summary":" In 2021, the pioneering work on TypeNet showed that keystroke dynamics\nverification could scale to hundreds of thousands of users with minimal\nperformance degradation. 
Recently, the KVC-onGoing competition has provided an\nopen and robust experimental protocol for evaluating keystroke dynamics\nverification systems of such scale, including considerations of algorithmic\nfairness. This article describes Type2Branch, the model and techniques that\nachieved the lowest error rates at the KVC-onGoing, in both desktop and mobile\nscenarios. The novelty aspects of the proposed Type2Branch include: i)\nsynthesized timing features emphasizing user behavior deviation from the\ngeneral population, ii) a dual-branch architecture combining recurrent and\nconvolutional paths with various attention mechanisms, iii) a new loss function\nnamed Set2set that captures the global structure of the embedding space, and\niv) a training curriculum of increasing difficulty. Considering five enrollment\nsamples per subject of approximately 50 characters typed, the proposed\nType2Branch achieves state-of-the-art performance with mean per-subject EERs of\n0.77% and 1.03% on evaluation sets of respectively 15,000 and 5,000 subjects\nfor desktop and mobile scenarios. With a uniform global threshold for all\nsubjects, the EERs are 3.25% for desktop and 3.61% for mobile, outperforming\nprevious approaches by a significant margin.\n","authors":["Nahuel González","Giuseppe Stragapede","Rubén Vera-Rodriguez","Rubén Tolosana"],"pdf_url":"https://arxiv.org/pdf/2405.01088v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.01085v1","updated":"2024-05-02T08:29:05Z","published":"2024-05-02T08:29:05Z","title":"Single Image Super-Resolution Based on Global-Local Information Synergy","summary":" Although several image super-resolution solutions exist, they still face many\nchallenges. CNN-based algorithms, despite the reduction in computational\ncomplexity, still need to improve their accuracy. While Transformer-based\nalgorithms have higher accuracy, their ultra-high computational complexity\nmakes them difficult to be accepted in practical applications. To overcome the\nexisting challenges, a novel super-resolution reconstruction algorithm is\nproposed in this paper. The algorithm achieves a significant increase in\naccuracy through a unique design while maintaining a low complexity. The core\nof the algorithm lies in its cleverly designed Global-Local Information\nExtraction Module and Basic Block Module. By combining global and local\ninformation, the Global-Local Information Extraction Module aims to understand\nthe image content more comprehensively so as to recover the global structure\nand local details in the image more accurately, which provides rich information\nsupport for the subsequent reconstruction process. Experimental results show\nthat the comprehensive performance of the algorithm proposed in this paper is\noptimal, providing an efficient and practical new solution in the field of\nsuper-resolution reconstruction.\n","authors":["Nianzu Qiao","Lamei Di","Changyin Sun"],"pdf_url":"https://arxiv.org/pdf/2405.01085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01083v1","updated":"2024-05-02T08:25:52Z","published":"2024-05-02T08:25:52Z","title":"MCMS: Multi-Category Information and Multi-Scale Stripe Attention for\n Blind Motion Deblurring","summary":" Deep learning-based motion deblurring techniques have advanced significantly\nin recent years. This class of techniques, however, does not carefully examine\nthe inherent flaws in blurry images. For instance, low edge and structural\ninformation are traits of blurry images. 
The high-frequency component of blurry\nimages is edge information, and the low-frequency component is structure\ninformation. A blind motion deblurring network (MCMS) based on multi-category\ninformation and multi-scale stripe attention mechanism is proposed. Given the\nrespective characteristics of the high-frequency and low-frequency components,\na three-stage encoder-decoder model is designed. Specifically, the first stage\nfocuses on extracting the features of the high-frequency component, the second\nstage concentrates on extracting the features of the low-frequency component,\nand the third stage integrates the extracted low-frequency component features,\nthe extracted high-frequency component features, and the original blurred image\nin order to recover the final clear image. As a result, the model effectively\nimproves motion deblurring by fusing the edge information of the high-frequency\ncomponent and the structural information of the low-frequency component. In\naddition, a grouped feature fusion technique is developed so as to achieve\nricher, more three-dimensional and comprehensive utilization of various types\nof features at a deep level. Next, a multi-scale stripe attention mechanism\n(MSSA) is designed, which effectively combines the anisotropy and multi-scale\ninformation of the image, a move that significantly enhances the capability of\nthe deep model in feature representation. Large-scale comparative studies on\nvarious datasets show that the strategy in this paper works better than the\nrecently published measures.\n","authors":["Nianzu Qiao","Lamei Di","Changyin Sun"],"pdf_url":"https://arxiv.org/pdf/2405.01083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02972v2","updated":"2024-05-02T08:15:07Z","published":"2024-02-05T12:50:30Z","title":"Retrieval-Augmented Score Distillation for Text-to-3D Generation","summary":" Text-to-3D generation has achieved significant success by incorporating\npowerful 2D diffusion models, but insufficient 3D prior knowledge also leads to\nthe inconsistency of 3D geometry. Recently, since large-scale multi-view\ndatasets have been released, fine-tuning the diffusion model on the multi-view\ndatasets becomes a mainstream to solve the 3D inconsistency problem. However,\nit has confronted with fundamental difficulties regarding the limited quality\nand diversity of 3D data, compared with 2D data. To sidestep these trade-offs,\nwe explore a retrieval-augmented approach tailored for score distillation,\ndubbed ReDream. We postulate that both expressiveness of 2D diffusion models\nand geometric consistency of 3D assets can be fully leveraged by employing the\nsemantically relevant assets directly within the optimization process. To this\nend, we introduce novel framework for retrieval-based quality enhancement in\ntext-to-3D generation. We leverage the retrieved asset to incorporate its\ngeometric prior in the variational objective and adapt the diffusion model's 2D\nprior toward view consistency, achieving drastic improvements in both geometry\nand fidelity of generated scenes. We conduct extensive experiments to\ndemonstrate that ReDream exhibits superior quality with increased geometric\nconsistency. 
Project page is available at https://ku-cvlab.github.io/ReDream/.\n","authors":["Junyoung Seo","Susung Hong","Wooseok Jang","Inès Hyeonsu Kim","Minseop Kwak","Doyup Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2402.02972v2.pdf","comment":"Accepted to ICML 2024 / Project Page:\n https://ku-cvlab.github.io/ReDream/"},{"id":"http://arxiv.org/abs/2405.01073v1","updated":"2024-05-02T08:06:10Z","published":"2024-05-02T08:06:10Z","title":"Poisoning Attacks on Federated Learning for Autonomous Driving","summary":" Federated Learning (FL) is a decentralized learning paradigm, enabling\nparties to collaboratively train models while keeping their data confidential.\nWithin autonomous driving, it brings the potential of reducing data storage\ncosts, reducing bandwidth requirements, and to accelerate the learning. FL is,\nhowever, susceptible to poisoning attacks. In this paper, we introduce two\nnovel poisoning attacks on FL tailored to regression tasks within autonomous\ndriving: FLStealth and Off-Track Attack (OTA). FLStealth, an untargeted attack,\naims at providing model updates that deteriorate the global model performance\nwhile appearing benign. OTA, on the other hand, is a targeted attack with the\nobjective to change the global model's behavior when exposed to a certain\ntrigger. We demonstrate the effectiveness of our attacks by conducting\ncomprehensive experiments pertaining to the task of vehicle trajectory\nprediction. In particular, we show that, among five different untargeted\nattacks, FLStealth is the most successful at bypassing the considered defenses\nemployed by the server. For OTA, we demonstrate the inability of common defense\nstrategies to mitigate the attack, highlighting the critical need for new\ndefensive mechanisms against targeted attacks within FL for autonomous driving.\n","authors":["Sonakshi Garg","Hugo Jönsson","Gustav Kalander","Axel Nilsson","Bhhaanu Pirange","Viktor Valadi","Johan Östman"],"pdf_url":"https://arxiv.org/pdf/2405.01073v1.pdf","comment":"Accepted to SCAI2024"},{"id":"http://arxiv.org/abs/2405.01071v1","updated":"2024-05-02T08:03:18Z","published":"2024-05-02T08:03:18Z","title":"Callico: a Versatile Open-Source Document Image Annotation Platform","summary":" This paper presents Callico, a web-based open source platform designed to\nsimplify the annotation process in document recognition projects. The move\ntowards data-centric AI in machine learning and deep learning underscores the\nimportance of high-quality data, and the need for specialised tools that\nincrease the efficiency and effectiveness of generating such data. For document\nimage annotation, Callico offers dual-display annotation for digitised\ndocuments, enabling simultaneous visualisation and annotation of scanned images\nand text. This capability is critical for OCR and HTR model training, document\nlayout analysis, named entity recognition, form-based key value annotation or\nhierarchical structure annotation with element grouping. 
The platform supports\ncollaborative annotation with versatile features backed by a commitment to open\nsource development, high-quality code standards and easy deployment via Docker.\nIllustrative use cases - including the transcription of the Belfort municipal\nregisters, the indexing of French World War II prisoners for the ICRC, and the\nextraction of personal information from the Socface project's census lists -\ndemonstrate Callico's applicability and utility.\n","authors":["Christopher Kermorvant","Eva Bardou","Manon Blanco","Bastien Abadie"],"pdf_url":"https://arxiv.org/pdf/2405.01071v1.pdf","comment":"Accepted to ICDAR 2024"},{"id":"http://arxiv.org/abs/2401.00254v2","updated":"2024-05-02T07:50:39Z","published":"2023-12-30T14:53:09Z","title":"Morphing Tokens Draw Strong Masked Image Models","summary":" Masked image modeling (MIM) is a promising option for training Vision\nTransformers among various self-supervised learning (SSL) methods. The essence\nof MIM lies in token-wise masked token predictions, with targets tokenized from\nimages or generated by pre-trained models such as vision-language models. While\ntokenizers or pre-trained models are plausible MIM targets, they often offer\nspatially inconsistent targets even for neighboring tokens, complicating models\nto learn unified discriminative representations. Our pilot study confirms that\naddressing spatial inconsistencies has the potential to enhance representation\nquality. Motivated by the findings, we introduce a novel self-supervision\nsignal called Dynamic Token Morphing (DTM), which dynamically aggregates\ncontextually related tokens to yield contextualized targets. DTM is compatible\nwith various SSL frameworks; we showcase an improved MIM by employing DTM,\nbarely introducing extra training costs. Our experiments on ImageNet-1K and\nADE20K demonstrate the superiority of our methods compared with\nstate-of-the-art, complex MIM methods. Furthermore, the comparative evaluation\nof the iNaturalists and fine-grained visual classification datasets further\nvalidates the transferability of our method on various downstream tasks. Code\nis available at https://github.com/naver-ai/dtm\n","authors":["Taekyung Kim","Byeongho Heo","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2401.00254v2.pdf","comment":"27 pages, 17 tables, 6 figures"},{"id":"http://arxiv.org/abs/2405.01066v1","updated":"2024-05-02T07:47:49Z","published":"2024-05-02T07:47:49Z","title":"HandSSCA: 3D Hand Mesh Reconstruction with State Space Channel Attention\n from RGB images","summary":" Reconstructing a hand mesh from a single RGB image is a challenging task\nbecause hands are often occluded by objects. Most previous works attempted to\nintroduce more additional information and adopt attention mechanisms to improve\n3D reconstruction results, but it would increased computational complexity.\nThis observation prompts us to propose a new and concise architecture while\nimproving computational efficiency. In this work, we propose a simple and\neffective 3D hand mesh reconstruction network HandSSCA, which is the first to\nincorporate state space modeling into the field of hand pose estimation. In the\nnetwork, we have designed a novel state space channel attention module that\nextends the effective sensory field, extracts hand features in the spatial\ndimension, and enhances hand regional features in the channel dimension. This\ndesign helps to reconstruct a complete and detailed hand mesh. 
Extensive\nexperiments conducted on well-known datasets featuring challenging hand-object\nocclusions (such as FREIHAND, DEXYCB, and HO3D) demonstrate that our proposed\nHandSSCA achieves state-of-the-art performance while maintaining a minimal\nparameter count.\n","authors":["Zixun Jiao","Xihan Wang","Quanli Gao"],"pdf_url":"https://arxiv.org/pdf/2405.01066v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.01065v1","updated":"2024-05-02T07:44:11Z","published":"2024-05-02T07:44:11Z","title":"MFDS-Net: Multi-Scale Feature Depth-Supervised Network for Remote\n Sensing Change Detection with Global Semantic and Detail Information","summary":" Change detection as an interdisciplinary discipline in the field of computer\nvision and remote sensing at present has been receiving extensive attention and\nresearch. Due to the rapid development of society, the geographic information\ncaptured by remote sensing satellites is changing faster and more complex,\nwhich undoubtedly poses a higher challenge and highlights the value of change\ndetection tasks. We propose MFDS-Net: Multi-Scale Feature Depth-Supervised\nNetwork for Remote Sensing Change Detection with Global Semantic and Detail\nInformation (MFDS-Net) with the aim of achieving a more refined description of\nchanging buildings as well as geographic information, enhancing the\nlocalisation of changing targets and the acquisition of weak features. To\nachieve the research objectives, we use a modified ResNet_34 as backbone\nnetwork to perform feature extraction and DO-Conv as an alternative to\ntraditional convolution to better focus on the association between feature\ninformation and to obtain better training results. We propose the Global\nSemantic Enhancement Module (GSEM) to enhance the processing of high-level\nsemantic information from a global perspective. The Differential Feature\nIntegration Module (DFIM) is proposed to strengthen the fusion of different\ndepth feature information, achieving learning and extraction of differential\nfeatures. The entire network is trained and optimized using a deep supervision\nmechanism.\n The experimental outcomes of MFDS-Net surpass those of current mainstream\nchange detection networks. On the LEVIR dataset, it achieved an F1 score of\n91.589 and IoU of 84.483, on the WHU dataset, the scores were F1: 92.384 and\nIoU: 86.807, and on the GZ-CD dataset, the scores were F1: 86.377 and IoU:\n76.021. The code is available at https://github.com/AOZAKIiii/MFDS-Net\n","authors":["Zhenyang Huang","Zhaojin Fu","Song Jintao","Genji Yuan","Jinjiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.01065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01060v1","updated":"2024-05-02T07:34:12Z","published":"2024-05-02T07:34:12Z","title":"A text-based, generative deep learning model for soil reflectance\n spectrum simulation in the VIS-NIR (400-2499 nm) bands","summary":" Simulating soil reflectance spectra is invaluable for soil-plant radiative\nmodeling and training machine learning models, yet it is difficult as the\nintricate relationships between soil structure and its constituents. To address\nthis, a fully data-driven soil optics generative model (SOGM) for simulation of\nsoil reflectance spectra based on soil property inputs was developed. The model\nis trained on an extensive dataset comprising nearly 180,000 soil\nspectra-property pairs from 17 datasets. 
It generates soil reflectance spectra\nfrom text-based inputs describing soil properties and their values rather than\nonly numerical values and labels in binary vector format. The generative model\ncan simulate output spectra based on an incomplete set of input properties.\nSOGM is based on the denoising diffusion probabilistic model (DDPM). Two\nadditional sub-models were also built to complement the SOGM: a spectral\npadding model that can fill in the gaps for spectra shorter than the full\nvisible-near-infrared range (VIS-NIR; 400 to 2499 nm), and a wet soil spectra\nmodel that can estimate the effects of water content on soil reflectance\nspectra given the dry spectrum predicted by the SOGM. The SOGM was up-scaled by\ncoupling with the Helios 3D plant modeling software, which allowed for\ngeneration of synthetic aerial images of simulated soil and plant scenes. It\ncan also be easily integrated with soil-plant radiation models used for remote\nsensing research like PROSAIL. The testing results of the SOGM on new datasets\nthat were not included in model training proved that the model can generate\nreasonable soil reflectance spectra based on available property inputs. The\npresented models are openly accessible on:\nhttps://github.com/GEMINI-Breeding/SOGM_soil_spectra_simulation.\n","authors":["Tong Lei","Brian N. Bailey"],"pdf_url":"https://arxiv.org/pdf/2405.01060v1.pdf","comment":"The paper has been submitted to Remote sensing of Environment and\n revised"},{"id":"http://arxiv.org/abs/2405.01054v1","updated":"2024-05-02T07:21:12Z","published":"2024-05-02T07:21:12Z","title":"Continual Learning for Robust Gate Detection under Dynamic Lighting in\n Autonomous Drone Racing","summary":" In autonomous and mobile robotics, a principal challenge is resilient\nreal-time environmental perception, particularly in situations characterized by\nunknown and dynamic elements, as exemplified in the context of autonomous drone\nracing. This study introduces a perception technique for detecting drone racing\ngates under illumination variations, which are common during high-speed drone\nflights. The proposed technique relies upon a lightweight neural network\nbackbone augmented with capabilities for continual learning. The envisaged\napproach amalgamates predictions of the gates' positional coordinates,\ndistance, and orientation, encapsulating them into a cohesive pose tuple. A\ncomprehensive number of tests serve to underscore the efficacy of this approach\nin confronting diverse and challenging scenarios, specifically those involving\nvariable lighting conditions. The proposed methodology exhibits notable\nrobustness in the face of illumination variations, thereby substantiating its\neffectiveness.\n","authors":["Zhongzheng Qiao","Xuan Huy Pham","Savitha Ramasamy","Xudong Jiang","Erdal Kayacan","Andriy Sarabakha"],"pdf_url":"https://arxiv.org/pdf/2405.01054v1.pdf","comment":"8 pages, 6 figures, in 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2402.09466v2","updated":"2024-05-02T07:17:50Z","published":"2024-02-09T13:59:14Z","title":"Few-Shot Learning with Uncertainty-based Quadruplet Selection for\n Interference Classification in GNSS Data","summary":" Jamming devices pose a significant threat by disrupting signals from the\nglobal navigation satellite system (GNSS), compromising the robustness of\naccurate positioning. Detecting anomalies in frequency snapshots is crucial to\ncounteract these interferences effectively. 
The ability to adapt to diverse,\nunseen interference characteristics is essential for ensuring the reliability\nof GNSS in real-world applications. In this paper, we propose a few-shot\nlearning (FSL) approach to adapt to new interference classes. Our method\nemploys quadruplet selection for the model to learn representations using\nvarious positive and negative interference classes. Furthermore, our quadruplet\nvariant selects pairs based on the aleatoric and epistemic uncertainty to\ndifferentiate between similar classes. We recorded a dataset at a motorway with\neight interference classes on which our FSL method with quadruplet loss\noutperforms other FSL techniques in jammer classification accuracy with 97.66%.\nDataset available at:\nhttps://gitlab.cc-asp.fraunhofer.de/darcy_gnss/FIOT_highway\n","authors":["Felix Ott","Lucas Heublein","Nisha Lakshmana Raichur","Tobias Feigl","Jonathan Hansen","Alexander Rügamer","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2402.09466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10543v5","updated":"2024-05-02T07:15:14Z","published":"2023-11-17T14:10:55Z","title":"Joint covariance properties under geometric image transformations for\n spatio-temporal receptive fields according to the generalized Gaussian\n derivative model for visual receptive fields","summary":" The influence of natural image transformations on receptive field responses\nis crucial for modelling visual operations in computer vision and biological\nvision. In this regard, covariance properties with respect to geometric image\ntransformations in the earliest layers of the visual hierarchy are essential\nfor expressing robust image operations, and for formulating invariant visual\noperations at higher levels.\n This paper defines and proves a set of joint covariance properties under\ncompositions of spatial scaling transformations, spatial affine\ntransformations, Galilean transformations and temporal scaling transformations,\nwhich make it possible to characterize how different types of image\ntransformations interact with each other and the associated spatio-temporal\nreceptive field responses. In this regard, we also extend the notion of\nscale-normalized derivatives to affine-normalized derivatives, to be able to\nobtain true affine-covariant properties of spatial derivatives, that are\ncomputed based on spatial smoothing with affine Gaussian kernels.\n The derived relations show how the parameters of the receptive fields need to\nbe transformed, in order to match the output from spatio-temporal receptive\nfields under composed spatio-temporal image transformations. As a side effect,\nthe presented proof for the joint covariance property over the integrated\ncombination of the different geometric image transformations also provides\nspecific proofs for the individual transformation properties, which have not\npreviously been fully reported in the literature.\n The paper also presents an in-depth theoretical analysis of geometric\ninterpretations of the derived covariance properties, as well as outlines a\nnumber of biological interpretations of these results.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.10543v5.pdf","comment":"38 pages, 13 figures. 
Note: From version 4, this paper considers a\n different form of joint composition of the geometric image transformations\n than in the earlier versions"},{"id":"http://arxiv.org/abs/2405.01040v1","updated":"2024-05-02T06:52:49Z","published":"2024-05-02T06:52:49Z","title":"Few Shot Class Incremental Learning using Vision-Language models","summary":" Recent advancements in deep learning have demonstrated remarkable performance\ncomparable to human capabilities across various supervised computer vision\ntasks. However, the prevalent assumption of having an extensive pool of\ntraining data encompassing all classes prior to model training often diverges\nfrom real-world scenarios, where limited data availability for novel classes is\nthe norm. The challenge emerges in seamlessly integrating new classes with few\nsamples into the training data, demanding the model to adeptly accommodate\nthese additions without compromising its performance on base classes. To\naddress this exigency, the research community has introduced several solutions\nunder the realm of few-shot class incremental learning (FSCIL).\n In this study, we introduce an innovative FSCIL framework that utilizes\nlanguage regularizer and subspace regularizer. During base training, the\nlanguage regularizer helps incorporate semantic information extracted from a\nVision-Language model. The subspace regularizer helps in facilitating the\nmodel's acquisition of nuanced connections between image and text semantics\ninherent to base classes during incremental training. Our proposed framework\nnot only empowers the model to embrace novel classes with limited data, but\nalso ensures the preservation of performance on base classes. To substantiate\nthe efficacy of our approach, we conduct comprehensive experiments on three\ndistinct FSCIL benchmarks, where our framework attains state-of-the-art\nperformance.\n","authors":["Anurag Kumar","Chinmay Bharti","Saikat Dutta","Srikrishna Karanam","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2405.01040v1.pdf","comment":"under review at Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2311.09253v2","updated":"2024-05-02T06:52:44Z","published":"2023-11-14T18:30:34Z","title":"The Perception-Robustness Tradeoff in Deterministic Image Restoration","summary":" We study the behavior of deterministic methods for solving inverse problems\nin imaging. These methods are commonly designed to achieve two goals: (1)\nattaining high perceptual quality, and (2) generating reconstructions that are\nconsistent with the measurements. We provide a rigorous proof that the better a\npredictor satisfies these two requirements, the larger its Lipschitz constant\nmust be, regardless of the nature of the degradation involved. In particular,\nto approach perfect perceptual quality and perfect consistency, the Lipschitz\nconstant of the model must grow to infinity. This implies that such methods are\nnecessarily more susceptible to adversarial attacks. We demonstrate our theory\non single image super-resolution algorithms, addressing both noisy and\nnoiseless settings. 
We also show how this undesired behavior can be leveraged\nto explore the posterior distribution, thereby allowing the deterministic model\nto imitate stochastic methods.\n","authors":["Guy Ohayon","Tomer Michaeli","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2311.09253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17728v2","updated":"2024-05-02T06:49:35Z","published":"2024-01-31T10:47:25Z","title":"COMET: Contrastive Mean Teacher for Online Source-Free Universal Domain\n Adaptation","summary":" In real-world applications, there is often a domain shift from training to\ntest data. This observation resulted in the development of test-time adaptation\n(TTA). It aims to adapt a pre-trained source model to the test data without\nrequiring access to the source data. Thereby, most existing works are limited\nto the closed-set assumption, i.e. there is no category shift between source\nand target domain. We argue that in a realistic open-world setting a category\nshift can appear in addition to a domain shift. This means, individual source\nclasses may not appear in the target domain anymore, samples of new classes may\nbe part of the target domain or even both at the same time. Moreover, in many\nreal-world scenarios the test data is not accessible all at once but arrives\nsequentially as a stream of batches demanding an immediate prediction. Hence,\nTTA must be applied in an online manner. To the best of our knowledge, the\ncombination of these aspects, i.e. online source-free universal domain\nadaptation (online SF-UniDA), has not been studied yet. In this paper, we\nintroduce a Contrastive Mean Teacher (COMET) tailored to this novel scenario.\nIt applies a contrastive loss to rebuild a feature space where the samples of\nknown classes build distinct clusters and the samples of new classes separate\nwell from them. It is complemented by an entropy loss which ensures that the\nclassifier output has a small entropy for samples of known classes and a large\nentropy for samples of new classes to be easily detected and rejected as\nunknown. To provide the losses with reliable pseudo labels, they are embedded\ninto a mean teacher (MT) framework. We evaluate our method across two datasets\nand all category shifts to set an initial benchmark for online SF-UniDA.\nThereby, COMET yields state-of-the-art performance and proves to be consistent\nand robust across a variety of different scenarios.\n","authors":["Pascal Schlachter","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2401.17728v2.pdf","comment":"Accepted at the International Joint Conference on Neural Networks\n (IJCNN) 2024"},{"id":"http://arxiv.org/abs/2405.01028v1","updated":"2024-05-02T06:00:09Z","published":"2024-05-02T06:00:09Z","title":"Technical Report of NICE Challenge at CVPR 2024: Caption Re-ranking\n Evaluation Using Ensembled CLIP and Consensus Scores","summary":" This report presents the ECO (Ensembled Clip score and cOnsensus score)\npipeline from team DSBA LAB, which is a new framework used to evaluate and rank\ncaptions for a given image. ECO selects the most accurate caption describing\nimage. It is made possible by combining an Ensembled CLIP score, which\nconsiders the semantic alignment between the image and captions, with a\nConsensus score that accounts for the essentialness of the captions. Using this\nframework, we achieved notable success in the CVPR 2024 Workshop Challenge on\nCaption Re-ranking Evaluation at the New Frontiers for Zero-Shot Image\nCaptioning Evaluation (NICE). 
Specifically, we secured third place based on the\nCIDEr metric, second in both the SPICE and METEOR metrics, and first in the\nROUGE-L and all BLEU Score metrics. The code and configuration for the ECO\nframework are available at https://github.com/ DSBA-Lab/ECO .\n","authors":["Kiyoon Jeong","Woojun Lee","Woongchan Nam","Minjeong Ma","Pilsung Kang"],"pdf_url":"https://arxiv.org/pdf/2405.01028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12900v4","updated":"2024-05-02T05:50:51Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Spiking\n Neurons","summary":" Event-based sensors, distinguished by their high temporal resolution of 1\n$\\mathrm{\\mu}\\text{s}$ and a dynamic range of 120 $\\text{dB}$, stand out as\nideal tools for deployment in fast-paced settings like vehicles and drones.\nTraditional object detection techniques that utilize Artificial Neural Networks\n(ANNs) face challenges due to the sparse and asynchronous nature of the events\nthese sensors capture. In contrast, Spiking Neural Networks (SNNs) offer a\npromising alternative, providing a temporal representation that is inherently\naligned with event-based data. This paper explores the unique membrane\npotential dynamics of SNNs and their ability to modulate sparse events. We\nintroduce an innovative spike-triggered adaptive threshold mechanism designed\nfor stable training. Building on these insights, we present a specialized\nspiking feature pyramid network (SpikeFPN) optimized for automotive event-based\nobject detection. Comprehensive evaluations demonstrate that SpikeFPN surpasses\nboth traditional SNNs and advanced ANNs enhanced with attention mechanisms.\nEvidently, SpikeFPN achieves a mean Average Precision (mAP) of 0.477 on the\nGEN1 Automotive Detection (GAD) benchmark dataset, marking significant\nincreases over the selected SNN baselines. Moreover, the efficient design of\nSpikeFPN ensures robust performance while optimizing computational resources,\nattributed to its innate sparse computation capabilities.\n","authors":["Hu Zhang","Yanchen Li","Luziwei Leng","Kaiwei Che","Qian Liu","Qinghai Guo","Jianxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13505v2","updated":"2024-05-02T05:45:45Z","published":"2024-02-21T03:39:04Z","title":"SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed\n Semi-Supervised Learning","summary":" Recent advancements in semi-supervised learning have focused on a more\nrealistic yet challenging task: addressing imbalances in labeled data while the\nclass distribution of unlabeled data remains both unknown and potentially\nmismatched. Current approaches in this sphere often presuppose rigid\nassumptions regarding the class distribution of unlabeled data, thereby\nlimiting the adaptability of models to only certain distribution ranges. In\nthis study, we propose a novel approach, introducing a highly adaptable\nframework, designated as SimPro, which does not rely on any predefined\nassumptions about the distribution of unlabeled data. Our framework, grounded\nin a probabilistic model, innovatively refines the expectation-maximization\n(EM) algorithm by explicitly decoupling the modeling of conditional and\nmarginal class distributions. This separation facilitates a closed-form\nsolution for class distribution estimation during the maximization phase,\nleading to the formulation of a Bayes classifier. 
The Bayes classifier, in\nturn, enhances the quality of pseudo-labels in the expectation phase.\nRemarkably, the SimPro framework not only comes with theoretical guarantees but\nalso is straightforward to implement. Moreover, we introduce two novel class\ndistributions broadening the scope of the evaluation. Our method showcases\nconsistent state-of-the-art performance across diverse benchmarks and data\ndistribution scenarios. Our code is available at\nhttps://github.com/LeapLabTHU/SimPro.\n","authors":["Chaoqun Du","Yizeng Han","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.13505v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2405.01016v1","updated":"2024-05-02T05:35:10Z","published":"2024-05-02T05:35:10Z","title":"Addressing Diverging Training Costs using Local Restoration for Precise\n Bird's Eye View Map Construction","summary":" Recent advancements in Bird's Eye View (BEV) fusion for map construction have\ndemonstrated remarkable mapping of urban environments. However, their deep and\nbulky architecture incurs substantial amounts of backpropagation memory and\ncomputing latency. Consequently, the problem poses an unavoidable bottleneck in\nconstructing high-resolution (HR) BEV maps, as their large-sized features cause\nsignificant increases in costs including GPU memory consumption and computing\nlatency, named diverging training costs issue. Affected by the problem, most\nexisting methods adopt low-resolution (LR) BEV and struggle to estimate the\nprecise locations of urban scene components like road lanes, and sidewalks. As\nthe imprecision leads to risky self-driving, the diverging training costs issue\nhas to be resolved. In this paper, we address the issue with our novel Trumpet\nNeural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs\nan up-sampled semantic BEV map to create a memory-efficient pipeline. To this\nend, we introduce Local Restoration of BEV representation. Specifically, the\nup-sampled BEV representation has severely aliased, blocky signals, and thick\nsemantic labels. Our proposed Local Restoration restores the signals and thins\n(or narrows down) the width of the labels. Our extensive experiments show that\nthe TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby\nenabling the effective estimation of real-sized (or precise) semantic labels\nfor BEV map construction.\n","authors":["Minsu Kim","Giseop Kim","Sunwook Choi"],"pdf_url":"https://arxiv.org/pdf/2405.01016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01012v1","updated":"2024-05-02T05:27:12Z","published":"2024-05-02T05:27:12Z","title":"Correcting Biased Centered Kernel Alignment Measures in Biological and\n Artificial Neural Networks","summary":" Centred Kernel Alignment (CKA) has recently emerged as a popular metric to\ncompare activations from biological and artificial neural networks (ANNs) in\norder to quantify the alignment between internal representations derived from\nstimuli sets (e.g. images, text, video) that are presented to both systems. In\nthis paper we highlight issues that the community should take into account if\nusing CKA as an alignment metric with neural data. Neural data are in the\nlow-data high-dimensionality domain, which is one of the cases where (biased)\nCKA results in high similarity scores even for pairs of random matrices. 
Using\nfMRI and MEG data from the THINGS project, we show that if biased CKA is\napplied to representations of different sizes in the low-data\nhigh-dimensionality domain, they are not directly comparable due to biased\nCKA's sensitivity to differing feature-sample ratios rather than stimuli-driven\nresponses. This situation can arise both when comparing a pre-selected area of\ninterest (e.g. ROI) to multiple ANN layers, as well as when determining to\nwhich ANN layer multiple regions of interest (ROIs) / sensor groups of\ndifferent dimensionality are most similar. We show that biased CKA can be\nartificially driven to its maximum value when using independent random data of\ndifferent sample-feature ratios. We further show that shuffling sample-feature\npairs of real neural data does not drastically alter biased CKA similarity in\ncomparison to unshuffled data, indicating an undesirable lack of sensitivity to\nstimuli-driven neural responses. Positive alignment of true stimuli-driven\nresponses is only achieved by using debiased CKA. Lastly, we report findings\nthat suggest biased CKA is sensitive to the inherent structure of neural data,\nonly differing from shuffled data when debiased CKA detects stimuli-driven\nalignment.\n","authors":["Alex Murphy","Joel Zylberberg","Alona Fyshe"],"pdf_url":"https://arxiv.org/pdf/2405.01012v1.pdf","comment":"ICLR 2024 Re-Align Workshop"},{"id":"http://arxiv.org/abs/2405.01008v1","updated":"2024-05-02T05:19:05Z","published":"2024-05-02T05:19:05Z","title":"On Mechanistic Knowledge Localization in Text-to-Image Generative Models","summary":" Identifying layers within text-to-image models which control visual\nattributes can facilitate efficient model editing through closed-form updates.\nRecent work leveraging causal tracing shows that early Stable-Diffusion\nvariants confine knowledge primarily to the first layer of the CLIP\ntext-encoder, while it diffuses throughout the UNet. Extending this framework,\nwe observe that for recent models (e.g., SD-XL, DeepFloyd), causal tracing\nfails to pinpoint localized knowledge, highlighting challenges in model\nediting. To address this issue, we introduce the concept of Mechanistic\nLocalization in text-to-image models, where knowledge about various visual\nattributes (e.g., ``style\", ``objects\", ``facts\") can be mechanistically\nlocalized to a small fraction of layers in the UNet, thus facilitating\nefficient model editing. We localize knowledge using our method LocoGen, which\nmeasures the direct effect of intermediate layers on output generation by\nperforming interventions in the cross-attention layers of the UNet. We then\nemploy LocoEdit, a fast closed-form editing method across popular open-source\ntext-to-image models (including the latest SD-XL) and explore the possibilities\nof neuron-level model editing. Using Mechanistic Localization, our work offers\na better view of successes and failures in localization-based text-to-image\nmodel editing. 
Code will be available at\n\\href{https://github.com/samyadeepbasu/LocoGen}{https://github.com/samyadeepbasu/LocoGen}.\n","authors":["Samyadeep Basu","Keivan Rezaei","Ryan Rossi","Cherry Zhao","Vlad Morariu","Varun Manjunatha","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2405.01008v1.pdf","comment":"Appearing in ICML 2024"},{"id":"http://arxiv.org/abs/2405.01004v1","updated":"2024-05-02T05:09:07Z","published":"2024-05-02T05:09:07Z","title":"Deep Learning Models in Speech Recognition: Measuring GPU Energy\n Consumption, Impact of Noise and Model Quantization for Edge Deployment","summary":" Recent transformer-based ASR models have achieved word-error rates (WER)\nbelow 4%, surpassing human annotator accuracy, yet they demand extensive server\nresources, contributing to significant carbon footprints. The traditional\nserver-based architecture of ASR also presents privacy concerns, alongside\nreliability and latency issues due to network dependencies. In contrast,\non-device (edge) ASR enhances privacy, boosts performance, and promotes\nsustainability by effectively balancing energy use and accuracy for specific\napplications. This study examines the effects of quantization, memory demands,\nand energy consumption on the performance of various ASR model inference on the\nNVIDIA Jetson Orin Nano. By analyzing WER and transcription speed across models\nusing FP32, FP16, and INT8 quantization on clean and noisy datasets, we\nhighlight the crucial trade-offs between accuracy, speed, quantization, energy\nefficiency, and memory needs. We found that changing precision from fp32 to\nfp16 halves the energy consumption for audio transcription across different\nmodels, with minimal performance degradation. A larger model size and number of\nparameters neither guarantee better resilience to noise nor predict the\nenergy consumption for a given transcription load. These, along with several\nother findings, offer novel insights for optimizing ASR systems within energy-\nand memory-limited environments, crucial for the development of efficient\non-device ASR solutions. The code and input data needed to reproduce the\nresults in this article are open-sourced and available at\n[https://github.com/zzadiues3338/ASR-energy-jetson].\n","authors":["Aditya Chakravarty"],"pdf_url":"https://arxiv.org/pdf/2405.01004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01002v1","updated":"2024-05-02T04:58:29Z","published":"2024-05-02T04:58:29Z","title":"Spider: A Unified Framework for Context-dependent Concept Understanding","summary":" Different from the context-independent (CI) concepts such as human, car, and\nairplane, context-dependent (CD) concepts require higher visual understanding\nability, such as camouflaged object and medical lesion. Despite the rapid\nadvance of many CD understanding tasks in respective branches, the isolated\nevolution leads to their limited cross-domain generalisation and repetitive\ntechnique innovation. Since there is a strong coupling relationship between\nforeground and background context in CD tasks, existing methods need to\ntrain separate models in their focused domains. This restricts their real-world\nCD concept understanding towards artificial general intelligence (AGI). We\npropose a unified model with a single set of parameters, Spider, which only\nneeds to be trained once. 
With the help of the proposed concept filter driven\nby the image-mask group prompt, Spider is able to understand and distinguish\ndiverse strong context-dependent concepts to accurately capture the Prompter's\nintention. Without bells and whistles, Spider significantly outperforms the\nstate-of-the-art specialized models in 8 different context-dependent\nsegmentation tasks, including 4 natural scenes (salient, camouflaged, and\ntransparent objects and shadow) and 4 medical lesions (COVID-19, polyp, breast,\nand skin lesion with color colonoscopy, CT, ultrasound, and dermoscopy\nmodalities). Besides, Spider shows obvious advantages in continuous learning.\nIt can easily complete the training of new tasks by fine-tuning parameters less\nthan 1\\% and bring a tolerable performance degradation of less than 5\\% for all\nold tasks. The source code will be publicly available at\n\\href{https://github.com/Xiaoqi-Zhao-DLUT/Spider-UniCDSeg}{Spider-UniCDSeg}.\n","authors":["Xiaoqi Zhao","Youwei Pang","Wei Ji","Baicheng Sheng","Jiaming Zuo","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2405.01002v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.00998v1","updated":"2024-05-02T04:31:17Z","published":"2024-05-02T04:31:17Z","title":"Part-aware Shape Generation with Latent 3D Diffusion of Neural Voxel\n Fields","summary":" This paper presents a novel latent 3D diffusion model for the generation of\nneural voxel fields, aiming to achieve accurate part-aware structures. Compared\nto existing methods, there are two key designs to ensure high-quality and\naccurate part-aware generation. On one hand, we introduce a latent 3D diffusion\nprocess for neural voxel fields, enabling generation at significantly higher\nresolutions that can accurately capture rich textural and geometric details. On\nthe other hand, a part-aware shape decoder is introduced to integrate the part\ncodes into the neural voxel fields, guiding the accurate part decomposition and\nproducing high-quality rendering results. Through extensive experimentation and\ncomparisons with state-of-the-art methods, we evaluate our approach across four\ndifferent classes of data. The results demonstrate the superior generative\ncapabilities of our proposed method in part-aware shape generation,\noutperforming existing state-of-the-art methods.\n","authors":["Yuhang Huang","SHilong Zou","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2405.00998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18895v2","updated":"2024-05-02T04:22:40Z","published":"2024-04-29T17:31:00Z","title":"RSCaMa: Remote Sensing Image Change Captioning with State Space Model","summary":" Remote Sensing Image Change Captioning (RSICC) aims to describe surface\nchanges between multi-temporal remote sensing images in language, including the\nchanged object categories, locations, and dynamics of changing objects (e.g.,\nadded or disappeared). This poses challenges to spatial and temporal modeling\nof bi-temporal features. Despite previous methods progressing in the spatial\nchange perception, there are still weaknesses in joint spatial-temporal\nmodeling. To address this, in this paper, we propose a novel RSCaMa model,\nwhich achieves efficient joint spatial-temporal modeling through multiple CaMa\nlayers, enabling iterative refinement of bi-temporal features. 
To achieve\nefficient spatial modeling, we introduce the recently popular Mamba (a state\nspace model) with a global receptive field and linear complexity into the RSICC\ntask and propose the Spatial Difference-aware SSM (SD-SSM), overcoming\nlimitations of previous CNN- and Transformer-based methods in the receptive\nfield and computational complexity. SD-SSM enhances the model's ability to\ncapture spatial changes sharply. In terms of efficient temporal modeling,\nconsidering the potential correlation between the temporal scanning\ncharacteristics of Mamba and the temporality of the RSICC, we propose the\nTemporal-Traversing SSM (TT-SSM), which scans bi-temporal features in a\ntemporal cross-wise manner, enhancing the model's temporal understanding and\ninformation interaction. Experiments validate the effectiveness of the\nefficient joint spatial-temporal modeling and demonstrate the outstanding\nperformance of RSCaMa and the potential of the Mamba in the RSICC task.\nAdditionally, we systematically compare three different language decoders,\nincluding Mamba, GPT-style decoder, and Transformer decoder, providing valuable\ninsights for future RSICC research. The code will be available at\n\\emph{\\url{https://github.com/Chen-Yang-Liu/RSCaMa}}\n","authors":["Chenyang Liu","Keyan Chen","Bowen Chen","Haotian Zhang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2404.18895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18253v3","updated":"2024-05-02T04:05:07Z","published":"2024-04-28T17:20:08Z","title":"Efficient Remote Sensing with Harmonized Transfer Learning and Modality\n Alignment","summary":" With the rise of Visual and Language Pretraining (VLP), an increasing number\nof downstream tasks are adopting the paradigm of pretraining followed by\nfine-tuning. Although this paradigm has demonstrated potential in various\nmultimodal downstream tasks, its implementation in the remote sensing domain\nencounters some obstacles. Specifically, the tendency for same-modality\nembeddings to cluster together impedes efficient transfer learning. To tackle\nthis issue, we review the aim of multimodal transfer learning for downstream\ntasks from a unified perspective, and rethink the optimization process based on\nthree distinct objectives. We propose \"Harmonized Transfer Learning and\nModality Alignment (HarMA)\", a method that simultaneously satisfies task\nconstraints, modality alignment, and single-modality uniform alignment, while\nminimizing training overhead through parameter-efficient fine-tuning.\nRemarkably, without the need for external data for training, HarMA achieves\nstate-of-the-art performance in two popular multimodal retrieval tasks in the\nfield of remote sensing. Our experiments reveal that HarMA achieves competitive\nand even superior performance to fully fine-tuned models with only minimal\nadjustable parameters. Due to its simplicity, HarMA can be integrated into\nalmost all existing multimodal pretraining models. We hope this method can\nfacilitate the efficient application of large models to a wide range of\ndownstream tasks while significantly reducing the resource consumption. 
Code is\navailable at https://github.com/seekerhuang/HarMA.\n","authors":["Tengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.18253v3.pdf","comment":"Accepted by the Twelfth International Conference on Learning\n Representations (ICLR) Workshop"},{"id":"http://arxiv.org/abs/2405.00989v1","updated":"2024-05-02T03:53:59Z","published":"2024-05-02T03:53:59Z","title":"Estimate the building height at a 10-meter resolution based on Sentinel\n data","summary":" Building height is an important indicator for scientific research and\npractical application. However, building height products with a high spatial\nresolution (10m) are still very scarce. To meet the needs of high-resolution\nbuilding height estimation models, this study established a set of\nspatial-spectral-temporal feature databases, combining SAR data provided by\nSentinel-1, optical data provided by Sentinel-2, and shape data provided by\nbuilding footprints. The statistical indicators on the time scale are extracted\nto form a rich database of 160 features. This study combined permutation\nfeature importance, Shapley Additive Explanations, and Random Forest variable\nimportance, and the final stable features were obtained through an expert\nscoring system. This study took 12 large, medium, and small cities in the\nUnited States as the training data. It used moving windows to aggregate the\npixels to mitigate the impact of SAR image displacement and building shadows. This\nstudy built a building height model based on a random forest model and compared\nthree model ensemble methods of bagging, boosting, and stacking. To evaluate\nthe accuracy of the prediction results, this study collected Lidar data in the\ntest area, and the evaluation results showed that its R-Square reached 0.78,\ndemonstrating that building height can be estimated effectively. The fast\nproduction of high-resolution building height data can support large-scale\nscientific research and applications in many fields.\n","authors":["Xin Yan"],"pdf_url":"https://arxiv.org/pdf/2405.00989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00984v1","updated":"2024-05-02T03:43:19Z","published":"2024-05-02T03:43:19Z","title":"FREE: Faster and Better Data-Free Meta-Learning","summary":" Data-Free Meta-Learning (DFML) aims to extract knowledge from a collection of\npre-trained models without requiring the original data, presenting practical\nbenefits in contexts constrained by data privacy concerns. Current DFML methods\nprimarily focus on the data recovery from these pre-trained models. However,\nthey suffer from slow recovery speed and overlook gaps inherent in\nheterogeneous pre-trained models. In response to these challenges, we introduce\nthe Faster and Better Data-Free Meta-Learning (FREE) framework, which contains:\n(i) a meta-generator for rapidly recovering training tasks from pre-trained\nmodels; and (ii) a meta-learner for generalizing to new unseen tasks.\nSpecifically, within the module Faster Inversion via Meta-Generator, each\npre-trained model is perceived as a distinct task. The meta-generator can\nrapidly adapt to a specific task in just five steps, significantly accelerating\nthe data recovery. Furthermore, we propose Better Generalization via\nMeta-Learner and introduce an implicit gradient alignment algorithm to optimize\nthe meta-learner. 
This is achieved as aligned gradient directions alleviate\npotential conflicts among tasks from heterogeneous pre-trained models.\nEmpirical experiments on multiple benchmarks affirm the superiority of our\napproach, marking a notable speed-up (20$\\times$) and performance enhancement\n(1.42\\% $\\sim$ 4.78\\%) in comparison to the state-of-the-art.\n","authors":["Yongxian Wei","Zixuan Hu","Zhenyi Wang","Li Shen","Chun Yuan","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2405.00984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00983v1","updated":"2024-05-02T03:38:58Z","published":"2024-05-02T03:38:58Z","title":"LLM-AD: Large Language Model based Audio Description System","summary":" The development of Audio Description (AD) has been a pivotal step forward in\nmaking video content more accessible and inclusive. Traditionally, AD\nproduction has demanded a considerable amount of skilled labor, while existing\nautomated approaches still necessitate extensive training to integrate\nmultimodal inputs and tailor the output from a captioning style to an AD style.\nIn this paper, we introduce an automated AD generation pipeline that harnesses\nthe potent multimodal and instruction-following capacities of GPT-4V(ision).\nNotably, our methodology employs readily available components, eliminating the\nneed for additional training. It produces ADs that not only comply with\nestablished natural language AD production standards but also maintain\ncontextually consistent character information across frames, courtesy of a\ntracking-based character recognition module. A thorough analysis on the MAD\ndataset reveals that our approach achieves a performance on par with\nlearning-based methods in automated AD production, as substantiated by a CIDEr\nscore of 20.5.\n","authors":["Peng Chu","Jiang Wang","Andre Abrantes"],"pdf_url":"https://arxiv.org/pdf/2405.00983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00980v1","updated":"2024-05-02T03:33:17Z","published":"2024-05-02T03:33:17Z","title":"A Hong Kong Sign Language Corpus Collected from Sign-interpreted TV News","summary":" This paper introduces TVB-HKSL-News, a new Hong Kong sign language (HKSL)\ndataset collected from a TV news program over a period of 7 months. The dataset\nis collected to enrich resources for HKSL and support research in\nlarge-vocabulary continuous sign language recognition (SLR) and translation\n(SLT). It consists of 16.07 hours of sign videos of two signers with a\nvocabulary of 6,515 glosses (for SLR) and 2,850 Chinese characters or 18K\nChinese words (for SLT). One signer has 11.66 hours of sign videos and the\nother has 4.41 hours. One objective in building the dataset is to support the\ninvestigation of how well large-vocabulary continuous sign language\nrecognition/translation can be done for a single signer given a (relatively)\nlarge amount of his/her training data, which could potentially lead to the\ndevelopment of new modeling methods. Besides, most parts of the data collection\npipeline are automated with little human intervention; we believe that our\ncollection method can be scaled up to collect more sign language data easily\nfor SLT in the future for any sign languages if such sign-interpreted videos\nare available. 
We also run a SOTA SLR/SLT model on the dataset and get a\nbaseline SLR word error rate of 34.08% and a baseline SLT BLEU-4 score of 23.58\nfor benchmarking future research on the dataset.\n","authors":["Zhe Niu","Ronglai Zuo","Brian Mak","Fangyun Wei"],"pdf_url":"https://arxiv.org/pdf/2405.00980v1.pdf","comment":"Accepted by LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2309.00733v4","updated":"2024-05-02T03:28:00Z","published":"2023-09-01T20:59:46Z","title":"TExplain: Explaining Learned Visual Features via Pre-trained (Frozen)\n Language Models","summary":" Interpreting the learned features of vision models has posed a longstanding\nchallenge in the field of machine learning. To address this issue, we propose a\nnovel method that leverages the capabilities of language models to interpret\nthe learned features of pre-trained image classifiers. Our method, called\nTExplain, tackles this task by training a neural network to establish a\nconnection between the feature space of image classifiers and language models.\nThen, during inference, our approach generates a vast number of sentences to\nexplain the features learned by the classifier for a given image. These\nsentences are then used to extract the most frequent words, providing a\ncomprehensive understanding of the learned features and patterns within the\nclassifier. Our method, for the first time, utilizes these frequent words\ncorresponding to a visual representation to provide insights into the\ndecision-making process of the independently trained classifier, enabling the\ndetection of spurious correlations, biases, and a deeper comprehension of its\nbehavior. To validate the effectiveness of our approach, we conduct experiments\non diverse datasets, including ImageNet-9L and Waterbirds. The results\ndemonstrate the potential of our method to enhance the interpretability and\nrobustness of image classifiers.\n","authors":["Saeid Asgari Taghanaki","Aliasghar Khani","Ali Saheb Pasand","Amir Khasahmadi","Aditya Sanghi","Karl D. D. Willis","Ali Mahdavi-Amiri"],"pdf_url":"https://arxiv.org/pdf/2309.00733v4.pdf","comment":"Accepted to ICLR 2024, Reliable and Responsible Foundation Models\n workshop"},{"id":"http://arxiv.org/abs/2305.00194v5","updated":"2024-05-02T03:19:33Z","published":"2023-04-29T08:16:12Z","title":"Searching from Area to Point: A Hierarchical Framework for\n Semantic-Geometric Combined Feature Matching","summary":" Feature matching is a crucial technique in computer vision. A unified\nperspective for this task is to treat it as a searching problem, aiming at an\nefficient search strategy to narrow the search space to point matches between\nimages. One of the key aspects of search strategy is the search space, which in\ncurrent approaches is not carefully defined, resulting in limited matching\naccuracy. This paper, thus, pays attention to the search space and proposes to\nset the initial search space for point matching as the matched image areas\ncontaining prominent semantic, named semantic area matches. This search space\nfavors point matching by salient features and alleviates the accuracy\nlimitation in recent Transformer-based matching methods. To achieve this search\nspace, we introduce a hierarchical feature matching framework: Area to Point\nMatching (A2PM), to first find semantic area matches between images and later\nperform point matching on area matches. 
We further propose Semantic and\nGeometry Area Matching (SGAM) method to realize this framework, which utilizes\nsemantic prior and geometry consistency to establish accurate area matches\nbetween images. By integrating SGAM with off-the-shelf state-of-the-art\nmatchers, our method, adopting the A2PM framework, achieves encouraging\nprecision improvements in massive point matching and pose estimation\nexperiments.\n","authors":["Yesheng Zhang","Xu Zhao","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2305.00194v5.pdf","comment":"v3"},{"id":"http://arxiv.org/abs/2405.00962v1","updated":"2024-05-02T02:58:28Z","published":"2024-05-02T02:58:28Z","title":"FITA: Fine-grained Image-Text Aligner for Radiology Report Generation","summary":" Radiology report generation aims to automatically generate detailed and\ncoherent descriptive reports alongside radiology images. Previous work mainly\nfocused on refining fine-grained image features or leveraging external\nknowledge. However, the precise alignment of fine-grained image features with\ncorresponding text descriptions has not been considered. This paper presents a\nnovel method called Fine-grained Image-Text Aligner (FITA) to construct\nfine-grained alignment for image and text features. It has three novel designs:\nImage Feature Refiner (IFR), Text Feature Refiner (TFR) and Contrastive Aligner\n(CA). IFR and TFR aim to learn fine-grained image and text features,\nrespectively. We achieve this by leveraging saliency maps to effectively fuse\nsymptoms with corresponding abnormal visual regions, and by utilizing a\nmeticulously constructed triplet set for training. Finally, CA module aligns\nfine-grained image and text features using contrastive loss for precise\nalignment. Results show that our method surpasses existing methods on the\nwidely used benchmark\n","authors":["Honglong Yang","Hui Tang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2405.00962v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.05374v2","updated":"2024-05-02T02:41:50Z","published":"2024-02-08T03:12:25Z","title":"CIC: A framework for Culturally-aware Image Captioning","summary":" Image Captioning generates descriptive sentences from images using\nVision-Language Pre-trained models (VLPs) such as BLIP, which has improved\ngreatly. However, current methods lack the generation of detailed descriptive\ncaptions for the cultural elements depicted in the images, such as the\ntraditional clothing worn by people from Asian cultural groups. In this paper,\nwe propose a new framework, \\textbf{Culturally-aware Image Captioning (CIC)},\nthat generates captions and describes cultural elements extracted from cultural\nvisual elements in images representing cultures. Inspired by methods combining\nvisual modality and Large Language Models (LLMs) through appropriate prompts,\nour framework (1) generates questions based on cultural categories from images,\n(2) extracts cultural visual elements from Visual Question Answering (VQA)\nusing generated questions, and (3) generates culturally-aware captions using\nLLMs with the prompts. Our human evaluation conducted on 45 participants from 4\ndifferent cultural groups with a high understanding of the corresponding\nculture shows that our proposed framework generates more culturally descriptive\ncaptions when compared to the image captioning baseline based on VLPs. 
Our code\nand dataset will be made publicly available upon acceptance.\n","authors":["Youngsik Yun","Jihie Kim"],"pdf_url":"https://arxiv.org/pdf/2402.05374v2.pdf","comment":"Accepted in IJCAI 2024"},{"id":"http://arxiv.org/abs/2405.00956v1","updated":"2024-05-02T02:34:19Z","published":"2024-05-02T02:34:19Z","title":"Efficient Data-driven Scene Simulation using Robotic Surgery Videos via\n Physics-embedded 3D Gaussians","summary":" Surgical scene simulation plays a crucial role in surgical education and\nsimulator-based robot learning. Traditional approaches for creating these\nenvironments with surgical scene involve a labor-intensive process where\ndesigners hand-craft tissues models with textures and geometries for soft body\nsimulations. This manual approach is not only time-consuming but also limited\nin the scalability and realism. In contrast, data-driven simulation offers a\ncompelling alternative. It has the potential to automatically reconstruct 3D\nsurgical scenes from real-world surgical video data, followed by the\napplication of soft body physics. This area, however, is relatively uncharted.\nIn our research, we introduce 3D Gaussian as a learnable representation for\nsurgical scene, which is learned from stereo endoscopic video. To prevent\nover-fitting and ensure the geometrical correctness of these scenes, we\nincorporate depth supervision and anisotropy regularization into the Gaussian\nlearning process. Furthermore, we apply the Material Point Method, which is\nintegrated with physical properties, to the 3D Gaussians to achieve realistic\nscene deformations. Our method was evaluated on our collected in-house and\npublic surgical videos datasets. Results show that it can reconstruct and\nsimulate surgical scenes from endoscopic videos efficiently-taking only a few\nminutes to reconstruct the surgical scene-and produce both visually and\nphysically plausible deformations at a speed approaching real-time. The results\ndemonstrate great potential of our proposed method to enhance the efficiency\nand variety of simulations available for surgical education and robot learning.\n","authors":["Zhenya Yang","Kai Chen","Yonghao Long","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2405.00956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00954v1","updated":"2024-05-02T02:30:39Z","published":"2024-05-02T02:30:39Z","title":"X-Oscar: A Progressive Framework for High-quality Text-guided 3D\n Animatable Avatar Generation","summary":" Recent advancements in automatic 3D avatar generation guided by text have\nmade significant progress. However, existing methods have limitations such as\noversaturation and low-quality output. To address these challenges, we propose\nX-Oscar, a progressive framework for generating high-quality animatable avatars\nfrom text prompts. It follows a sequential Geometry->Texture->Animation\nparadigm, simplifying optimization through step-by-step generation. To tackle\noversaturation, we introduce Adaptive Variational Parameter (AVP), representing\navatars as an adaptive distribution during training. Additionally, we present\nAvatar-aware Score Distillation Sampling (ASDS), a novel technique that\nincorporates avatar-aware noise into rendered images for improved generation\nquality during optimization. Extensive evaluations confirm the superiority of\nX-Oscar over existing text-to-3D and text-to-avatar approaches. 
Our anonymous\nproject page: https://xmu-xiaoma666.github.io/Projects/X-Oscar/.\n","authors":["Yiwei Ma","Zhekai Lin","Jiayi Ji","Yijun Fan","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2405.00954v1.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2404.13320v2","updated":"2024-05-02T02:25:39Z","published":"2024-04-20T08:28:43Z","title":"Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than\n We Think","summary":" Adversarial examples for diffusion models are widely used as solutions for\nsafety concerns. By adding adversarial perturbations to personal images,\nattackers cannot edit or imitate them easily. However, it is essential to note\nthat all these protections target latent diffusion models (LDMs), while the\nadversarial examples for diffusion models in the pixel space (PDMs) are largely\noverlooked. This may mislead us to think that the diffusion models are\nvulnerable to adversarial attacks like most deep models. In this paper, we show\nnovel findings that even though gradient-based white-box attacks can be used\nto attack the LDMs, they fail to attack PDMs. This finding is supported by\nextensive experiments with a wide range of attacking methods on various\nPDMs and LDMs with different model structures, which means diffusion models are\nindeed much more robust against adversarial attacks. We also find that PDMs can\nbe used as an off-the-shelf purifier to effectively remove the adversarial\npatterns that were generated on LDMs to protect the images, which means that\nmost protection methods nowadays, to some extent, cannot protect our images\nfrom malicious attacks. We hope that our insights will inspire the community to\nrethink the adversarial samples for diffusion models as protection methods and\nmove forward to more effective protection. Codes are available at\nhttps://github.com/xavihart/PDM-Pure.\n","authors":["Haotian Xue","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00951v1","updated":"2024-05-02T02:23:38Z","published":"2024-05-02T02:23:38Z","title":"Hyperspectral Band Selection based on Generalized 3DTV and Tensor CUR\n Decomposition","summary":" Hyperspectral Imaging (HSI) serves as an important technique in remote\nsensing. However, high dimensionality and data volume typically pose\nsignificant computational challenges. Band selection is essential for reducing\nspectral redundancy in hyperspectral imagery while retaining intrinsic critical\ninformation. In this work, we propose a novel hyperspectral band selection\nmodel by decomposing the data into a low-rank and smooth component and a sparse\none. In particular, we develop a generalized 3D total variation (G3DTV) by\napplying the $\\ell_1^p$-norm to derivatives to preserve spatial-spectral\nsmoothness. By employing the alternating direction method of multipliers\n(ADMM), we derive an efficient algorithm, where the tensor low-rankness is\nimplied by the tensor CUR decomposition. We demonstrate the effectiveness of\nthe proposed approach through comparisons with various other state-of-the-art\nband selection techniques using two benchmark real-world datasets. 
In addition,\nwe provide practical guidelines for parameter selection in both noise-free and\nnoisy scenarios.\n","authors":["Katherine Henneberger","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2405.00951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06464v3","updated":"2024-05-02T02:21:18Z","published":"2023-03-11T17:30:36Z","title":"PARASOL: Parametric Style Control for Diffusion Image Synthesis","summary":" We propose PARASOL, a multi-modal synthesis model that enables disentangled,\nparametric control of the visual style of the image by jointly conditioning\nsynthesis on both content and a fine-grained visual style embedding. We train a\nlatent diffusion model (LDM) using specific losses for each modality and adapt\nthe classifier-free guidance for encouraging disentangled control over\nindependent content and style modalities at inference time. We leverage\nauxiliary semantic and style-based search to create training triplets for\nsupervision of the LDM, ensuring complementarity of content and style cues.\nPARASOL shows promise for enabling nuanced control over visual style in\ndiffusion models for image creation and stylization, as well as generative\nsearch where text-based search results may be adapted to more closely match\nuser intent by interpolating both content and style descriptors.\n","authors":["Gemma Canet Tarrés","Dan Ruta","Tu Bui","John Collomosse"],"pdf_url":"https://arxiv.org/pdf/2303.06464v3.pdf","comment":"Camera-ready version"},{"id":"http://arxiv.org/abs/2404.17883v2","updated":"2024-05-02T02:12:42Z","published":"2024-04-27T12:42:26Z","title":"Underwater Variable Zoom: Depth-Guided Perception Network for Underwater\n Image Enhancement","summary":" Underwater scenes intrinsically involve degradation problems owing to\nheterogeneous ocean elements. Prevailing underwater image enhancement (UIE)\nmethods stick to straightforward feature modeling to learn the mapping\nfunction, which leads to limited vision gain as it lacks more explicit physical\ncues (e.g., depth). In this work, we investigate injecting the depth prior into\nthe deep UIE model for more precise scene enhancement capability. To this end,\nwe present a novel depth-guided perception UIE framework, dubbed underwater\nvariable zoom (UVZ). Specifically, UVZ resorts to a two-stage pipeline. First,\na depth estimation network is designed to generate critical depth maps,\ncombined with an auxiliary supervision network introduced to suppress\nestimation differences during training. Second, UVZ parses near-far scenarios\nby harnessing the predicted depth maps, enabling local and non-local perceiving\nin different regions. Extensive experiments on five benchmark datasets\ndemonstrate that UVZ achieves superior visual gain and delivers promising\nquantitative metrics. Besides, UVZ is confirmed to exhibit good generalization\nin some visual tasks, especially in unusual lighting conditions. 
The code,\nmodels and results are available at: https://github.com/WindySprint/UVZ.\n","authors":["Zhixiong Huang","Xinying Wang","Jinjiang Li","Shenglan Liu","Lin Feng"],"pdf_url":"https://arxiv.org/pdf/2404.17883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00942v1","updated":"2024-05-02T02:04:01Z","published":"2024-05-02T02:04:01Z","title":"LLaVA Finds Free Lunch: Teaching Human Behavior Improves Content\n Understanding Abilities Of LLMs","summary":" Communication is defined as ``Who says what to whom with what effect.'' A\nmessage from a communicator generates downstream receiver effects, also known\nas behavior. Receiver behavior, being a downstream effect of the message,\ncarries rich signals about it. Even after carrying signals about the message,\nthe behavior data is often ignored while training large language models. We\nshow that training LLMs on receiver behavior can actually help improve their\ncontent-understanding abilities. Specifically, we show that training LLMs to\npredict the receiver behavior of likes and comments improves the LLM's\nperformance on a wide variety of downstream content understanding tasks. We\nshow this performance increase over 40 video and image understanding tasks over\n23 benchmark datasets across both 0-shot and fine-tuning settings,\noutperforming many supervised baselines. Moreover, since receiver behavior,\nsuch as likes and comments, is collected by default on the internet and does\nnot need any human annotations to be useful, the performance improvement we get\nafter training on this data is essentially free-lunch. We release the receiver\nbehavior cleaned comments and likes of 750k images and videos collected from\nmultiple platforms along with our instruction-tuning data.\n","authors":["Somesh Singh","Harini S I","Yaman K Singla","Veeky Baths","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2405.00942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01917v3","updated":"2024-05-02T01:22:42Z","published":"2023-03-03T13:36:55Z","title":"Pyramid Pixel Context Adaption Network for Medical Image Classification\n with Supervised Contrastive Learning","summary":" Spatial attention mechanism has been widely incorporated into deep neural\nnetworks (DNNs), significantly lifting the performance in computer vision tasks\nvia long-range dependency modeling. However, it may perform poorly in medical\nimage analysis. Unfortunately, existing efforts are often unaware that\nlong-range dependency modeling has limitations in highlighting subtle lesion\nregions. To overcome this limitation, we propose a practical yet lightweight\narchitectural unit, Pyramid Pixel Context Adaption (PPCA) module, which\nexploits multi-scale pixel context information to recalibrate pixel position in\na pixel-independent manner dynamically. PPCA first applies a well-designed\ncross-channel pyramid pooling to aggregate multi-scale pixel context\ninformation, then eliminates the inconsistency among them by the well-designed\npixel normalization, and finally estimates per pixel attention weight via a\npixel context integration. By embedding PPCA into a DNN with negligible\noverhead, the PPCANet is developed for medical image classification. In\naddition, we introduce supervised contrastive learning to enhance feature\nrepresentation by exploiting the potential of label information via supervised\ncontrastive loss. 
The extensive experiments on six medical image datasets show\nthat PPCANet outperforms state-of-the-art attention-based networks and recent\ndeep neural networks. We also provide visual analysis and ablation study to\nexplain the behavior of PPCANet in the decision-making process.\n","authors":["Xiaoqing Zhang","Zunjie Xiao","Xiao Wu","Yanlin Chen","Jilu Zhao","Yan Hu","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2303.01917v3.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.15692v2","updated":"2024-05-02T01:12:48Z","published":"2023-08-30T01:21:11Z","title":"Intriguing Properties of Diffusion Models: An Empirical Study of the\n Natural Attack Capability in Text-to-Image Generative Models","summary":" Denoising probabilistic diffusion models have shown breakthrough performance\nto generate more photo-realistic images or human-level illustrations than the\nprior models such as GANs. This high image-generation capability has stimulated\nthe creation of many downstream applications in various areas. However, we find\nthat this technology is actually a double-edged sword: We identify a new type\nof attack, called the Natural Denoising Diffusion (NDD) attack based on the\nfinding that state-of-the-art deep neural network (DNN) models still hold their\nprediction even if we intentionally remove their robust features, which are\nessential to the human visual system (HVS), through text prompts. The NDD\nattack shows a significantly high capability to generate low-cost,\nmodel-agnostic, and transferable adversarial attacks by exploiting the natural\nattack capability in diffusion models. To systematically evaluate the risk of\nthe NDD attack, we perform a large-scale empirical study with our newly created\ndataset, the Natural Denoising Diffusion Attack (NDDA) dataset. We evaluate the\nnatural attack capability by answering 6 research questions. Through a user\nstudy, we find that it can achieve an 88% detection rate while being stealthy\nto 93% of human subjects; we also find that the non-robust features embedded by\ndiffusion models contribute to the natural attack capability. To confirm the\nmodel-agnostic and transferable attack capability, we perform the NDD attack\nagainst the Tesla Model 3 and find that 73% of the physically printed attacks\ncan be detected as stop signs. Our hope is that the study and dataset can help\nour community be aware of the risks in diffusion models and facilitate further\nresearch toward robust DNN models.\n","authors":["Takami Sato","Justin Yue","Nanze Chen","Ningfei Wang","Qi Alfred Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19444v2","updated":"2024-05-02T01:12:02Z","published":"2024-04-30T10:48:43Z","title":"AnomalyXFusion: Multi-modal Anomaly Synthesis with Diffusion","summary":" Anomaly synthesis is one of the effective methods to augment abnormal samples\nfor training. However, current anomaly synthesis methods predominantly rely on\ntexture information as input, which limits the fidelity of synthesized abnormal\nsamples. Because texture information is insufficient to correctly depict the\npattern of anomalies, especially for logical anomalies. To surmount this\nobstacle, we present the AnomalyXFusion framework, designed to harness\nmulti-modality information to enhance the quality of synthesized abnormal\nsamples. The AnomalyXFusion framework comprises two distinct yet synergistic\nmodules: the Multi-modal In-Fusion (MIF) module and the Dynamic Dif-Fusion\n(DDF) module. 
The MIF module refines modality alignment by aggregating and\nintegrating various modality features into a unified embedding space, termed\nX-embedding, which includes image, text, and mask features. Concurrently, the\nDDF module facilitates controlled generation through an adaptive adjustment of\nX-embedding conditioned on the diffusion steps. In addition, to reveal the\nmulti-modality representational power of AnomalyXFusion, we propose a new\ndataset, called MVTec Caption. More precisely, MVTec Caption extends 2.2k\naccurate image-mask-text annotations for the MVTec AD and LOCO datasets.\nComprehensive evaluations demonstrate the effectiveness of AnomalyXFusion,\nespecially regarding the fidelity and diversity for logical anomalies. Project\npage: http:github.com/hujiecpp/MVTec-Caption\n","authors":["Jie Hu","Yawen Huang","Yilin Lu","Guoyang Xie","Guannan Jiang","Yefeng Zheng","Zhichao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.19444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12605v2","updated":"2024-05-02T01:07:49Z","published":"2023-08-24T07:11:00Z","title":"APLA: Additional Perturbation for Latent Noise with Adversarial Training\n Enables Consistency","summary":" Diffusion models have exhibited promising progress in video generation.\nHowever, they often struggle to retain consistent details within local regions\nacross frames. One underlying cause is that traditional diffusion models\napproximate Gaussian noise distribution by utilizing predictive noise, without\nfully accounting for the impact of inherent information within the input\nitself. Additionally, these models emphasize the distinction between\npredictions and references, neglecting information intrinsic to the videos. To\naddress this limitation, inspired by the self-attention mechanism, we propose a\nnovel text-to-video (T2V) generation network structure based on diffusion\nmodels, dubbed Additional Perturbation for Latent noise with Adversarial\ntraining (APLA). Our approach only necessitates a single video as input and\nbuilds upon pre-trained stable diffusion networks. Notably, we introduce an\nadditional compact network, known as the Video Generation Transformer (VGT).\nThis auxiliary component is designed to extract perturbations from the inherent\ninformation contained within the input, thereby refining inconsistent pixels\nduring temporal predictions. We leverage a hybrid architecture of transformers\nand convolutions to compensate for temporal intricacies, enhancing consistency\nbetween different frames within the video. Experiments demonstrate a noticeable\nimprovement in the consistency of the generated videos both qualitatively and\nquantitatively.\n","authors":["Yupu Yao","Shangqi Deng","Zihan Cao","Harry Zhang","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2308.12605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00142v2","updated":"2024-05-02T00:44:21Z","published":"2024-04-30T18:39:41Z","title":"Utilizing Machine Learning and 3D Neuroimaging to Predict Hearing Loss:\n A Comparative Analysis of Dimensionality Reduction and Regression Techniques","summary":" In this project, we have explored machine learning approaches for predicting\nhearing loss thresholds on the brain's gray matter 3D images. We have solved\nthe problem statement in two phases. In the first phase, we used a 3D CNN model\nto reduce high-dimensional input into latent space and decode it into an\noriginal image to represent the input in rich feature space. 
In the second\nphase, we utilized this model to reduce input into rich features and used these\nfeatures to train standard machine learning models for predicting hearing\nthresholds. We have experimented with autoencoders and variational autoencoders\nin the first phase for dimensionality reduction and explored random forest,\nXGBoost and multi-layer perceptron for regressing the thresholds. We split the\ngiven data set into training and testing sets and achieved an 8.80 range and\n22.57 range for PT500 and PT4000 on the test set, respectively. We got the\nlowest RMSE using multi-layer perceptron among the other models.\n Our approach leverages the unique capabilities of VAEs to capture complex,\nnon-linear relationships within high-dimensional neuroimaging data. We\nrigorously evaluated the models using various metrics, focusing on the root\nmean squared error (RMSE). The results highlight the efficacy of the\nmulti-layer neural network model, which outperformed other techniques in terms\nof accuracy. This project advances the application of data mining in medical\ndiagnostics and enhances our understanding of age-related hearing loss through\ninnovative machine-learning frameworks.\n","authors":["Trinath Sai Subhash Reddy Pittala","Uma Maheswara R Meleti","Manasa Thatipamula"],"pdf_url":"https://arxiv.org/pdf/2405.00142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17626v2","updated":"2024-05-02T00:08:36Z","published":"2023-10-26T17:45:26Z","title":"A Survey on Transferability of Adversarial Examples across Deep Neural\n Networks","summary":" The emergence of Deep Neural Networks (DNNs) has revolutionized various\ndomains by enabling the resolution of complex tasks spanning image recognition,\nnatural language processing, and scientific problem-solving. However, this\nprogress has also brought to light a concerning vulnerability: adversarial\nexamples. These crafted inputs, imperceptible to humans, can manipulate machine\nlearning models into making erroneous predictions, raising concerns for\nsafety-critical applications. An intriguing property of this phenomenon is the\ntransferability of adversarial examples, where perturbations crafted for one\nmodel can deceive another, often with a different architecture. This intriguing\nproperty enables black-box attacks which circumvents the need for detailed\nknowledge of the target model. This survey explores the landscape of the\nadversarial transferability of adversarial examples. We categorize existing\nmethodologies to enhance adversarial transferability and discuss the\nfundamental principles guiding each approach. While the predominant body of\nresearch primarily concentrates on image classification, we also extend our\ndiscussion to encompass other vision tasks and beyond. 
Challenges and\nopportunities are discussed, highlighting the importance of fortifying DNNs\nagainst adversarial vulnerabilities in an evolving landscape.\n","authors":["Jindong Gu","Xiaojun Jia","Pau de Jorge","Wenqain Yu","Xinwei Liu","Avery Ma","Yuan Xun","Anjun Hu","Ashkan Khakzar","Zhijiang Li","Xiaochun Cao","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2310.17626v2.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2405.00915v1","updated":"2024-05-02T00:04:02Z","published":"2024-05-02T00:04:02Z","title":"EchoScene: Indoor Scene Generation via Information Echo over Scene Graph\n Diffusion","summary":" We present EchoScene, an interactive and controllable generative model that\ngenerates 3D indoor scenes on scene graphs. EchoScene leverages a dual-branch\ndiffusion model that dynamically adapts to scene graphs. Existing methods\nstruggle to handle scene graphs due to varying numbers of nodes, multiple edge\ncombinations, and manipulator-induced node-edge operations. EchoScene overcomes\nthis by associating each node with a denoising process and enables\ncollaborative information exchange, enhancing controllable and consistent\ngeneration aware of global constraints. This is achieved through an information\necho scheme in both shape and layout branches. At every denoising step, all\nprocesses share their denoising data with an information exchange unit that\ncombines these updates using graph convolution. The scheme ensures that the\ndenoising processes are influenced by a holistic understanding of the scene\ngraph, facilitating the generation of globally coherent scenes. The resulting\nscenes can be manipulated during inference by editing the input scene graph and\nsampling the noise in the diffusion model. Extensive experiments validate our\napproach, which maintains scene controllability and surpasses previous methods\nin generation fidelity. Moreover, the generated scenes are of high quality and\nthus directly compatible with off-the-shelf texture generation. Code and\ntrained models are open-sourced.\n","authors":["Guangyao Zhai","Evin Pınar Örnek","Dave Zhenyu Chen","Ruotong Liao","Yan Di","Nassir Navab","Federico Tombari","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2405.00915v1.pdf","comment":"25 pages. 10 figures"},{"id":"http://arxiv.org/abs/2009.08618v2","updated":"2024-05-02T00:53:37Z","published":"2020-09-18T03:53:18Z","title":"6-DoF Grasp Planning using Fast 3D Reconstruction and Grasp Quality CNN","summary":" Recent consumer demand for home robots has accelerated performance of robotic\ngrasping. However, a key component of the perception pipeline, the depth\ncamera, is still expensive and inaccessible to most consumers. In addition,\ngrasp planning has significantly improved recently, by leveraging large\ndatasets and cloud robotics, and by limiting the state and action space to\ntop-down grasps with 4 degrees of freedom (DoF). By leveraging multi-view\ngeometry of the object using inexpensive equipment such as off-the-shelf RGB\ncameras and state-of-the-art algorithms such as Learn Stereo Machine\n(LSM\\cite{kar2017learning}), the robot is able to generate more robust grasps\nfrom different angles with 6-DoF. 
In this paper, we present a modification of\nLSM to graspable objects, evaluate the grasps, and develop a 6-DoF grasp\nplanner based on Grasp-Quality CNN (GQ-CNN\\cite{mahler2017dex}) that exploits\nmultiple camera views to plan a robust grasp, even in the absence of a possible\ntop-down grasp.\n","authors":["Yahav Avigal","Samuel Paradis","Harry Zhang"],"pdf_url":"https://arxiv.org/pdf/2009.08618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01776v1","updated":"2024-05-02T23:24:27Z","published":"2024-05-02T23:24:27Z","title":"An Approach to Systematic Data Acquisition and Data-Driven Simulation\n for the Safety Testing of Automated Driving Functions","summary":" With growing complexity and criticality of automated driving functions in\nroad traffic and their operational design domains (ODD), there is increasing\ndemand for covering significant proportions of development, validation, and\nverification in virtual environments and through simulation models.\n If, however, simulations are meant not only to augment real-world\nexperiments, but to replace them, quantitative approaches are required that\nmeasure to what degree and under which preconditions simulation models\nadequately represent reality, and thus, using their results accordingly.\nEspecially in R&D areas related to the safety impact of the \"open world\", there\nis a significant shortage of real-world data to parameterize and/or validate\nsimulations - especially with respect to the behavior of human traffic\nparticipants, whom automated driving functions will meet in mixed traffic.\n We present an approach to systematically acquire data in public traffic by\nheterogeneous means, transform it into a unified representation, and use it to\nautomatically parameterize traffic behavior models for use in data-driven\nvirtual validation of automated driving functions.\n","authors":["Leon Eisemann","Mirjam Fehling-Kaschek","Henrik Gommel","David Hermann","Marvin Klemp","Martin Lauer","Benjamin Lickert","Florian Luettner","Robin Moss","Nicole Neis","Maria Pohle","Simon Romanski","Daniel Stadler","Alexander Stolz","Jens Ziehn","Jingxing Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.01776v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2210.00314v4","updated":"2024-05-02T22:59:51Z","published":"2022-10-01T16:31:44Z","title":"Learning Hierarchical Image Segmentation For Recognition and By\n Recognition","summary":" Large vision and language models learned directly through image-text\nassociations often lack detailed visual substantiation, whereas image\nsegmentation tasks are treated separately from recognition, supervisedly\nlearned without interconnections. Our key observation is that, while an image\ncan be recognized in multiple ways, each has a consistent part-and-whole visual\norganization. Segmentation thus should be treated not as an end task to be\nmastered through supervised learning, but as an internal process that evolves\nwith and supports the ultimate goal of recognition. We propose to integrate a\nhierarchical segmenter into the recognition process, train and adapt the entire\nmodel solely on image-level recognition objectives. We learn hierarchical\nsegmentation for free alongside recognition, automatically uncovering\npart-to-whole relationships that not only underpin but also enhance\nrecognition. 
Enhancing the Vision Transformer (ViT) with adaptive segment\ntokens and graph pooling, our model surpasses ViT in unsupervised part-whole\ndiscovery, semantic segmentation, image classification, and efficiency.\nNotably, our model (trained on unlabeled 1M ImageNet images) outperforms SAM\n(trained on 11M images and 1 billion masks) by absolute 8% in mIoU on\nPartImageNet object segmentation.\n","authors":["Tsung-Wei Ke","Sangwoo Mo","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2210.00314v4.pdf","comment":"ICLR 2024 (spotlight). First two authors contributed equally. Code\n available at https://github.com/twke18/CAST"},{"id":"http://arxiv.org/abs/2404.15523v2","updated":"2024-05-02T21:37:41Z","published":"2024-04-23T21:11:30Z","title":"Understanding Hyperbolic Metric Learning through Hard Negative Sampling","summary":" In recent years, there has been a growing trend of incorporating hyperbolic\ngeometry methods into computer vision. While these methods have achieved\nstate-of-the-art performance on various metric learning tasks using hyperbolic\ndistance measurements, the underlying theoretical analysis supporting this\nsuperior performance remains under-exploited. In this study, we investigate the\neffects of integrating hyperbolic space into metric learning, particularly when\ntraining with contrastive loss. We identify a need for a comprehensive\ncomparison between Euclidean and hyperbolic spaces regarding the temperature\neffect in the contrastive loss within the existing literature. To address this\ngap, we conduct an extensive investigation to benchmark the results of Vision\nTransformers (ViTs) using a hybrid objective function that combines loss from\nEuclidean and hyperbolic spaces. Additionally, we provide a theoretical\nanalysis of the observed performance improvement. We also reveal that\nhyperbolic metric learning is highly related to hard negative sampling,\nproviding insights for future work. This work will provide valuable data points\nand experience in understanding hyperbolic image embeddings. To shed more light\non problem-solving and encourage further investigation into our approach, our\ncode is available online (https://github.com/YunYunY/HypMix).\n","authors":["Yun Yue","Fangzhou Lin","Guanyi Mou","Ziming Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.15523v2.pdf","comment":"published in Proceedings of the IEEE/CVF Winter Conference on\n Applications of Computer Vision. 2024"},{"id":"http://arxiv.org/abs/2405.01750v1","updated":"2024-05-02T21:35:45Z","published":"2024-05-02T21:35:45Z","title":"PointCompress3D -- A Point Cloud Compression Framework for Roadside\n LiDARs in Intelligent Transportation Systems","summary":" In the context of Intelligent Transportation Systems (ITS), efficient data\ncompression is crucial for managing large-scale point cloud data acquired by\nroadside LiDAR sensors. The demand for efficient storage, streaming, and\nreal-time object detection capabilities for point cloud data is substantial.\nThis work introduces PointCompress3D, a novel point cloud compression framework\ntailored specifically for roadside LiDARs. Our framework addresses the\nchallenges of compressing high-resolution point clouds while maintaining\naccuracy and compatibility with roadside LiDAR sensors. We adapt, extend,\nintegrate, and evaluate three cutting-edge compression methods using our\nreal-world-based TUMTraf dataset family. 
We achieve a frame rate of 10 FPS\nwhile keeping compression sizes below 105 Kb, a reduction of 50 times, and\nmaintaining object detection performance on par with the original data. In\nextensive experiments and ablation studies, we finally achieved a PSNR d2 of\n94.46 and a BPP of 6.54 on our dataset. Future work includes the deployment on\nthe live system. The code is available on our project website:\nhttps://pointcompress3d.github.io.\n","authors":["Walter Zimmer","Ramandika Pranamulia","Xingcheng Zhou","Mingyu Liu","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2405.01750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01734v1","updated":"2024-05-02T21:09:39Z","published":"2024-05-02T21:09:39Z","title":"Diabetic Retinopathy Detection Using Quantum Transfer Learning","summary":" Diabetic Retinopathy (DR), a prevalent complication in diabetes patients, can\nlead to vision impairment due to lesions formed on the retina. Detecting DR at\nan advanced stage often results in irreversible blindness. The traditional\nprocess of diagnosing DR through retina fundus images by ophthalmologists is\nnot only time-intensive but also expensive. While classical transfer learning\nmodels have been widely adopted for computer-aided detection of DR, their high\nmaintenance costs can hinder their detection efficiency. In contrast, Quantum\nTransfer Learning offers a more effective solution to this challenge. This\napproach is notably advantageous because it operates on heuristic principles,\nmaking it highly optimized for the task. Our proposed methodology leverages\nthis hybrid quantum transfer learning technique to detect DR. To construct our\nmodel, we utilize the APTOS 2019 Blindness Detection dataset, available on\nKaggle. We employ the ResNet-18, ResNet34, ResNet50, ResNet101, ResNet152 and\nInception V3, pre-trained classical neural networks, for the initial feature\nextraction. For the classification stage, we use a Variational Quantum\nClassifier. Our hybrid quantum model has shown remarkable results, achieving an\naccuracy of 97% for ResNet-18. This demonstrates that quantum computing, when\nintegrated with quantum machine learning, can perform tasks with a level of\npower and efficiency unattainable by classical computers alone. By harnessing\nthese advanced technologies, we can significantly improve the detection and\ndiagnosis of Diabetic Retinopathy, potentially saving many from the risk of\nblindness.\n Keywords: Diabetic Retinopathy, Quantum Transfer Learning, Deep Learning\n","authors":["Ankush Jain","Rinav Gupta","Jai Singhal"],"pdf_url":"https://arxiv.org/pdf/2405.01734v1.pdf","comment":"14 pages, 12 figures and 5 tables"},{"id":"http://arxiv.org/abs/2405.01726v1","updated":"2024-05-02T20:44:26Z","published":"2024-05-02T20:44:26Z","title":"SSUMamba: Spatial-Spectral Selective State Space Model for Hyperspectral\n Image Denoising","summary":" Denoising hyperspectral images (HSIs) is a crucial preprocessing procedure\ndue to the noise originating from intra-imaging mechanisms and environmental\nfactors. Utilizing domain-specific knowledge of HSIs, such as spectral\ncorrelation, spatial self-similarity, and spatial-spectral correlation, is\nessential for deep learning-based denoising. Existing methods are often\nconstrained by running time, space complexity, and computational complexity,\nemploying strategies that explore these priors separately. 
While the strategies\ncan avoid some redundant information, considering that hyperspectral images are\n3-D images with strong spatial continuity and spectral correlation, this kind\nof strategy inevitably overlooks subtle long-range spatial-spectral information\nthat positively impacts image restoration. This paper proposes a\nSpatial-Spectral Selective State Space Model-based U-shaped network, termed\nSpatial-Spectral U-Mamba (SSUMamba), for hyperspectral image denoising. We can\nobtain complete global spatial-spectral correlation within a module thanks to\nthe linear space complexity in State Space Model (SSM) computations. We\nintroduce an Alternating Scan (SSAS) strategy for HSI data, which helps model\nthe information flow in multiple directions in 3-D HSIs. Experimental results\ndemonstrate that our method outperforms several compared methods. The source\ncode will be available at https://github.com/lronkitty/SSUMamba.\n","authors":["Guanyiman Fu","Fengchao Xiong","Jianfeng Lu","Jun Zhou","Yuntao Qian"],"pdf_url":"https://arxiv.org/pdf/2405.01726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01725v1","updated":"2024-05-02T20:43:58Z","published":"2024-05-02T20:43:58Z","title":"Development of Skip Connection in Deep Neural Networks for Computer\n Vision and Medical Image Analysis: A Survey","summary":" Deep learning has made significant progress in computer vision, specifically\nin image classification, object detection, and semantic segmentation. The skip\nconnection has played an essential role in the architecture of deep neural\nnetworks,enabling easier optimization through residual learning during the\ntraining stage and improving accuracy during testing. Many neural networks have\ninherited the idea of residual learning with skip connections for various\ntasks, and it has been the standard choice for designing neural networks. This\nsurvey provides a comprehensive summary and outlook on the development of skip\nconnections in deep neural networks. The short history of skip connections is\noutlined, and the development of residual learning in deep neural networks is\nsurveyed. The effectiveness of skip connections in the training and testing\nstages is summarized, and future directions for using skip connections in\nresidual learning are discussed. Finally, we summarize seminal papers, source\ncode, models, and datasets that utilize skip connections in computer vision,\nincluding image classification, object detection, semantic segmentation, and\nimage reconstruction. We hope this survey could inspire peer researchers in the\ncommunity to develop further skip connections in various forms and tasks and\nthe theory of residual learning in deep neural networks. The project page can\nbe found at https://github.com/apple1986/Residual_Learning_For_Images\n","authors":["Guoping Xu","Xiaxia Wang","Xinglong Wu","Xuesong Leng","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01723v1","updated":"2024-05-02T20:42:17Z","published":"2024-05-02T20:42:17Z","title":"Zero-Shot Monocular Motion Segmentation in the Wild by Combining Deep\n Learning with Geometric Motion Model Fusion","summary":" Detecting and segmenting moving objects from a moving monocular camera is\nchallenging in the presence of unknown camera motion, diverse object motions\nand complex scene structures. 
Most existing methods rely on a single motion cue\nto perform motion segmentation, which is usually insufficient when facing\ndifferent complex environments. While a few recent deep learning based methods\nare able to combine multiple motion cues to achieve improved accuracy, they\ndepend heavily on vast datasets and extensive annotations, making them less\nadaptable to new scenarios. To address these limitations, we propose a novel\nmonocular dense segmentation method that achieves state-of-the-art motion\nsegmentation results in a zero-shot manner. The proposed method synergestically\ncombines the strengths of deep learning and geometric model fusion methods by\nperforming geometric model fusion on object proposals. Experiments show that\nour method achieves competitive results on several motion segmentation datasets\nand even surpasses some state-of-the-art supervised methods on certain\nbenchmarks, while not being trained on any data. We also present an ablation\nstudy to show the effectiveness of combining different geometric models\ntogether for motion segmentation, highlighting the value of our geometric model\nfusion strategy.\n","authors":["Yuxiang Huang","Yuhao Chen","John Zelek"],"pdf_url":"https://arxiv.org/pdf/2405.01723v1.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition Workshops (CVPRW)"},{"id":"http://arxiv.org/abs/2404.17699v2","updated":"2024-05-02T20:31:49Z","published":"2024-04-26T20:55:39Z","title":"Deep Learning for Melt Pool Depth Contour Prediction From Surface\n Thermal Images via Vision Transformers","summary":" Insufficient overlap between the melt pools produced during Laser Powder Bed\nFusion (L-PBF) can lead to lack-of-fusion defects and deteriorated mechanical\nand fatigue performance. In-situ monitoring of the melt pool subsurface\nmorphology requires specialized equipment that may not be readily accessible or\nscalable. Therefore, we introduce a machine learning framework to correlate\nin-situ two-color thermal images observed via high-speed color imaging to the\ntwo-dimensional profile of the melt pool cross-section. Specifically, we employ\na hybrid CNN-Transformer architecture to establish a correlation between single\nbead off-axis thermal image sequences and melt pool cross-section contours\nmeasured via optical microscopy. In this architecture, a ResNet model embeds\nthe spatial information contained within the thermal images to a latent vector,\nwhile a Transformer model correlates the sequence of embedded vectors to\nextract temporal information. Our framework is able to model the curvature of\nthe subsurface melt pool structure, with improved performance in high energy\ndensity regimes compared to analytical melt pool models. The performance of\nthis model is evaluated through dimensional and geometric comparisons to the\ncorresponding experimental melt pool observations.\n","authors":["Francis Ogoke","Peter Myung-Won Pak","Alexander Myers","Guadalupe Quirarte","Jack Beuth","Jonathan Malen","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2404.17699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.15000v3","updated":"2024-05-02T20:21:45Z","published":"2021-11-29T22:38:13Z","title":"Deformable ProtoPNet: An Interpretable Image Classifier Using Deformable\n Prototypes","summary":" We present a deformable prototypical part network (Deformable ProtoPNet), an\ninterpretable image classifier that integrates the power of deep learning and\nthe interpretability of case-based reasoning. 
This model classifies input\nimages by comparing them with prototypes learned during training, yielding\nexplanations in the form of \"this looks like that.\" However, while previous\nmethods use spatially rigid prototypes, we address this shortcoming by\nproposing spatially flexible prototypes. Each prototype is made up of several\nprototypical parts that adaptively change their relative spatial positions\ndepending on the input image. Consequently, a Deformable ProtoPNet can\nexplicitly capture pose variations and context, improving both model accuracy\nand the richness of explanations provided. Compared to other case-based\ninterpretable models using prototypes, our approach achieves state-of-the-art\naccuracy and gives an explanation with greater context. The code is available\nat https://github.com/jdonnelly36/Deformable-ProtoPNet.\n","authors":["Jon Donnelly","Alina Jade Barnett","Chaofan Chen"],"pdf_url":"https://arxiv.org/pdf/2111.15000v3.pdf","comment":"This was published in CVPR 2022"},{"id":"http://arxiv.org/abs/2405.01705v1","updated":"2024-05-02T20:03:19Z","published":"2024-05-02T20:03:19Z","title":"Long Tail Image Generation Through Feature Space Augmentation and\n Iterated Learning","summary":" Image and multimodal machine learning tasks are very challenging to solve in\nthe case of poorly distributed data. In particular, data availability and\nprivacy restrictions exacerbate these hurdles in the medical domain. The state\nof the art in image generation quality is held by Latent Diffusion models,\nmaking them prime candidates for tackling this problem. However, a few key\nissues still need to be solved, such as the difficulty in generating data from\nunder-represented classes and a slow inference process. To mitigate these\nissues, we propose a new method for image augmentation in long-tailed data\nbased on leveraging the rich latent space of pre-trained Stable Diffusion\nModels. We create a modified separable latent space to mix head and tail class\nexamples. We build this space via Iterated Learning of underlying sparsified\nembeddings, which we apply to task-specific saliency maps via a K-NN approach.\nCode is available at\nhttps://github.com/SugarFreeManatee/Feature-Space-Augmentation-and-Iterated-Learning\n","authors":["Rafael Elberg","Denis Parra","Mircea Petrache"],"pdf_url":"https://arxiv.org/pdf/2405.01705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01701v1","updated":"2024-05-02T19:53:56Z","published":"2024-05-02T19:53:56Z","title":"Active Learning Enabled Low-cost Cell Image Segmentation Using Bounding\n Box Annotation","summary":" Cell image segmentation is usually implemented using fully supervised deep\nlearning methods, which heavily rely on extensive annotated training data. Yet,\ndue to the complexity of cell morphology and the requirement for specialized\nknowledge, pixel-level annotation of cell images has become a highly\nlabor-intensive task. To address the above problems, we propose an active\nlearning framework for cell segmentation using bounding box annotations, which\ngreatly reduces the data annotation cost of cell segmentation algorithms.\nFirst, we generate a box-supervised learning method (denoted as YOLO-SAM) by\ncombining the YOLOv8 detector with the Segment Anything Model (SAM), which\neffectively reduces the complexity of data annotation. 
Furthermore, it is\nintegrated into an active learning framework that employs the MC DropBlock\nmethod to train the segmentation model with fewer box-annotated samples.\nExtensive experiments demonstrate that our model saves more than ninety percent\nof data annotation time compared to mask-supervised deep learning methods.\n","authors":["Yu Zhu","Qiang Yang","Li Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01699v1","updated":"2024-05-02T19:47:08Z","published":"2024-05-02T19:47:08Z","title":"SOAR: Advancements in Small Body Object Detection for Aerial Imagery\n Using State Space Models and Programmable Gradients","summary":" Small object detection in aerial imagery presents significant challenges in\ncomputer vision due to the minimal data inherent in small-sized objects and\ntheir propensity to be obscured by larger objects and background noise.\nTraditional methods using transformer-based models often face limitations\nstemming from the lack of specialized databases, which adversely affect their\nperformance with objects of varying orientations and scales. This underscores\nthe need for more adaptable, lightweight models. In response, this paper\nintroduces two innovative approaches that significantly enhance detection and\nsegmentation capabilities for small aerial objects. Firstly, we explore the use\nof the SAHI framework on the newly introduced lightweight YOLO v9 architecture,\nwhich utilizes Programmable Gradient Information (PGI) to reduce the\nsubstantial information loss typically encountered in sequential feature\nextraction processes. The paper employs the Vision Mamba model, which\nincorporates position embeddings to facilitate precise location-aware visual\nunderstanding, combined with a novel bidirectional State Space Model (SSM) for\neffective visual context modeling. This State Space Model adeptly harnesses the\nlinear complexity of CNNs and the global receptive field of Transformers,\nmaking it particularly effective in remote sensing image classification. Our\nexperimental results demonstrate substantial improvements in detection accuracy\nand processing efficiency, validating the applicability of these approaches for\nreal-time small object detection across diverse aerial scenarios. This paper\nalso discusses how these methodologies could serve as foundational models for\nfuture advancements in aerial object recognition technologies. The source code\nwill be made accessible here.\n","authors":["Tushar Verma","Jyotsna Singh","Yash Bhartari","Rishi Jarwal","Suraj Singh","Shubhkarman Singh"],"pdf_url":"https://arxiv.org/pdf/2405.01699v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.01691v1","updated":"2024-05-02T19:27:28Z","published":"2024-05-02T19:27:28Z","title":"Language-Enhanced Latent Representations for Out-of-Distribution\n Detection in Autonomous Driving","summary":" Out-of-distribution (OOD) detection is essential in autonomous driving, to\ndetermine when learning-based components encounter unexpected inputs.\nTraditional detectors typically use encoder models with fixed settings, thus\nlacking effective human interaction capabilities. With the rise of large\nfoundation models, multimodal inputs offer the possibility of taking human\nlanguage as a latent representation, thus enabling language-defined OOD\ndetection. 
In this paper, we use the cosine similarity of image and text\nrepresentations encoded by the multimodal model CLIP as a new representation to\nimprove the transparency and controllability of latent encodings used for\nvisual anomaly detection. We compare our approach with existing pre-trained\nencoders that can only produce latent representations that are meaningless from\nthe user's standpoint. Our experiments on realistic driving data show that the\nlanguage-based latent representation performs better than the traditional\nrepresentation of the vision encoder and helps improve the detection\nperformance when combined with standard representations.\n","authors":["Zhenjiang Mao","Dong-You Jhong","Ao Wang","Ivan Ruchkin"],"pdf_url":"https://arxiv.org/pdf/2405.01691v1.pdf","comment":"Presented at the Robot Trust for Symbiotic Societies (RTSS) Workshop,\n co-located with ICRA 2024"},{"id":"http://arxiv.org/abs/2311.18763v2","updated":"2024-05-02T19:24:23Z","published":"2023-11-30T18:04:21Z","title":"Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters","summary":" Recent work has demonstrated a remarkable ability to customize text-to-image\ndiffusion models to multiple, fine-grained concepts in a sequential (i.e.,\ncontinual) manner while only providing a few example images for each concept.\nThis setting is known as continual diffusion. Here, we ask the question: Can we\nscale these methods to longer concept sequences without forgetting? Although\nprior work mitigates the forgetting of previously learned concepts, we show\nthat its capacity to learn new tasks reaches saturation over longer sequences.\nWe address this challenge by introducing a novel method, STack-And-Mask\nINcremental Adapters (STAMINA), which is composed of low-ranked\nattention-masked adapters and customized MLP tokens. STAMINA is designed to\nenhance the robust fine-tuning properties of LoRA for sequential concept\nlearning via learnable hard-attention masks parameterized with low rank MLPs,\nenabling precise, scalable learning via sparse adaptation. Notably, all\nintroduced trainable parameters can be folded back into the model after\ntraining, inducing no additional inference parameter costs. We show that\nSTAMINA outperforms the prior SOTA for the setting of text-to-image continual\ncustomization on a 50-concept benchmark composed of landmarks and human faces,\nwith no stored replay data. Additionally, we extended our method to the setting\nof continual learning for image classification, demonstrating that our gains\nalso translate to state-of-the-art performance in this standard benchmark.\n","authors":["James Seale Smith","Yen-Chang Hsu","Zsolt Kira","Yilin Shen","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2311.18763v2.pdf","comment":"CVPR-W 2024"},{"id":"http://arxiv.org/abs/2405.01688v1","updated":"2024-05-02T19:22:39Z","published":"2024-05-02T19:22:39Z","title":"Adapting Self-Supervised Learning for Computational Pathology","summary":" Self-supervised learning (SSL) has emerged as a key technique for training\nnetworks that can generalize well to diverse tasks without task-specific\nsupervision. This property makes SSL desirable for computational pathology, the\nstudy of digitized images of tissues, as there are many target applications and\noften limited labeled training samples. However, SSL algorithms and models have\nbeen primarily developed in the field of natural images and whether their\nperformance can be improved by adaptation to particular domains remains an open\nquestion. 
In this work, we present an investigation of modifications to SSL for\npathology data, specifically focusing on the DINOv2 algorithm. We propose\nalternative augmentations, regularization functions, and position encodings\nmotivated by the characteristics of pathology images. We evaluate the impact of\nthese changes on several benchmarks to demonstrate the value of tailored\napproaches.\n","authors":["Eric Zimmermann","Neil Tenenholtz","James Hall","George Shaikovski","Michal Zelechowski","Adam Casson","Fausto Milletari","Julian Viret","Eugene Vorontsov","Siqi Liu","Kristen Severson"],"pdf_url":"https://arxiv.org/pdf/2405.01688v1.pdf","comment":"Presented at DCA in MI Workshop, CVPR 2024"},{"id":"http://arxiv.org/abs/2405.01673v1","updated":"2024-05-02T18:59:53Z","published":"2024-05-02T18:59:53Z","title":"ShadowNav: Autonomous Global Localization for Lunar Navigation in\n Darkness","summary":" The ability to determine the pose of a rover in an inertial frame\nautonomously is a crucial capability necessary for the next generation of\nsurface rover missions on other planetary bodies. Currently, most on-going\nrover missions utilize ground-in-the-loop interventions to manually correct for\ndrift in the pose estimate and this human supervision bottlenecks the distance\nover which rovers can operate autonomously and carry out scientific\nmeasurements. In this paper, we present ShadowNav, an autonomous approach for\nglobal localization on the Moon with an emphasis on driving in darkness and at\nnighttime. Our approach uses the leading edge of Lunar craters as landmarks and\na particle filtering approach is used to associate detected craters with known\nones on an offboard map. We discuss the key design decisions in developing the\nShadowNav framework for use with a Lunar rover concept equipped with a stereo\ncamera and an external illumination source. Finally, we demonstrate the\nefficacy of our proposed approach in both a Lunar simulation environment and on\ndata collected during a field test at Cinder Lakes, Arizona.\n","authors":["Deegan Atha","R. Michael Swan","Abhishek Cauligi","Anne Bettens","Edwin Goh","Dima Kogan","Larry Matthies","Masahiro Ono"],"pdf_url":"https://arxiv.org/pdf/2405.01673v1.pdf","comment":"21 pages, 13 figures"},{"id":"http://arxiv.org/abs/2405.01662v1","updated":"2024-05-02T18:33:02Z","published":"2024-05-02T18:33:02Z","title":"Out-of-distribution detection based on subspace projection of\n high-dimensional features output by the last convolutional layer","summary":" Out-of-distribution (OOD) detection, crucial for reliable pattern\nclassification, discerns whether a sample originates outside the training\ndistribution. This paper concentrates on the high-dimensional features output\nby the final convolutional layer, which contain rich image features. Our key\nidea is to project these high-dimensional features into two specific feature\nsubspaces, leveraging the dimensionality reduction capacity of the network's\nlinear layers, trained with Predefined Evenly-Distribution Class Centroids\n(PEDCC)-Loss. This involves calculating the cosines of three projection angles\nand the norm values of features, thereby identifying distinctive information\nfor in-distribution (ID) and OOD data, which assists in OOD detection. Building\nupon this, we have modified the batch normalization (BN) and ReLU layer\npreceding the fully connected layer, diminishing their impact on the output\nfeature distributions and thereby widening the distribution gap between ID and\nOOD data features. 
Our method requires only the training of the classification\nnetwork model, eschewing any need for input pre-processing or specific OOD data\npre-tuning. Extensive experiments on several benchmark datasets demonstrates\nthat our approach delivers state-of-the-art performance. Our code is available\nat https://github.com/Hewell0/ProjOOD.\n","authors":["Qiuyu Zhu","Yiwei He"],"pdf_url":"https://arxiv.org/pdf/2405.01662v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.01661v1","updated":"2024-05-02T18:31:47Z","published":"2024-05-02T18:31:47Z","title":"When a Relation Tells More Than a Concept: Exploring and Evaluating\n Classifier Decisions with CoReX","summary":" Explanations for Convolutional Neural Networks (CNNs) based on relevance of\ninput pixels might be too unspecific to evaluate which and how input features\nimpact model decisions. Especially in complex real-world domains like\nbiomedicine, the presence of specific concepts (e.g., a certain type of cell)\nand of relations between concepts (e.g., one cell type is next to another)\nmight be discriminative between classes (e.g., different types of tissue).\nPixel relevance is not expressive enough to convey this type of information. In\nconsequence, model evaluation is limited and relevant aspects present in the\ndata and influencing the model decisions might be overlooked. This work\npresents a novel method to explain and evaluate CNN models, which uses a\nconcept- and relation-based explainer (CoReX). It explains the predictive\nbehavior of a model on a set of images by masking (ir-)relevant concepts from\nthe decision-making process and by constraining relations in a learned\ninterpretable surrogate model. We test our approach with several image data\nsets and CNN architectures. Results show that CoReX explanations are faithful\nto the CNN model in terms of predictive outcomes. We further demonstrate that\nCoReX is a suitable tool for evaluating CNNs supporting identification and\nre-classification of incorrect or ambiguous classifications.\n","authors":["Bettina Finzel","Patrick Hilme","Johannes Rabold","Ute Schmid"],"pdf_url":"https://arxiv.org/pdf/2405.01661v1.pdf","comment":"preliminary version, submitted to Machine Learning"},{"id":"http://arxiv.org/abs/2405.01658v1","updated":"2024-05-02T18:29:05Z","published":"2024-05-02T18:29:05Z","title":"MMIST-ccRCC: A Real World Medical Dataset for the Development of\n Multi-Modal Systems","summary":" The acquisition of different data modalities can enhance our knowledge and\nunderstanding of various diseases, paving the way for a more personalized\nhealthcare. Thus, medicine is progressively moving towards the generation of\nmassive amounts of multi-modal data (\\emph{e.g,} molecular, radiology, and\nhistopathology). While this may seem like an ideal environment to capitalize\ndata-centric machine learning approaches, most methods still focus on exploring\na single or a pair of modalities due to a variety of reasons: i) lack of ready\nto use curated datasets; ii) difficulty in identifying the best multi-modal\nfusion strategy; and iii) missing modalities across patients. In this paper we\nintroduce a real world multi-modal dataset called MMIST-CCRCC that comprises 2\nradiology modalities (CT and MRI), histopathology, genomics, and clinical data\nfrom 618 patients with clear cell renal cell carcinoma (ccRCC). 
We provide\nsingle and multi-modal (early and late fusion) benchmarks in the task of\n12-month survival prediction in the challenging scenario of one or more missing\nmodalities for each patient, with missing rates that range from 26$\\%$ for\ngenomics data to more than 90$\\%$ for MRI. We show that even with such severe\nmissing rates the fusion of modalities leads to improvements in the survival\nforecasting. Additionally, incorporating a strategy to generate the latent\nrepresentations of the missing modalities given the available ones further\nimproves the performance, highlighting a potential complementarity across\nmodalities. Our dataset and code are available here:\nhttps://multi-modal-ist.github.io/datasets/ccRCC\n","authors":["Tiago Mota","M. Rita Verdelho","Alceu Bissoto","Carlos Santiago","Catarina Barata"],"pdf_url":"https://arxiv.org/pdf/2405.01658v1.pdf","comment":"Accepted in DCA in MI Workshop@CVPR2024"},{"id":"http://arxiv.org/abs/2405.01656v1","updated":"2024-05-02T18:26:15Z","published":"2024-05-02T18:26:15Z","title":"S4: Self-Supervised Sensing Across the Spectrum","summary":" Satellite image time series (SITS) segmentation is crucial for many\napplications like environmental monitoring, land cover mapping and agricultural\ncrop type classification. However, training models for SITS segmentation\nremains a challenging task due to the lack of abundant training data, which\nrequires fine grained annotation. We propose S4 a new self-supervised\npre-training approach that significantly reduces the requirement for labeled\ntraining data by utilizing two new insights: (a) Satellites capture images in\ndifferent parts of the spectrum such as radio frequencies, and visible\nfrequencies. (b) Satellite imagery is geo-registered allowing for fine-grained\nspatial alignment. We use these insights to formulate pre-training tasks in S4.\nWe also curate m2s2-SITS, a large-scale dataset of unlabeled,\nspatially-aligned, multi-modal and geographic specific SITS that serves as\nrepresentative pre-training data for S4. Finally, we evaluate S4 on multiple\nSITS segmentation datasets and demonstrate its efficacy against competing\nbaselines while using limited labeled data.\n","authors":["Jayanth Shenoy","Xinjian Davis Zhang","Shlok Mehrotra","Bill Tao","Rem Yang","Han Zhao","Deepak Vasisht"],"pdf_url":"https://arxiv.org/pdf/2405.01656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01654v1","updated":"2024-05-02T18:21:25Z","published":"2024-05-02T18:21:25Z","title":"Key Patches Are All You Need: A Multiple Instance Learning Framework For\n Robust Medical Diagnosis","summary":" Deep learning models have revolutionized the field of medical image analysis,\ndue to their outstanding performances. However, they are sensitive to spurious\ncorrelations, often taking advantage of dataset bias to improve results for\nin-domain data, but jeopardizing their generalization capabilities. In this\npaper, we propose to limit the amount of information these models use to reach\nthe final classification, by using a multiple instance learning (MIL)\nframework. MIL forces the model to use only a (small) subset of patches in the\nimage, identifying discriminative regions. This mimics the clinical procedures,\nwhere medical decisions are based on localized findings. We evaluate our\nframework on two medical applications: skin cancer diagnosis using dermoscopy\nand breast cancer diagnosis using mammography. 
Our results show that using only\na subset of the patches does not compromise diagnostic performance for\nin-domain data, compared to the baseline approaches. However, our approach is\nmore robust to shifts in patient demographics, while also providing more\ndetailed explanations about which regions contributed to the decision. Code is\navailable at: https://github.com/diogojpa99/MedicalMultiple-Instance-Learning.\n","authors":["Diogo J. Araújo","M. Rita Verdelho","Alceu Bissoto","Jacinto C. Nascimento","Carlos Santiago","Catarina Barata"],"pdf_url":"https://arxiv.org/pdf/2405.01654v1.pdf","comment":"Accepted in DEF-AI-MIA Workshop@CVPR 2024"},{"id":"http://arxiv.org/abs/2405.01646v1","updated":"2024-05-02T18:06:48Z","published":"2024-05-02T18:06:48Z","title":"Explaining models relating objects and privacy","summary":" Accurately predicting whether an image is private before sharing it online is\ndifficult due to the vast variety of content and the subjective nature of\nprivacy itself. In this paper, we evaluate privacy models that use objects\nextracted from an image to determine why the image is predicted as private. To\nexplain the decision of these models, we use feature-attribution to identify\nand quantify which objects (and which of their features) are more relevant to\nprivacy classification with respect to a reference input (i.e., no objects\nlocalised in an image) predicted as public. We show that the presence of the\nperson category and its cardinality is the main factor for the privacy\ndecision. Therefore, these models mostly fail to identify private images\ndepicting documents with sensitive data, vehicle ownership, and internet\nactivity, or public images with people (e.g., an outdoor concert or people\nwalking in a public space next to a famous landmark). As baselines for future\nbenchmarks, we also devise two strategies that are based on the person presence\nand cardinality and achieve comparable classification performance of the\nprivacy models.\n","authors":["Alessio Xompero","Myriam Bontonou","Jean-Michel Arbona","Emmanouil Benetos","Andrea Cavallaro"],"pdf_url":"https://arxiv.org/pdf/2405.01646v1.pdf","comment":"7 pages, 3 figures, 1 table, supplementary material included as\n Appendix. Paper accepted at the 3rd XAI4CV Workshop at CVPR 2024. Code:\n https://github.com/graphnex/ig-privacy"},{"id":"http://arxiv.org/abs/2405.01644v1","updated":"2024-05-02T18:05:37Z","published":"2024-05-02T18:05:37Z","title":"A Classification-Based Adaptive Segmentation Pipeline: Feasibility Study\n Using Polycystic Liver Disease and Metastases from Colorectal Cancer CT\n Images","summary":" Automated segmentation tools often encounter accuracy and adaptability issues\nwhen applied to images of different pathology. The purpose of this study is to\nexplore the feasibility of building a workflow to efficiently route images to\nspecifically trained segmentation models. By implementing a deep learning\nclassifier to automatically classify the images and route them to appropriate\nsegmentation models, we hope that our workflow can segment the images with\ndifferent pathology accurately. The data we used in this study are 350 CT\nimages from patients affected by polycystic liver disease and 350 CT images\nfrom patients presenting with liver metastases from colorectal cancer. All\nimages had the liver manually segmented by trained imaging analysts. 
Our\nproposed adaptive segmentation workflow achieved a statistically significant\nimprovement for the task of total liver segmentation compared to the generic\nsingle segmentation model (non-parametric Wilcoxon signed rank test, n=100,\np-value << 0.001). This approach is applicable in a wide range of scenarios and\nshould prove useful in clinical implementations of segmentation pipelines.\n","authors":["Peilong Wang","Timothy L. Kline","Andy D. Missert","Cole J. Cook","Matthew R. Callstrom","Alex Chan","Robert P. Hartman","Zachary S. Kelm","Panagiotis Korfiatis"],"pdf_url":"https://arxiv.org/pdf/2405.01644v1.pdf","comment":"J Digit Imaging. Inform. med. (2024)"},{"id":"http://arxiv.org/abs/2405.01636v1","updated":"2024-05-02T18:00:25Z","published":"2024-05-02T18:00:25Z","title":"Explainable AI (XAI) in Image Segmentation in Medicine, Industry, and\n Beyond: A Survey","summary":" Artificial Intelligence (XAI) has found numerous applications in computer\nvision. While image classification-based explainability techniques have\ngarnered significant attention, their counterparts in semantic segmentation\nhave been relatively neglected. Given the prevalent use of image segmentation,\nranging from medical to industrial deployments, these techniques warrant a\nsystematic look. In this paper, we present the first comprehensive survey on\nXAI in semantic image segmentation. This work focuses on techniques that were\neither specifically introduced for dense prediction tasks or were extended for\nthem by modifying existing methods in classification. We analyze and categorize\nthe literature based on application categories and domains, as well as the\nevaluation metrics and datasets used. We also propose a taxonomy for\ninterpretable semantic segmentation, and discuss potential challenges and\nfuture research directions.\n","authors":["Rokas Gipiškis","Chun-Wei Tsai","Olga Kurasova"],"pdf_url":"https://arxiv.org/pdf/2405.01636v1.pdf","comment":"35 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.01607v1","updated":"2024-05-02T04:53:42Z","published":"2024-05-02T04:53:42Z","title":"Wildfire Risk Prediction: A Review","summary":" Wildfires have significant impacts on global vegetation, wildlife, and\nhumans. They destroy plant communities and wildlife habitats and contribute to\nincreased emissions of carbon dioxide, nitrogen oxides, methane, and other\npollutants. The prediction of wildfires relies on various independent variables\ncombined with regression or machine learning methods. In this technical review,\nwe describe the options for independent variables, data processing techniques,\nmodels, independent variables collinearity and importance estimation methods,\nand model performance evaluation metrics. First, we divide the independent\nvariables into 4 aspects, including climate and meteorology conditions,\nsocio-economical factors, terrain and hydrological features, and wildfire\nhistorical records. Second, preprocessing methods are described for different\nmagnitudes, different spatial-temporal resolutions, and different formats of\ndata. Third, the collinearity and importance evaluation methods of independent\nvariables are also considered. Fourth, we discuss the application of\nstatistical models, traditional machine learning models, and deep learning\nmodels in wildfire risk prediction. In this subsection, compared with other\nreviews, this manuscript particularly discusses the evaluation metrics and\nrecent advancements in deep learning methods. 
Lastly, addressing the\nlimitations of current research, this paper emphasizes the need for more\neffective deep learning time series forecasting algorithms, the utilization of\nthree-dimensional data including ground and trunk fuel, extraction of more\naccurate historical fire point data, and improved model evaluation metrics.\n","authors":["Zhengsen Xu","Jonathan Li","Linlin Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03702v1","updated":"2024-05-02T23:53:29Z","published":"2024-05-02T23:53:29Z","title":"Leafy Spurge Dataset: Real-world Weed Classification Within Aerial Drone\n Imagery","summary":" Invasive plant species are detrimental to the ecology of both agricultural\nand wildland areas. Euphorbia esula, or leafy spurge, is one such plant that\nhas spread through much of North America from Eastern Europe. When paired with\ncontemporary computer vision systems, unmanned aerial vehicles, or drones,\noffer the means to track expansion of problem plants, such as leafy spurge, and\nimprove chances of controlling these weeds. We gathered a dataset of leafy\nspurge presence and absence in grasslands of western Montana, USA, then\nsurveyed these areas with a commercial drone. We trained image classifiers on\nthese data, and our best performing model, a pre-trained DINOv2 vision\ntransformer, identified leafy spurge with 0.84 accuracy (test set). This result\nindicates that classification of leafy spurge is tractable, but not solved. We\nrelease this unique dataset of labelled and unlabelled, aerial drone imagery\nfor the machine learning community to explore. Improving classification\nperformance of leafy spurge would benefit the fields of ecology, conservation,\nand remote sensing alike. Code and data are available at our website:\nleafy-spurge-dataset.github.io.\n","authors":["Kyle Doherty","Max Gurinas","Erik Samsoe","Charles Casper","Beau Larkin","Philip Ramsey","Brandon Trabucco","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2405.03702v1.pdf","comment":"Official Dataset Technical Report. Used in DA-Fusion\n (arXiv:2302.07944)"}]},"2024-05-03T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.02287v1","updated":"2024-05-03T17:59:55Z","published":"2024-05-03T17:59:55Z","title":"Vibe-Eval: A hard evaluation suite for measuring progress of multimodal\n language models","summary":" We introduce Vibe-Eval: a new open benchmark and framework for evaluating\nmultimodal chat models. Vibe-Eval consists of 269 visual understanding prompts,\nincluding 100 of hard difficulty, complete with gold-standard responses\nauthored by experts. Vibe-Eval is open-ended and challenging with dual\nobjectives: (i) vibe checking multimodal chat models for day-to-day tasks and\n(ii) rigorously testing and probing the capabilities of present frontier\nmodels. Notably, our hard set contains >50% questions that all frontier models\nanswer incorrectly. We explore the nuances of designing, evaluating, and\nranking models on ultra challenging prompts. We also discuss trade-offs between\nhuman and automatic evaluation, and show that automatic model evaluation using\nReka Core roughly correlates to human judgment. We offer free API access for\nthe purpose of lightweight evaluation and plan to conduct formal human\nevaluations for public models that perform well on the Vibe-Eval's automatic\nscores. 
We release the evaluation code and data, see\nhttps://github.com/reka-ai/reka-vibe-eval\n","authors":["Piotr Padlewski","Max Bain","Matthew Henderson","Zhongkai Zhu","Nishant Relan","Hai Pham","Donovan Ong","Kaloyan Aleksiev","Aitor Ormazabal","Samuel Phua","Ethan Yeo","Eugenie Lamprecht","Qi Liu","Yuqi Wang","Eric Chen","Deyu Fu","Lei Li","Che Zheng","Cyprien de Masson d'Autume","Dani Yogatama","Mikel Artetxe","Yi Tay"],"pdf_url":"https://arxiv.org/pdf/2405.02287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02280v1","updated":"2024-05-03T17:55:34Z","published":"2024-05-03T17:55:34Z","title":"DreamScene4D: Dynamic Multi-Object Scene Generation from Monocular\n Videos","summary":" Existing VLMs can track in-the-wild 2D video objects while current generative\nmodels provide powerful visual priors for synthesizing novel views for the\nhighly under-constrained 2D-to-3D object lifting. Building upon this exciting\nprogress, we present DreamScene4D, the first approach that can generate\nthree-dimensional dynamic scenes of multiple objects from monocular in-the-wild\nvideos with large object motion across occlusions and novel viewpoints. Our key\ninsight is to design a \"decompose-then-recompose\" scheme to factorize both the\nwhole video scene and each object's 3D motion. We first decompose the video\nscene by using open-vocabulary mask trackers and an adapted image diffusion\nmodel to segment, track, and amodally complete the objects and background in\nthe video. Each object track is mapped to a set of 3D Gaussians that deform and\nmove in space and time. We also factorize the observed motion into multiple\ncomponents to handle fast motion. The camera motion can be inferred by\nre-rendering the background to match the video frames. For the object motion,\nwe first model the object-centric deformation of the objects by leveraging\nrendering losses and multi-view generative priors in an object-centric frame,\nthen optimize object-centric to world-frame transformations by comparing the\nrendered outputs against the perceived pixel and optical flow. Finally, we\nrecompose the background and objects and optimize for relative object scales\nusing monocular depth prediction guidance. We show extensive results on the\nchallenging DAVIS, Kubric, and self-captured videos, detail some limitations,\nand provide future directions. Besides 4D scene generation, our results show\nthat DreamScene4D enables accurate 2D point motion tracking by projecting the\ninferred 3D trajectories to 2D, while never explicitly trained to do so.\n","authors":["Wen-Hsuan Chu","Lei Ke","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2405.02280v1.pdf","comment":"Project page: https://dreamscene4d.github.io/"},{"id":"http://arxiv.org/abs/2405.02266v1","updated":"2024-05-03T17:34:02Z","published":"2024-05-03T17:34:02Z","title":"On the test-time zero-shot generalization of vision-language models: Do\n we really need prompt learning?","summary":" The development of large vision-language models, notably CLIP, has catalyzed\nresearch into effective adaptation techniques, with a particular focus on soft\nprompt tuning. Conjointly, test-time augmentation, which utilizes multiple\naugmented views of a single image to enhance zero-shot generalization, is\nemerging as a significant area of interest. This has predominantly directed\nresearch efforts toward test-time prompt tuning. 
In contrast, we introduce a\nrobust MeanShift for Test-time Augmentation (MTA), which surpasses prompt-based\nmethods without requiring this intensive training procedure. This positions MTA\nas an ideal solution for both standalone and API-based applications.\nAdditionally, our method does not rely on ad hoc rules (e.g., confidence\nthreshold) used in some previous test-time augmentation techniques to filter\nthe augmented views. Instead, MTA incorporates a quality assessment variable\nfor each view directly into its optimization process, termed as the inlierness\nscore. This score is jointly optimized with a density mode seeking process,\nleading to an efficient training- and hyperparameter-free approach. We\nextensively benchmark our method on 15 datasets and demonstrate MTA's\nsuperiority and computational efficiency. Deployed easily as plug-and-play\nmodule on top of zero-shot models and state-of-the-art few-shot methods, MTA\nshows systematic and consistent improvements.\n","authors":["Maxime Zanella","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2405.02266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02246v1","updated":"2024-05-03T17:00:00Z","published":"2024-05-03T17:00:00Z","title":"What matters when building vision-language models?","summary":" The growing interest in vision-language models (VLMs) has been driven by\nimprovements in large language models and vision transformers. Despite the\nabundance of literature on this subject, we observe that critical decisions\nregarding the design of VLMs are often not justified. We argue that these\nunsupported decisions impede progress in the field by making it difficult to\nidentify which choices improve model performance. To address this issue, we\nconduct extensive experiments around pre-trained models, architecture choice,\ndata, and training methods. Our consolidation of findings includes the\ndevelopment of Idefics2, an efficient foundational VLM of 8 billion parameters.\nIdefics2 achieves state-of-the-art performance within its size category across\nvarious multimodal benchmarks, and is often on par with models four times its\nsize. We release the model (base, instructed, and chat) along with the datasets\ncreated for its training.\n","authors":["Hugo Laurençon","Léo Tronchon","Matthieu Cord","Victor Sanh"],"pdf_url":"https://arxiv.org/pdf/2405.02246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06733v4","updated":"2024-05-03T16:46:57Z","published":"2023-12-11T10:43:28Z","title":"TULIP: Transformer for Upsampling of LiDAR Point Clouds","summary":" LiDAR Upsampling is a challenging task for the perception systems of robots\nand autonomous vehicles, due to the sparse and irregular structure of\nlarge-scale scene contexts. Recent works propose to solve this problem by\nconverting LiDAR data from 3D Euclidean space into an image super-resolution\nproblem in 2D image space. Although their methods can generate high-resolution\nrange images with fine-grained details, the resulting 3D point clouds often\nblur out details and predict invalid points. In this paper, we propose TULIP, a\nnew method to reconstruct high-resolution LiDAR point clouds from\nlow-resolution LiDAR input. We also follow a range image-based approach but\nspecifically modify the patch and window geometries of a Swin-Transformer-based\nnetwork to better fit the characteristics of range images. We conducted several\nexperiments on three public real-world and simulated datasets. 
TULIP\noutperforms state-of-the-art methods in all relevant metrics and generates\nrobust and more realistic point clouds than prior works.\n","authors":["Bin Yang","Patrick Pfreundschuh","Roland Siegwart","Marco Hutter","Peyman Moghadam","Vaishakh Patil"],"pdf_url":"https://arxiv.org/pdf/2312.06733v4.pdf","comment":"The paper was accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2405.01461v2","updated":"2024-05-03T16:35:44Z","published":"2024-05-02T16:50:41Z","title":"SATO: Stable Text-to-Motion Framework","summary":" Is the Text to Motion model robust? Recent advancements in Text to Motion\nmodels primarily stem from more accurate predictions of specific actions.\nHowever, the text modality typically relies solely on pre-trained Contrastive\nLanguage-Image Pretraining (CLIP) models. Our research has uncovered a\nsignificant issue with the text-to-motion model: its predictions often exhibit\ninconsistent outputs, resulting in vastly different or even incorrect poses\nwhen presented with semantically similar or identical text inputs. In this\npaper, we undertake an analysis to elucidate the underlying causes of this\ninstability, establishing a clear link between the unpredictability of model\noutputs and the erratic attention patterns of the text encoder module.\nConsequently, we introduce a formal framework aimed at addressing this issue,\nwhich we term the Stable Text-to-Motion Framework (SATO). SATO consists of\nthree modules, each dedicated to stable attention, stable prediction, and\nmaintaining a balance between accuracy and robustness trade-off. We present a\nmethodology for constructing an SATO that satisfies the stability of attention\nand prediction. To verify the stability of the model, we introduced a new\ntextual synonym perturbation dataset based on HumanML3D and KIT-ML. Results\nshow that SATO is significantly more stable against synonyms and other slight\nperturbations while keeping its high accuracy performance.\n","authors":["Wenshuo Chen","Hongru Xiao","Erhang Zhang","Lijie Hu","Lei Wang","Mengyuan Liu","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02220v1","updated":"2024-05-03T16:27:39Z","published":"2024-05-03T16:27:39Z","title":"Designed Dithering Sign Activation for Binary Neural Networks","summary":" Binary Neural Networks emerged as a cost-effective and energy-efficient\nsolution for computer vision tasks by binarizing either network weights or\nactivations. However, common binary activations, such as the Sign activation\nfunction, abruptly binarize the values with a single threshold, losing\nfine-grained details in the feature outputs. This work proposes an activation\nthat applies multiple thresholds following dithering principles, shifting the\nSign activation function for each pixel according to a spatially periodic\nthreshold kernel. Unlike literature methods, the shifting is defined jointly\nfor a set of adjacent pixels, taking advantage of spatial correlations.\nExperiments over the classification task demonstrate the effectiveness of the\ndesigned dithering Sign activation function as an alternative activation for\nbinary neural networks, without increasing the computational cost. 
Further,\nDeSign balances the preservation of details with the efficiency of binary\noperations.\n","authors":["Brayan Monroy","Juan Estupiñan","Tatiana Gelvez-Barrera","Jorge Bacca","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2405.02220v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2310.05336v2","updated":"2024-05-03T16:23:58Z","published":"2023-10-09T01:44:06Z","title":"GReAT: A Graph Regularized Adversarial Training Method","summary":" This paper presents GReAT (Graph Regularized Adversarial Training), a novel\nregularization method designed to enhance the robust classification performance\nof deep learning models. Adversarial examples, characterized by subtle\nperturbations that can mislead models, pose a significant challenge in machine\nlearning. Although adversarial training is effective in defending against such\nattacks, it often overlooks the underlying data structure. In response, GReAT\nintegrates graph based regularization into the adversarial training process,\nleveraging the data's inherent structure to enhance model robustness. By\nincorporating graph information during training, GReAT defends against\nadversarial attacks and improves generalization to unseen data. Extensive\nevaluations on benchmark datasets demonstrate that GReAT outperforms state of\nthe art methods in robustness, achieving notable improvements in classification\naccuracy. Specifically, compared to the second best methods, GReAT achieves a\nperformance increase of approximately 4.87% for CIFAR10 against FGSM attack and\n10.57% for SVHN against FGSM attack. Additionally, for CIFAR10, GReAT\ndemonstrates a performance increase of approximately 11.05% against PGD attack,\nand for SVHN, a 5.54% increase against PGD attack. This paper provides detailed\ninsights into the proposed methodology, including numerical results and\ncomparisons with existing approaches, highlighting the significant impact of\nGReAT in advancing the performance of deep learning models.\n","authors":["Samet Bayram","Kenneth Barner"],"pdf_url":"https://arxiv.org/pdf/2310.05336v2.pdf","comment":"25 pages including references. 7 figures and 6 tables"},{"id":"http://arxiv.org/abs/2405.02218v1","updated":"2024-05-03T16:23:41Z","published":"2024-05-03T16:23:41Z","title":"Multispectral Fine-Grained Classification of Blackgrass in Wheat and\n Barley Crops","summary":" As the burden of herbicide resistance grows and the environmental\nrepercussions of excessive herbicide use become clear, new ways of managing\nweed populations are needed. This is particularly true for cereal crops, like\nwheat and barley, that are staple food crops and occupy a globally significant\nportion of agricultural land. Even small improvements in weed management\npractices across these major food crops worldwide would yield considerable\nbenefits for both the environment and global food security. Blackgrass is a\nmajor grass weed which causes particular problems in cereal crops in north-west\nEurope, a major cereal production area, because it has high levels of\nherbicide resistance and is well adapted to agronomic practice in this region.\nWith the use of machine vision and multispectral imaging, we investigate the\neffectiveness of state-of-the-art methods to identify blackgrass in wheat and\nbarley crops. As part of this work, we provide a large dataset with which we\nevaluate several key aspects of blackgrass weed recognition. 
Firstly, we\ndetermine the performance of different CNN and transformer-based architectures\non images from unseen fields. Secondly, we demonstrate the role that different\nspectral bands have on the performance of weed classification. Lastly, we\nevaluate the role of dataset size in classification performance for each of the\nmodels trialled. We find that even with a fairly modest quantity of training\ndata an accuracy of almost 90% can be achieved on images from unseen fields.\n","authors":["Madeleine Darbyshire","Shaun Coutts","Eleanor Hammond","Fazilet Gokbudak","Cengiz Oztireli","Petra Bosilj","Junfeng Gao","Elizabeth Sklar","Simon Parsons"],"pdf_url":"https://arxiv.org/pdf/2405.02218v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.01524v2","updated":"2024-05-03T16:03:57Z","published":"2024-05-02T17:54:35Z","title":"A separability-based approach to quantifying generalization: which layer\n is best?","summary":" Generalization to unseen data remains poorly understood for deep learning\nclassification and foundation models. How can one assess the ability of\nnetworks to adapt to new or extended versions of their input space in the\nspirit of few-shot learning, out-of-distribution generalization, and domain\nadaptation? Which layers of a network are likely to generalize best? We provide\na new method for evaluating the capacity of networks to represent a sampled\ndomain, regardless of whether the network has been trained on all classes in\nthe domain. Our approach is the following: after fine-tuning state-of-the-art\npre-trained models for visual classification on a particular domain, we assess\ntheir performance on data from related but distinct variations in that domain.\nGeneralization power is quantified as a function of the latent embeddings of\nunseen data from intermediate layers for both unsupervised and supervised\nsettings. Working throughout all stages of the network, we find that (i) high\nclassification accuracy does not imply high generalizability; and (ii) deeper\nlayers in a model do not always generalize the best, which has implications for\npruning. Since the trends observed across datasets are largely consistent, we\nconclude that our approach reveals (a function of) the intrinsic capacity of\nthe different layers of a model to generalize.\n","authors":["Luciano Dyballa","Evan Gerritz","Steven W. Zucker"],"pdf_url":"https://arxiv.org/pdf/2405.01524v2.pdf","comment":"6, pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.17983v2","updated":"2024-05-03T15:49:16Z","published":"2023-11-29T18:51:21Z","title":"Improving Interpretation Faithfulness for Vision Transformers","summary":" Vision Transformers (ViTs) have achieved state-of-the-art performance for\nvarious vision tasks. One reason behind the success lies in their ability to\nprovide plausible innate explanations for the behavior of neural architectures.\nHowever, ViTs suffer from issues with explanation faithfulness, as their focal\npoints are fragile to adversarial attacks and can be easily changed with even\nslight perturbations on the input image. In this paper, we propose a rigorous\napproach to mitigate these issues by introducing Faithful ViTs (FViTs). Briefly\nspeaking, an FViT should have the following two properties: (1) The top-$k$\nindices of its self-attention vector should remain mostly unchanged under input\nperturbation, indicating stable explanations; (2) The prediction distribution\nshould be robust to perturbations. 
To achieve this, we propose a new method\ncalled Denoised Diffusion Smoothing (DDS), which adopts randomized smoothing\nand diffusion-based denoising. We theoretically prove that processing ViTs\ndirectly with DDS can turn them into FViTs. We also show that Gaussian noise is\nnearly optimal for both $\\ell_2$ and $\\ell_\\infty$-norm cases. Finally, we\ndemonstrate the effectiveness of our approach through comprehensive experiments\nand evaluations. Results show that FViTs are more robust against adversarial\nattacks while maintaining the explainability of attention, indicating higher\nfaithfulness.\n","authors":["Lijie Hu","Yixin Liu","Ninghao Liu","Mengdi Huai","Lichao Sun","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17983v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.02191v1","updated":"2024-05-03T15:47:07Z","published":"2024-05-03T15:47:07Z","title":"Non-Destructive Peat Analysis using Hyperspectral Imaging and Machine\n Learning","summary":" Peat, a crucial component in whisky production, imparts distinctive and\nirreplaceable flavours to the final product. However, the extraction of peat\ndisrupts ancient ecosystems and releases significant amounts of carbon,\ncontributing to climate change. This paper aims to address this issue by\nconducting a feasibility study on enhancing peat use efficiency in whisky\nmanufacturing through non-destructive analysis using hyperspectral imaging.\nResults show that short-wave infrared (SWIR) data is more effective for\nanalyzing peat samples and predicting total phenol levels, with accuracies up\nto 99.81%.\n","authors":["Yijun Yan","Jinchang Ren","Barry Harrison","Oliver Lewis","Yinhe Li","Ping Ma"],"pdf_url":"https://arxiv.org/pdf/2405.02191v1.pdf","comment":"4 pages,4 figures"},{"id":"http://arxiv.org/abs/2404.06202v2","updated":"2024-05-03T15:46:54Z","published":"2024-04-09T10:47:43Z","title":"Automated National Urban Map Extraction","summary":" Developing countries usually lack the proper governance means to generate and\nregularly update a national rooftop map. Using traditional photogrammetry and\nsurveying methods to produce a building map at the federal level is costly and\ntime consuming. Using earth observation and deep learning methods, we can\nbridge this gap and propose an automated pipeline to fetch such national urban\nmaps. This paper aims to exploit the power of fully convolutional neural\nnetworks for multi-class buildings' instance segmentation to leverage high\nobject-wise accuracy results. Buildings' instance segmentation from sub-meter\nhigh-resolution satellite images can be achieved with relatively high\npixel-wise metric scores. We detail all engineering steps to replicate this\nwork and ensure highly accurate results in dense and slum areas witnessed in\nregions that lack proper urban planning in the Global South. We applied a case\nstudy of the proposed pipeline to Lebanon and successfully produced the first\ncomprehensive national building footprint map with approximately 1 Million\nunits with an 84% accuracy. The proposed architecture relies on advanced\naugmentation techniques to overcome dataset scarcity, which is often the case\nin developing countries.\n","authors":["Hasan Nasrallah","Abed Ellatif Samhat","Cristiano Nattero","Ali J. 
Ghandour"],"pdf_url":"https://arxiv.org/pdf/2404.06202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04157v2","updated":"2024-05-03T15:33:36Z","published":"2023-11-07T17:32:55Z","title":"A Simple Interpretable Transformer for Fine-Grained Image Classification\n and Analysis","summary":" We present a novel usage of Transformers to make image classification\ninterpretable. Unlike mainstream classifiers that wait until the last fully\nconnected layer to incorporate class information to make predictions, we\ninvestigate a proactive approach, asking each class to search for itself in an\nimage. We realize this idea via a Transformer encoder-decoder inspired by\nDEtection TRansformer (DETR). We learn \"class-specific\" queries (one for each\nclass) as input to the decoder, enabling each class to localize its patterns in\nan image via cross-attention. We name our approach INterpretable TRansformer\n(INTR), which is fairly easy to implement and exhibits several compelling\nproperties. We show that INTR intrinsically encourages each class to attend\ndistinctively; the cross-attention weights thus provide a faithful\ninterpretation of the prediction. Interestingly, via \"multi-head\"\ncross-attention, INTR could identify different \"attributes\" of a class, making\nit particularly suitable for fine-grained classification and analysis, which we\ndemonstrate on eight datasets. Our code and pre-trained models are publicly\naccessible at the Imageomics Institute GitHub site:\nhttps://github.com/Imageomics/INTR.\n","authors":["Dipanjyoti Paul","Arpita Chowdhury","Xinqi Xiong","Feng-Ju Chang","David Carlyn","Samuel Stevens","Kaiya L. Provost","Anuj Karpatne","Bryan Carstens","Daniel Rubenstein","Charles Stewart","Tanya Berger-Wolf","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2311.04157v2.pdf","comment":"Accepted to International Conference on Learning Representations 2024\n (ICLR 2024)"},{"id":"http://arxiv.org/abs/2405.02179v1","updated":"2024-05-03T15:27:11Z","published":"2024-05-03T15:27:11Z","title":"Training-Free Deepfake Voice Recognition by Leveraging Large-Scale\n Pre-Trained Models","summary":" Generalization is a main issue for current audio deepfake detectors, which\nstruggle to provide reliable results on out-of-distribution data. Given the\nspeed at which more and more accurate synthesis methods are developed, it is\nvery important to design techniques that work well also on data they were not\ntrained for. In this paper we study the potential of large-scale pre-trained\nmodels for audio deepfake detection, with special focus on generalization\nability. To this end, the detection problem is reformulated in a speaker\nverification framework and fake audios are exposed by the mismatch between the\nvoice sample under test and the voice of the claimed identity. With this\nparadigm, no fake speech sample is necessary in training, cutting off any link\nwith the generation method at the root, and ensuring full generalization\nability. Features are extracted by general-purpose large pre-trained models,\nwith no need for training or fine-tuning on specific fake detection or speaker\nverification datasets. At detection time only a limited set of voice fragments\nof the identity under test is required. 
Experiments on several datasets\nwidespread in the community show that detectors based on pre-trained models\nachieve excellent performance and show strong generalization ability, rivaling\nsupervised methods on in-distribution data and largely overcoming them on\nout-of-distribution data.\n","authors":["Alessandro Pianese","Davide Cozzolino","Giovanni Poggi","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2405.02179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10065v3","updated":"2024-05-03T15:25:54Z","published":"2023-11-16T18:02:10Z","title":"Visual Environment Assessment for Safe Autonomous Quadrotor Landing","summary":" Autonomous identification and evaluation of safe landing zones are of\nparamount importance for ensuring the safety and effectiveness of aerial robots\nin the event of system failures, low battery, or the successful completion of\nspecific tasks. In this paper, we present a novel approach for detection and\nassessment of potential landing sites for safe quadrotor landing. Our solution\nefficiently integrates 2D and 3D environmental information, eliminating the\nneed for external aids such as GPS and computationally intensive elevation\nmaps. The proposed pipeline combines semantic data derived from a Neural\nNetwork (NN), to extract environmental features, with geometric data obtained\nfrom a disparity map, to extract critical geometric attributes such as slope,\nflatness, and roughness. We define several cost metrics based on these\nattributes to evaluate safety, stability, and suitability of regions in the\nenvironments and identify the most suitable landing area. Our approach runs in\nreal-time on quadrotors equipped with limited computational capabilities.\nExperimental results conducted in diverse environments demonstrate that the\nproposed method can effectively assess and identify suitable landing areas,\nenabling the safe and autonomous landing of a quadrotor.\n","authors":["Mattia Secchiero","Nishanth Bobbili","Yang Zhou","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2311.10065v3.pdf","comment":"7 pages, 5 figures, 1 table, 2024 International Conference on\n Unmanned Aircraft Systems (ICUAS)"},{"id":"http://arxiv.org/abs/2402.14095v4","updated":"2024-05-03T15:25:09Z","published":"2024-02-21T19:45:05Z","title":"Zero-shot generalization across architectures for visual classification","summary":" Generalization to unseen data is a key desideratum for deep networks, but its\nrelation to classification accuracy is unclear. Using a minimalist vision\ndataset and a measure of generalizability, we show that popular networks, from\ndeep convolutional networks (CNNs) to transformers, vary in their power to\nextrapolate to unseen classes both across layers and across architectures.\nAccuracy is not a good predictor of generalizability, and generalization varies\nnon-monotonically with layer depth.\n","authors":["Evan Gerritz","Luciano Dyballa","Steven W. Zucker"],"pdf_url":"https://arxiv.org/pdf/2402.14095v4.pdf","comment":"Accepted as a Tiny Paper at ICLR 2024. Code available at\n https://github.com/dyballa/generalization/tree/ICLR2024TinyPaper"},{"id":"http://arxiv.org/abs/2402.03328v2","updated":"2024-05-03T15:24:20Z","published":"2024-01-09T18:18:32Z","title":"Visual Enumeration is Challenging for Large-scale Generative AI","summary":" Humans can readily judge the number of objects in a visual scene, even\nwithout counting, and such a skill has been documented in many animal species\nand babies prior to language development and formal schooling. 
Numerical\njudgments are error-free for small sets, while for larger collections responses\nbecome approximate, with variability increasing proportionally to the target\nnumber. This response pattern is observed for items of all kinds, despite\nvariation in object features (such as color or shape), suggesting that our\nvisual number sense relies on abstract representations of numerosity. Here, we\ninvestigate whether large-scale generative Artificial Intelligence (AI) systems\nhave a human-like number sense, which should allow them to reliably name the\nnumber of objects in simple visual stimuli or generate images containing a\ntarget number of items in the 1-10 range. Surprisingly, most of the foundation\nmodels considered have a poor number sense: They make striking errors even with\nsmall numbers, the response variability does not increase in a systematic way,\nand the pattern of errors depends on object category. Only the most recent\nproprietary systems exhibit signatures of a visual number sense. Our findings\ndemonstrate that having an intuitive visual understanding of number remains\nchallenging for foundation models, which in turn might be detrimental to the\nperceptual grounding of numeracy that in humans is crucial for mathematical\nlearning.\n","authors":["Alberto Testolin","Kuinan Hou","Marco Zorzi"],"pdf_url":"https://arxiv.org/pdf/2402.03328v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02171v1","updated":"2024-05-03T15:20:30Z","published":"2024-05-03T15:20:30Z","title":"Self-Supervised Learning for Real-World Super-Resolution from Dual and\n Multiple Zoomed Observations","summary":" In this paper, we consider two challenging issues in reference-based\nsuper-resolution (RefSR) for smartphone, (i) how to choose a proper reference\nimage, and (ii) how to learn RefSR in a self-supervised manner. Particularly,\nwe propose a novel self-supervised learning approach for real-world RefSR from\nobservations at dual and multiple camera zooms. Firstly, considering the\npopularity of multiple cameras in modern smartphones, the more zoomed\n(telephoto) image can be naturally leveraged as the reference to guide the\nsuper-resolution (SR) of the lesser zoomed (ultra-wide) image, which gives us a\nchance to learn a deep network that performs SR from the dual zoomed\nobservations (DZSR). Secondly, for self-supervised learning of DZSR, we take\nthe telephoto image instead of an additional high-resolution image as the\nsupervision information, and select a center patch from it as the reference to\nsuper-resolve the corresponding ultra-wide image patch. To mitigate the effect\nof the misalignment between ultra-wide low-resolution (LR) patch and telephoto\nground-truth (GT) image during training, we first adopt patch-based optical\nflow alignment and then design an auxiliary-LR to guide the deforming of the\nwarped LR features. To generate visually pleasing results, we present local\noverlapped sliced Wasserstein loss to better represent the perceptual\ndifference between GT and output in the feature space. During testing, DZSR can\nbe directly deployed to super-solve the whole ultra-wide image with the\nreference of the telephoto image. In addition, we further take multiple zoomed\nobservations to explore self-supervised RefSR, and present a progressive fusion\nscheme for the effective utilization of reference images. Experiments show that\nour methods achieve better quantitative and qualitative performance against\nstate-of-the-arts. 
Codes are available at\nhttps://github.com/cszhilu1998/SelfDZSR_PlusPlus.\n","authors":["Zhilu Zhang","Ruohao Wang","Hongzhi Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.02171v1.pdf","comment":"Accpted by IEEE TPAMI in 2024. Extended version of ECCV 2022 paper\n \"Self-Supervised Learning for Real-World Super-Resolution from Dual Zoomed\n Observations\" (arXiv:2203.01325)"},{"id":"http://arxiv.org/abs/2205.00400v3","updated":"2024-05-03T15:17:12Z","published":"2022-05-01T05:30:53Z","title":"Convex Combination Consistency between Neighbors for Weakly-supervised\n Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) intends to detect\naction instances with only weak supervision, e.g., video-level labels. The\ncurrent~\\textit{de facto} pipeline locates action instances by thresholding and\ngrouping continuous high-score regions on temporal class activation sequences.\nIn this route, the capacity of the model to recognize the relationships between\nadjacent snippets is of vital importance which determines the quality of the\naction boundaries. However, it is error-prone since the variations between\nadjacent snippets are typically subtle, and unfortunately this is overlooked in\nthe literature. To tackle the issue, we propose a novel WTAL approach named\nConvex Combination Consistency between Neighbors (C$^3$BN). C$^3$BN consists of\ntwo key ingredients: a micro data augmentation strategy that increases the\ndiversity in-between adjacent snippets by convex combination of adjacent\nsnippets, and a macro-micro consistency regularization that enforces the model\nto be invariant to the transformations~\\textit{w.r.t.} video semantics, snippet\npredictions, and snippet representations. Consequently, fine-grained patterns\nin-between adjacent snippets are enforced to be explored, thereby resulting in\na more robust action boundary localization. Experimental results demonstrate\nthe effectiveness of C$^3$BN on top of various baselines for WTAL with\nvideo-level and point-level supervisions. Code is at\nhttps://github.com/Qinying-Liu/C3BN.\n","authors":["Qinying Liu","Zilei Wang","Ruoxi Chen","Zhilin Li"],"pdf_url":"https://arxiv.org/pdf/2205.00400v3.pdf","comment":"ICME2023"},{"id":"http://arxiv.org/abs/2207.14686v3","updated":"2024-05-03T15:15:27Z","published":"2022-07-29T13:58:24Z","title":"Forensic License Plate Recognition with Compression-Informed\n Transformers","summary":" Forensic license plate recognition (FLPR) remains an open challenge in legal\ncontexts such as criminal investigations, where unreadable license plates (LPs)\nneed to be deciphered from highly compressed and/or low resolution footage,\ne.g., from surveillance cameras. In this work, we propose a side-informed\nTransformer architecture that embeds knowledge on the input compression level\nto improve recognition under strong compression. We show the effectiveness of\nTransformers for license plate recognition (LPR) on a low-quality real-world\ndataset. We also provide a synthetic dataset that includes strongly degraded,\nillegible LP images and analyze the impact of knowledge embedding on it. The\nnetwork outperforms existing FLPR methods and standard state-of-the art image\nrecognition models while requiring less parameters. 
For the severest degraded\nimages, we can improve recognition by up to 8.9 percent points.\n","authors":["Denise Moussa","Anatol Maier","Andreas Spruck","Jürgen Seiler","Christian Riess"],"pdf_url":"https://arxiv.org/pdf/2207.14686v3.pdf","comment":"Published at ICIP 2022, Code:\n https://faui1-gitlab.cs.fau.de/denise.moussa/forensic-license-plate-transformer/"},{"id":"http://arxiv.org/abs/2310.11884v2","updated":"2024-05-03T15:15:17Z","published":"2023-10-18T11:08:02Z","title":"From Neural Activations to Concepts: A Survey on Explaining Concepts in\n Neural Networks","summary":" In this paper, we review recent approaches for explaining concepts in neural\nnetworks. Concepts can act as a natural link between learning and reasoning:\nonce the concepts are identified that a neural learning system uses, one can\nintegrate those concepts with a reasoning system for inference or use a\nreasoning system to act upon them to improve or enhance the learning system. On\nthe other hand, knowledge can not only be extracted from neural networks but\nconcept knowledge can also be inserted into neural network architectures. Since\nintegrating learning and reasoning is at the core of neuro-symbolic AI, the\ninsights gained from this survey can serve as an important step towards\nrealizing neuro-symbolic AI based on explainable concepts.\n","authors":["Jae Hee Lee","Sergio Lanza","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2310.11884v2.pdf","comment":"Accepted in Neurosymbolic Artificial Intelligence"},{"id":"http://arxiv.org/abs/2405.02162v1","updated":"2024-05-03T15:08:39Z","published":"2024-05-03T15:08:39Z","title":"Mapping the Unseen: Unified Promptable Panoptic Mapping with Dynamic\n Labeling using Foundation Models","summary":" In the field of robotics and computer vision, efficient and accurate semantic\nmapping remains a significant challenge due to the growing demand for\nintelligent machines that can comprehend and interact with complex\nenvironments. Conventional panoptic mapping methods, however, are limited by\npredefined semantic classes, thus making them ineffective for handling novel or\nunforeseen objects. In response to this limitation, we introduce the Unified\nPromptable Panoptic Mapping (UPPM) method. UPPM utilizes recent advances in\nfoundation models to enable real-time, on-demand label generation using natural\nlanguage prompts. By incorporating a dynamic labeling strategy into traditional\npanoptic mapping techniques, UPPM provides significant improvements in\nadaptability and versatility while maintaining high performance levels in map\nreconstruction. We demonstrate our approach on real-world and simulated\ndatasets. Results show that UPPM can accurately reconstruct scenes and segment\nobjects while generating rich semantic labels through natural language\ninteractions. A series of ablation experiments validated the advantages of\nfoundation model-based labeling over fixed label sets.\n","authors":["Mohamad Al Mdfaa","Raghad Salameh","Sergey Zagoruyko","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2405.02162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02155v1","updated":"2024-05-03T15:02:41Z","published":"2024-05-03T15:02:41Z","title":"Multi-method Integration with Confidence-based Weighting for Zero-shot\n Image Classification","summary":" This paper introduces a novel framework for zero-shot learning (ZSL), i.e.,\nto recognize new categories that are unseen during training, by using a\nmulti-model and multi-alignment integration method. 
Specifically, we propose\nthree strategies to enhance the model's performance to handle ZSL: 1) Utilizing\nthe extensive knowledge of ChatGPT and the powerful image generation\ncapabilities of DALL-E to create reference images that can precisely describe\nunseen categories and classification boundaries, thereby alleviating the\ninformation bottleneck issue; 2) Integrating the results of text-image\nalignment and image-image alignment from CLIP, along with the image-image\nalignment results from DINO, to achieve more accurate predictions; 3)\nIntroducing an adaptive weighting mechanism based on confidence levels to\naggregate the outcomes from different prediction methods. Experimental results\non multiple datasets, including CIFAR-10, CIFAR-100, and TinyImageNet,\ndemonstrate that our model can significantly improve classification accuracy\ncompared to single-model approaches, achieving AUROC scores above 96% across\nall test datasets, and notably surpassing 99% on the CIFAR-10 dataset.\n","authors":["Siqi Yin","Lifan Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.02155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14682v4","updated":"2024-05-03T14:52:21Z","published":"2022-07-29T13:57:16Z","title":"Towards Unconstrained Audio Splicing Detection and Localization with\n Neural Networks","summary":" Freely available and easy-to-use audio editing tools make it straightforward\nto perform audio splicing. Convincing forgeries can be created by combining\nvarious speech samples from the same person. Detection of such splices is\nimportant both in the public sector when considering misinformation, and in a\nlegal context to verify the integrity of evidence. Unfortunately, most existing\ndetection algorithms for audio splicing use handcrafted features and make\nspecific assumptions. However, criminal investigators are often faced with\naudio samples from unconstrained sources with unknown characteristics, which\nraises the need for more generally applicable methods.\n With this work, we aim to take a first step towards unconstrained audio\nsplicing detection to address this need. We simulate various attack scenarios\nin the form of post-processing operations that may disguise splicing. We\npropose a Transformer sequence-to-sequence (seq2seq) network for splicing\ndetection and localization. Our extensive evaluation shows that the proposed\nmethod outperforms existing dedicated approaches for splicing detection [3, 10]\nas well as the general-purpose networks EfficientNet [28] and RegNet [25].\n","authors":["Denise Moussa","Germans Hirsch","Christian Riess"],"pdf_url":"https://arxiv.org/pdf/2207.14682v4.pdf","comment":"Published at MMFORWILD 2022, ICPR Workshops - Code:\n https://faui1-gitlab.cs.fau.de/denise.moussa/audio-splicing-localization .\n International Conference on Pattern Recognition. Cham: Springer Nature\n Switzerland, 2022"},{"id":"http://arxiv.org/abs/2405.02114v1","updated":"2024-05-03T14:14:27Z","published":"2024-05-03T14:14:27Z","title":"Probablistic Restoration with Adaptive Noise Sampling for 3D Human Pose\n Estimation","summary":" The accuracy and robustness of 3D human pose estimation (HPE) are limited by\n2D pose detection errors and 2D to 3D ill-posed challenges, which have drawn\ngreat attention to Multi-Hypothesis HPE research. Most existing MH-HPE methods\nare based on generative models, which are computationally expensive and\ndifficult to train. 
In this study, we propose a Probabilistic Restoration 3D\nHuman Pose Estimation framework (PRPose) that can be integrated with any\nlightweight single-hypothesis model. Specifically, PRPose employs a weakly\nsupervised approach to fit the hidden probability distribution of the 2D-to-3D\nlifting process in the Single-Hypothesis HPE model and then reverse-map the\ndistribution to the 2D pose input through an adaptive noise sampling strategy\nto generate reasonable multi-hypothesis samples effectively. Extensive\nexperiments on 3D HPE benchmarks (Human3.6M and MPI-INF-3DHP) highlight the\neffectiveness and efficiency of PRPose. Code is available at:\nhttps://github.com/xzhouzeng/PRPose.\n","authors":["Xianzhou Zeng","Hao Qin","Ming Kong","Luyuan Chen","Qiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.02114v1.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2405.02109v1","updated":"2024-05-03T14:10:29Z","published":"2024-05-03T14:10:29Z","title":"Three-Dimensional Amyloid-Beta PET Synthesis from Structural MRI with\n Conditional Generative Adversarial Networks","summary":" Motivation: Alzheimer's Disease hallmarks include amyloid-beta deposits and\nbrain atrophy, detectable via PET and MRI scans, respectively. PET is\nexpensive, invasive and exposes patients to ionizing radiation. MRI is cheaper,\nnon-invasive, and free from ionizing radiation but limited to measuring brain\natrophy.\n Goal: To develop a 3D image translation model that synthesizes amyloid-beta\nPET images from T1-weighted MRI, exploiting the known relationship between\namyloid-beta and brain atrophy.\n Approach: The model was trained on 616 PET/MRI pairs and validated with 264\npairs.\n Results: The model synthesized amyloid-beta PET images from T1-weighted MRI\nwith high-degree of similarity showing high SSIM and PSNR metrics\n(SSIM>0.95&PSNR=28).\n Impact: Our model proves the feasibility of synthesizing amyloid-beta PET\nimages from structural MRI ones, significantly enhancing accessibility for\nlarge-cohort studies and early dementia detection, while also reducing cost,\ninvasiveness, and radiation exposure.\n","authors":["Fernando Vega","Abdoljalil Addeh","M. Ethan MacDonald"],"pdf_url":"https://arxiv.org/pdf/2405.02109v1.pdf","comment":"Abstract Submitted and Presented at the 2024 International Society of\n Magnetic Resonance in Medicine. Singapore, Singapore, May 4-9. Abstract\n Number 2239"},{"id":"http://arxiv.org/abs/2305.16602v2","updated":"2024-05-03T14:01:22Z","published":"2023-05-26T03:21:30Z","title":"Discovering Novel Actions from Open World Egocentric Videos with\n Object-Grounded Visual Commonsense Reasoning","summary":" Learning to infer labels in an open world, i.e., in an environment where the\ntarget ``labels'' are unknown, is an important characteristic for achieving\nautonomy. Foundation models, pre-trained on enormous amounts of data, have\nshown remarkable generalization skills through prompting, particularly in\nzero-shot inference. However, their performance is restricted to the\ncorrectness of the target label's search space, i.e., candidate labels provided\nin the prompt. This target search space can be unknown or exceptionally large\nin an open world, severely restricting their performance. To tackle this\nchallenging problem, we propose a two-step, neuro-symbolic framework called\nALGO - Action Learning with Grounded Object recognition that uses symbolic\nknowledge stored in large-scale knowledge bases to infer activities in\negocentric videos with limited supervision. 
First, we propose a neuro-symbolic\nprompting approach that uses object-centric vision-language models as a noisy\noracle to ground objects in the video through evidence-based reasoning. Second,\ndriven by prior commonsense knowledge, we discover plausible activities through\nan energy-based symbolic pattern theory framework and learn to ground\nknowledge-based action (verb) concepts in the video. Extensive experiments on\nfour publicly available datasets (EPIC-Kitchens, GTEA Gaze, GTEA Gaze Plus, and\nCharades-Ego) demonstrate its performance on open-world activity inference. We\nalso show that ALGO can be extended to zero-shot inference and demonstrate its\ncompetitive performance on the Charades-Ego dataset.\n","authors":["Sanjoy Kundu","Shubham Trehan","Sathyanarayanan N. Aakur"],"pdf_url":"https://arxiv.org/pdf/2305.16602v2.pdf","comment":"25 Pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.10073v2","updated":"2024-05-03T13:52:46Z","published":"2024-04-15T18:26:03Z","title":"Explainable Light-Weight Deep Learning Pipeline for Improved Drought\n Stress Identification","summary":" Early identification of drought stress in crops is vital for implementing\neffective mitigation measures and reducing yield loss. Non-invasive imaging\ntechniques hold immense potential by capturing subtle physiological changes in\nplants under water deficit. Sensor based imaging data serves as a rich source\nof information for machine learning and deep learning algorithms, facilitating\nfurther analysis aimed at identifying drought stress. While these approaches\nyield favorable results, real-time field applications require algorithms\nspecifically designed for the complexities of natural agricultural conditions.\nOur work proposes a novel deep learning framework for classifying drought\nstress in potato crops captured by UAVs in natural settings. The novelty lies\nin the synergistic combination of a pre-trained network with carefully designed\ncustom layers. This architecture leverages feature extraction capabilities of\nthe pre-trained network while the custom layers enable targeted dimensionality\nreduction and enhanced regularization, ultimately leading to improved\nperformance. A key innovation of our work involves the integration of\nGradient-Class Activation Mapping (Grad-CAM), an explainability technique.\nGrad-CAM sheds light on the internal workings of the deep learning model,\ntypically referred to as a black box. By visualizing the focus areas of the\nmodel within the images, Grad-CAM fosters interpretability and builds trust in\nthe decision-making process of the model. Our proposed framework achieves\nsuperior performance, particularly with the DenseNet121 pre-trained network,\nreaching a precision of 97% to identify the stressed class with an overall\naccuracy of 91%. 
Comparative analysis of existing state-of-the-art object\ndetection algorithms reveals the superiority of our approach in significantly\nhigher precision and accuracy.\n","authors":["Aswini Kumar Patra","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2404.10073v2.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.02077v1","updated":"2024-05-03T13:10:16Z","published":"2024-05-03T13:10:16Z","title":"MVP-Shot: Multi-Velocity Progressive-Alignment Framework for Few-Shot\n Action Recognition","summary":" Recent few-shot action recognition (FSAR) methods achieve promising\nperformance by performing semantic matching on learned discriminative features.\nHowever, most FSAR methods focus on single-scale (e.g., frame-level,\nsegment-level, etc.) feature alignment, which ignores that human actions with\nthe same semantic may appear at different velocities. To this end, we develop a\nnovel Multi-Velocity Progressive-alignment (MVP-Shot) framework to\nprogressively learn and align semantic-related action features at\nmulti-velocity levels. Concretely, a Multi-Velocity Feature Alignment (MVFA)\nmodule is designed to measure the similarity between features from support and\nquery videos with different velocity scales and then merge all similarity\nscores in a residual fashion. To avoid the multiple velocity features deviating\nfrom the underlying motion semantic, our proposed Progressive Semantic-Tailored\nInteraction (PSTI) module injects velocity-tailored text information into the\nvideo feature via feature interaction on channel and temporal domains at\ndifferent velocities. The above two modules compensate for each other to\npredict query categories more accurately under the few-shot settings.\nExperimental results show our method outperforms current state-of-the-art\nmethods on multiple standard few-shot benchmarks (i.e., HMDB51, UCF101,\nKinetics, and SSv2-small).\n","authors":["Hongyu Qu","Rui Yan","Xiangbo Shu","Haoliang Gao","Peng Huang","Guo-Sen Xie"],"pdf_url":"https://arxiv.org/pdf/2405.02077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02068v1","updated":"2024-05-03T13:00:22Z","published":"2024-05-03T13:00:22Z","title":"Advancing Pre-trained Teacher: Towards Robust Feature Discrepancy for\n Anomaly Detection","summary":" With the wide application of knowledge distillation between an ImageNet\npre-trained teacher model and a learnable student model, industrial anomaly\ndetection has witnessed a significant achievement in the past few years. The\nsuccess of knowledge distillation mainly relies on how to keep the feature\ndiscrepancy between the teacher and student model, in which it assumes that:\n(1) the teacher model can jointly represent two different distributions for the\nnormal and abnormal patterns, while (2) the student model can only reconstruct\nthe normal distribution. However, it still remains a challenging issue to\nmaintain these ideal assumptions in practice. In this paper, we propose a\nsimple yet effective two-stage industrial anomaly detection framework, termed\nas AAND, which sequentially performs Anomaly Amplification and Normality\nDistillation to obtain robust feature discrepancy. In the first anomaly\namplification stage, we propose a novel Residual Anomaly Amplification (RAA)\nmodule to advance the pre-trained teacher encoder. With the exposure of\nsynthetic anomalies, it amplifies anomalies via residual generation while\nmaintaining the integrity of pre-trained model. 
It mainly comprises a\nMatching-guided Residual Gate and an Attribute-scaling Residual Generator,\nwhich can determine the residuals' proportion and characteristic, respectively.\nIn the second normality distillation stage, we further employ a reverse\ndistillation paradigm to train a student decoder, in which a novel Hard\nKnowledge Distillation (HKD) loss is built to better facilitate the\nreconstruction of normal patterns. Comprehensive experiments on the MvTecAD,\nVisA, and MvTec3D-RGB datasets show that our method achieves state-of-the-art\nperformance.\n","authors":["Canhui Tang","Sanping Zhou","Yizhe Li","Yonghao Dong","Le Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02068v1.pdf","comment":"The paper is under review"},{"id":"http://arxiv.org/abs/2405.02066v1","updated":"2024-05-03T12:56:34Z","published":"2024-05-03T12:56:34Z","title":"WateRF: Robust Watermarks in Radiance Fields for Protection of\n Copyrights","summary":" The advances in the Neural Radiance Fields (NeRF) research offer extensive\napplications in diverse domains, but protecting their copyrights has not yet\nbeen researched in depth. Recently, NeRF watermarking has been considered one\nof the pivotal solutions for safely deploying NeRF-based 3D representations.\nHowever, existing methods are designed to apply only to implicit or explicit\nNeRF representations. In this work, we introduce an innovative watermarking\nmethod that can be employed in both representations of NeRF. This is achieved\nby fine-tuning NeRF to embed binary messages in the rendering process. In\ndetail, we propose utilizing the discrete wavelet transform in the NeRF space\nfor watermarking. Furthermore, we adopt a deferred back-propagation technique\nand introduce a combination with the patch-wise loss to improve rendering\nquality and bit accuracy with minimum trade-offs. We evaluate our method in\nthree different aspects: capacity, invisibility, and robustness of the embedded\nwatermarks in the 2D-rendered images. Our method achieves state-of-the-art\nperformance with faster training speed over the compared state-of-the-art\nmethods.\n","authors":["Youngdong Jang","Dong In Lee","MinHyuk Jang","Jong Wook Kim","Feng Yang","Sangpil Kim"],"pdf_url":"https://arxiv.org/pdf/2405.02066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02061v1","updated":"2024-05-03T12:42:43Z","published":"2024-05-03T12:42:43Z","title":"Towards general deep-learning-based tree instance segmentation models","summary":" The segmentation of individual trees from forest point clouds is a crucial\ntask for downstream analyses such as carbon sequestration estimation. Recently,\ndeep-learning-based methods have been proposed which show the potential of\nlearning to segment trees. Since these methods are trained in a supervised way,\nthe question arises how general models can be obtained that are applicable\nacross a wide range of settings. So far, training has been mainly conducted\nwith data from one specific laser scanning type and for specific types of\nforests. In this work, we train one segmentation model under various\nconditions, using seven diverse datasets found in literature, to gain insights\ninto the generalization capabilities under domain-shift. Our results suggest\nthat a generalization from coniferous dominated sparse point clouds to\ndeciduous dominated high-resolution point clouds is possible. Conversely,\nqualitative evidence suggests that generalization from high-resolution to\nlow-resolution point clouds is challenging. 
This emphasizes the need for forest\npoint clouds with diverse data characteristics for model development. To enrich\nthe available data basis, labeled trees from two previous works were propagated\nto the complete forest point cloud and are made publicly available at\nhttps://doi.org/10.25625/QUTUWU.\n","authors":["Jonathan Henrich","Jan van Delden"],"pdf_url":"https://arxiv.org/pdf/2405.02061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03134v2","updated":"2024-05-03T12:31:52Z","published":"2024-03-05T17:21:31Z","title":"Simplicity in Complexity : Explaining Visual Complexity using Deep\n Segmentation Models","summary":" The complexity of visual stimuli plays an important role in many cognitive\nphenomena, including attention, engagement, memorability, time perception and\naesthetic evaluation. Despite its importance, complexity is poorly understood\nand ironically, previous models of image complexity have been quite complex.\nThere have been many attempts to find handcrafted features that explain\ncomplexity, but these features are usually dataset specific, and hence fail to\ngeneralise. On the other hand, more recent work has employed deep neural\nnetworks to predict complexity, but these models remain difficult to interpret,\nand do not guide a theoretical understanding of the problem. Here we propose to\nmodel complexity using segment-based representations of images. We use\nstate-of-the-art segmentation models, SAM and FC-CLIP, to quantify the number\nof segments at multiple granularities, and the number of classes in an image\nrespectively. We find that complexity is well-explained by a simple linear\nmodel with these two features across six diverse image-sets of naturalistic\nscene and art images. This suggests that the complexity of images can be\nsurprisingly simple.\n","authors":["Tingke Shen","Surabhi S Nath","Aenne Brielmann","Peter Dayan"],"pdf_url":"https://arxiv.org/pdf/2403.03134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02023v1","updated":"2024-05-03T11:55:45Z","published":"2024-05-03T11:55:45Z","title":"IFNet: Deep Imaging and Focusing for Handheld SAR with Millimeter-wave\n Signals","summary":" Recent advancements have showcased the potential of handheld millimeter-wave\n(mmWave) imaging, which applies synthetic aperture radar (SAR) principles in\nportable settings. However, existing studies addressing handheld motion errors\neither rely on costly tracking devices or employ simplified imaging models,\nleading to impractical deployment or limited performance. In this paper, we\npresent IFNet, a novel deep unfolding network that combines the strengths of\nsignal processing models and deep neural networks to achieve robust imaging and\nfocusing for handheld mmWave systems. We first formulate the handheld imaging\nmodel by integrating multiple priors about mmWave images and handheld phase\nerrors. Furthermore, we transform the optimization processes into an iterative\nnetwork structure for improved and efficient imaging performance. Extensive\nexperiments demonstrate that IFNet effectively compensates for handheld phase\nerrors and recovers high-fidelity images from severely distorted signals. 
In\ncomparison with existing methods, IFNet can achieve at least 11.89 dB\nimprovement in average peak signal-to-noise ratio (PSNR) and 64.91% improvement\nin average structural similarity index measure (SSIM) on a real-world dataset.\n","authors":["Li Yadong","Zhang Dongheng","Geng Ruixu","Wu Jincheng","Hu Yang","Sun Qibin","Chen Yan"],"pdf_url":"https://arxiv.org/pdf/2405.02023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10396v2","updated":"2024-05-03T11:41:30Z","published":"2023-03-18T11:26:36Z","title":"Towards Diverse Binary Segmentation via A Simple yet General Gated\n Network","summary":" In many binary segmentation tasks, most CNNs-based methods use a U-shape\nencoder-decoder network as their basic structure. They ignore two key problems\nwhen the encoder exchanges information with the decoder: one is the lack of\ninterference control mechanism between them, the other is without considering\nthe disparity of the contributions from different encoder levels. In this work,\nwe propose a simple yet general gated network (GateNet) to tackle them all at\nonce. With the help of multi-level gate units, the valuable context information\nfrom the encoder can be selectively transmitted to the decoder. In addition, we\ndesign a gated dual branch structure to build the cooperation among the\nfeatures of different levels and improve the discrimination ability of the\nnetwork. Furthermore, we introduce a \"Fold\" operation to improve the atrous\nconvolution and form a novel folded atrous convolution, which can be flexibly\nembedded in ASPP or DenseASPP to accurately localize foreground objects of\nvarious scales. GateNet can be easily generalized to many binary segmentation\ntasks, including general and specific object segmentation and multi-modal\nsegmentation. Without bells and whistles, our network consistently performs\nfavorably against the state-of-the-art methods under 10 metrics on 33 datasets\nof 10 binary segmentation tasks.\n","authors":["Xiaoqi Zhao","Youwei Pang","Lihe Zhang","Huchuan Lu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.10396v2.pdf","comment":"Accepted by IJCV 2024"},{"id":"http://arxiv.org/abs/2405.02008v1","updated":"2024-05-03T11:16:27Z","published":"2024-05-03T11:16:27Z","title":"DiffMap: Enhancing Map Segmentation with Map Prior Using Diffusion Model","summary":" Constructing high-definition (HD) maps is a crucial requirement for enabling\nautonomous driving. In recent years, several map segmentation algorithms have\nbeen developed to address this need, leveraging advancements in Bird's-Eye View\n(BEV) perception. However, existing models still encounter challenges in\nproducing realistic and consistent semantic map layouts. One prominent issue is\nthe limited utilization of structured priors inherent in map segmentation\nmasks. In light of this, we propose DiffMap, a novel approach specifically\ndesigned to model the structured priors of map segmentation masks using latent\ndiffusion model. By incorporating this technique, the performance of existing\nsemantic segmentation methods can be significantly enhanced and certain\nstructural errors present in the segmentation outputs can be effectively\nrectified. Notably, the proposed module can be seamlessly integrated into any\nmap segmentation model, thereby augmenting its capability to accurately\ndelineate semantic information. 
Furthermore, through extensive visualization\nanalysis, our model demonstrates superior proficiency in generating results\nthat more accurately reflect real-world map layouts, further validating its\nefficacy in improving the quality of the generated maps.\n","authors":["Peijin Jia","Tuopu Wen","Ziang Luo","Mengmeng Yang","Kun Jiang","Zhiquan Lei","Xuewei Tang","Ziyuan Liu","Le Cui","Kehua Sheng","Bo Zhang","Diange Yang"],"pdf_url":"https://arxiv.org/pdf/2405.02008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02005v1","updated":"2024-05-03T11:08:04Z","published":"2024-05-03T11:08:04Z","title":"HoloGS: Instant Depth-based 3D Gaussian Splatting with Microsoft\n HoloLens 2","summary":" In the fields of photogrammetry, computer vision and computer graphics, the\ntask of neural 3D scene reconstruction has led to the exploration of various\ntechniques. Among these, 3D Gaussian Splatting stands out for its explicit\nrepresentation of scenes using 3D Gaussians, making it appealing for tasks like\n3D point cloud extraction and surface reconstruction. Motivated by its\npotential, we address the domain of 3D scene reconstruction, aiming to leverage\nthe capabilities of the Microsoft HoloLens 2 for instant 3D Gaussian Splatting.\nWe present HoloGS, a novel workflow utilizing HoloLens sensor data, which\nbypasses the need for pre-processing steps like Structure from Motion by\ninstantly accessing the required input data i.e. the images, camera poses and\nthe point cloud from depth sensing. We provide comprehensive investigations,\nincluding the training process and the rendering quality, assessed through the\nPeak Signal-to-Noise Ratio, and the geometric 3D accuracy of the densified\npoint cloud from Gaussian centers, measured by Chamfer Distance. We evaluate\nour approach on two self-captured scenes: An outdoor scene of a cultural\nheritage statue and an indoor scene of a fine-structured plant. Our results\nshow that the HoloLens data, including RGB images, corresponding camera poses,\nand depth sensing based point clouds to initialize the Gaussians, are suitable\nas input for 3D Gaussian Splatting.\n","authors":["Miriam Jäger","Theodor Kapler","Michael Feßenbecker","Felix Birkelbach","Markus Hillemann","Boris Jutzi"],"pdf_url":"https://arxiv.org/pdf/2405.02005v1.pdf","comment":"8 pages, 9 figures, 2 tables. Will be published in the ISPRS The\n International Archives of Photogrammetry, Remote Sensing and Spatial\n Information Sciences"},{"id":"http://arxiv.org/abs/2405.02004v1","updated":"2024-05-03T11:06:37Z","published":"2024-05-03T11:06:37Z","title":"M${^2}$Depth: Self-supervised Two-Frame Multi-camera Metric Depth\n Estimation","summary":" This paper presents a novel self-supervised two-frame multi-camera metric\ndepth estimation network, termed M${^2}$Depth, which is designed to predict\nreliable scale-aware surrounding depth in autonomous driving. Unlike the\nprevious works that use multi-view images from a single time-step or multiple\ntime-step images from a single camera, M${^2}$Depth takes temporally adjacent\ntwo-frame images from multiple cameras as inputs and produces high-quality\nsurrounding depth. We first construct cost volumes in spatial and temporal\ndomains individually and propose a spatial-temporal fusion module that\nintegrates the spatial-temporal information to yield a strong volume\npresentation. 
We additionally combine the neural prior from SAM features with\ninternal features to reduce the ambiguity between foreground and background and\nstrengthen the depth edges. Extensive experimental results on nuScenes and DDAD\nbenchmarks show M${^2}$Depth achieves state-of-the-art performance. More\nresults can be found in https://heiheishuang.xyz/M2Depth .\n","authors":["Yingshuang Zou","Yikang Ding","Xi Qiu","Haoqian Wang","Haotian Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01995v1","updated":"2024-05-03T10:50:30Z","published":"2024-05-03T10:50:30Z","title":"Cooperation and Federation in Distributed Radar Point Cloud Processing","summary":" The paper considers the problem of human-scale RF sensing utilizing a network\nof resource-constrained MIMO radars with low range-azimuth resolution. The\nradars operate in the mmWave band and obtain time-varying 3D point cloud (PC)\ninformation that is sensitive to body movements. They also observe the same\nscene from different views and cooperate while sensing the environment using a\nsidelink communication channel. Conventional cooperation setups allow the\nradars to mutually exchange raw PC information to improve ego sensing. The\npaper proposes a federation mechanism where the radars exchange the parameters\nof a Bayesian posterior measure of the observed PCs, rather than raw data. The\nradars act as distributed parameter servers to reconstruct a global posterior\n(i.e., federated posterior) using Bayesian tools. The paper quantifies and\ncompares the benefits of radar federation with respect to cooperation\nmechanisms. Both approaches are validated by experiments with a real-time\ndemonstration platform. Federation makes minimal use of the sidelink\ncommunication channel (20 {\\div} 25 times lower bandwidth use) and is less\nsensitive to unresolved targets. On the other hand, cooperation reduces the\nmean absolute target estimation error of about 20%.\n","authors":["S. Savazzi","V. Rampa","S. Kianoush","A. Minora","L. Costa"],"pdf_url":"https://arxiv.org/pdf/2405.01995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01992v1","updated":"2024-05-03T10:47:56Z","published":"2024-05-03T10:47:56Z","title":"SFFNet: A Wavelet-Based Spatial and Frequency Domain Fusion Network for\n Remote Sensing Segmentation","summary":" In order to fully utilize spatial information for segmentation and address\nthe challenge of handling areas with significant grayscale variations in remote\nsensing segmentation, we propose the SFFNet (Spatial and Frequency Domain\nFusion Network) framework. This framework employs a two-stage network design:\nthe first stage extracts features using spatial methods to obtain features with\nsufficient spatial details and semantic information; the second stage maps\nthese features in both spatial and frequency domains. In the frequency domain\nmapping, we introduce the Wavelet Transform Feature Decomposer (WTFD)\nstructure, which decomposes features into low-frequency and high-frequency\ncomponents using the Haar wavelet transform and integrates them with spatial\nfeatures. To bridge the semantic gap between frequency and spatial features,\nand facilitate significant feature selection to promote the combination of\nfeatures from different representation domains, we design the Multiscale\nDual-Representation Alignment Filter (MDAF). This structure utilizes multiscale\nconvolutions and dual-cross attentions. 
Comprehensive experimental results\ndemonstrate that, compared to existing methods, SFFNet achieves superior\nperformance in terms of mIoU, reaching 84.80% and 87.73% respectively.The code\nis located at https://github.com/yysdck/SFFNet.\n","authors":["Yunsong Yang","Genji Yuan","Jinjiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.01992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17701v4","updated":"2024-05-03T10:12:09Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v4.pdf","comment":"Experimental method encountered errors, undergoing experiment again"},{"id":"http://arxiv.org/abs/2405.01971v1","updated":"2024-05-03T09:53:28Z","published":"2024-05-03T09:53:28Z","title":"A Sonar-based AUV Positioning System for Underwater Environments with\n Low Infrastructure Density","summary":" The increasing demand for underwater vehicles highlights the necessity for\nrobust localization solutions in inspection missions. In this work, we present\na novel real-time sonar-based underwater global positioning algorithm for AUVs\n(Autonomous Underwater Vehicles) designed for environments with a sparse\ndistribution of human-made assets. Our approach exploits two synergistic data\ninterpretation frontends applied to the same stream of sonar data acquired by a\nmultibeam Forward-Looking Sonar (FSD). These observations are fused within a\nParticle Filter (PF) either to weigh more particles that belong to\nhigh-likelihood regions or to solve symmetric ambiguities. Preliminary\nexperiments carried out on a simulated environment resembling a real underwater\nplant provided promising results. 
This work represents a starting point towards\nfuture developments of the method and consequent exhaustive evaluations also in\nreal-world scenarios.\n","authors":["Emilio Olivastri","Daniel Fusaro","Wanmeng Li","Simone Mosco","Alberto Pretto"],"pdf_url":"https://arxiv.org/pdf/2405.01971v1.pdf","comment":"Accepted to the IEEE ICRA Workshop on Field Robotics 2024"},{"id":"http://arxiv.org/abs/2405.01963v1","updated":"2024-05-03T09:40:47Z","published":"2024-05-03T09:40:47Z","title":"From Attack to Defense: Insights into Deep Learning Security Measures in\n Black-Box Settings","summary":" Deep Learning (DL) is rapidly maturing to the point that it can be used in\nsafety- and security-crucial applications. However, adversarial samples, which\nare undetectable to the human eye, pose a serious threat that can cause the\nmodel to misbehave and compromise the performance of such applications.\nAddressing the robustness of DL models has become crucial to understanding and\ndefending against adversarial attacks. In this study, we perform comprehensive\nexperiments to examine the effect of adversarial attacks and defenses on\nvarious model architectures across well-known datasets. Our research focuses on\nblack-box attacks such as SimBA, HopSkipJump, MGAAttack, and boundary attacks,\nas well as preprocessor-based defensive mechanisms, including bits squeezing,\nmedian smoothing, and JPEG filter. Experimenting with various models, our\nresults demonstrate that the level of noise needed for the attack increases as\nthe number of layers increases. Moreover, the attack success rate decreases as\nthe number of layers increases. This indicates that model complexity and\nrobustness have a significant relationship. Investigating the diversity and\nrobustness relationship, our experiments with diverse models show that having a\nlarge number of parameters does not imply higher robustness. Our experiments\nextend to show the effects of the training dataset on model robustness. Using\nvarious datasets such as ImageNet-1000, CIFAR-100, and CIFAR-10 are used to\nevaluate the black-box attacks. Considering the multiple dimensions of our\nanalysis, e.g., model complexity and training dataset, we examined the behavior\nof black-box attacks when models apply defenses. Our results show that applying\ndefense strategies can significantly reduce attack effectiveness. This research\nprovides in-depth analysis and insight into the robustness of DL models against\nvarious attacks, and defenses.\n","authors":["Firuz Juraev","Mohammed Abuhamad","Eric Chan-Tin","George K. Thiruvathukal","Tamer Abuhmed"],"pdf_url":"https://arxiv.org/pdf/2405.01963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09110v2","updated":"2024-05-03T09:34:22Z","published":"2023-08-17T17:32:56Z","title":"JPEG Quantized Coefficient Recovery via DCT Domain Spatial-Frequential\n Transformer","summary":" JPEG compression adopts the quantization of Discrete Cosine Transform (DCT)\ncoefficients for effective bit-rate reduction, whilst the quantization could\nlead to a significant loss of important image details. Recovering compressed\nJPEG images in the frequency domain has recently garnered increasing interest,\ncomplementing the multitude of restoration techniques established in the pixel\ndomain. However, existing DCT domain methods typically suffer from limited\neffectiveness in handling a wide range of compression quality factors or fall\nshort in recovering sparse quantized coefficients and the components across\ndifferent colorspaces. 
To address these challenges, we propose a DCT domain\nspatial-frequential Transformer, namely DCTransformer, for JPEG quantized\ncoefficient recovery. Specifically, a dual-branch architecture is designed to\ncapture both spatial and frequential correlations within the collocated DCT\ncoefficients. Moreover, we incorporate the operation of quantization matrix\nembedding, which effectively allows our single model to handle a wide range of\nquality factors, and a luminance-chrominance alignment head that produces a\nunified feature map to align different-sized luminance and chrominance\ncomponents. Our proposed DCTransformer outperforms the current state-of-the-art\nJPEG artifact removal techniques, as demonstrated by our extensive experiments.\n","authors":["Mingyu Ouyang","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09110v2.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.08501v2","updated":"2024-05-03T09:18:24Z","published":"2024-01-16T17:02:21Z","title":"ValUES: A Framework for Systematic Validation of Uncertainty Estimation\n in Semantic Segmentation","summary":" Uncertainty estimation is an essential and heavily-studied component for the\nreliable application of semantic segmentation methods. While various studies\nexist claiming methodological advances on the one hand, and successful\napplication on the other hand, the field is currently hampered by a gap between\ntheory and practice leaving fundamental questions unanswered: Can data-related\nand model-related uncertainty really be separated in practice? Which components\nof an uncertainty method are essential for real-world performance? Which\nuncertainty method works well for which application? In this work, we link this\nresearch gap to a lack of systematic and comprehensive evaluation of\nuncertainty methods. Specifically, we identify three key pitfalls in current\nliterature and present an evaluation framework that bridges the research gap by\nproviding 1) a controlled environment for studying data ambiguities as well as\ndistribution shifts, 2) systematic ablations of relevant method components, and\n3) test-beds for the five predominant uncertainty applications: OoD-detection,\nactive learning, failure detection, calibration, and ambiguity modeling.\nEmpirical results on simulated as well as real-world data demonstrate how the\nproposed framework is able to answer the predominant questions in the field\nrevealing for instance that 1) separation of uncertainty types works on\nsimulated data but does not necessarily translate to real-world data, 2)\naggregation of scores is a crucial but currently neglected component of\nuncertainty methods, 3) While ensembles are performing most robustly across the\ndifferent downstream tasks and settings, test-time augmentation often\nconstitutes a light-weight alternative. Code is at:\nhttps://github.com/IML-DKFZ/values\n","authors":["Kim-Celine Kahl","Carsten T. Lüth","Maximilian Zenk","Klaus Maier-Hein","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2401.08501v2.pdf","comment":"ICLR 2024 (oral)"},{"id":"http://arxiv.org/abs/2405.01937v1","updated":"2024-05-03T09:02:17Z","published":"2024-05-03T09:02:17Z","title":"An Attention Based Pipeline for Identifying Pre-Cancer Lesions in Head\n and Neck Clinical Images","summary":" Early detection of cancer can help improve patient prognosis by early\nintervention. 
Head and neck cancer is diagnosed in specialist centres after a\nsurgical biopsy, however, there is a potential for these to be missed leading\nto delayed diagnosis. To overcome these challenges, we present an attention\nbased pipeline that identifies suspected lesions, segments, and classifies them\nas non-dysplastic, dysplastic and cancerous lesions. We propose (a) a vision\ntransformer based Mask R-CNN network for lesion detection and segmentation of\nclinical images, and (b) Multiple Instance Learning (MIL) based scheme for\nclassification. Current results show that the segmentation model produces\nsegmentation masks and bounding boxes with up to 82% overlap accuracy score on\nunseen external test data and surpassing reviewed segmentation benchmarks.\nNext, a classification F1-score of 85% on the internal cohort test set. An app\nhas been developed to perform lesion segmentation taken via a smart device.\nFuture work involves employing endoscopic video data for precise early\ndetection and prognosis.\n","authors":["Abdullah Alsalemi","Anza Shakeel","Mollie Clark","Syed Ali Khurram","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2405.01937v1.pdf","comment":"5 pages, 3 figures, accepted in ISBI 2024"},{"id":"http://arxiv.org/abs/2405.01934v1","updated":"2024-05-03T08:58:38Z","published":"2024-05-03T08:58:38Z","title":"Impact of Architectural Modifications on Deep Learning Adversarial\n Robustness","summary":" Rapid advancements of deep learning are accelerating adoption in a wide\nvariety of applications, including safety-critical applications such as\nself-driving vehicles, drones, robots, and surveillance systems. These\nadvancements include applying variations of sophisticated techniques that\nimprove the performance of models. However, such models are not immune to\nadversarial manipulations, which can cause the system to misbehave and remain\nunnoticed by experts. The frequency of modifications to existing deep learning\nmodels necessitates thorough analysis to determine the impact on models'\nrobustness. In this work, we present an experimental evaluation of the effects\nof model modifications on deep learning model robustness using adversarial\nattacks. Our methodology involves examining the robustness of variations of\nmodels against various adversarial attacks. By conducting our experiments, we\naim to shed light on the critical issue of maintaining the reliability and\nsafety of deep learning models in safety- and security-critical applications.\nOur results indicate the pressing demand for an in-depth assessment of the\neffects of model changes on the robustness of models.\n","authors":["Firuz Juraev","Mohammed Abuhamad","Simon S. Woo","George K Thiruvathukal","Tamer Abuhmed"],"pdf_url":"https://arxiv.org/pdf/2405.01934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01926v1","updated":"2024-05-03T08:43:06Z","published":"2024-05-03T08:43:06Z","title":"Auto-Encoding Morph-Tokens for Multimodal LLM","summary":" For multimodal LLMs, the synergy of visual comprehension (textual output) and\ngeneration (visual output) presents an ongoing challenge. This is due to a\nconflicting objective: for comprehension, an MLLM needs to abstract the\nvisuals; for generation, it needs to preserve the visuals as much as possible.\nThus, the objective is a dilemma for visual-tokens. 
To resolve the conflict, we\npropose encoding images into morph-tokens to serve a dual purpose: for\ncomprehension, they act as visual prompts instructing MLLM to generate texts;\nfor generation, they take on a different, non-conflicting role as complete\nvisual-tokens for image reconstruction, where the missing visual cues are\nrecovered by the MLLM. Extensive experiments show that morph-tokens can achieve\na new SOTA for multimodal comprehension and generation simultaneously. Our\nproject is available at https://github.com/DCDmllm/MorphTokens.\n","authors":["Kaihang Pan","Siliang Tang","Juncheng Li","Zhaoyu Fan","Wei Chow","Shuicheng Yan","Tat-Seng Chua","Yueting Zhuang","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.01926v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2306.01875v3","updated":"2024-05-03T08:25:54Z","published":"2023-06-02T19:08:31Z","title":"DiffECG: A Versatile Probabilistic Diffusion Model for ECG Signals\n Synthesis","summary":" Within cardiovascular disease detection using deep learning applied to ECG\nsignals, the complexities of handling physiological signals have sparked\ngrowing interest in leveraging deep generative models for effective data\naugmentation. In this paper, we introduce a novel versatile approach based on\ndenoising diffusion probabilistic models for ECG synthesis, addressing three\nscenarios: (i) heartbeat generation, (ii) partial signal imputation, and (iii)\nfull heartbeat forecasting. Our approach presents the first generalized\nconditional approach for ECG synthesis, and our experimental results\ndemonstrate its effectiveness for various ECG-related tasks. Moreover, we show\nthat our approach outperforms other state-of-the-art ECG generative models and\ncan enhance the performance of state-of-the-art classifiers.\n","authors":["Nour Neifar","Achraf Ben-Hamadou","Afef Mdhaffar","Mohamed Jmaiel"],"pdf_url":"https://arxiv.org/pdf/2306.01875v3.pdf","comment":"Accepted in IEEE SERA 2024 conference"},{"id":"http://arxiv.org/abs/2405.01920v1","updated":"2024-05-03T08:23:39Z","published":"2024-05-03T08:23:39Z","title":"Lightweight Change Detection in Heterogeneous Remote Sensing Images with\n Online All-Integer Pruning Training","summary":" Detection of changes in heterogeneous remote sensing images is vital,\nespecially in response to emergencies like earthquakes and floods. Current\nhomogenous transformation-based change detection (CD) methods often suffer from\nhigh computation and memory costs, which are not friendly to edge-computation\ndevices like onboard CD devices at satellites. To address this issue, this\npaper proposes a new lightweight CD method for heterogeneous remote sensing\nimages that employs the online all-integer pruning (OAIP) training strategy to\nefficiently fine-tune the CD network using the current test data. The proposed\nCD network consists of two visual geometry group (VGG) subnetworks as the\nbackbone architecture. In the OAIP-based training process, all the weights,\ngradients, and intermediate data are quantized to integers to speed up training\nand reduce memory usage, where the per-layer block exponentiation scaling\nscheme is utilized to reduce the computation errors of network parameters\ncaused by quantization. Second, an adaptive filter-level pruning method based\non the L1-norm criterion is employed to further lighten the fine-tuning process\nof the CD network. 
Experimental results show that the proposed OAIP-based\nmethod attains similar detection performance (but with significantly reduced\ncomputation complexity and memory usage) in comparison with state-of-the-art CD\nmethods.\n","authors":["Chengyang Zhang","Weiming Li","Gang Li","Huina Song","Zhaohui Song","Xueqian Wang","Antonio Plaza"],"pdf_url":"https://arxiv.org/pdf/2405.01920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05176v2","updated":"2024-05-03T08:12:53Z","published":"2023-12-08T16:59:17Z","title":"MRI Scan Synthesis Methods based on Clustering and Pix2Pix","summary":" We consider a missing data problem in the context of automatic segmentation\nmethods for Magnetic Resonance Imaging (MRI) brain scans. Usually, automated\nMRI scan segmentation is based on multiple scans (e.g., T1-weighted,\nT2-weighted, T1CE, FLAIR). However, quite often a scan is blurry, missing or\notherwise unusable. We investigate the question whether a missing scan can be\nsynthesized. We exemplify that this is in principle possible by synthesizing a\nT2-weighted scan from a given T1-weighted scan. Our first aim is to compute a\npicture that resembles the missing scan closely, measured by average mean\nsquared error (MSE). We develop/use several methods for this, including a\nrandom baseline approach, a clustering-based method and pixel-to-pixel\ntranslation method by Isola et al. (Pix2Pix) which is based on conditional\nGANs. The lowest MSE is achieved by our clustering-based method. Our second aim\nis to compare the methods with respect to the effect that using the synthesized\nscan has on the segmentation process. For this, we use a DeepMedic model\ntrained with the four input scan modalities named above. We replace the\nT2-weighted scan by the synthesized picture and evaluate the segmentations with\nrespect to the tumor identification, using Dice scores as numerical evaluation.\nThe evaluation shows that the segmentation works well with synthesized scans\n(in particular, with Pix2Pix methods) in many cases.\n","authors":["Giulia Baldini","Melanie Schmidt","Charlotte Zäske","Liliana L. Caldeira"],"pdf_url":"https://arxiv.org/pdf/2312.05176v2.pdf","comment":"Accepted at AIME 2024"},{"id":"http://arxiv.org/abs/2405.01885v1","updated":"2024-05-03T07:11:25Z","published":"2024-05-03T07:11:25Z","title":"Enhancing Micro Gesture Recognition for Emotion Understanding via\n Context-aware Visual-Text Contrastive Learning","summary":" Psychological studies have shown that Micro Gestures (MG) are closely linked\nto human emotions. MG-based emotion understanding has attracted much attention\nbecause it allows for emotion understanding through nonverbal body gestures\nwithout relying on identity information (e.g., facial and electrocardiogram\ndata). Therefore, it is essential to recognize MG effectively for advanced\nemotion understanding. However, existing Micro Gesture Recognition (MGR)\nmethods utilize only a single modality (e.g., RGB or skeleton) while\noverlooking crucial textual information. In this letter, we propose a simple\nbut effective visual-text contrastive learning solution that utilizes text\ninformation for MGR. In addition, instead of using handcrafted prompts for\nvisual-text contrastive learning, we propose a novel module called Adaptive\nprompting to generate context-aware prompts. The experimental results show that\nthe proposed method achieves state-of-the-art performance on two public\ndatasets. 
Furthermore, based on an empirical study utilizing the results of MGR\nfor emotion understanding, we demonstrate that using the textual results of MGR\nsignificantly improves performance by 6%+ compared to directly using video as\ninput.\n","authors":["Deng Li","Bohao Xing","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2405.01885v1.pdf","comment":"accepted by IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2311.10329v5","updated":"2024-05-03T07:02:56Z","published":"2023-11-17T05:03:53Z","title":"High-fidelity Person-centric Subject-to-Image Synthesis","summary":" Current subject-driven image generation methods encounter significant\nchallenges in person-centric image generation. The reason is that they learn\nthe semantic scene and person generation by fine-tuning a common pre-trained\ndiffusion, which involves an irreconcilable training imbalance. Precisely, to\ngenerate realistic persons, they need to sufficiently tune the pre-trained\nmodel, which inevitably causes the model to forget the rich semantic scene\nprior and makes scene generation over-fit to the training data. Moreover, even\nwith sufficient fine-tuning, these methods can still not generate high-fidelity\npersons since joint learning of the scene and person generation also lead to\nquality compromise. In this paper, we propose Face-diffuser, an effective\ncollaborative generation pipeline to eliminate the above training imbalance and\nquality compromise. Specifically, we first develop two specialized pre-trained\ndiffusion models, i.e., Text-driven Diffusion Model (TDM) and Subject-augmented\nDiffusion Model (SDM), for scene and person generation, respectively. The\nsampling process is divided into three sequential stages, i.e., semantic scene\nconstruction, subject-scene fusion, and subject enhancement. The first and last\nstages are performed by TDM and SDM respectively. The subject-scene fusion\nstage, that is the collaboration achieved through a novel and highly effective\nmechanism, Saliency-adaptive Noise Fusion (SNF). Specifically, it is based on\nour key observation that there exists a robust link between classifier-free\nguidance responses and the saliency of generated images. In each time step, SNF\nleverages the unique strengths of each model and allows for the spatial\nblending of predicted noises from both models automatically in a saliency-aware\nmanner. Extensive experiments confirm the impressive effectiveness and\nrobustness of the Face-diffuser.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10329v5.pdf","comment":"Accepted by CVPR2024. Code:\n https://github.com/CodeGoat24/Face-diffuser"},{"id":"http://arxiv.org/abs/2404.18381v2","updated":"2024-05-03T06:10:18Z","published":"2024-04-29T02:33:40Z","title":"Object Registration in Neural Fields","summary":" Neural fields provide a continuous scene representation of 3D geometry and\nappearance in a way which has great promise for robotics applications. One\nfunctionality that unlocks unique use-cases for neural fields in robotics is\nobject 6-DoF registration. In this paper, we provide an expanded analysis of\nthe recent Reg-NF neural field registration method and its use-cases within a\nrobotics context. We showcase the scenario of determining the 6-DoF pose of\nknown objects within a scene using scene and object neural field models. 
We\nshow how this may be used to better represent objects within imperfectly\nmodelled scenes and generate new scenes by substituting object neural field\nmodels into the scene.\n","authors":["David Hall","Stephen Hausler","Sutharsan Mahendren","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2404.18381v2.pdf","comment":"Accepted to ICRA 2024 RoboNeRF workshop. 5 pages, 10 figures. arXiv\n admin note: substantial text overlap with arXiv:2402.09722"},{"id":"http://arxiv.org/abs/2405.01872v1","updated":"2024-05-03T06:03:37Z","published":"2024-05-03T06:03:37Z","title":"Defect Image Sample Generation With Diffusion Prior for Steel Surface\n Defect Recognition","summary":" The task of steel surface defect recognition is an industrial problem with\ngreat industry values. The data insufficiency is the major challenge in\ntraining a robust defect recognition network. Existing methods have\ninvestigated to enlarge the dataset by generating samples with generative\nmodels. However, their generation quality is still limited by the insufficiency\nof defect image samples. To this end, we propose Stable Surface Defect\nGeneration (StableSDG), which transfers the vast generation distribution\nembedded in Stable Diffusion model for steel surface defect image generation.\nTo tackle with the distinctive distribution gap between steel surface images\nand generated images of the diffusion model, we propose two processes. First,\nwe align the distribution by adapting parameters of the diffusion model,\nadopted both in the token embedding space and network parameter space. Besides,\nin the generation process, we propose image-oriented generation rather than\nfrom pure Gaussian noises. We conduct extensive experiments on steel surface\ndefect dataset, demonstrating state-of-the-art performance on generating\nhigh-quality samples and training recognition models, and both designed\nprocesses are significant for the performance.\n","authors":["Yichun Tai","Kun Yang","Tao Peng","Zhenzhen Huang","Zhijiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.01872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06947v3","updated":"2024-05-03T05:41:18Z","published":"2023-12-12T03:04:08Z","title":"MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing","summary":" 3D-aware portrait editing has a wide range of applications in multiple\nfields. However, current approaches are limited due that they can only perform\nmask-guided or text-based editing. Even by fusing the two procedures into a\nmodel, the editing quality and stability cannot be ensured. To address this\nlimitation, we propose \\textbf{MaTe3D}: mask-guided text-based 3D-aware\nportrait editing. In this framework, first, we introduce a new SDF-based 3D\ngenerator which learns local and global representations with proposed SDF and\ndensity consistency losses. This enhances masked-based editing in local areas;\nsecond, we present a novel distillation strategy: Conditional Distillation on\nGeometry and Texture (CDGT). Compared to exiting distillation strategies, it\nmitigates visual ambiguity and avoids mismatch between texture and geometry,\nthereby producing stable texture and convincing geometry while editing.\nAdditionally, we create the CatMask-HQ dataset, a large-scale high-resolution\ncat face annotation for exploration of model generalization and expansion. We\nperform expensive experiments on both the FFHQ and CatMask-HQ datasets to\ndemonstrate the editing quality and stability of the proposed method. 
Our\nmethod faithfully generates a 3D-aware edited face image based on a modified\nmask and a text prompt. Our code and models will be publicly released.\n","authors":["Kangneng Zhou","Daiheng Gao","Xuan Wang","Jie Zhang","Peng Zhang","Xusen Sun","Longhao Zhang","Shiqi Yang","Bang Zhang","Liefeng Bo","Yaxing Wang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.06947v3.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2405.01857v1","updated":"2024-05-03T05:18:35Z","published":"2024-05-03T05:18:35Z","title":"TinySeg: Model Optimizing Framework for Image Segmentation on Tiny\n Embedded Systems","summary":" Image segmentation is one of the major computer vision tasks, which is\napplicable in a variety of domains, such as autonomous navigation of an\nunmanned aerial vehicle. However, image segmentation cannot easily materialize\non tiny embedded systems because image segmentation models generally have high\npeak memory usage due to their architectural characteristics. This work finds\nthat image segmentation models unnecessarily require large memory space with an\nexisting tiny machine learning framework. That is, the existing framework\ncannot effectively manage the memory space for the image segmentation models.\n This work proposes TinySeg, a new model optimizing framework that enables\nmemory-efficient image segmentation for tiny embedded systems. TinySeg analyzes\nthe lifetimes of tensors in the target model and identifies long-living\ntensors. Then, TinySeg optimizes the memory usage of the target model mainly\nwith two methods: (i) tensor spilling into local or remote storage and (ii)\nfused fetching of spilled tensors. This work implements TinySeg on top of the\nexisting tiny machine learning framework and demonstrates that TinySeg can\nreduce the peak memory usage of an image segmentation model by 39.3% for tiny\nembedded systems.\n","authors":["Byungchul Chae","Jiae Kim","Seonyeong Heo"],"pdf_url":"https://arxiv.org/pdf/2405.01857v1.pdf","comment":"LCTES 2024"},{"id":"http://arxiv.org/abs/2310.02601v7","updated":"2024-05-03T04:50:27Z","published":"2023-10-04T06:14:06Z","title":"MagicDrive: Street View Generation with Diverse 3D Geometry Control","summary":" Recent advancements in diffusion models have significantly enhanced the data\nsynthesis with 2D control. Yet, precise 3D control in street view generation,\ncrucial for 3D perception tasks, remains elusive. Specifically, utilizing\nBird's-Eye View (BEV) as the primary condition often leads to challenges in\ngeometry control (e.g., height), affecting the representation of object shapes,\nocclusion patterns, and road surface elevations, all of which are essential to\nperception data synthesis, especially for 3D object detection tasks. In this\npaper, we introduce MagicDrive, a novel street view generation framework,\noffering diverse 3D geometry controls including camera poses, road maps, and 3D\nbounding boxes, together with textual descriptions, achieved through tailored\nencoding strategies. Besides, our design incorporates a cross-view attention\nmodule, ensuring consistency across multiple camera views. 
With MagicDrive, we\nachieve high-fidelity street-view image & video synthesis that captures nuanced\n3D geometry and various scene descriptions, enhancing tasks like BEV\nsegmentation and 3D object detection.\n","authors":["Ruiyuan Gao","Kai Chen","Enze Xie","Lanqing Hong","Zhenguo Li","Dit-Yan Yeung","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02601v7.pdf","comment":"Project Page: https://flymin.github.io/magicdrive; Figure 7 updated"},{"id":"http://arxiv.org/abs/2305.03614v4","updated":"2024-05-03T04:11:55Z","published":"2023-05-05T15:20:27Z","title":"Denoising-Diffusion Alignment for Continuous Sign Language Recognition","summary":" Continuous sign language recognition (CSLR) aims to promote active and\naccessible communication for the hearing impaired, by recognizing signs in\nuntrimmed sign language videos to textual glosses sequentially. The key\nchallenge of CSLR is how to achieve the cross-modality alignment between videos\nand gloss sequences. However, the current cross-modality paradigms of CSLR\noverlook using the glosses context to guide the video clips for global temporal\ncontext alignment, which further affects the visual to gloss mapping and is\ndetrimental to recognition performance. To tackle this problem, we propose a\nnovel Denoising-Diffusion global Alignment (DDA), which consists of a\ndenoising-diffusion autoencoder and DDA loss function. DDA leverages\ndiffusion-based global alignment techniques to align video with gloss sequence,\nfacilitating global temporal context alignment. Specifically, DDA first\nproposes the auxiliary condition diffusion to conduct the gloss-part noised\nbimodal representations for video and gloss sequence. To address the problem of\nthe recognition-oriented alignment knowledge represented in the diffusion\ndenoising process cannot be feedback. The DDA further proposes the\nDenoising-Diffusion Autoencoder, which adds a decoder in the auxiliary\ncondition diffusion to denoise the partial noisy bimodal representations via\nthe designed DDA loss in self-supervised. In the denoising process, each video\nclip representation of video can be reliably guided to re-establish the global\ntemporal context between them via denoising the gloss sequence representation.\nExperiments on three public benchmarks demonstrate that our DDA achieves\nstate-of-the-art performances and confirm the feasibility of DDA for video\nrepresentation enhancement.\n","authors":["Leming Guo","Wanli Xue","Yuxi Zhou","Ze Kang","Tiantian Yuan","Zan Gao","Shengyong Chen"],"pdf_url":"https://arxiv.org/pdf/2305.03614v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01828v1","updated":"2024-05-03T03:20:37Z","published":"2024-05-03T03:20:37Z","title":"FER-YOLO-Mamba: Facial Expression Detection and Classification Based on\n Selective State Space","summary":" Facial Expression Recognition (FER) plays a pivotal role in understanding\nhuman emotional cues. However, traditional FER methods based on visual\ninformation have some limitations, such as preprocessing, feature extraction,\nand multi-stage classification procedures. These not only increase\ncomputational complexity but also require a significant amount of computing\nresources. 
Considering Convolutional Neural Network (CNN)-based FER schemes\nfrequently prove inadequate in identifying the deep, long-distance dependencies\nembedded within facial expression images, and the Transformer's inherent\nquadratic computational complexity, this paper presents the FER-YOLO-Mamba\nmodel, which integrates the principles of Mamba and YOLO technologies to\nfacilitate efficient coordination in facial expression image recognition and\nlocalization. Within the FER-YOLO-Mamba model, we further devise a FER-YOLO-VSS\ndual-branch module, which combines the inherent strengths of convolutional\nlayers in local feature extraction with the exceptional capability of State\nSpace Models (SSMs) in revealing long-distance dependencies. To the best of our\nknowledge, this is the first Vision Mamba model designed for facial expression\ndetection and classification. To evaluate the performance of the proposed\nFER-YOLO-Mamba model, we conducted experiments on two benchmark datasets,\nRAF-DB and SFEW. The experimental results indicate that the FER-YOLO-Mamba\nmodel achieved better results compared to other models. The code is available\nfrom https://github.com/SwjtuMa/FER-YOLO-Mamba.\n","authors":["Hui Ma","Sen Lei","Turgay Celik","Heng-Chao Li"],"pdf_url":"https://arxiv.org/pdf/2405.01828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01825v1","updated":"2024-05-03T03:02:00Z","published":"2024-05-03T03:02:00Z","title":"Improving Concept Alignment in Vision-Language Concept Bottleneck Models","summary":" Concept Bottleneck Models (CBM) map the input image to a high-level\nhuman-understandable concept space and then make class predictions based on\nthese concepts. Recent approaches automate the construction of CBM by prompting\nLarge Language Models (LLM) to generate text concepts and then use Vision\nLanguage Models (VLM) to obtain concept scores to train a CBM. However, it is\ndesired to build CBMs with concepts defined by human experts instead of LLM\ngenerated concepts to make them more trustworthy. In this work, we take a\ncloser inspection on the faithfulness of VLM concept scores for such\nexpert-defined concepts in domains like fine-grain bird species classification\nand animal classification. Our investigations reveal that frozen VLMs, like\nCLIP, struggle to correctly associate a concept to the corresponding visual\ninput despite achieving a high classification performance. To address this, we\npropose a novel Contrastive Semi-Supervised (CSS) learning method which uses a\nfew labeled concept examples to improve concept alignment (activate truthful\nvisual concepts) in CLIP model. Extensive experiments on three benchmark\ndatasets show that our approach substantially increases the concept accuracy\nand classification accuracy, yet requires only a fraction of the\nhuman-annotated concept labels. 
To further improve the classification\nperformance, we also introduce a new class-level intervention procedure for\nfine-grain classification problems that identifies the confounding classes and\nintervenes their concept space to reduce errors.\n","authors":["Nithish Muthuchamy Selvaraj","Xiaobao Guo","Bingquan Shen","Adams Wai-Kin Kong","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2405.01825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01822v1","updated":"2024-05-03T02:51:25Z","published":"2024-05-03T02:51:25Z","title":"Report on the AAPM Grand Challenge on deep generative modeling for\n learning medical image statistics","summary":" The findings of the 2023 AAPM Grand Challenge on Deep Generative Modeling for\nLearning Medical Image Statistics are reported in this Special Report. The goal\nof this challenge was to promote the development of deep generative models\n(DGMs) for medical imaging and to emphasize the need for their domain-relevant\nassessment via the analysis of relevant image statistics. As part of this Grand\nChallenge, a training dataset was developed based on 3D anthropomorphic breast\nphantoms from the VICTRE virtual imaging toolbox. A two-stage evaluation\nprocedure consisting of a preliminary check for memorization and image quality\n(based on the Frechet Inception distance (FID)), and a second stage evaluating\nthe reproducibility of image statistics corresponding to domain-relevant\nradiomic features was developed. A summary measure was employed to rank the\nsubmissions. Additional analyses of submissions was performed to assess DGM\nperformance specific to individual feature families, and to identify various\nartifacts. 58 submissions from 12 unique users were received for this\nChallenge. The top-ranked submission employed a conditional latent diffusion\nmodel, whereas the joint runners-up employed a generative adversarial network,\nfollowed by another network for image superresolution. We observed that the\noverall ranking of the top 9 submissions according to our evaluation method (i)\ndid not match the FID-based ranking, and (ii) differed with respect to\nindividual feature families. Another important finding from our additional\nanalyses was that different DGMs demonstrated similar kinds of artifacts. This\nGrand Challenge highlighted the need for domain-specific evaluation to further\nDGM design as well as deployment. It also demonstrated that the specification\nof a DGM may differ depending on its intended use.\n","authors":["Rucha Deshpande","Varun A. Kelkar","Dimitrios Gotsis","Prabhat Kc","Rongping Zeng","Kyle J. Myers","Frank J. Brooks","Mark A. Anastasio"],"pdf_url":"https://arxiv.org/pdf/2405.01822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01820v1","updated":"2024-05-03T02:47:44Z","published":"2024-05-03T02:47:44Z","title":"Real Risks of Fake Data: Synthetic Data, Diversity-Washing and Consent\n Circumvention","summary":" Machine learning systems require representations of the real world for\ntraining and testing - they require data, and lots of it. Collecting data at\nscale has logistical and ethical challenges, and synthetic data promises a\nsolution to these challenges. Instead of needing to collect photos of real\npeople's faces to train a facial recognition system, a model creator could\ncreate and use photo-realistic, synthetic faces. The comparative ease of\ngenerating this synthetic data rather than relying on collecting data has made\nit a common practice. 
We present two key risks of using synthetic data in model\ndevelopment. First, we detail the high risk of false confidence when using\nsynthetic data to increase dataset diversity and representation. We base this\nin the examination of a real world use-case of synthetic data, where synthetic\ndatasets were generated for an evaluation of facial recognition technology.\nSecond, we examine how using synthetic data risks circumventing consent for\ndata usage. We illustrate this by considering the importance of consent to the\nU.S. Federal Trade Commission's regulation of data collection and affected\nmodels. Finally, we discuss how these two risks exemplify how synthetic data\ncomplicates existing governance and ethical practice; by decoupling data from\nthose it impacts, synthetic data is prone to consolidating power away those\nmost impacted by algorithmically-mediated harm.\n","authors":["Cedric Deslandes Whitney","Justin Norman"],"pdf_url":"https://arxiv.org/pdf/2405.01820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17830v2","updated":"2024-05-03T02:29:09Z","published":"2024-04-27T08:40:33Z","title":"Dynamic Against Dynamic: An Open-set Self-learning Framework","summary":" In open-set recognition, existing methods generally learn statically fixed\ndecision boundaries using known classes to reject unknown classes. Though they\nhave achieved promising results, such decision boundaries are evidently\ninsufficient for universal unknown classes in dynamic and open scenarios as\nthey can potentially appear at any position in the feature space. Moreover,\nthese methods just simply reject unknown class samples during testing without\nany effective utilization for them. In fact, such samples completely can\nconstitute the true instantiated representation of the unknown classes to\nfurther enhance the model's performance. To address these issues, this paper\nproposes a novel dynamic against dynamic idea, i.e., dynamic method against\ndynamic changing open-set world, where an open-set self-learning (OSSL)\nframework is correspondingly developed. OSSL starts with a good closed-set\nclassifier trained by known classes and utilizes available test samples for\nmodel adaptation during testing, thus gaining the adaptability to changing data\ndistributions. In particular, a novel self-matching module is designed for\nOSSL, which can achieve the adaptation in automatically identifying known class\nsamples while rejecting unknown class samples which are further utilized to\nenhance the discriminability of the model as the instantiated representation of\nunknown classes. Our method establishes new performance milestones respectively\nin almost all standard and cross-data benchmarks.\n","authors":["Haifeng Yang","Chuanxing Geng","Pong C. Yuen","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17830v2.pdf","comment":"The first two authors contributed equally to this work. Accepted at\n IJCAI2024"},{"id":"http://arxiv.org/abs/2404.04562v3","updated":"2024-05-03T01:59:57Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling~(SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a \\textbf{single}\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. 
Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.06472v3","updated":"2024-05-03T01:01:29Z","published":"2023-07-12T22:08:22Z","title":"Early Autism Diagnosis based on Path Signature and Siamese Unsupervised\n Feature Compressor","summary":" Autism Spectrum Disorder (ASD) has been emerging as a growing public health\nthreat. Early diagnosis of ASD is crucial for timely, effective intervention\nand treatment. However, conventional diagnosis methods based on communications\nand behavioral patterns are unreliable for children younger than 2 years of\nage. Given evidences of neurodevelopmental abnormalities in ASD infants, we\nresort to a novel deep learning-based method to extract key features from the\ninherently scarce, class-imbalanced, and heterogeneous structural MR images for\nearly autism diagnosis. Specifically, we propose a Siamese verification\nframework to extend the scarce data, and an unsupervised compressor to\nalleviate data imbalance by extracting key features. We also proposed weight\nconstraints to cope with sample heterogeneity by giving different samples\ndifferent voting weights during validation, and we used Path Signature to\nunravel meaningful developmental features from the two-time point data\nlongitudinally. We further extracted machine learning focused brain regions for\nautism diagnosis. Extensive experiments have shown that our method performed\nwell under practical scenarios, transcending existing machine learning methods\nand providing anatomical insights for autism early diagnosis.\n","authors":["Zhuowen Yin","Xinyao Ding","Xin Zhang","Zhengwang Wu","Li Wang","Xiangmin Xu","Gang Li"],"pdf_url":"https://arxiv.org/pdf/2307.06472v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08068v3","updated":"2024-05-03T00:37:12Z","published":"2023-06-13T18:32:35Z","title":"DORSal: Diffusion for Object-centric Representations of Scenes et al","summary":" Recent progress in 3D scene understanding enables scalable learning of\nrepresentations across large datasets of diverse scenes. As a consequence,\ngeneralization to unseen scenes and objects, rendering novel views from just a\nsingle or a handful of input images, and controllable scene generation that\nsupports editing, is now possible. However, training jointly on a large number\nof scenes typically compromises rendering quality when compared to single-scene\noptimized models such as NeRFs. 
In this paper, we leverage recent progress in\ndiffusion models to equip 3D scene representation learning models with the\nability to render high-fidelity novel views, while retaining benefits such as\nobject-level scene editing to a large degree. In particular, we propose DORSal,\nwhich adapts a video diffusion architecture for 3D scene generation conditioned\non frozen object-centric slot-based representations of scenes. On both complex\nsynthetic multi-object scenes and on the real-world large-scale Street View\ndataset, we show that DORSal enables scalable neural rendering of 3D scenes\nwith object-level editing and improves upon existing approaches.\n","authors":["Allan Jabri","Sjoerd van Steenkiste","Emiel Hoogeboom","Mehdi S. M. Sajjadi","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2306.08068v3.pdf","comment":"Accepted to ICLR 2024. Project page:\n https://www.sjoerdvansteenkiste.com/dorsal"},{"id":"http://arxiv.org/abs/2306.16772v6","updated":"2024-05-03T00:00:27Z","published":"2023-06-29T08:13:57Z","title":"M3Act: Learning from Synthetic Human Group Activities","summary":" The study of complex human interactions and group activities has become a\nfocal point in human-centric computer vision. However, progress in related\ntasks is often hindered by the challenges of obtaining large-scale labeled\ndatasets from real-world scenarios. To address the limitation, we introduce\nM3Act, a synthetic data generator for multi-view multi-group multi-person human\natomic actions and group activities. Powered by Unity Engine, M3Act features\nmultiple semantic groups, highly diverse and photorealistic images, and a\ncomprehensive set of annotations, which facilitates the learning of\nhuman-centered tasks across single-person, multi-person, and multi-group\nconditions. We demonstrate the advantages of M3Act across three core\nexperiments. The results suggest our synthetic dataset can significantly\nimprove the performance of several downstream methods and replace real-world\ndatasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on\nDanceTrack dataset, leading to a hop on the leaderboard from 10th to 2nd place.\nMoreover, M3Act opens new research for controllable 3D group activity\ngeneration. We define multiple metrics and propose a competitive baseline for\nthe novel task. Our code and data are available at our project page:\nhttp://cjerry1243.github.io/M3Act.\n","authors":["Che-Jui Chang","Danrui Li","Deep Patel","Parth Goel","Honglu Zhou","Seonghyeon Moon","Samuel S. Sohn","Sejong Yoon","Vladimir Pavlovic","Mubbasir Kapadia"],"pdf_url":"https://arxiv.org/pdf/2306.16772v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01970v2","updated":"2024-05-03T23:33:07Z","published":"2024-01-03T20:39:02Z","title":"FMGS: Foundation Model Embedded 3D Gaussian Splatting for Holistic 3D\n Scene Understanding","summary":" Precisely perceiving the geometric and semantic properties of real-world 3D\nobjects is crucial for the continued evolution of augmented reality and robotic\napplications. To this end, we present Foundation Model Embedded Gaussian\nSplatting (FMGS), which incorporates vision-language embeddings of foundation\nmodels into 3D Gaussian Splatting (GS). The key contribution of this work is an\nefficient method to reconstruct and represent 3D vision-language models. This\nis achieved by distilling feature maps generated from image-based foundation\nmodels into those rendered from our 3D model. 
To ensure high-quality rendering\nand fast training, we introduce a novel scene representation by integrating\nstrengths from both GS and multi-resolution hash encodings (MHE). Our effective\ntraining procedure also introduces a pixel alignment loss that makes the\nrendered feature distance of the same semantic entities close, following the\npixel-level semantic boundaries. Our results demonstrate remarkable multi-view\nsemantic consistency, facilitating diverse downstream tasks, beating\nstate-of-the-art methods by 10.2 percent on open-vocabulary language-based\nobject detection, despite that we are 851X faster for inference. This research\nexplores the intersection of vision, language, and 3D scene representation,\npaving the way for enhanced scene understanding in uncontrolled real-world\nenvironments. We plan to release the code on the project page.\n","authors":["Xingxing Zuo","Pouya Samangouei","Yunwen Zhou","Yan Di","Mingyang Li"],"pdf_url":"https://arxiv.org/pdf/2401.01970v2.pdf","comment":"Project page: https://xingxingzuo.github.io/fmgs"},{"id":"http://arxiv.org/abs/2405.02515v1","updated":"2024-05-03T22:58:48Z","published":"2024-05-03T22:58:48Z","title":"SR4ZCT: Self-supervised Through-plane Resolution Enhancement for CT\n Images with Arbitrary Resolution and Overlap","summary":" Computed tomography (CT) is a widely used non-invasive medical imaging\ntechnique for disease diagnosis. The diagnostic accuracy is often affected by\nimage resolution, which can be insufficient in practice. For medical CT images,\nthe through-plane resolution is often worse than the in-plane resolution and\nthere can be overlap between slices, causing difficulties in diagnoses.\nSelf-supervised methods for through-plane resolution enhancement, which train\non in-plane images and infer on through-plane images, have shown promise for\nboth CT and MRI imaging. However, existing self-supervised methods either\nneglect overlap or can only handle specific cases with fixed combinations of\nresolution and overlap. To address these limitations, we propose a\nself-supervised method called SR4ZCT. It employs the same off-axis training\napproach while being capable of handling arbitrary combinations of resolution\nand overlap. Our method explicitly models the relationship between resolutions\nand voxel spacings of different planes to accurately simulate training images\nthat match the original through-plane images. We highlight the significance of\naccurate modeling in self-supervised off-axis training and demonstrate the\neffectiveness of SR4ZCT using a real-world dataset.\n","authors":["Jiayang Shi","Daniel M. Pelt","K. Joost Batenburg"],"pdf_url":"https://arxiv.org/pdf/2405.02515v1.pdf","comment":"MLMI2023"},{"id":"http://arxiv.org/abs/2405.02512v1","updated":"2024-05-03T22:55:56Z","published":"2024-05-03T22:55:56Z","title":"Spatio-Temporal SwinMAE: A Swin Transformer based Multiscale\n Representation Learner for Temporal Satellite Imagery","summary":" Currently, the foundation models represented by large language models have\nmade dramatic progress and are used in a very wide range of domains including\n2D and 3D vision. As one of the important application domains of foundation\nmodels, earth observation has attracted attention and various approaches have\nbeen developed. 
When considering earth observation as a single image capture,\nearth observation imagery can be processed as an image with three or more\nchannels, and when it comes with multiple image captures of different\ntimestamps at one location, the temporal observation can be considered as a set\nof continuous image resembling video frames or medical SCAN slices. This paper\npresents Spatio-Temporal SwinMAE (ST-SwinMAE), an architecture which\nparticularly focuses on representation learning for spatio-temporal image\nprocessing. Specifically, it uses a hierarchical Masked Auto-encoder (MAE) with\nVideo Swin Transformer blocks. With the architecture, we present a pretrained\nmodel named Degas 100M as a geospatial foundation model. Also, we propose an\napproach for transfer learning with Degas 100M, which both pretrained encoder\nand decoder of MAE are utilized with skip connections added between them to\nachieve multi-scale information communication, forms an architecture named\nSpatio-Temporal SwinUNet (ST-SwinUNet). Our approach shows significant\nimprovements of performance over existing state-of-the-art of foundation\nmodels. Specifically, for transfer learning of the land cover downstream task\non the PhilEO Bench dataset, it shows 10.4\\% higher accuracy compared with\nother geospatial foundation models on average.\n","authors":["Yohei Nakayama","Jiawei Su"],"pdf_url":"https://arxiv.org/pdf/2405.02512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02509v1","updated":"2024-05-03T22:50:59Z","published":"2024-05-03T22:50:59Z","title":"Implicit Neural Representations for Robust Joint Sparse-View CT\n Reconstruction","summary":" Computed Tomography (CT) is pivotal in industrial quality control and medical\ndiagnostics. Sparse-view CT, offering reduced ionizing radiation, faces\nchallenges due to its under-sampled nature, leading to ill-posed reconstruction\nproblems. Recent advancements in Implicit Neural Representations (INRs) have\nshown promise in addressing sparse-view CT reconstruction. Recognizing that CT\noften involves scanning similar subjects, we propose a novel approach to\nimprove reconstruction quality through joint reconstruction of multiple objects\nusing INRs. This approach can potentially leverage both the strengths of INRs\nand the statistical regularities across multiple objects. While current INR\njoint reconstruction techniques primarily focus on accelerating convergence via\nmeta-initialization, they are not specifically tailored to enhance\nreconstruction quality. To address this gap, we introduce a novel INR-based\nBayesian framework integrating latent variables to capture the inter-object\nrelationships. These variables serve as a dynamic reference throughout the\noptimization, thereby enhancing individual reconstruction fidelity. Our\nextensive experiments, which assess various key factors such as reconstruction\nquality, resistance to overfitting, and generalizability, demonstrate\nsignificant improvements over baselines in common numerical metrics. This\nunderscores a notable advancement in CT reconstruction methods.\n","authors":["Jiayang Shi","Junyi Zhu","Daniel M. Pelt","K. Joost Batenburg","Matthew B. 
Blaschko"],"pdf_url":"https://arxiv.org/pdf/2405.02509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18104v2","updated":"2024-05-03T22:50:41Z","published":"2024-03-26T21:04:18Z","title":"Mathematical Foundation and Corrections for Full Range Head Pose\n Estimation","summary":" Numerous works concerning head pose estimation (HPE) offer algorithms or\nproposed neural network-based approaches for extracting Euler angles from\neither facial key points or directly from images of the head region. However,\nmany works failed to provide clear definitions of the coordinate systems and\nEuler or Tait-Bryan angles orders in use. It is a well-known fact that rotation\nmatrices depend on coordinate systems, and yaw, roll, and pitch angles are\nsensitive to their application order. Without precise definitions, it becomes\nchallenging to validate the correctness of the output head pose and drawing\nroutines employed in prior works. In this paper, we thoroughly examined the\nEuler angles defined in the 300W-LP dataset, head pose estimation such as\n3DDFA-v2, 6D-RepNet, WHENet, etc, and the validity of their drawing routines of\nthe Euler angles. When necessary, we infer their coordinate system and sequence\nof yaw, roll, pitch from provided code. This paper presents (1) code and\nalgorithms for inferring coordinate system from provided source code, code for\nEuler angle application order and extracting precise rotation matrices and the\nEuler angles, (2) code and algorithms for converting poses from one rotation\nsystem to another, (3) novel formulae for 2D augmentations of the rotation\nmatrices, and (4) derivations and code for the correct drawing routines for\nrotation matrices and poses. This paper also addresses the feasibility of\ndefining rotations with right-handed coordinate system in Wikipedia and SciPy,\nwhich makes the Euler angle extraction much easier for full-range head pose\nresearch.\n","authors":["Huei-Chung Hu","Xuyang Wu","Yuan Wang","Yi Fang","Hsin-Tai Wu"],"pdf_url":"https://arxiv.org/pdf/2403.18104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02508v1","updated":"2024-05-03T22:42:00Z","published":"2024-05-03T22:42:00Z","title":"Rasterized Edge Gradients: Handling Discontinuities Differentiably","summary":" Computing the gradients of a rendering process is paramount for diverse\napplications in computer vision and graphics. However, accurate computation of\nthese gradients is challenging due to discontinuities and rendering\napproximations, particularly for surface-based representations and\nrasterization-based rendering. We present a novel method for computing\ngradients at visibility discontinuities for rasterization-based differentiable\nrenderers. Our method elegantly simplifies the traditionally complex problem\nthrough a carefully designed approximation strategy, allowing for a\nstraightforward, effective, and performant solution. We introduce a novel\nconcept of micro-edges, which allows us to treat the rasterized images as\noutcomes of a differentiable, continuous process aligned with the inherently\nnon-differentiable, discrete-pixel rasterization. 
This technique eliminates the\nnecessity for rendering approximations or other modifications to the forward\npass, preserving the integrity of the rendered image, which makes it applicable\nto rasterized masks, depth, and normals images where filtering is prohibitive.\nUtilizing micro-edges simplifies gradient interpretation at discontinuities and\nenables handling of geometry intersections, offering an advantage over the\nprior art. We showcase our method in dynamic human head scene reconstruction,\ndemonstrating effective handling of camera images and segmentation masks.\n","authors":["Stanislav Pidhorskyi","Tomas Simon","Gabriel Schwartz","He Wen","Yaser Sheikh","Jason Saragih"],"pdf_url":"https://arxiv.org/pdf/2405.02508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02504v1","updated":"2024-05-03T22:33:46Z","published":"2024-05-03T22:33:46Z","title":"Functional Imaging Constrained Diffusion for Brain PET Synthesis from\n Structural MRI","summary":" Magnetic resonance imaging (MRI) and positron emission tomography (PET) are\nincreasingly used in multimodal analysis of neurodegenerative disorders. While\nMRI is broadly utilized in clinical settings, PET is less accessible. Many\nstudies have attempted to use deep generative models to synthesize PET from MRI\nscans. However, they often suffer from unstable training and inadequately\npreserve brain functional information conveyed by PET. To this end, we propose\na functional imaging constrained diffusion (FICD) framework for 3D brain PET\nimage synthesis with paired structural MRI as input condition, through a new\nconstrained diffusion model (CDM). The FICD introduces noise to PET and then\nprogressively removes it with CDM, ensuring high output fidelity throughout a\nstable training phase. The CDM learns to predict denoised PET with a functional\nimaging constraint introduced to ensure voxel-wise alignment between each\ndenoised PET and its ground truth. Quantitative and qualitative analyses\nconducted on 293 subjects with paired T1-weighted MRI and\n18F-fluorodeoxyglucose (FDG)-PET scans suggest that FICD achieves superior\nperformance in generating FDG-PET data compared to state-of-the-art methods. We\nfurther validate the effectiveness of the proposed FICD on data from a total of\n1,262 subjects through three downstream tasks, with experimental results\nsuggesting its utility and generalizability.\n","authors":["Minhui Yu","Mengqi Wu","Ling Yue","Andrea Bozoki","Mingxia Liu"],"pdf_url":"https://arxiv.org/pdf/2405.02504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02497v1","updated":"2024-05-03T22:04:54Z","published":"2024-05-03T22:04:54Z","title":"Prediction techniques for dynamic imaging with online primal-dual\n methods","summary":" Online optimisation facilitates the solution of dynamic inverse problems,\nsuch as image stabilisation, fluid flow monitoring, and dynamic medical\nimaging. In this paper, we improve upon previous work on predictive online\nprimal-dual methods on two fronts. Firstly, we provide a more concise analysis\nthat symmetrises previously unsymmetric regret bounds, and relaxes previous\nrestrictive conditions on the dual predictor. Secondly, based on the latter, we\ndevelop several improved dual predictors. 
We numerically demonstrate their\nefficacy in image stabilisation and dynamic positron emission tomography.\n","authors":["Neil Dizon","Jyrki Jauhiainen","Tuomo Valkonen"],"pdf_url":"https://arxiv.org/pdf/2405.02497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00723v6","updated":"2024-05-03T20:31:08Z","published":"2023-10-01T16:48:48Z","title":"HOH: Markerless Multimodal Human-Object-Human Handover Dataset with\n Large Object Count","summary":" We present the HOH (Human-Object-Human) Handover Dataset, a large object\ncount dataset with 136 objects, to accelerate data-driven research on handover\nstudies, human-robot handover implementation, and artificial intelligence (AI)\non handover parameter estimation from 2D and 3D data of person interactions.\nHOH contains multi-view RGB and depth data, skeletons, fused point clouds,\ngrasp type and handedness labels, object, giver hand, and receiver hand 2D and\n3D segmentations, giver and receiver comfort ratings, and paired object\nmetadata and aligned 3D models for 2,720 handover interactions spanning 136\nobjects and 20 giver-receiver pairs-40 with role-reversal-organized from 40\nparticipants. We also show experimental results of neural networks trained\nusing HOH to perform grasp, orientation, and trajectory prediction. As the only\nfully markerless handover capture dataset, HOH represents natural human-human\nhandover interactions, overcoming challenges with markered datasets that\nrequire specific suiting for body tracking, and lack high-resolution hand\ntracking. To date, HOH is the largest handover dataset in number of objects,\nparticipants, pairs with role reversal accounted for, and total interactions\ncaptured.\n","authors":["Noah Wiederhold","Ava Megyeri","DiMaggio Paris","Sean Banerjee","Natasha Kholgade Banerjee"],"pdf_url":"https://arxiv.org/pdf/2310.00723v6.pdf","comment":"NeurIPS 2023 Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2310.02401v2","updated":"2024-05-03T20:06:17Z","published":"2023-10-03T19:50:08Z","title":"FT-Shield: A Watermark Against Unauthorized Fine-tuning in Text-to-Image\n Diffusion Models","summary":" Text-to-image generative models, especially those based on latent diffusion\nmodels (LDMs), have demonstrated outstanding ability in generating high-quality\nand high-resolution images from textual prompts. With this advancement, various\nfine-tuning methods have been developed to personalize text-to-image models for\nspecific applications such as artistic style adaptation and human face\ntransfer. However, such advancements have raised copyright concerns, especially\nwhen the data are used for personalization without authorization. For example,\na malicious user can employ fine-tuning techniques to replicate the style of an\nartist without consent. In light of this concern, we propose FT-Shield, a\nwatermarking solution tailored for the fine-tuning of text-to-image diffusion\nmodels. FT-Shield addresses copyright protection challenges by designing new\nwatermark generation and detection strategies. In particular, it introduces an\ninnovative algorithm for watermark generation. It ensures the seamless transfer\nof watermarks from training images to generated outputs, facilitating the\nidentification of copyrighted material use. 
To tackle the variability in\nfine-tuning methods and their impact on watermark detection, FT-Shield\nintegrates a Mixture of Experts (MoE) approach for watermark detection.\nComprehensive experiments validate the effectiveness of our proposed FT-Shield.\n","authors":["Yingqian Cui","Jie Ren","Yuping Lin","Han Xu","Pengfei He","Yue Xing","Lingjuan Lyu","Wenqi Fan","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.02401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04346v3","updated":"2024-05-03T19:43:55Z","published":"2024-04-05T18:33:04Z","title":"Koala: Key frame-conditioned long video-LLM","summary":" Long video question answering is a challenging task that involves recognizing\nshort-term activities and reasoning about their fine-grained relationships.\nState-of-the-art video Large Language Models (vLLMs) hold promise as a viable\nsolution due to their demonstrated emergent capabilities on new tasks. However,\ndespite being trained on millions of short seconds-long videos, vLLMs are\nunable to understand minutes-long videos and accurately answer questions about\nthem. To address this limitation, we propose a lightweight and self-supervised\napproach, Key frame-conditioned long video-LLM (Koala), that introduces\nlearnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to\nlonger videos. Our approach introduces two new tokenizers that condition on\nvisual tokens computed from sparse video key frames for understanding short and\nlong video moments. We train our proposed approach on HowTo100M and demonstrate\nits effectiveness on zero-shot long video understanding benchmarks, where it\noutperforms state-of-the-art large models by 3 - 6% in absolute accuracy across\nall tasks. Surprisingly, we also empirically show that our approach not only\nhelps a pretrained vLLM to understand long videos but also improves its\naccuracy on short-term action recognition.\n","authors":["Reuben Tan","Ximeng Sun","Ping Hu","Jui-hsien Wang","Hanieh Deilamsalehy","Bryan A. Plummer","Bryan Russell","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.04346v3.pdf","comment":"Accepted at CVPR 2024 as a poster highlight"},{"id":"http://arxiv.org/abs/2308.11471v5","updated":"2024-05-03T19:05:18Z","published":"2023-08-22T14:36:59Z","title":"Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence\n (DOVESEI)","summary":" This work targets what we consider to be the foundational step for urban\nairborne robots, a safe landing. Our attention is directed toward what we deem\nthe most crucial aspect of the safe landing perception stack: segmentation. We\npresent a streamlined reactive UAV system that employs visual servoing by\nharnessing the capabilities of open vocabulary image segmentation. This\napproach can adapt to various scenarios with minimal adjustments, bypassing the\nnecessity for extensive data accumulation for refining internal models, thanks\nto its open vocabulary methodology. Given the limitations imposed by local\nauthorities, our primary focus centers on operations originating from altitudes\nof 100 meters. This choice is deliberate, as numerous preceding works have\ndealt with altitudes up to 30 meters, aligning with the capabilities of small\nstereo cameras. Consequently, we leave the remaining 20m to be navigated using\nconventional 3D path planning methods. Utilizing monocular cameras and image\nsegmentation, our findings demonstrate the system's capability to successfully\nexecute landing maneuvers at altitudes as low as 20 meters. 
However, this\napproach is vulnerable to intermittent and occasionally abrupt fluctuations in\nthe segmentation between frames in a video stream. To address this challenge,\nwe enhance the image segmentation output by introducing what we call a dynamic\nfocus: a masking mechanism that self adjusts according to the current landing\nstage. This dynamic focus guides the control system to avoid regions beyond the\ndrone's safety radius projected onto the ground, thus mitigating the problems\nwith fluctuations. Through the implementation of this supplementary layer, our\nexperiments have reached improvements in the landing success rate of almost\ntenfold when compared to global segmentation. All the source code is open\nsource and available online (github.com/MISTLab/DOVESEI).\n","authors":["Haechan Mark Bong","Rongge Zhang","Ricardo de Azambuja","Giovanni Beltrame"],"pdf_url":"https://arxiv.org/pdf/2308.11471v5.pdf","comment":"IROS 2023 The Last-Mile Robotics Workshop"},{"id":"http://arxiv.org/abs/2206.07705v2","updated":"2024-05-03T19:00:47Z","published":"2022-06-15T17:57:41Z","title":"LET-3D-AP: Longitudinal Error Tolerant 3D Average Precision for\n Camera-Only 3D Detection","summary":" The 3D Average Precision (3D AP) relies on the intersection over union\nbetween predictions and ground truth objects. However, camera-only detectors\nhave limited depth accuracy, which may cause otherwise reasonable predictions\nthat suffer from such longitudinal localization errors to be treated as false\npositives. We therefore propose variants of the 3D AP metric to be more\npermissive with respect to depth estimation errors. Specifically, our novel\nlongitudinal error tolerant metrics, LET-3D-AP and LET-3D-APL, allow\nlongitudinal localization errors of the prediction boxes up to a given\ntolerance. To evaluate the proposed metrics, we also construct a new test set\nfor the Waymo Open Dataset, tailored to camera-only 3D detection methods.\nSurprisingly, we find that state-of-the-art camera-based detectors can\noutperform popular LiDAR-based detectors with our new metrics past at 10% depth\nerror tolerance, suggesting that existing camera-based detectors already have\nthe potential to surpass LiDAR-based detectors in downstream applications. We\nbelieve the proposed metrics and the new benchmark dataset will facilitate\nadvances in the field of camera-only 3D detection by providing more informative\nsignals that can better indicate the system-level performance.\n","authors":["Wei-Chih Hung","Vincent Casser","Henrik Kretzschmar","Jyh-Jing Hwang","Dragomir Anguelov"],"pdf_url":"https://arxiv.org/pdf/2206.07705v2.pdf","comment":"Find the primary metrics for the 2022 Waymo Open Dataset 3D\n Camera-Only Detection Challenge at\n https://waymo.com/open/challenges/2022/3d-camera-only-detection/ . Find the\n code at https://github.com/waymo-research/waymo-open-dataset"},{"id":"http://arxiv.org/abs/2404.16471v2","updated":"2024-05-03T18:25:39Z","published":"2024-04-25T09:55:35Z","title":"COBRA - COnfidence score Based on shape Regression Analysis for\n method-independent quality assessment of object pose estimation from single\n images","summary":" We present a generic algorithm for scoring pose estimation methods that rely\non single image semantic analysis. The algorithm employs a lightweight putative\nshape representation using a combination of multiple Gaussian Processes. 
Each\nGaussian Process (GP) yields distance normal distributions from multiple\nreference points in the object's coordinate system to its surface, thus\nproviding a geometric evaluation framework for scoring predicted poses. Our\nconfidence measure comprises the average mixture probability of pixel\nback-projections onto the shape template. In the reported experiments, we\ncompare the accuracy of our GP based representation of objects versus the\nactual geometric models and demonstrate the ability of our method to capture\nthe influence of outliers as opposed to the corresponding intrinsic measures\nthat ship with the segmentation and pose estimation methods.\n","authors":["Panagiotis Sapoutzoglou","Georgios Giapitzakis Tzintanos","George Terzakis","Maria Pateraki"],"pdf_url":"https://arxiv.org/pdf/2404.16471v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02386v1","updated":"2024-05-03T17:59:30Z","published":"2024-05-03T17:59:30Z","title":"Rip-NeRF: Anti-aliasing Radiance Fields with Ripmap-Encoded Platonic\n Solids","summary":" Despite significant advancements in Neural Radiance Fields (NeRFs), the\nrenderings may still suffer from aliasing and blurring artifacts, since it\nremains a fundamental challenge to effectively and efficiently characterize\nanisotropic areas induced by the cone-casting procedure. This paper introduces\na Ripmap-Encoded Platonic Solid representation to precisely and efficiently\nfeaturize 3D anisotropic areas, achieving high-fidelity anti-aliasing\nrenderings. Central to our approach are two key components: Platonic Solid\nProjection and Ripmap encoding. The Platonic Solid Projection factorizes the 3D\nspace onto the unparalleled faces of a certain Platonic solid, such that the\nanisotropic 3D areas can be projected onto planes with distinguishable\ncharacterization. Meanwhile, each face of the Platonic solid is encoded by the\nRipmap encoding, which is constructed by anisotropically pre-filtering a\nlearnable feature grid, to enable featurzing the projected anisotropic areas\nboth precisely and efficiently by the anisotropic area-sampling. Extensive\nexperiments on both well-established synthetic datasets and a newly captured\nreal-world dataset demonstrate that our Rip-NeRF attains state-of-the-art\nrendering quality, particularly excelling in the fine details of repetitive\nstructures and textures, while maintaining relatively swift training times.\n","authors":["Junchen Liu","Wenbo Hu","Zhuo Yang","Jianteng Chen","Guoliang Wang","Xiaoxue Chen","Yantong Cai","Huan-ang Gao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.02386v1.pdf","comment":"SIGGRAPH 2024, Project page: https://junchenliu77.github.io/Rip-NeRF\n , Code: https://github.com/JunchenLiu77/Rip-NeRF"},{"id":"http://arxiv.org/abs/2405.02383v1","updated":"2024-05-03T15:47:32Z","published":"2024-05-03T15:47:32Z","title":"A Fresh Look at Sanity Checks for Saliency Maps","summary":" The Model Parameter Randomisation Test (MPRT) is highly recognised in the\neXplainable Artificial Intelligence (XAI) community due to its fundamental\nevaluative criterion: explanations should be sensitive to the parameters of the\nmodel they seek to explain. However, recent studies have raised several\nmethodological concerns for the empirical interpretation of MPRT. In response,\nwe propose two modifications to the original test: Smooth MPRT and Efficient\nMPRT. 
The former reduces the impact of noise on evaluation outcomes via\nsampling, while the latter avoids the need for biased similarity measurements\nby re-interpreting the test through the increase in explanation complexity\nafter full model randomisation. Our experiments show that these modifications\nenhance the metric reliability, facilitating a more trustworthy deployment of\nexplanation methods.\n","authors":["Anna Hedström","Leander Weber","Sebastian Lapuschkin","Marina Höhne"],"pdf_url":"https://arxiv.org/pdf/2405.02383v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.06465"},{"id":"http://arxiv.org/abs/2405.02367v1","updated":"2024-05-03T07:37:50Z","published":"2024-05-03T07:37:50Z","title":"Enhancing Social Media Post Popularity Prediction with Visual Content","summary":" Our study presents a framework for predicting image-based social media\ncontent popularity that focuses on addressing complex image information and a\nhierarchical data structure. We utilize the Google Cloud Vision API to\neffectively extract key image and color information from users' postings,\nachieving 6.8\\% higher accuracy compared to using non-image covariates alone.\nFor prediction, we explore a wide range of prediction models, including Linear\nMixed Model, Support Vector Regression, Multi-layer Perceptron, Random Forest,\nand XGBoost, with linear regression as the benchmark. Our comparative study\ndemonstrates that models that are capable of capturing the underlying nonlinear\ninteractions between covariates outperform other methods.\n","authors":["Dahyun Jeong","Hyelim Son","Yunjin Choi","Keunwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2405.02367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02363v1","updated":"2024-05-03T05:09:54Z","published":"2024-05-03T05:09:54Z","title":"LLM as Dataset Analyst: Subpopulation Structure Discovery with Large\n Language Model","summary":" The distribution of subpopulations is an important property hidden within a\ndataset. Uncovering and analyzing the subpopulation distribution within\ndatasets provides a comprehensive understanding of the datasets, standing as a\npowerful tool beneficial to various downstream tasks, including Dataset\nSubpopulation Organization, Subpopulation Shift, and Slice Discovery. Despite\nits importance, there has been no work that systematically explores the\nsubpopulation distribution of datasets to our knowledge. To address the\nlimitation and solve all the mentioned tasks in a unified way, we introduce a\nnovel concept of subpopulation structures to represent, analyze, and utilize\nsubpopulation distributions within datasets. To characterize the structures in\nan interpretable manner, we propose the Subpopulation Structure Discovery with\nLarge Language Models (SSD-LLM) framework, which employs world knowledge and\ninstruction-following capabilities of Large Language Models (LLMs) to\nlinguistically analyze informative image captions and summarize the structures.\n
Furthermore, we propose\ncomplete workflows to address downstream tasks, named Task-specific Tuning,\nshowcasing the application of the discovered structure to a spectrum of\nsubpopulation-related tasks, including dataset subpopulation organization,\nsubpopulation shift, and slice discovery.\n","authors":["Yulin Luo","Ruichuan An","Bocheng Zou","Yiming Tang","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02363v1.pdf","comment":null}]},"2024-05-06T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.03690v1","updated":"2024-05-06T17:59:45Z","published":"2024-05-06T17:59:45Z","title":"Complex Video Reasoning and Robustness Evaluation Suite for Video-LMMs","summary":" Recent advancements in Large Language Models (LLMs) have led to the\ndevelopment of Video Large Multi-modal Models (Video-LMMs) that can handle a\nwide range of video understanding tasks. These models have the potential to be\ndeployed in real-world applications such as robotics, AI assistants, medical\nimaging, and autonomous vehicles. The widespread adoption of Video-LMMs in our\ndaily lives underscores the importance of ensuring and evaluating their robust\nperformance in mirroring human-like reasoning and interaction capabilities in\ncomplex, real-world contexts. However, existing benchmarks for Video-LMMs\nprimarily focus on general video comprehension abilities and neglect assessing\ntheir reasoning capabilities over complex videos in the real-world context, and\nrobustness of these models through the lens of user prompts as text queries. In\nthis paper, we present the Complex Video Reasoning and Robustness Evaluation\nSuite (CVRR-ES), a novel benchmark that comprehensively assesses the\nperformance of Video-LMMs across 11 diverse real-world video dimensions. We\nevaluate 9 recent models, including both open-source and closed-source\nvariants, and find that most of the Video-LMMs, {especially open-source ones,}\nstruggle with robustness and reasoning when dealing with complex videos. Based\non our analysis, we develop a training-free Dual-Step Contextual Prompting\n(DSCP) technique to enhance the performance of existing Video-LMMs. Our\nfindings provide valuable insights for building the next generation of\nhuman-centric AI systems with advanced robustness and reasoning capabilities.\nOur dataset and code are publicly available at:\nhttps://mbzuai-oryx.github.io/CVRR-Evaluation-Suite/.\n","authors":["Muhammad Uzair Khattak","Muhammad Ferjad Naeem","Jameel Hassan","Muzammal Naseer","Federico Tombari","Fahad Shahbaz Khan","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2405.03690v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2405.03689v1","updated":"2024-05-06T17:59:36Z","published":"2024-05-06T17:59:36Z","title":"Pose Priors from Language Models","summary":" We present a zero-shot pose optimization method that enforces accurate\nphysical contact constraints when estimating the 3D pose of humans. Our central\ninsight is that since language is often used to describe physical interaction,\nlarge pretrained text-based models can act as priors on pose estimation.\n We can thus leverage this insight to improve pose estimation by converting\nnatural language descriptors, generated by a large multimodal model (LMM), into\ntractable losses to constrain the 3D pose optimization. 
Despite its simplicity,\nour method produces surprisingly compelling pose reconstructions of people in\nclose contact, correctly capturing the semantics of the social and physical\ninteractions. We demonstrate that our method rivals more complex\nstate-of-the-art approaches that require expensive human annotation of contact\npoints and training specialized models. Moreover, unlike previous approaches,\nour method provides a unified framework for resolving self-contact and\nperson-to-person contact.\n","authors":["Sanjay Subramanian","Evonne Ng","Lea Müller","Dan Klein","Shiry Ginosar","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2405.03689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03685v1","updated":"2024-05-06T17:57:27Z","published":"2024-05-06T17:57:27Z","title":"Language-Image Models with 3D Understanding","summary":" Multi-modal large language models (MLLMs) have shown incredible capabilities\nin a variety of 2D vision and language tasks. We extend MLLMs' perceptual\ncapabilities to ground and reason about images in 3-dimensional space. To that\nend, we first develop a large-scale pre-training dataset for 2D and 3D called\nLV3D by combining multiple existing 2D and 3D recognition datasets under a\ncommon task formulation: as multi-turn question-answering. Next, we introduce a\nnew MLLM named Cube-LLM and pre-train it on LV3D. We show that pure data\nscaling makes a strong 3D perception capability without 3D specific\narchitectural design or training objective. Cube-LLM exhibits intriguing\nproperties similar to LLMs: (1) Cube-LLM can apply chain-of-thought prompting\nto improve 3D understanding from 2D context information. (2) Cube-LLM can\nfollow complex and diverse instructions and adapt to versatile input and output\nformats. (3) Cube-LLM can be visually prompted such as 2D box or a set of\ncandidate 3D boxes from specialists. Our experiments on outdoor benchmarks\ndemonstrate that Cube-LLM significantly outperforms existing baselines by 21.3\npoints of AP-BEV on the Talk2Car dataset for 3D grounded reasoning and 17.7\npoints on the DriveLM dataset for complex reasoning about driving scenarios,\nrespectively. Cube-LLM also shows competitive results in general MLLM\nbenchmarks such as refCOCO for 2D grounding with (87.0) average score, as well\nas visual question answering benchmarks such as VQAv2, GQA, SQA, POPE, etc. for\ncomplex reasoning. Our project is available at\nhttps://janghyuncho.github.io/Cube-LLM.\n","authors":["Jang Hyun Cho","Boris Ivanovic","Yulong Cao","Edward Schmerling","Yue Wang","Xinshuo Weng","Boyi Li","Yurong You","Philipp Krähenbühl","Yan Wang","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2405.03685v1.pdf","comment":"Project page: https://janghyuncho.github.io/Cube-LLM"},{"id":"http://arxiv.org/abs/2405.03682v1","updated":"2024-05-06T17:57:03Z","published":"2024-05-06T17:57:03Z","title":"An Empty Room is All We Want: Automatic Defurnishing of Indoor Panoramas","summary":" We propose a pipeline that leverages Stable Diffusion to improve inpainting\nresults in the context of defurnishing -- the removal of furniture items from\nindoor panorama images. Specifically, we illustrate how increased context,\ndomain-specific model fine-tuning, and improved image blending can produce\nhigh-fidelity inpaints that are geometrically plausible without needing to rely\non room layout estimation. 
We demonstrate qualitative and quantitative\nimprovements over other furniture removal techniques.\n","authors":["Mira Slavcheva","Dave Gausebeck","Kevin Chen","David Buchhofer","Azwad Sabik","Chen Ma","Sachal Dhillon","Olaf Brandt","Alan Dolhasz"],"pdf_url":"https://arxiv.org/pdf/2405.03682v1.pdf","comment":"Accepted at CVPR 2024 workshops. Project page:\n https://matterport.github.io/automatic-defurnishing-of-indoor-panoramas/"},{"id":"http://arxiv.org/abs/2405.03673v1","updated":"2024-05-06T17:49:31Z","published":"2024-05-06T17:49:31Z","title":"MemoryMamba: Memory-Augmented State Space Model for Defect Recognition","summary":" As automation advances in manufacturing, the demand for precise and\nsophisticated defect detection technologies grows. Existing vision models for\ndefect recognition methods are insufficient for handling the complexities and\nvariations of defects in contemporary manufacturing settings. These models\nespecially struggle in scenarios involving limited or imbalanced defect data.\nIn this work, we introduce MemoryMamba, a novel memory-augmented state space\nmodel (SSM), designed to overcome the limitations of existing defect\nrecognition models. MemoryMamba integrates the state space model with the\nmemory augmentation mechanism, enabling the system to maintain and retrieve\nessential defect-specific information in training. Its architecture is designed\nto capture dependencies and intricate defect characteristics, which are crucial\nfor effective defect detection. In the experiments, MemoryMamba was evaluated\nacross four industrial datasets with diverse defect types and complexities. The\nmodel consistently outperformed other methods, demonstrating its capability to\nadapt to various defect recognition scenarios.\n","authors":["Qianning Wang","He Hu","Yucheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.03673v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.03662v1","updated":"2024-05-06T17:39:53Z","published":"2024-05-06T17:39:53Z","title":"Diffeomorphic Template Registration for Atmospheric Turbulence\n Mitigation","summary":" We describe a method for recovering the irradiance underlying a collection of\nimages corrupted by atmospheric turbulence. Since supervised data is often\ntechnically impossible to obtain, assumptions and biases have to be imposed to\nsolve this inverse problem, and we choose to model them explicitly. Rather than\ninitializing a latent irradiance (\"template\") by heuristics to estimate\ndeformation, we select one of the images as a reference, and model the\ndeformation in this image by the aggregation of the optical flow from it to\nother images, exploiting a prior imposed by Central Limit Theorem. Then with a\nnovel flow inversion module, the model registers each image TO the template but\nWITHOUT the template, avoiding artifacts related to poor template\ninitialization. To illustrate the robustness of the method, we simply (i)\nselect the first frame as the reference and (ii) use the simplest optical flow\nto estimate the warpings, yet the improvement in registration is decisive in\nthe final reconstruction, as we achieve state-of-the-art performance despite\nits simplicity. 
The method establishes a strong baseline that can be further\nimproved by integrating it seamlessly into more sophisticated pipelines, or\nwith domain-specific methods if so desired.\n","authors":["Dong Lao","Congli Wang","Alex Wong","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2405.03662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03660v1","updated":"2024-05-06T17:37:23Z","published":"2024-05-06T17:37:23Z","title":"CICA: Content-Injected Contrastive Alignment for Zero-Shot Document\n Image Classification","summary":" Zero-shot learning has been extensively investigated in the broader field of\nvisual recognition, attracting significant interest recently. However, the\ncurrent work on zero-shot learning in document image classification remains\nscarce. The existing studies either focus exclusively on zero-shot inference,\nor their evaluation does not align with the established criteria of zero-shot\nevaluation in the visual recognition domain. We provide a comprehensive\ndocument image classification analysis in Zero-Shot Learning (ZSL) and\nGeneralized Zero-Shot Learning (GZSL) settings to address this gap. Our\nmethodology and evaluation align with the established practices of this domain.\nAdditionally, we propose zero-shot splits for the RVL-CDIP dataset.\nFurthermore, we introduce CICA (pronounced 'ki-ka'), a framework that enhances\nthe zero-shot learning capabilities of CLIP. CICA consists of a novel 'content\nmodule' designed to leverage any generic document-related textual information.\nThe discriminative features extracted by this module are aligned with CLIP's\ntext and image features using a novel 'coupled-contrastive' loss. Our module\nimproves CLIP's ZSL top-1 accuracy by 6.7% and GZSL harmonic mean by 24% on the\nRVL-CDIP dataset. Our module is lightweight and adds only 3.3% more parameters\nto CLIP. Our work sets the direction for future research in zero-shot document\nclassification.\n","authors":["Sankalp Sinha","Muhammad Saif Ullah Khan","Talha Uddin Sheikh","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2405.03660v1.pdf","comment":"18 Pages, 4 Figures and Accepted in ICDAR 2024"},{"id":"http://arxiv.org/abs/2405.03659v1","updated":"2024-05-06T17:36:44Z","published":"2024-05-06T17:36:44Z","title":"A Construct-Optimize Approach to Sparse View Synthesis without Camera\n Pose","summary":" Novel view synthesis from a sparse set of input images is a challenging\nproblem of great practical interest, especially when camera poses are absent or\ninaccurate. Direct optimization of camera poses and usage of estimated depths\nin neural radiance field algorithms usually do not produce good results because\nof the coupling between poses and depths, and inaccuracies in monocular depth\nestimation. In this paper, we leverage the recent 3D Gaussian splatting method\nto develop a novel construct-and-optimize method for sparse view synthesis\nwithout camera poses. Specifically, we construct a solution progressively by\nusing monocular depth and projecting pixels back into the 3D world. During\nconstruction, we optimize the solution by detecting 2D correspondences between\ntraining views and the corresponding rendered images. We develop a unified\ndifferentiable pipeline for camera registration and adjustment of both camera\nposes and depths, followed by back-projection. We also introduce a novel notion\nof an expected surface in Gaussian splatting, which is critical to our\noptimization. 
These steps enable a coarse solution, which can then be low-pass\nfiltered and refined using standard optimization methods. We demonstrate\nresults on the Tanks and Temples and Static Hikes datasets with as few as three\nwidely-spaced views, showing significantly better quality than competing\nmethods, including those with approximate camera pose information. Moreover,\nour results improve with more views and outperform previous InstantNGP and\nGaussian Splatting algorithms even when using half the dataset.\n","authors":["Kaiwen Jiang","Yang Fu","Mukund Varma T","Yash Belhe","Xiaolong Wang","Hao Su","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2405.03659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15064v2","updated":"2024-05-06T17:28:20Z","published":"2024-03-22T09:46:11Z","title":"Recent Trends in 3D Reconstruction of General Non-Rigid Scenes","summary":" Reconstructing models of the real world, including 3D geometry, appearance,\nand motion of real scenes, is essential for computer graphics and computer\nvision. It enables the synthesizing of photorealistic novel views, useful for\nthe movie industry and AR/VR applications. It also facilitates the content\ncreation necessary in computer games and AR/VR by avoiding laborious manual\ndesign processes. Further, such models are fundamental for intelligent\ncomputing systems that need to interpret real-world scenes and actions to act\nand interact safely with the human world. Notably, the world surrounding us is\ndynamic, and reconstructing models of dynamic, non-rigidly moving scenes is a\nseverely underconstrained and challenging problem. This state-of-the-art report\n(STAR) offers the reader a comprehensive summary of state-of-the-art techniques\nwith monocular and multi-view inputs such as data from RGB and RGB-D sensors,\namong others, conveying an understanding of different approaches, their\npotential applications, and promising further research directions. The report\ncovers 3D reconstruction of general non-rigid scenes and further addresses the\ntechniques for scene decomposition, editing and controlling, and generalizable\nand generative modeling. More specifically, we first review the common and\nfundamental concepts necessary to understand and navigate the field and then\ndiscuss the state-of-the-art techniques by reviewing recent approaches that use\ntraditional and machine-learning-based neural representations, including a\ndiscussion on the newly enabled applications. The STAR is concluded with a\ndiscussion of the remaining limitations and open challenges.\n","authors":["Raza Yunus","Jan Eric Lenssen","Michael Niemeyer","Yiyi Liao","Christian Rupprecht","Christian Theobalt","Gerard Pons-Moll","Jia-Bin Huang","Vladislav Golyanik","Eddy Ilg"],"pdf_url":"https://arxiv.org/pdf/2403.15064v2.pdf","comment":"42 pages, 18 figures, 5 tables; State-of-the-Art Report at\n EUROGRAPHICS 2024. Project page: https://razayunus.github.io/non-rigid-star"},{"id":"http://arxiv.org/abs/2405.03652v1","updated":"2024-05-06T17:23:42Z","published":"2024-05-06T17:23:42Z","title":"Field-of-View Extension for Diffusion MRI via Deep Generative Models","summary":" Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of\nwhole-brain tissue microstructure and connectivity can be severely impeded by\nan incomplete field-of-view (FOV). This work aims to develop a method for\nimputing the missing slices directly from existing dMRI scans with an\nincomplete FOV. 
We hypothesize that the imputed image with complete FOV can\nimprove the whole-brain tractography for corrupted data with incomplete FOV.\nTherefore, our approach provides a desirable alternative to discarding the\nvaluable dMRI data, enabling subsequent tractography analyses that would\notherwise be challenging or unattainable with corrupted data. Approach: We\npropose a framework based on a deep generative model that estimates the absent\nbrain regions in dMRI scans with incomplete FOV. The model is capable of\nlearning both the diffusion characteristics in diffusion-weighted images (DWI)\nand the anatomical features evident in the corresponding structural images for\nefficiently imputing missing slices of DWI outside of incomplete FOV. Results:\nFor evaluating the imputed slices, on the WRAP dataset the proposed framework\nachieved PSNRb0=22.397, SSIMb0=0.905, PSNRb1300=22.479, SSIMb1300=0.893; on the\nNACC dataset it achieved PSNRb0=21.304, SSIMb0=0.892, PSNRb1300=21.599,\nSSIMb1300= 0.877. The proposed framework improved the tractography accuracy, as\ndemonstrated by an increased average Dice score for 72 tracts (p < 0.001) on\nboth the WRAP and NACC datasets. Conclusions: Results suggest that the proposed\nframework achieved sufficient imputation performance in dMRI data with\nincomplete FOV for improving whole-brain tractography, thereby repairing the\ncorrupted data. Our approach achieved more accurate whole-brain tractography\nresults with extended and complete FOV and reduced the uncertainty when\nanalyzing bundles associated with Alzheimer's Disease.\n","authors":["Chenyu Gao","Shunxing Bao","Michael Kim","Nancy Newlin","Praitayini Kanakaraj","Tianyuan Yao","Gaurav Rudravaram","Yuankai Huo","Daniel Moyer","Kurt Schilling","Walter Kukull","Arthur Toga","Derek Archer","Timothy Hohman","Bennett Landman","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2405.03652v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.03650v1","updated":"2024-05-06T17:14:09Z","published":"2024-05-06T17:14:09Z","title":"Generated Contents Enrichment","summary":" In this paper, we investigate a novel artificial intelligence generation\ntask, termed as generated contents enrichment (GCE). Different from\nconventional artificial intelligence contents generation task that enriches the\ngiven textual description implicitly with limited semantics for generating\nvisually real content, our proposed GCE strives to perform content enrichment\nexplicitly on both the visual and textual domain, from which the enriched\ncontents are visually real, structurally reasonable, and semantically abundant.\nTowards to solve GCE, we propose a deep end-to-end method that explicitly\nexplores the semantics and inter-semantic relationships during the enrichment.\nSpecifically, we first model the input description as a semantic graph, wherein\neach node represents an object and each edge corresponds to the inter-object\nrelationship. We then adopt Graph Convolutional Networks on top of the input\nscene description to predict the enriching objects and their relationships with\nthe input objects. Finally, the enriched graph is fed into an image synthesis\nmodel to carry out the visual contents generation. 
Our experiments conducted on\nthe Visual Genome dataset exhibit promising and visually plausible results.\n","authors":["Mahdi Naseri","Jiayan Qiu","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03649v1","updated":"2024-05-06T17:12:21Z","published":"2024-05-06T17:12:21Z","title":"Learning Robust Classifiers with Self-Guided Spurious Correlation\n Mitigation","summary":" Deep neural classifiers tend to rely on spurious correlations between\nspurious attributes of inputs and targets to make predictions, which could\njeopardize their generalization capability. Training classifiers robust to\nspurious correlations typically relies on annotations of spurious correlations\nin data, which are often expensive to get. In this paper, we tackle an\nannotation-free setting and propose a self-guided spurious correlation\nmitigation framework. Our framework automatically constructs fine-grained\ntraining labels tailored for a classifier obtained with empirical risk\nminimization to improve its robustness against spurious correlations. The\nfine-grained training labels are formulated with different prediction behaviors\nof the classifier identified in a novel spuriousness embedding space. We\nconstruct the space with automatically detected conceptual attributes and a\nnovel spuriousness metric which measures how likely a class-attribute\ncorrelation is exploited for predictions. We demonstrate that training the\nclassifier to distinguish different prediction behaviors reduces its reliance\non spurious correlations without knowing them a priori and outperforms prior\nmethods on five real-world datasets.\n","authors":["Guangtao Zheng","Wenqian Ye","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03649v1.pdf","comment":"Accepted to IJCAI 2024"},{"id":"http://arxiv.org/abs/2405.03643v1","updated":"2024-05-06T17:06:32Z","published":"2024-05-06T17:06:32Z","title":"Collecting Consistently High Quality Object Tracks with Minimal Human\n Involvement by Using Self-Supervised Learning to Detect Tracker Errors","summary":" We propose a hybrid framework for consistently producing high-quality object\ntracks by combining an automated object tracker with little human input. The\nkey idea is to tailor a module for each dataset to intelligently decide when an\nobject tracker is failing and so humans should be brought in to re-localize an\nobject for continued tracking. Our approach leverages self-supervised learning\non unlabeled videos to learn a tailored representation for a target object that\nis then used to actively monitor its tracked region and decide when the tracker\nfails. Since labeled data is not needed, our approach can be applied to novel\nobject categories. 
Experiments on three datasets demonstrate our method\noutperforms existing approaches, especially for small, fast moving, or occluded\nobjects.\n","authors":["Samreen Anjum","Suyog Jain","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2405.03643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03642v1","updated":"2024-05-06T17:06:11Z","published":"2024-05-06T17:06:11Z","title":"Classification of Breast Cancer Histopathology Images using a Modified\n Supervised Contrastive Learning Method","summary":" Deep neural networks have reached remarkable achievements in medical image\nprocessing tasks, specifically classifying and detecting various diseases.\nHowever, when confronted with limited data, these networks face a critical\nvulnerability, often succumbing to overfitting by excessively memorizing the\nlimited information available. This work addresses the challenge mentioned\nabove by improving the supervised contrastive learning method to reduce the\nimpact of false positives. Unlike most existing methods that rely predominantly\non fully supervised learning, our approach leverages the advantages of\nself-supervised learning in conjunction with employing the available labeled\ndata. We evaluate our method on the BreakHis dataset, which consists of breast\ncancer histopathology images, and demonstrate an increase in classification\naccuracy by 1.45% at the image level and 1.42% at the patient level compared to\nthe state-of-the-art method. This improvement corresponds to 93.63% absolute\naccuracy, highlighting our approach's effectiveness in leveraging data\nproperties to learn more appropriate representation space.\n","authors":["Matina Mahdizadeh Sani","Ali Royat","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2405.03642v1.pdf","comment":"16 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.03633v1","updated":"2024-05-06T16:50:42Z","published":"2024-05-06T16:50:42Z","title":"Neural Graph Mapping for Dense SLAM with Efficient Loop Closure","summary":" Existing neural field-based SLAM methods typically employ a single monolithic\nfield as their scene representation. This prevents efficient incorporation of\nloop closure constraints and limits scalability. To address these shortcomings,\nwe propose a neural mapping framework which anchors lightweight neural fields\nto the pose graph of a sparse visual SLAM system. Our approach shows the\nability to integrate large-scale loop closures, while limiting necessary\nreintegration. Furthermore, we verify the scalability of our approach by\ndemonstrating successful building-scale mapping taking multiple loop closures\ninto account during the optimization, and show that our method outperforms\nexisting state-of-the-art approaches on large scenes in terms of quality and\nruntime. Our code is available at\nhttps://kth-rpl.github.io/neural_graph_mapping/.\n","authors":["Leonard Bruns","Jun Zhang","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2405.03633v1.pdf","comment":"Project page: https://kth-rpl.github.io/neural_graph_mapping/"},{"id":"http://arxiv.org/abs/2404.01568v2","updated":"2024-05-06T16:49:30Z","published":"2024-04-02T02:01:21Z","title":"A Linear Time and Space Local Point Cloud Geometry Encoder via\n Vectorized Kernel Mixture (VecKM)","summary":" We propose VecKM, a local point cloud geometry encoder that is descriptive\nand efficient to compute. VecKM leverages a unique approach by vectorizing a\nkernel mixture to represent the local point cloud. 
Such representation's\ndescriptiveness is supported by two theorems that validate its ability to\nreconstruct and preserve the similarity of the local shape. Unlike existing\nencoders downsampling the local point cloud, VecKM constructs the local\ngeometry encoding using all neighboring points, producing a more descriptive\nencoding.\n Moreover, VecKM is efficient to compute and scalable to large point cloud\ninputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and\nreduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$\nis the size of the point cloud, $K$ is the neighborhood size, $d$ is the\nencoding dimension, and $p$ is a marginal factor. The efficiency is due to\nVecKM's unique factorizable property that eliminates the need of explicitly\ngrouping points into neighbors.\n In the normal estimation task, VecKM demonstrates not only 100x faster\ninference speed but also highest accuracy and strongest robustness. In\nclassification and segmentation tasks, integrating VecKM as a preprocessing\nmodule achieves consistently better performance than the PointNet, PointNet++,\nand point transformer baselines, and runs consistently faster by up to 10\ntimes.\n","authors":["Dehao Yuan","Cornelia Fermüller","Tahseen Rabbani","Furong Huang","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2404.01568v2.pdf","comment":"ICML2024 Conference Paper"},{"id":"http://arxiv.org/abs/2307.16033v2","updated":"2024-05-06T16:32:44Z","published":"2023-07-29T17:45:27Z","title":"CoVid-19 Detection leveraging Vision Transformers and Explainable AI","summary":" Lung disease is a common health problem in many parts of the world. It is a\nsignificant risk to people health and quality of life all across the globe\nsince it is responsible for five of the top thirty leading causes of death.\nAmong them are COVID 19, pneumonia, and tuberculosis, to name just a few. It is\ncritical to diagnose lung diseases in their early stages. Several different\nmodels including machine learning and image processing have been developed for\nthis purpose. The earlier a condition is diagnosed, the better the patient\nchances of making a full recovery and surviving into the long term. Thanks to\ndeep learning algorithms, there is significant promise for the autonomous,\nrapid, and accurate identification of lung diseases based on medical imaging.\nSeveral different deep learning strategies, including convolutional neural\nnetworks (CNN), vanilla neural networks, visual geometry group based networks\n(VGG), and capsule networks , are used for the goal of making lung disease\nforecasts. The standard CNN has a poor performance when dealing with rotated,\ntilted, or other aberrant picture orientations. As a result of this, within the\nscope of this study, we have suggested a vision transformer based approach end\nto end framework for the diagnosis of lung disorders. In the architecture, data\naugmentation, training of the suggested models, and evaluation of the models\nare all included. For the purpose of detecting lung diseases such as pneumonia,\nCovid 19, lung opacity, and others, a specialised Compact Convolution\nTransformers (CCT) model have been tested and evaluated on datasets such as the\nCovid 19 Radiography Database. 
The model has achieved a better accuracy for\nboth its training and validation purposes on the Covid 19 Radiography Database.\n","authors":["Pangoth Santhosh Kumar","Kundrapu Supriya","Mallikharjuna Rao K","Taraka Satya Krishna Teja Malisetti"],"pdf_url":"https://arxiv.org/pdf/2307.16033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03613v1","updated":"2024-05-06T16:31:19Z","published":"2024-05-06T16:31:19Z","title":"Dual Relation Mining Network for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) aims to recognize novel classes through transferring\nshared semantic knowledge (e.g., attributes) from seen classes to unseen\nclasses. Recently, attention-based methods have exhibited significant progress\nwhich align visual features and attributes via a spatial attention mechanism.\nHowever, these methods only explore visual-semantic relationship in the spatial\ndimension, which can lead to classification ambiguity when different attributes\nshare similar attention regions, and semantic relationship between attributes\nis rarely discussed. To alleviate the above problems, we propose a Dual\nRelation Mining Network (DRMN) to enable more effective visual-semantic\ninteractions and learn semantic relationship among attributes for knowledge\ntransfer. Specifically, we introduce a Dual Attention Block (DAB) for\nvisual-semantic relationship mining, which enriches visual information by\nmulti-level feature fusion and conducts spatial attention for visual to\nsemantic embedding. Moreover, an attribute-guided channel attention is utilized\nto decouple entangled semantic features. For semantic relationship modeling, we\nutilize a Semantic Interaction Transformer (SIT) to enhance the generalization\nof attribute representations among images. Additionally, a global\nclassification branch is introduced as a complement to human-defined semantic\nattributes, and we then combine the results with attribute-based\nclassification. Extensive experiments demonstrate that the proposed DRMN leads\nto new state-of-the-art performances on three standard ZSL benchmarks, i.e.,\nCUB, SUN, and AwA2.\n","authors":["Jinwei Han","Yingguo Gao","Zhiwen Lin","Ke Yan","Shouhong Ding","Yuan Gao","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2405.03613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11565v2","updated":"2024-05-06T16:29:15Z","published":"2024-04-17T17:08:05Z","title":"MoA: Mixture-of-Attention for Subject-Context Disentanglement in\n Personalized Image Generation","summary":" We introduce a new architecture for personalization of text-to-image\ndiffusion models, coined Mixture-of-Attention (MoA). Inspired by the\nMixture-of-Experts mechanism utilized in large language models (LLMs), MoA\ndistributes the generation workload between two attention pathways: a\npersonalized branch and a non-personalized prior branch. MoA is designed to\nretain the original model's prior by fixing its attention layers in the prior\nbranch, while minimally intervening in the generation process with the\npersonalized branch that learns to embed subjects in the layout and context\ngenerated by the prior branch. A novel routing mechanism manages the\ndistribution of pixels in each layer across these branches to optimize the\nblend of personalized and generic content creation. Once trained, MoA\nfacilitates the creation of high-quality, personalized images featuring\nmultiple subjects with compositions and interactions as diverse as those\ngenerated by the original model. 
Crucially, MoA enhances the distinction\nbetween the model's pre-existing capability and the newly augmented\npersonalized intervention, thereby offering a more disentangled subject-context\ncontrol that was previously unattainable. Project page:\nhttps://snap-research.github.io/mixture-of-attention\n","authors":["Kuan-Chieh Wang","Daniil Ostashev","Yuwei Fang","Sergey Tulyakov","Kfir Aberman"],"pdf_url":"https://arxiv.org/pdf/2404.11565v2.pdf","comment":"Project Website:\n https://snap-research.github.io/mixture-of-attention, Same as previous\n version, only updated metadata because bib was missing an author name"},{"id":"http://arxiv.org/abs/2405.01673v2","updated":"2024-05-06T16:24:08Z","published":"2024-05-02T18:59:53Z","title":"ShadowNav: Autonomous Global Localization for Lunar Navigation in\n Darkness","summary":" The ability to determine the pose of a rover in an inertial frame\nautonomously is a crucial capability necessary for the next generation of\nsurface rover missions on other planetary bodies. Currently, most on-going\nrover missions utilize ground-in-the-loop interventions to manually correct for\ndrift in the pose estimate and this human supervision bottlenecks the distance\nover which rovers can operate autonomously and carry out scientific\nmeasurements. In this paper, we present ShadowNav, an autonomous approach for\nglobal localization on the Moon with an emphasis on driving in darkness and at\nnighttime. Our approach uses the leading edge of Lunar craters as landmarks and\na particle filtering approach is used to associate detected craters with known\nones on an offboard map. We discuss the key design decisions in developing the\nShadowNav framework for use with a Lunar rover concept equipped with a stereo\ncamera and an external illumination source. Finally, we demonstrate the\nefficacy of our proposed approach in both a Lunar simulation environment and on\ndata collected during a field test at Cinder Lakes, Arizona.\n","authors":["Deegan Atha","R. Michael Swan","Abhishek Cauligi","Anne Bettens","Edwin Goh","Dima Kogan","Larry Matthies","Masahiro Ono"],"pdf_url":"https://arxiv.org/pdf/2405.01673v2.pdf","comment":"21 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.03191v2","updated":"2024-05-06T16:18:14Z","published":"2024-04-04T04:22:50Z","title":"CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception\n Tasks","summary":" Numerous roadside perception datasets have been introduced to propel\nadvancements in autonomous driving and intelligent transportation systems\nresearch and development. However, it has been observed that the majority of\ntheir concentrates is on urban arterial roads, inadvertently overlooking\nresidential areas such as parks and campuses that exhibit entirely distinct\ncharacteristics. In light of this gap, we propose CORP, which stands as the\nfirst public benchmark dataset tailored for multi-modal roadside perception\ntasks under campus scenarios. Collected in a university campus, CORP consists\nof over 205k images plus 102k point clouds captured from 18 cameras and 9 LiDAR\nsensors. These sensors with different configurations are mounted on roadside\nutility poles to provide diverse viewpoints within the campus region. 
The\nannotations of CORP encompass multi-dimensional information beyond 2D and 3D\nbounding boxes, providing extra support for 3D seamless tracking and instance\nsegmentation with unique IDs and pixel masks for identifying targets, to\nenhance the understanding of objects and their behaviors distributed across the\ncampus premises. Unlike other roadside datasets about urban traffic, CORP\nextends the spectrum to highlight the challenges for multi-modal perception in\ncampuses and other residential areas.\n","authors":["Beibei Wang","Shuang Meng","Lu Zhang","Chenjie Wang","Jingjing Huang","Yao Li","Haojie Ren","Yuxuan Xiao","Yuru Peng","Jianmin Ji","Yu Zhang","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15957v3","updated":"2024-05-06T16:15:50Z","published":"2023-05-25T11:55:38Z","title":"DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D\n Classification","summary":" Large pre-trained models have had a significant impact on computer vision by\nenabling multi-modal learning, where the CLIP model has achieved impressive\nresults in image classification, object detection, and semantic segmentation.\nHowever, the model's performance on 3D point cloud processing tasks is limited\ndue to the domain gap between depth maps from 3D projection and training images\nof CLIP. This paper proposes DiffCLIP, a new pre-training framework that\nincorporates stable diffusion with ControlNet to minimize the domain gap in the\nvisual branch. Additionally, a style-prompt generation module is introduced for\nfew-shot tasks in the textual branch. Extensive experiments on the ModelNet10,\nModelNet40, and ScanObjectNN datasets show that DiffCLIP has strong abilities\nfor 3D understanding. By using stable diffusion and style-prompt generation,\nDiffCLIP achieves an accuracy of 43.2\\% for zero-shot classification on OBJ\\_BG\nof ScanObjectNN, which is state-of-the-art performance, and an accuracy of\n80.6\\% for zero-shot classification on ModelNet10, which is comparable to\nstate-of-the-art performance.\n","authors":["Sitian Shen","Zilin Zhu","Linqian Fan","Harry Zhang","Xinxiao Wu"],"pdf_url":"https://arxiv.org/pdf/2305.15957v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10638v2","updated":"2024-05-06T16:14:23Z","published":"2023-08-21T11:23:25Z","title":"SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed\n and Textured Human Meshes","summary":" We present SCULPT, a novel 3D generative model for clothed and textured 3D\nmeshes of humans. Specifically, we devise a deep neural network that learns to\nrepresent the geometry and appearance distribution of clothed human bodies.\nTraining such a model is challenging, as datasets of textured 3D meshes for\nhumans are limited in size and accessibility. Our key observation is that there\nexist medium-sized 3D scan datasets like CAPE, as well as large-scale 2D image\ndatasets of clothed humans and multiple appearances can be mapped to a single\ngeometry. To effectively learn from the two data modalities, we propose an\nunpaired learning procedure for pose-dependent clothed and textured human\nmeshes. Specifically, we learn a pose-dependent geometry space from 3D scan\ndata. We represent this as per vertex displacements w.r.t. the SMPL model.\nNext, we train a geometry conditioned texture generator in an unsupervised way\nusing the 2D image data. We use intermediate activations of the learned\ngeometry model to condition our texture generator. 
To alleviate entanglement\nbetween pose and clothing type, and pose and clothing appearance, we condition\nboth the texture and geometry generators with attribute labels such as clothing\ntypes for the geometry, and clothing colors for the texture generator. We\nautomatically generated these conditioning labels for the 2D images based on\nthe visual question answering model BLIP and CLIP. We validate our method on\nthe SCULPT dataset, and compare to state-of-the-art 3D generative models for\nclothed human bodies. Our code and data can be found at\nhttps://sculpt.is.tue.mpg.de.\n","authors":["Soubhik Sanyal","Partha Ghosh","Jinlong Yang","Michael J. Black","Justus Thies","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2308.10638v2.pdf","comment":"Updated to camera ready version of CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10522v2","updated":"2024-05-06T16:13:34Z","published":"2023-10-25T20:39:07Z","title":"Ordinal Classification with Distance Regularization for Robust Brain Age\n Prediction","summary":" Age is one of the major known risk factors for Alzheimer's Disease (AD).\nDetecting AD early is crucial for effective treatment and preventing\nirreversible brain damage. Brain age, a measure derived from brain imaging\nreflecting structural changes due to aging, may have the potential to identify\nAD onset, assess disease risk, and plan targeted interventions. Deep\nlearning-based regression techniques to predict brain age from magnetic\nresonance imaging (MRI) scans have shown great accuracy recently. However,\nthese methods are subject to an inherent regression to the mean effect, which\ncauses a systematic bias resulting in an overestimation of brain age in young\nsubjects and underestimation in old subjects. This weakens the reliability of\npredicted brain age as a valid biomarker for downstream clinical applications.\nHere, we reformulate the brain age prediction task from regression to\nclassification to address the issue of systematic bias. Recognizing the\nimportance of preserving ordinal information from ages to understand aging\ntrajectory and monitor aging longitudinally, we propose a novel ORdinal\nDistance Encoded Regularization (ORDER) loss that incorporates the order of age\nlabels, enhancing the model's ability to capture age-related patterns.\nExtensive experiments and ablation studies demonstrate that this framework\nreduces systematic bias, outperforms state-of-art methods by statistically\nsignificant margins, and can better capture subtle differences between clinical\ngroups in an independent AD dataset. Our implementation is publicly available\nat https://github.com/jaygshah/Robust-Brain-Age-Prediction.\n","authors":["Jay Shah","Md Mahfuzur Rahman Siddiquee","Yi Su","Teresa Wu","Baoxin Li"],"pdf_url":"https://arxiv.org/pdf/2403.10522v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2310.12973v2","updated":"2024-05-06T15:45:30Z","published":"2023-10-19T17:59:05Z","title":"Frozen Transformers in Language Models Are Effective Visual Encoder\n Layers","summary":" This paper reveals that large language models (LLMs), despite being trained\nsolely on textual data, are surprisingly strong encoders for purely visual\ntasks in the absence of language. Even more intriguingly, this can be achieved\nby a simple yet previously overlooked strategy -- employing a frozen\ntransformer block from pre-trained LLMs as a constituent encoder layer to\ndirectly process visual tokens. 
Our work pushes the boundaries of leveraging\nLLMs for computer vision tasks, significantly departing from conventional\npractices that typically necessitate a multi-modal vision-language setup with\nassociated language prompts, inputs, or outputs. We demonstrate that our\napproach consistently enhances performance across a diverse range of tasks,\nencompassing pure 2D and 3D visual recognition tasks (e.g., image and point\ncloud classification), temporal modeling tasks (e.g., action recognition),\nnon-semantic tasks (e.g., motion forecasting), and multi-modal tasks (e.g.,\n2D/3D visual question answering and image-text retrieval). Such improvements\nare a general phenomenon, applicable to various types of LLMs (e.g., LLaMA and\nOPT) and different LLM transformer blocks. We additionally propose the\ninformation filtering hypothesis to explain the effectiveness of pre-trained\nLLMs in visual encoding -- the pre-trained LLM transformer blocks discern\ninformative visual tokens and further amplify their effect. This hypothesis is\nempirically supported by the observation that the feature activation, after\ntraining with LLM transformer blocks, exhibits a stronger focus on relevant\nregions. We hope that our work inspires new perspectives on utilizing LLMs and\ndeepening our understanding of their underlying mechanisms. Code is available\nat https://github.com/ziqipang/LM4VisualEncoding.\n","authors":["Ziqi Pang","Ziyang Xie","Yunze Man","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.12973v2.pdf","comment":"ICLR 2024 Spotlight. 23 pages, 13 figures. Code at\n https://github.com/ziqipang/LM4VisualEncoding"},{"id":"http://arxiv.org/abs/2405.03565v1","updated":"2024-05-06T15:38:32Z","published":"2024-05-06T15:38:32Z","title":"Liberating Seen Classes: Boosting Few-Shot and Zero-Shot Text\n Classification via Anchor Generation and Classification Reframing","summary":" Few-shot and zero-shot text classification aim to recognize samples from\nnovel classes with limited labeled samples or no labeled samples at all. While\nprevailing methods have shown promising performance via transferring knowledge\nfrom seen classes to unseen classes, they are still limited by (1) Inherent\ndissimilarities among classes make the transformation of features learned from\nseen classes to unseen classes both difficult and inefficient. (2) Rare labeled\nnovel samples usually cannot provide enough supervision signals to enable the\nmodel to adjust from the source distribution to the target distribution,\nespecially for complicated scenarios. To alleviate the above issues, we propose\na simple and effective strategy for few-shot and zero-shot text classification.\nWe aim to liberate the model from the confines of seen classes, thereby\nenabling it to predict unseen categories without the necessity of training on\nseen classes. Specifically, for mining more related unseen category knowledge,\nwe utilize a large pre-trained language model to generate pseudo novel samples,\nand select the most representative ones as category anchors. After that, we\nconvert the multi-class classification task into a binary classification task\nand use the similarities of query-anchor pairs for prediction to fully leverage\nthe limited supervision signals. 
Extensive experiments on six widely used\npublic datasets show that our proposed method can outperform other strong\nbaselines significantly in few-shot and zero-shot tasks, even without using any\nseen class samples.\n","authors":["Han Liu","Siyang Zhao","Xiaotong Zhang","Feng Zhang","Wei Wang","Fenglong Ma","Hongyang Chen","Hong Yu","Xianchao Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03565v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2311.10093v3","updated":"2024-05-06T15:38:26Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, these models struggle with generation\nof consistent characters, a crucial aspect for numerous real-world applications\nsuch as story visualization, game development asset design, advertising, and\nmore. Current methods typically rely on multiple pre-existing images of the\ntarget character or involve labor-intensive manual processes. In this work, we\npropose a fully automated solution for consistent character generation, with\nthe sole input being a text prompt. We introduce an iterative procedure that,\nat each stage, identifies a coherent set of images sharing a similar identity\nand extracts a more consistent identity from this set. Our quantitative\nanalysis demonstrates that our method strikes a better balance between prompt\nalignment and identity consistency compared to the baseline methods, and these\nfindings are reinforced by a user study. To conclude, we showcase several\npractical applications of our approach. Project page is available at\nhttps://omriavrahami.com/the-chosen-one\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v3.pdf","comment":"Accepted to SIGGRAPH 2024. Project page is available at\n https://omriavrahami.com/the-chosen-one"},{"id":"http://arxiv.org/abs/2404.12678v2","updated":"2024-05-06T15:16:50Z","published":"2024-04-19T07:24:32Z","title":"Exploring Interactive Semantic Alignment for Efficient HOI Detection\n with Vision-language Model","summary":" Human-Object Interaction (HOI) detection aims to localize human-object pairs\nand comprehend their interactions. Recently, two-stage transformer-based\nmethods have demonstrated competitive performance. However, these methods\nfrequently focus on object appearance features and ignore global contextual\ninformation. Besides, vision-language model CLIP which effectively aligns\nvisual and text embeddings has shown great potential in zero-shot HOI\ndetection. Based on the former facts, We introduce a novel HOI detector named\nISA-HOI, which extensively leverages knowledge from CLIP, aligning interactive\nsemantics between visual and textual features. We first extract global context\nof image and local features of object to Improve interaction Features in images\n(IF). On the other hand, we propose a Verb Semantic Improvement (VSI) module to\nenhance textual features of verb labels via cross-modal fusion. 
Ultimately, our\nmethod achieves competitive results on the HICO-DET and V-COCO benchmarks with\nmuch fewer training epochs, and outperforms the state-of-the-art under\nzero-shot settings.\n","authors":["Jihao Dong","Renjie Pan","Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12678v2.pdf","comment":"There are issues with the experimental results"},{"id":"http://arxiv.org/abs/2305.12844v2","updated":"2024-05-06T15:16:49Z","published":"2023-05-22T09:08:59Z","title":"An Optimized Ensemble Deep Learning Model For Brain Tumor Classification","summary":" Brain tumors present a grave risk to human life, demanding precise and timely\ndiagnosis for effective treatment. Inaccurate identification of brain tumors\ncan significantly diminish life expectancy, underscoring the critical need for\nprecise diagnostic methods. Manual identification of brain tumors within vast\nMagnetic Resonance Imaging (MRI) image datasets is arduous and time-consuming.\nThus, the development of a reliable deep learning (DL) model is essential to\nenhance diagnostic accuracy and ultimately save lives. This study introduces an\ninnovative optimization-based deep ensemble approach employing transfer\nlearning (TL) to efficiently classify brain tumors. Our methodology includes\nmeticulous preprocessing, reconstruction of TL architectures, fine-tuning, and\nensemble DL models utilizing weighted optimization techniques such as Genetic\nAlgorithm-based Weight Optimization (GAWO) and Grid Search-based Weight\nOptimization (GSWO). Experimentation is conducted on the Figshare\nContrast-Enhanced MRI (CE-MRI) brain tumor dataset, comprising 3064 images. Our\napproach achieves notable accuracy scores, with Xception, ResNet50V2,\nResNet152V2, InceptionResNetV2, GAWO, and GSWO attaining 99.42%, 98.37%,\n98.22%, 98.26%, 99.71%, and 99.76% accuracy, respectively. Notably, GSWO\ndemonstrates superior accuracy, averaging 99.76\\% accuracy across five folds on\nthe Figshare CE-MRI brain tumor dataset. The comparative analysis highlights\nthe significant performance enhancement of our proposed model over existing\ncounterparts. In conclusion, our optimized deep ensemble model exhibits\nexceptional accuracy in swiftly classifying brain tumors. Furthermore, it has\nthe potential to assist neurologists and clinicians in making accurate and\nimmediate diagnostic decisions.\n","authors":["Md. Alamin Talukder","Md. Manowarul Islam","Md Ashraf Uddin"],"pdf_url":"https://arxiv.org/pdf/2305.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13959v6","updated":"2024-05-06T15:14:22Z","published":"2023-03-24T12:33:44Z","title":"Bridging Stereo Geometry and BEV Representation with Reliable Mutual\n Interaction for Semantic Scene Completion","summary":" 3D semantic scene completion (SSC) is an ill-posed perception task that\nrequires inferring a dense 3D scene from limited observations. Previous\ncamera-based methods struggle to predict accurate semantic scenes due to\ninherent geometric ambiguity and incomplete observations. In this paper, we\nresort to stereo matching technique and bird's-eye-view (BEV) representation\nlearning to address such issues in SSC. Complementary to each other, stereo\nmatching mitigates geometric ambiguity with epipolar constraint while BEV\nrepresentation enhances the hallucination ability for invisible regions with\nglobal semantic context. However, due to the inherent representation gap\nbetween stereo geometry and BEV features, it is non-trivial to bridge them for\ndense prediction task of SSC. 
Therefore, we further develop a unified\noccupancy-based framework dubbed BRGScene, which effectively bridges these two\nrepresentations with dense 3D volumes for reliable semantic scene completion.\nSpecifically, we design a novel Mutual Interactive Ensemble (MIE) block for\npixel-level reliable aggregation of stereo geometry and BEV features. Within\nthe MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced\nwith confidence re-weighting, is employed to encourage fine-grained interaction\nthrough mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is\nintroduced to facilitate complementary aggregation through channel-wise\nrecalibration and multi-group voting. Our method outperforms all published\ncamera-based methods on SemanticKITTI for semantic scene completion. Our code\nis available on https://github.com/Arlo0o/StereoScene.\n","authors":["Bohan Li","Yasheng Sun","Zhujin Liang","Dalong Du","Zhuanghui Zhang","Xiaofeng Wang","Yunnan Wang","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.13959v6.pdf","comment":"IJCAI2024 (https://github.com/Arlo0o/StereoScene)"},{"id":"http://arxiv.org/abs/2403.19924v3","updated":"2024-05-06T15:12:09Z","published":"2024-03-29T02:22:54Z","title":"SceneTracker: Long-term Scene Flow Estimation Network","summary":" Considering the complementarity of scene flow estimation in the spatial\ndomain's focusing capability and 3D object tracking in the temporal domain's\ncoherence, this study aims to address a comprehensive new task that can\nsimultaneously capture fine-grained and long-term 3D motion in an online\nmanner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a\nnovel learning-based LSFE network that adopts an iterative approach to\napproximate the optimal trajectory. Besides, it dynamically indexes and\nconstructs appearance and depth correlation features simultaneously and employs\nthe Transformer to explore and utilize long-range connections within and\nbetween trajectories. With detailed experiments, SceneTracker shows superior\ncapabilities in handling 3D spatial occlusion and depth noise interference,\nhighly tailored to the LSFE task's needs. Finally, we build the first\nreal-world evaluation dataset, LSFDriving, further substantiating\nSceneTracker's commendable generalization capacity. The code and data for\nSceneTracker is available at https://github.com/wwsource/SceneTracker.\n","authors":["Bo Wang","Jian Li","Yang Yu","Li Liu","Zhenping Sun","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19924v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03546v1","updated":"2024-05-06T15:10:19Z","published":"2024-05-06T15:10:19Z","title":"CCDM: Continuous Conditional Diffusion Models for Image Generation","summary":" Continuous Conditional Generative Modeling (CCGM) aims to estimate the\ndistribution of high-dimensional data, typically images, conditioned on scalar\ncontinuous variables known as regression labels. While Continuous conditional\nGenerative Adversarial Networks (CcGANs) were initially designed for this task,\ntheir adversarial training mechanism remains vulnerable to extremely sparse or\nimbalanced data, resulting in suboptimal outcomes. To enhance the quality of\ngenerated images, a promising alternative is to replace CcGANs with Conditional\nDiffusion Models (CDMs), renowned for their stable training process and ability\nto produce more realistic images. 
However, existing CDMs encounter challenges\nwhen applied to CCGM tasks due to several limitations such as inadequate U-Net\narchitectures and deficient model fitting mechanisms for handling regression\nlabels. In this paper, we introduce Continuous Conditional Diffusion Models\n(CCDMs), the first CDM designed specifically for the CCGM task. CCDMs address\nthe limitations of existing CDMs by introducing specially designed conditional\ndiffusion processes, a modified denoising U-Net with a custom-made conditioning\nmechanism, a novel hard vicinal loss for model fitting, and an efficient\nconditional sampling procedure. With comprehensive experiments on four datasets\nwith varying resolutions ranging from 64x64 to 192x192, we demonstrate the\nsuperiority of the proposed CCDM over state-of-the-art CCGM models,\nestablishing new benchmarks in CCGM. Extensive ablation studies validate the\nmodel design and implementation configuration of the proposed CCDM. Our code is\npublicly available at https://github.com/UBCDingXin/CCDM.\n","authors":["Xin Ding","Yongwei Wang","Kao Zhang","Z. Jane Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03545v1","updated":"2024-05-06T15:10:16Z","published":"2024-05-06T15:10:16Z","title":"Optimizing Hand Region Detection in MediaPipe Holistic Full-Body Pose\n Estimation to Improve Accuracy and Avoid Downstream Errors","summary":" This paper addresses a critical flaw in MediaPipe Holistic's hand Region of\nInterest (ROI) prediction, which struggles with non-ideal hand orientations,\naffecting sign language recognition accuracy. We propose a data-driven approach\nto enhance ROI estimation, leveraging an enriched feature set including\nadditional hand keypoints and the z-dimension. Our results demonstrate better\nestimates, with higher Intersection-over-Union compared to the current method.\nOur code and optimizations are available at\nhttps://github.com/sign-language-processing/mediapipe-hand-crop-fix.\n","authors":["Amit Moryossef"],"pdf_url":"https://arxiv.org/pdf/2405.03545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03541v1","updated":"2024-05-06T15:02:16Z","published":"2024-05-06T15:02:16Z","title":"RepVGG-GELAN: Enhanced GELAN with VGG-STYLE ConvNets for Brain Tumour\n Detection","summary":" Object detection algorithms particularly those based on YOLO have\ndemonstrated remarkable efficiency in balancing speed and accuracy. However,\ntheir application in brain tumour detection remains underexplored. This study\nproposes RepVGG-GELAN, a novel YOLO architecture enhanced with RepVGG, a\nreparameterized convolutional approach for object detection tasks particularly\nfocusing on brain tumour detection within medical images. RepVGG-GELAN\nleverages the RepVGG architecture to improve both speed and accuracy in\ndetecting brain tumours. Integrating RepVGG into the YOLO framework aims to\nachieve a balance between computational efficiency and detection performance.\nThis study includes a spatial pyramid pooling-based Generalized Efficient Layer\nAggregation Network (GELAN) architecture which further enhances the capability\nof RepVGG. Experimental evaluation conducted on a brain tumour dataset\ndemonstrates the effectiveness of RepVGG-GELAN surpassing existing RCS-YOLO in\nterms of precision and speed. Specifically, RepVGG-GELAN achieves an increased\nprecision of 4.91% and an increased AP50 of 2.54% over the latest existing\napproach while operating at 240.7 GFLOPs. 
The proposed RepVGG-GELAN with GELAN\narchitecture presents promising results establishing itself as a\nstate-of-the-art solution for accurate and efficient brain tumour detection in\nmedical images. The implementation code is publicly available at\nhttps://github.com/ThensiB/RepVGG-GELAN.\n","authors":["Thennarasi Balakrishnan","Sandeep Singh Sengar"],"pdf_url":"https://arxiv.org/pdf/2405.03541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00181v2","updated":"2024-05-06T14:57:50Z","published":"2024-04-30T20:11:49Z","title":"Uncovering What, Why and How: A Comprehensive Benchmark for Causation\n Understanding of Video Anomaly","summary":" Video anomaly understanding (VAU) aims to automatically comprehend unusual\noccurrences in videos, thereby enabling various applications such as traffic\nsurveillance and industrial manufacturing. While existing VAU benchmarks\nprimarily concentrate on anomaly detection and localization, our focus is on\nmore practicality, prompting us to raise the following crucial questions: \"what\nanomaly occurred?\", \"why did it happen?\", and \"how severe is this abnormal\nevent?\". In pursuit of these answers, we present a comprehensive benchmark for\nCausation Understanding of Video Anomaly (CUVA). Specifically, each instance of\nthe proposed benchmark involves three sets of human annotations to indicate the\n\"what\", \"why\" and \"how\" of an anomaly, including 1) anomaly type, start and end\ntimes, and event descriptions, 2) natural language explanations for the cause\nof an anomaly, and 3) free text reflecting the effect of the abnormality. In\naddition, we also introduce MMEval, a novel evaluation metric designed to\nbetter align with human preferences for CUVA, facilitating the measurement of\nexisting LLMs in comprehending the underlying cause and corresponding effect of\nvideo anomalies. Finally, we propose a novel prompt-based method that can serve\nas a baseline approach for the challenging CUVA. We conduct extensive\nexperiments to show the superiority of our evaluation metric and the\nprompt-based approach. Our code and dataset are available at\nhttps://github.com/fesvhtr/CUVA.\n","authors":["Hang Du","Sicheng Zhang","Binzhu Xie","Guoshun Nan","Jiayang Zhang","Junrui Xu","Hangyu Liu","Sicong Leng","Jiangming Liu","Hehe Fan","Dajiu Huang","Jing Feng","Linli Chen","Can Zhang","Xuhuan Li","Hao Zhang","Jianhang Chen","Qimei Cui","Xiaofeng Tao"],"pdf_url":"https://arxiv.org/pdf/2405.00181v2.pdf","comment":"Accepted in CVPR2024, Codebase: https://github.com/fesvhtr/CUVA"},{"id":"http://arxiv.org/abs/2402.19404v4","updated":"2024-05-06T14:41:56Z","published":"2024-02-29T18:03:00Z","title":"EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image\n Captioning","summary":" News image captioning requires model to generate an informative caption rich\nin entities, with the news image and the associated news article. Though\nMultimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in addressing various vision-language tasks, our research finds\nthat current MLLMs still bear limitations in handling entity information on\nnews image captioning task. Besides, while MLLMs have the ability to process\nlong inputs, generating high-quality news image captions still requires a\ntrade-off between sufficiency and conciseness of textual input information. 
To\nexplore the potential of MLLMs and address problems we discovered, we propose :\nan Entity-Aware Multimodal Alignment based approach for news image captioning.\nOur approach first aligns the MLLM through Balance Training Strategy with two\nextra alignment tasks: Entity-Aware Sentence Selection task and Entity\nSelection task, together with News Image Captioning task, to enhance its\ncapability in handling multimodal entity information. The aligned MLLM will\nutilizes the additional entity-related information it explicitly extracts to\nsupplement its textual input while generating news image captions. Our approach\nachieves better results than all previous models in CIDEr score on GoodNews\ndataset (72.33 -> 88.39) and NYTimes800k dataset (70.83 -> 85.61).\n","authors":["Junzhe Zhang","Huixuan Zhang","Xunjian Yin","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2402.19404v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03520v1","updated":"2024-05-06T14:37:07Z","published":"2024-05-06T14:37:07Z","title":"Is Sora a World Simulator? A Comprehensive Survey on General World\n Models and Beyond","summary":" General world models represent a crucial pathway toward achieving Artificial\nGeneral Intelligence (AGI), serving as the cornerstone for various applications\nranging from virtual environments to decision-making systems. Recently, the\nemergence of the Sora model has attained significant attention due to its\nremarkable simulation capabilities, which exhibits an incipient comprehension\nof physical laws. In this survey, we embark on a comprehensive exploration of\nthe latest advancements in world models. Our analysis navigates through the\nforefront of generative methodologies in video generation, where world models\nstand as pivotal constructs facilitating the synthesis of highly realistic\nvisual content. Additionally, we scrutinize the burgeoning field of\nautonomous-driving world models, meticulously delineating their indispensable\nrole in reshaping transportation and urban mobility. Furthermore, we delve into\nthe intricacies inherent in world models deployed within autonomous agents,\nshedding light on their profound significance in enabling intelligent\ninteractions within dynamic environmental contexts. At last, we examine\nchallenges and limitations of world models, and discuss their potential future\ndirections. We hope this survey can serve as a foundational reference for the\nresearch community and inspire continued innovation. This survey will be\nregularly updated at:\nhttps://github.com/GigaAI-research/General-World-Models-Survey.\n","authors":["Zheng Zhu","Xiaofeng Wang","Wangbo Zhao","Chen Min","Nianchen Deng","Min Dou","Yuqi Wang","Botian Shi","Kai Wang","Chi Zhang","Yang You","Zhaoxiang Zhang","Dawei Zhao","Liang Xiao","Jian Zhao","Jiwen Lu","Guan Huang"],"pdf_url":"https://arxiv.org/pdf/2405.03520v1.pdf","comment":"This survey will be regularly updated at:\n https://github.com/GigaAI-research/General-World-Models-Survey"},{"id":"http://arxiv.org/abs/2405.03519v1","updated":"2024-05-06T14:36:01Z","published":"2024-05-06T14:36:01Z","title":"Low-light Object Detection","summary":" In this competition we employed a model fusion approach to achieve object\ndetection results close to those of real images. Our method is based on the\nCO-DETR model, which was trained on two sets of data: one containing images\nunder dark conditions and another containing images enhanced with low-light\nconditions. 
We used various enhancement techniques on the test data to generate\nmultiple sets of prediction results. Finally, we applied a clustering\naggregation method guided by IoU thresholds to select the optimal results.\n","authors":["Pengpeng Li","Haowei Gu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2405.03519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03094v4","updated":"2024-05-06T14:32:35Z","published":"2023-04-06T14:22:02Z","title":"PopulAtion Parameter Averaging (PAPA)","summary":" Ensemble methods combine the predictions of multiple models to improve\nperformance, but they require significantly higher computation costs at\ninference time. To avoid these costs, multiple neural networks can be combined\ninto one by averaging their weights. However, this usually performs\nsignificantly worse than ensembling. Weight averaging is only beneficial when\ndifferent enough to benefit from combining them, but similar enough to average\nwell. Based on this idea, we propose PopulAtion Parameter Averaging (PAPA): a\nmethod that combines the generality of ensembling with the efficiency of weight\naveraging. PAPA leverages a population of diverse models (trained on different\ndata orders, augmentations, and regularizations) while slowly pushing the\nweights of the networks toward the population average of the weights. We also\npropose PAPA variants (PAPA-all, and PAPA-2) that average weights rarely rather\nthan continuously; all methods increase generalization, but PAPA tends to\nperform best. PAPA reduces the performance gap between averaging and\nensembling, increasing the average accuracy of a population of models by up to\n0.8% on CIFAR-10, 1.9% on CIFAR-100, and 1.6% on ImageNet when compared to\ntraining independent (non-averaged) models.\n","authors":["Alexia Jolicoeur-Martineau","Emy Gervais","Kilian Fatras","Yan Zhang","Simon Lacoste-Julien"],"pdf_url":"https://arxiv.org/pdf/2304.03094v4.pdf","comment":"Blog post: https://ajolicoeur.wordpress.com/papa/, Code:\n https://github.com/SamsungSAILMontreal/PAPA, TMLR journal publication:\n https://openreview.net/forum?id=cPDVjsOytS"},{"id":"http://arxiv.org/abs/2306.12422v2","updated":"2024-05-06T14:23:25Z","published":"2023-06-21T17:59:45Z","title":"DreamTime: An Improved Optimization Strategy for Diffusion-Guided 3D\n Generation","summary":" Text-to-image diffusion models pre-trained on billions of image-text pairs\nhave recently enabled 3D content creation by optimizing a randomly initialized\ndifferentiable 3D representation with score distillation. However, the\noptimization process suffers slow convergence and the resultant 3D models often\nexhibit two limitations: (a) quality concerns such as missing attributes and\ndistorted shape and texture; (b) extremely low diversity comparing to\ntext-guided image synthesis. In this paper, we show that the conflict between\nthe 3D optimization process and uniform timestep sampling in score distillation\nis the main reason for these limitations. To resolve this conflict, we propose\nto prioritize timestep sampling with monotonically non-increasing functions,\nwhich aligns the 3D optimization process with the sampling process of diffusion\nmodel. 
Extensive experiments show that our simple redesign significantly\nimproves 3D content creation with faster convergence, better quality and\ndiversity.\n","authors":["Yukun Huang","Jianan Wang","Yukai Shi","Boshi Tang","Xianbiao Qi","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.12422v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2405.03501v1","updated":"2024-05-06T14:13:38Z","published":"2024-05-06T14:13:38Z","title":"Boosting Single Positive Multi-label Classification with Generalized\n Robust Loss","summary":" Multi-label learning (MLL) requires comprehensive multi-semantic annotations\nthat is hard to fully obtain, thus often resulting in missing labels scenarios.\nIn this paper, we investigate Single Positive Multi-label Learning (SPML),\nwhere each image is associated with merely one positive label. Existing SPML\nmethods only focus on designing losses using mechanisms such as hard\npseudo-labeling and robust losses, mostly leading to unacceptable false\nnegatives. To address this issue, we first propose a generalized loss framework\nbased on expected risk minimization to provide soft pseudo labels, and point\nout that the former losses can be seamlessly converted into our framework. In\nparticular, we design a novel robust loss based on our framework, which enjoys\nflexible coordination between false positives and false negatives, and can\nadditionally deal with the imbalance between positive and negative samples.\nExtensive experiments show that our approach can significantly improve SPML\nperformance and outperform the vast majority of state-of-the-art methods on all\nthe four benchmarks.\n","authors":["Yanxi Chen","Chunxiao Li","Xinyang Dai","Jinhuan Li","Weiyu Sun","Yiming Wang","Renyuan Zhang","Tinghe Zhang","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03501v1.pdf","comment":"14 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2405.03500v1","updated":"2024-05-06T14:11:36Z","published":"2024-05-06T14:11:36Z","title":"A Rate-Distortion-Classification Approach for Lossy Image Compression","summary":" In lossy image compression, the objective is to achieve minimal signal\ndistortion while compressing images to a specified bit rate. The increasing\ndemand for visual analysis applications, particularly in classification tasks,\nhas emphasized the significance of considering semantic distortion in\ncompressed images. To bridge the gap between image compression and visual\nanalysis, we propose a Rate-Distortion-Classification (RDC) model for lossy\nimage compression, offering a unified framework to optimize the trade-off\nbetween rate, distortion, and classification accuracy. The RDC model is\nextensively analyzed both statistically on a multi-distribution source and\nexperimentally on the widely used MNIST dataset. The findings reveal that the\nRDC model exhibits desirable properties, including monotonic non-increasing and\nconvex functions, under certain conditions. 
This work provides insights into\nthe development of human-machine friendly compression methods and Video Coding\nfor Machine (VCM) approaches, paving the way for end-to-end image compression\ntechniques in real-world applications.\n","authors":["Yuefeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03500v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2405.03486v1","updated":"2024-05-06T13:57:03Z","published":"2024-05-06T13:57:03Z","title":"UnsafeBench: Benchmarking Image Safety Classifiers on Real-World and\n AI-Generated Images","summary":" Image safety classifiers play an important role in identifying and mitigating\nthe spread of unsafe images online (e.g., images including violence, hateful\nrhetoric, etc.). At the same time, with the advent of text-to-image models and\nincreasing concerns about the safety of AI models, developers are increasingly\nrelying on image safety classifiers to safeguard their models. Yet, the\nperformance of current image safety classifiers remains unknown for real-world\nand AI-generated images. To bridge this research gap, in this work, we propose\nUnsafeBench, a benchmarking framework that evaluates the effectiveness and\nrobustness of image safety classifiers. First, we curate a large dataset of 10K\nreal-world and AI-generated images that are annotated as safe or unsafe based\non a set of 11 unsafe categories of images (sexual, violent, hateful, etc.).\nThen, we evaluate the effectiveness and robustness of five popular image safety\nclassifiers, as well as three classifiers that are powered by general-purpose\nvisual language models. Our assessment indicates that existing image safety\nclassifiers are not comprehensive and effective enough in mitigating the\nmultifaceted problem of unsafe images. Also, we find that classifiers trained\nonly on real-world images tend to have degraded performance when applied to\nAI-generated images. Motivated by these findings, we design and implement a\ncomprehensive image moderation tool called PerspectiveVision, which effectively\nidentifies 11 categories of real-world and AI-generated unsafe images. The best\nPerspectiveVision model achieves an overall F1-Score of 0.810 on six evaluation\ndatasets, which is comparable with closed-source and expensive state-of-the-art\nmodels like GPT-4V. UnsafeBench and PerspectiveVision can aid the research\ncommunity in better understanding the landscape of image safety classification\nin the era of generative AI.\n","authors":["Yiting Qu","Xinyue Shen","Yixin Wu","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03485v1","updated":"2024-05-06T13:56:56Z","published":"2024-05-06T13:56:56Z","title":"LGTM: Local-to-Global Text-Driven Human Motion Diffusion Model","summary":" In this paper, we introduce LGTM, a novel Local-to-Global pipeline for\nText-to-Motion generation. LGTM utilizes a diffusion-based architecture and\naims to address the challenge of accurately translating textual descriptions\ninto semantically coherent human motion in computer animation. Specifically,\ntraditional methods often struggle with semantic discrepancies, particularly in\naligning specific motions to the correct body parts. 
To address this issue, we\npropose a two-stage pipeline to overcome this challenge: it first employs large\nlanguage models (LLMs) to decompose global motion descriptions into\npart-specific narratives, which are then processed by independent body-part\nmotion encoders to ensure precise local semantic alignment. Finally, an\nattention-based full-body optimizer refines the motion generation results and\nguarantees the overall coherence. Our experiments demonstrate that LGTM gains\nsignificant improvements in generating locally accurate, semantically-aligned\nhuman motion, marking a notable advancement in text-to-motion applications.\nCode and data for this paper are available at https://github.com/L-Sun/LGTM\n","authors":["Haowen Sun","Ruikun Zheng","Haibin Huang","Chongyang Ma","Hui Huang","Ruizhen Hu"],"pdf_url":"https://arxiv.org/pdf/2405.03485v1.pdf","comment":"9 pages,7 figures, SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2404.14066v2","updated":"2024-05-06T13:41:44Z","published":"2024-04-22T10:23:59Z","title":"SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval","summary":" The user base of short video apps has experienced unprecedented growth in\nrecent years, resulting in a significant demand for video content analysis. In\nparticular, text-video retrieval, which aims to find the top matching videos\ngiven text descriptions from a vast video corpus, is an essential function, the\nprimary challenge of which is to bridge the modality gap. Nevertheless, most\nexisting approaches treat texts merely as discrete tokens and neglect their\nsyntax structures. Moreover, the abundant spatial and temporal clues in videos\nare often underutilized due to the lack of interaction with text. To address\nthese issues, we argue that using texts as guidance to focus on relevant\ntemporal frames and spatial regions within videos is beneficial. In this paper,\nwe propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method\n(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to\nbridge the modality gap from two perspectives. First, to facilitate a more\nfine-grained integration of visual content, we employ the text syntax\nhierarchy, which reveals the grammatical structure of text descriptions, to\nguide the visual representations. Second, to further enhance the multi-modal\ninteraction and alignment, we also utilize the syntax hierarchy to guide the\nsimilarity calculation. We evaluated our method on four public text-video\nretrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental\nresults and ablation studies confirm the advantages of our proposed method.\n","authors":["Xuzheng Yu","Chen Jiang","Xingning Dong","Tian Gan","Ming Yang","Qingpei Guo"],"pdf_url":"https://arxiv.org/pdf/2404.14066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03462v1","updated":"2024-05-06T13:33:38Z","published":"2024-05-06T13:33:38Z","title":"A Lightweight Neural Architecture Search Model for Medical Image\n Classification","summary":" Accurate classification of medical images is essential for modern\ndiagnostics. Deep learning advancements led clinicians to increasingly use\nsophisticated models to make faster and more accurate decisions, sometimes\nreplacing human judgment. However, model development is costly and repetitive.\nNeural Architecture Search (NAS) provides solutions by automating the design of\ndeep learning architectures. 
This paper presents ZO-DARTS+, a differentiable\nNAS algorithm that improves search efficiency through a novel method of\ngenerating sparse probabilities by bi-level optimization. Experiments on five\npublic medical datasets show that ZO-DARTS+ matches the accuracy of\nstate-of-the-art solutions while reducing search times by up to three times.\n","authors":["Lunchen Xie","Eugenio Lomurno","Matteo Gambella","Danilo Ardagna","Manuel Roveri","Matteo Matteucci","Qingjiang Shi"],"pdf_url":"https://arxiv.org/pdf/2405.03462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03458v1","updated":"2024-05-06T13:29:34Z","published":"2024-05-06T13:29:34Z","title":"SSyncOA: Self-synchronizing Object-aligned Watermarking to Resist\n Cropping-paste Attacks","summary":" Modern image processing tools have made it easy for attackers to crop the\nregion or object of interest in images and paste it into other images. The\nchallenge this cropping-paste attack poses to the watermarking technology is\nthat it breaks the synchronization of the image watermark, introducing multiple\nsuperimposed desynchronization distortions, such as rotation, scaling, and\ntranslation. However, current watermarking methods can only resist a single\ntype of desynchronization and cannot be applied to protect the object's\ncopyright under the cropping-paste attack. With the finding that the key to\nresisting the cropping-paste attack lies in robust features of the object to\nprotect, this paper proposes a self-synchronizing object-aligned watermarking\nmethod, called SSyncOA. Specifically, we first constrain the watermarked region\nto be aligned with the protected object, and then synchronize the watermark's\ntranslation, rotation, and scaling distortions by normalizing the object\ninvariant features, i.e., its centroid, principal orientation, and minimum\nbounding square, respectively. To make the watermark embedded in the protected\nobject, we introduce the object-aligned watermarking model, which incorporates\nthe real cropping-paste attack into the encoder-noise layer-decoder pipeline\nand is optimized end-to-end. Besides, we illustrate the effect of different\ndesynchronization distortions on the watermark training, which confirms the\nnecessity of the self-synchronization process. Extensive experiments\ndemonstrate the superiority of our method over other SOTAs.\n","authors":["Chengxin Zhao","Hefei Ling","Sijing Xie","Han Fang","Yaokun Fang","Nan Sun"],"pdf_url":"https://arxiv.org/pdf/2405.03458v1.pdf","comment":"7 pages, 5 figures (Have been accepted by ICME 2024)"},{"id":"http://arxiv.org/abs/2311.06141v2","updated":"2024-05-06T13:23:23Z","published":"2023-11-10T15:58:53Z","title":"Federated Learning Across Decentralized and Unshared Archives for Remote\n Sensing Image Classification","summary":" Federated learning (FL) enables the collaboration of multiple deep learning\nmodels to learn from decentralized data archives (i.e., clients) without\naccessing data on clients. Although FL offers ample opportunities in knowledge\ndiscovery from distributed image archives, it is seldom considered in remote\nsensing (RS). In this paper, as a first time in RS, we present a comparative\nstudy of state-of-the-art FL algorithms for RS image classification problems.\nTo this end, we initially provide a systematic review of the FL algorithms\npresented in the computer vision and machine learning communities. 
Then, we\nselect several state-of-the-art FL algorithms based on their effectiveness with\nrespect to training data heterogeneity across clients (known as non-IID data).\nAfter presenting an extensive overview of the selected algorithms, a\ntheoretical comparison of the algorithms is conducted based on their: 1) local\ntraining complexity; 2) aggregation complexity; 3) learning efficiency; 4)\ncommunication cost; and 5) scalability in terms of number of clients. After the\ntheoretical comparison, experimental analyses are presented to compare them\nunder different decentralization scenarios. For the experimental analyses, we\nfocus our attention on multi-label image classification problems in RS. Based\non our comprehensive analyses, we finally derive a guideline for selecting\nsuitable FL algorithms in RS. The code of this work will be publicly available\nat https://git.tu-berlin.de/rsim/FL-RS.\n","authors":["Barış Büyüktaş","Gencer Sumbul","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2311.06141v2.pdf","comment":"Submitted to the IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2311.17834v3","updated":"2024-05-06T13:14:29Z","published":"2023-11-29T17:36:49Z","title":"Spice-E : Structural Priors in 3D Diffusion using Cross-Entity Attention","summary":" We are witnessing rapid progress in automatically generating and manipulating\n3D assets due to the availability of pretrained text-image diffusion models.\nHowever, time-consuming optimization procedures are required for synthesizing\neach sample, hindering their potential for democratizing 3D content creation.\nConversely, 3D diffusion models now train on million-scale 3D datasets,\nyielding high-quality text-conditional 3D samples within seconds. In this work,\nwe present Spice-E - a neural network that adds structural guidance to 3D\ndiffusion models, extending their usage beyond text-conditional generation. At\nits core, our framework introduces a cross-entity attention mechanism that\nallows for multiple entities (in particular, paired input and guidance 3D\nshapes) to interact via their internal representations within the denoising\nnetwork. We utilize this mechanism for learning task-specific structural priors\nin 3D diffusion models from auxiliary guidance shapes. We show that our\napproach supports a variety of applications, including 3D stylization, semantic\nshape editing and text-conditional abstraction-to-3D, which transforms\nprimitive-based abstractions into highly-expressive shapes. Extensive\nexperiments demonstrate that Spice-E achieves SOTA performance over these tasks\nwhile often being considerably faster than alternative methods. Importantly,\nthis is accomplished without tailoring our approach for any specific task.\n","authors":["Etai Sella","Gal Fiebelman","Noam Atia","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2311.17834v3.pdf","comment":"Project webpage: https://tau-vailab.github.io/Spice-E"},{"id":"http://arxiv.org/abs/2405.03436v1","updated":"2024-05-06T12:59:05Z","published":"2024-05-06T12:59:05Z","title":"DBDH: A Dual-Branch Dual-Head Neural Network for Invisible Embedded\n Regions Localization","summary":" Embedding invisible hyperlinks or hidden codes in images to replace QR codes\nhas become a hot topic recently. This technology requires first localizing the\nembedded region in the captured photos before decoding. 
Existing methods that\ntrain models to find the invisible embedded region struggle to obtain accurate\nlocalization results, leading to degraded decoding accuracy. This limitation is\nprimarily because the CNN network is sensitive to low-frequency signals, while\nthe embedded signal is typically in the high-frequency form. Based on this,\nthis paper proposes a Dual-Branch Dual-Head (DBDH) neural network tailored for\nthe precise localization of invisible embedded regions. Specifically, DBDH uses\na low-level texture branch containing 62 high-pass filters to capture the\nhigh-frequency signals induced by embedding. A high-level context branch is\nused to extract discriminative features between the embedded and normal\nregions. DBDH employs a detection head to directly detect the four vertices of\nthe embedding region. In addition, we introduce an extra segmentation head to\nsegment the mask of the embedding region during training. The segmentation head\nprovides pixel-level supervision for model learning, facilitating better\nlearning of the embedded signals. Based on two state-of-the-art invisible\noffline-to-online messaging methods, we construct two datasets and augmentation\nstrategies for training and testing localization models. Extensive experiments\ndemonstrate the superior performance of the proposed DBDH over existing\nmethods.\n","authors":["Chengxin Zhao","Hefei Ling","Sijing Xie","Nan Sun","Zongyi Li","Yuxuan Shi","Jiazhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.03436v1.pdf","comment":"7 pages, 6 figures (Have been accepted by IJCNN 2024)"},{"id":"http://arxiv.org/abs/2405.03420v1","updated":"2024-05-06T12:40:15Z","published":"2024-05-06T12:40:15Z","title":"Implantable Adaptive Cells: differentiable architecture search to\n improve the performance of any trained U-shaped network","summary":" This paper introduces a novel approach to enhance the performance of\npre-trained neural networks in medical image segmentation using Neural\nArchitecture Search (NAS) methods, specifically Differentiable Architecture\nSearch (DARTS). We present the concept of Implantable Adaptive Cell (IAC),\nsmall but powerful modules identified through Partially-Connected DARTS,\ndesigned to be injected into the skip connections of an existing and already\ntrained U-shaped model. Our strategy allows for the seamless integration of the\nIAC into the pre-existing architecture, thereby enhancing its performance\nwithout necessitating a complete retraining from scratch. The empirical\nstudies, focusing on medical image segmentation tasks, demonstrate the efficacy\nof this method. The integration of specialized IAC cells into various\nconfigurations of the U-Net model increases segmentation accuracy by almost 2\\%\npoints on average for the validation dataset and over 3\\% points for the\ntraining dataset. 
The findings of this study not only offer a cost-effective\nalternative to the complete overhaul of complex models for performance upgrades\nbut also indicate the potential applicability of our method to other\narchitectures and problem domains.\n","authors":["Emil Benedykciuk","Marcin Denkowski","Grzegorz Wójcik"],"pdf_url":"https://arxiv.org/pdf/2405.03420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04653v2","updated":"2024-05-06T12:34:34Z","published":"2024-04-06T15:10:29Z","title":"HawkDrive: A Transformer-driven Visual Perception System for Autonomous\n Driving in Night Scene","summary":" Many established vision perception systems for autonomous driving scenarios\nignore the influence of light conditions, one of the key elements for driving\nsafety. To address this problem, we present HawkDrive, a novel perception\nsystem with hardware and software solutions. Hardware that utilizes stereo\nvision perception, which has been demonstrated to be a more reliable way of\nestimating depth information than monocular vision, is partnered with the edge\ncomputing device Nvidia Jetson Xavier AGX. Our software for low light\nenhancement, depth estimation, and semantic segmentation tasks, is a\ntransformer-based neural network. Our software stack, which enables fast\ninference and noise reduction, is packaged into system modules in Robot\nOperating System 2 (ROS2). Our experimental results have shown that the\nproposed end-to-end system is effective in improving the depth estimation and\nsemantic segmentation performance. Our dataset and codes will be released at\nhttps://github.com/ZionGo6/HawkDrive.\n","authors":["Ziang Guo","Stepan Perminov","Mikhail Konenkov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2404.04653v2.pdf","comment":"Accepted by IEEE IV 2024"},{"id":"http://arxiv.org/abs/2405.03417v1","updated":"2024-05-06T12:32:38Z","published":"2024-05-06T12:32:38Z","title":"Gaussian Splatting: 3D Reconstruction and Novel View Synthesis, a Review","summary":" Image-based 3D reconstruction is a challenging task that involves inferring\nthe 3D shape of an object or scene from a set of input images. Learning-based\nmethods have gained attention for their ability to directly estimate 3D shapes.\nThis review paper focuses on state-of-the-art techniques for 3D reconstruction,\nincluding the generation of novel, unseen views. An overview of recent\ndevelopments in the Gaussian Splatting method is provided, covering input\ntypes, model structures, output representations, and training strategies.\nUnresolved challenges and future directions are also discussed. Given the rapid\nprogress in this domain and the numerous opportunities for enhancing 3D\nreconstruction methods, a comprehensive examination of algorithms appears\nessential. Consequently, this study offers a thorough overview of the latest\nadvancements in Gaussian Splatting.\n","authors":["Anurag Dalal","Daniel Hagen","Kjell G. Robbersmyr","Kristian Muri Knausgård"],"pdf_url":"https://arxiv.org/pdf/2405.03417v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.03134v3","updated":"2024-05-06T12:24:58Z","published":"2024-03-05T17:21:31Z","title":"Simplicity in Complexity : Explaining Visual Complexity using Deep\n Segmentation Models","summary":" The complexity of visual stimuli plays an important role in many cognitive\nphenomena, including attention, engagement, memorability, time perception and\naesthetic evaluation. 
Despite its importance, complexity is poorly understood\nand ironically, previous models of image complexity have been quite complex.\nThere have been many attempts to find handcrafted features that explain\ncomplexity, but these features are usually dataset specific, and hence fail to\ngeneralise. On the other hand, more recent work has employed deep neural\nnetworks to predict complexity, but these models remain difficult to interpret,\nand do not guide a theoretical understanding of the problem. Here we propose to\nmodel complexity using segment-based representations of images. We use\nstate-of-the-art segmentation models, SAM and FC-CLIP, to quantify the number\nof segments at multiple granularities, and the number of classes in an image\nrespectively. We find that complexity is well-explained by a simple linear\nmodel with these two features across six diverse image-sets of naturalistic\nscene and art images. This suggests that the complexity of images can be\nsurprisingly simple.\n","authors":["Tingke Shen","Surabhi S Nath","Aenne Brielmann","Peter Dayan"],"pdf_url":"https://arxiv.org/pdf/2403.03134v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03408v1","updated":"2024-05-06T12:20:16Z","published":"2024-05-06T12:20:16Z","title":"An Image Quality Evaluation and Masking Algorithm Based On Pre-trained\n Deep Neural Networks","summary":" With the growing amount of astronomical data, there is an increasing need for\nautomated data processing pipelines, which can extract scientific information\nfrom observation data without human interventions. A critical aspect of these\npipelines is the image quality evaluation and masking algorithm, which\nevaluates image qualities based on various factors such as cloud coverage, sky\nbrightness, scattering light from the optical system, point spread function\nsize and shape, and read-out noise. Occasionally, the algorithm requires\nmasking of areas severely affected by noise. However, the algorithm often\nnecessitates significant human interventions, reducing data processing\nefficiency. In this study, we present a deep learning based image quality\nevaluation algorithm that uses an autoencoder to learn features of high quality\nastronomical images. The trained autoencoder enables automatic evaluation of\nimage quality and masking of noise affected areas. We have evaluated the\nperformance of our algorithm using two test cases: images with point spread\nfunctions of varying full width half magnitude, and images with complex\nbackgrounds. In the first scenario, our algorithm could effectively identify\nvariations of the point spread functions, which can provide valuable reference\ninformation for photometry. In the second scenario, our method could\nsuccessfully mask regions affected by complex regions, which could\nsignificantly increase the photometry accuracy. Our algorithm can be employed\nto automatically evaluate image quality obtained by different sky surveying\nprojects, further increasing the speed and robustness of data processing\npipelines.\n","authors":["Peng Jia","Yu Song","Jiameng Lv","Runyu Ning"],"pdf_url":"https://arxiv.org/pdf/2405.03408v1.pdf","comment":"Accepted by the AJ. 
The code could be downloaded from:\n https://nadc.china-vo.org/res/r101415/ with DOI of: 10.12149/101415"},{"id":"http://arxiv.org/abs/2112.06433v3","updated":"2024-05-06T12:18:35Z","published":"2021-12-13T06:23:31Z","title":"Generate Point Clouds with Multiscale Details from Graph-Represented\n Structures","summary":" As details are missing in most representations of structures, the lack of\ncontrollability to more information is one of the major weaknesses in\nstructure-based controllable point cloud generation. It is observable that\ndefinitions of details and structures are subjective. Details can be treated as\nstructures on small scales. To represent structures in different scales at the\nsame time, we present a graph-based representation of structures called the\nMultiscale Structure Graph (MSG). Given structures in multiple scales, similar\npatterns of local structures can be found at different scales, positions, and\nangles. The knowledge learned from a regional structure pattern shall be\ntransferred to other similar patterns. An encoding and generation mechanism,\nnamely the Multiscale Structure-based Point Cloud Generator (MSPCG) is\nproposed, which can simultaneously learn point cloud generation from local\npatterns with miscellaneous spatial properties. The proposed method supports\nmultiscale editions on point clouds by editing the MSG. By generating point\nclouds from local structures and learning simultaneously in multiple scales,\nour MSPCG has better generalization ability and scalability. Trained on the\nShapeNet, our MSPCG can generate point clouds from a given structure for unseen\ncategories and indoor scenes. The experimental results show that our method\nsignificantly outperforms baseline methods.\n","authors":["Ximing Yang","Zhibo Zhang","Zhengfu He","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2112.06433v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.03388v1","updated":"2024-05-06T11:46:04Z","published":"2024-05-06T11:46:04Z","title":"3D LiDAR Mapping in Dynamic Environments Using a 4D Implicit Neural\n Representation","summary":" Building accurate maps is a key building block to enable reliable\nlocalization, planning, and navigation of autonomous vehicles. We propose a\nnovel approach for building accurate maps of dynamic environments utilizing a\nsequence of LiDAR scans. To this end, we propose encoding the 4D scene into a\nnovel spatio-temporal implicit neural map representation by fitting a\ntime-dependent truncated signed distance function to each point. Using our\nrepresentation, we extract the static map by filtering the dynamic parts. Our\nneural representation is based on sparse feature grids, a globally shared\ndecoder, and time-dependent basis functions, which we jointly optimize in an\nunsupervised fashion. To learn this representation from a sequence of LiDAR\nscans, we design a simple yet efficient loss function to supervise the map\noptimization in a piecewise way. We evaluate our approach on various scenes\ncontaining moving objects in terms of the reconstruction quality of static maps\nand the segmentation of dynamic point clouds. The experimental results\ndemonstrate that our method is capable of removing the dynamic part of the\ninput point clouds while reconstructing accurate and complete 3D maps,\noutperforming several state-of-the-art methods. 
Codes are available at:\nhttps://github.com/PRBonn/4dNDF\n","authors":["Xingguang Zhong","Yue Pan","Cyrill Stachniss","Jens Behley"],"pdf_url":"https://arxiv.org/pdf/2405.03388v1.pdf","comment":"10 pages, CVPR 2024"},{"id":"http://arxiv.org/abs/2405.03381v1","updated":"2024-05-06T11:40:57Z","published":"2024-05-06T11:40:57Z","title":"Statistical Edge Detection And UDF Learning For Shape Representation","summary":" In the field of computer vision, the numerical encoding of 3D surfaces is\ncrucial. It is classical to represent surfaces with their Signed Distance\nFunctions (SDFs) or Unsigned Distance Functions (UDFs). For tasks like\nrepresentation learning, surface classification, or surface reconstruction,\nthis function can be learned by a neural network, called Neural Distance\nFunction. This network, and in particular its weights, may serve as a\nparametric and implicit representation for the surface. The network must\nrepresent the surface as accurately as possible. In this paper, we propose a\nmethod for learning UDFs that improves the fidelity of the obtained Neural UDF\nto the original 3D surface. The key idea of our method is to concentrate the\nlearning effort of the Neural UDF on surface edges. More precisely, we show\nthat sampling more training points around surface edges allows better local\naccuracy of the trained Neural UDF, and thus improves the global expressiveness\nof the Neural UDF in terms of Hausdorff distance. To detect surface edges, we\npropose a new statistical method based on the calculation of a $p$-value at\neach point on the surface. Our method is shown to detect surface edges more\naccurately than a commonly used local geometric descriptor.\n","authors":["Virgile Foy","Fabrice Gamboa","Reda Chhaibi"],"pdf_url":"https://arxiv.org/pdf/2405.03381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03376v1","updated":"2024-05-06T11:30:55Z","published":"2024-05-06T11:30:55Z","title":"CRA5: Extreme Compression of ERA5 for Portable Global Climate and\n Weather Research via an Efficient Variational Transformer","summary":" The advent of data-driven weather forecasting models, which learn from\nhundreds of terabytes (TB) of reanalysis data, has significantly advanced\nforecasting capabilities. However, the substantial costs associated with data\nstorage and transmission present a major challenge for data providers and\nusers, affecting resource-constrained researchers and limiting their\naccessibility to participate in AI-based meteorological research. To mitigate\nthis issue, we introduce an efficient neural codec, the Variational Autoencoder\nTransformer (VAEformer), for extreme compression of climate data to\nsignificantly reduce data storage cost, making AI-based meteorological research\nportable to researchers. Our approach diverges from recent complex neural\ncodecs by utilizing a low-complexity Auto-Encoder transformer. This encoder\nproduces a quantized latent representation through variance inference, which\nreparameterizes the latent space as a Gaussian distribution. This method\nimproves the estimation of distributions for cross-entropy coding. Extensive\nexperiments demonstrate that our VAEformer outperforms existing\nstate-of-the-art compression methods in the context of climate data. By\napplying our VAEformer, we compressed the most popular ERA5 climate dataset\n(226 TB) into a new dataset, CRA5 (0.7 TB). This translates to a compression\nratio of over 300 while retaining the dataset's utility for accurate scientific\nanalysis. 
Further, downstream experiments show that global weather forecasting\nmodels trained on the compact CRA5 dataset achieve forecasting accuracy\ncomparable to the model trained on the original dataset. Code, the CRA5\ndataset, and the pre-trained model are available at\nhttps://github.com/taohan10200/CRA5.\n","authors":["Tao Han","zhenghao Chen","Song Guo","Wanghan Xu","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2405.03376v1.pdf","comment":"Main text and supplementary, 22 pages"},{"id":"http://arxiv.org/abs/2405.03373v1","updated":"2024-05-06T11:27:27Z","published":"2024-05-06T11:27:27Z","title":"Knowledge-aware Text-Image Retrieval for Remote Sensing Images","summary":" Image-based retrieval in large Earth observation archives is challenging\nbecause one needs to navigate across thousands of candidate matches only with\nthe query image as a guide. By using text as information supporting the visual\nquery, the retrieval system gains in usability, but at the same time faces\ndifficulties due to the diversity of visual signals that cannot be summarized\nby a short caption only. For this reason, as a matching-based task, cross-modal\ntext-image retrieval often suffers from information asymmetry between texts and\nimages. To address this challenge, we propose a Knowledge-aware Text-Image\nRetrieval (KTIR) method for remote sensing images. By mining relevant\ninformation from an external knowledge graph, KTIR enriches the text scope\navailable in the search query and alleviates the information gaps between texts\nand images for better matching. Moreover, by integrating domain-specific\nknowledge, KTIR also enhances the adaptation of pre-trained vision-language\nmodels to remote sensing applications. Experimental results on three commonly\nused remote sensing text-image retrieval benchmarks show that the proposed\nknowledge-aware method leads to varied and consistent retrievals, outperforming\nstate-of-the-art retrieval methods.\n","authors":["Li Mi","Xianjie Dai","Javiera Castillo-Navarro","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2405.03373v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.03355v1","updated":"2024-05-06T11:05:13Z","published":"2024-05-06T11:05:13Z","title":"On the Theory of Cross-Modality Distillation with Contrastive Learning","summary":" Cross-modality distillation arises as an important topic for data modalities\ncontaining limited knowledge such as depth maps and high-quality sketches. Such\ntechniques are of great importance, especially for memory and\nprivacy-restricted scenarios where labeled training data is generally\nunavailable. To solve the problem, existing label-free methods leverage a few\npairwise unlabeled data to distill the knowledge by aligning features or\nstatistics between the source and target modalities. For instance, one\ntypically aims to minimize the L2 distance or contrastive loss between the\nlearned features of pairs of samples in the source (e.g. image) and the target\n(e.g. sketch) modalities. However, most algorithms in this domain only focus on\nthe experimental results but lack theoretical insight. To bridge the gap\nbetween the theory and practical method of cross-modality distillation, we\nfirst formulate a general framework of cross-modality contrastive distillation\n(CMCD), built upon contrastive learning that leverages both positive and\nnegative correspondence, towards a better distillation of generalizable\nfeatures. 
Furthermore, we establish a thorough convergence analysis that\nreveals that the distance between source and target modalities significantly\nimpacts the test error on downstream tasks within the target modality which is\nalso validated by the empirical results. Extensive experimental results show\nthat our algorithm outperforms existing algorithms consistently by a margin of\n2-3\\% across diverse modalities and tasks, covering modalities of image,\nsketch, depth map, and audio and tasks of recognition and segmentation.\n","authors":["Hangyu Lin","Chen Liu","Chengming Xu","Zhengqi Gao","Yanwei Fu","Yuan Yao"],"pdf_url":"https://arxiv.org/pdf/2405.03355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03352v1","updated":"2024-05-06T11:02:26Z","published":"2024-05-06T11:02:26Z","title":"Salient Object Detection From Arbitrary Modalities","summary":" Toward desirable saliency prediction, the types and numbers of inputs for a\nsalient object detection (SOD) algorithm may dynamically change in many\nreal-life applications. However, existing SOD algorithms are mainly designed or\ntrained for one particular type of inputs, failing to be generalized to other\ntypes of inputs. Consequentially, more types of SOD algorithms need to be\nprepared in advance for handling different types of inputs, raising huge\nhardware and research costs. Differently, in this paper, we propose a new type\nof SOD task, termed Arbitrary Modality SOD (AM SOD). The most prominent\ncharacteristics of AM SOD are that the modality types and modality numbers will\nbe arbitrary or dynamically changed. The former means that the inputs to the AM\nSOD algorithm may be arbitrary modalities such as RGB, depths, or even any\ncombination of them. While, the latter indicates that the inputs may have\narbitrary modality numbers as the input type is changed, e.g. single-modality\nRGB image, dual-modality RGB-Depth (RGB-D) images or triple-modality\nRGB-Depth-Thermal (RGB-D-T) images. Accordingly, a preliminary solution to the\nabove challenges, \\i.e. a modality switch network (MSN), is proposed in this\npaper. In particular, a modality switch feature extractor (MSFE) is first\ndesigned to extract discriminative features from each modality effectively by\nintroducing some modality indicators, which will generate some weights for\nmodality switching. Subsequently, a dynamic fusion module (DFM) is proposed to\nadaptively fuse features from a variable number of modalities based on a novel\nTransformer structure. Finally, a new dataset, named AM-XD, is constructed to\nfacilitate research on AM SOD. Extensive experiments demonstrate that our AM\nSOD method can effectively cope with changes in the type and number of input\nmodalities for robust salient object detection.\n","authors":["Nianchang Huang","Yang Yang","Ruida Xi","Qiang Zhang","Jungong Han","Jin Huang"],"pdf_url":"https://arxiv.org/pdf/2405.03352v1.pdf","comment":"15 Pages, 7 Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2405.03351v1","updated":"2024-05-06T11:02:02Z","published":"2024-05-06T11:02:02Z","title":"Modality Prompts for Arbitrary Modality Salient Object Detection","summary":" This paper delves into the task of arbitrary modality salient object\ndetection (AM SOD), aiming to detect salient objects from arbitrary modalities,\neg RGB images, RGB-D images, and RGB-D-T images. 
A novel modality-adaptive\nTransformer (MAT) will be proposed to investigate two fundamental challenges of\nAM SOD, ie more diverse modality discrepancies caused by varying modality types\nthat need to be processed, and dynamic fusion design caused by an uncertain\nnumber of modalities present in the inputs of multimodal fusion strategy.\nSpecifically, inspired by prompt learning's ability of aligning the\ndistributions of pre-trained models to the characteristic of downstream tasks\nby learning some prompts, MAT will first present a modality-adaptive feature\nextractor (MAFE) to tackle the diverse modality discrepancies by introducing a\nmodality prompt for each modality. In the training stage, a new modality\ntranslation contractive (MTC) loss will be further designed to assist MAFE in\nlearning those modality-distinguishable modality prompts. Accordingly, in the\ntesting stage, MAFE can employ those learned modality prompts to adaptively\nadjust its feature space according to the characteristics of the input\nmodalities, thus being able to extract discriminative unimodal features. Then,\nMAFE will present a channel-wise and spatial-wise fusion hybrid (CSFH) strategy\nto meet the demand for dynamic fusion. For that, CSFH dedicates a channel-wise\ndynamic fusion module (CDFM) and a novel spatial-wise dynamic fusion module\n(SDFM) to fuse the unimodal features from varying numbers of modalities and\nmeanwhile effectively capture cross-modal complementary semantic and detail\ninformation, respectively. Moreover, CSFH will carefully align CDFM and SDFM to\ndifferent levels of unimodal features based on their characteristics for more\neffective complementary information exploitation.\n","authors":["Nianchang Huang","Yang Yang","Qiang Zhang","Jungong Han","Jin Huang"],"pdf_url":"https://arxiv.org/pdf/2405.03351v1.pdf","comment":"13 pages, 7 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2211.11312v2","updated":"2024-05-06T11:00:43Z","published":"2022-11-21T09:51:28Z","title":"Understanding the Vulnerability of Skeleton-based Human Activity\n Recognition via Black-box Attack","summary":" Human Activity Recognition (HAR) has been employed in a wide range of\napplications, e.g. self-driving cars, where safety and lives are at stake.\nRecently, the robustness of skeleton-based HAR methods have been questioned due\nto their vulnerability to adversarial attacks. However, the proposed attacks\nrequire the full-knowledge of the attacked classifier, which is overly\nrestrictive. In this paper, we show such threats indeed exist, even when the\nattacker only has access to the input/output of the model. To this end, we\npropose the very first black-box adversarial attack approach in skeleton-based\nHAR called BASAR. BASAR explores the interplay between the classification\nboundary and the natural motion manifold. To our best knowledge, this is the\nfirst time data manifold is introduced in adversarial attacks on time series.\nVia BASAR, we find on-manifold adversarial samples are extremely deceitful and\nrather common in skeletal motions, in contrast to the common belief that\nadversarial samples only exist off-manifold. Through exhaustive evaluation, we\nshow that BASAR can deliver successful attacks across classifiers, datasets,\nand attack modes. By attack, BASAR helps identify the potential causes of the\nmodel vulnerability and provides insights on possible improvements. 
Finally, to\nmitigate the newly identified threat, we propose a new adversarial training\napproach by leveraging the sophisticated distributions of on/off-manifold\nadversarial samples, called mixed manifold-based adversarial training (MMAT).\nMMAT can successfully help defend against adversarial attacks without\ncompromising classification accuracy.\n","authors":["Yunfeng Diao","He Wang","Tianjia Shao","Yong-Liang Yang","Kun Zhou","David Hogg","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2211.11312v2.pdf","comment":"Accepted in Pattern Recognition. arXiv admin note: substantial text\n overlap with arXiv:2103.05266"},{"id":"http://arxiv.org/abs/2405.03349v1","updated":"2024-05-06T10:59:15Z","published":"2024-05-06T10:59:15Z","title":"Retinexmamba: Retinex-based Mamba for Low-light Image Enhancement","summary":" In the field of low-light image enhancement, both traditional Retinex methods\nand advanced deep learning techniques such as Retinexformer have shown distinct\nadvantages and limitations. Traditional Retinex methods, designed to mimic the\nhuman eye's perception of brightness and color, decompose images into\nillumination and reflection components but struggle with noise management and\ndetail preservation under low light conditions. Retinexformer enhances\nillumination estimation through traditional self-attention mechanisms, but\nfaces challenges with insufficient interpretability and suboptimal enhancement\neffects. To overcome these limitations, this paper introduces the RetinexMamba\narchitecture. RetinexMamba not only captures the physical intuitiveness of\ntraditional Retinex methods but also integrates the deep learning framework of\nRetinexformer, leveraging the computational efficiency of State Space Models\n(SSMs) to enhance processing speed. This architecture features innovative\nillumination estimators and damage restorer mechanisms that maintain image\nquality during enhancement. Moreover, RetinexMamba replaces the IG-MSA\n(Illumination-Guided Multi-Head Attention) in Retinexformer with a\nFused-Attention mechanism, improving the model's interpretability. Experimental\nevaluations on the LOL dataset show that RetinexMamba outperforms existing deep\nlearning approaches based on Retinex theory in both quantitative and\nqualitative metrics, confirming its effectiveness and superiority in enhancing\nlow-light images.\n","authors":["Jiesong Bai","Yuhao Yin","Qiyuan He"],"pdf_url":"https://arxiv.org/pdf/2405.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10490v2","updated":"2024-05-06T10:53:37Z","published":"2024-04-16T11:57:03Z","title":"Enhancing Sign Language Teaching: A Mixed Reality Approach for Immersive\n Learning and Multi-Dimensional Feedback","summary":" Traditional sign language teaching methods face challenges such as limited\nfeedback and diverse learning scenarios. Although 2D resources lack real-time\nfeedback, classroom teaching is constrained by a scarcity of teacher. Methods\nbased on VR and AR have relatively primitive interaction feedback mechanisms.\nThis study proposes an innovative teaching model that uses real-time monocular\nvision and mixed reality technology. First, we introduce an improved\nhand-posture reconstruction method to achieve sign language semantic retention\nand real-time feedback. Second, a ternary system evaluation algorithm is\nproposed for a comprehensive assessment, maintaining good consistency with\nexperts in sign language. 
Furthermore, we use mixed reality technology to\nconstruct a scenario-based 3D sign language classroom and explore the user\nexperience of scenario teaching. Overall, this paper presents a novel teaching\nmethod that provides an immersive learning experience, advanced posture\nreconstruction, and precise feedback, achieving positive feedback on user\nexperience and learning effectiveness.\n","authors":["Hongli Wen","Yang Xu","Lin Li","Xudong Ru","Xingce Wang","Zhongke Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10490v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.04956v3","updated":"2024-05-06T10:43:48Z","published":"2024-04-07T13:30:10Z","title":"Gaussian Shading: Provable Performance-Lossless Image Watermarking for\n Diffusion Models","summary":" Ethical concerns surrounding copyright protection and inappropriate content\ngeneration pose challenges for the practical implementation of diffusion\nmodels. One effective solution involves watermarking the generated images.\nHowever, existing methods often compromise the model performance or require\nadditional training, which is undesirable for operators and users. To address\nthis issue, we propose Gaussian Shading, a diffusion model watermarking\ntechnique that is both performance-lossless and training-free, while serving\nthe dual purpose of copyright protection and tracing of offending content. Our\nwatermark embedding is free of model parameter modifications and thus is\nplug-and-play. We map the watermark to latent representations following a\nstandard Gaussian distribution, which is indistinguishable from latent\nrepresentations obtained from the non-watermarked diffusion model. Therefore we\ncan achieve watermark embedding with lossless performance, for which we also\nprovide theoretical proof. Furthermore, since the watermark is intricately\nlinked with image semantics, it exhibits resilience to lossy processing and\nerasure attempts. The watermark can be extracted by Denoising Diffusion\nImplicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian\nShading on multiple versions of Stable Diffusion, and the results demonstrate\nthat Gaussian Shading not only is performance-lossless but also outperforms\nexisting methods in terms of robustness.\n","authors":["Zijin Yang","Kai Zeng","Kejiang Chen","Han Fang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04956v3.pdf","comment":"17 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2405.01726v2","updated":"2024-05-06T10:27:49Z","published":"2024-05-02T20:44:26Z","title":"SSUMamba: Spatial-Spectral Selective State Space Model for Hyperspectral\n Image Denoising","summary":" Denoising hyperspectral images (HSIs) is a crucial preprocessing procedure\ndue to the noise originating from intra-imaging mechanisms and environmental\nfactors. Utilizing domain-specific knowledge of HSIs, such as spectral\ncorrelation, spatial self-similarity, and spatial-spectral correlation, is\nessential for deep learning-based denoising. Existing methods are often\nconstrained by running time, space complexity, and computational complexity,\nemploying strategies that explore these priors separately. While these\nstrategies can avoid some redundant information, they inevitably overlook\nbroader and more underlying long-range spatial-spectral information that\npositively impacts image restoration. 
This paper proposes a Spatial-Spectral\nSelective State Space Model-based U-shaped network, termed Spatial-Spectral\nU-Mamba (SSUMamba), for hyperspectral image denoising. We can obtain complete\nglobal spatial-spectral correlation within a module thanks to the linear space\ncomplexity in State Space Model (SSM) computations. We introduce a\nSpatial-Spectral Alternating Scan (SSAS) strategy for HSIs, which helps model\nthe information flow in multiple directions in 3-D HSIs. Experimental results\ndemonstrate that our method outperforms compared methods. The source code will\nbe available at https://github.com/lronkitty/SSUMamba.\n","authors":["Guanyiman Fu","Fengchao Xiong","Jianfeng Lu","Jun Zhou","Yuntao Qian"],"pdf_url":"https://arxiv.org/pdf/2405.01726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03333v1","updated":"2024-05-06T10:26:06Z","published":"2024-05-06T10:26:06Z","title":"Light-VQA+: A Video Quality Assessment Model for Exposure Correction\n with Vision-Language Guidance","summary":" Recently, User-Generated Content (UGC) videos have gained popularity in our\ndaily lives. However, UGC videos often suffer from poor exposure due to the\nlimitations of photographic equipment and techniques. Therefore, Video Exposure\nCorrection (VEC) algorithms have been proposed, Low-Light Video Enhancement\n(LLVE) and Over-Exposed Video Recovery (OEVR) included. Equally important to\nthe VEC is the Video Quality Assessment (VQA). Unfortunately, almost all\nexisting VQA models are built generally, measuring the quality of a video from\na comprehensive perspective. As a result, Light-VQA, trained on LLVE-QA, is\nproposed for assessing LLVE. We extend the work of Light-VQA by expanding the\nLLVE-QA dataset into Video Exposure Correction Quality Assessment (VEC-QA)\ndataset with over-exposed videos and their corresponding corrected versions. In\naddition, we propose Light-VQA+, a VQA model specialized in assessing VEC.\nLight-VQA+ differs from Light-VQA mainly from the usage of the CLIP model and\nthe vision-language guidance during the feature extraction, followed by a new\nmodule referring to the Human Visual System (HVS) for more accurate assessment.\nExtensive experimental results show that our model achieves the best\nperformance against the current State-Of-The-Art (SOTA) VQA models on the\nVEC-QA dataset and other public datasets.\n","authors":["Xunchu Zhou","Xiaohong Liu","Yunlong Dong","Tengchuan Kou","Yixuan Gao","Zicheng Zhang","Chunyi Li","Haoning Wu","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2405.03333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12015v2","updated":"2024-05-06T10:18:21Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), to facilitate one-shot visual teaching for robotic\nmanipulation. This system analyzes videos of humans performing tasks and\noutputs executable robot programs that incorporate insights into affordances.\nThe process begins with GPT-4V analyzing the videos to obtain textual\nexplanations of environmental and action details. A GPT-4-based task planner\nthen encodes these details into a symbolic task plan. Subsequently, vision\nsystems spatially and temporally ground the task plan in the videos. Object are\nidentified using an open-vocabulary object detector, and hand-object\ninteractions are analyzed to pinpoint moments of grasping and releasing. 
This\nspatiotemporal grounding allows for the gathering of affordance information\n(e.g., grasp types, waypoints, and body postures) critical for robot execution.\nExperiments across various scenarios demonstrate the method's efficacy in\nachieving real robots' operations from human demonstrations in a one-shot\nmanner. Meanwhile, quantitative tests have revealed instances of hallucination\nin GPT-4V, highlighting the importance of incorporating human supervision\nwithin the pipeline. The prompts of GPT-4V/GPT-4 are available at this project\npage:\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v2.pdf","comment":"9 pages, 12 figures, 2 tables. Last updated on May 6th, 2024"},{"id":"http://arxiv.org/abs/2312.02052v2","updated":"2024-05-06T10:11:14Z","published":"2023-12-04T17:10:25Z","title":"DUCK: Distance-based Unlearning via Centroid Kinematics","summary":" Machine Unlearning is rising as a new field, driven by the pressing necessity\nof ensuring privacy in modern artificial intelligence models. This technique\nprimarily aims to eradicate any residual influence of a specific subset of data\nfrom the knowledge acquired by a neural model during its training. This work\nintroduces a novel unlearning algorithm, denoted as Distance-based Unlearning\nvia Centroid Kinematics (DUCK), which employs metric learning to guide the\nremoval of samples matching the nearest incorrect centroid in the embedding\nspace. Evaluation of the algorithm's performance is conducted across various\nbenchmark datasets in two distinct scenarios, class removal, and homogeneous\nsampling removal, obtaining state-of-the-art performance. We also introduce a\nnovel metric, called Adaptive Unlearning Score (AUS), encompassing not only the\nefficacy of the unlearning process in forgetting target data but also\nquantifying the performance loss relative to the original model. Additionally,\nwe conducted a thorough investigation of the unlearning mechanism in DUCK,\nexamining its impact on the organization of the feature space and employing\nexplainable AI techniques for deeper insights.\n","authors":["Marco Cotogni","Jacopo Bonato","Luigi Sabetta","Francesco Pelosin","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03328v1","updated":"2024-05-06T10:07:16Z","published":"2024-05-06T10:07:16Z","title":"Enhancing Spatiotemporal Disease Progression Models via Latent Diffusion\n and Prior Knowledge","summary":" In this work, we introduce Brain Latent Progression (BrLP), a novel\nspatiotemporal disease progression model based on latent diffusion. BrLP is\ndesigned to predict the evolution of diseases at the individual level on 3D\nbrain MRIs. Existing deep generative models developed for this task are\nprimarily data-driven and face challenges in learning disease progressions.\nBrLP addresses these challenges by incorporating prior knowledge from disease\nmodels to enhance the accuracy of predictions. To implement this, we propose to\nintegrate an auxiliary model that infers volumetric changes in various brain\nregions. 
Additionally, we introduce Latent Average Stabilization (LAS), a novel\ntechnique to improve spatiotemporal consistency of the predicted progression.\nBrLP is trained and evaluated on a large dataset comprising 11,730 T1-weighted\nbrain MRIs from 2,805 subjects, collected from three publicly available,\nlongitudinal Alzheimer's Disease (AD) studies. In our experiments, we compare\nthe MRI scans generated by BrLP with the actual follow-up MRIs available from\nthe subjects, in both cross-sectional and longitudinal settings. BrLP\ndemonstrates significant improvements over existing methods, with an increase\nof 22% in volumetric accuracy across AD-related brain regions and 43% in image\nsimilarity to the ground-truth scans. The ability of BrLP to generate\nconditioned 3D scans at the subject level, along with the novelty of\nintegrating prior knowledge to enhance accuracy, represents a significant\nadvancement in disease progression modeling, opening new avenues for precision\nmedicine. The code of BrLP is available at the following link:\nhttps://github.com/LemuelPuglisi/BrLP.\n","authors":["Lemuel Puglisi","Daniel C. Alexander","Daniele Ravì"],"pdf_url":"https://arxiv.org/pdf/2405.03328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19422v3","updated":"2024-05-06T10:06:09Z","published":"2024-02-29T18:21:54Z","title":"PEM: Prototype-based Efficient MaskFormer for Image Segmentation","summary":" Recent transformer-based architectures have shown impressive results in the\nfield of image segmentation. Thanks to their flexibility, they obtain\noutstanding performance in multiple segmentation tasks, such as semantic and\npanoptic, under a single unified framework. To achieve such impressive\nperformance, these architectures employ intensive operations and require\nsubstantial computational resources, which are often not available, especially\non edge devices. To fill this gap, we propose Prototype-based Efficient\nMaskFormer (PEM), an efficient transformer-based architecture that can operate\nin multiple segmentation tasks. PEM proposes a novel prototype-based\ncross-attention which leverages the redundancy of visual features to restrict\nthe computation and improve the efficiency without harming the performance. In\naddition, PEM introduces an efficient multi-scale feature pyramid network,\ncapable of extracting features that have high semantic content in an efficient\nway, thanks to the combination of deformable convolutions and context-based\nself-modulation. We benchmark the proposed PEM architecture on two tasks,\nsemantic and panoptic segmentation, evaluated on two different datasets,\nCityscapes and ADE20K. PEM demonstrates outstanding performance on every task\nand dataset, outperforming task-specific architectures while being comparable\nand even better than computationally-expensive baselines.\n","authors":["Niccolò Cavagnero","Gabriele Rosi","Claudia Cuttano","Francesca Pistilli","Marco Ciccone","Giuseppe Averta","Fabio Cermelli"],"pdf_url":"https://arxiv.org/pdf/2402.19422v3.pdf","comment":"CVPR 2024. 
Project page: https://niccolocavagnero.github.io/PEM"},{"id":"http://arxiv.org/abs/2402.07739v4","updated":"2024-05-06T09:50:22Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03318v1","updated":"2024-05-06T09:50:04Z","published":"2024-05-06T09:50:04Z","title":"Enhancing DETRs Variants through Improved Content Query and Similar\n Query Aggregation","summary":" The design of the query is crucial for the performance of DETR and its\nvariants. Each query consists of two components: a content part and a\npositional one. Traditionally, the content query is initialized with a zero or\nlearnable embedding, lacking essential content information and resulting in\nsub-optimal performance. In this paper, we introduce a novel plug-and-play\nmodule, Self-Adaptive Content Query (SACQ), to address this limitation. The\nSACQ module utilizes features from the transformer encoder to generate content\nqueries via self-attention pooling. This allows candidate queries to adapt to\nthe input image, resulting in a more comprehensive content prior and better\nfocus on target objects. However, this improved concentration poses a challenge\nfor the training process that utilizes the Hungarian matching, which selects\nonly a single candidate and suppresses other similar ones. To overcome this, we\npropose a query aggregation strategy to cooperate with SACQ. It merges similar\npredicted candidates from different queries, easing the optimization. 
Our\nextensive experiments on the COCO dataset demonstrate the effectiveness of our\nproposed approaches across six different DETR's variants with multiple\nconfigurations, achieving an average improvement of over 1.0 AP.\n","authors":["Yingying Zhang","Chuangji Shi","Xin Guo","Jiangwei Lao","Jian Wang","Jiaotuan Wang","Jingdong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.03318v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.03314v1","updated":"2024-05-06T09:41:31Z","published":"2024-05-06T09:41:31Z","title":"Deep Learning-based Point Cloud Registration for Augmented\n Reality-guided Surgery","summary":" Point cloud registration aligns 3D point clouds using spatial\ntransformations. It is an important task in computer vision, with applications\nin areas such as augmented reality (AR) and medical imaging. This work explores\nthe intersection of two research trends: the integration of AR into\nimage-guided surgery and the use of deep learning for point cloud registration.\nThe main objective is to evaluate the feasibility of applying deep\nlearning-based point cloud registration methods for image-to-patient\nregistration in augmented reality-guided surgery. We created a dataset of point\nclouds from medical imaging and corresponding point clouds captured with a\npopular AR device, the HoloLens 2. We evaluate three well-established deep\nlearning models in registering these data pairs. While we find that some deep\nlearning methods show promise, we show that a conventional registration\npipeline still outperforms them on our challenging dataset.\n","authors":["Maximilian Weber","Daniel Wild","Jens Kleesiek","Jan Egger","Christina Gsaxner"],"pdf_url":"https://arxiv.org/pdf/2405.03314v1.pdf","comment":"5 pages, 4 figures; accepted at IEEE ISBI 2024"},{"id":"http://arxiv.org/abs/2405.03311v1","updated":"2024-05-06T09:39:13Z","published":"2024-05-06T09:39:13Z","title":"Federated Learning for Drowsiness Detection in Connected Vehicles","summary":" Ensuring driver readiness poses challenges, yet driver monitoring systems can\nassist in determining the driver's state. By observing visual cues, such\nsystems recognize various behaviors and associate them with specific\nconditions. For instance, yawning or eye blinking can indicate driver\ndrowsiness. Consequently, an abundance of distributed data is generated for\ndriver monitoring. Employing machine learning techniques, such as driver\ndrowsiness detection, presents a potential solution. However, transmitting the\ndata to a central machine for model training is impractical due to the large\ndata size and privacy concerns. Conversely, training on a single vehicle would\nlimit the available data and likely result in inferior performance. To address\nthese issues, we propose a federated learning framework for drowsiness\ndetection within a vehicular network, leveraging the YawDD dataset. Our\napproach achieves an accuracy of 99.2%, demonstrating its promise and\ncomparability to conventional deep learning techniques. 
Lastly, we show how our\nmodel scales using various number of federated clients\n","authors":["William Lindskog","Valentin Spannagl","Christian Prehofer"],"pdf_url":"https://arxiv.org/pdf/2405.03311v1.pdf","comment":"14 pages, 8 figures, 1 table, EAI INTSYS 2023 conference"},{"id":"http://arxiv.org/abs/2404.11054v2","updated":"2024-05-06T09:23:40Z","published":"2024-04-17T03:56:28Z","title":"Multilateral Temporal-view Pyramid Transformer for Video Inpainting\n Detection","summary":" The task of video inpainting detection is to expose the pixel-level inpainted\nregions within a video sequence. Existing methods usually focus on leveraging\nspatial and temporal inconsistencies. However, these methods typically employ\nfixed operations to combine spatial and temporal clues, limiting their\napplicability in different scenarios. In this paper, we introduce a novel\nMultilateral Temporal-view Pyramid Transformer ({\\em MumPy}) that collaborates\nspatial-temporal clues flexibly. Our method utilizes a newly designed\nmultilateral temporal-view encoder to extract various collaborations of\nspatial-temporal clues and introduces a deformable window-based temporal-view\ninteraction module to enhance the diversity of these collaborations.\nSubsequently, we develop a multi-pyramid decoder to aggregate the various types\nof features and generate detection maps. By adjusting the contribution strength\nof spatial and temporal clues, our method can effectively identify inpainted\nregions. We validate our method on existing datasets and also introduce a new\nchallenging and large-scale Video Inpainting dataset based on the YouTube-VOS\ndataset, which employs several more recent inpainting methods. The results\ndemonstrate the superiority of our method in both in-domain and cross-domain\nevaluation scenarios.\n","authors":["Ying Zhang","Yuezun Li","Bo Peng","Jiaran Zhou","Huiyu Zhou","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.11054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03301v1","updated":"2024-05-06T09:21:35Z","published":"2024-05-06T09:21:35Z","title":"Interpretable Network Visualizations: A Human-in-the-Loop Approach for\n Post-hoc Explainability of CNN-based Image Classification","summary":" Transparency and explainability in image classification are essential for\nestablishing trust in machine learning models and detecting biases and errors.\nState-of-the-art explainability methods generate saliency maps to show where a\nspecific class is identified, without providing a detailed explanation of the\nmodel's decision process. Striving to address such a need, we introduce a\npost-hoc method that explains the entire feature extraction process of a\nConvolutional Neural Network. These explanations include a layer-wise\nrepresentation of the features the model extracts from the input. Such features\nare represented as saliency maps generated by clustering and merging similar\nfeature maps, to which we associate a weight derived by generalizing Grad-CAM\nfor the proposed methodology. To further enhance these explanations, we include\na set of textual labels collected through a gamified crowdsourcing activity and\nprocessed using NLP techniques and Sentence-BERT. 
Finally, we show an approach\nto generate global explanations by aggregating labels across multiple images.\n","authors":["Matteo Bianchi","Antonio De Santis","Andrea Tocchetti","Marco Brambilla"],"pdf_url":"https://arxiv.org/pdf/2405.03301v1.pdf","comment":"International Joint Conference on Artificial Intelligence 2024 (to be\n published)"},{"id":"http://arxiv.org/abs/2404.13873v2","updated":"2024-05-06T09:19:25Z","published":"2024-04-22T04:47:52Z","title":"Texture-aware and Shape-guided Transformer for Sequential DeepFake\n Detection","summary":" Sequential DeepFake detection is an emerging task that aims to predict the\nmanipulation sequence in order. Existing methods typically formulate it as an\nimage-to-sequence problem, employing conventional Transformer architectures for\ndetection. However, these methods lack dedicated design and consequently result\nin limited performance. In this paper, we propose a novel Texture-aware and\nShape-guided Transformer to enhance detection performance. Our method features\nfour major improvements. Firstly, we describe a texture-aware branch that\neffectively captures subtle manipulation traces with the Diversiform Pixel\nDifference Attention module. Then we introduce a Bidirectional Interaction\nCross-attention module that seeks deep correlations among spatial and\nsequential features, enabling effective modeling of complex manipulation\ntraces. To further enhance the cross-attention, we describe a Shape-guided\nGaussian mapping strategy, providing initial priors of the manipulation shape.\nFinally, observing that the latter manipulation in a sequence may influence\ntraces left in the earlier one, we intriguingly invert the prediction order\nfrom forward to backward, leading to notable gains as expected. Extensive\nexperimental results demonstrate that our method outperforms others by a large\nmargin, highlighting the superiority of our method.\n","authors":["Yunfei Li","Yuezun Li","Xin Wang","Jiaran Zhou","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.13873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13872v2","updated":"2024-05-06T09:14:42Z","published":"2024-04-22T04:41:42Z","title":"FreqBlender: Enhancing DeepFake Detection by Blending Frequency\n Knowledge","summary":" Generating synthetic fake faces, known as pseudo-fake faces, is an effective\nway to improve the generalization of DeepFake detection. Existing methods\ntypically generate these faces by blending real or fake faces in color space.\nWhile these methods have shown promise, they overlook the simulation of\nfrequency distribution in pseudo-fake faces, limiting the learning of generic\nforgery traces in-depth. To address this, this paper introduces {\\em\nFreqBlender}, a new method that can generate pseudo-fake faces by blending\nfrequency knowledge. Specifically, we investigate the major frequency\ncomponents and propose a Frequency Parsing Network to adaptively partition\nfrequency components related to forgery traces. Then we blend this frequency\nknowledge from fake faces into real faces to generate pseudo-fake faces. Since\nthere is no ground truth for frequency components, we describe a dedicated\ntraining strategy by leveraging the inner correlations among different\nfrequency knowledge to instruct the learning process. 
Experimental results\ndemonstrate the effectiveness of our method in enhancing DeepFake detection,\nmaking it a potential plug-and-play strategy for other methods.\n","authors":["Hanzhe Li","Yuezun Li","Jiaran Zhou","Bin Li","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.13872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01558v2","updated":"2024-05-06T09:02:58Z","published":"2024-03-24T13:57:30Z","title":"Configurable Learned Holography","summary":" In the pursuit of advancing holographic display technology, we face a unique\nyet persistent roadblock: the inflexibility of learned holography in adapting\nto various hardware configurations. This is due to the variances in the complex\noptical components and system settings in existing holographic displays.\nAlthough the emerging learned approaches have enabled rapid and high-quality\nhologram generation, any alteration in display hardware still requires a\nretraining of the model. Our work introduces a configurable learned model that\ninteractively computes 3D holograms from RGB-only 2D images for a variety of\nholographic displays. The model can be conditioned to predefined hardware\nparameters of existing holographic displays such as working wavelengths, pixel\npitch, propagation distance, and peak brightness without having to retrain. In\naddition, our model accommodates various hologram types, including conventional\nsingle-color and emerging multi-color holograms that simultaneously use\nmultiple color primaries in holographic displays. Notably, we enabled our\nhologram computations to rely on identifying the correlation between depth\nestimation and 3D hologram synthesis tasks within the learning domain for the\nfirst time in the literature. We employ knowledge distillation via a\nstudent-teacher learning strategy to streamline our model for interactive\nperformance. Achieving up to a 2x speed improvement compared to\nstate-of-the-art models while consistently generating high-quality 3D holograms\nwith different hardware configurations.\n","authors":["Yicheng Zhan","Liang Shi","Wojciech Matusik","Qi Sun","Kaan Akşit"],"pdf_url":"https://arxiv.org/pdf/2405.01558v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.03280v1","updated":"2024-05-06T08:56:41Z","published":"2024-05-06T08:56:41Z","title":"Animate Your Thoughts: Decoupled Reconstruction of Dynamic Natural\n Vision from Slow Brain Activity","summary":" Reconstructing human dynamic vision from brain activity is a challenging task\nwith great scientific significance. The difficulty stems from two primary\nissues: (1) vision-processing mechanisms in the brain are highly intricate and\nnot fully revealed, making it challenging to directly learn a mapping between\nfMRI and video; (2) the temporal resolution of fMRI is significantly lower than\nthat of natural videos. To overcome these issues, this paper propose a\ntwo-stage model named Mind-Animator, which achieves state-of-the-art\nperformance on three public datasets. Specifically, during the fMRI-to-feature\nstage, we decouple semantic, structural, and motion features from fMRI through\nfMRI-vision-language tri-modal contrastive learning and sparse causal\nattention. In the feature-to-video stage, these features are merged to videos\nby an inflated Stable Diffusion. We substantiate that the reconstructed video\ndynamics are indeed derived from fMRI, rather than hallucinations of the\ngenerative model, through permutation tests. 
Additionally, the visualization of\nvoxel-wise and ROI-wise importance maps confirms the neurobiological\ninterpretability of our model.\n","authors":["Yizhuo Lu","Changde Du","Chong Wang","Xuanliu Zhu","Liuyun Jiang","Huiguang He"],"pdf_url":"https://arxiv.org/pdf/2405.03280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03272v1","updated":"2024-05-06T08:42:34Z","published":"2024-05-06T08:42:34Z","title":"WorldQA: Multimodal World Knowledge in Videos through Long-Chain\n Reasoning","summary":" Multimodal information, together with our knowledge, helps us to understand\nthe complex and dynamic world. Large language models (LLMs) and large multimodal\nmodels (LMMs), however, still struggle to emulate this capability. In this\npaper, we present WorldQA, a video understanding dataset designed to push the\nboundaries of multimodal world models with three appealing properties: (1)\nMultimodal Inputs: The dataset comprises 1007 question-answer pairs and 303\nvideos, necessitating the analysis of both auditory and visual data for\nsuccessful interpretation. (2) World Knowledge: We identify five essential\ntypes of world knowledge for question formulation. This approach challenges\nmodels to extend their capabilities beyond mere perception. (3) Long-Chain\nReasoning: Our dataset introduces an average reasoning step of 4.45, notably\nsurpassing other videoQA datasets. Furthermore, we introduce WorldRetriever, an\nagent designed to synthesize expert knowledge into a coherent reasoning chain,\nthereby facilitating accurate responses to WorldQA queries. Extensive\nevaluations of 13 prominent LLMs and LMMs reveal that WorldRetriever, although\nbeing the most effective model, achieved only 70% of human-level performance in\nmultiple-choice questions. This finding highlights the necessity for further\nadvancement in the reasoning and comprehension abilities of models. Our\nexperiments also yield several key insights. For instance, while humans tend to\nperform better with increased frames, current LMMs, including WorldRetriever,\nshow diminished performance under similar conditions. We hope that WorldQA, our\nmethodology, and these insights could contribute to the future development of\nmultimodal world models.\n","authors":["Yuanhan Zhang","Kaichen Zhang","Bo Li","Fanyi Pu","Christopher Arif Setiadharma","Jingkang Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2405.03272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00847v2","updated":"2024-05-06T08:14:01Z","published":"2024-01-01T18:56:54Z","title":"Mocap Everyone Everywhere: Lightweight Motion Capture With Smartwatches\n and a Head-Mounted Camera","summary":" We present a lightweight and affordable motion capture method based on two\nsmartwatches and a head-mounted camera. In contrast to the existing approaches\nthat use six or more expert-level IMU devices, our approach is much more\ncost-effective and convenient. Our method can make wearable motion capture\naccessible to everyone everywhere, enabling 3D full-body motion capture in\ndiverse environments. As a key idea to overcome the extreme sparsity and\nambiguities of sensor inputs with different modalities, we integrate 6D head\nposes obtained from the head-mounted cameras for motion estimation. To enable\ncapture in expansive indoor and outdoor scenes, we propose an algorithm to\ntrack and update floor level changes to define head poses, coupled with a\nmulti-stage Transformer-based regression module. 
We also introduce novel\nstrategies leveraging visual cues of egocentric images to further enhance the\nmotion capture quality while reducing ambiguities. We demonstrate the\nperformance of our method on various challenging scenarios, including complex\noutdoor environments and everyday motions including object interactions and\nsocial interactions among multiple individuals.\n","authors":["Jiye Lee","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2401.00847v2.pdf","comment":"Accepted to CVPR 2024; Project page:\n https://jiyewise.github.io/projects/MocapEvery/"},{"id":"http://arxiv.org/abs/2405.02179v2","updated":"2024-05-06T07:52:05Z","published":"2024-05-03T15:27:11Z","title":"Training-Free Deepfake Voice Recognition by Leveraging Large-Scale\n Pre-Trained Models","summary":" Generalization is a main issue for current audio deepfake detectors, which\nstruggle to provide reliable results on out-of-distribution data. Given the\nspeed at which more and more accurate synthesis methods are developed, it is\nvery important to design techniques that work well also on data they were not\ntrained for. In this paper we study the potential of large-scale pre-trained\nmodels for audio deepfake detection, with special focus on generalization\nability. To this end, the detection problem is reformulated in a speaker\nverification framework and fake audios are exposed by the mismatch between the\nvoice sample under test and the voice of the claimed identity. With this\nparadigm, no fake speech sample is necessary in training, cutting off any link\nwith the generation method at the root, and ensuring full generalization\nability. Features are extracted by general-purpose large pre-trained models,\nwith no need for training or fine-tuning on specific fake detection or speaker\nverification datasets. At detection time only a limited set of voice fragments\nof the identity under test is required. Experiments on several datasets\nwidespread in the community show that detectors based on pre-trained models\nachieve excellent performance and show strong generalization ability, rivaling\nsupervised methods on in-distribution data and largely overcoming them on\nout-of-distribution data.\n","authors":["Alessandro Pianese","Davide Cozzolino","Giovanni Poggi","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2405.02179v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03243v1","updated":"2024-05-06T07:51:13Z","published":"2024-05-06T07:51:13Z","title":"Mind the Gap Between Synthetic and Real: Utilizing Transfer Learning to\n Probe the Boundaries of Stable Diffusion Generated Data","summary":" Generative foundation models like Stable Diffusion comprise a diverse\nspectrum of knowledge in computer vision with the potential for transfer\nlearning, e.g., via generating data to train student models for downstream\ntasks. This could circumvent the necessity of collecting labeled real-world\ndata, thereby presenting a form of data-free knowledge distillation. However,\nthe resultant student models show a significant drop in accuracy compared to\nmodels trained on real data. We investigate possible causes for this drop and\nfocus on the role of the different layers of the student model. By training\nthese layers using either real or synthetic data, we reveal that the drop\nmainly stems from the model's final layers. 
Further, we briefly investigate\nother factors, such as differences in data-normalization between synthetic and\nreal, the impact of data augmentations, texture vs.\\ shape learning, and\nassuming oracle prompts. While we find that some of those factors can have an\nimpact, they are not sufficient to close the gap towards real data. Building\nupon our insights that mainly later layers are responsible for the drop, we\ninvestigate the data-efficiency of fine-tuning a synthetically trained model\nwith real data applied to only those last layers. Our results suggest an\nimproved trade-off between the amount of real training data used and the\nmodel's accuracy. Our findings contribute to the understanding of the gap\nbetween synthetic and real data and indicate solutions to mitigate the scarcity\nof labeled real data.\n","authors":["Leonhard Hennicke","Christian Medeiros Adriano","Holger Giese","Jan Mathias Koehler","Lukas Schott"],"pdf_url":"https://arxiv.org/pdf/2405.03243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15647v2","updated":"2024-05-06T07:45:53Z","published":"2024-01-28T12:51:01Z","title":"UP-CrackNet: Unsupervised Pixel-Wise Road Crack Detection via\n Adversarial Image Restoration","summary":" Over the past decade, automated methods have been developed to detect cracks\nmore efficiently, accurately, and objectively, with the ultimate goal of\nreplacing conventional manual visual inspection techniques. Among these\nmethods, semantic segmentation algorithms have demonstrated promising results\nin pixel-wise crack detection tasks. However, training such networks requires a\nlarge amount of human-annotated datasets with pixel-level annotations, which is\na highly labor-intensive and time-consuming process. Moreover, supervised\nlearning-based methods often struggle with poor generalizability in unseen\ndatasets. Therefore, we propose an unsupervised pixel-wise road crack detection\nnetwork, known as UP-CrackNet. Our approach first generates multi-scale square\nmasks and randomly selects them to corrupt undamaged road images by removing\ncertain regions. Subsequently, a generative adversarial network is trained to\nrestore the corrupted regions by leveraging the semantic context learned from\nsurrounding uncorrupted regions. During the testing phase, an error map is\ngenerated by calculating the difference between the input and restored images,\nwhich allows for pixel-wise crack detection. Our comprehensive experimental\nresults demonstrate that UP-CrackNet outperforms other general-purpose\nunsupervised anomaly detection algorithms, and exhibits satisfactory\nperformance and superior generalizability when compared with state-of-the-art\nsupervised crack segmentation algorithms. Our source code is publicly available\nat mias.group/UP-CrackNet.\n","authors":["Nachuan Ma","Rui Fan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2401.15647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03235v1","updated":"2024-05-06T07:44:46Z","published":"2024-05-06T07:44:46Z","title":"Cross-Modal Domain Adaptation in Brain Disease Diagnosis: Maximum Mean\n Discrepancy-based Convolutional Neural Networks","summary":" Brain disorders are a major challenge to global health, causing millions of\ndeaths each year. Accurate diagnosis of these diseases relies heavily on\nadvanced medical imaging techniques such as Magnetic Resonance Imaging (MRI)\nand Computed Tomography (CT). 
However, the scarcity of annotated data poses a\nsignificant challenge in deploying machine learning models for medical\ndiagnosis. To address this limitation, deep learning techniques have shown\nconsiderable promise. Domain adaptation techniques enhance a model's ability to\ngeneralize across imaging modalities by transferring knowledge from one domain\n(e.g., CT images) to another (e.g., MRI images). Such cross-modality adaptation\nis essential to improve the ability of models to consistently generalize across\ndifferent imaging modalities. This study collected relevant resources from the\nKaggle website and employed the Maximum Mean Discrepancy (MMD) method - a\npopular domain adaptation method - to reduce the differences between imaging\ndomains. By combining MMD with Convolutional Neural Networks (CNNs), the\naccuracy and utility of the model are noticeably enhanced. The excellent\nexperimental results highlight the great potential of data-driven domain\nadaptation techniques to improve diagnostic accuracy and efficiency, especially\nin resource-limited environments. By bridging the gap between different imaging\nmodalities, the study aims to provide clinicians with more reliable diagnostic\ntools.\n","authors":["Xuran Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.02410v2","updated":"2024-05-06T07:33:24Z","published":"2022-07-06T02:49:02Z","title":"A Deep Model for Partial Multi-Label Image Classification with\n Curriculum Based Disambiguation","summary":" In this paper, we study the partial multi-label (PML) image classification\nproblem, where each image is annotated with a candidate label set consisting of\nmultiple relevant labels and other noisy labels. Existing PML methods typically\ndesign a disambiguation strategy to filter out noisy labels by utilizing prior\nknowledge with extra assumptions, which unfortunately is unavailable in many\nreal tasks. Furthermore, because the objective function for disambiguation is\nusually elaborately designed on the whole training set, it can hardly be\noptimized in a deep model with SGD on mini-batches. In this paper, for the\nfirst time we propose a deep model for PML to enhance the representation and\ndiscrimination ability. On one hand, we propose a novel curriculum based\ndisambiguation strategy to progressively identify ground-truth labels by\nincorporating the varied difficulties of different classes. On the other hand,\na consistency regularization is introduced for model retraining to balance\nfitting identified easy labels and exploiting potential relevant labels.\nExtensive experimental results on the commonly used benchmark datasets show the\nproposed method significantly outperforms the SOTA methods.\n","authors":["Feng Sun","Ming-Kun Xie","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2207.02410v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.17222v3","updated":"2024-05-06T07:32:05Z","published":"2023-03-30T08:36:48Z","title":"LatentForensics: Towards frugal deepfake detection in the StyleGAN\n latent space","summary":" The classification of forged videos has been a challenge for the past few\nyears. Deepfake classifiers can now reliably predict whether or not video\nframes have been tampered with. However, their performance is tied to both the\ndataset used for training and the analyst's computational power. 
We propose a\ndeepfake detection method that operates in the latent space of a\nstate-of-the-art generative adversarial network (GAN) trained on high-quality\nface images. The proposed method leverages the structure of the latent space of\nStyleGAN to learn a lightweight binary classification model. Experimental\nresults on standard datasets reveal that the proposed approach outperforms\nother state-of-the-art deepfake classification methods, especially in contexts\nwhere the data available to train the models is rare, such as when a new\nmanipulation method is introduced. To the best of our knowledge, this is the\nfirst study showing the interest of the latent space of StyleGAN for deepfake\nclassification. Combined with other recent studies on the interpretation and\nmanipulation of this latent space, we believe that the proposed approach can\nfurther help in developing frugal deepfake classification methods based on\ninterpretable high-level properties of face images.\n","authors":["Matthieu Delmas","Amine Kacete","Stephane Paquelet","Simon Leglaive","Renaud Seguier"],"pdf_url":"https://arxiv.org/pdf/2303.17222v3.pdf","comment":"7 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2405.03221v1","updated":"2024-05-06T07:30:31Z","published":"2024-05-06T07:30:31Z","title":"Spatial and Surface Correspondence Field for Interaction Transfer","summary":" In this paper, we introduce a new method for the task of interaction\ntransfer. Given an example interaction between a source object and an agent,\nour method can automatically infer both surface and spatial relationships for\nthe agent and target objects within the same category, yielding more accurate\nand valid transfers. Specifically, our method characterizes the example\ninteraction using a combined spatial and surface representation. We correspond\nthe agent points and object points related to the representation to the target\nobject space using a learned spatial and surface correspondence field, which\nrepresents objects as deformed and rotated signed distance fields. With the\ncorresponded points, an optimization is performed under the constraints of our\nspatial and surface interaction representation and additional regularization.\nExperiments conducted on human-chair and hand-mug interaction transfer tasks\nshow that our approach can handle larger geometry and topology variations\nbetween source and target shapes, significantly outperforming state-of-the-art\nmethods.\n","authors":["Zeyu Huang","Honghao Xu","Haibin Huang","Chongyang Ma","Hui Huang","Ruizhen Hu"],"pdf_url":"https://arxiv.org/pdf/2405.03221v1.pdf","comment":"Accepted to SIGGRAPH 2024, project page at\n https://vcc.tech/research/2024/InterTransfer"},{"id":"http://arxiv.org/abs/2405.03218v1","updated":"2024-05-06T07:27:30Z","published":"2024-05-06T07:27:30Z","title":"Elevator, Escalator or Neither? Classifying Pedestrian Conveyor State\n Using Inertial Navigation System","summary":" Classifying a pedestrian in one of the three conveyor states of \"elevator,\"\n\"escalator\" and \"neither\" is fundamental to many applications such as indoor\nlocalization and people flow analysis. We estimate, for the first time, the\npedestrian conveyor state given the inertial navigation system (INS) readings\nof accelerometer, gyroscope and magnetometer sampled from the phone. Our\nproblem is challenging because the INS signals of the conveyor state are\ncoupled and perturbed by unpredictable arbitrary human actions, confusing the\ndecision process. 
We propose ELESON, a novel, effective and lightweight\nINS-based deep learning approach to classify whether a pedestrian is in an\nelevator, escalator or neither. ELESON utilizes a motion feature extractor to\ndecouple the conveyor state from human action in the feature space, and a\nmagnetic feature extractor to account for the speed difference between elevator\nand escalator. Given the results of the extractors, it employs an evidential\nstate classifier to estimate the confidence of the pedestrian states. Based on\nextensive experiments conducted on twenty hours of real pedestrian data, we\ndemonstrate that ELESON outperforms significantly the state-of-the-art\napproaches (where combined INS signals of both the conveyor state and human\nactions are processed together), with 15% classification improvement in F1\nscore, stronger confidence discriminability with 10% increase in AUROC (Area\nUnder the Receiver Operating Characteristics), and low computational and memory\nrequirements on smartphones.\n","authors":["Tianlang He","Zhiqiu Xia","S. -H. Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2405.03218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03202v1","updated":"2024-05-06T07:02:24Z","published":"2024-05-06T07:02:24Z","title":"Hierarchical Space-Time Attention for Micro-Expression Recognition","summary":" Micro-expression recognition (MER) aims to recognize the short and subtle\nfacial movements from the Micro-expression (ME) video clips, which reveal real\nemotions. Recent MER methods mostly only utilize special frames from ME video\nclips or extract optical flow from these special frames. However, they neglect\nthe relationship between movements and space-time, while facial cues are hidden\nwithin these relationships. To solve this issue, we propose the Hierarchical\nSpace-Time Attention (HSTA). Specifically, we first process ME video frames and\nspecial frames or data parallelly by our cascaded Unimodal Space-Time Attention\n(USTA) to establish connections between subtle facial movements and specific\nfacial areas. Then, we design Crossmodal Space-Time Attention (CSTA) to achieve\na higher-quality fusion for crossmodal data. Finally, we hierarchically\nintegrate USTA and CSTA to grasp the deeper facial cues. Our model emphasizes\ntemporal modeling without neglecting the processing of special data, and it\nfuses the contents in different modalities while maintaining their respective\nuniqueness. Extensive experiments on the four benchmarks show the effectiveness\nof our proposed HSTA. Specifically, compared with the latest method on the\nCASME3 dataset, it achieves about 3% score improvement in seven-category\nclassification.\n","authors":["Haihong Hao","Shuo Wang","Huixia Ben","Yanbin Hao","Yansong Wang","Weiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03202v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.01460v2","updated":"2024-05-06T06:50:10Z","published":"2024-05-02T16:49:25Z","title":"Purify Unlearnable Examples via Rate-Constrained Variational\n Autoencoders","summary":" Unlearnable examples (UEs) seek to maximize testing error by making subtle\nmodifications to training examples that are correctly labeled. Defenses against\nthese poisoning attacks can be categorized based on whether specific\ninterventions are adopted during training. The first approach is training-time\ndefense, such as adversarial training, which can mitigate poisoning effects but\nis computationally intensive. 
The other approach is pre-training purification,\ne.g., image short squeezing, which consists of several simple compressions but\noften encounters challenges in dealing with various UEs. Our work provides a\nnovel disentanglement mechanism to build an efficient pre-training purification\nmethod. Firstly, we uncover rate-constrained variational autoencoders (VAEs),\ndemonstrating a clear tendency to suppress the perturbations in UEs. We\nsubsequently conduct a theoretical analysis for this phenomenon. Building upon\nthese insights, we introduce a disentangle variational autoencoder (D-VAE),\ncapable of disentangling the perturbations with learnable class-wise\nembeddings. Based on this network, a two-stage purification approach is\nnaturally developed. The first stage focuses on roughly eliminating\nperturbations, while the second stage produces refined, poison-free results,\nensuring effectiveness and robustness across various scenarios. Extensive\nexperiments demonstrate the remarkable performance of our method across\nCIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at\nhttps://github.com/yuyi-sd/D-VAE.\n","authors":["Yi Yu","Yufei Wang","Song Xia","Wenhan Yang","Shijian Lu","Yap-Peng Tan","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2405.01460v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.03197v1","updated":"2024-05-06T06:45:23Z","published":"2024-05-06T06:45:23Z","title":"StyleSeg V2: Towards Robust One-shot Segmentation of Brain Tissue via\n Optimization-free Registration Error Perception","summary":" One-shot segmentation of brain tissue requires training\nregistration-segmentation (reg-seg) dual-model iteratively, where reg-model\naims to provide pseudo masks of unlabeled images for seg-model by warping a\ncarefully-labeled atlas. However, the imperfect reg-model induces image-mask\nmisalignment, poisoning the seg-model subsequently. Recent StyleSeg bypasses\nthis bottleneck by replacing the unlabeled images with their warped copies of\natlas, but needs to borrow the diverse image patterns via style transformation.\nHere, we present StyleSeg V2, inherited from StyleSeg but granted the ability\nof perceiving the registration errors. The motivation is that good registration\nbehaves in a mirrored fashion for mirrored images. Therefore, almost at no\ncost, StyleSeg V2 can have reg-model itself \"speak out\" incorrectly-aligned\nregions by simply mirroring (symmetrically flipping the brain) its input, and\nthe registration errors are symmetric inconsistencies between the outputs of\noriginal and mirrored inputs. Consequently, StyleSeg V2 allows the seg-model to\nmake use of correctly-aligned regions of unlabeled images and also enhances the\nfidelity of style-transformed warped atlas image by weighting the local\ntransformation strength according to registration errors. 
The experimental\nresults on three public datasets demonstrate that our proposed StyleSeg V2\noutperforms other state-of-the-arts by considerable margins, and exceeds\nStyleSeg by increasing the average Dice by at least 2.4%.\n","authors":["Zhiwei Wang","Xiaoyu Zeng","Chongwei Wu","Jinxin lv","Xu Zhang","Wei Fang","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.03197v1.pdf","comment":"9 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.03194v1","updated":"2024-05-06T06:38:49Z","published":"2024-05-06T06:38:49Z","title":"CityLLaVA: Efficient Fine-Tuning for VLMs in City Scenario","summary":" In the vast and dynamic landscape of urban settings, Traffic Safety\nDescription and Analysis plays a pivotal role in applications ranging from\ninsurance inspection to accident prevention. This paper introduces CityLLaVA, a\nnovel fine-tuning framework for Visual Language Models (VLMs) designed for\nurban scenarios. CityLLaVA enhances model comprehension and prediction accuracy\nthrough (1) employing bounding boxes for optimal visual data preprocessing,\nincluding video best-view selection and visual prompt engineering during both\ntraining and testing phases; (2) constructing concise Question-Answer sequences\nand designing textual prompts to refine instruction comprehension; (3)\nimplementing block expansion to fine-tune large VLMs efficiently; and (4)\nadvancing prediction accuracy via a unique sequential questioning-based\nprediction augmentation. Demonstrating top-tier performance, our method\nachieved a benchmark score of 33.4308, securing the leading position on the\nleaderboard. The code can be found:\nhttps://github.com/alibaba/AICITY2024_Track2_AliOpenTrek_CityLLaVA\n","authors":["Zhizhao Duan","Hao Cheng","Duo Xu","Xi Wu","Xiangxie Zhang","Xi Ye","Zhen Xie"],"pdf_url":"https://arxiv.org/pdf/2405.03194v1.pdf","comment":"Accepted by AICITY2024 Workshop Track2 at CVPR2024"},{"id":"http://arxiv.org/abs/2401.03470v2","updated":"2024-05-06T06:33:39Z","published":"2024-01-07T12:34:45Z","title":"FurniScene: A Large-scale 3D Room Dataset with Intricate Furnishing\n Scenes","summary":" Indoor scene generation has attracted significant attention recently as it is\ncrucial for applications of gaming, virtual reality, and interior design.\nCurrent indoor scene generation methods can produce reasonable room layouts but\noften lack diversity and realism. This is primarily due to the limited coverage\nof existing datasets, including only large furniture without tiny furnishings\nin daily life. To address these challenges, we propose FurniScene, a\nlarge-scale 3D room dataset with intricate furnishing scenes from interior\ndesign professionals. Specifically, the FurniScene consists of 11,698 rooms and\n39,691 unique furniture CAD models with 89 different types, covering things\nfrom large beds to small teacups on the coffee table. To better suit\nfine-grained indoor scene layout generation, we introduce a novel Two-Stage\nDiffusion Scene Model (TSDSM) and conduct an evaluation benchmark for various\nindoor scene generation based on FurniScene. Quantitative and qualitative\nevaluations demonstrate the capability of our method to generate highly\nrealistic indoor scenes. 
Our dataset and code will be publicly available soon.\n","authors":["Genghao Zhang","Yuxi Wang","Chuanchen Luo","Shibiao Xu","Zhaoxiang Zhang","Man Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2401.03470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03193v1","updated":"2024-05-06T06:32:58Z","published":"2024-05-06T06:32:58Z","title":"Exploring Frequencies via Feature Mixing and Meta-Learning for Improving\n Adversarial Transferability","summary":" Recent studies have shown that Deep Neural Networks (DNNs) are susceptible to\nadversarial attacks, with frequency-domain analysis underscoring the\nsignificance of high-frequency components in influencing model predictions.\nConversely, targeting low-frequency components has been effective in enhancing\nattack transferability on black-box models. In this study, we introduce a\nfrequency decomposition-based feature mixing method to exploit these frequency\ncharacteristics in both clean and adversarial samples. Our findings suggest\nthat incorporating features of clean samples into adversarial features\nextracted from adversarial examples is more effective in attacking\nnormally-trained models, while combining clean features with the adversarial\nfeatures extracted from low-frequency parts decomposed from the adversarial\nsamples yields better results in attacking defense models. However, a conflict\nissue arises when these two mixing approaches are employed simultaneously. To\ntackle the issue, we propose a cross-frequency meta-optimization approach\ncomprising the meta-train step, meta-test step, and final update. In the\nmeta-train step, we leverage the low-frequency components of adversarial\nsamples to boost the transferability of attacks against defense models.\nMeanwhile, in the meta-test step, we utilize adversarial samples to stabilize\ngradients, thereby enhancing the attack's transferability against normally\ntrained models. For the final update, we update the adversarial sample based on\nthe gradients obtained from both meta-train and meta-test steps. Our proposed\nmethod is evaluated through extensive experiments on the ImageNet-Compatible\ndataset, affirming its effectiveness in improving the transferability of\nattacks on both normally-trained CNNs and defense models.\n The source code is available at https://github.com/WJJLL/MetaSSA.\n","authors":["Juanjuan Weng","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2405.03193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03190v1","updated":"2024-05-06T06:30:17Z","published":"2024-05-06T06:30:17Z","title":"Adapting Dual-encoder Vision-language Models for Paraphrased Retrieval","summary":" In recent years, dual-encoder vision-language models (\\eg CLIP) have\nachieved remarkable text-to-image retrieval performance. However, we discover\nthat these models usually result in very different retrievals for a pair of\nparaphrased queries. Such behavior might render the retrieval system less\npredictable and lead to user frustration. In this work, we consider the task of\nparaphrased text-to-image retrieval where a model aims to return similar\nresults given a pair of paraphrased queries. To start with, we collect a\ndataset of paraphrased image descriptions to facilitate quantitative evaluation\nfor this task. We then hypothesize that the undesired behavior of existing\ndual-encoder models is due to their text towers which are trained on\nimage-sentence pairs and lack the ability to capture the semantic similarity\nbetween paraphrased queries. 
To improve on this, we investigate multiple\nstrategies for training a dual-encoder model starting from a language model\npretrained on a large text corpus. Compared to public dual-encoder models such\nas CLIP and OpenCLIP, the model trained with our best adaptation strategy\nachieves a significantly higher ranking similarity for paraphrased queries\nwhile maintaining similar zero-shot classification and retrieval accuracy.\n","authors":["Jiacheng Cheng","Hijung Valentina Shin","Nuno Vasconcelos","Bryan Russell","Fabian Caba Heilbron"],"pdf_url":"https://arxiv.org/pdf/2405.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18423v2","updated":"2024-05-06T06:10:29Z","published":"2024-04-29T04:47:23Z","title":"Unsupervised Dynamics Prediction with Object-Centric Kinematics","summary":" Human perception involves discerning complex multi-object scenes into\ntime-static object appearance (ie, size, shape, color) and time-varying object\nmotion (ie, location, velocity, acceleration). This innate ability to\nunconsciously understand the environment is the motivation behind the success\nof dynamics modeling. Object-centric representations have emerged as a\npromising tool for dynamics prediction, yet they primarily focus on the\nobjects' appearance, often overlooking other crucial attributes. In this paper,\nwe propose Object-Centric Kinematics (OCK), a framework for dynamics prediction\nleveraging object-centric representations. Our model utilizes a novel component\nnamed object kinematics, which comprises low-level structured states of\nobjects' position, velocity, and acceleration. The object kinematics are\nobtained via either implicit or explicit approaches, enabling comprehensive\nspatiotemporal object reasoning, and integrated through various transformer\nmechanisms, facilitating effective object-centric dynamics modeling. Our model\ndemonstrates superior performance when handling objects and backgrounds in\ncomplex scenes characterized by a wide range of object attributes and dynamic\nmovements. Moreover, our model demonstrates generalization capabilities across\ndiverse synthetic environments, highlighting its potential for broad\napplicability in vision-related tasks.\n","authors":["Yeon-Ji Song","Suhyung Choi","Jaein Kim","Jin-Hwa Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.18423v2.pdf","comment":"15 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.03177v1","updated":"2024-05-06T05:58:49Z","published":"2024-05-06T05:58:49Z","title":"Transformer-based RGB-T Tracking with Channel and Spatial Feature Fusion","summary":" Complementary RGB and TIR modalities enable RGB-T tracking to achieve\ncompetitive performance in challenging scenarios. Therefore, how to better fuse\ncross-modal features is the core issue of RGB-T tracking. Some previous methods\neither insufficiently fuse RGB and TIR features, or depend on intermediaries\ncontaining information from both modalities to achieve cross-modal information\ninteraction. The former does not fully exploit the potential of using only RGB\nand TIR information of the template or search region for channel and spatial\nfeature fusion, and the latter lacks direct interaction between the template\nand search area, which limits the model's ability to fully exploit the original\nsemantic information of both modalities. To alleviate these limitations, we\nexplore how to improve the performance of a visual Transformer by using direct\nfusion of cross-modal channels and spatial features, and propose CSTNet. 
CSTNet\nuses ViT as a backbone and inserts cross-modal channel feature fusion modules\n(CFM) and cross-modal spatial feature fusion modules (SFM) for direct\ninteraction between RGB and TIR features. The CFM performs parallel joint\nchannel enhancement and joint multilevel spatial feature modeling of RGB and\nTIR features and sums the features, and then globally integrates the sum\nfeature with the original features. The SFM uses cross-attention to model the\nspatial relationship of cross-modal features and then introduces a\nconvolutional feedforward network for joint spatial and channel integration of\nmultimodal features. Comprehensive experiments show that CSTNet achieves\nstate-of-the-art performance on three public RGB-T tracking benchmarks. Code is\navailable at https://github.com/LiYunfengLYF/CSTNet.\n","authors":["Yunfeng Li","Bo Wang","Ye Li","Zhiwen Yu","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03177v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.03162v2","updated":"2024-05-06T05:37:20Z","published":"2024-02-05T16:30:57Z","title":"Direct-a-Video: Customized Video Generation with User-Directed Camera\n Movement and Object Motion","summary":" Recent text-to-video diffusion models have achieved impressive progress. In\npractice, users often desire the ability to control object motion and camera\nmovement independently for customized video creation. However, current methods\nlack the focus on separately controlling object motion and camera movement in a\ndecoupled manner, which limits the controllability and flexibility of\ntext-to-video models. In this paper, we introduce Direct-a-Video, a system that\nallows users to independently specify motions for multiple objects as well as\ncamera's pan and zoom movements, as if directing a video. We propose a simple\nyet effective strategy for the decoupled control of object motion and camera\nmovement. Object motion is controlled through spatial cross-attention\nmodulation using the model's inherent priors, requiring no additional\noptimization. For camera movement, we introduce new temporal cross-attention\nlayers to interpret quantitative camera movement parameters. We further employ\nan augmentation-based approach to train these layers in a self-supervised\nmanner on a small-scale dataset, eliminating the need for explicit motion\nannotation. Both components operate independently, allowing individual or\ncombined control, and can generalize to open-domain scenarios. Extensive\nexperiments demonstrate the superiority and effectiveness of our method.\nProject page and code are available at https://direct-a-video.github.io/.\n","authors":["Shiyuan Yang","Liang Hou","Haibin Huang","Chongyang Ma","Pengfei Wan","Di Zhang","Xiaodong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2402.03162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13733v2","updated":"2024-05-06T05:34:33Z","published":"2024-04-21T18:19:27Z","title":"Elucidating the Design Space of Dataset Condensation","summary":" Dataset condensation, a concept within data-centric learning, efficiently\ntransfers critical attributes from an original dataset to a synthetic version,\nmaintaining both diversity and realism. 
This approach significantly improves\nmodel training efficiency and is adaptable across multiple application areas.\nPrevious methods in dataset condensation have faced challenges: some incur high\ncomputational costs which limit scalability to larger datasets (e.g., MTT,\nDREAM, and TESLA), while others are restricted to less optimal design spaces,\nwhich could hinder potential improvements, especially in smaller datasets\n(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a\ncomprehensive design framework that includes specific, effective strategies\nlike implementing soft category-aware matching and adjusting the learning rate\nschedule. These strategies are grounded in empirical evidence and theoretical\nbacking. Our resulting approach, Elucidate Dataset Condensation (EDC),\nestablishes a benchmark for both small and large-scale dataset condensation. In\nour testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on\nImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a\ncompression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM,\nand RDED by margins of 27.3%, 17.2%, and 6.6%, respectively.\n","authors":["Shitong Shao","Zikai Zhou","Huanran Chen","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.13733v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03164v1","updated":"2024-05-06T05:04:59Z","published":"2024-05-06T05:04:59Z","title":"The Role of Predictive Uncertainty and Diversity in Embodied AI and\n Robot Learning","summary":" Uncertainty has long been a critical area of study in robotics, particularly\nwhen robots are equipped with analytical models. As we move towards the\nwidespread use of deep neural networks in robots, which have demonstrated\nremarkable performance in research settings, understanding the nuances of\nuncertainty becomes crucial for their real-world deployment. This guide offers\nan overview of the importance of uncertainty and provides methods to quantify\nand evaluate it from an applications perspective.\n","authors":["Ransalu Senanayake"],"pdf_url":"https://arxiv.org/pdf/2405.03164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03162v1","updated":"2024-05-06T04:44:22Z","published":"2024-05-06T04:44:22Z","title":"Advancing Multimodal Medical Capabilities of Gemini","summary":" Many clinical tasks require an understanding of specialized data, such as\nmedical images and genomics, which is not typically found in general-purpose\nlarge multimodal models. Building upon Gemini's multimodal models, we develop\nseveral models within the new Med-Gemini family that inherit core capabilities\nof Gemini and are optimized for medical use via fine-tuning with 2D and 3D\nradiology, histopathology, ophthalmology, dermatology and genomic data.\nMed-Gemini-2D sets a new standard for AI-based chest X-ray (CXR) report\ngeneration based on expert evaluation, exceeding previous best results across\ntwo separate datasets by an absolute margin of 1% and 12%, where 57% and 96% of\nAI reports on normal cases, and 43% and 65% on abnormal cases, are evaluated as\n\"equivalent or better\" than the original radiologists' reports. We demonstrate\nthe first ever large multimodal model-based report generation for 3D computed\ntomography (CT) volumes using Med-Gemini-3D, with 53% of AI reports considered\nclinically acceptable, although additional research is needed to meet expert\nradiologist reporting quality. 
Beyond report generation, Med-Gemini-2D\nsurpasses the previous best performance in CXR visual question answering (VQA)\nand performs well in CXR classification and radiology VQA, exceeding SoTA or\nbaselines on 17 of 20 tasks. In histopathology, ophthalmology, and dermatology\nimage classification, Med-Gemini-2D surpasses baselines across 18 out of 20\ntasks and approaches task-specific model performance. Beyond imaging,\nMed-Gemini-Polygenic outperforms the standard linear polygenic risk score-based\napproach for disease risk prediction and generalizes to genetically correlated\ndiseases for which it has never been trained. Although further development and\nevaluation are necessary in the safety-critical medical domain, our results\nhighlight the potential of Med-Gemini across a wide range of medical tasks.\n","authors":["Lin Yang","Shawn Xu","Andrew Sellergren","Timo Kohlberger","Yuchen Zhou","Ira Ktena","Atilla Kiraly","Faruk Ahmed","Farhad Hormozdiari","Tiam Jaroensri","Eric Wang","Ellery Wulczyn","Fayaz Jamil","Theo Guidroz","Chuck Lau","Siyuan Qiao","Yun Liu","Akshay Goel","Kendall Park","Arnav Agharwal","Nick George","Yang Wang","Ryutaro Tanno","David G. T. Barrett","Wei-Hung Weng","S. Sara Mahdavi","Khaled Saab","Tao Tu","Sreenivasa Raju Kalidindi","Mozziyar Etemadi","Jorge Cuadros","Gregory Sorensen","Yossi Matias","Katherine Chou","Greg Corrado","Joelle Barral","Shravya Shetty","David Fleet","S. M. Ali Eslami","Daniel Tse","Shruthi Prabhakara","Cory McLean","Dave Steiner","Rory Pilgrim","Christopher Kelly","Shekoofeh Azizi","Daniel Golden"],"pdf_url":"https://arxiv.org/pdf/2405.03162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05717v2","updated":"2024-05-06T04:37:52Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. 
SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Yilin Wang","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v2.pdf","comment":"18 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.03159v1","updated":"2024-05-06T04:36:02Z","published":"2024-05-06T04:36:02Z","title":"DeepMpMRI: Tensor-decomposition Regularized Learning for Fast and\n High-Fidelity Multi-Parametric Microstructural MR Imaging","summary":" Deep learning has emerged as a promising approach for learning the nonlinear\nmapping between diffusion-weighted MR images and tissue parameters, which\nenables automatic and deep understanding of the brain microstructures. However,\nthe efficiency and accuracy in the multi-parametric estimations are still\nlimited since previous studies tend to estimate multi-parametric maps with\ndense sampling and isolated signal modeling. This paper proposes DeepMpMRI, a\nunified framework for fast and high-fidelity multi-parametric estimation from\nvarious diffusion models using sparsely sampled q-space data. DeepMpMRI is\nequipped with a newly designed tensor-decomposition-based regularizer to\neffectively capture fine details by exploiting the correlation across\nparameters. In addition, we introduce a Nesterov-based adaptive learning\nalgorithm that optimizes the regularization parameter dynamically to enhance\nthe performance. DeepMpMRI is an extendable framework capable of incorporating\nflexible network architecture. Experimental results demonstrate the superiority\nof our approach over 5 state-of-the-art methods in simultaneously estimating\nmulti-parametric maps for various diffusion models with fine-grained details\nboth quantitatively and qualitatively, achieving 4.5 - 22.5$\\times$\nacceleration compared to the dense sampling of a total of 270 diffusion\ngradients.\n","authors":["Wenxin Fan","Jian Cheng","Cheng Li","Xinrui Ma","Jing Yang","Juan Zou","Ruoyou Wu","Zan Chen","Yuanjing Feng","Hairong Zheng","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08384v2","updated":"2024-05-06T04:14:15Z","published":"2024-03-13T09:48:11Z","title":"AADNet: Attention aware Demoiréing Network","summary":" Moire pattern frequently appears in photographs captured with mobile devices\nand digital cameras, potentially degrading image quality. Despite recent\nadvancements in computer vision, image demoire'ing remains a challenging task\ndue to the dynamic textures and variations in colour, shape, and frequency of\nmoire patterns. Most existing methods struggle to generalize to unseen\ndatasets, limiting their effectiveness in removing moire patterns from\nreal-world scenarios. In this paper, we propose a novel lightweight\narchitecture, AADNet (Attention Aware Demoireing Network), for high-resolution\nimage demoire'ing that effectively works across different frequency bands and\ngeneralizes well to unseen datasets. 
Extensive experiments conducted on the\nUHDM dataset validate the effectiveness of our approach, resulting in\nhigh-fidelity images.\n","authors":["M Rakesh Reddy","Shubham Mandloi","Aman Kumar"],"pdf_url":"https://arxiv.org/pdf/2403.08384v2.pdf","comment":"Due to unauthorized access and upload, this paper has been withdrawn.\n It does not reflect the contributions or approval"},{"id":"http://arxiv.org/abs/2405.03150v1","updated":"2024-05-06T04:01:42Z","published":"2024-05-06T04:01:42Z","title":"Video Diffusion Models: A Survey","summary":" Diffusion generative models have recently become a robust technique for\nproducing and modifying coherent, high-quality video. This survey offers a\nsystematic overview of critical elements of diffusion models for video\ngeneration, covering applications, architectural choices, and the modeling of\ntemporal dynamics. Recent advancements in the field are summarized and grouped\ninto development trends. The survey concludes with an overview of remaining\nchallenges and an outlook on the future of the field. Website:\nhttps://github.com/ndrwmlnk/Awesome-Video-Diffusion-Models\n","authors":["Andrew Melnik","Michal Ljubljanac","Cong Lu","Qi Yan","Weiming Ren","Helge Ritter"],"pdf_url":"https://arxiv.org/pdf/2405.03150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04760v4","updated":"2024-05-06T03:58:51Z","published":"2023-07-10T17:58:17Z","title":"Learning Spatial Features from Audio-Visual Correspondence in Egocentric\n Videos","summary":" We propose a self-supervised method for learning representations based on\nspatial audio-visual correspondences in egocentric videos. Our method uses a\nmasked auto-encoding framework to synthesize masked binaural (multi-channel)\naudio through the synergy of audio and vision, thereby learning useful spatial\nrelationships between the two modalities. We use our pretrained features to\ntackle two downstream video tasks requiring spatial understanding in social\nscenarios: active speaker detection and spatial audio denoising. Through\nextensive experiments, we show that our features are generic enough to improve\nover multiple state-of-the-art baselines on both tasks on two challenging\negocentric video datasets that offer binaural audio, EgoCom and EasyCom.\nProject: http://vision.cs.utexas.edu/projects/ego_av_corr.\n","authors":["Sagnik Majumder","Ziad Al-Halah","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.04760v4.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2405.03144v1","updated":"2024-05-06T03:39:50Z","published":"2024-05-06T03:39:50Z","title":"PTQ4SAM: Post-Training Quantization for Segment Anything","summary":" Segment Anything Model (SAM) has achieved impressive performance in many\ncomputer vision tasks. However, as a large-scale model, the immense memory and\ncomputation costs hinder its practical deployment. In this paper, we propose a\npost-training quantization (PTQ) framework for Segment Anything Model, namely\nPTQ4SAM. First, we investigate the inherent bottleneck of SAM quantization\nattributed to the bimodal distribution in post-Key-Linear activations. We\nanalyze its characteristics from both per-tensor and per-channel perspectives,\nand propose a Bimodal Integration strategy, which utilizes a mathematically\nequivalent sign operation to transform the bimodal distribution into a\nrelatively easy-quantized normal distribution offline. 
Second, SAM encompasses\ndiverse attention mechanisms (i.e., self-attention and two-way\ncross-attention), resulting in substantial variations in the post-Softmax\ndistributions. Therefore, we introduce an Adaptive Granularity Quantization for\nSoftmax through searching the optimal power-of-two base, which is\nhardware-friendly. Extensive experimental results across various vision tasks\n(instance segmentation, semantic segmentation and object detection), datasets\nand model variants show the superiority of PTQ4SAM. For example, when\nquantizing SAM-L to 6-bit, we achieve lossless accuracy for instance\nsegmentation, about 0.5\\% drop with theoretical 3.9$\\times$ acceleration. The\ncode is available at \\url{https://github.com/chengtao-lv/PTQ4SAM}.\n","authors":["Chengtao Lv","Hong Chen","Jinyang Guo","Yifu Ding","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.03144v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2405.03141v1","updated":"2024-05-06T03:28:47Z","published":"2024-05-06T03:28:47Z","title":"Automatic Ultrasound Curve Angle Measurement via Affinity Clustering for\n Adolescent Idiopathic Scoliosis Evaluation","summary":" The current clinical gold standard for evaluating adolescent idiopathic\nscoliosis (AIS) is X-ray radiography, using Cobb angle measurement. However,\nthe frequent monitoring of the AIS progression using X-rays poses a challenge\ndue to the cumulative radiation exposure. Although 3D ultrasound has been\nvalidated as a reliable and radiation-free alternative for scoliosis\nassessment, the process of measuring spinal curvature is still carried out\nmanually. Consequently, there is a considerable demand for a fully automatic\nsystem that can locate bony landmarks and perform angle measurements. To this\nend, we introduce an estimation model for automatic ultrasound curve angle\n(UCA) measurement. The model employs a dual-branch network to detect candidate\nlandmarks and perform vertebra segmentation on ultrasound coronal images. An\naffinity clustering strategy is utilized within the vertebral segmentation area\nto illustrate the affinity relationship between candidate landmarks.\nSubsequently, we can efficiently perform line delineation from a clustered\naffinity map for UCA measurement. As our method is specifically designed for\nUCA calculation, this method outperforms other state-of-the-art methods for\nlandmark and line detection tasks. The high correlation between the automatic\nUCA and Cobb angle (R$^2$=0.858) suggests that our proposed method can\npotentially replace manual UCA measurement in ultrasound scoliosis assessment.\n","authors":["Yihao Zhou","Timothy Tin-Yan Lee","Kelly Ka-Lee Lai","Chonglin Wu","Hin Ting Lau","De Yang","Chui-Yi Chan","Winnie Chiu-Wing Chu","Jack Chun-Yiu Cheng","Tsz-Ping Lam","Yong-Ping Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.03141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12827v2","updated":"2024-05-06T02:59:21Z","published":"2022-11-23T10:20:19Z","title":"Video Instance Shadow Detection","summary":" Instance shadow detection, crucial for applications such as photo editing and\nlight direction estimation, has undergone significant advancements in\npredicting shadow instances, object instances, and their associations. The\nextension of this task to videos presents challenges in annotating diverse\nvideo data and addressing complexities arising from occlusion and temporary\ndisappearances within associations. 
In response to these challenges, we\nintroduce ViShadow, a semi-supervised video instance shadow detection framework\nthat leverages both labeled image data and unlabeled video data for training.\nViShadow features a two-stage training pipeline: the first stage, utilizing\nlabeled image data, identifies shadow and object instances through contrastive\nlearning for cross-frame pairing. The second stage employs unlabeled videos,\nincorporating an associated cycle consistency loss to enhance tracking ability.\nA retrieval mechanism is introduced to manage temporary disappearances,\nensuring tracking continuity. The SOBA-VID dataset, comprising unlabeled\ntraining videos and labeled testing videos, along with the SOAP-VID metric, is\nintroduced for the quantitative evaluation of VISD solutions. The effectiveness\nof ViShadow is further demonstrated through various video-level applications\nsuch as video inpainting, instance cloning, shadow editing, and text-instructed\nshadow-object manipulation.\n","authors":["Zhenghao Xing","Tianyu Wang","Xiaowei Hu","Haoran Wu","Chi-Wing Fu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2211.12827v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03121v1","updated":"2024-05-06T02:32:41Z","published":"2024-05-06T02:32:41Z","title":"AniTalker: Animate Vivid and Diverse Talking Faces through\n Identity-Decoupled Facial Motion Encoding","summary":" The paper introduces AniTalker, an innovative framework designed to generate\nlifelike talking faces from a single portrait. Unlike existing models that\nprimarily focus on verbal cues such as lip synchronization and fail to capture\nthe complex dynamics of facial expressions and nonverbal cues, AniTalker\nemploys a universal motion representation. This innovative representation\neffectively captures a wide range of facial dynamics, including subtle\nexpressions and head movements. AniTalker enhances motion depiction through two\nself-supervised learning strategies: the first involves reconstructing target\nvideo frames from source frames within the same identity to learn subtle motion\nrepresentations, and the second develops an identity encoder using metric\nlearning while actively minimizing mutual information between the identity and\nmotion encoders. This approach ensures that the motion representation is\ndynamic and devoid of identity-specific details, significantly reducing the\nneed for labeled data. Additionally, the integration of a diffusion model with\na variance adapter allows for the generation of diverse and controllable facial\nanimations. This method not only demonstrates AniTalker's capability to create\ndetailed and realistic facial movements but also underscores its potential in\ncrafting dynamic avatars for real-world applications. Synthetic results can be\nviewed at https://github.com/X-LANCE/AniTalker.\n","authors":["Tao Liu","Feilong Chen","Shuai Fan","Chenpeng Du","Qi Chen","Xie Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2405.03121v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.03109v1","updated":"2024-05-06T02:02:57Z","published":"2024-05-06T02:02:57Z","title":"Intra-task Mutual Attention based Vision Transformer for Few-Shot\n Learning","summary":" Humans possess remarkable ability to accurately classify new, unseen images\nafter being exposed to only a few examples. Such ability stems from their\ncapacity to identify common features shared between new and previously seen\nimages while disregarding distractions such as background variations. 
However,\nfor artificial neural network models, determining the most relevant features\nfor distinguishing between two images with limited samples presents a\nchallenge. In this paper, we propose an intra-task mutual attention method for\nfew-shot learning that involves splitting the support and query samples into\npatches and encoding them using the pre-trained Vision Transformer (ViT)\narchitecture. Specifically, we swap the class (CLS) token and patch tokens\nbetween the support and query sets to perform mutual attention, which enables\neach set to focus on the most useful information. This facilitates the\nstrengthening of intra-class representations and promotes closer proximity\nbetween instances of the same class. For implementation, we adopt the ViT-based\nnetwork architecture and utilize pre-trained model parameters obtained through\nself-supervision. By leveraging Masked Image Modeling as a self-supervised\ntraining task for pre-training, the pre-trained model yields semantically\nmeaningful representations while successfully avoiding supervision collapse. We\nthen employ a meta-learning method to fine-tune the last several layers and CLS\ntoken modules. Our strategy significantly reduces the number of parameters\nthat require fine-tuning while effectively utilizing the capability of the\npre-trained model. Extensive experiments show that our framework is simple,\neffective and computationally efficient, achieving superior performance as\ncompared to the state-of-the-art baselines on five popular few-shot\nclassification benchmarks under the 5-shot and 1-shot scenarios.\n","authors":["Weihao Jiang","Chang Liu","Kun He"],"pdf_url":"https://arxiv.org/pdf/2405.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03104v1","updated":"2024-05-06T01:40:20Z","published":"2024-05-06T01:40:20Z","title":"GeoContrastNet: Contrastive Key-Value Edge Learning for\n Language-Agnostic Document Understanding","summary":" This paper presents GeoContrastNet, a language-agnostic framework for\nstructured document understanding (DU) by integrating a contrastive learning\nobjective with graph attention networks (GATs), emphasizing the significant\nrole of geometric features. We propose a novel methodology that combines\ngeometric edge features with visual features within an overall two-staged\nGAT-based framework, demonstrating promising results in both link prediction\nand semantic entity recognition performance. Our findings reveal that combining\nboth geometric and visual features could match the capabilities of large DU\nmodels that rely heavily on Optical Character Recognition (OCR) features in\nterms of performance accuracy and efficiency. This approach underscores the\ncritical importance of relational layout information between the named text\nentities in a semi-structured layout of a page. Specifically, our results\nhighlight the model's proficiency in identifying key-value relationships within\nthe FUNSD dataset for forms and also discovering the spatial relationships in\ntable-structured layouts for RVLCDIP business invoices. 
Our code and pretrained\nmodels will be accessible on our official GitHub.\n","authors":["Nil Biescas","Carlos Boned","Josep Lladós","Sanket Biswas"],"pdf_url":"https://arxiv.org/pdf/2405.03104v1.pdf","comment":"Accepted in ICDAR 2024 (Athens, Greece)"},{"id":"http://arxiv.org/abs/2405.03103v1","updated":"2024-05-06T01:39:59Z","published":"2024-05-06T01:39:59Z","title":"Learning from Students: Applying t-Distributions to Explore Accurate and\n Efficient Formats for LLMs","summary":" Large language models (LLMs) have recently achieved state-of-the-art\nperformance across various tasks, yet due to their large computational\nrequirements, they struggle with strict latency and power demands. Deep neural\nnetwork (DNN) quantization has traditionally addressed these limitations by\nconverting models to low-precision integer formats. Yet recently alternative\nformats, such as Normal Float (NF4), have been shown to consistently increase\nmodel accuracy, albeit at the cost of increased chip area. In this work, we\nfirst conduct a large-scale analysis of LLM weights and activations across 30\nnetworks to conclude most distributions follow a Student's t-distribution. We\nthen derive a new theoretically optimal format, Student Float (SF4), with\nrespect to this distribution, that improves over NF4 across modern LLMs, for\nexample increasing the average accuracy on LLaMA2-7B by 0.76% across tasks.\nUsing this format as a high-accuracy reference, we then propose augmenting E2M1\nwith two variants of supernormal support for higher model accuracy. Finally, we\nexplore the quality and performance frontier across 11 datatypes, including\nnon-traditional formats like Additive-Powers-of-Two (APoT), by evaluating their\nmodel accuracy and hardware complexity. We discover a Pareto curve composed of\nINT4, E2M1, and E2M1 with supernormal support, which offers a continuous\ntradeoff between model accuracy and chip area. For example, E2M1 with\nsupernormal support increases the accuracy of Phi-2 by up to 2.19% with 1.22%\narea overhead, enabling more LLM-based applications to be run at four bits.\n","authors":["Jordan Dotzel","Yuzong Chen","Bahaa Kotb","Sushma Prasad","Gang Wu","Sheng Li","Mohamed S. Abdelfattah","Zhiru Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03103v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.03099v1","updated":"2024-05-06T01:24:14Z","published":"2024-05-06T01:24:14Z","title":"SketchGPT: Autoregressive Modeling for Sketch Generation and Recognition","summary":" We present SketchGPT, a flexible framework that employs a\nsequence-to-sequence autoregressive model for sketch generation, and\ncompletion, and an interpretation case study for sketch recognition. By mapping\ncomplex sketches into simplified sequences of abstract primitives, our approach\nsignificantly streamlines the input for autoregressive modeling. SketchGPT\nleverages the next token prediction objective strategy to understand sketch\npatterns, facilitating the creation and completion of drawings and also\ncategorizing them accurately. This proposed sketch representation strategy aids\nin overcoming existing challenges of autoregressive modeling for continuous\nstroke data, enabling smoother model training and competitive performance. Our\nfindings exhibit SketchGPT's capability to generate a diverse variety of\ndrawings by adding both qualitative and quantitative comparisons with existing\nstate-of-the-art, along with a comprehensive human evaluation study. 
The code\nand pretrained models will be released on our official GitHub.\n","authors":["Adarsh Tiwari","Sanket Biswas","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2405.03099v1.pdf","comment":"Accepted in ICDAR 2024"},{"id":"http://arxiv.org/abs/2310.18936v4","updated":"2024-05-06T01:16:32Z","published":"2023-10-29T08:50:27Z","title":"Adversarial Examples Are Not Real Features","summary":" The existence of adversarial examples has been a mystery for years and\nattracted much interest. A well-known theory by \\citet{ilyas2019adversarial}\nexplains adversarial vulnerability from a data perspective by showing that one\ncan extract non-robust features from adversarial examples and these features\nalone are useful for classification. However, the explanation remains quite\ncounter-intuitive since non-robust features are mostly noise features to\nhumans. In this paper, we re-examine the theory from a larger context by\nincorporating multiple learning paradigms. Notably, we find that contrary to\ntheir good usefulness under supervised learning, non-robust features attain\npoor usefulness when transferred to other self-supervised learning paradigms,\nsuch as contrastive learning, masked image modeling, and diffusion models. It\nreveals that non-robust features are not really as useful as robust or natural\nfeatures that enjoy good transferability between these paradigms. Meanwhile,\nfor robustness, we also show that naturally trained encoders from robust\nfeatures are largely non-robust under AutoAttack. Our cross-paradigm\nexamination suggests that the non-robust features are not really useful but\nmore like paradigm-wise shortcuts, and robust features alone might be\ninsufficient to attain reliable model robustness. Code is available at\n\\url{https://github.com/PKU-ML/AdvNotRealFeatures}.\n","authors":["Ang Li","Yifei Wang","Yiwen Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2310.18936v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.00253v2","updated":"2024-05-06T01:10:01Z","published":"2024-02-01T00:33:21Z","title":"A Survey on Hallucination in Large Vision-Language Models","summary":" Recent development of Large Vision-Language Models (LVLMs) has attracted\ngrowing attention within the AI landscape for its practical implementation\npotential. However, ``hallucination'', or more specifically, the misalignment\nbetween factual visual content and corresponding textual generation, poses a\nsignificant challenge of utilizing LVLMs. In this comprehensive survey, we\ndissect LVLM-related hallucinations in an attempt to establish an overview and\nfacilitate future mitigation. Our scrutiny starts with a clarification of the\nconcept of hallucinations in LVLMs, presenting a variety of hallucination\nsymptoms and highlighting the unique challenges inherent in LVLM\nhallucinations. Subsequently, we outline the benchmarks and methodologies\ntailored specifically for evaluating hallucinations unique to LVLMs.\nAdditionally, we delve into an investigation of the root causes of these\nhallucinations, encompassing insights from the training data and model\ncomponents. We also critically review existing methods for mitigating\nhallucinations. 
The open questions and future directions pertaining to\nhallucinations within LVLMs are discussed to conclude this survey.\n","authors":["Hanchao Liu","Wenyuan Xue","Yifei Chen","Dapeng Chen","Xiutian Zhao","Ke Wang","Liping Hou","Rongjun Li","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2402.00253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02023v2","updated":"2024-05-06T01:06:40Z","published":"2024-05-03T11:55:45Z","title":"IFNet: Deep Imaging and Focusing for Handheld SAR with Millimeter-wave\n Signals","summary":" Recent advancements have showcased the potential of handheld millimeter-wave\n(mmWave) imaging, which applies synthetic aperture radar (SAR) principles in\nportable settings. However, existing studies addressing handheld motion errors\neither rely on costly tracking devices or employ simplified imaging models,\nleading to impractical deployment or limited performance. In this paper, we\npresent IFNet, a novel deep unfolding network that combines the strengths of\nsignal processing models and deep neural networks to achieve robust imaging and\nfocusing for handheld mmWave systems. We first formulate the handheld imaging\nmodel by integrating multiple priors about mmWave images and handheld phase\nerrors. Furthermore, we transform the optimization processes into an iterative\nnetwork structure for improved and efficient imaging performance. Extensive\nexperiments demonstrate that IFNet effectively compensates for handheld phase\nerrors and recovers high-fidelity images from severely distorted signals. In\ncomparison with existing methods, IFNet can achieve at least 11.89 dB\nimprovement in average peak signal-to-noise ratio (PSNR) and 64.91% improvement\nin average structural similarity index measure (SSIM) on a real-world dataset.\n","authors":["Yadong Li","Dongheng Zhang","Ruixu Geng","Jincheng Wu","Yang Hu","Qibin Sun","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2405.02023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01699v2","updated":"2024-05-06T01:06:33Z","published":"2024-05-02T19:47:08Z","title":"SOAR: Advancements in Small Body Object Detection for Aerial Imagery\n Using State Space Models and Programmable Gradients","summary":" Small object detection in aerial imagery presents significant challenges in\ncomputer vision due to the minimal data inherent in small-sized objects and\ntheir propensity to be obscured by larger objects and background noise.\nTraditional methods using transformer-based models often face limitations\nstemming from the lack of specialized databases, which adversely affect their\nperformance with objects of varying orientations and scales. This underscores\nthe need for more adaptable, lightweight models. In response, this paper\nintroduces two innovative approaches that significantly enhance detection and\nsegmentation capabilities for small aerial objects. Firstly, we explore the use\nof the SAHI framework on the newly introduced lightweight YOLO v9 architecture,\nwhich utilizes Programmable Gradient Information (PGI) to reduce the\nsubstantial information loss typically encountered in sequential feature\nextraction processes. The paper employs the Vision Mamba model, which\nincorporates position embeddings to facilitate precise location-aware visual\nunderstanding, combined with a novel bidirectional State Space Model (SSM) for\neffective visual context modeling. 
This State Space Model adeptly harnesses the\nlinear complexity of CNNs and the global receptive field of Transformers,\nmaking it particularly effective in remote sensing image classification. Our\nexperimental results demonstrate substantial improvements in detection accuracy\nand processing efficiency, validating the applicability of these approaches for\nreal-time small object detection across diverse aerial scenarios. This paper\nalso discusses how these methodologies could serve as foundational models for\nfuture advancements in aerial object recognition technologies. The source code\nwill be made accessible here.\n","authors":["Tushar Verma","Jyotsna Singh","Yash Bhartari","Rishi Jarwal","Suraj Singh","Shubhkarman Singh"],"pdf_url":"https://arxiv.org/pdf/2405.01699v2.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.03091v1","updated":"2024-05-06T01:05:21Z","published":"2024-05-06T01:05:21Z","title":"Research on Image Recognition Technology Based on Multimodal Deep\n Learning","summary":" This project investigates the human multi-modal behavior identification\nalgorithm utilizing deep neural networks. According to the characteristics of\ndifferent modal information, different deep neural networks are used to adapt\nto different modal video information. Through the integration of various deep\nneural networks, the algorithm successfully identifies behaviors across\nmultiple modalities. In this project, multiple cameras developed by Microsoft\nKinect were used to collect corresponding bone point data based on acquiring\nconventional images. In this way, the motion features in the image can be\nextracted. Ultimately, the behavioral characteristics discerned through both\napproaches are synthesized to facilitate the precise identification and\ncategorization of behaviors. The performance of the suggested algorithm was\nevaluated using the MSR3D data set. The findings from these experiments\nindicate that the accuracy in recognizing behaviors remains consistently high,\nsuggesting that the algorithm is reliable in various scenarios. Additionally,\nthe tests demonstrate that the algorithm substantially enhances the accuracy of\ndetecting pedestrian behaviors in video footage.\n","authors":["Jinyin Wang","Xingchen Li","Yixuan Jin","Yihao Zhong","Keke Zhang","Chang Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.03091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03905v1","updated":"2024-05-06T23:41:02Z","published":"2024-05-06T23:41:02Z","title":"A 65nm 36nJ/Decision Bio-inspired Temporal-Sparsity-Aware Digital\n Keyword Spotting IC with 0.6V Near-Threshold SRAM","summary":" This paper introduces, to the best of the authors' knowledge, the first\nfine-grained temporal sparsity-aware keyword spotting (KWS) IC leveraging\ntemporal similarities between neighboring feature vectors extracted from input\nframes and network hidden states, eliminating unnecessary operations and memory\naccesses. This KWS IC, featuring a bio-inspired delta-gated recurrent neural\nnetwork ({\\Delta}RNN) classifier, achieves an 11-class Google Speech Command\nDataset (GSCD) KWS accuracy of 90.5% and energy consumption of 36nJ/decision.\nAt 87% temporal sparsity, computing latency and energy per inference are\nreduced by 2.4$\\times$/3.4$\\times$, respectively. 
The 65nm design occupies\n0.78mm$^2$ and features two additional blocks, a compact 0.084mm$^2$ digital\ninfinite-impulse-response (IIR)-based band-pass filter (BPF) audio feature\nextractor (FEx) and a 24kB 0.6V near-Vth weight SRAM with 6.6$\\times$ lower\nread power compared to the standard SRAM.\n","authors":["Qinyu Chen","Kwantae Kim","Chang Gao","Sheng Zhou","Taekwang Jang","Tobi Delbruck","Shih-Chii Liu"],"pdf_url":"https://arxiv.org/pdf/2405.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03894v1","updated":"2024-05-06T22:55:53Z","published":"2024-05-06T22:55:53Z","title":"MVDiff: Scalable and Flexible Multi-View Diffusion for 3D Object\n Reconstruction from Single-View","summary":" Generating consistent multiple views for 3D reconstruction tasks is still a\nchallenge to existing image-to-3D diffusion models. Generally, incorporating 3D\nrepresentations into the diffusion model decreases the model's speed as well as\ngeneralizability and quality. This paper proposes a general framework to\ngenerate consistent multi-view images from a single image by leveraging a scene\nrepresentation transformer and a view-conditioned diffusion model. In the model,\nwe introduce epipolar geometry constraints and multi-view attention to enforce\n3D consistency. From as few as one image input, our model is able to generate\n3D meshes surpassing baseline methods in evaluation metrics, including PSNR,\nSSIM and LPIPS.\n","authors":["Emmanuelle Bourigault","Pauline Bourigault"],"pdf_url":"https://arxiv.org/pdf/2405.03894v1.pdf","comment":"CVPRW: Generative Models for Computer Vision"},{"id":"http://arxiv.org/abs/2403.01606v2","updated":"2024-05-06T22:19:22Z","published":"2024-03-03T20:16:14Z","title":"A Unified Model Selection Technique for Spectral Clustering Based Motion\n Segmentation","summary":" Motion segmentation is a fundamental problem in computer vision and is\ncrucial in various applications such as robotics, autonomous driving and action\nrecognition. Recently, spectral clustering based methods have shown impressive\nresults on motion segmentation in dynamic environments. These methods perform\nspectral clustering on motion affinity matrices to cluster objects or point\ntrajectories in the scene into different motion groups. However, existing\nmethods often need the number of motions present in the scene to be known,\nwhich significantly reduces their practicality. In this paper, we propose a\nunified model selection technique to automatically infer the number of motion\ngroups for spectral clustering based motion segmentation methods by combining\ndifferent existing model selection techniques together. We evaluate our method\non the KT3DMoSeg dataset and achieve competitive results compared to the\nbaseline where the number of clusters is given as ground truth information.\n","authors":["Yuxiang Huang","John Zelek"],"pdf_url":"https://arxiv.org/pdf/2403.01606v2.pdf","comment":"for the published version, see\n https://openjournals.uwaterloo.ca/index.php/vsl/article/view/5870/5922"},{"id":"http://arxiv.org/abs/2405.03884v1","updated":"2024-05-06T22:02:38Z","published":"2024-05-06T22:02:38Z","title":"BadFusion: 2D-Oriented Backdoor Attacks against 3D Object Detection","summary":" 3D object detection plays an important role in autonomous driving; however,\nits vulnerability to backdoor attacks has become evident. By injecting\n''triggers'' to poison the training dataset, backdoor attacks manipulate the\ndetector's prediction for inputs containing these triggers. 
Existing backdoor\nattacks against 3D object detection primarily poison 3D LiDAR signals, where\nlarge-sized 3D triggers are injected to ensure their visibility within the\nsparse 3D space, rendering them easy to detect and impractical in real-world\nscenarios.\n In this paper, we delve into the robustness of 3D object detection, exploring\na new backdoor attack surface through 2D cameras. Given the prevalent adoption\nof camera and LiDAR signal fusion for high-fidelity 3D perception, we\ninvestigate the latent potential of camera signals to disrupt the process.\nAlthough the dense nature of camera signals enables the use of nearly\nimperceptible small-sized triggers to mislead 2D object detection, realizing\n2D-oriented backdoor attacks against 3D object detection is non-trivial. The\nprimary challenge emerges from the fusion process that transforms camera\nsignals into a 3D space, compromising the association with the 2D trigger to\nthe target output. To tackle this issue, we propose an innovative 2D-oriented\nbackdoor attack against LiDAR-camera fusion methods for 3D object detection,\nnamed BadFusion, for preserving trigger effectiveness throughout the entire\nfusion process. The evaluation demonstrates the effectiveness of BadFusion,\nachieving a significantly higher attack success rate compared to existing\n2D-oriented attacks.\n","authors":["Saket S. Chaturvedi","Lan Zhang","Wenbin Zhang","Pan He","Xiaoyong Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.03884v1.pdf","comment":"Accepted at IJCAI 2024 Conference"},{"id":"http://arxiv.org/abs/2405.03882v1","updated":"2024-05-06T21:57:35Z","published":"2024-05-06T21:57:35Z","title":"Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free\n Efficient Vision Transformer","summary":" Motivated by the huge success of Transformers in the field of natural\nlanguage processing (NLP), Vision Transformers (ViTs) have been rapidly\ndeveloped and achieved remarkable performance in various computer vision tasks.\nHowever, their huge model sizes and intensive computations hinder ViTs'\ndeployment on embedded devices, calling for effective model compression\nmethods, such as quantization. Unfortunately, due to the existence of\nhardware-unfriendly and quantization-sensitive non-linear operations,\nparticularly {Softmax}, it is non-trivial to completely quantize all operations\nin ViTs, yielding either significant accuracy drops or non-negligible hardware\ncosts. In response to challenges associated with \\textit{standard ViTs}, we\nfocus our attention towards the quantization and acceleration for\n\\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but\nalso integrate linear attention with low computational complexity, and propose\n\\emph{Trio-ViT} accordingly. Specifically, at the algorithm level, we develop a\n{tailored post-training quantization engine} taking the unique activation\ndistributions of Softmax-free efficient ViTs into full consideration, aiming to\nboost quantization accuracy. Furthermore, at the hardware level, we build an\naccelerator dedicated to the specific Convolution-Transformer hybrid\narchitecture of efficient ViTs, thereby enhancing hardware efficiency.\nExtensive experimental results consistently prove the effectiveness of our\nTrio-ViT framework. 
{Particularly, we can gain up to\n$\\uparrow$$\\mathbf{7.2}\\times$ and $\\uparrow$$\\mathbf{14.6}\\times$ FPS under\ncomparable accuracy over state-of-the-art ViT accelerators, as well as\n$\\uparrow$$\\mathbf{5.9}\\times$ and $\\uparrow$$\\mathbf{2.0}\\times$ DSP\nefficiency.} Codes will be released publicly upon acceptance.\n","authors":["Huihong Shi","Haikuo Shao","Wendong Mao","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.03882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04057v2","updated":"2024-05-06T21:12:34Z","published":"2024-04-05T12:30:19Z","title":"Score identity Distillation: Exponentially Fast Distillation of\n Pretrained Diffusion Models for One-Step Generation","summary":" We introduce Score identity Distillation (SiD), an innovative data-free\nmethod that distills the generative capabilities of pretrained diffusion models\ninto a single-step generator. SiD not only facilitates an exponentially fast\nreduction in Fr\\'echet inception distance (FID) during distillation but also\napproaches or even exceeds the FID performance of the original teacher\ndiffusion models. By reformulating forward diffusion processes as semi-implicit\ndistributions, we leverage three score-related identities to create an\ninnovative loss mechanism. This mechanism achieves rapid FID reduction by\ntraining the generator using its own synthesized images, eliminating the need\nfor real data or reverse-diffusion-based generation, all accomplished within\nsignificantly shortened generation time. Upon evaluation across four benchmark\ndatasets, the SiD algorithm demonstrates high iteration efficiency during\ndistillation and surpasses competing distillation approaches, whether they are\none-step or few-step, data-free, or dependent on training data, in terms of\ngeneration quality. This achievement not only redefines the benchmarks for\nefficiency and effectiveness in diffusion distillation but also in the broader\nfield of diffusion-based generation. The PyTorch implementation is available at\nhttps://github.com/mingyuanzhou/SiD\n","authors":["Mingyuan Zhou","Huangjie Zheng","Zhendong Wang","Mingzhang Yin","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04057v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.03852v1","updated":"2024-05-06T20:59:45Z","published":"2024-05-06T20:59:45Z","title":"VSA4VQA: Scaling a Vector Symbolic Architecture to Visual Question\n Answering on Natural Images","summary":" While Vector Symbolic Architectures (VSAs) are promising for modelling\nspatial cognition, their application is currently limited to artificially\ngenerated images and simple spatial queries. We propose VSA4VQA - a novel 4D\nimplementation of VSAs that implements a mental representation of natural\nimages for the challenging task of Visual Question Answering (VQA). VSA4VQA is\nthe first model to scale a VSA to complex spatial queries. Our method is based\non the Semantic Pointer Architecture (SPA) to encode objects in a\nhyperdimensional vector space. To encode natural images, we extend the SPA to\ninclude dimensions for object's width and height in addition to their spatial\nlocation. To perform spatial queries we further introduce learned spatial query\nmasks and integrate a pre-trained vision-language model for answering\nattribute-related questions. 
We evaluate our method on the GQA benchmark\ndataset and show that it can effectively encode natural images, achieving\ncompetitive performance to state-of-the-art deep learning methods for zero-shot\nVQA.\n","authors":["Anna Penzkofer","Lei Shi","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2405.03852v1.pdf","comment":"To be published in the Proceedings of the Annual Meeting of the\n Cognitive Science Society (CogSci'24)"},{"id":"http://arxiv.org/abs/2405.03846v1","updated":"2024-05-06T20:51:28Z","published":"2024-05-06T20:51:28Z","title":"Enhancing Apparent Personality Trait Analysis with Cross-Modal\n Embeddings","summary":" Automatic personality trait assessment is essential for high-quality\nhuman-machine interactions. Systems capable of human behavior analysis could be\nused for self-driving cars, medical research, and surveillance, among many\nothers. We present a multimodal deep neural network with a Siamese extension\nfor apparent personality trait prediction trained on short video recordings and\nexploiting modality invariant embeddings. Acoustic, visual, and textual\ninformation are utilized to reach high-performance solutions in this task. Due\nto the highly centralized target distribution of the analyzed dataset, the\nchanges in the third digit are relevant. Our proposed method addresses the\nchallenge of under-represented extreme values, achieves 0.0033 MAE average\nimprovement, and shows a clear advantage over the baseline multimodal DNN\nwithout the introduced module.\n","authors":["Ádám Fodor","Rachid R. Saboundji","András Lőrincz"],"pdf_url":"https://arxiv.org/pdf/2405.03846v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.04558v2","updated":"2024-05-06T20:36:56Z","published":"2023-10-06T19:47:20Z","title":"VTON-IT: Virtual Try-On using Image Translation","summary":" Virtual Try-On (trying clothes virtually) is a promising application of the\nGenerative Adversarial Network (GAN). However, it is an arduous task to\ntransfer the desired clothing item onto the corresponding regions of a human\nbody because of varying body size, pose, and occlusions like hair and\noverlapped clothes. In this paper, we try to produce photo-realistic translated\nimages through semantic segmentation and a generative adversarial\narchitecture-based image translation network. We present a novel image-based\nVirtual Try-On application VTON-IT that takes an RGB image, segments desired\nbody part, and overlays target cloth over the segmented body region. Most\nstate-of-the-art GAN-based Virtual Try-On applications produce unaligned\npixelated synthesis images on real-life test images. However, our approach\ngenerates high-resolution natural images with detailed textures on such variant\nimages.\n","authors":["Santosh Adhikari","Bishnu Bhusal","Prashant Ghimire","Anil Shrestha"],"pdf_url":"https://arxiv.org/pdf/2310.04558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03827v1","updated":"2024-05-06T20:17:10Z","published":"2024-05-06T20:17:10Z","title":"Direct learning of home vector direction for insect-inspired robot\n navigation","summary":" Insects have long been recognized for their ability to navigate and return\nhome using visual cues from their nest's environment. However, the precise\nmechanism underlying this remarkable homing skill remains a subject of ongoing\ninvestigation. 
Drawing inspiration from the learning flights of honey bees and\nwasps, we propose a robot navigation method that directly learns the home\nvector direction from visual percepts during a learning flight in the vicinity\nof the nest. After learning, the robot will travel away from the nest, come\nback by means of odometry, and eliminate the resultant drift by inferring the\nhome vector orientation from the currently experienced view. Using a compact\nconvolutional neural network, we demonstrate successful learning in both\nsimulated and real forest environments, as well as successful homing control of\na simulated quadrotor. The average errors of the inferred home vectors in\ngeneral stay well below the 90{\\deg} required for successful homing, and below\n24{\\deg} if all images contain sufficient texture and illumination. Moreover,\nwe show that the trajectory followed during the initial learning flight has a\npronounced impact on the network's performance. A higher density of sample\npoints in proximity to the nest results in a more consistent return. Code and\ndata are available at https://mavlab.tudelft.nl/learning_to_home .\n","authors":["Michiel Firlefyn","Jesse Hagenaars","Guido de Croon"],"pdf_url":"https://arxiv.org/pdf/2405.03827v1.pdf","comment":"Published at ICRA 2024, project webpage at\n https://mavlab.tudelft.nl/learning_to_home"},{"id":"http://arxiv.org/abs/2311.15963v2","updated":"2024-05-06T19:41:50Z","published":"2023-11-27T16:07:34Z","title":"From Pixels to Titles: Video Game Identification by Screenshots using\n Convolutional Neural Networks","summary":" This paper investigates video game identification through single screenshots,\nutilizing five convolutional neural network (CNN) architectures (MobileNet,\nDenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home\nconsole systems, spanning from Atari 2600 to PlayStation 5, totalling 8,796\ngames and 170,881 screenshots. Confirming the hypothesis, CNNs autonomously\nextract image features, enabling the identification of game titles from\nscreenshots without additional features. Using ImageNet pre-trained weights as\ninitial weights, EfficientNetB3 achieves the highest average accuracy (74.51%),\nwhile DenseNet169 excels in 14 of the 22 systems. Employing alternative initial\nweights trained in an arcade screenshots dataset boosts accuracy for\nEfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy of\n76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on\naverage. Overall, the combination of optimal architecture and weights attains\n77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings\nunderscore the efficacy of CNNs in video game identification through\nscreenshots.\n","authors":["Fabricio Breve"],"pdf_url":"https://arxiv.org/pdf/2311.15963v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03803v1","updated":"2024-05-06T19:19:20Z","published":"2024-05-06T19:19:20Z","title":"MoDiPO: text-to-motion alignment via AI-feedback-driven Direct\n Preference Optimization","summary":" Diffusion Models have revolutionized the field of human motion generation by\noffering exceptional generation quality and fine-grained controllability\nthrough natural language conditioning. Their inherent stochasticity, that is\nthe ability to generate various outputs from a single input, is key to their\nsuccess. However, this diversity should not be unrestricted, as it may lead to\nunlikely generations. 
Instead, it should be confined within the boundaries of\ntext-aligned and realistic generations. To address this issue, we propose\nMoDiPO (Motion Diffusion DPO), a novel methodology that leverages Direct\nPreference Optimization (DPO) to align text-to-motion models. We streamline the\nlaborious and expensive process of gathering human preferences needed in DPO by\nleveraging AI feedback instead. This enables us to experiment with novel DPO\nstrategies, using both online and offline generated motion-preference pairs. To\nfoster future research we contribute with a motion-preference dataset which we\ndub Pick-a-Move. We demonstrate, both qualitatively and quantitatively, that\nour proposed method yields significantly more realistic motions. In particular,\nMoDiPO substantially improves Frechet Inception Distance (FID) while retaining\nthe same RPrecision and Multi-Modality performances.\n","authors":["Massimiliano Pappa","Luca Collorone","Giovanni Ficarra","Indro Spinelli","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2405.03803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16262v4","updated":"2024-05-06T18:40:43Z","published":"2023-07-30T16:08:45Z","title":"Validating polyp and instrument segmentation methods in colonoscopy\n through Medico 2020 and MedAI 2021 Challenges","summary":" Automatic analysis of colonoscopy images has been an active field of research\nmotivated by the importance of early detection of precancerous polyps. However,\ndetecting polyps during the live examination can be challenging due to various\nfactors such as variation of skills and experience among the endoscopists, lack\nof attentiveness, and fatigue leading to a high polyp miss-rate. Deep learning\nhas emerged as a promising solution to this challenge as it can assist\nendoscopists in detecting and classifying overlooked polyps and abnormalities\nin real time. In addition to the algorithm's accuracy, transparency and\ninterpretability are crucial to explaining the whys and hows of the algorithm's\nprediction. Further, most algorithms are developed in private data, closed\nsource, or proprietary software, and methods lack reproducibility. Therefore,\nto promote the development of efficient and transparent methods, we have\norganized the \"Medico automatic polyp segmentation (Medico 2020)\" and \"MedAI:\nTransparency in Medical Image Segmentation (MedAI 2021)\" competitions. We\npresent a comprehensive summary and analyze each contribution, highlight the\nstrength of the best-performing methods, and discuss the possibility of\nclinical translations of such methods into the clinic. For the transparency\ntask, a multi-disciplinary team, including expert gastroenterologists, accessed\neach submission and evaluated the team based on open-source practices, failure\ncase analysis, ablation studies, usability and understandability of evaluations\nto gain a deeper understanding of the models' credibility for clinical\ndeployment. Through the comprehensive analysis of the challenge, we not only\nhighlight the advancements in polyp and surgical instrument segmentation but\nalso encourage qualitative evaluation for building more transparent and\nunderstandable AI-based colonoscopy systems.\n","authors":["Debesh Jha","Vanshali Sharma","Debapriya Banik","Debayan Bhattacharya","Kaushiki Roy","Steven A. Hicks","Nikhil Kumar Tomar","Vajira Thambawita","Adrian Krenzer","Ge-Peng Ji","Sahadev Poudel","George Batchkala","Saruar Alam","Awadelrahman M. A. 
Ahmed","Quoc-Huy Trinh","Zeshan Khan","Tien-Phat Nguyen","Shruti Shrestha","Sabari Nathan","Jeonghwan Gwak","Ritika K. Jha","Zheyuan Zhang","Alexander Schlaefer","Debotosh Bhattacharjee","M. K. Bhuyan","Pradip K. Das","Deng-Ping Fan","Sravanthi Parsa","Sharib Ali","Michael A. Riegler","Pål Halvorsen","Thomas De Lange","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2307.16262v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03967v2","updated":"2024-05-06T18:39:58Z","published":"2023-10-06T01:53:27Z","title":"Sub-token ViT Embedding via Stochastic Resonance Transformers","summary":" Vision Transformer (ViT) architectures represent images as collections of\nhigh-dimensional vectorized tokens, each corresponding to a rectangular\nnon-overlapping patch. This representation trades spatial granularity for\nembedding dimensionality, and results in semantically rich but spatially\ncoarsely quantized feature maps. In order to retrieve spatial details\nbeneficial to fine-grained inference tasks we propose a training-free method\ninspired by \"stochastic resonance\". Specifically, we perform sub-token spatial\ntransformations to the input data, and aggregate the resulting ViT features\nafter applying the inverse transformation. The resulting \"Stochastic Resonance\nTransformer\" (SRT) retains the rich semantic information of the original\nrepresentation, but grounds it on a finer-scale spatial domain, partly\nmitigating the coarse effect of spatial tokenization. SRT is applicable across\nany layer of any ViT architecture, consistently boosting performance on several\ntasks including segmentation, classification, depth estimation, and others by\nup to 14.9% without the need for any fine-tuning.\n","authors":["Dong Lao","Yangchao Wu","Tian Yu Liu","Alex Wong","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2310.03967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03770v1","updated":"2024-05-06T18:09:48Z","published":"2024-05-06T18:09:48Z","title":"Foundation Models for Video Understanding: A Survey","summary":" Video Foundation Models (ViFMs) aim to learn a general-purpose representation\nfor various video understanding tasks. Leveraging large-scale datasets and\npowerful models, ViFMs achieve this by capturing robust and generic features\nfrom video data. This survey analyzes over 200 video foundational models,\noffering a comprehensive overview of benchmarks and evaluation metrics across\n14 distinct video tasks categorized into 3 main categories. Additionally, we\noffer an in-depth performance analysis of these models for the 6 most common\nvideo tasks. We categorize ViFMs into three categories: 1) Image-based ViFMs,\nwhich adapt existing image models for video tasks, 2) Video-Based ViFMs, which\nutilize video-specific encoding methods, and 3) Universal Foundational Models\n(UFMs), which combine multiple modalities (image, video, audio, and text etc.)\nwithin a single framework. By comparing the performance of various ViFMs on\ndifferent tasks, this survey offers valuable insights into their strengths and\nweaknesses, guiding future advancements in video understanding. Our analysis\nsurprisingly reveals that image-based foundation models consistently outperform\nvideo-based models on most video understanding tasks. 
Additionally, UFMs, which\nleverage diverse modalities, demonstrate superior performance on video tasks.\nWe share the comprehensive list of ViFMs studied in this work at:\n\\url{https://github.com/NeeluMadan/ViFM_Survey.git}\n","authors":["Neelu Madan","Andreas Moegelmose","Rajat Modi","Yogesh S. Rawat","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2405.03770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03762v1","updated":"2024-05-06T18:01:13Z","published":"2024-05-06T18:01:13Z","title":"Deep learning classifier of locally advanced rectal cancer treatment\n response from endoscopy images","summary":" We developed a deep learning classifier of rectal cancer response (tumor vs.\nno-tumor) to total neoadjuvant treatment (TNT) from endoscopic images acquired\nbefore, during, and following TNT. We further evaluated the network's ability\nin a near out-of-distribution (OOD) problem to identify local regrowth (LR)\nfrom follow-up endoscopy images acquired several months to years after\ncompleting TNT. We addressed endoscopic image variability by using optimal mass\ntransport-based image harmonization. We evaluated multiple training\nregularization schemes to study the ResNet-50 network's in-distribution and\nnear-OOD generalization ability. Test time augmentation resulted in the most\nconsiderable accuracy improvement. Image harmonization resulted in slight\naccuracy improvement for the near-OOD cases. Our results suggest that\noff-the-shelf deep learning classifiers can detect rectal cancer from\nendoscopic images at various stages of therapy for surveillance.\n","authors":["Jorge Tapias Gomez","Aneesh Rangnekar","Hannah Williams","Hannah Thompson","Julio Garcia-Aguilar","Joshua Jesse Smith","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2405.03762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03732v1","updated":"2024-05-06T10:53:13Z","published":"2024-05-06T10:53:13Z","title":"Accelerated MR Cholangiopancreatography with Deep Learning-based\n Reconstruction","summary":" This study accelerates MR cholangiopancreatography (MRCP) acquisitions using\ndeep learning-based (DL) reconstruction at 3T and 0.55T. Thirty healthy\nvolunteers underwent conventional two-fold MRCP scans at field strengths of 3T\nor 0.55T. We trained a variational network (VN) using retrospectively six-fold\nundersampled data obtained at 3T. We then evaluated our method against standard\ntechniques such as parallel imaging (PI) and compressed sensing (CS), focusing\non peak signal-to-noise ratio (PSNR) and structural similarity (SSIM) as\nmetrics. Furthermore, considering acquiring fully-sampled MRCP is impractical,\nwe added a self-supervised DL reconstruction (SSDU) to the evaluating group. We\nalso tested our method in a prospective accelerated scenario to reflect\nreal-world clinical applications and evaluated its adaptability to MRCP at\n0.55T. Our method demonstrated a remarkable reduction of average acquisition\ntime from 599/542 to 255/180 seconds for MRCP at 3T/0.55T. In both\nretrospective and prospective undersampling scenarios, the PSNR and SSIM of VN\nwere higher than those of PI, CS, and SSDU. At the same time, VN preserved the\nimage quality of undersampled data, i.e., sharpness and the visibility of\nhepatobiliary ducts. In addition, VN also produced high quality reconstructions\nat 0.55T resulting in the highest PSNR and SSIM. 
In summary, VN trained for\nhighly accelerated MRCP allows the acquisition time to be reduced by a factor of\n2.4/3.0 at 3T/0.55T while maintaining the image quality of the conventional\nacquisition.\n","authors":["Jinho Kim","Marcel Dominik Nickel","Florian Knoll"],"pdf_url":"https://arxiv.org/pdf/2405.03732v1.pdf","comment":"20 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.03730v1","updated":"2024-05-06T09:47:29Z","published":"2024-05-06T09:47:29Z","title":"Tilt your Head: Activating the Hidden Spatial-Invariance of Classifiers","summary":" Deep neural networks are applied in more and more areas of everyday life.\nHowever, they still lack essential abilities, such as robustly dealing with\nspatially transformed input signals. Approaches to mitigate this severe\nrobustness issue are limited to two pathways: Either models are implicitly\nregularised by increased sample variability (data augmentation) or explicitly\nconstrained by hard-coded inductive biases. The limiting factor of the former\nis the size of the data space, which renders sufficient sample coverage\nintractable. The latter is limited by the engineering effort required to\ndevelop such inductive biases for every possible scenario. Instead, we take\ninspiration from human behaviour, where percepts are modified by mental or\nphysical actions during inference. We propose a novel technique to emulate such\nan inference process for neural nets. This is achieved by traversing a\nsparsified inverse transformation tree during inference using parallel\nenergy-based evaluations. Our proposed inference algorithm, called Inverse\nTransformation Search (ITS), is model-agnostic and equips the model with\nzero-shot pseudo-invariance to spatially transformed inputs. We evaluated our\nmethod on several benchmark datasets, including a synthesised ImageNet test\nset. ITS outperforms the utilised baselines on all zero-shot test scenarios.\n","authors":["Johann Schmidt","Sebastian Stober"],"pdf_url":"https://arxiv.org/pdf/2405.03730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.06009v4","updated":"2024-05-06T09:27:49Z","published":"2021-08-13T01:07:51Z","title":"SAR image matching algorithm based on multi-class features","summary":" Synthetic aperture radar has the ability to work around the clock and in all\nweather conditions, and has high application value. We propose a new SAR image\nmatching algorithm based on multi-class features, mainly using two different\ntypes of features, straight lines and regions, to enhance the robustness of the\nmatching algorithm. On the basis of prior knowledge of the images, combined\nwith LSD (Line Segment Detector) line detection and a template matching\nalgorithm, and by analyzing the attribute correlation between line and surface\nfeatures in SAR images, line and region features are selected to match the\nimages; as a result, the matching accuracy between SAR images and visible light\nimages is improved, and the probability of matching errors is reduced. 
The experimental results have verified that this\nalgorithm can obtain high-precision matching results, achieve precise target\npositioning, and has good robustness to changes in perspective and lighting.\nThe results are accurate and false positives are controllable.\n","authors":["Mazhi Qiang","Fengming Zhou"],"pdf_url":"https://arxiv.org/pdf/2108.06009v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03722v1","updated":"2024-05-06T02:13:32Z","published":"2024-05-06T02:13:32Z","title":"Class-relevant Patch Embedding Selection for Few-Shot Image\n Classification","summary":" Effective image classification hinges on discerning relevant features from\nboth foreground and background elements, with the foreground typically holding\nthe critical information. While humans adeptly classify images with limited\nexposure, artificial neural networks often struggle with feature selection from\nrare samples. To address this challenge, we propose a novel method for\nselecting class-relevant patch embeddings. Our approach involves splitting\nsupport and query images into patches, encoding them using a pre-trained Vision\nTransformer (ViT) to obtain class embeddings and patch embeddings,\nrespectively. Subsequently, we filter patch embeddings using class embeddings\nto retain only the class-relevant ones. For each image, we calculate the\nsimilarity between class embedding and each patch embedding, sort the\nsimilarity sequence in descending order, and only retain top-ranked patch\nembeddings. By prioritizing similarity between the class embedding and patch\nembeddings, we select top-ranked patch embeddings to be fused with class\nembedding to form a comprehensive image representation, enhancing pattern\nrecognition across instances. Our strategy effectively mitigates the impact of\nclass-irrelevant patch embeddings, yielding improved performance in pre-trained\nmodels. Extensive experiments on popular few-shot classification benchmarks\ndemonstrate the simplicity, efficacy, and computational efficiency of our\napproach, outperforming state-of-the-art baselines under both 5-shot and 1-shot\nscenarios.\n","authors":["Weihao Jiang","Haoyang Cui","Kun He"],"pdf_url":"https://arxiv.org/pdf/2405.03722v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.03109"}]},"2024-05-05T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.03055v1","updated":"2024-05-05T21:29:20Z","published":"2024-05-05T21:29:20Z","title":"Multi-hop graph transformer network for 3D human pose estimation","summary":" Accurate 3D human pose estimation is a challenging task due to occlusion and\ndepth ambiguity. In this paper, we introduce a multi-hop graph transformer\nnetwork designed for 2D-to-3D human pose estimation in videos by leveraging the\nstrengths of multi-head self-attention and multi-hop graph convolutional\nnetworks with disentangled neighborhoods to capture spatio-temporal\ndependencies and handle long-range interactions. The proposed network\narchitecture consists of a graph attention block composed of stacked layers of\nmulti-head self-attention and graph convolution with learnable adjacency\nmatrix, and a multi-hop graph convolutional block comprised of multi-hop\nconvolutional and dilated convolutional layers. 
The combination of multi-head\nself-attention and multi-hop graph convolutional layers enables the model to\ncapture both local and global dependencies, while the integration of dilated\nconvolutional layers enhances the model's ability to handle spatial details\nrequired for accurate localization of the human body joints. Extensive\nexperiments demonstrate the effectiveness and generalization ability of our\nmodel, achieving competitive performance on benchmark datasets.\n","authors":["Zaedul Islam","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2405.03055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07270v2","updated":"2024-05-05T20:34:28Z","published":"2024-02-11T18:26:18Z","title":"Open-ended VQA benchmarking of Vision-Language models by exploiting\n Classification datasets and their semantic hierarchy","summary":" The evaluation of text-generative vision-language models is a challenging yet\ncrucial endeavor. By addressing the limitations of existing Visual Question\nAnswering (VQA) benchmarks and proposing innovative evaluation methodologies,\nour research seeks to advance our understanding of these models' capabilities.\nWe propose a novel VQA benchmark based on well-known visual classification\ndatasets which allows a granular evaluation of text-generative vision-language\nmodels and their comparison with discriminative vision-language models. To\nimprove the assessment of coarse answers on fine-grained classification tasks,\nwe suggest using the semantic hierarchy of the label space to ask automatically\ngenerated follow-up questions about the ground-truth category. Finally, we\ncompare traditional NLP and LLM-based metrics for the problem of evaluating\nmodel predictions given ground-truth answers. We perform a human evaluation\nstudy upon which we base our decision on the final metric. We apply our\nbenchmark to a suite of vision-language models and show a detailed comparison\nof their abilities on object, action, and attribute classification. Our\ncontributions aim to lay the foundation for more precise and meaningful\nassessments, facilitating targeted progress in the exciting field of\nvision-language modeling.\n","authors":["Simon Ging","María A. Bravo","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2402.07270v2.pdf","comment":"Accepted as Spotlight Paper for ICLR 2024. The first two authors\n contributed equally to this work"},{"id":"http://arxiv.org/abs/2405.03039v1","updated":"2024-05-05T20:00:22Z","published":"2024-05-05T20:00:22Z","title":"Performance Evaluation of Real-Time Object Detection for Electric\n Scooters","summary":" Electric scooters (e-scooters) have rapidly emerged as a popular mode of\ntransportation in urban areas, yet they pose significant safety challenges. In\nthe United States, the rise of e-scooters has been marked by a concerning\nincrease in related injuries and fatalities. Recently, while deep-learning\nobject detection holds paramount significance in autonomous vehicles to avoid\npotential collisions, its application in the context of e-scooters remains\nrelatively unexplored. This paper addresses this gap by assessing the\neffectiveness and efficiency of cutting-edge object detectors designed for\ne-scooters. To achieve this, the first comprehensive benchmark involving 22\nstate-of-the-art YOLO object detectors, including five versions (YOLOv3,\nYOLOv5, YOLOv6, YOLOv7, and YOLOv8), has been established for real-time traffic\nobject detection using a self-collected dataset featuring e-scooters. 
The\ndetection accuracy, measured in terms of mAP@0.5, ranges from 27.4%\n(YOLOv7-E6E) to 86.8% (YOLOv5s). All YOLO models, particularly YOLOv3-tiny,\nhave displayed promising potential for real-time object detection in the\ncontext of e-scooters. Both the traffic scene dataset\n(https://zenodo.org/records/10578641) and software program codes\n(https://github.com/DongChen06/ScooterDet) for model benchmarking in this study\nare publicly available, which will not only improve e-scooter safety with\nadvanced object detection but also lay the groundwork for tailored solutions,\npromising a safer and more sustainable urban micromobility landscape.\n","authors":["Dong Chen","Arman Hosseini","Arik Smith","Amir Farzin Nikkhah","Arsalan Heydarian","Omid Shoghli","Bradford Campbell"],"pdf_url":"https://arxiv.org/pdf/2405.03039v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2204.07756v3","updated":"2024-05-05T18:44:14Z","published":"2022-04-16T08:57:00Z","title":"Visual Attention Methods in Deep Learning: An In-Depth Survey","summary":" Inspired by the human cognitive system, attention is a mechanism that\nimitates the human cognitive awareness about specific information, amplifying\ncritical details to focus more on the essential aspects of data. Deep learning\nhas employed attention to boost performance for many applications.\nInterestingly, the same attention design can suit processing different data\nmodalities and can easily be incorporated into large networks. Furthermore,\nmultiple complementary attention mechanisms can be incorporated into one\nnetwork. Hence, attention techniques have become extremely attractive. However,\nthe literature lacks a comprehensive survey on attention techniques to guide\nresearchers in employing attention in their deep models. Note that, besides\nbeing demanding in terms of training data and computational resources,\ntransformers only cover a single category in self-attention out of the many\ncategories available. We fill this gap and provide an in-depth survey of 50\nattention techniques, categorizing them by their most prominent features. We\ninitiate our discussion by introducing the fundamental concepts behind the\nsuccess of the attention mechanism. Next, we furnish some essentials such as\nthe strengths and limitations of each attention category, describe their\nfundamental building blocks, basic formulations with primary usage, and\napplications specifically for computer vision. We also discuss the challenges\nand general open questions related to attention mechanisms. Finally, we\nrecommend possible future research directions for deep attention. All the\ninformation about visual attention methods in deep learning is provided at\n\\href{https://github.com/saeed-anwar/VisualAttention}{https://github.com/saeed-anwar/VisualAttention}\n","authors":["Mohammed Hassanin","Saeed Anwar","Ibrahim Radwan","Fahad S Khan","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2204.07756v3.pdf","comment":"Accepted in Information Fusion"},{"id":"http://arxiv.org/abs/2405.03025v1","updated":"2024-05-05T18:36:45Z","published":"2024-05-05T18:36:45Z","title":"Matten: Video Generation with Mamba-Attention","summary":" In this paper, we introduce Matten, a cutting-edge latent diffusion model\nwith Mamba-Attention architecture for video generation. With minimal\ncomputational cost, Matten employs spatial-temporal attention for local video\ncontent modeling and bidirectional Mamba for global video content modeling. 
Our\ncomprehensive experimental evaluation demonstrates that Matten has competitive\nperformance with the current Transformer-based and GAN-based models in\nbenchmark performance, achieving superior FVD scores and efficiency.\nAdditionally, we observe a direct positive correlation between the complexity\nof our designed model and the improvement in video quality, indicating the\nexcellent scalability of Matten.\n","authors":["Yu Gao","Jiancheng Huang","Xiaopeng Sun","Zequn Jie","Yujie Zhong","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2405.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03011v1","updated":"2024-05-05T17:37:50Z","published":"2024-05-05T17:37:50Z","title":"AC-MAMBASEG: An adaptive convolution and Mamba-based architecture for\n enhanced skin lesion segmentation","summary":" Skin lesion segmentation is a critical task in computer-aided diagnosis\nsystems for dermatological diseases. Accurate segmentation of skin lesions from\nmedical images is essential for early detection, diagnosis, and treatment\nplanning. In this paper, we propose a new model for skin lesion segmentation\nnamely AC-MambaSeg, an enhanced model that has the hybrid CNN-Mamba backbone,\nand integrates advanced components such as Convolutional Block Attention Module\n(CBAM), Attention Gate, and Selective Kernel Bottleneck. AC-MambaSeg leverages\nthe Vision Mamba framework for efficient feature extraction, while CBAM and\nSelective Kernel Bottleneck enhance its ability to focus on informative regions\nand suppress background noise. We evaluate the performance of AC-MambaSeg on\ndiverse datasets of skin lesion images including ISIC-2018 and PH2; then\ncompare it against existing segmentation methods. Our model shows promising\npotential for improving computer-aided diagnosis systems and facilitating early\ndetection and treatment of dermatological diseases. Our source code will be\nmade available at: https://github.com/vietthanh2710/AC-MambaSeg.\n","authors":["Viet-Thanh Nguyen","Van-Truong Pham","Thi-Thao Tran"],"pdf_url":"https://arxiv.org/pdf/2405.03011v1.pdf","comment":"15 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.03008v1","updated":"2024-05-05T17:34:38Z","published":"2024-05-05T17:34:38Z","title":"DVMSR: Distillated Vision Mamba for Efficient Super-Resolution","summary":" Efficient Image Super-Resolution (SR) aims to accelerate SR network inference\nby minimizing computational complexity and network parameters while preserving\nperformance. Existing state-of-the-art Efficient Image Super-Resolution methods\nare based on convolutional neural networks. Few attempts have been made with\nMamba to harness its long-range modeling capability and efficient computational\ncomplexity, which have shown impressive performance on high-level vision tasks.\nIn this paper, we propose DVMSR, a novel lightweight Image SR network that\nincorporates Vision Mamba and a distillation strategy. The network of DVMSR\nconsists of three modules: feature extraction convolution, multiple stacked\nResidual State Space Blocks (RSSBs), and a reconstruction module. Specifically,\nthe deep feature extraction module is composed of several residual state space\nblocks (RSSB), each of which has several Vision Mamba Modules (ViMM) together\nwith a residual connection. To achieve efficiency improvement while maintaining\ncomparable performance, we employ a distillation strategy to the vision Mamba\nnetwork for superior performance. 
Specifically, we leverage the rich\nrepresentation knowledge of the teacher network as additional supervision for the\noutput of lightweight student networks. Extensive experiments have demonstrated\nthat our proposed DVMSR can outperform state-of-the-art efficient SR methods in\nterms of model parameters while maintaining the performance of both PSNR and\nSSIM. The source code is available at https://github.com/nathan66666/DVMSR.git\n","authors":["Xiaoyan Lei","Wenlong ZHang","Weifeng Cao"],"pdf_url":"https://arxiv.org/pdf/2405.03008v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.18253v4","updated":"2024-05-05T17:10:03Z","published":"2024-04-28T17:20:08Z","title":"Efficient Remote Sensing with Harmonized Transfer Learning and Modality\n Alignment","summary":" With the rise of Visual and Language Pretraining (VLP), an increasing number\nof downstream tasks are adopting the paradigm of pretraining followed by\nfine-tuning. Although this paradigm has demonstrated potential in various\nmultimodal downstream tasks, its implementation in the remote sensing domain\nencounters some obstacles. Specifically, the tendency for same-modality\nembeddings to cluster together impedes efficient transfer learning. To tackle\nthis issue, we review the aim of multimodal transfer learning for downstream\ntasks from a unified perspective, and rethink the optimization process based on\nthree distinct objectives. We propose \"Harmonized Transfer Learning and\nModality Alignment (HarMA)\", a method that simultaneously satisfies task\nconstraints, modality alignment, and single-modality uniform alignment, while\nminimizing training overhead through parameter-efficient fine-tuning.\nRemarkably, without the need for external data for training, HarMA achieves\nstate-of-the-art performance in two popular multimodal retrieval tasks in the\nfield of remote sensing. Our experiments reveal that HarMA achieves competitive\nand even superior performance to fully fine-tuned models with only minimal\nadjustable parameters. Due to its simplicity, HarMA can be integrated into\nalmost all existing multimodal pretraining models. We hope this method can\nfacilitate the efficient application of large models to a wide range of\ndownstream tasks while significantly reducing the resource consumption. Code is\navailable at https://github.com/seekerhuang/HarMA.\n","authors":["Tengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.18253v4.pdf","comment":"Accepted by the Twelfth International Conference on Learning\n Representations (ICLR) Workshop"},{"id":"http://arxiv.org/abs/2404.18316v2","updated":"2024-05-05T16:38:42Z","published":"2024-04-28T20:57:55Z","title":"Position paper: Do not explain (vision models) without context","summary":" Does the stethoscope in the picture make the adjacent person a doctor or a\npatient? This, of course, depends on the contextual relationship of the two\nobjects. If it is obvious, why don't explanation methods for vision models\nuse contextual information? 
In this paper, we (1) review the most popular\nmethods of explaining computer vision models by pointing out that they do not\ntake into account context information, (2) provide examples of real-world use\ncases where spatial context plays a significant role, (3) propose new research\ndirections that may lead to better use of context information in explaining\ncomputer vision models, (4) argue that a change in approach to explanations is\nneeded from 'where' to 'how'.\n","authors":["Paulina Tomaszewska","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.18316v2.pdf","comment":"Accepted for ICML 2024"},{"id":"http://arxiv.org/abs/2405.02984v1","updated":"2024-05-05T16:07:23Z","published":"2024-05-05T16:07:23Z","title":"E-TSL: A Continuous Educational Turkish Sign Language Dataset with\n Baseline Methods","summary":" This study introduces the continuous Educational Turkish Sign Language\n(E-TSL) dataset, collected from online Turkish language lessons for 5th, 6th,\nand 8th grades. The dataset comprises 1,410 videos totaling nearly 24 hours and\nincludes performances from 11 signers. Turkish, an agglutinative language,\nposes unique challenges for sign language translation, particularly with a\nvocabulary where 64% are singleton words and 85% are rare words, appearing less\nthan five times. We developed two baseline models to address these challenges:\nthe Pose to Text Transformer (P2T-T) and the Graph Neural Network based\nTransformer (GNN-T) models. The GNN-T model achieved 19.13% BLEU-1 score and\n3.28% BLEU-4 score, presenting a significant challenge compared to existing\nbenchmarks. The P2T-T model, while demonstrating slightly lower performance in\nBLEU scores, achieved a higher ROUGE-L score of 22.09%. Additionally, we\nbenchmarked our model using the well-known PHOENIX-Weather 2014T dataset to\nvalidate our approach.\n","authors":["Şükrü Öztürk","Hacer Yalim Keles"],"pdf_url":"https://arxiv.org/pdf/2405.02984v1.pdf","comment":"7 pages, 3 figures, 4 tables, submitted to IEEE conference"},{"id":"http://arxiv.org/abs/2405.02982v1","updated":"2024-05-05T16:05:56Z","published":"2024-05-05T16:05:56Z","title":"Paintings and Drawings Aesthetics Assessment with Rich Attributes for\n Various Artistic Categories","summary":" Image aesthetic evaluation is a highly prominent research domain in the field\nof computer vision. In recent years, there has been a proliferation of datasets\nand corresponding evaluation methodologies for assessing the aesthetic quality\nof photographic works, leading to the establishment of a relatively mature\nresearch environment. However, in contrast to the extensive research in\nphotographic aesthetics, the field of aesthetic evaluation for paintings and\nDrawings has seen limited attention until the introduction of the BAID dataset\nin March 2023. This dataset solely comprises overall scores for high-quality\nartistic images. Our research marks the pioneering introduction of a\nmulti-attribute, multi-category dataset specifically tailored to the field of\npainting: Aesthetics of Paintings and Drawings Dataset (APDD). The construction\nof APDD received active participation from 28 professional artists worldwide,\nalong with dozens of students specializing in the field of art. This dataset\nencompasses 24 distinct artistic categories and 10 different aesthetic\nattributes. Each image in APDD has been evaluated by six professionally trained\nexperts in the field of art, including assessments for both total aesthetic\nscores and aesthetic attribute scores. 
The final APDD dataset comprises a total\nof 4985 images, with an annotation count exceeding 31100 entries. Concurrently,\nwe propose an innovative approach: Art Assessment Network for Specific Painting\nStyles (AANSPS), designed for the assessment of aesthetic attributes in\nmixed-attribute art datasets. Through this research, our goal is to catalyze\nadvancements in the field of aesthetic evaluation for paintings and drawings,\nwhile enriching the available resources and methodologies for its further\ndevelopment and application.\n","authors":["Xin Jin","Qianqian Qiao","Yi Lu","Shan Gao","Heng Huang","Guangdong Li"],"pdf_url":"https://arxiv.org/pdf/2405.02982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02977v1","updated":"2024-05-05T15:50:02Z","published":"2024-05-05T15:50:02Z","title":"SkelCap: Automated Generation of Descriptive Text from Skeleton Keypoint\n Sequences","summary":" Numerous sign language datasets exist, yet they typically cover only a\nlimited selection of the thousands of signs used globally. Moreover, creating\ndiverse sign language datasets is an expensive and challenging task due to the\ncosts associated with gathering a varied group of signers. Motivated by these\nchallenges, we aimed to develop a solution that addresses these limitations. In\nthis context, we focused on textually describing body movements from skeleton\nkeypoint sequences, leading to the creation of a new dataset. We structured\nthis dataset around AUTSL, a comprehensive isolated Turkish sign language\ndataset. We also developed a baseline model, SkelCap, which can generate\ntextual descriptions of body movements. This model processes the skeleton\nkeypoints data as a vector, applies a fully connected layer for embedding, and\nutilizes a transformer neural network for sequence-to-sequence modeling. We\nconducted extensive evaluations of our model, including signer-agnostic and\nsign-agnostic assessments. The model achieved promising results, with a ROUGE-L\nscore of 0.98 and a BLEU-4 score of 0.94 in the signer-agnostic evaluation. The\ndataset we have prepared, namely the AUTSL-SkelCap, will be made publicly\navailable soon.\n","authors":["Ali Emre Keskin","Hacer Yalim Keles"],"pdf_url":"https://arxiv.org/pdf/2405.02977v1.pdf","comment":"8 pages, 5 figures, 7 tables, submitted to IEEE conference"},{"id":"http://arxiv.org/abs/2405.02962v1","updated":"2024-05-05T15:01:29Z","published":"2024-05-05T15:01:29Z","title":"VectorPainter: A Novel Approach to Stylized Vector Graphics Synthesis\n with Vectorized Strokes","summary":" We propose a novel method, VectorPainter, for the task of stylized vector\ngraphics synthesis. Given a text prompt and a reference style image,\nVectorPainter generates a vector graphic that aligns in content with the text\nprompt and remains faithful in style to the reference image. We recognize that\nthe key to this task lies in fully leveraging the intrinsic properties of\nvector graphics. Innovatively, we conceptualize the stylization process as the\nrearrangement of vectorized strokes extracted from the reference image.\nVectorPainter employs an optimization-based pipeline. It begins by extracting\nvectorized strokes from the reference image, which are then used to initialize\nthe synthesis process. To ensure fidelity to the reference style, a novel style\npreservation loss is introduced. 
Extensive experiments have been conducted to\ndemonstrate that our method is capable of aligning with the text description\nwhile remaining faithful to the reference image.\n","authors":["Juncheng Hu","Ximing Xing","Zhengqi Zhang","Jing Zhang","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2405.02962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02961v1","updated":"2024-05-05T15:01:00Z","published":"2024-05-05T15:01:00Z","title":"JOSENet: A Joint Stream Embedding Network for Violence Detection in\n Surveillance Videos","summary":" Due to the ever-increasing availability of video surveillance cameras and the\ngrowing need for crime prevention, the violence detection task is attracting\ngreater attention from the research community. With respect to other action\nrecognition tasks, violence detection in surveillance videos shows additional\nissues, such as the presence of a significant variety of real fight scenes.\nUnfortunately, available datasets seem to be very small compared with other\naction recognition datasets. Moreover, in surveillance applications, people in\nthe scenes always differ for each video and the background of the footage\ndiffers for each camera. Also, violent actions in real-life surveillance videos\nmust be detected quickly to prevent unwanted consequences, thus models would\ndefinitely benefit from a reduction in memory usage and computational costs.\nSuch problems make classical action recognition methods difficult to be\nadopted. To tackle all these issues, we introduce JOSENet, a novel\nself-supervised framework that provides outstanding performance for violence\ndetection in surveillance videos. The proposed model receives two\nspatiotemporal video streams, i.e., RGB frames and optical flows, and involves\na new regularized self-supervised learning approach for videos. JOSENet\nprovides improved performance compared to self-supervised state-of-the-art\nmethods, while requiring one-fourth of the number of frames per video segment\nand a reduced frame rate. The source code and the instructions to reproduce our\nexperiments are available at https://github.com/ispamm/JOSENet.\n","authors":["Pietro Nardelli","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2405.02961v1.pdf","comment":"Submitted to the International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2405.02958v1","updated":"2024-05-05T14:56:34Z","published":"2024-05-05T14:56:34Z","title":"Score-based Generative Priors Guided Model-driven Network for MRI\n Reconstruction","summary":" Score matching with Langevin dynamics (SMLD) method has been successfully\napplied to accelerated MRI. However, the hyperparameters in the sampling\nprocess require subtle tuning, otherwise the results can be severely corrupted\nby hallucination artifacts, particularly with out-of-distribution test data. In\nthis study, we propose a novel workflow in which SMLD results are regarded as\nadditional priors to guide model-driven network training. First, we adopted a\npretrained score network to obtain samples as preliminary guidance images (PGI)\nwithout the need for network retraining, parameter tuning and in-distribution\ntest data. Although PGIs are corrupted by hallucination artifacts, we believe\nthat they can provide extra information through effective denoising steps to\nfacilitate reconstruction. Therefore, we designed a denoising module (DM) in\nthe second step to improve the quality of PGIs. 
The features are extracted from\nthe components of Langevin dynamics and the same score network with\nfine-tuning; hence, we can directly learn the artifact patterns. Third, we\ndesigned a model-driven network whose training is guided by denoised PGIs\n(DGIs). DGIs are densely connected with intermediate reconstructions in each\ncascade to enrich the features and are periodically updated to provide more\naccurate guidance. Our experiments on different sequences revealed that despite\nthe low average quality of PGIs, the proposed workflow can effectively extract\nvaluable information to guide the network training, even with severely reduced\ntraining data and sampling steps. Our method outperforms other cutting-edge\ntechniques by effectively mitigating hallucination artifacts, yielding robust\nand high-quality reconstruction results.\n","authors":["Xiaoyu Qiao","Weisheng Li","Yuping Huang","Lijian Yang"],"pdf_url":"https://arxiv.org/pdf/2405.02958v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2405.02954v1","updated":"2024-05-05T14:48:13Z","published":"2024-05-05T14:48:13Z","title":"Source-Free Domain Adaptation Guided by Vision and Vision-Language\n Pre-Training","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to a related but unlabeled target domain. While\nthe source model is a key avenue for acquiring target pseudolabels, the\ngenerated pseudolabels may exhibit source bias. In the conventional SFDA\npipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to\ninitialize the source model at the start of source training, and subsequently\ndiscarded. Despite having diverse features important for generalization, the\npre-trained feature extractor can overfit to the source data distribution\nduring source training and forget relevant target domain knowledge. Rather than\ndiscarding this valuable knowledge, we introduce an integrated framework to\nincorporate pre-trained networks into the target adaptation process. The\nproposed framework is flexible and allows us to plug modern pre-trained\nnetworks into the adaptation process to leverage their stronger representation\nlearning capabilities. For adaptation, we propose the Co-learn algorithm to\nimprove target pseudolabel quality collaboratively through the source model and\na pre-trained feature extractor. Building on the recent success of the\nvision-language model CLIP in zero-shot image recognition, we present an\nextension Co-learn++ to further incorporate CLIP's zero-shot classification\ndecisions. We evaluate on 3 benchmark datasets and include more challenging\nscenarios such as open-set, partial-set and open-partial SFDA. 
Experimental\nresults demonstrate that our proposed strategy improves adaptation performance\nand can be successfully integrated with existing SFDA methods.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2405.02954v1.pdf","comment":"Extension of ICCV paper arXiv:2212.07585, submitted to IJCV"},{"id":"http://arxiv.org/abs/2405.02951v1","updated":"2024-05-05T14:39:06Z","published":"2024-05-05T14:39:06Z","title":"iSEARLE: Improving Textual Inversion for Zero-Shot Composed Image\n Retrieval","summary":" Given a query consisting of a reference image and a relative caption,\nComposed Image Retrieval (CIR) aims to retrieve target images visually similar\nto the reference one while incorporating the changes specified in the relative\ncaption. The reliance of supervised methods on labor-intensive manually labeled\ndatasets hinders their broad applicability. In this work, we introduce a new\ntask, Zero-Shot CIR (ZS-CIR), that addresses CIR without the need for a labeled\ntraining dataset. We propose an approach named iSEARLE (improved zero-Shot\ncomposEd imAge Retrieval with textuaL invErsion) that involves mapping the\nvisual information of the reference image into a pseudo-word token in CLIP\ntoken embedding space and combining it with the relative caption. To foster\nresearch on ZS-CIR, we present an open-domain benchmarking dataset named CIRCO\n(Composed Image Retrieval on Common Objects in context), the first CIR dataset\nwhere each query is labeled with multiple ground truths and a semantic\ncategorization. The experimental results illustrate that iSEARLE obtains\nstate-of-the-art performance on three different CIR datasets -- FashionIQ,\nCIRR, and the proposed CIRCO -- and two additional evaluation settings, namely\ndomain conversion and object composition. The dataset, the code, and the model\nare publicly available at https://github.com/miccunifi/SEARLE.\n","authors":["Lorenzo Agnolucci","Alberto Baldrati","Marco Bertini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2405.02951v1.pdf","comment":"Extended version of the ICCV2023 paper arXiv:2303.15247"},{"id":"http://arxiv.org/abs/2405.02945v1","updated":"2024-05-05T14:14:49Z","published":"2024-05-05T14:14:49Z","title":"Invertible Residual Rescaling Models","summary":" Invertible Rescaling Networks (IRNs) and their variants have witnessed\nremarkable achievements in various image processing tasks like image rescaling.\nHowever, we observe that IRNs with deeper networks are difficult to train, thus\nhindering the representational ability of IRNs. To address this issue, we\npropose Invertible Residual Rescaling Models (IRRM) for image rescaling by\nlearning a bijection between a high-resolution image and its low-resolution\ncounterpart with a specific distribution. Specifically, we propose IRRM to\nbuild a deep network, which contains several Residual Downscaling Modules\n(RDMs) with long skip connections. Each RDM consists of several Invertible\nResidual Blocks (IRBs) with short connections. In this way, RDM allows rich\nlow-frequency information to be bypassed by skip connections and forces models\nto focus on extracting high-frequency information from the image. Extensive\nexperiments show that our IRRM performs significantly better than other\nstate-of-the-art methods with much fewer parameters and complexity.\nParticularly, our IRRM has respectively PSNR gains of at least 0.3 dB over\nHCFlow and IRN in the $\\times 4$ rescaling while only using 60\\% parameters and\n50\\% FLOPs. 
The code will be available at https://github.com/THU-Kingmin/IRRM.\n","authors":["Jinmin Li","Tao Dai","Yaohua Zha","Yilu Luo","Longfei Lu","Bin Chen","Zhi Wang","Shu-Tao Xia","Jingyun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02944v1","updated":"2024-05-05T14:12:48Z","published":"2024-05-05T14:12:48Z","title":"Imaging Signal Recovery Using Neural Network Priors Under Uncertain\n Forward Model Parameters","summary":" Inverse imaging problems (IIPs) arise in various applications, with the main\nobjective of reconstructing an image from its compressed measurements. This\nproblem is often ill-posed for being under-determined with multiple\ninterchangeably consistent solutions. The best solution inherently depends on\nprior knowledge or assumptions, such as the sparsity of the image. Furthermore,\nthe reconstruction process for most IIPs relies significantly on the imaging\n(i.e. forward model) parameters, which might not be fully known, or the\nmeasurement device may undergo calibration drifts. These uncertainties in the\nforward model create substantial challenges, where inaccurate reconstructions\nusually happen when the postulated parameters of the forward model do not fully\nmatch the actual ones. In this work, we tackle accurate\nreconstruction in the setting where only a set of candidate forward model parameters\nis available. Here, we propose a novel Moment-Aggregation (MA) framework that is\ncompatible with the popular IIP solution by using a neural network prior.\nSpecifically, our method can reconstruct the signal by considering all\ncandidate parameters of the forward model simultaneously during the update of\nthe neural network. We theoretically demonstrate the convergence of the MA\nframework, which has a similar complexity to reconstruction under the known\nforward model parameters. Proof-of-concept experiments demonstrate that the\nproposed MA achieves performance comparable to the forward model with the known\nprecise parameter in reconstruction across both compressive sensing and phase\nretrieval applications, with a PSNR gap of 0.17 to 1.94 over various datasets,\nincluding MNIST, X-ray, Glas, and MoNuseg. This highlights our method's\nsignificant potential in reconstruction under an uncertain forward model.\n","authors":["Xiwen Chen","Wenhui Zhu","Peijie Qiu","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2405.02944v1.pdf","comment":"Accepted by PBDL-CVPR 2024"},{"id":"http://arxiv.org/abs/2405.02942v1","updated":"2024-05-05T14:07:23Z","published":"2024-05-05T14:07:23Z","title":"Design, analysis, and manufacturing of a glass-plastic hybrid minimalist\n aspheric panoramic annular lens","summary":" We propose a high-performance glass-plastic hybrid minimalist aspheric\npanoramic annular lens (ASPAL) to solve several major limitations of the\ntraditional panoramic annular lens (PAL), such as large size, high weight, and\ncomplex system. The field of view (FoV) of the ASPAL is\n360{\\deg}x(35{\\deg}~110{\\deg}) and the imaging quality is close to the\ndiffraction limit. This large FoV ASPAL is composed of only 4 lenses. Moreover,\nwe establish a physical structure model of PAL using the ray tracing method and\nstudy the influence of its physical parameters on compactness ratio. In\naddition, for the evaluation of local tolerances of annular surfaces, we\npropose a tolerance analysis method suitable for ASPAL. 
This analytical method\ncan effectively analyze surface irregularities on annular surfaces and provide\nclear guidance on manufacturing tolerances for ASPAL. Benefiting from\nhigh-precision glass molding and injection molding aspheric lens manufacturing\ntechniques, we finally manufactured 20 ASPALs in small batches. The weight of\nan ASPAL prototype is only 8.5 g. Our framework provides promising insights for\nthe application of panoramic systems in space and weight-constrained\nenvironmental sensing scenarios such as intelligent security, micro-UAVs, and\nmicro-robots.\n","authors":["Shaohua Gao","Qi Jiang","Yiqi Liao","Yi Qiu","Wanglei Ying","Kailun Yang","Kaiwei Wang","Benhao Zhang","Jian Bai"],"pdf_url":"https://arxiv.org/pdf/2405.02942v1.pdf","comment":"Accepted to Optics & Laser Technology"},{"id":"http://arxiv.org/abs/2405.02941v1","updated":"2024-05-05T14:05:33Z","published":"2024-05-05T14:05:33Z","title":"Boundary-aware Decoupled Flow Networks for Realistic Extreme Rescaling","summary":" Recently developed generative methods, including invertible rescaling network\n(IRN) based and generative adversarial network (GAN) based methods, have\ndemonstrated exceptional performance in image rescaling. However, IRN-based\nmethods tend to produce over-smoothed results, while GAN-based methods easily\ngenerate fake details, which thus hinders their real applications. To address\nthis issue, we propose Boundary-aware Decoupled Flow Networks (BDFlow) to\ngenerate realistic and visually pleasing results. Unlike previous methods that\nmodel high-frequency information as standard Gaussian distribution directly,\nour BDFlow first decouples the high-frequency information into \\textit{semantic\nhigh-frequency} that adheres to a Boundary distribution and\n\\textit{non-semantic high-frequency} counterpart that adheres to a Gaussian\ndistribution. Specifically, to capture semantic high-frequency parts\naccurately, we use Boundary-aware Mask (BAM) to constrain the model to produce\nrich textures, while non-semantic high-frequency part is randomly sampled from\na Gaussian distribution. Comprehensive experiments demonstrate that our BDFlow\nsignificantly outperforms other state-of-the-art methods while maintaining\nlower complexity. Notably, our BDFlow improves the PSNR by $4.4$ dB and the\nSSIM by $0.1$ on average over GRAIN, utilizing only 74\\% of the parameters and\n20\\% of the computation. The code will be available at\nhttps://github.com/THU-Kingmin/BAFlow.\n","authors":["Jinmin Li","Tao Dai","Jingyun Zhang","Kang Liu","Jun Wang","Shaoming Wang","Shu-Tao Xia","rizen guo"],"pdf_url":"https://arxiv.org/pdf/2405.02941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03383v2","updated":"2024-05-05T13:48:12Z","published":"2024-02-04T07:29:00Z","title":"A Collaborative Model-driven Network for MRI Reconstruction","summary":" Deep learning (DL)-based methods offer a promising solution to reduce the\nprolonged scanning time in magnetic resonance imaging (MRI). While model-driven\nDL methods have demonstrated convincing results by incorporating prior\nknowledge into deep networks, further exploration is needed to optimize the\nintegration of diverse priors. Existing model-driven networks typically\nutilize linearly stacked unrolled cascades to mimic iterative solution steps in\noptimization algorithms. However, this approach needs to find a balance between\ndifferent prior-based regularizers during training, resulting in slower\nconvergence and suboptimal reconstructions. 
To overcome the limitations, we\npropose a collaborative model-driven network to maximally exploit the\ncomplementarity of different regularizers. We design attention modules to learn\nboth the relative confidence (RC) and overall confidence (OC) for the\nintermediate reconstructions (IRs) generated by different prior-based\nsubnetworks. RC assigns more weight to the areas of expertise of the\nsubnetworks, enabling precise element-wise collaboration. We design correction\nmodules to tackle bottleneck scenarios where both subnetworks exhibit low\naccuracy, and they further optimize the IRs based on OC maps. IRs across\nvarious stages are concatenated and fed to the attention modules to build\nrobust and accurate confidence maps. Experimental results on multiple datasets\nshowed significant improvements in the final results without additional\ncomputational costs. Moreover, the proposed model-driven network design\nstrategy can be conveniently applied to various model-driven methods to improve\ntheir performance.\n","authors":["Xiaoyu Qiao","Weisheng Li","Guofen Wang","Yuping Huang"],"pdf_url":"https://arxiv.org/pdf/2402.03383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02929v1","updated":"2024-05-05T13:15:11Z","published":"2024-05-05T13:15:11Z","title":"Unified Dynamic Scanpath Predictors Outperform Individually Trained\n Models","summary":" Previous research on scanpath prediction has mainly focused on group models,\ndisregarding the fact that the scanpaths and attentional behaviors of\nindividuals are diverse. The disregard of these differences is especially\ndetrimental to social human-robot interaction, whereby robots commonly emulate\nhuman gaze based on heuristics or predefined patterns. However, human gaze\npatterns are heterogeneous and varying behaviors can significantly affect the\noutcomes of such human-robot interactions. To fill this gap, we developed a\ndeep learning-based social cue integration model for saliency prediction to\ninstead predict scanpaths in videos. Our model learned scanpaths by recursively\nintegrating fixation history and social cues through a gating mechanism and\nsequential attention. We evaluated our approach on gaze datasets of dynamic\nsocial scenes, observed under the free-viewing condition. The introduction of\nfixation history into our models makes it possible to train a single unified\nmodel rather than the resource-intensive approach of training individual models\nfor each set of scanpaths. We observed that the late neural integration\napproach surpasses early fusion when training models on a large dataset, in\ncomparison to a smaller dataset with a similar distribution. Results also\nindicate that a single unified model, trained on all the observers' scanpaths,\nperforms on par or better than individually trained models. 
We hypothesize that\nthis outcome is a result of the group saliency representations instilling\nuniversal attention in the model, while the supervisory signal guides it to\nlearn personalized attentional behaviors, providing the unified model a benefit\nover individual models due to its implicit representation of universal\nattention.\n","authors":["Fares Abawi","Di Fu","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2405.02929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13904v2","updated":"2024-05-05T12:57:20Z","published":"2024-04-22T06:28:41Z","title":"Deep Regression Representation Learning with Topology","summary":" Most works studying representation learning focus only on classification and\nneglect regression. Yet, the learning objectives and therefore the\nrepresentation topologies of the two tasks are fundamentally different:\nclassification targets class separation, leading to disconnected\nrepresentations, whereas regression requires ordinality with respect to the\ntarget, leading to continuous representations. We thus wonder how the\neffectiveness of a regression representation is influenced by its topology,\nwith evaluation based on the Information Bottleneck (IB) principle.\n The IB principle is an important framework that provides principles for\nlearning effectiveness representations. We establish two connections between it\nand the topology of regression representations. The first connection reveals\nthat a lower intrinsic dimension of the feature space implies a reduced\ncomplexity of the representation Z. This complexity can be quantified as the\nconditional entropy of Z on the target space Y and serves as an upper bound on\nthe generalization error. The second connection suggests learning a feature\nspace that is topologically similar to the target space will better align with\nthe IB principle. Based on these two connections, we introduce PH-Reg, a\nregularizer specific to regression that matches the intrinsic dimension and\ntopology of the feature space with the target space. Experiments on synthetic\nand real-world regression tasks demonstrate the benefits of PH-Reg.\n","authors":["Shihao Zhang","kenji kawaguchi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.13904v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2405.02918v1","updated":"2024-05-05T12:52:28Z","published":"2024-05-05T12:52:28Z","title":"MERIT: Multi-view Evidential learning for Reliable and Interpretable\n liver fibrosis sTaging","summary":" Accurate staging of liver fibrosis from magnetic resonance imaging (MRI) is\ncrucial in clinical practice. While conventional methods often focus on a\nspecific sub-region, multi-view learning captures more information by analyzing\nmultiple patches simultaneously. However, previous multi-view approaches could\nnot typically calculate uncertainty by nature, and they generally integrate\nfeatures from different views in a black-box fashion, hence compromising\nreliability as well as interpretability of the resulting models. In this work,\nwe propose a new multi-view method based on evidential learning, referred to as\nMERIT, which tackles the two challenges in a unified framework. 
MERIT enables\nuncertainty quantification of the predictions to enhance reliability, and\nemploys a logic-based combination rule to improve interpretability.\nSpecifically, MERIT models the prediction from each sub-view as an opinion with\nquantified uncertainty under the guidance of the subjective logic theory.\nFurthermore, a distribution-aware base rate is introduced to enhance\nperformance, particularly in scenarios involving class distribution shifts.\nFinally, MERIT adopts a feature-specific combination rule to explicitly fuse\nmulti-view predictions, thereby enhancing interpretability. Results have\nshowcased the effectiveness of the proposed MERIT, highlighting the reliability\nand offering both ad-hoc and post-hoc interpretability. They also illustrate\nthat MERIT can elucidate the significance of each view in the decision-making\nprocess for liver fibrosis staging.\n","authors":["Yuanye Liu","Zheyao Gao","Nannan Shi","Fuping Wu","Yuxin Shi","Qingchao Chen","Xiahai Zhuang"],"pdf_url":"https://arxiv.org/pdf/2405.02918v1.pdf","comment":"Submitted to Medical Image Analysis"},{"id":"http://arxiv.org/abs/2405.02917v1","updated":"2024-05-05T12:51:38Z","published":"2024-05-05T12:51:38Z","title":"Overconfidence is Key: Verbalized Uncertainty Evaluation in Large\n Language and Vision-Language Models","summary":" Language and Vision-Language Models (LLMs/VLMs) have revolutionized the field\nof AI by their ability to generate human-like text and understand images, but\nensuring their reliability is crucial. This paper aims to evaluate the ability\nof LLMs (GPT4, GPT-3.5, LLaMA2, and PaLM 2) and VLMs (GPT4V and Gemini Pro\nVision) to estimate their verbalized uncertainty via prompting. We propose the\nnew Japanese Uncertain Scenes (JUS) dataset, aimed at testing VLM capabilities\nvia difficult queries and object counting, and the Net Calibration Error (NCE)\nto measure direction of miscalibration. Results show that both LLMs and VLMs\nhave a high calibration error and are overconfident most of the time,\nindicating a poor capability for uncertainty estimation. Additionally we\ndevelop prompts for regression tasks, and we show that VLMs have poor\ncalibration when producing mean/standard deviation and 95% confidence\nintervals.\n","authors":["Tobias Groot","Matias Valdenegro-Toro"],"pdf_url":"https://arxiv.org/pdf/2405.02917v1.pdf","comment":"8 pages, with appendix. To appear in TrustNLP workshop @ NAACL 2024"},{"id":"http://arxiv.org/abs/2405.02913v1","updated":"2024-05-05T12:41:55Z","published":"2024-05-05T12:41:55Z","title":"Fast TILs estimation in lung cancer WSIs based on semi-stochastic patch\n sampling","summary":" Addressing the critical need for accurate prognostic biomarkers in cancer\ntreatment, quantifying tumor-infiltrating lymphocytes (TILs) in non-small cell\nlung cancer (NSCLC) presents considerable challenges. Manual TIL quantification\nin whole slide images (WSIs) is laborious and subject to variability,\npotentially undermining patient outcomes. Our study introduces an automated\npipeline that utilizes semi-stochastic patch sampling, patch classification to\nretain prognostically relevant patches, and cell quantification using the\nHoVer-Net model to streamline the TIL evaluation process. This pipeline\nefficiently excludes approximately 70% of areas not relevant for prognosis and\nrequires only 5% of the remaining patches to maintain prognostic accuracy\n(c-index 0.65 +- 0.01). 
The computational efficiency achieved does not\nsacrifice prognostic accuracy, as demonstrated by the TILs score's strong\ncorrelation with patient survival, which surpasses traditional CD8 IHC scoring\nmethods. While the pipeline demonstrates potential for enhancing NSCLC\nprognostication and personalization of treatment, comprehensive clinical\nvalidation is still required. Future research should focus on verifying its\nbroader clinical utility and investigating additional biomarkers to improve\nNSCLC prognosis.\n","authors":["Nikita Shvetsov","Anders Sildnes","Lill-Tove Rasmussen Busund","Stig Dalen","Kajsa Møllersen","Lars Ailo Bongo","Thomas K. Kilvaer"],"pdf_url":"https://arxiv.org/pdf/2405.02913v1.pdf","comment":"18 pages, 7 figures, 6 appendix pages"},{"id":"http://arxiv.org/abs/2405.02911v1","updated":"2024-05-05T12:38:10Z","published":"2024-05-05T12:38:10Z","title":"Multimodal Sense-Informed Prediction of 3D Human Motions","summary":" Predicting future human pose is a fundamental application for machine\nintelligence, which drives robots to plan their behavior and paths ahead of\ntime to seamlessly accomplish human-robot collaboration in real-world 3D\nscenarios. Despite encouraging results, existing approaches rarely consider the\neffects of the external scene on the motion sequence, leading to pronounced\nartifacts and physical implausibilities in the predictions. To address this\nlimitation, this work introduces a novel multi-modal sense-informed motion\nprediction approach, which conditions high-fidelity generation on two modal\ninformation: external 3D scene, and internal human gaze, and is able to\nrecognize their salience for future human activity. Furthermore, the gaze\ninformation is regarded as the human intention, and combined with both motion\nand scene features, we construct a ternary intention-aware attention to\nsupervise the generation to match where the human wants to reach. Meanwhile, we\nintroduce semantic coherence-aware attention to explicitly distinguish the\nsalient point clouds and the underlying ones, to ensure a reasonable\ninteraction of the generated sequence with the 3D scene. On two real-world\nbenchmarks, the proposed method achieves state-of-the-art performance both in\n3D human pose and trajectory prediction.\n","authors":["Zhenyu Lou","Qiongjie Cui","Haofan Wang","Xu Tang","Hong Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.02911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02906v1","updated":"2024-05-05T12:11:33Z","published":"2024-05-05T12:11:33Z","title":"SalFAU-Net: Saliency Fusion Attention U-Net for Salient Object Detection","summary":" Salient object detection (SOD) remains an important task in computer vision,\nwith applications ranging from image segmentation to autonomous driving. Fully\nconvolutional network (FCN)-based methods have made remarkable progress in\nvisual saliency detection over the last few decades. However, these methods\nhave limitations in accurately detecting salient objects, particularly in\nchallenging scenes with multiple objects, small objects, or objects with low\nresolutions. To address this issue, we proposed a Saliency Fusion Attention\nU-Net (SalFAU-Net) model, which incorporates a saliency fusion module into each\ndecoder block of the attention U-net model to generate saliency probability\nmaps from each decoder block. SalFAU-Net employs an attention mechanism to\nselectively focus on the most informative regions of an image and suppress\nnon-salient regions. 
We train SalFAU-Net on the DUTS dataset using a binary\ncross-entropy loss function. We conducted experiments on six popular SOD\nevaluation datasets to evaluate the effectiveness of the proposed method. The\nexperimental results demonstrate that our method, SalFAU-Net, achieves\ncompetitive performance compared to other methods in terms of mean absolute\nerror (MAE), F-measure, s-measure, and e-measure.\n","authors":["Kassaw Abraham Mulat","Zhengyong Feng","Tegegne Solomon Eshetie","Ahmed Endris Hasen"],"pdf_url":"https://arxiv.org/pdf/2405.02906v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.13516v4","updated":"2024-05-05T12:05:53Z","published":"2024-01-24T15:14:05Z","title":"Delocate: Detection and Localization for Deepfake Videos with\n Randomly-Located Tampered Traces","summary":" Deepfake videos are becoming increasingly realistic, showing few tampering\ntraces on facial areas that vary between frames. Consequently, existing Deepfake\ndetection methods struggle to detect unknown domain Deepfake videos while\naccurately locating the tampered region. To address this limitation, we propose\nDelocate, a novel Deepfake detection model that can both recognize and localize\nunknown domain Deepfake videos. Our method consists of two stages named\nrecovering and localization. In the recovering stage, the model randomly masks\nregions of interest (ROIs) and reconstructs real faces without tampering\ntraces, leading to a relatively good recovery effect for real faces and a poor\nrecovery effect for fake faces. In the localization stage, the output of the\nrecovery phase and the forgery ground truth mask serve as supervision to guide\nthe forgery localization process. This process strategically emphasizes the\nrecovery phase of fake faces with poor recovery, facilitating the localization\nof tampered regions. Our extensive experiments on four widely used benchmark\ndatasets demonstrate that Delocate not only excels in localizing tampered areas\nbut also enhances cross-domain detection performance.\n","authors":["Juan Hu","Xin Liao","Difei Gao","Satoshi Tsutsui","Qian Wang","Zheng Qin","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2401.13516v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.09921,\n arXiv:2305.05943"},{"id":"http://arxiv.org/abs/2401.10731v4","updated":"2024-05-05T11:41:59Z","published":"2024-01-19T14:49:42Z","title":"Removal and Selection: Improving RGB-Infrared Object Detection via\n Coarse-to-Fine Fusion","summary":" Object detection in visible (RGB) and infrared (IR) images has been widely\napplied in recent years. Leveraging the complementary characteristics of RGB\nand IR images, the object detector provides reliable and robust object\nlocalization from day to night. Most existing fusion strategies directly input\nRGB and IR images into deep neural networks, leading to inferior detection\nperformance. However, the RGB and IR features have modality-specific noise,\nthese strategies will exacerbate the fused features along with the propagation.\nInspired by the mechanism of the human brain processing multimodal information,\nin this paper, we introduce a new coarse-to-fine perspective to purify and fuse\ntwo modality features. Specifically, following this perspective, we design a\nRedundant Spectrum Removal module to coarsely remove interfering information\nwithin each modality and a Dynamic Feature Selection module to finely select\nthe desired features for feature fusion. 
To verify the effectiveness of the\ncoarse-to-fine fusion strategy, we construct a new object detector called the\nRemoval and Selection Detector (RSDet). Extensive experiments on three RGB-IR\nobject detection datasets verify the superior performance of our method.\n","authors":["Tianyi Zhao","Maoxun Yuan","Feng Jiang","Nan Wang","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2401.10731v4.pdf","comment":"11pages, 11figures"},{"id":"http://arxiv.org/abs/2209.07163v2","updated":"2024-05-05T11:38:31Z","published":"2022-09-15T09:27:14Z","title":"Morphology-Aware Interactive Keypoint Estimation","summary":" Diagnosis based on medical images, such as X-ray images, often involves\nmanual annotation of anatomical keypoints. However, this process involves\nsignificant human efforts and can thus be a bottleneck in the diagnostic\nprocess. To fully automate this procedure, deep-learning-based methods have\nbeen widely proposed and have achieved high performance in detecting keypoints\nin medical images. However, these methods still have clinical limitations:\naccuracy cannot be guaranteed for all cases, and it is necessary for doctors to\ndouble-check all predictions of models. In response, we propose a novel deep\nneural network that, given an X-ray image, automatically detects and refines\nthe anatomical keypoints through a user-interactive system in which doctors can\nfix mispredicted keypoints with fewer clicks than needed during manual\nrevision. Using our own collected data and the publicly available AASCE\ndataset, we demonstrate the effectiveness of the proposed method in reducing\nthe annotation costs via extensive quantitative and qualitative results. A demo\nvideo of our approach is available on our project webpage.\n","authors":["Jinhee Kim","Taesung Kim","Taewoo Kim","Jaegul Choo","Dong-Wook Kim","Byungduk Ahn","In-Seok Song","Yoon-Ji Kim"],"pdf_url":"https://arxiv.org/pdf/2209.07163v2.pdf","comment":"MICCAI 2022. The first two authors contributed equally. The last two\n authors are the co-corresponding authors"},{"id":"http://arxiv.org/abs/2405.02882v1","updated":"2024-05-05T10:28:26Z","published":"2024-05-05T10:28:26Z","title":"A drone detector with modified backbone and multiple pyramid featuremaps\n enhancement structure (MDDPE)","summary":" This work presents a drone detector with modified backbone and multiple\npyramid feature maps enhancement structure (MDDPE). Novel feature map improvement\nmodules that use different levels of information to produce more robust and\ndiscriminatory features are proposed. These modules include the feature maps\nsupplement function and the feature maps recombination enhancement function. To\neffectively handle the drone characteristics, auxiliary supervisions that are\nimplemented in the early stages by employing tailored anchor designs are\nutilized. To further improve the modeling of real drone detection scenarios and\nthe initialization of the regressor, an updated anchor matching technique is\nintroduced to match anchors and ground truth drones as closely as feasible. 
To\nshow the proposed MDDPE's superiority over the most advanced detectors,\nextensive experiments are carried out using well-known drone detection\nbenchmarks.\n","authors":["Chenhao Wu"],"pdf_url":"https://arxiv.org/pdf/2405.02882v1.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.02880v1","updated":"2024-05-05T10:27:03Z","published":"2024-05-05T10:27:03Z","title":"Blending Distributed NeRFs with Tri-stage Robust Pose Optimization","summary":" Due to the limited model capacity, leveraging distributed Neural Radiance\nFields (NeRFs) for modeling extensive urban environments has become a\nnecessity. However, current distributed NeRF registration approaches encounter\naliasing artifacts, arising from discrepancies in rendering resolutions and\nsuboptimal pose precision. These factors collectively deteriorate the fidelity\nof pose estimation within NeRF frameworks, resulting in occlusion artifacts\nduring the NeRF blending stage. In this paper, we present a distributed NeRF\nsystem with tri-stage pose optimization. In the first stage, precise poses of\nimages are achieved by bundle adjusting Mip-NeRF 360 with a coarse-to-fine\nstrategy. In the second stage, we incorporate the inverting Mip-NeRF 360,\ncoupled with the truncated dynamic low-pass filter, to enable the achievement\nof robust and precise poses, termed Frame2Model optimization. On top of this,\nwe obtain a coarse transformation between NeRFs in different coordinate\nsystems. In the third stage, we fine-tune the transformation between NeRFs by\nModel2Model pose optimization. After obtaining precise transformation\nparameters, we proceed to implement NeRF blending, showcasing superior\nperformance metrics in both real-world and simulation scenarios. Codes and data\nwill be publicly available at https://github.com/boilcy/Distributed-NeRF.\n","authors":["Baijun Ye","Caiyun Liu","Xiaoyu Ye","Yuantao Chen","Yuhai Wang","Zike Yan","Yongliang Shi","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.02880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01055v2","updated":"2024-05-05T09:19:06Z","published":"2023-10-02T10:05:30Z","title":"Improved Crop and Weed Detection with Diverse Data Ensemble Learning in\n Agriculture","summary":" Modern agriculture heavily relies on Site-Specific Farm Management practices,\nnecessitating accurate detection, localization, and quantification of crops and\nweeds in the field, which can be achieved using deep learning techniques. In\nthis regard, crop and weed-specific binary segmentation models have shown\npromise. However, uncontrolled field conditions limit their performance from\none field to the other. To improve semantic model generalization, existing\nmethods augment and synthesize agricultural data to account for uncontrolled\nfield conditions. However, given highly varied field conditions, these methods\nhave limitations. To overcome the challenges of model deterioration in such\nconditions, we propose utilizing data specific to other crops and weeds for our\nspecific target problem. To achieve this, we propose a novel ensemble\nframework. Our approach involves utilizing different crop and weed models\ntrained on diverse datasets and employing a teacher-student configuration. By\nusing homogeneous stacking of base models and a trainable meta-architecture to\ncombine their outputs, we achieve significant improvements for Canola crops and\nKochia weeds on unseen test data, surpassing the performance of single semantic\nsegmentation models. 
We identify the UNET meta-architecture as the most\neffective in this context. Finally, through ablation studies, we demonstrate\nand validate the effectiveness of our proposed model. We observe that including\nbase models trained on other target crops and weeds can help generalize the\nmodel to capture varied field conditions. Lastly, we propose two novel datasets\nwith varied conditions for comparisons.\n","authors":["Muhammad Hamza Asad","Saeed Anwar","Abdul Bais"],"pdf_url":"https://arxiv.org/pdf/2310.01055v2.pdf","comment":"Accepted in CVPR Workshop as an Oral"},{"id":"http://arxiv.org/abs/2405.02859v1","updated":"2024-05-05T09:04:42Z","published":"2024-05-05T09:04:42Z","title":"MVIP-NeRF: Multi-view 3D Inpainting on NeRF Scenes via Diffusion Prior","summary":" Despite the emergence of successful NeRF inpainting methods built upon\nexplicit RGB and depth 2D inpainting supervisions, these methods are inherently\nconstrained by the capabilities of their underlying 2D inpainters. This is due\nto two key reasons: (i) independently inpainting constituent images results in\nview-inconsistent imagery, and (ii) 2D inpainters struggle to ensure\nhigh-quality geometry completion and alignment with inpainted RGB images.\n To overcome these limitations, we propose a novel approach called MVIP-NeRF\nthat harnesses the potential of diffusion priors for NeRF inpainting,\naddressing both appearance and geometry aspects. MVIP-NeRF performs joint\ninpainting across multiple views to reach a consistent solution, which is\nachieved via an iterative optimization process based on Score Distillation\nSampling (SDS). Apart from recovering the rendered RGB images, we also extract\nnormal maps as a geometric representation and define a normal SDS loss that\nmotivates accurate geometry inpainting and alignment with the appearance.\nAdditionally, we formulate a multi-view SDS score function to distill\ngenerative priors simultaneously from different view images, ensuring\nconsistent visual completion when dealing with large view variations. Our\nexperimental results show better appearance and geometry recovery than previous\nNeRF inpainting methods.\n","authors":["Honghua Chen","Chen Change Loy","Xingang Pan"],"pdf_url":"https://arxiv.org/pdf/2405.02859v1.pdf","comment":"14 pages, 10 figures, conference"},{"id":"http://arxiv.org/abs/2405.02857v1","updated":"2024-05-05T09:01:13Z","published":"2024-05-05T09:01:13Z","title":"I$^3$Net: Inter-Intra-slice Interpolation Network for Medical Slice\n Synthesis","summary":" Medical imaging is limited by acquisition time and scanning equipment. CT and\nMR volumes, reconstructed with thicker slices, are anisotropic with high\nin-plane resolution and low through-plane resolution. We reveal an intriguing\nphenomenon that due to the mentioned nature of data, performing slice-wise\ninterpolation from the axial view can yield greater benefits than performing\nsuper-resolution from other views. Based on this observation, we propose an\nInter-Intra-slice Interpolation Network (I$^3$Net), which fully explores\ninformation from high in-plane resolution and compensates for low through-plane\nresolution. The through-plane branch supplements the limited information\ncontained in low through-plane resolution from high in-plane resolution and\nenables continual and diverse feature learning. In-plane branch transforms\nfeatures to the frequency domain and enforces an equal learning opportunity for\nall frequency bands in a global context learning paradigm. 
We further propose a\ncross-view block to take advantage of the information from all three views\nonline. Extensive experiments on two public datasets demonstrate the\neffectiveness of I$^3$Net, which noticeably outperforms state-of-the-art\nsuper-resolution, video frame interpolation and slice interpolation methods by\na large margin. We achieve 43.90dB in PSNR, with at least 1.14dB improvement\nunder the upscale factor of $\\times$2 on the MSD dataset with faster inference.\nCode is available at\nhttps://github.com/DeepMed-Lab-ECNU/Medical-Image-Reconstruction.\n","authors":["Haofei Song","Xintian Mao","Jing Yu","Qingli Li","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02852v1","updated":"2024-05-05T08:55:00Z","published":"2024-05-05T08:55:00Z","title":"On Enhancing Brain Tumor Segmentation Across Diverse Populations with\n Convolutional Neural Networks","summary":" Brain tumor segmentation is a fundamental step in assessing a patient's\ncancer progression. However, manual segmentation demands significant expert\ntime to identify tumors in 3D multimodal brain MRI scans accurately. This\nreliance on manual segmentation makes the process prone to intra- and\ninter-observer variability. This work proposes a brain tumor segmentation\nmethod as part of the BraTS-GoAT challenge. The task is to segment tumors in\nbrain MRI scans automatically from various populations, such as adults,\npediatrics, and underserved sub-Saharan Africa. We employ a recent CNN\narchitecture for medical image segmentation, namely MedNeXt, as our baseline,\nand we implement extensive model ensembling and postprocessing for inference.\nOur experiments show that our method performs well on the unseen validation set\nwith an average DSC of 85.54% and HD95 of 27.88. The code is available on\nhttps://github.com/BioMedIA-MBZUAI/BraTS2024_BioMedIAMBZ.\n","authors":["Fadillah Maani","Anees Ur Rehman Hashmi","Numan Saeed","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2405.02852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02844v1","updated":"2024-05-05T08:28:07Z","published":"2024-05-05T08:28:07Z","title":"SMCD: High Realism Motion Style Transfer via Mamba-based Diffusion","summary":" Motion style transfer is a significant research direction in multimedia\napplications. It enables the rapid switching of different styles of the same\nmotion for virtual digital humans, thus vastly increasing the diversity and\nrealism of movements. It is widely applied in multimedia scenarios such as\nmovies, games, and the Metaverse. However, most of the current work in this\nfield adopts the GAN, which may lead to instability and convergence issues,\nmaking the final generated motion sequence somewhat chaotic and unable to\nreflect a highly realistic and natural style. To address these problems, we\nconsider style motion as a condition and propose the Style Motion Conditioned\nDiffusion (SMCD) framework for the first time, which can more comprehensively\nlearn the style features of motion. Moreover, we apply the Mamba model for the\nfirst time in the motion style transfer field, introducing the Motion Style\nMamba (MSM) module to handle longer motion sequences. Thirdly, aiming at the\nSMCD framework, we propose Diffusion-based Content Consistency Loss and Content\nConsistency Loss to assist the overall framework's training. Finally, we\nconduct extensive experiments. 
The results reveal that our method surpasses\nstate-of-the-art methods in both qualitative and quantitative comparisons,\ncapable of generating more realistic motion sequences.\n","authors":["Ziyun Qian","Zeyu Xiao","Zhenyi Wu","Dingkang Yang","Mingcheng Li","Shunli Wang","Shuaibing Wang","Dongliang Kou","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02843v1","updated":"2024-05-05T08:19:04Z","published":"2024-05-05T08:19:04Z","title":"Residual-Conditioned Optimal Transport: Towards Structure-preserving\n Unpaired and Paired Image Restoration","summary":" Deep learning-based image restoration methods have achieved promising\nperformance. However, how to faithfully preserve the structure of the original\nimage remains challenging. To address this challenge, we propose a novel\nResidual-Conditioned Optimal Transport (RCOT) approach, which models the image\nrestoration as an optimal transport (OT) problem for both unpaired and paired\nsettings, integrating the transport residual as a unique degradation-specific\ncue for both the transport cost and the transport map. Specifically, we first\nformalize a Fourier residual-guided OT objective by incorporating the\ndegradation-specific information of the residual into the transport cost. Based\non the dual form of the OT formulation, we design the transport map as a\ntwo-pass RCOT map that comprises a base model and a refinement process, in\nwhich the transport residual is computed by the base model in the first pass\nand then encoded as a degradation-specific embedding to condition the\nsecond-pass restoration. By duality, the RCOT problem is transformed into a\nminimax optimization problem, which can be solved by adversarially training\nneural networks. Extensive experiments on multiple restoration tasks show the\neffectiveness of our approach in terms of both distortion measures and\nperceptual quality. Particularly, RCOT restores images with more faithful\nstructural details compared to state-of-the-art methods.\n","authors":["Xiaole Tang","Xin Hu","Xiang Gu","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2405.02843v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2211.13726v4","updated":"2024-05-05T08:17:30Z","published":"2022-11-24T17:26:27Z","title":"Lightweight Event-based Optical Flow Estimation via Iterative Deblurring","summary":" Inspired by frame-based methods, state-of-the-art event-based optical flow\nnetworks rely on the explicit construction of correlation volumes, which are\nexpensive to compute and store, rendering them unsuitable for robotic\napplications with limited compute and energy budget. Moreover, correlation\nvolumes scale poorly with resolution, prohibiting them from estimating\nhigh-resolution flow. We observe that the spatiotemporally continuous traces of\nevents provide a natural search direction for seeking pixel correspondences,\nobviating the need to rely on gradients of explicit correlation volumes as such\nsearch directions. We introduce IDNet (Iterative Deblurring Network), a\nlightweight yet high-performing event-based optical flow network directly\nestimating flow from event traces without using correlation volumes. We further\npropose two iterative update schemes: \"ID\" which iterates over the same batch\nof events, and \"TID\" which iterates over time with streaming events in an\nonline fashion. Our top-performing ID model sets a new state of the art on DSEC\nbenchmark. 
Meanwhile, the base ID model is competitive with prior arts while\nusing 80% fewer parameters, consuming 20x less memory footprint and running 40%\nfaster on the NVidia Jetson Xavier NX. Furthermore, the TID model is even more\nefficient offering an additional 5x faster inference speed and 8 ms ultra-low\nlatency at the cost of only a 9% performance drop, making it the only model\namong current literature capable of real-time operation while maintaining\ndecent performance.\n","authors":["Yilun Wu","Federico Paredes-Vallés","Guido C. H. E. de Croon"],"pdf_url":"https://arxiv.org/pdf/2211.13726v4.pdf","comment":"Accepted to IEEE International Conference on Robotics and Automation\n (ICRA'24), Yokohama, Japan, May 13-17, 2024. arXiv revision includes\n additional ablation studies results"},{"id":"http://arxiv.org/abs/2404.12725v2","updated":"2024-05-05T08:00:17Z","published":"2024-04-19T09:08:44Z","title":"Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual\n Target Speech Extraction","summary":" The integration of visual cues has revitalized the performance of the target\nspeech extraction task, elevating it to the forefront of the field.\nNevertheless, this multi-modal learning paradigm often encounters the challenge\nof modality imbalance. In audio-visual target speech extraction tasks, the\naudio modality tends to dominate, potentially overshadowing the importance of\nvisual guidance. To tackle this issue, we propose AVSepChain, drawing\ninspiration from the speech chain concept. Our approach partitions the\naudio-visual target speech extraction task into two stages: speech perception\nand speech production. In the speech perception stage, audio serves as the\ndominant modality, while visual information acts as the conditional modality.\nConversely, in the speech production stage, the roles are reversed. This\ntransformation of modality status aims to alleviate the problem of modality\nimbalance. Additionally, we introduce a contrastive semantic matching loss to\nensure that the semantic information conveyed by the generated speech aligns\nwith the semantic information conveyed by lip movements during the speech\nproduction stage. Through extensive experiments conducted on multiple benchmark\ndatasets for audio-visual target speech extraction, we showcase the superior\nperformance achieved by our proposed method.\n","authors":["Zhaoxi Mu","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12725v2.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2405.02834v1","updated":"2024-05-05T07:21:17Z","published":"2024-05-05T07:21:17Z","title":"Scene-Adaptive Person Search via Bilateral Modulations","summary":" Person search aims to localize a specific target person from a gallery set of\nimages with various scenes. As the scene of a moving pedestrian changes, the\ncaptured person image inevitably brings in lots of background noise and\nforeground noise on the person feature, which are completely unrelated to the\nperson identity, leading to severe performance degeneration. To address this\nissue, we present a Scene-Adaptive Person Search (SEAS) model by introducing\nbilateral modulations to simultaneously eliminate scene noise and maintain a\nconsistent person representation to adapt to various scenes. 
In SEAS, a\nBackground Modulation Network (BMN) is designed to encode the feature extracted\nfrom the detected bounding box into a multi-granularity embedding, which\nreduces the input of background noise from multiple levels in a norm-aware manner.\nAdditionally, to mitigate the effect of foreground noise on the person feature,\nSEAS introduces a Foreground Modulation Network (FMN) to compute the clutter\nreduction offset for the person embedding based on the feature map of the scene\nimage. By bilateral modulations on both background and foreground in an\nend-to-end manner, SEAS obtains consistent feature representations without\nscene noise. SEAS can achieve state-of-the-art (SOTA) performance on two\nbenchmark datasets, CUHK-SYSU with 97.1\\% mAP and PRW with 60.5\\% mAP. The code\nis available at https://github.com/whbdmu/SEAS.\n","authors":["Yimin Jiang","Huibing Wang","Jinjia Peng","Xianping Fu","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02832v1","updated":"2024-05-05T07:15:47Z","published":"2024-05-05T07:15:47Z","title":"Fast One-Stage Unsupervised Domain Adaptive Person Search","summary":" Unsupervised person search aims to localize a particular target person from a\ngallery set of scene images without annotations, which is extremely challenging\ndue to the unexpected variations of the unlabeled domains. However, most\nexisting methods are dedicated to developing multi-stage models to adapt to domain\nvariations while using clustering for iterative model training, which\ninevitably increases model complexity. To address this issue, we propose a Fast\nOne-stage Unsupervised person Search (FOUS) which complementarily integrates\ndomain adaptation with label adaptation in an end-to-end manner without\niterative clustering. To minimize the domain discrepancy, FOUS introduces an\nAttention-based Domain Alignment Module (ADAM) which can not only align various\ndomains for both detection and ReID tasks but also construct an attention\nmechanism to reduce the adverse impacts of low-quality candidates resulting\nfrom unsupervised detection. Moreover, to avoid the redundant iterative\nclustering mode, FOUS adopts a prototype-guided labeling method which minimizes\nredundant correlation computations for partial samples and assigns noisy coarse\nlabel groups efficiently. The coarse label groups will be continuously refined\nvia a label-flexible training network with an adaptive selection strategy. With\nthe adapted domains and labels, FOUS can achieve the state-of-the-art (SOTA)\nperformance on two benchmark datasets, CUHK-SYSU and PRW. The code is available\nat https://github.com/whbdmu/FOUS.\n","authors":["Tianxiang Cui","Huibing Wang","Jinjia Peng","Ruoxi Deng","Xianping Fu","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02830v1","updated":"2024-05-05T06:57:40Z","published":"2024-05-05T06:57:40Z","title":"You Only Need Half: Boosting Data Augmentation by Using Partial Content","summary":" We propose a novel data augmentation method termed You Only Need hAlf (YONA),\nwhich simplifies the augmentation process. YONA bisects an image, substitutes\none half with noise, and applies data augmentation techniques to the remaining\nhalf. This method reduces the redundant information in the original image,\nencourages neural networks to recognize objects from incomplete views, and\nsignificantly enhances neural networks' robustness. 
YONA is distinguished by\nbeing parameter-free and straightforward to apply, enhancing\nvarious existing data augmentation strategies, and thereby bolstering neural\nnetworks' robustness without additional computational cost. To demonstrate\nYONA's efficacy, extensive experiments were carried out. These experiments\nconfirm YONA's compatibility with diverse data augmentation methods and neural\nnetwork architectures, yielding substantial improvements in CIFAR\nclassification tasks, sometimes outperforming conventional image-level data\naugmentation methods. Furthermore, YONA markedly increases the resilience of\nneural networks to adversarial attacks. Additional experiments exploring YONA's\nvariants conclusively show that masking half of an image optimizes performance.\nThe code is available at https://github.com/HansMoe/YONA.\n","authors":["Juntao Hu","Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2405.02830v1.pdf","comment":"Technical report, 16 pages"},{"id":"http://arxiv.org/abs/2405.02824v1","updated":"2024-05-05T06:21:58Z","published":"2024-05-05T06:21:58Z","title":"Adaptive Guidance Learning for Camouflaged Object Detection","summary":" Camouflaged object detection (COD) aims to segment objects visually embedded\nin their surroundings, which is a very challenging task due to the high\nsimilarity between the objects and the background. To address it, most methods\noften incorporate additional information (e.g., boundary, texture, and\nfrequency clues) to guide feature learning for better detecting camouflaged\nobjects from the background. Although progress has been made, these methods are\nbasically individually tailored to specific auxiliary cues, thus lacking\nadaptability and not consistently achieving high segmentation performance. To\nthis end, this paper proposes an adaptive guidance learning network, dubbed\n\\textit{AGLNet}, which is a unified end-to-end learnable model for exploring\nand adapting different additional cues in CNN models to guide accurate\ncamouflaged feature learning. Specifically, we first design a straightforward\nadditional information generation (AIG) module to learn additional camouflaged\nobject cues, which can be adapted for the exploration of effective camouflaged\nfeatures. Then we present a hierarchical feature combination (HFC) module to\ndeeply integrate additional cues and image features to guide camouflaged\nfeature learning in a multi-level fusion manner. Followed by a recalibration\ndecoder (RD), different features are further aggregated and refined for\naccurate object prediction. Extensive experiments on three widely used COD\nbenchmark datasets demonstrate that the proposed method achieves significant\nperformance improvements under different additional cues, and outperforms the\nrecent 20 state-of-the-art methods by a large margin. 
Our code will be made\npublicly available at: \\textcolor{blue}{{https://github.com/ZNan-Chen/AGLNet}}.\n","authors":["Zhennan Chen","Xuying Zhang","Tian-Zhu Xiang","Ying Tai"],"pdf_url":"https://arxiv.org/pdf/2405.02824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08392v3","updated":"2024-05-05T06:12:19Z","published":"2024-01-16T14:33:09Z","title":"DoraemonGPT: Toward Understanding Dynamic Scenes with Large Language\n Models (Exemplified as A Video Agent)","summary":" Recent LLM-driven visual agents mainly focus on solving image-based tasks,\nwhich limits their ability to understand dynamic scenes, making it far from\nreal-life applications like guiding students in laboratory experiments and\nidentifying their mistakes. Hence, this paper explores DoraemonGPT, a\ncomprehensive and conceptually elegant system driven by LLMs to understand\ndynamic scenes. Considering the video modality better reflects the\never-changing nature of real-world scenarios, we exemplify DoraemonGPT as a\nvideo agent. Given a video with a question/task, DoraemonGPT begins by\nconverting the input video into a symbolic memory that stores task-related\nattributes. This structured representation allows for spatial-temporal querying\nand reasoning by well-designed sub-task tools, resulting in concise\nintermediate results. Recognizing that LLMs have limited internal knowledge\nwhen it comes to specialized domains (e.g., analyzing the scientific principles\nunderlying experiments), we incorporate plug-and-play tools to assess external\nknowledge and address tasks across different domains. Moreover, a novel\nLLM-driven planner based on Monte Carlo Tree Search is introduced to explore\nthe large planning space for scheduling various tools. The planner iteratively\nfinds feasible solutions by backpropagating the result's reward, and multiple\nsolutions can be summarized into an improved final answer. We extensively\nevaluate DoraemonGPT's effectiveness on three benchmarks and several\nin-the-wild scenarios. The code will be released at\nhttps://github.com/z-x-yang/DoraemonGPT.\n","authors":["Zongxin Yang","Guikun Chen","Xiaodi Li","Wenguan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08392v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16167v3","updated":"2024-05-05T05:46:45Z","published":"2024-03-24T14:21:06Z","title":"Exploiting Semantic Reconstruction to Mitigate Hallucinations in\n Vision-Language Models","summary":" Hallucinations in vision-language models pose a significant challenge to\ntheir reliability, particularly in the generation of long captions. Current\nmethods fall short of accurately identifying and mitigating these\nhallucinations. To address this issue, we introduce ESREAL, a novel\nunsupervised learning framework designed to suppress the generation of\nhallucinations through accurate localization and penalization of hallucinated\ntokens. Initially, ESREAL creates a reconstructed image based on the generated\ncaption and aligns its corresponding regions with those of the original image.\nThis semantic reconstruction aids in identifying both the presence and type of\ntoken-level hallucinations within the generated caption. Subsequently, ESREAL\ncomputes token-level hallucination scores by assessing the semantic similarity\nof aligned regions based on the type of hallucination. Finally, ESREAL employs\na proximal policy optimization algorithm, where it selectively penalizes\nhallucinated tokens according to their token-level hallucination scores. 
Our\nframework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2\nby 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved\nsolely through signals derived from the image itself, without the need for any\nimage-text pairs.\n","authors":["Minchan Kim","Minyeong Kim","Junik Bae","Suhwan Choi","Sungkyung Kim","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2403.16167v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02815v1","updated":"2024-05-05T05:08:38Z","published":"2024-05-05T05:08:38Z","title":"Region-specific Risk Quantification for Interpretable Prognosis of\n COVID-19","summary":" The COVID-19 pandemic has strained global public health, necessitating\naccurate diagnosis and intervention to control disease spread and reduce\nmortality rates. This paper introduces an interpretable deep survival\nprediction model designed specifically for improved understanding and trust in\nCOVID-19 prognosis using chest X-ray (CXR) images. By integrating a large-scale\npretrained image encoder, Risk-specific Grad-CAM, and anatomical region\ndetection techniques, our approach produces regional interpretable outcomes\nthat effectively capture essential disease features while focusing on rare but\ncritical abnormal regions. Our model's predictive results provide enhanced\nclarity and transparency through risk area localization, enabling clinicians to\nmake informed decisions regarding COVID-19 diagnosis with better understanding\nof prognostic insights. We evaluate the proposed method on a multi-center\nsurvival dataset and demonstrate its effectiveness via quantitative and\nqualitative assessments, achieving superior C-indexes (0.764 and 0.727) and\ntime-dependent AUCs (0.799 and 0.691). These results suggest that our\nexplainable deep survival prediction model surpasses traditional survival\nanalysis methods in risk prediction, improving interpretability for clinical\ndecision making and enhancing AI system trustworthiness.\n","authors":["Zhusi Zhong","Jie Li","Zhuoqi Ma","Scott Collins","Harrison Bai","Paul Zhang","Terrance Healey","Xinbo Gao","Michael K. Atalay","Zhicheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2405.02815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12052v3","updated":"2024-05-05T05:07:34Z","published":"2023-11-18T10:22:44Z","title":"MagicPose: Realistic Human Poses and Facial Expressions Retargeting with\n Identity-aware Diffusion","summary":" In this work, we propose MagicPose, a diffusion-based model for 2D human pose\nand facial expression retargeting. Specifically, given a reference image, we\naim to generate a person's new images by controlling the poses and facial\nexpressions while keeping the identity unchanged. To this end, we propose a\ntwo-stage training strategy to disentangle human motions and appearance (e.g.,\nfacial expressions, skin tone and dressing), consisting of (1) the pre-training\nof an appearance-control block and (2) learning appearance-disentangled pose\ncontrol. Our novel design enables robust appearance control over generated\nhuman images, including body, facial attributes, and even background. By\nleveraging the prior knowledge of image diffusion models, MagicPose generalizes\nwell to unseen human identities and complex poses without the need for\nadditional fine-tuning. Moreover, the proposed model is easy to use and can be\nconsidered as a plug-in module/extension to Stable Diffusion. 
The code is\navailable at: https://github.com/Boese0601/MagicDance\n","authors":["Di Chang","Yichun Shi","Quankai Gao","Jessica Fu","Hongyi Xu","Guoxian Song","Qing Yan","Yizhe Zhu","Xiao Yang","Mohammad Soleymani"],"pdf_url":"https://arxiv.org/pdf/2311.12052v3.pdf","comment":"Accepted by ICML 2024. MagicPose and MagicDance are the same project.\n Website:https://boese0601.github.io/magicdance/\n Code:https://github.com/Boese0601/MagicDance"},{"id":"http://arxiv.org/abs/2401.11708v3","updated":"2024-05-05T04:50:54Z","published":"2024-01-22T06:16:29Z","title":"Mastering Text-to-Image Diffusion: Recaptioning, Planning, and\n Generating with Multimodal LLMs","summary":" Diffusion models have exhibited exceptional performance in text-to-image\ngeneration and editing. However, existing methods often face challenges when\nhandling complex text prompts that involve multiple objects with multiple\nattributes and relationships. In this paper, we propose a brand new\ntraining-free text-to-image generation/editing framework, namely Recaption,\nPlan and Generate (RPG), harnessing the powerful chain-of-thought reasoning\nability of multimodal LLMs to enhance the compositionality of text-to-image\ndiffusion models. Our approach employs the MLLM as a global planner to\ndecompose the process of generating complex images into multiple simpler\ngeneration tasks within subregions. We propose complementary regional diffusion\nto enable region-wise compositional generation. Furthermore, we integrate\ntext-guided image generation and editing within the proposed RPG in a\nclosed-loop fashion, thereby enhancing generalization ability. Extensive\nexperiments demonstrate our RPG outperforms state-of-the-art text-to-image\ndiffusion models, including DALL-E 3 and SDXL, particularly in multi-category\nobject composition and text-image semantic alignment. Notably, our RPG\nframework exhibits wide compatibility with various MLLM architectures (e.g.,\nMiniGPT-4) and diffusion backbones (e.g., ControlNet). Our code is available\nat: https://github.com/YangLing0818/RPG-DiffusionMaster\n","authors":["Ling Yang","Zhaochen Yu","Chenlin Meng","Minkai Xu","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11708v3.pdf","comment":"ICML 2024. Project:\n https://github.com/YangLing0818/RPG-DiffusionMaster"},{"id":"http://arxiv.org/abs/2304.00962v4","updated":"2024-05-05T04:44:55Z","published":"2023-04-03T13:30:04Z","title":"RegionPLC: Regional Point-Language Contrastive Learning for Open-World\n 3D Scene Understanding","summary":" We propose a lightweight and scalable Regional Point-Language Contrastive\nlearning framework, namely \\textbf{RegionPLC}, for open-world 3D scene\nunderstanding, aiming to identify and recognize open-set objects and\ncategories. Specifically, based on our empirical studies, we introduce a\n3D-aware SFusion strategy that fuses 3D vision-language pairs derived from\nmultiple 2D foundation models, yielding high-quality, dense region-level\nlanguage descriptions without human 3D annotations. Subsequently, we devise a\nregion-aware point-discriminative contrastive learning objective to enable\nrobust and effective 3D learning from dense regional language supervision. 
We\ncarry out extensive experiments on ScanNet, ScanNet200, and nuScenes datasets,\nand our model outperforms prior 3D open-world scene understanding approaches by\nan average of 17.2\\% and 9.1\\% for semantic and instance segmentation,\nrespectively, while maintaining greater scalability and lower resource demands.\nFurthermore, our method has the flexibility to be effortlessly integrated with\nlanguage models to enable open-ended grounded 3D reasoning without extra\ntask-specific training. Code is available at https://github.com/CVMI-Lab/PLA.\n","authors":["Jihan Yang","Runyu Ding","Weipeng Deng","Zhe Wang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2304.00962v4.pdf","comment":"To appear in CVPR2024. Project page:\n https://jihanyang.github.io/projects/RegionPLC"},{"id":"http://arxiv.org/abs/2405.02811v1","updated":"2024-05-05T04:44:41Z","published":"2024-05-05T04:44:41Z","title":"PVTransformer: Point-to-Voxel Transformer for Scalable 3D Object\n Detection","summary":" 3D object detectors for point clouds often rely on a pooling-based PointNet\nto encode sparse points into grid-like voxels or pillars. In this paper, we\nidentify that the common PointNet design introduces an information bottleneck\nthat limits 3D object detection accuracy and scalability. To address this\nlimitation, we propose PVTransformer: a transformer-based point-to-voxel\narchitecture for 3D detection. Our key idea is to replace the PointNet pooling\noperation with an attention module, leading to a better point-to-voxel\naggregation function. Our design respects the permutation invariance of sparse\n3D points while being more expressive than the pooling-based PointNet.\nExperimental results show our PVTransformer achieves much better performance\ncompared to the latest 3D object detectors. On the widely used Waymo Open\nDataset, our PVTransformer achieves state-of-the-art 76.5 mAPH L2,\noutperforming the prior art of SWFormer by +1.7 mAPH L2.\n","authors":["Zhaoqi Leng","Pei Sun","Tong He","Dragomir Anguelov","Mingxing Tan"],"pdf_url":"https://arxiv.org/pdf/2405.02811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02807v1","updated":"2024-05-05T04:00:03Z","published":"2024-05-05T04:00:03Z","title":"Kinematic analysis of structural mechanics based on convolutional neural\n network","summary":" We attempt to use a convolutional neural network to achieve kinematic analysis of\nplane bar structures. Using the 3dsMax animation software and the OpenCV module, we\nbuild an image dataset of geometrically stable systems and geometrically\nunstable systems. We construct and train a convolutional neural network model\nbased on the TensorFlow and Keras deep learning frameworks. The model\nachieves 100% accuracy on the training set, validation set, and test set. The\naccuracy on the additional test set is 93.7%, indicating that a convolutional\nneural network can learn and master the relevant knowledge of kinematic\nanalysis of structural mechanics. In the future, the generalization ability of\nthe model can be improved through the diversity of the dataset, which has the\npotential to surpass human experts for complex structures. Convolutional neural\nnetworks have certain practical value in the field of kinematic analysis of\nstructural mechanics. Using visualization technology, we reveal how\nthe convolutional neural network learns and recognizes structural features. 
Using a\npre-trained VGG16 model for feature extraction and fine-tuning, we found that\nits generalization ability is inferior to that of the self-built model.\n","authors":["Leye Zhang","Xiangxiang Tian","Hongjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02807v1.pdf","comment":"9 pages, 13 figures"},{"id":"http://arxiv.org/abs/2405.02797v1","updated":"2024-05-05T02:44:04Z","published":"2024-05-05T02:44:04Z","title":"Adapting to Distribution Shift by Visual Domain Prompt Generation","summary":" In this paper, we aim to adapt a model at test-time using a few unlabeled\nsamples to address distribution shifts. To tackle the challenges of extracting\ndomain knowledge from a limited amount of data, it is crucial to utilize\ncorrelated information from pre-trained backbones and source domains. Previous\nstudies fail to utilize recent foundation models with strong\nout-of-distribution generalization. Additionally, domain-centric designs are\nnot favored in their works. Furthermore, they separate the process of modelling\nsource domains and the process of learning to adapt into disjoint\ntraining stages. In this work, we propose an approach on top of the\npre-computed features of the foundation model. Specifically, we build a\nknowledge bank to learn the transferable knowledge from source domains.\nConditioned on few-shot target data, we introduce a domain prompt generator to\ncondense the knowledge bank into a domain-specific prompt. The domain prompt\nthen directs the visual features towards a particular domain via a guidance\nmodule. Moreover, we propose a domain-aware contrastive loss and employ\nmeta-learning to facilitate domain knowledge extraction. Extensive experiments\nare conducted to validate the domain knowledge extraction. The proposed method\noutperforms previous work on 5 large-scale benchmarks including WILDS and\nDomainNet.\n","authors":["Zhixiang Chi","Li Gu","Tao Zhong","Huan Liu","Yuanhao Yu","Konstantinos N Plataniotis","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02797v1.pdf","comment":"ICLR2024, code: https://github.com/Guliisgreat/VDPG"},{"id":"http://arxiv.org/abs/2309.14726v2","updated":"2024-05-05T02:33:38Z","published":"2023-09-26T07:36:20Z","title":"PLMM: Personal Large Language Models on Mobile Devices","summary":" Inspired by Federated Learning, in this paper, we propose personal large\nmodels that are distilled from traditional large language models but more\nadaptive to local users' personal information such as education background and\nhobbies. We classify the large language models into three levels: the personal\nlevel, expert level and traditional level. The personal level models are\nadaptive to users' personal information. They encrypt the users' input and\nprotect their privacy. The expert level models focus on merging specific\nknowledge such as finance, IT and art. The traditional models focus on\nuniversal knowledge discovery and upgrading the expert models. In such\nclassifications, the personal models directly interact with the user. For the\nwhole system, the personal models have users' (encrypted) personal information.\nMoreover, such models must be small enough to run on personal\ncomputers or mobile devices. Finally, they also have to respond in real-time\nfor a better user experience and produce high-quality results. 
The proposed\npersonal large models can be applied in a wide range of applications such as\nlanguage and vision tasks.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2309.14726v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2307.13221"},{"id":"http://arxiv.org/abs/2405.02793v1","updated":"2024-05-05T02:15:11Z","published":"2024-05-05T02:15:11Z","title":"ImageInWords: Unlocking Hyper-Detailed Image Descriptions","summary":" Despite the longstanding adage \"an image is worth a thousand words,\" creating\naccurate and hyper-detailed image descriptions for training Vision-Language\nmodels remains challenging. Current datasets typically have web-scraped\ndescriptions that are short, low-granularity, and often contain details\nunrelated to the visual content. As a result, models trained on such data\ngenerate descriptions replete with missing information, visual inconsistencies,\nand hallucinations. To address these issues, we introduce ImageInWords (IIW), a\ncarefully designed human-in-the-loop annotation framework for curating\nhyper-detailed image descriptions and a new dataset resulting from this\nprocess. We validate the framework through evaluations focused on the quality\nof the dataset and its utility for fine-tuning with considerations for\nreadability, comprehensiveness, specificity, hallucinations, and\nhuman-likeness. Our dataset significantly improves across these dimensions\ncompared to recently released datasets (+66%) and GPT-4V outputs (+48%).\nFurthermore, models fine-tuned with IIW data excel by +31% against prior work\nalong the same human evaluation dimensions. Given our fine-tuned models, we\nalso evaluate text-to-image generation and vision-language reasoning. Our\nmodel's descriptions can generate images closest to the original, as judged by\nboth automated and human metrics. We also find our model produces more\ncompositionally rich descriptions, outperforming the best baseline by up to 6%\non ARO, SVO-Probes, and Winoground datasets.\n","authors":["Roopal Garg","Andrea Burns","Burcu Karagol Ayan","Yonatan Bitton","Ceslee Montgomery","Yasumasa Onoe","Andrew Bunner","Ranjay Krishna","Jason Baldridge","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2405.02793v1.pdf","comment":"Webpage (https://google.github.io/imageinwords), GitHub\n (https://github.com/google/imageinwords), HuggingFace\n (https://huggingface.co/datasets/google/imageinwords)"},{"id":"http://arxiv.org/abs/2405.02792v1","updated":"2024-05-05T02:12:20Z","published":"2024-05-05T02:12:20Z","title":"Jointly Learning Spatial, Angular, and Temporal Information for Enhanced\n Lane Detection","summary":" This paper introduces a novel approach for enhanced lane detection by\nintegrating spatial, angular, and temporal information through light field\nimaging and novel deep learning models. Utilizing lenslet-inspired 2D light\nfield representations and LSTM networks, our method significantly improves lane\ndetection in challenging conditions. We demonstrate the efficacy of this\napproach with modified CNN architectures, showing superior performance over\ntraditional methods. 
Our findings suggest this integrated data approach could\nadvance lane detection technologies and inspire new models that leverage these\nmultidimensional insights for autonomous vehicle perception.\n","authors":["Muhammad Zeshan Alam"],"pdf_url":"https://arxiv.org/pdf/2405.02792v1.pdf","comment":"5 pages, 3 Figures, Accepted IEEE Conference on Signal Processing\n and Communications Applications"},{"id":"http://arxiv.org/abs/2405.02791v1","updated":"2024-05-05T02:11:57Z","published":"2024-05-05T02:11:57Z","title":"Efficient Text-driven Motion Generation via Latent Consistency Training","summary":" Motion diffusion models have recently proven successful for text-driven human\nmotion generation. Despite their excellent generation performance, they are\nchallenging to infer in real time due to the multi-step sampling mechanism that\ninvolves tens or hundreds of repeated function evaluation iterations. To this\nend, we investigate a motion latent consistency Training (MLCT) for motion\ngeneration to alleviate the computation and time consumption during iteration\ninference. It applies diffusion pipelines to low-dimensional motion latent\nspaces to mitigate the computational burden of each function evaluation.\nExplaining the diffusion process with probabilistic flow ordinary differential\nequation (PF-ODE) theory, the MLCT allows inference in extremely few steps from the\nprior distribution to the motion latent representation distribution by\nmaintaining consistency of the outputs over the trajectory of PF-ODE.\nEspecially, we introduce a quantization constraint to optimize motion latent\nrepresentations that are bounded, regular, and well-reconstructed compared to\ntraditional variational constraints. Furthermore, we propose a conditional\nPF-ODE trajectory simulation method, which improves the conditional generation\nperformance with minimal additional training costs. Extensive experiments on\ntwo human motion generation benchmarks show that the proposed model achieves\nstate-of-the-art performance with less than 10\\% time cost.\n","authors":["Mengxian Hu","Minghao Zhu","Xun Zhou","Qingqing Yan","Shu Li","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.02791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02787v1","updated":"2024-05-05T02:07:10Z","published":"2024-05-05T02:07:10Z","title":"Light Field Spatial Resolution Enhancement Framework","summary":" Light field (LF) imaging captures both angular and spatial light\ndistributions, enabling advanced photographic techniques. However, micro-lens\narray (MLA)-based cameras face a spatial-angular resolution tradeoff due to a\nsingle shared sensor. We propose a novel light field framework for resolution\nenhancement, employing a modular approach. The first module generates a\nhigh-resolution, all-in-focus image. The second module, a texture transformer\nnetwork, enhances the resolution of each light field perspective independently\nusing the output of the first module as a reference image. The final module\nleverages light field regularity to jointly improve resolution across all LF\nimage perspectives. Our approach demonstrates superior performance to existing\nmethods in both qualitative and quantitative evaluations.\n","authors":["Javeria Shabbir","Muhammad Zeshan. Alam","M. 
Umair Mukati"],"pdf_url":"https://arxiv.org/pdf/2405.02787v1.pdf","comment":"5 pages, 6 figures, accepted in IEEE Conference on Signal Processing\n and Communications Applications"},{"id":"http://arxiv.org/abs/2405.02785v1","updated":"2024-05-05T02:03:42Z","published":"2024-05-05T02:03:42Z","title":"Fused attention mechanism-based ore sorting network","summary":" Deep learning has had a significant impact on the identification and\nclassification of mineral resources, especially playing a key role in\nefficiently and accurately identifying different minerals, which is important\nfor improving the efficiency and accuracy of mining. However, traditional ore\nsorting methods often suffer from inefficiency and lack of accuracy,\nespecially in complex mineral environments. To address these challenges, this\nstudy proposes a method called OreYOLO, which incorporates an attentional\nmechanism and a multi-scale feature fusion strategy, based on ore data from\ngold and sulfide ores. By introducing the progressive feature pyramid\nstructure into YOLOv5 and embedding the attention mechanism in the feature\nextraction module, the detection performance and accuracy of the model are\ngreatly improved. In order to adapt to the diverse ore sorting scenarios and\nthe deployment requirements of edge devices, the network structure is designed\nto be lightweight, which achieves a low number of parameters (3.458M) and\ncomputational complexity (6.3GFLOPs) while maintaining high accuracy (99.3% and\n99.2%, respectively). In the experimental part, a target detection dataset\ncontaining 6000 images of gold and sulfuric iron ore is constructed for gold\nand sulfuric iron ore classification training, and several sets of comparison\nexperiments are set up, including the YOLO series, EfficientDet, Faster-RCNN,\nand CenterNet, etc., and the experiments prove that OreYOLO outperforms these\ncommonly used high-performance object detection architectures.\n","authors":["Junjiang Zhen","Bojun Xie"],"pdf_url":"https://arxiv.org/pdf/2405.02785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02784v1","updated":"2024-05-05T01:59:11Z","published":"2024-05-05T01:59:11Z","title":"MR-Transformer: Vision Transformer for Total Knee Replacement Prediction\n Using Magnetic Resonance Imaging","summary":" A transformer-based deep learning model, MR-Transformer, was developed for\ntotal knee replacement (TKR) prediction using magnetic resonance imaging (MRI).\nThe model incorporates ImageNet pre-training and captures three-dimensional\n(3D) spatial correlation from the MR images. The performance of the proposed\nmodel was compared to existing state-of-the-art deep learning models for knee\ninjury diagnosis using MRI. Knee MR scans of four different tissue contrasts\nfrom the Osteoarthritis Initiative and Multicenter Osteoarthritis Study\ndatabases were utilized in the study. Experimental results demonstrated the\nstate-of-the-art performance of the proposed model on TKR prediction using MRI.\n","authors":["Chaojie Zhang","Shengjia Chen","Ozkan Cigdem","Haresh Rengaraj Rajamohan","Kyunghyun Cho","Richard Kijowski","Cem M. 
Deniz"],"pdf_url":"https://arxiv.org/pdf/2405.02784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02782v1","updated":"2024-05-05T01:51:58Z","published":"2024-05-05T01:51:58Z","title":"A self-supervised text-vision framework for automated brain abnormality\n detection","summary":" Artificial neural networks trained on large, expert-labelled datasets are\nconsidered state-of-the-art for a range of medical image recognition tasks.\nHowever, categorically labelled datasets are time-consuming to generate and\nconstrain classification to a pre-defined, fixed set of classes. For\nneuroradiological applications in particular, this represents a barrier to\nclinical adoption. To address these challenges, we present a self-supervised\ntext-vision framework that learns to detect clinically relevant abnormalities\nin brain MRI scans by directly leveraging the rich information contained in\naccompanying free-text neuroradiology reports. Our training approach consisted\nof two-steps. First, a dedicated neuroradiological language model - NeuroBERT -\nwas trained to generate fixed-dimensional vector representations of\nneuroradiology reports (N = 50,523) via domain-specific self-supervised\nlearning tasks. Next, convolutional neural networks (one per MRI sequence)\nlearnt to map individual brain scans to their corresponding text vector\nrepresentations by optimising a mean square error loss. Once trained, our\ntext-vision framework can be used to detect abnormalities in unreported brain\nMRI examinations by scoring scans against suitable query sentences (e.g.,\n'there is an acute stroke', 'there is hydrocephalus' etc.), enabling a range of\nclassification-based applications including automated triage. Potentially, our\nframework could also serve as a clinical decision support tool, not only by\nsuggesting findings to radiologists and detecting errors in provisional\nreports, but also by retrieving and displaying examples of pathologies from\nhistorical examinations that could be relevant to the current case based on\ntextual descriptors.\n","authors":["David A. Wood","Emily Guilhem","Sina Kafiabadi","Ayisha Al Busaidi","Kishan Dissanayake","Ahmed Hammam","Nina Mansoor","Matthew Townend","Siddharth Agarwal","Yiran Wei","Asif Mazumder","Gareth J. Barker","Peter Sasieni","Sebastien Ourselin","James H. Cole","Thomas C. Booth"],"pdf_url":"https://arxiv.org/pdf/2405.02782v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.19652v2","updated":"2024-05-05T01:26:55Z","published":"2024-04-30T15:49:03Z","title":"VimTS: A Unified Video and Image Text Spotter for Enhancing the\n Cross-domain Generalization","summary":" Text spotting, a task involving the extraction of textual information from\nimage or video sequences, faces challenges in cross-domain adaption, such as\nimage-to-image and image-to-video generalization. In this paper, we introduce a\nnew method, termed VimTS, which enhances the generalization ability of the\nmodel by achieving better synergy among different tasks. Typically, we propose\na Prompt Queries Generation Module and a Tasks-aware Adapter to effectively\nconvert the original single-task model into a multi-task model suitable for\nboth image and video scenarios with minimal additional parameters. The Prompt\nQueries Generation Module facilitates explicit interaction between different\ntasks, while the Tasks-aware Adapter helps the model dynamically learn suitable\nfeatures for each task. 
Additionally, to further enable the model to learn\ntemporal information at a lower cost, we propose a synthetic video text dataset\n(VTD-368k) by leveraging the Content Deformation Fields (CoDeF) algorithm.\nNotably, our method outperforms the state-of-the-art method by an average of\n2.6% in six cross-domain benchmarks such as TT-to-IC15, CTW1500-to-TT, and\nTT-to-CTW1500. For video-level cross-domain adaption, our method even surpasses\nthe previous end-to-end video spotting method in ICDAR2015 video and DSText v2\nby an average of 5.5% on the MOTA metric, using only image-level data. We\nfurther demonstrate that existing Large Multimodal Models exhibit limitations\nin generating cross-domain scene text spotting, in contrast to our VimTS model\nwhich requires significantly fewer parameters and data. The code and datasets\nwill be made available at the https://VimTextSpotter.github.io.\n","authors":["Yuliang Liu","Mingxin Huang","Hao Yan","Linger Deng","Weijia Wu","Hao Lu","Chunhua Shen","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.19652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02781v1","updated":"2024-05-05T01:07:24Z","published":"2024-05-05T01:07:24Z","title":"Instantaneous Perception of Moving Objects in 3D","summary":" The perception of 3D motion of surrounding traffic participants is crucial\nfor driving safety. While existing works primarily focus on general large\nmotions, we contend that the instantaneous detection and quantification of\nsubtle motions is equally important as they indicate the nuances in driving\nbehavior that may be safety critical, such as behaviors near a stop sign of\nparking positions. We delve into this under-explored task, examining its unique\nchallenges and developing our solution, accompanied by a carefully designed\nbenchmark. Specifically, due to the lack of correspondences between consecutive\nframes of sparse Lidar point clouds, static objects might appear to be moving -\nthe so-called swimming effect. This intertwines with the true object motion,\nthereby posing ambiguity in accurate estimation, especially for subtle motions.\nTo address this, we propose to leverage local occupancy completion of object\npoint clouds to densify the shape cue, and mitigate the impact of swimming\nartifacts. The occupancy completion is learned in an end-to-end fashion\ntogether with the detection of moving objects and the estimation of their\nmotion, instantaneously as soon as objects start to move. Extensive experiments\ndemonstrate superior performance compared to standard 3D motion estimation\napproaches, particularly highlighting our method's specialized treatment of\nsubtle motions.\n","authors":["Di Liu","Bingbing Zhuang","Dimitris N. Metaxas","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2405.02781v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2405.04288v1","updated":"2024-05-05T21:08:49Z","published":"2024-05-05T21:08:49Z","title":"BetterNet: An Efficient CNN Architecture with Residual Learning and\n Attention for Precision Polyp Segmentation","summary":" Colorectal cancer contributes significantly to cancer-related mortality.\nTimely identification and elimination of polyps through colonoscopy screening\nis crucial in order to decrease mortality rates. Accurately detecting polyps in\ncolonoscopy images is difficult because of the differences in characteristics\nsuch as size, shape, texture, and similarity to surrounding tissues. 
Current\ndeep-learning methods often face difficulties in capturing long-range\nconnections necessary for segmentation. This research presents BetterNet, a\nconvolutional neural network (CNN) architecture that combines residual learning\nand attention methods to enhance the accuracy of polyp segmentation. The\nprimary characteristics encompass (1) a residual decoder architecture that\nfacilitates efficient gradient propagation and integration of multiscale\nfeatures. (2) channel and spatial attention blocks within the decoder block to\nconcentrate the learning process on the relevant areas of polyp regions. (3)\nAchieving state-of-the-art performance on polyp segmentation benchmarks while\nstill ensuring computational efficiency. (4) Thorough ablation tests have been\nconducted to confirm the influence of architectural components. (5) The model\ncode has been made available as open-source for further contribution. Extensive\nevaluations conducted on datasets such as Kvasir-SEG, CVC ClinicDB, Endoscene,\nEndoTect, and Kvasir-Sessile demonstrate that BetterNets outperforms current\nSOTA models in terms of segmentation accuracy by significant margins. The\nlightweight design enables real-time inference for various applications.\nBetterNet shows promise in integrating computer-assisted diagnosis techniques\nto enhance the detection of polyps and the early recognition of cancer. Link to\nthe code: https://github.com/itsOwen/BetterNet\n","authors":["Owen Singh","Sandeep Singh Sengar"],"pdf_url":"https://arxiv.org/pdf/2405.04288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.01503v2","updated":"2024-05-05T16:37:05Z","published":"2020-08-04T13:18:19Z","title":"Multiple Code Hashing for Efficient Image Retrieval","summary":" Due to its low storage cost and fast query speed, hashing has been widely\nused in large-scale image retrieval tasks. Hash bucket search returns data\npoints within a given Hamming radius to each query, which can enable search at\na constant or sub-linear time cost. However, existing hashing methods cannot\nachieve satisfactory retrieval performance for hash bucket search in complex\nscenarios, since they learn only one hash code for each image. More\nspecifically, by using one hash code to represent one image, existing methods\nmight fail to put similar image pairs to the buckets with a small Hamming\ndistance to the query when the semantic information of images is complex. As a\nresult, a large number of hash buckets need to be visited for retrieving\nsimilar images, based on the learned codes. This will deteriorate the\nefficiency of hash bucket search. In this paper, we propose a novel hashing\nframework, called multiple code hashing (MCH), to improve the performance of\nhash bucket search. The main idea of MCH is to learn multiple hash codes for\neach image, with each code representing a different region of the image.\nFurthermore, we propose a deep reinforcement learning algorithm to learn the\nparameters in MCH. 
To the best of our knowledge, this is the first work that\nproposes to learn multiple hash codes for each image in image retrieval.\nExperiments demonstrate that MCH can achieve a significant improvement in hash\nbucket search, compared with existing methods that learn only one hash code for\neach image.\n","authors":["Ming-Wei Li","Qing-Yuan Jiang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2008.01503v2.pdf","comment":"12 pages, 9 figures, 3 tables"}]},"2024-05-04T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.10136v2","updated":"2024-05-04T23:24:37Z","published":"2023-12-15T18:59:05Z","title":"Gradient-based Parameter Selection for Efficient Fine-Tuning","summary":" With the growing size of pre-trained models, full fine-tuning and storing all\nthe parameters for various downstream tasks is costly and infeasible. In this\npaper, we propose a new parameter-efficient fine-tuning method, Gradient-based\nParameter Selection (GPS), demonstrating that only tuning a few selected\nparameters from the pre-trained model while keeping the remainder of the model\nfrozen can generate similar or better performance compared with the full model\nfine-tuning method. Different from the existing popular and state-of-the-art\nparameter-efficient fine-tuning approaches, our method does not introduce any\nadditional parameters and computational costs during both the training and\ninference stages. Another advantage is the model-agnostic and non-destructive\nproperty, which eliminates the need for any other design specific to a\nparticular model. Compared with the full fine-tuning, GPS achieves 3.33%\n(91.78% vs. 88.45%, FGVC) and 9.61% (73.1% vs. 65.57%, VTAB) improvement of the\naccuracy with tuning only 0.36% parameters of the pre-trained model on average\nover 24 image classification tasks; it also demonstrates a significant\nimprovement of 17% and 16.8% in mDice and mIoU, respectively, on medical image\nsegmentation task. Moreover, GPS achieves state-of-the-art performance compared\nwith existing PEFT methods.\n","authors":["Zhi Zhang","Qizhe Zhang","Zijun Gao","Renrui Zhang","Ekaterina Shutova","Shiji Zhou","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02771v1","updated":"2024-05-04T23:16:48Z","published":"2024-05-04T23:16:48Z","title":"MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial\n Representation Learning","summary":" The volume of unlabelled Earth observation (EO) data is huge, but many\nimportant applications lack labelled training data. However, EO data offers the\nunique opportunity to pair data from different modalities and sensors\nautomatically based on geographic location and time, at virtually no human\nlabor cost. We seize this opportunity to create a diverse multi-modal\npretraining dataset at global scale. Using this new corpus of 1.2 million\nlocations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to\nlearn general-purpose representations for optical satellite images. Our\napproach builds on the ConvNeXt V2 architecture, a fully convolutional masked\nautoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we\ndemonstrate that our MP-MAE approach outperforms both MAEs pretrained on\nImageNet and MAEs pretrained on domain-specific satellite images. This is shown\non several downstream tasks including image classification and semantic\nsegmentation. 
We find that multi-modal pretraining notably improves the linear\nprobing performance, e.g. 4pp on BigEarthNet and 16pp on So2Sat, compared to\npretraining on optical satellite images only. We show that this also leads to\nbetter label and parameter efficiency which are crucial aspects in global scale\napplications.\n","authors":["Vishal Nedungadi","Ankit Kariryaa","Stefan Oehmcke","Serge Belongie","Christian Igel","Nico Lang"],"pdf_url":"https://arxiv.org/pdf/2405.02771v1.pdf","comment":"Data and code is available on the project page:\n https://vishalned.github.io/mmearth"},{"id":"http://arxiv.org/abs/2404.05468v4","updated":"2024-05-04T22:46:26Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made strong progress in the past decade, thanks to\nthe availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. The resulting pipeline we call Mind-to-Image marks a step towards\ncreating a technology that allow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v4.pdf","comment":"Pre-print to be updated. Work in progress"},{"id":"http://arxiv.org/abs/2405.02766v1","updated":"2024-05-04T22:02:58Z","published":"2024-05-04T22:02:58Z","title":"Beyond Unimodal Learning: The Importance of Integrating Multiple\n Modalities for Lifelong Learning","summary":" While humans excel at continual learning (CL), deep neural networks (DNNs)\nexhibit catastrophic forgetting. A salient feature of the brain that allows\neffective CL is that it utilizes multiple modalities for learning and\ninference, which is underexplored in DNNs. Therefore, we study the role and\ninteractions of multiple modalities in mitigating forgetting and introduce a\nbenchmark for multimodal continual learning. Our findings demonstrate that\nleveraging multiple views and complementary information from multiple\nmodalities enables the model to learn more accurate and robust representations.\nThis makes the model less vulnerable to modality-specific regularities and\nconsiderably mitigates forgetting. 
Furthermore, we observe that individual\nmodalities exhibit varying degrees of robustness to distribution shift.\nFinally, we propose a method for integrating and aligning the information from\ndifferent modalities by utilizing the relational structural similarities\nbetween the data points in each modality. Our method sets a strong baseline\nthat enables both single- and multimodal inference. Our study provides a\npromising case for further exploring the role of multiple modalities in\nenabling CL and provides a standard benchmark for future research.\n","authors":["Fahad Sarfraz","Bahram Zonooz","Elahe Arani"],"pdf_url":"https://arxiv.org/pdf/2405.02766v1.pdf","comment":"Accepted at 3rd Conference on Lifelong Learning Agents (CoLLAs), 2024"},{"id":"http://arxiv.org/abs/2405.02762v1","updated":"2024-05-04T21:55:33Z","published":"2024-05-04T21:55:33Z","title":"TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for\n Dynamic UAV-based Scenes","summary":" In this paper, we present a new approach to bridge the domain gap between\nsynthetic and real-world data for un- manned aerial vehicle (UAV)-based\nperception. Our formu- lation is designed for dynamic scenes, consisting of\nmoving objects or human actions, where the goal is to recognize the pose or\nactions. We propose an extension of K-Planes Neural Radiance Field (NeRF),\nwherein our algorithm stores a set of tiered feature vectors. The tiered\nfeature vectors are generated to effectively model conceptual information about\na scene as well as an image decoder that transforms output feature maps into\nRGB images. Our technique leverages the information amongst both static and\ndynamic objects within a scene and is able to capture salient scene attributes\nof high altitude videos. We evaluate its performance on challenging datasets,\nincluding Okutama Action and UG2, and observe considerable improvement in\naccuracy over state of the art aerial perception algorithms.\n","authors":["Christopher Maxey","Jaehoon Choi","Yonghan Lee","Hyungtae Lee","Dinesh Manocha","Heesung Kwon"],"pdf_url":"https://arxiv.org/pdf/2405.02762v1.pdf","comment":"8 pages, submitted to IROS2024"},{"id":"http://arxiv.org/abs/2401.16663v2","updated":"2024-05-04T21:17:37Z","published":"2024-01-30T01:28:36Z","title":"VR-GS: A Physical Dynamics-Aware Interactive Gaussian Splatting System\n in Virtual Reality","summary":" As consumer Virtual Reality (VR) and Mixed Reality (MR) technologies gain\nmomentum, there's a growing focus on the development of engagements with 3D\nvirtual content. Unfortunately, traditional techniques for content creation,\nediting, and interaction within these virtual spaces are fraught with\ndifficulties. They tend to be not only engineering-intensive but also require\nextensive expertise, which adds to the frustration and inefficiency in virtual\nobject manipulation. Our proposed VR-GS system represents a leap forward in\nhuman-centered 3D content interaction, offering a seamless and intuitive user\nexperience. By developing a physical dynamics-aware interactive Gaussian\nSplatting in a Virtual Reality setting, and constructing a highly efficient\ntwo-level embedding strategy alongside deformable body simulations, VR-GS\nensures real-time execution with highly realistic dynamic responses. 
The\ncomponents of our Virtual Reality system are designed for high efficiency and\neffectiveness, starting from detailed scene reconstruction and object\nsegmentation, advancing through multi-view image in-painting, and extending to\ninteractive physics-based editing. The system also incorporates real-time\ndeformation embedding and dynamic shadow casting, ensuring a comprehensive and\nengaging virtual experience.Our project page is available at:\nhttps://yingjiang96.github.io/VR-GS/.\n","authors":["Ying Jiang","Chang Yu","Tianyi Xie","Xuan Li","Yutao Feng","Huamin Wang","Minchen Li","Henry Lau","Feng Gao","Yin Yang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.16663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02751v1","updated":"2024-05-04T20:49:06Z","published":"2024-05-04T20:49:06Z","title":"Deep Image Restoration For Image Anti-Forensics","summary":" While image forensics is concerned with whether an image has been tampered\nwith, image anti-forensics attempts to prevent image forensics methods from\ndetecting tampered images. The competition between these two fields started\nlong before the advancement of deep learning. JPEG compression, blurring and\nnoising, which are simple methods by today's standards, have long been used for\nanti-forensics and have been the subject of much research in both forensics and\nanti-forensics. Although these traditional methods are old, they make it\ndifficult to detect fake images and are used for data augmentation in training\ndeep image forgery detection models. In addition to making the image difficult\nto detect, these methods leave traces on the image and consequently degrade the\nimage quality. Separate image forensics methods have also been developed to\ndetect these traces. In this study, we go one step further and improve the\nimage quality after these methods with deep image restoration models and make\nit harder to detect the forged image. We evaluate the impact of these methods\non image quality. We then test both our proposed methods with deep learning and\nmethods without deep learning on the two best existing image manipulation\ndetection models. In the obtained results, we show how existing image forgery\ndetection models fail against the proposed methods. Code implementation will be\npublicly available at https://github.com/99eren99/DIRFIAF .\n","authors":["Eren Tahir","Mert Bal"],"pdf_url":"https://arxiv.org/pdf/2405.02751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17444v3","updated":"2024-05-04T19:28:10Z","published":"2023-09-29T17:54:46Z","title":"LLM-grounded Video Diffusion Models","summary":" Text-conditioned diffusion models have emerged as a promising tool for neural\nvideo generation. However, current models still struggle with intricate\nspatiotemporal prompts and often generate restricted or incorrect motion. To\naddress these limitations, we introduce LLM-grounded Video Diffusion (LVD).\nInstead of directly generating videos from the text inputs, LVD first leverages\na large language model (LLM) to generate dynamic scene layouts based on the\ntext inputs and subsequently uses the generated layouts to guide a diffusion\nmodel for video generation. We show that LLMs are able to understand complex\nspatiotemporal dynamics from text alone and generate layouts that align closely\nwith both the prompts and the object motion patterns typically observed in the\nreal world. We then propose to guide video diffusion models with these layouts\nby adjusting the attention maps. 
Our approach is training-free and can be\nintegrated into any video diffusion model that admits classifier guidance. Our\nresults demonstrate that LVD significantly outperforms its base video diffusion\nmodel and several strong baseline methods in faithfully generating videos with\nthe desired attributes and motion patterns.\n","authors":["Long Lian","Baifeng Shi","Adam Yala","Trevor Darrell","Boyi Li"],"pdf_url":"https://arxiv.org/pdf/2309.17444v3.pdf","comment":"ICLR 2024. Project Page:\n https://llm-grounded-video-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2403.04161v4","updated":"2024-05-04T18:29:01Z","published":"2024-03-07T02:40:42Z","title":"SWAP-NAS: sample-wise activation patterns for ultra-fast NAS","summary":" Training-free metrics (a.k.a. zero-cost proxies) are widely used to avoid\nresource-intensive neural network training, especially in Neural Architecture\nSearch (NAS). Recent studies show that existing training-free metrics have\nseveral limitations, such as limited correlation and poor generalisation across\ndifferent search spaces and tasks. Hence, we propose Sample-Wise Activation\nPatterns and its derivative, SWAP-Score, a novel high-performance training-free\nmetric. It measures the expressivity of networks over a batch of input samples.\nThe SWAP-Score is strongly correlated with ground-truth performance across\nvarious search spaces and tasks, outperforming 15 existing training-free\nmetrics on NAS-Bench-101/201/301 and TransNAS-Bench-101. The SWAP-Score can be\nfurther enhanced by regularisation, which leads to even higher correlations in\ncell-based search space and enables model size control during the search. For\nexample, Spearman's rank correlation coefficient between regularised SWAP-Score\nand CIFAR-100 validation accuracies on NAS-Bench-201 networks is 0.90,\nsignificantly higher than 0.80 from the second-best metric, NWOT. When\nintegrated with an evolutionary algorithm for NAS, our SWAP-NAS achieves\ncompetitive performance on CIFAR-10 and ImageNet in approximately 6 minutes and\n9 minutes of GPU time respectively.\n","authors":["Yameng Peng","Andy Song","Haytham M. Fayek","Vic Ciesielski","Xiaojun Chang"],"pdf_url":"https://arxiv.org/pdf/2403.04161v4.pdf","comment":"ICLR2024 Spotlight"},{"id":"http://arxiv.org/abs/2405.02730v1","updated":"2024-05-04T18:27:29Z","published":"2024-05-04T18:27:29Z","title":"U-DiTs: Downsample Tokens in U-Shaped Diffusion Transformers","summary":" Diffusion Transformers (DiTs) introduce the transformer architecture to\ndiffusion tasks for latent-space image generation. With an isotropic\narchitecture that chains a series of transformer blocks, DiTs demonstrate\ncompetitive performance and good scalability; but meanwhile, the abandonment of\nU-Net by DiTs and their following improvements is worth rethinking. To this\nend, we conduct a simple toy experiment by comparing a U-Net architectured DiT\nwith an isotropic one. It turns out that the U-Net architecture only gain a\nslight advantage amid the U-Net inductive bias, indicating potential\nredundancies within the U-Net-style DiT. Inspired by the discovery that U-Net\nbackbone features are low-frequency-dominated, we perform token downsampling on\nthe query-key-value tuple for self-attention and bring further improvements\ndespite a considerable amount of reduction in computation. 
Based on\nself-attention with downsampled tokens, we propose a series of U-shaped DiTs\n(U-DiTs) in the paper and conduct extensive experiments to demonstrate the\nextraordinary performance of U-DiT models. The proposed U-DiT could outperform\nDiT-XL/2 with only 1/6 of its computation cost. Codes are available at\nhttps://github.com/YuchuanTian/U-DiT.\n","authors":["Yuchuan Tian","Zhijun Tu","Hanting Chen","Jie Hu","Chao Xu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02730v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.02717v1","updated":"2024-05-04T17:24:09Z","published":"2024-05-04T17:24:09Z","title":"AFter: Attention-based Fusion Router for RGBT Tracking","summary":" Multi-modal feature fusion as a core investigative component of RGBT tracking\nemerges numerous fusion studies in recent years. However, existing RGBT\ntracking methods widely adopt fixed fusion structures to integrate multi-modal\nfeature, which are hard to handle various challenges in dynamic scenarios. To\naddress this problem, this work presents a novel \\emph{A}ttention-based\n\\emph{F}usion rou\\emph{ter} called AFter, which optimizes the fusion structure\nto adapt to the dynamic challenging scenarios, for robust RGBT tracking. In\nparticular, we design a fusion structure space based on the hierarchical\nattention network, each attention-based fusion unit corresponding to a fusion\noperation and a combination of these attention units corresponding to a fusion\nstructure. Through optimizing the combination of attention-based fusion units,\nwe can dynamically select the fusion structure to adapt to various challenging\nscenarios. Unlike complex search of different structures in neural architecture\nsearch algorithms, we develop a dynamic routing algorithm, which equips each\nattention-based fusion unit with a router, to predict the combination weights\nfor efficient optimization of the fusion structure. Extensive experiments on\nfive mainstream RGBT tracking datasets demonstrate the superior performance of\nthe proposed AFter against state-of-the-art RGBT trackers. We release the code\nin https://github.com/Alexadlu/AFter.\n","authors":["Andong Lu","Wanyu Wang","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2405.02717v1.pdf","comment":"Peer review"},{"id":"http://arxiv.org/abs/2401.01823v2","updated":"2024-05-04T16:44:32Z","published":"2024-01-03T16:38:56Z","title":"Detours for Navigating Instructional Videos","summary":" We introduce the video detours problem for navigating instructional videos.\nGiven a source video and a natural language query asking to alter the how-to\nvideo's current path of execution in a certain way, the goal is to find a\nrelated ''detour video'' that satisfies the requested alteration. To address\nthis challenge, we propose VidDetours, a novel video-language approach that\nlearns to retrieve the targeted temporal segments from a large repository of\nhow-to's using video-and-text conditioned queries. Furthermore, we devise a\nlanguage-based pipeline that exploits how-to video narration text to create\nweakly supervised training data. We demonstrate our idea applied to the domain\nof how-to cooking videos, where a user can detour from their current recipe to\nfind steps with alternate ingredients, tools, and techniques. 
Validating on a\nground truth annotated dataset of 16K samples, we show our model's significant\nimprovements over best available methods for video retrieval and question\nanswering, with recall rates exceeding the state of the art by 35%.\n","authors":["Kumar Ashutosh","Zihui Xue","Tushar Nagarajan","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2401.01823v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2405.02700v1","updated":"2024-05-04T16:06:50Z","published":"2024-05-04T16:06:50Z","title":"Towards a Scalable Identification of Novel Modes in Generative Models","summary":" An interpretable comparison of generative models requires the identification\nof sample types produced more frequently by each of the involved models. While\nseveral quantitative scores have been proposed in the literature to rank\ndifferent generative models, such score-based evaluations do not reveal the\nnuanced differences between the generative models in capturing various sample\ntypes. In this work, we propose a method called Fourier-based Identification of\nNovel Clusters (FINC) to identify modes produced by a generative model with a\nhigher frequency in comparison to a reference distribution. FINC provides a\nscalable stochastic algorithm based on random Fourier features to estimate the\neigenspace of kernel covariance matrices of two generative models and utilize\nthe principal eigendirections to detect the sample types present more\ndominantly in each model. We demonstrate the application of the FINC method to\nstandard computer vision datasets and generative model frameworks. Our\nnumerical results suggest the scalability and efficiency of the developed\nFourier-based method in highlighting the sample types captured with different\nfrequencies by widely-used generative models.\n","authors":["Jingwei Zhang","Mohammad Jalali","Cheuk Ting Li","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2405.02700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02698v1","updated":"2024-05-04T15:37:22Z","published":"2024-05-04T15:37:22Z","title":"Stable Diffusion Dataset Generation for Downstream Classification Tasks","summary":" Recent advances in generative artificial intelligence have enabled the\ncreation of high-quality synthetic data that closely mimics real-world data.\nThis paper explores the adaptation of the Stable Diffusion 2.0 model for\ngenerating synthetic datasets, using Transfer Learning, Fine-Tuning and\ngeneration parameter optimisation techniques to improve the utility of the\ndataset for downstream classification tasks. We present a class-conditional\nversion of the model that exploits a Class-Encoder and optimisation of key\ngeneration parameters. Our methodology led to synthetic datasets that, in a\nthird of cases, produced models that outperformed those trained on real\ndatasets.\n","authors":["Eugenio Lomurno","Matteo D'Oria","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2405.02698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02692v1","updated":"2024-05-04T15:04:06Z","published":"2024-05-04T15:04:06Z","title":"Diffeomorphic Transformer-based Abdomen MRI-CT Deformable Image\n Registration","summary":" This paper aims to create a deep learning framework that can estimate the\ndeformation vector field (DVF) for directly registering abdominal MRI-CT\nimages. The proposed method assumed a diffeomorphic deformation. 
By using\ntopology-preserved deformation features extracted from the probabilistic\ndiffeomorphic registration model, abdominal motion can be accurately obtained\nand utilized for DVF estimation. The model integrated Swin transformers, which\nhave demonstrated superior performance in motion tracking, into the\nconvolutional neural network (CNN) for deformation feature extraction. The\nmodel was optimized using a cross-modality image similarity loss and a surface\nmatching loss. To compute the image loss, a modality-independent neighborhood\ndescriptor (MIND) was used between the deformed MRI and CT images. The surface\nmatching loss was determined by measuring the distance between the warped\ncoordinates of the surfaces of contoured structures on the MRI and CT images.\nThe deformed MRI image was assessed against the CT image using the target\nregistration error (TRE), Dice similarity coefficient (DSC), and mean surface\ndistance (MSD) between the deformed contours of the MRI image and manual\ncontours of the CT image. When compared to only rigid registration, DIR with\nthe proposed method resulted in an increase of the mean DSC values of the liver\nand portal vein from 0.850 and 0.628 to 0.903 and 0.763, a decrease of the mean\nMSD of the liver from 7.216 mm to 3.232 mm, and a decrease of the TRE from\n26.238 mm to 8.492 mm. The proposed deformable image registration method based\non a diffeomorphic transformer provides an effective and efficient way to\ngenerate an accurate DVF from an MRI-CT image pair of the abdomen. It could be\nutilized in the current treatment planning workflow for liver radiotherapy.\n","authors":["Yang Lei","Luke A. Matkovic","Justin Roper","Tonghe Wang","Jun Zhou","Beth Ghavidel","Mark McDonald","Pretesh Patel","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2405.02692v1.pdf","comment":"18 pages and 4 figures"},{"id":"http://arxiv.org/abs/2405.02686v1","updated":"2024-05-04T14:57:28Z","published":"2024-05-04T14:57:28Z","title":"Boosting 3D Neuron Segmentation with 2D Vision Transformer Pre-trained\n on Natural Images","summary":" Neuron reconstruction, one of the fundamental tasks in neuroscience, rebuilds\nneuronal morphology from 3D light microscope imaging data. It plays a critical\nrole in analyzing the structure-function relationship of neurons in the nervous\nsystem. However, due to the scarcity of neuron datasets and high-quality SWC\nannotations, it is still challenging to develop robust segmentation methods for\nsingle neuron reconstruction. To address this limitation, we aim to distill the\nconsensus knowledge from massive natural image data to aid the segmentation\nmodel in learning the complex neuron structures. Specifically, in this work, we\npropose a novel training paradigm that leverages a 2D Vision Transformer model\npre-trained on large-scale natural images to initialize our Transformer-based\n3D neuron segmentation model with a tailored 2D-to-3D weight transferring\nstrategy. Our method builds a knowledge sharing connection between the abundant\nnatural and the scarce neuron image domains to improve the 3D neuron\nsegmentation ability in a data-efficiency manner. 
Evaluated on a popular\nbenchmark, BigNeuron, our method enhances neuron segmentation performance by\n8.71% over the model trained from scratch with the same amount of training\nsamples.\n","authors":["Yik San Cheng","Runkai Zhao","Heng Wang","Hanchuan Peng","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2405.02686v1.pdf","comment":"3 pages"},{"id":"http://arxiv.org/abs/2405.02678v1","updated":"2024-05-04T14:43:31Z","published":"2024-05-04T14:43:31Z","title":"Position Paper: Quo Vadis, Unsupervised Time Series Anomaly Detection?","summary":" The current state of machine learning scholarship in Timeseries Anomaly\nDetection (TAD) is plagued by the persistent use of flawed evaluation metrics,\ninconsistent benchmarking practices, and a lack of proper justification for the\nchoices made in novel deep learning-based model designs. Our paper presents a\ncritical analysis of the status quo in TAD, revealing the misleading track of\ncurrent research and highlighting problematic methods, and evaluation\npractices. Our position advocates for a shift in focus from pursuing only the\nnovelty in model design to improving benchmarking practices, creating\nnon-trivial datasets, and placing renewed emphasis on studying the utility of\nmodel architectures for specific tasks. Our findings demonstrate the need for\nrigorous evaluation protocols, the creation of simple baselines, and the\nrevelation that state-of-the-art deep anomaly detection models effectively\nlearn linear mappings. These findings suggest the need for more exploration and\ndevelopment of simple and interpretable TAD methods. The increment of model\ncomplexity in the state-of-the-art deep-learning based models unfortunately\noffers very little improvement. We offer insights and suggestions for the field\nto move forward.\n","authors":["M. Saquib Sarfraz","Mei-Yen Chen","Lukas Layer","Kunyu Peng","Marios Koulakis"],"pdf_url":"https://arxiv.org/pdf/2405.02678v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.02676v1","updated":"2024-05-04T14:32:13Z","published":"2024-05-04T14:32:13Z","title":"Hand-Object Interaction Controller (HOIC): Deep Reinforcement Learning\n for Reconstructing Interactions with Physics","summary":" Hand manipulating objects is an important interaction motion in our daily\nactivities. We faithfully reconstruct this motion with a single RGBD camera by\na novel deep reinforcement learning method to leverage physics. Firstly, we\npropose object compensation control which establishes direct object control to\nmake the network training more stable. Meanwhile, by leveraging the\ncompensation force and torque, we seamlessly upgrade the simple point contact\nmodel to a more physical-plausible surface contact model, further improving the\nreconstruction accuracy and physical correctness. Experiments indicate that\nwithout involving any heuristic physical rules, this work still successfully\ninvolves physics in the reconstruction of hand-object interactions which are\ncomplex motions hard to imitate with deep reinforcement learning. 
Our code and\ndata are available at https://github.com/hu-hy17/HOIC.\n","authors":["Haoyu Hu","Xinyu Yi","Zhe Cao","Jun-Hai Yong","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2405.02676v1.pdf","comment":"SIGGRAPH 2024 Conference Track"},{"id":"http://arxiv.org/abs/2310.01288v4","updated":"2024-05-04T14:21:01Z","published":"2023-10-02T15:41:35Z","title":"Offline Tracking with Object Permanence","summary":" To reduce the expensive labor cost for manual labeling autonomous driving\ndatasets, an alternative is to automatically label the datasets using an\noffline perception system. However, objects might be temporally occluded. Such\nocclusion scenarios in the datasets are common yet underexplored in offline\nauto labeling. In this work, we propose an offline tracking model that focuses\non occluded object tracks. It leverages the concept of object permanence which\nmeans objects continue to exist even if they are not observed anymore. The\nmodel contains three parts: a standard online tracker, a re-identification\n(Re-ID) module that associates tracklets before and after occlusion, and a\ntrack completion module that completes the fragmented tracks. The Re-ID module\nand the track completion module use the vectorized map as one of the inputs to\nrefine the tracking results with occlusion. The model can effectively recover\nthe occluded object trajectories. It achieves state-of-the-art performance in\n3D multi-object tracking by significantly improving the original online\ntracking result, showing its potential to be applied in offline auto labeling\nas a useful plugin to improve tracking by recovering occlusions.\n","authors":["Xianzhong Liu","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2310.01288v4.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024). Camera\n ready version with supplementary material"},{"id":"http://arxiv.org/abs/2405.02652v1","updated":"2024-05-04T12:37:07Z","published":"2024-05-04T12:37:07Z","title":"Deep Pulse-Signal Magnification for remote Heart Rate Estimation in\n Compressed Videos","summary":" Recent advancements in remote heart rate measurement (rPPG), motivated by\ndata-driven approaches, have significantly improved accuracy. However, certain\nchallenges, such as video compression, still remain: recovering the rPPG signal\nfrom highly compressed videos is particularly complex. Although several studies\nhave highlighted the difficulties and impact of video compression for this,\neffective solutions remain limited. In this paper, we present a novel approach\nto address the impact of video compression on rPPG estimation, which leverages\na pulse-signal magnification transformation to adapt compressed videos to an\nuncompressed data domain in which the rPPG signal is magnified. We validate the\neffectiveness of our model by exhaustive evaluations on two publicly available\ndatasets, UCLA-rPPG and UBFC-rPPG, employing both intra- and cross-database\nperformance at several compression rates. 
Additionally, we assess the\nrobustness of our approach on two additional highly compressed and widely-used\ndatasets, MAHNOB-HCI and COHFACE, which reveal outstanding heart rate\nestimation results.\n","authors":["Joaquim Comas","Adria Ruiz","Federico Sukno"],"pdf_url":"https://arxiv.org/pdf/2405.02652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02648v1","updated":"2024-05-04T12:22:02Z","published":"2024-05-04T12:22:02Z","title":"A Conformal Prediction Score that is Robust to Label Noise","summary":" Conformal Prediction (CP) quantifies network uncertainty by building a small\nprediction set with a pre-defined probability that the correct class is within\nthis set. In this study we tackle the problem of CP calibration based on a\nvalidation set with noisy labels. We introduce a conformal score that is robust\nto label noise. The noise-free conformal score is estimated using the noisy\nlabeled data and the noise level. In the test phase the noise-free score is\nused to form the prediction set. We applied the proposed algorithm to several\nstandard medical imaging classification datasets. We show that our method\noutperforms current methods by a large margin, in terms of the average size of\nthe prediction set, while maintaining the required coverage.\n","authors":["Coby Penso","Jacob Goldberger"],"pdf_url":"https://arxiv.org/pdf/2405.02648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04497v3","updated":"2024-05-04T09:48:32Z","published":"2022-12-08T18:59:57Z","title":"UNETR++: Delving into Efficient and Accurate 3D Medical Image\n Segmentation","summary":" Owing to the success of transformer models, recent works study their\napplicability in 3D medical segmentation tasks. Within the transformer models,\nthe self-attention mechanism is one of the main building blocks that strives to\ncapture long-range dependencies. However, the self-attention operation has\nquadratic complexity which proves to be a computational bottleneck, especially\nin volumetric medical imaging, where the inputs are 3D with numerous slices. In\nthis paper, we propose a 3D medical image segmentation approach, named UNETR++,\nthat offers both high-quality segmentation masks as well as efficiency in terms\nof parameters, compute cost, and inference speed. The core of our design is the\nintroduction of a novel efficient paired attention (EPA) block that efficiently\nlearns spatial and channel-wise discriminative features using a pair of\ninter-dependent branches based on spatial and channel attention. Our spatial\nattention formulation is efficient having linear complexity with respect to the\ninput sequence length. To enable communication between spatial and\nchannel-focused branches, we share the weights of query and key mapping\nfunctions that provide a complimentary benefit (paired attention), while also\nreducing the overall network parameters. Our extensive evaluations on five\nbenchmarks, Synapse, BTCV, ACDC, BRaTs, and Decathlon-Lung, reveal the\neffectiveness of our contributions in terms of both efficiency and accuracy. On\nSynapse, our UNETR++ sets a new state-of-the-art with a Dice Score of 87.2%,\nwhile being significantly efficient with a reduction of over 71% in terms of\nboth parameters and FLOPs, compared to the best method in the literature. 
Code:\nhttps://github.com/Amshaker/unetr_plus_plus.\n","authors":["Abdelrahman Shaker","Muhammad Maaz","Hanoona Rasheed","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2212.04497v3.pdf","comment":"Accepted at IEEE TMI-2024"},{"id":"http://arxiv.org/abs/2405.01066v2","updated":"2024-05-04T09:39:26Z","published":"2024-05-02T07:47:49Z","title":"HandSSCA: 3D Hand Mesh Reconstruction with State Space Channel Attention\n from RGB images","summary":" Reconstructing a hand mesh from a single RGB image is a challenging task\nbecause hands are often occluded by objects. Most previous works attempted to\nintroduce more additional information and adopt attention mechanisms to improve\n3D reconstruction results, but it would increased computational complexity.\nThis observation prompts us to propose a new and concise architecture while\nimproving computational efficiency. In this work, we propose a simple and\neffective 3D hand mesh reconstruction network HandSSCA, which is the first to\nincorporate state space modeling into the field of hand pose estimation. In the\nnetwork, we have designed a novel state space channel attention module that\nextends the effective sensory field, extracts hand features in the spatial\ndimension, and enhances hand regional features in the channel dimension. This\ndesign helps to reconstruct a complete and detailed hand mesh. Extensive\nexperiments conducted on well-known datasets featuring challenging hand-object\nocclusions (such as FREIHAND, DEXYCB, and HO3D) demonstrate that our proposed\nHandSSCA achieves state-of-the-art performance while maintaining a minimal\nparameter count.\n","authors":["Zixun Jiao","Xihan Wang","Quanli Gao"],"pdf_url":"https://arxiv.org/pdf/2405.01066v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.02698v4","updated":"2024-05-04T09:24:01Z","published":"2023-03-05T15:27:24Z","title":"Robust affine point matching via quadratic assignment on Grassmannians","summary":" Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform\naffine registration of point clouds. The algorithm is based on minimizing the\nFrobenius distance between two elements of the Grassmannian. For this purpose,\nan indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and\nseveral approaches to affine feature matching are studied and compared.\nExperiments demonstrate that RAG is more robust to noise and point discrepancy\nthan previous methods.\n","authors":["Alexander Kolpakov","Michael Werman"],"pdf_url":"https://arxiv.org/pdf/2303.02698v4.pdf","comment":"8 pages, 23 figures; GitHub repository at\n (https://github.com/sashakolpakov/rag); Section IV: added comparison to\n GrassGraph (https://doi.org/10.1109/TIP.2019.2959722); notably, GrassGraph\n quickly loses accuracy on our test examples with noise and occlusion"},{"id":"http://arxiv.org/abs/2405.02608v1","updated":"2024-05-04T08:27:12Z","published":"2024-05-04T08:27:12Z","title":"UnSAMFlow: Unsupervised Optical Flow Guided by Segment Anything Model","summary":" Traditional unsupervised optical flow methods are vulnerable to occlusions\nand motion boundaries due to lack of object-level information. Therefore, we\npropose UnSAMFlow, an unsupervised flow network that also leverages object\ninformation from the latest foundation model Segment Anything Model (SAM). We\nfirst include a self-supervised semantic augmentation module tailored to SAM\nmasks. 
We also analyze the poor gradient landscapes of traditional smoothness\nlosses and propose a new smoothness definition based on homography instead. A\nsimple yet effective mask feature module has also been added to further\naggregate features on the object level. With all these adaptations, our method\nproduces clear optical flow estimation with sharp boundaries around objects,\nwhich outperforms state-of-the-art methods on both KITTI and Sintel datasets.\nOur method also generalizes well across domains and runs very efficiently.\n","authors":["Shuai Yuan","Lei Luo","Zhuo Hui","Can Pu","Xiaoyu Xiang","Rakesh Ranjan","Denis Demandolx"],"pdf_url":"https://arxiv.org/pdf/2405.02608v1.pdf","comment":"Accepted by CVPR 2024. Code is available at\n https://github.com/facebookresearch/UnSAMFlow"},{"id":"http://arxiv.org/abs/2405.02595v1","updated":"2024-05-04T07:39:25Z","published":"2024-05-04T07:39:25Z","title":"Vision-based 3D occupancy prediction in autonomous driving: a review and\n outlook","summary":" In recent years, autonomous driving has garnered escalating attention for its\npotential to relieve drivers' burdens and improve driving safety. Vision-based\n3D occupancy prediction, which predicts the spatial occupancy status and\nsemantics of 3D voxel grids around the autonomous vehicle from image inputs, is\nan emerging perception task suitable for cost-effective perception system of\nautonomous driving. Although numerous studies have demonstrated the greater\nadvantages of 3D occupancy prediction over object-centric perception tasks,\nthere is still a lack of a dedicated review focusing on this rapidly developing\nfield. In this paper, we first introduce the background of vision-based 3D\noccupancy prediction and discuss the challenges in this task. Secondly, we\nconduct a comprehensive survey of the progress in vision-based 3D occupancy\nprediction from three aspects: feature enhancement, deployment friendliness and\nlabel efficiency, and provide an in-depth analysis of the potentials and\nchallenges of each category of methods. Finally, we present a summary of\nprevailing research trends and propose some inspiring future outlooks. To\nprovide a valuable reference for researchers, a regularly updated collection of\nrelated papers, datasets, and codes is organized at\nhttps://github.com/zya3d/Awesome-3D-Occupancy-Prediction.\n","authors":["Yanan Zhang","Jinqing Zhang","Zengran Wang","Junhao Xu","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2405.02595v1.pdf","comment":"20 pages, 20 figures"},{"id":"http://arxiv.org/abs/2405.02591v1","updated":"2024-05-04T07:13:47Z","published":"2024-05-04T07:13:47Z","title":"Better YOLO with Attention-Augmented Network and Enhanced Generalization\n Performance for Safety Helmet Detection","summary":" Safety helmets play a crucial role in protecting workers from head injuries\nin construction sites, where potential hazards are prevalent. However,\ncurrently, there is no approach that can simultaneously achieve both model\naccuracy and performance in complex environments. In this study, we utilized a\nYolo-based model for safety helmet detection, achieved a 2% improvement in mAP\n(mean Average Precision) performance while reducing parameters and Flops count\nby over 25%. YOLO(You Only Look Once) is a widely used, high-performance,\nlightweight model architecture that is well suited for complex environments. 
We\npresents a novel approach by incorporating a lightweight feature extraction\nnetwork backbone based on GhostNetv2, integrating attention modules such as\nSpatial Channel-wise Attention Net(SCNet) and Coordination Attention\nNet(CANet), and adopting the Gradient Norm Aware optimizer (GAM) for improved\ngeneralization ability. In safety-critical environments, the accurate detection\nand speed of safety helmets plays a pivotal role in preventing occupational\nhazards and ensuring compliance with safety protocols. This work addresses the\npressing need for robust and efficient helmet detection methods, offering a\ncomprehensive framework that not only enhances accuracy but also improves the\nadaptability of detection models to real-world conditions. Our experimental\nresults underscore the synergistic effects of GhostNetv2, attention modules,\nand the GAM optimizer, presenting a compelling solution for safety helmet\ndetection that achieves superior performance in terms of accuracy,\ngeneralization, and efficiency.\n","authors":["Shuqi Shen","Junjie Yang"],"pdf_url":"https://arxiv.org/pdf/2405.02591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19500v2","updated":"2024-05-04T07:09:48Z","published":"2024-04-30T12:37:01Z","title":"Towards Real-world Video Face Restoration: A New Benchmark","summary":" Blind face restoration (BFR) on images has significantly progressed over the\nlast several years, while real-world video face restoration (VFR), which is\nmore challenging for more complex face motions such as moving gaze directions\nand facial orientations involved, remains unsolved. Typical BFR methods are\nevaluated on privately synthesized datasets or self-collected real-world\nlow-quality face images, which are limited in their coverage of real-world\nvideo frames. In this work, we introduced new real-world datasets named FOS\nwith a taxonomy of \"Full, Occluded, and Side\" faces from mainly video frames to\nstudy the applicability of current methods on videos. Compared with existing\ntest datasets, FOS datasets cover more diverse degradations and involve face\nsamples from more complex scenarios, which helps to revisit current face\nrestoration approaches more comprehensively. Given the established datasets, we\nbenchmarked both the state-of-the-art BFR methods and the video super\nresolution (VSR) methods to comprehensively study current approaches,\nidentifying their potential and limitations in VFR tasks. In addition, we\nstudied the effectiveness of the commonly used image quality assessment (IQA)\nmetrics and face IQA (FIQA) metrics by leveraging a subjective user study. With\nextensive experimental results and detailed analysis provided, we gained\ninsights from the successes and failures of both current BFR and VSR methods.\nThese results also pose challenges to current face restoration approaches,\nwhich we hope stimulate future advances in VFR research.\n","authors":["Ziyan Chen","Jingwen He","Xinqi Lin","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2404.19500v2.pdf","comment":"Project page: https://ziyannchen.github.io/projects/VFRxBenchmark/"},{"id":"http://arxiv.org/abs/2311.10361v2","updated":"2024-05-04T06:54:11Z","published":"2023-11-17T07:30:00Z","title":"Video-based Sequential Bayesian Homography Estimation for Soccer Field\n Registration","summary":" A novel Bayesian framework is proposed, which explicitly relates the\nhomography of one video frame to the next through an affine transformation\nwhile explicitly modelling keypoint uncertainty. 
The literature has previously\nused differential homography between subsequent frames, but not in a Bayesian\nsetting. In cases where Bayesian methods have been applied, camera motion is\nnot adequately modelled, and keypoints are treated as deterministic. The\nproposed method, Bayesian Homography Inference from Tracked Keypoints (BHITK),\nemploys a two-stage Kalman filter and significantly improves existing methods.\nExisting keypoint detection methods may be easily augmented with BHITK. It\nenables less sophisticated and less computationally expensive methods to\noutperform the state-of-the-art approaches in most homography evaluation\nmetrics. Furthermore, the homography annotations of the WorldCup and\nTS-WorldCup datasets have been refined using a custom homography annotation\ntool that has been released for public use. The refined datasets are\nconsolidated and released as the consolidated and refined WorldCup (CARWC)\ndataset.\n","authors":["Paul J. Claasen","J. P. de Villiers"],"pdf_url":"https://arxiv.org/pdf/2311.10361v2.pdf","comment":"Accepted to Expert Systems with Applications"},{"id":"http://arxiv.org/abs/2405.02586v1","updated":"2024-05-04T06:53:18Z","published":"2024-05-04T06:53:18Z","title":"Generalizing CLIP to Unseen Domain via Text-Guided Diverse Novel Feature\n Synthesis","summary":" Vision-language foundation models like CLIP have shown impressive zero-shot\ngeneralization, but finetuning on downstream datasets can cause overfitting and\nloss of its generalization ability on unseen domains. Although collecting\nadditional data from new domains of interest is possible, this method is often\nimpractical due to the challenges in obtaining annotated data. To address this,\nwe propose a plug-and-play feature augmentation method called LDFS\n(Language-Guided Diverse Feature Synthesis) to synthesize new domain features\nand improve existing CLIP fine-tuning strategies. LDFS has three main\ncontributions: 1) To synthesize novel domain features and promote diversity, we\npropose an instance-conditional feature augmentation strategy based on a\ntextguided feature augmentation loss. 2) To maintain feature quality after\naugmenting, we introduce a pairwise regularizer to preserve augmented feature\ncoherence within the CLIP feature space. 3) We propose to use stochastic text\nfeature augmentation to reduce the modality gap and further facilitate the\nprocess of text-guided feature synthesis. Extensive experiments show LDFS\nsuperiority in improving CLIP generalization ability on unseen domains without\ncollecting data from those domains. The code will be made publicly available.\n","authors":["Siyuan Yan","Cheng Luo","Zhen Yu","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2405.02586v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2405.02581v1","updated":"2024-05-04T06:31:38Z","published":"2024-05-04T06:31:38Z","title":"Stationary Representations: Optimally Approximating Compatibility and\n Implications for Improved Model Replacements","summary":" Learning compatible representations enables the interchangeable use of\nsemantic features as models are updated over time. This is particularly\nrelevant in search and retrieval systems where it is crucial to avoid\nreprocessing of the gallery images with the updated model. While recent\nresearch has shown promising empirical evidence, there is still a lack of\ncomprehensive theoretical understanding about learning compatible\nrepresentations. 
In this paper, we demonstrate that the stationary\nrepresentations learned by the $d$-Simplex fixed classifier optimally\napproximate compatibility representation according to the two inequality\nconstraints of its formal definition. This not only establishes a solid\nfoundation for future works in this line of research but also presents\nimplications that can be exploited in practical learning scenarios. An\nexemplary application is the now-standard practice of downloading and\nfine-tuning new pre-trained models. Specifically, we show the strengths and\ncritical issues of stationary representations in the case in which a model\nundergoing sequential fine-tuning is asynchronously replaced by downloading a\nbetter-performing model pre-trained elsewhere. Such a representation enables\nseamless delivery of retrieval service (i.e., no reprocessing of gallery\nimages) and offers improved performance without operational disruptions during\nmodel replacement. Code available at: https://github.com/miccunifi/iamcl2r.\n","authors":["Niccolò Biondi","Federico Pernici","Simone Ricci","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2405.02581v1.pdf","comment":"Accepted at CVPR24 as Poster Highlight"},{"id":"http://arxiv.org/abs/2405.00900v2","updated":"2024-05-04T05:36:12Z","published":"2024-05-01T23:07:12Z","title":"LidaRF: Delving into Lidar for Neural Radiance Field on Street Scenes","summary":" Photorealistic simulation plays a crucial role in applications such as\nautonomous driving, where advances in neural radiance fields (NeRFs) may allow\nbetter scalability through the automatic creation of digital 3D assets.\nHowever, reconstruction quality suffers on street scenes due to largely\ncollinear camera motions and sparser samplings at higher speeds. On the other\nhand, the application often demands rendering from camera views that deviate\nfrom the inputs to accurately simulate behaviors like lane changes. In this\npaper, we propose several insights that allow a better utilization of Lidar\ndata to improve NeRF quality on street scenes. First, our framework learns a\ngeometric scene representation from Lidar, which is fused with the implicit\ngrid-based representation for radiance decoding, thereby supplying stronger\ngeometric information offered by explicit point cloud. Second, we put forth a\nrobust occlusion-aware depth supervision scheme, which allows utilizing\ndensified Lidar points by accumulation. Third, we generate augmented training\nviews from Lidar points for further improvement. Our insights translate to\nlargely improved novel view synthesis under real driving scenes.\n","authors":["Shanlin Sun","Bingbing Zhuang","Ziyu Jiang","Buyu Liu","Xiaohui Xie","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2405.00900v2.pdf","comment":"CVPR2024 Highlights"},{"id":"http://arxiv.org/abs/2303.00244v2","updated":"2024-05-04T05:29:42Z","published":"2023-03-01T05:54:52Z","title":"SUNY: A Visual Interpretation Framework for Convolutional Neural\n Networks from a Necessary and Sufficient Perspective","summary":" Researchers have proposed various methods for visually interpreting the\nConvolutional Neural Network (CNN) via saliency maps, which include\nClass-Activation-Map (CAM) based approaches as a leading family. However, in\nterms of the internal design logic, existing CAM-based approaches often\noverlook the causal perspective that answers the core \"why\" question to help\nhumans understand the explanation. 
Additionally, current CNN explanations lack\nthe consideration of both necessity and sufficiency, two complementary sides of\na desirable explanation. This paper presents a causality-driven framework,\nSUNY, designed to rationalize the explanations toward better human\nunderstanding. Using the CNN model's input features or internal filters as\nhypothetical causes, SUNY generates explanations by bi-directional\nquantifications on both the necessary and sufficient perspectives. Extensive\nevaluations justify that SUNY not only produces more informative and convincing\nexplanations from the angles of necessity and sufficiency, but also achieves\nperformances competitive to other approaches across different CNN architectures\nover large-scale datasets, including ILSVRC2012 and CUB-200-2011.\n","authors":["Xiwei Xuan","Ziquan Deng","Hsuan-Tien Lin","Zhaodan Kong","Kwan-Liu Ma"],"pdf_url":"https://arxiv.org/pdf/2303.00244v2.pdf","comment":"CVPRw 2024"},{"id":"http://arxiv.org/abs/2404.12390v3","updated":"2024-05-04T05:25:26Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v3.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/"},{"id":"http://arxiv.org/abs/2401.06462v2","updated":"2024-05-04T05:08:49Z","published":"2024-01-12T09:17:32Z","title":"AttributionScanner: A Visual Analytics System for Model Validation with\n Metadata-Free Slice Finding","summary":" Data slice finding is an emerging technique for validating machine learning\n(ML) models by identifying and analyzing subgroups in a dataset that exhibit\npoor performance, often characterized by distinct feature sets or descriptive\nmetadata. However, in the context of validating vision models involving\nunstructured image data, this approach faces significant challenges, including\nthe laborious and costly requirement for additional metadata and the complex\ntask of interpreting the root causes of underperformance. 
To address these\nchallenges, we introduce AttributionScanner, an innovative human-in-the-loop\nVisual Analytics (VA) system, designed for metadata-free data slice finding.\nOur system identifies interpretable data slices that involve common model\nbehaviors and visualizes these patterns through an Attribution Mosaic design.\nOur interactive interface provides straightforward guidance for users to\ndetect, interpret, and annotate predominant model issues, such as spurious\ncorrelations (model biases) and mislabeled data, with minimal effort.\nAdditionally, it employs a cutting-edge model regularization technique to\nmitigate the detected issues and enhance the model's performance. The efficacy\nof AttributionScanner is demonstrated through use cases involving two benchmark\ndatasets, with qualitative and quantitative evaluations showcasing its\nsubstantial effectiveness in vision model validation, ultimately leading to\nmore reliable and accurate models.\n","authors":["Xiwei Xuan","Jorge Piazentin Ono","Liang Gou","Kwan-Liu Ma","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2401.06462v2.pdf","comment":"12 pages, 12 figures, 3 tables. This manuscript is under review by\n the IEEE Transactions on Visualization and Computer Graphics (TVCG)"},{"id":"http://arxiv.org/abs/2405.02571v1","updated":"2024-05-04T05:07:39Z","published":"2024-05-04T05:07:39Z","title":"ViTALS: Vision Transformer for Action Localization in Surgical\n Nephrectomy","summary":" Surgical action localization is a challenging computer vision problem. While\nit has promising applications including automated training of surgery\nprocedures, surgical workflow optimization, etc., appropriate model design is\npivotal to accomplishing this task. Moreover, the lack of suitable medical\ndatasets adds an additional layer of complexity. To that effect, we introduce a\nnew complex dataset of nephrectomy surgeries called UroSlice. To perform the\naction localization from these videos, we propose a novel model termed as\n`ViTALS' (Vision Transformer for Action Localization in Surgical Nephrectomy).\nOur model incorporates hierarchical dilated temporal convolution layers and\ninter-layer residual connections to capture the temporal correlations at finer\nas well as coarser granularities. The proposed approach achieves\nstate-of-the-art performance on Cholec80 and UroSlice datasets (89.8% and 66.1%\naccuracy, respectively), validating its effectiveness.\n","authors":["Soumyadeep Chandra","Sayeed Shafayet Chowdhury","Courtney Yong","Chandru P. Sundaram","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2405.02571v1.pdf","comment":"Nephrectomy surgery, Surgical Phase Recognition, Surgical Workflow\n Segmentation, 11 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.02568v1","updated":"2024-05-04T05:01:58Z","published":"2024-05-04T05:01:58Z","title":"ActiveNeuS: Active 3D Reconstruction using Neural Implicit Surface\n Uncertainty","summary":" Active learning in 3D scene reconstruction has been widely studied, as\nselecting informative training views is critical for the reconstruction.\nRecently, Neural Radiance Fields (NeRF) variants have shown performance\nincreases in active 3D reconstruction using image rendering or geometric\nuncertainty. However, the simultaneous consideration of both uncertainties in\nselecting informative views remains unexplored, while utilizing different types\nof uncertainty can reduce the bias that arises in the early training stage with\nsparse inputs. 
In this paper, we propose ActiveNeuS, which evaluates candidate\nviews considering both uncertainties. ActiveNeuS provides a way to accumulate\nimage rendering uncertainty while avoiding the bias that the estimated\ndensities can introduce. ActiveNeuS computes the neural implicit surface\nuncertainty, providing the color uncertainty along with the surface\ninformation. It efficiently handles the bias by using the surface information\nand a grid, enabling the fast selection of diverse viewpoints. Our method\noutperforms previous works on popular datasets, Blender and DTU, showing that\nthe views selected by ActiveNeuS significantly improve performance.\n","authors":["Hyunseo Kim","Hyeonseo Yang","Taekyung Kim","YoonSung Kim","Jin-Hwa Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14162v2","updated":"2024-05-04T04:55:18Z","published":"2024-04-22T13:21:09Z","title":"FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on","summary":" Despite their impressive generative performance, latent diffusion model-based\nvirtual try-on (VTON) methods lack faithfulness to crucial details of the\nclothes, such as style, pattern, and text. To alleviate these issues caused by\nthe diffusion stochastic nature and latent supervision, we propose a novel\nFaithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves\nthe conventional latent diffusion process in three major aspects. First, we\npropose incorporating warped clothes as both the starting point and local\ncondition, supplying the model with faithful clothes priors. Second, we\nintroduce a novel clothes flattening network to constrain generated try-on\nimages, providing clothes-consistent faithful supervision. Third, we devise a\nclothes-posterior sampling for faithful inference, further enhancing the model\nperformance over conventional clothes-agnostic Gaussian sampling. Extensive\nexperimental results on the benchmark VITON-HD and Dress Code datasets\ndemonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is\nable to generate photo-realistic try-on images with faithful clothing details.\n","authors":["Chenhui Wang","Tao Chen","Zhihao Chen","Zhizhong Huang","Taoran Jiang","Qi Wang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2404.14162v2.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2405.02564v1","updated":"2024-05-04T04:33:20Z","published":"2024-05-04T04:33:20Z","title":"Leveraging the Human Ventral Visual Stream to Improve Neural Network\n Robustness","summary":" Human object recognition exhibits remarkable resilience in cluttered and\ndynamic visual environments. In contrast, despite their unparalleled\nperformance across numerous visual tasks, Deep Neural Networks (DNNs) remain\nfar less robust than humans, showing, for example, a surprising susceptibility\nto adversarial attacks involving image perturbations that are (almost)\nimperceptible to humans. Human object recognition likely owes its robustness,\nin part, to the increasingly resilient representations that emerge along the\nhierarchy of the ventral visual cortex. 
Here we show that DNNs, when guided by\nneural representations from a hierarchical sequence of regions in the human\nventral visual stream, display increasing robustness to adversarial attacks.\nThese neural-guided models also exhibit a gradual shift towards more human-like\ndecision-making patterns and develop hierarchically smoother decision surfaces.\nImportantly, the resulting representational spaces differ in important ways\nfrom those produced by conventional smoothing methods, suggesting that such\nneural-guidance may provide previously unexplored robustness solutions. Our\nfindings support the gradual emergence of human robustness along the ventral\nvisual hierarchy and suggest that the key to DNN robustness may lie in\nincreasing emulation of the human brain.\n","authors":["Zhenan Shao","Linjian Ma","Bo Li","Diane M. Beck"],"pdf_url":"https://arxiv.org/pdf/2405.02564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02556v1","updated":"2024-05-04T04:05:59Z","published":"2024-05-04T04:05:59Z","title":"Few-Shot Fruit Segmentation via Transfer Learning","summary":" Advancements in machine learning, computer vision, and robotics have paved\nthe way for transformative solutions in various domains, particularly in\nagriculture. For example, accurate identification and segmentation of fruits\nfrom field images plays a crucial role in automating jobs such as harvesting,\ndisease detection, and yield estimation. However, achieving robust and precise\ninfield fruit segmentation remains a challenging task since large amounts of\nlabeled data are required to handle variations in fruit size, shape, color, and\nocclusion. In this paper, we develop a few-shot semantic segmentation framework\nfor infield fruits using transfer learning. Concretely, our work is aimed at\naddressing agricultural domains that lack publicly available labeled data.\nMotivated by similar success in urban scene parsing, we propose specialized\npre-training using a public benchmark dataset for fruit transfer learning. By\nleveraging pre-trained neural networks, accurate semantic segmentation of fruit\nin the field is achieved with only a few labeled images. Furthermore, we show\nthat models with pre-training learn to distinguish between fruit still on the\ntrees and fruit that have fallen on the ground, and they can effectively\ntransfer the knowledge to the target fruit dataset.\n","authors":["Jordan A. James","Heather K. Manching","Amanda M. Hulse-Kemp","William J. Beksi"],"pdf_url":"https://arxiv.org/pdf/2405.02556v1.pdf","comment":"To be published in the 2024 IEEE International Conference on Robotics\n and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2403.19902v2","updated":"2024-05-04T03:47:06Z","published":"2024-03-29T01:05:23Z","title":"Heterogeneous Network Based Contrastive Learning Method for PolSAR Land\n Cover Classification","summary":" Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely\nused in various fields. Recently, deep learning has made significant progress\nin PolSAR image classification. Supervised learning (SL) requires a large\namount of labeled PolSAR data with high quality to achieve better performance,\nhowever, manually labeled data is insufficient. This causes the SL to fail into\noverfitting and degrades its generalization performance. Furthermore, the\nscattering confusion problem is also a significant challenge that attracts more\nattention. To solve these problems, this article proposes a Heterogeneous\nNetwork based Contrastive Learning method(HCLNet). 
It aims to learn high-level\nrepresentation from unlabeled PolSAR data for few-shot classification according\nto multi-features and superpixels. Beyond the conventional CL, HCLNet\nintroduces the heterogeneous architecture for the first time to utilize\nheterogeneous PolSAR features better. And it develops two easy-to-use plugins\nto narrow the domain gap between optics and PolSAR, including feature filter\nand superpixel-based instance discrimination, which the former is used to\nenhance the complementarity of multi-features, and the latter is used to\nincrease the diversity of negative samples. Experiments demonstrate the\nsuperiority of HCLNet on three widely used PolSAR benchmark datasets compared\nwith state-of-the-art methods. Ablation studies also verify the importance of\neach component. Besides, this work has implications for how to efficiently\nutilize the multi-features of PolSAR data to learn better high-level\nrepresentation in CL and how to construct networks suitable for PolSAR data\nbetter.\n","authors":["Jianfeng Cai","Yue Ma","Zhixi Feng","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01239v3","updated":"2024-05-04T03:07:57Z","published":"2023-12-02T22:25:24Z","title":"Motion Informed Needle Segmentation in Ultrasound Images","summary":" Segmenting a moving needle in ultrasound images is challenging due to the\npresence of artifacts, noise, and needle occlusion. This task becomes even more\ndemanding in scenarios where data availability is limited. In this paper, we\npresent a novel approach for needle segmentation for 2D ultrasound that\ncombines classical Kalman Filter (KF) techniques with data-driven learning,\nincorporating both needle features and needle motion. Our method offers three\nkey contributions. First, we propose a compatible framework that seamlessly\nintegrates into commonly used encoder-decoder style architectures. Second, we\ndemonstrate superior performance compared to recent state-of-the-art needle\nsegmentation models using our novel convolutional neural network (CNN) based\nKF-inspired block, achieving a 15\\% reduction in pixel-wise needle tip error\nand an 8\\% reduction in length error. Third, to our knowledge we are the first\nto implement a learnable filter to incorporate non-linear needle motion for\nimproving needle segmentation.\n","authors":["Raghavv Goel","Cecilia Morales","Manpreet Singh","Artur Dubrawski","John Galeotti","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2312.01239v3.pdf","comment":"7 pages, 4 figures, accepted at ISBI 2024"},{"id":"http://arxiv.org/abs/2403.04306v3","updated":"2024-05-04T02:55:09Z","published":"2024-03-07T08:25:27Z","title":"Effectiveness Assessment of Recent Large Vision-Language Models","summary":" The advent of large vision-language models (LVLMs) represents a noteworthy\nadvancement towards the pursuit of artificial general intelligence. However,\nthe model efficacy across both specialized and general tasks warrants further\ninvestigation. This paper endeavors to evaluate the competency of popular LVLMs\nin specialized and general tasks, respectively, aiming to offer a comprehensive\nunderstanding of these novel models. To gauge their efficacy in specialized\ntasks, we employ six challenging tasks across three distinct application\nscenarios, namely natural, healthcare, and industrial ones. 
Such six tasks\ninclude salient/camouflaged/transparent object detection, as well as polyp\ndetection, skin lesion detection, and industrial anomaly detection. We examine\nthe performance of three recent open-source LVLMs, including MiniGPT-v2,\nLLaVA-1.5, and Shikra, on both visual recognition and localization under these\ntasks. Moreover, we conduct empirical investigations utilizing the\naforementioned LVLMs together with GPT-4V, assessing their multi-modal\nunderstanding capabilities in general tasks including object counting, absurd\nquestion answering, affordance reasoning, attribute recognition, and spatial\nrelation reasoning. Our investigations reveal that these LVLMs demonstrate\nlimited proficiency not only in specialized tasks but also in general tasks. We\ndelve deep into this inadequacy and uncover several potential factors,\nincluding limited cognition in specialized tasks, object hallucination,\ntext-to-image interference, and decreased robustness in complex problems. We\nhope this study could provide useful insights for the future development of\nLVLMs, helping researchers improve LVLMs to cope with both general and\nspecialized applications.\n","authors":["Yao Jiang","Xinyu Yan","Ge-Peng Ji","Keren Fu","Meijun Sun","Huan Xiong","Deng-Ping Fan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.04306v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02538v1","updated":"2024-05-04T01:53:22Z","published":"2024-05-04T01:53:22Z","title":"AdaFPP: Adapt-Focused Bi-Propagating Prototype Learning for Panoramic\n Activity Recognition","summary":" Panoramic Activity Recognition (PAR) aims to identify multi-granularity\nbehaviors performed by multiple persons in panoramic scenes, including\nindividual activities, group activities, and global activities. Previous\nmethods 1) heavily rely on manually annotated detection boxes in training and\ninference, hindering further practical deployment; or 2) directly employ normal\ndetectors to detect multiple persons with varying size and spatial occlusion in\npanoramic scenes, blocking the performance gain of PAR. To this end, we\nconsider learning a detector adapting varying-size occluded persons, which is\noptimized along with the recognition module in the all-in-one framework.\nTherefore, we propose a novel Adapt-Focused bi-Propagating Prototype learning\n(AdaFPP) framework to jointly recognize individual, group, and global\nactivities in panoramic activity scenes by learning an adapt-focused detector\nand multi-granularity prototypes as the pretext tasks in an end-to-end way.\nSpecifically, to accommodate the varying sizes and spatial occlusion of\nmultiple persons in crowed panoramic scenes, we introduce a panoramic\nadapt-focuser, achieving the size-adapting detection of individuals by\ncomprehensively selecting and performing fine-grained detections on\nobject-dense sub-regions identified through original detections. In addition,\nto mitigate information loss due to inaccurate individual localizations, we\nintroduce a bi-propagation prototyper that promotes closed-loop interaction and\ninformative consistency across different granularities by facilitating\nbidirectional information propagation among the individual, group, and global\nlevels. 
Extensive experiments demonstrate the significant performance of AdaFPP\nand emphasize its powerful applicability for PAR.\n","authors":["Meiqi Cao","Rui Yan","Xiangbo Shu","Guangzhao Dai","Yazhou Yao","Guo-Sen Xie"],"pdf_url":"https://arxiv.org/pdf/2405.02538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04244v2","updated":"2024-05-04T01:06:54Z","published":"2024-04-05T17:46:38Z","title":"Fast Diffeomorphic Image Registration using Patch based Fully\n Convolutional Networks","summary":" Diffeomorphic image registration is a fundamental step in medical image\nanalysis, owing to its capability to ensure the invertibility of\ntransformations and preservation of topology. Currently, unsupervised\nlearning-based registration techniques primarily extract features at the image\nlevel, potentially limiting their efficacy. This paper proposes a novel\nunsupervised learning-based fully convolutional network (FCN) framework for\nfast diffeomorphic image registration, emphasizing feature acquisition at the\nimage patch level. Furthermore, a novel differential operator is introduced and\nintegrated into the FCN architecture for parameter learning. Experiments are\nconducted on three distinct T1-weighted magnetic resonance imaging (T1w MRI)\ndatasets. Comparative analyses with three state-of-the-art diffeomorphic image\nregistration approaches including a typical conventional registration algorithm\nand two representative unsupervised learning-based methods, reveal that the\nproposed method exhibits superior performance in both registration accuracy and\ntopology preservation.\n","authors":["Jiong Wu","Shuang Zhou","Li Lin","Xin Wang","Wenxue Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13288v2","updated":"2024-05-04T01:05:41Z","published":"2024-04-20T06:25:32Z","title":"PoseINN: Realtime Visual-based Pose Regression and Localization with\n Invertible Neural Networks","summary":" Estimating ego-pose from cameras is an important problem in robotics with\napplications ranging from mobile robotics to augmented reality. While SOTA\nmodels are becoming increasingly accurate, they can still be unwieldy due to\nhigh computational costs. In this paper, we propose to solve the problem by\nusing invertible neural networks (INN) to find the mapping between the latent\nspace of images and poses for a given scene. Our model achieves similar\nperformance to the SOTA while being faster to train and only requiring offline\nrendering of low-resolution synthetic data. By using normalizing flows, the\nproposed method also provides uncertainty estimation for the output. We also\ndemonstrated the efficiency of this method by deploying the model on a mobile\nrobot.\n","authors":["Zirui Zang","Ahmad Amine","Rahul Mangharam"],"pdf_url":"https://arxiv.org/pdf/2404.13288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00740v2","updated":"2024-05-04T00:37:57Z","published":"2024-04-30T01:19:18Z","title":"Modeling Caption Diversity in Contrastive Vision-Language Pretraining","summary":" There are a thousand ways to caption an image. Contrastive Language\nPretraining (CLIP) on the other hand, works by mapping an image and its caption\nto a single vector -- limiting how well CLIP-like models can represent the\ndiverse ways to describe an image. In this work, we introduce Llip, Latent\nLanguage Image Pretraining, which models the diversity of captions that could\nmatch an image. 
Llip's vision encoder outputs a set of visual features that are\nmixed into a final representation by conditioning on information derived from\nthe text. We show that Llip outperforms non-contextualized baselines like CLIP\nand SigLIP on a variety of tasks even with large-scale encoders. Llip improves\nzero-shot classification by an average of 2.9% zero-shot classification\nbenchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot\ntop-1 accuracy of 83.5% on ImageNet outperforming a similarly sized CLIP by\n1.4%. We also demonstrate improvement on zero-shot retrieval on MS-COCO by\n6.0%. We provide a comprehensive analysis of the components introduced by the\nmethod and demonstrate that Llip leads to richer visual representations.\n","authors":["Samuel Lavoie","Polina Kirichenko","Mark Ibrahim","Mahmoud Assran","Andrew Gordon Wildon","Aaron Courville","Nicolas Ballas"],"pdf_url":"https://arxiv.org/pdf/2405.00740v2.pdf","comment":"14 pages, 8 figures, 7 tables, to be published at ICML2024"},{"id":"http://arxiv.org/abs/2403.08002v3","updated":"2024-05-04T00:35:01Z","published":"2024-03-12T18:12:02Z","title":"Towards a clinically accessible radiology foundation model: open-access\n and lightweight, with automated evaluation","summary":" The scaling laws and extraordinary performance of large foundation models\nmotivate the development and utilization of such models in biomedicine.\nHowever, despite early promising results on some biomedical benchmarks, there\nare still major challenges that need to be addressed before these models can be\nused in real-world clinics. Frontier general-domain models such as GPT-4V still\nhave significant performance gaps in multimodal biomedical applications. More\nimportantly, less-acknowledged pragmatic issues, including accessibility, model\ncost, and tedious manual evaluation make it hard for clinicians to use\nstate-of-the-art large models directly on private patient data. Here, we\nexplore training open-source small multimodal models (SMMs) to bridge\ncompetency gaps for unmet clinical needs in radiology. To maximize data\nefficiency, we adopt a modular approach by incorporating state-of-the-art\npre-trained models for image and text modalities, and focusing on training a\nlightweight adapter to ground each modality to the text embedding space, as\nexemplified by LLaVA-Med. For training, we assemble a large dataset of over 697\nthousand radiology image-text pairs. For evaluation, we propose CheXprompt, a\nGPT-4-based metric for factuality evaluation, and demonstrate its parity with\nexpert evaluation. For best practice, we conduct a systematic ablation study on\nvarious choices in data engineering and multimodal training. The resulting\nLlaVA-Rad (7B) model attains state-of-the-art results on standard radiology\ntasks such as report generation and cross-modal retrieval, even outperforming\nmuch larger models such as GPT-4V and Med-PaLM M (84B). The inference of\nLlaVA-Rad is fast and can be performed on a single V100 GPU in private\nsettings, offering a promising state-of-the-art tool for real-world clinical\napplications.\n","authors":["Juan Manuel Zambrano Chaves","Shih-Cheng Huang","Yanbo Xu","Hanwen Xu","Naoto Usuyama","Sheng Zhang","Fei Wang","Yujia Xie","Mahmoud Khademi","Ziyi Yang","Hany Awadalla","Julia Gong","Houdong Hu","Jianwei Yang","Chunyuan Li","Jianfeng Gao","Yu Gu","Cliff Wong","Mu Wei","Tristan Naumann","Muhao Chen","Matthew P. Lungren","Serena Yeung-Levy","Curtis P. 
Langlotz","Sheng Wang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2403.08002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14902v3","updated":"2024-05-04T00:30:29Z","published":"2023-11-25T02:32:46Z","title":"Parkinson's Disease Classification Using Contrastive Graph Cross-View\n Learning with Multimodal Fusion of SPECT Images and Clinical Features","summary":" Parkinson's Disease (PD) affects millions globally, impacting movement. Prior\nresearch utilized deep learning for PD prediction, primarily focusing on\nmedical images, neglecting the data's underlying manifold structure. This work\nproposes a multimodal approach encompassing both image and non-image features,\nleveraging contrastive cross-view graph fusion for PD classification. We\nintroduce a novel multimodal co-attention module, integrating embeddings from\nseparate graph views derived from low-dimensional representations of images and\nclinical features. This enables more robust and structured feature extraction\nfor improved multi-view data analysis. Additionally, a simplified contrastive\nloss-based fusion method is devised to enhance cross-view fusion learning. Our\ngraph-view multimodal approach achieves an accuracy of 91% and an area under\nthe receiver operating characteristic curve (AUC) of 92.8% in five-fold\ncross-validation. It also demonstrates superior predictive capabilities on\nnon-image data compared to solely machine learning-based methods.\n","authors":["Jun-En Ding","Chien-Chin Hsu","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14902v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03715v1","updated":"2024-05-04T19:40:42Z","published":"2024-05-04T19:40:42Z","title":"Iterative Filter Pruning for Concatenation-based CNN Architectures","summary":" Model compression and hardware acceleration are essential for the\nresource-efficient deployment of deep neural networks. Modern object detectors\nhave highly interconnected convolutional layers with concatenations. In this\nwork, we study how pruning can be applied to such architectures, exemplary for\nYOLOv7. We propose a method to handle concatenation layers, based on the\nconnectivity graph of convolutional layers. By automating iterative sensitivity\nanalysis, pruning, and subsequent model fine-tuning, we can significantly\nreduce model size both in terms of the number of parameters and FLOPs, while\nkeeping comparable model accuracy. Finally, we deploy pruned models to FPGA and\nNVIDIA Jetson Xavier AGX. Pruned models demonstrate a 2x speedup for the\nconvolutional layers in comparison to the unpruned counterparts and reach\nreal-time capability with 14 FPS on FPGA. Our code is available at\nhttps://github.com/fzi-forschungszentrum-informatik/iterative-yolo-pruning.\n","authors":["Svetlana Pavlitska","Oliver Bagge","Federico Peccia","Toghrul Mammadov","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.03715v1.pdf","comment":"Accepted for publication at IJCNN 2024"},{"id":"http://arxiv.org/abs/2405.03713v1","updated":"2024-05-04T14:02:52Z","published":"2024-05-04T14:02:52Z","title":"Improve Cross-Modality Segmentation by Treating MRI Images as Inverted\n CT Scans","summary":" Computed tomography (CT) segmentation models frequently include classes that\nare not currently supported by magnetic resonance imaging (MRI) segmentation\nmodels. 
In this study, we show that a simple image inversion technique can\nsignificantly improve the segmentation quality of CT segmentation models on MRI\ndata, by using the TotalSegmentator model, applied to T1-weighted MRI images,\nas example. Image inversion is straightforward to implement and does not\nrequire dedicated graphics processing units (GPUs), thus providing a quick\nalternative to complex deep modality-transfer models for generating\nsegmentation masks for MRI data.\n","authors":["Hartmut Häntze","Lina Xu","Leonhard Donle","Felix J. Dorfner","Alessa Hering","Lisa C. Adams","Keno K. Bressem"],"pdf_url":"https://arxiv.org/pdf/2405.03713v1.pdf","comment":"3 pages, 2 figures"}]},"2024-05-07T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.04534v1","updated":"2024-05-07T17:59:50Z","published":"2024-05-07T17:59:50Z","title":"Tactile-Augmented Radiance Fields","summary":" We present a scene representation, which we call a tactile-augmented radiance\nfield (TaRF), that brings vision and touch into a shared 3D space. This\nrepresentation can be used to estimate the visual and tactile signals for a\ngiven 3D position within a scene. We capture a scene's TaRF from a collection\nof photos and sparsely sampled touch probes. Our approach makes use of two\ninsights: (i) common vision-based touch sensors are built on ordinary cameras\nand thus can be registered to images using methods from multi-view geometry,\nand (ii) visually and structurally similar regions of a scene share the same\ntactile features. We use these insights to register touch signals to a captured\nvisual scene, and to train a conditional diffusion model that, provided with an\nRGB-D image rendered from a neural radiance field, generates its corresponding\ntactile signal. To evaluate our approach, we collect a dataset of TaRFs. This\ndataset contains more touch samples than previous real-world datasets, and it\nprovides spatially aligned visual signals for each captured touch signal. We\ndemonstrate the accuracy of our cross-modal generative model and the utility of\nthe captured visual-tactile data on several downstream tasks. Project page:\nhttps://dou-yiming.github.io/TaRF\n","authors":["Yiming Dou","Fengyu Yang","Yi Liu","Antonio Loquercio","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2405.04534v1.pdf","comment":"CVPR 2024, Project page: https://dou-yiming.github.io/TaRF, Code:\n https://github.com/Dou-Yiming/TaRF/"},{"id":"http://arxiv.org/abs/2405.04533v1","updated":"2024-05-07T17:59:31Z","published":"2024-05-07T17:59:31Z","title":"ChatHuman: Language-driven 3D Human Understanding with\n Retrieval-Augmented Tool Reasoning","summary":" Numerous methods have been proposed to detect, estimate, and analyze\nproperties of people in images, including the estimation of 3D pose, shape,\ncontact, human-object interaction, emotion, and more. Each of these methods\nworks in isolation instead of synergistically. Here we address this problem and\nbuild a language-driven human understanding system -- ChatHuman, which combines\nand integrates the skills of many different methods. To do so, we finetune a\nLarge Language Model (LLM) to select and use a wide variety of existing tools\nin response to user inputs. In doing so, ChatHuman is able to combine\ninformation from multiple tools to solve problems more accurately than the\nindividual tools themselves and to leverage tool output to improve its ability\nto reason about humans. 
The novel features of ChatHuman include leveraging\nacademic publications to guide the application of 3D human-related tools,\nemploying a retrieval-augmented generation model to generate\nin-context-learning examples for handling new tools, and discriminating and\nintegrating tool results to enhance 3D human understanding. Our experiments\nshow that ChatHuman outperforms existing models in both tool selection accuracy\nand performance across multiple 3D human-related tasks. ChatHuman is a step\ntowards consolidating diverse methods for human analysis into a single,\npowerful, system for 3D human reasoning.\n","authors":["Jing Lin","Yao Feng","Weiyang Liu","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2405.04533v1.pdf","comment":"Project page: https://chathuman.github.io"},{"id":"http://arxiv.org/abs/2311.07761v2","updated":"2024-05-07T17:36:29Z","published":"2023-11-13T21:21:43Z","title":"Amodal Optical Flow","summary":" Optical flow estimation is very challenging in situations with transparent or\noccluded objects. In this work, we address these challenges at the task level\nby introducing Amodal Optical Flow, which integrates optical flow with amodal\nperception. Instead of only representing the visible regions, we define amodal\noptical flow as a multi-layered pixel-level motion field that encompasses both\nvisible and occluded regions of the scene. To facilitate research on this new\ntask, we extend the AmodalSynthDrive dataset to include pixel-level labels for\namodal optical flow estimation. We present several strong baselines, along with\nthe Amodal Flow Quality metric to quantify the performance in an interpretable\nmanner. Furthermore, we propose the novel AmodalFlowNet as an initial step\ntoward addressing this task. AmodalFlowNet consists of a transformer-based\ncost-volume encoder paired with a recurrent transformer decoder which\nfacilitates recurrent hierarchical feature propagation and amodal semantic\ngrounding. We demonstrate the tractability of amodal optical flow in extensive\nexperiments and show its utility for downstream tasks such as panoptic\ntracking. We make the dataset, code, and trained models publicly available at\nhttp://amodal-flow.cs.uni-freiburg.de.\n","authors":["Maximilian Luz","Rohit Mohan","Ahmed Rida Sekkat","Oliver Sawade","Elmar Matthes","Thomas Brox","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2311.07761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02527v4","updated":"2024-05-07T17:22:57Z","published":"2024-03-04T22:42:17Z","title":"A dataset of over one thousand computed tomography scans of battery\n cells","summary":" Battery technology is increasingly important for global electrification\nefforts. However, batteries are highly sensitive to small manufacturing\nvariations that can induce reliability or safety issues. An important\ntechnology for battery quality control is computed tomography (CT) scanning,\nwhich is widely used for non-destructive 3D inspection across a variety of\nclinical and industrial applications. Historically, however, the utility of CT\nscanning for high-volume manufacturing has been limited by its low throughput\nas well as the difficulty of handling its large file sizes. In this work, we\npresent a dataset of over one thousand CT scans of as-produced commercially\navailable batteries. The dataset spans various chemistries (lithium-ion and\nsodium-ion) as well as various battery form factors (cylindrical, pouch, and\nprismatic). We evaluate seven different battery types in total. 
The\nmanufacturing variability and the presence of battery defects can be observed\nvia this dataset. This dataset may be of interest to scientists and engineers\nworking on battery technology, computer vision, or both.\n","authors":["Amariah Condon","Bailey Buscarino","Eric Moch","William J. Sehnert","Owen Miles","Patrick K. Herring","Peter M. Attia"],"pdf_url":"https://arxiv.org/pdf/2403.02527v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04496v1","updated":"2024-05-07T17:06:59Z","published":"2024-05-07T17:06:59Z","title":"Edit-Your-Motion: Space-Time Diffusion Decoupling Learning for Video\n Motion Editing","summary":" Existing diffusion-based video editing methods have achieved impressive\nresults in motion editing. Most of the existing methods focus on the motion\nalignment between the edited video and the reference video. However, these\nmethods do not constrain the background and object content of the video to\nremain unchanged, which makes it possible for users to generate unexpected\nvideos. In this paper, we propose a one-shot video motion editing method called\nEdit-Your-Motion that requires only a single text-video pair for training.\nSpecifically, we design the Detailed Prompt-Guided Learning Strategy (DPL) to\ndecouple spatio-temporal features in space-time diffusion models. DPL separates\nlearning object content and motion into two training stages. In the first\ntraining stage, we focus on learning the spatial features (the features of\nobject content) and breaking down the temporal relationships in the video\nframes by shuffling them. We further propose Recurrent-Causal Attention\n(RC-Attn) to learn the consistent content features of the object from unordered\nvideo frames. In the second training stage, we restore the temporal\nrelationship in video frames to learn the temporal feature (the features of the\nbackground and object's motion). We also adopt the Noise Constraint Loss to\nsmooth out inter-frame differences. Finally, in the inference stage, we inject\nthe content features of the source object into the editing branch through a\ntwo-branch structure (editing branch and reconstruction branch). With\nEdit-Your-Motion, users can edit the motion of objects in the source video to\ngenerate more exciting and diverse videos. Comprehensive qualitative\nexperiments, quantitative experiments and user preference studies demonstrate\nthat Edit-Your-Motion performs better than other methods.\n","authors":["Yi Zuo","Lingling Li","Licheng Jiao","Fang Liu","Xu Liu","Wenping Ma","Shuyuan Yang","Yuwei Guo"],"pdf_url":"https://arxiv.org/pdf/2405.04496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04489v1","updated":"2024-05-07T16:56:21Z","published":"2024-05-07T16:56:21Z","title":"S3Former: Self-supervised High-resolution Transformer for Solar PV\n Profiling","summary":" As the impact of climate change escalates, the global necessity to transition\nto sustainable energy sources becomes increasingly evident. Renewable energies\nhave emerged as a viable solution for users, with Photovoltaic energy being a\nfavored choice for small installations due to its reliability and efficiency.\nAccurate mapping of PV installations is crucial for understanding the extension\nof its adoption and informing energy policy. To meet this need, we introduce\nS3Former, designed to segment solar panels from aerial imagery and provide size\nand location information critical for analyzing the impact of such\ninstallations on the grid. 
Solar panel identification is challenging due to\nfactors such as varying weather conditions, roof characteristics, Ground\nSampling Distance variations and lack of appropriate initialization weights for\noptimized training. To tackle these complexities, S3Former features a Masked\nAttention Mask Transformer incorporating a self-supervised learning pretrained\nbackbone. Specifically, our model leverages low-level and high-level features\nextracted from the backbone and incorporates an instance query mechanism\nincorporated on the Transformer architecture to enhance the localization of\nsolar PV installations. We introduce a self-supervised learning phase (pretext\ntask) to improve the initialization weights on the backbone of S3Former. We\nevaluated S3Former using diverse datasets, demonstrate improvement\nstate-of-the-art models.\n","authors":["Minh Tran","Adrian De Luis","Haitao Liao","Ying Huang","Roy McCann","Alan Mantooth","Jack Cothren","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2405.04489v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.16571v2","updated":"2024-05-07T16:55:12Z","published":"2024-04-25T12:34:23Z","title":"MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth\n Estimation of Endoscopic Images","summary":" Photometric constraint is indispensable for self-supervised monocular depth\nestimation. It involves warping a source image onto a target view using\nestimated depth&pose, and then minimizing the difference between the warped and\ntarget images. However, the endoscopic built-in light causes significant\nbrightness fluctuations, and thus makes the photometric constraint unreliable.\nPrevious efforts only mitigate this relying on extra models to calibrate image\nbrightness. In this paper, we propose MonoPCC to address the brightness\ninconsistency radically by reshaping the photometric constraint into a cycle\nform. Instead of only warping the source image, MonoPCC constructs a closed\nloop consisting of two opposite forward-backward warping paths: from target to\nsource and then back to target. Thus, the target image finally receives an\nimage cycle-warped from itself, which naturally makes the constraint invariant\nto brightness changes. Moreover, MonoPCC transplants the source image's\nphase-frequency into the intermediate warped image to avoid structure lost, and\nalso stabilizes the training via an exponential moving average (EMA) strategy\nto avoid frequent changes in the forward warping. The comprehensive and\nextensive experimental results on four endoscopic datasets demonstrate that our\nproposed MonoPCC shows a great robustness to the brightness inconsistency, and\nexceeds other state-of-the-arts by reducing the absolute relative error by at\nleast 7.27%, 9.38%, 9.90% and 3.17%, respectively.\n","authors":["Zhiwei Wang","Ying Zhou","Shiquan He","Ting Li","Fan Huang","Qiang Ding","Xinxia Feng","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.16571v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.12732v2","updated":"2024-05-07T16:49:38Z","published":"2023-07-24T12:24:07Z","title":"CLIP-KD: An Empirical Study of CLIP Model Distillation","summary":" Contrastive Language-Image Pre-training (CLIP) has become a promising\nlanguage-supervised visual pre-training framework. This paper aims to distill\nsmall CLIP models supervised by a large teacher CLIP model. 
We propose several\ndistillation strategies, including relation, feature, gradient and contrastive\nparadigms, to examine the effectiveness of CLIP-Knowledge Distillation (KD). We\nshow that a simple feature mimicry with Mean Squared Error loss works\nsurprisingly well. Moreover, interactive contrastive learning across teacher\nand student encoders is also effective in performance improvement. We explain\nthat the success of CLIP-KD can be attributed to maximizing the feature\nsimilarity between teacher and student. The unified method is applied to\ndistill several student models trained on CC3M+12M. CLIP-KD improves student\nCLIP models consistently over zero-shot ImageNet classification and cross-modal\nretrieval benchmarks. When using ViT-L/14 pretrained on Laion-400M as the\nteacher, CLIP-KD achieves 57.5\\% and 55.4\\% zero-shot top-1 ImageNet accuracy\nover ViT-B/16 and ResNet-50, surpassing the original CLIP without KD by 20.5\\%\nand 20.1\\% margins, respectively. Our code is released on\nhttps://github.com/winycg/CLIP-KD.\n","authors":["Chuanguang Yang","Zhulin An","Libo Huang","Junyu Bi","Xinqiang Yu","Han Yang","Boyu Diao","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12732v2.pdf","comment":"CVPR-2024"},{"id":"http://arxiv.org/abs/2401.15235v2","updated":"2024-05-07T16:32:18Z","published":"2024-01-26T22:59:51Z","title":"CascadedGaze: Efficiency in Global Context Extraction for Image\n Restoration","summary":" Image restoration tasks traditionally rely on convolutional neural networks.\nHowever, given the local nature of the convolutional operator, they struggle to\ncapture global information. The promise of attention mechanisms in Transformers\nis to circumvent this problem, but it comes at the cost of intensive\ncomputational overhead. Many recent studies in image restoration have focused\non solving the challenge of balancing performance and computational cost via\nTransformer variants. In this paper, we present CascadedGaze Network (CGNet),\nan encoder-decoder architecture that employs Global Context Extractor (GCE), a\nnovel and efficient way to capture global information for image restoration.\nThe GCE module leverages small kernels across convolutional layers to learn\nglobal dependencies, without requiring self-attention. Extensive experimental\nresults show that our computationally efficient approach performs competitively\nto a range of state-of-the-art methods on synthetic image denoising and single\nimage deblurring tasks, and pushes the performance boundary further on the real\nimage denoising task.\n","authors":["Amirhosein Ghasemabadi","Muhammad Kamran Janjua","Mohammad Salameh","Chunhua Zhou","Fengyu Sun","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2401.15235v2.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR), 2024.\n 20 pages"},{"id":"http://arxiv.org/abs/2311.16114v2","updated":"2024-05-07T16:30:05Z","published":"2023-09-21T10:49:02Z","title":"Learning Noise-Robust Joint Representation for Multimodal Emotion\n Recognition under Incomplete Data Scenarios","summary":" Multimodal emotion recognition (MER) in practical scenarios is significantly\nchallenged by the presence of missing or incomplete data across different\nmodalities. To overcome these challenges, researchers have aimed to simulate\nincomplete conditions during the training phase to enhance the system's overall\nrobustness. Traditional methods have often involved discarding data or\nsubstituting data segments with zero vectors to approximate these\nincompletenesses. 
However, such approaches neither accurately represent\nreal-world conditions nor adequately address the issue of noisy data\navailability. For instance, a blurry image cannot be simply replaced with zero\nvectors, and still retain information. To tackle this issue and develop a more\nprecise MER system, we introduce a novel noise-robust MER model that\neffectively learns robust multimodal joint representations from noisy data.\nThis approach includes two pivotal components: firstly, a noise scheduler that\nadjusts the type and level of noise in the data to emulate various realistic\nincomplete situations. Secondly, a Variational AutoEncoder (VAE)-based module\nis employed to reconstruct these robust multimodal joint representations from\nthe noisy inputs. Notably, the introduction of the noise scheduler enables the\nexploration of an entirely new type of incomplete data condition, which is\nimpossible with existing methods. Extensive experimental evaluations on the\nbenchmark datasets IEMOCAP and CMU-MOSEI demonstrate the effectiveness of the\nnoise scheduler and the excellent performance of our proposed model.\n","authors":["Qi Fan","Haolin Zuo","Rui Liu","Zheng Lian","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2311.16114v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04459v1","updated":"2024-05-07T16:24:03Z","published":"2024-05-07T16:24:03Z","title":"A Significantly Better Class of Activation Functions Than ReLU Like\n Activation Functions","summary":" This paper introduces a significantly better class of activation functions\nthan the almost universally used ReLU like and Sigmoidal class of activation\nfunctions. Two new activation functions referred to as the Cone and\nParabolic-Cone that differ drastically from popular activation functions and\nsignificantly outperform these on the CIFAR-10 and Imagenette benchmmarks are\nproposed. The cone activation functions are positive only on a finite interval\nand are strictly negative except at the end-points of the interval, where they\nbecome zero. Thus the set of inputs that produce a positive output for a neuron\nwith cone activation functions is a hyperstrip and not a half-space as is the\nusual case. Since a hyper strip is the region between two parallel\nhyper-planes, it allows neurons to more finely divide the input feature space\ninto positive and negative classes than with infinitely wide half-spaces. In\nparticular the XOR function can be learn by a single neuron with cone-like\nactivation functions. Both the cone and parabolic-cone activation functions are\nshown to achieve higher accuracies with significantly fewer neurons on\nbenchmarks. The results presented in this paper indicate that many nonlinear\nreal-world datasets may be separated with fewer hyperstrips than half-spaces.\nThe Cone and Parabolic-Cone activation functions have larger derivatives than\nReLU and are shown to significantly speedup training.\n","authors":["Mathew Mithra Noel","Yug Oswal"],"pdf_url":"https://arxiv.org/pdf/2405.04459v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.04457v1","updated":"2024-05-07T16:23:06Z","published":"2024-05-07T16:23:06Z","title":"Towards Geographic Inclusion in the Evaluation of Text-to-Image Models","summary":" Rapid progress in text-to-image generative models coupled with their\ndeployment for visual content creation has magnified the importance of\nthoroughly evaluating their performance and identifying potential biases. 
In\npursuit of models that generate images that are realistic, diverse, visually\nappealing, and consistent with the given prompt, researchers and practitioners\noften turn to automated metrics to facilitate scalable and cost-effective\nperformance profiling. However, commonly-used metrics often fail to account for\nthe full diversity of human preference; often even in-depth human evaluations\nface challenges with subjectivity, especially as interpretations of evaluation\ncriteria vary across regions and cultures. In this work, we conduct a large,\ncross-cultural study to study how much annotators in Africa, Europe, and\nSoutheast Asia vary in their perception of geographic representation, visual\nappeal, and consistency in real and generated images from state-of-the art\npublic APIs. We collect over 65,000 image annotations and 20 survey responses.\nWe contrast human annotations with common automated metrics, finding that human\npreferences vary notably across geographic location and that current metrics do\nnot fully account for this diversity. For example, annotators in different\nlocations often disagree on whether exaggerated, stereotypical depictions of a\nregion are considered geographically representative. In addition, the utility\nof automatic evaluations is dependent on assumptions about their set-up, such\nas the alignment of feature extractors with human perception of object\nsimilarity or the definition of \"appeal\" captured in reference datasets used to\nground evaluations. We recommend steps for improved automatic and human\nevaluations.\n","authors":["Melissa Hall","Samuel J. Bell","Candace Ross","Adina Williams","Michal Drozdzal","Adriana Romero Soriano"],"pdf_url":"https://arxiv.org/pdf/2405.04457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16687v2","updated":"2024-05-07T16:13:05Z","published":"2024-04-25T15:36:18Z","title":"NTIRE 2024 Quality Assessment of AI-Generated Content Challenge","summary":" This paper reports on the NTIRE 2024 Quality Assessment of AI-Generated\nContent Challenge, which will be held in conjunction with the New Trends in\nImage Restoration and Enhancement Workshop (NTIRE) at CVPR 2024. This challenge\nis to address a major challenge in the field of image and video processing,\nnamely, Image Quality Assessment (IQA) and Video Quality Assessment (VQA) for\nAI-Generated Content (AIGC). The challenge is divided into the image track and\nthe video track. The image track uses the AIGIQA-20K, which contains 20,000\nAI-Generated Images (AIGIs) generated by 15 popular generative models. The\nimage track has a total of 318 registered participants. A total of 1,646\nsubmissions are received in the development phase, and 221 submissions are\nreceived in the test phase. Finally, 16 participating teams submitted their\nmodels and fact sheets. The video track uses the T2VQA-DB, which contains\n10,000 AI-Generated Videos (AIGVs) generated by 9 popular Text-to-Video (T2V)\nmodels. A total of 196 participants have registered in the video track. A total\nof 991 submissions are received in the development phase, and 185 submissions\nare received in the test phase. Finally, 12 participating teams submitted their\nmodels and fact sheets. 
Some methods have achieved better results than baseline\nmethods, and the winning methods in both tracks have demonstrated superior\nprediction performance on AIGC.\n","authors":["Xiaohong Liu","Xiongkuo Min","Guangtao Zhai","Chunyi Li","Tengchuan Kou","Wei Sun","Haoning Wu","Yixuan Gao","Yuqin Cao","Zicheng Zhang","Xiele Wu","Radu Timofte","Fei Peng","Huiyuan Fu","Anlong Ming","Chuanming Wang","Huadong Ma","Shuai He","Zifei Dou","Shu Chen","Huacong Zhang","Haiyi Xie","Chengwei Wang","Baoying Chen","Jishen Zeng","Jianquan Yang","Weigang Wang","Xi Fang","Xiaoxin Lv","Jun Yan","Tianwu Zhi","Yabin Zhang","Yaohui Li","Yang Li","Jingwen Xu","Jianzhao Liu","Yiting Liao","Junlin Li","Zihao Yu","Yiting Lu","Xin Li","Hossein Motamednia","S. Farhad Hosseini-Benvidi","Fengbin Guan","Ahmad Mahmoudi-Aznaveh","Azadeh Mansouri","Ganzorig Gankhuyag","Kihwan Yoon","Yifang Xu","Haotian Fan","Fangyuan Kong","Shiling Zhao","Weifeng Dong","Haibing Yin","Li Zhu","Zhiling Wang","Bingchen Huang","Avinab Saha","Sandeep Mishra","Shashank Gupta","Rajesh Sureddi","Oindrila Saha","Luigi Celona","Simone Bianco","Paolo Napoletano","Raimondo Schettini","Junfeng Yang","Jing Fu","Wei Zhang","Wenzhi Cao","Limei Liu","Han Peng","Weijun Yuan","Zhan Li","Yihang Cheng","Yifan Deng","Haohui Li","Bowen Qu","Yao Li","Shuqing Luo","Shunzhou Wang","Wei Gao","Zihao Lu","Marcos V. Conde","Xinrui Wang","Zhibo Chen","Ruling Liao","Yan Ye","Qiulin Wang","Bing Li","Zhaokun Zhou","Miao Geng","Rui Chen","Xin Tao","Xiaoyu Liang","Shangkun Sun","Xingyuan Ma","Jiaze Li","Mengduo Yang","Haoran Xu","Jie Zhou","Shiding Zhu","Bohan Yu","Pengfei Chen","Xinrui Xu","Jiabin Shen","Zhichao Duan","Erfan Asadi","Jiahe Liu","Qi Yan","Youran Qu","Xiaohui Zeng","Lele Wang","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2404.16687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04442v1","updated":"2024-05-07T16:07:05Z","published":"2024-05-07T16:07:05Z","title":"AugmenTory: A Fast and Flexible Polygon Augmentation Library","summary":" Data augmentation is a key technique for addressing the challenge of limited\ndatasets, which have become a major component in the training procedures of\nimage processing. Techniques such as geometric transformations and color space\nadjustments have been thoroughly tested for their ability to artificially\nexpand training datasets and generate semi-realistic data for training\npurposes. Data augmentation is the most important key to addressing the\nchallenge of limited datasets, which have become a major component of image\nprocessing training procedures. Data augmentation techniques, such as geometric\ntransformations and color space adjustments, are thoroughly tested for their\nability to artificially expand training datasets and generate semi-realistic\ndata for training purposes. Polygons play a crucial role in instance\nsegmentation and have seen a surge in use across advanced models, such as\nYOLOv8. Despite their growing popularity, the lack of specialized libraries\nhampers the polygon-augmentation process. This paper introduces a novel\nsolution to this challenge, embodied in the newly developed AugmenTory library.\nNotably, AugmenTory offers reduced computational demands in both time and space\ncompared to existing methods. Additionally, the library includes a\npostprocessing thresholding feature. 
The AugmenTory package is publicly\navailable on GitHub, where interested users can access the source code:\nhttps://github.com/Smartory/AugmenTory\n","authors":["Tanaz Ghahremani","Mohammad Hoseyni","Mohammad Javad Ahmadi","Pouria Mehrabi","Amirhossein Nikoofard"],"pdf_url":"https://arxiv.org/pdf/2405.04442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04416v1","updated":"2024-05-07T15:41:20Z","published":"2024-05-07T15:41:20Z","title":"DistGrid: Scalable Scene Reconstruction with Distributed\n Multi-resolution Hash Grid","summary":" Neural Radiance Field~(NeRF) achieves extremely high quality in object-scaled\nand indoor scene reconstruction. However, there exist some challenges when\nreconstructing large-scale scenes. MLP-based NeRFs suffer from limited network\ncapacity, while volume-based NeRFs are heavily memory-consuming when the scene\nresolution increases. Recent approaches propose to geographically partition the\nscene and learn each sub-region using an individual NeRF. Such partitioning\nstrategies help volume-based NeRF exceed the single GPU memory limit and scale\nto larger scenes. However, this approach requires multiple background NeRF to\nhandle out-of-partition rays, which leads to redundancy of learning. Inspired\nby the fact that the background of current partition is the foreground of\nadjacent partition, we propose a scalable scene reconstruction method based on\njoint Multi-resolution Hash Grids, named DistGrid. In this method, the scene is\ndivided into multiple closely-paved yet non-overlapped Axis-Aligned Bounding\nBoxes, and a novel segmented volume rendering method is proposed to handle\ncross-boundary rays, thereby eliminating the need for background NeRFs. The\nexperiments demonstrate that our method outperforms existing methods on all\nevaluated large-scale scenes, and provides visually plausible scene\nreconstruction. The scalability of our method on reconstruction quality is\nfurther evaluated qualitatively and quantitatively.\n","authors":["Sidun Liu","Peng Qiao","Zongxin Ye","Wenyu Li","Yong Dou"],"pdf_url":"https://arxiv.org/pdf/2405.04416v1.pdf","comment":"Originally submitted to Siggraph Asia 2023"},{"id":"http://arxiv.org/abs/2405.04408v1","updated":"2024-05-07T15:35:43Z","published":"2024-05-07T15:35:43Z","title":"DocRes: A Generalist Model Toward Unifying Document Image Restoration\n Tasks","summary":" Document image restoration is a crucial aspect of Document AI systems, as the\nquality of document images significantly influences the overall performance.\nPrevailing methods address distinct restoration tasks independently, leading to\nintricate systems and the incapability to harness the potential synergies of\nmulti-task learning. To overcome this challenge, we propose DocRes, a\ngeneralist model that unifies five document image restoration tasks including\ndewarping, deshadowing, appearance enhancement, deblurring, and binarization.\nTo instruct DocRes to perform various restoration tasks, we propose a novel\nvisual prompt approach called Dynamic Task-Specific Prompt (DTSPrompt). The\nDTSPrompt for different tasks comprises distinct prior features, which are\nadditional characteristics extracted from the input image. Beyond its role as a\ncue for task-specific execution, DTSPrompt can also serve as supplementary\ninformation to enhance the model's performance. Moreover, DTSPrompt is more\nflexible than prior visual prompt approaches as it can be seamlessly applied\nand adapted to inputs with high and variable resolutions. 
Experimental results\ndemonstrate that DocRes achieves competitive or superior performance compared\nto existing state-of-the-art task-specific models. This underscores the\npotential of DocRes across a broader spectrum of document image restoration\ntasks. The source code is publicly available at\nhttps://github.com/ZZZHANG-jx/DocRes\n","authors":["Jiaxin Zhang","Dezhi Peng","Chongyu Liu","Peirong Zhang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2405.04408v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.11305v2","updated":"2024-05-07T15:30:45Z","published":"2024-02-17T15:15:43Z","title":"On Good Practices for Task-Specific Distillation of Large Pretrained\n Visual Models","summary":" Large pretrained visual models exhibit remarkable generalization across\ndiverse recognition tasks. Yet, real-world applications often demand compact\nmodels tailored to specific problems. Variants of knowledge distillation have\nbeen devised for such a purpose, enabling task-specific compact models (the\nstudents) to learn from a generic large pretrained one (the teacher). In this\npaper, we show that the excellent robustness and versatility of recent\npretrained models challenge common practices established in the literature,\ncalling for a new set of optimal guidelines for task-specific distillation. To\naddress the lack of samples in downstream tasks, we also show that a variant of\nMixup based on stable diffusion complements standard data augmentation. This\nstrategy eliminates the need for engineered text prompts and improves\ndistillation of generic models into streamlined specialized networks.\n","authors":["Juliette Marrie","Michael Arbel","Julien Mairal","Diane Larlus"],"pdf_url":"https://arxiv.org/pdf/2402.11305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04404v1","updated":"2024-05-07T15:30:14Z","published":"2024-05-07T15:30:14Z","title":"Vision Mamba: A Comprehensive Survey and Taxonomy","summary":" State Space Model (SSM) is a mathematical model used to describe and analyze\nthe behavior of dynamic systems. This model has witnessed numerous applications\nin several fields, including control theory, signal processing, economics and\nmachine learning. In the field of deep learning, state space models are used to\nprocess sequence data, such as time series analysis, natural language\nprocessing (NLP) and video understanding. By mapping sequence data to state\nspace, long-term dependencies in the data can be better captured. In\nparticular, modern SSMs have shown strong representational capabilities in NLP,\nespecially in long sequence modeling, while maintaining linear time complexity.\nNotably, based on the latest state-space models, Mamba merges time-varying\nparameters into SSMs and formulates a hardware-aware algorithm for efficient\ntraining and inference. Given its impressive efficiency and strong long-range\ndependency modeling capability, Mamba is expected to become a new AI\narchitecture that may outperform Transformer. Recently, a number of works have\nattempted to study the potential of Mamba in various fields, such as general\nvision, multi-modal, medical image analysis and remote sensing image analysis,\nby extending Mamba from natural language domain to visual domain. To fully\nunderstand Mamba in the visual domain, we conduct a comprehensive survey and\npresent a taxonomy study. 
This survey focuses on Mamba's application to a\nvariety of visual tasks and data types, and discusses its predecessors, recent\nadvances and far-reaching impact on a wide range of domains. Since Mamba is now\non an upward trend, please actively notice us if you have new findings, and new\nprogress on Mamba will be included in this survey in a timely manner and\nupdated on the Mamba project at\nhttps://github.com/lx6c78/Vision-Mamba-A-Comprehensive-Survey-and-Taxonomy.\n","authors":["Xiao Liu","Chenxu Zhang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.04404v1.pdf","comment":"https://github.com/lx6c78/Vision-Mamba-A-Comprehensive-Survey-and-Taxonomy"},{"id":"http://arxiv.org/abs/2405.04403v1","updated":"2024-05-07T15:29:48Z","published":"2024-05-07T15:29:48Z","title":"Learning To See But Forgetting To Follow: Visual Instruction Tuning\n Makes LLMs More Prone To Jailbreak Attacks","summary":" Augmenting Large Language Models (LLMs) with image-understanding capabilities\nhas resulted in a boom of high-performing Vision-Language models (VLMs). While\nstudying the alignment of LLMs to human values has received widespread\nattention, the safety of VLMs has not received the same attention. In this\npaper, we explore the impact of jailbreaking on three state-of-the-art VLMs,\neach using a distinct modeling approach. By comparing each VLM to their\nrespective LLM backbone, we find that each VLM is more susceptible to\njailbreaking. We consider this as an undesirable outcome from visual\ninstruction-tuning, which imposes a forgetting effect on an LLM's safety\nguardrails. Therefore, we provide recommendations for future work based on\nevaluation strategies that aim to highlight the weaknesses of a VLM, as well as\ntake safety measures into account during visual instruction tuning.\n","authors":["Georgios Pantazopoulos","Amit Parekh","Malvina Nikandrou","Alessandro Suglia"],"pdf_url":"https://arxiv.org/pdf/2405.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00761v3","updated":"2024-05-07T15:26:02Z","published":"2023-12-01T18:29:08Z","title":"Deep Unlearning: Fast and Efficient Training-free Approach to Class\n Forgetting","summary":" Machine unlearning is a prominent and challenging field, driven by regulatory\ndemands for user data deletion and heightened privacy awareness. Existing\napproaches involve retraining model or multiple finetuning steps for each\ndeletion request, often constrained by computational limits and restricted data\naccess. In this work, we introduce a novel class unlearning algorithm designed\nto strategically eliminate specific classes from the learned model. Our\nalgorithm first estimates the Retain and the Forget Spaces using Singular Value\nDecomposition on the layerwise activations for a small subset of samples from\nthe retain and unlearn classes, respectively. We then compute the shared\ninformation between these spaces and remove it from the forget space to isolate\nclass-discriminatory feature space. Finally, we obtain the unlearned model by\nupdating the weights to suppress the class discriminatory features from the\nactivation spaces. We demonstrate our algorithm's efficacy on ImageNet using a\nVision Transformer with only $\\sim 1.5\\%$ drop in retain accuracy compared to\nthe original model while maintaining under $1\\%$ accuracy on the unlearned\nclass samples. 
Further, our algorithm consistently performs well when subject\nto Membership Inference Attacks showing $7.8\\%$ improvement on average across a\nvariety of image classification datasets and network architectures, as compared\nto other baselines while being $\\sim 6 \\times$ more computationally efficient.\nOur code is available at https://github.com/sangamesh-kodge/class_forgetting.\n","authors":["Sangamesh Kodge","Gobinda Saha","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.00761v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04392v1","updated":"2024-05-07T15:14:49Z","published":"2024-05-07T15:14:49Z","title":"BILTS: A novel bi-invariant local trajectory-shape descriptor for\n rigid-body motion","summary":" Measuring the similarity between motions and established motion models is\ncrucial for motion analysis, recognition, generation, and adaptation. To\nenhance similarity measurement across diverse contexts, invariant motion\ndescriptors have been proposed. However, for rigid-body motion, few invariant\ndescriptors exist that are bi-invariant, meaning invariant to both the body and\nworld reference frames used to describe the motion. Moreover, their robustness\nto singularities is limited. This paper introduces a novel Bi-Invariant Local\nTrajectory-Shape descriptor (BILTS) and a corresponding dissimilarity measure.\nMathematical relationships between BILTS and existing descriptors are derived,\nproviding new insights into their properties. The paper also includes an\nalgorithm to reproduce the motion from the BILTS descriptor, demonstrating its\nbidirectionality and usefulness for trajectory generation. Experimental\nvalidation using datasets of daily-life activities shows the higher robustness\nof the BILTS descriptor compared to the bi-invariant ISA descriptor. This\nhigher robustness supports the further application of bi-invariant descriptors\nfor motion recognition and generalization.\n","authors":["Arno Verduyn","Erwin Aertbeliën","Glenn Maes","Joris De Schutter","Maxim Vochten"],"pdf_url":"https://arxiv.org/pdf/2405.04392v1.pdf","comment":"This work has been submitted as a regular research paper for\n consideration in the IEEE Transactions on Robotics. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2405.04390v1","updated":"2024-05-07T15:14:20Z","published":"2024-05-07T15:14:20Z","title":"DriveWorld: 4D Pre-trained Scene Understanding via World Models for\n Autonomous Driving","summary":" Vision-centric autonomous driving has recently raised wide attention due to\nits lower cost. Pre-training is essential for extracting a universal\nrepresentation. However, current vision-centric pre-training typically relies\non either 2D or 3D pre-text tasks, overlooking the temporal characteristics of\nautonomous driving as a 4D scene understanding task. In this paper, we address\nthis challenge by introducing a world model-based autonomous driving 4D\nrepresentation learning framework, dubbed \\emph{DriveWorld}, which is capable\nof pre-training from multi-camera driving videos in a spatio-temporal fashion.\nSpecifically, we propose a Memory State-Space Model for spatio-temporal\nmodelling, which consists of a Dynamic Memory Bank module for learning\ntemporal-aware latent dynamics to predict future changes and a Static Scene\nPropagation module for learning spatial-aware latent statics to offer\ncomprehensive scene contexts. 
We additionally introduce a Task Prompt to\ndecouple task-aware features for various downstream tasks. The experiments\ndemonstrate that DriveWorld delivers promising results on various autonomous\ndriving tasks. When pre-trained with the OpenScene dataset, DriveWorld achieves\na 7.5% increase in mAP for 3D object detection, a 3.0% increase in IoU for\nonline mapping, a 5.0% increase in AMOTA for multi-object tracking, a 0.1m\ndecrease in minADE for motion forecasting, a 3.0% increase in IoU for occupancy\nprediction, and a 0.34m reduction in average L2 error for planning.\n","authors":["Chen Min","Dawei Zhao","Liang Xiao","Jian Zhao","Xinli Xu","Zheng Zhu","Lei Jin","Jianshu Li","Yulan Guo","Junliang Xing","Liping Jing","Yiming Nie","Bin Dai"],"pdf_url":"https://arxiv.org/pdf/2405.04390v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2405.04378v1","updated":"2024-05-07T15:00:19Z","published":"2024-05-07T15:00:19Z","title":"$\\textbf{Splat-MOVER}$: Multi-Stage, Open-Vocabulary Robotic\n Manipulation via Editable Gaussian Splatting","summary":" We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic\nmanipulation, which leverages the editability of Gaussian Splatting (GSplat)\nscene representations to enable multi-stage manipulation tasks. Splat-MOVER\nconsists of: (i) $\\textit{ASK-Splat}$, a GSplat representation that distills\nlatent codes for language semantics and grasp affordance into the 3D scene.\nASK-Splat enables geometric, semantic, and affordance understanding of 3D\nscenes, which is critical for many robotics tasks; (ii) $\\textit{SEE-Splat}$, a\nreal-time scene-editing module using 3D semantic masking and infilling to\nvisualize the motions of objects that result from robot interactions in the\nreal-world. SEE-Splat creates a \"digital twin\" of the evolving environment\nthroughout the manipulation task; and (iii) $\\textit{Grasp-Splat}$, a grasp\ngeneration module that uses ASK-Splat and SEE-Splat to propose candidate grasps\nfor open-world objects. ASK-Splat is trained in real-time from RGB images in a\nbrief scanning phase prior to operation, while SEE-Splat and Grasp-Splat run in\nreal-time during operation. We demonstrate the superior performance of\nSplat-MOVER in hardware experiments on a Kinova robot compared to two recent\nbaselines in four single-stage, open-vocabulary manipulation tasks, as well as\nin four multi-stage manipulation tasks using the edited scene to reflect scene\nchanges due to prior manipulation stages, which is not possible with the\nexisting baselines. Code for this project and a link to the project page will\nbe made available soon.\n","authors":["Ola Shorinwa","Johnathan Tucker","Aliyah Smith","Aiden Swann","Timothy Chen","Roya Firoozi","Monroe Kennedy III","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2405.04378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04377v1","updated":"2024-05-07T15:00:11Z","published":"2024-05-07T15:00:11Z","title":"Choose What You Need: Disentangled Representation Learning for Scene\n Text Recognition, Removal and Editing","summary":" Scene text images contain not only style information (font, background) but\nalso content information (character, texture). Different scene text tasks need\ndifferent information, but previous representation learning methods use tightly\ncoupled features for all tasks, resulting in sub-optimal performance. 
We\npropose a Disentangled Representation Learning framework (DARLING) aimed at\ndisentangling these two types of features for improved adaptability in better\naddressing various downstream tasks (choose what you really need).\nSpecifically, we synthesize a dataset of image pairs with identical style but\ndifferent content. Based on the dataset, we decouple the two types of features\nby the supervision design. Clearly, we directly split the visual representation\ninto style and content features, the content features are supervised by a text\nrecognition loss, while an alignment loss aligns the style features in the\nimage pairs. Then, style features are employed in reconstructing the\ncounterpart image via an image decoder with a prompt that indicates the\ncounterpart's content. Such an operation effectively decouples the features\nbased on their distinctive properties. To the best of our knowledge, this is\nthe first time in the field of scene text that disentangles the inherent\nproperties of the text images. Our method achieves state-of-the-art performance\nin Scene Text Recognition, Removal, and Editing.\n","authors":["Boqiang Zhang","Hongtao Xie","Zuan Gao","Yuxin Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04377v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.13288v3","updated":"2024-05-07T14:56:00Z","published":"2024-04-20T06:25:32Z","title":"PoseINN: Realtime Visual-based Pose Regression and Localization with\n Invertible Neural Networks","summary":" Estimating ego-pose from cameras is an important problem in robotics with\napplications ranging from mobile robotics to augmented reality. While SOTA\nmodels are becoming increasingly accurate, they can still be unwieldy due to\nhigh computational costs. In this paper, we propose to solve the problem by\nusing invertible neural networks (INN) to find the mapping between the latent\nspace of images and poses for a given scene. Our model achieves similar\nperformance to the SOTA while being faster to train and only requiring offline\nrendering of low-resolution synthetic data. By using normalizing flows, the\nproposed method also provides uncertainty estimation for the output. We also\ndemonstrated the efficiency of this method by deploying the model on a mobile\nrobot.\n","authors":["Zirui Zang","Ahmad Amine","Rahul Mangharam"],"pdf_url":"https://arxiv.org/pdf/2404.13288v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04370v1","updated":"2024-05-07T14:51:05Z","published":"2024-05-07T14:51:05Z","title":"Diff-IP2D: Diffusion-Based Hand-Object Interaction Prediction on\n Egocentric Videos","summary":" Understanding how humans would behave during hand-object interaction is vital\nfor applications in service robot manipulation and extended reality. To achieve\nthis, some recent works have been proposed to simultaneously predict hand\ntrajectories and object affordances on human egocentric videos. They are\nregarded as the representation of future hand-object interactions, indicating\npotential human motion and motivation. However, the existing approaches mostly\nadopt the autoregressive paradigm for unidirectional prediction, which lacks\nmutual constraints within the holistic future sequence, and accumulates errors\nalong the time axis. Meanwhile, these works basically overlook the effect of\ncamera egomotion on first-person view predictions. 
To address these\nlimitations, we propose a novel diffusion-based interaction prediction method,\nnamely Diff-IP2D, to forecast future hand trajectories and object affordances\nconcurrently in an iterative non-autoregressive manner. We transform the\nsequential 2D images into latent feature space and design a denoising diffusion\nmodel to predict future latent interaction features conditioned on past ones.\nMotion features are further integrated into the conditional denoising process\nto enable Diff-IP2D aware of the camera wearer's dynamics for more accurate\ninteraction prediction. The experimental results show that our method\nsignificantly outperforms the state-of-the-art baselines on both the\noff-the-shelf metrics and our proposed new evaluation protocol. This highlights\nthe efficacy of leveraging a generative paradigm for 2D hand-object interaction\nprediction. The code of Diff-IP2D will be released at\nhttps://github.com/IRMVLab/Diff-IP2D.\n","authors":["Junyi Ma","Jingyi Xu","Xieyuanli Chen","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05739v2","updated":"2024-05-07T14:50:00Z","published":"2023-08-10T17:57:22Z","title":"Zero Grads: Learning Local Surrogate Losses for Non-Differentiable\n Graphics","summary":" Gradient-based optimization is now ubiquitous across graphics, but\nunfortunately can not be applied to problems with undefined or zero gradients.\nTo circumvent this issue, the loss function can be manually replaced by a\n``surrogate'' that has similar minima but is differentiable. Our proposed\nframework, ZeroGrads, automates this process by learning a neural approximation\nof the objective function, which in turn can be used to differentiate through\narbitrary black-box graphics pipelines. We train the surrogate on an actively\nsmoothed version of the objective and encourage locality, focusing the\nsurrogate's capacity on what matters at the current training episode. The\nfitting is performed online, alongside the parameter optimization, and\nself-supervised, without pre-computed data or pre-trained models. As sampling\nthe objective is expensive (it requires a full rendering or simulator run), we\ndevise an efficient sampling scheme that allows for tractable run-times and\ncompetitive performance at little overhead. We demonstrate optimizing diverse\nnon-convex, non-differentiable black-box problems in graphics, such as\nvisibility in rendering, discrete parameter spaces in procedural modelling or\noptimal control in physics-driven animation. In contrast to other\nderivative-free algorithms, our approach scales well to higher dimensions,\nwhich we demonstrate on problems with up to 35k interlinked variables.\n","authors":["Michael Fischer","Tobias Ritschel"],"pdf_url":"https://arxiv.org/pdf/2308.05739v2.pdf","comment":"Accepted at SIGGRAPH 2024. Project page:\n https://mfischer-ucl.github.io/zerograds"},{"id":"http://arxiv.org/abs/2403.03173v4","updated":"2024-05-07T14:34:34Z","published":"2024-03-05T18:08:29Z","title":"Solving the bongard-logo problem by modeling a probabilistic model","summary":" Abstract reasoning problems challenge the perceptual and cognitive abilities\nof AI algorithms, demanding deeper pattern discernment and inductive reasoning\nbeyond explicit image features. This study introduces PMoC, a tailored\nprobability model for the Bongard-Logo problem, achieving high reasoning\naccuracy by constructing independent probability models. 
Additionally, we\npresent Pose-Transformer, an enhanced Transformer-Encoder designed for complex\nabstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM.\nPose-Transformer incorporates positional information learning, inspired by\ncapsule networks' pose matrices, enhancing its focus on local positional\nrelationships in image data processing. When integrated with PMoC, it further\nimproves reasoning accuracy. Our approach effectively addresses reasoning\ndifficulties associated with abstract entities' positional changes,\noutperforming previous models on the OIG, D3$\\times$3 subsets of RAVEN, and PGM\ndatabases. This research contributes to advancing AI's capabilities in abstract\nreasoning and cognitive pattern recognition.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03173v4.pdf","comment":"14 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.04356v1","updated":"2024-05-07T14:33:40Z","published":"2024-05-07T14:33:40Z","title":"Diffusion-driven GAN Inversion for Multi-Modal Face Image Generation","summary":" We present a new multi-modal face image generation method that converts a\ntext prompt and a visual input, such as a semantic mask or scribble map, into a\nphoto-realistic face image. To do this, we combine the strengths of Generative\nAdversarial networks (GANs) and diffusion models (DMs) by employing the\nmulti-modal features in the DM into the latent space of the pre-trained GANs.\nWe present a simple mapping and a style modulation network to link two models\nand convert meaningful representations in feature maps and attention maps into\nlatent codes. With GAN inversion, the estimated latent codes can be used to\ngenerate 2D or 3D-aware facial images. We further present a multi-step training\nstrategy that reflects textual and structural representations into the\ngenerated image. Our proposed network produces realistic 2D, multi-view, and\nstylized face images, which align well with inputs. We validate our method by\nusing pre-trained 2D and 3D GANs, and our results outperform existing methods.\nOur project page is available at\nhttps://github.com/1211sh/Diffusion-driven_GAN-Inversion/.\n","authors":["Jihyun Kim","Changjae Oh","Hoseok Do","Soohyun Kim","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2405.04356v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.16854v3","updated":"2024-05-07T14:29:18Z","published":"2023-11-28T15:03:53Z","title":"A Unified Approach for Text- and Image-guided 4D Scene Generation","summary":" Large-scale diffusion generative models are greatly simplifying image, video\nand 3D asset creation from user-provided text prompts and images. However, the\nchallenging problem of text-to-4D dynamic 3D scene generation with diffusion\nguidance remains largely unexplored. We propose Dream-in-4D, which features a\nnovel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D\ndiffusion guidance to effectively learn a high-quality static 3D asset in the\nfirst stage; (2) a deformable neural radiance field that explicitly\ndisentangles the learned static asset from its deformation, preserving quality\nduring motion learning; and (3) a multi-resolution feature grid for the\ndeformation field with a displacement total variation loss to effectively learn\nmotion with video diffusion guidance in the second stage. 
Through a user\npreference study, we demonstrate that our approach significantly advances image\nand motion quality, 3D consistency and text fidelity for text-to-4D generation\ncompared to baseline approaches. Thanks to its motion-disentangled\nrepresentation, Dream-in-4D can also be easily adapted for controllable\ngeneration where appearance is defined by one or multiple images, without the\nneed to modify the motion learning stage. Thus, our method offers, for the\nfirst time, a unified approach for text-to-4D, image-to-4D and personalized 4D\ngeneration tasks.\n","authors":["Yufeng Zheng","Xueting Li","Koki Nagano","Sifei Liu","Karsten Kreis","Otmar Hilliges","Shalini De Mello"],"pdf_url":"https://arxiv.org/pdf/2311.16854v3.pdf","comment":"Project page: https://research.nvidia.com/labs/nxp/dream-in-4d/"},{"id":"http://arxiv.org/abs/2405.04345v1","updated":"2024-05-07T14:22:32Z","published":"2024-05-07T14:22:32Z","title":"Novel View Synthesis with Neural Radiance Fields for Industrial Robot\n Applications","summary":" Neural Radiance Fields (NeRFs) have become a rapidly growing research field\nwith the potential to revolutionize typical photogrammetric workflows, such as\nthose used for 3D scene reconstruction. As input, NeRFs require multi-view\nimages with corresponding camera poses as well as the interior orientation. In\nthe typical NeRF workflow, the camera poses and the interior orientation are\nestimated in advance with Structure from Motion (SfM). But the quality of the\nresulting novel views, which depends on different parameters such as the number\nand distribution of available images, as well as the accuracy of the related\ncamera poses and interior orientation, is difficult to predict. In addition,\nSfM is a time-consuming pre-processing step, and its quality strongly depends\non the image content. Furthermore, the undefined scaling factor of SfM hinders\nsubsequent steps in which metric information is required. In this paper, we\nevaluate the potential of NeRFs for industrial robot applications. We propose\nan alternative to SfM pre-processing: we capture the input images with a\ncalibrated camera that is attached to the end effector of an industrial robot\nand determine accurate camera poses with metric scale based on the robot\nkinematics. We then investigate the quality of the novel views by comparing\nthem to ground truth, and by computing an internal quality measure based on\nensemble methods. For evaluation purposes, we acquire multiple datasets that\npose challenges for reconstruction typical of industrial applications, like\nreflective objects, poor texture, and fine structures. We show that the\nrobot-based pose determination reaches similar accuracy as SfM in non-demanding\ncases, while having clear advantages in more challenging scenarios. 
Finally, we\npresent first results of applying the ensemble method to estimate the quality\nof the synthetic novel view in the absence of a ground truth.\n","authors":["Markus Hillemann","Robert Langendörfer","Max Heiken","Max Mehltretter","Andreas Schenk","Martin Weinmann","Stefan Hinz","Christian Heipke","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2405.04345v1.pdf","comment":"8 pages, 8 figures, accepted for publication in The International\n Archives of the Photogrammetry, Remote Sensing and Spatial Information\n Sciences (ISPRS Archives) 2024"},{"id":"http://arxiv.org/abs/2402.17323v2","updated":"2024-05-07T14:19:13Z","published":"2024-02-27T09:01:03Z","title":"SDDGR: Stable Diffusion-based Deep Generative Replay for Class\n Incremental Object Detection","summary":" In the field of class incremental learning (CIL), generative replay has\nbecome increasingly prominent as a method to mitigate the catastrophic\nforgetting, alongside the continuous improvements in generative models.\nHowever, its application in class incremental object detection (CIOD) has been\nsignificantly limited, primarily due to the complexities of scenes involving\nmultiple labels. In this paper, we propose a novel approach called stable\ndiffusion deep generative replay (SDDGR) for CIOD. Our method utilizes a\ndiffusion-based generative model with pre-trained text-to-diffusion networks to\ngenerate realistic and diverse synthetic images. SDDGR incorporates an\niterative refinement strategy to produce high-quality images encompassing old\nclasses. Additionally, we adopt an L2 knowledge distillation technique to\nimprove the retention of prior knowledge in synthetic images. Furthermore, our\napproach includes pseudo-labeling for old objects within new task images,\npreventing misclassification as background elements. Extensive experiments on\nthe COCO 2017 dataset demonstrate that SDDGR significantly outperforms existing\nalgorithms, achieving a new state-of-the-art in various CIOD scenarios. The\nsource code will be made available to the public.\n","authors":["Junsu Kim","Hoseong Cho","Jihyeon Kim","Yihalem Yimolal Tiruneh","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2402.17323v2.pdf","comment":"Accept to CVPR 2024. The camera-ready version"},{"id":"http://arxiv.org/abs/2404.18539v2","updated":"2024-05-07T13:55:57Z","published":"2024-04-29T09:27:31Z","title":"Enhancing Boundary Segmentation for Topological Accuracy with\n Skeleton-based Methods","summary":" Topological consistency plays a crucial role in the task of boundary\nsegmentation for reticular images, such as cell membrane segmentation in neuron\nelectron microscopic images, grain boundary segmentation in material\nmicroscopic images and road segmentation in aerial images. In these fields,\ntopological changes in segmentation results have a serious impact on the\ndownstream tasks, which can even exceed the misalignment of the boundary\nitself. To enhance the topology accuracy in segmentation results, we propose\nthe Skea-Topo Aware loss, which is a novel loss function that takes into\naccount the shape of each object and topological significance of the pixels. It\nconsists of two components. First, a skeleton-aware weighted loss improves the\nsegmentation accuracy by better modeling the object geometry with skeletons.\nSecond, a boundary rectified term effectively identifies and emphasizes\ntopological critical pixels in the prediction errors using both foreground and\nbackground skeletons in the ground truth and predictions. 
Experiments prove\nthat our method improves topological consistency by up to 7 points in VI\ncompared to 13 state-of-art methods, based on objective and subjective\nassessments across three different boundary segmentation datasets. The code is\navailable at https://github.com/clovermini/Skea_topo.\n","authors":["Chuni Liu","Boyuan Ma","Xiaojuan Ban","Yujie Xie","Hao Wang","Weihua Xue","Jingchao Ma","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2404.18539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04327v1","updated":"2024-05-07T13:55:50Z","published":"2024-05-07T13:55:50Z","title":"Audio-Visual Speech Representation Expert for Enhanced Talking Face\n Video Generation and Evaluation","summary":" In the task of talking face generation, the objective is to generate a face\nvideo with lips synchronized to the corresponding audio while preserving visual\ndetails and identity information. Current methods face the challenge of\nlearning accurate lip synchronization while avoiding detrimental effects on\nvisual quality, as well as robustly evaluating such synchronization. To tackle\nthese problems, we propose utilizing an audio-visual speech representation\nexpert (AV-HuBERT) for calculating lip synchronization loss during training.\nMoreover, leveraging AV-HuBERT's features, we introduce three novel lip\nsynchronization evaluation metrics, aiming to provide a comprehensive\nassessment of lip synchronization performance. Experimental results, along with\na detailed ablation study, demonstrate the effectiveness of our approach and\nthe utility of the proposed evaluation metrics.\n","authors":["Dogucan Yaman","Fevziye Irem Eyiokur","Leonard Bärmann","Seymanur Aktı","Hazım Kemal Ekenel","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2405.04327v1.pdf","comment":"CVPR2024 NTIRE Workshop"},{"id":"http://arxiv.org/abs/2312.17641v2","updated":"2024-05-07T13:42:52Z","published":"2023-12-29T15:08:06Z","title":"Motion State: A New Benchmark Multiple Object Tracking","summary":" In the realm of video analysis, the field of multiple object tracking (MOT)\nassumes paramount importance, with the motion state of objects-whether static\nor dynamic relative to the ground-holding practical significance across diverse\nscenarios. However, the extant literature exhibits a notable dearth in the\nexploration of this aspect. Deep learning methodologies encounter challenges in\naccurately discerning object motion states, while conventional approaches\nreliant on comprehensive mathematical modeling may yield suboptimal tracking\naccuracy. To address these challenges, we introduce a Model-Data-Driven Motion\nState Judgment Object Tracking Method (MoD2T). This innovative architecture\nadeptly amalgamates traditional mathematical modeling with deep learning-based\nmulti-object tracking frameworks. The integration of mathematical modeling and\ndeep learning within MoD2T enhances the precision of object motion state\ndetermination, thereby elevating tracking accuracy. Our empirical\ninvestigations comprehensively validate the efficacy of MoD2T across varied\nscenarios, encompassing unmanned aerial vehicle surveillance and street-level\ntracking. Furthermore, to gauge the method's adeptness in discerning object\nmotion states, we introduce the Motion State Validation F1 (MVF1) metric. This\nnovel performance metric aims to quantitatively assess the accuracy of motion\nstate classification, furnishing a comprehensive evaluation of MoD2T's\nperformance. 
Elaborate experimental validations corroborate the rationality of\nMVF1. In order to holistically appraise MoD2T's performance, we meticulously\nannotate several renowned datasets and subject MoD2T to stringent testing.\nRemarkably, under conditions characterized by minimal or moderate camera\nmotion, the achieved MVF1 values are particularly noteworthy, with exemplars\nincluding 0.774 for the KITTI dataset, 0.521 for MOT17, and 0.827 for UAVDT.\n","authors":["Yang Feng","Liao Pan","Wu Di","Liu Bo","Zhang Xingle"],"pdf_url":"https://arxiv.org/pdf/2312.17641v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04312v1","updated":"2024-05-07T13:35:58Z","published":"2024-05-07T13:35:58Z","title":"Inf-DiT: Upsampling Any-Resolution Image with Memory-Efficient Diffusion\n Transformer","summary":" Diffusion models have shown remarkable performance in image generation in\nrecent years. However, due to a quadratic increase in memory during generating\nultra-high-resolution images (e.g. 4096*4096), the resolution of generated\nimages is often limited to 1024*1024. In this work, we propose a unidirectional\nblock attention mechanism that can adaptively adjust the memory overhead during\nthe inference process and handle global dependencies. Building on this module,\nwe adopt the DiT structure for upsampling and develop an infinite\nsuper-resolution model capable of upsampling images of various shapes and\nresolutions. Comprehensive experiments show that our model achieves SOTA\nperformance in generating ultra-high-resolution images in both machine and\nhuman evaluation. Compared to commonly used UNet structures, our model can save\nmore than 5x memory when generating 4096*4096 images. The project URL is\nhttps://github.com/THUDM/Inf-DiT.\n","authors":["Zhuoyi Yang","Heyang Jiang","Wenyi Hong","Jiayan Teng","Wendi Zheng","Yuxiao Dong","Ming Ding","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2405.04312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04311v1","updated":"2024-05-07T13:35:51Z","published":"2024-05-07T13:35:51Z","title":"Cross-IQA: Unsupervised Learning for Image Quality Assessment","summary":" Automatic perception of image quality is a challenging problem that impacts\nbillions of Internet and social media users daily. To advance research in this\nfield, we propose a no-reference image quality assessment (NR-IQA) method\ntermed Cross-IQA based on a vision transformer (ViT) model. The proposed Cross-IQA\nmethod can learn image quality features from unlabeled image data. We construct\nthe pretext task of synthesized image reconstruction to extract the image\nquality information in an unsupervised manner based on the ViT block. The pretrained encoder of\nCross-IQA is used to fine-tune a linear regression model for score prediction.\nExperimental results show that Cross-IQA can achieve state-of-the-art\nperformance in assessing the low-frequency degradation information (e.g., color\nchange, blurring, etc.) 
of images compared with the classical full-reference\nIQA and NR-IQA under the same datasets.\n","authors":["Zhen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.04311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04309v1","updated":"2024-05-07T13:33:50Z","published":"2024-05-07T13:33:50Z","title":"Non-rigid Structure-from-Motion: Temporally-smooth Procrustean Alignment\n and Spatially-variant Deformation Modeling","summary":" Even though Non-rigid Structure-from-Motion (NRSfM) has been extensively\nstudied and great progress has been made, there are still key challenges that\nhinder their broad real-world applications: 1) the inherent motion/rotation\nambiguity requires either explicit camera motion recovery with extra constraint\nor complex Procrustean Alignment; 2) existing low-rank modeling of the global\nshape can over-penalize drastic deformations in the 3D shape sequence. This\npaper proposes to resolve the above issues from a spatial-temporal modeling\nperspective. First, we propose a novel Temporally-smooth Procrustean Alignment\nmodule that estimates 3D deforming shapes and adjusts the camera motion by\naligning the 3D shape sequence consecutively. Our new alignment module remedies\nthe requirement of complex reference 3D shape during alignment, which is more\nconductive to non-isotropic deformation modeling. Second, we propose a\nspatial-weighted approach to enforce the low-rank constraint adaptively at\ndifferent locations to accommodate drastic spatially-variant deformation\nreconstruction better. Our modeling outperform existing low-rank based methods,\nand extensive experiments across different datasets validate the effectiveness\nof our method.\n","authors":["Jiawei Shi","Hui Deng","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2405.04309v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2405.04305v1","updated":"2024-05-07T13:27:58Z","published":"2024-05-07T13:27:58Z","title":"A New Dataset and Comparative Study for Aphid Cluster Detection and\n Segmentation in Sorghum Fields","summary":" Aphid infestations are one of the primary causes of extensive damage to wheat\nand sorghum fields and are one of the most common vectors for plant viruses,\nresulting in significant agricultural yield losses. To address this problem,\nfarmers often employ the inefficient use of harmful chemical pesticides that\nhave negative health and environmental impacts. As a result, a large amount of\npesticide is wasted on areas without significant pest infestation. This brings\nto attention the urgent need for an intelligent autonomous system that can\nlocate and spray sufficiently large infestations selectively within the complex\ncrop canopies. We have developed a large multi-scale dataset for aphid cluster\ndetection and segmentation, collected from actual sorghum fields and\nmeticulously annotated to include clusters of aphids. Our dataset comprises a\ntotal of 54,742 image patches, showcasing a variety of viewpoints, diverse\nlighting conditions, and multiple scales, highlighting its effectiveness for\nreal-world applications. In this study, we trained and evaluated four real-time\nsemantic segmentation models and three object detection models specifically for\naphid cluster segmentation and detection. Considering the balance between\naccuracy and efficiency, Fast-SCNN delivered the most effective segmentation\nresults, achieving 80.46% mean precision, 81.21% mean recall, and 91.66 frames\nper second (FPS). 
For object detection, RT-DETR exhibited the best overall\nperformance with a 61.63% mean average precision (mAP), 92.6% mean recall, and\n72.55 on an NVIDIA V100 GPU. Our experiments further indicate that aphid\ncluster segmentation is more suitable for assessing aphid infestations than\nusing detection models.\n","authors":["Raiyan Rahman","Christopher Indris","Goetz Bramesfeld","Tianxiao Zhang","Kaidong Li","Xiangyu Chen","Ivan Grijalva","Brian McCornack","Daniel Flippo","Ajay Sharda","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14322v2","updated":"2024-05-07T13:21:19Z","published":"2024-04-22T16:33:06Z","title":"A Novel Approach to Chest X-ray Lung Segmentation Using U-net and\n Modified Convolutional Block Attention Module","summary":" Lung segmentation in chest X-ray images is of paramount importance as it\nplays a crucial role in the diagnosis and treatment of various lung diseases.\nThis paper presents a novel approach for lung segmentation in chest X-ray\nimages by integrating U-net with attention mechanisms. The proposed method\nenhances the U-net architecture by incorporating a Convolutional Block\nAttention Module (CBAM), which unifies three distinct attention mechanisms:\nchannel attention, spatial attention, and pixel attention. The channel\nattention mechanism enables the model to concentrate on the most informative\nfeatures across various channels. The spatial attention mechanism enhances the\nmodel's precision in localization by focusing on significant spatial locations.\nLastly, the pixel attention mechanism empowers the model to focus on individual\npixels, further refining the model's focus and thereby improving the accuracy\nof segmentation. The adoption of the proposed CBAM in conjunction with the\nU-net architecture marks a significant advancement in the field of medical\nimaging, with potential implications for improving diagnostic precision and\npatient outcomes. The efficacy of this method is validated against contemporary\nstate-of-the-art techniques, showcasing its superiority in segmentation\nperformance.\n","authors":["Mohammad Ali Labbaf Khaniki","Mohammad Manthouri"],"pdf_url":"https://arxiv.org/pdf/2404.14322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11360v2","updated":"2024-05-07T13:15:47Z","published":"2023-12-18T17:17:08Z","title":"Paint-it: Text-to-Texture Synthesis via Deep Convolutional Texture Map\n Optimization and Physically-Based Rendering","summary":" We present Paint-it, a text-driven high-fidelity texture map synthesis method\nfor 3D meshes via neural re-parameterized texture optimization. Paint-it\nsynthesizes texture maps from a text description by\nsynthesis-through-optimization, exploiting the Score-Distillation Sampling\n(SDS). We observe that directly applying SDS yields undesirable texture quality\ndue to its noisy gradients. We reveal the importance of texture\nparameterization when using SDS. Specifically, we propose Deep Convolutional\nPhysically-Based Rendering (DC-PBR) parameterization, which re-parameterizes\nthe physically-based rendering (PBR) texture maps with randomly initialized\nconvolution-based neural kernels, instead of a standard pixel-based\nparameterization. We show that DC-PBR inherently schedules the optimization\ncurriculum according to texture frequency and naturally filters out the noisy\nsignals from SDS. In experiments, Paint-it obtains remarkable quality PBR\ntexture maps within 15 min., given only a text description. 
We demonstrate the\ngeneralizability and practicality of Paint-it by synthesizing high-quality\ntexture maps for large-scale mesh datasets and showing test-time applications\nsuch as relighting and material control using a popular graphics engine.\nProject page: https://kim-youwang.github.io/paint-it\n","authors":["Kim Youwang","Tae-Hyun Oh","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.11360v2.pdf","comment":"CVPR 2024. Project page: https://kim-youwang.github.io/paint-it"},{"id":"http://arxiv.org/abs/2405.04299v1","updated":"2024-05-07T13:15:07Z","published":"2024-05-07T13:15:07Z","title":"ViewFormer: Exploring Spatiotemporal Modeling for Multi-View 3D\n Occupancy Perception via View-Guided Transformers","summary":" 3D occupancy, an advanced perception technology for driving scenarios,\nrepresents the entire scene without distinguishing between foreground and\nbackground by quantifying the physical space into a grid map. The widely\nadopted projection-first deformable attention, efficient in transforming image\nfeatures into 3D representations, encounters challenges in aggregating\nmulti-view features due to sensor deployment constraints. To address this\nissue, we propose our learning-first view attention mechanism for effective\nmulti-view feature aggregation. Moreover, we showcase the scalability of our\nview attention across diverse multi-view 3D tasks, such as map construction and\n3D object detection. Leveraging the proposed view attention as well as an\nadditional multi-frame streaming temporal attention, we introduce ViewFormer, a\nvision-centric transformer-based framework for spatiotemporal feature\naggregation. To further explore occupancy-level flow representation, we present\nFlowOcc3D, a benchmark built on top of existing high-quality datasets.\nQualitative and quantitative analyses on this benchmark reveal the potential to\nrepresent fine-grained dynamic scenes. Extensive experiments show that our\napproach significantly outperforms prior state-of-the-art methods. The codes\nand benchmark will be released soon.\n","authors":["Jinke Li","Xiao He","Chonghua Zhou","Xiaoqiang Cheng","Yang Wen","Dan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.04299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04295v1","updated":"2024-05-07T13:11:08Z","published":"2024-05-07T13:11:08Z","title":"Semi-Supervised Disease Classification based on Limited Medical Image\n Data","summary":" In recent years, significant progress has been made in the field of learning\nfrom positive and unlabeled examples (PU learning), particularly in the context\nof advancing image and text classification tasks. However, applying PU learning\nto semi-supervised disease classification remains a formidable challenge,\nprimarily due to the limited availability of labeled medical images. In the\nrealm of medical image-aided diagnosis algorithms, numerous theoretical and\npractical obstacles persist. The research on PU learning for medical\nimage-assisted diagnosis holds substantial importance, as it aims to reduce the\ntime spent by professional experts in classifying images. Unlike natural\nimages, medical images are typically accompanied by a scarcity of annotated\ndata, while an abundance of unlabeled cases exists. Addressing these\nchallenges, this paper introduces a novel generative model inspired by H\\\"older\ndivergence, specifically designed for semi-supervised disease classification\nusing positive and unlabeled medical image data. 
In this paper, we present a\ncomprehensive formulation of the problem and establish its theoretical\nfeasibility through rigorous mathematical analysis. To evaluate the\neffectiveness of our proposed approach, we conduct extensive experiments on\nfive benchmark datasets commonly used in PU medical learning: BreastMNIST,\nPneumoniaMNIST, BloodMNIST, OCTMNIST, and AMD. The experimental results clearly\ndemonstrate the superiority of our method over existing approaches based on KL\ndivergence. Notably, our approach achieves state-of-the-art performance on all\nfive disease classification benchmarks.\n By addressing the limitations imposed by limited labeled data and harnessing\nthe untapped potential of unlabeled medical images, our novel generative model\npresents a promising direction for enhancing semi-supervised disease\nclassification in the field of medical image analysis.\n","authors":["Yan Zhang","Chun Li","Zhaoxia Liu","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2405.04295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12917v2","updated":"2024-05-07T12:45:07Z","published":"2024-04-19T14:42:42Z","title":"Zero-Shot Stitching in Reinforcement Learning using Relative\n Representations","summary":" Visual Reinforcement Learning is a popular and powerful framework that takes\nfull advantage of the Deep Learning breakthrough. However, it is also known\nthat variations in the input (e.g., different colors of the panorama due to the\nseason of the year) or the task (e.g., changing the speed limit for a car to\nrespect) could require complete retraining of the agents. In this work, we\nleverage recent developments in unifying latent representations to demonstrate\nthat it is possible to combine the components of an agent, rather than retrain\nit from scratch. We build upon the recent relative representations framework\nand adapt it for Visual RL. This allows us to create completely new agents\ncapable of handling environment-task combinations never seen during training.\nOur work paves the road toward a more accessible and flexible use of\nreinforcement learning.\n","authors":["Antonio Pio Ricciardi","Valentino Maiorca","Luca Moschella","Riccardo Marin","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2404.12917v2.pdf","comment":"13 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.07169v4","updated":"2024-05-07T12:43:30Z","published":"2023-03-13T15:12:30Z","title":"Dynamic Event-based Optical Identification and Communication","summary":" Optical identification is often done with spatial or temporal visual pattern\nrecognition and localization. Temporal pattern recognition, depending on the\ntechnology, involves a trade-off between communication frequency, range and\naccurate tracking. We propose a solution with light-emitting beacons that\nimproves this trade-off by exploiting fast event-based cameras and, for\ntracking, sparse neuromorphic optical flow computed with spiking neurons. The\nsystem is embedded in a simulated drone and evaluated in an asset monitoring\nuse case. It is robust to relative movements and enables simultaneous\ncommunication with, and tracking of, multiple moving beacons. 
Finally, in a\nhardware lab prototype, we demonstrate for the first time beacon tracking\nperformed simultaneously with state-of-the-art frequency communication in the\nkHz range.\n","authors":["Axel von Arnim","Jules Lecomte","Naima Elosegui Borras","Stanislaw Wozniak","Angeliki Pantazi"],"pdf_url":"https://arxiv.org/pdf/2303.07169v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04274v1","updated":"2024-05-07T12:42:23Z","published":"2024-05-07T12:42:23Z","title":"Group-aware Parameter-efficient Updating for Content-Adaptive Neural\n Video Compression","summary":" Content-adaptive compression is crucial for enhancing the adaptability of the\npre-trained neural codec for various contents. Although these methods have been\nvery practical in neural image compression (NIC), their application in neural\nvideo compression (NVC) is still limited due to two main aspects: 1), video\ncompression relies heavily on temporal redundancy, therefore updating just one\nor a few frames can lead to significant errors accumulating over time; 2), NVC\nframeworks are generally more complex, with many large components that are not\neasy to update quickly during encoding. To address the previously mentioned\nchallenges, we have developed a content-adaptive NVC technique called\nGroup-aware Parameter-Efficient Updating (GPU). Initially, to minimize error\naccumulation, we adopt a group-aware approach for updating encoder parameters.\nThis involves adopting a patch-based Group of Pictures (GoP) training strategy\nto segment a video into patch-based GoPs, which will be updated to facilitate a\nglobally optimized domain-transferable solution. Subsequently, we introduce a\nparameter-efficient delta-tuning strategy, which is achieved by integrating\nseveral light-weight adapters into each coding component of the encoding\nprocess by both serial and parallel configuration. Such architecture-agnostic\nmodules stimulate the components with large parameters, thereby reducing both\nthe update cost and the encoding time. We incorporate our GPU into the latest\nNVC framework and conduct comprehensive experiments, whose results showcase\noutstanding video compression efficiency across four video benchmarks and\nadaptability of one medical image benchmark.\n","authors":["Zhenghao Chen","Luping Zhou","Zhihao Hu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2405.04274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10087v2","updated":"2024-05-07T12:27:21Z","published":"2024-03-15T08:01:44Z","title":"Monkeypox disease recognition model based on improved SE-InceptionV3","summary":" In the wake of the global spread of monkeypox, accurate disease recognition\nhas become crucial. This study introduces an improved SE-InceptionV3 model,\nembedding the SENet module and incorporating L2 regularization into the\nInceptionV3 framework to enhance monkeypox disease detection. Utilizing the\nKaggle monkeypox dataset, which includes images of monkeypox and similar skin\nconditions, our model demonstrates a noteworthy accuracy of 96.71% on the test\nset, outperforming conventional methods and deep learning models. The SENet\nmodules channel attention mechanism significantly elevates feature\nrepresentation, while L2 regularization ensures robust generalization.\nExtensive experiments validate the models superiority in precision, recall, and\nF1 score, highlighting its effectiveness in differentiating monkeypox lesions\nin diverse and complex cases. 
The study not only provides insights into the\napplication of advanced CNN architectures in medical diagnostics but also opens\navenues for further research in model optimization and hyperparameter tuning\nfor enhanced disease recognition. https://github.com/jzc777/SE-inceptionV3-L2\n","authors":["Junzhuo Chen","Zonghan Lu","Shitong Kang"],"pdf_url":"https://arxiv.org/pdf/2403.10087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08896v2","updated":"2024-05-07T12:15:43Z","published":"2022-12-17T15:19:45Z","title":"Human Image Generation: A Comprehensive Survey","summary":" Image and video synthesis has become a blooming topic in computer vision and\nmachine learning communities along with the developments of deep generative\nmodels, due to its great academic and application value. Many researchers have\nbeen devoted to synthesizing high-fidelity human images as one of the most\ncommonly seen object categories in daily lives, where a large number of studies\nare performed based on various models, task settings and applications. Thus, it\nis necessary to give a comprehensive overview on these variant methods on human\nimage generation. In this paper, we divide human image generation techniques\ninto three paradigms, i.e., data-driven methods, knowledge-guided methods and\nhybrid methods. For each paradigm, the most representative models and the\ncorresponding variants are presented, where the advantages and characteristics\nof different methods are summarized in terms of model architectures. Besides,\nthe main public human image datasets and evaluation metrics in the literature\nare summarized. Furthermore, due to the wide application potentials, the\ntypical downstream usages of synthesized human images are covered. Finally, the\nchallenges and potential opportunities of human image generation are discussed\nto shed light on future research.\n","authors":["Zhen Jia","Zhang Zhang","Liang Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2212.08896v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2405.04251v1","updated":"2024-05-07T12:11:15Z","published":"2024-05-07T12:11:15Z","title":"A General Model for Detecting Learner Engagement: Implementation and\n Evaluation","summary":" Considering learner engagement has a mutual benefit for both learners and\ninstructors. Instructors can help learners increase their attention,\ninvolvement, motivation, and interest. On the other hand, instructors can\nimprove their instructional performance by evaluating the cumulative results of\nall learners and upgrading their training programs. This paper proposes a\ngeneral, lightweight model for selecting and processing features to detect\nlearners' engagement levels while preserving the sequential temporal\nrelationship over time. During training and testing, we analyzed the videos\nfrom the publicly available DAiSEE dataset to capture the dynamic essence of\nlearner engagement. We have also proposed an adaptation policy to find new\nlabels that utilize the affective states of this dataset related to education,\nthereby improving the models' judgment. The suggested model achieves an\naccuracy of 68.57\\% in a specific implementation and outperforms the studied\nstate-of-the-art models detecting learners' engagement levels.\n","authors":["Somayeh Malekshahi","Javad M. 
Kheyridoost","Omid Fatemi"],"pdf_url":"https://arxiv.org/pdf/2405.04251v1.pdf","comment":"13 pages, 2 Postscript figures"},{"id":"http://arxiv.org/abs/2312.07661v3","updated":"2024-05-07T12:00:34Z","published":"2023-12-12T19:00:04Z","title":"CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor","summary":" Existing open-vocabulary image segmentation methods require a fine-tuning\nstep on mask labels and/or image-text datasets. Mask labels are\nlabor-intensive, which limits the number of categories in segmentation\ndatasets. Consequently, the vocabulary capacity of pre-trained VLMs is severely\nreduced after fine-tuning. However, without fine-tuning, VLMs trained under\nweak image-text supervision tend to make suboptimal mask predictions. To\nalleviate these issues, we introduce a novel recurrent framework that\nprogressively filters out irrelevant texts and enhances mask quality without\ntraining efforts. The recurrent unit is a two-stage segmenter built upon a\nfrozen VLM. Thus, our model retains the VLM's broad vocabulary space and equips\nit with segmentation ability. Experiments show that our method outperforms not\nonly the training-free counterparts, but also those fine-tuned with millions of\ndata samples, and sets the new state-of-the-art records for both zero-shot\nsemantic and referring segmentation. Concretely, we improve the current record\nby 28.8, 16.0, and 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context.\n","authors":["Shuyang Sun","Runjia Li","Philip Torr","Xiuye Gu","Siyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.07661v3.pdf","comment":"To appear in CVPR 2024. Project page:\n https://torrvision.com/clip_as_rnn/"},{"id":"http://arxiv.org/abs/2405.04233v1","updated":"2024-05-07T11:52:49Z","published":"2024-05-07T11:52:49Z","title":"Vidu: a Highly Consistent, Dynamic and Skilled Text-to-Video Generator\n with Diffusion Models","summary":" We introduce Vidu, a high-performance text-to-video generator that is capable\nof producing 1080p videos up to 16 seconds in a single generation. Vidu is a\ndiffusion model with U-ViT as its backbone, which unlocks the scalability and\nthe capability for handling long videos. Vidu exhibits strong coherence and\ndynamism, and is capable of generating both realistic and imaginative videos,\nas well as understanding some professional photography techniques, on par with\nSora -- the most powerful reported text-to-video generator. 
Finally, we perform\ninitial experiments on other controllable video generation, including\ncanny-to-video generation, video prediction and subject-driven generation,\nwhich demonstrate promising results.\n","authors":["Fan Bao","Chendong Xiang","Gang Yue","Guande He","Hongzhou Zhu","Kaiwen Zheng","Min Zhao","Shilong Liu","Yaole Wang","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.04233v1.pdf","comment":"Project page at https://www.shengshu-ai.com/vidu"},{"id":"http://arxiv.org/abs/2405.04211v1","updated":"2024-05-07T11:24:37Z","published":"2024-05-07T11:24:37Z","title":"Breast Histopathology Image Retrieval by Attention-based Adversarially\n Regularized Variational Graph Autoencoder with Contrastive Learning-Based\n Feature Extraction","summary":" Breast cancer is a significant global health concern, particularly for women.\nEarly detection and appropriate treatment are crucial in mitigating its impact,\nwith histopathology examinations playing a vital role in swift diagnosis.\nHowever, these examinations often require a substantial workforce and\nexperienced medical experts for proper recognition and cancer grading.\nAutomated image retrieval systems have the potential to assist pathologists in\nidentifying cancerous tissues, thereby accelerating the diagnostic process.\nNevertheless, due to considerable variability among the tissue and cell\npatterns in histological images, proposing an accurate image retrieval model is\nvery challenging.\n This work introduces a novel attention-based adversarially regularized\nvariational graph autoencoder model for breast histological image retrieval.\nAdditionally, we incorporated cluster-guided contrastive learning as the graph\nfeature extractor to boost the retrieval performance. We evaluated the proposed\nmodel's performance on two publicly available datasets of breast cancer\nhistological images and achieved superior or very competitive retrieval\nperformance, with average mAP scores of 96.5% for the BreakHis dataset and\n94.7% for the BACH dataset, and mVP scores of 91.9% and 91.3%, respectively.\n Our proposed retrieval model has the potential to be used in clinical\nsettings to enhance diagnostic performance and ultimately benefit patients.\n","authors":["Nematollah Saeidi","Hossein Karshenas","Bijan Shoushtarian","Sepideh Hatamikia","Ramona Woitek","Amirreza Mahbod"],"pdf_url":"https://arxiv.org/pdf/2405.04211v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2405.01937v2","updated":"2024-05-07T11:15:37Z","published":"2024-05-03T09:02:17Z","title":"An Attention Based Pipeline for Identifying Pre-Cancer Lesions in Head\n and Neck Clinical Images","summary":" Early detection of cancer can help improve patient prognosis by early\nintervention. Head and neck cancer is diagnosed in specialist centres after a\nsurgical biopsy, however, there is a potential for these to be missed leading\nto delayed diagnosis. To overcome these challenges, we present an attention\nbased pipeline that identifies suspected lesions, segments, and classifies them\nas non-dysplastic, dysplastic and cancerous lesions. We propose (a) a vision\ntransformer based Mask R-CNN network for lesion detection and segmentation of\nclinical images, and (b) Multiple Instance Learning (MIL) based scheme for\nclassification. 
Current results show that the segmentation model produces\nsegmentation masks and bounding boxes with up to 82% overlap accuracy score on\nunseen external test data and surpassing reviewed segmentation benchmarks.\nNext, a classification F1-score of 85% on the internal cohort test set. An app\nhas been developed to perform lesion segmentation taken via a smart device.\nFuture work involves employing endoscopic video data for precise early\ndetection and prognosis.\n","authors":["Abdullah Alsalemi","Anza Shakeel","Mollie Clark","Syed Ali Khurram","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2405.01937v2.pdf","comment":"5 pages, 3 figures, accepted in ISBI 2024, update: corrected typos"},{"id":"http://arxiv.org/abs/2405.02929v2","updated":"2024-05-07T10:58:27Z","published":"2024-05-05T13:15:11Z","title":"Unified Dynamic Scanpath Predictors Outperform Individually Trained\n Neural Models","summary":" Previous research on scanpath prediction has mainly focused on group models,\ndisregarding the fact that the scanpaths and attentional behaviors of\nindividuals are diverse. The disregard of these differences is especially\ndetrimental to social human-robot interaction, whereby robots commonly emulate\nhuman gaze based on heuristics or predefined patterns. However, human gaze\npatterns are heterogeneous and varying behaviors can significantly affect the\noutcomes of such human-robot interactions. To fill this gap, we developed a\ndeep learning-based social cue integration model for saliency prediction to\ninstead predict scanpaths in videos. Our model learned scanpaths by recursively\nintegrating fixation history and social cues through a gating mechanism and\nsequential attention. We evaluated our approach on gaze datasets of dynamic\nsocial scenes, observed under the free-viewing condition. The introduction of\nfixation history into our models makes it possible to train a single unified\nmodel rather than the resource-intensive approach of training individual models\nfor each set of scanpaths. We observed that the late neural integration\napproach surpasses early fusion when training models on a large dataset, in\ncomparison to a smaller dataset with a similar distribution. Results also\nindicate that a single unified model, trained on all the observers' scanpaths,\nperforms on par or better than individually trained models. We hypothesize that\nthis outcome is a result of the group saliency representations instilling\nuniversal attention in the model, while the supervisory signal and fixation\nhistory guide it to learn personalized attentional behaviors, providing the\nunified model a benefit over individual models due to its implicit\nrepresentation of universal attention.\n","authors":["Fares Abawi","Di Fu","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2405.02929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19475v2","updated":"2024-05-07T10:53:46Z","published":"2024-04-30T11:43:37Z","title":"TwinDiffusion: Enhancing Coherence and Efficiency in Panoramic Image\n Generation with Diffusion Models","summary":" Diffusion models have emerged as effective tools for generating diverse and\nhigh-quality content. However, their capability in high-resolution image\ngeneration, particularly for panoramic images, still faces challenges such as\nvisible seams and incoherent transitions. 
In this paper, we propose\nTwinDiffusion, an optimized framework designed to address these challenges\nthrough two key innovations: Crop Fusion for quality enhancement and Cross\nSampling for efficiency optimization. We introduce a training-free optimizing\nstage to refine the similarity of the adjacent image areas, as well as an\ninterleaving sampling strategy to yield dynamic patches during the cropping\nprocess. A comprehensive evaluation is conducted to compare TwinDiffusion with\nthe existing methods, considering factors including coherence, fidelity,\ncompatibility, and efficiency. The results demonstrate the superior performance\nof our approach in generating seamless and coherent panoramas, setting a new\nstandard in quality and efficiency for panoramic image generation.\n","authors":["Teng Zhou","Yongchuan Tang"],"pdf_url":"https://arxiv.org/pdf/2404.19475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04191v1","updated":"2024-05-07T10:53:20Z","published":"2024-05-07T10:53:20Z","title":"Effective and Robust Adversarial Training against Data and Label\n Corruptions","summary":" Corruptions due to data perturbations and label noise are prevalent in the\ndatasets from unreliable sources, which poses significant threats to model\ntraining. Despite existing efforts in developing robust models, current\nlearning methods commonly overlook the possible co-existence of both\ncorruptions, limiting the effectiveness and practicability of the model. In\nthis paper, we develop an Effective and Robust Adversarial Training (ERAT)\nframework to simultaneously handle two types of corruption (i.e., data and\nlabel) without prior knowledge of their specifics. We propose a hybrid\nadversarial training surrounding multiple potential adversarial perturbations,\nalongside a semi-supervised learning based on class-rebalancing sample\nselection to enhance the resilience of the model for dual corruption. On the\none hand, in the proposed adversarial training, the perturbation generation\nmodule learns multiple surrogate malicious data perturbations by taking a DNN\nmodel as the victim, while the model is trained to maintain semantic\nconsistency between the original data and the hybrid perturbed data. It is\nexpected to enable the model to cope with unpredictable perturbations in\nreal-world data corruption. On the other hand, a class-rebalancing data\nselection strategy is designed to fairly differentiate clean labels from noisy\nlabels. Semi-supervised learning is performed accordingly by discarding noisy\nlabels. Extensive experiments demonstrate the superiority of the proposed ERAT\nframework.\n","authors":["Peng-Fei Zhang","Zi Huang","Xin-Shun Xu","Guangdong Bai"],"pdf_url":"https://arxiv.org/pdf/2405.04191v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.13699v3","updated":"2024-05-07T10:50:37Z","published":"2024-02-21T11:00:23Z","title":"Explainable Classification Techniques for Quantum Dot Device\n Measurements","summary":" In the physical sciences, there is an increased need for robust feature\nrepresentations of image data: image acquisition, in the generalized sense of\ntwo-dimensional data, is now widespread across a large number of fields,\nincluding quantum information science, which we consider here. While\ntraditional image features are widely utilized in such cases, their use is\nrapidly being supplanted by Neural Network-based techniques that often\nsacrifice explainability in exchange for high accuracy. 
To ameliorate this\ntrade-off, we propose a synthetic data-based technique that results in\nexplainable features. We show, using Explainable Boosting Machines (EBMs), that\nthis method offers superior explainability without sacrificing accuracy.\nSpecifically, we show that there is a meaningful benefit to this technique in\nthe context of quantum dot tuning, where human intervention is necessary at the\ncurrent stage of development.\n","authors":["Daniel Schug","Tyler J. Kovach","M. A. Wolfe","Jared Benson","Sanghyeok Park","J. P. Dodson","J. Corrigan","M. A. Eriksson","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2402.13699v3.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.04189v1","updated":"2024-05-07T10:49:10Z","published":"2024-05-07T10:49:10Z","title":"Artificial Intelligence-powered fossil shark tooth identification:\n Unleashing the potential of Convolutional Neural Networks","summary":" All fields of knowledge are being impacted by Artificial Intelligence. In\nparticular, the Deep Learning paradigm enables the development of data analysis\ntools that support subject matter experts in a variety of sectors, from physics\nup to the recognition of ancient languages. Palaeontology is now observing this\ntrend as well. This study explores the capability of Convolutional Neural\nNetworks (CNNs), a particular class of Deep Learning algorithms specifically\ncrafted for computer vision tasks, to classify images of isolated fossil shark\nteeth gathered from online datasets as well as from the authors$'$ experience\non Peruvian Miocene and Italian Pliocene fossil assemblages. The shark taxa\nthat are included in the final, composite dataset (which consists of more than\none thousand images) are representative of both extinct and extant genera,\nnamely, Carcharhinus, Carcharias, Carcharocles, Chlamydoselachus,\nCosmopolitodus, Galeocerdo, Hemipristis, Notorynchus, Prionace and Squatina. We\ndeveloped a CNN, named SharkNet-X, specifically tailored on our recognition\ntask, reaching a 5-fold cross validated mean accuracy of 0.85 to identify\nimages containing a single shark tooth. Furthermore, we elaborated a\nvisualization of the features extracted from images using the last dense layer\nof the CNN, achieved through the application of the clustering technique t-SNE.\nIn addition, in order to understand and explain the behaviour of the CNN while\ngiving a paleontological point of view on the results, we introduced the\nexplainability method SHAP. To the best of our knowledge, this is the first\ninstance in which this method is applied to the field of palaeontology. The\nmain goal of this work is to showcase how Deep Learning techniques can aid in\nidentifying isolated fossil shark teeth, paving the way for developing new\ninformation tools for automating the recognition and classification of fossils.\n","authors":["Andrea Barucci","Giulia Ciacci","Pietro Liò","Tiago Azevedo","Andrea Di Cencio","Marco Merella","Giovanni Bianucci","Giulia Bosio","Simone Casati","Alberto Collareta"],"pdf_url":"https://arxiv.org/pdf/2405.04189v1.pdf","comment":"40 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.11674v3","updated":"2024-05-07T10:48:32Z","published":"2024-03-18T11:21:52Z","title":"Towards Generalizing to Unseen Domains with Few Labels","summary":" We approach the challenge of addressing semi-supervised domain generalization\n(SSDG). 
Specifically, our aim is to obtain a model that learns\ndomain-generalizable features by leveraging a limited subset of labelled data\nalongside a substantially larger pool of unlabeled data. Existing domain\ngeneralization (DG) methods which are unable to exploit unlabeled data perform\npoorly compared to semi-supervised learning (SSL) methods under SSDG setting.\nNevertheless, SSL methods have considerable room for performance improvement\nwhen compared to fully-supervised DG training. To tackle this underexplored,\nyet highly practical problem of SSDG, we make the following core contributions.\nFirst, we propose a feature-based conformity technique that matches the\nposterior distributions from the feature space with the pseudo-label from the\nmodel's output space. Second, we develop a semantics alignment loss to learn\nsemantically-compatible representations by regularizing the semantic structure\nin the feature space. Our method is plug-and-play and can be readily integrated\nwith different SSL-based SSDG baselines without introducing any additional\nparameters. Extensive experimental results across five challenging DG\nbenchmarks with four strong SSL baselines suggest that our method provides\nconsistent and notable gains in two different SSDG settings.\n","authors":["Chamuditha Jayanga Galappaththige","Sanoojan Baliah","Malitha Gunawardhana","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2403.11674v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2306.00738v2","updated":"2024-05-07T10:26:44Z","published":"2023-06-01T14:32:34Z","title":"ReFACT: Updating Text-to-Image Models by Editing the Text Encoder","summary":" Our world is marked by unprecedented technological, global, and\nsocio-political transformations, posing a significant challenge to\ntext-to-image generative models. These models encode factual associations\nwithin their parameters that can quickly become outdated, diminishing their\nutility for end-users. To that end, we introduce ReFACT, a novel approach for\nediting factual associations in text-to-image models without relaying on\nexplicit input from end-users or costly re-training. ReFACT updates the weights\nof a specific layer in the text encoder, modifying only a tiny portion of the\nmodel's parameters and leaving the rest of the model unaffected. We empirically\nevaluate ReFACT on an existing benchmark, alongside a newly curated dataset.\nCompared to other methods, ReFACT achieves superior performance in both\ngeneralization to related concepts and preservation of unrelated concepts.\nFurthermore, ReFACT maintains image generation quality, making it a practical\ntool for updating and correcting factual information in text-to-image models.\n","authors":["Dana Arad","Hadas Orgad","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2306.00738v2.pdf","comment":"Accepted to NAACL 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2405.04175v1","updated":"2024-05-07T10:21:23Z","published":"2024-05-07T10:21:23Z","title":"Topicwise Separable Sentence Retrieval for Medical Report Generation","summary":" Automated radiology reporting holds immense clinical potential in alleviating\nthe burdensome workload of radiologists and mitigating diagnostic bias.\nRecently, retrieval-based report generation methods have garnered increasing\nattention due to their inherent advantages in terms of the quality and\nconsistency of generated reports. 
However, due to the long-tail distribution of\nthe training data, these models tend to learn frequently occurring sentences\nand topics, overlooking the rare topics. Regrettably, in many cases, the\ndescriptions of rare topics often indicate critical findings that should be\nmentioned in the report. To address this problem, we introduce a Topicwise\nSeparable Sentence Retrieval (Teaser) for medical report generation. To ensure\ncomprehensive learning of both common and rare topics, we categorize queries\ninto common and rare types to learn differentiated topics, and then propose\nTopic Contrastive Loss to effectively align topics and queries in the latent\nspace. Moreover, we integrate an Abstractor module following the extraction of\nvisual features, which aids the topic decoder in gaining a deeper understanding\nof the visual observational intent. Experiments on the MIMIC-CXR and IU X-ray\ndatasets demonstrate that Teaser surpasses state-of-the-art models, while also\nvalidating its capability to effectively represent rare topics and establish\nmore dependable correspondences between queries and topics.\n","authors":["Junting Zhao","Yang Zhou","Zhihao Chen","Huazhu Fu","Liang Wan"],"pdf_url":"https://arxiv.org/pdf/2405.04175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04169v1","updated":"2024-05-07T10:09:41Z","published":"2024-05-07T10:09:41Z","title":"D-TrAttUnet: Toward Hybrid CNN-Transformer Architecture for Generic and\n Subtle Segmentation in Medical Images","summary":" Over the past two decades, machine analysis of medical imaging has advanced\nrapidly, opening up significant potential for several important medical\napplications. As complicated diseases increase and the number of cases rises,\nthe role of machine-based imaging analysis has become indispensable. It serves\nas both a tool and an assistant to medical experts, providing valuable insights\nand guidance. A particularly challenging task in this area is lesion\nsegmentation, a task that is challenging even for experienced radiologists. The\ncomplexity of this task highlights the urgent need for robust machine learning\napproaches to support medical staff. In response, we present our novel\nsolution: the D-TrAttUnet architecture. This framework is based on the\nobservation that different diseases often target specific organs. Our\narchitecture includes an encoder-decoder structure with a composite\nTransformer-CNN encoder and dual decoders. The encoder includes two paths: the\nTransformer path and the Encoders Fusion Module path. The Dual-Decoder\nconfiguration uses two identical decoders, each with attention gates. This\nallows the model to simultaneously segment lesions and organs and integrate\ntheir segmentation losses.\n To validate our approach, we performed evaluations on the Covid-19 and Bone\nMetastasis segmentation tasks. We also investigated the adaptability of the\nmodel by testing it without the second decoder in the segmentation of glands\nand nuclei. The results confirmed the superiority of our approach, especially\nin Covid-19 infections and the segmentation of bone metastases. 
In addition,\nthe hybrid encoder showed exceptional performance in the segmentation of glands\nand nuclei, solidifying its role in modern medical image analysis.\n","authors":["Fares Bougourzi","Fadi Dornaika","Cosimo Distante","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2405.04169v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.15576"},{"id":"http://arxiv.org/abs/2405.04167v1","updated":"2024-05-07T10:07:33Z","published":"2024-05-07T10:07:33Z","title":"Bridging the Synthetic-to-Authentic Gap: Distortion-Guided Unsupervised\n Domain Adaptation for Blind Image Quality Assessment","summary":" The annotation of blind image quality assessment (BIQA) is labor-intensive\nand time-consuming, especially for authentic images. Training on synthetic data\nis expected to be beneficial, but synthetically trained models often suffer\nfrom poor generalization in real domains due to domain gaps. In this work, we\nmake a key observation that introducing more distortion types in the synthetic\ndataset may not improve or even be harmful to generalizing authentic image\nquality assessment. To solve this challenge, we propose distortion-guided\nunsupervised domain adaptation for BIQA (DGQA), a novel framework that\nleverages adaptive multi-domain selection via prior knowledge from distortion\nto match the data distribution between the source domains and the target\ndomain, thereby reducing negative transfer from the outlier source domains.\nExtensive experiments on two cross-domain settings (synthetic distortion to\nauthentic distortion and synthetic distortion to algorithmic distortion) have\ndemonstrated the effectiveness of our proposed DGQA. Besides, DGQA is\northogonal to existing model-based BIQA methods, and can be used in combination\nwith such models to improve performance with less training data.\n","authors":["Aobo Li","Jinjian Wu","Yongxu Liu","Leida Li"],"pdf_url":"https://arxiv.org/pdf/2405.04167v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2405.04164v1","updated":"2024-05-07T10:00:38Z","published":"2024-05-07T10:00:38Z","title":"Sign2GPT: Leveraging Large Language Models for Gloss-Free Sign Language\n Translation","summary":" Automatic Sign Language Translation requires the integration of both computer\nvision and natural language processing to effectively bridge the communication\ngap between sign and spoken languages. However, the deficiency in large-scale\ntraining data to support sign language translation means we need to leverage\nresources from spoken language. We introduce, Sign2GPT, a novel framework for\nsign language translation that utilizes large-scale pretrained vision and\nlanguage models via lightweight adapters for gloss-free sign language\ntranslation. The lightweight adapters are crucial for sign language\ntranslation, due to the constraints imposed by limited dataset sizes and the\ncomputational requirements when training with long sign videos. We also propose\na novel pretraining strategy that directs our encoder to learn sign\nrepresentations from automatically extracted pseudo-glosses without requiring\ngloss order information or annotations. 
We evaluate our approach on two public\nbenchmark sign language translation datasets, namely RWTH-PHOENIX-Weather 2014T\nand CSL-Daily, and improve on state-of-the-art gloss-free translation\nperformance with a significant margin.\n","authors":["Ryan Wong","Necati Cihan Camgoz","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2405.04164v1.pdf","comment":"Accepted at ICLR2024"},{"id":"http://arxiv.org/abs/2301.00326v2","updated":"2024-05-07T09:57:37Z","published":"2023-01-01T02:08:32Z","title":"Yuille-Poggio's Flow and Global Minimizer of Polynomials through\n Convexification by Heat Evolution","summary":" This study examines the convexification version of the backward differential\nflow algorithm for the global minimization of polynomials, introduced by O.\nArikan \\textit{et al} in \\cite{ABK}. It investigates why this approach might\nfail with high-degree polynomials yet succeeds with quartic polynomials. We\nemploy the heat evolution method for convexification combined with Gaussian\nfiltering, which acts as a cumulative form of Steklov's regularization. In this\ncontext, we apply the fingerprint theory from computer vision. Originally\ndeveloped by A.L. Yuille and T. Poggio in the 1980s for computer vision, the\nfingerprint theory, particularly the fingerprint trajectory equation, is used\nto illustrate the scaling (temporal) evolution of minimizers. In the case of\ngeneral polynomials, our research has led to the creation of the Yuille-Poggio\nflow and a broader interpretation of the fingerprint concepts, in particular we\nestablish the condition both sufficient and necessary for the convexified\nbackward differential flow algorithms to successfully achieve global\nminimization. For quartic polynomials, our analysis not only reflects the\nresults of O. Arikan et al. \\cite{ABK} but also presents a significantly\nsimpler version of Newton's method that can always globally minimize quartic\npolynomials without convexification.\n","authors":["Qiao Wang"],"pdf_url":"https://arxiv.org/pdf/2301.00326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09764v2","updated":"2024-05-07T09:52:27Z","published":"2023-08-18T18:18:47Z","title":"The Impact of Background Removal on Performance of Neural Networks for\n Fashion Image Classification and Segmentation","summary":" Fashion understanding is a hot topic in computer vision, with many\napplications having great business value in the market. Fashion understanding\nremains a difficult challenge for computer vision due to the immense diversity\nof garments and various scenes and backgrounds. In this work, we try removing\nthe background from fashion images to boost data quality and increase model\nperformance. Having fashion images of evident persons in fully visible\ngarments, we can utilize Salient Object Detection to achieve the background\nremoval of fashion data to our expectations. A fashion image with the\nbackground removed is claimed as the \"rembg\" image, contrasting with the\noriginal one in the fashion dataset. We conducted extensive comparative\nexperiments with these two types of images on multiple aspects of model\ntraining, including model architectures, model initialization, compatibility\nwith other training tricks and data augmentations, and target task types. Our\nexperiments show that background removal can effectively work for fashion data\nin simple and shallow networks that are not susceptible to overfitting. 
It can\nimprove model accuracy by up to 5% in the classification on the FashionStyle14\ndataset when training models from scratch. However, background removal does not\nperform well in deep neural networks due to incompatibility with other\nregularization techniques like batch normalization, pre-trained initialization,\nand data augmentations introducing randomness. The loss of background pixels\ninvalidates many existing training tricks in the model training, adding the\nrisk of overfitting for deep models.\n","authors":["Junhui Liang","Ying Liu","Vladimir Vlassov"],"pdf_url":"https://arxiv.org/pdf/2308.09764v2.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.17691v2","updated":"2024-05-07T09:15:01Z","published":"2024-03-26T13:32:32Z","title":"Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to\n Inform GenAI Copyright Disputes","summary":" The advent of Generative Artificial Intelligence (GenAI) models, including\nGitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content\ncreation, enabling non-professionals to produce high-quality content across\nvarious domains. This transformative technology has led to a surge of synthetic\ncontent and sparked legal disputes over copyright infringement. To address\nthese challenges, this paper introduces a novel approach that leverages the\nlearning capacity of GenAI models for copyright legal analysis, demonstrated\nwith GPT2 and Stable Diffusion models. Copyright law distinguishes between\noriginal expressions and generic ones (Sc\\`enes \\`a faire), protecting the\nformer and permitting reproduction of the latter. However, this distinction has\nhistorically been challenging to make consistently, leading to over-protection\nof copyrighted works. GenAI offers an unprecedented opportunity to enhance this\nlegal analysis by revealing shared patterns in preexisting works. We propose a\ndata-driven approach to identify the genericity of works created by GenAI,\nemploying \"data-driven bias\" to assess the genericity of expressive\ncompositions. This approach aids in copyright scope determination by utilizing\nthe capabilities of GenAI to identify and prioritize expressive elements and\nrank them according to their frequency in the model's dataset. The potential\nimplications of measuring expressive genericity for copyright law are profound.\nSuch scoring could assist courts in determining copyright scope during\nlitigation, inform the registration practices of Copyright Offices, allowing\nregistration of only highly original synthetic works, and help copyright owners\nsignal the value of their works and facilitate fairer licensing deals. More\ngenerally, this approach offers valuable insights to policymakers grappling\nwith adapting copyright law to the challenges posed by the era of GenAI.\n","authors":["Uri Hacohen","Adi Haviv","Shahar Sarfaty","Bruria Friedman","Niva Elkin-Koren","Roi Livni","Amit H Bermano"],"pdf_url":"https://arxiv.org/pdf/2403.17691v2.pdf","comment":"Presented at ACM CSLAW 2024"},{"id":"http://arxiv.org/abs/2405.04133v1","updated":"2024-05-07T09:00:09Z","published":"2024-05-07T09:00:09Z","title":"Exposing AI-generated Videos: A Benchmark Dataset and a Local-and-Global\n Temporal Defect Based Detection Method","summary":" The generative model has made significant advancements in the creation of\nrealistic videos, which causes security issues. However, this emerging risk has\nnot been adequately addressed due to the absence of a benchmark dataset for\nAI-generated videos. 
In this paper, we first construct a video dataset using\nadvanced diffusion-based video generation algorithms with various semantic\ncontents. Besides, typical video lossy operations over network transmission are\nadopted to generate degraded samples. Then, by analyzing local and global\ntemporal defects of current AI-generated videos, a novel detection framework by\nadaptively learning local motion information and global appearance variation is\nconstructed to expose fake videos. Finally, experiments are conducted to\nevaluate the generalization and robustness of different spatial and temporal\ndomain detection methods, where the results can serve as the baseline and\ndemonstrate the research challenge for future studies.\n","authors":["Peisong He","Leyao Zhu","Jiaxing Li","Shiqi Wang","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2405.04133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04121v1","updated":"2024-05-07T08:44:13Z","published":"2024-05-07T08:44:13Z","title":"ELiTe: Efficient Image-to-LiDAR Knowledge Transfer for Semantic\n Segmentation","summary":" Cross-modal knowledge transfer enhances point cloud representation learning\nin LiDAR semantic segmentation. Despite its potential, the \\textit{weak teacher\nchallenge} arises due to repetitive and non-diverse car camera images and\nsparse, inaccurate ground truth labels. To address this, we propose the\nEfficient Image-to-LiDAR Knowledge Transfer (ELiTe) paradigm. ELiTe introduces\nPatch-to-Point Multi-Stage Knowledge Distillation, transferring comprehensive\nknowledge from the Vision Foundation Model (VFM), extensively trained on\ndiverse open-world images. This enables effective knowledge transfer to a\nlightweight student model across modalities. ELiTe employs Parameter-Efficient\nFine-Tuning to strengthen the VFM teacher and expedite large-scale model\ntraining with minimal costs. Additionally, we introduce the Segment Anything\nModel based Pseudo-Label Generation approach to enhance low-quality image\nlabels, facilitating robust semantic representations. Efficient knowledge\ntransfer in ELiTe yields state-of-the-art results on the SemanticKITTI\nbenchmark, outperforming real-time inference models. Our approach achieves this\nwith significantly fewer parameters, confirming its effectiveness and\nefficiency.\n","authors":["Zhibo Zhang","Ximing Yang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2405.04121v1.pdf","comment":"9 pages, 6 figures, ICME 2024 oral"},{"id":"http://arxiv.org/abs/2405.04103v1","updated":"2024-05-07T08:16:13Z","published":"2024-05-07T08:16:13Z","title":"COM3D: Leveraging Cross-View Correspondence and Cross-Modal Mining for\n 3D Retrieval","summary":" In this paper, we investigate an open research task of cross-modal retrieval\nbetween 3D shapes and textual descriptions. Previous approaches mainly rely on\npoint cloud encoders for feature extraction, which may ignore key inherent\nfeatures of 3D shapes, including depth, spatial hierarchy, geometric\ncontinuity, etc. To address this issue, we propose COM3D, making the first\nattempt to exploit the cross-view correspondence and cross-modal mining to\nenhance the retrieval performance. Notably, we augment the 3D features through\na scene representation transformer, to generate cross-view correspondence\nfeatures of 3D shapes, which enrich the inherent features and enhance their\ncompatibility with text matching. 
Furthermore, we propose to optimize the\ncross-modal matching process based on the semi-hard negative example mining\nmethod, in an attempt to improve the learning efficiency. Extensive\nquantitative and qualitative experiments demonstrate the superiority of our\nproposed COM3D, achieving state-of-the-art results on the Text2Shape dataset.\n","authors":["Hao Wu","Ruochong LI","Hao Wang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2405.04103v1.pdf","comment":"Accepted by ICME 2024 oral"},{"id":"http://arxiv.org/abs/2405.04100v1","updated":"2024-05-07T08:15:37Z","published":"2024-05-07T08:15:37Z","title":"ESP: Extro-Spective Prediction for Long-term Behavior Reasoning in\n Emergency Scenarios","summary":" Emergent-scene safety is the key milestone for fully autonomous driving, and\nreliable on-time prediction is essential to maintain safety in emergency\nscenarios. However, these emergency scenarios are long-tailed and hard to\ncollect, which restricts the system from getting reliable predictions. In this\npaper, we build a new dataset, which aims at the long-term prediction with the\ninconspicuous state variation in history for the emergency event, named the\nExtro-Spective Prediction (ESP) problem. Based on the proposed dataset, a\nflexible feature encoder for ESP is introduced to various prediction methods as\na seamless plug-in, and its consistent performance improvement underscores its\nefficacy. Furthermore, a new metric named clamped temporal error (CTE) is\nproposed to give a more comprehensive evaluation of prediction performance,\nespecially in time-sensitive emergency events of subseconds. Interestingly, as\nour ESP features can be described in human-readable language naturally, the\napplication of integrating into ChatGPT also shows huge potential. The\nESP-dataset and all benchmarks are released at\nhttps://dingrui-wang.github.io/ESP-Dataset/.\n","authors":["Dingrui Wang","Zheyuan Lai","Yuda Li","Yi Wu","Yuexin Ma","Johannes Betz","Ruigang Yang","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2405.04100v1.pdf","comment":"Accepted by ICRA 2024 as Oral Presentation"},{"id":"http://arxiv.org/abs/2306.11023v2","updated":"2024-05-07T08:12:53Z","published":"2023-06-19T15:37:53Z","title":"PINQI: An End-to-End Physics-Informed Approach to Learned Quantitative\n MRI Reconstruction","summary":" Quantitative Magnetic Resonance Imaging (qMRI) enables the reproducible\nmeasurement of biophysical parameters in tissue. The challenge lies in solving\na nonlinear, ill-posed inverse problem to obtain the desired tissue parameter\nmaps from acquired raw data. While various learned and non-learned approaches\nhave been proposed, the existing learned methods fail to fully exploit the\nprior knowledge about the underlying MR physics, i.e. the signal model and the\nacquisition model. In this paper, we propose PINQI, a novel qMRI reconstruction\nmethod that integrates the knowledge about the signal, acquisition model, and\nlearned regularization into a single end-to-end trainable neural network. Our\napproach is based on unrolled alternating optimization, utilizing\ndifferentiable optimization blocks to solve inner linear and non-linear\noptimization tasks, as well as convolutional layers for regularization of the\nintermediate qualitative images and parameter maps. 
This design enables PINQI\nto leverage the advantages of both the signal model and learned regularization.\nWe evaluate the performance of our proposed network by comparing it with\nrecently published approaches in the context of highly undersampled\n$T_1$-mapping, using both a simulated brain dataset, as well as real scanner\ndata acquired from a physical phantom and in-vivo data from healthy volunteers.\nThe results demonstrate the superiority of our proposed solution over existing\nmethods and highlight the effectiveness of our method in real-world scenarios.\n","authors":["Felix F Zimmermann","Christoph Kolbitsch","Patrick Schuenke","Andreas Kofler"],"pdf_url":"https://arxiv.org/pdf/2306.11023v2.pdf","comment":"This work has been accepted for publication in IEEE Transactions on\n Computational Imaging. Changes were made to this version by the publisher\n before publication. IEEE Transactions on Computational Imaging (2024)"},{"id":"http://arxiv.org/abs/2404.10966v3","updated":"2024-05-07T08:09:13Z","published":"2024-04-17T00:21:36Z","title":"Domain-Specific Block Selection and Paired-View Pseudo-Labeling for\n Online Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test\ndomain without access to source data after deployment. Existing approaches\ntypically rely on self-training with pseudo-labels since ground-truth cannot be\nobtained from test data. Although the quality of pseudo labels is important for\nstable and accurate long-term adaptation, it has not been previously addressed.\nIn this work, we propose DPLOT, a simple yet effective TTA framework that\nconsists of two components: (1) domain-specific block selection and (2)\npseudo-label generation using paired-view images. Specifically, we select\nblocks that involve domain-specific feature extraction and train these blocks\nby entropy minimization. After blocks are adjusted for current test domain, we\ngenerate pseudo-labels by averaging given test images and corresponding flipped\ncounterparts. By simply using flip augmentation, we prevent a decrease in the\nquality of the pseudo-labels, which can be caused by the domain gap resulting\nfrom strong augmentation. Our experimental results demonstrate that DPLOT\noutperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C\nbenchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. Also,\nwe provide an extensive analysis to demonstrate effectiveness of our framework.\nCode is available at\nhttps://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA.\n","authors":["Yeonguk Yu","Sungho Shin","Seunghyeok Back","Minhwan Ko","Sangjun Noh","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2404.10966v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2405.04097v1","updated":"2024-05-07T07:57:15Z","published":"2024-05-07T07:57:15Z","title":"Unmasking Illusions: Understanding Human Perception of Audiovisual\n Deepfakes","summary":" The emergence of contemporary deepfakes has attracted significant attention\nin machine learning research, as artificial intelligence (AI) generated\nsynthetic media increases the incidence of misinterpretation and is difficult\nto distinguish from genuine content. Currently, machine learning techniques\nhave been extensively studied for automatically detecting deepfakes. However,\nhuman perception has been less explored. Malicious deepfakes could ultimately\ncause public and social problems. 
Can we humans correctly perceive the\nauthenticity of the content of the videos we watch? The answer is obviously\nuncertain; therefore, this paper aims to evaluate the human ability to discern\ndeepfake videos through a subjective study. We present our findings by\ncomparing human observers to five state-of-the-art audiovisual deepfake\ndetection models. To this end, we used gamification concepts to provide 110\nparticipants (55 native English speakers and 55 non-native English speakers)\nwith a web-based platform where they could access a series of 40 videos (20 real\nand 20 fake) to determine their authenticity. Each participant performed the\nexperiment twice with the same 40 videos in different random orders. The videos\nare manually selected from the FakeAVCeleb dataset. We found that all AI models\nperformed better than humans when evaluated on the same 40 videos. The study\nalso reveals that while deception is not impossible, humans tend to\noverestimate their detection capabilities. Our experimental results may help\nbenchmark human versus machine performance, advance forensics analysis, and\nenable adaptive countermeasures.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04093v1","updated":"2024-05-07T07:51:28Z","published":"2024-05-07T07:51:28Z","title":"DCNN: Dual Cross-current Neural Networks Realized Using An Interactive\n Deep Learning Discriminator for Fine-grained Objects","summary":" Accurate classification of fine-grained images remains a challenge in\nbackbones based on convolutional operations or self-attention mechanisms. This\nstudy proposes novel dual-current neural networks (DCNN), which combine the\nadvantages of convolutional operations and self-attention mechanisms to improve\nthe accuracy of fine-grained image classification. The main novel design\nfeatures for constructing a weakly supervised learning backbone model DCNN\ninclude (a) extracting heterogeneous data, (b) keeping the feature map\nresolution unchanged, (c) expanding the receptive field, and (d) fusing global\nrepresentations and local features. Experimental results demonstrated that\nusing DCNN as the backbone network for classifying certain fine-grained\nbenchmark datasets achieved performance advantage improvements of 13.5--19.5%\nand 2.2--12.9%, respectively, compared to other advanced convolution or\nattention-based fine-grained backbones.\n","authors":["Da Fu","Mingfei Rong","Eun-Hu Kim","Hao Huang","Witold Pedrycz"],"pdf_url":"https://arxiv.org/pdf/2405.04093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05950v4","updated":"2024-05-07T07:47:38Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities on downstream tasks when fine-tuned with\nminimal data. However, many VLMs rely on proprietary data and are not\nopen-source, which restricts the use of white-box approaches for fine-tuning.\nAs such, we aim to develop a black-box approach to optimize VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or even output logits. We propose employing chat-based LLMs\nto search for the best text prompt for VLMs. 
Specifically, we adopt an\nautomatic hill-climbing procedure that converges to an effective prompt by\nevaluating the performance of current prompts and asking LLMs to refine them\nbased on textual feedback, all within a conversational process without\nhuman-in-the-loop. In a challenging 1-shot image classification setup, our\nsimple approach surpasses the white-box continuous prompting method (CoOp) by\nan average of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms both human-engineered and LLM-generated prompts. We highlight the\nadvantage of conversational feedback that incorporates both positive and\nnegative prompts, suggesting that LLMs can utilize the implicit gradient\ndirection in textual feedback for a more efficient search. In addition, we find\nthat the text prompts generated through our strategy are not only more\ninterpretable but also transfer well across different VLM architectures in a\nblack-box manner. Lastly, we apply our framework to optimize the\nstate-of-the-art black-box VLM (DALL-E 3) for text-to-image generation, prompt\ninversion, and personalization.\n","authors":["Shihong Liu","Zhiqiu Lin","Samuel Yu","Ryan Lee","Tiffany Ling","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v4.pdf","comment":"Published at CVPR 2024. Project site:\n https://llm-can-optimize-vlm.github.io/"},{"id":"http://arxiv.org/abs/2405.04071v1","updated":"2024-05-07T07:19:25Z","published":"2024-05-07T07:19:25Z","title":"IMU-Aided Event-based Stereo Visual Odometry","summary":" Direct methods for event-based visual odometry solve the mapping and camera\npose tracking sub-problems by establishing implicit data association in a way\nthat the generative model of events is exploited. The main bottlenecks faced by\nstate-of-the-art work in this field include the high computational complexity\nof mapping and the limited accuracy of tracking. In this paper, we improve our\nprevious direct pipeline \\textit{Event-based Stereo Visual Odometry} in terms\nof accuracy and efficiency. To speed up the mapping operation, we propose an\nefficient strategy of edge-pixel sampling according to the local dynamics of\nevents. The mapping performance in terms of completeness and local smoothness\nis also improved by combining the temporal stereo results and the static stereo\nresults. To circumvent the degeneracy issue of camera pose tracking in\nrecovering the yaw component of general 6-DoF motion, we introduce as a prior\nthe gyroscope measurements via pre-integration. Experiments on publicly\navailable datasets justify our improvement. We release our pipeline as an\nopen-source software for future research in this field.\n","authors":["Junkai Niu","Sheng Zhong","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.04071v1.pdf","comment":"10 pages, 7 figures, ICRA"},{"id":"http://arxiv.org/abs/2401.10731v5","updated":"2024-05-07T06:52:13Z","published":"2024-01-19T14:49:42Z","title":"Removal and Selection: Improving RGB-Infrared Object Detection via\n Coarse-to-Fine Fusion","summary":" Object detection in visible (RGB) and infrared (IR) images has been widely\napplied in recent years. Leveraging the complementary characteristics of RGB\nand IR images, the object detector provides reliable and robust object\nlocalization from day to night. Most existing fusion strategies directly input\nRGB and IR images into deep neural networks, leading to inferior detection\nperformance. 
However, the RGB and IR features have modality-specific noise,\nthese strategies will exacerbate the fused features along with the propagation.\nInspired by the mechanism of the human brain processing multimodal information,\nin this paper, we introduce a new coarse-to-fine perspective to purify and fuse\ntwo modality features. Specifically, following this perspective, we design a\nRedundant Spectrum Removal module to coarsely remove interfering information\nwithin each modality and a Dynamic Feature Selection module to finely select\nthe desired features for feature fusion. To verify the effectiveness of the\ncoarse-to-fine fusion strategy, we construct a new object detector called the\nRemoval and Selection Detector (RSDet). Extensive experiments on three RGB-IR\nobject detection datasets verify the superior performance of our method.\n","authors":["Tianyi Zhao","Maoxun Yuan","Feng Jiang","Nan Wang","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2401.10731v5.pdf","comment":"11pages, 11figures"},{"id":"http://arxiv.org/abs/2311.04940v2","updated":"2024-05-07T06:44:01Z","published":"2023-11-08T01:54:56Z","title":"Interpretable Geoscience Artificial Intelligence (XGeoS-AI): Application\n to Demystify Image Recognition","summary":" As Earth science enters the era of big data, artificial intelligence (AI) not\nonly offers great potential for solving geoscience problems, but also plays a\ncritical role in accelerating the understanding of the complex, interactive,\nand multiscale processes of Earth's behavior. As geoscience AI models are\nprogressively utilized for significant predictions in crucial situations,\ngeoscience researchers are increasingly demanding their interpretability and\nversatility. This study proposes an interpretable geoscience artificial\nintelligence (XGeoS-AI) framework to unravel the mystery of image recognition\nin the Earth sciences, and its effectiveness and versatility is demonstrated by\ntaking computed tomography (CT) image recognition as an example. Inspired by\nthe mechanism of human vision, the proposed XGeoS-AI framework generates a\nthreshold value from a local region within the whole image to complete the\nrecognition. Different kinds of artificial intelligence (AI) methods, such as\nSupport Vector Regression (SVR), Multilayer Perceptron (MLP), Convolutional\nNeural Network (CNN), can be adopted as the AI engines of the proposed XGeoS-AI\nframework to efficiently complete geoscience image recognition tasks.\nExperimental results demonstrate that the effectiveness, versatility, and\nheuristics of the proposed framework have great potential in solving geoscience\nimage recognition problems. Interpretable AI should receive more and more\nattention in the field of the Earth sciences, which is the key to promoting\nmore rational and wider applications of AI in the field of Earth sciences. In\naddition, the proposed interpretable framework may be the forerunner of\ntechnological innovation in the Earth sciences.\n","authors":["Jin-Jian Xu","Hao Zhang","Chao-Sheng Tang","Lin Li","Bin Shi"],"pdf_url":"https://arxiv.org/pdf/2311.04940v2.pdf","comment":"there are some erros in the results, and a newer revision is still\n preparing"},{"id":"http://arxiv.org/abs/2405.04044v1","updated":"2024-05-07T06:29:52Z","published":"2024-05-07T06:29:52Z","title":"DMOFC: Discrimination Metric-Optimized Feature Compression","summary":" Feature compression, as an important branch of video coding for machines\n(VCM), has attracted significant attention and exploration. 
However, the\nexisting methods mainly focus on intra-feature similarity, such as the Mean\nSquared Error (MSE) between the reconstructed and original features, while\nneglecting the importance of inter-feature relationships. In this paper, we\nanalyze the inter-feature relationships, focusing on feature discriminability\nin machine vision and underscoring its significance in feature compression. To\nmaintain the feature discriminability of reconstructed features, we introduce a\ndiscrimination metric for feature compression. The discrimination metric is\ndesigned to ensure that the distance between features of the same category is\nsmaller than the distance between features of different categories.\nFurthermore, we explore the relationship between the discrimination metric and\nthe discriminability of the original features. Experimental results confirm the\neffectiveness of the proposed discrimination metric and reveal there exists a\ntrade-off between the discrimination metric and the discriminability of the\noriginal features.\n","authors":["Changsheng Gao","Yiheng Jiang","Li Li","Dong Liu","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2405.04044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04042v1","updated":"2024-05-07T06:26:30Z","published":"2024-05-07T06:26:30Z","title":"Space-time Reinforcement Network for Video Object Segmentation","summary":" Recently, video object segmentation (VOS) networks typically use memory-based\nmethods: for each query frame, the mask is predicted by space-time matching to\nmemory frames. Despite these methods having superior performance, they suffer\nfrom two issues: 1) Challenging data can destroy the space-time coherence\nbetween adjacent video frames. 2) Pixel-level matching will lead to undesired\nmismatching caused by the noises or distractors. To address the aforementioned\nissues, we first propose to generate an auxiliary frame between adjacent\nframes, serving as an implicit short-temporal reference for the query one.\nNext, we learn a prototype for each video object and prototype-level matching\ncan be implemented between the query and memory. The experiment demonstrated\nthat our network outperforms the state-of-the-art method on the DAVIS 2017,\nachieving a J&F score of 86.4%, and attains a competitive result 85.0% on\nYouTube VOS 2018. In addition, our network exhibits a high inference speed of\n32+ FPS.\n","authors":["Yadang Chen","Wentao Zhu","Zhi-Xin Yang","Enhua Wu"],"pdf_url":"https://arxiv.org/pdf/2405.04042v1.pdf","comment":"Accepted by ICME 2024. 6 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.04041v1","updated":"2024-05-07T06:25:49Z","published":"2024-05-07T06:25:49Z","title":"Feature Map Convergence Evaluation for Functional Module","summary":" Autonomous driving perception models are typically composed of multiple\nfunctional modules that interact through complex relationships to accomplish\nenvironment understanding. However, perception models are predominantly\noptimized as a black box through end-to-end training, lacking independent\nevaluation of functional modules, which poses difficulties for interpretability\nand optimization. Pioneering in the issue, we propose an evaluation method\nbased on feature map analysis to gauge the convergence of model, thereby\nassessing functional modules' training maturity. 
We construct a quantitative\nmetric named as the Feature Map Convergence Score (FMCS) and develop Feature\nMap Convergence Evaluation Network (FMCE-Net) to measure and predict the\nconvergence degree of models respectively. FMCE-Net achieves remarkable\npredictive accuracy for FMCS across multiple image classification experiments,\nvalidating the efficacy and robustness of the introduced approach. To the best\nof our knowledge, this is the first independent evaluation method for\nfunctional modules, offering a new paradigm for the training assessment towards\nperception models.\n","authors":["Ludan Zhang","Chaoyi Chen","Lei He","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.04041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04023v1","updated":"2024-05-07T05:55:50Z","published":"2024-05-07T05:55:50Z","title":"Lumbar Spine Tumor Segmentation and Localization in T2 MRI Images Using\n AI","summary":" In medical imaging, segmentation and localization of spinal tumors in\nthree-dimensional (3D) space pose significant computational challenges,\nprimarily stemming from limited data availability. In response, this study\nintroduces a novel data augmentation technique, aimed at automating spine tumor\nsegmentation and localization through AI approaches. Leveraging a fusion of\nfuzzy c-means clustering and Random Forest algorithms, the proposed method\nachieves successful spine tumor segmentation based on predefined masks\ninitially delineated by domain experts in medical imaging. Subsequently, a\nConvolutional Neural Network (CNN) architecture is employed for tumor\nclassification. Moreover, 3D vertebral segmentation and labeling techniques are\nused to help pinpoint the exact location of the tumors in the lumbar spine.\nResults indicate a remarkable performance, with 99% accuracy for tumor\nsegmentation, 98% accuracy for tumor classification, and 99% accuracy for tumor\nlocalization achieved with the proposed approach. These metrics surpass the\nefficacy of existing state-of-the-art techniques, as evidenced by superior Dice\nScore, Class Accuracy, and Intersection over Union (IOU) on class accuracy\nmetrics. This innovative methodology holds promise for enhancing the diagnostic\ncapabilities in detecting and characterizing spinal tumors, thereby\nfacilitating more effective clinical decision-making.\n","authors":["Rikathi Pal","Sudeshna Mondal","Aditi Gupta","Priya Saha","Somoballi Ghoshal","Amlan Chakrabarti","Susmita Sur-Kolay"],"pdf_url":"https://arxiv.org/pdf/2405.04023v1.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.13904v3","updated":"2024-05-07T05:32:55Z","published":"2024-04-22T06:28:41Z","title":"Deep Regression Representation Learning with Topology","summary":" Most works studying representation learning focus only on classification and\nneglect regression. Yet, the learning objectives and therefore the\nrepresentation topologies of the two tasks are fundamentally different:\nclassification targets class separation, leading to disconnected\nrepresentations, whereas regression requires ordinality with respect to the\ntarget, leading to continuous representations. We thus wonder how the\neffectiveness of a regression representation is influenced by its topology,\nwith evaluation based on the Information Bottleneck (IB) principle.\n The IB principle is an important framework that provides principles for\nlearning effectiveness representations. We establish two connections between it\nand the topology of regression representations. 
The first connection reveals\nthat a lower intrinsic dimension of the feature space implies a reduced\ncomplexity of the representation Z. This complexity can be quantified as the\nconditional entropy of Z on the target space Y and serves as an upper bound on\nthe generalization error. The second connection suggests learning a feature\nspace that is topologically similar to the target space will better align with\nthe IB principle. Based on these two connections, we introduce PH-Reg, a\nregularizer specific to regression that matches the intrinsic dimension and\ntopology of the feature space with the target space. Experiments on synthetic\nand real-world regression tasks demonstrate the benefits of PH-Reg.\n","authors":["Shihao Zhang","kenji kawaguchi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.13904v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2404.00552v3","updated":"2024-05-07T05:32:11Z","published":"2024-03-31T03:53:45Z","title":"Comparison of Methods in Skin Pigment Decomposition","summary":" Decomposition of skin pigment plays an important role in medical fields.\nHuman skin can be decomposed into two primitive components, hemoglobin and\nmelanin. It is our goal to apply these results for diagnosis of skin cancer. In\nthis paper, various methods for skin pigment decomposition are reviewed\ncomparatively and the performance of each method is evaluated both\ntheoretically and experimentally. In addition, isometric feature mapping\n(Isomap) is introduced in order to improve the dimensionality reduction\nperformance in context of skin pigment decomposition.\n","authors":["Hao Gong","Michel Desvignes"],"pdf_url":"https://arxiv.org/pdf/2404.00552v3.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.04009v1","updated":"2024-05-07T04:57:25Z","published":"2024-05-07T04:57:25Z","title":"Structured Click Control in Transformer-based Interactive Segmentation","summary":" Click-point-based interactive segmentation has received widespread attention\ndue to its efficiency. However, it's hard for existing algorithms to obtain\nprecise and robust responses after multiple clicks. In this case, the\nsegmentation results tend to have little change or are even worse than before.\nTo improve the robustness of the response, we propose a structured click intent\nmodel based on graph neural networks, which adaptively obtains graph nodes via\nthe global similarity of user-clicked Transformer tokens. Then the graph nodes\nwill be aggregated to obtain structured interaction features. Finally, the dual\ncross-attention will be used to inject structured interaction features into\nvision Transformer features, thereby enhancing the control of clicks over\nsegmentation results. Extensive experiments demonstrated the proposed algorithm\ncan serve as a general structure in improving Transformer-based interactive\nsegmentation performance. 
The code and data will be released at\nhttps://github.com/hahamyt/scc.\n","authors":["Long Xu","Yongquan Chen","Rui Huang","Feng Wu","Shiwu Lai"],"pdf_url":"https://arxiv.org/pdf/2405.04009v1.pdf","comment":"10 pages, 6 figures, submitted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.04007v1","updated":"2024-05-07T04:55:47Z","published":"2024-05-07T04:55:47Z","title":"SEED-Data-Edit Technical Report: A Hybrid Dataset for Instructional\n Image Editing","summary":" In this technical report, we introduce SEED-Data-Edit: a unique hybrid\ndataset for instruction-guided image editing, which aims to facilitate image\nmanipulation using open-form language. SEED-Data-Edit is composed of three\ndistinct types of data: (1) High-quality editing data produced by an automated\npipeline, ensuring a substantial volume of diverse image editing pairs. (2)\nReal-world scenario data collected from the internet, which captures the\nintricacies of user intentions for promoting the practical application of image\nediting in the real world. (3) High-precision multi-turn editing data annotated\nby humans, which involves multiple rounds of edits for simulating iterative\nediting processes. The combination of these diverse data sources makes\nSEED-Data-Edit a comprehensive and versatile dataset for training\nlanguage-guided image editing model. We fine-tune a pretrained Multimodal Large\nLanguage Model (MLLM) that unifies comprehension and generation with\nSEED-Data-Edit. The instruction tuned model demonstrates promising results,\nindicating the potential and effectiveness of SEED-Data-Edit in advancing the\nfield of instructional image editing. The datasets are released in\nhttps://huggingface.co/datasets/AILab-CVC/SEED-Data-Edit.\n","authors":["Yuying Ge","Sijie Zhao","Chen Li","Yixiao Ge","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2405.04007v1.pdf","comment":"Technical Report; Dataset released in\n https://huggingface.co/datasets/AILab-CVC/SEED-Data-Edit"},{"id":"http://arxiv.org/abs/2405.03995v1","updated":"2024-05-07T04:17:04Z","published":"2024-05-07T04:17:04Z","title":"Deep Event-based Object Detection in Autonomous Driving: A Survey","summary":" Object detection plays a critical role in autonomous driving, where\naccurately and efficiently detecting objects in fast-moving scenes is crucial.\nTraditional frame-based cameras face challenges in balancing latency and\nbandwidth, necessitating the need for innovative solutions. Event cameras have\nemerged as promising sensors for autonomous driving due to their low latency,\nhigh dynamic range, and low power consumption. However, effectively utilizing\nthe asynchronous and sparse event data presents challenges, particularly in\nmaintaining low latency and lightweight architectures for object detection.\nThis paper provides an overview of object detection using event data in\nautonomous driving, showcasing the competitive benefits of event cameras.\n","authors":["Bingquan Zhou","Jie Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.03995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03981v1","updated":"2024-05-07T03:42:49Z","published":"2024-05-07T03:42:49Z","title":"Predicting Lung Disease Severity via Image-Based AQI Analysis using Deep\n Learning Techniques","summary":" Air pollution is a significant health concern worldwide, contributing to\nvarious respiratory diseases. 
Advances in air quality mapping, driven by the\nemergence of smart cities and the proliferation of Internet-of-Things sensor\ndevices, have led to an increase in available data, fueling momentum in air\npollution forecasting. The objective of this study is to devise an integrated\napproach for predicting air quality using image data and subsequently assessing\nlung disease severity based on Air Quality Index (AQI).The aim is to implement\nan integrated approach by refining existing techniques to improve accuracy in\npredicting AQI and lung disease severity. The study aims to forecast additional\natmospheric pollutants like AQI, PM10, O3, CO, SO2, NO2 in addition to PM2.5\nlevels. Additionally, the study aims to compare the proposed approach with\nexisting methods to show its effectiveness. The approach used in this paper\nuses VGG16 model for feature extraction in images and neural network for\npredicting AQI.In predicting lung disease severity, Support Vector Classifier\n(SVC) and K-Nearest Neighbors (KNN) algorithms are utilized. The neural network\nmodel for predicting AQI achieved training accuracy of 88.54 % and testing\naccuracy of 87.44%,which was measured using loss function, while the KNN model\nused for predicting lung disease severity achieved training accuracy of 98.4%\nand testing accuracy of 97.5% In conclusion, the integrated approach presented\nin this study forecasts air quality and evaluates lung disease severity,\nachieving high testing accuracies of 87.44% for AQI and 97.5% for lung disease\nseverity using neural network, KNN, and SVC models. The future scope involves\nimplementing transfer learning and advanced deep learning modules to enhance\nprediction capabilities. While the current study focuses on India, the\nobjective is to expand its scope to encompass global coverage.\n","authors":["Anvita Mahajan","Sayali Mate","Chinmayee Kulkarni","Suraj Sawant"],"pdf_url":"https://arxiv.org/pdf/2405.03981v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2405.01273v2","updated":"2024-05-07T03:31:22Z","published":"2024-05-02T13:31:09Z","title":"Towards Inclusive Face Recognition Through Synthetic Ethnicity\n Alteration","summary":" Numerous studies have shown that existing Face Recognition Systems (FRS),\nincluding commercial ones, often exhibit biases toward certain ethnicities due\nto under-represented data. In this work, we explore ethnicity alteration and\nskin tone modification using synthetic face image generation methods to\nincrease the diversity of datasets. We conduct a detailed analysis by first\nconstructing a balanced face image dataset representing three ethnicities:\nAsian, Black, and Indian. We then make use of existing Generative Adversarial\nNetwork-based (GAN) image-to-image translation and manifold learning models to\nalter the ethnicity from one to another. A systematic analysis is further\nconducted to assess the suitability of such datasets for FRS by studying the\nrealistic skin-tone representation using Individual Typology Angle (ITA).\nFurther, we also analyze the quality characteristics using existing Face image\nquality assessment (FIQA) approaches. We then provide a holistic FRS\nperformance analysis using four different systems. 
Our findings pave the way\nfor future research works in (i) developing both specific ethnicity and general\n(any to any) ethnicity alteration models, (ii) expanding such approaches to\ncreate databases with diverse skin tones, (iii) creating datasets representing\nvarious ethnicities which further can help in mitigating bias while addressing\nprivacy concerns.\n","authors":["Praveen Kumar Chandaliya","Kiran Raja","Raghavendra Ramachandra","Zahid Akhtar","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2405.01273v2.pdf","comment":"8 Pages"},{"id":"http://arxiv.org/abs/2405.03978v1","updated":"2024-05-07T03:30:57Z","published":"2024-05-07T03:30:57Z","title":"VMambaCC: A Visual State Space Model for Crowd Counting","summary":" As a deep learning model, Visual Mamba (VMamba) has a low computational\ncomplexity and a global receptive field, which has been successful applied to\nimage classification and detection. To extend its applications, we apply VMamba\nto crowd counting and propose a novel VMambaCC (VMamba Crowd Counting) model.\nNaturally, VMambaCC inherits the merits of VMamba, or global modeling for\nimages and low computational cost. Additionally, we design a Multi-head\nHigh-level Feature (MHF) attention mechanism for VMambaCC. MHF is a new\nattention mechanism that leverages high-level semantic features to augment\nlow-level semantic features, thereby enhancing spatial feature representation\nwith greater precision. Building upon MHF, we further present a High-level\nSemantic Supervised Feature Pyramid Network (HS2PFN) that progressively\nintegrates and enhances high-level semantic information with low-level semantic\ninformation. Extensive experimental results on five public datasets validate\nthe efficacy of our approach. For example, our method achieves a mean absolute\nerror of 51.87 and a mean squared error of 81.3 on the ShangHaiTech\\_PartA\ndataset. Our code is coming soon.\n","authors":["Hao-Yuan Ma","Li Zhang","Shuai Shi"],"pdf_url":"https://arxiv.org/pdf/2405.03978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02234v2","updated":"2024-05-07T03:25:50Z","published":"2024-03-04T17:26:28Z","title":"3DTopia: Large Text-to-3D Generation Model with Hybrid Diffusion Priors","summary":" We present a two-stage text-to-3D generation system, namely 3DTopia, which\ngenerates high-quality general 3D assets within 5 minutes using hybrid\ndiffusion priors. The first stage samples from a 3D diffusion prior directly\nlearned from 3D data. Specifically, it is powered by a text-conditioned\ntri-plane latent diffusion model, which quickly generates coarse 3D samples for\nfast prototyping. The second stage utilizes 2D diffusion priors to further\nrefine the texture of coarse 3D models from the first stage. The refinement\nconsists of both latent and pixel space optimization for high-quality texture\ngeneration. To facilitate the training of the proposed system, we clean and\ncaption the largest open-source 3D dataset, Objaverse, by combining the power\nof vision language models and large language models. Experiment results are\nreported qualitatively and quantitatively to show the performance of the\nproposed system. 
Our codes and models are available at\nhttps://github.com/3DTopia/3DTopia\n","authors":["Fangzhou Hong","Jiaxiang Tang","Ziang Cao","Min Shi","Tong Wu","Zhaoxi Chen","Shuai Yang","Tengfei Wang","Liang Pan","Dahua Lin","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.02234v2.pdf","comment":"Code available at https://github.com/3DTopia/3DTopia"},{"id":"http://arxiv.org/abs/2405.03141v2","updated":"2024-05-07T03:21:18Z","published":"2024-05-06T03:28:47Z","title":"Automatic Ultrasound Curve Angle Measurement via Affinity Clustering for\n Adolescent Idiopathic Scoliosis Evaluation","summary":" The current clinical gold standard for evaluating adolescent idiopathic\nscoliosis (AIS) is X-ray radiography, using Cobb angle measurement. However,\nthe frequent monitoring of the AIS progression using X-rays poses a challenge\ndue to the cumulative radiation exposure. Although 3D ultrasound has been\nvalidated as a reliable and radiation-free alternative for scoliosis\nassessment, the process of measuring spinal curvature is still carried out\nmanually. Consequently, there is a considerable demand for a fully automatic\nsystem that can locate bony landmarks and perform angle measurements. To this\nend, we introduce an estimation model for automatic ultrasound curve angle\n(UCA) measurement. The model employs a dual-branch network to detect candidate\nlandmarks and perform vertebra segmentation on ultrasound coronal images. An\naffinity clustering strategy is utilized within the vertebral segmentation area\nto illustrate the affinity relationship between candidate landmarks.\nSubsequently, we can efficiently perform line delineation from a clustered\naffinity map for UCA measurement. As our method is specifically designed for\nUCA calculation, this method outperforms other state-of-the-art methods for\nlandmark and line detection tasks. The high correlation between the automatic\nUCA and Cobb angle (R$^2$=0.858) suggests that our proposed method can\npotentially replace manual UCA measurement in ultrasound scoliosis assessment.\n","authors":["Yihao Zhou","Timothy Tin-Yan Lee","Kelly Ka-Lee Lai","Chonglin Wu","Hin Ting Lau","De Yang","Chui-Yi Chan","Winnie Chiu-Wing Chu","Jack Chun-Yiu Cheng","Tsz-Ping Lam","Yong-Ping Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.03141v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03971v1","updated":"2024-05-07T03:01:40Z","published":"2024-05-07T03:01:40Z","title":"Unified End-to-End V2X Cooperative Autonomous Driving","summary":" V2X cooperation, through the integration of sensor data from both vehicles\nand infrastructure, is considered a pivotal approach to advancing autonomous\ndriving technology. Current research primarily focuses on enhancing perception\naccuracy, often overlooking the systematic improvement of accident prediction\naccuracy through end-to-end learning, leading to insufficient attention to the\nsafety issues of autonomous driving. To address this challenge, this paper\nintroduces the UniE2EV2X framework, a V2X-integrated end-to-end autonomous\ndriving system that consolidates key driving modules within a unified network.\nThe framework employs a deformable attention-based data fusion strategy,\neffectively facilitating cooperation between vehicles and infrastructure. 
The\nmain advantages include: 1) significantly enhancing agents' perception and\nmotion prediction capabilities, thereby improving the accuracy of accident\npredictions; 2) ensuring high reliability in the data fusion process; 3)\nsuperior end-to-end perception compared to modular approaches. Furthermore, We\nimplement the UniE2EV2X framework on the challenging DeepAccident, a simulation\ndataset designed for V2X cooperative driving.\n","authors":["Zhiwei Li","Bozhen Zhang","Lei Yang","Tianyu Shen","Nuo Xu","Ruosen Hao","Weiting Li","Tao Yan","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2405.03971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16689v2","updated":"2024-05-07T02:47:55Z","published":"2024-03-25T12:23:39Z","title":"Synapse: Learning Preferential Concepts from Visual Demonstrations","summary":" This paper addresses the problem of preference learning, which aims to learn\nuser-specific preferences (e.g., \"good parking spot\", \"convenient drop-off\nlocation\") from visual input. Despite its similarity to learning factual\nconcepts (e.g., \"red cube\"), preference learning is a fundamentally harder\nproblem due to its subjective nature and the paucity of person-specific\ntraining data. We address this problem using a new framework called Synapse,\nwhich is a neuro-symbolic approach designed to efficiently learn preferential\nconcepts from limited demonstrations. Synapse represents preferences as\nneuro-symbolic programs in a domain-specific language (DSL) that operates over\nimages, and leverages a novel combination of visual parsing, large language\nmodels, and program synthesis to learn programs representing individual\npreferences. We evaluate Synapse through extensive experimentation including a\nuser case study focusing on mobility-related concepts in mobile robotics and\nautonomous driving. Our evaluation demonstrates that Synapse significantly\noutperforms existing baselines as well as its own ablations. The code and other\ndetails can be found on the project website https://amrl.cs.utexas.edu/synapse .\n","authors":["Sadanand Modak","Noah Patton","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2403.16689v2.pdf","comment":"25 pages, 7 tables, 9 figures; Preprint; Updated figures and\n appendix, added VLM ablations"},{"id":"http://arxiv.org/abs/2405.03959v1","updated":"2024-05-07T02:45:50Z","published":"2024-05-07T02:45:50Z","title":"Joint Estimation of Identity Verification and Relative Pose for Partial\n Fingerprints","summary":" Currently, portable electronic devices are becoming more and more popular.\nFor lightweight considerations, their fingerprint recognition modules usually\nuse limited-size sensors. However, partial fingerprints have few matchable\nfeatures, especially when there are differences in finger pressing posture or\nimage quality, which makes partial fingerprint verification challenging. Most\nexisting methods regard fingerprint position rectification and identity\nverification as independent tasks, ignoring the coupling relationship between\nthem -- relative pose estimation typically relies on paired features as\nanchors, and authentication accuracy tends to improve with more precise pose\nalignment. Consequently, in this paper we propose a method that jointly\nestimates identity verification and relative pose for partial fingerprints,\naiming to leverage their inherent correlation to improve each other. 
To achieve\nthis, we propose a multi-task CNN (Convolutional Neural Network)-Transformer\nhybrid network, and design a pre-training task to enhance the feature\nextraction capability. Experiments on multiple public datasets (NIST SD14,\nFVC2002 DB1A & DB3A, FVC2004 DB1A & DB2A, FVC2006 DB1A) and an in-house dataset\nshow that our method achieves state-of-the-art performance in both partial\nfingerprint verification and relative pose estimation, while being more\nefficient than previous methods.\n","authors":["Xiongjun Guan","Zhiyu Pan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.03959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03958v1","updated":"2024-05-07T02:45:28Z","published":"2024-05-07T02:45:28Z","title":"Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your\n Diffusion Model","summary":" Current state-of-the-art diffusion models employ U-Net architectures\ncontaining convolutional and (qkv) self-attention layers. The U-Net processes\nimages while being conditioned on the time embedding input for each sampling\nstep and the class or caption embedding input corresponding to the desired\nconditional generation. Such conditioning involves scale-and-shift operations\nto the convolutional layers but does not directly affect the attention layers.\nWhile these standard architectural choices are certainly effective, not\nconditioning the attention layers feels arbitrary and potentially suboptimal.\nIn this work, we show that simply adding LoRA conditioning to the attention\nlayers without changing or tuning the other parts of the U-Net architecture\nimproves the image generation quality. For example, a drop-in addition of LoRA\nconditioning to EDM diffusion model yields FID scores of 1.91/1.75 for\nunconditional and class-conditional CIFAR-10 generation, improving upon the\nbaseline of 1.97/1.79.\n","authors":["Joo Young Choi","Jaesung R. Park","Inkyu Park","Jaewoong Cho","Albert No","Ernest K. Ryu"],"pdf_url":"https://arxiv.org/pdf/2405.03958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03955v1","updated":"2024-05-07T02:29:41Z","published":"2024-05-07T02:29:41Z","title":"IPFed: Identity protected federated learning for user authentication","summary":" With the development of laws and regulations related to privacy preservation,\nit has become difficult to collect personal data to perform machine learning.\nIn this context, federated learning, which is distributed learning without\nsharing personal data, has been proposed. In this paper, we focus on federated\nlearning for user authentication. We show that it is difficult to achieve both\nprivacy preservation and high accuracy with existing methods. To address these\nchallenges, we propose IPFed which is privacy-preserving federated learning\nusing random projection for class embedding. Furthermore, we prove that IPFed\nis capable of learning equivalent to the state-of-the-art method. Experiments\non face image datasets show that IPFed can protect the privacy of personal data\nwhile maintaining the accuracy of the state-of-the-art method.\n","authors":["Yosuke Kaga","Yusei Suzuki","Kenta Takahashi"],"pdf_url":"https://arxiv.org/pdf/2405.03955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00187v4","updated":"2024-05-07T02:25:01Z","published":"2023-10-31T23:19:30Z","title":"Decodable and Sample Invariant Continuous Object Encoder","summary":" We propose Hyper-Dimensional Function Encoding (HDFE). Given samples of a\ncontinuous object (e.g. 
a function), HDFE produces an explicit vector\nrepresentation of the given object, invariant to the sample distribution and\ndensity. Sample distribution and density invariance enables HDFE to\nconsistently encode continuous objects regardless of their sampling, and\ntherefore allows neural networks to receive continuous objects as inputs for\nmachine learning tasks, such as classification and regression. Besides, HDFE\ndoes not require any training and is proved to map the object into an organized\nembedding space, which facilitates the training of the downstream tasks. In\naddition, the encoding is decodable, which enables neural networks to regress\ncontinuous objects by regressing their encodings. Therefore, HDFE serves as an\ninterface for processing continuous objects.\n We apply HDFE to function-to-function mapping, where vanilla HDFE achieves\ncompetitive performance as the state-of-the-art algorithm. We apply HDFE to\npoint cloud surface normal estimation, where a simple replacement from PointNet\nto HDFE leads to immediate 12% and 15% error reductions in two benchmarks. In\naddition, by integrating HDFE into the PointNet-based SOTA network, we improve\nthe SOTA baseline by 2.5% and 1.7% in the same benchmarks.\n","authors":["Dehao Yuan","Furong Huang","Cornelia Fermüller","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2311.00187v4.pdf","comment":"ICLR2024 Conference Paper"},{"id":"http://arxiv.org/abs/2404.01568v3","updated":"2024-05-07T02:21:11Z","published":"2024-04-02T02:01:21Z","title":"A Linear Time and Space Local Point Cloud Geometry Encoder via\n Vectorized Kernel Mixture (VecKM)","summary":" We propose VecKM, a local point cloud geometry encoder that is descriptive\nand efficient to compute. VecKM leverages a unique approach by vectorizing a\nkernel mixture to represent the local point cloud. Such representation's\ndescriptiveness is supported by two theorems that validate its ability to\nreconstruct and preserve the similarity of the local shape. Unlike existing\nencoders downsampling the local point cloud, VecKM constructs the local\ngeometry encoding using all neighboring points, producing a more descriptive\nencoding. Moreover, VecKM is efficient to compute and scalable to large point\ncloud inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and\nreduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$\nis the size of the point cloud, $K$ is the neighborhood size, $d$ is the\nencoding dimension, and $p$ is a marginal factor. The efficiency is due to\nVecKM's unique factorizable property that eliminates the need of explicitly\ngrouping points into neighbors. In the normal estimation task, VecKM\ndemonstrates not only 100x faster inference speed but also highest accuracy and\nstrongest robustness. 
In classification and segmentation tasks, integrating\nVecKM as a preprocessing module achieves consistently better performance than\nthe PointNet, PointNet++, and point transformer baselines, and runs\nconsistently faster by up to 10 times.\n","authors":["Dehao Yuan","Cornelia Fermüller","Tahseen Rabbani","Furong Huang","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2404.01568v3.pdf","comment":"ICML2024 Conference Paper"},{"id":"http://arxiv.org/abs/2405.02824v2","updated":"2024-05-07T02:17:59Z","published":"2024-05-05T06:21:58Z","title":"Adaptive Guidance Learning for Camouflaged Object Detection","summary":" Camouflaged object detection (COD) aims to segment objects visually embedded\nin their surroundings, which is a very challenging task due to the high\nsimilarity between the objects and the background. To address it, most methods\noften incorporate additional information (e.g., boundary, texture, and\nfrequency clues) to guide feature learning for better detecting camouflaged\nobjects from the background. Although progress has been made, these methods are\nbasically individually tailored to specific auxiliary cues, thus lacking\nadaptability and not consistently achieving high segmentation performance. To\nthis end, this paper proposes an adaptive guidance learning network, dubbed\n\\textit{AGLNet}, which is a unified end-to-end learnable model for exploring\nand adapting different additional cues in CNN models to guide accurate\ncamouflaged feature learning. Specifically, we first design a straightforward\nadditional information generation (AIG) module to learn additional camouflaged\nobject cues, which can be adapted for the exploration of effective camouflaged\nfeatures. Then we present a hierarchical feature combination (HFC) module to\ndeeply integrate additional cues and image features to guide camouflaged\nfeature learning in a multi-level fusion manner.Followed by a recalibration\ndecoder (RD), different features are further aggregated and refined for\naccurate object prediction. Extensive experiments on three widely used COD\nbenchmark datasets demonstrate that the proposed method achieves significant\nperformance improvements under different additional cues, and outperforms the\nrecent 20 state-of-the-art methods by a large margin. Our code will be made\npublicly available at: \\textcolor{blue}{{https://github.com/ZNan-Chen/AGLNet}}.\n","authors":["Zhennan Chen","Xuying Zhang","Tian-Zhu Xiang","Ying Tai"],"pdf_url":"https://arxiv.org/pdf/2405.02824v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03945v1","updated":"2024-05-07T02:10:30Z","published":"2024-05-07T02:10:30Z","title":"Role of Sensing and Computer Vision in 6G Wireless Communications","summary":" Recently, we are witnessing the remarkable progress and widespread adoption\nof sensing technologies in autonomous driving, robotics, and metaverse.\nConsidering the rapid advancement of computer vision (CV) technology to analyze\nthe sensing information, we anticipate a proliferation of wireless applications\nexploiting the sensing and CV technologies in 6G. In this article, we provide a\nholistic overview of the sensing and CV-aided wireless communications (SVWC)\nframework for 6G. By analyzing the high-resolution sensing information through\nthe powerful CV techniques, SVWC can quickly and accurately understand the\nwireless environments and then perform the wireless tasks. 
To demonstrate the\nefficacy of SVWC, we design the whole process of SVWC including the sensing\ndataset collection, DL model training, and execution of realistic wireless\ntasks. From the numerical evaluations on 6G communication scenarios, we show\nthat SVWC achieves considerable performance gains over the conventional 5G\nsystems in terms of positioning accuracy, data rate, and access latency.\n","authors":["Seungnyun Kim","Jihoon Moon","Jinhong Kim","Yongjun Ahn","Donghoon Kim","Sunwoo Kim","Kyuhong Shim","Byonghyo Shim"],"pdf_url":"https://arxiv.org/pdf/2405.03945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v5","updated":"2024-05-07T01:46:15Z","published":"2024-02-02T12:02:46Z","title":"Skip \\n: A Simple Method to Reduce Hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nof multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\\n\\n), where the content before and\nafter '\\n\\n' in the training data frequently exhibit significant semantic\nchanges. This pattern leads the model to infer that the contents following\n'\\n\\n' should be obviously different from the preceding contents with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\\n\\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Besides, we find that deliberately inserting\n'\\n\\n' at the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10665v2","updated":"2024-05-07T01:05:14Z","published":"2024-02-16T13:14:12Z","title":"Selective Prediction for Semantic Segmentation using Post-Hoc Confidence\n Estimation and Its Performance under Distribution Shift","summary":" Semantic segmentation plays a crucial role in various computer vision\napplications, yet its efficacy is often hindered by the lack of high-quality\nlabeled data. To address this challenge, a common strategy is to leverage\nmodels trained on data from different populations, such as publicly available\ndatasets. This approach, however, leads to the distribution shift problem,\npresenting a reduced performance on the population of interest. In scenarios\nwhere model errors can have significant consequences, selective prediction\nmethods offer a means to mitigate risks and reduce reliance on expert\nsupervision. This paper investigates selective prediction for semantic\nsegmentation in low-resource settings, thus focusing on post-hoc confidence\nestimators applied to pre-trained models operating under distribution shift. 
We\npropose a novel image-level confidence measure tailored for semantic\nsegmentation and demonstrate its effectiveness through experiments on three\nmedical imaging tasks. Our findings show that post-hoc confidence estimators\noffer a cost-effective approach to reducing the impacts of distribution shift.\n","authors":["Bruno Laboissiere Camargos Borges","Bruno Machado Pacheco","Danilo Silva"],"pdf_url":"https://arxiv.org/pdf/2402.10665v2.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + 
--icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 
4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..42c308e --- /dev/null +++ b/index.html @@ -0,0 +1,158325 @@ + + + + + Yibo's arxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 104 + +
+
+
+ + ☆ Tactile-Augmented Radiance Fields CVPR 2024 + + +
+ We present a scene representation, which we call a tactile-augmented radiance +field (TaRF), that brings vision and touch into a shared 3D space. This +representation can be used to estimate the visual and tactile signals for a +given 3D position within a scene. We capture a scene's TaRF from a collection +of photos and sparsely sampled touch probes. Our approach makes use of two +insights: (i) common vision-based touch sensors are built on ordinary cameras +and thus can be registered to images using methods from multi-view geometry, +and (ii) visually and structurally similar regions of a scene share the same +tactile features. We use these insights to register touch signals to a captured +visual scene, and to train a conditional diffusion model that, provided with an +RGB-D image rendered from a neural radiance field, generates its corresponding +tactile signal. To evaluate our approach, we collect a dataset of TaRFs. This +dataset contains more touch samples than previous real-world datasets, and it +provides spatially aligned visual signals for each captured touch signal. We +demonstrate the accuracy of our cross-modal generative model and the utility of +the captured visual-tactile data on several downstream tasks. Project page: +https://dou-yiming.github.io/TaRF + +
+
+ comment: CVPR 2024, Project page: https://dou-yiming.github.io/TaRF, Code: + https://github.com/Dou-Yiming/TaRF/ +
+
+
+
+
+ + ☆ ChatHuman: Language-driven 3D Human Understanding with + Retrieval-Augmented Tool Reasoning + + +
+ Numerous methods have been proposed to detect, estimate, and analyze +properties of people in images, including the estimation of 3D pose, shape, +contact, human-object interaction, emotion, and more. Each of these methods +works in isolation instead of synergistically. Here we address this problem and +build a language-driven human understanding system -- ChatHuman, which combines +and integrates the skills of many different methods. To do so, we finetune a +Large Language Model (LLM) to select and use a wide variety of existing tools +in response to user inputs. In doing so, ChatHuman is able to combine +information from multiple tools to solve problems more accurately than the +individual tools themselves and to leverage tool output to improve its ability +to reason about humans. The novel features of ChatHuman include leveraging +academic publications to guide the application of 3D human-related tools, +employing a retrieval-augmented generation model to generate +in-context-learning examples for handling new tools, and discriminating and +integrating tool results to enhance 3D human understanding. Our experiments +show that ChatHuman outperforms existing models in both tool selection accuracy +and performance across multiple 3D human-related tasks. ChatHuman is a step +towards consolidating diverse methods for human analysis into a single, +powerful, system for 3D human reasoning. + +
+
+ comment: Project page: https://chathuman.github.io +
+
+
+
+
+ + ☆ Edit-Your-Motion: Space-Time Diffusion Decoupling Learning for Video + Motion Editing + + +
+ Existing diffusion-based video editing methods have achieved impressive +results in motion editing. Most of the existing methods focus on the motion +alignment between the edited video and the reference video. However, these +methods do not constrain the background and object content of the video to +remain unchanged, which makes it possible for users to generate unexpected +videos. In this paper, we propose a one-shot video motion editing method called +Edit-Your-Motion that requires only a single text-video pair for training. +Specifically, we design the Detailed Prompt-Guided Learning Strategy (DPL) to +decouple spatio-temporal features in space-time diffusion models. DPL separates +learning object content and motion into two training stages. In the first +training stage, we focus on learning the spatial features (the features of +object content) and breaking down the temporal relationships in the video +frames by shuffling them. We further propose Recurrent-Causal Attention +(RC-Attn) to learn the consistent content features of the object from unordered +video frames. In the second training stage, we restore the temporal +relationship in video frames to learn the temporal feature (the features of the +background and object's motion). We also adopt the Noise Constraint Loss to +smooth out inter-frame differences. Finally, in the inference stage, we inject +the content features of the source object into the editing branch through a +two-branch structure (editing branch and reconstruction branch). With +Edit-Your-Motion, users can edit the motion of objects in the source video to +generate more exciting and diverse videos. Comprehensive qualitative +experiments, quantitative experiments and user preference studies demonstrate +that Edit-Your-Motion performs better than other methods. + +
+
+
+
+
+ + ☆ S3Former: Self-supervised High-resolution Transformer for Solar PV + Profiling + + +
+ As the impact of climate change escalates, the global necessity to transition +to sustainable energy sources becomes increasingly evident. Renewable energies +have emerged as a viable solution for users, with Photovoltaic energy being a +favored choice for small installations due to its reliability and efficiency. +Accurate mapping of PV installations is crucial for understanding the extent +of their adoption and informing energy policy. To meet this need, we introduce +S3Former, designed to segment solar panels from aerial imagery and provide size +and location information critical for analyzing the impact of such +installations on the grid. Solar panel identification is challenging due to +factors such as varying weather conditions, roof characteristics, Ground +Sampling Distance variations and lack of appropriate initialization weights for +optimized training. To tackle these complexities, S3Former features a Masked +Attention Mask Transformer incorporating a self-supervised learning pretrained +backbone. Specifically, our model leverages low-level and high-level features +extracted from the backbone and incorporates an instance query mechanism +into the Transformer architecture to enhance the localization of +solar PV installations. We introduce a self-supervised learning phase (pretext +task) to improve the initialization weights on the backbone of S3Former. We +evaluated S3Former on diverse datasets, demonstrating improvements over +state-of-the-art models. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ A Significantly Better Class of Activation Functions Than ReLU Like + Activation Functions + + +
+ This paper introduces a significantly better class of activation functions +than the almost universally used ReLU like and Sigmoidal class of activation +functions. Two new activation functions referred to as the Cone and +Parabolic-Cone that differ drastically from popular activation functions and +significantly outperform these on the CIFAR-10 and Imagenette benchmarks are +proposed. The cone activation functions are positive only on a finite interval +and are strictly negative except at the end-points of the interval, where they +become zero. Thus the set of inputs that produce a positive output for a neuron +with cone activation functions is a hyperstrip and not a half-space as is the +usual case. Since a hyperstrip is the region between two parallel +hyper-planes, it allows neurons to more finely divide the input feature space +into positive and negative classes than with infinitely wide half-spaces. In +particular, the XOR function can be learned by a single neuron with cone-like +activation functions. Both the cone and parabolic-cone activation functions are +shown to achieve higher accuracies with significantly fewer neurons on +benchmarks. The results presented in this paper indicate that many nonlinear +real-world datasets may be separated with fewer hyperstrips than half-spaces. +The Cone and Parabolic-Cone activation functions have larger derivatives than +ReLU and are shown to significantly speed up training. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Towards Geographic Inclusion in the Evaluation of Text-to-Image Models + + +
+ Rapid progress in text-to-image generative models coupled with their +deployment for visual content creation has magnified the importance of +thoroughly evaluating their performance and identifying potential biases. In +pursuit of models that generate images that are realistic, diverse, visually +appealing, and consistent with the given prompt, researchers and practitioners +often turn to automated metrics to facilitate scalable and cost-effective +performance profiling. However, commonly-used metrics often fail to account for +the full diversity of human preference; often even in-depth human evaluations +face challenges with subjectivity, especially as interpretations of evaluation +criteria vary across regions and cultures. In this work, we conduct a large, +cross-cultural study to study how much annotators in Africa, Europe, and +Southeast Asia vary in their perception of geographic representation, visual +appeal, and consistency in real and generated images from state-of-the art +public APIs. We collect over 65,000 image annotations and 20 survey responses. +We contrast human annotations with common automated metrics, finding that human +preferences vary notably across geographic location and that current metrics do +not fully account for this diversity. For example, annotators in different +locations often disagree on whether exaggerated, stereotypical depictions of a +region are considered geographically representative. In addition, the utility +of automatic evaluations is dependent on assumptions about their set-up, such +as the alignment of feature extractors with human perception of object +similarity or the definition of "appeal" captured in reference datasets used to +ground evaluations. We recommend steps for improved automatic and human +evaluations. + +
+
+
+
+
+ + ☆ AugmenTory: A Fast and Flexible Polygon Augmentation Library + + +
+ Data augmentation is a key technique for addressing the challenge of limited +datasets and has become a major component of image processing training +procedures. Techniques such as geometric transformations and color space +adjustments have been thoroughly tested for their ability to artificially +expand training datasets and generate semi-realistic data for training +purposes. Polygons play a crucial role in instance +segmentation and have seen a surge in use across advanced models, such as +YOLOv8. Despite their growing popularity, the lack of specialized libraries +hampers the polygon-augmentation process. This paper introduces a novel +solution to this challenge, embodied in the newly developed AugmenTory library. +Notably, AugmenTory offers reduced computational demands in both time and space +compared to existing methods. Additionally, the library includes a +postprocessing thresholding feature. The AugmenTory package is publicly +available on GitHub, where interested users can access the source code: +https://github.com/Smartory/AugmenTory + +
+
+
+
+
+ + ☆ DistGrid: Scalable Scene Reconstruction with Distributed + Multi-resolution Hash Grid + + +
+ Neural Radiance Field~(NeRF) achieves extremely high quality in object-scaled +and indoor scene reconstruction. However, there exist some challenges when +reconstructing large-scale scenes. MLP-based NeRFs suffer from limited network +capacity, while volume-based NeRFs are heavily memory-consuming when the scene +resolution increases. Recent approaches propose to geographically partition the +scene and learn each sub-region using an individual NeRF. Such partitioning +strategies help volume-based NeRF exceed the single GPU memory limit and scale +to larger scenes. However, this approach requires multiple background NeRF to +handle out-of-partition rays, which leads to redundancy of learning. Inspired +by the fact that the background of current partition is the foreground of +adjacent partition, we propose a scalable scene reconstruction method based on +joint Multi-resolution Hash Grids, named DistGrid. In this method, the scene is +divided into multiple closely-paved yet non-overlapped Axis-Aligned Bounding +Boxes, and a novel segmented volume rendering method is proposed to handle +cross-boundary rays, thereby eliminating the need for background NeRFs. The +experiments demonstrate that our method outperforms existing methods on all +evaluated large-scale scenes, and provides visually plausible scene +reconstruction. The scalability of our method on reconstruction quality is +further evaluated qualitatively and quantitatively. + +
+
+ comment: Originally submitted to Siggraph Asia 2023 +
+
+
+
+
+ + ☆ DocRes: A Generalist Model Toward Unifying Document Image Restoration + Tasks CVPR 2024 + + +
+ Document image restoration is a crucial aspect of Document AI systems, as the +quality of document images significantly influences the overall performance. +Prevailing methods address distinct restoration tasks independently, leading to +intricate systems and the incapability to harness the potential synergies of +multi-task learning. To overcome this challenge, we propose DocRes, a +generalist model that unifies five document image restoration tasks including +dewarping, deshadowing, appearance enhancement, deblurring, and binarization. +To instruct DocRes to perform various restoration tasks, we propose a novel +visual prompt approach called Dynamic Task-Specific Prompt (DTSPrompt). The +DTSPrompt for different tasks comprises distinct prior features, which are +additional characteristics extracted from the input image. Beyond its role as a +cue for task-specific execution, DTSPrompt can also serve as supplementary +information to enhance the model's performance. Moreover, DTSPrompt is more +flexible than prior visual prompt approaches as it can be seamlessly applied +and adapted to inputs with high and variable resolutions. Experimental results +demonstrate that DocRes achieves competitive or superior performance compared +to existing state-of-the-art task-specific models. This underscores the +potential of DocRes across a broader spectrum of document image restoration +tasks. The source code is publicly available at +https://github.com/ZZZHANG-jx/DocRes + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Vision Mamba: A Comprehensive Survey and Taxonomy + + +
+ State Space Model (SSM) is a mathematical model used to describe and analyze +the behavior of dynamic systems. This model has witnessed numerous applications +in several fields, including control theory, signal processing, economics and +machine learning. In the field of deep learning, state space models are used to +process sequence data, such as time series analysis, natural language +processing (NLP) and video understanding. By mapping sequence data to state +space, long-term dependencies in the data can be better captured. In +particular, modern SSMs have shown strong representational capabilities in NLP, +especially in long sequence modeling, while maintaining linear time complexity. +Notably, based on the latest state-space models, Mamba merges time-varying +parameters into SSMs and formulates a hardware-aware algorithm for efficient +training and inference. Given its impressive efficiency and strong long-range +dependency modeling capability, Mamba is expected to become a new AI +architecture that may outperform the Transformer. Recently, a number of works have +attempted to study the potential of Mamba in various fields, such as general +vision, multi-modal, medical image analysis and remote sensing image analysis, +by extending Mamba from the natural language domain to the visual domain. To fully +understand Mamba in the visual domain, we conduct a comprehensive survey and +present a taxonomy study. This survey focuses on Mamba's application to a +variety of visual tasks and data types, and discusses its predecessors, recent +advances and far-reaching impact on a wide range of domains. Since Mamba is now +on an upward trend, please notify us if you have new findings, and new +progress on Mamba will be included in this survey in a timely manner and +updated on the Mamba project at +https://github.com/lx6c78/Vision-Mamba-A-Comprehensive-Survey-and-Taxonomy. + +
+
+ comment: https://github.com/lx6c78/Vision-Mamba-A-Comprehensive-Survey-and-Taxonomy +
+
+
+
+
+ + ☆ Learning To See But Forgetting To Follow: Visual Instruction Tuning + Makes LLMs More Prone To Jailbreak Attacks + + +
+ Augmenting Large Language Models (LLMs) with image-understanding capabilities +has resulted in a boom of high-performing Vision-Language models (VLMs). While +studying the alignment of LLMs to human values has received widespread +attention, the safety of VLMs has not received the same attention. In this +paper, we explore the impact of jailbreaking on three state-of-the-art VLMs, +each using a distinct modeling approach. By comparing each VLM to their +respective LLM backbone, we find that each VLM is more susceptible to +jailbreaking. We consider this as an undesirable outcome from visual +instruction-tuning, which imposes a forgetting effect on an LLM's safety +guardrails. Therefore, we provide recommendations for future work based on +evaluation strategies that aim to highlight the weaknesses of a VLM, as well as +take safety measures into account during visual instruction tuning. + +
+
+
+
+
+ + ☆ BILTS: A novel bi-invariant local trajectory-shape descriptor for + rigid-body motion + + +
+ Measuring the similarity between motions and established motion models is +crucial for motion analysis, recognition, generation, and adaptation. To +enhance similarity measurement across diverse contexts, invariant motion +descriptors have been proposed. However, for rigid-body motion, few invariant +descriptors exist that are bi-invariant, meaning invariant to both the body and +world reference frames used to describe the motion. Moreover, their robustness +to singularities is limited. This paper introduces a novel Bi-Invariant Local +Trajectory-Shape descriptor (BILTS) and a corresponding dissimilarity measure. +Mathematical relationships between BILTS and existing descriptors are derived, +providing new insights into their properties. The paper also includes an +algorithm to reproduce the motion from the BILTS descriptor, demonstrating its +bidirectionality and usefulness for trajectory generation. Experimental +validation using datasets of daily-life activities shows the higher robustness +of the BILTS descriptor compared to the bi-invariant ISA descriptor. This +higher robustness supports the further application of bi-invariant descriptors +for motion recognition and generalization. + +
+
+ comment: This work has been submitted as a regular research paper for + consideration in the IEEE Transactions on Robotics. Copyright may be + transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ☆ DriveWorld: 4D Pre-trained Scene Understanding via World Models for + Autonomous Driving CVPR2024 + + +
+ Vision-centric autonomous driving has recently raised wide attention due to +its lower cost. Pre-training is essential for extracting a universal +representation. However, current vision-centric pre-training typically relies +on either 2D or 3D pre-text tasks, overlooking the temporal characteristics of +autonomous driving as a 4D scene understanding task. In this paper, we address +this challenge by introducing a world model-based autonomous driving 4D +representation learning framework, dubbed \emph{DriveWorld}, which is capable +of pre-training from multi-camera driving videos in a spatio-temporal fashion. +Specifically, we propose a Memory State-Space Model for spatio-temporal +modelling, which consists of a Dynamic Memory Bank module for learning +temporal-aware latent dynamics to predict future changes and a Static Scene +Propagation module for learning spatial-aware latent statics to offer +comprehensive scene contexts. We additionally introduce a Task Prompt to +decouple task-aware features for various downstream tasks. The experiments +demonstrate that DriveWorld delivers promising results on various autonomous +driving tasks. When pre-trained with the OpenScene dataset, DriveWorld achieves +a 7.5% increase in mAP for 3D object detection, a 3.0% increase in IoU for +online mapping, a 5.0% increase in AMOTA for multi-object tracking, a 0.1m +decrease in minADE for motion forecasting, a 3.0% increase in IoU for occupancy +prediction, and a 0.34m reduction in average L2 error for planning. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ $\textbf{Splat-MOVER}$: Multi-Stage, Open-Vocabulary Robotic + Manipulation via Editable Gaussian Splatting + + +
+ We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic +manipulation, which leverages the editability of Gaussian Splatting (GSplat) +scene representations to enable multi-stage manipulation tasks. Splat-MOVER +consists of: (i) $\textit{ASK-Splat}$, a GSplat representation that distills +latent codes for language semantics and grasp affordance into the 3D scene. +ASK-Splat enables geometric, semantic, and affordance understanding of 3D +scenes, which is critical for many robotics tasks; (ii) $\textit{SEE-Splat}$, a +real-time scene-editing module using 3D semantic masking and infilling to +visualize the motions of objects that result from robot interactions in the +real-world. SEE-Splat creates a "digital twin" of the evolving environment +throughout the manipulation task; and (iii) $\textit{Grasp-Splat}$, a grasp +generation module that uses ASK-Splat and SEE-Splat to propose candidate grasps +for open-world objects. ASK-Splat is trained in real-time from RGB images in a +brief scanning phase prior to operation, while SEE-Splat and Grasp-Splat run in +real-time during operation. We demonstrate the superior performance of +Splat-MOVER in hardware experiments on a Kinova robot compared to two recent +baselines in four single-stage, open-vocabulary manipulation tasks, as well as +in four multi-stage manipulation tasks using the edited scene to reflect scene +changes due to prior manipulation stages, which is not possible with the +existing baselines. Code for this project and a link to the project page will +be made available soon. + +
+
+
+
+
+ + ☆ Choose What You Need: Disentangled Representation Learning for Scene + Text Recognition, Removal and Editing CVPR 2024 + + +
+ Scene text images contain not only style information (font, background) but +also content information (character, texture). Different scene text tasks need +different information, but previous representation learning methods use tightly +coupled features for all tasks, resulting in sub-optimal performance. We +propose a Disentangled Representation Learning framework (DARLING) aimed at +disentangling these two types of features for improved adaptability to various downstream tasks (choose what you really need). +Specifically, we synthesize a dataset of image pairs with identical style but +different content. Based on the dataset, we decouple the two types of features +through the supervision design. Concretely, we directly split the visual representation +into style and content features; the content features are supervised by a text +recognition loss, while an alignment loss aligns the style features in the +image pairs. Then, style features are employed in reconstructing the +counterpart image via an image decoder with a prompt that indicates the +counterpart's content. Such an operation effectively decouples the features +based on their distinctive properties. To the best of our knowledge, this is +the first work in the field of scene text to disentangle the inherent +properties of text images. Our method achieves state-of-the-art performance +in Scene Text Recognition, Removal, and Editing. +
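+ As a rough illustration of the supervision scheme sketched above (not the authors' code), one could split a visual feature into style and content halves, supervise the content branch through a recognition head, and pull the style halves of an identical-style pair together. All tensor shapes, the even split, and the function name below are assumptions.

```python
import torch
import torch.nn.functional as F

def disentangle_losses(feat_a, feat_b, char_logits_a, char_labels_a):
    # feat_*: (B, D) features of an identical-style image pair (assumed shapes);
    # char_logits_a: (B, T, V) outputs of a hypothetical recognition head on the
    # content branch, char_labels_a: (B, T) character labels.
    style_a, _content_a = feat_a.chunk(2, dim=-1)
    style_b, _ = feat_b.chunk(2, dim=-1)
    rec_loss = F.cross_entropy(char_logits_a.flatten(0, 1), char_labels_a.flatten())
    align_loss = F.mse_loss(style_a, style_b)   # style halves of the pair agree
    return rec_loss + align_loss
```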
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Diff-IP2D: Diffusion-Based Hand-Object Interaction Prediction on + Egocentric Videos + + +
+ Understanding how humans would behave during hand-object interaction is vital +for applications in service robot manipulation and extended reality. To achieve +this, some recent works have been proposed to simultaneously predict hand +trajectories and object affordances on human egocentric videos. They are +regarded as the representation of future hand-object interactions, indicating +potential human motion and motivation. However, the existing approaches mostly +adopt the autoregressive paradigm for unidirectional prediction, which lacks +mutual constraints within the holistic future sequence, and accumulates errors +along the time axis. Meanwhile, these works largely overlook the effect of +camera egomotion on first-person view predictions. To address these +limitations, we propose a novel diffusion-based interaction prediction method, +namely Diff-IP2D, to forecast future hand trajectories and object affordances +concurrently in an iterative non-autoregressive manner. We transform the +sequential 2D images into a latent feature space and design a denoising diffusion +model to predict future latent interaction features conditioned on past ones. +Motion features are further integrated into the conditional denoising process +to make Diff-IP2D aware of the camera wearer's dynamics for more accurate +interaction prediction. The experimental results show that our method +significantly outperforms the state-of-the-art baselines on both the +off-the-shelf metrics and our proposed new evaluation protocol. This highlights +the efficacy of leveraging a generative paradigm for 2D hand-object interaction +prediction. The code of Diff-IP2D will be released at +https://github.com/IRMVLab/Diff-IP2D. +
+
+
+
+
+ + ☆ Diffusion-driven GAN Inversion for Multi-Modal Face Image Generation CVPR 2024 + + +
+ We present a new multi-modal face image generation method that converts a +text prompt and a visual input, such as a semantic mask or scribble map, into a +photo-realistic face image. To do this, we combine the strengths of Generative +Adversarial Networks (GANs) and diffusion models (DMs) by mapping the +multi-modal features of the DM into the latent space of the pre-trained GANs. +We present a simple mapping and a style modulation network to link the two models +and convert meaningful representations in feature maps and attention maps into +latent codes. With GAN inversion, the estimated latent codes can be used to +generate 2D or 3D-aware facial images. We further present a multi-step training +strategy that reflects textual and structural representations into the +generated image. Our proposed network produces realistic 2D, multi-view, and +stylized face images, which align well with the inputs. We validate our method by +using pre-trained 2D and 3D GANs, and our results outperform existing methods. +Our project page is available at +https://github.com/1211sh/Diffusion-driven_GAN-Inversion/. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Novel View Synthesis with Neural Radiance Fields for Industrial Robot + Applications SP + + +
+ Neural Radiance Fields (NeRFs) have become a rapidly growing research field +with the potential to revolutionize typical photogrammetric workflows, such as +those used for 3D scene reconstruction. As input, NeRFs require multi-view +images with corresponding camera poses as well as the interior orientation. In +the typical NeRF workflow, the camera poses and the interior orientation are +estimated in advance with Structure from Motion (SfM). But the quality of the +resulting novel views, which depends on different parameters such as the number +and distribution of available images, as well as the accuracy of the related +camera poses and interior orientation, is difficult to predict. In addition, +SfM is a time-consuming pre-processing step, and its quality strongly depends +on the image content. Furthermore, the undefined scaling factor of SfM hinders +subsequent steps in which metric information is required. In this paper, we +evaluate the potential of NeRFs for industrial robot applications. We propose +an alternative to SfM pre-processing: we capture the input images with a +calibrated camera that is attached to the end effector of an industrial robot +and determine accurate camera poses with metric scale based on the robot +kinematics. We then investigate the quality of the novel views by comparing +them to ground truth, and by computing an internal quality measure based on +ensemble methods. For evaluation purposes, we acquire multiple datasets that +pose challenges for reconstruction typical of industrial applications, like +reflective objects, poor texture, and fine structures. We show that the +robot-based pose determination reaches similar accuracy as SfM in non-demanding +cases, while having clear advantages in more challenging scenarios. Finally, we +present first results of applying the ensemble method to estimate the quality +of the synthetic novel view in the absence of a ground truth. + +
+
+ comment: 8 pages, 8 figures, accepted for publication in The International + Archives of the Photogrammetry, Remote Sensing and Spatial Information + Sciences (ISPRS Archives) 2024 +
+
+
+
+
+ + ☆ Audio-Visual Speech Representation Expert for Enhanced Talking Face + Video Generation and Evaluation CVPR2024 + + +
+ In the task of talking face generation, the objective is to generate a face +video with lips synchronized to the corresponding audio while preserving visual +details and identity information. Current methods face the challenge of +learning accurate lip synchronization while avoiding detrimental effects on +visual quality, as well as robustly evaluating such synchronization. To tackle +these problems, we propose utilizing an audio-visual speech representation +expert (AV-HuBERT) for calculating lip synchronization loss during training. +Moreover, leveraging AV-HuBERT's features, we introduce three novel lip +synchronization evaluation metrics, aiming to provide a comprehensive +assessment of lip synchronization performance. Experimental results, along with +a detailed ablation study, demonstrate the effectiveness of our approach and +the utility of the proposed evaluation metrics. + +
+
+ comment: CVPR2024 NTIRE Workshop +
+
+
+
+
+ + ☆ Inf-DiT: Upsampling Any-Resolution Image with Memory-Efficient Diffusion + Transformer + + +
+ Diffusion models have shown remarkable performance in image generation in +recent years. However, due to a quadratic increase in memory when generating +ultra-high-resolution images (e.g. 4096*4096), the resolution of generated +images is often limited to 1024*1024. In this work, we propose a unidirectional +block attention mechanism that can adaptively adjust the memory overhead during +the inference process and handle global dependencies. Building on this module, +we adopt the DiT structure for upsampling and develop an infinite +super-resolution model capable of upsampling images of various shapes and +resolutions. Comprehensive experiments show that our model achieves SOTA +performance in generating ultra-high-resolution images in both machine and +human evaluation. Compared to commonly used UNet structures, our model can save +more than 5x memory when generating 4096*4096 images. The project URL is +https://github.com/THUDM/Inf-DiT. +
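+ The unidirectional block attention above is described only at a high level; the following is a speculative sketch of a block-causal attention mask (each block attends to itself and to earlier blocks), which is one common way such a constraint is expressed. The function name and block sizes are made up for illustration.

```python
import torch

def block_causal_mask(num_blocks: int, block_size: int) -> torch.Tensor:
    """Boolean (N, N) mask where True means 'token i may attend to token j'."""
    n = num_blocks * block_size
    block_id = torch.arange(n) // block_size
    # Attend only within the same block or to earlier blocks.
    return block_id.unsqueeze(1) >= block_id.unsqueeze(0)

mask = block_causal_mask(num_blocks=4, block_size=3)  # (12, 12)
```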
+
+
+
+
+ + ☆ Cross-IQA: Unsupervised Learning for Image Quality Assessment + + +
+ Automatic perception of image quality is a challenging problem that impacts +billions of Internet and social media users daily. To advance research in this +field, we propose a no-reference image quality assessment (NR-IQA) method +termed Cross-IQA based on the vision transformer (ViT) model. The proposed Cross-IQA +method can learn image quality features from unlabeled image data. We construct +a pretext task of synthesized image reconstruction to extract image quality +information in an unsupervised manner using ViT blocks. The pretrained encoder of +Cross-IQA is used to fine-tune a linear regression model for score prediction. +Experimental results show that Cross-IQA can achieve state-of-the-art +performance in assessing the low-frequency degradation information (e.g., color +change, blurring, etc.) of images compared with the classical full-reference +IQA and NR-IQA under the same datasets. +
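+ The fine-tuning step above (a linear regressor on top of frozen self-supervised features) can be pictured with a minimal stand-in like the one below; the feature matrix and scores are random placeholders rather than real encoder outputs or opinion scores.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

features = np.random.randn(200, 768)   # placeholder for frozen ViT encoder outputs
mos = np.random.rand(200) * 100        # placeholder mean-opinion scores
head = LinearRegression().fit(features, mos)
predicted_quality = head.predict(features[:5])
```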
+
+
+
+
+ + ☆ Non-rigid Structure-from-Motion: Temporally-smooth Procrustean Alignment + and Spatially-variant Deformation Modeling CVPR 2024 + + +
+ Even though Non-rigid Structure-from-Motion (NRSfM) has been extensively +studied and great progress has been made, there are still key challenges that +hinder its broad real-world application: 1) the inherent motion/rotation +ambiguity requires either explicit camera motion recovery with extra constraints +or complex Procrustean Alignment; 2) existing low-rank modeling of the global +shape can over-penalize drastic deformations in the 3D shape sequence. This +paper proposes to resolve the above issues from a spatial-temporal modeling +perspective. First, we propose a novel Temporally-smooth Procrustean Alignment +module that estimates 3D deforming shapes and adjusts the camera motion by +aligning the 3D shape sequence consecutively. Our new alignment module removes +the requirement of a complex reference 3D shape during alignment, which is more +conducive to non-isotropic deformation modeling. Second, we propose a +spatial-weighted approach to enforce the low-rank constraint adaptively at +different locations to better accommodate the reconstruction of drastic +spatially-variant deformations. Our modeling outperforms existing low-rank based methods, +and extensive experiments across different datasets validate the effectiveness +of our method. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ A New Dataset and Comparative Study for Aphid Cluster Detection and + Segmentation in Sorghum Fields + + +
+ Aphid infestations are one of the primary causes of extensive damage to wheat +and sorghum fields and are one of the most common vectors for plant viruses, +resulting in significant agricultural yield losses. To address this problem, +farmers often resort to the inefficient use of harmful chemical pesticides, which +have negative health and environmental impacts. As a result, a large amount of +pesticide is wasted on areas without significant pest infestation. This brings +to attention the urgent need for an intelligent autonomous system that can +locate and spray sufficiently large infestations selectively within the complex +crop canopies. We have developed a large multi-scale dataset for aphid cluster +detection and segmentation, collected from actual sorghum fields and +meticulously annotated to include clusters of aphids. Our dataset comprises a +total of 54,742 image patches, showcasing a variety of viewpoints, diverse +lighting conditions, and multiple scales, highlighting its effectiveness for +real-world applications. In this study, we trained and evaluated four real-time +semantic segmentation models and three object detection models specifically for +aphid cluster segmentation and detection. Considering the balance between +accuracy and efficiency, Fast-SCNN delivered the most effective segmentation +results, achieving 80.46% mean precision, 81.21% mean recall, and 91.66 frames +per second (FPS). For object detection, RT-DETR exhibited the best overall +performance with a 61.63% mean average precision (mAP), 92.6% mean recall, and +72.55 FPS on an NVIDIA V100 GPU. Our experiments further indicate that aphid +cluster segmentation is more suitable for assessing aphid infestations than +using detection models. +
+
+
+
+
+ + ☆ ViewFormer: Exploring Spatiotemporal Modeling for Multi-View 3D + Occupancy Perception via View-Guided Transformers + + +
+ 3D occupancy, an advanced perception technology for driving scenarios, +represents the entire scene without distinguishing between foreground and +background by quantifying the physical space into a grid map. The widely +adopted projection-first deformable attention, efficient in transforming image +features into 3D representations, encounters challenges in aggregating +multi-view features due to sensor deployment constraints. To address this +issue, we propose our learning-first view attention mechanism for effective +multi-view feature aggregation. Moreover, we showcase the scalability of our +view attention across diverse multi-view 3D tasks, such as map construction and +3D object detection. Leveraging the proposed view attention as well as an +additional multi-frame streaming temporal attention, we introduce ViewFormer, a +vision-centric transformer-based framework for spatiotemporal feature +aggregation. To further explore occupancy-level flow representation, we present +FlowOcc3D, a benchmark built on top of existing high-quality datasets. +Qualitative and quantitative analyses on this benchmark reveal the potential to +represent fine-grained dynamic scenes. Extensive experiments show that our +approach significantly outperforms prior state-of-the-art methods. The codes +and benchmark will be released soon. + +
+
+
+
+
+ + ☆ Semi-Supervised Disease Classification based on Limited Medical Image + Data + + +
+ In recent years, significant progress has been made in the field of learning +from positive and unlabeled examples (PU learning), particularly in the context +of advancing image and text classification tasks. However, applying PU learning +to semi-supervised disease classification remains a formidable challenge, +primarily due to the limited availability of labeled medical images. In the +realm of medical image-aided diagnosis algorithms, numerous theoretical and +practical obstacles persist. The research on PU learning for medical +image-assisted diagnosis holds substantial importance, as it aims to reduce the +time spent by professional experts in classifying images. Unlike natural +images, medical images are typically accompanied by a scarcity of annotated +data, while an abundance of unlabeled cases exists. Addressing these +challenges, this paper introduces a novel generative model inspired by Hölder +divergence, specifically designed for semi-supervised disease classification +using positive and unlabeled medical image data. In this paper, we present a +comprehensive formulation of the problem and establish its theoretical +feasibility through rigorous mathematical analysis. To evaluate the +effectiveness of our proposed approach, we conduct extensive experiments on +five benchmark datasets commonly used in PU medical learning: BreastMNIST, +PneumoniaMNIST, BloodMNIST, OCTMNIST, and AMD. The experimental results clearly +demonstrate the superiority of our method over existing approaches based on KL +divergence. Notably, our approach achieves state-of-the-art performance on all +five disease classification benchmarks. + By addressing the limitations imposed by limited labeled data and harnessing +the untapped potential of unlabeled medical images, our novel generative model +presents a promising direction for enhancing semi-supervised disease +classification in the field of medical image analysis. +
+
+
+
+
+ + ☆ Group-aware Parameter-efficient Updating for Content-Adaptive Neural + Video Compression + + +
+ Content-adaptive compression is crucial for enhancing the adaptability of the +pre-trained neural codec for various contents. Although these methods have been +very practical in neural image compression (NIC), their application in neural +video compression (NVC) is still limited due to two main aspects: 1) video +compression relies heavily on temporal redundancy, therefore updating just one +or a few frames can lead to significant errors accumulating over time; 2) NVC +frameworks are generally more complex, with many large components that are not +easy to update quickly during encoding. To address these +challenges, we have developed a content-adaptive NVC technique called +Group-aware Parameter-Efficient Updating (GPU). Initially, to minimize error +accumulation, we adopt a group-aware approach for updating encoder parameters. +This involves adopting a patch-based Group of Pictures (GoP) training strategy +to segment a video into patch-based GoPs, which will be updated to facilitate a +globally optimized domain-transferable solution. Subsequently, we introduce a +parameter-efficient delta-tuning strategy, which is achieved by integrating +several light-weight adapters into each coding component of the encoding +process in both serial and parallel configurations. Such architecture-agnostic +modules stimulate the components with large parameters, thereby reducing both +the update cost and the encoding time. We incorporate our GPU into the latest +NVC framework and conduct comprehensive experiments, whose results showcase +outstanding video compression efficiency across four video benchmarks and +adaptability on one medical image benchmark. +
+
+
+
+
+ + ☆ A General Model for Detecting Learner Engagement: Implementation and + Evaluation + + +
+ Considering learner engagement has a mutual benefit for both learners and +instructors. Instructors can help learners increase their attention, +involvement, motivation, and interest. On the other hand, instructors can +improve their instructional performance by evaluating the cumulative results of +all learners and upgrading their training programs. This paper proposes a +general, lightweight model for selecting and processing features to detect +learners' engagement levels while preserving the sequential temporal +relationship over time. During training and testing, we analyzed the videos +from the publicly available DAiSEE dataset to capture the dynamic essence of +learner engagement. We have also proposed an adaptation policy to find new +labels that utilize the affective states of this dataset related to education, +thereby improving the models' judgment. The suggested model achieves an +accuracy of 68.57% in a specific implementation and outperforms the studied +state-of-the-art models detecting learners' engagement levels. +
+
+ comment: 13 pages, 2 Postscript figures +
+
+
+
+
+ + ☆ Vidu: a Highly Consistent, Dynamic and Skilled Text-to-Video Generator + with Diffusion Models + + +
+ We introduce Vidu, a high-performance text-to-video generator that is capable +of producing 1080p videos up to 16 seconds in a single generation. Vidu is a +diffusion model with U-ViT as its backbone, which unlocks the scalability and +the capability for handling long videos. Vidu exhibits strong coherence and +dynamism, and is capable of generating both realistic and imaginative videos, +as well as understanding some professional photography techniques, on par with +Sora -- the most powerful reported text-to-video generator. Finally, we perform +initial experiments on other controllable video generation, including +canny-to-video generation, video prediction and subject-driven generation, +which demonstrate promising results. + +
+
+ comment: Project page at https://www.shengshu-ai.com/vidu +
+
+
+
+
+ + ☆ Breast Histopathology Image Retrieval by Attention-based Adversarially + Regularized Variational Graph Autoencoder with Contrastive Learning-Based + Feature Extraction + + +
+ Breast cancer is a significant global health concern, particularly for women. +Early detection and appropriate treatment are crucial in mitigating its impact, +with histopathology examinations playing a vital role in swift diagnosis. +However, these examinations often require a substantial workforce and +experienced medical experts for proper recognition and cancer grading. +Automated image retrieval systems have the potential to assist pathologists in +identifying cancerous tissues, thereby accelerating the diagnostic process. +Nevertheless, due to considerable variability among the tissue and cell +patterns in histological images, proposing an accurate image retrieval model is +very challenging. + This work introduces a novel attention-based adversarially regularized +variational graph autoencoder model for breast histological image retrieval. +Additionally, we incorporated cluster-guided contrastive learning as the graph +feature extractor to boost the retrieval performance. We evaluated the proposed +model's performance on two publicly available datasets of breast cancer +histological images and achieved superior or very competitive retrieval +performance, with average mAP scores of 96.5% for the BreakHis dataset and +94.7% for the BACH dataset, and mVP scores of 91.9% and 91.3%, respectively. + Our proposed retrieval model has the potential to be used in clinical +settings to enhance diagnostic performance and ultimately benefit patients. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ☆ Effective and Robust Adversarial Training against Data and Label + Corruptions + + +
+ Corruptions due to data perturbations and label noise are prevalent in the +datasets from unreliable sources, which poses significant threats to model +training. Despite existing efforts in developing robust models, current +learning methods commonly overlook the possible co-existence of both +corruptions, limiting the effectiveness and practicability of the model. In +this paper, we develop an Effective and Robust Adversarial Training (ERAT) +framework to simultaneously handle two types of corruption (i.e., data and +label) without prior knowledge of their specifics. We propose a hybrid +adversarial training surrounding multiple potential adversarial perturbations, +alongside a semi-supervised learning based on class-rebalancing sample +selection to enhance the resilience of the model for dual corruption. On the +one hand, in the proposed adversarial training, the perturbation generation +module learns multiple surrogate malicious data perturbations by taking a DNN +model as the victim, while the model is trained to maintain semantic +consistency between the original data and the hybrid perturbed data. It is +expected to enable the model to cope with unpredictable perturbations in +real-world data corruption. On the other hand, a class-rebalancing data +selection strategy is designed to fairly differentiate clean labels from noisy +labels. Semi-supervised learning is performed accordingly by discarding noisy +labels. Extensive experiments demonstrate the superiority of the proposed ERAT +framework. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Artificial Intelligence-powered fossil shark tooth identification: + Unleashing the potential of Convolutional Neural Networks + + +
+ All fields of knowledge are being impacted by Artificial Intelligence. In +particular, the Deep Learning paradigm enables the development of data analysis +tools that support subject matter experts in a variety of sectors, from physics +up to the recognition of ancient languages. Palaeontology is now observing this +trend as well. This study explores the capability of Convolutional Neural +Networks (CNNs), a particular class of Deep Learning algorithms specifically +crafted for computer vision tasks, to classify images of isolated fossil shark +teeth gathered from online datasets as well as from the authors' experience +on Peruvian Miocene and Italian Pliocene fossil assemblages. The shark taxa +that are included in the final, composite dataset (which consists of more than +one thousand images) are representative of both extinct and extant genera, +namely, Carcharhinus, Carcharias, Carcharocles, Chlamydoselachus, +Cosmopolitodus, Galeocerdo, Hemipristis, Notorynchus, Prionace and Squatina. We +developed a CNN, named SharkNet-X, specifically tailored on our recognition +task, reaching a 5-fold cross validated mean accuracy of 0.85 to identify +images containing a single shark tooth. Furthermore, we elaborated a +visualization of the features extracted from images using the last dense layer +of the CNN, achieved through the application of the clustering technique t-SNE. +In addition, in order to understand and explain the behaviour of the CNN while +giving a paleontological point of view on the results, we introduced the +explainability method SHAP. To the best of our knowledge, this is the first +instance in which this method is applied to the field of palaeontology. The +main goal of this work is to showcase how Deep Learning techniques can aid in +identifying isolated fossil shark teeth, paving the way for developing new +information tools for automating the recognition and classification of fossils. +
+
+ comment: 40 pages, 8 figures +
+
+
+
+
+ + ☆ Topicwise Separable Sentence Retrieval for Medical Report Generation + + +
+ Automated radiology reporting holds immense clinical potential in alleviating +the burdensome workload of radiologists and mitigating diagnostic bias. +Recently, retrieval-based report generation methods have garnered increasing +attention due to their inherent advantages in terms of the quality and +consistency of generated reports. However, due to the long-tail distribution of +the training data, these models tend to learn frequently occurring sentences +and topics, overlooking the rare topics. Regrettably, in many cases, the +descriptions of rare topics often indicate critical findings that should be +mentioned in the report. To address this problem, we introduce a Topicwise +Separable Sentence Retrieval (Teaser) for medical report generation. To ensure +comprehensive learning of both common and rare topics, we categorize queries +into common and rare types to learn differentiated topics, and then propose +Topic Contrastive Loss to effectively align topics and queries in the latent +space. Moreover, we integrate an Abstractor module following the extraction of +visual features, which aids the topic decoder in gaining a deeper understanding +of the visual observational intent. Experiments on the MIMIC-CXR and IU X-ray +datasets demonstrate that Teaser surpasses state-of-the-art models, while also +validating its capability to effectively represent rare topics and establish +more dependable correspondences between queries and topics. + +
+
+
+
+
+ + ☆ D-TrAttUnet: Toward Hybrid CNN-Transformer Architecture for Generic and + Subtle Segmentation in Medical Images + + +
+ Over the past two decades, machine analysis of medical imaging has advanced +rapidly, opening up significant potential for several important medical +applications. As complicated diseases increase and the number of cases rises, +the role of machine-based imaging analysis has become indispensable. It serves +as both a tool and an assistant to medical experts, providing valuable insights +and guidance. A particularly challenging task in this area is lesion +segmentation, a task that is challenging even for experienced radiologists. The +complexity of this task highlights the urgent need for robust machine learning +approaches to support medical staff. In response, we present our novel +solution: the D-TrAttUnet architecture. This framework is based on the +observation that different diseases often target specific organs. Our +architecture includes an encoder-decoder structure with a composite +Transformer-CNN encoder and dual decoders. The encoder includes two paths: the +Transformer path and the Encoders Fusion Module path. The Dual-Decoder +configuration uses two identical decoders, each with attention gates. This +allows the model to simultaneously segment lesions and organs and integrate +their segmentation losses. + To validate our approach, we performed evaluations on the Covid-19 and Bone +Metastasis segmentation tasks. We also investigated the adaptability of the +model by testing it without the second decoder in the segmentation of glands +and nuclei. The results confirmed the superiority of our approach, especially +in Covid-19 infections and the segmentation of bone metastases. In addition, +the hybrid encoder showed exceptional performance in the segmentation of glands +and nuclei, solidifying its role in modern medical image analysis. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2303.15576 +
+
+
+
+
+ + ☆ Bridging the Synthetic-to-Authentic Gap: Distortion-Guided Unsupervised + Domain Adaptation for Blind Image Quality Assessment CVPR2024 + + +
+ The annotation of blind image quality assessment (BIQA) is labor-intensive +and time-consuming, especially for authentic images. Training on synthetic data +is expected to be beneficial, but synthetically trained models often suffer +from poor generalization in real domains due to domain gaps. In this work, we +make a key observation that introducing more distortion types in the synthetic +dataset may not improve, and can even be harmful to, generalization to authentic image +quality assessment. To address this challenge, we propose distortion-guided +unsupervised domain adaptation for BIQA (DGQA), a novel framework that +leverages adaptive multi-domain selection via prior knowledge from distortion +to match the data distribution between the source domains and the target +domain, thereby reducing negative transfer from the outlier source domains. +Extensive experiments on two cross-domain settings (synthetic distortion to +authentic distortion and synthetic distortion to algorithmic distortion) have +demonstrated the effectiveness of our proposed DGQA. Besides, DGQA is +orthogonal to existing model-based BIQA methods, and can be used in combination +with such models to improve performance with less training data. +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Sign2GPT: Leveraging Large Language Models for Gloss-Free Sign Language + Translation ICLR2024 + + +
+ Automatic Sign Language Translation requires the integration of both computer +vision and natural language processing to effectively bridge the communication +gap between sign and spoken languages. However, the deficiency in large-scale +training data to support sign language translation means we need to leverage +resources from spoken language. We introduce Sign2GPT, a novel framework for +sign language translation that utilizes large-scale pretrained vision and +language models via lightweight adapters for gloss-free sign language +translation. The lightweight adapters are crucial for sign language +translation, due to the constraints imposed by limited dataset sizes and the +computational requirements when training with long sign videos. We also propose +a novel pretraining strategy that directs our encoder to learn sign +representations from automatically extracted pseudo-glosses without requiring +gloss order information or annotations. We evaluate our approach on two public +benchmark sign language translation datasets, namely RWTH-PHOENIX-Weather 2014T +and CSL-Daily, and improve on state-of-the-art gloss-free translation +performance by a significant margin. +
+
+ comment: Accepted at ICLR2024 +
+
+
+
+
+ + ☆ Exposing AI-generated Videos: A Benchmark Dataset and a Local-and-Global + Temporal Defect Based Detection Method + + +
+ The generative model has made significant advancements in the creation of +realistic videos, which causes security issues. However, this emerging risk has +not been adequately addressed due to the absence of a benchmark dataset for +AI-generated videos. In this paper, we first construct a video dataset using +advanced diffusion-based video generation algorithms with various semantic +contents. Besides, typical video lossy operations over network transmission are +adopted to generate degraded samples. Then, by analyzing local and global +temporal defects of current AI-generated videos, a novel detection framework by +adaptively learning local motion information and global appearance variation is +constructed to expose fake videos. Finally, experiments are conducted to +evaluate the generalization and robustness of different spatial and temporal +domain detection methods, where the results can serve as the baseline and +demonstrate the research challenge for future studies. + +
+
+
+
+
+ + ☆ ELiTe: Efficient Image-to-LiDAR Knowledge Transfer for Semantic + Segmentation ICME 2024 + + +
+ Cross-modal knowledge transfer enhances point cloud representation learning +in LiDAR semantic segmentation. Despite its potential, the \textit{weak teacher +challenge} arises due to repetitive and non-diverse car camera images and +sparse, inaccurate ground truth labels. To address this, we propose the +Efficient Image-to-LiDAR Knowledge Transfer (ELiTe) paradigm. ELiTe introduces +Patch-to-Point Multi-Stage Knowledge Distillation, transferring comprehensive +knowledge from the Vision Foundation Model (VFM), extensively trained on +diverse open-world images. This enables effective knowledge transfer to a +lightweight student model across modalities. ELiTe employs Parameter-Efficient +Fine-Tuning to strengthen the VFM teacher and expedite large-scale model +training with minimal costs. Additionally, we introduce the Segment Anything +Model based Pseudo-Label Generation approach to enhance low-quality image +labels, facilitating robust semantic representations. Efficient knowledge +transfer in ELiTe yields state-of-the-art results on the SemanticKITTI +benchmark, outperforming real-time inference models. Our approach achieves this +with significantly fewer parameters, confirming its effectiveness and +efficiency. + +
+
+ comment: 9 pages, 6 figures, ICME 2024 oral +
+
+
+
+
+ + ☆ COM3D: Leveraging Cross-View Correspondence and Cross-Modal Mining for + 3D Retrieval ICME 2024 + + +
+ In this paper, we investigate an open research task of cross-modal retrieval +between 3D shapes and textual descriptions. Previous approaches mainly rely on +point cloud encoders for feature extraction, which may ignore key inherent +features of 3D shapes, including depth, spatial hierarchy, geometric +continuity, etc. To address this issue, we propose COM3D, making the first +attempt to exploit the cross-view correspondence and cross-modal mining to +enhance the retrieval performance. Notably, we augment the 3D features through +a scene representation transformer, to generate cross-view correspondence +features of 3D shapes, which enrich the inherent features and enhance their +compatibility with text matching. Furthermore, we propose to optimize the +cross-modal matching process based on the semi-hard negative example mining +method, in an attempt to improve the learning efficiency. Extensive +quantitative and qualitative experiments demonstrate the superiority of our +proposed COM3D, achieving state-of-the-art results on the Text2Shape dataset. + +
+
+ comment: Accepted by ICME 2024 oral +
+
+
+
+
+ + ☆ ESP: Extro-Spective Prediction for Long-term Behavior Reasoning in + Emergency Scenarios ICRA 2024 + + +
+ Emergent-scene safety is the key milestone for fully autonomous driving, and +reliable on-time prediction is essential to maintain safety in emergency +scenarios. However, these emergency scenarios are long-tailed and hard to +collect, which restricts the system from getting reliable predictions. In this +paper, we build a new dataset, which aims at the long-term prediction with the +inconspicuous state variation in history for the emergency event, named the +Extro-Spective Prediction (ESP) problem. Based on the proposed dataset, a +flexible feature encoder for ESP is introduced to various prediction methods as +a seamless plug-in, and its consistent performance improvement underscores its +efficacy. Furthermore, a new metric named clamped temporal error (CTE) is +proposed to give a more comprehensive evaluation of prediction performance, +especially in time-sensitive emergency events of subseconds. Interestingly, as +our ESP features can be described in human-readable language naturally, the +application of integrating into ChatGPT also shows huge potential. The +ESP-dataset and all benchmarks are released at +https://dingrui-wang.github.io/ESP-Dataset/. + +
+
+ comment: Accepted by ICRA 2024 as Oral Presentation +
+
+
+
+
+ + ☆ Unmasking Illusions: Understanding Human Perception of Audiovisual + Deepfakes + + +
+ The emergence of contemporary deepfakes has attracted significant attention +in machine learning research, as artificial intelligence (AI) generated +synthetic media increases the incidence of misinterpretation and is difficult +to distinguish from genuine content. Currently, machine learning techniques +have been extensively studied for automatically detecting deepfakes. However, +human perception has been less explored. Malicious deepfakes could ultimately +cause public and social problems. Can we humans correctly perceive the +authenticity of the content of the videos we watch? The answer is obviously +uncertain; therefore, this paper aims to evaluate the human ability to discern +deepfake videos through a subjective study. We present our findings by +comparing human observers to five state-of-the-art audiovisual deepfake +detection models. To this end, we used gamification concepts to provide 110 +participants (55 native English speakers and 55 non-native English speakers) +with a web-based platform where they could access a series of 40 videos (20 real +and 20 fake) to determine their authenticity. Each participant performed the +experiment twice with the same 40 videos in different random orders. The videos +were manually selected from the FakeAVCeleb dataset. We found that all AI models +performed better than humans when evaluated on the same 40 videos. The study +also reveals that while deception is not impossible, humans tend to +overestimate their detection capabilities. Our experimental results may help +benchmark human versus machine performance, advance forensics analysis, and +enable adaptive countermeasures. +
+
+
+
+
+ + ☆ DCNN: Dual Cross-current Neural Networks Realized Using An Interactive + Deep Learning Discriminator for Fine-grained Objects + + +
+ Accurate classification of fine-grained images remains a challenge in +backbones based on convolutional operations or self-attention mechanisms. This +study proposes novel dual-current neural networks (DCNN), which combine the +advantages of convolutional operations and self-attention mechanisms to improve +the accuracy of fine-grained image classification. The main novel design +features for constructing a weakly supervised learning backbone model DCNN +include (a) extracting heterogeneous data, (b) keeping the feature map +resolution unchanged, (c) expanding the receptive field, and (d) fusing global +representations and local features. Experimental results demonstrated that +using DCNN as the backbone network for classifying certain fine-grained +benchmark datasets achieved performance advantage improvements of 13.5--19.5% +and 2.2--12.9%, respectively, compared to other advanced convolution or +attention-based fine-grained backbones. + +
+
+
+
+
+ + ☆ IMU-Aided Event-based Stereo Visual Odometry ICRA + + +
+ Direct methods for event-based visual odometry solve the mapping and camera +pose tracking sub-problems by establishing implicit data association in a way +that the generative model of events is exploited. The main bottlenecks faced by +state-of-the-art work in this field include the high computational complexity +of mapping and the limited accuracy of tracking. In this paper, we improve our +previous direct pipeline \textit{Event-based Stereo Visual Odometry} in terms +of accuracy and efficiency. To speed up the mapping operation, we propose an +efficient strategy of edge-pixel sampling according to the local dynamics of +events. The mapping performance in terms of completeness and local smoothness +is also improved by combining the temporal stereo results and the static stereo +results. To circumvent the degeneracy issue of camera pose tracking in +recovering the yaw component of general 6-DoF motion, we introduce as a prior +the gyroscope measurements via pre-integration. Experiments on publicly +available datasets justify our improvement. We release our pipeline as an +open-source software for future research in this field. + +
+
+ comment: 10 pages, 7 figures, ICRA +
+
+
+
+
+ + ☆ DMOFC: Discrimination Metric-Optimized Feature Compression + + +
+ Feature compression, as an important branch of video coding for machines +(VCM), has attracted significant attention and exploration. However, the +existing methods mainly focus on intra-feature similarity, such as the Mean +Squared Error (MSE) between the reconstructed and original features, while +neglecting the importance of inter-feature relationships. In this paper, we +analyze the inter-feature relationships, focusing on feature discriminability +in machine vision and underscoring its significance in feature compression. To +maintain the feature discriminability of reconstructed features, we introduce a +discrimination metric for feature compression. The discrimination metric is +designed to ensure that the distance between features of the same category is +smaller than the distance between features of different categories. +Furthermore, we explore the relationship between the discrimination metric and +the discriminability of the original features. Experimental results confirm the +effectiveness of the proposed discrimination metric and reveal there exists a +trade-off between the discrimination metric and the discriminability of the +original features. + +
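+ The paper's exact discrimination metric is not spelled out in this abstract; one common way to encode the stated requirement (same-category feature distances smaller than cross-category distances) is a margin penalty like the assumed sketch below. The function name, margin value, and use of mean distances are illustrative choices, not the authors' formulation.

```python
import torch

def discrimination_penalty(feats: torch.Tensor, labels: torch.Tensor, margin: float = 0.2):
    # feats: (N, D) reconstructed features, labels: (N,) category ids; assumes
    # at least one same-category and one cross-category pair exist.
    dist = torch.cdist(feats, feats)                        # pairwise L2 distances
    same = labels.unsqueeze(0) == labels.unsqueeze(1)
    eye = torch.eye(len(feats), dtype=torch.bool, device=feats.device)
    intra = dist[same & ~eye]                               # same-category pairs
    inter = dist[~same]                                     # different-category pairs
    # Penalize when intra-category distances are not smaller by at least `margin`.
    return torch.relu(intra.mean() - inter.mean() + margin)
```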
+
+
+
+
+ + ☆ Space-time Reinforcement Network for Video Object Segmentation ICME 2024 + + +
+ Recent video object segmentation (VOS) networks typically use memory-based +methods: for each query frame, the mask is predicted by space-time matching to +memory frames. Despite these methods having superior performance, they suffer +from two issues: 1) Challenging data can destroy the space-time coherence +between adjacent video frames. 2) Pixel-level matching will lead to undesired +mismatching caused by noise or distractors. To address the aforementioned +issues, we first propose to generate an auxiliary frame between adjacent +frames, serving as an implicit short-temporal reference for the query one. +Next, we learn a prototype for each video object so that prototype-level matching +can be implemented between the query and memory. The experiment demonstrated +that our network outperforms the state-of-the-art method on DAVIS 2017, +achieving a J&F score of 86.4%, and attains a competitive result of 85.0% on +YouTube-VOS 2018. In addition, our network exhibits a high inference speed of +32+ FPS. +
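+ Prototype-level matching as mentioned above can be approximated by a mask-weighted average feature per object followed by cosine scoring of query pixels; the sketch below is a generic version of that idea with all shapes and the function name assumed, not the paper's implementation.

```python
import torch
import torch.nn.functional as F

def prototype_match(query_feats, memory_feats, memory_masks):
    # query_feats: (C, H, W); memory_feats: (T, C, H, W); memory_masks: (T, K, H, W)
    # soft masks for K objects across T memory frames (assumed shapes).
    protos = torch.einsum("tchw,tkhw->kc", memory_feats, memory_masks)
    protos = protos / memory_masks.sum(dim=(0, 2, 3)).clamp(min=1e-6).unsqueeze(-1)
    q = F.normalize(query_feats, dim=0)        # normalize along channels
    p = F.normalize(protos, dim=1)
    return torch.einsum("kc,chw->khw", p, q)   # per-object similarity maps
```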
+
+ comment: Accepted by ICME 2024. 6 pages, 10 figures +
+
+
+
+
+ + ☆ Feature Map Convergence Evaluation for Functional Module + + +
+ Autonomous driving perception models are typically composed of multiple +functional modules that interact through complex relationships to accomplish +environment understanding. However, perception models are predominantly +optimized as a black box through end-to-end training, lacking independent +evaluation of functional modules, which poses difficulties for interpretability +and optimization. As a pioneering effort on this issue, we propose an evaluation method +based on feature map analysis to gauge the convergence of the model, thereby +assessing functional modules' training maturity. We construct a quantitative +metric named the Feature Map Convergence Score (FMCS) and develop the Feature +Map Convergence Evaluation Network (FMCE-Net) to measure and predict the +convergence degree of models, respectively. FMCE-Net achieves remarkable +predictive accuracy for FMCS across multiple image classification experiments, +validating the efficacy and robustness of the introduced approach. To the best +of our knowledge, this is the first independent evaluation method for +functional modules, offering a new paradigm for the training assessment of +perception models. +
+
+
+
+
+ + ☆ Lumbar Spine Tumor Segmentation and Localization in T2 MRI Images Using + AI + + +
+ In medical imaging, segmentation and localization of spinal tumors in +three-dimensional (3D) space pose significant computational challenges, +primarily stemming from limited data availability. In response, this study +introduces a novel data augmentation technique, aimed at automating spine tumor +segmentation and localization through AI approaches. Leveraging a fusion of +fuzzy c-means clustering and Random Forest algorithms, the proposed method +achieves successful spine tumor segmentation based on predefined masks +initially delineated by domain experts in medical imaging. Subsequently, a +Convolutional Neural Network (CNN) architecture is employed for tumor +classification. Moreover, 3D vertebral segmentation and labeling techniques are +used to help pinpoint the exact location of the tumors in the lumbar spine. +Results indicate remarkable performance, with 99% accuracy for tumor +segmentation, 98% accuracy for tumor classification, and 99% accuracy for tumor +localization achieved with the proposed approach. These metrics surpass the +efficacy of existing state-of-the-art techniques, as evidenced by superior Dice +Score, Class Accuracy, and Intersection over Union (IoU) metrics. This innovative +methodology holds promise for enhancing the diagnostic +capabilities in detecting and characterizing spinal tumors, thereby +facilitating more effective clinical decision-making. +
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ☆ Structured Click Control in Transformer-based Interactive Segmentation NeurIPS 2024 + + +
+ Click-point-based interactive segmentation has received widespread attention +due to its efficiency. However, it's hard for existing algorithms to obtain +precise and robust responses after multiple clicks. In this case, the +segmentation results tend to have little change or are even worse than before. +To improve the robustness of the response, we propose a structured click intent +model based on graph neural networks, which adaptively obtains graph nodes via +the global similarity of user-clicked Transformer tokens. Then the graph nodes +will be aggregated to obtain structured interaction features. Finally, the dual +cross-attention will be used to inject structured interaction features into +vision Transformer features, thereby enhancing the control of clicks over +segmentation results. Extensive experiments demonstrated the proposed algorithm +can serve as a general structure in improving Transformer-based interactive +segmentation performance. The code and data will be released at +https://github.com/hahamyt/scc. +
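+ The node-selection step ("adaptively obtains graph nodes via the global similarity of user-clicked Transformer tokens") could, in spirit, look like the following; this is an assumption-laden sketch (cosine top-k selection, made-up function name), not the code released at the link above.

```python
import torch
import torch.nn.functional as F

def select_graph_nodes(tokens: torch.Tensor, click_index: int, k: int = 16):
    # tokens: (N, C) vision-Transformer tokens; pick the k tokens most similar
    # (cosine) to the clicked token as candidate graph nodes.
    anchor = tokens[click_index].unsqueeze(0)             # (1, C)
    sim = F.cosine_similarity(tokens, anchor, dim=-1)     # (N,)
    node_idx = sim.topk(k).indices
    return tokens[node_idx], node_idx
```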
+
+ comment: 10 pages, 6 figures, submitted to NeurIPS 2024 +
+
+
+
+
+ + ☆ SEED-Data-Edit Technical Report: A Hybrid Dataset for Instructional + Image Editing + + +
+ In this technical report, we introduce SEED-Data-Edit: a unique hybrid +dataset for instruction-guided image editing, which aims to facilitate image +manipulation using open-form language. SEED-Data-Edit is composed of three +distinct types of data: (1) High-quality editing data produced by an automated +pipeline, ensuring a substantial volume of diverse image editing pairs. (2) +Real-world scenario data collected from the internet, which captures the +intricacies of user intentions for promoting the practical application of image +editing in the real world. (3) High-precision multi-turn editing data annotated +by humans, which involves multiple rounds of edits for simulating iterative +editing processes. The combination of these diverse data sources makes +SEED-Data-Edit a comprehensive and versatile dataset for training +language-guided image editing models. We fine-tune a pretrained Multimodal Large +Language Model (MLLM) that unifies comprehension and generation with +SEED-Data-Edit. The instruction-tuned model demonstrates promising results, +indicating the potential and effectiveness of SEED-Data-Edit in advancing the +field of instructional image editing. The datasets are released at +https://huggingface.co/datasets/AILab-CVC/SEED-Data-Edit. +
+
+ comment: Technical Report; Dataset released in + https://huggingface.co/datasets/AILab-CVC/SEED-Data-Edit +
+
+
+
+
+ + ☆ Deep Event-based Object Detection in Autonomous Driving: A Survey + + +
+ Object detection plays a critical role in autonomous driving, where +accurately and efficiently detecting objects in fast-moving scenes is crucial. +Traditional frame-based cameras face challenges in balancing latency and +bandwidth, necessitating the need for innovative solutions. Event cameras have +emerged as promising sensors for autonomous driving due to their low latency, +high dynamic range, and low power consumption. However, effectively utilizing +the asynchronous and sparse event data presents challenges, particularly in +maintaining low latency and lightweight architectures for object detection. +This paper provides an overview of object detection using event data in +autonomous driving, showcasing the competitive benefits of event cameras. + +
+
+
+
+
+ + ☆ Predicting Lung Disease Severity via Image-Based AQI Analysis using Deep + Learning Techniques + + +
+ Air pollution is a significant health concern worldwide, contributing to +various respiratory diseases. Advances in air quality mapping, driven by the +emergence of smart cities and the proliferation of Internet-of-Things sensor +devices, have led to an increase in available data, fueling momentum in air +pollution forecasting. The objective of this study is to devise an integrated +approach for predicting air quality using image data and subsequently assessing +lung disease severity based on the Air Quality Index (AQI). The aim is to implement +an integrated approach by refining existing techniques to improve accuracy in +predicting AQI and lung disease severity. The study aims to forecast additional +atmospheric pollutants like AQI, PM10, O3, CO, SO2, and NO2 in addition to PM2.5 +levels. Additionally, the study aims to compare the proposed approach with +existing methods to show its effectiveness. The approach used in this paper +uses a VGG16 model for feature extraction from images and a neural network for +predicting AQI. For predicting lung disease severity, Support Vector Classifier +(SVC) and K-Nearest Neighbors (KNN) algorithms are utilized. The neural network +model for predicting AQI achieved a training accuracy of 88.54% and a testing +accuracy of 87.44%, measured using the loss function, while the KNN model +used for predicting lung disease severity achieved a training accuracy of 98.4% +and a testing accuracy of 97.5%. In conclusion, the integrated approach presented +in this study forecasts air quality and evaluates lung disease severity, +achieving high testing accuracies of 87.44% for AQI and 97.5% for lung disease +severity using neural network, KNN, and SVC models. The future scope involves +implementing transfer learning and advanced deep learning modules to enhance +prediction capabilities. While the current study focuses on India, the +objective is to expand its scope to encompass global coverage. +
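+ As a toy illustration of the severity-classification stage described above (KNN and SVC fitted on extracted image features), with random placeholder arrays standing in for real pooled VGG16 activations and severity labels:

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

X = np.random.randn(300, 512)              # placeholder pooled image features
y = np.random.randint(0, 3, size=300)      # hypothetical severity classes
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
svc = SVC(kernel="rbf").fit(X, y)
print(knn.score(X, y), svc.score(X, y))    # training-set scores only
```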
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ VMambaCC: A Visual State Space Model for Crowd Counting + + +
+ As a deep learning model, Visual Mamba (VMamba) has a low computational +complexity and a global receptive field, and has been successfully applied to +image classification and detection. To extend its applications, we apply VMamba +to crowd counting and propose a novel VMambaCC (VMamba Crowd Counting) model. +Naturally, VMambaCC inherits the merits of VMamba, namely global modeling for +images and low computational cost. Additionally, we design a Multi-head +High-level Feature (MHF) attention mechanism for VMambaCC. MHF is a new +attention mechanism that leverages high-level semantic features to augment +low-level semantic features, thereby enhancing spatial feature representation +with greater precision. Building upon MHF, we further present a High-level +Semantic Supervised Feature Pyramid Network (HS2PFN) that progressively +integrates and enhances high-level semantic information with low-level semantic +information. Extensive experimental results on five public datasets validate +the efficacy of our approach. For example, our method achieves a mean absolute +error of 51.87 and a mean squared error of 81.3 on the ShanghaiTech Part A +dataset. Our code is coming soon. +
+
+
+
+
+ + ☆ Unified End-to-End V2X Cooperative Autonomous Driving + + +
+ V2X cooperation, through the integration of sensor data from both vehicles +and infrastructure, is considered a pivotal approach to advancing autonomous +driving technology. Current research primarily focuses on enhancing perception +accuracy, often overlooking the systematic improvement of accident prediction +accuracy through end-to-end learning, leading to insufficient attention to the +safety issues of autonomous driving. To address this challenge, this paper +introduces the UniE2EV2X framework, a V2X-integrated end-to-end autonomous +driving system that consolidates key driving modules within a unified network. +The framework employs a deformable attention-based data fusion strategy, +effectively facilitating cooperation between vehicles and infrastructure. The +main advantages include: 1) significantly enhancing agents' perception and +motion prediction capabilities, thereby improving the accuracy of accident +predictions; 2) ensuring high reliability in the data fusion process; 3) +achieving superior end-to-end perception compared to modular approaches. Furthermore, we +implement the UniE2EV2X framework on the challenging DeepAccident, a simulation +dataset designed for V2X cooperative driving. +
+
+
+
+
+ + ☆ Joint Estimation of Identity Verification and Relative Pose for Partial + Fingerprints + + +
+ Currently, portable electronic devices are becoming more and more popular. +For lightweight considerations, their fingerprint recognition modules usually +use limited-size sensors. However, partial fingerprints have few matchable +features, especially when there are differences in finger pressing posture or +image quality, which makes partial fingerprint verification challenging. Most +existing methods regard fingerprint position rectification and identity +verification as independent tasks, ignoring the coupling relationship between +them -- relative pose estimation typically relies on paired features as +anchors, and authentication accuracy tends to improve with more precise pose +alignment. Consequently, in this paper we propose a method that jointly +estimates identity verification and relative pose for partial fingerprints, +aiming to leverage their inherent correlation to improve each other. To achieve +this, we propose a multi-task CNN (Convolutional Neural Network)-Transformer +hybrid network, and design a pre-training task to enhance the feature +extraction capability. Experiments on multiple public datasets (NIST SD14, +FVC2002 DB1A & DB3A, FVC2004 DB1A & DB2A, FVC2006 DB1A) and an in-house dataset +show that our method achieves state-of-the-art performance in both partial +fingerprint verification and relative pose estimation, while being more +efficient than previous methods. + +
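+ A joint verification-and-pose head of the kind described above might be organized roughly like the minimal sketch below; the embedding dimension, the (dx, dy, dtheta) pose parameterization, and the layer sizes are all assumptions rather than details from the paper.

```python
import torch
import torch.nn as nn

class JointHead(nn.Module):
    """Shared pair embedding feeding two heads: match probability and relative pose."""
    def __init__(self, dim: int = 256):
        super().__init__()
        self.verify = nn.Linear(dim * 2, 1)   # same-finger score
        self.pose = nn.Linear(dim * 2, 3)     # (dx, dy, dtheta), hypothetical

    def forward(self, emb_a: torch.Tensor, emb_b: torch.Tensor):
        pair = torch.cat([emb_a, emb_b], dim=-1)
        return torch.sigmoid(self.verify(pair)), self.pose(pair)

head = JointHead()
score, pose = head(torch.randn(2, 256), torch.randn(2, 256))
```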
+
+
+
+
+ + ☆ Simple Drop-in LoRA Conditioning on Attention Layers Will Improve Your + Diffusion Model + + +
+ Current state-of-the-art diffusion models employ U-Net architectures +containing convolutional and (qkv) self-attention layers. The U-Net processes +images while being conditioned on the time embedding input for each sampling +step and the class or caption embedding input corresponding to the desired +conditional generation. Such conditioning involves scale-and-shift operations +to the convolutional layers but does not directly affect the attention layers. +While these standard architectural choices are certainly effective, not +conditioning the attention layers feels arbitrary and potentially suboptimal. +In this work, we show that simply adding LoRA conditioning to the attention +layers without changing or tuning the other parts of the U-Net architecture +improves the image generation quality. For example, a drop-in addition of LoRA +conditioning to EDM diffusion model yields FID scores of 1.91/1.75 for +unconditional and class-conditional CIFAR-10 generation, improving upon the +baseline of 1.97/1.79. + +
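+ The "drop-in" part of the idea (wrapping an attention projection with a trainable low-rank update while freezing the base weight) can be sketched as below; how the LoRA weights are actually conditioned on the time or class embedding in the paper is not shown here, and the class name is made up.

```python
import torch
import torch.nn as nn

class LoRAWrappedLinear(nn.Module):
    """Frozen base projection (e.g., a q/k/v layer) plus a trainable low-rank update."""
    def __init__(self, base: nn.Linear, rank: int = 4, scale: float = 1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)
        self.down = nn.Linear(base.in_features, rank, bias=False)
        self.up = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.up.weight)   # start as a no-op on top of the base layer
        self.scale = scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * self.up(self.down(x))

proj = LoRAWrappedLinear(nn.Linear(320, 320))
y = proj(torch.randn(1, 77, 320))
```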
+
+
+
+
+ + ☆ IPFed: Identity protected federated learning for user authentication + + +
+ With the development of laws and regulations related to privacy preservation, +it has become difficult to collect personal data to perform machine learning. +In this context, federated learning, which is distributed learning without +sharing personal data, has been proposed. In this paper, we focus on federated +learning for user authentication. We show that it is difficult to achieve both +privacy preservation and high accuracy with existing methods. To address these +challenges, we propose IPFed, a privacy-preserving federated learning method +that uses random projection for class embedding. Furthermore, we prove that IPFed +is capable of learning equivalently to the state-of-the-art method. Experiments +on face image datasets show that IPFed can protect the privacy of personal data +while maintaining the accuracy of the state-of-the-art method. + +
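A minimal sketch of the random-projection step described above, assuming each client applies a fixed random projection to its class embedding before sharing it; the matrix shape and the cosine check are illustrative only.

# Minimal sketch (assumption, not the paper's code): share only randomly
# projected class embeddings in federated user authentication.
import numpy as np

rng = np.random.default_rng(0)
d, k = 512, 128                                   # embedding dim, projected dim
P = rng.standard_normal((d, k)) / np.sqrt(k)      # fixed random projection

def protect(class_embedding):
    """Project the class embedding before it leaves the client."""
    return class_embedding @ P

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

# relative similarities are approximately preserved after projection
e1, e2 = rng.standard_normal(d), rng.standard_normal(d)
print(cosine(e1, e2), cosine(protect(e1), protect(e2)))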
+
+
+
+
+ + ☆ Role of Sensing and Computer Vision in 6G Wireless Communications + + +
+ Recently, we are witnessing the remarkable progress and widespread adoption +of sensing technologies in autonomous driving, robotics, and metaverse. +Considering the rapid advancement of computer vision (CV) technology to analyze +the sensing information, we anticipate a proliferation of wireless applications +exploiting the sensing and CV technologies in 6G. In this article, we provide a +holistic overview of the sensing and CV-aided wireless communications (SVWC) +framework for 6G. By analyzing the high-resolution sensing information through +the powerful CV techniques, SVWC can quickly and accurately understand the +wireless environments and then perform the wireless tasks. To demonstrate the +efficacy of SVWC, we design the whole process of SVWC including the sensing +dataset collection, DL model training, and execution of realistic wireless +tasks. From the numerical evaluations on 6G communication scenarios, we show +that SVWC achieves considerable performance gains over the conventional 5G +systems in terms of positioning accuracy, data rate, and access latency. + +
+
+
+
+
+ + ♻ ☆ Amodal Optical Flow + + +
+ Optical flow estimation is very challenging in situations with transparent or +occluded objects. In this work, we address these challenges at the task level +by introducing Amodal Optical Flow, which integrates optical flow with amodal +perception. Instead of only representing the visible regions, we define amodal +optical flow as a multi-layered pixel-level motion field that encompasses both +visible and occluded regions of the scene. To facilitate research on this new +task, we extend the AmodalSynthDrive dataset to include pixel-level labels for +amodal optical flow estimation. We present several strong baselines, along with +the Amodal Flow Quality metric to quantify the performance in an interpretable +manner. Furthermore, we propose the novel AmodalFlowNet as an initial step +toward addressing this task. AmodalFlowNet consists of a transformer-based +cost-volume encoder paired with a recurrent transformer decoder which +facilitates recurrent hierarchical feature propagation and amodal semantic +grounding. We demonstrate the tractability of amodal optical flow in extensive +experiments and show its utility for downstream tasks such as panoptic +tracking. We make the dataset, code, and trained models publicly available at +http://amodal-flow.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ A dataset of over one thousand computed tomography scans of battery + cells + + +
+ Battery technology is increasingly important for global electrification +efforts. However, batteries are highly sensitive to small manufacturing +variations that can induce reliability or safety issues. An important +technology for battery quality control is computed tomography (CT) scanning, +which is widely used for non-destructive 3D inspection across a variety of +clinical and industrial applications. Historically, however, the utility of CT +scanning for high-volume manufacturing has been limited by its low throughput +as well as the difficulty of handling its large file sizes. In this work, we +present a dataset of over one thousand CT scans of as-produced commercially +available batteries. The dataset spans various chemistries (lithium-ion and +sodium-ion) as well as various battery form factors (cylindrical, pouch, and +prismatic). We evaluate seven different battery types in total. The +manufacturing variability and the presence of battery defects can be observed +via this dataset. This dataset may be of interest to scientists and engineers +working on battery technology, computer vision, or both. + +
+
+
+
+
+ + ♻ ☆ MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth + Estimation of Endoscopic Images + + +
+ Photometric constraint is indispensable for self-supervised monocular depth +estimation. It involves warping a source image onto a target view using +estimated depth and pose, and then minimizing the difference between the warped and +target images. However, the endoscopic built-in light causes significant +brightness fluctuations, and thus makes the photometric constraint unreliable. +Previous efforts only mitigate this by relying on extra models to calibrate image +brightness. In this paper, we propose MonoPCC to address the brightness +inconsistency radically by reshaping the photometric constraint into a cycle +form. Instead of only warping the source image, MonoPCC constructs a closed +loop consisting of two opposite forward-backward warping paths: from target to +source and then back to target. Thus, the target image finally receives an +image cycle-warped from itself, which naturally makes the constraint invariant +to brightness changes. Moreover, MonoPCC transplants the source image's +phase-frequency into the intermediate warped image to avoid structure loss, and +also stabilizes the training via an exponential moving average (EMA) strategy +to avoid frequent changes in the forward warping. The comprehensive and +extensive experimental results on four endoscopic datasets demonstrate that our +proposed MonoPCC shows great robustness to the brightness inconsistency, and +exceeds other state-of-the-art methods by reducing the absolute relative error by at +least 7.27%, 9.38%, 9.90% and 3.17%, respectively. + +
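The phase-transplant step mentioned above can be pictured with a short FFT sketch: keep the amplitude spectrum of the intermediate warped image and take the phase from the source image. This is one plausible reading of the abstract, not the authors' code.

# Minimal sketch of the phase-transplant idea (an assumption of how the
# description could be realised): amplitude from the warped image, phase
# (structure) from the source image.
import torch

def transplant_phase(warped, source):
    """warped, source: (B, C, H, W) tensors of the same shape."""
    W = torch.fft.fft2(warped)
    S = torch.fft.fft2(source)
    amp = torch.abs(W)                     # amplitude spectrum of the warped image
    phase = torch.angle(S)                 # phase spectrum of the source image
    mixed = amp * torch.exp(1j * phase)
    return torch.fft.ifft2(mixed).real

warped = torch.rand(1, 3, 64, 64)
source = torch.rand(1, 3, 64, 64)
out = transplant_phase(warped, source)     # same shape, structure follows `source`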
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ CLIP-KD: An Empirical Study of CLIP Model Distillation CVPR-2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) has become a promising +language-supervised visual pre-training framework. This paper aims to distill +small CLIP models supervised by a large teacher CLIP model. We propose several +distillation strategies, including relation, feature, gradient and contrastive +paradigms, to examine the effectiveness of CLIP-Knowledge Distillation (KD). We +show that a simple feature mimicry with Mean Squared Error loss works +surprisingly well. Moreover, interactive contrastive learning across teacher +and student encoders is also effective in performance improvement. We explain +that the success of CLIP-KD can be attributed to maximizing the feature +similarity between teacher and student. The unified method is applied to +distill several student models trained on CC3M+12M. CLIP-KD improves student +CLIP models consistently over zero-shot ImageNet classification and cross-modal +retrieval benchmarks. When using ViT-L/14 pretrained on Laion-400M as the +teacher, CLIP-KD achieves 57.5\% and 55.4\% zero-shot top-1 ImageNet accuracy +over ViT-B/16 and ResNet-50, surpassing the original CLIP without KD by 20.5\% +and 20.1\% margins, respectively. Our code is released on +https://github.com/winycg/CLIP-KD. + +
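The "simple feature mimicry with Mean Squared Error loss" highlighted above can be sketched in a few lines of PyTorch; the projection head aligning student and teacher dimensions is an assumption.

# Minimal sketch of feature-mimicry distillation with an MSE loss; dims and the
# linear projection are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

teacher_dim, student_dim = 768, 512
proj = nn.Linear(student_dim, teacher_dim)        # align student features to teacher space

def feature_mimicry_loss(student_feat, teacher_feat):
    """student_feat: (B, student_dim); teacher_feat: (B, teacher_dim), detached."""
    return F.mse_loss(proj(student_feat), teacher_feat.detach())

s = torch.randn(8, student_dim)
t = torch.randn(8, teacher_dim)
loss = feature_mimicry_loss(s, t)
loss.backward()                                   # gradients reach the student/projection only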
+
+ comment: CVPR-2024 +
+
+
+
+
+ + ♻ ☆ CascadedGaze: Efficiency in Global Context Extraction for Image + Restoration + + +
+ Image restoration tasks traditionally rely on convolutional neural networks. +However, given the local nature of the convolutional operator, they struggle to +capture global information. The promise of attention mechanisms in Transformers +is to circumvent this problem, but it comes at the cost of intensive +computational overhead. Many recent studies in image restoration have focused +on solving the challenge of balancing performance and computational cost via +Transformer variants. In this paper, we present CascadedGaze Network (CGNet), +an encoder-decoder architecture that employs Global Context Extractor (GCE), a +novel and efficient way to capture global information for image restoration. +The GCE module leverages small kernels across convolutional layers to learn +global dependencies, without requiring self-attention. Extensive experimental +results show that our computationally efficient approach performs competitively +to a range of state-of-the-art methods on synthetic image denoising and single +image deblurring tasks, and pushes the performance boundary further on the real +image denoising task. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR), 2024. + 20 pages +
+
+
+
+
+ + ♻ ☆ Learning Noise-Robust Joint Representation for Multimodal Emotion + Recognition under Incomplete Data Scenarios + + +
+ Multimodal emotion recognition (MER) in practical scenarios is significantly +challenged by the presence of missing or incomplete data across different +modalities. To overcome these challenges, researchers have aimed to simulate +incomplete conditions during the training phase to enhance the system's overall +robustness. Traditional methods have often involved discarding data or +substituting data segments with zero vectors to approximate these +incompletenesses. However, such approaches neither accurately represent +real-world conditions nor adequately address the issue of noisy data +availability. For instance, a blurry image cannot be simply replaced with zero +vectors, and still retain information. To tackle this issue and develop a more +precise MER system, we introduce a novel noise-robust MER model that +effectively learns robust multimodal joint representations from noisy data. +This approach includes two pivotal components: firstly, a noise scheduler that +adjusts the type and level of noise in the data to emulate various realistic +incomplete situations. Secondly, a Variational AutoEncoder (VAE)-based module +is employed to reconstruct these robust multimodal joint representations from +the noisy inputs. Notably, the introduction of the noise scheduler enables the +exploration of an entirely new type of incomplete data condition, which is +impossible with existing methods. Extensive experimental evaluations on the +benchmark datasets IEMOCAP and CMU-MOSEI demonstrate the effectiveness of the +noise scheduler and the excellent performance of our proposed model. + +
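A minimal sketch of what a noise scheduler emulating incomplete modalities could look like, assuming additive Gaussian noise and frame dropout as the two noise types; the linear schedule and function names are illustrative, not the paper's design.

# Minimal sketch of a noise scheduler that corrupts modality features with a
# sampled noise type and a schedule-dependent level (all details assumed).
import torch

def noise_scheduler(feat, step, max_steps):
    """feat: (B, T, D) modality features; corruption grows with the schedule step."""
    level = 0.5 * (step / max_steps)                      # linear schedule (assumption)
    kind = torch.randint(0, 2, (1,)).item()
    if kind == 0:                                         # additive Gaussian noise
        return feat + level * torch.randn_like(feat)
    mask = (torch.rand(feat.shape[:2]) > level).float()   # random frame dropout
    return feat * mask.unsqueeze(-1)

x = torch.randn(4, 20, 128)
noisy = noise_scheduler(x, step=500, max_steps=1000)      # fed to the VAE-based module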
+
+
+
+
+ + ♻ ☆ NTIRE 2024 Quality Assessment of AI-Generated Content Challenge + + +
+ This paper reports on the NTIRE 2024 Quality Assessment of AI-Generated +Content Challenge, which will be held in conjunction with the New Trends in +Image Restoration and Enhancement Workshop (NTIRE) at CVPR 2024. This challenge +is to address a major challenge in the field of image and video processing, +namely, Image Quality Assessment (IQA) and Video Quality Assessment (VQA) for +AI-Generated Content (AIGC). The challenge is divided into the image track and +the video track. The image track uses the AIGIQA-20K, which contains 20,000 +AI-Generated Images (AIGIs) generated by 15 popular generative models. The +image track has a total of 318 registered participants. A total of 1,646 +submissions are received in the development phase, and 221 submissions are +received in the test phase. Finally, 16 participating teams submitted their +models and fact sheets. The video track uses the T2VQA-DB, which contains +10,000 AI-Generated Videos (AIGVs) generated by 9 popular Text-to-Video (T2V) +models. A total of 196 participants have registered in the video track. A total +of 991 submissions are received in the development phase, and 185 submissions +are received in the test phase. Finally, 12 participating teams submitted their +models and fact sheets. Some methods have achieved better results than baseline +methods, and the winning methods in both tracks have demonstrated superior +prediction performance on AIGC. + +
+
+
+
+
+ + ♻ ☆ On Good Practices for Task-Specific Distillation of Large Pretrained + Visual Models + + +
+ Large pretrained visual models exhibit remarkable generalization across +diverse recognition tasks. Yet, real-world applications often demand compact +models tailored to specific problems. Variants of knowledge distillation have +been devised for such a purpose, enabling task-specific compact models (the +students) to learn from a generic large pretrained one (the teacher). In this +paper, we show that the excellent robustness and versatility of recent +pretrained models challenge common practices established in the literature, +calling for a new set of optimal guidelines for task-specific distillation. To +address the lack of samples in downstream tasks, we also show that a variant of +Mixup based on stable diffusion complements standard data augmentation. This +strategy eliminates the need for engineered text prompts and improves +distillation of generic models into streamlined specialized networks. + +
+
+
+
+
+ + ♻ ☆ Deep Unlearning: Fast and Efficient Training-free Approach to Class + Forgetting + + +
+ Machine unlearning is a prominent and challenging field, driven by regulatory +demands for user data deletion and heightened privacy awareness. Existing +approaches involve retraining model or multiple finetuning steps for each +deletion request, often constrained by computational limits and restricted data +access. In this work, we introduce a novel class unlearning algorithm designed +to strategically eliminate specific classes from the learned model. Our +algorithm first estimates the Retain and the Forget Spaces using Singular Value +Decomposition on the layerwise activations for a small subset of samples from +the retain and unlearn classes, respectively. We then compute the shared +information between these spaces and remove it from the forget space to isolate +class-discriminatory feature space. Finally, we obtain the unlearned model by +updating the weights to suppress the class discriminatory features from the +activation spaces. We demonstrate our algorithm's efficacy on ImageNet using a +Vision Transformer with only $\sim 1.5\%$ drop in retain accuracy compared to +the original model while maintaining under $1\%$ accuracy on the unlearned +class samples. Further, our algorithm consistently performs well when subject +to Membership Inference Attacks showing $7.8\%$ improvement on average across a +variety of image classification datasets and network architectures, as compared +to other baselines while being $\sim 6 \times$ more computationally efficient. +Our code is available at https://github.com/sangamesh-kodge/class_forgetting. + +
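A rough sketch of the described pipeline: estimate retain/forget spaces with SVD on layer activations, remove the shared directions, and project them out of a layer weight. The energy threshold and the exact weight update are assumptions, not the released code.

# Minimal sketch of SVD-based class unlearning on a single linear layer.
import torch

def top_basis(acts, energy=0.95):
    """acts: (N, D) activations; return directions covering `energy` of the variance."""
    U, S, _ = torch.linalg.svd(acts.T @ acts)
    k = int(torch.searchsorted(torch.cumsum(S, 0) / S.sum(), energy)) + 1
    return U[:, :k]                                        # (D, k)

def unlearn_layer(weight, retain_acts, forget_acts):
    R = top_basis(retain_acts)                             # Retain Space
    Fb = top_basis(forget_acts)                            # Forget Space
    Fb = Fb - R @ (R.T @ Fb)                               # remove information shared with retain
    P = torch.eye(weight.shape[1]) - Fb @ torch.linalg.pinv(Fb)   # project away from forget
    return weight @ P                                      # suppress class-discriminatory directions

W = torch.randn(64, 128)                                   # layer weight (out, in)
retain = torch.randn(200, 128)                             # activations of retained classes
forget = torch.randn(50, 128)                              # activations of the class to unlearn
W_unlearned = unlearn_layer(W, retain, forget)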
+
+
+
+
+ + ♻ ☆ PoseINN: Realtime Visual-based Pose Regression and Localization with + Invertible Neural Networks + + +
+ Estimating ego-pose from cameras is an important problem in robotics with +applications ranging from mobile robotics to augmented reality. While SOTA +models are becoming increasingly accurate, they can still be unwieldy due to +high computational costs. In this paper, we propose to solve the problem by +using invertible neural networks (INN) to find the mapping between the latent +space of images and poses for a given scene. Our model achieves similar +performance to the SOTA while being faster to train and only requiring offline +rendering of low-resolution synthetic data. By using normalizing flows, the +proposed method also provides uncertainty estimation for the output. We also +demonstrated the efficiency of this method by deploying the model on a mobile +robot. + +
+
+
+
+
+ + ♻ ☆ Zero Grads: Learning Local Surrogate Losses for Non-Differentiable + Graphics SIGGRAPH 2024 + + +
+ Gradient-based optimization is now ubiquitous across graphics, but +unfortunately can not be applied to problems with undefined or zero gradients. +To circumvent this issue, the loss function can be manually replaced by a +``surrogate'' that has similar minima but is differentiable. Our proposed +framework, ZeroGrads, automates this process by learning a neural approximation +of the objective function, which in turn can be used to differentiate through +arbitrary black-box graphics pipelines. We train the surrogate on an actively +smoothed version of the objective and encourage locality, focusing the +surrogate's capacity on what matters at the current training episode. The +fitting is performed online, alongside the parameter optimization, and +self-supervised, without pre-computed data or pre-trained models. As sampling +the objective is expensive (it requires a full rendering or simulator run), we +devise an efficient sampling scheme that allows for tractable run-times and +competitive performance at little overhead. We demonstrate optimizing diverse +non-convex, non-differentiable black-box problems in graphics, such as +visibility in rendering, discrete parameter spaces in procedural modelling or +optimal control in physics-driven animation. In contrast to other +derivative-free algorithms, our approach scales well to higher dimensions, +which we demonstrate on problems with up to 35k interlinked variables. + +
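The core loop, fitting a neural surrogate on samples drawn around the current parameters and descending on the surrogate instead of the non-differentiable objective, can be sketched as below; the toy objective, sampling radius, and optimizer settings are assumptions, and the paper's active smoothing is omitted.

# Minimal sketch of a locally fitted neural surrogate for a black-box objective.
import torch
import torch.nn as nn

def f(x):                                            # black-box, non-differentiable objective
    return torch.round(x * 3.0).pow(2).sum(dim=-1)

dim = 8
theta = torch.randn(dim, requires_grad=True)         # parameters being optimised
surrogate = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))
opt_s = torch.optim.Adam(surrogate.parameters(), lr=1e-2)
opt_t = torch.optim.Adam([theta], lr=1e-2)

for step in range(200):
    # fit the surrogate on samples drawn around the current parameters (locality)
    samples = theta.detach() + 0.1 * torch.randn(64, dim)
    targets = f(samples).unsqueeze(-1)
    opt_s.zero_grad()
    nn.functional.mse_loss(surrogate(samples), targets).backward()
    opt_s.step()
    # descend on the surrogate instead of the non-differentiable objective
    opt_t.zero_grad()
    surrogate(theta.unsqueeze(0)).sum().backward()
    opt_t.step()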
+
+ comment: Accepted at SIGGRAPH 2024. Project page: + https://mfischer-ucl.github.io/zerograds +
+
+
+
+
+ + ♻ ☆ Solving the bongard-logo problem by modeling a probabilistic model + + +
+ Abstract reasoning problems challenge the perceptual and cognitive abilities +of AI algorithms, demanding deeper pattern discernment and inductive reasoning +beyond explicit image features. This study introduces PMoC, a tailored +probability model for the Bongard-Logo problem, achieving high reasoning +accuracy by constructing independent probability models. Additionally, we +present Pose-Transformer, an enhanced Transformer-Encoder designed for complex +abstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM. +Pose-Transformer incorporates positional information learning, inspired by +capsule networks' pose matrices, enhancing its focus on local positional +relationships in image data processing. When integrated with PMoC, it further +improves reasoning accuracy. Our approach effectively addresses reasoning +difficulties associated with abstract entities' positional changes, +outperforming previous models on the OIG, D3$\times$3 subsets of RAVEN, and PGM +databases. This research contributes to advancing AI's capabilities in abstract +reasoning and cognitive pattern recognition. + +
+
+ comment: 14 pages, 11 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ A Unified Approach for Text- and Image-guided 4D Scene Generation + + +
+ Large-scale diffusion generative models are greatly simplifying image, video +and 3D asset creation from user-provided text prompts and images. However, the +challenging problem of text-to-4D dynamic 3D scene generation with diffusion +guidance remains largely unexplored. We propose Dream-in-4D, which features a +novel two-stage approach for text-to-4D synthesis, leveraging (1) 3D and 2D +diffusion guidance to effectively learn a high-quality static 3D asset in the +first stage; (2) a deformable neural radiance field that explicitly +disentangles the learned static asset from its deformation, preserving quality +during motion learning; and (3) a multi-resolution feature grid for the +deformation field with a displacement total variation loss to effectively learn +motion with video diffusion guidance in the second stage. Through a user +preference study, we demonstrate that our approach significantly advances image +and motion quality, 3D consistency and text fidelity for text-to-4D generation +compared to baseline approaches. Thanks to its motion-disentangled +representation, Dream-in-4D can also be easily adapted for controllable +generation where appearance is defined by one or multiple images, without the +need to modify the motion learning stage. Thus, our method offers, for the +first time, a unified approach for text-to-4D, image-to-4D and personalized 4D +generation tasks. + +
+
+ comment: Project page: https://research.nvidia.com/labs/nxp/dream-in-4d/ +
+
+
+
+
+ + ♻ ☆ SDDGR: Stable Diffusion-based Deep Generative Replay for Class + Incremental Object Detection CVPR 2024 + + +
+ In the field of class incremental learning (CIL), generative replay has +become increasingly prominent as a method to mitigate the catastrophic +forgetting, alongside the continuous improvements in generative models. +However, its application in class incremental object detection (CIOD) has been +significantly limited, primarily due to the complexities of scenes involving +multiple labels. In this paper, we propose a novel approach called stable +diffusion deep generative replay (SDDGR) for CIOD. Our method utilizes a +diffusion-based generative model with pre-trained text-to-diffusion networks to +generate realistic and diverse synthetic images. SDDGR incorporates an +iterative refinement strategy to produce high-quality images encompassing old +classes. Additionally, we adopt an L2 knowledge distillation technique to +improve the retention of prior knowledge in synthetic images. Furthermore, our +approach includes pseudo-labeling for old objects within new task images, +preventing misclassification as background elements. Extensive experiments on +the COCO 2017 dataset demonstrate that SDDGR significantly outperforms existing +algorithms, achieving a new state-of-the-art in various CIOD scenarios. The +source code will be made available to the public. + +
+
+ comment: Accept to CVPR 2024. The camera-ready version +
+
+
+
+
+ + ♻ ☆ Enhancing Boundary Segmentation for Topological Accuracy with + Skeleton-based Methods + + +
+ Topological consistency plays a crucial role in the task of boundary +segmentation for reticular images, such as cell membrane segmentation in neuron +electron microscopic images, grain boundary segmentation in material +microscopic images and road segmentation in aerial images. In these fields, +topological changes in segmentation results have a serious impact on the +downstream tasks, which can even exceed the misalignment of the boundary +itself. To enhance the topology accuracy in segmentation results, we propose +the Skea-Topo Aware loss, which is a novel loss function that takes into +account the shape of each object and topological significance of the pixels. It +consists of two components. First, a skeleton-aware weighted loss improves the +segmentation accuracy by better modeling the object geometry with skeletons. +Second, a boundary rectified term effectively identifies and emphasizes +topological critical pixels in the prediction errors using both foreground and +background skeletons in the ground truth and predictions. Experiments prove +that our method improves topological consistency by up to 7 points in VI +compared to 13 state-of-art methods, based on objective and subjective +assessments across three different boundary segmentation datasets. The code is +available at https://github.com/clovermini/Skea_topo. + +
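One possible reading of the skeleton-aware weighting is sketched below: weight each ground-truth pixel by its proximity to the object skeleton. The weighting formula is an assumption; the full Skea-Topo loss also includes the boundary rectified term.

# Minimal sketch of a skeleton-aware per-pixel weight map for boundary segmentation.
import numpy as np
from scipy.ndimage import distance_transform_edt
from skimage.morphology import skeletonize

def skeleton_weights(mask, alpha=2.0):
    """mask: binary (H, W) ground-truth object mask."""
    skel = skeletonize(mask.astype(bool))
    dist = distance_transform_edt(~skel)              # distance of each pixel to the skeleton
    dist = dist / (dist.max() + 1e-8)
    return 1.0 + alpha * (1.0 - dist) * mask          # emphasise pixels near the skeleton

mask = np.zeros((64, 64), dtype=np.uint8)
mask[20:40, 10:50] = 1
w = skeleton_weights(mask)                            # usable as per-pixel weights in a BCE loss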
+
+
+
+
+ + ♻ ☆ Motion State: A New Benchmark Multiple Object Tracking + + +
+ In the realm of video analysis, the field of multiple object tracking (MOT) +assumes paramount importance, with the motion state of objects-whether static +or dynamic relative to the ground-holding practical significance across diverse +scenarios. However, the extant literature exhibits a notable dearth in the +exploration of this aspect. Deep learning methodologies encounter challenges in +accurately discerning object motion states, while conventional approaches +reliant on comprehensive mathematical modeling may yield suboptimal tracking +accuracy. To address these challenges, we introduce a Model-Data-Driven Motion +State Judgment Object Tracking Method (MoD2T). This innovative architecture +adeptly amalgamates traditional mathematical modeling with deep learning-based +multi-object tracking frameworks. The integration of mathematical modeling and +deep learning within MoD2T enhances the precision of object motion state +determination, thereby elevating tracking accuracy. Our empirical +investigations comprehensively validate the efficacy of MoD2T across varied +scenarios, encompassing unmanned aerial vehicle surveillance and street-level +tracking. Furthermore, to gauge the method's adeptness in discerning object +motion states, we introduce the Motion State Validation F1 (MVF1) metric. This +novel performance metric aims to quantitatively assess the accuracy of motion +state classification, furnishing a comprehensive evaluation of MoD2T's +performance. Elaborate experimental validations corroborate the rationality of +MVF1. In order to holistically appraise MoD2T's performance, we meticulously +annotate several renowned datasets and subject MoD2T to stringent testing. +Remarkably, under conditions characterized by minimal or moderate camera +motion, the achieved MVF1 values are particularly noteworthy, with exemplars +including 0.774 for the KITTI dataset, 0.521 for MOT17, and 0.827 for UAVDT. + +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Chest X-ray Lung Segmentation Using U-net and + Modified Convolutional Block Attention Module + + +
+ Lung segmentation in chest X-ray images is of paramount importance as it +plays a crucial role in the diagnosis and treatment of various lung diseases. +This paper presents a novel approach for lung segmentation in chest X-ray +images by integrating U-net with attention mechanisms. The proposed method +enhances the U-net architecture by incorporating a Convolutional Block +Attention Module (CBAM), which unifies three distinct attention mechanisms: +channel attention, spatial attention, and pixel attention. The channel +attention mechanism enables the model to concentrate on the most informative +features across various channels. The spatial attention mechanism enhances the +model's precision in localization by focusing on significant spatial locations. +Lastly, the pixel attention mechanism empowers the model to focus on individual +pixels, further refining the model's focus and thereby improving the accuracy +of segmentation. The adoption of the proposed CBAM in conjunction with the +U-net architecture marks a significant advancement in the field of medical +imaging, with potential implications for improving diagnostic precision and +patient outcomes. The efficacy of this method is validated against contemporary +state-of-the-art techniques, showcasing its superiority in segmentation +performance. + +
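A minimal sketch of a CBAM-style block with channel, spatial, and per-pixel gates follows; since the abstract does not specify the modification, the pixel-attention branch and all hyperparameters are assumptions.

# Minimal sketch of a modified CBAM block (channel -> spatial -> pixel attention).
import torch
import torch.nn as nn

class ModifiedCBAM(nn.Module):
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.channel_mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels))
        self.spatial = nn.Conv2d(2, 1, kernel_size=7, padding=3)
        self.pixel = nn.Conv2d(channels, channels, kernel_size=1)   # per-pixel gate (assumption)

    def forward(self, x):
        b, c, _, _ = x.shape
        avg = x.mean(dim=(2, 3))
        mx = x.amax(dim=(2, 3))
        ca = torch.sigmoid(self.channel_mlp(avg) + self.channel_mlp(mx)).view(b, c, 1, 1)
        x = x * ca                                                   # channel attention
        sa = torch.sigmoid(self.spatial(torch.cat(
            [x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)))
        x = x * sa                                                   # spatial attention
        return x * torch.sigmoid(self.pixel(x))                      # pixel attention

feat = torch.randn(2, 32, 64, 64)
out = ModifiedCBAM(32)(feat)                                         # same shape as input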
+
+
+
+
+ + ♻ ☆ Paint-it: Text-to-Texture Synthesis via Deep Convolutional Texture Map + Optimization and Physically-Based Rendering CVPR 2024 + + +
+ We present Paint-it, a text-driven high-fidelity texture map synthesis method +for 3D meshes via neural re-parameterized texture optimization. Paint-it +synthesizes texture maps from a text description by +synthesis-through-optimization, exploiting the Score-Distillation Sampling +(SDS). We observe that directly applying SDS yields undesirable texture quality +due to its noisy gradients. We reveal the importance of texture +parameterization when using SDS. Specifically, we propose Deep Convolutional +Physically-Based Rendering (DC-PBR) parameterization, which re-parameterizes +the physically-based rendering (PBR) texture maps with randomly initialized +convolution-based neural kernels, instead of a standard pixel-based +parameterization. We show that DC-PBR inherently schedules the optimization +curriculum according to texture frequency and naturally filters out the noisy +signals from SDS. In experiments, Paint-it obtains remarkable quality PBR +texture maps within 15 min., given only a text description. We demonstrate the +generalizability and practicality of Paint-it by synthesizing high-quality +texture maps for large-scale mesh datasets and showing test-time applications +such as relighting and material control using a popular graphics engine. +Project page: https://kim-youwang.github.io/paint-it + +
+
+ comment: CVPR 2024. Project page: https://kim-youwang.github.io/paint-it +
+
+
+
+
+ + ♻ ☆ Zero-Shot Stitching in Reinforcement Learning using Relative + Representations + + +
+ Visual Reinforcement Learning is a popular and powerful framework that takes +full advantage of the Deep Learning breakthrough. However, it is also known +that variations in the input (e.g., different colors of the panorama due to the +season of the year) or the task (e.g., changing the speed limit for a car to +respect) could require complete retraining of the agents. In this work, we +leverage recent developments in unifying latent representations to demonstrate +that it is possible to combine the components of an agent, rather than retrain +it from scratch. We build upon the recent relative representations framework +and adapt it for Visual RL. This allows us to create completely new agents +capable of handling environment-task combinations never seen during training. +Our work paves the road toward a more accessible and flexible use of +reinforcement learning. + +
+
+ comment: 13 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Dynamic Event-based Optical Identification and Communication + + +
+ Optical identification is often done with spatial or temporal visual pattern +recognition and localization. Temporal pattern recognition, depending on the +technology, involves a trade-off between communication frequency, range and +accurate tracking. We propose a solution with light-emitting beacons that +improves this trade-off by exploiting fast event-based cameras and, for +tracking, sparse neuromorphic optical flow computed with spiking neurons. The +system is embedded in a simulated drone and evaluated in an asset monitoring +use case. It is robust to relative movements and enables simultaneous +communication with, and tracking of, multiple moving beacons. Finally, in a +hardware lab prototype, we demonstrate for the first time beacon tracking +performed simultaneously with state-of-the-art frequency communication in the +kHz range. + +
+
+
+
+
+ + ♻ ☆ Monkeypox disease recognition model based on improved SE-InceptionV3 + + +
+ In the wake of the global spread of monkeypox, accurate disease recognition +has become crucial. This study introduces an improved SE-InceptionV3 model, +embedding the SENet module and incorporating L2 regularization into the +InceptionV3 framework to enhance monkeypox disease detection. Utilizing the +Kaggle monkeypox dataset, which includes images of monkeypox and similar skin +conditions, our model demonstrates a noteworthy accuracy of 96.71% on the test +set, outperforming conventional methods and deep learning models. The SENet +module's channel attention mechanism significantly elevates feature +representation, while L2 regularization ensures robust generalization. +Extensive experiments validate the model's superiority in precision, recall, and +F1 score, highlighting its effectiveness in differentiating monkeypox lesions +in diverse and complex cases. The study not only provides insights into the +application of advanced CNN architectures in medical diagnostics but also opens +avenues for further research in model optimization and hyperparameter tuning +for enhanced disease recognition. https://github.com/jzc777/SE-inceptionV3-L2 + +
+
+
+
+
+ + ♻ ☆ Human Image Generation: A Comprehensive Survey + + +
+ Image and video synthesis has become a blooming topic in computer vision and +machine learning communities along with the developments of deep generative +models, due to its great academic and application value. Many researchers have +been devoted to synthesizing high-fidelity human images as one of the most +commonly seen object categories in daily lives, where a large number of studies +are performed based on various models, task settings and applications. Thus, it +is necessary to give a comprehensive overview on these variant methods on human +image generation. In this paper, we divide human image generation techniques +into three paradigms, i.e., data-driven methods, knowledge-guided methods and +hybrid methods. For each paradigm, the most representative models and the +corresponding variants are presented, where the advantages and characteristics +of different methods are summarized in terms of model architectures. Besides, +the main public human image datasets and evaluation metrics in the literature +are summarized. Furthermore, due to the wide application potentials, the +typical downstream usages of synthesized human images are covered. Finally, the +challenges and potential opportunities of human image generation are discussed +to shed light on future research. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor CVPR 2024 + + +
+ Existing open-vocabulary image segmentation methods require a fine-tuning +step on mask labels and/or image-text datasets. Mask labels are +labor-intensive, which limits the number of categories in segmentation +datasets. Consequently, the vocabulary capacity of pre-trained VLMs is severely +reduced after fine-tuning. However, without fine-tuning, VLMs trained under +weak image-text supervision tend to make suboptimal mask predictions. To +alleviate these issues, we introduce a novel recurrent framework that +progressively filters out irrelevant texts and enhances mask quality without +training efforts. The recurrent unit is a two-stage segmenter built upon a +frozen VLM. Thus, our model retains the VLM's broad vocabulary space and equips +it with segmentation ability. Experiments show that our method outperforms not +only the training-free counterparts, but also those fine-tuned with millions of +data samples, and sets the new state-of-the-art records for both zero-shot +semantic and referring segmentation. Concretely, we improve the current record +by 28.8, 16.0, and 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context. + +
+
+ comment: To appear in CVPR 2024. Project page: + https://torrvision.com/clip_as_rnn/ +
+
+
+
+
+ + ♻ ☆ An Attention Based Pipeline for Identifying Pre-Cancer Lesions in Head + and Neck Clinical Images + + +
+ Early detection of cancer can help improve patient prognosis by early +intervention. Head and neck cancer is diagnosed in specialist centres after a +surgical biopsy, however, there is a potential for these to be missed leading +to delayed diagnosis. To overcome these challenges, we present an attention +based pipeline that identifies suspected lesions, segments, and classifies them +as non-dysplastic, dysplastic and cancerous lesions. We propose (a) a vision +transformer based Mask R-CNN network for lesion detection and segmentation of +clinical images, and (b) Multiple Instance Learning (MIL) based scheme for +classification. Current results show that the segmentation model produces +segmentation masks and bounding boxes with up to 82% overlap accuracy score on +unseen external test data and surpassing reviewed segmentation benchmarks. +Next, a classification F1-score of 85% on the internal cohort test set. An app +has been developed to perform lesion segmentation taken via a smart device. +Future work involves employing endoscopic video data for precise early +detection and prognosis. + +
+
+ comment: 5 pages, 3 figures, accepted in ISBI 2024, update: corrected typos +
+
+
+
+
+ + ♻ ☆ Unified Dynamic Scanpath Predictors Outperform Individually Trained + Neural Models + + +
+ Previous research on scanpath prediction has mainly focused on group models, +disregarding the fact that the scanpaths and attentional behaviors of +individuals are diverse. The disregard of these differences is especially +detrimental to social human-robot interaction, whereby robots commonly emulate +human gaze based on heuristics or predefined patterns. However, human gaze +patterns are heterogeneous and varying behaviors can significantly affect the +outcomes of such human-robot interactions. To fill this gap, we developed a +deep learning-based social cue integration model for saliency prediction to +instead predict scanpaths in videos. Our model learned scanpaths by recursively +integrating fixation history and social cues through a gating mechanism and +sequential attention. We evaluated our approach on gaze datasets of dynamic +social scenes, observed under the free-viewing condition. The introduction of +fixation history into our models makes it possible to train a single unified +model rather than the resource-intensive approach of training individual models +for each set of scanpaths. We observed that the late neural integration +approach surpasses early fusion when training models on a large dataset, in +comparison to a smaller dataset with a similar distribution. Results also +indicate that a single unified model, trained on all the observers' scanpaths, +performs on par or better than individually trained models. We hypothesize that +this outcome is a result of the group saliency representations instilling +universal attention in the model, while the supervisory signal and fixation +history guide it to learn personalized attentional behaviors, providing the +unified model a benefit over individual models due to its implicit +representation of universal attention. + +
+
+
+
+
+ + ♻ ☆ TwinDiffusion: Enhancing Coherence and Efficiency in Panoramic Image + Generation with Diffusion Models + + +
+ Diffusion models have emerged as effective tools for generating diverse and +high-quality content. However, their capability in high-resolution image +generation, particularly for panoramic images, still faces challenges such as +visible seams and incoherent transitions. In this paper, we propose +TwinDiffusion, an optimized framework designed to address these challenges +through two key innovations: Crop Fusion for quality enhancement and Cross +Sampling for efficiency optimization. We introduce a training-free optimizing +stage to refine the similarity of the adjacent image areas, as well as an +interleaving sampling strategy to yield dynamic patches during the cropping +process. A comprehensive evaluation is conducted to compare TwinDiffusion with +the existing methods, considering factors including coherence, fidelity, +compatibility, and efficiency. The results demonstrate the superior performance +of our approach in generating seamless and coherent panoramas, setting a new +standard in quality and efficiency for panoramic image generation. + +
+
+
+
+
+ + ♻ ☆ Explainable Classification Techniques for Quantum Dot Device + Measurements + + +
+ In the physical sciences, there is an increased need for robust feature +representations of image data: image acquisition, in the generalized sense of +two-dimensional data, is now widespread across a large number of fields, +including quantum information science, which we consider here. While +traditional image features are widely utilized in such cases, their use is +rapidly being supplanted by Neural Network-based techniques that often +sacrifice explainability in exchange for high accuracy. To ameliorate this +trade-off, we propose a synthetic data-based technique that results in +explainable features. We show, using Explainable Boosting Machines (EBMs), that +this method offers superior explainability without sacrificing accuracy. +Specifically, we show that there is a meaningful benefit to this technique in +the context of quantum dot tuning, where human intervention is necessary at the +current stage of development. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Towards Generalizing to Unseen Domains with Few Labels CVPR 2024 + + +
+ We approach the challenge of addressing semi-supervised domain generalization +(SSDG). Specifically, our aim is to obtain a model that learns +domain-generalizable features by leveraging a limited subset of labelled data +alongside a substantially larger pool of unlabeled data. Existing domain +generalization (DG) methods which are unable to exploit unlabeled data perform +poorly compared to semi-supervised learning (SSL) methods under SSDG setting. +Nevertheless, SSL methods have considerable room for performance improvement +when compared to fully-supervised DG training. To tackle this underexplored, +yet highly practical problem of SSDG, we make the following core contributions. +First, we propose a feature-based conformity technique that matches the +posterior distributions from the feature space with the pseudo-label from the +model's output space. Second, we develop a semantics alignment loss to learn +semantically-compatible representations by regularizing the semantic structure +in the feature space. Our method is plug-and-play and can be readily integrated +with different SSL-based SSDG baselines without introducing any additional +parameters. Extensive experimental results across five challenging DG +benchmarks with four strong SSL baselines suggest that our method provides +consistent and notable gains in two different SSDG settings. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ReFACT: Updating Text-to-Image Models by Editing the Text Encoder NAACL 2024 + + +
+ Our world is marked by unprecedented technological, global, and +socio-political transformations, posing a significant challenge to +text-to-image generative models. These models encode factual associations +within their parameters that can quickly become outdated, diminishing their +utility for end-users. To that end, we introduce ReFACT, a novel approach for +editing factual associations in text-to-image models without relying on +explicit input from end-users or costly re-training. ReFACT updates the weights +of a specific layer in the text encoder, modifying only a tiny portion of the +model's parameters and leaving the rest of the model unaffected. We empirically +evaluate ReFACT on an existing benchmark, alongside a newly curated dataset. +Compared to other methods, ReFACT achieves superior performance in both +generalization to related concepts and preservation of unrelated concepts. +Furthermore, ReFACT maintains image generation quality, making it a practical +tool for updating and correcting factual information in text-to-image models. + +
+
+ comment: Accepted to NAACL 2024 (Main Conference) +
+
+
+
+
+ + ♻ ☆ Yuille-Poggio's Flow and Global Minimizer of Polynomials through + Convexification by Heat Evolution + + +
+ This study examines the convexification version of the backward differential +flow algorithm for the global minimization of polynomials, introduced by O. +Arikan \textit{et al} in \cite{ABK}. It investigates why this approach might +fail with high-degree polynomials yet succeeds with quartic polynomials. We +employ the heat evolution method for convexification combined with Gaussian +filtering, which acts as a cumulative form of Steklov's regularization. In this +context, we apply the fingerprint theory from computer vision. Originally +developed by A.L. Yuille and T. Poggio in the 1980s for computer vision, the +fingerprint theory, particularly the fingerprint trajectory equation, is used +to illustrate the scaling (temporal) evolution of minimizers. In the case of +general polynomials, our research has led to the creation of the Yuille-Poggio +flow and a broader interpretation of the fingerprint concepts; in particular, we +establish a condition that is both sufficient and necessary for the convexified +backward differential flow algorithms to successfully achieve global +minimization. For quartic polynomials, our analysis not only reflects the +results of O. Arikan et al. \cite{ABK} but also presents a significantly +simpler version of Newton's method that can always globally minimize quartic +polynomials without convexification. + +
+
+
+
+
+ + ♻ ☆ The Impact of Background Removal on Performance of Neural Networks for + Fashion Image Classification and Segmentation + + +
+ Fashion understanding is a hot topic in computer vision, with many +applications having great business value in the market. Fashion understanding +remains a difficult challenge for computer vision due to the immense diversity +of garments and various scenes and backgrounds. In this work, we try removing +the background from fashion images to boost data quality and increase model +performance. Having fashion images of evident persons in fully visible +garments, we can utilize Salient Object Detection to achieve the background +removal of fashion data to our expectations. A fashion image with the +background removed is claimed as the "rembg" image, contrasting with the +original one in the fashion dataset. We conducted extensive comparative +experiments with these two types of images on multiple aspects of model +training, including model architectures, model initialization, compatibility +with other training tricks and data augmentations, and target task types. Our +experiments show that background removal can effectively work for fashion data +in simple and shallow networks that are not susceptible to overfitting. It can +improve model accuracy by up to 5% in the classification on the FashionStyle14 +dataset when training models from scratch. However, background removal does not +perform well in deep neural networks due to incompatibility with other +regularization techniques like batch normalization, pre-trained initialization, +and data augmentations introducing randomness. The loss of background pixels +invalidates many existing training tricks in the model training, adding the +risk of overfitting for deep models. + +
+
+ comment: 9 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to + Inform GenAI Copyright Disputes + + +
+ The advent of Generative Artificial Intelligence (GenAI) models, including +GitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content +creation, enabling non-professionals to produce high-quality content across +various domains. This transformative technology has led to a surge of synthetic +content and sparked legal disputes over copyright infringement. To address +these challenges, this paper introduces a novel approach that leverages the +learning capacity of GenAI models for copyright legal analysis, demonstrated +with GPT2 and Stable Diffusion models. Copyright law distinguishes between +original expressions and generic ones (Sc\`enes \`a faire), protecting the +former and permitting reproduction of the latter. However, this distinction has +historically been challenging to make consistently, leading to over-protection +of copyrighted works. GenAI offers an unprecedented opportunity to enhance this +legal analysis by revealing shared patterns in preexisting works. We propose a +data-driven approach to identify the genericity of works created by GenAI, +employing "data-driven bias" to assess the genericity of expressive +compositions. This approach aids in copyright scope determination by utilizing +the capabilities of GenAI to identify and prioritize expressive elements and +rank them according to their frequency in the model's dataset. The potential +implications of measuring expressive genericity for copyright law are profound. +Such scoring could assist courts in determining copyright scope during +litigation, inform the registration practices of Copyright Offices, allowing +registration of only highly original synthetic works, and help copyright owners +signal the value of their works and facilitate fairer licensing deals. More +generally, this approach offers valuable insights to policymakers grappling +with adapting copyright law to the challenges posed by the era of GenAI. + +
+
+ comment: Presented at ACM CSLAW 2024 +
+
+
+
+
+ + ♻ ☆ PINQI: An End-to-End Physics-Informed Approach to Learned Quantitative + MRI Reconstruction + + +
+ Quantitative Magnetic Resonance Imaging (qMRI) enables the reproducible +measurement of biophysical parameters in tissue. The challenge lies in solving +a nonlinear, ill-posed inverse problem to obtain the desired tissue parameter +maps from acquired raw data. While various learned and non-learned approaches +have been proposed, the existing learned methods fail to fully exploit the +prior knowledge about the underlying MR physics, i.e. the signal model and the +acquisition model. In this paper, we propose PINQI, a novel qMRI reconstruction +method that integrates the knowledge about the signal, acquisition model, and +learned regularization into a single end-to-end trainable neural network. Our +approach is based on unrolled alternating optimization, utilizing +differentiable optimization blocks to solve inner linear and non-linear +optimization tasks, as well as convolutional layers for regularization of the +intermediate qualitative images and parameter maps. This design enables PINQI +to leverage the advantages of both the signal model and learned regularization. +We evaluate the performance of our proposed network by comparing it with +recently published approaches in the context of highly undersampled +$T_1$-mapping, using both a simulated brain dataset, as well as real scanner +data acquired from a physical phantom and in-vivo data from healthy volunteers. +The results demonstrate the superiority of our proposed solution over existing +methods and highlight the effectiveness of our method in real-world scenarios. + +
+
+ comment: This work has been accepted for publication in IEEE Transactions on + Computational Imaging. Changes were made to this version by the publisher + before publication. IEEE Transactions on Computational Imaging (2024) +
+
+
+
+
+ + ♻ ☆ Domain-Specific Block Selection and Paired-View Pseudo-Labeling for + Online Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test +domain without access to source data after deployment. Existing approaches +typically rely on self-training with pseudo-labels since ground-truth cannot be +obtained from test data. Although the quality of pseudo labels is important for +stable and accurate long-term adaptation, it has not been previously addressed. +In this work, we propose DPLOT, a simple yet effective TTA framework that +consists of two components: (1) domain-specific block selection and (2) +pseudo-label generation using paired-view images. Specifically, we select +blocks that involve domain-specific feature extraction and train these blocks +by entropy minimization. After blocks are adjusted for current test domain, we +generate pseudo-labels by averaging given test images and corresponding flipped +counterparts. By simply using flip augmentation, we prevent a decrease in the +quality of the pseudo-labels, which can be caused by the domain gap resulting +from strong augmentation. Our experimental results demonstrate that DPLOT +outperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C +benchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. Also, +we provide an extensive analysis to demonstrate effectiveness of our framework. +Code is available at +https://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA. + +
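The two ingredients named in the abstract, entropy minimisation and pseudo-labels averaged over a flipped view, can be sketched as follows; the toy model and the way the loss terms are combined are assumptions for illustration.

# Minimal sketch of entropy minimisation plus paired-view (flip) pseudo-labelling.
import torch
import torch.nn.functional as F

def entropy_loss(logits):
    p = logits.softmax(dim=1)
    return -(p * p.log()).sum(dim=1).mean()

def paired_view_pseudo_labels(model, images):
    """Average predictions over an image and its horizontal flip."""
    with torch.no_grad():
        p = model(images).softmax(dim=1)
        p_flip = model(torch.flip(images, dims=[3])).softmax(dim=1)
        return ((p + p_flip) / 2).argmax(dim=1)

# toy usage; in practice the optimizer would hold only the selected blocks' parameters
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
batch = torch.rand(8, 3, 32, 32)
pseudo = paired_view_pseudo_labels(model, batch)
loss = entropy_loss(model(batch)) + F.cross_entropy(model(batch), pseudo)
loss.backward()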
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Language Models as Black-Box Optimizers for Vision-Language Models CVPR 2024 + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities on downstream tasks when fine-tuned with +minimal data. However, many VLMs rely on proprietary data and are not +open-source, which restricts the use of white-box approaches for fine-tuning. +As such, we aim to develop a black-box approach to optimize VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or even output logits. We propose employing chat-based LLMs +to search for the best text prompt for VLMs. Specifically, we adopt an +automatic hill-climbing procedure that converges to an effective prompt by +evaluating the performance of current prompts and asking LLMs to refine them +based on textual feedback, all within a conversational process without +human-in-the-loop. In a challenging 1-shot image classification setup, our +simple approach surpasses the white-box continuous prompting method (CoOp) by +an average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms both human-engineered and LLM-generated prompts. We highlight the +advantage of conversational feedback that incorporates both positive and +negative prompts, suggesting that LLMs can utilize the implicit gradient +direction in textual feedback for a more efficient search. In addition, we find +that the text prompts generated through our strategy are not only more +interpretable but also transfer well across different VLM architectures in a +black-box manner. Lastly, we apply our framework to optimize the +state-of-the-art black-box VLM (DALL-E 3) for text-to-image generation, prompt +inversion, and personalization. + +
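A minimal sketch of the conversational hill-climbing loop described above; ask_llm and evaluate_prompt are hypothetical placeholders for the LLM call and the 1-shot VLM evaluation, not real APIs.

# Minimal sketch of hill-climbing over text prompts with LLM-generated refinements.
import random

def ask_llm(history):
    """Placeholder: return a refined prompt given (prompt, score) feedback."""
    best, _ = max(history, key=lambda h: h[1])
    return best + random.choice([", a photo", ", high quality", ", close-up"])

def evaluate_prompt(prompt):
    """Placeholder: score the prompt, e.g. 1-shot accuracy of the black-box VLM."""
    return len(set(prompt.split())) / 10.0

def hill_climb(seed_prompt, iterations=10):
    history = [(seed_prompt, evaluate_prompt(seed_prompt))]
    for _ in range(iterations):
        candidate = ask_llm(history)               # LLM refines based on textual feedback
        history.append((candidate, evaluate_prompt(candidate)))
    return max(history, key=lambda h: h[1])

print(hill_climb("a photo of a {class}"))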
+
+ comment: Published at CVPR 2024. Project site: + https://llm-can-optimize-vlm.github.io/ +
+
+
+
+
+ + ♻ ☆ Removal and Selection: Improving RGB-Infrared Object Detection via + Coarse-to-Fine Fusion + + +
+ Object detection in visible (RGB) and infrared (IR) images has been widely +applied in recent years. Leveraging the complementary characteristics of RGB +and IR images, the object detector provides reliable and robust object +localization from day to night. Most existing fusion strategies directly input +RGB and IR images into deep neural networks, leading to inferior detection +performance. However, RGB and IR features carry modality-specific noise, which +these strategies propagate and amplify in the fused features. +Inspired by the mechanism of the human brain processing multimodal information, +in this paper, we introduce a new coarse-to-fine perspective to purify and fuse +two modality features. Specifically, following this perspective, we design a +Redundant Spectrum Removal module to coarsely remove interfering information +within each modality and a Dynamic Feature Selection module to finely select +the desired features for feature fusion. To verify the effectiveness of the +coarse-to-fine fusion strategy, we construct a new object detector called the +Removal and Selection Detector (RSDet). Extensive experiments on three RGB-IR +object detection datasets verify the superior performance of our method. + +
+
+ comment: 11pages, 11figures +
+
+
+
+
+ + ♻ ☆ Interpretable Geoscience Artificial Intelligence (XGeoS-AI): Application + to Demystify Image Recognition + + +
+ As Earth science enters the era of big data, artificial intelligence (AI) not +only offers great potential for solving geoscience problems, but also plays a +critical role in accelerating the understanding of the complex, interactive, +and multiscale processes of Earth's behavior. As geoscience AI models are +progressively utilized for significant predictions in crucial situations, +geoscience researchers are increasingly demanding their interpretability and +versatility. This study proposes an interpretable geoscience artificial +intelligence (XGeoS-AI) framework to unravel the mystery of image recognition +in the Earth sciences, and its effectiveness and versatility is demonstrated by +taking computed tomography (CT) image recognition as an example. Inspired by +the mechanism of human vision, the proposed XGeoS-AI framework generates a +threshold value from a local region within the whole image to complete the +recognition. Different kinds of artificial intelligence (AI) methods, such as +Support Vector Regression (SVR), Multilayer Perceptron (MLP), Convolutional +Neural Network (CNN), can be adopted as the AI engines of the proposed XGeoS-AI +framework to efficiently complete geoscience image recognition tasks. +Experimental results demonstrate that the effectiveness, versatility, and +heuristics of the proposed framework have great potential in solving geoscience +image recognition problems. Interpretable AI should receive more and more +attention in the field of the Earth sciences, which is the key to promoting +more rational and wider applications of AI in the field of Earth sciences. In +addition, the proposed interpretable framework may be the forerunner of +technological innovation in the Earth sciences. + +
+
+ comment: there are some errors in the results, and a newer revision is still + being prepared +
+
+
+
+
+ + ♻ ☆ Deep Regression Representation Learning with Topology ICML 2024 + + +
+ Most works studying representation learning focus only on classification and +neglect regression. Yet, the learning objectives and therefore the +representation topologies of the two tasks are fundamentally different: +classification targets class separation, leading to disconnected +representations, whereas regression requires ordinality with respect to the +target, leading to continuous representations. We thus wonder how the +effectiveness of a regression representation is influenced by its topology, +with evaluation based on the Information Bottleneck (IB) principle. + The IB principle is an important framework that provides principles for +learning effectiveness representations. We establish two connections between it +and the topology of regression representations. The first connection reveals +that a lower intrinsic dimension of the feature space implies a reduced +complexity of the representation Z. This complexity can be quantified as the +conditional entropy of Z on the target space Y and serves as an upper bound on +the generalization error. The second connection suggests learning a feature +space that is topologically similar to the target space will better align with +the IB principle. Based on these two connections, we introduce PH-Reg, a +regularizer specific to regression that matches the intrinsic dimension and +topology of the feature space with the target space. Experiments on synthetic +and real-world regression tasks demonstrate the benefits of PH-Reg. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Comparison of Methods in Skin Pigment Decomposition + + +
+ Decomposition of skin pigment plays an important role in medical fields. Human skin can be decomposed into two primitive components, hemoglobin and melanin. It is our goal to apply these results to the diagnosis of skin cancer. In this paper, various methods for skin pigment decomposition are reviewed comparatively and the performance of each method is evaluated both theoretically and experimentally. In addition, isometric feature mapping (Isomap) is introduced in order to improve the dimensionality reduction performance in the context of skin pigment decomposition. 
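For context, a common baseline in this line of work is to move to optical-density space and separate the two pigments with independent component analysis. The sketch below shows only that generic baseline under stated assumptions (8-bit RGB input, two sources); it is not a reproduction of any specific method compared in the paper.

```python
import numpy as np
from sklearn.decomposition import FastICA

def decompose_skin(rgb):
    """Decompose a skin RGB image (H, W, 3, uint8) into two pigment density maps.

    Classic optical-density + ICA baseline: each pixel's absorbance is modelled
    as a mixture of hemoglobin and melanin components (illustrative sketch).
    """
    od = -np.log(np.clip(rgb.astype(float) / 255.0, 1e-6, 1.0))   # optical density
    ica = FastICA(n_components=2, random_state=0)
    sources = ica.fit_transform(od.reshape(-1, 3))                # (H*W, 2)
    return sources.reshape(rgb.shape[0], rgb.shape[1], 2)
```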
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Towards Inclusive Face Recognition Through Synthetic Ethnicity + Alteration + + +
+ Numerous studies have shown that existing Face Recognition Systems (FRS), +including commercial ones, often exhibit biases toward certain ethnicities due +to under-represented data. In this work, we explore ethnicity alteration and +skin tone modification using synthetic face image generation methods to +increase the diversity of datasets. We conduct a detailed analysis by first +constructing a balanced face image dataset representing three ethnicities: +Asian, Black, and Indian. We then make use of existing Generative Adversarial +Network-based (GAN) image-to-image translation and manifold learning models to +alter the ethnicity from one to another. A systematic analysis is further +conducted to assess the suitability of such datasets for FRS by studying the +realistic skin-tone representation using Individual Typology Angle (ITA). +Further, we also analyze the quality characteristics using existing Face image +quality assessment (FIQA) approaches. We then provide a holistic FRS +performance analysis using four different systems. Our findings pave the way +for future research works in (i) developing both specific ethnicity and general +(any to any) ethnicity alteration models, (ii) expanding such approaches to +create databases with diverse skin tones, (iii) creating datasets representing +various ethnicities which further can help in mitigating bias while addressing +privacy concerns. + +
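The Individual Typology Angle used for the skin-tone analysis has a standard closed form in CIELAB space, ITA = arctan((L* - 50) / b*) x 180 / pi. A minimal computation is sketched below; patch extraction and any ITA-to-category thresholds are omitted, and the input is assumed to be float RGB in [0, 1].

```python
import numpy as np
from skimage.color import rgb2lab

def individual_typology_angle(rgb_patch):
    """Mean Individual Typology Angle (ITA) of a skin patch (float RGB in [0, 1]).

    ITA = arctan((L* - 50) / b*) * 180 / pi in CIELAB space; higher values
    correspond to lighter skin tones. arctan2 is used for numerical safety.
    """
    lab = rgb2lab(rgb_patch)
    L, b = lab[..., 0], lab[..., 2]
    ita = np.degrees(np.arctan2(L - 50.0, b))
    return float(ita.mean())
```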
+
+ comment: 8 Pages +
+
+
+
+
+ + ♻ ☆ 3DTopia: Large Text-to-3D Generation Model with Hybrid Diffusion Priors + + +
+ We present a two-stage text-to-3D generation system, namely 3DTopia, which +generates high-quality general 3D assets within 5 minutes using hybrid +diffusion priors. The first stage samples from a 3D diffusion prior directly +learned from 3D data. Specifically, it is powered by a text-conditioned +tri-plane latent diffusion model, which quickly generates coarse 3D samples for +fast prototyping. The second stage utilizes 2D diffusion priors to further +refine the texture of coarse 3D models from the first stage. The refinement +consists of both latent and pixel space optimization for high-quality texture +generation. To facilitate the training of the proposed system, we clean and +caption the largest open-source 3D dataset, Objaverse, by combining the power +of vision language models and large language models. Experiment results are +reported qualitatively and quantitatively to show the performance of the +proposed system. Our codes and models are available at +https://github.com/3DTopia/3DTopia + +
+
+ comment: Code available at https://github.com/3DTopia/3DTopia +
+
+
+
+
+ + ♻ ☆ Automatic Ultrasound Curve Angle Measurement via Affinity Clustering for + Adolescent Idiopathic Scoliosis Evaluation + + +
+ The current clinical gold standard for evaluating adolescent idiopathic +scoliosis (AIS) is X-ray radiography, using Cobb angle measurement. However, +the frequent monitoring of the AIS progression using X-rays poses a challenge +due to the cumulative radiation exposure. Although 3D ultrasound has been +validated as a reliable and radiation-free alternative for scoliosis +assessment, the process of measuring spinal curvature is still carried out +manually. Consequently, there is a considerable demand for a fully automatic +system that can locate bony landmarks and perform angle measurements. To this +end, we introduce an estimation model for automatic ultrasound curve angle +(UCA) measurement. The model employs a dual-branch network to detect candidate +landmarks and perform vertebra segmentation on ultrasound coronal images. An +affinity clustering strategy is utilized within the vertebral segmentation area +to illustrate the affinity relationship between candidate landmarks. +Subsequently, we can efficiently perform line delineation from a clustered +affinity map for UCA measurement. As our method is specifically designed for +UCA calculation, this method outperforms other state-of-the-art methods for +landmark and line detection tasks. The high correlation between the automatic +UCA and Cobb angle (R$^2$=0.858) suggests that our proposed method can +potentially replace manual UCA measurement in ultrasound scoliosis assessment. + +
+
+
+
+
+ + ♻ ☆ Synapse: Learning Preferential Concepts from Visual Demonstrations + + +
+ This paper addresses the problem of preference learning, which aims to learn +user-specific preferences (e.g., "good parking spot", "convenient drop-off +location") from visual input. Despite its similarity to learning factual +concepts (e.g., "red cube"), preference learning is a fundamentally harder +problem due to its subjective nature and the paucity of person-specific +training data. We address this problem using a new framework called Synapse, +which is a neuro-symbolic approach designed to efficiently learn preferential +concepts from limited demonstrations. Synapse represents preferences as +neuro-symbolic programs in a domain-specific language (DSL) that operates over +images, and leverages a novel combination of visual parsing, large language +models, and program synthesis to learn programs representing individual +preferences. We evaluate Synapse through extensive experimentation including a +user case study focusing on mobility-related concepts in mobile robotics and +autonomous driving. Our evaluation demonstrates that Synapse significantly +outperforms existing baselines as well as its own ablations. The code and other +details can be found on the project website https://amrl.cs.utexas.edu/synapse . + +
+
+ comment: 25 pages, 7 tables, 9 figures; Preprint; Updated figures and + appendix, added VLM ablations +
+
+
+
+
+ + ♻ ☆ Decodable and Sample Invariant Continuous Object Encoder ICLR2024 + + +
+ We propose Hyper-Dimensional Function Encoding (HDFE). Given samples of a continuous object (e.g. a function), HDFE produces an explicit vector representation of the given object, invariant to the sample distribution and density. Sample distribution and density invariance enables HDFE to consistently encode continuous objects regardless of their sampling, and therefore allows neural networks to receive continuous objects as inputs for machine learning tasks, such as classification and regression. Moreover, HDFE does not require any training and is proven to map the object into an organized embedding space, which facilitates the training of downstream tasks. In addition, the encoding is decodable, which enables neural networks to regress continuous objects by regressing their encodings. Therefore, HDFE serves as an interface for processing continuous objects. We apply HDFE to function-to-function mapping, where vanilla HDFE achieves performance competitive with the state-of-the-art algorithm. We apply HDFE to point cloud surface normal estimation, where a simple replacement from PointNet to HDFE leads to immediate 12% and 15% error reductions in two benchmarks. In addition, by integrating HDFE into the PointNet-based SOTA network, we improve the SOTA baseline by 2.5% and 1.7% in the same benchmarks. 
+
+ comment: ICLR2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ A Linear Time and Space Local Point Cloud Geometry Encoder via + Vectorized Kernel Mixture (VecKM) ICML2024 + + +
+ We propose VecKM, a local point cloud geometry encoder that is descriptive +and efficient to compute. VecKM leverages a unique approach by vectorizing a +kernel mixture to represent the local point cloud. Such representation's +descriptiveness is supported by two theorems that validate its ability to +reconstruct and preserve the similarity of the local shape. Unlike existing +encoders downsampling the local point cloud, VecKM constructs the local +geometry encoding using all neighboring points, producing a more descriptive +encoding. Moreover, VecKM is efficient to compute and scalable to large point +cloud inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and +reduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$ +is the size of the point cloud, $K$ is the neighborhood size, $d$ is the +encoding dimension, and $p$ is a marginal factor. The efficiency is due to +VecKM's unique factorizable property that eliminates the need of explicitly +grouping points into neighbors. In the normal estimation task, VecKM +demonstrates not only 100x faster inference speed but also highest accuracy and +strongest robustness. In classification and segmentation tasks, integrating +VecKM as a preprocessing module achieves consistently better performance than +the PointNet, PointNet++, and point transformer baselines, and runs +consistently faster by up to 10 times. + +
+
+ comment: ICML2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ Adaptive Guidance Learning for Camouflaged Object Detection + + +
+ Camouflaged object detection (COD) aims to segment objects visually embedded in their surroundings, which is a very challenging task due to the high similarity between the objects and the background. To address it, most methods incorporate additional information (e.g., boundary, texture, and frequency clues) to guide feature learning for better detecting camouflaged objects from the background. Although progress has been made, these methods are basically individually tailored to specific auxiliary cues, thus lacking adaptability and not consistently achieving high segmentation performance. To this end, this paper proposes an adaptive guidance learning network, dubbed AGLNet, which is a unified end-to-end learnable model for exploring and adapting different additional cues in CNN models to guide accurate camouflaged feature learning. Specifically, we first design a straightforward additional information generation (AIG) module to learn additional camouflaged object cues, which can be adapted for the exploration of effective camouflaged features. Then we present a hierarchical feature combination (HFC) module to deeply integrate additional cues and image features to guide camouflaged feature learning in a multi-level fusion manner. A recalibration decoder (RD) then further aggregates and refines the features for accurate object prediction. Extensive experiments on three widely used COD benchmark datasets demonstrate that the proposed method achieves significant performance improvements under different additional cues, and outperforms 20 recent state-of-the-art methods by a large margin. Our code will be made publicly available at: https://github.com/ZNan-Chen/AGLNet. 
+
+
+
+
+ + ♻ ☆ Skip \n: A Simple Method to Reduce Hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have demonstrated impressive capability in visual information understanding with human language. Despite these advances, LVLMs still face challenges with multimodal hallucination, such as generating text descriptions of objects that are not present in the visual information. However, the underlying fundamental reasons for multimodal hallucinations remain poorly explored. In this paper, we propose a new perspective, suggesting that the inherent biases in LVLMs might be a key factor in hallucinations. Specifically, we systematically identify a semantic shift bias related to paragraph breaks (\n\n), where the contents before and after '\n\n' in the training data frequently exhibit significant semantic changes. This pattern leads the model to infer that the contents following '\n\n' should be obviously different from the preceding, less hallucinatory contents, thereby increasing the probability of hallucinatory descriptions subsequent to the '\n\n'. We have validated this hypothesis on multiple publicly available LVLMs. Besides, we find that deliberately inserting '\n\n' into the generated description can induce more hallucinations. A simple method is proposed to effectively mitigate the hallucination of LVLMs by skipping the output of '\n'. 
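The mitigation itself is easy to prototype: during decoding, mask out the token ids that would emit '\n' so the model never produces the paragraph break. The sketch below assumes a standard autoregressive decoding loop over logits; how the paper wires this into each evaluated LVLM may differ, and the newline token ids depend on the tokenizer.

```python
import torch

def skip_newline_logits(logits, newline_token_ids):
    """Mask newline tokens so greedy/sampled decoding never emits '\n'.

    Minimal sketch of the "skip \n" idea; logits has shape (..., vocab_size).
    """
    logits = logits.clone()
    logits[..., list(newline_token_ids)] = float("-inf")
    return logits

# Usage inside a generation loop (newline_ids come from the model's tokenizer):
# logits = model(input_ids).logits[:, -1, :]
# logits = skip_newline_logits(logits, newline_ids)
# next_token = logits.argmax(dim=-1)
```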
+
+
+
+
+ + ♻ ☆ Selective Prediction for Semantic Segmentation using Post-Hoc Confidence + Estimation and Its Performance under Distribution Shift + + +
+ Semantic segmentation plays a crucial role in various computer vision +applications, yet its efficacy is often hindered by the lack of high-quality +labeled data. To address this challenge, a common strategy is to leverage +models trained on data from different populations, such as publicly available +datasets. This approach, however, leads to the distribution shift problem, +presenting a reduced performance on the population of interest. In scenarios +where model errors can have significant consequences, selective prediction +methods offer a means to mitigate risks and reduce reliance on expert +supervision. This paper investigates selective prediction for semantic +segmentation in low-resource settings, thus focusing on post-hoc confidence +estimators applied to pre-trained models operating under distribution shift. We +propose a novel image-level confidence measure tailored for semantic +segmentation and demonstrate its effectiveness through experiments on three +medical imaging tasks. Our findings show that post-hoc confidence estimators +offer a cost-effective approach to reducing the impacts of distribution shift. + +
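As a concrete illustration of post-hoc selective prediction, one can aggregate pixel-wise softmax confidences of a pre-trained segmentation model into a single image-level score and abstain below a threshold. This is only the generic baseline; the paper's tailored image-level confidence measure is not reproduced here, and the threshold value is arbitrary.

```python
import numpy as np

def image_confidence(prob_map):
    """Image-level confidence from a (H, W, C) softmax output: mean of the
    pixel-wise maximum class probability (generic post-hoc baseline)."""
    return float(prob_map.max(axis=-1).mean())

def selective_predict(prob_map, threshold=0.8):
    """Return the predicted mask, or None to defer the image to an expert."""
    if image_confidence(prob_map) < threshold:
        return None                      # abstain under low confidence
    return prob_map.argmax(axis=-1)
```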
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 144 + +
+
+
+ + ☆ Complex Video Reasoning and Robustness Evaluation Suite for Video-LMMs + + +
+ Recent advancements in Large Language Models (LLMs) have led to the development of Video Large Multi-modal Models (Video-LMMs) that can handle a wide range of video understanding tasks. These models have the potential to be deployed in real-world applications such as robotics, AI assistants, medical imaging, and autonomous vehicles. The widespread adoption of Video-LMMs in our daily lives underscores the importance of ensuring and evaluating their robust performance in mirroring human-like reasoning and interaction capabilities in complex, real-world contexts. However, existing benchmarks for Video-LMMs primarily focus on general video comprehension abilities and neglect assessing their reasoning capabilities over complex videos in the real-world context, and robustness of these models through the lens of user prompts as text queries. In this paper, we present the Complex Video Reasoning and Robustness Evaluation Suite (CVRR-ES), a novel benchmark that comprehensively assesses the performance of Video-LMMs across 11 diverse real-world video dimensions. We evaluate 9 recent models, including both open-source and closed-source variants, and find that most of the Video-LMMs, especially open-source ones, struggle with robustness and reasoning when dealing with complex videos. Based on our analysis, we develop a training-free Dual-Step Contextual Prompting (DSCP) technique to enhance the performance of existing Video-LMMs. Our findings provide valuable insights for building the next generation of human-centric AI systems with advanced robustness and reasoning capabilities. Our dataset and code are publicly available at: https://mbzuai-oryx.github.io/CVRR-Evaluation-Suite/. 
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Pose Priors from Language Models + + +
+ We present a zero-shot pose optimization method that enforces accurate +physical contact constraints when estimating the 3D pose of humans. Our central +insight is that since language is often used to describe physical interaction, +large pretrained text-based models can act as priors on pose estimation. + We can thus leverage this insight to improve pose estimation by converting +natural language descriptors, generated by a large multimodal model (LMM), into +tractable losses to constrain the 3D pose optimization. Despite its simplicity, +our method produces surprisingly compelling pose reconstructions of people in +close contact, correctly capturing the semantics of the social and physical +interactions. We demonstrate that our method rivals more complex +state-of-the-art approaches that require expensive human annotation of contact +points and training specialized models. Moreover, unlike previous approaches, +our method provides a unified framework for resolving self-contact and +person-to-person contact. + +
+
+
+
+
+ + ☆ Language-Image Models with 3D Understanding + + +
+ Multi-modal large language models (MLLMs) have shown incredible capabilities in a variety of 2D vision and language tasks. We extend MLLMs' perceptual capabilities to ground and reason about images in 3-dimensional space. To that end, we first develop a large-scale pre-training dataset for 2D and 3D called LV3D by combining multiple existing 2D and 3D recognition datasets under a common task formulation: as multi-turn question-answering. Next, we introduce a new MLLM named Cube-LLM and pre-train it on LV3D. We show that pure data scaling yields a strong 3D perception capability without any 3D-specific architectural design or training objective. Cube-LLM exhibits intriguing properties similar to LLMs: (1) Cube-LLM can apply chain-of-thought prompting to improve 3D understanding from 2D context information. (2) Cube-LLM can follow complex and diverse instructions and adapt to versatile input and output formats. (3) Cube-LLM can be visually prompted, for example with a 2D box or a set of candidate 3D boxes from specialist models. Our experiments on outdoor benchmarks demonstrate that Cube-LLM significantly outperforms existing baselines by 21.3 points of AP-BEV on the Talk2Car dataset for 3D grounded reasoning and 17.7 points on the DriveLM dataset for complex reasoning about driving scenarios, respectively. Cube-LLM also shows competitive results in general MLLM benchmarks such as refCOCO for 2D grounding, with an average score of 87.0, as well as visual question answering benchmarks such as VQAv2, GQA, SQA, POPE, etc. for complex reasoning. Our project is available at https://janghyuncho.github.io/Cube-LLM. 
+
+ comment: Project page: https://janghyuncho.github.io/Cube-LLM +
+
+
+
+
+ + ☆ An Empty Room is All We Want: Automatic Defurnishing of Indoor Panoramas CVPR 2024 + + +
+ We propose a pipeline that leverages Stable Diffusion to improve inpainting +results in the context of defurnishing -- the removal of furniture items from +indoor panorama images. Specifically, we illustrate how increased context, +domain-specific model fine-tuning, and improved image blending can produce +high-fidelity inpaints that are geometrically plausible without needing to rely +on room layout estimation. We demonstrate qualitative and quantitative +improvements over other furniture removal techniques. + +
+
+ comment: Accepted at CVPR 2024 workshops. Project page: + https://matterport.github.io/automatic-defurnishing-of-indoor-panoramas/ +
+
+
+
+
+ + ☆ MemoryMamba: Memory-Augmented State Space Model for Defect Recognition + + +
+ As automation advances in manufacturing, the demand for precise and +sophisticated defect detection technologies grows. Existing vision models for +defect recognition methods are insufficient for handling the complexities and +variations of defects in contemporary manufacturing settings. These models +especially struggle in scenarios involving limited or imbalanced defect data. +In this work, we introduce MemoryMamba, a novel memory-augmented state space +model (SSM), designed to overcome the limitations of existing defect +recognition models. MemoryMamba integrates the state space model with the +memory augmentation mechanism, enabling the system to maintain and retrieve +essential defect-specific information in training. Its architecture is designed +to capture dependencies and intricate defect characteristics, which are crucial +for effective defect detection. In the experiments, MemoryMamba was evaluated +across four industrial datasets with diverse defect types and complexities. The +model consistently outperformed other methods, demonstrating its capability to +adapt to various defect recognition scenarios. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ Diffeomorphic Template Registration for Atmospheric Turbulence + Mitigation + + +
+ We describe a method for recovering the irradiance underlying a collection of +images corrupted by atmospheric turbulence. Since supervised data is often +technically impossible to obtain, assumptions and biases have to be imposed to +solve this inverse problem, and we choose to model them explicitly. Rather than +initializing a latent irradiance ("template") by heuristics to estimate +deformation, we select one of the images as a reference, and model the +deformation in this image by the aggregation of the optical flow from it to +other images, exploiting a prior imposed by Central Limit Theorem. Then with a +novel flow inversion module, the model registers each image TO the template but +WITHOUT the template, avoiding artifacts related to poor template +initialization. To illustrate the robustness of the method, we simply (i) +select the first frame as the reference and (ii) use the simplest optical flow +to estimate the warpings, yet the improvement in registration is decisive in +the final reconstruction, as we achieve state-of-the-art performance despite +its simplicity. The method establishes a strong baseline that can be further +improved by integrating it seamlessly into more sophisticated pipelines, or +with domain-specific methods if so desired. + +
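The flow-aggregation step can be pictured as follows: compute optical flow from the chosen reference frame to every other frame and average the fields, so that, under the Central Limit Theorem prior, the mean field approximates the reference frame's own deformation from the latent template. The sketch below uses OpenCV's Farneback flow on grayscale frames as a stand-in for "the simplest optical flow" and deliberately omits the paper's flow-inversion module.

```python
import cv2
import numpy as np

def estimate_template_deformation(frames, ref_idx=0):
    """Average optical flow from a reference frame to all other frames.

    frames: list of grayscale uint8 images. The returned (H, W, 2) mean flow
    serves as a rough estimate of the reference frame's deformation from the
    latent template (illustrative sketch only).
    """
    ref = frames[ref_idx]
    flows = []
    for i, frame in enumerate(frames):
        if i == ref_idx:
            continue
        flows.append(cv2.calcOpticalFlowFarneback(
            ref, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0))
    return np.mean(flows, axis=0)
```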
+
+
+
+
+ + ☆ CICA: Content-Injected Contrastive Alignment for Zero-Shot Document + Image Classification ICDAR 2024 + + +
+ Zero-shot learning has been extensively investigated in the broader field of +visual recognition, attracting significant interest recently. However, the +current work on zero-shot learning in document image classification remains +scarce. The existing studies either focus exclusively on zero-shot inference, +or their evaluation does not align with the established criteria of zero-shot +evaluation in the visual recognition domain. We provide a comprehensive +document image classification analysis in Zero-Shot Learning (ZSL) and +Generalized Zero-Shot Learning (GZSL) settings to address this gap. Our +methodology and evaluation align with the established practices of this domain. +Additionally, we propose zero-shot splits for the RVL-CDIP dataset. +Furthermore, we introduce CICA (pronounced 'ki-ka'), a framework that enhances +the zero-shot learning capabilities of CLIP. CICA consists of a novel 'content +module' designed to leverage any generic document-related textual information. +The discriminative features extracted by this module are aligned with CLIP's +text and image features using a novel 'coupled-contrastive' loss. Our module +improves CLIP's ZSL top-1 accuracy by 6.7% and GZSL harmonic mean by 24% on the +RVL-CDIP dataset. Our module is lightweight and adds only 3.3% more parameters +to CLIP. Our work sets the direction for future research in zero-shot document +classification. + +
+
+ comment: 18 Pages, 4 Figures and Accepted in ICDAR 2024 +
+
+
+
+
+ + ☆ A Construct-Optimize Approach to Sparse View Synthesis without Camera + Pose + + +
+ Novel view synthesis from a sparse set of input images is a challenging +problem of great practical interest, especially when camera poses are absent or +inaccurate. Direct optimization of camera poses and usage of estimated depths +in neural radiance field algorithms usually do not produce good results because +of the coupling between poses and depths, and inaccuracies in monocular depth +estimation. In this paper, we leverage the recent 3D Gaussian splatting method +to develop a novel construct-and-optimize method for sparse view synthesis +without camera poses. Specifically, we construct a solution progressively by +using monocular depth and projecting pixels back into the 3D world. During +construction, we optimize the solution by detecting 2D correspondences between +training views and the corresponding rendered images. We develop a unified +differentiable pipeline for camera registration and adjustment of both camera +poses and depths, followed by back-projection. We also introduce a novel notion +of an expected surface in Gaussian splatting, which is critical to our +optimization. These steps enable a coarse solution, which can then be low-pass +filtered and refined using standard optimization methods. We demonstrate +results on the Tanks and Temples and Static Hikes datasets with as few as three +widely-spaced views, showing significantly better quality than competing +methods, including those with approximate camera pose information. Moreover, +our results improve with more views and outperform previous InstantNGP and +Gaussian Splatting algorithms even when using half the dataset. + +
+
+
+
+
+ + ☆ Field-of-View Extension for Diffusion MRI via Deep Generative Models + + +
+ Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of whole-brain tissue microstructure and connectivity can be severely impeded by an incomplete field-of-view (FOV). This work aims to develop a method for imputing the missing slices directly from existing dMRI scans with an incomplete FOV. We hypothesize that the imputed image with complete FOV can improve the whole-brain tractography for corrupted data with incomplete FOV. Therefore, our approach provides a desirable alternative to discarding the valuable dMRI data, enabling subsequent tractography analyses that would otherwise be challenging or unattainable with corrupted data. Approach: We propose a framework based on a deep generative model that estimates the absent brain regions in dMRI scans with incomplete FOV. The model is capable of learning both the diffusion characteristics in diffusion-weighted images (DWI) and the anatomical features evident in the corresponding structural images for efficiently imputing missing slices of DWI outside of incomplete FOV. Results: For evaluating the imputed slices, on the WRAP dataset the proposed framework achieved PSNR(b=0)=22.397, SSIM(b=0)=0.905, PSNR(b=1300)=22.479, SSIM(b=1300)=0.893; on the NACC dataset it achieved PSNR(b=0)=21.304, SSIM(b=0)=0.892, PSNR(b=1300)=21.599, SSIM(b=1300)=0.877. The proposed framework improved the tractography accuracy, as demonstrated by an increased average Dice score for 72 tracts (p < 0.001) on both the WRAP and NACC datasets. Conclusions: Results suggest that the proposed framework achieved sufficient imputation performance in dMRI data with incomplete FOV for improving whole-brain tractography, thereby repairing the corrupted data. Our approach achieved more accurate whole-brain tractography results with extended and complete FOV and reduced the uncertainty when analyzing bundles associated with Alzheimer's Disease. 
+
+ comment: 20 pages, 11 figures +
+
+
+
+
+ + ☆ Generated Contents Enrichment + + +
+ In this paper, we investigate a novel artificial intelligence generation task, termed generated contents enrichment (GCE). Unlike the conventional AI content generation task, which implicitly enriches the given textual description with limited semantics to generate visually realistic content, our proposed GCE strives to perform content enrichment explicitly in both the visual and textual domains, so that the enriched contents are visually realistic, structurally reasonable, and semantically abundant. To solve GCE, we propose a deep end-to-end method that explicitly explores the semantics and inter-semantic relationships during the enrichment. Specifically, we first model the input description as a semantic graph, wherein each node represents an object and each edge corresponds to the inter-object relationship. We then adopt Graph Convolutional Networks on top of the input scene description to predict the enriching objects and their relationships with the input objects. Finally, the enriched graph is fed into an image synthesis model to carry out the visual content generation. Our experiments conducted on the Visual Genome dataset exhibit promising and visually plausible results. 
+
+
+
+
+ + ☆ Learning Robust Classifiers with Self-Guided Spurious Correlation + Mitigation IJCAI 2024 + + +
+ Deep neural classifiers tend to rely on spurious correlations between +spurious attributes of inputs and targets to make predictions, which could +jeopardize their generalization capability. Training classifiers robust to +spurious correlations typically relies on annotations of spurious correlations +in data, which are often expensive to get. In this paper, we tackle an +annotation-free setting and propose a self-guided spurious correlation +mitigation framework. Our framework automatically constructs fine-grained +training labels tailored for a classifier obtained with empirical risk +minimization to improve its robustness against spurious correlations. The +fine-grained training labels are formulated with different prediction behaviors +of the classifier identified in a novel spuriousness embedding space. We +construct the space with automatically detected conceptual attributes and a +novel spuriousness metric which measures how likely a class-attribute +correlation is exploited for predictions. We demonstrate that training the +classifier to distinguish different prediction behaviors reduces its reliance +on spurious correlations without knowing them a priori and outperforms prior +methods on five real-world datasets. + +
+
+ comment: Accepted to IJCAI 2024 +
+
+
+
+
+ + ☆ Collecting Consistently High Quality Object Tracks with Minimal Human + Involvement by Using Self-Supervised Learning to Detect Tracker Errors + + +
+ We propose a hybrid framework for consistently producing high-quality object +tracks by combining an automated object tracker with little human input. The +key idea is to tailor a module for each dataset to intelligently decide when an +object tracker is failing and so humans should be brought in to re-localize an +object for continued tracking. Our approach leverages self-supervised learning +on unlabeled videos to learn a tailored representation for a target object that +is then used to actively monitor its tracked region and decide when the tracker +fails. Since labeled data is not needed, our approach can be applied to novel +object categories. Experiments on three datasets demonstrate our method +outperforms existing approaches, especially for small, fast moving, or occluded +objects. + +
+
+
+
+
+ + ☆ Classification of Breast Cancer Histopathology Images using a Modified + Supervised Contrastive Learning Method + + +
+ Deep neural networks have reached remarkable achievements in medical image +processing tasks, specifically classifying and detecting various diseases. +However, when confronted with limited data, these networks face a critical +vulnerability, often succumbing to overfitting by excessively memorizing the +limited information available. This work addresses the challenge mentioned +above by improving the supervised contrastive learning method to reduce the +impact of false positives. Unlike most existing methods that rely predominantly +on fully supervised learning, our approach leverages the advantages of +self-supervised learning in conjunction with employing the available labeled +data. We evaluate our method on the BreakHis dataset, which consists of breast +cancer histopathology images, and demonstrate an increase in classification +accuracy by 1.45% at the image level and 1.42% at the patient level compared to +the state-of-the-art method. This improvement corresponds to 93.63% absolute +accuracy, highlighting our approach's effectiveness in leveraging data +properties to learn more appropriate representation space. + +
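For reference, the objective the paper modifies is the standard supervised contrastive (SupCon) loss of Khosla et al., which pulls together embeddings that share a label and pushes apart the rest. A plain PyTorch version is sketched below; the paper's modification for reducing the impact of false positives is not included.

```python
import torch
import torch.nn.functional as F

def supcon_loss(features, labels, temperature=0.07):
    """Standard supervised contrastive loss (not the paper's modified version).

    features: (N, D) embeddings, labels: (N,) integer class ids.
    """
    features = F.normalize(features, dim=1)
    sim = features @ features.t() / temperature                  # (N, N) similarities
    pos_mask = labels.unsqueeze(0).eq(labels.unsqueeze(1)).float()
    self_mask = torch.eye(len(labels), device=features.device)
    pos_mask = pos_mask - self_mask                              # positives, excluding self
    logits = sim - 1e9 * self_mask                               # remove self-similarity
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
    pos_count = pos_mask.sum(1).clamp(min=1)                     # avoid division by zero
    return (-(pos_mask * log_prob).sum(1) / pos_count).mean()
```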
+
+ comment: 16 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Neural Graph Mapping for Dense SLAM with Efficient Loop Closure + + +
+ Existing neural field-based SLAM methods typically employ a single monolithic +field as their scene representation. This prevents efficient incorporation of +loop closure constraints and limits scalability. To address these shortcomings, +we propose a neural mapping framework which anchors lightweight neural fields +to the pose graph of a sparse visual SLAM system. Our approach shows the +ability to integrate large-scale loop closures, while limiting necessary +reintegration. Furthermore, we verify the scalability of our approach by +demonstrating successful building-scale mapping taking multiple loop closures +into account during the optimization, and show that our method outperforms +existing state-of-the-art approaches on large scenes in terms of quality and +runtime. Our code is available at +https://kth-rpl.github.io/neural_graph_mapping/. + +
+
+ comment: Project page: https://kth-rpl.github.io/neural_graph_mapping/ +
+
+
+
+
+ + ☆ Dual Relation Mining Network for Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) aims to recognize novel classes through transferring +shared semantic knowledge (e.g., attributes) from seen classes to unseen +classes. Recently, attention-based methods have exhibited significant progress +which align visual features and attributes via a spatial attention mechanism. +However, these methods only explore visual-semantic relationship in the spatial +dimension, which can lead to classification ambiguity when different attributes +share similar attention regions, and semantic relationship between attributes +is rarely discussed. To alleviate the above problems, we propose a Dual +Relation Mining Network (DRMN) to enable more effective visual-semantic +interactions and learn semantic relationship among attributes for knowledge +transfer. Specifically, we introduce a Dual Attention Block (DAB) for +visual-semantic relationship mining, which enriches visual information by +multi-level feature fusion and conducts spatial attention for visual to +semantic embedding. Moreover, an attribute-guided channel attention is utilized +to decouple entangled semantic features. For semantic relationship modeling, we +utilize a Semantic Interaction Transformer (SIT) to enhance the generalization +of attribute representations among images. Additionally, a global +classification branch is introduced as a complement to human-defined semantic +attributes, and we then combine the results with attribute-based +classification. Extensive experiments demonstrate that the proposed DRMN leads +to new state-of-the-art performances on three standard ZSL benchmarks, i.e., +CUB, SUN, and AwA2. + +
+
+
+
+
+ + ☆ Liberating Seen Classes: Boosting Few-Shot and Zero-Shot Text + Classification via Anchor Generation and Classification Reframing AAAI 2024 + + +
+ Few-shot and zero-shot text classification aim to recognize samples from +novel classes with limited labeled samples or no labeled samples at all. While +prevailing methods have shown promising performance via transferring knowledge +from seen classes to unseen classes, they are still limited by (1) Inherent +dissimilarities among classes make the transformation of features learned from +seen classes to unseen classes both difficult and inefficient. (2) Rare labeled +novel samples usually cannot provide enough supervision signals to enable the +model to adjust from the source distribution to the target distribution, +especially for complicated scenarios. To alleviate the above issues, we propose +a simple and effective strategy for few-shot and zero-shot text classification. +We aim to liberate the model from the confines of seen classes, thereby +enabling it to predict unseen categories without the necessity of training on +seen classes. Specifically, for mining more related unseen category knowledge, +we utilize a large pre-trained language model to generate pseudo novel samples, +and select the most representative ones as category anchors. After that, we +convert the multi-class classification task into a binary classification task +and use the similarities of query-anchor pairs for prediction to fully leverage +the limited supervision signals. Extensive experiments on six widely used +public datasets show that our proposed method can outperform other strong +baselines significantly in few-shot and zero-shot tasks, even without using any +seen class samples. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ CCDM: Continuous Conditional Diffusion Models for Image Generation + + +
+ Continuous Conditional Generative Modeling (CCGM) aims to estimate the +distribution of high-dimensional data, typically images, conditioned on scalar +continuous variables known as regression labels. While Continuous conditional +Generative Adversarial Networks (CcGANs) were initially designed for this task, +their adversarial training mechanism remains vulnerable to extremely sparse or +imbalanced data, resulting in suboptimal outcomes. To enhance the quality of +generated images, a promising alternative is to replace CcGANs with Conditional +Diffusion Models (CDMs), renowned for their stable training process and ability +to produce more realistic images. However, existing CDMs encounter challenges +when applied to CCGM tasks due to several limitations such as inadequate U-Net +architectures and deficient model fitting mechanisms for handling regression +labels. In this paper, we introduce Continuous Conditional Diffusion Models +(CCDMs), the first CDM designed specifically for the CCGM task. CCDMs address +the limitations of existing CDMs by introducing specially designed conditional +diffusion processes, a modified denoising U-Net with a custom-made conditioning +mechanism, a novel hard vicinal loss for model fitting, and an efficient +conditional sampling procedure. With comprehensive experiments on four datasets +with varying resolutions ranging from 64x64 to 192x192, we demonstrate the +superiority of the proposed CCDM over state-of-the-art CCGM models, +establishing new benchmarks in CCGM. Extensive ablation studies validate the +model design and implementation configuration of the proposed CCDM. Our code is +publicly available at https://github.com/UBCDingXin/CCDM. + +
+
+
+
+
+ + ☆ Optimizing Hand Region Detection in MediaPipe Holistic Full-Body Pose + Estimation to Improve Accuracy and Avoid Downstream Errors + + +
+ This paper addresses a critical flaw in MediaPipe Holistic's hand Region of +Interest (ROI) prediction, which struggles with non-ideal hand orientations, +affecting sign language recognition accuracy. We propose a data-driven approach +to enhance ROI estimation, leveraging an enriched feature set including +additional hand keypoints and the z-dimension. Our results demonstrate better +estimates, with higher Intersection-over-Union compared to the current method. +Our code and optimizations are available at +https://github.com/sign-language-processing/mediapipe-hand-crop-fix. + +
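Intersection-over-Union, the metric used above to compare the predicted crops, is straightforward to compute for axis-aligned boxes; a small helper is shown below, assuming the corner format (x1, y1, x2, y2).

```python
def iou(box_a, box_b):
    """Intersection-over-Union of two axis-aligned boxes in (x1, y1, x2, y2) format."""
    x1, y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    x2, y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-9)
```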
+
+
+
+
+ + ☆ RepVGG-GELAN: Enhanced GELAN with VGG-STYLE ConvNets for Brain Tumour + Detection + + +
+ Object detection algorithms particularly those based on YOLO have +demonstrated remarkable efficiency in balancing speed and accuracy. However, +their application in brain tumour detection remains underexplored. This study +proposes RepVGG-GELAN, a novel YOLO architecture enhanced with RepVGG, a +reparameterized convolutional approach for object detection tasks particularly +focusing on brain tumour detection within medical images. RepVGG-GELAN +leverages the RepVGG architecture to improve both speed and accuracy in +detecting brain tumours. Integrating RepVGG into the YOLO framework aims to +achieve a balance between computational efficiency and detection performance. +This study includes a spatial pyramid pooling-based Generalized Efficient Layer +Aggregation Network (GELAN) architecture which further enhances the capability +of RepVGG. Experimental evaluation conducted on a brain tumour dataset +demonstrates the effectiveness of RepVGG-GELAN surpassing existing RCS-YOLO in +terms of precision and speed. Specifically, RepVGG-GELAN achieves an increased +precision of 4.91% and an increased AP50 of 2.54% over the latest existing +approach while operating at 240.7 GFLOPs. The proposed RepVGG-GELAN with GELAN +architecture presents promising results establishing itself as a +state-of-the-art solution for accurate and efficient brain tumour detection in +medical images. The implementation code is publicly available at +https://github.com/ThensiB/RepVGG-GELAN. + +
+
+
+
+
+ + ☆ Is Sora a World Simulator? A Comprehensive Survey on General World + Models and Beyond + + +
+ General world models represent a crucial pathway toward achieving Artificial General Intelligence (AGI), serving as the cornerstone for various applications ranging from virtual environments to decision-making systems. Recently, the emergence of the Sora model has attracted significant attention due to its remarkable simulation capabilities, exhibiting an incipient comprehension of physical laws. In this survey, we embark on a comprehensive exploration of the latest advancements in world models. Our analysis navigates through the forefront of generative methodologies in video generation, where world models stand as pivotal constructs facilitating the synthesis of highly realistic visual content. Additionally, we scrutinize the burgeoning field of autonomous-driving world models, meticulously delineating their indispensable role in reshaping transportation and urban mobility. Furthermore, we delve into the intricacies inherent in world models deployed within autonomous agents, shedding light on their profound significance in enabling intelligent interactions within dynamic environmental contexts. Finally, we examine challenges and limitations of world models, and discuss their potential future directions. We hope this survey can serve as a foundational reference for the research community and inspire continued innovation. This survey will be regularly updated at: https://github.com/GigaAI-research/General-World-Models-Survey. 
+
+ comment: This survey will be regularly updated at: + https://github.com/GigaAI-research/General-World-Models-Survey +
+
+
+
+
+ + ☆ Low-light Object Detection + + +
+ In this competition, we employed a model fusion approach to achieve object detection results close to those of real images. Our method is based on the CO-DETR model, which was trained on two sets of data: one containing images captured under dark conditions and another containing low-light images processed with enhancement techniques. We applied various enhancement techniques to the test data to generate multiple sets of prediction results. Finally, we applied a clustering aggregation method guided by IoU thresholds to select the optimal results. 
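The abstract only states that predictions from the differently enhanced test sets are merged by IoU-threshold clustering, so the sketch below is a hypothetical greedy variant: pool all detections, group boxes whose IoU exceeds a threshold, and average each cluster. The box format, threshold, and averaging rule are assumptions, not the competition entry's exact procedure.

```python
import numpy as np

def _iou(a, b):
    """IoU of two boxes in (x1, y1, x2, y2) format."""
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / (union + 1e-9)

def cluster_aggregate(boxes, scores, iou_thr=0.6):
    """Greedily cluster pooled detections by IoU and average each cluster
    into one (box, score) pair (illustrative aggregation rule)."""
    order = np.argsort(scores)[::-1]
    used = np.zeros(len(boxes), dtype=bool)
    fused = []
    for i in order:
        if used[i]:
            continue
        members = [j for j in order
                   if not used[j] and _iou(boxes[i], boxes[j]) >= iou_thr]
        used[members] = True
        fused.append((np.mean([boxes[j] for j in members], axis=0),
                      float(max(scores[j] for j in members))))
    return fused
```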
+
+
+
+
+ + ☆ Boosting Single Positive Multi-label Classification with Generalized + Robust Loss + + +
+ Multi-label learning (MLL) requires comprehensive multi-semantic annotations that are hard to fully obtain, thus often resulting in missing-label scenarios. In this paper, we investigate Single Positive Multi-label Learning (SPML), where each image is associated with merely one positive label. Existing SPML methods only focus on designing losses using mechanisms such as hard pseudo-labeling and robust losses, mostly leading to unacceptable false negatives. To address this issue, we first propose a generalized loss framework based on expected risk minimization to provide soft pseudo labels, and point out that the former losses can be seamlessly converted into our framework. In particular, we design a novel robust loss based on our framework, which enjoys flexible coordination between false positives and false negatives, and can additionally deal with the imbalance between positive and negative samples. Extensive experiments show that our approach can significantly improve SPML performance and outperform the vast majority of state-of-the-art methods on all four benchmarks. 
+
+ comment: 14 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ A Rate-Distortion-Classification Approach for Lossy Image Compression + + +
+ In lossy image compression, the objective is to achieve minimal signal +distortion while compressing images to a specified bit rate. The increasing +demand for visual analysis applications, particularly in classification tasks, +has emphasized the significance of considering semantic distortion in +compressed images. To bridge the gap between image compression and visual +analysis, we propose a Rate-Distortion-Classification (RDC) model for lossy +image compression, offering a unified framework to optimize the trade-off +between rate, distortion, and classification accuracy. The RDC model is +extensively analyzed both statistically on a multi-distribution source and +experimentally on the widely used MNIST dataset. The findings reveal that the +RDC model exhibits desirable properties, including monotonic non-increasing and +convex functions, under certain conditions. This work provides insights into +the development of human-machine friendly compression methods and Video Coding +for Machine (VCM) approaches, paving the way for end-to-end image compression +techniques in real-world applications. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ UnsafeBench: Benchmarking Image Safety Classifiers on Real-World and + AI-Generated Images + + +
+ Image safety classifiers play an important role in identifying and mitigating +the spread of unsafe images online (e.g., images including violence, hateful +rhetoric, etc.). At the same time, with the advent of text-to-image models and +increasing concerns about the safety of AI models, developers are increasingly +relying on image safety classifiers to safeguard their models. Yet, the +performance of current image safety classifiers remains unknown for real-world +and AI-generated images. To bridge this research gap, in this work, we propose +UnsafeBench, a benchmarking framework that evaluates the effectiveness and +robustness of image safety classifiers. First, we curate a large dataset of 10K +real-world and AI-generated images that are annotated as safe or unsafe based +on a set of 11 unsafe categories of images (sexual, violent, hateful, etc.). +Then, we evaluate the effectiveness and robustness of five popular image safety +classifiers, as well as three classifiers that are powered by general-purpose +visual language models. Our assessment indicates that existing image safety +classifiers are not comprehensive and effective enough in mitigating the +multifaceted problem of unsafe images. Also, we find that classifiers trained +only on real-world images tend to have degraded performance when applied to +AI-generated images. Motivated by these findings, we design and implement a +comprehensive image moderation tool called PerspectiveVision, which effectively +identifies 11 categories of real-world and AI-generated unsafe images. The best +PerspectiveVision model achieves an overall F1-Score of 0.810 on six evaluation +datasets, which is comparable with closed-source and expensive state-of-the-art +models like GPT-4V. UnsafeBench and PerspectiveVision can aid the research +community in better understanding the landscape of image safety classification +in the era of generative AI. + +
+
+
+
+
+ + ☆ LGTM: Local-to-Global Text-Driven Human Motion Diffusion Model SIGGRAPH 2024 + + +
+ In this paper, we introduce LGTM, a novel Local-to-Global pipeline for +Text-to-Motion generation. LGTM utilizes a diffusion-based architecture and +aims to address the challenge of accurately translating textual descriptions +into semantically coherent human motion in computer animation. Specifically, +traditional methods often struggle with semantic discrepancies, particularly in +aligning specific motions to the correct body parts. To address this issue, we +propose a two-stage pipeline to overcome this challenge: it first employs large +language models (LLMs) to decompose global motion descriptions into +part-specific narratives, which are then processed by independent body-part +motion encoders to ensure precise local semantic alignment. Finally, an +attention-based full-body optimizer refines the motion generation results and +guarantees the overall coherence. Our experiments demonstrate that LGTM gains +significant improvements in generating locally accurate, semantically-aligned +human motion, marking a notable advancement in text-to-motion applications. +Code and data for this paper are available at https://github.com/L-Sun/LGTM + +
+
+ comment: 9 pages,7 figures, SIGGRAPH 2024 +
+
+
+
+
+ + ☆ A Lightweight Neural Architecture Search Model for Medical Image + Classification + + +
+ Accurate classification of medical images is essential for modern +diagnostics. Deep learning advancements led clinicians to increasingly use +sophisticated models to make faster and more accurate decisions, sometimes +replacing human judgment. However, model development is costly and repetitive. +Neural Architecture Search (NAS) provides solutions by automating the design of +deep learning architectures. This paper presents ZO-DARTS+, a differentiable +NAS algorithm that improves search efficiency through a novel method of +generating sparse probabilities by bi-level optimization. Experiments on five +public medical datasets show that ZO-DARTS+ matches the accuracy of +state-of-the-art solutions while reducing search times by up to three times. + +
+
+
+
+
+ + ☆ SSyncOA: Self-synchronizing Object-aligned Watermarking to Resist + Cropping-paste Attacks ICME 2024 + + +
+ Modern image processing tools have made it easy for attackers to crop the +region or object of interest in images and paste it into other images. The +challenge this cropping-paste attack poses to the watermarking technology is +that it breaks the synchronization of the image watermark, introducing multiple +superimposed desynchronization distortions, such as rotation, scaling, and +translation. However, current watermarking methods can only resist a single +type of desynchronization and cannot be applied to protect the object's +copyright under the cropping-paste attack. With the finding that the key to +resisting the cropping-paste attack lies in robust features of the object to +protect, this paper proposes a self-synchronizing object-aligned watermarking +method, called SSyncOA. Specifically, we first constrain the watermarked region +to be aligned with the protected object, and then synchronize the watermark's +translation, rotation, and scaling distortions by normalizing the object +invariant features, i.e., its centroid, principal orientation, and minimum +bounding square, respectively. To make the watermark embedded in the protected +object, we introduce the object-aligned watermarking model, which incorporates +the real cropping-paste attack into the encoder-noise layer-decoder pipeline +and is optimized end-to-end. Besides, we illustrate the effect of different +desynchronization distortions on the watermark training, which confirms the +necessity of the self-synchronization process. Extensive experiments +demonstrate the superiority of our method over other SOTAs. + +
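The self-synchronization step can be approximated with classical image moments: the centroid and principal orientation of the protected object's mask fix translation and rotation, and its minimum bounding square fixes scale. The OpenCV sketch below illustrates that normalization under stated assumptions (binary mask available, 256x256 output); it is not the paper's exact pipeline.

```python
import cv2
import numpy as np

def normalize_object(image, mask, out_size=256):
    """Normalize an object region by centroid, principal orientation, and
    minimum bounding square (illustrative synchronization sketch)."""
    m = cv2.moments(mask.astype(np.uint8), binaryImage=True)
    cx, cy = m["m10"] / m["m00"], m["m01"] / m["m00"]               # centroid
    theta = 0.5 * np.arctan2(2 * m["mu11"], m["mu20"] - m["mu02"])  # orientation
    rot = cv2.getRotationMatrix2D((cx, cy), np.degrees(theta), 1.0)
    img_r = cv2.warpAffine(image, rot, image.shape[1::-1])
    msk_r = cv2.warpAffine(mask.astype(np.uint8), rot, mask.shape[1::-1])
    x, y, w, h = cv2.boundingRect(msk_r)
    side = max(w, h)                                                # bounding square
    crop = img_r[y:y + side, x:x + side]                            # clipped at borders
    return cv2.resize(crop, (out_size, out_size))
```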
+
+ comment: 7 pages, 5 figures (Have been accepted by ICME 2024) +
+
+
+
+
+ + ☆ DBDH: A Dual-Branch Dual-Head Neural Network for Invisible Embedded + Regions Localization IJCNN 2024 + + +
+ Embedding invisible hyperlinks or hidden codes in images to replace QR codes +has become a hot topic recently. This technology requires first localizing the +embedded region in the captured photos before decoding. Existing methods that +train models to find the invisible embedded region struggle to obtain accurate +localization results, leading to degraded decoding accuracy. This limitation is +primarily because the CNN network is sensitive to low-frequency signals, while +the embedded signal is typically in the high-frequency form. Based on this, +this paper proposes a Dual-Branch Dual-Head (DBDH) neural network tailored for +the precise localization of invisible embedded regions. Specifically, DBDH uses +a low-level texture branch containing 62 high-pass filters to capture the +high-frequency signals induced by embedding. A high-level context branch is +used to extract discriminative features between the embedded and normal +regions. DBDH employs a detection head to directly detect the four vertices of +the embedding region. In addition, we introduce an extra segmentation head to +segment the mask of the embedding region during training. The segmentation head +provides pixel-level supervision for model learning, facilitating better +learning of the embedded signals. Based on two state-of-the-art invisible +offline-to-online messaging methods, we construct two datasets and augmentation +strategies for training and testing localization models. Extensive experiments +demonstrate the superior performance of the proposed DBDH over existing +methods. + +
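The design intuition behind the low-level texture branch is a frozen bank of high-pass filters, so that the faint embedded signal is not washed out by a CNN's bias toward low frequencies. The module below mimics that idea by tiling a Laplacian kernel 62 times; the paper's actual 62 filters are not reproduced here.

```python
import torch
import torch.nn as nn

class HighPassBranch(nn.Module):
    """Low-level texture branch: a conv layer with fixed high-pass kernels.

    A Laplacian kernel is tiled as a stand-in for the paper's 62 filters,
    purely to illustrate the frozen filter-bank design.
    """
    def __init__(self, n_filters=62):
        super().__init__()
        lap = torch.tensor([[0., -1., 0.], [-1., 4., -1.], [0., -1., 0.]])
        weight = lap.repeat(n_filters, 1, 1, 1)            # (62, 1, 3, 3)
        self.conv = nn.Conv2d(1, n_filters, 3, padding=1, bias=False)
        self.conv.weight = nn.Parameter(weight, requires_grad=False)  # frozen

    def forward(self, x):                                   # x: (B, 1, H, W)
        return self.conv(x)
```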
+
+ comment: 7 pages, 6 figures (Have been accepted by IJCNN 2024) +
+
+
+
+
+ + ☆ Implantable Adaptive Cells: differentiable architecture search to + improve the performance of any trained U-shaped network + + +
+ This paper introduces a novel approach to enhance the performance of pre-trained neural networks in medical image segmentation using Neural Architecture Search (NAS) methods, specifically Differentiable Architecture Search (DARTS). We present the concept of the Implantable Adaptive Cell (IAC): small but powerful modules identified through Partially-Connected DARTS, designed to be injected into the skip connections of an existing and already trained U-shaped model. Our strategy allows for the seamless integration of the IAC into the pre-existing architecture, thereby enhancing its performance without necessitating a complete retraining from scratch. The empirical studies, focusing on medical image segmentation tasks, demonstrate the efficacy of this method. The integration of specialized IAC cells into various configurations of the U-Net model increases segmentation accuracy by almost 2 percentage points on average for the validation dataset and over 3 percentage points for the training dataset. The findings of this study not only offer a cost-effective alternative to the complete overhaul of complex models for performance upgrades but also indicate the potential applicability of our method to other architectures and problem domains. 
+
+
+
+
+ + ☆ Gaussian Splatting: 3D Reconstruction and Novel View Synthesis, a Review + + +
+ Image-based 3D reconstruction is a challenging task that involves inferring +the 3D shape of an object or scene from a set of input images. Learning-based +methods have gained attention for their ability to directly estimate 3D shapes. +This review paper focuses on state-of-the-art techniques for 3D reconstruction, +including the generation of novel, unseen views. An overview of recent +developments in the Gaussian Splatting method is provided, covering input +types, model structures, output representations, and training strategies. +Unresolved challenges and future directions are also discussed. Given the rapid +progress in this domain and the numerous opportunities for enhancing 3D +reconstruction methods, a comprehensive examination of algorithms appears +essential. Consequently, this study offers a thorough overview of the latest +advancements in Gaussian Splatting. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ An Image Quality Evaluation and Masking Algorithm Based On Pre-trained + Deep Neural Networks + + +
+ With the growing amount of astronomical data, there is an increasing need for automated data processing pipelines, which can extract scientific information from observation data without human interventions. A critical aspect of these pipelines is the image quality evaluation and masking algorithm, which evaluates image qualities based on various factors such as cloud coverage, sky brightness, scattering light from the optical system, point spread function size and shape, and read-out noise. Occasionally, the algorithm requires masking of areas severely affected by noise. However, the algorithm often necessitates significant human interventions, reducing data processing efficiency. In this study, we present a deep learning based image quality evaluation algorithm that uses an autoencoder to learn features of high quality astronomical images. The trained autoencoder enables automatic evaluation of image quality and masking of noise affected areas. We have evaluated the performance of our algorithm using two test cases: images with point spread functions of varying full width at half maximum, and images with complex backgrounds. In the first scenario, our algorithm could effectively identify variations of the point spread functions, which can provide valuable reference information for photometry. In the second scenario, our method could successfully mask regions affected by complex backgrounds, which could significantly increase the photometry accuracy. Our algorithm can be employed to automatically evaluate image quality obtained by different sky surveying projects, further increasing the speed and robustness of data processing pipelines. 
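Operationally, a trained autoencoder can drive both outputs described above: patches it reconstructs poorly are masked, and the unmasked fraction serves as a crude image-level quality score. The sketch below assumes an `autoencoder` callable that returns a reconstruction of the same shape; the patch size and error threshold are illustrative choices, not the paper's settings.

```python
import numpy as np

def quality_mask(image, autoencoder, patch=32, err_thr=0.05):
    """Flag noise-affected regions as patches the autoencoder reconstructs poorly.

    `autoencoder` is assumed to be trained on high-quality frames and to map an
    image to its reconstruction (sketch of the evaluation/masking step only).
    """
    recon = autoencoder(image)
    err = (image - recon) ** 2
    mask = np.zeros(image.shape[:2], dtype=bool)
    for r in range(0, image.shape[0] - patch + 1, patch):
        for c in range(0, image.shape[1] - patch + 1, patch):
            mask[r:r + patch, c:c + patch] = (
                err[r:r + patch, c:c + patch].mean() > err_thr)
    score = 1.0 - mask.mean()           # simple image-level quality score
    return score, mask
```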
+
+ comment: Accepted by the AJ. The code could be downloaded from: + https://nadc.china-vo.org/res/r101415/ with DOI of: 10.12149/101415 +
+
+
+
+
+ + ☆ 3D LiDAR Mapping in Dynamic Environments Using a 4D Implicit Neural + Representation CVPR 2024 + + +
+ Building accurate maps is a key building block to enable reliable +localization, planning, and navigation of autonomous vehicles. We propose a +novel approach for building accurate maps of dynamic environments utilizing a +sequence of LiDAR scans. To this end, we propose encoding the 4D scene into a +novel spatio-temporal implicit neural map representation by fitting a +time-dependent truncated signed distance function to each point. Using our +representation, we extract the static map by filtering the dynamic parts. Our +neural representation is based on sparse feature grids, a globally shared +decoder, and time-dependent basis functions, which we jointly optimize in an +unsupervised fashion. To learn this representation from a sequence of LiDAR +scans, we design a simple yet efficient loss function to supervise the map +optimization in a piecewise way. We evaluate our approach on various scenes +containing moving objects in terms of the reconstruction quality of static maps +and the segmentation of dynamic point clouds. The experimental results +demonstrate that our method is capable of removing the dynamic part of the +input point clouds while reconstructing accurate and complete 3D maps, +outperforming several state-of-the-art methods. Codes are available at: +https://github.com/PRBonn/4dNDF + +
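+
+ A toy sketch of querying a time-dependent signed distance value, assuming raw
+coordinates plus sinusoidal time basis functions in place of the paper's sparse
+feature grids and shared decoder; all layer sizes and the basis choice are
+assumptions:
+
+    import torch
+    import torch.nn as nn
+
+    class TimeSDF(nn.Module):
+        # f(x, t): a coordinate MLP modulated by time-dependent basis functions.
+        def __init__(self, n_basis=4, hidden=128):
+            super().__init__()
+            self.freq = nn.Parameter(torch.linspace(0.1, 1.0, n_basis))
+            self.mlp = nn.Sequential(nn.Linear(3 + n_basis, hidden), nn.ReLU(),
+                                     nn.Linear(hidden, hidden), nn.ReLU(),
+                                     nn.Linear(hidden, 1))
+
+        def forward(self, xyz, t):
+            # xyz: (N, 3) query points, t: (N, 1) normalized scan timestamps
+            basis = torch.sin(t * self.freq)                  # simple time basis
+            return self.mlp(torch.cat([xyz, basis], dim=-1))  # truncated SDF value
+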
+
+ comment: 10 pages, CVPR 2024 +
+
+
+
+
+ + ☆ Statistical Edge Detection And UDF Learning For Shape Representation + + +
+ In the field of computer vision, the numerical encoding of 3D surfaces is +crucial. It is classical to represent surfaces with their Signed Distance +Functions (SDFs) or Unsigned Distance Functions (UDFs). For tasks like +representation learning, surface classification, or surface reconstruction, +this function can be learned by a neural network, called Neural Distance +Function. This network, and in particular its weights, may serve as a +parametric and implicit representation for the surface. The network must +represent the surface as accurately as possible. In this paper, we propose a +method for learning UDFs that improves the fidelity of the obtained Neural UDF +to the original 3D surface. The key idea of our method is to concentrate the +learning effort of the Neural UDF on surface edges. More precisely, we show +that sampling more training points around surface edges allows better local +accuracy of the trained Neural UDF, and thus improves the global expressiveness +of the Neural UDF in terms of Hausdorff distance. To detect surface edges, we +propose a new statistical method based on the calculation of a $p$-value at +each point on the surface. Our method is shown to detect surface edges more +accurately than a commonly used local geometric descriptor. + +
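+
+ A minimal sketch of the edge-focused sampling step described above, assuming
+per-point p-values from the edge test are already available; the rejection
+level and boost factor are arbitrary assumptions:
+
+    import numpy as np
+
+    def sample_training_points(surface_pts, edge_pvalues, n_samples,
+                               alpha=0.05, edge_boost=4.0):
+        # surface_pts: (N, 3) surface samples, edge_pvalues: (N,) p-values;
+        # points whose test rejects the "smooth" null are oversampled
+        w = np.where(edge_pvalues < alpha, edge_boost, 1.0)
+        w = w / w.sum()
+        idx = np.random.choice(len(surface_pts), size=n_samples, replace=True, p=w)
+        return surface_pts[idx]
+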
+
+
+
+
+ + ☆ CRA5: Extreme Compression of ERA5 for Portable Global Climate and + Weather Research via an Efficient Variational Transformer + + +
+ The advent of data-driven weather forecasting models, which learn from
+hundreds of terabytes (TB) of reanalysis data, has significantly advanced
+forecasting capabilities. However, the substantial costs associated with data
+storage and transmission present a major challenge for data providers and
+users, affecting resource-constrained researchers and limiting their ability
+to participate in AI-based meteorological research. To mitigate this issue, we
+introduce an efficient neural codec, the Variational Autoencoder Transformer
+(VAEformer), for extreme compression of climate data to significantly reduce
+data storage cost, making AI-based meteorological research portable to
+researchers. Our approach diverges from recent complex neural codecs by
+utilizing a low-complexity Auto-Encoder transformer. This encoder produces a
+quantized latent representation through variational inference, which
+reparameterizes the latent space as a Gaussian distribution. This method
+improves the estimation of distributions for cross-entropy coding. Extensive
+experiments demonstrate that our VAEformer outperforms existing
+state-of-the-art compression methods in the context of climate data. By
+applying our VAEformer, we compressed the most popular ERA5 climate dataset
+(226 TB) into a new dataset, CRA5 (0.7 TB). This translates to a compression
+ratio of over 300 while retaining the dataset's utility for accurate scientific
+analysis. Further, downstream experiments show that global weather forecasting
+models trained on the compact CRA5 dataset achieve forecasting accuracy
+comparable to the model trained on the original dataset. Code, the CRA5
+dataset, and the pre-trained model are available at
+https://github.com/taohan10200/CRA5.
+
+
+ comment: Main text and supplementary, 22 pages +
+
+
+
+
+ + ☆ Knowledge-aware Text-Image Retrieval for Remote Sensing Images + + +
+ Image-based retrieval in large Earth observation archives is challenging +because one needs to navigate across thousands of candidate matches only with +the query image as a guide. By using text as information supporting the visual +query, the retrieval system gains in usability, but at the same time faces +difficulties due to the diversity of visual signals that cannot be summarized +by a short caption only. For this reason, as a matching-based task, cross-modal +text-image retrieval often suffers from information asymmetry between texts and +images. To address this challenge, we propose a Knowledge-aware Text-Image +Retrieval (KTIR) method for remote sensing images. By mining relevant +information from an external knowledge graph, KTIR enriches the text scope +available in the search query and alleviates the information gaps between texts +and images for better matching. Moreover, by integrating domain-specific +knowledge, KTIR also enhances the adaptation of pre-trained vision-language +models to remote sensing applications. Experimental results on three commonly +used remote sensing text-image retrieval benchmarks show that the proposed +knowledge-aware method leads to varied and consistent retrievals, outperforming +state-of-the-art retrieval methods. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ On the Theory of Cross-Modality Distillation with Contrastive Learning + + +
+ Cross-modality distillation arises as an important topic for data modalities
+containing limited knowledge such as depth maps and high-quality sketches. Such
+techniques are of great importance, especially for memory and
+privacy-restricted scenarios where labeled training data is generally
+unavailable. To solve the problem, existing label-free methods leverage a small
+amount of paired unlabeled data to distill the knowledge by aligning features
+or statistics between the source and target modalities. For instance, one
+typically aims to minimize the L2 distance or contrastive loss between the
+learned features of pairs of samples in the source (e.g., image) and the target
+(e.g., sketch) modalities. However, most algorithms in this domain only focus
+on the experimental results but lack theoretical insight. To bridge the gap
+between theory and practice in cross-modality distillation, we first formulate
+a general framework of cross-modality contrastive distillation (CMCD), built
+upon contrastive learning that leverages both positive and negative
+correspondence, towards a better distillation of generalizable features.
+Furthermore, we establish a thorough convergence analysis that reveals that the
+distance between source and target modalities significantly impacts the test
+error on downstream tasks within the target modality, a finding that is also
+validated by the empirical results. Extensive experimental results show that
+our algorithm outperforms existing algorithms consistently by a margin of 2-3%
+across diverse modalities and tasks, covering the image, sketch, depth map, and
+audio modalities and the tasks of recognition and segmentation.
+
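+
+ A short sketch of a generic contrastive objective over paired modalities, in
+the spirit of the CMCD framework described above (not its exact formulation);
+the feature extractors, batch construction, and temperature are assumptions:
+
+    import torch
+    import torch.nn.functional as F
+
+    def cross_modal_contrastive_loss(f_src, f_tgt, tau=0.07):
+        # f_src: (B, d) source-modality features (e.g. images),
+        # f_tgt: (B, d) paired target-modality features (e.g. sketches)
+        z_s = F.normalize(f_src, dim=1)
+        z_t = F.normalize(f_tgt, dim=1)
+        logits = z_t @ z_s.t() / tau               # every other sample is a negative
+        labels = torch.arange(z_s.size(0), device=z_s.device)
+        return F.cross_entropy(logits, labels)     # positive pair sits on the diagonal
+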
+
+
+
+
+ + ☆ Salient Object Detection From Arbitrary Modalities + + +
+ Toward desirable saliency prediction, the types and numbers of inputs for a
+salient object detection (SOD) algorithm may dynamically change in many
+real-life applications. However, existing SOD algorithms are mainly designed or
+trained for one particular type of input, failing to generalize to other input
+types. Consequently, more types of SOD algorithms need to be prepared in
+advance for handling different types of inputs, incurring huge hardware and
+research costs. In contrast, in this paper, we propose a new type of SOD task,
+termed Arbitrary Modality SOD (AM SOD). The most prominent characteristics of
+AM SOD are that the modality types and modality numbers will be arbitrary or
+dynamically changed. The former means that the inputs to the AM SOD algorithm
+may be arbitrary modalities such as RGB, depth, or any combination of them. The
+latter indicates that the inputs may have arbitrary modality numbers as the
+input type is changed, e.g., a single-modality RGB image, dual-modality
+RGB-Depth (RGB-D) images, or triple-modality RGB-Depth-Thermal (RGB-D-T)
+images. Accordingly, a preliminary solution to the above challenges, i.e., a
+modality switch network (MSN), is proposed in this paper. In particular, a
+modality switch feature extractor (MSFE) is first designed to extract
+discriminative features from each modality effectively by introducing some
+modality indicators, which will generate some weights for modality switching.
+Subsequently, a dynamic fusion module (DFM) is proposed to adaptively fuse
+features from a variable number of modalities based on a novel Transformer
+structure. Finally, a new dataset, named AM-XD, is constructed to facilitate
+research on AM SOD. Extensive experiments demonstrate that our AM SOD method
+can effectively cope with changes in the type and number of input modalities
+for robust salient object detection.
+
+
+ comment: 15 Pages, 7 Figures, 8 Tables +
+
+
+
+
+ + ☆ Modality Prompts for Arbitrary Modality Salient Object Detection + + +
+ This paper delves into the task of arbitrary modality salient object
+detection (AM SOD), aiming to detect salient objects from arbitrary modalities,
+e.g., RGB images, RGB-D images, and RGB-D-T images. A novel modality-adaptive
+Transformer (MAT) will be proposed to investigate two fundamental challenges of
+AM SOD, i.e., more diverse modality discrepancies caused by varying modality
+types that need to be processed, and dynamic fusion design caused by an
+uncertain number of modalities present in the inputs of the multimodal fusion
+strategy. Specifically, inspired by prompt learning's ability to align the
+distributions of pre-trained models with the characteristics of downstream
+tasks by learning some prompts, MAT will first present a modality-adaptive
+feature extractor (MAFE) to tackle the diverse modality discrepancies by
+introducing a modality prompt for each modality. In the training stage, a new
+modality translation contractive (MTC) loss will be further designed to assist
+MAFE in learning those modality-distinguishable modality prompts. Accordingly,
+in the testing stage, MAFE can employ those learned modality prompts to
+adaptively adjust its feature space according to the characteristics of the
+input modalities, thus being able to extract discriminative unimodal features.
+Then, MAT will present a channel-wise and spatial-wise fusion hybrid (CSFH)
+strategy to meet the demand for dynamic fusion. For that, CSFH dedicates a
+channel-wise dynamic fusion module (CDFM) and a novel spatial-wise dynamic
+fusion module (SDFM) to fuse the unimodal features from varying numbers of
+modalities and meanwhile effectively capture cross-modal complementary semantic
+and detail information, respectively. Moreover, CSFH will carefully align CDFM
+and SDFM to different levels of unimodal features based on their
+characteristics for more effective complementary information exploitation.
+
+
+ comment: 13 pages, 7 Figures, 3 Tables +
+
+
+
+
+ + ☆ Retinexmamba: Retinex-based Mamba for Low-light Image Enhancement + + +
+ In the field of low-light image enhancement, both traditional Retinex methods +and advanced deep learning techniques such as Retinexformer have shown distinct +advantages and limitations. Traditional Retinex methods, designed to mimic the +human eye's perception of brightness and color, decompose images into +illumination and reflection components but struggle with noise management and +detail preservation under low light conditions. Retinexformer enhances +illumination estimation through traditional self-attention mechanisms, but +faces challenges with insufficient interpretability and suboptimal enhancement +effects. To overcome these limitations, this paper introduces the RetinexMamba +architecture. RetinexMamba not only captures the physical intuitiveness of +traditional Retinex methods but also integrates the deep learning framework of +Retinexformer, leveraging the computational efficiency of State Space Models +(SSMs) to enhance processing speed. This architecture features innovative +illumination estimators and damage restorer mechanisms that maintain image +quality during enhancement. Moreover, RetinexMamba replaces the IG-MSA +(Illumination-Guided Multi-Head Attention) in Retinexformer with a +Fused-Attention mechanism, improving the model's interpretability. Experimental +evaluations on the LOL dataset show that RetinexMamba outperforms existing deep +learning approaches based on Retinex theory in both quantitative and +qualitative metrics, confirming its effectiveness and superiority in enhancing +low-light images. + +
+
+
+
+
+ + ☆ Light-VQA+: A Video Quality Assessment Model for Exposure Correction + with Vision-Language Guidance + + +
+ Recently, User-Generated Content (UGC) videos have gained popularity in our
+daily lives. However, UGC videos often suffer from poor exposure due to the
+limitations of photographic equipment and techniques. Therefore, Video Exposure
+Correction (VEC) algorithms have been proposed, including Low-Light Video
+Enhancement (LLVE) and Over-Exposed Video Recovery (OEVR). Equally important to
+the VEC is the Video Quality Assessment (VQA). Unfortunately, almost all
+existing VQA models are built generally, measuring the quality of a video from
+a comprehensive perspective. As a result, Light-VQA, trained on LLVE-QA, is
+proposed for assessing LLVE. We extend the work of Light-VQA by expanding the
+LLVE-QA dataset into the Video Exposure Correction Quality Assessment (VEC-QA)
+dataset with over-exposed videos and their corresponding corrected versions. In
+addition, we propose Light-VQA+, a VQA model specialized in assessing VEC.
+Light-VQA+ differs from Light-VQA mainly in its use of the CLIP model and
+vision-language guidance during feature extraction, followed by a new module
+inspired by the Human Visual System (HVS) for more accurate assessment.
+Extensive experimental results show that our model achieves the best
+performance compared with the current State-Of-The-Art (SOTA) VQA models on the
+VEC-QA dataset and other public datasets.
+
+
+
+
+
+ + ☆ Enhancing Spatiotemporal Disease Progression Models via Latent Diffusion + and Prior Knowledge + + +
+ In this work, we introduce Brain Latent Progression (BrLP), a novel +spatiotemporal disease progression model based on latent diffusion. BrLP is +designed to predict the evolution of diseases at the individual level on 3D +brain MRIs. Existing deep generative models developed for this task are +primarily data-driven and face challenges in learning disease progressions. +BrLP addresses these challenges by incorporating prior knowledge from disease +models to enhance the accuracy of predictions. To implement this, we propose to +integrate an auxiliary model that infers volumetric changes in various brain +regions. Additionally, we introduce Latent Average Stabilization (LAS), a novel +technique to improve spatiotemporal consistency of the predicted progression. +BrLP is trained and evaluated on a large dataset comprising 11,730 T1-weighted +brain MRIs from 2,805 subjects, collected from three publicly available, +longitudinal Alzheimer's Disease (AD) studies. In our experiments, we compare +the MRI scans generated by BrLP with the actual follow-up MRIs available from +the subjects, in both cross-sectional and longitudinal settings. BrLP +demonstrates significant improvements over existing methods, with an increase +of 22% in volumetric accuracy across AD-related brain regions and 43% in image +similarity to the ground-truth scans. The ability of BrLP to generate +conditioned 3D scans at the subject level, along with the novelty of +integrating prior knowledge to enhance accuracy, represents a significant +advancement in disease progression modeling, opening new avenues for precision +medicine. The code of BrLP is available at the following link: +https://github.com/LemuelPuglisi/BrLP. + +
+
+
+
+
+ + ☆ Enhancing DETRs Variants through Improved Content Query and Similar + Query Aggregation + + +
+ The design of the query is crucial for the performance of DETR and its +variants. Each query consists of two components: a content part and a +positional one. Traditionally, the content query is initialized with a zero or +learnable embedding, lacking essential content information and resulting in +sub-optimal performance. In this paper, we introduce a novel plug-and-play +module, Self-Adaptive Content Query (SACQ), to address this limitation. The +SACQ module utilizes features from the transformer encoder to generate content +queries via self-attention pooling. This allows candidate queries to adapt to +the input image, resulting in a more comprehensive content prior and better +focus on target objects. However, this improved concentration poses a challenge +for the training process that utilizes the Hungarian matching, which selects +only a single candidate and suppresses other similar ones. To overcome this, we +propose a query aggregation strategy to cooperate with SACQ. It merges similar +predicted candidates from different queries, easing the optimization. Our +extensive experiments on the COCO dataset demonstrate the effectiveness of our +proposed approaches across six different DETR's variants with multiple +configurations, achieving an average improvement of over 1.0 AP. + +
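+
+ A rough sketch of generating image-adaptive content queries by attention
+pooling over encoder features, in the spirit of the SACQ module described
+above; the learnable-seed formulation, head count, and dimensions are
+assumptions rather than the paper's design:
+
+    import torch
+    import torch.nn as nn
+
+    class ContentQueryPooling(nn.Module):
+        def __init__(self, d_model=256, num_queries=300):
+            super().__init__()
+            self.seeds = nn.Parameter(torch.randn(num_queries, d_model))
+            self.attn = nn.MultiheadAttention(d_model, num_heads=8, batch_first=True)
+
+        def forward(self, memory):
+            # memory: (B, HW, d_model) flattened transformer-encoder features
+            seeds = self.seeds.unsqueeze(0).expand(memory.size(0), -1, -1)
+            content, _ = self.attn(seeds, memory, memory)  # queries pool image content
+            return content   # used as the content part of the decoder queries
+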
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Deep Learning-based Point Cloud Registration for Augmented + Reality-guided Surgery + + +
+ Point cloud registration aligns 3D point clouds using spatial +transformations. It is an important task in computer vision, with applications +in areas such as augmented reality (AR) and medical imaging. This work explores +the intersection of two research trends: the integration of AR into +image-guided surgery and the use of deep learning for point cloud registration. +The main objective is to evaluate the feasibility of applying deep +learning-based point cloud registration methods for image-to-patient +registration in augmented reality-guided surgery. We created a dataset of point +clouds from medical imaging and corresponding point clouds captured with a +popular AR device, the HoloLens 2. We evaluate three well-established deep +learning models in registering these data pairs. While we find that some deep +learning methods show promise, we show that a conventional registration +pipeline still outperforms them on our challenging dataset. + +
+
+ comment: 5 pages, 4 figures; accepted at IEEE ISBI 2024 +
+
+
+
+
+ + ☆ Federated Learning for Drowsiness Detection in Connected Vehicles + + +
+ Ensuring driver readiness poses challenges, yet driver monitoring systems can
+assist in determining the driver's state. By observing visual cues, such
+systems recognize various behaviors and associate them with specific
+conditions. For instance, yawning or eye blinking can indicate driver
+drowsiness. Consequently, an abundance of distributed data is generated for
+driver monitoring. Employing machine learning techniques for tasks such as
+driver drowsiness detection presents a potential solution. However,
+transmitting the data to a central machine for model training is impractical
+due to the large data size and privacy concerns. Conversely, training on a
+single vehicle would limit the available data and likely result in inferior
+performance. To address these issues, we propose a federated learning framework
+for drowsiness detection within a vehicular network, leveraging the YawDD
+dataset. Our approach achieves an accuracy of 99.2%, demonstrating its promise
+and comparability to conventional deep learning techniques. Lastly, we show how
+our model scales with varying numbers of federated clients.
+
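+
+ A minimal sketch of the server-side aggregation that a FedAvg-style framework
+typically relies on; the paper's exact aggregation rule and client setup are
+not assumed here, and the state dicts are assumed to hold PyTorch tensors:
+
+    def federated_average(client_states, client_sizes):
+        # client_states: list of model state_dicts from the vehicles,
+        # client_sizes: number of local training samples per vehicle
+        total = float(sum(client_sizes))
+        return {k: sum(sd[k].float() * (n / total)
+                       for sd, n in zip(client_states, client_sizes))
+                for k in client_states[0]}
+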
+
+ comment: 14 pages, 8 figures, 1 table, EAI INTSYS 2023 conference +
+
+
+
+
+ + ☆ Interpretable Network Visualizations: A Human-in-the-Loop Approach for + Post-hoc Explainability of CNN-based Image Classification + + +
+ Transparency and explainability in image classification are essential for +establishing trust in machine learning models and detecting biases and errors. +State-of-the-art explainability methods generate saliency maps to show where a +specific class is identified, without providing a detailed explanation of the +model's decision process. Striving to address such a need, we introduce a +post-hoc method that explains the entire feature extraction process of a +Convolutional Neural Network. These explanations include a layer-wise +representation of the features the model extracts from the input. Such features +are represented as saliency maps generated by clustering and merging similar +feature maps, to which we associate a weight derived by generalizing Grad-CAM +for the proposed methodology. To further enhance these explanations, we include +a set of textual labels collected through a gamified crowdsourcing activity and +processed using NLP techniques and Sentence-BERT. Finally, we show an approach +to generate global explanations by aggregating labels across multiple images. + +
+
+ comment: International Joint Conference on Artificial Intelligence 2024 (to be + published) +
+
+
+
+
+ + ☆ Animate Your Thoughts: Decoupled Reconstruction of Dynamic Natural + Vision from Slow Brain Activity + + +
+ Reconstructing human dynamic vision from brain activity is a challenging task
+with great scientific significance. The difficulty stems from two primary
+issues: (1) vision-processing mechanisms in the brain are highly intricate and
+not fully revealed, making it challenging to directly learn a mapping between
+fMRI and video; (2) the temporal resolution of fMRI is significantly lower than
+that of natural videos. To overcome these issues, this paper proposes a
+two-stage model named Mind-Animator, which achieves state-of-the-art
+performance on three public datasets. Specifically, during the fMRI-to-feature
+stage, we decouple semantic, structural, and motion features from fMRI through
+fMRI-vision-language tri-modal contrastive learning and sparse causal
+attention. In the feature-to-video stage, these features are merged into videos
+by an inflated Stable Diffusion model. We substantiate that the reconstructed
+video dynamics are indeed derived from fMRI, rather than hallucinations of the
+generative model, through permutation tests. Additionally, the visualization of
+voxel-wise and ROI-wise importance maps confirms the neurobiological
+interpretability of our model.
+
+
+
+
+
+ + ☆ WorldQA: Multimodal World Knowledge in Videos through Long-Chain + Reasoning + + +
+ Multimodal information, together with our knowledge, helps us understand the
+complex and dynamic world. Large language models (LLMs) and large multimodal
+models (LMMs), however, still struggle to emulate this capability. In this
+paper, we present WorldQA, a video understanding dataset designed to push the
+boundaries of multimodal world models with three appealing properties: (1)
+Multimodal Inputs: The dataset comprises 1007 question-answer pairs and 303
+videos, necessitating the analysis of both auditory and visual data for
+successful interpretation. (2) World Knowledge: We identify five essential
+types of world knowledge for question formulation. This approach challenges
+models to extend their capabilities beyond mere perception. (3) Long-Chain
+Reasoning: Our dataset requires an average of 4.45 reasoning steps, notably
+surpassing other videoQA datasets. Furthermore, we introduce WorldRetriever, an
+agent designed to synthesize expert knowledge into a coherent reasoning chain,
+thereby facilitating accurate responses to WorldQA queries. Extensive
+evaluations of 13 prominent LLMs and LMMs reveal that WorldRetriever, although
+the most effective model, achieved only 70% of human-level performance in
+multiple-choice questions. This finding highlights the necessity for further
+advancement in the reasoning and comprehension abilities of models. Our
+experiments also yield several key insights. For instance, while humans tend to
+perform better with increased frames, current LMMs, including WorldRetriever,
+show diminished performance under similar conditions. We hope that WorldQA, our
+methodology, and these insights could contribute to the future development of
+multimodal world models.
+
+
+
+
+
+ + ☆ Mind the Gap Between Synthetic and Real: Utilizing Transfer Learning to + Probe the Boundaries of Stable Diffusion Generated Data + + +
+ Generative foundation models like Stable Diffusion comprise a diverse
+spectrum of knowledge in computer vision with the potential for transfer
+learning, e.g., via generating data to train student models for downstream
+tasks. This could circumvent the necessity of collecting labeled real-world
+data, thereby presenting a form of data-free knowledge distillation. However,
+the resultant student models show a significant drop in accuracy compared to
+models trained on real data. We investigate possible causes for this drop and
+focus on the role of the different layers of the student model. By training
+these layers using either real or synthetic data, we reveal that the drop
+mainly stems from the model's final layers. Further, we briefly investigate
+other factors, such as differences in data normalization between synthetic and
+real data, the impact of data augmentations, texture vs. shape learning, and
+the assumption of oracle prompts. While we find that some of those factors can
+have an impact, they are not sufficient to close the gap towards real data.
+Building upon our insight that mainly the later layers are responsible for the
+drop, we investigate the data-efficiency of fine-tuning a synthetically trained
+model with real data applied only to those last layers. Our results suggest an
+improved trade-off between the amount of real training data used and the
+model's accuracy. Our findings contribute to the understanding of the gap
+between synthetic and real data and indicate solutions to mitigate the scarcity
+of labeled real data.
+
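+
+ A short sketch of the "fine-tune only the last layers with real data" setup
+discussed above, assuming a torchvision ResNet-50 as the student; the paper's
+actual student architecture and layer split are not assumed:
+
+    import torch
+    import torchvision
+
+    model = torchvision.models.resnet50(weights=None)  # pretrained on synthetic data in practice
+    for p in model.parameters():
+        p.requires_grad = False                # keep the synthetically trained early layers
+    for p in model.layer4.parameters():
+        p.requires_grad = True                 # adapt only the last residual stage ...
+    for p in model.fc.parameters():
+        p.requires_grad = True                 # ... and the classifier head on real data
+    optim = torch.optim.SGD((p for p in model.parameters() if p.requires_grad),
+                            lr=1e-3, momentum=0.9)
+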
+
+
+
+
+ + ☆ Cross-Modal Domain Adaptation in Brain Disease Diagnosis: Maximum Mean + Discrepancy-based Convolutional Neural Networks + + +
+ Brain disorders are a major challenge to global health, causing millions of
+deaths each year. Accurate diagnosis of these diseases relies heavily on
+advanced medical imaging techniques such as Magnetic Resonance Imaging (MRI)
+and Computed Tomography (CT). However, the scarcity of annotated data poses a
+significant challenge in deploying machine learning models for medical
+diagnosis. To address this limitation, deep learning techniques have shown
+considerable promise. Domain adaptation techniques enhance a model's ability to
+generalize across imaging modalities by transferring knowledge from one domain
+(e.g., CT images) to another (e.g., MRI images). Such cross-modality adaptation
+is essential to improve the ability of models to consistently generalize across
+different imaging modalities. This study collected relevant resources from the
+Kaggle website and employed the Maximum Mean Discrepancy (MMD) method - a
+popular domain adaptation method - to reduce the differences between imaging
+domains. By combining MMD with Convolutional Neural Networks (CNNs), the
+accuracy and utility of the model are markedly enhanced. The experimental
+results highlight the potential of data-driven domain adaptation techniques to
+improve diagnostic accuracy and efficiency, especially in resource-limited
+environments. By bridging the gap between different imaging modalities, the
+study aims to provide clinicians with more reliable diagnostic tools.
+
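+
+ A compact sketch of the MMD term that such a training objective adds on top
+of the CNN's task loss; the RBF kernel and single bandwidth are common
+simplifying assumptions, not necessarily the paper's exact choice:
+
+    import torch
+
+    def mmd_rbf(x, y, sigma=1.0):
+        # x: (n, d) source-domain features (e.g. CT), y: (m, d) target-domain features (e.g. MRI)
+        def k(a, b):
+            d2 = torch.cdist(a, b) ** 2
+            return torch.exp(-d2 / (2 * sigma ** 2))
+        return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()
+
+    # total loss sketch: loss = task_loss + lambda_mmd * mmd_rbf(feat_src, feat_tgt)
+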
+
+
+
+
+ + ☆ Spatial and Surface Correspondence Field for Interaction Transfer SIGGRAPH 2024 + + +
+ In this paper, we introduce a new method for the task of interaction +transfer. Given an example interaction between a source object and an agent, +our method can automatically infer both surface and spatial relationships for +the agent and target objects within the same category, yielding more accurate +and valid transfers. Specifically, our method characterizes the example +interaction using a combined spatial and surface representation. We correspond +the agent points and object points related to the representation to the target +object space using a learned spatial and surface correspondence field, which +represents objects as deformed and rotated signed distance fields. With the +corresponded points, an optimization is performed under the constraints of our +spatial and surface interaction representation and additional regularization. +Experiments conducted on human-chair and hand-mug interaction transfer tasks +show that our approach can handle larger geometry and topology variations +between source and target shapes, significantly outperforming state-of-the-art +methods. + +
+
+ comment: Accepted to SIGGRAPH 2024, project page at + https://vcc.tech/research/2024/InterTransfer +
+
+
+
+
+ + ☆ Elevator, Escalator or Neither? Classifying Pedestrian Conveyor State + Using Inertial Navigation System + + +
+ Classifying a pedestrian in one of the three conveyor states of "elevator," +"escalator" and "neither" is fundamental to many applications such as indoor +localization and people flow analysis. We estimate, for the first time, the +pedestrian conveyor state given the inertial navigation system (INS) readings +of accelerometer, gyroscope and magnetometer sampled from the phone. Our +problem is challenging because the INS signals of the conveyor state are +coupled and perturbed by unpredictable arbitrary human actions, confusing the +decision process. We propose ELESON, a novel, effective and lightweight +INS-based deep learning approach to classify whether a pedestrian is in an +elevator, escalator or neither. ELESON utilizes a motion feature extractor to +decouple the conveyor state from human action in the feature space, and a +magnetic feature extractor to account for the speed difference between elevator +and escalator. Given the results of the extractors, it employs an evidential +state classifier to estimate the confidence of the pedestrian states. Based on +extensive experiments conducted on twenty hours of real pedestrian data, we +demonstrate that ELESON outperforms significantly the state-of-the-art +approaches (where combined INS signals of both the conveyor state and human +actions are processed together), with 15% classification improvement in F1 +score, stronger confidence discriminability with 10% increase in AUROC (Area +Under the Receiver Operating Characteristics), and low computational and memory +requirements on smartphones. + +
+
+
+
+
+ + ☆ Hierarchical Space-Time Attention for Micro-Expression Recognition + + +
+ Micro-expression recognition (MER) aims to recognize the short and subtle +facial movements from the Micro-expression (ME) video clips, which reveal real +emotions. Recent MER methods mostly only utilize special frames from ME video +clips or extract optical flow from these special frames. However, they neglect +the relationship between movements and space-time, while facial cues are hidden +within these relationships. To solve this issue, we propose the Hierarchical +Space-Time Attention (HSTA). Specifically, we first process ME video frames and +special frames or data parallelly by our cascaded Unimodal Space-Time Attention +(USTA) to establish connections between subtle facial movements and specific +facial areas. Then, we design Crossmodal Space-Time Attention (CSTA) to achieve +a higher-quality fusion for crossmodal data. Finally, we hierarchically +integrate USTA and CSTA to grasp the deeper facial cues. Our model emphasizes +temporal modeling without neglecting the processing of special data, and it +fuses the contents in different modalities while maintaining their respective +uniqueness. Extensive experiments on the four benchmarks show the effectiveness +of our proposed HSTA. Specifically, compared with the latest method on the +CASME3 dataset, it achieves about 3% score improvement in seven-category +classification. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ StyleSeg V2: Towards Robust One-shot Segmentation of Brain Tissue via + Optimization-free Registration Error Perception + + +
+ One-shot segmentation of brain tissue requires iteratively training a
+registration-segmentation (reg-seg) dual model, where the reg-model provides
+pseudo masks of unlabeled images for the seg-model by warping a
+carefully-labeled atlas. However, the imperfect reg-model induces image-mask
+misalignment, subsequently poisoning the seg-model. The recent StyleSeg
+bypasses this bottleneck by replacing the unlabeled images with warped copies
+of the atlas, but needs to borrow the diverse image patterns via style
+transformation. Here, we present StyleSeg V2, which inherits from StyleSeg but
+gains the ability to perceive registration errors. The motivation is that good
+registration behaves in a mirrored fashion for mirrored images. Therefore,
+almost at no cost, StyleSeg V2 can have the reg-model itself "speak out"
+incorrectly aligned regions by simply mirroring (symmetrically flipping the
+brain) its input, and the registration errors are the symmetric
+inconsistencies between the outputs of the original and mirrored inputs.
+Consequently, StyleSeg V2 allows the seg-model to make use of correctly aligned
+regions of unlabeled images and also enhances the fidelity of the
+style-transformed warped atlas image by weighting the local transformation
+strength according to registration errors. The experimental results on three
+public datasets demonstrate that our proposed StyleSeg V2 outperforms other
+state-of-the-art methods by considerable margins, and exceeds StyleSeg by
+increasing the average Dice by at least 2.4%.
+
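+
+ A rough sketch of the mirroring check described above, assuming a
+registration network that returns a dense displacement field and that the last
+tensor axis is the left-right axis whose displacement component sits in
+channel 0 (both are assumptions, not the paper's interface):
+
+    import torch
+
+    def mirror_inconsistency(reg_model, moving, fixed):
+        disp = reg_model(moving, fixed)                        # (B, 3, D, H, W)
+        disp_m = reg_model(torch.flip(moving, dims=[-1]),
+                           torch.flip(fixed, dims=[-1]))       # mirrored pair
+        disp_back = torch.flip(disp_m, dims=[-1])              # map back to original frame
+        disp_back[:, 0] = -disp_back[:, 0]                     # negate the mirrored-axis component
+        # large values indicate regions where registration is likely unreliable
+        return (disp - disp_back).abs().sum(dim=1, keepdim=True)
+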
+
+ comment: 9 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ CityLLaVA: Efficient Fine-Tuning for VLMs in City Scenario CVPR2024 + + +
+ In the vast and dynamic landscape of urban settings, Traffic Safety +Description and Analysis plays a pivotal role in applications ranging from +insurance inspection to accident prevention. This paper introduces CityLLaVA, a +novel fine-tuning framework for Visual Language Models (VLMs) designed for +urban scenarios. CityLLaVA enhances model comprehension and prediction accuracy +through (1) employing bounding boxes for optimal visual data preprocessing, +including video best-view selection and visual prompt engineering during both +training and testing phases; (2) constructing concise Question-Answer sequences +and designing textual prompts to refine instruction comprehension; (3) +implementing block expansion to fine-tune large VLMs efficiently; and (4) +advancing prediction accuracy via a unique sequential questioning-based +prediction augmentation. Demonstrating top-tier performance, our method +achieved a benchmark score of 33.4308, securing the leading position on the +leaderboard. The code can be found: +https://github.com/alibaba/AICITY2024_Track2_AliOpenTrek_CityLLaVA + +
+
+ comment: Accepted by AICITY2024 Workshop Track2 at CVPR2024 +
+
+
+
+
+ + ☆ Exploring Frequencies via Feature Mixing and Meta-Learning for Improving + Adversarial Transferability + + +
+ Recent studies have shown that Deep Neural Networks (DNNs) are susceptible to +adversarial attacks, with frequency-domain analysis underscoring the +significance of high-frequency components in influencing model predictions. +Conversely, targeting low-frequency components has been effective in enhancing +attack transferability on black-box models. In this study, we introduce a +frequency decomposition-based feature mixing method to exploit these frequency +characteristics in both clean and adversarial samples. Our findings suggest +that incorporating features of clean samples into adversarial features +extracted from adversarial examples is more effective in attacking +normally-trained models, while combining clean features with the adversarial +features extracted from low-frequency parts decomposed from the adversarial +samples yields better results in attacking defense models. However, a conflict +issue arises when these two mixing approaches are employed simultaneously. To +tackle the issue, we propose a cross-frequency meta-optimization approach +comprising the meta-train step, meta-test step, and final update. In the +meta-train step, we leverage the low-frequency components of adversarial +samples to boost the transferability of attacks against defense models. +Meanwhile, in the meta-test step, we utilize adversarial samples to stabilize +gradients, thereby enhancing the attack's transferability against normally +trained models. For the final update, we update the adversarial sample based on +the gradients obtained from both meta-train and meta-test steps. Our proposed +method is evaluated through extensive experiments on the ImageNet-Compatible +dataset, affirming its effectiveness in improving the transferability of +attacks on both normally-trained CNNs and defense models. + The source code is available at https://github.com/WJJLL/MetaSSA. + +
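+
+ A small sketch of the frequency split that underlies the mixing strategy
+described above; the circular low-pass mask and cutoff value are simplifying
+assumptions:
+
+    import torch
+
+    def split_frequencies(x, radius=0.1):
+        # x: (B, C, H, W); returns (low-frequency part, high-frequency part)
+        H, W = x.shape[-2:]
+        fy = torch.fft.fftfreq(H, device=x.device).view(H, 1)
+        fx = torch.fft.fftfreq(W, device=x.device).view(1, W)
+        mask = ((fy ** 2 + fx ** 2).sqrt() <= radius).to(x.dtype)
+        low = torch.fft.ifft2(torch.fft.fft2(x) * mask).real
+        return low, x - low
+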
+
+
+
+
+ + ☆ Adapting Dual-encoder Vision-language Models for Paraphrased Retrieval + + +
+ In recent years, dual-encoder vision-language models (e.g., CLIP) have
+achieved remarkable text-to-image retrieval performance. However, we discover
+that these models usually return very different retrievals for a pair of
+paraphrased queries. Such behavior might render the retrieval system less
+predictable and lead to user frustration. In this work, we consider the task of
+paraphrased text-to-image retrieval where a model aims to return similar
+results given a pair of paraphrased queries. To start with, we collect a
+dataset of paraphrased image descriptions to facilitate quantitative evaluation
+for this task. We then hypothesize that the undesired behavior of existing
+dual-encoder models is due to their text towers, which are trained on
+image-sentence pairs and lack the ability to capture the semantic similarity
+between paraphrased queries. To improve on this, we investigate multiple
+strategies for training a dual-encoder model starting from a language model
+pretrained on a large text corpus. Compared to public dual-encoder models such
+as CLIP and OpenCLIP, the model trained with our best adaptation strategy
+achieves a significantly higher ranking similarity for paraphrased queries
+while maintaining similar zero-shot classification and retrieval accuracy.
+
+
+
+
+
+ + ☆ Transformer-based RGB-T Tracking with Channel and Spatial Feature Fusion + + +
+ Complementary RGB and TIR modalities enable RGB-T tracking to achieve +competitive performance in challenging scenarios. Therefore, how to better fuse +cross-modal features is the core issue of RGB-T tracking. Some previous methods +either insufficiently fuse RGB and TIR features, or depend on intermediaries +containing information from both modalities to achieve cross-modal information +interaction. The former does not fully exploit the potential of using only RGB +and TIR information of the template or search region for channel and spatial +feature fusion, and the latter lacks direct interaction between the template +and search area, which limits the model's ability to fully exploit the original +semantic information of both modalities. To alleviate these limitations, we +explore how to improve the performance of a visual Transformer by using direct +fusion of cross-modal channels and spatial features, and propose CSTNet. CSTNet +uses ViT as a backbone and inserts cross-modal channel feature fusion modules +(CFM) and cross-modal spatial feature fusion modules (SFM) for direct +interaction between RGB and TIR features. The CFM performs parallel joint +channel enhancement and joint multilevel spatial feature modeling of RGB and +TIR features and sums the features, and then globally integrates the sum +feature with the original features. The SFM uses cross-attention to model the +spatial relationship of cross-modal features and then introduces a +convolutional feedforward network for joint spatial and channel integration of +multimodal features. Comprehensive experiments show that CSTNet achieves +state-of-the-art performance on three public RGB-T tracking benchmarks. Code is +available at https://github.com/LiYunfengLYF/CSTNet. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ The Role of Predictive Uncertainty and Diversity in Embodied AI and + Robot Learning + + +
+ Uncertainty has long been a critical area of study in robotics, particularly +when robots are equipped with analytical models. As we move towards the +widespread use of deep neural networks in robots, which have demonstrated +remarkable performance in research settings, understanding the nuances of +uncertainty becomes crucial for their real-world deployment. This guide offers +an overview of the importance of uncertainty and provides methods to quantify +and evaluate it from an applications perspective. + +
+
+
+
+
+ + ☆ Advancing Multimodal Medical Capabilities of Gemini + + +
+ Many clinical tasks require an understanding of specialized data, such as +medical images and genomics, which is not typically found in general-purpose +large multimodal models. Building upon Gemini's multimodal models, we develop +several models within the new Med-Gemini family that inherit core capabilities +of Gemini and are optimized for medical use via fine-tuning with 2D and 3D +radiology, histopathology, ophthalmology, dermatology and genomic data. +Med-Gemini-2D sets a new standard for AI-based chest X-ray (CXR) report +generation based on expert evaluation, exceeding previous best results across +two separate datasets by an absolute margin of 1% and 12%, where 57% and 96% of +AI reports on normal cases, and 43% and 65% on abnormal cases, are evaluated as +"equivalent or better" than the original radiologists' reports. We demonstrate +the first ever large multimodal model-based report generation for 3D computed +tomography (CT) volumes using Med-Gemini-3D, with 53% of AI reports considered +clinically acceptable, although additional research is needed to meet expert +radiologist reporting quality. Beyond report generation, Med-Gemini-2D +surpasses the previous best performance in CXR visual question answering (VQA) +and performs well in CXR classification and radiology VQA, exceeding SoTA or +baselines on 17 of 20 tasks. In histopathology, ophthalmology, and dermatology +image classification, Med-Gemini-2D surpasses baselines across 18 out of 20 +tasks and approaches task-specific model performance. Beyond imaging, +Med-Gemini-Polygenic outperforms the standard linear polygenic risk score-based +approach for disease risk prediction and generalizes to genetically correlated +diseases for which it has never been trained. Although further development and +evaluation are necessary in the safety-critical medical domain, our results +highlight the potential of Med-Gemini across a wide range of medical tasks. + +
+
+
+
+
+ + ☆ DeepMpMRI: Tensor-decomposition Regularized Learning for Fast and + High-Fidelity Multi-Parametric Microstructural MR Imaging + + +
+ Deep learning has emerged as a promising approach for learning the nonlinear +mapping between diffusion-weighted MR images and tissue parameters, which +enables automatic and deep understanding of the brain microstructures. However, +the efficiency and accuracy in the multi-parametric estimations are still +limited since previous studies tend to estimate multi-parametric maps with +dense sampling and isolated signal modeling. This paper proposes DeepMpMRI, a +unified framework for fast and high-fidelity multi-parametric estimation from +various diffusion models using sparsely sampled q-space data. DeepMpMRI is +equipped with a newly designed tensor-decomposition-based regularizer to +effectively capture fine details by exploiting the correlation across +parameters. In addition, we introduce a Nesterov-based adaptive learning +algorithm that optimizes the regularization parameter dynamically to enhance +the performance. DeepMpMRI is an extendable framework capable of incorporating +flexible network architecture. Experimental results demonstrate the superiority +of our approach over 5 state-of-the-art methods in simultaneously estimating +multi-parametric maps for various diffusion models with fine-grained details +both quantitatively and qualitatively, achieving 4.5 - 22.5$\times$ +acceleration compared to the dense sampling of a total of 270 diffusion +gradients. + +
+
+
+
+
+ + ☆ Video Diffusion Models: A Survey + + +
+ Diffusion generative models have recently become a robust technique for +producing and modifying coherent, high-quality video. This survey offers a +systematic overview of critical elements of diffusion models for video +generation, covering applications, architectural choices, and the modeling of +temporal dynamics. Recent advancements in the field are summarized and grouped +into development trends. The survey concludes with an overview of remaining +challenges and an outlook on the future of the field. Website: +https://github.com/ndrwmlnk/Awesome-Video-Diffusion-Models + +
+
+
+
+
+ + ☆ PTQ4SAM: Post-Training Quantization for Segment Anything CVPR 2024 + + +
+ Segment Anything Model (SAM) has achieved impressive performance in many +computer vision tasks. However, as a large-scale model, the immense memory and +computation costs hinder its practical deployment. In this paper, we propose a +post-training quantization (PTQ) framework for Segment Anything Model, namely +PTQ4SAM. First, we investigate the inherent bottleneck of SAM quantization +attributed to the bimodal distribution in post-Key-Linear activations. We +analyze its characteristics from both per-tensor and per-channel perspectives, +and propose a Bimodal Integration strategy, which utilizes a mathematically +equivalent sign operation to transform the bimodal distribution into a +relatively easy-quantized normal distribution offline. Second, SAM encompasses +diverse attention mechanisms (i.e., self-attention and two-way +cross-attention), resulting in substantial variations in the post-Softmax +distributions. Therefore, we introduce an Adaptive Granularity Quantization for +Softmax through searching the optimal power-of-two base, which is +hardware-friendly. Extensive experimental results across various vision tasks +(instance segmentation, semantic segmentation and object detection), datasets +and model variants show the superiority of PTQ4SAM. For example, when +quantizing SAM-L to 6-bit, we achieve lossless accuracy for instance +segmentation, about 0.5\% drop with theoretical 3.9$\times$ acceleration. The +code is available at \url{https://github.com/chengtao-lv/PTQ4SAM}. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Automatic Ultrasound Curve Angle Measurement via Affinity Clustering for + Adolescent Idiopathic Scoliosis Evaluation + + +
+ The current clinical gold standard for evaluating adolescent idiopathic +scoliosis (AIS) is X-ray radiography, using Cobb angle measurement. However, +the frequent monitoring of the AIS progression using X-rays poses a challenge +due to the cumulative radiation exposure. Although 3D ultrasound has been +validated as a reliable and radiation-free alternative for scoliosis +assessment, the process of measuring spinal curvature is still carried out +manually. Consequently, there is a considerable demand for a fully automatic +system that can locate bony landmarks and perform angle measurements. To this +end, we introduce an estimation model for automatic ultrasound curve angle +(UCA) measurement. The model employs a dual-branch network to detect candidate +landmarks and perform vertebra segmentation on ultrasound coronal images. An +affinity clustering strategy is utilized within the vertebral segmentation area +to illustrate the affinity relationship between candidate landmarks. +Subsequently, we can efficiently perform line delineation from a clustered +affinity map for UCA measurement. As our method is specifically designed for +UCA calculation, this method outperforms other state-of-the-art methods for +landmark and line detection tasks. The high correlation between the automatic +UCA and Cobb angle (R$^2$=0.858) suggests that our proposed method can +potentially replace manual UCA measurement in ultrasound scoliosis assessment. + +
+
+
+
+
+ + ☆ AniTalker: Animate Vivid and Diverse Talking Faces through + Identity-Decoupled Facial Motion Encoding + + +
+ The paper introduces AniTalker, an innovative framework designed to generate +lifelike talking faces from a single portrait. Unlike existing models that +primarily focus on verbal cues such as lip synchronization and fail to capture +the complex dynamics of facial expressions and nonverbal cues, AniTalker +employs a universal motion representation. This innovative representation +effectively captures a wide range of facial dynamics, including subtle +expressions and head movements. AniTalker enhances motion depiction through two +self-supervised learning strategies: the first involves reconstructing target +video frames from source frames within the same identity to learn subtle motion +representations, and the second develops an identity encoder using metric +learning while actively minimizing mutual information between the identity and +motion encoders. This approach ensures that the motion representation is +dynamic and devoid of identity-specific details, significantly reducing the +need for labeled data. Additionally, the integration of a diffusion model with +a variance adapter allows for the generation of diverse and controllable facial +animations. This method not only demonstrates AniTalker's capability to create +detailed and realistic facial movements but also underscores its potential in +crafting dynamic avatars for real-world applications. Synthetic results can be +viewed at https://github.com/X-LANCE/AniTalker. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Intra-task Mutual Attention based Vision Transformer for Few-Shot + Learning + + +
+ Humans possess a remarkable ability to accurately classify new, unseen images
+after being exposed to only a few examples. Such ability stems from their
+capacity to identify common features shared between new and previously seen
+images while disregarding distractions such as background variations. However,
+for artificial neural network models, determining the most relevant features
+for distinguishing between two images with limited samples presents a
+challenge. In this paper, we propose an intra-task mutual attention method for
+few-shot learning that involves splitting the support and query samples into
+patches and encoding them using the pre-trained Vision Transformer (ViT)
+architecture. Specifically, we swap the class (CLS) token and patch tokens
+between the support and query sets to obtain mutual attention, which enables
+each set to focus on the most useful information. This facilitates the
+strengthening of intra-class representations and promotes closer proximity
+between instances of the same class. For implementation, we adopt the ViT-based
+network architecture and utilize pre-trained model parameters obtained through
+self-supervision. By leveraging Masked Image Modeling as a self-supervised
+training task for pre-training, the pre-trained model yields semantically
+meaningful representations while successfully avoiding supervision collapse. We
+then employ a meta-learning method to fine-tune the last several layers and CLS
+token modules. Our strategy significantly reduces the number of parameters that
+require fine-tuning while effectively utilizing the capability of the
+pre-trained model. Extensive experiments show that our framework is simple,
+effective and computationally efficient, achieving superior performance as
+compared to the state-of-the-art baselines on five popular few-shot
+classification benchmarks under the 5-shot and 1-shot scenarios.
+
+
+
+
+
+ + ☆ GeoContrastNet: Contrastive Key-Value Edge Learning for + Language-Agnostic Document Understanding ICDAR 2024 + + +
+ This paper presents GeoContrastNet, a language-agnostic framework to +structured document understanding (DU) by integrating a contrastive learning +objective with graph attention networks (GATs), emphasizing the significant +role of geometric features. We propose a novel methodology that combines +geometric edge features with visual features within an overall two-staged +GAT-based framework, demonstrating promising results in both link prediction +and semantic entity recognition performance. Our findings reveal that combining +both geometric and visual features could match the capabilities of large DU +models that rely heavily on Optical Character Recognition (OCR) features in +terms of performance accuracy and efficiency. This approach underscores the +critical importance of relational layout information between the named text +entities in a semi-structured layout of a page. Specifically, our results +highlight the model's proficiency in identifying key-value relationships within +the FUNSD dataset for forms and also discovering the spatial relationships in +table-structured layouts for RVLCDIP business invoices. Our code and pretrained +models will be accessible on our official GitHub. + +
+
+ comment: Accepted in ICDAR 2024 (Athens, Greece) +
+
+
+
+
+ + ☆ Learning from Students: Applying t-Distributions to Explore Accurate and + Efficient Formats for LLMs ICML 2024 + + +
+ Large language models (LLMs) have recently achieved state-of-the-art +performance across various tasks, yet due to their large computational +requirements, they struggle with strict latency and power demands. Deep neural +network (DNN) quantization has traditionally addressed these limitations by +converting models to low-precision integer formats. Yet recently alternative +formats, such as Normal Float (NF4), have been shown to consistently increase +model accuracy, albeit at the cost of increased chip area. In this work, we +first conduct a large-scale analysis of LLM weights and activations across 30 +networks to conclude most distributions follow a Student's t-distribution. We +then derive a new theoretically optimal format, Student Float (SF4), with +respect to this distribution, that improves over NF4 across modern LLMs, for +example increasing the average accuracy on LLaMA2-7B by 0.76% across tasks. +Using this format as a high-accuracy reference, we then propose augmenting E2M1 +with two variants of supernormal support for higher model accuracy. Finally, we +explore the quality and performance frontier across 11 datatypes, including +non-traditional formats like Additive-Powers-of-Two (APoT), by evaluating their +model accuracy and hardware complexity. We discover a Pareto curve composed of +INT4, E2M1, and E2M1 with supernormal support, which offers a continuous +tradeoff between model accuracy and chip area. For example, E2M1 with +supernormal support increases the accuracy of Phi-2 by up to 2.19% with 1.22% +area overhead, enabling more LLM-based applications to be run at four bits. + +
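+
+ A small sketch of deriving 4-bit levels from Student's t quantiles, analogous
+to how NF4 levels come from Gaussian quantiles; the symmetric quantile spacing,
+degrees of freedom, and normalization are simplifying assumptions rather than
+the paper's exact SF4 construction:
+
+    import numpy as np
+    from scipy.stats import t as student_t
+
+    def t_quantile_levels(df=4.0, n_levels=16):
+        q = np.linspace(0.0, 1.0, n_levels + 2)[1:-1]   # drop the infinite tails
+        levels = student_t.ppf(q, df)
+        return levels / np.abs(levels).max()            # normalize to [-1, 1]
+
+    # weights would then be scaled per block and rounded to the nearest level
+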
+
+ comment: Accepted to ICML 2024 +
+
+
+
+
+ + ☆ SketchGPT: Autoregressive Modeling for Sketch Generation and Recognition ICDAR 2024 + + +
+ We present SketchGPT, a flexible framework that employs a +sequence-to-sequence autoregressive model for sketch generation, and +completion, and an interpretation case study for sketch recognition. By mapping +complex sketches into simplified sequences of abstract primitives, our approach +significantly streamlines the input for autoregressive modeling. SketchGPT +leverages the next token prediction objective strategy to understand sketch +patterns, facilitating the creation and completion of drawings and also +categorizing them accurately. This proposed sketch representation strategy aids +in overcoming existing challenges of autoregressive modeling for continuous +stroke data, enabling smoother model training and competitive performance. Our +findings exhibit SketchGPT's capability to generate a diverse variety of +drawings by adding both qualitative and quantitative comparisons with existing +state-of-the-art, along with a comprehensive human evaluation study. The code +and pretrained models will be released on our official GitHub. + +
+
+ comment: Accepted in ICDAR 2024 +
+
+
+
+
+ + ☆ Research on Image Recognition Technology Based on Multimodal Deep + Learning + + +
+ This project investigates a human multi-modal behavior identification
+algorithm utilizing deep neural networks. According to the characteristics of
+the different modalities, different deep neural networks are used to adapt to
+the corresponding modal video information, and their integration allows the
+algorithm to identify behaviors across multiple modalities. In this project,
+multiple Microsoft Kinect cameras were used to collect skeleton keypoint data
+alongside conventional images, so that motion features in the image can be
+extracted. Ultimately, the behavioral characteristics discerned through both
+approaches are synthesized to facilitate the precise identification and
+categorization of behaviors. The performance of the suggested algorithm was
+evaluated using the MSR3D dataset. The findings from these experiments
+indicate that the accuracy in recognizing behaviors remains consistently
+high, suggesting that the algorithm is reliable in various scenarios.
+Additionally, the tests demonstrate that the algorithm substantially enhances
+the accuracy of detecting pedestrian behaviors in video footage.
+
+
+
+
+
+ + ☆ A 65nm 36nJ/Decision Bio-inspired Temporal-Sparsity-Aware Digital + Keyword Spotting IC with 0.6V Near-Threshold SRAM + + +
+ This paper introduces, to the best of the authors' knowledge, the first +fine-grained temporal sparsity-aware keyword spotting (KWS) IC leveraging +temporal similarities between neighboring feature vectors extracted from input +frames and network hidden states, eliminating unnecessary operations and memory +accesses. This KWS IC, featuring a bio-inspired delta-gated recurrent neural +network ({\Delta}RNN) classifier, achieves an 11-class Google Speech Command +Dataset (GSCD) KWS accuracy of 90.5% and energy consumption of 36nJ/decision. +At 87% temporal sparsity, computing latency and energy per inference are +reduced by 2.4$\times$/3.4$\times$, respectively. The 65nm design occupies +0.78mm$^2$ and features two additional blocks, a compact 0.084mm$^2$ digital +infinite-impulse-response (IIR)-based band-pass filter (BPF) audio feature +extractor (FEx) and a 24kB 0.6V near-Vth weight SRAM with 6.6$\times$ lower +read power compared to the standard SRAM. + +
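+ A minimal sketch of the delta principle the classifier exploits: an input
+element only triggers weight-column updates when it has changed by more than
+a threshold since it last fired, so slowly varying features skip most work.
+The plain tanh cell, sizes and threshold are illustrative assumptions, not
+the IC's exact delta-gated RNN.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+D, H, T, theta = 16, 32, 100, 0.1
+Wx = rng.normal(0.0, 0.1, (H, D))
+Wh = rng.normal(0.0, 0.1, (H, H))
+
+x = np.cumsum(rng.normal(0.0, 0.05, (T, D)), axis=0)  # slowly varying features
+h = np.zeros(H)
+x_ref = np.zeros(D)      # last input value that actually fired, per element
+pre = np.zeros(H)        # running accumulator of Wx @ x_ref
+skipped = total = 0
+
+for step in range(T):
+    delta = x[step] - x_ref
+    fire = np.abs(delta) > theta
+    x_ref[fire] = x[step][fire]           # update only elements that fired
+    pre += Wx[:, fire] @ delta[fire]      # sparse column update, rest skipped
+    h = np.tanh(pre + Wh @ h)
+    skipped += int((~fire).sum())
+    total += D
+
+print(f"skipped input updates: {skipped / total:.0%}")
+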
+
+
+
+
+ + ☆ MVDiff: Scalable and Flexible Multi-View Diffusion for 3D Object + Reconstruction from Single-View CVPR + + +
+ Generating consistent multiple views for 3D reconstruction tasks is still a
+challenge for existing image-to-3D diffusion models. Generally, incorporating
+3D representations into a diffusion model decreases its speed as well as its
+generalizability and quality. This paper proposes a general framework that
+generates consistent multi-view images from a single image by leveraging a
+scene representation transformer and a view-conditioned diffusion model. In
+the model, we introduce epipolar geometry constraints and multi-view
+attention to enforce 3D consistency. From as few as one input image, our
+model is able to generate 3D meshes surpassing baseline methods in evaluation
+metrics, including PSNR, SSIM and LPIPS.
+
+
+ comment: CVPRW: Generative Models for Computer Vision +
+
+
+
+
+ + ☆ BadFusion: 2D-Oriented Backdoor Attacks against 3D Object Detection IJCAI 2024 + + +
+ 3D object detection plays an important role in autonomous driving; however, +its vulnerability to backdoor attacks has become evident. By injecting +''triggers'' to poison the training dataset, backdoor attacks manipulate the +detector's prediction for inputs containing these triggers. Existing backdoor +attacks against 3D object detection primarily poison 3D LiDAR signals, where +large-sized 3D triggers are injected to ensure their visibility within the +sparse 3D space, rendering them easy to detect and impractical in real-world +scenarios. + In this paper, we delve into the robustness of 3D object detection, exploring +a new backdoor attack surface through 2D cameras. Given the prevalent adoption +of camera and LiDAR signal fusion for high-fidelity 3D perception, we +investigate the latent potential of camera signals to disrupt the process. +Although the dense nature of camera signals enables the use of nearly +imperceptible small-sized triggers to mislead 2D object detection, realizing +2D-oriented backdoor attacks against 3D object detection is non-trivial. The +primary challenge emerges from the fusion process that transforms camera +signals into a 3D space, compromising the association with the 2D trigger to +the target output. To tackle this issue, we propose an innovative 2D-oriented +backdoor attack against LiDAR-camera fusion methods for 3D object detection, +named BadFusion, for preserving trigger effectiveness throughout the entire +fusion process. The evaluation demonstrates the effectiveness of BadFusion, +achieving a significantly higher attack success rate compared to existing +2D-oriented attacks. + +
+
+ comment: Accepted at IJCAI 2024 Conference +
+
+
+
+
+ + ☆ Trio-ViT: Post-Training Quantization and Acceleration for Softmax-Free + Efficient Vision Transformer + + +
+ Motivated by the huge success of Transformers in the field of natural +language processing (NLP), Vision Transformers (ViTs) have been rapidly +developed and achieved remarkable performance in various computer vision tasks. +However, their huge model sizes and intensive computations hinder ViTs' +deployment on embedded devices, calling for effective model compression +methods, such as quantization. Unfortunately, due to the existence of +hardware-unfriendly and quantization-sensitive non-linear operations, +particularly {Softmax}, it is non-trivial to completely quantize all operations +in ViTs, yielding either significant accuracy drops or non-negligible hardware +costs. In response to challenges associated with \textit{standard ViTs}, we +focus our attention towards the quantization and acceleration for +\textit{efficient ViTs}, which not only eliminate the troublesome Softmax but +also integrate linear attention with low computational complexity, and propose +\emph{Trio-ViT} accordingly. Specifically, at the algorithm level, we develop a +{tailored post-training quantization engine} taking the unique activation +distributions of Softmax-free efficient ViTs into full consideration, aiming to +boost quantization accuracy. Furthermore, at the hardware level, we build an +accelerator dedicated to the specific Convolution-Transformer hybrid +architecture of efficient ViTs, thereby enhancing hardware efficiency. +Extensive experimental results consistently prove the effectiveness of our +Trio-ViT framework. {Particularly, we can gain up to +$\uparrow$$\mathbf{7.2}\times$ and $\uparrow$$\mathbf{14.6}\times$ FPS under +comparable accuracy over state-of-the-art ViT accelerators, as well as +$\uparrow$$\mathbf{5.9}\times$ and $\uparrow$$\mathbf{2.0}\times$ DSP +efficiency.} Codes will be released publicly upon acceptance. + +
+
+
+
+
+ + ☆ VSA4VQA: Scaling a Vector Symbolic Architecture to Visual Question + Answering on Natural Images + + +
+ While Vector Symbolic Architectures (VSAs) are promising for modelling +spatial cognition, their application is currently limited to artificially +generated images and simple spatial queries. We propose VSA4VQA - a novel 4D +implementation of VSAs that implements a mental representation of natural +images for the challenging task of Visual Question Answering (VQA). VSA4VQA is +the first model to scale a VSA to complex spatial queries. Our method is based +on the Semantic Pointer Architecture (SPA) to encode objects in a +hyperdimensional vector space. To encode natural images, we extend the SPA to +include dimensions for object's width and height in addition to their spatial +location. To perform spatial queries we further introduce learned spatial query +masks and integrate a pre-trained vision-language model for answering +attribute-related questions. We evaluate our method on the GQA benchmark +dataset and show that it can effectively encode natural images, achieving +competitive performance to state-of-the-art deep learning methods for zero-shot +VQA. + +
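+ A minimal sketch of the vector-symbolic binding that Semantic Pointer
+Architectures build on: circular convolution binds a role to a filler, and
+binding with the approximate inverse recovers a noisy version of the filler.
+The dimensionality and the object/location example are illustrative
+assumptions, not the VSA4VQA encoding itself.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+D = 1024
+
+def unit_vec():
+    v = rng.normal(0.0, 1.0 / np.sqrt(D), D)
+    return v / np.linalg.norm(v)
+
+def bind(a, b):
+    # Circular convolution via FFT (holographic reduced representations).
+    return np.real(np.fft.ifft(np.fft.fft(a) * np.fft.fft(b)))
+
+def inverse(a):
+    # Approximate inverse: the involution [a[0], a[-1], a[-2], ..., a[1]].
+    return np.concatenate(([a[0]], a[:0:-1]))
+
+obj, loc = unit_vec(), unit_vec()
+scene = bind(obj, loc)                # encode "object AT location"
+loc_hat = bind(scene, inverse(obj))   # query: where is the object?
+
+cos = loc_hat @ loc / (np.linalg.norm(loc_hat) * np.linalg.norm(loc))
+print(f"cosine(recovered, true location) = {cos:.2f}")   # well above chance
+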
+
+ comment: To be published in the Proceedings of the Annual Meeting of the + Cognitive Science Society (CogSci'24) +
+
+
+
+
+ + ☆ Enhancing Apparent Personality Trait Analysis with Cross-Modal + Embeddings + + +
+ Automatic personality trait assessment is essential for high-quality
+human-machine interactions. Systems capable of human behavior analysis could
+be used for self-driving cars, medical research, and surveillance, among many
+others. We present a multimodal deep neural network with a Siamese extension
+for apparent personality trait prediction, trained on short video recordings
+and exploiting modality-invariant embeddings. Acoustic, visual, and textual
+information are utilized to reach high-performance solutions in this task.
+Because the target distribution of the analyzed dataset is highly
+concentrated, even changes in the third decimal place are relevant. Our
+proposed method addresses the challenge of under-represented extreme values,
+achieves an average MAE improvement of 0.0033, and shows a clear advantage
+over the baseline multimodal DNN without the introduced module.
+
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Direct learning of home vector direction for insect-inspired robot + navigation ICRA 2024 + + +
+ Insects have long been recognized for their ability to navigate and return +home using visual cues from their nest's environment. However, the precise +mechanism underlying this remarkable homing skill remains a subject of ongoing +investigation. Drawing inspiration from the learning flights of honey bees and +wasps, we propose a robot navigation method that directly learns the home +vector direction from visual percepts during a learning flight in the vicinity +of the nest. After learning, the robot will travel away from the nest, come +back by means of odometry, and eliminate the resultant drift by inferring the +home vector orientation from the currently experienced view. Using a compact +convolutional neural network, we demonstrate successful learning in both +simulated and real forest environments, as well as successful homing control of +a simulated quadrotor. The average errors of the inferred home vectors in +general stay well below the 90{\deg} required for successful homing, and below +24{\deg} if all images contain sufficient texture and illumination. Moreover, +we show that the trajectory followed during the initial learning flight has a +pronounced impact on the network's performance. A higher density of sample +points in proximity to the nest results in a more consistent return. Code and +data are available at https://mavlab.tudelft.nl/learning_to_home . + +
+
+ comment: Published at ICRA 2024, project webpage at + https://mavlab.tudelft.nl/learning_to_home +
+
+
+
+
+ + ☆ MoDiPO: text-to-motion alignment via AI-feedback-driven Direct + Preference Optimization + + +
+ Diffusion Models have revolutionized the field of human motion generation by +offering exceptional generation quality and fine-grained controllability +through natural language conditioning. Their inherent stochasticity, that is +the ability to generate various outputs from a single input, is key to their +success. However, this diversity should not be unrestricted, as it may lead to +unlikely generations. Instead, it should be confined within the boundaries of +text-aligned and realistic generations. To address this issue, we propose +MoDiPO (Motion Diffusion DPO), a novel methodology that leverages Direct +Preference Optimization (DPO) to align text-to-motion models. We streamline the +laborious and expensive process of gathering human preferences needed in DPO by +leveraging AI feedback instead. This enables us to experiment with novel DPO +strategies, using both online and offline generated motion-preference pairs. To +foster future research we contribute with a motion-preference dataset which we +dub Pick-a-Move. We demonstrate, both qualitatively and quantitatively, that +our proposed method yields significantly more realistic motions. In particular, +MoDiPO substantially improves Frechet Inception Distance (FID) while retaining +the same RPrecision and Multi-Modality performances. + +
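+ A minimal sketch of the standard DPO objective that such alignment builds
+on, written for precomputed sequence log-likelihoods of a preferred and a
+rejected motion under the trained policy and a frozen reference model; beta
+and the toy numbers are illustrative assumptions.
+
+import numpy as np
+
+def dpo_loss(logp_chosen, logp_rejected,
+             ref_logp_chosen, ref_logp_rejected, beta=0.1):
+    # -log sigmoid(beta * (policy margin - reference margin)),
+    # computed stably as log(1 + exp(-beta * margin)).
+    margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
+    return np.logaddexp(0.0, -beta * margin)
+
+# The policy already prefers the chosen motion a bit more strongly than the
+# reference does, so the loss falls below log(2) ~= 0.693.
+print(dpo_loss(logp_chosen=-120.0, logp_rejected=-150.0,
+               ref_logp_chosen=-125.0, ref_logp_rejected=-148.0))
+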
+
+
+
+
+ + ☆ Foundation Models for Video Understanding: A Survey + + +
+ Video Foundation Models (ViFMs) aim to learn a general-purpose representation +for various video understanding tasks. Leveraging large-scale datasets and +powerful models, ViFMs achieve this by capturing robust and generic features +from video data. This survey analyzes over 200 video foundational models, +offering a comprehensive overview of benchmarks and evaluation metrics across +14 distinct video tasks categorized into 3 main categories. Additionally, we +offer an in-depth performance analysis of these models for the 6 most common +video tasks. We categorize ViFMs into three categories: 1) Image-based ViFMs, +which adapt existing image models for video tasks, 2) Video-Based ViFMs, which +utilize video-specific encoding methods, and 3) Universal Foundational Models +(UFMs), which combine multiple modalities (image, video, audio, and text etc.) +within a single framework. By comparing the performance of various ViFMs on +different tasks, this survey offers valuable insights into their strengths and +weaknesses, guiding future advancements in video understanding. Our analysis +surprisingly reveals that image-based foundation models consistently outperform +video-based models on most video understanding tasks. Additionally, UFMs, which +leverage diverse modalities, demonstrate superior performance on video tasks. +We share the comprehensive list of ViFMs studied in this work at: +\url{https://github.com/NeeluMadan/ViFM_Survey.git} + +
+
+
+
+
+ + ☆ Deep learning classifier of locally advanced rectal cancer treatment + response from endoscopy images + + +
+ We developed a deep learning classifier of rectal cancer response (tumor vs. +no-tumor) to total neoadjuvant treatment (TNT) from endoscopic images acquired +before, during, and following TNT. We further evaluated the network's ability +in a near out-of-distribution (OOD) problem to identify local regrowth (LR) +from follow-up endoscopy images acquired several months to years after +completing TNT. We addressed endoscopic image variability by using optimal mass +transport-based image harmonization. We evaluated multiple training +regularization schemes to study the ResNet-50 network's in-distribution and +near-OOD generalization ability. Test time augmentation resulted in the most +considerable accuracy improvement. Image harmonization resulted in slight +accuracy improvement for the near-OOD cases. Our results suggest that +off-the-shelf deep learning classifiers can detect rectal cancer from +endoscopic images at various stages of therapy for surveillance. + +
+
+
+
+
+ + ☆ Accelerated MR Cholangiopancreatography with Deep Learning-based + Reconstruction + + +
+ This study accelerates MR cholangiopancreatography (MRCP) acquisitions using +deep learning-based (DL) reconstruction at 3T and 0.55T. Thirty healthy +volunteers underwent conventional two-fold MRCP scans at field strengths of 3T +or 0.55T. We trained a variational network (VN) using retrospectively six-fold +undersampled data obtained at 3T. We then evaluated our method against standard +techniques such as parallel imaging (PI) and compressed sensing (CS), focusing +on peak signal-to-noise ratio (PSNR) and structural similarity (SSIM) as +metrics. Furthermore, considering acquiring fully-sampled MRCP is impractical, +we added a self-supervised DL reconstruction (SSDU) to the evaluating group. We +also tested our method in a prospective accelerated scenario to reflect +real-world clinical applications and evaluated its adaptability to MRCP at +0.55T. Our method demonstrated a remarkable reduction of average acquisition +time from 599/542 to 255/180 seconds for MRCP at 3T/0.55T. In both +retrospective and prospective undersampling scenarios, the PSNR and SSIM of VN +were higher than those of PI, CS, and SSDU. At the same time, VN preserved the +image quality of undersampled data, i.e., sharpness and the visibility of +hepatobiliary ducts. In addition, VN also produced high quality reconstructions +at 0.55T resulting in the highest PSNR and SSIM. In summary, VN trained for +highly accelerated MRCP allows to reduce the acquisition time by a factor of +2.4/3.0 at 3T/0.55T while maintaining the image quality of the conventional +acquisition. + +
+
+ comment: 20 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ Tilt your Head: Activating the Hidden Spatial-Invariance of Classifiers + + +
+ Deep neural networks are applied in more and more areas of everyday life. +However, they still lack essential abilities, such as robustly dealing with +spatially transformed input signals. Approaches to mitigate this severe +robustness issue are limited to two pathways: Either models are implicitly +regularised by increased sample variability (data augmentation) or explicitly +constrained by hard-coded inductive biases. The limiting factor of the former +is the size of the data space, which renders sufficient sample coverage +intractable. The latter is limited by the engineering effort required to +develop such inductive biases for every possible scenario. Instead, we take +inspiration from human behaviour, where percepts are modified by mental or +physical actions during inference. We propose a novel technique to emulate such +an inference process for neural nets. This is achieved by traversing a +sparsified inverse transformation tree during inference using parallel +energy-based evaluations. Our proposed inference algorithm, called Inverse +Transformation Search (ITS), is model-agnostic and equips the model with +zero-shot pseudo-invariance to spatially transformed inputs. We evaluated our +method on several benchmark datasets, including a synthesised ImageNet test +set. ITS outperforms the utilised baselines on all zero-shot test scenarios. + +
+
+
+
+
+ + ☆ Class-relevant Patch Embedding Selection for Few-Shot Image + Classification + + +
+ Effective image classification hinges on discerning relevant features from +both foreground and background elements, with the foreground typically holding +the critical information. While humans adeptly classify images with limited +exposure, artificial neural networks often struggle with feature selection from +rare samples. To address this challenge, we propose a novel method for +selecting class-relevant patch embeddings. Our approach involves splitting +support and query images into patches, encoding them using a pre-trained Vision +Transformer (ViT) to obtain class embeddings and patch embeddings, +respectively. Subsequently, we filter patch embeddings using class embeddings +to retain only the class-relevant ones. For each image, we calculate the +similarity between class embedding and each patch embedding, sort the +similarity sequence in descending order, and only retain top-ranked patch +embeddings. By prioritizing similarity between the class embedding and patch +embeddings, we select top-ranked patch embeddings to be fused with class +embedding to form a comprehensive image representation, enhancing pattern +recognition across instances. Our strategy effectively mitigates the impact of +class-irrelevant patch embeddings, yielding improved performance in pre-trained +models. Extensive experiments on popular few-shot classification benchmarks +demonstrate the simplicity, efficacy, and computational efficiency of our +approach, outperforming state-of-the-art baselines under both 5-shot and 1-shot +scenarios. + +
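+ A minimal sketch of the selection step described above: score each patch
+embedding by cosine similarity to the class embedding, keep only the
+top-ranked patches, and fuse them with the class embedding. The top-k value
+and the mean-pooling fusion are illustrative assumptions.
+
+import numpy as np
+
+def select_and_fuse(class_emb, patch_embs, top_k=16):
+    # Cosine similarity between the class embedding and every patch embedding.
+    c = class_emb / np.linalg.norm(class_emb)
+    p = patch_embs / np.linalg.norm(patch_embs, axis=1, keepdims=True)
+    sims = p @ c
+    keep = np.argsort(sims)[::-1][:top_k]          # retain top-ranked patches
+    fused = np.mean(np.vstack([class_emb[None, :], patch_embs[keep]]), axis=0)
+    return fused, keep
+
+rng = np.random.default_rng(0)
+class_emb = rng.normal(size=384)
+patch_embs = rng.normal(size=(196, 384))           # e.g. 14x14 ViT patch grid
+image_repr, kept = select_and_fuse(class_emb, patch_embs)
+print(image_repr.shape, kept[:5])
+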
+
+ comment: arXiv admin note: text overlap with arXiv:2405.03109 +
+
+
+
+
+ + ♻ ☆ Recent Trends in 3D Reconstruction of General Non-Rigid Scenes + + +
+ Reconstructing models of the real world, including 3D geometry, appearance, +and motion of real scenes, is essential for computer graphics and computer +vision. It enables the synthesizing of photorealistic novel views, useful for +the movie industry and AR/VR applications. It also facilitates the content +creation necessary in computer games and AR/VR by avoiding laborious manual +design processes. Further, such models are fundamental for intelligent +computing systems that need to interpret real-world scenes and actions to act +and interact safely with the human world. Notably, the world surrounding us is +dynamic, and reconstructing models of dynamic, non-rigidly moving scenes is a +severely underconstrained and challenging problem. This state-of-the-art report +(STAR) offers the reader a comprehensive summary of state-of-the-art techniques +with monocular and multi-view inputs such as data from RGB and RGB-D sensors, +among others, conveying an understanding of different approaches, their +potential applications, and promising further research directions. The report +covers 3D reconstruction of general non-rigid scenes and further addresses the +techniques for scene decomposition, editing and controlling, and generalizable +and generative modeling. More specifically, we first review the common and +fundamental concepts necessary to understand and navigate the field and then +discuss the state-of-the-art techniques by reviewing recent approaches that use +traditional and machine-learning-based neural representations, including a +discussion on the newly enabled applications. The STAR is concluded with a +discussion of the remaining limitations and open challenges. + +
+
+ comment: 42 pages, 18 figures, 5 tables; State-of-the-Art Report at + EUROGRAPHICS 2024. Project page: https://razayunus.github.io/non-rigid-star +
+
+
+
+
+ + ♻ ☆ A Linear Time and Space Local Point Cloud Geometry Encoder via + Vectorized Kernel Mixture (VecKM) ICML2024 + + +
+ We propose VecKM, a local point cloud geometry encoder that is descriptive +and efficient to compute. VecKM leverages a unique approach by vectorizing a +kernel mixture to represent the local point cloud. Such representation's +descriptiveness is supported by two theorems that validate its ability to +reconstruct and preserve the similarity of the local shape. Unlike existing +encoders downsampling the local point cloud, VecKM constructs the local +geometry encoding using all neighboring points, producing a more descriptive +encoding. + Moreover, VecKM is efficient to compute and scalable to large point cloud +inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and +reduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$ +is the size of the point cloud, $K$ is the neighborhood size, $d$ is the +encoding dimension, and $p$ is a marginal factor. The efficiency is due to +VecKM's unique factorizable property that eliminates the need of explicitly +grouping points into neighbors. + In the normal estimation task, VecKM demonstrates not only 100x faster +inference speed but also highest accuracy and strongest robustness. In +classification and segmentation tasks, integrating VecKM as a preprocessing +module achieves consistently better performance than the PointNet, PointNet++, +and point transformer baselines, and runs consistently faster by up to 10 +times. + +
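+ The quoted memory reduction can be made concrete with a quick
+back-of-the-envelope comparison of (n^2 + nKd) against (nd + np); the n, K,
+d, p values below are arbitrary illustrative choices, not the paper's
+settings.
+
+def cost_grouping(n, K, d):
+    return n * n + n * K * d          # explicit neighbor grouping
+
+def cost_veckm(n, d, p):
+    return n * d + n * p              # factorized VecKM-style encoding
+
+for n in (10_000, 100_000):
+    K, d, p = 32, 128, 4
+    a, b = cost_grouping(n, K, d), cost_veckm(n, d, p)
+    print(f"n={n:>7}: grouping {a:.2e} vs factorized {b:.2e} ({a / b:,.0f}x)")
+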
+
+ comment: ICML2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ CoVid-19 Detection leveraging Vision Transformers and Explainable AI + + +
+ Lung disease is a common health problem in many parts of the world and a
+significant risk to people's health and quality of life across the globe,
+since lung conditions account for five of the top thirty leading causes of
+death; among them are COVID-19, pneumonia, and tuberculosis. It is critical
+to diagnose lung diseases in their early stages: the earlier a condition is
+diagnosed, the better the patient's chances of making a full recovery and
+surviving in the long term. Several models based on machine learning and
+image processing have been developed for this purpose. Thanks to deep
+learning algorithms, there is significant promise for the autonomous, rapid,
+and accurate identification of lung diseases based on medical imaging.
+Several deep learning strategies, including convolutional neural networks
+(CNN), vanilla neural networks, visual geometry group based networks (VGG),
+and capsule networks, have been used to forecast lung disease. However, a
+standard CNN performs poorly when dealing with rotated, tilted, or otherwise
+abnormal image orientations. Within the scope of this study, we therefore
+propose a vision transformer based end-to-end framework for the diagnosis of
+lung disorders. The architecture covers data augmentation, training of the
+suggested models, and evaluation of the models. For detecting lung diseases
+such as pneumonia, COVID-19, lung opacity, and others, a specialised Compact
+Convolutional Transformer (CCT) model has been tested and evaluated on
+datasets such as the COVID-19 Radiography Database, on which it achieves
+improved accuracy for both training and validation.
+
+
+
+
+
+ + ♻ ☆ MoA: Mixture-of-Attention for Subject-Context Disentanglement in + Personalized Image Generation + + +
+ We introduce a new architecture for personalization of text-to-image +diffusion models, coined Mixture-of-Attention (MoA). Inspired by the +Mixture-of-Experts mechanism utilized in large language models (LLMs), MoA +distributes the generation workload between two attention pathways: a +personalized branch and a non-personalized prior branch. MoA is designed to +retain the original model's prior by fixing its attention layers in the prior +branch, while minimally intervening in the generation process with the +personalized branch that learns to embed subjects in the layout and context +generated by the prior branch. A novel routing mechanism manages the +distribution of pixels in each layer across these branches to optimize the +blend of personalized and generic content creation. Once trained, MoA +facilitates the creation of high-quality, personalized images featuring +multiple subjects with compositions and interactions as diverse as those +generated by the original model. Crucially, MoA enhances the distinction +between the model's pre-existing capability and the newly augmented +personalized intervention, thereby offering a more disentangled subject-context +control that was previously unattainable. Project page: +https://snap-research.github.io/mixture-of-attention + +
+
+ comment: Project Website: + https://snap-research.github.io/mixture-of-attention, Same as previous + version, only updated metadata because bib was missing an author name +
+
+
+
+
+ + ♻ ☆ ShadowNav: Autonomous Global Localization for Lunar Navigation in + Darkness + + +
+ The ability to determine the pose of a rover in an inertial frame +autonomously is a crucial capability necessary for the next generation of +surface rover missions on other planetary bodies. Currently, most on-going +rover missions utilize ground-in-the-loop interventions to manually correct for +drift in the pose estimate and this human supervision bottlenecks the distance +over which rovers can operate autonomously and carry out scientific +measurements. In this paper, we present ShadowNav, an autonomous approach for +global localization on the Moon with an emphasis on driving in darkness and at +nighttime. Our approach uses the leading edge of Lunar craters as landmarks and +a particle filtering approach is used to associate detected craters with known +ones on an offboard map. We discuss the key design decisions in developing the +ShadowNav framework for use with a Lunar rover concept equipped with a stereo +camera and an external illumination source. Finally, we demonstrate the +efficacy of our proposed approach in both a Lunar simulation environment and on +data collected during a field test at Cinder Lakes, Arizona. + +
+
+ comment: 21 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception + Tasks + + +
+ Numerous roadside perception datasets have been introduced to propel
+advancements in autonomous driving and intelligent transportation systems
+research and development. However, the majority of them concentrate on urban
+arterial roads, inadvertently overlooking residential areas such as parks and
+campuses that exhibit entirely distinct characteristics. In light of this
+gap, we propose CORP, the first public benchmark dataset tailored for
+multi-modal roadside perception tasks under campus scenarios. Collected in a
+university campus, CORP consists of over 205k images plus 102k point clouds
+captured from 18 cameras and 9 LiDAR sensors. These sensors with different
+configurations are mounted on roadside utility poles to provide diverse
+viewpoints within the campus region. The annotations of CORP encompass
+multi-dimensional information beyond 2D and 3D bounding boxes, providing
+extra support for 3D seamless tracking and instance segmentation with unique
+IDs and pixel masks for identifying targets, to enhance the understanding of
+objects and their behaviors distributed across the campus premises. Unlike
+other roadside datasets about urban traffic, CORP extends the spectrum to
+highlight the challenges for multi-modal perception in campuses and other
+residential areas.
+
+
+
+
+
+ + ♻ ☆ DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D + Classification + + +
+ Large pre-trained models have had a significant impact on computer vision by +enabling multi-modal learning, where the CLIP model has achieved impressive +results in image classification, object detection, and semantic segmentation. +However, the model's performance on 3D point cloud processing tasks is limited +due to the domain gap between depth maps from 3D projection and training images +of CLIP. This paper proposes DiffCLIP, a new pre-training framework that +incorporates stable diffusion with ControlNet to minimize the domain gap in the +visual branch. Additionally, a style-prompt generation module is introduced for +few-shot tasks in the textual branch. Extensive experiments on the ModelNet10, +ModelNet40, and ScanObjectNN datasets show that DiffCLIP has strong abilities +for 3D understanding. By using stable diffusion and style-prompt generation, +DiffCLIP achieves an accuracy of 43.2\% for zero-shot classification on OBJ\_BG +of ScanObjectNN, which is state-of-the-art performance, and an accuracy of +80.6\% for zero-shot classification on ModelNet10, which is comparable to +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed + and Textured Human Meshes CVPR 2024 + + +
+ We present SCULPT, a novel 3D generative model for clothed and textured 3D
+meshes of humans. Specifically, we devise a deep neural network that learns
+to represent the geometry and appearance distribution of clothed human
+bodies. Training such a model is challenging, as datasets of textured 3D
+meshes for humans are limited in size and accessibility. Our key observation
+is that there exist medium-sized 3D scan datasets like CAPE, as well as
+large-scale 2D image datasets of clothed humans, and that multiple
+appearances can be mapped to a single geometry. To effectively learn from the
+two data modalities, we propose an unpaired learning procedure for
+pose-dependent clothed and textured human meshes. Specifically, we learn a
+pose-dependent geometry space from 3D scan data, represented as per-vertex
+displacements w.r.t. the SMPL model. Next, we train a geometry-conditioned
+texture generator in an unsupervised way using the 2D image data, using
+intermediate activations of the learned geometry model to condition the
+texture generator. To alleviate entanglement between pose and clothing type,
+and between pose and clothing appearance, we condition both the texture and
+geometry generators with attribute labels such as clothing types for the
+geometry generator, and clothing colors for the texture generator. We
+automatically generated these conditioning labels for the 2D images based on
+the visual question answering model BLIP and CLIP. We validate our method on
+the SCULPT dataset and compare to state-of-the-art 3D generative models for
+clothed human bodies. Our code and data can be found at
+https://sculpt.is.tue.mpg.de.
+
+
+ comment: Updated to camera ready version of CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Ordinal Classification with Distance Regularization for Robust Brain Age + Prediction WACV 2024 + + +
+ Age is one of the major known risk factors for Alzheimer's Disease (AD). +Detecting AD early is crucial for effective treatment and preventing +irreversible brain damage. Brain age, a measure derived from brain imaging +reflecting structural changes due to aging, may have the potential to identify +AD onset, assess disease risk, and plan targeted interventions. Deep +learning-based regression techniques to predict brain age from magnetic +resonance imaging (MRI) scans have shown great accuracy recently. However, +these methods are subject to an inherent regression to the mean effect, which +causes a systematic bias resulting in an overestimation of brain age in young +subjects and underestimation in old subjects. This weakens the reliability of +predicted brain age as a valid biomarker for downstream clinical applications. +Here, we reformulate the brain age prediction task from regression to +classification to address the issue of systematic bias. Recognizing the +importance of preserving ordinal information from ages to understand aging +trajectory and monitor aging longitudinally, we propose a novel ORdinal +Distance Encoded Regularization (ORDER) loss that incorporates the order of age +labels, enhancing the model's ability to capture age-related patterns. +Extensive experiments and ablation studies demonstrate that this framework +reduces systematic bias, outperforms state-of-art methods by statistically +significant margins, and can better capture subtle differences between clinical +groups in an independent AD dataset. Our implementation is publicly available +at https://github.com/jaygshah/Robust-Brain-Age-Prediction. + +
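+ A minimal sketch of one common way to inject ordinal distance into an
+age-classification objective: soften the one-hot target so that probability
+mass decays with distance from the true age bin, then apply cross-entropy.
+This is a generic ordinal-aware surrogate for illustration, not the paper's
+ORDER regularizer; the bin count and temperature are assumptions.
+
+import numpy as np
+
+def ordinal_soft_targets(true_bin, n_bins, tau=2.0):
+    # Target mass decays exponentially with distance from the true age bin.
+    bins = np.arange(n_bins)
+    w = np.exp(-np.abs(bins - true_bin) / tau)
+    return w / w.sum()
+
+def soft_cross_entropy(logits, targets):
+    # Numerically stable log-softmax followed by the soft-label cross-entropy.
+    logp = logits - logits.max() - np.log(np.exp(logits - logits.max()).sum())
+    return -(targets * logp).sum()
+
+n_bins = 80                                       # e.g. 1-year age bins
+logits = np.random.default_rng(0).normal(size=n_bins)
+targets = ordinal_soft_targets(true_bin=40, n_bins=n_bins)
+print("loss:", soft_cross_entropy(logits, targets))
+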
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ Frozen Transformers in Language Models Are Effective Visual Encoder + Layers ICLR 2024 + + +
+ This paper reveals that large language models (LLMs), despite being trained +solely on textual data, are surprisingly strong encoders for purely visual +tasks in the absence of language. Even more intriguingly, this can be achieved +by a simple yet previously overlooked strategy -- employing a frozen +transformer block from pre-trained LLMs as a constituent encoder layer to +directly process visual tokens. Our work pushes the boundaries of leveraging +LLMs for computer vision tasks, significantly departing from conventional +practices that typically necessitate a multi-modal vision-language setup with +associated language prompts, inputs, or outputs. We demonstrate that our +approach consistently enhances performance across a diverse range of tasks, +encompassing pure 2D and 3D visual recognition tasks (e.g., image and point +cloud classification), temporal modeling tasks (e.g., action recognition), +non-semantic tasks (e.g., motion forecasting), and multi-modal tasks (e.g., +2D/3D visual question answering and image-text retrieval). Such improvements +are a general phenomenon, applicable to various types of LLMs (e.g., LLaMA and +OPT) and different LLM transformer blocks. We additionally propose the +information filtering hypothesis to explain the effectiveness of pre-trained +LLMs in visual encoding -- the pre-trained LLM transformer blocks discern +informative visual tokens and further amplify their effect. This hypothesis is +empirically supported by the observation that the feature activation, after +training with LLM transformer blocks, exhibits a stronger focus on relevant +regions. We hope that our work inspires new perspectives on utilizing LLMs and +deepening our understanding of their underlying mechanisms. Code is available +at https://github.com/ziqipang/LM4VisualEncoding. + +
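+ A minimal sketch of the plumbing described above: visual tokens are linearly
+projected to the width of a frozen transformer block, passed through it with
+its parameters frozen, and projected back. A generic PyTorch
+TransformerEncoderLayer stands in for a pre-trained LLM block here, and all
+dimensions are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+class FrozenBlockVisualLayer(nn.Module):
+    def __init__(self, vis_dim=768, llm_dim=2048, nhead=8):
+        super().__init__()
+        self.up = nn.Linear(vis_dim, llm_dim)     # trainable projection in
+        self.down = nn.Linear(llm_dim, vis_dim)   # trainable projection out
+        # Stand-in for a pre-trained LLM transformer block; kept frozen.
+        self.block = nn.TransformerEncoderLayer(d_model=llm_dim, nhead=nhead,
+                                                batch_first=True)
+        for p in self.block.parameters():
+            p.requires_grad_(False)
+
+    def forward(self, tokens):                    # tokens: (batch, seq, vis_dim)
+        return self.down(self.block(self.up(tokens)))
+
+layer = FrozenBlockVisualLayer()
+vis_tokens = torch.randn(2, 197, 768)             # e.g. ViT-B/16 token sequence
+print(layer(vis_tokens).shape)                    # torch.Size([2, 197, 768])
+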
+
+ comment: ICLR 2024 Spotlight. 23 pages, 13 figures. Code at + https://github.com/ziqipang/LM4VisualEncoding +
+
+
+
+
+ + ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models SIGGRAPH 2024 + + +
+ Recent advances in text-to-image generation models have unlocked vast +potential for visual creativity. However, these models struggle with generation +of consistent characters, a crucial aspect for numerous real-world applications +such as story visualization, game development asset design, advertising, and +more. Current methods typically rely on multiple pre-existing images of the +target character or involve labor-intensive manual processes. In this work, we +propose a fully automated solution for consistent character generation, with +the sole input being a text prompt. We introduce an iterative procedure that, +at each stage, identifies a coherent set of images sharing a similar identity +and extracts a more consistent identity from this set. Our quantitative +analysis demonstrates that our method strikes a better balance between prompt +alignment and identity consistency compared to the baseline methods, and these +findings are reinforced by a user study. To conclude, we showcase several +practical applications of our approach. Project page is available at +https://omriavrahami.com/the-chosen-one + +
+
+ comment: Accepted to SIGGRAPH 2024. Project page is available at + https://omriavrahami.com/the-chosen-one +
+
+
+
+
+ + ♻ ☆ Exploring Interactive Semantic Alignment for Efficient HOI Detection + with Vision-language Model + + +
+ Human-Object Interaction (HOI) detection aims to localize human-object pairs
+and comprehend their interactions. Recently, two-stage transformer-based
+methods have demonstrated competitive performance. However, these methods
+frequently focus on object appearance features and ignore global contextual
+information. Besides, the vision-language model CLIP, which effectively
+aligns visual and text embeddings, has shown great potential in zero-shot HOI
+detection. Based on these observations, we introduce a novel HOI detector
+named ISA-HOI, which extensively leverages knowledge from CLIP, aligning
+interactive semantics between visual and textual features. We first extract
+the global context of the image and local features of objects to Improve
+interaction Features in images (IF). On the other hand, we propose a Verb
+Semantic Improvement (VSI) module to enhance the textual features of verb
+labels via cross-modal fusion. Ultimately, our method achieves competitive
+results on the HICO-DET and V-COCO benchmarks with far fewer training epochs,
+and outperforms the state of the art under zero-shot settings.
+
+
+ comment: There are issues with the experimental results +
+
+
+
+
+ + ♻ ☆ An Optimized Ensemble Deep Learning Model For Brain Tumor Classification + + +
+ Brain tumors present a grave risk to human life, demanding precise and timely +diagnosis for effective treatment. Inaccurate identification of brain tumors +can significantly diminish life expectancy, underscoring the critical need for +precise diagnostic methods. Manual identification of brain tumors within vast +Magnetic Resonance Imaging (MRI) image datasets is arduous and time-consuming. +Thus, the development of a reliable deep learning (DL) model is essential to +enhance diagnostic accuracy and ultimately save lives. This study introduces an +innovative optimization-based deep ensemble approach employing transfer +learning (TL) to efficiently classify brain tumors. Our methodology includes +meticulous preprocessing, reconstruction of TL architectures, fine-tuning, and +ensemble DL models utilizing weighted optimization techniques such as Genetic +Algorithm-based Weight Optimization (GAWO) and Grid Search-based Weight +Optimization (GSWO). Experimentation is conducted on the Figshare +Contrast-Enhanced MRI (CE-MRI) brain tumor dataset, comprising 3064 images. Our +approach achieves notable accuracy scores, with Xception, ResNet50V2, +ResNet152V2, InceptionResNetV2, GAWO, and GSWO attaining 99.42%, 98.37%, +98.22%, 98.26%, 99.71%, and 99.76% accuracy, respectively. Notably, GSWO +demonstrates superior accuracy, averaging 99.76\% accuracy across five folds on +the Figshare CE-MRI brain tumor dataset. The comparative analysis highlights +the significant performance enhancement of our proposed model over existing +counterparts. In conclusion, our optimized deep ensemble model exhibits +exceptional accuracy in swiftly classifying brain tumors. Furthermore, it has +the potential to assist neurologists and clinicians in making accurate and +immediate diagnostic decisions. + +
+
+
+
+
+ + ♻ ☆ Bridging Stereo Geometry and BEV Representation with Reliable Mutual + Interaction for Semantic Scene Completion IJCAI2024 + + +
+ 3D semantic scene completion (SSC) is an ill-posed perception task that +requires inferring a dense 3D scene from limited observations. Previous +camera-based methods struggle to predict accurate semantic scenes due to +inherent geometric ambiguity and incomplete observations. In this paper, we +resort to stereo matching technique and bird's-eye-view (BEV) representation +learning to address such issues in SSC. Complementary to each other, stereo +matching mitigates geometric ambiguity with epipolar constraint while BEV +representation enhances the hallucination ability for invisible regions with +global semantic context. However, due to the inherent representation gap +between stereo geometry and BEV features, it is non-trivial to bridge them for +dense prediction task of SSC. Therefore, we further develop a unified +occupancy-based framework dubbed BRGScene, which effectively bridges these two +representations with dense 3D volumes for reliable semantic scene completion. +Specifically, we design a novel Mutual Interactive Ensemble (MIE) block for +pixel-level reliable aggregation of stereo geometry and BEV features. Within +the MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced +with confidence re-weighting, is employed to encourage fine-grained interaction +through mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is +introduced to facilitate complementary aggregation through channel-wise +recalibration and multi-group voting. Our method outperforms all published +camera-based methods on SemanticKITTI for semantic scene completion. Our code +is available on https://github.com/Arlo0o/StereoScene. + +
+
+ comment: IJCAI2024 (https://github.com/Arlo0o/StereoScene) +
+
+
+
+
+ + ♻ ☆ SceneTracker: Long-term Scene Flow Estimation Network + + +
+ Considering that scene flow estimation offers fine-grained focus in the
+spatial domain while 3D object tracking offers coherence in the temporal
+domain, this study addresses a comprehensive new task that can simultaneously
+capture fine-grained and long-term 3D motion in an online manner: long-term
+scene flow estimation (LSFE). We introduce SceneTracker, a novel
+learning-based LSFE network that adopts an iterative approach to approximate
+the optimal trajectory. Besides, it dynamically indexes and constructs
+appearance and depth correlation features simultaneously and employs the
+Transformer to explore and utilize long-range connections within and between
+trajectories. With detailed experiments, SceneTracker shows superior
+capabilities in handling 3D spatial occlusion and depth noise interference,
+highly tailored to the LSFE task's needs. Finally, we build the first
+real-world evaluation dataset, LSFDriving, further substantiating
+SceneTracker's commendable generalization capacity. The code and data for
+SceneTracker are available at https://github.com/wwsource/SceneTracker.
+
+
+
+
+
+ + ♻ ☆ Uncovering What, Why and How: A Comprehensive Benchmark for Causation + Understanding of Video Anomaly CVPR2024 + + +
+ Video anomaly understanding (VAU) aims to automatically comprehend unusual +occurrences in videos, thereby enabling various applications such as traffic +surveillance and industrial manufacturing. While existing VAU benchmarks +primarily concentrate on anomaly detection and localization, our focus is on +more practicality, prompting us to raise the following crucial questions: "what +anomaly occurred?", "why did it happen?", and "how severe is this abnormal +event?". In pursuit of these answers, we present a comprehensive benchmark for +Causation Understanding of Video Anomaly (CUVA). Specifically, each instance of +the proposed benchmark involves three sets of human annotations to indicate the +"what", "why" and "how" of an anomaly, including 1) anomaly type, start and end +times, and event descriptions, 2) natural language explanations for the cause +of an anomaly, and 3) free text reflecting the effect of the abnormality. In +addition, we also introduce MMEval, a novel evaluation metric designed to +better align with human preferences for CUVA, facilitating the measurement of +existing LLMs in comprehending the underlying cause and corresponding effect of +video anomalies. Finally, we propose a novel prompt-based method that can serve +as a baseline approach for the challenging CUVA. We conduct extensive +experiments to show the superiority of our evaluation metric and the +prompt-based approach. Our code and dataset are available at +https://github.com/fesvhtr/CUVA. + +
+
+ comment: Accepted in CVPR2024, Codebase: https://github.com/fesvhtr/CUVA +
+
+
+
+
+ + ♻ ☆ EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image + Captioning + + +
+ News image captioning requires a model to generate an informative caption
+rich in entities, given the news image and the associated news article.
+Though Multimodal Large Language Models (MLLMs) have demonstrated remarkable
+capabilities in addressing various vision-language tasks, our research finds
+that current MLLMs still have limitations in handling entity information on
+the news image captioning task. Besides, while MLLMs can process long inputs,
+generating high-quality news image captions still requires a trade-off
+between the sufficiency and conciseness of the textual input information. To
+explore the potential of MLLMs and address the problems we discovered, we
+propose EAMA, an Entity-Aware Multimodal Alignment based approach for news
+image captioning. Our approach first aligns the MLLM through a Balance
+Training Strategy with two extra alignment tasks: an Entity-Aware Sentence
+Selection task and an Entity Selection task, together with the News Image
+Captioning task, to enhance its capability in handling multimodal entity
+information. The aligned MLLM then utilizes the additional entity-related
+information it explicitly extracts to supplement its textual input while
+generating news image captions. Our approach achieves better results than all
+previous models in CIDEr score on the GoodNews dataset (72.33 -> 88.39) and
+the NYTimes800k dataset (70.83 -> 85.61).
+
+
+
+
+
+ + ♻ ☆ PopulAtion Parameter Averaging (PAPA) + + +
+ Ensemble methods combine the predictions of multiple models to improve
+performance, but they require significantly higher computation costs at
+inference time. To avoid these costs, multiple neural networks can be
+combined into one by averaging their weights. However, this usually performs
+significantly worse than ensembling. Weight averaging is only beneficial when
+the models are different enough to benefit from combining them, yet similar
+enough to average well. Based on this idea, we propose PopulAtion Parameter
+Averaging (PAPA): a method that combines the generality of ensembling with
+the efficiency of weight averaging. PAPA leverages a population of diverse
+models (trained on different data orders, augmentations, and regularizations)
+while slowly pushing the weights of the networks toward the population
+average of the weights. We also propose PAPA variants (PAPA-all, and PAPA-2)
+that average weights rarely rather than continuously; all methods increase
+generalization, but PAPA tends to perform best. PAPA reduces the performance
+gap between averaging and ensembling, increasing the average accuracy of a
+population of models by up to 0.8% on CIFAR-10, 1.9% on CIFAR-100, and 1.6%
+on ImageNet when compared to training independent (non-averaged) models.
+
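+ A minimal sketch of the core update: every member of the population is
+periodically pulled a small step toward the population-average weights. The
+pull rate, interval, and flat weight vectors are illustrative assumptions,
+not the full PAPA training recipe.
+
+import numpy as np
+
+def papa_pull(population, rate=0.05):
+    # Push every model's weights slightly toward the population average.
+    avg = np.mean(population, axis=0)
+    return [(1.0 - rate) * w + rate * avg for w in population]
+
+rng = np.random.default_rng(0)
+population = [rng.normal(size=1000) for _ in range(5)]   # 5 models, flat weights
+for step in range(100):
+    # ... an SGD step per model on its own data order/augmentation goes here ...
+    if step % 10 == 0:
+        population = papa_pull(population)
+
+avg = np.mean(population, axis=0)
+spread = np.mean([np.linalg.norm(w - avg) for w in population])
+print("average distance to the population mean:", spread)
+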
+
+ comment: Blog post: https://ajolicoeur.wordpress.com/papa/, Code: + https://github.com/SamsungSAILMontreal/PAPA, TMLR journal publication: + https://openreview.net/forum?id=cPDVjsOytS +
+
+
+
+
+ + ♻ ☆ DreamTime: An Improved Optimization Strategy for Diffusion-Guided 3D + Generation ICLR 2024 + + +
+ Text-to-image diffusion models pre-trained on billions of image-text pairs
+have recently enabled 3D content creation by optimizing a randomly
+initialized differentiable 3D representation with score distillation.
+However, the optimization process suffers from slow convergence and the
+resultant 3D models often exhibit two limitations: (a) quality concerns such
+as missing attributes and distorted shape and texture; (b) extremely low
+diversity compared to text-guided image synthesis. In this paper, we show
+that the conflict between the 3D optimization process and uniform timestep
+sampling in score distillation is the main reason for these limitations. To
+resolve this conflict, we propose to prioritize timestep sampling with
+monotonically non-increasing functions, which aligns the 3D optimization
+process with the sampling process of the diffusion model. Extensive
+experiments show that our simple redesign significantly improves 3D content
+creation with faster convergence, better quality and diversity.
+
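+ A minimal sketch contrasting uniform timestep sampling with a monotonically
+non-increasing schedule that visits large (noisy) timesteps early in the 3D
+optimization and small ones late; the linear decay below is one illustrative
+choice of non-increasing function, not the paper's exact prioritization.
+
+import numpy as np
+
+T, n_iters = 1000, 5000
+rng = np.random.default_rng(0)
+
+def uniform_timestep(_iteration):
+    return int(rng.integers(1, T))
+
+def monotone_timestep(iteration):
+    # Decay linearly from t ~ T-1 down to t = 1 over the optimization.
+    frac = 1.0 - iteration / (n_iters - 1)
+    return max(1, int(round(frac * (T - 1))))
+
+print([uniform_timestep(i) for i in range(4)])
+print([monotone_timestep(i) for i in (0, 1000, 2500, 4999)])   # 999, 799, 499, 1
+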
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval + + +
+ The user base of short video apps has experienced unprecedented growth in +recent years, resulting in a significant demand for video content analysis. In +particular, text-video retrieval, which aims to find the top matching videos +given text descriptions from a vast video corpus, is an essential function, the +primary challenge of which is to bridge the modality gap. Nevertheless, most +existing approaches treat texts merely as discrete tokens and neglect their +syntax structures. Moreover, the abundant spatial and temporal clues in videos +are often underutilized due to the lack of interaction with text. To address +these issues, we argue that using texts as guidance to focus on relevant +temporal frames and spatial regions within videos is beneficial. In this paper, +we propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method +(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to +bridge the modality gap from two perspectives. First, to facilitate a more +fine-grained integration of visual content, we employ the text syntax +hierarchy, which reveals the grammatical structure of text descriptions, to +guide the visual representations. Second, to further enhance the multi-modal +interaction and alignment, we also utilize the syntax hierarchy to guide the +similarity calculation. We evaluated our method on four public text-video +retrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental +results and ablation studies confirm the advantages of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Federated Learning Across Decentralized and Unshared Archives for Remote + Sensing Image Classification + + +
+ Federated learning (FL) enables the collaboration of multiple deep learning
+models to learn from decentralized data archives (i.e., clients) without
+accessing data on clients. Although FL offers ample opportunities in
+knowledge discovery from distributed image archives, it is seldom considered
+in remote sensing (RS). In this paper, for the first time in RS, we present a
+comparative study of state-of-the-art FL algorithms for RS image
+classification problems. To this end, we initially provide a systematic
+review of the FL algorithms presented in the computer vision and machine
+learning communities. Then, we select several state-of-the-art FL algorithms
+based on their effectiveness with respect to training data heterogeneity
+across clients (known as non-IID data). After presenting an extensive
+overview of the selected algorithms, a theoretical comparison of the
+algorithms is conducted based on their: 1) local training complexity; 2)
+aggregation complexity; 3) learning efficiency; 4) communication cost; and 5)
+scalability in terms of the number of clients. After the theoretical
+comparison, experimental analyses are presented to compare them under
+different decentralization scenarios. For the experimental analyses, we focus
+our attention on multi-label image classification problems in RS. Based on
+our comprehensive analyses, we finally derive a guideline for selecting
+suitable FL algorithms in RS. The code of this work will be publicly
+available at https://git.tu-berlin.de/rsim/FL-RS.
+
+
+ comment: Submitted to the IEEE Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ♻ ☆ Spice-E : Structural Priors in 3D Diffusion using Cross-Entity Attention + + +
+ We are witnessing rapid progress in automatically generating and manipulating +3D assets due to the availability of pretrained text-image diffusion models. +However, time-consuming optimization procedures are required for synthesizing +each sample, hindering their potential for democratizing 3D content creation. +Conversely, 3D diffusion models now train on million-scale 3D datasets, +yielding high-quality text-conditional 3D samples within seconds. In this work, +we present Spice-E - a neural network that adds structural guidance to 3D +diffusion models, extending their usage beyond text-conditional generation. At +its core, our framework introduces a cross-entity attention mechanism that +allows for multiple entities (in particular, paired input and guidance 3D +shapes) to interact via their internal representations within the denoising +network. We utilize this mechanism for learning task-specific structural priors +in 3D diffusion models from auxiliary guidance shapes. We show that our +approach supports a variety of applications, including 3D stylization, semantic +shape editing and text-conditional abstraction-to-3D, which transforms +primitive-based abstractions into highly-expressive shapes. Extensive +experiments demonstrate that Spice-E achieves SOTA performance over these tasks +while often being considerably faster than alternative methods. Importantly, +this is accomplished without tailoring our approach for any specific task. + +
+
+ comment: Project webpage: https://tau-vailab.github.io/Spice-E +
+
+
+
+
+ + ♻ ☆ HawkDrive: A Transformer-driven Visual Perception System for Autonomous + Driving in Night Scene + + +
+ Many established vision perception systems for autonomous driving scenarios +ignore the influence of light conditions, one of the key elements for driving +safety. To address this problem, we present HawkDrive, a novel perception +system with hardware and software solutions. Hardware that utilizes stereo +vision perception, which has been demonstrated to be a more reliable way of +estimating depth information than monocular vision, is partnered with the edge +computing device Nvidia Jetson Xavier AGX. Our software for low light +enhancement, depth estimation, and semantic segmentation tasks, is a +transformer-based neural network. Our software stack, which enables fast +inference and noise reduction, is packaged into system modules in Robot +Operating System 2 (ROS2). Our experimental results have shown that the +proposed end-to-end system is effective in improving the depth estimation and +semantic segmentation performance. Our dataset and codes will be released at +https://github.com/ZionGo6/HawkDrive. + +
+
+ comment: Accepted by IEEE IV 2024 +
+
+
+
+
+ + ♻ ☆ Simplicity in Complexity : Explaining Visual Complexity using Deep + Segmentation Models + + +
+ The complexity of visual stimuli plays an important role in many cognitive +phenomena, including attention, engagement, memorability, time perception and +aesthetic evaluation. Despite its importance, complexity is poorly understood +and ironically, previous models of image complexity have been quite complex. +There have been many attempts to find handcrafted features that explain +complexity, but these features are usually dataset specific, and hence fail to +generalise. On the other hand, more recent work has employed deep neural +networks to predict complexity, but these models remain difficult to interpret, +and do not guide a theoretical understanding of the problem. Here we propose to +model complexity using segment-based representations of images. We use +state-of-the-art segmentation models, SAM and FC-CLIP, to quantify the number +of segments at multiple granularities, and the number of classes in an image +respectively. We find that complexity is well-explained by a simple linear +model with these two features across six diverse image-sets of naturalistic +scene and art images. This suggests that the complexity of images can be +surprisingly simple. + +
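+ A minimal sketch of the two-feature linear model described above: given
+per-image segment counts and class counts (random stand-ins below; in
+practice they would come from SAM and FC-CLIP), fit complexity ratings by
+ordinary least squares. The synthetic ratings are for illustration only.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_images = 200
+n_segments = rng.integers(5, 300, n_images)     # stand-in for SAM segment counts
+n_classes = rng.integers(1, 40, n_images)       # stand-in for FC-CLIP class counts
+ratings = 0.6 * n_segments + 2.0 * n_classes + rng.normal(0, 10, n_images)
+
+X = np.column_stack([n_segments, n_classes, np.ones(n_images)])
+coef, *_ = np.linalg.lstsq(X, ratings, rcond=None)
+pred = X @ coef
+r = np.corrcoef(pred, ratings)[0, 1]
+print(f"weights: {coef[:2]}, intercept: {coef[2]:.2f}, r = {r:.2f}")
+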
+
+
+
+
+ + ♻ ☆ Generate Point Clouds with Multiscale Details from Graph-Represented + Structures + + +
+ Because most representations of structures omit details, limited controllability over finer-grained information is one of the major weaknesses of structure-based controllable point cloud generation. The definitions of details and structures are subjective: details can be treated as structures at small scales. To represent structures at different scales simultaneously, we present a graph-based representation of structures called the Multiscale Structure Graph (MSG). Given structures at multiple scales, similar patterns of local structures can be found at different scales, positions, and angles, so knowledge learned from one regional structure pattern can be transferred to other similar patterns. We propose an encoding and generation mechanism, the Multiscale Structure-based Point Cloud Generator (MSPCG), which can simultaneously learn point cloud generation from local patterns with miscellaneous spatial properties. The proposed method supports multiscale editing of point clouds by editing the MSG. By generating point clouds from local structures and learning simultaneously at multiple scales, our MSPCG has better generalization ability and scalability. Trained on ShapeNet, our MSPCG can generate point clouds from a given structure for unseen categories and indoor scenes. The experimental results show that our method significantly outperforms baseline methods. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Understanding the Vulnerability of Skeleton-based Human Activity + Recognition via Black-box Attack + + +
+ Human Activity Recognition (HAR) has been employed in a wide range of applications, e.g. self-driving cars, where safety and lives are at stake. Recently, the robustness of skeleton-based HAR methods has been questioned due to their vulnerability to adversarial attacks. However, the proposed attacks require full knowledge of the attacked classifier, which is overly restrictive. In this paper, we show such threats indeed exist, even when the attacker only has access to the input/output of the model. To this end, we propose the very first black-box adversarial attack approach in skeleton-based HAR called BASAR. BASAR explores the interplay between the classification boundary and the natural motion manifold. To the best of our knowledge, this is the first time the data manifold has been introduced in adversarial attacks on time series. Via BASAR, we find that on-manifold adversarial samples are extremely deceitful and rather common in skeletal motions, in contrast to the common belief that adversarial samples only exist off-manifold. Through exhaustive evaluation, we show that BASAR can deliver successful attacks across classifiers, datasets, and attack modes. Through these attacks, BASAR helps identify the potential causes of the model vulnerability and provides insights into possible improvements. Finally, to mitigate the newly identified threat, we propose a new adversarial training approach by leveraging the sophisticated distributions of on/off-manifold adversarial samples, called mixed manifold-based adversarial training (MMAT). MMAT can successfully help defend against adversarial attacks without compromising classification accuracy. + +
+
+ comment: Accepted in Pattern Recognition. arXiv admin note: substantial text + overlap with arXiv:2103.05266 +
+
+
+
+
+ + ♻ ☆ Enhancing Sign Language Teaching: A Mixed Reality Approach for Immersive + Learning and Multi-Dimensional Feedback + + +
+ Traditional sign language teaching methods face challenges such as limited feedback and diverse learning scenarios. 2D resources lack real-time feedback, while classroom teaching is constrained by a scarcity of teachers. Methods based on VR and AR have relatively primitive interaction feedback mechanisms. This study proposes an innovative teaching model that uses real-time monocular vision and mixed reality technology. First, we introduce an improved hand-posture reconstruction method to achieve sign language semantic retention and real-time feedback. Second, a ternary system evaluation algorithm is proposed for comprehensive assessment, maintaining good consistency with sign language experts. Furthermore, we use mixed reality technology to construct a scenario-based 3D sign language classroom and explore the user experience of scenario teaching. Overall, this paper presents a novel teaching method that provides an immersive learning experience, advanced posture reconstruction, and precise feedback, achieving positive results in user experience and learning effectiveness. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Gaussian Shading: Provable Performance-Lossless Image Watermarking for + Diffusion Models CVPR 2024 + + +
+ Ethical concerns surrounding copyright protection and inappropriate content +generation pose challenges for the practical implementation of diffusion +models. One effective solution involves watermarking the generated images. +However, existing methods often compromise the model performance or require +additional training, which is undesirable for operators and users. To address +this issue, we propose Gaussian Shading, a diffusion model watermarking +technique that is both performance-lossless and training-free, while serving +the dual purpose of copyright protection and tracing of offending content. Our +watermark embedding is free of model parameter modifications and thus is +plug-and-play. We map the watermark to latent representations following a +standard Gaussian distribution, which is indistinguishable from latent +representations obtained from the non-watermarked diffusion model. Therefore we +can achieve watermark embedding with lossless performance, for which we also +provide theoretical proof. Furthermore, since the watermark is intricately +linked with image semantics, it exhibits resilience to lossy processing and +erasure attempts. The watermark can be extracted by Denoising Diffusion +Implicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian +Shading on multiple versions of Stable Diffusion, and the results demonstrate +that Gaussian Shading not only is performance-lossless but also outperforms +existing methods in terms of robustness. + +
+
+ comment: 17 pages, 11 figures, accepted by CVPR 2024 +
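+ As rough intuition for how a watermark can be mapped to a latent that still follows a standard Gaussian, here is a simplified, hypothetical sign-based sketch; the actual Gaussian Shading construction, its randomization, and the DDIM-inversion-based extraction are more involved than this.
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ def embed(bits: np.ndarray) -> np.ndarray:
+     """Map watermark bits to a latent that is marginally N(0, 1) when bits are uniform."""
+     magnitude = np.abs(rng.standard_normal(bits.shape))   # half-normal magnitudes
+     sign = np.where(bits == 1, 1.0, -1.0)                 # the bit decides the sign
+     return sign * magnitude                               # still standard Gaussian marginally
+
+ def extract(latent: np.ndarray) -> np.ndarray:
+     """Recover bits from an (approximately inverted) latent."""
+     return (latent > 0).astype(int)
+
+ bits = rng.integers(0, 2, size=4 * 64 * 64)               # e.g., a 4x64x64 latent, flattened
+ latent = embed(bits)
+ noisy = latent + 0.3 * rng.standard_normal(latent.shape)  # stand-in for inversion error
+ print("bit accuracy:", (extract(noisy) == bits).mean())
+ ```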
+
+
+
+
+ + ♻ ☆ SSUMamba: Spatial-Spectral Selective State Space Model for Hyperspectral + Image Denoising + + +
+ Denoising hyperspectral images (HSIs) is a crucial preprocessing procedure +due to the noise originating from intra-imaging mechanisms and environmental +factors. Utilizing domain-specific knowledge of HSIs, such as spectral +correlation, spatial self-similarity, and spatial-spectral correlation, is +essential for deep learning-based denoising. Existing methods are often +constrained by running time, space complexity, and computational complexity, +employing strategies that explore these priors separately. While these +strategies can avoid some redundant information, they inevitably overlook +broader and more underlying long-range spatial-spectral information that +positively impacts image restoration. This paper proposes a Spatial-Spectral +Selective State Space Model-based U-shaped network, termed Spatial-Spectral +U-Mamba (SSUMamba), for hyperspectral image denoising. We can obtain complete +global spatial-spectral correlation within a module thanks to the linear space +complexity in State Space Model (SSM) computations. We introduce a +Spatial-Spectral Alternating Scan (SSAS) strategy for HSIs, which helps model +the information flow in multiple directions in 3-D HSIs. Experimental results +demonstrate that our method outperforms compared methods. The source code will +be available at https://github.com/lronkitty/SSUMamba. + +
+
+
+
+
+ + ♻ ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language Model, GPT-4V(ision), to facilitate one-shot visual teaching for robotic manipulation. This system analyzes videos of humans performing tasks and outputs executable robot programs that incorporate insights into affordances. The process begins with GPT-4V analyzing the videos to obtain textual explanations of environmental and action details. A GPT-4-based task planner then encodes these details into a symbolic task plan. Subsequently, vision systems spatially and temporally ground the task plan in the videos. Objects are identified using an open-vocabulary object detector, and hand-object interactions are analyzed to pinpoint moments of grasping and releasing. This spatiotemporal grounding allows for the gathering of affordance information (e.g., grasp types, waypoints, and body postures) critical for robot execution. Experiments across various scenarios demonstrate the method's efficacy in achieving real robot operations from human demonstrations in a one-shot manner. Meanwhile, quantitative tests have revealed instances of hallucination in GPT-4V, highlighting the importance of incorporating human supervision within the pipeline. The prompts of GPT-4V/GPT-4 are available at this project page: + +
+
+ comment: 9 pages, 12 figures, 2 tables. Last updated on May 6th, 2024 +
+
+
+
+
+ + ♻ ☆ DUCK: Distance-based Unlearning via Centroid Kinematics + + +
+ Machine Unlearning is emerging as a new field, driven by the pressing necessity of ensuring privacy in modern artificial intelligence models. This technique primarily aims to eradicate any residual influence of a specific subset of data from the knowledge acquired by a neural model during its training. This work introduces a novel unlearning algorithm, denoted as Distance-based Unlearning via Centroid Kinematics (DUCK), which employs metric learning to guide the removal of samples matching the nearest incorrect centroid in the embedding space. Evaluation of the algorithm's performance is conducted across various benchmark datasets in two distinct scenarios, class removal and homogeneous sampling removal, obtaining state-of-the-art performance. We also introduce a novel metric, called Adaptive Unlearning Score (AUS), that captures not only the efficacy of the unlearning process in forgetting target data but also the performance loss relative to the original model. Additionally, we conduct a thorough investigation of the unlearning mechanism in DUCK, examining its impact on the organization of the feature space and employing explainable AI techniques for deeper insights. + +
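+ A hypothetical sketch of the stated mechanism, pulling forget-set embeddings toward their nearest incorrect class centroid, assuming centroids have been precomputed in the embedding space; this illustrates the idea only and is not the full DUCK algorithm or its metric-learning loss.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def centroid_forget_loss(embeddings: torch.Tensor,
+                          labels: torch.Tensor,
+                          centroids: torch.Tensor) -> torch.Tensor:
+     # embeddings: (B, D) features of forget-set samples
+     # labels:     (B,)   their original class ids
+     # centroids:  (C, D) per-class centroids computed beforehand
+     with torch.no_grad():
+         dists = torch.cdist(embeddings, centroids)            # (B, C) pairwise distances
+         dists.scatter_(1, labels.unsqueeze(1), float("inf"))  # mask out the true-class centroid
+         nearest_wrong = dists.argmin(dim=1)                   # closest *incorrect* centroid
+     target = centroids[nearest_wrong]
+     return F.mse_loss(embeddings, target)                     # pull samples toward it
+
+ # Toy usage
+ emb = torch.randn(8, 16, requires_grad=True)
+ lab = torch.randint(0, 5, (8,))
+ cen = torch.randn(5, 16)
+ loss = centroid_forget_loss(emb, lab, cen)
+ loss.backward()
+ print(float(loss))
+ ```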
+
+
+
+
+ + ♻ ☆ PEM: Prototype-based Efficient MaskFormer for Image Segmentation CVPR 2024 + + +
+ Recent transformer-based architectures have shown impressive results in the field of image segmentation. Thanks to their flexibility, they obtain outstanding performance in multiple segmentation tasks, such as semantic and panoptic segmentation, under a single unified framework. To achieve such impressive performance, these architectures employ intensive operations and require substantial computational resources, which are often not available, especially on edge devices. To fill this gap, we propose Prototype-based Efficient MaskFormer (PEM), an efficient transformer-based architecture that can operate in multiple segmentation tasks. PEM proposes a novel prototype-based cross-attention which leverages the redundancy of visual features to restrict the computation and improve the efficiency without harming the performance. In addition, PEM introduces an efficient multi-scale feature pyramid network, capable of extracting features that have high semantic content in an efficient way, thanks to the combination of deformable convolutions and context-based self-modulation. We benchmark the proposed PEM architecture on two tasks, semantic and panoptic segmentation, evaluated on two different datasets, Cityscapes and ADE20K. PEM demonstrates outstanding performance on every task and dataset, outperforming task-specific architectures while being comparable to, and even better than, computationally expensive baselines. + +
+
+ comment: CVPR 2024. Project page: https://niccolocavagnero.github.io/PEM +
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of autonomous agents, requiring flexibly adapting the underlying decision-making strategies and, as we argue in this work, also adapting the perception modules. An analogy is the human visual system, which uses top-down signals to focus attention as determined by the current task. Similarly, we adapt pre-trained large vision models conditioned on specific downstream tasks in the context of multi-task policy learning. We introduce task-conditioned adapters that do not require finetuning any pre-trained weights, combined with a single policy trained with behavior cloning and capable of addressing multiple tasks. We condition the visual adapters on task embeddings, which can be selected at inference if the task is known, or alternatively inferred from a set of example demonstrations. To this end, we propose a new optimization-based estimator. We evaluate the method on a wide variety of tasks from the CortexBench benchmark and show that, in contrast to existing work, all of them can be addressed with a single policy. In particular, we demonstrate that adapting visual features is a key design choice and that the method generalizes to unseen tasks given a few demonstrations. + +
+
+
+
+
+ + ♻ ☆ Multilateral Temporal-view Pyramid Transformer for Video Inpainting + Detection + + +
+ The task of video inpainting detection is to expose the pixel-level inpainted regions within a video sequence. Existing methods usually focus on leveraging spatial and temporal inconsistencies. However, these methods typically employ fixed operations to combine spatial and temporal clues, limiting their applicability in different scenarios. In this paper, we introduce a novel Multilateral Temporal-view Pyramid Transformer ({\em MumPy}) that flexibly coordinates spatial-temporal clues. Our method utilizes a newly designed multilateral temporal-view encoder to extract various collaborations of spatial-temporal clues and introduces a deformable window-based temporal-view interaction module to enhance the diversity of these collaborations. Subsequently, we develop a multi-pyramid decoder to aggregate the various types of features and generate detection maps. By adjusting the contribution strength of spatial and temporal clues, our method can effectively identify inpainted regions. We validate our method on existing datasets and also introduce a new challenging and large-scale Video Inpainting dataset based on the YouTube-VOS dataset, which employs several more recent inpainting methods. The results demonstrate the superiority of our method in both in-domain and cross-domain evaluation scenarios. + +
+
+
+
+
+ + ♻ ☆ Texture-aware and Shape-guided Transformer for Sequential DeepFake + Detection + + +
+ Sequential DeepFake detection is an emerging task that aims to predict the +manipulation sequence in order. Existing methods typically formulate it as an +image-to-sequence problem, employing conventional Transformer architectures for +detection. However, these methods lack dedicated design and consequently result +in limited performance. In this paper, we propose a novel Texture-aware and +Shape-guided Transformer to enhance detection performance. Our method features +four major improvements. Firstly, we describe a texture-aware branch that +effectively captures subtle manipulation traces with the Diversiform Pixel +Difference Attention module. Then we introduce a Bidirectional Interaction +Cross-attention module that seeks deep correlations among spatial and +sequential features, enabling effective modeling of complex manipulation +traces. To further enhance the cross-attention, we describe a Shape-guided +Gaussian mapping strategy, providing initial priors of the manipulation shape. +Finally, observing that the latter manipulation in a sequence may influence +traces left in the earlier one, we intriguingly invert the prediction order +from forward to backward, leading to notable gains as expected. Extensive +experimental results demonstrate that our method outperforms others by a large +margin, highlighting the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ FreqBlender: Enhancing DeepFake Detection by Blending Frequency + Knowledge + + +
+ Generating synthetic fake faces, known as pseudo-fake faces, is an effective +way to improve the generalization of DeepFake detection. Existing methods +typically generate these faces by blending real or fake faces in color space. +While these methods have shown promise, they overlook the simulation of +frequency distribution in pseudo-fake faces, limiting the learning of generic +forgery traces in-depth. To address this, this paper introduces {\em +FreqBlender}, a new method that can generate pseudo-fake faces by blending +frequency knowledge. Specifically, we investigate the major frequency +components and propose a Frequency Parsing Network to adaptively partition +frequency components related to forgery traces. Then we blend this frequency +knowledge from fake faces into real faces to generate pseudo-fake faces. Since +there is no ground truth for frequency components, we describe a dedicated +training strategy by leveraging the inner correlations among different +frequency knowledge to instruct the learning process. Experimental results +demonstrate the effectiveness of our method in enhancing DeepFake detection, +making it a potential plug-and-play strategy for other methods. + +
+
+
+
+
+ + ♻ ☆ Configurable Learned Holography + + +
+ In the pursuit of advancing holographic display technology, we face a unique yet persistent roadblock: the inflexibility of learned holography in adapting to various hardware configurations. This is due to the variances in the complex optical components and system settings in existing holographic displays. Although the emerging learned approaches have enabled rapid and high-quality hologram generation, any alteration in display hardware still requires a retraining of the model. Our work introduces a configurable learned model that interactively computes 3D holograms from RGB-only 2D images for a variety of holographic displays. The model can be conditioned on predefined hardware parameters of existing holographic displays such as working wavelengths, pixel pitch, propagation distance, and peak brightness without having to retrain. In addition, our model accommodates various hologram types, including conventional single-color and emerging multi-color holograms that simultaneously use multiple color primaries in holographic displays. Notably, for the first time in the literature, our hologram computations exploit the correlation between the depth estimation and 3D hologram synthesis tasks within the learning domain. We employ knowledge distillation via a student-teacher learning strategy to streamline our model for interactive performance, achieving up to a 2x speed improvement compared to state-of-the-art models while consistently generating high-quality 3D holograms with different hardware configurations. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Mocap Everyone Everywhere: Lightweight Motion Capture With Smartwatches + and a Head-Mounted Camera CVPR 2024 + + +
+ We present a lightweight and affordable motion capture method based on two +smartwatches and a head-mounted camera. In contrast to the existing approaches +that use six or more expert-level IMU devices, our approach is much more +cost-effective and convenient. Our method can make wearable motion capture +accessible to everyone everywhere, enabling 3D full-body motion capture in +diverse environments. As a key idea to overcome the extreme sparsity and +ambiguities of sensor inputs with different modalities, we integrate 6D head +poses obtained from the head-mounted cameras for motion estimation. To enable +capture in expansive indoor and outdoor scenes, we propose an algorithm to +track and update floor level changes to define head poses, coupled with a +multi-stage Transformer-based regression module. We also introduce novel +strategies leveraging visual cues of egocentric images to further enhance the +motion capture quality while reducing ambiguities. We demonstrate the +performance of our method on various challenging scenarios, including complex +outdoor environments and everyday motions including object interactions and +social interactions among multiple individuals. + +
+
+ comment: Accepted to CVPR 2024; Project page: + https://jiyewise.github.io/projects/MocapEvery/ +
+
+
+
+
+ + ♻ ☆ Training-Free Deepfake Voice Recognition by Leveraging Large-Scale + Pre-Trained Models + + +
+ Generalization is a main issue for current audio deepfake detectors, which +struggle to provide reliable results on out-of-distribution data. Given the +speed at which more and more accurate synthesis methods are developed, it is +very important to design techniques that work well also on data they were not +trained for. In this paper we study the potential of large-scale pre-trained +models for audio deepfake detection, with special focus on generalization +ability. To this end, the detection problem is reformulated in a speaker +verification framework and fake audios are exposed by the mismatch between the +voice sample under test and the voice of the claimed identity. With this +paradigm, no fake speech sample is necessary in training, cutting off any link +with the generation method at the root, and ensuring full generalization +ability. Features are extracted by general-purpose large pre-trained models, +with no need for training or fine-tuning on specific fake detection or speaker +verification datasets. At detection time only a limited set of voice fragments +of the identity under test is required. Experiments on several datasets +widespread in the community show that detectors based on pre-trained models +achieve excellent performance and show strong generalization ability, rivaling +supervised methods on in-distribution data and largely overcoming them on +out-of-distribution data. + +
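+ The speaker-verification reformulation described above boils down to comparing the test clip's embedding with embeddings of enrollment clips of the claimed identity; a minimal sketch, where the embedding function is a placeholder standing in for a large pre-trained speech model:
+
+ ```python
+ import numpy as np
+
+ def embed(audio):
+     """Placeholder for a large pre-trained speech embedder (deterministic per clip)."""
+     rng = np.random.default_rng(abs(hash(audio.tobytes())) % (2**32))
+     return rng.standard_normal(256)
+
+ def cosine(a, b):
+     return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
+
+ def is_fake(test_clip, enrollment_clips, threshold=0.5):
+     """Flag the test clip as fake if it does not match the claimed identity's voice."""
+     reference = np.mean([embed(c) for c in enrollment_clips], axis=0)
+     return cosine(embed(test_clip), reference) < threshold
+
+ # Toy usage with random waveforms standing in for real audio.
+ enroll = [np.random.randn(16000) for _ in range(3)]
+ test = np.random.randn(16000)
+ print(is_fake(test, enroll))
+ ```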
+
+
+
+
+ + ♻ ☆ UP-CrackNet: Unsupervised Pixel-Wise Road Crack Detection via + Adversarial Image Restoration + + +
+ Over the past decade, automated methods have been developed to detect cracks +more efficiently, accurately, and objectively, with the ultimate goal of +replacing conventional manual visual inspection techniques. Among these +methods, semantic segmentation algorithms have demonstrated promising results +in pixel-wise crack detection tasks. However, training such networks requires a +large amount of human-annotated datasets with pixel-level annotations, which is +a highly labor-intensive and time-consuming process. Moreover, supervised +learning-based methods often struggle with poor generalizability in unseen +datasets. Therefore, we propose an unsupervised pixel-wise road crack detection +network, known as UP-CrackNet. Our approach first generates multi-scale square +masks and randomly selects them to corrupt undamaged road images by removing +certain regions. Subsequently, a generative adversarial network is trained to +restore the corrupted regions by leveraging the semantic context learned from +surrounding uncorrupted regions. During the testing phase, an error map is +generated by calculating the difference between the input and restored images, +which allows for pixel-wise crack detection. Our comprehensive experimental +results demonstrate that UP-CrackNet outperforms other general-purpose +unsupervised anomaly detection algorithms, and exhibits satisfactory +performance and superior generalizability when compared with state-of-the-art +supervised crack segmentation algorithms. Our source code is publicly available +at mias.group/UP-CrackNet. + +
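+ The test-time step described above, deriving a pixel-wise crack mask from the difference between the input and its restoration, reduces to a small computation; a minimal sketch with a hypothetical fixed threshold (the paper's pipeline and post-processing may differ):
+
+ ```python
+ import numpy as np
+
+ def crack_error_map(input_img, restored_img, threshold=0.15):
+     """Pixel-wise crack mask from the discrepancy between input and restoration.
+     Both images are float arrays in [0, 1] with shape (H, W, C)."""
+     error = np.abs(input_img.astype(np.float32) - restored_img.astype(np.float32))
+     error = error.mean(axis=-1)              # average over color channels
+     return (error > threshold).astype(np.uint8)
+
+ # Toy usage: the "restored" image misses a dark crack present in the input.
+ img = np.full((64, 64, 3), 0.8, dtype=np.float32)
+ img[30:34, 5:60] = 0.1                       # synthetic crack
+ restored = np.full((64, 64, 3), 0.8, dtype=np.float32)
+ mask = crack_error_map(img, restored)
+ print("crack pixels detected:", int(mask.sum()))
+ ```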
+
+
+
+
+ + ♻ ☆ A Deep Model for Partial Multi-Label Image Classification with + Curriculum Based Disambiguation + + +
+ In this paper, we study the partial multi-label (PML) image classification problem, where each image is annotated with a candidate label set consisting of multiple relevant labels and other noisy labels. Existing PML methods typically design a disambiguation strategy to filter out noisy labels by utilizing prior knowledge with extra assumptions, which unfortunately is unavailable in many real tasks. Furthermore, because the objective function for disambiguation is usually elaborately designed on the whole training set, it can hardly be optimized in a deep model with SGD on mini-batches. In this paper, for the first time we propose a deep model for PML to enhance the representation and discrimination ability. On one hand, we propose a novel curriculum-based disambiguation strategy to progressively identify ground-truth labels by incorporating the varied difficulties of different classes. On the other hand, a consistency regularization is introduced for model retraining to balance fitting identified easy labels and exploiting potential relevant labels. Extensive experimental results on the commonly used benchmark datasets show that the proposed method significantly outperforms the SOTA methods. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ LatentForensics: Towards frugal deepfake detection in the StyleGAN + latent space + + +
+ The classification of forged videos has been a challenge for the past few +years. Deepfake classifiers can now reliably predict whether or not video +frames have been tampered with. However, their performance is tied to both the +dataset used for training and the analyst's computational power. We propose a +deepfake detection method that operates in the latent space of a +state-of-the-art generative adversarial network (GAN) trained on high-quality +face images. The proposed method leverages the structure of the latent space of +StyleGAN to learn a lightweight binary classification model. Experimental +results on standard datasets reveal that the proposed approach outperforms +other state-of-the-art deepfake classification methods, especially in contexts +where the data available to train the models is rare, such as when a new +manipulation method is introduced. To the best of our knowledge, this is the +first study showing the interest of the latent space of StyleGAN for deepfake +classification. Combined with other recent studies on the interpretation and +manipulation of this latent space, we believe that the proposed approach can +further help in developing frugal deepfake classification methods based on +interpretable high-level properties of face images. + +
+
+ comment: 7 pages, 3 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Purify Unlearnable Examples via Rate-Constrained Variational + Autoencoders ICML 2024 + + +
+ Unlearnable examples (UEs) seek to maximize testing error by making subtle modifications to training examples that are correctly labeled. Defenses against these poisoning attacks can be categorized based on whether specific interventions are adopted during training. The first approach is training-time defense, such as adversarial training, which can mitigate poisoning effects but is computationally intensive. The other approach is pre-training purification, e.g., image short squeezing, which consists of several simple compressions but often encounters challenges in dealing with various UEs. Our work provides a novel disentanglement mechanism to build an efficient pre-training purification method. Firstly, we find that rate-constrained variational autoencoders (VAEs) demonstrate a clear tendency to suppress the perturbations in UEs. We subsequently conduct a theoretical analysis of this phenomenon. Building upon these insights, we introduce a disentangle variational autoencoder (D-VAE), capable of disentangling the perturbations with learnable class-wise embeddings. Based on this network, a two-stage purification approach is naturally developed. The first stage focuses on roughly eliminating perturbations, while the second stage produces refined, poison-free results, ensuring effectiveness and robustness across various scenarios. Extensive experiments demonstrate the remarkable performance of our method across CIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at https://github.com/yuyi-sd/D-VAE. + +
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ FurniScene: A Large-scale 3D Room Dataset with Intricate Furnishing + Scenes + + +
+ Indoor scene generation has attracted significant attention recently as it is +crucial for applications of gaming, virtual reality, and interior design. +Current indoor scene generation methods can produce reasonable room layouts but +often lack diversity and realism. This is primarily due to the limited coverage +of existing datasets, including only large furniture without tiny furnishings +in daily life. To address these challenges, we propose FurniScene, a +large-scale 3D room dataset with intricate furnishing scenes from interior +design professionals. Specifically, the FurniScene consists of 11,698 rooms and +39,691 unique furniture CAD models with 89 different types, covering things +from large beds to small teacups on the coffee table. To better suit +fine-grained indoor scene layout generation, we introduce a novel Two-Stage +Diffusion Scene Model (TSDSM) and conduct an evaluation benchmark for various +indoor scene generation based on FurniScene. Quantitative and qualitative +evaluations demonstrate the capability of our method to generate highly +realistic indoor scenes. Our dataset and code will be publicly available soon. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Dynamics Prediction with Object-Centric Kinematics + + +
+ Human perception involves discerning complex multi-object scenes into time-static object appearance (i.e., size, shape, color) and time-varying object motion (i.e., location, velocity, acceleration). This innate ability to unconsciously understand the environment is the motivation behind the success of dynamics modeling. Object-centric representations have emerged as a promising tool for dynamics prediction, yet they primarily focus on the objects' appearance, often overlooking other crucial attributes. In this paper, we propose Object-Centric Kinematics (OCK), a framework for dynamics prediction leveraging object-centric representations. Our model utilizes a novel component named object kinematics, which comprises low-level structured states of objects' position, velocity, and acceleration. The object kinematics are obtained via either implicit or explicit approaches, enabling comprehensive spatiotemporal object reasoning, and integrated through various transformer mechanisms, facilitating effective object-centric dynamics modeling. Our model demonstrates superior performance when handling objects and backgrounds in complex scenes characterized by a wide range of object attributes and dynamic movements. Moreover, our model demonstrates generalization capabilities across diverse synthetic environments, highlighting its potential for broad applicability in vision-related tasks. + +
+
+ comment: 15 pages, 6 figures, 4 tables +
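+ The explicit route to object kinematics mentioned above can be approximated by finite differences over tracked object positions; a minimal, hypothetical sketch (the paper also supports implicitly learned kinematics and integrates these states through transformers):
+
+ ```python
+ import numpy as np
+
+ def explicit_kinematics(positions, dt=1.0):
+     """positions: (T, K, 2) xy-locations of K objects over T frames.
+     Returns (T, K, 6) per-object [position, velocity, acceleration] states."""
+     velocity = np.gradient(positions, dt, axis=0)
+     acceleration = np.gradient(velocity, dt, axis=0)
+     return np.concatenate([positions, velocity, acceleration], axis=-1)
+
+ # Toy usage: two objects, one static and one moving at constant velocity.
+ T = 5
+ traj = np.zeros((T, 2, 2))
+ traj[:, 1, 0] = np.arange(T)                 # object 1 moves along x
+ states = explicit_kinematics(traj)
+ print(states.shape, states[2, 1])            # (5, 2, 6); velocity ~1 along x, accel ~0
+ ```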
+
+
+
+
+ + ♻ ☆ Direct-a-Video: Customized Video Generation with User-Directed Camera + Movement and Object Motion + + +
+ Recent text-to-video diffusion models have achieved impressive progress. In +practice, users often desire the ability to control object motion and camera +movement independently for customized video creation. However, current methods +lack the focus on separately controlling object motion and camera movement in a +decoupled manner, which limits the controllability and flexibility of +text-to-video models. In this paper, we introduce Direct-a-Video, a system that +allows users to independently specify motions for multiple objects as well as +camera's pan and zoom movements, as if directing a video. We propose a simple +yet effective strategy for the decoupled control of object motion and camera +movement. Object motion is controlled through spatial cross-attention +modulation using the model's inherent priors, requiring no additional +optimization. For camera movement, we introduce new temporal cross-attention +layers to interpret quantitative camera movement parameters. We further employ +an augmentation-based approach to train these layers in a self-supervised +manner on a small-scale dataset, eliminating the need for explicit motion +annotation. Both components operate independently, allowing individual or +combined control, and can generalize to open-domain scenarios. Extensive +experiments demonstrate the superiority and effectiveness of our method. +Project page and code are available at https://direct-a-video.github.io/. + +
+
+
+
+
+ + ♻ ☆ Elucidating the Design Space of Dataset Condensation + + +
+ Dataset condensation, a concept within data-centric learning, efficiently +transfers critical attributes from an original dataset to a synthetic version, +maintaining both diversity and realism. This approach significantly improves +model training efficiency and is adaptable across multiple application areas. +Previous methods in dataset condensation have faced challenges: some incur high +computational costs which limit scalability to larger datasets (e.g., MTT, +DREAM, and TESLA), while others are restricted to less optimal design spaces, +which could hinder potential improvements, especially in smaller datasets +(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a +comprehensive design framework that includes specific, effective strategies +like implementing soft category-aware matching and adjusting the learning rate +schedule. These strategies are grounded in empirical evidence and theoretical +backing. Our resulting approach, Elucidate Dataset Condensation (EDC), +establishes a benchmark for both small and large-scale dataset condensation. In +our testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on +ImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a +compression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM, +and RDED by margins of 27.3%, 17.2%, and 6.6%, respectively. + +
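+ For context on the reported numbers, the 0.78% compression ratio follows directly from the IPC (images per class) setting, assuming the standard ImageNet-1k training-set size of roughly 1.28M images:
+
+ ```python
+ ipc = 10                      # images per class in the condensed set
+ num_classes = 1000            # ImageNet-1k
+ train_images = 1_281_167      # standard ImageNet-1k training-set size (assumption)
+
+ condensed = ipc * num_classes
+ ratio = condensed / train_images
+ print(f"{condensed} synthetic images -> compression ratio {ratio:.2%}")  # ~0.78%
+ ```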
+
+
+
+
+ + ♻ ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing + + +
+ Effective editing of personal content holds a pivotal role in enabling +individuals to express their creativity, weaving captivating narratives within +their visual stories, and elevate the overall quality and impact of their +visual content. Therefore, in this work, we introduce SwapAnything, a novel +framework that can swap any objects in an image with personalized concepts +given by the reference, while keeping the context unchanged. Compared with +existing methods for personalized subject swapping, SwapAnything has three +unique advantages: (1) precise control of arbitrary objects and parts rather +than the main subject, (2) more faithful preservation of context pixels, (3) +better adaptation of the personalized concept to the image. First, we propose +targeted variable swapping to apply region control over latent feature maps and +swap masked variables for faithful context preservation and initial semantic +concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt +the semantic concept into the original image in terms of target location, +shape, style, and content during the image generation process. Extensive +results on both human and automatic evaluation demonstrate significant +improvements of our approach over baseline methods on personalized swapping. +Furthermore, SwapAnything shows its precise and faithful swapping abilities +across single object, multiple objects, partial object, and cross-domain +swapping tasks. SwapAnything also achieves great performance on text-based +swapping and tasks beyond swapping such as object insertion. + +
+
+ comment: 18 pages, 16 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ AADNet: Attention aware Demoiréing Network + + +
+ Moiré patterns frequently appear in photographs captured with mobile devices and digital cameras, potentially degrading image quality. Despite recent advancements in computer vision, image demoiréing remains a challenging task due to the dynamic textures and variations in colour, shape, and frequency of moiré patterns. Most existing methods struggle to generalize to unseen datasets, limiting their effectiveness in removing moiré patterns from real-world scenarios. In this paper, we propose a novel lightweight architecture, AADNet (Attention Aware Demoiréing Network), for high-resolution image demoiréing that works effectively across different frequency bands and generalizes well to unseen datasets. Extensive experiments conducted on the UHDM dataset validate the effectiveness of our approach, resulting in high-fidelity images. + +
+
+ comment: Due to unauthorized access and upload, this paper has been withdrawn. + It does not reflect the contributions or approval +
+
+
+
+
+ + ♻ ☆ Learning Spatial Features from Audio-Visual Correspondence in Egocentric + Videos CVPR 2024 + + +
+ We propose a self-supervised method for learning representations based on +spatial audio-visual correspondences in egocentric videos. Our method uses a +masked auto-encoding framework to synthesize masked binaural (multi-channel) +audio through the synergy of audio and vision, thereby learning useful spatial +relationships between the two modalities. We use our pretrained features to +tackle two downstream video tasks requiring spatial understanding in social +scenarios: active speaker detection and spatial audio denoising. Through +extensive experiments, we show that our features are generic enough to improve +over multiple state-of-the-art baselines on both tasks on two challenging +egocentric video datasets that offer binaural audio, EgoCom and EasyCom. +Project: http://vision.cs.utexas.edu/projects/ego_av_corr. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Video Instance Shadow Detection + + +
+ Instance shadow detection, crucial for applications such as photo editing and +light direction estimation, has undergone significant advancements in +predicting shadow instances, object instances, and their associations. The +extension of this task to videos presents challenges in annotating diverse +video data and addressing complexities arising from occlusion and temporary +disappearances within associations. In response to these challenges, we +introduce ViShadow, a semi-supervised video instance shadow detection framework +that leverages both labeled image data and unlabeled video data for training. +ViShadow features a two-stage training pipeline: the first stage, utilizing +labeled image data, identifies shadow and object instances through contrastive +learning for cross-frame pairing. The second stage employs unlabeled videos, +incorporating an associated cycle consistency loss to enhance tracking ability. +A retrieval mechanism is introduced to manage temporary disappearances, +ensuring tracking continuity. The SOBA-VID dataset, comprising unlabeled +training videos and labeled testing videos, along with the SOAP-VID metric, is +introduced for the quantitative evaluation of VISD solutions. The effectiveness +of ViShadow is further demonstrated through various video-level applications +such as video inpainting, instance cloning, shadow editing, and text-instructed +shadow-object manipulation. + +
+
+
+
+
+ + ♻ ☆ Adversarial Examples Are Not Real Features NeurIPS 2023 + + +
+ The existence of adversarial examples has been a mystery for years and +attracted much interest. A well-known theory by \citet{ilyas2019adversarial} +explains adversarial vulnerability from a data perspective by showing that one +can extract non-robust features from adversarial examples and these features +alone are useful for classification. However, the explanation remains quite +counter-intuitive since non-robust features are mostly noise features to +humans. In this paper, we re-examine the theory from a larger context by +incorporating multiple learning paradigms. Notably, we find that contrary to +their good usefulness under supervised learning, non-robust features attain +poor usefulness when transferred to other self-supervised learning paradigms, +such as contrastive learning, masked image modeling, and diffusion models. It +reveals that non-robust features are not really as useful as robust or natural +features that enjoy good transferability between these paradigms. Meanwhile, +for robustness, we also show that naturally trained encoders from robust +features are largely non-robust under AutoAttack. Our cross-paradigm +examination suggests that the non-robust features are not really useful but +more like paradigm-wise shortcuts, and robust features alone might be +insufficient to attain reliable model robustness. Code is available at +\url{https://github.com/PKU-ML/AdvNotRealFeatures}. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ A Survey on Hallucination in Large Vision-Language Models + + +
+ Recent development of Large Vision-Language Models (LVLMs) has attracted +growing attention within the AI landscape for its practical implementation +potential. However, ``hallucination'', or more specifically, the misalignment +between factual visual content and corresponding textual generation, poses a +significant challenge of utilizing LVLMs. In this comprehensive survey, we +dissect LVLM-related hallucinations in an attempt to establish an overview and +facilitate future mitigation. Our scrutiny starts with a clarification of the +concept of hallucinations in LVLMs, presenting a variety of hallucination +symptoms and highlighting the unique challenges inherent in LVLM +hallucinations. Subsequently, we outline the benchmarks and methodologies +tailored specifically for evaluating hallucinations unique to LVLMs. +Additionally, we delve into an investigation of the root causes of these +hallucinations, encompassing insights from the training data and model +components. We also critically review existing methods for mitigating +hallucinations. The open questions and future directions pertaining to +hallucinations within LVLMs are discussed to conclude this survey. + +
+
+
+
+
+ + ♻ ☆ IFNet: Deep Imaging and Focusing for Handheld SAR with Millimeter-wave + Signals + + +
+ Recent advancements have showcased the potential of handheld millimeter-wave +(mmWave) imaging, which applies synthetic aperture radar (SAR) principles in +portable settings. However, existing studies addressing handheld motion errors +either rely on costly tracking devices or employ simplified imaging models, +leading to impractical deployment or limited performance. In this paper, we +present IFNet, a novel deep unfolding network that combines the strengths of +signal processing models and deep neural networks to achieve robust imaging and +focusing for handheld mmWave systems. We first formulate the handheld imaging +model by integrating multiple priors about mmWave images and handheld phase +errors. Furthermore, we transform the optimization processes into an iterative +network structure for improved and efficient imaging performance. Extensive +experiments demonstrate that IFNet effectively compensates for handheld phase +errors and recovers high-fidelity images from severely distorted signals. In +comparison with existing methods, IFNet can achieve at least 11.89 dB +improvement in average peak signal-to-noise ratio (PSNR) and 64.91% improvement +in average structural similarity index measure (SSIM) on a real-world dataset. + +
+
+
+
+
+ + ♻ ☆ SOAR: Advancements in Small Body Object Detection for Aerial Imagery + Using State Space Models and Programmable Gradients + + +
+ Small object detection in aerial imagery presents significant challenges in computer vision due to the minimal data inherent in small-sized objects and their propensity to be obscured by larger objects and background noise. Traditional methods using transformer-based models often face limitations stemming from the lack of specialized databases, which adversely affects their performance with objects of varying orientations and scales. This underscores the need for more adaptable, lightweight models. In response, this paper introduces two innovative approaches that significantly enhance detection and segmentation capabilities for small aerial objects. First, we explore the use of the SAHI framework on the newly introduced lightweight YOLO v9 architecture, which utilizes Programmable Gradient Information (PGI) to reduce the substantial information loss typically encountered in sequential feature extraction processes. Second, we employ the Vision Mamba model, which incorporates position embeddings to facilitate precise location-aware visual understanding, combined with a novel bidirectional State Space Model (SSM) for effective visual context modeling. This State Space Model adeptly harnesses the linear complexity of CNNs and the global receptive field of Transformers, making it particularly effective in remote sensing image classification. Our experimental results demonstrate substantial improvements in detection accuracy and processing efficiency, validating the applicability of these approaches for real-time small object detection across diverse aerial scenarios. This paper also discusses how these methodologies could serve as foundational models for future advancements in aerial object recognition technologies. The source code will be made accessible here. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Unified Model Selection Technique for Spectral Clustering Based Motion + Segmentation + + +
+ Motion segmentation is a fundamental problem in computer vision and is crucial in various applications such as robotics, autonomous driving and action recognition. Recently, spectral clustering based methods have shown impressive results on motion segmentation in dynamic environments. These methods perform spectral clustering on motion affinity matrices to cluster objects or point trajectories in the scene into different motion groups. However, existing methods often need the number of motions present in the scene to be known, which significantly reduces their practicality. In this paper, we propose a unified model selection technique to automatically infer the number of motion groups for spectral clustering based motion segmentation methods by combining different existing model selection techniques together. We evaluate our method on the KT3DMoSeg dataset and achieve competitive results compared to the baseline where the number of clusters is given as ground-truth information. + +
+
+ comment: for the published version, see + https://openjournals.uwaterloo.ca/index.php/vsl/article/view/5870/5922 +
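+ One classical criterion that such a unified model-selection scheme can draw on is the eigengap of the normalized Laplacian of the motion affinity matrix; a minimal, hypothetical sketch (the paper combines several selection techniques, not this heuristic alone):
+
+ ```python
+ import numpy as np
+
+ def eigengap_num_clusters(affinity, max_k=10):
+     """Estimate the number of motion groups from the normalized-Laplacian spectrum."""
+     d = affinity.sum(axis=1)
+     d_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(d, 1e-12)))
+     laplacian = np.eye(len(affinity)) - d_inv_sqrt @ affinity @ d_inv_sqrt
+     eigvals = np.sort(np.linalg.eigvalsh(laplacian))[:max_k + 1]
+     gaps = np.diff(eigvals)                    # the largest gap marks the cluster count
+     return int(np.argmax(gaps)) + 1
+
+ # Toy affinity with two well-separated groups of trajectories.
+ block = np.ones((5, 5))
+ affinity = np.block([[block, 0.01 * np.ones((5, 5))],
+                      [0.01 * np.ones((5, 5)), block]])
+ print(eigengap_num_clusters(affinity))        # expected: 2
+ ```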
+
+
+
+
+ + ♻ ☆ Score identity Distillation: Exponentially Fast Distillation of + Pretrained Diffusion Models for One-Step Generation ICML 2024 + + +
+ We introduce Score identity Distillation (SiD), an innovative data-free +method that distills the generative capabilities of pretrained diffusion models +into a single-step generator. SiD not only facilitates an exponentially fast +reduction in Fr\'echet inception distance (FID) during distillation but also +approaches or even exceeds the FID performance of the original teacher +diffusion models. By reformulating forward diffusion processes as semi-implicit +distributions, we leverage three score-related identities to create an +innovative loss mechanism. This mechanism achieves rapid FID reduction by +training the generator using its own synthesized images, eliminating the need +for real data or reverse-diffusion-based generation, all accomplished within +significantly shortened generation time. Upon evaluation across four benchmark +datasets, the SiD algorithm demonstrates high iteration efficiency during +distillation and surpasses competing distillation approaches, whether they are +one-step or few-step, data-free, or dependent on training data, in terms of +generation quality. This achievement not only redefines the benchmarks for +efficiency and effectiveness in diffusion distillation but also in the broader +field of diffusion-based generation. The PyTorch implementation is available at +https://github.com/mingyuanzhou/SiD + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ VTON-IT: Virtual Try-On using Image Translation + + +
+ Virtual Try-On (trying clothes virtually) is a promising application of the Generative Adversarial Network (GAN). However, it is an arduous task to transfer the desired clothing item onto the corresponding regions of a human body because of varying body sizes, poses, and occlusions like hair and overlapped clothes. In this paper, we try to produce photo-realistic translated images through semantic segmentation and a generative adversarial architecture-based image translation network. We present a novel image-based Virtual Try-On application, VTON-IT, that takes an RGB image, segments the desired body part, and overlays the target cloth over the segmented body region. Most state-of-the-art GAN-based Virtual Try-On applications produce unaligned pixelated synthesis images on real-life test images. However, our approach generates high-resolution natural images with detailed textures on such varied images. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Titles: Video Game Identification by Screenshots using + Convolutional Neural Networks + + +
+ This paper investigates video game identification through single screenshots, +utilizing five convolutional neural network (CNN) architectures (MobileNet, +DenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home +console systems, spanning from Atari 2600 to PlayStation 5, totalling 8,796 +games and 170,881 screenshots. Confirming the hypothesis, CNNs autonomously +extract image features, enabling the identification of game titles from +screenshots without additional features. Using ImageNet pre-trained weights as +initial weights, EfficientNetB3 achieves the highest average accuracy (74.51%), +while DenseNet169 excels in 14 of the 22 systems. Employing alternative initial +weights trained in an arcade screenshots dataset boosts accuracy for +EfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy of +76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on +average. Overall, the combination of optimal architecture and weights attains +77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings +underscore the efficacy of CNNs in video game identification through +screenshots. + +
+
+
+
+
+ + ♻ ☆ Validating polyp and instrument segmentation methods in colonoscopy + through Medico 2020 and MedAI 2021 Challenges + + +
+ Automatic analysis of colonoscopy images has been an active field of research motivated by the importance of early detection of precancerous polyps. However, detecting polyps during the live examination can be challenging due to various factors such as variation of skills and experience among the endoscopists, lack of attentiveness, and fatigue leading to a high polyp miss-rate. Deep learning has emerged as a promising solution to this challenge as it can assist endoscopists in detecting and classifying overlooked polyps and abnormalities in real time. In addition to the algorithm's accuracy, transparency and interpretability are crucial to explaining the whys and hows of the algorithm's prediction. Further, most algorithms are developed on private data, closed source, or proprietary software, and the methods lack reproducibility. Therefore, to promote the development of efficient and transparent methods, we have organized the "Medico automatic polyp segmentation (Medico 2020)" and "MedAI: Transparency in Medical Image Segmentation (MedAI 2021)" competitions. We present a comprehensive summary and analyze each contribution, highlight the strengths of the best-performing methods, and discuss the possibility of clinical translation of such methods into the clinic. For the transparency task, a multi-disciplinary team, including expert gastroenterologists, assessed each submission and evaluated the teams based on open-source practices, failure case analysis, ablation studies, and the usability and understandability of their evaluations, to gain a deeper understanding of the models' credibility for clinical deployment. Through the comprehensive analysis of the challenge, we not only highlight the advancements in polyp and surgical instrument segmentation but also encourage qualitative evaluation for building more transparent and understandable AI-based colonoscopy systems. + +
+
+
+
+
+ + ♻ ☆ Sub-token ViT Embedding via Stochastic Resonance Transformers + + +
+ Vision Transformer (ViT) architectures represent images as collections of +high-dimensional vectorized tokens, each corresponding to a rectangular +non-overlapping patch. This representation trades spatial granularity for +embedding dimensionality, and results in semantically rich but spatially +coarsely quantized feature maps. In order to retrieve spatial details +beneficial to fine-grained inference tasks we propose a training-free method +inspired by "stochastic resonance". Specifically, we perform sub-token spatial +transformations to the input data, and aggregate the resulting ViT features +after applying the inverse transformation. The resulting "Stochastic Resonance +Transformer" (SRT) retains the rich semantic information of the original +representation, but grounds it on a finer-scale spatial domain, partly +mitigating the coarse effect of spatial tokenization. SRT is applicable across +any layer of any ViT architecture, consistently boosting performance on several +tasks including segmentation, classification, depth estimation, and others by +up to 14.9% without the need for any fine-tuning. + +
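+ The core operation described above, perturb the input below the patch (token) scale, re-run the frozen ViT, undo the perturbation on the features, and average, can be sketched as follows; the use of timm, the specific offsets, and bilinear upsampling of the token map are illustrative assumptions rather than the paper's exact aggregation:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ import timm
+
+ model = timm.create_model("vit_base_patch16_224", pretrained=False)
+ model.eval()
+
+ @torch.no_grad()
+ def srt_style_features(image, shifts=((0, 0), (0, 8), (8, 0), (8, 8))):
+     """image: (1, 3, 224, 224). Returns a pixel-resolution feature map averaged
+     over sub-token (sub-patch) translations of the input."""
+     accum = 0.0
+     for dy, dx in shifts:
+         shifted = torch.roll(image, shifts=(dy, dx), dims=(2, 3))
+         tokens = model.forward_features(shifted)[:, 1:]        # assumes [CLS] + patch tokens
+         fmap = tokens.transpose(1, 2).reshape(1, -1, 14, 14)   # (1, D, 14, 14)
+         fmap = F.interpolate(fmap, size=image.shape[2:], mode="bilinear",
+                              align_corners=False)
+         accum = accum + torch.roll(fmap, shifts=(-dy, -dx), dims=(2, 3))
+     return accum / len(shifts)
+
+ feats = srt_style_features(torch.randn(1, 3, 224, 224))
+ print(feats.shape)  # torch.Size([1, 768, 224, 224])
+ ```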
+
+
+
+
+ + ♻ ☆ SAR image matching algorithm based on multi-class features + + +
+ Synthetic aperture radar (SAR) can operate around the clock and therefore has high application value. We propose a new SAR image matching algorithm based on multi-class features, mainly using two different types of features, straight lines and regions, to enhance the robustness of the matching algorithm. Building on prior knowledge of the images, the algorithm combines LSD (Line Segment Detector) line detection with template matching; by analyzing the attribute correlation between line and region features in SAR images and selecting suitable line and region features for matching, it improves the matching accuracy between SAR and visible-light images and reduces the probability of matching errors. The experimental results verify that this algorithm obtains high-precision matching results, achieves precise target positioning, and is robust to changes in viewpoint and lighting. The results are accurate and false positives are controllable. + +
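+ As a rough, hypothetical illustration of pairing line features with region (template) matching in OpenCV (this is not the paper's algorithm, and createLineSegmentDetector requires an OpenCV build that ships LSD):
+
+ ```python
+ import cv2
+ import numpy as np
+
+ def detect_lines(gray):
+     """Straight-line features via the LSD detector (OpenCV build must include it)."""
+     lsd = cv2.createLineSegmentDetector()
+     lines, _, _, _ = lsd.detect(gray)
+     return lines if lines is not None else np.empty((0, 1, 4), dtype=np.float32)
+
+ def match_region(search_img, template):
+     """Region feature matching via normalized cross-correlation."""
+     result = cv2.matchTemplate(search_img, template, cv2.TM_CCOEFF_NORMED)
+     _, max_val, _, max_loc = cv2.minMaxLoc(result)
+     return max_loc, max_val
+
+ # Toy usage on synthetic images standing in for SAR / visible-light data.
+ sar = np.random.randint(0, 255, (256, 256), dtype=np.uint8)
+ cv2.line(sar, (20, 20), (200, 60), 255, 2)
+ template = sar[40:80, 60:120].copy()
+ print("lines found:", len(detect_lines(sar)))
+ print("best region match:", match_region(sar, template))
+ ```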
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 67 + +
+
+
+ + ☆ Multi-hop graph transformer network for 3D human pose estimation + + +
+ Accurate 3D human pose estimation is a challenging task due to occlusion and depth ambiguity. In this paper, we introduce a multi-hop graph transformer network designed for 2D-to-3D human pose estimation in videos by leveraging the strengths of multi-head self-attention and multi-hop graph convolutional networks with disentangled neighborhoods to capture spatio-temporal dependencies and handle long-range interactions. The proposed network architecture consists of a graph attention block composed of stacked layers of multi-head self-attention and graph convolution with a learnable adjacency matrix, and a multi-hop graph convolutional block comprising multi-hop convolutional and dilated convolutional layers. The combination of multi-head self-attention and multi-hop graph convolutional layers enables the model to capture both local and global dependencies, while the integration of dilated convolutional layers enhances the model's ability to handle the spatial details required for accurate localization of human body joints. Extensive experiments demonstrate the effectiveness and generalization ability of our model, achieving competitive performance on benchmark datasets. + +
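+ A minimal, hypothetical PyTorch sketch of the kind of graph attention block described above, pairing multi-head self-attention over joint tokens with a graph convolution that uses a learnable adjacency matrix; the real network additionally stacks multi-hop and dilated convolutional layers:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class GraphAttentionBlock(nn.Module):
+     """Self-attention over body-joint tokens followed by a GCN with a learnable adjacency."""
+     def __init__(self, num_joints: int, dim: int, num_heads: int = 4):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.adjacency = nn.Parameter(torch.eye(num_joints))   # learnable joint connectivity
+         self.gcn_proj = nn.Linear(dim, dim)
+         self.norm1 = nn.LayerNorm(dim)
+         self.norm2 = nn.LayerNorm(dim)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (B, J, D) features of J joints
+         attn_out, _ = self.attn(x, x, x)
+         x = self.norm1(x + attn_out)
+         adj = torch.softmax(self.adjacency, dim=-1)             # row-normalized adjacency
+         x = self.norm2(x + adj @ self.gcn_proj(x))              # graph convolution step
+         return x
+
+ block = GraphAttentionBlock(num_joints=17, dim=64)
+ pose_tokens = torch.randn(2, 17, 64)
+ print(block(pose_tokens).shape)  # torch.Size([2, 17, 64])
+ ```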
+
+
+
+
+ + ☆ Performance Evaluation of Real-Time Object Detection for Electric + Scooters + + +
+ Electric scooters (e-scooters) have rapidly emerged as a popular mode of +transportation in urban areas, yet they pose significant safety challenges. In +the United States, the rise of e-scooters has been marked by a concerning +increase in related injuries and fatalities. Recently, while deep-learning +object detection holds paramount significance in autonomous vehicles to avoid +potential collisions, its application in the context of e-scooters remains +relatively unexplored. This paper addresses this gap by assessing the +effectiveness and efficiency of cutting-edge object detectors designed for +e-scooters. To achieve this, the first comprehensive benchmark involving 22 +state-of-the-art YOLO object detectors, including five versions (YOLOv3, +YOLOv5, YOLOv6, YOLOv7, and YOLOv8), has been established for real-time traffic +object detection using a self-collected dataset featuring e-scooters. The +detection accuracy, measured in terms of mAP@0.5, ranges from 27.4% +(YOLOv7-E6E) to 86.8% (YOLOv5s). All YOLO models, particularly YOLOv3-tiny, +have displayed promising potential for real-time object detection in the +context of e-scooters. Both the traffic scene dataset +(https://zenodo.org/records/10578641) and software program codes +(https://github.com/DongChen06/ScooterDet) for model benchmarking in this study +are publicly available, which will not only improve e-scooter safety with +advanced object detection but also lay the groundwork for tailored solutions, +promising a safer and more sustainable urban micromobility landscape. + +
+
+ comment: 10 pages, 3 figures +
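The latency half of such a benchmark can be reproduced with a short timing loop like the following generic sketch (not the authors' benchmarking code; the input resolution, warm-up count, and iteration count are placeholder assumptions).

```python
import time
import torch

@torch.no_grad()
def measure_fps(model, input_size=(1, 3, 640, 640), warmup=10, iters=100,
                device="cuda" if torch.cuda.is_available() else "cpu"):
    """Rough wall-clock throughput (frames per second) for any detector."""
    model = model.to(device).eval()
    x = torch.randn(*input_size, device=device)
    for _ in range(warmup):                  # let kernels/caches settle
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()
    return iters / (time.perf_counter() - start)
```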
+
+
+
+
+ + ☆ Matten: Video Generation with Mamba-Attention + + +
+ In this paper, we introduce Matten, a cutting-edge latent diffusion model +with Mamba-Attention architecture for video generation. With minimal +computational cost, Matten employs spatial-temporal attention for local video +content modeling and bidirectional Mamba for global video content modeling. Our +comprehensive experimental evaluation demonstrates that Matten has competitive +performance with the current Transformer-based and GAN-based models in +benchmark performance, achieving superior FVD scores and efficiency. +Additionally, we observe a direct positive correlation between the complexity +of our designed model and the improvement in video quality, indicating the +excellent scalability of Matten. + +
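A rough sketch of the factorized local attention mentioned above: attend within each frame (spatial), then along time at each spatial location (temporal). The bidirectional Mamba branch for global modeling is omitted, since it requires a selective state-space implementation not reproduced here.

```python
import torch
import torch.nn as nn

class SpatialTemporalAttention(nn.Module):
    """Factorized attention over video tokens: per-frame spatial attention
    followed by per-location temporal attention (illustrative only)."""
    def __init__(self, dim, heads=4):
        super().__init__()
        self.spatial = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.temporal = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                                 # x: (B, T, N, D)
        B, T, N, D = x.shape
        s = x.reshape(B * T, N, D)                        # spatial attention per frame
        s, _ = self.spatial(s, s, s)
        x = x + s.reshape(B, T, N, D)
        t = x.permute(0, 2, 1, 3).reshape(B * N, T, D)    # temporal attention per location
        t, _ = self.temporal(t, t, t)
        x = x + t.reshape(B, N, T, D).permute(0, 2, 1, 3)
        return x
```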
+
+
+
+
+ + ☆ AC-MAMBASEG: An adaptive convolution and Mamba-based architecture for + enhanced skin lesion segmentation + + +
+ Skin lesion segmentation is a critical task in computer-aided diagnosis +systems for dermatological diseases. Accurate segmentation of skin lesions from +medical images is essential for early detection, diagnosis, and treatment +planning. In this paper, we propose a new model for skin lesion segmentation +namely AC-MambaSeg, an enhanced model that has the hybrid CNN-Mamba backbone, +and integrates advanced components such as Convolutional Block Attention Module +(CBAM), Attention Gate, and Selective Kernel Bottleneck. AC-MambaSeg leverages +the Vision Mamba framework for efficient feature extraction, while CBAM and +Selective Kernel Bottleneck enhance its ability to focus on informative regions +and suppress background noise. We evaluate the performance of AC-MambaSeg on +diverse datasets of skin lesion images including ISIC-2018 and PH2; then +compare it against existing segmentation methods. Our model shows promising +potential for improving computer-aided diagnosis systems and facilitating early +detection and treatment of dermatological diseases. Our source code will be +made available at: https://github.com/vietthanh2710/AC-MambaSeg. + +
+
+ comment: 15 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ DVMSR: Distillated Vision Mamba for Efficient Super-Resolution + + +
+ Efficient image super-resolution (SR) aims to accelerate SR network inference by minimizing computational complexity and network parameters while preserving performance. Existing state-of-the-art efficient image SR methods are based on convolutional neural networks, and few attempts have been made with Mamba, which has shown impressive performance on high-level vision tasks thanks to its long-range modeling capability and efficient computational complexity. In this paper, we propose DVMSR, a novel lightweight image SR network that incorporates Vision Mamba and a distillation strategy. DVMSR consists of three modules: a feature extraction convolution, multiple stacked Residual State Space Blocks (RSSBs), and a reconstruction module. Specifically, the deep feature extraction module is composed of several RSSBs, each of which contains several Vision Mamba Modules (ViMM) together with a residual connection. To improve efficiency while maintaining comparable performance, we apply a distillation strategy to the Vision Mamba network: the rich representation knowledge of the teacher network serves as additional supervision for the output of the lightweight student network. Extensive experiments demonstrate that DVMSR outperforms state-of-the-art efficient SR methods in terms of model parameters while maintaining PSNR and SSIM performance. The source code is available at https://github.com/nathan66666/DVMSR.git
+
+
+
+ comment: 8 pages, 8 figures +
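The distillation objective sketched below captures the general shape of such a strategy: a reconstruction loss against the high-resolution target plus a term that matches the student's output to a frozen teacher. The loss weighting and the choice of output-level (rather than feature-level) matching are assumptions, not necessarily DVMSR's exact recipe.

```python
import torch
import torch.nn.functional as F

def distillation_step(student, teacher, lr_batch, hr_batch, alpha=0.1):
    """One training step: supervised reconstruction + output-level distillation."""
    with torch.no_grad():
        teacher_sr = teacher(lr_batch)            # frozen, pretrained teacher
    student_sr = student(lr_batch)
    rec_loss = F.l1_loss(student_sr, hr_batch)    # match the ground-truth HR image
    kd_loss = F.l1_loss(student_sr, teacher_sr)   # match the teacher's prediction
    return rec_loss + alpha * kd_loss
```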
+
+
+
+
+ + ☆ E-TSL: A Continuous Educational Turkish Sign Language Dataset with + Baseline Methods + + +
+ This study introduces the continuous Educational Turkish Sign Language +(E-TSL) dataset, collected from online Turkish language lessons for 5th, 6th, +and 8th grades. The dataset comprises 1,410 videos totaling nearly 24 hours and +includes performances from 11 signers. Turkish, an agglutinative language, +poses unique challenges for sign language translation, particularly with a +vocabulary where 64% are singleton words and 85% are rare words, appearing less +than five times. We developed two baseline models to address these challenges: +the Pose to Text Transformer (P2T-T) and the Graph Neural Network based +Transformer (GNN-T) models. The GNN-T model achieved 19.13% BLEU-1 score and +3.28% BLEU-4 score, presenting a significant challenge compared to existing +benchmarks. The P2T-T model, while demonstrating slightly lower performance in +BLEU scores, achieved a higher ROUGE-L score of 22.09%. Additionally, we +benchmarked our model using the well-known PHOENIX-Weather 2014T dataset to +validate our approach. + +
+
+ comment: 7 pages, 3 figures, 4 tables, submitted to IEEE conference +
+
+
+
+
+ + ☆ Paintings and Drawings Aesthetics Assessment with Rich Attributes for + Various Artistic Categories + + +
+ Image aesthetic evaluation is a highly prominent research domain in the field +of computer vision. In recent years, there has been a proliferation of datasets +and corresponding evaluation methodologies for assessing the aesthetic quality +of photographic works, leading to the establishment of a relatively mature +research environment. However, in contrast to the extensive research in +photographic aesthetics, the field of aesthetic evaluation for paintings and +Drawings has seen limited attention until the introduction of the BAID dataset +in March 2023. This dataset solely comprises overall scores for high-quality +artistic images. Our research marks the pioneering introduction of a +multi-attribute, multi-category dataset specifically tailored to the field of +painting: Aesthetics of Paintings and Drawings Dataset (APDD). The construction +of APDD received active participation from 28 professional artists worldwide, +along with dozens of students specializing in the field of art. This dataset +encompasses 24 distinct artistic categories and 10 different aesthetic +attributes. Each image in APDD has been evaluated by six professionally trained +experts in the field of art, including assessments for both total aesthetic +scores and aesthetic attribute scores. The final APDD dataset comprises a total +of 4985 images, with an annotation count exceeding 31100 entries. Concurrently, +we propose an innovative approach: Art Assessment Network for Specific Painting +Styles (AANSPS), designed for the assessment of aesthetic attributes in +mixed-attribute art datasets. Through this research, our goal is to catalyze +advancements in the field of aesthetic evaluation for paintings and drawings, +while enriching the available resources and methodologies for its further +development and application. + +
+
+
+
+
+ + ☆ SkelCap: Automated Generation of Descriptive Text from Skeleton Keypoint + Sequences + + +
+ Numerous sign language datasets exist, yet they typically cover only a +limited selection of the thousands of signs used globally. Moreover, creating +diverse sign language datasets is an expensive and challenging task due to the +costs associated with gathering a varied group of signers. Motivated by these +challenges, we aimed to develop a solution that addresses these limitations. In +this context, we focused on textually describing body movements from skeleton +keypoint sequences, leading to the creation of a new dataset. We structured +this dataset around AUTSL, a comprehensive isolated Turkish sign language +dataset. We also developed a baseline model, SkelCap, which can generate +textual descriptions of body movements. This model processes the skeleton +keypoints data as a vector, applies a fully connected layer for embedding, and +utilizes a transformer neural network for sequence-to-sequence modeling. We +conducted extensive evaluations of our model, including signer-agnostic and +sign-agnostic assessments. The model achieved promising results, with a ROUGE-L +score of 0.98 and a BLEU-4 score of 0.94 in the signer-agnostic evaluation. The +dataset we have prepared, namely the AUTSL-SkelCap, will be made publicly +available soon. + +
+
+ comment: 8 pages, 5 figures, 7 tables, submitted to IEEE conference +
+
+
+
+
+ + ☆ VectorPainter: A Novel Approach to Stylized Vector Graphics Synthesis + with Vectorized Strokes + + +
+ We propose a novel method, VectorPainter, for the task of stylized vector +graphics synthesis. Given a text prompt and a reference style image, +VectorPainter generates a vector graphic that aligns in content with the text +prompt and remains faithful in style to the reference image. We recognize that +the key to this task lies in fully leveraging the intrinsic properties of +vector graphics. Innovatively, we conceptualize the stylization process as the +rearrangement of vectorized strokes extracted from the reference image. +VectorPainter employs an optimization-based pipeline. It begins by extracting +vectorized strokes from the reference image, which are then used to initialize +the synthesis process. To ensure fidelity to the reference style, a novel style +preservation loss is introduced. Extensive experiments have been conducted to +demonstrate that our method is capable of aligning with the text description +while remaining faithful to the reference image. + +
+
+
+
+
+ + ☆ JOSENet: A Joint Stream Embedding Network for Violence Detection in + Surveillance Videos + + +
+ Due to the ever-increasing availability of video surveillance cameras and the +growing need for crime prevention, the violence detection task is attracting +greater attention from the research community. With respect to other action +recognition tasks, violence detection in surveillance videos shows additional +issues, such as the presence of a significant variety of real fight scenes. +Unfortunately, available datasets seem to be very small compared with other +action recognition datasets. Moreover, in surveillance applications, people in +the scenes always differ for each video and the background of the footage +differs for each camera. Also, violent actions in real-life surveillance videos +must be detected quickly to prevent unwanted consequences, thus models would +definitely benefit from a reduction in memory usage and computational costs. +Such problems make classical action recognition methods difficult to be +adopted. To tackle all these issues, we introduce JOSENet, a novel +self-supervised framework that provides outstanding performance for violence +detection in surveillance videos. The proposed model receives two +spatiotemporal video streams, i.e., RGB frames and optical flows, and involves +a new regularized self-supervised learning approach for videos. JOSENet +provides improved performance compared to self-supervised state-of-the-art +methods, while requiring one-fourth of the number of frames per video segment +and a reduced frame rate. The source code and the instructions to reproduce our +experiments are available at https://github.com/ispamm/JOSENet. + +
+
+ comment: Submitted to the International Journal of Computer Vision +
+
+
+
+
+ + ☆ Score-based Generative Priors Guided Model-driven Network for MRI + Reconstruction + + +
+ Score matching with Langevin dynamics (SMLD) method has been successfully +applied to accelerated MRI. However, the hyperparameters in the sampling +process require subtle tuning, otherwise the results can be severely corrupted +by hallucination artifacts, particularly with out-of-distribution test data. In +this study, we propose a novel workflow in which SMLD results are regarded as +additional priors to guide model-driven network training. First, we adopted a +pretrained score network to obtain samples as preliminary guidance images (PGI) +without the need for network retraining, parameter tuning and in-distribution +test data. Although PGIs are corrupted by hallucination artifacts, we believe +that they can provide extra information through effective denoising steps to +facilitate reconstruction. Therefore, we designed a denoising module (DM) in +the second step to improve the quality of PGIs. The features are extracted from +the components of Langevin dynamics and the same score network with +fine-tuning; hence, we can directly learn the artifact patterns. Third, we +designed a model-driven network whose training is guided by denoised PGIs +(DGIs). DGIs are densely connected with intermediate reconstructions in each +cascade to enrich the features and are periodically updated to provide more +accurate guidance. Our experiments on different sequences revealed that despite +the low average quality of PGIs, the proposed workflow can effectively extract +valuable information to guide the network training, even with severely reduced +training data and sampling steps. Our method outperforms other cutting-edge +techniques by effectively mitigating hallucination artifacts, yielding robust +and high-quality reconstruction results. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Source-Free Domain Adaptation Guided by Vision and Vision-Language + Pre-Training ICCV + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to a related but unlabeled target domain. While +the source model is a key avenue for acquiring target pseudolabels, the +generated pseudolabels may exhibit source bias. In the conventional SFDA +pipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to +initialize the source model at the start of source training, and subsequently +discarded. Despite having diverse features important for generalization, the +pre-trained feature extractor can overfit to the source data distribution +during source training and forget relevant target domain knowledge. Rather than +discarding this valuable knowledge, we introduce an integrated framework to +incorporate pre-trained networks into the target adaptation process. The +proposed framework is flexible and allows us to plug modern pre-trained +networks into the adaptation process to leverage their stronger representation +learning capabilities. For adaptation, we propose the Co-learn algorithm to +improve target pseudolabel quality collaboratively through the source model and +a pre-trained feature extractor. Building on the recent success of the +vision-language model CLIP in zero-shot image recognition, we present an +extension Co-learn++ to further incorporate CLIP's zero-shot classification +decisions. We evaluate on 3 benchmark datasets and include more challenging +scenarios such as open-set, partial-set and open-partial SFDA. Experimental +results demonstrate that our proposed strategy improves adaptation performance +and can be successfully integrated with existing SFDA methods. + +
+
+ comment: Extension of ICCV paper arXiv:2212.07585, submitted to IJCV +
+
+
+
+
+ + ☆ iSEARLE: Improving Textual Inversion for Zero-Shot Composed Image + Retrieval ICCV2023 + + +
+ Given a query consisting of a reference image and a relative caption, +Composed Image Retrieval (CIR) aims to retrieve target images visually similar +to the reference one while incorporating the changes specified in the relative +caption. The reliance of supervised methods on labor-intensive manually labeled +datasets hinders their broad applicability. In this work, we introduce a new +task, Zero-Shot CIR (ZS-CIR), that addresses CIR without the need for a labeled +training dataset. We propose an approach named iSEARLE (improved zero-Shot +composEd imAge Retrieval with textuaL invErsion) that involves mapping the +visual information of the reference image into a pseudo-word token in CLIP +token embedding space and combining it with the relative caption. To foster +research on ZS-CIR, we present an open-domain benchmarking dataset named CIRCO +(Composed Image Retrieval on Common Objects in context), the first CIR dataset +where each query is labeled with multiple ground truths and a semantic +categorization. The experimental results illustrate that iSEARLE obtains +state-of-the-art performance on three different CIR datasets -- FashionIQ, +CIRR, and the proposed CIRCO -- and two additional evaluation settings, namely +domain conversion and object composition. The dataset, the code, and the model +are publicly available at https://github.com/miccunifi/SEARLE. + +
+
+ comment: Extended version of the ICCV2023 paper arXiv:2303.15247 +
+
+
+
+
+ + ☆ Invertible Residual Rescaling Models + + +
+ Invertible Rescaling Networks (IRNs) and their variants have witnessed +remarkable achievements in various image processing tasks like image rescaling. +However, we observe that IRNs with deeper networks are difficult to train, thus +hindering the representational ability of IRNs. To address this issue, we +propose Invertible Residual Rescaling Models (IRRM) for image rescaling by +learning a bijection between a high-resolution image and its low-resolution +counterpart with a specific distribution. Specifically, we propose IRRM to +build a deep network, which contains several Residual Downscaling Modules +(RDMs) with long skip connections. Each RDM consists of several Invertible +Residual Blocks (IRBs) with short connections. In this way, RDM allows rich +low-frequency information to be bypassed by skip connections and forces models +to focus on extracting high-frequency information from the image. Extensive +experiments show that our IRRM performs significantly better than other +state-of-the-art methods with much fewer parameters and complexity. +Particularly, our IRRM has respectively PSNR gains of at least 0.3 dB over +HCFlow and IRN in the $\times 4$ rescaling while only using 60\% parameters and +50\% FLOPs. The code will be available at https://github.com/THU-Kingmin/IRRM. + +
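Invertibility of residual-style blocks can be illustrated with a minimal additive-coupling layer, shown below. The paper's IRB may differ in detail, but the split-transform-recombine pattern and the exact analytical inverse are the essential idea (an even channel count is assumed).

```python
import torch
import torch.nn as nn

class AdditiveCouplingBlock(nn.Module):
    """Exactly invertible block: split channels, let one half predict a
    residual update for the other half (common in invertible rescaling-style
    models; illustrative, not the paper's exact IRB)."""
    def __init__(self, channels, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(channels // 2, hidden, 3, padding=1), nn.ReLU(),
            nn.Conv2d(hidden, channels // 2, 3, padding=1),
        )

    def forward(self, x):
        a, b = x.chunk(2, dim=1)
        return torch.cat([a, b + self.net(a)], dim=1)   # forward mapping

    def inverse(self, y):
        a, b = y.chunk(2, dim=1)
        return torch.cat([a, b - self.net(a)], dim=1)   # exact inverse
```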
+
+
+
+
+ + ☆ Imaging Signal Recovery Using Neural Network Priors Under Uncertain + Forward Model Parameters CVPR 2024 + + +
+ Inverse imaging problems (IIPs) arise in various applications, with the main objective of reconstructing an image from its compressed measurements. This problem is often ill-posed, being under-determined with multiple equally consistent solutions. The best solution inherently depends on prior knowledge or assumptions, such as the sparsity of the image. Furthermore, the reconstruction process for most IIPs relies significantly on the imaging (i.e., forward model) parameters, which might not be fully known, or the measurement device may undergo calibration drifts. These uncertainties in the forward model create substantial challenges, and inaccurate reconstructions usually occur when the postulated parameters of the forward model do not fully match the actual ones. In this work, we tackle accurate reconstruction in the setting where only a set of possible forward model parameters is available. We propose a novel Moment-Aggregation (MA) framework that is compatible with the popular IIP solution based on a neural network prior. Specifically, our method reconstructs the signal by considering all candidate parameters of the forward model simultaneously during the update of the neural network. We theoretically demonstrate the convergence of the MA framework, which has a complexity similar to reconstruction under known forward model parameters. Proof-of-concept experiments demonstrate that the proposed MA achieves reconstruction performance comparable to using the precisely known forward model parameters across both compressive sensing and phase retrieval applications, with a PSNR gap of 0.17 to 1.94 dB over various datasets, including MNIST, X-ray, Glas, and MoNuSeg. This highlights our method's significant potential for reconstruction under an uncertain forward model.
+
+
+
+ comment: Accepted by PBDL-CVPR 2024 +
+
+
+
+
+ + ☆ Design, analysis, and manufacturing of a glass-plastic hybrid minimalist + aspheric panoramic annular lens + + +
+ We propose a high-performance glass-plastic hybrid minimalist aspheric +panoramic annular lens (ASPAL) to solve several major limitations of the +traditional panoramic annular lens (PAL), such as large size, high weight, and +complex system. The field of view (FoV) of the ASPAL is +360{\deg}x(35{\deg}~110{\deg}) and the imaging quality is close to the +diffraction limit. This large FoV ASPAL is composed of only 4 lenses. Moreover, +we establish a physical structure model of PAL using the ray tracing method and +study the influence of its physical parameters on compactness ratio. In +addition, for the evaluation of local tolerances of annular surfaces, we +propose a tolerance analysis method suitable for ASPAL. This analytical method +can effectively analyze surface irregularities on annular surfaces and provide +clear guidance on manufacturing tolerances for ASPAL. Benefiting from +high-precision glass molding and injection molding aspheric lens manufacturing +techniques, we finally manufactured 20 ASPALs in small batches. The weight of +an ASPAL prototype is only 8.5 g. Our framework provides promising insights for +the application of panoramic systems in space and weight-constrained +environmental sensing scenarios such as intelligent security, micro-UAVs, and +micro-robots. + +
+
+ comment: Accepted to Optics & Laser Technology +
+
+
+
+
+ + ☆ Boundary-aware Decoupled Flow Networks for Realistic Extreme Rescaling + + +
+ Recently developed generative methods, including invertible rescaling network (IRN) based and generative adversarial network (GAN) based methods, have demonstrated exceptional performance in image rescaling. However, IRN-based methods tend to produce over-smoothed results, while GAN-based methods easily generate fake details, which hinders their real-world application. To address this issue, we propose Boundary-aware Decoupled Flow Networks (BDFlow) to generate realistic and visually pleasing results. Unlike previous methods that directly model high-frequency information as a standard Gaussian distribution, our BDFlow first decouples the high-frequency information into a \textit{semantic high-frequency} component that adheres to a Boundary distribution and a \textit{non-semantic high-frequency} counterpart that adheres to a Gaussian distribution. Specifically, to capture semantic high-frequency parts accurately, we use a Boundary-aware Mask (BAM) to constrain the model to produce rich textures, while the non-semantic high-frequency part is randomly sampled from a Gaussian distribution. Comprehensive experiments demonstrate that our BDFlow significantly outperforms other state-of-the-art methods while maintaining lower complexity. Notably, our BDFlow improves the PSNR by $4.4$ dB and the SSIM by $0.1$ on average over GRAIN, utilizing only 74\% of the parameters and 20\% of the computation. The code will be available at https://github.com/THU-Kingmin/BAFlow.
+
+
+
+
+
+
+ + ☆ Unified Dynamic Scanpath Predictors Outperform Individually Trained + Models + + +
+ Previous research on scanpath prediction has mainly focused on group models, +disregarding the fact that the scanpaths and attentional behaviors of +individuals are diverse. The disregard of these differences is especially +detrimental to social human-robot interaction, whereby robots commonly emulate +human gaze based on heuristics or predefined patterns. However, human gaze +patterns are heterogeneous and varying behaviors can significantly affect the +outcomes of such human-robot interactions. To fill this gap, we developed a +deep learning-based social cue integration model for saliency prediction to +instead predict scanpaths in videos. Our model learned scanpaths by recursively +integrating fixation history and social cues through a gating mechanism and +sequential attention. We evaluated our approach on gaze datasets of dynamic +social scenes, observed under the free-viewing condition. The introduction of +fixation history into our models makes it possible to train a single unified +model rather than the resource-intensive approach of training individual models +for each set of scanpaths. We observed that the late neural integration +approach surpasses early fusion when training models on a large dataset, in +comparison to a smaller dataset with a similar distribution. Results also +indicate that a single unified model, trained on all the observers' scanpaths, +performs on par or better than individually trained models. We hypothesize that +this outcome is a result of the group saliency representations instilling +universal attention in the model, while the supervisory signal guides it to +learn personalized attentional behaviors, providing the unified model a benefit +over individual models due to its implicit representation of universal +attention. + +
+
+
+
+
+ + ☆ MERIT: Multi-view Evidential learning for Reliable and Interpretable + liver fibrosis sTaging + + +
+ Accurate staging of liver fibrosis from magnetic resonance imaging (MRI) is +crucial in clinical practice. While conventional methods often focus on a +specific sub-region, multi-view learning captures more information by analyzing +multiple patches simultaneously. However, previous multi-view approaches could +not typically calculate uncertainty by nature, and they generally integrate +features from different views in a black-box fashion, hence compromising +reliability as well as interpretability of the resulting models. In this work, +we propose a new multi-view method based on evidential learning, referred to as +MERIT, which tackles the two challenges in a unified framework. MERIT enables +uncertainty quantification of the predictions to enhance reliability, and +employs a logic-based combination rule to improve interpretability. +Specifically, MERIT models the prediction from each sub-view as an opinion with +quantified uncertainty under the guidance of the subjective logic theory. +Furthermore, a distribution-aware base rate is introduced to enhance +performance, particularly in scenarios involving class distribution shifts. +Finally, MERIT adopts a feature-specific combination rule to explicitly fuse +multi-view predictions, thereby enhancing interpretability. Results have +showcased the effectiveness of the proposed MERIT, highlighting the reliability +and offering both ad-hoc and post-hoc interpretability. They also illustrate +that MERIT can elucidate the significance of each view in the decision-making +process for liver fibrosis staging. + +
+
+ comment: Submitted to Medical Image Analysis +
+
+
+
+
+ + ☆ Overconfidence is Key: Verbalized Uncertainty Evaluation in Large + Language and Vision-Language Models NAACL 2024 + + +
+ Language and Vision-Language Models (LLMs/VLMs) have revolutionized the field +of AI by their ability to generate human-like text and understand images, but +ensuring their reliability is crucial. This paper aims to evaluate the ability +of LLMs (GPT4, GPT-3.5, LLaMA2, and PaLM 2) and VLMs (GPT4V and Gemini Pro +Vision) to estimate their verbalized uncertainty via prompting. We propose the +new Japanese Uncertain Scenes (JUS) dataset, aimed at testing VLM capabilities +via difficult queries and object counting, and the Net Calibration Error (NCE) +to measure direction of miscalibration. Results show that both LLMs and VLMs +have a high calibration error and are overconfident most of the time, +indicating a poor capability for uncertainty estimation. Additionally we +develop prompts for regression tasks, and we show that VLMs have poor +calibration when producing mean/standard deviation and 95% confidence +intervals. + +
+
+ comment: 8 pages, with appendix. To appear in TrustNLP workshop @ NAACL 2024 +
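A signed, bin-weighted calibration gap of the following form captures the "direction of miscalibration" idea: positive values indicate overconfidence, negative values underconfidence. This is one plausible formulation, not necessarily the paper's exact NCE definition.

```python
import numpy as np

def net_calibration_error(confidences, correct, n_bins=10):
    """Signed, bin-weighted gap between mean confidence and accuracy.
    (A plausible sketch; the paper's exact NCE may differ.)"""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    nce = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            weight = mask.mean()                       # fraction of samples in this bin
            nce += weight * (confidences[mask].mean() - correct[mask].mean())
    return nce

# Example: high confidence but mostly wrong answers -> positive (overconfident) score.
print(net_calibration_error([0.9, 0.8, 0.95], [1, 0, 0]))
```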
+
+
+
+
+ + ☆ Fast TILs estimation in lung cancer WSIs based on semi-stochastic patch + sampling + + +
+ Addressing the critical need for accurate prognostic biomarkers in cancer +treatment, quantifying tumor-infiltrating lymphocytes (TILs) in non-small cell +lung cancer (NSCLC) presents considerable challenges. Manual TIL quantification +in whole slide images (WSIs) is laborious and subject to variability, +potentially undermining patient outcomes. Our study introduces an automated +pipeline that utilizes semi-stochastic patch sampling, patch classification to +retain prognostically relevant patches, and cell quantification using the +HoVer-Net model to streamline the TIL evaluation process. This pipeline +efficiently excludes approximately 70% of areas not relevant for prognosis and +requires only 5% of the remaining patches to maintain prognostic accuracy +(c-index 0.65 +- 0.01). The computational efficiency achieved does not +sacrifice prognostic accuracy, as demonstrated by the TILs score's strong +correlation with patient survival, which surpasses traditional CD8 IHC scoring +methods. While the pipeline demonstrates potential for enhancing NSCLC +prognostication and personalization of treatment, comprehensive clinical +validation is still required. Future research should focus on verifying its +broader clinical utility and investigating additional biomarkers to improve +NSCLC prognosis. + +
+
+ comment: 18 pages, 7 figures, 6 appendix pages +
+
+
+
+
+ + ☆ Multimodal Sense-Informed Prediction of 3D Human Motions + + +
+ Predicting future human pose is a fundamental application for machine +intelligence, which drives robots to plan their behavior and paths ahead of +time to seamlessly accomplish human-robot collaboration in real-world 3D +scenarios. Despite encouraging results, existing approaches rarely consider the +effects of the external scene on the motion sequence, leading to pronounced +artifacts and physical implausibilities in the predictions. To address this +limitation, this work introduces a novel multi-modal sense-informed motion +prediction approach, which conditions high-fidelity generation on two modal +information: external 3D scene, and internal human gaze, and is able to +recognize their salience for future human activity. Furthermore, the gaze +information is regarded as the human intention, and combined with both motion +and scene features, we construct a ternary intention-aware attention to +supervise the generation to match where the human wants to reach. Meanwhile, we +introduce semantic coherence-aware attention to explicitly distinguish the +salient point clouds and the underlying ones, to ensure a reasonable +interaction of the generated sequence with the 3D scene. On two real-world +benchmarks, the proposed method achieves state-of-the-art performance both in +3D human pose and trajectory prediction. + +
+
+
+
+
+ + ☆ SalFAU-Net: Saliency Fusion Attention U-Net for Salient Object Detection + + +
+ Salient object detection (SOD) remains an important task in computer vision, +with applications ranging from image segmentation to autonomous driving. Fully +convolutional network (FCN)-based methods have made remarkable progress in +visual saliency detection over the last few decades. However, these methods +have limitations in accurately detecting salient objects, particularly in +challenging scenes with multiple objects, small objects, or objects with low +resolutions. To address this issue, we proposed a Saliency Fusion Attention +U-Net (SalFAU-Net) model, which incorporates a saliency fusion module into each +decoder block of the attention U-net model to generate saliency probability +maps from each decoder block. SalFAU-Net employs an attention mechanism to +selectively focus on the most informative regions of an image and suppress +non-salient regions. We train SalFAU-Net on the DUTS dataset using a binary +cross-entropy loss function. We conducted experiments on six popular SOD +evaluation datasets to evaluate the effectiveness of the proposed method. The +experimental results demonstrate that our method, SalFAU-Net, achieves +competitive performance compared to other methods in terms of mean absolute +error (MAE), F-measure, s-measure, and e-measure. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ A drone detector with modified backbone and multiple pyramid featuremaps + enhancement structure (MDDPE) + + +
+ This work presents a drone detector with a modified backbone and a multiple pyramid feature maps enhancement structure (MDDPE). Novel feature map improvement modules that use different levels of information to produce more robust and discriminative features are proposed; these modules include a feature maps supplement function and a feature maps recombination enhancement function. To handle drone characteristics effectively, auxiliary supervisions implemented in the early stages with tailored anchor designs are utilized. To further improve the modeling of real drone detection scenarios and the initialization of the regressor, an updated anchor matching technique is introduced to match anchors and ground-truth drones as closely as feasible. Extensive experiments on well-known drone detection benchmarks demonstrate the superiority of the proposed MDDPE over state-of-the-art detectors.
+
+
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ☆ Blending Distributed NeRFs with Tri-stage Robust Pose Optimization + + +
+ Due to the limited model capacity, leveraging distributed Neural Radiance +Fields (NeRFs) for modeling extensive urban environments has become a +necessity. However, current distributed NeRF registration approaches encounter +aliasing artifacts, arising from discrepancies in rendering resolutions and +suboptimal pose precision. These factors collectively deteriorate the fidelity +of pose estimation within NeRF frameworks, resulting in occlusion artifacts +during the NeRF blending stage. In this paper, we present a distributed NeRF +system with tri-stage pose optimization. In the first stage, precise poses of +images are achieved by bundle adjusting Mip-NeRF 360 with a coarse-to-fine +strategy. In the second stage, we incorporate the inverting Mip-NeRF 360, +coupled with the truncated dynamic low-pass filter, to enable the achievement +of robust and precise poses, termed Frame2Model optimization. On top of this, +we obtain a coarse transformation between NeRFs in different coordinate +systems. In the third stage, we fine-tune the transformation between NeRFs by +Model2Model pose optimization. After obtaining precise transformation +parameters, we proceed to implement NeRF blending, showcasing superior +performance metrics in both real-world and simulation scenarios. Codes and data +will be publicly available at https://github.com/boilcy/Distributed-NeRF. + +
+
+
+
+
+ + ☆ MVIP-NeRF: Multi-view 3D Inpainting on NeRF Scenes via Diffusion Prior + + +
+ Despite the emergence of successful NeRF inpainting methods built upon +explicit RGB and depth 2D inpainting supervisions, these methods are inherently +constrained by the capabilities of their underlying 2D inpainters. This is due +to two key reasons: (i) independently inpainting constituent images results in +view-inconsistent imagery, and (ii) 2D inpainters struggle to ensure +high-quality geometry completion and alignment with inpainted RGB images. + To overcome these limitations, we propose a novel approach called MVIP-NeRF +that harnesses the potential of diffusion priors for NeRF inpainting, +addressing both appearance and geometry aspects. MVIP-NeRF performs joint +inpainting across multiple views to reach a consistent solution, which is +achieved via an iterative optimization process based on Score Distillation +Sampling (SDS). Apart from recovering the rendered RGB images, we also extract +normal maps as a geometric representation and define a normal SDS loss that +motivates accurate geometry inpainting and alignment with the appearance. +Additionally, we formulate a multi-view SDS score function to distill +generative priors simultaneously from different view images, ensuring +consistent visual completion when dealing with large view variations. Our +experimental results show better appearance and geometry recovery than previous +NeRF inpainting methods. + +
+
+ comment: 14 pages, 10 figures, conference +
+
+
+
+
+ + ☆ I$^3$Net: Inter-Intra-slice Interpolation Network for Medical Slice + Synthesis + + +
+ Medical imaging is limited by acquisition time and scanning equipment. CT and +MR volumes, reconstructed with thicker slices, are anisotropic with high +in-plane resolution and low through-plane resolution. We reveal an intriguing +phenomenon that due to the mentioned nature of data, performing slice-wise +interpolation from the axial view can yield greater benefits than performing +super-resolution from other views. Based on this observation, we propose an +Inter-Intra-slice Interpolation Network (I$^3$Net), which fully explores +information from high in-plane resolution and compensates for low through-plane +resolution. The through-plane branch supplements the limited information +contained in low through-plane resolution from high in-plane resolution and +enables continual and diverse feature learning. In-plane branch transforms +features to the frequency domain and enforces an equal learning opportunity for +all frequency bands in a global context learning paradigm. We further propose a +cross-view block to take advantage of the information from all three views +online. Extensive experiments on two public datasets demonstrate the +effectiveness of I$^3$Net, and noticeably outperforms state-of-the-art +super-resolution, video frame interpolation and slice interpolation methods by +a large margin. We achieve 43.90dB in PSNR, with at least 1.14dB improvement +under the upscale factor of $\times$2 on MSD dataset with faster inference. +Code is available at +https://github.com/DeepMed-Lab-ECNU/Medical-Image-Reconstruction. + +
+
+
+
+
+ + ☆ On Enhancing Brain Tumor Segmentation Across Diverse Populations with + Convolutional Neural Networks + + +
+ Brain tumor segmentation is a fundamental step in assessing a patient's +cancer progression. However, manual segmentation demands significant expert +time to identify tumors in 3D multimodal brain MRI scans accurately. This +reliance on manual segmentation makes the process prone to intra- and +inter-observer variability. This work proposes a brain tumor segmentation +method as part of the BraTS-GoAT challenge. The task is to segment tumors in +brain MRI scans automatically from various populations, such as adults, +pediatrics, and underserved sub-Saharan Africa. We employ a recent CNN +architecture for medical image segmentation, namely MedNeXt, as our baseline, +and we implement extensive model ensembling and postprocessing for inference. +Our experiments show that our method performs well on the unseen validation set +with an average DSC of 85.54% and HD95 of 27.88. The code is available on +https://github.com/BioMedIA-MBZUAI/BraTS2024_BioMedIAMBZ. + +
+
+
+
+
+ + ☆ SMCD: High Realism Motion Style Transfer via Mamba-based Diffusion + + +
+ Motion style transfer is a significant research direction in multimedia applications. It enables the rapid switching of different styles of the same motion for virtual digital humans, thus vastly increasing the diversity and realism of movements. It is widely applied in multimedia scenarios such as movies, games, and the Metaverse. However, most current work in this field adopts GANs, which may lead to instability and convergence issues, making the final generated motion sequences somewhat chaotic and unable to reflect a highly realistic and natural style. To address these problems, we treat style motion as a condition and propose the Style Motion Conditioned Diffusion (SMCD) framework for the first time, which can learn the style features of motion more comprehensively. Moreover, we apply the Mamba model for the first time in the motion style transfer field, introducing the Motion Style Mamba (MSM) module to handle longer motion sequences. Third, for the SMCD framework, we propose a Diffusion-based Content Consistency Loss and a Content Consistency Loss to assist the training of the overall framework. Finally, we conduct extensive experiments. The results reveal that our method surpasses state-of-the-art methods in both qualitative and quantitative comparisons and is capable of generating more realistic motion sequences.
+
+
+
+
+
+
+ + ☆ Residual-Conditioned Optimal Transport: Towards Structure-preserving + Unpaired and Paired Image Restoration ICML 2024 + + +
+ Deep learning-based image restoration methods have achieved promising +performance. However, how to faithfully preserve the structure of the original +image remains challenging. To address this challenge, we propose a novel +Residual-Conditioned Optimal Transport (RCOT) approach, which models the image +restoration as an optimal transport (OT) problem for both unpaired and paired +settings, integrating the transport residual as a unique degradation-specific +cue for both the transport cost and the transport map. Specifically, we first +formalize a Fourier residual-guided OT objective by incorporating the +degradation-specific information of the residual into the transport cost. Based +on the dual form of the OT formulation, we design the transport map as a +two-pass RCOT map that comprises a base model and a refinement process, in +which the transport residual is computed by the base model in the first pass +and then encoded as a degradation-specific embedding to condition the +second-pass restoration. By duality, the RCOT problem is transformed into a +minimax optimization problem, which can be solved by adversarially training +neural networks. Extensive experiments on multiple restoration tasks show the +effectiveness of our approach in terms of both distortion measures and +perceptual quality. Particularly, RCOT restores images with more faithful +structural details compared to state-of-the-art methods. + +
+
+ comment: ICML 2024 +
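The two-pass, residual-conditioned structure can be sketched as below, where `base` and `refiner` are placeholders for arbitrary image-to-image networks; the OT-based adversarial training and the Fourier residual guidance of RCOT are not reproduced, so this is illustrative only.

```python
import torch
import torch.nn as nn

class TwoPassRestorer(nn.Module):
    """Sketch: a base pass restores the input, the residual (input minus base
    output) is encoded into a conditioning map, and a refinement pass is
    conditioned on it. Illustrative; RCOT's actual networks are richer."""
    def __init__(self, base, refiner, channels=3, embed_dim=64):
        super().__init__()
        self.base = base            # any image-to-image network
        self.refiner = refiner      # expects channels + embed_dim input channels
        self.encode_residual = nn.Sequential(
            nn.Conv2d(channels, embed_dim, 3, padding=1), nn.ReLU(),
            nn.Conv2d(embed_dim, embed_dim, 3, padding=1),
        )

    def forward(self, degraded):
        coarse = self.base(degraded)                            # first pass
        residual = degraded - coarse                            # degradation-specific cue
        cond = self.encode_residual(residual)                   # residual embedding
        return self.refiner(torch.cat([coarse, cond], dim=1))   # conditioned second pass
```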
+
+
+
+
+ + ☆ Scene-Adaptive Person Search via Bilateral Modulations + + +
+ Person search aims to localize a specific target person from a gallery set of images with various scenes. As the scene of a moving pedestrian changes, the captured person images inevitably bring lots of background and foreground noise into the person feature, which is completely unrelated to the person identity and leads to severe performance degradation. To address this issue, we present a Scene-Adaptive Person Search (SEAS) model that introduces bilateral modulations to simultaneously eliminate scene noise and maintain a consistent person representation across various scenes. In SEAS, a Background Modulation Network (BMN) encodes the feature extracted from the detected bounding box into a multi-granularity embedding, reducing the input of background noise at multiple levels in a norm-aware manner. Additionally, to mitigate the effect of foreground noise on the person feature, SEAS introduces a Foreground Modulation Network (FMN) that computes a clutter-reduction offset for the person embedding based on the feature map of the scene image. Through bilateral modulations on both background and foreground in an end-to-end manner, SEAS obtains consistent feature representations without scene noise. SEAS achieves state-of-the-art (SOTA) performance on two benchmark datasets, CUHK-SYSU with 97.1\% mAP and PRW with 60.5\% mAP. The code is available at https://github.com/whbdmu/SEAS.
+
+
+
+
+
+
+ + ☆ Fast One-Stage Unsupervised Domain Adaptive Person Search + + +
+ Unsupervised person search aims to localize a particular target person from a gallery set of scene images without annotations, which is extremely challenging due to the unexpected variations of the unlabeled domains. However, most existing methods are dedicated to developing multi-stage models that adapt to domain variations while using clustering for iterative model training, which inevitably increases model complexity. To address this issue, we propose a Fast One-stage Unsupervised person Search (FOUS) which integrates domain adaptation with label adaptation in a complementary, end-to-end manner without iterative clustering. To minimize the domain discrepancy, FOUS introduces an Attention-based Domain Alignment Module (ADAM) which can not only align various domains for both the detection and ReID tasks but also construct an attention mechanism to reduce the adverse impact of low-quality candidates resulting from unsupervised detection. Moreover, to avoid the redundant iterative clustering mode, FOUS adopts a prototype-guided labeling method which minimizes redundant correlation computations for partial samples and assigns noisy coarse label groups efficiently. The coarse label groups are then continuously refined via a label-flexible training network with an adaptive selection strategy. With the adapted domains and labels, FOUS achieves state-of-the-art (SOTA) performance on two benchmark datasets, CUHK-SYSU and PRW. The code is available at https://github.com/whbdmu/FOUS.
+
+
+
+
+
+
+ + ☆ You Only Need Half: Boosting Data Augmentation by Using Partial Content + + +
+ We propose a novel data augmentation method termed You Only Need hAlf (YONA), which simplifies the augmentation process. YONA bisects an image, substitutes one half with noise, and applies data augmentation techniques to the remaining half. This method reduces the redundant information in the original image, encourages neural networks to recognize objects from incomplete views, and significantly enhances their robustness. YONA is parameter-free and straightforward to apply, enhances various existing data augmentation strategies, and thereby bolsters neural networks' robustness without additional computational cost. To demonstrate YONA's efficacy, extensive experiments were carried out. These experiments confirm YONA's compatibility with diverse data augmentation methods and neural network architectures, yielding substantial improvements in CIFAR classification tasks, sometimes outperforming conventional image-level data augmentation methods. Furthermore, YONA markedly increases the resilience of neural networks to adversarial attacks. Additional experiments exploring YONA's variants conclusively show that masking half of an image optimizes performance. The code is available at https://github.com/HansMoe/YONA.
+
+
+
+ comment: Technical report,16 pages +
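The augmentation itself is only a few lines. The sketch below follows the description above (bisect, noise one half, augment the other); the split axis is chosen at random and the noised half is fixed for brevity, details that may differ from the official implementation.

```python
import torch

def yona(image, augment, vertical=None):
    """Replace one half of a (C, H, W) image tensor with noise and apply
    `augment` (any tensor-to-tensor augmentation) to the remaining half."""
    img = image.clone()
    _, H, W = img.shape
    if vertical is None:
        vertical = bool(torch.rand(()) < 0.5)          # random split axis
    if vertical:                                       # left half kept, right half noised
        keep = (slice(None), slice(None), slice(0, W // 2))
        noise = (slice(None), slice(None), slice(W // 2, W))
    else:                                              # top half kept, bottom half noised
        keep = (slice(None), slice(0, H // 2), slice(None))
        noise = (slice(None), slice(H // 2, H), slice(None))
    img[noise] = torch.rand_like(img[noise])           # substitute one half with noise
    img[keep] = augment(img[keep])                     # augment the remaining half
    return img
```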
+
+
+
+
+ + ☆ Adaptive Guidance Learning for Camouflaged Object Detection + + +
+ Camouflaged object detection (COD) aims to segment objects visually embedded in their surroundings, which is a very challenging task due to the high similarity between the objects and the background. To address it, most methods incorporate additional information (e.g., boundary, texture, and frequency clues) to guide feature learning for better detecting camouflaged objects from the background. Although progress has been made, these methods are basically individually tailored to specific auxiliary cues, thus lacking adaptability and not consistently achieving high segmentation performance. To this end, this paper proposes an adaptive guidance learning network, dubbed \textit{AGLNet}, a unified end-to-end learnable model for exploring and adapting different additional cues in CNN models to guide accurate camouflaged feature learning. Specifically, we first design a straightforward additional information generation (AIG) module to learn additional camouflaged object cues, which can be adapted for the exploration of effective camouflaged features. We then present a hierarchical feature combination (HFC) module to deeply integrate the additional cues and image features to guide camouflaged feature learning in a multi-level fusion manner. Finally, a recalibration decoder (RD) further aggregates and refines the different features for accurate object prediction. Extensive experiments on three widely used COD benchmark datasets demonstrate that the proposed method achieves significant performance improvements under different additional cues, and outperforms 20 recent state-of-the-art methods by a large margin. Our code will be made publicly available at: \textcolor{blue}{{https://github.com/ZNan-Chen/AGLNet}}.
+
+
+
+
+
+
+ + ☆ Region-specific Risk Quantification for Interpretable Prognosis of + COVID-19 + + +
+ The COVID-19 pandemic has strained global public health, necessitating +accurate diagnosis and intervention to control disease spread and reduce +mortality rates. This paper introduces an interpretable deep survival +prediction model designed specifically for improved understanding and trust in +COVID-19 prognosis using chest X-ray (CXR) images. By integrating a large-scale +pretrained image encoder, Risk-specific Grad-CAM, and anatomical region +detection techniques, our approach produces regional interpretable outcomes +that effectively capture essential disease features while focusing on rare but +critical abnormal regions. Our model's predictive results provide enhanced +clarity and transparency through risk area localization, enabling clinicians to +make informed decisions regarding COVID-19 diagnosis with better understanding +of prognostic insights. We evaluate the proposed method on a multi-center +survival dataset and demonstrate its effectiveness via quantitative and +qualitative assessments, achieving superior C-indexes (0.764 and 0.727) and +time-dependent AUCs (0.799 and 0.691). These results suggest that our +explainable deep survival prediction model surpasses traditional survival +analysis methods in risk prediction, improving interpretability for clinical +decision making and enhancing AI system trustworthiness. + +
+
+
+
+
+ + ☆ PVTransformer: Point-to-Voxel Transformer for Scalable 3D Object + Detection + + +
+ 3D object detectors for point clouds often rely on a pooling-based PointNet +to encode sparse points into grid-like voxels or pillars. In this paper, we +identify that the common PointNet design introduces an information bottleneck +that limits 3D object detection accuracy and scalability. To address this +limitation, we propose PVTransformer: a transformer-based point-to-voxel +architecture for 3D detection. Our key idea is to replace the PointNet pooling +operation with an attention module, leading to a better point-to-voxel +aggregation function. Our design respects the permutation invariance of sparse +3D points while being more expressive than the pooling-based PointNet. +Experimental results show our PVTransformer achieves much better performance +compared to the latest 3D object detectors. On the widely used Waymo Open +Dataset, our PVTransformer achieves state-of-the-art 76.5 mAPH L2, +outperforming the prior art of SWFormer by +1.7 mAPH L2. + +
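Replacing pooling with attention for point-to-voxel aggregation can be illustrated with the small module below, which lets a learned query attend over the (padded) points of each voxel; this is a sketch of the idea, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class AttentionVoxelPooling(nn.Module):
    """Aggregate point features inside each voxel with a learned query and
    single-head attention instead of max pooling (permutation-invariant)."""
    def __init__(self, dim):
        super().__init__()
        self.query = nn.Parameter(torch.randn(1, 1, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, num_heads=1, batch_first=True)

    def forward(self, point_feats, padding_mask=None):
        # point_feats: (num_voxels, max_points, dim); padding_mask: True = padded slot
        q = self.query.expand(point_feats.size(0), -1, -1)
        pooled, _ = self.attn(q, point_feats, point_feats,
                              key_padding_mask=padding_mask)
        return pooled.squeeze(1)                 # (num_voxels, dim) voxel features
```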
+
+
+
+
+ + ☆ Kinematic analysis of structural mechanics based on convolutional neural + network + + +
+ We attempt to use a convolutional neural network for the kinematic analysis of plane bar structures. Using the 3dsMax animation software and the OpenCV module, we build our own image dataset of geometrically stable and geometrically unstable systems. We construct and train a convolutional neural network model based on the TensorFlow and Keras deep learning frameworks. The model achieves 100% accuracy on the training, validation, and test sets, and 93.7% accuracy on an additional test set, indicating that a convolutional neural network can learn and master the relevant knowledge of kinematic analysis in structural mechanics. In the future, the generalization ability of the model can be improved through dataset diversity, with the potential to surpass human experts on complex structures. Convolutional neural networks thus have practical value in the field of kinematic analysis of structural mechanics. Using visualization techniques, we reveal how the convolutional neural network learns and recognizes structural features. When using a pre-trained VGG16 model for feature extraction and fine-tuning, we found that its generalization ability is inferior to that of the self-built model.
+
+
+
+ comment: 9 pages, 13 figures +
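For illustration, a classifier of roughly the scale the abstract suggests can be written in a few lines; the paper itself uses TensorFlow/Keras, so the PyTorch sketch below is only a rough, hypothetical equivalent for the binary stable/unstable task.

```python
import torch
import torch.nn as nn

# Small binary image classifier: geometrically stable vs. unstable structure diagrams.
classifier = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(128, 2),                       # logits for the two classes
)

logits = classifier(torch.randn(8, 3, 224, 224))   # -> shape (8, 2)
```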
+
+
+
+
+ + ☆ Adapting to Distribution Shift by Visual Domain Prompt Generation ICLR2024 + + +
+ In this paper, we aim to adapt a model at test time using a few unlabeled data to address distribution shifts. To tackle the challenges of extracting domain knowledge from a limited amount of data, it is crucial to utilize correlated information from pre-trained backbones and source domains. Previous studies fail to utilize recent foundation models with strong out-of-distribution generalization. Additionally, domain-centric designs are not favored in their works. Furthermore, they model source domains and learn to adapt in independent, disjoint training stages. In this work, we propose an approach built on top of the pre-computed features of a foundation model. Specifically, we build a knowledge bank to learn transferable knowledge from the source domains. Conditioned on few-shot target data, we introduce a domain prompt generator to condense the knowledge bank into a domain-specific prompt. The domain prompt then directs the visual features towards a particular domain via a guidance module. Moreover, we propose a domain-aware contrastive loss and employ meta-learning to facilitate domain knowledge extraction. Extensive experiments are conducted to validate the domain knowledge extraction. The proposed method outperforms previous work on 5 large-scale benchmarks including WILDS and DomainNet.
+
+
+
+ comment: ICLR2024, code: https://github.com/Guliisgreat/VDPG +
+
+
+
+
+ + ☆ ImageInWords: Unlocking Hyper-Detailed Image Descriptions + + +
+ Despite the longstanding adage "an image is worth a thousand words," creating +accurate and hyper-detailed image descriptions for training Vision-Language +models remains challenging. Current datasets typically have web-scraped +descriptions that are short, low-granularity, and often contain details +unrelated to the visual content. As a result, models trained on such data +generate descriptions replete with missing information, visual inconsistencies, +and hallucinations. To address these issues, we introduce ImageInWords (IIW), a +carefully designed human-in-the-loop annotation framework for curating +hyper-detailed image descriptions and a new dataset resulting from this +process. We validate the framework through evaluations focused on the quality +of the dataset and its utility for fine-tuning with considerations for +readability, comprehensiveness, specificity, hallucinations, and +human-likeness. Our dataset significantly improves across these dimensions +compared to recently released datasets (+66%) and GPT-4V outputs (+48%). +Furthermore, models fine-tuned with IIW data excel by +31% against prior work +along the same human evaluation dimensions. Given our fine-tuned models, we +also evaluate text-to-image generation and vision-language reasoning. Our +model's descriptions can generate images closest to the original, as judged by +both automated and human metrics. We also find our model produces more +compositionally rich descriptions, outperforming the best baseline by up to 6% +on ARO, SVO-Probes, and Winoground datasets. + +
+
+ comment: Webpage (https://google.github.io/imageinwords), GitHub + (https://github.com/google/imageinwords), HuggingFace + (https://huggingface.co/datasets/google/imageinwords) +
+
+
+
+
+ + ☆ Jointly Learning Spatial, Angular, and Temporal Information for Enhanced + Lane Detection + + +
+ This paper introduces a novel approach for enhanced lane detection by integrating spatial, angular, and temporal information through light field imaging and novel deep learning models. Utilizing lenslet-inspired 2D light field representations and LSTM networks, our method significantly improves lane detection in challenging conditions. We demonstrate the efficacy of this approach with modified CNN architectures, showing superior performance over traditional methods. Our findings suggest this integrated data approach could advance lane detection technologies and inspire new models that leverage these multidimensional insights for autonomous vehicle perception.
+
+ comment: 5 pages, 3 Figures , Accepted IEEE Conference on Signal Processing + and Communications Applications +
+
+
+
+
+ + ☆ Efficient Text-driven Motion Generation via Latent Consistency Training + + +
+ Motion diffusion models have recently proven successful for text-driven human motion generation. Despite their excellent generation performance, they are challenging to run in real time due to the multi-step sampling mechanism, which involves tens or hundreds of repeated function evaluations. To this end, we investigate motion latent consistency training (MLCT) for motion generation to alleviate the computation and time consumption of iterative inference. It applies diffusion pipelines to low-dimensional motion latent spaces to mitigate the computational burden of each function evaluation. Framed in terms of probabilistic flow ordinary differential equation (PF-ODE) theory, MLCT allows inference in extremely few steps from the prior distribution to the motion latent representation distribution by maintaining consistency of the outputs over the trajectory of the PF-ODE. In particular, we introduce a quantization constraint to obtain motion latent representations that are bounded, regular, and well-reconstructed compared to traditional variational constraints. Furthermore, we propose a conditional PF-ODE trajectory simulation method, which improves the conditional generation performance with minimal additional training costs. Extensive experiments on two human motion generation benchmarks show that the proposed model achieves state-of-the-art performance with less than 10% of the time cost.
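+
+ The consistency objective over the PF-ODE trajectory can be illustrated with a generic consistency-training step in latent space; the noise parameterization, the EMA target, and the model interface are assumptions in the spirit of consistency models, not the paper's exact MLCT procedure:
+
+     import torch
+     import torch.nn.functional as F
+
+     def consistency_training_step(model, ema_model, z0, t_n, t_np1):
+         """One latent consistency-training step (illustrative).
+
+         z0    : clean motion latents, shape (B, D)
+         t_n   : smaller noise levels, shape (B, 1)
+         t_np1 : adjacent larger noise levels, shape (B, 1)
+         model(z_t, t) is assumed to predict the PF-ODE endpoint (the clean latent).
+         """
+         noise = torch.randn_like(z0)
+         z_tn   = z0 + t_n   * noise          # same noise at adjacent levels
+         z_tnp1 = z0 + t_np1 * noise
+         with torch.no_grad():
+             target = ema_model(z_tn, t_n)     # consistency target from EMA weights
+         pred = model(z_tnp1, t_np1)
+         return F.mse_loss(pred, target)
+
+     @torch.no_grad()
+     def ema_update(model, ema_model, decay=0.999):
+         for p, p_ema in zip(model.parameters(), ema_model.parameters()):
+             p_ema.mul_(decay).add_(p, alpha=1 - decay)
+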
+
+
+
+
+ + ☆ Light Field Spatial Resolution Enhancement Framework + + +
+ Light field (LF) imaging captures both angular and spatial light distributions, enabling advanced photographic techniques. However, micro-lens array (MLA)-based cameras face a spatial-angular resolution tradeoff due to a single shared sensor. We propose a novel light field framework for resolution enhancement, employing a modular approach. The first module generates a high-resolution, all-in-focus image. The second module, a texture transformer network, enhances the resolution of each light field perspective independently using the output of the first module as a reference image. The final module leverages light field regularity to jointly improve resolution across all LF image perspectives. Our approach demonstrates superior performance to existing methods in both qualitative and quantitative evaluations.
+
+ comment: 5 pages, 6 figures, accepted in IEEE Conference on Signal Processing + and Communications Applications +
+
+
+
+
+ + ☆ Fused attention mechanism-based ore sorting network + + +
+ Deep learning has had a significant impact on the identification and classification of mineral resources, playing a key role in efficiently and accurately identifying different minerals, which is important for improving the efficiency and accuracy of mining. However, traditional ore sorting methods often suffer from inefficiency and lack of accuracy, especially in complex mineral environments. To address these challenges, this study proposes a method called OreYOLO, which incorporates an attention mechanism and a multi-scale feature fusion strategy, based on ore data from gold and sulfide ores. By introducing the progressive feature pyramid structure into YOLOv5 and embedding the attention mechanism in the feature extraction module, the detection performance and accuracy of the model are greatly improved. In order to adapt to diverse ore sorting scenarios and the deployment requirements of edge devices, the network structure is designed to be lightweight, achieving a low number of parameters (3.458M) and low computational complexity (6.3 GFLOPs) while maintaining high accuracy (99.3% and 99.2%, respectively). In the experimental part, a target detection dataset containing 6000 images of gold and sulfide iron ore is constructed for classification training, and several sets of comparison experiments are set up, including the YOLO series, EfficientDet, Faster-RCNN, and CenterNet. The experiments show that OreYOLO outperforms these commonly used high-performance object detection architectures.
+
+
+
+
+ + ☆ MR-Transformer: Vision Transformer for Total Knee Replacement Prediction + Using Magnetic Resonance Imaging + + +
+ A transformer-based deep learning model, MR-Transformer, was developed for +total knee replacement (TKR) prediction using magnetic resonance imaging (MRI). +The model incorporates the ImageNet pre-training and captures three-dimensional +(3D) spatial correlation from the MR images. The performance of the proposed +model was compared to existing state-of-the-art deep learning models for knee +injury diagnosis using MRI. Knee MR scans of four different tissue contrasts +from the Osteoarthritis Initiative and Multicenter Osteoarthritis Study +databases were utilized in the study. Experimental results demonstrated the +state-of-the-art performance of the proposed model on TKR prediction using MRI. + +
+
+
+
+
+ + ☆ A self-supervised text-vision framework for automated brain abnormality + detection + + +
+ Artificial neural networks trained on large, expert-labelled datasets are considered state-of-the-art for a range of medical image recognition tasks. However, categorically labelled datasets are time-consuming to generate and constrain classification to a pre-defined, fixed set of classes. For neuroradiological applications in particular, this represents a barrier to clinical adoption. To address these challenges, we present a self-supervised text-vision framework that learns to detect clinically relevant abnormalities in brain MRI scans by directly leveraging the rich information contained in accompanying free-text neuroradiology reports. Our training approach consisted of two steps. First, a dedicated neuroradiological language model - NeuroBERT - was trained to generate fixed-dimensional vector representations of neuroradiology reports (N = 50,523) via domain-specific self-supervised learning tasks. Next, convolutional neural networks (one per MRI sequence) learnt to map individual brain scans to their corresponding text vector representations by optimising a mean square error loss. Once trained, our text-vision framework can be used to detect abnormalities in unreported brain MRI examinations by scoring scans against suitable query sentences (e.g., 'there is an acute stroke', 'there is hydrocephalus', etc.), enabling a range of classification-based applications including automated triage. Potentially, our framework could also serve as a clinical decision support tool, not only by suggesting findings to radiologists and detecting errors in provisional reports, but also by retrieving and displaying examples of pathologies from historical examinations that could be relevant to the current case based on textual descriptors.
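+
+ The query-sentence scoring described above might look roughly like the following sketch, where the image encoder (regressing report embeddings) and the report/language encoder are assumed placeholders rather than the released models:
+
+     import torch
+     import torch.nn.functional as F
+
+     @torch.no_grad()
+     def abnormality_scores(scan, image_encoder, text_encoder, queries):
+         """Score one brain MRI scan against free-text query sentences.
+
+         image_encoder: maps a scan tensor to a report-embedding-sized vector (1, D)
+         text_encoder : maps a sentence to the same embedding space (1, D), e.g. a
+                        domain-specific BERT; both encoders are assumptions here.
+         """
+         v = image_encoder(scan)                       # predicted report embedding
+         scores = {}
+         for q in queries:
+             t = text_encoder(q)                       # query sentence embedding
+             scores[q] = F.cosine_similarity(v, t).item()
+         return scores
+
+     # Example query sentences used for classification-style triage.
+     queries = ["there is an acute stroke", "there is hydrocephalus", "the scan is normal"]
+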
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Instantaneous Perception of Moving Objects in 3D CVPR 2024 + + +
+ The perception of 3D motion of surrounding traffic participants is crucial for driving safety. While existing works primarily focus on general large motions, we contend that the instantaneous detection and quantification of subtle motions is equally important, as they indicate nuances in driving behavior that may be safety critical, such as behaviors near a stop sign or parking positions. We delve into this under-explored task, examining its unique challenges and developing our solution, accompanied by a carefully designed benchmark. Specifically, due to the lack of correspondences between consecutive frames of sparse Lidar point clouds, static objects might appear to be moving - the so-called swimming effect. This intertwines with the true object motion, thereby posing ambiguity in accurate estimation, especially for subtle motions. To address this, we propose to leverage local occupancy completion of object point clouds to densify the shape cue and mitigate the impact of swimming artifacts. The occupancy completion is learned in an end-to-end fashion together with the detection of moving objects and the estimation of their motion, instantaneously as soon as objects start to move. Extensive experiments demonstrate superior performance compared to standard 3D motion estimation approaches, particularly highlighting our method's specialized treatment of subtle motions.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ BetterNet: An Efficient CNN Architecture with Residual Learning and + Attention for Precision Polyp Segmentation + + +
+ Colorectal cancer contributes significantly to cancer-related mortality. Timely identification and elimination of polyps through colonoscopy screening is crucial in order to decrease mortality rates. Accurately detecting polyps in colonoscopy images is difficult because of differences in characteristics such as size, shape, texture, and similarity to surrounding tissues. Current deep-learning methods often face difficulties in capturing the long-range connections necessary for segmentation. This research presents BetterNet, a convolutional neural network (CNN) architecture that combines residual learning and attention methods to enhance the accuracy of polyp segmentation. Its primary characteristics encompass (1) a residual decoder architecture that facilitates efficient gradient propagation and integration of multiscale features, (2) channel and spatial attention blocks within the decoder block that concentrate the learning process on the relevant areas of polyp regions, (3) state-of-the-art performance on polyp segmentation benchmarks while still ensuring computational efficiency, (4) thorough ablation tests conducted to confirm the influence of architectural components, and (5) model code made available as open source for further contribution. Extensive evaluations conducted on datasets such as Kvasir-SEG, CVC-ClinicDB, Endoscene, EndoTect, and Kvasir-Sessile demonstrate that BetterNet outperforms current SOTA models in terms of segmentation accuracy by significant margins. The lightweight design enables real-time inference for various applications. BetterNet shows promise in integrating computer-assisted diagnosis techniques to enhance the detection of polyps and the early recognition of cancer. Link to the code: https://github.com/itsOwen/BetterNet
+
+
+
+
+ + ♻ ☆ Open-ended VQA benchmarking of Vision-Language models by exploiting + Classification datasets and their semantic hierarchy ICLR 2024 + + +
+ The evaluation of text-generative vision-language models is a challenging yet +crucial endeavor. By addressing the limitations of existing Visual Question +Answering (VQA) benchmarks and proposing innovative evaluation methodologies, +our research seeks to advance our understanding of these models' capabilities. +We propose a novel VQA benchmark based on well-known visual classification +datasets which allows a granular evaluation of text-generative vision-language +models and their comparison with discriminative vision-language models. To +improve the assessment of coarse answers on fine-grained classification tasks, +we suggest using the semantic hierarchy of the label space to ask automatically +generated follow-up questions about the ground-truth category. Finally, we +compare traditional NLP and LLM-based metrics for the problem of evaluating +model predictions given ground-truth answers. We perform a human evaluation +study upon which we base our decision on the final metric. We apply our +benchmark to a suite of vision-language models and show a detailed comparison +of their abilities on object, action, and attribute classification. Our +contributions aim to lay the foundation for more precise and meaningful +assessments, facilitating targeted progress in the exciting field of +vision-language modeling. + +
+
+ comment: Accepted as Spotlight Paper for ICLR 2024. The first two authors + contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Visual Attention Methods in Deep Learning: An In-Depth Survey + + +
+ Inspired by the human cognitive system, attention is a mechanism that +imitates the human cognitive awareness about specific information, amplifying +critical details to focus more on the essential aspects of data. Deep learning +has employed attention to boost performance for many applications. +Interestingly, the same attention design can suit processing different data +modalities and can easily be incorporated into large networks. Furthermore, +multiple complementary attention mechanisms can be incorporated into one +network. Hence, attention techniques have become extremely attractive. However, +the literature lacks a comprehensive survey on attention techniques to guide +researchers in employing attention in their deep models. Note that, besides +being demanding in terms of training data and computational resources, +transformers only cover a single category in self-attention out of the many +categories available. We fill this gap and provide an in-depth survey of 50 +attention techniques, categorizing them by their most prominent features. We +initiate our discussion by introducing the fundamental concepts behind the +success of the attention mechanism. Next, we furnish some essentials such as +the strengths and limitations of each attention category, describe their +fundamental building blocks, basic formulations with primary usage, and +applications specifically for computer vision. We also discuss the challenges +and general open questions related to attention mechanisms. Finally, we +recommend possible future research directions for deep attention. All the +information about visual attention methods in deep learning is provided at +\href{https://github.com/saeed-anwar/VisualAttention}{https://github.com/saeed-anwar/VisualAttention} + +
+
+ comment: Accepted in Information Fusion +
+
+
+
+
+ + ♻ ☆ Efficient Remote Sensing with Harmonized Transfer Learning and Modality + Alignment ICLR + + +
+ With the rise of Visual and Language Pretraining (VLP), an increasing number +of downstream tasks are adopting the paradigm of pretraining followed by +fine-tuning. Although this paradigm has demonstrated potential in various +multimodal downstream tasks, its implementation in the remote sensing domain +encounters some obstacles. Specifically, the tendency for same-modality +embeddings to cluster together impedes efficient transfer learning. To tackle +this issue, we review the aim of multimodal transfer learning for downstream +tasks from a unified perspective, and rethink the optimization process based on +three distinct objectives. We propose "Harmonized Transfer Learning and +Modality Alignment (HarMA)", a method that simultaneously satisfies task +constraints, modality alignment, and single-modality uniform alignment, while +minimizing training overhead through parameter-efficient fine-tuning. +Remarkably, without the need for external data for training, HarMA achieves +state-of-the-art performance in two popular multimodal retrieval tasks in the +field of remote sensing. Our experiments reveal that HarMA achieves competitive +and even superior performance to fully fine-tuned models with only minimal +adjustable parameters. Due to its simplicity, HarMA can be integrated into +almost all existing multimodal pretraining models. We hope this method can +facilitate the efficient application of large models to a wide range of +downstream tasks while significantly reducing the resource consumption. Code is +available at https://github.com/seekerhuang/HarMA. + +
+
+ comment: Accepted by the Twelfth International Conference on Learning + Representations (ICLR) Workshop +
+
+
+
+
+ + ♻ ☆ Position paper: Do not explain (vision models) without context ICML 2024 + + +
+ Does the stethoscope in the picture make the adjacent person a doctor or a patient? This, of course, depends on the contextual relationship of the two objects. If it is so obvious, why do explanation methods for vision models not use contextual information? In this paper, we (1) review the most popular methods of explaining computer vision models, pointing out that they do not take context information into account, (2) provide examples of real-world use cases where spatial context plays a significant role, (3) propose new research directions that may lead to better use of context information in explaining computer vision models, and (4) argue that a change in approach to explanations is needed, from 'where' to 'how'.
+
+ comment: Accepted for ICML 2024 +
+
+
+
+
+ + ♻ ☆ A Collaborative Model-driven Network for MRI Reconstruction + + +
+ Deep learning (DL)-based methods offer a promising solution to reduce the prolonged scanning time in magnetic resonance imaging (MRI). While model-driven DL methods have demonstrated convincing results by incorporating prior knowledge into deep networks, further exploration is needed to optimize the integration of diverse priors. Existing model-driven networks typically utilize linearly stacked unrolled cascades to mimic iterative solution steps in optimization algorithms. However, this approach needs to find a balance between different prior-based regularizers during training, resulting in slower convergence and suboptimal reconstructions. To overcome these limitations, we propose a collaborative model-driven network to maximally exploit the complementarity of different regularizers. We design attention modules to learn both the relative confidence (RC) and overall confidence (OC) for the intermediate reconstructions (IRs) generated by different prior-based subnetworks. RC assigns more weight to the areas of expertise of the subnetworks, enabling precise element-wise collaboration. We design correction modules to tackle bottleneck scenarios where both subnetworks exhibit low accuracy, and they further optimize the IRs based on OC maps. IRs across various stages are concatenated and fed to the attention modules to build robust and accurate confidence maps. Experimental results on multiple datasets showed significant improvements in the final results without additional computational costs. Moreover, the proposed model-driven network design strategy can be conveniently applied to various model-driven methods to improve their performance.
+
+
+
+
+ + ♻ ☆ Deep Regression Representation Learning with Topology ICML2024 + + +
+ Most works studying representation learning focus only on classification and neglect regression. Yet, the learning objectives and therefore the representation topologies of the two tasks are fundamentally different: classification targets class separation, leading to disconnected representations, whereas regression requires ordinality with respect to the target, leading to continuous representations. We thus wonder how the effectiveness of a regression representation is influenced by its topology, with evaluation based on the Information Bottleneck (IB) principle. The IB principle is an important framework that provides principles for learning effective representations. We establish two connections between it and the topology of regression representations. The first connection reveals that a lower intrinsic dimension of the feature space implies a reduced complexity of the representation Z. This complexity can be quantified as the conditional entropy of Z on the target space Y and serves as an upper bound on the generalization error. The second connection suggests that learning a feature space that is topologically similar to the target space will better align with the IB principle. Based on these two connections, we introduce PH-Reg, a regularizer specific to regression that matches the intrinsic dimension and topology of the feature space with those of the target space. Experiments on synthetic and real-world regression tasks demonstrate the benefits of PH-Reg.
+
+ comment: ICML2024 +
+
+
+
+
+ + ♻ ☆ Delocate: Detection and Localization for Deepfake Videos with + Randomly-Located Tampered Traces + + +
+ Deepfake videos are becoming increasingly realistic, showing few tampering traces on facial areas that vary between frames. Consequently, existing Deepfake detection methods struggle to detect unknown-domain Deepfake videos while accurately locating the tampered region. To address this limitation, we propose Delocate, a novel Deepfake detection model that can both recognize and localize unknown-domain Deepfake videos. Our method consists of two stages named recovering and localization. In the recovering stage, the model randomly masks regions of interest (ROIs) and reconstructs real faces without tampering traces, leading to a relatively good recovery effect for real faces and a poor recovery effect for fake faces. In the localization stage, the output of the recovery phase and the forgery ground truth mask serve as supervision to guide the forgery localization process. This process strategically emphasizes the recovery phase of fake faces with poor recovery, facilitating the localization of tampered regions. Our extensive experiments on four widely used benchmark datasets demonstrate that Delocate not only excels in localizing tampered areas but also enhances cross-domain detection performance.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.09921, + arXiv:2305.05943 +
+
+
+
+
+ + ♻ ☆ Removal and Selection: Improving RGB-Infrared Object Detection via + Coarse-to-Fine Fusion + + +
+ Object detection in visible (RGB) and infrared (IR) images has been widely applied in recent years. Leveraging the complementary characteristics of RGB and IR images, the object detector provides reliable and robust object localization from day to night. Most existing fusion strategies directly input RGB and IR images into deep neural networks, leading to inferior detection performance. Because the RGB and IR features carry modality-specific noise, these strategies exacerbate the noise in the fused features as it propagates through the network. Inspired by the mechanism of the human brain processing multimodal information, in this paper, we introduce a new coarse-to-fine perspective to purify and fuse the two modality features. Specifically, following this perspective, we design a Redundant Spectrum Removal module to coarsely remove interfering information within each modality and a Dynamic Feature Selection module to finely select the desired features for feature fusion. To verify the effectiveness of the coarse-to-fine fusion strategy, we construct a new object detector called the Removal and Selection Detector (RSDet). Extensive experiments on three RGB-IR object detection datasets verify the superior performance of our method.
+
+ comment: 11pages, 11figures +
+
+
+
+
+ + ♻ ☆ Morphology-Aware Interactive Keypoint Estimation MICCAI 2022 + + +
+ Diagnosis based on medical images, such as X-ray images, often involves +manual annotation of anatomical keypoints. However, this process involves +significant human efforts and can thus be a bottleneck in the diagnostic +process. To fully automate this procedure, deep-learning-based methods have +been widely proposed and have achieved high performance in detecting keypoints +in medical images. However, these methods still have clinical limitations: +accuracy cannot be guaranteed for all cases, and it is necessary for doctors to +double-check all predictions of models. In response, we propose a novel deep +neural network that, given an X-ray image, automatically detects and refines +the anatomical keypoints through a user-interactive system in which doctors can +fix mispredicted keypoints with fewer clicks than needed during manual +revision. Using our own collected data and the publicly available AASCE +dataset, we demonstrate the effectiveness of the proposed method in reducing +the annotation costs via extensive quantitative and qualitative results. A demo +video of our approach is available on our project webpage. + +
+
+ comment: MICCAI 2022. The first two authors contributed equally. The last two + authors are the co-corresponding authors +
+
+
+
+
+ + ♻ ☆ Improved Crop and Weed Detection with Diverse Data Ensemble Learning in + Agriculture CVPR + + +
+ Modern agriculture heavily relies on Site-Specific Farm Management practices, +necessitating accurate detection, localization, and quantification of crops and +weeds in the field, which can be achieved using deep learning techniques. In +this regard, crop and weed-specific binary segmentation models have shown +promise. However, uncontrolled field conditions limit their performance from +one field to the other. To improve semantic model generalization, existing +methods augment and synthesize agricultural data to account for uncontrolled +field conditions. However, given highly varied field conditions, these methods +have limitations. To overcome the challenges of model deterioration in such +conditions, we propose utilizing data specific to other crops and weeds for our +specific target problem. To achieve this, we propose a novel ensemble +framework. Our approach involves utilizing different crop and weed models +trained on diverse datasets and employing a teacher-student configuration. By +using homogeneous stacking of base models and a trainable meta-architecture to +combine their outputs, we achieve significant improvements for Canola crops and +Kochia weeds on unseen test data, surpassing the performance of single semantic +segmentation models. We identify the UNET meta-architecture as the most +effective in this context. Finally, through ablation studies, we demonstrate +and validate the effectiveness of our proposed model. We observe that including +base models trained on other target crops and weeds can help generalize the +model to capture varied field conditions. Lastly, we propose two novel datasets +with varied conditions for comparisons. + +
+
+ comment: Accepted in CVPR Workshop as an Oral +
+
+
+
+
+ + ♻ ☆ Lightweight Event-based Optical Flow Estimation via Iterative Deblurring ICRA'24 + + +
+ Inspired by frame-based methods, state-of-the-art event-based optical flow +networks rely on the explicit construction of correlation volumes, which are +expensive to compute and store, rendering them unsuitable for robotic +applications with limited compute and energy budget. Moreover, correlation +volumes scale poorly with resolution, prohibiting them from estimating +high-resolution flow. We observe that the spatiotemporally continuous traces of +events provide a natural search direction for seeking pixel correspondences, +obviating the need to rely on gradients of explicit correlation volumes as such +search directions. We introduce IDNet (Iterative Deblurring Network), a +lightweight yet high-performing event-based optical flow network directly +estimating flow from event traces without using correlation volumes. We further +propose two iterative update schemes: "ID" which iterates over the same batch +of events, and "TID" which iterates over time with streaming events in an +online fashion. Our top-performing ID model sets a new state of the art on DSEC +benchmark. Meanwhile, the base ID model is competitive with prior arts while +using 80% fewer parameters, consuming 20x less memory footprint and running 40% +faster on the NVidia Jetson Xavier NX. Furthermore, the TID model is even more +efficient offering an additional 5x faster inference speed and 8 ms ultra-low +latency at the cost of only a 9% performance drop, making it the only model +among current literature capable of real-time operation while maintaining +decent performance. + +
+
+ comment: Accepted to IEEE International Conference on Robotics and Automation + (ICRA'24), Yokohama, Japan, May 13-17, 2024. arXiv revision includes + additional ablation studies results +
+
+
+
+
+ + ♻ ☆ Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual + Target Speech Extraction IJCAI 2024 + + +
+ The integration of visual cues has revitalized the performance of the target +speech extraction task, elevating it to the forefront of the field. +Nevertheless, this multi-modal learning paradigm often encounters the challenge +of modality imbalance. In audio-visual target speech extraction tasks, the +audio modality tends to dominate, potentially overshadowing the importance of +visual guidance. To tackle this issue, we propose AVSepChain, drawing +inspiration from the speech chain concept. Our approach partitions the +audio-visual target speech extraction task into two stages: speech perception +and speech production. In the speech perception stage, audio serves as the +dominant modality, while visual information acts as the conditional modality. +Conversely, in the speech production stage, the roles are reversed. This +transformation of modality status aims to alleviate the problem of modality +imbalance. Additionally, we introduce a contrastive semantic matching loss to +ensure that the semantic information conveyed by the generated speech aligns +with the semantic information conveyed by lip movements during the speech +production stage. Through extensive experiments conducted on multiple benchmark +datasets for audio-visual target speech extraction, we showcase the superior +performance achieved by our proposed method. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ DoraemonGPT: Toward Understanding Dynamic Scenes with Large Language + Models (Exemplified as A Video Agent) + + +
+ Recent LLM-driven visual agents mainly focus on solving image-based tasks, which limits their ability to understand dynamic scenes, making them far from real-life applications such as guiding students in laboratory experiments and identifying their mistakes. Hence, this paper explores DoraemonGPT, a comprehensive and conceptually elegant system driven by LLMs to understand dynamic scenes. Considering that the video modality better reflects the ever-changing nature of real-world scenarios, we exemplify DoraemonGPT as a video agent. Given a video with a question/task, DoraemonGPT begins by converting the input video into a symbolic memory that stores task-related attributes. This structured representation allows for spatial-temporal querying and reasoning by well-designed sub-task tools, resulting in concise intermediate results. Recognizing that LLMs have limited internal knowledge when it comes to specialized domains (e.g., analyzing the scientific principles underlying experiments), we incorporate plug-and-play tools to access external knowledge and address tasks across different domains. Moreover, a novel LLM-driven planner based on Monte Carlo Tree Search is introduced to explore the large planning space for scheduling various tools. The planner iteratively finds feasible solutions by backpropagating the result's reward, and multiple solutions can be summarized into an improved final answer. We extensively evaluate DoraemonGPT's effectiveness on three benchmarks and several in-the-wild scenarios. The code will be released at https://github.com/z-x-yang/DoraemonGPT.
+
+
+
+
+ + ♻ ☆ Exploiting Semantic Reconstruction to Mitigate Hallucinations in + Vision-Language Models + + +
+ Hallucinations in vision-language models pose a significant challenge to +their reliability, particularly in the generation of long captions. Current +methods fall short of accurately identifying and mitigating these +hallucinations. To address this issue, we introduce ESREAL, a novel +unsupervised learning framework designed to suppress the generation of +hallucinations through accurate localization and penalization of hallucinated +tokens. Initially, ESREAL creates a reconstructed image based on the generated +caption and aligns its corresponding regions with those of the original image. +This semantic reconstruction aids in identifying both the presence and type of +token-level hallucinations within the generated caption. Subsequently, ESREAL +computes token-level hallucination scores by assessing the semantic similarity +of aligned regions based on the type of hallucination. Finally, ESREAL employs +a proximal policy optimization algorithm, where it selectively penalizes +hallucinated tokens according to their token-level hallucination scores. Our +framework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2 +by 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved +solely through signals derived from the image itself, without the need for any +image-text pairs. + +
+
+
+
+
+ + ♻ ☆ MagicPose: Realistic Human Poses and Facial Expressions Retargeting with + Identity-aware Diffusion ICML 2024 + + +
+ In this work, we propose MagicPose, a diffusion-based model for 2D human pose +and facial expression retargeting. Specifically, given a reference image, we +aim to generate a person's new images by controlling the poses and facial +expressions while keeping the identity unchanged. To this end, we propose a +two-stage training strategy to disentangle human motions and appearance (e.g., +facial expressions, skin tone and dressing), consisting of (1) the pre-training +of an appearance-control block and (2) learning appearance-disentangled pose +control. Our novel design enables robust appearance control over generated +human images, including body, facial attributes, and even background. By +leveraging the prior knowledge of image diffusion models, MagicPose generalizes +well to unseen human identities and complex poses without the need for +additional fine-tuning. Moreover, the proposed model is easy to use and can be +considered as a plug-in module/extension to Stable Diffusion. The code is +available at: https://github.com/Boese0601/MagicDance + +
+
+ comment: Accepted by ICML 2024. MagicPose and MagicDance are the same project. + Website:https://boese0601.github.io/magicdance/ + Code:https://github.com/Boese0601/MagicDance +
+
+
+
+
+ + ♻ ☆ Mastering Text-to-Image Diffusion: Recaptioning, Planning, and + Generating with Multimodal LLMs ICML 2024 + + +
+ Diffusion models have exhibited exceptional performance in text-to-image generation and editing. However, existing methods often face challenges when handling complex text prompts that involve multiple objects with multiple attributes and relationships. In this paper, we propose a brand new training-free text-to-image generation/editing framework, namely Recaption, Plan and Generate (RPG), harnessing the powerful chain-of-thought reasoning ability of multimodal LLMs to enhance the compositionality of text-to-image diffusion models. Our approach employs the MLLM as a global planner to decompose the process of generating complex images into multiple simpler generation tasks within subregions. We propose complementary regional diffusion to enable region-wise compositional generation. Furthermore, we integrate text-guided image generation and editing within the proposed RPG in a closed-loop fashion, thereby enhancing generalization ability. Extensive experiments demonstrate that our RPG outperforms state-of-the-art text-to-image diffusion models, including DALL-E 3 and SDXL, particularly in multi-category object composition and text-image semantic alignment. Notably, our RPG framework exhibits wide compatibility with various MLLM architectures (e.g., MiniGPT-4) and diffusion backbones (e.g., ControlNet). Our code is available at: https://github.com/YangLing0818/RPG-DiffusionMaster
+
+ comment: ICML 2024. Project: + https://github.com/YangLing0818/RPG-DiffusionMaster +
+
+
+
+
+ + ♻ ☆ RegionPLC: Regional Point-Language Contrastive Learning for Open-World + 3D Scene Understanding CVPR2024 + + +
+ We propose a lightweight and scalable Regional Point-Language Contrastive +learning framework, namely \textbf{RegionPLC}, for open-world 3D scene +understanding, aiming to identify and recognize open-set objects and +categories. Specifically, based on our empirical studies, we introduce a +3D-aware SFusion strategy that fuses 3D vision-language pairs derived from +multiple 2D foundation models, yielding high-quality, dense region-level +language descriptions without human 3D annotations. Subsequently, we devise a +region-aware point-discriminative contrastive learning objective to enable +robust and effective 3D learning from dense regional language supervision. We +carry out extensive experiments on ScanNet, ScanNet200, and nuScenes datasets, +and our model outperforms prior 3D open-world scene understanding approaches by +an average of 17.2\% and 9.1\% for semantic and instance segmentation, +respectively, while maintaining greater scalability and lower resource demands. +Furthermore, our method has the flexibility to be effortlessly integrated with +language models to enable open-ended grounded 3D reasoning without extra +task-specific training. Code is available at https://github.com/CVMI-Lab/PLA. + +
+
+ comment: To appear in CVPR2024. Project page: https://jihanyang.github.io/projects/RegionPLC
+
+
+
+
+ + ♻ ☆ PLMM: Personal Large Language Models on Mobile Devices + + +
+ Inspired by Federated Learning, in this paper we propose personal large models that are distilled from traditional large language models but are more adaptive to local users' personal information such as education background and hobbies. We classify large language models into three levels: the personal level, the expert level, and the traditional level. The personal-level models are adaptive to users' personal information. They encrypt the users' input and protect their privacy. The expert-level models focus on merging specific knowledge such as finance, IT, and art. The traditional-level models focus on universal knowledge discovery and on upgrading the expert models. In this classification, the personal models directly interact with the user. Within the whole system, the personal models hold users' (encrypted) personal information. Moreover, such models must be small enough to run on personal computers or mobile devices. Finally, they also have to respond in real time for a better user experience and produce high-quality results. The proposed personal large models can be applied to a wide range of applications such as language and vision tasks.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2307.13221 +
+
+
+
+
+ + ♻ ☆ VimTS: A Unified Video and Image Text Spotter for Enhancing the + Cross-domain Generalization + + +
+ Text spotting, a task involving the extraction of textual information from image or video sequences, faces challenges in cross-domain adaptation, such as image-to-image and image-to-video generalization. In this paper, we introduce a new method, termed VimTS, which enhances the generalization ability of the model by achieving better synergy among different tasks. Specifically, we propose a Prompt Queries Generation Module and a Tasks-aware Adapter to effectively convert the original single-task model into a multi-task model suitable for both image and video scenarios with minimal additional parameters. The Prompt Queries Generation Module facilitates explicit interaction between different tasks, while the Tasks-aware Adapter helps the model dynamically learn suitable features for each task. Additionally, to further enable the model to learn temporal information at a lower cost, we propose a synthetic video text dataset (VTD-368k) by leveraging the Content Deformation Fields (CoDeF) algorithm. Notably, our method outperforms the state-of-the-art method by an average of 2.6% on six cross-domain benchmarks such as TT-to-IC15, CTW1500-to-TT, and TT-to-CTW1500. For video-level cross-domain adaptation, our method even surpasses the previous end-to-end video spotting method on ICDAR2015 video and DSText v2 by an average of 5.5% on the MOTA metric, using only image-level data. We further demonstrate that existing Large Multimodal Models exhibit limitations in cross-domain scene text spotting, in contrast to our VimTS model, which requires significantly fewer parameters and less data. The code and datasets will be made available at https://VimTextSpotter.github.io.
+
+
+
+
+ + ♻ ☆ Multiple Code Hashing for Efficient Image Retrieval + + +
+ Due to its low storage cost and fast query speed, hashing has been widely +used in large-scale image retrieval tasks. Hash bucket search returns data +points within a given Hamming radius to each query, which can enable search at +a constant or sub-linear time cost. However, existing hashing methods cannot +achieve satisfactory retrieval performance for hash bucket search in complex +scenarios, since they learn only one hash code for each image. More +specifically, by using one hash code to represent one image, existing methods +might fail to put similar image pairs to the buckets with a small Hamming +distance to the query when the semantic information of images is complex. As a +result, a large number of hash buckets need to be visited for retrieving +similar images, based on the learned codes. This will deteriorate the +efficiency of hash bucket search. In this paper, we propose a novel hashing +framework, called multiple code hashing (MCH), to improve the performance of +hash bucket search. The main idea of MCH is to learn multiple hash codes for +each image, with each code representing a different region of the image. +Furthermore, we propose a deep reinforcement learning algorithm to learn the +parameters in MCH. To the best of our knowledge, this is the first work that +proposes to learn multiple hash codes for each image in image retrieval. +Experiments demonstrate that MCH can achieve a significant improvement in hash +bucket search, compared with existing methods that learn only one hash code for +each image. + +
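+
+ The hash-bucket search setting that motivates MCH can be sketched as follows: each image contributes several binary codes to the bucket index, and a query probes all buckets within a small Hamming radius; the toy codes and radius are illustrative only:
+
+     from itertools import combinations
+     from collections import defaultdict
+
+     def build_buckets(codes_per_image):
+         """codes_per_image: {image_id: [binary code tuples]} (multiple codes per image)."""
+         buckets = defaultdict(set)
+         for img_id, codes in codes_per_image.items():
+             for code in codes:
+                 buckets[code].add(img_id)
+         return buckets
+
+     def bucket_search(buckets, query_code, radius=2):
+         """Return images with *any* code within the given Hamming radius of the query."""
+         n_bits = len(query_code)
+         candidates = set(buckets.get(query_code, ()))
+         for r in range(1, radius + 1):
+             for flip in combinations(range(n_bits), r):
+                 probe = list(query_code)
+                 for i in flip:
+                     probe[i] ^= 1                 # flip r bits of the query code
+                 candidates |= buckets.get(tuple(probe), set())
+         return candidates
+
+     # Toy example: image "a" has two codes (one per salient region).
+     db = {"a": [(0, 1, 1, 0), (1, 1, 0, 0)], "b": [(1, 0, 0, 1)]}
+     print(bucket_search(build_buckets(db), (0, 1, 0, 0), radius=1))   # -> {'a'}
+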
+
+ comment: 12 pages, 9 figures, 3 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 51 + +
+
+
+ + ☆ MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial + Representation Learning + + +
+ The volume of unlabelled Earth observation (EO) data is huge, but many +important applications lack labelled training data. However, EO data offers the +unique opportunity to pair data from different modalities and sensors +automatically based on geographic location and time, at virtually no human +labor cost. We seize this opportunity to create a diverse multi-modal +pretraining dataset at global scale. Using this new corpus of 1.2 million +locations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to +learn general-purpose representations for optical satellite images. Our +approach builds on the ConvNeXt V2 architecture, a fully convolutional masked +autoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we +demonstrate that our MP-MAE approach outperforms both MAEs pretrained on +ImageNet and MAEs pretrained on domain-specific satellite images. This is shown +on several downstream tasks including image classification and semantic +segmentation. We find that multi-modal pretraining notably improves the linear +probing performance, e.g. 4pp on BigEarthNet and 16pp on So2Sat, compared to +pretraining on optical satellite images only. We show that this also leads to +better label and parameter efficiency which are crucial aspects in global scale +applications. + +
+
+ comment: Data and code is available on the project page: + https://vishalned.github.io/mmearth +
+
+
+
+
+ + ☆ Beyond Unimodal Learning: The Importance of Integrating Multiple + Modalities for Lifelong Learning + + +
+ While humans excel at continual learning (CL), deep neural networks (DNNs) +exhibit catastrophic forgetting. A salient feature of the brain that allows +effective CL is that it utilizes multiple modalities for learning and +inference, which is underexplored in DNNs. Therefore, we study the role and +interactions of multiple modalities in mitigating forgetting and introduce a +benchmark for multimodal continual learning. Our findings demonstrate that +leveraging multiple views and complementary information from multiple +modalities enables the model to learn more accurate and robust representations. +This makes the model less vulnerable to modality-specific regularities and +considerably mitigates forgetting. Furthermore, we observe that individual +modalities exhibit varying degrees of robustness to distribution shift. +Finally, we propose a method for integrating and aligning the information from +different modalities by utilizing the relational structural similarities +between the data points in each modality. Our method sets a strong baseline +that enables both single- and multimodal inference. Our study provides a +promising case for further exploring the role of multiple modalities in +enabling CL and provides a standard benchmark for future research. + +
+
+ comment: Accepted at 3rd Conference on Lifelong Learning Agents (CoLLAs), 2024 +
+
+
+
+
+ + ☆ TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for + Dynamic UAV-based Scenes IROS2024 + + +
+ In this paper, we present a new approach to bridge the domain gap between synthetic and real-world data for unmanned aerial vehicle (UAV)-based perception. Our formulation is designed for dynamic scenes, consisting of moving objects or human actions, where the goal is to recognize the pose or actions. We propose an extension of K-Planes Neural Radiance Field (NeRF), wherein our algorithm stores a set of tiered feature vectors. The tiered feature vectors are generated to effectively model conceptual information about a scene, together with an image decoder that transforms output feature maps into RGB images. Our technique leverages the information amongst both static and dynamic objects within a scene and is able to capture salient scene attributes of high-altitude videos. We evaluate its performance on challenging datasets, including Okutama Action and UG2, and observe considerable improvement in accuracy over state-of-the-art aerial perception algorithms.
+
+ comment: 8 pages, submitted to IROS2024 +
+
+
+
+
+ + ☆ Deep Image Restoration For Image Anti-Forensics + + +
+ While image forensics is concerned with whether an image has been tampered with, image anti-forensics attempts to prevent image forensics methods from detecting tampered images. The competition between these two fields started long before the advancement of deep learning. JPEG compression, blurring, and noising, which are simple methods by today's standards, have long been used for anti-forensics and have been the subject of much research in both forensics and anti-forensics. Although these traditional methods are old, they make it difficult to detect fake images and are used for data augmentation in training deep image forgery detection models. In addition to making the image difficult to detect, these methods leave traces on the image and consequently degrade the image quality. Separate image forensics methods have also been developed to detect these traces. In this study, we go one step further: we improve the image quality after applying these methods by using deep image restoration models, making the forged image even harder to detect. We evaluate the impact of these methods on image quality. We then test both our proposed deep-learning-based methods and the methods without deep learning against the two best existing image manipulation detection models. In the obtained results, we show how existing image forgery detection models fail against the proposed methods. The code implementation will be publicly available at https://github.com/99eren99/DIRFIAF .
+
+
+
+
+ + ☆ U-DiTs: Downsample Tokens in U-Shaped Diffusion Transformers + + +
+ Diffusion Transformers (DiTs) introduce the transformer architecture to diffusion tasks for latent-space image generation. With an isotropic architecture that chains a series of transformer blocks, DiTs demonstrate competitive performance and good scalability; meanwhile, the abandonment of the U-Net by DiTs and their subsequent improvements is worth rethinking. To this end, we conduct a simple toy experiment comparing a U-Net-architectured DiT with an isotropic one. It turns out that the U-Net architecture only gains a slight advantage from the U-Net inductive bias, indicating potential redundancies within the U-Net-style DiT. Inspired by the discovery that U-Net backbone features are low-frequency-dominated, we perform token downsampling on the query-key-value tuple for self-attention, which brings further improvements despite a considerable reduction in computation. Based on self-attention with downsampled tokens, we propose a series of U-shaped DiTs (U-DiTs) in this paper and conduct extensive experiments to demonstrate the extraordinary performance of U-DiT models. The proposed U-DiT can outperform DiT-XL/2 with only 1/6 of its computation cost. Codes are available at https://github.com/YuchuanTian/U-DiT.
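+
+ One plausible reading of the token-downsampling idea, sketched under assumptions (2x2 average pooling of the token grid before attention, nearest-neighbour upsampling afterwards) rather than as the actual U-DiT block:
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class DownsampledSelfAttention(nn.Module):
+         """Self-attention over an average-pooled token grid (illustrative)."""
+         def __init__(self, dim, num_heads=8, factor=2):
+             super().__init__()
+             self.factor = factor
+             self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+
+         def forward(self, x, h, w):                   # x: (B, h*w, C) token sequence
+             B, _, C = x.shape
+             grid = x.transpose(1, 2).reshape(B, C, h, w)
+             small = F.avg_pool2d(grid, self.factor)    # downsample Q/K/V tokens together
+             tokens = small.flatten(2).transpose(1, 2)
+             out, _ = self.attn(tokens, tokens, tokens)
+             out = out.transpose(1, 2).reshape(B, C, h // self.factor, w // self.factor)
+             out = F.interpolate(out, size=(h, w), mode="nearest")  # restore token count
+             return out.flatten(2).transpose(1, 2)
+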
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ AFter: Attention-based Fusion Router for RGBT Tracking + + +
+ Multi-modal feature fusion, as a core component of RGBT tracking, has given rise to numerous fusion studies in recent years. However, existing RGBT tracking methods widely adopt fixed fusion structures to integrate multi-modal features, which are hard pressed to handle the various challenges of dynamic scenarios. To address this problem, this work presents a novel \emph{A}ttention-based \emph{F}usion rou\emph{ter} called AFter, which optimizes the fusion structure to adapt to dynamic challenging scenarios for robust RGBT tracking. In particular, we design a fusion structure space based on a hierarchical attention network, where each attention-based fusion unit corresponds to a fusion operation and a combination of these attention units corresponds to a fusion structure. By optimizing the combination of attention-based fusion units, we can dynamically select the fusion structure to adapt to various challenging scenarios. Unlike the complex search over different structures in neural architecture search algorithms, we develop a dynamic routing algorithm, which equips each attention-based fusion unit with a router to predict the combination weights, for efficient optimization of the fusion structure. Extensive experiments on five mainstream RGBT tracking datasets demonstrate the superior performance of the proposed AFter against state-of-the-art RGBT trackers. We release the code at https://github.com/Alexadlu/AFter.
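+
+ The routing idea can be sketched as a small module that predicts soft weights over a handful of candidate fusion operations and mixes their outputs; the candidate operations and the pooled router below are illustrative, not the AFter architecture:
+
+     import torch
+     import torch.nn as nn
+
+     class FusionRouter(nn.Module):
+         """Soft routing over a few candidate RGB/thermal fusion operations."""
+         def __init__(self, channels, n_ops=3):
+             super().__init__()
+             self.concat_fuse = nn.Conv2d(2 * channels, channels, kernel_size=1)
+             self.router = nn.Sequential(
+                 nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                 nn.Linear(2 * channels, n_ops), nn.Softmax(dim=-1))
+
+         def forward(self, rgb, tir):                      # (B, C, H, W) each
+             pair = torch.cat([rgb, tir], dim=1)
+             w = self.router(pair)                         # (B, n_ops) routing weights
+             candidates = [self.concat_fuse(pair),         # learned concat fusion
+                           rgb + tir,                      # additive fusion
+                           torch.max(rgb, tir)]            # element-wise max fusion
+             return sum(w[:, i].view(-1, 1, 1, 1) * c for i, c in enumerate(candidates))
+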
+
+ comment: Peer review +
+
+
+
+
+ + ☆ Towards a Scalable Identification of Novel Modes in Generative Models + + +
+ An interpretable comparison of generative models requires the identification +of sample types produced more frequently by each of the involved models. While +several quantitative scores have been proposed in the literature to rank +different generative models, such score-based evaluations do not reveal the +nuanced differences between the generative models in capturing various sample +types. In this work, we propose a method called Fourier-based Identification of +Novel Clusters (FINC) to identify modes produced by a generative model with a +higher frequency in comparison to a reference distribution. FINC provides a +scalable stochastic algorithm based on random Fourier features to estimate the +eigenspace of kernel covariance matrices of two generative models and utilize +the principal eigendirections to detect the sample types present more +dominantly in each model. We demonstrate the application of the FINC method to +standard computer vision datasets and generative model frameworks. Our +numerical results suggest the scalability and efficiency of the developed +Fourier-based method in highlighting the sample types captured with different +frequencies by widely-used generative models. + +
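+
+ The Fourier-feature machinery underlying this kind of comparison can be sketched as follows; the Gaussian-kernel bandwidth, shared random seed, and the covariance-difference heuristic are assumptions for illustration, not the exact FINC estimator:
+
+     import numpy as np
+
+     def rff(x, n_features=512, sigma=1.0, seed=0):
+         """Random Fourier features approximating a Gaussian kernel."""
+         rng = np.random.default_rng(seed)
+         d = x.shape[1]
+         w = rng.normal(scale=1.0 / sigma, size=(d, n_features))
+         b = rng.uniform(0, 2 * np.pi, size=n_features)
+         return np.sqrt(2.0 / n_features) * np.cos(x @ w + b)
+
+     def novel_mode_directions(feats_model, feats_ref, k=5, **kw):
+         """Top eigendirections of the covariance difference between two sample sets."""
+         z_m = rff(feats_model, **kw)          # same seed => shared feature map
+         z_r = rff(feats_ref, **kw)
+         cov_m = z_m.T @ z_m / len(z_m)
+         cov_r = z_r.T @ z_r / len(z_r)
+         evals, evecs = np.linalg.eigh(cov_m - cov_r)   # symmetric difference matrix
+         order = np.argsort(evals)[::-1]                # largest = over-represented by model
+         return evals[order[:k]], evecs[:, order[:k]]
+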
+
+
+
+
+ + ☆ Stable Diffusion Dataset Generation for Downstream Classification Tasks + + +
+ Recent advances in generative artificial intelligence have enabled the +creation of high-quality synthetic data that closely mimics real-world data. +This paper explores the adaptation of the Stable Diffusion 2.0 model for +generating synthetic datasets, using Transfer Learning, Fine-Tuning and +generation parameter optimisation techniques to improve the utility of the +dataset for downstream classification tasks. We present a class-conditional +version of the model that exploits a Class-Encoder and optimisation of key +generation parameters. Our methodology led to synthetic datasets that, in a +third of cases, produced models that outperformed those trained on real +datasets. + +
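+
+ A minimal sketch of class-conditional synthetic data generation with the diffusers library; the checkpoint, prompt template, and sampling parameters are illustrative, and the paper's Class-Encoder and optimised generation settings are not reproduced here:
+
+     import torch
+     from pathlib import Path
+     from diffusers import StableDiffusionPipeline
+
+     pipe = StableDiffusionPipeline.from_pretrained(
+         "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16).to("cuda")
+
+     classes = ["golden retriever", "tabby cat", "red fox"]   # placeholder label set
+     out_dir = Path("synthetic_dataset")
+
+     for label in classes:
+         class_dir = out_dir / label.replace(" ", "_")
+         class_dir.mkdir(parents=True, exist_ok=True)
+         for i in range(100):                                  # images per class
+             image = pipe(f"a photo of a {label}",
+                          num_inference_steps=30, guidance_scale=7.5).images[0]
+             image.save(class_dir / f"{i:04d}.png")
+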
+
+
+
+
+ + ☆ Diffeomorphic Transformer-based Abdomen MRI-CT Deformable Image + Registration + + +
+ This paper aims to create a deep learning framework that can estimate the +deformation vector field (DVF) for directly registering abdominal MRI-CT +images. The proposed method assumed a diffeomorphic deformation. By using +topology-preserved deformation features extracted from the probabilistic +diffeomorphic registration model, abdominal motion can be accurately obtained +and utilized for DVF estimation. The model integrated Swin transformers, which +have demonstrated superior performance in motion tracking, into the +convolutional neural network (CNN) for deformation feature extraction. The +model was optimized using a cross-modality image similarity loss and a surface +matching loss. To compute the image loss, a modality-independent neighborhood +descriptor (MIND) was used between the deformed MRI and CT images. The surface +matching loss was determined by measuring the distance between the warped +coordinates of the surfaces of contoured structures on the MRI and CT images. +The deformed MRI image was assessed against the CT image using the target +registration error (TRE), Dice similarity coefficient (DSC), and mean surface +distance (MSD) between the deformed contours of the MRI image and manual +contours of the CT image. When compared to only rigid registration, DIR with +the proposed method resulted in an increase of the mean DSC values of the liver +and portal vein from 0.850 and 0.628 to 0.903 and 0.763, a decrease of the mean +MSD of the liver from 7.216 mm to 3.232 mm, and a decrease of the TRE from +26.238 mm to 8.492 mm. The proposed deformable image registration method based +on a diffeomorphic transformer provides an effective and efficient way to +generate an accurate DVF from an MRI-CT image pair of the abdomen. It could be +utilized in the current treatment planning workflow for liver radiotherapy. + +
+
+ comment: 18 pages and 4 figures +
+
+
+
+
+ + ☆ Boosting 3D Neuron Segmentation with 2D Vision Transformer Pre-trained + on Natural Images + + +
+ Neuron reconstruction, one of the fundamental tasks in neuroscience, rebuilds neuronal morphology from 3D light microscope imaging data. It plays a critical role in analyzing the structure-function relationship of neurons in the nervous system. However, due to the scarcity of neuron datasets and high-quality SWC annotations, it is still challenging to develop robust segmentation methods for single neuron reconstruction. To address this limitation, we aim to distill the consensus knowledge from massive natural image data to aid the segmentation model in learning the complex neuron structures. Specifically, in this work, we propose a novel training paradigm that leverages a 2D Vision Transformer model pre-trained on large-scale natural images to initialize our Transformer-based 3D neuron segmentation model with a tailored 2D-to-3D weight transferring strategy. Our method builds a knowledge-sharing connection between the abundant natural image domain and the scarce neuron image domain to improve 3D neuron segmentation in a data-efficient manner. Evaluated on a popular benchmark, BigNeuron, our method enhances neuron segmentation performance by 8.71% over the model trained from scratch with the same amount of training samples.
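+
+ The 2D-to-3D weight transfer can be illustrated with the standard inflation trick: replicate a pre-trained 2D kernel along the depth axis and rescale it so activation magnitudes are preserved; the layer choice and depth below are assumptions, not the paper's tailored strategy:
+
+     import torch
+     import torch.nn as nn
+
+     def inflate_conv2d(conv2d: nn.Conv2d, depth: int = 3) -> nn.Conv3d:
+         """Inflate a pre-trained 2D convolution into a 3D one (I3D-style)."""
+         conv3d = nn.Conv3d(conv2d.in_channels, conv2d.out_channels,
+                            kernel_size=(depth, *conv2d.kernel_size),
+                            stride=(1, *conv2d.stride),
+                            padding=(depth // 2, *conv2d.padding),
+                            bias=conv2d.bias is not None)
+         with torch.no_grad():
+             w2d = conv2d.weight                        # (out, in, kH, kW)
+             w3d = w2d.unsqueeze(2).repeat(1, 1, depth, 1, 1) / depth
+             conv3d.weight.copy_(w3d)                   # preserve activation scale
+             if conv2d.bias is not None:
+                 conv3d.bias.copy_(conv2d.bias)
+         return conv3d
+
+     # e.g. turn a ViT-style 2D patch-embedding conv into a 3D one for volumes
+     patch_embed_2d = nn.Conv2d(1, 768, kernel_size=16, stride=16)
+     patch_embed_3d = inflate_conv2d(patch_embed_2d, depth=16)
+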
+
+ comment: 3 pages +
+
+
+
+
+ + ☆ Position Paper: Quo Vadis, Unsupervised Time Series Anomaly Detection? ICML 2024 + + +
+ The current state of machine learning scholarship in Timeseries Anomaly
+Detection (TAD) is plagued by the persistent use of flawed evaluation metrics,
+inconsistent benchmarking practices, and a lack of proper justification for the
+choices made in novel deep learning-based model designs. Our paper presents a
+critical analysis of the status quo in TAD, revealing the misleading track of
+current research and highlighting problematic methods and evaluation practices.
+Our position advocates for a shift in focus from pursuing only the novelty in
+model design to improving benchmarking practices, creating non-trivial
+datasets, and placing renewed emphasis on studying the utility of model
+architectures for specific tasks. Our findings demonstrate the need for
+rigorous evaluation protocols and the creation of simple baselines, and reveal
+that state-of-the-art deep anomaly detection models effectively learn linear
+mappings. These findings suggest the need for more exploration and development
+of simple and interpretable TAD methods. The increased complexity of
+state-of-the-art deep learning-based models unfortunately offers very little
+improvement. We offer insights and suggestions for the field to move forward.
+
+
+ comment: ICML 2024 +
+
+
+
+
+ + ☆ Hand-Object Interaction Controller (HOIC): Deep Reinforcement Learning + for Reconstructing Interactions with Physics SIGGRAPH 2024 + + +
+ Hand-object manipulation is an important interaction motion in our daily
+activities. We faithfully reconstruct this motion with a single RGBD camera
+using a novel deep reinforcement learning method that leverages physics.
+Firstly, we propose object compensation control, which establishes direct
+object control to make the network training more stable. Meanwhile, by
+leveraging the compensation force and torque, we seamlessly upgrade the simple
+point contact model to a more physically plausible surface contact model,
+further improving the reconstruction accuracy and physical correctness.
+Experiments indicate that, without involving any heuristic physical rules, this
+work still successfully incorporates physics into the reconstruction of
+hand-object interactions, which are complex motions that are hard to imitate
+with deep reinforcement learning. Our code and data are available at
+https://github.com/hu-hy17/HOIC.
+
+
+ comment: SIGGRAPH 2024 Conference Track +
+
+
+
+
+ + ☆ Deep Pulse-Signal Magnification for remote Heart Rate Estimation in + Compressed Videos + + +
+ Recent advancements in remote heart rate measurement (rPPG), motivated by
+data-driven approaches, have significantly improved accuracy. However, certain
+challenges, such as video compression, still remain: recovering the rPPG signal
+from highly compressed videos is particularly complex. Although several studies
+have highlighted the difficulties and impact of video compression for this
+task, effective solutions remain limited. In this paper, we present a novel
+approach to address the impact of video compression on rPPG estimation, which
+leverages a pulse-signal magnification transformation to adapt compressed
+videos to an uncompressed data domain in which the rPPG signal is magnified. We
+validate the effectiveness of our model by exhaustive evaluations on two
+publicly available datasets, UCLA-rPPG and UBFC-rPPG, evaluating both intra-
+and cross-database performance at several compression rates. Additionally, we
+assess the robustness of our approach on two additional highly compressed and
+widely-used datasets, MAHNOB-HCI and COHFACE, revealing outstanding heart rate
+estimation results.
+
+
+
+
+
+ + ☆ A Conformal Prediction Score that is Robust to Label Noise + + +
+ Conformal Prediction (CP) quantifies network uncertainty by building a small +prediction set with a pre-defined probability that the correct class is within +this set. In this study we tackle the problem of CP calibration based on a +validation set with noisy labels. We introduce a conformal score that is robust +to label noise. The noise-free conformal score is estimated using the noisy +labeled data and the noise level. In the test phase the noise-free score is +used to form the prediction set. We applied the proposed algorithm to several +standard medical imaging classification datasets. We show that our method +outperforms current methods by a large margin, in terms of the average size of +the prediction set, while maintaining the required coverage. + +
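As background for the noise-robust score described above, a minimal sketch of the standard (noise-free) split conformal prediction baseline that the abstract builds on is given below; the paper's noise-corrected score estimation is not shown, and the softmax outputs are synthetic placeholders:

```python
import numpy as np

def conformal_prediction_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    """Standard split conformal prediction for classification.
    cal_probs: (n, K) softmax outputs on a calibration set
    cal_labels: (n,) integer labels (assumed noise-free in this baseline)
    test_probs: (m, K) softmax outputs on test points
    Returns a boolean (m, K) matrix of prediction sets with ~1-alpha coverage."""
    n = len(cal_labels)
    # Nonconformity score: 1 - probability assigned to the true class.
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]
    # Finite-sample-corrected quantile of the calibration scores.
    q_level = np.ceil((n + 1) * (1 - alpha)) / n
    qhat = np.quantile(scores, min(q_level, 1.0), method="higher")
    return 1.0 - test_probs <= qhat

# Toy usage with random stand-ins for model outputs.
rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(5), size=200)
cal_labels = rng.integers(0, 5, size=200)
test_probs = rng.dirichlet(np.ones(5), size=3)
print(conformal_prediction_sets(cal_probs, cal_labels, test_probs, alpha=0.1))
```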
+
+
+
+
+ + ☆ UnSAMFlow: Unsupervised Optical Flow Guided by Segment Anything Model CVPR 2024 + + +
+ Traditional unsupervised optical flow methods are vulnerable to occlusions +and motion boundaries due to lack of object-level information. Therefore, we +propose UnSAMFlow, an unsupervised flow network that also leverages object +information from the latest foundation model Segment Anything Model (SAM). We +first include a self-supervised semantic augmentation module tailored to SAM +masks. We also analyze the poor gradient landscapes of traditional smoothness +losses and propose a new smoothness definition based on homography instead. A +simple yet effective mask feature module has also been added to further +aggregate features on the object level. With all these adaptations, our method +produces clear optical flow estimation with sharp boundaries around objects, +which outperforms state-of-the-art methods on both KITTI and Sintel datasets. +Our method also generalizes well across domains and runs very efficiently. + +
+
+ comment: Accepted by CVPR 2024. Code is available at + https://github.com/facebookresearch/UnSAMFlow +
+
+
+
+
+ + ☆ Vision-based 3D occupancy prediction in autonomous driving: a review and + outlook + + +
+ In recent years, autonomous driving has garnered escalating attention for its +potential to relieve drivers' burdens and improve driving safety. Vision-based +3D occupancy prediction, which predicts the spatial occupancy status and +semantics of 3D voxel grids around the autonomous vehicle from image inputs, is +an emerging perception task suitable for cost-effective perception system of +autonomous driving. Although numerous studies have demonstrated the greater +advantages of 3D occupancy prediction over object-centric perception tasks, +there is still a lack of a dedicated review focusing on this rapidly developing +field. In this paper, we first introduce the background of vision-based 3D +occupancy prediction and discuss the challenges in this task. Secondly, we +conduct a comprehensive survey of the progress in vision-based 3D occupancy +prediction from three aspects: feature enhancement, deployment friendliness and +label efficiency, and provide an in-depth analysis of the potentials and +challenges of each category of methods. Finally, we present a summary of +prevailing research trends and propose some inspiring future outlooks. To +provide a valuable reference for researchers, a regularly updated collection of +related papers, datasets, and codes is organized at +https://github.com/zya3d/Awesome-3D-Occupancy-Prediction. + +
+
+ comment: 20 pages, 20 figures +
+
+
+
+
+ + ☆ Better YOLO with Attention-Augmented Network and Enhanced Generalization + Performance for Safety Helmet Detection + + +
+ Safety helmets play a crucial role in protecting workers from head injuries
+in construction sites, where potential hazards are prevalent. However, no
+existing approach simultaneously achieves both high accuracy and efficiency in
+complex environments. In this study, we utilized a YOLO-based model for safety
+helmet detection, achieving a 2% improvement in mAP (mean Average Precision)
+while reducing the parameter and FLOPs counts by over 25%. YOLO (You Only Look
+Once) is a widely used, high-performance, lightweight model architecture that
+is well suited for complex environments. We present a novel approach that
+incorporates a lightweight feature extraction backbone based on GhostNetv2,
+integrates attention modules such as the Spatial Channel-wise Attention Net
+(SCNet) and Coordination Attention Net (CANet), and adopts the Gradient Norm
+Aware optimizer (GAM) for improved generalization ability. In safety-critical
+environments, the accurate and fast detection of safety helmets plays a pivotal
+role in preventing occupational hazards and ensuring compliance with safety
+protocols. This work addresses the pressing need for robust and efficient
+helmet detection methods, offering a comprehensive framework that not only
+enhances accuracy but also improves the adaptability of detection models to
+real-world conditions. Our experimental results underscore the synergistic
+effects of GhostNetv2, attention modules, and the GAM optimizer, presenting a
+compelling solution for safety helmet detection that achieves superior
+performance in terms of accuracy, generalization, and efficiency.
+
+
+
+
+
+ + ☆ Generalizing CLIP to Unseen Domain via Text-Guided Diverse Novel Feature + Synthesis + + +
+ Vision-language foundation models like CLIP have shown impressive zero-shot
+generalization, but fine-tuning on downstream datasets can cause overfitting
+and loss of their generalization ability on unseen domains. Although collecting
+additional data from new domains of interest is possible, this method is often
+impractical due to the challenges in obtaining annotated data. To address this,
+we propose a plug-and-play feature augmentation method called LDFS
+(Language-Guided Diverse Feature Synthesis) to synthesize new domain features
+and improve existing CLIP fine-tuning strategies. LDFS has three main
+contributions: 1) To synthesize novel domain features and promote diversity, we
+propose an instance-conditional feature augmentation strategy based on a
+text-guided feature augmentation loss. 2) To maintain feature quality after
+augmentation, we introduce a pairwise regularizer to preserve augmented feature
+coherence within the CLIP feature space. 3) We propose to use stochastic text
+feature augmentation to reduce the modality gap and further facilitate the
+process of text-guided feature synthesis. Extensive experiments show the
+superiority of LDFS in improving CLIP's generalization ability on unseen
+domains without collecting data from those domains. The code will be made
+publicly available.
+
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Stationary Representations: Optimally Approximating Compatibility and + Implications for Improved Model Replacements CVPR24 + + +
+ Learning compatible representations enables the interchangeable use of +semantic features as models are updated over time. This is particularly +relevant in search and retrieval systems where it is crucial to avoid +reprocessing of the gallery images with the updated model. While recent +research has shown promising empirical evidence, there is still a lack of +comprehensive theoretical understanding about learning compatible +representations. In this paper, we demonstrate that the stationary +representations learned by the $d$-Simplex fixed classifier optimally +approximate compatibility representation according to the two inequality +constraints of its formal definition. This not only establishes a solid +foundation for future works in this line of research but also presents +implications that can be exploited in practical learning scenarios. An +exemplary application is the now-standard practice of downloading and +fine-tuning new pre-trained models. Specifically, we show the strengths and +critical issues of stationary representations in the case in which a model +undergoing sequential fine-tuning is asynchronously replaced by downloading a +better-performing model pre-trained elsewhere. Such a representation enables +seamless delivery of retrieval service (i.e., no reprocessing of gallery +images) and offers improved performance without operational disruptions during +model replacement. Code available at: https://github.com/miccunifi/iamcl2r. + +
+
+ comment: Accepted at CVPR24 as Poster Highlight +
+
+
+
+
+ + ☆ ViTALS: Vision Transformer for Action Localization in Surgical + Nephrectomy + + +
+ Surgical action localization is a challenging computer vision problem. While +it has promising applications including automated training of surgery +procedures, surgical workflow optimization, etc., appropriate model design is +pivotal to accomplishing this task. Moreover, the lack of suitable medical +datasets adds an additional layer of complexity. To that effect, we introduce a +new complex dataset of nephrectomy surgeries called UroSlice. To perform the +action localization from these videos, we propose a novel model termed as +`ViTALS' (Vision Transformer for Action Localization in Surgical Nephrectomy). +Our model incorporates hierarchical dilated temporal convolution layers and +inter-layer residual connections to capture the temporal correlations at finer +as well as coarser granularities. The proposed approach achieves +state-of-the-art performance on Cholec80 and UroSlice datasets (89.8% and 66.1% +accuracy, respectively), validating its effectiveness. + +
+
+ comment: Nephrectomy surgery, Surgical Phase Recognition, Surgical Workflow + Segmentation, 11 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ ActiveNeuS: Active 3D Reconstruction using Neural Implicit Surface + Uncertainty + + +
+ Active learning in 3D scene reconstruction has been widely studied, as +selecting informative training views is critical for the reconstruction. +Recently, Neural Radiance Fields (NeRF) variants have shown performance +increases in active 3D reconstruction using image rendering or geometric +uncertainty. However, the simultaneous consideration of both uncertainties in +selecting informative views remains unexplored, while utilizing different types +of uncertainty can reduce the bias that arises in the early training stage with +sparse inputs. In this paper, we propose ActiveNeuS, which evaluates candidate +views considering both uncertainties. ActiveNeuS provides a way to accumulate +image rendering uncertainty while avoiding the bias that the estimated +densities can introduce. ActiveNeuS computes the neural implicit surface +uncertainty, providing the color uncertainty along with the surface +information. It efficiently handles the bias by using the surface information +and a grid, enabling the fast selection of diverse viewpoints. Our method +outperforms previous works on popular datasets, Blender and DTU, showing that +the views selected by ActiveNeuS significantly improve performance. + +
+
+
+
+
+ + ☆ Leveraging the Human Ventral Visual Stream to Improve Neural Network + Robustness + + +
+ Human object recognition exhibits remarkable resilience in cluttered and +dynamic visual environments. In contrast, despite their unparalleled +performance across numerous visual tasks, Deep Neural Networks (DNNs) remain +far less robust than humans, showing, for example, a surprising susceptibility +to adversarial attacks involving image perturbations that are (almost) +imperceptible to humans. Human object recognition likely owes its robustness, +in part, to the increasingly resilient representations that emerge along the +hierarchy of the ventral visual cortex. Here we show that DNNs, when guided by +neural representations from a hierarchical sequence of regions in the human +ventral visual stream, display increasing robustness to adversarial attacks. +These neural-guided models also exhibit a gradual shift towards more human-like +decision-making patterns and develop hierarchically smoother decision surfaces. +Importantly, the resulting representational spaces differ in important ways +from those produced by conventional smoothing methods, suggesting that such +neural-guidance may provide previously unexplored robustness solutions. Our +findings support the gradual emergence of human robustness along the ventral +visual hierarchy and suggest that the key to DNN robustness may lie in +increasing emulation of the human brain. + +
+
+
+
+
+ + ☆ Few-Shot Fruit Segmentation via Transfer Learning ICRA + + +
+ Advancements in machine learning, computer vision, and robotics have paved +the way for transformative solutions in various domains, particularly in +agriculture. For example, accurate identification and segmentation of fruits +from field images plays a crucial role in automating jobs such as harvesting, +disease detection, and yield estimation. However, achieving robust and precise +infield fruit segmentation remains a challenging task since large amounts of +labeled data are required to handle variations in fruit size, shape, color, and +occlusion. In this paper, we develop a few-shot semantic segmentation framework +for infield fruits using transfer learning. Concretely, our work is aimed at +addressing agricultural domains that lack publicly available labeled data. +Motivated by similar success in urban scene parsing, we propose specialized +pre-training using a public benchmark dataset for fruit transfer learning. By +leveraging pre-trained neural networks, accurate semantic segmentation of fruit +in the field is achieved with only a few labeled images. Furthermore, we show +that models with pre-training learn to distinguish between fruit still on the +trees and fruit that have fallen on the ground, and they can effectively +transfer the knowledge to the target fruit dataset. + +
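A minimal sketch of the transfer-learning recipe the abstract describes, assuming a COCO/VOC-pretrained DeepLabV3 from torchvision as a stand-in for the paper's specialized pre-training, and a hypothetical three-class fruit labeling (background, fruit on the tree, fallen fruit):

```python
import torch
import torch.nn as nn
from torchvision.models.segmentation import (
    deeplabv3_resnet50, DeepLabV3_ResNet50_Weights)

# Start from a publicly pre-trained segmentation network (downloads weights).
model = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.DEFAULT)
num_classes = 3  # hypothetical: background, fruit-on-tree, fallen-fruit
model.classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)
if model.aux_classifier is not None:
    model.aux_classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def finetune_step(images, masks):
    """One gradient step on a handful of labeled field images."""
    model.train()
    out = model(images)["out"]       # (N, num_classes, H, W)
    loss = criterion(out, masks)     # masks: (N, H, W) integer labels
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Few-shot usage: a tiny random batch stands in for the annotated images.
images = torch.randn(2, 3, 256, 256)
masks = torch.randint(0, num_classes, (2, 256, 256))
print(finetune_step(images, masks))
```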
+
+ comment: To be published in the 2024 IEEE International Conference on Robotics + and Automation (ICRA) +
+
+
+
+
+ + ☆ AdaFPP: Adapt-Focused Bi-Propagating Prototype Learning for Panoramic + Activity Recognition + + +
+ Panoramic Activity Recognition (PAR) aims to identify multi-granularity
+behaviors performed by multiple persons in panoramic scenes, including
+individual activities, group activities, and global activities. Previous
+methods 1) heavily rely on manually annotated detection boxes in training and
+inference, hindering further practical deployment; or 2) directly employ normal
+detectors to detect multiple persons with varying size and spatial occlusion in
+panoramic scenes, limiting the performance gain of PAR. To this end, we
+consider learning a detector that adapts to varying-size, occluded persons,
+which is optimized along with the recognition module in an all-in-one
+framework. Therefore, we propose a novel Adapt-Focused bi-Propagating Prototype
+learning (AdaFPP) framework to jointly recognize individual, group, and global
+activities in panoramic activity scenes by learning an adapt-focused detector
+and multi-granularity prototypes as the pretext tasks in an end-to-end way.
+Specifically, to accommodate the varying sizes and spatial occlusion of
+multiple persons in crowded panoramic scenes, we introduce a panoramic
+adapt-focuser, achieving the size-adapting detection of individuals by
+comprehensively selecting and performing fine-grained detections on
+object-dense sub-regions identified through original detections. In addition,
+to mitigate information loss due to inaccurate individual localizations, we
+introduce a bi-propagation prototyper that promotes closed-loop interaction and
+informative consistency across different granularities by facilitating
+bidirectional information propagation among the individual, group, and global
+levels. Extensive experiments demonstrate the significant performance of AdaFPP
+and emphasize its powerful applicability for PAR.
+
+
+
+
+
+ + ☆ Iterative Filter Pruning for Concatenation-based CNN Architectures IJCNN 2024 + + +
+ Model compression and hardware acceleration are essential for the +resource-efficient deployment of deep neural networks. Modern object detectors +have highly interconnected convolutional layers with concatenations. In this +work, we study how pruning can be applied to such architectures, exemplary for +YOLOv7. We propose a method to handle concatenation layers, based on the +connectivity graph of convolutional layers. By automating iterative sensitivity +analysis, pruning, and subsequent model fine-tuning, we can significantly +reduce model size both in terms of the number of parameters and FLOPs, while +keeping comparable model accuracy. Finally, we deploy pruned models to FPGA and +NVIDIA Jetson Xavier AGX. Pruned models demonstrate a 2x speedup for the +convolutional layers in comparison to the unpruned counterparts and reach +real-time capability with 14 FPS on FPGA. Our code is available at +https://github.com/fzi-forschungszentrum-informatik/iterative-yolo-pruning. + +
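A minimal sketch of the iterative prune-and-fine-tune loop, using PyTorch's built-in L1 structured pruning; the paper's key ingredient, propagating pruning decisions through concatenation layers via the connectivity graph, is omitted here, and `finetune_fn`/`evaluate_fn` are user-supplied placeholders:

```python
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

def prune_conv_filters(model: nn.Module, amount: float):
    """L1 structured pruning of output filters for every Conv2d layer.
    Note: filters are only zeroed; physically shrinking the layers (and the
    concatenations that consume them) is what the paper's connectivity-graph
    step handles."""
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            prune.ln_structured(module, name="weight", amount=amount, n=1, dim=0)
            prune.remove(module, "weight")  # make the zeroed filters permanent

def iterative_prune(model, finetune_fn, evaluate_fn, step=0.1, rounds=3):
    """Alternate pruning (with a growing sparsity target) and fine-tuning."""
    history = []
    for r in range(rounds):
        prune_conv_filters(model, amount=step * (r + 1))
        finetune_fn(model)                 # user-supplied fine-tuning loop
        history.append((r, evaluate_fn(model)))
    return history

# Toy usage with stand-in fine-tune and evaluation hooks.
model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 32, 3))

def count_zero_filters(m):
    return int((m[0].weight.abs().sum(dim=(1, 2, 3)) == 0).sum())

print(iterative_prune(model, finetune_fn=lambda m: None,
                      evaluate_fn=count_zero_filters))
```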
+
+ comment: Accepted for publication at IJCNN 2024 +
+
+
+
+
+ + ☆ Improve Cross-Modality Segmentation by Treating MRI Images as Inverted + CT Scans + + +
+ Computed tomography (CT) segmentation models frequently include classes that +are not currently supported by magnetic resonance imaging (MRI) segmentation +models. In this study, we show that a simple image inversion technique can +significantly improve the segmentation quality of CT segmentation models on MRI +data, by using the TotalSegmentator model, applied to T1-weighted MRI images, +as example. Image inversion is straightforward to implement and does not +require dedicated graphics processing units (GPUs), thus providing a quick +alternative to complex deep modality-transfer models for generating +segmentation masks for MRI data. + +
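A minimal sketch of such an inversion step, assuming simple percentile-based min-max normalisation (the abstract does not specify the exact normalisation used before running TotalSegmentator):

```python
import numpy as np

def invert_mri(mri: np.ndarray) -> np.ndarray:
    """Invert a T1-weighted MRI volume so its contrast roughly resembles CT,
    as a cheap preprocessing step before a CT segmentation model.
    Intensities are robustly min-max normalised and flipped; no GPU needed."""
    mri = mri.astype(np.float32)
    lo, hi = np.percentile(mri, [1, 99])           # robust normalisation bounds
    norm = np.clip((mri - lo) / max(hi - lo, 1e-6), 0.0, 1.0)
    return 1.0 - norm                              # bright soft tissue -> dark

# Toy volume standing in for a T1w scan.
volume = np.random.rand(32, 128, 128).astype(np.float32)
inverted = invert_mri(volume)
print(inverted.min(), inverted.max())
```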
+
+ comment: 3 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Gradient-based Parameter Selection for Efficient Fine-Tuning + + +
+ With the growing size of pre-trained models, full fine-tuning and storing all +the parameters for various downstream tasks is costly and infeasible. In this +paper, we propose a new parameter-efficient fine-tuning method, Gradient-based +Parameter Selection (GPS), demonstrating that only tuning a few selected +parameters from the pre-trained model while keeping the remainder of the model +frozen can generate similar or better performance compared with the full model +fine-tuning method. Different from the existing popular and state-of-the-art +parameter-efficient fine-tuning approaches, our method does not introduce any +additional parameters and computational costs during both the training and +inference stages. Another advantage is the model-agnostic and non-destructive +property, which eliminates the need for any other design specific to a +particular model. Compared with the full fine-tuning, GPS achieves 3.33% +(91.78% vs. 88.45%, FGVC) and 9.61% (73.1% vs. 65.57%, VTAB) improvement of the +accuracy with tuning only 0.36% parameters of the pre-trained model on average +over 24 image classification tasks; it also demonstrates a significant +improvement of 17% and 16.8% in mDice and mIoU, respectively, on medical image +segmentation task. Moreover, GPS achieves state-of-the-art performance compared +with existing PEFT methods. + +
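A simplified sketch of gradient-based parameter selection, scoring each scalar parameter by accumulated gradient magnitude and masking updates for the rest; the paper's exact selection criterion may differ, and the toy model and loader below are placeholders:

```python
import torch
import torch.nn as nn

def select_parameters(model, loss_fn, data_loader, keep_ratio=0.0036):
    """Score every scalar parameter by the magnitude of its accumulated
    gradient over a few batches, then keep only the top `keep_ratio`
    fraction (roughly the 0.36% figure quoted in the abstract)."""
    model.zero_grad()
    for x, y in data_loader:
        loss_fn(model(x), y).backward()
    grads = {n: (p.grad.abs() if p.grad is not None else torch.zeros_like(p))
             for n, p in model.named_parameters()}
    scores = torch.cat([g.flatten() for g in grads.values()])
    k = max(1, int(keep_ratio * scores.numel()))
    threshold = torch.topk(scores, k).values.min()
    masks = {n: (g >= threshold).float() for n, g in grads.items()}
    model.zero_grad()
    return masks

def masked_step(model, masks, optimizer):
    """One optimizer step that only updates the selected parameters."""
    for n, p in model.named_parameters():
        if p.grad is not None:
            p.grad.mul_(masks[n])
    optimizer.step()
    optimizer.zero_grad()

# Toy usage on a linear probe; real use would target a large pre-trained model.
model = nn.Linear(16, 4)
loader = [(torch.randn(8, 16), torch.randint(0, 4, (8,)))]
masks = select_parameters(model, nn.CrossEntropyLoss(), loader, keep_ratio=0.1)
```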
+
+
+
+
+ + ♻ ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected
+during visual stimulation has made strong progress in the past decade, thanks
+to the availability of extensive fMRI datasets and advancements in generative
+models for image generation. However, the application of visual reconstruction
+has remained limited. Reconstructing visual imagination presents a greater
+challenge, with potentially revolutionary applications ranging from aiding
+individuals with disabilities to verifying witness accounts in court. The
+primary hurdles in this field are the absence of data collection protocols for
+visual imagery and the lack of datasets on the subject. Traditionally,
+fMRI-to-image relies on data collected from subjects exposed to visual stimuli,
+which is problematic for generating visual imagery, given the differences in
+brain activity between visual stimulation and visual imagery. For the first
+time, we have compiled a substantial dataset (around 6h of scans) on visual
+imagery along with a proposed data collection protocol. We then train a
+modified version of an fMRI-to-image model and demonstrate the feasibility of
+reconstructing images from two modes of imagination: from memory and from pure
+imagination. The resulting pipeline, which we call Mind-to-Image, marks a step
+towards creating a technology that allows direct reconstruction of visual
+imagery.
+
+
+ comment: Pre-print to be updated. Work in progress +
+
+
+
+
+ + ♻ ☆ VR-GS: A Physical Dynamics-Aware Interactive Gaussian Splatting System + in Virtual Reality + + +
+ As consumer Virtual Reality (VR) and Mixed Reality (MR) technologies gain
+momentum, there is a growing focus on the development of engagements with 3D
+virtual content. Unfortunately, traditional techniques for content creation,
+editing, and interaction within these virtual spaces are fraught with
+difficulties. They are not only engineering-intensive but also require
+extensive expertise, which adds to the frustration and inefficiency in virtual
+object manipulation. Our proposed VR-GS system represents a leap forward in
+human-centered 3D content interaction, offering a seamless and intuitive user
+experience. By developing a physical dynamics-aware interactive Gaussian
+Splatting in a Virtual Reality setting, and constructing a highly efficient
+two-level embedding strategy alongside deformable body simulations, VR-GS
+ensures real-time execution with highly realistic dynamic responses. The
+components of our Virtual Reality system are designed for high efficiency and
+effectiveness, starting from detailed scene reconstruction and object
+segmentation, advancing through multi-view image in-painting, and extending to
+interactive physics-based editing. The system also incorporates real-time
+deformation embedding and dynamic shadow casting, ensuring a comprehensive and
+engaging virtual experience. Our project page is available at:
+https://yingjiang96.github.io/VR-GS/.
+
+
+
+
+
+ + ♻ ☆ LLM-grounded Video Diffusion Models ICLR 2024 + + +
+ Text-conditioned diffusion models have emerged as a promising tool for neural +video generation. However, current models still struggle with intricate +spatiotemporal prompts and often generate restricted or incorrect motion. To +address these limitations, we introduce LLM-grounded Video Diffusion (LVD). +Instead of directly generating videos from the text inputs, LVD first leverages +a large language model (LLM) to generate dynamic scene layouts based on the +text inputs and subsequently uses the generated layouts to guide a diffusion +model for video generation. We show that LLMs are able to understand complex +spatiotemporal dynamics from text alone and generate layouts that align closely +with both the prompts and the object motion patterns typically observed in the +real world. We then propose to guide video diffusion models with these layouts +by adjusting the attention maps. Our approach is training-free and can be +integrated into any video diffusion model that admits classifier guidance. Our +results demonstrate that LVD significantly outperforms its base video diffusion +model and several strong baseline methods in faithfully generating videos with +the desired attributes and motion patterns. + +
+
+ comment: ICLR 2024. Project Page: + https://llm-grounded-video-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ SWAP-NAS: sample-wise activation patterns for ultra-fast NAS ICLR2024 + + +
+ Training-free metrics (a.k.a. zero-cost proxies) are widely used to avoid +resource-intensive neural network training, especially in Neural Architecture +Search (NAS). Recent studies show that existing training-free metrics have +several limitations, such as limited correlation and poor generalisation across +different search spaces and tasks. Hence, we propose Sample-Wise Activation +Patterns and its derivative, SWAP-Score, a novel high-performance training-free +metric. It measures the expressivity of networks over a batch of input samples. +The SWAP-Score is strongly correlated with ground-truth performance across +various search spaces and tasks, outperforming 15 existing training-free +metrics on NAS-Bench-101/201/301 and TransNAS-Bench-101. The SWAP-Score can be +further enhanced by regularisation, which leads to even higher correlations in +cell-based search space and enables model size control during the search. For +example, Spearman's rank correlation coefficient between regularised SWAP-Score +and CIFAR-100 validation accuracies on NAS-Bench-201 networks is 0.90, +significantly higher than 0.80 from the second-best metric, NWOT. When +integrated with an evolutionary algorithm for NAS, our SWAP-NAS achieves +competitive performance on CIFAR-10 and ImageNet in approximately 6 minutes and +9 minutes of GPU time respectively. + +
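The reported rank correlation can be reproduced in spirit with SciPy's Spearman implementation; the scores and accuracies below are made-up placeholders, not values from the paper:

```python
import numpy as np
from scipy.stats import spearmanr

# Hypothetical numbers: a training-free score and the measured validation
# accuracy for a handful of sampled architectures.
swap_scores = np.array([12.1, 45.3, 30.2, 58.7, 22.4, 49.9])
val_accuracy = np.array([61.2, 72.8, 68.0, 74.5, 65.1, 73.9])

rho, pvalue = spearmanr(swap_scores, val_accuracy)
print(f"Spearman rho = {rho:.3f} (p = {pvalue:.3g})")
```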
+
+ comment: ICLR2024 Spotlight +
+
+
+
+
+ + ♻ ☆ Detours for Navigating Instructional Videos CVPR 2024 + + +
+ We introduce the video detours problem for navigating instructional videos. +Given a source video and a natural language query asking to alter the how-to +video's current path of execution in a certain way, the goal is to find a +related ''detour video'' that satisfies the requested alteration. To address +this challenge, we propose VidDetours, a novel video-language approach that +learns to retrieve the targeted temporal segments from a large repository of +how-to's using video-and-text conditioned queries. Furthermore, we devise a +language-based pipeline that exploits how-to video narration text to create +weakly supervised training data. We demonstrate our idea applied to the domain +of how-to cooking videos, where a user can detour from their current recipe to +find steps with alternate ingredients, tools, and techniques. Validating on a +ground truth annotated dataset of 16K samples, we show our model's significant +improvements over best available methods for video retrieval and question +answering, with recall rates exceeding the state of the art by 35%. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Offline Tracking with Object Permanence + + +
+ To reduce the expensive labor cost of manually labeling autonomous driving
+datasets, an alternative is to automatically label the datasets using an
+offline perception system. However, objects might be temporally occluded. Such
+occlusion scenarios in the datasets are common yet underexplored in offline
+auto labeling. In this work, we propose an offline tracking model that focuses
+on occluded object tracks. It leverages the concept of object permanence, which
+means that objects continue to exist even when they are no longer observed. The
+model contains three parts: a standard online tracker, a re-identification
+(Re-ID) module that associates tracklets before and after occlusion, and a
+track completion module that completes the fragmented tracks. The Re-ID module
+and the track completion module use the vectorized map as one of the inputs to
+refine the tracking results under occlusion. The model can effectively recover
+the occluded object trajectories. It achieves state-of-the-art performance in
+3D multi-object tracking by significantly improving the original online
+tracking result, showing its potential to be applied in offline auto labeling
+as a useful plugin to improve tracking by recovering occluded tracks.
+
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024). Camera + ready version with supplementary material +
+
+
+
+
+ + ♻ ☆ UNETR++: Delving into Efficient and Accurate 3D Medical Image + Segmentation + + +
+ Owing to the success of transformer models, recent works study their
+applicability in 3D medical segmentation tasks. Within the transformer models,
+the self-attention mechanism is one of the main building blocks that strives to
+capture long-range dependencies. However, the self-attention operation has
+quadratic complexity, which proves to be a computational bottleneck, especially
+in volumetric medical imaging, where the inputs are 3D with numerous slices. In
+this paper, we propose a 3D medical image segmentation approach, named UNETR++,
+that offers both high-quality segmentation masks as well as efficiency in terms
+of parameters, compute cost, and inference speed. The core of our design is the
+introduction of a novel efficient paired attention (EPA) block that efficiently
+learns spatial and channel-wise discriminative features using a pair of
+inter-dependent branches based on spatial and channel attention. Our spatial
+attention formulation is efficient, having linear complexity with respect to
+the input sequence length. To enable communication between spatial and
+channel-focused branches, we share the weights of query and key mapping
+functions that provide a complementary benefit (paired attention), while also
+reducing the overall network parameters. Our extensive evaluations on five
+benchmarks, Synapse, BTCV, ACDC, BRaTs, and Decathlon-Lung, reveal the
+effectiveness of our contributions in terms of both efficiency and accuracy. On
+Synapse, our UNETR++ sets a new state-of-the-art with a Dice Score of 87.2%,
+while being significantly efficient with a reduction of over 71% in terms of
+both parameters and FLOPs, compared to the best method in the literature. Code:
+https://github.com/Amshaker/unetr_plus_plus.
+
+
+ comment: Accepted at IEEE TMI-2024 +
+
+
+
+
+ + ♻ ☆ HandSSCA: 3D Hand Mesh Reconstruction with State Space Channel Attention + from RGB images + + +
+ Reconstructing a hand mesh from a single RGB image is a challenging task
+because hands are often occluded by objects. Most previous works attempted to
+introduce additional information and adopt attention mechanisms to improve 3D
+reconstruction results, but this increases computational complexity. This
+observation prompts us to propose a new, concise architecture with improved
+computational efficiency. In this work, we propose a simple and effective 3D
+hand mesh reconstruction network, HandSSCA, which is the first to incorporate
+state space modeling into the field of hand pose estimation. In the network, we
+have designed a novel state space channel attention module that extends the
+effective receptive field, extracts hand features in the spatial dimension, and
+enhances hand regional features in the channel dimension. This design helps to
+reconstruct a complete and detailed hand mesh. Extensive experiments conducted
+on well-known datasets featuring challenging hand-object occlusions (such as
+FREIHAND, DEXYCB, and HO3D) demonstrate that our proposed HandSSCA achieves
+state-of-the-art performance while maintaining a minimal parameter count.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Robust affine point matching via quadratic assignment on Grassmannians + + +
+ Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform +affine registration of point clouds. The algorithm is based on minimizing the +Frobenius distance between two elements of the Grassmannian. For this purpose, +an indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and +several approaches to affine feature matching are studied and compared. +Experiments demonstrate that RAG is more robust to noise and point discrepancy +than previous methods. + +
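One standard way to realize the Grassmannian representation of a point cloud (used, e.g., by the GrassGraph baseline mentioned in the comments) is the orthogonal projector onto the column space of the homogeneous point matrix, which is unchanged by invertible affine maps but not by point reordering; the sketch below computes that projector and the Frobenius (chordal) distance the abstract refers to, leaving out the QAP matching step:

```python
import numpy as np

def grassmann_projector(points: np.ndarray) -> np.ndarray:
    """Orthogonal projector onto the column space of [X | 1]. Invertible
    affine maps of the points leave this subspace (hence the projector)
    unchanged, but reordering the points does not -- which is what the
    quadratic assignment step in the paper resolves."""
    n = points.shape[0]
    m = np.hstack([points, np.ones((n, 1))])
    q, _ = np.linalg.qr(m)                 # orthonormal basis of col(m)
    return q @ q.T                         # n x n projection matrix

def chordal_distance(p1: np.ndarray, p2: np.ndarray) -> float:
    """Frobenius distance between two Grassmannian elements (projectors)."""
    return float(np.linalg.norm(p1 - p2, "fro"))

# Same cloud under a random invertible affine map, same point ordering:
rng = np.random.default_rng(1)
x = rng.normal(size=(50, 2))
a, b = rng.normal(size=(2, 2)) + 2 * np.eye(2), rng.normal(size=2)
y = x @ a.T + b
print(chordal_distance(grassmann_projector(x), grassmann_projector(y)))  # ~0
```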
+
+ comment: 8 pages, 23 figures; GitHub repository at + (https://github.com/sashakolpakov/rag); Section IV: added comparison to + GrassGraph (https://doi.org/10.1109/TIP.2019.2959722); notably, GrassGraph + quickly loses accuracy on our test examples with noise and occlusion +
+
+
+
+
+ + ♻ ☆ Towards Real-world Video Face Restoration: A New Benchmark + + +
+ Blind face restoration (BFR) on images has progressed significantly over the
+last several years, while real-world video face restoration (VFR), which is
+more challenging due to the more complex face motions involved, such as
+changing gaze directions and facial orientations, remains unsolved. Typical BFR
+methods are evaluated on privately synthesized datasets or self-collected
+real-world low-quality face images, which are limited in their coverage of
+real-world video frames. In this work, we introduced new real-world datasets
+named FOS, with a taxonomy of "Full, Occluded, and Side" faces drawn mainly
+from video frames, to study the applicability of current methods to videos.
+Compared with existing test datasets, FOS datasets cover more diverse
+degradations and involve face samples from more complex scenarios, which helps
+to revisit current face restoration approaches more comprehensively. Given the
+established datasets, we benchmarked both the state-of-the-art BFR methods and
+video super-resolution (VSR) methods to comprehensively study current
+approaches, identifying their potential and limitations in VFR tasks. In
+addition, we studied the effectiveness of the commonly used image quality
+assessment (IQA) metrics and face IQA (FIQA) metrics by leveraging a subjective
+user study. With extensive experimental results and detailed analysis provided,
+we gained insights from the successes and failures of both current BFR and VSR
+methods. These results also pose challenges to current face restoration
+approaches, which we hope will stimulate future advances in VFR research.
+
+
+ comment: Project page: https://ziyannchen.github.io/projects/VFRxBenchmark/ +
+
+
+
+
+ + ♻ ☆ Video-based Sequential Bayesian Homography Estimation for Soccer Field + Registration + + +
+ A novel Bayesian framework is proposed, which explicitly relates the +homography of one video frame to the next through an affine transformation +while explicitly modelling keypoint uncertainty. The literature has previously +used differential homography between subsequent frames, but not in a Bayesian +setting. In cases where Bayesian methods have been applied, camera motion is +not adequately modelled, and keypoints are treated as deterministic. The +proposed method, Bayesian Homography Inference from Tracked Keypoints (BHITK), +employs a two-stage Kalman filter and significantly improves existing methods. +Existing keypoint detection methods may be easily augmented with BHITK. It +enables less sophisticated and less computationally expensive methods to +outperform the state-of-the-art approaches in most homography evaluation +metrics. Furthermore, the homography annotations of the WorldCup and +TS-WorldCup datasets have been refined using a custom homography annotation +tool that has been released for public use. The refined datasets are +consolidated and released as the consolidated and refined WorldCup (CARWC) +dataset. + +
+
+ comment: Accepted to Expert Systems with Applications +
+
+
+
+
+ + ♻ ☆ LidaRF: Delving into Lidar for Neural Radiance Field on Street Scenes CVPR2024 + + +
+ Photorealistic simulation plays a crucial role in applications such as +autonomous driving, where advances in neural radiance fields (NeRFs) may allow +better scalability through the automatic creation of digital 3D assets. +However, reconstruction quality suffers on street scenes due to largely +collinear camera motions and sparser samplings at higher speeds. On the other +hand, the application often demands rendering from camera views that deviate +from the inputs to accurately simulate behaviors like lane changes. In this +paper, we propose several insights that allow a better utilization of Lidar +data to improve NeRF quality on street scenes. First, our framework learns a +geometric scene representation from Lidar, which is fused with the implicit +grid-based representation for radiance decoding, thereby supplying stronger +geometric information offered by explicit point cloud. Second, we put forth a +robust occlusion-aware depth supervision scheme, which allows utilizing +densified Lidar points by accumulation. Third, we generate augmented training +views from Lidar points for further improvement. Our insights translate to +largely improved novel view synthesis under real driving scenes. + +
+
+ comment: CVPR2024 Highlights +
+
+
+
+
+ + ♻ ☆ SUNY: A Visual Interpretation Framework for Convolutional Neural + Networks from a Necessary and Sufficient Perspective CVPR + + +
+ Researchers have proposed various methods for visually interpreting the +Convolutional Neural Network (CNN) via saliency maps, which include +Class-Activation-Map (CAM) based approaches as a leading family. However, in +terms of the internal design logic, existing CAM-based approaches often +overlook the causal perspective that answers the core "why" question to help +humans understand the explanation. Additionally, current CNN explanations lack +the consideration of both necessity and sufficiency, two complementary sides of +a desirable explanation. This paper presents a causality-driven framework, +SUNY, designed to rationalize the explanations toward better human +understanding. Using the CNN model's input features or internal filters as +hypothetical causes, SUNY generates explanations by bi-directional +quantifications on both the necessary and sufficient perspectives. Extensive +evaluations justify that SUNY not only produces more informative and convincing +explanations from the angles of necessity and sufficiency, but also achieves +performances competitive to other approaches across different CNN architectures +over large-scale datasets, including ILSVRC2012 and CUB-200-2011. + +
+
+ comment: CVPRw 2024 +
+
+
+
+
+ + ♻ ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs)
+that focuses on core visual perception abilities not found in other
+evaluations. Most of the Blink tasks can be solved by humans "within a blink"
+(e.g., relative depth estimation, visual correspondence, forensics detection,
+and multi-view reasoning). However, we find that these perception-demanding
+tasks pose significant challenges for current multimodal LLMs because they
+resist mediation through natural language. Blink reformats 14 classic computer
+vision tasks into 3,807 multiple-choice questions, paired with single or
+multiple images and visual prompting. While humans get 95.70% accuracy on
+average, Blink is surprisingly challenging for existing multimodal LLMs: even
+the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%,
+only 13.17% and 7.63% higher than random guessing, indicating that such
+perception abilities have not "emerged" yet in recent multimodal LLMs. Our
+analysis also highlights that specialist CV models could solve these problems
+much better, suggesting potential pathways for future improvements. We believe
+Blink will stimulate the community to help multimodal LLMs catch up with
+human-level visual perception.
+
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/ +
+
+
+
+
+ + ♻ ☆ AttributionScanner: A Visual Analytics System for Model Validation with + Metadata-Free Slice Finding + + +
+ Data slice finding is an emerging technique for validating machine learning +(ML) models by identifying and analyzing subgroups in a dataset that exhibit +poor performance, often characterized by distinct feature sets or descriptive +metadata. However, in the context of validating vision models involving +unstructured image data, this approach faces significant challenges, including +the laborious and costly requirement for additional metadata and the complex +task of interpreting the root causes of underperformance. To address these +challenges, we introduce AttributionScanner, an innovative human-in-the-loop +Visual Analytics (VA) system, designed for metadata-free data slice finding. +Our system identifies interpretable data slices that involve common model +behaviors and visualizes these patterns through an Attribution Mosaic design. +Our interactive interface provides straightforward guidance for users to +detect, interpret, and annotate predominant model issues, such as spurious +correlations (model biases) and mislabeled data, with minimal effort. +Additionally, it employs a cutting-edge model regularization technique to +mitigate the detected issues and enhance the model's performance. The efficacy +of AttributionScanner is demonstrated through use cases involving two benchmark +datasets, with qualitative and quantitative evaluations showcasing its +substantial effectiveness in vision model validation, ultimately leading to +more reliable and accurate models. + +
+
+ comment: 12 pages, 12 figures, 3 tables. This manuscript is under review by + the IEEE Transactions on Visualization and Computer Graphics (TVCG) +
+
+
+
+
+ + ♻ ☆ FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on IJCAI 2024 + + +
+ Despite their impressive generative performance, latent diffusion model-based +virtual try-on (VTON) methods lack faithfulness to crucial details of the +clothes, such as style, pattern, and text. To alleviate these issues caused by +the diffusion stochastic nature and latent supervision, we propose a novel +Faithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves +the conventional latent diffusion process in three major aspects. First, we +propose incorporating warped clothes as both the starting point and local +condition, supplying the model with faithful clothes priors. Second, we +introduce a novel clothes flattening network to constrain generated try-on +images, providing clothes-consistent faithful supervision. Third, we devise a +clothes-posterior sampling for faithful inference, further enhancing the model +performance over conventional clothes-agnostic Gaussian sampling. Extensive +experimental results on the benchmark VITON-HD and Dress Code datasets +demonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is +able to generate photo-realistic try-on images with faithful clothing details. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Heterogeneous Network Based Contrastive Learning Method for PolSAR Land + Cover Classification + + +
+ Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely
+used in various fields. Recently, deep learning has made significant progress
+in PolSAR image classification. Supervised learning (SL) requires a large
+amount of high-quality labeled PolSAR data to achieve good performance;
+however, manually labeled data are insufficient. This causes SL to fall into
+overfitting and degrades its generalization performance. Furthermore, the
+scattering confusion problem is also a significant challenge that has attracted
+increasing attention. To solve these problems, this article proposes a
+Heterogeneous Network based Contrastive Learning method (HCLNet). It aims to
+learn high-level representations from unlabeled PolSAR data for few-shot
+classification based on multi-features and superpixels. Beyond conventional CL,
+HCLNet introduces a heterogeneous architecture for the first time to better
+utilize heterogeneous PolSAR features. It also develops two easy-to-use plugins
+to narrow the domain gap between optical and PolSAR data: a feature filter,
+which enhances the complementarity of multi-features, and superpixel-based
+instance discrimination, which increases the diversity of negative samples.
+Experiments demonstrate the superiority of HCLNet on three widely used PolSAR
+benchmark datasets compared with state-of-the-art methods. Ablation studies
+also verify the importance of each component. In addition, this work offers
+insights into how to efficiently utilize the multi-features of PolSAR data to
+learn better high-level representations in CL and how to construct networks
+better suited to PolSAR data.
+
+
+
+
+
+ + ♻ ☆ Motion Informed Needle Segmentation in Ultrasound Images + + +
+ Segmenting a moving needle in ultrasound images is challenging due to the
+presence of artifacts, noise, and needle occlusion. This task becomes even more
+demanding in scenarios where data availability is limited. In this paper, we
+present a novel approach for needle segmentation for 2D ultrasound that
+combines classical Kalman Filter (KF) techniques with data-driven learning,
+incorporating both needle features and needle motion. Our method offers three
+key contributions. First, we propose a compatible framework that seamlessly
+integrates into commonly used encoder-decoder style architectures. Second, we
+demonstrate superior performance compared to recent state-of-the-art needle
+segmentation models using our novel convolutional neural network (CNN) based
+KF-inspired block, achieving a 15% reduction in pixel-wise needle tip error
+and an 8% reduction in length error. Third, to our knowledge we are the first
+to implement a learnable filter to incorporate non-linear needle motion for
+improving needle segmentation.
+
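For context, a classical constant-velocity Kalman filter over 2D needle-tip detections is sketched below; the paper's contribution is a learnable, CNN-based KF-inspired block rather than this hand-tuned filter, and the noise parameters here are arbitrary:

```python
import numpy as np

def kalman_track(tip_observations, dt=1.0, q=1e-3, r=4.0):
    """Classical constant-velocity Kalman filter over 2D needle-tip
    detections (x, y) from successive ultrasound frames."""
    F = np.block([[np.eye(2), dt * np.eye(2)],
                  [np.zeros((2, 2)), np.eye(2)]])      # state transition
    H = np.hstack([np.eye(2), np.zeros((2, 2))])       # observe position only
    Q, R = q * np.eye(4), r * np.eye(2)
    x = np.array([*tip_observations[0], 0.0, 0.0])     # [px, py, vx, vy]
    P = np.eye(4)
    smoothed = [tip_observations[0]]
    for z in tip_observations[1:]:
        x, P = F @ x, F @ P @ F.T + Q                  # predict
        S = H @ P @ H.T + R
        K = P @ H.T @ np.linalg.inv(S)                 # Kalman gain
        x = x + K @ (np.asarray(z) - H @ x)            # update
        P = (np.eye(4) - K @ H) @ P
        smoothed.append(x[:2].copy())
    return np.array(smoothed)

# Noisy detections of a tip moving diagonally across the image.
truth = np.stack([np.linspace(0, 50, 30), np.linspace(0, 30, 30)], axis=1)
noisy = truth + np.random.default_rng(0).normal(0, 2, truth.shape)
print(np.abs(kalman_track(noisy) - truth).mean())
```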
+
+ comment: 7 pages, 4 figures, accepted at ISBI 2024 +
+
+
+
+
+ + ♻ ☆ Effectiveness Assessment of Recent Large Vision-Language Models + + +
+ The advent of large vision-language models (LVLMs) represents a noteworthy +advancement towards the pursuit of artificial general intelligence. However, +the model efficacy across both specialized and general tasks warrants further +investigation. This paper endeavors to evaluate the competency of popular LVLMs +in specialized and general tasks, respectively, aiming to offer a comprehensive +understanding of these novel models. To gauge their efficacy in specialized +tasks, we employ six challenging tasks across three distinct application +scenarios, namely natural, healthcare, and industrial ones. Such six tasks +include salient/camouflaged/transparent object detection, as well as polyp +detection, skin lesion detection, and industrial anomaly detection. We examine +the performance of three recent open-source LVLMs, including MiniGPT-v2, +LLaVA-1.5, and Shikra, on both visual recognition and localization under these +tasks. Moreover, we conduct empirical investigations utilizing the +aforementioned LVLMs together with GPT-4V, assessing their multi-modal +understanding capabilities in general tasks including object counting, absurd +question answering, affordance reasoning, attribute recognition, and spatial +relation reasoning. Our investigations reveal that these LVLMs demonstrate +limited proficiency not only in specialized tasks but also in general tasks. We +delve deep into this inadequacy and uncover several potential factors, +including limited cognition in specialized tasks, object hallucination, +text-to-image interference, and decreased robustness in complex problems. We +hope this study could provide useful insights for the future development of +LVLMs, helping researchers improve LVLMs to cope with both general and +specialized applications. + +
+
+
+
+
+ + ♻ ☆ Fast Diffeomorphic Image Registration using Patch based Fully + Convolutional Networks + + +
+ Diffeomorphic image registration is a fundamental step in medical image +analysis, owing to its capability to ensure the invertibility of +transformations and preservation of topology. Currently, unsupervised +learning-based registration techniques primarily extract features at the image +level, potentially limiting their efficacy. This paper proposes a novel +unsupervised learning-based fully convolutional network (FCN) framework for +fast diffeomorphic image registration, emphasizing feature acquisition at the +image patch level. Furthermore, a novel differential operator is introduced and +integrated into the FCN architecture for parameter learning. Experiments are +conducted on three distinct T1-weighted magnetic resonance imaging (T1w MRI) +datasets. Comparative analyses with three state-of-the-art diffeomorphic image +registration approaches including a typical conventional registration algorithm +and two representative unsupervised learning-based methods, reveal that the +proposed method exhibits superior performance in both registration accuracy and +topology preservation. + +
+
+
+
+
+ + ♻ ☆ PoseINN: Realtime Visual-based Pose Regression and Localization with + Invertible Neural Networks + + +
+ Estimating ego-pose from cameras is an important problem in robotics with +applications ranging from mobile robotics to augmented reality. While SOTA +models are becoming increasingly accurate, they can still be unwieldy due to +high computational costs. In this paper, we propose to solve the problem by +using invertible neural networks (INN) to find the mapping between the latent +space of images and poses for a given scene. Our model achieves similar +performance to the SOTA while being faster to train and only requiring offline +rendering of low-resolution synthetic data. By using normalizing flows, the +proposed method also provides uncertainty estimation for the output. We also +demonstrated the efficiency of this method by deploying the model on a mobile +robot. + +
+
+
+
+
+ + ♻ ☆ Modeling Caption Diversity in Contrastive Vision-Language Pretraining ICML2024 + + +
+ There are a thousand ways to caption an image. Contrastive Language-Image
+Pretraining (CLIP), on the other hand, works by mapping an image and its
+caption to a single vector -- limiting how well CLIP-like models can represent
+the diverse ways to describe an image. In this work, we introduce Llip, Latent
+Language Image Pretraining, which models the diversity of captions that could
+match an image. Llip's vision encoder outputs a set of visual features that are
+mixed into a final representation by conditioning on information derived from
+the text. We show that Llip outperforms non-contextualized baselines like CLIP
+and SigLIP on a variety of tasks even with large-scale encoders. Llip improves
+zero-shot classification by an average of 2.9% across zero-shot classification
+benchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot
+top-1 accuracy of 83.5% on ImageNet, outperforming a similarly sized CLIP by
+1.4%. We also demonstrate improvement on zero-shot retrieval on MS-COCO by
+6.0%. We provide a comprehensive analysis of the components introduced by the
+method and demonstrate that Llip leads to richer visual representations.
+
+
+ comment: 14 pages, 8 figures, 7 tables, to be published at ICML2024 +
+
+
+
+
+ + ♻ ☆ Towards a clinically accessible radiology foundation model: open-access + and lightweight, with automated evaluation + + +
+ The scaling laws and extraordinary performance of large foundation models +motivate the development and utilization of such models in biomedicine. +However, despite early promising results on some biomedical benchmarks, there +are still major challenges that need to be addressed before these models can be +used in real-world clinics. Frontier general-domain models such as GPT-4V still +have significant performance gaps in multimodal biomedical applications. More +importantly, less-acknowledged pragmatic issues, including accessibility, model +cost, and tedious manual evaluation make it hard for clinicians to use +state-of-the-art large models directly on private patient data. Here, we +explore training open-source small multimodal models (SMMs) to bridge +competency gaps for unmet clinical needs in radiology. To maximize data +efficiency, we adopt a modular approach by incorporating state-of-the-art +pre-trained models for image and text modalities, and focusing on training a +lightweight adapter to ground each modality to the text embedding space, as +exemplified by LLaVA-Med. For training, we assemble a large dataset of over 697 +thousand radiology image-text pairs. For evaluation, we propose CheXprompt, a +GPT-4-based metric for factuality evaluation, and demonstrate its parity with +expert evaluation. For best practice, we conduct a systematic ablation study on +various choices in data engineering and multimodal training. The resulting +LlaVA-Rad (7B) model attains state-of-the-art results on standard radiology +tasks such as report generation and cross-modal retrieval, even outperforming +much larger models such as GPT-4V and Med-PaLM M (84B). The inference of +LlaVA-Rad is fast and can be performed on a single V100 GPU in private +settings, offering a promising state-of-the-art tool for real-world clinical +applications. + +
+
+
+
+
+ + ♻ ☆ Parkinson's Disease Classification Using Contrastive Graph Cross-View + Learning with Multimodal Fusion of SPECT Images and Clinical Features + + +
+ Parkinson's Disease (PD) affects millions globally, impacting movement. Prior +research utilized deep learning for PD prediction, primarily focusing on +medical images, neglecting the data's underlying manifold structure. This work +proposes a multimodal approach encompassing both image and non-image features, +leveraging contrastive cross-view graph fusion for PD classification. We +introduce a novel multimodal co-attention module, integrating embeddings from +separate graph views derived from low-dimensional representations of images and +clinical features. This enables more robust and structured feature extraction +for improved multi-view data analysis. Additionally, a simplified contrastive +loss-based fusion method is devised to enhance cross-view fusion learning. Our +graph-view multimodal approach achieves an accuracy of 91% and an area under +the receiver operating characteristic curve (AUC) of 92.8% in five-fold +cross-validation. It also demonstrates superior predictive capabilities on +non-image data compared to solely machine learning-based methods. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 87 + +
+
+
+ + ☆ Vibe-Eval: A hard evaluation suite for measuring progress of multimodal + language models + + +
+ We introduce Vibe-Eval: a new open benchmark and framework for evaluating +multimodal chat models. Vibe-Eval consists of 269 visual understanding prompts, +including 100 of hard difficulty, complete with gold-standard responses +authored by experts. Vibe-Eval is open-ended and challenging with dual +objectives: (i) vibe checking multimodal chat models for day-to-day tasks and +(ii) rigorously testing and probing the capabilities of present frontier +models. Notably, our hard set contains >50% of questions that all frontier models +answer incorrectly. We explore the nuances of designing, evaluating, and +ranking models on ultra-challenging prompts. We also discuss trade-offs between +human and automatic evaluation, and show that automatic model evaluation using +Reka Core roughly correlates with human judgment. We offer free API access for +the purpose of lightweight evaluation and plan to conduct formal human +evaluations for public models that perform well on Vibe-Eval's automatic +scores. We release the evaluation code and data; see +https://github.com/reka-ai/reka-vibe-eval + +
+
+
+
+
+ + ☆ DreamScene4D: Dynamic Multi-Object Scene Generation from Monocular + Videos + + +
+ Existing VLMs can track in-the-wild 2D video objects while current generative +models provide powerful visual priors for synthesizing novel views for the +highly under-constrained 2D-to-3D object lifting. Building upon this exciting +progress, we present DreamScene4D, the first approach that can generate +three-dimensional dynamic scenes of multiple objects from monocular in-the-wild +videos with large object motion across occlusions and novel viewpoints. Our key +insight is to design a "decompose-then-recompose" scheme to factorize both the +whole video scene and each object's 3D motion. We first decompose the video +scene by using open-vocabulary mask trackers and an adapted image diffusion +model to segment, track, and amodally complete the objects and background in +the video. Each object track is mapped to a set of 3D Gaussians that deform and +move in space and time. We also factorize the observed motion into multiple +components to handle fast motion. The camera motion can be inferred by +re-rendering the background to match the video frames. For the object motion, +we first model the object-centric deformation of the objects by leveraging +rendering losses and multi-view generative priors in an object-centric frame, +then optimize object-centric to world-frame transformations by comparing the +rendered outputs against the perceived pixel and optical flow. Finally, we +recompose the background and objects and optimize for relative object scales +using monocular depth prediction guidance. We show extensive results on the +challenging DAVIS, Kubric, and self-captured videos, detail some limitations, +and provide future directions. Besides 4D scene generation, our results show +that DreamScene4D enables accurate 2D point motion tracking by projecting the +inferred 3D trajectories to 2D, while never explicitly trained to do so. + +
+
+ comment: Project page: https://dreamscene4d.github.io/ +
+
+
+
+
+ + ☆ On the test-time zero-shot generalization of vision-language models: Do + we really need prompt learning? + + +
+ The development of large vision-language models, notably CLIP, has catalyzed +research into effective adaptation techniques, with a particular focus on soft +prompt tuning. Conjointly, test-time augmentation, which utilizes multiple +augmented views of a single image to enhance zero-shot generalization, is +emerging as a significant area of interest. This has predominantly directed +research efforts toward test-time prompt tuning. In contrast, we introduce a +robust MeanShift for Test-time Augmentation (MTA), which surpasses prompt-based +methods without requiring this intensive training procedure. This positions MTA +as an ideal solution for both standalone and API-based applications. +Additionally, our method does not rely on ad hoc rules (e.g., confidence +threshold) used in some previous test-time augmentation techniques to filter +the augmented views. Instead, MTA incorporates a quality assessment variable +for each view directly into its optimization process, termed as the inlierness +score. This score is jointly optimized with a density mode seeking process, +leading to an efficient training- and hyperparameter-free approach. We +extensively benchmark our method on 15 datasets and demonstrate MTA's +superiority and computational efficiency. Deployed easily as plug-and-play +module on top of zero-shot models and state-of-the-art few-shot methods, MTA +shows systematic and consistent improvements. + +
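+ To make the density-mode-seeking idea above concrete, here is a small, self-contained sketch of weighted mean shift over the embeddings of augmented views, where each view receives a soft inlierness-like weight. It is an illustrative toy under our own assumptions (Gaussian kernel, fixed bandwidth), not the MTA algorithm itself.
+ ```python
+ import numpy as np
+ 
+ def mean_shift_mode(view_embeddings, bandwidth=0.5, iters=20):
+     """Toy mean-shift mode seeking over embeddings of augmented views.
+     Views close to the emerging density mode get higher (inlier-like) weights."""
+     z = view_embeddings.mean(axis=0)              # initialise at the plain average
+     for _ in range(iters):
+         d2 = ((view_embeddings - z) ** 2).sum(axis=1)
+         w = np.exp(-d2 / (2 * bandwidth ** 2))    # soft "inlierness" of each view
+         z = (w[:, None] * view_embeddings).sum(axis=0) / w.sum()
+     return z, w / w.sum()
+ 
+ views = np.random.randn(32, 8)                    # 32 augmented views, 8-dim features
+ views[:3] += 5.0                                  # a few outlier augmentations
+ mode, weights = mean_shift_mode(views)
+ print(mode.shape, weights[:3])                    # outliers end up down-weighted
+ ```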
+
+
+
+
+ + ☆ What matters when building vision-language models? + + +
+ The growing interest in vision-language models (VLMs) has been driven by +improvements in large language models and vision transformers. Despite the +abundance of literature on this subject, we observe that critical decisions +regarding the design of VLMs are often not justified. We argue that these +unsupported decisions impede progress in the field by making it difficult to +identify which choices improve model performance. To address this issue, we +conduct extensive experiments around pre-trained models, architecture choice, +data, and training methods. Our consolidation of findings includes the +development of Idefics2, an efficient foundational VLM of 8 billion parameters. +Idefics2 achieves state-of-the-art performance within its size category across +various multimodal benchmarks, and is often on par with models four times its +size. We release the model (base, instructed, and chat) along with the datasets +created for its training. + +
+
+
+
+
+ + ☆ Designed Dithering Sign Activation for Binary Neural Networks + + +
+ Binary Neural Networks emerged as a cost-effective and energy-efficient +solution for computer vision tasks by binarizing either network weights or +activations. However, common binary activations, such as the Sign activation +function, abruptly binarize the values with a single threshold, losing +fine-grained details in the feature outputs. This work proposes an activation +that applies multiple thresholds following dithering principles, shifting the +Sign activation function for each pixel according to a spatially periodic +threshold kernel. Unlike literature methods, the shifting is defined jointly +for a set of adjacent pixels, taking advantage of spatial correlations. +Experiments over the classification task demonstrate the effectiveness of the +designed dithering Sign activation function as an alternative activation for +binary neural networks, without increasing the computational cost. Further, +DeSign balances the preservation of details with the efficiency of binary +operations. + +
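+ The core idea of a spatially periodic threshold kernel can be illustrated in a few lines of NumPy: tile a small threshold kernel over the feature map and compare each pixel against its local threshold before taking the sign. The kernel values below are invented for illustration only; the paper's designed kernel and its integration into a binary network are not reproduced here.
+ ```python
+ import numpy as np
+ 
+ def dithered_sign(x, kernel):
+     """Toy dithered Sign activation: binarize a (H, W) feature map against a
+     spatially periodic threshold kernel tiled over the map (Bayer-style dithering)."""
+     kh, kw = kernel.shape
+     H, W = x.shape
+     reps = (int(np.ceil(H / kh)), int(np.ceil(W / kw)))
+     thresholds = np.tile(kernel, reps)[:H, :W]
+     return np.where(x >= thresholds, 1.0, -1.0)
+ 
+ # 2x2 periodic threshold kernel (assumed values, for illustration only)
+ kernel = np.array([[-0.5, 0.5],
+                    [0.5, -0.5]])
+ features = np.random.randn(4, 4)
+ print(dithered_sign(features, kernel))
+ ```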
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Multispectral Fine-Grained Classification of Blackgrass in Wheat and + Barley Crops + + +
+ As the burden of herbicide resistance grows and the environmental +repercussions of excessive herbicide use become clear, new ways of managing +weed populations are needed. This is particularly true for cereal crops, like +wheat and barley, that are staple food crops and occupy a globally significant +portion of agricultural land. Even small improvements in weed management +practices across these major food crops worldwide would yield considerable +benefits for both the environment and global food security. Blackgrass is a +major grass weed which causes particular problems in cereal crops in north-west +Europe, a major cereal production area, because it has high levels of +herbicide resistance and is well adapted to agronomic practice in this region. +With the use of machine vision and multispectral imaging, we investigate the +effectiveness of state-of-the-art methods to identify blackgrass in wheat and +barley crops. As part of this work, we provide a large dataset with which we +evaluate several key aspects of blackgrass weed recognition. Firstly, we +determine the performance of different CNN and transformer-based architectures +on images from unseen fields. Secondly, we demonstrate the role that different +spectral bands have on the performance of weed classification. Lastly, we +evaluate the role of dataset size in classification performance for each of the +models trialled. We find that even with a fairly modest quantity of training +data, an accuracy of almost 90% can be achieved on images from unseen fields. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Non-Destructive Peat Analysis using Hyperspectral Imaging and Machine + Learning + + +
+ Peat, a crucial component in whisky production, imparts distinctive and +irreplaceable flavours to the final product. However, the extraction of peat +disrupts ancient ecosystems and releases significant amounts of carbon, +contributing to climate change. This paper aims to address this issue by +conducting a feasibility study on enhancing peat use efficiency in whisky +manufacturing through non-destructive analysis using hyperspectral imaging. +Results show that short-wave infrared (SWIR) data is more effective for +analyzing peat samples and predicting total phenol levels, with accuracies up +to 99.81%. + +
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+ + ☆ Training-Free Deepfake Voice Recognition by Leveraging Large-Scale + Pre-Trained Models + + +
+ Generalization is a main issue for current audio deepfake detectors, which +struggle to provide reliable results on out-of-distribution data. Given the +speed at which more and more accurate synthesis methods are developed, it is +very important to design techniques that work well also on data they were not +trained for. In this paper, we study the potential of large-scale pre-trained +models for audio deepfake detection, with special focus on generalization +ability. To this end, the detection problem is reformulated in a speaker +verification framework and fake audios are exposed by the mismatch between the +voice sample under test and the voice of the claimed identity. With this +paradigm, no fake speech sample is necessary in training, cutting off any link +with the generation method at the root, and ensuring full generalization +ability. Features are extracted by general-purpose large pre-trained models, +with no need for training or fine-tuning on specific fake detection or speaker +verification datasets. At detection time, only a limited set of voice fragments +of the identity under test is required. Experiments on several datasets +widespread in the community show that detectors based on pre-trained models +achieve excellent performance and show strong generalization ability, rivaling +supervised methods on in-distribution data and largely outperforming them on +out-of-distribution data. + +
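+ The verification-style decision rule described above reduces to comparing a test embedding against reference embeddings of the claimed identity and thresholding the similarity. The sketch below illustrates that rule with random placeholder vectors standing in for the embeddings of a large pre-trained speech model; the threshold and embedding size are our assumptions, not values from the paper.
+ ```python
+ import numpy as np
+ 
+ def verify_speaker(test_emb, enrollment_embs, threshold=0.6):
+     """Decide real vs. fake by speaker verification: compare the test utterance
+     embedding to reference embeddings of the claimed identity."""
+     def cos(a, b):
+         return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+     score = np.mean([cos(test_emb, e) for e in enrollment_embs])
+     return score, score >= threshold   # low similarity -> likely spoofed/fake
+ 
+ # Placeholder embeddings; a real system would use a large pre-trained speech model.
+ rng = np.random.default_rng(0)
+ identity = rng.normal(size=256)
+ enrollment = [identity + 0.1 * rng.normal(size=256) for _ in range(5)]
+ genuine = identity + 0.1 * rng.normal(size=256)
+ fake = rng.normal(size=256)
+ print(verify_speaker(genuine, enrollment))   # high score -> accepted as genuine
+ print(verify_speaker(fake, enrollment))      # low score  -> flagged as fake
+ ```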
+
+
+
+
+ + ☆ Self-Supervised Learning for Real-World Super-Resolution from Dual and + Multiple Zoomed Observations ECCV 2022 + + +
+ In this paper, we consider two challenging issues in reference-based +super-resolution (RefSR) for smartphones: (i) how to choose a proper reference +image, and (ii) how to learn RefSR in a self-supervised manner. Particularly, +we propose a novel self-supervised learning approach for real-world RefSR from +observations at dual and multiple camera zooms. Firstly, considering the +popularity of multiple cameras in modern smartphones, the more zoomed +(telephoto) image can be naturally leveraged as the reference to guide the +super-resolution (SR) of the lesser zoomed (ultra-wide) image, which gives us a +chance to learn a deep network that performs SR from the dual zoomed +observations (DZSR). Secondly, for self-supervised learning of DZSR, we take +the telephoto image instead of an additional high-resolution image as the +supervision information, and select a center patch from it as the reference to +super-resolve the corresponding ultra-wide image patch. To mitigate the effect +of the misalignment between the ultra-wide low-resolution (LR) patch and telephoto +ground-truth (GT) image during training, we first adopt patch-based optical +flow alignment and then design an auxiliary-LR to guide the deforming of the +warped LR features. To generate visually pleasing results, we present a local +overlapped sliced Wasserstein loss to better represent the perceptual +difference between GT and output in the feature space. During testing, DZSR can +be directly deployed to super-resolve the whole ultra-wide image with the +reference of the telephoto image. In addition, we further take multiple zoomed +observations to explore self-supervised RefSR, and present a progressive fusion +scheme for the effective utilization of reference images. Experiments show that +our methods achieve better quantitative and qualitative performance against +state-of-the-art methods. Codes are available at +https://github.com/cszhilu1998/SelfDZSR_PlusPlus. + +
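+ The sliced Wasserstein component of the loss mentioned above can be illustrated independently of the overlapped-patch slicing: project two feature sets onto random directions, sort the projections, and compare them. The sketch below shows only that core distance under our own assumptions (random projections, squared error); it is not the paper's full local overlapped loss.
+ ```python
+ import torch
+ 
+ def sliced_wasserstein(feat_a, feat_b, n_proj=64):
+     """Toy sliced Wasserstein distance between two sets of feature vectors
+     of shape (N, C): project onto random directions, sort, and compare."""
+     c = feat_a.shape[1]
+     proj = torch.randn(c, n_proj)
+     proj = proj / proj.norm(dim=0, keepdim=True)
+     pa, _ = torch.sort(feat_a @ proj, dim=0)
+     pb, _ = torch.sort(feat_b @ proj, dim=0)
+     return ((pa - pb) ** 2).mean()
+ 
+ gt_feats = torch.randn(256, 32)      # e.g. features of ground-truth patches
+ out_feats = torch.randn(256, 32)     # features of the network output
+ print(sliced_wasserstein(gt_feats, out_feats))
+ ```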
+
+ comment: Accepted by IEEE TPAMI in 2024. Extended version of ECCV 2022 paper + "Self-Supervised Learning for Real-World Super-Resolution from Dual Zoomed + Observations" (arXiv:2203.01325) +
+
+
+
+
+ + ☆ Mapping the Unseen: Unified Promptable Panoptic Mapping with Dynamic + Labeling using Foundation Models + + +
+ In the field of robotics and computer vision, efficient and accurate semantic +mapping remains a significant challenge due to the growing demand for +intelligent machines that can comprehend and interact with complex +environments. Conventional panoptic mapping methods, however, are limited by +predefined semantic classes, thus making them ineffective for handling novel or +unforeseen objects. In response to this limitation, we introduce the Unified +Promptable Panoptic Mapping (UPPM) method. UPPM utilizes recent advances in +foundation models to enable real-time, on-demand label generation using natural +language prompts. By incorporating a dynamic labeling strategy into traditional +panoptic mapping techniques, UPPM provides significant improvements in +adaptability and versatility while maintaining high performance levels in map +reconstruction. We demonstrate our approach on real-world and simulated +datasets. Results show that UPPM can accurately reconstruct scenes and segment +objects while generating rich semantic labels through natural language +interactions. A series of ablation experiments validated the advantages of +foundation model-based labeling over fixed label sets. + +
+
+
+
+
+ + ☆ Multi-method Integration with Confidence-based Weighting for Zero-shot + Image Classification + + +
+ This paper introduces a novel framework for zero-shot learning (ZSL), i.e., +to recognize new categories that are unseen during training, by using a +multi-model and multi-alignment integration method. Specifically, we propose +three strategies to enhance the model's performance to handle ZSL: 1) Utilizing +the extensive knowledge of ChatGPT and the powerful image generation +capabilities of DALL-E to create reference images that can precisely describe +unseen categories and classification boundaries, thereby alleviating the +information bottleneck issue; 2) Integrating the results of text-image +alignment and image-image alignment from CLIP, along with the image-image +alignment results from DINO, to achieve more accurate predictions; 3) +Introducing an adaptive weighting mechanism based on confidence levels to +aggregate the outcomes from different prediction methods. Experimental results +on multiple datasets, including CIFAR-10, CIFAR-100, and TinyImageNet, +demonstrate that our model can significantly improve classification accuracy +compared to single-model approaches, achieving AUROC scores above 96% across +all test datasets, and notably surpassing 99% on the CIFAR-10 dataset. + +
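+ The adaptive confidence-based weighting in point (3) of the abstract above can be illustrated with a simple rule: weight each method's class-probability vector by its own confidence and renormalise. The sketch below uses the maximum probability as a stand-in confidence measure; the actual weighting scheme in the paper may differ.
+ ```python
+ import numpy as np
+ 
+ def confidence_weighted_ensemble(prob_lists):
+     """Toy confidence-weighted fusion: weight each method's class probabilities
+     by its own confidence (max probability), then renormalise."""
+     fused = np.zeros_like(prob_lists[0])
+     for probs in prob_lists:
+         conf = probs.max(axis=1, keepdims=True)   # per-sample confidence
+         fused += conf * probs
+     return fused / fused.sum(axis=1, keepdims=True)
+ 
+ # Three hypothetical alignment methods, 4 samples, 3 classes
+ rng = np.random.default_rng(1)
+ methods = [rng.dirichlet(np.ones(3), size=4) for _ in range(3)]
+ fused = confidence_weighted_ensemble(methods)
+ print(fused.argmax(axis=1))
+ ```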
+
+
+
+
+ + ☆ Probabilistic Restoration with Adaptive Noise Sampling for 3D Human Pose + Estimation ICME 2024 + +
+ The accuracy and robustness of 3D human pose estimation (HPE) are limited by +2D pose detection errors and 2D to 3D ill-posed challenges, which have drawn +great attention to Multi-Hypothesis HPE research. Most existing MH-HPE methods +are based on generative models, which are computationally expensive and +difficult to train. In this study, we propose a Probabilistic Restoration 3D +Human Pose Estimation framework (PRPose) that can be integrated with any +lightweight single-hypothesis model. Specifically, PRPose employs a weakly +supervised approach to fit the hidden probability distribution of the 2D-to-3D +lifting process in the Single-Hypothesis HPE model and then reverse-map the +distribution to the 2D pose input through an adaptive noise sampling strategy +to generate reasonable multi-hypothesis samples effectively. Extensive +experiments on 3D HPE benchmarks (Human3.6M and MPI-INF-3DHP) highlight the +effectiveness and efficiency of PRPose. Code is available at: +https://github.com/xzhouzeng/PRPose. + +
+
+ comment: ICME 2024 +
+
+
+
+
+ + ☆ Three-Dimensional Amyloid-Beta PET Synthesis from Structural MRI with + Conditional Generative Adversarial Networks + + +
+ Motivation: Alzheimer's Disease hallmarks include amyloid-beta deposits and +brain atrophy, detectable via PET and MRI scans, respectively. PET is +expensive, invasive and exposes patients to ionizing radiation. MRI is cheaper, +non-invasive, and free from ionizing radiation but limited to measuring brain +atrophy. + Goal: To develop a 3D image translation model that synthesizes amyloid-beta +PET images from T1-weighted MRI, exploiting the known relationship between +amyloid-beta and brain atrophy. + Approach: The model was trained on 616 PET/MRI pairs and validated with 264 +pairs. + Results: The model synthesized amyloid-beta PET images from T1-weighted MRI +with a high degree of similarity, as reflected by high SSIM and PSNR metrics +(SSIM > 0.95 and PSNR = 28). + Impact: Our model proves the feasibility of synthesizing amyloid-beta PET +images from structural MRI ones, significantly enhancing accessibility for +large-cohort studies and early dementia detection, while also reducing cost, +invasiveness, and radiation exposure. + +
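+ For readers who want to reproduce the kind of similarity metrics quoted above, SSIM and PSNR can be computed with scikit-image as below. The volumes here are random stand-ins rather than the paper's data; only the metric calls are the point.
+ ```python
+ import numpy as np
+ from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+ 
+ # Stand-in volumes; a real evaluation would load the synthesized and reference PET.
+ rng = np.random.default_rng(0)
+ reference = rng.random((32, 32, 32)).astype(np.float32)
+ synthesized = np.clip(reference + 0.05 * rng.standard_normal(reference.shape),
+                       0, 1).astype(np.float32)
+ 
+ psnr = peak_signal_noise_ratio(reference, synthesized, data_range=1.0)
+ ssim = structural_similarity(reference, synthesized, data_range=1.0)
+ print(f"PSNR={psnr:.2f} dB, SSIM={ssim:.3f}")
+ ```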
+
+ comment: Abstract Submitted and Presented at the 2024 International Society of + Magnetic Resonance in Medicine. Singapore, Singapore, May 4-9. Abstract + Number 2239 +
+
+
+
+
+ + ☆ MVP-Shot: Multi-Velocity Progressive-Alignment Framework for Few-Shot + Action Recognition + + +
+ Recent few-shot action recognition (FSAR) methods achieve promising +performance by performing semantic matching on learned discriminative features. +However, most FSAR methods focus on single-scale (e.g., frame-level, +segment-level, etc.) feature alignment, which ignores that human actions with +the same semantics may appear at different velocities. To this end, we develop a +novel Multi-Velocity Progressive-alignment (MVP-Shot) framework to +progressively learn and align semantic-related action features at +multi-velocity levels. Concretely, a Multi-Velocity Feature Alignment (MVFA) +module is designed to measure the similarity between features from support and +query videos with different velocity scales and then merge all similarity +scores in a residual fashion. To avoid the multiple velocity features deviating +from the underlying motion semantics, our proposed Progressive Semantic-Tailored +Interaction (PSTI) module injects velocity-tailored text information into the +video feature via feature interaction on channel and temporal domains at +different velocities. The above two modules compensate for each other to +predict query categories more accurately under the few-shot settings. +Experimental results show our method outperforms current state-of-the-art +methods on multiple standard few-shot benchmarks (i.e., HMDB51, UCF101, +Kinetics, and SSv2-small). + +
+
+
+
+
+ + ☆ Advancing Pre-trained Teacher: Towards Robust Feature Discrepancy for + Anomaly Detection + + +
+ With the wide application of knowledge distillation between an ImageNet +pre-trained teacher model and a learnable student model, industrial anomaly +detection has witnessed a significant achievement in the past few years. The +success of knowledge distillation mainly relies on how to keep the feature +discrepancy between the teacher and student model, in which it assumes that: +(1) the teacher model can jointly represent two different distributions for the +normal and abnormal patterns, while (2) the student model can only reconstruct +the normal distribution. However, it still remains a challenging issue to +maintain these ideal assumptions in practice. In this paper, we propose a +simple yet effective two-stage industrial anomaly detection framework, termed +as AAND, which sequentially performs Anomaly Amplification and Normality +Distillation to obtain robust feature discrepancy. In the first anomaly +amplification stage, we propose a novel Residual Anomaly Amplification (RAA) +module to advance the pre-trained teacher encoder. With the exposure of +synthetic anomalies, it amplifies anomalies via residual generation while +maintaining the integrity of pre-trained model. It mainly comprises a +Matching-guided Residual Gate and an Attribute-scaling Residual Generator, +which can determine the residuals' proportion and characteristic, respectively. +In the second normality distillation stage, we further employ a reverse +distillation paradigm to train a student decoder, in which a novel Hard +Knowledge Distillation (HKD) loss is built to better facilitate the +reconstruction of normal patterns. Comprehensive experiments on the MvTecAD, +VisA, and MvTec3D-RGB datasets show that our method achieves state-of-the-art +performance. + +
+
+ comment: The paper is under review +
+
+
+
+
+ + ☆ WateRF: Robust Watermarks in Radiance Fields for Protection of + Copyrights + + +
+ The advances in the Neural Radiance Fields (NeRF) research offer extensive +applications in diverse domains, but protecting their copyrights has not yet +been researched in depth. Recently, NeRF watermarking has been considered one +of the pivotal solutions for safely deploying NeRF-based 3D representations. +However, existing methods are designed to apply only to implicit or explicit +NeRF representations. In this work, we introduce an innovative watermarking +method that can be employed in both representations of NeRF. This is achieved +by fine-tuning NeRF to embed binary messages in the rendering process. In +detail, we propose utilizing the discrete wavelet transform in the NeRF space +for watermarking. Furthermore, we adopt a deferred back-propagation technique +and introduce a combination with the patch-wise loss to improve rendering +quality and bit accuracy with minimum trade-offs. We evaluate our method in +three different aspects: capacity, invisibility, and robustness of the embedded +watermarks in the 2D-rendered images. Our method achieves state-of-the-art +performance with faster training speed over the compared state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Towards general deep-learning-based tree instance segmentation models + + +
+ The segmentation of individual trees from forest point clouds is a crucial +task for downstream analyses such as carbon sequestration estimation. Recently, +deep-learning-based methods have been proposed which show the potential of +learning to segment trees. Since these methods are trained in a supervised way, +the question arises how general models can be obtained that are applicable +across a wide range of settings. So far, training has been mainly conducted +with data from one specific laser scanning type and for specific types of +forests. In this work, we train one segmentation model under various +conditions, using seven diverse datasets found in literature, to gain insights +into the generalization capabilities under domain-shift. Our results suggest +that a generalization from coniferous dominated sparse point clouds to +deciduous dominated high-resolution point clouds is possible. Conversely, +qualitative evidence suggests that generalization from high-resolution to +low-resolution point clouds is challenging. This emphasizes the need for forest +point clouds with diverse data characteristics for model development. To enrich +the available data basis, labeled trees from two previous works were propagated +to the complete forest point cloud and are made publicly available at +https://doi.org/10.25625/QUTUWU. + +
+
+
+
+
+ + ☆ IFNet: Deep Imaging and Focusing for Handheld SAR with Millimeter-wave + Signals + + +
+ Recent advancements have showcased the potential of handheld millimeter-wave +(mmWave) imaging, which applies synthetic aperture radar (SAR) principles in +portable settings. However, existing studies addressing handheld motion errors +either rely on costly tracking devices or employ simplified imaging models, +leading to impractical deployment or limited performance. In this paper, we +present IFNet, a novel deep unfolding network that combines the strengths of +signal processing models and deep neural networks to achieve robust imaging and +focusing for handheld mmWave systems. We first formulate the handheld imaging +model by integrating multiple priors about mmWave images and handheld phase +errors. Furthermore, we transform the optimization processes into an iterative +network structure for improved and efficient imaging performance. Extensive +experiments demonstrate that IFNet effectively compensates for handheld phase +errors and recovers high-fidelity images from severely distorted signals. In +comparison with existing methods, IFNet can achieve at least 11.89 dB +improvement in average peak signal-to-noise ratio (PSNR) and 64.91% improvement +in average structural similarity index measure (SSIM) on a real-world dataset. + +
+
+
+
+
+ + ☆ DiffMap: Enhancing Map Segmentation with Map Prior Using Diffusion Model + + +
+ Constructing high-definition (HD) maps is a crucial requirement for enabling +autonomous driving. In recent years, several map segmentation algorithms have +been developed to address this need, leveraging advancements in Bird's-Eye View +(BEV) perception. However, existing models still encounter challenges in +producing realistic and consistent semantic map layouts. One prominent issue is +the limited utilization of structured priors inherent in map segmentation +masks. In light of this, we propose DiffMap, a novel approach specifically +designed to model the structured priors of map segmentation masks using a latent +diffusion model. By incorporating this technique, the performance of existing +semantic segmentation methods can be significantly enhanced, and certain +structural errors present in the segmentation outputs can be effectively +rectified. Notably, the proposed module can be seamlessly integrated into any +map segmentation model, thereby augmenting its capability to accurately +delineate semantic information. Furthermore, through extensive visualization +analysis, our model demonstrates superior proficiency in generating results +that more accurately reflect real-world map layouts, further validating its +efficacy in improving the quality of the generated maps. + +
+
+
+
+
+ + ☆ HoloGS: Instant Depth-based 3D Gaussian Splatting with Microsoft + HoloLens 2 SP + + +
+ In the fields of photogrammetry, computer vision and computer graphics, the +task of neural 3D scene reconstruction has led to the exploration of various +techniques. Among these, 3D Gaussian Splatting stands out for its explicit +representation of scenes using 3D Gaussians, making it appealing for tasks like +3D point cloud extraction and surface reconstruction. Motivated by its +potential, we address the domain of 3D scene reconstruction, aiming to leverage +the capabilities of the Microsoft HoloLens 2 for instant 3D Gaussian Splatting. +We present HoloGS, a novel workflow utilizing HoloLens sensor data, which +bypasses the need for pre-processing steps like Structure from Motion by +instantly accessing the required input data i.e. the images, camera poses and +the point cloud from depth sensing. We provide comprehensive investigations, +including the training process and the rendering quality, assessed through the +Peak Signal-to-Noise Ratio, and the geometric 3D accuracy of the densified +point cloud from Gaussian centers, measured by Chamfer Distance. We evaluate +our approach on two self-captured scenes: An outdoor scene of a cultural +heritage statue and an indoor scene of a fine-structured plant. Our results +show that the HoloLens data, including RGB images, corresponding camera poses, +and depth sensing based point clouds to initialize the Gaussians, are suitable +as input for 3D Gaussian Splatting. + +
+
+ comment: 8 pages, 9 figures, 2 tables. Will be published in the ISPRS The + International Archives of Photogrammetry, Remote Sensing and Spatial + Information Sciences +
+
+
+
+
+ + ☆ M${^2}$Depth: Self-supervised Two-Frame Multi-camera Metric Depth + Estimation + + +
+ This paper presents a novel self-supervised two-frame multi-camera metric +depth estimation network, termed M${^2}$Depth, which is designed to predict +reliable scale-aware surrounding depth in autonomous driving. Unlike the +previous works that use multi-view images from a single time-step or multiple +time-step images from a single camera, M${^2}$Depth takes temporally adjacent +two-frame images from multiple cameras as inputs and produces high-quality +surrounding depth. We first construct cost volumes in spatial and temporal +domains individually and propose a spatial-temporal fusion module that +integrates the spatial-temporal information to yield a strong volume +presentation. We additionally combine the neural prior from SAM features with +internal features to reduce the ambiguity between foreground and background and +strengthen the depth edges. Extensive experimental results on nuScenes and DDAD +benchmarks show M${^2}$Depth achieves state-of-the-art performance. More +results can be found in https://heiheishuang.xyz/M2Depth . + +
+
+
+
+
+ + ☆ Cooperation and Federation in Distributed Radar Point Cloud Processing + + +
+ The paper considers the problem of human-scale RF sensing utilizing a network +of resource-constrained MIMO radars with low range-azimuth resolution. The +radars operate in the mmWave band and obtain time-varying 3D point cloud (PC) +information that is sensitive to body movements. They also observe the same +scene from different views and cooperate while sensing the environment using a +sidelink communication channel. Conventional cooperation setups allow the +radars to mutually exchange raw PC information to improve ego sensing. The +paper proposes a federation mechanism where the radars exchange the parameters +of a Bayesian posterior measure of the observed PCs, rather than raw data. The +radars act as distributed parameter servers to reconstruct a global posterior +(i.e., federated posterior) using Bayesian tools. The paper quantifies and +compares the benefits of radar federation with respect to cooperation +mechanisms. Both approaches are validated by experiments with a real-time +demonstration platform. Federation makes minimal use of the sidelink +communication channel (20-25 times lower bandwidth use) and is less +sensitive to unresolved targets. On the other hand, cooperation reduces the +mean absolute target estimation error by about 20%. + +
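+ As a small illustration of fusing posterior parameters instead of raw point clouds, the sketch below combines Gaussian posteriors from two radars by adding their precisions (information-filter style). The Gaussian form, dimensions, and numbers are our own simplifying assumptions; the paper's federated posterior over point clouds is more general.
+ ```python
+ import numpy as np
+ 
+ def fuse_gaussian_posteriors(means, covs):
+     """Toy federated fusion of Gaussian posteriors from several radars:
+     combine them by adding precisions (information filter style)."""
+     info = np.zeros_like(covs[0])
+     info_mean = np.zeros_like(means[0])
+     for m, c in zip(means, covs):
+         p = np.linalg.inv(c)
+         info += p
+         info_mean += p @ m
+     fused_cov = np.linalg.inv(info)
+     return fused_cov @ info_mean, fused_cov
+ 
+ # Two radars observing the same target position with different uncertainty
+ m1, c1 = np.array([1.0, 2.0]), np.diag([0.5, 0.1])
+ m2, c2 = np.array([1.2, 1.9]), np.diag([0.1, 0.4])
+ mean, cov = fuse_gaussian_posteriors([m1, m2], [c1, c2])
+ print(mean, np.diag(cov))
+ ```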
+
+
+
+
+ + ☆ SFFNet: A Wavelet-Based Spatial and Frequency Domain Fusion Network for + Remote Sensing Segmentation + + +
+ In order to fully utilize spatial information for segmentation and address +the challenge of handling areas with significant grayscale variations in remote +sensing segmentation, we propose the SFFNet (Spatial and Frequency Domain +Fusion Network) framework. This framework employs a two-stage network design: +the first stage extracts features using spatial methods to obtain features with +sufficient spatial details and semantic information; the second stage maps +these features in both spatial and frequency domains. In the frequency domain +mapping, we introduce the Wavelet Transform Feature Decomposer (WTFD) +structure, which decomposes features into low-frequency and high-frequency +components using the Haar wavelet transform and integrates them with spatial +features. To bridge the semantic gap between frequency and spatial features, +and facilitate significant feature selection to promote the combination of +features from different representation domains, we design the Multiscale +Dual-Representation Alignment Filter (MDAF). This structure utilizes multiscale +convolutions and dual-cross attentions. Comprehensive experimental results +demonstrate that, compared to existing methods, SFFNet achieves superior +performance in terms of mIoU, reaching 84.80% and 87.73%, respectively. The code +is located at https://github.com/yysdck/SFFNet. + +
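+ The Haar decomposition used by the WTFD can be written in a few lines: a one-level 2D Haar transform splits a feature map into a low-frequency approximation and three high-frequency detail bands. The sketch below is a plain NumPy version for a single-channel map; it illustrates the transform only, not the WTFD module itself.
+ ```python
+ import numpy as np
+ 
+ def haar_dwt2(x):
+     """One-level 2D Haar transform of a (H, W) feature map:
+     returns the low-frequency approximation and three high-frequency details."""
+     a = x[0::2, 0::2]; b = x[0::2, 1::2]
+     c = x[1::2, 0::2]; d = x[1::2, 1::2]
+     ll = (a + b + c + d) / 2.0     # low-frequency component
+     lh = (a - b + c - d) / 2.0     # horizontal detail
+     hl = (a + b - c - d) / 2.0     # vertical detail
+     hh = (a - b - c + d) / 2.0     # diagonal detail
+     return ll, (lh, hl, hh)
+ 
+ feat = np.random.randn(8, 8)
+ ll, (lh, hl, hh) = haar_dwt2(feat)
+ print(ll.shape, lh.shape)          # (4, 4) each
+ ```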
+
+
+
+
+ + ☆ A Sonar-based AUV Positioning System for Underwater Environments with + Low Infrastructure Density ICRA + + +
+ The increasing demand for underwater vehicles highlights the necessity for +robust localization solutions in inspection missions. In this work, we present +a novel real-time sonar-based underwater global positioning algorithm for AUVs +(Autonomous Underwater Vehicles) designed for environments with a sparse +distribution of human-made assets. Our approach exploits two synergistic data +interpretation frontends applied to the same stream of sonar data acquired by a +multibeam Forward-Looking Sonar (FSD). These observations are fused within a +Particle Filter (PF) either to weigh more particles that belong to +high-likelihood regions or to solve symmetric ambiguities. Preliminary +experiments carried out on a simulated environment resembling a real underwater +plant provided promising results. This work represents a starting point towards +future developments of the method and consequent exhaustive evaluations also in +real-world scenarios. + +
+
+ comment: Accepted to the IEEE ICRA Workshop on Field Robotics 2024 +
+
+
+
+
+ + ☆ From Attack to Defense: Insights into Deep Learning Security Measures in + Black-Box Settings + + +
+ Deep Learning (DL) is rapidly maturing to the point that it can be used in +safety- and security-crucial applications. However, adversarial samples, which +are undetectable to the human eye, pose a serious threat that can cause the +model to misbehave and compromise the performance of such applications. +Addressing the robustness of DL models has become crucial to understanding and +defending against adversarial attacks. In this study, we perform comprehensive +experiments to examine the effect of adversarial attacks and defenses on +various model architectures across well-known datasets. Our research focuses on +black-box attacks such as SimBA, HopSkipJump, MGAAttack, and boundary attacks, +as well as preprocessor-based defensive mechanisms, including bits squeezing, +median smoothing, and JPEG filter. Experimenting with various models, our +results demonstrate that the level of noise needed for the attack increases as +the number of layers increases. Moreover, the attack success rate decreases as +the number of layers increases. This indicates that model complexity and +robustness have a significant relationship. Investigating the diversity and +robustness relationship, our experiments with diverse models show that having a +large number of parameters does not imply higher robustness. Our experiments +extend to show the effects of the training dataset on model robustness. Various +datasets, such as ImageNet-1000, CIFAR-100, and CIFAR-10, are used to +evaluate the black-box attacks. Considering the multiple dimensions of our +analysis, e.g., model complexity and training dataset, we examined the behavior +of black-box attacks when models apply defenses. Our results show that applying +defense strategies can significantly reduce attack effectiveness. This research +provides in-depth analysis and insight into the robustness of DL models against +various attacks and defenses. + +
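+ Two of the preprocessor defenses named above, bit (feature) squeezing and median smoothing, are simple enough to sketch directly. The example below applies both to a random stand-in image; the parameter choices (4 bits, 3x3 window) are assumptions for illustration, not the settings used in the study.
+ ```python
+ import numpy as np
+ from scipy.ndimage import median_filter
+ 
+ def bit_squeeze(img, bits=4):
+     """Reduce colour depth to remove small adversarial perturbations."""
+     levels = 2 ** bits - 1
+     return np.round(img * levels) / levels
+ 
+ def median_smooth(img, size=3):
+     """Median smoothing over local spatial neighbourhoods (per channel)."""
+     return median_filter(img, size=(size, size, 1))
+ 
+ img = np.random.rand(32, 32, 3)            # stand-in image in [0, 1]
+ perturbed = np.clip(img + 0.01 * np.sign(np.random.randn(32, 32, 3)), 0, 1)
+ defended = median_smooth(bit_squeeze(perturbed, bits=4), size=3)
+ print(np.abs(defended - img).mean())
+ ```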
+
+
+
+
+ + ☆ An Attention Based Pipeline for Identifying Pre-Cancer Lesions in Head + and Neck Clinical Images + + +
+ Early detection of cancer can help improve patient prognosis through early +intervention. Head and neck cancer is diagnosed in specialist centres after a +surgical biopsy; however, there is a potential for these to be missed, leading +to delayed diagnosis. To overcome these challenges, we present an attention +based pipeline that identifies, segments, and classifies suspected lesions +as non-dysplastic, dysplastic and cancerous. We propose (a) a vision +transformer based Mask R-CNN network for lesion detection and segmentation of +clinical images, and (b) a Multiple Instance Learning (MIL) based scheme for +classification. Current results show that the segmentation model produces +segmentation masks and bounding boxes with up to 82% overlap accuracy on +unseen external test data, surpassing reviewed segmentation benchmarks. +The classification stage achieves an F1-score of 85% on the internal cohort test set. An app +has been developed to perform lesion segmentation on images taken via a smart device. +Future work involves employing endoscopic video data for precise early +detection and prognosis. + +
+
+ comment: 5 pages, 3 figures, accepted in ISBI 2024 +
+
+
+
+
+ + ☆ Impact of Architectural Modifications on Deep Learning Adversarial + Robustness + + +
+ Rapid advancements of deep learning are accelerating adoption in a wide +variety of applications, including safety-critical applications such as +self-driving vehicles, drones, robots, and surveillance systems. These +advancements include applying variations of sophisticated techniques that +improve the performance of models. However, such models are not immune to +adversarial manipulations, which can cause the system to misbehave and remain +unnoticed by experts. The frequency of modifications to existing deep learning +models necessitates thorough analysis to determine the impact on models' +robustness. In this work, we present an experimental evaluation of the effects +of model modifications on deep learning model robustness using adversarial +attacks. Our methodology involves examining the robustness of variations of +models against various adversarial attacks. By conducting our experiments, we +aim to shed light on the critical issue of maintaining the reliability and +safety of deep learning models in safety- and security-critical applications. +Our results indicate the pressing demand for an in-depth assessment of the +effects of model changes on the robustness of models. + +
+
+
+
+
+ + ☆ Auto-Encoding Morph-Tokens for Multimodal LLM ICML 2024 + + +
+ For multimodal LLMs, the synergy of visual comprehension (textual output) and +generation (visual output) presents an ongoing challenge. This is due to a +conflicting objective: for comprehension, an MLLM needs to abstract the +visuals; for generation, it needs to preserve the visuals as much as possible. +Thus, the objective is a dilemma for visual-tokens. To resolve the conflict, we +propose encoding images into morph-tokens to serve a dual purpose: for +comprehension, they act as visual prompts instructing MLLM to generate texts; +for generation, they take on a different, non-conflicting role as complete +visual-tokens for image reconstruction, where the missing visual cues are +recovered by the MLLM. Extensive experiments show that morph-tokens can achieve +a new SOTA for multimodal comprehension and generation simultaneously. Our +project is available at https://github.com/DCDmllm/MorphTokens. + +
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ☆ Lightweight Change Detection in Heterogeneous Remote Sensing Images with + Online All-Integer Pruning Training + + +
+ Detection of changes in heterogeneous remote sensing images is vital, +especially in response to emergencies like earthquakes and floods. Current +homogeneous transformation-based change detection (CD) methods often suffer from +high computation and memory costs, which are not friendly to edge-computation +devices like onboard CD devices on satellites. To address this issue, this +paper proposes a new lightweight CD method for heterogeneous remote sensing +images that employs the online all-integer pruning (OAIP) training strategy to +efficiently fine-tune the CD network using the current test data. The proposed +CD network consists of two visual geometry group (VGG) subnetworks as the +backbone architecture. In the OAIP-based training process, all the weights, +gradients, and intermediate data are quantized to integers to speed up training +and reduce memory usage, where the per-layer block exponentiation scaling +scheme is utilized to reduce the computation errors of network parameters +caused by quantization. In addition, an adaptive filter-level pruning method based +on the L1-norm criterion is employed to further lighten the fine-tuning process +of the CD network. Experimental results show that the proposed OAIP-based +method attains similar detection performance (but with significantly reduced +computation complexity and memory usage) in comparison with state-of-the-art CD +methods. + +
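+ The L1-norm filter-pruning step mentioned above ranks each convolutional filter by the L1 norm of its weights and keeps only the strongest ones. Below is a generic PyTorch sketch of that criterion on a toy layer; the fixed keep ratio is an assumption, and the adaptive part of the paper's method is not modeled.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ def l1_filter_prune(conv, keep_ratio=0.5):
+     """Toy L1-norm filter pruning: keep the conv filters with the largest L1 norm."""
+     with torch.no_grad():
+         norms = conv.weight.abs().sum(dim=(1, 2, 3))        # one score per output filter
+         n_keep = max(1, int(keep_ratio * conv.out_channels))
+         keep = torch.topk(norms, n_keep).indices.sort().values
+         pruned = nn.Conv2d(conv.in_channels, n_keep, conv.kernel_size,
+                            conv.stride, conv.padding, bias=conv.bias is not None)
+         pruned.weight.copy_(conv.weight[keep])
+         if conv.bias is not None:
+             pruned.bias.copy_(conv.bias[keep])
+     return pruned, keep
+ 
+ conv = nn.Conv2d(3, 16, 3, padding=1)
+ pruned, kept = l1_filter_prune(conv, keep_ratio=0.25)
+ print(pruned.weight.shape, kept.tolist())    # 4 filters kept
+ ```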
+
+
+
+
+ + ☆ Enhancing Micro Gesture Recognition for Emotion Understanding via + Context-aware Visual-Text Contrastive Learning + + +
+ Psychological studies have shown that Micro Gestures (MG) are closely linked +to human emotions. MG-based emotion understanding has attracted much attention +because it allows for emotion understanding through nonverbal body gestures +without relying on identity information (e.g., facial and electrocardiogram +data). Therefore, it is essential to recognize MG effectively for advanced +emotion understanding. However, existing Micro Gesture Recognition (MGR) +methods utilize only a single modality (e.g., RGB or skeleton) while +overlooking crucial textual information. In this letter, we propose a simple +but effective visual-text contrastive learning solution that utilizes text +information for MGR. In addition, instead of using handcrafted prompts for +visual-text contrastive learning, we propose a novel module called Adaptive +prompting to generate context-aware prompts. The experimental results show that +the proposed method achieves state-of-the-art performance on two public +datasets. Furthermore, based on an empirical study utilizing the results of MGR +for emotion understanding, we demonstrate that using the textual results of MGR +significantly improves performance by 6%+ compared to directly using video as +input. + +
+
+ comment: accepted by IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Defect Image Sample Generation With Diffusion Prior for Steel Surface + Defect Recognition + + +
+ The task of steel surface defect recognition is an industrial problem with +great industrial value. Data insufficiency is the major challenge in +training a robust defect recognition network. Existing methods have +investigated enlarging the dataset by generating samples with generative +models. However, their generation quality is still limited by the insufficiency +of defect image samples. To this end, we propose Stable Surface Defect +Generation (StableSDG), which transfers the vast generation distribution +embedded in the Stable Diffusion model to steel surface defect image generation. +To tackle the distinctive distribution gap between steel surface images +and generated images of the diffusion model, we propose two processes. First, +we align the distribution by adapting the parameters of the diffusion model, +in both the token embedding space and the network parameter space. Besides, +in the generation process, we propose image-oriented generation rather than +generation from pure Gaussian noise. We conduct extensive experiments on a steel surface +defect dataset, demonstrating state-of-the-art performance on generating +high-quality samples and training recognition models; both designed +processes are significant for this performance. + +
+
+
+
+
+ + ☆ TinySeg: Model Optimizing Framework for Image Segmentation on Tiny + Embedded Systems + + +
+ Image segmentation is one of the major computer vision tasks, which is +applicable in a variety of domains, such as autonomous navigation of an +unmanned aerial vehicle. However, image segmentation cannot easily materialize +on tiny embedded systems because image segmentation models generally have high +peak memory usage due to their architectural characteristics. This work finds +that image segmentation models unnecessarily require large memory space with an +existing tiny machine learning framework. That is, the existing framework +cannot effectively manage the memory space for the image segmentation models. + This work proposes TinySeg, a new model optimizing framework that enables +memory-efficient image segmentation for tiny embedded systems. TinySeg analyzes +the lifetimes of tensors in the target model and identifies long-living +tensors. Then, TinySeg optimizes the memory usage of the target model mainly +with two methods: (i) tensor spilling into local or remote storage and (ii) +fused fetching of spilled tensors. This work implements TinySeg on top of the +existing tiny machine learning framework and demonstrates that TinySeg can +reduce the peak memory usage of an image segmentation model by 39.3% for tiny +embedded systems. + +
+
+ comment: LCTES 2024 +
+
+
+
+
+ + ☆ FER-YOLO-Mamba: Facial Expression Detection and Classification Based on + Selective State Space + + +
+ Facial Expression Recognition (FER) plays a pivotal role in understanding +human emotional cues. However, traditional FER methods based on visual +information have some limitations, such as preprocessing, feature extraction, +and multi-stage classification procedures. These not only increase +computational complexity but also require a significant amount of computing +resources. Considering Convolutional Neural Network (CNN)-based FER schemes +frequently prove inadequate in identifying the deep, long-distance dependencies +embedded within facial expression images, and the Transformer's inherent +quadratic computational complexity, this paper presents the FER-YOLO-Mamba +model, which integrates the principles of Mamba and YOLO technologies to +facilitate efficient coordination in facial expression image recognition and +localization. Within the FER-YOLO-Mamba model, we further devise a FER-YOLO-VSS +dual-branch module, which combines the inherent strengths of convolutional +layers in local feature extraction with the exceptional capability of State +Space Models (SSMs) in revealing long-distance dependencies. To the best of our +knowledge, this is the first Vision Mamba model designed for facial expression +detection and classification. To evaluate the performance of the proposed +FER-YOLO-Mamba model, we conducted experiments on two benchmark datasets, +RAF-DB and SFEW. The experimental results indicate that the FER-YOLO-Mamba +model achieved better results compared to other models. The code is available +from https://github.com/SwjtuMa/FER-YOLO-Mamba. + +
+
+
+
+
+ + ☆ Improving Concept Alignment in Vision-Language Concept Bottleneck Models + + +
+ Concept Bottleneck Models (CBM) map the input image to a high-level +human-understandable concept space and then make class predictions based on +these concepts. Recent approaches automate the construction of CBM by prompting +Large Language Models (LLM) to generate text concepts and then use Vision +Language Models (VLM) to obtain concept scores to train a CBM. However, it is +desired to build CBMs with concepts defined by human experts instead of LLM-generated +concepts to make them more trustworthy. In this work, we take a +closer look at the faithfulness of VLM concept scores for such +expert-defined concepts in domains like fine-grained bird species classification +and animal classification. Our investigations reveal that frozen VLMs, like +CLIP, struggle to correctly associate a concept with the corresponding visual +input despite achieving a high classification performance. To address this, we +propose a novel Contrastive Semi-Supervised (CSS) learning method which uses a +few labeled concept examples to improve concept alignment (activate truthful +visual concepts) in the CLIP model. Extensive experiments on three benchmark +datasets show that our approach substantially increases the concept accuracy +and classification accuracy, yet requires only a fraction of the +human-annotated concept labels. To further improve the classification +performance, we also introduce a new class-level intervention procedure for +fine-grained classification problems that identifies the confounding classes and +intervenes in their concept space to reduce errors. + +
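+ The concept scores discussed above are, at their core, similarities between an image embedding and the text embeddings of expert-defined concepts. The sketch below shows that scoring step with random placeholder embeddings standing in for a VLM such as CLIP; the concept names and dimensions are invented, and the paper's CSS training is not reproduced.
+ ```python
+ import numpy as np
+ 
+ def concept_scores(image_emb, concept_embs):
+     """Concept-bottleneck style scoring: cosine similarity between an image
+     embedding and each expert-defined concept's text embedding."""
+     img = image_emb / np.linalg.norm(image_emb)
+     con = concept_embs / np.linalg.norm(concept_embs, axis=1, keepdims=True)
+     return con @ img                      # one score per concept
+ 
+ # Placeholder embeddings standing in for a VLM such as CLIP.
+ rng = np.random.default_rng(0)
+ concepts = ["red wing patch", "hooked beak", "webbed feet"]
+ concept_embs = rng.normal(size=(len(concepts), 128))
+ image_emb = concept_embs[0] + 0.3 * rng.normal(size=128)   # image showing the first concept
+ scores = concept_scores(image_emb, concept_embs)
+ print(dict(zip(concepts, np.round(scores, 2))))            # a class head would consume these
+ ```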
+
+
+
+
+ + ☆ Report on the AAPM Grand Challenge on deep generative modeling for + learning medical image statistics + + +
+ The findings of the 2023 AAPM Grand Challenge on Deep Generative Modeling for +Learning Medical Image Statistics are reported in this Special Report. The goal +of this challenge was to promote the development of deep generative models +(DGMs) for medical imaging and to emphasize the need for their domain-relevant +assessment via the analysis of relevant image statistics. As part of this Grand +Challenge, a training dataset was developed based on 3D anthropomorphic breast +phantoms from the VICTRE virtual imaging toolbox. A two-stage evaluation +procedure was developed, consisting of a preliminary check for memorization and image quality +(based on the Fréchet Inception Distance, FID), and a second stage evaluating +the reproducibility of image statistics corresponding to domain-relevant +radiomic features. A summary measure was employed to rank the +submissions. Additional analyses of submissions were performed to assess DGM +performance specific to individual feature families, and to identify various +artifacts. 58 submissions from 12 unique users were received for this +Challenge. The top-ranked submission employed a conditional latent diffusion +model, whereas the joint runners-up employed a generative adversarial network, +followed by another network for image superresolution. We observed that the +overall ranking of the top 9 submissions according to our evaluation method (i) +did not match the FID-based ranking, and (ii) differed with respect to +individual feature families. Another important finding from our additional +analyses was that different DGMs demonstrated similar kinds of artifacts. This +Grand Challenge highlighted the need for domain-specific evaluation to further +DGM design as well as deployment. It also demonstrated that the specification +of a DGM may differ depending on its intended use. + +
+
+
+
+
+ + ☆ Real Risks of Fake Data: Synthetic Data, Diversity-Washing and Consent + Circumvention + + +
+ Machine learning systems require representations of the real world for +training and testing - they require data, and lots of it. Collecting data at +scale has logistical and ethical challenges, and synthetic data promises a +solution to these challenges. Instead of needing to collect photos of real +people's faces to train a facial recognition system, a model creator could +create and use photo-realistic, synthetic faces. The comparative ease of +generating this synthetic data rather than relying on collecting data has made +it a common practice. We present two key risks of using synthetic data in model +development. First, we detail the high risk of false confidence when using +synthetic data to increase dataset diversity and representation. We base this +on the examination of a real-world use case of synthetic data, where synthetic +datasets were generated for an evaluation of facial recognition technology. +Second, we examine how using synthetic data risks circumventing consent for +data usage. We illustrate this by considering the importance of consent to the +U.S. Federal Trade Commission's regulation of data collection and affected +models. Finally, we discuss how these two risks exemplify how synthetic data +complicates existing governance and ethical practice; by decoupling data from +those it impacts, synthetic data is prone to consolidating power away from those +most impacted by algorithmically-mediated harm. + +
+
+
+
+
+ + ☆ SR4ZCT: Self-supervised Through-plane Resolution Enhancement for CT + Images with Arbitrary Resolution and Overlap + + +
+ Computed tomography (CT) is a widely used non-invasive medical imaging +technique for disease diagnosis. The diagnostic accuracy is often affected by +image resolution, which can be insufficient in practice. For medical CT images, +the through-plane resolution is often worse than the in-plane resolution and +there can be overlap between slices, causing difficulties in diagnoses. +Self-supervised methods for through-plane resolution enhancement, which train +on in-plane images and infer on through-plane images, have shown promise for +both CT and MRI imaging. However, existing self-supervised methods either +neglect overlap or can only handle specific cases with fixed combinations of +resolution and overlap. To address these limitations, we propose a +self-supervised method called SR4ZCT. It employs the same off-axis training +approach while being capable of handling arbitrary combinations of resolution +and overlap. Our method explicitly models the relationship between resolutions +and voxel spacings of different planes to accurately simulate training images +that match the original through-plane images. We highlight the significance of +accurate modeling in self-supervised off-axis training and demonstrate the +effectiveness of SR4ZCT using a real-world dataset. + +
+
+ comment: MLMI2023 +
+
+
+
+
+ + ☆ Spatio-Temporal SwinMAE: A Swin Transformer based Multiscale + Representation Learner for Temporal Satellite Imagery + + +
+ Currently, the foundation models represented by large language models have +made dramatic progress and are used in a very wide range of domains including +2D and 3D vision. As one of the important application domains of foundation +models, earth observation has attracted attention and various approaches have +been developed. When considering earth observation as a single image capture, +earth observation imagery can be processed as an image with three or more +channels, and when it comes with multiple image captures of different +timestamps at one location, the temporal observation can be considered as a set +of continuous images resembling video frames or medical scan slices. This paper +presents Spatio-Temporal SwinMAE (ST-SwinMAE), an architecture which +particularly focuses on representation learning for spatio-temporal image +processing. Specifically, it uses a hierarchical Masked Auto-encoder (MAE) with +Video Swin Transformer blocks. With this architecture, we present a pretrained +model named Degas 100M as a geospatial foundation model. Also, we propose an +approach for transfer learning with Degas 100M, in which both the pretrained encoder +and decoder of the MAE are utilized, with skip connections added between them to +achieve multi-scale information communication, forming an architecture named +Spatio-Temporal SwinUNet (ST-SwinUNet). Our approach shows significant +performance improvements over existing state-of-the-art foundation +models. Specifically, for transfer learning of the land cover downstream task +on the PhilEO Bench dataset, it shows 10.4% higher accuracy on average compared with +other geospatial foundation models. + +
+
+
+
+
+ + ☆ Implicit Neural Representations for Robust Joint Sparse-View CT + Reconstruction + + +
+ Computed Tomography (CT) is pivotal in industrial quality control and medical +diagnostics. Sparse-view CT, offering reduced ionizing radiation, faces +challenges due to its under-sampled nature, leading to ill-posed reconstruction +problems. Recent advancements in Implicit Neural Representations (INRs) have +shown promise in addressing sparse-view CT reconstruction. Recognizing that CT +often involves scanning similar subjects, we propose a novel approach to +improve reconstruction quality through joint reconstruction of multiple objects +using INRs. This approach can potentially leverage both the strengths of INRs +and the statistical regularities across multiple objects. While current INR +joint reconstruction techniques primarily focus on accelerating convergence via +meta-initialization, they are not specifically tailored to enhance +reconstruction quality. To address this gap, we introduce a novel INR-based +Bayesian framework integrating latent variables to capture the inter-object +relationships. These variables serve as a dynamic reference throughout the +optimization, thereby enhancing individual reconstruction fidelity. Our +extensive experiments, which assess various key factors such as reconstruction +quality, resistance to overfitting, and generalizability, demonstrate +significant improvements over baselines in common numerical metrics. This +underscores a notable advancement in CT reconstruction methods. + +
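+ One simple way to share information across objects in INR-based reconstruction is a single coordinate MLP conditioned on a learnable per-object latent code. The PyTorch sketch below shows only that conditioning pattern; the network sizes, 2D coordinates, and latent dimension are our assumptions, and the paper's Bayesian treatment of the latent variables is not modeled.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class JointINR(nn.Module):
+     """Toy joint implicit neural representation: one shared coordinate MLP,
+     plus a learnable latent code per object that conditions the prediction."""
+     def __init__(self, n_objects, latent_dim=16, hidden=64):
+         super().__init__()
+         self.codes = nn.Embedding(n_objects, latent_dim)
+         self.mlp = nn.Sequential(
+             nn.Linear(2 + latent_dim, hidden), nn.ReLU(),
+             nn.Linear(hidden, hidden), nn.ReLU(),
+             nn.Linear(hidden, 1),                       # predicted attenuation value
+         )
+ 
+     def forward(self, coords, obj_idx):
+         z = self.codes(obj_idx).expand(coords.shape[0], -1)
+         return self.mlp(torch.cat([coords, z], dim=-1))
+ 
+ model = JointINR(n_objects=3)
+ coords = torch.rand(1024, 2) * 2 - 1                    # (x, y) in [-1, 1]
+ values = model(coords, torch.tensor(1))                 # reconstruct object #1
+ print(values.shape)                                     # torch.Size([1024, 1])
+ ```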
+
+
+
+
+ + ☆ Rasterized Edge Gradients: Handling Discontinuities Differentiably + + +
+ Computing the gradients of a rendering process is paramount for diverse +applications in computer vision and graphics. However, accurate computation of +these gradients is challenging due to discontinuities and rendering +approximations, particularly for surface-based representations and +rasterization-based rendering. We present a novel method for computing +gradients at visibility discontinuities for rasterization-based differentiable +renderers. Our method elegantly simplifies the traditionally complex problem +through a carefully designed approximation strategy, allowing for a +straightforward, effective, and performant solution. We introduce a novel +concept of micro-edges, which allows us to treat the rasterized images as +outcomes of a differentiable, continuous process aligned with the inherently +non-differentiable, discrete-pixel rasterization. This technique eliminates the +necessity for rendering approximations or other modifications to the forward +pass, preserving the integrity of the rendered image, which makes it applicable +to rasterized masks, depth, and normals images where filtering is prohibitive. +Utilizing micro-edges simplifies gradient interpretation at discontinuities and +enables handling of geometry intersections, offering an advantage over the +prior art. We showcase our method in dynamic human head scene reconstruction, +demonstrating effective handling of camera images and segmentation masks. + +
+
+
+
+
+ + ☆ Functional Imaging Constrained Diffusion for Brain PET Synthesis from + Structural MRI + + +
+ Magnetic resonance imaging (MRI) and positron emission tomography (PET) are +increasingly used in multimodal analysis of neurodegenerative disorders. While +MRI is broadly utilized in clinical settings, PET is less accessible. Many +studies have attempted to use deep generative models to synthesize PET from MRI +scans. However, they often suffer from unstable training and inadequately +preserve brain functional information conveyed by PET. To this end, we propose +a functional imaging constrained diffusion (FICD) framework for 3D brain PET +image synthesis with paired structural MRI as input condition, through a new +constrained diffusion model (CDM). The FICD introduces noise to PET and then +progressively removes it with CDM, ensuring high output fidelity throughout a +stable training phase. The CDM learns to predict denoised PET with a functional +imaging constraint introduced to ensure voxel-wise alignment between each +denoised PET and its ground truth. Quantitative and qualitative analyses +conducted on 293 subjects with paired T1-weighted MRI and +18F-fluorodeoxyglucose (FDG)-PET scans suggest that FICD achieves superior +performance in generating FDG-PET data compared to state-of-the-art methods. We +further validate the effectiveness of the proposed FICD on data from a total of +1,262 subjects through three downstream tasks, with experimental results +suggesting its utility and generalizability. + +
+
+
+
+
+ + ☆ Prediction techniques for dynamic imaging with online primal-dual + methods + + +
+ Online optimisation facilitates the solution of dynamic inverse problems, +such as image stabilisation, fluid flow monitoring, and dynamic medical +imaging. In this paper, we improve upon previous work on predictive online +primal-dual methods on two fronts. Firstly, we provide a more concise analysis +that symmetrises previously unsymmetric regret bounds, and relaxes previous +restrictive conditions on the dual predictor. Secondly, based on the latter, we +develop several improved dual predictors. We numerically demonstrate their +efficacy in image stabilisation and dynamic positron emission tomography. + +
+
+
+
+
+ + ☆ Rip-NeRF: Anti-aliasing Radiance Fields with Ripmap-Encoded Platonic + Solids SIGGRAPH 2024 + + +
+ Despite significant advancements in Neural Radiance Fields (NeRFs), the
+renderings may still suffer from aliasing and blurring artifacts, since it
+remains a fundamental challenge to effectively and efficiently characterize
+anisotropic areas induced by the cone-casting procedure. This paper introduces
+a Ripmap-Encoded Platonic Solid representation to precisely and efficiently
+featurize 3D anisotropic areas, achieving high-fidelity anti-aliasing
+renderings. Central to our approach are two key components: Platonic Solid
+Projection and Ripmap encoding. The Platonic Solid Projection factorizes the 3D
+space onto the unparalleled faces of a certain Platonic solid, such that the
+anisotropic 3D areas can be projected onto planes with distinguishable
+characterization. Meanwhile, each face of the Platonic solid is encoded by the
+Ripmap encoding, which is constructed by anisotropically pre-filtering a
+learnable feature grid, to enable featurizing the projected anisotropic areas
+both precisely and efficiently by the anisotropic area-sampling. Extensive
+experiments on both well-established synthetic datasets and a newly captured
+real-world dataset demonstrate that our Rip-NeRF attains state-of-the-art
+rendering quality, particularly excelling in the fine details of repetitive
+structures and textures, while maintaining relatively swift training times.
+
+
+ comment: SIGGRAPH 2024, Project page: https://junchenliu77.github.io/Rip-NeRF + , Code: https://github.com/JunchenLiu77/Rip-NeRF +
+
+
+
+
+ + ☆ A Fresh Look at Sanity Checks for Saliency Maps + + +
+ The Model Parameter Randomisation Test (MPRT) is highly recognised in the +eXplainable Artificial Intelligence (XAI) community due to its fundamental +evaluative criterion: explanations should be sensitive to the parameters of the +model they seek to explain. However, recent studies have raised several +methodological concerns for the empirical interpretation of MPRT. In response, +we propose two modifications to the original test: Smooth MPRT and Efficient +MPRT. The former reduces the impact of noise on evaluation outcomes via +sampling, while the latter avoids the need for biased similarity measurements +by re-interpreting the test through the increase in explanation complexity +after full model randomisation. Our experiments show that these modifications +enhance the metric reliability, facilitating a more trustworthy deployment of +explanation methods. + +
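+ A hedged sketch of the Efficient MPRT idea above: compare an
+explanation-complexity proxy before and after fully randomising the model
+parameters; a parameter-sensitive explanation method should show a clear
+increase. The entropy-of-saliency proxy and the toy model below are
+assumptions, not the paper's exact measure.
+```python
+import numpy as np
+import torch
+import torch.nn as nn
+
+def saliency_entropy(model: nn.Module, x: torch.Tensor, target: int) -> float:
+    """Shannon entropy of the normalised absolute input gradient, used here as
+    a simple complexity proxy (the actual Efficient MPRT measure may differ)."""
+    x = x.clone().requires_grad_(True)
+    model(x)[0, target].backward()
+    s = x.grad.abs().flatten()
+    p = (s / s.sum()).detach().numpy() + 1e-12
+    return float(-(p * np.log(p)).sum())
+
+def randomise_parameters(model: nn.Module) -> nn.Module:
+    for param in model.parameters():
+        nn.init.normal_(param, std=0.02)  # full model randomisation
+    return model
+
+model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
+x = torch.randn(1, 3, 32, 32)
+before = saliency_entropy(model, x, target=0)
+after = saliency_entropy(randomise_parameters(model), x, target=0)
+print(f"complexity before: {before:.3f}, after randomisation: {after:.3f}")
+```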
+
+ comment: arXiv admin note: text overlap with arXiv:2401.06465 +
+
+
+
+
+ + ☆ Enhancing Social Media Post Popularity Prediction with Visual Content + + +
+ Our study presents a framework for predicting image-based social media +content popularity that focuses on addressing complex image information and a +hierarchical data structure. We utilize the Google Cloud Vision API to +effectively extract key image and color information from users' postings, +achieving 6.8\% higher accuracy compared to using non-image covariates alone. +For prediction, we explore a wide range of prediction models, including Linear +Mixed Model, Support Vector Regression, Multi-layer Perceptron, Random Forest, +and XGBoost, with linear regression as the benchmark. Our comparative study +demonstrates that models that are capable of capturing the underlying nonlinear +interactions between covariates outperform other methods. + +
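+ A toy sketch of the comparative study described above, using scikit-learn
+stand-ins: the synthetic features replace the Google Cloud Vision API outputs,
+XGBoost is omitted, and nonlinear models are compared against a linear
+benchmark with cross-validated R^2.
+```python
+import numpy as np
+from sklearn.model_selection import cross_val_score
+from sklearn.linear_model import LinearRegression
+from sklearn.svm import SVR
+from sklearn.neural_network import MLPRegressor
+from sklearn.ensemble import RandomForestRegressor
+
+# Synthetic stand-in for image/color features plus non-image covariates.
+rng = np.random.default_rng(0)
+X = rng.normal(size=(300, 8))
+y = X[:, 0] * X[:, 1] + 0.5 * X[:, 2] ** 2 + rng.normal(scale=0.1, size=300)
+
+models = {
+    "linear (benchmark)": LinearRegression(),
+    "SVR": SVR(),
+    "MLP": MLPRegressor(hidden_layer_sizes=(64,), max_iter=2000, random_state=0),
+    "random forest": RandomForestRegressor(n_estimators=200, random_state=0),
+}
+for name, model in models.items():
+    score = cross_val_score(model, X, y, cv=5, scoring="r2").mean()
+    print(f"{name:>20}: R^2 = {score:.3f}")
+```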
+
+
+
+
+ + ☆ LLM as Dataset Analyst: Subpopulation Structure Discovery with Large + Language Model + + +
+ The distribution of subpopulations is an important property hidden within a
+dataset. Uncovering and analyzing the subpopulation distribution within
+datasets provides a comprehensive understanding of the datasets, standing as a
+powerful tool beneficial to various downstream tasks, including Dataset
+Subpopulation Organization, Subpopulation Shift, and Slice Discovery. Despite
+its importance, to our knowledge no prior work has systematically explored the
+subpopulation distribution of datasets. To address this limitation and solve
+all the mentioned tasks in a unified way, we introduce a novel concept of
+subpopulation structures to represent, analyze, and utilize subpopulation
+distributions within datasets. To characterize the structures in an
+interpretable manner, we propose the Subpopulation Structure Discovery with
+Large Language Models (SSD-LLM) framework, which employs the world knowledge
+and instruction-following capabilities of Large Language Models (LLMs) to
+linguistically analyze informative image captions and summarize the structures.
+Furthermore, we propose complete workflows to address downstream tasks, named
+Task-specific Tuning, showcasing the application of the discovered structure to
+a spectrum of subpopulation-related tasks, including dataset subpopulation
+organization, subpopulation shift, and slice discovery.
+
+
+
+
+
+ + ♻ ☆ TULIP: Transformer for Upsampling of LiDAR Point Clouds CVPR2024 + + +
+ LiDAR Upsampling is a challenging task for the perception systems of robots +and autonomous vehicles, due to the sparse and irregular structure of +large-scale scene contexts. Recent works propose to solve this problem by +converting LiDAR data from 3D Euclidean space into an image super-resolution +problem in 2D image space. Although their methods can generate high-resolution +range images with fine-grained details, the resulting 3D point clouds often +blur out details and predict invalid points. In this paper, we propose TULIP, a +new method to reconstruct high-resolution LiDAR point clouds from +low-resolution LiDAR input. We also follow a range image-based approach but +specifically modify the patch and window geometries of a Swin-Transformer-based +network to better fit the characteristics of range images. We conducted several +experiments on three public real-world and simulated datasets. TULIP +outperforms state-of-the-art methods in all relevant metrics and generates +robust and more realistic point clouds than prior works. + +
+
+ comment: The paper was accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ SATO: Stable Text-to-Motion Framework + + +
+ Is the text-to-motion model robust? Recent advancements in text-to-motion
+models primarily stem from more accurate predictions of specific actions.
+However, the text modality typically relies solely on pre-trained Contrastive
+Language-Image Pretraining (CLIP) models. Our research has uncovered a
+significant issue with the text-to-motion model: its predictions often exhibit
+inconsistent outputs, resulting in vastly different or even incorrect poses
+when presented with semantically similar or identical text inputs. In this
+paper, we undertake an analysis to elucidate the underlying causes of this
+instability, establishing a clear link between the unpredictability of model
+outputs and the erratic attention patterns of the text encoder module.
+Consequently, we introduce a formal framework aimed at addressing this issue,
+which we term the Stable Text-to-Motion Framework (SATO). SATO consists of
+three modules, dedicated to stable attention, stable prediction, and balancing
+the trade-off between accuracy and robustness, respectively. We present a
+methodology for constructing a SATO that satisfies the stability of both
+attention and prediction. To verify the stability of the model, we introduce a
+new textual synonym perturbation dataset based on HumanML3D and KIT-ML. Results
+show that SATO is significantly more stable against synonyms and other slight
+perturbations while maintaining its high accuracy.
+
+
+
+
+
+ + ♻ ☆ GReAT: A Graph Regularized Adversarial Training Method + + +
+ This paper presents GReAT (Graph Regularized Adversarial Training), a novel +regularization method designed to enhance the robust classification performance +of deep learning models. Adversarial examples, characterized by subtle +perturbations that can mislead models, pose a significant challenge in machine +learning. Although adversarial training is effective in defending against such +attacks, it often overlooks the underlying data structure. In response, GReAT +integrates graph based regularization into the adversarial training process, +leveraging the data's inherent structure to enhance model robustness. By +incorporating graph information during training, GReAT defends against +adversarial attacks and improves generalization to unseen data. Extensive +evaluations on benchmark datasets demonstrate that GReAT outperforms state of +the art methods in robustness, achieving notable improvements in classification +accuracy. Specifically, compared to the second best methods, GReAT achieves a +performance increase of approximately 4.87% for CIFAR10 against FGSM attack and +10.57% for SVHN against FGSM attack. Additionally, for CIFAR10, GReAT +demonstrates a performance increase of approximately 11.05% against PGD attack, +and for SVHN, a 5.54% increase against PGD attack. This paper provides detailed +insights into the proposed methodology, including numerical results and +comparisons with existing approaches, highlighting the significant impact of +GReAT in advancing the performance of deep learning models. + +
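+ A hedged PyTorch sketch of combining adversarial training with a graph-based
+regularizer in the spirit of GReAT; the FGSM attack, the k-NN graph
+construction, and the weighting `lam` are generic assumptions rather than the
+paper's exact formulation.
+```python
+import torch
+import torch.nn.functional as F
+
+def fgsm(model, x, y, eps=8 / 255):
+    """One-step FGSM adversarial example in [0, 1] image space."""
+    x_adv = x.clone().detach().requires_grad_(True)
+    loss = F.cross_entropy(model(x_adv), y)
+    grad = torch.autograd.grad(loss, x_adv)[0]
+    return (x_adv + eps * grad.sign()).clamp(0, 1).detach()
+
+def graph_regularizer(embeddings, k=5):
+    """Penalize embedding distances between k-nearest-neighbour pairs
+    (a generic graph-smoothness term; the paper's construction may differ)."""
+    d = torch.cdist(embeddings, embeddings)            # pairwise distances
+    knn = d.topk(k + 1, largest=False).indices[:, 1:]  # drop self-neighbour
+    neighbours = embeddings[knn]                       # (N, k, D)
+    return ((embeddings.unsqueeze(1) - neighbours) ** 2).sum(-1).mean()
+
+def great_style_step(model, embed, x, y, optimizer, lam=0.1):
+    """One training step: adversarial classification loss + graph regularizer."""
+    x_adv = fgsm(model, x, y)
+    loss = F.cross_entropy(model(x_adv), y) + lam * graph_regularizer(embed(x))
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+```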
+
+ comment: 25 pages including references. 7 figures and 6 tables +
+
+
+
+
+ + ♻ ☆ A separability-based approach to quantifying generalization: which layer + is best? + + +
+ Generalization to unseen data remains poorly understood for deep learning +classification and foundation models. How can one assess the ability of +networks to adapt to new or extended versions of their input space in the +spirit of few-shot learning, out-of-distribution generalization, and domain +adaptation? Which layers of a network are likely to generalize best? We provide +a new method for evaluating the capacity of networks to represent a sampled +domain, regardless of whether the network has been trained on all classes in +the domain. Our approach is the following: after fine-tuning state-of-the-art +pre-trained models for visual classification on a particular domain, we assess +their performance on data from related but distinct variations in that domain. +Generalization power is quantified as a function of the latent embeddings of +unseen data from intermediate layers for both unsupervised and supervised +settings. Working throughout all stages of the network, we find that (i) high +classification accuracy does not imply high generalizability; and (ii) deeper +layers in a model do not always generalize the best, which has implications for +pruning. Since the trends observed across datasets are largely consistent, we +conclude that our approach reveals (a function of) the intrinsic capacity of +the different layers of a model to generalize. + +
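+ A small sketch of scoring layer-wise separability of embeddings, using
+silhouette score as one possible proxy (the paper's measure may differ); the
+layer names and toy features below are illustrative, not from the study.
+```python
+import numpy as np
+from sklearn.metrics import silhouette_score
+
+def layer_separability(layer_embeddings: dict, labels: np.ndarray) -> dict:
+    """Score how well (possibly unseen-class) embeddings separate per layer."""
+    return {name: silhouette_score(feats, labels)
+            for name, feats in layer_embeddings.items()}
+
+# Toy example: two hypothetical layers with 2-D features for 3 classes.
+rng = np.random.default_rng(0)
+labels = np.repeat([0, 1, 2], 50)
+layers = {
+    "block3": rng.normal(size=(150, 2)) + labels[:, None],      # partially separated
+    "block4": rng.normal(size=(150, 2)) + 4 * labels[:, None],  # well separated
+}
+print(layer_separability(layers, labels))
+```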
+
+ comment: 6 pages, 6 figures
+
+
+
+
+
+ + ♻ ☆ Improving Interpretation Faithfulness for Vision Transformers ICML 2024 + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art performance for +various vision tasks. One reason behind the success lies in their ability to +provide plausible innate explanations for the behavior of neural architectures. +However, ViTs suffer from issues with explanation faithfulness, as their focal +points are fragile to adversarial attacks and can be easily changed with even +slight perturbations on the input image. In this paper, we propose a rigorous +approach to mitigate these issues by introducing Faithful ViTs (FViTs). Briefly +speaking, an FViT should have the following two properties: (1) The top-$k$ +indices of its self-attention vector should remain mostly unchanged under input +perturbation, indicating stable explanations; (2) The prediction distribution +should be robust to perturbations. To achieve this, we propose a new method +called Denoised Diffusion Smoothing (DDS), which adopts randomized smoothing +and diffusion-based denoising. We theoretically prove that processing ViTs +directly with DDS can turn them into FViTs. We also show that Gaussian noise is +nearly optimal for both $\ell_2$ and $\ell_\infty$-norm cases. Finally, we +demonstrate the effectiveness of our approach through comprehensive experiments +and evaluations. Results show that FViTs are more robust against adversarial +attacks while maintaining the explainability of attention, indicating higher +faithfulness. + +
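+ A minimal sketch of the smoothing idea behind DDS, assuming generic
+`classifier` and `denoiser` callables: predictions are majority-voted over
+Gaussian-perturbed, denoised copies of the input. The exact DDS procedure
+couples the noise level to a diffusion time-step, which is not modelled here.
+```python
+import torch
+
+@torch.no_grad()
+def dds_style_predict(classifier, denoiser, x, sigma=0.25, n_samples=32):
+    """Smoothed prediction: add Gaussian noise, denoise, classify, majority-vote."""
+    votes = torch.zeros(classifier(x).shape[-1])
+    for _ in range(n_samples):
+        noisy = x + sigma * torch.randn_like(x)
+        pred = classifier(denoiser(noisy)).argmax().item()
+        votes[pred] += 1
+    return votes.argmax().item()
+```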
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ Automated National Urban Map Extraction + + +
+ Developing countries usually lack the proper governance means to generate and +regularly update a national rooftop map. Using traditional photogrammetry and +surveying methods to produce a building map at the federal level is costly and +time consuming. Using earth observation and deep learning methods, we can +bridge this gap and propose an automated pipeline to fetch such national urban +maps. This paper aims to exploit the power of fully convolutional neural +networks for multi-class buildings' instance segmentation to leverage high +object-wise accuracy results. Buildings' instance segmentation from sub-meter +high-resolution satellite images can be achieved with relatively high +pixel-wise metric scores. We detail all engineering steps to replicate this +work and ensure highly accurate results in dense and slum areas witnessed in +regions that lack proper urban planning in the Global South. We applied a case +study of the proposed pipeline to Lebanon and successfully produced the first +comprehensive national building footprint map with approximately 1 Million +units with an 84% accuracy. The proposed architecture relies on advanced +augmentation techniques to overcome dataset scarcity, which is often the case +in developing countries. + +
+
+
+
+
+ + ♻ ☆ A Simple Interpretable Transformer for Fine-Grained Image Classification + and Analysis ICLR 2024 + + +
+ We present a novel usage of Transformers to make image classification +interpretable. Unlike mainstream classifiers that wait until the last fully +connected layer to incorporate class information to make predictions, we +investigate a proactive approach, asking each class to search for itself in an +image. We realize this idea via a Transformer encoder-decoder inspired by +DEtection TRansformer (DETR). We learn "class-specific" queries (one for each +class) as input to the decoder, enabling each class to localize its patterns in +an image via cross-attention. We name our approach INterpretable TRansformer +(INTR), which is fairly easy to implement and exhibits several compelling +properties. We show that INTR intrinsically encourages each class to attend +distinctively; the cross-attention weights thus provide a faithful +interpretation of the prediction. Interestingly, via "multi-head" +cross-attention, INTR could identify different "attributes" of a class, making +it particularly suitable for fine-grained classification and analysis, which we +demonstrate on eight datasets. Our code and pre-trained models are publicly +accessible at the Imageomics Institute GitHub site: +https://github.com/Imageomics/INTR. + +
+
+ comment: Accepted to International Conference on Learning Representations 2024 + (ICLR 2024) +
+
+
+
+
+ + ♻ ☆ Visual Environment Assessment for Safe Autonomous Quadrotor Landing + + +
+ Autonomous identification and evaluation of safe landing zones are of +paramount importance for ensuring the safety and effectiveness of aerial robots +in the event of system failures, low battery, or the successful completion of +specific tasks. In this paper, we present a novel approach for detection and +assessment of potential landing sites for safe quadrotor landing. Our solution +efficiently integrates 2D and 3D environmental information, eliminating the +need for external aids such as GPS and computationally intensive elevation +maps. The proposed pipeline combines semantic data derived from a Neural +Network (NN), to extract environmental features, with geometric data obtained +from a disparity map, to extract critical geometric attributes such as slope, +flatness, and roughness. We define several cost metrics based on these +attributes to evaluate safety, stability, and suitability of regions in the +environments and identify the most suitable landing area. Our approach runs in +real-time on quadrotors equipped with limited computational capabilities. +Experimental results conducted in diverse environments demonstrate that the +proposed method can effectively assess and identify suitable landing areas, +enabling the safe and autonomous landing of a quadrotor. + +
+
+ comment: 7 pages, 5 figures, 1 table, 2024 International Conference on + Unmanned Aircraft Systems (ICUAS) +
+
+
+
+
+ + ♻ ☆ Zero-shot generalization across architectures for visual classification ICLR 2024 + + +
+ Generalization to unseen data is a key desideratum for deep networks, but its +relation to classification accuracy is unclear. Using a minimalist vision +dataset and a measure of generalizability, we show that popular networks, from +deep convolutional networks (CNNs) to transformers, vary in their power to +extrapolate to unseen classes both across layers and across architectures. +Accuracy is not a good predictor of generalizability, and generalization varies +non-monotonically with layer depth. + +
+
+ comment: Accepted as a Tiny Paper at ICLR 2024. Code available at + https://github.com/dyballa/generalization/tree/ICLR2024TinyPaper +
+
+
+
+
+ + ♻ ☆ Visual Enumeration is Challenging for Large-scale Generative AI + + +
+ Humans can readily judge the number of objects in a visual scene, even +without counting, and such a skill has been documented in many animal species +and babies prior to language development and formal schooling. Numerical +judgments are error-free for small sets, while for larger collections responses +become approximate, with variability increasing proportionally to the target +number. This response pattern is observed for items of all kinds, despite +variation in object features (such as color or shape), suggesting that our +visual number sense relies on abstract representations of numerosity. Here, we +investigate whether large-scale generative Artificial Intelligence (AI) systems +have a human-like number sense, which should allow them to reliably name the +number of objects in simple visual stimuli or generate images containing a +target number of items in the 1-10 range. Surprisingly, most of the foundation +models considered have a poor number sense: They make striking errors even with +small numbers, the response variability does not increase in a systematic way, +and the pattern of errors depends on object category. Only the most recent +proprietary systems exhibit signatures of a visual number sense. Our findings +demonstrate that having an intuitive visual understanding of number remains +challenging for foundation models, which in turn might be detrimental to the +perceptual grounding of numeracy that in humans is crucial for mathematical +learning. + +
+
+
+
+
+ + ♻ ☆ Convex Combination Consistency between Neighbors for Weakly-supervised + Action Localization ICME2023 + + +
+ Weakly-supervised temporal action localization (WTAL) intends to detect +action instances with only weak supervision, e.g., video-level labels. The +current~\textit{de facto} pipeline locates action instances by thresholding and +grouping continuous high-score regions on temporal class activation sequences. +In this route, the capacity of the model to recognize the relationships between +adjacent snippets is of vital importance which determines the quality of the +action boundaries. However, it is error-prone since the variations between +adjacent snippets are typically subtle, and unfortunately this is overlooked in +the literature. To tackle the issue, we propose a novel WTAL approach named +Convex Combination Consistency between Neighbors (C$^3$BN). C$^3$BN consists of +two key ingredients: a micro data augmentation strategy that increases the +diversity in-between adjacent snippets by convex combination of adjacent +snippets, and a macro-micro consistency regularization that enforces the model +to be invariant to the transformations~\textit{w.r.t.} video semantics, snippet +predictions, and snippet representations. Consequently, fine-grained patterns +in-between adjacent snippets are enforced to be explored, thereby resulting in +a more robust action boundary localization. Experimental results demonstrate +the effectiveness of C$^3$BN on top of various baselines for WTAL with +video-level and point-level supervisions. Code is at +https://github.com/Qinying-Liu/C3BN. + +
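+ A hedged sketch of the micro augmentation and consistency term described
+above: adjacent snippet features are convexly combined, and the prediction on
+the mixture is regularized towards the interpolated predictions. Shapes and
+the KL form are assumptions, not the exact C$^3$BN losses.
+```python
+import torch
+import torch.nn.functional as F
+
+def convex_neighbor_consistency(model, snippets, alpha=None):
+    """snippets: (T, D) temporal snippet features; model: snippet-level classifier."""
+    if alpha is None:
+        alpha = torch.rand(snippets.shape[0] - 1, 1)
+    # Convex combination of each snippet with its temporal neighbour.
+    mixed = alpha * snippets[:-1] + (1 - alpha) * snippets[1:]
+    with torch.no_grad():
+        target = alpha * model(snippets[:-1]).softmax(-1) + \
+                 (1 - alpha) * model(snippets[1:]).softmax(-1)
+    # Enforce that predictions on the mixture match the interpolated predictions.
+    return F.kl_div(model(mixed).log_softmax(-1), target, reduction="batchmean")
+```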
+
+ comment: ICME2023 +
+
+
+
+
+ + ♻ ☆ Forensic License Plate Recognition with Compression-Informed + Transformers ICIP 2022 + + +
+ Forensic license plate recognition (FLPR) remains an open challenge in legal
+contexts such as criminal investigations, where unreadable license plates (LPs)
+need to be deciphered from highly compressed and/or low resolution footage,
+e.g., from surveillance cameras. In this work, we propose a side-informed
+Transformer architecture that embeds knowledge on the input compression level
+to improve recognition under strong compression. We show the effectiveness of
+Transformers for license plate recognition (LPR) on a low-quality real-world
+dataset. We also provide a synthetic dataset that includes strongly degraded,
+illegible LP images and analyze the impact of knowledge embedding on it. The
+network outperforms existing FLPR methods and standard state-of-the-art image
+recognition models while requiring fewer parameters. For the most severely
+degraded images, we can improve recognition by up to 8.9 percentage points.
+
+
+ comment: Published at ICIP 2022, Code: + https://faui1-gitlab.cs.fau.de/denise.moussa/forensic-license-plate-transformer/ +
+
+
+
+
+ + ♻ ☆ From Neural Activations to Concepts: A Survey on Explaining Concepts in + Neural Networks + + +
+ In this paper, we review recent approaches for explaining concepts in neural
+networks. Concepts can act as a natural link between learning and reasoning:
+once the concepts that a neural learning system uses are identified, one can
+integrate those concepts with a reasoning system for inference, or use a
+reasoning system to act upon them to improve or enhance the learning system. On
+the other hand, knowledge can not only be extracted from neural networks;
+concept knowledge can also be inserted into neural network architectures. Since
+integrating learning and reasoning is at the core of neuro-symbolic AI, the
+insights gained from this survey can serve as an important step towards
+realizing neuro-symbolic AI based on explainable concepts.
+
+
+ comment: Accepted in Neurosymbolic Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Towards Unconstrained Audio Splicing Detection and Localization with + Neural Networks ICPR + + +
+ Freely available and easy-to-use audio editing tools make it straightforward +to perform audio splicing. Convincing forgeries can be created by combining +various speech samples from the same person. Detection of such splices is +important both in the public sector when considering misinformation, and in a +legal context to verify the integrity of evidence. Unfortunately, most existing +detection algorithms for audio splicing use handcrafted features and make +specific assumptions. However, criminal investigators are often faced with +audio samples from unconstrained sources with unknown characteristics, which +raises the need for more generally applicable methods. + With this work, we aim to take a first step towards unconstrained audio +splicing detection to address this need. We simulate various attack scenarios +in the form of post-processing operations that may disguise splicing. We +propose a Transformer sequence-to-sequence (seq2seq) network for splicing +detection and localization. Our extensive evaluation shows that the proposed +method outperforms existing dedicated approaches for splicing detection [3, 10] +as well as the general-purpose networks EfficientNet [28] and RegNet [25]. + +
+
+ comment: Published at MMFORWILD 2022, ICPR Workshops - Code: + https://faui1-gitlab.cs.fau.de/denise.moussa/audio-splicing-localization . + International Conference on Pattern Recognition. Cham: Springer Nature + Switzerland, 2022 +
+
+
+
+
+ + ♻ ☆ Discovering Novel Actions from Open World Egocentric Videos with + Object-Grounded Visual Commonsense Reasoning + + +
+ Learning to infer labels in an open world, i.e., in an environment where the +target ``labels'' are unknown, is an important characteristic for achieving +autonomy. Foundation models, pre-trained on enormous amounts of data, have +shown remarkable generalization skills through prompting, particularly in +zero-shot inference. However, their performance is restricted to the +correctness of the target label's search space, i.e., candidate labels provided +in the prompt. This target search space can be unknown or exceptionally large +in an open world, severely restricting their performance. To tackle this +challenging problem, we propose a two-step, neuro-symbolic framework called +ALGO - Action Learning with Grounded Object recognition that uses symbolic +knowledge stored in large-scale knowledge bases to infer activities in +egocentric videos with limited supervision. First, we propose a neuro-symbolic +prompting approach that uses object-centric vision-language models as a noisy +oracle to ground objects in the video through evidence-based reasoning. Second, +driven by prior commonsense knowledge, we discover plausible activities through +an energy-based symbolic pattern theory framework and learn to ground +knowledge-based action (verb) concepts in the video. Extensive experiments on +four publicly available datasets (EPIC-Kitchens, GTEA Gaze, GTEA Gaze Plus, and +Charades-Ego) demonstrate its performance on open-world activity inference. We +also show that ALGO can be extended to zero-shot inference and demonstrate its +competitive performance on the Charades-Ego dataset. + +
+
+ comment: 25 Pages, 4 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Explainable Light-Weight Deep Learning Pipeline for Improved Drought + Stress Identification + + +
+ Early identification of drought stress in crops is vital for implementing
+effective mitigation measures and reducing yield loss. Non-invasive imaging
+techniques hold immense potential by capturing subtle physiological changes in
+plants under water deficit. Sensor-based imaging data serves as a rich source
+of information for machine learning and deep learning algorithms, facilitating
+further analysis aimed at identifying drought stress. While these approaches
+yield favorable results, real-time field applications require algorithms
+specifically designed for the complexities of natural agricultural conditions.
+Our work proposes a novel deep learning framework for classifying drought
+stress in potato crops captured by UAVs in natural settings. The novelty lies
+in the synergistic combination of a pre-trained network with carefully designed
+custom layers. This architecture leverages the feature extraction capabilities
+of the pre-trained network while the custom layers enable targeted
+dimensionality reduction and enhanced regularization, ultimately leading to
+improved performance. A key innovation of our work involves the integration of
+Gradient-weighted Class Activation Mapping (Grad-CAM), an explainability
+technique. Grad-CAM sheds light on the internal workings of the deep learning
+model, typically referred to as a black box. By visualizing the focus areas of
+the model within the images, Grad-CAM fosters interpretability and builds trust
+in the decision-making process of the model. Our proposed framework achieves
+superior performance, particularly with the DenseNet121 pre-trained network,
+reaching a precision of 97% for identifying the stressed class with an overall
+accuracy of 91%. Comparative analysis with existing state-of-the-art object
+detection algorithms reveals the superiority of our approach, with
+significantly higher precision and accuracy.
+
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Simplicity in Complexity : Explaining Visual Complexity using Deep + Segmentation Models + + +
+ The complexity of visual stimuli plays an important role in many cognitive +phenomena, including attention, engagement, memorability, time perception and +aesthetic evaluation. Despite its importance, complexity is poorly understood +and ironically, previous models of image complexity have been quite complex. +There have been many attempts to find handcrafted features that explain +complexity, but these features are usually dataset specific, and hence fail to +generalise. On the other hand, more recent work has employed deep neural +networks to predict complexity, but these models remain difficult to interpret, +and do not guide a theoretical understanding of the problem. Here we propose to +model complexity using segment-based representations of images. We use +state-of-the-art segmentation models, SAM and FC-CLIP, to quantify the number +of segments at multiple granularities, and the number of classes in an image +respectively. We find that complexity is well-explained by a simple linear +model with these two features across six diverse image-sets of naturalistic +scene and art images. This suggests that the complexity of images can be +surprisingly simple. + +
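+ A tiny illustration of the two-feature linear model described above, with
+hypothetical segment counts, class counts, and complexity ratings (not data
+from the paper):
+```python
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Hypothetical per-image features: segment count (e.g., from SAM) and
+# class count (e.g., from FC-CLIP), with human complexity ratings as targets.
+num_segments = np.array([12, 45, 80, 23, 60, 150, 8, 95])
+num_classes = np.array([3, 7, 10, 4, 9, 14, 2, 11])
+complexity_rating = np.array([2.1, 4.0, 5.5, 2.8, 4.7, 7.9, 1.5, 6.2])
+
+X = np.column_stack([num_segments, num_classes])
+model = LinearRegression().fit(X, complexity_rating)
+print("R^2:", model.score(X, complexity_rating))
+print("weights (segments, classes):", model.coef_, "intercept:", model.intercept_)
+```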
+
+
+
+
+ + ♻ ☆ Towards Diverse Binary Segmentation via A Simple yet General Gated + Network + + +
+ In many binary segmentation tasks, most CNN-based methods use a U-shape
+encoder-decoder network as their basic structure. They ignore two key problems
+in the exchange of information between the encoder and the decoder: the lack of
+an interference control mechanism between them, and the failure to account for
+the disparate contributions of different encoder levels. In this work, we
+propose a simple yet general gated network (GateNet) to tackle them all at
+once. With the help of multi-level gate units, the valuable context information
+from the encoder can be selectively transmitted to the decoder. In addition, we
+design a gated dual branch structure to build cooperation among the features of
+different levels and improve the discrimination ability of the network.
+Furthermore, we introduce a "Fold" operation to improve the atrous convolution
+and form a novel folded atrous convolution, which can be flexibly embedded in
+ASPP or DenseASPP to accurately localize foreground objects of various scales.
+GateNet can be easily generalized to many binary segmentation tasks, including
+general and specific object segmentation and multi-modal segmentation. Without
+bells and whistles, our network consistently performs favorably against the
+state-of-the-art methods under 10 metrics on 33 datasets of 10 binary
+segmentation tasks.
+
+
+ comment: Accepted by IJCV 2024 +
+
+
+
+
+ + ♻ ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive field or high +computing complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+ comment: Experimental method encountered errors, undergoing experiment again +
+
+
+
+
+ + ♻ ☆ JPEG Quantized Coefficient Recovery via DCT Domain Spatial-Frequential + Transformer + + +
+ JPEG compression adopts the quantization of Discrete Cosine Transform (DCT) +coefficients for effective bit-rate reduction, whilst the quantization could +lead to a significant loss of important image details. Recovering compressed +JPEG images in the frequency domain has recently garnered increasing interest, +complementing the multitude of restoration techniques established in the pixel +domain. However, existing DCT domain methods typically suffer from limited +effectiveness in handling a wide range of compression quality factors or fall +short in recovering sparse quantized coefficients and the components across +different colorspaces. To address these challenges, we propose a DCT domain +spatial-frequential Transformer, namely DCTransformer, for JPEG quantized +coefficient recovery. Specifically, a dual-branch architecture is designed to +capture both spatial and frequential correlations within the collocated DCT +coefficients. Moreover, we incorporate the operation of quantization matrix +embedding, which effectively allows our single model to handle a wide range of +quality factors, and a luminance-chrominance alignment head that produces a +unified feature map to align different-sized luminance and chrominance +components. Our proposed DCTransformer outperforms the current state-of-the-art +JPEG artifact removal techniques, as demonstrated by our extensive experiments. + +
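+ For reference, a short sketch of the JPEG-style quantization step whose
+effect DCTransformer aims to invert, using a toy flat quantization table (real
+JPEG tables are frequency-dependent and per-channel):
+```python
+import numpy as np
+from scipy.fft import dctn, idctn
+
+def jpeg_quantize_block(block: np.ndarray, q: np.ndarray) -> np.ndarray:
+    """Quantize one 8x8 block of DCT coefficients (the lossy step a recovery
+    network such as DCTransformer tries to invert)."""
+    coeffs = dctn(block - 128.0, norm="ortho")
+    return np.round(coeffs / q)
+
+def dequantize_block(quantized: np.ndarray, q: np.ndarray) -> np.ndarray:
+    return idctn(quantized * q, norm="ortho") + 128.0
+
+q_flat = np.full((8, 8), 16.0)  # toy flat quantization table
+block = np.random.randint(0, 256, (8, 8)).astype(np.float64)
+recovered = dequantize_block(jpeg_quantize_block(block, q_flat), q_flat)
+print("max abs reconstruction error:", np.abs(block - recovered).max())
+```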
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ ValUES: A Framework for Systematic Validation of Uncertainty Estimation + in Semantic Segmentation ICLR 2024 + + +
+ Uncertainty estimation is an essential and heavily-studied component for the +reliable application of semantic segmentation methods. While various studies +exist claiming methodological advances on the one hand, and successful +application on the other hand, the field is currently hampered by a gap between +theory and practice leaving fundamental questions unanswered: Can data-related +and model-related uncertainty really be separated in practice? Which components +of an uncertainty method are essential for real-world performance? Which +uncertainty method works well for which application? In this work, we link this +research gap to a lack of systematic and comprehensive evaluation of +uncertainty methods. Specifically, we identify three key pitfalls in current +literature and present an evaluation framework that bridges the research gap by +providing 1) a controlled environment for studying data ambiguities as well as +distribution shifts, 2) systematic ablations of relevant method components, and +3) test-beds for the five predominant uncertainty applications: OoD-detection, +active learning, failure detection, calibration, and ambiguity modeling. +Empirical results on simulated as well as real-world data demonstrate how the +proposed framework is able to answer the predominant questions in the field +revealing for instance that 1) separation of uncertainty types works on +simulated data but does not necessarily translate to real-world data, 2) +aggregation of scores is a crucial but currently neglected component of +uncertainty methods, 3) While ensembles are performing most robustly across the +different downstream tasks and settings, test-time augmentation often +constitutes a light-weight alternative. Code is at: +https://github.com/IML-DKFZ/values + +
+
+ comment: ICLR 2024 (oral) +
+
+
+
+
+ + ♻ ☆ DiffECG: A Versatile Probabilistic Diffusion Model for ECG Signals + Synthesis + + +
+ Within cardiovascular disease detection using deep learning applied to ECG +signals, the complexities of handling physiological signals have sparked +growing interest in leveraging deep generative models for effective data +augmentation. In this paper, we introduce a novel versatile approach based on +denoising diffusion probabilistic models for ECG synthesis, addressing three +scenarios: (i) heartbeat generation, (ii) partial signal imputation, and (iii) +full heartbeat forecasting. Our approach presents the first generalized +conditional approach for ECG synthesis, and our experimental results +demonstrate its effectiveness for various ECG-related tasks. Moreover, we show +that our approach outperforms other state-of-the-art ECG generative models and +can enhance the performance of state-of-the-art classifiers. + +
+
+ comment: Accepted in IEEE SERA 2024 conference +
+
+
+
+
+ + ♻ ☆ MRI Scan Synthesis Methods based on Clustering and Pix2Pix + + +
+ We consider a missing data problem in the context of automatic segmentation +methods for Magnetic Resonance Imaging (MRI) brain scans. Usually, automated +MRI scan segmentation is based on multiple scans (e.g., T1-weighted, +T2-weighted, T1CE, FLAIR). However, quite often a scan is blurry, missing or +otherwise unusable. We investigate the question whether a missing scan can be +synthesized. We exemplify that this is in principle possible by synthesizing a +T2-weighted scan from a given T1-weighted scan. Our first aim is to compute a +picture that resembles the missing scan closely, measured by average mean +squared error (MSE). We develop/use several methods for this, including a +random baseline approach, a clustering-based method and pixel-to-pixel +translation method by Isola et al. (Pix2Pix) which is based on conditional +GANs. The lowest MSE is achieved by our clustering-based method. Our second aim +is to compare the methods with respect to the effect that using the synthesized +scan has on the segmentation process. For this, we use a DeepMedic model +trained with the four input scan modalities named above. We replace the +T2-weighted scan by the synthesized picture and evaluate the segmentations with +respect to the tumor identification, using Dice scores as numerical evaluation. +The evaluation shows that the segmentation works well with synthesized scans +(in particular, with Pix2Pix methods) in many cases. + +
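+ A minimal sketch of the first evaluation criterion above, comparing candidate
+syntheses to the real T2-weighted scan by average mean squared error; the
+arrays below are random stand-ins, not MRI data.
+```python
+import numpy as np
+
+def mean_squared_error(real: np.ndarray, synthetic: np.ndarray) -> float:
+    return float(np.mean((real.astype(np.float64) - synthetic.astype(np.float64)) ** 2))
+
+# Hypothetical T2 ground truth and two candidate syntheses (e.g., clustering vs. Pix2Pix).
+rng = np.random.default_rng(0)
+t2_real = rng.random((64, 64))
+candidates = {
+    "clustering": t2_real + rng.normal(scale=0.05, size=(64, 64)),
+    "pix2pix": t2_real + rng.normal(scale=0.08, size=(64, 64)),
+}
+for name, img in candidates.items():
+    print(f"{name}: MSE = {mean_squared_error(t2_real, img):.5f}")
+```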
+
+ comment: Accepted at AIME 2024 +
+
+
+
+
+ + ♻ ☆ High-fidelity Person-centric Subject-to-Image Synthesis CVPR2024 + + +
+ Current subject-driven image generation methods encounter significant
+challenges in person-centric image generation. The reason is that they learn
+semantic scene and person generation by fine-tuning a common pre-trained
+diffusion model, which involves an irreconcilable training imbalance.
+Precisely, to generate realistic persons, they need to tune the pre-trained
+model sufficiently, which inevitably causes the model to forget the rich
+semantic scene prior and makes scene generation over-fit to the training data.
+Moreover, even with sufficient fine-tuning, these methods still cannot generate
+high-fidelity persons, since joint learning of scene and person generation also
+leads to a quality compromise. In this paper, we propose Face-diffuser, an
+effective collaborative generation pipeline that eliminates the above training
+imbalance and quality compromise. Specifically, we first develop two
+specialized pre-trained diffusion models, i.e., the Text-driven Diffusion Model
+(TDM) and the Subject-augmented Diffusion Model (SDM), for scene and person
+generation, respectively. The sampling process is divided into three sequential
+stages, i.e., semantic scene construction, subject-scene fusion, and subject
+enhancement. The first and last stages are performed by TDM and SDM,
+respectively. The subject-scene fusion stage realizes the collaboration through
+a novel and highly effective mechanism, Saliency-adaptive Noise Fusion (SNF).
+Specifically, it is based on our key observation that there exists a robust
+link between classifier-free guidance responses and the saliency of generated
+images. In each time step, SNF leverages the unique strengths of each model and
+automatically blends the predicted noises from both models spatially in a
+saliency-aware manner. Extensive experiments confirm the impressive
+effectiveness and robustness of Face-diffuser.
+
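+ A simplified sketch of the noise-blending idea behind SNF, assuming per-pixel
+saliency maps are already available for both models; the paper derives
+saliency from classifier-free guidance responses, which is not reproduced
+here.
+```python
+import torch
+
+def saliency_adaptive_noise_fusion(noise_tdm, noise_sdm, saliency_tdm, saliency_sdm):
+    """Blend predicted noises from a scene model (TDM) and a subject model (SDM)
+    pixel-wise, preferring whichever model is more salient at each location."""
+    mask = (saliency_sdm > saliency_tdm).float()
+    return mask * noise_sdm + (1 - mask) * noise_tdm
+
+# Toy tensors standing in for (B, C, H, W) noise predictions and saliency maps.
+shape = (1, 4, 64, 64)
+fused = saliency_adaptive_noise_fusion(torch.randn(shape), torch.randn(shape),
+                                        torch.rand(shape), torch.rand(shape))
+print(fused.shape)
+```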
+
+ comment: Accepted by CVPR2024. Code: + https://github.com/CodeGoat24/Face-diffuser +
+
+
+
+
+ + ♻ ☆ Object Registration in Neural Fields ICRA 2024 + + +
+ Neural fields provide a continuous scene representation of 3D geometry and +appearance in a way which has great promise for robotics applications. One +functionality that unlocks unique use-cases for neural fields in robotics is +object 6-DoF registration. In this paper, we provide an expanded analysis of +the recent Reg-NF neural field registration method and its use-cases within a +robotics context. We showcase the scenario of determining the 6-DoF pose of +known objects within a scene using scene and object neural field models. We +show how this may be used to better represent objects within imperfectly +modelled scenes and generate new scenes by substituting object neural field +models into the scene. + +
+
+ comment: Accepted to ICRA 2024 RoboNeRF workshop. 5 pages, 10 figures. arXiv + admin note: substantial text overlap with arXiv:2402.09722 +
+
+
+
+
+ + ♻ ☆ MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing + + +
+ 3D-aware portrait editing has a wide range of applications in multiple
+fields. However, current approaches are limited in that they can only perform
+either mask-guided or text-based editing. Even when the two procedures are
+fused into one model, editing quality and stability cannot be ensured. To
+address this limitation, we propose MaTe3D: mask-guided, text-based 3D-aware
+portrait editing. In this framework, we first introduce a new SDF-based 3D
+generator which learns local and global representations with the proposed SDF
+and density consistency losses, which enhances mask-based editing in local
+areas. Second, we present a novel distillation strategy: Conditional
+Distillation on Geometry and Texture (CDGT). Compared to existing distillation
+strategies, it mitigates visual ambiguity and avoids mismatch between texture
+and geometry, thereby producing stable texture and convincing geometry while
+editing. Additionally, we create the CatMask-HQ dataset, a large-scale,
+high-resolution cat face annotation dataset for exploring model generalization
+and expansion. We perform extensive experiments on both the FFHQ and CatMask-HQ
+datasets to demonstrate the editing quality and stability of the proposed
+method. Our method faithfully generates a 3D-aware edited face image based on a
+modified mask and a text prompt. Our code and models will be publicly released.
+
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ MagicDrive: Street View Generation with Diverse 3D Geometry Control + + +
+ Recent advancements in diffusion models have significantly enhanced the data +synthesis with 2D control. Yet, precise 3D control in street view generation, +crucial for 3D perception tasks, remains elusive. Specifically, utilizing +Bird's-Eye View (BEV) as the primary condition often leads to challenges in +geometry control (e.g., height), affecting the representation of object shapes, +occlusion patterns, and road surface elevations, all of which are essential to +perception data synthesis, especially for 3D object detection tasks. In this +paper, we introduce MagicDrive, a novel street view generation framework, +offering diverse 3D geometry controls including camera poses, road maps, and 3D +bounding boxes, together with textual descriptions, achieved through tailored +encoding strategies. Besides, our design incorporates a cross-view attention +module, ensuring consistency across multiple camera views. With MagicDrive, we +achieve high-fidelity street-view image & video synthesis that captures nuanced +3D geometry and various scene descriptions, enhancing tasks like BEV +segmentation and 3D object detection. + +
+
+ comment: Project Page: https://flymin.github.io/magicdrive; Figure 7 updated +
+
+
+
+
+ + ♻ ☆ Denoising-Diffusion Alignment for Continuous Sign Language Recognition + + +
+ Continuous sign language recognition (CSLR) aims to promote active and
+accessible communication for the hearing impaired by sequentially recognizing
+signs in untrimmed sign language videos into textual glosses. The key challenge
+of CSLR is how to achieve cross-modality alignment between videos and gloss
+sequences. However, current cross-modality paradigms for CSLR overlook the use
+of gloss context to guide the video clips towards global temporal context
+alignment, which degrades the visual-to-gloss mapping and is detrimental to
+recognition performance. To tackle this problem, we propose a novel
+Denoising-Diffusion global Alignment (DDA), which consists of a
+denoising-diffusion autoencoder and a DDA loss function. DDA leverages
+diffusion-based global alignment techniques to align the video with the gloss
+sequence, facilitating global temporal context alignment. Specifically, DDA
+first employs an auxiliary-condition diffusion to produce gloss-part-noised
+bimodal representations of the video and gloss sequence. Because the
+recognition-oriented alignment knowledge represented in the diffusion denoising
+process cannot otherwise be fed back, DDA further introduces the
+denoising-diffusion autoencoder, which adds a decoder to the auxiliary-condition
+diffusion to denoise the partially noised bimodal representations via the
+designed DDA loss in a self-supervised manner. In the denoising process, each
+video clip representation can be reliably guided to re-establish the global
+temporal context by denoising the gloss sequence representation. Experiments on
+three public benchmarks demonstrate that our DDA achieves state-of-the-art
+performance and confirm the feasibility of DDA for video representation
+enhancement.
+
+
+
+
+
+ + ♻ ☆ Dynamic Against Dynamic: An Open-set Self-learning Framework IJCAI2024 + + +
+ In open-set recognition, existing methods generally learn statically fixed
+decision boundaries using known classes to reject unknown classes. Though they
+have achieved promising results, such decision boundaries are evidently
+insufficient for universal unknown classes in dynamic and open scenarios, as
+these can potentially appear at any position in the feature space. Moreover,
+these methods simply reject unknown class samples during testing without making
+any effective use of them. In fact, such samples can constitute the true
+instantiated representation of the unknown classes and further enhance the
+model's performance. To address these issues, this paper proposes a novel
+dynamic-against-dynamic idea, i.e., a dynamic method against a dynamically
+changing open-set world, for which an open-set self-learning (OSSL) framework
+is correspondingly developed. OSSL starts with a good closed-set classifier
+trained on known classes and utilizes available test samples for model
+adaptation during testing, thus gaining adaptability to changing data
+distributions. In particular, a novel self-matching module is designed for
+OSSL, which achieves adaptation by automatically identifying known class
+samples while rejecting unknown class samples, which are further utilized to
+enhance the discriminability of the model as the instantiated representation of
+unknown classes. Our method establishes new performance milestones on almost
+all standard and cross-data benchmarks.
+
+
+ comment: The first two authors contributed equally to this work. Accepted at + IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Time-step Curriculum for One Image to 3D Generation CVPR 2024 + + +
+ Score distillation sampling (SDS) has been widely adopted to overcome the
+absence of unseen views in reconstructing 3D objects from a single image. It
+leverages pre-trained 2D diffusion models as teachers to guide the
+reconstruction of student 3D models. Despite their remarkable success,
+SDS-based methods often encounter geometric artifacts and texture saturation.
+We find that the crux is the overlooked, indiscriminate treatment of diffusion
+time-steps during optimization: it unreasonably treats the student-teacher
+knowledge distillation as equal at all time-steps and thus entangles
+coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion
+Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the
+teacher and student models collaborating with the time-step curriculum in a
+coarse-to-fine manner. Extensive experiments on the NeRF4, RealFusion15, GSO
+and Level50 benchmarks demonstrate that DTC123 can produce multi-view
+consistent, high-quality, and diverse 3D assets. Codes and more generation
+demos will be released in https://github.com/yxymessi/DTC123.
+
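+ A generic sketch of a diffusion time-step curriculum in the spirit described
+above: the sampled time-step's upper bound anneals from high-noise (coarse) to
+low-noise (fine) over the optimization; the schedule and bounds are
+assumptions, not DTC123's exact settings.
+```python
+import torch
+
+def curriculum_timestep(step: int, total_steps: int, t_max: int = 980, t_min: int = 20) -> int:
+    """Sample a diffusion time-step whose upper bound anneals from t_max to t_min,
+    so early optimization sees coarse (high-noise) guidance and later optimization
+    sees fine (low-noise) guidance."""
+    frac = step / max(1, total_steps - 1)
+    upper = int(t_max - frac * (t_max - t_min))
+    return int(torch.randint(t_min, max(t_min + 1, upper + 1), (1,)).item())
+
+for s in [0, 2500, 4999]:
+    print(s, curriculum_timestep(s, total_steps=5000))
+```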
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Early Autism Diagnosis based on Path Signature and Siamese Unsupervised + Feature Compressor + + +
+ Autism Spectrum Disorder (ASD) is emerging as a growing public health
+threat. Early diagnosis of ASD is crucial for timely, effective intervention
+and treatment. However, conventional diagnosis methods based on communication
+and behavioral patterns are unreliable for children younger than 2 years of
+age. Given evidence of neurodevelopmental abnormalities in ASD infants, we
+resort to a novel deep learning-based method to extract key features from the
+inherently scarce, class-imbalanced, and heterogeneous structural MR images for
+early autism diagnosis. Specifically, we propose a Siamese verification
+framework to extend the scarce data, and an unsupervised compressor to
+alleviate data imbalance by extracting key features. We also propose weight
+constraints to cope with sample heterogeneity by giving different samples
+different voting weights during validation, and we use Path Signature to
+unravel meaningful developmental features from the two-time-point data
+longitudinally. We further extract the brain regions that the model focuses on
+for autism diagnosis. Extensive experiments have shown that our method performs
+well under practical scenarios, surpassing existing machine learning methods
+and providing anatomical insights for early autism diagnosis.
+
+
+
+
+
+ + ♻ ☆ DORSal: Diffusion for Object-centric Representations of Scenes et al ICLR 2024 + + +
+ Recent progress in 3D scene understanding enables scalable learning of +representations across large datasets of diverse scenes. As a consequence, +generalization to unseen scenes and objects, rendering novel views from just a +single or a handful of input images, and controllable scene generation that +supports editing, is now possible. However, training jointly on a large number +of scenes typically compromises rendering quality when compared to single-scene +optimized models such as NeRFs. In this paper, we leverage recent progress in +diffusion models to equip 3D scene representation learning models with the +ability to render high-fidelity novel views, while retaining benefits such as +object-level scene editing to a large degree. In particular, we propose DORSal, +which adapts a video diffusion architecture for 3D scene generation conditioned +on frozen object-centric slot-based representations of scenes. On both complex +synthetic multi-object scenes and on the real-world large-scale Street View +dataset, we show that DORSal enables scalable neural rendering of 3D scenes +with object-level editing and improves upon existing approaches. + +
+
+ comment: Accepted to ICLR 2024. Project page: + https://www.sjoerdvansteenkiste.com/dorsal +
+
+
+
+
+ + ♻ ☆ M3Act: Learning from Synthetic Human Group Activities + + +
+ The study of complex human interactions and group activities has become a +focal point in human-centric computer vision. However, progress in related +tasks is often hindered by the challenges of obtaining large-scale labeled +datasets from real-world scenarios. To address the limitation, we introduce +M3Act, a synthetic data generator for multi-view multi-group multi-person human +atomic actions and group activities. Powered by Unity Engine, M3Act features +multiple semantic groups, highly diverse and photorealistic images, and a +comprehensive set of annotations, which facilitates the learning of +human-centered tasks across single-person, multi-person, and multi-group +conditions. We demonstrate the advantages of M3Act across three core +experiments. The results suggest our synthetic dataset can significantly +improve the performance of several downstream methods and replace real-world +datasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on +DanceTrack dataset, leading to a hop on the leaderboard from 10th to 2nd place. +Moreover, M3Act opens new research for controllable 3D group activity +generation. We define multiple metrics and propose a competitive baseline for +the novel task. Our code and data are available at our project page: +http://cjerry1243.github.io/M3Act. + +
+
+
+
+
+ + ♻ ☆ FMGS: Foundation Model Embedded 3D Gaussian Splatting for Holistic 3D + Scene Understanding + + +
+ Precisely perceiving the geometric and semantic properties of real-world 3D
+objects is crucial for the continued evolution of augmented reality and robotic
+applications. To this end, we present Foundation Model Embedded Gaussian
+Splatting (FMGS), which incorporates vision-language embeddings of foundation
+models into 3D Gaussian Splatting (GS). The key contribution of this work is an
+efficient method to reconstruct and represent 3D vision-language models. This
+is achieved by distilling feature maps generated from image-based foundation
+models into those rendered from our 3D model. To ensure high-quality rendering
+and fast training, we introduce a novel scene representation by integrating
+strengths from both GS and multi-resolution hash encodings (MHE). Our effective
+training procedure also introduces a pixel alignment loss that pulls the
+rendered features of the same semantic entities close together, following
+pixel-level semantic boundaries. Our results demonstrate remarkable multi-view
+semantic consistency, facilitating diverse downstream tasks, and beating
+state-of-the-art methods by 10.2 percent on open-vocabulary language-based
+object detection while being 851X faster at inference. This research explores
+the intersection of vision, language, and 3D scene representation, paving the
+way for enhanced scene understanding in uncontrolled real-world environments.
+We plan to release the code on the project page.
+
+
+
+ comment: Project page: https://xingxingzuo.github.io/fmgs +
+
+
+
+
+ + ♻ ☆ Mathematical Foundation and Corrections for Full Range Head Pose + Estimation + + +
+ Numerous works on head pose estimation (HPE) offer algorithms or propose
+neural network-based approaches for extracting Euler angles either from facial
+key points or directly from images of the head region. However, many works
+fail to provide clear definitions of the coordinate systems and of the Euler or
+Tait-Bryan angle orders in use. It is well known that rotation matrices depend
+on coordinate systems, and that yaw, roll, and pitch angles are sensitive to
+their application order. Without precise definitions, it becomes challenging to
+validate the correctness of the output head pose and of the drawing routines
+employed in prior works. In this paper, we thoroughly examine the Euler angles
+defined in the 300W-LP dataset, head pose estimation methods such as 3DDFA-v2,
+6D-RepNet, and WHENet, and the validity of their Euler angle drawing routines.
+When necessary, we infer the coordinate system and the yaw-roll-pitch sequence
+from the provided code. This paper presents (1) code and algorithms for
+inferring the coordinate system and the Euler angle application order from
+provided source code and for extracting precise rotation matrices and Euler
+angles, (2) code and algorithms for converting poses from one rotation system
+to another, (3) novel formulae for 2D augmentations of the rotation matrices,
+and (4) derivations and code for the correct drawing routines for rotation
+matrices and poses. This paper also addresses the feasibility of defining
+rotations with the right-handed coordinate system used by Wikipedia and SciPy,
+which makes Euler angle extraction much easier for full-range head pose
+research.
+
+
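+ <p>To make the convention issue concrete, here is a minimal sketch using
+SciPy's rotation utilities (the example angles and axis orders are illustrative
+assumptions, not the conventions of any particular dataset or method discussed
+above):</p>
+ <pre><code>
+# Sketch: the same rotation matrix yields different (yaw, pitch, roll) triplets
+# depending on the assumed axis order, so conventions must be stated explicitly.
+from scipy.spatial.transform import Rotation as R
+
+# Hypothetical head pose: yaw=30, pitch=10, roll=-5 degrees, intrinsic Z-Y-X order.
+rot = R.from_euler("ZYX", [30, 10, -5], degrees=True)
+matrix = rot.as_matrix()
+
+# Decomposing the SAME matrix under two different orders gives different angles.
+print(R.from_matrix(matrix).as_euler("ZYX", degrees=True))  # recovers [30, 10, -5]
+print(R.from_matrix(matrix).as_euler("XYZ", degrees=True))  # a different triplet
+</code></pre>
+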
+
+
+
+
+ + ♻ ☆ HOH: Markerless Multimodal Human-Object-Human Handover Dataset with + Large Object Count NeurIPS 2023 + + +
+ We present the HOH (Human-Object-Human) Handover Dataset, a large object
+count dataset with 136 objects, to accelerate data-driven research on handover
+studies, human-robot handover implementation, and artificial intelligence (AI)
+on handover parameter estimation from 2D and 3D data of person interactions.
+HOH contains multi-view RGB and depth data, skeletons, fused point clouds,
+grasp type and handedness labels, object, giver hand, and receiver hand 2D and
+3D segmentations, giver and receiver comfort ratings, and paired object
+metadata and aligned 3D models for 2,720 handover interactions spanning 136
+objects and 20 giver-receiver pairs (40 pairs with role reversal accounted
+for), organized from 40 participants. We also show experimental results of
+neural networks trained using HOH to perform grasp, orientation, and trajectory
+prediction. As the only fully markerless handover capture dataset, HOH
+represents natural human-human handover interactions, overcoming the
+limitations of markered datasets, which require special suits for body tracking
+and lack high-resolution hand tracking. To date, HOH is the largest handover
+dataset in number of objects, participants, pairs with role reversal accounted
+for, and total interactions captured.
+
+
+
+ comment: NeurIPS 2023 Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ FT-Shield: A Watermark Against Unauthorized Fine-tuning in Text-to-Image + Diffusion Models + + +
+ Text-to-image generative models, especially those based on latent diffusion +models (LDMs), have demonstrated outstanding ability in generating high-quality +and high-resolution images from textual prompts. With this advancement, various +fine-tuning methods have been developed to personalize text-to-image models for +specific applications such as artistic style adaptation and human face +transfer. However, such advancements have raised copyright concerns, especially +when the data are used for personalization without authorization. For example, +a malicious user can employ fine-tuning techniques to replicate the style of an +artist without consent. In light of this concern, we propose FT-Shield, a +watermarking solution tailored for the fine-tuning of text-to-image diffusion +models. FT-Shield addresses copyright protection challenges by designing new +watermark generation and detection strategies. In particular, it introduces an +innovative algorithm for watermark generation. It ensures the seamless transfer +of watermarks from training images to generated outputs, facilitating the +identification of copyrighted material use. To tackle the variability in +fine-tuning methods and their impact on watermark detection, FT-Shield +integrates a Mixture of Experts (MoE) approach for watermark detection. +Comprehensive experiments validate the effectiveness of our proposed FT-Shield. + +
+
+
+
+
+ + ♻ ☆ Koala: Key frame-conditioned long video-LLM CVPR 2024 + + +
+ Long video question answering is a challenging task that involves recognizing +short-term activities and reasoning about their fine-grained relationships. +State-of-the-art video Large Language Models (vLLMs) hold promise as a viable +solution due to their demonstrated emergent capabilities on new tasks. However, +despite being trained on millions of short seconds-long videos, vLLMs are +unable to understand minutes-long videos and accurately answer questions about +them. To address this limitation, we propose a lightweight and self-supervised +approach, Key frame-conditioned long video-LLM (Koala), that introduces +learnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to +longer videos. Our approach introduces two new tokenizers that condition on +visual tokens computed from sparse video key frames for understanding short and +long video moments. We train our proposed approach on HowTo100M and demonstrate +its effectiveness on zero-shot long video understanding benchmarks, where it +outperforms state-of-the-art large models by 3 - 6% in absolute accuracy across +all tasks. Surprisingly, we also empirically show that our approach not only +helps a pretrained vLLM to understand long videos but also improves its +accuracy on short-term action recognition. + +
+
+ comment: Accepted at CVPR 2024 as a poster highlight +
+
+
+
+
+ + ♻ ☆ Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence + (DOVESEI) IROS 2023 + + +
+ This work targets what we consider to be the foundational step for urban
+airborne robots, a safe landing. Our attention is directed toward what we deem
+the most crucial aspect of the safe landing perception stack: segmentation. We
+present a streamlined reactive UAV system that employs visual servoing by
+harnessing the capabilities of open vocabulary image segmentation. This
+approach can adapt to various scenarios with minimal adjustments, bypassing the
+necessity for extensive data accumulation for refining internal models, thanks
+to its open vocabulary methodology. Given the limitations imposed by local
+authorities, our primary focus centers on operations originating from altitudes
+of 100 meters. This choice is deliberate, as numerous preceding works have
+dealt with altitudes up to 30 meters, aligning with the capabilities of small
+stereo cameras. Consequently, we leave the remaining 20m to be navigated using
+conventional 3D path planning methods. Utilizing monocular cameras and image
+segmentation, our findings demonstrate the system's capability to successfully
+execute landing maneuvers at altitudes as low as 20 meters. However, this
+approach is vulnerable to intermittent and occasionally abrupt fluctuations in
+the segmentation between frames in a video stream. To address this challenge,
+we enhance the image segmentation output by introducing what we call a dynamic
+focus: a masking mechanism that self-adjusts according to the current landing
+stage. This dynamic focus guides the control system to avoid regions beyond the
+drone's safety radius projected onto the ground, thus mitigating the problems
+with fluctuations. Through the implementation of this supplementary layer, our
+experiments show an almost tenfold improvement in the landing success rate
+compared to global segmentation. All the source code is open source and
+available online (github.com/MISTLab/DOVESEI).
+
+
+
+ comment: IROS 2023 The Last-Mile Robotics Workshop +
+
+
+
+
+ + ♻ ☆ LET-3D-AP: Longitudinal Error Tolerant 3D Average Precision for + Camera-Only 3D Detection + + +
+ The 3D Average Precision (3D AP) relies on the intersection over union
+between predictions and ground truth objects. However, camera-only detectors
+have limited depth accuracy, which may cause otherwise reasonable predictions
+that suffer from such longitudinal localization errors to be treated as false
+positives. We therefore propose variants of the 3D AP metric to be more
+permissive with respect to depth estimation errors. Specifically, our novel
+longitudinal error tolerant metrics, LET-3D-AP and LET-3D-APL, allow
+longitudinal localization errors of the prediction boxes up to a given
+tolerance. To evaluate the proposed metrics, we also construct a new test set
+for the Waymo Open Dataset, tailored to camera-only 3D detection methods.
+Surprisingly, we find that state-of-the-art camera-based detectors can
+outperform popular LiDAR-based detectors under our new metrics at a 10% depth
+error tolerance, suggesting that existing camera-based detectors already have
+the potential to surpass LiDAR-based detectors in downstream applications. We
+believe the proposed metrics and the new benchmark dataset will facilitate
+advances in the field of camera-only 3D detection by providing more informative
+signals that can better indicate the system-level performance.
+
+
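+ <p>A toy bird's-eye-view sketch of the tolerance idea (a simplified
+illustration with an assumed camera-at-origin geometry, not the official Waymo
+implementation): the predicted center may slide along its line of sight by up
+to a fraction of the ground-truth range before the matching error is
+computed.</p>
+ <pre><code>
+# Toy sketch of longitudinal error tolerance in bird's-eye view (not the
+# official LET-3D-AP code): slide the predicted center along its line of
+# sight by at most `tol` (a fraction of the ground-truth range) before matching.
+import numpy as np
+
+def let_center_distance(pred_xy, gt_xy, tol=0.10):
+    pred_xy, gt_xy = np.asarray(pred_xy, float), np.asarray(gt_xy, float)
+    ray = pred_xy / (np.linalg.norm(pred_xy) + 1e-9)        # camera at the origin
+    gt_range = np.linalg.norm(gt_xy)
+    # Longitudinal error: ground truth projected onto the prediction's line of
+    # sight, minus the predicted range.
+    long_err = float(ray @ gt_xy) - np.linalg.norm(pred_xy)
+    correction = np.clip(long_err, -tol * gt_range, tol * gt_range)
+    corrected = pred_xy + correction * ray                  # slide along the ray
+    return float(np.linalg.norm(corrected - gt_xy))         # residual mismatch
+
+print(let_center_distance([0.0, 9.0], [0.0, 10.0]))  # 10% depth error -> 0.0
+</code></pre>
+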
+
+ comment: Find the primary metrics for the 2022 Waymo Open Dataset 3D + Camera-Only Detection Challenge at + https://waymo.com/open/challenges/2022/3d-camera-only-detection/ . Find the + code at https://github.com/waymo-research/waymo-open-dataset +
+
+
+
+
+ + ♻ ☆ COBRA - COnfidence score Based on shape Regression Analysis for + method-independent quality assessment of object pose estimation from single + images + + +
+ We present a generic algorithm for scoring pose estimation methods that rely +on single image semantic analysis. The algorithm employs a lightweight putative +shape representation using a combination of multiple Gaussian Processes. Each +Gaussian Process (GP) yields distance normal distributions from multiple +reference points in the object's coordinate system to its surface, thus +providing a geometric evaluation framework for scoring predicted poses. Our +confidence measure comprises the average mixture probability of pixel +back-projections onto the shape template. In the reported experiments, we +compare the accuracy of our GP based representation of objects versus the +actual geometric models and demonstrate the ability of our method to capture +the influence of outliers as opposed to the corresponding intrinsic measures +that ship with the segmentation and pose estimation methods. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 148 + +
+
+
+ + ☆ Multi-Space Alignments Towards Universal LiDAR Segmentation CVPR 2024 + + +
+ A unified and versatile LiDAR segmentation model with strong robustness and +generalizability is desirable for safe autonomous driving perception. This work +presents M3Net, a one-of-a-kind framework for fulfilling multi-task, +multi-dataset, multi-modality LiDAR segmentation in a universal manner using +just a single set of parameters. To better exploit data volume and diversity, +we first combine large-scale driving datasets acquired by different types of +sensors from diverse scenes and then conduct alignments in three spaces, namely +data, feature, and label spaces, during the training. As a result, M3Net is +capable of taming heterogeneous data for training state-of-the-art LiDAR +segmentation models. Extensive experiments on twelve LiDAR segmentation +datasets verify our effectiveness. Notably, using a shared set of parameters, +M3Net achieves 75.1%, 83.1%, and 72.4% mIoU scores, respectively, on the +official benchmarks of SemanticKITTI, nuScenes, and Waymo Open. + +
+
+ comment: CVPR 2024; 33 pages, 14 figures, 14 tables; Code at + https://github.com/youquanl/M3Net +
+
+
+
+
+ + ☆ Customizing Text-to-Image Models with a Single Image Pair + + +
+ Art reinterpretation is the practice of creating a variation of a reference +work, making a paired artwork that exhibits a distinct artistic style. We ask +if such an image pair can be used to customize a generative model to capture +the demonstrated stylistic difference. We propose Pair Customization, a new +customization method that learns stylistic difference from a single image pair +and then applies the acquired style to the generation process. Unlike existing +methods that learn to mimic a single concept from a collection of images, our +method captures the stylistic difference between paired images. This allows us +to apply a stylistic change without overfitting to the specific image content +in the examples. To address this new task, we employ a joint optimization +method that explicitly separates the style and content into distinct LoRA +weight spaces. We optimize these style and content weights to reproduce the +style and content images while encouraging their orthogonality. During +inference, we modify the diffusion process via a new style guidance based on +our learned weights. Both qualitative and quantitative experiments show that +our method can effectively learn style while avoiding overfitting to image +content, highlighting the potential of modeling such stylistic differences from +a single image pair. + +
+
+ comment: project page: https://paircustomization.github.io/ +
+
+
+
+
+ + ☆ Plan-Seq-Learn: Language Model Guided RL for Solving Long Horizon + Robotics Tasks ICLR 2024 + + +
+ Large Language Models (LLMs) have been shown to be capable of performing +high-level planning for long-horizon robotics tasks, yet existing methods +require access to a pre-defined skill library (e.g. picking, placing, pulling, +pushing, navigating). However, LLM planning does not address how to design or +learn those behaviors, which remains challenging particularly in long-horizon +settings. Furthermore, for many tasks of interest, the robot needs to be able +to adjust its behavior in a fine-grained manner, requiring the agent to be +capable of modifying low-level control actions. Can we instead use the +internet-scale knowledge from LLMs for high-level policies, guiding +reinforcement learning (RL) policies to efficiently solve robotic control tasks +online without requiring a pre-determined set of skills? In this paper, we +propose Plan-Seq-Learn (PSL): a modular approach that uses motion planning to +bridge the gap between abstract language and learned low-level control for +solving long-horizon robotics tasks from scratch. We demonstrate that PSL +achieves state-of-the-art results on over 25 challenging robotics tasks with up +to 10 stages. PSL solves long-horizon tasks from raw visual input spanning four +benchmarks at success rates of over 85%, out-performing language-based, +classical, and end-to-end approaches. Video results and code at +https://mihdalal.github.io/planseqlearn/ + +
+
+ comment: Published at ICLR 2024. Website at + https://mihdalal.github.io/planseqlearn/ 9 pages, 3 figures, 3 tables; 14 + pages appendix (7 additional figures) +
+
+
+
+
+ + ☆ OmniDrive: A Holistic LLM-Agent Framework for Autonomous Driving with 3D + Perception, Reasoning and Planning + + +
+ The advances in multimodal large language models (MLLMs) have led to growing
+interest in LLM-based autonomous driving agents to leverage their strong
+reasoning capabilities. However, capitalizing on MLLMs' strong reasoning
+capabilities for improved planning behavior is challenging since planning
+requires full 3D situational awareness beyond 2D reasoning. To address this
+challenge, our work proposes a holistic framework for strong alignment between
+agent models and 3D driving tasks. Our framework starts with a novel 3D MLLM
+architecture that uses sparse queries to lift and compress visual
+representations into 3D before feeding them into an LLM. This query-based
+representation allows us to jointly encode dynamic objects and static map
+elements (e.g., traffic lanes), providing a condensed world model for
+perception-action alignment in 3D. We further propose OmniDrive-nuScenes, a new
+visual question-answering dataset challenging the true 3D situational awareness
+of a model with comprehensive visual question-answering (VQA) tasks, including
+scene description, traffic regulation, 3D grounding, counterfactual reasoning,
+decision making and planning. Extensive studies show the effectiveness of the
+proposed architecture as well as the importance of the VQA tasks for reasoning
+and planning in complex 3D scenes.
+
+
+
+
+
+
+ + ☆ Improving Intervention Efficacy via Concept Realignment in Concept + Bottleneck Models + + +
+ Concept Bottleneck Models (CBMs) ground image classification on
+human-understandable concepts to allow for interpretable model decisions.
+Crucially, the CBM design inherently allows for human interventions, in which
+expert users are given the ability to modify potentially misaligned concept
+choices to influence the decision behavior of the model in an interpretable
+fashion. However, existing approaches often require numerous human
+interventions per image to achieve strong performance, posing practical
+challenges in scenarios where obtaining human feedback is expensive. In this
+paper, we find that this is noticeably driven by an independent treatment of
+concepts during intervention, wherein a change of one concept does not
+influence the use of other ones in the model's final decision. To address this
+issue, we introduce a trainable concept intervention realignment module, which
+leverages concept relations to realign concept assignments post-intervention.
+Across standard, real-world benchmarks, we find that concept realignment can
+significantly improve intervention efficacy, reducing the number of
+interventions needed to reach a target classification performance or concept
+prediction accuracy. In addition, it easily integrates into existing
+concept-based architectures without requiring changes to the models themselves.
+This reduced cost of human-model collaboration is crucial to enhancing the
+feasibility of CBMs in resource-constrained environments.
+
+
+
+
+
+
+ + ☆ Track2Act: Predicting Point Tracks from Internet Videos enables Diverse + Zero-shot Robot Manipulation + + +
+ We seek to learn a generalizable goal-conditioned policy that enables
+zero-shot robot manipulation: interacting with unseen objects in novel scenes
+without test-time adaptation. While typical approaches rely on a large amount
+of demonstration data for such generalization, we propose an approach that
+leverages web videos to predict plausible interaction plans and learns a
+task-agnostic transformation to obtain robot actions in the real world. Our
+framework, Track2Act, predicts tracks of how points in an image should move in
+future time-steps based on a goal, and can be trained with diverse videos on
+the web including those of humans and robots manipulating everyday objects. We
+use these 2D track predictions to infer a sequence of rigid transforms of the
+object to be manipulated, and obtain robot end-effector poses that can be
+executed in an open-loop manner. We then refine this open-loop plan by
+predicting residual actions through a closed-loop policy trained with a few
+embodiment-specific demonstrations. We show that this approach of combining
+scalably learned track prediction with a residual policy requiring minimal
+in-domain robot-specific data enables zero-shot robot manipulation, and present
+a wide array of real-world robot manipulation results across unseen tasks,
+objects, and scenes. https://homangab.github.io/track2act/
+
+
+
+ comment: preprint +
+
+
+
+
+ + ☆ A separability-based approach to quantifying generalization: which layer + is best? + + +
+ Generalization to unseen data remains poorly understood for deep learning +classification and foundation models. How can one assess the ability of +networks to adapt to new or extended versions of their input space in the +spirit of few-shot learning, out-of-distribution generalization, and domain +adaptation? Which layers of a network are likely to generalize best? We provide +a new method for evaluating the capacity of networks to represent a sampled +domain, regardless of whether the network has been trained on all classes in +the domain. Our approach is the following: after fine-tuning state-of-the-art +pre-trained models for visual classification on a particular domain, we assess +their performance on data from related but distinct variations in that domain. +Generalization power is quantified as a function of the latent embeddings of +unseen data from intermediate layers for both unsupervised and supervised +settings. Working throughout all stages of the network, we find that (i) high +classification accuracy does not imply high generalizability; and (ii) deeper +layers in a model do not always generalize the best, which has implications for +pruning. Since the trends observed across datasets are largely consistent, we +conclude that our approach reveals (a function of) the intrinsic capacity of +the different layers of a model to generalize. + +
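+ <p>A rough sketch of the kind of layer-wise probing such a study implies
+(synthetic stand-in features and a plain linear probe; not the authors' exact
+metric or protocol):</p>
+ <pre><code>
+# Sketch: probe how linearly separable each layer's embeddings are on held-out
+# data (synthetic stand-in features; not the paper's exact protocol or metric).
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+n, n_classes, dim = 600, 5, 128
+labels = rng.integers(0, n_classes, size=n)
+
+# Stand-in for frozen per-layer embeddings of the same samples; deeper layers
+# here are simply made more class-separable for the sake of the demo.
+class_means = rng.normal(size=(n_classes, dim))
+features_per_layer = {
+    f"layer{i}": rng.normal(size=(n, dim)) + 0.5 * i * class_means[labels]
+    for i in range(1, 5)
+}
+
+for name, feats in features_per_layer.items():
+    x_tr, x_te, y_tr, y_te = train_test_split(feats, labels, random_state=0)
+    probe = LogisticRegression(max_iter=2000).fit(x_tr, y_tr)
+    print(name, "linear-probe accuracy:", round(probe.score(x_te, y_te), 3))
+</code></pre>
+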
+
+ comment: 6 pages, 5 figures
+
+
+
+
+
+ + ☆ Transformer-Aided Semantic Communications + + +
+ The transformer structure employed in large language models (LLMs), as a
+specialized category of deep neural networks (DNNs) featuring attention
+mechanisms, stands out for its ability to identify and highlight the most
+relevant aspects of input data. Such a capability is particularly beneficial in
+addressing a variety of communication challenges, notably in the realm of
+semantic communication, where proper encoding of the relevant data is critical,
+especially in systems with limited bandwidth. In this work, we employ vision
+transformers specifically for the purpose of compression and compact
+representation of the input image, with the goal of preserving semantic
+information throughout the transmission process. Through the use of the
+attention mechanism inherent in transformers, we create an attention mask. This
+mask effectively prioritizes critical segments of images for transmission,
+ensuring that the reconstruction phase focuses on key objects highlighted by
+the mask. Our methodology significantly improves the quality of semantic
+communication and optimizes bandwidth usage by encoding different parts of the
+data in accordance with their semantic information content, thus enhancing
+overall efficiency. We evaluate the effectiveness of our proposed framework
+using the TinyImageNet dataset, focusing on both reconstruction quality and
+accuracy. Our evaluation results demonstrate that our framework successfully
+preserves semantic information, even when only a fraction of the encoded data
+is transmitted, according to the intended compression rates.
+
+
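+ <p>A rough sketch of the masking idea (assumed shapes and a simple top-k
+rule; not the paper's architecture): rank patches by the [CLS] token's
+attention and transmit only the most attended ones.</p>
+ <pre><code>
+# Sketch: keep only the top-k image patches ranked by the [CLS] token's
+# attention, as a stand-in for an attention-based transmission mask.
+import torch
+
+def attention_mask_topk(cls_attn, keep_ratio=0.25):
+    """cls_attn: (batch, num_patches) attention of [CLS] over patch tokens."""
+    k = max(1, int(keep_ratio * cls_attn.shape[1]))
+    topk = cls_attn.topk(k, dim=1).indices
+    mask = torch.zeros_like(cls_attn)
+    return mask.scatter(1, topk, 1.0)            # 1 = transmit, 0 = drop
+
+cls_attn = torch.rand(2, 196)                    # e.g. 14x14 patches
+mask = attention_mask_topk(cls_attn, keep_ratio=0.25)
+print(mask.sum(dim=1))                           # 49 patches kept per image
+</code></pre>
+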
+
+
+
+
+ + ☆ PAM-UNet: Shifting Attention on Region of Interest in Medical Images + + +
+ Computer-aided segmentation methods can assist medical personnel in improving
+diagnostic outcomes. While recent advancements like UNet and its variants have
+shown promise, they face a critical challenge: balancing accuracy with
+computational efficiency. Shallow encoder architectures in UNets often struggle
+to capture crucial spatial features, leading to inaccurate and sparse
+segmentation. To address this limitation, we propose a novel Progressive
+Attention based Mobile UNet (PAM-UNet) architecture. The inverted residual
+(IR) blocks in PAM-UNet help maintain a lightweight framework, while layerwise
+Progressive Luong Attention (PLA) promotes precise segmentation by directing
+attention toward regions of interest during synthesis. Our approach prioritizes
+both accuracy and speed, achieving a commendable balance with a mean IoU of
+74.65 and a dice score of 82.87, while requiring only 1.32 floating-point
+operations per second (FLOPS) on the Liver Tumor Segmentation Benchmark (LiTS)
+2017 dataset. These results highlight the importance of developing efficient
+segmentation models to accelerate the adoption of AI in clinical practice.
+
+
+
+ comment: Accepted at 2024 IEEE EMBC +
+
+
+
+
+ + ☆ LocInv: Localization-aware Inversion for Text-Guided Image Editing CVPR 2024 + + +
+ Large-scale Text-to-Image (T2I) diffusion models demonstrate significant
+generation capabilities based on textual prompts. Based on the T2I diffusion
+models, text-guided image editing research aims to empower users to manipulate
+generated images by altering the text prompts. However, existing image editing
+techniques are prone to editing over unintentional regions that are beyond the
+intended target area, primarily due to inaccuracies in cross-attention maps. To
+address this problem, we propose Localization-aware Inversion (LocInv), which
+exploits segmentation maps or bounding boxes as extra localization priors to
+refine the cross-attention maps in the denoising phases of the diffusion
+process. Through the dynamic updating of tokens corresponding to noun words in
+the textual input, we compel the cross-attention maps to closely align with the
+correct noun and adjective words in the text prompt. Based on this technique,
+we achieve fine-grained image editing over particular objects while preventing
+undesired changes to other regions. Our method LocInv, based on the publicly
+available Stable Diffusion, is extensively evaluated on a subset of the COCO
+dataset, and consistently obtains superior results both quantitatively and
+qualitatively. The code will be released at
+https://github.com/wangkai930418/DPL
+
+
+
+ comment: Accepted by CVPR 2024 Workshop AI4CC +
+
+
+
+
+ + ☆ Navigating Heterogeneity and Privacy in One-Shot Federated Learning with + Diffusion Models + + +
+ Federated learning (FL) enables multiple clients to train models collectively +while preserving data privacy. However, FL faces challenges in terms of +communication cost and data heterogeneity. One-shot federated learning has +emerged as a solution by reducing communication rounds, improving efficiency, +and providing better security against eavesdropping attacks. Nevertheless, data +heterogeneity remains a significant challenge, impacting performance. This work +explores the effectiveness of diffusion models in one-shot FL, demonstrating +their applicability in addressing data heterogeneity and improving FL +performance. Additionally, we investigate the utility of our diffusion model +approach, FedDiff, compared to other one-shot FL methods under differential +privacy (DP). Furthermore, to improve generated sample quality under DP +settings, we propose a pragmatic Fourier Magnitude Filtering (FMF) method, +enhancing the effectiveness of generated data for global model training. + +
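+ <p>A minimal sketch of what a Fourier magnitude filter can look like (the
+cutoff and filter shape are assumptions; this is not the paper's FMF
+implementation):</p>
+ <pre><code>
+# Sketch: low-pass filter the Fourier magnitude of a generated image while
+# keeping its phase (an assumed form of magnitude filtering, not FedDiff's code).
+import numpy as np
+
+def fourier_magnitude_lowpass(img, keep_frac=0.25):
+    """img: (H, W) array; keep only the lowest `keep_frac` of frequencies."""
+    spectrum = np.fft.fftshift(np.fft.fft2(img))
+    mag, phase = np.abs(spectrum), np.angle(spectrum)
+    h, w = img.shape
+    cy, cx = h // 2, w // 2
+    ry, rx = int(keep_frac * h / 2), int(keep_frac * w / 2)
+    mask = np.zeros_like(mag)
+    mask[cy - ry:cy + ry, cx - rx:cx + rx] = 1.0
+    filtered = mask * mag * np.exp(1j * phase)
+    return np.real(np.fft.ifft2(np.fft.ifftshift(filtered)))
+
+print(fourier_magnitude_lowpass(np.random.rand(64, 64)).shape)  # (64, 64)
+</code></pre>
+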
+
+
+
+
+ + ☆ MANTIS: Interleaved Multi-Image Instruction Tuning + + +
+ Recent years have witnessed a great array of large multimodal models (LMMs)
+that effectively solve single-image vision-language tasks. However, their
+ability to solve multi-image vision-language tasks remains limited. The
+existing multi-image LMMs (e.g. OpenFlamingo, Emu, Idefics, etc) mostly gain
+their multi-image ability through pre-training on hundreds of millions of noisy
+interleaved image-text data from the web, which is neither efficient nor
+effective. In this paper, we aim at building strong multi-image LMMs via
+instruction tuning with academic-level resources. Therefore, we meticulously
+construct Mantis-Instruct, containing 721K instances from 14 multi-image
+datasets. We design Mantis-Instruct to cover different multi-image skills such
+as co-reference, reasoning, comparison, and temporal understanding. We combine
+Mantis-Instruct with several single-image visual-language datasets to train our
+model Mantis to handle any interleaved image-text inputs. We evaluate the
+trained Mantis on five multi-image benchmarks and eight single-image
+benchmarks. Though only requiring academic-level resources (i.e. 36 hours on
+16xA100-40G), Mantis-8B can achieve state-of-the-art performance on all the
+multi-image benchmarks and beats the existing best multi-image LMM Idefics2-8B
+by an average of 9 absolute points. We observe that Mantis performs equally
+well on the held-in and held-out evaluation benchmarks. We further evaluate
+Mantis on single-image benchmarks and demonstrate that Mantis can maintain a
+strong single-image performance on par with CogVLM and Emu2. Our results are
+particularly encouraging as they show that low-cost instruction tuning is
+indeed much more effective than intensive pre-training in terms of building
+multi-image LMMs.
+
+
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ V-FLUTE: Visual Figurative Language Understanding with Textual + Explanations + + +
+ Large Vision-Language models (VLMs) have demonstrated strong reasoning +capabilities in tasks requiring a fine-grained understanding of literal images +and text, such as visual question-answering or visual entailment. However, +there has been little exploration of these models' capabilities when presented +with images and captions containing figurative phenomena such as metaphors or +humor, the meaning of which is often implicit. To close this gap, we propose a +new task and a high-quality dataset: Visual Figurative Language Understanding +with Textual Explanations (V-FLUTE). We frame the visual figurative language +understanding problem as an explainable visual entailment task, where the model +has to predict whether the image (premise) entails a claim (hypothesis) and +justify the predicted label with a textual explanation. Using a human-AI +collaboration framework, we build a high-quality dataset, V-FLUTE, that +contains 6,027 instances spanning five +diverse multimodal figurative phenomena: metaphors, similes, idioms, sarcasm, +and humor. The figurative phenomena can be present either in the image, the +caption, or both. We further conduct both automatic and human evaluations to +assess current VLMs' capabilities in understanding figurative phenomena. + +
+
+
+
+
+
+ ☆ Advancing human-centric AI for robust X-ray analysis through holistic
+ self-supervised learning
+
+
+ AI Foundation models are gaining traction in various applications, including
+medical fields like radiology. However, medical foundation models are often
+tested on limited tasks, leaving their generalisability and biases unexplored.
+We present RayDINO, a large visual encoder trained by self-supervision on 873k
+chest X-rays. We compare RayDINO to previous state-of-the-art models across
+nine radiology tasks, from classification and dense segmentation to text
+generation, and provide an in-depth analysis of the population, age and sex
+biases of our model. Our findings suggest that self-supervision enables
+patient-centric AI that proves useful in clinical workflows and interprets
+X-rays holistically. With RayDINO and small task-specific adapters, we reach
+state-of-the-art results and improve generalization to unseen populations while
+mitigating bias, illustrating the true promise of foundation models:
+versatility and robustness.
+
+
+
+
+
+
+ + ☆ Understanding Retrieval-Augmented Task Adaptation for Vision-Language + Models ICML 2024 + + +
+ Pre-trained contrastive vision-language models have demonstrated remarkable
+performance across a wide range of tasks. However, they often struggle on
+fine-grained datasets with categories not adequately represented during
+pre-training, which makes adaptation necessary. Recent works have shown
+promising results by utilizing samples from web-scale databases for
+retrieval-augmented adaptation, especially in low-data regimes. Despite the
+empirical success, understanding how retrieval impacts the adaptation of
+vision-language models remains an open research question. In this work, we
+adopt a reflective perspective by presenting a systematic study to understand
+the roles of key components in retrieval-augmented adaptation. We unveil new
+insights on uni-modal and cross-modal retrieval and highlight the critical role
+of logit ensemble for effective adaptation. We further present theoretical
+underpinnings that directly support our empirical observations.
+
+
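+ <p>A minimal sketch of a logit ensemble in this spirit (a generic weighting
+of two logit sources; the inputs and weight are placeholders, not the paper's
+formulation):</p>
+ <pre><code>
+# Sketch: ensemble zero-shot CLIP-style logits with logits derived from
+# retrieved neighbors (a generic form; weights and inputs are placeholders).
+import torch
+
+def ensemble_logits(zero_shot_logits, retrieval_logits, alpha=0.5):
+    """Both tensors: (batch, num_classes); alpha weights the zero-shot branch."""
+    return alpha * zero_shot_logits + (1.0 - alpha) * retrieval_logits
+
+zs = torch.randn(4, 10)      # e.g. image-text similarity per class
+ret = torch.randn(4, 10)     # e.g. similarity to retrieved exemplars per class
+pred = ensemble_logits(zs, ret, alpha=0.7).argmax(dim=1)
+print(pred)
+</code></pre>
+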
+
+ comment: The paper is accepted at ICML 2024 +
+
+
+
+
+ + ☆ SATO: Stable Text-to-Motion Framework + + +
+ Is the text-to-motion model robust? Recent advancements in text-to-motion
+models primarily stem from more accurate predictions of specific actions.
+However, the text modality typically relies solely on pre-trained Contrastive
+Language-Image Pretraining (CLIP) models. Our research has uncovered a
+significant issue with the text-to-motion model: its predictions often exhibit
+inconsistent outputs, resulting in vastly different or even incorrect poses
+when presented with semantically similar or identical text inputs. In this
+paper, we undertake an analysis to elucidate the underlying causes of this
+instability, establishing a clear link between the unpredictability of model
+outputs and the erratic attention patterns of the text encoder module.
+Consequently, we introduce a formal framework aimed at addressing this issue,
+which we term the Stable Text-to-Motion Framework (SATO). SATO consists of
+three modules, each dedicated to stable attention, stable prediction, and
+maintaining a balance between accuracy and robustness. We present a methodology
+for constructing a SATO that satisfies the stability of both attention and
+prediction. To verify the stability of the model, we introduce a new textual
+synonym perturbation dataset based on HumanML3D and KIT-ML. Results show that
+SATO is significantly more stable against synonyms and other slight
+perturbations while maintaining high accuracy.
+
+
+
+
+
+
+ + ☆ Purify Unlearnable Examples via Rate-Constrained Variational + Autoencoders ICML 2024 + + +
+ Unlearnable examples (UEs) seek to maximize testing error by making subtle
+modifications to training examples that are correctly labeled. Defenses against
+these poisoning attacks can be categorized based on whether specific
+interventions are adopted during training. The first approach is training-time
+defense, such as adversarial training, which can mitigate poisoning effects but
+is computationally intensive. The other approach is pre-training purification,
+e.g., image short squeezing, which consists of several simple compressions but
+often encounters challenges in dealing with various UEs. Our work provides a
+novel disentanglement mechanism to build an efficient pre-training purification
+method. Firstly, we observe that rate-constrained variational autoencoders
+(VAEs) demonstrate a clear tendency to suppress the perturbations in UEs. We
+subsequently conduct a theoretical analysis of this phenomenon. Building upon
+these insights, we introduce a disentangle variational autoencoder (D-VAE),
+capable of disentangling the perturbations with learnable class-wise
+embeddings. Based on this network, a two-stage purification approach is
+naturally developed. The first stage focuses on roughly eliminating
+perturbations, while the second stage produces refined, poison-free results,
+ensuring effectiveness and robustness across various scenarios. Extensive
+experiments demonstrate the remarkable performance of our method across
+CIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at
+https://github.com/yuyi-sd/D-VAE.
+
+
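+ <p>A minimal sketch of a rate-constrained VAE objective (a generic soft
+constraint on the KL rate; not the D-VAE architecture or training recipe):</p>
+ <pre><code>
+# Sketch: a VAE loss where the KL term (the "rate") is kept near a target
+# budget via a soft penalty; a generic stand-in, not the paper's D-VAE.
+import torch
+import torch.nn.functional as F
+
+def rate_constrained_vae_loss(x, x_recon, mu, logvar, rate_budget=10.0, beta=1.0):
+    recon = F.mse_loss(x_recon, x, reduction="mean")
+    kl = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))
+    # Penalize only the part of the rate that exceeds the budget.
+    return recon + beta * torch.clamp(kl - rate_budget, min=0.0)
+
+x = torch.rand(8, 3 * 32 * 32)
+mu, logvar = torch.zeros(8, 64), torch.zeros(8, 64)
+print(rate_constrained_vae_loss(x, x * 0.9, mu, logvar))
+</code></pre>
+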
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ☆ Improving Domain Generalization on Gaze Estimation via Branch-out + Auxiliary Regularization + + +
+ Despite remarkable advancements, mainstream gaze estimation techniques, +particularly appearance-based methods, often suffer from performance +degradation in uncontrolled environments due to variations in illumination and +individual facial attributes. Existing domain adaptation strategies, limited by +their need for target domain samples, may fall short in real-world +applications. This letter introduces Branch-out Auxiliary Regularization (BAR), +an innovative method designed to boost gaze estimation's generalization +capabilities without requiring direct access to target domain data. +Specifically, BAR integrates two auxiliary consistency regularization branches: +one that uses augmented samples to counteract environmental variations, and +another that aligns gaze directions with positive source domain samples to +encourage the learning of consistent gaze features. These auxiliary pathways +strengthen the core network and are integrated in a smooth, plug-and-play +manner, facilitating easy adaptation to various other models. Comprehensive +experimental evaluations on four cross-dataset tasks demonstrate the +superiority of our approach. + +
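+ <p>A rough sketch of one auxiliary consistency term of this kind (the gaze
+representation, augmentation, and weighting are assumptions, not the BAR
+branches themselves):</p>
+ <pre><code>
+# Sketch: penalize disagreement between gaze predicted from an image and from
+# its augmented view (one generic consistency branch, not the full BAR method).
+import torch
+import torch.nn.functional as F
+
+def gaze_consistency_loss(gaze_main, gaze_aug):
+    """Both: (batch, 3) unit gaze vectors; 1 - cosine similarity as the penalty."""
+    return (1.0 - F.cosine_similarity(gaze_main, gaze_aug, dim=1)).mean()
+
+g1 = F.normalize(torch.randn(16, 3), dim=1)
+g2 = F.normalize(g1 + 0.05 * torch.randn(16, 3), dim=1)
+print(gaze_consistency_loss(g1, g2))
+</code></pre>
+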
+
+
+
+
+ + ☆ StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video + Generation + + +
+ For recent diffusion-based generative models, maintaining consistent content +across a series of generated images, especially those containing subjects and +complex details, presents a significant challenge. In this paper, we propose a +new way of self-attention calculation, termed Consistent Self-Attention, that +significantly boosts the consistency between the generated images and augments +prevalent pretrained diffusion-based text-to-image models in a zero-shot +manner. To extend our method to long-range video generation, we further +introduce a novel semantic space temporal motion prediction module, named +Semantic Motion Predictor. It is trained to estimate the motion conditions +between two provided images in the semantic spaces. This module converts the +generated sequence of images into videos with smooth transitions and consistent +subjects that are significantly more stable than the modules based on latent +spaces only, especially in the context of long video generation. By merging +these two novel components, our framework, referred to as StoryDiffusion, can +describe a text-based story with consistent images or videos encompassing a +rich variety of contents. The proposed StoryDiffusion encompasses pioneering +explorations in visual story generation with the presentation of images and +videos, which we hope could inspire more research from the aspect of +architectural modifications. Our code is made publicly available at +https://github.com/HVision-NKU/StoryDiffusion. + +
+
+
+
+
+ + ☆ MiniGPT-3D: Efficiently Aligning 3D Point Clouds with Large Language + Models using 2D Priors + + +
+ Large 2D vision-language models (2D-LLMs) have gained significant attention +by bridging Large Language Models (LLMs) with images using a simple projector. +Inspired by their success, large 3D point cloud-language models (3D-LLMs) also +integrate point clouds into LLMs. However, directly aligning point clouds with +LLM requires expensive training costs, typically in hundreds of GPU-hours on +A100, which hinders the development of 3D-LLMs. In this paper, we introduce +MiniGPT-3D, an efficient and powerful 3D-LLM that achieves multiple SOTA +results while training for only 27 hours on one RTX 3090. Specifically, we +propose to align 3D point clouds with LLMs using 2D priors from 2D-LLMs, which +can leverage the similarity between 2D and 3D visual information. We introduce +a novel four-stage training strategy for modality alignment in a cascaded way, +and a mixture of query experts module to adaptively aggregate features with +high efficiency. Moreover, we utilize parameter-efficient fine-tuning methods +LoRA and Norm fine-tuning, resulting in only 47.8M learnable parameters, which +is up to 260x fewer than existing methods. Extensive experiments show that +MiniGPT-3D achieves SOTA on 3D object classification and captioning tasks, with +significantly cheaper training costs. Notably, MiniGPT-3D gains an 8.12 +increase on GPT-4 evaluation score for the challenging object captioning task +compared to ShapeLLM-13B, while the latter costs 160 total GPU-hours on 8 A800. +We are the first to explore the efficient 3D-LLM, offering new insights to the +community. Code and weights are available at +https://github.com/TangYuan96/MiniGPT-3D. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Goal-conditioned reinforcement learning for ultrasound navigation + guidance + + +
+ Transesophageal echocardiography (TEE) plays a pivotal role in cardiology for +diagnostic and interventional procedures. However, using it effectively +requires extensive training due to the intricate nature of image acquisition +and interpretation. To enhance the efficiency of novice sonographers and reduce +variability in scan acquisitions, we propose a novel ultrasound (US) navigation +assistance method based on contrastive learning as goal-conditioned +reinforcement learning (GCRL). We augment the previous framework using a novel +contrastive patient batching method (CPB) and a data-augmented contrastive +loss, both of which we demonstrate are essential to ensure generalization to +anatomical variations across patients. The proposed framework enables +navigation to both standard diagnostic as well as intricate interventional +views with a single model. Our method was developed with a large dataset of 789 +patients and obtained an average error of 6.56 mm in position and 9.36 degrees +in angle on a testing dataset of 140 patients, which is competitive or superior +to models trained on individual views. Furthermore, we quantitatively validate +our method's ability to navigate to interventional views such as the Left +Atrial Appendage (LAA) view used in LAA closure. Our approach holds promise in +providing valuable guidance during transesophageal ultrasound examinations, +contributing to the advancement of skill acquisition for cardiac ultrasound +practitioners. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ ATOM: Attention Mixer for Efficient Dataset Distillation CVPR + + +
+ Recent works in dataset distillation seek to minimize training expenses by
+generating a condensed synthetic dataset that encapsulates the information
+present in a larger real dataset. These approaches ultimately aim to attain
+test accuracy levels akin to those achieved by models trained on the entirety
+of the original dataset. Previous studies in feature and distribution matching
+have achieved significant results without incurring the costs of bi-level
+optimization in the distillation process. Despite their convincing efficiency,
+many of these methods suffer from marginal downstream performance improvements,
+limited distillation of contextual information, and subpar cross-architecture
+generalization. To address these challenges in dataset distillation, we propose
+the ATtentiOn Mixer (ATOM) module to efficiently distill large datasets using a
+mixture of channel and spatial-wise attention in the feature matching process.
+Spatial-wise attention helps guide the learning process based on consistent
+localization of classes in their respective images, allowing for distillation
+from a broader receptive field. Meanwhile, channel-wise attention captures the
+contextual information associated with the class itself, thus making the
+synthetic image more informative for training. By integrating both types of
+attention, our ATOM module demonstrates superior performance across various
+computer vision datasets, including CIFAR10/100 and TinyImagenet. Notably, our
+method significantly improves performance in scenarios with a low number of
+images per class, thereby enhancing its potential. Furthermore, the improvement
+holds across architectures and in applications such as neural architecture
+search.
+
+
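+ <p>A compact sketch of mixing channel-wise and spatial attention over a
+feature map (a generic CBAM-style stand-in, not the ATOM module itself):</p>
+ <pre><code>
+# Sketch: weight features by channel attention (global context per channel)
+# and spatial attention (per-location saliency); a generic stand-in for ATOM.
+import torch
+import torch.nn as nn
+
+class ChannelSpatialAttention(nn.Module):
+    def __init__(self, channels, reduction=4):
+        super().__init__()
+        self.channel_mlp = nn.Sequential(
+            nn.Linear(channels, channels // reduction), nn.ReLU(),
+            nn.Linear(channels // reduction, channels), nn.Sigmoid())
+        self.spatial_conv = nn.Sequential(nn.Conv2d(1, 1, 7, padding=3), nn.Sigmoid())
+
+    def forward(self, x):                          # x: (B, C, H, W)
+        c = self.channel_mlp(x.mean(dim=(2, 3)))   # (B, C) channel weights
+        x = x * c[:, :, None, None]
+        s = self.spatial_conv(x.mean(dim=1, keepdim=True))  # (B, 1, H, W)
+        return x * s
+
+feats = torch.randn(2, 32, 16, 16)
+print(ChannelSpatialAttention(32)(feats).shape)    # torch.Size([2, 32, 16, 16])
+</code></pre>
+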
+
+ comment: Accepted for an oral presentation in CVPR-DD 2024 +
+
+
+
+
+ + ☆ Improving Subject-Driven Image Synthesis with Subject-Agnostic Guidance CVPR 2024 + + +
+ In subject-driven text-to-image synthesis, the synthesis process tends to be +heavily influenced by the reference images provided by users, often overlooking +crucial attributes detailed in the text prompt. In this work, we propose +Subject-Agnostic Guidance (SAG), a simple yet effective solution to remedy the +problem. We show that through constructing a subject-agnostic condition and +applying our proposed dual classifier-free guidance, one could obtain outputs +consistent with both the given subject and input text prompts. We validate the +efficacy of our approach through both optimization-based and encoder-based +methods. Additionally, we demonstrate its applicability in second-order +customization methods, where an encoder-based model is fine-tuned with +DreamBooth. Our approach is conceptually simple and requires only minimal code +modifications, but leads to substantial quality improvements, as evidenced by +our evaluations and user studies. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Sparse multi-view hand-object reconstruction for unseen environments CVPR + + +
+ Recent works in hand-object reconstruction mainly focus on the single-view +and dense multi-view settings. On the one hand, single-view methods can +leverage learned shape priors to generalise to unseen objects but are prone to +inaccuracies due to occlusions. On the other hand, dense multi-view methods are +very accurate but cannot easily adapt to unseen objects without further data +collection. In contrast, sparse multi-view methods can take advantage of the +additional views to tackle occlusion, while keeping the computational cost low +compared to dense multi-view methods. In this paper, we consider the problem of +hand-object reconstruction with unseen objects in the sparse multi-view +setting. Given multiple RGB images of the hand and object captured at the same +time, our model SVHO combines the predictions from each view into a unified +reconstruction without optimisation across views. We train our model on a +synthetic hand-object dataset and evaluate directly on a real world recorded +hand-object dataset with unseen objects. We show that while reconstruction of +unseen hands and objects from RGB is challenging, additional views can help +improve the reconstruction quality. + +
+
+ comment: Camera-ready version. Paper accepted to CVPRW 2024. 8 pages, 7 + figures, 1 table +
+
+
+
+
+ + ☆ Multi-view Action Recognition via Directed Gromov-Wasserstein + Discrepancy + + +
+ Action recognition has become one of the popular research topics in computer
+vision. Various methods based on convolutional networks and on self-attention
+mechanisms such as Transformers address the spatial and temporal dimensions of
+action recognition and achieve competitive performance. However, these methods
+lack a guarantee of the correctness of the action subject that the models give
+attention to, i.e., how to ensure that an action recognition model focuses on
+the proper action subject to make a reasonable action prediction. In this
+paper, we propose a multi-view attention consistency method that computes the
+similarity between two attentions from two different views of the action videos
+using Directed Gromov-Wasserstein Discrepancy. Furthermore, our approach
+applies the idea of Neural Radiance Fields to implicitly render the features
+from novel views when training on single-view datasets. Therefore, the
+contributions of this work are three-fold. Firstly, we introduce multi-view
+attention consistency to address the problem of reasonable prediction in action
+recognition. Secondly, we define a new metric for multi-view consistent
+attention using Directed Gromov-Wasserstein Discrepancy. Thirdly, we build an
+action recognition model based on Video Transformers and Neural Radiance
+Fields. Compared to recent action recognition methods, the proposed approach
+achieves state-of-the-art results on three large-scale datasets, i.e., Jester,
+Something-Something V2, and Kinetics-400.
+
+
+
+
+
+
+ + ☆ NeRF in Robotics: A Survey + + +
+ Meticulous 3D environment representations have been a longstanding goal in
+computer vision and robotics fields. The recent emergence of neural implicit
+representations has introduced radical innovation to this field as implicit
+representations enable numerous capabilities. Among these, the Neural Radiance
+Field (NeRF) has sparked a trend because of the huge representational
+advantages, such as simplified mathematical models, compact environment
+storage, and continuous scene representations. Apart from computer vision, NeRF
+has also shown tremendous potential in the field of robotics. Thus, we create
+this survey to provide a comprehensive understanding of NeRF in the field of
+robotics. By exploring the advantages and limitations of NeRF, as well as its
+current applications and future potential, we hope to shed light on this
+promising area of research. Our survey is divided into two main sections: "The
+Application of NeRF in Robotics" and "The Advance of NeRF in Robotics", from
+the perspective of how NeRF enters the field of robotics. In the first section,
+we introduce and analyze some works that have been or could be used in the
+field of robotics from the perception and interaction perspectives. In the
+second section, we show some works related to improving NeRF's own properties,
+which are essential for deploying NeRF in the field of robotics. In the
+discussion section of the review, we summarize the existing challenges and
+provide some valuable future research directions for reference.
+
+
+
+ comment: 21 pages, 19 figures +
+
+
+
+
+ + ☆ Multi-modal Learnable Queries for Image Aesthetics Assessment ICME2024 + + +
+ Image aesthetics assessment (IAA) is attracting wide interest with the +prevalence of social media. The problem is challenging due to its subjective +and ambiguous nature. Instead of directly extracting aesthetic features solely +from the image, user comments associated with an image could potentially +provide complementary knowledge that is useful for IAA. With existing +large-scale pre-trained models demonstrating strong capabilities in extracting +high-quality transferable visual and textual features, learnable queries are +shown to be effective in extracting useful features from the pre-trained visual +features. Therefore, in this paper, we propose MMLQ, which utilizes multi-modal +learnable queries to extract aesthetics-related features from multi-modal +pre-trained features. Extensive experimental results demonstrate that MMLQ +achieves new state-of-the-art performance on multi-modal IAA, beating previous +methods by 7.7% and 8.3% in terms of SRCC and PLCC, respectively. + +
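+ <p>A brief sketch of the learnable-query idea (dimensions, pooling, and the
+two feature streams are placeholders, not the MMLQ architecture):</p>
+ <pre><code>
+# Sketch: a small set of learnable queries cross-attends to concatenated
+# visual and textual features to pool aesthetics-related evidence.
+import torch
+import torch.nn as nn
+
+class LearnableQueryPooler(nn.Module):
+    def __init__(self, dim=256, num_queries=8, num_heads=4):
+        super().__init__()
+        self.queries = nn.Parameter(torch.randn(num_queries, dim))
+        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+        self.score_head = nn.Linear(dim, 1)
+
+    def forward(self, visual_feats, text_feats):    # (B, Nv, D), (B, Nt, D)
+        tokens = torch.cat([visual_feats, text_feats], dim=1)
+        q = self.queries.unsqueeze(0).expand(tokens.size(0), -1, -1)
+        pooled, _ = self.attn(q, tokens, tokens)    # queries attend to features
+        return self.score_head(pooled.mean(dim=1))  # (B, 1) aesthetic score
+
+model = LearnableQueryPooler()
+print(model(torch.randn(2, 196, 256), torch.randn(2, 32, 256)).shape)
+</code></pre>
+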
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ Imagine the Unseen: Occluded Pedestrian Detection via Adversarial + Feature Completion + + +
+ Pedestrian detection has significantly progressed in recent years, thanks to +the development of DNNs. However, detection performance at occluded scenes is +still far from satisfactory, as occlusion increases the intra-class variance of +pedestrians, hindering the model from finding an accurate classification +boundary between pedestrians and background clutters. From the perspective of +reducing intra-class variance, we propose to complete features for occluded +regions so as to align the features of pedestrians across different occlusion +patterns. An important premise for feature completion is to locate occluded +regions. From our analysis, channel features of different pedestrian proposals +only show high correlation values at visible parts and thus feature +correlations can be used to model occlusion patterns. In order to narrow down +the gap between completed features and real fully visible ones, we propose an +adversarial learning method, which completes occluded features with a generator +such that they can hardly be distinguished by the discriminator from real fully +visible features. We report experimental results on the CityPersons, Caltech +and CrowdHuman datasets. On CityPersons, we show significant improvements over +five different baseline detectors, especially on the heavy occlusion subset. +Furthermore, we show that our proposed method FeatComp++ achieves +state-of-the-art results on all the above three datasets without relying on +extra cues. + +
+
+
+
+
+ + ☆ Towards Inclusive Face Recognition Through Synthetic Ethnicity + Alteration + + +
+ Numerous studies have shown that existing Face Recognition Systems (FRS), +including commercial ones, often exhibit biases toward certain ethnicities due +to under-represented data. In this work, we explore ethnicity alteration and +skin tone modification using synthetic face image generation methods to +increase the diversity of datasets. We conduct a detailed analysis by first +constructing a balanced face image dataset representing three ethnicities: +Asian, Black, and Indian. We then make use of existing Generative Adversarial +Network-based (GAN) image-to-image translation and manifold learning models to +alter the ethnicity from one to another. A systematic analysis is further +conducted to assess the suitability of such datasets for FRS by studying the +realistic skin-tone representation using Individual Typology Angle (ITA). +Further, we also analyze the quality characteristics using existing Face image +quality assessment (FIQA) approaches. We then provide a holistic FRS +performance analysis using four different systems. Our findings pave the way +for future research works in (i) developing both specific ethnicity and general +(any to any) ethnicity alteration models, (ii) expanding such approaches to +create databases with diverse skin tones, (iii) creating datasets representing +various ethnicities which further can help in mitigating bias while addressing +privacy concerns. + +
+
+ comment: 8 Pages +
+
+
+
+
+ + ☆ Towards Consistent Object Detection via LiDAR-Camera Synergy + + +
+ As human-machine interaction continues to evolve, the capacity for +environmental perception is becoming increasingly crucial. Integrating the two +most common types of sensory data, images, and point clouds, can enhance +detection accuracy. However, currently, no model exists that can simultaneously +detect an object's position in both point clouds and images and ascertain their +corresponding relationship. This information is invaluable for human-machine +interactions, offering new possibilities for their enhancement. In light of +this, this paper introduces an end-to-end Consistency Object Detection (COD) +algorithm framework that requires only a single forward inference to +simultaneously obtain an object's position in both point clouds and images and +establish their correlation. Furthermore, to assess the accuracy of the object +correlation between point clouds and images, this paper proposes a new +evaluation metric, Consistency Precision (CP). To verify the effectiveness of +the proposed framework, an extensive set of experiments has been conducted on +the KITTI and DAIR-V2X datasets. The study also explored how the proposed +consistency detection method performs on images when the calibration parameters +between images and point clouds are disturbed, compared to existing +post-processing methods. The experimental results demonstrate that the proposed +method exhibits excellent detection performance and robustness, achieving +end-to-end consistency detection. The source code will be made publicly +available at https://github.com/xifen523/COD. + +
+
+ comment: The source code will be made publicly available at + https://github.com/xifen523/COD +
+
+
+
+
+ + ☆ Evaluation of Video-Based rPPG in Challenging Environments: Artifact + Mitigation and Network Resilience + + +
+ Video-based remote photoplethysmography (rPPG) has emerged as a promising
+technology for non-contact vital sign monitoring, especially under controlled
+conditions. However, the accurate measurement of vital signs in real-world
+scenarios faces several challenges, including artifacts induced by video
+codecs, low-light noise, degradation, low dynamic range, occlusions, and
+hardware and network constraints. In this article, we systematically and
+comprehensively investigate these issues, measuring their detrimental effects
+on the quality of rPPG measurements. Additionally, we propose practical
+strategies for mitigating these challenges to improve the dependability and
+resilience of video-based rPPG systems. We detail methods for effective
+biosignal recovery in the presence of network limitations and present denoising
+and inpainting techniques aimed at preserving video frame integrity. Through
+extensive evaluations and direct comparisons, we demonstrate the effectiveness
+of these approaches in enhancing rPPG measurements under challenging
+environments, contributing to the development of more reliable and effective
+remote vital sign monitoring technologies.
+
+
+ comment: 22 main article pages with 3 supplementary pages, journal +
+
+
+
+
+ + ☆ RaffeSDG: Random Frequency Filtering enabled Single-source Domain + Generalization for Medical Image Segmentation + + +
+ Deep learning models often encounter challenges in making accurate inferences +when there are domain shifts between the source and target data. This issue is +particularly pronounced in clinical settings due to the scarcity of annotated +data resulting from the professional and private nature of medical data. +Despite the existence of decent solutions, many of them are hindered in +clinical settings due to limitations in data collection and computational +complexity. To tackle domain shifts in data-scarce medical scenarios, we +propose a Random frequency filtering enabled Single-source Domain +Generalization algorithm (RaffeSDG), which promises robust out-of-domain +inference with segmentation models trained on a single-source domain. A +filter-based data augmentation strategy is first proposed to promote domain +variability within a single-source domain by introducing variations in +frequency space and blending homologous samples. Then Gaussian filter-based +structural saliency is also leveraged to learn robust representations across +augmented samples, further facilitating the training of generalizable +segmentation models. To validate the effectiveness of RaffeSDG, we conducted +extensive experiments involving out-of-domain inference on segmentation tasks +for three human tissues imaged by four diverse modalities. Through thorough +investigations and comparisons, compelling evidence was observed in these +experiments, demonstrating the potential and generalizability of RaffeSDG. The +code is available at +https://github.com/liamheng/Non-IID_Medical_Image_Segmentation. + +
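+ A rough sketch of the kind of filter-based frequency-space augmentation
+described above is given below: the amplitude spectrum of a sample is randomly
+rescaled and blended with that of a homologous sample from the same source
+domain. The blending coefficient alpha and the uniform random filter are
+illustrative assumptions, not the exact RaffeSDG recipe.
+
+import numpy as np
+
+def random_frequency_augment(img, ref, alpha=0.3, rng=None):
+    """Perturb the amplitude spectrum of `img` and blend it with a homologous
+    sample `ref` from the same source domain. Both are HxW float arrays."""
+    rng = np.random.default_rng() if rng is None else rng
+    f_img = np.fft.fftshift(np.fft.fft2(img))
+    f_ref = np.fft.fftshift(np.fft.fft2(ref))
+    amp_img, phase_img = np.abs(f_img), np.angle(f_img)
+    amp_ref = np.abs(f_ref)
+    # random (assumed uniform) filter applied over the amplitude spectrum
+    noise = rng.uniform(1.0 - alpha, 1.0 + alpha, size=amp_img.shape)
+    amp_aug = noise * ((1 - alpha) * amp_img + alpha * amp_ref)
+    f_aug = amp_aug * np.exp(1j * phase_img)
+    # phase is kept, so image content is preserved while style/frequency varies
+    return np.real(np.fft.ifft2(np.fft.ifftshift(f_aug)))
+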
+
+
+
+
+ + ☆ CromSS: Cross-modal pre-training with noisy labels for remote sensing + image segmentation ICLR 2024 + + +
+ We study the potential of noisy labels y to pretrain semantic segmentation
+models in a multi-modal learning framework for geospatial applications.
+Specifically, we propose a novel Cross-modal Sample Selection method (CromSS)
+that utilizes the class distributions P^{(d)}(x,c) over pixels x and classes c
+modelled by multiple sensors/modalities d of a given geospatial scene.
+Consistency of predictions across sensors d is jointly informed by the entropy
+of P^{(d)}(x,c). Noisy label sampling is determined by the confidence of each
+sensor d in the noisy class label, P^{(d)}(x,c=y(x)). To verify the performance
+of our approach, we conduct experiments with Sentinel-1 (radar) and Sentinel-2
+(optical) satellite imagery from the globally-sampled SSL4EO-S12 dataset. We
+pair those scenes with 9-class noisy labels sourced from the Google Dynamic
+World project for pretraining. Transfer learning evaluations (downstream task)
+on the DFC2020 dataset confirm the effectiveness of the proposed method for
+remote sensing image segmentation.
+
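+ To make the two quantities above concrete, the sketch below computes the
+per-sensor confidence P^{(d)}(x,c=y(x)) and the pixel-wise entropy of
+P^{(d)}(x,c) from softmax probability maps. The array shapes and the
+minimum-confidence selection rule are illustrative assumptions, not the exact
+CromSS sampling procedure.
+
+import numpy as np
+
+def sensor_confidence(P_d, y):
+    """Confidence of sensor d in the noisy label: P^{(d)}(x, c=y(x)).
+    P_d has shape (C, H, W); y has shape (H, W) with integer class indices."""
+    H, W = y.shape
+    return P_d[y, np.arange(H)[:, None], np.arange(W)[None, :]]
+
+def sensor_entropy(P_d, eps=1e-8):
+    """Pixel-wise entropy of P^{(d)}(x, c), a proxy for prediction consistency."""
+    return -(P_d * np.log(P_d + eps)).sum(axis=0)
+
+def select_pixels(P, y, conf_thresh=0.7):
+    """Keep pixels where every sensor is sufficiently confident in the noisy label."""
+    conf = np.stack([sensor_confidence(P_d, y) for P_d in P])  # (D, H, W)
+    return conf.min(axis=0) > conf_thresh
+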
+
+ comment: Accepted as an oral presentation by ICLR 2024 ML4RS workshop +
+
+
+
+
+ + ☆ Error-Driven Uncertainty Aware Training + + +
+ Neural networks are often overconfident about their predictions, which
+undermines their reliability and trustworthiness. In this work, we present a
+novel technique, named Error-Driven Uncertainty Aware Training (EUAT), which
+aims to enhance the ability of neural models to estimate their uncertainty
+correctly, namely, to exhibit high uncertainty when they output inaccurate
+predictions and low uncertainty when their output is accurate. The EUAT
+approach operates during the model's training phase by selectively employing
+two loss functions depending on whether the training examples are correctly or
+incorrectly predicted by the model. This allows for pursuing the twofold goal
+of i) minimizing model uncertainty for correctly predicted inputs and ii)
+maximizing uncertainty for mispredicted inputs, while preserving the model's
+misprediction rate. We evaluate EUAT using diverse neural models and datasets
+in the image recognition domain, considering both non-adversarial and
+adversarial settings. The results show that EUAT outperforms existing
+approaches for uncertainty estimation (including other uncertainty-aware
+training techniques, calibration, ensembles, and DEUP) by providing uncertainty
+estimates that not only have higher quality when evaluated via statistical
+metrics (e.g., correlation with residuals) but are also more useful when
+employed to build binary classifiers that decide whether the model's output can
+be trusted or not, including under distributional data shifts.
+
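+ As a rough illustration of the error-driven split described above, the
+PyTorch-style sketch below applies one loss term to correctly predicted samples
+(fit the label and shrink predictive entropy) and another to mispredicted
+samples (fit the label while pushing entropy up). The concrete loss composition
+and the entropy weight lam are assumptions, not the authors' exact losses.
+
+import torch
+import torch.nn.functional as F
+
+def euat_step(model, x, y, optimizer, lam=1.0):
+    logits = model(x)
+    probs = F.softmax(logits, dim=1)
+    pred = logits.argmax(dim=1)
+    correct = pred.eq(y)
+    entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1)
+    ce = F.cross_entropy(logits, y, reduction="none")
+    # correct samples: minimize uncertainty; mispredicted samples: maximize it,
+    # while the cross-entropy term keeps the misprediction rate in check.
+    loss_correct = (ce + lam * entropy)[correct].sum()
+    loss_wrong = (ce - lam * entropy)[~correct].sum()
+    loss = (loss_correct + loss_wrong) / x.size(0)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+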
+
+
+
+
+ + ☆ Towards Cross-Scale Attention and Surface Supervision for Fractured Bone + Segmentation in CT + + +
+ Bone segmentation is an essential step for the preoperative planning of +fracture trauma surgery. The automated segmentation of fractured bone from +computed tomography (CT) scans remains challenging, due to the large +differences of fractures in position and morphology, and also the inherent +anatomical characteristics of different bone structures. To alleviate these +issues, we propose a cross-scale attention mechanism as well as a surface +supervision strategy for fractured bone segmentation in CT. Specifically, a +cross-scale attention mechanism is introduced to effectively aggregate the +features among different scales to provide more powerful fracture +representation. Moreover, a surface supervision strategy is employed, which +explicitly constrains the network to pay more attention to the bone boundary. +The efficacy of the proposed method is evaluated on a public dataset containing +CT scans with hip fractures. The evaluation metrics are Dice similarity +coefficient (DSC), average symmetric surface distance (ASSD), and Hausdorff +distance (95HD). The proposed method achieves an average DSC of 93.36%, ASSD of +0.85mm, 95HD of 7.51mm. Our method offers an effective fracture segmentation +approach for the pelvic CT examinations, and has the potential to be used for +improving the segmentation performance of other types of fractures. + +
+
+
+
+
+ + ☆ Latent Fingerprint Matching via Dense Minutia Descriptor + + +
+ Latent fingerprint matching is a daunting task, primarily due to the poor +quality of latent fingerprints. In this study, we propose a deep-learning based +dense minutia descriptor (DMD) for latent fingerprint matching. A DMD is +obtained by extracting the fingerprint patch aligned by its central minutia, +capturing detailed minutia information and texture information. Our dense +descriptor takes the form of a three-dimensional representation, with two +dimensions associated with the original image plane and the other dimension +representing the abstract features. Additionally, the extraction process +outputs the fingerprint segmentation map, ensuring that the descriptor is only +valid in the foreground region. The matching between two descriptors occurs in +their overlapping regions, with a score normalization strategy to reduce the +impact brought by the differences outside the valid area. Our descriptor +achieves state-of-the-art performance on several latent fingerprint datasets. +Overall, our DMD is more representative and interpretable compared to previous +methods. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Imagine2touch: Predictive Tactile Sensing for Robotic Manipulation using + Efficient Low-Dimensional Signals ICRA2024 + + +
+ Humans seemingly incorporate potential touch signals in their perception. Our
+goal is to equip robots with a similar capability, which we term Imagine2touch.
+Imagine2touch aims to predict the expected touch signal based on a visual patch
+representing the area to be touched. We use ReSkin, an inexpensive and compact
+touch sensor, to collect the required dataset through random touching of five
+basic geometric shapes and one tool. We train Imagine2touch on two of those
+shapes and validate it on the out-of-distribution tool. We demonstrate the
+efficacy of Imagine2touch through its application to the downstream task of
+object recognition. In this task, we evaluate Imagine2touch's performance in
+two experiments, together comprising five out-of-training-distribution objects.
+Imagine2touch achieves an object recognition accuracy of 58% after ten touches
+per object, surpassing a proprioception baseline.
+
+
+ comment: 3 pages, 3 figures, 2 tables, accepted at ViTac2024 ICRA2024 + Workshop. arXiv admin note: substantial text overlap with arXiv:2403.15107 +
+
+
+
+
+ + ☆ Uncertainty-aware self-training with expectation maximization basis + transformation + + +
+ Self-training is a powerful approach to deep learning. The key process is to
+find a pseudo-label for modeling. However, previous self-training algorithms
+suffer from the over-confidence issue brought by the hard labels, and even
+confidence-related regularizers cannot comprehensively capture the uncertainty.
+Therefore, we propose a new self-training framework to combine uncertainty
+information of both the model and the dataset. Specifically, we propose to use
+Expectation-Maximization (EM) to smooth the labels and comprehensively estimate
+the uncertainty information. We further design a basis extraction network to
+estimate the initial basis from the dataset. The obtained basis with
+uncertainty can be filtered based on uncertainty information. It can then be
+transformed into the real hard label to iteratively update the model and basis
+in the retraining process. Experiments on image classification and semantic
+segmentation show the advantages of our method over confidence-aware
+self-training algorithms, with improvements of 1-3 percentage points on
+different datasets.
+
+
+
+
+
+ + ☆ GroupedMixer: An Entropy Model with Group-wise Token-Mixers for Learned + Image Compression + + +
+ Transformer-based entropy models have gained prominence in recent years due +to their superior ability to capture long-range dependencies in probability +distribution estimation compared to convolution-based methods. However, +previous transformer-based entropy models suffer from a sluggish coding process +due to pixel-wise autoregression or duplicated computation during inference. In +this paper, we propose a novel transformer-based entropy model called +GroupedMixer, which enjoys both faster coding speed and better compression +performance than previous transformer-based methods. Specifically, our approach +builds upon group-wise autoregression by first partitioning the latent +variables into groups along spatial-channel dimensions, and then entropy coding +the groups with the proposed transformer-based entropy model. The global causal +self-attention is decomposed into more efficient group-wise interactions, +implemented using inner-group and cross-group token-mixers. The inner-group +token-mixer incorporates contextual elements within a group while the +cross-group token-mixer interacts with previously decoded groups. Alternate +arrangement of two token-mixers enables global contextual reference. To further +expedite the network inference, we introduce context cache optimization to +GroupedMixer, which caches attention activation values in cross-group +token-mixers and avoids complex and duplicated computation. Experimental +results demonstrate that the proposed GroupedMixer yields the state-of-the-art +rate-distortion performance with fast compression speed. + +
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ☆ Self-Supervised Learning for Interventional Image Analytics: Towards + Robust Device Trackers + + +
+ An accurate detection and tracking of devices such as guiding catheters in
+live X-ray image acquisitions is an essential prerequisite for endovascular
+cardiac interventions. This information is leveraged for procedural guidance,
+e.g., directing stent placements. To ensure procedural safety and efficacy,
+there is a need for high robustness, i.e., no failures during tracking. To
+achieve that, one needs to efficiently tackle challenges such as: device
+obscuration by contrast agent or other external devices or wires, changes in
+field-of-view or acquisition angle, as well as the continuous movement due to
+cardiac and respiratory motion. To overcome the aforementioned challenges, we
+propose a novel approach to learn spatio-temporal features from a very large
+data cohort of over 16 million interventional X-ray frames using
+self-supervision for image sequence data. Our approach is based on a masked
+image modeling technique that leverages frame-interpolation-based
+reconstruction to learn fine inter-frame temporal correspondences. The features
+encoded in the resulting model are fine-tuned downstream. Our approach achieves
+state-of-the-art performance and, in particular, robustness compared to highly
+optimized reference solutions (that use multi-stage feature fusion, multi-task
+learning and flow regularization). The experiments show that our method
+achieves a 66.31% reduction in maximum tracking error against reference
+solutions (23.20% when flow regularization is used), achieving a success score
+of 97.95% at a 3x faster inference speed of 42 frames per second (on GPU). The
+results encourage the use of our approach in various other tasks within
+interventional image analytics that require effective understanding of
+spatio-temporal semantics.
+
+
+
+
+
+ + ☆ Automated Virtual Product Placement and Assessment in Images using + Diffusion Models CVPR + 2024 + + +
+ In Virtual Product Placement (VPP) applications, the discrete integration of +specific brand products into images or videos has emerged as a challenging yet +important task. This paper introduces a novel three-stage fully automated VPP +system. In the first stage, a language-guided image segmentation model +identifies optimal regions within images for product inpainting. In the second +stage, Stable Diffusion (SD), fine-tuned with a few example product images, is +used to inpaint the product into the previously identified candidate regions. +The final stage introduces an "Alignment Module", which is designed to +effectively sieve out low-quality images. Comprehensive experiments demonstrate +that the Alignment Module ensures the presence of the intended product in every +generated image and enhances the average quality of images by 35%. The results +presented in this paper demonstrate the effectiveness of the proposed VPP +system, which holds significant potential for transforming the landscape of +virtual advertising and marketing strategies. + +
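+ A hedged sketch of a three-stage VPP-style pipeline of this kind is shown
+below. The specific models (CLIPSeg for language-guided segmentation and a
+Stable Diffusion inpainting checkpoint) and the prompts are illustrative
+assumptions; the paper's own fine-tuned SD weights and its Alignment Module are
+not reproduced here.
+
+import torch
+from PIL import Image
+from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
+from diffusers import StableDiffusionInpaintPipeline
+
+seg_proc = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
+seg_model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
+inpaint = StableDiffusionInpaintPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
+).to("cuda")
+
+def place_product(image: Image.Image, region_prompt: str, product_prompt: str):
+    image = image.convert("RGB").resize((512, 512))
+    # Stage 1: language-guided segmentation proposes a placement region.
+    inputs = seg_proc(text=[region_prompt], images=[image], return_tensors="pt")
+    with torch.no_grad():
+        logits = seg_model(**inputs).logits
+    mask_arr = (torch.sigmoid(logits).squeeze().numpy() * 255).astype("uint8")
+    mask = Image.fromarray(mask_arr).resize(image.size)
+    # Stage 2: inpaint the product into the proposed region.
+    out = inpaint(prompt=product_prompt, image=image, mask_image=mask).images[0]
+    # Stage 3 (not shown): an alignment/quality filter would verify that the
+    # intended product is actually present before accepting the result.
+    return out, mask
+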
+
+ comment: Accepted at the 6th AI for Content Creation (AI4CC) workshop at CVPR + 2024 +
+
+
+
+
+ + ☆ Detecting and clustering swallow events in esophageal long-term + high-resolution manometry + + +
+ High-resolution manometry (HRM) is the gold standard in diagnosing esophageal +motility disorders. As HRM is typically conducted under short-term laboratory +settings, intermittently occurring disorders are likely to be missed. +Therefore, long-term (up to 24h) HRM (LTHRM) is used to gain detailed insights +into the swallowing behavior. However, analyzing the extensive data from LTHRM +is challenging and time consuming as medical experts have to analyze the data +manually, which is slow and prone to errors. To address this challenge, we +propose a Deep Learning based swallowing detection method to accurately +identify swallowing events and secondary non-deglutitive-induced esophageal +motility disorders in LTHRM data. We then proceed with clustering the +identified swallows into distinct classes, which are analyzed by highly +experienced clinicians to validate the different swallowing patterns. We +evaluate our computational pipeline on a total of 25 LTHRMs, which were +meticulously annotated by medical experts. By detecting more than 94% of all +relevant swallow events and providing all relevant clusters for a more reliable +diagnostic process among experienced clinicians, we are able to demonstrate the +effectiveness as well as positive clinical impact of our approach to make LTHRM +feasible in clinical care. + +
+
+
+
+
+ + ☆ Investigating Self-Supervised Image Denoising with Denaturation + + +
+ Self-supervised learning for image denoising problems in the presence of
+denaturation for noisy data is a crucial approach in machine learning. However,
+theoretical understanding of the performance of the approach that uses
+denatured data is lacking. To provide a better understanding of the approach,
+in this paper we analyze a self-supervised denoising algorithm that uses
+denatured data in depth through theoretical analysis and numerical experiments.
+Through the theoretical analysis, we discuss how the algorithm finds desired
+solutions to the optimization problem with the population risk, while the
+guarantee for the empirical risk depends on the hardness of the denoising task
+in terms of denaturation levels. We also conduct several experiments to
+investigate the performance of an extended algorithm in practice. The results
+indicate that the algorithm trained with denatured images works, and the
+empirical performance aligns with the theoretical results. These results
+suggest several insights for further improvement of self-supervised image
+denoising that uses denatured data, pointing to future directions.
+
+
+
+
+
+ + ☆ Domain-Transferred Synthetic Data Generation for Improving Monocular + Depth Estimation + + +
+ A major obstacle to the development of effective monocular depth estimation +algorithms is the difficulty in obtaining high-quality depth data that +corresponds to collected RGB images. Collecting this data is time-consuming and +costly, and even data collected by modern sensors has limited range or +resolution, and is subject to inconsistencies and noise. To combat this, we +propose a method of data generation in simulation using 3D synthetic +environments and CycleGAN domain transfer. We compare this method of data +generation to the popular NYUDepth V2 dataset by training a depth estimation +model based on the DenseDepth structure using different training sets of real +and simulated data. We evaluate the performance of the models on newly +collected images and LiDAR depth data from a Husky robot to verify the +generalizability of the approach and show that GAN-transformed data can serve +as an effective alternative to real-world data, particularly in depth +estimation. + +
+
+
+
+
+ + ☆ Sports Analysis and VR Viewing System Based on Player Tracking and Pose + Estimation with Multimodal and Multiview Sensors + + +
+ Sports analysis and viewing play a pivotal role in the current sports domain,
+offering significant value not only to coaches and athletes but also to fans
+and the media. In recent years, the rapid development of virtual reality (VR)
+and augmented reality (AR) technologies has introduced a new platform for
+watching games. Visualization of sports competitions in VR/AR represents a
+revolutionary technology, providing audiences with a novel immersive viewing
+experience. However, there is still a lack of related research in this area. In
+this work, we present for the first time a comprehensive system for sports
+competition analysis and real-time visualization on VR/AR platforms. First, we
+utilize multiview LiDARs and cameras to collect multimodal game data.
+Subsequently, we propose a framework for multi-player tracking and pose
+estimation based on a limited amount of supervised data, which extracts precise
+player positions and movements from point clouds and images. Moreover, we
+perform avatar modeling of players to obtain their 3D models. Ultimately, using
+these 3D player data, we conduct competition analysis and real-time
+visualization on VR/AR. Extensive quantitative experiments demonstrate the
+accuracy and robustness of our multi-player tracking and pose estimation
+framework. The visualization results showcase the immense potential of our
+sports visualization system in the domain of watching games on VR/AR devices.
+The multimodal competition dataset we collected and all related code will be
+released soon.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2312.06409 +
+
+
+
+
+ + ☆ Federated Learning with Heterogeneous Data Handling for Robust Vehicular + Object Detection + + +
+ In the pursuit of refining precise perception models for fully autonomous +driving, continual online model training becomes essential. Federated Learning +(FL) within vehicular networks offers an efficient mechanism for model training +while preserving raw sensory data integrity. Yet, FL struggles with +non-identically distributed data (e.g., quantity skew), leading to suboptimal +convergence rates during model training. In previous work, we introduced FedLA, +an innovative Label-Aware aggregation method addressing data heterogeneity in +FL for generic scenarios. + In this paper, we introduce FedProx+LA, a novel FL method building upon the +state-of-the-art FedProx and FedLA to tackle data heterogeneity, which is +specifically tailored for vehicular networks. We evaluate the efficacy of +FedProx+LA in continuous online object detection model training. Through a +comparative analysis against conventional and state-of-the-art methods, our +findings reveal the superior convergence rate of FedProx+LA. Notably, if the +label distribution is very heterogeneous, our FedProx+LA approach shows +substantial improvements in detection performance compared to baseline methods, +also outperforming our previous FedLA approach. Moreover, both FedLA and +FedProx+LA increase convergence speed by 30% compared to baseline methods. + +
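+ For context, the sketch below shows the FedProx-style local update that
+FedProx+LA builds on: each client optimizes its local loss plus a proximal term
+(mu/2)||w - w_global||^2 that pulls local weights toward the current global
+model. The label-aware aggregation itself is only hinted at in the comments;
+the hyperparameter values and function names are assumptions.
+
+import torch
+
+def local_update(model, global_model, loader, optimizer, mu=0.01, criterion=None):
+    """One FedProx-style local training pass on a single client."""
+    criterion = criterion or torch.nn.CrossEntropyLoss()
+    global_params = [p.detach().clone() for p in global_model.parameters()]
+    model.train()
+    for x, y in loader:
+        optimizer.zero_grad()
+        loss = criterion(model(x), y)
+        # FedProx proximal term: (mu / 2) * ||w - w_global||^2
+        prox = sum((p - g).pow(2).sum()
+                   for p, g in zip(model.parameters(), global_params))
+        (loss + 0.5 * mu * prox).backward()
+        optimizer.step()
+    # The server would then aggregate client updates; a label-aware scheme
+    # weights clients by their label distributions rather than uniformly.
+    return model.state_dict()
+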
+
+
+
+
+ + ☆ Image segmentation of treated and untreated tumor spheroids by Fully + Convolutional Networks + + +
+ Multicellular tumor spheroids (MCTS) are advanced cell culture systems for
+assessing the impact of combinatorial radio(chemo)therapy. They exhibit
+therapeutically relevant in-vivo-like characteristics from 3D cell-cell and
+cell-matrix interactions to radial pathophysiological gradients related to
+proliferative activity and nutrient/oxygen supply, altering cellular
+radioresponse. State-of-the-art assays quantify long-term curative endpoints
+based on collected brightfield image time series from large treated spheroid
+populations per irradiation dose and treatment arm. Here, spheroid control
+probabilities are documented analogous to in-vivo tumor control probabilities
+based on Kaplan-Meier curves. These analyses require laborious spheroid
+segmentation of up to 100,000 images per treatment arm to extract relevant
+structural information from the images, e.g., diameter, area, volume and
+circularity. While several image analysis algorithms are available for spheroid
+segmentation, they all focus on compact MCTS with a clearly distinguishable
+outer rim throughout growth. However, treated MCTS may partly be detached and
+destroyed and are usually obscured by dead cell debris. We successfully train
+two Fully Convolutional Networks, UNet and HRNet, and optimize their
+hyperparameters to develop an automatic segmentation for both untreated and
+treated MCTS. We systematically validate the automatic segmentation on larger,
+independent data sets of spheroids derived from two human head-and-neck cancer
+cell lines. We find an excellent overlap between manual and automatic
+segmentation for most images, quantified by Jaccard indices at around 90%. For
+images with smaller overlap of the segmentations, we demonstrate that this
+error is comparable to the variations across segmentations from different
+biological experts, suggesting that these images represent biologically unclear
+or ambiguous cases.
+
+
+ comment: 28 pages, 21 figures +
+
+
+
+
+ + ☆ Enhancing Person Re-Identification via Uncertainty Feature Fusion and + Wise Distance Aggregation + + +
+ The quest for robust Person re-identification (Re-ID) systems capable of +accurately identifying subjects across diverse scenarios remains a formidable +challenge in surveillance and security applications. This study presents a +novel methodology that significantly enhances Person Re-Identification (Re-ID) +by integrating Uncertainty Feature Fusion (UFFM) with Wise Distance Aggregation +(WDA). Tested on benchmark datasets - Market-1501, DukeMTMC-ReID, and MSMT17 - +our approach demonstrates substantial improvements in Rank-1 accuracy and mean +Average Precision (mAP). Specifically, UFFM capitalizes on the power of feature +synthesis from multiple images to overcome the limitations imposed by the +variability of subject appearances across different views. WDA further refines +the process by intelligently aggregating similarity metrics, thereby enhancing +the system's ability to discern subtle but critical differences between +subjects. The empirical results affirm the superiority of our method over +existing approaches, achieving new performance benchmarks across all evaluated +datasets. Code is available on Github. + +
+
+
+
+
+ + ☆ Transformers Fusion across Disjoint Samples for Hyperspectral Image + Classification + + +
+ The 3D Swin Transformer (3D-ST), known for its hierarchical attention and
+window-based processing, excels in capturing intricate spatial relationships
+within images. The Spatial-spectral Transformer (SST), meanwhile, specializes
+in modeling long-range dependencies through self-attention mechanisms.
+Therefore, this paper introduces a novel method: an attentional fusion of these
+two transformers to significantly enhance the classification performance of
+Hyperspectral Images (HSIs). What sets this approach apart is its emphasis on
+the integration of attentional mechanisms from both architectures. This
+integration not only refines the modeling of spatial and spectral information
+but also contributes to achieving more precise and accurate classification
+results. The experimentation and evaluation on benchmark HSI datasets
+underscore the importance of employing disjoint training, validation, and test
+samples. The results demonstrate the effectiveness of the fusion approach,
+showcasing its superiority over traditional methods and individual
+transformers. Incorporating disjoint samples enhances the robustness and
+reliability of the proposed methodology, emphasizing its potential for
+advancing hyperspectral image classification.
+
+
+
+
+
+ + ☆ Learning Object States from Actions via Large Language Models + + +
+ Temporally localizing the presence of object states in videos is crucial in +understanding human activities beyond actions and objects. This task has +suffered from a lack of training data due to object states' inherent ambiguity +and variety. To avoid exhaustive annotation, learning from transcribed +narrations in instructional videos would be intriguing. However, object states +are less described in narrations compared to actions, making them less +effective. In this work, we propose to extract the object state information +from action information included in narrations, using large language models +(LLMs). Our observation is that LLMs include world knowledge on the +relationship between actions and their resulting object states, and can infer +the presence of object states from past action sequences. The proposed +LLM-based framework offers flexibility to generate plausible pseudo-object +state labels against arbitrary categories. We evaluate our method with our +newly collected Multiple Object States Transition (MOST) dataset including +dense temporal annotation of 60 object state categories. Our model trained by +the generated pseudo-labels demonstrates significant improvement of over 29% in +mAP against strong zero-shot vision-language models, showing the effectiveness +of explicitly extracting object state information from actions through LLMs. + +
+
+ comment: 19 pages of main content, 24 pages of supplementary material +
+
+
+
+
+ + ☆ Type2Branch: Keystroke Biometrics based on a Dual-branch Architecture + with Attention Mechanisms and Set2set Loss + + +
+ In 2021, the pioneering work on TypeNet showed that keystroke dynamics +verification could scale to hundreds of thousands of users with minimal +performance degradation. Recently, the KVC-onGoing competition has provided an +open and robust experimental protocol for evaluating keystroke dynamics +verification systems of such scale, including considerations of algorithmic +fairness. This article describes Type2Branch, the model and techniques that +achieved the lowest error rates at the KVC-onGoing, in both desktop and mobile +scenarios. The novelty aspects of the proposed Type2Branch include: i) +synthesized timing features emphasizing user behavior deviation from the +general population, ii) a dual-branch architecture combining recurrent and +convolutional paths with various attention mechanisms, iii) a new loss function +named Set2set that captures the global structure of the embedding space, and +iv) a training curriculum of increasing difficulty. Considering five enrollment +samples per subject of approximately 50 characters typed, the proposed +Type2Branch achieves state-of-the-art performance with mean per-subject EERs of +0.77% and 1.03% on evaluation sets of respectively 15,000 and 5,000 subjects +for desktop and mobile scenarios. With a uniform global threshold for all +subjects, the EERs are 3.25% for desktop and 3.61% for mobile, outperforming +previous approaches by a significant margin. + +
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Single Image Super-Resolution Based on Global-Local Information Synergy + + +
+ Although several image super-resolution solutions exist, they still face many
+challenges. CNN-based algorithms, despite the reduction in computational
+complexity, still need to improve their accuracy. While Transformer-based
+algorithms have higher accuracy, their ultra-high computational complexity
+makes them difficult to adopt in practical applications. To overcome the
+existing challenges, a novel super-resolution reconstruction algorithm is
+proposed in this paper. The algorithm achieves a significant increase in
+accuracy through a unique design while maintaining low complexity. The core of
+the algorithm lies in its cleverly designed Global-Local Information Extraction
+Module and Basic Block Module. By combining global and local information, the
+Global-Local Information Extraction Module aims to understand the image content
+more comprehensively so as to recover the global structure and local details in
+the image more accurately, which provides rich information support for the
+subsequent reconstruction process. Experimental results show that the
+comprehensive performance of the algorithm proposed in this paper is optimal,
+providing an efficient and practical new solution in the field of
+super-resolution reconstruction.
+
+
+
+
+
+ + ☆ MCMS: Multi-Category Information and Multi-Scale Stripe Attention for + Blind Motion Deblurring + + +
+ Deep learning-based motion deblurring techniques have advanced significantly +in recent years. This class of techniques, however, does not carefully examine +the inherent flaws in blurry images. For instance, low edge and structural +information are traits of blurry images. The high-frequency component of blurry +images is edge information, and the low-frequency component is structure +information. A blind motion deblurring network (MCMS) based on multi-category +information and multi-scale stripe attention mechanism is proposed. Given the +respective characteristics of the high-frequency and low-frequency components, +a three-stage encoder-decoder model is designed. Specifically, the first stage +focuses on extracting the features of the high-frequency component, the second +stage concentrates on extracting the features of the low-frequency component, +and the third stage integrates the extracted low-frequency component features, +the extracted high-frequency component features, and the original blurred image +in order to recover the final clear image. As a result, the model effectively +improves motion deblurring by fusing the edge information of the high-frequency +component and the structural information of the low-frequency component. In +addition, a grouped feature fusion technique is developed so as to achieve +richer, more three-dimensional and comprehensive utilization of various types +of features at a deep level. Next, a multi-scale stripe attention mechanism +(MSSA) is designed, which effectively combines the anisotropy and multi-scale +information of the image, a move that significantly enhances the capability of +the deep model in feature representation. Large-scale comparative studies on +various datasets show that the strategy in this paper works better than the +recently published measures. + +
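+ As a minimal illustration of the high-/low-frequency view motivating the
+three-stage design above, the sketch below separates a blurry image into a
+low-frequency (structural) and a high-frequency (edge) component with a
+Gaussian low-pass filter. Using a Gaussian blur for the split is an assumption
+for illustration, not necessarily the decomposition used in MCMS.
+
+import cv2
+import numpy as np
+
+def split_frequencies(img: np.ndarray, sigma: float = 3.0):
+    """Return (low, high): structural low-frequency part and edge-like residual."""
+    low = cv2.GaussianBlur(img, (0, 0), sigma)               # structure
+    high = img.astype(np.float32) - low.astype(np.float32)   # edges / details
+    return low, high
+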
+
+
+
+
+ + ☆ Poisoning Attacks on Federated Learning for Autonomous Driving SC + + +
+ Federated Learning (FL) is a decentralized learning paradigm, enabling parties
+to collaboratively train models while keeping their data confidential. Within
+autonomous driving, it brings the potential to reduce data storage costs,
+reduce bandwidth requirements, and accelerate learning. FL is, however,
+susceptible to poisoning attacks. In this paper, we introduce two novel
+poisoning attacks on FL tailored to regression tasks within autonomous driving:
+FLStealth and Off-Track Attack (OTA). FLStealth, an untargeted attack, aims at
+providing model updates that deteriorate the global model performance while
+appearing benign. OTA, on the other hand, is a targeted attack with the
+objective to change the global model's behavior when exposed to a certain
+trigger. We demonstrate the effectiveness of our attacks by conducting
+comprehensive experiments pertaining to the task of vehicle trajectory
+prediction. In particular, we show that, among five different untargeted
+attacks, FLStealth is the most successful at bypassing the considered defenses
+employed by the server. For OTA, we demonstrate the inability of common defense
+strategies to mitigate the attack, highlighting the critical need for new
+defensive mechanisms against targeted attacks within FL for autonomous driving.
+
+
+ comment: Accepted to SCAI2024 +
+
+
+
+
+ + ☆ Callico: a Versatile Open-Source Document Image Annotation Platform ICDAR 2024 + + +
+ This paper presents Callico, a web-based open source platform designed to +simplify the annotation process in document recognition projects. The move +towards data-centric AI in machine learning and deep learning underscores the +importance of high-quality data, and the need for specialised tools that +increase the efficiency and effectiveness of generating such data. For document +image annotation, Callico offers dual-display annotation for digitised +documents, enabling simultaneous visualisation and annotation of scanned images +and text. This capability is critical for OCR and HTR model training, document +layout analysis, named entity recognition, form-based key value annotation or +hierarchical structure annotation with element grouping. The platform supports +collaborative annotation with versatile features backed by a commitment to open +source development, high-quality code standards and easy deployment via Docker. +Illustrative use cases - including the transcription of the Belfort municipal +registers, the indexing of French World War II prisoners for the ICRC, and the +extraction of personal information from the Socface project's census lists - +demonstrate Callico's applicability and utility. + +
+
+ comment: Accepted to ICDAR 2024 +
+
+
+
+
+ + ☆ HandSSCA: 3D Hand Mesh Reconstruction with State Space Channel Attention + from RGB images + + +
+ Reconstructing a hand mesh from a single RGB image is a challenging task
+because hands are often occluded by objects. Most previous works attempted to
+introduce additional information and adopt attention mechanisms to improve 3D
+reconstruction results, but this increases computational complexity. This
+observation prompts us to propose a new and concise architecture while
+improving computational efficiency. In this work, we propose a simple and
+effective 3D hand mesh reconstruction network, HandSSCA, which is the first to
+incorporate state space modeling into the field of hand pose estimation. In the
+network, we have designed a novel state space channel attention module that
+extends the effective receptive field, extracts hand features in the spatial
+dimension, and enhances hand regional features in the channel dimension. This
+design helps to reconstruct a complete and detailed hand mesh. Extensive
+experiments conducted on well-known datasets featuring challenging hand-object
+occlusions (such as FREIHAND, DEXYCB, and HO3D) demonstrate that our proposed
+HandSSCA achieves state-of-the-art performance while maintaining a minimal
+parameter count.
+
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ MFDS-Net: Multi-Scale Feature Depth-Supervised Network for Remote + Sensing Change Detection with Global Semantic and Detail Information + + +
+ Change detection, as an interdisciplinary topic in the fields of computer
+vision and remote sensing, has been receiving extensive attention and research.
+Due to the rapid development of society, the geographic information captured by
+remote sensing satellites is changing faster and becoming more complex, which
+undoubtedly poses a higher challenge and highlights the value of change
+detection tasks. We propose MFDS-Net, a Multi-Scale Feature Depth-Supervised
+Network for Remote Sensing Change Detection with Global Semantic and Detail
+Information, with the aim of achieving a more refined description of changing
+buildings as well as geographic information, enhancing the localisation of
+changing targets and the acquisition of weak features. To achieve the research
+objectives, we use a modified ResNet_34 as the backbone network to perform
+feature extraction and DO-Conv as an alternative to traditional convolution to
+better focus on the association between feature information and to obtain
+better training results. We propose the Global Semantic Enhancement Module
+(GSEM) to enhance the processing of high-level semantic information from a
+global perspective. The Differential Feature Integration Module (DFIM) is
+proposed to strengthen the fusion of feature information at different depths,
+achieving learning and extraction of differential features. The entire network
+is trained and optimized using a deep supervision mechanism.
+ The experimental outcomes of MFDS-Net surpass those of current mainstream
+change detection networks. On the LEVIR dataset, it achieved an F1 score of
+91.589 and an IoU of 84.483; on the WHU dataset, the scores were F1: 92.384 and
+IoU: 86.807; and on the GZ-CD dataset, the scores were F1: 86.377 and IoU:
+76.021. The code is available at https://github.com/AOZAKIiii/MFDS-Net
+
+
+
+
+
+ + ☆ A text-based, generative deep learning model for soil reflectance + spectrum simulation in the VIS-NIR (400-2499 nm) bands + + +
+ Simulating soil reflectance spectra is invaluable for soil-plant radiative
+modeling and training machine learning models, yet it is difficult due to the
+intricate relationships between soil structure and its constituents. To address
+this, a fully data-driven soil optics generative model (SOGM) for simulation of
+soil reflectance spectra based on soil property inputs was developed. The model
+is trained on an extensive dataset comprising nearly 180,000 soil
+spectra-property pairs from 17 datasets. It generates soil reflectance spectra
+from text-based inputs describing soil properties and their values rather than
+only numerical values and labels in binary vector format. The generative model
+can simulate output spectra based on an incomplete set of input properties.
+SOGM is based on the denoising diffusion probabilistic model (DDPM). Two
+additional sub-models were also built to complement the SOGM: a spectral
+padding model that can fill in the gaps for spectra shorter than the full
+visible-near-infrared range (VIS-NIR; 400 to 2499 nm), and a wet soil spectra
+model that can estimate the effects of water content on soil reflectance
+spectra given the dry spectrum predicted by the SOGM. The SOGM was up-scaled by
+coupling with the Helios 3D plant modeling software, which allowed for the
+generation of synthetic aerial images of simulated soil and plant scenes. It
+can also be easily integrated with soil-plant radiation models used in remote
+sensing research, such as PROSAIL. Testing results of the SOGM on new datasets
+that were not included in model training show that the model can generate
+reasonable soil reflectance spectra based on available property inputs. The
+presented models are openly accessible at:
+https://github.com/GEMINI-Breeding/SOGM_soil_spectra_simulation.
+
+
+ comment: The paper has been submitted to Remote sensing of Environment and + revised +
+
+
+
+
+ + ☆ Continual Learning for Robust Gate Detection under Dynamic Lighting in + Autonomous Drone Racing IJCNN + + +
+ In autonomous and mobile robotics, a principal challenge is resilient +real-time environmental perception, particularly in situations characterized by +unknown and dynamic elements, as exemplified in the context of autonomous drone +racing. This study introduces a perception technique for detecting drone racing +gates under illumination variations, which is common during high-speed drone +flights. The proposed technique relies upon a lightweight neural network +backbone augmented with capabilities for continual learning. The envisaged +approach amalgamates predictions of the gates' positional coordinates, +distance, and orientation, encapsulating them into a cohesive pose tuple. A +comprehensive number of tests serve to underscore the efficacy of this approach +in confronting diverse and challenging scenarios, specifically those involving +variable lighting conditions. The proposed methodology exhibits notable +robustness in the face of illumination variations, thereby substantiating its +effectiveness. + +
+
+ comment: 8 pages, 6 figures, in 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ Few Shot Class Incremental Learning using Vision-Language models + + +
+ Recent advancements in deep learning have demonstrated remarkable performance +comparable to human capabilities across various supervised computer vision +tasks. However, the prevalent assumption of having an extensive pool of +training data encompassing all classes prior to model training often diverges +from real-world scenarios, where limited data availability for novel classes is +the norm. The challenge emerges in seamlessly integrating new classes with few +samples into the training data, demanding the model to adeptly accommodate +these additions without compromising its performance on base classes. To +address this exigency, the research community has introduced several solutions +under the realm of few-shot class incremental learning (FSCIL). + In this study, we introduce an innovative FSCIL framework that utilizes +language regularizer and subspace regularizer. During base training, the +language regularizer helps incorporate semantic information extracted from a +Vision-Language model. The subspace regularizer helps in facilitating the +model's acquisition of nuanced connections between image and text semantics +inherent to base classes during incremental training. Our proposed framework +not only empowers the model to embrace novel classes with limited data, but +also ensures the preservation of performance on base classes. To substantiate +the efficacy of our approach, we conduct comprehensive experiments on three +distinct FSCIL benchmarks, where our framework attains state-of-the-art +performance. + +
+
+ comment: under review at Pattern Recognition Letters +
+
+
+
+
+ + ☆ Technical Report of NICE Challenge at CVPR 2024: Caption Re-ranking + Evaluation Using Ensembled CLIP and Consensus Scores + + +
+ This report presents the ECO (Ensembled Clip score and cOnsensus score)
+pipeline from team DSBA LAB, which is a new framework used to evaluate and rank
+captions for a given image. ECO selects the most accurate caption describing a
+given image. This is made possible by combining an Ensembled CLIP score, which
+considers the semantic alignment between the image and captions, with a
+Consensus score that accounts for the essentialness of the captions. Using this
+framework, we achieved notable success in the CVPR 2024 Workshop Challenge on
+Caption Re-ranking Evaluation at the New Frontiers for Zero-Shot Image
+Captioning Evaluation (NICE). Specifically, we secured third place based on the
+CIDEr metric, second in both the SPICE and METEOR metrics, and first in the
+ROUGE-L and all BLEU Score metrics. The code and configuration for the ECO
+framework are available at https://github.com/DSBA-Lab/ECO .
+
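+ The general recipe, an image-text CLIP similarity combined with an
+inter-caption consensus term, can be sketched as follows. The specific CLIP
+checkpoint, the softmax normalization, and the weight w_consensus are
+illustrative assumptions rather than the actual ECO configuration.
+
+import torch
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def rank_captions(image, captions, w_consensus=0.3):
+    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        out = model(**inputs)
+        clip_scores = out.logits_per_image[0]                 # image-caption similarity
+        text_emb = out.text_embeds / out.text_embeds.norm(dim=-1, keepdim=True)
+    sim = text_emb @ text_emb.T                               # caption-caption similarity
+    consensus = (sim.sum(dim=1) - 1.0) / max(len(captions) - 1, 1)
+    final = (1 - w_consensus) * clip_scores.softmax(dim=0) + w_consensus * consensus
+    return sorted(zip(captions, final.tolist()), key=lambda t: -t[1])
+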
+
+
+
+
+ + ☆ Addressing Diverging Training Costs using Local Restoration for Precise + Bird's Eye View Map Construction + + +
+ Recent advancements in Bird's Eye View (BEV) fusion for map construction have +demonstrated remarkable mapping of urban environments. However, their deep and +bulky architecture incurs substantial amounts of backpropagation memory and +computing latency. Consequently, the problem poses an unavoidable bottleneck in +constructing high-resolution (HR) BEV maps, as their large-sized features cause +significant increases in costs including GPU memory consumption and computing +latency, named diverging training costs issue. Affected by the problem, most +existing methods adopt low-resolution (LR) BEV and struggle to estimate the +precise locations of urban scene components like road lanes, and sidewalks. As +the imprecision leads to risky self-driving, the diverging training costs issue +has to be resolved. In this paper, we address the issue with our novel Trumpet +Neural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs +an up-sampled semantic BEV map to create a memory-efficient pipeline. To this +end, we introduce Local Restoration of BEV representation. Specifically, the +up-sampled BEV representation has severely aliased, blocky signals, and thick +semantic labels. Our proposed Local Restoration restores the signals and thins +(or narrows down) the width of the labels. Our extensive experiments show that +the TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby +enabling the effective estimation of real-sized (or precise) semantic labels +for BEV map construction. + +
+
+
+
+
+ + ☆ Correcting Biased Centered Kernel Alignment Measures in Biological and + Artificial Neural Networks ICLR 2024 + + +
+ Centred Kernel Alignment (CKA) has recently emerged as a popular metric to +compare activations from biological and artificial neural networks (ANNs) in +order to quantify the alignment between internal representations derived from +stimuli sets (e.g. images, text, video) that are presented to both systems. In +this paper we highlight issues that the community should take into account if +using CKA as an alignment metric with neural data. Neural data are in the +low-data high-dimensionality domain, which is one of the cases where (biased) +CKA results in high similarity scores even for pairs of random matrices. Using +fMRI and MEG data from the THINGS project, we show that if biased CKA is +applied to representations of different sizes in the low-data +high-dimensionality domain, they are not directly comparable due to biased +CKA's sensitivity to differing feature-sample ratios and not stimuli-driven +responses. This situation can arise both when comparing a pre-selected area of +interest (e.g. ROI) to multiple ANN layers, as well as when determining to +which ANN layer multiple regions of interest (ROIs) / sensor groups of +different dimensionality are most similar. We show that biased CKA can be +artificially driven to its maximum value when using independent random data of +different sample-feature ratios. We further show that shuffling sample-feature +pairs of real neural data does not drastically alter biased CKA similarity in +comparison to unshuffled data, indicating an undesirable lack of sensitivity to +stimuli-driven neural responses. Positive alignment of true stimuli-driven +responses is only achieved by using debiased CKA. Lastly, we report findings +that suggest biased CKA is sensitive to the inherent structure of neural data, +only differing from shuffled data when debiased CKA detects stimuli-driven +alignment. + +
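+ For reference, the commonly used (biased) linear CKA that the analysis above
+critiques can be written in a few lines; X and Y are (samples x features)
+activation matrices, and the debiased variant replaces the Frobenius inner
+products with an unbiased HSIC estimator (not shown here).
+
+import numpy as np
+
+def linear_cka(X, Y):
+    """Biased linear CKA between two (n_samples, n_features) matrices."""
+    X = X - X.mean(axis=0, keepdims=True)
+    Y = Y - Y.mean(axis=0, keepdims=True)
+    hsic = np.linalg.norm(Y.T @ X, ord="fro") ** 2
+    norm_x = np.linalg.norm(X.T @ X, ord="fro")
+    norm_y = np.linalg.norm(Y.T @ Y, ord="fro")
+    return hsic / (norm_x * norm_y)
+
+# In the low-data, high-dimensionality regime this value is large even for
+# independent random matrices, e.g. linear_cka(np.random.randn(20, 5000),
+# np.random.randn(20, 3000)) is typically close to 1, illustrating the
+# small-sample bias discussed above.
+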
+
+ comment: ICLR 2024 Re-Align Workshop +
+
+
+
+
+ + ☆ On Mechanistic Knowledge Localization in Text-to-Image Generative Models ICML 2024 + + +
+ Identifying layers within text-to-image models which control visual attributes
+can facilitate efficient model editing through closed-form updates. Recent work
+leveraging causal tracing shows that early Stable-Diffusion variants confine
+knowledge primarily to the first layer of the CLIP text-encoder, while it
+diffuses throughout the UNet. Extending this framework, we observe that for
+recent models (e.g., SD-XL, DeepFloyd), causal tracing fails to pinpoint
+localized knowledge, highlighting challenges in model editing. To address this
+issue, we introduce the concept of Mechanistic Localization in text-to-image
+models, where knowledge about various visual attributes (e.g., ``style",
+``objects", ``facts") can be mechanistically localized to a small fraction of
+layers in the UNet, thus facilitating efficient model editing. We localize
+knowledge using our method LocoGen, which measures the direct effect of
+intermediate layers on output generation by performing interventions in the
+cross-attention layers of the UNet. We then employ LocoEdit, a fast closed-form
+editing method, across popular open-source text-to-image models (including the
+latest SD-XL) and explore the possibilities of neuron-level model editing.
+Using Mechanistic Localization, our work offers a better view of successes and
+failures in localization-based text-to-image model editing. Code will be
+available at
+\href{https://github.com/samyadeepbasu/LocoGen}{https://github.com/samyadeepbasu/LocoGen}.
+
+
+ comment: Appearing in ICML 2024 +
+
+
+
+
+ + ☆ Deep Learning Models in Speech Recognition: Measuring GPU Energy + Consumption, Impact of Noise and Model Quantization for Edge Deployment + + +
+ Recent transformer-based ASR models have achieved word-error rates (WER) below
+4%, surpassing human annotator accuracy, yet they demand extensive server
+resources, contributing to significant carbon footprints. The traditional
+server-based architecture of ASR also presents privacy concerns, alongside
+reliability and latency issues due to network dependencies. In contrast,
+on-device (edge) ASR enhances privacy, boosts performance, and promotes
+sustainability by effectively balancing energy use and accuracy for specific
+applications. This study examines the effects of quantization, memory demands,
+and energy consumption on the performance of various ASR model inference on the
+NVIDIA Jetson Orin Nano. By analyzing WER and transcription speed across models
+using FP32, FP16, and INT8 quantization on clean and noisy datasets, we
+highlight the crucial trade-offs between accuracy, speed, quantization, energy
+efficiency, and memory needs. We found that changing precision from FP32 to
+FP16 halves the energy consumption for audio transcription across different
+models, with minimal performance degradation. A larger model size and number of
+parameters neither guarantees better resilience to noise nor predicts the
+energy consumption for a given transcription load. These findings, along with
+several others, offer novel insights for optimizing ASR systems within energy-
+and memory-limited environments, crucial for the development of efficient
+on-device ASR solutions. The code and input data needed to reproduce the
+results in this article are open source and available at
+[https://github.com/zzadiues3338/ASR-energy-jetson].
+
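+ The precision switch at the center of the study can be reproduced in outline
+with a few lines; the Whisper checkpoint below is an illustrative example and
+not necessarily one of the models benchmarked in the paper.
+
+import torch
+from transformers import pipeline
+
+# FP32 baseline vs. FP16 on the GPU; on embedded GPUs such as the Jetson Orin
+# Nano, the FP16 variant is reported to roughly halve energy per transcription.
+asr_fp32 = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+asr_fp16 = pipeline("automatic-speech-recognition", model="openai/whisper-small",
+                    torch_dtype=torch.float16, device=0)
+
+print(asr_fp16("sample.wav")["text"])
+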
+
+
+
+
+ + ☆ Spider: A Unified Framework for Context-dependent Concept Understanding ICML 2024 + + +
+ Different from the context-independent (CI) concepts such as human, car, and +airplane, context-dependent (CD) concepts require higher visual understanding +ability, such as camouflaged object and medical lesion. Despite the rapid +advance of many CD understanding tasks in respective branches, the isolated +evolution leads to their limited cross-domain generalisation and repetitive +technique innovation. Since there is a strong coupling relationship between +foreground and background context in CD tasks, existing methods require to +train separate models in their focused domains. This restricts their real-world +CD concept understanding towards artificial general intelligence (AGI). We +propose a unified model with a single set of parameters, Spider, which only +needs to be trained once. With the help of the proposed concept filter driven +by the image-mask group prompt, Spider is able to understand and distinguish +diverse strong context-dependent concepts to accurately capture the Prompter's +intention. Without bells and whistles, Spider significantly outperforms the +state-of-the-art specialized models in 8 different context-dependent +segmentation tasks, including 4 natural scenes (salient, camouflaged, and +transparent objects and shadow) and 4 medical lesions (COVID-19, polyp, breast, +and skin lesion with color colonoscopy, CT, ultrasound, and dermoscopy +modalities). Besides, Spider shows obvious advantages in continuous learning. +It can easily complete the training of new tasks by fine-tuning parameters less +than 1\% and bring a tolerable performance degradation of less than 5\% for all +old tasks. The source code will be publicly available at +\href{https://github.com/Xiaoqi-Zhao-DLUT/Spider-UniCDSeg}{Spider-UniCDSeg}. + +
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ☆ Part-aware Shape Generation with Latent 3D Diffusion of Neural Voxel + Fields + + +
+ This paper presents a novel latent 3D diffusion model for the generation of +neural voxel fields, aiming to achieve accurate part-aware structures. Compared +to existing methods, there are two key designs to ensure high-quality and +accurate part-aware generation. On one hand, we introduce a latent 3D diffusion +process for neural voxel fields, enabling generation at significantly higher +resolutions that can accurately capture rich textural and geometric details. On +the other hand, a part-aware shape decoder is introduced to integrate the part +codes into the neural voxel fields, guiding the accurate part decomposition and +producing high-quality rendering results. Through extensive experimentation and +comparisons with state-of-the-art methods, we evaluate our approach across four +different classes of data. The results demonstrate the superior generative +capabilities of our proposed method in part-aware shape generation, +outperforming existing state-of-the-art methods. + +
+
+
+
+
+ + ☆ Estimate the building height at a 10-meter resolution based on Sentinel + data + + +
+ Building height is an important indicator for scientific research and
+practical applications. However, building height products with a high spatial
+resolution (10m) are still very scarce. To meet the needs of high-resolution
+building height estimation models, this study established a set of
+spatial-spectral-temporal feature databases, combining SAR data provided by
+Sentinel-1, optical data provided by Sentinel-2, and shape data provided by
+building footprints. Statistical indicators on the time scale are extracted to
+form a rich database of 160 features. This study combined permutation feature
+importance, Shapley Additive Explanations, and Random Forest variable
+importance, and the final stable features were obtained through an expert
+scoring system. This study took 12 large, medium, and small cities in the
+United States as the training data and used moving windows to aggregate pixels
+in order to mitigate the impact of SAR image displacement and building shadows.
+A building height model was built based on a random forest model, and three
+model ensemble methods, bagging, boosting, and stacking, were compared. To
+evaluate the accuracy of the prediction results, this study collected Lidar
+data in the test area, and the evaluation results showed that the R-Square
+reached 0.78, which demonstrates that building height can be estimated
+effectively. The fast production of high-resolution building height data can
+support large-scale scientific research and applications in many fields.
+
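+ A schematic sketch of the regression setup described above is given below: a
+random forest mapping aggregated Sentinel-1/2 and footprint features to
+building height, evaluated with R-Square. The feature file name and column
+names are placeholders, not the study's actual data layout.
+
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import r2_score
+
+# 160 spatial-spectral-temporal features per building plus a Lidar-derived height
+df = pd.read_csv("building_features.csv")
+X, y = df.drop(columns=["height_m"]), df["height_m"]
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+rf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0)
+rf.fit(X_tr, y_tr)
+print("R^2:", r2_score(y_te, rf.predict(X_te)))  # the paper reports ~0.78 on Lidar validation
+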
+
+
+
+
+ + ☆ FREE: Faster and Better Data-Free Meta-Learning + + +
+ Data-Free Meta-Learning (DFML) aims to extract knowledge from a collection of +pre-trained models without requiring the original data, presenting practical +benefits in contexts constrained by data privacy concerns. Current DFML methods +primarily focus on the data recovery from these pre-trained models. However, +they suffer from slow recovery speed and overlook gaps inherent in +heterogeneous pre-trained models. In response to these challenges, we introduce +the Faster and Better Data-Free Meta-Learning (FREE) framework, which contains: +(i) a meta-generator for rapidly recovering training tasks from pre-trained +models; and (ii) a meta-learner for generalizing to new unseen tasks. +Specifically, within the module Faster Inversion via Meta-Generator, each +pre-trained model is perceived as a distinct task. The meta-generator can +rapidly adapt to a specific task in just five steps, significantly accelerating +the data recovery. Furthermore, we propose Better Generalization via +Meta-Learner and introduce an implicit gradient alignment algorithm to optimize +the meta-learner. This is achieved as aligned gradient directions alleviate +potential conflicts among tasks from heterogeneous pre-trained models. +Empirical experiments on multiple benchmarks affirm the superiority of our +approach, marking a notable speed-up (20$\times$) and performance enhancement +(1.42\% $\sim$ 4.78\%) in comparison to the state-of-the-art. + +
+
+
+
+
+ + ☆ LLM-AD: Large Language Model based Audio Description System + + +
+ The development of Audio Description (AD) has been a pivotal step forward in +making video content more accessible and inclusive. Traditionally, AD +production has demanded a considerable amount of skilled labor, while existing +automated approaches still necessitate extensive training to integrate +multimodal inputs and tailor the output from a captioning style to an AD style. +In this paper, we introduce an automated AD generation pipeline that harnesses +the potent multimodal and instruction-following capacities of GPT-4V(ision). +Notably, our methodology employs readily available components, eliminating the +need for additional training. It produces ADs that not only comply with +established natural language AD production standards but also maintain +contextually consistent character information across frames, courtesy of a +tracking-based character recognition module. A thorough analysis on the MAD +dataset reveals that our approach achieves a performance on par with +learning-based methods in automated AD production, as substantiated by a CIDEr +score of 20.5. + +
+
+
+
+
+ + ☆ A Hong Kong Sign Language Corpus Collected from Sign-interpreted TV News LREC + + +
+ This paper introduces TVB-HKSL-News, a new Hong Kong sign language (HKSL) +dataset collected from a TV news program over a period of 7 months. The dataset +is collected to enrich resources for HKSL and support research in +large-vocabulary continuous sign language recognition (SLR) and translation +(SLT). It consists of 16.07 hours of sign videos of two signers with a +vocabulary of 6,515 glosses (for SLR) and 2,850 Chinese characters or 18K +Chinese words (for SLT). One signer has 11.66 hours of sign videos and the +other has 4.41 hours. One objective in building the dataset is to support the +investigation of how well large-vocabulary continuous sign language +recognition/translation can be done for a single signer given a (relatively) +large amount of his/her training data, which could potentially lead to the +development of new modeling methods. Besides, most parts of the data collection +pipeline are automated with little human intervention; we believe that our +collection method can be scaled up to collect more sign language data easily +for SLT in the future for any sign languages if such sign-interpreted videos +are available. We also run a SOTA SLR/SLT model on the dataset and get a +baseline SLR word error rate of 34.08% and a baseline SLT BLEU-4 score of 23.58 +for benchmarking future research on the dataset. + +
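The SLR word error rate reported above is the usual edit-distance metric over gloss sequences. The small reference implementation below is a generic sketch, not tied to the TVB-HKSL-News tooling.

```python
# Word error rate (WER): (substitutions + deletions + insertions) / #reference words.
def wer(reference: list[str], hypothesis: list[str]) -> float:
    n, m = len(reference), len(hypothesis)
    d = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        d[i][0] = i
    for j in range(m + 1):
        d[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution / match
    return d[n][m] / max(n, 1)

print(wer("I SIGN NEWS TODAY".split(), "I SIGN TODAY".split()))  # 0.25
```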
+
+ comment: Accepted by LREC-COLING 2024 +
+
+
+
+
+ + ☆ FITA: Fine-grained Image-Text Aligner for Radiology Report Generation + + +
+ Radiology report generation aims to automatically produce detailed and coherent descriptive reports for radiology images. Previous work mainly focused on refining fine-grained image features or leveraging external knowledge. However, the precise alignment of fine-grained image features with corresponding text descriptions has not been considered. This paper presents a novel method called Fine-grained Image-Text Aligner (FITA) to construct fine-grained alignment between image and text features. It has three novel designs: Image Feature Refiner (IFR), Text Feature Refiner (TFR), and Contrastive Aligner (CA). IFR and TFR learn fine-grained image and text features, respectively. We achieve this by leveraging saliency maps to effectively fuse symptoms with the corresponding abnormal visual regions, and by utilizing a meticulously constructed triplet set for training. Finally, the CA module aligns fine-grained image and text features using a contrastive loss for precise alignment. Results show that our method surpasses existing methods on the widely used benchmark.
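The contrastive alignment step can be pictured as a standard symmetric InfoNCE objective over paired image and text embeddings. The sketch below is that generic formulation under my assumption, not FITA's exact CA module.

```python
# Symmetric contrastive (InfoNCE-style) alignment of paired image/text features.
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(img_feats: torch.Tensor,
                               txt_feats: torch.Tensor,
                               temperature: float = 0.07) -> torch.Tensor:
    # img_feats, txt_feats: (batch, dim); row i of each is a matched pair.
    img = F.normalize(img_feats, dim=-1)
    txt = F.normalize(txt_feats, dim=-1)
    logits = img @ txt.t() / temperature             # (batch, batch) similarities
    targets = torch.arange(img.size(0), device=img.device)
    loss_i2t = F.cross_entropy(logits, targets)      # image -> matching text
    loss_t2i = F.cross_entropy(logits.t(), targets)  # text -> matching image
    return 0.5 * (loss_i2t + loss_t2i)

loss = contrastive_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))
print(loss.item())
```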
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Efficient Data-driven Scene Simulation using Robotic Surgery Videos via + Physics-embedded 3D Gaussians + + +
+ Surgical scene simulation plays a crucial role in surgical education and simulator-based robot learning. Traditional approaches for creating such surgical scene environments involve a labor-intensive process in which designers hand-craft tissue models with textures and geometries for soft-body simulation. This manual approach is not only time-consuming but also limited in scalability and realism. In contrast, data-driven simulation offers a compelling alternative: it has the potential to automatically reconstruct 3D surgical scenes from real-world surgical video data, followed by the application of soft-body physics. This area, however, is relatively uncharted. In our research, we introduce 3D Gaussians as a learnable representation of the surgical scene, learned from stereo endoscopic video. To prevent over-fitting and ensure the geometrical correctness of these scenes, we incorporate depth supervision and anisotropy regularization into the Gaussian learning process. Furthermore, we apply the Material Point Method, integrated with physical properties, to the 3D Gaussians to achieve realistic scene deformations. Our method was evaluated on our collected in-house and public surgical video datasets. Results show that it can reconstruct and simulate surgical scenes from endoscopic videos efficiently, taking only a few minutes to reconstruct a surgical scene, and produce both visually and physically plausible deformations at a speed approaching real time. These results demonstrate the great potential of our proposed method to enhance the efficiency and variety of simulations available for surgical education and robot learning.
+
+
+
+
+ + ☆ X-Oscar: A Progressive Framework for High-quality Text-guided 3D + Animatable Avatar Generation ICML2024 + + +
+ Recent advancements in automatic 3D avatar generation guided by text have +made significant progress. However, existing methods have limitations such as +oversaturation and low-quality output. To address these challenges, we propose +X-Oscar, a progressive framework for generating high-quality animatable avatars +from text prompts. It follows a sequential Geometry->Texture->Animation +paradigm, simplifying optimization through step-by-step generation. To tackle +oversaturation, we introduce Adaptive Variational Parameter (AVP), representing +avatars as an adaptive distribution during training. Additionally, we present +Avatar-aware Score Distillation Sampling (ASDS), a novel technique that +incorporates avatar-aware noise into rendered images for improved generation +quality during optimization. Extensive evaluations confirm the superiority of +X-Oscar over existing text-to-3D and text-to-avatar approaches. Our anonymous +project page: https://xmu-xiaoma666.github.io/Projects/X-Oscar/. + +
+
+ comment: ICML2024 +
+
+
+
+
+ + ☆ Hyperspectral Band Selection based on Generalized 3DTV and Tensor CUR + Decomposition + + +
+ Hyperspectral Imaging (HSI) serves as an important technique in remote +sensing. However, high dimensionality and data volume typically pose +significant computational challenges. Band selection is essential for reducing +spectral redundancy in hyperspectral imagery while retaining intrinsic critical +information. In this work, we propose a novel hyperspectral band selection +model by decomposing the data into a low-rank and smooth component and a sparse +one. In particular, we develop a generalized 3D total variation (G3DTV) by +applying the $\ell_1^p$-norm to derivatives to preserve spatial-spectral +smoothness. By employing the alternating direction method of multipliers +(ADMM), we derive an efficient algorithm, where the tensor low-rankness is +implied by the tensor CUR decomposition. We demonstrate the effectiveness of +the proposed approach through comparisons with various other state-of-the-art +band selection techniques using two benchmark real-world datasets. In addition, +we provide practical guidelines for parameter selection in both noise-free and +noisy scenarios. + +
+
+
+
+
+ + ☆ LLaVA Finds Free Lunch: Teaching Human Behavior Improves Content + Understanding Abilities Of LLMs + + +
+ Communication is defined as ``Who says what to whom with what effect.'' A message from a communicator generates downstream receiver effects, also known as behavior. Receiver behavior, being a downstream effect of the message, carries rich signals about it. Despite carrying these signals, behavior data is often ignored when training large language models. We show that training LLMs on receiver behavior can actually help improve their content-understanding abilities. Specifically, we show that training LLMs to predict the receiver behavior of likes and comments improves the LLM's performance on a wide variety of downstream content understanding tasks. We show this performance increase on 40 video and image understanding tasks across 23 benchmark datasets, in both 0-shot and fine-tuning settings, outperforming many supervised baselines. Moreover, since receiver behavior, such as likes and comments, is collected by default on the internet and does not need any human annotations to be useful, the performance improvement obtained by training on this data is essentially a free lunch. We release the cleaned receiver-behavior data (comments and likes) for 750k images and videos collected from multiple platforms, along with our instruction-tuning data.
+
+
+
+
+ + ☆ EchoScene: Indoor Scene Generation via Information Echo over Scene Graph + Diffusion + + +
+ We present EchoScene, an interactive and controllable generative model that +generates 3D indoor scenes on scene graphs. EchoScene leverages a dual-branch +diffusion model that dynamically adapts to scene graphs. Existing methods +struggle to handle scene graphs due to varying numbers of nodes, multiple edge +combinations, and manipulator-induced node-edge operations. EchoScene overcomes +this by associating each node with a denoising process and enables +collaborative information exchange, enhancing controllable and consistent +generation aware of global constraints. This is achieved through an information +echo scheme in both shape and layout branches. At every denoising step, all +processes share their denoising data with an information exchange unit that +combines these updates using graph convolution. The scheme ensures that the +denoising processes are influenced by a holistic understanding of the scene +graph, facilitating the generation of globally coherent scenes. The resulting +scenes can be manipulated during inference by editing the input scene graph and +sampling the noise in the diffusion model. Extensive experiments validate our +approach, which maintains scene controllability and surpasses previous methods +in generation fidelity. Moreover, the generated scenes are of high quality and +thus directly compatible with off-the-shelf texture generation. Code and +trained models are open-sourced. + +
+
+ comment: 25 pages. 10 figures +
+
+
+
+
+ + ☆ An Approach to Systematic Data Acquisition and Data-Driven Simulation + for the Safety Testing of Automated Driving Functions + + +
+ With growing complexity and criticality of automated driving functions in +road traffic and their operational design domains (ODD), there is increasing +demand for covering significant proportions of development, validation, and +verification in virtual environments and through simulation models. + If, however, simulations are meant not only to augment real-world +experiments, but to replace them, quantitative approaches are required that +measure to what degree and under which preconditions simulation models +adequately represent reality, and thus, using their results accordingly. +Especially in R&D areas related to the safety impact of the "open world", there +is a significant shortage of real-world data to parameterize and/or validate +simulations - especially with respect to the behavior of human traffic +participants, whom automated driving functions will meet in mixed traffic. + We present an approach to systematically acquire data in public traffic by +heterogeneous means, transform it into a unified representation, and use it to +automatically parameterize traffic behavior models for use in data-driven +virtual validation of automated driving functions. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ PointCompress3D -- A Point Cloud Compression Framework for Roadside + LiDARs in Intelligent Transportation Systems + + +
+ In the context of Intelligent Transportation Systems (ITS), efficient data +compression is crucial for managing large-scale point cloud data acquired by +roadside LiDAR sensors. The demand for efficient storage, streaming, and +real-time object detection capabilities for point cloud data is substantial. +This work introduces PointCompress3D, a novel point cloud compression framework +tailored specifically for roadside LiDARs. Our framework addresses the +challenges of compressing high-resolution point clouds while maintaining +accuracy and compatibility with roadside LiDAR sensors. We adapt, extend, +integrate, and evaluate three cutting-edge compression methods using our +real-world-based TUMTraf dataset family. We achieve a frame rate of 10 FPS +while keeping compression sizes below 105 Kb, a reduction of 50 times, and +maintaining object detection performance on par with the original data. In +extensive experiments and ablation studies, we finally achieved a PSNR d2 of +94.46 and a BPP of 6.54 on our dataset. Future work includes the deployment on +the live system. The code is available on our project website: +https://pointcompress3d.github.io. + +
+
+
+
+
+ + ☆ Diabetic Retinopathy Detection Using Quantum Transfer Learning + + +
+ Diabetic Retinopathy (DR), a prevalent complication in diabetes patients, can +lead to vision impairment due to lesions formed on the retina. Detecting DR at +an advanced stage often results in irreversible blindness. The traditional +process of diagnosing DR through retina fundus images by ophthalmologists is +not only time-intensive but also expensive. While classical transfer learning +models have been widely adopted for computer-aided detection of DR, their high +maintenance costs can hinder their detection efficiency. In contrast, Quantum +Transfer Learning offers a more effective solution to this challenge. This +approach is notably advantageous because it operates on heuristic principles, +making it highly optimized for the task. Our proposed methodology leverages +this hybrid quantum transfer learning technique to detect DR. To construct our +model, we utilize the APTOS 2019 Blindness Detection dataset, available on +Kaggle. We employ the ResNet-18, ResNet34, ResNet50, ResNet101, ResNet152 and +Inception V3, pre-trained classical neural networks, for the initial feature +extraction. For the classification stage, we use a Variational Quantum +Classifier. Our hybrid quantum model has shown remarkable results, achieving an +accuracy of 97% for ResNet-18. This demonstrates that quantum computing, when +integrated with quantum machine learning, can perform tasks with a level of +power and efficiency unattainable by classical computers alone. By harnessing +these advanced technologies, we can significantly improve the detection and +diagnosis of Diabetic Retinopathy, potentially saving many from the risk of +blindness. + Keywords: Diabetic Retinopathy, Quantum Transfer Learning, Deep Learning + +
+
+ comment: 14 pages, 12 figures and 5 tables +
+
+
+
+
+ + ☆ SSUMamba: Spatial-Spectral Selective State Space Model for Hyperspectral + Image Denoising + + +
+ Denoising hyperspectral images (HSIs) is a crucial preprocessing step because of noise originating from intra-imaging mechanisms and environmental factors. Utilizing domain-specific knowledge of HSIs, such as spectral correlation, spatial self-similarity, and spatial-spectral correlation, is essential for deep learning-based denoising. Existing methods are often constrained by running time, space complexity, and computational complexity, and therefore employ strategies that explore these priors separately. While such strategies can avoid some redundant information, hyperspectral images are 3-D images with strong spatial continuity and spectral correlation, and exploring the priors separately inevitably overlooks subtle long-range spatial-spectral information that positively impacts image restoration. This paper proposes a Spatial-Spectral Selective State Space Model-based U-shaped network, termed Spatial-Spectral U-Mamba (SSUMamba), for hyperspectral image denoising. We can obtain complete global spatial-spectral correlation within a single module thanks to the linear space complexity of State Space Model (SSM) computations. We introduce a Spatial-Spectral Alternating Scan (SSAS) strategy for HSI data, which helps model the information flow in multiple directions in 3-D HSIs. Experimental results demonstrate that our method outperforms the compared methods. The source code will be available at https://github.com/lronkitty/SSUMamba.
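One possible reading of the multi-directional scanning idea is to serialize the 3-D HSI cube along several axis orders and directions before feeding each sequence to an SSM block. The sketch below only illustrates that reordering and is my interpretation, not the official SSAS implementation.

```python
# Sketch: flatten a (bands, height, width) HSI cube into 1-D token sequences
# along different axis orders and directions, as a multi-directional scan.
import torch

def alternating_scans(hsi: torch.Tensor) -> list[torch.Tensor]:
    # hsi: (B, H, W) cube; each returned tensor is a (B*H*W,) token sequence.
    scans = []
    for order in [(0, 1, 2), (1, 2, 0), (2, 0, 1)]:          # axis-major orders
        cube = hsi.permute(*order)
        scans.append(cube.reshape(-1))                        # forward scan
        scans.append(cube.flip(dims=(0, 1, 2)).reshape(-1))   # reversed scan
    return scans

cube = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
for i, seq in enumerate(alternating_scans(cube)):
    print(i, seq.shape)   # six directional sequences of length 24
```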
+
+
+
+
+ + ☆ Development of Skip Connection in Deep Neural Networks for Computer + Vision and Medical Image Analysis: A Survey + + +
+ Deep learning has made significant progress in computer vision, specifically in image classification, object detection, and semantic segmentation. The skip connection has played an essential role in the architecture of deep neural networks, enabling easier optimization through residual learning during the training stage and improving accuracy during testing. Many neural networks have inherited the idea of residual learning with skip connections for various tasks, and it has become the standard choice for designing neural networks. This survey provides a comprehensive summary and outlook on the development of skip connections in deep neural networks. The short history of skip connections is outlined, and the development of residual learning in deep neural networks is surveyed. The effectiveness of skip connections in the training and testing stages is summarized, and future directions for using skip connections in residual learning are discussed. Finally, we summarize seminal papers, source code, models, and datasets that utilize skip connections in computer vision, including image classification, object detection, semantic segmentation, and image reconstruction. We hope this survey can inspire researchers in the community to further develop skip connections in various forms and tasks, as well as the theory of residual learning in deep neural networks. The project page can be found at https://github.com/apple1986/Residual_Learning_For_Images
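For readers new to the topic, the canonical residual (skip-connection) block that the survey discusses looks roughly like the PyTorch sketch below; this is a generic basic block, not code from the surveyed project page.

```python
# A basic residual block: output = activation(F(x) + x), where the identity
# shortcut skips the two convolutional layers.
import torch
import torch.nn as nn

class BasicResidualBlock(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x                          # skip connection carries x forward
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + residual)      # add the shortcut before activation

block = BasicResidualBlock(channels=16)
print(block(torch.randn(1, 16, 32, 32)).shape)  # torch.Size([1, 16, 32, 32])
```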
+
+
+
+
+ + ☆ Zero-Shot Monocular Motion Segmentation in the Wild by Combining Deep + Learning with Geometric Motion Model Fusion CVPR + + +
+ Detecting and segmenting moving objects from a moving monocular camera is challenging in the presence of unknown camera motion, diverse object motions, and complex scene structures. Most existing methods rely on a single motion cue to perform motion segmentation, which is usually insufficient when facing different complex environments. While a few recent deep learning based methods are able to combine multiple motion cues to achieve improved accuracy, they depend heavily on vast datasets and extensive annotations, making them less adaptable to new scenarios. To address these limitations, we propose a novel monocular dense segmentation method that achieves state-of-the-art motion segmentation results in a zero-shot manner. The proposed method synergistically combines the strengths of deep learning and geometric model fusion methods by performing geometric model fusion on object proposals. Experiments show that our method achieves competitive results on several motion segmentation datasets and even surpasses some state-of-the-art supervised methods on certain benchmarks, while not being trained on any data. We also present an ablation study to show the effectiveness of combining different geometric models together for motion segmentation, highlighting the value of our geometric model fusion strategy.
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition Workshops (CVPRW) +
+
+
+
+
+ + ☆ Long Tail Image Generation Through Feature Space Augmentation and + Iterated Learning + + +
+ Image and multimodal machine learning tasks are very challenging to solve in +the case of poorly distributed data. In particular, data availability and +privacy restrictions exacerbate these hurdles in the medical domain. The state +of the art in image generation quality is held by Latent Diffusion models, +making them prime candidates for tackling this problem. However, a few key +issues still need to be solved, such as the difficulty in generating data from +under-represented classes and a slow inference process. To mitigate these +issues, we propose a new method for image augmentation in long-tailed data +based on leveraging the rich latent space of pre-trained Stable Diffusion +Models. We create a modified separable latent space to mix head and tail class +examples. We build this space via Iterated Learning of underlying sparsified +embeddings, which we apply to task-specific saliency maps via a K-NN approach. +Code is available at +https://github.com/SugarFreeManatee/Feature-Space-Augmentation-and-Iterated-Learning + +
+
+
+
+
+ + ☆ Active Learning Enabled Low-cost Cell Image Segmentation Using Bounding + Box Annotation + + +
+ Cell image segmentation is usually implemented using fully supervised deep +learning methods, which heavily rely on extensive annotated training data. Yet, +due to the complexity of cell morphology and the requirement for specialized +knowledge, pixel-level annotation of cell images has become a highly +labor-intensive task. To address the above problems, we propose an active +learning framework for cell segmentation using bounding box annotations, which +greatly reduces the data annotation cost of cell segmentation algorithms. +First, we generate a box-supervised learning method (denoted as YOLO-SAM) by +combining the YOLOv8 detector with the Segment Anything Model (SAM), which +effectively reduces the complexity of data annotation. Furthermore, it is +integrated into an active learning framework that employs the MC DropBlock +method to train the segmentation model with fewer box-annotated samples. +Extensive experiments demonstrate that our model saves more than ninety percent +of data annotation time compared to mask-supervised deep learning methods. + +
+
+
+
+
+ + ☆ SOAR: Advancements in Small Body Object Detection for Aerial Imagery + Using State Space Models and Programmable Gradients + + +
+ Small object detection in aerial imagery presents significant challenges in +computer vision due to the minimal data inherent in small-sized objects and +their propensity to be obscured by larger objects and background noise. +Traditional methods using transformer-based models often face limitations +stemming from the lack of specialized databases, which adversely affect their +performance with objects of varying orientations and scales. This underscores +the need for more adaptable, lightweight models. In response, this paper +introduces two innovative approaches that significantly enhance detection and +segmentation capabilities for small aerial objects. Firstly, we explore the use +of the SAHI framework on the newly introduced lightweight YOLO v9 architecture, +which utilizes Programmable Gradient Information (PGI) to reduce the +substantial information loss typically encountered in sequential feature +extraction processes. The paper employs the Vision Mamba model, which +incorporates position embeddings to facilitate precise location-aware visual +understanding, combined with a novel bidirectional State Space Model (SSM) for +effective visual context modeling. This State Space Model adeptly harnesses the +linear complexity of CNNs and the global receptive field of Transformers, +making it particularly effective in remote sensing image classification. Our +experimental results demonstrate substantial improvements in detection accuracy +and processing efficiency, validating the applicability of these approaches for +real-time small object detection across diverse aerial scenarios. This paper +also discusses how these methodologies could serve as foundational models for +future advancements in aerial object recognition technologies. The source code +will be made accessible here. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Language-Enhanced Latent Representations for Out-of-Distribution + Detection in Autonomous Driving ICRA 2024 + + +
+ Out-of-distribution (OOD) detection is essential in autonomous driving, to +determine when learning-based components encounter unexpected inputs. +Traditional detectors typically use encoder models with fixed settings, thus +lacking effective human interaction capabilities. With the rise of large +foundation models, multimodal inputs offer the possibility of taking human +language as a latent representation, thus enabling language-defined OOD +detection. In this paper, we use the cosine similarity of image and text +representations encoded by the multimodal model CLIP as a new representation to +improve the transparency and controllability of latent encodings used for +visual anomaly detection. We compare our approach with existing pre-trained +encoders that can only produce latent representations that are meaningless from +the user's standpoint. Our experiments on realistic driving data show that the +language-based latent representation performs better than the traditional +representation of the vision encoder and helps improve the detection +performance when combined with standard representations. + +
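The CLIP-based latent representation described above boils down to cosine similarities between an image embedding and a set of language-defined concept embeddings. The sketch below uses the public Hugging Face CLIP checkpoint with my own placeholder prompts and image path, not necessarily the authors' choices.

```python
# Cosine similarity between a driving image and language-defined concepts,
# usable as an interpretable feature vector for OOD detection.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

prompts = ["a photo of a clear highway", "a photo of heavy rain on the road",
           "a photo of dense fog", "a photo of an overturned truck"]
image = Image.open("frame.png")  # placeholder path to a camera frame

inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    img_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
    txt_emb = model.get_text_features(input_ids=inputs["input_ids"],
                                      attention_mask=inputs["attention_mask"])

img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
scores = (img_emb @ txt_emb.T).squeeze(0)   # one cosine score per concept
print(dict(zip(prompts, scores.tolist())))
```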
+
+ comment: Presented at the Robot Trust for Symbiotic Societies (RTSS) Workshop, + co-located with ICRA 2024 +
+
+
+
+
+ + ☆ Adapting Self-Supervised Learning for Computational Pathology CVPR 2024 + + +
+ Self-supervised learning (SSL) has emerged as a key technique for training +networks that can generalize well to diverse tasks without task-specific +supervision. This property makes SSL desirable for computational pathology, the +study of digitized images of tissues, as there are many target applications and +often limited labeled training samples. However, SSL algorithms and models have +been primarily developed in the field of natural images and whether their +performance can be improved by adaptation to particular domains remains an open +question. In this work, we present an investigation of modifications to SSL for +pathology data, specifically focusing on the DINOv2 algorithm. We propose +alternative augmentations, regularization functions, and position encodings +motivated by the characteristics of pathology images. We evaluate the impact of +these changes on several benchmarks to demonstrate the value of tailored +approaches. + +
+
+ comment: Presented at DCA in MI Workshop, CVPR 2024 +
+
+
+
+
+ + ☆ ShadowNav: Autonomous Global Localization for Lunar Navigation in + Darkness + + +
+ The ability to determine the pose of a rover in an inertial frame +autonomously is a crucial capability necessary for the next generation of +surface rover missions on other planetary bodies. Currently, most on-going +rover missions utilize ground-in-the-loop interventions to manually correct for +drift in the pose estimate and this human supervision bottlenecks the distance +over which rovers can operate autonomously and carry out scientific +measurements. In this paper, we present ShadowNav, an autonomous approach for +global localization on the Moon with an emphasis on driving in darkness and at +nighttime. Our approach uses the leading edge of Lunar craters as landmarks and +a particle filtering approach is used to associate detected craters with known +ones on an offboard map. We discuss the key design decisions in developing the +ShadowNav framework for use with a Lunar rover concept equipped with a stereo +camera and an external illumination source. Finally, we demonstrate the +efficacy of our proposed approach in both a Lunar simulation environment and on +data collected during a field test at Cinder Lakes, Arizona. + +
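The crater-based global localization loop can be thought of as a standard particle filter: propagate pose particles with odometry, weight them by how well detected landmarks match the map, and resample. The sketch below is a generic 2-D version under that assumption, with a made-up landmark and noise model, not the ShadowNav flight code.

```python
# Generic 2-D particle filter: predict with noisy odometry, weight particles by
# agreement between an observed landmark range and the map, then resample.
import numpy as np

rng = np.random.default_rng(0)
landmark = np.array([50.0, 20.0])          # known crater position on the map
particles = rng.uniform(0, 100, size=(1000, 2))
weights = np.full(len(particles), 1.0 / len(particles))

def step(particles, weights, control, observed_range, motion_std=0.5, range_std=2.0):
    # Predict: apply the commanded motion plus noise to every particle.
    particles = particles + control + rng.normal(0, motion_std, particles.shape)
    # Update: weight by the likelihood of the observed range to the landmark.
    expected = np.linalg.norm(particles - landmark, axis=1)
    weights = weights * np.exp(-0.5 * ((observed_range - expected) / range_std) ** 2)
    weights = weights + 1e-300
    weights = weights / weights.sum()
    # Resample proportionally to the weights.
    idx = rng.choice(len(particles), size=len(particles), p=weights)
    return particles[idx], np.full(len(particles), 1.0 / len(particles))

particles, weights = step(particles, weights, control=np.array([1.0, 0.0]),
                          observed_range=30.0)
print("pose estimate:", particles.mean(axis=0))
```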
+
+ comment: 21 pages, 13 figures +
+
+
+
+
+ + ☆ Out-of-distribution detection based on subspace projection of + high-dimensional features output by the last convolutional layer + + +
+ Out-of-distribution (OOD) detection, crucial for reliable pattern classification, discerns whether a sample originates outside the training distribution. This paper concentrates on the high-dimensional features output by the final convolutional layer, which contain rich image features. Our key idea is to project these high-dimensional features into two specific feature subspaces, leveraging the dimensionality reduction capacity of the network's linear layers, trained with Predefined Evenly-Distribution Class Centroids (PEDCC)-Loss. This involves calculating the cosines of three projection angles and the norm values of features, thereby identifying distinctive information for in-distribution (ID) and OOD data, which assists in OOD detection. Building upon this, we have modified the batch normalization (BN) and ReLU layer preceding the fully connected layer, diminishing their impact on the output feature distributions and thereby widening the distribution gap between ID and OOD data features. Our method requires only the training of the classification network model, eschewing any need for input pre-processing or specific OOD data pre-tuning. Extensive experiments on several benchmark datasets demonstrate that our approach delivers state-of-the-art performance. Our code is available at https://github.com/Hewell0/ProjOOD.
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ When a Relation Tells More Than a Concept: Exploring and Evaluating + Classifier Decisions with CoReX + + +
+ Explanations for Convolutional Neural Networks (CNNs) based on relevance of +input pixels might be too unspecific to evaluate which and how input features +impact model decisions. Especially in complex real-world domains like +biomedicine, the presence of specific concepts (e.g., a certain type of cell) +and of relations between concepts (e.g., one cell type is next to another) +might be discriminative between classes (e.g., different types of tissue). +Pixel relevance is not expressive enough to convey this type of information. In +consequence, model evaluation is limited and relevant aspects present in the +data and influencing the model decisions might be overlooked. This work +presents a novel method to explain and evaluate CNN models, which uses a +concept- and relation-based explainer (CoReX). It explains the predictive +behavior of a model on a set of images by masking (ir-)relevant concepts from +the decision-making process and by constraining relations in a learned +interpretable surrogate model. We test our approach with several image data +sets and CNN architectures. Results show that CoReX explanations are faithful +to the CNN model in terms of predictive outcomes. We further demonstrate that +CoReX is a suitable tool for evaluating CNNs supporting identification and +re-classification of incorrect or ambiguous classifications. + +
+
+ comment: preliminary version, submitted to Machine Learning +
+
+
+
+
+ + ☆ MMIST-ccRCC: A Real World Medical Dataset for the Development of + Multi-Modal Systems CVPR2024 + + +
+ The acquisition of different data modalities can enhance our knowledge and understanding of various diseases, paving the way for more personalized healthcare. Thus, medicine is progressively moving towards the generation of massive amounts of multi-modal data (e.g., molecular, radiology, and histopathology). While this may seem like an ideal environment in which to capitalize on data-centric machine learning approaches, most methods still focus on exploring a single or a pair of modalities due to a variety of reasons: i) lack of ready-to-use curated datasets; ii) difficulty in identifying the best multi-modal fusion strategy; and iii) missing modalities across patients. In this paper we introduce a real-world multi-modal dataset called MMIST-ccRCC that comprises 2 radiology modalities (CT and MRI), histopathology, genomics, and clinical data from 618 patients with clear cell renal cell carcinoma (ccRCC). We provide single and multi-modal (early and late fusion) benchmarks on the task of 12-month survival prediction in the challenging scenario of one or more missing modalities for each patient, with missing rates that range from 26\% for genomics data to more than 90\% for MRI. We show that even with such severe missing rates the fusion of modalities leads to improvements in survival forecasting. Additionally, incorporating a strategy to generate the latent representations of the missing modalities given the available ones further improves performance, highlighting a potential complementarity across modalities. Our dataset and code are available here: https://multi-modal-ist.github.io/datasets/ccRCC
+
+ comment: Accepted in DCA in MI Workshop@CVPR2024 +
+
+
+
+
+ + ☆ S4: Self-Supervised Sensing Across the Spectrum + + +
+ Satellite image time series (SITS) segmentation is crucial for many applications such as environmental monitoring, land cover mapping, and agricultural crop type classification. However, training models for SITS segmentation remains a challenging task due to the lack of abundant training data, which requires fine-grained annotation. We propose S4, a new self-supervised pre-training approach that significantly reduces the requirement for labeled training data by utilizing two new insights: (a) satellites capture images in different parts of the spectrum, such as radio and visible frequencies; and (b) satellite imagery is geo-registered, allowing for fine-grained spatial alignment. We use these insights to formulate pre-training tasks in S4. We also curate m2s2-SITS, a large-scale dataset of unlabeled, spatially-aligned, multi-modal, and geographically specific SITS that serves as representative pre-training data for S4. Finally, we evaluate S4 on multiple SITS segmentation datasets and demonstrate its efficacy against competing baselines while using limited labeled data.
+
+
+
+
+ + ☆ Key Patches Are All You Need: A Multiple Instance Learning Framework For + Robust Medical Diagnosis CVPR 2024 + + +
+ Deep learning models have revolutionized the field of medical image analysis, +due to their outstanding performances. However, they are sensitive to spurious +correlations, often taking advantage of dataset bias to improve results for +in-domain data, but jeopardizing their generalization capabilities. In this +paper, we propose to limit the amount of information these models use to reach +the final classification, by using a multiple instance learning (MIL) +framework. MIL forces the model to use only a (small) subset of patches in the +image, identifying discriminative regions. This mimics the clinical procedures, +where medical decisions are based on localized findings. We evaluate our +framework on two medical applications: skin cancer diagnosis using dermoscopy +and breast cancer diagnosis using mammography. Our results show that using only +a subset of the patches does not compromise diagnostic performance for +in-domain data, compared to the baseline approaches. However, our approach is +more robust to shifts in patient demographics, while also providing more +detailed explanations about which regions contributed to the decision. Code is +available at: https://github.com/diogojpa99/MedicalMultiple-Instance-Learning. + +
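One common way to realize the "use only a few key patches" idea is attention-based MIL pooling, where per-patch attention weights reveal which regions drive the bag-level prediction. The sketch below shows that generic formulation (after Ilse et al.), not necessarily the exact model used in this paper.

```python
# Attention-based multiple instance learning (MIL) pooling: a bag of patch
# embeddings is reduced to one bag embedding via learned attention weights.
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    def __init__(self, feat_dim: int = 512, hidden: int = 128, n_classes: int = 2):
        super().__init__()
        self.attention = nn.Sequential(nn.Linear(feat_dim, hidden), nn.Tanh(),
                                       nn.Linear(hidden, 1))
        self.classifier = nn.Linear(feat_dim, n_classes)

    def forward(self, patches: torch.Tensor):
        # patches: (num_patches, feat_dim) for one image (one "bag").
        a = torch.softmax(self.attention(patches), dim=0)   # (num_patches, 1)
        bag = (a * patches).sum(dim=0)                       # weighted average
        return self.classifier(bag), a.squeeze(-1)           # logits + patch weights

model = AttentionMIL()
logits, weights = model(torch.randn(64, 512))
print(logits.shape, weights.topk(3).indices)  # most influential patches
```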
+
+ comment: Accepted in DEF-AI-MIA Workshop@CVPR 2024 +
+
+
+
+
+ + ☆ Explaining models relating objects and privacy CVPR 2024 + + +
+ Accurately predicting whether an image is private before sharing it online is +difficult due to the vast variety of content and the subjective nature of +privacy itself. In this paper, we evaluate privacy models that use objects +extracted from an image to determine why the image is predicted as private. To +explain the decision of these models, we use feature-attribution to identify +and quantify which objects (and which of their features) are more relevant to +privacy classification with respect to a reference input (i.e., no objects +localised in an image) predicted as public. We show that the presence of the +person category and its cardinality is the main factor for the privacy +decision. Therefore, these models mostly fail to identify private images +depicting documents with sensitive data, vehicle ownership, and internet +activity, or public images with people (e.g., an outdoor concert or people +walking in a public space next to a famous landmark). As baselines for future +benchmarks, we also devise two strategies that are based on the person presence +and cardinality and achieve comparable classification performance of the +privacy models. + +
+
+ comment: 7 pages, 3 figures, 1 table, supplementary material included as + Appendix. Paper accepted at the 3rd XAI4CV Workshop at CVPR 2024. Code: + https://github.com/graphnex/ig-privacy +
+
+
+
+
+ + ☆ A Classification-Based Adaptive Segmentation Pipeline: Feasibility Study + Using Polycystic Liver Disease and Metastases from Colorectal Cancer CT + Images + + +
+ Automated segmentation tools often encounter accuracy and adaptability issues +when applied to images of different pathology. The purpose of this study is to +explore the feasibility of building a workflow to efficiently route images to +specifically trained segmentation models. By implementing a deep learning +classifier to automatically classify the images and route them to appropriate +segmentation models, we hope that our workflow can segment the images with +different pathology accurately. The data we used in this study are 350 CT +images from patients affected by polycystic liver disease and 350 CT images +from patients presenting with liver metastases from colorectal cancer. All +images had the liver manually segmented by trained imaging analysts. Our +proposed adaptive segmentation workflow achieved a statistically significant +improvement for the task of total liver segmentation compared to the generic +single segmentation model (non-parametric Wilcoxon signed rank test, n=100, +p-value << 0.001). This approach is applicable in a wide range of scenarios and +should prove useful in clinical implementations of segmentation pipelines. + +
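The adaptive workflow amounts to "classify first, then dispatch to a pathology-specific segmentation model". The sketch below shows only that routing logic, with placeholder callables standing in for the study's trained classifier and segmenters.

```python
# Sketch of a classification-based adaptive segmentation pipeline: a classifier
# predicts the pathology, and the image is routed to the matching segmenter.
import numpy as np

def classify_pathology(ct_volume: np.ndarray) -> str:
    # Placeholder: a trained deep classifier would go here.
    return "polycystic" if ct_volume.mean() > 0.5 else "metastases"

def segment_polycystic(ct_volume: np.ndarray) -> np.ndarray:
    return np.zeros_like(ct_volume)   # placeholder for the specialized model

def segment_metastases(ct_volume: np.ndarray) -> np.ndarray:
    return np.zeros_like(ct_volume)   # placeholder for the specialized model

SEGMENTERS = {"polycystic": segment_polycystic, "metastases": segment_metastases}

def adaptive_segment(ct_volume: np.ndarray) -> np.ndarray:
    label = classify_pathology(ct_volume)
    return SEGMENTERS[label](ct_volume)

mask = adaptive_segment(np.random.rand(64, 256, 256))
print(mask.shape)
```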
+
+ comment: J Digit Imaging. Inform. med. (2024) +
+
+
+
+
+ + ☆ Explainable AI (XAI) in Image Segmentation in Medicine, Industry, and + Beyond: A Survey + + +
+ Explainable Artificial Intelligence (XAI) has found numerous applications in computer vision. While image classification-based explainability techniques have garnered significant attention, their counterparts in semantic segmentation have been relatively neglected. Given the prevalent use of image segmentation, ranging from medical to industrial deployments, these techniques warrant a systematic look. In this paper, we present the first comprehensive survey on XAI in semantic image segmentation. This work focuses on techniques that were either specifically introduced for dense prediction tasks or were extended for them by modifying existing classification methods. We analyze and categorize the literature based on application categories and domains, as well as the evaluation metrics and datasets used. We also propose a taxonomy for interpretable semantic segmentation, and discuss potential challenges and future research directions.
+
+ comment: 35 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ Wildfire Risk Prediction: A Review + + +
+ Wildfires have significant impacts on global vegetation, wildlife, and +humans. They destroy plant communities and wildlife habitats and contribute to +increased emissions of carbon dioxide, nitrogen oxides, methane, and other +pollutants. The prediction of wildfires relies on various independent variables +combined with regression or machine learning methods. In this technical review, +we describe the options for independent variables, data processing techniques, +models, independent variables collinearity and importance estimation methods, +and model performance evaluation metrics. First, we divide the independent +variables into 4 aspects, including climate and meteorology conditions, +socio-economical factors, terrain and hydrological features, and wildfire +historical records. Second, preprocessing methods are described for different +magnitudes, different spatial-temporal resolutions, and different formats of +data. Third, the collinearity and importance evaluation methods of independent +variables are also considered. Fourth, we discuss the application of +statistical models, traditional machine learning models, and deep learning +models in wildfire risk prediction. In this subsection, compared with other +reviews, this manuscript particularly discusses the evaluation metrics and +recent advancements in deep learning methods. Lastly, addressing the +limitations of current research, this paper emphasizes the need for more +effective deep learning time series forecasting algorithms, the utilization of +three-dimensional data including ground and trunk fuel, extraction of more +accurate historical fire point data, and improved model evaluation metrics. + +
+
+
+
+
+ + ☆ Leafy Spurge Dataset: Real-world Weed Classification Within Aerial Drone + Imagery + + +
+ Invasive plant species are detrimental to the ecology of both agricultural +and wildland areas. Euphorbia esula, or leafy spurge, is one such plant that +has spread through much of North America from Eastern Europe. When paired with +contemporary computer vision systems, unmanned aerial vehicles, or drones, +offer the means to track expansion of problem plants, such as leafy spurge, and +improve chances of controlling these weeds. We gathered a dataset of leafy +spurge presence and absence in grasslands of western Montana, USA, then +surveyed these areas with a commercial drone. We trained image classifiers on +these data, and our best performing model, a pre-trained DINOv2 vision +transformer, identified leafy spurge with 0.84 accuracy (test set). This result +indicates that classification of leafy spurge is tractable, but not solved. We +release this unique dataset of labelled and unlabelled, aerial drone imagery +for the machine learning community to explore. Improving classification +performance of leafy spurge would benefit the fields of ecology, conservation, +and remote sensing alike. Code and data are available at our website: +leafy-spurge-dataset.github.io. + +
+
+ comment: Official Dataset Technical Report. Used in DA-Fusion + (arXiv:2302.07944) +
+
+
+
+
+ + ♻ ☆ Continual Diffusion: Continual Customization of Text-to-Image Diffusion + with C-LoRA + + +
+ Recent works demonstrate a remarkable ability to customize text-to-image diffusion models while providing only a few example images. What happens if you try to customize such models using multiple, fine-grained concepts in a sequential (i.e., continual) manner? In our work, we show that recent state-of-the-art customization of text-to-image models suffers from catastrophic forgetting when new concepts arrive sequentially. Specifically, when adding a new concept, the ability to generate high-quality images of past, similar concepts degrades. To circumvent this forgetting, we propose a new method, C-LoRA, composed of a continually self-regularized low-rank adaptation in the cross-attention layers of the popular Stable Diffusion model. Furthermore, we use customization prompts which do not include the word of the customized object (i.e., "person" for a human face dataset) and are initialized as completely random embeddings. Importantly, our method induces only marginal additional parameter costs and requires no storage of user data for replay. We show that C-LoRA not only outperforms several baselines for our proposed setting of text-to-image continual customization, which we refer to as Continual Diffusion, but also achieves a new state-of-the-art in the well-established rehearsal-free continual learning setting for image classification. The strong performance of C-LoRA in two separate domains positions it as a compelling solution for a wide range of applications, and we believe it has significant potential for practical impact. Project page: https://jamessealesmith.github.io/continual-diffusion/
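At its core, the method builds on low-rank adaptation (LoRA), which adds a trainable low-rank update to frozen attention weights. The sketch below shows that generic LoRA building block only; the continual self-regularization that is specific to C-LoRA is omitted.

```python
# Generic LoRA layer: y = W x + (alpha / r) * B A x, with W frozen and only the
# low-rank factors A and B trained.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 4.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():      # freeze the pretrained projection
            p.requires_grad_(False)
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Low-rank update, scaled and added to the frozen base output.
        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())

layer = LoRALinear(nn.Linear(768, 768))
print(layer(torch.randn(2, 768)).shape)                               # torch.Size([2, 768])
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only A and B train
```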
+
+ comment: Transactions on Machine Learning Research (TMLR) 2024 +
+
+
+
+
+ + ♻ ☆ Perception and Localization of Macular Degeneration Applying + Convolutional Neural Network, ResNet and Grad-CAM + + +
+ Macular degeneration is a well-known retinal disease that blurs the vision of affected patients. This research classifies healthy and macular-degeneration fundus images and localizes the affected region of the fundus. A plain CNN architecture and CNNs with ResNet backbones (ResNet50, ResNet50v2, ResNet101, ResNet101v2, ResNet152, ResNet152v2) are used to classify the two types of fundus. The data are split three ways: (a) 90% training / 10% testing, (b) 80% training / 20% testing, and (c) 50% training / 50% testing. After training, the best model is selected based on the evaluation metrics. Among the models, the CNN with a ResNet50 backbone performs best, giving a training accuracy of 98.7% for the 90%/10% split. With this model, we perform Grad-CAM visualization to localize the affected region of the fundus.
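Grad-CAM itself is model-agnostic: it weights the last convolutional feature maps by the gradient of the target class score. A compact PyTorch sketch with a ResNet-50 backbone is shown below; it uses untrained weights and a random input as placeholders, whereas the paper's fine-tuned weights and a real fundus image would be loaded in practice.

```python
# Minimal Grad-CAM: class-score gradients weight the last conv feature maps.
import torch
import torch.nn.functional as F
from torchvision.models import resnet50

model = resnet50(weights=None).eval()   # load fine-tuned weights in practice
feats, grads = {}, {}
model.layer4.register_forward_hook(lambda m, i, o: feats.update(a=o))
model.layer4.register_full_backward_hook(lambda m, gi, go: grads.update(a=go[0]))

x = torch.randn(1, 3, 224, 224)         # placeholder fundus image tensor
score = model(x)[0].max()               # top predicted class score
score.backward()

weights = grads["a"].mean(dim=(2, 3), keepdim=True)     # GAP over spatial dims
cam = F.relu((weights * feats["a"]).sum(dim=1, keepdim=True))
cam = F.interpolate(cam, size=(224, 224), mode="bilinear", align_corners=False)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
print(cam.shape)   # (1, 1, 224, 224) heatmap highlighting the affected region
```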
+
+ comment: 12 pages, 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Compact 3D Scene Representation via Self-Organizing Gaussian Grids + + +
+ 3D Gaussian Splatting has recently emerged as a highly promising technique +for modeling of static 3D scenes. In contrast to Neural Radiance Fields, it +utilizes efficient rasterization allowing for very fast rendering at +high-quality. However, the storage size is significantly higher, which hinders +practical deployment, e.g. on resource constrained devices. In this paper, we +introduce a compact scene representation organizing the parameters of 3D +Gaussian Splatting (3DGS) into a 2D grid with local homogeneity, ensuring a +drastic reduction in storage requirements without compromising visual quality +during rendering. Central to our idea is the explicit exploitation of +perceptual redundancies present in natural scenes. In essence, the inherent +nature of a scene allows for numerous permutations of Gaussian parameters to +equivalently represent it. To this end, we propose a novel highly parallel +algorithm that regularly arranges the high-dimensional Gaussian parameters into +a 2D grid while preserving their neighborhood structure. During training, we +further enforce local smoothness between the sorted parameters in the grid. The +uncompressed Gaussians use the same structure as 3DGS, ensuring a seamless +integration with established renderers. Our method achieves a reduction factor +of 17x to 42x in size for complex scenes with no increase in training time, +marking a substantial leap forward in the domain of 3D scene distribution and +consumption. Additional information can be found on our project page: +https://fraunhoferhhi.github.io/Self-Organizing-Gaussians/ + +
+
+ comment: Added compression of spherical harmonics, updated compression method + with improved results (all attributes compressed with JPEG XL now), added + qualitative comparison of additional scenes, moved compression explanation + and comparison to main paper, added comparison with "Making Gaussian Splats + smaller" +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A Survey + + +
+ Advancements in model algorithms, the growth of foundational models, and access to high-quality datasets have propelled the evolution of Artificial Intelligence Generated Content (AIGC). Despite its notable successes, AIGC still faces hurdles such as updating knowledge, handling long-tail data, mitigating data leakage, and managing high training and inference costs. Retrieval-Augmented Generation (RAG) has recently emerged as a paradigm to address such challenges. In particular, RAG introduces an information retrieval process, which enhances generation by retrieving relevant objects from available data stores, leading to higher accuracy and better robustness. In this paper, we comprehensively review existing efforts that integrate RAG techniques into AIGC scenarios. We first classify RAG foundations according to how the retriever augments the generator, distilling the fundamental abstractions of the augmentation methodologies for various retrievers and generators. This unified perspective encompasses all RAG scenarios, illuminating advancements and pivotal technologies that help with potential future progress. We also summarize additional enhancement methods for RAG, facilitating effective engineering and implementation of RAG systems. From another perspective, we then survey practical applications of RAG across different modalities and tasks, offering valuable references for researchers and practitioners. Furthermore, we introduce benchmarks for RAG, discuss the limitations of current RAG systems, and suggest potential directions for future research. Github: https://github.com/PKU-DAIR/RAG-Survey.
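The core RAG loop, retrieve relevant objects and condition generation on them, can be sketched with a toy TF-IDF retriever and a placeholder generator. The store contents and the `generate` function below are illustrative stand-ins, not any particular system from the survey.

```python
# Toy retrieval-augmented generation: retrieve top-k passages by TF-IDF cosine
# similarity and prepend them to the prompt handed to a generator.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

store = ["RAG augments generation with retrieved documents.",
         "Diffusion models synthesize images from noise.",
         "Long-tail knowledge is hard to memorize in model weights."]
vectorizer = TfidfVectorizer().fit(store)
doc_matrix = vectorizer.transform(store)

def retrieve(query: str, k: int = 2) -> list[str]:
    sims = cosine_similarity(vectorizer.transform([query]), doc_matrix)[0]
    return [store[i] for i in sims.argsort()[::-1][:k]]

def generate(prompt: str) -> str:
    return f"[LLM answer conditioned on]\n{prompt}"   # placeholder generator

query = "How does RAG help with long-tail knowledge?"
context = "\n".join(retrieve(query))
print(generate(f"Context:\n{context}\n\nQuestion: {query}"))
```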
+
+ comment: Citing 334 papers, 21 pages, 1 table, 12 figures. Project: + https://github.com/PKU-DAIR/RAG-Survey +
+
+
+
+
+ + ♻ ☆ FlowBot3D: Learning 3D Articulation Flow to Manipulate Articulated + Objects + + +
+ We explore a novel method to perceive and manipulate 3D articulated objects +that generalizes to enable a robot to articulate unseen classes of objects. We +propose a vision-based system that learns to predict the potential motions of +the parts of a variety of articulated objects to guide downstream motion +planning of the system to articulate the objects. To predict the object +motions, we train a neural network to output a dense vector field representing +the point-wise motion direction of the points in the point cloud under +articulation. We then deploy an analytical motion planner based on this vector +field to achieve a policy that yields maximum articulation. We train the vision +system entirely in simulation, and we demonstrate the capability of our system +to generalize to unseen object instances and novel categories in both +simulation and the real world, deploying our policy on a Sawyer robot with no +finetuning. Results show that our system achieves state-of-the-art performance +in both simulated and real-world experiments. + +
+
+ comment: Accepted to Robotics Science and Systems (RSS) 2022, Best Paper + Finalist +
+
+
+
+
+ + ♻ ☆ TAX-Pose: Task-Specific Cross-Pose Estimation for Robot Manipulation + + +
+ How do we imbue robots with the ability to efficiently manipulate unseen +objects and transfer relevant skills based on demonstrations? End-to-end +learning methods often fail to generalize to novel objects or unseen +configurations. Instead, we focus on the task-specific pose relationship +between relevant parts of interacting objects. We conjecture that this +relationship is a generalizable notion of a manipulation task that can transfer +to new objects in the same category; examples include the relationship between +the pose of a pan relative to an oven or the pose of a mug relative to a mug +rack. We call this task-specific pose relationship "cross-pose" and provide a +mathematical definition of this concept. We propose a vision-based system that +learns to estimate the cross-pose between two objects for a given manipulation +task using learned cross-object correspondences. The estimated cross-pose is +then used to guide a downstream motion planner to manipulate the objects into +the desired pose relationship (placing a pan into the oven or the mug onto the +mug rack). We demonstrate our method's capability to generalize to unseen +objects, in some cases after training on only 10 demonstrations in the real +world. Results show that our system achieves state-of-the-art performance in +both simulated and real-world experiments across a number of tasks. +Supplementary information and videos can be found at +https://sites.google.com/view/tax-pose/home. + +
+
+ comment: Conference on Robot Learning (CoRL), 2022. Supplementary material is + available at https://sites.google.com/view/tax-pose/home +
+
+
+
+
+ + ♻ ☆ USC: Uncompromising Spatial Constraints for Safety-Oriented 3D Object + Detectors in Autonomous Driving SC 2024 + + +
+ We consider the safety-oriented performance of 3D object detectors in +autonomous driving contexts. Specifically, despite impressive results shown by +the mass literature, developers often find it hard to ensure the safe +deployment of these learning-based perception models. Attributing the challenge +to the lack of safety-oriented metrics, we hereby present uncompromising +spatial constraints (USC), which characterize a simple yet important +localization requirement demanding the predictions to fully cover the objects +when seen from the autonomous vehicle. The constraints, as we formulate using +the perspective and bird's-eye views, can be naturally reflected by +quantitative measures, such that having an object detector with a higher score +implies a lower risk of collision. Finally, beyond model evaluation, we +incorporate the quantitative measures into common loss functions to enable +safety-oriented fine-tuning for existing models. With experiments using the +nuScenes dataset and a closed-loop simulation, our work demonstrates such +considerations of safety notions at the perception level not only improve model +performances beyond accuracy but also allow for a more direct linkage to actual +system safety. + +
+
+ comment: 8 pages (IEEE double column format), 7 figures, 2 tables, submitted + to ITSC 2024 +
+
+
+
+
+ + ♻ ☆ Operational Support Estimator Networks + + +
+ In this work, we propose a novel approach called Operational Support +Estimator Networks (OSENs) for the support estimation task. Support Estimation +(SE) is defined as finding the locations of non-zero elements in sparse +signals. By its very nature, the mapping between the measurement and sparse +signal is a non-linear operation. Traditional support estimators rely on +computationally expensive iterative signal recovery techniques to achieve such +non-linearity. Contrary to the convolutional layers, the proposed OSEN approach +consists of operational layers that can learn such complex non-linearities +without the need for deep networks. In this way, the performance of +non-iterative support estimation is greatly improved. Moreover, the operational +layers comprise so-called generative super neurons with non-local kernels. The +kernel location for each neuron/feature map is optimized jointly for the SE +task during training. We evaluate the OSENs in three different applications: i. +support estimation from Compressive Sensing (CS) measurements, ii. +representation-based classification, and iii. learning-aided CS reconstruction +where the output of OSENs is used as prior knowledge to the CS algorithm for +enhanced reconstruction. Experimental results show that the proposed approach +achieves computational efficiency and outperforms competing methods, especially +at low measurement rates by significant margins. The software implementation is +shared at https://github.com/meteahishali/OSEN. + +
+
+
+
+
+ + ♻ ☆ ObjectAdd: Adding Objects into Image via a Training-Free Diffusion + Modification Fashion + + +
+ We introduce ObjectAdd, a training-free diffusion modification method that
+ adds user-expected objects into a user-specified area. The motivation for
+ ObjectAdd is twofold: first, describing everything in one prompt can be
+ difficult, and second, users often need to add objects into an already
+ generated image. To accommodate real-world use, ObjectAdd maintains accurate
+ image consistency after adding objects through technical innovations in: (1)
+ embedding-level concatenation to ensure that text embeddings coalesce
+ correctly; (2) object-driven layout control with latent and attention
+ injection to ensure that objects occupy the user-specified area; (3) prompted
+ image inpainting in an attention-refocusing and object-expansion fashion to
+ ensure that the rest of the image stays the same. Given a text-prompted
+ image, ObjectAdd lets users specify a box and an object, and achieves: (1)
+ adding the object inside the box area; (2) keeping the content outside the
+ box area exact; (3) flawless fusion between the two areas.
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Blue noise for diffusion models SIGGRAPH 2024 + + +
+ Most of the existing diffusion models use Gaussian noise for training and +sampling across all time steps, which may not optimally account for the +frequency contents reconstructed by the denoising network. Despite the diverse +applications of correlated noise in computer graphics, its potential for +improving the training process has been underexplored. In this paper, we +introduce a novel and general class of diffusion models taking correlated noise +within and across images into account. More specifically, we propose a +time-varying noise model to incorporate correlated noise into the training +process, as well as a method for fast generation of correlated noise mask. Our +model is built upon deterministic diffusion models and utilizes blue noise to +help improve the generation quality compared to using Gaussian white (random) +noise only. Further, our framework allows introducing correlation across images +within a single mini-batch to improve gradient flow. We perform both +qualitative and quantitative evaluations on a variety of datasets using our +method, achieving improvements on different tasks over existing deterministic +diffusion models in terms of FID metric. + +
+
+ comment: SIGGRAPH 2024 Conference Proceedings; Project page: + https://xchhuang.github.io/bndm +
+
+
+
+
+ + ♻ ☆ Correcting Diffusion-Based Perceptual Image Compression with Privileged + End-to-End Decoder ICML 2024 + + +
+ The images produced by diffusion models can attain excellent perceptual
+ quality. However, it is challenging for diffusion models to guarantee low
+ distortion, so the integration of diffusion models and image compression
+ models still requires more comprehensive exploration. This paper presents a
+ diffusion-based image compression method that employs a privileged
+ end-to-end decoder model as a correction mechanism, achieving better
+ perceptual quality while keeping the distortion bounded to an extent. We
+ build a diffusion model and design a novel paradigm that combines the
+ diffusion model with an end-to-end decoder, where the latter is responsible
+ for transmitting the privileged information extracted at the encoder side.
+ Specifically, we theoretically analyze the reconstruction process of the
+ diffusion model at the encoder side, where the original images are visible.
+ Based on this analysis, we introduce an end-to-end convolutional decoder that
+ provides a better approximation of the score function
+ $\nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t)$ at the encoder side and
+ effectively transmits the combination. Experiments demonstrate the
+ superiority of our method in both distortion and perception compared with
+ previous perceptual compression methods.
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ DA-RAW: Domain Adaptive Object Detection for Real-World Adverse Weather + Conditions ICRA 2024 + + +
+ Despite the success of deep learning-based object detection methods in recent +years, it is still challenging to make the object detector reliable in adverse +weather conditions such as rain and snow. For the robust performance of object +detectors, unsupervised domain adaptation has been utilized to adapt the +detection network trained on clear weather images to adverse weather images. +While previous methods do not explicitly address weather corruption during +adaptation, the domain gap between clear and adverse weather can be decomposed +into two factors with distinct characteristics: a style gap and a weather gap. +In this paper, we present an unsupervised domain adaptation framework for +object detection that can more effectively adapt to real-world environments +with adverse weather conditions by addressing these two gaps separately. Our +method resolves the style gap by concentrating on style-related information of +high-level features using an attention module. Using self-supervised +contrastive learning, our framework then reduces the weather gap and acquires +instance features that are robust to weather corruption. Extensive experiments +demonstrate that our method outperforms other methods for object detection in +adverse weather conditions. + +
+
+ comment: Accepted to ICRA 2024. Our project website can be found at + https://bit.ly/3yccTRa +
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion Models for Inverse Problems through Shortcut + Sampling IJCAI 2024 + + +
+ Diffusion models have recently demonstrated an impressive ability to address
+ inverse problems in an unsupervised manner. While existing methods primarily
+ focus on modifying the posterior sampling process, the potential of the
+ forward process remains largely unexplored. In this work, we propose
+ Shortcut Sampling for Diffusion (SSD), a novel approach for solving inverse
+ problems in a zero-shot manner. Instead of initiating from random noise, the
+ core concept of SSD is to find a specific transitional state that bridges the
+ measurement image y and the restored image x. By utilizing the shortcut path
+ "input - transitional state - output", SSD can achieve precise restoration
+ with fewer steps. To derive the transitional state during the forward
+ process, we introduce Distortion Adaptive Inversion. Moreover, we apply back
+ projection as an additional consistency constraint during the generation
+ process. Experimentally, we demonstrate SSD's effectiveness on multiple
+ representative image restoration tasks. Our method achieves competitive
+ results with only 30 NFEs compared to state-of-the-art zero-shot methods
+ (100 NFEs) and outperforms them with 100 NFEs on certain tasks. Code is
+ available at https://github.com/GongyeLiu/SSD.
+
+ comment: full version; IJCAI 2024 accepted (main track) +
+
+
+
+
+ + ♻ ☆ CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained + Vision-Language Model + + +
+ Pre-trained vision-language models (VLMs) are the de-facto foundation models
+ for various downstream tasks. However, scene text recognition methods still
+ prefer backbones pre-trained on a single modality, namely, the visual
+ modality, despite the potential of VLMs to serve as powerful scene text
+ readers. For example, CLIP can robustly identify regular (horizontal) and
+ irregular (rotated, curved, blurred, or occluded) text in images. With such
+ merits, we transform CLIP into a scene text reader and introduce CLIP4STR, a
+ simple yet effective STR method built upon image and text encoders of CLIP.
+ It has two encoder-decoder branches: a visual branch and a cross-modal
+ branch. The visual branch provides an initial prediction based on the visual
+ feature, and the cross-modal branch refines this prediction by addressing the
+ discrepancy between the visual feature and text semantics. To fully leverage
+ the capabilities of both branches, we design a dual predict-and-refine
+ decoding scheme for inference. We scale CLIP4STR in terms of the model size,
+ pre-training data, and training data, achieving state-of-the-art performance
+ on 11 STR benchmarks. Additionally, a comprehensive empirical study is
+ provided to enhance the understanding of the adaptation of CLIP to STR. We
+ believe our method establishes a simple yet strong baseline for future STR
+ research with VLMs.
+
+ comment: Preprint. A PyTorch re-implementation is at + https://github.com/VamosC/CLIP4STR +
+
+
+
+
+ + ♻ ☆ Fingerprint Matching with Localized Deep Representation + + +
+ Compared to minutia-based fingerprint representations, fixed-length +representations are attractive due to simple and efficient matching. However, +fixed-length fingerprint representations are limited in accuracy when matching +fingerprints with different visible areas, which can occur due to different +finger poses or acquisition methods. To address this issue, we propose a +localized deep representation of fingerprint, named LDRF. By focusing on the +discriminative characteristics within local regions, LDRF provides a more +robust and accurate fixed-length representation for fingerprints with variable +visible areas. LDRF can be adapted to retain information within any valid area, +making it highly flexible. The matching scores produced by LDRF also exhibit +intuitive statistical characteristics, which led us to propose a matching score +normalization technique to mitigate the uncertainty in the cases of very small +overlapping area. With this new technique, we can maintain a high level of +accuracy and reliability in our fingerprint matching, even as the size of the +database grows rapidly. Our experimental results on 21 datasets containing over +140K fingerprints of various finger poses and impression types show that LDRF +outperforms other fixed-length representations and is robust to sensing +technologies and impression types. Besides, the proposed matching score +normalization effectively reduces the false match rate (FMR) in large-scale +identification experiments comprising over 5.11 million fingerprints. +Specifically, this technique results in a reduction of two orders of magnitude +compared to matching without matching score normalization and five orders of +magnitude compared to prior works. + +
+
+ comment: 18 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ 3D Gaussian Blendshapes for Head Avatar Animation SIGGRAPH + + +
+ We introduce 3D Gaussian blendshapes for modeling photorealistic head +avatars. Taking a monocular video as input, we learn a base head model of +neutral expression, along with a group of expression blendshapes, each of which +corresponds to a basis expression in classical parametric face models. Both the +neutral model and expression blendshapes are represented as 3D Gaussians, which +contain a few properties to depict the avatar appearance. The avatar model of +an arbitrary expression can be effectively generated by combining the neutral +model and expression blendshapes through linear blending of Gaussians with the +expression coefficients. High-fidelity head avatar animations can be +synthesized in real time using Gaussian splatting. Compared to state-of-the-art +methods, our Gaussian blendshape representation better captures high-frequency +details exhibited in input video, and achieves superior rendering performance. + +
+
+ comment: ACM SIGGRAPH Conference Proceedings 2024 +
+
+
+
+
+ + ♻ ☆ Content Bias in Deep Learning Image Age Approximation: A new Approach + Towards better Explainability + + +
+ In the context of temporal image forensics, it is not evident that a neural +network, trained on images from different time-slots (classes), exploits solely +image age related features. Usually, images taken in close temporal proximity +(e.g., belonging to the same age class) share some common content properties. +Such content bias can be exploited by a neural network. In this work, a novel +approach is proposed that evaluates the influence of image content. This +approach is verified using synthetic images (where content bias can be ruled +out) with an age signal embedded. Based on the proposed approach, it is shown +that a deep learning approach proposed in the context of age classification is +most likely highly dependent on the image content. As a possible +countermeasure, two different models from the field of image steganalysis, +along with three different preprocessing techniques to increase the +signal-to-noise ratio (age signal to image content), are evaluated using the +proposed method. + +
+
+ comment: This is a preprint, the paper is currently under consideration at + Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification with Deep Ensembles for 6D Object Pose + Estimation + + +
+ The estimation of 6D object poses is a fundamental task in many computer +vision applications. Particularly, in high risk scenarios such as human-robot +interaction, industrial inspection, and automation, reliable pose estimates are +crucial. In the last years, increasingly accurate and robust +deep-learning-based approaches for 6D object pose estimation have been +proposed. Many top-performing methods are not end-to-end trainable but consist +of multiple stages. In the context of deep uncertainty quantification, deep +ensembles are considered as state of the art since they have been proven to +produce well-calibrated and robust uncertainty estimates. However, deep +ensembles can only be applied to methods that can be trained end-to-end. In +this work, we propose a method to quantify the uncertainty of multi-stage 6D +object pose estimation approaches with deep ensembles. For the implementation, +we choose SurfEmb as representative, since it is one of the top-performing 6D +object pose estimation approaches in the BOP Challenge 2022. We apply +established metrics and concepts for deep uncertainty quantification to +evaluate the results. Furthermore, we propose a novel uncertainty calibration +score for regression tasks to quantify the quality of the estimated +uncertainty. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Landmark-Guided Cross-Speaker Lip Reading with Mutual Information + Regularization LREC + + +
+ Lip reading, the process of interpreting silent speech from visual lip +movements, has gained rising attention for its wide range of realistic +applications. Deep learning approaches greatly improve current lip reading +systems. However, lip reading in cross-speaker scenarios where the speaker +identity changes, poses a challenging problem due to inter-speaker variability. +A well-trained lip reading system may perform poorly when handling a brand new +speaker. To learn a speaker-robust lip reading model, a key insight is to +reduce visual variations across speakers, avoiding the model overfitting to +specific speakers. In this work, in view of both input visual clues and latent +representations based on a hybrid CTC/attention architecture, we propose to +exploit the lip landmark-guided fine-grained visual clues instead of +frequently-used mouth-cropped images as input features, diminishing +speaker-specific appearance characteristics. Furthermore, a max-min mutual +information regularization approach is proposed to capture speaker-insensitive +latent representations. Experimental evaluations on public lip reading datasets +demonstrate the effectiveness of the proposed approach under the intra-speaker +and inter-speaker conditions. + +
+
+ comment: To appear in LREC-COLING 2024 +
+
+
+
+
+ + ♻ ☆ Continual Action Assessment via Task-Consistent Score-Discriminative + Feature Distribution Modeling + + +
+ Action Quality Assessment (AQA) is a task that tries to answer how well an
+ action is carried out. While remarkable progress has been achieved, existing
+ works on AQA assume that all the training data are visible for training at
+ one time, but do not enable continual learning on assessing new technical
+ actions. In this work, we address such a Continual Learning problem in AQA
+ (Continual-AQA), which urges a unified model to learn AQA tasks sequentially
+ without forgetting. Our idea for modeling Continual-AQA is to sequentially
+ learn a task-consistent score-discriminative feature distribution, in which
+ the latent features express a strong correlation with the score labels
+ regardless of the task or action types. From this perspective, we aim to
+ mitigate the forgetting in Continual-AQA from two aspects. Firstly, to fuse
+ the features of new and previous data into a score-discriminative
+ distribution, a novel Feature-Score Correlation-Aware Rehearsal is proposed
+ to store and reuse data from previous tasks with limited memory size.
+ Secondly, an Action General-Specific Graph is developed to learn and decouple
+ the action-general and action-specific knowledge so that the task-consistent
+ score-discriminative features can be better extracted across various tasks.
+ Extensive experiments are conducted to evaluate the contributions of the
+ proposed components. The comparisons with existing continual learning methods
+ additionally verify the effectiveness and versatility of our approach. Data
+ and code are available at https://github.com/iSEE-Laboratory/Continual-AQA.
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on + Bidirectional Selective SSM + + +
+ Wearable sensor-based human activity recognition (HAR) is a critical research
+ domain in activity perception. However, achieving high efficiency and long
+ sequence recognition remains a challenge. Despite the extensive investigation
+ of temporal deep learning models, such as CNNs, RNNs, and transformers, their
+ large parameter counts often pose significant computational and memory
+ constraints, rendering them less suitable for resource-constrained mobile
+ health applications. This study introduces HARMamba, an innovative
+ lightweight and versatile HAR architecture that combines selective
+ bidirectional SSM and hardware-aware design. To optimize real-time resource
+ consumption in practical scenarios, HARMamba employs linear recursive
+ mechanisms and parameter discretization, allowing it to selectively focus on
+ relevant input sequences while efficiently fusing scan and recompute
+ operations. To address potential issues with invalid sensor data, the system
+ processes the data stream through independent channels, dividing each channel
+ into "patches" and appending a classification token to the end of the
+ sequence. Position embeddings are incorporated to represent the sequence
+ order, and the activity categories are output through a classification head.
+ The HARMamba Block serves as the fundamental component of the HARMamba
+ architecture, enabling the effective capture of more discriminative activity
+ sequence features. HARMamba outperforms contemporary state-of-the-art
+ frameworks, delivering comparable or better accuracy while significantly
+ reducing computational and memory demands. Its effectiveness has been
+ extensively validated on public datasets such as PAMAP2, WISDM, UniMiB SHAR,
+ and UCI, showcasing impressive results.
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Score Distillation for Text-to-3D Generation ICML 2024 + + +
+ Text-to-3D generation has achieved significant success by incorporating
+ powerful 2D diffusion models, but insufficient 3D prior knowledge also leads
+ to inconsistent 3D geometry. Recently, with the release of large-scale
+ multi-view datasets, fine-tuning diffusion models on multi-view data has
+ become a mainstream approach to solving the 3D inconsistency problem.
+ However, this approach confronts fundamental difficulties regarding the
+ limited quality and diversity of 3D data compared with 2D data. To sidestep
+ these trade-offs, we explore a retrieval-augmented approach tailored for
+ score distillation, dubbed ReDream. We postulate that both the expressiveness
+ of 2D diffusion models and the geometric consistency of 3D assets can be
+ fully leveraged by employing semantically relevant assets directly within the
+ optimization process. To this end, we introduce a novel framework for
+ retrieval-based quality enhancement in text-to-3D generation. We leverage the
+ retrieved asset to incorporate its geometric prior in the variational
+ objective and adapt the diffusion model's 2D prior toward view consistency,
+ achieving drastic improvements in both the geometry and fidelity of generated
+ scenes. We conduct extensive experiments to demonstrate that ReDream exhibits
+ superior quality with increased geometric consistency. The project page is
+ available at https://ku-cvlab.github.io/ReDream/.
+
+ comment: Accepted to ICML 2024 / Project Page: + https://ku-cvlab.github.io/ReDream/ +
+
+
+
+
+ + ♻ ☆ Morphing Tokens Draw Strong Masked Image Models + + +
+ Masked image modeling (MIM) is a promising option for training Vision
+ Transformers among various self-supervised learning (SSL) methods. The
+ essence of MIM lies in token-wise masked token prediction, with targets
+ tokenized from images or generated by pre-trained models such as
+ vision-language models. While tokenizers or pre-trained models are plausible
+ MIM targets, they often offer spatially inconsistent targets even for
+ neighboring tokens, making it harder for models to learn unified
+ discriminative representations. Our pilot study confirms that addressing
+ spatial inconsistencies has the potential to enhance representation quality.
+ Motivated by these findings, we introduce a novel self-supervision signal
+ called Dynamic Token Morphing (DTM), which dynamically aggregates
+ contextually related tokens to yield contextualized targets. DTM is
+ compatible with various SSL frameworks; we showcase an improved MIM by
+ employing DTM while barely introducing extra training costs. Our experiments
+ on ImageNet-1K and ADE20K demonstrate the superiority of our method compared
+ with state-of-the-art, complex MIM methods. Furthermore, comparative
+ evaluations on the iNaturalist and fine-grained visual classification
+ datasets further validate the transferability of our method to various
+ downstream tasks. Code is available at https://github.com/naver-ai/dtm
+
+ comment: 27 pages, 17 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Few-Shot Learning with Uncertainty-based Quadruplet Selection for + Interference Classification in GNSS Data + + +
+ Jamming devices pose a significant threat by disrupting signals from the +global navigation satellite system (GNSS), compromising the robustness of +accurate positioning. Detecting anomalies in frequency snapshots is crucial to +counteract these interferences effectively. The ability to adapt to diverse, +unseen interference characteristics is essential for ensuring the reliability +of GNSS in real-world applications. In this paper, we propose a few-shot +learning (FSL) approach to adapt to new interference classes. Our method +employs quadruplet selection for the model to learn representations using +various positive and negative interference classes. Furthermore, our quadruplet +variant selects pairs based on the aleatoric and epistemic uncertainty to +differentiate between similar classes. We recorded a dataset at a motorway with +eight interference classes on which our FSL method with quadruplet loss +outperforms other FSL techniques in jammer classification accuracy with 97.66%. +Dataset available at: +https://gitlab.cc-asp.fraunhofer.de/darcy_gnss/FIOT_highway + +
+
+
+
+
+ + ♻ ☆ Joint covariance properties under geometric image transformations for + spatio-temporal receptive fields according to the generalized Gaussian + derivative model for visual receptive fields + + +
+ The influence of natural image transformations on receptive field responses +is crucial for modelling visual operations in computer vision and biological +vision. In this regard, covariance properties with respect to geometric image +transformations in the earliest layers of the visual hierarchy are essential +for expressing robust image operations, and for formulating invariant visual +operations at higher levels. + This paper defines and proves a set of joint covariance properties under +compositions of spatial scaling transformations, spatial affine +transformations, Galilean transformations and temporal scaling transformations, +which make it possible to characterize how different types of image +transformations interact with each other and the associated spatio-temporal +receptive field responses. In this regard, we also extend the notion of +scale-normalized derivatives to affine-normalized derivatives, to be able to +obtain true affine-covariant properties of spatial derivatives, that are +computed based on spatial smoothing with affine Gaussian kernels. + The derived relations show how the parameters of the receptive fields need to +be transformed, in order to match the output from spatio-temporal receptive +fields under composed spatio-temporal image transformations. As a side effect, +the presented proof for the joint covariance property over the integrated +combination of the different geometric image transformations also provides +specific proofs for the individual transformation properties, which have not +previously been fully reported in the literature. + The paper also presents an in-depth theoretical analysis of geometric +interpretations of the derived covariance properties, as well as outlines a +number of biological interpretations of these results. + +
+
+ comment: 38 pages, 13 figures. Note: From version 4, this paper considers a + different form of joint composition of the geometric image transformations + than in the earlier versions +
+
+
+
+
+ + ♻ ☆ The Perception-Robustness Tradeoff in Deterministic Image Restoration + + +
+ We study the behavior of deterministic methods for solving inverse problems +in imaging. These methods are commonly designed to achieve two goals: (1) +attaining high perceptual quality, and (2) generating reconstructions that are +consistent with the measurements. We provide a rigorous proof that the better a +predictor satisfies these two requirements, the larger its Lipschitz constant +must be, regardless of the nature of the degradation involved. In particular, +to approach perfect perceptual quality and perfect consistency, the Lipschitz +constant of the model must grow to infinity. This implies that such methods are +necessarily more susceptible to adversarial attacks. We demonstrate our theory +on single image super-resolution algorithms, addressing both noisy and +noiseless settings. We also show how this undesired behavior can be leveraged +to explore the posterior distribution, thereby allowing the deterministic model +to imitate stochastic methods. + +
+
+
+
+
+ + ♻ ☆ COMET: Contrastive Mean Teacher for Online Source-Free Universal Domain + Adaptation IJCNN + + +
+ In real-world applications, there is often a domain shift from training to +test data. This observation resulted in the development of test-time adaptation +(TTA). It aims to adapt a pre-trained source model to the test data without +requiring access to the source data. Thereby, most existing works are limited +to the closed-set assumption, i.e. there is no category shift between source +and target domain. We argue that in a realistic open-world setting a category +shift can appear in addition to a domain shift. This means, individual source +classes may not appear in the target domain anymore, samples of new classes may +be part of the target domain or even both at the same time. Moreover, in many +real-world scenarios the test data is not accessible all at once but arrives +sequentially as a stream of batches demanding an immediate prediction. Hence, +TTA must be applied in an online manner. To the best of our knowledge, the +combination of these aspects, i.e. online source-free universal domain +adaptation (online SF-UniDA), has not been studied yet. In this paper, we +introduce a Contrastive Mean Teacher (COMET) tailored to this novel scenario. +It applies a contrastive loss to rebuild a feature space where the samples of +known classes build distinct clusters and the samples of new classes separate +well from them. It is complemented by an entropy loss which ensures that the +classifier output has a small entropy for samples of known classes and a large +entropy for samples of new classes to be easily detected and rejected as +unknown. To provide the losses with reliable pseudo labels, they are embedded +into a mean teacher (MT) framework. We evaluate our method across two datasets +and all category shifts to set an initial benchmark for online SF-UniDA. +Thereby, COMET yields state-of-the-art performance and proves to be consistent +and robust across a variety of different scenarios. + +
+
+ comment: Accepted at the International Joint Conference on Neural Networks + (IJCNN) 2024 +
+
+
+
+
+ + ♻ ☆ Automotive Object Detection via Learning Sparse Events by Spiking + Neurons + + +
+ Event-based sensors, distinguished by their high temporal resolution of 1 +$\mathrm{\mu}\text{s}$ and a dynamic range of 120 $\text{dB}$, stand out as +ideal tools for deployment in fast-paced settings like vehicles and drones. +Traditional object detection techniques that utilize Artificial Neural Networks +(ANNs) face challenges due to the sparse and asynchronous nature of the events +these sensors capture. In contrast, Spiking Neural Networks (SNNs) offer a +promising alternative, providing a temporal representation that is inherently +aligned with event-based data. This paper explores the unique membrane +potential dynamics of SNNs and their ability to modulate sparse events. We +introduce an innovative spike-triggered adaptive threshold mechanism designed +for stable training. Building on these insights, we present a specialized +spiking feature pyramid network (SpikeFPN) optimized for automotive event-based +object detection. Comprehensive evaluations demonstrate that SpikeFPN surpasses +both traditional SNNs and advanced ANNs enhanced with attention mechanisms. +Evidently, SpikeFPN achieves a mean Average Precision (mAP) of 0.477 on the +GEN1 Automotive Detection (GAD) benchmark dataset, marking significant +increases over the selected SNN baselines. Moreover, the efficient design of +SpikeFPN ensures robust performance while optimizing computational resources, +attributed to its innate sparse computation capabilities. + +
+
+
+
+
+ + ♻ ☆ SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed + Semi-Supervised Learning ICML2024 + + +
+ Recent advancements in semi-supervised learning have focused on a more +realistic yet challenging task: addressing imbalances in labeled data while the +class distribution of unlabeled data remains both unknown and potentially +mismatched. Current approaches in this sphere often presuppose rigid +assumptions regarding the class distribution of unlabeled data, thereby +limiting the adaptability of models to only certain distribution ranges. In +this study, we propose a novel approach, introducing a highly adaptable +framework, designated as SimPro, which does not rely on any predefined +assumptions about the distribution of unlabeled data. Our framework, grounded +in a probabilistic model, innovatively refines the expectation-maximization +(EM) algorithm by explicitly decoupling the modeling of conditional and +marginal class distributions. This separation facilitates a closed-form +solution for class distribution estimation during the maximization phase, +leading to the formulation of a Bayes classifier. The Bayes classifier, in +turn, enhances the quality of pseudo-labels in the expectation phase. +Remarkably, the SimPro framework not only comes with theoretical guarantees but +also is straightforward to implement. Moreover, we introduce two novel class +distributions broadening the scope of the evaluation. Our method showcases +consistent state-of-the-art performance across diverse benchmarks and data +distribution scenarios. Our code is available at +https://github.com/LeapLabTHU/SimPro. + +
+
+ comment: ICML2024 +
+
+
+
+
+ + ♻ ☆ RSCaMa: Remote Sensing Image Change Captioning with State Space Model + + +
+ Remote Sensing Image Change Captioning (RSICC) aims to describe surface +changes between multi-temporal remote sensing images in language, including the +changed object categories, locations, and dynamics of changing objects (e.g., +added or disappeared). This poses challenges to spatial and temporal modeling +of bi-temporal features. Despite previous methods progressing in the spatial +change perception, there are still weaknesses in joint spatial-temporal +modeling. To address this, in this paper, we propose a novel RSCaMa model, +which achieves efficient joint spatial-temporal modeling through multiple CaMa +layers, enabling iterative refinement of bi-temporal features. To achieve +efficient spatial modeling, we introduce the recently popular Mamba (a state +space model) with a global receptive field and linear complexity into the RSICC +task and propose the Spatial Difference-aware SSM (SD-SSM), overcoming +limitations of previous CNN- and Transformer-based methods in the receptive +field and computational complexity. SD-SSM enhances the model's ability to +capture spatial changes sharply. In terms of efficient temporal modeling, +considering the potential correlation between the temporal scanning +characteristics of Mamba and the temporality of the RSICC, we propose the +Temporal-Traversing SSM (TT-SSM), which scans bi-temporal features in a +temporal cross-wise manner, enhancing the model's temporal understanding and +information interaction. Experiments validate the effectiveness of the +efficient joint spatial-temporal modeling and demonstrate the outstanding +performance of RSCaMa and the potential of the Mamba in the RSICC task. +Additionally, we systematically compare three different language decoders, +including Mamba, GPT-style decoder, and Transformer decoder, providing valuable +insights for future RSICC research. The code will be available at +\emph{\url{https://github.com/Chen-Yang-Liu/RSCaMa}} + +
+
+
+
+
+ + ♻ ☆ Efficient Remote Sensing with Harmonized Transfer Learning and Modality + Alignment ICLR + + +
+ With the rise of Visual and Language Pretraining (VLP), an increasing number +of downstream tasks are adopting the paradigm of pretraining followed by +fine-tuning. Although this paradigm has demonstrated potential in various +multimodal downstream tasks, its implementation in the remote sensing domain +encounters some obstacles. Specifically, the tendency for same-modality +embeddings to cluster together impedes efficient transfer learning. To tackle +this issue, we review the aim of multimodal transfer learning for downstream +tasks from a unified perspective, and rethink the optimization process based on +three distinct objectives. We propose "Harmonized Transfer Learning and +Modality Alignment (HarMA)", a method that simultaneously satisfies task +constraints, modality alignment, and single-modality uniform alignment, while +minimizing training overhead through parameter-efficient fine-tuning. +Remarkably, without the need for external data for training, HarMA achieves +state-of-the-art performance in two popular multimodal retrieval tasks in the +field of remote sensing. Our experiments reveal that HarMA achieves competitive +and even superior performance to fully fine-tuned models with only minimal +adjustable parameters. Due to its simplicity, HarMA can be integrated into +almost all existing multimodal pretraining models. We hope this method can +facilitate the efficient application of large models to a wide range of +downstream tasks while significantly reducing the resource consumption. Code is +available at https://github.com/seekerhuang/HarMA. + +
+
+ comment: Accepted by the Twelfth International Conference on Learning + Representations (ICLR) Workshop +
+
+
+
+
+ + ♻ ☆ TExplain: Explaining Learned Visual Features via Pre-trained (Frozen) + Language Models ICLR 2024 + + +
+ Interpreting the learned features of vision models has posed a longstanding +challenge in the field of machine learning. To address this issue, we propose a +novel method that leverages the capabilities of language models to interpret +the learned features of pre-trained image classifiers. Our method, called +TExplain, tackles this task by training a neural network to establish a +connection between the feature space of image classifiers and language models. +Then, during inference, our approach generates a vast number of sentences to +explain the features learned by the classifier for a given image. These +sentences are then used to extract the most frequent words, providing a +comprehensive understanding of the learned features and patterns within the +classifier. Our method, for the first time, utilizes these frequent words +corresponding to a visual representation to provide insights into the +decision-making process of the independently trained classifier, enabling the +detection of spurious correlations, biases, and a deeper comprehension of its +behavior. To validate the effectiveness of our approach, we conduct experiments +on diverse datasets, including ImageNet-9L and Waterbirds. The results +demonstrate the potential of our method to enhance the interpretability and +robustness of image classifiers. + +
+
+ comment: Accepted to ICLR 2024, Reliable and Responsible Foundation Models + workshop +
+
+
+
+
+ + ♻ ☆ Searching from Area to Point: A Hierarchical Framework for + Semantic-Geometric Combined Feature Matching + + +
+ Feature matching is a crucial technique in computer vision. A unified
+ perspective for this task is to treat it as a searching problem, aiming at an
+ efficient search strategy to narrow the search space to point matches between
+ images. One of the key aspects of the search strategy is the search space,
+ which in current approaches is not carefully defined, resulting in limited
+ matching accuracy. This paper therefore focuses on the search space and
+ proposes to set the initial search space for point matching to the matched
+ image areas containing prominent semantics, named semantic area matches. This
+ search space favors point matching by salient features and alleviates the
+ accuracy limitation of recent Transformer-based matching methods. To achieve
+ this search space, we introduce a hierarchical feature matching framework,
+ Area to Point Matching (A2PM), which first finds semantic area matches
+ between images and then performs point matching within the area matches. We
+ further propose the Semantic and Geometry Area Matching (SGAM) method to
+ realize this framework, which utilizes semantic priors and geometric
+ consistency to establish accurate area matches between images. By integrating
+ SGAM with off-the-shelf state-of-the-art matchers, our method, adopting the
+ A2PM framework, achieves encouraging precision improvements in massive point
+ matching and pose estimation experiments.
+
+ comment: v3 +
+
+
+
+
+ + ♻ ☆ CIC: A framework for Culturally-aware Image Captioning IJCAI 2024 + + +
+ Image Captioning generates descriptive sentences from images using +Vision-Language Pre-trained models (VLPs) such as BLIP, which has improved +greatly. However, current methods lack the generation of detailed descriptive +captions for the cultural elements depicted in the images, such as the +traditional clothing worn by people from Asian cultural groups. In this paper, +we propose a new framework, \textbf{Culturally-aware Image Captioning (CIC)}, +that generates captions and describes cultural elements extracted from cultural +visual elements in images representing cultures. Inspired by methods combining +visual modality and Large Language Models (LLMs) through appropriate prompts, +our framework (1) generates questions based on cultural categories from images, +(2) extracts cultural visual elements from Visual Question Answering (VQA) +using generated questions, and (3) generates culturally-aware captions using +LLMs with the prompts. Our human evaluation conducted on 45 participants from 4 +different cultural groups with a high understanding of the corresponding +culture shows that our proposed framework generates more culturally descriptive +captions when compared to the image captioning baseline based on VLPs. Our code +and dataset will be made publicly available upon acceptance. + +
+
+ comment: Accepted in IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than + We Think + + +
+ Adversarial examples for diffusion models are widely used as solutions for
+ safety concerns. By adding adversarial perturbations to personal images,
+ attackers cannot edit or imitate them easily. However, it is essential to
+ note that all these protections target latent diffusion models (LDMs);
+ adversarial examples for diffusion models in the pixel space (PDMs) are
+ largely overlooked. This may mislead us into thinking that diffusion models
+ are vulnerable to adversarial attacks like most deep models. In this paper,
+ we present the novel finding that even though gradient-based white-box
+ attacks can be used to attack LDMs, they fail to attack PDMs. This finding is
+ supported by extensive experiments with a wide range of attack methods on
+ various PDMs and LDMs with different model structures, which indicates that
+ diffusion models are indeed much more robust against adversarial attacks than
+ previously thought. We also find that PDMs can be used as an off-the-shelf
+ purifier to effectively remove the adversarial patterns generated on LDMs to
+ protect images, which means that most protection methods nowadays, to some
+ extent, cannot protect our images from malicious attacks. We hope that our
+ insights will inspire the community to rethink adversarial examples for
+ diffusion models as protection methods and move toward more effective
+ protection. Code is available at https://github.com/xavihart/PDM-Pure.
+
+
+
+
+ + ♻ ☆ PARASOL: Parametric Style Control for Diffusion Image Synthesis + + +
+ We propose PARASOL, a multi-modal synthesis model that enables disentangled, +parametric control of the visual style of the image by jointly conditioning +synthesis on both content and a fine-grained visual style embedding. We train a +latent diffusion model (LDM) using specific losses for each modality and adapt +the classifier-free guidance for encouraging disentangled control over +independent content and style modalities at inference time. We leverage +auxiliary semantic and style-based search to create training triplets for +supervision of the LDM, ensuring complementarity of content and style cues. +PARASOL shows promise for enabling nuanced control over visual style in +diffusion models for image creation and stylization, as well as generative +search where text-based search results may be adapted to more closely match +user intent by interpolating both content and style descriptors. + +
+
+ comment: Camera-ready version +
+
+
+
+
+ + ♻ ☆ Underwater Variable Zoom: Depth-Guided Perception Network for Underwater + Image Enhancement + + +
+ Underwater scenes intrinsically involve degradation problems owing to +heterogeneous ocean elements. Prevailing underwater image enhancement (UIE) +methods stick to straightforward feature modeling to learn the mapping +function, which leads to limited vision gain as it lacks more explicit physical +cues (e.g., depth). In this work, we investigate injecting the depth prior into +the deep UIE model for more precise scene enhancement capability. To this end, +we present a novel depth-guided perception UIE framework, dubbed underwater +variable zoom (UVZ). Specifically, UVZ resorts to a two-stage pipeline. First, +a depth estimation network is designed to generate critical depth maps, +combined with an auxiliary supervision network introduced to suppress +estimation differences during training. Second, UVZ parses near-far scenarios +by harnessing the predicted depth maps, enabling local and non-local perceiving +in different regions. Extensive experiments on five benchmark datasets +demonstrate that UVZ achieves superior visual gain and delivers promising +quantitative metrics. Besides, UVZ is confirmed to exhibit good generalization +in some visual tasks, especially in unusual lighting conditions. The code, +models and results are available at: https://github.com/WindySprint/UVZ. + +
+
+
+
+
+ + ♻ ☆ Pyramid Pixel Context Adaption Network for Medical Image Classification + with Supervised Contrastive Learning + + +
+ Spatial attention mechanism has been widely incorporated into deep neural +networks (DNNs), significantly lifting the performance in computer vision tasks +via long-range dependency modeling. However, it may perform poorly in medical +image analysis. Unfortunately, existing efforts are often unaware that +long-range dependency modeling has limitations in highlighting subtle lesion +regions. To overcome this limitation, we propose a practical yet lightweight +architectural unit, Pyramid Pixel Context Adaption (PPCA) module, which +exploits multi-scale pixel context information to recalibrate pixel position in +a pixel-independent manner dynamically. PPCA first applies a well-designed +cross-channel pyramid pooling to aggregate multi-scale pixel context +information, then eliminates the inconsistency among them by the well-designed +pixel normalization, and finally estimates per pixel attention weight via a +pixel context integration. By embedding PPCA into a DNN with negligible +overhead, the PPCANet is developed for medical image classification. In +addition, we introduce supervised contrastive learning to enhance feature +representation by exploiting the potential of label information via supervised +contrastive loss. The extensive experiments on six medical image datasets show +that PPCANet outperforms state-of-the-art attention-based networks and recent +deep neural networks. We also provide visual analysis and ablation study to +explain the behavior of PPCANet in the decision-making process. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Intriguing Properties of Diffusion Models: An Empirical Study of the + Natural Attack Capability in Text-to-Image Generative Models + + +
+ Denoising probabilistic diffusion models have shown breakthrough performance +to generate more photo-realistic images or human-level illustrations than the +prior models such as GANs. This high image-generation capability has stimulated +the creation of many downstream applications in various areas. However, we find +that this technology is actually a double-edged sword: We identify a new type +of attack, called the Natural Denoising Diffusion (NDD) attack based on the +finding that state-of-the-art deep neural network (DNN) models still hold their +prediction even if we intentionally remove their robust features, which are +essential to the human visual system (HVS), through text prompts. The NDD +attack shows a significantly high capability to generate low-cost, +model-agnostic, and transferable adversarial attacks by exploiting the natural +attack capability in diffusion models. To systematically evaluate the risk of +the NDD attack, we perform a large-scale empirical study with our newly created +dataset, the Natural Denoising Diffusion Attack (NDDA) dataset. We evaluate the +natural attack capability by answering 6 research questions. Through a user +study, we find that it can achieve an 88% detection rate while being stealthy +to 93% of human subjects; we also find that the non-robust features embedded by +diffusion models contribute to the natural attack capability. To confirm the +model-agnostic and transferable attack capability, we perform the NDD attack +against the Tesla Model 3 and find that 73% of the physically printed attacks +can be detected as stop signs. Our hope is that the study and dataset can help +our community be aware of the risks in diffusion models and facilitate further +research toward robust DNN models. + +
+
+
+
+
+ + ♻ ☆ AnomalyXFusion: Multi-modal Anomaly Synthesis with Diffusion + + +
+ Anomaly synthesis is one of the effective methods to augment abnormal samples
+ for training. However, current anomaly synthesis methods predominantly rely
+ on texture information as input, which limits the fidelity of synthesized
+ abnormal samples, because texture information is insufficient to correctly
+ depict the pattern of anomalies, especially logical anomalies. To surmount
+ this obstacle, we present the AnomalyXFusion framework, designed to harness
+ multi-modality information to enhance the quality of synthesized abnormal
+ samples. The AnomalyXFusion framework comprises two distinct yet synergistic
+ modules: the Multi-modal In-Fusion (MIF) module and the Dynamic Dif-Fusion
+ (DDF) module. The MIF module refines modality alignment by aggregating and
+ integrating various modality features into a unified embedding space, termed
+ X-embedding, which includes image, text, and mask features. Concurrently, the
+ DDF module facilitates controlled generation through an adaptive adjustment
+ of the X-embedding conditioned on the diffusion steps. In addition, to reveal
+ the multi-modality representational power of AnomalyXFusion, we propose a new
+ dataset, called MVTec Caption. More precisely, MVTec Caption provides 2.2k
+ accurate image-mask-text annotations for the MVTec AD and LOCO datasets.
+ Comprehensive evaluations demonstrate the effectiveness of AnomalyXFusion,
+ especially regarding fidelity and diversity for logical anomalies. Project
+ page: http:github.com/hujiecpp/MVTec-Caption
+
+
+
+
+ + ♻ ☆ APLA: Additional Perturbation for Latent Noise with Adversarial Training + Enables Consistency + + +
+ Diffusion models have exhibited promising progress in video generation. +However, they often struggle to retain consistent details within local regions +across frames. One underlying cause is that traditional diffusion models +approximate Gaussian noise distribution by utilizing predictive noise, without +fully accounting for the impact of inherent information within the input +itself. Additionally, these models emphasize the distinction between +predictions and references, neglecting information intrinsic to the videos. To +address this limitation, inspired by the self-attention mechanism, we propose a +novel text-to-video (T2V) generation network structure based on diffusion +models, dubbed Additional Perturbation for Latent noise with Adversarial +training (APLA). Our approach only necessitates a single video as input and +builds upon pre-trained stable diffusion networks. Notably, we introduce an +additional compact network, known as the Video Generation Transformer (VGT). +This auxiliary component is designed to extract perturbations from the inherent +information contained within the input, thereby refining inconsistent pixels +during temporal predictions. We leverage a hybrid architecture of transformers +and convolutions to compensate for temporal intricacies, enhancing consistency +between different frames within the video. Experiments demonstrate a noticeable +improvement in the consistency of the generated videos both qualitatively and +quantitatively. + +
+
+
+
+
+ + ♻ ☆ Utilizing Machine Learning and 3D Neuroimaging to Predict Hearing Loss: + A Comparative Analysis of Dimensionality Reduction and Regression Techniques + + +
+ In this project, we have explored machine learning approaches for predicting
+ hearing loss thresholds from 3D images of the brain's gray matter. We have
+ solved the problem in two phases. In the first phase, we used a 3D CNN model
+ to reduce the high-dimensional input into a latent space and decode it back
+ into the original image, representing the input in a rich feature space. In
+ the second phase, we utilized this model to reduce the input into rich
+ features and used these features to train standard machine learning models
+ for predicting hearing thresholds. In the first phase, we experimented with
+ autoencoders and variational autoencoders for dimensionality reduction and
+ explored random forests, XGBoost, and multi-layer perceptrons for regressing
+ the thresholds. We split the given dataset into training and testing sets and
+ achieved ranges of 8.80 and 22.57 for PT500 and PT4000 on the test set,
+ respectively. The multi-layer perceptron achieved the lowest RMSE among the
+ models.
+ Our approach leverages the unique capabilities of VAEs to capture complex,
+ non-linear relationships within high-dimensional neuroimaging data. We
+ rigorously evaluated the models using various metrics, focusing on the root
+ mean squared error (RMSE). The results highlight the efficacy of the
+ multi-layer neural network model, which outperformed other techniques in
+ terms of accuracy. This project advances the application of data mining in
+ medical diagnostics and enhances our understanding of age-related hearing
+ loss through innovative machine-learning frameworks.
+
+
+
+
+ + ♻ ☆ A Survey on Transferability of Adversarial Examples across Deep Neural + Networks + + +
+ The emergence of Deep Neural Networks (DNNs) has revolutionized various +domains by enabling the resolution of complex tasks spanning image recognition, +natural language processing, and scientific problem-solving. However, this +progress has also brought to light a concerning vulnerability: adversarial +examples. These crafted inputs, imperceptible to humans, can manipulate machine +learning models into making erroneous predictions, raising concerns for +safety-critical applications. An intriguing property of this phenomenon is the +transferability of adversarial examples, where perturbations crafted for one +model can deceive another, often with a different architecture. This intriguing +property enables black-box attacks which circumvents the need for detailed +knowledge of the target model. This survey explores the landscape of the +adversarial transferability of adversarial examples. We categorize existing +methodologies to enhance adversarial transferability and discuss the +fundamental principles guiding each approach. While the predominant body of +research primarily concentrates on image classification, we also extend our +discussion to encompass other vision tasks and beyond. Challenges and +opportunities are discussed, highlighting the importance of fortifying DNNs +against adversarial vulnerabilities in an evolving landscape. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ 6-DoF Grasp Planning using Fast 3D Reconstruction and Grasp Quality CNN + + +
+ Recent consumer demand for home robots has accelerated progress in robotic
+ grasping. However, a key component of the perception pipeline, the depth
+ camera, is still expensive and inaccessible to most consumers. In addition,
+ grasp planning has significantly improved recently by leveraging large
+ datasets and cloud robotics, and by limiting the state and action space to
+ top-down grasps with 4 degrees of freedom (DoF). By leveraging the multi-view
+ geometry of the object using inexpensive equipment, such as off-the-shelf RGB
+ cameras, and state-of-the-art algorithms such as the Learnt Stereo Machine
+ (LSM; Kar et al., 2017), the robot is able to generate more robust 6-DoF
+ grasps from different angles. In this paper, we present a modification of LSM
+ for graspable objects, evaluate the grasps, and develop a 6-DoF grasp planner
+ based on the Grasp Quality CNN (GQ-CNN; Mahler et al., 2017) that exploits
+ multiple camera views to plan a robust grasp, even in the absence of a
+ feasible top-down grasp.
+
+
+
+
+ + ♻ ☆ Learning Hierarchical Image Segmentation For Recognition and By + Recognition ICLR 2024 + + +
+ Large vision and language models learned directly through image-text
+ associations often lack detailed visual substantiation, whereas image
+ segmentation tasks are treated separately from recognition and learned with
+ supervision, without interconnections between the two. Our key observation is
+ that, while an image can be recognized in multiple ways, each way has a
+ consistent part-and-whole visual organization. Segmentation should thus be
+ treated not as an end task to be mastered through supervised learning, but as
+ an internal process that evolves with and supports the ultimate goal of
+ recognition. We propose to integrate a hierarchical segmenter into the
+ recognition process, and to train and adapt the entire model solely on
+ image-level recognition objectives. We learn hierarchical segmentation for
+ free alongside recognition, automatically uncovering part-to-whole
+ relationships that not only underpin but also enhance recognition. Enhancing
+ the Vision Transformer (ViT) with adaptive segment tokens and graph pooling,
+ our model surpasses ViT in unsupervised part-whole discovery, semantic
+ segmentation, image classification, and efficiency. Notably, our model
+ (trained on 1M unlabeled ImageNet images) outperforms SAM (trained on 11M
+ images and 1 billion masks) by an absolute 8% in mIoU on PartImageNet object
+ segmentation.
+
+ comment: ICLR 2024 (spotlight). First two authors contributed equally. Code + available at https://github.com/twke18/CAST +
+
+
+
+
+ + ♻ ☆ Understanding Hyperbolic Metric Learning through Hard Negative Sampling + + +
+ In recent years, there has been a growing trend of incorporating hyperbolic +geometry methods into computer vision. While these methods have achieved +state-of-the-art performance on various metric learning tasks using hyperbolic +distance measurements, the underlying theoretical analysis supporting this +superior performance remains under-explored. In this study, we investigate the +effects of integrating hyperbolic space into metric learning, particularly when +training with a contrastive loss. We identify a need, so far unmet in the +existing literature, for a comprehensive comparison between Euclidean and +hyperbolic spaces regarding the temperature effect in the contrastive loss. To +address this gap, we conduct an extensive investigation to benchmark the +results of Vision Transformers (ViTs) using a hybrid objective function that +combines loss from Euclidean and hyperbolic spaces. Additionally, we provide a +theoretical analysis of the observed performance improvement. We also reveal +that hyperbolic metric learning is highly related to hard negative sampling, +providing insights for future work. This work will provide valuable data points +and experience in understanding hyperbolic image embeddings. To shed more light +on problem-solving and encourage further investigation into our approach, our +code is available online (https://github.com/YunYunY/HypMix). + +
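+ A minimal sketch of the kind of objective discussed above, assuming embeddings already lie inside the Poincare ball; the `poincare_distance` and `hyperbolic_infonce` helpers are illustrative rather than the authors' implementation, and the hybrid Euclidean term is omitted.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def poincare_distance(u, v, eps=1e-5):
+     """Geodesic distance on the Poincare ball (curvature -1)."""
+     sq = torch.sum((u - v) ** 2, dim=-1)
+     nu = torch.clamp(1 - torch.sum(u * u, dim=-1), min=eps)
+     nv = torch.clamp(1 - torch.sum(v * v, dim=-1), min=eps)
+     return torch.acosh(1 + 2 * sq / (nu * nv))
+
+ def hyperbolic_infonce(anchors, positives, temperature=0.2):
+     """InfoNCE-style contrastive loss where similarity = -hyperbolic distance,
+     scaled by the temperature that the study above analyzes."""
+     d = poincare_distance(anchors.unsqueeze(1), positives.unsqueeze(0))  # (N, N)
+     logits = -d / temperature          # smaller distance -> larger logit
+     labels = torch.arange(anchors.size(0))
+     return F.cross_entropy(logits, labels)
+
+ # toy embeddings scaled to sit inside the unit ball
+ z1 = 0.5 * F.normalize(torch.randn(8, 16), dim=-1) * torch.rand(8, 1)
+ z2 = 0.5 * F.normalize(torch.randn(8, 16), dim=-1) * torch.rand(8, 1)
+ print(hyperbolic_infonce(z1, z2, temperature=0.2))
+ ```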
+
+ comment: Published in Proceedings of the IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV), 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Melt Pool Depth Contour Prediction From Surface + Thermal Images via Vision Transformers + + +
+ Insufficient overlap between the melt pools produced during Laser Powder Bed +Fusion (L-PBF) can lead to lack-of-fusion defects and deteriorated mechanical +and fatigue performance. In-situ monitoring of the melt pool subsurface +morphology requires specialized equipment that may not be readily accessible or +scalable. Therefore, we introduce a machine learning framework to correlate +in-situ two-color thermal images observed via high-speed color imaging to the +two-dimensional profile of the melt pool cross-section. Specifically, we employ +a hybrid CNN-Transformer architecture to establish a correlation between single +bead off-axis thermal image sequences and melt pool cross-section contours +measured via optical microscopy. In this architecture, a ResNet model embeds +the spatial information contained within the thermal images to a latent vector, +while a Transformer model correlates the sequence of embedded vectors to +extract temporal information. Our framework is able to model the curvature of +the subsurface melt pool structure, with improved performance in high energy +density regimes compared to analytical melt pool models. The performance of +this model is evaluated through dimensional and geometric comparisons to the +corresponding experimental melt pool observations. + +
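+ A rough sketch of a hybrid CNN-Transformer sequence regressor of the kind described above; the backbone choice, contour parameterization (32 points), and layer sizes are assumptions, not the authors' exact architecture.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ from torchvision.models import resnet18
+
+ class ThermalSequenceRegressor(nn.Module):
+     """A ResNet embeds each thermal frame, a Transformer encoder mixes the
+     sequence, and a linear head regresses a melt-pool contour (32 (x, y) pairs)."""
+     def __init__(self, d_model=256, n_points=32):
+         super().__init__()
+         backbone = resnet18(weights=None)
+         backbone.fc = nn.Linear(backbone.fc.in_features, d_model)
+         self.embed = backbone
+         layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
+         self.temporal = nn.TransformerEncoder(layer, num_layers=2)
+         self.head = nn.Linear(d_model, n_points * 2)
+
+     def forward(self, frames):                       # frames: (B, T, 3, H, W)
+         b, t = frames.shape[:2]
+         z = self.embed(frames.flatten(0, 1)).view(b, t, -1)  # (B, T, d_model)
+         z = self.temporal(z)                          # temporal mixing
+         return self.head(z.mean(dim=1))               # pooled sequence -> contour
+
+ out = ThermalSequenceRegressor()(torch.randn(2, 8, 3, 128, 128))  # (2, 64)
+ ```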
+
+
+
+
+ + ♻ ☆ Deformable ProtoPNet: An Interpretable Image Classifier Using Deformable + Prototypes CVPR 2022 + + +
+ We present a deformable prototypical part network (Deformable ProtoPNet), an +interpretable image classifier that integrates the power of deep learning and +the interpretability of case-based reasoning. This model classifies input +images by comparing them with prototypes learned during training, yielding +explanations in the form of "this looks like that." However, while previous +methods use spatially rigid prototypes, we address this shortcoming by +proposing spatially flexible prototypes. Each prototype is made up of several +prototypical parts that adaptively change their relative spatial positions +depending on the input image. Consequently, a Deformable ProtoPNet can +explicitly capture pose variations and context, improving both model accuracy +and the richness of explanations provided. Compared to other case-based +interpretable models using prototypes, our approach achieves state-of-the-art +accuracy and gives an explanation with greater context. The code is available +at https://github.com/jdonnelly36/Deformable-ProtoPNet. + +
+
+ comment: Published in CVPR 2022 +
+
+
+
+
+ + ♻ ☆ Continual Diffusion with STAMINA: STack-And-Mask INcremental Adapters CVPR + + +
+ Recent work has demonstrated a remarkable ability to customize text-to-image +diffusion models to multiple, fine-grained concepts in a sequential (i.e., +continual) manner while only providing a few example images for each concept. +This setting is known as continual diffusion. Here, we ask the question: Can we +scale these methods to longer concept sequences without forgetting? Although +prior work mitigates the forgetting of previously learned concepts, we show +that its capacity to learn new tasks reaches saturation over longer sequences. +We address this challenge by introducing a novel method, STack-And-Mask +INcremental Adapters (STAMINA), which is composed of low-ranked +attention-masked adapters and customized MLP tokens. STAMINA is designed to +enhance the robust fine-tuning properties of LoRA for sequential concept +learning via learnable hard-attention masks parameterized with low rank MLPs, +enabling precise, scalable learning via sparse adaptation. Notably, all +introduced trainable parameters can be folded back into the model after +training, inducing no additional inference parameter costs. We show that +STAMINA outperforms the prior SOTA for the setting of text-to-image continual +customization on a 50-concept benchmark composed of landmarks and human faces, +with no stored replay data. Additionally, we extended our method to the setting +of continual learning for image classification, demonstrating that our gains +also translate to state-of-the-art performance in this standard benchmark. + +
+
+ comment: CVPR-W 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 108 + +
+
+
+ + ☆ Spectrally Pruned Gaussian Fields with Neural Compensation + + +
+ Recently, 3D Gaussian Splatting, as a novel 3D representation, has garnered +attention for its fast rendering speed and high rendering quality. However, +this comes with high memory consumption, e.g., a well-trained Gaussian field +may utilize three million Gaussian primitives and over 700 MB of memory. We +attribute this high memory footprint to the lack of consideration of the +relationship between primitives. In this paper, we propose a memory-efficient +Gaussian field named SUNDAE with spectral pruning and neural compensation. On +one hand, we construct a graph on the set of Gaussian primitives to model their +relationship and design a spectral down-sampling module to prune out primitives +while preserving desired signals. On the other hand, to compensate for the +quality loss of pruning Gaussians, we exploit a lightweight neural network head +to mix splatted features, which effectively compensates for quality losses +while capturing the relationship between primitives in its weights. We +demonstrate the performance of SUNDAE with extensive results. For example, +SUNDAE can achieve 26.80 PSNR at 145 FPS using 104 MB memory while the vanilla +Gaussian splatting algorithm achieves 25.60 PSNR at 160 FPS using 523 MB +memory, on the Mip-NeRF360 dataset. Codes are publicly available at +https://runyiyang.github.io/projects/SUNDAE/. + +
+
+ comment: Code: https://github.com/RunyiYang/SUNDAE Project page: + https://runyiyang.github.io/projects/SUNDAE/ +
+
+
+
+
+ + ☆ TexSliders: Diffusion-Based Texture Editing in CLIP Space SIGGRAPH 2024 + + +
+ Generative models have enabled intuitive image creation and manipulation +using natural language. In particular, diffusion models have recently shown +remarkable results for natural image editing. In this work, we propose to apply +diffusion techniques to edit textures, a specific class of images that are an +essential part of 3D content creation pipelines. We analyze existing editing +methods and show that they are not directly applicable to textures, since their +common underlying approach, manipulating attention maps, is unsuitable for the +texture domain. To address this, we propose a novel approach that instead +manipulates CLIP image embeddings to condition the diffusion generation. We +define editing directions using simple text prompts (e.g., "aged wood" to "new +wood") and map these to CLIP image embedding space using a texture prior, with +a sampling-based approach that gives us identity-preserving directions in CLIP +space. To further improve identity preservation, we project these directions to +a CLIP subspace that minimizes identity variations resulting from entangled +texture attributes. Our editing pipeline facilitates the creation of arbitrary +sliders using natural language prompts only, with no ground-truth annotated +data necessary. + +
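+ A simplified, hypothetical sketch of editing in CLIP image-embedding space with a text-defined direction; the texture prior, the sampling-based identity-preserving directions, and the subspace projection described above are omitted, and the helper names are illustrative.
+
+ ```python
+ import torch
+ from transformers import CLIPModel, CLIPProcessor
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ @torch.no_grad()
+ def edit_direction(src_prompt, dst_prompt):
+     """Direction in CLIP space from a prompt pair, e.g. 'aged wood' -> 'new wood'."""
+     tokens = processor(text=[src_prompt, dst_prompt], return_tensors="pt", padding=True)
+     t = model.get_text_features(**tokens)
+     t = t / t.norm(dim=-1, keepdim=True)
+     return t[1] - t[0]
+
+ @torch.no_grad()
+ def edited_image_embedding(image, direction, strength=0.5):
+     """Shift a texture's CLIP image embedding along the direction; `strength`
+     plays the role of the slider value conditioning the diffusion model."""
+     inputs = processor(images=image, return_tensors="pt")   # image: PIL.Image
+     z = model.get_image_features(**inputs)
+     z = z / z.norm(dim=-1, keepdim=True)
+     z_edit = z + strength * direction
+     return z_edit / z_edit.norm(dim=-1, keepdim=True)
+
+ direction = edit_direction("aged wood", "new wood")
+ ```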
+
+ comment: SIGGRAPH 2024 Conference Proceedings +
+
+
+
+
+ + ☆ Adapting Pretrained Networks for Image Quality Assessment on High + Dynamic Range Displays + + +
+ Conventional image quality metrics (IQMs), such as PSNR and SSIM, are +designed for perceptually uniform gamma-encoded pixel values and cannot be +directly applied to perceptually non-uniform linear high-dynamic-range (HDR) +colors. Similarly, most of the available datasets consist of +standard-dynamic-range (SDR) images collected in standard and possibly +uncontrolled viewing conditions. Popular pre-trained neural networks are +likewise intended for SDR inputs, restricting their direct application to HDR +content. On the other hand, training HDR models from scratch is challenging due +to limited available HDR data. In this work, we explore more effective +approaches for training deep learning-based models for image quality assessment +(IQA) on HDR data. We leverage networks pre-trained on SDR data (source domain) +and re-target these models to HDR (target domain) with additional fine-tuning +and domain adaptation. We validate our methods on the available HDR IQA +datasets, demonstrating that models trained with our combined recipe outperform +previous baselines, converge much quicker, and reliably generalize to HDR +inputs. + +
+
+ comment: 7 pages, 3 figures, 3 tables. Submitted to Human Vision and + Electronic Imaging 2024 (HVEI) +
+
+
+
+
+ + ☆ RGB$\leftrightarrow$X: Image decomposition and synthesis using material- + and lighting-aware diffusion models + + +
+ The three areas of realistic forward rendering, per-pixel inverse rendering, +and generative image synthesis may seem like separate and unrelated sub-fields +of graphics and vision. However, recent work has demonstrated improved +estimation of per-pixel intrinsic channels (albedo, roughness, metallicity) +based on a diffusion architecture; we call this the RGB$\rightarrow$X problem. +We further show that the reverse problem of synthesizing realistic images given +intrinsic channels, X$\rightarrow$RGB, can also be addressed in a diffusion +framework. + Focusing on the image domain of interior scenes, we introduce an improved +diffusion model for RGB$\rightarrow$X, which also estimates lighting, as well +as the first diffusion X$\rightarrow$RGB model capable of synthesizing +realistic images from (full or partial) intrinsic channels. Our +X$\rightarrow$RGB model explores a middle ground between traditional rendering +and generative models: we can specify only certain appearance properties that +should be followed, and give freedom to the model to hallucinate a plausible +version of the rest. + This flexibility makes it possible to use a mix of heterogeneous training +datasets, which differ in the available channels. We use multiple existing +datasets and extend them with our own synthetic and real data, resulting in a +model capable of extracting scene properties better than previous work and of +generating highly realistic images of interior scenes. + +
+
+
+
+
+ + ☆ Grains of Saliency: Optimizing Saliency-based Training of Biometric + Attack Detection Models + + +
+ Incorporating human-perceptual intelligence into model training has shown to +increase the generalization capability of models in several difficult biometric +tasks, such as presentation attack detection (PAD) and detection of synthetic +samples. After the initial collection phase, human visual saliency (e.g., +eye-tracking data, or handwritten annotations) can be integrated into model +training through attention mechanisms, augmented training samples, or through +human perception-related components of loss functions. Despite their successes, +a vital, but seemingly neglected, aspect of any saliency-based training is the +level of salience granularity (e.g., bounding boxes, single saliency maps, or +saliency aggregated from multiple subjects) necessary to find a balance between +reaping the full benefits of human saliency and the cost of its collection. In +this paper, we explore several different levels of salience granularity and +demonstrate that increased generalization capabilities of PAD and synthetic +face detection can be achieved by using simple yet effective saliency +post-processing techniques across several different CNNs. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Learning to Compose: Improving Object Centric Learning by Injecting + Compositionality + + +
+ Learning compositional representation is a key aspect of object-centric +learning as it enables flexible systematic generalization and supports complex +visual reasoning. However, most of the existing approaches rely on an +auto-encoding objective, while the compositionality is implicitly imposed by +the architectural or algorithmic bias in the encoder. This misalignment between +the auto-encoding objective and learning compositionality often results in a +failure to capture meaningful object representations. In this study, we propose +a novel objective that explicitly encourages compositionality of the +representations. Built upon the existing object-centric learning framework +(e.g., slot attention), our method incorporates an additional constraint that +an arbitrary mixture of object representations from two images should be valid, +enforced by maximizing the likelihood of the composite data. We demonstrate +that incorporating our objective into the existing framework consistently +improves object-centric learning and enhances the robustness to architectural +choices. + +
+
+
+
+
+ + ☆ Deep Metric Learning-Based Out-of-Distribution Detection with Synthetic + Outlier Exposure + + +
+ In this paper, we present a novel approach that combines deep metric learning +and synthetic data generation using diffusion models for out-of-distribution +(OOD) detection. One popular approach for OOD detection is outlier exposure, +where models are trained using a mixture of in-distribution (ID) samples and +"seen" OOD samples. For the OOD samples, the model is trained to minimize the +KL divergence between the output probability and the uniform distribution while +correctly classifying the ID data. Specifically, we propose +a label-mixup approach to generate synthetic OOD data using Denoising Diffusion +Probabilistic Models (DDPMs). Additionally, we explore recent advancements in +metric learning to train our models. + In the experiments, we found that metric learning-based loss functions +perform better than the softmax loss. Furthermore, the baseline models (including +softmax and metric learning) show a significant improvement when trained with +the generated OOD data. Our approach outperforms strong baselines in +conventional OOD detection metrics. + +
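+ A minimal sketch of the outlier-exposure objective described above (cross-entropy on ID data plus a KL-to-uniform term on synthetic OOD data); the weighting `lam` and the helper name are assumptions, not the paper's exact formulation.
+
+ ```python
+ import math
+ import torch
+ import torch.nn.functional as F
+
+ def outlier_exposure_loss(logits_id, labels_id, logits_ood, lam=0.5):
+     """Standard cross-entropy on ID samples plus a uniformity term pushing the
+     predicted distribution on (synthetic) OOD samples towards uniform.
+     KL(uniform || p) = -mean_c(log p_c) - log(C)."""
+     ce = F.cross_entropy(logits_id, labels_id)
+     log_p = F.log_softmax(logits_ood, dim=-1)
+     kl_to_uniform = -log_p.mean(dim=-1) - math.log(logits_ood.size(-1))
+     return ce + lam * kl_to_uniform.mean()
+
+ # toy usage: a 10-class problem with 4 ID samples and 4 synthetic OOD samples
+ loss = outlier_exposure_loss(torch.randn(4, 10), torch.randint(0, 10, (4,)),
+                              torch.randn(4, 10))
+ ```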
+
+
+
+
+ + ☆ Depth Priors in Removal Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRF) have shown impressive results in 3D +reconstruction and generating novel views. A key challenge within NeRF is the +editing of reconstructed scenes, such as object removal, which requires +maintaining consistency across multiple views and ensuring high-quality +synthesised perspectives. Previous studies have incorporated depth priors, +typically from LiDAR or sparse depth measurements provided by COLMAP, to +improve the performance of object removal in NeRF. However, these methods are +either costly or time-consuming. In this paper, we propose a novel approach +that integrates monocular depth estimates with NeRF-based object removal models +to significantly reduce time consumption and enhance the robustness and quality +of scene generation and object removal. We conducted a thorough evaluation of +COLMAP's dense depth reconstruction on the KITTI dataset to verify its accuracy +in depth map generation. Our findings suggest that COLMAP can serve as an +effective alternative to a ground truth depth map where such information is +missing or costly to obtain. Additionally, we integrated various monocular +depth estimation methods into the removal NeRF model, i.e., SpinNeRF, to assess +their capacity to improve object removal performance. Our experimental results +highlight the potential of monocular depth estimation to substantially improve +NeRF applications. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Lane Segmentation Refinement with Diffusion Models + + +
+ The lane graph is a key component for building high-definition (HD) maps and +crucial for downstream tasks such as autonomous driving or navigation planning. +Previously, He et al. (2022) explored the extraction of the lane-level graph +from aerial imagery utilizing a segmentation based approach. However, +segmentation networks struggle to achieve perfect segmentation masks resulting +in inaccurate lane graph extraction. We explore additional enhancements to +refine this segmentation-based approach and extend it with a diffusion +probabilistic model (DPM) component. This combination further improves the GEO +F1 and TOPO F1 scores, which are crucial indicators of the quality of a lane +graph, in the undirected graph in non-intersection areas. We conduct +experiments on a publicly available dataset, demonstrating that our method +outperforms the previous approach, particularly in enhancing the connectivity +of such a graph, as measured by the TOPO F1 score. Moreover, we perform +ablation studies on the individual components of our method to understand their +contribution and evaluate their effectiveness. + +
+
+
+
+
+ + ☆ A Preprocessing and Evaluation Toolbox for Trajectory Prediction + Research on the Drone Datasets + + +
+ The availability of high-quality datasets is crucial for the development of +behavior prediction algorithms in autonomous vehicles. This paper highlights +the need for standardizing the use of certain datasets for motion forecasting +research to simplify comparative analysis and proposes a set of tools and +practices to achieve this. Drawing on extensive experience and a comprehensive +review of current literature, we summarize our proposals for preprocessing, +visualization, and evaluation in the form of an open-source toolbox designed for +researchers working on trajectory prediction problems. The clear specification +of necessary preprocessing steps and evaluation metrics is intended to +reduce development effort and facilitate the comparison of results across +different studies. The toolbox is available at: +https://github.com/westny/dronalize. + +
+
+ comment: https://github.com/westny/dronalize +
+
+
+
+
+ + ☆ Are Models Biased on Text without Gender-related Language? + + +
+ Gender bias research has been pivotal in revealing undesirable behaviors in +large language models, exposing serious gender stereotypes associated with +occupations and emotions. A key observation in prior work is that models +reinforce stereotypes as a consequence of the gendered correlations that are +present in the training data. In this paper, we focus on bias where the effect +from training data is unclear, and instead address the question: Do language +models still exhibit gender bias in non-stereotypical settings? To do so, we +introduce UnStereoEval (USE), a novel framework tailored for investigating +gender bias in stereotype-free scenarios. USE defines a sentence-level score +based on pretraining data statistics to determine if the sentence contains +minimal word-gender associations. To systematically benchmark the fairness of +popular language models in stereotype-free scenarios, we utilize USE to +automatically generate benchmarks without any gender-related language. By +leveraging USE's sentence-level score, we also repurpose prior gender bias +benchmarks (Winobias and Winogender) for non-stereotypical evaluation. +Surprisingly, we find low fairness across all 28 tested models. Concretely, +models demonstrate fair behavior in only 9%-41% of stereotype-free sentences, +suggesting that bias does not solely stem from the presence of gender-related +words. These results raise important questions about where underlying model +biases come from and highlight the need for more systematic and comprehensive +bias evaluation. We release the full dataset and code at +https://ucinlp.github.io/unstereo-eval. + +
+
+ comment: In International Conference on Learning Representations 2024 +
+
+
+
+
+ + ☆ GraCo: Granularity-Controllable Interactive Segmentation + + +
+ Interactive Segmentation (IS) segments specific objects or parts in the image +according to user input. Current IS pipelines fall into two categories: +single-granularity output and multi-granularity output. The latter aims to +alleviate the spatial ambiguity present in the former. However, the +multi-granularity output pipeline suffers from limited interaction flexibility +and produces redundant results. In this work, we introduce +Granularity-Controllable Interactive Segmentation (GraCo), a novel approach +that allows precise control of prediction granularity by introducing additional +parameters to input. This enhances the customization of the interactive system +and eliminates redundancy while resolving ambiguity. Nevertheless, the +exorbitant cost of annotating multi-granularity masks and the lack of available +datasets with granularity annotations make it difficult for models to acquire +the necessary guidance to control output granularity. To address this problem, +we design an any-granularity mask generator that exploits the semantic property +of the pre-trained IS model to automatically generate abundant mask-granularity +pairs without requiring additional manual annotation. Based on these pairs, we +propose a granularity-controllable learning strategy that efficiently imparts +the granularity controllability to the IS model. Extensive experiments on +intricate scenarios at object and part levels demonstrate that our GraCo has +significant advantages over previous methods. This highlights the potential of +GraCo to be a flexible annotation tool, capable of adapting to diverse +segmentation scenarios. The project page: https://zhao-yian.github.io/GraCo. + +
+
+
+
+
+ + ☆ EALD-MLLM: Emotion Analysis in Long-sequential and De-identity videos + with Multi-modal Large Language Model + + +
+ Emotion AI is the ability of computers to understand human emotional states. +Existing works have achieved promising progress, but two limitations remain to +be solved: 1) Previous studies have been more focused on short sequential video +emotion analysis while overlooking long sequential videos. However, the emotions +in short sequential videos only reflect instantaneous emotions, which may be +deliberately guided or hidden. In contrast, long sequential videos can reveal +authentic emotions; 2) Previous studies commonly utilize various signals such +as facial, speech, and even sensitive biological signals (e.g., +electrocardiogram). However, due to the increasing demand for privacy, +developing Emotion AI without relying on sensitive signals is becoming +important. To address the aforementioned limitations, in this paper, we +construct a dataset for Emotion Analysis in Long-sequential and De-identity +videos called EALD by collecting and processing the sequences of athletes' +post-match interviews. In addition to providing annotations of the overall +emotional state of each video, we also provide the Non-Facial Body Language +(NFBL) annotations for each player. NFBL is an inner-driven emotional +expression and can serve as an identity-free clue to understanding the +emotional state. Moreover, we provide a simple but effective baseline for +further research. More precisely, we evaluate the Multimodal Large Language +Models (MLLMs) with de-identification signals (e.g., visual, speech, and NFBLs) +to perform emotion analysis. Our experimental results demonstrate that: 1) +MLLMs can achieve comparable or even better performance than the supervised +single-modal models, even in a zero-shot scenario; 2) NFBL is an important cue +in long sequential emotion analysis. EALD will be made available on an +open-source platform. + +
+
+
+
+
+ + ☆ Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed + Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a complex task that retrieves images using +a query, which is configured with an image and a caption that describes desired +modifications to that image. Supervised CIR approaches have shown strong +performance, but their reliance on expensive manually-annotated datasets +restricts their scalability and broader applicability. To address these issues, +previous studies have proposed pseudo-word token-based Zero-Shot CIR (ZS-CIR) +methods, which utilize a projection module to map images to word tokens. +However, we conjecture that this approach has a downside: the projection module +distorts the original image representation and confines the resulting composed +embeddings to the text-side. In order to resolve this, we introduce a novel +ZS-CIR method that uses Spherical Linear Interpolation (Slerp) to directly +merge image and text representations by identifying an intermediate embedding +of both. Furthermore, we introduce Text-Anchored-Tuning (TAT), a method that +fine-tunes the image encoder while keeping the text encoder fixed. TAT closes +the modality gap between images and text, making the Slerp process much more +effective. Notably, the TAT method is not only efficient in terms of the scale +of the training dataset and training time, but it also serves as an excellent +initial checkpoint for training supervised CIR models, thereby highlighting its +wider potential. The integration of the Slerp-based ZS-CIR with a TAT-tuned +model enables our approach to deliver state-of-the-art retrieval performance +across CIR benchmarks. + +
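+ A minimal sketch of Slerp-based query composition, assuming unit-norm CLIP-style image and text embeddings; the balance parameter `alpha` and helper names are illustrative, and the TAT fine-tuning stage is not shown.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def slerp(z_img, z_txt, alpha=0.5, eps=1e-7):
+     """Spherical linear interpolation between image and text embeddings;
+     alpha balances how much of each modality enters the composed query."""
+     z_img = F.normalize(z_img, dim=-1)
+     z_txt = F.normalize(z_txt, dim=-1)
+     omega = torch.acos((z_img * z_txt).sum(-1, keepdim=True).clamp(-1 + eps, 1 - eps))
+     so = torch.sin(omega)
+     return (torch.sin((1 - alpha) * omega) / so) * z_img + \
+            (torch.sin(alpha * omega) / so) * z_txt
+
+ # composed query for retrieval: cosine similarity against gallery image embeddings
+ query = slerp(torch.randn(1, 512), torch.randn(1, 512), alpha=0.5)
+ gallery = F.normalize(torch.randn(100, 512), dim=-1)
+ scores = query @ gallery.T          # (1, 100) retrieval scores
+ ```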
+
+
+
+
+ + ☆ UWAFA-GAN: Ultra-Wide-Angle Fluorescein Angiography Transformation via + Multi-scale Generation and Registration Enhancement + + +
+ Fundus photography, in combination with the ultra-wide-angle fundus (UWF) +techniques, has become an indispensable diagnostic tool in clinical settings by +offering a more comprehensive view of the retina. Nonetheless, UWF fluorescein +angiography (UWF-FA) necessitates the administration of a fluorescent dye via +injection into the patient's hand or elbow unlike UWF scanning laser +ophthalmoscopy (UWF-SLO). To mitigate potential adverse effects associated with +injections, researchers have proposed the development of cross-modality medical +image generation algorithms capable of converting UWF-SLO images into their +UWF-FA counterparts. Current image generation techniques applied to fundus +photography encounter difficulties in producing high-resolution retinal images, +particularly in capturing minute vascular lesions. To address these issues, we +introduce a novel conditional generative adversarial network (UWAFA-GAN) to +synthesize UWF-FA from UWF-SLO. This approach employs multi-scale generators +and an attention transmit module to efficiently extract both global structures +and local lesions. Additionally, to counteract the image blurriness issue that +arises from training with misaligned data, a registration module is integrated +within this framework. Our method performs favorably in terms of inception scores +and detail generation. Clinical user studies further indicate that the UWF-FA +images generated by UWAFA-GAN are clinically comparable to authentic images in +terms of diagnostic reliability. Empirical evaluations on our proprietary UWF +image datasets elucidate that UWAFA-GAN outperforms extant methodologies. The +code is accessible at https://github.com/Tinysqua/UWAFA-GAN. + +
+
+
+
+
+ + ☆ GAD-Generative Learning for HD Map-Free Autonomous Driving + + +
+ Deep-learning-based techniques have been widely adopted for autonomous +driving software stacks for mass production in recent years, focusing primarily +on perception modules, with some work extending this method to prediction +modules. However, the downstream planning and control modules are still +designed with hefty handcrafted rules, dominated by optimization-based methods +such as quadratic programming or model predictive control. This results in a +performance bottleneck for autonomous driving systems in that corner cases +simply cannot be solved by enumerating hand-crafted rules. We present a +deep-learning-based approach that brings prediction, decision, and planning +modules together in an attempt to overcome the deficiency of rule-based methods +in real-world applications of autonomous driving, especially for +urban scenes. The DNN model we propose is trained solely on 10 hours of +human driver data, and it supports all mass-production ADAS features available +on the market to date. This method is deployed onto a Jiyue test car with no +modification to its factory-ready sensor set and compute platform. The +feasibility, usability, and commercial potential of the approach are demonstrated in this +article. + +
+
+
+
+
+ + ☆ Get Your Embedding Space in Order: Domain-Adaptive Regression for Forest + Monitoring + + +
+ Image-level regression is an important task in Earth observation, where +visual domain and label shifts are a core challenge hampering generalization. +However, cross-domain regression with remote sensing data remains understudied +due to the absence of suited datasets. We introduce a new dataset with aerial +and satellite imagery in five countries with three forest-related regression +tasks. To match real-world applicative interests, we compare methods through a +restrictive setup where no prior on the target domain is available during +training, and models are adapted with limited information during testing. +Building on the assumption that ordered relationships generalize better, we +propose manifold diffusion for regression as a strong baseline for transduction +in low-data regimes. Our comparison highlights the comparative advantages of +inductive and transductive methods in cross-domain regression. + +
+
+
+
+
+ + ☆ NeRF-Guided Unsupervised Learning of RGB-D Registration + + +
+ This paper focuses on training a robust RGB-D registration model without +ground-truth pose supervision. Existing methods usually adopt a pairwise +training strategy based on differentiable rendering, which enforces the +photometric and the geometric consistency between the two registered frames as +supervision. However, this frame-to-frame framework suffers from poor +multi-view consistency due to factors such as lighting changes, geometry +occlusion and reflective materials. In this paper, we present NeRF-UR, a novel +frame-to-model optimization framework for unsupervised RGB-D registration. +Instead of frame-to-frame consistency, we leverage the neural radiance field +(NeRF) as a global model of the scene and use the consistency between the input +and the NeRF-rerendered frames for pose optimization. This design can +significantly improve the robustness in scenarios with poor multi-view +consistency and provides better learning signal for the registration model. +Furthermore, to bootstrap the NeRF optimization, we create a synthetic dataset, +Sim-RGBD, through a photo-realistic simulator to warm up the registration +model. By first training the registration model on Sim-RGBD and later +unsupervisedly fine-tuning on real data, our framework enables distilling the +capability of feature extraction and registration from simulation to reality. +Our method outperforms the state-of-the-art counterparts on two popular indoor +RGB-D datasets, ScanNet and 3DMatch. Code and models will be released for paper +reproduction. + +
+
+
+
+
+ + ☆ The Pyramid of Captions + + +
+ We introduce a formal information-theoretic framework for image captioning by +regarding it as a representation learning task. Our framework defines three key +objectives: task sufficiency, minimal redundancy, and human interpretability. +Building upon this foundation, we propose a novel Pyramid of Captions (PoCa) +method, which constructs caption pyramids by generating localized captions for +zoomed-in image patches and integrating them with global caption information +using large language models. This approach leverages intuition that the +detailed examination of local patches can reduce error risks and address +inaccuracies in global captions, either by correcting the hallucination or +adding missing details. Based on our theoretical framework, we formalize this +intuition and provide formal proof demonstrating the effectiveness of PoCa +under certain assumptions. Empirical tests with various image captioning models +and large language models show that PoCa consistently yields more informative +and semantically aligned captions, maintaining brevity and interpretability. + +
+
+
+
+
+ + ☆ In Anticipation of Perfect Deepfake: Identity-anchored Artifact-agnostic + Detection under Rebalanced Deepfake Detection Protocol + + +
+ As deep generative models advance, we anticipate deepfakes achieving +"perfection": generating no discernible artifacts or noise. However, current +deepfake detectors, intentionally or inadvertently, rely on such artifacts for +detection, as they are exclusive to deepfakes and absent in genuine examples. +To bridge this gap, we introduce the Rebalanced Deepfake Detection Protocol +(RDDP) to stress-test detectors under balanced scenarios where genuine and +forged examples bear similar artifacts. We offer two RDDP variants: +RDDP-WHITEHAT uses white-hat deepfake algorithms to create 'self-deepfakes,' +genuine portrait videos that resemble the underlying identity yet +carry artifacts similar to those of deepfake videos; RDDP-SURROGATE employs surrogate +functions (e.g., Gaussian noise) to process both genuine and forged examples, +introducing equivalent noise, thereby sidestepping the need for deepfake +algorithms. + Towards detecting perfect deepfake videos that align with genuine ones, we +present ID-Miner, a detector that identifies the puppeteer behind the disguise +by focusing on motion over artifacts or appearances. As an identity-based +detector, it authenticates videos by comparing them with reference footage. +Equipped with the artifact-agnostic loss at frame-level and the +identity-anchored loss at video-level, ID-Miner effectively singles out +identity signals amidst distracting variations. Extensive experiments comparing +ID-Miner with 12 baseline detectors under both conventional and RDDP +evaluations with two deepfake datasets, along with additional qualitative +studies, affirm the superiority of our method and the necessity for detectors +designed to counter perfect deepfakes. + +
+
+
+
+
+ + ☆ Enhanced Visual Question Answering: A Comparative Analysis and Textual + Feature Extraction Via Convolutions + + +
+ Visual Question Answering (VQA) has emerged as a highly engaging field in +recent years, attracting increasing research efforts aiming to enhance VQA +accuracy through the deployment of advanced models such as Transformers. +Despite this growing interest, there has been limited exploration into the +comparative analysis and impact of textual modalities within VQA, particularly +in terms of model complexity and its effect on performance. In this work, we +conduct a comprehensive comparison between complex textual models that leverage +long dependency mechanisms and simpler models focusing on local textual +features within a well-established VQA framework. Our findings reveal that +employing complex textual encoders is not invariably the optimal approach for +the VQA-v2 dataset. Motivated by this insight, we introduce an improved model, +ConvGRU, which incorporates convolutional layers to enhance the representation +of question text. Tested on the VQA-v2 dataset, ConvGRU achieves better +performance without substantially increasing parameter complexity. + +
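+ A rough sketch of a convolution-augmented question encoder in the spirit of the ConvGRU model described above; the vocabulary size, embedding width, and kernel size are illustrative assumptions rather than the paper's configuration.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ConvGRUQuestionEncoder(nn.Module):
+     """1-D convolutions capture local n-gram features of the question text
+     before a GRU summarizes the sequence into a single feature for fusion."""
+     def __init__(self, vocab_size=10000, emb_dim=300, hidden=512, kernel=3):
+         super().__init__()
+         self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
+         self.conv = nn.Conv1d(emb_dim, emb_dim, kernel_size=kernel, padding=kernel // 2)
+         self.gru = nn.GRU(emb_dim, hidden, batch_first=True)
+
+     def forward(self, token_ids):                      # (B, L)
+         x = self.embed(token_ids)                      # (B, L, emb_dim)
+         x = torch.relu(self.conv(x.transpose(1, 2))).transpose(1, 2)
+         _, h = self.gru(x)                             # h: (1, B, hidden)
+         return h.squeeze(0)                            # question feature
+
+ q_feat = ConvGRUQuestionEncoder()(torch.randint(1, 10000, (4, 14)))  # (4, 512)
+ ```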
+
+
+
+
+ + ☆ DmADs-Net: Dense multiscale attention and depth-supervised network for + medical image segmentation + + +
+ Deep learning has made important contributions to the development of medical +image segmentation. Convolutional neural networks, as a crucial branch, have +attracted strong attention from researchers. Through the tireless efforts of +numerous researchers, convolutional neural networks have yielded numerous +outstanding algorithms for processing medical images. The ideas and +architectures of these algorithms have also provided important inspiration for +the development of later technologies. Through extensive experimentation, we +have found that currently mainstream deep learning algorithms are not always +able to achieve ideal results when processing complex datasets and different +types of datasets. These networks still have room for improvement in lesion +localization and feature extraction. Therefore, we have created the Dense +Multiscale Attention and Depth-Supervised Network (DmADs-Net). We use ResNet for +feature extraction at different depths and create a Multi-scale Convolutional +Feature Attention Block to improve the network's attention to weak feature +information. The Local Feature Attention Block is created to enable enhanced +local feature attention for high-level semantic information. In addition, in +the feature fusion phase, a Feature Refinement and Fusion Block is created to +enhance the fusion of different semantic information. We validated the +performance of the network using five datasets of varying sizes and types. +Results from comparative experiments show that DmADs-Net outperformed +mainstream networks. Ablation experiments further demonstrated the +effectiveness of the created modules and the rationality of the network +architecture. + +
+
+
+
+
+ + ☆ Feature-Aware Noise Contrastive Learning For Unsupervised Red Panda + Re-Identification IJCNN2024 + + +
+ To facilitate the re-identification (Re-ID) of individual animals, existing +methods primarily focus on maximizing feature similarity within the same +individual and enhancing distinctiveness between different individuals. +However, most of them still rely on supervised learning and require substantial +labeled data, which is challenging to obtain. To avoid this issue, we propose a +Feature-Aware Noise Contrastive Learning (FANCL) method to explore an +unsupervised learning solution, which is then validated on the task of red +panda re-ID. FANCL employs a Feature-Aware Noise Addition module to produce +noised images that conceal critical features and designs two contrastive +learning modules to calculate the losses. Firstly, a feature consistency module +is designed to bridge the gap between the original and noised features. +Secondly, the neural networks are trained through a cluster contrastive +learning module. Through these more challenging learning tasks, FANCL can +adaptively extract deeper representations of red pandas. The experimental +results on a set of red panda images collected in both indoor and outdoor +environments prove that FANCL outperforms several related state-of-the-art +unsupervised methods, achieving high performance comparable to supervised +learning methods. + +
+
+ comment: 7 pages, 5 figures, IJCNN2024 +
+
+
+
+
+ + ☆ Lazy Layers to Make Fine-Tuned Diffusion Models More Traceable + + +
+ Foundational generative models should be traceable to protect their owners +and facilitate safety regulation. To achieve this, traditional approaches embed +identifiers based on supervisory trigger-response signals, which are commonly +known as backdoor watermarks. They are prone to failure when the model is +fine-tuned with nontrigger data. Our experiments show that this vulnerability +is due to energetic changes in only a few 'busy' layers during fine-tuning. +This yields a novel arbitrary-in-arbitrary-out (AIAO) strategy that makes +watermarks resilient to fine-tuning-based removal. The trigger-response pairs +of AIAO samples across various neural network depths can be used to construct +watermarked subpaths, employing Monte Carlo sampling to achieve stable +verification results. In addition, unlike the existing methods of designing a +backdoor for the input/output space of diffusion models, in our method, we +propose to embed the backdoor into the feature space of sampled subpaths, where +a mask-controlled trigger function is proposed to preserve the generation +performance and ensure the invisibility of the embedded backdoor. Our empirical +studies on the MS-COCO, AFHQ, LSUN, CUB-200, and DreamBooth datasets confirm +the robustness of AIAO; while the verification rates of other trigger-based +methods fall from ~90% to ~70% after fine-tuning, those of our method remain +consistently above 90%. + +
+
+
+
+
+ + ☆ Predictive Accuracy-Based Active Learning for Medical Image Segmentation + + +
+ Active learning is considered a viable solution to alleviate the +contradiction between the high dependency of deep learning-based segmentation +methods on annotated data and the expensive pixel-level annotation cost of +medical images. However, most existing methods suffer from unreliable +uncertainty assessment and the struggle to balance diversity and +informativeness, leading to poor performance in segmentation tasks. In +response, we propose an efficient Predictive Accuracy-based Active Learning +(PAAL) method for medical image segmentation, first introducing predictive +accuracy to define uncertainty. Specifically, PAAL mainly consists of an +Accuracy Predictor (AP) and a Weighted Polling Strategy (WPS). The former is an +attached learnable module that can accurately predict the segmentation accuracy +of unlabeled samples relative to the target model with the predicted posterior +probability. The latter provides an efficient hybrid querying scheme by +combining predicted accuracy and feature representation, aiming to ensure the +uncertainty and diversity of the acquired samples. Extensive experiment results +on multiple datasets demonstrate the superiority of PAAL. PAAL achieves +comparable accuracy to fully annotated data while reducing annotation costs by +approximately 50% to 80%, showcasing significant potential in clinical +applications. The code is available at https://github.com/shijun18/PAAL-MedSeg. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ MMTryon: Multi-Modal Multi-Reference Control for High-Quality Fashion + Generation + + +
+ This paper introduces MMTryon, a multi-modal multi-reference VIrtual Try-ON +(VITON) framework, which can generate high-quality compositional try-on results +by taking as inputs a text instruction and multiple garment images. Our MMTryon +mainly addresses two problems overlooked in prior literature: 1) Support of +multiple try-on items and dressing styles. Existing methods are commonly designed +for single-item try-on tasks (e.g., upper/lower garments, dresses) and fall +short on customizing dressing styles (e.g., zipped/unzipped, tuck-in/tuck-out, +etc.). 2) Segmentation Dependency. They further heavily rely on +category-specific segmentation models to identify the replacement regions, with +segmentation errors directly leading to significant artifacts in the try-on +results. For the first issue, our MMTryon introduces a novel multi-modality and +multi-reference attention mechanism to combine the garment information from +reference images and dressing-style information from text instructions. +Besides, to remove the segmentation dependency, MMTryon uses a parsing-free +garment encoder and leverages a novel scalable data generation pipeline to +convert existing VITON datasets to a form that allows MMTryon to be trained +without requiring any explicit segmentation. Extensive experiments on +high-resolution benchmarks and in-the-wild test sets demonstrate MMTryon's +superiority over existing SOTA methods both qualitatively and quantitatively. +Besides, MMTryon's impressive performance on multi-item and style-controllable +virtual try-on scenarios and its ability to try on any outfit in a large +variety of scenarios from any source image opens up a new avenue for future +investigation in the fashion community. + +
+
+
+
+
+ + ☆ Detail-Enhancing Framework for Reference-Based Image Super-Resolution + + +
+ Recent years have witnessed the prosperity of reference-based image +super-resolution (Ref-SR). By importing the high-resolution (HR) reference +images into the single image super-resolution (SISR) approach, the ill-posed +nature of this long-standing field has been alleviated with the assistance of +texture transferred from reference images. Although the significant improvement +in quantitative and qualitative results has verified the superiority of Ref-SR +methods, the presence of misalignment before texture transfer indicates room +for further performance improvement. Existing methods tend to neglect the +significance of details in the context of comparison, therefore not fully +leveraging the information contained within low-resolution (LR) images. In this +paper, we propose a Detail-Enhancing Framework (DEF) for reference-based +super-resolution, which introduces the diffusion model to generate and enhance +the underlying detail in LR images. If corresponding parts are present in the +reference image, our method can facilitate rigorous alignment. In cases where +the reference image lacks corresponding parts, it ensures a fundamental +improvement while avoiding the influence of the reference image. Extensive +experiments demonstrate that our proposed method achieves superior visual +results while maintaining comparable numerical outcomes. + +
+
+
+
+
+ + ☆ Continuous sPatial-Temporal Deformable Image Registration (CPT-DIR) for + motion modelling in radiotherapy: beyond classic voxel-based methods + + +
+ Background and purpose: Deformable image registration (DIR) is a crucial tool +in radiotherapy for extracting and modelling organ motion. However, when +significant changes and sliding boundaries are present, it faces compromised +accuracy and uncertainty, which affect the subsequent contour propagation and +dose accumulation procedures. Materials and methods: We propose an implicit +neural representation (INR)-based approach modelling motion continuously in +both space and time, named Continuous sPatial-Temporal DIR (CPT-DIR). This +method uses a multilayer perceptron (MLP) network to map a 3D coordinate (x,y,z) +to its corresponding velocity vector (vx,vy,vz). The displacement vectors +(dx,dy,dz) are then calculated by integrating velocity vectors over time. The +MLP's parameters can rapidly adapt to new cases without pre-training, enhancing +optimisation. The DIR's performance was tested on the DIR-Lab dataset of 10 +lung 4DCT cases, using metrics of landmark accuracy (TRE), contour conformity +(Dice) and image similarity (MAE). Results: The proposed CPT-DIR can reduce +landmark TRE from 2.79mm to 0.99mm, outperforming B-splines' results for all +cases. The MAE of the whole-body region improves from 35.46HU to 28.99HU. +Furthermore, CPT-DIR surpasses B-splines for accuracy in the sliding boundary +region, lowering MAE and increasing Dice coefficients for the ribcage from +65.65HU and 90.41% to 42.04HU and 90.56%, versus 75.40HU and 89.30% without +registration. Meanwhile, CPT-DIR offers significant speed advantages, +completing in under 15 seconds compared to a few minutes with the conventional +B-splines method. Conclusion: Leveraging the continuous representations, the +CPT-DIR method significantly enhances registration accuracy, automation and +speed, outperforming traditional B-splines in landmark and contour precision, +particularly in the challenging areas. + +
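+ A minimal sketch of the coordinate-to-velocity idea described above, with displacements obtained by Euler integration of the velocity field over time; the network width, the number of integration steps, and the helper names are assumptions rather than the authors' implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class VelocityMLP(nn.Module):
+     """Maps a 3D coordinate (x, y, z) to a velocity vector (vx, vy, vz)."""
+     def __init__(self, hidden=128):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(3, hidden), nn.ReLU(),
+             nn.Linear(hidden, hidden), nn.ReLU(),
+             nn.Linear(hidden, 3))
+
+     def forward(self, xyz):
+         return self.net(xyz)
+
+ def displacement(model, xyz, t1=1.0, n_steps=8):
+     """Euler integration of the velocity field over time to obtain (dx, dy, dz)."""
+     dt = t1 / n_steps
+     pos, disp = xyz.clone(), torch.zeros_like(xyz)
+     for _ in range(n_steps):
+         v = model(pos)
+         pos = pos + v * dt
+         disp = disp + v * dt
+     return disp
+
+ dvf = displacement(VelocityMLP(), torch.rand(1024, 3))  # dense displacement samples
+ ```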
+
+
+
+
+ + ☆ Self-supervised Pre-training of Text Recognizers ICDAR24 + + +
+ In this paper, we investigate self-supervised pre-training methods for +document text recognition. Nowadays, large unlabeled datasets can be collected +for many research tasks, including text recognition, but it is costly to +annotate them. Therefore, methods utilizing unlabeled data are being researched. We +study self-supervised pre-training methods based on masked label prediction +using three different approaches -- Feature Quantization, VQ-VAE, and +Post-Quantized AE. We also investigate joint-embedding approaches with VICReg +and NT-Xent objectives, for which we propose an image shifting technique to +prevent model collapse, in which the model relies solely on positional encoding while +completely ignoring the input image. We perform our experiments on historical +handwritten (Bentham) and historical printed datasets mainly to investigate the +benefits of the self-supervised pre-training techniques with different amounts +of annotated target domain data. We use transfer learning as a strong baseline. +The evaluation shows that the self-supervised pre-training on data from the +target domain is very effective, but it struggles to outperform transfer +learning from closely related domains. This paper is one of the first +studies exploring self-supervised pre-training in document text recognition, +and we believe that it will become a cornerstone for future research in this +area. We made our implementation of the investigated methods publicly available +at https://github.com/DCGM/pero-pretraining. + +
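+ A minimal sketch of an NT-Xent joint-embedding objective of the kind mentioned above, where two views of the same text line (e.g., the original and a shifted crop) are pulled together; the proposed image shifting technique itself is not reproduced here.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def nt_xent(z1, z2, temperature=0.1):
+     """NT-Xent loss over two batches of embeddings of matching views."""
+     z = F.normalize(torch.cat([z1, z2], dim=0), dim=-1)   # (2N, D)
+     sim = z @ z.T / temperature
+     n = z1.size(0)
+     sim.fill_diagonal_(float("-inf"))                     # exclude self-pairs
+     targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
+     return F.cross_entropy(sim, targets)
+
+ loss = nt_xent(torch.randn(16, 128), torch.randn(16, 128))
+ ```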
+
+ comment: 18 pages, 6 figures, 4 tables, accepted to ICDAR24 +
+
+
+
+
+ + ☆ Visual and audio scene classification for detecting discrepancies in + video: a baseline method and experimental protocol ICMR'24 + + +
+ This paper presents a baseline approach and an experimental protocol for a +specific content verification problem: detecting discrepancies between the +audio and video modalities in multimedia content. We first design and optimize +an audio-visual scene classifier, to compare with existing classification +baselines that use both modalities. Then, by applying this classifier +separately to the audio and the visual modality, we can detect scene-class +inconsistencies between them. To facilitate further research and provide a +common evaluation platform, we introduce an experimental protocol and a +benchmark dataset simulating such inconsistencies. Our approach achieves +state-of-the-art results in scene classification and promising outcomes in +audio-visual discrepancies detection, highlighting its potential in content +verification applications. + +
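+ A minimal sketch of the discrepancy check described above: per-modality scene predictions over the same label set are compared, and a clip is flagged when the audio- and video-based classes disagree; the scene labels and the soft score are illustrative assumptions.
+
+ ```python
+ import torch
+
+ SCENES = ["airport", "bus", "metro", "park", "shopping_mall", "street_traffic"]
+
+ def detect_discrepancy(audio_logits: torch.Tensor, visual_logits: torch.Tensor):
+     """Flag clips whose audio-predicted scene disagrees with the video-predicted one."""
+     a = audio_logits.softmax(-1)
+     v = visual_logits.softmax(-1)
+     mismatch = a.argmax(-1) != v.argmax(-1)
+     agreement = (a * v).sum(-1)        # probability that both modalities agree
+     return mismatch, 1.0 - agreement   # boolean flag and a soft discrepancy score
+
+ mismatch, score = detect_discrepancy(torch.randn(4, len(SCENES)),
+                                      torch.randn(4, len(SCENES)))
+ ```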
+
+ comment: Accepted for publication, 3rd ACM Int. Workshop on Multimedia AI + against Disinformation (MAD'24) at ACM ICMR'24, June 10, 2024, Phuket, + Thailand. This is the "accepted version" +
+
+
+
+
+ + ☆ Adaptive Bidirectional Displacement for Semi-Supervised Medical Image + Segmentation CVPR 2024 + + +
+ Consistency learning is a central strategy to tackle unlabeled data in +semi-supervised medical image segmentation (SSMIS), which enforces the model to +produce consistent predictions under perturbation. However, most current +approaches solely focus on utilizing a specific single perturbation, which can +only cope with limited cases, while employing multiple perturbations +simultaneously makes it hard to guarantee the quality of consistency learning. In +this paper, we propose an Adaptive Bidirectional Displacement (ABD) approach to +solve the above challenge. Specifically, we first design a bidirectional patch +displacement based on reliable prediction confidence for unlabeled data to +generate new samples, which can effectively suppress uncontrollable regions and +still retain the influence of input perturbations. Meanwhile, to enforce the +model to learn the potentially uncontrollable content, a bidirectional +displacement operation with inverse confidence is proposed for the labeled +images, which generates samples with more unreliable information to facilitate +model learning. Extensive experiments show that ABD achieves new +state-of-the-art performances for SSMIS, significantly improving different +baselines. Source code is available at https://github.com/chy-upc/ABD. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Exploring Self-Supervised Vision Transformers for Deepfake Detection: A + Comparative Analysis + + +
+ This paper investigates the effectiveness of self-supervised pre-trained +transformers compared to supervised pre-trained transformers and conventional +neural networks (ConvNets) for detecting various types of deepfakes. We focus +on their potential for improved generalization, particularly when training data +is limited. Despite the notable success of large vision-language models +utilizing transformer architectures in various tasks, including zero-shot and +few-shot learning, the deepfake detection community has still shown some +reluctance to adopt pre-trained vision transformers (ViTs), especially large +ones, as feature extractors. One concern is their perceived excessive capacity, +which often demands extensive data, and the resulting suboptimal generalization +when training or fine-tuning data is small or less diverse. This contrasts +with ConvNets, which have already established themselves as robust +feature extractors. Additionally, training and optimizing transformers from +scratch requires significant computational resources, making this accessible +primarily to large companies and hindering broader investigation within the +academic community. Recent advancements in using self-supervised learning (SSL) +in transformers, such as DINO and its derivatives, have showcased significant +adaptability across diverse vision tasks and possess explicit semantic +segmentation capabilities. By leveraging DINO for deepfake detection with +modest training data and implementing partial fine-tuning, we observe +comparable adaptability to the task and the natural explainability of the +detection result via the attention mechanism. Moreover, partial fine-tuning of +transformers for deepfake detection offers a more resource-efficient +alternative, requiring significantly fewer computational resources. + +
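+ A minimal sketch of partial fine-tuning of a pre-trained ViT for binary real/fake classification; torchvision's supervised ViT-B/16 weights stand in for a DINO checkpoint here, and unfreezing only the last encoder block plus the head is an illustrative choice, not the paper's exact recipe.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ from torchvision.models import vit_b_16, ViT_B_16_Weights
+
+ model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
+ model.heads = nn.Linear(model.hidden_dim, 2)          # new real-vs-fake head
+
+ # Freeze everything, then unfreeze only the last encoder block and the head.
+ for p in model.parameters():
+     p.requires_grad = False
+ for p in model.encoder.layers[-1].parameters():
+     p.requires_grad = True
+ for p in model.heads.parameters():
+     p.requires_grad = True
+
+ optim = torch.optim.AdamW((p for p in model.parameters() if p.requires_grad), lr=1e-4)
+ logits = model(torch.randn(2, 3, 224, 224))           # (2, 2)
+ ```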
+
+
+
+
+ + ☆ CrossMatch: Enhance Semi-Supervised Medical Image Segmentation with + Perturbation Strategies and Knowledge Distillation + + +
+ Semi-supervised learning for medical image segmentation presents a unique +challenge of efficiently using limited labeled data while leveraging abundant +unlabeled data. Despite advancements, existing methods often do not fully +exploit the potential of the unlabeled data for enhancing model robustness and +accuracy. In this paper, we introduce CrossMatch, a novel framework that +integrates knowledge distillation with dual perturbation strategies +(image-level and feature-level) to improve the model's learning from both labeled and +unlabeled data. CrossMatch employs multiple encoders and decoders to generate +diverse data streams, which undergo self-knowledge distillation to enhance +consistency and reliability of predictions across varied perturbations. Our +method significantly surpasses other state-of-the-art techniques in standard +benchmarks by effectively minimizing the gap between training on labeled and +unlabeled data and improving edge accuracy and generalization in medical image +segmentation. The efficacy of CrossMatch is demonstrated through extensive +experimental validations, showing remarkable performance improvements without +increasing computational costs. Code for this implementation is made available +at https://github.com/AiEson/CrossMatch.git. + +
+
+
+
+
+ + ☆ Learning High-Quality Navigation and Zooming on Omnidirectional Images + in Virtual Reality + + +
+ Viewing omnidirectional images (ODIs) in virtual reality (VR) represents a +novel form of media that provides immersive experiences for users to navigate +and interact with digital content. Nonetheless, this sense of immersion can be +greatly compromised by a blur effect that masks details and hampers the user's +ability to engage with objects of interest. In this paper, we present a novel +system, called OmniVR, designed to enhance visual clarity during VR navigation. +Our system enables users to effortlessly locate and zoom in on the objects of +interest in VR. It captures user commands for navigation and zoom, converting +these inputs into parameters for the Mobius transformation matrix. Leveraging +these parameters, the ODI is refined using a learning-based algorithm. The +resultant ODI is presented within the VR media, effectively reducing blur and +increasing user engagement. To verify the effectiveness of our system, we first +evaluate our algorithm with state-of-the-art methods on public datasets, which +achieves the best performance. Furthermore, we undertake a comprehensive user +study to evaluate viewer experiences across diverse scenarios and to gather +their qualitative feedback from multiple perspectives. The outcomes reveal that +our system enhances user engagement by improving the viewers' recognition, +reducing discomfort, and improving the overall immersive experience. Our system +makes the navigation and zoom more user-friendly. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ NC-SDF: Enhancing Indoor Scene Reconstruction Using Neural SDFs with + View-Dependent Normal Compensation + + +
+ State-of-the-art neural implicit surface representations have achieved +impressive results in indoor scene reconstruction by incorporating monocular +geometric priors as additional supervision. However, we have observed that +multi-view inconsistency between such priors poses a challenge for high-quality +reconstructions. In response, we present NC-SDF, a neural signed distance field +(SDF) 3D reconstruction framework with view-dependent normal compensation (NC). +Specifically, we integrate view-dependent biases in monocular normal priors +into the neural implicit representation of the scene. By adaptively learning +and correcting the biases, our NC-SDF effectively mitigates the adverse impact +of inconsistent supervision, enhancing both the global consistency and local +details in the reconstructions. To further refine the details, we introduce an +informative pixel sampling strategy to pay more attention to intricate geometry +with higher information content. Additionally, we design a hybrid geometry +modeling approach to improve the neural implicit representation. Experiments on +synthetic and real-world datasets demonstrate that NC-SDF outperforms existing +approaches in terms of reconstruction quality. + +
+
+
+
+
+ + ☆ Covariant spatio-temporal receptive fields for neuromorphic computing + + +
+ Biological nervous systems constitute important sources of inspiration +towards computers that are faster, cheaper, and more energy efficient. +Neuromorphic disciplines view the brain as a coevolved system, simultaneously +optimizing the hardware and the algorithms running on it. There are clear +efficiency gains when bringing the computations into a physical substrate, but +we presently lack theories to guide efficient implementations. Here, we present +a principled computational model for neuromorphic systems in terms of +spatio-temporal receptive fields, based on affine Gaussian kernels over space +and leaky-integrator and leaky integrate-and-fire models over time. Our theory +is provably covariant to spatial affine and temporal scaling transformations, +and with close similarities to the visual processing in mammalian brains. We +use these spatio-temporal receptive fields as a prior in an event-based vision +task, and show that this improves the training of spiking networks, which +otherwise is known as problematic for event-based vision. This work combines +efforts within scale-space theory and computational neuroscience to identify +theoretically well-founded ways to process spatio-temporal signals in +neuromorphic systems. Our contributions are immediately relevant for signal +processing and event-based vision, and can be extended to other processing +tasks over space and time, such as memory and control. + +
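As a rough illustration of the ingredients named in the abstract (affine Gaussian kernels over space and leaky integrators over time), here is a small NumPy/SciPy sketch that filters a toy video with an anisotropic Gaussian receptive field and then integrates one pixel's response with a first-order leaky integrator. The kernel size, covariance, and time constant are arbitrary illustrative choices, and the covariance properties proved in the paper are not reproduced here.

```python
import numpy as np
from scipy.signal import convolve2d

def affine_gaussian_kernel(cov, size=15):
    """Anisotropic (affine) Gaussian kernel with 2x2 covariance matrix `cov`."""
    ax = np.arange(size) - size // 2
    xx, yy = np.meshgrid(ax, ax)
    pts = np.stack([xx.ravel(), yy.ravel()])            # 2 x N grid coordinates
    inv = np.linalg.inv(cov)
    g = np.exp(-0.5 * np.sum(pts * (inv @ pts), axis=0)).reshape(size, size)
    return g / g.sum()

def leaky_integrator(signal, tau=5.0):
    """First-order leaky integrator over time: y[t] = y[t-1] + (x[t] - y[t-1]) / tau."""
    y = np.zeros_like(signal, dtype=float)
    for t in range(1, len(signal)):
        y[t] = y[t - 1] + (signal[t] - y[t - 1]) / tau
    return y

# Spatio-temporal response of a toy video: spatial smoothing followed by temporal integration.
rng = np.random.default_rng(0)
video = rng.random((20, 32, 32))                         # T x H x W
cov = np.array([[4.0, 1.5], [1.5, 2.0]])                 # elongated, rotated receptive field
kernel = affine_gaussian_kernel(cov)
spatial = np.stack([convolve2d(frame, kernel, mode="same") for frame in video])
response = leaky_integrator(spatial[:, 16, 16])          # temporal response at one pixel
print(response.shape)
```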
+
+ comment: Code available at https://github.com/jegp/nrf +
+
+
+
+
+ + ☆ Model Quantization and Hardware Acceleration for Vision Transformers: A + Comprehensive Survey + + +
+ Vision Transformers (ViTs) have recently garnered considerable attention, +emerging as a promising alternative to convolutional neural networks (CNNs) in +several vision-related applications. However, their large model sizes and high +computational and memory demands hinder deployment, especially on +resource-constrained devices. This underscores the necessity of +algorithm-hardware co-design specific to ViTs, aiming to optimize their +performance by tailoring both the algorithmic structure and the underlying +hardware accelerator to each other's strengths. Model quantization, by +converting high-precision numbers to lower-precision, reduces the computational +demands and memory needs of ViTs, allowing the creation of hardware +specifically optimized for these quantized algorithms, boosting efficiency. +This article provides a comprehensive survey of ViTs quantization and its +hardware acceleration. We first delve into the unique architectural attributes +of ViTs and their runtime characteristics. Subsequently, we examine the +fundamental principles of model quantization, followed by a comparative +analysis of the state-of-the-art quantization techniques for ViTs. +Additionally, we explore the hardware acceleration of quantized ViTs, +highlighting the importance of hardware-friendly algorithm design. In +conclusion, this article will discuss ongoing challenges and future research +paths. We consistently maintain the related open-source materials at +https://github.com/DD-DuDa/awesome-vit-quantization-acceleration. + +
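The survey itself is narrative, but the basic operation it covers can be shown in a few lines. The following sketch is not any specific technique from the survey; it is a generic symmetric per-tensor int8 post-training quantization of one linear projection, illustrating the scale/round/clamp pattern that ViT quantization methods build on.

```python
import torch

def quantize_int8(w: torch.Tensor):
    """Symmetric per-tensor int8 quantization: w ~= scale * w_q with w_q in [-127, 127]."""
    scale = w.abs().max() / 127.0
    w_q = torch.clamp(torch.round(w / scale), -127, 127).to(torch.int8)
    return w_q, scale

def dequantize(w_q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return w_q.float() * scale

# Quantize the weights of one attention projection and measure the error it introduces.
proj = torch.nn.Linear(768, 768)
w_q, scale = quantize_int8(proj.weight.data)
w_hat = dequantize(w_q, scale)
err = (proj.weight.data - w_hat).abs().mean()
print(f"int8 scale={scale.item():.5f}, mean abs error={err.item():.6f}")
```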
+
+
+
+
+ + ☆ Streamlining Image Editing with Layered Diffusion Brushes + + +
+ Denoising diffusion models have recently gained prominence as powerful tools +for a variety of image generation and manipulation tasks. Building on this, we +propose a novel tool for real-time editing of images that provides users with +fine-grained region-targeted supervision in addition to existing prompt-based +controls. Our novel editing technique, termed Layered Diffusion Brushes, +leverages prompt-guided and region-targeted alteration of intermediate +denoising steps, enabling precise modifications while maintaining the integrity +and context of the input image. We provide an editor based on Layered Diffusion +Brushes modifications, which incorporates well-known image editing concepts +such as layer masks, visibility toggles, and independent manipulation of +layers; regardless of their order. Our system renders a single edit on a +512x512 image within 140 ms using a high-end consumer GPU, enabling real-time +feedback and rapid exploration of candidate edits. We validated our method and +editing system through a user study involving both natural images (using +inversion) and generated images, showcasing its usability and effectiveness +compared to existing techniques such as InstructPix2Pix and Stable Diffusion +Inpainting for refining images. Our approach demonstrates efficacy across a +range of tasks, including object attribute adjustments, error correction, and +sequential prompt-based object placement and manipulation, demonstrating its +versatility and potential for enhancing creative workflows. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2306.00219 +
+
+
+
+
+ + ☆ MoPEFT: A Mixture-of-PEFTs for the Segment Anything Model CVPR 2024 + + +
+ The emergence of foundation models, such as the Segment Anything Model (SAM), +has sparked interest in Parameter-Efficient Fine-Tuning (PEFT) methods that +tailor these large models to application domains outside their training data. +However, different PEFT techniques modify the representation of a model +differently, making it a non-trivial task to select the most appropriate method +for the domain of interest. We propose a new framework, Mixture-of-PEFTs +methods (MoPEFT), that is inspired by traditional Mixture-of-Experts (MoE) +methodologies and is utilized for fine-tuning SAM. Our MoPEFT framework +incorporates three different PEFT techniques as submodules and dynamically +learns to activate the ones that are best suited for a given data-task setup. +We test our method on the Segment Anything Model and show that MoPEFT +consistently outperforms other fine-tuning methods on the MESS benchmark. + +
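The abstract does not specify which PEFT submodules MoPEFT combines or how its gate works, so the sketch below shows only a generic "mixture of PEFTs" pattern: a frozen base layer plus a LoRA branch and a bottleneck adapter whose contributions are mixed by a small learned, data-dependent gate. All module sizes and the gating scheme are assumptions, not the paper's implementation.

```python
import torch
from torch import nn

class LoRA(nn.Module):
    """Low-rank residual update (up @ down) added to the frozen layer's output."""
    def __init__(self, dim, rank=4):
        super().__init__()
        self.down = nn.Linear(dim, rank, bias=False)
        self.up = nn.Linear(rank, dim, bias=False)
        nn.init.zeros_(self.up.weight)              # start as an identity-preserving update

    def forward(self, x):
        return self.up(self.down(x))

class Adapter(nn.Module):
    """Bottleneck adapter producing a residual correction."""
    def __init__(self, dim, hidden=16):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x):
        return self.net(x)

class MixtureOfPEFTs(nn.Module):
    """A frozen base layer plus a data-dependent gate over PEFT submodules."""
    def __init__(self, dim=256):
        super().__init__()
        self.base = nn.Linear(dim, dim)
        for p in self.base.parameters():
            p.requires_grad = False                 # backbone weights stay frozen
        self.experts = nn.ModuleList([LoRA(dim), Adapter(dim)])
        self.gate = nn.Linear(dim, len(self.experts))

    def forward(self, x):                           # x: (batch, tokens, dim)
        weights = self.gate(x.mean(dim=1)).softmax(dim=-1)   # one gate value per expert
        out = self.base(x)
        for i, expert in enumerate(self.experts):
            out = out + weights[:, i, None, None] * expert(x)
        return out

layer = MixtureOfPEFTs()
tokens = torch.randn(2, 196, 256)
print(layer(tokens).shape)
```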
+
+ comment: Workshop on Foundation Models, CVPR 2024 +
+
+
+
+
+ + ☆ Using Texture to Classify Forests Separately from Vegetation + + +
+ Identifying terrain within satellite image data is a key issue in +geographical information sciences, with numerous environmental and safety +implications. Many techniques exist to derive classifications from spectral +data captured by satellites. However, the ability to reliably classify +vegetation remains a challenge. In particular, no precise methods exist for +classifying forest vs. non-forest vegetation in high-level satellite images. +This paper provides an initial proposal for a static, algorithmic process to +identify forest regions in satellite image data through texture features +created from detected edges and the NDVI ratio captured by Sentinel-2 satellite +images. With strong initial results, this paper also identifies the next steps +to improve the accuracy of the classification and verification processes. + +
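To make the two ingredients mentioned above concrete, here is a small NumPy/SciPy sketch that computes NDVI from stand-in red (B4) and near-infrared (B8) reflectance arrays and a simple edge-density texture cue, then combines them with illustrative thresholds. The paper's actual texture features and classification rules are not given in the abstract, so the rule below is only an assumption.

```python
import numpy as np
from scipy import ndimage

def ndvi(nir: np.ndarray, red: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Normalized Difference Vegetation Index: (NIR - Red) / (NIR + Red)."""
    return (nir - red) / (nir + red + eps)

def edge_density(band: np.ndarray, window: int = 15) -> np.ndarray:
    """Local density of Sobel edges, a crude texture cue (forest canopies tend to be rougher)."""
    edges = np.hypot(ndimage.sobel(band, axis=0), ndimage.sobel(band, axis=1))
    return ndimage.uniform_filter(edges, size=window)

# Toy example with random reflectance values standing in for Sentinel-2 B4 (red) and B8 (NIR).
rng = np.random.default_rng(0)
red, nir = rng.random((2, 128, 128)).astype(np.float32)
v = ndvi(nir, red)
t = edge_density(nir)
# Illustrative rule: vegetated pixels (high NDVI) with high local texture are labeled "forest".
forest_mask = (v > 0.4) & (t > np.percentile(t, 60))
print(forest_mask.mean())
```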
+
+
+
+
+ + ☆ CREPE: Coordinate-Aware End-to-End Document Parser ICDAR 2024 + + +
+ In this study, we formulate an OCR-free sequence generation model for visual document understanding (VDU). Our model not only parses text from document images but also extracts the spatial coordinates of the text based on the multi-head architecture. Named Coordinate-aware End-to-end Document Parser (CREPE), our method uniquely integrates these capabilities by introducing a special token for OCR text and token-triggered coordinate decoding. We also propose a weakly-supervised framework for cost-efficient training, requiring only parsing annotations without high-cost coordinate annotations. Our experimental evaluations demonstrate CREPE's state-of-the-art performance on document parsing tasks. Beyond that, CREPE's adaptability is further highlighted by its successful usage in other document understanding tasks such as layout analysis, document visual question answering, and so on. CREPE's abilities, including OCR and semantic parsing, not only mitigate error propagation issues in existing OCR-dependent methods but also significantly enhance the functionality of sequence generation models, ushering in a new era for document understanding studies.
+
+ comment: Accepted at the International Conference on Document Analysis and + Recognition (ICDAR 2024) main conference +
+
+
+
+
+ + ☆ ASAM: Boosting Segment Anything Model with Adversarial Tuning CVPR2024 + + +
+ In the evolving landscape of computer vision, foundation models have emerged +as pivotal tools, exhibiting exceptional adaptability to a myriad of tasks. +Among these, the Segment Anything Model (SAM) by Meta AI has distinguished +itself in image segmentation. However, SAM, like its counterparts, encounters +limitations in specific niche applications, prompting a quest for enhancement +strategies that do not compromise its inherent capabilities. This paper +introduces ASAM, a novel methodology that amplifies SAM's performance through +adversarial tuning. We harness the potential of natural adversarial examples, +inspired by their successful implementation in natural language processing. By +utilizing a stable diffusion model, we augment a subset (1%) of the SA-1B +dataset, generating adversarial instances that are more representative of +natural variations rather than conventional imperceptible perturbations. Our +approach maintains the photorealism of adversarial examples and ensures +alignment with original mask annotations, thereby preserving the integrity of +the segmentation task. The fine-tuned ASAM demonstrates significant +improvements across a diverse range of segmentation tasks without necessitating +additional data or architectural modifications. The results of our extensive +evaluations confirm that ASAM establishes new benchmarks in segmentation tasks, +thereby contributing to the advancement of foundational models in computer +vision. Our project page is in https://asam2024.github.io/. + +
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ Transformer-Based Self-Supervised Learning for Histopathological + Classification of Ischemic Stroke Clot Origin + + +
+ Background and Purpose: Identifying the thromboembolism source in ischemic stroke is crucial for treatment and secondary prevention yet is often undetermined. This study describes a self-supervised deep learning approach in digital pathology of emboli for classifying ischemic stroke clot origin from histopathological images. Methods: The dataset included whole slide images (WSI) from the STRIP AI Kaggle challenge, consisting of retrieved clots from ischemic stroke patients following mechanical thrombectomy. Transformer-based deep learning models were developed using transfer learning and self-supervised pretraining for classifying WSI. Customizations included an attention pooling layer, weighted loss function, and threshold optimization. Various model architectures were tested and compared, and model performances were primarily evaluated using weighted logarithmic loss. Results: The model achieved a logloss score of 0.662 in cross-validation and 0.659 on the test set. Different model backbones were compared, with swin_large_patch4_window12_384 showing higher performance. Thresholding techniques for clot origin classification were employed to balance false positives and negatives. Conclusion: The study demonstrates the extent of efficacy of transformer-based deep learning models in identifying ischemic stroke clot origins from histopathological images and emphasizes the need for refined modeling techniques specifically adapted to thrombi WSI. Further research is needed to improve model performance and interpretability, and to validate its effectiveness. Future enhancements could include integrating larger patient cohorts, advanced preprocessing strategies, and exploring ensemble multimodal methods for enhanced diagnostic accuracy.
+
+
+
+
+ + ☆ LOTUS: Improving Transformer Efficiency with Sparsity Pruning and Data + Lottery Tickets + + +
+ Vision transformers have revolutionized computer vision, but their +computational demands present challenges for training and deployment. This +paper introduces LOTUS (LOttery Transformers with Ultra Sparsity), a novel +method that leverages data lottery ticket selection and sparsity pruning to +accelerate vision transformer training while maintaining accuracy. Our approach +focuses on identifying and utilizing the most informative data subsets and +eliminating redundant model parameters to optimize the training process. +Through extensive experiments, we demonstrate the effectiveness of LOTUS in +achieving rapid convergence and high accuracy with significantly reduced +computational requirements. This work highlights the potential of combining +data selection and sparsity techniques for efficient vision transformer +training, opening doors for further research and development in this area. + +
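The abstract does not detail LOTUS's pruning or data-selection procedure. As a generic illustration of the sparsity side only, the following PyTorch sketch applies global unstructured magnitude pruning to a stand-in MLP block; the pruning ratio and the use of torch.nn.utils.prune are assumptions, not the paper's method, and the data lottery-ticket selection is not shown.

```python
import torch
from torch import nn
from torch.nn.utils import prune

# A toy MLP block stand-in; the real target would be a vision transformer's linear layers.
model = nn.Sequential(nn.Linear(384, 1536), nn.GELU(), nn.Linear(1536, 384))

# Global unstructured magnitude pruning: remove the 60% smallest weights across both layers.
params = [(m, "weight") for m in model if isinstance(m, nn.Linear)]
prune.global_unstructured(params, pruning_method=prune.L1Unstructured, amount=0.6)

# Report the resulting sparsity per layer.
for m, _ in params:
    sparsity = float((m.weight == 0).float().mean())
    print(f"{m}: {sparsity:.1%} zeros")

# Make the pruning permanent (folds the mask into the weight tensor).
for m, name in params:
    prune.remove(m, name)
```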
+
+ comment: 3 pages, 5 figures +
+
+
+
+
+ + ☆ DiL-NeRF: Delving into Lidar for Neural Radiance Field on Street Scenes CVPR2024 + + +
+ Photorealistic simulation plays a crucial role in applications such as +autonomous driving, where advances in neural radiance fields (NeRFs) may allow +better scalability through the automatic creation of digital 3D assets. +However, reconstruction quality suffers on street scenes due to largely +collinear camera motions and sparser samplings at higher speeds. On the other +hand, the application often demands rendering from camera views that deviate +from the inputs to accurately simulate behaviors like lane changes. In this +paper, we propose several insights that allow a better utilization of Lidar +data to improve NeRF quality on street scenes. First, our framework learns a +geometric scene representation from Lidar, which is fused with the implicit +grid-based representation for radiance decoding, thereby supplying stronger +geometric information offered by explicit point cloud. Second, we put forth a +robust occlusion-aware depth supervision scheme, which allows utilizing +densified Lidar points by accumulation. Third, we generate augmented training +views from Lidar points for further improvement. Our insights translate to +largely improved novel view synthesis under real driving scenes. + +
+
+ comment: CVPR2024 Highlights +
+
+
+
+
+ + ☆ Wake Vision: A Large-scale, Diverse Dataset and Benchmark Suite for + TinyML Person Detection + + +
+ Machine learning applications on extremely low-power devices, commonly referred to as tiny machine learning (TinyML), promise a smarter and more connected world. However, the advancement of current TinyML research is hindered by the limited size and quality of pertinent datasets. To address this challenge, we introduce Wake Vision, a large-scale, diverse dataset tailored for person detection -- the canonical task for TinyML visual sensing. Wake Vision comprises over 6 million images, which is a hundredfold increase compared to the previous standard, and has undergone thorough quality filtering. Using Wake Vision for training results in a 2.41\% increase in accuracy compared to the established benchmark. Alongside the dataset, we provide a collection of five detailed benchmark sets that assess model performance on specific segments of the test data, such as varying lighting conditions, distances from the camera, and demographic characteristics of subjects. These novel fine-grained benchmarks facilitate the evaluation of model quality in challenging real-world scenarios that are often ignored when focusing solely on overall accuracy. Through an evaluation of a MobileNetV2 TinyML model on the benchmarks, we show that the input resolution plays a more crucial role than the model width in detecting distant subjects and that the impact of quantization on model robustness is minimal, thanks to the dataset quality. These findings underscore the importance of a detailed evaluation to identify essential factors for model development. The dataset, benchmark suite, code, and models are publicly available under the CC-BY 4.0 license, enabling their use for commercial use cases.
+
+
+
+
+ + ☆ SonicDiffusion: Audio-Driven Image Generation and Editing with + Pretrained Diffusion Models + + +
+ We are witnessing a revolution in conditional image synthesis with the recent success of large scale text-to-image generation methods. This success also opens up new opportunities in controlling the generation and editing process using multi-modal input. While spatial control using cues such as depth, sketch, and other images has attracted a lot of research, we argue that another equally effective modality is audio since sound and sight are two main components of human perception. Hence, we propose a method to enable audio-conditioning in large scale image diffusion models. Our method first maps features obtained from audio clips to tokens that can be injected into the diffusion model in a fashion similar to text tokens. We introduce additional audio-image cross attention layers which we finetune while freezing the weights of the original layers of the diffusion model. In addition to audio conditioned image generation, our method can also be utilized in conjunction with diffusion based editing methods to enable audio conditioned image editing. We demonstrate our method on a wide range of audio and image datasets. We perform extensive comparisons with recent methods and show favorable performance.
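A minimal sketch of the injection mechanism described above: a new cross-attention block in which flattened diffusion features attend to projected audio tokens and are updated residually, while the pretrained layers would stay frozen. The dimensions, the audio encoder, and where such a block would sit inside the U-Net are placeholder assumptions, not the paper's architecture.

```python
import torch
from torch import nn

class AudioCrossAttention(nn.Module):
    """Extra cross-attention block: image latents attend to audio tokens.

    In the setup described above, only layers like this one are fine-tuned
    while the pretrained diffusion model's original layers stay frozen.
    """
    def __init__(self, dim=320, audio_dim=768, heads=8):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.to_audio_tokens = nn.Linear(audio_dim, dim)   # maps audio features into token space
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, latents, audio_feats):
        # latents: (B, N, dim) flattened U-Net feature tokens; audio_feats: (B, M, audio_dim)
        audio_tokens = self.to_audio_tokens(audio_feats)
        attended, _ = self.attn(self.norm(latents), audio_tokens, audio_tokens)
        return latents + attended                          # residual injection

block = AudioCrossAttention()
latents = torch.randn(2, 64 * 64, 320)     # e.g. a flattened 64x64 latent feature map
audio = torch.randn(2, 50, 768)            # e.g. 50 clip tokens from a frozen audio encoder
print(block(latents, audio).shape)
```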
+
+
+
+
+ + ☆ Beyond Human Vision: The Role of Large Vision Language Models in + Microscope Image Analysis + + +
+ Vision language models (VLMs) have recently emerged and gained the spotlight +for their ability to comprehend the dual modality of image and textual data. +VLMs such as LLaVA, ChatGPT-4, and Gemini have recently shown impressive +performance on tasks such as natural image captioning, visual question +answering (VQA), and spatial reasoning. Additionally, a universal segmentation +model by Meta AI, Segment Anything Model (SAM) shows unprecedented performance +at isolating objects from unforeseen images. Since medical experts, biologists, +and materials scientists routinely examine microscopy or medical images in +conjunction with textual information in the form of captions, literature, or +reports, and draw conclusions of great importance and merit, it is indubitably +essential to test the performance of VLMs and foundation models such as SAM, on +these images. In this study, we charge ChatGPT, LLaVA, Gemini, and SAM with +classification, segmentation, counting, and VQA tasks on a variety of +microscopy images. We observe that ChatGPT and Gemini are impressively able to +comprehend the visual features in microscopy images, while SAM is quite capable +at isolating artefacts in a general sense. However, the performance is not +close to that of a domain expert - the models are readily encumbered by the +introduction of impurities, defects, artefact overlaps and diversity present in +the images. + +
+
+
+
+
+ + ☆ Guided Conditional Diffusion Classifier (ConDiff) for Enhanced + Prediction of Infection in Diabetic Foot Ulcers + + +
+ Objective: To detect infected wounds in Diabetic Foot Ulcers (DFUs) from photographs, preventing severe complications and amputations. Methods: This paper proposes the Guided Conditional Diffusion Classifier (ConDiff), a novel deep-learning infection detection model that combines guided image synthesis with a denoising diffusion model and distance-based classification. The process involves (1) generating guided conditional synthetic images by injecting Gaussian noise to a guide image, followed by denoising the noise-perturbed image through a reverse diffusion process, conditioned on infection status and (2) classifying infections based on the minimum Euclidean distance between synthesized images and the original guide image in embedding space. Results: ConDiff demonstrated superior performance with an accuracy of 83% and an F1-score of 0.858, outperforming state-of-the-art models by at least 3%. The use of a triplet loss function reduces overfitting in the distance-based classifier. Conclusions: ConDiff not only enhances diagnostic accuracy for DFU infections but also pioneers the use of generative discriminative models for detailed medical image analysis, offering a promising approach for improving patient outcomes.
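The decision rule in step (2) is easy to show in isolation. The sketch below assumes embeddings already exist for the guide image and for images synthesized under each infection condition, and simply picks the condition with the minimum Euclidean distance; the diffusion-based synthesis, the image encoder, and the triplet-loss training are not shown and the embeddings here are synthetic placeholders.

```python
import torch

def classify_by_min_distance(guide_emb: torch.Tensor, cond_embs: dict) -> str:
    """Return the condition whose synthesized image lies closest to the guide in embedding space."""
    distances = {label: torch.dist(guide_emb, emb).item() for label, emb in cond_embs.items()}
    return min(distances, key=distances.get)

# Placeholder embeddings; in practice these would come from an image encoder applied to the
# guide image and to images synthesized conditioned on "infected" / "uninfected".
torch.manual_seed(0)
guide = torch.randn(512)
cond_embs = {
    "infected": guide + 0.1 * torch.randn(512),      # closer to the guide -> should be chosen
    "uninfected": guide + 1.0 * torch.randn(512),
}
print(classify_by_min_distance(guide, cond_embs))    # -> "infected"
```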
+
+
+
+
+ + ☆ Brighteye: Glaucoma Screening with Color Fundus Photographs based on + Vision Transformer + + +
+ Differences in image quality, lighting conditions, and patient demographics +pose challenges to automated glaucoma detection from color fundus photography. +Brighteye, a method based on Vision Transformer, is proposed for glaucoma +detection and glaucomatous feature classification. Brighteye learns long-range +relationships among pixels within large fundus images using a self-attention +mechanism. Prior to being input into Brighteye, the optic disc is localized +using YOLOv8, and the region of interest (ROI) around the disc center is +cropped to ensure alignment with clinical practice. Optic disc detection +improves the sensitivity at 95% specificity from 79.20% to 85.70% for glaucoma +detection and the Hamming distance from 0.2470 to 0.1250 for glaucomatous +feature classification. In the developmental stage of the Justified Referral in +AI Glaucoma Screening (JustRAIGS) challenge, the overall outcome secured the +fifth position out of 226 entries. + +
+
+ comment: ISBI 2024, JustRAIGS challenge, glaucoma detection +
+
+
+
+
+ + ☆ ADM: Accelerated Diffusion Model via Estimated Priors for Robust Motion + Prediction under Uncertainties + + +
+ Motion prediction is a challenging problem in autonomous driving as it +demands the system to comprehend stochastic dynamics and the multi-modal nature +of real-world agent interactions. Diffusion models have recently risen to +prominence, and have proven particularly effective in pedestrian motion +prediction tasks. However, the significant time consumption and sensitivity to +noise have limited the real-time predictive capability of diffusion models. In +response to these impediments, we propose a novel diffusion-based, +acceleratable framework that adeptly predicts future trajectories of agents +with enhanced resistance to noise. The core idea of our model is to learn a +coarse-grained prior distribution of trajectory, which can skip a large number +of denoise steps. This advancement not only boosts sampling efficiency but also +maintains the fidelity of prediction accuracy. Our method meets the rigorous +real-time operational standards essential for autonomous vehicles, enabling +prompt trajectory generation that is vital for secure and efficient navigation. +Through extensive experiments, our method speeds up the inference time to 136ms +compared to standard diffusion model, and achieves significant improvement in +multi-agent motion prediction on the Argoverse 1 motion forecasting dataset. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Coherent 3D Portrait Video Reconstruction via Triplane Fusion + + +
+ Recent breakthroughs in single-image 3D portrait reconstruction have enabled +telepresence systems to stream 3D portrait videos from a single camera in +real-time, potentially democratizing telepresence. However, per-frame 3D +reconstruction exhibits temporal inconsistency and forgets the user's +appearance. On the other hand, self-reenactment methods can render coherent 3D +portraits by driving a personalized 3D prior, but fail to faithfully +reconstruct the user's per-frame appearance (e.g., facial expressions and +lighting). In this work, we recognize the need to maintain both coherent +identity and dynamic per-frame appearance to enable the best possible realism. +To this end, we propose a new fusion-based method that fuses a personalized 3D +subject prior with per-frame information, producing temporally stable 3D videos +with faithful reconstruction of the user's per-frame appearances. Trained only +using synthetic data produced by an expression-conditioned 3D GAN, our +encoder-based method achieves both state-of-the-art 3D reconstruction accuracy +and temporal consistency on in-studio and in-the-wild datasets. + +
+
+
+
+
+ + ☆ Obtaining Favorable Layouts for Multiple Object Generation + + +
+ Large-scale text-to-image models that can generate high-quality and diverse images based on textual prompts have shown remarkable success. These models aim ultimately to create complex scenes, and addressing the challenge of multi-subject generation is a critical step towards this goal. However, the existing state-of-the-art diffusion models face difficulty when generating images that involve multiple subjects. When presented with a prompt containing more than one subject, these models may omit some subjects or merge them together. To address this challenge, we propose a novel approach based on a guiding principle. We allow the diffusion model to initially propose a layout, and then we rearrange the layout grid. This is achieved by enforcing cross-attention maps (XAMs) to adhere to proposed masks and by migrating pixels from latent maps to new locations determined by us. We introduce new loss terms aimed at reducing XAM entropy for a clearer spatial definition of subjects, reducing the overlap between XAMs, and ensuring that XAMs align with their respective masks. We contrast our approach with several alternative methods and show that it more faithfully captures the desired concepts across a variety of text prompts.
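Toy versions of the three loss terms described above, written for a stack of cross-attention maps (one per subject) and their proposed masks: an entropy term that sharpens each map, a pairwise overlap penalty, and a term penalizing attention that falls outside each subject's mask. The shapes, normalization, and (implicit unit) weighting are assumptions, and the latent-pixel migration step is not included.

```python
import torch

def xam_losses(xams: torch.Tensor, masks: torch.Tensor, eps: float = 1e-8):
    """xams, masks: (num_subjects, H, W); each map is normalized to sum to 1 per subject."""
    p = xams / (xams.sum(dim=(1, 2), keepdim=True) + eps)

    # 1) Entropy loss: concentrate each subject's attention into a compact region.
    entropy = -(p * (p + eps).log()).sum(dim=(1, 2)).mean()

    # 2) Overlap loss: penalize attention that different subjects place on the same pixels.
    overlap = 0.0
    n = p.shape[0]
    for i in range(n):
        for j in range(i + 1, n):
            overlap = overlap + (p[i] * p[j]).sum()

    # 3) Mask loss: attention escaping a subject's proposed mask is penalized.
    mask_term = (p * (1.0 - masks)).sum(dim=(1, 2)).mean()

    return entropy, overlap, mask_term

xams = torch.rand(3, 16, 16, requires_grad=True)
masks = (torch.rand(3, 16, 16) > 0.5).float()
e, o, m = xam_losses(xams, masks)
total = e + o + m
total.backward()
print(float(e), float(o), float(m))
```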
+
+
+
+
+ + ☆ Deep Reward Supervisions for Tuning Text-to-Image Diffusion Models + + +
+ Optimizing a text-to-image diffusion model with a given reward function is an +important but underexplored research area. In this study, we propose Deep +Reward Tuning (DRTune), an algorithm that directly supervises the final output +image of a text-to-image diffusion model and back-propagates through the +iterative sampling process to the input noise. We find that training earlier +steps in the sampling process is crucial for low-level rewards, and deep +supervision can be achieved efficiently and effectively by stopping the +gradient of the denoising network input. DRTune is extensively evaluated on +various reward models. It consistently outperforms other algorithms, +particularly for low-level control signals, where all shallow supervision +methods fail. Additionally, we fine-tune Stable Diffusion XL 1.0 (SDXL 1.0) +model via DRTune to optimize Human Preference Score v2.1, resulting in the +Favorable Diffusion XL 1.0 (FDXL 1.0) model. FDXL 1.0 significantly enhances +image quality compared to SDXL 1.0 and reaches comparable quality compared with +Midjourney v5.2. + +
+
+ comment: N/A +
+
+
+
+
+ + ☆ CLIPArTT: Light-weight Adaptation of CLIP to New Domains at Test Time + + +
+ Pre-trained vision-language models (VLMs), exemplified by CLIP, demonstrate +remarkable adaptability across zero-shot classification tasks without +additional training. However, their performance diminishes in the presence of +domain shifts. In this study, we introduce CLIP Adaptation duRing Test-Time +(CLIPArTT), a fully test-time adaptation (TTA) approach for CLIP, which +involves automatic text prompts construction during inference for their use as +text supervision. Our method employs a unique, minimally invasive text prompt +tuning process, wherein multiple predicted classes are aggregated into a single +new text prompt, used as pseudo label to re-classify inputs in a transductive +manner. Additionally, we pioneer the standardization of TTA benchmarks (e.g., +TENT) in the realm of VLMs. Our findings demonstrate that, without requiring +additional transformations nor new trainable modules, CLIPArTT enhances +performance dynamically across non-corrupted datasets such as CIFAR-10, +corrupted datasets like CIFAR-10-C and CIFAR-10.1, alongside synthetic datasets +such as VisDA-C. This research underscores the potential for improving VLMs' +adaptability through novel test-time strategies, offering insights for robust +performance across varied datasets and environments. The code can be found at: +https://github.com/dosowiechi/CLIPArTT.git + +
+
+
+
+
+ + ☆ More is Better: Deep Domain Adaptation with Multiple Sources IJCAI 2024 + + +
+ In many practical applications, it is often difficult and expensive to obtain +large-scale labeled data to train state-of-the-art deep neural networks. +Therefore, transferring the learned knowledge from a separate, labeled source +domain to an unlabeled or sparsely labeled target domain becomes an appealing +alternative. However, direct transfer often results in significant performance +decay due to domain shift. Domain adaptation (DA) aims to address this problem +by aligning the distributions between the source and target domains. +Multi-source domain adaptation (MDA) is a powerful and practical extension in +which the labeled data may be collected from multiple sources with different +distributions. In this survey, we first define various MDA strategies. Then we +systematically summarize and compare modern MDA methods in the deep learning +era from different perspectives, followed by commonly used datasets and a brief +benchmark. Finally, we discuss future research directions for MDA that are +worth investigating. + +
+
+ comment: Accepted by IJCAI 2024. arXiv admin note: text overlap with + arXiv:2002.12169 +
+
+
+
+
+ + ☆ Reference-Free Image Quality Metric for Degradation and Reconstruction + Artifacts + + +
+ Image Quality Assessment (IQA) is essential in various Computer Vision tasks such as image deblurring and super-resolution. However, most IQA methods require reference images, which are not always available. While there are some reference-free IQA metrics, they have limitations in simulating human perception and discerning subtle image quality variations. We hypothesize that the JPEG quality factor is representative of image quality, and a well-trained neural network can learn to accurately evaluate image quality without requiring a clean reference, as it can recognize image degradation artifacts based on prior knowledge. Thus, we developed a reference-free quality evaluation network, dubbed "Quality Factor (QF) Predictor", which does not require any reference. Our QF Predictor is a lightweight, fully convolutional network comprising seven layers. The model is trained in a self-supervised manner: it receives a JPEG-compressed image patch with a random QF as input and is trained to accurately predict the corresponding QF. We demonstrate the versatility of the model by applying it to various tasks. First, our QF Predictor can generalize to measure the severity of various image artifacts, such as Gaussian Blur and Gaussian noise. Second, we show that the QF Predictor can be trained to predict the undersampling rate of images reconstructed from Magnetic Resonance Imaging (MRI) data.
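The self-supervised recipe above (compress a patch with a random QF, then regress that QF) can be sketched directly with Pillow and PyTorch. The network below is only a small stand-in for the described seven-layer fully convolutional model, and the random patch, QF range, and target scaling are illustrative assumptions.

```python
import io
import numpy as np
import torch
from torch import nn
from PIL import Image

def random_qf_sample(patch: np.ndarray):
    """Compress an RGB patch with a random JPEG quality factor and return (patch, target QF)."""
    qf = int(np.random.randint(10, 96))
    buf = io.BytesIO()
    Image.fromarray(patch).save(buf, format="JPEG", quality=qf)
    buf.seek(0)
    compressed = np.asarray(Image.open(buf).convert("RGB"), dtype=np.float32) / 255.0
    return torch.from_numpy(compressed).permute(2, 0, 1), torch.tensor([qf / 100.0])

# Minimal fully convolutional regressor (a stand-in for the described seven-layer model).
model = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(32, 1, 3, padding=1),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(),
)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

patch = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)   # stand-in training patch
x, y = random_qf_sample(patch)
pred = model(x.unsqueeze(0))
loss = nn.functional.mse_loss(pred, y.unsqueeze(0))           # regress the quality factor
loss.backward()
opt.step()
print(float(loss))
```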
+
+
+
+
+ + ☆ Deep Learning Descriptor Hybridization with Feature Reduction for + Accurate Cervical Cancer Colposcopy Image Classification + + +
+ Cervical cancer stands as a predominant cause of female mortality, +underscoring the need for regular screenings to enable early diagnosis and +preemptive treatment of pre-cancerous conditions. The transformation zone in +the cervix, where cellular differentiation occurs, plays a critical role in the +detection of abnormalities. Colposcopy has emerged as a pivotal tool in +cervical cancer prevention since it provides a meticulous examination of +cervical abnormalities. However, challenges in visual evaluation necessitate +the development of Computer Aided Diagnosis (CAD) systems. + We propose a novel CAD system that combines the strengths of various +deep-learning descriptors (ResNet50, ResNet101, and ResNet152) with appropriate +feature normalization (min-max) as well as feature reduction technique (LDA). +The combination of different descriptors ensures that all the features +(low-level like edges and colour, high-level like shape and texture) are +captured, feature normalization prevents biased learning, and feature reduction +avoids overfitting. We do experiments on the IARC dataset provided by WHO. The +dataset is initially segmented and balanced. Our approach achieves exceptional +performance in the range of 97%-100% for both the normal-abnormal and the type +classification. A competitive approach for type classification on the same +dataset achieved 81%-91% performance. + +
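A compact sketch of the pipeline described above using torchvision and scikit-learn: features from three ResNet backbones are concatenated, min-max normalized, reduced with LDA, and passed to a simple classifier. The random tensors stand in for preprocessed colposcopy crops, the kNN head is an assumption, and no pretrained weights or real data are loaded here.

```python
import numpy as np
import torch
from torchvision import models
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

def extract(backbone, images):
    """Global-average-pooled features from a headless ResNet backbone."""
    backbone.fc = torch.nn.Identity()
    backbone.eval()
    with torch.no_grad():
        return backbone(images).numpy()

images = torch.randn(12, 3, 224, 224)            # stand-in for preprocessed colposcopy crops
labels = np.array([0, 1, 2] * 4)                 # stand-in type labels

# Hybridized descriptor: concatenated ResNet50/101/152 features (2048 dims each).
feats = np.concatenate(
    [extract(m(weights=None), images)
     for m in (models.resnet50, models.resnet101, models.resnet152)],
    axis=1,
)

clf = make_pipeline(MinMaxScaler(), LinearDiscriminantAnalysis(), KNeighborsClassifier(n_neighbors=3))
clf.fit(feats, labels)
print(clf.score(feats, labels))
```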
+
+ comment: 7 Pages double column, 5 figures, and 5 tables +
+
+
+
+
+ + ♻ ☆ GRASP: A Rehearsal Policy for Efficient Online Continual Learning + + +
+ Continual learning (CL) in deep neural networks (DNNs) involves incrementally +accumulating knowledge in a DNN from a growing data stream. A major challenge +in CL is that non-stationary data streams cause catastrophic forgetting of +previously learned abilities. A popular solution is rehearsal: storing past +observations in a buffer and then sampling the buffer to update the DNN. +Uniform sampling in a class-balanced manner is highly effective, and better +sample selection policies have been elusive. Here, we propose a new sample +selection policy called GRASP that selects the most prototypical (easy) samples +first and then gradually selects less prototypical (harder) examples. GRASP has +little additional compute or memory overhead compared to uniform selection, +enabling it to scale to large datasets. Compared to 17 other rehearsal +policies, GRASP achieves higher accuracy in CL experiments on ImageNet. +Compared to uniform balanced sampling, GRASP achieves the same performance with +40% fewer updates. We also show that GRASP is effective for CL on five text +classification datasets. + +
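The selection policy can be illustrated in a few lines: rank each buffered sample by its distance to its class prototype (mean feature) and replay the most prototypical samples first. The embeddings below are synthetic, and details such as class-balanced interleaving and the exact curriculum schedule from GRASP are omitted.

```python
import numpy as np

def grasp_order(embeddings: np.ndarray, labels: np.ndarray) -> np.ndarray:
    """Return buffer indices ordered from most prototypical (closest to class mean) to least."""
    dist = np.empty(len(labels))
    for c in np.unique(labels):
        idx = np.where(labels == c)[0]
        prototype = embeddings[idx].mean(axis=0)
        dist[idx] = np.linalg.norm(embeddings[idx] - prototype, axis=1)
    return np.argsort(dist)            # easy (prototypical) samples first, harder ones later

rng = np.random.default_rng(0)
buffer_embs = rng.normal(size=(100, 64))             # stand-in feature vectors for buffered samples
buffer_labels = rng.integers(0, 5, size=100)
order = grasp_order(buffer_embs, buffer_labels)
print(order[:10])                                    # indices of the ten most prototypical samples
```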
+
+ comment: Accepted to the Conference on Lifelong Learning Agents (CoLLAs) 2024 +
+
+
+
+
+ + ♻ ☆ Capabilities of Gemini Models in Medicine + + +
+ Excellence in a wide variety of medical applications poses considerable +challenges for AI, requiring advanced reasoning, access to up-to-date medical +knowledge and understanding of complex multimodal data. Gemini models, with +strong general capabilities in multimodal and long-context reasoning, offer +exciting possibilities in medicine. Building on these core strengths of Gemini, +we introduce Med-Gemini, a family of highly capable multimodal models that are +specialized in medicine with the ability to seamlessly use web search, and that +can be efficiently tailored to novel modalities using custom encoders. We +evaluate Med-Gemini on 14 medical benchmarks, establishing new state-of-the-art +(SoTA) performance on 10 of them, and surpass the GPT-4 model family on every +benchmark where a direct comparison is viable, often by a wide margin. On the +popular MedQA (USMLE) benchmark, our best-performing Med-Gemini model achieves +SoTA performance of 91.1% accuracy, using a novel uncertainty-guided search +strategy. On 7 multimodal benchmarks including NEJM Image Challenges and MMMU +(health & medicine), Med-Gemini improves over GPT-4V by an average relative +margin of 44.5%. We demonstrate the effectiveness of Med-Gemini's long-context +capabilities through SoTA performance on a needle-in-a-haystack retrieval task +from long de-identified health records and medical video question answering, +surpassing prior bespoke methods using only in-context learning. Finally, +Med-Gemini's performance suggests real-world utility by surpassing human +experts on tasks such as medical text summarization, alongside demonstrations +of promising potential for multimodal medical dialogue, medical research and +education. Taken together, our results offer compelling evidence for +Med-Gemini's potential, although further rigorous evaluation will be crucial +before real-world deployment in this safety-critical domain. + +
+
+
+
+
+ + ♻ ☆ NeRF as a Non-Distant Environment Emitter in Physics-based Inverse + Rendering SIGGRAPH 2024 + + +
+ Physics-based inverse rendering enables joint optimization of shape, +material, and lighting based on captured 2D images. To ensure accurate +reconstruction, using a light model that closely resembles the captured +environment is essential. Although the widely adopted distant environmental +lighting model is adequate in many cases, we demonstrate that its inability to +capture spatially varying illumination can lead to inaccurate reconstructions +in many real-world inverse rendering scenarios. To address this limitation, we +incorporate NeRF as a non-distant environment emitter into the inverse +rendering pipeline. Additionally, we introduce an emitter importance sampling +technique for NeRF to reduce the rendering variance. Through comparisons on +both real and synthetic datasets, our results demonstrate that our NeRF-based +emitter offers a more precise representation of scene lighting, thereby +improving the accuracy of inverse rendering. + +
+
+ comment: SIGGRAPH 2024. Project page and video: + https://nerfemitterpbir.github.io/ +
+
+
+
+
+ + ♻ ☆ HairFastGAN: Realistic and Robust Hair Transfer with a Fast + Encoder-Based Approach + + +
+ Our paper addresses the complex task of transferring a hairstyle from a +reference image to an input photo for virtual hair try-on. This task is +challenging due to the need to adapt to various photo poses, the sensitivity of +hairstyles, and the lack of objective metrics. The current state of the art +hairstyle transfer methods use an optimization process for different parts of +the approach, making them inexcusably slow. At the same time, faster +encoder-based models are of very low quality because they either operate in +StyleGAN's W+ space or use other low-dimensional image generators. +Additionally, both approaches have a problem with hairstyle transfer when the +source pose is very different from the target pose, because they either don't +consider the pose at all or deal with it inefficiently. In our paper, we +present the HairFast model, which uniquely solves these problems and achieves +high resolution, near real-time performance, and superior reconstruction +compared to optimization problem-based methods. Our solution includes a new +architecture operating in the FS latent space of StyleGAN, an enhanced +inpainting approach, and improved encoders for better alignment, color +transfer, and a new encoder for post-processing. The effectiveness of our +approach is demonstrated on realism metrics after random hairstyle transfer and +reconstruction when the original hairstyle is transferred. In the most +difficult scenario of transferring both shape and color of a hairstyle from +different images, our method performs in less than a second on the Nvidia V100. +Our code is available at https://github.com/AIRI-Institute/HairFastGAN. + +
+
+
+
+
+ + ♻ ☆ The R2D2 deep neural network series paradigm for fast precision imaging + in radio astronomy + + +
+ Radio-interferometric (RI) imaging entails solving high-resolution +high-dynamic range inverse problems from large data volumes. Recent image +reconstruction techniques grounded in optimization theory have demonstrated +remarkable capability for imaging precision, well beyond CLEAN's capability. +These range from advanced proximal algorithms propelled by handcrafted +regularization operators, such as the SARA family, to hybrid plug-and-play +(PnP) algorithms propelled by learned regularization denoisers, such as AIRI. +Optimization and PnP structures are however highly iterative, which hinders +their ability to handle the extreme data sizes expected from future +instruments. To address this scalability challenge, we introduce a novel deep +learning approach, dubbed "Residual-to-Residual DNN series for high-Dynamic +range imaging". R2D2's reconstruction is formed as a series of residual images, +iteratively estimated as outputs of Deep Neural Networks (DNNs) taking the +previous iteration's image estimate and associated data residual as inputs. It +thus takes a hybrid structure between a PnP algorithm and a learned version of +the matching pursuit algorithm that underpins CLEAN. We present a comprehensive +study of our approach, featuring its multiple incarnations distinguished by +their DNN architectures. We provide a detailed description of its training +process, targeting a telescope-specific approach. R2D2's capability to deliver +high precision is demonstrated in simulation, across a variety of image and +observation settings using the Very Large Array (VLA). Its reconstruction speed +is also demonstrated: with only few iterations required to clean data residuals +at dynamic ranges up to 100000, R2D2 opens the door to fast precision imaging. +R2D2 codes are available in the BASPLib library on GitHub. + +
+
+ comment: Accepted for publication in ApJS +
+
+
+
+
+ + ♻ ☆ Probabilistic Sampling of Balanced K-Means using Adiabatic Quantum + Computing CVPR 2024 + + +
+ Adiabatic quantum computing (AQC) is a promising approach for discrete and +often NP-hard optimization problems. Current AQCs allow to implement problems +of research interest, which has sparked the development of quantum +representations for many computer vision tasks. Despite requiring multiple +measurements from the noisy AQC, current approaches only utilize the best +measurement, discarding information contained in the remaining ones. In this +work, we explore the potential of using this information for probabilistic +balanced k-means clustering. Instead of discarding non-optimal solutions, we +propose to use them to compute calibrated posterior probabilities with little +additional compute cost. This allows us to identify ambiguous solutions and +data points, which we demonstrate on a D-Wave AQC on synthetic tasks and real +visual data. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Minimal Set of Parameters Based Depth-Dependent Distortion Model and + Its Calibration Method for Stereo Vision Systems + + +
+ Depth position highly affects lens distortion, especially in close-range +photography, which limits the measurement accuracy of existing stereo vision +systems. Moreover, traditional depth-dependent distortion models and their +calibration methods have remained complicated. In this work, we propose a +minimal set of parameters based depth-dependent distortion model (MDM), which +considers the radial and decentering distortions of the lens to improve the +accuracy of stereo vision systems and simplify their calibration process. In +addition, we present an easy and flexible calibration method for the MDM of +stereo vision systems with a commonly used planar pattern, which requires +cameras to observe the planar pattern in different orientations. The proposed +technique is easy to use and flexible compared with classical calibration +techniques for depth-dependent distortion models in which the lens must be +perpendicular to the planar pattern. The experimental validation of the MDM and +its calibration method showed that the MDM improved the calibration accuracy by +56.55% and 74.15% compared with the Li's distortion model and traditional +Brown's distortion model. Besides, an iteration-based reconstruction method is +proposed to iteratively estimate the depth information in the MDM during +three-dimensional reconstruction. The results showed that the accuracy of the +iteration-based reconstruction method was improved by 9.08% compared with that +of the non-iteration reconstruction method. + +
+
+ comment: This paper has been accepted for publication in IEEE Transactions on + Instrumentation and Measurement +
+
+
+
+
+ + ♻ ☆ Learning to Complement with Multiple Humans + + +
+ Real-world image classification tasks tend to be complex, where expert +labellers are sometimes unsure about the classes present in the images, leading +to the issue of learning with noisy labels (LNL). The ill-posedness of the LNL +task requires the adoption of strong assumptions or the use of multiple noisy +labels per training image, resulting in accurate models that work well in +isolation but fail to optimise human-AI collaborative classification (HAI-CC). +Unlike such LNL methods, HAI-CC aims to leverage the synergies between human +expertise and AI capabilities but requires clean training labels, limiting its +real-world applicability. This paper addresses this gap by introducing the +innovative Learning to Complement with Multiple Humans (LECOMH) approach. +LECOMH is designed to learn from noisy labels without depending on clean +labels, simultaneously maximising collaborative accuracy while minimising the +cost of human collaboration, measured by the number of human expert annotations +required per image. Additionally, new benchmarks featuring multiple noisy +labels for both training and testing are proposed to evaluate HAI-CC methods. +Through quantitative comparisons on these benchmarks, LECOMH consistently +outperforms competitive HAI-CC approaches, human labellers, multi-rater +learning, and noisy-label learning methods across various datasets, offering a +promising solution for addressing real-world image classification challenges. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ RTG-SLAM: Real-time 3D Reconstruction at Scale using Gaussian Splatting SIGGRAPH 2024 + + +
+ We present Real-time Gaussian SLAM (RTG-SLAM), a real-time 3D reconstruction +system with an RGBD camera for large-scale environments using Gaussian +splatting. The system features a compact Gaussian representation and a highly +efficient on-the-fly Gaussian optimization scheme. We force each Gaussian to be +either opaque or nearly transparent, with the opaque ones fitting the surface +and dominant colors, and transparent ones fitting residual colors. By rendering +depth in a different way from color rendering, we let a single opaque Gaussian +well fit a local surface region without the need of multiple overlapping +Gaussians, hence largely reducing the memory and computation cost. For +on-the-fly Gaussian optimization, we explicitly add Gaussians for three types +of pixels per frame: newly observed, with large color errors, and with large +depth errors. We also categorize all Gaussians into stable and unstable ones, +where the stable Gaussians are expected to well fit previously observed RGBD +images and otherwise unstable. We only optimize the unstable Gaussians and only +render the pixels occupied by unstable Gaussians. In this way, both the number +of Gaussians to be optimized and pixels to be rendered are largely reduced, and +the optimization can be done in real time. We show real-time reconstructions of +a variety of large scenes. Compared with the state-of-the-art NeRF-based RGBD +SLAM, our system achieves comparable high-quality reconstruction but with +around twice the speed and half the memory cost, and shows superior performance +in the realism of novel view synthesis and camera tracking accuracy. + +
+
+ comment: To be published in ACM SIGGRAPH 2024 +
+
+
+
+
+ + ♻ ☆ Attention is All They Need: Exploring the Media Archaeology of the + Computer Vision Research Paper + + +
+ Research papers, in addition to textual documents, are a designed interface +through which researchers communicate. Recently, rapid growth has transformed +that interface in many fields of computing. In this work, we examine the +effects of this growth from a media archaeology perspective, through the +changes to figures and tables in research papers. Specifically, we study these +changes in computer vision over the past decade, as the deep learning +revolution has driven unprecedented growth in the discipline. We ground our +investigation through interviews with veteran researchers spanning computer +vision, graphics and visualization. Our analysis focuses on the research +attention economy: how research paper elements contribute towards advertising, +measuring and disseminating an increasingly commodified ``contribution.'' +Through this work, we seek to motivate future discussion surrounding the design +of both the research paper itself as well as the larger sociotechnical research +publishing system, including tools for finding, reading and writing research +papers. + +
+
+
+
+
+ + ♻ ☆ Image-Based Virtual Try-On: A Survey + + +
+ Image-based virtual try-on aims to synthesize a naturally dressed person image with a clothing image, which revolutionizes online shopping and inspires related topics within image generation, showing both research significance and commercial potential. However, there is a gap between current research progress and commercial applications, and an absence of a comprehensive overview of this field to accelerate its development. In this survey, we provide a comprehensive analysis of the state-of-the-art techniques and methodologies in aspects of pipeline architecture, person representation and key modules such as try-on indication, clothing warping and try-on stage. We propose a new semantic criterion based on CLIP, and evaluate representative methods with uniformly implemented evaluation metrics on the same dataset. In addition to quantitative and qualitative evaluation of current open-source methods, unresolved issues are highlighted and future research directions are prospected to identify key trends and inspire further exploration. The uniformly implemented evaluation metrics, dataset and collected methods will be made publicly available at https://github.com/little-misfit/Survey-Of-Virtual-Try-On.
+
+ comment: 30 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ DSI2I: Dense Style for Unpaired Image-to-Image Translation + + +
+ Unpaired exemplar-based image-to-image (UEI2I) translation aims to translate +a source image to a target image domain with the style of a target image +exemplar, without ground-truth input-translation pairs. Existing UEI2I methods +represent style using one vector per image or rely on semantic supervision to +define one style vector per object. Here, in contrast, we propose to represent +style as a dense feature map, allowing for a finer-grained transfer to the +source image without requiring any external semantic information. We then rely +on perceptual and adversarial losses to disentangle our dense style and content +representations. To stylize the source content with the exemplar style, we +extract unsupervised cross-domain semantic correspondences and warp the +exemplar style to the source content. We demonstrate the effectiveness of our +method on four datasets using standard metrics together with a localized style +metric we propose, which measures style similarity in a class-wise manner. Our +results show that the translations produced by our approach are more diverse, +preserve the source content better, and are closer to the exemplars when +compared to the state-of-the-art methods. Project page: +https://github.com/IVRL/dsi2i + +
+
+ comment: To appear on TMLR '24, Reviewed on OpenReview: + https://openreview.net/forum?id=mrJi5kdKA4 +
+
+
+
+
+ + ♻ ☆ DressCode: Autoregressively Sewing and Generating Garments from Text + Guidance + + +
+ Apparel's significant role in human appearance underscores the importance of +garment digitalization for digital human creation. Recent advances in 3D +content creation are pivotal for digital human creation. Nonetheless, garment +generation from text guidance is still nascent. We introduce a text-driven 3D +garment generation framework, DressCode, which aims to democratize design for +novices and offer immense potential in fashion design, virtual try-on, and +digital human creation. We first introduce SewingGPT, a GPT-based architecture +integrating cross-attention with text-conditioned embedding to generate sewing +patterns with text guidance. We then tailor a pre-trained Stable Diffusion to +generate tile-based Physically-based Rendering (PBR) textures for the garments. +By leveraging a large language model, our framework generates CG-friendly +garments through natural language interaction. It also facilitates pattern +completion and texture editing, streamlining the design process through +user-friendly interaction. This framework fosters innovation by allowing +creators to freely experiment with designs and incorporate unique elements into +their work. With comprehensive evaluations and comparisons with other +state-of-the-art methods, our method showcases superior quality and alignment +with input prompts. User studies further validate our high-quality rendering +results, highlighting its practical utility and potential in production +settings. Our project page is https://IHe-KaiI.github.io/DressCode/. + +
+
+ comment: Project page: https://IHe-KaiI.github.io/DressCode/ +
+
+
+
+
+ + ♻ ☆ Benchmarking Deep Learning Architectures for Urban Vegetation Point + Cloud Semantic Segmentation from MLS + + +
+ Vegetation is crucial for sustainable and resilient cities providing various +ecosystem services and well-being of humans. However, vegetation is under +critical stress with rapid urbanization and expanding infrastructure +footprints. Consequently, mapping of this vegetation is essential in the urban +environment. Recently, deep learning for point cloud semantic segmentation has +shown significant progress. Advanced models attempt to obtain state-of-the-art +performance on benchmark datasets, comprising multiple classes and representing +real world scenarios. However, class specific segmentation with respect to +vegetation points has not been explored. Therefore, selection of a deep +learning model for vegetation points segmentation is ambiguous. To address this +problem, we provide a comprehensive assessment of point-based deep learning +models for semantic segmentation of vegetation class. We have selected seven +representative point-based models, namely PointCNN, KPConv (omni-supervised), +RandLANet, SCFNet, PointNeXt, SPoTr and PointMetaBase. These models are +investigated on three different datasets, specifically Chandigarh, Toronto3D +and Kerala, which are characterized by diverse nature of vegetation and varying +scene complexity combined with changing per-point features and class-wise +composition. PointMetaBase and KPConv (omni-supervised) achieve the highest +mIoU on the Chandigarh (95.24%) and Toronto3D datasets (91.26%), respectively +while PointCNN provides the highest mIoU on the Kerala dataset (85.68%). The +paper develops a deeper insight, hitherto not reported, into the working of +these models for vegetation segmentation and outlines the ingredients that +should be included in a model specifically for vegetation segmentation. This +paper is a step towards the development of a novel architecture for vegetation +points segmentation. + +
+
+ comment: The paper has been accepted for publication in IEEE Transactions on + Geoscience and Remote Sensing. DOI: 10.1109/TGRS.2024.3381976 +
+
+
+
+
+ + ♻ ☆ Semantic-guided modeling of spatial relation and object co-occurrence + for indoor scene recognition + + +
+ Exploring the semantic context in scene images is essential for indoor scene +recognition. However, due to the diverse intra-class spatial layouts and the +coexisting inter-class objects, modeling contextual relationships to adapt +various image characteristics is a great challenge. Existing contextual +modeling methods for scene recognition exhibit two limitations: 1) They +typically model only one kind of spatial relationship among objects within +scenes in an artificially predefined manner, with limited exploration of +diverse spatial layouts. 2) They often overlook the differences in coexisting +objects across different scenes, suppressing scene recognition performance. To +overcome these limitations, we propose SpaCoNet, which simultaneously models +Spatial relation and Co-occurrence of objects guided by semantic segmentation. +Firstly, the Semantic Spatial Relation Module (SSRM) is constructed to model +scene spatial features. With the help of semantic segmentation, this module +decouples the spatial information from the scene image and thoroughly explores +all spatial relationships among objects in an end-to-end manner. Secondly, both +spatial features from the SSRM and deep features from the Image Feature +Extraction Module are allocated to each object, so as to distinguish the +coexisting object across different scenes. Finally, utilizing the +discriminative features above, we design a Global-Local Dependency Module to +explore the long-range co-occurrence among objects, and further generate a +semantic-guided feature representation for indoor scene recognition. +Experimental results on three widely used scene datasets demonstrate the +effectiveness and generality of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Towards Learning Contrast Kinetics with Multi-Condition Latent Diffusion + Models + + +
+ Contrast agents in dynamic contrast enhanced magnetic resonance imaging allow +to localize tumors and observe their contrast kinetics, which is essential for +cancer characterization and respective treatment decision-making. However, +contrast agent administration is not only associated with adverse health risks, +but also restricted for patients during pregnancy, and for those with kidney +malfunction, or other adverse reactions. With contrast uptake as key biomarker +for lesion malignancy, cancer recurrence risk, and treatment response, it +becomes pivotal to reduce the dependency on intravenous contrast agent +administration. To this end, we propose a multi-conditional latent diffusion +model capable of acquisition time-conditioned image synthesis of DCE-MRI +temporal sequences. To evaluate medical image synthesis, we additionally +propose and validate the Fr\'echet radiomics distance as an image quality +measure based on biomarker variability between synthetic and real imaging data. +Our results demonstrate our method's ability to generate realistic +multi-sequence fat-saturated breast DCE-MRI and uncover the emerging potential +of deep learning based contrast kinetics simulation. We publicly share our +accessible codebase at https://github.com/RichardObi/ccnet and provide a +user-friendly library for Fr\'echet radiomics distance calculation at +https://pypi.org/project/frd-score. + +
+
+
+
+
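+ The Fr\'echet radiomics distance above shares its closed form with the Fr\'echet distance
+ between two Gaussians fitted to feature sets; a generic sketch of that formula follows, with
+ random stand-in features (the authors' frd-score package is the reference implementation).
+
+ import numpy as np
+ from scipy import linalg
+
+ def frechet_distance(feats_a, feats_b):
+     """Frechet distance between Gaussians fitted to two (n_samples, n_features) matrices."""
+     mu1, mu2 = feats_a.mean(axis=0), feats_b.mean(axis=0)
+     c1 = np.cov(feats_a, rowvar=False)
+     c2 = np.cov(feats_b, rowvar=False)
+     covmean = linalg.sqrtm(c1 @ c2)
+     if np.iscomplexobj(covmean):
+         covmean = covmean.real  # drop tiny imaginary parts from numerical noise
+     diff = mu1 - mu2
+     return float(diff @ diff + np.trace(c1 + c2 - 2.0 * covmean))
+
+ rng = np.random.default_rng(0)
+ real = rng.normal(size=(200, 8))        # e.g. radiomics features of real images
+ synth = rng.normal(0.3, 1.0, (200, 8))  # features of synthetic images
+ print(frechet_distance(real, synth))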
+ + ♻ ☆ SeaTurtleID2022: A long-span dataset for reliable sea turtle + re-identification + + +
+ This paper introduces the first public large-scale, long-span dataset with +sea turtle photographs captured in the wild -- +\href{https://www.kaggle.com/datasets/wildlifedatasets/seaturtleid2022}{SeaTurtleID2022}. +The dataset contains 8729 photographs of 438 unique individuals collected +within 13 years, making it the longest-spanned dataset for animal +re-identification. All photographs include various annotations, e.g., identity, +encounter timestamp, and body parts segmentation masks. Instead of standard +"random" splits, the dataset allows for two realistic and ecologically +motivated splits: (i) a \textit{time-aware closed-set} with training, +validation, and test data from different days/years, and (ii) a +\textit{time-aware open-set} with new unknown individuals in test and +validation sets. We show that time-aware splits are essential for benchmarking +re-identification methods, as random splits lead to performance overestimation. +Furthermore, a baseline instance segmentation and re-identification performance +over various body parts is provided. Finally, an end-to-end system for sea +turtle re-identification is proposed and evaluated. The proposed system based +on Hybrid Task Cascade for head instance segmentation and ArcFace-trained +feature-extractor achieved an accuracy of 86.8\%. + +
+
+ comment: The SeaTurtleID2022 dataset is the latest version of the SeaTurtleID + dataset which was described in the previous versions of this arXiv + submission. Notice the change of title in the latest version +
+
+
+
+
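+ A minimal sketch of a time-aware closed-set split as described above, with hypothetical
+ metadata columns; the dataset ships its own split definitions, which should be preferred.
+
+ import pandas as pd
+
+ # Hypothetical encounter metadata.
+ meta = pd.DataFrame({
+     "photo":    ["a.jpg", "b.jpg", "c.jpg", "d.jpg", "e.jpg"],
+     "identity": ["t01", "t02", "t01", "t03", "t02"],
+     "year":     [2010, 2013, 2017, 2020, 2022],
+ })
+
+ # Time-aware closed-set split: train / val / test come from disjoint year ranges,
+ # rather than a random shuffle that would leak near-duplicate encounters.
+ train = meta[meta.year <= 2016]
+ val   = meta[(meta.year > 2016) & (meta.year <= 2019)]
+ test  = meta[meta.year > 2019]
+ print(len(train), len(val), len(test))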
+ + ♻ ☆ FusionVision: A comprehensive approach of 3D object reconstruction and + segmentation from RGB-D cameras using YOLO and fast segment anything + + +
+ In the realm of computer vision, the integration of advanced techniques into +the processing of RGB-D camera inputs poses a significant challenge, given the +inherent complexities arising from diverse environmental conditions and varying +object appearances. Therefore, this paper introduces FusionVision, an +exhaustive pipeline adapted for the robust 3D segmentation of objects in RGB-D +imagery. Traditional computer vision systems face limitations in simultaneously +capturing precise object boundaries and achieving high-precision object +detection on depth maps, as they are mainly designed for RGB cameras. To address +this challenge, FusionVision adopts an integrated approach by merging +state-of-the-art object detection techniques with advanced instance +segmentation methods. The integration of these components enables a holistic +(unified analysis of information obtained from both color \textit{RGB} and +depth \textit{D} channels) interpretation of RGB-D data, facilitating the +extraction of comprehensive and accurate object information. The proposed +FusionVision pipeline employs YOLO for identifying objects within the RGB image +domain. Subsequently, FastSAM, an innovative semantic segmentation model, is +applied to delineate object boundaries, yielding refined segmentation masks. +The synergy between these components and their integration into 3D scene +understanding ensures a cohesive fusion of object detection and segmentation, +enhancing overall precision in 3D object segmentation. The code and pre-trained +models are publicly available at https://github.com/safouaneelg/FusionVision/. + +
+
+ comment: 14 pages, 9 figures, 1 table +
+
+
+
+
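+ The 3D step of such a pipeline lifts 2D masks onto the aligned depth channel; a sketch of
+ that pinhole back-projection is below, with mock detector/segmenter outputs and assumed
+ camera intrinsics rather than actual YOLO or FastSAM calls.
+
+ import numpy as np
+
+ def masked_depth_to_points(depth, mask, fx, fy, cx, cy):
+     """Back-project masked depth pixels (metres) into camera-frame 3D points."""
+     v, u = np.nonzero(mask)              # pixel rows/cols inside the segmentation mask
+     z = depth[v, u]
+     valid = z > 0                        # drop missing depth readings
+     u, v, z = u[valid], v[valid], z[valid]
+     x = (u - cx) * z / fx
+     y = (v - cy) * z / fy
+     return np.stack([x, y, z], axis=1)
+
+ depth = np.full((480, 640), 1.5, dtype=np.float32)   # mock aligned depth map
+ mask = np.zeros((480, 640), dtype=bool)
+ mask[200:280, 300:360] = True                        # mock mask inside a detected box
+ points = masked_depth_to_points(depth, mask, fx=600.0, fy=600.0, cx=320.0, cy=240.0)
+ print(points.shape)                                  # (N, 3) object point cloud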
+ + ♻ ☆ Blurring Diffusion Models + + +
+ Recently, Rissanen et al., (2022) have presented a new type of diffusion +process for generative modeling based on heat dissipation, or blurring, as an +alternative to isotropic Gaussian diffusion. Here, we show that blurring can +equivalently be defined through a Gaussian diffusion process with non-isotropic +noise. In making this connection, we bridge the gap between inverse heat +dissipation and denoising diffusion, and we shed light on the inductive bias +that results from this modeling choice. Finally, we propose a generalized class +of diffusion models that offers the best of both standard Gaussian denoising +diffusion and inverse heat dissipation, which we call Blurring Diffusion +Models. + +
+
+
+
+
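+ A sketch of the reformulation in paraphrased notation (see the paper for the exact
+ schedules): with V^{\top} the DCT and \lambda_i the squared frequency of basis element i,
+ heat dissipation for time \tau_t scales each frequency component by e^{-\lambda_i \tau_t},
+ so blurring plus additive noise can be read as a Gaussian diffusion whose covariance is
+ diagonal in frequency space, i.e. non-isotropic in pixel space:
+
+ q(\mathbf{x}_t \mid \mathbf{x}_0)
+   = \mathcal{N}\big(\mathbf{x}_t;\ V \operatorname{diag}(\boldsymbol{\alpha}_t) V^{\top}\mathbf{x}_0,\
+                     V \operatorname{diag}(\boldsymbol{\sigma}_t^2) V^{\top}\big),
+ \qquad \alpha_{t,i} = e^{-\lambda_i \tau_t}.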
+ + ♻ ☆ FlightScope: A Deep Comprehensive Assessment of Aircraft Detection + Algorithms in Satellite Imagery + + +
+ Object detection in remotely sensed satellite pictures is fundamental in many +fields such as biophysical and environmental monitoring. While deep learning +algorithms are constantly evolving, they have been mostly implemented and +tested on popular ground-based photos. This paper critically evaluates +and compares a suite of advanced object detection algorithms customized for the +task of identifying aircraft within satellite imagery. Using the large +HRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset, +this research encompasses an array of methodologies including YOLO versions 5 +and 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from +scratch. This exhaustive training and validation study reveals YOLOv5 as the +preeminent model for the specific case of identifying airplanes from remote +sensing data, showcasing high precision and adaptability across diverse imaging +conditions. This research highlights the nuanced performance landscapes of these +algorithms, with YOLOv5 emerging as a robust solution for aerial object +detection, underlining its importance through superior mean average precision, +Recall, and Intersection over Union scores. The findings described here +underscore the fundamental role of algorithm selection aligned with the +specific demands of satellite imagery analysis and extend a comprehensive +framework to evaluate model efficacy. The benchmark toolkit and codes, +available via https://github.com/toelt-llc/FlightScope_Bench, aim to further +exploration and innovation in the realm of remote sensing object detection, +paving the way for improved analytical methodologies in satellite imagery +applications. + +
+
+ comment: 15 figures, 4 tables, comprehensive survey, comparative study +
+
+
+
+
+ + ♻ ☆ IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training + + +
+ In the field of medical Vision-Language Pre-training (VLP), significant +efforts have been devoted to deriving text and image features from both +clinical reports and associated medical images. However, most existing methods +may have overlooked the opportunity in leveraging the inherent hierarchical +structure of clinical reports, which are generally split into `findings' for +descriptive content and `impressions' for conclusive observation. Instead of +utilizing this rich, structured format, current medical VLP approaches often +simplify the report into either a unified entity or fragmented tokens. In this +work, we propose a novel clinical prior guided VLP framework named IMITATE to +learn the structure information from medical reports with hierarchical +vision-language alignment. The framework derives multi-level visual features +from the chest X-ray (CXR) images and separately aligns these features with the +descriptive and the conclusive text encoded in the hierarchical medical report. +Furthermore, a new clinical-informed contrastive loss is introduced for +cross-modal learning, which accounts for clinical prior knowledge in +formulating sample correlations in contrastive learning. The proposed model, +IMITATE, outperforms baseline VLP methods across six different datasets, +spanning five medical imaging downstream tasks. Comprehensive experimental +results highlight the advantages of integrating the hierarchical structure of +medical reports for vision-language alignment. + +
+
+ comment: Under Review +
+
+
+
+
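+ A generic sketch of the hierarchical alignment idea: separate contrastive (InfoNCE) terms
+ align multi-level visual features with the 'findings' and 'impression' texts; the
+ clinical-informed weighting of sample correlations is the paper's contribution and is not
+ reproduced here.
+
+ import torch
+ import torch.nn.functional as F
+
+ def info_nce(a, b, temperature=0.07):
+     """Symmetric InfoNCE between two aligned batches of embeddings of shape (B, D)."""
+     a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
+     logits = a @ b.t() / temperature
+     targets = torch.arange(a.size(0), device=a.device)
+     return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
+
+ B, D = 8, 256
+ low_level_visual  = torch.randn(B, D)   # stand-in for shallow CXR features
+ high_level_visual = torch.randn(B, D)   # stand-in for deep CXR features
+ findings_text     = torch.randn(B, D)   # encoded descriptive 'findings' section
+ impression_text   = torch.randn(B, D)   # encoded conclusive 'impression' section
+
+ loss = info_nce(low_level_visual, findings_text) + info_nce(high_level_visual, impression_text)
+ print(loss.item())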
+ + ♻ ☆ Rethinking Real-world Image Deraining via An Unpaired + Degradation-Conditioned Diffusion Model + + +
+ Recent diffusion models have exhibited great potential in generative modeling +tasks. Part of their success can be attributed to the ability to train +stably on huge sets of paired synthetic data. However, adapting these models to +real-world image deraining remains difficult in two respects. First, a +large-scale paired real-world clean/rainy dataset is unavailable, while +regular conditional diffusion models heavily rely on paired data for training. +Second, real-world rain usually reflects real-world scenarios with a variety of +unknown rain degradation types, which poses a significant challenge for the +generative modeling process. To meet these challenges, we propose RainDiff, the +first real-world image deraining paradigm based on diffusion models, serving as +a new standard for real-world image deraining. We address the first +challenge by introducing a stable and non-adversarial unpaired cycle-consistent +architecture that can be trained, end-to-end, with only unpaired data for +supervision; and the second challenge by proposing a degradation-conditioned +diffusion model that refines the desired output via a diffusive generative +process conditioned by learned priors of multiple rain degradations. Extensive +experiments confirm the superiority of our RainDiff over existing +unpaired/semi-supervised methods and show its competitive advantages over +several fully-supervised ones. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ A Novel Spike Transformer Network for Depth Estimation from Event + Cameras via Cross-modality Knowledge Distillation + + +
+ Depth estimation is crucial for interpreting complex environments, especially +in areas such as autonomous vehicle navigation and robotics. Nonetheless, +obtaining accurate depth readings from event camera data remains a formidable +challenge. Event cameras operate differently from traditional digital cameras, +continuously capturing data and generating asynchronous binary spikes that +encode time, location, and light intensity. Yet, the unique sampling mechanisms +of event cameras render standard image-based algorithms inadequate for +processing spike data. This necessitates the development of innovative, +spike-aware algorithms tailored for event cameras, a task compounded by the +irregularity, continuity, noise, and spatial and temporal characteristics +inherent in spiking data. Harnessing the strong generalization capabilities of +transformer neural networks for spatiotemporal data, we propose a purely +spike-driven spike transformer network for depth estimation from spiking camera +data. To address performance limitations with Spiking Neural Networks (SNN), we +introduce a novel single-stage cross-modality knowledge transfer framework +leveraging knowledge from a large artificial neural network (ANN) vision +foundation model (DINOv2) to enhance the performance of SNNs with limited +data. Our experimental results on both synthetic and real datasets show +substantial improvements over existing models, with notable gains in Absolute +Relative and Square Relative errors (49% and 39.77% improvements over the +benchmark model Spike-T, respectively). Besides accuracy, the proposed model +also demonstrates reduced power consumption, a critical factor for practical +applications. + +
+
+ comment: 16 pages +
+
+
+
+
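+ A minimal sketch of single-stage cross-modality feature distillation: a frozen ANN teacher
+ supplies features that the student matches alongside its task loss. The modules, projection
+ head, and loss weight below are stand-ins, not the paper's spiking architecture or DINOv2.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ teacher = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU()).eval()  # frozen ANN stand-in
+ student = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.ReLU())         # spike-network stand-in
+ project = nn.Conv2d(16, 32, 1)        # maps student features into the teacher's channel space
+ depth_head = nn.Conv2d(16, 1, 1)
+
+ x = torch.rand(2, 1, 64, 64)          # mock event/spike input
+ gt_depth = torch.rand(2, 1, 64, 64)
+
+ with torch.no_grad():
+     t_feat = teacher(x)
+ s_feat = student(x)
+ pred = depth_head(s_feat)
+
+ task_loss = F.l1_loss(pred, gt_depth)
+ distill_loss = F.mse_loss(project(s_feat), t_feat)   # cross-modality feature matching
+ loss = task_loss + 0.5 * distill_loss                # 0.5 is an assumed weight
+ loss.backward()
+ print(task_loss.item(), distill_loss.item())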
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific +pedestrians in occluded situations. However, occluded person ReID still suffers +from background clutter and low-quality local feature representations, which +limit model performance. In our research, we introduce a new framework called +PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms +to tackle the aforementioned issues effectively. Firstly, we introduce the +human parsing label to guide the generation of more accurate human part +attention maps. In addition, we propose a fine-grained feature focuser for +generating fine-grained human local feature representations while suppressing +background interference. Moreover, we design a part triplet loss to +supervise the learning of human local features, which optimizes +intra/inter-class distance. We conducted extensive experiments on specialized +occlusion and regular ReID datasets, showcasing that our approach outperforms +the existing state-of-the-art methods. + +
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
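+ A sketch of a per-part triplet objective of the kind described above, using PyTorch's
+ built-in margin loss; the margin and feature shapes are assumptions, not the paper's exact
+ formulation.
+
+ import torch
+ import torch.nn as nn
+
+ triplet = nn.TripletMarginLoss(margin=0.3)   # margin value is an assumption
+
+ P, D = 4, 128                                # 4 body parts, 128-dim local features
+ anchor   = torch.randn(P, D)                 # part features of an occluded query image
+ positive = torch.randn(P, D)                 # same identity, different view
+ negative = torch.randn(P, D)                 # different identity
+
+ # Treating the part dimension as the batch gives one triplet term per human part,
+ # pulling intra-class parts together and pushing inter-class parts apart.
+ loss = triplet(anchor, positive, negative)
+ print(loss.item())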
+ + ♻ ☆ FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization + Search + + +
+ Quantization has become a mainstream compression technique for reducing model +size, computational requirements, and energy consumption for modern deep neural +networks (DNNs). With improved numerical support in recent hardware, including +multiple variants of integer and floating point, mixed-precision quantization +has become necessary to achieve high-quality results with low model cost. Prior +mixed-precision methods have performed either a post-training quantization +search, which compromises on accuracy, or a differentiable quantization search, +which leads to high memory usage from branching. Therefore, we propose the +first one-shot mixed-precision quantization search that eliminates the need for +retraining in both integer and low-precision floating point models. We evaluate +our search (FLIQS) on multiple convolutional and vision transformer networks to +discover Pareto-optimal models. Our approach improves upon uniform precision, +manual mixed-precision, and recent integer quantization search methods. With +integer models, we increase the accuracy of ResNet-18 on ImageNet by 1.31% and +ResNet-50 by 0.90% with equivalent model cost over previous methods. +Additionally, for the first time, we explore a novel mixed-precision +floating-point search and improve MobileNetV2 by up to 0.98% compared to prior +state-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously search +a joint quantization and neural architecture space and improve the ImageNet +accuracy by 2.69% with similar model cost on a MobileNetV2 search space. + +
+
+ comment: Accepted to AutoML 2024 +
+
+
+
+
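+ A sketch of the per-layer fake-quantization knob that a mixed-precision search assigns
+ (symmetric uniform integer quantization at different bit widths); it illustrates the search
+ space, not the FLIQS search itself.
+
+ import numpy as np
+
+ def fake_quantize(w, bits):
+     """Symmetric uniform quantize-dequantize of a weight tensor to `bits`-bit integers."""
+     qmax = 2 ** (bits - 1) - 1
+     scale = np.abs(w).max() / qmax if np.abs(w).max() > 0 else 1.0
+     return np.clip(np.round(w / scale), -qmax - 1, qmax) * scale
+
+ rng = np.random.default_rng(0)
+ layer_weights = {"conv1": rng.normal(size=(64, 3, 3, 3)), "fc": rng.normal(size=(10, 64))}
+ precision_choice = {"conv1": 8, "fc": 4}     # one candidate mixed-precision assignment
+
+ for name, w in layer_weights.items():
+     err = np.abs(w - fake_quantize(w, precision_choice[name])).mean()
+     print(f"{name}: {precision_choice[name]}-bit, mean abs error {err:.4f}")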
+ + ♻ ☆ HiH: A Multi-modal Hierarchy in Hierarchy Network for Unconstrained Gait + Recognition + + +
+ Gait recognition has achieved promising advances in controlled settings, yet +it significantly struggles in unconstrained environments due to challenges such +as view changes, occlusions, and varying walking speeds. Additionally, efforts +to fuse multiple modalities often face limited improvements because of +cross-modality incompatibility, particularly in outdoor scenarios. To address +these issues, we present a multi-modal Hierarchy in Hierarchy network (HiH) +that integrates silhouette and pose sequences for robust gait recognition. HiH +features a main branch that utilizes Hierarchical Gait Decomposer (HGD) modules +for depth-wise and intra-module hierarchical examination of general gait +patterns from silhouette data. This approach captures motion hierarchies from +overall body dynamics to detailed limb movements, facilitating the +representation of gait attributes across multiple spatial resolutions. +Complementing this, an auxiliary branch, based on 2D joint sequences, enriches +the spatial and temporal aspects of gait analysis. It employs a Deformable +Spatial Enhancement (DSE) module for pose-guided spatial attention and a +Deformable Temporal Alignment (DTA) module for aligning motion dynamics through +learned temporal offsets. Extensive evaluations across diverse indoor and +outdoor datasets demonstrate HiH's state-of-the-art performance, affirming a +well-balanced trade-off between accuracy and efficiency. + +
+
+
+
+
+ + ♻ ☆ Relaxometry Guided Quantitative Cardiac Magnetic Resonance Image + Reconstruction + + +
+ Deep learning-based methods have achieved impressive performance for +magnetic resonance imaging (MRI) reconstruction, enabling fast imaging for many +clinical applications. Previous methods employ convolutional networks to learn +the image prior as the regularization term. In quantitative MRI, the physical +model of nuclear magnetic resonance relaxometry is known, providing additional +prior knowledge for image reconstruction. However, traditional reconstruction +networks are limited to learning the spatial domain prior knowledge, ignoring +the relaxometry prior. Therefore, we propose a relaxometry-guided quantitative +MRI reconstruction framework to learn the spatial prior from data and the +relaxometry prior from MRI physics. Additionally, we evaluated the +performance of two popular reconstruction backbones, namely, recurrent +variational networks (RVN) and variational networks (VN) with U-Net. +Experiments demonstrate that the proposed method achieves highly promising +results in quantitative MRI reconstruction. + +
+
+
+
+
+ + ♻ ☆ Efficient Bayesian Uncertainty Estimation for nnU-Net + + +
+ The self-configuring nnU-Net has achieved leading performance in a large +range of medical image segmentation challenges. It is widely considered as the +model of choice and a strong baseline for medical image segmentation. However, +despite its extraordinary performance, nnU-Net does not supply a measure of +uncertainty to indicate its possible failure. This can be problematic for +large-scale image segmentation applications, where data are heterogeneous and +nnU-Net may fail without notice. In this work, we introduce a novel method to +estimate nnU-Net uncertainty for medical image segmentation. We propose a +highly effective scheme for posterior sampling of weight space for Bayesian +uncertainty estimation. Different from previous baseline methods such as Monte +Carlo Dropout and mean-field Bayesian Neural Networks, our proposed method does +not require a variational architecture and keeps the original nnU-Net +architecture intact, thereby preserving its excellent performance and ease of +use. Additionally, we boost the segmentation performance over the original +nnU-Net via marginalizing multi-modal posterior models. We applied our method +on the public ACDC and M&M datasets of cardiac MRI and demonstrated improved +uncertainty estimation over a range of baseline methods. The proposed method +further strengthens nnU-Net for medical image segmentation in terms of both +segmentation accuracy and quality control. + +
+
+
+
+
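+ A sketch of marginalizing predictions over sampled posterior weights and using predictive
+ entropy as a voxel-wise uncertainty map; the per-sample logits are mocked here, whereas the
+ paper obtains them from its posterior sampling scheme.
+
+ import numpy as np
+
+ def marginalize(logits_per_sample):
+     """Average softmax over posterior samples; return mean probs and predictive entropy."""
+     e = np.exp(logits_per_sample - logits_per_sample.max(axis=-1, keepdims=True))
+     probs = e / e.sum(axis=-1, keepdims=True)     # (S, H, W, C)
+     mean_probs = probs.mean(axis=0)               # marginal predictive distribution
+     entropy = -(mean_probs * np.log(mean_probs + 1e-12)).sum(axis=-1)
+     return mean_probs, entropy
+
+ rng = np.random.default_rng(0)
+ S, H, W, C = 5, 64, 64, 4                         # 5 posterior samples, 4 classes
+ logits = rng.normal(size=(S, H, W, C))            # mock per-sample network outputs
+ mean_probs, uncertainty = marginalize(logits)
+ segmentation = mean_probs.argmax(axis=-1)
+ print(segmentation.shape, float(uncertainty.mean()))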
+ + ♻ ☆ Expert-Adaptive Medical Image Segmentation + + +
+ Medical image segmentation (MIS) plays an instrumental role in medical image +analysis, where considerable effort has been devoted to automating the process. +Currently, mainstream MIS approaches are based on deep neural networks (DNNs), +which are typically trained on a dataset with annotations produced by certain +medical experts. In the medical domain, the annotations generated by different +experts can be inherently distinct due to complexity of medical images and +variations in expertise and post-segmentation missions. Consequently, the DNN +model trained on the data annotated by some experts may hardly adapt to a new +expert. In this work, we evaluate a customised expert-adaptive method, +characterised by multi-expert annotation, multi-task DNN-based model training, +and lightweight model fine-tuning, to investigate model's adaptivity to a new +expert in the situation where the amount and mobility of training images are +limited. Experiments conducted on brain MRI segmentation tasks with limited +training data demonstrate its effectiveness and the impact of its key +parameters. + +
+
+
+
+
+ + ♻ ☆ ChartReformer: Natural Language-Driven Chart Image Editing ICDAR 2024 + + +
+ Chart visualizations are essential for data interpretation and communication; +however, most charts are only accessible in image format and lack the +corresponding data tables and supplementary information, making it difficult to +alter their appearance for different application scenarios. To eliminate the +need for original underlying data and information to perform chart editing, we +propose ChartReformer, a natural language-driven chart image editing solution +that directly edits the charts from the input images with the given instruction +prompts. The key in this method is that we allow the model to comprehend the +chart and reason over the prompt to generate the corresponding underlying data +table and visual attributes for new charts, enabling precise edits. +Additionally, to generalize ChartReformer, we define and standardize various +types of chart editing, covering style, layout, format, and data-centric edits. +The experiments show promising results for the natural language-driven chart +image editing. + +
+
+ comment: Published in ICDAR 2024. Code and model are available at + https://github.com/pengyu965/ChartReformer +
+
+
+
+
+ + ♻ ☆ Automatic Segmentation of the Spinal Cord Nerve Rootlets + + +
+ Precise identification of spinal nerve rootlets is relevant to delineate +spinal levels for the study of functional activity in the spinal cord. The goal +of this study was to develop an automatic method for the semantic segmentation +of spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI) +scans. Images from two open-access MRI datasets were used to train a 3D +multi-class convolutional neural network using an active learning approach to +segment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal +level. The method was tested on 3T T2-weighted images from datasets unseen +during training to assess inter-site, inter-session, and inter-resolution +variability. The test Dice score was 0.67 +- 0.16 (mean +- standard deviation +across testing images and rootlets levels), suggesting a good performance. The +method also demonstrated low inter-vendor and inter-site variability +(coefficient of variation <= 1.41 %), as well as low inter-session variability +(coefficient of variation <= 1.30 %) indicating stable predictions across +different MRI vendors, sites, and sessions. The proposed methodology is +open-source and readily available in the Spinal Cord Toolbox (SCT) v6.2 and +higher. + +
+
+
+
+
+ + ♻ ☆ Orientation-conditioned Facial Texture Mapping for Video-based Facial + Remote Photoplethysmography Estimation + + +
+ Camera-based remote photoplethysmography (rPPG) enables contactless +measurement of important physiological signals such as pulse rate (PR). +However, dynamic and unconstrained subject motion introduces significant +variability into the facial appearance in video, confounding the ability of +video-based methods to accurately extract the rPPG signal. In this study, we +leverage the 3D facial surface to construct a novel orientation-conditioned +facial texture video representation which improves the motion robustness of +existing video-based facial rPPG estimation methods. Our proposed method +achieves a significant 18.2% performance improvement in cross-dataset testing +on MMPD over our baseline using the PhysNet model trained on PURE, highlighting +the efficacy and generalization benefits of our designed video representation. +We demonstrate significant performance improvements of up to 29.6% in all +tested motion scenarios in cross-dataset testing on MMPD, even in the presence +of dynamic and unconstrained subject motion, emphasizing the benefits of +disentangling motion through modeling the 3D facial surface for motion robust +facial rPPG estimation. We validate the efficacy of our design decisions and +the impact of different video processing steps through an ablation study. Our +findings illustrate the potential strengths of exploiting the 3D facial surface +as a general strategy for addressing dynamic and unconstrained subject motion +in videos. The code is available at +https://samcantrill.github.io/orientation-uv-rppg/. + +
+
+ comment: 12 pages, 8 figures, 6 tables; minor corrections +
+
+
+
+
+ + ♻ ☆ Anticipating Next Active Objects for Egocentric Videos + + +
+ This paper addresses the problem of anticipating the next-active-object +location in the future, for a given egocentric video clip where the contact +might happen, before any action takes place. The problem is considerably hard, +as we aim at estimating the position of such objects in a scenario where the +observed clip and the action segment are separated by the so-called ``time to +contact'' (TTC) segment. Many methods have been proposed to anticipate the +action of a person based on previous hand movements and interactions with the +surroundings. However, there have been no attempts to investigate the next +possible interactable object, and its future location with respect to the +first-person's motion and the field-of-view drift during the TTC window. We +define this as the task of Anticipating the Next ACTive Object (ANACTO). To +this end, we propose a transformer-based self-attention framework to identify +and locate the next-active-object in an egocentric clip. + We benchmark our method on three datasets: EpicKitchens-100, EGTEA+ and +Ego4D. We also provide annotations for the first two datasets. Our approach +performs best compared to relevant baseline methods. We also conduct ablation +studies to understand the effectiveness of the proposed and baseline methods on +varying conditions. Code and ANACTO task annotations will be made available +upon paper acceptance. + +
+
+ comment: Accepted by IEEE ACCESS, this paper carries the Manuscript DOI: + 10.1109/ACCESS.2024.3395282. The complete peer-reviewed version is available + via this DOI, while the arXiv version is a post-author manuscript without + peer-review +
+
+
+
+
+ + ♻ ☆ LISA: Reasoning Segmentation via Large Language Model + + +
+ Although perception systems have made remarkable advancements in recent +years, they still rely on explicit human instruction or pre-defined categories +to identify the target objects before executing visual recognition tasks. Such +systems cannot actively reason and comprehend implicit user intention. In this +work, we propose a new segmentation task -- reasoning segmentation. The task is +designed to output a segmentation mask given a complex and implicit query text. +Furthermore, we establish a benchmark comprising over one thousand +image-instruction-mask data samples, incorporating intricate reasoning and +world knowledge for evaluation purposes. Finally, we present LISA: Large +Language Instructed Segmentation Assistant, which inherits the language +generation capabilities of multimodal Large Language Models (LLMs) while also +possessing the ability to produce segmentation masks. We expand the original +vocabulary with a <SEG> token and propose the embedding-as-mask paradigm to +unlock the segmentation capability. Remarkably, LISA can handle cases involving +complex reasoning and world knowledge. Also, it demonstrates robust zero-shot +capability when trained exclusively on reasoning-free datasets. In addition, +fine-tuning the model with merely 239 reasoning segmentation data samples +results in further performance enhancement. Both quantitative and qualitative +experiments show our method effectively unlocks new reasoning segmentation +capabilities for multimodal LLMs. Code, models, and data are available at +https://github.com/dvlab-research/LISA. + +
+
+ comment: Code, models, and data are available at + https://github.com/dvlab-research/LISA +
+
+
+
+
+ + ♻ ☆ Semantic Line Combination Detector CVPR 2024 + + +
+ A novel algorithm, called semantic line combination detector (SLCD), to find +an optimal combination of semantic lines is proposed in this paper. It +processes all lines in each line combination at once to assess the overall +harmony of the lines. First, we generate various line combinations from +reliable lines. Second, we estimate the score of each line combination and +determine the best one. Experimental results demonstrate that the proposed SLCD +outperforms existing semantic line detectors on various datasets. Moreover, it +is shown that SLCD can be applied effectively to three vision tasks of +vanishing point detection, symmetry axis detection, and composition-based image +retrieval. Our codes are available at https://github.com/Jinwon-Ko/SLCD. + +
+
+ comment: CVPR 2024 accepted +
+
+
+
+
+ + ♻ ☆ Resource-Aware Heterogeneous Federated Learning using Neural + Architecture Search + + +
+ Federated Learning (FL) is extensively used to train AI/ML models in +distributed and privacy-preserving settings. Participant edge devices in FL +systems typically contain non-independent and identically distributed (Non-IID) +private data and unevenly distributed computational resources. Preserving user +data privacy while optimizing AI/ML models in a heterogeneous federated network +requires us to address data and system/resource heterogeneity. To address these +challenges, we propose Resource-aware Federated Learning (RaFL). RaFL allocates +resource-aware specialized models to edge devices using Neural Architecture +Search (NAS) and allows heterogeneous model architecture deployment by +knowledge extraction and fusion. Combining NAS and FL enables on-demand +customized model deployment for resource-diverse edge devices. Furthermore, we +propose a multi-model architecture fusion scheme allowing the aggregation of +the distributed learning results. Results demonstrate RaFL's superior resource +efficiency compared to SoTA. + +
+
+ comment: Accepted at the 30th International European Conference on Parallel + and Distributed Computing (Euro-Par 2024) +
+
+
+
+
+ + ♻ ☆ MotionMaster: Training-free Camera Motion Transfer For Video Generation + + +
+ The emergence of diffusion models has greatly propelled the progress in image +and video generation. Recently, some efforts have been made in controllable +video generation, including text-to-video generation and video motion control, +among which camera motion control is an important topic. However, existing +camera motion control methods rely on training a temporal camera module, and +necessitate substantial computation resources due to the large amount of +parameters in video generation models. Moreover, existing methods pre-define +camera motion types during training, which limits their flexibility in camera +control. Therefore, to reduce training costs and achieve flexible camera +control, we propose COMD, a novel training-free video motion transfer model, +which disentangles camera motions and object motions in source videos and +transfers the extracted camera motions to new videos. We first propose a +one-shot camera motion disentanglement method to extract camera motion from a +single source video, which separates the moving objects from the background and +estimates the camera motion in the moving objects region based on the motion in +the background by solving a Poisson equation. Furthermore, we propose a +few-shot camera motion disentanglement method to extract the common camera +motion from multiple videos with similar camera motions, which employs a +window-based clustering technique to extract the common features in temporal +attention maps of multiple videos. Finally, we propose a motion combination +method to combine different types of camera motions together, enabling our +model a more controllable and flexible camera control. Extensive experiments +demonstrate that our training-free approach can effectively decouple +camera-object motion and apply the decoupled camera motion to a wide range of +controllable video generation tasks, achieving flexible and diverse camera +motion control. + +
+
+
+
+
+ + ♻ ☆ Enhancing Super-Resolution Networks through Realistic Thick-Slice CT + Simulation + + +
+ Deep learning-based Generative Models have the potential to convert +low-resolution CT images into high-resolution counterparts without long +acquisition times and increased radiation exposure in thin-slice CT imaging. +However, procuring appropriate training data for these Super-Resolution (SR) +models is challenging. Previous SR research has simulated thick-slice CT images +from thin-slice CT images to create training pairs. However, these methods +either rely on simplistic interpolation techniques that lack realism or +sinogram reconstruction, which require the release of raw data and complex +reconstruction algorithms. Thus, we introduce a simple yet realistic method to +generate thick CT images from thin-slice CT images, facilitating the creation +of training pairs for SR algorithms. The training pairs produced by our method +closely resemble real data distributions (PSNR=49.74 vs. 40.66, p$<$0.05). A +multivariate Cox regression analysis involving thick slice CT images with lung +fibrosis revealed that only the radiomics features extracted using our method +demonstrated a significant correlation with mortality (HR=1.19 and HR=1.14, +p$<$0.005). This paper represents the first to identify and address the +challenge of generating appropriate paired training data for Deep +Learning-based CT SR models, which enhances the efficacy and applicability of +SR models in real-world scenarios. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Pit30M: A Benchmark for Global Localization in the Age of Self-Driving + Cars IROS 2020 + + +
+ We are interested in understanding whether retrieval-based localization +approaches are good enough in the context of self-driving vehicles. Towards +this goal, we introduce Pit30M, a new image and LiDAR dataset with over 30 +million frames, which is 10 to 100 times larger than those used in previous +work. Pit30M is captured under diverse conditions (i.e., season, weather, time +of the day, traffic), and provides accurate localization ground truth. We also +automatically annotate our dataset with historical weather and astronomical +data, as well as with image and LiDAR semantic segmentation as a proxy measure +for occlusion. We benchmark multiple existing methods for image and LiDAR +retrieval and, in the process, introduce a simple, yet effective convolutional +network-based LiDAR retrieval method that is competitive with the state of the +art. Our work provides, for the first time, a benchmark for sub-metre +retrieval-based localization at city scale. The dataset, its Python SDK, as +well as more information about the sensors, calibration, and metadata, are +available on the project website: https://pit30m.github.io/ + +
+
+ comment: Published at IROS 2020 +
+
+
+
+
+ + ♻ ☆ Attention-based Shape-Deformation Networks for Artifact-Free Geometry + Reconstruction of Lumbar Spine from MR Images + + +
+ Lumbar disc degeneration, a progressive structural wear and tear of lumbar +intervertebral disc, is regarded as an essential role on low back pain, a +significant global health concern. Automated lumbar spine geometry +reconstruction from MR images will enable fast measurement of medical +parameters to evaluate the lumbar status, in order to determine a suitable +treatment. Existing image segmentation-based techniques often generate +erroneous segments or unstructured point clouds, unsuitable for medical +parameter measurement. In this work, we present $\textit{UNet-DeformSA}$ and +$\textit{TransDeformer}$: novel attention-based deep neural networks that +reconstruct the geometry of the lumbar spine with high spatial accuracy and +mesh correspondence across patients, and we also present a variant of +$\textit{TransDeformer}$ for error estimation. Specially, we devise new +attention modules with a new attention formula, which integrate image features +and tokenized contour features to predict the displacements of the points on a +shape template without the need for image segmentation. The deformed template +reveals the lumbar spine geometry in an image. Experiment results show that our +networks generate artifact-free geometry outputs, and the variant of +$\textit{TransDeformer}$ can predict the errors of a reconstructed geometry. +Our code is available at https://github.com/linchenq/TransDeformer-Mesh. + +
+
+
+
+
+ + ♻ ☆ FairSeg: A Large-Scale Medical Image Segmentation Dataset for Fairness + Learning Using Segment Anything Model with Fair Error-Bound Scaling ICLR 2024 + + +
+ Fairness in artificial intelligence models has gained significantly more +attention in recent years, especially in the area of medicine, as fairness in +medical models is critical to people's well-being and lives. High-quality +medical fairness datasets are needed to promote fairness learning research. +Existing medical fairness datasets are all for classification tasks, and no +fairness datasets are available for medical segmentation, while medical +segmentation is an equally important clinical task as classifications, which +can provide detailed spatial information on organ abnormalities ready to be +assessed by clinicians. In this paper, we propose the first fairness dataset +for medical segmentation named Harvard-FairSeg with 10,000 subject samples. In +addition, we propose a fair error-bound scaling approach to reweight the loss +function with the upper error-bound in each identity group, using the segment +anything model (SAM). We anticipate that the segmentation performance equity +can be improved by explicitly tackling the hard cases with high training errors +in each identity group. To facilitate fair comparisons, we utilize a novel +equity-scaled segmentation performance metric to compare segmentation metrics +in the context of fairness, such as the equity-scaled Dice coefficient. Through +comprehensive experiments, we demonstrate that our fair error-bound scaling +approach either has superior or comparable fairness performance to the +state-of-the-art fairness learning models. The dataset and code are publicly +accessible via https://ophai.hms.harvard.edu/datasets/harvard-fairseg10k. + +
+
+ comment: ICLR 2024; Codes available at + https://github.com/Harvard-Ophthalmology-AI-Lab/FairSeg +
+
+
+
+
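+ A sketch of the group-wise comparison underlying such fairness metrics: per-identity-group
+ Dice and the worst-case gap. The paper's equity-scaled Dice additionally folds group
+ deviations into the overall score; its exact definition is not reproduced here.
+
+ import numpy as np
+
+ def dice(pred, gt):
+     inter = np.logical_and(pred, gt).sum()
+     return 2.0 * inter / (pred.sum() + gt.sum() + 1e-8)
+
+ rng = np.random.default_rng(0)
+ # Mock binary masks for samples tagged with an identity attribute.
+ samples = [(rng.random((64, 64)) > 0.5, rng.random((64, 64)) > 0.5, g)
+            for g in ["group_a"] * 10 + ["group_b"] * 10]
+
+ group_scores = {}
+ for pred, gt, g in samples:
+     group_scores.setdefault(g, []).append(dice(pred, gt))
+
+ per_group = {g: float(np.mean(s)) for g, s in group_scores.items()}
+ print(per_group, "gap:", max(per_group.values()) - min(per_group.values()))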
+ + ♻ ☆ LVOS: A Benchmark for Large-scale Long-term Video Object Segmentation + + +
+ Video object segmentation (VOS) aims to distinguish and track target objects +in a video. Despite the excellent performance achieved by off-the-shelf VOS +models, existing VOS benchmarks mainly focus on short-term videos lasting about +5 seconds, where objects remain visible most of the time. However, these +benchmarks poorly represent practical applications, and the absence of +long-term datasets restricts further investigation of VOS in realistic +scenarios. Thus, we propose a novel benchmark named LVOS, comprising 720 videos +with 296,401 frames and 407,945 high-quality annotations. Videos in LVOS last +1.14 minutes on average, approximately 5 times longer than videos in existing +datasets. Each video includes various attributes, especially challenges +deriving from the wild, such as long-term reappearing and cross-temporal +similar objects. Compared to previous benchmarks, our LVOS better reflects VOS +models' performance in real scenarios. Based on LVOS, we evaluate 20 existing +VOS models under 4 different settings and conduct a comprehensive analysis. On +LVOS, these models suffer a large performance drop, highlighting the challenge +of achieving precise tracking and segmentation in real-world scenarios. +Attribute-based analysis indicates that the key factor behind the accuracy decline is +increased video length, emphasizing LVOS's crucial role. We hope our LVOS can +advance the development of VOS in real scenes. Data and code are available at +https://lingyihongfd.github.io/lvos.github.io/. + +
+
+ comment: LVOS V2 +
+
+
+
+
+ + ♻ ☆ Mapping New Realities: Ground Truth Image Creation with Pix2Pix + Image-to-Image Translation + + +
+ Generative Adversarial Networks (GANs) have significantly advanced image +processing, with Pix2Pix being a notable framework for image-to-image +translation. This paper explores a novel application of Pix2Pix to transform +abstract map images into realistic ground truth images, addressing the scarcity +of such images crucial for domains like urban planning and autonomous vehicle +training. We detail the Pix2Pix model's utilization for generating +high-fidelity datasets, supported by a dataset of paired map and aerial images, +and enhanced by a tailored training regimen. The results demonstrate the +model's capability to accurately render complex urban features, establishing +its efficacy and potential for broad real-world applications. + +
+
+
+
+
+ + ♻ ☆ Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and + Mitigation + + +
+ The recent advancement of large and powerful models with Text-to-Image (T2I) +generation abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables +users to generate high-quality images from textual prompts. However, it has +become increasingly evident that even simple prompts could cause T2I models to +exhibit conspicuous social bias in generated images. Such bias might lead to +both allocational and representational harms in society, further marginalizing +minority groups. Noting this problem, a large body of recent works has been +dedicated to investigating different dimensions of bias in T2I systems. +However, an extensive review of these studies is lacking, hindering a +systematic understanding of current progress and research gaps. We present the +first extensive survey on bias in T2I generative models. In this survey, we +review prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture. +Specifically, we discuss how these works define, evaluate, and mitigate +different aspects of bias. We found that: (1) while gender and skintone biases +are widely studied, geo-cultural bias remains under-explored; (2) most works on +gender and skintone bias investigated occupational association, while other +aspects are less frequently studied; (3) almost all gender bias works overlook +non-binary identities in their studies; (4) evaluation datasets and metrics are +scattered, with no unified framework for measuring biases; and (5) current +mitigation methods fail to resolve biases comprehensively. Based on current +limitations, we point out future research directions that contribute to +human-centric definitions, evaluations, and mitigation of biases. We hope to +highlight the importance of studying biases in T2I systems, as well as +encourage future efforts to holistically understand and tackle biases, building +fair and trustworthy T2I technologies for everyone. + +
+
+
+
+
+ + ♻ ☆ Domain-Specific Block Selection and Paired-View Pseudo-Labeling for + Online Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test +domain without access to source data after deployment. Existing approaches +typically rely on self-training with pseudo-labels since ground-truth cannot be +obtained from test data. Although the quality of pseudo labels is important for +stable and accurate long-term adaptation, it has not been previously addressed. +In this work, we propose DPLOT, a simple yet effective TTA framework that +consists of two components: (1) domain-specific block selection and (2) +pseudo-label generation using paired-view images. Specifically, we select +blocks that involve domain-specific feature extraction and train these blocks +by entropy minimization. After blocks are adjusted for current test domain, we +generate pseudo-labels by averaging given test images and corresponding flipped +counterparts. By simply using flip augmentation, we prevent a decrease in the +quality of the pseudo-labels, which can be caused by the domain gap resulting +from strong augmentation. Our experimental results demonstrate that DPLOT +outperforms previous TTA methods in CIFAR10-C, CIFAR100-C, and ImageNet-C +benchmarks, reducing error by up to 5.4%, 9.1%, and 2.9%, respectively. Also, +we provide an extensive analysis to demonstrate effectiveness of our framework. +Code is available at +https://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
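+ A sketch of the paired-view pseudo-label step: softmax outputs on a test image and its
+ horizontally flipped view are averaged before taking the argmax. The classifier below is a
+ stand-in, and the domain-specific block selection is omitted.
+
+ import torch
+ import torch.nn.functional as F
+
+ model = torch.nn.Sequential(torch.nn.Conv2d(3, 10, 3, padding=1),
+                             torch.nn.AdaptiveAvgPool2d(1),
+                             torch.nn.Flatten())      # stand-in classifier
+
+ x = torch.rand(4, 3, 32, 32)                         # unlabeled test batch
+ with torch.no_grad():
+     p = F.softmax(model(x), dim=1)
+     p_flip = F.softmax(model(torch.flip(x, dims=[3])), dim=1)  # flipped view
+     pseudo = (p + p_flip) / 2                        # paired-view averaging
+ pseudo_labels = pseudo.argmax(dim=1)                 # used to self-train the selected blocks
+ print(pseudo_labels)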
+ + ♻ ☆ Adaptive aggregation of Monte Carlo augmented decomposed filters for + efficient group-equivariant convolutional neural network + + +
+ Group-equivariant convolutional neural networks (G-CNN) heavily rely on +parameter sharing to increase CNN's data efficiency and performance. However, +the parameter-sharing strategy greatly increases the computational burden for +each added parameter, which hampers its application to deep neural network +models. In this paper, we address these problems by proposing a +non-parameter-sharing approach for group equivariant neural networks. The +proposed methods adaptively aggregate a diverse range of filters by a weighted +sum of stochastically augmented decomposed filters. We give theoretical proof +about how the continuous group convolution can be approximated by our methods. +Our method applies to both continuous and discrete groups, where the +augmentation is implemented using Monte Carlo sampling and bootstrap +resampling, respectively. We demonstrate that our methods serve as an efficient +extension of standard CNN. Experiments on group equivariance tests show how our +methods can achieve superior performance to parameter-sharing group equivariant +networks. Experiments on image classification and image denoising tasks show +that in certain scenarios, with a suitable set of filter bases, our method +helps improve the performance of standard CNNs and build efficient lightweight +image denoising networks. The code will be available at +https://github.com/ZhaoWenzhao/MCG_CNN. + +
+
+
+
+
+ + ♻ ☆ VideoGigaGAN: Towards Detail-rich Video Super-Resolution + + +
+ Video super-resolution (VSR) approaches have shown impressive temporal +consistency in upsampled videos. However, these approaches tend to generate +blurrier results than their image counterparts as they are limited in their +generative capability. This raises a fundamental question: can we extend the +success of a generative image upsampler to the VSR task while preserving the +temporal consistency? We introduce VideoGigaGAN, a new generative VSR model +that can produce videos with high-frequency details and temporal consistency. +VideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply +inflating GigaGAN to a video model by adding temporal modules produces severe +temporal flickering. We identify several key issues and propose techniques that +significantly improve the temporal consistency of upsampled videos. Our +experiments show that, unlike previous VSR methods, VideoGigaGAN generates +temporally consistent videos with more fine-grained appearance details. We +validate the effectiveness of VideoGigaGAN by comparing it with +state-of-the-art VSR models on public datasets and showcasing video results +with $8\times$ super-resolution. + +
+
+ comment: project page: https://videogigagan.github.io/ +
+
+
+
+
+ + ♻ ☆ Zero-shot generalization across architectures for visual classification ICLR 2024 + + +
+ Generalization to unseen data is a key desideratum for deep networks, but its +relation to classification accuracy is unclear. Using a minimalist vision +dataset and a measure of generalizability, we show that popular networks, from +deep convolutional networks (CNNs) to transformers, vary in their power to +extrapolate to unseen classes both across layers and across architectures. +Accuracy is not a good predictor of generalizability, and generalization varies +non-monotonically with layer depth. + +
+
+ comment: Accepted as a Tiny Paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Espresso: Robust Concept Filtering in Text-to-Image Models + + +
+ Diffusion-based text-to-image (T2I) models generate high-fidelity images for +given textual prompts. They are trained on large datasets scraped from the +Internet, potentially containing unacceptable concepts (e.g., copyright +infringing or unsafe). Retraining T2I models after filtering out unacceptable +concepts in the training data is inefficient and degrades utility. Hence, there +is a need for concept removal techniques (CRTs) which are effective in removing +unacceptable concepts, utility-preserving on acceptable concepts, and robust +against evasion with adversarial prompts. None of the prior filtering and +fine-tuning CRTs satisfy all these requirements simultaneously. + We introduce Espresso, the first robust concept filter based on Contrastive +Language-Image Pre-Training (CLIP). It identifies unacceptable concepts by +projecting the generated image's embedding onto the vector connecting +unacceptable and acceptable concepts in the joint text-image embedding space. +This ensures robustness by restricting the adversary to adding noise only along +this vector, in the direction of the acceptable concept. Further fine-tuning +Espresso to separate embeddings of acceptable and unacceptable concepts, while +preserving their pairing with image embeddings, ensures both effectiveness and +utility. We evaluate Espresso on eleven concepts to show that it is effective +(~5% CLIP accuracy on unacceptable concepts), utility-preserving (~93% +normalized CLIP score on acceptable concepts), and robust (~4% CLIP accuracy on +adversarial prompts for unacceptable concepts). Finally, we present theoretical +bounds for the certified robustness of Espresso against adversarial prompts, +and an empirical analysis. + +
+
+
+
+
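+ A sketch of the described filtering rule: project the generated image's embedding onto the
+ direction from the acceptable to the unacceptable concept embedding and threshold its
+ position along that vector. The embeddings and threshold below are random stand-ins, not
+ real CLIP outputs or the paper's calibrated values.
+
+ import numpy as np
+
+ def filter_score(image_emb, acceptable_emb, unacceptable_emb):
+     """Signed position of the image embedding along the acceptable -> unacceptable axis."""
+     d = unacceptable_emb - acceptable_emb
+     d = d / (np.linalg.norm(d) + 1e-8)
+     midpoint = (acceptable_emb + unacceptable_emb) / 2
+     return float((image_emb - midpoint) @ d)
+
+ rng = np.random.default_rng(0)
+ acc, unacc = rng.normal(size=512), rng.normal(size=512)  # stand-ins for concept text embeddings
+ img = rng.normal(size=512)                               # stand-in for an image embedding
+
+ score = filter_score(img, acc, unacc)
+ print("blocked" if score > 0.0 else "allowed", score)    # threshold 0.0 is an assumption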
+ + ♻ ☆ One Model to Rule them All: Towards Universal Segmentation for Medical + Images with Text Prompts + + +
+ In this study, we focus on building up a model that aims to Segment Anything +in medical scenarios, driven by Text prompts, termed as SAT. Our main +contributions are three folds: (i) for dataset construction, we combine +multiple knowledge sources to construct the first multi-modal knowledge tree on +human anatomy, including 6502 anatomical terminologies; Then we build up the +largest and most comprehensive segmentation dataset for training, by collecting +over 22K 3D medical image scans from 72 segmentation datasets with careful +standardization on both image scans and label space; (ii) for architecture +design, we formulate a universal segmentation model, that can be prompted by +inputting medical terminologies in text form. We present knowledge-enhanced +representation learning on the combination of a large number of datasets; (iii) +for model evaluation, we train a SAT-Pro with only 447M parameters, to segment +72 different segmentation datasets with text prompt, resulting in 497 classes. +We have thoroughly evaluated the model from three aspects: averaged by body +regions, averaged by classes, and average by datasets, demonstrating comparable +performance to 72 specialist nnU-Nets, i.e., we train nnU-Net models on each +dataset/subset, resulting in 72 nnU-Nets with around 2.2B parameters for the 72 +datasets. We will release all the codes, and models in this work. + +
+
+ comment: 53 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 162 + +
+
+
+ + ☆ Lightplane: Highly-Scalable Components for Neural 3D Fields + + +
+ Contemporary 3D research, particularly in reconstruction and generation, +heavily relies on 2D images for inputs or supervision. However, current designs +for these 2D-3D mapping are memory-intensive, posing a significant bottleneck +for existing methods and hindering new applications. In response, we propose a +pair of highly scalable components for 3D neural fields: Lightplane Render and +Splatter, which significantly reduce memory usage in 2D-3D mapping. These +innovations enable the processing of vastly more and higher resolution images +with small memory and computational costs. We demonstrate their utility in +various applications, from benefiting single-scene optimization with +image-level losses to realizing a versatile pipeline for dramatically scaling +3D reconstruction and generation. Code: +\url{https://github.com/facebookresearch/lightplane}. + +
+
+ comment: Project Page: https://lightplane.github.io/ Code: + https://github.com/facebookresearch/lightplane +
+
+
+
+
+ + ☆ MotionLCM: Real-time Controllable Motion Generation via Latent + Consistency Model + + +
+ This work introduces MotionLCM, extending controllable motion generation to a +real-time level. Existing methods for spatial control in text-conditioned +motion generation suffer from significant runtime inefficiency. To address this +issue, we first propose the motion latent consistency model (MotionLCM) for +motion generation, building upon the latent diffusion model (MLD). By employing +one-step (or few-step) inference, we further improve the runtime efficiency of +the motion latent diffusion model for motion generation. To ensure effective +controllability, we incorporate a motion ControlNet within the latent space of +MotionLCM and enable explicit control signals (e.g., pelvis trajectory) in the +vanilla motion space to control the generation process directly, similar to +controlling other latent-free diffusion models for motion generation. By +employing these techniques, our approach can generate human motions with text +and control signals in real-time. Experimental results demonstrate the +remarkable generation and controlling capabilities of MotionLCM while +maintaining real-time runtime efficiency. + +
+
+ comment: MotionLCM project version 1.0 +
+
+
+
+
+ + ☆ Invisible Stitch: Generating Smooth 3D Scenes with Depth Inpainting + + +
+ 3D scene generation has quickly become a challenging new research direction, +fueled by consistent improvements of 2D generative diffusion models. Most prior +work in this area generates scenes by iteratively stitching newly generated +frames with existing geometry. These works often depend on pre-trained +monocular depth estimators to lift the generated images into 3D, fusing them +with the existing scene representation. These approaches are then often +evaluated via a text metric, measuring the similarity between the generated +images and a given text prompt. In this work, we make two fundamental +contributions to the field of 3D scene generation. First, we note that lifting +images to 3D with a monocular depth estimation model is suboptimal as it +ignores the geometry of the existing scene. We thus introduce a novel depth +completion model, trained via teacher distillation and self-training to learn +the 3D fusion process, resulting in improved geometric coherence of the scene. +Second, we introduce a new benchmarking scheme for scene generation methods +that is based on ground truth geometry, and thus measures the quality of the +structure of the scene. + +
+
+ comment: Project page: https://research.paulengstler.com/invisible-stitch/ +
+
+
+
+
+ + ☆ DOCCI: Descriptions of Connected and Contrasting Images + + +
+ Vision-language datasets are vital for both text-to-image (T2I) and +image-to-text (I2T) research. However, current datasets lack descriptions with +fine-grained detail that would allow for richer associations to be learned by +models. To fill the gap, we introduce Descriptions of Connected and Contrasting +Images (DOCCI), a dataset with long, human-annotated English descriptions for +15k images that were taken, curated and donated by a single researcher intent +on capturing key challenges such as spatial relations, counting, text +rendering, world knowledge, and more. We instruct human annotators to create +comprehensive descriptions for each image; these average 136 words in length +and are crafted to clearly distinguish each image from those that are related +or similar. Each description is highly compositional and typically encompasses +multiple challenges. Through both quantitative and qualitative analyses, we +demonstrate that DOCCI serves as an effective training resource for +image-to-text generation -- a PaLI 5B model finetuned on DOCCI shows equal or +superior results compared to highly-performant larger models like LLaVA-1.5 7B +and InstructBLIP 7B. Furthermore, we show that DOCCI is a useful testbed for +text-to-image generation, highlighting the limitations of current text-to-image +models in capturing long descriptions and fine details. + +
+
+
+
+
+ + ☆ Visual Fact Checker: Enabling High-Fidelity Detailed Caption Generation CVPR 2024 + + +
+ Existing automatic captioning methods for visual content face challenges such +as lack of detail, content hallucination, and poor instruction following. In +this work, we propose VisualFactChecker (VFC), a flexible training-free +pipeline that generates high-fidelity and detailed captions for both 2D images +and 3D objects. VFC consists of three steps: 1) proposal, where image-to-text +captioning models propose multiple initial captions; 2) verification, where a +large language model (LLM) utilizes tools such as object detection and VQA +models to fact-check proposed captions; 3) captioning, where an LLM generates +the final caption by summarizing caption proposals and the fact check +verification results. In this step, VFC can flexibly generate captions in +various styles following complex instructions. We conduct comprehensive +captioning evaluations using four metrics: 1) CLIP-Score for image-text +similarity; 2) CLIP-Image-Score for measuring the image-image similarity +between the original and the reconstructed image generated by a text-to-image +model using the caption; 3) human study on Amazon Mechanical Turk; 4) GPT-4V +for fine-grained evaluation. Evaluation results show that VFC outperforms +state-of-the-art open-sourced captioning methods for 2D images on the COCO +dataset and 3D assets on the Objaverse dataset. Our study demonstrates that by +combining open-source models into a pipeline, we can attain captioning +capability comparable to proprietary models such as GPT-4V, despite being over +10x smaller in model size. + +
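As a rough illustration of the three-step pipeline described in this abstract, the following Python sketch wires together a propose/verify/summarize loop. The callables `propose`, `verify`, and `summarize` and the toy stand-ins are hypothetical placeholders, not the paper's actual models or API.

```python
# Minimal sketch of a propose -> verify -> summarize captioning pipeline in the
# spirit of VisualFactChecker. The three callables below are placeholders for
# whatever captioner, object detector / VQA tool, and LLM one actually uses;
# their names and signatures are assumptions, not the paper's API.

from typing import Callable, List


def visual_fact_check_caption(
    image,
    propose: Callable[[object], List[str]],            # image -> candidate captions
    verify: Callable[[object, str], str],              # (image, caption) -> fact-check notes
    summarize: Callable[[List[str], List[str]], str],  # (captions, notes) -> final caption
) -> str:
    """Generate a detailed caption by proposing, fact-checking, and summarizing."""
    proposals = propose(image)                      # step 1: multiple initial captions
    checks = [verify(image, c) for c in proposals]  # step 2: tool-assisted verification
    return summarize(proposals, checks)             # step 3: LLM writes the final caption


if __name__ == "__main__":
    # Toy stand-ins so the sketch runs end-to-end without any models.
    dummy_image = object()
    caption = visual_fact_check_caption(
        dummy_image,
        propose=lambda img: ["a red car on a street", "a car parked near a tree"],
        verify=lambda img, c: f"objects mentioned in '{c}' confirmed by detector",
        summarize=lambda caps, notes: "A red car parked on a street near a tree.",
    )
    print(caption)
```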
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Quantifying Nematodes through Images: Datasets, Models, and Baselines of + Deep Learning + + +
+ Every year, plant parasitic nematodes, one of the major groups of plant +pathogens, cause a significant loss of crops worldwide. To mitigate crop yield +losses caused by nematodes, an efficient nematode monitoring method is +essential for plant and crop disease management. In addition, efficient +nematode detection contributes to medical research and drug discovery, as +nematodes are model organisms. With the rapid development of computer +technology, computer vision techniques provide a feasible solution for +quantifying nematodes or nematode infections. In this paper, we survey and +categorise the studies and available datasets on nematode detection through +deep-learning models. To stimulate progress in related research, this survey +presents the potential state-of-the-art object detection models, training +techniques, optimisation techniques, and evaluation metrics for deep learning +beginners. Moreover, seven state-of-the-art object detection models are +validated on three public datasets and the AgriNema dataset for plant parasitic +nematodes to construct a baseline for nematode detection. + +
+
+ comment: The 26th IEEE International Conference on Computational Science and + Engineering (CSE-2023) +
+
+
+
+
+ + ☆ PACER+: On-Demand Pedestrian Animation Controller in Driving Scenarios + + +
+ We address the challenge of content diversity and controllability in +pedestrian simulation for driving scenarios. Recent pedestrian animation +frameworks have a significant limitation wherein they primarily focus on either +following a trajectory [46] or the content of the reference video [57], +consequently overlooking the potential diversity of human motion within such +scenarios. This limitation restricts the ability to generate pedestrian +behaviors that exhibit a wider range of variations and realistic motions and +therefore restricts their usage in providing rich motion content for other +components in the driving simulation system, e.g., sudden motion changes to +which the autonomous vehicle should respond. In our approach, we strive to +surpass the limitation by showcasing diverse human motions obtained from +various sources, such as generated human motions, in addition to following the +given trajectory. The fundamental contribution of our framework lies in +combining the motion tracking task with trajectory following, which enables the +tracking of specific motion parts (e.g., upper body) while simultaneously +following the given trajectory by a single policy. This way, we significantly +enhance both the diversity of simulated human motion within the given scenario +and the controllability of the content, including language-based control. Our +framework facilitates the generation of a wide range of human motions, +contributing to greater realism and adaptability in pedestrian simulations for +driving scenarios. More information is on our project page +https://wangjingbo1219.github.io/papers/CVPR2024_PACER_PLUS/PACERPLUSPage.html . + +
+
+
+
+
+ + ☆ RTG-SLAM: Real-time 3D Reconstruction at Scale using Gaussian Splatting SIGGRAPH 2024 + + +
+ We propose RTG-SLAM, a real-time 3D reconstruction system with an RGBD camera +for large-scale environments using Gaussian splatting. RTG-SLAM features a +compact Gaussian representation and a highly efficient on-the-fly Gaussian +optimization scheme. We force each Gaussian to be either opaque or nearly +transparent, with the opaque ones fitting the surface and dominant colors, and +transparent ones fitting residual colors. By rendering depth in a different way +from color rendering, we let a single opaque Gaussian fit a local surface +region well without the need for multiple overlapping Gaussians, hence largely +reducing the memory and computation cost. For on-the-fly Gaussian optimization, +we explicitly add Gaussians for three types of pixels per frame: newly +observed, with large color errors and with large depth errors. We also +categorize all Gaussians into stable and unstable ones, where the stable +Gaussians are expected to fit previously observed RGBD images well, and the +remaining Gaussians are considered unstable. We only optimize the unstable Gaussians and only render the +pixels occupied by unstable Gaussians. In this way, both the number of +Gaussians to be optimized and pixels to be rendered are largely reduced, and +the optimization can be done in real time. We show real-time reconstructions of +a variety of real large scenes. Compared with the state-of-the-art NeRF-based +RGBD SLAM, our system achieves comparable high-quality reconstruction but with +around twice the speed and half the memory cost, and shows superior performance +in the realism of novel view synthesis and camera tracking accuracy. + +
+
+ comment: To be published in ACM SIGGRAPH 2024 +
+
+
+
+
+ + ☆ GS-LRM: Large Reconstruction Model for 3D Gaussian Splatting + + +
+ We propose GS-LRM, a scalable large reconstruction model that can predict +high-quality 3D Gaussian primitives from 2-4 posed sparse images in 0.23 +seconds on a single A100 GPU. Our model features a very simple transformer-based +architecture; we patchify input posed images, pass the concatenated multi-view +image tokens through a sequence of transformer blocks, and decode final +per-pixel Gaussian parameters directly from these tokens for differentiable +rendering. In contrast to previous LRMs that can only reconstruct objects, by +predicting per-pixel Gaussians, GS-LRM naturally handles scenes with large +variations in scale and complexity. We show that our model can work on both +object and scene captures by training it on Objaverse and RealEstate10K, +respectively. In both scenarios, the models outperform state-of-the-art +baselines by a wide margin. We also demonstrate applications of our model in +downstream 3D generation tasks. Our project webpage is available at: +https://sai-bi.github.io/project/gs-lrm/ . + +
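The abstract above describes a patchify-transformer-decode architecture; the following is a minimal, untrained PyTorch sketch of a model of that shape. The 9-channel input (RGB plus an assumed ray encoding), the 12-value per-pixel Gaussian parameterization, and all layer sizes are illustrative assumptions rather than the paper's configuration.

```python
# Untrained sketch of a GS-LRM-style model: patchify posed views, run tokens
# through transformer blocks, decode per-pixel Gaussian parameters.

import torch
import torch.nn as nn


class TinyGaussianLRM(nn.Module):
    def __init__(self, patch=8, dim=256, depth=4, gauss_ch=12):
        super().__init__()
        self.patch, self.gauss_ch = patch, gauss_ch
        # Patch embedding over RGB + an assumed 6D ray encoding (9 channels total).
        self.embed = nn.Conv2d(9, dim, kernel_size=patch, stride=patch)
        layer = nn.TransformerEncoderLayer(dim, nhead=8, batch_first=True)
        self.blocks = nn.TransformerEncoder(layer, num_layers=depth)
        # Each token decodes the Gaussian parameters of every pixel in its patch.
        self.head = nn.Linear(dim, patch * patch * gauss_ch)

    def forward(self, views):                              # views: (B, V, 9, H, W)
        B, V, C, H, W = views.shape
        x = self.embed(views.flatten(0, 1))                # (B*V, dim, H/p, W/p)
        tokens = x.flatten(2).transpose(1, 2)              # (B*V, N, dim)
        tokens = tokens.reshape(B, -1, tokens.shape[-1])   # concatenate multi-view tokens
        tokens = self.blocks(tokens)
        gauss = self.head(tokens)                          # (B, V*N, p*p*gauss_ch)
        Hp, Wp, p = H // self.patch, W // self.patch, self.patch
        gauss = gauss.reshape(B, V, Hp, Wp, p, p, self.gauss_ch)
        gauss = gauss.permute(0, 1, 2, 4, 3, 5, 6)         # un-patchify to pixel grid
        return gauss.reshape(B, V, H, W, self.gauss_ch)


if __name__ == "__main__":
    model = TinyGaussianLRM()
    out = model(torch.randn(1, 2, 9, 32, 32))   # two posed input views
    print(out.shape)                            # torch.Size([1, 2, 32, 32, 12])
```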
+
+ comment: Project webpage: https://sai-bi.github.io/project/gs-lrm/ +
+
+
+
+
+ + ☆ Naturally Supervised 3D Visual Grounding with Language-Regularized + Concept Learners CVPR 2024 + + +
+ 3D visual grounding is a challenging task that often requires direct and +dense supervision, notably the semantic label for each object in the scene. In +this paper, we instead study the naturally supervised setting that learns from +only 3D scene and QA pairs, where prior works underperform. We propose the +Language-Regularized Concept Learner (LARC), which uses constraints from +language as regularization to significantly improve the accuracy of +neuro-symbolic concept learners in the naturally supervised setting. Our +approach is based on two core insights: the first is that language constraints +(e.g., a word's relation to another) can serve as effective regularization for +structured representations in neuro-symbolic models; the second is that we can +query large language models to distill such constraints from language +properties. We show that LARC improves the performance of prior works in naturally +supervised 3D visual grounding, and demonstrates a wide range of 3D visual +reasoning capabilities, from zero-shot composition to data efficiency and +transferability. Our method represents a promising step towards regularizing +structured visual reasoning frameworks with language-based priors, for learning +in settings without dense supervision. + +
+
+ comment: CVPR 2024. The first two authors contributed equally +
+
+
+
+
+ + ☆ SwipeGANSpace: Swipe-to-Compare Image Generation via Efficient Latent + Space Exploration + + +
+ Generating preferred images using generative adversarial networks (GANs) is +challenging owing to the high-dimensional nature of latent space. In this +study, we propose a novel approach that uses simple user-swipe interactions to +generate preferred images for users. To effectively explore the latent space +with only swipe interactions, we apply principal component analysis to the +latent space of the StyleGAN, creating meaningful subspaces. We use a +multi-armed bandit algorithm to decide the dimensions to explore, focusing on +the preferences of the user. Experiments show that our method is more efficient +in generating preferred images than the baseline methods. Furthermore, changes +in preferred images during image generation or the display of entirely +different image styles were observed to provide new inspirations, subsequently +altering user preferences. This highlights the dynamic nature of user +preferences, which our proposed approach recognizes and enhances. + +
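A hedged sketch of the exploration loop this abstract describes: PCA directions over a latent space plus an epsilon-greedy bandit choosing which direction to adjust from binary swipe feedback. The random latents, the reward model, and the step size are stand-ins for StyleGAN latents and real users, not the paper's implementation.

```python
# PCA subspace over a GAN-like latent space + epsilon-greedy bandit driven by
# simulated swipe feedback (+1 like / -1 dislike).

import numpy as np

rng = np.random.default_rng(0)

# Stand-in "latent history": rows are latent codes (e.g., StyleGAN w vectors).
latents = rng.normal(size=(500, 64))
mean = latents.mean(axis=0)
# PCA via SVD of the centered latents; rows of vt are principal directions.
_, _, vt = np.linalg.svd(latents - mean, full_matrices=False)
directions = vt[:8]                      # explore only the top-8 subspace

n_arms = len(directions)
counts, values = np.zeros(n_arms), np.zeros(n_arms)
current = mean.copy()
epsilon, step = 0.2, 0.5

def swipe_feedback(z):
    # Hypothetical user model: prefers latents aligned with the first direction.
    return 1.0 if z @ directions[0] > current @ directions[0] else -1.0

for t in range(100):
    arm = rng.integers(n_arms) if rng.random() < epsilon else int(np.argmax(values))
    candidate = current + step * directions[arm]
    reward = swipe_feedback(candidate)                    # +1 keep, -1 reject
    counts[arm] += 1
    values[arm] += (reward - values[arm]) / counts[arm]   # incremental mean reward
    if reward > 0:
        current = candidate                               # move toward the liked image

print("most useful direction:", int(np.argmax(values)))
```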
+
+ comment: 11 pages, 13 figures +
+
+
+
+
+ + ☆ Beyond MOS: Subjective Image Quality Score Preprocessing Method Based on + Perceptual Similarity + + +
+ Image quality assessment often relies on raw opinion scores provided by +subjects in subjective experiments, which can be noisy and unreliable. To +address this issue, postprocessing procedures such as ITU-R BT.500, ITU-T +P.910, and ITU-T P.913 have been standardized to clean up the original opinion +scores. These methods use annotator-based statistical priors, but they do not +take into account extensive information about the image itself, which limits +their performance in less annotated scenarios. Generally speaking, image +quality datasets usually contain similar scenes or distortions, and it is +inevitable that subjects compare images against each other when assigning a +score. Therefore, in this paper, we propose a subjective image quality score +preprocessing method, Perceptual Similarity Subjective Preprocessing (PSP), +which exploits the perceptual similarity between images to alleviate subjective +bias in less annotated scenarios. Specifically, we model subjective scoring as +a conditional probability model based on perceptual similarity with previously +scored images, called subconscious reference scoring. The reference images are +stored in a neighbor dictionary, which is obtained by a normalized vector +dot-product based nearest-neighbor search of the images' perceptual depth +features. Then the preprocessed score is updated by the exponential moving +average (EMA) of the subconscious reference scoring, called similarity +regularized EMA. Our experiments on multiple datasets (LIVE, TID2013, CID2013) +show that this method can effectively remove the bias of the subjective scores. +Additionally, experiments show that the preprocessed dataset can substantially +improve the performance of downstream IQA tasks. + +
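The similarity-regularized EMA idea lends itself to a short sketch: each raw score is blended with a weighted average of the previously processed scores of its nearest neighbors in a perceptual feature space. The random features, neighbor count, and EMA weight below are illustrative assumptions, not the paper's settings.

```python
# Toy version of nearest-neighbor "subconscious reference" scoring followed by
# a similarity-regularized EMA update of each raw opinion score.

import numpy as np

rng = np.random.default_rng(0)
n_images, feat_dim = 200, 128

features = rng.normal(size=(n_images, feat_dim))
features /= np.linalg.norm(features, axis=1, keepdims=True)   # unit-normalized
raw_scores = rng.uniform(1, 5, size=n_images)                 # noisy raw opinion scores

alpha = 0.3          # EMA weight given to the "subconscious reference" score
k = 5                # number of nearest neighbors kept in the dictionary

processed = np.empty(n_images)
for i in range(n_images):
    if i == 0:
        processed[i] = raw_scores[i]
        continue
    # Normalized dot-product (cosine) similarity to all previously scored images.
    sims = features[:i] @ features[i]
    nn_idx = np.argsort(sims)[-k:]
    weights = np.clip(sims[nn_idx], 0, None)
    if weights.sum() == 0:
        processed[i] = raw_scores[i]
        continue
    reference = np.average(processed[nn_idx], weights=weights)     # reference scoring
    processed[i] = alpha * reference + (1 - alpha) * raw_scores[i] # regularized EMA

print(processed[:5].round(2))
```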
+
+
+
+
+ + ☆ Towards Scenario- and Capability-Driven Dataset Development and + Evaluation: An Approach in the Context of Mapless Automated Driving + + +
+ The foundational role of datasets in defining the capabilities of deep +learning models has led to their rapid proliferation. At the same time, +published research focusing on the process of dataset development for +environment perception in automated driving has been scarce, thereby reducing +the applicability of openly available datasets and impeding the development of +effective environment perception systems. Sensor-based, mapless automated +driving is one of the contexts where this limitation is evident. While +leveraging real-time sensor data instead of pre-defined HD maps promises +enhanced adaptability and safety by effectively navigating unexpected +environmental changes, it also increases the demands on the scope and +complexity of the information provided by the perception system. + To address these challenges, we propose a scenario- and capability-based +approach to dataset development. Grounded in the principles of ISO 21448 +(safety of the intended functionality, SOTIF), extended by ISO/TR 4804, our +approach facilitates the structured derivation of dataset requirements. This +not only aids in the development of meaningful new datasets but also enables +the effective comparison of existing ones. Applying this methodology to a broad +range of existing lane detection datasets, we identify significant limitations +in current datasets, particularly in terms of real-world applicability, a lack +of labeling of critical features, and an absence of comprehensive information +for complex driving maneuvers. + +
+
+ comment: Accepted to be published at the 2024 35th IEEE Intelligent Vehicles + Symposium (IV), Jeju Island, Korea, June 2 - 5, 2024 +
+
+
+
+
+ + ☆ Masked Multi-Query Slot Attention for Unsupervised Object Discovery IJCNN 2024 + + +
+ Unsupervised object discovery is becoming an essential line of research for +tackling recognition problems that require decomposing an image into entities, +such as semantic segmentation and object detection. Recently, object-centric +methods that leverage self-supervision have gained popularity, due to their +simplicity and adaptability to different settings and conditions. However, +those methods do not exploit effective techniques already employed in modern +self-supervised approaches. In this work, we consider an object-centric +approach in which DINO ViT features are reconstructed via a set of queried +representations called slots. Based on that, we propose a masking scheme on +input features that selectively disregards the background regions, inducing our +model to focus more on salient objects during the reconstruction phase. +Moreover, we extend the slot attention to a multi-query approach, allowing the +model to learn multiple sets of slots, producing more stable masks. During +training, these multiple sets of slots are learned independently while, at test +time, these sets are merged through Hungarian matching to obtain the final +slots. Our experimental results and ablations on the PASCAL-VOC 2012 dataset +show the importance of each component and highlight how their combination +consistently improves object localization. Our source code is available at: +https://github.com/rishavpramanik/maskedmultiqueryslot + +
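The test-time merging step in this abstract can be illustrated compactly: independently learned slot sets are aligned with Hungarian matching and averaged. The random slot values below stand in for slots produced by slot attention over DINO ViT features; the set sizes are assumptions.

```python
# Merge several independently learned sets of slots via Hungarian matching.

import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
n_sets, n_slots, dim = 3, 6, 32
slot_sets = rng.normal(size=(n_sets, n_slots, dim))

# Use the first set as the reference ordering and match every other set onto it.
merged = slot_sets[0].copy()
for s in range(1, n_sets):
    # Cost = pairwise L2 distance between reference slots and this set's slots.
    cost = np.linalg.norm(merged[:, None, :] - slot_sets[s][None, :, :], axis=-1)
    row, col = linear_sum_assignment(cost)          # optimal one-to-one assignment
    # Running average of matched slots keeps one stable slot per object.
    merged[row] = (merged[row] * s + slot_sets[s][col]) / (s + 1)

print(merged.shape)   # (6, 32): a single merged set of slots
```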
+
+ comment: Paper accepted for presentation at IJCNN 2024 +
+
+
+
+
+ + ☆ VimTS: A Unified Video and Image Text Spotter for Enhancing the + Cross-domain Generalization + + +
+ Text spotting, a task involving the extraction of textual information from +image or video sequences, faces challenges in cross-domain adaptation, such as +image-to-image and image-to-video generalization. In this paper, we introduce a +new method, termed VimTS, which enhances the generalization ability of the +model by achieving better synergy among different tasks. Specifically, we propose +a Prompt Queries Generation Module and a Tasks-aware Adapter to effectively +convert the original single-task model into a multi-task model suitable for +both image and video scenarios with minimal additional parameters. The Prompt +Queries Generation Module facilitates explicit interaction between different +tasks, while the Tasks-aware Adapter helps the model dynamically learn suitable +features for each task. Additionally, to further enable the model to learn +temporal information at a lower cost, we propose a synthetic video text dataset +(VTD-368k) by leveraging the Content Deformation Fields (CoDeF) algorithm. +Notably, our method outperforms the state-of-the-art method by an average of +2.6% in six cross-domain benchmarks such as TT-to-IC15, CTW1500-to-TT, and +TT-to-CTW1500. For video-level cross-domain adaptation, our method even surpasses +the previous end-to-end video spotting method in ICDAR2015 video and DSText v2 +by an average of 5.5% on the MOTA metric, using only image-level data. We +further demonstrate that existing Large Multimodal Models exhibit limitations +in cross-domain scene text spotting, in contrast to our VimTS model +which requires significantly fewer parameters and data. The code and datasets +will be made available at https://VimTextSpotter.github.io. + +
+
+
+
+
+ + ☆ Provably Robust Conformal Prediction with Improved Efficiency + + +
+ Conformal prediction is a powerful tool to generate uncertainty sets with +guaranteed coverage using any predictive model, under the assumption that the +training and test data are i.i.d. Recently, it has been shown that adversarial +examples are able to manipulate conformal methods to construct prediction sets +with invalid coverage rates, as the i.i.d. assumption is violated. To address +this issue, a recent work, Randomized Smoothed Conformal Prediction (RSCP), was +first proposed to certify the robustness of conformal prediction methods to +adversarial noise. However, RSCP has two major limitations: (i) its robustness +guarantee is flawed when used in practice and (ii) it tends to produce large +uncertainty sets. To address these limitations, we first propose a novel +framework called RSCP+ to provide a provable robustness guarantee in evaluation, +which fixes the issues in the original RSCP method. Next, we propose two novel +methods, Post-Training Transformation (PTT) and Robust Conformal Training +(RCT), to effectively reduce prediction set size with little computation +overhead. Experimental results on CIFAR10, CIFAR100, and ImageNet suggest that the +baseline method only yields trivial predictions that include the full label set, while +our methods could boost the efficiency by up to $4.36\times$, $5.46\times$, and +$16.9\times$, respectively, and provide a practical robustness guarantee. Our codes +are available at +https://github.com/Trustworthy-ML-Lab/Provably-Robust-Conformal-Prediction. + +
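For readers unfamiliar with the underlying machinery, here is a minimal sketch of plain split conformal prediction (the base tool this work hardens), not the RSCP+ certificate itself; the synthetic probabilities and the 90% coverage target are illustrative assumptions.

```python
# Split conformal prediction: calibrate a score threshold on held-out data,
# then return the label set whose nonconformity scores fall under it.

import numpy as np

rng = np.random.default_rng(0)
n_cal, n_classes, alpha = 500, 10, 0.1        # target 90% coverage

# Synthetic calibration set: softmax-like probabilities and true labels.
probs = rng.dirichlet(np.ones(n_classes), size=n_cal)
labels = np.array([rng.choice(n_classes, p=p) for p in probs])

# Nonconformity score: 1 - probability assigned to the true label.
scores = 1.0 - probs[np.arange(n_cal), labels]

# Conformal quantile with the standard finite-sample correction.
q_level = np.ceil((n_cal + 1) * (1 - alpha)) / n_cal
qhat = np.quantile(scores, q_level)

def prediction_set(p):
    """All labels whose nonconformity score is below the calibrated threshold."""
    return np.where(1.0 - p <= qhat)[0]

test_probs = rng.dirichlet(np.ones(n_classes))
print("prediction set:", prediction_set(test_probs))
```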
+
+
+
+
+ + ☆ MetaCoCo: A New Few-Shot Classification Benchmark with Spurious + Correlation ICLR 24 + + +
+ Out-of-distribution (OOD) problems in few-shot classification (FSC) occur +when novel classes sampled from testing distributions differ from base classes +drawn from training distributions, which considerably degrades the performance +of deep learning models deployed in real-world applications. Recent studies +suggest that the OOD problems in FSC mainly include: (a) cross-domain +few-shot classification (CD-FSC) and (b) spurious-correlation few-shot +classification (SC-FSC). Specifically, CD-FSC occurs when a classifier learns +to transfer knowledge from base classes drawn from seen training distributions +but recognizes novel classes sampled from unseen testing distributions. In +contrast, SC-FSC arises when a classifier relies on non-causal features (or +contexts) that happen to be correlated with the labels (or concepts) in base +classes but such relationships no longer hold during model deployment. +Although CD-FSC has been extensively studied, SC-FSC remains understudied due to +the lack of corresponding evaluation benchmarks. To this end, we present Meta +Concept Context (MetaCoCo), a benchmark with spurious-correlation shifts +collected from real-world scenarios. Moreover, to quantify the extent of +spurious-correlation shifts of the presented MetaCoCo, we further propose a +metric by using CLIP as a pre-trained vision-language model. Extensive +experiments on the proposed benchmark are performed to evaluate the +state-of-the-art methods in FSC, cross-domain shifts, and self-supervised +learning. The experimental results show that the performance of the existing +methods degrades significantly in the presence of spurious-correlation shifts. +We open-source all codes of our benchmark and hope that the proposed MetaCoCo +can facilitate future research on spurious-correlation shift problems in FSC. +The code is available at: https://github.com/remiMZ/MetaCoCo-ICLR24. + +
+
+ comment: ICLR 24 +
+
+
+
+
+ + ☆ ESP-Zero: Unsupervised enhancement of zero-shot classification for + Extremely Sparse Point cloud + + +
+ In recent years, zero-shot learning has attracted the focus of many +researchers, due to its flexibility and generality. Many approaches have been +proposed to achieve the zero-shot classification of the point clouds for 3D +object understanding, following the schema of CLIP. However, in the real world, +the point clouds could be extremely sparse, dramatically limiting the +effectiveness of the 3D point cloud encoders, and resulting in the misalignment +of point cloud features and text embeddings. To enable the point cloud encoders to fit +extremely sparse point clouds without re-running the pre-training procedure, +which could be time-consuming and expensive, in this work we propose an +unsupervised model adaptation approach to enhance the point cloud encoder for +extremely sparse point clouds. We propose a novel fused-cross attention +layer that expands the pre-trained self-attention layer with additional +learnable tokens and attention blocks, which effectively modifies the point +cloud features while maintaining the alignment between point cloud features and +text embeddings. We also propose a complementary learning-based +self-distillation schema that encourages the modified features to be pulled +apart from the irrelevant text embeddings without overfitting the feature space +to the observed text embeddings. Extensive experiments demonstrate that the +proposed approach effectively increases the zero-shot capability on extremely +sparse point clouds, and outperforms other state-of-the-art model adaptation +approaches. + +
+
+
+
+
+ + ☆ Fake it to make it: Using synthetic data to remedy the data shortage in + joint multimodal speech-and-gesture synthesis CVPR 2024 + + +
+ Although humans engaged in face-to-face conversation simultaneously +communicate both verbally and non-verbally, methods for joint and unified +synthesis of speech audio and co-speech 3D gesture motion from text are a new +and emerging field. These technologies hold great promise for more human-like, +efficient, expressive, and robust synthetic communication, but are currently +held back by the lack of suitably large datasets, as existing methods are +trained on parallel data from all constituent modalities. Inspired by +student-teacher methods, we propose a straightforward solution to the data +shortage, by simply synthesising additional training material. Specifically, we +use unimodal synthesis models trained on large datasets to create multimodal +(but synthetic) parallel training data, and then pre-train a joint synthesis +model on that material. In addition, we propose a new synthesis architecture +that adds better and more controllable prosody modelling to the +state-of-the-art method in the field. Our results confirm that pre-training on +large amounts of synthetic data improves the quality of both the speech and the +motion synthesised by the multimodal model, with the proposed architecture +yielding further benefits when pre-trained on the synthetic data. See +https://shivammehta25.github.io/MAGI/ for example output. + +
+
+ comment: 13+1 pages, 2 figures, accepted at the Human Motion Generation + workshop (HuMoGen) at CVPR 2024 +
+
+
+
+
+ + ☆ SemiPL: A Semi-supervised Method for Event Sound Source Localization + + +
+ In recent years, Event Sound Source Localization has been widely applied in +various fields. Recent works, typically relying on the contrastive learning +framework, show impressive performance. However, existing work is based on large but +relatively simple datasets. It is also crucial to understand and analyze human +behaviors (actions and interactions of people), voices, and sounds in chaotic +events in many applications, e.g., crowd management and emergency response +services. In this paper, we apply the existing model to a more complex dataset, +explore the influence of parameters on the model, and propose a semi-supervised +improvement method SemiPL. With the increase in data quantity and the influence +of label quality, self-supervised learning will be an unstoppable trend. The +experiment shows that the parameter adjustment will positively affect the +existing model. In particular, SSPL achieved an improvement of 12.2% cIoU and +0.56% AUC in Chaotic World compared to the results provided. The code is +available at: https://github.com/ly245422/SSPL + +
+
+
+
+
+ + ☆ Seeing Through the Clouds: Cloud Gap Imputation with Prithvi Foundation + Model + + +
+ Filling cloudy pixels in multispectral satellite imagery is essential for +accurate data analysis and downstream applications, especially for tasks which +require time series data. To address this issue, we compare the performance of +a foundational Vision Transformer (ViT) model with a baseline Conditional +Generative Adversarial Network (CGAN) model for missing value imputation in +time series of multispectral satellite imagery. We randomly mask time series of +satellite images using real-world cloud masks and train each model to +reconstruct the missing pixels. The ViT model is fine-tuned from a pretrained +model, while the CGAN is trained from scratch. Using quantitative evaluation +metrics such as structural similarity index and mean absolute error as well as +qualitative visual analysis, we assess imputation accuracy and contextual +preservation. + +
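A small sketch of the masked-reconstruction training setup this abstract describes, with a tiny convolutional stand-in for the ViT/CGAN models compared in the paper, synthetic cloud masks, and a loss restricted to masked pixels; the shapes and band count are assumptions.

```python
# Cloud-gap imputation toy: mask a time series of multispectral images with
# synthetic "cloud" masks and train a model to reconstruct the masked pixels.

import torch
import torch.nn as nn

torch.manual_seed(0)
B, T, C, H, W = 2, 4, 6, 32, 32           # batch, time steps, spectral bands, H, W

images = torch.rand(B, T, C, H, W)
# Synthetic cloud masks: 1 = cloudy (to be imputed), 0 = clear.
cloud = (torch.rand(B, T, 1, H, W) < 0.3).float()
masked_input = images * (1 - cloud)       # cloudy pixels zeroed out

# Stand-in imputation model operating on the stacked time-band channels.
model = nn.Sequential(
    nn.Conv2d(T * C, 64, 3, padding=1), nn.ReLU(),
    nn.Conv2d(64, T * C, 3, padding=1),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(5):                     # a few illustrative training steps
    pred = model(masked_input.flatten(1, 2)).reshape(B, T, C, H, W)
    # Mean absolute error restricted to the cloud-masked pixels.
    loss = ((pred - images).abs() * cloud).sum() / (cloud.sum() * C).clamp(min=1)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"step {step}: masked MAE = {loss.item():.4f}")
```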
+
+
+
+
+ + ☆ Data-Driven Invertible Neural Surrogates of Atmospheric Transmission + + +
+ We present a framework for inferring an atmospheric transmission profile from +a spectral scene. This framework leverages a lightweight, physics-based +simulator that is automatically tuned - by virtue of autodifferentiation and +differentiable programming - to construct a surrogate atmospheric profile to +model the observed data. We demonstrate the utility of the methodology by (i) +performing atmospheric correction, (ii) recasting spectral data between various +modalities (e.g. radiance and reflectance at the surface and at the sensor), +and (iii) inferring atmospheric transmission profiles, such as absorbing bands +and their relative magnitudes. + +
+
+ comment: Manuscript accepted for presentation and publication at the 2024 IEEE + International Geoscience and Remote Sensing Symposium (IGARSS) +
+
+
+
+
+ + ☆ X-Diffusion: Generating Detailed 3D MRI Volumes From a Single Image + Using Cross-Sectional Diffusion Models + + +
+ In this work, we present X-Diffusion, a cross-sectional diffusion model +tailored for Magnetic Resonance Imaging (MRI) data. X-Diffusion is capable of +generating the entire MRI volume from just a single MRI slice or, optionally, +from a few slices, setting new benchmarks in the precision of +synthesized MRIs from extremely sparse observations. The uniqueness lies in the +novel view-conditional training and inference of X-Diffusion on MRI volumes, +allowing for generalized MRI learning. Our evaluations span both brain tumour +MRIs from the BRATS dataset and full-body MRIs from the UK Biobank dataset. +Utilizing the paired pre-registered Dual-energy X-ray Absorptiometry (DXA) and +MRI modalities in the UK Biobank dataset, X-Diffusion is able to generate a +detailed 3D MRI volume from a single full-body DXA. Remarkably, the resultant +MRIs not only stand out in precision on unseen examples (surpassing +state-of-the-art results by large margins) but also flawlessly retain essential +features of the original MRI, including tumour profiles, spine curvature, brain +volume, and beyond. Furthermore, the X-Diffusion model trained on the MRI +datasets attains out-of-domain generalization capacity (e.g. generating knee +MRIs even though it is trained on brains). The code is available on the project +website https://emmanuelleb985.github.io/XDiffusion/ . + +
+
+ comment: preprint, project website: + https://emmanuelleb985.github.io/XDiffusion/ +
+
+
+
+
+ + ☆ Artificial Intelligence in Bone Metastasis Analysis: Current + Advancements, Opportunities and Challenges + + +
+ In recent years, Artificial Intelligence (AI) has been widely used in +medicine, particularly in the analysis of medical imaging, which has been +driven by advances in computer vision and deep learning methods. This is +particularly important in overcoming the challenges posed by diseases such as +Bone Metastases (BM), a common and complex malignancy of the bones. Indeed, +there has been increasing interest in applying Machine Learning (ML) +techniques to oncologic imaging for BM analysis. In order to provide a +comprehensive overview of the current state-of-the-art and advancements for BM +analysis using artificial intelligence, this review is conducted in +accordance with PRISMA guidelines. Firstly, this review highlights the clinical +and oncologic perspectives of BM and the medical imaging modalities used, +discussing their advantages and limitations. Then the review focuses on modern +approaches, considering the main BM analysis tasks, which include +classification, detection, and segmentation. The analysis of the results shows that ML +technologies can achieve promising performance for BM analysis and have +significant potential to improve clinician efficiency and cope with time and +cost limitations. Furthermore, further research is required to +validate the clinical performance of ML tools and facilitate their integration +into routine clinical practice. + +
+
+
+
+
+ + ☆ Perceptual Constancy Constrained Single Opinion Score Calibration for + Image Quality Assessment + + +
+ In this paper, we propose a highly efficient method to estimate an image's +mean opinion score (MOS) from a single opinion score (SOS). Assuming that each +SOS is the observed sample of a normal distribution and the MOS is its unknown +expectation, the MOS inference is formulated as a maximum likelihood estimation +problem, where the perceptual correlation of pairwise images is considered in +modeling the likelihood of SOS. More specifically, by means of the +quality-aware representations learned from the self-supervised backbone, we +introduce a learnable relative quality measure to predict the MOS difference +between two images. Then, the current image's maximum likelihood estimation +towards MOS is represented by the sum of another reference image's estimated +MOS and their relative quality. Ideally, no matter which image is selected as +the reference, the MOS of the current image should remain unchanged, which is +termed perceptual constancy constrained calibration (PC3). Finally, we +alternately optimize the relative quality measure's parameters and the current +image's estimated MOS via backpropagation and Newton's method, respectively. +Experiments show that the proposed method is efficient in calibrating the +biased SOS and significantly improves IQA model learning when only SOSs are +available. + +
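The perceptual-constancy idea can be sketched numerically: estimate each image's MOS as a reference image's estimate plus a learned relative-quality difference, and alternate between fitting the difference model and re-estimating the scores. The least-squares fit, random features, and blending weight below are simplifications of the paper's backpropagation/Newton scheme, not its implementation.

```python
# Alternating calibration toy: fit a linear relative-quality model on score
# differences, then re-estimate each MOS from reference images plus predicted
# differences, blended with the observed single opinion score.

import numpy as np

rng = np.random.default_rng(0)
n, d = 100, 16
feats = rng.normal(size=(n, d))                 # quality-aware representations
true_mos = feats @ rng.normal(size=d)           # hidden ground truth
sos = true_mos + rng.normal(scale=0.5, size=n)  # single noisy opinion scores

w = np.zeros(d)                                 # relative-quality model: (f_i - f_j) @ w
est = sos.copy()                                # current MOS estimates, init at SOS

for it in range(50):
    # Step 1: fit w so predicted differences match current MOS differences.
    i = rng.integers(n, size=256)
    j = rng.integers(n, size=256)
    X = feats[i] - feats[j]
    y = est[i] - est[j]
    w = np.linalg.lstsq(X, y, rcond=None)[0]
    # Step 2: re-estimate each MOS as the average of (reference MOS + predicted
    # difference), blended with its own SOS observation.
    refs = rng.integers(n, size=(n, 8))
    pred = est[refs] + (feats[:, None, :] - feats[refs]) @ w
    est = 0.5 * sos + 0.5 * pred.mean(axis=1)

print("corr with ground truth:", np.corrcoef(est, true_mos)[0, 1].round(3))
```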
+
+
+
+
+ + ☆ AI techniques for near real-time monitoring of contaminants in coastal + waters on board future Phisat-2 mission + + +
+ In contrast to conventional procedures, the proposed solution advocates for +a groundbreaking paradigm in water quality monitoring through the integration +of satellite Remote Sensing (RS) data, Artificial Intelligence (AI) techniques, +and onboard processing. The objective is to offer nearly real-time detection of +contaminants in coastal waters, addressing a significant gap in the existing +literature. Moreover, the expected outcomes include substantial advancements in +environmental monitoring, public health protection, and resource conservation. +The specific focus of our study is on the estimation of Turbidity and pH +parameters, given their implications for human and aquatic health. Nevertheless, +the designed framework can be extended to include other parameters of interest +in the water environment and beyond. Originating from our participation in the +European Space Agency (ESA) OrbitalAI Challenge, this article describes the +distinctive opportunities and issues for contaminant monitoring on the +Phisat-2 mission. The specific characteristics of this mission and the tools +made available will be presented, along with the methodology proposed by the authors +for the onboard monitoring of water contaminants in near real-time. Promising +preliminary results are discussed, and in-progress and future work is introduced. + +
+
+ comment: 11 pages, 9 figures, submitted to IEEE JSTARS +
+
+
+
+
+ + ☆ Automatic Cardiac Pathology Recognition in Echocardiography Images Using + Higher Order Dynamic Mode Decomposition and a Vision Transformer for Small + Datasets + + +
+ Heart diseases are the leading cause of death worldwide. +According to the WHO, nearly 18 million people die each year from +heart disease. Considering also the increase in medical data, much pressure is +put on the health industry to develop systems for early and accurate heart +disease recognition. In this work, an automatic cardiac pathology recognition +system based on a novel deep learning framework is proposed, which analyses +echocardiography video sequences in real time. The system works in two stages. The +first one transforms the data included in a database of echocardiography +sequences into a machine-learning-compatible collection of annotated images +which can be used in the training stage of any kind of machine learning-based +framework, and more specifically with deep learning. This includes the use of +the Higher Order Dynamic Mode Decomposition (HODMD) algorithm, for the first +time to the authors' knowledge, for both data augmentation and feature +extraction in the medical field. The second stage is focused on building and +training a Vision Transformer (ViT), barely explored in the related literature. +The ViT is adapted for effective training from scratch, even with small +datasets. The designed neural network analyses images from an echocardiography +sequence to predict the heart state. The results obtained show the superiority +of the proposed system and the efficacy of the HODMD algorithm, even +outperforming pretrained Convolutional Neural Networks (CNNs), which are so far +the method of choice in the literature. + +
+
+
+
+
+ + ☆ A Spatio-Temporal based Frame Indexing Algorithm for QoS Improvement in + Live Low-Motion Video Streaming + + +
+ Real-time live video streaming of events over a network continues to gain +popularity. However, there is a need to ensure the judicious utilization of the +allocated bandwidth without compromising the Quality of Service (QoS) of the +system. In this regard, this paper presents an approach based on +spatio-temporal frame indexing that detects and eliminates redundancy within +and across captured frames, prior to transmission from the server to clients. +The standard and local low motion videos were the two scenarios considered in +evaluating the performance of the proposed algorithm. Results obtained showed +that the proposed approach achieved improvements of 5.13% and 15.8%, and 5% and +15.6%, in terms of buffer size and compression ratio. This comes with a +tradeoff in frame-build time, where both the standard and local frame indexing +outperform the proposed scheme by 10.8% and 8.71%, respectively. + +
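A toy sketch of the redundancy-elimination idea: frames whose difference to the last transmitted frame falls below a threshold are skipped before transmission. The synthetic low-motion video and the threshold are assumptions; the paper's spatio-temporal indexing scheme is more elaborate than this plain frame-difference test.

```python
# Drop near-duplicate frames from a low-motion video prior to transmission.

import numpy as np

rng = np.random.default_rng(0)

def make_low_motion_video(n_frames=60, h=72, w=96):
    base = rng.integers(0, 256, size=(h, w), dtype=np.uint8)
    frames = []
    for t in range(n_frames):
        frame = base.copy()
        if t % 10 == 0:                       # occasional small change
            frame[:8, :8] = rng.integers(0, 256, size=(8, 8))
            base = frame
        frames.append(frame)
    return frames

def select_frames(frames, threshold=1.0):
    """Keep a frame only if its mean absolute difference to the last kept
    frame exceeds the threshold."""
    kept, indices = [frames[0]], [0]
    for t, frame in enumerate(frames[1:], start=1):
        mad = np.abs(frame.astype(int) - kept[-1].astype(int)).mean()
        if mad > threshold:
            kept.append(frame)
            indices.append(t)
    return kept, indices

video = make_low_motion_video()
kept, idx = select_frames(video)
print(f"transmitted {len(kept)} of {len(video)} frames; kept indices: {idx}")
```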
+
+
+
+
+ + ☆ Enhancing Deep Learning Model Explainability in Brain Tumor Datasets + using Post-Heuristic Approaches + + +
+ The application of deep learning models in medical diagnosis has showcased +considerable efficacy in recent years. Nevertheless, a notable limitation +involves the inherent lack of explainability during decision-making processes. +This study addresses this constraint by enhancing interpretability +robustness. The primary focus is directed towards refining the explanations +generated by the LIME Library and LIME image explainer. This is achieved +through post-processing mechanisms based on scenario-specific rules. Multiple +experiments have been conducted using publicly accessible datasets related to +brain tumor detection. Our proposed post-heuristic approach demonstrates +significant advancements, yielding more robust and concrete results, in the +context of medical diagnosis. + +
+
+
+
+
+ + ☆ Causal Perception Inspired Representation Learning for Trustworthy Image + Quality Assessment + + +
+ Despite great success in modeling visual perception, deep neural network +based image quality assessment (IQA) still remains unreliable in real-world +applications due to its vulnerability to adversarial perturbations and the +inexplicit black-box structure. In this paper, we propose to build a +trustworthy IQA model via Causal Perception inspired Representation Learning +(CPRL), and a score reflection attack method for IQA models. More specifically, +we assume that each image is composed of Causal Perception Representation (CPR) +and non-causal perception representation (N-CPR). CPR serves as the causation +of the subjective quality label, which is invariant to the imperceptible +adversarial perturbations. Conversely, N-CPR presents spurious associations with +the subjective quality label, which may significantly change with the +adversarial perturbations. To extract the CPR from each input image, we develop +a soft ranking based channel-wise activation function to mediate the causally +sufficient (beneficial for high prediction accuracy) and necessary (beneficial +for high robustness) deep features, and employ an intervention-based minimax +game for optimization. Experiments on four benchmark databases show that the +proposed CPRL method outperforms many state-of-the-art adversarial defense +methods and provides explicit model interpretation. + +
+
+
+
+
+ + ☆ One-Stage Open-Vocabulary Temporal Action Detection Leveraging Temporal + Multi-scale and Action Label Features + + +
+ Open-vocabulary Temporal Action Detection (Open-vocab TAD) is an advanced +video analysis approach that expands Closed-vocabulary Temporal Action +Detection (Closed-vocab TAD) capabilities. Closed-vocab TAD is typically +confined to localizing and classifying actions based on a predefined set of +categories. In contrast, Open-vocab TAD goes further and is not limited to +these predefined categories. This is particularly useful in real-world +scenarios where the variety of actions in videos can be vast and not always +predictable. The prevalent methods in Open-vocab TAD typically employ a 2-stage +approach, which involves generating action proposals and then identifying those +actions. However, errors made during the first stage can adversely affect the +subsequent action identification accuracy. Additionally, existing studies face +challenges in handling actions of different durations owing to the use of fixed +temporal processing methods. Therefore, we propose a 1-stage approach +consisting of two primary modules: Multi-scale Video Analysis (MVA) and +Video-Text Alignment (VTA). The MVA module captures actions at varying temporal +resolutions, overcoming the challenge of detecting actions with diverse +durations. The VTA module leverages the synergy between visual and textual +modalities to precisely align video segments with corresponding action labels, +a critical step for accurate action identification in Open-vocab scenarios. +Evaluations on the widely recognized THUMOS14 and ActivityNet-1.3 datasets showed +that the proposed method achieved superior results compared to the other +methods in both Open-vocab and Closed-vocab settings. This serves as a strong +demonstration of the effectiveness of the proposed method in the TAD task. + +
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ☆ Ultra Inertial Poser: Scalable Motion Capture and Tracking from Sparse + Inertial Sensors and Ultra-Wideband Ranging SIGGRAPH 2024 + + +
+ While camera-based capture systems remain the gold standard for recording +human motion, learning-based tracking systems based on sparse wearable sensors +are gaining popularity. Most commonly, they use inertial sensors, whose +propensity for drift and jitter has so far limited tracking accuracy. In this +paper, we propose Ultra Inertial Poser, a novel 3D full body pose estimation +method that constrains drift and jitter in inertial tracking via inter-sensor +distances. We estimate these distances across sparse sensor setups using a +lightweight embedded tracker that augments inexpensive off-the-shelf 6D +inertial measurement units with ultra-wideband radio-based +ranging$-$dynamically and without the need for stationary reference anchors. +Our method then fuses these inter-sensor distances with the 3D states estimated +from each sensor. Our graph-based machine learning model processes the 3D states +and distances to estimate a person's 3D full body pose and translation. To +train our model, we synthesize inertial measurements and distance estimates +from the motion capture database AMASS. For evaluation, we contribute a novel +motion dataset of 10 participants who performed 25 motion types, captured by 6 +wearable IMU+UWB trackers and an optical motion capture system, totaling 200 +minutes of synchronized sensor data (UIP-DB). Our extensive experiments show +state-of-the-art performance for our method over PIP and TIP, reducing position +error from $13.62$ to $10.65cm$ ($22\%$ better) and lowering jitter from $1.56$ +to $0.055km/s^3$ (a reduction of $97\%$). + +
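A small sketch of how the fused input could be assembled: per-sensor 3D states as graph node features and noisy UWB-style ranges as edge features. The six-sensor layout and noise level are illustrative assumptions; the real system estimates states from IMUs and feeds the resulting graph to a learned pose model.

```python
# Build a toy sensor graph: node features = per-sensor 3D states,
# edge features = simulated ultra-wideband inter-sensor ranges.

import numpy as np

rng = np.random.default_rng(0)
n_sensors = 6

# Stand-in per-sensor state: 3D position + 3D orientation (as a flat vector).
positions = rng.normal(scale=0.5, size=(n_sensors, 3))
orientations = rng.normal(size=(n_sensors, 3))
node_features = np.concatenate([positions, orientations], axis=1)   # (6, 6)

# Simulated UWB ranging: true pairwise distance plus measurement noise.
diff = positions[:, None, :] - positions[None, :, :]
true_dist = np.linalg.norm(diff, axis=-1)
uwb_dist = true_dist + rng.normal(scale=0.02, size=true_dist.shape)
np.fill_diagonal(uwb_dist, 0.0)

# Fully connected edge list with the measured range as the edge feature.
edges = [(i, j, uwb_dist[i, j])
         for i in range(n_sensors) for j in range(n_sensors) if i != j]

print("node feature matrix:", node_features.shape)   # (6, 6)
print("example edge (i, j, range):", edges[0])
```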
+
+ comment: Accepted by SIGGRAPH 2024, Code: + https://github.com/eth-siplab/UltraInertialPoser +
+
+
+
+
+ + ☆ MIPI 2024 Challenge on Nighttime Flare Removal: Methods and Results CVPR 2024 + + +
+ The increasing demand for computational photography and imaging on mobile +platforms has led to the widespread development and integration of advanced +image sensors with novel algorithms in camera systems. However, the scarcity of +high-quality data for research and the rare opportunity for in-depth exchange +of views from industry and academia constrain the development of mobile +intelligent photography and imaging (MIPI). Building on the achievements of the +previous MIPI Workshops held at ECCV 2022 and CVPR 2023, we introduce our third +MIPI challenge including three tracks focusing on novel image sensors and +imaging algorithms. In this paper, we summarize and review the Nighttime Flare +Removal track on MIPI 2024. In total, 170 participants were successfully +registered, and 14 teams submitted results in the final testing phase. The +developed solutions in this challenge achieved state-of-the-art performance on +Nighttime Flare Removal. More details of this challenge and the link to the +dataset can be found at https://mipi-challenge.org/MIPI2024/. + +
+
+ comment: CVPR 2024 Mobile Intelligent Photography and Imaging (MIPI) + Workshop--Nighttime Flare Removal Challenge Report. Website: + https://mipi-challenge.org/MIPI2024/ +
+
+
+
+
+ + ☆ MoST: Multi-modality Scene Tokenization for Motion Prediction CVPR 2024 + + +
+ Many existing motion prediction approaches rely on symbolic perception +outputs to generate agent trajectories, such as bounding boxes, road graph +information and traffic lights. This symbolic representation is a high-level +abstraction of the real world, which may render the motion prediction model +vulnerable to perception errors (e.g., failures in detecting open-vocabulary +obstacles) while missing salient information from the scene context (e.g., poor +road conditions). An alternative paradigm is end-to-end learning from raw +sensors. However, this approach suffers from the lack of interpretability and +requires significantly more training resources. In this work, we propose +tokenizing the visual world into a compact set of scene elements and then +leveraging pre-trained image foundation models and LiDAR neural networks to +encode all the scene elements in an open-vocabulary manner. The image +foundation model enables our scene tokens to encode the general knowledge of +the open world while the LiDAR neural network encodes geometry information. Our +proposed representation can efficiently encode the multi-frame multi-modality +observations with a few hundred tokens and is compatible with most +transformer-based architectures. To evaluate our method, we have augmented +Waymo Open Motion Dataset with camera embeddings. Experiments over Waymo Open +Motion Dataset show that our approach leads to significant performance +improvements over the state-of-the-art. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MicroDreamer: Zero-shot 3D Generation in $\sim$20 Seconds by Score-based + Iterative Reconstruction + + +
+ Optimization-based approaches, such as score distillation sampling (SDS), +show promise in zero-shot 3D generation but suffer from low efficiency, +primarily due to the high number of function evaluations (NFEs) required for +each sample. In this paper, we introduce score-based iterative reconstruction +(SIR), an efficient and general algorithm for 3D generation with a multi-view +score-based diffusion model. Given the images produced by the diffusion model, +SIR reduces NFEs by repeatedly optimizing 3D parameters, unlike the single +optimization in SDS, mimicking the 3D reconstruction process. With other +improvements including optimization in the pixel space, we present an efficient +approach called MicroDreamer that generally applies to various 3D +representations and 3D generation tasks. In particular, while retaining comparable +performance, MicroDreamer is 5-20 times faster than SDS in generating neural +radiance fields and takes about 20 seconds to generate meshes from 3D Gaussian +splatting on a single A100 GPU, halving the time of the fastest zero-shot +baseline, DreamGaussian. Our code is available at +https://github.com/ML-GSAI/MicroDreamer. + +
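The NFE-saving structure of SIR can be sketched as a two-level loop: the (stand-in) diffusion model refreshes multi-view targets only occasionally, while the 3D parameters take many cheap reconstruction steps against those fixed targets. The voxel grid, axis-sum "renderer", and fake refinement model below are toy assumptions, not the paper's pipeline.

```python
# Iterative-reconstruction toy: few expensive "diffusion" refreshes, many
# cheap 3D optimization steps against fixed multi-view targets.

import torch

torch.manual_seed(0)
res = 16
voxels = torch.zeros(res, res, res, requires_grad=True)    # toy 3D representation
target_voxels = torch.rand(res, res, res)                  # pretend "true" object

def render(vox):
    # Toy differentiable renderer: orthographic sums along the three axes.
    return torch.stack([vox.sum(dim=d) for d in range(3)])  # 3 "views"

def diffusion_refine(current_views):
    # Stand-in for the multi-view diffusion model: nudges current renders
    # toward the views of the hidden target.
    with torch.no_grad():
        return 0.5 * current_views + 0.5 * render(target_voxels)

optimizer = torch.optim.Adam([voxels], lr=0.05)
for outer in range(10):                       # few expensive diffusion calls
    targets = diffusion_refine(render(voxels).detach())
    for inner in range(20):                   # many cheap reconstruction steps
        loss = (render(voxels) - targets).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"outer {outer}: loss {loss.item():.4f}")
```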
+
+
+
+
+ + ☆ A Smartphone-Based Method for Assessing Tomato Nutrient Status through + Trichome Density Measurement + + +
+ Accurately assessing tomato plant nutrient status is crucial for maintaining +high yields. Consequently, accurately identifying fertilizer-induced stress +through the morphological traits of tomato plants has become a critical +agricultural challenge. Research and development efforts have focused on +developing noninvasive diagnostic tools for nutrition that leverage a +combination of morphological traits and advanced sensor technologies. Given +these advancements, detecting fertilizer stress by observing morphological +traits near the growth points of tomatoes is still a significant challenge. To +address this challenge, we developed a simple and cost-effective +smartphone-based method for measuring trichome density. This method involves +transferring trichomes from the surface of a leaf onto cellophane tape and +capturing images using a smartphone. The images are processed using computer +vision techniques to calculate the trichome density. To assess the efficacy of +this method, we performed experiments on hydroponically grown tomato plants +subjected to varying fertilizer concentrations. Our results indicate that our +novel method for measuring trichome density accurately reflects fertilizer +stress in tomato plants. The predictive performance of our model, as evaluated +by the mean area under the precision recall curve, was 0.824, despite +variations in the measurement data caused by differences in optical conditions. +This study introduces an innovative approach for designing diagnostic devices +for detecting fertilizer stress in plants by considering the surface structures +of plants. Our proposed method represents a straightforward, efficient, and +economical approach for evaluating the nutrient status of tomato plants and has +the potential to overcome the limitations of conventional noncontact optical +methods. + +
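A rough sketch of the measurement step this abstract describes: threshold a tape image, count connected bright blobs as trichomes, and normalize by the imaged area. The synthetic image, intensity threshold, and pixel-to-millimeter calibration are assumptions standing in for real smartphone photos and calibration.

```python
# Count trichome-like blobs in a synthetic "tape" image and report density.

import numpy as np
from scipy import ndimage

rng = np.random.default_rng(0)

# Synthetic tape image: dark background with small bright trichome-like dots.
img = rng.normal(loc=30, scale=5, size=(200, 300))
for _ in range(40):
    y, x = rng.integers(5, 195), rng.integers(5, 295)
    img[y - 2:y + 2, x - 2:x + 2] += 150

binary = img > 100                                  # simple intensity threshold
labels, n_blobs = ndimage.label(binary)             # connected components
sizes = np.array(ndimage.sum(binary, labels, range(1, n_blobs + 1)))
trichomes = int((sizes >= 4).sum())                 # drop tiny noise components

pixels_per_mm = 20.0                                # assumed calibration factor
area_mm2 = (img.shape[0] / pixels_per_mm) * (img.shape[1] / pixels_per_mm)
print(f"trichome density: {trichomes / area_mm2:.2f} per mm^2")
```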
+
+
+
+
+ + ☆ Towards Real-world Video Face Restoration: A New Benchmark + + +
+ Blind face restoration (BFR) on images has significantly progressed over the +last several years, while real-world video face restoration (VFR), which is +more challenging due to the more complex face motions involved, such as changing gaze directions +and facial orientations, remains unsolved. Typical BFR methods are +evaluated on privately synthesized datasets or self-collected real-world +low-quality face images, which are limited in their coverage of real-world +video frames. In this work, we introduced new real-world datasets named FOS +with a taxonomy of "Full, Occluded, and Side" faces from mainly video frames to +study the applicability of current methods on videos. Compared with existing +test datasets, FOS datasets cover more diverse degradations and involve face +samples from more complex scenarios, which helps to revisit current face +restoration approaches more comprehensively. Given the established datasets, we +benchmarked both the state-of-the-art BFR methods and the video super +resolution (VSR) methods to comprehensively study current approaches, +identifying their potential and limitations in VFR tasks. In addition, we +studied the effectiveness of the commonly used image quality assessment (IQA) +metrics and face IQA (FIQA) metrics by leveraging a subjective user study. With +extensive experimental results and detailed analysis provided, we gained +insights from the successes and failures of both current BFR and VSR methods. +These results also pose challenges to current face restoration approaches, +which we hope will stimulate future advances in VFR research. + +
+
+ comment: Project page: https://ziyannchen.github.io/projects/VFRxBenchmark/ +
+
+
+
+
+ + ☆ EvGNN: An Event-driven Graph Neural Network Accelerator for Edge Vision + + +
+ Edge vision systems combining sensing and embedded processing promise +low-latency, decentralized, and energy-efficient solutions that forgo reliance +on the cloud. As opposed to conventional frame-based vision sensors, +event-based cameras deliver a microsecond-scale temporal resolution with sparse +information encoding, thereby outlining new opportunities for edge vision +systems. However, mainstream algorithms for frame-based vision, which mostly +rely on convolutional neural networks (CNNs), can hardly exploit the advantages +of event-based vision as they are typically optimized for dense matrix-vector +multiplications. While event-driven graph neural networks (GNNs) have recently +emerged as a promising solution for sparse event-based vision, their irregular +structure is a challenge that currently hinders the design of efficient +hardware accelerators. In this paper, we propose EvGNN, the first event-driven +GNN accelerator for low-footprint, ultra-low-latency, and high-accuracy edge +vision with event-based cameras. It relies on three central ideas: (i) directed +dynamic graphs exploiting single-hop nodes with edge-free storage, (ii) event +queues for the efficient identification of local neighbors within a +spatiotemporally decoupled search range, and (iii) a novel layer-parallel +processing scheme enabling the low-latency execution of multi-layer GNNs. We +deployed EvGNN on a Xilinx KV260 Ultrascale+ MPSoC platform and benchmarked it +on the N-CARS dataset for car recognition, demonstrating a classification +accuracy of 87.8% and an average latency per event of 16$\mu$s, thereby +enabling real-time, microsecond-resolution event-based vision at the edge. + +
+
+ comment: 12 pages, 14 figures +
+
+
+
+
+ + ☆ SpecstatOR: Speckle statistics-based iOCT Segmentation Network for + Ophthalmic Surgery + + +
+ This paper presents an innovative approach to intraoperative Optical +Coherence Tomography (iOCT) image segmentation in ophthalmic surgery, +leveraging statistical analysis of speckle patterns to incorporate statistical +pathology-specific prior knowledge. Our findings indicate statistically +different speckle patterns within the retina and between retinal layers and +surgical tools, facilitating the segmentation of previously unseen data without +the necessity for manual labeling. The research involves fitting various +statistical distributions to iOCT data, enabling the differentiation of +different ocular structures and surgical tools. The proposed segmentation model +aims to refine the statistical findings based on prior tissue understanding to +leverage statistical and biological knowledge. Incorporating statistical +parameters, physical analysis of light-tissue interaction, and deep learning +informed by biological structures enhance segmentation accuracy, offering +potential benefits to real-time applications in ophthalmic surgical procedures. +The study demonstrates the adaptability and precision of using Gamma +distribution parameters and the derived binary maps as sole inputs for +segmentation, notably enhancing the model's inference performance on unseen +data. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ TwinDiffusion: Enhancing Coherence and Efficiency in Panoramic Image + Generation with Diffusion Models + + +
+ Diffusion models have emerged as effective tools for generating diverse and +high-quality content. However, their capability in high-resolution image +generation, particularly for panoramic images, still faces challenges such as +visible seams and incoherent transitions. In this paper, we propose +TwinDiffusion, an optimized framework designed to address these challenges +through two key innovations: Crop Fusion for quality enhancement and Cross +Sampling for efficiency optimization. We introduce a training-free optimizing +stage to refine the similarity of the adjacent image areas, as well as an +interleaving sampling strategy to yield dynamic patches during the cropping +process. A comprehensive evaluation is conducted to compare TwinDiffusion with +the existing methods, considering factors including coherence, fidelity, +compatibility, and efficiency. The results demonstrate the superior performance +of our approach in generating seamless and coherent panoramas, setting a new +standard in quality and efficiency for panoramic image generation. + +
+
+
+
+
+ + ☆ AttackBench: Evaluating Gradient-based Attacks for Adversarial Examples + + +
+ Adversarial examples are typically optimized with gradient-based attacks. +While novel attacks are continuously proposed, each is shown to outperform its +predecessors using different experimental setups, hyperparameter settings, and +number of forward and backward calls to the target models. This provides +overly-optimistic and even biased evaluations that may unfairly favor one +particular attack over the others. In this work, we aim to overcome these +limitations by proposing AttackBench, i.e., the first evaluation framework that +enables a fair comparison among different attacks. To this end, we first +propose a categorization of gradient-based attacks, identifying their main +components and differences. We then introduce our framework, which evaluates +their effectiveness and efficiency. We measure these characteristics by (i) +defining an optimality metric that quantifies how close an attack is to the +optimal solution, and (ii) limiting the number of forward and backward queries +to the model, such that all attacks are compared within a given maximum query +budget. Our extensive experimental analysis compares more than 100 attack +implementations with a total of over 800 different configurations against +CIFAR-10 and ImageNet models, highlighting that only very few attacks +outperform all the competing approaches. Within this analysis, we shed light on +several implementation issues that prevent many attacks from finding better +solutions or running at all. We release AttackBench as a publicly available +benchmark, aiming to continuously update it to include and evaluate novel +gradient-based attacks for optimizing adversarial examples. + +
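The query-budget bookkeeping at the heart of a fair comparison can be sketched with a thin model wrapper that counts forward and backward passes and stops once the budget is spent; the tiny model, FGSM-like step, and budget below are illustrative stand-ins, not the AttackBench code.

```python
# Wrap a model to count forward/backward queries and enforce a query budget
# while running a simple gradient-based attack step.

import torch
import torch.nn as nn


class QueryCountingModel(nn.Module):
    def __init__(self, model, max_queries=100):
        super().__init__()
        self.model, self.max_queries = model, max_queries
        self.forwards = self.backwards = 0

    def forward(self, x):
        if self.forwards + self.backwards >= self.max_queries:
            raise RuntimeError("query budget exhausted")
        self.forwards += 1
        out = self.model(x)
        if out.requires_grad:                 # count the eventual backward pass too
            out.register_hook(self._count_backward)
        return out

    def _count_backward(self, grad):
        self.backwards += 1
        return grad


torch.manual_seed(0)
wrapped = QueryCountingModel(nn.Sequential(nn.Linear(10, 10)), max_queries=40)
x = torch.randn(4, 10)
label = torch.tensor([0, 1, 2, 3])

# A plain FGSM-like step as a stand-in "attack" running under the query budget.
adv = x.clone().requires_grad_(True)
loss = nn.functional.cross_entropy(wrapped(adv), label)
loss.backward()
adv = (adv + 0.03 * adv.grad.sign()).detach()

print("forward queries:", wrapped.forwards, "| backward queries:", wrapped.backwards)
```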
+
+ comment: https://attackbench.github.io +
+
+
+
+
+ + ☆ AnomalyXFusion: Multi-modal Anomaly Synthesis with Diffusion + + +
+ Anomaly synthesis is one of the effective methods to augment abnormal samples +for training. However, current anomaly synthesis methods predominantly rely on +texture information as input, which limits the fidelity of synthesized abnormal +samples, because texture information is insufficient to correctly depict the +pattern of anomalies, especially for logical anomalies. To surmount this +obstacle, we present the AnomalyXFusion framework, designed to harness +multi-modality information to enhance the quality of synthesized abnormal +samples. The AnomalyXFusion framework comprises two distinct yet synergistic +modules: the Multi-modal In-Fusion (MIF) module and the Dynamic Dif-Fusion +(DDF) module. The MIF module refines modality alignment by aggregating and +integrating various modality features into a unified embedding space, termed +X-embedding, which includes image, text, and mask features. Concurrently, the +DDF module facilitates controlled generation through an adaptive adjustment of +X-embedding conditioned on the diffusion steps. In addition, to reveal the +multi-modality representational power of AnomalyXFusion, we propose a new +dataset, called MVTec Caption. More precisely, MVTec Caption extends 2.2k +accurate image-mask-text annotations for the MVTec AD and LOCO datasets. +Comprehensive evaluations demonstrate the effectiveness of AnomalyXFusion, +especially regarding the fidelity and diversity for logical anomalies. Project +page: http://github.com/hujiecpp/MVTec-Caption + +
+
+
+
+
+ + ☆ InstantFamily: Masked Attention for Zero-shot Multi-ID Image Generation + + +
+ In the field of personalized image generation, the ability to create images +preserving concepts has significantly improved. However, creating an image that +naturally integrates multiple concepts into a cohesive and visually appealing +composition remains challenging. This paper introduces "InstantFamily," +an approach that employs a novel masked cross-attention mechanism and a +multimodal embedding stack to achieve zero-shot multi-ID image generation. Our +method effectively preserves ID as it utilizes global and local features from a +pre-trained face recognition model integrated with text conditions. +Additionally, our masked cross-attention mechanism enables the precise control +of multi-ID and composition in the generated images. We demonstrate the +effectiveness of InstantFamily through experiments showing its strong +performance in generating multi-ID images while resolving well-known multi-ID +generation problems. Additionally, our model achieves state-of-the-art +performance in both single-ID and multi-ID preservation. Furthermore, our model +exhibits remarkable scalability, preserving a greater number of IDs than it was +originally trained with. + +
+
+
+
+
+ + ☆ Physical Backdoor: Towards Temperature-based Backdoor Attacks in the + Physical World CVPR 2024 + + +
+ Backdoor attacks have been well-studied in visible light object detection +(VLOD) in recent years. However, VLOD can not effectively work in dark and +temperature-sensitive scenarios. Instead, thermal infrared object detection +(TIOD) is the most accessible and practical in such environments. In this +paper, our team is the first to investigate the security vulnerabilities +associated with TIOD in the context of backdoor attacks, spanning both the +digital and physical realms. We introduce two novel types of backdoor attacks +on TIOD, each offering unique capabilities: Object-affecting Attack and +Range-affecting Attack. We conduct a comprehensive analysis of key factors +influencing trigger design, which include temperature, size, material, and +concealment. These factors, especially temperature, significantly impact the +efficacy of backdoor attacks on TIOD. A thorough understanding of these factors +will serve as a foundation for designing physical triggers and temperature +controlling experiments. Our study includes extensive experiments conducted in +both digital and physical environments. In the digital realm, we evaluate our +approach using benchmark datasets for TIOD, achieving an Attack Success Rate +(ASR) of up to 98.21%. In the physical realm, we test our approach in two +real-world settings: a traffic intersection and a parking lot, using a thermal +infrared camera. Here, we attain an ASR of up to 98.38%. + +
+
+ comment: To appear in CVPR 2024. 11 pages, 8 figures and 4 tables +
+
+
+
+
+ + ☆ UniFS: Universal Few-shot Instance Perception with Point Representations + + +
+ Instance perception tasks (object detection, instance segmentation, pose +estimation, counting) play a key role in industrial applications of visual +models. As supervised learning methods suffer from high labeling cost, few-shot +learning methods which effectively learn from a limited number of labeled +examples are desired. Existing few-shot learning methods primarily focus on a +restricted set of tasks, presumably due to the challenges involved in designing +a generic model capable of representing diverse tasks in a unified manner. In +this paper, we propose UniFS, a universal few-shot instance perception model +that unifies a wide range of instance perception tasks by reformulating them +into a dynamic point representation learning framework. Additionally, we +propose Structure-Aware Point Learning (SAPL) to exploit the higher-order +structural relationship among points to further enhance representation +learning. Our approach makes minimal assumptions about the tasks, yet it +achieves competitive results compared to highly specialized and well optimized +specialist models. Codes will be released soon. + +
+
+
+
+
+ + ☆ 3D Gaussian Blendshapes for Head Avatar Animation SIGGRAPH + + +
+ We introduce 3D Gaussian blendshapes for modeling photorealistic head +avatars. Taking a monocular video as input, we learn a base head model of +neutral expression, along with a group of expression blendshapes, each of which +corresponds to a basis expression in classical parametric face models. Both the +neutral model and expression blendshapes are represented as 3D Gaussians, which +contain a few properties to depict the avatar appearance. The avatar model of +an arbitrary expression can be effectively generated by combining the neutral +model and expression blendshapes through linear blending of Gaussians with the +expression coefficients. High-fidelity head avatar animations can be +synthesized in real time using Gaussian splatting. Compared to state-of-the-art +methods, our Gaussian blendshape representation better captures high-frequency +details exhibited in input video, and achieves superior rendering performance. + +
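+ The linear blending step can be pictured with a short sketch; the per-Gaussian attribute dictionaries and the number of expression coefficients are assumed for illustration and are not the authors' data structures.
+import torch
+
+def blend_gaussians(neutral: dict, blendshapes: list, coeffs: torch.Tensor) -> dict:
+    """neutral[k]: (N, D) per-Gaussian attributes (positions, rotations, scales, colours, opacities);
+    each blendshape stores offsets of the same shape; coeffs: (E,) expression coefficients."""
+    blended = {}
+    for key, base in neutral.items():
+        offset = sum(c * bs[key] for c, bs in zip(coeffs, blendshapes))
+        blended[key] = base + offset               # linear blending of Gaussian attributes
+    return blended
+
+# The blended attributes would then be handed to a Gaussian splatting renderer for real-time synthesis.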
+
+ comment: ACM SIGGRAPH Conference Proceedings 2024 +
+
+
+
+
+ + ☆ CLIP-Mamba: CLIP Pretrained Mamba Models with OOD and Hessian Evaluation + + +
+ State space models and Mamba-based models have been increasingly applied +across various domains, achieving state-of-the-art performance. This technical +report introduces the first attempt to train a transferable Mamba model +utilizing contrastive language-image pretraining (CLIP). We have trained Mamba +models of varying sizes and undertaken comprehensive evaluations of these +models on 26 zero-shot classification datasets and 16 out-of-distribution (OOD) +datasets. Our findings reveal that a Mamba model with 67 million parameters is +on par with a 307 million-parameter Vision Transformer (ViT) model in zero-shot +classification tasks, highlighting the parameter efficiency of Mamba models. In +tests of OOD generalization, Mamba-based models exhibit exceptional performance +in conditions of OOD image contrast or when subjected to high-pass filtering. +However, a Hessian analysis indicates that Mamba models feature a sharper and +more non-convex landscape compared to ViT-based models, making them more +challenging to train. The source code is available at +https://github.com/raytrun/mamba-clip. + +
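+ The zero-shot evaluation referenced above typically follows the standard CLIP recipe of comparing normalised image and prompt embeddings; the sketch below assumes precomputed features and is not tied to the released mamba-clip code.
+import torch
+import torch.nn.functional as F
+
+def zero_shot_predict(image_feat: torch.Tensor, class_text_feats: torch.Tensor) -> int:
+    """image_feat: (D,); class_text_feats: (C, D) encoded from prompts like 'a photo of a {class}'."""
+    img = F.normalize(image_feat, dim=-1)
+    txt = F.normalize(class_text_feats, dim=-1)
+    logits = 100.0 * img @ txt.t()                 # cosine similarity scaled as in CLIP
+    return int(logits.argmax())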
+
+
+
+
+ + ☆ Pseudo Label Refinery for Unsupervised Domain Adaptation on + Cross-dataset 3D Object Detection CVPR2024 + + +
+ Recent self-training techniques have shown notable improvements in +unsupervised domain adaptation for 3D object detection (3D UDA). These +techniques typically select pseudo labels, i.e., 3D boxes, to supervise models +for the target domain. However, this selection process inevitably introduces +unreliable 3D boxes, in which 3D points cannot be definitively assigned as +foreground or background. Previous techniques mitigate this by reweighting +these boxes as pseudo labels, but these boxes can still poison the training +process. To resolve this problem, in this paper, we propose a novel pseudo +label refinery framework. Specifically, in the selection process, to improve +the reliability of pseudo boxes, we propose a complementary augmentation +strategy. This strategy involves either removing all points within an +unreliable box or replacing it with a high-confidence box. Moreover, the point +numbers of instances in high-beam datasets are considerably higher than those +in low-beam datasets, also degrading the quality of pseudo labels during the +training process. We alleviate this issue by generating additional proposals +and aligning RoI features across different domains. Experimental results +demonstrate that our method effectively enhances the quality of pseudo labels +and consistently surpasses the state-of-the-art methods on six autonomous +driving benchmarks. Code will be available at +https://github.com/Zhanwei-Z/PERE. + +
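+ The "remove points inside an unreliable box" half of the complementary augmentation can be illustrated as below; boxes are simplified to axis-aligned form and the confidence threshold is an assumption, whereas real 3D detection boxes are rotated.
+import numpy as np
+
+def drop_points_in_unreliable_boxes(points, boxes, scores, thr: float = 0.5) -> np.ndarray:
+    """points: (N, 3); boxes: (M, 6) as [x_min, y_min, z_min, x_max, y_max, z_max]; scores: (M,)."""
+    keep = np.ones(len(points), dtype=bool)
+    for box, score in zip(boxes, scores):
+        if score >= thr:                           # reliable pseudo box: leave its points alone
+            continue
+        inside = np.all((points >= box[:3]) & (points <= box[3:]), axis=1)
+        keep &= ~inside                            # erase points inside the unreliable box
+    return points[keep]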
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Cross-Block Fine-Grained Semantic Cascade for Skeleton-Based Sports + Action Recognition + + +
+ Human action video recognition has recently attracted more attention in +applications such as video security and sports posture correction. Popular +solutions, including graph convolutional networks (GCNs) that model the human +skeleton as a spatiotemporal graph, have proven very effective. GCN-based +methods with stacked blocks usually utilize top-layer semantics for +classification/annotation purposes. Although the global features learned +through the procedure are suitable for general classification, they have +difficulty capturing fine-grained action changes across adjacent frames -- +decisive factors in sports actions. In this paper, we propose a novel +``Cross-block Fine-grained Semantic Cascade (CFSC)'' module to overcome this +challenge. In summary, the proposed CFSC progressively integrates shallow +visual knowledge into high-level blocks to allow networks to focus on action +details. In particular, the CFSC module utilizes the GCN feature maps produced +at different levels, as well as aggregated features from preceding levels, to +consolidate fine-grained features. In addition, a dedicated temporal +convolution is applied at each level to learn short-term temporal features, +which are carried over from shallow to deep layers to maximize the leverage +of low-level details. This cross-block feature aggregation methodology, capable +of mitigating the loss of fine-grained information, has resulted in improved +performance. Last, FD-7, a new action recognition dataset for fencing sports, +was collected and will be made publicly available. Experimental results and +empirical analysis on a public benchmark (FSD-10) and our self-collected +dataset (FD-7) demonstrate the advantage of our CFSC module over other methods +in learning discriminative patterns for action classification. + +
+
+
+
+
+ + ☆ Probing Unlearned Diffusion Models: A Transferable Adversarial Attack + Perspective + + +
+ Advanced text-to-image diffusion models raise safety concerns regarding +identity privacy violation, copyright infringement, and Not Safe For Work +content generation. Towards this, unlearning methods have been developed to +erase these involved concepts from diffusion models. However, these unlearning +methods only shift the text-to-image mapping and preserve the visual content +within the generative space of diffusion models, leaving a fatal flaw that +allows these erased concepts to be restored. This erasure trustworthiness +problem needs to be probed, but previous methods are sub-optimal from two +perspectives: (1) Lack of transferability: Some methods operate within a +white-box setting, requiring access to the unlearned model, and the learned +adversarial input often fails to transfer to other unlearned models for concept +restoration; (2) Limited attack: The prompt-level methods struggle to restore +narrow concepts from unlearned models, such as celebrity identity. Therefore, +this paper aims to leverage the transferability of the adversarial attack to +probe the unlearning robustness under a black-box setting. This challenging +scenario assumes that the unlearning method is unknown and the unlearned model +is inaccessible for optimization, requiring the attack to be capable of +transferring across different unlearned models. Specifically, we employ an +adversarial search strategy to search for the adversarial embedding which can +transfer across different unlearned models. This strategy adopts the original +Stable Diffusion model as a surrogate model to iteratively erase and search for +embeddings, enabling it to find the embedding that can restore the target +concept for different unlearning methods. Extensive experiments demonstrate the +transferability of the searched adversarial embedding across several +state-of-the-art unlearning methods and its effectiveness for different levels +of concepts. + +
+
+
+
+
+ + ☆ SemanticFormer: Holistic and Semantic Traffic Scene Representation for + Trajectory Prediction using Knowledge Graphs + + +
+ Trajectory prediction in autonomous driving relies on accurate representation +of all relevant contexts of the driving scene, including traffic participants, +road topology, and traffic signs, as well as their semantic relations to each +other. Despite increased attention to this issue, most approaches in trajectory +prediction do not consider all of these factors sufficiently. This paper +describes SemanticFormer, a method that predicts multimodal trajectories by +reasoning over a semantic traffic scene graph using a hybrid approach. We +extract high-level information in the form of semantic meta-paths from a +knowledge graph, which is then processed by a novel pipeline based on multiple +attention mechanisms to predict accurate trajectories. The proposed +architecture comprises a hierarchical heterogeneous graph encoder, which can +capture spatio-temporal and relational information across agents and between +agents and road elements, and a predictor that fuses the different encodings +and decodes trajectories with probabilities. Finally, a refinement module +evaluates permitted meta-paths of trajectories and speed profiles to obtain +final predicted trajectories. Evaluation on the nuScenes benchmark demonstrates +improved performance compared to state-of-the-art methods. + +
+
+ comment: 8 pages, 6 figures, submitted to RA-L +
+
+
+
+
+ + ☆ Large Language Model Informed Patent Image Retrieval + + +
+ In patent prosecution, image-based retrieval systems for identifying +similarities between current patent images and prior art are pivotal to ensure +the novelty and non-obviousness of patent applications. Despite their growing +popularity in recent years, existing attempts, while effective at recognizing +images within the same patent, fail to deliver practical value due to their +limited generalizability in retrieving relevant prior art. Moreover, this task +inherently involves the challenges posed by the abstract visual features of +patent images, the skewed distribution of image classifications, and the +semantic information of image descriptions. Therefore, we propose a +language-informed, distribution-aware multimodal approach to patent image +feature learning, which enriches the semantic understanding of patent image by +integrating Large Language Models and improves the performance of +underrepresented classes with our proposed distribution-aware contrastive +losses. Extensive experiments on DeepPatent2 dataset show that our proposed +method achieves state-of-the-art or comparable performance in image-based +patent retrieval with mAP +53.3%, Recall@10 +41.8%, and MRR@10 +51.9%. +Furthermore, through an in-depth user analysis, we explore our model in aiding +patent professionals in their image retrieval efforts, highlighting the model's +real-world applicability and effectiveness. + +
+
+ comment: 8 pages. Under review +
+
+
+
+
+ + ☆ Reliable or Deceptive? Investigating Gated Features for Smooth Visual + Explanations in CNNs + + +
+ Deep learning models have achieved remarkable success across diverse domains. +However, the intricate nature of these models often impedes a clear +understanding of their decision-making processes. This is where Explainable AI +(XAI) becomes indispensable, offering intuitive explanations for model +decisions. In this work, we propose a simple yet highly effective approach, +ScoreCAM++, which introduces modifications to enhance the promising ScoreCAM +method for visual explainability. Our proposed approach involves altering the +normalization function within the activation layer utilized in ScoreCAM, +resulting in significantly improved results compared to previous efforts. +Additionally, we apply an activation function to the upsampled activation +layers to enhance interpretability. This improvement is achieved by selectively +gating lower-priority values within the activation layer. Through extensive +experiments and qualitative comparisons, we demonstrate that ScoreCAM++ +consistently achieves notably superior performance and fairness in interpreting +the decision-making process compared to both ScoreCAM and previous methods. + +
+
+
+
+
+ + ☆ Multi-Scale Heterogeneity-Aware Hypergraph Representation for + Histopathology Whole Slide Images ICME2024 + + +
+ Survival prediction is a complex ordinal regression task that aims to predict +the survival coefficient ranking among a cohort of patients, typically achieved +by analyzing patients' whole slide images. Existing deep learning approaches +mainly adopt multiple instance learning or graph neural networks under weak +supervision. Most of them are unable to uncover the diverse interactions +between different types of biological entities(\textit{e.g.}, cell cluster and +tissue block) across multiple scales, while such interactions are crucial for +patient survival prediction. In light of this, we propose a novel multi-scale +heterogeneity-aware hypergraph representation framework. Specifically, our +framework first constructs a multi-scale heterogeneity-aware hypergraph and +assigns each node with its biological entity type. It then mines diverse +interactions between nodes on the graph structure to obtain a global +representation. Experimental results demonstrate that our method outperforms +state-of-the-art approaches on three benchmark datasets. Code is publicly +available at +\href{https://github.com/Hanminghao/H2GT}{https://github.com/Hanminghao/H2GT}. + +
+
+ comment: 9 pages, 6 figures, accepted by ICME2024 +
+
+
+
+
+ + ☆ G2LTraj: A Global-to-Local Generation Approach for Trajectory Prediction IJCAI 2024 + + +
+ Predicting future trajectories of traffic agents accurately holds substantial +importance in various applications such as autonomous driving. Previous methods +commonly infer all future steps of an agent either recursively or +simultaneously. However, the recursive strategy suffers from the accumulated +error, while the simultaneous strategy overlooks the constraints among future +steps, resulting in kinematically infeasible predictions. To address these +issues, in this paper, we propose G2LTraj, a plug-and-play global-to-local +generation approach for trajectory prediction. Specifically, we generate a +series of global key steps that uniformly cover the entire future time range. +Subsequently, the local intermediate steps between the adjacent key steps are +recursively filled in. In this way, we prevent the accumulated error from +propagating beyond the adjacent key steps. Moreover, to boost the kinematical +feasibility, we not only introduce the spatial constraints among key steps but +also strengthen the temporal constraints among the intermediate steps. Finally, +to ensure the optimal granularity of key steps, we design a selectable +granularity strategy that caters to each predicted trajectory. Our G2LTraj +significantly improves the performance of seven existing trajectory predictors +across the ETH, UCY and nuScenes datasets. Experimental results demonstrate its +effectiveness. Code will be available at https://github.com/Zhanwei-Z/G2LTraj. + +
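+ The global-to-local filling pattern can be sketched as follows; the midpoint refiner here is plain interpolation standing in for the learned module, so it only illustrates how errors stay confined between adjacent key steps.
+import numpy as np
+
+def fill_local_steps(key_steps: np.ndarray, granularity: int) -> np.ndarray:
+    """key_steps: (K, 2) positions that uniformly cover the horizon; returns a denser trajectory."""
+    traj = list(key_steps)
+    for _ in range(granularity):
+        refined = [traj[0]]
+        for a, b in zip(traj[:-1], traj[1:]):
+            refined.append((a + b) / 2.0)          # local step anchored by its two neighbours
+            refined.append(b)
+        traj = refined
+    return np.stack(traj)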
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ End-to-end information extraction in handwritten documents: + Understanding Paris marriage records from 1880 to 1940 ICDAR 2024 + + +
+ The EXO-POPP project aims to establish a comprehensive database comprising +300,000 marriage records from Paris and its suburbs, spanning the years 1880 to +1940, which are preserved in over 130,000 scans of double pages. Each marriage +record may encompass up to 118 distinct types of information that require +extraction from plain text. In this paper, we introduce the M-POPP dataset, a +subset of the M-POPP database with annotations for full-page text recognition +and information extraction in both handwritten and printed documents, and which +is now publicly available. We present a fully end-to-end architecture adapted +from the DAN, designed to perform both handwritten text recognition and +information extraction directly from page images without the need for explicit +segmentation. We showcase the information extraction capabilities of this +architecture by achieving a new state of the art for full-page Information +Extraction on Esposalles and we use this architecture as a baseline for the +M-POPP dataset. We also assess and compare how different encoding strategies +for named entities in the text affect the performance of jointly recognizing +handwritten text and extracting information, from full pages. + +
+
+ comment: To be published in: International Conference on Document Analysis and + Recognition - ICDAR 2024 +
+
+
+
+
+ + ☆ LVOS: A Benchmark for Large-scale Long-term Video Object Segmentation + + +
+ Video object segmentation (VOS) aims to distinguish and track target objects +in a video. Despite the excellent performance achieved by off-the-shelf VOS +models, existing VOS benchmarks mainly focus on short-term videos lasting about +5 seconds, where objects remain visible most of the time. However, these +benchmarks poorly represent practical applications, and the absence of +long-term datasets restricts further investigation of VOS in realistic +scenarios. Thus, we propose a novel benchmark named LVOS, comprising 720 videos +with 296,401 frames and 407,945 high-quality annotations. Videos in LVOS last +1.14 minutes on average, approximately 5 times longer than videos in existing +datasets. Each video includes various attributes, especially challenges +deriving from the wild, such as long-term reappearing and cross-temporal +similar objects. Compared to previous benchmarks, our LVOS better reflects VOS +models' performance in real scenarios. Based on LVOS, we evaluate 20 existing +VOS models under 4 different settings and conduct a comprehensive analysis. On +LVOS, these models suffer a large performance drop, highlighting the challenge +of achieving precise tracking and segmentation in real-world scenarios. +Attribute-based analysis indicates that the key factor in the accuracy decline +is the increased video length, emphasizing LVOS's crucial role. We hope our +LVOS can advance the development of VOS in real scenes. Data and code are +available at https://lingyihongfd.github.io/lvos.github.io/. + +
+
+ comment: LVOS V2 +
+
+
+
+
+ + ☆ Revisiting N-Gram Models: Their Impact in Modern Neural Networks for + Handwritten Text Recognition + + +
+ In recent advances in automatic text recognition (ATR), deep neural networks +have demonstrated the ability to implicitly capture language statistics, +potentially reducing the need for traditional language models. This study +directly addresses whether explicit language models, specifically n-gram +models, still contribute to the performance of state-of-the-art deep learning +architectures in the field of handwriting recognition. We evaluate two +prominent neural network architectures, PyLaia and DAN, with and without the +integration of explicit n-gram language models. Our experiments on three +datasets - IAM, RIMES, and NorHand v2 - at both line and page level, +investigate optimal parameters for n-gram models, including their order, +weight, smoothing methods and tokenization level. The results show that +incorporating character or subword n-gram models significantly improves the +performance of ATR models on all datasets, challenging the notion that deep +learning models alone are sufficient for optimal performance. In particular, +the combination of DAN with a character language model outperforms current +benchmarks, confirming the value of hybrid approaches in modern document +analysis systems. + +
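+ For readers unfamiliar with the explicit language models being revisited, a minimal character n-gram scorer with add-alpha smoothing looks like the sketch below; it is generic and not tied to the PyLaia or DAN integrations evaluated in the paper.
+import math
+from collections import defaultdict
+
+class CharNGram:
+    """Character n-gram scorer with add-alpha smoothing, usable for rescoring ATR hypotheses."""
+    def __init__(self, order: int = 3, alpha: float = 0.1):
+        self.order, self.alpha = order, alpha
+        self.counts = defaultdict(int)
+        self.context_counts = defaultdict(int)
+        self.vocab = set()
+
+    def fit(self, lines):
+        for line in lines:
+            text = "^" * (self.order - 1) + line   # "^" pads the left context
+            for i in range(self.order - 1, len(text)):
+                ctx, ch = text[i - self.order + 1:i], text[i]
+                self.counts[(ctx, ch)] += 1
+                self.context_counts[ctx] += 1
+                self.vocab.add(ch)
+
+    def logprob(self, line) -> float:
+        text = "^" * (self.order - 1) + line
+        lp = 0.0
+        for i in range(self.order - 1, len(text)):
+            ctx, ch = text[i - self.order + 1:i], text[i]
+            num = self.counts[(ctx, ch)] + self.alpha
+            den = self.context_counts[ctx] + self.alpha * max(len(self.vocab), 1)
+            lp += math.log(num / den)
+        return lp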
+
+
+
+
+ + ☆ A Light-weight Transformer-based Self-supervised Matching Network for + Heterogeneous Images + + +
+ Matching visible and near-infrared (NIR) images remains a significant +challenge in remote sensing image fusion. The nonlinear radiometric differences +between heterogeneous remote sensing images make the image matching task even +more difficult. Deep learning has gained substantial attention in computer +vision tasks in recent years. However, many methods rely on supervised learning +and necessitate large amounts of annotated data. Nevertheless, annotated data +is frequently limited in the field of remote sensing image matching. To address +this challenge, this paper proposes a novel keypoint descriptor approach that +obtains robust feature descriptors via a self-supervised matching network. A +light-weight transformer network, termed as LTFormer, is designed to generate +deep-level feature descriptors. Furthermore, we implement an innovative triplet +loss function, LT Loss, to enhance the matching performance further. Our +approach outperforms conventional hand-crafted local feature descriptors and +proves equally competitive compared to state-of-the-art deep learning-based +methods, even amidst the shortage of annotated data. + +
+
+ comment: accepted by Information Fusion +
+
+
+
+
+ + ☆ Data Set Terminology of Artificial Intelligence in Medicine: A + Historical Review and Recommendation + + +
+ Medicine and artificial intelligence (AI) engineering represent two distinct +fields each with decades of published history. With such history comes a set of +terminology that has a specific way in which it is applied. However, when two +distinct fields with overlapping terminology start to collaborate, +miscommunication and misunderstandings can occur. This narrative review aims to +give historical context for these terms, accentuate the importance of clarity +when these terms are used in medical AI contexts, and offer solutions to +mitigate misunderstandings by readers from either field. Through an examination +of historical documents, including articles, writing guidelines, and textbooks, +this review traces the divergent evolution of terms for data sets and their +impact. Initially, the discordant interpretations of the word 'validation' in +medical and AI contexts are explored. Then the data sets used for AI evaluation +are classified, namely random splitting, cross-validation, temporal, +geographic, internal, and external sets. The accurate and standardized +description of these data sets is crucial for demonstrating the robustness and +generalizability of AI applications in medicine. This review clarifies existing +literature to provide a comprehensive understanding of these classifications +and their implications in AI evaluation. This review then identifies often +misunderstood terms and proposes pragmatic solutions to mitigate terminological +confusion. Among these solutions are the use of standardized terminology such +as 'training set,' 'validation (or tuning) set,' and 'test set,' and explicit +definition of data set splitting terminologies in each medical AI research +publication. This review aspires to enhance the precision of communication in +medical AI, thereby fostering more effective and transparent research +methodologies in this interdisciplinary field. + +
+
+ comment: Totally 20 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Robust Pedestrian Detection via Constructing Versatile Pedestrian + Knowledge Bank + + +
+ Pedestrian detection is a crucial field of computer vision research which can +be adopted in various real-world applications (e.g., self-driving systems). +However, despite the noticeable evolution of pedestrian detection, pedestrian +representations learned within a detection framework are usually limited to +particular scene data in which they were trained. Therefore, in this paper, we +propose a novel approach to construct a versatile pedestrian knowledge bank +containing representative pedestrian knowledge that is applicable to various +detection frameworks and diverse scenes. We extract generalized pedestrian +knowledge from a large-scale pretrained model, and we curate it by quantizing +the most representative features and guiding them to be distinguishable from +background scenes. Finally, we construct the versatile pedestrian knowledge +bank from such representations, and then we leverage it to complement and +enhance pedestrian features within a pedestrian detection framework. Through +comprehensive experiments, we validate the effectiveness of our method, +demonstrating its versatility and surpassing state-of-the-art detection +performance. + +
+
+
+
+
+ + ☆ Masked Spatial Propagation Network for Sparsity-Adaptive Depth + Refinement + + +
+ The main function of depth completion is to compensate for an insufficient +and unpredictable number of sparse depth measurements of hardware sensors. +However, existing research on depth completion assumes that the sparsity -- the +number of points or LiDAR lines -- is fixed for training and testing. Hence, +the completion performance drops severely when the number of sparse depths +changes significantly. To address this issue, we propose the sparsity-adaptive +depth refinement (SDR) framework, which refines monocular depth estimates using +sparse depth points. For SDR, we propose the masked spatial propagation network +(MSPN) to perform SDR with a varying number of sparse depths effectively by +gradually propagating sparse depth information throughout the entire depth map. +Experimental results demonstrate that MSPN achieves state-of-the-art +performance on both SDR and conventional depth completion scenarios. + +
+
+
+
+
+ + ☆ On Improving the Algorithm-, Model-, and Data- Efficiency of + Self-Supervised Learning + + +
+ Self-supervised learning (SSL) has developed rapidly in recent years. +However, most of the mainstream methods are computationally expensive and rely +on two (or more) augmentations for each image to construct positive pairs. +Moreover, they mainly focus on large models and large-scale datasets, which +lack flexibility and feasibility in many practical applications. In this paper, +we propose an efficient single-branch SSL method based on non-parametric +instance discrimination, aiming to improve the algorithm, model, and data +efficiency of SSL. By analyzing the gradient formula, we correct the update +rule of the memory bank with improved performance. We further propose a novel +self-distillation loss that minimizes the KL divergence between the probability +distribution and its square root version. We show that this alleviates the +infrequent updating problem in instance discrimination and greatly accelerates +convergence. We systematically compare the training overhead and performance of +different methods in different scales of data, and under different backbones. +Experimental results show that our method outperforms various baselines with +significantly less overhead, and is especially effective for limited amounts of +data and small models. + +
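+ The square-root self-distillation term can be sketched as below; the direction of the KL, the absence of a temperature, and whether one side is detached are assumptions, since the abstract leaves them unspecified.
+import torch
+import torch.nn.functional as F
+
+def sqrt_self_distillation(logits: torch.Tensor) -> torch.Tensor:
+    p = F.softmax(logits, dim=-1)                  # predicted distribution
+    q = torch.sqrt(p)
+    q = q / q.sum(dim=-1, keepdim=True)            # square-root (flattened) version
+    # KL(q || p); the authors may use the opposite direction or detach one side.
+    return F.kl_div(p.log(), q, reduction="batchmean")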
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Revisiting the Adversarial Robustness of Vision Language Models: a + Multimodal Perspective + + +
+ Pretrained vision-language models (VLMs) like CLIP have shown impressive +generalization performance across various downstream tasks, yet they remain +vulnerable to adversarial attacks. While prior research has primarily +concentrated on improving the adversarial robustness of image encoders to guard +against attacks on images, the exploration of text-based and multimodal attacks +has largely been overlooked. In this work, we initiate the first known and +comprehensive effort to study adapting vision-language models for adversarial +robustness under the multimodal attack. Firstly, we introduce a multimodal +attack strategy and investigate the impact of different attacks. We then +propose a multimodal contrastive adversarial training loss, aligning the clean +and adversarial text embeddings with the adversarial and clean visual features, +to enhance the adversarial robustness of both image and text encoders of CLIP. +Extensive experiments on 15 datasets across two tasks demonstrate that our +method significantly improves the adversarial robustness of CLIP. +Interestingly, we find that the model fine-tuned against multimodal adversarial +attacks exhibits greater robustness than its counterpart fine-tuned solely +against image-based attacks, even in the context of image attacks, which may +open up new possibilities for enhancing the security of VLMs. + +
+
+ comment: 16 pages, 14 figures +
+
+
+
+
+ + ☆ Soft Prompt Generation for Domain Generalization + + +
+ Large pre-trained vision language models (VLMs) have shown impressive +zero-shot ability on downstream tasks with manually designed prompts, which are +not optimal for specific domains. To further adapt VLMs to downstream tasks, +soft prompts have been proposed to replace manually designed prompts; these act +as learnable vectors that are fine-tuned on specific domain data. Prior +prompt learning methods primarily learn a fixed prompt and a residual prompt +from training samples. However, the learned prompts lack diversity and ignore +information about unseen domains, potentially compromising the transferability +of the prompts. In this paper, we reframe the prompt learning framework from a +generative perspective and propose a simple yet efficient method for the Domain +Generalization (DG) task, namely \textbf{S}oft \textbf{P}rompt +\textbf{G}eneration (SPG). To the best of our knowledge, we are the first to +introduce the generative model into prompt learning in VLMs and explore its +potential for producing soft prompts by relying solely on the generative model, +ensuring the diversity of prompts. Specifically, SPG consists of a two-stage +training phase and an inference phase. During the training phase, we introduce +soft prompt labels for each domain, aiming to incorporate domain knowledge into +the generative model. During the inference phase, the generator of the +generative model is employed to obtain instance-specific soft prompts for the +unseen target domain. Extensive experiments on five domain generalization +benchmarks of three DG tasks demonstrate that our proposed SPG achieves +state-of-the-art performance. The code will be available soon. + +
+
+ comment: 23 pages, 4 figures +
+
+
+
+
+ + ☆ Quater-GCN: Enhancing 3D Human Pose Estimation with Orientation and + Semi-supervised Training + + +
+ 3D human pose estimation is a vital task in computer vision, involving the +prediction of human joint positions from images or videos to reconstruct a +skeleton of a human in three-dimensional space. This technology is pivotal in +various fields, including animation, security, human-computer interaction, and +automotive safety, where it promotes both technological progress and enhanced +human well-being. The advent of deep learning significantly advances the +performance of 3D pose estimation by incorporating temporal information for +predicting the spatial positions of human joints. However, traditional methods +often fall short as they primarily focus on the spatial coordinates of joints +and overlook the orientation and rotation of the connecting bones, which are +crucial for a comprehensive understanding of human pose in 3D space. To address +these limitations, we introduce Quater-GCN (Q-GCN), a directed graph +convolutional network tailored to enhance pose estimation by orientation. Q-GCN +excels by not only capturing the spatial dependencies among node joints through +their coordinates but also integrating the dynamic context of bone rotations in +2D space. This approach enables a more sophisticated representation of human +poses by also regressing the orientation of each bone in 3D space, moving +beyond mere coordinate prediction. Furthermore, we complement our model with a +semi-supervised training strategy that leverages unlabeled data, addressing the +challenge of limited orientation ground truth data. Through comprehensive +evaluations, Q-GCN has demonstrated outstanding performance against current +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Bridge to Non-Barrier Communication: Gloss-Prompted Fine-grained Cued + Speech Gesture Generation with Diffusion Model + + +
+ Cued Speech (CS) is an advanced visual phonetic encoding system that +integrates lip reading with hand codings, enabling people with hearing +impairments to communicate efficiently. CS video generation aims to produce +specific lip and gesture movements of CS from audio or text inputs. The main +challenge is that, given limited CS data, we strive to simultaneously generate +fine-grained hand and finger movements as well as lip movements, while the +two kinds of movements need to be asynchronously aligned. Existing CS +generation methods are fragile and prone to poor performance due to +template-based statistical models and careful hand-crafted pre-processing to +fit the models. Therefore, we propose a novel Gloss-prompted Diffusion-based CS +Gesture generation framework (called GlossDiff). Specifically, to integrate +additional linguistic rule knowledge into the model, we first introduce a +bridging instruction called \textbf{Gloss}, an automatically generated +descriptive text that establishes a direct and more delicate semantic +connection between spoken language and CS gestures. Moreover, we are the first +to suggest that rhythm is an important paralinguistic feature of CS that +improves communication efficacy, and we therefore propose a novel Audio-driven +Rhythmic Module (ARM) to learn rhythm that matches audio speech. In addition, +we design, record, and publish the first Chinese CS dataset with four CS cuers. +Extensive experiments demonstrate that our method quantitatively and +qualitatively outperforms current state-of-the-art (SOTA) methods. We release +the code and data at https://glossdiff.github.io/. + +
+
+
+
+
+ + ☆ C2FDrone: Coarse-to-Fine Drone-to-Drone Detection using Vision + Transformer Networks ICRA 2024 + + +
+ A vision-based drone-to-drone detection system is crucial for various +applications like collision avoidance, countering hostile drones, and +search-and-rescue operations. However, detecting drones presents unique +challenges, including small object sizes, distortion, occlusion, and real-time +processing requirements. Current methods integrating multi-scale feature fusion +and temporal information have limitations in handling extreme blur and +minuscule objects. To address this, we propose a novel coarse-to-fine detection +strategy based on vision transformers. We evaluate our approach on three +challenging drone-to-drone detection datasets, achieving F1 score enhancements +of 7%, 3%, and 1% on the FL-Drones, AOT, and NPS-Drones datasets, respectively. +Additionally, we demonstrate real-time processing capabilities by deploying our +model on an edge-computing device. Our code will be made publicly available. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ Mapping New Realities: Ground Truth Image Creation with Pix2Pix + Image-to-Image Translation + + +
+ Generative Adversarial Networks (GANs) have significantly advanced image +processing, with Pix2Pix being a notable framework for image-to-image +translation. This paper explores a novel application of Pix2Pix to transform +abstract map images into realistic ground truth images, addressing the scarcity +of such images crucial for domains like urban planning and autonomous vehicle +training. We detail the Pix2Pix model's utilization for generating +high-fidelity datasets, supported by a dataset of paired map and aerial images, +and enhanced by a tailored training regimen. The results demonstrate the +model's capability to accurately render complex urban features, establishing +its efficacy and potential for broad real-world applications. + +
+
+
+
+
+ + ☆ DELINE8K: A Synthetic Data Pipeline for the Semantic Segmentation of + Historical Documents + + +
+ Document semantic segmentation is a promising avenue that can facilitate +document analysis tasks, including optical character recognition (OCR), form +classification, and document editing. Although several synthetic datasets have +been developed to distinguish handwriting from printed text, they fall short in +class variety and document diversity. We demonstrate the limitations of +training on existing datasets when solving the National Archives Form Semantic +Segmentation dataset (NAFSS), a dataset which we introduce. To address these +limitations, we propose the most comprehensive document semantic segmentation +synthesis pipeline to date, incorporating preprinted text, handwriting, and +document backgrounds from over 10 sources to create the Document Element Layer +INtegration Ensemble 8K, or DELINE8K dataset. Our customized dataset exhibits +superior performance on the NAFSS benchmark, demonstrating it as a promising +tool in further research. The DELINE8K dataset is available at +https://github.com/Tahlor/deline8k. + +
+
+
+
+
+ + ☆ Enhancing Intrinsic Features for Debiasing via Investigating + Class-Discerning Common Attributes in Bias-Contrastive Pair CVPR 2024 + + +
+ In the image classification task, deep neural networks frequently rely on +bias attributes that are spuriously correlated with a target class in the +presence of dataset bias, resulting in degraded performance when applied to +data without bias attributes. The task of debiasing aims to compel classifiers +to learn intrinsic attributes that inherently define a target class rather than +focusing on bias attributes. While recent approaches mainly focus on +emphasizing the learning of data samples without bias attributes (i.e., +bias-conflicting samples) compared to samples with bias attributes (i.e., +bias-aligned samples), they fall short of directly guiding models where to +focus for learning intrinsic features. To address this limitation, this paper +proposes a method that provides the model with explicit spatial guidance that +indicates the region of intrinsic features. We first identify the intrinsic +features by investigating the class-discerning common features between a +bias-aligned (BA) sample and a bias-conflicting (BC) sample (i.e., +bias-contrastive pair). Next, we enhance the intrinsic features in the BA +sample that are relatively under-exploited for prediction compared to the BC +sample. To construct the bias-contrastive pair without using bias information, +we introduce a bias-negative score that distinguishes BC samples from BA +samples employing a biased model. The experiments demonstrate that our method +achieves state-of-the-art performance on synthetic and real-world datasets with +various levels of bias severity. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Transition Rate Scheduling for Quantization-Aware Training + + +
+ Quantization-aware training (QAT) simulates a quantization process during +training to lower bit-precision of weights/activations. It learns quantized +weights indirectly by updating latent weights, i.e., full-precision inputs to a +quantizer, using gradient-based optimizers. We claim that coupling a +user-defined learning rate (LR) with these optimizers is sub-optimal for QAT. +Quantized weights transit discrete levels of a quantizer, only if corresponding +latent weights pass transition points, where the quantizer changes discrete +states. This suggests that the changes of quantized weights are affected by +both the LR for latent weights and their distributions. It is thus difficult to +control the degree of changes for quantized weights by scheduling the LR +manually. We conjecture that the degree of parameter changes in QAT is related +to the number of quantized weights transiting discrete levels. Based on this, +we introduce a transition rate (TR) scheduling technique that controls the +number of transitions of quantized weights explicitly. Instead of scheduling a +LR for latent weights, we schedule a target TR of quantized weights, and update +the latent weights with a novel transition-adaptive LR (TALR), enabling +considering the degree of changes for the quantized weights during QAT. +Experimental results demonstrate the effectiveness of our approach on standard +benchmarks. + +
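+ A rough sketch of transition-rate tracking for a uniform quantizer is given below; the step size, the feedback rule, and all names are illustrative assumptions rather than the paper's TALR formulation.
+import torch
+
+def quantize_levels(latent: torch.Tensor, step: float = 0.05) -> torch.Tensor:
+    return torch.round(latent / step)              # integer level index per weight
+
+def transition_rate(before: torch.Tensor, after: torch.Tensor, step: float = 0.05) -> float:
+    changed = quantize_levels(before, step) != quantize_levels(after, step)
+    return changed.float().mean().item()           # fraction of weights that changed discrete level
+
+def adapt_step_size(lr: float, measured_tr: float, target_tr: float, gain: float = 0.5) -> float:
+    # Grow the latent-weight step when too few weights transition, shrink it otherwise.
+    return lr * (1.0 + gain * (target_tr - measured_tr))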
+
+ comment: Submitted to IEEE TPAMI on Apr. 03, 2023 +
+
+
+
+
+ + ☆ Improved AutoEncoder with LSTM module and KL divergence + + +
+ The task of anomaly detection is to separate anomalous data from normal data +in the dataset. Models such as the deep convolutional autoencoder (CAE) network +and the deep support vector data description (SVDD) model have been widely +employed and have demonstrated significant success in detecting anomalies. +However, the over-reconstruction ability of the CAE network for anomalous data +can easily lead to a high false negative rate in detecting anomalous data. On +the other hand, the deep SVDD model has the drawback of feature collapse, which +leads to a decrease in detection accuracy for anomalies. To address these +problems, we propose the Improved AutoEncoder with LSTM module and +Kullback-Leibler divergence (IAE-LSTM-KL) model in this paper. An LSTM network +is added after the encoder to memorize feature representations of normal data. +Meanwhile, the phenomenon of feature collapse can also be mitigated by +penalizing the feature input to the SVDD module via KL divergence. The efficacy +of the IAE-LSTM-KL model is validated through experiments on both synthetic and +real-world datasets. Experimental results show that the IAE-LSTM-KL model +yields higher detection accuracy for anomalies. In addition, the IAE-LSTM-KL +model demonstrates enhanced robustness to contaminated outliers in the +dataset. + +
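+ A minimal sketch of the described architecture, an encoder followed by an LSTM and a decoder with a KL-style penalty on the feature fed to the SVDD head; the input size (1x28x28), layer widths, and the uniform reference used in the KL term are assumptions, not the authors' exact model.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class IAELSTMSketch(nn.Module):
+    def __init__(self, feat_dim: int = 128):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            nn.Conv2d(1, 32, 3, stride=2, padding=1), nn.ReLU(),   # 28x28 -> 14x14
+            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),  # 14x14 -> 7x7
+            nn.Flatten(), nn.Linear(64 * 7 * 7, feat_dim))
+        self.lstm = nn.LSTM(feat_dim, feat_dim, batch_first=True)  # memorises normal-data features
+        self.decoder = nn.Sequential(
+            nn.Linear(feat_dim, 64 * 7 * 7), nn.ReLU(), nn.Unflatten(1, (64, 7, 7)),
+            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),
+            nn.ConvTranspose2d(32, 1, 4, stride=2, padding=1))
+
+    def forward(self, x):
+        z = self.encoder(x)
+        z, _ = self.lstm(z.unsqueeze(1))
+        z = z.squeeze(1)                                           # feature passed on to the SVDD head
+        recon = self.decoder(z)
+        # Stand-in KL penalty discouraging feature collapse; the paper's reference
+        # distribution and exact formulation may differ.
+        kl = F.kl_div(F.log_softmax(z, dim=1),
+                      torch.full_like(z, 1.0 / z.size(1)), reduction="batchmean")
+        return recon, z, kl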
+
+
+
+
+ + ☆ A Minimal Set of Parameters Based Depth-Dependent Distortion Model and + Its Calibration Method for Stereo Vision Systems + + +
+ Depth position highly affects lens distortion, especially in close-range +photography, which limits the measurement accuracy of existing stereo vision +systems. Moreover, traditional depth-dependent distortion models and their +calibration methods have remained complicated. In this work, we propose a +minimal set of parameters based depth-dependent distortion model (MDM), which +considers the radial and decentering distortions of the lens to improve the +accuracy of stereo vision systems and simplify their calibration process. In +addition, we present an easy and flexible calibration method for the MDM of +stereo vision systems with a commonly used planar pattern, which requires +cameras to observe the planar pattern in different orientations. The proposed +technique is easy to use and flexible compared with classical calibration +techniques for depth-dependent distortion models in which the lens must be +perpendicular to the planar pattern. The experimental validation of the MDM and +its calibration method showed that the MDM improved the calibration accuracy by +56.55% and 74.15% compared with Li's distortion model and the traditional +Brown distortion model. In addition, an iteration-based reconstruction method +is proposed to iteratively estimate the depth information in the MDM during +three-dimensional reconstruction. The results showed that the accuracy of the +iteration-based reconstruction method was improved by 9.08% compared with that +of the non-iterative reconstruction method. + +
+
+ comment: This paper has been accepted for publication in IEEE Transactions on + Instrumentation and Measurement +
+
+
+
+
+ + ☆ Espresso: Robust Concept Filtering in Text-to-Image Models + + +
+ Diffusion-based text-to-image (T2I) models generate high-fidelity images for +given textual prompts. They are trained on large datasets scraped from the +Internet, potentially containing unacceptable concepts (e.g., copyright +infringing or unsafe). Retraining T2I models after filtering out unacceptable +concepts in the training data is inefficient and degrades utility. Hence, there +is a need for concept removal techniques (CRTs) which are effective in removing +unacceptable concepts, utility-preserving on acceptable concepts, and robust +against evasion with adversarial prompts. None of the prior filtering and +fine-tuning CRTs satisfy all these requirements simultaneously. + We introduce Espresso, the first robust concept filter based on Contrastive +Language-Image Pre-Training (CLIP). It identifies unacceptable concepts by +projecting the generated image's embedding onto the vector connecting +unacceptable and acceptable concepts in the joint text-image embedding space. +This ensures robustness by restricting the adversary to adding noise only along +this vector, in the direction of the acceptable concept. Further fine-tuning +Espresso to separate embeddings of acceptable and unacceptable concepts, while +preserving their pairing with image embeddings, ensures both effectiveness and +utility. We evaluate Espresso on eleven concepts to show that it is effective +(~5% CLIP accuracy on unacceptable concepts), utility-preserving (~93% +normalized CLIP score on acceptable concepts), and robust (~4% CLIP accuracy on +adversarial prompts for unacceptable concepts). Finally, we present theoretical +bounds for the certified robustness of Espresso against adversarial prompts, +and an empirical analysis. + +
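+ The filtering rule described above can be sketched as a projection onto the concept direction in CLIP's joint embedding space; the normalisation and threshold choices below are assumptions for illustration, not the released Espresso code.
+import torch
+import torch.nn.functional as F
+
+def flags_unacceptable(img_emb, acceptable_emb, unacceptable_emb, thr: float = 0.5) -> bool:
+    """All inputs are 1-D CLIP embeddings (D,)."""
+    img = F.normalize(img_emb, dim=-1)
+    a = F.normalize(acceptable_emb, dim=-1)
+    u = F.normalize(unacceptable_emb, dim=-1)
+    direction = u - a                              # vector connecting the two concepts
+    t = torch.dot(img - a, direction) / direction.norm().pow(2)
+    return bool(t > thr)                           # image falls closer to the unacceptable end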
+
+
+
+
+ + ☆ Transcrib3D: 3D Referring Expression Resolution through Large Language + Models + + +
+ If robots are to work effectively alongside people, they must be able to +interpret natural language references to objects in their 3D environment. +Understanding 3D referring expressions is challenging -- it requires the +ability to both parse the 3D structure of the scene and correctly ground +free-form language in the presence of distraction and clutter. We introduce +Transcrib3D, an approach that brings together 3D detection methods and the +emergent reasoning capabilities of large language models (LLMs). Transcrib3D +uses text as the unifying medium, which allows us to sidestep the need to learn +shared representations connecting multi-modal inputs, which would require +massive amounts of annotated 3D data. As a demonstration of its effectiveness, +Transcrib3D achieves state-of-the-art results on 3D reference resolution +benchmarks, with a great leap in performance from previous multi-modality +baselines. To improve upon zero-shot performance and facilitate local +deployment on edge computers and robots, we propose self-correction for +fine-tuning that trains smaller models, resulting in performance close to that +of large models. We show that our method enables a real robot to perform +pick-and-place tasks given queries that contain challenging referring +expressions. Project site is at https://ripl.github.io/Transcrib3D. + +
+
+ comment: CORLW 2023 +
+
+
+
+
+ + ☆ TableVQA-Bench: A Visual Question Answering Benchmark on Multiple Table + Domains + + +
+ In this paper, we establish a benchmark for table visual question answering, +referred to as the TableVQA-Bench, derived from pre-existing table +question-answering (QA) and table structure recognition datasets. It is +important to note that existing datasets have not incorporated images or QA +pairs, which are two crucial components of TableVQA. As such, the primary +objective of this paper is to obtain these necessary components. Specifically, +images are sourced either through the application of a \textit{stylesheet} or +by employing the proposed table rendering system. QA pairs are generated by +exploiting the large language model (LLM) where the input is a text-formatted +table. Ultimately, the completed TableVQA-Bench comprises 1,500 QA pairs. We +comprehensively compare the performance of various multi-modal large language +models (MLLMs) on TableVQA-Bench. GPT-4V achieves the highest accuracy among +commercial and open-sourced MLLMs from our experiments. Moreover, we discover +that the number of vision queries plays a significant role in TableVQA +performance. To further analyze the capabilities of MLLMs in comparison to +their LLM backbones, we investigate by presenting image-formatted tables to +MLLMs and text-formatted tables to LLMs, respectively. Our findings suggest +that processing visual inputs is more challenging than text inputs, as +evidenced by the lower performance of MLLMs, despite generally requiring higher +computational costs than LLMs. The proposed TableVQA-Bench and evaluation codes +are available at +\href{https://github.com/naver-ai/tablevqabench}{https://github.com/naver-ai/tablevqabench}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ NeRF-Insert: 3D Local Editing with Multimodal Control Signals + + +
+ We propose NeRF-Insert, a NeRF editing framework that allows users to make +high-quality local edits with a flexible level of control. Unlike previous work +that relied on image-to-image models, we cast scene editing as an in-painting +problem, which encourages the global structure of the scene to be preserved. +Moreover, while most existing methods use only textual prompts to condition +edits, our framework accepts a combination of inputs of different modalities as +reference. More precisely, a user may provide a combination of textual and +visual inputs including images, CAD models, and binary image masks for +specifying a 3D region. We use generic image generation models to in-paint the +scene from multiple viewpoints, and lift the local edits to a 3D-consistent +NeRF edit. Compared to previous methods, our results show better visual quality +and also maintain stronger consistency with the original NeRF. + +
+
+
+
+
+ + ☆ Global Search Optics: Automatically Exploring Optimal Solutions to + Compact Computational Imaging Systems + + +
+ The popularity of mobile vision creates a demand for advanced compact +computational imaging systems, which call for the development of both a +lightweight optical system and an effective image reconstruction model. +Recently, joint design pipelines come to the research forefront, where the two +significant components are simultaneously optimized via data-driven learning to +realize the optimal system design. However, the effectiveness of these designs +largely depends on the initial setup of the optical system, complicated by a +non-convex solution space that impedes reaching a globally optimal solution. In +this work, we present Global Search Optics (GSO) to automatically design +compact computational imaging systems through two parts: (i) Fused Optimization +Method for Automatic Optical Design (OptiFusion), which searches for diverse +initial optical systems under certain design specifications; and (ii) Efficient +Physic-aware Joint Optimization (EPJO), which conducts parallel joint +optimization of initial optical systems and image reconstruction networks with +the consideration of physical constraints, culminating in the selection of the +optimal solution. Extensive experimental results on the design of three-piece +(3P) sphere computational imaging systems illustrate that the GSO serves as a +transformative end-to-end lens design paradigm for superior global optimal +structure searching ability, which provides compact computational imaging +systems with higher imaging quality compared to traditional methods. The source +code will be made publicly available at https://github.com/wumengshenyou/GSO. + +
+
+ comment: The source code will be made publicly available at + https://github.com/wumengshenyou/GSO +
+
+
+
+
+ + ☆ XFeat: Accelerated Features for Lightweight Image Matching CVPR 2024 + + +
+ We introduce a lightweight and accurate architecture for resource-efficient +visual correspondence. Our method, dubbed XFeat (Accelerated Features), +revisits fundamental design choices in convolutional neural networks for +detecting, extracting, and matching local features. Our new model satisfies a +critical need for fast and robust algorithms suitable to resource-limited +devices. In particular, accurate image matching requires sufficiently large +image resolutions - for this reason, we keep the resolution as large as +possible while limiting the number of channels in the network. Besides, our +model is designed to offer the choice of matching at the sparse or semi-dense +levels, each of which may be more suitable for different downstream +applications, such as visual navigation and augmented reality. Our model is the +first to offer semi-dense matching efficiently, leveraging a novel match +refinement module that relies on coarse local descriptors. XFeat is versatile +and hardware-independent, surpassing current deep learning-based local features +in speed (up to 5x faster) with comparable or better accuracy, proven in pose +estimation and visual localization. We showcase it running in real-time on an +inexpensive laptop CPU without specialized hardware optimizations. Code and +weights are available at www.verlab.dcc.ufmg.br/descriptors/xfeat_cvpr24. + +
+
+ comment: CVPR 2024; Source code available at + www.verlab.dcc.ufmg.br/descriptors/xfeat_cvpr24 +
+
+
+
+
+ + ☆ Explicit Correlation Learning for Generalizable Cross-Modal Deepfake + Detection ICME 2024 + + +
+ With the rising prevalence of deepfakes, there is a growing interest in +developing generalizable detection methods for various types of deepfakes. +While effective in their specific modalities, traditional detection methods +fall short in addressing the generalizability of detection across diverse +cross-modal deepfakes. This paper aims to explicitly learn potential +cross-modal correlation to enhance deepfake detection towards various +generation scenarios. Our approach introduces a correlation distillation task, +which models the inherent cross-modal correlation based on content information. +This strategy helps to prevent the model from overfitting merely to +audio-visual synchronization. Additionally, we present the Cross-Modal Deepfake +Dataset (CMDFD), a comprehensive dataset with four generation methods to +evaluate the detection of diverse cross-modal deepfakes. The experimental +results on CMDFD and FakeAVCeleb datasets demonstrate the superior +generalizability of our method over existing state-of-the-art methods. Our code +and data can be found at +\url{https://github.com/ljj898/CMDFD-Dataset-and-Deepfake-Detection}. + +
+
+ comment: accepted by ICME 2024 +
+
+
+
+
+ + ☆ PEVA-Net: Prompt-Enhanced View Aggregation Network for Zero/Few-Shot + Multi-View 3D Shape Recognition + + +
+ Large vision-language models have impressively promoted the performance of 2D +visual recognition under zero/few-shot scenarios. In this paper, we focus on +exploiting the large vision-language model, i.e., CLIP, to address +zero/few-shot 3D shape recognition based on multi-view representations. The key +challenge for both tasks is to generate a discriminative descriptor of the 3D +shape represented by multiple view images under the scenarios of either no +explicit training (zero-shot 3D shape recognition) or training with a limited +amount of data (few-shot 3D shape recognition). We observe that both tasks are +related and can be considered simultaneously. Specifically, leveraging the +descriptor which is effective for zero-shot inference to guide the tuning of +the aggregated descriptor under the few-shot training can significantly improve +the few-shot learning efficacy. Hence, we propose Prompt-Enhanced View +Aggregation Network (PEVA-Net) to simultaneously address zero/few-shot 3D shape +recognition. Under the zero-shot scenario, we propose to leverage the prompts +built up from candidate categories to enhance the aggregation process of +multiple view-associated visual features. The resulting aggregated feature +enables effective zero-shot recognition of the 3D shapes. Under the few-shot +scenario, we first exploit a transformer encoder to aggregate the +view-associated visual features into a global descriptor. To tune the encoder, +together with the main classification loss, we propose a self-distillation +scheme via a feature distillation loss by treating the zero-shot descriptor as +the guidance signal for the few-shot descriptor. This scheme can significantly +enhance the few-shot learning efficacy. + 
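As a rough illustration of the self-distillation scheme sketched in this abstract, the toy PyTorch snippet below combines a classification loss with a feature-distillation term that pulls the few-shot (aggregated) descriptor towards the detached zero-shot descriptor. The function name, the cosine-based distillation term, the weighting, and all tensor shapes are illustrative assumptions, not the authors' implementation.

import torch
import torch.nn.functional as F

def peva_style_loss(fewshot_desc, zeroshot_desc, logits, labels, alpha=0.5):
    """Main classification loss plus a feature-distillation term that treats
    the (detached) zero-shot descriptor as the guidance signal."""
    cls_loss = F.cross_entropy(logits, labels)
    # Pull the few-shot descriptor towards the zero-shot one (cosine distance
    # is an assumed choice of distillation metric).
    distill_loss = 1.0 - F.cosine_similarity(
        fewshot_desc, zeroshot_desc.detach(), dim=-1).mean()
    return cls_loss + alpha * distill_loss

# Toy usage with random tensors standing in for view-aggregated features.
B, D, C = 4, 512, 40
loss = peva_style_loss(torch.randn(B, D), torch.randn(B, D),
                       torch.randn(B, C), torch.randint(0, C, (B,)))
print(loss.item())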
+
+
+
+
+ + ☆ Semantically Consistent Video Inpainting with Conditional Diffusion + Models + + +
+ Current state-of-the-art methods for video inpainting typically rely on +optical flow or attention-based approaches to inpaint masked regions by +propagating visual information across frames. While such approaches have led to +significant progress on standard benchmarks, they struggle with tasks that +require the synthesis of novel content that is not present in other frames. In +this paper we reframe video inpainting as a conditional generative modeling +problem and present a framework for solving such problems with conditional +video diffusion models. We highlight the advantages of using a generative +approach for this task, showing that our method is capable of generating +diverse, high-quality inpaintings and synthesizing new content that is +spatially, temporally, and semantically consistent with the provided context. + +
+
+
+
+
+ + ☆ SemVecNet: Generalizable Vector Map Generation for Arbitrary Sensor + Configurations + + +
+ Vector maps are essential in autonomous driving for tasks like localization +and planning, yet their creation and maintenance are notably costly. While +recent advances in online vector map generation for autonomous vehicles are +promising, current models lack adaptability to different sensor configurations. +They tend to overfit to specific sensor poses, leading to decreased performance +and higher retraining costs. This limitation hampers their practical use in +real-world applications. In response to this challenge, we propose a modular +pipeline for vector map generation with improved generalization to sensor +configurations. The pipeline leverages probabilistic semantic mapping to +generate a bird's-eye-view (BEV) semantic map as an intermediate +representation. This intermediate representation is then converted to a vector +map using the MapTRv2 decoder. By adopting a BEV semantic map robust to +different sensor configurations, our proposed approach significantly improves +the generalization performance. We evaluate the model on datasets with sensor +configurations not used during training. Our evaluation sets include larger +public datasets and smaller-scale private data collected on our platform. Our +model generalizes significantly better than the state-of-the-art methods. + 
+
+ comment: 8 pages, 6 figures, Accepted to IV 2024 +
+
+
+
+
+ + ☆ Towards Real-World HDR Video Reconstruction: A Large-Scale Benchmark + Dataset and A Two-Stage Alignment Network CVPR 2024 + + +
+ As an important and practical way to obtain high dynamic range (HDR) video, +HDR video reconstruction from sequences with alternating exposures is still +less explored, mainly due to the lack of large-scale real-world datasets. +Existing methods are mostly trained on synthetic datasets, which perform poorly +in real scenes. In this work, to facilitate the development of real-world HDR +video reconstruction, we present Real-HDRV, a large-scale real-world benchmark +dataset for HDR video reconstruction, featuring various scenes, diverse motion +patterns, and high-quality labels. Specifically, our dataset contains 500 +LDRs-HDRs video pairs, comprising about 28,000 LDR frames and 4,000 HDR labels, +covering daytime, nighttime, indoor, and outdoor scenes. To our best knowledge, +our dataset is the largest real-world HDR video reconstruction dataset. +Correspondingly, we propose an end-to-end network for HDR video reconstruction, +where a novel two-stage strategy is designed to perform alignment sequentially. +Specifically, the first stage performs global alignment with the adaptively +estimated global offsets, reducing the difficulty of subsequent alignment. The +second stage implicitly performs local alignment in a coarse-to-fine manner at +the feature level using the adaptive separable convolution. Extensive +experiments demonstrate that: (1) models trained on our dataset can achieve +better performance on real scenes than those trained on synthetic datasets; (2) +our method outperforms previous state-of-the-art methods. Our dataset is +available at https://github.com/yungsyu99/Real-HDRV. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Guiding Attention in End-to-End Driving Models + + +
+ Vision-based end-to-end driving models trained by imitation learning can lead +to affordable solutions for autonomous driving. However, training these +well-performing models usually requires a huge amount of data, while still +lacking explicit and intuitive activation maps to reveal the inner workings of +these models while driving. In this paper, we study how to guide the attention +of these models to improve their driving quality and obtain more intuitive +activation maps by adding a loss term during training using salient semantic +maps. In contrast to previous work, our method does not require these salient +semantic maps to be available during testing time, and it removes the need +to modify the architecture of the model to which it is applied. We perform tests +using both perfect and noisy salient semantic maps, the latter inspired by the +errors likely to be encountered with real data, and obtain encouraging results in both cases. +Using CIL++ as a representative state-of-the-art model and the CARLA simulator +with its standard benchmarks, we conduct experiments that show the +effectiveness of our method in training better autonomous driving models, +especially when data and computational resources are scarce. + 
+
+ comment: Accepted for publication at the 35th IEEE Intelligent Vehicles + Symposium (IV 2024) +
+
+
+
+
+ + ☆ IgCONDA-PET: Implicitly-Guided Counterfactual Diffusion for Detecting + Anomalies in PET Images + + +
+ Minimizing the need for pixel-level annotated data for training PET anomaly +segmentation networks is crucial, particularly due to time and cost constraints +related to expert annotations. Current un-/weakly-supervised anomaly detection +methods rely on autoencoder or generative adversarial networks trained only on +healthy data, although these are more challenging to train. In this work, we +present a weakly supervised and Implicitly guided COuNterfactual diffusion +model for Detecting Anomalies in PET images, branded as IgCONDA-PET. The +training is conditioned on image class labels (healthy vs. unhealthy) along +with implicit guidance to generate counterfactuals for an unhealthy image with +anomalies. The counterfactual generation process synthesizes the healthy +counterpart for a given unhealthy image, and the difference between the two +facilitates the identification of anomaly locations. The code is available at: +https://github.com/igcondapet/IgCONDA-PET.git + +
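The counterfactual-based localization described in this abstract can be pictured with a short sketch: once a healthy counterpart has been synthesized for an unhealthy PET image, an anomaly map is derived from the residual between the two. The absolute difference and min-max normalization below are illustrative choices, not necessarily what the paper uses.

import torch

def anomaly_map(unhealthy_img, healthy_counterfactual):
    """Localize anomalies as the normalized residual between an unhealthy
    image and its synthesized healthy counterfactual."""
    diff = (unhealthy_img - healthy_counterfactual).abs()
    # Min-max normalize to [0, 1] so the map can be thresholded or visualized.
    return (diff - diff.min()) / (diff.max() - diff.min() + 1e-8)

amap = anomaly_map(torch.rand(1, 1, 128, 128), torch.rand(1, 1, 128, 128))
print(amap.shape, float(amap.max()))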
+
+ comment: 12 pages, 6 figures, 1 table +
+
+
+
+
+ + ☆ STT: Stateful Tracking with Transformers for Autonomous Driving ICRA 2024 + + +
+ Tracking objects in three-dimensional space is critical for autonomous +driving. To ensure safety while driving, the tracker must be able to reliably +track objects across frames and accurately estimate their states such as +velocity and acceleration in the present. Existing works frequently focus on +the association task while either neglecting the model performance on state +estimation or deploying complex heuristics to predict the states. In this +paper, we propose STT, a Stateful Tracking model built with Transformers, that +can consistently track objects in the scenes while also predicting their states +accurately. STT consumes rich appearance, geometry, and motion signals through +long term history of detections and is jointly optimized for both data +association and state estimation tasks. Since the standard tracking metrics +like MOTA and MOTP do not capture the combined performance of the two tasks in +the wider spectrum of object states, we extend them with new metrics called +S-MOTA and MOTPS that address this limitation. STT achieves competitive +real-time performance on the Waymo Open Dataset. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ Synthetic Face Datasets Generation via Latent Space Exploration from + Brownian Identity Diffusion + + +
+ Face Recognition (FR) models are trained on large-scale datasets, which raise +privacy and ethical concerns. Lately, the use of synthetic data to complement +or replace genuine data for the training of FR models has been proposed. While +promising results have been obtained, it still remains unclear if generative +models can yield diverse enough data for such tasks. In this work, we introduce +a new method, inspired by the physical motion of soft particles subjected to +stochastic Brownian forces, allowing us to sample identity distributions in a +latent space under various constraints. With this in hand, we generate several +face datasets and benchmark them by training FR models, showing that data +generated with our method exceeds the performance of previous GAN-based +datasets and achieves competitive performance with state-of-the-art +diffusion-based synthetic datasets. We also show that this method can be used +to mitigate leakage from the generator's training set and explore the ability +of generative models to generate data beyond it. + 
+
+ comment: 17 pages, 7 figures, 10 tables +
+
+
+
+
+ + ☆ Synthetic Image Verification in the Era of Generative AI: What Works and + What Isn't There Yet + + +
+ In this work we present an overview of approaches for the detection and +attribution of synthetic images and highlight their strengths and weaknesses. +We also point out and discuss hot topics in this field and outline promising +directions for future research. + +
+
+
+
+
+ + ☆ Towards End-to-End Semi-Supervised Table Detection with Semantic Aligned + Matching Transformer ICDAR 2024 + + +
+ Table detection within document images is a crucial task in document +processing, involving the identification and localization of tables. Recent +strides in deep learning have substantially improved the accuracy of this task, +but it still heavily relies on large labeled datasets for effective training. +Several semi-supervised approaches have emerged to overcome this challenge, +often employing CNN-based detectors with anchor proposals and post-processing +techniques like non-maximal suppression (NMS). However, recent advancements in +the field have shifted the focus towards transformer-based techniques, +eliminating the need for NMS and emphasizing object queries and attention +mechanisms. Previous research has focused on two key areas to improve +transformer-based detectors: refining the quality of object queries and +optimizing attention mechanisms. However, increasing object queries can +introduce redundancy, while adjustments to the attention mechanism can increase +complexity. To address these challenges, we introduce a semi-supervised +approach employing SAM-DETR, a novel method for precise alignment between +object queries and target features. Our approach demonstrates remarkable +reductions in false positives and substantial enhancements in table detection +performance, particularly in complex documents characterized by diverse table +structures. This work provides more efficient and accurate table detection in +semi-supervised settings. + 
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ Uncovering What, Why and How: A Comprehensive Benchmark for Causation + Understanding of Video Anomaly + + +
+ Video anomaly understanding (VAU) aims to automatically comprehend unusual +occurrences in videos, thereby enabling various applications such as traffic +surveillance and industrial manufacturing. While existing VAU benchmarks +primarily concentrate on anomaly detection and localization, our focus is on +more practicality, prompting us to raise the following crucial questions: "what +anomaly occurred?", "why did it happen?", and "how severe is this abnormal +event?". In pursuit of these answers, we present a comprehensive benchmark for +Causation Understanding of Video Anomaly (CUVA). Specifically, each instance of +the proposed benchmark involves three sets of human annotations to indicate the +"what", "why" and "how" of an anomaly, including 1) anomaly type, start and end +times, and event descriptions, 2) natural language explanations for the cause +of an anomaly, and 3) free text reflecting the effect of the abnormality. In +addition, we also introduce MMEval, a novel evaluation metric designed to +better align with human preferences for CUVA, facilitating the measurement of +existing LLMs in comprehending the underlying cause and corresponding effect of +video anomalies. Finally, we propose a novel prompt-based method that can serve +as a baseline approach for the challenging CUVA. We conduct extensive +experiments to show the superiority of our evaluation metric and the +prompt-based approach. Our code and dataset are available at +https://github.com/fesvhtr/CUVA. + +
+
+ comment: Codebase: https://github.com/fesvhtr/CUVA +
+
+
+
+
+ + ☆ Revisiting RGBT Tracking Benchmarks from the Perspective of Modality + Validity: A New Benchmark, Problem, and Method + + +
+ RGBT tracking draws increasing attention due to its robustness in +multi-modality warranting (MMW) scenarios, such as nighttime and bad weather, +where relying on a single sensing modality fails to ensure stable tracking +results. However, the existing benchmarks predominantly consist of videos +collected in common scenarios where both RGB and thermal infrared (TIR) +information are of sufficient quality. This makes the data unrepresentative of +severe imaging conditions, leading to tracking failures in MMW scenarios. To +bridge this gap, we present a new benchmark, MV-RGBT, captured specifically in +MMW scenarios. In contrast with the existing datasets, MV-RGBT comprises more +object categories and scenes, providing a diverse and challenging benchmark. +Furthermore, for severe imaging conditions of MMW scenarios, a new problem is +posed, namely \textit{when to fuse}, to stimulate the development of fusion +strategies for such data. We propose a new method based on a mixture of +experts, namely MoETrack, as a baseline fusion strategy. In MoETrack, each +expert generates independent tracking results along with the corresponding +confidence score, which is used to control the fusion process. Extensive +experimental results demonstrate the significant potential of MV-RGBT in +advancing RGBT tracking and elicit the conclusion that fusion is not always +beneficial, especially in MMW scenarios. Significantly, the proposed MoETrack +method achieves new state-of-the-art results not only on MV-RGBT, but also on +standard benchmarks, such as RGBT234, LasHeR, and the short-term split of VTUAV +(VTUAV-ST). More information of MV-RGBT and the source code of MoETrack will be +released at https://github.com/Zhangyong-Tang/MoETrack. + +
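The confidence-controlled fusion described for MoETrack can be pictured with a small sketch: each expert contributes a box prediction and a confidence score, low-confidence experts are gated out, and the rest are averaged with confidence weights. The gating rule and threshold below are assumptions made for illustration, not the paper's fusion strategy.

import numpy as np

def fuse_expert_boxes(boxes, confidences, tau=0.4):
    """Confidence-controlled fusion of per-expert tracking results.

    boxes:       (E, 4) array of [x, y, w, h] predictions from E experts
    confidences: length-E array of scalar scores controlling the fusion
    """
    boxes = np.asarray(boxes, dtype=float)
    conf = np.asarray(confidences, dtype=float)
    keep = conf >= tau
    if not keep.any():                  # all experts unsure: trust the best one
        keep = conf == conf.max()
    w = conf[keep] / conf[keep].sum()   # confidence-weighted average
    return (boxes[keep] * w[:, None]).sum(axis=0)

# Example: a confident RGB expert and an unsure TIR expert.
print(fuse_expert_boxes([[10, 10, 50, 80], [14, 12, 48, 78]], [0.9, 0.2]))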
+
+
+
+
+ + ☆ Expanding the Horizon: Enabling Hybrid Quantum Transfer Learning for + Long-Tailed Chest X-Ray Classification + + +
+ Quantum machine learning (QML) has the potential for improving the +multi-label classification of rare, albeit critical, diseases in large-scale +chest x-ray (CXR) datasets due to theoretical quantum advantages over classical +machine learning (CML) in sample efficiency and generalizability. While prior +literature has explored QML with CXRs, it has focused on binary classification +tasks with small datasets due to limited access to quantum hardware and +computationally expensive simulations. To that end, we implemented a Jax-based +framework that enables the simulation of medium-sized qubit architectures with +significant improvements in wall-clock time over current software offerings. We +evaluated the performance of our Jax-based framework in terms of efficiency and +performance for hybrid quantum transfer learning for long-tailed classification +across 8, 14, and 19 disease labels using large-scale CXR datasets. The +Jax-based framework resulted in up to a 58% and 95% speed-up compared to +PyTorch and TensorFlow implementations, respectively. However, compared to CML, +QML demonstrated slower convergence and an average AUROC of 0.70, 0.73, and +0.74 for the classification of 8, 14, and 19 CXR disease labels. In comparison, +the CML models had an average AUROC of 0.77, 0.78, and 0.80 respectively. In +conclusion, our work presents an accessible implementation of hybrid quantum +transfer learning for long-tailed CXR classification with a computationally +efficient Jax-based framework. + +
+
+ comment: 11 pages, 13 figures, 3 tables +
+
+
+
+
+ + ☆ GUing: A Mobile GUI Search Engine using a Vision-Language Model + + +
+ App developers use the Graphical User Interface (GUI) of other apps as an +important source of inspiration to design and improve their own apps. In recent +years, research suggested various approaches to retrieve GUI designs that fit a +certain text query from screenshot datasets acquired through automated GUI +exploration. However, such text-to-GUI retrieval approaches only leverage the +textual information of the GUI elements in the screenshots, neglecting visual +information such as icons or background images. In addition, the retrieved +screenshots are not steered by app developers and often lack important app +features, e.g. whose UI pages require user authentication. To overcome these +limitations, this paper proposes GUing, a GUI search engine based on a +vision-language model called UIClip, which we trained specifically for the app +GUI domain. For this, we first collected app introduction images from Google +Play, which usually display the most representative screenshots selected and +often captioned (i.e. labeled) by app vendors. Then, we developed an automated +pipeline to classify, crop, and extract the captions from these images. This +finally results in a large dataset which we share with this paper: including +303k app screenshots, out of which 135k have captions. We used this dataset to +train a novel vision-language model, which is, to the best of our knowledge, +the first of its kind in GUI retrieval. We evaluated our approach on various +datasets from related work and in manual experiment. The results demonstrate +that our model outperforms previous approaches in text-to-GUI retrieval +achieving a Recall@10 of up to 0.69 and a HIT@10 of 0.91. We also explored the +performance of UIClip for other GUI tasks including GUI classification and +Sketch-to-GUI retrieval with encouraging results. + +
+
+
+
+
+ + ☆ Utilizing Machine Learning and 3D Neuroimaging to Predict Hearing Loss: + A Comparative Analysis of Dimensionality Reduction and Regression Techniques + + +
+ In this project, we have explored machine learning approaches for predicting +hearing loss thresholds on the brain's gray matter 3D images. We have solved +the problem statement in two phases. In the first phase, we used a 3D CNN model +to reduce high-dimensional input into latent space and decode it into an +original image to represent the input in rich feature space. In the second +phase, we utilized this model to reduce input into rich features and used these +features to train standard machine learning models for predicting hearing +thresholds. We have experimented with autoencoders and variational autoencoders +in the first phase for dimensionality reduction and explored random forest, +XGBoost and multi-layer perceptron for regressing the thresholds. We split the +given data set into training and testing sets and achieved an 8.80 range and +22.57 range for PT500 and PT4000 on the test set, respectively. We got the +lowest RMSE using multi-layer perceptron among the other models. + Our approach leverages the unique capabilities of VAEs to capture complex, +non-linear relationships within high-dimensional neuroimaging data. We +rigorously evaluated the models using various metrics, focusing on the root +mean squared error (RMSE). The results highlight the efficacy of the +multi-layer neural network model, which outperformed other techniques in terms +of accuracy. This project advances the application of data mining in medical +diagnostics and enhances our understanding of age-related hearing loss through +innovative machine-learning frameworks. + +
+
+
+
+
+ + ☆ A Flexible 2.5D Medical Image Segmentation Approach with In-Slice and + Cross-Slice Attention + + +
+ Deep learning has become the de facto method for medical image segmentation, +with 3D segmentation models excelling in capturing complex 3D structures and 2D +models offering high computational efficiency. However, segmenting 2.5D images, +which have high in-plane but low through-plane resolution, is a relatively +unexplored challenge. While applying 2D models to individual slices of a 2.5D +image is feasible, it fails to capture the spatial relationships between +slices. On the other hand, 3D models face challenges such as resolution +inconsistencies in 2.5D images, along with computational complexity and +susceptibility to overfitting when trained with limited data. In this context, +2.5D models, which capture inter-slice correlations using only 2D neural +networks, emerge as a promising solution due to their reduced computational +demand and simplicity in implementation. In this paper, we introduce CSA-Net, a +flexible 2.5D segmentation model capable of processing 2.5D images with an +arbitrary number of slices through an innovative Cross-Slice Attention (CSA) +module. This module uses the cross-slice attention mechanism to effectively +capture 3D spatial information by learning long-range dependencies between the +center slice (for segmentation) and its neighboring slices. Moreover, CSA-Net +utilizes the self-attention mechanism to understand correlations among pixels +within the center slice. We evaluated CSA-Net on three 2.5D segmentation tasks: +(1) multi-class brain MRI segmentation, (2) binary prostate MRI segmentation, +and (3) multi-class prostate MRI segmentation. CSA-Net outperformed leading 2D +and 2.5D segmentation methods across all three tasks, demonstrating its +efficacy and superiority. Our code is publicly available at +https://github.com/mirthAI/CSA-Net. + +
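A minimal sketch of a cross-slice attention block in the spirit of the CSA module described above: tokens from the center slice attend to tokens from an arbitrary number of neighboring slices. The single nn.MultiheadAttention layer, the residual connection, and all shapes are assumptions for illustration, not the CSA-Net code.

import torch
import torch.nn as nn

class CrossSliceAttention(nn.Module):
    """Center-slice tokens query tokens gathered from neighboring slices."""
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, center_tokens, neighbor_tokens):
        # center_tokens:   (B, N, dim)   tokens of the slice being segmented
        # neighbor_tokens: (B, S*N, dim) tokens of S neighboring slices
        ctx, _ = self.attn(query=center_tokens,
                           key=neighbor_tokens, value=neighbor_tokens)
        return self.norm(center_tokens + ctx)   # residual + normalization

x = torch.randn(2, 196, 256)           # center slice tokens
nbrs = torch.randn(2, 4 * 196, 256)    # 4 neighboring slices
print(CrossSliceAttention()(x, nbrs).shape)   # torch.Size([2, 196, 256])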
+
+
+
+
+ + ☆ Training a high-performance retinal foundation model with half-the-data + and 400 times less compute + + +
+ Artificial Intelligence holds tremendous potential in medicine, but is +traditionally limited by the lack of massive datasets to train models on. +Foundation models, pre-trained models that can be adapted to downstream tasks +with small datasets, could alleviate this problem. Researchers at Moorfields +Eye Hospital (MEH) proposed RETFound-MEH, a foundation model for retinal +imaging that was trained on 900,000 images, including private hospital data. +Recently, data-efficient DERETFound was proposed that provides comparable +performance while being trained on only 150,000 images that are all publicly +available. However, both these models required very substantial resources to +train initially and are resource-intensive in downstream use. We propose a +novel Token Reconstruction objective that we use to train RETFound-Green, a +retinal foundation model trained using only 75,000 publicly available images +and 400 times less compute. We estimate the cost of training RETFound-MEH and +DERETFound at $10,000 and $14,000, respectively, while RETFound-Green could be +trained for less than $100, with equally reduced environmental impact. +RETFound-Green is also far more efficient in downstream use: it can be +downloaded 14 times faster, computes vector embeddings 2.7 times faster which +then require 2.6 times less storage space. Despite this, RETFound-Green does +not perform systematically worse. In fact, it performs best on 14 tasks, +compared to six for DERETFound and two for RETFound-MEH. Our results suggest +that RETFound-Green is a very efficient, high-performance retinal foundation +model. We anticipate that our Token Reconstruction objective could be scaled up +for even higher performance and be applied to other domains beyond retinal +imaging. + +
+
+
+
+
+ + ☆ Modeling Caption Diversity in Contrastive Vision-Language Pretraining + + +
+ There are a thousand ways to caption an image. Contrastive Language-Image +Pretraining (CLIP), on the other hand, works by mapping an image and its caption +to a single vector -- limiting how well CLIP-like models can represent the +diverse ways to describe an image. In this work, we introduce Llip, Latent +Language Image Pretraining, which models the diversity of captions that could +match an image. Llip's vision encoder outputs a set of visual features that are +mixed into a final representation by conditioning on information derived from +the text. We show that Llip outperforms non-contextualized baselines like CLIP +and SigLIP on a variety of tasks even with large-scale encoders. Llip improves +zero-shot classification by an average of 2.9% across zero-shot classification +benchmarks with a ViT-G/14 encoder. Specifically, Llip attains a zero-shot +top-1 accuracy of 83.5% on ImageNet, outperforming a similarly sized CLIP by +1.4%. We also demonstrate a 6.0% improvement on zero-shot retrieval on MS-COCO. +We provide a comprehensive analysis of the components introduced by the +method and demonstrate that Llip leads to richer visual representations. + 
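The text-conditioned mixing described in this abstract can be sketched as cross-attention pooling: a caption-derived query mixes the set of visual features emitted by the image encoder into a single representation. The dimensions, the single attention layer, and the class name are illustrative assumptions, not the Llip architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TextConditionedPooling(nn.Module):
    """Mix a set of visual features into one vector, conditioned on the text."""
    def __init__(self, dim=768, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, visual_tokens, text_emb):
        # visual_tokens: (B, K, dim) mixture components from the image encoder
        # text_emb:      (B, dim)    pooled caption embedding used as the query
        q = text_emb.unsqueeze(1)                        # (B, 1, dim)
        mixed, _ = self.attn(q, visual_tokens, visual_tokens)
        return F.normalize(mixed.squeeze(1), dim=-1)     # contrastive-ready

img_feats, cap = torch.randn(8, 64, 768), torch.randn(8, 768)
print(TextConditionedPooling()(img_feats, cap).shape)    # torch.Size([8, 768])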
+
+ comment: 14 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Why does Knowledge Distillation Work? Rethink its Attention and Fidelity + Mechanism + + +
+ Does Knowledge Distillation (KD) really work? Conventional wisdom viewed it +as a knowledge transfer procedure where perfect mimicry of the teacher by the +student is desired. However, paradoxical studies indicate that closely +replicating the teacher's behavior does not consistently improve student +generalization, raising questions about its possible causes. Confronted with this +gap, we hypothesize that diverse attentions in teachers contribute to better +student generalization at the expense of reduced fidelity in ensemble KD +setups. By increasing data augmentation strengths, our key findings reveal a +decrease in the Intersection over Union (IoU) of attentions between teacher +models, leading to reduced student overfitting and decreased fidelity. We +propose this low-fidelity phenomenon as an underlying characteristic rather +than a pathology when training KD. This suggests that stronger data +augmentation fosters a broader perspective provided by the divergent teacher +ensemble and lower student-teacher mutual information, benefiting +generalization performance. These insights clarify the mechanism behind the +low-fidelity phenomenon in KD. Thus, we offer new perspectives on optimizing +student model performance by emphasizing increased diversity in teacher +attentions and reduced mimicry behavior between teachers and the student. + 
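The attention-overlap measurement mentioned above (IoU of attentions between teacher models) can be sketched as follows: binarize each teacher's attention map by keeping its most-activated pixels, then compute the IoU of the resulting masks. The top-k binarization rule and keep ratio are assumptions; the paper may binarize differently.

import torch

def attention_iou(attn_a, attn_b, keep_ratio=0.1):
    """IoU of binarized (B, H, W) attention maps from two teacher models."""
    def topk_mask(a):
        flat = a.flatten(1)                            # (B, H*W)
        k = max(1, int(keep_ratio * flat.shape[1]))
        thresh = flat.topk(k, dim=1).values[:, -1:]    # per-sample threshold
        return flat >= thresh
    ma, mb = topk_mask(attn_a), topk_mask(attn_b)
    inter = (ma & mb).sum(1).float()
    union = (ma | mb).sum(1).float().clamp(min=1)
    return (inter / union).mean()

print(attention_iou(torch.rand(4, 14, 14), torch.rand(4, 14, 14)))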
+
+
+
+
+ + ♻ ☆ UnScene3D: Unsupervised 3D Instance Segmentation for Indoor Scenes CVPR24 + + +
+ 3D instance segmentation is fundamental to geometric understanding of the +world around us. Existing methods for instance segmentation of 3D scenes rely +on supervision from expensive, manual 3D annotations. We propose UnScene3D, the +first fully unsupervised 3D learning approach for class-agnostic 3D instance +segmentation of indoor scans. UnScene3D first generates pseudo masks by +leveraging self-supervised color and geometry features to find potential object +regions. We operate on a basis of geometric oversegmentation, enabling +efficient representation and learning on high-resolution 3D data. The coarse +proposals are then refined by self-training our model on its predictions. +Our approach improves over state-of-the-art unsupervised 3D instance +segmentation methods by more than 300% in Average Precision, demonstrating +effective instance segmentation even in challenging, cluttered 3D scenes. + 
+
+ comment: Project page: https://rozdavid.github.io/unscene3d, paper updated + according to CVPR24 camera ready version +
+
+
+
+
+ + ♻ ☆ Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot + Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) and CLIP are remarkable vision foundation +models (VFMs). SAM, a prompt-driven segmentation model, excels in segmentation +tasks across diverse domains, while CLIP is renowned for its zero-shot +recognition capabilities. However, their unified potential has not yet been +explored in medical image segmentation. To adapt SAM to medical imaging, +existing methods primarily rely on tuning strategies that require extensive +data or prior prompts tailored to the specific task, making it particularly +challenging when only a limited number of data samples are available. This work +presents an in-depth exploration of integrating SAM and CLIP into a unified +framework for medical image segmentation. Specifically, we propose a simple +unified framework, SaLIP, for organ segmentation. Initially, SAM is used for +part-based segmentation within the image, followed by CLIP to retrieve the mask +corresponding to the region of interest (ROI) from the pool of SAM generated +masks. Finally, SAM is prompted by the retrieved ROI to segment a specific +organ. Thus, SaLIP is training- and fine-tuning-free and does not rely on domain +expertise or labeled data for prompt engineering. Our method shows substantial +enhancements in zero-shot segmentation, showcasing notable improvements in DICE +scores across diverse segmentation tasks like brain (63.46%), lung (50.11%), +and fetal head (30.82%), when compared to unprompted SAM. Code and text +prompts are available at: https://github.com/aleemsidra/SaLIP. + 
+
+
+
+
+ + ♻ ☆ Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder ICME + + +
+ Automatic lip-reading (ALR) aims to automatically transcribe spoken content +from a speaker's silent lip motion captured in video. Current mainstream +lip-reading approaches only use a single visual encoder to model input videos +of a single scale. In this paper, we propose to enhance lip-reading by +incorporating multi-scale video data and multi-encoder. Specifically, we first +propose a novel multi-scale lip motion extraction algorithm based on the size +of the speaker's face and an Enhanced ResNet3D visual front-end (VFE) to +extract lip features at different scales. For the multi-encoder, in addition to +the mainstream Transformer and Conformer, we also incorporate the recently +proposed Branchformer and E-Branchformer as visual encoders. In the +experiments, we explore the influence of different video data scales and +encoders on ALR system performance and fuse the texts transcribed by all ALR +systems using recognizer output voting error reduction (ROVER). Finally, our +proposed approach placed second in the ICME 2024 ChatCLR Challenge Task 2, with +a 21.52% reduction in character error rate (CER) compared to the official +baseline on the evaluation set. + +
+
+ comment: 6 pages, 3 figures, Accepted at ICMEW 2024 +
+
+
+
+
+ + ♻ ☆ ShadowMaskFormer: Mask Augmented Patch Embeddings for Shadow Removal + + +
+ Transformer recently emerged as the de facto model for computer vision tasks +and has also been successfully applied to shadow removal. However, these +existing methods heavily rely on intricate modifications to the attention +mechanisms within the transformer blocks while using a generic patch embedding. +As a result, it often leads to complex architectural designs requiring +additional computation resources. In this work, we aim to explore the efficacy +of incorporating shadow information within the early processing stage. +Accordingly, we propose a transformer-based framework with a novel patch +embedding that is tailored for shadow removal, dubbed ShadowMaskFormer. +Specifically, we present a simple and effective mask-augmented patch embedding +to integrate shadow information and promote the model's emphasis on acquiring +knowledge for shadow regions. Extensive experiments conducted on the ISTD, +ISTD+, and SRD benchmark datasets demonstrate the efficacy of our method +against state-of-the-art approaches while using fewer model parameters. + +
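One plausible reading of a mask-augmented patch embedding, sketched below, concatenates the binary shadow mask to the RGB input as a fourth channel before the usual strided-convolution patch projection. This is an illustration of the general idea under that assumption, not the ShadowMaskFormer implementation.

import torch
import torch.nn as nn

class MaskAugmentedPatchEmbed(nn.Module):
    """Patch embedding that takes the shadow mask as an extra input channel."""
    def __init__(self, dim=96, patch=4):
        super().__init__()
        # 3 RGB channels + 1 shadow-mask channel -> token dimension `dim`.
        self.proj = nn.Conv2d(4, dim, kernel_size=patch, stride=patch)

    def forward(self, rgb, shadow_mask):
        # rgb: (B, 3, H, W), shadow_mask: (B, 1, H, W) with 1 = shadow pixel
        x = torch.cat([rgb, shadow_mask], dim=1)
        return self.proj(x).flatten(2).transpose(1, 2)   # (B, num_patches, dim)

tokens = MaskAugmentedPatchEmbed()(torch.randn(1, 3, 256, 256),
                                   torch.ones(1, 1, 256, 256))
print(tokens.shape)    # torch.Size([1, 4096, 96])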
+
+
+
+
+ + ♻ ☆ Just Say the Name: Online Continual Learning with Category Names Only + via Data Generation + + +
+ In real-world scenarios, extensive manual annotation for continual learning +is impractical due to prohibitive costs. Although prior arts, influenced by +large-scale webly supervised training, suggest leveraging web-scraped data in +continual learning, this poses challenges such as data imbalance, usage +restrictions, and privacy concerns. Addressing the risks of continual webly +supervised training, we present an online continual learning framework - +Generative Name only Continual Learning (G-NoCL). The proposed G-NoCL uses a +set of generators G along with the learner. When encountering new concepts +(i.e., classes), G-NoCL employs the novel sample complexity-guided data +ensembling technique DIverSity and COmplexity enhancing ensemBlER (DISCOBER) to +optimally sample training data from generated data. Through extensive +experimentation, we demonstrate superior performance of DISCOBER in G-NoCL +online CL benchmarks, covering both In-Distribution (ID) and +Out-of-Distribution (OOD) generalization evaluations, compared to naive +generator-ensembling, web-supervised, and manually annotated data. + +
+
+
+
+
+ + ♻ ☆ ProgDTD: Progressive Learned Image Compression with Double-Tail-Drop + Training + + +
+ Progressive compression allows images to start loading as low-resolution +versions, becoming clearer as more data is received. This improves the user +experience when, for example, network connections are slow. Today, most +approaches for image compression, both classical and learned ones, are designed +to be non-progressive. This paper introduces ProgDTD, a training method that +transforms learned, non-progressive image compression approaches into +progressive ones. The design of ProgDTD is based on the observation that the +information stored within the bottleneck of a compression model commonly varies +in importance. To create a progressive compression model, ProgDTD modifies the +training steps to force the model to store the data in the bottleneck sorted +by priority. We achieve progressive compression by transmitting the data in +order of its sorted index. ProgDTD is designed for CNN-based learned image +compression models, does not need additional parameters, and has a customizable +range of progressiveness. For evaluation, we apply ProgDTD to the hyperprior +model, one of the most common structures in learned image compression. Our +experimental results show that ProgDTD performs comparably to its +non-progressive counterparts and other state-of-the-art progressive models in +terms of MS-SSIM and accuracy. + 
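The tail-drop idea described above can be sketched as zeroing out the trailing bottleneck channels during training, so that important information is pushed into the leading (always kept) channels; at transmission time the channels can then be sent in index order for progressive decoding. Treating the channel index as the priority axis follows the abstract, while the random sampling of the keep ratio is an assumption.

import torch

def tail_drop(latent, keep_ratio):
    """Zero out the trailing channels of a (B, C, H, W) bottleneck tensor."""
    b, c, h, w = latent.shape
    keep = max(1, int(keep_ratio * c))
    mask = torch.zeros(1, c, 1, 1, device=latent.device)
    mask[:, :keep] = 1.0          # leading channels survive, tail is dropped
    return latent * mask

# During training, a keep ratio could be drawn per batch; a "double" tail-drop
# would apply the same operation to the hyper-encoder bottleneck as well.
y = torch.randn(2, 192, 16, 16)
y_dropped = tail_drop(y, keep_ratio=float(torch.rand(1)))
print(y_dropped.shape)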
+
+
+
+
+ + ♻ ☆ PANDAS: Prototype-based Novel Class Discovery and Detection + + +
+ Object detectors are typically trained once and for all on a fixed set of +classes. However, this closed-world assumption is unrealistic in practice, as +new classes will inevitably emerge after the detector is deployed in the wild. +In this work, we look at ways to extend a detector trained for a set of base +classes so it can i) spot the presence of novel classes, and ii) automatically +enrich its repertoire to be able to detect those newly discovered classes +together with the base ones. We propose PANDAS, a method for novel class +discovery and detection. It discovers clusters representing novel classes from +unlabeled data, and represents old and new classes with prototypes. During +inference, a distance-based classifier uses these prototypes to assign a label +to each detected object instance. The simplicity of our method makes it widely +applicable. We experimentally demonstrate the effectiveness of PANDAS on the +VOC 2012 and COCO-to-LVIS benchmarks. It performs favorably against the state +of the art for this task while being computationally more affordable. + +
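The prototype-based labeling step described above can be sketched as nearest-prototype classification: each detected instance embedding receives the label of its closest class prototype, where prototypes cover both base classes and clusters discovered from unlabeled data. Cosine similarity is an assumed metric choice; the paper may use a different distance.

import numpy as np

def prototype_classify(features, prototypes, labels):
    """Assign each instance embedding the label of its nearest prototype."""
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    p = prototypes / np.linalg.norm(prototypes, axis=1, keepdims=True)
    sims = f @ p.T                     # (num_instances, num_prototypes)
    return [labels[i] for i in sims.argmax(axis=1)]

# Base-class prototypes plus two clusters discovered from unlabeled data.
protos = np.random.randn(5, 128)
names = ["car", "person", "dog", "novel_0", "novel_1"]
print(prototype_classify(np.random.randn(3, 128), protos, names))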
+
+ comment: Accepted to the Conference on Lifelong Learning Agents (CoLLAs 2024) +
+
+
+
+
+ + ♻ ☆ SimAC: A Simple Anti-Customization Method for Protecting Face Privacy + against Text-to-Image Synthesis of Diffusion Models + + +
+ Despite the success of diffusion-based customization methods on visual +content creation, increasing concerns have been raised about such techniques +from both privacy and political perspectives. To tackle this issue, several +anti-customization methods have been proposed in very recent months, +predominantly grounded in adversarial attacks. Unfortunately, most of these +methods adopt straightforward designs, such as end-to-end optimization with a +focus on adversarially maximizing the original training loss, thereby +neglecting nuanced internal properties intrinsic to the diffusion model, and +even leading to ineffective optimization in some diffusion time steps. In this +paper, we strive to bridge this gap by undertaking a comprehensive exploration +of these inherent properties, to boost the performance of current +anti-customization approaches. Two aspects of properties are investigated: 1) +We examine the relationship between time step selection and the model's +perception in the frequency domain of images and find that lower time steps can +give much more contributions to adversarial noises. This inspires us to propose +an adaptive greedy search for optimal time steps that seamlessly integrates +with existing anti-customization methods. 2) We scrutinize the roles of +features at different layers during denoising and devise a sophisticated +feature-based optimization framework for anti-customization. Experiments on +facial benchmarks demonstrate that our approach significantly increases +identity disruption, thereby protecting user privacy and copyright. Our code is +available at: https://github.com/somuchtome/SimAC. + 
+
+
+
+
+ + ♻ ☆ SeaTurtleID2022: A long-span dataset for reliable sea turtle + re-identification WACV2024 + + +
+ This paper introduces the first public large-scale, long-span dataset with +sea turtle photographs captured in the wild -- SeaTurtleID2022 +(https://www.kaggle.com/datasets/wildlifedatasets/seaturtleid2022). The dataset +contains 8729 photographs of 438 unique individuals collected within 13 years, +making it the longest-spanned dataset for animal re-identification. All +photographs include various annotations, e.g., identity, encounter timestamp, +and body parts segmentation masks. Instead of standard "random" splits, the +dataset allows for two realistic and ecologically motivated splits: (i) a +time-aware closed-set with training, validation, and test data from different +days/years, and (ii) a time-aware open-set with new unknown individuals in test +and validation sets. We show that time-aware splits are essential for +benchmarking re-identification methods, as random splits lead to performance +overestimation. Furthermore, a baseline instance segmentation and +re-identification performance over various body parts is provided. Finally, an +end-to-end system for sea turtle re-identification is proposed and evaluated. +The proposed system based on Hybrid Task Cascade for head instance segmentation +and ArcFace-trained feature-extractor achieved an accuracy of 86.8%. + +
+
+ comment: This version is essentially an updated version of the initial + SeaTurtleID paper (arXiv:2211.10307) and from now on it can be found as a + replacement of the latter paper. You can also find the published version + here: + https://openaccess.thecvf.com/content/WACV2024/html/Adam_SeaTurtleID2022_A_Long-Span_Dataset_for_Reliable_Sea_Turtle_Re-Identification_WACV_2024_paper.html +
+
+
+
+
+ + ♻ ☆ Object Detection for Automated Coronary Artery Using Deep Learning + + +
+ In the era of digital medicine, medical imaging serves as a widespread +technique for early disease detection, with a substantial volume of images +being generated and stored daily in electronic patient records. X-ray +angiography imaging is a standard and one of the most common methods for +rapidly diagnosing coronary artery diseases. The notable achievements of recent +deep learning algorithms align with the increased use of electronic health +records and diagnostic imaging. Deep neural networks, leveraging abundant data, +advanced algorithms, and powerful computational capabilities, prove highly +effective in the analysis and interpretation of images. In this context, Object +detection methods have become a promising approach, particularly through +convolutional neural networks (CNN), streamlining medical image analysis by +eliminating manual feature extraction. This allows for direct feature +extraction from images, ensuring high accuracy in results. Therefore, in our +paper, we utilized the object detection method on X-ray angiography images to +precisely identify the location of coronary artery stenosis. As a result, this +model enables automatic and real-time detection of stenosis locations, +assisting in the crucial and sensitive decision-making process for healthcare +professionals. + +
+
+ comment: The results in the article need fundamental corrections +
+
+
+
+
+ + ♻ ☆ Fast and Accurate Unknown Object Instance Segmentation through + Error-Informed Refinement + + +
+ Accurate perception of unknown objects is essential for autonomous robots, +particularly when manipulating novel items in unstructured environments. +However, existing unknown object instance segmentation (UOIS) methods often +have over-segmentation and under-segmentation problems, resulting in inaccurate +instance boundaries and failures in subsequent robotic tasks such as grasping +and placement. To address this challenge, this article introduces INSTA-BEER, a +fast and accurate model-agnostic refinement method that enhances the UOIS +performance. The model adopts an error-informed refinement approach, which +first predicts pixel-wise errors in the initial segmentation and then refines +the segmentation guided by these error estimates. We introduce the quad-metric +boundary error, which quantifies pixel-wise true positives, true negatives, +false positives, and false negatives at the boundaries of object instances, +effectively capturing both fine-grained and instance-level segmentation errors. +Additionally, the Error Guidance Fusion (EGF) module explicitly integrates +error information into the refinement process, further improving segmentation +quality. In comprehensive evaluations conducted on three widely used benchmark +datasets, INSTA-BEER outperformed state-of-the-art models in both accuracy and +inference time. Moreover, a real-world robotic experiment demonstrated the +practical applicability of our method in improving the performance of target +object grasping tasks in cluttered environments. + +
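The quad-metric boundary error described above can be approximated with a short sketch: restrict pixel-wise true/false positives and negatives to a band around the instance contour. The band width and the dilation-based boundary definition are assumptions made for illustration.

import numpy as np
from scipy.ndimage import binary_dilation

def quad_boundary_error(pred, gt, width=3):
    """Pixel-wise TP/TN/FP/FN counts restricted to a boundary band.

    pred, gt: boolean masks for a single object instance.
    """
    def boundary_band(mask):
        outer = binary_dilation(mask, iterations=width)
        inner = binary_dilation(~mask, iterations=width)
        return outer & inner                 # pixels near the contour
    band = boundary_band(gt) | boundary_band(pred)
    tp = band & pred & gt
    tn = band & ~pred & ~gt
    fp = band & pred & ~gt
    fn = band & ~pred & gt
    return {k: int(v.sum()) for k, v in dict(tp=tp, tn=tn, fp=fp, fn=fn).items()}

gt = np.zeros((64, 64), bool); gt[16:48, 16:48] = True
pred = np.zeros((64, 64), bool); pred[18:50, 14:46] = True
print(quad_boundary_error(pred, gt))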
+
+ comment: 8 pages, 5 figures, project website: + https://sites.google.com/view/insta-beer +
+
+
+
+
+ + ♻ ☆ Do Diffusion Models Learn Semantically Meaningful and Efficient + Representations? + + +
+ Diffusion models are capable of impressive feats of image generation with +uncommon juxtapositions such as astronauts riding horses on the moon with +properly placed shadows. These outputs indicate the ability to perform +compositional generalization, but how do the models do so? We perform +controlled experiments on conditional DDPMs learning to generate 2D spherical +Gaussian bumps centered at specified $x$- and $y$-positions. Our results show +that the emergence of semantically meaningful latent representations is key to +achieving high performance. En route to successful performance over learning, +the model traverses three distinct phases of latent representations: (phase A) +no latent structure, (phase B) a 2D manifold of disordered states, and (phase +C) a 2D ordered manifold. Corresponding to each of these phases, we identify +qualitatively different generation behaviors: 1) multiple bumps are generated, +2) one bump is generated but at inaccurate $x$ and $y$ locations, 3) a bump is +generated at the correct $x$ and $y$ location. Furthermore, we show that even +under imbalanced datasets where features ($x$- versus $y$-positions) are +represented with skewed frequencies, the learning process for $x$ and $y$ is +coupled rather than factorized, demonstrating that simple vanilla-flavored +diffusion models cannot learn efficient representations in which localization +in $x$ and $y$ is factorized into separate 1D tasks. These findings suggest +the need for future work to find inductive biases that will push generative +models to discover and exploit factorizable independent structures in their +inputs, which will be required to vault these models into more data-efficient +regimes. + 
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ PoseAnimate: Zero-shot high fidelity pose controllable character + animation + + +
+ Image-to-video (I2V) generation aims to create a video sequence from a single +image, which requires high temporal coherence and visual fidelity with the +source image. However, existing approaches suffer from character appearance +inconsistency and poor preservation of fine details. Moreover, they require a +large amount of video data for training, which can be computationally +demanding. To address these limitations, we propose PoseAnimate, a novel +zero-shot I2V framework for character animation. PoseAnimate contains three key +components: 1) Pose-Aware Control Module (PACM) incorporates diverse pose +signals into conditional embeddings, to preserve character-independent content +and maintain precise alignment of actions. 2) Dual Consistency Attention Module +(DCAM) enhances temporal consistency, and retains character identity and +intricate background details. 3) Mask-Guided Decoupling Module (MGDM) refines +distinct feature perception, improving animation fidelity by decoupling the +character and background. We also propose a Pose Alignment Transition Algorithm +(PATA) to ensure smooth action transitions. Extensive experimental results +demonstrate that our approach outperforms the state-of-the-art training-based +methods in terms of character consistency and detail fidelity. Moreover, it +maintains a high level of temporal coherence throughout the generated +animations. + 
+
+
+
+
+ + ♻ ☆ Adversarial Example Soups: Improving Transferability and Stealthiness + for Free + + +
+ Transferable adversarial examples cause practical security risks since they +can mislead a target model without knowing its internal knowledge. A +conventional recipe for maximizing transferability is to keep only the optimal +adversarial example from all those obtained in the optimization pipeline. In +this paper, for the first time, we question this convention and demonstrate +that those discarded, sub-optimal adversarial examples can be reused to boost +transferability. Specifically, we propose ``Adversarial Example Soups'' (AES), +with AES-tune for averaging discarded adversarial examples in hyperparameter +tuning and AES-rand for stability testing. In addition, our AES is inspired by +``model soups'', which averages weights of multiple fine-tuned models for +improved accuracy without increasing inference time. Extensive experiments +validate the global effectiveness of our AES, boosting 10 state-of-the-art +transfer attacks and their combinations by up to 13% against 10 diverse +(defensive) target models. We also show the possibility of generalizing AES to +other types, e.g., directly averaging multiple in-the-wild adversarial examples +that yield comparable success. A promising byproduct of AES is the improved +stealthiness of adversarial examples since the perturbation variances are +naturally reduced. + +
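The "soup" operation described above can be sketched in a few lines: rather than keeping only the best adversarial example, the perturbations of all candidates produced across hyperparameter runs (AES-tune) or restarts (AES-rand) are averaged and re-projected onto the L-infinity ball around the clean image. The epsilon value and the projection details are assumptions for illustration.

import numpy as np

def adversarial_example_soup(clean, adv_candidates, eps=16 / 255):
    """Average the perturbations of several adversarial candidates."""
    perts = [np.clip(a - clean, -eps, eps) for a in adv_candidates]
    avg_pert = np.mean(perts, axis=0)               # the "soup" step
    return np.clip(clean + avg_pert, 0.0, 1.0)      # keep a valid image

x = np.random.rand(3, 224, 224)
candidates = [np.clip(x + np.random.uniform(-16 / 255, 16 / 255, x.shape), 0, 1)
              for _ in range(5)]
x_adv = adversarial_example_soup(x, candidates)
print(np.abs(x_adv - x).max())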
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training + + +
+ In the field of medical Vision-Language Pre-training (VLP), significant +efforts have been devoted to deriving text and image features from both +clinical reports and associated medical images. However, most existing methods +may have overlooked the opportunity in leveraging the inherent hierarchical +structure of clinical reports, which are generally split into `findings' for +descriptive content and `impressions' for conclusive observation. Instead of +utilizing this rich, structured format, current medical VLP approaches often +simplify the report into either a unified entity or fragmented tokens. In this +work, we propose a novel clinical prior guided VLP framework named IMITATE to +learn the structure information from medical reports with hierarchical +vision-language alignment. The framework derives multi-level visual features +from the chest X-ray (CXR) images and separately aligns these features with the +descriptive and the conclusive text encoded in the hierarchical medical report. +Furthermore, a new clinical-informed contrastive loss is introduced for +cross-modal learning, which accounts for clinical prior knowledge in +formulating sample correlations in contrastive learning. The proposed model, +IMITATE, outperforms baseline VLP methods across six different datasets, +spanning five medical imaging downstream tasks. Comprehensive experimental +results highlight the advantages of integrating the hierarchical structure of +medical reports for vision-language alignment. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Conditioning Generative Latent Optimization for Sparse-View CT Image + Reconstruction + + +
+ Computed Tomography (CT) is a prominent example of Imaging Inverse Problem +highlighting the unrivaled performances of data-driven methods in degraded +measurements setups like sparse X-ray projections. Although a significant +proportion of deep learning approaches benefit from large supervised datasets, +they cannot generalize to new experimental setups. In contrast, fully +unsupervised techniques, most notably using score-based generative models, have +recently demonstrated similar or better performances compared to supervised +approaches while being flexible at test time. However, their use cases are +limited as they need considerable amounts of training data to have good +generalization properties. Another unsupervised approach taking advantage of +the implicit natural bias of deep convolutional networks, Deep Image Prior, has +recently been adapted to solve sparse CT by reparameterizing the reconstruction +problem. Although this methodology does not require any training dataset, it +enforces a weaker prior on the reconstructions when compared to data-driven +methods. To fill the gap between these two strategies, we propose an +unsupervised conditional approach to the Generative Latent Optimization +framework (cGLO). Similarly to DIP, without any training dataset, cGLO benefits +from the structural bias of a decoder network. However, the prior is further +reinforced as the effect of a likelihood objective shared between multiple +slices being reconstructed simultaneously through the same decoder network. In +addition, the parameters of the decoder may be initialized on an unsupervised, +and eventually very small, training dataset to enhance the reconstruction. The +resulting approach is tested on full-dose sparse-view CT using multiple +training dataset sizes and varying numbers of viewing angles. + +
+
+
+
+
+ + ♻ ☆ The Machine Vision Iceberg Explained: Advancing Dynamic Testing by + Considering Holistic Environmental Relations SC 2024 + + +
+ Machine Vision (MV) is essential for solving driving automation. This paper +examines potential shortcomings in current MV testing strategies for highly +automated driving (HAD) systems. We argue for a more comprehensive +understanding of the performance factors that must be considered during the MV +evaluation process, noting that neglecting these factors can lead to +significant risks. This is not only relevant to MV component testing, but also +to integration testing. To illustrate this point, we draw an analogy to a ship +navigating towards an iceberg to show potential hidden challenges in current MV +testing strategies. The main contribution is a novel framework for black-box +testing which observes environmental relations. This means it is designed to +enhance MV assessments by considering the attributes and surroundings of +relevant individual objects. The framework provides the identification of seven +general concerns about the object recognition of MV, which are not addressed +adequately in established test processes. To detect these deficits based on +their performance factors, we propose the use of a taxonomy called "granularity +orders" along with a graphical representation. This allows an identification of +MV uncertainties across a range of driving scenarios. This approach aims to +advance the precision, efficiency, and completeness of testing procedures for +MV. + +
+
+ comment: Submitted at IEEE ITSC 2024 +
+
+
+
+
+ + ♻ ☆ Utilizing Synthetic Data for Medical Vision-Language Pre-training: + Bypassing the Need for Real Images CVPR 2024 + + +
+ Medical Vision-Language Pre-training (VLP) learns representations jointly +from medical images and paired radiology reports. It typically requires +large-scale paired image-text datasets to achieve effective pre-training for +both the image encoder and text encoder. The advent of text-guided generative +models raises a compelling question: Can VLP be implemented solely with +synthetic images generated from genuine radiology reports, thereby mitigating +the need for extensively pairing and curating image-text datasets? In this +work, we scrutinize this very question by examining the feasibility and +effectiveness of employing synthetic images for medical VLP. We replace real +medical images with their synthetic equivalents, generated from authentic +medical reports. Utilizing three state-of-the-art VLP algorithms, we +exclusively train on these synthetic samples. Our empirical evaluation across +three subsequent tasks, namely image classification, semantic segmentation and +object detection, reveals that the performance achieved through synthetic data +is on par with or even exceeds that obtained with real images. As a pioneering +contribution to this domain, we introduce a large-scale synthetic medical image +dataset, paired with anonymized real radiology reports. This alleviates the +need of sharing medical images, which are not easy to curate and share in +practice. The code and the dataset can be found in +\href{https://github.com/cheliu-computation/MedSyn-RepLearn/tree/main}{https://github.com/cheliu-computation/MedSyn-RepLearn/tree/main}. + +
+
+ comment: Accepted by CVPR 2024 Workshop Data Curation and Augmentation in + Enhancing Medical Imaging Applications +
+
+
+
+
+ + ♻ ☆ PASS: Peer-Agreement based Sample Selection for training with Noisy + Labels + + +
+ The prevalence of noisy-label samples poses a significant challenge in deep +learning, inducing overfitting effects. This has, therefore, motivated the +emergence of learning with noisy-label (LNL) techniques that focus on +separating noisy- and clean-label samples to apply different learning +strategies to each group of samples. Current methodologies often rely on the +small-loss hypothesis or feature-based selection to separate noisy- and +clean-label samples, yet our empirical observations reveal their limitations, +especially for labels with instance dependent noise (IDN). An important +characteristic of IDN is the difficulty to distinguish the clean-label samples +that lie near the decision boundary (i.e., the hard samples) from the +noisy-label samples. We, therefore, propose a new noisy-label detection method, +termed Peer-Agreement based Sample Selection (PASS), to address this problem. +Utilising a trio of classifiers, PASS employs consensus-driven peer-based +agreement of two models to select the samples to train the remaining model. +PASS is easily integrated into existing LNL models, enabling the improvement of +the detection accuracy of noisy- and clean-label samples, which increases the +classification accuracy across various LNL benchmarks. + +
+
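The selection rule can be pictured with a short sketch; this is one plausible reading of "peer agreement of two models to select samples for the remaining model," not the authors' implementation, and the function name is an assumption.

```python
import torch

def peer_agreement_select(models, x, noisy_labels, k):
    """Boolean mask of samples treated as clean when training model k."""
    peers = [m for i, m in enumerate(models) if i != k]   # the other two classifiers
    with torch.no_grad():
        preds = [m(x).argmax(dim=1) for m in peers]
    # A sample is kept only if both peers agree with its (possibly noisy) label.
    return (preds[0] == noisy_labels) & (preds[1] == noisy_labels)
```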
+ comment: In Submission +
+
+
+
+
+ + ♻ ☆ CLEAR: Cross-Transformers with Pre-trained Language Model is All you + need for Person Attribute Recognition and Retrieval + + +
+ Person attribute recognition and attribute-based retrieval are two core +human-centric tasks. In the recognition task, the challenge is specifying +attributes depending on a person's appearance, while the retrieval task +involves searching for matching persons based on attribute queries. There is a +significant relationship between recognition and retrieval tasks. In this +study, we demonstrate that if there is a sufficiently robust network to solve +person attribute recognition, it can be adapted to facilitate better +performance for the retrieval task. Another issue that needs addressing in the +retrieval task is the modality gap between attribute queries and persons' +images. Therefore, in this paper, we present CLEAR, a unified network designed +to address both tasks. We introduce a robust cross-transformers network to +handle person attribute recognition. Additionally, leveraging a pre-trained +language model, we construct pseudo-descriptions for attribute queries and +introduce an effective training strategy to train only a few additional +parameters for adapters, facilitating the handling of the retrieval task. +Finally, the unified CLEAR model is evaluated on five benchmarks: PETA, PA100K, +Market-1501, RAPv2, and UPAR-2024. Without bells and whistles, CLEAR achieves +state-of-the-art performance or competitive results for both tasks, +significantly outperforming other competitors in terms of person retrieval +performance on the widely-used Market-1501 dataset. + +
+
+
+
+
+ + ♻ ☆ Learning Separable Hidden Unit Contributions for Speaker-Adaptive + Lip-Reading BMVC 2023 + + +
+ In this paper, we propose a novel method for speaker adaptation in lip
+reading, motivated by two observations. Firstly, a speaker's own
+characteristics can always be portrayed well by a few of his/her facial images,
+or even a single image, with shallow networks, while the fine-grained dynamic
+features associated with speech content expressed by the talking face always
+need deep sequential networks to be represented accurately. Therefore, we treat
+the shallow and deep layers differently for speaker-adaptive lip reading.
+Secondly, we observe that a speaker's unique characteristics (e.g., a prominent
+oral cavity and mandible) have varied effects on lip reading performance for
+different words and pronunciations, necessitating adaptive enhancement or
+suppression of the features for robust lip reading. Based on these two
+observations, we propose to take advantage of the speaker's own characteristics
+to automatically learn separable hidden unit contributions with different
+targets for shallow and deep layers, respectively. For shallow layers, where
+features related to the speaker's characteristics are stronger than the
+speech-content-related features, we introduce speaker-adaptive features that
+learn to enhance the speech content features. For deep layers, where both the
+speaker's features and the speech content features are well expressed, we
+introduce speaker-adaptive features that learn to suppress
+speech-content-irrelevant noise for robust lip reading. Our approach
+consistently outperforms existing methods, as confirmed by comprehensive
+analysis and comparison across different settings. Besides the evaluation on
+the popular LRW-ID and GRID datasets, we also release a new dataset for
+evaluation, CAS-VSR-S68h, to further assess the performance in an extreme
+setting where just a few speakers are available but the speech content covers
+a large and diversified range.
+
+
+ comment: Accepted to BMVC 2023 20pages +
+
+
+
+
+ + ♻ ☆ MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye + tracking CVPR 2024 + + +
+ Event-based eye tracking has shown great promise with the high temporal
+resolution and low redundancy provided by the event camera. However, the
+diversity and abruptness of eye movement patterns, including blinking,
+fixating, saccades, and smooth pursuit, pose significant challenges for eye
+localization. To achieve a stable event-based eye-tracking system, this paper
+proposes a bidirectional long-term sequence modeling and time-varying state
+selection mechanism to fully utilize contextual temporal information in
+response to the variability of eye movements. Specifically, the MambaPupil
+network is proposed, which consists of a multi-layer convolutional encoder that
+extracts features from the event representations, a bidirectional Gated
+Recurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM)
+that selectively captures contextual correlations from the forward and backward
+temporal relationships. Furthermore, Bina-rep is utilized as a compact event
+representation, and a tailor-made data augmentation, called Event-Cutout, is
+proposed to enhance the model's robustness by applying random spatial masking
+to the event image. The evaluation on the ThreeET-plus benchmark shows the
+superior performance of MambaPupil, which secured 1st place in the CVPR 2024
+AIS Event-based Eye Tracking challenge.
+
+
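For orientation, random spatial masking of an event representation can be sketched as below; the tensor layout, mask-size range, and function name are illustrative assumptions, not the paper's Event-Cutout code.

```python
import torch

def event_cutout(event_frames, max_frac=0.3):
    """event_frames: (T, C, H, W) event representation of one sequence."""
    _, _, H, W = event_frames.shape
    h = int(H * torch.empty(1).uniform_(0.1, max_frac))
    w = int(W * torch.empty(1).uniform_(0.1, max_frac))
    top = torch.randint(0, H - h + 1, (1,)).item()
    left = torch.randint(0, W - w + 1, (1,)).item()
    out = event_frames.clone()
    out[:, :, top:top + h, left:left + w] = 0   # same mask across the whole sequence
    return out
```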
+ comment: Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for + Streaming), top solution of challenge Event-based Eye Tracking, see + https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024 +
+
+
+
+
+ + ♻ ☆ Multi-Prompt with Depth Partitioned Cross-Modal Learning + + +
+ In recent years, soft prompt learning methods have been proposed to fine-tune +large-scale vision-language pre-trained models for various downstream tasks. +These methods typically combine learnable textual tokens with class tokens as +input for models with frozen parameters. However, they often employ a single +prompt to describe class contexts, failing to capture categories' diverse +attributes adequately. This study introduces the Partitioned Multi-modal Prompt +(PMPO), a multi-modal prompting technique that extends the soft prompt from a +single learnable prompt to multiple prompts. Our method divides the visual +encoder depths and connects learnable prompts to the separated visual depths, +enabling different prompts to capture the hierarchical contextual depths of +visual representations. Furthermore, to maximize the advantages of multi-prompt +learning, we incorporate prior information from manually designed templates and +learnable multi-prompts, thus improving the generalization capabilities of our +approach. We evaluate the effectiveness of our approach on three challenging +tasks: new class generalization, cross-dataset evaluation, and domain +generalization. For instance, our method achieves a $79.28$ harmonic mean, +averaged over 11 diverse image recognition datasets ($+7.62$ compared to CoOp), +demonstrating significant competitiveness compared to state-of-the-art +prompting methods. + +
+
+
+
+
+ + ♻ ☆ Giving a Hand to Diffusion Models: a Two-Stage Approach to Improving + Conditional Human Image Generation + + +
+ Recent years have seen significant progress in human image generation, +particularly with the advancements in diffusion models. However, existing +diffusion methods encounter challenges when producing consistent hand anatomy +and the generated images often lack precise control over the hand pose. To +address this limitation, we introduce a novel approach to pose-conditioned +human image generation, dividing the process into two stages: hand generation +and subsequent body outpainting around the hands. We propose training the hand +generator in a multi-task setting to produce both hand images and their +corresponding segmentation masks, and employ the trained model in the first +stage of generation. An adapted ControlNet model is then used in the second +stage to outpaint the body around the generated hands, producing the final +result. A novel blending technique is introduced to preserve the hand details +during the second stage that combines the results of both stages in a coherent +way. This involves sequential expansion of the outpainted region while fusing +the latent representations, to ensure a seamless and cohesive synthesis of the +final image. Experimental evaluations demonstrate the superiority of our +proposed method over state-of-the-art techniques, in both pose accuracy and +image quality, as validated on the HaGRID dataset. Our approach not only +enhances the quality of the generated hands but also offers improved control +over hand pose, advancing the capabilities of pose-conditioned human image +generation. The source code of the proposed approach is available at +https://github.com/apelykh/hand-to-diffusion. + +
+
+
+
+
+ + ♻ ☆ Instance-dependent Noisy-label Learning with Graphical Model Based + Noise-rate Estimation + + +
+ Deep learning faces a formidable challenge when handling noisy labels, as +models tend to overfit samples affected by label noise. This challenge is +further compounded by the presence of instance-dependent noise (IDN), a +realistic form of label noise arising from ambiguous sample information. To +address IDN, Label Noise Learning (LNL) incorporates a sample selection stage +to differentiate clean and noisy-label samples. This stage uses an arbitrary +criterion and a pre-defined curriculum that initially selects most samples as +noisy and gradually decreases this selection rate during training. Such +curriculum is sub-optimal since it does not consider the actual label noise +rate in the training set. This paper addresses this issue with a new noise-rate +estimation method that is easily integrated with most state-of-the-art (SOTA) +LNL methods to produce a more effective curriculum. Synthetic and real-world +benchmark results demonstrate that integrating our approach with SOTA LNL +methods improves accuracy in most cases. + +
+
+
+
+
+ + ♻ ☆ CURSOR: Scalable Mixed-Order Hypergraph Matching with CUR Decomposition CVPR 2024 + + +
+ To achieve greater accuracy, hypergraph matching algorithms require +exponential increases in computational resources. Recent kd-tree-based +approximate nearest neighbor (ANN) methods, despite the sparsity of their +compatibility tensor, still require exhaustive calculations for large-scale +graph matching. This work utilizes CUR tensor decomposition and introduces a +novel cascaded second and third-order hypergraph matching framework (CURSOR) +for efficient hypergraph matching. A CUR-based second-order graph matching +algorithm is used to provide a rough match, and then the core of CURSOR, a +fiber-CUR-based tensor generation method, directly calculates entries of the +compatibility tensor by leveraging the initial second-order match result. This +significantly decreases the time complexity and tensor density. A probability +relaxation labeling (PRL)-based matching algorithm, especially suitable for +sparse tensors, is developed. Experiment results on large-scale synthetic +datasets and widely-adopted benchmark sets demonstrate the superiority of +CURSOR over existing methods. The tensor generation method in CURSOR can be +integrated seamlessly into existing hypergraph matching methods to improve +their performance and lower their computational costs. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality + Learning + + +
+ Visible-infrared person re-identification (VI-ReID) aims to retrieve images +of the same pedestrian from different modalities, where the challenges lie in +the significant modality discrepancy. To alleviate the modality gap, recent +methods generate intermediate images by GANs, grayscaling, or mixup strategies. +However, these methods could introduce extra data distribution, and the +semantic correspondence between the two modalities is not well learned. In this +paper, we propose a Patch-Mixed Cross-Modality framework (PMCM), where two +images of the same person from two modalities are split into patches and +stitched into a new one for model learning. A part-alignment loss is introduced +to regularize representation learning, and a patch-mixed modality learning loss +is proposed to align between the modalities. In this way, the model learns to +recognize a person through patches of different styles, thereby the modality +semantic correspondence can be inferred. In addition, with the flexible image +generation strategy, the patch-mixed images freely adjust the ratio of +different modality patches, which could further alleviate the modality +imbalance problem. On two VI-ReID datasets, we report new state-of-the-art +performance with the proposed method. + +
+
+
+
+
+ + ♻ ☆ SCTransNet: Spatial-channel Cross Transformer Network for Infrared Small + Target Detection + + +
+ Infrared small target detection (IRSTD) has recently benefitted greatly from
+U-shaped neural models. However, largely overlooking effective global
+information modeling, existing techniques struggle when the target is highly
+similar to the background. We present a Spatial-channel Cross Transformer
+Network (SCTransNet) that leverages spatial-channel cross transformer blocks
+(SCTBs) on top of long-range skip connections to address the aforementioned
+challenge. In the proposed SCTBs, the outputs of all encoders interact through
+a cross transformer to generate mixed features, which are redistributed to all
+decoders to effectively reinforce semantic differences between the target and
+clutter at full scales. Specifically, SCTB contains the following two key
+elements: (a) spatial-embedded single-head channel-cross attention (SSCA) for
+exchanging local spatial features and full-level global channel information to
+eliminate ambiguity among the encoders and facilitate high-level semantic
+associations of the images, and (b) a complementary feed-forward network (CFN)
+for enhancing the feature discriminability via a multi-scale strategy and
+cross-spatial-channel information interaction to promote beneficial
+information transfer. Our SCTransNet effectively encodes the semantic
+differences between targets and backgrounds to boost its internal
+representation for detecting small infrared targets accurately. Extensive
+experiments on three public datasets, NUDT-SIRST, NUAA-SIRST, and IRSTD-1k,
+demonstrate that the proposed SCTransNet outperforms existing IRSTD methods.
+Our code will be made public at https://github.com/xdFai.
+
+
+
+
+
+ + ♻ ☆ CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) aims to improve low-illumination images.
+However, existing methods face two challenges: (1) uncertainty in restoration
+from diverse brightness degradations; (2) loss of texture and color information
+caused by noise suppression and light enhancement. In this paper, we propose a
+novel enhancement approach, CodeEnhance, by leveraging quantized priors and
+image refinement to address these challenges. In particular, we reframe LLIE as
+learning an image-to-code mapping from low-light images to a discrete codebook
+that has been learned from high-quality images. To enhance this process, a
+Semantic Embedding Module (SEM) is introduced to integrate semantic information
+with low-level features, and a Codebook Shift (CS) mechanism is designed to
+adapt the pre-learned codebook to better suit the distinct characteristics of
+our low-light dataset. Additionally, we present an Interactive Feature
+Transformation (IFT) module to refine texture and color information during
+image reconstruction, allowing for interactive enhancement based on user
+preferences. Extensive experiments on both real-world and synthetic benchmarks
+demonstrate that the incorporation of prior knowledge and controllable
+information transfer significantly enhances LLIE performance in terms of
+quality and fidelity. The proposed CodeEnhance exhibits superior robustness to
+various degradations, including uneven illumination, noise, and color
+distortion.
+
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Centered Kernel Alignment in Knowledge Distillation + + +
+ Knowledge distillation has emerged as a highly effective method for bridging
+the representation discrepancy between large-scale models and lightweight
+models. Prevalent approaches involve leveraging appropriate metrics to minimize
+the divergence or distance between the knowledge extracted from the teacher
+model and the knowledge learned by the student model. Centered Kernel Alignment
+(CKA) is widely used to measure representation similarity and has been applied
+in several knowledge distillation methods. However, these methods are complex
+and fail to uncover the essence of CKA, leaving open the question of how to use
+CKA properly to achieve simple and effective distillation. This paper first
+provides a theoretical perspective to illustrate the effectiveness of CKA,
+which decouples CKA into the upper bound of Maximum Mean Discrepancy (MMD) and
+a constant term. Drawing from this, we propose a novel Relation-Centered Kernel
+Alignment (RCKA) framework, which practically establishes a connection between
+CKA and MMD. Furthermore, we dynamically customize the application of CKA based
+on the characteristics of each task, with less computational cost yet
+comparable performance to the previous methods. Extensive experiments on
+CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves
+state-of-the-art performance on almost all teacher-student pairs for image
+classification and object detection, validating the effectiveness of our
+approaches. Our code is available at https://github.com/Klayand/PCKA
+
+
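For reference, the widely used linear form of CKA that the discussion above builds on can be computed in a few lines; this is the standard definition on centered features, not the RCKA method itself.

```python
import numpy as np

def linear_cka(X, Y):
    """Linear CKA between activations X (n, d1) and Y (n, d2) for the same n inputs."""
    X = X - X.mean(axis=0, keepdims=True)
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(Y.T @ X, "fro") ** 2   # cross-covariance term
    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))
```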
+
+
+
+ + ♻ ☆ Towards Accurate Post-training Quantization for Diffusion Models + + +
+ In this paper, we propose an accurate data-free post-training quantization
+framework for diffusion models (ADP-DM) for efficient image generation.
+Conventional data-free quantization methods learn shared quantization functions
+for tensor discretization regardless of the generation timesteps, while the
+activation distribution differs significantly across various timesteps. The
+calibration images are acquired at random timesteps, which fails to provide
+sufficient information for generalizable quantization function learning. Both
+issues cause sizable quantization errors with obvious image generation
+performance degradation. In contrast, we design group-wise quantization
+functions for activation discretization in different timesteps and sample the
+optimal timestep for informative calibration image generation, so that our
+quantized diffusion model can reduce the discretization errors with negligible
+computational overhead. Specifically, we partition the timesteps according to
+the importance weights of quantization functions in different groups, which are
+optimized by differentiable search algorithms. We also select the optimal
+timestep for calibration image generation by the structural risk minimization
+principle in order to enhance the generalization ability in the deployment of
+the quantized diffusion model. Extensive experimental results show that our
+method outperforms the state-of-the-art post-training quantization of diffusion
+models by a sizable margin with similar computational cost.
+
+
+
+
+
+ + ♻ ☆ EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image + Captioning + + +
+ News image captioning requires a model to generate an informative caption
+rich in entities, given the news image and the associated news article. Though
+Multimodal Large Language Models (MLLMs) have demonstrated remarkable
+capabilities in addressing various vision-language tasks, our research finds
+that current MLLMs still bear limitations in handling entity information on the
+news image captioning task. Besides, while MLLMs have the ability to process
+long inputs, generating high-quality news image captions still requires a
+trade-off between sufficiency and conciseness of textual input information. To
+explore the potential of MLLMs and address the problems we discovered, we
+propose EAMA: an Entity-Aware Multimodal Alignment based approach for news
+image captioning. Our approach first aligns the MLLM through a Balance Training
+Strategy with two extra alignment tasks: the Entity-Aware Sentence Selection
+task and the Entity Selection task, together with the News Image Captioning
+task, to enhance its capability in handling multimodal entity information. The
+aligned MLLM then utilizes the additional entity-related information it
+explicitly extracts to supplement its textual input while generating news image
+captions. Our approach achieves better results than all previous models in
+CIDEr score on the GoodNews dataset (72.33 -> 88.39) and the NYTimes800k
+dataset (70.83 -> 85.61).
+
+
+
+
+
+ + ♻ ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to +unlabelled data partially based on knowledge learned from labelled data, where +the unlabelled data may come from known or novel classes. The prevailing +approach generally involves clustering across all data and learning conceptions +by prototypical contrastive learning. However, existing methods largely hinge +on the performance of clustering algorithms and are thus subject to their +inherent limitations. Firstly, the estimated cluster number is often smaller +than the ground truth, making the existing methods suffer from the lack of +prototypes for comprehensive conception learning. To address this issue, we +propose an adaptive probing mechanism that introduces learnable potential +prototypes to expand cluster prototypes (centers). As there is no ground truth +for the potential prototype, we develop a self-supervised prototype learning +framework to optimize the potential prototype in an end-to-end fashion. +Secondly, clustering is computationally intensive, and the conventional +strategy of clustering both labelled and unlabelled instances exacerbates this +issue. To counteract this inefficiency, we opt to cluster only the unlabelled +instances and subsequently expand the cluster prototypes with our introduced +potential prototypes to fast explore novel classes. Despite the simplicity of +our proposed method, extensive empirical analysis on a wide range of datasets +confirms that our method consistently delivers state-of-the-art results. +Specifically, our method surpasses the nearest competitor by a significant +margin of 9.7% within the Stanford Cars dataset and 12x clustering efficiency +within the Herbarium 19 dataset. We will make the code and checkpoints publicly +available at https://github.com/xjtuYW/PNP.git. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Integrating Language-Derived Appearance Elements with Visual Cues in + Pedestrian Detection + + +
+ Large language models (LLMs) have shown their capabilities in understanding
+contextual and semantic information regarding knowledge of instance
+appearances. In this paper, we introduce a novel approach to utilize the
+strengths of LLMs in understanding contextual appearance variations and to
+leverage this knowledge into a vision model (here, pedestrian detection). While
+pedestrian detection is considered one of the crucial tasks directly related to
+our safety (e.g., intelligent driving systems), it is challenging because of
+varying appearances and poses in diverse scenes. Therefore, we propose to
+formulate language-derived appearance elements and incorporate them with visual
+cues in pedestrian detection. To this end, we establish a description corpus
+that includes numerous narratives describing various appearances of pedestrians
+and other instances. By feeding them through an LLM, we extract appearance
+knowledge sets that contain the representations of appearance variations.
+Subsequently, we perform a task-prompting process to obtain appearance
+elements, i.e., representative appearance knowledge guided to be relevant to
+the downstream pedestrian detection task. The obtained knowledge elements are
+adaptable to various detection frameworks, so that we can provide plentiful
+appearance information by integrating the language-derived appearance elements
+with visual cues within a detector. Through comprehensive experiments with
+various pedestrian detectors, we verify the adaptability and effectiveness of
+our method, showing noticeable performance gains and achieving state-of-the-art
+detection performance on two public pedestrian detection benchmarks (i.e.,
+CrowdHuman and WiderPedestrian).
+
+
+
+
+
+ + ♻ ☆ Human-annotated label noise and their impact on ConvNets for remote + sensing image scene classification + + +
+ Convolutional neural networks (ConvNets) have been successfully applied to
+satellite image scene classification. Human-labeled training datasets are
+essential for ConvNets to perform accurate classification. Errors in
+human-annotated training datasets are unavoidable due to the complexity of
+satellite images. However, the distribution of real-world human-annotated label
+noise on remote sensing images and its impact on ConvNets have not been
+investigated. To fill this research gap, this study, for the first time,
+collected real-world labels from 32 participants and explored how their
+annotated label noise affects three representative ConvNets (VGG16, GoogleNet,
+and ResNet-50) for remote sensing image scene classification. We found that:
+(1) human-annotated label noise exhibits significant class and instance
+dependence; (2) an additional 1% of human-annotated label noise in training
+data leads to a 0.5% reduction in the overall classification accuracy of
+ConvNets; (3) the error pattern of ConvNet predictions was strongly correlated
+with that of participants' labels. To uncover the mechanism underlying the
+impact of human labeling errors on ConvNets, we further compared it with three
+types of simulated label noise: uniform noise, class-dependent noise, and
+instance-dependent noise. Our results show that the impact of human-annotated
+label noise on ConvNets significantly differs from all three types of simulated
+label noise, while both class dependence and instance dependence contribute to
+the impact of human-annotated label noise on ConvNets. These observations
+necessitate a reevaluation of the handling of noisy labels, and we anticipate
+that our real-world label noise dataset will facilitate the future development
+and assessment of label-noise learning algorithms.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Discrete approximations of Gaussian smoothing and Gaussian derivatives + + +
+ This paper develops an in-depth treatment concerning the problem of
+approximating the Gaussian smoothing and Gaussian derivative computations in
+scale-space theory for application on discrete data. With close connections to
+previous axiomatic treatments of continuous and discrete scale-space theory, we
+consider three main ways of discretizing these scale-space operations in terms
+of explicit discrete convolutions, based on (i) sampling the Gaussian kernels
+and the Gaussian derivative kernels, (ii) locally integrating the Gaussian
+kernels and the Gaussian derivative kernels over each pixel support region, or
+(iii) basing the scale-space analysis on the discrete analogue of the Gaussian
+kernel, and then computing derivative approximations by applying small-support
+central difference operators to the spatially smoothed image data.
+ We study the properties of these three main discretization methods both
+theoretically and experimentally, and characterize their performance by
+quantitative measures, including the results they give rise to with respect to
+the task of scale selection, investigated for four different use cases, and
+with emphasis on the behaviour at fine scales. The results show that the
+sampled Gaussian kernels and derivatives as well as the integrated Gaussian
+kernels and derivatives perform very poorly at very fine scales. At very fine
+scales, the discrete analogue of the Gaussian kernel with its corresponding
+discrete derivative approximations performs substantially better. The sampled
+Gaussian kernel and the sampled Gaussian derivatives do, on the other hand,
+lead to numerically very good approximations of the corresponding continuous
+results when the scale parameter is sufficiently large; in the experiments
+presented in the paper, this means when the scale parameter is greater than a
+value of about 1, in units of the grid spacing.
+
+
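The contrast between the sampled Gaussian kernel and the discrete analogue of the Gaussian kernel, T(n, t) = exp(-t) I_n(t) with I_n the modified Bessel functions, can be checked numerically with a short script; the kernel radius and scale values below are arbitrary choices for illustration.

```python
import numpy as np
from scipy.special import ive   # ive(n, t) = exp(-t) * I_n(t)

def sampled_gaussian(sigma, radius=8):
    n = np.arange(-radius, radius + 1)
    return np.exp(-n**2 / (2.0 * sigma**2)) / (np.sqrt(2.0 * np.pi) * sigma)

def discrete_gaussian(sigma, radius=8):
    t = sigma**2                       # the variance plays the role of the scale parameter t
    n = np.arange(-radius, radius + 1)
    return ive(np.abs(n), t)

for sigma in (0.3, 0.5, 1.0, 2.0):
    print(sigma, sampled_gaussian(sigma).sum(), discrete_gaussian(sigma).sum())
# For small sigma the sampled kernel's sum drifts away from 1, whereas the
# discrete analogue remains normalized up to truncation of the kernel support.
```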
+ comment: 42 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ Cross-Task Multi-Branch Vision Transformer for Facial Expression and + Mask Wearing Classification + + +
+ With wearing masks becoming a new cultural norm, facial expression +recognition (FER) while taking masks into account has become a significant +challenge. In this paper, we propose a unified multi-branch vision transformer +for facial expression recognition and mask wearing classification tasks. Our +approach extracts shared features for both tasks using a dual-branch +architecture that obtains multi-scale feature representations. Furthermore, we +propose a cross-task fusion phase that processes tokens for each task with +separate branches, while exchanging information using a cross attention module. +Our proposed framework reduces the overall complexity compared with using +separate networks for both tasks by the simple yet effective cross-task fusion +phase. Extensive experiments demonstrate that our proposed model performs +better than or on par with different state-of-the-art methods on both facial +expression recognition and facial mask wearing classification task. + +
+
+
+
+
+ + ♻ ☆ Scaling up Multi-domain Semantic Segmentation with Sentence Embeddings + + +
+ We propose an approach to semantic segmentation that achieves +state-of-the-art supervised performance when applied in a zero-shot setting. It +thus achieves results equivalent to those of the supervised methods, on each of +the major semantic segmentation datasets, without training on those datasets. +This is achieved by replacing each class label with a vector-valued embedding +of a short paragraph that describes the class. The generality and simplicity of +this approach enables merging multiple datasets from different domains, each +with varying class labels and semantics. The resulting merged semantic +segmentation dataset of over 2 Million images enables training a model that +achieves performance equal to that of state-of-the-art supervised methods on 7 +benchmark datasets, despite not using any images therefrom. By fine-tuning the +model on standard semantic segmentation datasets, we also achieve a significant +improvement over the state-of-the-art supervised segmentation on NYUD-V2 and +PASCAL-context at 60% and 65% mIoU, respectively. Based on the closeness of +language embeddings, our method can even segment unseen labels. Extensive +experiments demonstrate strong generalization to unseen image domains and +unseen labels, and that the method enables impressive performance improvements +in downstream applications, including depth estimation and instance +segmentation. + +
+
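The core mechanism, replacing a class index with a sentence embedding and classifying pixels by similarity to those embeddings, can be sketched as follows; shapes and names are illustrative, not the paper's model.

```python
import torch
import torch.nn.functional as F

def segment_with_embeddings(pixel_feats, class_embeds):
    """pixel_feats: (B, D, H, W); class_embeds: (C, D) embeddings of class descriptions."""
    f = F.normalize(pixel_feats, dim=1)
    e = F.normalize(class_embeds, dim=1)
    logits = torch.einsum("bdhw,cd->bchw", f, e)   # cosine similarity to every class description
    return logits.argmax(dim=1)                    # (B, H, W) label map
```

Because classes are represented by text embeddings rather than fixed indices, datasets with different label sets can be merged, and an unseen label can be scored simply by embedding its description at test time.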
+ comment: 14 pages. Accepted to Int. J. Comp. Vis. (IJCV) +
+
+
+
+
+ + ♻ ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video + + +
+ Action recognition, an essential component of computer vision, plays a +pivotal role in multiple applications. Despite significant improvements brought +by Convolutional Neural Networks (CNNs), these models suffer performance +declines when trained with discontinuous video frames, which is a frequent +scenario in real-world settings. This decline primarily results from the loss +of temporal continuity, which is crucial for understanding the semantics of +human actions. To overcome this issue, we introduce the 4A (Action +Animation-based Augmentation Approach) pipeline, which employs a series of +sophisticated techniques: starting with 2D human pose estimation from RGB +videos, followed by Quaternion-based Graph Convolution Network for joint +orientation and trajectory prediction, and Dynamic Skeletal Interpolation for +creating smoother, diversified actions using game engine technology. This +innovative approach generates realistic animations in varied game environments, +viewed from multiple viewpoints. In this way, our method effectively bridges +the domain gap between virtual and real-world data. In experimental +evaluations, the 4A pipeline achieves comparable or even superior performance +to traditional training approaches using real-world data, while requiring only +10% of the original data volume. Additionally, our approach demonstrates +enhanced performance on In-the-wild videos, marking a significant advancement +in the field of action recognition. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.13414 +
+
+
+
+
+ + ♻ ☆ Efficient Bayesian Uncertainty Estimation for nnU-Net + + +
+ The self-configuring nnU-Net has achieved leading performance in a large +range of medical image segmentation challenges. It is widely considered as the +model of choice and a strong baseline for medical image segmentation. However, +despite its extraordinary performance, nnU-Net does not supply a measure of +uncertainty to indicate its possible failure. This can be problematic for +large-scale image segmentation applications, where data are heterogeneous and +nnU-Net may fail without notice. In this work, we introduce a novel method to +estimate nnU-Net uncertainty for medical image segmentation. We propose a +highly effective scheme for posterior sampling of weight space for Bayesian +uncertainty estimation. Different from previous baseline methods such as Monte +Carlo Dropout and mean-field Bayesian Neural Networks, our proposed method does +not require a variational architecture and keeps the original nnU-Net +architecture intact, thereby preserving its excellent performance and ease of +use. Additionally, we boost the segmentation performance over the original +nnU-Net via marginalizing multi-modal posterior models. We applied our method +on the public ACDC and M&M datasets of cardiac MRI and demonstrated improved +uncertainty estimation over a range of baseline methods. The proposed method +further strengthens nnU-Net for medical image segmentation in terms of both +segmentation accuracy and quality control. + +
+
+
+
+
+ + ♻ ☆ Relaxometry Guided Quantitative Cardiac Magnetic Resonance Image + Reconstruction + + +
+ Deep learning-based methods have achieved impressive performance for
+magnetic resonance imaging (MRI) reconstruction, enabling fast imaging for many
+clinical applications. Previous methods employ convolutional networks to learn
+the image prior as the regularization term. In quantitative MRI, the physical
+model of nuclear magnetic resonance relaxometry is known, providing additional
+prior knowledge for image reconstruction. However, traditional reconstruction
+networks are limited to learning the spatial domain prior knowledge, ignoring
+the relaxometry prior. Therefore, we propose a relaxometry-guided quantitative
+MRI reconstruction framework to learn the spatial prior from data and the
+relaxometry prior from MRI physics. Additionally, we also evaluated the
+performance of two popular reconstruction backbones, namely, recurrent
+variational networks (RVN) and variational networks (VN) with U-Net.
+Experiments demonstrate that the proposed method achieves highly promising
+results in quantitative MRI reconstruction.
+
+
+
+
+
+ + ♻ ☆ Characterization of dim light response in DVS pixel: Discontinuity of + event triggering time + + +
+ Dynamic Vision Sensors (DVS) have recently generated great interest because +of the advantages of wide dynamic range and low latency compared with +conventional frame-based cameras. However, the complicated behaviors in dim +light conditions are still not clear, restricting the applications of DVS. In +this paper, we analyze the typical DVS circuit, and find that there exists +discontinuity of event triggering time. In dim light conditions, the +discontinuity becomes prominent. We point out that the discontinuity depends +exclusively on the changing speed of light intensity. Experimental results on +real event data validate the analysis and the existence of discontinuity that +reveals the non-first-order behaviors of DVS in dim light conditions. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ HyperSDFusion: Bridging Hierarchical Structures in Language and Geometry + for Enhanced 3D Text2Shape Generation + + +
+ 3D shape generation from text is a fundamental task in 3D representation +learning. The text-shape pairs exhibit a hierarchical structure, where a +general text like ``chair" covers all 3D shapes of the chair, while more +detailed prompts refer to more specific shapes. Furthermore, both text and 3D +shapes are inherently hierarchical structures. However, existing Text2Shape +methods, such as SDFusion, do not exploit that. In this work, we propose +HyperSDFusion, a dual-branch diffusion model that generates 3D shapes from a +given text. Since hyperbolic space is suitable for handling hierarchical data, +we propose to learn the hierarchical representations of text and 3D shapes in +hyperbolic space. First, we introduce a hyperbolic text-image encoder to learn +the sequential and multi-modal hierarchical features of text in hyperbolic +space. In addition, we design a hyperbolic text-graph convolution module to +learn the hierarchical features of text in hyperbolic space. In order to fully +utilize these text features, we introduce a dual-branch structure to embed text +features in 3D feature space. At last, to endow the generated 3D shapes with a +hierarchical structure, we devise a hyperbolic hierarchical loss. Our method is +the first to explore the hyperbolic hierarchical representation for +text-to-shape generation. Experimental results on the existing text-to-shape +paired dataset, Text2Shape, achieved state-of-the-art results. We release our +implementation under HyperSDFusion.github.io. + +
+
+
+
+
+ + ♻ ☆ An Effective Image Copy-Move Forgery Detection Using Entropy Information + + +
+ Image forensics has become increasingly crucial in our daily lives. Among
+various types of forgeries, copy-move forgery detection has received
+considerable attention within the academic community. Keypoint-based
+algorithms, particularly those based on the Scale Invariant Feature Transform,
+have achieved promising outcomes. However, most keypoint detection algorithms
+fail to generate sufficient matches when tampered patches occur in smooth
+areas. Therefore, this paper introduces entropy images to determine the
+coordinates and scales of keypoints based on the Scale Invariant Feature
+Transform detector, which makes the pre-processing more suitable for solving
+the above problem. Furthermore, an overlapped entropy level clustering
+algorithm is developed to mitigate the increased matching complexity caused by
+the non-ideal distribution of gray values in keypoints. Experimental results
+demonstrate that our algorithm achieves a good balance between performance and
+time efficiency.
+
+
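A local-entropy image of the kind used above can be produced with standard scikit-image calls; this is only the pre-processing ingredient, not the paper's full pipeline, and the file path and neighborhood size are placeholders.

```python
from skimage import color, img_as_ubyte, io
from skimage.filters.rank import entropy
from skimage.morphology import disk

gray = img_as_ubyte(color.rgb2gray(io.imread("suspect.png")))  # hypothetical input image
entropy_img = entropy(gray, disk(5))                 # local Shannon entropy per pixel
entropy_img = img_as_ubyte(entropy_img / entropy_img.max())
# Keypoints (e.g. SIFT) can then be detected on `entropy_img` instead of `gray`,
# which tends to produce more keypoints inside smooth, low-texture regions.
```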
+
+
+
+ + ♻ ☆ MVDiffusion++: A Dense High-resolution Multi-view Diffusion Model for + Single or Sparse-view 3D Object Reconstruction + + +
+ This paper presents a neural architecture MVDiffusion++ for 3D object +reconstruction that synthesizes dense and high-resolution views of an object +given one or a few images without camera poses. MVDiffusion++ achieves superior +flexibility and scalability with two surprisingly simple ideas: 1) A +``pose-free architecture'' where standard self-attention among 2D latent +features learns 3D consistency across an arbitrary number of conditional and +generation views without explicitly using camera pose information; and 2) A +``view dropout strategy'' that discards a substantial number of output views +during training, which reduces the training-time memory footprint and enables +dense and high-resolution view synthesis at test time. We use the Objaverse for +training and the Google Scanned Objects for evaluation with standard novel view +synthesis and 3D reconstruction metrics, where MVDiffusion++ significantly +outperforms the current state of the arts. We also demonstrate a text-to-3D +application example by combining MVDiffusion++ with a text-to-image generative +model. The project page is at https://mvdiffusion-plusplus.github.io. + +
+
+ comment: 3D generation, project page: https://mvdiffusion-plusplus.github.io/ +
+
+
+
+
+ + ♻ ☆ ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset + via Beam Splitter Camera Rig + + +
+ The fusion of images from dual camera systems featuring a wide-angle and a
+telephoto camera has become a hotspot problem recently. By integrating
+simultaneously captured wide-angle and telephoto images from these systems, the
+resulting fused image achieves a wide field of view (FOV) coupled with
+high-definition quality. Existing approaches are mostly deep learning methods
+that predominantly rely on supervised learning, where the training dataset
+plays a pivotal role. However, current datasets typically adopt a data
+synthesis approach to generate input pairs of wide-angle and telephoto images
+alongside ground-truth images. Notably, the wide-angle inputs are synthesized
+rather than captured using real wide-angle cameras, and the ground-truth image
+is captured by a wide-angle camera, whose quality is substantially lower than
+that of the input telephoto images captured by telephoto cameras. To address
+these limitations, we introduce a novel hardware setup utilizing a beam
+splitter to simultaneously capture three images, i.e., the input pair and the
+ground-truth image, from two authentic cellphones equipped with wide-angle and
+telephoto dual cameras. Specifically, the wide-angle and telephoto images
+captured by cellphone 2 serve as the input pair, while the telephoto image
+captured by cellphone 1, which is calibrated to match the optical path of the
+wide-angle image from cellphone 2, serves as the ground-truth image,
+maintaining quality on par with the input telephoto image. Experiments validate
+that our newly introduced dataset, named ReWiTe, significantly enhances the
+performance of various existing methods for real-world wide-angle and telephoto
+dual image fusion tasks.
+
+
+
+
+
+ + ♻ ☆ TrACT: A Training Dynamics Aware Contrastive Learning Framework for + Long-tail Trajectory Prediction + + +
+ As a safety critical task, autonomous driving requires accurate predictions +of road users' future trajectories for safe motion planning, particularly under +challenging conditions. Yet, many recent deep learning methods suffer from a +degraded performance on the challenging scenarios, mainly because these +scenarios appear less frequently in the training data. To address such a +long-tail issue, existing methods force challenging scenarios closer together +in the feature space during training to trigger information sharing among them +for more robust learning. These methods, however, primarily rely on the motion +patterns to characterize scenarios, omitting more informative contextual +information, such as interactions and scene layout. We argue that exploiting +such information not only improves prediction accuracy but also scene +compliance of the generated trajectories. In this paper, we propose to +incorporate richer training dynamics information into a prototypical +contrastive learning framework. More specifically, we propose a two-stage +process. First, we generate rich contextual features using a baseline +encoder-decoder framework. These features are split into clusters based on the +model's output errors, using the training dynamics information, and a prototype +is computed within each cluster. Second, we retrain the model using the +prototypes in a contrastive learning framework. We conduct empirical +evaluations of our approach using two large-scale naturalistic datasets and +show that our method achieves state-of-the-art performance by improving +accuracy and scene compliance on the long-tail samples. Furthermore, we perform +experiments on a subset of the clusters to highlight the additional benefit of +our approach in reducing training bias. + +
+
+ comment: 2024 IEEE Intelligent Vehicles Symposium (IV) +
+
+
+
+
+ + ♻ ☆ Feature Density Estimation for Out-of-Distribution Detection via + Normalizing Flows + + +
+ Out-of-distribution (OOD) detection is a critical task for safe deployment of +learning systems in the open world setting. In this work, we investigate the +use of feature density estimation via normalizing flows for OOD detection and +present a fully unsupervised approach which requires no exposure to OOD data, +avoiding researcher bias in OOD sample selection. This is a post-hoc method +which can be applied to any pretrained model, and involves training a +lightweight auxiliary normalizing flow model to perform the out-of-distribution +detection via density thresholding. Experiments on OOD detection in image +classification show strong results for far-OOD data detection with only a +single epoch of flow training, including 98.2% AUROC for ImageNet-1k vs. +Textures, which exceeds the state of the art by 7.8%. We additionally explore +the connection between the feature space distribution of the pretrained model +and the performance of our method. Finally, we provide insights into training +pitfalls that have plagued normalizing flows for use in OOD detection. + +
+
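The post-hoc scoring step amounts to thresholding the flow's log-density on backbone features; below is a minimal sketch, assuming a trained flow object that exposes `log_prob` (e.g. from a normalizing-flow library) applied to frozen penultimate-layer features.

```python
import numpy as np

def fit_threshold(flow, id_features, quantile=0.05):
    """Pick a log-density threshold so roughly 95% of in-distribution features pass."""
    log_p = flow.log_prob(id_features).detach().cpu().numpy()
    return np.quantile(log_p, quantile)

def is_ood(flow, features, threshold):
    log_p = flow.log_prob(features).detach().cpu().numpy()
    return log_p < threshold    # True where a sample looks out-of-distribution
```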
+ comment: Accepted to CRV 2024 +
+
+
+
+
+ + ♻ ☆ NOLA: Compressing LoRA using Linear Combination of Random Basis ICLR 2024 + + +
+ Fine-tuning Large Language Models (LLMs) and storing them for each downstream
+task or domain is impractical because of the massive model size (e.g., 350GB in
+GPT-3). Current literature, such as LoRA, showcases the potential of low-rank
+modifications to the original weights of an LLM, enabling efficient adaptation
+and storage for task-specific models. These methods can reduce the number of
+parameters needed to fine-tune an LLM by several orders of magnitude. Yet,
+these methods face two primary limitations: (1) the parameter count is
+lower-bounded by the rank one decomposition, and (2) the extent of reduction is
+heavily influenced by both the model architecture and the chosen rank. We
+introduce NOLA, which overcomes the rank one lower bound present in LoRA. It
+achieves this by re-parameterizing the low-rank matrices in LoRA using linear
+combinations of randomly generated basis matrices and optimizing only the
+linear mixture coefficients. This approach allows us to decouple the number of
+trainable parameters from both the choice of rank and the network architecture.
+We present adaptation results using GPT-2, LLaMA-2, and ViT in natural language
+and computer vision tasks. NOLA performs as well as LoRA models while using far
+fewer parameters than LoRA with rank one, the best compression LoRA can
+achieve. In particular, on LLaMA-2 70B, our method is almost 20 times more
+compact than the most compressed LoRA without degradation in accuracy. Our code
+is available here: https://github.com/UCDvision/NOLA
+
+
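The re-parameterization can be illustrated with a small module; this is one reading of the idea (frozen random bases, trainable mixture coefficients), with sizes, initialization, and names chosen for illustration rather than taken from the released code.

```python
import torch
import torch.nn as nn

class NOLALinear(nn.Module):
    """Wrap a frozen linear layer with a NOLA-style low-rank update."""
    def __init__(self, base: nn.Linear, rank=4, k=64):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)
        d_out, d_in = base.weight.shape
        # Frozen random bases for the two low-rank factors.
        self.register_buffer("basis_A", torch.randn(k, rank, d_in) / d_in**0.5)
        self.register_buffer("basis_B", torch.randn(k, d_out, rank) / rank**0.5)
        # Only these 2k mixture coefficients are trained; beta starts at zero so
        # the update is initially zero, as in LoRA, while alpha keeps gradients alive.
        self.alpha = nn.Parameter(torch.randn(k) / k**0.5)
        self.beta = nn.Parameter(torch.zeros(k))

    def forward(self, x):
        A = torch.einsum("k,krd->rd", self.alpha, self.basis_A)   # (rank, d_in)
        B = torch.einsum("k,kdr->dr", self.beta, self.basis_B)    # (d_out, rank)
        return self.base(x) + x @ A.t() @ B.t()
```

The number of trainable parameters is 2k per adapted layer, independent of both the rank and the layer width, which is the decoupling the abstract describes.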
+ comment: ICLR 2024. Our code is available here: + https://github.com/UCDvision/NOLA +
+
+
+
+
+ + ♻ ☆ High-quality Surface Reconstruction using Gaussian Surfels + + +
+ We propose a novel point-based representation, Gaussian surfels, to combine +the advantages of the flexible optimization procedure in 3D Gaussian points and +the surface alignment property of surfels. This is achieved by directly setting +the z-scale of 3D Gaussian points to 0, effectively flattening the original 3D +ellipsoid into a 2D ellipse. Such a design provides clear guidance to the +optimizer. By treating the local z-axis as the normal direction, it greatly +improves optimization stability and surface alignment. While the derivatives to +the local z-axis computed from the covariance matrix are zero in this setting, +we design a self-supervised normal-depth consistency loss to remedy this issue. +Monocular normal priors and foreground masks are incorporated to enhance the +quality of the reconstruction, mitigating issues related to highlights and +background. We propose a volumetric cutting method to aggregate the information +of Gaussian surfels so as to remove erroneous points in depth maps generated by +alpha blending. Finally, we apply screened Poisson reconstruction method to the +fused depth maps to extract the surface mesh. Experimental results show that +our method demonstrates superior performance in surface reconstruction compared +to state-of-the-art neural volume rendering and point-based rendering methods. + +
+
+ comment: Results added and improved +
+
+
+
+
+ + ♻ ☆ Vision-Language Generative Model for View-Specific Chest X-ray + Generation + + +
+ Synthetic medical data generation has opened up new possibilities in the +healthcare domain, offering a powerful tool for simulating clinical scenarios, +enhancing diagnostic and treatment quality, gaining granular medical knowledge, +and accelerating the development of unbiased algorithms. In this context, we +present a novel approach called ViewXGen, designed to overcome the limitations +of existing methods that rely on general domain pipelines using only radiology +reports to generate frontal-view chest X-rays. Our approach takes into +consideration the diverse view positions found in the dataset, enabling the +generation of chest X-rays with specific views, which marks a significant +advancement in the field. To achieve this, we introduce a set of specially +designed tokens for each view position, tailoring the generation process to the +user's preferences. Furthermore, we leverage multi-view chest X-rays as input, +incorporating valuable information from different views within the same study. +This integration rectifies potential errors and contributes to faithfully +capturing abnormal findings in chest X-ray generation. To validate the +effectiveness of our approach, we conducted statistical analyses, evaluating +its performance in a clinical efficacy metric on the MIMIC-CXR dataset. Also, +human evaluation demonstrates the remarkable capabilities of ViewXGen, +particularly in producing realistic view-specific X-rays that closely resemble +the original images. + +
+
+ comment: Accepted at CHIL 2024 +
+
+
+
+
+ + ♻ ☆ An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced + linear classification + + +
+ This article presents a new polynomial parameterized sigmoid called SIGTRON, +which is an extended asymmetric sigmoid with Perceptron, and its companion +convex model called SIGTRON-imbalanced classification (SIC) model that employs +a virtual SIGTRON-induced convex loss function. In contrast to the conventional +$\pi$-weighted cost-sensitive learning model, the SIC model does not have an +external $\pi$-weight on the loss function but has internal parameters in the +virtual SIGTRON-induced loss function. As a consequence, when the given +training dataset is close to the well-balanced condition considering the +(scale-)class-imbalance ratio, we show that the proposed SIC model is more +adaptive to variations of the dataset, such as the inconsistency of the +(scale-)class-imbalance ratio between the training and test datasets. This +adaptation is justified by a skewed hyperplane equation, created via +linearization of the gradient satisfying $\epsilon$-optimal condition. + Additionally, we present a quasi-Newton optimization(L-BFGS) framework for +the virtual convex loss by developing an interval-based bisection line search. +Empirically, we have observed that the proposed approach outperforms (or is +comparable to) $\pi$-weighted convex focal loss and balanced classifier +LIBLINEAR(logistic regression, SVM, and L2SVM) in terms of test classification +accuracy with $51$ two-class and $67$ multi-class datasets. In binary +classification problems, where the scale-class-imbalance ratio of the training +dataset is not significant but the inconsistency exists, a group of SIC models +with the best test accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC +with RBF kernel), a well-known kernel-based classifier. + +
+
+ comment: 26 pages, 9 figures, revised version +
+
+
+
+
+ + ♻ ☆ PuzzleVQA: Diagnosing Multimodal Reasoning Challenges of Language Models + with Abstract Visual Patterns + + +
+ Large multimodal models extend the impressive capabilities of large language +models by integrating multimodal understanding abilities. However, it is not +clear how they can emulate the general intelligence and reasoning ability of +humans. As recognizing patterns and abstracting concepts are key to general +intelligence, we introduce PuzzleVQA, a collection of puzzles based on abstract +patterns. With this dataset, we evaluate large multimodal models with abstract +patterns based on fundamental concepts, including colors, numbers, sizes, and +shapes. Through our experiments on state-of-the-art large multimodal models, we +find that they are not able to generalize well to simple abstract patterns. +Notably, even GPT-4V cannot solve more than half of the puzzles. To diagnose +the reasoning challenges in large multimodal models, we progressively guide the +models with our ground truth reasoning explanations for visual perception, +inductive reasoning, and deductive reasoning. Our systematic analysis finds +that the main bottlenecks of GPT-4V are weaker visual perception and inductive +reasoning abilities. Through this work, we hope to shed light on the +limitations of large multimodal models and how they can better emulate human +cognitive processes in the future (Our data and code will be released publicly +at https://github.com/declare-lab/LLM-PuzzleTest). + +
+
+
+
+
+ + ♻ ☆ AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains + Into One CVPR 2024 + + +
+ A handful of visual foundation models (VFMs) have recently emerged as the +backbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are +trained with distinct objectives, exhibiting unique characteristics for various +downstream tasks. We find that despite their conceptual differences, these +models can be effectively merged into a unified model through multi-teacher +distillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All +Domains Into One). This integrative approach not only surpasses the performance +of individual teacher models but also amalgamates their distinctive features, +such as zero-shot vision-language comprehension, detailed pixel-level +understanding, and open vocabulary segmentation capabilities. In pursuit of the +most hardware-efficient backbone, we evaluated numerous architectures in our +multi-teacher distillation pipeline using the same training recipe. This led to +the development of a novel architecture (E-RADIO) that exceeds the performance +of its predecessors and is at least 7x faster than the teacher models. Our +comprehensive benchmarking process covers downstream tasks including ImageNet +classification, ADE20k semantic segmentation, COCO object detection and +LLaVa-1.5 framework. + Code: https://github.com/NVlabs/RADIO + +
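As background on the distillation setup described above, here is a hedged PyTorch sketch of generic multi-teacher feature distillation: the student's features are mapped through one projection head per teacher and matched to each frozen teacher's features. The MSE matching, linear heads, and equal loss weights are illustrative assumptions, not the AM-RADIO training recipe.

```python
import torch
import torch.nn as nn

class MultiTeacherDistiller(nn.Module):
    """Generic multi-teacher feature distillation (illustrative only)."""

    def __init__(self, student, teachers, student_dim, teacher_dims):
        super().__init__()
        self.student = student
        self.teachers = nn.ModuleDict(teachers)  # name -> frozen teacher
        # One linear adaptor per teacher, mapping student features
        # into that teacher's feature space.
        self.heads = nn.ModuleDict({
            name: nn.Linear(student_dim, dim) for name, dim in teacher_dims.items()
        })
        for t in self.teachers.values():
            t.requires_grad_(False)

    def forward(self, images):
        s_feat = self.student(images)            # assumed shape (B, student_dim)
        losses = {}
        for name, teacher in self.teachers.items():
            with torch.no_grad():
                t_feat = teacher(images)         # (B, teacher_dim)
            pred = self.heads[name](s_feat)
            losses[name] = nn.functional.mse_loss(pred, t_feat)
        # Equal weighting here is a simplification; real recipes tune this.
        return sum(losses.values()), losses
```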
+
+ comment: CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper, + table 1 is now more comprehensive Version 2: Added more acknowledgements and + updated table 7 with more recent results. Ensured that the link in the + abstract to our code is working properly Version 3: Fix broken hyperlinks +
+
+
+
+
+ + ♻ ☆ Refining Remote Photoplethysmography Architectures using CKA and + Empirical Methods + + +
+ Model architecture refinement is a challenging task in deep learning research +fields such as remote photoplethysmography (rPPG). One architectural +consideration, the depth of the model, can have significant consequences on the +resulting performance. In rPPG models that are overprovisioned with more layers +than necessary, redundancies exist, the removal of which can result in faster +training and reduced computational load at inference time. With too few layers +the models may exhibit sub-optimal error rates. We apply Centered Kernel +Alignment (CKA) to an array of rPPG architectures of differing depths, +demonstrating that shallower models do not learn the same representations as +deeper models, and that after a certain depth, redundant layers are added +without significantly increased functionality. An empirical study confirms how +the architectural deficiencies discovered using CKA impact performance, and we +show how CKA as a diagnostic can be used to refine rPPG architectures. + +
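Since the analysis hinges on CKA, a minimal NumPy sketch of standard linear CKA between two layers' activations follows (the textbook formula, not the authors' code); comparing every layer against every other layer reveals the redundant blocks the abstract refers to.

```python
import numpy as np

def linear_cka(X, Y):
    """Linear Centered Kernel Alignment between two activation matrices.

    X: (n_samples, d1), Y: (n_samples, d2) activations from two layers
    (or two models) on the same inputs. Returns a value in [0, 1];
    values near 1 mean the layers learn very similar representations.
    """
    X = X - X.mean(axis=0, keepdims=True)
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(Y.T @ X, ord="fro") ** 2
    norm_x = np.linalg.norm(X.T @ X, ord="fro")
    norm_y = np.linalg.norm(Y.T @ Y, ord="fro")
    return hsic / (norm_x * norm_y)
```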
+
+
+
+
+ + ♻ ☆ Hierarchical Hybrid Sliced Wasserstein: A Scalable Metric for + Heterogeneous Joint Distributions + + +
+ Sliced Wasserstein (SW) and Generalized Sliced Wasserstein (GSW) have been +widely used in applications due to their computational and statistical +scalability. However, the SW and the GSW are only defined between distributions +supported on a homogeneous domain. This limitation prevents their usage in +applications with heterogeneous joint distributions with marginal distributions +supported on multiple different domains. Using SW and GSW directly on the joint +domains cannot make a meaningful comparison since their homogeneous slicing +operator i.e., Radon Transform (RT) and Generalized Radon Transform (GRT) are +not expressive enough to capture the structure of the joint supports set. To +address the issue, we propose two new slicing operators i.e., Partial +Generalized Radon Transform (PGRT) and Hierarchical Hybrid Radon Transform +(HHRT). In greater detail, PGRT is the generalization of Partial Radon +Transform (PRT), which transforms a subset of function arguments non-linearly +while HHRT is the composition of PRT and multiple domain-specific PGRT on +marginal domain arguments. By using HHRT, we extend the SW into Hierarchical +Hybrid Sliced Wasserstein (H2SW) distance which is designed specifically for +comparing heterogeneous joint distributions. We then discuss the topological, +statistical, and computational properties of H2SW. Finally, we demonstrate the +favorable performance of H2SW in 3D mesh deformation, deep 3D mesh +autoencoders, and datasets comparison. + +
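For context, the vanilla Sliced Wasserstein distance that H2SW generalizes can be estimated with random projections as in the NumPy sketch below. This shows only the standard Monte-Carlo estimator on a homogeneous domain, not the PGRT/HHRT slicing operators introduced in the paper; the number of projections and quantile grid are arbitrary choices.

```python
import numpy as np

def sliced_wasserstein(X, Y, n_projections=256, p=2, seed=0):
    """Monte-Carlo estimate of the Sliced Wasserstein distance.

    X: (n, d), Y: (m, d) samples from two distributions on the *same*
    homogeneous domain -- exactly the setting the paper relaxes.
    """
    rng = np.random.default_rng(seed)
    d = X.shape[1]
    theta = rng.normal(size=(n_projections, d))
    theta /= np.linalg.norm(theta, axis=1, keepdims=True)  # directions on the sphere
    # Project onto each direction and compare 1D quantile functions.
    qs = np.linspace(0.0, 1.0, 100)
    X_q = np.quantile(X @ theta.T, qs, axis=0)  # (100, n_projections)
    Y_q = np.quantile(Y @ theta.T, qs, axis=0)
    sw_p = np.mean(np.abs(X_q - Y_q) ** p)
    return sw_p ** (1.0 / p)
```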
+
+ comment: 28 pages, 11 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ TransRUPNet for Improved Polyp Segmentation + + +
+ Colorectal cancer is among the most common causes of cancer worldwide. Removal
+of precancerous polyps through early detection is essential to prevent them
+from progressing to colon cancer. We develop an advanced deep learning-based
+architecture, Transformer based Residual Upsampling Network (TransRUPNet) for
+automatic and real-time polyp segmentation. The proposed architecture,
+TransRUPNet, is an encoder-decoder network consisting of three encoder and
+decoder blocks with additional upsampling blocks at the end of the network.
+With an image size of $256\times256$, the proposed method achieves an
+excellent real-time operation speed of 47.07 frames per second with an average
+mean dice coefficient score of 0.7786 and mean Intersection over Union of
+0.7210 on the out-of-distribution polyp datasets. The results on the publicly
+available PolypGen dataset suggest that TransRUPNet can give real-time feedback
+while retaining high accuracy for in-distribution datasets. Furthermore, we
+demonstrate the generalizability of the proposed method by showing that it
+significantly improves performance on out-of-distribution datasets compared to
+the existing methods. The source code of our network is available at
+https://github.com/DebeshJha/TransRUPNet.
+
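The reported numbers are the usual binary-segmentation metrics; as a quick reference, a minimal NumPy sketch of how the Dice coefficient and IoU are typically computed from binary masks (standard definitions, not the authors' evaluation code):

```python
import numpy as np

def dice_and_iou(pred, target, eps=1e-7):
    """Dice coefficient and IoU for two binary masks of the same shape."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    dice = (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
    iou = (inter + eps) / (np.logical_or(pred, target).sum() + eps)
    return dice, iou
```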
+
+ comment: Accepted at EMBC 2024 +
+
+
+
+
+ + ♻ ☆ A survey on deep learning in medical image registration: new + technologies, uncertainty, evaluation metrics, and beyond + + +
+ Deep learning technologies have dramatically reshaped the field of medical +image registration over the past decade. The initial developments, such as +regression-based and U-Net-based networks, established the foundation for deep +learning in image registration. Subsequent progress has been made in various +aspects of deep learning-based registration, including similarity measures, +deformation regularizations, network architectures, and uncertainty estimation. +These advancements have not only enriched the field of image registration but +have also facilitated its application in a wide range of tasks, including atlas +construction, multi-atlas segmentation, motion estimation, and 2D-3D +registration. In this paper, we present a comprehensive overview of the most +recent advancements in deep learning-based image registration. We begin with a +concise introduction to the core concepts of deep learning-based image +registration. Then, we delve into innovative network architectures, loss +functions specific to registration, and methods for estimating registration +uncertainty. Additionally, this paper explores appropriate evaluation metrics +for assessing the performance of deep learning models in registration tasks. +Finally, we highlight the practical applications of these novel techniques in +medical imaging and discuss the future prospects of deep learning-based image +registration. + +
+
+ comment: A list of open-sourced code from the papers reviewed has been + organized and is available at https://bit.ly/3QgFJ9z +
+
+
+
+
+ + ♻ ☆ A Hybrid Approach for Document Layout Analysis in Document images ICDAR 2024 + + +
+ Document layout analysis involves understanding the arrangement of elements
+within a document. This paper navigates the complexities of understanding
+various elements within document images, such as text, images, tables, and
+headings. The approach employs an advanced Transformer-based object detection
+network as an innovative graphical page object detector for identifying tables,
+figures, and displayed elements. We introduce a query encoding mechanism to
+provide high-quality object queries for contrastive learning, enhancing
+efficiency in the decoder phase. We also present a hybrid matching scheme that
+integrates the decoder's original one-to-one matching strategy with the
+one-to-many matching strategy during the training phase. This approach aims to
+improve the model's accuracy and versatility in detecting various graphical
+elements on a page. Our experiments on PubLayNet, DocLayNet, and PubTables
+benchmarks show that our approach outperforms current state-of-the-art methods.
+It achieves an average precision of 97.3% on PubLayNet, 81.6% on DocLayNet, and
+98.6% on PubTables, demonstrating its superior performance in layout analysis.
+These advancements not only enhance the conversion of document images into
+editable and accessible formats but also streamline information retrieval and
+data extraction processes.
+
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ♻ ☆ GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic + Microplastics Data + + +
+ Microplastic particle ingestion or inhalation by humans is a problem of +growing concern. Unfortunately, current research methods that use machine +learning to understand their potential harms are obstructed by a lack of +available data. Deep learning techniques in particular are challenged by such +domains where only small or imbalanced data sets are available. Overcoming this +challenge often involves oversampling underrepresented classes or augmenting +the existing data to improve model performance. This paper proposes GANsemble: +a two-module framework connecting data augmentation with conditional generative +adversarial networks (cGANs) to generate class-conditioned synthetic data. +First, the data chooser module automates augmentation strategy selection by +searching for the best data augmentation strategy. Next, the cGAN module uses +this strategy to train a cGAN for generating enhanced synthetic data. We +experiment with the GANsemble framework on a small and imbalanced microplastics +data set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines +for synthetic microplastics (SYMP) data are established in terms of Frechet +Inception Distance (FID) and Inception Scores (IS). We also provide a synthetic +microplastics filter (SYMP-Filter) algorithm to increase the quality of +generated SYMP. Additionally, we show the best amount of oversampling with +augmentation to fix class imbalance in small microplastics data sets. To our +knowledge, this study is the first application of generative AI to +synthetically create microplastics data. + +
+
+ comment: Accepted to the 37th Canadian Artificial Intelligence Conference + (2024), 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ An interpretable machine learning system for colorectal cancer diagnosis + from pathology slides + + +
+ Considering the profound transformation affecting pathology practice, we
+aimed to develop a scalable artificial intelligence (AI) system to diagnose
+colorectal cancer from whole-slide images (WSI). For this, we propose a deep
+learning (DL) system that learns from weak labels, a sampling strategy that
+reduces the number of training samples by a factor of six without compromising
+performance, an approach to leverage a small subset of fully annotated samples,
+and a prototype with explainable predictions, active learning features and
+parallelisation. Noting some problems in the literature, this study is
+conducted with one of the largest WSI colorectal sample datasets, with
+approximately 10,500 WSIs. Of these samples, 900 are testing samples.
+Furthermore, the robustness of the proposed method is assessed with two
+additional external datasets (TCGA and PAIP) and a dataset of samples collected
+directly from the proposed prototype. Our proposed method predicts, for the
+patch-based tiles, a class based on the severity of the dysplasia and uses that
+information to classify the whole slide. It is trained with an interpretable
+mixed-supervision scheme to leverage the domain knowledge introduced by
+pathologists through spatial annotations. The mixed-supervision scheme allowed
+for an intelligent sampling strategy effectively evaluated in several different
+scenarios without compromising the performance. On the internal dataset, the
+method shows an accuracy of 93.44% and a sensitivity between positive
+(low-grade and high-grade dysplasia) and non-neoplastic samples of 0.996.
+Performance on the external test samples varied, with TCGA being the most
+challenging dataset, showing an overall accuracy of 84.91% and a sensitivity of
+0.996.
+
+
+ comment: Accepted at npj Precision Oncology. Available at: + https://www.nature.com/articles/s41698-024-00539-4 +
+
+
+
+
+ + ♻ ☆ Efficient Remote Sensing with Harmonized Transfer Learning and Modality + Alignment ICLR + + +
+ With the rise of Visual and Language Pretraining (VLP), an increasing number +of downstream tasks are adopting the paradigm of pretraining followed by +fine-tuning. Although this paradigm has demonstrated potential in various +multimodal downstream tasks, its implementation in the remote sensing domain +encounters some obstacles. Specifically, the tendency for same-modality +embeddings to cluster together impedes efficient transfer learning. To tackle +this issue, we review the aim of multimodal transfer learning for downstream +tasks from a unified perspective, and rethink the optimization process based on +three distinct objectives. We propose "Harmonized Transfer Learning and +Modality Alignment (HarMA)", a method that simultaneously satisfies task +constraints, modality alignment, and single-modality uniform alignment, while +minimizing training overhead through parameter-efficient fine-tuning. +Remarkably, without the need for external data for training, HarMA achieves +state-of-the-art performance in two popular multimodal retrieval tasks in the +field of remote sensing. Our experiments reveal that HarMA achieves competitive +and even superior performance to fully fine-tuned models with only minimal +adjustable parameters. Due to its simplicity, HarMA can be integrated into +almost all existing multimodal pretraining models. We hope this method can +facilitate the efficient application of large models to a wide range of +downstream tasks while significantly reducing the resource consumption. Code is +available at https://github.com/seekerhuang/HarMA. + +
+
+ comment: Accepted by the Twelfth International Conference on Learning + Representations (ICLR) Workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 151 + +
+
+
+ + ☆ Hallucination of Multimodal Large Language Models: A Survey + + +
+ This survey presents a comprehensive analysis of the phenomenon of +hallucination in multimodal large language models (MLLMs), also known as Large +Vision-Language Models (LVLMs), which have demonstrated significant +advancements and remarkable abilities in multimodal tasks. Despite these +promising developments, MLLMs often generate outputs that are inconsistent with +the visual content, a challenge known as hallucination, which poses substantial +obstacles to their practical deployment and raises concerns regarding their +reliability in real-world applications. This problem has attracted increasing +attention, prompting efforts to detect and mitigate such inaccuracies. We +review recent advances in identifying, evaluating, and mitigating these +hallucinations, offering a detailed overview of the underlying causes, +evaluation benchmarks, metrics, and strategies developed to address this issue. +Additionally, we analyze the current challenges and limitations, formulating +open questions that delineate potential pathways for future research. By +drawing the granular classification and landscapes of hallucination causes, +evaluation benchmarks, and mitigation methods, this survey aims to deepen the +understanding of hallucinations in MLLMs and inspire further advancements in +the field. Through our thorough and in-depth review, we contribute to the +ongoing dialogue on enhancing the robustness and reliability of MLLMs, +providing valuable insights and resources for researchers and practitioners +alike. Resources are available at: +https://github.com/showlab/Awesome-MLLM-Hallucination. + +
+
+ comment: 140 references +
+
+
+
+
+ + ☆ DGE: Direct Gaussian 3D Editing by Consistent Multi-view Editing + + +
+ We consider the problem of editing 3D objects and scenes based on open-ended
+language instructions. The established paradigm to solve this problem is to use
+a 2D image generator or editor to guide the 3D editing process. However, this
+is often slow, as it requires updating a computationally expensive 3D
+representation such as a neural radiance field, and doing so using
+contradictory guidance from a 2D model which is inherently not multi-view
+consistent. We thus introduce the Direct Gaussian Editor (DGE), a method that
+addresses these issues in two ways. First, we modify a given high-quality image
+editor like InstructPix2Pix to be multi-view consistent. We do so by utilizing
+a training-free approach which integrates cues from the underlying 3D geometry
+of the scene. Second, given a multi-view consistent edited sequence of images
+of the object, we directly and efficiently optimize the 3D object
+representation, which is based on 3D Gaussian Splatting. Because it does not
+require applying edits incrementally and iteratively, DGE is significantly more
+efficient than existing approaches, and comes with other perks such as allowing
+selective editing of parts of the scene.
+
+
+ comment: Project Page: https://silent-chen.github.io/DGE/ +
+
+
+
+
+ + ☆ Stylus: Automatic Adapter Selection for Diffusion Models + + +
+ Beyond scaling base models with more data or parameters, fine-tuned adapters
+provide an alternative way to generate high fidelity, custom images at reduced
+costs. As such, adapters have been widely adopted by open-source communities,
+accumulating a database of over 100K adapters, most of which are highly
+customized with insufficient descriptions. This paper explores the problem of
+matching the prompt to a set of relevant adapters, built on recent work that
+highlights the performance gains of composing adapters. We introduce Stylus,
+which efficiently selects and automatically composes task-specific adapters
+based on a prompt's keywords. Stylus outlines a three-stage approach that first
+summarizes adapters with improved descriptions and embeddings, retrieves
+relevant adapters, and then further assembles adapters based on prompts'
+keywords by checking how well they fit the prompt. To evaluate Stylus, we
+developed StylusDocs, a curated dataset featuring 75K adapters with
+pre-computed adapter embeddings. In our evaluation on popular Stable Diffusion
+checkpoints, Stylus achieves greater CLIP-FID Pareto efficiency and is twice as
+preferred, with humans and multimodal models as evaluators, over the base
+model. See stylus-diffusion.github.io for more.
+
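The retrieval stage described above can be pictured with a hedged sketch: rank pre-computed adapter embeddings by cosine similarity to the prompt keywords and keep the top matches. The `embed` callable, the mean-pooled query, and the top-k cut-off are placeholders for illustration, not the Stylus implementation.

```python
import numpy as np

def retrieve_adapters(prompt_keywords, adapter_embeddings, adapter_names,
                      embed, top_k=5):
    """Rank adapters by cosine similarity to the prompt keywords.

    `embed` is any text-embedding function returning a 1D vector
    (a placeholder for whatever encoder is used in practice).
    adapter_embeddings: (n_adapters, dim) pre-computed vectors.
    """
    query = np.mean([embed(k) for k in prompt_keywords], axis=0)
    query /= np.linalg.norm(query) + 1e-12
    A = adapter_embeddings / (
        np.linalg.norm(adapter_embeddings, axis=1, keepdims=True) + 1e-12)
    scores = A @ query
    order = np.argsort(-scores)[:top_k]
    return [(adapter_names[i], float(scores[i])) for i in order]
```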
+
+ comment: Project Website: https://stylus-diffusion.github.io +
+
+
+
+
+ + ☆ Point Cloud Models Improve Visual Robustness in Robotic Learners + + +
+ Visual control policies can encounter significant performance degradation +when visual conditions like lighting or camera position differ from those seen +during training -- often exhibiting sharp declines in capability even for minor +differences. In this work, we examine robustness to a suite of these types of +visual changes for RGB-D and point cloud based visual control policies. To +perform these experiments on both model-free and model-based reinforcement +learners, we introduce a novel Point Cloud World Model (PCWM) and point cloud +based control policies. Our experiments show that policies that explicitly +encode point clouds are significantly more robust than their RGB-D +counterparts. Further, we find our proposed PCWM significantly outperforms +prior works in terms of sample efficiency during training. Taken together, +these results suggest reasoning about the 3D scene through point clouds can +improve performance, reduce learning time, and increase robustness for robotic +learners. Project Webpage: https://pvskand.github.io/projects/PCWM + +
+
+ comment: Accepted at International Conference on Robotics and Automation, 2024 +
+
+
+
+
+ + ☆ Swin2-MoSE: A New Single Image Super-Resolution Model for Remote Sensing + + +
+ Due to the limitations of current optical and sensor technologies and the
+high cost of updating them, the spectral and spatial resolution of satellites
+may not always meet desired requirements. For these reasons, Remote-Sensing
+Single-Image Super-Resolution (RS-SISR) techniques have gained significant
+interest. In this paper, we propose the Swin2-MoSE model, an enhanced version
+of Swin2SR. Our model introduces MoE-SM, an enhanced Mixture-of-Experts (MoE)
+to replace the Feed-Forward layer inside all Transformer blocks. MoE-SM is
+designed with Smart-Merger, a new layer for merging the outputs of individual
+experts, and with a new way to split the work between experts, defining a
+per-example strategy instead of the commonly used per-token one. Furthermore,
+we analyze how positional encodings interact with each other, demonstrating
+that per-channel bias and per-head bias can positively cooperate. Finally, we
+propose to use a combination of Normalized-Cross-Correlation (NCC) and
+Structural Similarity Index Measure (SSIM) losses, to avoid typical MSE loss
+limitations. Experimental results demonstrate that Swin2-MoSE outperforms SOTA
+by up to 0.377 ~ 0.958 dB (PSNR) on the tasks of 2x, 3x and 4x
+resolution upscaling (Sen2Venus and OLI2MSI datasets). We show the efficacy of
+Swin2-MoSE, applying it to a semantic segmentation task (SeasoNet dataset).
+Code and pretrained models are available at
+https://github.com/IMPLabUniPr/swin2-mose/tree/official_code
+
+
+
+
+
+ + ☆ TheaterGen: Character Management with LLM for Consistent Multi-turn + Image Generation + + +
+ Recent advances in diffusion models can generate high-quality and stunning
+images from text. However, multi-turn image generation, which is of high demand
+in real-world scenarios, still faces challenges in maintaining semantic
+consistency between images and texts, as well as contextual consistency of the
+same subject across multiple interactive turns. To address this issue, we
+introduce TheaterGen, a training-free framework that integrates large language
+models (LLMs) and text-to-image (T2I) models to provide the capability of
+multi-turn image generation. Within this framework, LLMs, acting as a
+"Screenwriter", engage in multi-turn interaction, generating and managing a
+standardized prompt book that encompasses prompts and layout designs for each
+character in the target image. Based on these, TheaterGen generates a list of
+character images and extracts guidance information, akin to the "Rehearsal".
+Subsequently, through incorporating the prompt book and guidance information
+into the reverse denoising process of T2I diffusion models, TheaterGen
+generates the final image, conducting the "Final Performance". With the
+effective management of prompt books and character images, TheaterGen
+significantly improves semantic and contextual consistency in synthesized
+images. Furthermore, we introduce a dedicated benchmark, CMIGBench (Consistent
+Multi-turn Image Generation Benchmark) with 8000 multi-turn instructions.
+Different from previous multi-turn benchmarks, CMIGBench does not define
+characters in advance. Both the tasks of story generation and multi-turn
+editing are included in CMIGBench for comprehensive evaluation. Extensive
+experimental results show that TheaterGen outperforms state-of-the-art methods
+significantly. It raises the performance bar of the cutting-edge Mini DALLE 3
+model by 21% in average character-character similarity and 19% in average
+text-image similarity.
+
+
+
+
+
+ + ☆ RSCaMa: Remote Sensing Image Change Captioning with State Space Model + + +
+ Remote Sensing Image Change Captioning (RSICC) aims to identify surface +changes in multi-temporal remote sensing images and describe them in natural +language. Current methods typically rely on an encoder-decoder architecture and +focus on designing a sophisticated neck to process bi-temporal features +extracted by the backbone. Recently, State Space Models (SSMs), especially +Mamba, have demonstrated outstanding performance in many fields, owing to their +efficient feature-selective modelling capability. However, their potential in +the RSICC task remains unexplored. In this paper, we introduce Mamba into RSICC +and propose a novel approach called RSCaMa (Remote Sensing Change Captioning +Mamba). Specifically, we utilize Siamese backbones to extract bi-temporal +features, which are then processed through multiple CaMa layers consisting of +Spatial Difference-guided SSM (SD-SSM) and Temporal Traveling SSM (TT-SSM). +SD-SSM uses differential features to enhance change perception, while TT-SSM +promotes bitemporal interactions in a token-wise cross-scanning manner. +Experimental results validate the effectiveness of CaMa layers and demonstrate +the superior performance of RSCaMa, as well as the potential of Mamba in the +RSICC task. Additionally, we systematically compare the effects of three +language decoders, including Mamba, GPT-style decoder with causal attention +mechanism, and Transformer decoder with cross-attention mechanism. This +provides valuable insights for future RSICC research. The code will be +available at https://github.com/Chen-Yang-Liu/RSCaMa + +
+
+
+
+
+ + ☆ IPixMatch: Boost Semi-supervised Semantic Segmentation with Inter-Pixel + Relation + + +
+ The scarcity of labeled data in real-world scenarios is a critical bottleneck +of deep learning's effectiveness. Semi-supervised semantic segmentation has +been a typical solution to achieve a desirable tradeoff between annotation cost +and segmentation performance. However, previous approaches, whether based on +consistency regularization or self-training, tend to neglect the contextual +knowledge embedded within inter-pixel relations. This negligence leads to +suboptimal performance and limited generalization. In this paper, we propose a +novel approach IPixMatch designed to mine the neglected but valuable +Inter-Pixel information for semi-supervised learning. Specifically, IPixMatch +is constructed as an extension of the standard teacher-student network, +incorporating additional loss terms to capture inter-pixel relations. It shines +in low-data regimes by efficiently leveraging the limited labeled data and +extracting maximum utility from the available unlabeled data. Furthermore, +IPixMatch can be integrated seamlessly into most teacher-student frameworks +without the need of model modification or adding additional components. Our +straightforward IPixMatch method demonstrates consistent performance +improvements across various benchmark datasets under different partitioning +protocols. + +
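One common way to express an inter-pixel relation term in a teacher-student setup is to penalize disagreement between the two networks' pairwise pixel-similarity matrices on unlabeled images. The PyTorch sketch below illustrates that general idea under stated assumptions (feature shapes, cosine similarity, MSE consistency); it is not the exact IPixMatch loss.

```python
import torch
import torch.nn.functional as F

def inter_pixel_relation_loss(student_feats, teacher_feats):
    """Consistency on pairwise pixel similarities.

    student_feats, teacher_feats: (B, C, H, W) feature maps for the
    same unlabeled images (the teacher is often an EMA copy of the
    student). Downsample H, W first if the maps are large, since the
    similarity matrix is (H*W) x (H*W).
    """
    def pairwise_sim(f):
        b, c, h, w = f.shape
        f = f.flatten(2).transpose(1, 2)     # (B, H*W, C)
        f = F.normalize(f, dim=-1)
        return f @ f.transpose(1, 2)         # (B, H*W, H*W) cosine similarities

    s_rel = pairwise_sim(student_feats)
    with torch.no_grad():
        t_rel = pairwise_sim(teacher_feats)
    return F.mse_loss(s_rel, t_rel)
```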
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Hide and Seek: How Does Watermarking Impact Face Recognition? + + +
+ The recent progress in generative models has revolutionized the synthesis of +highly realistic images, including face images. This technological development +has undoubtedly helped face recognition, such as training data augmentation for +higher recognition accuracy and data privacy. However, it has also introduced +novel challenges concerning the responsible use and proper attribution of +computer generated images. We investigate the impact of digital watermarking, a +technique for embedding ownership signatures into images, on the effectiveness +of face recognition models. We propose a comprehensive pipeline that integrates +face image generation, watermarking, and face recognition to systematically +examine this question. The proposed watermarking scheme, based on an +encoder-decoder architecture, successfully embeds and recovers signatures from +both real and synthetic face images while preserving their visual fidelity. +Through extensive experiments, we unveil that while watermarking enables robust +image attribution, it results in a slight decline in face recognition accuracy, +particularly evident for face images with challenging poses and expressions. +Additionally, we find that directly training face recognition models on +watermarked images offers only a limited alleviation of this performance +decline. Our findings underscore the intricate trade off between watermarking +and face recognition accuracy. This work represents a pivotal step towards the +responsible utilization of generative models in face recognition and serves to +initiate discussions regarding the broader implications of watermarking in +biometrics. + +
+
+
+
+
+ + ☆ A Multilevel Strategy to Improve People Tracking in a Real-World + Scenario + + +
+ The Palácio do Planalto, office of the President of Brazil, was invaded by
+protesters on January 8, 2023. Surveillance videos taken from inside the
+building were subsequently released by the Brazilian Supreme Court for public
+scrutiny. We used segments of such footage to create the UFPR-Planalto801
+dataset for people tracking and re-identification in a real-world scenario.
+This dataset consists of more than 500,000 images. This paper presents a
+tracking approach targeting this dataset. The method proposed in this paper
+relies on the use of known state-of-the-art trackers combined in a multilevel
+hierarchy to correct the ID association over the trajectories. We evaluated our
+method using IDF1, MOTA, MOTP and HOTA metrics. The results show improvements
+for every tracker used in the experiments, with the IDF1 score increasing by a
+margin of up to 9.5%.
+
+
+ comment: Accepted for presentation at the International Conference on Computer + Vision Theory and Applications (VISAPP) 2024 +
+
+
+
+
+ + ☆ OpenStreetView-5M: The Many Roads to Global Visual Geolocation CVPR 2024 + + +
+ Determining the location of an image anywhere on Earth is a complex visual +task, which makes it particularly relevant for evaluating computer vision +algorithms. Yet, the absence of standard, large-scale, open-access datasets +with reliably localizable images has limited its potential. To address this +issue, we introduce OpenStreetView-5M, a large-scale, open-access dataset +comprising over 5.1 million geo-referenced street view images, covering 225 +countries and territories. In contrast to existing benchmarks, we enforce a +strict train/test separation, allowing us to evaluate the relevance of learned +geographical features beyond mere memorization. To demonstrate the utility of +our dataset, we conduct an extensive benchmark of various state-of-the-art +image encoders, spatial representations, and training strategies. All +associated codes and models can be found at https://github.com/gastruc/osv5m. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ A Survey on Vision Mamba: Models, Applications and Challenges + + +
+ Mamba, a recent selective structured state space model, performs excellently +on long sequence modeling tasks. Mamba mitigates the modeling constraints of +convolutional neural networks and offers advanced modeling capabilities similar +to those of Transformers, through global receptive fields and dynamic +weighting. Crucially, it achieves this without incurring the quadratic +computational complexity typically associated with Transformers. Due to its +advantages over the former two mainstream foundation models, Mamba exhibits +great potential to be a visual foundation model. Researchers are actively +applying Mamba to various computer vision tasks, leading to numerous emerging +works. To help keep pace with the rapid advancements in computer vision, this +paper aims to provide a comprehensive review of visual Mamba approaches. This +paper begins by delineating the formulation of the original Mamba model. +Subsequently, our review of visual Mamba delves into several representative +backbone networks to elucidate the core insights of the visual Mamba. We then +categorize related works using different modalities, including image, video, +point cloud, multi-modal, and others. Specifically, for image applications, we +further organize them into distinct tasks to facilitate a more structured +discussion. Finally, we discuss the challenges and future research directions +for visual Mamba, providing insights for future research in this quickly +evolving area. A comprehensive list of visual Mamba models reviewed in this +work is available at https://github.com/Ruixxxx/Awesome-Vision-Mamba-Models. + +
+
+
+
+
+ + ☆ MiPa: Mixed Patch Infrared-Visible Modality Agnostic Object Detection + + +
+ In this paper, we present a different way to use two modalities, in which
+either one modality or the other is seen by a single model. This can be useful
+when adapting a unimodal model to leverage more information while respecting a
+limited computational budget. This would mean having a single model that is
+able to deal with any modality. To describe this, we coined the term anymodal
+learning. An example of this is a use case where surveillance in a room when
+the lights are off would be much more valuable using an infrared modality,
+while a visible one would provide more discriminative information when the
+lights are on. This work investigates how to efficiently leverage visible and
+infrared/thermal modalities for a transformer-based object detection backbone
+to create an anymodal architecture. Our work does not create any inference
+overhead during testing while exploring an effective way to exploit the two
+modalities during training. To accomplish such a task, we introduce a novel
+anymodal training technique, Mixed Patches (MiPa), in conjunction with a
+patch-wise domain agnostic module, which is responsible for learning the best
+way to find a common representation of both modalities. This approach proves
+able to balance modalities by reaching competitive results on individual
+modality benchmarks, compared with the alternative of using a unimodal
+architecture, on three different visible-infrared object detection datasets.
+Finally, our proposed method, when used as a regularization for the strongest
+modality, can beat the performance of multimodal fusion methods while only
+requiring a single modality during inference. Notably, MiPa became the
+state-of-the-art on the LLVIP visible/infrared benchmark. Code:
+https://github.com/heitorrapela/MiPa
+
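The "Mixed Patches" idea can be pictured with a small PyTorch sketch that randomly draws each patch token from either the visible or the infrared view of the same scene before it enters the detection backbone. The mixing ratio, token shapes, and per-token Bernoulli mask are assumptions made for illustration, not the paper's training schedule.

```python
import torch

def mixed_patches(vis_tokens, ir_tokens, p_vis=0.5):
    """Randomly take each patch token from the visible or infrared view.

    vis_tokens, ir_tokens: (B, N, D) patch embeddings of the *same*
    scene from the two modalities; p_vis is an illustrative ratio.
    """
    assert vis_tokens.shape == ir_tokens.shape
    mask = torch.rand(vis_tokens.shape[:2], device=vis_tokens.device) < p_vis
    return torch.where(mask.unsqueeze(-1), vis_tokens, ir_tokens)
```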
+
+
+
+
+ + ☆ VISION: Toward a Standardized Process for Radiology Image Management at + the National Level + + +
+ The compilation and analysis of radiological images poses numerous challenges +for researchers. The sheer volume of data as well as the computational needs of +algorithms capable of operating on images are extensive. Additionally, the +assembly of these images alone is difficult, as these exams may differ widely +in terms of clinical context, structured annotation available for model +training, modality, and patient identifiers. In this paper, we describe our +experiences and challenges in establishing a trusted collection of radiology +images linked to the United States Department of Veterans Affairs (VA) +electronic health record database. We also discuss implications in making this +repository research-ready for medical investigators. Key insights include +uncovering the specific procedures required for transferring images from a +clinical to a research-ready environment, as well as roadblocks and bottlenecks +in this process that may hinder future efforts at automation. + +
+
+
+
+
+ + ☆ ConPro: Learning Severity Representation for Medical Images using + Contrastive Learning and Preference Optimization + + +
+ Understanding the severity of conditions shown in images in medical diagnosis
+is crucial, serving as a key guide for clinical assessment, treatment, as well
+as evaluating longitudinal progression. This paper proposes ConPrO: a novel
+representation learning method for severity assessment in medical images using
+Contrastive learning-integrated Preference Optimization. Different from
+conventional contrastive learning methods that maximize the distance between
+classes, ConPrO injects into the latent vector the distance preference
+knowledge between various severity classes and the normal class. We
+systematically examine the key components of our framework to illuminate how
+contrastive prediction tasks acquire valuable representations. We show that our
+representation learning framework offers valuable severity ordering in the
+feature space while outperforming previous state-of-the-art methods on
+classification tasks. We achieve a 6% and 20% relative improvement compared to
+a supervised and a self-supervised baseline, respectively. In addition, we
+provide a discussion of severity indicators and related applications of
+preference comparison in the medical domain.
+
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Harmonic Machine Learning Models are Robust + + +
+ We introduce Harmonic Robustness, a powerful and intuitive method to test the +robustness of any machine-learning model either during training or in black-box +real-time inference monitoring without ground-truth labels. It is based on +functional deviation from the harmonic mean value property, indicating +instability and lack of explainability. We show implementation examples in +low-dimensional trees and feedforward NNs, where the method reliably identifies +overfitting, as well as in more complex high-dimensional models such as +ResNet-50 and Vision Transformer where it efficiently measures adversarial +vulnerability across image classes. + +
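A hedged NumPy sketch of the harmonic mean value check the abstract describes: average the model over random points on a small sphere around the input and measure the deviation from the value at the centre. The sphere radius, sample count, and norm used here are arbitrary illustration choices, not the paper's exact procedure.

```python
import numpy as np

def anharmonicity(model, x, radius=0.1, n_samples=64, seed=0):
    """Deviation of `model` from the harmonic mean value property at x.

    model: callable mapping a batch (n, d) to outputs (n, k) or (n,).
    A harmonic function equals the average of its values over any
    sphere centred at x, so larger deviations indicate instability
    and, per the paper's argument, reduced robustness.
    """
    rng = np.random.default_rng(seed)
    d = x.shape[-1]
    dirs = rng.normal(size=(n_samples, d))
    dirs /= np.linalg.norm(dirs, axis=1, keepdims=True)
    sphere = x[None, :] + radius * dirs
    sphere_mean = np.mean(model(sphere), axis=0)
    return np.linalg.norm(model(x[None, :])[0] - sphere_mean)
```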
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ☆ Towards Extreme Image Compression with Latent Feature Guidance and + Diffusion Prior + + +
+ Compressing images at extremely low bitrates (below 0.1 bits per pixel (bpp)) +is a significant challenge due to substantial information loss. Existing +extreme image compression methods generally suffer from heavy compression +artifacts or low-fidelity reconstructions. To address this problem, we propose +a novel extreme image compression framework that combines compressive VAEs and +pre-trained text-to-image diffusion models in an end-to-end manner. +Specifically, we introduce a latent feature-guided compression module based on +compressive VAEs. This module compresses images and initially decodes the +compressed information into content variables. To enhance the alignment between +content variables and the diffusion space, we introduce external guidance to +modulate intermediate feature maps. Subsequently, we develop a conditional +diffusion decoding module that leverages pre-trained diffusion models to +further decode these content variables. To preserve the generative capability +of pre-trained diffusion models, we keep their parameters fixed and use a +control module to inject content information. We also design a space alignment +loss to provide sufficient constraints for the latent feature-guided +compression module. Extensive experiments demonstrate that our method +outperforms state-of-the-art approaches in terms of both visual performance and +image fidelity at extremely low bitrates. + +
+
+ comment: Submitted to IEEE TCSVT +
+
+
+
+
+ + ☆ A Partial Replication of MaskFormer in TensorFlow on TPUs for the + TensorFlow Model Garden + + +
+ This paper undertakes the task of replicating the MaskFormer model a +universal image segmentation model originally developed using the PyTorch +framework, within the TensorFlow ecosystem, specifically optimized for +execution on Tensor Processing Units (TPUs). Our implementation exploits the +modular constructs available within the TensorFlow Model Garden (TFMG), +encompassing elements such as the data loader, training orchestrator, and +various architectural components, tailored and adapted to meet the +specifications of the MaskFormer model. We address key challenges encountered +during the replication, non-convergence issues, slow training, adaptation of +loss functions, and the integration of TPU-specific functionalities. We verify +our reproduced implementation and present qualitative results on the COCO +dataset. Although our implementation meets some of the objectives for +end-to-end reproducibility, we encountered challenges in replicating the +PyTorch version of MaskFormer in TensorFlow. This replication process is not +straightforward and requires substantial engineering efforts. Specifically, it +necessitates the customization of various components within the TFMG, alongside +thorough verification and hyper-parameter tuning. The replication is available +at: +https://github.com/PurdueDualityLab/tf-maskformer/tree/main/official/projects/maskformer + +
+
+
+
+
+ + ☆ Saliency Suppressed, Semantics Surfaced: Visual Transformations in + Neural Networks and the Brain + + +
+ Deep learning algorithms lack human-interpretable accounts of how they +transform raw visual input into a robust semantic understanding, which impedes +comparisons between different architectures, training objectives, and the human +brain. In this work, we take inspiration from neuroscience and employ +representational approaches to shed light on how neural networks encode +information at low (visual saliency) and high (semantic similarity) levels of +abstraction. Moreover, we introduce a custom image dataset where we +systematically manipulate salient and semantic information. We find that +ResNets are more sensitive to saliency information than ViTs, when trained with +object classification objectives. We uncover that networks suppress saliency in +early layers, a process enhanced by natural language supervision (CLIP) in +ResNets. CLIP also enhances semantic encoding in both architectures. Finally, +we show that semantic encoding is a key factor in aligning AI with human visual +perception, while saliency suppression is a non-brain-like strategy. + +
+
+
+
+
+ + ☆ From Density to Geometry: YOLOv8 Instance Segmentation for Reverse + Engineering of Optimized Structures + + +
+ This paper introduces YOLOv8-TO, a novel approach for reverse engineering of +topology-optimized structures into interpretable geometric parameters using the +YOLOv8 instance segmentation model. Density-based topology optimization methods +require post-processing to convert the optimal density distribution into a +parametric representation for design exploration and integration with CAD +tools. Traditional methods such as skeletonization struggle with complex +geometries and require manual intervention. YOLOv8-TO addresses these +challenges by training a custom YOLOv8 model to automatically detect and +reconstruct structural components from binary density distributions. The model +is trained on a diverse dataset of both optimized and random structures +generated using the Moving Morphable Components method. A custom reconstruction +loss function based on the dice coefficient of the predicted geometry is used +to train the new regression head of the model via self-supervised learning. The +method is evaluated on test sets generated from different topology optimization +methods, including out-of-distribution samples, and compared against a +skeletonization approach. Results show that YOLOv8-TO significantly outperforms +skeletonization in reconstructing visually and structurally similar designs. +The method showcases an average improvement of 13.84% in the Dice coefficient, +with peak enhancements reaching 20.78%. The method demonstrates good +generalization to complex geometries and fast inference times, making it +suitable for integration into design workflows using regular workstations. +Limitations include the sensitivity to non-max suppression thresholds. +YOLOv8-TO represents a significant advancement in topology optimization +post-processing, enabling efficient and accurate reverse engineering of +optimized structures for design exploration and manufacturing. + +
+
+
+
+
+ + ☆ Flow AM: Generating Point Cloud Global Explanations by Latent Alignment + + +
+ Although point cloud models have gained significant improvements in
+prediction accuracy over recent years, their trustworthiness is still not
+sufficiently investigated. In terms of global explainability, Activation
+Maximization (AM) techniques in the image domain are not directly
+transplantable due to the special structure of the point cloud models. Existing
+studies exploit generative models to yield global explanations that can be
+perceived by humans. However, the opacity of the generative models themselves
+and the introduction of additional priors call into question the plausibility
+and fidelity of the explanations. In this work, we demonstrate that when the
+classifier predicts different types of instances, the intermediate layer
+activations are differently activated, known as activation flows. Based on this
+property, we propose an activation flow-based AM method that generates global
+explanations that can be perceived without incorporating any generative model.
+Furthermore, we reveal that AM based on generative models fails sanity checks
+and thus lacks fidelity. Extensive experiments show that our approach
+dramatically enhances the perceptibility of explanations compared to other AM
+methods that are not based on generative models. Our code is available at:
+https://github.com/Explain3D/FlowAM
+
+
+
+
+
+ + ☆ Transitive Vision-Language Prompt Learning for Domain Generalization + + +
+ Vision-language pre-training has enabled deep models to make a huge step
+forward in generalizing across unseen domains. Recent learning methods based
+on vision-language pre-training models are a great tool for domain
+generalization and can solve this problem to a large extent. However, such
+advances still suffer from a trade-off between domain invariance and class
+separability, both of which are crucial in current DG problems. In this paper,
+we introduce a novel prompt learning strategy that leverages deep vision
+prompts to address domain invariance while utilizing language prompts to ensure
+class separability, coupled with adaptive weighting mechanisms to balance
+domain invariance and class separability. Extensive experiments demonstrate
+that deep vision prompts effectively extract domain-invariant features,
+significantly improving the generalization ability of deep models and achieving
+state-of-the-art performance on three datasets.
+
+
+
+
+
+ + ☆ Survey on Datasets for Perception in Unstructured Outdoor Environments ICRA + + +
+ Perception is an essential component of pipelines in field robotics. In this +survey, we quantitatively compare publicly available datasets available in +unstructured outdoor environments. We focus on datasets for common perception +tasks in field robotics. Our survey categorizes and compares available research +datasets. This survey also reports on relevant dataset characteristics to help +practitioners determine which dataset fits best for their own application. We +believe more consideration should be taken in choosing compatible annotation +policies across the datasets in unstructured outdoor environments. + +
+
+ comment: Accepted to the IEEE ICRA Workshop on Field Robotics 2024 +
+
+
+
+
+ + ☆ Evaluating the Effectiveness of Video Anomaly Detection in the Wild: + Online Learning and Inference for Real-world Deployment + + +
+ Video Anomaly Detection (VAD) identifies unusual activities in video streams, +a key technology with broad applications ranging from surveillance to +healthcare. Tackling VAD in real-life settings poses significant challenges due +to the dynamic nature of human actions, environmental variations, and domain +shifts. Many research initiatives neglect these complexities, often +concentrating on traditional testing methods that fail to account for +performance on unseen datasets, creating a gap between theoretical models and +their real-world utility. Online learning is a potential strategy to mitigate +this issue by allowing models to adapt to new information continuously. This +paper assesses how well current VAD algorithms can adjust to real-life +conditions through an online learning framework, particularly those based on +pose analysis, for their efficiency and privacy advantages. Our proposed +framework enables continuous model updates with streaming data from novel +environments, thus mirroring actual world challenges and evaluating the models' +ability to adapt in real-time while maintaining accuracy. We investigate three +state-of-the-art models in this setting, focusing on their adaptability across +different domains. Our findings indicate that, even under the most challenging +conditions, our online learning approach allows a model to preserve 89.39% of +its original effectiveness compared to its offline-trained counterpart in a +specific target domain. + +
+
+
+
+
+ + ☆ Enhancing Interactive Image Retrieval With Query Rewriting Using Large + Language Models and Vision Language Models + + +
+ Image search stands as a pivotal task in multimedia and computer vision,
+finding applications across diverse domains, ranging from internet search to
+medical diagnostics. Conventional image search systems operate by accepting
+textual or visual queries, retrieving the top-relevant candidate results from
+the database. However, prevalent methods often rely on single-turn procedures,
+introducing potential inaccuracies and limited recall. These methods also face
+challenges such as vocabulary mismatch and the semantic gap, constraining
+their overall effectiveness. To address these issues, we propose an interactive
+image retrieval system capable of refining queries based on user relevance
+feedback in a multi-turn setting. This system incorporates a vision language
+model (VLM) based image captioner to enhance the quality of text-based queries,
+resulting in more informative queries with each iteration. Moreover, we
+introduce a large language model (LLM) based denoiser to refine text-based
+query expansions, mitigating inaccuracies in image descriptions generated by
+captioning models. To evaluate our system, we curate a new dataset by adapting
+the MSR-VTT video retrieval dataset to the image retrieval task, offering
+multiple relevant ground truth images for each query. Through comprehensive
+experiments, we validate the effectiveness of our proposed system against
+baseline methods, achieving state-of-the-art performance with a notable 10%
+improvement in terms of recall. Our contributions encompass the development of
+an innovative interactive image retrieval system, the integration of an
+LLM-based denoiser, the curation of a meticulously designed evaluation dataset,
+and thorough experimental validation.
+
+
+
+
+
+ + ☆ Real Time Multi Organ Classification on Computed Tomography Images + + +
+ Organ segmentation is a fundamental task in medical imaging, and it is useful +for many clinical automation pipelines. Typically, the process involves +segmenting the entire volume, which can be unnecessary when the points of +interest are limited. In those cases, a classifier could be used instead of +segmentation. However, there is an inherent trade-off between the context size +and the speed of classifiers. To address this issue, we propose a new method +that employs a data selection strategy with sparse sampling across a wide field +of view without image resampling. This sparse sampling strategy makes it +possible to classify voxels into multiple organs in real time without using +accelerators. Although our method is an independent classifier, it can generate +full segmentation by querying grid locations at any resolution. We have +compared our method with existing segmentation techniques, demonstrating its +potential for superior runtime in practical applications in medical imaging. + +
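A hedged sketch of what a sparse, wide-field-of-view sampling strategy could look like for per-voxel classification: gather a handful of intensities at geometrically growing offsets around the query voxel, without resampling the volume, and feed them to any lightweight classifier. The offset pattern and spacing below are invented for illustration and are not the paper's sampling scheme.

```python
import numpy as np

def sparse_context_features(volume, point, n_rings=4, base_spacing=2):
    """Gather intensities at sparse offsets around a query voxel.

    volume: 3D numpy array (e.g. a CT volume), point: (z, y, x) index.
    Offsets grow geometrically so a small number of samples covers a
    wide field of view without image resampling.
    """
    z, y, x = point
    feats = [volume[z, y, x]]
    for ring in range(1, n_rings + 1):
        r = base_spacing * (2 ** ring)
        for dz, dy, dx in [(r, 0, 0), (-r, 0, 0), (0, r, 0),
                           (0, -r, 0), (0, 0, r), (0, 0, -r)]:
            zi = np.clip(z + dz, 0, volume.shape[0] - 1)
            yi = np.clip(y + dy, 0, volume.shape[1] - 1)
            xi = np.clip(x + dx, 0, volume.shape[2] - 1)
            feats.append(volume[zi, yi, xi])
    return np.asarray(feats, dtype=np.float32)  # input to a lightweight classifier
```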
+
+
+
+
+ + ☆ Improving Automatic Text Recognition with Language Models in the PyLaia + Open-Source Library + + +
+ PyLaia is one of the most popular open-source software packages for Automatic
+Text Recognition (ATR), delivering strong performance in terms of speed and
+accuracy. In this paper, we outline our recent contributions to the PyLaia
+library, focusing on the incorporation of reliable confidence scores and the
+integration of statistical language modeling during decoding. Our
+implementation provides an easy way to combine PyLaia with n-gram language
+models at different levels. One of the highlights of this work is that language
+models are completely auto-tuned: they can be built and used easily without any
+expert knowledge, and without requiring any additional data. To demonstrate the
+significance of our contribution, we evaluate PyLaia's performance on twelve
+datasets, both with and without language modelling. The results show that
+decoding with small language models improves the Word Error Rate by 13% and the
+Character Error Rate by 12% on average. Additionally, we conduct an analysis of
+confidence scores and highlight the importance of calibration techniques. Our
+implementation is publicly available in the official PyLaia repository at
+https://gitlab.teklia.com/atr/pylaia, and twelve open-source models are
+released on Hugging Face.
+
+
+
+
+
+ + ☆ The Socface Project: Large-Scale Collection, Processing, and Analysis of + a Century of French Censuses + + +
+ This paper presents a complete processing workflow for extracting information +from French census lists from 1836 to 1936. These lists contain information +about individuals living in France and their households. We aim at extracting +all the information contained in these tables using automatic handwritten table +recognition. At the end of the Socface project, in which our work is taking +place, the extracted information will be redistributed to the departmental +archives, and the nominative lists will be freely available to the public, +allowing anyone to browse hundreds of millions of records. The extracted data +will be used by demographers to analyze social change over time, significantly +improving our understanding of French economic and social structures. For this +project, we developed a complete processing workflow: large-scale data +collection from French departmental archives, collaborative annotation of +documents, training of handwritten table text and structure recognition models, +and mass processing of millions of images. We present the tools we have +developed to easily collect and process millions of pages. We also show that it +is possible to process such a wide variety of tables with a single table +recognition model that uses the image of the entire page to recognize +information about individuals, categorize them and automatically group them +into households. The entire process has been successfully used to process the +documents of a departmental archive, representing more than 450,000 images. + +
+
+
+
+
+ + ☆ Convergence Properties of Score-Based Models using Graduated + Optimisation for Linear Inverse Problems + + +
+ The incorporation of generative models as regularisers within variational
+formulations for inverse problems has proven effective across numerous image
+reconstruction tasks. However, the resulting optimisation problem is often
+non-convex and challenging to solve. In this work, we show that score-based
+generative models (SGMs) can be used in a graduated optimisation framework to
+solve inverse problems. We show that the resulting graduated non-convexity flow
+converges to stationary points of the original problem and provide a numerical
+convergence analysis on a 2D toy example. We further provide experiments on
+computed tomography image reconstruction, where we show that this framework is
+able to recover high-quality images, independent of the initial value. The
+experiments highlight the potential of using SGMs in graduated optimisation
+frameworks.
+
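As background, a minimal sketch of the generic graduated optimisation loop the paper builds on: solve a sequence of increasingly less-smoothed problems, warm-starting each from the previous solution, so the final iterate approximates a stationary point of the original non-convex objective. The smoothing schedule and the plain gradient inner solver are placeholders, not the SGM-based regulariser studied in the paper.

```python
import numpy as np

def graduated_optimisation(grad_smoothed, x0,
                           sigmas=(10.0, 3.0, 1.0, 0.3, 0.1),
                           n_inner=100, lr=1e-2):
    """Generic graduated non-convexity loop (illustrative).

    grad_smoothed(x, sigma) returns the gradient of the objective
    smoothed at level sigma (large sigma = nearly convex surrogate).
    Each stage is warm-started from the previous one; the last stage
    approximates the original (sigma -> 0) problem.
    """
    x = np.array(x0, dtype=float)
    for sigma in sigmas:                 # coarse-to-fine schedule
        for _ in range(n_inner):
            x -= lr * grad_smoothed(x, sigma)
    return x
```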
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Dual-Modal Prompting for Sketch-Based Image Retrieval + + +
+ Sketch-based image retrieval (SBIR) associates hand-drawn sketches with their +corresponding realistic images. In this study, we aim to tackle two major +challenges of this task simultaneously: i) zero-shot, dealing with unseen +categories, and ii) fine-grained, referring to intra-category instance-level +retrieval. Our key innovation lies in the realization that solely addressing +this cross-category and fine-grained recognition task from the generalization +perspective may be inadequate since the knowledge accumulated from limited seen +categories might not be fully valuable or transferable to unseen target +categories. Inspired by this, in this work, we propose a dual-modal prompting +CLIP (DP-CLIP) network, in which an adaptive prompting strategy is designed. +Specifically, to facilitate the adaptation of our DP-CLIP toward unpredictable +target categories, we employ a set of images within the target category and the +textual category label to respectively construct a set of category-adaptive +prompt tokens and channel scales. By integrating the generated guidance, +DP-CLIP could gain valuable category-centric insights, efficiently adapting to +novel categories and capturing unique discriminative clues for effective +retrieval within each target category. With these designs, our DP-CLIP +outperforms the state-of-the-art fine-grained zero-shot SBIR method by 7.3% in +Acc.@1 on the Sketchy dataset. Meanwhile, in the other two category-level +zero-shot SBIR benchmarks, our method also achieves promising performance. + +
+
+
+
+
+ + ☆ Bootstrap 3D Reconstructed Scenes from 3D Gaussian Splatting + + +
+ Recent developments in neural rendering techniques have greatly enhanced the +rendering of photo-realistic 3D scenes across both academic and commercial +fields. The latest method, known as 3D Gaussian Splatting (3D-GS), has set new +benchmarks for rendering quality and speed. Nevertheless, the limitations of +3D-GS become pronounced in synthesizing new viewpoints, especially for views +that greatly deviate from those seen during training. Additionally, issues such +as dilation and aliasing arise when zooming in or out. These challenges can all +be traced back to a single underlying issue: insufficient sampling. In our +paper, we present a bootstrapping method that significantly addresses this +problem. This approach employs a diffusion model to enhance the rendering of +novel views using trained 3D-GS, thereby streamlining the training process. Our +results indicate that bootstrapping effectively reduces artifacts and yields +clear improvements in the evaluation metrics. Furthermore, we show that our +method is versatile and can be easily integrated, allowing various 3D +reconstruction projects to benefit from our approach. + +
+
+
+
+
+ + ☆ Leveraging PointNet and PointNet++ for Lyft Point Cloud Classification + Challenge + + +
+ This study investigates the application of PointNet and PointNet++ in the +classification of LiDAR-generated point cloud data, a critical component for +achieving fully autonomous vehicles. Utilizing a modified dataset from the Lyft +3D Object Detection Challenge, we examine the models' capabilities to handle +dynamic and complex environments essential for autonomous navigation. Our +analysis shows that PointNet and PointNet++ achieved accuracy rates of 79.53% +and 84.24%, respectively. These results underscore the models' robustness in +interpreting intricate environmental data, which is pivotal for the safety and +efficiency of autonomous vehicles. Moreover, the enhanced detection accuracy, +particularly in distinguishing pedestrians from other objects, highlights the +potential of these models to contribute substantially to the advancement of +autonomous vehicle technology. + +
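+ For readers unfamiliar with the architecture family evaluated above, the
+following PyTorch sketch shows a minimal PointNet-style classifier (a shared
+per-point MLP followed by symmetric max pooling); layer sizes and the number of
+classes are placeholders, not the configuration used in the study.
+```python
+import torch
+import torch.nn as nn
+
+class TinyPointNet(nn.Module):
+    def __init__(self, num_classes=9):
+        super().__init__()
+        # Shared MLP applied to every point independently (Conv1d over points).
+        self.features = nn.Sequential(
+            nn.Conv1d(3, 64, 1), nn.BatchNorm1d(64), nn.ReLU(),
+            nn.Conv1d(64, 128, 1), nn.BatchNorm1d(128), nn.ReLU(),
+            nn.Conv1d(128, 1024, 1), nn.BatchNorm1d(1024), nn.ReLU(),
+        )
+        self.classifier = nn.Sequential(
+            nn.Linear(1024, 256), nn.ReLU(), nn.Dropout(0.3),
+            nn.Linear(256, num_classes),
+        )
+
+    def forward(self, xyz):                           # xyz: (batch, num_points, 3)
+        feats = self.features(xyz.transpose(1, 2))    # (batch, 1024, num_points)
+        global_feat = feats.max(dim=2).values         # order-invariant max pooling
+        return self.classifier(global_feat)           # per-cloud class logits
+
+logits = TinyPointNet()(torch.randn(2, 1024, 3))      # e.g. two clouds of 1024 points
+```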
+
+
+
+
+ + ☆ Reading Order Independent Metrics for Information Extraction in + Handwritten Documents + + +
+ Information Extraction processes in handwritten documents tend to rely on +obtaining an automatic transcription and performing Named Entity Recognition +(NER) over such transcription. For this reason, in publicly available datasets, +the performance of the systems is usually evaluated with metrics particular to +each dataset. Moreover, most of the metrics employed are sensitive to reading +order errors. Therefore, they do not reflect the expected final application of +the system and introduce biases in more complex documents. In this paper, we +propose and publicly release a set of reading order independent metrics +tailored to Information Extraction evaluation in handwritten documents. In our +experimentation, we perform an in-depth analysis of the behavior of the metrics +to recommend what we consider to be the minimal set of metrics to evaluate a +task correctly. + +
+
+
+
+
+ + ☆ Terrain characterisation for online adaptability of automated sonar + processing: Lessons learnt from operationally applying ATR to sidescan sonar + in MCM applications + + +
+ The performance of Automated Target Recognition (ATR) algorithms on side-scan sonar +imagery has been shown to degrade rapidly when deployed in non-benign environments. +Complex seafloors and acoustic artefacts constitute distractors in the form of +strong textural patterns, creating false detections or preventing detections of +true objects. This paper presents two online seafloor characterisation +techniques to improve explainability during Autonomous Underwater Vehicle +(AUV) missions. Importantly, and as opposed to previous work in the domain, +these techniques are not based on a model and require limited input from human +operators, making them suitable for real-time onboard processing. Both techniques +rely on an unsupervised machine learning approach to extract terrain features +which relate to the human understanding of terrain complexity. The first +technique provides a quantitative, application-driven terrain characterisation +metric based on the performance of an ATR algorithm. The second method provides +a way to incorporate subject matter expertise and enables contextualisation and +explainability in support of scenario-dependent subjective terrain +characterisation. The terrain complexity matches the expectation of seasoned +users, making this tool desirable and trustworthy in comparison to traditional +unsupervised approaches. We finally detail an application of these techniques +to repair a Mine Countermeasures (MCM) mission carried out with the SeeByte autonomy +framework Neptune. + +
+
+ comment: Presented at UACE (Underwater Acoustics Conference & Exhibition) + 2023, Kalamata, Greece +
+
+
+
+
+ + ☆ Towards Quantitative Evaluation of Explainable AI Methods for Deepfake + Detection ICMR'24 + + +
+ In this paper we propose a new framework for evaluating the performance of +explanation methods on the decisions of a deepfake detector. This framework +assesses the ability of an explanation method to spot the regions of a fake +image with the biggest influence on the decision of the deepfake detector, by +examining the extent to which these regions can be modified through a set of +adversarial attacks, in order to flip the detector's prediction or reduce its +initial prediction; we anticipate a larger drop in deepfake detection accuracy +and prediction, for methods that spot these regions more accurately. Based on +this framework, we conduct a comparative study using a state-of-the-art model +for deepfake detection that has been trained on the FaceForensics++ dataset, +and five explanation methods from the literature. The findings of our +quantitative and qualitative evaluations document the advanced performance of +the LIME explanation method against the other compared ones, and indicate this +method as the most appropriate for explaining the decisions of the utilized +deepfake detector. + +
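+ A minimal sketch of the evaluation idea described above, assuming a generic
+detector and explanation heatmap: perturb only the top-ranked regions of the
+explanation and measure the drop in the fake score. The perturbation here is
+plain noise rather than the adversarial attacks used in the paper.
+```python
+import torch
+
+def masked_perturbation_drop(detector, image, saliency, top_frac=0.1, eps=0.1):
+    """image: (1, 3, H, W); saliency: (H, W) explanation heatmap for the 'fake' score."""
+    k = int(top_frac * saliency.numel())
+    thresh = saliency.flatten().topk(k).values.min()
+    mask = (saliency >= thresh).float()                           # most influential regions
+    perturbed = (image + eps * torch.randn_like(image) * mask).clamp(0, 1)
+    with torch.no_grad():
+        return detector(image).item() - detector(perturbed).item()  # drop in fake score
+
+detector = lambda x: torch.sigmoid(x.mean(dim=(1, 2, 3)))          # stand-in deepfake detector
+drop = masked_perturbation_drop(detector, torch.rand(1, 3, 64, 64), torch.rand(64, 64))
+```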
+
+ comment: Accepted for publication, 3rd ACM Int. Workshop on Multimedia AI + against Disinformation (MAD'24) at ACM ICMR'24, June 10, 2024, Phuket, + Thailand. This is the "accepted version" +
+
+
+
+
+ + ☆ Uncertainty-boosted Robust Video Activity Anticipation + + +
+ Video activity anticipation aims to predict what will happen in the future, +with broad application prospects ranging from robot vision to autonomous +driving. Despite the recent progress, the data uncertainty issue, reflected as +the content evolution process and dynamic correlation in event labels, has been +largely ignored. This reduces the model's generalization ability and deep +understanding of video content, leading to serious error accumulation and +degraded performance. In this paper, we address the uncertainty learning +problem and propose an uncertainty-boosted robust video activity anticipation +framework, which generates uncertainty values to indicate the credibility of +the anticipation results. The uncertainty value is used to derive a temperature +parameter in the softmax function to modulate the predicted target activity +distribution. To guarantee the distribution adjustment, we construct a +reasonable target activity label representation by incorporating the activity +evolution from the temporal class correlation and the semantic relationship. +Moreover, we quantify the uncertainty into relative values by comparing the +uncertainty among sample pairs and their temporal lengths. This relative +strategy provides a more accessible way of modeling uncertainty than +quantifying the absolute uncertainty values on the whole dataset. Experiments +on multiple backbones and benchmarks show our framework achieves promising +performance and better robustness/interpretability. Source codes are available +at https://github.com/qzhb/UbRV2A. + +
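+ The temperature modulation mentioned above can be illustrated with a few lines
+of PyTorch; the mapping from the predicted uncertainty to a temperature is an
+illustrative choice, not the paper's exact formulation.
+```python
+import torch
+import torch.nn.functional as F
+
+def uncertainty_modulated_probs(logits, uncertainty):
+    """logits: (batch, num_classes); uncertainty: (batch, 1), larger = less confident."""
+    temperature = 1.0 + F.softplus(uncertainty)      # temperature grows with uncertainty
+    return F.softmax(logits / temperature, dim=-1)   # flatter distribution when uncertain
+
+probs = uncertainty_modulated_probs(torch.randn(4, 10), torch.randn(4, 1))
+```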
+
+ comment: Accepted by T-PAMI +
+
+
+
+
+ + ☆ 4D-DRESS: A 4D Dataset of Real-world Human Clothing with Semantic + Annotations CVPR 2024 + + +
+ The studies of human clothing for digital avatars have predominantly relied +on synthetic datasets. While easy to collect, synthetic data often fall short +in realism and fail to capture authentic clothing dynamics. Addressing this +gap, we introduce 4D-DRESS, the first real-world 4D dataset advancing human +clothing research with its high-quality 4D textured scans and garment meshes. +4D-DRESS captures 64 outfits in 520 human motion sequences, amounting to 78k +textured scans. Creating a real-world clothing dataset is challenging, +particularly in annotating and segmenting the extensive and complex 4D human +scans. To address this, we develop a semi-automatic 4D human parsing pipeline. +We efficiently combine a human-in-the-loop process with automation to +accurately label 4D scans in diverse garments and body movements. Leveraging +precise annotations and high-quality garment meshes, we establish several +benchmarks for clothing simulation and reconstruction. 4D-DRESS offers +realistic and challenging data that complements synthetic sources, paving the +way for advancements in research of lifelike human clothing. Website: +https://ait.ethz.ch/4d-dress. + +
+
+ comment: CVPR 2024 paper, 21 figures, 9 tables +
+
+
+
+
+ + ☆ Self-Avatar Animation in Virtual Reality: Impact of Motion Signals + Artifacts on the Full-Body Pose Reconstruction + + +
+ Virtual Reality (VR) applications have revolutionized user experiences by +immersing individuals in interactive 3D environments. These environments find +applications in numerous fields, including healthcare, education, and +architecture. A significant aspect of VR is the inclusion of self-avatars, +representing users within the virtual world, which enhances interaction and +embodiment. However, generating lifelike full-body self-avatar animations +remains challenging, particularly in consumer-grade VR systems, where +lower-body tracking is often absent. One method to tackle this problem is by +providing an external source of motion information that includes lower body +information such as full Cartesian positions estimated from RGB(D) cameras. +Nevertheless, the limitations of these systems are multiple: the +desynchronization between the two motion sources and occlusions are examples of +significant issues that hinder the implementation of such systems. In this +paper, we measure the impact of (1) the latency between the VR motion features +and estimated positions, (2) the data acquisition rate, (3) occlusions, and (4) +the inaccuracy of the position estimation algorithm on the reconstruction of +the articulated self-avatar's full-body pose. In addition, we analyze +the motion reconstruction errors using ground truth and 3D Cartesian +coordinates estimated from YOLOv8 pose estimation. These analyses show +that the studied methods are significantly sensitive to any degradation tested, +especially regarding the velocity reconstruction error. + +
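+ A rough sketch of the kind of controlled degradations studied above (latency,
+reduced acquisition rate, occlusions, estimation noise) applied to a stream of
+3D joint positions, together with a simple velocity-error metric; all values
+and the degradation model itself are illustrative.
+```python
+import numpy as np
+
+def degrade(positions, latency=2, keep_every=2, occlusion_prob=0.1, noise_std=0.01, seed=0):
+    """positions: (frames, joints, 3) ground-truth joint stream."""
+    rng = np.random.default_rng(seed)
+    est = np.roll(positions, latency, axis=0)               # latency between motion sources
+    est = est + rng.normal(0, noise_std, positions.shape)   # estimation inaccuracy
+    occluded = rng.random(positions.shape[:2]) < occlusion_prob
+    est[occluded] = np.nan                                  # occluded joints are missing
+    return est[::keep_every]                                # reduced acquisition rate
+
+def velocity_error(gt, est):
+    v_gt, v_est = np.diff(gt, axis=0), np.diff(est, axis=0)
+    return np.nanmean(np.linalg.norm(v_gt - v_est, axis=-1))
+
+gt = np.cumsum(np.random.default_rng(1).normal(0, 0.01, (100, 17, 3)), axis=0)
+deg = degrade(gt)
+print(velocity_error(gt[::2], deg))
+```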
+
+ comment: 8 pages, 5 figures and 1 table +
+
+
+
+
+ + ☆ Do Vision & Language Decoders use Images and Text equally? How + Self-consistent are their Explanations? + + +
+ Vision and language models (VLMs) are currently the most generally performant +architectures on multimodal tasks. Next to their predictions, they can also +produce explanations, either in post-hoc or CoT settings. However, it is not +clear how much they use the vision and text modalities when generating +predictions or explanations. In this work, we investigate if VLMs rely on +modalities differently when generating explanations as opposed to when they +provide answers. We also evaluate the self-consistency of VLM decoders in both +post-hoc and CoT explanation settings, by extending existing tests and measures +to VLM decoders. We find that VLMs are less self-consistent than LLMs. The text +contributions in VL decoders are much larger than the image contributions +across all measured tasks. And the contributions of the image are significantly +larger for explanation generations than for answer generation. This difference +is even larger in CoT compared to the post-hoc explanation setting. We also +provide an up-to-date benchmarking of state-of-the-art VL decoders on the VALSE +benchmark, which to date focused only on VL encoders. We find that VL decoders +are still struggling with most phenomena tested by VALSE. + +
+
+ comment: 27 pages, from which 12 pages contain the text of the main paper. 8 + figures, 11 tables +
+
+
+
+
+ + ☆ FlexiFilm: Long Video Generation with Flexible Conditions + + +
+ Generating long and consistent videos has emerged as a significant yet +challenging problem. While most existing diffusion-based video generation +models, derived from image generation models, demonstrate promising performance +in generating short videos, their simple conditioning mechanism and sampling +strategy-originally designed for image generation-cause severe performance +degradation when adapted to long video generation. This results in prominent +temporal inconsistency and overexposure. Thus, in this work, we introduce +FlexiFilm, a new diffusion model tailored for long video generation. Our +framework incorporates a temporal conditioner to establish a more consistent +relationship between generation and multi-modal conditions, and a resampling +strategy to tackle overexposure. Empirical results demonstrate FlexiFilm +generates long and consistent videos, each over 30 seconds in length, +outperforming competitors in qualitative and quantitative analyses. Project +page: https://y-ichen.github.io/FlexiFilm-Page/ + +
+
+ comment: 9 pages, 9 figures +
+
+
+
+
+ + ☆ CoSense3D: an Agent-based Efficient Learning Framework for Collective + Perception + + +
+ Collective Perception has attracted significant attention in recent years due +to its advantage for mitigating occlusion and expanding the field-of-view, +thereby enhancing reliability, efficiency, and, most crucially, decision-making +safety. However, developing collective perception models is highly resource +demanding due to extensive requirements of processing input data for many +agents, usually dozens of images and point clouds for a single frame. This not +only slows down the model development process for collective perception but +also impedes the utilization of larger models. In this paper, we propose an +agent-based training framework that handles the deep learning modules and agent +data separately to have a cleaner data flow structure. This framework not only +provides an API for flexibly prototyping the data processing pipeline and +defining the gradient calculation for each agent, but also provides the user +interface for interactive training, testing and data visualization. Training +experiment results of four collective object detection models on the prominent +collective perception benchmark OPV2V show that the agent-based training can +significantly reduce the GPU memory consumption and training time while +retaining inference performance. The framework and model implementations are +available at \url{https://github.com/YuanYunshuang/CoSense3D} + +
+
+
+
+
+ + ☆ CSTalk: Correlation Supervised Speech-driven 3D Emotional Facial + Animation Generation + + +
+ Speech-driven 3D facial animation technology has been developed for years, +but its practical application still falls short of expectations. The main challenges lie +in data limitations, lip alignment, and the naturalness of facial expressions. +Although lip alignment has seen many related studies, existing methods struggle +to synthesize natural and realistic expressions, resulting in a mechanical and +stiff appearance of facial animations. Even with some research extracting +emotional features from speech, the randomness of facial movements limits the +effective expression of emotions. To address this issue, this paper proposes a +method called CSTalk (Correlation Supervised) that models the correlations +among different regions of facial movements and supervises the training of the +generative model to generate realistic expressions that conform to human facial +motion patterns. To generate more intricate animations, we employ a rich set of +control parameters based on the metahuman character model and capture a dataset +for five different emotions. We train a generative network using an autoencoder +structure and input an emotion embedding vector to achieve the generation of +user-controlled expressions. Experimental results demonstrate that our method +outperforms existing state-of-the-art methods. + +
+
+
+
+
+ + ☆ Self-supervised learning for classifying paranasal anomalies in the + maxillary sinus + + +
+ Purpose: Paranasal anomalies, frequently identified in routine radiological +screenings, exhibit diverse morphological characteristics. Due to the diversity +of anomalies, supervised learning methods require a large labelled dataset +exhibiting diverse anomaly morphology. Self-supervised learning (SSL) can be +used to learn representations from unlabelled data. However, there are no SSL +methods designed for the downstream task of classifying paranasal anomalies in +the maxillary sinus (MS). + Methods: Our approach uses a 3D Convolutional Autoencoder (CAE) trained in an +unsupervised anomaly detection (UAD) framework. Initially, we train the 3D CAE +to reduce reconstruction errors when reconstructing normal maxillary sinus (MS) +images. Then, this CAE is applied to an unlabelled dataset to generate coarse +anomaly locations by creating residual MS images. Following this, a 3D +Convolutional Neural Network (CNN) reconstructs these residual images, which +forms our SSL task. Lastly, we fine-tune the encoder part of the 3D CNN on a +labelled dataset of normal and anomalous MS images. + Results: The proposed SSL technique exhibits superior performance compared to +existing generic self-supervised methods, especially in scenarios with limited +annotated data. When trained on just 10% of the annotated dataset, our method +achieves an Area Under the Precision-Recall Curve (AUPRC) of 0.79 for the +downstream classification task. This performance surpasses other methods, with +BYOL attaining an AUPRC of 0.75, SimSiam at 0.74, SimCLR at 0.73 and Masked +Autoencoding using SparK at 0.75. + Conclusion: A self-supervised learning approach that inherently focuses on +localizing paranasal anomalies proves to be advantageous, particularly when the +subsequent task involves differentiating normal from anomalous maxillary +sinuses. Access our code at +https://github.com/mtec-tuhh/self-supervised-paranasal-anomaly + +
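+ A toy sketch of the residual-image pretext task described in the Methods
+section above: an autoencoder trained on normal scans yields reconstructions,
+and the residuals become regression targets for a second 3D network whose
+encoder is later fine-tuned. Both networks here are small stand-ins.
+```python
+import torch
+import torch.nn as nn
+
+cae = nn.Sequential(nn.Conv3d(1, 8, 3, padding=1), nn.ReLU(),
+                    nn.Conv3d(8, 1, 3, padding=1))            # placeholder 3D CAE
+ssl_net = nn.Sequential(nn.Conv3d(1, 8, 3, padding=1), nn.ReLU(),
+                        nn.Conv3d(8, 1, 3, padding=1))        # encoder later fine-tuned
+
+volumes = torch.randn(2, 1, 16, 32, 32)                       # unlabelled MS volumes
+with torch.no_grad():
+    residual = (volumes - cae(volumes)).abs()                 # coarse anomaly map
+pretext_loss = nn.functional.mse_loss(ssl_net(volumes), residual)
+pretext_loss.backward()
+```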
+
+
+
+
+ + ☆ Anywhere: A Multi-Agent Framework for Reliable and Diverse + Foreground-Conditioned Image Inpainting + + +
+ Recent advancements in image inpainting, particularly through diffusion +modeling, have yielded promising outcomes. However, when tested in scenarios +involving the completion of images based on the foreground objects, current +methods that aim to inpaint an image in an end-to-end manner encounter +challenges such as "over-imagination", inconsistency between foreground and +background, and limited diversity. In response, we introduce Anywhere, a +pioneering multi-agent framework designed to address these issues. Anywhere +utilizes a sophisticated pipeline framework comprising various agents such as +Visual Language Model (VLM), Large Language Model (LLM), and image generation +models. This framework consists of three principal components: the prompt +generation module, the image generation module, and the outcome analyzer. The +prompt generation module conducts a semantic analysis of the input foreground +image, leveraging VLM to predict relevant language descriptions and LLM to +recommend optimal language prompts. In the image generation module, we employ a +text-guided canny-to-image generation model to create a template image based on +the edge map of the foreground image and language prompts, and an image refiner +to produce the outcome by blending the input foreground and the template image. +The outcome analyzer employs VLM to evaluate image content rationality, +aesthetic score, and foreground-background relevance, triggering prompt and +image regeneration as needed. Extensive experiments demonstrate that our +Anywhere framework excels in foreground-conditioned image inpainting, +mitigating "over-imagination", resolving foreground-background discrepancies, +and enhancing diversity. It successfully elevates foreground-conditioned image +inpainting to produce more reliable and diverse results. + +
+
+ comment: 16 pages, 9 figures, project page: + https://anywheremultiagent.github.io +
+
+
+
+
+ + ☆ Context Matters: Leveraging Spatiotemporal Metadata for Semi-Supervised + Learning on Remote Sensing Images + + +
+ Remote sensing projects typically generate large amounts of imagery that can +be used to train powerful deep neural networks. However, the number of labeled +images is often small, as remote sensing applications generally require expert +labelers. Thus, semi-supervised learning (SSL), i.e., learning with a small +pool of labeled and a larger pool of unlabeled data, is particularly useful in +this domain. Current SSL approaches generate pseudo-labels from model +predictions for unlabeled samples. As the quality of these pseudo-labels is +crucial for performance, utilizing additional information to improve +pseudo-label quality yields a promising direction. For remote sensing images, +geolocation and recording time are generally available and provide a valuable +source of information as semantic concepts, such as land cover, are highly +dependent on spatiotemporal context, e.g., due to seasonal effects and +vegetation zones. In this paper, we propose to exploit spatiotemporal +metainformation in SSL to improve the quality of pseudo-labels and, therefore, +the final model performance. We show that directly adding the available +metadata to the input of the predictor at test time degrades the prediction +quality for metadata outside the spatiotemporal distribution of the training +set. Thus, we propose a teacher-student SSL framework where only the teacher +network uses metainformation to improve the quality of pseudo-labels on the +training set. Correspondingly, our student network benefits from the improved +pseudo-labels but does not receive metadata as input, making it invariant to +spatiotemporal shifts at test time. Furthermore, we propose methods for +encoding and injecting spatiotemporal information into the model and introduce +a novel distillation mechanism to enhance the knowledge transfer between +teacher and student. Our framework dubbed Spatiotemporal SSL can be easily +combined with several stat... + +
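+ A minimal sketch of the teacher-student idea described above: only the teacher
+sees spatiotemporal metadata when producing pseudo-labels, while the student is
+trained on images alone. The encoders and the metadata embedding are toy
+placeholders, not the paper's architecture.
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Teacher(nn.Module):
+    def __init__(self, num_classes=10):
+        super().__init__()
+        self.backbone = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                                      nn.AdaptiveAvgPool2d(1), nn.Flatten())
+        self.meta_mlp = nn.Sequential(nn.Linear(3, 16), nn.ReLU())   # e.g. lat, lon, day-of-year
+        self.head = nn.Linear(32, num_classes)
+
+    def forward(self, img, meta):
+        return self.head(torch.cat([self.backbone(img), self.meta_mlp(meta)], dim=1))
+
+student = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                        nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10))
+
+imgs, meta = torch.randn(4, 3, 64, 64), torch.randn(4, 3)
+teacher = Teacher()
+with torch.no_grad():
+    pseudo = teacher(imgs, meta).argmax(dim=1)        # metadata-informed pseudo-labels
+loss = F.cross_entropy(student(imgs), pseudo)         # student never sees metadata
+loss.backward()
+```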
+
+
+
+
+ + ☆ SIDBench: A Python Framework for Reliably Assessing Synthetic Image + Detection Methods + + +
+ The generative AI technology offers an increasing variety of tools for +generating entirely synthetic images that are increasingly indistinguishable +from real ones. Unlike methods that alter portions of an image, the creation of +completely synthetic images presents a unique challenge and several Synthetic +Image Detection (SID) methods have recently appeared to tackle it. Yet, there +is often a large gap between experimental results on benchmark datasets and the +performance of methods in the wild. To better address the evaluation needs of +SID and help close this gap, this paper introduces a benchmarking framework +that integrates several state-of-the-art SID models. Our selection of +integrated models was based on the utilization of varied input features, and +different network architectures, aiming to encompass a broad spectrum of +techniques. The framework leverages recent datasets with a diverse set of +generative models, high level of photo-realism and resolution, reflecting the +rapid improvements in image synthesis technology. Additionally, the framework +enables the study of how image transformations, common in assets shared online, +such as JPEG compression, affect detection performance. SIDBench is available +on https://github.com/mever-team/sidbench and is designed in a modular manner +to enable easy inclusion of new datasets and SID models. + +
+
+
+
+
+ + ☆ Enhancing Boundary Segmentation for Topological Accuracy with + Skeleton-based Methods + + +
+ Topological consistency plays a crucial role in the task of boundary +segmentation for reticular images, such as cell membrane segmentation in neuron +electron microscopic images, grain boundary segmentation in material +microscopic images and road segmentation in aerial images. In these fields, +topological changes in segmentation results have a serious impact on the +downstream tasks, which can even exceed the misalignment of the boundary +itself. To enhance the topology accuracy in segmentation results, we propose +the Skea-Topo Aware loss, which is a novel loss function that takes into +account the shape of each object and the topological significance of the pixels. It +consists of two components. First, the skeleton-aware weighted loss improves +the segmentation accuracy by better modeling the object geometry with +skeletons. Second, a boundary rectified term effectively identifies and +emphasizes topologically critical pixels in the prediction errors using both +foreground and background skeletons in the ground truth and predictions. +Experiments prove that our method improves topological consistency by up to 7 +points in VI compared to 13 state-of-the-art methods, based on objective and +subjective assessments across three different boundary segmentation datasets. +The code is available at https://github.com/clovermini/Skea_topo. + +
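+ A small sketch of a skeleton-aware pixel weighting in the spirit of the loss
+described above: pixels far from the object skeleton are down-weighted. The
+exponential weighting is an illustrative choice, not the paper's exact loss.
+```python
+import numpy as np
+from skimage.morphology import skeletonize
+from scipy.ndimage import distance_transform_edt
+
+def skeleton_weights(mask, alpha=0.05):
+    """mask: binary (H, W) foreground mask; returns per-pixel weights."""
+    skel = skeletonize(mask.astype(bool))
+    dist_to_skel = distance_transform_edt(~skel)   # distance of every pixel to the skeleton
+    return np.exp(-alpha * dist_to_skel)           # emphasise pixels near the skeleton
+
+mask = np.zeros((64, 64), dtype=np.uint8)
+mask[20:44, 28:36] = 1                             # a thin, elongated object
+w = skeleton_weights(mask)
+# weighted_loss = np.mean(w * per_pixel_loss)      # plug the weights into a per-pixel loss
+```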
+
+
+
+
+ + ☆ MileBench: Benchmarking MLLMs in Long Context + + +
+ Despite the advancements and impressive performance of Multimodal Large +Language Models (MLLMs) on benchmarks, their effectiveness in real-world, +long-context, and multi-image tasks is unclear due to the benchmarks' limited +scope. Existing benchmarks often focus on single-image and short-text samples, +and when assessing multi-image tasks, they either limit the image count or +focus on a specific task (e.g., time-series captioning), potentially obscuring the +performance challenges of MLLMs. To address these limitations, we introduce +MileBench, a pioneering benchmark designed to test the MultImodal Long-contExt +capabilities of MLLMs. This benchmark comprises not only multimodal long +contexts, but also multiple tasks requiring both comprehension and generation. +We establish two distinct evaluation sets, diagnostic and realistic, to +systematically assess MLLMs' long-context adaptation capacity and their ability +to complete tasks in long-context scenarios. Our experimental results, obtained +from testing 20 models, reveal that while the closed-source GPT-4(Vision) and +Gemini 1.5 outperform others, most open-source MLLMs struggle in long-context +situations. Interestingly, the performance gap tends to widen with an increase +in the number of images. We strongly encourage an intensification of research +efforts towards enhancing MLLMs' long-context capabilities, especially in +scenarios involving multiple images. + +
+
+ comment: 29 pages, 13 figures, 14 tables +
+
+
+
+
+ + ☆ Multisensor Data Fusion for Automatized Insect Monitoring (KInsecta) + + +
+ Insect populations are declining globally, making systematic monitoring +essential for conservation. Most classical methods involve death traps and thus run +counter to insect conservation. This paper presents a multisensor approach that +uses AI-based data fusion for insect classification. The system is designed as a +low-cost setup and consists of a camera module and an optical wing beat sensor +as well as environmental sensors to measure temperature, irradiance, or time of day +as prior information. The system has been tested in the laboratory and in the +field. First tests on a small, highly unbalanced data set with 7 species show +promising results for species classification. The multisensor system will +support biodiversity and agriculture studies. + +
+
+
+
+
+ + ☆ Clicks2Line: Using Lines for Interactive Image Segmentation + + +
+ For click-based interactive segmentation methods, reducing the number of +clicks required to obtain a desired segmentation result is essential. Although +recent click-based methods yield decent segmentation results, we observe that +a substantial number of clicks is required to segment elongated regions. To +reduce the user effort required, we propose using lines instead of +clicks for such cases. In this paper, an interactive segmentation algorithm +which adaptively adopts either clicks or lines as input is proposed. +Experimental results demonstrate that using lines can generate better +segmentation results than clicks for several cases. + +
+
+
+
+
+ + ☆ Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in + the Wild + + +
+ Large language models have evolved into data-efficient generalists, benefiting +from the universal language interface and large-scale pre-training. However, +constructing a data-efficient generalist for dense visual prediction presents a +distinct challenge due to the variation in label structures across different +tasks. Consequently, generalization to unseen dense prediction tasks in the +low-data regime is not straightforward and has received less attention from +previous vision generalists. In this study, we explore a universal model that +can flexibly adapt to unseen dense label structures with a few examples, +enabling it to serve as a data-efficient vision generalist in diverse +real-world scenarios. To this end, we base our method on a powerful +meta-learning framework and explore several axes to improve its performance and +versatility for real-world problems, such as flexible adaptation mechanisms and +scalability. We evaluate our model across a spectrum of unseen real-world +scenarios where low-shot learning is desirable, including video, 3D, medical, +biological, and user-interactive tasks. Equipped with a generic architecture +and an effective adaptation mechanism, our model flexibly adapts to all of +these tasks with at most 50 labeled images, showcasing a significant +advancement over existing data-efficient generalist approaches. Codes are +available at https://github.com/GitGyun/chameleon. + +
+
+
+
+
+ + ☆ Autonomous Quality and Hallucination Assessment for Virtual Tissue + Staining and Digital Pathology + + +
+ Histopathological staining of human tissue is essential in the diagnosis of +various diseases. The recent advances in virtual tissue staining technologies +using AI alleviate some of the costly and tedious steps involved in the +traditional histochemical staining process, permitting multiplexed rapid +staining of label-free tissue without using staining reagents, while also +preserving tissue. However, potential hallucinations and artifacts in these +virtually stained tissue images pose concerns, especially for the clinical +utility of these approaches. Quality assessment of histology images is +generally performed by human experts, which can be subjective and depends on +the training level of the expert. Here, we present an autonomous quality and +hallucination assessment method (termed AQuA), mainly designed for virtual +tissue staining, while also being applicable to histochemical staining. AQuA +achieves 99.8% accuracy when detecting acceptable and unacceptable virtually +stained tissue images without access to ground truth, also presenting an +agreement of 98.5% with the manual assessments made by board-certified +pathologists. Besides, AQuA achieves super-human performance in identifying +realistic-looking, virtually stained hallucinatory images that would normally +mislead human diagnosticians by deceiving them into diagnosing patients that +never existed. We further demonstrate the wide adaptability of AQuA across +various virtually and histochemically stained tissue images and showcase its +strong external generalization to detect unseen hallucination patterns of +virtual staining network models as well as artifacts observed in the +traditional histochemical staining workflow. This framework creates new +opportunities to enhance the reliability of virtual staining and will provide +quality assurance for various image generation and transformation tasks in +digital pathology and computational imaging. + +
+
+ comment: 37 Pages, 7 Figures +
+
+
+
+
+ + ☆ 3D Gaussian Splatting with Deferred Reflection + + +
+ Neural and Gaussian-based radiance field methods have achieved +great success in the field of novel view synthesis. However, specular +reflection remains non-trivial, as the high-frequency radiance field is +notoriously difficult to fit stably and accurately. We present a deferred +shading method to effectively render specular reflection with Gaussian +splatting. The key challenge comes from the environment map reflection model, +which requires accurate surface normals while simultaneously bottlenecking normal +estimation with discontinuous gradients. We leverage the per-pixel reflection +gradients generated by deferred shading to bridge the optimization process of +neighboring Gaussians, allowing nearly correct normal estimations to gradually +propagate and eventually spread over all reflective objects. Our method +significantly outperforms state-of-the-art techniques and concurrent work in +synthesizing high-quality specular reflection effects, demonstrating a +consistent improvement of peak signal-to-noise ratio (PSNR) for both synthetic +and real-world scenes, while running at a frame rate almost identical to +vanilla Gaussian splatting. + +
+
+
+
+
+ + ☆ MFP: Making Full Use of Probability Maps for Interactive Image + Segmentation CVPR 2024 + + +
+ In recent interactive segmentation algorithms, previous probability maps are +used as network input to help predictions in the current segmentation round. +However, despite the utilization of previous masks, useful information +contained in the probability maps is not well propagated to the current +predictions. In this paper, to overcome this limitation, we propose a novel and +effective algorithm for click-based interactive image segmentation, called MFP, +which attempts to make full use of probability maps. We first modulate previous +probability maps to enhance their representations of user-specified objects. +Then, we feed the modulated probability maps as additional input to the +segmentation network. We implement the proposed MFP algorithm based on the +ResNet-34, HRNet-18, and ViT-B backbones and assess the performance extensively +on various datasets. It is demonstrated that MFP meaningfully outperforms the +existing algorithms using identical backbones. The source codes are available +at \href{https://github.com/cwlee00/MFP}{https://github.com/cwlee00/MFP}. + +
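+ A minimal sketch of the core idea above: the previous probability map is
+modulated around user clicks and fed to the network as an extra input channel.
+The modulation rule and the backbone below are simplified stand-ins, not the
+MFP architecture itself.
+```python
+import torch
+import torch.nn as nn
+
+def modulate(prev_prob, click_map, gamma=0.5):
+    """Sharpen the previous probability map around positive clicks (gamma < 1 boosts values)."""
+    boosted = prev_prob ** gamma
+    return torch.where(click_map > 0, boosted, prev_prob)
+
+backbone = nn.Conv2d(3 + 1 + 1, 1, 3, padding=1)        # RGB + click map + modulated prob map
+
+img = torch.rand(1, 3, 64, 64)
+clicks = torch.zeros(1, 1, 64, 64); clicks[0, 0, 32, 32] = 1.0
+prev_prob = torch.rand(1, 1, 64, 64)
+logits = backbone(torch.cat([img, clicks, modulate(prev_prob, clicks)], dim=1))
+```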
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ $ν$-DBA: Neural Implicit Dense Bundle Adjustment Enables Image-Only + Driving Scene Reconstruction + + +
+ The joint optimization of the sensor trajectory and 3D map is a crucial +characteristic of bundle adjustment (BA), essential for autonomous driving. +This paper presents $\nu$-DBA, a novel framework implementing geometric dense +bundle adjustment (DBA) using 3D neural implicit surfaces for map +parametrization, which optimizes both the map surface and trajectory poses +using geometric error guided by dense optical flow prediction. Additionally, we +fine-tune the optical flow model with per-scene self-supervision to further +improve the quality of the dense mapping. Our experimental results on multiple +driving scene datasets demonstrate that our method achieves superior trajectory +optimization and dense reconstruction accuracy. We also investigate the +influences of photometric error and different neural geometric priors on the +performance of surface reconstruction and novel view synthesis. Our method +stands as a significant step towards leveraging neural implicit representations +in dense bundle adjustment for more accurate trajectories and detailed +environmental mapping. + +
+
+
+
+
+ + ☆ ShadowMaskFormer: Mask Augmented Patch Embeddings for Shadow Removal + + +
+ Transformer recently emerged as the de facto model for computer vision tasks +and has also been successfully applied to shadow removal. However, these +existing methods heavily rely on intricate modifications to the attention +mechanisms within the transformer blocks while using a generic patch embedding. +As a result, it often leads to complex architectural designs requiring +additional computation resources. In this work, we aim to explore the efficacy +of incorporating shadow information within the early processing stage. +Accordingly, we propose a transformer-based framework with a novel patch +embedding that is tailored for shadow removal, dubbed ShadowMaskFormer. +Specifically, we present a simple and effective mask-augmented patch embedding +to integrate shadow information and promote the model's emphasis on acquiring +knowledge for shadow regions. Extensive experiments conducted on the ISTD, +ISTD+, and SRD benchmark datasets demonstrate the efficacy of our method +against state-of-the-art approaches while using fewer model parameters. + +
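+ A short sketch of a mask-augmented patch embedding, where the shadow mask is
+injected at the patch-embedding stage rather than inside the attention blocks;
+the concrete fusion (channel concatenation before projection) is an
+illustrative choice, not necessarily the paper's exact design.
+```python
+import torch
+import torch.nn as nn
+
+class MaskAugmentedPatchEmbed(nn.Module):
+    def __init__(self, patch=8, dim=96):
+        super().__init__()
+        self.proj = nn.Conv2d(3 + 1, dim, kernel_size=patch, stride=patch)  # image + mask channel
+
+    def forward(self, img, shadow_mask):
+        x = torch.cat([img, shadow_mask], dim=1)        # expose shadow regions early
+        return self.proj(x).flatten(2).transpose(1, 2)  # (batch, num_patches, dim) tokens
+
+tokens = MaskAugmentedPatchEmbed()(torch.rand(1, 3, 64, 64), torch.rand(1, 1, 64, 64))
+```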
+
+
+
+
+ + ☆ Efficient Meta-Learning Enabled Lightweight Multiscale Few-Shot Object + Detection in Remote Sensing Images + + +
+ Presently, the task of few-shot object detection (FSOD) in remote sensing +images (RSIs) has become a focal point of attention. Numerous few-shot +detectors, particularly those based on two-stage detectors, face challenges +when dealing with the multiscale complexities inherent in RSIs. Moreover, these +detectors present impractical characteristics in real-world applications, +mainly due to their unwieldy model parameters when handling large amounts of +data. In contrast, we recognize the advantages of one-stage detectors, +including high detection speed and a global receptive field. Consequently, we +choose the YOLOv7 one-stage detector as a baseline and subject it to a novel +meta-learning training framework. This transformation allows the detector to +adeptly address FSOD tasks while capitalizing on its inherent advantage of being +lightweight. Additionally, we thoroughly investigate the samples generated by +the meta-learning strategy and introduce a novel meta-sampling approach to +retain samples produced by our designed meta-detection head. Coupled with our +devised meta-cross loss, we deliberately utilize "negative samples" that are +often overlooked to extract valuable knowledge from them. This approach serves +to enhance detection accuracy and efficiently refine the overall meta-learning +strategy. To validate the effectiveness of our proposed detector, we conducted +performance comparisons with current state-of-the-art detectors using the DIOR +and NWPU VHR-10.v2 datasets, yielding satisfactory results. + +
+
+
+
+
+ + ☆ Unsupervised Dynamics Prediction with Object-Centric Kinematics + + +
+ Human perception involves discerning complex multi-object scenes into +time-static object appearance (i.e., size, shape, color) and time-varying object +motion (i.e., location, velocity, acceleration). This innate ability to +unconsciously understand the environment is the motivation behind the success +of dynamics modeling. Object-centric representations have emerged as a +promising tool for dynamics prediction, yet they primarily focus on the +objects' appearance, often overlooking other crucial attributes. In this paper, +we propose Object-Centric Kinematics (OCK), a framework for dynamics prediction +leveraging object-centric representations. Our model utilizes a novel component +named object kinematics, which comprises low-level structured states of +objects' position, velocity, and acceleration. The object kinematics are +obtained via either implicit or explicit approaches, enabling comprehensive +spatiotemporal object reasoning, and integrated through various transformer +mechanisms, facilitating effective object-centric dynamics modeling. Our model +demonstrates superior performance when handling objects and backgrounds in +complex scenes characterized by a wide range of object attributes and dynamic +movements. Moreover, our model demonstrates generalization capabilities across +diverse synthetic environments, highlighting its potential for broad +applicability in vision-related tasks. + +
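+ The "explicit" route to object kinematics mentioned above can be sketched as
+finite differences over per-object positions; the state layout below is an
+assumption for illustration, not the paper's exact representation.
+```python
+import torch
+
+def object_kinematics(positions, dt=1.0):
+    """positions: (time, num_objects, 2) object centroids per frame."""
+    velocity = torch.diff(positions, dim=0) / dt
+    acceleration = torch.diff(velocity, dim=0) / dt
+    t = acceleration.shape[0]                      # align sequence lengths
+    return torch.cat([positions[:t], velocity[:t], acceleration], dim=-1)  # (t, objects, 6)
+
+states = object_kinematics(torch.cumsum(torch.randn(10, 5, 2), dim=0))
+```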
+
+ comment: 15 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Research on Intelligent Aided Diagnosis System of Medical Image Based on + Computer Deep Learning + + +
+ This paper combines the Struts and Hibernate architectures, using +DAO (Data Access Object) to store and access data. A dual-mode +humidity medical image library suitable for deep networks is then established, and a +dual-mode medical image assisted diagnosis method based on the image is +proposed. Through the testing of various feature extraction methods, the best +area under the receiver operating characteristic curve (AUROC) is 0.9985, the recall rate +is 0.9814, and the accuracy is 0.9833. This method can be applied to clinical +diagnosis and is practical. Through the system, each outpatient physician can +quickly register or log in to the platform for image uploading, thus obtaining +more accurate images. The segmentation of images can guide doctors in clinical +departments. The image is then analyzed to determine the location and nature of +the tumor, so as to enable targeted treatment. + +
+
+
+
+
+ + ☆ Capabilities of Gemini Models in Medicine + + +
+ Excellence in a wide variety of medical applications poses considerable +challenges for AI, requiring advanced reasoning, access to up-to-date medical +knowledge and understanding of complex multimodal data. Gemini models, with +strong general capabilities in multimodal and long-context reasoning, offer +exciting possibilities in medicine. Building on these core strengths of Gemini, +we introduce Med-Gemini, a family of highly capable multimodal models that are +specialized in medicine with the ability to seamlessly use web search, and that +can be efficiently tailored to novel modalities using custom encoders. We +evaluate Med-Gemini on 14 medical benchmarks, establishing new state-of-the-art +(SoTA) performance on 10 of them, and surpass the GPT-4 model family on every +benchmark where a direct comparison is viable, often by a wide margin. On the +popular MedQA (USMLE) benchmark, our best-performing Med-Gemini model achieves +SoTA performance of 91.1% accuracy, using a novel uncertainty-guided search +strategy. On 7 multimodal benchmarks including NEJM Image Challenges and MMMU +(health & medicine), Med-Gemini improves over GPT-4V by an average relative +margin of 44.5%. We demonstrate the effectiveness of Med-Gemini's long-context +capabilities through SoTA performance on a needle-in-a-haystack retrieval task +from long de-identified health records and medical video question answering, +surpassing prior bespoke methods using only in-context learning. Finally, +Med-Gemini's performance suggests real-world utility by surpassing human +experts on tasks such as medical text summarization, alongside demonstrations +of promising potential for multimodal medical dialogue, medical research and +education. Taken together, our results offer compelling evidence for +Med-Gemini's potential, although further rigorous evaluation will be crucial +before real-world deployment in this safety-critical domain. + +
+
+
+
+
+ + ☆ 3AM: An Ambiguity-Aware Multi-Modal Machine Translation Dataset + + +
+ Multimodal machine translation (MMT) is a challenging task that seeks to +improve translation quality by incorporating visual information. However, +recent studies have indicated that the visual information provided by existing +MMT datasets is insufficient, causing models to disregard it and overestimate +their capabilities. This issue presents a significant obstacle to the +development of MMT research. This paper presents a novel solution to this issue +by introducing 3AM, an ambiguity-aware MMT dataset comprising 26,000 parallel +sentence pairs in English and Chinese, each with corresponding images. Our +dataset is specifically designed to include more ambiguity and a greater +variety of both captions and images than other MMT datasets. We utilize a word +sense disambiguation model to select ambiguous data from vision-and-language +datasets, resulting in a more challenging dataset. We further benchmark several +state-of-the-art MMT models on our proposed dataset. Experimental results show +that MMT models trained on our dataset exhibit a greater ability to exploit +visual information than those trained on other MMT datasets. Our work provides +a valuable resource for researchers in the field of multimodal learning and +encourages further exploration in this area. The data, code and scripts are +freely available at https://github.com/MaxyLee/3AM. + +
+
+
+
+
+ + ☆ Multi-modal Perception Dataset of In-water Objects for Autonomous + Surface Vehicles ICRA + + +
+ This paper introduces the first publicly accessible multi-modal perception +dataset for autonomous maritime navigation, focusing on in-water obstacles +within the aquatic environment to enhance situational awareness for Autonomous +Surface Vehicles (ASVs). This dataset, consisting of diverse objects +encountered under varying environmental conditions, aims to bridge the research +gap in marine robotics by providing a multi-modal, annotated, and ego-centric +perception dataset, for object detection and classification. We also show the +applicability of the proposed dataset's framework using deep learning-based +open-source perception algorithms that have shown success. We expect that our +dataset will contribute to development of the marine autonomy pipeline and +marine (field) robotics. Please note this is a work-in-progress paper about our +on-going research that we plan to release in full via future publication. + +
+
+ comment: Accepted to the IEEE ICRA Workshop on Field Robotics 2024 +
+
+
+
+
+ + ☆ PKU-AIGIQA-4K: A Perceptual Quality Assessment Database for Both + Text-to-Image and Image-to-Image AI-Generated Images + + +
+ In recent years, image generation technology has rapidly advanced, resulting +in the creation of a vast array of AI-generated images (AIGIs). However, the +quality of these AIGIs is highly inconsistent, with low-quality AIGIs severely +impairing the visual experience of users. Due to the widespread application of +AIGIs, the AI-generated image quality assessment (AIGIQA), aimed at evaluating +the quality of AIGIs from the perspective of human perception, has garnered +increasing interest among scholars. Nonetheless, current research has not yet +fully explored this field. We have observed that existing databases are limited +to images generated from single scenario settings. Databases such as AGIQA-1K, +AGIQA-3K, and AIGCIQA2023, for example, only include images generated by +text-to-image generative models. This oversight highlights a critical gap in +the current research landscape, underscoring the need for dedicated databases +catering to image-to-image scenarios, as well as more comprehensive databases +that encompass a broader range of AI-generated image scenarios. Addressing +these issues, we have established a large scale perceptual quality assessment +database for both text-to-image and image-to-image AIGIs, named PKU-AIGIQA-4K. +We then conduct a well-organized subjective experiment to collect quality +labels for AIGIs and perform a comprehensive analysis of the PKU-AIGIQA-4K +database. Regarding the use of image prompts during the training process, we +propose three image quality assessment (IQA) methods based on pre-trained +models that include a no-reference method NR-AIGCIQA, a full-reference method +FR-AIGCIQA, and a partial-reference method PR-AIGCIQA. Finally, leveraging the +PKU-AIGIQA-4K database, we conduct extensive benchmark experiments and compare +the performance of the proposed methods and the current IQA methods. + +
+
+ comment: 12 pages. arXiv admin note: substantial text overlap with + arXiv:2311.15556 +
+
+
+
+
+ + ☆ Spectral-Spatial Mamba for Hyperspectral Image Classification + + +
+ Recently, deep learning models have achieved excellent performance in +hyperspectral image (HSI) classification. Among the many deep models, +Transformer has gradually attracted interest for its excellence in modeling the +long-range dependencies of spatial-spectral features in HSI. However, +Transformer has the problem of quadratic computational complexity due to the +self-attention mechanism, which is heavier than other models and thus has +limited adoption in HSI processing. Fortunately, the recently emerging state +space model-based Mamba shows great computational efficiency while achieving +the modeling power of Transformers. Therefore, in this paper, we make a +preliminary attempt to apply the Mamba to HSI classification, leading to the +proposed spectral-spatial Mamba (SS-Mamba). Specifically, the proposed SS-Mamba +mainly consists of spectral-spatial token generation module and several stacked +spectral-spatial Mamba blocks. Firstly, the token generation module converts +any given HSI cube to spatial and spectral tokens as sequences. And then these +tokens are sent to stacked spectral-spatial mamba blocks (SS-MB). Each SS-MB +block consists of two basic mamba blocks and a spectral-spatial feature +enhancement module. The spatial and spectral tokens are processed separately by +the two basic mamba blocks, respectively. Besides, the feature enhancement +module modulates spatial and spectral tokens using HSI sample's center region +information. In this way, the spectral and spatial tokens cooperate with each +other and achieve information fusion within each block. The experimental +results conducted on widely used HSI datasets reveal that the proposed model +achieves competitive results compared with the state-of-the-art methods. The +Mamba-based method opens a new window for HSI classification. + +
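+ A toy sketch of the token generation step described above, turning an HSI cube
+into spectral and spatial token sequences; the band-group size, patch size, and
+embedding dimension are placeholders, and the Mamba blocks themselves are
+omitted.
+```python
+import torch
+import torch.nn as nn
+
+class SpectralSpatialTokens(nn.Module):
+    def __init__(self, bands=200, band_group=10, dim=64):
+        super().__init__()
+        self.band_group = band_group
+        self.spectral_proj = nn.Linear(band_group, dim)   # one token per group of bands
+        self.spatial_proj = nn.Linear(bands, dim)         # one token per pixel of the patch
+
+    def forward(self, cube):                              # cube: (batch, bands, H, W)
+        b, c, h, w = cube.shape
+        spectral = cube.mean(dim=(2, 3)).view(b, c // self.band_group, self.band_group)
+        spectral_tokens = self.spectral_proj(spectral)                        # (batch, groups, dim)
+        spatial_tokens = self.spatial_proj(cube.flatten(2).transpose(1, 2))   # (batch, H*W, dim)
+        return spectral_tokens, spatial_tokens
+
+spec, spat = SpectralSpatialTokens()(torch.randn(2, 200, 9, 9))
+```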
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Semantic Line Combination Detector + + +
+ A novel algorithm, called semantic line combination detector (SLCD), to find +an optimal combination of semantic lines is proposed in this paper. It +processes all lines in each line combination at once to assess the overall +harmony of the lines. First, we generate various line combinations from +reliable lines. Second, we estimate the score of each line combination and +determine the best one. Experimental results demonstrate that the proposed SLCD +outperforms existing semantic line detectors on various datasets. Moreover, it +is shown that SLCD can be applied effectively to three vision tasks of +vanishing point detection, symmetry axis detection, and composition-based image +retrieval. Our codes are available at https://github.com/Jinwon-Ko/SLCD. + +
+
+
+
+
+ + ☆ ViOCRVQA: Novel Benchmark Dataset and Vision Reader for Visual Question + Answering by Understanding Vietnamese Text in Images + + +
+ Optical Character Recognition - Visual Question Answering (OCR-VQA) is the +task of answering questions about the text contained in images, a task that has only +been significantly developed for English in recent years. However, there +are limited studies of this task in low-resource languages such as Vietnamese. +To this end, we introduce a novel dataset, ViOCRVQA (Vietnamese Optical +Character Recognition - Visual Question Answering dataset), consisting of +28,000+ images and 120,000+ question-answer pairs. In this dataset, all the +images contain text and questions about the information relevant to the text in +the images. We deploy ideas from state-of-the-art methods proposed for English +to conduct experiments on our dataset, revealing the challenges and +difficulties inherent in a Vietnamese dataset. Furthermore, we introduce a +novel approach, called VisionReader, which achieved 0.4116 in EM and 0.6990 in +the F1-score on the test set. Through the results, we found that the OCR system +plays a very important role in VQA models on the ViOCRVQA dataset. In addition, +the objects in the image also play a role in improving model performance. We +openly release our dataset (https://github.com/qhnhynmm/ViOCRVQA.git) +for further research on the OCR-VQA task in Vietnamese. + +
+
+
+
+
+ + ☆ Reconstructing Satellites in 3D from Amateur Telescope Images + + +
+ This paper proposes a framework for the 3D reconstruction of satellites in +low-Earth orbit, utilizing videos captured by small amateur telescopes. The +video data obtained from these telescopes differ significantly from data for +standard 3D reconstruction tasks, characterized by intense motion blur, +atmospheric turbulence, pervasive background light pollution, extended focal +length and constrained observational perspectives. To address these challenges, +our approach begins with a comprehensive pre-processing workflow that +encompasses deep learning-based image restoration, feature point extraction and +camera pose initialization. We proceed with the application of an improved 3D +Gaussian splatting algorithm for reconstructing the 3D model. Our technique +supports simultaneous 3D Gaussian training and pose estimation, enabling the +robust generation of intricate 3D point clouds from sparse, noisy data. The +procedure is further bolstered by a post-editing phase designed to eliminate +noise points inconsistent with our prior knowledge of a satellite's geometric +constraints. We validate our approach using both synthetic datasets and actual +observations of China's Space Station, showcasing its significant advantages +over existing methods in reconstructing 3D space objects from ground-based +observations. + +
+
+
+
+
+ + ☆ Object Registration in Neural Fields ICRA 2024 + + +
+ Neural fields provide a continuous scene representation of 3D geometry and +appearance in a way which has great promise for robotics applications. One +functionality that unlocks unique use-cases for neural fields in robotics is +object 6-DoF registration. In this paper, we provide an expanded analysis of +the recent Reg-NF neural field registration method and its use-cases within a +robotics context. We showcase the scenario of determining the 6-DoF pose of +known objects within a scene using scene and object neural field models. We +show how this may be used to better represent objects within imperfectly +modelled scenes and generate new scenes by substituting object neural field +models into the scene. + +
+
+ comment: Accepted to ICRA 2024 RoboNeRF workshop. 5 pages, 10 figures. arXiv + admin note: substantial text overlap with arXiv:2402.09722 +
+
+
+
+
+ + ☆ Post-hoc and manifold explanations analysis of facial expression data + based on deep learning + + +
+ The complex information processing system of humans generates a lot of +objective and subjective evaluations, making the exploration of human cognitive +products of great theoretical value. In recent years, deep +learning technologies, which are inspired by biological brain mechanisms, have +made significant strides in applications to psychological and cognitive +science research, particularly in the memorization and recognition of facial +data. This paper investigates through experimental research how neural networks +process and store facial expression data and associate these data with a range +of psychological attributes produced by humans. Researchers utilized the deep +learning model VGG16, demonstrating that neural networks can learn and +reproduce key features of facial data, thereby storing image memories. +Moreover, the experimental results reveal the potential of deep learning models +in understanding human emotions and cognitive processes and establish a +manifold visualization interpretation of cognitive products or psychological +attributes from a non-Euclidean space perspective, offering new insights into +enhancing the explainability of AI. This study not only advances the +application of AI technology in the field of psychology but also provides a new +psychological and theoretical understanding of the information processing of AI. +The code is available at: https://github.com/NKUShaw/Psychoinformatics. + +
+
+
 comment: 19 pages +
+
+
+
+
+ + ☆ G-Refine: A General Quality Refiner for Text-to-Image Generation + + +
+ With the evolution of Text-to-Image (T2I) models, the quality defects of +AI-Generated Images (AIGIs) pose a significant barrier to their widespread +adoption. In terms of both perception and alignment, existing models cannot +always guarantee high-quality results. To mitigate this limitation, we +introduce G-Refine, a general image quality refiner designed to enhance +low-quality images without compromising the integrity of high-quality ones. The +model is composed of three interconnected modules: a perception quality +indicator, an alignment quality indicator, and a general quality enhancement +module. Based on the mechanisms of the Human Visual System (HVS) and syntax +trees, the first two indicators can respectively identify the perception and +alignment deficiencies, and the last module can apply targeted quality +enhancement accordingly. Extensive experimentation reveals that when compared +to alternative optimization methods, AIGIs after G-Refine outperform in 10+ +quality metrics across 4 databases. This improvement significantly contributes +to the practical application of contemporary T2I models, paving the way for +their broader adoption. The code will be released on +https://github.com/Q-Future/Q-Refine. + +
+
+
+
+
+ + ☆ SAGS: Structure-Aware 3D Gaussian Splatting + + +
+ Following the advent of NeRFs, 3D Gaussian Splatting (3D-GS) has paved the +way to real-time neural rendering, overcoming the computational burden of +volumetric methods. Following the pioneering work of 3D-GS, several methods +have attempted to provide compact, high-fidelity +alternatives. However, by employing a geometry-agnostic optimization scheme, +these methods neglect the inherent 3D structure of the scene, thereby +restricting the expressivity and the quality of the representation, resulting +in floaters and other artifacts. In this work, we propose a +structure-aware Gaussian Splatting method (SAGS) that implicitly encodes the +geometry of the scene, which translates into state-of-the-art rendering performance +and reduced storage requirements on benchmark novel-view synthesis datasets. +SAGS is founded on a local-global graph representation that facilitates the +learning of complex scenes and enforces meaningful point displacements that +preserve the scene's geometry. Additionally, we introduce a lightweight version +of SAGS, using a simple yet effective mid-point interpolation scheme, which +showcases a compact representation of the scene with up to 24$\times$ size +reduction without relying on any compression strategies. Extensive +experiments across multiple benchmark datasets demonstrate the superiority of +SAGS compared to state-of-the-art 3D-GS methods in terms of both rendering quality +and model size. In addition, we demonstrate that our structure-aware method can +effectively mitigate floating artifacts and irregular distortions of previous +methods while obtaining precise depth maps. Project page: +https://eververas.github.io/SAGS/. +
+
+ comment: 15 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ Enhancing Brazilian Sign Language Recognition through Skeleton Image + Representation + + +
+ Effective communication is paramount for the inclusion of deaf individuals in +society. However, persistent communication barriers due to limited Sign +Language (SL) knowledge hinder their full participation. In this context, Sign +Language Recognition (SLR) systems have been developed to improve communication +between signing and non-signing individuals. In particular, the +problem of recognizing isolated signs (Isolated Sign Language Recognition, +ISLR) is of great relevance to the development of vision-based SL search engines, +learning tools, and translation systems. This work proposes an ISLR approach +where body, hand, and facial landmarks are extracted over time and +encoded as 2-D images. These images are processed by a convolutional neural +network, which maps the visual-temporal information into a sign label. +Experimental results demonstrate that our method surpasses the state-of-the-art +in terms of performance metrics on two widely recognized datasets in Brazilian +Sign Language (LIBRAS), the primary focus of this study. In addition to being +more accurate, our method is more time-efficient and easier to train due to its +simpler network architecture and its reliance solely on RGB data as input. +
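The encoding step described above (landmarks over time turned into 2-D images for a CNN) can be shown with a toy sketch. The layout below, with frames as rows, joints as columns, and (x, y, z) mapped to RGB channels, is one common convention for skeleton-image representations; it is an assumption, not necessarily the exact arrangement used in the paper.

```python
# Toy sketch of a skeleton-image representation: a (T, J, 3) landmark sequence is
# rescaled to [0, 255] per channel and stored as an RGB image (rows = frames,
# columns = joints, channels = x/y/z). The paper's exact layout may differ.
import numpy as np

def landmarks_to_image(seq: np.ndarray) -> np.ndarray:
    """seq: float array of shape (T, J, 3) with landmark coordinates."""
    img = np.empty_like(seq)
    for c in range(3):  # normalize each coordinate channel independently
        ch = seq[..., c]
        lo, hi = ch.min(), ch.max()
        img[..., c] = (ch - lo) / (hi - lo + 1e-8)
    return (img * 255).astype(np.uint8)  # ready to feed a standard image CNN

if __name__ == "__main__":
    T, J = 64, 75  # e.g. 64 frames, 75 body+hand+face landmarks (illustrative numbers)
    seq = np.random.rand(T, J, 3)
    image = landmarks_to_image(seq)
    print(image.shape, image.dtype)  # (64, 75, 3) uint8
```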
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Evaluating Deep Clustering Algorithms on Non-Categorical 3D CAD Models + + +
+ We introduce the first work on benchmarking and evaluating deep clustering +algorithms on large-scale non-categorical 3D CAD models. We first propose a +workflow to allow expert mechanical engineers to efficiently annotate 252,648 +carefully sampled pairwise CAD model similarities, from a subset of the ABC +dataset with 22,968 shapes. Using seven baseline deep clustering methods, we +then investigate the fundamental challenges of evaluating clustering methods +for non-categorical data. Based on these challenges, we propose a novel and +viable ensemble-based clustering comparison approach. This work is the first to +directly target the underexplored area of deep clustering algorithms for 3D +shapes, and we believe it will be an important building block to analyze and +utilize the massive 3D shape collections that are starting to appear in deep +geometric computing. + +
+
+
+
+
+ + ☆ Integrating Present and Past in Unsupervised Continual Learning + + +
+ We formulate a unifying framework for unsupervised continual learning (UCL), +which disentangles learning objectives that are specific to the present and the +past data, encompassing stability, plasticity, and cross-task consolidation. +The framework reveals that many existing UCL approaches overlook cross-task +consolidation and try to balance plasticity and stability in a shared embedding +space. This results in worse performance due to a lack of within-task data +diversity and reduced effectiveness in learning the current task. Our method, +Osiris, which explicitly optimizes all three objectives on separate embedding +spaces, achieves state-of-the-art performance on all benchmarks, including two +novel benchmarks proposed in this paper featuring semantically structured task +sequences. Compared to standard benchmarks, these two structured benchmarks +more closely resemble visual signals received by humans and animals when +navigating real-world environments. Finally, we show some preliminary evidence +that continual models can benefit from such realistic learning scenarios. + +
+
+ comment: CoLLAs 2024 +
+
+
+
+
+ + ☆ Q-GroundCAM: Quantifying Grounding in Vision Language Models via GradCAM CVPR 2024 + + +
+ Vision and Language Models (VLMs) continue to demonstrate remarkable +zero-shot (ZS) performance across various tasks. However, many probing studies +have revealed that even the best-performing VLMs struggle to capture aspects of +compositional scene understanding, lacking the ability to properly ground and +localize linguistic phrases in images. Recent VLM advancements include scaling +up both model and dataset sizes, additional training objectives and levels of +supervision, and variations in the model architectures. To characterize the +grounding ability of VLMs, such as phrase grounding, referring expressions +comprehension, and relationship understanding, Pointing Game has been used as +an evaluation metric for datasets with bounding box annotations. In this paper, +we introduce a novel suite of quantitative metrics that utilize GradCAM +activations to rigorously evaluate the grounding capabilities of pre-trained +VLMs like CLIP, BLIP, and ALBEF. These metrics offer an explainable and +quantifiable approach for a more detailed comparison of the zero-shot +capabilities of VLMs and enable measuring models' grounding uncertainty. This +characterization reveals interesting tradeoffs between the size of the model, +the dataset size, and their performance. + +
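For readers unfamiliar with how GradCAM activations can be scored against box annotations, the sketch below computes two simple quantities on a precomputed heatmap: the classic Pointing Game hit (is the argmax inside the box?) and the fraction of activation mass falling inside the box. These are generic grounding scores in the spirit of the abstract, not the exact metric suite the paper introduces.

```python
# Generic grounding scores on a GradCAM heatmap (illustrative, not the paper's exact metrics).
import numpy as np

def pointing_game_hit(heatmap: np.ndarray, box: tuple) -> bool:
    """box = (x0, y0, x1, y1) in pixel coordinates; heatmap is (H, W)."""
    y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
    x0, y0, x1, y1 = box
    return (x0 <= x <= x1) and (y0 <= y <= y1)

def mass_inside_box(heatmap: np.ndarray, box: tuple) -> float:
    """Fraction of (non-negative) activation mass inside the annotated box."""
    h = np.clip(heatmap, 0, None)
    x0, y0, x1, y1 = box
    inside = h[y0:y1 + 1, x0:x1 + 1].sum()
    return float(inside / (h.sum() + 1e-8))

if __name__ == "__main__":
    cam = np.random.rand(224, 224)   # placeholder for a real GradCAM map
    gt_box = (60, 60, 160, 160)      # placeholder ground-truth phrase box
    print(pointing_game_hit(cam, gt_box), round(mass_inside_box(cam, gt_box), 3))
```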
+
+ comment: Accepted to CVPR 2024, Second Workshop on Foundation Models (WFM) +
+
+
+
+
+ + ☆ Compositional Factorization of Visual Scenes with Convolutional Sparse + Coding and Resonator Networks + + +
+ We propose a system for visual scene analysis and recognition based on +encoding the sparse, latent feature-representation of an image into a +high-dimensional vector that is subsequently factorized to parse scene content. +The sparse feature representation is learned from image statistics via +convolutional sparse coding, while scene parsing is performed by a resonator +network. The integration of sparse coding with the resonator network increases +the capacity of distributed representations and reduces collisions in the +combinatorial search space during factorization. We find that for this problem +the resonator network is capable of fast and accurate vector factorization, and +we develop a confidence-based metric that assists in tracking the convergence +of the resonator network. + +
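A resonator network factorizes a vector formed by binding (elementwise multiplication of bipolar codes) back into its constituent codebook entries by iterating cleanup steps per factor. The sketch below is a textbook three-factor resonator on random bipolar vectors, included only to make the factorization step concrete; the paper couples it with convolutional sparse coding and a convergence-tracking confidence metric, which are not reproduced here.

```python
# Minimal resonator network for factorizing a Hadamard-bound bipolar vector
# (illustrative of the general algorithm; the paper pairs this with sparse coding).
import numpy as np

rng = np.random.default_rng(0)
D, K = 2048, 20                     # vector dimension, codebook size per factor
codebooks = [rng.choice([-1, 1], size=(K, D)) for _ in range(3)]

# Build a composite vector s = x * y * z from one entry of each codebook.
truth = [int(rng.integers(K)) for _ in range(3)]
s = np.prod([cb[i] for cb, i in zip(codebooks, truth)], axis=0)

# Initialize each factor estimate as the superposition of its codebook.
est = [np.sign(cb.sum(axis=0) + 1e-9) for cb in codebooks]

for _ in range(50):                 # resonator iterations
    for f in range(3):
        others = np.prod([est[g] for g in range(3) if g != f], axis=0)
        unbound = s * others        # unbind the other factors' current estimates
        scores = codebooks[f] @ unbound
        est[f] = np.sign(codebooks[f].T @ scores + 1e-9)  # cleanup through the codebook

decoded = [int(np.argmax(cb @ e)) for cb, e in zip(codebooks, est)]
print("true:", truth, "decoded:", decoded)  # typically identical at this problem size
```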
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Source-Free Domain Adaptation of Weakly-Supervised Object Localization + Models for Histology CVPR + + +
+ Given the emergence of deep learning, digital pathology has gained popularity +for cancer diagnosis based on histology images. Deep weakly supervised object +localization (WSOL) models can be trained to classify histology images +according to cancer grade and identify regions of interest (ROIs) for +interpretation, using inexpensive global image-class annotations. A WSOL model +initially trained on some labeled source image data can be adapted using +unlabeled target data in cases of significant domain shifts caused by +variations in staining, scanners, and cancer type. In this paper, we focus on +source-free (unsupervised) domain adaptation (SFDA), a challenging problem +where a pre-trained source model is adapted to a new target domain without +using any source domain data for privacy and efficiency reasons. SFDA of WSOL +models raises several challenges in histology, most notably because they are +not intended to adapt for both classification and localization tasks. In this +paper, 4 state-of-the-art SFDA methods, each one representative of a main SFDA +family, are compared for WSOL in terms of classification and localization +accuracy. They are the SFDA-Distribution Estimation, Source HypOthesis +Transfer, Cross-Domain Contrastive Learning, and Adaptively Domain Statistics +Alignment. Experimental results on the challenging GlaS (smaller, colon +cancer) and Camelyon16 (larger, breast cancer) histology datasets indicate that +these SFDA methods typically perform poorly for localization after adaptation +when optimized for classification. +
+
+ comment: 16 pages, 21 figures, 5 tables, CVPRw 2024 +
+
+
+
+
+ + ☆ EMOPortraits: Emotion-enhanced Multimodal One-shot Head Avatars + + +
+ Head avatars animated by visual signals have gained popularity, particularly +in cross-driving synthesis where the driver differs from the animated +character, a challenging but highly practical approach. The recently presented +MegaPortraits model has demonstrated state-of-the-art results in this domain. +We conduct a deep examination and evaluation of this model, with a particular +focus on its latent space for facial expression descriptors, and uncover +several limitations in its ability to express intense face motions. To +address these limitations, we propose substantial changes to both the training +pipeline and the model architecture to introduce our EMOPortraits model, where we: +(1) enhance the model's capability to faithfully support intense, asymmetric face +expressions, setting a new state-of-the-art result in the emotion transfer +task and surpassing previous methods in both metrics and quality; +(2) incorporate a speech-driven mode into our model, achieving top-tier performance +in audio-driven facial animation and making it possible to drive the source identity +through diverse modalities, including a visual signal, audio, or a blend of both; +and (3) propose a novel multi-view video dataset featuring a wide range of intense +and asymmetric facial expressions, filling the gap left by the absence of such data in +existing datasets. +
+
+
+
+
+ + ☆ Real-Time Convolutional Neural Network-Based Star Detection and + Centroiding Method for CubeSat Star Tracker + + +
+ Star trackers are one of the most accurate celestial sensors used for +absolute attitude determination. The devices detect stars in captured images +and accurately compute their projected centroids on an imaging focal plane with +subpixel precision. Traditional algorithms for star detection and centroiding +often rely on threshold adjustments for star pixel detection and pixel +brightness weighting for centroid computation. However, challenges like high +sensor noise and stray light can compromise algorithm performance. This article +introduces a Convolutional Neural Network (CNN)-based approach for star +detection and centroiding, tailored to address the issues posed by noisy star +tracker images in the presence of stray light and other artifacts. Trained +using simulated star images overlayed with real sensor noise and stray light, +the CNN produces both a binary segmentation map distinguishing star pixels from +the background and a distance map indicating each pixel's proximity to the +nearest star centroid. Leveraging this distance information alongside pixel +coordinates transforms centroid calculations into a set of trilateration +problems solvable via the least squares method. Our method employs efficient +UNet variants for the underlying CNN architectures, and the variants' +performances are evaluated. Comprehensive testing has been undertaken with +synthetic image evaluations, hardware-in-the-loop assessments, and night sky +tests. The tests consistently demonstrated that our method outperforms several +existing algorithms in centroiding accuracy and exhibits superior resilience to +high sensor noise and stray light interference. An additional benefit of our +algorithms is that they can be executed in real-time on low-power edge AI +processors. + +
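The centroiding step described above, which turns per-pixel distance predictions into a sub-pixel centroid via least-squares trilateration, can be made concrete with a small numerical sketch. The linearization below (differencing the circle equations against a reference pixel) is the standard trick for this kind of problem and is offered as one way such a solver could look, not as the authors' exact implementation.

```python
# Least-squares trilateration sketch: recover a star centroid from per-pixel
# distance-to-centroid predictions (illustrative; not the paper's exact solver).
import numpy as np

def centroid_from_distances(px: np.ndarray, py: np.ndarray, d: np.ndarray) -> np.ndarray:
    """px, py: pixel coordinates near a star; d: predicted distance of each pixel
    to the (unknown) centroid. Differencing |p_i - c|^2 = d_i^2 against pixel 0
    gives a linear system in the centroid c = (cx, cy)."""
    A = 2.0 * np.stack([px[1:] - px[0], py[1:] - py[0]], axis=1)
    b = (px[1:] ** 2 - px[0] ** 2) + (py[1:] ** 2 - py[0] ** 2) - (d[1:] ** 2 - d[0] ** 2)
    c, *_ = np.linalg.lstsq(A, b, rcond=None)
    return c

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    true_c = np.array([12.34, 7.89])
    px, py = np.meshgrid(np.arange(8, 18), np.arange(3, 13))
    px, py = px.ravel().astype(float), py.ravel().astype(float)
    d = np.hypot(px - true_c[0], py - true_c[1]) + rng.normal(0, 0.05, px.size)  # noisy "CNN" distances
    print("recovered centroid:", centroid_from_distances(px, py, d))  # ~ [12.34, 7.89]
```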
+
+
+
+
+ + ☆ Longitudinal Mammogram Risk Prediction MICCAI 2024 + + +
+ Breast cancer is one of the leading causes of mortality among women +worldwide. Early detection and risk assessment play a crucial role in improving +survival rates. Therefore, annual or biennial mammograms are often recommended +for screening in high-risk groups. Mammograms are typically interpreted by +expert radiologists based on the Breast Imaging Reporting and Data System +(BI-RADS), which provides a uniform way to describe findings and categorizes +them to indicate the level of concern for breast cancer. Recently, machine +learning (ML) and computational approaches have been developed to automate and +improve the interpretation of mammograms. However, both BI-RADS and the +ML-based methods focus on the analysis of data from the present and sometimes +the most recent prior visit. While it is clear that temporal changes in image +features of the longitudinal scans should carry value for quantifying breast +cancer risk, no prior work has conducted a systematic study of this. In this +paper, we extend a state-of-the-art ML model to ingest an arbitrary number of +longitudinal mammograms and predict future breast cancer risk. On a large-scale +dataset, we demonstrate that our model, LoMaR, achieves state-of-the-art +performance when presented with only the present mammogram. Furthermore, we use +LoMaR to characterize the predictive value of prior visits. Our results show +that longer histories (e.g., up to four prior annual mammograms) can +significantly boost the accuracy of predicting future breast cancer risk, +particularly beyond the short-term. Our code and model weights are available at +https://github.com/batuhankmkaraman/LoMaR. + +
+
+ comment: Submitted to MICCAI 2024 +
+
+
+
+
+ + ☆ Distributed Stochastic Optimization of a Neural Representation Network + for Time-Space Tomography Reconstruction + + +
+ 4D time-space reconstruction of dynamic events or deforming objects using +X-ray computed tomography (CT) is an extremely ill-posed inverse problem. +Existing approaches assume that the object remains static for the duration of +several tens or hundreds of X-ray projection measurement images (reconstruction +of consecutive limited-angle CT scans). However, this is an unrealistic +assumption for many in-situ experiments that causes spurious artifacts and +inaccurate morphological reconstructions of the object. To solve this problem, +we propose to perform a 4D time-space reconstruction using a distributed +implicit neural representation (DINR) network that is trained using a novel +distributed stochastic training algorithm. Our DINR network learns to +reconstruct the object at its output by iterative optimization of its network +parameters such that the measured projection images best match the output of +the CT forward measurement model. We use a continuous time and space forward +measurement model that is a function of the DINR outputs at a sparsely sampled +set of continuous valued object coordinates. Unlike existing state-of-the-art +neural representation architectures that forward and back propagate through +dense voxel grids that sample the object's entire time-space coordinates, we +only propagate through the DINR at a small subset of object coordinates in each +iteration resulting in an order-of-magnitude reduction in memory and compute +for training. DINR leverages distributed computation across several compute +nodes and GPUs to produce high-fidelity 4D time-space reconstructions even for +extremely large CT data sizes. We use both simulated parallel-beam and +experimental cone-beam X-ray CT datasets to demonstrate the superior +performance of our approach. + +
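The core efficiency idea in this abstract, back-propagating through the neural representation only at a small random subset of space-time coordinates per iteration rather than a dense voxel grid, is easy to show in miniature. The sketch below fits a tiny coordinate MLP to direct samples of a synthetic 4-D field; the actual work trains against a CT forward measurement model with distributed computation across nodes and GPUs, which is not reproduced here.

```python
# Miniature coordinate-sampled INR training loop (illustrates only the subsampling idea;
# the paper optimizes against a CT forward model with distributed training).
import torch
import torch.nn as nn

def target_field(coords: torch.Tensor) -> torch.Tensor:
    """Synthetic dynamic 'object': a time-varying smooth field over (x, y, z, t) in [0, 1]^4."""
    x, y, z, t = coords.unbind(dim=-1)
    return (torch.sin(6 * x * (1 + t)) * torch.cos(5 * y) + 0.5 * z).unsqueeze(-1)

model = nn.Sequential(
    nn.Linear(4, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, 1),
)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(2000):
    coords = torch.rand(4096, 4)  # small random subset of continuous space-time coordinates
    loss = nn.functional.mse_loss(model(coords), target_field(coords))
    opt.zero_grad()
    loss.backward()
    opt.step()
    if step % 500 == 0:
        print(f"step {step}: loss {loss.item():.5f}")
```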
+
+ comment: submitted to Nature Machine Intelligence +
+
+
+
+
+ + ☆ Revolutionizing Traffic Sign Recognition: Unveiling the Potential of + Vision Transformers + + +
+ This research introduces an innovative method for Traffic Sign Recognition +(TSR) by leveraging deep learning techniques, with a particular emphasis on +Vision Transformers. TSR holds a vital role in advancing driver assistance +systems and autonomous vehicles. Traditional TSR approaches, reliant on manual +feature extraction, have proven to be labor-intensive and costly. Moreover, +methods based on shape and color have inherent limitations, including +susceptibility to various factors and changes in lighting conditions. This +study explores three variants of Vision Transformers (PVT, TNT, LNL) and six +convolutional neural networks (AlexNet, ResNet, VGG16, MobileNet, EfficientNet, +GoogleNet) as baseline models. To address the shortcomings of traditional +methods, a novel pyramid EATFormer backbone is proposed, amalgamating +Evolutionary Algorithms (EAs) with the Transformer architecture. The introduced +EA-based Transformer block captures multi-scale, interactive, and individual +information through its components: Feed-Forward Network, Global and Local +Interaction, and Multi-Scale Region Aggregation modules. Furthermore, a +Modulated Deformable MSA module is introduced to dynamically model irregular +locations. Experimental evaluations on the GTSRB and BelgiumTS datasets +demonstrate the efficacy of the proposed approach in enhancing both prediction +speed and accuracy. This study concludes that Vision Transformers hold +significant promise in traffic sign classification and contributes a fresh +algorithmic framework for TSR. These findings set the stage for the development +of precise and dependable TSR algorithms, benefiting driver assistance systems +and autonomous vehicles. + +
+
+
+
+
+ + ☆ HELPER-X: A Unified Instructable Embodied Agent to Tackle Four + Interactive Vision-Language Domains with Memory-Augmented Language Models + + +
+ Recent research on instructable agents has used memory-augmented Large +Language Models (LLMs) as task planners, a technique that retrieves +language-program examples relevant to the input instruction and uses them as +in-context examples in the LLM prompt to improve the performance of the LLM in +inferring the correct action and task plans. In this technical report, we +extend the capabilities of HELPER, by expanding its memory with a wider array +of examples and prompts, and by integrating additional APIs for asking +questions. This simple expansion of HELPER into a shared memory enables the +agent to work across the domains of executing plans from dialogue, natural +language instruction following, active question asking, and commonsense room +reorganization. We evaluate the agent on four diverse interactive +visual-language embodied agent benchmarks: ALFRED, TEACh, DialFRED, and the +Tidy Task. HELPER-X achieves few-shot, state-of-the-art performance across +these benchmarks using a single agent, without requiring in-domain training, +and remains competitive with agents that have undergone in-domain training. + +
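The retrieval-augmented prompting pattern described above (fetch the language-program examples most similar to the incoming instruction and place them in the LLM prompt) is sketched generically below. The embedding function and memory contents are stand-ins; HELPER-X's actual retriever, prompts, and question-asking APIs are not reproduced.

```python
# Generic retrieval-augmented prompting sketch (stand-in embedder and memory; not HELPER-X's code).
import numpy as np

def embed(text: str, dim: int = 64) -> np.ndarray:
    """Placeholder embedding: a real system would use a sentence encoder here."""
    rng = np.random.default_rng(abs(hash(text)) % (2 ** 32))
    v = rng.normal(size=dim)
    return v / np.linalg.norm(v)

MEMORY = [  # hypothetical (instruction, program) pairs shared across domains
    ("put a washed apple on the table", "goto(sink); wash(apple); goto(table); place(apple)"),
    ("tidy the pillows on the sofa", "goto(sofa); pick(pillow); align(pillow)"),
    ("ask where the mug is, then fetch it", "ask('where is the mug?'); goto(answer); pick(mug)"),
]
MEM_VECS = np.stack([embed(inst) for inst, _ in MEMORY])

def build_prompt(instruction: str, k: int = 2) -> str:
    sims = MEM_VECS @ embed(instruction)  # cosine similarity (vectors are unit norm)
    top = np.argsort(-sims)[:k]           # k most relevant memory entries
    examples = "\n".join(f"Instruction: {MEMORY[i][0]}\nProgram: {MEMORY[i][1]}" for i in top)
    return f"{examples}\nInstruction: {instruction}\nProgram:"

print(build_prompt("wash the mug and put it on the table"))
```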
+
+ comment: Videos and code https://helper-agent-llm.github.io/ +
+
+
+
+
+ + ☆ Improving Interpretability of Deep Active Learning for Flood Inundation + Mapping Through Class Ambiguity Indices Using Multi-spectral Satellite + Imagery + + +
+ Flood inundation mapping is a critical task for responding to the increasing +risk of flooding linked to global warming. Significant advances in deep +learning in recent years have led to its extensive application, including in +flood inundation mapping. To cope with the time-consuming and labor-intensive +data labeling process in supervised learning, deep active learning strategies +are a feasible approach. However, there remains limited exploration +into the interpretability of how deep active learning strategies operate, with +a specific focus on flood inundation mapping in the field of remote sensing. In +this study, we introduce a novel framework of Interpretable Deep Active +Learning for Flood inundation Mapping (IDAL-FIM), specifically in terms of +class ambiguity of multi-spectral satellite images. In the experiments, we +utilize the Sen1Floods11 dataset and adopt a U-Net with MC-dropout. In addition, we +employ five acquisition functions, which are the random, K-means, BALD, +entropy, and margin acquisition functions. Based on the experimental results, +we demonstrate that the two proposed class ambiguity indices are effective +variables for interpreting deep active learning, as they establish statistically +significant correlations with the predictive uncertainty of the deep learning +model at the tile level. Then, we illustrate the behavior of deep active +learning by visualizing two-dimensional density plots and provide +interpretations of how deep active learning operates in flood +inundation mapping. +
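To make the uncertainty-based acquisition functions named above concrete, the sketch below computes entropy, margin, and BALD scores from a stack of MC-dropout forward passes (higher score means more informative tile). Random and K-means selection and the paper's class-ambiguity indices are omitted, and the array shapes are illustrative assumptions.

```python
# Entropy, margin, and BALD acquisition scores from MC-dropout predictions
# (generic textbook formulas; shapes are illustrative, not the paper's setup).
import numpy as np

def acquisition_scores(mc_probs: np.ndarray) -> dict:
    """mc_probs: (T, N, C) class probabilities from T stochastic forward passes over N samples."""
    mean_p = mc_probs.mean(axis=0)                                    # (N, C) predictive distribution
    eps = 1e-12
    entropy = -(mean_p * np.log(mean_p + eps)).sum(axis=1)            # predictive entropy
    top2 = np.sort(mean_p, axis=1)[:, -2:]
    margin = 1.0 - (top2[:, 1] - top2[:, 0])                          # small margin -> high score
    expected_entropy = -(mc_probs * np.log(mc_probs + eps)).sum(axis=2).mean(axis=0)
    bald = entropy - expected_entropy                                  # mutual information
    return {"entropy": entropy, "margin": margin, "bald": bald}

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(20, 100, 2))                             # 20 passes, 100 tiles, 2 classes
    probs = np.exp(logits) / np.exp(logits).sum(axis=2, keepdims=True)
    scores = acquisition_scores(probs)
    print({k: v[:3].round(3) for k, v in scores.items()})
```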
+
+ comment: 46 pages, 11 figures, 5 tables +
+
+
+
+
+ + ☆ GSTalker: Real-time Audio-Driven Talking Face Generation via Deformable + Gaussian Splatting + + +
+ We present GSTalker, a 3D audio-driven talking face generation model with +Gaussian Splatting for both fast training (40 minutes) and real-time rendering +(125 FPS), using a 3$\sim$5 minute video as training material, in comparison +with previous 2D and 3D NeRF-based modeling frameworks which require hours of +training and seconds of rendering per frame. Specifically, GSTalker learns an +audio-driven Gaussian deformation field to translate and transform 3D Gaussians +to synchronize with audio information, in which a multi-resolution hash-grid-based +tri-plane and a temporal smoothing module are incorporated to learn +accurate deformation for fine-grained facial details. In addition, a +pose-conditioned deformation field is designed to model the stabilized torso. +To enable efficient optimization of the conditioned Gaussian deformation field, +we initialize 3D Gaussians by learning a coarse static Gaussian representation. +Extensive experiments on person-specific videos with audio tracks validate that +GSTalker can generate high-fidelity and audio-lip-synchronized results with +fast training and real-time rendering speed. +
+
+
+
+
+ + ☆ Embedded Representation Learning Network for Animating Styled Video + Portrait + + +
+ Talking head generation has recently attracted considerable attention due to +its wide application prospects, especially for digital avatars and 3D +animation design. Inspired by this practical demand, several works have explored +Neural Radiance Fields (NeRF) to synthesize talking heads. However, these +methods based on NeRF face two challenges: (1) Difficulty in generating +style-controllable talking heads. (2) Displacement artifacts around the neck in +rendered images. To overcome these two challenges, we propose a novel +generative paradigm, the \textit{Embedded Representation Learning Network} (ERLNet), +with two learning stages. First, the \textit{audio-driven FLAME} (ADF) module +is constructed to produce facial expression and head pose sequences +synchronized with content audio and style video. Second, given the sequence +deduced by the ADF, a novel \textit{dual-branch fusion NeRF} (DBF-NeRF) +exploits these contents to render the final images. Extensive empirical studies +demonstrate that the collaboration of these two stages effectively enables +our method to render more realistic talking heads than existing +algorithms. +
+
+
+
+
+ + ☆ Machine Unlearning for Document Classification ICDAR2024 + + +
+ Document understanding models have recently demonstrated remarkable +performance by leveraging extensive collections of user documents. However, +since documents often contain large amounts of personal data, their usage can +pose a threat to user privacy and weaken the bonds of trust between humans and +AI services. In response to these concerns, legislation advocating "the right +to be forgotten" has recently been proposed, allowing users to request the +removal of private information from computer systems and neural network models. +A novel approach, known as machine unlearning, has emerged to make AI models +forget about a particular class of data. In our research, we explore machine +unlearning for document classification problems, representing, to the best of +our knowledge, the first investigation into this area. Specifically, we +consider a realistic scenario where a remote server houses a well-trained model +and possesses only a small portion of training data. This setup is designed for +efficient forgetting manipulation. This work represents a pioneering step +towards the development of machine unlearning methods aimed at addressing +privacy concerns in document analysis applications. Our code is publicly +available at +\url{https://github.com/leitro/MachineUnlearning-DocClassification}. +
+
+ comment: Accepted to ICDAR2024 +
+
+
+
+
+ + ☆ MeGA: Hybrid Mesh-Gaussian Head Avatar for High-Fidelity Rendering and + Head Editing + + +
+ Creating high-fidelity head avatars from multi-view videos is a core issue +for many AR/VR applications. However, existing methods usually struggle to +obtain high-quality renderings for all different head components simultaneously +since they use one single representation to model components with drastically +different characteristics (e.g., skin vs. hair). In this paper, we propose a +Hybrid Mesh-Gaussian Head Avatar (MeGA) that models different head components +with more suitable representations. Specifically, we select an enhanced FLAME +mesh as our facial representation and predict a UV displacement map to provide +per-vertex offsets for improved personalized geometric details. To achieve +photorealistic renderings, we obtain facial colors using deferred neural +rendering and disentangle neural textures into three meaningful parts. For hair +modeling, we first build a static canonical hair using 3D Gaussian Splatting. A +rigid transformation and an MLP-based deformation field are further applied to +handle complex dynamic expressions. Combined with our occlusion-aware blending, +MeGA generates higher-fidelity renderings for the whole head and naturally +supports more downstream tasks. Experiments on the NeRSemble dataset +demonstrate the effectiveness of our designs, outperforming previous +state-of-the-art methods and supporting various editing functionalities, +including hairstyle alteration and texture editing. + +
+
+ comment: Project page: https://conallwang.github.io/MeGA_Pages/ +
+
+
+
+
+ + ☆ Multi-Page Document Visual Question Answering using Self-Attention + Scoring Mechanism ICDAR2024 + + +
+ Documents are 2-dimensional carriers of written communication, and as such +their interpretation requires a multi-modal approach where textual and visual +information are efficiently combined. Document Visual Question Answering +(Document VQA), due to this multi-modal nature, has garnered significant +interest from both the document understanding and natural language processing +communities. The state-of-the-art single-page Document VQA methods show +impressive performance, yet in multi-page scenarios, these methods struggle. +They have to concatenate all pages into one large page for processing, +demanding substantial GPU resources, even for evaluation. In this work, we +propose a novel method and efficient training strategy for multi-page Document +VQA tasks. In particular, we employ a visual-only document representation, +leveraging the encoder from a document understanding model, Pix2Struct. Our +approach utilizes a self-attention scoring mechanism to generate relevance +scores for each document page, enabling the retrieval of pertinent pages. This +adaptation allows us to extend single-page Document VQA models to multi-page +scenarios without constraints on the number of pages during evaluation, all +with minimal demand for GPU resources. Our extensive experiments demonstrate +not only achieving state-of-the-art performance without the need for Optical +Character Recognition (OCR), but also sustained performance in scenarios +extending to documents of nearly 800 pages compared to a maximum of 20 pages in +the MP-DocVQA dataset. Our code is publicly available at +\url{https://github.com/leitro/SelfAttnScoring-MPDocVQA}. + +
+
+ comment: Accepted to ICDAR2024 +
+
+
+
+
+ + ☆ Simple-RF: Regularizing Sparse Input Radiance Fields with Simpler + Solutions + + +
+ Neural Radiance Fields (NeRF) show impressive performance in photo-realistic +free-view rendering of scenes. Recent improvements on the NeRF such as TensoRF +and ZipNeRF employ explicit models for faster optimization and rendering, as +compared to the NeRF that employs an implicit representation. However, both +implicit and explicit radiance fields require dense sampling of images in the +given scene. Their performance degrades significantly when only a sparse set of +views is available. Researchers find that supervising the depth estimated by a +radiance field helps train it effectively with fewer views. The depth +supervision is obtained either using classical approaches or neural networks +pre-trained on a large dataset. While the former may provide only sparse +supervision, the latter may suffer from generalization issues. As opposed to +the earlier approaches, we seek to learn the depth supervision by designing +augmented models and training them along with the main radiance field. Further, +we aim to design a framework of regularizations that can work across different +implicit and explicit radiance fields. We observe that certain features of +these radiance field models overfit to the observed images in the sparse-input +scenario. Our key finding is that reducing the capability of the radiance +fields with respect to positional encoding, the number of decomposed tensor +components or the size of the hash table, constrains the model to learn simpler +solutions, which estimate better depth in certain regions. By designing +augmented models based on such reduced capabilities, we obtain better depth +supervision for the main radiance field. We achieve state-of-the-art +view-synthesis performance with sparse input views on popular datasets +containing forward-facing and 360$^\circ$ scenes by employing the above +regularizations. + +
+
+ comment: The source code for our model can be found on our project page: + https://nagabhushansn95.github.io/publications/2024/Simple-RF.html. arXiv + admin note: substantial text overlap with arXiv:2309.03955 +
+
+
+
+
+ + ☆ Foundations of Multisensory Artificial Intelligence + + +
+ Building multisensory AI systems that learn from multiple sensory inputs such +as text, speech, video, real-world sensors, wearable devices, and medical data +holds great promise for impact in many scientific areas with practical +benefits, such as in supporting human health and well-being, enabling +multimedia content processing, and enhancing real-world autonomous agents. By +synthesizing a range of theoretical frameworks and application domains, this +thesis aims to advance the machine learning foundations of multisensory AI. In +the first part, we present a theoretical framework formalizing how modalities +interact with each other to give rise to new information for a task. These +interactions are the basic building blocks in all multimodal problems, and +their quantification enables users to understand their multimodal datasets, +design principled approaches to learn these interactions, and analyze whether +their model has succeeded in learning. In the second part, we study the design +of practical multimodal foundation models that generalize over many modalities +and tasks, which presents a step toward grounding large language models to +real-world sensory modalities. We introduce MultiBench, a unified large-scale +benchmark across a wide range of modalities, tasks, and research areas, +followed by the cross-modal attention and multimodal transformer architectures +that now underpin many of today's multimodal foundation models. Scaling these +architectures on MultiBench enables the creation of general-purpose +multisensory AI systems, and we discuss our collaborative efforts in applying +these models for real-world impact in affective computing, mental health, +cancer prognosis, and robotics. Finally, we conclude this thesis by discussing +how future work can leverage these ideas toward more general, interactive, and +safe multisensory AI. + +
+
+ comment: CMU Machine Learning Department PhD Thesis +
+
+
+
+
+ + ☆ An Aggregation-Free Federated Learning for Tackling Data Heterogeneity CVPR 2024 + + +
+ The performance of Federated Learning (FL) hinges on the effectiveness of +utilizing knowledge from distributed datasets. Traditional FL methods adopt an +aggregate-then-adapt framework, where clients update local models based on a +global model aggregated by the server from the previous training round. This +process can cause client drift, especially with significant cross-client data +heterogeneity, impacting model performance and convergence of the FL algorithm. +To address these challenges, we introduce FedAF, a novel aggregation-free FL +algorithm. In this framework, clients collaboratively learn condensed data by +leveraging peer knowledge; the server subsequently trains the global model +using the condensed data and soft labels received from the clients. FedAF +inherently avoids the issue of client drift, enhances the quality of condensed +data amid notable data heterogeneity, and improves the global model +performance. Extensive numerical studies on several popular benchmark datasets +show that FedAF surpasses various state-of-the-art FL algorithms in handling +label-skew and feature-skew data heterogeneity, leading to superior global +model accuracy and faster convergence. +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Unleashing the Power of Multi-Task Learning: A Comprehensive Survey + Spanning Traditional, Deep, and Pretrained Foundation Model Eras + + +
+ Multi-task learning (MTL) is a learning paradigm that effectively leverages both task-specific and +shared information to address multiple related tasks simultaneously. In +contrast to single-task learning (STL), MTL offers a suite of benefits that enhance both the training +process and the inference efficiency. MTL's key advantages encompass +streamlined model architecture, performance enhancement, and cross-domain +generalizability. Over the past twenty years, MTL has become widely recognized +as a flexible and effective approach in various fields, including computer vision (CV), +natural language processing (NLP), +recommendation systems, disease prognosis and diagnosis, and robotics. This +survey provides a comprehensive overview of the evolution of MTL, encompassing +the technical aspects of cutting-edge methods from traditional approaches to +deep learning and the latest trend of pretrained foundation models. Our survey +methodically categorizes MTL techniques into five key areas: regularization, +relationship learning, feature propagation, optimization, and pre-training. +This categorization not only chronologically outlines the development of MTL +but also dives into various specialized strategies within each category. +Furthermore, the survey reveals how MTL evolves from handling a fixed set +of tasks to embracing a more flexible approach free from task or modality +constraints. It explores the concepts of task-promptable and -agnostic +training, along with the capacity for zero-shot learning (ZSL), which unleashes the untapped +potential of this historically coveted learning paradigm. Overall, we hope this +survey provides the research community with a comprehensive overview of the +advancements in MTL from its inception in 1997 to the present in 2023. We +address present challenges and look ahead to future possibilities, shedding +light on the opportunities and potential avenues for MTL research in a broad +manner. This project is publicly available at +https://github.com/junfish/Awesome-Multitask-Learning. +
+
+ comment: 60 figures, 116 pages, 500+ references +
+
+
+
+
+ + ♻ ☆ Ego-Exo4D: Understanding Skilled Human Activity from First- and + Third-Person Perspectives + + +
+ We present Ego-Exo4D, a diverse, large-scale multimodal multiview video +dataset and benchmark challenge. Ego-Exo4D centers around +simultaneously-captured egocentric and exocentric video of skilled human +activities (e.g., sports, music, dance, bike repair). 740 participants from 13 +cities worldwide performed these activities in 123 different natural scene +contexts, yielding long-form captures from 1 to 42 minutes each and 1,286 hours +of video combined. The multimodal nature of the dataset is unprecedented: the +video is accompanied by multichannel audio, eye gaze, 3D point clouds, camera +poses, IMU, and multiple paired language descriptions -- including a novel +"expert commentary" done by coaches and teachers and tailored to the +skilled-activity domain. To push the frontier of first-person video +understanding of skilled human activity, we also present a suite of benchmark +tasks and their annotations, including fine-grained activity understanding, +proficiency estimation, cross-view translation, and 3D hand/body pose. All +resources are open sourced to fuel new research in the community. Project page: +http://ego-exo4d-data.org/ + +
+
+ comment: updated baseline results and dataset statistics to match the released + v2 data; added table to appendix comparing stats of Ego-Exo4D alongside other + datasets +
+
+
+
+
+ + ♻ ☆ Make-it-Real: Unleashing Large Multimodal Model's Ability for Painting + 3D Objects with Realistic Materials + + +
+ Physically realistic materials are pivotal in augmenting the realism of 3D +assets across various applications and lighting conditions. However, existing +3D assets and generative models often lack authentic material properties. +Manual assignment of materials using graphic software is a tedious and +time-consuming task. In this paper, we exploit advancements in Multimodal Large +Language Models (MLLMs), particularly GPT-4V, to present a novel approach, +Make-it-Real: 1) We demonstrate that GPT-4V can effectively recognize and +describe materials, allowing the construction of a detailed material library. +2) Utilizing a combination of visual cues and hierarchical text prompts, GPT-4V +precisely identifies and aligns materials with the corresponding components of +3D objects. 3) The correctly matched materials are then meticulously applied as +reference for the new SVBRDF material generation according to the original +diffuse map, significantly enhancing their visual authenticity. Make-it-Real +offers a streamlined integration into the 3D content creation workflow, +showcasing its utility as an essential tool for developers of 3D assets. + +
+
+ comment: Project Page: https://sunzey.github.io/Make-it-Real/ +
+
+
+
+
+ + ♻ ☆ Benchmarking the CoW with the TopCoW Challenge: Topology-Aware + Anatomical Segmentation of the Circle of Willis for CTA and MRA MICCAI + + +
+ The Circle of Willis (CoW) is an important network of arteries connecting +major circulations of the brain. Its vascular architecture is believed to +affect the risk, severity, and clinical outcome of serious neuro-vascular +diseases. However, characterizing the highly variable CoW anatomy is still a +manual and time-consuming expert task. The CoW is usually imaged by two +angiographic imaging modalities, magnetic resonance angiography (MRA) and +computed tomography angiography (CTA), but there exist limited public datasets +with annotations on CoW anatomy, especially for CTA. Therefore we organized the +TopCoW Challenge in 2023 with the release of an annotated CoW dataset. The +TopCoW dataset was the first public dataset with voxel-level annotations for +thirteen possible CoW vessel components, enabled by virtual-reality (VR) +technology. It was also the first large dataset with paired MRA and CTA from +the same patients. TopCoW challenge formalized the CoW characterization problem +as a multiclass anatomical segmentation task with an emphasis on topological +metrics. We invited submissions worldwide for the CoW segmentation task, which +attracted over 140 registered participants from four continents. The top +performing teams managed to segment many CoW components to Dice scores around +90%, but with lower scores for communicating arteries and rare variants. There +were also topological mistakes for predictions with high Dice scores. +Additional topological analysis revealed further areas for improvement in +detecting certain CoW components and matching CoW variant topology accurately. +TopCoW represented a first attempt at benchmarking the CoW anatomical +segmentation task for MRA and CTA, both morphologically and topologically. + +
+
+ comment: 24 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW + 2023 Challenge +
+
+
+
+
+ + ♻ ☆ Amodal Ground Truth and Completion in the Wild CVPR 2024 + + +
+ This paper studies amodal image segmentation: predicting entire object +segmentation masks including both visible and invisible (occluded) parts. In +previous work, the amodal segmentation ground truth on real images is usually +obtained by manual annotation and is thus subjective. In contrast, we use 3D +data to establish an automatic pipeline to determine authentic ground truth +amodal masks for partially occluded objects in real images. This pipeline is +used to construct an amodal completion evaluation benchmark, MP3D-Amodal, +consisting of a variety of object categories and labels. To better handle the +amodal completion task in the wild, we explore two architecture variants: a +two-stage model that first infers the occluder, followed by amodal mask +completion; and a one-stage model that exploits the representation power of +Stable Diffusion for amodal segmentation across many categories. Without bells +and whistles, our method achieves a new state-of-the-art performance on amodal +segmentation datasets that cover a large variety of objects, including COCOA +and our new MP3D-Amodal dataset. The dataset, model, and code are available at +https://www.robots.ox.ac.uk/~vgg/research/amodal/. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Input-image Normalization for Solving the Mode Collapse Problem + in GAN-based X-ray Images + + +
+ Biomedical image datasets can be imbalanced due to the rarity of targeted +diseases. Generative Adversarial Networks play a key role in addressing this +imbalance by enabling the generation of synthetic images to augment datasets. +It is important to generate synthetic images that incorporate a diverse range +of features to accurately represent the distribution of features present in the +training imagery. Furthermore, the absence of diverse features in synthetic +images can degrade the performance of machine learning classifiers. The mode +collapse problem impacts Generative Adversarial Networks' capacity to generate +diversified images. Mode collapse comes in two varieties: intra-class and +inter-class. In this paper, both varieties of the mode collapse problem are +investigated, and their subsequent impact on the diversity of synthetic X-ray +images is evaluated. This work contributes an empirical demonstration of the +benefits of integrating the adaptive input-image normalization with the Deep +Convolutional GAN and Auxiliary Classifier GAN to alleviate the mode collapse +problems. Synthetically generated images are utilized for data augmentation and +training a Vision Transformer model. The classification performance of the +model is evaluated using accuracy, recall, and precision scores. Results +demonstrate that the DCGAN and the ACGAN with adaptive input-image +normalization outperform the DCGAN and ACGAN with un-normalized X-ray images as +evidenced by the superior diversity scores and classification scores. + +
+
+ comment: Submitted to the Elsevier Journal +
+
+
+
+
+ + ♻ ☆ SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient + Channels + + +
+ Pre-trained vision transformers provide strong representations that benefit +various downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT) +methods have been proposed, and their experiments demonstrate that tuning only +1\% extra parameters could surpass full fine-tuning in low-data resource +scenarios. However, these methods overlook the task-specific information when +fine-tuning diverse downstream tasks. In this paper, we propose a simple yet +effective method called "Salient Channel Tuning" (SCT) to leverage the +task-specific information by forwarding the task images through the model to +select a subset of channels in a feature map, which enables us to tune only 1/8 +of the channels, leading to significantly lower parameter costs. Experiments on 19 +visual transfer learning downstream tasks demonstrate that our SCT outperforms +full fine-tuning on 18 out of 19 tasks by adding only 0.11M parameters to the +ViT-B, which is 780$\times$ fewer than its full fine-tuning counterpart. +Furthermore, experiments on domain generalization and few-shot classification +further demonstrate the effectiveness and generality of our approach. The code is +available at https://github.com/showlab/SCT. +
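The channel-selection idea in this abstract (forward the task's images once, rank feature-map channels by a saliency score, and fine-tune only the top fraction) can be sketched as below. The mean-absolute-activation score and the stand-in backbone stage are assumptions made for illustration; SCT's exact saliency criterion and tuning mechanics are not reproduced.

```python
# Sketch of salient-channel selection for parameter-efficient tuning
# (saliency criterion and backbone stage are assumptions, not SCT's exact procedure).
import torch
import torch.nn as nn

backbone_stage = nn.Sequential(nn.Conv2d(3, 64, 3, padding=1), nn.ReLU())  # stand-in feature extractor

@torch.no_grad()
def select_salient_channels(images: torch.Tensor, ratio: float = 1 / 8) -> torch.Tensor:
    """Rank channels of the stage's output by mean absolute activation over task images."""
    feats = backbone_stage(images)              # (B, C, H, W)
    saliency = feats.abs().mean(dim=(0, 2, 3))  # one score per channel
    k = max(1, int(ratio * feats.shape[1]))
    return torch.topk(saliency, k).indices      # indices of channels to tune

task_images = torch.rand(16, 3, 64, 64)         # a handful of task images
salient = select_salient_channels(task_images)
print(f"tuning {salient.numel()} of 64 channels:", salient.tolist())

# During fine-tuning, one would then update only parameters feeding these channels,
# e.g. by masking the gradients of the corresponding weight rows in the tuned layer.
```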
+
+ comment: This work has been accepted by IJCV +
+
+
+
+
+ + ♻ ☆ Gradient-based Local Next-best-view Planning for Improved Perception of + Targeted Plant Nodes ICRA 2024 + + +
+ Robots are increasingly used in tomato greenhouses to automate +labour-intensive tasks such as selective harvesting and de-leafing. To perform +these tasks, robots must be able to accurately and efficiently perceive the +plant nodes that need to be cut, despite the high levels of occlusion from +other plant parts. We formulate this problem as a local next-best-view (NBV) +planning task where the robot has to plan an efficient set of camera viewpoints +to overcome occlusion and improve the quality of perception. Our formulation +focuses on quickly improving the perception accuracy of a single target node to +maximise its chances of being cut. Previous methods of NBV planning mostly +focused on global view planning and used random sampling of candidate +viewpoints for exploration, which could suffer from high computational costs, +ineffective view selection due to poor candidates, or non-smooth trajectories +due to inefficient sampling. We propose a gradient-based NBV planner using +differential ray sampling, which directly estimates the local gradient +direction for viewpoint planning to overcome occlusion and improve perception. +Through simulation experiments, we show that our planner can handle +occlusions and improve the 3D reconstruction and position estimation of nodes +as well as a sampling-based NBV planner, while requiring ten times less +computation and generating 28% more efficient trajectories. +
+
+ comment: This work has been accepted for the 2024 International Conference on + Robotics and Automation (ICRA 2024) +
+
+
+
+
+ + ♻ ☆ MMBench: Is Your Multi-modal Model an All-around Player? + + +
+ Large vision-language models have recently achieved remarkable progress, +exhibiting great perception and reasoning abilities concerning visual +information. However, how to effectively evaluate these large vision-language +models remains a major obstacle, hindering future model development. +Traditional benchmarks like VQAv2 or COCO Caption provide quantitative +performance measurements but suffer from a lack of fine-grained ability +assessment and non-robust evaluation metrics. Recent subjective benchmarks, +such as OwlEval, offer comprehensive evaluations of a model's abilities by +incorporating human labor, but they are not scalable and display significant +bias. In response to these challenges, we propose MMBench, a novel +multi-modality benchmark. MMBench methodically develops a comprehensive +evaluation pipeline, primarily comprised of two elements. The first element is +a meticulously curated dataset that surpasses existing similar benchmarks in +terms of the number and variety of evaluation questions and abilities. The +second element introduces a novel CircularEval strategy and incorporates the +use of ChatGPT. This implementation is designed to convert free-form +predictions into pre-defined choices, thereby facilitating a more robust +evaluation of the model's predictions. MMBench is a systematically-designed +objective benchmark for robustly evaluating the various abilities of +vision-language models. We hope MMBench will assist the research community in +better evaluating their models and encourage future advancements in this +domain. Project page: https://opencompass.org.cn/mmbench. + +
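The CircularEval idea, asking the same multiple-choice question several times with the answer options rotated and counting it correct only if the model is right every time, is simple enough to sketch with a stub model. The prompt format and stub below are placeholders, not MMBench's actual pipeline or its ChatGPT-based choice extraction.

```python
# CircularEval-style consistency check with a stub model (placeholder prompt/model, not MMBench code).
from typing import Callable, List

def circular_eval(question: str, options: List[str], answer: str,
                  ask_model: Callable[[str, List[str]], str]) -> bool:
    """Return True only if the model picks `answer` under every rotation of the options."""
    n = len(options)
    letters = "ABCD"[:n]
    for shift in range(n):
        rotated = options[shift:] + options[:shift]
        prompt = question + "\n" + "\n".join(f"{l}. {o}" for l, o in zip(letters, rotated))
        picked_letter = ask_model(prompt, rotated)           # e.g. "A"
        picked_option = rotated[letters.index(picked_letter)]
        if picked_option != answer:
            return False
    return True

def toy_model(prompt: str, options: List[str]) -> str:
    # Stub: always "answers" with the option containing the word "red".
    for i, o in enumerate(options):
        if "red" in o:
            return "ABCD"[i]
    return "A"

ok = circular_eval("What color is the top traffic light?", ["green", "red", "yellow"], "red", toy_model)
print("consistently correct:", ok)
```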
+
+
+
+
+ + ♻ ☆ The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus + on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs) + + +
+ Pediatric tumors of the central nervous system are the most common cause of +cancer-related death in children. The five-year survival rate for high-grade +gliomas in children is less than 20%. Due to their rarity, the diagnosis of +these entities is often delayed, their treatment is mainly based on historic +treatment concepts, and clinical trials require multi-institutional +collaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs +challenge, focused on pediatric brain tumors with data acquired across multiple +international consortia dedicated to pediatric neuro-oncology and clinical +trials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together +clinicians and AI/imaging scientists to lead to faster development of automated +segmentation techniques that could benefit clinical trials, and ultimately the +care of children with brain tumors. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2305.17033 +
+
+
+
+
+ + ♻ ☆ Understanding the (Extra-)Ordinary: Validating Deep Model Decisions with + Prototypical Concept-based Explanations + + +
+ Ensuring both transparency and safety is critical when deploying Deep Neural +Networks (DNNs) in high-risk applications, such as medicine. The field of +explainable AI (XAI) has proposed various methods to comprehend the +decision-making processes of opaque DNNs. However, only a few XAI methods are +suitable for ensuring safety in practice, as most heavily rely on repeated, +labor-intensive and possibly biased human assessment. In this work, we present +a novel post-hoc concept-based XAI framework that conveys not only instance-wise +(local) but also class-wise (global) decision-making strategies via prototypes. +What sets our approach apart is the combination of local and global strategies, +enabling a clearer understanding of the (dis-)similarities in model decisions +compared to the expected (prototypical) concept use, ultimately reducing the +dependence on long-term human assessment. Quantifying the deviation from +prototypical behavior allows us not only to associate predictions with specific +model sub-strategies but also to detect outlier behavior. As such, our approach +constitutes an intuitive and explainable tool for model validation. We +demonstrate the effectiveness of our approach in identifying +out-of-distribution samples, spurious model behavior and data quality issues +across three datasets (ImageNet, CUB-200, and CIFAR-10) utilizing VGG, ResNet, +and EfficientNet architectures. Code is available at +https://github.com/maxdreyer/pcx. +
+
+ comment: 39 pages (8 pages manuscript, 3 pages references, 28 pages appendix) +
+
+
+
+
+ + ♻ ☆ PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video + Dense Captioning + + +
+ Vision-language pre-training has significantly elevated performance across a +wide range of image-language applications. Yet, the pre-training process for +video-related tasks demands exceptionally large computational and data +resources, which hinders the progress of video-language models. This paper +investigates a straight-forward, highly efficient, and resource-light approach +to adapting an existing image-language pre-trained model for dense video +understanding. Our preliminary experiments reveal that directly fine-tuning +pre-trained image-language models with multiple frames as inputs on video +datasets leads to performance saturation or even a drop. Our further +investigation reveals that it is largely attributed to the bias of learned +high-norm visual features. Motivated by this finding, we propose a simple but +effective pooling strategy to smooth the feature distribution along the +temporal dimension and thus reduce the dominant impacts from the extreme +features. The new model is termed Pooling LLaVA, or PLLaVA in short. PLLaVA +achieves new state-of-the-art performance on modern benchmark datasets for both +video question-answer and captioning tasks. Notably, on the recent popular +VideoChatGPT benchmark, PLLaVA achieves a score of 3.48 out of 5 on average of +five evaluated dimensions, exceeding the previous SOTA results from GPT4V +(IG-VLM) by 9%. On the latest multi-choice benchmark MVBench, PLLaVA achieves +58.1% accuracy on average across 20 sub-tasks, 14.5% higher than GPT4V +(IG-VLM). Code is available at https://pllava.github.io/ + +
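The pooling step mentioned above, smoothing per-frame visual tokens along the temporal axis before they reach the LLM, is easy to illustrate. The adaptive average pooling below reduces T frames of token features to a fixed, smaller temporal length; the tensor shapes and the exact pooling placement are assumptions and do not reproduce PLLaVA's implementation.

```python
# Illustrative temporal pooling of per-frame visual features (shapes are assumptions).
import torch
import torch.nn.functional as F

frames, tokens, dim = 16, 576, 1024
feats = torch.randn(frames, tokens, dim)          # per-frame visual tokens from an image encoder

# Pool along the temporal axis down to a fixed number of "frames".
target_frames = 4
x = feats.permute(1, 2, 0)                        # (tokens, dim, frames) so time is the last axis
pooled = F.adaptive_avg_pool1d(x, target_frames)  # (tokens, dim, target_frames)
pooled = pooled.permute(2, 0, 1)                  # back to (target_frames, tokens, dim)
print(pooled.shape)                               # torch.Size([4, 576, 1024])
```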
+
+
+
+
+ + ♻ ☆ Exposure Bracketing is All You Need for Unifying Image Restoration and + Enhancement Tasks + + +
+ It is highly desired but challenging to acquire high-quality photos with +clear content in low-light environments. Although multi-image processing +methods (using burst, dual-exposure, or multi-exposure images) have made +significant progress in addressing this issue, they typically focus on specific +restoration or enhancement problems and do not fully exploit the available +multi-image information. Motivated by the fact that multi-exposure images are complementary in +denoising, deblurring, high dynamic range imaging, and super-resolution, we +propose to utilize exposure bracketing photography to unify restoration and +enhancement tasks in this work. Due to the difficulty in collecting real-world +pairs, we suggest a solution that first pre-trains the model with synthetic +paired data and then adapts it to real-world unlabeled images. In particular, a +temporally modulated recurrent network (TMRNet) and a self-supervised adaptation +method are proposed. Moreover, we construct a data simulation pipeline to +synthesize pairs and collect real-world images from 200 nighttime scenarios. +Experiments on both datasets show that our method performs favorably against +the state-of-the-art multi-image processing ones. The dataset, code, and +pre-trained models are available at https://github.com/cszhilu1998/BracketIRE. +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Annotating Ambiguous Images: General Annotation Strategy for + High-Quality Data with Real-World Biomedical Validation ICLR 2024 + + +
+ In the field of image classification, existing methods often struggle with +biased or ambiguous data, a prevalent issue in real-world scenarios. Current +strategies, including semi-supervised learning and class blending, offer +partial solutions but lack a definitive resolution. Addressing this gap, our +paper introduces a novel strategy for generating high-quality labels in +challenging datasets. Central to our approach is a clearly designed flowchart, +based on a broad literature review, which enables the creation of reliable +labels. We validate our methodology through a rigorous real-world test case in +the biomedical field, specifically in deducing height reduction from vertebral +imaging. Our empirical study, leveraging over 250,000 annotations, demonstrates +the effectiveness of our strategy's decisions compared to their alternatives. +
+
+ comment: Accepted at ICLR 2024, DMLR Workshop +
+
+
+
+
+ + ♻ ☆ Raising the Bar of AI-generated Image Detection with CLIP + + +
+ The aim of this work is to explore the potential of pre-trained +vision-language models (VLMs) for universal detection of AI-generated images. +We develop a lightweight detection strategy based on CLIP features and study +its performance in a wide variety of challenging scenarios. We find that, +contrary to previous beliefs, it is neither necessary nor convenient to use a +large domain-specific dataset for training. On the contrary, by using only a +handful of example images from a single generative model, a CLIP-based detector +exhibits surprising generalization ability and high robustness across different +architectures, including recent commercial tools such as Dalle-3, Midjourney +v5, and Firefly. We match the state-of-the-art (SoTA) on in-distribution data +and significantly improve upon it in terms of generalization to +out-of-distribution data (+6% AUC) and robustness to impaired/laundered data +(+13%). Our project is available at +https://grip-unina.github.io/ClipBased-SyntheticImageDetection/ + +
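The "lightweight detection strategy based on CLIP features" can be approximated with a simple linear probe: fit a logistic-regression classifier on a handful of precomputed CLIP image embeddings from generated and real images. The feature arrays below are random placeholders standing in for real CLIP embeddings, since the paper's exact model, feature layer, and training data are not specified here.

```python
# Linear probe on precomputed CLIP image features (illustrative; placeholders stand in
# for real embeddings, and this is not the paper's exact training setup).
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
dim = 768
# Placeholders for CLIP embeddings: a handful of images from one generator vs. real photos.
fake_feats = rng.normal(loc=0.15, size=(32, dim))
real_feats = rng.normal(loc=0.00, size=(32, dim))

X = np.vstack([fake_feats, real_feats])
y = np.array([1] * len(fake_feats) + [0] * len(real_feats))  # 1 = AI-generated, 0 = real

clf = LogisticRegression(max_iter=1000).fit(X, y)

test = rng.normal(loc=0.15, size=(4, dim))                   # unseen (placeholder) generated images
print("P(AI-generated):", clf.predict_proba(test)[:, 1].round(3))
```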
+
+
+
+
+ + ♻ ☆ Dual Expert Distillation Network for Generalized Zero-Shot Learning IJCAI 2024 + + +
+ Zero-shot learning has consistently yielded remarkable progress via modeling
+nuanced one-to-one visual-attribute correlation. Existing studies resort to
+refining a uniform mapping function to align and correlate the sample regions
+and subattributes, ignoring two crucial issues: 1) the inherent asymmetry of
+attributes; and 2) the unutilized channel information. This paper addresses
+these issues by introducing a simple yet effective approach, dubbed Dual Expert
+Distillation Network (DEDN), where two experts are dedicated to coarse- and
+fine-grained visual-attribute modeling, respectively. Concretely, the coarse
+expert, namely cExp, has a complete perceptual scope to coordinate
+visual-attribute similarity metrics across dimensions, while the fine expert,
+namely fExp, consists of multiple specialized subnetworks, each corresponding
+to an exclusive set of attributes. The two experts cooperatively distill from
+each other to reach a mutual agreement during training. Meanwhile, we further
+equip DEDN with a newly designed backbone network, i.e., Dual Attention Network
+(DAN), which incorporates both region and channel attention information to
+fully exploit and leverage visual semantic knowledge. Experiments on various
+benchmark datasets indicate a new state-of-the-art.
+ 
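The exact DEDN losses are not given in the abstract; "cooperatively distill from each other" usually amounts to a mutual (co-)distillation term between the two experts' attribute scores. The sketch below shows one generic form of such a term; the symmetric-KL formulation, the temperature, and the 312-attribute example are assumptions.

```python
# Sketch of mutual distillation between a coarse expert (cExp) and a fine
# expert (fExp): a symmetric KL term nudges the two softened attribute-score
# distributions toward agreement.
import torch
import torch.nn.functional as F

def mutual_distillation_loss(coarse_logits, fine_logits, tau=4.0):
    """Symmetric KL between the two experts' temperature-softened predictions."""
    log_p_c = F.log_softmax(coarse_logits / tau, dim=-1)
    log_p_f = F.log_softmax(fine_logits / tau, dim=-1)
    kl_cf = F.kl_div(log_p_c, log_p_f.exp(), reduction="batchmean")
    kl_fc = F.kl_div(log_p_f, log_p_c.exp(), reduction="batchmean")
    return 0.5 * (kl_cf + kl_fc) * tau * tau

# Usage: add this term to each expert's own classification loss.
coarse = torch.randn(8, 312)   # e.g. 312 attribute scores per sample
fine = torch.randn(8, 312)
print(mutual_distillation_loss(coarse, fine))
```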
+
+ comment: 9 pages, 4 figures; Accepted to IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Illicit object detection in X-ray images using Vision Transformers + + +
+ Illicit object detection is a critical task performed at various +high-security locations, including airports, train stations, subways, and +ports. The continuous and tedious work of examining thousands of X-ray images +per hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used +to automate the X-ray image analysis process, improve efficiency and alleviate +the security officers' inspection burden. The neural architectures typically +utilized in relevant literature are Convolutional Neural Networks (CNNs), with +Vision Transformers (ViTs) rarely employed. In order to address this gap, this +paper conducts a comprehensive evaluation of relevant ViT architectures on +illicit item detection in X-ray images. This study utilizes both Transformer +and hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and +RT-DETR. The results demonstrate the remarkable accuracy of the DINO +Transformer detector in the low-data regime, the impressive real-time +performance of YOLOv8, and the effectiveness of the hybrid NextViT backbone. + +
+
+
+
+
+ + ♻ ☆ Radarize: Enhancing Radar SLAM with Generalizable Doppler-Based Odometry + + +
+ Millimeter-wave (mmWave) radar is increasingly being considered as an +alternative to optical sensors for robotic primitives like simultaneous +localization and mapping (SLAM). While mmWave radar overcomes some limitations +of optical sensors, such as occlusions, poor lighting conditions, and privacy +concerns, it also faces unique challenges, such as missed obstacles due to +specular reflections or fake objects due to multipath. To address these +challenges, we propose Radarize, a self-contained SLAM pipeline that uses only +a commodity single-chip mmWave radar. Our radar-native approach uses techniques +such as Doppler shift-based odometry and multipath artifact suppression to +improve performance. We evaluate our method on a large dataset of 146 +trajectories spanning 4 buildings and mounted on 3 different platforms, +totaling approximately 4.7 Km of travel distance. Our results show that our +method outperforms state-of-the-art radar and radar-inertial approaches by +approximately 5x in terms of odometry and 8x in terms of end-to-end SLAM, as +measured by absolute trajectory error (ATE), without the need for additional +sensors such as IMUs or wheel encoders. + +
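The abstract does not detail the Doppler-based odometry; the usual radar-native trick is to recover the instantaneous ego-velocity from the per-point radial (Doppler) velocities of static returns via least squares. The sketch below illustrates that idea in 2D (azimuth only) and is not the authors' implementation; in practice, a robust fit such as RANSAC would be used to reject moving objects.

```python
# Sketch: estimate 2D ego-velocity from one radar scan, assuming most returns
# are static. A static point at azimuth a satisfies v_r = -[cos a, sin a] . v_ego,
# where v_r is its measured Doppler (radial) velocity.
import numpy as np

def ego_velocity_from_doppler(azimuth_rad, doppler_mps):
    """Least-squares ego-velocity (vx, vy) from azimuths and radial velocities."""
    A = -np.stack([np.cos(azimuth_rad), np.sin(azimuth_rad)], axis=1)  # N x 2
    v_ego, *_ = np.linalg.lstsq(A, doppler_mps, rcond=None)
    return v_ego

# Toy example: a sensor moving at 1 m/s along x sees static points with
# radial velocity -cos(azimuth).
az = np.linspace(-1.0, 1.0, 50)
vr = -np.cos(az) * 1.0
print(ego_velocity_from_doppler(az, vr))  # approximately [1.0, 0.0]
```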
+
+
+
+
+ + ♻ ☆ The Solution for the CVPR2024 NICE Image Captioning Challenge + + +
+ This report introduces a solution to Topic 1, Zero-shot Image Captioning, of
+2024 NICE: New frontiers for zero-shot Image Captioning Evaluation. In contrast
+to the NICE 2023 datasets, this challenge involves new annotations by humans
+with significant differences in caption style and content. Therefore, we
+enhance image captions effectively through retrieval augmentation and caption
+grading methods. At the data level, we utilize high-quality captions generated
+by image captioning models as training data to address the gap in text styles.
+At the model level, we employ OFA (a large-scale visual-language pre-training
+model based on handcrafted templates) to perform the image captioning task.
+Subsequently, we propose a caption-level grading strategy for the high-quality
+caption data generated by the image captioning models and integrate it,
+together with the retrieval augmentation strategy, into the template to compel
+the model to generate higher-quality, better-matching, and semantically richer
+captions based on the retrieval augmentation prompts. Our approach achieves a
+CIDEr score of 234.11.
+ 
+
+
+
+
+ + ♻ ☆ Modeling Multimodal Social Interactions: New Challenges and Baselines + with Densely Aligned Representations CVPR 2024 + + +
+ Understanding social interactions involving both verbal and non-verbal cues +is essential for effectively interpreting social situations. However, most +prior works on multimodal social cues focus predominantly on single-person +behaviors or rely on holistic visual representations that are not aligned to +utterances in multi-party environments. Consequently, they are limited in +modeling the intricate dynamics of multi-party interactions. In this paper, we +introduce three new challenging tasks to model the fine-grained dynamics +between multiple people: speaking target identification, pronoun coreference +resolution, and mentioned player prediction. We contribute extensive data +annotations to curate these new challenges in social deduction game settings. +Furthermore, we propose a novel multimodal baseline that leverages densely +aligned language-visual representations by synchronizing visual features with +their corresponding utterances. This facilitates concurrently capturing verbal +and non-verbal cues pertinent to social reasoning. Experiments demonstrate the +effectiveness of the proposed approach with densely aligned multimodal +representations in modeling fine-grained social interactions. Project website: +https://sangmin-git.github.io/projects/MMSI. + +
+
+ comment: CVPR 2024 Oral +
+
+
+
+
+ + ♻ ☆ MuseumMaker: Continual Style Customization without Catastrophic + Forgetting + + +
+ Pre-trained large text-to-image (T2I) models combined with an appropriate
+text prompt have attracted growing interest in the field of customized image
+generation. However, the catastrophic forgetting issue makes it hard to
+continually synthesize new user-provided styles while retaining satisfactory
+results on previously learned styles. In this paper, we propose MuseumMaker, a
+method that enables the synthesis of images following a set of customized
+styles in a never-ending manner, gradually accumulating these creative artistic
+works as a Museum. When faced with a new customization style, we develop a
+style distillation loss module to extract and learn the styles of the training
+data for new image generation. It can minimize the learning biases caused by
+the content of new training images and address the catastrophic overfitting
+issue induced by few-shot images. To deal with catastrophic forgetting amongst
+past learned styles, we devise a dual regularization for the shared-LoRA
+module to optimize the direction of the model update, which regularizes the
+diffusion model from both the weight and feature aspects. Meanwhile, to further
+preserve historical knowledge from past styles and address the limited
+representability of LoRA, we consider a task-wise token learning module where a
+unique token embedding is learned to denote a new style. As new user-provided
+styles come in, our MuseumMaker can capture the nuances of the new styles while
+maintaining the details of learned styles. Experimental results on diverse
+style datasets validate the effectiveness of our proposed MuseumMaker method,
+showcasing its robustness and versatility across various scenarios.
+ 
+
+
+
+
+ + ♻ ☆ Towards Highly Realistic Artistic Style Transfer via Stable Diffusion + with Step-aware and Layer-aware Prompt IJCAI2024 + + +
+ Artistic style transfer aims to transfer a learned artistic style onto an
+arbitrary content image, generating artistic stylized images. Existing
+generative adversarial network-based methods fail to generate highly realistic
+stylized images and always introduce obvious artifacts and disharmonious
+patterns. Recently, large-scale pre-trained diffusion models opened up a new
+way for generating highly realistic artistic stylized images. However,
+diffusion model-based methods generally fail to preserve the content structure
+of input content images well, introducing some undesired content structure and
+style patterns. To address the above problems, we propose a novel pre-trained
+diffusion-based artistic style transfer method, called LSAST, which can
+generate highly realistic artistic stylized images while preserving the content
+structure of input content images well, without introducing obvious artifacts
+and disharmonious style patterns. Specifically, we introduce a Step-aware and
+Layer-aware Prompt Space, a set of learnable prompts, which can learn the style
+information from a collection of artworks and dynamically adjust the input
+images' content structure and style pattern. To train our prompt space, we
+propose a novel inversion method, called Step-aware and Layer-aware Prompt
+Inversion, which allows the prompt space to learn the style information of the
+artwork collection. In addition, we inject a pre-trained conditional branch of
+ControlNet into our LSAST, which further improves our framework's ability to
+maintain content structure. Extensive experiments demonstrate that our proposed
+method generates more highly realistic artistic stylized images than the
+state-of-the-art artistic style transfer methods.
+ 
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ A Multi-Modal Foundation Model to Assist People with Blindness and Low + Vision in Environmental Interaction + + +
+ People with blindness and low vision (pBLV) encounter substantial challenges +when it comes to comprehensive scene recognition and precise object +identification in unfamiliar environments. Additionally, due to the vision +loss, pBLV have difficulty in accessing and identifying potential tripping +hazards on their own. In this paper, we present a pioneering approach that +leverages a large vision-language model to enhance visual perception for pBLV, +offering detailed and comprehensive descriptions of the surrounding +environments and providing warnings about the potential risks. Our method +begins by leveraging a large image tagging model (i.e., Recognize Anything +(RAM)) to identify all common objects present in the captured images. The +recognition results and user query are then integrated into a prompt, tailored +specifically for pBLV using prompt engineering. By combining the prompt and +input image, a large vision-language model (i.e., InstructBLIP) generates +detailed and comprehensive descriptions of the environment and identifies +potential risks in the environment by analyzing the environmental objects and +scenes, relevant to the prompt. We evaluate our approach through experiments +conducted on both indoor and outdoor datasets. Our results demonstrate that our +method is able to recognize objects accurately and provide insightful +descriptions and analysis of the environment for pBLV. + +
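The abstract describes a tagging-then-prompting pipeline (RAM tags plus a user query fed to InstructBLIP), but not the prompt itself. Purely as a hypothetical illustration of the prompt-engineering step, the template below shows how such an assembly could look; the wording and the function name are made up for this sketch.

```python
# Sketch: combine image tags and a user query into a prompt tailored for pBLV.
# `tags` would come from an image-tagging model such as RAM; the template text
# is a hypothetical example, not the paper's actual prompt.
def build_pblv_prompt(tags, user_query):
    tag_list = ", ".join(sorted(set(tags)))
    return (
        "You are assisting a person with blindness or low vision.\n"
        f"Objects detected in the scene: {tag_list}.\n"
        f"User request: {user_query}\n"
        "Describe the surroundings in detail and explicitly warn about any "
        "potential tripping hazards or risks among the detected objects."
    )

print(build_pblv_prompt(["chair", "cable", "doorway", "wet floor sign"],
                        "Can I walk straight ahead safely?"))
```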
+
+
+
+
+ + ♻ ☆ Bengali Document Layout Analysis -- A YOLOV8 Based Ensembling Approach + + +
+ This paper focuses on enhancing Bengali Document Layout Analysis (DLA) using
+the YOLOv8 model and innovative post-processing techniques. We tackle
+challenges unique to the complex Bengali script by employing data augmentation
+for model robustness. After meticulous validation-set evaluation, we fine-tune
+our approach on the complete dataset, leading to a two-stage prediction
+strategy for accurate element segmentation. Our ensemble model, combined with
+post-processing, outperforms individual base architectures, addressing issues
+identified in the BaDLAD dataset. By leveraging this approach, we aim to
+advance Bengali document analysis, contributing to improved OCR and document
+comprehension; BaDLAD serves as a foundational resource for this endeavor,
+aiding future research in the field. Furthermore, our experiments provided key
+insights for incorporating new strategies into the established solution.
+ 
+
+ comment: Need to review and rework this +
+
+
+
+
+ + ♻ ☆ Mitigating the Curse of Dimensionality for Certified Robustness via Dual + Randomized Smoothing ICLR 2024 + + +
+ Randomized Smoothing (RS) has been proven a promising method for endowing an +arbitrary image classifier with certified robustness. However, the substantial +uncertainty inherent in the high-dimensional isotropic Gaussian noise imposes +the curse of dimensionality on RS. Specifically, the upper bound of ${\ell_2}$ +certified robustness radius provided by RS exhibits a diminishing trend with +the expansion of the input dimension $d$, proportionally decreasing at a rate +of $1/\sqrt{d}$. This paper explores the feasibility of providing ${\ell_2}$ +certified robustness for high-dimensional input through the utilization of dual +smoothing in the lower-dimensional space. The proposed Dual Randomized +Smoothing (DRS) down-samples the input image into two sub-images and smooths +the two sub-images in lower dimensions. Theoretically, we prove that DRS +guarantees a tight ${\ell_2}$ certified robustness radius for the original +input and reveal that DRS attains a superior upper bound on the ${\ell_2}$ +robustness radius, which decreases proportionally at a rate of $(1/\sqrt m + +1/\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the +generalizability and effectiveness of DRS, which exhibits a notable capability +to integrate with established methodologies, yielding substantial improvements +in both accuracy and ${\ell_2}$ certified robustness baselines of RS on the +CIFAR-10 and ImageNet datasets. Code is available at +https://github.com/xiasong0501/DRS. + +
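For context, the standard randomized-smoothing certificate that DRS builds on is stated below as background (it is the well-known RS result, not this paper's contribution), together with the dimension dependence quoted in the abstract:

```latex
% Background: if the Gaussian-smoothed classifier predicts the top class with
% probability at least p_A and any other class with at most p_B under
% x + N(0, \sigma^2 I), the certified l2 radius of standard RS is
R = \frac{\sigma}{2}\Bigl(\Phi^{-1}(p_A) - \Phi^{-1}(p_B)\Bigr),
% while, per the abstract, the achievable upper bound on R scales like
% 1/\sqrt{d} for input dimension d, and DRS improves this scaling to
\frac{1}{\sqrt{m}} + \frac{1}{\sqrt{n}}, \qquad m + n = d .
```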
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Real-time 3D semantic occupancy prediction for autonomous vehicles using + memory-efficient sparse convolution + + +
+ In autonomous vehicles, understanding the surrounding 3D environment of the +ego vehicle in real-time is essential. A compact way to represent scenes while +encoding geometric distances and semantic object information is via 3D semantic +occupancy maps. State of the art 3D mapping methods leverage transformers with +cross-attention mechanisms to elevate 2D vision-centric camera features into +the 3D domain. However, these methods encounter significant challenges in +real-time applications due to their high computational demands during +inference. This limitation is particularly problematic in autonomous vehicles, +where GPU resources must be shared with other tasks such as localization and +planning. In this paper, we introduce an approach that extracts features from +front-view 2D camera images and LiDAR scans, then employs a sparse convolution +network (Minkowski Engine), for 3D semantic occupancy prediction. Given that +outdoor scenes in autonomous driving scenarios are inherently sparse, the +utilization of sparse convolution is particularly apt. By jointly solving the +problems of 3D scene completion of sparse scenes and 3D semantic segmentation, +we provide a more efficient learning framework suitable for real-time +applications in autonomous vehicles. We also demonstrate competitive accuracy +on the nuScenes dataset. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ 3Doodle: Compact Abstraction of Objects with 3D Strokes SIGGRAPH 2024 + + +
+ While free-hand sketches have long served as an efficient representation to
+convey the characteristics of an object, they are often subjective, deviating
+significantly from realistic representations. Moreover, sketches are not
+consistent across arbitrary viewpoints, making it hard to capture 3D shapes. We
+propose 3Doodle, which generates descriptive and view-consistent sketch images
+given multi-view images of the target object. Our method is based on the idea
+that a set of 3D strokes can efficiently represent 3D structural information
+and render view-consistent 2D sketches. We express 2D sketches as a union of
+view-independent and view-dependent components. 3D cubic Bézier curves indicate
+view-independent 3D feature lines, while contours of superquadrics express a
+smooth outline of the volume across varying viewpoints. Our pipeline directly
+optimizes the parameters of 3D stroke primitives to minimize perceptual losses
+in a fully differentiable manner. The resulting sparse set of 3D strokes can be
+rendered as abstract sketches containing essential 3D characteristic shapes of
+various objects. We demonstrate that 3Doodle can faithfully express concepts of
+the original images compared with recent sketch generation approaches.
+ 
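As background for the view-independent strokes mentioned above, a 3D cubic Bézier curve with control points P0, ..., P3 follows the standard formula:

```latex
\mathbf{B}(t) = (1-t)^3\,\mathbf{P}_0 + 3(1-t)^2 t\,\mathbf{P}_1
              + 3(1-t)\,t^2\,\mathbf{P}_2 + t^3\,\mathbf{P}_3,
\qquad t \in [0, 1].
```

The control points are among the stroke parameters that the abstract says are optimized differentiably against perceptual losses.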
+
+ comment: SIGGRAPH 2024 (Transactions on Graphics) +
+
+
+
+
+ + ♻ ☆ Sheet Music Transformer: End-To-End Optical Music Recognition Beyond + Monophonic Transcription + + +
+ State-of-the-art end-to-end Optical Music Recognition (OMR) has, to date, +primarily been carried out using monophonic transcription techniques to handle +complex score layouts, such as polyphony, often by resorting to simplifications +or specific adaptations. Despite their efficacy, these approaches imply +challenges related to scalability and limitations. This paper presents the +Sheet Music Transformer, the first end-to-end OMR model designed to transcribe +complex musical scores without relying solely on monophonic strategies. Our +model employs a Transformer-based image-to-sequence framework that predicts +score transcriptions in a standard digital music encoding format from input +images. Our model has been tested on two polyphonic music datasets and has +proven capable of handling these intricate music structures effectively. The +experimental outcomes not only indicate the competence of the model, but also +show that it is better than the state-of-the-art methods, thus contributing to +advancements in end-to-end OMR transcription. + +
+
+ comment: Submitted to the International Conference on Document Analysis and + Recognition 2024 +
+
+
+
+
+ + ♻ ☆ ViLA: Efficient Video-Language Alignment for Video Question Answering + + +
+ In this work, we propose an efficient Video-Language Alignment (ViLA)
+network. Our ViLA model addresses both efficient frame sampling and effective
+cross-modal alignment in a unified way. In our ViLA network, we design a new
+learnable text-guided Frame-Prompter together with a new cross-modal
+distillation (QFormer-Distiller) module. Pre-trained large image-language
+models have shown promising results on problems such as visual question
+answering (VQA). However, how to efficiently and effectively sample video
+frames when adapting a pre-trained large image-language model to video-language
+alignment is still a major challenge. Compared with prior work, our ViLA model
+demonstrates the capability of selecting key frames with critical contents,
+thus improving video-language alignment accuracy while reducing inference
+latency (+3.3% on NExT-QA Temporal with a 3.0X speed-up). Overall, our ViLA
+network outperforms the state-of-the-art methods on video question-answering
+benchmarks: +4.6% on STAR Interaction and +2.2% on STAR average with a 3.0X
+speed-up, and our 2-frame model outperforms the 4-frame SeViLA on the VLEP
+dataset with a 4.2X speed-up.
+ 
+
+
+
+
+ + ♻ ☆ Self2Seg: Single-Image Self-Supervised Joint Segmentation and Denoising + + +
+ We develop Self2Seg, a self-supervised method for the joint segmentation and
+denoising of a single image. To this end, we combine the advantages of
+variational segmentation with self-supervised deep learning. One major benefit
+of our method is that, in contrast to data-driven methods, which require huge
+amounts of labeled samples, Self2Seg segments an image into meaningful regions
+without any training database. Moreover, we demonstrate that self-supervised
+denoising itself is significantly improved through the region-specific learning
+of Self2Seg. Therefore, we introduce a novel self-supervised energy functional
+in which denoising and segmentation are coupled in a way that both tasks
+benefit from each other. We propose a unified optimisation strategy and
+numerically show that for noisy microscopy images our proposed joint approach
+outperforms its sequential counterpart as well as alternative methods focused
+purely on denoising or segmentation.
+ 
+
+
+
+
+ + ♻ ☆ HiDiffusion: Unlocking Higher-Resolution Creativity and Efficiency in + Pretrained Diffusion Models + + +
+ Diffusion models have become a mainstream approach for high-resolution image
+synthesis. However, directly generating higher-resolution images from
+pretrained diffusion models encounters unreasonable object duplication and
+exponentially increased generation time. In this paper, we discover that object
+duplication arises from feature duplication in the deep blocks of the U-Net.
+Concurrently, we attribute the extended generation time to self-attention
+redundancy in the U-Net's top blocks. To address these issues, we propose a
+tuning-free higher-resolution framework named HiDiffusion. Specifically,
+HiDiffusion contains Resolution-Aware U-Net (RAU-Net), which dynamically
+adjusts the feature map size to resolve object duplication, and Modified
+Shifted Window Multi-head Self-Attention (MSW-MSA), which utilizes optimized
+window attention to reduce computation. HiDiffusion can be integrated into
+various pretrained diffusion models to scale image generation resolutions even
+to 4096x4096 at 1.5-6x the inference speed of previous methods. Extensive
+experiments demonstrate that our approach can address object duplication and
+heavy computation issues, achieving state-of-the-art performance on
+higher-resolution image synthesis tasks.
+ 
+
+
+
+
+ + ♻ ☆ TRG-Net: An Interpretable and Controllable Rain Generator + + +
+ Exploring and modeling the rain generation mechanism is critical for
+augmenting paired data to ease the training of rainy image processing models.
+To this end, this study proposes a novel deep learning based rain generator,
+which fully takes the physical generation mechanism underlying rain into
+consideration and explicitly encodes the learning of the fundamental rain
+factors (i.e., shape, orientation, length, width and sparsity) into the deep
+network. Its significance lies in that the generator not only elaborately
+designs essential elements of rain to simulate expected rain patterns, as
+conventional artificial strategies do, but also finely adapts to complicated
+and diverse practical rainy images, as deep learning methods do. By rationally
+adopting the filter parameterization technique, we achieve, for the first time,
+a deep network that is finely controllable with respect to rain factors and
+able to learn the distribution of these factors purely from data. Our unpaired
+generation experiments demonstrate that the rain generated by the proposed rain
+generator is not only of higher quality, but also more effective for deraining
+and downstream tasks compared to current state-of-the-art rain generation
+methods. Besides, the paired data augmentation experiments, including both
+in-distribution and out-of-distribution (OOD), further validate the diversity
+of samples generated by our model for in-distribution deraining and OOD
+generalization tasks.
+ 
+
+
+
+
+ + ♻ ☆ PriSampler: Mitigating Property Inference of Diffusion Models + + +
+ Diffusion models have been remarkably successful in data synthesis. However,
+when these models are applied to sensitive datasets, such as banking and human
+face data, they might raise severe privacy concerns. This work systematically
+presents the first privacy study of property inference attacks against
+diffusion models, in which adversaries aim to extract sensitive global
+properties of the training set from a diffusion model. Specifically, we focus
+on the most practical attack scenario: adversaries are restricted to accessing
+only synthetic data. Under this realistic scenario, we conduct a comprehensive
+evaluation of property inference attacks on various diffusion models trained on
+diverse data types, including tabular and image datasets. A broad range of
+evaluations reveals that diffusion models and their samplers are universally
+vulnerable to property inference attacks. In response, we propose a new
+model-agnostic plug-in method, PriSampler, to mitigate the risks of property
+inference for diffusion models. PriSampler can be directly applied to
+well-trained diffusion models and supports both stochastic and deterministic
+sampling. Extensive experiments illustrate the effectiveness of our defense: it
+can lead adversaries to infer property proportions that are close to predefined
+values chosen by the model owners. Notably, PriSampler also shows significantly
+superior performance to diffusion models trained with differential privacy in
+terms of both model utility and defense performance. This work will raise
+awareness of preventing property inference attacks and encourage
+privacy-preserving synthetic data release.
+ 
+
+
+
+
+ + ♻ ☆ Neural-Symbolic Recursive Machine for Systematic Generalization ICLR 2024 + + +
+ Current learning models often struggle with human-like systematic +generalization, particularly in learning compositional rules from limited data +and extrapolating them to novel combinations. We introduce the Neural-Symbolic +Recursive Machine (NSR), whose core is a Grounded Symbol System (GSS), allowing +for the emergence of combinatorial syntax and semantics directly from training +data. The NSR employs a modular design that integrates neural perception, +syntactic parsing, and semantic reasoning. These components are synergistically +trained through a novel deduction-abduction algorithm. Our findings demonstrate +that NSR's design, imbued with the inductive biases of equivariance and +compositionality, grants it the expressiveness to adeptly handle diverse +sequence-to-sequence tasks and achieve unparalleled systematic generalization. +We evaluate NSR's efficacy across four challenging benchmarks designed to probe +systematic generalization capabilities: SCAN for semantic parsing, PCFG for +string manipulation, HINT for arithmetic reasoning, and a compositional machine +translation task. The results affirm NSR's superiority over contemporary neural +and hybrid models in terms of generalization and transferability. + +
+
+ comment: ICLR 2024. Project website: https://liqing-ustc.github.io/NSR/ +
+
+
+
+
+ + ♻ ☆ JeFaPaTo -- A joint toolbox for blinking analysis and facial features + extraction + + +
+ Analyzing facial features and expressions is a complex task in computer +vision. The human face is intricate, with significant shape, texture, and +appearance variations. In medical contexts, facial structures and movements +that differ from the norm are particularly important to study and require +precise analysis to understand the underlying conditions. Given that solely the +facial muscles, innervated by the facial nerve, are responsible for facial +expressions, facial palsy can lead to severe impairments in facial movements. + One affected area of interest is the subtle movements involved in blinking. +It is an intricate spontaneous process that is not yet fully understood and +needs high-resolution, time-specific analysis for detailed understanding. +However, a significant challenge is that many computer vision techniques demand +programming skills for automated extraction and analysis, making them less +accessible to medical professionals who may not have these skills. The Jena +Facial Palsy Toolbox (JeFaPaTo) has been developed to bridge this gap. It +utilizes cutting-edge computer vision algorithms and offers a user-friendly +interface for those without programming expertise. This toolbox makes advanced +facial analysis more accessible to medical experts, simplifying integration +into their workflow. + +
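The abstract does not name the blink features the toolbox extracts. A widely used, easy-to-compute blink signal from facial landmarks is the Eye Aspect Ratio (EAR) of Soukupova and Cech; it is shown below only as an illustration of the kind of measure such a toolbox might expose, not necessarily JeFaPaTo's exact metric.

```python
# Sketch: Eye Aspect Ratio (EAR) from six eye landmarks. EAR drops sharply
# while the eye is closed, so thresholding the per-frame EAR time series is a
# common way to detect and time blinks.
import numpy as np

def eye_aspect_ratio(eye):
    """eye: (6, 2) array of landmarks ordered p1..p6 around the eye contour."""
    p1, p2, p3, p4, p5, p6 = eye
    vertical = np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)
    horizontal = np.linalg.norm(p1 - p4)
    return vertical / (2.0 * horizontal)

# Toy usage: an open eye has a noticeably higher EAR than a nearly closed one.
open_eye = np.array([[0, 0], [2, 2], [4, 2], [6, 0], [4, -2], [2, -2]], float)
closed_eye = np.array([[0, 0], [2, .3], [4, .3], [6, 0], [4, -.3], [2, -.3]], float)
print(eye_aspect_ratio(open_eye), eye_aspect_ratio(closed_eye))
```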
+
+ comment: A Preprint - Submitted to the Journal of Open Source Software; 7 + pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Diagonal Hierarchical Consistency Learning for Semi-supervised Medical + Image Segmentation + + +
+ Medical image segmentation, which is essential for many clinical
+applications, has achieved almost human-level performance via data-driven deep
+learning technologies. Nevertheless, its performance is predicated upon the
+costly process of manually annotating a vast amount of medical images. To this
+end, we propose a novel framework for robust semi-supervised medical image
+segmentation using diagonal hierarchical consistency learning (DiHC-Net).
+First, it is composed of multiple sub-models with an identical multi-scale
+architecture but distinct sub-layers, such as up-sampling and normalisation
+layers. Second, a novel mutual consistency regularisation is enforced between
+one model's intermediate and final predictions and the soft pseudo labels from
+other models in a diagonal hierarchical fashion. A series of experiments
+verifies the efficacy of our simple framework, outperforming all previous
+approaches on public benchmark datasets covering organs and tumours.
+ 
+
+ comment: Accepted to IEEE EMBC 2024 (46th Annual International Conference of + the IEEE Engineering in Medicine & Biology Society) +
+
+
+
+
+ + ♻ ☆ InverseMatrixVT3D: An Efficient Projection Matrix-Based Approach for 3D + Occupancy Prediction + + +
+ This paper introduces InverseMatrixVT3D, an efficient method for transforming +multi-view image features into 3D feature volumes for 3D semantic occupancy +prediction. Existing methods for constructing 3D volumes often rely on depth +estimation, device-specific operators, or transformer queries, which hinders +the widespread adoption of 3D occupancy models. In contrast, our approach +leverages two projection matrices to store the static mapping relationships and +matrix multiplications to efficiently generate global Bird's Eye View (BEV) +features and local 3D feature volumes. Specifically, we achieve this by +performing matrix multiplications between multi-view image feature maps and two +sparse projection matrices. We introduce a sparse matrix handling technique for +the projection matrices to optimize GPU memory usage. Moreover, a global-local +attention fusion module is proposed to integrate the global BEV features with +the local 3D feature volumes to obtain the final 3D volume. We also employ a +multi-scale supervision mechanism to enhance performance further. Extensive +experiments performed on the nuScenes and SemanticKITTI datasets reveal that +our approach not only stands out for its simplicity and effectiveness but also +achieves the top performance in detecting vulnerable road users (VRU), crucial +for autonomous driving and road safety. The code has been made available at: +https://github.com/DanielMing123/InverseMatrixVT3D + +
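The core operation described above is a static sparse projection matrix multiplying flattened multi-view features to form the 3D volume (and, analogously, the BEV map). The PyTorch sketch below shows only that matmul; the tensor shapes and the random construction of the projection matrix are placeholders, since the real matrix would be built from camera calibration.

```python
# Sketch: lift multi-view 2D features into a 3D volume with one sparse matmul.
# `proj` is a precomputed (static) sparse matrix mapping the N_pix flattened
# image positions of all cameras to N_vox voxel cells.
import torch

N_pix, N_vox, C = 6 * 32 * 88, 100 * 100 * 8, 64

# Placeholder static projection matrix: each voxel gathers a few pixels.
idx = torch.stack([torch.randint(N_vox, (4 * N_vox,)),
                   torch.randint(N_pix, (4 * N_vox,))])
vals = torch.full((4 * N_vox,), 0.25)
proj = torch.sparse_coo_tensor(idx, vals, (N_vox, N_pix)).coalesce()

img_feats = torch.randn(N_pix, C)            # flattened multi-view features
volume = torch.sparse.mm(proj, img_feats)    # (N_vox, C) 3D feature volume
print(volume.shape)
```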
+
+
+
+
+ + ♻ ☆ SDFD: Building a Versatile Synthetic Face Image Dataset with Diverse + Attributes + + +
+ AI systems rely on extensive training on large datasets to address various +tasks. However, image-based systems, particularly those used for demographic +attribute prediction, face significant challenges. Many current face image +datasets primarily focus on demographic factors such as age, gender, and skin +tone, overlooking other crucial facial attributes like hairstyle and +accessories. This narrow focus limits the diversity of the data and +consequently the robustness of AI systems trained on them. This work aims to +address this limitation by proposing a methodology for generating synthetic +face image datasets that capture a broader spectrum of facial diversity. +Specifically, our approach integrates a systematic prompt formulation strategy, +encompassing not only demographics and biometrics but also non-permanent traits +like make-up, hairstyle, and accessories. These prompts guide a +state-of-the-art text-to-image model in generating a comprehensive dataset of +high-quality realistic images and can be used as an evaluation set in face +analysis systems. Compared to existing datasets, our proposed dataset proves +equally or more challenging in image classification tasks while being much +smaller in size. + +
+
+
+
+
+ + ♻ ☆ Generalizable Metric Network for Cross-domain Person Re-identification + + +
+ Person Re-identification (Re-ID) is a crucial technique for public security
+and has made significant progress in supervised settings. However, the
+cross-domain (i.e., domain generalization) scenario presents a challenge in
+Re-ID tasks due to unseen test domains and the domain shift between the
+training and test sets. To tackle this challenge, most existing methods aim to
+learn domain-invariant or robust features for all domains. In this paper, we
+observe that the data-distribution gap between the training and test sets is
+smaller in the sample-pair space than in the sample-instance space. Based on
+this observation, we propose a Generalizable Metric Network (GMN) to further
+explore sample similarity in the sample-pair space. Specifically, we add a
+Metric Network (M-Net) after the main network and train it on positive and
+negative sample-pair features, which is then employed during the test stage.
+Additionally, we introduce a Dropout-based Perturbation (DP) module to enhance
+the generalization capability of the metric network by enriching the
+sample-pair diversity. Moreover, we develop a Pair-Identity Center (PIC) loss
+to enhance the model's discrimination by ensuring that sample-pair features
+with the same pair-identity are consistent. We validate the effectiveness of
+our proposed method through extensive experiments on multiple benchmark
+datasets and confirm the value of each module in our GMN.
+ 
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ♻ ☆ MultiMatch: Multi-task Learning for Semi-supervised Domain + Generalization + + +
+ Domain generalization (DG) aims at learning a model on source domains that
+generalizes well to the unseen target domain. Although DG has achieved great
+success, most existing methods require label information for all training
+samples in the source domains, which is time-consuming and expensive in
+real-world applications. In this paper, we address the semi-supervised domain
+generalization (SSDG) task, where only a few labeled samples are available in
+each source domain. To address the task, we first analyze the theory of
+multi-domain learning, which highlights that 1) mitigating the impact of the
+domain gap and 2) exploiting all samples to train the model can effectively
+reduce the generalization error in each source domain and thus improve the
+quality of pseudo-labels. Based on this analysis, we propose MultiMatch, an
+extension of FixMatch to the multi-task learning framework, producing
+high-quality pseudo-labels for SSDG. To be specific, we consider each training
+domain as a single task (i.e., the local task) and combine all training domains
+together (i.e., the global task) to train an extra task for the unseen test
+domain. In the multi-task framework, we utilize an independent BN and
+classifier for each task, which can effectively alleviate the interference from
+different domains during pseudo-labeling. Also, most of the parameters in the
+framework are shared and can therefore be trained sufficiently on all training
+samples. Moreover, to further boost the pseudo-label accuracy and the model's
+generalization, we fuse the predictions from the global task and the local task
+during training and testing, respectively. A series of experiments validates
+the effectiveness of the proposed method, which outperforms the existing
+semi-supervised methods and the SSDG method on several benchmark DG datasets.
+ 
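MultiMatch extends FixMatch to a multi-task setting; the FixMatch-style pseudo-labeling core that each local or global task would run is sketched below. The 0.95 threshold is the usual FixMatch default and the tiny model is a stand-in, neither taken from the paper.

```python
# Sketch of the FixMatch-style unlabeled loss used per task: pseudo-label the
# weakly augmented view, keep only confident predictions, and train the
# strongly augmented view against them.
import torch
import torch.nn.functional as F

def fixmatch_unlabeled_loss(model, x_weak, x_strong, threshold=0.95):
    with torch.no_grad():
        probs = F.softmax(model(x_weak), dim=-1)
        conf, pseudo = probs.max(dim=-1)
        mask = (conf >= threshold).float()        # keep high-confidence samples
    logits_strong = model(x_strong)
    loss = F.cross_entropy(logits_strong, pseudo, reduction="none")
    return (loss * mask).mean()

# Toy usage; in MultiMatch each source domain would be one such task (with its
# own BN and classifier head), plus one global task trained on all domains.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
xw, xs = torch.randn(16, 3, 32, 32), torch.randn(16, 3, 32, 32)
print(fixmatch_unlabeled_loss(model, xw, xs))
```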
+
+ comment: Accepted by ACM TOMM +
+
+
+
+
+ + ♻ ☆ MV-VTON: Multi-View Virtual Try-On with Diffusion Models + + +
+ The goal of image-based virtual try-on is to generate an image of the target +person naturally wearing the given clothing. However, most existing methods +solely focus on the frontal try-on using the frontal clothing. When the views +of the clothing and person are significantly inconsistent, particularly when +the person's view is non-frontal, the results are unsatisfactory. To address +this challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to +reconstruct the dressing results of a person from multiple views using the +given clothes. On the one hand, given that single-view clothes provide +insufficient information for MV-VTON, we instead employ two images, i.e., the +frontal and back views of the clothing, to encompass the complete view as much +as possible. On the other hand, the diffusion models that have demonstrated +superior abilities are adopted to perform our MV-VTON. In particular, we +propose a view-adaptive selection method where hard-selection and +soft-selection are applied to the global and local clothing feature extraction, +respectively. This ensures that the clothing features are roughly fit to the +person's view. Subsequently, we suggest a joint attention block to align and +fuse clothing features with person features. Additionally, we collect a MV-VTON +dataset, i.e., Multi-View Garment (MVG), in which each person has multiple +photos with diverse views and poses. Experiments show that the proposed method +not only achieves state-of-the-art results on MV-VTON task using our MVG +dataset, but also has superiority on frontal-view virtual try-on task using +VITON-HD and DressCode datasets. Codes and datasets will be publicly released +at https://github.com/hywang2002/MV-VTON . + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for + Improved Feature Alignment and Local Information Fusion + + +
+ Low-dose computed tomography (LDCT) has become the technology of choice for
+diagnostic medical imaging, given its lower radiation dose compared to standard
+CT, even though the reduced dose increases image noise and can affect
+diagnostic accuracy. To address this, advanced deep learning-based LDCT
+denoising algorithms have been developed, primarily using Convolutional Neural
+Networks (CNNs) or Transformer networks with the Unet architecture. This
+architecture enhances image detail by integrating feature maps from the encoder
+and decoder via skip connections. However, current methods often overlook
+enhancements to the Unet architecture itself, focusing instead on optimizing
+encoder and decoder structures. This approach can be problematic due to the
+significant differences in feature map characteristics between the encoder and
+decoder, where simple fusion strategies may not effectively reconstruct images.
+In this paper, we introduce WiTUnet, a novel LDCT image denoising method that
+utilizes nested, dense skip pathways instead of traditional skip connections to
+improve feature integration. WiTUnet also incorporates a windowed Transformer
+structure to process images in smaller, non-overlapping segments, reducing
+computational load. Additionally, the integration of a Local Image Perception
+Enhancement (LiPe) module in both the encoder and decoder replaces the standard
+multi-layer perceptron (MLP) in Transformers, enhancing local feature capture
+and representation. Through extensive experimental comparisons, WiTUnet has
+demonstrated superior performance over existing methods in key metrics such as
+Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean
+Square Error (RMSE), significantly improving noise removal and image quality.
+ 
+
+
+
+
+ + ♻ ☆ Instant3D: Instant Text-to-3D Generation + + +
+ Text-to-3D generation has attracted much attention from the computer vision +community. Existing methods mainly optimize a neural field from scratch for +each text prompt, relying on heavy and repetitive training cost which impedes +their practical deployment. In this paper, we propose a novel framework for +fast text-to-3D generation, dubbed Instant3D. Once trained, Instant3D is able +to create a 3D object for an unseen text prompt in less than one second with a +single run of a feedforward network. We achieve this remarkable speed by +devising a new network that directly constructs a 3D triplane from a text +prompt. The core innovation of our Instant3D lies in our exploration of +strategies to effectively inject text conditions into the network. In +particular, we propose to combine three key mechanisms: cross-attention, style +injection, and token-to-plane transformation, which collectively ensure precise +alignment of the output with the input text. Furthermore, we propose a simple +yet effective activation function, the scaled-sigmoid, to replace the original +sigmoid function, which speeds up the training convergence by more than ten +times. Finally, to address the Janus (multi-head) problem in 3D generation, we +propose an adaptive Perp-Neg algorithm that can dynamically adjust its concept +negation scales according to the severity of the Janus problem during training, +effectively reducing the multi-head effect. Extensive experiments on a wide +variety of benchmark datasets demonstrate that the proposed algorithm performs +favorably against the state-of-the-art methods both qualitatively and +quantitatively, while achieving significantly better efficiency. The code, +data, and models are available at https://github.com/ming1993li/Instant3DCodes. + +
+
+ comment: Project page: https://ming1993li.github.io/Instant3DProj +
+
+
+
+
+ + ♻ ☆ CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and + Radiology Reports for Full-Body Scenarios + + +
+ Medical Vision-Language Pretraining (Med-VLP) establishes a connection +between visual content from medical images and the relevant textual +descriptions. Existing Med-VLP methods primarily focus on 2D images depicting a +single body part, notably chest X-rays. In this paper, we extend the scope of +Med-VLP to encompass 3D images, specifically targeting full-body scenarios, by +using a multimodal dataset of CT images and reports. Compared with the 2D +counterpart, 3D VLP is required to effectively capture essential semantics from +significantly sparser representation in 3D imaging. In this paper, we introduce +CT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method +that constructs organ-level image-text pairs to enhance multimodal contrastive +learning, aligning grounded visual features with precise diagnostic text. +Additionally, we developed an abnormality dictionary to augment contrastive +learning with diverse contrastive pairs. Our method, trained on a multimodal CT +dataset comprising 44,011 organ-level vision-text pairs from 17,702 patients +across 104 organs, demonstrates it can identify organs and abnormalities in a +zero-shot manner using natural languages. The performance of CT-GLIP is +validated on a separate test set of 1,130 patients, focusing on the 16 most +frequent abnormalities across 7 organs. The experimental results show our +model's superior performance over the standard CLIP framework across zero-shot +and fine-tuning scenarios, using both CNN and ViT architectures. + +
+
+ comment: 12 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ LIPT: Latency-aware Image Processing Transformer + + +
+ Transformer is leading a trend in the field of image processing. Despite the +great success that existing lightweight image processing transformers have +achieved, they are tailored to FLOPs or parameters reduction, rather than +practical inference acceleration. In this paper, we present a latency-aware +image processing transformer, termed LIPT. We devise the low-latency proportion +LIPT block that substitutes memory-intensive operators with the combination of +self-attention and convolutions to achieve practical speedup. Specifically, we +propose a novel non-volatile sparse masking self-attention (NVSM-SA) that +utilizes a pre-computing sparse mask to capture contextual information from a +larger window with no extra computation overload. Besides, a high-frequency +reparameterization module (HRM) is proposed to make LIPT block +reparameterization friendly, which improves the model's detail reconstruction +capability. Extensive experiments on multiple image processing tasks (e.g., +image super-resolution (SR), JPEG artifact reduction, and image denoising) +demonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves +real-time GPU inference with state-of-the-art performance on multiple image SR +benchmarks. + +
+
+
+
+
+ + ♻ ☆ Conditional Distribution Modelling for Few-Shot Image Synthesis with + Diffusion Models + + +
+ Few-shot image synthesis entails generating diverse and realistic images of +novel categories using only a few example images. While multiple recent efforts +in this direction have achieved impressive results, the existing approaches are +dependent only upon the few novel samples available at test time in order to +generate new images, which restricts the diversity of the generated images. To +overcome this limitation, we propose Conditional Distribution Modelling (CDM) +-- a framework which effectively utilizes Diffusion models for few-shot image +generation. By modelling the distribution of the latent space used to condition +a Diffusion process, CDM leverages the learnt statistics of the training data +to get a better approximation of the unseen class distribution, thereby +removing the bias arising due to limited number of few shot samples. +Simultaneously, we devise a novel inversion based optimization strategy that +further improves the approximated unseen class distribution, and ensures the +fidelity of the generated samples to the unseen class. The experimental results +on four benchmark datasets demonstrate the effectiveness of our proposed CDM +for few-shot generation. + +
+
+
+
+
+ + ♻ ☆ VolumeDiffusion: Flexible Text-to-3D Generation with Efficient + Volumetric Encoder + + +
+ This paper introduces a pioneering 3D volumetric encoder designed for +text-to-3D generation. To scale up the training data for the diffusion model, a +lightweight network is developed to efficiently acquire feature volumes from +multi-view images. The 3D volumes are then trained on a diffusion model for +text-to-3D generation using a 3D U-Net. This research further addresses the +challenges of inaccurate object captions and high-dimensional feature volumes. +The proposed model, trained on the public Objaverse dataset, demonstrates +promising outcomes in producing diverse and recognizable samples from text +prompts. Notably, it empowers finer control over object part characteristics +through textual cues, fostering model creativity by seamlessly combining +multiple concepts within a single object. This research significantly +contributes to the progress of 3D generation by introducing an efficient, +flexible, and scalable representation methodology. Code is available at +https://github.com/checkcrab/VolumeDiffusion. + +
+
+
+
+
+ + ♻ ☆ Two in One Go: Single-stage Emotion Recognition with Decoupled + Subject-context Transformer + + +
+ Emotion recognition aims to discern the emotional state of subjects within
+an image, relying on subject-centric and contextual visual cues. Current
+approaches typically follow a two-stage pipeline: first localize subjects with
+off-the-shelf detectors, then perform emotion classification through late
+fusion of subject and context features. However, this complicated paradigm
+suffers from disjoint training stages and limited interaction between
+fine-grained subject-context elements. To address the challenge, we present a
+single-stage emotion recognition approach, employing a Decoupled
+Subject-Context Transformer (DSCT), for simultaneous subject localization and
+emotion classification. Rather than compartmentalizing training stages, we
+jointly leverage box and emotion signals as supervision to enrich
+subject-centric feature learning. Furthermore, we introduce DSCT to facilitate
+interactions between fine-grained subject-context cues in a decouple-then-fuse
+manner. The decoupled query tokens, namely subject queries and context queries,
+gradually intertwine across layers within DSCT, during which spatial and
+semantic relations are exploited and aggregated. We evaluate our single-stage
+framework on two widely used context-aware emotion recognition datasets, CAER-S
+and EMOTIC. Our approach surpasses two-stage alternatives with fewer
+parameters, achieving a 3.39% accuracy improvement and a 6.46% average
+precision gain on the CAER-S and EMOTIC datasets, respectively.
+ 
+
+
+
+
+ + ♻ ☆ A Multi-objective Optimization Benchmark Test Suite for Real-time + Semantic Segmentation GECCO 2024 + + +
+ As one of the emerging challenges in Automated Machine Learning, the +Hardware-aware Neural Architecture Search (HW-NAS) tasks can be treated as +black-box multi-objective optimization problems (MOPs). An important +application of HW-NAS is real-time semantic segmentation, which plays a pivotal +role in autonomous driving scenarios. The HW-NAS for real-time semantic +segmentation inherently needs to balance multiple optimization objectives, +including model accuracy, inference speed, and hardware-specific +considerations. Despite its importance, benchmarks have yet to be developed to +frame such a challenging task as multi-objective optimization. To bridge the +gap, we introduce a tailored streamline to transform the task of HW-NAS for +real-time semantic segmentation into standard MOPs. Building upon the +streamline, we present a benchmark test suite, CitySeg/MOP, comprising fifteen +MOPs derived from the Cityscapes dataset. The CitySeg/MOP test suite is +integrated into the EvoXBench platform to provide seamless interfaces with +various programming languages (e.g., Python and MATLAB) for instant fitness +evaluations. We comprehensively assessed the CitySeg/MOP test suite on various +multi-objective evolutionary algorithms, showcasing its versatility and +practicality. Source codes are available at +https://github.com/EMI-Group/evoxbench. + +
+
+ comment: GECCO 2024 +
+
+
+
+
+ + ♻ ☆ Distilling Privileged Multimodal Information for Expression Recognition + using Optimal Transport + + +
+ Deep learning models for multimodal expression recognition have reached
+remarkable performance in controlled laboratory environments because of their
+ability to learn complementary and redundant semantic information. However,
+these models struggle in the wild, mainly because of the unavailability and
+quality of modalities used for training. In practice, only a subset of the
+training-time modalities may be available at test time. Learning with
+privileged information enables models to exploit data from additional
+modalities that are only available during training. State-of-the-art knowledge
+distillation (KD) methods have been proposed to distill information from
+multiple teacher models (each trained on a modality) to a common student model.
+These privileged KD methods typically utilize point-to-point matching, yet have
+no explicit mechanism to capture the structural information in the teacher
+representation space formed by introducing the privileged modality. Experiments
+were performed on two challenging problems - pain estimation on the Biovid
+dataset (ordinal classification) and arousal-valence prediction on the Affwild2
+dataset (regression). Results show that our proposed method can outperform
+state-of-the-art privileged KD methods on these problems. The diversity among
+modalities and fusion architectures indicates that PKDOT is modality- and
+model-agnostic.
+ 
+
+
+
+
+ + ♻ ☆ Using Skew to Assess the Quality of GAN-generated Image Features + + +
+ The rapid advancement of Generative Adversarial Networks (GANs) necessitates
+robust methods to evaluate these models. Among the established evaluation
+criteria, the Fr\'{e}chet Inception Distance (FID) has been widely adopted due
+to its conceptual simplicity, fast computation time, and strong correlation
+with human perception. However, FID has inherent limitations, mainly stemming
+from its assumption that feature embeddings follow a Gaussian distribution, and
+therefore can be defined by their first two moments. As this does not hold in
+practice, in this paper we explore the importance of third moments in image
+feature data and use this information to define a new measure, which we call
+the Skew Inception Distance (SID). We prove that SID is a pseudometric on
+probability distributions, show how it extends FID, and present a practical
+method for its computation. Our numerical experiments support that SID either
+tracks with FID or, in some cases, aligns more closely with human perception
+when evaluating image features of ImageNet data. Our work also shows that
+principal component analysis can be used to speed up the computation time of
+both FID and SID. Although we focus on using SID on image features for GAN
+evaluation, SID is applicable much more generally, including for the evaluation
+of other generative models.
+ 
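SID's definition is in the paper, not the abstract. As grounding, the sketch below computes the standard FID from two sets of feature embeddings (the first two moments) and, separately, the per-dimension skewness that a third-moment-aware measure would additionally consider; the way SID actually combines these is not reproduced here.

```python
# Sketch: Frechet Inception Distance from feature embeddings, plus per-dimension
# skewness (a third-moment statistic). The SID combination rule is NOT shown.
import numpy as np
from scipy import linalg
from scipy.stats import skew

def fid(feats_a, feats_b):
    mu_a, mu_b = feats_a.mean(0), feats_b.mean(0)
    cov_a = np.cov(feats_a, rowvar=False)
    cov_b = np.cov(feats_b, rowvar=False)
    covmean, _ = linalg.sqrtm(cov_a @ cov_b, disp=False)
    diff = mu_a - mu_b
    return diff @ diff + np.trace(cov_a + cov_b - 2.0 * covmean.real)

rng = np.random.default_rng(0)
a = rng.normal(size=(2048, 64))
b = rng.normal(0.1, 1.1, size=(2048, 64))
print("FID:", fid(a, b))
print("per-dim skew (third moment):", skew(a, axis=0)[:4])
```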
+
+
+
+
+ + ♻ ☆ UncertaintyTrack: Exploiting Detection and Localization Uncertainty in + Multi-Object Tracking ICRA 2024 + + +
+ Multi-object tracking (MOT) methods have seen a significant boost in
+performance recently, due to strong interest from the research community and
+steadily improving object detection methods. The majority of tracking methods
+follow the tracking-by-detection (TBD) paradigm, blindly trusting the incoming
+detections with no sense of their associated localization uncertainty. This
+lack of uncertainty awareness poses a problem in safety-critical tasks such as
+autonomous driving, where passengers could be put at risk due to erroneous
+detections that have propagated to downstream tasks, including MOT. While there
+are existing works in probabilistic object detection that predict the
+localization uncertainty around the boxes, no work in 2D MOT for autonomous
+driving has studied whether these estimates are meaningful enough to be
+leveraged effectively in object tracking. We introduce UncertaintyTrack, a
+collection of extensions that can be applied to multiple TBD trackers to
+account for localization uncertainty estimates from probabilistic object
+detectors. Experiments on the Berkeley Deep Drive MOT dataset show that the
+combination of our method and informative uncertainty estimates reduces the
+number of ID switches by around 19% and improves mMOTA by 2-3%. The source
+code is available at https://github.com/TRAILab/UncertaintyTrack
+ 
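The abstract does not spell out how the uncertainty enters the tracker. A common, generic way to exploit per-box localization covariance in TBD association is Mahalanobis-distance gating, sketched below as an illustration of the idea rather than UncertaintyTrack's actual extensions.

```python
# Sketch: use a probabilistic detector's box covariance for association gating.
# A detection only matches a predicted track if the Mahalanobis distance between
# box centers, under the combined covariance, is below a chi-square gate.
import numpy as np
from scipy.stats import chi2

GATE = chi2.ppf(0.95, df=2)  # gate on the 2D center position

def mahalanobis_gate(track_center, track_cov, det_center, det_cov):
    diff = det_center - track_center
    S = track_cov + det_cov                  # combined localization uncertainty
    d2 = diff @ np.linalg.inv(S) @ diff
    return d2, d2 <= GATE

track_c, track_P = np.array([100.0, 50.0]), np.diag([4.0, 4.0])
det_c, det_P = np.array([104.0, 52.0]), np.diag([9.0, 9.0])
print(mahalanobis_gate(track_c, track_P, det_c, det_P))
```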
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal + Models with Open-Source Suites + + +
+ In this report, we introduce InternVL 1.5, an open-source multimodal large
+language model (MLLM) that bridges the capability gap between open-source and
+proprietary commercial models in multimodal understanding. We introduce three
+simple improvements: (1) Strong Vision Encoder: we explored a continuous
+learning strategy for the large-scale vision foundation model, InternViT-6B,
+boosting its visual understanding capabilities and allowing it to be
+transferred and reused across different LLMs. (2) Dynamic High-Resolution: we
+divide images into 1 to 40 tiles of 448$\times$448 pixels according to the
+aspect ratio and resolution of the input images, supporting inputs up to 4K
+resolution. (3) High-Quality Bilingual Dataset: we carefully collected a
+high-quality bilingual dataset that covers common scenes and document images,
+and annotated it with English and Chinese question-answer pairs, significantly
+enhancing performance in OCR- and Chinese-related tasks. We evaluate InternVL
+1.5 through a series of benchmarks and comparative studies. Compared to both
+open-source and proprietary models, InternVL 1.5 shows competitive performance,
+achieving state-of-the-art results in 8 of 18 benchmarks. Code has been
+released at https://github.com/OpenGVLab/InternVL.
+ 
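The dynamic high-resolution scheme is described concretely enough to sketch: pick a tile grid of up to 40 tiles of 448x448 pixels whose aspect ratio best matches the input, then resize and split. The grid-selection heuristic below is a plausible reading of that description, not the released implementation.

```python
# Sketch: split an image into up to `max_tiles` 448x448 tiles, choosing the
# cols x rows grid whose aspect ratio is closest to the input image's.
from PIL import Image

TILE = 448

def dynamic_tiles(img, max_tiles=40):
    w, h = img.size
    target = w / h
    grids = [(c, r) for r in range(1, max_tiles + 1)
             for c in range(1, max_tiles + 1) if c * r <= max_tiles]
    cols, rows = min(grids, key=lambda g: abs(g[0] / g[1] - target))
    resized = img.resize((cols * TILE, rows * TILE))
    return [resized.crop((x * TILE, y * TILE, (x + 1) * TILE, (y + 1) * TILE))
            for y in range(rows) for x in range(cols)]

tiles = dynamic_tiles(Image.new("RGB", (1920, 1080)))
print(len(tiles), tiles[0].size)
```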
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Computer Vision for Increased Operative Efficiency via Identification of + Instruments in the Neurosurgical Operating Room: A Proof-of-Concept Study + + +
+ Objectives: Computer vision (CV) is a field of artificial intelligence that
+enables machines to interpret and understand images and videos. CV has the
+potential to be of assistance in the operating room (OR) to track surgical
+instruments. We built a CV algorithm for identifying surgical instruments in
+the neurosurgical operating room as a potential solution for surgical
+instrument tracking and management to decrease surgical waste and opening of
+unnecessary tools. Methods: We collected 1660 images of 27 commonly used
+neurosurgical instruments. Images were labeled using the VGG Image Annotator
+and split into 80% training and 20% testing sets in order to train a U-Net
+Convolutional Neural Network using 5-fold cross validation. Results: Our U-Net
+achieved a tool identification accuracy of 80-100% when distinguishing 25
+classes of instruments, with 19/25 classes having accuracy over 90%. The model
+performance was not adequate for sub-classifying Adson, Gerald, and Debakey
+forceps, which had accuracies of 60-80%. Conclusions: We demonstrated the
+viability of using machine learning to accurately identify surgical
+instruments. Instrument identification could help optimize surgical tray
+packing, decrease tool usage and waste, decrease incidence of instrument
+misplacement events, and assist in timing of routine instrument maintenance.
+More training data will be needed to increase accuracy across all surgical
+instruments that would appear in a neurosurgical operating room. Such
+technology also has the potential to help establish which tools are truly
+needed in each type of operation, allowing surgeons across the world to do
+more with less.
+
+
+ comment: Data is openly available through The Open Science Framework: + https://doi.org/10.17605/OSF.IO/BCQK2 +
+
+
+
+
+ + ♻ ☆ Video ReCap: Recursive Captioning of Hour-Long Videos CVPR 2024 + + +
+ Most video captioning models are designed to process short video clips of a
+few seconds and output text describing low-level visual concepts (e.g.,
+objects, scenes, atomic actions). However, most real-world videos last for
+minutes or hours and have a complex hierarchical structure spanning different
+temporal granularities. We propose Video ReCap, a recursive video captioning
+model that can process video inputs of dramatically different lengths (from 1
+second to 2 hours) and output video captions at multiple hierarchy levels. The
+recursive video-language architecture exploits the synergy between different
+video hierarchies and can process hour-long videos efficiently. We utilize a
+curriculum learning training scheme to learn the hierarchical structure of
+videos, starting from clip-level captions describing atomic actions, then
+focusing on segment-level descriptions, and concluding with generating
+summaries for hour-long videos. Furthermore, we introduce the Ego4D-HCap
+dataset by augmenting Ego4D with 8,267 manually collected long-range video
+summaries. Our recursive model can flexibly generate captions at different
+hierarchy levels while also being useful for other complex video understanding
+tasks, such as VideoQA on EgoSchema. Data, code, and models are available at:
+https://sites.google.com/view/vidrecap
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Revisiting Relevance Feedback for CLIP-based Interactive Image Retrieval + + +
+ Many image retrieval studies use metric learning to train an image encoder. +However, metric learning cannot handle differences in users' preferences, and +requires data to train an image encoder. To overcome these limitations, we +revisit relevance feedback, a classic technique for interactive retrieval +systems, and propose an interactive CLIP-based image retrieval system with +relevance feedback. Our retrieval system first executes the retrieval, collects +each user's unique preferences through binary feedback, and returns images the +user prefers. Even when users have various preferences, our retrieval system +learns each user's preference through the feedback and adapts to the +preference. Moreover, our retrieval system leverages CLIP's zero-shot +transferability and achieves high accuracy without training. We empirically +show that our retrieval system competes well with state-of-the-art metric +learning in category-based image retrieval, despite not training image encoders +specifically for each dataset. Furthermore, we set up two additional +experimental settings where users have various preferences: one-label-based +image retrieval and conditioned image retrieval. In both cases, our retrieval +system effectively adapts to each user's preferences, resulting in improved +accuracy compared to image retrieval without feedback. Overall, our work +highlights the potential benefits of integrating CLIP with classic relevance +feedback techniques to enhance image retrieval. + +
+
+ comment: 20 pages, 8 figures
+
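+ As a rough illustration of how classic relevance feedback can be paired with
+CLIP features, a Rocchio-style update over L2-normalized embeddings is
+sketched below; the paper's actual feedback rule may differ.
+
+    import numpy as np
+
+    def rocchio_update(query, pos_feats, neg_feats, alpha=1.0, beta=0.8, gamma=0.3):
+        # Pull the query embedding toward liked images and away from disliked
+        # ones, then re-normalize so cosine similarity stays meaningful.
+        q = alpha * query
+        if len(pos_feats):
+            q = q + beta * np.mean(pos_feats, axis=0)
+        if len(neg_feats):
+            q = q - gamma * np.mean(neg_feats, axis=0)
+        return q / np.linalg.norm(q)
+
+    # Ranking is then a cosine-similarity search over the gallery features:
+    # scores = gallery_feats @ updated_query
+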
+
+
+
+
+ + ♻ ☆ Benchmarking the Fairness of Image Upsampling Methods + + +
+ Recent years have witnessed a rapid development of deep generative models for +creating synthetic media, such as images and videos. While the practical +applications of these models in everyday tasks are enticing, it is crucial to +assess the inherent risks regarding their fairness. In this work, we introduce +a comprehensive framework for benchmarking the performance and fairness of +conditional generative models. We develop a set of +metrics$\unicode{x2013}$inspired by their supervised fairness +counterparts$\unicode{x2013}$to evaluate the models on their fairness and +diversity. Focusing on the specific application of image upsampling, we create +a benchmark covering a wide variety of modern upsampling methods. As part of +the benchmark, we introduce UnfairFace, a subset of FairFace that replicates +the racial distribution of common large-scale face datasets. Our empirical +study highlights the importance of using an unbiased training set and reveals +variations in how the algorithms respond to dataset imbalances. Alarmingly, we +find that none of the considered methods produces statistically fair and +diverse results. All experiments can be reproduced using our provided +repository. + +
+
+ comment: This is the author's version of the work. It is posted here for your + personal use. Not for redistribution. The definitive Version of Record was + published at the 2024 ACM Conference on Fairness, Accountability, and + Transparency (FAccT '24) +
+
+
+
+
+ + ♻ ☆ A Deep Ordinal Distortion Estimation Approach for Distortion + Rectification + + +
+ Distortion widely exists in images captured by popular wide-angle and
+fisheye cameras. Despite the long history of distortion rectification,
+accurately estimating the distortion parameters from a single distorted image
+is still challenging. The main reason is that these parameters are only
+implicitly related to image features, which hinders networks from fully
+learning the distortion information. In this work, we propose a novel
+distortion rectification approach that can obtain more accurate parameters
+with higher efficiency. Our key insight is that distortion rectification can
+be cast as a problem of learning an ordinal distortion from a single distorted
+image. To solve this problem, we design a local-global associated estimation
+network that learns the ordinal distortion to approximate the realistic
+distortion distribution. In contrast to the implicit distortion parameters,
+the proposed ordinal distortion has a more explicit relationship with image
+features, and thus significantly boosts the distortion perception of neural
+networks. Considering the redundancy of distortion information, our approach
+only uses a part of the distorted image for ordinal distortion estimation,
+showing promising applications in efficient distortion rectification. To our
+knowledge, we are the first to unify the heterogeneous distortion parameters
+into a learning-friendly intermediate representation through ordinal
+distortion, bridging the gap between image features and distortion
+rectification. The experimental results demonstrate that our approach
+outperforms the state-of-the-art methods by a significant margin, with
+approximately 23% improvement on the quantitative evaluation while displaying
+the best performance on visual appearance. The code is available at
+https://github.com/KangLiao929/OrdinalDistortion.
+
+
+
+
+
+ + ♻ ☆ Project RISE: Recognizing Industrial Smoke Emissions AAAI 2021 + + +
+ Industrial smoke emissions pose a significant concern to human health. Prior +works have shown that using Computer Vision (CV) techniques to identify smoke +as visual evidence can influence the attitude of regulators and empower +citizens to pursue environmental justice. However, existing datasets are not of +sufficient quality nor quantity to train the robust CV models needed to support +air quality advocacy. We introduce RISE, the first large-scale video dataset +for Recognizing Industrial Smoke Emissions. We adopted a citizen science +approach to collaborate with local community members to annotate whether a +video clip has smoke emissions. Our dataset contains 12,567 clips from 19 +distinct views from cameras that monitored three industrial facilities. These +daytime clips span 30 days over two years, including all four seasons. We ran +experiments using deep neural networks to establish a strong performance +baseline and reveal smoke recognition challenges. Our survey study discussed +community feedback, and our data analysis displayed opportunities for +integrating citizen scientists and crowd workers into the application of +Artificial Intelligence for Social Impact. + +
+
+ comment: Accepted by AAAI 2021 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 74 + +
+
+
+ + ☆ MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion + Recognition ICPR + + +
+ This paper presents a novel approach to processing multimodal data for
+dynamic emotion recognition, named the Multimodal Masked Autoencoder for
+Dynamic Emotion Recognition (MultiMAE-DER). The MultiMAE-DER leverages the
+closely correlated representation information within spatiotemporal sequences
+across visual and audio modalities. By utilizing a pre-trained masked
+autoencoder model, MultiMAE-DER is obtained through simple, straightforward
+fine-tuning. The performance of the MultiMAE-DER is enhanced by optimizing six
+fusion strategies for multimodal input sequences. These strategies address
+dynamic feature correlations within cross-domain data across spatial,
+temporal, and spatiotemporal sequences. In comparison to state-of-the-art
+multimodal supervised learning models for dynamic emotion recognition,
+MultiMAE-DER enhances the weighted average recall (WAR) by 4.41% on the
+RAVDESS dataset and by 2.06% on the CREMA-D dataset. Furthermore, when
+compared with the state-of-the-art model of multimodal self-supervised
+learning, MultiMAE-DER achieves a 1.86% higher WAR on the IEMOCAP dataset.
+
+
+ comment: Accepted by ICPRS 2024 +
+
+
+
+
+ + ☆ Position paper: Do not explain (vision models) without context + + +
+ Does the stethoscope in the picture make the adjacent person a doctor or a
+patient? This, of course, depends on the contextual relationship of the two
+objects. If it is obvious, why do explanation methods for vision models not
+use contextual information? In this paper, we (1) review the most popular
+methods of explaining computer vision models by pointing out that they do not
+take into account context information, (2) provide examples of real-world use
+cases where spatial context plays a significant role, (3) propose new research
+directions that may lead to better use of context information in explaining
+computer vision models, and (4) argue that a change in approach to
+explanations is needed from 'where' to 'how'.
+
+
+
+
+
+ + ☆ Panoptic Segmentation and Labelling of Lumbar Spine Vertebrae using + Modified Attention Unet + + +
+ Segmentation and labeling of vertebrae in MRI images of the spine are
+critical for the diagnosis of illnesses and abnormalities. These steps are
+indispensable as MRI technology provides detailed information about the tissue
+structure of the spine. Both supervised and unsupervised segmentation methods
+exist, yet acquiring sufficient data remains challenging for achieving high
+accuracy. In this study, we propose an enhanced approach based on a modified
+attention U-Net architecture for panoptic segmentation of 3D sliced MRI data
+of the lumbar spine. Our method achieves an impressive accuracy of 99.5% by
+incorporating novel masking logic, thus significantly advancing the
+state-of-the-art in vertebral segmentation and labeling. This contributes to
+more precise and reliable diagnosis and treatment planning.
+
+
+ comment: 9 pages, 10 figures +
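+ As background, the additive attention gate that standard attention U-Nets
+place on skip connections can be sketched as below. This is the generic gate
+only; the authors' modified architecture and masking logic are not reproduced
+here.
+
+    import torch
+    import torch.nn as nn
+
+    class AttentionGate(nn.Module):
+        # A coarse gating signal g re-weights the skip-connection features x.
+        def __init__(self, g_ch, x_ch, inter_ch):
+            super().__init__()
+            self.w_g = nn.Conv2d(g_ch, inter_ch, kernel_size=1)
+            self.w_x = nn.Conv2d(x_ch, inter_ch, kernel_size=1)
+            self.psi = nn.Conv2d(inter_ch, 1, kernel_size=1)
+
+        def forward(self, g, x):
+            # g and x are assumed to share spatial size (upsample g if not).
+            a = torch.relu(self.w_g(g) + self.w_x(x))
+            alpha = torch.sigmoid(self.psi(a))  # (N, 1, H, W) attention map
+            return x * alpha                    # gated skip features
+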
+
+
+
+
+ + ☆ S3-SLAM: Sparse Tri-plane Encoding for Neural Implicit SLAM + + +
+ With the emergence of Neural Radiance Fields (NeRF), neural implicit +representations have gained widespread applications across various domains, +including simultaneous localization and mapping. However, current neural +implicit SLAM faces a challenging trade-off problem between performance and the +number of parameters. To address this problem, we propose sparse tri-plane +encoding, which efficiently achieves scene reconstruction at resolutions up to +512 using only 2~4% of the commonly used tri-plane parameters (reduced from +100MB to 2~4MB). On this basis, we design S3-SLAM to achieve rapid and +high-quality tracking and mapping through sparsifying plane parameters and +integrating orthogonal features of tri-plane. Furthermore, we develop +hierarchical bundle adjustment to achieve globally consistent geometric +structures and reconstruct high-resolution appearance. Experimental results +demonstrate that our approach achieves competitive tracking and scene +reconstruction with minimal parameters on three datasets. Source code will soon +be available. + +
+
+
+
+
+ + ☆ Out-of-distribution Detection in Medical Image Analysis: A survey + + +
+ Computer-aided diagnostics has benefited from the development of deep
+learning-based computer vision techniques in recent years. Traditional
+supervised deep learning methods assume that the test sample is drawn from the
+same distribution as the training data. However, it is possible to encounter
+out-of-distribution samples in real-world clinical scenarios, which may cause
+silent failure in deep learning-based medical image analysis tasks. Recently,
+research has explored various out-of-distribution (OOD) detection situations
+and techniques to enable a trustworthy medical AI system. In this survey, we
+systematically review the recent advances in OOD detection in medical image
+analysis. We first explore several factors that may cause a distributional
+shift when using a deep-learning-based model in clinical scenarios, and define
+three different types of distributional shift on top of these factors. A
+framework is then suggested to categorize and characterize existing solutions,
+and previous studies are reviewed according to this methodology taxonomy. Our
+discussion also includes evaluation protocols and metrics, as well as
+remaining challenges and research directions that still lack exploration.
+
+
+ comment: 23 pages, 3 figures +
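+ To make the task concrete, one of the simplest OOD detection baselines in
+this literature, thresholding the maximum softmax probability, can be
+sketched as follows; it is shown only as an illustration, not as a method
+proposed by the survey.
+
+    import torch
+    import torch.nn.functional as F
+
+    @torch.no_grad()
+    def msp_score(model, images):
+        # Maximum softmax probability: low confidence suggests the input may
+        # be out-of-distribution.
+        logits = model(images)
+        return F.softmax(logits, dim=1).max(dim=1).values
+
+    # Flag samples whose score falls below a threshold tuned on validation data:
+    # is_ood = msp_score(model, batch) < threshold
+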
+
+
+
+
+ + ☆ Align, Minimize and Diversify: A Source-Free Unsupervised Domain + Adaptation Method for Handwritten Text Recognition ECCV 2024 + + +
+ This paper serves to introduce the Align, Minimize and Diversify (AMD) +method, a Source-Free Unsupervised Domain Adaptation approach for Handwritten +Text Recognition (HTR). This framework decouples the adaptation process from +the source data, thus not only sidestepping the resource-intensive retraining +process but also making it possible to leverage the wealth of pre-trained +knowledge encoded in modern Deep Learning architectures. Our method explicitly +eliminates the need to revisit the source data during adaptation by +incorporating three distinct regularization terms: the Align term, which +reduces the feature distribution discrepancy between source and target data, +ensuring the transferability of the pre-trained representation; the Minimize +term, which encourages the model to make assertive predictions, pushing the +outputs towards one-hot-like distributions in order to minimize prediction +uncertainty, and finally, the Diversify term, which safeguards against the +degeneracy in predictions by promoting varied and distinctive sequences +throughout the target data, preventing informational collapse. Experimental +results from several benchmarks demonstrated the effectiveness and robustness +of AMD, showing it to be competitive and often outperforming DA methods in HTR. + +
+
+ comment: Submitted to ECCV 2024 +
+
+
+
+
+ + ☆ Efficient Remote Sensing with Harmonized Transfer Learning and Modality + Alignment ICLR + + +
+ With the rise of Visual and Language Pretraining (VLP), an increasing number +of downstream tasks are adopting the paradigm of pretraining followed by +fine-tuning. Although this paradigm has demonstrated potential in various +multimodal downstream tasks, its implementation in the remote sensing domain +encounters some obstacles. Specifically, the tendency for same-modality +embeddings to cluster together impedes efficient transfer learning. To tackle +this issue, we review the aim of multimodal transfer learning for downstream +tasks from a unified perspective, and rethink the optimization process based on +three distinct objectives. We propose "Harmonized Transfer Learning and +Modality Alignment (HarMA)", a method that simultaneously satisfies task +constraints, modality alignment, and single-modality uniform alignment, while +minimizing training overhead through parameter-efficient fine-tuning. +Remarkably, without the need for external data for training, HarMA achieves +state-of-the-art performance in two popular multimodal retrieval tasks in the +field of remote sensing. Our experiments reveal that HarMA achieves competitive +and even superior performance to fully fine-tuned models with only minimal +adjustable parameters. Due to its simplicity, HarMA can be integrated into +almost all existing multimodal pretraining models. We hope this method can +facilitate the efficient application of large models to a wide range of +downstream tasks while significantly reducing the resource consumption. Code is +available at https://github.com/seekerhuang/HarMA. + +
+
+ comment: Accepted by the Twelfth International Conference on Learning + Representations (ICLR) Workshop +
+
+
+
+
+ + ☆ Fisher Information Improved Training-Free Conditional Diffusion Model + + +
+ Recently, diffusion models with training-free methods have succeeded in
+conditional image generation tasks. However, these methods suffer from an
+efficiency problem, since they require calculating the gradient at high
+computational cost, and previous methods address this by making strong
+assumptions, sacrificing generalization. In this work, we propose the Fisher
+information guided diffusion model (FIGD). Concretely, we introduce Fisher
+information to estimate the gradient and reduce the computation cost without
+making any additional assumptions. Meanwhile, we demonstrate that the Fisher
+information ensures the generalization of FIGD and provides new insights for
+training-free methods based on information theory. The experimental results
+demonstrate that FIGD can achieve different conditional generations more
+quickly while maintaining high quality.
+
+
+
+
+
+ + ☆ AdaFSNet: Time Series Classification Based on Convolutional Network with + a Adaptive and Effective Kernel Size Configuration IJCNN 2024 + + +
+ Time series classification is one of the most critical and challenging +problems in data mining, existing widely in various fields and holding +significant research importance. Despite extensive research and notable +achievements with successful real-world applications, addressing the challenge +of capturing the appropriate receptive field (RF) size from one-dimensional or +multi-dimensional time series of varying lengths remains a persistent issue, +which greatly impacts performance and varies considerably across different +datasets. In this paper, we propose an Adaptive and Effective Full-Scope +Convolutional Neural Network (AdaFSNet) to enhance the accuracy of time series +classification. This network includes two Dense Blocks. Particularly, it can +dynamically choose a range of kernel sizes that effectively encompass the +optimal RF size for various datasets by incorporating multiple prime numbers +corresponding to the time series length. We also design a TargetDrop block, +which can reduce redundancy while extracting a more effective RF. To assess the +effectiveness of the AdaFSNet network, comprehensive experiments were conducted +using the UCR and UEA datasets, which include one-dimensional and +multi-dimensional time series data, respectively. Our model surpassed baseline +models in terms of classification accuracy, underscoring the AdaFSNet network's +efficiency and effectiveness in handling time series classification tasks. + +
+
+ comment: Accepted by IJCNN 2024 +
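+ One plausible reading of the prime-number kernel configuration mentioned in
+the abstract (one 1D convolutional branch per prime kernel size up to the
+series length) is sketched here; the actual AdaFSNet design may differ.
+
+    import torch.nn as nn
+
+    def primes_up_to(n):
+        # Primes <= n by trial division (adequate for typical series lengths).
+        return [p for p in range(2, n + 1)
+                if all(p % d for d in range(2, int(p ** 0.5) + 1))]
+
+    def prime_kernel_branches(in_ch, out_ch, series_len, max_kernel=None):
+        # One Conv1d branch per prime kernel size, so the receptive fields
+        # sweep a wide range of scales; padding roughly preserves length.
+        limit = max_kernel or series_len
+        return nn.ModuleList(
+            nn.Conv1d(in_ch, out_ch, kernel_size=k, padding=k // 2)
+            for k in primes_up_to(limit))
+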
+
+
+
+
+ + ☆ FAD-SAR: A Novel Fishing Activity Detection System via Synthetic + Aperture Radar Images Based on Deep Learning Method + + +
+ Illegal, unreported, and unregulated (IUU) fishing seriously affects various +aspects of human life. However, current methods for detecting and monitoring +IUU activities at sea have limitations. While Synthetic Aperture Radar (SAR) +can complement existing vessel detection systems, extracting useful information +from SAR images using traditional methods, especially for IUU fishing +identification, poses challenges. This paper proposes a deep learning-based +system for detecting fishing activities. We implemented this system on the +xView3 dataset using six classical object detection models: Faster R-CNN, +Cascade R-CNN, SSD, RetinaNet, FSAF, and FCOS. We applied improvement methods +to enhance the performance of the Faster R-CNN model. Specifically, training +the Faster R-CNN model using Online Hard Example Mining (OHEM) strategy +improved the Avg-F1 value from 0.212 to 0.216, representing a 1.96% +improvement. + +
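+ For reference, the Online Hard Example Mining strategy mentioned above is,
+in its generic form, simply "keep only the hardest losses"; a minimal sketch
+is given below, not the exact detector training configuration used in the
+paper.
+
+    import torch
+    import torch.nn.functional as F
+
+    def ohem_classification_loss(logits, targets, keep_ratio=0.25):
+        # Compute the per-sample loss, keep the hardest fraction, and average
+        # only over those, so gradients focus on difficult examples.
+        per_sample = F.cross_entropy(logits, targets, reduction="none")
+        num_keep = max(1, int(keep_ratio * per_sample.numel()))
+        hard, _ = per_sample.topk(num_keep)
+        return hard.mean()
+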
+
+
+
+
+ + ☆ Flood Data Analysis on SpaceNet 8 Using Apache Sedona + + +
+ With the escalating frequency of floods posing persistent threats to human +life and property, satellite remote sensing has emerged as an indispensable +tool for monitoring flood hazards. SpaceNet8 offers a unique opportunity to +leverage cutting-edge artificial intelligence technologies to assess these +hazards. A significant contribution of this research is its application of +Apache Sedona, an advanced platform specifically designed for the efficient and +distributed processing of large-scale geospatial data. This platform aims to +enhance the efficiency of error analysis, a critical aspect of improving flood +damage detection accuracy. Based on Apache Sedona, we introduce a novel +approach that addresses the challenges associated with inaccuracies in flood +damage detection. This approach involves the retrieval of cases from historical +flood events, the adaptation of these cases to current scenarios, and the +revision of the model based on clustering algorithms to refine its performance. +Through the replication of both the SpaceNet8 baseline and its top-performing +models, we embark on a comprehensive error analysis. This analysis reveals +several main sources of inaccuracies. To address these issues, we employ data +visual interpretation and histogram equalization techniques, resulting in +significant improvements in model metrics. After these enhancements, our +indicators show a notable improvement, with precision up by 5%, F1 score by +2.6%, and IoU by 4.5%. This work highlights the importance of advanced +geospatial data processing tools, such as Apache Sedona. By improving the +accuracy and efficiency of flood detection, this research contributes to +safeguarding public safety and strengthening infrastructure resilience in +flood-prone areas, making it a valuable addition to the field of remote sensing +and disaster management. + +
+
+
+
+
+ + ☆ S$^2$Mamba: A Spatial-spectral State Space Model for Hyperspectral Image + Classification + + +
+ Land cover analysis using hyperspectral images (HSI) remains an open problem +due to their low spatial resolution and complex spectral information. Recent +studies are primarily dedicated to designing Transformer-based architectures +for spatial-spectral long-range dependencies modeling, which is computationally +expensive with quadratic complexity. Selective structured state space model +(Mamba), which is efficient for modeling long-range dependencies with linear +complexity, has recently shown promising progress. However, its potential in +hyperspectral image processing that requires handling numerous spectral bands +has not yet been explored. In this paper, we innovatively propose S$^2$Mamba, a +spatial-spectral state space model for hyperspectral image classification, to +excavate spatial-spectral contextual features, resulting in more efficient and +accurate land cover analysis. In S$^2$Mamba, two selective structured state +space models through different dimensions are designed for feature extraction, +one for spatial, and the other for spectral, along with a spatial-spectral +mixture gate for optimal fusion. More specifically, S$^2$Mamba first captures +spatial contextual relations by interacting each pixel with its adjacent +through a Patch Cross Scanning module and then explores semantic information +from continuous spectral bands through a Bi-directional Spectral Scanning +module. Considering the distinct expertise of the two attributes in homogenous +and complicated texture scenes, we realize the Spatial-spectral Mixture Gate by +a group of learnable matrices, allowing for the adaptive incorporation of +representations learned across different dimensions. Extensive experiments +conducted on HSI classification benchmarks demonstrate the superiority and +prospect of S$^2$Mamba. The code will be available at: +https://github.com/PURE-melo/S2Mamba. + +
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Paint by Inpaint: Learning to Add Image Objects by Removing Them First + + +
+ Image editing has advanced significantly with the introduction of +text-conditioned diffusion models. Despite this progress, seamlessly adding +objects to images based on textual instructions without requiring user-provided +input masks remains a challenge. We address this by leveraging the insight that +removing objects (Inpaint) is significantly simpler than its inverse process of +adding them (Paint), attributed to the utilization of segmentation mask +datasets alongside inpainting models that inpaint within these masks. +Capitalizing on this realization, by implementing an automated and extensive +pipeline, we curate a filtered large-scale image dataset containing pairs of +images and their corresponding object-removed versions. Using these pairs, we +train a diffusion model to inverse the inpainting process, effectively adding +objects into images. Unlike other editing datasets, ours features natural +target images instead of synthetic ones; moreover, it maintains consistency +between source and target by construction. Additionally, we utilize a large +Vision-Language Model to provide detailed descriptions of the removed objects +and a Large Language Model to convert these descriptions into diverse, +natural-language instructions. We show that the trained model surpasses +existing ones both qualitatively and quantitatively, and release the +large-scale dataset alongside the trained models for the community. + +
+
+
+
+
+ + ☆ Enhancing Action Recognition from Low-Quality Skeleton Data via + Part-Level Knowledge Distillation + + +
+ Skeleton-based action recognition is vital for comprehending human-centric
+videos and has applications in diverse domains. One of the challenges of
+skeleton-based action recognition is dealing with low-quality data, such as
+skeletons that have missing or inaccurate joints. This paper addresses the
+issue of enhancing action recognition using low-quality skeletons through a
+general knowledge distillation framework. The proposed framework employs a
+teacher-student model setup, where a teacher model trained on high-quality
+skeletons guides the learning of a student model that handles low-quality
+skeletons. To bridge the gap between heterogeneous high-quality and
+low-quality skeletons, we present a novel part-based skeleton matching
+strategy, which exploits shared body parts to facilitate local action pattern
+learning. An action-specific part matrix is developed to emphasize critical
+parts for different actions, enabling the student model to distill
+discriminative part-level knowledge. A novel part-level multi-sample
+contrastive loss achieves knowledge transfer from multiple high-quality
+skeletons to low-quality ones, which enables the proposed knowledge
+distillation framework to also train on low-quality skeletons that lack
+corresponding high-quality matches. Comprehensive experiments conducted on the
+NTU-RGB+D, Penn Action, and SYSU 3D HOI datasets demonstrate the effectiveness
+of the proposed knowledge distillation framework.
+
+
+
+
+
+ + ☆ LMM-PCQA: Assisting Point Cloud Quality Assessment with LMM + + +
+ Although large multi-modality models (LMMs) have seen extensive exploration +and application in various quality assessment studies, their integration into +Point Cloud Quality Assessment (PCQA) remains unexplored. Given LMMs' +exceptional performance and robustness in low-level vision and quality +assessment tasks, this study aims to investigate the feasibility of imparting +PCQA knowledge to LMMs through text supervision. To achieve this, we transform +quality labels into textual descriptions during the fine-tuning phase, enabling +LMMs to derive quality rating logits from 2D projections of point clouds. To +compensate for the loss of perception in the 3D domain, structural features are +extracted as well. These quality logits and structural features are then +combined and regressed into quality scores. Our experimental results affirm the +effectiveness of our approach, showcasing a novel integration of LMMs into PCQA +that enhances model understanding and assessment accuracy. We hope our +contributions can inspire subsequent investigations into the fusion of LMMs +with PCQA, fostering advancements in 3D visual quality analysis and beyond. + +
+
+
+
+
+ + ☆ Rethinking Attention Gated with Hybrid Dual Pyramid Transformer-CNN for + Generalized Segmentation in Medical Imaging + + +
+ Inspired by the success of Transformers in computer vision, Transformers
+have been widely investigated for medical image segmentation. However, most
+existing architectures use a Transformer either as the encoder or as a
+parallel encoder alongside a CNN encoder. In this paper, we introduce a novel
+hybrid CNN-Transformer segmentation architecture (PAG-TransYnet) designed for
+efficiently building a strong CNN-Transformer encoder. Our approach exploits
+attention gates within a Dual Pyramid hybrid encoder. The contributions of
+this methodology can be summarized into three key aspects: (i) the utilization
+of Pyramid input for highlighting the prominent features at different scales,
+(ii) the incorporation of a PVT transformer to capture long-range dependencies
+across various resolutions, and (iii) the implementation of a Dual-Attention
+Gate mechanism for effectively fusing prominent features from both CNN and
+Transformer branches. Through comprehensive evaluation across different
+segmentation tasks, including abdominal multi-organ segmentation, infection
+segmentation (COVID-19 and bone metastasis), and microscopic tissue
+segmentation (gland and nucleus), the proposed approach demonstrates
+state-of-the-art performance and exhibits remarkable generalization
+capabilities. This research represents a significant advancement towards
+addressing the pressing need for efficient and adaptable segmentation
+solutions in medical imaging applications.
+
+
+
+
+
+ + ☆ Permutation-equivariant quantum convolutional neural networks + + +
+ The symmetric group $S_{n}$ manifests itself in large classes of quantum
+systems as the invariance of certain characteristics of a quantum state with
+respect to permuting the qubits. The subgroups of $S_{n}$ arise, among many
+other contexts, to describe label symmetry of classical images with respect to
+spatial transformations, e.g. reflection or rotation. Equipped with the
+formalism of geometric quantum machine learning, in this work we propose the
+architectures of equivariant quantum convolutional neural networks (EQCNNs)
+adherent to $S_{n}$ and its subgroups. We demonstrate that a careful choice of
+pixel-to-qubit embedding order can facilitate easy construction of EQCNNs for
+small subgroups of $S_{n}$. Our novel EQCNN architecture corresponding to the
+full permutation group $S_{n}$ is built by applying all possible QCNNs with
+equal probability, which can also be conceptualized as a dropout strategy in
+quantum neural networks. For subgroups of $S_{n}$, our numerical results using
+MNIST datasets show better classification accuracy than non-equivariant QCNNs.
+The $S_{n}$-equivariant QCNN architecture shows significantly improved
+training and test performance compared to the non-equivariant QCNN for
+classification of connected and non-connected graphs. When trained with a
+sufficiently large amount of data, the $S_{n}$-equivariant QCNN shows better
+average performance compared to the $S_{n}$-equivariant QNN. These results
+contribute towards building powerful quantum machine learning architectures in
+permutation-symmetric systems.
+
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Assessing Image Quality Using a Simple Generative Representation + + +
+ Perceptual image quality assessment (IQA) is the task of predicting the +visual quality of an image as perceived by a human observer. Current +state-of-the-art techniques are based on deep representations trained in +discriminative manner. Such representations may ignore visually important +features, if they are not predictive of class labels. Recent generative models +successfully learn low-dimensional representations using auto-encoding and have +been argued to preserve better visual features. Here we leverage existing +auto-encoders and propose VAE-QA, a simple and efficient method for predicting +image quality in the presence of a full-reference. We evaluate our approach on +four standard benchmarks and find that it significantly improves generalization +across datasets, has fewer trainable parameters, a smaller memory footprint and +faster run time. + +
+
+
+
+
+ + ☆ Mamba-FETrack: Frame-Event Tracking via State Space Model + + +
+ RGB-Event based tracking is an emerging research topic, focusing on how to +effectively integrate heterogeneous multi-modal data (synchronized exposure +video frames and asynchronous pulse Event stream). Existing works typically +employ Transformer based networks to handle these modalities and achieve decent +accuracy through input-level or feature-level fusion on multiple datasets. +However, these trackers require significant memory consumption and +computational complexity due to the use of self-attention mechanism. This paper +proposes a novel RGB-Event tracking framework, Mamba-FETrack, based on the +State Space Model (SSM) to achieve high-performance tracking while effectively +reducing computational costs and realizing more efficient tracking. +Specifically, we adopt two modality-specific Mamba backbone networks to extract +the features of RGB frames and Event streams. Then, we also propose to boost +the interactive learning between the RGB and Event features using the Mamba +network. The fused features will be fed into the tracking head for target +object localization. Extensive experiments on FELT and FE108 datasets fully +validated the efficiency and effectiveness of our proposed tracker. +Specifically, our Mamba-based tracker achieves 43.5/55.6 on the SR/PR metric, +while the ViT-S based tracker (OSTrack) obtains 40.0/50.9. The GPU memory cost +of ours and ViT-S based tracker is 13.98GB and 15.44GB, which decreased about +$9.5\%$. The FLOPs and parameters of ours/ViT-S based OSTrack are 59GB/1076GB +and 7MB/60MB, which decreased about $94.5\%$ and $88.3\%$, respectively. We +hope this work can bring some new insights to the tracking field and greatly +promote the application of the Mamba architecture in tracking. The source code +of this work will be released on +\url{https://github.com/Event-AHU/Mamba_FETrack}. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ IMEX-Reg: Implicit-Explicit Regularization in the Function Space for + Continual Learning + + +
+ Continual learning (CL) remains one of the long-standing challenges for deep +neural networks due to catastrophic forgetting of previously acquired +knowledge. Although rehearsal-based approaches have been fairly successful in +mitigating catastrophic forgetting, they suffer from overfitting on buffered +samples and prior information loss, hindering generalization under low-buffer +regimes. Inspired by how humans learn using strong inductive biases, we propose +IMEX-Reg to improve the generalization performance of experience rehearsal in +CL under low buffer regimes. Specifically, we employ a two-pronged +implicit-explicit regularization approach using contrastive representation +learning (CRL) and consistency regularization. To further leverage the global +relationship between representations learned using CRL, we propose a +regularization strategy to guide the classifier toward the activation +correlations in the unit hypersphere of the CRL. Our results show that IMEX-Reg +significantly improves generalization performance and outperforms +rehearsal-based approaches in several CL scenarios. It is also robust to +natural and adversarial corruptions with less task-recency bias. Additionally, +we provide theoretical insights to support our design decisions further. + +
+
+ comment: Published in Transactions on Machine Learning Research +
+
+
+
+
+ + ☆ Event-based Video Frame Interpolation with Edge Guided Motion Refinement + + +
+ Video frame interpolation, the process of synthesizing intermediate frames +between sequential video frames, has made remarkable progress with the use of +event cameras. These sensors, with microsecond-level temporal resolution, fill +information gaps between frames by providing precise motion cues. However, +contemporary Event-Based Video Frame Interpolation (E-VFI) techniques often +neglect the fact that event data primarily supply high-confidence features at +scene edges during multi-modal feature fusion, thereby diminishing the role of +event signals in optical flow (OF) estimation and warping refinement. To +address this overlooked aspect, we introduce an end-to-end E-VFI learning +method (referred to as EGMR) to efficiently utilize edge features from event +signals for motion flow and warping enhancement. Our method incorporates an +Edge Guided Attentive (EGA) module, which rectifies estimated video motion +through attentive aggregation based on the local correlation of multi-modal +features in a coarse-to-fine strategy. Moreover, given that event data can +provide accurate visual references at scene edges between consecutive frames, +we introduce a learned visibility map derived from event data to adaptively +mitigate the occlusion problem in the warping refinement process. Extensive +experiments on both synthetic and real datasets show the effectiveness of the +proposed approach, demonstrating its potential for higher quality video frame +interpolation. + +
+
+
+
+
+ + ☆ ShapeMoiré: Channel-Wise Shape-Guided Network for Image Demoiréing + + +
+ Photographing optoelectronic displays often introduces unwanted moir\'e +patterns due to analog signal interference between the pixel grids of the +display and the camera sensor arrays. This work identifies two problems that +are largely ignored by existing image demoir\'eing approaches: 1) moir\'e +patterns vary across different channels (RGB); 2) repetitive patterns are +constantly observed. However, employing conventional convolutional (CNN) layers +cannot address these problems. Instead, this paper presents the use of our +recently proposed Shape concept. It was originally employed to model consistent +features from fragmented regions, particularly when identical or similar +objects coexist in an RGB-D image. Interestingly, we find that the Shape +information effectively captures the moir\'e patterns in artifact images. +Motivated by this discovery, we propose a ShapeMoir\'e method to aid in image +demoir\'eing. Beyond modeling shape features at the patch-level, we further +extend this to the global image-level and design a novel Shape-Architecture. +Consequently, our proposed method, equipped with both ShapeConv and +Shape-Architecture, can be seamlessly integrated into existing approaches +without introducing additional parameters or computation overhead during +inference. We conduct extensive experiments on four widely used datasets, and +the results demonstrate that our ShapeMoir\'e achieves state-of-the-art +performance, particularly in terms of the PSNR metric. We then apply our method +across four popular architectures to showcase its generalization capabilities. +Moreover, our ShapeMoir\'e is robust and viable under real-world demoir\'eing +scenarios involving smartphone photographs. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Masked Attention as a Mechanism for Improving Interpretability of Vision + Transformers + + +
+ Vision Transformers are at the heart of the current surge of interest in +foundation models for histopathology. They process images by breaking them into +smaller patches following a regular grid, regardless of their content. Yet, not +all parts of an image are equally relevant for its understanding. This is +particularly true in computational pathology where background is completely +non-informative and may introduce artefacts that could mislead predictions. To +address this issue, we propose a novel method that explicitly masks background +in Vision Transformers' attention mechanism. This ensures tokens corresponding +to background patches do not contribute to the final image representation, +thereby improving model robustness and interpretability. We validate our +approach using prostate cancer grading from whole-slide images as a case study. +Our results demonstrate that it achieves comparable performance with plain +self-attention while providing more accurate and clinically meaningful +attention heatmaps. + +
+
+ comment: Accepted at MIDL 2024 +
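+ The core mechanism, excluding background tokens from self-attention by
+assigning them minus-infinity scores before the softmax, can be sketched as
+follows; this is a generic illustration rather than the authors' exact
+implementation.
+
+    import torch
+
+    def masked_self_attention(q, k, v, keep_mask):
+        # q, k, v: (N, T, D); keep_mask: (N, T) boolean, True for tissue
+        # tokens (each row is assumed to keep at least one token).
+        d = q.shape[-1]
+        scores = q @ k.transpose(-2, -1) / d ** 0.5      # (N, T, T)
+        scores = scores.masked_fill(~keep_mask[:, None, :], float("-inf"))
+        attn = scores.softmax(dim=-1)                    # background gets weight 0
+        return attn @ v
+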
+
+
+
+
+ + ☆ RadSimReal: Bridging the Gap Between Synthetic and Real Data in Radar + Object Detection With Simulation CVPR 2024 + + +
+ Object detection in radar imagery with neural networks shows great potential
+for improving autonomous driving. However, obtaining annotated datasets from
+real radar images, crucial for training these networks, is challenging,
+especially in scenarios with long-range detection and adverse weather and
+lighting conditions where radar performance excels. To address this challenge,
+we present RadSimReal, an innovative physical radar simulation capable of
+generating synthetic radar images with accompanying annotations for various
+radar types and environmental conditions, all without the need for real data
+collection. Remarkably, our findings demonstrate that training object
+detection models on RadSimReal data and subsequently evaluating them on
+real-world data produces performance levels comparable to models trained and
+tested on real data from the same dataset, and even achieves better
+performance when testing across different real datasets. RadSimReal offers
+advantages over other physical radar simulations in that it does not require
+knowledge of the radar design details, which are often not disclosed by radar
+suppliers, and it has a faster run-time. This innovative tool has the
+potential to advance the development of computer vision algorithms for
+radar-based autonomous driving applications.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Compressed Deepfake Video Detection Based on 3D Spatiotemporal + Trajectories + + +
+ The misuse of deepfake technology by malicious actors poses a potential
+threat to nations, societies, and individuals. However, existing methods for
+detecting deepfakes primarily focus on uncompressed videos, relying on cues
+such as noise characteristics, local textures, or frequency statistics. When
+applied to compressed videos, these methods experience a decrease in detection
+performance and are less suitable for real-world scenarios. In this paper, we
+propose a deepfake video detection method based on 3D spatiotemporal
+trajectories. Specifically, we utilize a robust 3D model to construct
+spatiotemporal motion features, integrating feature details from both 2D and
+3D frames to mitigate the influence of large head rotation angles or
+insufficient lighting within frames. Furthermore, we separate facial
+expressions from head movements and design a sequential analysis method based
+on phase space motion trajectories to explore the feature differences between
+genuine and fake faces in deepfake videos. We conduct extensive experiments to
+validate the performance of our proposed method on several compressed deepfake
+benchmarks. The robustness of the well-designed features is verified by
+calculating the consistent distribution of facial landmarks before and after
+video compression. Our method yields satisfactory results and showcases its
+potential for practical applications.
+
+
+
+
+
+ + ☆ Tracking Transforming Objects: A Benchmark + + +
+ Tracking transforming objects holds significant importance in various fields
+due to the dynamic nature of many real-world scenarios. By enabling systems to
+accurately represent transforming objects over time, tracking transforming
+objects facilitates advancements in areas such as autonomous systems,
+human-computer interaction, and security applications. Moreover, understanding
+the behavior of transforming objects provides valuable insights into complex
+interactions or processes, contributing to the development of intelligent
+systems capable of robust and adaptive perception in dynamic environments.
+However, current research in the field mainly focuses on tracking generic
+objects. In this study, we bridge this gap by collecting a novel dedicated
+Dataset for Tracking Transforming Objects, called DTTO, which contains 100
+sequences, amounting to approximately 9.3K frames. We provide carefully
+hand-annotated bounding boxes for each frame within these sequences, making
+DTTO the pioneering benchmark dedicated to tracking transforming objects. We
+thoroughly evaluate 20 state-of-the-art trackers on the benchmark, aiming to
+comprehend the performance of existing methods and provide a comparison for
+future research on DTTO. With the release of DTTO, our goal is to facilitate
+further research and applications related to tracking transforming objects.
+
+
+
+
+
+ + ☆ SafePaint: Anti-forensic Image Inpainting with Domain Adaptation + + +
+ Existing image inpainting methods have achieved remarkable accomplishments in +generating visually appealing results, often accompanied by a trend toward +creating more intricate structural textures. However, while these models excel +at creating more realistic image content, they often leave noticeable traces of +tampering, posing a significant threat to security. In this work, we take the +anti-forensic capabilities into consideration, firstly proposing an end-to-end +training framework for anti-forensic image inpainting named SafePaint. +Specifically, we innovatively formulated image inpainting as two major tasks: +semantically plausible content completion and region-wise optimization. The +former is similar to current inpainting methods that aim to restore the missing +regions of corrupted images. The latter, through domain adaptation, endeavors +to reconcile the discrepancies between the inpainted region and the unaltered +area to achieve anti-forensic goals. Through comprehensive theoretical +analysis, we validate the effectiveness of domain adaptation for anti-forensic +performance. Furthermore, we meticulously crafted a region-wise separated +attention (RWSA) module, which not only aligns with our objective of +anti-forensics but also enhances the performance of the model. Extensive +qualitative and quantitative evaluations show our approach achieves comparable +results to existing image inpainting methods while offering anti-forensic +capabilities not available in other methods. + +
+
+
+
+
+ + ☆ Deep Boosting Learning: A Brand-new Cooperative Approach for Image-Text + Matching + + +
+ Image-text matching remains a challenging task due to heterogeneous semantic +diversity across modalities and insufficient distance separability within +triplets. Different from previous approaches focusing on enhancing multi-modal +representations or exploiting cross-modal correspondence for more accurate +retrieval, in this paper we aim to leverage the knowledge transfer between peer +branches in a boosting manner to seek a more powerful matching model. +Specifically, we propose a brand-new Deep Boosting Learning (DBL) algorithm, +where an anchor branch is first trained to provide insights into the data +properties, with a target branch gaining more advanced knowledge to develop +optimal features and distance metrics. Concretely, an anchor branch initially +learns the absolute or relative distance between positive and negative pairs, +providing a foundational understanding of the particular network and data +distribution. Building upon this knowledge, a target branch is concurrently +tasked with more adaptive margin constraints to further enlarge the relative +distance between matched and unmatched samples. Extensive experiments validate +that our DBL can achieve impressive and consistent improvements based on +various recent state-of-the-art models in the image-text matching field, and +outperform related popular cooperative strategies, e.g., Conventional +Distillation, Mutual Learning, and Contrastive Learning. Beyond the above, we +confirm that DBL can be seamlessly integrated into their training scenarios and +achieve superior performance under the same computational costs, demonstrating +the flexibility and broad applicability of our proposed method. Our code is +publicly available at: https://github.com/Paranioar/DBL. + +
+
+ comment: 12 pages, 9 figures, Accepted by TIP2024 +
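+ A loose sketch of the anchor/target idea described above, two hinge-style
+ranking losses where the target branch must satisfy a larger margin, is given
+below; the paper's adaptive margin schedule is not reproduced and the function
+names are illustrative only.
+
+    import torch.nn.functional as F
+
+    def dual_branch_ranking_loss(anchor_pos, anchor_neg, target_pos, target_neg,
+                                 base_margin=0.2, extra_margin=0.1):
+        # *_pos / *_neg are similarity scores of matched / unmatched pairs for
+        # each branch; the target branch is pushed toward a wider separation.
+        loss_anchor = F.relu(base_margin + anchor_neg - anchor_pos).mean()
+        loss_target = F.relu(base_margin + extra_margin + target_neg - target_pos).mean()
+        return loss_anchor + loss_target
+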
+
+
+
+
+ + ☆ Garbage Segmentation and Attribute Analysis by Robotic Dogs + + +
+ Efficient waste management and recycling heavily rely on garbage exploration +and identification. In this study, we propose GSA2Seg (Garbage Segmentation and +Attribute Analysis), a novel visual approach that utilizes quadruped robotic +dogs as autonomous agents to address waste management and recycling challenges +in diverse indoor and outdoor environments. Equipped with advanced visual +perception system, including visual sensors and instance segmentators, the +robotic dogs adeptly navigate their surroundings, diligently searching for +common garbage items. Inspired by open-vocabulary algorithms, we introduce an +innovative method for object attribute analysis. By combining garbage +segmentation and attribute analysis techniques, the robotic dogs accurately +determine the state of the trash, including its position and placement +properties. This information enhances the robotic arm's grasping capabilities, +facilitating successful garbage retrieval. Additionally, we contribute an image +dataset, named GSA2D, to support evaluation. Through extensive experiments on +GSA2D, this paper provides a comprehensive analysis of GSA2Seg's effectiveness. +Dataset available: +\href{https://www.kaggle.com/datasets/hellob/gsa2d-2024}{https://www.kaggle.com/datasets/hellob/gsa2d-2024}. + +
+
+
+
+
+ + ☆ Finding Beautiful and Happy Images for Mental Health and Well-being + Applications + + +
+ This paper explores how artificial intelligence (AI) technology can
+contribute to achieving progress on good health and well-being, one of the
+United Nations' 17 Sustainable Development Goals. It is estimated that one in
+ten of the global population lives with a mental disorder. Inspired by studies
+showing that engaging with and viewing beautiful natural images can make
+people feel happier and less stressed, lead to higher emotional well-being,
+and can even have therapeutic value, we explore how AI can help to promote
+mental health by developing automatic algorithms for finding beautiful and
+happy images. We first construct a large image database consisting of nearly
+20K very high resolution colour photographs of natural scenes where each image
+is labelled with beautifulness and happiness scores by about 10 observers.
+Statistics of the database show that there is a good correlation between the
+beautifulness and happiness scores, which provides anecdotal evidence to
+corroborate that engaging with beautiful natural images can potentially
+benefit mental well-being. Building on this unique database, the very first of
+its kind, we have developed a deep learning based model for automatically
+predicting the beautifulness and happiness scores of natural images.
+Experimental results are presented to show that it is possible to develop AI
+algorithms to automatically assess an image's beautifulness and happiness
+values, which can in turn be used to develop applications for promoting mental
+health and well-being.
+
+
+
+
+
+ + ☆ Semi-supervised Text-based Person Search + + +
+ Text-based person search (TBPS) aims to retrieve images of a specific person +from a large image gallery based on a natural language description. Existing +methods rely on massive annotated image-text data to achieve satisfactory +performance in fully-supervised learning. It poses a significant challenge in +practice, as acquiring person images from surveillance videos is relatively +easy, while obtaining annotated texts is challenging. The paper undertakes a +pioneering initiative to explore TBPS under the semi-supervised setting, where +only a limited number of person images are annotated with textual descriptions +while the majority of images lack annotations. We present a two-stage basic +solution based on generation-then-retrieval for semi-supervised TBPS. The +generation stage enriches annotated data by applying an image captioning model +to generate pseudo-texts for unannotated images. Later, the retrieval stage +performs fully-supervised retrieval learning using the augmented data. +Significantly, considering the noise interference of the pseudo-texts on +retrieval learning, we propose a noise-robust retrieval framework that enhances +the ability of the retrieval model to handle noisy data. The framework +integrates two key strategies: Hybrid Patch-Channel Masking (PC-Mask) to refine +the model architecture, and Noise-Guided Progressive Training (NP-Train) to +enhance the training process. PC-Mask performs masking on the input data at +both the patch-level and the channel-level to prevent overfitting noisy +supervision. NP-Train introduces a progressive training schedule based on the +noise level of pseudo-texts to facilitate noise-robust learning. Extensive +experiments on multiple TBPS benchmarks show that the proposed framework +achieves promising performance under the semi-supervised setting. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Snake with Shifted Window: Learning to Adapt Vessel Pattern for OCTA + Segmentation + + +
+ Segmenting specific targets or structures in optical coherence tomography
+angiography (OCTA) images is fundamental for conducting further pathological
+studies. The retinal vascular layers are rich and intricate, and such
+vasculature with complex shapes can be captured by the widely-studied OCTA
+images. In this paper, we thus study how to use OCTA images with projection
+vascular layers to segment retinal structures. To this end, we propose the
+SSW-OCTA model, which integrates the advantages of deformable convolutions
+suited for tubular structures and the swin-transformer for global feature
+extraction, adapting to the characteristics of OCTA modality images. Our model
+underwent testing and comparison on the OCTA-500 dataset, achieving
+state-of-the-art performance. The code is available at:
+https://github.com/ShellRedia/Snake-SWin-OCTA.
+
+
+
+
+
+
+ ☆ Online, Target-Free LiDAR-Camera Extrinsic Calibration via Cross-Modal
+ Mask Matching
+
+
+
+ LiDAR-camera extrinsic calibration (LCEC) is crucial for data fusion in +intelligent vehicles. Offline, target-based approaches have long been the +preferred choice in this field. However, they often demonstrate poor +adaptability to real-world environments. This is largely because extrinsic +parameters may change significantly due to moderate shocks or during extended +operations in environments with vibrations. In contrast, online, target-free +approaches provide greater adaptability yet typically lack robustness, +primarily due to the challenges in cross-modal feature matching. Therefore, in +this article, we unleash the full potential of large vision models (LVMs), +which are emerging as a significant trend in the fields of computer vision and +robotics, especially for embodied artificial intelligence, to achieve robust +and accurate online, target-free LCEC across a variety of challenging +scenarios. Our main contributions are threefold: we introduce a novel framework +known as MIAS-LCEC, provide an open-source versatile calibration toolbox with +an interactive visualization interface, and publish three real-world datasets +captured from various indoor and outdoor environments. The cornerstone of our +framework and toolbox is the cross-modal mask matching (C3M) algorithm, +developed based on a state-of-the-art (SoTA) LVM and capable of generating +sufficient and reliable matches. Extensive experiments conducted on these +real-world datasets demonstrate the robustness of our approach and its superior +performance compared to SoTA methods, particularly for the solid-state LiDARs +with super-wide fields of view. + +
+
+
+
+
+ + ☆ Quantized Context Based LIF Neurons for Recurrent Spiking Neural + Networks in 45nm + + +
+ In this study, we propose the first hardware implementation of a
+context-based recurrent spiking neural network (RSNN), emphasizing the
+integration of dual information streams within neocortical pyramidal neurons,
+specifically the Context-Dependent Leaky Integrate-and-Fire (CLIF) neuron
+model, an essential element of RSNNs. We present a quantized version of the
+CLIF neuron (qCLIF), developed through a hardware-software codesign approach
+that utilizes the sparse activity of RSNNs. Implemented in a 45nm technology
+node, the qCLIF is compact (900um^2) and achieves a high accuracy of 90%
+despite 8-bit quantization on the DVS gesture classification dataset. Our
+analysis spans network configurations from 10 to 200 qCLIF neurons, supporting
+up to 82k synapses within a 1.86 mm^2 footprint, demonstrating scalability and
+efficiency.
+
+
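+
+ A purely illustrative software sketch of a context-gated LIF update with
+ uniform quantization; the gating function, decay, threshold, and bit width
+ below are assumptions and do not reflect the actual 45nm circuit:
+
+ import numpy as np
+
+ def quantize(x, bits=8, scale=2.0):
+     """Uniform symmetric quantization, a stand-in for a fixed-point datapath."""
+     qmax = 2 ** (bits - 1) - 1
+     return np.clip(np.round(x / scale * qmax), -qmax, qmax) * scale / qmax
+
+ def clif_step(v, ff_in, ctx_in, decay=0.9, threshold=1.0, bits=8):
+     """One step of a context-dependent LIF neuron: the context stream gates the
+     feedforward drive, loosely mimicking apical-dendrite modulation."""
+     drive = quantize(ff_in * (1.0 + np.tanh(ctx_in)), bits)  # context-gated input
+     v = quantize(decay * v + drive, bits)                    # leaky integration
+     spike = (v >= threshold).astype(np.float32)
+     v = v * (1.0 - spike)                                    # hard reset after a spike
+     return v, spike
+
+ v = np.zeros(4)
+ for t in range(5):
+     v, s = clif_step(v, ff_in=np.random.rand(4), ctx_in=np.random.rand(4))
+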
+
+ comment: 7 Pages, 7 Figures, 2 Tables +
+
+
+
+
+ + ☆ Grounded Compositional and Diverse Text-to-3D with Pretrained Multi-View + Diffusion Model + + +
+ In this paper, we propose an effective two-stage approach named
+Grounded-Dreamer to generate 3D assets that can accurately follow complex,
+compositional text prompts while achieving high fidelity by using a pre-trained
+multi-view diffusion model. Multi-view diffusion models, such as MVDream, have
+been shown to generate high-fidelity 3D assets using score distillation
+sampling (SDS). However, applied naively, these methods often fail to
+comprehend compositional text prompts, and may entirely omit certain subjects
+or parts. To address this issue, we first advocate leveraging text-guided
+4-view images as the bottleneck in the text-to-3D pipeline. We then introduce
+an attention refocusing mechanism to encourage text-aligned 4-view image
+generation, without the need to re-train the multi-view diffusion model or
+craft a high-quality compositional 3D dataset. We further propose a hybrid
+optimization strategy to encourage synergy between the SDS loss and the sparse
+RGB reference images. Our method consistently outperforms previous
+state-of-the-art (SOTA) methods in generating compositional 3D assets,
+excelling in both quality and accuracy, and enabling diverse 3D outputs from
+the same text prompt.
+
+
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ Compressed Image Captioning using CNN-based Encoder-Decoder Framework + + +
+ In today's world, image processing plays a crucial role across various
+fields, from scientific research to industrial applications. One particularly
+exciting application is image captioning. The potential impact of effective
+image captioning is vast. It can significantly boost the accuracy of search
+engines, making it easier to find relevant information. Moreover, it can
+greatly enhance accessibility for visually impaired individuals, providing them
+with a more immersive experience of digital content. However, despite its
+promise, image captioning presents several challenges. One major hurdle is
+extracting meaningful visual information from images and transforming it into
+coherent language. This requires bridging the gap between the visual and
+linguistic domains, a task that demands sophisticated algorithms and models.
+Our project addresses these challenges by developing an automatic image
+captioning architecture that combines the strengths of convolutional neural
+networks (CNNs) and encoder-decoder models. The CNN model is used to extract
+the visual features from images, and captions are then generated with the help
+of the encoder-decoder framework. We also performed a comparison of several
+pre-trained CNN architectures to understand their performance variations. In
+our quest for optimization, we also explored the integration of frequency
+regularization techniques to compress the "AlexNet" and "EfficientNetB0"
+models. We aimed to see whether these compressed models could maintain their
+effectiveness in generating image captions while being more resource-efficient.
+
+
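+
+ A minimal sketch of the generic CNN-encoder / sequence-decoder captioning
+ framework described above; a toy CNN stands in for AlexNet/EfficientNetB0 and
+ all names are illustrative:
+
+ import torch
+ import torch.nn as nn
+
+ class CaptionModel(nn.Module):
+     def __init__(self, vocab_size, embed_dim=256, hidden_dim=512):
+         super().__init__()
+         self.encoder = nn.Sequential(              # stand-in CNN feature extractor
+             nn.Conv2d(3, 64, 3, stride=2, padding=1), nn.ReLU(),
+             nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU(),
+             nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+             nn.Linear(128, embed_dim),
+         )
+         self.embed = nn.Embedding(vocab_size, embed_dim)
+         self.decoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
+         self.head = nn.Linear(hidden_dim, vocab_size)
+
+     def forward(self, images, captions):
+         img_feat = self.encoder(images).unsqueeze(1)          # (B, 1, E) visual token
+         tok = self.embed(captions[:, :-1])                    # teacher forcing
+         out, _ = self.decoder(torch.cat([img_feat, tok], 1))  # image feature prepended
+         return self.head(out)                                 # (B, T, vocab) logits
+
+ model = CaptionModel(vocab_size=1000)
+ logits = model(torch.randn(2, 3, 224, 224), torch.randint(0, 1000, (2, 12)))
+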
+
+
+
+
+ + ☆ Prompt Customization for Continual Learning ACM MM + + +
+ Contemporary continual learning approaches typically select prompts from a
+pool, which function as supplementary inputs to a pre-trained model. However,
+this strategy is hindered by the inherent noise of its selection approach when
+handling an increasing number of tasks. In response to these challenges, we
+reformulate the prompting approach for continual learning and propose the
+prompt customization (PC) method. PC mainly comprises a prompt generation
+module (PGM) and a prompt modulation module (PMM). In contrast to conventional
+methods that employ hard prompt selection, PGM assigns different coefficients
+to prompts from a fixed-sized pool of prompts and generates tailored prompts.
+Moreover, PMM further modulates the prompts by adaptively assigning weights
+according to the correlations between input data and corresponding prompts. We
+evaluate our method on four benchmark datasets for three diverse settings,
+including the class, domain, and task-agnostic incremental learning tasks.
+Experimental results demonstrate that the proposed method yields consistent
+improvements (by up to 16.2%) over state-of-the-art (SOTA) techniques.
+
+
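+
+ A rough PyTorch sketch of the soft prompt-generation idea, replacing hard
+ prompt selection with coefficient-weighted mixing over a fixed pool (module
+ and argument names are assumptions):
+
+ import torch
+ import torch.nn as nn
+
+ class PromptGenerator(nn.Module):
+     def __init__(self, pool_size=10, prompt_len=8, dim=768):
+         super().__init__()
+         self.pool = nn.Parameter(torch.randn(pool_size, prompt_len, dim) * 0.02)
+         self.coef_net = nn.Linear(dim, pool_size)  # coefficients from the input feature
+
+     def forward(self, feat):                       # feat: (B, dim), e.g. a [CLS] embedding
+         coef = torch.softmax(self.coef_net(feat), dim=-1)    # (B, pool_size)
+         return torch.einsum('bp,pld->bld', coef, self.pool)  # tailored prompts (B, L, D)
+
+ pg = PromptGenerator()
+ prompts = pg(torch.randn(4, 768))  # prepend to the frozen backbone's token sequence
+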
+
+ comment: ACM MM +
+
+
+
+
+ + ☆ Joint Reference Frame Synthesis and Post Filter Enhancement for + Versatile Video Coding + + +
+ This paper presents the joint reference frame synthesis (RFS) and +post-processing filter enhancement (PFE) for Versatile Video Coding (VVC), +aiming to explore the combination of different neural network-based video +coding (NNVC) tools to better utilize the hierarchical bi-directional coding +structure of VVC. Both RFS and PFE utilize the Space-Time Enhancement Network +(STENet), which receives two input frames with artifacts and produces two +enhanced frames with suppressed artifacts, along with an intermediate +synthesized frame. STENet comprises two pipelines, the synthesis pipeline and +the enhancement pipeline, tailored for different purposes. During RFS, two +reconstructed frames are sent into STENet's synthesis pipeline to synthesize a +virtual reference frame, similar to the current to-be-coded frame. The +synthesized frame serves as an additional reference frame inserted into the +reference picture list (RPL). During PFE, two reconstructed frames are fed into +STENet's enhancement pipeline to alleviate their artifacts and distortions, +resulting in enhanced frames with reduced artifacts and distortions. To reduce +inference complexity, we propose joint inference of RFS and PFE (JISE), +achieved through a single execution of STENet. Integrated into the VVC +reference software VTM-15.0, RFS, PFE, and JISE are coordinated within a novel +Space-Time Enhancement Window (STEW) under Random Access (RA) configuration. +The proposed method could achieve -7.34%/-17.21%/-16.65% PSNR-based BD-rate on +average for three components under RA configuration. + +
+
+
+
+
+ + ☆ Exposing Text-Image Inconsistency Using Diffusion Models + + +
+ In the battle against widespread online misinformation, a growing problem is
+text-image inconsistency, where images are misleadingly paired with texts with
+different intent or meaning. Existing classification-based methods for
+text-image inconsistency can identify contextual inconsistencies but fail to
+provide explainable justifications for their decisions that humans can
+understand. Although more nuanced, human evaluation is impractical at scale and
+susceptible to errors. To address these limitations, this study introduces
+D-TIIL (Diffusion-based Text-Image Inconsistency Localization), which employs
+text-to-image diffusion models to localize semantic inconsistencies in text and
+image pairs. These models, trained on large-scale datasets, act as "omniscient"
+agents that filter out irrelevant information and incorporate background
+knowledge to identify inconsistencies. In addition, D-TIIL uses text embeddings
+and modified image regions to visualize these inconsistencies. To evaluate
+D-TIIL's efficacy, we introduce a new TIIL dataset containing 14K consistent
+and inconsistent text-image pairs. Unlike existing datasets, TIIL enables
+assessment at the level of individual words and image regions and is carefully
+designed to represent various inconsistencies. D-TIIL offers a scalable and
+evidence-based approach to identifying and localizing text-image inconsistency,
+providing a robust framework for future research combating misinformation.
+
+
+
+
+
+
+ + ☆ Revealing the Two Sides of Data Augmentation: An Asymmetric + Distillation-based Win-Win Solution for Open-Set Recognition + + +
+ In this paper, we reveal the two sides of data augmentation: enhancements in
+closed-set recognition correlate with a significant decrease in open-set
+recognition. Through empirical investigation, we find that multi-sample-based
+augmentations contribute to reducing feature discrimination, thereby
+diminishing the open-set criteria. Although knowledge distillation could impair
+the feature via imitation, the mixed feature with ambiguous semantics hinders
+the distillation. To this end, we propose an asymmetric distillation framework
+that feeds the teacher model extra raw data to enlarge the benefit of the
+teacher. Moreover, a joint mutual information loss and a selective relabel
+strategy are utilized to alleviate the influence of hard mixed samples. Our
+method successfully mitigates the decline in open-set recognition and
+outperforms SOTA methods by 2%-3% AUROC on the Tiny-ImageNet dataset, and
+experiments on the large-scale ImageNet-21K dataset demonstrate the
+generalization of our method.
+
+
+
+
+
+
+ + ☆ Improve Academic Query Resolution through BERT-based Question Extraction + from Images + + +
+ Providing fast and accurate resolution of student queries is an essential
+service provided by Edtech organizations. This is generally offered through a
+chatbot-like interface that enables students to ask questions easily. One
+preferred format for student queries is images, as it allows students to
+capture and post questions without typing complex equations and information.
+However, this format also presents difficulties, as images may contain multiple
+questions or textual noise that lowers the accuracy of existing single-query
+answering solutions. In this paper, we propose a method for extracting
+questions from text or images using a BERT-based deep learning model and
+compare it to other rule-based and layout-based methods. Our method aims to
+improve the accuracy and efficiency of student query resolution in Edtech
+organizations.
+
+
+
+
+
+
+ + ♻ ☆ RTA-Former: Reverse Transformer Attention for Polyp Segmentation + + +
+ Polyp segmentation is a key aspect of colorectal cancer prevention, enabling
+early detection and guiding subsequent treatments. Intelligent diagnostic
+tools, including deep learning solutions, are widely explored to streamline and
+potentially automate this process. However, even with many powerful network
+architectures, producing accurate edge segmentation remains a challenge. In
+this paper, we introduce a novel network, namely RTA-Former, that employs a
+transformer model as the encoder backbone and innovatively adapts Reverse
+Attention (RA) with a transformer stage in the decoder for enhanced edge
+segmentation. The results of the experiments illustrate that RTA-Former
+achieves state-of-the-art (SOTA) performance on five polyp segmentation
+datasets. The strong capability of RTA-Former holds promise in improving the
+accuracy of Transformer-based polyp segmentation, potentially leading to better
+clinical decisions and patient outcomes. Our code is publicly available on
+GitHub.
+
+
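+
+ For reference, the classical reverse-attention operation that such decoders
+ adapt can be sketched as follows (a generic version; RTA-Former's
+ transformer-stage variant may differ):
+
+ import torch
+ import torch.nn.functional as F
+
+ def reverse_attention(features, coarse_logits):
+     """features: (B, C, H, W) decoder features; coarse_logits: (B, 1, h, w)
+     coarse segmentation prediction from a deeper stage."""
+     pred = torch.sigmoid(F.interpolate(coarse_logits, size=features.shape[-2:],
+                                        mode='bilinear', align_corners=False))
+     rev = 1.0 - pred          # emphasize regions the coarse prediction missed (edges)
+     return features * rev     # broadcast over channels
+
+ feat = torch.randn(2, 64, 44, 44)
+ coarse = torch.randn(2, 1, 11, 11)
+ refined = reverse_attention(feat, coarse)
+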
+
+ comment: The paper has been accepted by EMBC 2024 +
+
+
+
+
+ + ♻ ☆ KS-APR: Keyframe Selection for Robust Absolute Pose Regression + + +
+ Markerless Mobile Augmented Reality (AR) aims to anchor digital content in +the physical world without using specific 2D or 3D objects. Absolute Pose +Regressors (APR) are end-to-end machine learning solutions that infer the +device's pose from a single monocular image. Thanks to their low computation +cost, they can be directly executed on the constrained hardware of mobile AR +devices. However, APR methods tend to yield significant inaccuracies for input +images that are too distant from the training set. This paper introduces +KS-APR, a pipeline that assesses the reliability of an estimated pose with +minimal overhead by combining the inference results of the APR and the prior +images in the training set. Mobile AR systems tend to rely upon visual-inertial +odometry to track the relative pose of the device during the experience. As +such, KS-APR favours reliability over frequency, discarding unreliable poses. +This pipeline can integrate most existing APR methods to improve accuracy by +filtering unreliable images with their pose estimates. We implement the +pipeline on three types of APR models on indoor and outdoor datasets. The +median error on position and orientation is reduced for all models, and the +proportion of large errors is minimized across datasets. Our method enables +state-of-the-art APRs such as DFNetdm to outperform single-image and sequential +APR methods. These results demonstrate the scalability and effectiveness of +KS-APR for visual localization tasks that do not require one-shot decisions. + +
+
+
+
+
+ + ♻ ☆ Forensic Iris Image-Based Post-Mortem Interval Estimation + + +
+ Post-mortem iris recognition is an emerging application of iris-based human
+identification in a forensic setup. One factor that may be useful in
+conditioning iris recognition methods is the tissue decomposition level, which
+is correlated with the post-mortem interval (PMI), i.e., the number of hours
+that have elapsed since death. PMI, however, is not always available, and its
+precise estimation remains one of the core challenges in forensic examination.
+This paper presents the first method known to us for estimating PMI directly
+from forensic iris images. To assess the feasibility of iris-based PMI
+estimation, convolutional neural network-based models (VGG19, DenseNet121,
+ResNet152, and Inception_v3) were trained to predict the PMI from (a)
+near-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris
+images. Models were evaluated following a 10-fold cross-validation in (S1)
+sample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We
+found that using the multispectral data offers a remarkably low mean absolute
+error (MAE) of approximately 3.5 hours in scenario (S1), a higher MAE of
+approximately 17.5 hours in scenario (S2), and an MAE of approximately 69.0
+hours in scenario (S3). This suggests that if the environmental conditions are
+favorable (e.g., bodies are kept in low temperatures), forensic iris images
+provide features that are indicative of the PMI and can be automatically
+estimated. The source code and model weights are made available with the
+paper.
+
+
+
+
+
+
+ + ♻ ☆ Deciphering Heartbeat Signatures: A Vision Transformer Approach to + Explainable Atrial Fibrillation Detection from ECG Signals + + +
+ Remote patient monitoring based on wearable single-lead electrocardiogram +(ECG) devices has significant potential for enabling the early detection of +heart disease, especially in combination with artificial intelligence (AI) +approaches for automated heart disease detection. There have been prior studies +applying AI approaches based on deep learning for heart disease detection. +However, these models are yet to be widely accepted as a reliable aid for +clinical diagnostics, in part due to the current black-box perception +surrounding many AI algorithms. In particular, there is a need to identify the +key features of the ECG signal that contribute toward making an accurate +diagnosis, thereby enhancing the interpretability of the model. In the present +study, we develop a vision transformer approach to identify atrial fibrillation +based on single-lead ECG data. A residual network (ResNet) approach is also +developed for comparison with the vision transformer approach. These models are +applied to the Chapman-Shaoxing dataset to classify atrial fibrillation, as +well as another common arrhythmia, sinus bradycardia, and normal sinus rhythm +heartbeats. The models enable the identification of the key regions of the +heartbeat that determine the resulting classification, and highlight the +importance of P-waves and T-waves, as well as heartbeat duration and signal +amplitude, in distinguishing normal sinus rhythm from atrial fibrillation and +sinus bradycardia. + +
+
+ comment: Accepted for publication at the 46th Annual International Conference + of the IEEE Engineering in Medicine and Biology Society, IEEE EMBC 2024 +
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Orthogonal Finetuning via Butterfly Factorization ICLR 2024 + + +
+ Large foundation models are becoming ubiquitous, but training them from +scratch is prohibitively expensive. Thus, efficiently adapting these powerful +models to downstream tasks is increasingly important. In this paper, we study a +principled finetuning paradigm -- Orthogonal Finetuning (OFT) -- for downstream +task adaptation. Despite demonstrating good generalizability, OFT still uses a +fairly large number of trainable parameters due to the high dimensionality of +orthogonal matrices. To address this, we start by examining OFT from an +information transmission perspective, and then identify a few key desiderata +that enable better parameter-efficiency. Inspired by how the Cooley-Tukey fast +Fourier transform algorithm enables efficient information transmission, we +propose an efficient orthogonal parameterization using butterfly structures. We +apply this parameterization to OFT, creating a novel parameter-efficient +finetuning method, called Orthogonal Butterfly (BOFT). By subsuming OFT as a +special case, BOFT introduces a generalized orthogonal finetuning framework. +Finally, we conduct an extensive empirical study of adapting large vision +transformers, large language models, and text-to-image diffusion models to +various downstream tasks in vision and language. + +
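+
+ A small numpy illustration of the butterfly idea: an orthogonal matrix built
+ as a product of log2(d) sparse butterfly factors, so only O(d log d) rotation
+ angles are needed instead of O(d^2) matrix entries. This is a conceptual
+ sketch, not the BOFT implementation:
+
+ import numpy as np
+
+ def butterfly_factor(d, stride, thetas):
+     """Orthogonal factor applying 2x2 rotations to index pairs (i, i + stride)."""
+     B = np.zeros((d, d))
+     t = iter(thetas)
+     for block in range(0, d, 2 * stride):
+         for i in range(block, block + stride):
+             th = next(t)
+             c, s = np.cos(th), np.sin(th)
+             j = i + stride
+             B[i, i], B[i, j] = c, -s
+             B[j, i], B[j, j] = s, c
+     return B
+
+ def butterfly_orthogonal(d, rng):
+     """Product of log2(d) butterfly factors, each with d/2 rotation angles."""
+     R, stride = np.eye(d), 1
+     while stride < d:
+         R = butterfly_factor(d, stride, rng.uniform(-np.pi, np.pi, d // 2)) @ R
+         stride *= 2
+     return R
+
+ R = butterfly_orthogonal(8, np.random.default_rng(0))
+ print(np.allclose(R @ R.T, np.eye(8)))  # True: R is orthogonal
+ # In BOFT-style finetuning, a frozen weight matrix would be multiplied by such
+ # an orthogonal matrix whose few rotation angles are the trainable parameters.
+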
+
+ comment: ICLR 2024 (v2: 34 pages, 19 figures) +
+
+
+
+
+ + ♻ ☆ HyperSDFusion: Bridging Hierarchical Structures in Language and Geometry + for Enhanced 3D Text2Shape Generation + + +
+ 3D shape generation from text is a fundamental task in 3D representation
+learning. The text-shape pairs exhibit a hierarchical structure, where a
+general text like "chair" covers all 3D shapes of the chair, while more
+detailed prompts refer to more specific shapes. Furthermore, both text and 3D
+shapes are inherently hierarchical structures. However, existing Text2Shape
+methods, such as SDFusion, do not exploit this. In this work, we propose
+HyperSDFusion, a dual-branch diffusion model that generates 3D shapes from a
+given text. Since hyperbolic space is suitable for handling hierarchical data,
+we propose to learn the hierarchical representations of text and 3D shapes in
+hyperbolic space. First, we introduce a hyperbolic text-image encoder to learn
+the sequential and multi-modal hierarchical features of text in hyperbolic
+space. In addition, we design a hyperbolic text-graph convolution module to
+learn the hierarchical features of text in hyperbolic space. In order to fully
+utilize these text features, we introduce a dual-branch structure to embed text
+features in 3D feature space. Finally, to endow the generated 3D shapes with a
+hierarchical structure, we devise a hyperbolic hierarchical loss. Our method is
+the first to explore the hyperbolic hierarchical representation for
+text-to-shape generation. Experiments on the existing paired text-to-shape
+dataset, Text2Shape, achieve state-of-the-art results. We release our
+implementation at HyperSDFusion.github.io.
+
+
+
+
+
+
+ + ♻ ☆ Multi-View Representation is What You Need for Point-Cloud Pre-Training ICLR 2024 + + +
+ A promising direction for pre-training 3D point clouds is to leverage the +massive amount of data in 2D, whereas the domain gap between 2D and 3D creates +a fundamental challenge. This paper proposes a novel approach to point-cloud +pre-training that learns 3D representations by leveraging pre-trained 2D +networks. Different from the popular practice of predicting 2D features first +and then obtaining 3D features through dimensionality lifting, our approach +directly uses a 3D network for feature extraction. We train the 3D feature +extraction network with the help of the novel 2D knowledge transfer loss, which +enforces the 2D projections of the 3D feature to be consistent with the output +of pre-trained 2D networks. To prevent the feature from discarding 3D signals, +we introduce the multi-view consistency loss that additionally encourages the +projected 2D feature representations to capture pixel-wise correspondences +across different views. Such correspondences induce 3D geometry and effectively +retain 3D features in the projected 2D features. Experimental results +demonstrate that our pre-trained model can be successfully transferred to +various downstream tasks, including 3D shape classification, part segmentation, +3D object detection, and semantic segmentation, achieving state-of-the-art +performance. + +
+
+ comment: Published in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ 3D Feature Prediction for Masked-AutoEncoder-Based Point Cloud + Pretraining ICLR 2024 + + +
+ Masked autoencoders (MAE) have recently been introduced to 3D self-supervised
+pretraining for point clouds due to their great success in NLP and computer
+vision. Unlike MAEs used in the image domain, where the pretext task is to
+restore features at the masked pixels, such as colors, the existing 3D MAE
+works reconstruct the missing geometry only, i.e., the location of the masked
+points. In contrast to previous studies, we advocate that point location
+recovery is inessential and restoring intrinsic point features is much
+superior. To this end, we propose to ignore point position reconstruction and
+recover high-order features at masked points, including surface normals and
+surface variations, through a novel attention-based decoder which is
+independent of the encoder design. We validate the effectiveness of our pretext
+task and decoder design using different encoder structures for 3D training and
+demonstrate the advantages of our pretrained networks on various point cloud
+analysis tasks.
+
+
+
+ comment: Published in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence in Assessing Cardiovascular Diseases and Risk + Factors via Retinal Fundus Images: A Review of the Last Decade + + +
+ Background: Cardiovascular diseases (CVDs) are the leading cause of death +globally. The use of artificial intelligence (AI) methods - in particular, deep +learning (DL) - has been on the rise lately for the analysis of different +CVD-related topics. The use of fundus images and optical coherence tomography +angiography (OCTA) in the diagnosis of retinal diseases has also been +extensively studied. To better understand heart function and anticipate changes +based on microvascular characteristics and function, researchers are currently +exploring the integration of AI with non-invasive retinal scanning. There is +great potential to reduce the number of cardiovascular events and the financial +strain on healthcare systems by utilizing AI-assisted early detection and +prediction of cardiovascular diseases on a large scale. Method: A comprehensive +search was conducted across various databases, including PubMed, Medline, +Google Scholar, Scopus, Web of Sciences, IEEE Xplore, and ACM Digital Library, +using specific keywords related to cardiovascular diseases and artificial +intelligence. Results: The study included 87 English-language publications +selected for relevance, and additional references were considered. This paper +provides an overview of the recent developments and difficulties in using +artificial intelligence and retinal imaging to diagnose cardiovascular +diseases. It provides insights for further exploration in this field. +Conclusion: Researchers are trying to develop precise disease prognosis +patterns in response to the aging population and the growing global burden of +CVD. AI and deep learning are revolutionizing healthcare by potentially +diagnosing multiple CVDs from a single retinal image. However, swifter adoption +of these technologies in healthcare systems is required. + +
+
+ comment: 41 pages, 5 figures, 3 tables, 114 references +
+
+
+
+
+ + ♻ ☆ Data Upcycling Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) compresses deep neural networks by transferring
+task-related knowledge from cumbersome pre-trained teacher models to compact
+student models. However, current KD methods for super-resolution (SR) networks
+overlook the nature of the SR task: the outputs of the teacher model are noisy
+approximations of the ground-truth distribution of high-quality images (GT),
+which obscures the teacher model's knowledge and results in limited KD effects.
+To utilize the teacher model beyond the GT upper bound, we present Data
+Upcycling Knowledge Distillation (DUKD), which transfers the teacher model's
+knowledge to the student model through upcycled in-domain data derived from the
+training data. In addition, we impose label-consistency regularization on KD
+for SR via paired invertible augmentations to improve the student model's
+performance and robustness. Comprehensive experiments demonstrate that the DUKD
+method significantly outperforms previous methods on several SR tasks.
+
+
+
+
+
+
+ + ♻ ☆ A Survey on Intermediate Fusion Methods for Collaborative Perception + Categorized by Real World Challenges + + +
+ This survey analyzes intermediate fusion methods in collaborative perception +for autonomous driving, categorized by real-world challenges. We examine +various methods, detailing their features and the evaluation metrics they +employ. The focus is on addressing challenges like transmission efficiency, +localization errors, communication disruptions, and heterogeneity. Moreover, we +explore strategies to counter adversarial attacks and defenses, as well as +approaches to adapt to domain shifts. The objective is to present an overview +of how intermediate fusion methods effectively meet these diverse challenges, +highlighting their role in advancing the field of collaborative perception in +autonomous driving. + +
+
+ comment: 8 pages, 6 tables +
+
+
+
+
+ + ♻ ☆ SyncTalk: The Devil is in the Synchronization for Talking Head Synthesis CVPR 2024 + + +
+ Achieving high synchronization in the synthesis of realistic, speech-driven +talking head videos presents a significant challenge. Traditional Generative +Adversarial Networks (GAN) struggle to maintain consistent facial identity, +while Neural Radiance Fields (NeRF) methods, although they can address this +issue, often produce mismatched lip movements, inadequate facial expressions, +and unstable head poses. A lifelike talking head requires synchronized +coordination of subject identity, lip movements, facial expressions, and head +poses. The absence of these synchronizations is a fundamental flaw, leading to +unrealistic and artificial outcomes. To address the critical issue of +synchronization, identified as the "devil" in creating realistic talking heads, +we introduce SyncTalk. This NeRF-based method effectively maintains subject +identity, enhancing synchronization and realism in talking head synthesis. +SyncTalk employs a Face-Sync Controller to align lip movements with speech and +innovatively uses a 3D facial blendshape model to capture accurate facial +expressions. Our Head-Sync Stabilizer optimizes head poses, achieving more +natural head movements. The Portrait-Sync Generator restores hair details and +blends the generated head with the torso for a seamless visual experience. +Extensive experiments and user studies demonstrate that SyncTalk outperforms +state-of-the-art methods in synchronization and realism. We recommend watching +the supplementary video: https://ziqiaopeng.github.io/synctalk + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to
+unlabelled data partially based on knowledge learned from labelled data, where
+the unlabelled data may come from known or novel classes. The prevailing
+approach generally involves clustering across all data and learning conceptions
+by prototypical contrastive learning. However, existing methods largely hinge
+on the performance of clustering algorithms and are thus subject to their
+inherent limitations. Firstly, the estimated cluster number is often smaller
+than the ground truth, causing existing methods to suffer from a lack of
+prototypes for comprehensive conception learning. To address this issue, we
+propose an adaptive probing mechanism that introduces learnable potential
+prototypes to expand cluster prototypes (centers). As there is no ground truth
+for the potential prototype, we develop a self-supervised prototype learning
+framework to optimize the potential prototype in an end-to-end fashion.
+Secondly, clustering is computationally intensive, and the conventional
+strategy of clustering both labelled and unlabelled instances exacerbates this
+issue. To counteract this inefficiency, we opt to cluster only the unlabelled
+instances and subsequently expand the cluster prototypes with our introduced
+potential prototypes to quickly explore novel classes. Despite the simplicity
+of our proposed method, extensive empirical analysis on a wide range of
+datasets confirms that our method consistently delivers state-of-the-art
+results. Specifically, our method surpasses the nearest competitor by a
+significant margin of 9.7% on the Stanford Cars dataset and achieves 12x higher
+clustering efficiency on the Herbarium 19 dataset. We will make the code and
+checkpoints publicly available at https://github.com/xjtuYW/PNP.git.
+
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ PhyRecon: Physically Plausible Neural Scene Reconstruction + + +
+ While neural implicit representations have gained popularity in multi-view 3D +reconstruction, previous work struggles to yield physically plausible results, +thereby limiting their applications in physics-demanding domains like embodied +AI and robotics. The lack of plausibility originates from both the absence of +physics modeling in the existing pipeline and their inability to recover +intricate geometrical structures. In this paper, we introduce PhyRecon, which +stands as the first approach to harness both differentiable rendering and +differentiable physics simulation to learn implicit surface representations. +Our framework proposes a novel differentiable particle-based physical simulator +seamlessly integrated with the neural implicit representation. At its core is +an efficient transformation between SDF-based implicit representation and +explicit surface points by our proposed algorithm, Surface Points Marching +Cubes (SP-MC), enabling differentiable learning with both rendering and +physical losses. Moreover, we model both rendering and physical uncertainty to +identify and compensate for the inconsistent and inaccurate monocular geometric +priors. The physical uncertainty additionally enables a physics-guided pixel +sampling to enhance the learning of slender structures. By amalgamating these +techniques, our model facilitates efficient joint modeling with appearance, +geometry, and physics. Extensive experiments demonstrate that PhyRecon +significantly outperforms all state-of-the-art methods in terms of +reconstruction quality. Our reconstruction results also yield superior physical +stability, verified by Isaac Gym, with at least a 40% improvement across all +datasets, opening broader avenues for future physics-based applications. + +
+
+ comment: project page: https://phyrecon.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning to Kern: Set-wise Estimation of Optimal Letter Space + + +
+ Kerning is the task of setting appropriate horizontal spaces for all possible +letter pairs of a certain font. One of the difficulties of kerning is that the +appropriate space differs for each letter pair. Therefore, for a total of 52 +capital and small letters, we need to adjust $52 \times 52 = 2704$ different +spaces. Another difficulty is that there is neither a general procedure nor +criterion for automatic kerning; therefore, kerning is still done manually or +with heuristics. In this paper, we tackle kerning by proposing two +machine-learning models, called pairwise and set-wise models. The former is a +simple deep neural network that estimates the letter space for two given letter +images. In contrast, the latter is a transformer-based model that estimates the +letter spaces for three or more given letter images. For example, the set-wise +model simultaneously estimates 2704 spaces for 52 letter images for a certain +font. Among the two models, the set-wise model is not only more efficient but +also more accurate because its internal self-attention mechanism allows for +more consistent kerning for all letters. Experimental results on about 2500 +Google fonts and their quantitative and qualitative analyses show that the +set-wise model has an average estimation error of only about 5.3 pixels when +the average letter space of all fonts and letter pairs is about 115 pixels. + +
+
+
+
+
+ + ♻ ☆ Revisiting Neural Networks for Continual Learning: An Architectural + Perspective + + +
+ Efforts to overcome catastrophic forgetting have primarily centered around +developing more effective Continual Learning (CL) methods. In contrast, less +attention was devoted to analyzing the role of network architecture design +(e.g., network depth, width, and components) in contributing to CL. This paper +seeks to bridge this gap between network architecture design and CL, and to +present a holistic study on the impact of network architectures on CL. This +work considers architecture design at the network scaling level, i.e., width +and depth, and also at the network components, i.e., skip connections, global +pooling layers, and down-sampling. In both cases, we first derive insights +through systematically exploring how architectural designs affect CL. Then, +grounded in these insights, we craft a specialized search space for CL and +further propose a simple yet effective ArchCraft method to steer a CL-friendly +architecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC. +Experimental validation across various CL settings and scenarios demonstrates +that improved architectures are parameter-efficient, achieving state-of-the-art +performance of CL while being 86%, 61%, and 97% more compact in terms of +parameters than the naive CL architecture in Task IL and Class IL. Code is +available at https://github.com/byyx666/ArchCraft. + +
+
+
+
+
+ + ♻ ☆ InstructEdit: Instruction-based Knowledge Editing for Large Language + Models IJCAI 2024 + + +
+ Knowledge editing for large language models can offer an efficient solution
+to alter a model's behavior without negatively impacting the overall
+performance. However, the current approaches encounter issues with limited
+generalizability across tasks, necessitating one distinct editor for each task
+and significantly hindering broader application. To address this, we take the
+first step to analyze the multi-task generalization issue in knowledge editing.
+Specifically, we develop an instruction-based editing technique, termed
+InstructEdit, which facilitates the editor's adaptation to various task
+performances simultaneously using simple instructions. With only one unified
+editor for each LLM, we empirically demonstrate that InstructEdit can improve
+the editor's control, leading to an average 14.86% increase in Reliability in
+the multi-task editing setting. Furthermore, experiments involving held-out
+unseen tasks illustrate that InstructEdit consistently surpasses previous
+strong baselines. To further investigate the underlying mechanisms of
+instruction-based knowledge editing, we analyze the principal components of the
+editing gradient directions, which unveils that instructions can help control
+the optimization direction with stronger OOD generalization. Code and datasets
+are available at https://github.com/zjunlp/EasyEdit.
+
+
+
+ comment: IJCAI 2024; the project website is at + https://www.zjukg.org/project/InstructEdit/ +
+
+
+
+
+ + ♻ ☆ MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing + + +
+ 3D-aware portrait editing has a wide range of applications in multiple
+fields. However, current approaches are limited in that they can only perform
+mask-guided or text-based editing. Even by fusing the two procedures into a
+model, the editing quality and stability cannot be ensured. To address this
+limitation, we propose MaTe3D: mask-guided text-based 3D-aware portrait
+editing. In this framework, first, we introduce a new SDF-based 3D generator
+which learns local and global representations with the proposed SDF and density
+consistency losses. This enhances mask-based editing in local areas; second, we
+present a novel distillation strategy: Conditional Distillation on Geometry and
+Texture (CDGT). Compared to existing distillation strategies, it mitigates
+visual ambiguity and avoids mismatch between texture and geometry, thereby
+producing stable texture and convincing geometry while editing. Additionally,
+we create the CatMask-HQ dataset, a large-scale, high-resolution cat face
+annotation dataset for exploring model generalization and expansion. We perform
+extensive experiments on both the FFHQ and CatMask-HQ datasets to demonstrate
+the editing quality and stability of the proposed method. Our method faithfully
+generates a 3D-aware edited face image based on a modified mask and a text
+prompt. Our code and models will be publicly released.
+
+
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Inverse-Free Fast Natural Gradient Descent Method for Deep Learning + + +
+ Second-order optimization techniques have the potential to achieve faster +convergence rates compared to first-order methods through the incorporation of +second-order derivatives or statistics. However, their utilization in deep +learning is limited due to their computational inefficiency. Various approaches +have been proposed to address this issue, primarily centered on minimizing the +size of the matrix to be inverted. Nevertheless, the necessity of performing +the inverse operation iteratively persists. In this work, we present a fast +natural gradient descent (FNGD) method that only requires inversion during the +first epoch. Specifically, it is revealed that natural gradient descent (NGD) +is essentially a weighted sum of per-sample gradients. Our novel approach +further proposes to share these weighted coefficients across epochs without +affecting empirical performance. Consequently, FNGD exhibits similarities to +the average sum in first-order methods, leading to the computational complexity +of FNGD being comparable to that of first-order methods. Extensive experiments +on image classification and machine translation tasks demonstrate the +efficiency of the proposed FNGD. For training ResNet-18 on CIFAR-100, FNGD can +achieve a speedup of 2.07$\times$ compared with KFAC. For training Transformer +on Multi30K, FNGD outperforms AdamW by 24 BLEU score while requiring almost the +same training time. + +
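+
+ A small numpy sketch of the "weighted sum of per-sample gradients" view under
+ an empirical-Fisher assumption: by the push-through identity, the
+ natural-gradient step lies in the span of the per-sample gradients, so it can
+ be written as G^T w; reusing w in later epochs then avoids any further
+ inversion. The damping term and names are assumptions:
+
+ import numpy as np
+
+ def fngd_step(G, coeffs=None, damping=1e-3):
+     """G: (N, P) matrix of flattened per-sample gradients for one mini-batch."""
+     N = G.shape[0]
+     if coeffs is None:
+         K = G @ G.T / N + damping * np.eye(N)        # small N x N Gram matrix
+         coeffs = np.linalg.solve(K, np.ones(N) / N)  # computed once, then reused
+     return G.T @ coeffs, coeffs                      # update = weighted gradient sum
+
+ G = np.random.randn(32, 1000)
+ step, w = fngd_step(G)             # first epoch: one small inversion
+ step2, _ = fngd_step(G, coeffs=w)  # later epochs: reuse the weights, no inversion
+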
+
+
+
+
+ + ♻ ☆ AnyPattern: Towards In-context Image Copy Detection + + +
+ This paper explores in-context learning for image copy detection (ICD), i.e.,
+prompting an ICD model to identify replicated images with new tampering
+patterns without the need for additional training. The prompts (or the
+contexts) are from a small set of image-replica pairs that reflect the new
+patterns and are used at inference time. Such in-context ICD has good practical
+value, because it requires no fine-tuning and thus facilitates fast reaction
+against the emergence of unseen patterns. To accommodate the "seen → unseen"
+generalization scenario, we construct the first large-scale pattern dataset
+named AnyPattern, which has the largest number of tampering patterns (90 for
+training and 10 for testing) among all the existing ones. We benchmark
+AnyPattern with popular ICD methods and reveal that existing methods barely
+generalize to novel patterns. We further propose a simple in-context ICD method
+named ImageStacker. ImageStacker learns to select the most representative
+image-replica pairs and employs them as the pattern prompts in a stacking
+manner (rather than the popular concatenation manner). Experimental results
+show (1) training with our large-scale dataset substantially benefits pattern
+generalization (+26.66% μAP), (2) the proposed ImageStacker facilitates
+effective in-context ICD (a further +16.75% μAP), and (3) AnyPattern enables
+in-context ICD, i.e., without such a large-scale dataset, in-context learning
+does not emerge even with our ImageStacker. Beyond the ICD task, we also
+demonstrate how AnyPattern can benefit artists, i.e., the pattern retrieval
+method trained on AnyPattern can be generalized to identify style mimicry by
+text-to-image models. The project is publicly available at
+https://anypattern.github.io.
+
+
+
+ comment: The project is publicly available at https://anypattern.github.io. + arXiv admin note: text overlap with arXiv:2403.06098 +
+
+
+
+
+ + ♻ ☆ Rethinking Centered Kernel Alignment in Knowledge Distillation + + +
+ Knowledge distillation has emerged as a highly effective method for bridging
+the representation discrepancy between large-scale models and lightweight
+models. Prevalent approaches involve leveraging appropriate metrics to minimize
+the divergence or distance between the knowledge extracted from the teacher
+model and the knowledge learned by the student model. Centered Kernel Alignment
+(CKA) is widely used to measure representation similarity and has been applied
+in several knowledge distillation methods. However, these methods are complex
+and fail to uncover the essence of CKA, thus not answering the question of how
+to use CKA to achieve simple and effective distillation properly. This paper
+first provides a theoretical perspective to illustrate the effectiveness of
+CKA, which decouples CKA into the upper bound of the Maximum Mean Discrepancy
+(MMD) and a constant term. Drawing from this, we propose a novel
+Relation-Centered Kernel Alignment (RCKA) framework, which practically
+establishes a connection between CKA and MMD. Furthermore, we dynamically
+customize the application of CKA based on the characteristics of each task,
+achieving comparable performance to previous methods with less computational
+cost. Extensive experiments on CIFAR-100, ImageNet-1k, and MS-COCO demonstrate
+that our method achieves state-of-the-art performance on almost all
+teacher-student pairs for image classification and object detection, validating
+the effectiveness of our approach. Our code is available at
+https://github.com/Klayand/PCKA.
+
+
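+
+ For reference, the linear CKA similarity that such distillation objectives
+ build on can be computed as below (the standard definition, shown only to make
+ the compared quantity concrete):
+
+ import numpy as np
+
+ def linear_cka(X, Y):
+     """X: (n, d1) teacher features, Y: (n, d2) student features, same n samples."""
+     X = X - X.mean(axis=0, keepdims=True)
+     Y = Y - Y.mean(axis=0, keepdims=True)
+     hsic = np.linalg.norm(Y.T @ X, 'fro') ** 2
+     return hsic / (np.linalg.norm(X.T @ X, 'fro') * np.linalg.norm(Y.T @ Y, 'fro'))
+
+ t = np.random.randn(128, 512)   # teacher batch features
+ s = np.random.randn(128, 256)   # student batch features
+ print(linear_cka(t, t), linear_cka(t, s))  # 1.0 for identical inputs, lower otherwise
+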
+
+
+
+
+ + ♻ ☆ Physics-Aware Semi-Supervised Underwater Image Enhancement + + +
+ Underwater images normally suffer from degradation due to the transmission
+medium of water bodies. Both traditional prior-based approaches and deep
+learning-based methods have been used to address this problem. However, the
+inflexible assumptions of the former often impair their effectiveness in
+handling diverse underwater scenes, while the generalization of the latter to
+unseen images is usually weakened by insufficient data. In this study, we
+leverage both the physics-based underwater Image Formation Model (IFM) and deep
+learning techniques for Underwater Image Enhancement (UIE). To this end, we
+propose a novel Physics-Aware Dual-Stream Underwater Image Enhancement Network,
+i.e., PA-UIENet, which comprises a Transmission Estimation Stream (T-Stream)
+and an Ambient Light Estimation Stream (A-Stream). This network fulfills the
+UIE task by explicitly estimating the degradation parameters of the IFM. We
+also adopt an IFM-inspired semi-supervised learning framework, which exploits
+both the labeled and unlabeled images, to address the issue of insufficient
+data. Our method performs better than, or at least comparably to, eight
+baselines across five testing sets in the degradation estimation and UIE tasks.
+This is likely because it can not only model the degradation but also learn the
+characteristics of diverse underwater scenes.
+
+
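+
+ A minimal numpy sketch of how estimated IFM parameters would be used to
+ restore an image, assuming the simplified model I = J*t + A*(1 - t); the
+ variable names and clipping are illustrative:
+
+ import numpy as np
+
+ def restore_with_ifm(I, t, A, t_min=0.1):
+     """I: (H, W, 3) observed image in [0, 1]; t: (H, W, 1) transmission map
+     (as a T-Stream would estimate); A: (3,) ambient light (A-Stream)."""
+     t = np.clip(t, t_min, 1.0)       # avoid division blow-up in hazy regions
+     J = (I - A * (1.0 - t)) / t      # invert the image formation model
+     return np.clip(J, 0.0, 1.0)
+
+ I = np.random.rand(64, 64, 3)
+ t = np.random.rand(64, 64, 1)
+ A = np.array([0.1, 0.4, 0.5])        # bluish-green underwater ambient light
+ J = restore_with_ifm(I, t, A)
+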
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ DoRA: Weight-Decomposed Low-Rank Adaptation + + +
+ Among the widely used parameter-efficient finetuning (PEFT) methods, LoRA and
+its variants have gained considerable popularity because they avoid additional
+inference costs. However, there still often exists an accuracy gap between
+these methods and full fine-tuning (FT). In this work, we first introduce a
+novel weight decomposition analysis to investigate the inherent differences
+between FT and LoRA. Aiming to resemble the learning capacity of FT based on
+these findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA). DoRA
+decomposes the pre-trained weight into two components, magnitude and direction,
+for fine-tuning, specifically employing LoRA for directional updates to
+efficiently minimize the number of trainable parameters. By employing DoRA, we
+enhance both the learning capacity and training stability of LoRA while
+avoiding any additional inference overhead. DoRA consistently outperforms LoRA
+on fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as
+commonsense reasoning, visual instruction tuning, and image/video-text
+understanding. Code is available at https://github.com/NVlabs/DoRA.
+
+
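+
+ A rough PyTorch sketch of the magnitude/direction decomposition described
+ above; the normalization axis, initialization, and scaling here are
+ assumptions, and the repository linked above holds the reference
+ implementation:
+
+ import torch
+ import torch.nn as nn
+
+ class DoRALinear(nn.Module):
+     def __init__(self, weight, rank=8, alpha=16):
+         super().__init__()
+         self.weight = nn.Parameter(weight.clone(), requires_grad=False)   # frozen W0 (out, in)
+         self.m = nn.Parameter(weight.norm(p=2, dim=1, keepdim=True))      # learned magnitude
+         self.A = nn.Parameter(torch.randn(rank, weight.size(1)) * 0.01)
+         self.B = nn.Parameter(torch.zeros(weight.size(0), rank))
+         self.scale = alpha / rank
+
+     def forward(self, x):
+         direction = self.weight + self.scale * (self.B @ self.A)          # LoRA updates direction
+         direction = direction / direction.norm(p=2, dim=1, keepdim=True)  # unit-norm rows
+         return x @ (self.m * direction).t()                               # rescale by magnitude
+
+ layer = DoRALinear(torch.randn(64, 128))
+ y = layer(torch.randn(4, 128))
+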
+
+ comment: Code available at https://github.com/NVlabs/DoRA +
+
+
+
+
+ + ♻ ☆ DeepLight: Reconstructing High-Resolution Observations of Nighttime + Light With Multi-Modal Remote Sensing Data + + +
+ Nighttime light (NTL) remote sensing observation serves as a unique proxy for
+quantitatively assessing progress toward meeting a series of Sustainable
+Development Goals (SDGs), such as poverty estimation, urban sustainable
+development, and carbon emissions. However, existing NTL observations often
+suffer from pervasive degradation and inconsistency, limiting their utility for
+computing the indicators defined by the SDGs. In this study, we propose a novel
+approach to reconstruct high-resolution NTL images using multi-modal remote
+sensing data. To support this research endeavor, we introduce DeepLightMD, a
+comprehensive dataset comprising data from five heterogeneous sensors, offering
+fine spatial resolution and rich spectral information at a national scale.
+Additionally, we present DeepLightSR, a calibration-aware method for bridging
+spatially heterogeneous modality data in multi-modality super-resolution.
+DeepLightSR integrates calibration-aware alignment, an auxiliary-to-main
+multi-modality fusion, and an auxiliary-embedded refinement to effectively
+address spatial heterogeneity, fuse diversely representative features, and
+enhance performance in 8x super-resolution (SR) tasks. Extensive experiments
+demonstrate the superiority of DeepLightSR over 8 competing methods, as
+evidenced by improvements in PSNR (2.01 dB to 13.25 dB) and PIQE (0.49 to
+9.32). Our findings underscore the practical significance of our proposed
+dataset and model in reconstructing high-resolution NTL data, supporting
+efficient and quantitative assessment of SDG progress.
+
+
+
+
+
+
+ + ♻ ☆ GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian + Splatting + + +
+ Recent works on audio-driven talking head synthesis using Neural Radiance +Fields (NeRF) have achieved impressive results. However, due to inadequate pose +and expression control caused by NeRF implicit representation, these methods +still have some limitations, such as unsynchronized or unnatural lip movements, +and visual jitter and artifacts. In this paper, we propose GaussianTalker, a +novel method for audio-driven talking head synthesis based on 3D Gaussian +Splatting. With the explicit representation property of 3D Gaussians, intuitive +control of the facial motion is achieved by binding Gaussians to 3D facial +models. GaussianTalker consists of two modules, Speaker-specific Motion +Translator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator +achieves accurate lip movements specific to the target speaker through +universalized audio feature extraction and customized lip motion generation. +Dynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance +facial detail representation via a latent pose, delivering stable and realistic +rendered videos. Extensive experimental results suggest that GaussianTalker +outperforms existing state-of-the-art methods in talking head synthesis, +delivering precise lip synchronization and exceptional visual quality. Our +method achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU, +significantly exceeding the threshold for real-time rendering performance, and +can potentially be deployed on other hardware platforms. + +
+
+ comment: https://yuhongyun777.github.io/GaussianTalker/ +
+
+
+
+
+ + ♻ ☆ Attack on Scene Flow using Point Clouds + + +
+ Deep neural networks have made significant advancements in accurately
+estimating scene flow using point clouds, which is vital for many applications
+like video analysis, action recognition, and navigation. The robustness of
+these techniques, however, remains a concern, particularly in the face of
+adversarial attacks that have been proven to deceive state-of-the-art deep
+neural networks in many domains. Surprisingly, the robustness of scene flow
+networks against such attacks has not been thoroughly investigated. To address
+this problem, we bridge this gap by introducing adversarial white-box attacks
+specifically tailored for scene flow networks. Experimental results show that
+the generated adversarial examples obtain up to 33.7 relative degradation in
+average end-point error on the KITTI and FlyingThings3D datasets. The study
+also reveals the significant impact that attacks targeting point clouds in only
+one dimension or color channel have on average end-point error. Analyzing the
+success and failure of these attacks on the scene flow networks and their 2D
+optical flow network variants shows a higher vulnerability for the optical flow
+networks.
+
+
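+
+ A generic one-step (FGSM-style) white-box perturbation against a scene-flow
+ model, maximizing end-point error; the model signature and epsilon are
+ assumptions, and the paper's dimension- and colour-restricted attacks are more
+ specific than this sketch:
+
+ import torch
+
+ def fgsm_attack_scene_flow(model, pc1, pc2, gt_flow, epsilon=0.01):
+     """pc1, pc2: (B, N, 3) point clouds; gt_flow: (B, N, 3) ground-truth flow.
+     Assumes model(pc1, pc2) returns the predicted flow."""
+     pc1 = pc1.clone().detach().requires_grad_(True)
+     pred = model(pc1, pc2)
+     epe = torch.norm(pred - gt_flow, dim=-1).mean()    # end-point error to maximize
+     epe.backward()
+     return (pc1 + epsilon * pc1.grad.sign()).detach()  # one-step sign perturbation
+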
+
+
+
+
+ + ♻ ☆ Res-VMamba: Fine-Grained Food Category Visual Classification Using + Selective State Space Models with Deep Residual Learning + + +
+ Food classification is the foundation for developing food vision tasks and
+plays a key role in the burgeoning field of computational nutrition. Due to the
+complexity of food requiring fine-grained classification, recent academic
+research mainly modifies Convolutional Neural Networks (CNNs) and/or Vision
+Transformers (ViTs) to perform food category classification. However, to learn
+fine-grained features, the CNN backbone needs additional structural design,
+whereas ViT, containing the self-attention module, has increased computational
+complexity. In recent months, a new Sequence State Space (S4) model with a
+Selection mechanism and Scan computation (S6), colloquially termed Mamba, has
+demonstrated superior performance and computation efficiency compared to the
+Transformer architecture. The VMamba model, which incorporates the Mamba
+mechanism into image tasks (such as classification), currently establishes the
+state-of-the-art (SOTA) on the ImageNet dataset. In this research, we introduce
+an academically underestimated food dataset CNFOOD-241, and pioneer the
+integration of a residual learning framework within the VMamba model to
+concurrently harness both global and local state features inherent in the
+original VMamba architectural design. The research results show that VMamba
+surpasses current SOTA models in fine-grained and food classification. The
+proposed Res-VMamba further improves the classification accuracy to 79.54%
+without pretrained weights. Our findings indicate that our proposed methodology
+establishes a new benchmark for SOTA performance in food recognition on the
+CNFOOD-241 dataset. The code can be obtained on GitHub:
+https://github.com/ChiShengChen/ResVMamba.
+
+
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Attention-Challenging Multiple Instance Learning for Whole Slide Image + Classification + + +
+ In the application of Multiple Instance Learning (MIL) methods for Whole +Slide Image (WSI) classification, attention mechanisms often focus on a subset +of discriminative instances, which are closely linked to overfitting. To +mitigate overfitting, we present Attention-Challenging MIL (ACMIL). ACMIL +combines two techniques based on separate analyses for attention value +concentration. Firstly, UMAP of instance features reveals various patterns +among discriminative instances, with existing attention mechanisms capturing +only some of them. To remedy this, we introduce Multiple Branch Attention (MBA) +to capture more discriminative instances using multiple attention branches. +Secondly, the examination of the cumulative value of Top-K attention scores +indicates that a tiny number of instances dominate the majority of attention. +In response, we present Stochastic Top-K Instance Masking (STKIM), which masks +out a portion of instances with Top-K attention values and allocates their +attention values to the remaining instances. The extensive experimental results +on three WSI datasets with two pre-trained backbones reveal that our ACMIL +outperforms state-of-the-art methods. Additionally, through heatmap +visualization and UMAP visualization, this paper extensively illustrates +ACMIL's effectiveness in suppressing attention value concentration and +overcoming the overfitting challenge. The source code is available at +\url{https://github.com/dazhangyu123/ACMIL}. + +
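+
+ A rough PyTorch sketch of the stochastic top-K masking idea; names are
+ assumptions, and renormalization stands in for ACMIL's reallocation of the
+ masked attention to the remaining instances:
+
+ import torch
+
+ def stochastic_topk_mask(attn, k=10, p_mask=0.5):
+     """attn: (B, N) attention scores over the N instances of each bag."""
+     topk_idx = attn.topk(k, dim=-1).indices                        # most-attended instances
+     keep = (torch.rand(topk_idx.shape, device=attn.device) > p_mask).float()
+     mask = torch.ones_like(attn)
+     mask.scatter_(-1, topk_idx, keep)                              # randomly drop some of them
+     masked = attn * mask
+     return masked / masked.sum(dim=-1, keepdim=True).clamp_min(1e-8)
+
+ a = torch.softmax(torch.randn(2, 500), dim=-1)
+ a_masked = stochastic_topk_mask(a)
+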
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Fully Sparse Fusion for 3D Object Detection + + +
+ Currently prevalent multimodal 3D detection methods are built upon
+LiDAR-based detectors that usually use dense Bird's-Eye-View (BEV) feature
+maps. However, the cost of such BEV feature maps is quadratic in the detection
+range, making them unsuitable for long-range detection. Fully sparse
+architectures are gaining attention as they are highly efficient in long-range
+perception. In this paper, we study how to effectively leverage the image
+modality in the emerging fully sparse architecture. Particularly, utilizing
+instance queries, our framework integrates the well-studied 2D instance
+segmentation into the LiDAR side, which is parallel to the 3D instance
+segmentation part in the fully sparse detector. This design achieves a uniform
+query-based fusion framework on both the 2D and 3D sides while maintaining the
+fully sparse characteristic. Extensive experiments showcase state-of-the-art
+results on the widely used nuScenes dataset and the long-range Argoverse 2
+dataset. Notably, the inference speed of the proposed method under the
+long-range LiDAR perception setting is 2.7x faster than that of other
+state-of-the-art multimodal 3D detection methods. Code will be released at
+https://github.com/BraveGroup/FullySparseFusion.
+
+
+
+ comment: TPAMI 2024 +
+
+
+
+
+ + ♻ ☆ From a Bird's Eye View to See: Joint Camera and Subject Registration + without the Camera Calibration + + +
+ We tackle a new problem of multi-view camera and subject registration in the +bird's eye view (BEV) without pre-given camera calibration. This is a very +challenging problem since the only input is several RGB images from different +first-person views (FPVs) for a multi-person scene, without the BEV image and +the calibration of the FPVs, while the output is a unified plane with the +localization and orientation of both the subjects and cameras in a BEV. We +propose an end-to-end framework to solve this problem, whose main idea can be +divided into the following parts: i) creating a view-transform subject detection +module to transform the FPV to a virtual BEV, including the localization and +orientation of each pedestrian, ii) deriving a geometric transformation based +method to estimate camera localization and view direction, i.e., the camera +registration in a unified BEV, iii) making use of spatial and appearance +information to aggregate the subjects into the unified BEV. We collect a new +large-scale synthetic dataset with rich annotations for evaluation. The +experimental results show the remarkable effectiveness of our proposed method. +
+
+
+
+
+ + ♻ ☆ Which images to label for few-shot medical landmark detection? + + +
+ The success of deep learning methods relies on the availability of +well-labeled large-scale datasets. However, for medical images, annotating such +abundant training data often requires experienced radiologists and consumes +their limited time. Few-shot learning has been developed to alleviate this burden, +achieving competitive performance with only a few labeled samples. +However, a crucial yet previously overlooked problem in few-shot learning is +the selection of template images for annotation before learning, which +affects the final performance. We herein propose a novel Sample Choosing Policy +(SCP) to select "the most worthy" images for annotation, in the context of +few-shot medical landmark detection. SCP consists of three parts: 1) +Self-supervised training for building a pre-trained deep model to extract +features from radiological images, 2) Key Point Proposal for localizing +informative patches, and 3) Representative Score Estimation for searching the +most representative samples or templates. The advantage of SCP is demonstrated +by various experiments on three widely-used public datasets. For one-shot +medical landmark detection, its use reduces the mean radial errors on the +Cephalometric and HandXray datasets by 14.2% (from 3.595mm to 3.083mm) and +35.5% (from 4.114mm to 2.653mm), respectively. +
+
+
+
+
+ + ♻ ☆ BMAD: Benchmarks for Medical Anomaly Detection + + +
+ Anomaly detection (AD) is a fundamental research problem in machine learning +and computer vision, with practical applications in industrial inspection, +video surveillance, and medical diagnosis. In medical imaging, AD is especially +vital for detecting and diagnosing anomalies that may indicate rare diseases or +conditions. However, there is a lack of a universal and fair benchmark for +evaluating AD methods on medical images, which hinders the development of more +generalized and robust AD methods in this specific domain. To bridge this gap, +we introduce a comprehensive evaluation benchmark for assessing anomaly +detection methods on medical images. This benchmark encompasses six reorganized +datasets from five medical domains (i.e. brain MRI, liver CT, retinal OCT, +chest X-ray, and digital histopathology) and three key evaluation metrics, and +includes a total of fourteen state-of-the-art AD algorithms. This standardized +and well-curated medical benchmark with the well-structured codebase enables +comprehensive comparisons among recently proposed anomaly detection methods. It +will facilitate the community to conduct a fair comparison and advance the +field of AD on medical imaging. More information on BMAD is available in our +GitHub repository: https://github.com/DorisBao/BMAD + +
+
+
+
+
+ + ♻ ☆ SceneTracker: Long-term Scene Flow Estimation Network + + +
+ Considering the complementarity of scene flow estimation in the spatial +domain's focusing capability and 3D object tracking in the temporal domain's +coherence, this study aims to address a comprehensive new task that can +simultaneously capture fine-grained and long-term 3D motion in an online +manner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a +novel learning-based LSFE network that adopts an iterative approach to +approximate the optimal trajectory. Besides, it dynamically indexes and +constructs appearance and depth correlation features simultaneously and employs +the Transformer to explore and utilize long-range connections within and +between trajectories. With detailed experiments, SceneTracker shows superior +capabilities in handling 3D spatial occlusion and depth noise interference, +highly tailored to the LSFE task's needs. The code for SceneTracker is +available at https://github.com/wwsource/SceneTracker. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 75 + +
+
+
+ + ☆ Retrieval Robust to Object Motion Blur + + +
+ Moving objects are frequently seen in daily life and usually appear blurred +in images due to their motion. While general object retrieval is a widely +explored area in computer vision, it primarily focuses on sharp and static +objects, and retrieval of motion-blurred objects in large image collections +remains unexplored. We propose a method for object retrieval in images that are +affected by motion blur. The proposed method learns a robust representation +capable of matching blurred objects to their deblurred versions and vice versa. +To evaluate our approach, we present the first large-scale datasets for blurred +object retrieval, featuring images with objects exhibiting varying degrees of +blur in various poses and scales. We conducted extensive experiments, showing +that our method outperforms state-of-the-art retrieval methods on the new +blur-retrieval datasets, which validates the effectiveness of the proposed +approach. + +
+
+
+
+
+ + ☆ DM-Align: Leveraging the Power of Natural Language Instructions to Make + Changes to Images + + +
+ Text-based semantic image editing assumes the manipulation of an image using +a natural language instruction. Although recent works are capable of generating +creative and qualitative images, the problem is still mostly approached as a +black box sensitive to generating unexpected outputs. Therefore, we propose a +novel model to enhance the text-based control of an image editor by explicitly +reasoning about which parts of the image to alter or preserve. It relies on +word alignments between a description of the original source image and the +instruction that reflects the needed updates, and the input image. The proposed +Diffusion Masking with word Alignments (DM-Align) allows the editing of an +image in a transparent and explainable way. It is evaluated on a subset of the +Bison dataset and a self-defined dataset dubbed Dream. When comparing to +state-of-the-art baselines, quantitative and qualitative results show that +DM-Align has superior performance in image editing conditioned on language +instructions, well preserves the background of the image and can better cope +with long text instructions. + +
+
+
+
+
+ + ☆ FRAME: A Modular Framework for Autonomous Map-merging: Advancements in + the Field + + +
+ In this article, a novel approach for merging 3D point cloud maps in the +context of egocentric multi-robot exploration is presented. Unlike traditional +methods, the proposed approach leverages state-of-the-art place recognition and +learned descriptors to efficiently detect overlap between maps, eliminating the +need for the time-consuming global feature extraction and feature matching +process. The estimated overlapping regions are used to calculate a homogeneous +rigid transform, which serves as an initial condition for the GICP point cloud +registration algorithm to refine the alignment between the maps. The advantages +of this approach include faster processing time, improved accuracy, and +increased robustness in challenging environments. Furthermore, the +effectiveness of the proposed framework is successfully demonstrated through +multiple field missions of robot exploration in a variety of different +underground environments. + +
+
+ comment: 28 pages, 24 figures. Submitted to Field Robotics +
+
+
+
+
+ + ☆ MinBackProp -- Backpropagating through Minimal Solvers + + +
+ We present an approach to backpropagating through minimal problem solvers in +end-to-end neural network training. Traditional methods relying on manually +constructed formulas, finite differences, and autograd are laborious, +approximate, and unstable for complex minimal problem solvers. We show that +using the Implicit function theorem to calculate derivatives to backpropagate +through the solution of a minimal problem solver is simple, fast, and stable. +We compare our approach to (i) using the standard autograd on minimal problem +solvers and relate it to existing backpropagation formulas through SVD-based +and Eig-based solvers and (ii) implementing the backprop with an existing +PyTorch Deep Declarative Networks (DDN) framework. We demonstrate our technique +on a toy example of training outlier-rejection weights for 3D point +registration and on a real application of training an outlier-rejection and +RANSAC sampling network in image matching. Our method provides $100\%$ +stability and is 10 times faster compared to autograd, which is unstable and +slow, and compared to DDN, which is stable but also slow. + +
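The core of this approach is standard implicit differentiation: if the solver output x*(θ) is characterized by an optimality or constraint system F(x*, θ) = 0, the implicit function theorem gives the gradient without unrolling the solver (generic notation, not the paper's):

```latex
\frac{\partial x^{*}}{\partial \theta}
  = -\left(\frac{\partial F}{\partial x}\bigg|_{x = x^{*}}\right)^{-1}
      \frac{\partial F}{\partial \theta}\bigg|_{x = x^{*}}
```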
+
+
+
+
+ + ☆ A Method of Moments Embedding Constraint and its Application to + Semi-Supervised Learning + + +
+ Discriminative deep learning models with a linear+softmax final layer have a +problem: the latent space only predicts the conditional probabilities $p(Y|X)$ +but not the full joint distribution $p(Y,X)$, which necessitates a generative +approach. The conditional probability cannot detect outliers, causing outlier +sensitivity in softmax networks. This exacerbates model over-confidence +impacting many problems, such as hallucinations, confounding biases, and +dependence on large datasets. To address this we introduce a novel embedding +constraint based on the Method of Moments (MoM). We investigate the use of +polynomial moments ranging from 1st through 4th order hyper-covariance +matrices. Furthermore, we use this embedding constraint to train an +Axis-Aligned Gaussian Mixture Model (AAGMM) final layer, which learns not only +the conditional, but also the joint distribution of the latent space. We apply +this method to the domain of semi-supervised image classification by extending +FlexMatch with our technique. We find our MoM constraint with the AAGMM layer +is able to match the reported FlexMatch accuracy, while also modeling the joint +distribution, thereby reducing outlier sensitivity. We also present a +preliminary outlier detection strategy based on Mahalanobis distance and +discuss future improvements to this strategy. Code is available at: +\url{https://github.com/mmajurski/ssl-gmm} + +
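As a rough illustration of a moment-based embedding constraint, the sketch below matches only the first two batch moments of the latent features to fixed targets; the paper goes up to 4th-order hyper-covariance terms and couples the constraint with an AAGMM output layer, neither of which is reproduced here:

```python
import torch

def moment_matching_penalty(z: torch.Tensor,
                            target_mean: torch.Tensor,
                            target_cov: torch.Tensor) -> torch.Tensor:
    """Squared distance between batch embedding moments and target moments
    (1st and 2nd order only; illustrative, not the paper's full constraint).

    z: (B, D) latent embeddings.
    """
    mean = z.mean(dim=0)
    centered = z - mean
    cov = centered.T @ centered / max(z.shape[0] - 1, 1)
    return ((mean - target_mean) ** 2).sum() + ((cov - target_cov) ** 2).sum()

z = torch.randn(32, 8)
print(moment_matching_penalty(z, torch.zeros(8), torch.eye(8)).item())
```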
+
+
+
+
+ + ☆ HVOFusion: Incremental Mesh Reconstruction Using Hybrid Voxel Octree + + +
+ Incremental scene reconstruction is essential to navigation in robotics. +Most conventional methods make use of either a TSDF (truncated +signed distance function) volume or neural networks to implicitly represent +the surface. Due to the voxel representation or the involvement of time-consuming +sampling, they have difficulty in balancing speed, memory storage, and surface +quality. In this paper, we propose a novel hybrid voxel-octree approach to +effectively fuse the octree with voxel structures so that we can take advantage of +both implicit surface and explicit triangular mesh representations. Such a sparse +structure preserves triangular faces in the leaf nodes and produces partial +meshes sequentially for incremental reconstruction. This storage scheme allows +us to naturally optimize the mesh in explicit 3D space to achieve higher +surface quality. We iteratively deform the mesh towards the target and recover +vertex colors by optimizing a shading model. Experimental results on several +datasets show that our proposed approach is capable of quickly and accurately +reconstructing a scene with realistic colors. +
+
+
+
+
+ + ☆ SCorP: Statistics-Informed Dense Correspondence Prediction Directly from + Unsegmented Medical Images + + +
+ Statistical shape modeling (SSM) is a powerful computational framework for +quantifying and analyzing the geometric variability of anatomical structures, +facilitating advancements in medical research, diagnostics, and treatment +planning. Traditional methods for shape modeling from imaging data demand +significant manual and computational resources. Additionally, these methods +necessitate repeating the entire modeling pipeline to derive shape descriptors +(e.g., surface-based point correspondences) for new data. While deep learning +approaches have shown promise in streamlining the construction of SSMs on new +data, they still rely on traditional techniques to supervise the training of +the deep networks. Moreover, the predominant linearity assumption of +traditional approaches restricts their efficacy, a limitation also inherited by +deep learning models trained using optimized/established correspondences. +Consequently, representing complex anatomies becomes challenging. To address +these limitations, we introduce SCorP, a novel framework capable of predicting +surface-based correspondences directly from unsegmented images. By leveraging +the shape prior learned directly from surface meshes in an unsupervised manner, +the proposed model eliminates the need for an optimized shape model for +training supervision. The strong shape prior acts as a teacher and regularizes +the feature learning of the student network to guide it in learning image-based +features that are predictive of surface correspondences. The proposed model +streamlines the training and inference phases by removing the supervision for +the correspondence prediction task while alleviating the linearity assumption. + +
+
+
+
+
+ + ☆ Random Walk on Pixel Manifolds for Anomaly Segmentation of Complex + Driving Scenes + + +
+ In anomaly segmentation for complex driving scenes, state-of-the-art +approaches utilize anomaly scoring functions to calculate anomaly scores. For +these functions, accurately predicting the logits of inlier classes for each +pixel is crucial for precisely inferring the anomaly score. However, in +real-world driving scenarios, the diversity of scenes often results in +distorted manifolds of pixel embeddings in embedding space. This effect is not +conducive to directly using the pixel embeddings for the logit prediction +during inference, a concern overlooked by existing methods. To address this +problem, we propose a novel method called Random Walk on Pixel Manifolds +(RWPM). RWPM utilizes random walks to reveal the intrinsic relationships among +pixels to refine the pixel embeddings. The refined pixel embeddings alleviate +the distortion of the manifolds, improving the accuracy of anomaly scores. Our +extensive experiments show that RWPM consistently improves the performance of +existing anomaly segmentation methods and achieves the best results. Code: +\url{https://github.com/ZelongZeng/RWPM}. +
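A minimal sketch of the random-walk refinement idea, assuming a cosine-similarity transition kernel and a fixed number of propagation steps (both are assumptions; the paper's transition matrix and schedule may differ):

```python
import torch
import torch.nn.functional as F

def random_walk_refine(emb: torch.Tensor, steps: int = 3,
                       temperature: float = 0.1) -> torch.Tensor:
    """Refines pixel embeddings by propagating them over a row-stochastic
    similarity graph for a few random-walk steps (illustrative kernel).

    emb: (N, D) pixel embeddings, N = number of pixels in the image/crop.
    """
    unit = F.normalize(emb, dim=1)
    trans = torch.softmax(unit @ unit.T / temperature, dim=1)  # (N, N), rows sum to 1
    refined = emb
    for _ in range(steps):
        refined = trans @ refined  # one random-walk propagation step
    return refined

emb = torch.randn(256, 64)  # e.g. a 16x16 patch of pixel embeddings
print(random_walk_refine(emb).shape)  # torch.Size([256, 64])
```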
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ FDCE-Net: Underwater Image Enhancement with Embedding Frequency and Dual + Color Encoder + + +
+ Underwater images often suffer from various issues such as low brightness, +color shift, blurred details, and noise due to light absorption and scattering +caused by water and suspended particles. Previous underwater image enhancement +(UIE) methods have primarily focused on spatial domain enhancement, neglecting +the frequency domain information inherent in the images. However, the +degradation factors of underwater images are closely intertwined in the spatial +domain. Although certain methods focus on enhancing images in the frequency +domain, they overlook the inherent relationship between the image degradation +factors and the information present in the frequency domain. As a result, these +methods frequently enhance certain attributes of the improved image while +inadequately addressing or even exacerbating other attributes. Moreover, many +existing methods heavily rely on prior knowledge to address color shift +problems in underwater images, limiting their flexibility and robustness. In +order to overcome these limitations, we propose the Embedding Frequency and +Dual Color Encoder Network (FDCE-Net) in our paper. The FDCE-Net consists of +two main structures: (1) Frequency Spatial Network (FS-Net) aims to achieve +initial enhancement by utilizing our designed Frequency Spatial Residual Block +(FSRB) to decouple image degradation factors in the frequency domain and +enhance different attributes separately. (2) To tackle the color shift issue, +we introduce the Dual-Color Encoder (DCE). The DCE establishes correlations +between color and semantic representations through cross-attention and +leverages multi-scale image features to guide the optimization of adaptive +color query. The final enhanced images are generated by combining the outputs +of FS-Net and DCE through a fusion network. These images exhibit rich details, +clear textures, low noise and natural colors. + +
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ☆ Critical Review for One-class Classification: recent advances and the + reality behind them + + +
+ This paper offers a comprehensive review of one-class classification (OCC), +examining the technologies and methodologies employed in its implementation. It +delves into various approaches utilized for OCC across diverse data types, such +as feature data, image, video, time series, and others. Through a systematic +review, this paper synthesizes prominent strategies used in OCC from its +inception to its current advancements, with a particular emphasis on promising +applications. Moreover, the article criticizes the state-of-the-art +(SOTA) image anomaly detection (AD) algorithms dominating one-class +experiments. These algorithms include outlier exposure (binary classification) +and pretrained models (multi-class classification), conflicting with the +fundamental concept of learning from one class. Our investigation reveals that +the top nine algorithms for the one-class CIFAR10 benchmark are not OCC. We argue +that binary/multi-class classification algorithms should not be compared with +OCC. +
+
+
+
+
+ + ☆ Multi-Stream Cellular Test-Time Adaptation of Real-Time Models Evolving + in Dynamic Environments + + +
+ In the era of the Internet of Things (IoT), objects connect through a dynamic +network, empowered by technologies like 5G, enabling real-time data sharing. +However, smart objects, notably autonomous vehicles, face challenges in +critical local computations due to limited resources. Lightweight AI models +offer a solution but struggle with diverse data distributions. To address this +limitation, we propose a novel Multi-Stream Cellular Test-Time Adaptation +(MSC-TTA) setup where models adapt on the fly to a dynamic environment divided +into cells. Then, we propose a real-time adaptive student-teacher method that +leverages the multiple streams available in each cell to quickly adapt to +changing data distributions. We validate our methodology in the context of +autonomous vehicles navigating across cells defined based on location and +weather conditions. To facilitate future benchmarking, we release a new +multi-stream large-scale synthetic semantic segmentation dataset, called DADE, +and show that our multi-stream approach outperforms a single-stream baseline. +We believe that our work will open research opportunities in the IoT and 5G +eras, offering solutions for real-time model adaptation. + +
+
+
+
+
+ + ☆ Spatio-Temporal Side Tuning Pre-trained Foundation Models for + Video-based Pedestrian Attribute Recognition + + +
+ Existing pedestrian attribute recognition (PAR) algorithms are mainly +developed based on a static image, however, the performance is unreliable in +challenging scenarios, such as heavy occlusion, motion blur, etc. In this work, +we propose to understand human attributes using video frames that can fully use +temporal information by fine-tuning a pre-trained multi-modal foundation model +efficiently. Specifically, we formulate the video-based PAR as a +vision-language fusion problem and adopt a pre-trained foundation model CLIP to +extract the visual features. More importantly, we propose a novel +spatiotemporal side-tuning strategy to achieve parameter-efficient optimization +of the pre-trained vision foundation model. To better utilize the semantic +information, we take the full attribute list that needs to be recognized as +another input and transform the attribute words/phrases into the corresponding +sentence via split, expand, and prompt operations. Then, the text encoder of +CLIP is utilized for embedding processed attribute descriptions. The averaged +visual tokens and text tokens are concatenated and fed into a fusion +Transformer for multi-modal interactive learning. The enhanced tokens will be +fed into a classification head for pedestrian attribute prediction. Extensive +experiments on two large-scale video-based PAR datasets fully validated the +effectiveness of our proposed framework. The source code of this paper is +available at https://github.com/Event-AHU/OpenPAR. + +
+
+ comment: Parameter Efficient Fine-Tuning Strategy for Video-based Pedestrian + Attribute Recognition +
+
+
+
+
+ + ☆ Pre-training on High Definition X-ray Images: An Experimental Study + + +
+ Existing X-ray based pre-trained vision models are usually trained on a +relatively small-scale dataset (less than 500k samples) with limited resolution +(e.g., 224 $\times$ 224). However, the key to the success of self-supervised +pre-training of large models lies in massive training data, and maintaining high +resolution in the field of X-ray images is essential for effective solutions +to difficult, miscellaneous diseases. In this paper, we address these issues by +proposing the first high-definition (1280 $\times$ 1280) X-ray based +pre-trained foundation vision model on our newly collected large-scale dataset, +which contains more than 1 million X-ray images. Our model follows the masked +auto-encoder framework, which takes the tokens remaining after mask processing (with a +high masking rate) as input, and the masked image patches are reconstructed by +the Transformer encoder-decoder network. More importantly, we introduce a novel +context-aware masking strategy that utilizes the chest contour as a boundary +for adaptive masking operations. We validate the effectiveness of our model on +two downstream tasks, including X-ray report generation and disease +recognition. Extensive experiments demonstrate that our pre-trained medical +foundation vision model achieves comparable or even new state-of-the-art +performance on downstream benchmark datasets. The source code and pre-trained +models of this paper will be released on +https://github.com/Event-AHU/Medical_Image_Analysis. +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Open-Set 3D Semantic Instance Maps for Vision Language Navigation -- + O3D-SIM + + +
+ Humans excel at forming mental maps of their surroundings, equipping them to +understand object relationships and navigate based on language queries. Our +previous work SI Maps [1] showed that having instance-level information and the +semantic understanding of an environment helps significantly improve +performance for language-guided tasks. We extend this instance-level approach +to 3D while increasing the pipeline's robustness and improving quantitative and +qualitative results. Our method leverages foundational models for object +recognition, image segmentation, and feature extraction. We propose a +representation that results in a 3D point cloud map with instance-level +embeddings, which bring in the semantic understanding that natural language +commands can query. Quantitatively, the work improves upon the success rate of +language-guided tasks. At the same time, we qualitatively observe the ability +to identify instances more clearly and leverage the foundational models and +language and image-aligned embeddings to identify objects that, otherwise, a +closed-set approach wouldn't be able to identify. + +
+
+
+
+
+ + ☆ EvaNet: Elevation-Guided Flood Extent Mapping on Earth Imagery IJCAI + + +
+ Accurate and timely mapping of flood extent from high-resolution satellite +imagery plays a crucial role in disaster management tasks such as damage assessment +and relief activities. However, current state-of-the-art solutions are based on +U-Net, which cannot segment the flood pixels accurately due to the ambiguous +pixels (e.g., tree canopies, clouds) that prevent a direct judgement from only +the spectral features. Thanks to the digital elevation model (DEM) data readily +available from sources such as the United States Geological Survey (USGS), this +work explores the use of an elevation map to improve flood extent mapping. We +propose EvaNet, an elevation-guided segmentation model based on the +encoder-decoder architecture with two novel techniques: (1) a loss function +encoding the physical law of gravity that if a location is flooded (resp. dry), +then its adjacent locations with a lower (resp. higher) elevation must also be +flooded (resp. dry); (2) a new (de)convolution operation that integrates the +elevation map by a location-sensitive gating mechanism to regulate how much +spectral information flows through adjacent layers. Extensive experiments show that +EvaNet significantly outperforms the U-Net baselines, and works as a perfect +drop-in replacement for U-Net in existing solutions to flood extent mapping. +
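The gravity law in point (1) can be illustrated as a hinge penalty over neighboring pixels; the sketch below is only one plausible instantiation under assumed 4-neighborhoods and is not EvaNet's published loss:

```python
import torch
import torch.nn.functional as F

def elevation_consistency_loss(flood_prob: torch.Tensor,
                               elevation: torch.Tensor) -> torch.Tensor:
    """Penalizes pixels that are more flooded than a lower-elevation neighbor
    (and, symmetrically, less flooded than a higher-elevation neighbor).
    Illustrative only; EvaNet's actual loss may be formulated differently.

    flood_prob, elevation: (B, 1, H, W) tensors.
    """
    loss = flood_prob.new_zeros(())
    H, W = flood_prob.shape[-2:]
    for dy, dx in [(0, 1), (1, 0)]:  # right and down neighbors
        p, p_n = flood_prob[..., :H - dy, :W - dx], flood_prob[..., dy:, dx:]
        e, e_n = elevation[..., :H - dy, :W - dx], elevation[..., dy:, dx:]
        lower = (e_n < e).float()   # neighbor is downhill from the pixel
        higher = (e_n > e).float()  # neighbor is uphill from the pixel
        loss = loss + (lower * F.relu(p - p_n)).mean() \
                    + (higher * F.relu(p_n - p)).mean()
    return loss

prob, dem = torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64)
print(elevation_consistency_loss(prob, dem).item())
```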
+
+ comment: Accepted at the International Joint Conference on Artificial + Intelligence (IJCAI, 2024) +
+
+
+
+
+ + ☆ Reliable Student: Addressing Noise in Semi-Supervised 3D Object + Detection CVPR + + +
+ Semi-supervised 3D object detection can benefit from the promising +pseudo-labeling technique when labeled data is limited. However, recent +approaches have overlooked the impact of noisy pseudo-labels during training, +despite efforts to enhance pseudo-label quality through confidence-based +filtering. In this paper, we examine the impact of noisy pseudo-labels on +IoU-based target assignment and propose the Reliable Student framework, which +incorporates two complementary approaches to mitigate errors. First, it +involves a class-aware target assignment strategy that reduces false negative +assignments in difficult classes. Second, it includes a reliability weighting +strategy that suppresses false positive assignment errors while also addressing +remaining false negatives from the first step. The reliability weights are +determined by querying the teacher network for confidence scores of the +student-generated proposals. Our work surpasses the previous state-of-the-art +on KITTI 3D object detection benchmark on point clouds in the semi-supervised +setting. On 1% labeled data, our approach achieves a 6.2% AP improvement for +the pedestrian class, despite having only 37 labeled samples available. The +improvements become significant for the 2% setting, achieving 6.0% AP and 5.7% +AP improvements for the pedestrian and cyclist classes, respectively. + +
+
+ comment: Accepted at CVPR Workshop L3D-IVU 2023. Code: + https://github.com/fnozarian/ReliableStudent +
+
+
+
+
+ + ☆ Unsupervised Anomaly Detection via Masked Diffusion Posterior Sampling + + +
+ Reconstruction-based methods have been commonly used for unsupervised anomaly +detection, in which a normal image is reconstructed and compared with the given +test image to detect and locate anomalies. Recently, diffusion models have +shown promising applications for anomaly detection due to their powerful +generative ability. However, these models lack strict mathematical support for +normal image reconstruction and unexpectedly suffer from low reconstruction +quality. To address these issues, this paper proposes a novel and +highly-interpretable method named Masked Diffusion Posterior Sampling (MDPS). +In MDPS, the problem of normal image reconstruction is mathematically modeled +as multiple diffusion posterior sampling for normal images based on the devised +masked noisy observation model and the diffusion-based normal image prior under +Bayesian framework. Using a metric designed from pixel-level and +perceptual-level perspectives, MDPS can effectively compute the difference map +between each normal posterior sample and the given test image. Anomaly scores +are obtained by averaging all difference maps for multiple posterior samples. +Exhaustive experiments on MVTec and BTAD datasets demonstrate that MDPS can +achieve state-of-the-art performance in normal image reconstruction quality as +well as anomaly detection and localization. + +
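The final scoring step reduces to averaging difference maps over posterior samples; a pixel-level-only sketch (the paper additionally uses a perceptual term, omitted here):

```python
import torch

def anomaly_map(test_img: torch.Tensor, posterior_samples: torch.Tensor) -> torch.Tensor:
    """Averages per-sample squared difference maps into one anomaly map.

    test_img: (C, H, W); posterior_samples: (S, C, H, W) normal reconstructions.
    Returns an (H, W) map of per-pixel anomaly scores.
    """
    diff = (posterior_samples - test_img.unsqueeze(0)) ** 2  # (S, C, H, W)
    return diff.mean(dim=(0, 1))

scores = anomaly_map(torch.rand(3, 64, 64), torch.rand(8, 3, 64, 64))
print(scores.shape)  # torch.Size([64, 64])
```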
+
+
+
+
+ + ☆ Unpaired Multi-view Clustering via Reliable View Guidance + + +
+ This paper focuses on unpaired multi-view clustering (UMC), a challenging +problem where paired observed samples are unavailable across multiple views. +The goal is to perform effective joint clustering using the unpaired observed +samples in all views. In incomplete multi-view clustering, existing methods +typically rely on sample pairing between views to capture their complementarity. +However, that is not applicable in the case of UMC. Hence, we aim to extract +the consistent cluster structure across views. In UMC, two challenging issues +arise: an uncertain cluster structure due to the lack of labels and an uncertain pairing +relationship due to the absence of paired samples. We assume that a view with a +good cluster structure is a reliable view, which acts as a supervisor to +guide the clustering of the other views. With the guidance of reliable views, a +more certain cluster structure of these views is obtained while achieving +alignment between the reliable views and the other views. Then we propose Reliable view +Guidance with one reliable view (RG-UMC) and multiple reliable views (RGs-UMC) +for UMC. Specifically, we design alignment modules with one reliable view and +multiple reliable views, respectively, to adaptively guide the optimization +process. Also, we utilize a compactness module to enhance the relationships of +samples within the same cluster. Meanwhile, an orthogonal constraint is applied +to the latent representation to obtain discriminative features. Extensive experiments +show that both RG-UMC and RGs-UMC outperform the best state-of-the-art method +by an average of 24.14\% and 29.42\% in NMI, respectively. +
+
+
+
+
+ + ☆ DPER: Diffusion Prior Driven Neural Representation for Limited Angle and + Sparse View CT Reconstruction + + +
+ Limited-angle and sparse-view computed tomography (LACT and SVCT) are crucial +for expanding the scope of X-ray CT applications. However, they face challenges +due to incomplete data acquisition, resulting in diverse artifacts in the +reconstructed CT images. Emerging implicit neural representation (INR) +techniques, such as NeRF, NeAT, and NeRP, have shown promise in +under-determined CT imaging reconstruction tasks. However, the unsupervised +nature of the INR architecture imposes limited constraints on the solution space, +particularly for the highly ill-posed reconstruction task posed by LACT and +ultra-SVCT. In this study, we introduce the Diffusion Prior Driven Neural +Representation (DPER), an advanced unsupervised framework designed to address +these exceptionally ill-posed CT reconstruction inverse problems. DPER adopts the +Half Quadratic Splitting (HQS) algorithm to decompose the inverse problem into +data fidelity and distribution prior sub-problems. The two sub-problems are +addressed by an INR reconstruction scheme and a pre-trained score-based +diffusion model, respectively. This combination initially preserves the implicit local image +consistency prior from the INR. Additionally, it effectively augments the +feasibility of the solution space for the inverse problem through the +generative diffusion model, resulting in increased stability and precision in +the solutions. We conduct comprehensive experiments to evaluate the performance +of DPER on LACT and ultra-SVCT reconstruction with two public datasets (AAPM +and LIDC). The results show that our method outperforms the state-of-the-art +reconstruction methods on in-domain datasets, while achieving significant +performance improvements on out-of-domain datasets. +
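For reference, the HQS splitting mentioned above introduces an auxiliary variable z and alternates between a data-fidelity sub-problem and a prior sub-problem; in generic notation (not the paper's exact operators):

```latex
x^{k+1} = \arg\min_{x}\; \lVert A x - y \rVert_2^2 + \tfrac{\mu}{2}\,\lVert x - z^{k} \rVert_2^2,
\qquad
z^{k+1} = \arg\min_{z}\; \lambda\, R(z) + \tfrac{\mu}{2}\,\lVert x^{k+1} - z \rVert_2^2
```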
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ A Hybrid Approach for Document Layout Analysis in Document images ICDAR 2024 + + +
+ Document layout analysis involves understanding the arrangement of elements +within a document. This paper navigates the complexities of understanding +various elements within document images, such as text, images, tables, and +headings. The approach employs an advanced Transformer-based object detection +network as an innovative graphical page object detector for identifying tables, +figures, and displayed elements. We introduce a query encoding mechanism to +provide high-quality object queries for contrastive learning, enhancing +efficiency in the decoder phase. We also present a hybrid matching scheme that +integrates the decoder's original one-to-one matching strategy with the +one-to-many matching strategy during the training phase. This approach aims to +improve the model's accuracy and versatility in detecting various graphical +elements on a page. Our experiments on the PubLayNet, DocLayNet, and PubTables +benchmarks show that our approach outperforms current state-of-the-art methods. +It achieves an average precision of 97.3% on PubLayNet, 81.6% on DocLayNet, and +98.6% on PubTables, demonstrating its superior performance in layout analysis. +These advancements not only enhance the conversion of document images into +editable and accessible formats but also streamline information retrieval and +data extraction processes. +
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ Underwater Variable Zoom-Depth-Guided Perception Network for Underwater + Image Enhancement + + +
+ Underwater scenes intrinsically involve degradation problems owing to +heterogeneous ocean elements. Prevailing underwater image enhancement (UIE) +methods stick to straightforward feature modeling to learn the mapping +function, which leads to limited vision gain as it lacks more explicit physical +cues (e.g., depth). In this work, we investigate injecting the depth prior into +the deep UIE model for more precise scene enhancement capability. To this end, +we present a novel depth-guided perception UIE framework, dubbed underwater +variable zoom (UVZ). Specifically, UVZ resorts to a two-stage pipeline. First, +a depth estimation network is designed to generate critical depth maps, +combined with an auxiliary supervision network introduced to suppress +estimation differences during training. Second, UVZ parses near-far scenarios +by harnessing the predicted depth maps, enabling local and non-local perceiving +in different regions. Extensive experiments on five benchmark datasets +demonstrate that UVZ achieves superior visual gain and delivers promising +quantitative metrics. Besides, UVZ is confirmed to exhibit good generalization +in some visual tasks, especially in unusual lighting conditions. The code, +models and results are available at: https://github.com/WindySprint/UVZ. + +
+
+
+
+
+ + ☆ Processing HSV Colored Medical Images and Adapting Color Thresholds for + Computational Image Analysis: a Practical Introduction to an open-source tool + + +
+ Background: Using artificial intelligence (AI) techniques for computational +medical image analysis has shown promising results. However, colored images are +often not readily available for AI analysis because of different coloring +thresholds used across centers and physicians as well as the removal of +clinical annotations. We aimed to develop an open-source tool that can adapt +different color thresholds of HSV-colored medical images and remove annotations +with a simple click. + Materials and Methods: We built a function using MATLAB and used multi-center +international shear wave elastography data (NCT 02638935) to test the function. +We provide step-by-step instructions with accompanying code lines. + Results: We demonstrate that the newly developed pre-processing function +successfully removed letters and adapted different color thresholds of +HSV-colored medical images. + Conclusion: We developed an open-source tool for removing letters and +adapting different color thresholds in HSV-colored medical images. We hope this +contributes to advancing medical image processing for developing robust +computational imaging algorithms using diverse multi-center big data. The +open-source Matlab tool is available at +https://github.com/cailiemed/image-threshold-adapting. + +
+
+ comment: An open-source tool that can adapt different color thresholds of + HSV-colored medical images. The newly developed pre-processing Matlab + function successfully works on multi-center, international shear wave + elastography data (NCT 02638935). Step-by-step instructions with accompanying + code lines were provided, easy to follow and reproduce +
+
+
+
+
+ + ☆ DF-SLAM: Neural Feature Rendering Based on Dictionary Factors + Representation for High-Fidelity Dense Visual SLAM System + + +
+ We introduce a high-fidelity neural implicit dense visual Simultaneous +Localization and Mapping (SLAM) system, termed DF-SLAM. In our work, we employ +dictionary factors for scene representation, encoding the geometry and +appearance information of the scene as a combination of basis and coefficient +factors. Compared to neural implicit SLAM methods that directly encode scene +information as features, our method exhibits superior scene detail +reconstruction capabilities and more efficient memory usage, while our model +size is insensitive to the size of the scene map, making our method more +suitable for large-scale scenes. Additionally, we employ feature integration +rendering to accelerate color rendering speed while ensuring color rendering +quality, further enhancing the real-time performance of our neural SLAM method. +Extensive experiments on synthetic and real-world datasets demonstrate that our +method is competitive with existing state-of-the-art neural implicit SLAM +methods in terms of real-time performance, localization accuracy, and scene +reconstruction quality. Our source code is available at +https://github.com/funcdecl/DF-SLAM. + +
+
+
+
+
+ + ☆ Are Watermarks Bugs for Deepfake Detectors? Rethinking Proactive + Forensics IJCAI 2024 + + +
+ AI-generated content has accelerated the topic of media synthesis, +particularly Deepfake, which can manipulate our portraits for positive or +malicious purposes. Before releasing these threatening face images, one +promising forensics solution is the injection of robust watermarks to track +their own provenance. However, we argue that current watermarking models, +originally devised for genuine images, may harm the deployed Deepfake detectors +when directly applied to forged images, since the watermarks are prone to +overlap with the forgery signals used for detection. To bridge this gap, we +thus propose AdvMark, on behalf of proactive forensics, to exploit the +adversarial vulnerability of passive detectors for good. Specifically, AdvMark +serves as a plug-and-play procedure for fine-tuning any robust watermarking +into adversarial watermarking, to enhance the forensic detectability of +watermarked images; meanwhile, the watermarks can still be extracted for +provenance tracking. Extensive experiments demonstrate the effectiveness of the +proposed AdvMark, leveraging robust watermarking to fool Deepfake detectors, +which can help improve the accuracy of downstream Deepfake detection without +tuning the in-the-wild detectors. We believe this work will shed some light on +the harmless proactive forensics against Deepfake. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Vision-based Discovery of Nonlinear Dynamics for 3D Moving Target + + +
+ Data-driven discovery of governing equations has kindled significant +interests in many science and engineering areas. Existing studies primarily +focus on uncovering equations that govern nonlinear dynamics based on direct +measurement of the system states (e.g., trajectories). Limited efforts have +been placed on distilling governing laws of dynamics directly from videos for +moving targets in a 3D space. To this end, we propose a vision-based approach +to automatically uncover governing equations of nonlinear dynamics for 3D +moving targets via raw videos recorded by a set of cameras. The approach is +composed of three key blocks: (1) a target tracking module that extracts plane +pixel motions of the moving target in each video, (2) a Rodrigues' rotation +formula-based coordinate transformation learning module that reconstructs the +3D coordinates with respect to a predefined reference point, and (3) a +spline-enhanced library-based sparse regressor that uncovers the underlying +governing law of dynamics. This framework is capable of effectively handling +the challenges associated with measurement data, e.g., noise in the video, +imprecise tracking of the target that causes data missing, etc. The efficacy of +our method has been demonstrated through multiple sets of synthetic videos +considering different nonlinear dynamics. + +
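For reference, the Rodrigues' rotation formula underlying the coordinate transformation learning module expresses a rotation by angle θ about a unit axis k = (k_x, k_y, k_z) as:

```latex
R = I + \sin\theta\, K + (1 - \cos\theta)\, K^{2},
\qquad
K = \begin{pmatrix} 0 & -k_z & k_y \\ k_z & 0 & -k_x \\ -k_y & k_x & 0 \end{pmatrix}
```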
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ BoostRad: Enhancing Object Detection by Boosting Radar Reflections WACV2024 + + +
+ Automotive radars have an important role in autonomous driving systems. The +main challenge in automotive radar detection is the radar's wide point spread +function (PSF) in the angular domain that causes blurriness and clutter in the +radar image. Numerous studies suggest employing an 'end-to-end' learning +strategy using a Deep Neural Network (DNN) to directly detect objects from +radar images. This approach implicitly addresses the PSF's impact on objects of +interest. In this paper, we propose an alternative approach, which we term +"Boosting Radar Reflections" (BoostRad). In BoostRad, a first DNN is trained to +narrow the PSF for all the reflection points in the scene. The output of the +first DNN is a boosted reflection image with higher resolution and reduced +clutter, resulting in a sharper and cleaner image. Subsequently, a second DNN +is employed to detect objects within the boosted reflection image. We develop a +novel method for training the boosting DNN that incorporates domain knowledge +of radar's PSF characteristics. BoostRad's performance is evaluated using the +RADDet and CARRADA datasets, revealing its superiority over reference methods. + +
+
+ comment: WACV2024 +
+
+
+
+
+ + ☆ GLIMS: Attention-Guided Lightweight Multi-Scale Hybrid Network for + Volumetric Semantic Segmentation + + +
+ Convolutional Neural Networks (CNNs) have become widely adopted for medical +image segmentation tasks, demonstrating promising performance. However, the +inherent inductive biases in convolutional architectures limit their ability to +model long-range dependencies and spatial correlations. While recent +transformer-based architectures address these limitations by leveraging +self-attention mechanisms to encode long-range dependencies and learn +expressive representations, they often struggle to extract low-level features +and are highly dependent on data availability. This motivated us for the +development of GLIMS, a data-efficient attention-guided hybrid volumetric +segmentation network. GLIMS utilizes Dilated Feature Aggregator Convolutional +Blocks (DACB) to capture local-global feature correlations efficiently. +Furthermore, the incorporated Swin Transformer-based bottleneck bridges the +local and global features to improve the robustness of the model. Additionally, +GLIMS employs an attention-guided segmentation approach through Channel and +Spatial-Wise Attention Blocks (CSAB) to localize expressive features for +fine-grained border segmentation. Quantitative and qualitative results on +glioblastoma and multi-organ CT segmentation tasks demonstrate GLIMS' +effectiveness in terms of complexity and accuracy. GLIMS demonstrated +outstanding performance on BraTS2021 and BTCV datasets, surpassing the +performance of Swin UNETR. Notably, GLIMS achieved this high performance with a +significantly reduced number of trainable parameters. Specifically, GLIMS has +47.16M trainable parameters and 72.30G FLOPs, while Swin UNETR has 61.98M +trainable parameters and 394.84G FLOPs. The code is publicly available on +https://github.com/yaziciz/GLIMS. + +
+
+ comment: The article was accepted for publication in the Image and Vision + Computing journal +
+
+
+
+
+ + ☆ Instance-free Text to Point Cloud Localization with Relative Position + Awareness + + +
+ Text-to-point-cloud cross-modal localization is an emerging vision-language +task critical for future robot-human collaboration. It seeks to localize a +position from a city-scale point cloud scene based on a few natural language +instructions. In this paper, we address two key limitations of existing +approaches: 1) their reliance on ground-truth instances as input; and 2) their +neglect of the relative positions among potential instances. Our proposed model +follows a two-stage pipeline, including a coarse stage for text-cell retrieval +and a fine stage for position estimation. In both stages, we introduce an +instance query extractor, in which the cells are encoded by a 3D sparse +convolution U-Net to generate the multi-scale point cloud features, and a set +of queries iteratively attend to these features to represent instances. In the +coarse stage, a row-column relative position-aware self-attention (RowColRPA) +module is designed to capture the spatial relations among the instance queries. +In the fine stage, a multi-modal relative position-aware cross-attention (RPCA) +module is developed to fuse the text and point cloud features along with +spatial relations for improving fine position estimation. Experiment results on +the KITTI360Pose dataset demonstrate that our model achieves competitive +performance with the state-of-the-art models without taking ground-truth +instances as input. + +
+
+ comment: 12 pages, 10 figures, conference +
+
+
+
+
+ + ☆ Hybrid 3D Human Pose Estimation with Monocular Video and Sparse IMUs + + +
+ Temporal 3D human pose estimation from monocular videos is a challenging task +in human-centered computer vision due to the depth ambiguity of 2D-to-3D +lifting. To improve accuracy and address occlusion issues, inertial sensor has +been introduced to provide complementary source of information. However, it +remains challenging to integrate heterogeneous sensor data for producing +physically rational 3D human poses. In this paper, we propose a novel +framework, Real-time Optimization and Fusion (RTOF), to address this issue. We +first incorporate sparse inertial orientations into a parametric human skeleton +to refine 3D poses in kinematics. The poses are then optimized by energy +functions built on both visual and inertial observations to reduce the temporal +jitters. Our framework outputs smooth and biomechanically plausible human +motion. Comprehensive experiments with ablation studies demonstrate its +rationality and efficiency. On Total Capture dataset, the pose estimation error +is significantly decreased compared to the baseline method. + +
+
+ comment: 10 pages, 5 figures, Under Review +
+
+
+
+
+ + ☆ Dynamic Against Dynamic: An Open-set Self-learning Framework IJCAI2024 + + +
+ In open-set recognition, existing methods generally learn statically fixed +decision boundaries using known classes to reject unknown classes. Though they +have achieved promising results, such decision boundaries are evidently +insufficient for universal unknown classes in dynamic and open scenarios, as +these can potentially appear at any position in the feature space. Moreover, +these methods simply reject unknown class samples during testing without +making any effective use of them. In fact, such samples can fully +constitute the true instantiated representation of the unknown classes and +further enhance the model's performance. To address these issues, this paper +proposes a novel dynamic-against-dynamic idea, i.e., a dynamic method against a +dynamically changing open-set world, where an open-set self-learning (OSSL) +framework is correspondingly developed. OSSL starts with a good closed-set +classifier trained on known classes and utilizes available test samples for +model adaptation during testing, thus gaining adaptability to changing data +distributions. In particular, a novel self-matching module is designed for +OSSL, which achieves adaptation by automatically identifying known class +samples while rejecting unknown class samples, which are further utilized to +enhance the discriminability of the model as the instantiated representation of +the unknown classes. Our method establishes new performance milestones +on almost all standard and cross-data benchmarks. +
+
+ comment: The first two authors contributed equally to this work. Accepted at + IJCAI2024 +
+
+
+
+
+ + ☆ ODCR: Orthogonal Decoupling Contrastive Regularization for Unpaired + Image Dehazing CVPR 2024 + + +
+ Unpaired image dehazing (UID) holds significant research importance due to +the challenges in acquiring haze/clear image pairs with identical backgrounds. +This paper proposes a novel method for UID named Orthogonal Decoupling +Contrastive Regularization (ODCR). Our method is grounded in the assumption +that an image consists of both haze-related features, which influence the +degree of haze, and haze-unrelated features, such as texture and semantic +information. ODCR aims to ensure that the haze-related features of the dehazing +result closely resemble those of the clear image, while the haze-unrelated +features align with the input hazy image. To this end, +Orthogonal MLPs optimized geometrically on the Stiefel manifold are proposed, +which can project image features into an orthogonal space, thereby reducing the +relevance between different features. Furthermore, a task-driven Depth-wise +Feature Classifier (DWFC) is proposed, which assigns weights to the orthogonal +features based on the contribution of each channel's feature in predicting +whether the feature source is hazy or clear in a self-supervised fashion. +Finally, a Weighted PatchNCE (WPNCE) loss is introduced to pull the +haze-related features of the output image toward those of clear images, +while bringing the haze-unrelated features close to those of the hazy input. +Extensive experiments demonstrate the superior performance of our ODCR method +on UID. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ From Optimization to Generalization: Fair Federated Learning against + Quality Shift via Inter-Client Sharpness Matching IJCAI'24 + + +
+ Due to escalating privacy concerns, federated learning has been recognized as +a vital approach for training deep neural networks with decentralized medical +data. In practice, it is challenging to ensure consistent imaging quality +across various institutions, often attributed to equipment malfunctions +affecting a minority of clients. This imbalance in image quality can cause the +federated model to develop an inherent bias towards higher-quality images, thus +posing a severe fairness issue. In this study, we pioneer the identification +and formulation of this new fairness challenge within the context of the +imaging quality shift. Traditional methods for promoting fairness in federated +learning predominantly focus on balancing empirical risks across diverse client +distributions. This strategy primarily facilitates fair optimization across +different training data distributions, yet neglects the crucial aspect of +generalization. To address this, we introduce a solution termed Federated +learning with Inter-client Sharpness Matching (FedISM). FedISM enhances both +local training and global aggregation by incorporating sharpness-awareness, +aiming to harmonize the sharpness levels across clients for fair +generalization. Our empirical evaluations, conducted using the widely-used ICH +and ISIC 2019 datasets, establish FedISM's superiority over current +state-of-the-art federated learning methods in promoting fairness. Code is +available at https://github.com/wnn2000/FFL4MIA. + +
+
+ comment: This paper is accepted at IJCAI'24 (Main Track) +
+
+
+
+
+ + ☆ CLFT: Camera-LiDAR Fusion Transformer for Semantic Segmentation in + Autonomous Driving + + +
+ Critical research on camera-and-LiDAR-based semantic object segmentation +for autonomous driving has benefited significantly from the recent development of +deep learning. Specifically, the vision transformer is the novel ground-breaker +that successfully brought the multi-head-attention mechanism to computer vision +applications. Therefore, we propose a vision-transformer-based network to carry +out camera-LiDAR fusion for semantic segmentation applied to autonomous +driving. Our proposal uses the novel progressive-assemble strategy of vision +transformers on a double-direction network and then integrates the results in a +cross-fusion strategy over the transformer decoder layers. Unlike other works +in the literature, our camera-LiDAR fusion transformers have been evaluated in +challenging conditions like rain and low illumination, showing robust +performance. The paper reports the segmentation results over the vehicle and +human classes in different modalities: camera-only, LiDAR-only, and +camera-LiDAR fusion. We perform coherent controlled benchmark experiments of +CLFT against other networks that are also designed for semantic segmentation. +The experiments aim to evaluate the performance of CLFT independently from two +perspectives: multimodal sensor fusion and backbone architectures. The +quantitative assessments show our CLFT networks yield an improvement of up to +10\% for challenging dark-wet conditions when compared with a +Fully-Convolutional-Neural-Network-based (FCN) camera-LiDAR fusion neural +network. Compared to the network with a transformer backbone but single-modality +input, the all-around improvement is 5-10\%. +
+
+ comment: Submitted to IEEE Transactions on Intelligent Vehicles +
+
+
+
+
+ + ☆ High-quality Surface Reconstruction using Gaussian Surfels + + +
+ We propose a novel point-based representation, Gaussian surfels, to combine +the advantages of the flexible optimization procedure in 3D Gaussian points and +the surface alignment property of surfels. This is achieved by directly setting +the z-scale of 3D Gaussian points to 0, effectively flattening the original 3D +ellipsoid into a 2D ellipse. Such a design provides clear guidance to the +optimizer. By treating the local z-axis as the normal direction, it greatly +improves optimization stability and surface alignment. While the derivatives to +the local z-axis computed from the covariance matrix are zero in this setting, +we design a self-supervised normal-depth consistency loss to remedy this issue. +Monocular normal priors and foreground masks are incorporated to enhance the +quality of the reconstruction, mitigating issues related to highlights and +background. We propose a volumetric cutting method to aggregate the information +of Gaussian surfels so as to remove erroneous points in depth maps generated by +alpha blending. Finally, we apply screened Poisson reconstruction method to the +fused depth maps to extract the surface mesh. Experimental results show that +our method demonstrates superior performance in surface reconstruction compared +to state-of-the-art neural volume rendering and point-based rendering methods. + +
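In the usual Gaussian-splatting parameterization with rotation R and per-axis scales, setting the local z-scale to zero as described above yields a degenerate, planar covariance (a sketch of the idea, not necessarily the paper's notation):

```latex
\Sigma = R \,\operatorname{diag}\!\left(s_x^{2},\; s_y^{2},\; 0\right) R^{\top}
```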
+
+ comment: Original version +
+
+
+
+
+ + ☆ Compressing Latent Space via Least Volume + + +
+ This paper introduces Least Volume, a simple yet effective regularization +inspired by geometric intuition, which can reduce the number of latent +dimensions needed by an autoencoder without requiring any prior knowledge of +the intrinsic dimensionality of the dataset. We show that the Lipschitz +continuity of the decoder is the key to making it work, provide a proof that +PCA is just a linear special case of it, and reveal that it has a similar +PCA-like importance ordering effect when applied to nonlinear models. We +demonstrate the intuition behind the regularization on some pedagogical toy +problems, and its effectiveness on several benchmark problems, including MNIST, +CIFAR-10 and CelebA. +
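One plausible way to instantiate a "volume" penalty on the latent code is the geometric mean of per-dimension standard deviations, computed in log space for stability; this sketch is an assumption and omits the decoder Lipschitz-continuity control that the abstract identifies as key:

```python
import torch

def least_volume_penalty(z: torch.Tensor, eta: float = 1e-3) -> torch.Tensor:
    """Geometric mean of per-dimension latent standard deviations, a proxy for
    the 'volume' occupied by the latent code (illustrative regularizer only).

    z: (B, D) batch of latent codes.
    """
    std = z.std(dim=0) + eta                   # eta keeps the product away from zero
    return torch.exp(torch.log(std).mean())    # product computed in log space

z = torch.randn(128, 32)
print(least_volume_penalty(z).item())
```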
+
+ comment: 24 pages, International Conference on Learning Representations 2024 +
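+ One plausible reading of the regularizer described above, sketched here rather
+than reproduced from the paper (the exact loss may differ): penalize the
+geometric mean of the per-dimension standard deviations of the latent codes,
+while keeping the decoder Lipschitz (e.g. via spectral normalization), so that
+unneeded latent dimensions collapse towards zero variance.
+
+ import torch
+
+ def least_volume_penalty(z, eps=1e-6):
+     # z: (batch, d) latent codes from the encoder.
+     std = z.std(dim=0) + eps
+     return torch.exp(torch.log(std).mean())   # geometric mean of latent stds
+
+ # total_loss = reconstruction_loss + lam * least_volume_penalty(z)
+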
+
+
+
+
+ + ☆ Charaterization of dim light response in DVS pixel: Discontinuity of + event triggering time + + +
+ Dynamic Vision Sensors (DVS) have recently generated great interest because +of the advantages of wide dynamic range and low latency compared with +conventional frame-based cameras. However, the complicated behaviors in dim +light conditions are still not clear, restricting the applications of DVS. In +this paper, we analyze the typical DVS circuit, and find that there exists +discontinuity of event triggering time. In dim light conditions, the +discontinuity becomes prominent. We point out that the discontinuity depends +exclusively on the changing speed of light intensity. Experimental results on +real event data validate the analysis and the existence of discontinuity that +reveals the non-first-order behaviors of DVS in dim light conditions. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Make the Most of Your Data: Changing the Training Data Distribution to + Improve In-distribution Generalization Performance + + +
+ Can we modify the training data distribution to encourage the underlying +optimization method toward finding solutions with superior generalization +performance on in-distribution data? In this work, we approach this question +for the first time by comparing the inductive bias of gradient descent (GD) +with that of sharpness-aware minimization (SAM). By studying a two-layer CNN, +we prove that SAM learns easy and difficult features more uniformly, +particularly in early epochs. That is, SAM is less susceptible to simplicity +bias compared to GD. Based on this observation, we propose USEFUL, an algorithm +that clusters examples based on the network output early in training and +upsamples examples with no easy features to alleviate the pitfalls of the +simplicity bias. We show empirically that modifying the training data +distribution in this way effectively improves the generalization performance on +the original data distribution when training with (S)GD by mimicking the +training dynamics of SAM. Notably, we demonstrate that our method can be +combined with SAM and existing data augmentation strategies to achieve, to the +best of our knowledge, state-of-the-art performance for training ResNet18 on +CIFAR10, STL10, CINIC10, Tiny-ImageNet; ResNet34 on CIFAR100; and VGG19 and +DenseNet121 on CIFAR10. + +
+
+ comment: 32 pages, 11 figures, 6 tables +
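+ A rough sketch of the upsampling idea summarized above; this is not the USEFUL
+algorithm itself, and the two-cluster heuristic and the confidence proxy used
+to pick the "hard" cluster are assumptions made for illustration only.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def upsample_hard_cluster(example_ids, early_probs, labels):
+     # Cluster examples by the network's early-training output, then duplicate
+     # the cluster the model fits worse (a proxy for "no easy features").
+     probs = np.asarray(early_probs)           # (n, num_classes) softmax outputs
+     labels = np.asarray(labels)
+     km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(probs)
+     conf = probs[np.arange(len(labels)), labels]
+     hard = int(np.argmin([conf[km.labels_ == c].mean() for c in (0, 1)]))
+     hard_ids = [i for i, c in zip(example_ids, km.labels_) if c == hard]
+     return list(example_ids) + hard_ids       # hard examples appear twice
+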
+
+
+
+
+ + ☆ RFL-CDNet: Towards Accurate Change Detection via Richer Feature Learning + + +
+ Change Detection is a crucial but extremely challenging task of remote +sensing image analysis, and much progress has been made with the rapid +development of deep learning. However, most existing deep learning-based change +detection methods mainly focus on intricate feature extraction and multi-scale +feature fusion, while ignoring the insufficient utilization of features in the +intermediate stages, thus resulting in sub-optimal results. To this end, we +propose a novel framework, named RFL-CDNet, that utilizes richer feature +learning to boost change detection performance. Specifically, we first +introduce deep multiple supervision to enhance intermediate representations, +thus unleashing the potential of backbone feature extractor at each stage. +Furthermore, we design the Coarse-To-Fine Guiding (C2FG) module and the +Learnable Fusion (LF) module to further improve feature learning and obtain +more discriminative feature representations. The C2FG module aims to seamlessly +integrate the side prediction from the previous coarse-scale into the current +fine-scale prediction in a coarse-to-fine manner, while LF module assumes that +the contribution of each stage and each spatial location is independent, thus +designing a learnable module to fuse multiple predictions. Experiments on +several benchmark datasets show that our proposed RFL-CDNet achieves +state-of-the-art performance on WHU cultivated land dataset and CDD dataset, +and the second-best performance on WHU building dataset. The source code and +models are publicly available at https://github.com/Hhaizee/RFL-CDNet. + +
+
+ comment: Accepted by PR, volume 153 +
+
+
+
+
+ + ☆ Large Multi-modality Model Assisted AI-Generated Image Quality + Assessment + + +
+ Traditional deep neural network (DNN)-based image quality assessment (IQA) +models leverage convolutional neural networks (CNN) or Transformer to learn the +quality-aware feature representation, achieving commendable performance on +natural scene images. However, when applied to AI-Generated images (AGIs), +these DNN-based IQA models exhibit subpar performance. This situation is +largely due to the semantic inaccuracies inherent in certain AGIs caused by +uncontrollable nature of the generation process. Thus, the capability to +discern semantic content becomes crucial for assessing the quality of AGIs. +Traditional DNN-based IQA models, constrained by limited parameter complexity +and training data, struggle to capture complex fine-grained semantic features, +making it challenging to grasp the existence and coherence of semantic content +of the entire image. To address the shortfall in semantic content perception of +current IQA models, we introduce a large Multi-modality model Assisted +AI-Generated Image Quality Assessment (MA-AGIQA) model, which utilizes +semantically informed guidance to sense semantic information and extract +semantic vectors through carefully designed text prompts. Moreover, it employs +a mixture of experts (MoE) structure to dynamically integrate the semantic +information with the quality-aware features extracted by traditional DNN-based +IQA models. Comprehensive experiments conducted on two AI-generated content +datasets, AIGCQA-20k and AGIQA-3k show that MA-AGIQA achieves state-of-the-art +performance, and demonstrate its superior generalization capabilities on +assessing the quality of AGIs. Code is available at +https://github.com/wangpuyi/MA-AGIQA. + +
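+ A toy two-expert gating module in the spirit of the mixture-of-experts fusion
+described above; it does not reproduce the actual MA-AGIQA design, and all
+dimensions and layer choices are placeholders.
+
+ import torch
+ import torch.nn as nn
+
+ class SemanticQualityMoE(nn.Module):
+     def __init__(self, sem_dim, qual_dim, hidden=256):
+         super().__init__()
+         self.sem_proj = nn.Linear(sem_dim, hidden)    # expert 1: semantic vector
+         self.qual_proj = nn.Linear(qual_dim, hidden)  # expert 2: quality features
+         self.gate = nn.Sequential(nn.Linear(sem_dim + qual_dim, 2), nn.Softmax(dim=-1))
+         self.head = nn.Linear(hidden, 1)
+
+     def forward(self, sem, qual):
+         w = self.gate(torch.cat([sem, qual], dim=-1))                  # (B, 2)
+         experts = torch.stack([self.sem_proj(sem), self.qual_proj(qual)], dim=1)
+         fused = (w.unsqueeze(-1) * experts).sum(dim=1)                 # gated fusion
+         return self.head(fused).squeeze(-1)                           # quality score
+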
+
+
+
+
+ + ☆ Adversarial Examples: Generation Proposal in the Context of Facial + Recognition Systems + + +
+ In this paper we investigate the vulnerability that facial recognition +systems present to adversarial examples by introducing a new methodology from +the attacker perspective. The technique is based on the use of the autoencoder +latent space, organized with principal component analysis. We intend to analyze +the potential to craft adversarial examples suitable for both dodging and +impersonation attacks, against state-of-the-art systems. Our initial +hypothesis, which was not strongly favoured by the results, stated that it +would be possible to separate between the "identity" and "facial expression" +features to produce high-quality examples. Despite the findings not supporting +it, the results sparked insights into adversarial examples generation and +opened new research avenues in the area. + +
+
+
+
+
+ + ☆ Leveraging Cross-Modal Neighbor Representation for Improved CLIP + Classification + + +
+ CLIP showcases exceptional cross-modal matching capabilities due to its
+training on image-text contrastive learning tasks. However, without specific
+optimization for unimodal scenarios, its performance in single-modality feature
+extraction might be suboptimal. Despite this, some studies have directly used
+CLIP's image encoder for tasks like few-shot classification, introducing a
+misalignment between its pre-training objectives and feature extraction
+methods. This inconsistency can diminish the quality of the image's feature
+representation, adversely affecting CLIP's effectiveness in target tasks. In
+this paper, we view text features as precise neighbors of image features in
+CLIP's space and present a novel CrOss-moDal nEighbor Representation (CODER)
+based on the distance structure between images and their neighbor texts. This
+feature extraction method aligns better with CLIP's pre-training objectives,
+thereby fully leveraging CLIP's robust cross-modal capabilities. The key to
+constructing a high-quality CODER lies in creating a vast amount of
+high-quality and diverse texts to match with images. We introduce the Auto Text
+Generator (ATG) to automatically generate the required texts in a data-free and
+training-free manner. We apply CODER to CLIP's zero-shot and few-shot image
+classification tasks. Experimental results across various datasets and models
+confirm CODER's effectiveness. Code is available at:
+https://github.com/YCaigogogo/CVPR24-CODER.
+
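+ The neighbor-based representation described above can be sketched as follows,
+assuming pre-extracted CLIP-like image and text embeddings; this illustrates
+the distance-structure idea only and is not the released CODER code.
+
+ import numpy as np
+
+ def cross_modal_neighbor_repr(image_feats, text_feats):
+     # image_feats: (n_img, d), text_feats: (n_txt, d) embeddings from the same
+     # vision-language model. Each image is re-represented by its similarities
+     # to the neighbor texts instead of by its raw image embedding.
+     img = image_feats / np.linalg.norm(image_feats, axis=1, keepdims=True)
+     txt = text_feats / np.linalg.norm(text_feats, axis=1, keepdims=True)
+     return img @ txt.T    # (n_img, n_txt) cross-modal neighbor representation
+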
+
+
+
+
+ + ☆ MMA-UNet: A Multi-Modal Asymmetric UNet Architecture for Infrared and + Visible Image Fusion + + +
+ Multi-modal image fusion (MMIF) maps useful information from various
+modalities into the same representation space, thereby producing an informative
+fused image. However, the existing fusion algorithms tend to symmetrically fuse
+the multi-modal images, causing the loss of shallow information or bias towards
+a single modality in certain regions of the fusion results. In this study, we
+analyzed the spatial distribution differences of information in different
+modalities and proved that encoding features within the same network is not
+conducive to achieving simultaneous deep feature space alignment for
+multi-modal images. To overcome this issue, a Multi-Modal Asymmetric UNet
+(MMA-UNet) was proposed. We separately trained specialized feature encoders for
+different modalities and implemented a cross-scale fusion strategy to maintain
+the features from different modalities within the same representation space,
+ensuring a balanced information fusion process. Furthermore, extensive fusion
+and downstream task experiments were conducted to demonstrate the efficiency of
+MMA-UNet in fusing infrared and visible image information, producing visually
+natural and semantically rich fusion results. Its performance surpasses that of
+the state-of-the-art comparison fusion methods.
+
+
+
+
+
+ + ☆ An Attention-Based Deep Learning Architecture for Real-Time Monocular + Visual Odometry: Applications to GPS-free Drone Navigation + + +
+ Drones are increasingly used in fields like industry, medicine, research, +disaster relief, defense, and security. Technical challenges, such as +navigation in GPS-denied environments, hinder further adoption. Research in +visual odometry is advancing, potentially solving GPS-free navigation issues. +Traditional visual odometry methods use geometry-based pipelines which, while +popular, often suffer from error accumulation and high computational demands. +Recent studies utilizing deep neural networks (DNNs) have shown improved +performance, addressing these drawbacks. Deep visual odometry typically employs +convolutional neural networks (CNNs) and sequence modeling networks like +recurrent neural networks (RNNs) to interpret scenes and deduce visual odometry +from video sequences. This paper presents a novel real-time monocular visual +odometry model for drones, using a deep neural architecture with a +self-attention module. It estimates the ego-motion of a camera on a drone, +using consecutive video frames. An inference utility processes the live video +feed, employing deep learning to estimate the drone's trajectory. The +architecture combines a CNN for image feature extraction and a long short-term +memory (LSTM) network with a multi-head attention module for video sequence +modeling. Tested on two visual odometry datasets, this model converged 48% +faster than a previous RNN model and showed a 22% reduction in mean +translational drift and a 12% improvement in mean translational absolute +trajectory error, demonstrating enhanced robustness to noise. + +
+
+ comment: 22 Pages, 3 Tables, 9 Figures +
+
+
+
+
+ + ☆ Segmentation Quality and Volumetric Accuracy in Medical Imaging + + +
+ Current medical image segmentation relies on the region-based (Dice, +F1-score) and boundary-based (Hausdorff distance, surface distance) metrics as +the de-facto standard. While these metrics are widely used, they lack a unified +interpretation, particularly regarding volume agreement. Clinicians often lack +clear benchmarks to gauge the "goodness" of segmentation results based on these +metrics. Recognizing the clinical relevance of volumetry, we utilize relative +volume prediction error (vpe) to directly assess the accuracy of volume +predictions derived from segmentation tasks. Our work integrates theoretical +analysis and empirical validation across diverse datasets. We delve into the +often-ambiguous relationship between segmentation quality (measured by Dice) +and volumetric accuracy in clinical practice. Our findings highlight the +critical role of incorporating volumetric prediction accuracy into segmentation +evaluation. This approach empowers clinicians with a more nuanced understanding +of segmentation performance, ultimately improving the interpretation and +utility of these metrics in real-world healthcare settings. + +
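+ The relative volume prediction error (vpe) mentioned above reduces to a
+one-line formula; the sketch below assumes binary masks, and the signed (rather
+than absolute) convention and the voxel-volume argument are assumptions.
+
+ import numpy as np
+
+ def relative_volume_error(pred_mask, gt_mask, voxel_volume_mm3=1.0):
+     # vpe = (V_pred - V_gt) / V_gt, with volumes from voxel counts.
+     v_pred = np.count_nonzero(pred_mask) * voxel_volume_mm3
+     v_gt = np.count_nonzero(gt_mask) * voxel_volume_mm3
+     return (v_pred - v_gt) / v_gt
+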
+
+
+
+
+ + ☆ Diffusion-Aided Joint Source Channel Coding For High Realism Wireless + Image Transmission + + +
+ Deep learning-based joint source-channel coding (deep JSCC) has been +demonstrated as an effective approach for wireless image transmission. +Nevertheless, current research has concentrated on minimizing a standard +distortion metric such as Mean Squared Error (MSE), which does not necessarily +improve the perceptual quality. To address this issue, we propose DiffJSCC, a +novel framework that leverages pre-trained text-to-image diffusion models to +enhance the realism of images transmitted over the channel. The proposed +DiffJSCC utilizes prior deep JSCC frameworks to deliver an initial +reconstructed image at the receiver. Then, the spatial and textual features are +extracted from the initial reconstruction, which, together with the channel +state information (e.g., signal-to-noise ratio, SNR), are passed to a control +module to fine-tune the pre-trained Stable Diffusion model. Extensive +experiments on the Kodak dataset reveal that our method significantly surpasses +both conventional methods and prior deep JSCC approaches on perceptual metrics +such as LPIPS and FID scores, especially with poor channel conditions and +limited bandwidth. Notably, DiffJSCC can achieve highly realistic +reconstructions for 768x512 pixel Kodak images with only 3072 symbols (<0.008 +symbols per pixel) under 1dB SNR. Our code will be released in +https://github.com/mingyuyng/DiffJSCC. + +
+
+
+
+
+ + ☆ CUE-Net: Violence Detection Video Analytics with Spatial Cropping, + Enhanced UniformerV2 and Modified Efficient Additive Attention CVPR + + +
+ In this paper we introduce CUE-Net, a novel architecture designed for +automated violence detection in video surveillance. As surveillance systems +become more prevalent due to technological advances and decreasing costs, the +challenge of efficiently monitoring vast amounts of video data has intensified. +CUE-Net addresses this challenge by combining spatial Cropping with an enhanced +version of the UniformerV2 architecture, integrating convolutional and +self-attention mechanisms alongside a novel Modified Efficient Additive +Attention mechanism (which reduces the quadratic time complexity of +self-attention) to effectively and efficiently identify violent activities. +This approach aims to overcome traditional challenges such as capturing distant +or partially obscured subjects within video frames. By focusing on both local +and global spatiotemporal features, CUE-Net achieves state-of-the-art +performance on the RWF-2000 and RLVS datasets, surpassing existing methods. + +
+
+ comment: To be published in the proceedings of 2024 IEEE/CVF Conference on + Computer Vision and Pattern Recognition Workshops (CVPRW) +
+
+
+
+
+ + ☆ Attacking Bayes: On the Adversarial Robustness of Bayesian Neural + Networks + + +
+ Adversarial examples have been shown to cause neural networks to fail on a +wide range of vision and language tasks, but recent work has claimed that +Bayesian neural networks (BNNs) are inherently robust to adversarial +perturbations. In this work, we examine this claim. To study the adversarial +robustness of BNNs, we investigate whether it is possible to successfully break +state-of-the-art BNN inference methods and prediction pipelines using even +relatively unsophisticated attacks for three tasks: (1) label prediction under +the posterior predictive mean, (2) adversarial example detection with Bayesian +predictive uncertainty, and (3) semantic shift detection. We find that BNNs +trained with state-of-the-art approximate inference methods, and even BNNs +trained with Hamiltonian Monte Carlo, are highly susceptible to adversarial +attacks. We also identify various conceptual and experimental errors in +previous works that claimed inherent adversarial robustness of BNNs and +conclusively demonstrate that BNNs and uncertainty-aware Bayesian prediction +pipelines are not inherently robust against adversarial attacks. + +
+
+
+
+
+ + ☆ MediFact at MEDIQA-M3G 2024: Medical Question Answering in Dermatology + with Multimodal Learning + + +
+ The MEDIQA-M3G 2024 challenge necessitates novel solutions for Multilingual & +Multimodal Medical Answer Generation in dermatology (wai Yim et al., 2024a). +This paper addresses the limitations of traditional methods by proposing a +weakly supervised learning approach for open-ended medical question-answering +(QA). Our system leverages readily available MEDIQA-M3G images via a +VGG16-CNN-SVM model, enabling multilingual (English, Chinese, Spanish) learning +of informative skin condition representations. Using pre-trained QA models, we +further bridge the gap between visual and textual information through +multimodal fusion. This approach tackles complex, open-ended questions even +without predefined answer choices. We empower the generation of comprehensive +answers by feeding the ViT-CLIP model with multiple responses alongside images. +This work advances medical QA research, paving the way for clinical decision +support systems and ultimately improving healthcare delivery. + +
+
+ comment: 7 pages, 3 figures, Clinical NLP 2024 workshop proceedings in Shared + Task +
+
+
+
+
+ + ♻ ☆ Exploring Intrinsic Properties of Medical Images for Self-Supervised + Binary Semantic Segmentation + + +
+ Recent advancements in self-supervised learning have unlocked the potential +to harness unlabeled data for auxiliary tasks, facilitating the learning of +beneficial priors. This has been particularly advantageous in fields like +medical image analysis, where labeled data are scarce. Although effective for +classification tasks, this methodology has shown limitations in more complex +applications, such as medical image segmentation. In this paper, we introduce +Medical imaging Enhanced with Dynamic Self-Adaptive Semantic Segmentation +(MedSASS), a dedicated self-supervised framework tailored for medical image +segmentation. We evaluate MedSASS against existing state-of-the-art methods +across four diverse medical datasets, showcasing its superiority. MedSASS +outperforms existing CNN-based self-supervised methods by 3.83% and matches the +performance of ViT-based methods. Furthermore, when MedSASS is trained +end-to-end, covering both encoder and decoder, it demonstrates significant +improvements of 14.4% for CNNs and 6% for ViT-based architectures compared to +existing state-of-the-art self-supervised strategies. + +
+
+ comment: 30 pages, 10 figures, and 10 tables. Under Review +
+
+
+
+
+ + ♻ ☆ Fusion Transformer with Object Mask Guidance for Image Forgery Analysis + + +
+ In this work, we introduce OMG-Fuser, a fusion transformer-based network +designed to extract information from various forensic signals to enable robust +image forgery detection and localization. Our approach can operate with an +arbitrary number of forensic signals and leverages object information for their +analysis -- unlike previous methods that rely on fusion schemes with few +signals and often disregard image semantics. To this end, we design a forensic +signal stream composed of a transformer guided by an object attention +mechanism, associating patches that depict the same objects. In that way, we +incorporate object-level information from the image. Each forensic signal is +processed by a different stream that adapts to its peculiarities. A token +fusion transformer efficiently aggregates the outputs of an arbitrary number of +network streams and generates a fused representation for each image patch. We +assess two fusion variants on top of the proposed approach: (i) score-level +fusion that fuses the outputs of multiple image forensics algorithms and (ii) +feature-level fusion that fuses low-level forensic traces directly. Both +variants exceed state-of-the-art performance on seven datasets for image +forgery detection and localization, with a relative average improvement of +12.1% and 20.4% in terms of F1. Our model is robust against traditional and +novel forgery attacks and can be expanded with new signals without training +from scratch. Our code is publicly available at: +https://github.com/mever-team/omgfuser + +
+
+
+
+
+ + ♻ ☆ Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text + Consistency and Domain Distribution Gap CVPR2024 + + +
+ The recent advancements in Text-to-Video Artificial Intelligence Generated
+Content (AIGC) have been remarkable. Compared with traditional videos, the
+assessment of AIGC videos encounters various challenges: visual inconsistencies
+that defy common sense, discrepancies between content and the textual prompt,
+and the distribution gap between various generative models. Targeting these
+challenges, in this work, we categorize the assessment of AIGC video quality
+into three dimensions: visual harmony, video-text consistency, and domain
+distribution gap. For each dimension, we design specific modules to provide a
+comprehensive quality assessment of AIGC videos. Furthermore, our research
+identifies significant variations in visual quality, fluidity, and style among
+videos generated by different text-to-video models. Predicting the source
+generative model can make the AIGC video features more discriminative, which
+enhances the quality assessment performance. The proposed method was used by
+the third-place winner of the NTIRE 2024 Quality Assessment for AI-Generated
+Content - Track 2 Video, demonstrating its effectiveness. Code will be
+available at https://github.com/Coobiw/TriVQA.
+
+
+ comment: 9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd + place winner of NTIRE2024 Quality Assessment for AI-Generated Content - Track + 2 Video) +
+
+
+
+
+ + ♻ ☆ Exploring Few-Shot Adaptation for Activity Recognition on Diverse + Domains + + +
+ Domain adaptation is essential for activity recognition to ensure accurate +and robust performance across diverse environments, sensor types, and data +sources. Unsupervised domain adaptation methods have been extensively studied, +yet, they require large-scale unlabeled data from the target domain. In this +work, we focus on Few-Shot Domain Adaptation for Activity Recognition +(FSDA-AR), which leverages a very small amount of labeled target videos to +achieve effective adaptation. This approach is appealing for applications +because it only needs a few or even one labeled example per class in the target +domain, ideal for recognizing rare but critical activities. However, the +existing FSDA-AR works mostly focus on the domain adaptation on sports videos, +where the domain diversity is limited. We propose a new FSDA-AR benchmark using +five established datasets considering the adaptation on more diverse and +challenging domains. Our results demonstrate that FSDA-AR performs comparably +to unsupervised domain adaptation with significantly fewer labeled target +domain samples. We further propose a novel approach, RelaMiX, to better +leverage the few labeled target domain samples as knowledge guidance. RelaMiX +encompasses a temporal relational attention network with relation dropout, +alongside a cross-domain information alignment mechanism. Furthermore, it +integrates a mechanism for mixing features within a latent space by using the +few-shot target domain samples. The proposed RelaMiX solution achieves +state-of-the-art performance on all datasets within the FSDA-AR benchmark. To +encourage future research of few-shot domain adaptation for activity +recognition, our code will be publicly available at +https://github.com/KPeng9510/RelaMiX. + +
+
+ comment: The benchmark and source code will be publicly available at + https://github.com/KPeng9510/RelaMiX +
+
+
+
+
+ + ♻ ☆ CharacterFactory: Sampling Consistent Characters with GANs for Diffusion + Models + + +
+ Recent advances in text-to-image models have opened new frontiers in +human-centric generation. However, these models cannot be directly employed to +generate images with consistent newly coined identities. In this work, we +propose CharacterFactory, a framework that allows sampling new characters with +consistent identities in the latent space of GANs for diffusion models. More +specifically, we consider the word embeddings of celeb names as ground truths +for the identity-consistent generation task and train a GAN model to learn the +mapping from a latent space to the celeb embedding space. In addition, we +design a context-consistent loss to ensure that the generated identity +embeddings can produce identity-consistent images in various contexts. +Remarkably, the whole model only takes 10 minutes for training, and can sample +infinite characters end-to-end during inference. Extensive experiments +demonstrate excellent performance of the proposed CharacterFactory on character +creation in terms of identity consistency and editability. Furthermore, the +generated characters can be seamlessly combined with the off-the-shelf +image/video/3D diffusion models. We believe that the proposed CharacterFactory +is an important step for identity-consistent character generation. Project page +is available at: https://qinghew.github.io/CharacterFactory/. + +
+
+ comment: Code will be released very soon: + https://github.com/qinghew/CharacterFactory +
+
+
+
+
+ + ♻ ☆ FisheyeDetNet: 360° Surround view Fisheye Camera based Object + Detection System for Autonomous Driving + + +
+ Object detection is a mature problem in autonomous driving with pedestrian
+detection being one of the first deployed algorithms. It has been
+comprehensively studied in the literature. However, object detection is
+relatively less explored for fisheye cameras used for surround-view near field
+sensing. The standard bounding box representation fails in fisheye cameras due
+to heavy radial distortion, particularly in the periphery. To mitigate this, we
+explore extending the standard bounding-box output representation for object
+detection. We design rotated-bounding-box, ellipse, and generic-polygon (polar
+arc/angle) representations and define an instance segmentation mIoU metric to
+analyze them. The proposed model, FisheyeDetNet with the polygon
+representation, outperforms the others and achieves an mAP score of 49.5\% on
+the Valeo fisheye surround-view dataset for automated driving applications.
+This dataset has 60K images captured from 4 surround-view cameras across
+Europe, North America and Asia. To the best of our knowledge, this is the first
+detailed study on object detection on fisheye cameras for autonomous driving
+scenarios.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2206.05542 by other authors +
+
+
+
+
+ + ♻ ☆ DressCode: Autoregressively Sewing and Generating Garments from Text + Guidance + + +
+ Apparel's significant role in human appearance underscores the importance of +garment digitalization for digital human creation. Recent advances in 3D +content creation are pivotal for digital human creation. Nonetheless, garment +generation from text guidance is still nascent. We introduce a text-driven 3D +garment generation framework, DressCode, which aims to democratize design for +novices and offer immense potential in fashion design, virtual try-on, and +digital human creation. For our framework, we first introduce SewingGPT, a +GPT-based architecture integrating cross-attention with text-conditioned +embedding to generate sewing patterns with text guidance. We also tailored a +pre-trained Stable Diffusion for high-quality, tile-based PBR texture +generation. By leveraging a large language model, our framework generates +CG-friendly garments through natural language interaction. Our method also +facilitates pattern completion and texture editing, streamlining the design +process through user-friendly interaction. This framework fosters innovation by +allowing creators to freely experiment with designs and incorporate unique +elements into their work, thereby igniting new ideas and artistic +possibilities. With comprehensive evaluations and comparisons with other +state-of-the-art methods, our method showcases the best quality and alignment +with input prompts. User studies further validate our high-quality rendering +results, highlighting its practical utility and potential in production +settings. Our project page is https://IHe-KaiI.github.io/DressCode/. + +
+
+ comment: Project page: https://IHe-KaiI.github.io/DressCode/ +
+
+
+
+
+ + ♻ ☆ A Universal Knowledge Embedded Contrastive Learning Framework for + Hyperspectral Image Classification + + +
+ Hyperspectral image (HSI) classification techniques have been intensively
+studied and a variety of models have been developed. However, these HSI
+classification models are confined to pocket models and unrealistic ways of
+dataset partitioning. The former limits the generalization performance of the
+model, and the latter leads to inflated model evaluation metrics, which results
+in plummeting model performance in the real world. Therefore, we propose a
+universal knowledge-embedded contrastive learning framework (KnowCL) for
+supervised, unsupervised, and semisupervised HSI classification, which largely
+closes the gap between pocket models and standard vision backbones for HSI
+classification. We present a new HSI processing pipeline in conjunction with a
+range of data transformation and augmentation techniques that provide diverse
+data representations and realistic data partitioning. The proposed framework
+based on this pipeline is compatible with all kinds of backbones and can fully
+exploit labeled and unlabeled samples with the expected training time.
+Furthermore, we design a new loss function, which can adaptively fuse the
+supervised loss and unsupervised loss, enhancing the learning performance. This
+proposed classification paradigm shows great potential for HSI classification
+technology. The code can be accessed at
+\url{https://github.com/quanweiliu/KnowCL}.
+
+
+
+
+
+ + ♻ ☆ Towards Activated Muscle Group Estimation in the Wild + + +
+ In this paper, we tackle the new task of video-based Activated Muscle Group +Estimation (AMGE) aiming at identifying active muscle regions during physical +activity in the wild. To this intent, we provide the MuscleMap dataset +featuring >15K video clips with 135 different activities and 20 labeled muscle +groups. This dataset opens the vistas to multiple video-based applications in +sports and rehabilitation medicine under flexible environment constraints. The +proposed MuscleMap dataset is constructed with YouTube videos, specifically +targeting High-Intensity Interval Training (HIIT) physical exercise in the +wild. To make the AMGE model applicable in real-life situations, it is crucial +to ensure that the model can generalize well to numerous types of physical +activities not present during training and involving new combinations of +activated muscles. To achieve this, our benchmark also covers an evaluation +setting where the model is exposed to activity types excluded from the training +set. Our experiments reveal that the generalizability of existing architectures +adapted for the AMGE task remains a challenge. Therefore, we also propose a new +approach, TransM3E, which employs a multi-modality feature fusion mechanism +between both the video transformer model and the skeleton-based graph +convolution model with novel cross-modal knowledge distillation executed on +multi-classification tokens. The proposed method surpasses all popular video +classification models when dealing with both, previously seen and new types of +physical activities. The contributed dataset and code will be publicly +available at https://github.com/KPeng9510/MuscleMap. + +
+
+ comment: The contributed dataset and code will be publicly available at + https://github.com/KPeng9510/MuscleMap +
+
+
+
+
+ + ♻ ☆ The Third Monocular Depth Estimation Challenge CVPR + + +
+ This paper discusses the results of the third edition of the Monocular Depth
+Estimation Challenge (MDEC). The challenge focuses on zero-shot generalization
+to the challenging SYNS-Patches dataset, featuring complex scenes in natural
+and indoor settings. As with the previous edition, methods can use any form of
+supervision, i.e. supervised or self-supervised. The challenge received a total
+of 19 submissions outperforming the baseline on the test set: 10 among them
+submitted a report describing their approach, highlighting the widespread use
+of foundation models such as Depth Anything at the core of their methods. The
+challenge winners drastically improved 3D F-Score performance, from 17.51% to
+23.72%.
+
+
+ comment: To appear in CVPRW2024 +
+
+
+
+
+ + ♻ ☆ Paired Competing Neurons Improving STDP Supervised Local Learning In + Spiking Neural Networks + + +
+ Direct training of Spiking Neural Networks (SNNs) on neuromorphic hardware +has the potential to significantly reduce the energy consumption of artificial +neural network training. SNNs trained with Spike Timing-Dependent Plasticity +(STDP) benefit from gradient-free and unsupervised local learning, which can be +easily implemented on ultra-low-power neuromorphic hardware. However, +classification tasks cannot be performed solely with unsupervised STDP. In this +paper, we propose Stabilized Supervised STDP (S2-STDP), a supervised STDP +learning rule to train the classification layer of an SNN equipped with +unsupervised STDP for feature extraction. S2-STDP integrates error-modulated +weight updates that align neuron spikes with desired timestamps derived from +the average firing time within the layer. Then, we introduce a training +architecture called Paired Competing Neurons (PCN) to further enhance the +learning capabilities of our classification layer trained with S2-STDP. PCN +associates each class with paired neurons and encourages neuron specialization +toward target or non-target samples through intra-class competition. We +evaluate our methods on image recognition datasets, including MNIST, +Fashion-MNIST, and CIFAR-10. Results show that our methods outperform +state-of-the-art supervised STDP learning rules, for comparable architectures +and numbers of neurons. Further analysis demonstrates that the use of PCN +enhances the performance of S2-STDP, regardless of the hyperparameter set and +without introducing any additional hyperparameters. + +
+
+
+
+
+ + ♻ ☆ UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater + Video Enhancement CVPR2024 + + +
+ Learning-based underwater image enhancement (UIE) methods have made great
+progress. However, the lack of large-scale and high-quality paired training
+samples has become the main bottleneck hindering the development of UIE. The
+inter-frame information in underwater videos can accelerate or optimize the UIE
+process. Thus, we constructed the first large-scale high-resolution underwater
+video enhancement benchmark (UVEB) to promote the development of underwater
+vision. It contains 1,308 pairs of video sequences and more than 453,000
+high-resolution frame pairs, 38\% of which are Ultra-High-Definition (UHD) 4K.
+UVEB comes from multiple countries, containing various scenes and video
+degradation types to adapt to diverse and complex underwater environments. We
+also propose the first supervised underwater video enhancement method, UVE-Net.
+UVE-Net converts the current frame information into convolutional kernels and
+passes them to adjacent frames for efficient inter-frame information exchange.
+By fully utilizing the redundant degraded information of underwater videos,
+UVE-Net completes video enhancement better. Experiments show the effective
+network design and good performance of UVE-Net.
+
+
+ comment: 10 pages, accepted by CVPR 2024
+
+
+
+
+ + ♻ ☆ Denoising: from classical methods to deep CNNs + + +
+ This paper aims to explore the evolution of image denoising in a pedagogical
+way. We briefly review classical methods such as Fourier analysis and wavelet
+bases, highlighting the challenges they faced until the emergence of neural
+networks, notably the U-Net, in the 2010s. The remarkable performance of these
+networks has been demonstrated in studies such as Kadkhodaie et al. (2024).
+They exhibit adaptability to various image types, including those with fixed
+regularity, facial images, and bedroom scenes, achieving optimal results with a
+bias towards geometry-adaptive harmonic bases. The introduction of score
+diffusion has played a crucial role in image generation. In this context,
+denoising becomes essential as it facilitates the estimation of probability
+density scores. We discuss the prerequisites for genuine learning of
+probability densities, offering insights that extend from mathematical research
+to the implications of universal structures.
+
+
+ comment: This document uses works by authors not yet presented to the + community and may appear to be original +
+
+
+
+
+ + ♻ ☆ VidCoM: Fast Video Comprehension through Large Language Models with + Multimodal Tools + + +
+ Building models that comprehend videos and respond to specific user
+instructions is a practical and challenging topic, as it requires mastery of
+both vision understanding and knowledge reasoning. Compared to language and
+image modalities, training efficiency remains a serious problem as existing
+studies train models on massive sparse videos paired with brief descriptions.
+In this paper, we introduce \textbf{VidCoM}, a fast adaptive framework that
+leverages Large Language Models (LLMs) to reason about videos using lightweight
+visual tools. Specifically, we reveal that the key to responding to specific
+instructions is focusing on relevant video events, and utilize two visual
+tools, structured scene graph generation and descriptive image caption
+generation, to gather and represent the event information. Thus, an LLM
+enriched with world knowledge is adopted as the reasoning agent to achieve the
+responses by performing multiple reasoning steps on specific video events. To
+address the difficulty of LLMs identifying video events, we further propose an
+Instruction-oriented Video Events Recognition (InsOVER) algorithm. This
+algorithm locates the corresponding video events based on an efficient
+Hungarian matching between decompositions of linguistic instructions and video
+events, thereby enabling LLMs to interact effectively with extended videos.
+Extensive experiments on two typical video comprehension tasks show that the
+proposed tuning-free framework outperforms pre-trained models, including
+Flamingo-80B, and achieves state-of-the-art performance. Our source code and
+system will be publicly available.
+
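+ The Hungarian-matching step mentioned above, pairing decomposed instruction
+components with detected video events, can be sketched with SciPy given a
+similarity matrix between the two sets; how InsOVER actually builds that matrix
+is not shown here.
+
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def match_instruction_to_events(sim):
+     # sim[i, j]: similarity between instruction component i and video event j.
+     # Maximize total similarity by minimizing its negation.
+     rows, cols = linear_sum_assignment(-np.asarray(sim))
+     return list(zip(rows.tolist(), cols.tolist()))
+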
+
+
+
+
+ + ♻ ☆ CLAP: Isolating Content from Style through Contrastive Learning with + Augmented Prompts + + +
+ Contrastive vision-language models, such as CLIP, have garnered considerable
+attention for various downstream tasks, mainly due to the remarkable ability of
+the learned features for generalization. However, the features they learned
+often blend content and style information, which somewhat limits their
+generalization capabilities under distribution shifts. To address this
+limitation, we adopt a causal generative perspective for multimodal data and
+propose contrastive learning with data augmentation to disentangle content
+features from the original representations. To achieve this, we begin by
+exploring image augmentation techniques and develop a method to seamlessly
+integrate them into pre-trained CLIP-like models to extract pure content
+features. Taking a step further, recognizing the inherent semantic richness and
+logical structure of text data, we explore the use of text augmentation to
+isolate latent content from style features. This enables the encoders of
+CLIP-like models to concentrate on latent content information, refining the
+representations learned by pre-trained CLIP-like models. Our extensive
+experiments across diverse datasets demonstrate significant improvements in
+zero-shot and few-shot classification tasks, alongside enhanced robustness to
+various perturbations. These results underscore the effectiveness of our
+proposed methods in refining vision-language representations and advancing the
+state-of-the-art in multimodal learning.
+
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The Deep learning (DL) models for diagnosing breast cancer from mammographic +images often operate as "black boxes", making it difficult for healthcare +professionals to trust and understand their decision-making processes. The +study presents an integrated framework combining Convolutional Neural Networks +(CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced diagnosis +of breast cancer using the CBIS-DDSM dataset. The methodology encompasses an +elaborate data preprocessing pipeline and advanced data augmentation techniques +to counteract dataset limitations and transfer learning using pre-trained +networks such as VGG-16, Inception-V3 and ResNet was employed. A focal point of +our study is the evaluation of XAI's effectiveness in interpreting model +predictions, highlighted by utilizing the Hausdorff measure to assess the +alignment between AI-generated explanations and expert annotations +quantitatively. This approach is critical for XAI in promoting trustworthiness +and ethical fairness in AI-assisted diagnostics. The findings from our research +illustrate the effective collaboration between CNNs and XAI in advancing +diagnostic methods for breast cancer, thereby facilitating a more seamless +integration of advanced AI technologies within clinical settings. By enhancing +the interpretability of AI driven decisions, this work lays the groundwork for +improved collaboration between AI systems and medical practitioners, ultimately +enriching patient care. Furthermore, the implications of our research extended +well beyond the current methodologies. It encourages further research into how +to combine multimodal data and improve AI explanations to meet the needs of +clinical practice. + +
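+ A small sketch of the Hausdorff-based agreement check described above,
+assuming the XAI output has already been thresholded into a binary region and
+the expert annotation is a binary mask of the same size.
+
+ import numpy as np
+ from scipy.spatial.distance import directed_hausdorff
+
+ def explanation_hausdorff(xai_mask, expert_mask):
+     # Symmetric Hausdorff distance (in pixels) between the highlighted region
+     # and the expert annotation; smaller means better alignment.
+     a = np.argwhere(np.asarray(xai_mask) > 0)
+     b = np.argwhere(np.asarray(expert_mask) > 0)
+     return max(directed_hausdorff(a, b)[0], directed_hausdorff(b, a)[0])
+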
+
+
+
+
+ + ♻ ☆ Gaussian Shading: Provable Performance-Lossless Image Watermarking for + Diffusion Models CVPR 2024 + + +
+ Ethical concerns surrounding copyright protection and inappropriate content +generation pose challenges for the practical implementation of diffusion +models. One effective solution involves watermarking the generated images. +However, existing methods often compromise the model performance or require +additional training, which is undesirable for operators and users. To address +this issue, we propose Gaussian Shading, a diffusion model watermarking +technique that is both performance-lossless and training-free, while serving +the dual purpose of copyright protection and tracing of offending content. Our +watermark embedding is free of model parameter modifications and thus is +plug-and-play. We map the watermark to latent representations following a +standard Gaussian distribution, which is indistinguishable from latent +representations obtained from the non-watermarked diffusion model. Therefore we +can achieve watermark embedding with lossless performance, for which we also +provide theoretical proof. Furthermore, since the watermark is intricately +linked with image semantics, it exhibits resilience to lossy processing and +erasure attempts. The watermark can be extracted by Denoising Diffusion +Implicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian +Shading on multiple versions of Stable Diffusion, and the results demonstrate +that Gaussian Shading not only is performance-lossless but also outperforms +existing methods in terms of robustness. + +
+
+ comment: 17 pages, 11 figures, accepted by CVPR 2024 +
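+ A simplified, distribution-preserving sketch of the bit-to-latent mapping
+described above: each watermark bit selects one half of a standard Gaussian, so
+the marginal over random bits is still N(0, 1). The full Gaussian Shading
+scheme, including extraction via DDIM inversion and inverse sampling, is not
+reproduced here.
+
+ import numpy as np
+ from scipy.stats import norm
+
+ def embed_bits_as_gaussian(bits, rng=None):
+     rng = rng or np.random.default_rng(0)
+     u = rng.uniform(1e-9, 0.5, size=len(bits))          # quantile within a half
+     q = np.where(np.asarray(bits) == 1, 0.5 + u, u)     # bit picks the half
+     return norm.ppf(q)                                  # latent ~ N(0, 1)
+
+ def extract_bits(latents):
+     # After (approximate) inversion of the sampling process, read off the sign.
+     return (np.asarray(latents) > 0).astype(int)
+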
+
+
+
+
+ + ♻ ☆ Implicit Multi-Spectral Transformer: An Lightweight and Effective + Visible to Infrared Image Translation Model IJCNN 2024 + + +
+ In the field of computer vision, visible light images often exhibit low +contrast in low-light conditions, presenting a significant challenge. While +infrared imagery provides a potential solution, its utilization entails high +costs and practical limitations. Recent advancements in deep learning, +particularly the deployment of Generative Adversarial Networks (GANs), have +facilitated the transformation of visible light images to infrared images. +However, these methods often experience unstable training phases and may +produce suboptimal outputs. To address these issues, we propose a novel +end-to-end Transformer-based model that efficiently converts visible light +images into high-fidelity infrared images. Initially, the Texture Mapping +Module and Color Perception Adapter collaborate to extract texture and color +features from the visible light image. The Dynamic Fusion Aggregation Module +subsequently integrates these features. Finally, the transformation into an +infrared image is refined through the synergistic action of the Color +Perception Adapter and the Enhanced Perception Attention mechanism. +Comprehensive benchmarking experiments confirm that our model outperforms +existing methods, producing infrared images of markedly superior quality, both +qualitatively and quantitatively. Furthermore, the proposed model enables more +effective downstream applications for infrared images than other methods. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ UniScene: Multi-Camera Unified Pre-training via 3D Scene Reconstruction + for Autonomous Driving + + +
+ Multi-camera 3D perception has emerged as a prominent research field in +autonomous driving, offering a viable and cost-effective alternative to +LiDAR-based solutions. The existing multi-camera algorithms primarily rely on +monocular 2D pre-training. However, the monocular 2D pre-training overlooks the +spatial and temporal correlations among the multi-camera system. To address +this limitation, we propose the first multi-camera unified pre-training +framework, called UniScene, which involves initially reconstructing the 3D +scene as the foundational stage and subsequently fine-tuning the model on +downstream tasks. Specifically, we employ Occupancy as the general +representation for the 3D scene, enabling the model to grasp geometric priors +of the surrounding world through pre-training. A significant benefit of +UniScene is its capability to utilize a considerable volume of unlabeled +image-LiDAR pairs for pre-training purposes. The proposed multi-camera unified +pre-training framework demonstrates promising results in key tasks such as +multi-camera 3D object detection and surrounding semantic scene completion. +When compared to monocular pre-training methods on the nuScenes dataset, +UniScene shows a significant improvement of about 2.0% in mAP and 2.0% in NDS +for multi-camera 3D object detection, as well as a 3% increase in mIoU for +surrounding semantic scene completion. By adopting our unified pre-training +method, a 25% reduction in 3D training annotation costs can be achieved, +offering significant practical value for the implementation of real-world +autonomous driving. Codes are publicly available at +https://github.com/chaytonmin/UniScene. + +
+
+ comment: Accepted by RAL2024 +
+
+
+
+
+ + ♻ ☆ Deep Instruction Tuning for Segment Anything Model + + +
+ Recently, Segment Anything Model (SAM) has become a research hotspot in the +fields of multimedia and computer vision, which exhibits powerful yet versatile +capabilities on various (un) conditional image segmentation tasks. Although SAM +can support different types of segmentation prompts, we note that, compared to +point- and box-guided segmentations, it performs much worse on text-instructed +tasks, e.g., referring image segmentation (RIS). In this paper, we argue that +deep text instruction tuning is key to mitigate such shortcoming caused by the +shallow fusion scheme in its default light-weight mask decoder. To address this +issue, we propose two simple yet effective deep instruction tuning (DIT) +methods for SAM, one is end-to-end and the other is layer-wise. With minimal +modifications, DITs can directly transform the image encoder of SAM as a +stand-alone vision-language learner in contrast to building another deep fusion +branch, maximizing the benefit of its superior segmentation capability. +Extensive experiments on three highly competitive benchmark datasets of RIS +show that a simple end-to-end DIT can improve SAM by a large margin, while the +layer-wise DIT can further boost the performance to state-of-the-art with much +less data and training expenditures. Our code is released at: +https://github.com/wysnzzzz/DIT. + +
+
+
+
+
+ + ♻ ☆ GMValuator: Similarity-based Data Valuation for Generative Models + + +
+ Data valuation plays a crucial role in machine learning. Existing data
+valuation methods have primarily focused on discriminative models, neglecting
+generative models that have recently gained considerable attention. The few
+existing data valuation methods designed for deep generative models either
+concentrate on specific models or lack robustness in their outcomes, and their
+efficiency also falls short. To bridge these gaps, we formulate the data
+valuation problem in generative models from a similarity-matching perspective.
+Specifically, we introduce Generative Model Valuator (GMValuator), the first
+training-free and model-agnostic approach to provide data valuation for
+generation tasks. It enables efficient data valuation through an innovative
+similarity-matching module, calibrates biased contributions by incorporating
+image quality assessment, and attributes credit to all training samples based
+on their contributions to the generated samples. Additionally, we introduce
+four evaluation criteria for assessing data valuation methods in generative
+models, aligning with principles of plausibility and truthfulness. GMValuator
+is extensively evaluated on various datasets and generative architectures to
+demonstrate its effectiveness.
+
+
+
+
+
+ + ♻ ☆ DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose + Estimation + + +
+ Multi-person pose estimation (MPPE) presents a formidable yet crucial +challenge in computer vision. Most existing methods predominantly concentrate +on isolated interaction either between instances or joints, which is inadequate +for scenarios demanding concurrent localization of both instances and joints. +This paper introduces a novel CNN-based single-stage method, named Dual-path +Hierarchical Relation Network (DHRNet), to extract instance-to-joint and +joint-to-instance interactions concurrently. Specifically, we design a +dual-path interaction modeling module (DIM) that strategically organizes +cross-instance and cross-joint interaction modeling modules in two +complementary orders, enriching interaction information by integrating merits +from different correlation modeling branches. Notably, DHRNet excels in joint +localization by leveraging information from other instances and joints. +Extensive evaluations on challenging datasets, including COCO, CrowdPose, and +OCHuman datasets, showcase DHRNet's state-of-the-art performance. The code will +be released at https://github.com/YHDang/dhrnet-multi-pose-estimation. + +
+
+
+
+
+ + ♻ ☆ Applying Unsupervised Semantic Segmentation to High-Resolution UAV + Imagery for Enhanced Road Scene Parsing + + +
+ There are two challenges presented in parsing road scenes from UAV images: +the complexity of processing high-resolution images and the dependency on +extensive manual annotations required by traditional supervised deep learning +methods to train robust and accurate models. In this paper, a novel +unsupervised road parsing framework that leverages advancements in vision +language models with fundamental computer vision techniques is introduced to +address these critical challenges. Our approach initiates with a vision +language model that efficiently processes ultra-high resolution images to +rapidly identify road regions of interest. Subsequent application of the vision +foundation model, SAM, generates masks for these regions without requiring +category information. A self-supervised learning network then processes these +masked regions to extract feature representations, which are clustered using an +unsupervised algorithm that assigns unique IDs to each feature cluster. The +masked regions are combined with the corresponding IDs to generate initial +pseudo-labels, which initiate an iterative self-training process for regular +semantic segmentation. Remarkably, the proposed method achieves a mean +Intersection over Union (mIoU) of 89.96% on the development dataset without any +manual annotation, demonstrating extraordinary flexibility by surpassing the +limitations of human-defined categories, and autonomously acquiring knowledge +of new categories from the dataset itself. + +
+
+
+
+
+ + ♻ ☆ ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual + Prompts CVPR2024 + + +
+ While existing large vision-language multimodal models focus on whole image +understanding, there is a prominent gap in achieving region-specific +comprehension. Current approaches that use textual coordinates or spatial +encodings often fail to provide a user-friendly interface for visual prompting. +To address this challenge, we introduce a novel multimodal model capable of +decoding arbitrary visual prompts. This allows users to intuitively mark images +and interact with the model using natural cues like a "red bounding box" or +"pointed arrow". Our simple design directly overlays visual markers onto the +RGB image, eliminating the need for complex region encodings, yet achieves +state-of-the-art performance on region-understanding tasks like Visual7W, +PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present +ViP-Bench, a comprehensive benchmark to assess the capability of models in +understanding visual prompts across multiple dimensions, enabling future +research in this domain. Code, data, and model are publicly available. + +
+
+ comment: Accepted to CVPR2024. Project page: https://vip-llava.github.io/ +
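+ The overlay-style visual prompting described above amounts to drawing the
+marker directly onto the pixels before the image is fed to the model; below is
+a minimal Pillow sketch, with placeholder file names, box coordinates and
+colors.
+
+ from PIL import Image, ImageDraw
+
+ def overlay_visual_prompt(image_path, box, out_path="prompted.png"):
+     # Draw a red bounding box onto the RGB image as the visual prompt.
+     img = Image.open(image_path).convert("RGB")
+     draw = ImageDraw.Draw(img)
+     draw.rectangle(box, outline=(255, 0, 0), width=4)   # box = (x0, y0, x1, y1)
+     img.save(out_path)
+     return img
+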
+
+
+
+
+ + ♻ ☆ ForensicsForest Family: A Series of Multi-scale Hierarchical Cascade + Forests for Detecting GAN-generated Faces + + +
+ The prominent progress in generative models has significantly improved the
+reality of generated faces, bringing serious concerns to society. Since recent
+GAN-generated faces are highly realistic, the forgery traces have become more
+imperceptible, increasing the forensics challenge. To combat GAN-generated
+faces, many countermeasures based on Convolutional Neural Networks (CNNs) have
+been spawned due to their strong learning ability. In this paper, we rethink
+this problem and explore a new approach based on forest models instead of CNNs.
+Specifically, we describe a simple and effective forest-based method set called
+{\em ForensicsForest Family} to detect GAN-generated faces. The proposed
+ForensicsForest family is composed of three variants, which are {\em
+ForensicsForest}, {\em Hybrid ForensicsForest} and {\em Divide-and-Conquer
+ForensicsForest} respectively. ForensicsForest is a newly proposed Multi-scale
+Hierarchical Cascade Forest, which takes semantic, frequency and biology
+features as input, hierarchically cascades different levels of features for
+authenticity prediction, and then employs a multi-scale ensemble scheme that
+can comprehensively consider different levels of information to improve the
+performance further. Based on ForensicsForest, we develop Hybrid
+ForensicsForest, an extended version that integrates the CNN layers into
+models, to further refine the effectiveness of augmented features. Moreover, to
+reduce the memory cost in training, we propose Divide-and-Conquer
+ForensicsForest, which can construct a forest model using only a portion of the
+training samples. In the training stage, we train several candidate forest
+models using the subsets of training samples. Then a ForensicsForest is
+assembled by picking the suitable components from these candidate forest
+models...
+
+
+ comment: To Appear in IEEE TIFS 2024 +
+
+
+
+
+ + ♻ ☆ Confidence Intervals for Error Rates in 1:1 Matching Tasks: Critical + Statistical Analysis and Recommendations + + +
+ Matching algorithms are commonly used to predict matches between items in a +collection. For example, in 1:1 face verification, a matching algorithm +predicts whether two face images depict the same person. Accurately assessing +the uncertainty of the error rates of such algorithms can be challenging when +data are dependent and error rates are low, two aspects that have often been +overlooked in the literature. In this work, we review methods for constructing +confidence intervals for error rates in 1:1 matching tasks. We derive and +examine the statistical properties of these methods, demonstrating, through +both analysis and experiments with synthetic and real-world datasets, how +coverage and interval width vary with sample size, error rates, and degree of +data dependence. Based on our findings, we provide best-practice +recommendations for constructing confidence intervals for error rates in 1:1 +matching tasks. + +&#13;
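For reference, a standard i.i.d. baseline interval that such reviews typically start from is the Clopper-Pearson exact interval; the paper's point is precisely that data dependence (e.g., many pairs sharing identities) and very low error rates can invalidate the assumptions behind it, so treat this as the naive reference rather than the recommended method:

```python
# Clopper-Pearson exact confidence interval for an error rate, assuming
# k errors out of n independent trials (an assumption 1:1 matching often violates).
from scipy.stats import beta

def clopper_pearson(k, n, alpha=0.05):
    """Returns (lower, upper) bounds for the true error rate."""
    lower = 0.0 if k == 0 else beta.ppf(alpha / 2, k, n - k + 1)
    upper = 1.0 if k == n else beta.ppf(1 - alpha / 2, k + 1, n - k)
    return lower, upper

print(clopper_pearson(k=3, n=10_000))  # e.g., an observed error rate of 0.03%
```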
+
+
+
+
+ + ♻ ☆ Neural Étendue Expander for Ultra-Wide-Angle High-Fidelity + Holographic Display + + +
+ Holographic displays can generate light fields by dynamically modulating the +wavefront of a coherent beam of light using a spatial light modulator, +promising rich virtual and augmented reality applications. However, the limited +spatial resolution of existing dynamic spatial light modulators imposes a tight +bound on the diffraction angle. As a result, modern holographic displays +possess low \'{e}tendue, which is the product of the display area and the +maximum solid angle of diffracted light. The low \'{e}tendue forces a sacrifice +of either the field-of-view (FOV) or the display size. In this work, we lift +this limitation by presenting neural \'{e}tendue expanders. This new breed of +optical elements, which is learned from a natural image dataset, enables higher +diffraction angles for ultra-wide FOV while maintaining both a compact form +factor and the fidelity of displayed contents to human viewers. With neural +\'{e}tendue expanders, we experimentally achieve 64$\times$ \'{e}tendue +expansion of natural images in full color, expanding the FOV by an order of +magnitude horizontally and vertically, with high-fidelity reconstruction +quality (measured in PSNR) over 29 dB on retinal-resolution images. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Tunnel Try-on: Excavating Spatial-temporal Tunnels for High-quality + Virtual Try-on in Videos + + +
+ Video try-on is a challenging task and has not been well tackled in previous +works. The main obstacle lies in preserving the details of the clothing and +modeling the coherent motions simultaneously. Faced with those difficulties, we +address video try-on by proposing a diffusion-based framework named "Tunnel +Try-on." The core idea is excavating a "focus tunnel" in the input video that +gives close-up shots around the clothing regions. We zoom in on the region in +the tunnel to better preserve the fine details of the clothing. To generate +coherent motions, we first leverage the Kalman filter to construct smooth crops +in the focus tunnel and inject the position embedding of the tunnel into +attention layers to improve the continuity of the generated videos. In +addition, we develop an environment encoder to extract the context information +outside the tunnels as supplementary cues. Equipped with these techniques, +Tunnel Try-on keeps the fine details of the clothing and synthesizes stable and +smooth videos. Demonstrating significant advancements, Tunnel Try-on could be +regarded as the first attempt toward the commercial-level application of +virtual try-on in videos. + +
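A hedged sketch of the "Kalman filter for smooth crops" idea: per-frame crop centers detected around the clothing region are noisy, so a constant-velocity Kalman filter smooths the track before cropping. The state model and noise levels below are illustrative assumptions, not the paper's settings.

```python
# Smooth noisy 1D crop-center coordinates with a constant-velocity Kalman filter.
import numpy as np

def smooth_centers(centers, q=1.0, r=25.0):
    """centers: (T,) noisy crop-center coordinates; returns the smoothed track."""
    F = np.array([[1.0, 1.0], [0.0, 1.0]])      # constant-velocity transition
    H = np.array([[1.0, 0.0]])                  # we observe position only
    Q, R = q * np.eye(2), np.array([[r]])
    x, P = np.array([centers[0], 0.0]), np.eye(2) * 10.0
    out = []
    for z in centers:
        x, P = F @ x, F @ P @ F.T + Q                       # predict
        S = H @ P @ H.T + R
        K = P @ H.T @ np.linalg.inv(S)                      # Kalman gain
        x = x + K @ (np.array([z]) - H @ x)                 # update state
        P = (np.eye(2) - K @ H) @ P
        out.append(x[0])
    return np.array(out)

noisy = 100 + np.cumsum(np.random.randn(50)) + np.random.randn(50) * 5
print(smooth_centers(noisy)[:5])
```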
+
+ comment: Project Page: https://mengtingchen.github.io/tunnel-try-on-page/ +
+
+
+
+
+ + ☆ MaPa: Text-driven Photorealistic Material Painting for 3D Shapes SIGGRAPH 2024 + + +
+ This paper aims to generate materials for 3D meshes from text descriptions. +Unlike existing methods that synthesize texture maps, we propose to generate +segment-wise procedural material graphs as the appearance representation, which +supports high-quality rendering and provides substantial flexibility in +editing. Instead of relying on extensive paired data, i.e., 3D meshes with +material graphs and corresponding text descriptions, to train a material graph +generative model, we propose to leverage the pre-trained 2D diffusion model as +a bridge to connect the text and material graphs. Specifically, our approach +decomposes a shape into a set of segments and designs a segment-controlled +diffusion model to synthesize 2D images that are aligned with mesh parts. Based +on generated images, we initialize parameters of material graphs and fine-tune +them through the differentiable rendering module to produce materials in +accordance with the textual description. Extensive experiments demonstrate the +superior performance of our framework in photorealism, resolution, and +editability over existing methods. Project page: +https://zhanghe3z.github.io/MaPa/ + +
+
+ comment: SIGGRAPH 2024. Project page: https://zhanghe3z.github.io/MaPa/ +
+
+
+
+
+ + ☆ ChangeBind: A Hybrid Change Encoder for Remote Sensing Change Detection + + +
+ Change detection (CD) is a fundamental task in remote sensing (RS) which aims +to detect the semantic changes between the same geographical regions at +different time stamps. Existing convolutional neural network (CNN)-based +approaches often struggle to capture long-range dependencies, whereas recent +transformer-based methods tend to be dominated by the global representation and +may fail to capture subtle change regions due to the complexity of the objects +in the scene. To address these limitations, we propose an effective +Siamese-based framework to encode the semantic changes +occurring in the bi-temporal RS images. The main focus of our design is to +introduce a change encoder that leverages local and global feature +representations to capture both subtle and large change feature information +from multi-scale features to precisely estimate the change regions. Our +experimental study on two challenging CD datasets reveals the merits of our +approach and obtains state-of-the-art performance. + +&#13;
+
+ comment: accepted at IGARSS 2024 +
+
+
+
+
+ + ☆ Exploring the Distinctiveness and Fidelity of the Descriptions Generated + by Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) are gaining traction for their +remarkable ability to process and integrate visual and textual data. Despite +their popularity, the capacity of LVLMs to generate precise, fine-grained +textual descriptions has not been fully explored. This study addresses this gap +by focusing on \textit{distinctiveness} and \textit{fidelity}, assessing how +models like Open-Flamingo, IDEFICS, and MiniGPT-4 can distinguish between +similar objects and accurately describe visual features. We propose the +Textual Retrieval-Augmented Classification (TRAC) framework, which leverages +the models' generative capabilities to analyze fine-grained visual description +generation in greater depth. This research provides +valuable insights into the generation quality of LVLMs, enhancing the +understanding of multimodal language models. Notably, MiniGPT-4 stands out for +its better ability to generate fine-grained descriptions, outperforming the +other two models in this aspect. The code is provided at +\url{https://anonymous.4open.science/r/Explore_FGVDs-E277}. + +&#13;
+
+ comment: 11 pages, 9 figures, 6 tables. For associated code, see + https://anonymous.4open.science/r/Explore_FGVDs-E277 +
+
+
+
+
+ + ☆ Geometry-aware Reconstruction and Fusion-refined Rendering for + Generalizable Neural Radiance Fields CVPR 2024 + + +
+ Generalizable NeRF aims to synthesize novel views for unseen scenes. Common +practices involve constructing variance-based cost volumes for geometry +reconstruction and encoding 3D descriptors for decoding novel views. However, +existing methods show limited generalization ability in challenging conditions +due to inaccurate geometry, sub-optimal descriptors, and decoding strategies. +We address these issues point by point. First, we find the variance-based cost +volume exhibits failure patterns as the features of pixels corresponding to the +same point can be inconsistent across different views due to occlusions or +reflections. We introduce an Adaptive Cost Aggregation (ACA) approach to +amplify the contribution of consistent pixel pairs and suppress inconsistent +ones. Unlike previous methods that solely fuse 2D features into descriptors, +our approach introduces a Spatial-View Aggregator (SVA) to incorporate 3D +context into descriptors through spatial and inter-view interaction. When +decoding the descriptors, we observe the two existing decoding strategies excel +in different areas, which are complementary. A Consistency-Aware Fusion (CAF) +strategy is proposed to leverage the advantages of both. We incorporate the +above ACA, SVA, and CAF into a coarse-to-fine framework, termed Geometry-aware +Reconstruction and Fusion-refined Rendering (GeFu). GeFu attains +state-of-the-art performance across multiple datasets. Code is available at +https://github.com/TQTQliu/GeFu . + +
+
+ comment: Accepted by CVPR 2024. Project page: https://gefucvpr24.github.io +
+
+
+
+
+ + ☆ Ag2Manip: Learning Novel Manipulation Skills with Agent-Agnostic Visual + and Action Representations + + +
+ Autonomous robotic systems capable of learning novel manipulation tasks are +poised to transform industries from manufacturing to service automation. +However, modern methods (e.g., VIP and R3M) still face significant hurdles, +notably the domain gap among robotic embodiments and the sparsity of successful +task executions within specific action spaces, resulting in misaligned and +ambiguous task representations. We introduce Ag2Manip (Agent-Agnostic +representations for Manipulation), a framework aimed at surmounting these +challenges through two key innovations: a novel agent-agnostic visual +representation derived from human manipulation videos, with the specifics of +embodiments obscured to enhance generalizability; and an agent-agnostic action +representation abstracting a robot's kinematics to a universal agent proxy, +emphasizing crucial interactions between end-effector and object. Ag2Manip's +empirical validation across simulated benchmarks like FrankaKitchen, ManiSkill, +and PartManip shows a 325% increase in performance, achieved without +domain-specific demonstrations. Ablation studies underline the essential +contributions of the visual and action representations to this success. +Extending our evaluations to the real world, Ag2Manip significantly improves +imitation learning success rates from 50% to 77.5%, demonstrating its +effectiveness and generalizability across both simulated and physical +environments. + +
+
+ comment: Project website and open-source code: + https://xiaoyao-li.github.io/research/ag2manip +
+
+
+
+
+ + ☆ HYPE: Hyperbolic Entailment Filtering for Underspecified Images and + Texts + + +
+ In an era where the volume of data drives the effectiveness of +self-supervised learning, the specificity and clarity of data semantics play a +crucial role in model training. Addressing this, we introduce HYPerbolic +Entailment filtering (HYPE), a novel methodology designed to meticulously +extract modality-wise meaningful and well-aligned data from extensive, noisy +image-text pair datasets. Our approach leverages hyperbolic embeddings and the +concept of entailment cones to evaluate and filter out samples with meaningless +or underspecified semantics, focusing on enhancing the specificity of each data +sample. HYPE not only demonstrates a significant improvement in filtering +efficiency but also sets a new state-of-the-art in the DataComp benchmark when +combined with existing filtering techniques. This breakthrough showcases the +potential of HYPE to refine the data selection process, thereby contributing to +the development of more accurate and efficient self-supervised learning models. +Additionally, the image specificity $\epsilon_{i}$ can be independently applied +to induce an image-only dataset from an image-text or image-only data pool for +training image-only self-supervised models and showed superior performance when +compared to the dataset induced by CLIP score. + +
+
+ comment: 28pages, 4.5MB +
+
+
+
+
+ + ☆ Inhomogeneous illuminated image enhancement under extremely low + visibility condition + + +
+ Imaging through fog significantly impacts fields such as object detection and +recognition. In conditions of extremely low visibility, essential image +information can be obscured, rendering standard extraction methods ineffective. +Traditional digital processing techniques, such as histogram stretching, aim to +mitigate fog effects by enhancing object light contrast diminished by +atmospheric scattering. However, these methods often experience reduced +effectiveness under inhomogeneous illumination. This paper introduces a novel +approach that adaptively filters background illumination under extremely low +visibility and preserves only the essential signal information. Additionally, we +employ a visual optimization strategy based on image gradients to eliminate +grayscale banding. Finally, the image is transformed to achieve high contrast +and maintain fidelity to the original information through maximum histogram +equalization. Our proposed method significantly enhances signal clarity in +conditions of extremely low visibility and outperforms existing algorithms. + +&#13;
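A generic illustration of this kind of pipeline (not the paper's exact method): estimate the slowly varying background illumination with a large Gaussian blur, subtract it to keep the high-frequency signal, then apply histogram equalization for contrast. Kernel size and normalization choices are assumptions.

```python
# Remove low-frequency background illumination, then equalize the histogram.
import cv2
import numpy as np

def enhance_low_visibility(gray, sigma=51):
    background = cv2.GaussianBlur(gray.astype(np.float32), (0, 0), sigma)
    signal = gray.astype(np.float32) - background          # filter illumination
    signal = cv2.normalize(signal, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    return cv2.equalizeHist(signal)                        # maximize contrast

img = (np.random.rand(256, 256) * 30 + 100).astype(np.uint8)  # flat, low contrast
print(enhance_low_visibility(img).std())
```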
+
+
+
+
+ + ☆ Learning text-to-video retrieval from image captioning CVPR 2023 + + +
+ We describe a protocol to study text-to-video retrieval training with +unlabeled videos, where we assume (i) no access to labels for any videos, i.e., +no access to the set of ground-truth captions, but (ii) access to labeled +images in the form of text. Using image expert models is a realistic scenario +given that annotating images is cheaper and therefore more scalable, in contrast +to expensive video labeling schemes. Recently, zero-shot image experts such as +CLIP have established a new strong baseline for video understanding tasks. In +this paper, we make use of this progress and instantiate the image experts from +two types of models: a text-to-image retrieval model to provide an initial +backbone, and image captioning models to provide a supervision signal for +unlabeled videos. We show that automatically labeling video frames with image +captioning allows text-to-video retrieval training. This process adapts the +features to the target domain at no manual annotation cost, consequently +outperforming the strong zero-shot CLIP baseline. During training, we sample +captions from multiple video frames that best match the visual content, and +perform a temporal pooling over frame representations by scoring frames +according to their relevance to each caption. We conduct extensive ablations to +provide insights and demonstrate the effectiveness of this simple framework by +outperforming the CLIP zero-shot baselines on text-to-video retrieval on three +standard datasets, namely ActivityNet, MSR-VTT, and MSVD. + +&#13;
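A sketch of the relevance-weighted temporal pooling described above: each frame embedding is scored against the caption embedding and frames are pooled with a softmax over those scores. Tensor shapes and the temperature are assumptions, not the paper's values.

```python
# Pool frame features weighted by their similarity to a caption embedding.
import torch
import torch.nn.functional as F

def pooled_video_embedding(frame_feats, caption_feat, temperature=0.1):
    """frame_feats: (T, D), caption_feat: (D,) -- both L2-normalized."""
    scores = frame_feats @ caption_feat                 # (T,) frame-caption relevance
    weights = F.softmax(scores / temperature, dim=0)    # emphasize matching frames
    return (weights[:, None] * frame_feats).sum(dim=0)  # (D,) pooled video feature

frames = F.normalize(torch.randn(16, 512), dim=-1)
caption = F.normalize(torch.randn(512), dim=-1)
print(pooled_video_embedding(frames, caption).shape)
```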
+
+ comment: A short version of this work appeared at CVPR 2023 Workshops. Project + page: https://imagine.enpc.fr/~ventural/multicaps/ +
+
+
+
+
+ + ☆ Low Cost Machine Vision for Insect Classification + + +
+ Preserving the number and diversity of insects is one of our society's most +important goals in the area of environmental sustainability. A prerequisite for +this is a systematic and up-scaled monitoring in order to detect correlations +and identify countermeasures. Therefore, automated monitoring using live +traps is important, but so far there is no system that provides image data of +sufficiently detailed information for entomological classification. + In this work, we present an imaging method as part of a multisensor system +developed as a low-cost, scalable, open-source system that is adaptable to +classical trap types. The image quality meets the requirements needed for +classification in the taxonomic tree. Therefore, illumination and resolution +have been optimized and motion artefacts have been suppressed. The system is +evaluated on an exemplary dataset consisting of 16 insect species from the same +as well as different genera, families and orders. We demonstrate that standard +CNN-architectures like ResNet50 (pretrained on iNaturalist data) or MobileNet +perform very well for the prediction task after re-training. Smaller +custom-made CNNs also lead to promising results. Classification accuracy of +$>96\%$ has been achieved. Moreover, it was shown that image cropping of insects +is necessary for the classification of species with high inter-class similarity. + +&#13;
+
+
+
+
+ + ☆ TextGaze: Gaze-Controllable Face Generation with Natural Language + + +
+ Generating face images with specific gaze information has attracted +considerable attention. Existing approaches typically input gaze values +directly for face generation, which is unnatural and requires annotated gaze +datasets for training, thereby limiting their application. In this paper, we +present a novel gaze-controllable face generation task. Our approach inputs +textual descriptions that describe human gaze and head behavior and generates +corresponding face images. Our work first introduces a text-of-gaze dataset +containing over 90k text descriptions spanning a dense distribution of gaze and +head poses. We further propose a gaze-controllable text-to-face method. Our +method contains a sketch-conditioned face diffusion module and a model-based +sketch diffusion module. We define a face sketch based on facial landmarks and +an eye segmentation map. The face diffusion module generates face images from +the face sketch, and the sketch diffusion module employs a 3D face model to +generate a face sketch from the text description. Experiments on the FFHQ +dataset show the effectiveness of our method. We will release our dataset and +code for future research. + +&#13;
+
+ comment: Under review +
+
+
+
+
+ + ☆ Sparse Reconstruction of Optical Doppler Tomography Based on State Space + Model + + +
+ Optical Doppler Tomography (ODT) is a blood flow imaging technique popularly +used in bioengineering applications. The fundamental unit of ODT is the 1D +frequency response along the A-line (depth), named raw A-scan. A 2D ODT image +(B-scan) is obtained by first sensing raw A-scans along the B-line (width), and +then constructing the B-scan from these raw A-scans via magnitude-phase +analysis and post-processing. To obtain a high-resolution B-scan with a precise +flow map, densely sampled A-scans are required in current methods, causing both +computational and storage burdens. To address this issue, in this paper we +propose a novel sparse reconstruction framework with four main sequential +steps: 1) early magnitude-phase fusion that encourages rich interaction of the +complementary information in magnitude and phase, 2) State Space Model +(SSM)-based representation learning, inspired by recent successes in Mamba and +VMamba, to naturally capture both the intra-A-scan sequential information and +between-A-scan interactions, 3) an Inception-based Feedforward Network module +(IncFFN) to further boost the SSM-module, and 4) a B-line Pixel Shuffle (BPS) +layer to effectively reconstruct the final results. In the experiments on +real-world animal data, our method shows clear effectiveness in reconstruction +accuracy. As the first application of SSM for image reconstruction tasks, we +expect our work to inspire related explorations in not only efficient ODT +imaging techniques but also generic image enhancement. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ PromptCIR: Blind Compressed Image Restoration with Prompt Learning + + +
+ Blind Compressed Image Restoration (CIR) has garnered significant attention +due to its practical applications. It aims to mitigate compression artifacts +caused by unknown quality factors, particularly with JPEG codecs. Existing +works on blind CIR often seek assistance from a quality factor prediction +network to facilitate their network in restoring compressed images. However, the +predicted numerical quality factor lacks spatial information, preventing +network adaptability toward image contents. Recent studies in +prompt-learning-based image restoration have showcased the potential of prompts +to generalize across varied degradation types and degrees. This motivated us to +design a prompt-learning-based compressed image restoration network, dubbed +PromptCIR, which can effectively restore images from various compression levels. +Specifically, PromptCIR exploits prompts to encode compression information +implicitly, where prompts directly interact with soft weights generated from +image features, thus providing dynamic content-aware and distortion-aware +guidance for the restoration process. The light-weight prompts enable our +method to adapt to different compression levels, while introducing minimal +parameter overhead. Overall, PromptCIR leverages the powerful transformer-based +backbone with the dynamic prompt module to proficiently handle blind CIR tasks, +winning first place in the blind compressed image enhancement track of the +NTIRE 2024 challenge. Extensive experiments have validated the effectiveness of +our proposed PromptCIR. The code is available at +https://github.com/lbc12345/PromptCIR-NTIRE24. + +&#13;
+
+ comment: Winner of NTIRE 2024 Blind Compressed Image Enhancement Challenge +
+
+
+
+
+ + ☆ Cost-Sensitive Uncertainty-Based Failure Recognition for Object + Detection UAI 2024 + + +
+ Object detectors in real-world applications often fail to detect objects due +to varying factors such as weather conditions and noisy input. Therefore, a +process that mitigates false detections is crucial for both safety and +accuracy. While uncertainty-based thresholding shows promise, previous works +demonstrate an imperfect correlation between uncertainty and detection errors. +This hinders ideal thresholding, prompting us to further investigate the +correlation and associated cost with different types of uncertainty. We +therefore propose a cost-sensitive framework for object detection tailored to +user-defined budgets on the two types of errors, missing and false detections. +We derive minimum thresholding requirements to prevent performance degradation +and define metrics to assess the applicability of uncertainty for failure +recognition. Furthermore, we automate and optimize the thresholding process to +maximize the failure recognition rate w.r.t. the specified budget. Evaluation +on three autonomous driving datasets demonstrates that our approach +significantly enhances safety, particularly in challenging scenarios. +Leveraging localization aleatoric uncertainty and softmax-based entropy only, +our method boosts the failure recognition rate by 36-60\% compared to +conventional approaches. Code is available at +https://mos-ks.github.io/publications. + +
+
+ comment: Accepted with an oral presentation at UAI 2024 +
+
+
+
+
+ + ☆ One-Shot Image Restoration + + +
+ Image restoration, or inverse problems in image processing, has long been an +extensively studied topic. In recent years supervised learning approaches have +become a popular strategy attempting to tackle this task. Unfortunately, most +supervised learning-based methods are highly demanding in terms of +computational resources and training data (sample complexity). In addition, +trained models are sensitive to domain changes, such as varying acquisition +systems, signal sampling rates, resolution and contrast. In this work, we try +to answer a fundamental question: Can supervised learning models generalize +well solely by learning from one image or even part of an image? If so, then +what is the minimal amount of patches required to achieve acceptable +generalization? To this end, we focus on an efficient patch-based learning +framework that requires a single image input-output pair for training. +Experimental results demonstrate the applicability, robustness and +computational efficiency of the proposed approach for supervised image +deblurring and super-resolution. Our results showcase significant improvement +of learning models' sample efficiency, generalization and time complexity, that +can hopefully be leveraged for future real-time applications, and applied to +other signals and modalities. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.14267 +
+
+
+
+
+ + ☆ Multi-view Image Prompted Multi-view Diffusion for Improved 3D + Generation + + +
+ Using images as prompts for 3D generation demonstrates particularly strong +performance compared to using text prompts alone, as images provide more +intuitive guidance for the 3D generation process. In this work, we delve into +the potential of using multiple image prompts, instead of a single image +prompt, for 3D generation. Specifically, we build on ImageDream, a novel +image-prompt multi-view diffusion model, to support multi-view images as the +input prompt. Our method, dubbed MultiImageDream, reveals that transitioning +from a single-image prompt to multiple-image prompts enhances the performance +of multi-view and 3D object generation according to various quantitative +evaluation metrics and qualitative assessments. This advancement is achieved +without the necessity of fine-tuning the pre-trained ImageDream multi-view +diffusion model. + +&#13;
+
+ comment: 5 pages including references, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Spatial-frequency Dual-Domain Feature Fusion Network for Low-Light + Remote Sensing Image Enhancement + + +
+ Low-light remote sensing images generally feature high resolution and high +spatial complexity, with continuously distributed surface features in space. +This continuity in scenes leads to extensive long-range correlations in spatial +domains within remote sensing images. Convolutional Neural Networks, which rely +on local correlations for long-distance modeling, struggle to establish +long-range correlations in such images. On the other hand, transformer-based +methods that focus on global information face high computational complexities +when processing high-resolution remote sensing images. From another +perspective, Fourier transform can compute global information without +introducing a large number of parameters, enabling the network to more +efficiently capture the overall image structure and establish long-range +correlations. Therefore, we propose a Dual-Domain Feature Fusion Network (DFFN) +for low-light remote sensing image enhancement. Specifically, this challenging +task of low-light enhancement is divided into two more manageable sub-tasks: +the first phase learns amplitude information to restore image brightness, and +the second phase learns phase information to refine details. To facilitate +information exchange between the two phases, we designed an information fusion +affine block that combines data from different phases and scales. Additionally, +we have constructed two dark light remote sensing datasets to address the +current lack of datasets in dark light remote sensing image enhancement. +Extensive evaluations show that our method outperforms existing +state-of-the-art methods. The code is available at +https://github.com/iijjlk/DFFN. + +
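A minimal sketch of the amplitude/phase split this two-stage design relies on: the Fourier amplitude carries global brightness and energy while the phase carries structure, and recombining the two reconstructs the image. This is a generic illustration, not the authors' code.

```python
# Split an image into Fourier amplitude and phase, then reconstruct it.
import torch

def fourier_split(img):
    """img: (B, C, H, W) real tensor -> (amplitude, phase)."""
    spec = torch.fft.fft2(img, norm="ortho")
    return spec.abs(), spec.angle()

def fourier_merge(amplitude, phase):
    spec = torch.polar(amplitude, phase)                 # amp * exp(i * phase)
    return torch.fft.ifft2(spec, norm="ortho").real

x = torch.rand(1, 3, 64, 64)
amp, pha = fourier_split(x)
print(torch.allclose(fourier_merge(amp, pha), x, atol=1e-5))  # ~exact round trip
```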
+
+ comment: 14 page +
+
+
+
+
+ + ☆ Frequency-Guided Multi-Level Human Action Anomaly Detection with + Normalizing Flows + + +
+ We introduce the task of human action anomaly detection (HAAD), which aims to +identify anomalous motions in an unsupervised manner given only the +pre-determined normal category of training action samples. Compared to prior +human-related anomaly detection tasks which primarily focus on unusual events +from videos, HAAD involves the learning of specific action labels to recognize +semantically anomalous human behaviors. To address this task, we propose a +normalizing flow (NF)-based detection framework where the sample likelihood is +effectively leveraged to indicate anomalies. As action anomalies often occur in +some specific body parts, in addition to the full-body action feature learning, +we incorporate extra encoding streams into our framework for a finer modeling +of body subsets. Our framework is thus multi-level to jointly discover global +and local motion anomalies. Furthermore, to show awareness of the potentially +jittery data during recording, we resort to discrete cosine transformation by +converting the action samples from the temporal to the frequency domain to +mitigate the issue of data instability. Extensive experimental results on two +human action datasets demonstrate that our method outperforms the baselines +formed by adapting state-of-the-art human activity AD approaches to our task of +HAAD. + +
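A small sketch of the temporal-to-frequency conversion mentioned above: apply a DCT along the time axis of a pose sequence and keep the low-frequency coefficients, which damps frame-level jitter. The truncation length and data layout are assumptions.

```python
# Convert a pose sequence from the temporal to the frequency domain with a DCT.
import numpy as np
from scipy.fft import dct

def to_frequency(pose_seq, keep=16):
    """pose_seq: (T, J, 3) joint positions over T frames."""
    coeffs = dct(pose_seq, axis=0, norm="ortho")   # temporal DCT per joint/coordinate
    return coeffs[:keep]                           # low-frequency summary: (keep, J, 3)

seq = np.random.randn(64, 17, 3)
print(to_frequency(seq).shape)
```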
+
+
+
+
+ + ☆ Estimating the Robustness Radius for Randomized Smoothing with + 100$\times$ Sample Efficiency + + +
+ Randomized smoothing (RS) has successfully been used to improve the +robustness of predictions for deep neural networks (DNNs) by adding random +noise to create multiple variations of an input, followed by deciding the +consensus. To understand if an RS-enabled DNN is effective in the sampled input +domains, it is mandatory to sample data points within the operational design +domain, acquire the point-wise certificate regarding robustness radius, and +compare it with pre-defined acceptance criteria. Consequently, ensuring that a +point-wise robustness certificate for any given data point is obtained +relatively cost-effectively is crucial. This work demonstrates that reducing +the number of samples by one or two orders of magnitude can still enable the +computation of a slightly smaller robustness radius (commonly ~20% radius +reduction) with the same confidence. We provide the mathematical foundation for +explaining the phenomenon while experimentally showing promising results on the +standard CIFAR-10 and ImageNet datasets. + +
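For context, the standard randomized-smoothing certificate (Cohen et al.) computes the radius as sigma times the inverse Gaussian CDF of a lower confidence bound on the top-class probability, estimated from n noisy samples; shrinking n shrinks that bound and hence the radius, which is the trade-off the abstract quantifies. The sketch below illustrates this with a Clopper-Pearson one-sided bound; the sample counts are illustrative.

```python
# Certified radius from n noisy samples: radius = sigma * Phi^{-1}(pA_lower).
from scipy.stats import beta, norm

def certified_radius(k, n, sigma=0.5, alpha=0.001):
    """k: samples voting for the top class out of n noisy samples."""
    p_lower = beta.ppf(alpha, k, n - k + 1) if k > 0 else 0.0  # one-sided lower bound
    if p_lower <= 0.5:
        return 0.0                      # cannot certify
    return sigma * norm.ppf(p_lower)

for n in (100_000, 1_000, 100):         # same empirical accuracy, fewer samples
    print(n, round(certified_radius(int(0.99 * n), n), 3))
```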
+
+
+
+
+ + ☆ MV-VTON: Multi-View Virtual Try-On with Diffusion Models + + +
+ The goal of image-based virtual try-on is to generate an image of the target +person naturally wearing the given clothing. However, most existing methods +solely focus on the frontal try-on using the frontal clothing. When the views +of the clothing and person are significantly inconsistent, particularly when +the person's view is non-frontal, the results are unsatisfactory. To address +this challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to +reconstruct the dressing results of a person from multiple views using the +given clothes. On the one hand, given that single-view clothes provide +insufficient information for MV-VTON, we instead employ two images, i.e., the +frontal and back views of the clothing, to encompass the complete view as much +as possible. On the other hand, the diffusion models that have demonstrated +superior abilities are adopted to perform our MV-VTON. In particular, we +propose a view-adaptive selection method where hard-selection and +soft-selection are applied to the global and local clothing feature extraction, +respectively. This ensures that the clothing features are roughly fit to the +person's view. Subsequently, we suggest a joint attention block to align and +fuse clothing features with person features. Additionally, we collect a MV-VTON +dataset, i.e., Multi-View Garment (MVG), in which each person has multiple +photos with diverse views and poses. Experiments show that the proposed method +not only achieves state-of-the-art results on MV-VTON task using our MVG +dataset, but also has superiority on frontal-view virtual try-on task using +VITON-HD and DressCode datasets. Codes and datasets will be publicly released +at https://github.com/hywang2002/MV-VTON . + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ UniRGB-IR: A Unified Framework for Visible-Infrared Downstream Tasks via + Adapter Tuning + + +
+ Semantic analysis on visible (RGB) and infrared (IR) images has gained +attention for its ability to be more accurate and robust under low-illumination +and complex weather conditions. Due to the lack of pre-trained foundation +models on large-scale infrared image datasets, existing methods prefer to +design task-specific frameworks and directly fine-tune them with pre-trained +foundation models on their RGB-IR semantic relevance datasets, which results in +poor scalability and limited generalization. In this work, we propose a +scalable and efficient framework called UniRGB-IR to unify RGB-IR downstream +tasks, in which a novel adapter is developed to efficiently introduce richer +RGB-IR features into the pre-trained RGB-based foundation model. Specifically, +our framework consists of a vision transformer (ViT) foundation model, a +Multi-modal Feature Pool (MFP) module and a Supplementary Feature Injector +(SFI) module. The MFP and SFI modules cooperate with each other as an adapter +to effectively complement the ViT features with the contextual multi-scale +features. During the training process, we freeze the entire foundation model to +inherit prior knowledge and only optimize the MFP and SFI modules. Furthermore, +to verify the effectiveness of our framework, we utilize the ViT-Base as the +pre-trained foundation model to perform extensive experiments. Experimental +results on various RGB-IR downstream tasks demonstrate that our method can +achieve state-of-the-art performance. The source code and results are available +at https://github.com/PoTsui99/UniRGB-IR.git. + +&#13;
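A hedged sketch of the adapter-tuning idea (not the paper's exact MFP/SFI design): the ViT backbone stays frozen and only a small bottleneck module that injects IR-derived features into the RGB token stream is trained. Dimensions and the fusion rule are assumptions.

```python
# Generic bottleneck adapter that injects auxiliary IR features into frozen ViT tokens.
import torch
import torch.nn as nn

class SimpleInjector(nn.Module):
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(2 * dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)   # zero init: the adapter starts as a no-op
        nn.init.zeros_(self.up.bias)

    def forward(self, rgb_tokens, ir_tokens):
        fused = torch.cat([rgb_tokens, ir_tokens], dim=-1)
        return rgb_tokens + self.up(torch.relu(self.down(fused)))

rgb = torch.randn(2, 197, 768)   # frozen ViT tokens
ir = torch.randn(2, 197, 768)    # features from the IR branch (assumed shape)
print(SimpleInjector()(rgb, ir).shape)
```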
+
+
+
+
+ + ☆ Simultaneous Tri-Modal Medical Image Fusion and Super-Resolution using + Conditional Diffusion Model + + +
+ In clinical practice, tri-modal medical image fusion, compared to the +existing dual-modal technique, can provide a more comprehensive view of the +lesions, aiding physicians in evaluating the disease's shape, location, and +biological activity. However, due to the limitations of imaging equipment and +considerations for patient safety, the quality of medical images is usually +limited, leading to sub-optimal fusion performance, and affecting the depth of +image analysis by the physician. Thus, there is an urgent need for a technology +that can both enhance image resolution and integrate multi-modal information. +Although current image processing methods can effectively address image fusion +and super-resolution individually, solving both problems synchronously remains +extremely challenging. In this paper, we propose TFS-Diff, a model that +simultaneously realizes tri-modal medical image fusion and super-resolution. +Specifically, TFS-Diff is based on the random iterative denoising process of a +diffusion model. We also develop a simple objective function, the proposed +fusion super-resolution loss, which effectively evaluates the uncertainty in the +fusion and ensures the stability of the optimization process. In addition, a +channel attention module is proposed to effectively integrate key information +from different modalities for clinical diagnosis, avoiding the information loss +caused by repeated image processing. Extensive experiments on public Harvard +datasets show that TFS-Diff significantly surpasses the existing +state-of-the-art methods in both quantitative and visual evaluations. The +source code will be available at GitHub. + +&#13;
+
+
+
+
+ + ☆ On the Road to Clarity: Exploring Explainable AI for World Models in a + Driver Assistance System + + +
+ In Autonomous Driving (AD) transparency and safety are paramount, as mistakes +are costly. However, neural networks used in AD systems are generally +considered black boxes. As a countermeasure, we have methods of explainable AI +(XAI), such as feature relevance estimation and dimensionality reduction. +Coarse graining techniques can also help reduce dimensionality and find +interpretable global patterns. A specific coarse graining method is +Renormalization Groups from statistical physics. It has previously been applied +to Restricted Boltzmann Machines (RBMs) to interpret unsupervised learning. We +refine this technique by building a transparent backbone model for +convolutional variational autoencoders (VAE) that allows mapping latent values +to input features and has performance comparable to trained black box VAEs. +Moreover, we propose a custom feature map visualization technique to analyze +the internal convolutional layers in the VAE to explain internal causes of poor +reconstruction that may lead to dangerous traffic scenarios in AD applications. +In a second key contribution, we propose explanation and evaluation techniques +for the internal dynamics and feature relevance of prediction networks. We test +a long short-term memory (LSTM) network in the computer vision domain to +evaluate the predictability and in future applications potentially safety of +prediction models. We showcase our methods by analyzing a VAE-LSTM world model +that predicts pedestrian perception in an urban traffic situation. + +
+
+ comment: 8 pages, 6 figures, to be published in IEEE CAI 2024 +
+
+
+
+
+ + ☆ Masked Two-channel Decoupling Framework for Incomplete Multi-view Weak + Multi-label Learning NeurIPS 2023 + + +
+ Multi-view learning has become a popular research topic in recent years, but +research on the cross-application of classic multi-label classification and +multi-view learning is still in its early stages. In this paper, we focus on +the complex yet highly realistic task of incomplete multi-view weak multi-label +learning and propose a masked two-channel decoupling framework based on deep +neural networks to solve this problem. The core innovation of our method lies +in decoupling the single-channel view-level representation, which is common in +deep multi-view learning methods, into a shared representation and a +view-proprietary representation. We also design a cross-channel contrastive +loss to enhance the semantic property of the two channels. Additionally, we +exploit supervised information to design a label-guided graph regularization +loss, helping the extracted embedding features preserve the geometric structure +among samples. Inspired by the success of masking mechanisms in image and text +analysis, we develop a random fragment masking strategy for vector features to +improve the learning ability of encoders. Finally, it is important to emphasize +that our model is fully adaptable to arbitrary view and label absences while +also performing well on the ideal full data. We have conducted sufficient and +convincing experiments to confirm the effectiveness and advancement of our +model. + +
+
+ comment: Accepted at NeurIPS 2023. Email: liucl1996@163.com +
+
+
+
+
+ + ☆ A Novel Spike Transformer Network for Depth Estimation from Event + Cameras via Cross-modality Knowledge Distillation + + +
+ Depth estimation is crucial for interpreting complex environments, especially +in areas such as autonomous vehicle navigation and robotics. Nonetheless, +obtaining accurate depth readings from event camera data remains a formidable +challenge. Event cameras operate differently from traditional digital cameras, +continuously capturing data and generating asynchronous binary spikes that +encode time, location, and light intensity. Yet, the unique sampling mechanisms +of event cameras render standard image-based algorithms inadequate for +processing spike data. This necessitates the development of innovative, +spike-aware algorithms tailored for event cameras, a task compounded by the +irregularity, continuity, noise, and spatial and temporal characteristics +inherent in spiking data. Harnessing the strong generalization capabilities of +transformer neural networks for spatiotemporal data, we propose a purely +spike-driven spike transformer network for depth estimation from spiking camera +data. To address performance limitations with Spiking Neural Networks (SNNs), we +introduce a novel single-stage cross-modality knowledge transfer framework +leveraging knowledge from a large vision foundational model of artificial +neural networks (ANN) (DINOv2) to enhance the performance of SNNs with limited +data. Our experimental results on both synthetic and real datasets show +substantial improvements over existing models, with notable gains in Absolute +Relative and Square Relative errors (49% and 39.77% improvements over the +benchmark model Spike-T, respectively). Besides accuracy, the proposed model +also demonstrates reduced power consumption, a critical factor for practical +applications. + +&#13;
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Dense Road Surface Grip Map Prediction from Multimodal Image Data ICPR 2024 + + +
+ Slippery road weather conditions are prevalent in many regions and cause a +regular risk for traffic. Still, there has been less research on how autonomous +vehicles could detect slippery driving conditions on the road to drive safely. +In this work, we propose a method to predict a dense grip map from the area in +front of the car, based on postprocessed multimodal sensor data. We trained a +convolutional neural network to predict pixelwise grip values from fused RGB +camera, thermal camera, and LiDAR reflectance images, based on weakly +supervised ground truth from an optical road weather sensor. + The experiments show that it is possible to predict dense grip values with +good accuracy from the used data modalities as the produced grip map follows +both ground truth measurements and local weather conditions, such as snowy +areas on the road. The model using only the RGB camera or LiDAR reflectance +modality provided good baseline results for grip prediction accuracy while +using models fusing the RGB camera, thermal camera, and LiDAR modalities +improved the grip predictions significantly. + +
+
+ comment: 17 pages, 7 figures (supplementary material 1 page, 1 figure). + Submitted to 27th International Conference of Pattern Recognition (ICPR 2024) +
+
+
+
+
+ + ☆ Image Copy-Move Forgery Detection via Deep PatchMatch and Pairwise + Ranking Learning + + +
+ Recent advances in deep learning algorithms have shown impressive progress in +image copy-move forgery detection (CMFD). However, these algorithms lack +generalizability in practical scenarios where the copied regions are not +present in the training images, or the cloned regions are part of the +background. Additionally, these algorithms utilize convolution operations to +distinguish source and target regions, leading to unsatisfactory results when +the target regions blend well with the background. To address these +limitations, this study proposes a novel end-to-end CMFD framework that +integrates the strengths of conventional and deep learning methods. +Specifically, the study develops a deep cross-scale PatchMatch (PM) method that +is customized for CMFD to locate copy-move regions. Unlike existing deep +models, our approach utilizes features extracted from high-resolution scales to +seek explicit and reliable point-to-point matching between source and target +regions. Furthermore, we propose a novel pairwise rank learning framework to +separate source and target regions. By leveraging the strong prior of +point-to-point matches, the framework can identify subtle differences and +effectively discriminate between source and target regions, even when the +target regions blend well with the background. Our framework is fully +differentiable and can be trained end-to-end. Comprehensive experimental +results highlight the remarkable generalizability of our scheme across various +copy-move scenarios, significantly outperforming existing methods. + +
+
+ comment: 16 pages, 14figures +
+
+
+
+
+ + ☆ Part-Guided 3D RL for Sim2Real Articulated Object Manipulation + + +
+ Manipulating unseen articulated objects through visual feedback is a critical +but challenging task for real robots. Existing learning-based solutions mainly +focus on visual affordance learning or other pre-trained visual models to guide +manipulation policies, which face challenges for novel instances in real-world +scenarios. In this paper, we propose a novel part-guided 3D RL framework, which +can learn to manipulate articulated objects without demonstrations. We combine +the strengths of 2D segmentation and 3D RL to improve the efficiency of RL +policy training. To improve the stability of the policy on real robots, we +design a Frame-consistent Uncertainty-aware Sampling (FUS) strategy to get a +condensed and hierarchical 3D representation. In addition, a single versatile +RL policy can be trained on multiple articulated object manipulation tasks +simultaneously in simulation and shows great generalizability to novel +categories and instances. Experimental results demonstrate the effectiveness of +our framework in both simulation and real-world settings. Our code is available +at +https://github.com/THU-VCLab/Part-Guided-3D-RL-for-Sim2Real-Articulated-Object-Manipulation. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Adversarial Reweighting with $α$-Power Maximization for Domain + Adaptation + + +
+ The practical Domain Adaptation (DA) tasks, e.g., Partial DA (PDA), open-set +DA, universal DA, and test-time adaptation, have gained increasing attention in +the machine learning community. In this paper, we propose a novel approach, +dubbed Adversarial Reweighting with $\alpha$-Power Maximization (ARPM), for PDA +where the source domain contains private classes absent in target domain. In +ARPM, we propose a novel adversarial reweighting model that adversarially +learns to reweight source domain data to identify source-private class samples +by assigning smaller weights to them, for mitigating potential negative +transfer. Based on the adversarial reweighting, we train the transferable +recognition model on the reweighted source distribution to be able to classify +common class data. To reduce the prediction uncertainty of the recognition +model on the target domain for PDA, we present an $\alpha$-power maximization +mechanism in ARPM, which enriches the family of losses for reducing the +prediction uncertainty for PDA. Extensive experimental results on five PDA +benchmarks, i.e., Office-31, Office-Home, VisDA-2017, ImageNet-Caltech, and +DomainNet, show that our method is superior to recent PDA methods. Ablation +studies also confirm the effectiveness of components in our approach. To +theoretically analyze our method, we deduce an upper bound of target domain +expected error for PDA, which is approximately minimized in our approach. We +further extend ARPM to open-set DA, universal DA, and test time adaptation, and +verify the usefulness through experiments. + +
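A generic sketch of an alpha-power maximization objective for reducing target-domain prediction uncertainty: for alpha greater than 1, the sum of class probabilities raised to alpha is largest for confident, low-entropy predictions, so maximizing it (minimizing its negative) sharpens predictions. The exact loss used in ARPM may differ; this is an illustrative instance only.

```python
# Alpha-power objective: lower loss => more confident (less uncertain) predictions.
import torch

def alpha_power_loss(logits, alpha=2.0):
    probs = torch.softmax(logits, dim=-1)
    return -(probs ** alpha).sum(dim=-1).mean()   # minimize => maximize confidence

uncertain = torch.zeros(4, 10)                    # uniform predictions
confident = torch.full((4, 10), -10.0)
confident[:, 0] = 10.0                            # near one-hot predictions
print(alpha_power_loss(uncertain), alpha_power_loss(confident))  # loss drops
```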
+
+ comment: To appear in IJCV +
+
+
+
+
+ + ☆ 3SHNet: Boosting Image-Sentence Retrieval via Visual Semantic-Spatial + Self-Highlighting + + +
+ In this paper, we propose a novel visual Semantic-Spatial Self-Highlighting +Network (termed 3SHNet) for high-precision, high-efficiency and +high-generalization image-sentence retrieval. 3SHNet highlights the salient +identification of prominent objects and their spatial locations within the +visual modality, thus allowing the integration of visual semantics-spatial +interactions and maintaining independence between two modalities. This +integration effectively combines object regions with the corresponding semantic +and position layouts derived from segmentation to enhance the visual +representation. And the modality-independence guarantees efficiency and +generalization. Additionally, 3SHNet utilizes the structured contextual visual +scene information from segmentation to conduct the local (region-based) or +global (grid-based) guidance and achieve accurate hybrid-level retrieval. +Extensive experiments conducted on MS-COCO and Flickr30K benchmarks +substantiate the superior performances, inference efficiency and generalization +of the proposed 3SHNet when juxtaposed with contemporary state-of-the-art +methodologies. Specifically, on the larger MS-COCO 5K test set, we achieve +16.3%, 24.8%, and 18.3% improvements in terms of rSum score, respectively, +compared with the state-of-the-art methods using different image +representations, while maintaining optimal retrieval efficiency. Moreover, our +performance on cross-dataset generalization improves by 18.6%. Data and code +are available at https://github.com/XuriGe1995/3SHNet. + +
+
+ comment: Accepted Information Processing and Management (IP&M), 10 pages, 9 + figures and 8 tables +
+
+
+
+
+ + ☆ SDFD: Building a Versatile Synthetic Face Image Dataset with Diverse + Attributes + + +
+ AI systems rely on extensive training on large datasets to address various +tasks. However, image-based systems, particularly those used for demographic +attribute prediction, face significant challenges. Many current face image +datasets primarily focus on demographic factors such as age, gender, and skin +tone, overlooking other crucial facial attributes like hairstyle and +accessories. This narrow focus limits the diversity of the data and +consequently the robustness of AI systems trained on them. This work aims to +address this limitation by proposing a methodology for generating synthetic +face image datasets that capture a broader spectrum of facial diversity. +Specifically, our approach integrates a systematic prompt formulation strategy, +encompassing not only demographics and biometrics but also non-permanent traits +like make-up, hairstyle, and accessories. These prompts guide a +state-of-the-art text-to-image model in generating a comprehensive dataset of +high-quality realistic images and can be used as an evaluation set in face +analysis systems. Compared to existing datasets, our proposed dataset proves +equally or more challenging in image classification tasks while being much +smaller in size. + +
+
+ comment: 2024 18th International Conference on Automatic Face and Gesture + Recognition (FG) +
+
+
+
+
+ + ☆ Trinity Detector:text-assisted and attention mechanisms based spectral + fusion for diffusion generation image detection + + +
+ Artificial Intelligence Generated Content (AIGC) techniques, represented by +text-to-image generation, have enabled the malicious use of deep forgeries, +raising concerns about the trustworthiness of multimedia content. Adapting +traditional forgery detection methods to diffusion models proves challenging. +Thus, this paper proposes a forgery detection method explicitly designed for +diffusion models called Trinity Detector. Trinity Detector incorporates +coarse-grained text features through a CLIP encoder, coherently integrating +them with fine-grained artifacts in the pixel domain for comprehensive +multimodal detection. To heighten sensitivity to diffusion-generated image +features, a Multi-spectral Channel Attention Fusion Unit (MCAF) is designed, +extracting spectral inconsistencies through adaptive fusion of diverse +frequency bands and further integrating spatial co-occurrence of the two +modalities. Extensive experimentation validates that our Trinity Detector +outperforms several state-of-the-art methods; its performance is competitive +across all datasets, with up to a 17.6\% improvement in transferability on the +diffusion datasets. + +&#13;
+
+
+
+
+ + ☆ Comparison of self-supervised in-domain and supervised out-domain + transfer learning for bird species recognition + + +
+ Transferring the weights of a pre-trained model to assist another task has +become a crucial part of modern deep learning, particularly in data-scarce +scenarios. Pre-training refers to the initial step of training models outside +the current task of interest, typically on another dataset. It can be done via +supervised models using human-annotated datasets or self-supervised models +trained on unlabeled datasets. In both cases, many pre-trained models are +available to fine-tune for the task of interest. Interestingly, research has +shown that pre-trained models from ImageNet can be helpful for audio tasks +despite being trained on image datasets. Hence, it is unclear whether in-domain +models would be advantageous compared to competent out-domain models, such as +convolutional neural networks from ImageNet. Our experiments demonstrate +the usefulness of in-domain models and datasets for bird species recognition by +leveraging VICReg, a recent and powerful self-supervised method. + +&#13;
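For readers unfamiliar with VICReg (Bardes et al.), its objective combines an invariance term between two augmented views with variance and covariance regularizers that prevent representation collapse. The sketch below uses the common 25/25/1 weighting; treat it as illustrative rather than the exact training setup used in the paper.

```python
# VICReg-style loss: invariance + variance hinge + covariance decorrelation.
import torch
import torch.nn.functional as F

def vicreg_loss(z1, z2, sim_w=25.0, var_w=25.0, cov_w=1.0, eps=1e-4):
    n, d = z1.shape
    inv = F.mse_loss(z1, z2)                                      # invariance term

    def var_term(z):
        std = torch.sqrt(z.var(dim=0) + eps)
        return torch.relu(1.0 - std).mean()                       # keep each dim spread out

    def cov_term(z):
        z = z - z.mean(dim=0)
        cov = (z.T @ z) / (n - 1)
        off_diag = cov - torch.diag(torch.diag(cov))
        return (off_diag ** 2).sum() / d                          # decorrelate dimensions

    return (sim_w * inv
            + var_w * (var_term(z1) + var_term(z2))
            + cov_w * (cov_term(z1) + cov_term(z2)))

z1, z2 = torch.randn(256, 128), torch.randn(256, 128)
print(vicreg_loss(z1, z2).item())
```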
+
+
+
+
+ + ☆ Weakly Supervised Training for Hologram Verification in Identity + Documents ICDAR 2024 + + +
+ We propose a method to remotely verify the authenticity of Optically Variable +Devices (OVDs), often referred to as ``holograms'', in identity documents. Our +method processes video clips captured with smartphones under common lighting +conditions, and is evaluated on two public datasets: MIDV-HOLO and MIDV-2020. +Thanks to a weakly-supervised training, we optimize a feature extraction and +decision pipeline which achieves a new leading performance on MIDV-HOLO, while +maintaining a high recall on documents from MIDV-2020 used as attack samples. +It is also the first method, to date, to effectively address the photo +replacement attack task, and can be trained on either genuine samples, attack +samples, or both for increased performance. By enabling to verify OVD shapes +and dynamics with very little supervision, this work opens the way towards the +use of massive amounts of unlabeled data to build robust remote identity +document verification systems on commodity smartphones. Code is available at +https://github.com/EPITAResearchLab/pouliquen.24.icdar + +
+
+ comment: Accepted at the International Conference on Document Analysis and + Recognition (ICDAR 2024) +
+
+
+
+
+ + ☆ Camera Motion Estimation from RGB-D-Inertial Scene Flow CVPR2024 + + +
+ In this paper, we introduce a novel formulation for camera motion estimation +that integrates RGB-D images and inertial data through scene flow. Our goal is +to accurately estimate the camera motion in a rigid 3D environment, along with +the state of the inertial measurement unit (IMU). Our proposed method offers +the flexibility to operate as a multi-frame optimization or to marginalize +older data, thus effectively utilizing past measurements. To assess the +performance of our method, we conducted evaluations using both synthetic data +from the ICL-NUIM dataset and real data sequences from the OpenLORIS-Scene +dataset. Our results show that the fusion of these two sensors enhances the +accuracy of camera motion estimation when compared to using only visual data. + +
+
+ comment: Accepted to CVPR2024 Workshop on Visual Odometry and Computer Vision + Applications +
+
+
+
+
+ + ☆ Parameter Efficient Fine-tuning of Self-supervised ViTs without + Catastrophic Forgetting CVPR + + +
+ Artificial neural networks often suffer from catastrophic forgetting, where +learning new concepts leads to a complete loss of previously acquired +knowledge. We observe that this issue is particularly magnified in vision +transformers (ViTs), where fine-tuning on new tasks after pre-training can +significantly degrade the model's original general abilities. For instance, a +DINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on +ImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming +this stability-plasticity dilemma is crucial for enabling ViTs to continuously +learn and adapt to new domains while preserving their initial knowledge. In +this work, we study two new parameter-efficient fine-tuning strategies: +(1) Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal +that using either Block Expansion or LoRA on self-supervised pre-trained ViTs +surpasses fully fine-tuned ViTs in new domains while offering significantly +greater parameter efficiency. Notably, we find that Block Expansion experiences +only a minimal performance drop in the pre-training domain, thereby effectively +mitigating catastrophic forgetting in pre-trained ViTs. + +&#13;
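A minimal sketch of the LoRA idea discussed above: the pretrained weight stays frozen and a low-rank update BA is learned on top of it, so only about 2*r*d parameters train per layer. The hyperparameters (r, alpha) below are illustrative, not the paper's settings.

```python
# LoRA-wrapped linear layer: frozen base weight plus a trainable low-rank update.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, r=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                       # frozen pretrained weights
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no-op at start
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(768, 768))
print(layer(torch.randn(4, 768)).shape)
```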
+
+ comment: Accepted at eLVM Workshop, CVPR, 2024 +
+
+
+
+
+ + ☆ Binarizing Documents by Leveraging both Space and Frequency ICDAR2024 + + +
+ Document Image Binarization is a well-known problem in Document Analysis and +Computer Vision, although it is far from being solved. One of the main +challenges of this task is that documents generally exhibit degradations and +acquisition artifacts that can greatly vary throughout the page. Nonetheless, +even when dealing with a local patch of the document, taking into account the +overall appearance of a wide portion of the page can ease the prediction by +enriching it with semantic information on the ink and background conditions. In +this respect, approaches able to model both local and global information have +been proven suitable for this task. In particular, recent applications of +Vision Transformer (ViT)-based models, able to model short and long-range +dependencies via the attention mechanism, have demonstrated their superiority +over standard Convolution-based models, which instead struggle to model global +dependencies. In this work, we propose an alternative solution based on the +recently introduced Fast Fourier Convolutions, which overcomes the limitation +of standard convolutions in modeling global information while requiring fewer +parameters than ViTs. We validate the effectiveness of our approach via +extensive experimental analysis considering different types of degradations. + +
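A rough sketch of the spectral branch behind Fast Fourier Convolutions, which is what gives each output pixel a global receptive field: transform to the frequency domain with a real FFT, mix channels there with a 1x1 convolution, and transform back. This is an approximation of the general idea, not the paper's architecture.

```python
# Spectral channel mixing: FFT -> 1x1 conv on real/imag channels -> inverse FFT.
import torch
import torch.nn as nn

class SpectralMix(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.mix = nn.Conv2d(2 * channels, 2 * channels, kernel_size=1)

    def forward(self, x):
        spec = torch.fft.rfft2(x, norm="ortho")                   # complex (B, C, H, W//2+1)
        stacked = torch.cat([spec.real, spec.imag], dim=1)        # real-valued channels
        re, im = self.mix(stacked).chunk(2, dim=1)
        return torch.fft.irfft2(torch.complex(re, im), s=x.shape[-2:], norm="ortho")

print(SpectralMix(8)(torch.randn(1, 8, 64, 64)).shape)
```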
+
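+ <p>
+ The Fast Fourier Convolutions mentioned above gain a global receptive field by filtering in the frequency domain. The block below is a simplified spectral branch written for illustration only; the channel counts are placeholders, and the full FFC also keeps a parallel local (spatial) branch.
+ </p>
+ <pre><code>
+# Simplified spectral branch in the spirit of Fast Fourier Convolutions (FFC):
+# a point-wise convolution applied in the Fourier domain gives every output
+# pixel a receptive field covering the whole patch, at a small parameter cost.
+import torch
+import torch.nn as nn
+
+class SpectralBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        # real and imaginary parts are stacked along the channel axis
+        self.conv = nn.Conv2d(2 * channels, 2 * channels, kernel_size=1)
+        self.act = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        freq = torch.fft.rfft2(x, norm="ortho")                 # (b, c, h, w//2+1)
+        f = torch.cat([freq.real, freq.imag], dim=1)            # (b, 2c, h, w//2+1)
+        f = self.act(self.conv(f))
+        real, imag = f.chunk(2, dim=1)
+        return torch.fft.irfft2(torch.complex(real, imag), s=(h, w), norm="ortho")
+
+x = torch.randn(1, 32, 64, 64)                                  # features of a document patch
+print(SpectralBlock(32)(x).shape)
+</code></pre>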
+ comment: Accepted at ICDAR2024 +
+
+
+
+
+ + ☆ Optimizing Universal Lesion Segmentation: State Space Model-Guided + Hierarchical Networks with Feature Importance Adjustment + + +
+ Deep learning has revolutionized medical imaging by providing innovative solutions to complex healthcare challenges. Traditional models often struggle to dynamically adjust feature importance, resulting in suboptimal representation, particularly in tasks like semantic segmentation crucial for accurate structure delineation. Moreover, their static nature incurs high computational costs. To tackle these issues, we introduce Mamba-Ahnet, a novel integration of State Space Model (SSM) and Advanced Hierarchical Network (AHNet) within the MAMBA framework, specifically tailored for semantic segmentation in medical imaging. Mamba-Ahnet combines SSM's feature extraction and comprehension with AHNet's attention mechanisms and image reconstruction, aiming to enhance segmentation accuracy and robustness. By dissecting images into patches and refining feature comprehension through self-attention mechanisms, the approach significantly improves feature resolution. Integration of AHNet into the MAMBA framework further enhances segmentation performance by selectively amplifying informative regions and facilitating the learning of rich hierarchical representations. Evaluation on the Universal Lesion Segmentation dataset demonstrates superior performance compared to state-of-the-art techniques, with notable metrics such as a Dice similarity coefficient of approximately 98% and an Intersection over Union of about 83%. These results underscore the potential of our methodology to enhance diagnostic accuracy, treatment planning, and ultimately, patient outcomes in clinical practice. By addressing the limitations of traditional models and leveraging the power of deep learning, our approach represents a significant step forward in advancing medical imaging technology.
+
+
+
+
+
+ + ☆ ObjectAdd: Adding Objects into Image via a Training-Free Diffusion + Modification Fashion ECCV2024 + + +
+ We introduce ObjectAdd, a training-free diffusion modification method to add user-expected objects into a user-specified area. The motivation for ObjectAdd stems from two observations: first, describing everything in one prompt can be difficult, and second, users often need to add objects into the generated image. To accommodate real-world use, our ObjectAdd maintains accurate image consistency after adding objects, with technical innovations in: (1) embedding-level concatenation to ensure that text embeddings coalesce correctly; (2) object-driven layout control with latent and attention injection to ensure that objects land in the user-specified area; (3) prompted image inpainting in an attention refocusing & object expansion fashion to ensure that the rest of the image stays the same. Given a text-prompted image, our ObjectAdd allows users to specify a box and an object, and achieves: (1) adding the object inside the box area; (2) keeping the content outside the box area exact; (3) flawless fusion between the two areas.
+
+
+ comment: 12 pages, submitted to ECCV2024 +
+
+
+
+
+ + ☆ SAGHOG: Self-Supervised Autoencoder for Generating HOG Features for + Writer Retrieval ICDAR2024 + + +
+ This paper introduces SAGHOG, a self-supervised pretraining strategy for writer retrieval using HOG features of the binarized input image. Our preprocessing involves applying the Segment Anything technique to extract handwriting from various datasets, ending up with about 24k documents, followed by training a vision transformer on reconstructing masked patches of the handwriting. SAGHOG is then finetuned by appending NetRVLAD as an encoding layer to the pretrained encoder. Evaluation of our approach on three historical datasets, Historical-WI, HisFrag20, and GRK-Papyri, demonstrates the effectiveness of SAGHOG for writer retrieval. Additionally, we provide ablation studies on our architecture and evaluate unsupervised and supervised finetuning. Notably, on HisFrag20, SAGHOG outperforms related work with a mAP of 57.2% - a margin of 11.6% over the current state of the art, showcasing its robustness on challenging data, and it is competitive even on small datasets, e.g. GRK-Papyri, where we achieve a Top-1 accuracy of 58.0%.
+
+
+ comment: accepted for ICDAR2024 +
+
+
+
+
+ + ☆ SLAM for Indoor Mapping of Wide Area Construction Environments + + +
+ Simultaneous localization and mapping (SLAM), i.e., the reconstruction of the environment represented by a (3D) map and the concurrent pose estimation, has made astonishing progress. Meanwhile, large-scale applications aiming at data collection in complex environments like factory halls or construction sites are becoming feasible. However, in contrast to small-scale scenarios with building interiors separated into single rooms, shop floors or construction areas require measurements at larger distances in potentially textureless areas under difficult illumination. Pose estimation is further aggravated since, as is usual for such indoor applications, no GNSS measurements are available. In our work, we realize data collection in a large factory hall by a robot system equipped with four stereo cameras as well as a 3D laser scanner. We apply our state-of-the-art LiDAR and visual SLAM approaches and discuss the respective pros and cons of the different sensor types for trajectory estimation and dense map generation in such an environment. Additionally, dense and accurate depth maps are generated by 3D Gaussian splatting, which we plan to use in the context of our project aiming at automatic construction and site monitoring.
+
+
+
+
+
+ + ☆ Scrutinizing Data from Sky: An Examination of Its Veracity in Area Based + Traffic Contexts + + +
+ Traffic data collection has been an overwhelming task for researchers as well as authorities over the years. With the advancement of technology and the introduction of various tools for processing and extracting traffic data, the task has become significantly more convenient. Data from Sky (DFS) is one such tool, based on image processing and artificial intelligence (AI), that provides output for macroscopic as well as microscopic variables of the traffic streams. The company claims to provide 98 to 100 percent accuracy on the data exported using the DFS tool. The tool is widely used in developed countries where the traffic is homogeneous and has lane-based movements. In this study, the authors check the veracity of the DFS tool in the heterogeneous, area-based traffic movement that prevails in most developing countries. The validation is done using Classified Volume Counts (CVC), Space Mean Speeds (SMS) of individual vehicle classes, and the microscopic trajectory of a probe vehicle to verify the DFS claims. The error in CVCs for each vehicle class present in the traffic stream is estimated. Mean Absolute Percentage Error (MAPE) values are calculated for the average speeds of each vehicle class between manually extracted and DFS-extracted space mean speeds (SMSs), and the microscopic trajectories are validated using a GPS-based tracker mounted on probe vehicles. The results are fairly accurate in the case of data taken from a bird's-eye view, which has the least errors. The other data collection configurations show significant errors, which are mainly caused by the varied traffic composition, the camera viewing angle, and the direction of traffic.
+
+
+
+
+
+ + ☆ Two in One Go: Single-stage Emotion Recognition with Decoupled + Subject-context Transformer + + +
+ Emotion recognition aims to discern the emotional state of subjects within an image, relying on subject-centric and contextual visual cues. Current approaches typically follow a two-stage pipeline: first localize subjects with off-the-shelf detectors, then perform emotion classification through the late fusion of subject and context features. However, this complicated paradigm suffers from disjoint training stages and limited interaction between fine-grained subject-context elements. To address the challenge, we present a single-stage emotion recognition approach, employing a Decoupled Subject-Context Transformer (DSCT), for simultaneous subject localization and emotion classification. Rather than compartmentalizing training stages, we jointly leverage box and emotion signals as supervision to enrich subject-centric feature learning. Furthermore, we introduce DSCT to facilitate interactions between fine-grained subject-context cues in a decouple-then-fuse manner. The decoupled query tokens -- subject queries and context queries -- gradually intertwine across layers within DSCT, during which spatial and semantic relations are exploited and aggregated. We evaluate our single-stage framework on two widely used context-aware emotion recognition datasets, CAER-S and EMOTIC. Our approach surpasses two-stage alternatives with fewer parameters, achieving a 3.39% accuracy improvement and a 6.46% average precision gain on the CAER-S and EMOTIC datasets, respectively.
+
+
+
+
+
+ + ☆ Self-supervised visual learning in the low-data regime: a comparative + evaluation + + +
+ Self-Supervised Learning (SSL) is a valuable and robust training methodology for contemporary Deep Neural Networks (DNNs), enabling unsupervised pretraining on a `pretext task' that does not require ground-truth labels/annotation. This allows efficient representation learning from massive amounts of unlabeled training data, which in turn leads to increased accuracy in a `downstream task' by exploiting supervised transfer learning. Despite the relatively straightforward conceptualization and applicability of SSL, it is not always feasible to collect and/or to utilize very large pretraining datasets, especially when it comes to real-world application settings. In particular, in cases of specialized and domain-specific application scenarios, it may not be achievable or practical to assemble a relevant image pretraining dataset on the order of millions of instances, or it could be computationally infeasible to pretrain at this scale. This motivates an investigation into the effectiveness of common SSL pretext tasks when the pretraining dataset is of relatively limited/constrained size. In this context, this work introduces a taxonomy of modern visual SSL methods, accompanied by detailed explanations and insights regarding the main categories of approaches, and, subsequently, conducts a thorough comparative experimental evaluation in the low-data regime, aiming to identify: a) what is learnt via low-data SSL pretraining, and b) how different SSL categories behave in such training scenarios. Interestingly, for domain-specific downstream tasks, in-domain low-data SSL pretraining outperforms the common approach of large-scale pretraining on general datasets. Grounded in the obtained results, valuable insights are highlighted regarding the performance of each category of SSL methods, which in turn suggest straightforward future research directions in the field.
+
+
+
+
+
+ + ☆ Few-shot Calligraphy Style Learning + + +
+ We introduced "Presidifussion," a novel approach to learning and replicating +the unique style of calligraphy of President Xu, using a pretrained diffusion +model adapted through a two-stage training process. Initially, our model is +pretrained on a diverse dataset containing works from various calligraphers. +This is followed by fine-tuning on a smaller, specialized dataset of President +Xu's calligraphy, comprising just under 200 images. Our method introduces +innovative techniques of font image conditioning and stroke information +conditioning, enabling the model to capture the intricate structural elements +of Chinese characters. The effectiveness of our approach is demonstrated +through a comparison with traditional methods like zi2zi and CalliGAN, with our +model achieving comparable performance using significantly smaller datasets and +reduced computational resources. This work not only presents a breakthrough in +the digital preservation of calligraphic art but also sets a new standard for +data-efficient generative modeling in the domain of cultural heritage +digitization. + +
+
+
+
+
+ + ☆ MCSDNet: Mesoscale Convective System Detection Network via Multi-scale + Spatiotemporal Information + + +
+ The accurate detection of Mesoscale Convective Systems (MCS) is crucial for meteorological monitoring due to their potential to cause significant destruction through severe weather phenomena such as hail, thunderstorms, and heavy rainfall. However, existing methods for MCS detection mostly target single-frame detection, which considers only static characteristics and ignores the temporal evolution over the life cycle of an MCS. In this paper, we propose a novel encoder-decoder neural network for MCS detection (MCSDNet). MCSDNet has a simple architecture and is easy to extend. Different from previous models, MCSDNet targets multi-frame detection and leverages multi-scale spatiotemporal information for the detection of MCS regions in remote sensing imagery (RSI). To the best of our knowledge, it is the first work to utilize multi-scale spatiotemporal information to detect MCS regions. Firstly, we design a multi-scale spatiotemporal information module to extract multi-level semantics from different encoder levels, which enables our model to extract more detailed spatiotemporal features. Secondly, a Spatiotemporal Mix Unit (STMU) is introduced into MCSDNet to capture both intra-frame features and inter-frame correlations; it is a scalable module and can be replaced by other spatiotemporal modules, e.g., CNN, RNN, Transformer, or our proposed Dual Spatiotemporal Attention (DSTA). This means that future work on spatiotemporal modules can be easily integrated into our model. Finally, we present MCSRSI, the first publicly available dataset for multi-frame MCS detection based on visible-channel images from the FY-4A satellite. We also conduct several experiments on MCSRSI and find that our proposed MCSDNet achieves the best performance on the MCS detection task compared to other baseline methods.
+
+
+
+
+
+ + ☆ Low-Rank Knowledge Decomposition for Medical Foundation Models CVPR 2024 + + +
+ The popularity of large-scale pre-training has promoted the development of medical foundation models. However, some studies have shown that although foundation models exhibit strong general feature extraction capabilities, their performance on specific tasks is still inferior to task-specific methods. In this paper, we explore a new perspective called ``Knowledge Decomposition'' to improve the performance on specific medical tasks, which deconstructs the foundation model into multiple lightweight expert models, each dedicated to a particular task, with the goal of improving specialization while concurrently mitigating resource expenditure. To accomplish the above objective, we design a novel framework named Low-Rank Knowledge Decomposition (LoRKD), which explicitly separates gradients by incorporating low-rank expert modules and an efficient knowledge separation convolution. Extensive experimental results demonstrate that the decomposed models perform well in terms of performance and transferability, even surpassing the original foundation models.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MovieChat+: Question-aware Sparse Memory for Long Video Question + Answering + + +
+ Recently, integrating video foundation models and large language models to build a video understanding system can overcome the limitations of specific pre-defined vision tasks. Yet, existing methods either employ complex spatial-temporal modules or rely heavily on additional perception models to extract temporal features for video understanding, and they only perform well on short videos. For long videos, the computational complexity and memory costs associated with long-term temporal connections are significantly increased, posing additional challenges. Taking advantage of the Atkinson-Shiffrin memory model, with tokens in Transformers being employed as the carriers of memory in combination with our specially designed memory mechanism, we propose MovieChat to overcome these challenges. We lift pre-trained multi-modal large language models for understanding long videos without incorporating additional trainable temporal modules, employing a zero-shot approach. MovieChat achieves state-of-the-art performance in long video understanding, along with the released MovieChat-1K benchmark with 1K long videos, 2K temporal grounding labels, and 14K manual annotations for validation of the effectiveness of our method. The code along with the dataset can be accessed at https://github.com/rese1f/MovieChat.
+
+
+
+
+
+ + ☆ Exploring Beyond Logits: Hierarchical Dynamic Labeling Based on + Embeddings for Semi-Supervised Classification + + +
+ In semi-supervised learning, methods that rely on confidence learning to +generate pseudo-labels have been widely proposed. However, increasing research +finds that when faced with noisy and biased data, the model's representation +network is more reliable than the classification network. Additionally, label +generation methods based on model predictions often show poor adaptability +across different datasets, necessitating customization of the classification +network. Therefore, we propose a Hierarchical Dynamic Labeling (HDL) algorithm +that does not depend on model predictions and utilizes image embeddings to +generate sample labels. We also introduce an adaptive method for selecting +hyperparameters in HDL, enhancing its versatility. Moreover, HDL can be +combined with general image encoders (e.g., CLIP) to serve as a fundamental +data processing module. We extract embeddings from datasets with class-balanced +and long-tailed distributions using pre-trained semi-supervised models. +Subsequently, samples are re-labeled using HDL, and the re-labeled samples are +used to further train the semi-supervised models. Experiments demonstrate +improved model performance, validating the motivation that representation +networks are more reliable than classifiers or predictors. Our approach has the +potential to change the paradigm of pseudo-label generation in semi-supervised +learning. + +
+
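+ <p>
+ The abstract above describes generating labels from embeddings rather than classifier logits. As a heavily simplified stand-in, the sketch below assigns each unlabeled sample the majority label of its k nearest labeled neighbours in an embedding space (for example CLIP features); the actual HDL algorithm is hierarchical and selects its hyperparameters adaptively.
+ </p>
+ <pre><code>
+# A stand-in for embedding-based pseudo-labeling: majority vote over the
+# k nearest labeled neighbours in a shared embedding space.
+import numpy as np
+
+def knn_pseudo_labels(unlabeled_emb, labeled_emb, labels, k=5):
+    # cosine similarity via L2-normalized dot products
+    u = unlabeled_emb / np.linalg.norm(unlabeled_emb, axis=1, keepdims=True)
+    lab = labeled_emb / np.linalg.norm(labeled_emb, axis=1, keepdims=True)
+    sims = u @ lab.T                                 # (n_unlabeled, n_labeled)
+    nn_idx = np.argsort(-sims, axis=1)[:, :k]
+    nn_labels = labels[nn_idx]                       # (n_unlabeled, k)
+    return np.array([np.bincount(row).argmax() for row in nn_labels])
+
+rng = np.random.default_rng(0)
+labeled = rng.normal(size=(20, 512))                 # toy labeled embeddings
+labels = rng.integers(0, 3, size=20)
+unlabeled = rng.normal(size=(5, 512))
+print(knn_pseudo_labels(unlabeled, labeled, labels))
+</code></pre>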
+
+
+
+ + ☆ S-IQA Image Quality Assessment With Compressive Sampling + + +
+ No-Reference Image Quality Assessment (NR-IQA) aims at estimating image quality in accordance with subjective human perception. However, most existing NR-IQA methods focus on exploring increasingly complex networks or components to improve the final performance. Such practice imposes great limitations and complexity on IQA methods, especially when they are applied to high-resolution (HR) images in the real world. Actually, most images have high spatial redundancy, especially HR data. To exploit this characteristic and alleviate the issue above, we propose a new framework for Image Quality Assessment with compressive Sampling (dubbed S-IQA), which consists of three components: (1) The Flexible Sampling Module (FSM) samples the image to obtain measurements at an arbitrary ratio. (2) A Vision Transformer with the Adaptive Embedding Module (AEM) makes measurements of uniform size and extracts deep features. (3) A Dual Branch (DB) allocates a weight to every patch and predicts the final quality score. Experiments show that our proposed S-IQA achieves state-of-the-art results on various datasets with less data usage.
+
+
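+ <p>
+ To illustrate the compressive-sampling idea behind the Flexible Sampling Module, the sketch below projects flattened image patches through a random measurement matrix at an arbitrary ratio. The patch size, ratio, and random matrix are stand-ins; S-IQA's FSM and AEM are learned modules.
+ </p>
+ <pre><code>
+# Compressive sampling of patches: each flattened patch is projected by a
+# measurement matrix, so downstream modules see far fewer values than pixels.
+import torch
+
+def sample_patches(image, patch=32, ratio=0.25):
+    b, c, h, w = image.shape
+    patches = image.unfold(2, patch, patch).unfold(3, patch, patch)
+    patches = patches.contiguous().view(b, c, -1, patch * patch)   # (b, c, n, p*p)
+    m = max(1, int(ratio * patch * patch))
+    phi = torch.randn(m, patch * patch) / patch                    # random measurement matrix
+    return patches @ phi.T                                         # (b, c, n, m)
+
+x = torch.randn(2, 3, 256, 256)
+print(sample_patches(x, ratio=0.1).shape)       # ~10% of the pixel values per patch
+</code></pre>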
+
+
+
+ + ☆ Phase-aggregated Dual-branch Network for Efficient Fingerprint Dense + Registration + + +
+ Fingerprint dense registration aims to finely align fingerprint pairs at the pixel level, thereby reducing intra-class differences caused by distortion. Unfortunately, traditional methods exhibit subpar performance when dealing with low-quality fingerprints while suffering from slow inference speed. Although deep learning-based approaches show significant improvements in these aspects, their registration accuracy is still unsatisfactory. In this paper, we propose a Phase-aggregated Dual-branch Registration Network (PDRNet) to aggregate the advantages of both types of methods. A dual-branch structure with multi-stage interactions is introduced between correlation information at high resolution and texture features at low resolution, to perceive local fine differences while ensuring global stability. Extensive experiments are conducted on more comprehensive databases compared to previous works. Experimental results demonstrate that our method achieves state-of-the-art registration performance in terms of accuracy and robustness, while maintaining considerable competitiveness in efficiency.
+
+
+
+
+
+ + ☆ CSCO: Connectivity Search of Convolutional Operators CVPR + + +
+ Exploring dense connectivity of convolutional operators establishes critical +"synapses" to communicate feature vectors from different levels and enriches +the set of transformations on Computer Vision applications. Yet, even with +heavy-machinery approaches such as Neural Architecture Search (NAS), +discovering effective connectivity patterns requires tremendous efforts due to +either constrained connectivity design space or a sub-optimal exploration +process induced by an unconstrained search space. In this paper, we propose +CSCO, a novel paradigm that fabricates effective connectivity of convolutional +operators with minimal utilization of existing design motifs and further +utilizes the discovered wiring to construct high-performing ConvNets. CSCO +guides the exploration via a neural predictor as a surrogate of the +ground-truth performance. We introduce Graph Isomorphism as data augmentation +to improve sample efficiency and propose a Metropolis-Hastings Evolutionary +Search (MH-ES) to evade locally optimal architectures and advance search +quality. Results on ImageNet show ~0.6% performance improvement over +hand-crafted and NAS-crafted dense connectivity. Our code is publicly +available. + +
+
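+ <p>
+ The Metropolis-Hastings ingredient of MH-ES can be illustrated independently of the paper's search space: worse candidates are still accepted with a probability that decays with how much worse their (predicted) score is, which helps escape locally optimal connectivity patterns. The toy architecture encoding, mutate() routine, and scoring function below are placeholders, not the paper's neural predictor or design space.
+ </p>
+ <pre><code>
+# Metropolis-Hastings style acceptance for a toy architecture search.
+import math
+import random
+
+def mutate(arch):
+    # toy "architecture": a bit-vector of operator connections
+    i = random.randrange(len(arch))
+    return arch[:i] + [1 - arch[i]] + arch[i + 1:]
+
+def predicted_score(arch):
+    return sum(arch) / len(arch)        # placeholder for a learned predictor
+
+def mh_search(steps=200, n_edges=16, temperature=0.05):
+    current = [random.randint(0, 1) for _ in range(n_edges)]
+    best = current
+    for _ in range(steps):
+        candidate = mutate(current)
+        delta = predicted_score(candidate) - predicted_score(current)
+        # always accept improvements, accept regressions with prob exp(delta / T)
+        if delta >= 0 or random.random() < math.exp(delta / temperature):
+            current = candidate
+            if predicted_score(current) > predicted_score(best):
+                best = current
+    return best
+
+print(mh_search())
+</code></pre>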
+ comment: To appear on Proceedings of the IEEE/CVF Conference on Computer + Vision and Pattern Recognition (CVPR) Workshops (2024) +
+
+
+
+
+ + ☆ MorphText: Deep Morphology Regularized Arbitrary-shape Scene Text + Detection + + +
+ Bottom-up text detection methods play an important role in arbitrary-shape +scene text detection but there are two restrictions preventing them from +achieving their great potential, i.e., 1) the accumulation of false text +segment detections, which affects subsequent processing, and 2) the difficulty +of building reliable connections between text segments. Targeting these two +problems, we propose a novel approach, named ``MorphText", to capture the +regularity of texts by embedding deep morphology for arbitrary-shape text +detection. Towards this end, two deep morphological modules are designed to +regularize text segments and determine the linkage between them. First, a Deep +Morphological Opening (DMOP) module is constructed to remove false text segment +detections generated in the feature extraction process. Then, a Deep +Morphological Closing (DMCL) module is proposed to allow text instances of +various shapes to stretch their morphology along their most significant +orientation while deriving their connections. Extensive experiments conducted +on four challenging benchmark datasets (CTW1500, Total-Text, MSRA-TD500 and +ICDAR2017) demonstrate that our proposed MorphText outperforms both top-down +and bottom-up state-of-the-art arbitrary-shape scene text detection approaches. + +
+
+ comment: Accepted by Transaction on Multimedia +
+
+
+
+
+ + ☆ Pose-Specific 3D Fingerprint Unfolding + + +
+ In order to make 3D fingerprints compatible with traditional 2D flat +fingerprints, a common practice is to unfold the 3D fingerprint into a 2D +rolled fingerprint, which is then matched with the flat fingerprints by +traditional 2D fingerprint recognition algorithms. The problem with this method +is that there may be large elastic deformation between the unfolded rolled +fingerprint and flat fingerprint, which affects the recognition rate. In this +paper, we propose a pose-specific 3D fingerprint unfolding algorithm to unfold +the 3D fingerprint using the same pose as the flat fingerprint. Our experiments +show that the proposed unfolding algorithm improves the compatibility between +3D fingerprint and flat fingerprint and thus leads to higher genuine matching +scores. + +
+
+
+
+
+ + ☆ Direct Regression of Distortion Field from a Single Fingerprint Image + + +
+ Skin distortion is a long-standing challenge in fingerprint matching, which causes false non-matches. Previous studies have shown that the recognition rate can be improved by estimating the distortion field from a distorted fingerprint and then rectifying it into a normal fingerprint. However, existing rectification methods are based on a principal component representation of distortion fields, which is not accurate and is very sensitive to finger pose. In this paper, we propose a rectification method where a self-reference based network is utilized to directly estimate the dense distortion field of a distorted fingerprint instead of its low-dimensional representation. This method can output accurate distortion fields of distorted fingerprints with various finger poses. Considering the limited number and variety of distorted fingerprints in the existing public dataset, we collected more distorted fingerprints with diverse finger poses and distortion patterns as a new database. Experimental results demonstrate that our proposed method achieves state-of-the-art rectification performance in terms of distortion field estimation and rectified fingerprint matching.
+
+
+
+
+
+ + ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception (CP) is essential to enhance the efficiency and safety of future transportation systems, requiring extensive data sharing among vehicles on the road, which raises significant privacy concerns. Federated learning offers a promising solution by enabling data privacy-preserving collaborative enhancements in perception, decision-making, and planning among connected and autonomous vehicles (CAVs). However, federated learning is impeded by significant challenges arising from data heterogeneity across diverse clients, potentially diminishing model accuracy and prolonging convergence periods. This study introduces a specialized federated learning framework for CP, termed the federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a dynamic adjusting loss (DALoss) function. This framework employs dynamic client weighting to direct model convergence and integrates a novel loss function that utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental effects of non-independently and identically distributed (Non-IID) and unbalanced data. Utilizing the BEV transformer as the primary model, our rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, demonstrates significant improvements in the average intersection over union (IoU). These results highlight the substantial potential of our federated learning framework to address data heterogeneity challenges in CP, thereby enhancing the accuracy of environmental perception models and facilitating more robust and efficient collaborative learning solutions in the transportation sector.
+
+
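+ <p>
+ As a rough illustration of dynamic client weighting and a KL-divergence penalty against non-IID data, the sketch below derives aggregation weights from client losses and computes KL(p||q) between a local and a reference class distribution. The actual FedDWA weighting rule and DALoss formulation are defined in the paper; the loss-based softmax weights here are a stand-in.
+ </p>
+ <pre><code>
+# Server-side dynamic weighted aggregation plus a KL-divergence penalty.
+import torch
+import torch.nn.functional as F
+
+def dynamic_weighted_aggregate(client_states, client_losses):
+    # lower client loss -> larger aggregation weight (softmax over negative losses)
+    w = torch.softmax(-torch.tensor(client_losses, dtype=torch.float32), dim=0)
+    return {key: sum(wi * state[key].float() for wi, state in zip(w, client_states))
+            for key in client_states[0]}
+
+def kld_penalty(local_class_hist, reference_class_hist):
+    p = F.normalize(local_class_hist.float(), p=1, dim=0).clamp_min(1e-8)
+    q = F.normalize(reference_class_hist.float(), p=1, dim=0).clamp_min(1e-8)
+    return torch.sum(p * (p / q).log())      # KL(p || q)
+
+# toy usage with two "clients" sharing a single-parameter model
+states = [{"w": torch.tensor([1.0])}, {"w": torch.tensor([3.0])}]
+print(dynamic_weighted_aggregate(states, client_losses=[0.9, 0.3]))
+print(kld_penalty(torch.tensor([10, 1, 1]), torch.tensor([4, 4, 4])))
+</code></pre>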
+
+
+
+ + ☆ Localization of Pallets on Shelves Using Horizontal Plane Projection of + a 360-degree Image + + +
+ In this paper, we propose a method for calculating the three-dimensional (3D) +position and orientation of a pallet placed on a shelf on the side of a +forklift truck using a 360-degree camera. By using a 360-degree camera mounted +on the forklift truck, it is possible to observe both the pallet at the side of +the forklift and one several meters ahead. However, the pallet on the obtained +image is observed with different distortion depending on its 3D position, so +that it is difficult to extract the pallet from the image. To solve this +problem, a method [1] has been proposed for detecting a pallet by projecting a +360-degree image on a vertical plane that coincides with the front of the shelf +to calculate an image similar to the image seen from the front of the shelf. At +the same time as the detection, the approximate position and orientation of the +detected pallet can be obtained, but the accuracy is not sufficient for +automatic control of the forklift truck. In this paper, we propose a method for +accurately detecting the yaw angle, which is the angle of the front surface of +the pallet in the horizontal plane, by projecting the 360-degree image on a +horizontal plane including the boundary line of the front surface of the +detected pallet. The position of the pallet is also determined by moving the +vertical plane having the detected yaw angle back and forth, and finding the +position at which the degree of coincidence between the projection image on the +vertical plane and the actual size of the front surface of the pallet is +maximized. Experiments using real images taken in a laboratory and an actual +warehouse have confirmed that the proposed method can calculate the position +and orientation of a pallet within a reasonable calculation time and with the +accuracy necessary for inserting the fork into the hole in the front of the +pallet. + +
+
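+ <p>
+ Projecting a 360-degree image onto a plane, as used above for pallet detection, can be sketched with standard equirectangular geometry: cast a ray through every pixel of the virtual planar view, convert it to longitude/latitude, and sample the panorama there. The fixed fronto-parallel plane and nearest-neighbour sampling below are assumptions for illustration; the paper uses planes aligned with the shelf and pallet front.
+ </p>
+ <pre><code>
+# Render a planar "virtual view" from an equirectangular 360-degree panorama.
+import numpy as np
+
+def project_to_plane(pano, out_hw=(240, 320), fov_deg=90.0):
+    ph, pw, _ = pano.shape
+    h, w = out_hw
+    f = 0.5 * w / np.tan(np.radians(fov_deg) / 2)          # pinhole focal length
+    ys, xs = np.mgrid[0:h, 0:w]
+    rays = np.stack([xs - w / 2, ys - h / 2, np.full_like(xs, f, dtype=float)], -1)
+    rays = rays / np.linalg.norm(rays, axis=-1, keepdims=True)
+    lon = np.arctan2(rays[..., 0], rays[..., 2])            # [-pi, pi]
+    lat = np.arcsin(np.clip(rays[..., 1], -1, 1))           # [-pi/2, pi/2]
+    u = ((lon / (2 * np.pi) + 0.5) * (pw - 1)).astype(int)  # panorama columns
+    v = ((lat / np.pi + 0.5) * (ph - 1)).astype(int)        # panorama rows
+    return pano[v, u]
+
+pano = np.random.randint(0, 255, (512, 1024, 3), dtype=np.uint8)
+print(project_to_plane(pano).shape)                         # (240, 320, 3)
+</code></pre>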
+
+
+
+ + ☆ Synthesizing Iris Images using Generative Adversarial Networks: Survey + and Comparative Analysis + + +
+ Biometric systems based on iris recognition are currently being used in +border control applications and mobile devices. However, research in iris +recognition is stymied by various factors such as limited datasets of bonafide +irides and presentation attack instruments; restricted intra-class variations; +and privacy concerns. Some of these issues can be mitigated by the use of +synthetic iris data. In this paper, we present a comprehensive review of +state-of-the-art GAN-based synthetic iris image generation techniques, +evaluating their strengths and limitations in producing realistic and useful +iris images that can be used for both training and testing iris recognition +systems and presentation attack detectors. In this regard, we first survey the +various methods that have been used for synthetic iris generation and +specifically consider generators based on StyleGAN, RaSGAN, CIT-GAN, iWarpGAN, +StarGAN, etc. We then analyze the images generated by these models for realism, +uniqueness, and biometric utility. This comprehensive analysis highlights the +pros and cons of various GANs in the context of developing robust iris matchers +and presentation attack detectors. + +
+
+
+
+
+ + ☆ Don't Look at the Camera: Achieving Perceived Eye Contact + + +
+ We consider the question of how to best achieve the perception of eye contact +when a person is captured by camera and then rendered on a 2D display. For +single subjects photographed by a camera, conventional wisdom tells us that +looking directly into the camera achieves eye contact. Through empirical user +studies, we show that it is instead preferable to {\em look just below the +camera lens}. We quantitatively assess where subjects should direct their gaze +relative to a camera lens to optimize the perception that they are making eye +contact. + +
+
+
+
+
+ + ☆ Open-Set Video-based Facial Expression Recognition with Human + Expression-sensitive Prompting + + +
+ In Video-based Facial Expression Recognition (V-FER), models are typically +trained on closed-set datasets with a fixed number of known classes. However, +these V-FER models cannot deal with unknown classes that are prevalent in +real-world scenarios. In this paper, we introduce a challenging Open-set +Video-based Facial Expression Recognition (OV-FER) task, aiming at identifying +not only known classes but also new, unknown human facial expressions not +encountered during training. While existing approaches address open-set +recognition by leveraging large-scale vision-language models like CLIP to +identify unseen classes, we argue that these methods may not adequately capture +the nuanced and subtle human expression patterns required by the OV-FER task. +To address this limitation, we propose a novel Human Expression-Sensitive +Prompting (HESP) mechanism to significantly enhance CLIP's ability to model +video-based facial expression details effectively, thereby presenting a new +CLIP-based OV-FER approach. Our proposed HESP comprises three components: 1) a +textual prompting module with learnable prompt representations to complement +the original CLIP textual prompts and enhance the textual representations of +both known and unknown emotions, 2) a visual prompting module that encodes +temporal emotional information from video frames using expression-sensitive +attention, equipping CLIP with a new visual modeling ability to extract +emotion-rich information, 3) a delicately designed open-set multi-task learning +scheme that facilitates prompt learning and encourages interactions between the +textual and visual prompting modules. Extensive experiments conducted on four +OV-FER task settings demonstrate that HESP can significantly boost CLIP's +performance (a relative improvement of 17.93% on AUROC and 106.18% on OSCR) and +outperform other state-of-the-art open-set video understanding methods by a +large margin. + +
+
+
+
+
+ + ☆ Defending Spiking Neural Networks against Adversarial Attacks through + Image Purification ECAI2024 + + +
+ Spiking Neural Networks (SNNs) aim to bridge the gap between neuroscience and +machine learning by emulating the structure of the human nervous system. +However, like convolutional neural networks, SNNs are vulnerable to adversarial +attacks. To tackle the challenge, we propose a biologically inspired +methodology to enhance the robustness of SNNs, drawing insights from the visual +masking effect and filtering theory. First, an end-to-end SNN-based image +purification model is proposed to defend against adversarial attacks, including +a noise extraction network and a non-blind denoising network. The former +network extracts noise features from noisy images, while the latter component +employs a residual U-Net structure to reconstruct high-quality noisy images and +generate clean images. Simultaneously, a multi-level firing SNN based on +Squeeze-and-Excitation Network is introduced to improve the robustness of the +classifier. Crucially, the proposed image purification network serves as a +pre-processing module, avoiding modifications to classifiers. Unlike +adversarial training, our method is highly flexible and can be seamlessly +integrated with other defense strategies. Experimental results on various +datasets demonstrate that the proposed methodology outperforms state-of-the-art +baselines in terms of defense effectiveness, training time, and resource +consumption. + +
+
+ comment: 8 pages, 5 figures, ECAI2024 under review +
+
+
+
+
+ + ☆ Generative Dataset Distillation: Balancing Global Structure and Local + Details CVPR + + +
+ In this paper, we propose a new dataset distillation method that considers +balancing global structure and local details when distilling the information +from a large dataset into a generative model. Dataset distillation has been +proposed to reduce the size of the required dataset when training models. The +conventional dataset distillation methods face the problem of long redeployment +time and poor cross-architecture performance. Moreover, previous methods +focused too much on the high-level semantic attributes between the synthetic +dataset and the original dataset while ignoring the local features such as +texture and shape. Based on the above understanding, we propose a new method +for distilling the original image dataset into a generative model. Our method +involves using a conditional generative adversarial network to generate the +distilled dataset. Subsequently, we ensure balancing global structure and local +details in the distillation process, continuously optimizing the generator for +more information-dense dataset generation. + +
+
+ comment: Accepted by the 1st CVPR Workshop on Dataset Distillation +
+
+
+
+
+ + ☆ Lessons from Deploying CropFollow++: Under-Canopy Agricultural + Navigation with Keypoints ICRA + + +
+ We present a vision-based navigation system for under-canopy agricultural robots using semantic keypoints. Autonomous under-canopy navigation is challenging due to the tight spacing between the crop rows ($\sim 0.75$ m), degradation in RTK-GPS accuracy due to multipath error, and noise in LiDAR measurements from the excessive clutter. Our system, CropFollow++, introduces a modular and interpretable perception architecture with a learned semantic keypoint representation. We deployed CropFollow++ in multiple under-canopy cover crop planting robots at a large scale (25 km in total) in various field conditions, and we discuss the key lessons learned from this.
+
+
+ comment: Accepted to the IEEE ICRA Workshop on Field Robotics 2024 +
+
+
+
+
+ + ☆ SPLICE -- Streamlining Digital Pathology Image Processing + + +
+ Digital pathology and the integration of artificial intelligence (AI) models +have revolutionized histopathology, opening new opportunities. With the +increasing availability of Whole Slide Images (WSIs), there's a growing demand +for efficient retrieval, processing, and analysis of relevant images from vast +biomedical archives. However, processing WSIs presents challenges due to their +large size and content complexity. Full computer digestion of WSIs is +impractical, and processing all patches individually is prohibitively +expensive. In this paper, we propose an unsupervised patching algorithm, +Sequential Patching Lattice for Image Classification and Enquiry (SPLICE). This +novel approach condenses a histopathology WSI into a compact set of +representative patches, forming a "collage" of WSI while minimizing redundancy. +SPLICE prioritizes patch quality and uniqueness by sequentially analyzing a WSI +and selecting non-redundant representative features. We evaluated SPLICE for +search and match applications, demonstrating improved accuracy, reduced +computation time, and storage requirements compared to existing +state-of-the-art methods. As an unsupervised method, SPLICE effectively reduces +storage requirements for representing tissue images by 50%. This reduction +enables numerous algorithms in computational pathology to operate much more +efficiently, paving the way for accelerated adoption of digital pathology. + +
+
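+ <p>
+ A minimal version of sequential, unsupervised patch selection is sketched below: patch features are visited in order and kept only if they are sufficiently dissimilar from everything already kept, yielding a compact "collage" of the slide. The cosine-similarity threshold and random features are placeholders; SPLICE's actual quality and redundancy criteria are more elaborate.
+ </p>
+ <pre><code>
+# Sequential selection of non-redundant representative patches.
+import numpy as np
+
+def select_representatives(features, sim_threshold=0.9):
+    kept = []
+    for i, f in enumerate(features):
+        f = f / np.linalg.norm(f)
+        # keep the patch only if it is not too similar to any kept patch
+        if all(float(f @ g) < sim_threshold for _, g in kept):
+            kept.append((i, f))
+    return [i for i, _ in kept]          # indices of retained patches
+
+feats = np.random.default_rng(0).normal(size=(200, 128))   # e.g. patch embeddings
+print(len(select_representatives(feats)), "of 200 patches kept")
+</code></pre>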
+ comment: Under review for publication +
+
+
+
+
+ + ☆ Deep Learning for Melt Pool Depth Contour Prediction From Surface + Thermal Images via Vision Transformers + + +
+ Insufficient overlap between the melt pools produced during Laser Powder Bed +Fusion (L-PBF) can lead to lack-of-fusion defects and deteriorated mechanical +and fatigue performance. In-situ monitoring of the melt pool subsurface +morphology requires specialized equipment that may not be readily accessible or +scalable. Therefore, we introduce a machine learning framework to correlate +in-situ two-color thermal images observed via high-speed color imaging to the +two-dimensional profile of the melt pool cross-section. Specifically, we employ +a hybrid CNN-Transformer architecture to establish a correlation between single +bead off-axis thermal image sequences and melt pool cross-section contours +measured via optical microscopy. In this architecture, a ResNet model embeds +the spatial information contained within the thermal images to a latent vector, +while a Transformer model correlates the sequence of embedded vectors to +extract temporal information. Our framework is able to model the curvature of +the subsurface melt pool structure, with improved performance in high energy +density regimes compared to analytical melt pool models. The performance of +this model is evaluated through dimensional and geometric comparisons to the +corresponding experimental melt pool observations. + +
+
+
+
+
+ + ☆ Enhancing Track Management Systems with Vehicle-To-Vehicle Enabled + Sensor Fusion + + +
+ In the rapidly advancing landscape of connected and automated vehicles (CAV), +the integration of Vehicle-to-Everything (V2X) communication in traditional +fusion systems presents a promising avenue for enhancing vehicle perception. +Addressing current limitations with vehicle sensing, this paper proposes a +novel Vehicle-to-Vehicle (V2V) enabled track management system that leverages +the synergy between V2V signals and detections from radar and camera sensors. +The core innovation lies in the creation of independent priority track lists, +consisting of fused detections validated through V2V communication. This +approach enables more flexible and resilient thresholds for track management, +particularly in scenarios with numerous occlusions where the tracked objects +move outside the field of view of the perception sensors. The proposed system +considers the implications of falsification of V2X signals which is combated +through an initial vehicle identification process using detection from +perception sensors. Presented are the fusion algorithm, simulated environments, +and validation mechanisms. Experimental results demonstrate the improved +accuracy and robustness of the proposed system in common driving scenarios, +highlighting its potential to advance the reliability and efficiency of +autonomous vehicles. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ BlenderAlchemy: Editing 3D Graphics with Vision-Language Models + + +
+ Graphics design is important for various applications, including movie +production and game design. To create a high-quality scene, designers usually +need to spend hours in software like Blender, in which they might need to +interleave and repeat operations, such as connecting material nodes, hundreds +of times. Moreover, slightly different design goals may require completely +different sequences, making automation difficult. In this paper, we propose a +system that leverages Vision-Language Models (VLMs), like GPT-4V, to +intelligently search the design action space to arrive at an answer that can +satisfy a user's intent. Specifically, we design a vision-based edit generator +and state evaluator to work together to find the correct sequence of actions to +achieve the goal. Inspired by the role of visual imagination in the human +design process, we supplement the visual reasoning capabilities of VLMs with +"imagined" reference images from image-generation models, providing visual +grounding of abstract language descriptions. In this paper, we provide +empirical evidence suggesting our system can produce simple but tedious Blender +editing sequences for tasks such as editing procedural materials from text +and/or reference images, as well as adjusting lighting configurations for +product renderings in complex scenes. + +
+
+
+
+
+ + ☆ Federated Learning for Blind Image Super-Resolution + + +
+ Traditional blind image SR methods need to model real-world degradations +precisely. Consequently, current research struggles with this dilemma by +assuming idealized degradations, which leads to limited applicability to actual +user data. Moreover, the ideal scenario - training models on data from the +targeted user base - presents significant privacy concerns. To address both +challenges, we propose to fuse image SR with federated learning, allowing +real-world degradations to be directly learned from users without invading +their privacy. Furthermore, it enables optimization across many devices without +data centralization. As this fusion is underexplored, we introduce new +benchmarks specifically designed to evaluate new SR methods in this federated +setting. By doing so, we employ known degradation modeling techniques from SR +research. However, rather than aiming to mirror real degradations, our +benchmarks use these degradation models to simulate the variety of degradations +found across clients within a distributed user base. This distinction is +crucial as it circumvents the need to precisely model real-world degradations, +which limits contemporary blind image SR research. Our proposed benchmarks +investigate blind image SR under new aspects, namely differently distributed +degradation types among users and varying user numbers. We believe new methods +tested within these benchmarks will perform more similarly in an application, +as the simulated scenario addresses the variety while federated learning +enables the training on actual degradations. + +
+
+
+
+
+ + ☆ Hard ASH: Sparsity and the right optimizer make a continual learner ICLR 2024 + + +
+ In class incremental learning, neural networks typically suffer from +catastrophic forgetting. We show that an MLP featuring a sparse activation +function and an adaptive learning rate optimizer can compete with established +regularization techniques in the Split-MNIST task. We highlight the +effectiveness of the Adaptive SwisH (ASH) activation function in this context +and introduce a novel variant, Hard Adaptive SwisH (Hard ASH) to further +enhance the learning retention. + +
+
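+ <p>
+ The exact ASH and Hard ASH definitions are given in the paper; the sketch below only combines the two ingredients named in the abstract, a Swish-like gate with a learnable parameter and hard top-k sparsification, to show how such an activation slots into a Split-MNIST-style MLP. Treat the formula as an illustrative assumption, not the paper's activation.
+ </p>
+ <pre><code>
+# Illustrative sparse, adaptive activation (NOT the paper's exact ASH/Hard ASH).
+import torch
+import torch.nn as nn
+
+class SparseAdaptiveSwish(nn.Module):
+    def __init__(self, k: int = 32):
+        super().__init__()
+        self.beta = nn.Parameter(torch.ones(1))   # learnable gate sharpness
+        self.k = k
+
+    def forward(self, x):
+        y = x * torch.sigmoid(self.beta * x)      # Swish with learnable beta
+        thresh = torch.topk(y, self.k, dim=-1).values[..., -1:]   # k-th largest value
+        return torch.where(y >= thresh, y, torch.zeros_like(y))   # hard sparsity
+
+mlp = nn.Sequential(nn.Linear(784, 256), SparseAdaptiveSwish(k=32), nn.Linear(256, 10))
+print(mlp(torch.randn(4, 784)).shape)
+</code></pre>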
+ comment: ICLR 2024 TinyPaper +
+
+
+
+
+ + ☆ FashionSD-X: Multimodal Fashion Garment Synthesis using Latent Diffusion + + +
+ The rapid evolution of the fashion industry increasingly intersects with +technological advancements, particularly through the integration of generative +AI. This study introduces a novel generative pipeline designed to transform the +fashion design process by employing latent diffusion models. Utilizing +ControlNet and LoRA fine-tuning, our approach generates high-quality images +from multimodal inputs such as text and sketches. We leverage and enhance +state-of-the-art virtual try-on datasets, including Multimodal Dress Code and +VITON-HD, by integrating sketch data. Our evaluation, utilizing metrics like +FID, CLIP Score, and KID, demonstrates that our model significantly outperforms +traditional stable diffusion models. The results not only highlight the +effectiveness of our model in generating fashion-appropriate outputs but also +underscore the potential of diffusion models in revolutionizing fashion design +workflows. This research paves the way for more interactive, personalized, and +technologically enriched methodologies in fashion design and representation, +bridging the gap between creative vision and practical application. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Attention-aware non-rigid image registration for accelerated MR imaging + + +
+ Accurate motion estimation at high acceleration factors enables rapid +motion-compensated reconstruction in Magnetic Resonance Imaging (MRI) without +compromising the diagnostic image quality. In this work, we introduce an +attention-aware deep learning-based framework that can perform non-rigid +pairwise registration for fully sampled and accelerated MRI. We extract local +visual representations to build similarity maps between the registered image +pairs at multiple resolution levels and additionally leverage long-range +contextual information using a transformer-based module to alleviate +ambiguities in the presence of artifacts caused by undersampling. We combine +local and global dependencies to perform simultaneous coarse and fine motion +estimation. The proposed method was evaluated on in-house acquired fully +sampled and accelerated data of 101 patients and 62 healthy subjects undergoing +cardiac and thoracic MRI. The impact of motion estimation accuracy on the +downstream task of motion-compensated reconstruction was analyzed. We +demonstrate that our model derives reliable and consistent motion fields across +different sampling trajectories (Cartesian and radial) and acceleration factors +of up to 16x for cardiac motion and 30x for respiratory motion and achieves +superior image quality in motion-compensated reconstruction qualitatively and +quantitatively compared to conventional and recent deep learning-based +approaches. The code is publicly available at +https://github.com/lab-midas/GMARAFT. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Neural Modes: Self-supervised Learning of Nonlinear Modal Subspaces CVPR 2024 + + +
+ We propose a self-supervised approach for learning physics-based subspaces +for real-time simulation. Existing learning-based methods construct subspaces +by approximating pre-defined simulation data in a purely geometric way. +However, this approach tends to produce high-energy configurations, leads to +entangled latent space dimensions, and generalizes poorly beyond the training +set. To overcome these limitations, we propose a self-supervised approach that +directly minimizes the system's mechanical energy during training. We show that +our method leads to learned subspaces that reflect physical equilibrium +constraints, resolve overfitting issues of previous methods, and offer +interpretable latent space parameters. + +
+
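+ <p>
+ The self-supervised recipe above can be illustrated with a toy energy: instead of fitting pre-computed simulation data, the subspace decoder is trained by minimizing a mechanical energy of its own output. The 1D spring-chain energy and network sizes below are stand-ins for the paper's elastic energy and architecture.
+ </p>
+ <pre><code>
+# Train a latent-to-deformation decoder by minimizing a toy spring energy.
+import torch
+import torch.nn as nn
+
+n_verts, latent_dim = 100, 8
+rest = torch.linspace(0, 1, n_verts).unsqueeze(1)          # 1D rest positions
+
+def spring_energy(pos, stiffness=100.0):
+    rest_len = rest[1:] - rest[:-1]
+    cur_len = pos[:, 1:] - pos[:, :-1]
+    return 0.5 * stiffness * ((cur_len - rest_len) ** 2).sum(dim=(1, 2)).mean()
+
+decoder = nn.Sequential(nn.Linear(latent_dim, 64), nn.ELU(), nn.Linear(64, n_verts))
+opt = torch.optim.Adam(decoder.parameters(), lr=1e-3)
+
+for step in range(200):                                    # self-supervised loop
+    z = torch.randn(32, latent_dim)                        # sample the subspace
+    pos = rest.T + decoder(z)                              # rest shape + learned offset
+    loss = spring_energy(pos.unsqueeze(-1))                # no ground-truth data needed
+    opt.zero_grad(); loss.backward(); opt.step()
+
+print(float(loss))
+</code></pre>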
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Beyond Traditional Threats: A Persistent Backdoor Attack on Federated + Learning + + +
+ Backdoors on federated learning will be diluted by subsequent benign updates. This is reflected in the significant reduction of the attack success rate as iterations increase, ultimately failing. We use a new metric to quantify the degree of this weakened backdoor effect, called attack persistence. Given that research to improve this performance has not been widely noted, we propose a Full Combination Backdoor Attack (FCBA) method. It aggregates more combined trigger information for a more complete backdoor pattern in the global model. The trained backdoored global model is more resilient to benign updates, leading to a higher attack success rate on the test set. We test on three datasets and evaluate with two models across various settings. FCBA's persistence outperforms SOTA federated learning backdoor attacks. On GTSRB, 120 rounds post-attack, our attack success rate rose by over 50% from the baseline. The core code of our method is available at https://github.com/PhD-TaoLiu/FCBA.
+
+
+
+
+
+ + ☆ Regression of Dense Distortion Field from a Single Fingerprint Image + + +
+ Skin distortion is a long-standing challenge in fingerprint matching, which causes false non-matches. Previous studies have shown that the recognition rate can be improved by estimating the distortion field from a distorted fingerprint and then rectifying it into a normal fingerprint. However, existing rectification methods are based on a principal component representation of distortion fields, which is not accurate and is very sensitive to finger pose. In this paper, we propose a rectification method where a self-reference based network is utilized to directly estimate the dense distortion field of a distorted fingerprint instead of its low-dimensional representation. This method can output accurate distortion fields of distorted fingerprints with various finger poses and distortion patterns. We conducted experiments on FVC2004 DB1\_A, the expanded Tsinghua Distorted Fingerprint database (with additional distorted fingerprints in diverse finger poses and distortion patterns) and a latent fingerprint database. Experimental results demonstrate that our proposed method achieves state-of-the-art rectification performance in terms of distortion field estimation and rectified fingerprint matching.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2404.17148 +
+
+
+
+
+ + ♻ ☆ Learning to Visually Connect Actions and their Effects + + +
+ In this work, we introduce the novel concept of visually Connecting Actions +and Their Effects (CATE) in video understanding. CATE can have applications in +areas like task planning and learning from demonstration. We identify and +explore two different aspects of the concept of CATE: Action Selection and +Effect-Affinity Assessment, where video understanding models connect actions +and effects at semantic and fine-grained levels, respectively. We observe that +different formulations produce representations capturing intuitive action +properties. We also design various baseline models for Action Selection and +Effect-Affinity Assessment. Despite the intuitive nature of the task, we +observe that models struggle, and humans outperform them by a large margin. The +study aims to establish a foundation for future efforts, showcasing the +flexibility and versatility of connecting actions and effects in video +understanding, with the hope of inspiring advanced formulations and models. + +
+
+
+
+
+ + ♻ ☆ Overload: Latency Attacks on Object Detection for Edge Devices + + +
+ Nowadays, the deployment of deep learning-based applications is an essential task owing to the increasing demands on intelligent services. In this paper, we investigate latency attacks on deep learning applications. Unlike common adversarial attacks for misclassification, the goal of latency attacks is to increase the inference time, which may stop applications from responding to requests within a reasonable time. This kind of attack is applicable to various applications, and we use object detection to demonstrate how such attacks work. We also design a framework named Overload to generate latency attacks at scale. Our method is based on a newly formulated optimization problem and a novel technique, called spatial attention. This attack serves to escalate the required computing costs during inference, consequently leading to an extended inference time for object detection. It presents a significant threat, especially to systems with limited computing resources. We conducted experiments using YOLOv5 models on an Nvidia NX. Compared to existing methods, our method is simpler and more effective. The experimental results show that with latency attacks, the inference time of a single image can be increased to ten times longer than in the normal setting. Moreover, our findings pose a potential new threat to all object detection tasks requiring non-maximum suppression (NMS), as our attack is NMS-agnostic.
+
+
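+ <p>
+ A benign way to see why NMS-based detectors are exposed to latency attacks is to time non-maximum suppression as the number of surviving candidate boxes grows; perturbations that inflate that number inflate inference time. The snippet below only benchmarks torchvision's NMS on synthetic boxes and performs no attack.
+ </p>
+ <pre><code>
+# Measure how NMS cost scales with the number of candidate boxes.
+import time
+import torch
+from torchvision.ops import nms
+
+def nms_time(num_boxes, iters=20):
+    boxes = torch.rand(num_boxes, 4) * 640
+    boxes[:, 2:] = boxes[:, :2] + torch.rand(num_boxes, 2) * 64 + 1  # valid x1y1x2y2
+    scores = torch.rand(num_boxes)
+    start = time.perf_counter()
+    for _ in range(iters):
+        nms(boxes, scores, iou_threshold=0.45)
+    return (time.perf_counter() - start) / iters
+
+for n in (100, 1000, 10000):
+    print(f"{n:>6} candidate boxes: {nms_time(n) * 1e3:.2f} ms per NMS call")
+</code></pre>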
+
+
+
+ + ♻ ☆ CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and + Radiology Reports for Full-Body Scenarios + + +
+ Medical Vision-Language Pretraining (Med-VLP) establishes a connection +between visual content from medical images and the relevant textual +descriptions. Existing Med-VLP methods primarily focus on 2D images depicting a +single body part, notably chest X-rays. In this paper, we extend the scope of +Med-VLP to encompass 3D images, specifically targeting full-body scenarios, by +using a multimodal dataset of CT images and reports. Compared with the 2D +counterpart, 3D VLP is required to effectively capture essential semantics from +significantly sparser representation in 3D imaging. In this paper, we introduce +CT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method +that constructs organ-level image-text pairs to enhance multimodal contrastive +learning, aligning grounded visual features with precise diagnostic text. +Additionally, we developed an abnormality dictionary to augment contrastive +learning with diverse contrastive pairs. Our method, trained on a multimodal CT +dataset comprising 44,011 organ-level vision-text pairs from 17,702 patients +across 104 organs, demonstrates it can identify organs and abnormalities in a +zero-shot manner using natural languages. The performance of CT-GLIP is +validated on a separate test set of 1,130 patients, focusing on the 16 most +frequent abnormalities across 7 organs. The experimental results show our +model's superior performance over the standard CLIP framework across zero-shot +and fine-tuning scenarios, using both CNN and ViT architectures. + +
+
+ comment: 12 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ ChemScraper: Leveraging PDF Graphics Instructions for Molecular Diagram + Parsing + + +
+ Most molecular diagram parsers recover chemical structure from raster images +(e.g., PNGs). However, many PDFs include commands giving explicit locations and +shapes for characters, lines, and polygons. We present a new parser that uses +these born-digital PDF primitives as input. The parsing model is fast and +accurate, and does not require GPUs, Optical Character Recognition (OCR), or +vectorization. We use the parser to annotate raster images and then train a new +multi-task neural network for recognizing molecules in raster images. We +evaluate our parsers using SMILES and standard benchmarks, along with a novel +evaluation protocol comparing molecular graphs directly that supports automatic +error compilation and reveals errors missed by SMILES-based evaluation. + +
+
+ comment: 20 pages without references, 12 figures, 4 Tables, submitted to + International Journal on Document Analysis and Recognition (IJDAR) +
+
+
+
+
+ + ♻ ☆ MAIRA-1: A specialised large multimodal model for radiology report + generation + + +
+ We present a radiology-specific multimodal model for the task of generating radiological reports from chest X-rays (CXRs). Our work builds on the idea that large language models can be equipped with multimodal capabilities through alignment with pre-trained vision encoders. On natural images, this has been shown to allow multimodal models to gain image understanding and description capabilities. Our proposed model (MAIRA-1) leverages a CXR-specific image encoder in conjunction with a fine-tuned large language model based on Vicuna-7B, and text-based data augmentation, to produce reports with state-of-the-art quality. In particular, MAIRA-1 significantly improves on the radiologist-aligned RadCliQ metric and across all lexical metrics considered. Manual review of model outputs demonstrates promising fluency and accuracy of generated reports, while uncovering failure modes not captured by existing evaluation practices. More information and resources can be found on the project website: https://aka.ms/maira. + +
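Architectures of this kind usually connect the image encoder to the language model through a small projection module that maps visual tokens into the LLM's embedding space. The adapter below is a hedged sketch of that general pattern; the dimensions and MLP design are assumptions, not the published MAIRA-1 architecture.

```python
import torch.nn as nn

class VisionToLLMAdapter(nn.Module):
    """Project image-encoder tokens (B, N, d_vis) into the LLM embedding space (B, N, d_llm)."""
    def __init__(self, d_vis=1024, d_llm=4096, hidden=2048):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(d_vis, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_llm),
        )

    def forward(self, vis_tokens):
        # The projected tokens are concatenated with the text-prompt embeddings
        # before being fed to the (frozen or fine-tuned) language model.
        return self.proj(vis_tokens)
```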
+
+ comment: 18 pages, 9 tables, 5 figures. v2 adds test IDs and image encoder + citation. v3 fixes error in NPV/specificity +
+
+
+
+
+ + ♻ ☆ DeepClean: Machine Unlearning on the Cheap by Resetting Privacy + Sensitive Weights using the Fisher Diagonal + + +
+ Machine learning models trained on sensitive or private data can +inadvertently memorize and leak that information. Machine unlearning seeks to +retroactively remove such details from model weights to protect privacy. We +contribute a lightweight unlearning algorithm that leverages the Fisher +Information Matrix (FIM) for selective forgetting. Prior work in this area +requires full retraining or large matrix inversions, which are computationally +expensive. Our key insight is that the diagonal elements of the FIM, which +measure the sensitivity of log-likelihood to changes in weights, contain +sufficient information for effective forgetting. Specifically, we compute the +FIM diagonal over two subsets -- the data to retain and forget -- for all +trainable weights. This diagonal representation approximates the complete FIM +while dramatically reducing computation. We then use it to selectively update +weights to maximize forgetting of the sensitive subset while minimizing impact +on the retained subset. Experiments show that our algorithm can successfully +forget any randomly selected subsets of training data across neural network +architectures. By leveraging the FIM diagonal, our approach provides an +interpretable, lightweight, and efficient solution for machine unlearning with +practical privacy benefits. + +
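The quantities involved are straightforward to sketch: the FIM diagonal is coarsely approximated by the mean squared gradient of the loss, computed separately on the retain and forget subsets, and weights that matter much more for the forget set than for the retain set are then perturbed. The selective update rule below is an illustrative choice under those assumptions, not necessarily the authors' exact procedure.

```python
import torch

def fisher_diagonal(model, data_loader, loss_fn, device="cpu"):
    """Coarse approximation of diag(FIM): mean squared gradient of the loss per parameter."""
    fim = {n: torch.zeros_like(p) for n, p in model.named_parameters() if p.requires_grad}
    n_batches = 0
    for x, y in data_loader:
        model.zero_grad()
        loss = loss_fn(model(x.to(device)), y.to(device))
        loss.backward()
        for n, p in model.named_parameters():
            if n in fim and p.grad is not None:
                fim[n] += p.grad.detach() ** 2
        n_batches += 1
    return {n: v / max(n_batches, 1) for n, v in fim.items()}

def forget_sensitive_weights(model, fim_forget, fim_retain, quantile=0.99, noise_std=1e-2):
    """Perturb weights that are important for the forget set but not for the retain set."""
    with torch.no_grad():
        for n, p in model.named_parameters():
            if n not in fim_forget:
                continue
            ratio = fim_forget[n] / (fim_retain[n] + 1e-8)
            mask = ratio > torch.quantile(ratio.flatten(), quantile)
            p[mask] += noise_std * torch.randn_like(p)[mask]
```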
+
+
+
+
+ + ♻ ☆ Probing Conceptual Understanding of Large Visual-Language Models CVPR + + +
+ In recent years large visual-language (V+L) models have achieved great +success in various downstream tasks. However, it is not well studied whether +these models have a conceptual grasp of the visual content. In this work we +focus on conceptual understanding of these large V+L models. To facilitate this +study, we propose novel benchmarking datasets for probing three different +aspects of content understanding, 1) \textit{relations}, 2) +\textit{composition}, and 3) \textit{context}. Our probes are grounded in +cognitive science and help determine if a V+L model can, for example, determine +if snow garnished with a man is implausible, or if it can identify beach +furniture by knowing it is located on a beach. We experimented with many recent +state-of-the-art V+L models and observe that these models mostly \textit{fail +to demonstrate} a conceptual understanding. This study reveals several +interesting insights such as that \textit{cross-attention} helps learning +conceptual understanding, and that CNNs are better with \textit{texture and +patterns}, while Transformers are better at \textit{color and shape}. We +further utilize some of these insights and investigate a \textit{simple +finetuning technique} that rewards the three conceptual understanding measures +with promising initial results. The proposed benchmarks will drive the +community to delve deeper into conceptual understanding and foster advancements +in the capabilities of large V+L models. The code and dataset is available at: +\url{https://tinyurl.com/vlm-robustness} + +
+
+ comment: All code and dataset is available at: + https://tinyurl.com/vlm-robustness. Accepted in CVPRW 2024 +
+
+
+
+
+ + ♻ ☆ Boosting Defect Detection in Manufacturing using Tensor Convolutional + Neural Networks + + +
+ Defect detection is one of the most important yet challenging tasks in the quality control stage of the manufacturing sector. In this work, we introduce a Tensor Convolutional Neural Network (T-CNN) and examine its performance on a real defect detection application involving one of the components of the ultrasonic sensors produced at Robert Bosch's manufacturing plants. Our quantum-inspired T-CNN operates on a reduced model parameter space to substantially improve the training speed and performance of an equivalent CNN model without sacrificing accuracy. More specifically, we demonstrate how T-CNNs are able to reach the same performance as classical CNNs, as measured by quality metrics, with up to fifteen times fewer parameters and 4% to 19% faster training times. Our results also demonstrate that the T-CNN greatly outperforms traditional human visual inspection, providing value in a current, real manufacturing application. + +
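A tensorized convolution typically replaces a dense Conv2d kernel with a low-rank factorization; one common CP-style choice (1x1 compression, depthwise spatial convolution, 1x1 expansion) is sketched below for intuition. The exact tensor-network format used by the T-CNN in the paper may differ.

```python
import torch.nn as nn

class CPFactorizedConv2d(nn.Module):
    """Rank-R CP-style replacement for Conv2d(in_ch, out_ch, k): 1x1 -> depthwise kxk -> 1x1."""
    def __init__(self, in_ch, out_ch, kernel_size, rank, stride=1, padding=0):
        super().__init__()
        self.compress = nn.Conv2d(in_ch, rank, kernel_size=1, bias=False)
        self.depthwise = nn.Conv2d(rank, rank, kernel_size, stride=stride,
                                   padding=padding, groups=rank, bias=False)
        self.expand = nn.Conv2d(rank, out_ch, kernel_size=1, bias=True)

    def forward(self, x):
        # Parameter count drops from in_ch*out_ch*k*k to in_ch*R + R*k*k + R*out_ch.
        return self.expand(self.depthwise(self.compress(x)))
```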
+
+ comment: 12 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific + Boundaries for Domain Adaptation CVPR 2024 + + +
+ Most domain adaptation (DA) methods are based on either convolutional neural networks (CNNs) or vision transformers (ViTs). They use these architectures as encoders to align the distribution differences between domains, without considering their unique characteristics. For instance, ViT excels in accuracy due to its superior ability to capture global representations, while CNN has an advantage in capturing local representations. This fact has led us to design a hybrid method, called Explicitly Class-specific Boundaries (ECB), that takes full advantage of both ViT and CNN. ECB learns CNN on ViT to combine their distinct strengths. In particular, we leverage ViT's properties to explicitly find class-specific decision boundaries by maximizing the discrepancy between the outputs of two classifiers, in order to detect target samples far from the source support. In contrast, the CNN encoder clusters target features based on the previously defined class-specific boundaries by minimizing the discrepancy between the probabilities of the two classifiers. Finally, ViT and CNN mutually exchange knowledge to improve the quality of pseudo labels and reduce the knowledge discrepancies between these models. Compared to conventional DA methods, our ECB achieves superior performance, which verifies its effectiveness in this hybrid model. The project website can be found at https://dotrannhattuong.github.io/ECB/website. + +
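The max/min game described above follows the classifier-discrepancy template: the two classifier heads are pushed apart on target data to expose samples outside the source support, and the feature encoder is then updated to pull them back together. A generic sketch of that discrepancy term (not the authors' exact formulation):

```python
import torch.nn.functional as F

def classifier_discrepancy(logits_a, logits_b):
    """Mean L1 distance between the two classifiers' predicted distributions."""
    return (F.softmax(logits_a, dim=1) - F.softmax(logits_b, dim=1)).abs().mean()

# Training alternates between:
#   (1) maximizing the discrepancy w.r.t. the two classifier heads on target data
#       (to draw explicit class-specific boundaries), and
#   (2) minimizing it w.r.t. the (CNN) feature encoder
#       (to cluster target features inside those boundaries).
```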
+
+ comment: Project page: https://dotrannhattuong.github.io/ECB/website, Accepted + to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FENet: Focusing Enhanced Network for Lane Detection + + +
+ Inspired by human driving focus, this research pioneers networks augmented +with Focusing Sampling, Partial Field of View Evaluation, Enhanced FPN +architecture and Directional IoU Loss - targeted innovations addressing +obstacles to precise lane detection for autonomous driving. Experiments +demonstrate our Focusing Sampling strategy, emphasizing vital distant details +unlike uniform approaches, significantly boosts both benchmark and practical +curved/distant lane recognition accuracy essential for safety. While FENetV1 +achieves state-of-the-art conventional metric performance via enhancements +isolating perspective-aware contexts mimicking driver vision, FENetV2 proves +most reliable on the proposed Partial Field analysis. Hence we specifically +recommend V2 for practical lane navigation despite fractional degradation on +standard entire-image measures. Future directions include collecting on-road +data and integrating complementary dual frameworks to further breakthroughs +guided by human perception principles. The Code is available at +https://github.com/HanyangZhong/FENet. + +
+
+ comment: 12 pages including appendix. The Code is available at + https://github.com/HanyangZhong/FENet +
+
+
+
+
+ + ♻ ☆ Audio-Visual Person Verification based on Recursive Fusion of Joint + Cross-Attention + + +
+ Person or identity verification has been recently gaining a lot of attention +using audio-visual fusion as faces and voices share close associations with +each other. Conventional approaches based on audio-visual fusion rely on +score-level or early feature-level fusion techniques. Though existing +approaches showed improvement over unimodal systems, the potential of +audio-visual fusion for person verification is not fully exploited. In this +paper, we have investigated the prospect of effectively capturing both the +intra- and inter-modal relationships across audio and visual modalities, which +can play a crucial role in significantly improving the fusion performance over +unimodal systems. In particular, we introduce a recursive fusion of a joint +cross-attentional model, where a joint audio-visual feature representation is +employed in the cross-attention framework in a recursive fashion to +progressively refine the feature representations that can efficiently capture +the intra-and inter-modal relationships. To further enhance the audio-visual +feature representations, we have also explored BLSTMs to improve the temporal +modeling of audio-visual feature representations. Extensive experiments are +conducted on the Voxceleb1 dataset to evaluate the proposed model. Results +indicate that the proposed model shows promising improvement in fusion +performance by adeptly capturing the intra-and inter-modal relationships across +audio and visual modalities. + +
+
+ comment: Accepted to FG2024 +
+
+
+
+
+ + ♻ ☆ Conditional Variational Diffusion Models + + +
+ Inverse problems aim to determine parameters from observations, a crucial +task in engineering and science. Lately, generative models, especially +diffusion models, have gained popularity in this area for their ability to +produce realistic solutions and their good mathematical properties. Despite +their success, an important drawback of diffusion models is their sensitivity +to the choice of variance schedule, which controls the dynamics of the +diffusion process. Fine-tuning this schedule for specific applications is +crucial but time-costly and does not guarantee an optimal result. We propose a +novel approach for learning the schedule as part of the training process. Our +method supports probabilistic conditioning on data, provides high-quality +solutions, and is flexible, proving able to adapt to different applications +with minimum overhead. This approach is tested in two unrelated inverse +problems: super-resolution microscopy and quantitative phase imaging, yielding +comparable or superior results to previous methods and fine-tuned diffusion +models. We conclude that fine-tuning the schedule by experimentation should be +avoided because it can be learned during training in a stable way that yields +better results. + +
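One way to make the schedule learnable is to parameterize a monotonic scalar function of time with a small network whose monotonicity is enforced through positive weights, and train it jointly with the denoiser. The sketch below illustrates only that ingredient; the paper's conditional, application-specific formulation is more involved, and the module and variable names are assumptions.

```python
import torch
import torch.nn as nn

class MonotonicSchedule(nn.Module):
    """gamma(t): a learned, monotonically increasing scalar function of t in [0, 1]."""
    def __init__(self, hidden=64):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn(hidden, 1) * 0.1)
        self.b1 = nn.Parameter(torch.zeros(hidden))
        self.w2 = nn.Parameter(torch.randn(1, hidden) * 0.1)
        self.b2 = nn.Parameter(torch.zeros(1))

    def forward(self, t):                                     # t: (B, 1)
        h = torch.sigmoid(t @ self.w1.abs().t() + self.b1)    # positive weights -> monotone
        return h @ self.w2.abs().t() + self.b2                # gamma(t), non-decreasing in t

# One common convention then sets the forward-process noise level via
# alpha_bar(t) = sigmoid(-gamma(t)), with gamma's parameters trained jointly
# with the denoiser instead of being hand-tuned.
```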
+
+ comment: Denoising Diffusion Probabilistic Models, Inverse Problems, + Generative Models, Super Resolution, Phase Quantification, Variational + Methods +
+
+
+
+
+ + ♻ ☆ Narrative Action Evaluation with Prompt-Guided Multimodal Interaction CVPR 2024 + + +
+ In this paper, we investigate a new problem called narrative action +evaluation (NAE). NAE aims to generate professional commentary that evaluates +the execution of an action. Unlike traditional tasks such as score-based action +quality assessment and video captioning involving superficial sentences, NAE +focuses on creating detailed narratives in natural language. These narratives +provide intricate descriptions of actions along with objective evaluations. NAE +is a more challenging task because it requires both narrative flexibility and +evaluation rigor. One existing possible solution is to use multi-task learning, +where narrative language and evaluative information are predicted separately. +However, this approach results in reduced performance for individual tasks +because of variations between tasks and differences in modality between +language information and evaluation information. To address this, we propose a +prompt-guided multimodal interaction framework. This framework utilizes a pair +of transformers to facilitate the interaction between different modalities of +information. It also uses prompts to transform the score regression task into a +video-text matching task, thus enabling task interactivity. To support further +research in this field, we re-annotate the MTL-AQA and FineGym datasets with +high-quality and comprehensive action narration. Additionally, we establish +benchmarks for NAE. Extensive experiment results prove that our method +outperforms separate learning methods and naive multi-task learning methods. +Data and code are released at https://github.com/shiyi-zh0408/NAE_CVPR2024. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Image Clustering via the Principle of Rate Reduction in the Age of + Pretrained Models + + +
+ The advent of large pre-trained models has brought about a paradigm shift in both visual representation learning and natural language processing. However, clustering unlabeled images, as a fundamental and classic machine learning problem, still lacks an effective solution, particularly for large-scale datasets. In this paper, we propose a novel image clustering pipeline that leverages the powerful feature representations of large pre-trained models such as CLIP to cluster images effectively and efficiently at scale. We first develop a novel algorithm to estimate the number of clusters in a given dataset. We then show that the pre-trained features become significantly more structured by further optimizing the rate reduction objective. The resulting features can significantly improve clustering accuracy, e.g., from 57\% to 66\% on ImageNet-1k. Furthermore, by leveraging CLIP's multimodal bridge between image and text, we develop a simple yet effective self-labeling algorithm that produces meaningful captions for the clusters. Through extensive experiments, we show that our pipeline works well on standard datasets such as CIFAR-10, CIFAR-100, and ImageNet-1k. It also extends to datasets that are not curated for clustering, such as LAION-Aesthetics and WikiArts. We release the code at https://github.com/LeslieTrue/CPP. + +
+
+ comment: 23 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as full paper at FG 2024 main track +
+
+
+
+
+ + ♻ ☆ Joint covariance properties under geometric image transformations for + spatio-temporal receptive fields according to the generalized Gaussian + derivative model for visual receptive fields + + +
+ The influence of natural image transformations on receptive field responses +is crucial for modelling visual operations in computer vision and biological +vision. In this regard, covariance properties with respect to geometric image +transformations in the earliest layers of the visual hierarchy are essential +for expressing robust image operations, and for formulating invariant visual +operations at higher levels. + This paper defines and proves a set of joint covariance properties under +compositions of spatial scaling transformations, spatial affine +transformations, Galilean transformations and temporal scaling transformations, +which make it possible to characterize how different types of image +transformations interact with each other and the associated spatio-temporal +receptive field responses. In this regard, we also extend the notion of +scale-normalized derivatives to affine-normalized derivatives, to be able to +obtain true affine-covariant properties of spatial derivatives, that are +computed based on spatial smoothing with affine Gaussian kernels. + The derived relations show how the parameters of the receptive fields need to +be transformed, in order to match the output from spatio-temporal receptive +fields under composed spatio-temporal image transformations. As a side effect, +the presented proof for the joint covariance property over the integrated +combination of the different geometric image transformations also provides +specific proofs for the individual transformation properties, which have not +previously been fully reported in the literature. + The paper also presents an in-depth theoretical analysis of geometric +interpretations of the derived covariance properties, as well as outlines a +number of biological interpretations of these results. + +
+
+ comment: 38 pages, 13 figures. Note: From version 4, this paper considers a + different form of joint composition of the geometric image transformations + than in the earlier versions +
+
+
+
+
+ + ♻ ☆ LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial + Expression Recognition + + +
+ Semi-supervised learning has emerged as a promising approach to tackle the +challenge of label scarcity in facial expression recognition (FER) task. +However, current state-of-the-art methods primarily focus on one side of the +coin, i.e., generating high-quality pseudo-labels, while overlooking the other +side: enhancing expression-relevant representations. In this paper, we unveil +both sides of the coin by proposing a unified framework termed hierarchicaL +dEcoupling And Fusing (LEAF) to coordinate expression-relevant representations +and pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical +expression-aware aggregation strategy that operates at three levels: semantic, +instance, and category. (1) At the semantic and instance levels, LEAF decouples +representations into expression-agnostic and expression-relevant components, +and adaptively fuses them using learnable gating weights. (2) At the category +level, LEAF assigns ambiguous pseudo-labels by decoupling predictions into +positive and negative parts, and employs a consistency loss to ensure agreement +between two augmented views of the same image. Extensive experiments on +benchmark datasets demonstrate that by unveiling and harmonizing both sides of +the coin, LEAF outperforms state-of-the-art semi-supervised FER methods, +effectively leveraging both labeled and unlabeled data. Moreover, the proposed +expression-aware aggregation strategy can be seamlessly integrated into +existing semi-supervised frameworks, leading to significant performance gains. +Our code is available at https://anonymous.4open.science/r/LEAF-BC57/. + +
+
+
+
+
+ + ♻ ☆ MEIA: Towards Realistic Multimodal Interaction and Manipulation for + Embodied Robots + + +
+ With the surge in the development of large language models, embodied +intelligence has attracted increasing attention. Nevertheless, prior works on +embodied intelligence typically encode scene or historical memory in an +unimodal manner, either visual or linguistic, which complicates the alignment +of the model's action planning with embodied control. To overcome this +limitation, we introduce the Multimodal Embodied Interactive Agent (MEIA), +capable of translating high-level tasks expressed in natural language into a +sequence of executable actions. Specifically, we propose a novel Multimodal +Environment Memory (MEM) module, facilitating the integration of embodied +control with large models through the visual-language memory of scenes. This +capability enables MEIA to generate executable action plans based on diverse +requirements and the robot's capabilities. Furthermore, we construct an +embodied question answering dataset based on a dynamic virtual cafe environment +with the help of the large language model. In this virtual environment, we +conduct several experiments, utilizing multiple large models through zero-shot +learning, and carefully design scenarios for various situations. The +experimental results showcase the promising performance of our MEIA in various +embodied interactive tasks. + +
+
+ comment: Codes will be available at https://github.com/HCPLab-SYSU/CausalVLR +
+
+
+
+
+ + ♻ ☆ Gait Recognition in Large-scale Free Environment via Single LiDAR + + +
+ Human gait recognition is crucial in multimedia, enabling identification +through walking patterns without direct interaction, enhancing the integration +across various media forms in real-world applications like smart homes, +healthcare and non-intrusive security. LiDAR's ability to capture depth makes +it pivotal for robotic perception and holds promise for real-world gait +recognition. In this paper, based on a single LiDAR, we present the +Hierarchical Multi-representation Feature Interaction Network (HMRNet) for +robust gait recognition. Prevailing LiDAR-based gait datasets primarily derive +from controlled settings with predefined trajectory, remaining a gap with +real-world scenarios. To facilitate LiDAR-based gait recognition research, we +introduce FreeGait, a comprehensive gait dataset from large-scale, +unconstrained settings, enriched with multi-modal and varied 2D/3D data. +Notably, our approach achieves state-of-the-art performance on prior dataset +(SUSTech1K) and on FreeGait. Code and dataset will be released upon publication +of this paper. + +
+
+
+
+
+ + ♻ ☆ SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical + Instrument Segmentation + + +
+ SEGSRNet addresses the challenge of precisely identifying surgical instruments in low-resolution stereo endoscopic images, a common issue in medical imaging and robotic surgery. Our framework enhances image clarity and segmentation accuracy by applying state-of-the-art super-resolution techniques before segmentation, ensuring higher-quality inputs for more precise segmentation. SEGSRNet combines advanced feature extraction and attention mechanisms with spatial processing to sharpen image details, which is significant for accurate tool identification in medical images. Our proposed model outperforms current models on metrics including Dice, IoU, PSNR, and SSIM, producing clearer and more accurate images for stereo endoscopic surgical imaging. By providing both higher image resolution and precise segmentation, SEGSRNet can significantly enhance surgical accuracy and patient care outcomes. + +
+
+ comment: Paper accepted for Presentation in 46th Annual International + Conference of the IEEE Engineering in Medicine and Biology Society (EMBS), + Orlando, Florida, USA (Camera Ready Version) +
+
+
+
+
+ + ♻ ☆ Benchmarking the Fairness of Image Upsampling Methods + + +
+ Recent years have witnessed a rapid development of deep generative models for +creating synthetic media, such as images and videos. While the practical +applications of these models in everyday tasks are enticing, it is crucial to +assess the inherent risks regarding their fairness. In this work, we introduce +a comprehensive framework for benchmarking the performance and fairness of +conditional generative models. We develop a set of +metrics$\unicode{x2013}$inspired by their supervised fairness +counterparts$\unicode{x2013}$to evaluate the models on their fairness and +diversity. Focusing on the specific application of image upsampling, we create +a benchmark covering a wide variety of modern upsampling methods. As part of +the benchmark, we introduce UnfairFace, a subset of FairFace that replicates +the racial distribution of common large-scale face datasets. Our empirical +study highlights the importance of using an unbiased training set and reveals +variations in how the algorithms respond to dataset imbalances. Alarmingly, we +find that none of the considered methods produces statistically fair and +diverse results. All experiments can be reproduced using our provided +repository. + +
+
+ comment: This is the author's version of the work. It is posted here for your + personal use. Not for redistribution. The definitive Version of Record was + published at the 2024 ACM Conference on Fairness, Accountability, and + Transparency (FAccT '24) +
+
+
+
+
+ + ♻ ☆ Structural-Based Uncertainty in Deep Learning Across Anatomical Scales: + Analysis in White Matter Lesion Segmentation + + +
+ This paper explores uncertainty quantification (UQ) as an indicator of the +trustworthiness of automated deep-learning (DL) tools in the context of white +matter lesion (WML) segmentation from magnetic resonance imaging (MRI) scans of +multiple sclerosis (MS) patients. Our study focuses on two principal aspects of +uncertainty in structured output segmentation tasks. Firstly, we postulate that +a good uncertainty measure should indicate predictions likely to be incorrect +with high uncertainty values. Second, we investigate the merit of quantifying +uncertainty at different anatomical scales (voxel, lesion, or patient). We +hypothesize that uncertainty at each scale is related to specific types of +errors. Our study aims to confirm this relationship by conducting separate +analyses for in-domain and out-of-domain settings. Our primary methodological +contributions are (i) the development of novel measures for quantifying +uncertainty at lesion and patient scales, derived from structural prediction +discrepancies, and (ii) the extension of an error retention curve analysis +framework to facilitate the evaluation of UQ performance at both lesion and +patient scales. The results from a multi-centric MRI dataset of 334 patients +demonstrate that our proposed measures more effectively capture model errors at +the lesion and patient scales compared to measures that average voxel-scale +uncertainty values. We provide the UQ protocols code at +https://github.com/Medical-Image-Analysis-Laboratory/MS_WML_uncs. + +
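The error retention curve analysis mentioned above can be summarized as: rank predictions by uncertainty, progressively let an oracle correct the most uncertain ones, and track how the average error falls. A small, scale-agnostic sketch (lesion- or patient-level errors and uncertainties would simply be the inputs; this is not the authors' exact protocol):

```python
import numpy as np

def error_retention_curve(errors, uncertainties, n_points=50):
    """errors[i]: error of prediction i; uncertainties[i]: its uncertainty score.
    Returns retention fractions and the mean error when the most uncertain
    (1 - fraction) of predictions are replaced by perfect (zero-error) ones."""
    errors = np.asarray(errors, dtype=float)
    order = np.argsort(-np.asarray(uncertainties))       # most uncertain first
    fractions = np.linspace(0.0, 1.0, n_points)
    curve = []
    for f in fractions:
        corrected = errors.copy()
        n_fix = int(round((1.0 - f) * len(errors)))
        corrected[order[:n_fix]] = 0.0                   # oracle fixes the most uncertain
        curve.append(corrected.mean())
    return fractions, np.array(curve)

# The area under this curve (e.g., np.trapz(curve, fractions)) is lower when
# uncertainty ranks errors well, which is how UQ quality can be compared across scales.
```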
+
+ comment: Preprint submitted to the journal +
+
+
+
+
+ + ♻ ☆ mEBAL2 Database and Benchmark: Image-based Multispectral Eyeblink + Detection + + +
+ This work introduces a new multispectral database and novel approaches for +eyeblink detection in RGB and Near-Infrared (NIR) individual images. Our +contributed dataset (mEBAL2, multimodal Eye Blink and Attention Level +estimation, Version 2) is the largest existing eyeblink database, representing +a great opportunity to improve data-driven multispectral approaches for blink +detection and related applications (e.g., attention level estimation and +presentation attack detection in face biometrics). mEBAL2 includes 21,100 image +sequences from 180 different students (more than 2 million labeled images in +total) while conducting a number of e-learning tasks of varying difficulty or +taking a real course on HTML initiation through the edX MOOC platform. mEBAL2 +uses multiple sensors, including two Near-Infrared (NIR) and one RGB camera to +capture facial gestures during the execution of the tasks, as well as an +Electroencephalogram (EEG) band to get the cognitive activity of the user and +blinking events. Furthermore, this work proposes a Convolutional Neural Network +architecture as benchmark for blink detection on mEBAL2 with performances up to +97%. Different training methodologies are implemented using the RGB spectrum, +NIR spectrum, and the combination of both to enhance the performance on +existing eyeblink detectors. We demonstrate that combining NIR and RGB images +during training improves the performance of RGB eyeblink detectors (i.e., +detection based only on a RGB image). Finally, the generalization capacity of +the proposed eyeblink detectors is validated in wilder and more challenging +environments like the HUST-LEBW dataset to show the usefulness of mEBAL2 to +train a new generation of data-driven approaches for eyeblink detection. + +
+
+ comment: Published in the journal Pattern Recognition Letters in June 2024. + Accessible from + https://www.sciencedirect.com/science/article/pii/S0167865524001120?via%3Dihub +
+
+
+
+
+ + ♻ ☆ The LuViRA Dataset: Synchronized Vision, Radio, and Audio Sensors for + Indoor Localization ICRA 2024 + + +
+ We present a synchronized multisensory dataset for accurate and robust indoor +localization: the Lund University Vision, Radio, and Audio (LuViRA) Dataset. +The dataset includes color images, corresponding depth maps, inertial +measurement unit (IMU) readings, channel response between a 5G massive +multiple-input and multiple-output (MIMO) testbed and user equipment, audio +recorded by 12 microphones, and accurate six degrees of freedom (6DOF) pose +ground truth of 0.5 mm. We synchronize these sensors to ensure that all data is +recorded simultaneously. A camera, speaker, and transmit antenna are placed on +top of a slowly moving service robot, and 89 trajectories are recorded. Each +trajectory includes 20 to 50 seconds of recorded sensor data and ground truth +labels. Data from different sensors can be used separately or jointly to +perform localization tasks, and data from the motion capture (mocap) system is +used to verify the results obtained by the localization algorithms. The main +aim of this dataset is to enable research on sensor fusion with the most +commonly used sensors for localization tasks. Moreover, the full dataset or +some parts of it can also be used for other research areas such as channel +estimation, image classification, etc. Our dataset is available at: +https://github.com/ilaydayaman/LuViRA_Dataset + +
+
+ comment: 7 pages, 7 figures, Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ LLIC: Large Receptive Field Transform Coding with Adaptive Weights for + Learned Image Compression + + +
+ The effective receptive field (ERF) plays an important role in transform coding: it determines how much redundancy can be removed during the transform and how many spatial priors can be utilized to synthesize textures during the inverse transform. Existing methods rely either on stacks of small kernels, whose ERF remains too small, or on heavy non-local attention mechanisms, which limit the potential of high-resolution image coding. To tackle this issue, we propose Large Receptive Field Transform Coding with Adaptive Weights for Learned Image Compression (LLIC). Specifically, for the first time in the learned image compression community, we introduce a few large kernel-based depth-wise convolutions to reduce more redundancy while maintaining modest complexity. Given the wide diversity of images, we further propose a mechanism to augment convolution adaptability through the self-conditioned generation of weights. The large kernels cooperate with non-linear embedding and gate mechanisms for better expressiveness and lighter point-wise interactions. Our investigation extends to refined training methods that unlock the full potential of these large kernels. Moreover, to promote more dynamic inter-channel interactions, we introduce an adaptive channel-wise bit allocation strategy that autonomously generates channel importance factors in a self-conditioned manner. To demonstrate the effectiveness of the proposed transform coding, we align the entropy model with existing transform methods for comparison and obtain the models LLIC-STF, LLIC-ELIC, and LLIC-TCM. Extensive experiments demonstrate that our LLIC models achieve significant improvements over the corresponding baselines, reducing BD-rate on Kodak by 9.49%, 9.47%, and 10.94% over VTM-17.0 Intra, respectively. Our LLIC models achieve state-of-the-art performance and better trade-offs between performance and complexity. + +
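The two ingredients highlighted above, large-kernel depthwise convolution and self-conditioned (content-adaptive) weighting, can be illustrated with a compact residual block like the one below. It is a paraphrase for intuition, not the published LLIC transform; all layer choices are assumptions.

```python
import torch.nn as nn

class AdaptiveLargeKernelBlock(nn.Module):
    """Depthwise conv with a large kernel, modulated by self-conditioned channel weights."""
    def __init__(self, channels, kernel_size=11):
        super().__init__()
        self.dw = nn.Conv2d(channels, channels, kernel_size,
                            padding=kernel_size // 2, groups=channels)
        self.pw = nn.Conv2d(channels, channels, 1)
        # Weight generator: global context -> per-channel gates (self-conditioned weights).
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, channels, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        y = self.pw(self.dw(x))
        return x + y * self.gate(x)      # gated residual update
```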
+
+ comment: major updates +
+
+
+
+
+ + ♻ ☆ RegWSI: Whole Slide Image Registration using Combined Deep Feature- and + Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge + + +
+ The automatic registration of differently stained whole slide images (WSIs) is crucial for improving diagnosis and prognosis by fusing complementary information emerging from different visible structures. It is also useful for quickly transferring annotations between consecutive or restained slides, thus significantly reducing the annotation time and associated costs. Nevertheless, the slide preparation is different for each stain, and the tissue undergoes complex and large deformations. Therefore, a robust, efficient, and accurate registration method is highly desired by the scientific community and by hospitals specializing in digital pathology. We propose a two-step hybrid method consisting of (i) a deep learning- and feature-based initial alignment algorithm, and (ii) intensity-based nonrigid registration using instance optimization. The proposed method does not require any fine-tuning to a particular dataset and can be used directly for any desired tissue type and stain. The method scored 1st place in the ACROBAT 2023 challenge. We evaluated the method using three open datasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and performed several ablation studies concerning the resolution used for registration and the robustness and stability of the initial alignment. The method achieves the most accurate results on the ACROBAT dataset and cell-level registration accuracy on the restained slides from the HyReCo dataset, and it is among the best methods evaluated on the ANHIR dataset. The method does not require any fine-tuning to new datasets and can be used out-of-the-box for other types of microscopic images. It is incorporated into the DeeperHistReg framework, allowing others to directly use it to register, transform, and save WSIs at any desired pyramid level. The proposed method is a significant contribution to WSI registration, thus advancing the field of digital pathology. + +
+
+
+
+
+ + ♻ ☆ Neural Radiance Field in Autonomous Driving: A Survey + + +
+ Neural Radiance Field (NeRF) has garnered significant attention from both +academia and industry due to its intrinsic advantages, particularly its +implicit representation and novel view synthesis capabilities. With the rapid +advancements in deep learning, a multitude of methods have emerged to explore +the potential applications of NeRF in the domain of Autonomous Driving (AD). +However, a conspicuous void is apparent within the current literature. To +bridge this gap, this paper conducts a comprehensive survey of NeRF's +applications in the context of AD. Our survey is structured to categorize +NeRF's applications in Autonomous Driving (AD), specifically encompassing +perception, 3D reconstruction, simultaneous localization and mapping (SLAM), +and simulation. We delve into in-depth analysis and summarize the findings for +each application category, and conclude by providing insights and discussions +on future directions in this field. We hope this paper serves as a +comprehensive reference for researchers in this domain. To the best of our +knowledge, this is the first survey specifically focused on the applications of +NeRF in the Autonomous Driving domain. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal + Models for Video Question Answering + + +
+ Video Question Answering (VideoQA) aims to answer natural language questions +based on the information observed in videos. Despite the recent success of +Large Multimodal Models (LMMs) in image-language understanding and reasoning, +they deal with VideoQA insufficiently, by simply taking uniformly sampled +frames as visual inputs, which ignores question-relevant visual clues. +Moreover, there are no human annotations for question-critical timestamps in +existing VideoQA datasets. In light of this, we propose a novel weakly +supervised framework to enforce the LMMs to reason out the answers with +question-critical moments as visual inputs. Specifically, we first fuse the +question and answer pairs as event descriptions to find multiple keyframes as +target moments and pseudo-labels, with the visual-language alignment capability +of the CLIP models. With these pseudo-labeled keyframes as additionally weak +supervision, we devise a lightweight Gaussian-based Contrastive Grounding (GCG) +module. GCG learns multiple Gaussian functions to characterize the temporal +structure of the video, and sample question-critical frames as positive moments +to be the visual inputs of LMMs. Extensive experiments on several benchmarks +verify the effectiveness of our framework, and we achieve substantial +improvements compared to previous state-of-the-art methods. + +
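The Gaussian-based grounding can be pictured as learning a few (center, width) pairs over the normalized video timeline and turning the resulting mixture into per-frame weights from which question-critical frames are sampled. The module below is a simplified, illustrative sketch of that idea, not the GCG implementation; the head design and parameter names are assumptions.

```python
import torch
import torch.nn as nn

class GaussianTemporalWeights(nn.Module):
    """Predict a mixture of Gaussians over T frame positions and return frame weights."""
    def __init__(self, d_model, n_gaussians=4):
        super().__init__()
        self.head = nn.Linear(d_model, 2 * n_gaussians)   # (center, log_sigma) per Gaussian
        self.n = n_gaussians

    def forward(self, query_feat, num_frames):
        params = self.head(query_feat)                    # (B, 2K)
        mu = torch.sigmoid(params[:, : self.n])           # centers in [0, 1]
        sigma = torch.exp(params[:, self.n:]).clamp(min=1e-3)
        t = torch.linspace(0, 1, num_frames, device=query_feat.device)        # (T,)
        # (B, K, T) Gaussian bumps, summed over K and normalized into frame weights.
        g = torch.exp(-0.5 * ((t[None, None, :] - mu[:, :, None]) / sigma[:, :, None]) ** 2)
        w = g.sum(dim=1)
        return w / w.sum(dim=1, keepdim=True)

# Top-weighted frames can then be sampled as the "positive moments" fed to the LMM.
```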
+
+
+
+
+ + ♻ ☆ Lightweight Regression Model with Prediction Interval Estimation for + Computer Vision-based Winter Road Surface Condition Monitoring + + +
+ Winter conditions pose several challenges for automated driving applications. +A key challenge during winter is accurate assessment of road surface condition, +as its impact on friction is a critical parameter for safely and reliably +controlling a vehicle. This paper proposes a deep learning regression model, +SIWNet, capable of estimating road surface friction properties from camera +images. SIWNet extends state of the art by including an uncertainty estimation +mechanism in the architecture. This is achieved by including an additional head +in the network, which estimates a prediction interval. The prediction interval +head is trained with a maximum likelihood loss function. The model was trained +and tested with the SeeingThroughFog dataset, which features corresponding road +friction sensor readings and images from an instrumented vehicle. Acquired +results highlight the functionality of the prediction interval estimation of +SIWNet, while the network also achieved similar point estimate accuracy as the +previous state of the art. Furthermore, the SIWNet architecture is several +times more lightweight than the previously applied state-of-the-art model, +resulting in more practical and efficient deployment. + +
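A prediction-interval head trained with a maximum-likelihood loss is commonly realized by predicting a variance alongside the point estimate and minimizing the Gaussian negative log-likelihood; an interval then follows from the predicted standard deviation. The sketch below shows that generic pattern, not the exact SIWNet head; the module and function names are illustrative.

```python
import torch
import torch.nn as nn

class FrictionHeadWithInterval(nn.Module):
    """Point estimate + variance head for a scalar target (e.g., road friction)."""
    def __init__(self, d_feat):
        super().__init__()
        self.mean = nn.Linear(d_feat, 1)
        self.log_var = nn.Linear(d_feat, 1)

    def forward(self, feats):
        mu = self.mean(feats)
        var = torch.exp(self.log_var(feats))      # keep the predicted variance positive
        return mu, var

def gaussian_nll(mu, var, target):
    return nn.functional.gaussian_nll_loss(mu, target, var)

# A ~95% prediction interval is then mu +/- 1.96 * var.sqrt().
```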
+
+ comment: Published in IEEE Transactions on Intelligent Vehicles (2024) +
+
+
+
+
+ + ♻ ☆ A Survey on Visual Mamba + + +
+ State space models (SSMs) with selection mechanisms and hardware-aware architectures, namely Mamba, have recently demonstrated significant promise in long-sequence modeling. Since the self-attention mechanism in transformers has quadratic complexity in image size and increasingly high computational demands, researchers are now exploring how to adapt Mamba for computer vision tasks. This paper is the first comprehensive survey aiming to provide an in-depth analysis of Mamba models in the field of computer vision. It begins by exploring the foundational concepts contributing to Mamba's success, including the state space model framework, selection mechanisms, and hardware-aware design. Next, we review vision Mamba models, categorizing them into foundational models and those enhanced with techniques such as convolution, recurrence, and attention to improve their sophistication. We further delve into the widespread applications of Mamba in vision tasks, including its use as a backbone at various levels of vision processing. This encompasses general visual tasks, medical visual tasks (e.g., 2D/3D segmentation, classification, and image registration), and remote sensing visual tasks. We introduce general visual tasks at two levels: high/mid-level vision (e.g., object detection, segmentation, and video classification) and low-level vision (e.g., image super-resolution, image restoration, and visual generation). We hope this endeavor will spark additional interest within the community to address current challenges and further apply Mamba models in computer vision. + +
+
+
+
+
+ + ♻ ☆ Rethinking The Uniformity Metric in Self-Supervised Learning + + +
+ Uniformity plays an important role in evaluating learned representations, +providing insights into self-supervised learning. In our quest for effective +uniformity metrics, we pinpoint four principled properties that such metrics +should possess. Namely, an effective uniformity metric should remain invariant +to instance permutations and sample replications while accurately capturing +feature redundancy and dimensional collapse. Surprisingly, we find that the +uniformity metric proposed by \citet{Wang2020UnderstandingCR} fails to satisfy +the majority of these properties. Specifically, their metric is sensitive to +sample replications, and can not account for feature redundancy and dimensional +collapse correctly. To overcome these limitations, we introduce a new +uniformity metric based on the Wasserstein distance, which satisfies all the +aforementioned properties. Integrating this new metric in existing +self-supervised learning methods effectively mitigates dimensional collapse and +consistently improves their performance on downstream tasks involving CIFAR-10 +and CIFAR-100 datasets. Code is available at +\url{https://github.com/statsle/WassersteinSSL}. + +
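A Wasserstein-based uniformity score can be computed in closed form by fitting a Gaussian to the normalized embeddings and measuring its 2-Wasserstein distance to the isotropic Gaussian N(0, I/d), which uniformly distributed points on the unit sphere resemble in high dimension. The sketch below uses that Gaussian approximation; refer to the authors' code for the exact definition used in the paper.

```python
import numpy as np
from scipy.linalg import sqrtm

def wasserstein_uniformity(embeddings):
    """embeddings: (N, d) array of L2-normalized features. Lower = more uniform."""
    n, d = embeddings.shape
    mu = embeddings.mean(axis=0)
    sigma = np.cov(embeddings, rowvar=False)
    target_sigma = np.eye(d) / d                 # covariance of ~uniform points on S^{d-1}
    # Closed-form 2-Wasserstein distance between two Gaussians.
    sqrt_t = sqrtm(target_sigma)
    cross = sqrtm(sqrt_t @ sigma @ sqrt_t)
    w2_sq = mu @ mu + np.trace(sigma + target_sigma - 2 * np.real(cross))
    return float(np.sqrt(max(w2_sq, 0.0)))
```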
+
+
+
+
+ + ♻ ☆ Image Processing and Machine Learning for Hyperspectral Unmixing: An + Overview and the HySUPP Python Package + + +
+ Spectral pixels are often a mixture of the pure spectra of the materials, +called endmembers, due to the low spatial resolution of hyperspectral sensors, +double scattering, and intimate mixtures of materials in the scenes. Unmixing +estimates the fractional abundances of the endmembers within the pixel. +Depending on the prior knowledge of endmembers, linear unmixing can be divided +into three main groups: supervised, semi-supervised, and unsupervised (blind) +linear unmixing. Advances in Image processing and machine learning +substantially affected unmixing. This paper provides an overview of advanced +and conventional unmixing approaches. Additionally, we draw a critical +comparison between advanced and conventional techniques from the three +categories. We compare the performance of the unmixing techniques on three +simulated and two real datasets. The experimental results reveal the advantages +of different unmixing categories for different unmixing scenarios. Moreover, we +provide an open-source Python-based package available at +https://github.com/BehnoodRasti/HySUPP to reproduce the results. + +
+
+ comment: IEEE Transactions on Geoscience and Remote Sensing, 2024 +
+
+
+
+
+ + ♻ ☆ Research on Splicing Image Detection Algorithms Based on Natural Image + Statistical Characteristics + + +
+ With the development and widespread application of digital image processing +technology, image splicing has become a common method of image manipulation, +raising numerous security and legal issues. This paper introduces a new +splicing image detection algorithm based on the statistical characteristics of +natural images, aimed at improving the accuracy and efficiency of splicing +image detection. By analyzing the limitations of traditional methods, we have +developed a detection framework that integrates advanced statistical analysis +techniques and machine learning methods. The algorithm has been validated using +multiple public datasets, showing high accuracy in detecting spliced edges and +locating tampered areas, as well as good robustness. Additionally, we explore +the potential applications and challenges faced by the algorithm in real-world +scenarios. This research not only provides an effective technological means for +the field of image tampering detection but also offers new ideas and methods +for future related research. + +
+
+
+
+
+ + ♻ ☆ UniMODE: Unified Monocular 3D Object Detection CVPR2024 + + +
+ Realizing unified monocular 3D object detection, including both indoor and +outdoor scenes, holds great importance in applications like robot navigation. +However, involving various scenarios of data to train models poses challenges +due to their significantly different characteristics, e.g., diverse geometry +properties and heterogeneous domain distributions. To address these challenges, +we build a detector based on the bird's-eye-view (BEV) detection paradigm, +where the explicit feature projection is beneficial to addressing the geometry +learning ambiguity when employing multiple scenarios of data to train +detectors. Then, we split the classical BEV detection architecture into two +stages and propose an uneven BEV grid design to handle the convergence +instability caused by the aforementioned challenges. Moreover, we develop a +sparse BEV feature projection strategy to reduce computational cost and a +unified domain alignment method to handle heterogeneous domains. Combining +these techniques, a unified detector UniMODE is derived, which surpasses the +previous state-of-the-art on the challenging Omni3D dataset (a large-scale +dataset including both indoor and outdoor scenes) by 4.9% AP_3D, revealing the +first successful generalization of a BEV detector to unified 3D object +detection. + +
+
+ comment: This paper has been accepted for publication in CVPR2024 +
+
+
+
+
+ + ♻ ☆ NAI$_2$: Learning Noise-Aware Illumination-Interpolator for Unsupervised + Low-Light Image Enhancement + + +
+ Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable advancements in preserving image details and enhancing contrast, achieving commendable results on specific datasets. Nevertheless, these approaches encounter persistent challenges in efficiently mitigating dynamic noise and accommodating diverse low-light scenarios. Insufficient constraints on complex pixel-wise mapping learning lead to overfitting to specific types of noise and artifacts associated with low-light conditions, reducing effectiveness in variable lighting scenarios. To this end, we first propose a method for estimating the noise level in low-light images quickly and accurately. This facilitates precise denoising, prevents over-smoothing, and adapts to dynamic noise patterns. Subsequently, we devise a Learnable Illumination Interpolator (LII), which employs learnable interpolation operations between the input and a unit vector to satisfy general constraints between the illumination and the input. Finally, we introduce a self-regularization loss that incorporates intrinsic image properties and essential visual attributes to guide the output towards meeting human visual expectations. Comprehensive experiments validate the competitiveness of our proposed algorithm in both qualitative and quantitative assessments. Notably, our noise estimation method, with linear time complexity and suitability for various denoisers, significantly improves both denoising and enhancement performance. Benefiting from this, our approach achieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT dataset for the LLIE task, even compared to supervised methods. + +
+
+ comment: Image processing, low-light image enhancement, noise estimation, + illumination learning +
+
+
+
+
+ + ♻ ☆ 3D-LFM: Lifting Foundation Model CVPR 2024 + + +
+ The lifting of 3D structure and camera from 2D landmarks is at the +cornerstone of the entire discipline of computer vision. Traditional methods +have been confined to specific rigid objects, such as those in +Perspective-n-Point (PnP) problems, but deep learning has expanded our +capability to reconstruct a wide range of object classes (e.g. C3DPO and PAUL) +with resilience to noise, occlusions, and perspective distortions. All these +techniques, however, have been limited by the fundamental need to establish +correspondences across the 3D training data -- significantly limiting their +utility to applications where one has an abundance of "in-correspondence" 3D +data. Our approach harnesses the inherent permutation equivariance of +transformers to manage varying number of points per 3D data instance, +withstands occlusions, and generalizes to unseen categories. We demonstrate +state of the art performance across 2D-3D lifting task benchmarks. Since our +approach can be trained across such a broad class of structures we refer to it +simply as a 3D Lifting Foundation Model (3D-LFM) -- the first of its kind. + +
+
+ comment: Visit the project page at https://3dlfm.github.io for links to + additional media, code, and videos. The site also features a custom GPT + tailored to address queries related to 3D-LFM. Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ InstructAny2Pix: Flexible Visual Editing via Multimodal Instruction + Following + + +
+ The ability to provide fine-grained control for generating and editing visual +imagery has profound implications for computer vision and its applications. +Previous works have explored extending controllability in two directions: +instruction tuning with text-based prompts and multi-modal conditioning. +However, these works make one or more unnatural assumptions on the number +and/or type of modality inputs used to express controllability. We propose +InstructAny2Pix, a flexible multi-modal instruction-following system that +enables users to edit an input image using instructions involving audio, +images, and text. InstructAny2Pix consists of three building blocks that +facilitate this capability: a multi-modal encoder that encodes different +modalities such as images and audio into a unified latent space, a diffusion +model that learns to decode representations in this latent space into images, +and a multi-modal LLM that can understand instructions involving multiple +images and audio pieces and generate a conditional embedding of the desired +output, which can be used by the diffusion decoder. Additionally, to facilitate +training efficiency and improve generation quality, we include an additional +refinement prior module that enhances the visual quality of LLM outputs. These +designs are critical to the performance of our system. We demonstrate that our +system can perform a series of novel instruction-guided editing tasks. The code +is available at https://github.com/jacklishufan/InstructAny2Pix.git + +
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Exploiting Topological Priors for Boosting Point Cloud Generation + + +
+ This paper presents an innovative enhancement to the Sphere as Prior +Generative Adversarial Network (SP-GAN) model, a state-of-the-art GAN designed +for point cloud generation. A novel method is introduced for point cloud +generation that elevates the structural integrity and overall quality of the +generated point clouds by incorporating topological priors into the training +process of the generator. Specifically, this work utilizes the K-means +algorithm to segment a point cloud from the repository into clusters and +extract centroids, which are then used as priors in the generation process of +the SP-GAN. Furthermore, the discriminator component of the SP-GAN utilizes the +identical point cloud that contributed the centroids, ensuring a coherent and +consistent learning environment. This strategic use of centroids as intuitive +guides not only boosts the efficiency of global feature learning but also +substantially improves the structural coherence and fidelity of the generated +point clouds. By applying the K-means algorithm to generate centroids as the +prior, the work intuitively and experimentally demonstrates that such a prior +enhances the quality of generated point clouds. + +
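Extracting the topological prior amounts to running K-means on a point cloud from the repository and keeping the centroids, which then seed the generation process in place of (or alongside) the sphere prior. A minimal sklearn-based sketch, with illustrative parameter choices:

```python
from sklearn.cluster import KMeans

def centroid_prior(point_cloud, n_clusters=64, seed=0):
    """point_cloud: (N, 3) array. Returns (n_clusters, 3) centroids used as the prior."""
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
    labels = km.fit_predict(point_cloud)
    return km.cluster_centers_, labels

# Per the abstract, the same point cloud that supplied the centroids is also shown
# to the discriminator, keeping the prior and the real sample consistent during training.
```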
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Attribute-Guided Multi-Level Attention Network for Fine-Grained Fashion + Retrieval + + +
+ Fine-grained fashion retrieval searches for items that share a similar +attribute with the query image. Most existing methods use a pre-trained feature +extractor (e.g., ResNet 50) to capture image representations. However, a +pre-trained feature backbone is typically trained for image classification and +object detection, which are fundamentally different tasks from fine-grained +fashion retrieval. Therefore, existing methods suffer from a feature gap +problem when directly using the pre-trained backbone for fine-tuning. To solve +this problem, we introduce an attribute-guided multi-level attention network +(AG-MAN). Specifically, we first enhance the pre-trained feature extractor to +capture multi-level image embedding, thereby enriching the low-level features +within these representations. Then, we propose a classification scheme where +images with the same attribute, albeit with different values, are categorized +into the same class. This can further alleviate the feature gap problem by +perturbing object-centric feature learning. Moreover, we propose an improved +attribute-guided attention module for extracting more accurate +attribute-specific representations. Our model consistently outperforms existing +attention based methods when assessed on the FashionAI (62.8788% in MAP), +DeepFashion (8.9804% in MAP), and Zappos50k datasets (93.32% in Prediction +accuracy). Especially, ours improves the most typical ASENet_V2 model by 2.12%, +0.31%, and 0.78% points in FashionAI, DeepFashion, and Zappos50k datasets, +respectively. The source code is available in +https://github.com/Dr-LingXiao/AG-MAN. + +
+
+
+
+
+ + ♻ ☆ Multi-Scale Representations by Varying Window Attention for Semantic + Segmentation ICLR2024 + + +
+ Multi-scale learning is central to semantic segmentation. We visualize the +effective receptive field (ERF) of canonical multi-scale representations and +point out two risks in learning them: scale inadequacy and field inactivation. +A novel multi-scale learner, varying window attention (VWA), is presented to +address these issues. VWA leverages the local window attention (LWA) and +disentangles LWA into the query window and context window, allowing the +context's scale to vary for the query to learn representations at multiple +scales. However, varying the context to large-scale windows (enlarging ratio R) +can significantly increase the memory footprint and computation cost (R^2 times +larger than LWA). We propose a simple but professional re-scaling strategy to +zero the extra induced cost without compromising performance. Consequently, VWA +uses the same cost as LWA to overcome the receptive limitation of the local +window. Furthermore, depending on VWA and employing various MLPs, we introduce +a multi-scale decoder (MSD), VWFormer, to improve multi-scale representations +for semantic segmentation. VWFormer achieves efficiency competitive with the +most compute-friendly MSDs, like FPN and MLP decoder, but performs much better +than any MSDs. For instance, using nearly half of UPerNet's computation, +VWFormer outperforms it by 1.0%-2.5% mIoU on ADE20K. With little extra +overhead, ~10G FLOPs, Mask2Former armed with VWFormer improves by 1.0%-1.3%. +The code and models are available at https://github.com/yan-hao-tian/vw + +
+
+ comment: ICLR2024 Poster +
+
+
+
+
+ + ♻ ☆ GazeCLIP: Towards Enhancing Gaze Estimation via Text Guidance + + +
+ Over the past decade, visual gaze estimation has garnered increasing +attention within the research community, owing to its wide-ranging application +scenarios. While existing estimation approaches have achieved remarkable +success in enhancing prediction accuracy, they primarily infer gaze from +single-image signals, neglecting the potential benefits of the currently +dominant text guidance. Notably, visual-language collaboration has been +extensively explored across various visual tasks, such as image synthesis and +manipulation, leveraging the remarkable transferability of large-scale +Contrastive Language-Image Pre-training (CLIP) model. Nevertheless, existing +gaze estimation approaches overlook the rich semantic cues conveyed by +linguistic signals and the priors embedded in CLIP feature space, thereby +yielding performance setbacks. To address this gap, we delve deeply into the +text-eye collaboration protocol and introduce a novel gaze estimation +framework, named GazeCLIP. Specifically, we intricately design a linguistic +description generator to produce text signals with coarse directional cues. +Additionally, a CLIP-based backbone that excels in characterizing text-eye +pairs for gaze estimation is presented. This is followed by the implementation +of a fine-grained multi-modal fusion module aimed at modeling the +interrelationships between heterogeneous inputs. Extensive experiments on three +challenging datasets demonstrate the superiority of the proposed GazeCLIP which +achieves the state-of-the-art accuracy. + +
+
+
+
+
+ + ♻ ☆ DAE-Net: Deforming Auto-Encoder for fine-grained shape co-segmentation SIGGRAPH 2024 + + +
+ We present an unsupervised 3D shape co-segmentation method which learns a set +of deformable part templates from a shape collection. To accommodate structural +variations in the collection, our network composes each shape by a selected +subset of template parts which are affine-transformed. To maximize the +expressive power of the part templates, we introduce a per-part deformation +network to enable the modeling of diverse parts with substantial geometry +variations, while imposing constraints on the deformation capacity to ensure +fidelity to the originally represented parts. We also propose a training scheme +to effectively overcome local minima. Architecturally, our network is a +branched autoencoder, with a CNN encoder taking a voxel shape as input and +producing per-part transformation matrices, latent codes, and part existence +scores, and the decoder outputting point occupancies to define the +reconstruction loss. Our network, coined DAE-Net for Deforming Auto-Encoder, +can achieve unsupervised 3D shape co-segmentation that yields fine-grained, +compact, and meaningful parts that are consistent across diverse shapes. We +conduct extensive experiments on the ShapeNet Part dataset, DFAUST, and an +animal subset of Objaverse to show superior performance over prior methods. +Code and data are available at https://github.com/czq142857/DAE-Net. + +
+
+ comment: SIGGRAPH 2024 conference track +
+
+
+
+
+ + ♻ ☆ Vanishing-Point-Guided Video Semantic Segmentation of Driving Scenes CVPR 2024 + + +
+ The estimation of implicit cross-frame correspondences and the high +computational cost have long been major challenges in video semantic +segmentation (VSS) for driving scenes. Prior works utilize keyframes, feature +propagation, or cross-frame attention to address these issues. By contrast, we +are the first to harness vanishing point (VP) priors for more effective +segmentation. Intuitively, objects near VPs (i.e., away from the vehicle) are +less discernible. Moreover, they tend to move radially away from the VP over +time in the usual case of a forward-facing camera, a straight road, and linear +forward motion of the vehicle. Our novel, efficient network for VSS, named +VPSeg, incorporates two modules that utilize exactly this pair of static and +dynamic VP priors: sparse-to-dense feature mining (DenseVP) and VP-guided +motion fusion (MotionVP). MotionVP employs VP-guided motion estimation to +establish explicit correspondences across frames and help attend to the most +relevant features from neighboring frames, while DenseVP enhances weak dynamic +features in distant regions around VPs. These modules operate within a +context-detail framework, which separates contextual features from +high-resolution local features at different input resolutions to reduce +computational costs. Contextual and local features are integrated through +contextualized motion attention (CMA) for the final prediction. Extensive +experiments on two popular driving segmentation benchmarks, Cityscapes and +ACDC, demonstrate that VPSeg outperforms previous SOTA methods, with only +modest computational overhead. + +
+
+ comment: CVPR 2024 highlight +
+
+
+
+
+ + ♻ ☆ Exploring Vulnerabilities of No-Reference Image Quality Assessment + Models: A Query-Based Black-Box Method + + +
+ No-Reference Image Quality Assessment (NR-IQA) aims to predict image quality scores consistent with human perception without relying on pristine reference images, serving as a crucial component in various visual tasks. Ensuring the robustness of NR-IQA methods is vital for reliable comparisons of different image processing techniques and consistent user experiences in recommendations. Attack methods provide a powerful instrument for testing this robustness. However, current attacks on NR-IQA heavily rely on the gradient of the NR-IQA model, which limits their applicability when gradient information is unavailable. In this paper, we present a pioneering query-based black-box attack against NR-IQA methods. We propose the concept of a score boundary and leverage an adaptive iterative approach with multiple score boundaries. Meanwhile, the initial attack directions are designed to exploit characteristics of the Human Visual System (HVS). Experiments show that our method outperforms all compared state-of-the-art attacks and is far ahead of previous black-box methods. The effective NR-IQA model DBCNN suffers a Spearman's rank-order correlation coefficient (SROCC) decline of 0.6381 when attacked by our method, revealing the vulnerability of NR-IQA models to black-box attacks. The proposed attack method also provides a potent tool for further exploration into NR-IQA robustness.
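+ As a rough illustration of a query-based, gradient-free score attack of the kind described above, the sketch below perturbs an image by random search until the predicted quality score drops below successive score boundaries. It is a generic simplification: the paper's adaptive boundary schedule and HVS-based initial directions are not reproduced, and `model` is a placeholder NR-IQA scorer.
```python
import torch

@torch.no_grad()
def query_attack(model, image, eps=4/255, step=1/255,
                 boundaries=(0.9, 0.8, 0.7), n_queries=2000):
    """Gradient-free random-search attack that pushes a predicted quality score
    below successive score boundaries, using only forward queries to `model`.

    model: callable mapping a (1, 3, H, W) image batch to a quality score
    image: (1, 3, H, W) tensor with values in [0, 1]
    """
    base = model(image).item()
    targets = [b * base for b in boundaries]          # successive score boundaries
    adv, cur = image.clone(), base
    for target in targets:
        for _ in range(n_queries):
            if cur <= target:                         # boundary reached, move to the next
                break
            cand = adv + step * torch.sign(torch.randn_like(adv))
            cand = torch.min(torch.max(cand, image - eps), image + eps).clamp(0, 1)
            score = model(cand).item()
            if score < cur:                           # keep only queries that lower the score
                adv, cur = cand, score
    return adv

# usage sketch with a dummy scorer standing in for a real NR-IQA model
dummy_scorer = lambda x: x.mean(dim=(1, 2, 3))
adv = query_attack(dummy_scorer, torch.rand(1, 3, 64, 64))
```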
+
+
+
+
+ + ♻ ☆ ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis + + +
+ Recent advances in generative visual models and neural radiance fields have greatly boosted 3D-aware image synthesis and stylization tasks. However, previous NeRF-based work is limited to single-scene stylization; training a model to generate 3D-aware cartoon faces with arbitrary styles remains an open problem. We propose ArtNeRF, a novel face stylization framework derived from a 3D-aware GAN, to tackle this problem. In this framework, we utilize an expressive generator to synthesize stylized faces and a triple-branch discriminator module to improve the visual quality and style consistency of the generated faces. Specifically, a style encoder based on contrastive learning is leveraged to extract robust low-dimensional embeddings of style images, empowering the generator with knowledge of various styles. To smooth the training process of cross-domain transfer learning, we propose an adaptive style blending module that helps inject style information and allows users to freely tune the level of stylization. We further introduce a neural rendering module to achieve efficient real-time rendering of higher-resolution images. Extensive experiments demonstrate that ArtNeRF is versatile in generating high-quality 3D-aware cartoon faces with arbitrary styles.
+
+
+
+
+ + ♻ ☆ Action Segmentation Using 2D Skeleton Heatmaps and Multi-Modality Fusion ICRA 2024 + + +
+ This paper presents a 2D skeleton-based action segmentation method with +applications in fine-grained human activity recognition. In contrast with +state-of-the-art methods which directly take sequences of 3D skeleton +coordinates as inputs and apply Graph Convolutional Networks (GCNs) for +spatiotemporal feature learning, our main idea is to use sequences of 2D +skeleton heatmaps as inputs and employ Temporal Convolutional Networks (TCNs) +to extract spatiotemporal features. Despite lacking 3D information, our +approach yields comparable/superior performances and better robustness against +missing keypoints than previous methods on action segmentation datasets. +Moreover, we improve the performances further by using both 2D skeleton +heatmaps and RGB videos as inputs. To our best knowledge, this is the first +work to utilize 2D skeleton heatmap inputs and the first work to explore 2D +skeleton+RGB fusion for action segmentation. + +
+
+ comment: Accepted to ICRA 2024 +
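+ The input pipeline described above (2D keypoints rendered as Gaussian heatmaps, then temporal convolutions for per-frame labels) can be sketched as follows. Heatmap resolution, encoder depth, and the single-stage TCN are illustrative assumptions, not the authors' exact configuration.
```python
import torch
import torch.nn as nn

def render_heatmaps(keypoints, h=64, w=64, sigma=2.0):
    """keypoints: (T, J, 2) pixel coordinates -> (T, J, h, w) Gaussian heatmaps."""
    ys = torch.arange(h).view(1, 1, h, 1).float()
    xs = torch.arange(w).view(1, 1, 1, w).float()
    kx = keypoints[..., 0].view(*keypoints.shape[:2], 1, 1)
    ky = keypoints[..., 1].view(*keypoints.shape[:2], 1, 1)
    return torch.exp(-((xs - kx) ** 2 + (ys - ky) ** 2) / (2 * sigma ** 2))

class HeatmapTCN(nn.Module):
    """Per-frame heatmap encoder followed by a small dilated temporal ConvNet."""
    def __init__(self, joints=17, classes=10, dim=128):
        super().__init__()
        self.frame_enc = nn.Sequential(
            nn.Conv2d(joints, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, dim, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.tcn = nn.Sequential(
            nn.Conv1d(dim, dim, 3, padding=1, dilation=1), nn.ReLU(),
            nn.Conv1d(dim, dim, 3, padding=2, dilation=2), nn.ReLU(),
            nn.Conv1d(dim, classes, 1))

    def forward(self, heatmaps):                   # (T, J, H, W)
        feats = self.frame_enc(heatmaps)           # (T, dim) per-frame features
        logits = self.tcn(feats.t().unsqueeze(0))  # (1, classes, T)
        return logits.squeeze(0).t()               # per-frame class logits (T, classes)

hm = render_heatmaps(torch.rand(30, 17, 2) * 64)
print(HeatmapTCN()(hm).shape)   # torch.Size([30, 10])
```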
+
+
+
+
+ + ♻ ☆ A Proxy Attack-Free Strategy for Practically Improving the Poisoning + Efficiency in Backdoor Attacks + + +
+ Poisoning efficiency plays a critical role in poisoning-based backdoor +attacks. To evade detection, attackers aim to use the fewest poisoning samples +while achieving the desired attack strength. Although efficient triggers have +significantly improved poisoning efficiency, there is still room for further +enhancement. Recently, selecting efficient samples has shown promise, but it +often requires a proxy backdoor injection task to identify an efficient +poisoning sample set. However, the proxy attack-based approach can lead to +performance degradation if the proxy attack settings differ from those used by +the actual victims due to the shortcut of backdoor learning. This paper +presents a Proxy attack-Free Strategy (PFS) designed to identify efficient +poisoning samples based on individual similarity and ensemble diversity, +effectively addressing the mentioned concern. The proposed PFS is motivated by +the observation that selecting the to-be-poisoned samples with high similarity +between clean samples and their corresponding poisoning samples results in +significantly higher attack success rates compared to using samples with low +similarity. Furthermore, theoretical analyses for this phenomenon are provided +based on the theory of active learning and neural tangent kernel. We +comprehensively evaluate the proposed strategy across various datasets, +triggers, poisoning rates, architectures, and training hyperparameters. Our +experimental results demonstrate that PFS enhances backdoor attack efficiency, +while also exhibiting a remarkable speed advantage over prior proxy-dependent +selection methodologies. + +
+
+ comment: Under review +
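+ The selection criterion above can be illustrated as ranking candidates by the feature-space similarity between each clean sample and its triggered counterpart. The sketch below is a simplified version that omits the ensemble-diversity term; `feature_extractor` and `apply_trigger` are placeholder callables.
```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def select_poison_indices(images, apply_trigger, feature_extractor, budget):
    """Rank candidates by cosine similarity between clean and triggered features
    and return the indices of the `budget` most similar samples.

    images: (N, 3, H, W); apply_trigger / feature_extractor: callables.
    (Simplified: the ensemble-diversity part of the full strategy is omitted.)
    """
    clean_feat = F.normalize(feature_extractor(images), dim=1)
    poison_feat = F.normalize(feature_extractor(apply_trigger(images)), dim=1)
    similarity = (clean_feat * poison_feat).sum(dim=1)   # per-sample cosine similarity
    return similarity.topk(budget).indices               # high-similarity samples first

# usage sketch with dummy stand-ins for the extractor and trigger
imgs = torch.rand(100, 3, 32, 32)
idx = select_poison_indices(imgs,
                            apply_trigger=lambda x: x.clamp(max=0.9),
                            feature_extractor=lambda x: x.flatten(1),
                            budget=10)
```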
+
+
+
+
+ + ♻ ☆ Aligning Knowledge Graph with Visual Perception for Object-goal + Navigation ICRA 2024 + + +
+ Object-goal navigation is a challenging task that requires guiding an agent to specific objects based on first-person visual observations. The agent's ability to comprehend its surroundings plays a crucial role in successful object finding. However, existing knowledge-graph-based navigators often rely on discrete categorical one-hot vectors and a vote-counting strategy to construct graph representations of the scene, which results in misalignment with the visual images. To provide more accurate and coherent scene descriptions and address this misalignment issue, we propose the Aligning Knowledge Graph with Visual Perception (AKGVP) method for object-goal navigation. Technically, our approach introduces continuous modeling of the hierarchical scene architecture and leverages visual-language pre-training to align natural language descriptions with visual perception. The integration of a continuous knowledge graph architecture and multimodal feature alignment endows the navigator with a remarkable zero-shot navigation capability. We extensively evaluate our method using the AI2-THOR simulator and conduct a series of experiments to demonstrate the effectiveness and efficiency of our navigator. Code available: https://github.com/nuoxu/AKGVP.
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ CAT-DM: Controllable Accelerated Virtual Try-on with Diffusion Model + + +
+ Generative Adversarial Networks (GANs) dominate the research field of image-based virtual try-on, but have not resolved problems such as unnatural garment deformation and blurry generation quality. While the generative quality of diffusion models is impressive, achieving controllability poses a significant challenge when applying them to virtual try-on, and their multiple denoising iterations limit the potential for real-time applications. In this paper, we propose Controllable Accelerated virtual Try-on with Diffusion Model (CAT-DM). To enhance controllability, a basic diffusion-based virtual try-on network is designed, which utilizes ControlNet to introduce additional control conditions and improves the feature extraction of garment images. In terms of acceleration, CAT-DM initiates the reverse denoising process with an implicit distribution generated by a pre-trained GAN-based model. Compared with previous try-on methods based on diffusion models, CAT-DM not only retains the pattern and texture details of the in-shop garment but also reduces the sampling steps without compromising generation quality. Extensive experiments demonstrate the superiority of CAT-DM over both GAN-based and diffusion-based methods in producing more realistic images and accurately reproducing garment patterns.
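+ The acceleration idea of starting reverse denoising from a GAN estimate rather than from pure noise can be pictured with standard DDPM forward-noising math: noise the GAN output to an intermediate timestep and denoise only the remaining steps. The sketch below shows just that initialization; the GAN, the denoiser, and the linear beta schedule are placeholder assumptions, not CAT-DM's actual components.
```python
import torch

def gan_initialized_start(x_gan: torch.Tensor, t_start: int, alphas_cumprod: torch.Tensor):
    """Noise a GAN try-on estimate x_gan to timestep t_start:
        x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
    Reverse denoising then only has to run for t_start steps instead of T.
    """
    a_bar = alphas_cumprod[t_start]
    noise = torch.randn_like(x_gan)
    return a_bar.sqrt() * x_gan + (1 - a_bar).sqrt() * noise

# usage sketch with a linear beta schedule of T = 1000 steps
betas = torch.linspace(1e-4, 0.02, 1000)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
x_start = gan_initialized_start(torch.randn(1, 3, 256, 256), t_start=250,
                                alphas_cumprod=alphas_cumprod)
```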
+
+
+
+
+ + ♻ ☆ Enhancing Visual Grounding and Generalization: A Multi-Task Cycle + Training Approach for Vision-Language Models + + +
+ Visual grounding (VG) occupies a pivotal position in multi-modality vision-language models. In this study, we propose ViLaM, a large multi-modality model that supports multiple VG tasks through a cycle training strategy with abundant interaction instructions. Cycle training between referring expression generation (REG) and referring expression comprehension (REC) is introduced. It enhances the consistency between visual locations and referring expressions, and addresses the need for high-quality, multi-task VG datasets. Moreover, multiple VG tasks are supported in our model, enabled by the cycle training strategy. The REC tasks encompass a range of granularities, from region level to pixel level, including referring bounding-box detection, referring keypoint detection, and referring image segmentation. In REG, referring region classification determines the fine-grained category of the target, while referring region captioning generates a comprehensive description. All tasks participate in joint training, synergistically enhancing one another and collectively improving the overall performance of the model. Furthermore, leveraging the capabilities of large language models, ViLaM extends to a wide range of instructions, thereby significantly enhancing its generalization and interaction potential. Experiments on extensive public datasets corroborate the superior multi-task VG capabilities of our model. Additionally, ViLaM is validated under open-set and few-shot scenarios, confirming its robust generalization; in the medical field in particular, it demonstrates robust cross-domain generalization. Furthermore, we contribute a multi-task VG dataset. To support and encourage the community focused on VG, we have made both the dataset and our code public: https://github.com/AnonymGiant/ViLaM.
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Robust Data Clustering with Outliers via Transformed Tensor Low-Rank + Representation AISTATS 2024 + + +
+ Recently, tensor low-rank representation (TLRR) has become a popular tool for +tensor data recovery and clustering, due to its empirical success and +theoretical guarantees. However, existing TLRR methods consider Gaussian or +gross sparse noise, inevitably leading to performance degradation when the +tensor data are contaminated by outliers or sample-specific corruptions. This +paper develops an outlier-robust tensor low-rank representation (OR-TLRR) +method that provides outlier detection and tensor data clustering +simultaneously based on the t-SVD framework. For tensor observations with +arbitrary outlier corruptions, OR-TLRR has provable performance guarantee for +exactly recovering the row space of clean data and detecting outliers under +mild conditions. Moreover, an extension of OR-TLRR is proposed to handle the +case when parts of the data are missing. Finally, extensive experimental +results on synthetic and real data demonstrate the effectiveness of the +proposed algorithms. We release our code at +https://github.com/twugithub/2024-AISTATS-ORTLRR. + +
+
+ comment: AISTATS 2024 +
+
+
+
+
+ + ♻ ☆ Kosmos-G: Generating Images in Context with Multimodal Large Language + Models + + +
+ Recent advancements in subject-driven image generation have made significant +strides. However, current methods still fall short in diverse application +scenarios, as they require test-time tuning and cannot accept interleaved +multi-image and text input. These limitations keep them far from the ultimate +goal of "image as a foreign language in image generation." This paper presents +Kosmos-G, a model that leverages the advanced multimodal perception +capabilities of Multimodal Large Language Models (MLLMs) to tackle the +aforementioned challenge. Our approach aligns the output space of MLLM with +CLIP using the textual modality as an anchor and performs compositional +instruction tuning on curated data. Kosmos-G demonstrates an impressive +capability of zero-shot subject-driven generation with interleaved multi-image +and text input. Notably, the score distillation instruction tuning requires no +modifications to the image decoder. This allows for a seamless substitution of +CLIP and effortless integration with a myriad of U-Net techniques ranging from +fine-grained controls to personalized image decoder variants. We posit Kosmos-G +as an initial attempt towards the goal of "image as a foreign language in image +generation." The code can be found at https://aka.ms/Kosmos-G + +
+
+ comment: Code: https://aka.ms/Kosmos-G Project Page: + https://xichenpan.github.io/kosmosg +
+
+
+
+
+ + ♻ ☆ DocumentCLIP: Linking Figures and Main Body Text in Reflowed Documents ICPR + + +
+ Vision-language pretraining models have achieved great success in supporting multimedia applications by understanding the alignment between images and text. While existing vision-language pretraining models primarily focus on understanding a single image associated with a single piece of text, they often ignore alignment at the intra-document level, where a document consists of multiple sentences and multiple images. In this work, we propose DocumentCLIP, a salience-aware contrastive learning framework that enforces vision-language pretraining models to comprehend the interaction between images and longer text within documents. Our model is beneficial for real-world multimodal document understanding, such as news articles, magazines, and product descriptions, which contain linguistically and visually richer content. To the best of our knowledge, we are the first to explore multimodal intra-document links by contrastive learning. In addition, we collect a large Wikipedia dataset for pretraining, which provides various topics and structures. Experiments show that DocumentCLIP not only outperforms state-of-the-art baselines in the supervised setting, but also achieves the best zero-shot performance in the wild after human evaluation. Our code is available at https://github.com/FuxiaoLiu/DocumentCLIP.
+
+ comment: Accepted to ICPRAI 2024 +
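+ At the heart of contrastive image-text pretraining of this kind is a symmetric InfoNCE objective over matched pairs, here figures paired with their linked section text. The sketch below shows that generic loss only; the salience weighting and the specific encoders of DocumentCLIP are not reproduced.
```python
import torch
import torch.nn.functional as F

def symmetric_contrastive_loss(img_emb, txt_emb, temperature=0.07):
    """InfoNCE over matched (image, section-text) pairs within a batch.

    img_emb, txt_emb: (B, D) embeddings where row i of each tensor is a matched pair.
    """
    img_emb = F.normalize(img_emb, dim=1)
    txt_emb = F.normalize(txt_emb, dim=1)
    logits = img_emb @ txt_emb.t() / temperature      # (B, B) similarity matrix
    targets = torch.arange(img_emb.size(0), device=img_emb.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# usage sketch with random embeddings standing in for encoder outputs
loss = symmetric_contrastive_loss(torch.randn(8, 512), torch.randn(8, 512))
```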
+
+
+
+
+ + ♻ ☆ Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal + Remote Sensing Image Interpretation + + +
+ Current remote-sensing interpretation models often focus on a single task such as detection, segmentation, or captioning. However, such task-specific models cannot achieve comprehensive multi-level interpretation of images. The field also lacks datasets that support joint multi-task interpretation. In this paper, we propose Panoptic Perception, a novel task and a new fine-grained dataset (FineGrip), to achieve a more thorough and universal interpretation of remote sensing images (RSIs). The new task 1) integrates pixel-level, instance-level, and image-level information for universal image perception, 2) captures image information from coarse to fine granularity, achieving deeper scene understanding and description, and 3) enables various independent tasks to complement and enhance each other through multi-task learning. By emphasizing multi-task interactions and the consistency of perception results, this task enables the simultaneous processing of fine-grained foreground instance segmentation, background semantic segmentation, and global fine-grained image captioning. Concretely, the FineGrip dataset includes 2,649 remote sensing images, 12,054 fine-grained instance segmentation masks belonging to 20 foreground thing categories, 7,599 background semantic masks for 5 stuff classes, and 13,245 captioning sentences. Furthermore, we propose a joint optimization-based panoptic perception model. Experimental results on FineGrip demonstrate the feasibility of the panoptic perception task and the beneficial effect of multi-task joint optimization on individual tasks. The dataset will be publicly available.
+
+
+
+
+ + ♻ ☆ ULIP-2: Towards Scalable Multimodal Pre-training for 3D Understanding CVPR2024 + + +
+ Recent advancements in multimodal pre-training have shown promising efficacy +in 3D representation learning by aligning multimodal features across 3D shapes, +their 2D counterparts, and language descriptions. However, the methods used by +existing frameworks to curate such multimodal data, in particular language +descriptions for 3D shapes, are not scalable, and the collected language +descriptions are not diverse. To address this, we introduce ULIP-2, a simple +yet effective tri-modal pre-training framework that leverages large multimodal +models to automatically generate holistic language descriptions for 3D shapes. +It only needs 3D data as input, eliminating the need for any manual 3D +annotations, and is therefore scalable to large datasets. ULIP-2 is also +equipped with scaled-up backbones for better multimodal representation +learning. We conduct experiments on two large-scale 3D datasets, Objaverse and +ShapeNet, and augment them with tri-modal datasets of 3D point clouds, images, +and language for training ULIP-2. Experiments show that ULIP-2 demonstrates +substantial benefits in three downstream tasks: zero-shot 3D classification, +standard 3D classification with fine-tuning, and 3D captioning (3D-to-language +generation). It achieves a new SOTA of 50.6% (top-1) on Objaverse-LVIS and +84.7% (top-1) on ModelNet40 in zero-shot classification. In the ScanObjectNN +benchmark for standard fine-tuning, ULIP-2 reaches an overall accuracy of 91.5% +with a compact model of only 1.4 million parameters. ULIP-2 sheds light on a +new paradigm for scalable multimodal 3D representation learning without human +annotations and shows significant improvements over existing baselines. The +code and datasets are released at https://github.com/salesforce/ULIP. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ WPS-Dataset: A benchmark for wood plate segmentation in bark removal + processing + + +
+ Using deep learning methods is a promising approach to improving bark removal +efficiency and enhancing the quality of wood products. However, the lack of +publicly available datasets for wood plate segmentation in bark removal +processing poses challenges for researchers in this field. To address this +issue, a benchmark for wood plate segmentation in bark removal processing named +WPS-dataset is proposed in this study, which consists of 4863 images. We +designed an image acquisition device and assembled it on a bark removal +equipment to capture images in real industrial settings. We evaluated the +WPS-dataset using six typical segmentation models. The models effectively learn +and understand the WPS-dataset characteristics during training, resulting in +high performance and accuracy in wood plate segmentation tasks. We believe that +our dataset can lay a solid foundation for future research in bark removal +processing and contribute to advancements in this field. + +
+
+
+
+
+ + ♻ ☆ V2CE: Video to Continuous Events Simulator ICRA + + +
+ Dynamic Vision Sensor (DVS)-based solutions have recently garnered +significant interest across various computer vision tasks, offering notable +benefits in terms of dynamic range, temporal resolution, and inference speed. +However, as a relatively nascent vision sensor compared to Active Pixel Sensor +(APS) devices such as RGB cameras, DVS suffers from a dearth of ample labeled +datasets. Prior efforts to convert APS data into events often grapple with +issues such as a considerable domain shift from real events, the absence of +quantified validation, and layering problems within the time axis. In this +paper, we present a novel method for video-to-events stream conversion from +multiple perspectives, considering the specific characteristics of DVS. A +series of carefully designed losses helps enhance the quality of generated +event voxels significantly. We also propose a novel local dynamic-aware +timestamp inference strategy to accurately recover event timestamps from event +voxels in a continuous fashion and eliminate the temporal layering problem. +Results from rigorous validation through quantified metrics at all stages of +the pipeline establish our method unquestionably as the current +state-of-the-art (SOTA). + +
+
+ comment: 6 pages, 7 figures, IEEE International Conference on Robotics and + Automation (ICRA) 2024 +
+
+
+
+
+ + ♻ ☆ SynCellFactory: Generative Data Augmentation for Cell Tracking + + +
+ Cell tracking remains a pivotal yet challenging task in biomedical research. The full potential of deep learning for this purpose is often untapped due to the limited availability of comprehensive and varied training data sets. In this paper, we present SynCellFactory, a generative cell-video augmentation method. At the heart of SynCellFactory lies the ControlNet architecture, which has been fine-tuned to synthesize cell imagery with photorealistic accuracy in style and motion patterns. This technique enables the creation of synthetic yet realistic cell videos that mirror the complexity of authentic microscopy time-lapses. Our experiments demonstrate that SynCellFactory boosts the performance of well-established deep learning models for cell tracking, particularly when original training data is sparse.
+
+
+
+
+ + ♻ ☆ Subject-Based Domain Adaptation for Facial Expression Recognition + + +
+ Adapting a deep learning model to a specific target individual is a +challenging facial expression recognition (FER) task that may be achieved using +unsupervised domain adaptation (UDA) methods. Although several UDA methods have +been proposed to adapt deep FER models across source and target data sets, +multiple subject-specific source domains are needed to accurately represent the +intra- and inter-person variability in subject-based adaption. This paper +considers the setting where domains correspond to individuals, not entire +datasets. Unlike UDA, multi-source domain adaptation (MSDA) methods can +leverage multiple source datasets to improve the accuracy and robustness of the +target model. However, previous methods for MSDA adapt image classification +models across datasets and do not scale well to a more significant number of +source domains. This paper introduces a new MSDA method for subject-based +domain adaptation in FER. It efficiently leverages information from multiple +source subjects (labeled source domain data) to adapt a deep FER model to a +single target individual (unlabeled target domain data). During adaptation, our +subject-based MSDA first computes a between-source discrepancy loss to mitigate +the domain shift among data from several source subjects. Then, a new strategy +is employed to generate augmented confident pseudo-labels for the target +subject, allowing a reduction in the domain shift between source and target +subjects. Experiments performed on the challenging BioVid heat and pain dataset +with 87 subjects and the UNBC-McMaster shoulder pain dataset with 25 subjects +show that our subject-based MSDA can outperform state-of-the-art methods yet +scale well to multiple subject-based source domains. + +
+
+
+
+
+ + ♻ ☆ Region-Based Representations Revisited CVPR 2024 + + +
+ We investigate whether region-based representations are effective for +recognition. Regions were once a mainstay in recognition approaches, but pixel +and patch-based features are now used almost exclusively. We show that recent +class-agnostic segmenters like SAM can be effectively combined with strong +unsupervised representations like DINOv2 and used for a wide variety of tasks, +including semantic segmentation, object-based image retrieval, and multi-image +analysis. Once the masks and features are extracted, these representations, +even with linear decoders, enable competitive performance, making them well +suited to applications that require custom queries. The compactness of the +representation also makes it well-suited to video analysis and other problems +requiring inference across many images. + +
+
+ comment: CVPR 2024 Camera Ready +
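+ Combining class-agnostic masks with dense self-supervised features essentially reduces to average-pooling the feature map inside each mask and feeding the pooled vectors to a lightweight (e.g. linear) decoder. A minimal sketch with placeholder tensors standing in for SAM masks and DINOv2 features:
```python
import torch
import torch.nn.functional as F

def region_features(feat_map: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    """Average-pool a dense feature map inside each binary region mask.

    feat_map: (C, h, w) dense features (e.g. from a self-supervised backbone)
    masks:    (M, H, W) binary region masks (e.g. from a class-agnostic segmenter)
    returns:  (M, C), one pooled descriptor per region
    """
    m = F.interpolate(masks[None].float(), size=feat_map.shape[-2:], mode="nearest")[0]
    weights = m / m.sum(dim=(1, 2), keepdim=True).clamp(min=1e-6)
    return torch.einsum("mhw,chw->mc", weights, feat_map)

# usage sketch: 16 regions over a 37x37 feature map, classified by a linear decoder
feats = region_features(torch.randn(768, 37, 37), torch.randint(0, 2, (16, 518, 518)))
logits = torch.nn.Linear(768, 150)(feats)   # per-region semantic logits
```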
+
+
+
+
+ + ♻ ☆ Describing Differences in Image Sets with Natural Language CVPR 2024 + + +
+ How do two sets of images differ? Discerning set-level differences is crucial +for understanding model behaviors and analyzing datasets, yet manually sifting +through thousands of images is impractical. To aid in this discovery process, +we explore the task of automatically describing the differences between two +$\textbf{sets}$ of images, which we term Set Difference Captioning. This task +takes in image sets $D_A$ and $D_B$, and outputs a description that is more +often true on $D_A$ than $D_B$. We outline a two-stage approach that first +proposes candidate difference descriptions from image sets and then re-ranks +the candidates by checking how well they can differentiate the two sets. We +introduce VisDiff, which first captions the images and prompts a language model +to propose candidate descriptions, then re-ranks these descriptions using CLIP. +To evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image +sets with ground truth difference descriptions. We apply VisDiff to various +domains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing +classification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing +model failure modes (supervised ResNet), characterizing differences between +generative models (e.g., StableDiffusionV1 and V2), and discovering what makes +images memorable. Using VisDiff, we are able to find interesting and previously +unknown differences in datasets and models, demonstrating its utility in +revealing nuanced insights. + +
+
+ comment: CVPR 2024 Oral +
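+ The re-ranking stage can be approximated by scoring each candidate description by how much more it matches set $D_A$ than $D_B$ under CLIP similarity. The sketch below assumes the CLIP embeddings are precomputed and leaves out the captioning and language-model proposal stage.
```python
import torch
import torch.nn.functional as F

def rank_descriptions(text_emb, set_a_emb, set_b_emb):
    """Score candidate descriptions by mean CLIP similarity on set A minus set B.

    text_emb: (K, D) embeddings of K candidate descriptions
    set_a_emb, set_b_emb: (Na, D), (Nb, D) image embeddings of the two sets
    returns: candidate indices sorted from most to least A-specific
    """
    t = F.normalize(text_emb, dim=1)
    a = F.normalize(set_a_emb, dim=1)
    b = F.normalize(set_b_emb, dim=1)
    score = (t @ a.t()).mean(dim=1) - (t @ b.t()).mean(dim=1)   # (K,)
    return score.argsort(descending=True)

# usage sketch with random embeddings standing in for CLIP outputs
order = rank_descriptions(torch.randn(5, 512), torch.randn(20, 512), torch.randn(20, 512))
```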
+
+
+
+
+ + ♻ ☆ CSSL-MHTR: Continual Self-Supervised Learning for Scalable Multi-script + Handwritten Text Recognition + + +
+ Self-supervised learning has recently emerged as a strong alternative in +document analysis. These approaches are now capable of learning high-quality +image representations and overcoming the limitations of supervised methods, +which require a large amount of labeled data. However, these methods are unable +to capture new knowledge in an incremental fashion, where data is presented to +the model sequentially, which is closer to the realistic scenario. In this +paper, we explore the potential of continual self-supervised learning to +alleviate the catastrophic forgetting problem in handwritten text recognition, +as an example of sequence recognition. Our method consists in adding +intermediate layers called adapters for each task, and efficiently distilling +knowledge from the previous model while learning the current task. Our proposed +framework is efficient in both computation and memory complexity. To +demonstrate its effectiveness, we evaluate our method by transferring the +learned model to diverse text recognition downstream tasks, including Latin and +non-Latin scripts. As far as we know, this is the first application of +continual self-supervised learning for handwritten text recognition. We attain +state-of-the-art performance on English, Italian and Russian scripts, whilst +adding only a few parameters per task. The code and trained models will be +publicly available. + +
+
+ comment: Due to current company policy constraints, we are compelled to + withdraw our paper. The organization's guidelines prohibit us from proceeding + with the publication of this work at this time. We apologize for any + inconvenience this may cause and appreciate your understanding in this matter +
+
+
+
+
+ + ♻ ☆ Has the Virtualization of the Face Changed Facial Perception? A Study of + the Impact of Photo Editing and Augmented Reality on Facial Perception + + +
+ Augmented reality and other photo editing filters are popular methods used to +modify faces online. Considering the important role of facial perception in +communication, how do we perceive this increasing number of modified faces? In +this paper we present the results of six surveys that measure familiarity with +different styles of facial filters, perceived strangeness of faces edited with +different filters, and ability to discern whether images are filtered. Our +results demonstrate that faces modified with more traditional face filters are +perceived similarly to unmodified faces, and faces filtered with augmented +reality filters are perceived differently from unmodified faces. We discuss +possible explanations for these results, including a societal adjustment to +traditional photo editing techniques or the inherent differences in the +different types of filters. We conclude with a discussion of how to build +online spaces more responsibly based on our results. + +
+
+
+
+
+ + ♻ ☆ MetaCloak: Preventing Unauthorized Subject-driven Text-to-image + Diffusion-based Synthesis via Meta-learning CVPR 2024 + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" from malicious uses. We identify +two limitations of these defending approaches: i) sub-optimal due to the +hand-crafted heuristics for solving the intractable bilevel optimization and +ii) lack of robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework with an additional +transformation sampling process to craft transferable and robust perturbation. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbation. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in a personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate, in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Learning by Aligning 2D Skeleton Sequences and Multi-Modality Fusion + + +
+ This paper presents a self-supervised temporal video alignment framework +which is useful for several fine-grained human activity understanding +applications. In contrast with the state-of-the-art method of CASA, where +sequences of 3D skeleton coordinates are taken directly as input, our key idea +is to use sequences of 2D skeleton heatmaps as input. Unlike CASA which +performs self-attention in the temporal domain only, we feed 2D skeleton +heatmaps to a video transformer which performs self-attention both in the +spatial and temporal domains for extracting effective spatiotemporal and +contextual features. In addition, we introduce simple heatmap augmentation +techniques based on 2D skeletons for self-supervised learning. Despite the lack +of 3D information, our approach achieves not only higher accuracy but also +better robustness against missing and noisy keypoints than CASA. Furthermore, +extensive evaluations on three public datasets, i.e., Penn Action, IKEA ASM, +and H2O, demonstrate that our approach outperforms previous methods in +different fine-grained human activity understanding tasks. Finally, fusing 2D +skeleton heatmaps with RGB videos yields the state-of-the-art on all metrics +and datasets. To our best knowledge, our work is the first to utilize 2D +skeleton heatmap inputs and the first to explore multi-modality fusion for +temporal video alignment. + +
+
+
+
+
+ + ♻ ☆ AMEND: A Mixture of Experts Framework for Long-tailed Trajectory + Prediction + + +
+ Accurate prediction of pedestrians' future motions is critical for +intelligent driving systems. Developing models for this task requires rich +datasets containing diverse sets of samples. However, the existing naturalistic +trajectory prediction datasets are generally imbalanced in favor of simpler +samples and lack challenging scenarios. Such a long-tail effect causes +prediction models to underperform on the tail portion of the data distribution +containing safety-critical scenarios. Previous methods tackle the long-tail +problem using methods such as contrastive learning and class-conditioned +hypernetworks. These approaches, however, are not modular and cannot be applied +to many machine learning architectures. In this work, we propose a modular +model-agnostic framework for trajectory prediction that leverages a specialized +mixture of experts. In our approach, each expert is trained with a specialized +skill with respect to a particular part of the data. To produce predictions, we +utilise a router network that selects the best expert by generating relative +confidence scores. We conduct experimentation on common pedestrian trajectory +prediction datasets and show that our method improves performance on long-tail +scenarios. We further conduct ablation studies to highlight the contribution of +different proposed components. + +
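+ The mixture-of-experts design described above can be pictured as several trajectory heads plus a router that emits relative confidence scores and selects an expert. The sketch below is a deliberately small stand-in with assumed input encodings and dimensions, not the proposed AMEND architecture.
```python
import torch
import torch.nn as nn

class TrajectoryMoE(nn.Module):
    """Mixture of trajectory experts with a confidence router (simplified sketch)."""
    def __init__(self, obs_dim=16, horizon=12, num_experts=3, hidden=64):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU(), nn.Linear(hidden, horizon * 2))
            for _ in range(num_experts))
        self.router = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU(),
                                    nn.Linear(hidden, num_experts))

    def forward(self, obs):                        # obs: (B, obs_dim) encoded past trajectory
        conf = self.router(obs).softmax(dim=-1)    # relative confidence per expert (B, E)
        preds = torch.stack([e(obs) for e in self.experts], dim=1)   # (B, E, horizon*2)
        best = conf.argmax(dim=-1)                 # pick the most confident expert
        out = preds[torch.arange(obs.size(0)), best]
        return out.view(obs.size(0), -1, 2), conf  # future (x, y) waypoints and router scores

model = TrajectoryMoE()
traj, conf = model(torch.randn(8, 16))
print(traj.shape, conf.shape)   # torch.Size([8, 12, 2]) torch.Size([8, 3])
```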
+
+
+
+
+ + ♻ ☆ Robustness Analysis on Foundational Segmentation Models CVPR + + +
+ Due to the increase in computational resources and accessibility of data, large deep learning models trained on copious amounts of multi-modal data using self-supervised or semi-supervised learning have emerged. These ``foundation'' models are often adapted to a variety of downstream tasks like classification, object detection, and segmentation with little-to-no training on the target dataset. In this work, we perform a robustness analysis of Visual Foundation Models (VFMs) for segmentation tasks, focusing on robustness against real-world distribution-shift-inspired perturbations. We benchmark seven state-of-the-art segmentation architectures on two perturbed datasets, MS COCO-P and ADE20K-P, with 17 different perturbation types at 5 severity levels each. Our findings reveal several key insights: (1) VFMs exhibit vulnerabilities to compression-induced corruptions, (2) despite not outpacing all unimodal models in robustness, multimodal models show competitive resilience in zero-shot scenarios, and (3) VFMs demonstrate enhanced robustness for certain object categories. These observations suggest that our robustness evaluation framework sets new requirements for foundation models, encouraging further advancements to bolster their adaptability and performance. The code and dataset are available at: \url{https://tinyurl.com/fm-robust}.
+
+ comment: This benchmark along with the code and datasets is available at: + https://tinyurl.com/fm-robust. Accepted at CVPRW 2024 +
+
+
+
+
+ + ♻ ☆ Deep Variational Network Toward Blind Image Restoration + + +
+ Blind image restoration (IR) is a common yet challenging problem in computer vision. Classical model-based methods and recent deep learning (DL)-based methods represent two different methodologies for this problem, each with its own merits and drawbacks. In this paper, we propose a novel blind image restoration method that aims to integrate the advantages of both. Specifically, we construct a general Bayesian generative model for blind IR, which explicitly depicts the degradation process. In this model, a pixel-wise non-i.i.d. Gaussian distribution is employed to fit the image noise. It is more flexible than the simple i.i.d. Gaussian or Laplacian distributions adopted in most conventional methods and can thus handle the more complicated noise types involved in image degradation. To solve the model, we design a variational inference algorithm in which all the expected posterior distributions are parameterized as deep neural networks to increase their modeling capability. Notably, this inference algorithm induces a unified framework that jointly deals with degradation estimation and image restoration. Further, the degradation information estimated in the former task is utilized to guide the latter IR process. Experiments on two typical blind IR tasks, namely image denoising and super-resolution, demonstrate that the proposed method achieves superior performance over current state-of-the-art methods.
+
+ comment: Accepted by TPAMI@2024. Code: https://github.com/zsyOAOA/VIRNet +
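+ A pixel-wise non-i.i.d. Gaussian noise model corresponds to a likelihood in which every pixel has its own variance, typically predicted by the network as a log-variance map. A minimal sketch of that negative log-likelihood (the variational inference machinery of the paper is not reproduced):
```python
import torch

def heteroscedastic_gaussian_nll(pred, log_var, target):
    """Per-pixel Gaussian NLL: each pixel has its own variance exp(log_var).

    pred, log_var, target: (B, C, H, W). Predicting log-variance keeps sigma^2 > 0.
    The constant log(2*pi)/2 term is dropped.
    """
    return 0.5 * (log_var + (target - pred) ** 2 / log_var.exp()).mean()

# usage sketch: a restoration network would output both `pred` and `log_var`
pred, log_var = torch.randn(2, 3, 64, 64), torch.zeros(2, 3, 64, 64)
loss = heteroscedastic_gaussian_nll(pred, log_var, torch.randn(2, 3, 64, 64))
```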
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 143 + +
+
+
+ + ☆ The Third Monocular Depth Estimation Challenge CVPR + + +
+ This paper discusses the results of the third edition of the Monocular Depth Estimation Challenge (MDEC). The challenge focuses on zero-shot generalization to the challenging SYNS-Patches dataset, featuring complex scenes in natural and indoor settings. As with the previous edition, methods can use any form of supervision, i.e., supervised or self-supervised. The challenge received a total of 19 submissions outperforming the baseline on the test set; 10 of them submitted a report describing their approach, highlighting the widespread use of foundation models such as Depth Anything at the core of their methods. The challenge winners drastically improved 3D F-Score performance, from 17.51% to 23.72%.
+
+ comment: To appear in CVPRW2024 +
+
+
+
+
+ + ☆ Make-it-Real: Unleashing Large Multimodal Model's Ability for Painting + 3D Objects with Realistic Materials + + +
+ Physically realistic materials are pivotal in augmenting the realism of 3D +assets across various applications and lighting conditions. However, existing +3D assets and generative models often lack authentic material properties. +Manual assignment of materials using graphic software is a tedious and +time-consuming task. In this paper, we exploit advancements in Multimodal Large +Language Models (MLLMs), particularly GPT-4V, to present a novel approach, +Make-it-Real: 1) We demonstrate that GPT-4V can effectively recognize and +describe materials, allowing the construction of a detailed material library. +2) Utilizing a combination of visual cues and hierarchical text prompts, GPT-4V +precisely identifies and aligns materials with the corresponding components of +3D objects. 3) The correctly matched materials are then meticulously applied as +reference for the new SVBRDF material generation according to the original +diffuse map, significantly enhancing their visual authenticity. Make-it-Real +offers a streamlined integration into the 3D content creation workflow, +showcasing its utility as an essential tool for developers of 3D assets. + +
+
+ comment: Project Page: https://sunzey.github.io/Make-it-Real/ +
+
+
+
+
+ + ☆ Made to Order: Discovering monotonic temporal changes via + self-supervised video ordering + + +
+ Our objective is to discover and localize monotonic temporal changes in a +sequence of images. To achieve this, we exploit a simple proxy task of ordering +a shuffled image sequence, with `time' serving as a supervisory signal since +only changes that are monotonic with time can give rise to the correct +ordering. We also introduce a flexible transformer-based model for +general-purpose ordering of image sequences of arbitrary length with built-in +attribution maps. After training, the model successfully discovers and +localizes monotonic changes while ignoring cyclic and stochastic ones. We +demonstrate applications of the model in multiple video settings covering +different scene and object types, discovering both object-level and +environmental changes in unseen sequences. We also demonstrate that the +attention-based attribution maps function as effective prompts for segmenting +the changing regions, and that the learned representations can be used for +downstream applications. Finally, we show that the model achieves the state of +the art on standard benchmarks for ordering a set of images. + +
+
+ comment: Project page: https://charigyang.github.io/order/ +
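+ The ordering proxy task can be set up by shuffling a sequence of frame features and training a sequence model to predict each frame's original position, with the shuffle permutation providing free supervision. A small sketch under assumed feature dimensions (the attribution maps and the authors' specific architecture are omitted):
```python
import torch
import torch.nn as nn

class FrameOrderer(nn.Module):
    """Predict the original position of each frame in a shuffled sequence."""
    def __init__(self, feat_dim=256, max_len=16, heads=4, layers=2):
        super().__init__()
        enc_layer = nn.TransformerEncoderLayer(feat_dim, heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=layers)
        self.head = nn.Linear(feat_dim, max_len)        # logits over possible positions

    def forward(self, frame_feats):                     # (B, T, feat_dim), shuffled order
        return self.head(self.encoder(frame_feats))     # (B, T, max_len)

# training sketch: shuffle the frames, supervise with their original indices
model, T = FrameOrderer(), 16
feats = torch.randn(4, T, 256)                          # per-frame features (placeholder)
perm = torch.randperm(T)
logits = model(feats[:, perm])                          # shuffled sequence in
loss = nn.functional.cross_entropy(logits.reshape(-1, T), perm.repeat(4))
```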
+
+
+
+
+ + ☆ ResVR: Joint Rescaling and Viewport Rendering of Omnidirectional Images + + +
+ With the advent of virtual reality technology, omnidirectional image (ODI) +rescaling techniques are increasingly embraced for reducing transmitted and +stored file sizes while preserving high image quality. Despite this progress, +current ODI rescaling methods predominantly focus on enhancing the quality of +images in equirectangular projection (ERP) format, which overlooks the fact +that the content viewed on head mounted displays (HMDs) is actually a rendered +viewport instead of an ERP image. In this work, we emphasize that focusing +solely on ERP quality results in inferior viewport visual experiences for +users. Thus, we propose ResVR, which is the first comprehensive framework for +the joint Rescaling and Viewport Rendering of ODIs. ResVR allows obtaining LR +ERP images for transmission while rendering high-quality viewports for users to +watch on HMDs. In our ResVR, a novel discrete pixel sampling strategy is +developed to tackle the complex mapping between the viewport and ERP, enabling +end-to-end training of ResVR pipeline. Furthermore, a spherical pixel shape +representation technique is innovatively derived from spherical differentiation +to significantly improve the visual quality of rendered viewports. Extensive +experiments demonstrate that our ResVR outperforms existing methods in viewport +rendering tasks across different fields of view, resolutions, and view +directions while keeping a low transmission overhead. + +
+
+
+
+
+ + ☆ V2A-Mark: Versatile Deep Visual-Audio Watermarking for Manipulation + Localization and Copyright Protection + + +
+ AI-generated video has revolutionized short video production, filmmaking, and +personalized media, making video local editing an essential tool. However, this +progress also blurs the line between reality and fiction, posing challenges in +multimedia forensics. To solve this urgent issue, V2A-Mark is proposed to +address the limitations of current video tampering forensics, such as poor +generalizability, singular function, and single modality focus. Combining the +fragility of video-into-video steganography with deep robust watermarking, our +method can embed invisible visual-audio localization watermarks and copyright +watermarks into the original video frames and audio, enabling precise +manipulation localization and copyright protection. We also design a temporal +alignment and fusion module and degradation prompt learning to enhance the +localization accuracy and decoding robustness. Meanwhile, we introduce a +sample-level audio localization method and a cross-modal copyright extraction +mechanism to couple the information of audio and video frames. The +effectiveness of V2A-Mark has been verified on a visual-audio tampering +dataset, emphasizing its superiority in localization precision and copyright +accuracy, crucial for the sustainable development of video editing in the AIGC +video era. + +
+
+
+
+
+ + ☆ Learning Visuotactile Skills with Two Multifingered Hands + + +
+ Aiming to replicate human-like dexterity, perceptual experiences, and motion +patterns, we explore learning from human demonstrations using a bimanual system +with multifingered hands and visuotactile data. Two significant challenges +exist: the lack of an affordable and accessible teleoperation system suitable +for a dual-arm setup with multifingered hands, and the scarcity of +multifingered hand hardware equipped with touch sensing. To tackle the first +challenge, we develop HATO, a low-cost hands-arms teleoperation system that +leverages off-the-shelf electronics, complemented with a software suite that +enables efficient data collection; the comprehensive software suite also +supports multimodal data processing, scalable policy learning, and smooth +policy deployment. To tackle the latter challenge, we introduce a novel +hardware adaptation by repurposing two prosthetic hands equipped with touch +sensors for research. Using visuotactile data collected from our system, we +learn skills to complete long-horizon, high-precision tasks which are difficult +to achieve without multifingered dexterity and touch feedback. Furthermore, we +empirically investigate the effects of dataset size, sensing modality, and +visual input preprocessing on policy learning. Our results mark a promising +step forward in bimanual multifingered manipulation from visuotactile data. +Videos, code, and datasets can be found at https://toruowo.github.io/hato/ . + +
+
+ comment: Code and Project Website: https://toruowo.github.io/hato/ +
+
+
+
+
+ + ☆ How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal + Models with Open-Source Suites + + +
+ In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM), to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model InternViT-6B, boosting its visual understanding capabilities and allowing it to be transferred and reused across different LLMs. (2) Dynamic High-Resolution: we divide images into 1 to 40 tiles of 448$\times$448 pixels according to the aspect ratio and resolution of the input image, supporting inputs of up to 4K resolution. (3) High-Quality Bilingual Dataset: we carefully collected a high-quality bilingual dataset that covers common scenes and document images, and annotated it with English and Chinese question-answer pairs, significantly enhancing performance on OCR- and Chinese-related tasks. We evaluate InternVL 1.5 through a series of benchmarks and comparative studies. Compared to both open-source and proprietary models, InternVL 1.5 shows competitive performance, achieving state-of-the-art results on 8 of 18 benchmarks. Code has been released at https://github.com/OpenGVLab/InternVL.
+
+ comment: Technical report +
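+ The dynamic high-resolution scheme can be approximated as choosing, under a tile budget, the grid of 448x448 tiles whose aspect ratio best matches the input image, then resizing and slicing. The sketch below is a simplified reading of that description; the model's actual grid selection and thumbnail handling may differ.
```python
from PIL import Image

def dynamic_tiles(img: Image.Image, tile=448, max_tiles=40):
    """Resize to the best-matching (cols x rows) grid and cut into tile x tile crops."""
    aspect = img.width / img.height
    # enumerate all grids within the tile budget and keep the closest aspect ratio
    grids = [(c, r) for c in range(1, max_tiles + 1)
                    for r in range(1, max_tiles + 1) if c * r <= max_tiles]
    cols, rows = min(grids, key=lambda g: abs(g[0] / g[1] - aspect))
    resized = img.resize((cols * tile, rows * tile))
    return [resized.crop((c * tile, r * tile, (c + 1) * tile, (r + 1) * tile))
            for r in range(rows) for c in range(cols)]

# usage sketch: a wide 2000x1000 image maps to a wide grid of 448x448 tiles
tiles = dynamic_tiles(Image.new("RGB", (2000, 1000)))
print(len(tiles))
```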
+
+
+
+
+ + ☆ Revisiting Text-to-Image Evaluation with Gecko: On Metrics, Prompts, and + Human Ratings + + +
+ While text-to-image (T2I) generative models have become ubiquitous, they do +not necessarily generate images that align with a given prompt. While previous +work has evaluated T2I alignment by proposing metrics, benchmarks, and +templates for collecting human judgements, the quality of these components is +not systematically measured. Human-rated prompt sets are generally small and +the reliability of the ratings -- and thereby the prompt set used to compare +models -- is not evaluated. We address this gap by performing an extensive +study evaluating auto-eval metrics and human templates. We provide three main +contributions: (1) We introduce a comprehensive skills-based benchmark that can +discriminate models across different human templates. This skills-based +benchmark categorises prompts into sub-skills, allowing a practitioner to +pinpoint not only which skills are challenging, but at what level of complexity +a skill becomes challenging. (2) We gather human ratings across four templates +and four T2I models for a total of >100K annotations. This allows us to +understand where differences arise due to inherent ambiguity in the prompt and +where they arise due to differences in metric and model quality. (3) Finally, +we introduce a new QA-based auto-eval metric that is better correlated with +human ratings than existing metrics for our new dataset, across different human +templates, and on TIFA160. + +
+
+ comment: Data and code will be released at: + https://github.com/google-deepmind/gecko_benchmark_t2i +
+
+
+
+
+ + ☆ Boosting Unsupervised Semantic Segmentation with Principal Mask + Proposals + + +
+ Unsupervised semantic segmentation aims to automatically partition images +into semantically meaningful regions by identifying global categories within an +image corpus without any form of annotation. Building upon recent advances in +self-supervised representation learning, we focus on how to leverage these +large pre-trained models for the downstream task of unsupervised segmentation. +We present PriMaPs - Principal Mask Proposals - decomposing images into +semantically meaningful masks based on their feature representation. This +allows us to realize unsupervised semantic segmentation by fitting class +prototypes to PriMaPs with a stochastic expectation-maximization algorithm, +PriMaPs-EM. Despite its conceptual simplicity, PriMaPs-EM leads to competitive +results across various pre-trained backbone models, including DINO and DINOv2, +and across datasets, such as Cityscapes, COCO-Stuff, and Potsdam-3. +Importantly, PriMaPs-EM is able to boost results when applied orthogonally to +current state-of-the-art unsupervised semantic segmentation pipelines. + +
+
+ comment: Code: https://github.com/visinf/primaps +
+
+
+
+
+ + ☆ Meta-Transfer Derm-Diagnosis: Exploring Few-Shot Learning and Transfer + Learning for Skin Disease Classification in Long-Tail Distribution + + +
+ Addressing the challenges of rare diseases is difficult, especially with the +limited number of reference images and a small patient population. This is more +evident in rare skin diseases, where we encounter long-tailed data +distributions that make it difficult to develop unbiased and broadly effective +models. The diverse ways in which image datasets are gathered and their +distinct purposes also add to these challenges. Our study conducts a detailed +examination of the benefits and drawbacks of episodic and conventional training +methodologies, adopting a few-shot learning approach alongside transfer +learning. We evaluated our models using the ISIC2018, Derm7pt, and SD-198 +datasets. With minimal labeled examples, our models showed substantial +information gains and better performance compared to previously trained models. +Our research emphasizes the improved ability to represent features in +DenseNet121 and MobileNetV2 models, achieved by using pre-trained models on +ImageNet to increase similarities within classes. Moreover, our experiments, +ranging from 2-way to 5-way classifications with up to 10 examples, showed a +growing success rate for traditional transfer learning methods as the number of +examples increased. The addition of data augmentation techniques significantly +improved our transfer learning based model performance, leading to higher +performances than existing methods, especially in the SD-198 and ISIC2018 +datasets. All source code related to this work will be made publicly available +soon at the provided URL. + +
+
+ comment: 17 pages, 5 figures, 6 tables, submitted to IEEE Journal of + Biomedical and Health Informatics +
+
+
+
+
+ + ☆ AAPL: Adding Attributes to Prompt Learning for Vision-Language Models CVPR 2024 + + +
+ Recent advances in large pre-trained vision-language models have demonstrated +remarkable performance on zero-shot downstream tasks. Building upon this, +recent studies, such as CoOp and CoCoOp, have proposed the use of prompt +learning, where context within a prompt is replaced with learnable vectors, +leading to significant improvements over manually crafted prompts. However, the +performance improvement for unseen classes is still marginal, and to tackle +this problem, data augmentation has been frequently used in traditional +zero-shot learning techniques. Through our experiments, we have identified +important issues in CoOp and CoCoOp: the context learned through traditional +image augmentation is biased toward seen classes, negatively impacting +generalization to unseen classes. To address this problem, we propose +adversarial token embedding to disentangle low-level visual augmentation +features from high-level class information when inducing bias in learnable +prompts. Through our novel mechanism called "Adding Attributes to Prompt +Learning", AAPL, we guide the learnable context to effectively extract text +features by focusing on high-level features for unseen classes. We have +conducted experiments across 11 datasets, and overall, AAPL shows favorable +performances compared to the existing methods in few-shot learning, zero-shot +learning, cross-dataset, and domain generalization tasks. + +
+
+ comment: Accepted to CVPR 2024 Workshop on Prompting in Vision, Project Page: + https://github.com/Gahyeonkim09/AAPL +
+
+
+
+
+ + ☆ SEED-Bench-2-Plus: Benchmarking Multimodal Large Language Models with + Text-Rich Visual Comprehension + + +
+ Comprehending text-rich visual content is paramount for the practical +application of Multimodal Large Language Models (MLLMs), since text-rich +scenarios are ubiquitous in the real world, which are characterized by the +presence of extensive texts embedded within images. Recently, the advent of +MLLMs with impressive versatility has raised the bar for what we can expect +from MLLMs. However, their proficiency in text-rich scenarios has yet to be +comprehensively and objectively assessed, since current MLLM benchmarks +primarily focus on evaluating general visual comprehension. In this work, we +introduce SEED-Bench-2-Plus, a benchmark specifically designed for evaluating +\textbf{text-rich visual comprehension} of MLLMs. Our benchmark comprises 2.3K +multiple-choice questions with precise human annotations, spanning three broad +categories: Charts, Maps, and Webs, each of which covers a wide spectrum of +text-rich scenarios in the real world. These categories, due to their inherent +complexity and diversity, effectively simulate real-world text-rich +environments. We further conduct a thorough evaluation involving 34 prominent +MLLMs (including GPT-4V, Gemini-Pro-Vision and Claude-3-Opus) and emphasize the +current limitations of MLLMs in text-rich visual comprehension. We hope that +our work can serve as a valuable addition to existing MLLM benchmarks, +providing insightful observations and inspiring further research in the area of +text-rich visual comprehension with MLLMs. The dataset and evaluation code can +be accessed at https://github.com/AILab-CVC/SEED-Bench. + +
+
+
+
+
+ + ☆ Registration by Regression (RbR): a framework for interpretable and + flexible atlas registration + + +
+ In human neuroimaging studies, atlas registration enables mapping MRI scans +to a common coordinate frame, which is necessary to aggregate data from +multiple subjects. Machine learning registration methods have achieved +excellent speed and accuracy but lack interpretability. More recently, +keypoint-based methods have been proposed to tackle this issue, but their +accuracy is still subpar, particularly when fitting nonlinear transforms. Here +we propose Registration by Regression (RbR), a novel atlas registration +framework that is highly robust and flexible, conceptually simple, and can be +trained with cheaply obtained data. RbR predicts the (x,y,z) atlas coordinates +for every voxel of the input scan (i.e., every voxel is a keypoint), and then +uses closed-form expressions to quickly fit transforms using a wide array of +possible deformation models, including affine and nonlinear (e.g., Bspline, +Demons, invertible diffeomorphic models, etc.). Robustness is provided by the +large number of voxels informing the registration and can be further increased +by robust estimators like RANSAC. Experiments on independent public datasets +show that RbR yields more accurate registration than competing keypoint +approaches, while providing full control of the deformation model. + +
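+ A minimal sketch of the closed-form fitting step suggested by the abstract above: given
+predicted (x, y, z) atlas coordinates for every voxel, an affine transform can be fitted by
+least squares, optionally robustified with RANSAC. Variable names, the inlier threshold, and
+the toy data are illustrative assumptions, not the authors' implementation.
+
+import numpy as np
+
+def fit_affine(voxel_xyz, pred_atlas_xyz):
+    """Least-squares 3x4 affine mapping voxel coordinates to atlas coordinates."""
+    n = voxel_xyz.shape[0]
+    X = np.hstack([voxel_xyz, np.ones((n, 1))])     # homogeneous coordinates
+    A, *_ = np.linalg.lstsq(X, pred_atlas_xyz, rcond=None)
+    return A.T                                      # (3, 4)
+
+def fit_affine_ransac(voxel_xyz, pred_atlas_xyz, iters=100, thresh=2.0, seed=0):
+    """Keep the affine estimated from a random minimal set with the most inliers."""
+    rng = np.random.default_rng(seed)
+    best_A, best_inliers = None, -1
+    for _ in range(iters):
+        idx = rng.choice(len(voxel_xyz), size=4, replace=False)
+        A = fit_affine(voxel_xyz[idx], pred_atlas_xyz[idx])
+        pred = voxel_xyz @ A[:, :3].T + A[:, 3]
+        inliers = np.sum(np.linalg.norm(pred - pred_atlas_xyz, axis=1) < thresh)
+        if inliers > best_inliers:
+            best_A, best_inliers = A, inliers
+    return best_A
+
+# Toy usage with synthetic coordinates standing in for the network's predictions.
+vox = np.random.rand(1000, 3) * 100
+atlas = vox @ np.diag([1.1, 0.9, 1.0]) + np.array([5.0, -3.0, 2.0])
+print(fit_affine_ransac(vox, atlas))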
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ ConKeD++ -- Improving descriptor learning for retinal image + registration: A comprehensive study of contrastive losses + + +
+ Self-supervised contrastive learning has emerged as one of the most +successful deep learning paradigms. In this regard, it has seen extensive use +in image registration and, more recently, in the particular field of medical +image registration. In this work, we propose to test, extend, and improve a +state-of-the-art framework for color fundus image registration, ConKeD. Using +the ConKeD framework we test multiple loss functions, adapting them to the +framework and the application domain. Furthermore, we evaluate our models using +the standardized benchmark dataset FIRE as well as several datasets that have +never been used before for color fundus registration, for which we are +releasing the pairing data as well as a standardized evaluation approach. Our +work demonstrates state-of-the-art performance across all datasets and metrics, +demonstrating several advantages over current SOTA color fundus registration +methods. + +&#x0D;
+
+
+
+
+ + ☆ ConsistentID: Portrait Generation with Multimodal Fine-Grained Identity + Preserving + + +
+ Diffusion-based technologies have made significant strides, particularly in +personalized and customized facial generation. However, existing methods face +challenges in achieving high-fidelity and detailed identity (ID) consistency, +primarily due to insufficient fine-grained control over facial areas and the +lack of a comprehensive strategy for ID preservation by fully considering +intricate facial details and the overall face. To address these limitations, we +introduce ConsistentID, an innovative method crafted for +diverse identity-preserving portrait generation under fine-grained multimodal +facial prompts, utilizing only a single reference image. ConsistentID comprises +two key components: a multimodal facial prompt generator that combines facial +features, corresponding facial descriptions and the overall facial context to +enhance precision in facial details, and an ID-preservation network optimized +through the facial attention localization strategy, aimed at preserving ID +consistency in facial regions. Together, these components significantly enhance +the accuracy of ID preservation by introducing fine-grained multimodal ID +information from facial regions. To facilitate training of ConsistentID, we +present a fine-grained portrait dataset, FGID, with over 500,000 facial images, +offering greater diversity and comprehensiveness than existing public facial +datasets such as LAION-Face, CelebA, FFHQ, and SFHQ. Experimental results +substantiate that our ConsistentID achieves exceptional precision and diversity +in personalized facial generation, surpassing existing methods on the MyStyle +dataset. Furthermore, while ConsistentID introduces more multimodal ID +information, it maintains a fast inference speed during generation. + +&#x0D;
+
+ comment: Project page: https://ssugarwh.github.io/consistentid.github.io/ +
+
+
+
+
+ + ☆ REBEL: Reinforcement Learning via Regressing Relative Rewards + + +
+ While originally developed for continuous control problems, Proximal Policy +Optimization (PPO) has emerged as the work-horse of a variety of reinforcement +learning (RL) applications including the fine-tuning of generative models. +Unfortunately, PPO requires multiple heuristics to enable stable convergence +(e.g. value networks, clipping) and is notorious for its sensitivity to the +precise implementation of these components. In response, we take a step back +and ask what a minimalist RL algorithm for the era of generative models would +look like. We propose REBEL, an algorithm that cleanly reduces the problem of +policy optimization to regressing the relative rewards via a direct policy +parameterization between two completions to a prompt, enabling strikingly +lightweight implementation. In theory, we prove that fundamental RL algorithms +like Natural Policy Gradient can be seen as variants of REBEL, which allows us +to match the strongest known theoretical guarantees in terms of convergence and +sample complexity in the RL literature. REBEL can also cleanly incorporate +offline data and handle the intransitive preferences we frequently see in +practice. Empirically, we find that REBEL provides a unified approach to +language modeling and image generation with stronger or similar performance as +PPO and DPO, all while being simpler to implement and more computationally +tractable than PPO. + +
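+ A rough sketch of the regression idea described above: regress the difference in policy
+log-probabilities of two completions to the same prompt onto their reward difference. The
+precise REBEL objective, the scaling factor eta, and the tensor shapes here are assumptions
+based only on the abstract.
+
+import torch
+
+def relative_reward_regression_loss(logp_new_a, logp_old_a, logp_new_b, logp_old_b,
+                                    reward_a, reward_b, eta=1.0):
+    """Square loss that regresses the relative log-prob ratio of completions
+    (a, b) onto their relative reward."""
+    ratio_a = logp_new_a - logp_old_a     # log pi_theta(a|x) - log pi_old(a|x)
+    ratio_b = logp_new_b - logp_old_b
+    return ((1.0 / eta) * (ratio_a - ratio_b) - (reward_a - reward_b)).pow(2).mean()
+
+# Toy usage with made-up log-probabilities and rewards for a batch of prompts.
+lp_new_a = torch.randn(8, requires_grad=True)
+lp_new_b = torch.randn(8, requires_grad=True)
+loss = relative_reward_regression_loss(lp_new_a, torch.randn(8), lp_new_b, torch.randn(8),
+                                       torch.rand(8), torch.rand(8))
+loss.backward()
+print(loss.item())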
+
+
+
+
+ + ☆ RadGenome-Chest CT: A Grounded Vision-Language Dataset for Chest CT + Analysis + + +
+ Developing generalist foundation models has recently attracted tremendous +attention among researchers in the field of AI for Medicine (AI4Medicine). A +pivotal insight in developing these models is their reliance on dataset +scaling, which emphasizes the requirements for developing open-source medical +image datasets that incorporate diverse supervision signals across various +imaging modalities. In this paper, we introduce RadGenome-Chest CT, a +comprehensive, large-scale, region-guided 3D chest CT interpretation dataset +based on CT-RATE. Specifically, we leverage the latest powerful universal +segmentation and large language models to extend the original datasets (over +25,692 non-contrast 3D chest CT volumes and reports from 20,000 patients) from +the following aspects: (i) organ-level segmentation masks covering 197 +categories, which provide intermediate reasoning visual clues for +interpretation; (ii) 665K multi-granularity grounded reports, where each +sentence of the report is linked to the corresponding anatomical region of CT +volume in the form of a segmentation mask; (iii) 1.3M grounded VQA pairs, +where questions and answers are all linked with reference segmentation masks, +enabling models to associate visual evidence with textual explanations. All +grounded reports and VQA pairs in the validation set have gone through manual +verification to ensure dataset quality. We believe that RadGenome-Chest CT can +significantly advance the development of multimodal medical foundation models, +by training to generate texts based on given segmentation regions, which is +unattainable with previous relevant datasets. We will release all segmentation +masks, grounded reports, and VQA pairs to facilitate further research and +development in this field. + +&#x0D;
+
+
+
+
+ + ☆ TokenHMR: Advancing Human Mesh Recovery with a Tokenized Pose + Representation CVPR 2024 + + +
+ We address the problem of regressing 3D human pose and shape from a single +image, with a focus on 3D accuracy. The current best methods leverage large +datasets of 3D pseudo-ground-truth (p-GT) and 2D keypoints, leading to robust +performance. With such methods, we observe a paradoxical decline in 3D pose +accuracy with increasing 2D accuracy. This is caused by biases in the p-GT and +the use of an approximate camera projection model. We quantify the error +induced by current camera models and show that fitting 2D keypoints and p-GT +accurately causes incorrect 3D poses. Our analysis defines the invalid +distances within which minimizing 2D and p-GT losses is detrimental. We use +this to formulate a new loss, Threshold-Adaptive Loss Scaling (TALS), that +penalizes gross 2D and p-GT losses but not smaller ones. With such a loss, +there are many 3D poses that could equally explain the 2D evidence. To reduce +this ambiguity we need a prior over valid human poses but such priors can +introduce unwanted bias. To address this, we exploit a tokenized representation +of human pose and reformulate the problem as token prediction. This restricts +the estimated poses to the space of valid poses, effectively providing a +uniform prior. Extensive experiments on the EMDB and 3DPW datasets show that +our reformulated keypoint loss and tokenization allow us to train on +in-the-wild data while improving 3D accuracy over the state-of-the-art. Our +models and code are available for research at https://tokenhmr.is.tue.mpg.de. + +&#x0D;
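+ One plausible reading of Threshold-Adaptive Loss Scaling as described above: residuals above
+a threshold are penalized while smaller ones are ignored, so many 3D poses consistent with the
+2D evidence are not forced onto biased targets. The hard threshold and its value are
+assumptions, not the paper's exact formulation.
+
+import torch
+
+def thresholded_keypoint_loss(per_keypoint_residual, tau=0.05):
+    """Penalize gross 2D / p-GT residuals and down-weight small ones."""
+    weight = (per_keypoint_residual > tau).float()
+    return (weight * per_keypoint_residual).sum() / weight.sum().clamp(min=1.0)
+
+residuals = torch.tensor([0.01, 0.02, 0.30, 0.50])
+print(thresholded_keypoint_loss(residuals))   # only the two large residuals contribute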
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ TELA: Text to Layer-wise 3D Clothed Human Generation + + +
+ This paper addresses the task of 3D clothed human generation from textual +descriptions. Previous works usually encode the human body and clothes as a +holistic model and generate the whole model in a single-stage optimization, +which makes them struggle with clothing editing and meanwhile lose fine-grained +control over the whole generation process. To solve this, we propose a +layer-wise clothed human representation combined with a progressive +optimization strategy, which produces clothing-disentangled 3D human models +while providing control capacity for the generation process. The basic idea is +progressively generating a minimal-clothed human body and layer-wise clothes. +During clothing generation, a novel stratified compositional rendering method +is proposed to fuse multi-layer human models, and a new loss function is +utilized to help decouple the clothing model from the human body. The proposed +method achieves high-quality disentanglement, which thereby provides an +effective way for 3D garment generation. Extensive experiments demonstrate that +our approach achieves state-of-the-art 3D clothed human generation while also +supporting cloth editing applications such as virtual try-on. Project page: +http://jtdong.com/tela_layer/ + +&#x0D;
+
+
+
+
+ + ☆ CBRW: A Novel Approach for Cancelable Biometric Template Generation + based on Random Walk + + +&#x0D;
+ Cancelable Biometric is a challenging research field in which security of an +original biometric image is ensured by transforming the original biometric into +another irreversible domain. Several approaches have been suggested in the +literature for generating cancelable biometric templates. In this paper, two +novel and simple cancelable biometric template generation methods based on +Random Walk (CBRW) have been proposed. By employing random walk and other steps +given in the two proposed algorithms, viz. CBRW-BitXOR and CBRW-BitCMP, the +original biometric is transformed into a cancelable template. The performance +of the proposed methods is compared with other state-of-the-art methods. +Experiments have been performed on eight publicly available gray and color +datasets, i.e., CP (ear) (gray and color), UTIRIS (iris) (gray and color), ORL +(face) (gray), IIT Delhi (iris) (gray and color), and AR (face) (color). +Performance of the generated templates is measured in terms of Correlation +Coefficient (Cr), Root Mean Square Error (RMSE), Peak Signal to Noise Ratio +(PSNR), Structural Similarity (SSIM), Mean Absolute Error (MAE), Number of +Pixel Change Rate (NPCR), and Unified Average Changing Intensity (UACI). The +experimental results show that the proposed methods are superior to other +state-of-the-art methods in both qualitative and quantitative +analysis. Furthermore, CBRW performs better on both gray and color +images. + +&#x0D;
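+ The sketch below is only a toy illustration of the general recipe behind such schemes (a
+key-seeded reordering of pixels followed by a bitwise XOR with a key pattern); it is not the
+authors' CBRW-BitXOR or CBRW-BitCMP algorithm.
+
+import numpy as np
+
+def toy_cancelable_template(biometric, key_seed):
+    """Toy cancelable template: key-seeded shuffle plus bitwise XOR.
+    Re-issuing (revoking) a template only requires choosing a new key_seed."""
+    rng = np.random.default_rng(key_seed)
+    flat = biometric.astype(np.uint8).ravel()
+    order = rng.permutation(flat.size)                    # key-dependent reordering
+    key = rng.integers(0, 256, size=flat.size, dtype=np.uint8)
+    return (flat[order] ^ key).reshape(biometric.shape)   # unlinkable across keys
+
+img = np.random.randint(0, 256, (64, 64), dtype=np.uint8)   # stand-in biometric image
+tpl_a = toy_cancelable_template(img, key_seed=1)
+tpl_b = toy_cancelable_template(img, key_seed=2)             # revoked and re-issued
+print((tpl_a == tpl_b).mean())                               # templates differ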
+
+
+
+
+ + ☆ Features Fusion for Dual-View Mammography Mass Detection + + +
+ Detection of malignant lesions on mammography images is extremely important +for early breast cancer diagnosis. In clinical practice, images are acquired +from two different angles, and radiologists can fully utilize information from +both views, simultaneously locating the same lesion. However, for automatic +detection approaches such information fusion remains a challenge. In this +paper, we propose a new model called MAMM-Net, which allows the processing of +both mammography views simultaneously by sharing information not only on an +object level, as seen in existing works, but also on a feature level. +MAMM-Net's key component is the Fusion Layer, based on deformable attention and +designed to increase detection precision while keeping high recall. Our +experiments show superior performance on the public DDSM dataset compared to +the previous state-of-the-art model, while introducing new helpful features +such as lesion annotation on pixel-level and classification of lesions +malignancy. + +
+
+ comment: Accepted at ISBI 2024 (21st IEEE International Symposium on + Biomedical Imaging) +
+
+
+
+
+ + ☆ Embracing Diversity: Interpretable Zero-shot classification beyond one + vector per class + + +
+ Vision-language models enable open-world classification of objects without +the need for any retraining. While this zero-shot paradigm marks a significant +advance, even today's best models exhibit skewed performance when objects are +dissimilar from their typical depiction. Real-world objects such as pears +appear in a variety of forms -- from diced to whole, on a table or in a bowl -- +yet standard VLM classifiers map all instances of a class to a single +vector based on the class label. We argue that to represent this rich +diversity within a class, zero-shot classification should move beyond a single +vector. We propose a method to encode and account for diversity within a class +using inferred attributes, still in the zero-shot setting without retraining. +We find our method consistently outperforms standard zero-shot classification +over a large suite of datasets encompassing hierarchies, diverse object states, +and real-world geographic diversity, as well as on finer-grained datasets where +intra-class diversity may be less prevalent. Importantly, our method is +inherently interpretable, offering faithful explanations for each inference to +facilitate model debugging and enhance transparency. We also find our method +scales efficiently to a large number of attributes to account for diversity -- +leading to more accurate predictions for atypical instances. Finally, we +characterize a principled trade-off between overall and worst class accuracy, +which can be tuned via a hyperparameter of our method. We hope this work spurs +further research into the promise of zero-shot classification beyond a single +class vector for capturing diversity in the world, and building transparent AI +systems without compromising performance. + +&#x0D;
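+ A small sketch of scoring classes by several attribute embeddings instead of a single class
+vector; the max-aggregation, feature dimension, and random tensors standing in for encoder
+outputs are placeholders, not the paper's method.
+
+import torch
+
+def classify_with_attribute_vectors(image_feat, class_attribute_feats):
+    """Score each class by its best-matching attribute embedding rather than
+    one class vector, then return the index of the highest-scoring class."""
+    scores = []
+    for feats in class_attribute_feats:                       # each (n_attr, d)
+        sims = torch.nn.functional.cosine_similarity(image_feat, feats)
+        scores.append(sims.max())
+    return torch.stack(scores).argmax().item()
+
+# Toy usage: 3 classes, each described by a different number of attribute embeddings.
+d = 512
+image_feat = torch.randn(1, d)
+class_attribute_feats = [torch.randn(4, d), torch.randn(6, d), torch.randn(3, d)]
+print(classify_with_attribute_vectors(image_feat, class_attribute_feats))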
+
+ comment: Accepted to FAccT 2024 +
+
+
+
+
+ + ☆ Multi-view Cardiac Image Segmentation via Trans-Dimensional Priors + + +
+ We propose a novel multi-stage trans-dimensional architecture for multi-view +cardiac image segmentation. Our method exploits the relationship between +long-axis (2D) and short-axis (3D) magnetic resonance (MR) images to perform a +sequential 3D-to-2D-to-3D segmentation, segmenting the long-axis and short-axis +images. In the first stage, 3D segmentation is performed using the short-axis +image, and the prediction is transformed to the long-axis view and used as a +segmentation prior in the next stage. In the second step, the heart region is +localized and cropped around the segmentation prior using a Heart Localization +and Cropping (HLC) module, focusing the subsequent model on the heart region of +the image, where a 2D segmentation is performed. Similarly, we transform the +long-axis prediction to the short-axis view, localize and crop the heart region +and again perform a 3D segmentation to refine the initial short-axis +segmentation. We evaluate our proposed method on the Multi-Disease, Multi-View +& Multi-Center Right Ventricular Segmentation in Cardiac MRI (M&Ms-2) dataset, +where our method outperforms state-of-the-art methods in segmenting cardiac +regions of interest in both short-axis and long-axis images. The pre-trained +models, source code, and implementation details will be publicly available. + +
+
+
+
+
+ + ☆ NTIRE 2024 Quality Assessment of AI-Generated Content Challenge + + +
+ This paper reports on the NTIRE 2024 Quality Assessment of AI-Generated +Content Challenge, which will be held in conjunction with the New Trends in +Image Restoration and Enhancement Workshop (NTIRE) at CVPR 2024. This challenge +is to address a major challenge in the field of image and video processing, +namely, Image Quality Assessment (IQA) and Video Quality Assessment (VQA) for +AI-Generated Content (AIGC). The challenge is divided into the image track and +the video track. The image track uses the AIGIQA-20K, which contains 20,000 +AI-Generated Images (AIGIs) generated by 15 popular generative models. The +image track has a total of 318 registered participants. A total of 1,646 +submissions are received in the development phase, and 221 submissions are +received in the test phase. Finally, 16 participating teams submitted their +models and fact sheets. The video track uses the T2VQA-DB, which contains +10,000 AI-Generated Videos (AIGVs) generated by 9 popular Text-to-Video (T2V) +models. A total of 196 participants have registered in the video track. A total +of 991 submissions are received in the development phase, and 185 submissions +are received in the test phase. Finally, 12 participating teams submitted their +models and fact sheets. Some methods have achieved better results than baseline +methods, and the winning methods in both tracks have demonstrated superior +prediction performance on AIGC. + +
+
+
+
+
+ + ☆ Multi-scale HSV Color Feature Embedding for High-fidelity NIR-to-RGB + Spectrum Translation + + +
+ The NIR-to-RGB spectral domain translation is a formidable task due to the +inherent spectral mapping ambiguities within NIR inputs and RGB outputs. Thus, +existing methods fail to reconcile the tension between maintaining texture +detail fidelity and achieving diverse color variations. In this paper, we +propose a Multi-scale HSV Color Feature Embedding Network (MCFNet) that +decomposes the mapping process into three sub-tasks, including NIR texture +maintenance, coarse geometry reconstruction, and RGB color prediction. Thus, we +propose three key modules for each corresponding sub-task: the Texture +Preserving Block (TPB), the HSV Color Feature Embedding Module (HSV-CFEM), and +the Geometry Reconstruction Module (GRM). These modules contribute to our +MCFNet methodically tackling spectral translation through a series of +escalating resolutions, progressively enriching images with color and texture +fidelity in a scale-coherent fashion. The proposed MCFNet demonstrates +substantial performance gains over the NIR image colorization task. Code is +released at: https://github.com/AlexYangxx/MCFNet. + +
+
+
+
+
+ + ☆ Multimodal Semantic-Aware Automatic Colorization with Diffusion Prior + + +
+ Colorizing grayscale images offers an engaging visual experience. Existing +automatic colorization methods often fail to generate satisfactory results due +to incorrect semantic colors and unsaturated colors. In this work, we propose +an automatic colorization pipeline to overcome these challenges. We leverage +the extraordinary generative ability of the diffusion prior to synthesize color +with plausible semantics. To overcome the artifacts introduced by the diffusion +prior, we apply the luminance conditional guidance. Moreover, we adopt +multimodal high-level semantic priors to help the model understand the image +content and deliver saturated colors. Besides, a luminance-aware decoder is +designed to restore details and enhance overall visual quality. The proposed +pipeline synthesizes saturated colors while maintaining plausible semantics. +Experiments indicate that our proposed method considers both diversity and +fidelity, surpassing previous methods in terms of perceptual realism and gaining +the most human preference. + +&#x0D;
+
+
+
+
+ + ☆ EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning CVPR 2024 + + +
+ Visual Instruction Tuning represents a novel learning paradigm involving the +fine-tuning of pre-trained language models using task-specific instructions. +This paradigm shows promising zero-shot results in various natural language +processing tasks but is still unexplored in vision emotion understanding. In +this work, we focus on enhancing the model's proficiency in understanding and +adhering to instructions related to emotional contexts. Initially, we identify +key visual clues critical to visual emotion recognition. Subsequently, we +introduce a novel GPT-assisted pipeline for generating emotion visual +instruction data, effectively addressing the scarcity of annotated instruction +data in this domain. Expanding on the groundwork established by InstructBLIP, +our proposed EmoVIT architecture incorporates emotion-specific instruction +data, leveraging the powerful capabilities of Large Language Models to enhance +performance. Through extensive experiments, our model showcases its proficiency +in emotion classification, adeptness in affective reasoning, and competence in +comprehending humor. The comparative analysis provides a robust benchmark for +Emotion Visual Instruction Tuning in the era of LLMs, providing valuable +insights and opening avenues for future exploration in this domain. Our code is +available at \url{https://github.com/aimmemotion/EmoVIT}. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ PhyRecon: Physically Plausible Neural Scene Reconstruction + + +
+ While neural implicit representations have gained popularity in multi-view 3D +reconstruction, previous work struggles to yield physically plausible results, +thereby limiting their applications in physics-demanding domains like embodied +AI and robotics. The lack of plausibility originates from both the absence of +physics modeling in the existing pipeline and their inability to recover +intricate geometrical structures. In this paper, we introduce PhyRecon, which +stands as the first approach to harness both differentiable rendering and +differentiable physics simulation to learn implicit surface representations. +Our framework proposes a novel differentiable particle-based physical simulator +seamlessly integrated with the neural implicit representation. At its core is +an efficient transformation between SDF-based implicit representation and +explicit surface points by our proposed algorithm, Surface Points Marching +Cubes (SP-MC), enabling differentiable learning with both rendering and +physical losses. Moreover, we model both rendering and physical uncertainty to +identify and compensate for the inconsistent and inaccurate monocular geometric +priors. The physical uncertainty additionally enables a physics-guided pixel +sampling to enhance the learning of slender structures. By amalgamating these +techniques, our model facilitates efficient joint modeling with appearance, +geometry, and physics. Extensive experiments demonstrate that PhyRecon +significantly outperforms all state-of-the-art methods in terms of +reconstruction quality. Our reconstruction results also yield superior physical +stability, verified by Isaac Gym, with at least a 40% improvement across all +datasets, opening broader avenues for future physics-based applications. + +
+
+ comment: project page: https://phyrecon.github.io/ +
+
+
+
+
+ + ☆ Zero-Shot Distillation for Image Encoders: How to Make Effective Use of + Synthetic Data + + +
+ Multi-modal foundation models such as CLIP have showcased impressive +zero-shot capabilities. However, their applicability in resource-constrained +environments is limited due to their large number of parameters and high +inference time. While existing approaches have scaled down the entire CLIP +architecture, we focus on training smaller variants of the image encoder, which +suffices for efficient zero-shot classification. The use of synthetic data has +shown promise in distilling representations from larger teachers, resulting in +strong few-shot and linear probe performance. However, we find that this +approach surprisingly fails in true zero-shot settings when using contrastive +losses. We identify the exploitation of spurious features as being responsible +for poor generalization between synthetic and real data. However, by using the +image feature-based L2 distillation loss, we mitigate these problems and train +students that achieve zero-shot performance which on four domain-specific +datasets is on-par with a ViT-B/32 teacher model trained on DataCompXL, while +featuring up to 92% fewer parameters. + +
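+ A minimal sketch of the image-feature L2 distillation loss mentioned above, with random
+tensors standing in for the student's and the frozen teacher's image embeddings.
+
+import torch
+import torch.nn.functional as F
+
+def l2_distillation_loss(student_feat, teacher_feat):
+    """Match the student's image embedding to the frozen teacher's embedding
+    instead of using a contrastive objective."""
+    return F.mse_loss(student_feat, teacher_feat.detach())
+
+student_feat = torch.randn(32, 512, requires_grad=True)   # small student encoder output
+teacher_feat = torch.randn(32, 512)                        # e.g. a CLIP image embedding
+loss = l2_distillation_loss(student_feat, teacher_feat)
+loss.backward()
+print(loss.item())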
+
+
+
+
+ + ☆ TinyChart: Efficient Chart Understanding with Visual Token Merging and + Program-of-Thoughts Learning + + +
+ Charts are important for presenting and explaining complex data +relationships. Recently, multimodal large language models (MLLMs) have shown +remarkable capabilities in various chart understanding tasks. However, the +sheer size of these models in terms of parameters and computational +requirements limits their use in resource-constrained environments. In this +paper, we present TinyChart, an efficient MLLM for chart understanding with +only 3B parameters. TinyChart overcomes two key challenges in efficient chart +understanding: (1) reducing the burden of learning numerical computations through +a Program-of-Thoughts (PoT) learning strategy, which trains the model to +generate Python programs for numerical calculations, and (2) reducing lengthy +vision feature sequences produced by the vision transformer for high-resolution +images through a Vision Token Merging module, which gradually merges the most +similar vision tokens. Extensive experiments demonstrate that our 3B TinyChart +achieves SOTA performance on a variety of chart understanding benchmarks +including ChartQA, Chart-to-Text, Chart-to-Table, OpenCQA, and ChartX. It +outperforms several chart understanding MLLMs with up to 13B parameters, such as +ChartLlama and ChartAst, as well as the closed-source general-purpose MLLM GPT-4V on +ChartQA. It also demonstrates its superior efficiency with higher throughput +during inference due to a smaller model scale and more efficient vision +encoding. Our code and model are available at +https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/TinyChart. + +&#x0D;
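+ A simplified illustration of merging similar vision tokens, in the spirit of the Vision Token
+Merging module described above; the greedy pairwise strategy, cosine similarity, and token
+counts are assumptions rather than TinyChart's implementation.
+
+import torch
+
+def merge_tokens(tokens, keep):
+    """Greedily average the most cosine-similar pair of tokens until only
+    `keep` tokens remain (a quadratic-cost illustration, not an efficient kernel)."""
+    tokens = tokens.clone()
+    while tokens.shape[0] > keep:
+        normed = torch.nn.functional.normalize(tokens, dim=1)
+        sim = normed @ normed.t()
+        sim.fill_diagonal_(-1.0)                      # ignore self-similarity
+        i, j = divmod(sim.argmax().item(), sim.shape[1])
+        i, j = min(i, j), max(i, j)
+        tokens[i] = (tokens[i] + tokens[j]) / 2       # merge the pair
+        tokens = torch.cat([tokens[:j], tokens[j + 1:]], dim=0)
+    return tokens
+
+vision_tokens = torch.randn(196, 768)               # e.g. 14x14 patch tokens (assumed size)
+print(merge_tokens(vision_tokens, keep=64).shape)   # torch.Size([64, 768])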
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ Self-Balanced R-CNN for Instance Segmentation + + +
+ Current state-of-the-art two-stage models on instance segmentation task +suffer from several types of imbalances. In this paper, we address the +Intersection over the Union (IoU) distribution imbalance of positive input +Regions of Interest (RoIs) during the training of the second stage. Our +Self-Balanced R-CNN (SBR-CNN), an evolved version of the Hybrid Task Cascade +(HTC) model, brings brand new loop mechanisms of bounding box and mask +refinements. With an improved Generic RoI Extraction (GRoIE), we also address +the feature-level imbalance at the Feature Pyramid Network (FPN) level, +originated by a non-uniform integration between low- and high-level features +from the backbone layers. In addition, the redesign of the architecture heads +toward a fully convolutional approach with FCC further reduces the number of +parameters and obtains more clues to the connection between the task to solve +and the layers used. Moreover, our SBR-CNN model shows the same or even better +improvements if adopted in conjunction with other state-of-the-art models. In +fact, with a lightweight ResNet-50 as backbone, evaluated on COCO minival 2017 +dataset, our model reaches 45.3% and 41.5% AP for object detection and instance +segmentation, with 12 epochs and without extra tricks. The code is available at +https://github.com/IMPLabUniPr/mmdetection/tree/sbr_cnn + +
+
+
+
+
+ + ☆ DAVE -- A Detect-and-Verify Paradigm for Low-Shot Counting CVPR2024 + + +
+ Low-shot counters estimate the number of objects corresponding to a selected +category, based on only few or no exemplars annotated in the image. The current +state-of-the-art estimates the total counts as the sum over the object location +density map, but does not provide individual object locations and sizes, which +are crucial for many applications. This is addressed by detection-based +counters, which, however fall behind in the total count accuracy. Furthermore, +both approaches tend to overestimate the counts in the presence of other object +classes due to many false positives. We propose DAVE, a low-shot counter based +on a detect-and-verify paradigm, that avoids the aforementioned issues by first +generating a high-recall detection set and then verifying the detections to +identify and remove the outliers. This jointly increases the recall and +precision, leading to accurate counts. DAVE outperforms the top density-based +counters by ~20% in the total count MAE, it outperforms the most recent +detection-based counter by ~20% in detection quality and sets a new +state-of-the-art in zero-shot as well as text-prompt-based counting. + +
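+ A schematic of the detect-and-verify idea: keep a high-recall candidate set, then remove
+candidates whose appearance features are outliers with respect to the exemplar prototype. The
+similarity threshold and the random tensors standing in for boxes and features are
+placeholders, not DAVE's actual verification stage.
+
+import torch
+
+def detect_and_verify(candidate_boxes, candidate_feats, exemplar_feats, sim_thresh=0.5):
+    """Verify high-recall candidates by similarity to the mean exemplar feature,
+    removing outliers to raise precision while keeping recall high."""
+    prototype = exemplar_feats.mean(dim=0, keepdim=True)
+    sims = torch.nn.functional.cosine_similarity(candidate_feats, prototype)
+    keep = sims > sim_thresh
+    return candidate_boxes[keep], int(keep.sum())      # verified boxes and the count
+
+boxes = torch.rand(50, 4)          # placeholder candidate boxes from stage one
+feats = torch.randn(50, 256)       # placeholder appearance features
+exemplars = torch.randn(3, 256)    # features of the annotated exemplars
+verified_boxes, count = detect_and_verify(boxes, feats, exemplars)
+print(count)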
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Denoising: from classical methods to deep CNNs + + +
+ This paper aims to explore the evolution of image denoising in a +pedagogical way. We briefly review classical methods such as Fourier analysis +and wavelet bases, highlighting the challenges they faced until the emergence +of neural networks, notably the U-Net, in the 2010s. The remarkable performance +of these networks has been demonstrated in studies such as Kadkhodaie et al. +(2024). They exhibit adaptability to various image types, including those with +fixed regularity, facial images, and bedroom scenes, achieving optimal results +while being biased towards a geometry-adaptive harmonic basis. The introduction of score +diffusion has played a crucial role in image generation. In this context, +denoising becomes essential as it facilitates the estimation of probability +density scores. We discuss the prerequisites for genuine learning of +probability densities, offering insights that extend from mathematical research +to the implications of universal structures. + +&#x0D;
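+ The link between denoising and score estimation mentioned above can be illustrated with the
+Miyasawa/Tweedie relation: for x_noisy = x + sigma * eps, the score of the noisy density is
+approximately (D(x_noisy) - x_noisy) / sigma^2, where D is the (posterior-mean) denoiser. A
+toy check for a 1-D standard normal prior, where both sides are known in closed form:
+
+import numpy as np
+
+def score_from_denoiser(denoiser, x_noisy, sigma):
+    """grad log p_sigma(x) ~= (D(x) - x) / sigma**2, which is why a good
+    denoiser doubles as a score estimator in score-based diffusion."""
+    return (denoiser(x_noisy) - x_noisy) / sigma ** 2
+
+sigma = 0.5
+x = np.linspace(-3, 3, 7)
+denoiser = lambda y: y / (1 + sigma ** 2)     # exact denoiser for a N(0, 1) prior
+exact_score = -x / (1 + sigma ** 2)           # score of the noisy marginal N(0, 1 + sigma^2)
+print(np.allclose(score_from_denoiser(denoiser, x, sigma), exact_score))   # True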
+
+ comment: 33 pages, 33 figures +
+
+
+
+
+ + ☆ MuseumMaker: Continual Style Customization without Catastrophic + Forgetting + + +
+ Pre-trained large text-to-image (T2I) models paired with an appropriate text prompt +have attracted growing interest in the field of customized image generation. However, +the catastrophic forgetting issue makes it hard to continually synthesize new +user-provided styles while retaining satisfying results for previously learned +styles. In this paper, we propose MuseumMaker, a method that enables the +synthesis of images following a set of customized styles in a never-ending +manner, gradually accumulating these creative artistic works as a Museum. +When faced with a new customization style, we develop a style distillation +loss module to transfer the style of the whole dataset into the generation of +images. It minimizes the learning biases caused by the content of images, and +addresses the catastrophic overfitting issue induced by few-shot images. To deal +with catastrophic forgetting amongst past learned styles, we devise a dual +regularization for the shared-LoRA module to optimize the direction of the model +update, which regularizes the diffusion model from both the weight and feature +aspects, respectively. Meanwhile, a unique token embedding corresponding to +this new style is learned by a task-wise token learning module, which preserves +historical knowledge from past styles within the limits of the LoRA +parameter quantity. As new user-provided styles arrive, our MuseumMaker can +capture the nuances of the new styles while maintaining the details of learned +styles. Experimental results on diverse style datasets validate the +effectiveness of our proposed MuseumMaker method, showcasing its robustness and +versatility across various scenarios. + +&#x0D;
+
+
+
+
+ + ☆ SFMViT: SlowFast Meet ViT in Chaotic World + + +
+ The task of spatiotemporal action localization in chaotic scenes is a +challenging task toward advanced video understanding. Paving the way with +high-quality video feature extraction and enhancing the precision of +detector-predicted anchors can effectively improve model performance. To this +end, we propose a high-performance dual-stream spatiotemporal feature +extraction network SFMViT with an anchor pruning strategy. The backbone of our +SFMViT is composed of ViT and SlowFast with prior knowledge of spatiotemporal +action localization, which fully utilizes ViT's excellent global feature +extraction capabilities and SlowFast's spatiotemporal sequence modeling +capabilities. Secondly, we introduce the confidence maximum heap to prune the +anchors detected in each frame of the picture to filter out the effective +anchors. These designs enable our SFMViT to achieve a mAP of 26.62% in the +Chaotic World dataset, far exceeding existing models. Code is available at +https://github.com/jfightyr/SlowFast-Meet-ViT. + +
+
+
+
+
+ + ☆ AudioScenic: Audio-Driven Video Scene Editing + + +
+ Audio-driven visual scene editing endeavors to manipulate the visual +background while leaving the foreground content unchanged, according to the +given audio signals. Unlike current efforts focusing primarily on image +editing, audio-driven video scene editing has not been extensively addressed. +In this paper, we introduce AudioScenic, an audio-driven framework designed for +video scene editing. AudioScenic integrates audio semantics into the visual +scene through a temporal-aware audio semantic injection process. As our focus +is on background editing, we further introduce a SceneMasker module, which +maintains the integrity of the foreground content during the editing process. +AudioScenic exploits the inherent properties of audio, namely, audio magnitude +and frequency, to guide the editing process, aiming to control the temporal +dynamics and enhance the temporal consistency. First, we present an audio +Magnitude Modulator module that adjusts the temporal dynamics of the scene in +response to changes in audio magnitude, enhancing the visual dynamics. Second, +the audio Frequency Fuser module is designed to ensure temporal consistency by +aligning the frequency of the audio with the dynamics of the video scenes, thus +improving the overall temporal coherence of the edited videos. These integrated +features enable AudioScenic to not only enhance visual diversity but also +maintain temporal consistency throughout the video. We present a new metric +named temporal score for more comprehensive validation of temporal consistency. +We demonstrate substantial advancements of AudioScenic over competing methods +on DAVIS and Audioset datasets. + +
+
+
+
+
+ + ☆ Road Surface Friction Estimation for Winter Conditions Utilising General + Visual Features + + +
+ In below freezing winter conditions, road surface friction can greatly vary +based on the mixture of snow, ice, and water on the road. Friction between the +road and vehicle tyres is a critical parameter defining vehicle dynamics, and +therefore road surface friction information is essential to acquire for several +intelligent transportation applications, such as safe control of automated +vehicles or alerting drivers of slippery road conditions. This paper explores +computer vision-based evaluation of road surface friction from roadside +cameras. Previous studies have extensively investigated the application of +convolutional neural networks for the task of evaluating the road surface +condition from images. Here, we propose a hybrid deep learning architecture, +WCamNet, consisting of a pretrained visual transformer model and convolutional +blocks. The motivation of the architecture is to combine general visual +features provided by the transformer model, as well as finetuned feature +extraction properties of the convolutional blocks. To benchmark the approach, +an extensive dataset was gathered from national Finnish road infrastructure +network of roadside cameras and optical road surface friction sensors. Acquired +results highlight that the proposed WCamNet outperforms previous approaches in +the task of predicting the road surface friction from the roadside camera +images. + +
+
+
+
+
+ + ☆ Multi-Scale Representations by Varying Window Attention for Semantic + Segmentation ICLR2024 + + +
+ Multi-scale learning is central to semantic segmentation. We visualize the +effective receptive field (ERF) of canonical multi-scale representations and +point out two risks in learning them: scale inadequacy and field inactivation. +A novel multi-scale learner, varying window attention (VWA), is presented to +address these issues. VWA leverages the local window attention (LWA) and +disentangles LWA into the query window and context window, allowing the +context's scale to vary for the query to learn representations at multiple +scales. However, varying the context to large-scale windows (enlarging ratio R) +can significantly increase the memory footprint and computation cost (R^2 times +larger than LWA). We propose a simple but professional re-scaling strategy to +zero the extra induced cost without compromising performance. Consequently, VWA +uses the same cost as LWA to overcome the receptive limitation of the local +window. Furthermore, depending on VWA and employing various MLPs, we introduce +a multi-scale decoder (MSD), VWFormer, to improve multi-scale representations +for semantic segmentation. VWFormer achieves efficiency competitive with the +most compute-friendly MSDs, like FPN and MLP decoder, but performs much better +than any MSDs. For instance, using nearly half of UPerNet's computation, +VWFormer outperforms it by 1.0%-2.5% mIoU on ADE20K. With little extra +overhead, ~10G FLOPs, Mask2Former armed with VWFormer improves by 1.0%-1.3%. + +
+
+ comment: ICLR2024 Poster +
+
+
+
+
+ + ☆ MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth + Estimation of Endoscopic Images + + +
+ Photometric constraint is indispensable for self-supervised monocular depth +estimation. It involves warping a source image onto a target view using +estimated depth&pose, and then minimizing the difference between the warped and +target images. However, the endoscopic built-in light causes significant +brightness fluctuations, and thus makes the photometric constraint unreliable. +Previous efforts only mitigate this by relying on extra models to calibrate image +brightness. In this paper, we propose MonoPCC to address the brightness +inconsistency radically by reshaping the photometric constraint into a cycle +form. Instead of only warping the source image, MonoPCC constructs a closed +loop consisting of two opposite forward-backward warping paths: from target to +source and then back to target. Thus, the target image finally receives an +image cycle-warped from itself, which naturally makes the constraint invariant +to brightness changes. Moreover, MonoPCC transplants the source image's +phase-frequency into the intermediate warped image to avoid structure loss, and +also stabilizes the training via an exponential moving average (EMA) strategy +to avoid frequent changes in the forward warping. The comprehensive and +extensive experimental results on three datasets demonstrate that our proposed +MonoPCC shows great robustness to the brightness inconsistency, and exceeds +other state-of-the-art methods by reducing the absolute relative error by at least +7.27%. + +&#x0D;
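+ A schematic of the cycle-form photometric constraint: the target image is warped to the
+source view and back, so the loss compares two versions of the same image and brightness
+changes largely cancel. The identity sampling grids below merely stand in for the
+depth-and-pose warps; they are not MonoPCC's warping module.
+
+import torch
+import torch.nn.functional as F
+
+def cycle_photometric_loss(target, grid_target_to_source, grid_source_to_target):
+    """Warp target -> source view -> back to target view, then compare with the
+    original target so both sides of the loss come from the same image."""
+    warped_to_source = F.grid_sample(target, grid_target_to_source, align_corners=True)
+    cycle_warped = F.grid_sample(warped_to_source, grid_source_to_target, align_corners=True)
+    return (cycle_warped - target).abs().mean()
+
+B, C, H, W = 1, 3, 32, 32
+target = torch.rand(B, C, H, W)
+ys, xs = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
+identity_grid = torch.stack([xs, ys], dim=-1).unsqueeze(0)      # (1, H, W, 2)
+print(cycle_photometric_loss(target, identity_grid, identity_grid).item())   # ~0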
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Research on geometric figure classification algorithm based on Deep + Learning + + +
+ In recent years, with the rapid development of computer information +technology, the development of artificial intelligence has been accelerating. +Traditional geometry recognition technology is relatively outdated and its +recognition rate is low. In the face of massive information databases, +traditional algorithm models inevitably suffer from low recognition +accuracy and poor performance. Deep learning theory has gradually become a very +important part of machine learning, and the implementation of convolutional neural +networks (CNNs) reduces the difficulty of graphics generation algorithms. In this +paper, by exploiting the LeNet-5 architecture's advantages of weight sharing and +combined feature extraction and classification, the proposed geometric pattern +recognition model trains faster on the training data set. By constructing the shared +feature parameters of the model, the cross-entropy loss function is +used in the recognition process to improve the generalization of the model and +the average recognition accuracy on the test data set. + +&#x0D;
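+ A LeNet-5-style network trained with a cross-entropy loss, as a generic sketch of the kind of
+model the abstract describes; the 32x32 input size and the set of figure classes are
+assumptions.
+
+import torch
+import torch.nn as nn
+
+class LeNet5(nn.Module):
+    """LeNet-5-style CNN with shared convolutional weights for figure classification."""
+    def __init__(self, num_classes=4, in_channels=1):   # e.g. circle/square/triangle/ellipse
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(in_channels, 6, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(6, 16, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
+        )
+        self.classifier = nn.Sequential(
+            nn.Flatten(), nn.Linear(16 * 5 * 5, 120), nn.ReLU(),
+            nn.Linear(120, 84), nn.ReLU(), nn.Linear(84, num_classes),
+        )
+
+    def forward(self, x):
+        return self.classifier(self.features(x))
+
+model = LeNet5()
+criterion = nn.CrossEntropyLoss()             # the cross-entropy loss mentioned above
+images = torch.randn(8, 1, 32, 32)            # placeholder 32x32 grayscale figures
+labels = torch.randint(0, 4, (8,))
+loss = criterion(model(images), labels)
+loss.backward()
+print(loss.item())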
+
+ comment: 6 pages,9 figures +
+
+
+
+
+ + ☆ DeepKalPose: An Enhanced Deep-Learning Kalman Filter for Temporally + Consistent Monocular Vehicle Pose Estimation + + +
+ This paper presents DeepKalPose, a novel approach for enhancing temporal +consistency in monocular vehicle pose estimation applied on video through a +deep-learning-based Kalman Filter. By integrating a Bi-directional Kalman +filter strategy utilizing forward and backward time-series processing, combined +with a learnable motion model to represent complex motion patterns, our method +significantly improves pose accuracy and robustness across various conditions, +particularly for occluded or distant vehicles. Experimental validation on the +KITTI dataset confirms that DeepKalPose outperforms existing methods in both +pose accuracy and temporal consistency. + +
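+ A toy 1-D constant-velocity Kalman filter run forward and backward over a noisy pose signal
+and averaged, only to illustrate the bi-directional idea; DeepKalPose replaces the fixed
+motion model below with a learnable one, and the noise settings here are arbitrary.
+
+import numpy as np
+
+def kalman_1d(z, q=1e-3, r=1e-1):
+    """Constant-velocity Kalman filter over noisy 1-D measurements z."""
+    F = np.array([[1.0, 1.0], [0.0, 1.0]])     # state: [position, velocity]
+    H = np.array([[1.0, 0.0]])
+    Q, R = q * np.eye(2), np.array([[r]])
+    x, P = np.array([z[0], 0.0]), np.eye(2)
+    out = []
+    for zk in z:
+        x, P = F @ x, F @ P @ F.T + Q                      # predict
+        K = P @ H.T @ np.linalg.inv(H @ P @ H.T + R)       # Kalman gain
+        x = x + K @ (np.array([zk]) - H @ x)               # update
+        P = (np.eye(2) - K @ H) @ P
+        out.append(x[0])
+    return np.array(out)
+
+def bidirectional_filter(z):
+    """Average a forward pass and a time-reversed backward pass for consistency."""
+    return 0.5 * (kalman_1d(z) + kalman_1d(z[::-1])[::-1])
+
+noisy_yaw = np.sin(np.linspace(0, 3, 60)) + 0.1 * np.random.randn(60)
+print(bidirectional_filter(noisy_yaw)[:5])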
+
+ comment: 4 pages, 3 Figures, published to IET Electronic Letters +
+
+
+
+
+ + ☆ Energy-Latency Manipulation of Multi-modal Large Language Models via + Verbose Samples + + +
+ Despite the exceptional performance of multi-modal large language models +(MLLMs), their deployment requires substantial computational resources. Once +malicious users induce high energy consumption and latency time (energy-latency +cost), it will exhaust computational resources and harm availability of +service. In this paper, we investigate this vulnerability for MLLMs, +particularly image-based and video-based ones, and aim to induce high +energy-latency cost during inference by crafting an imperceptible perturbation. +We find that high energy-latency cost can be manipulated by maximizing the +length of generated sequences, which motivates us to propose verbose samples, +including verbose images and videos. Concretely, two modality non-specific +losses are proposed, including a loss to delay end-of-sequence (EOS) token and +an uncertainty loss to increase the uncertainty over each generated token. In +addition, improving diversity is important to encourage longer responses by +increasing the complexity, which inspires the following modality specific loss. +For verbose images, a token diversity loss is proposed to promote diverse +hidden states. For verbose videos, a frame feature diversity loss is proposed +to increase the feature diversity among frames. To balance these losses, we +propose a temporal weight adjustment algorithm. Experiments demonstrate that +our verbose samples can largely extend the length of generated sequences. + +
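+ A schematic of the two modality non-specific losses described above, written for a single
+sequence of decoder logits: suppress the end-of-sequence token's probability and raise
+per-step entropy. The sign conventions, weights, and vocabulary size are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def verbose_sample_losses(logits, eos_token_id, w_eos=1.0, w_unc=0.1):
+    """Encourage longer outputs by (i) pushing down the EOS log-probability and
+    (ii) pushing up the entropy (uncertainty) of every generated step."""
+    log_probs = F.log_softmax(logits, dim=-1)            # (steps, vocab)
+    eos_term = log_probs[:, eos_token_id].mean()         # to be minimized
+    probs = log_probs.exp()
+    entropy = -(probs * log_probs).sum(dim=-1).mean()    # to be maximized
+    return w_eos * eos_term - w_unc * entropy            # minimized by the perturbation
+
+logits = torch.randn(20, 32000, requires_grad=True)      # placeholder decoder logits
+loss = verbose_sample_losses(logits, eos_token_id=2)
+loss.backward()                   # gradients for crafting the imperceptible perturbation
+print(loss.item())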
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.11170 +
+
+
+
+
+ + ☆ Conditional Distribution Modelling for Few-Shot Image Synthesis with + Diffusion Models + + +
+ Few-shot image synthesis entails generating diverse and realistic images of +novel categories using only a few example images. While multiple recent efforts +in this direction have achieved impressive results, the existing approaches are +dependent only upon the few novel samples available at test time in order to +generate new images, which restricts the diversity of the generated images. To +overcome this limitation, we propose Conditional Distribution Modelling (CDM) +-- a framework which effectively utilizes Diffusion models for few-shot image +generation. By modelling the distribution of the latent space used to condition +a Diffusion process, CDM leverages the learnt statistics of the training data +to get a better approximation of the unseen class distribution, thereby +removing the bias arising due to limited number of few shot samples. +Simultaneously, we devise a novel inversion based optimization strategy that +further improves the approximated unseen class distribution, and ensures the +fidelity of the generated samples to the unseen class. The experimental results +on four benchmark datasets demonstrate the effectiveness of our proposed CDM +for few-shot generation. + +
+
+
+
+
+ + ☆ Efficient Solution of Point-Line Absolute Pose CVPR 2024 + + +
+ We revisit certain problems of pose estimation based on 3D--2D +correspondences between features which may be points or lines. Specifically, we +address the two previously-studied minimal problems of estimating camera +extrinsics from $p \in \{ 1, 2 \}$ point--point correspondences and $l=3-p$ +line--line correspondences. To the best of our knowledge, all of the +previously-known practical solutions to these problems required computing the +roots of degree $\ge 4$ (univariate) polynomials when $p=2$, or degree $\ge 8$ +polynomials when $p=1.$ We describe and implement two elementary solutions +which reduce the degrees of the needed polynomials from $4$ to $2$ and from $8$ +to $4$, respectively. We show experimentally that the resulting solvers are +numerically stable and fast: when compared to the previous state-of-the art, we +may obtain nearly an order of magnitude speedup. The code is available at +\url{https://github.com/petrhruby97/efficient\_absolute} + +
+
+ comment: CVPR 2024, 11 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ Cross-Domain Spatial Matching for Camera and Radar Sensor Data Fusion in + Autonomous Vehicle Perception System + + +
+ In this paper, we propose a novel approach to address the problem of camera +and radar sensor fusion for 3D object detection in autonomous vehicle +perception systems. Our approach builds on recent advances in deep learning and +leverages the strengths of both sensors to improve object detection +performance. Precisely, we extract 2D features from camera images using a +state-of-the-art deep learning architecture and then apply a novel Cross-Domain +Spatial Matching (CDSM) transformation method to convert these features into 3D +space. We then fuse them with extracted radar data using a complementary fusion +strategy to produce a final 3D object representation. To demonstrate the +effectiveness of our approach, we evaluate it on the NuScenes dataset. We +compare our approach to both single-sensor performance and current +state-of-the-art fusion methods. Our results show that the proposed approach +achieves superior performance over single-sensor solutions and could directly +compete with other top-level fusion methods. + +
+
+ comment: 12 pages including highlights and graphical abstract, submitted to + Expert Systems with Applications journal +
+
+
+
+
+ + ☆ OpenDlign: Enhancing Open-World 3D Learning with Depth-Aligned Images + + +
+ Recent advances in Vision and Language Models (VLMs) have improved open-world +3D representation, facilitating 3D zero-shot capability in unseen categories. +Existing open-world methods pre-train an extra 3D encoder to align features +from 3D data (e.g., depth maps or point clouds) with CAD-rendered images and +corresponding texts. However, the limited color and texture variations in CAD +images can compromise the alignment robustness. Furthermore, the volume +discrepancy between pre-training datasets of the 3D encoder and VLM leads to +sub-optimal 2D to 3D knowledge transfer. To overcome these issues, we propose +OpenDlign, a novel framework for learning open-world 3D representations, that +leverages depth-aligned images generated from point cloud-projected depth maps. +Unlike CAD-rendered images, our generated images provide rich, realistic color +and texture diversity while preserving geometric and semantic consistency with +the depth maps. OpenDlign also optimizes depth map projection and integrates +depth-specific text prompts, improving 2D VLM knowledge adaptation for 3D +learning efficient fine-tuning. Experimental results show that OpenDlign +significantly outperforms existing benchmarks in zero-shot and few-shot 3D +tasks, exceeding prior scores by 8.0% on ModelNet40 and 16.4% on OmniObject3D +with just 6 million tuned parameters. Moreover, integrating generated +depth-aligned images into existing 3D learning pipelines consistently improves +their performance. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ 3D Face Modeling via Weakly-supervised Disentanglement Network joint + Identity-consistency Prior + + +
+ Generative 3D face models featuring disentangled controlling factors hold +immense potential for diverse applications in computer vision and computer +graphics. However, previous 3D face modeling methods face a challenge as they +demand specific labels to effectively disentangle these factors. This becomes +particularly problematic when integrating multiple 3D face datasets to improve +the generalization of the model. Addressing this issue, this paper introduces a +Weakly-Supervised Disentanglement Framework, denoted as WSDF, to facilitate the +training of controllable 3D face models without an overly stringent labeling +requirement. Adhering to the paradigm of Variational Autoencoders (VAEs), the +proposed model achieves disentanglement of identity and expression controlling +factors through a two-branch encoder equipped with dedicated +identity-consistency prior. It then faithfully re-entangles these factors via a +tensor-based combination mechanism. Notably, the introduction of the Neutral +Bank allows precise acquisition of subject-specific information using only +identity labels, thereby averting degeneration due to insufficient supervision. +Additionally, the framework incorporates a label-free second-order loss +function for the expression factor to regulate deformation space and eliminate +extraneous information, resulting in enhanced disentanglement. Extensive +experiments have been conducted to substantiate the superior performance of +WSDF. Our code is available at https://github.com/liguohao96/WSDF. + +
+
+
+
+
+ + ☆ Vision-based robot manipulation of transparent liquid containers in a + laboratory setting + + +
+ Laboratory processes involving small volumes of solutions and active +ingredients are often performed manually due to challenges in automation, such +as high initial costs, semi-structured environments and protocol variability. +In this work, we develop a flexible and cost-effective approach to address this +gap by introducing a vision-based system for liquid volume estimation and a +simulation-driven pouring method particularly designed for containers with +small openings. We evaluate both components individually, followed by an +applied real-world integration of cell culture automation using a UR5 robotic +arm. Our work is fully reproducible: we share our code at +\url{https://github.com/DaniSchober/LabLiquidVision} and the newly introduced +dataset LabLiquidVolume is available at +https://data.dtu.dk/articles/dataset/LabLiquidVision/25103102. + +&#x0D;
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Interactive3D: Create What You Want by Interactive 3D Generation + + +
+ 3D object generation has undergone significant advancements, yielding +high-quality results. However, current methods fall short of achieving precise user control, +often yielding results that do not align with user expectations, thus limiting +their applicability. User-envisioned 3D object generation faces significant +challenges in realizing its concepts using current generative models due to +limited interaction capabilities. Existing methods mainly offer two approaches: +(i) interpreting textual instructions with constrained controllability, or (ii) +reconstructing 3D objects from 2D images. Both of them limit customization to +the confines of the 2D reference and potentially introduce undesirable +artifacts during the 3D lifting process, restricting the scope for direct and +versatile 3D modifications. In this work, we introduce Interactive3D, an +innovative framework for interactive 3D generation that grants users precise +control over the generative process through extensive 3D interaction +capabilities. Interactive3D is constructed in two cascading stages, utilizing +distinct 3D representations. The first stage employs Gaussian Splatting for +direct user interaction, allowing modifications and guidance of the generative +direction at any intermediate step through (i) Adding and Removing components, +(ii) Deformable and Rigid Dragging, (iii) Geometric Transformations, and (iv) +Semantic Editing. Subsequently, the Gaussian splats are transformed into +InstantNGP. We introduce a novel (v) Interactive Hash Refinement module to +further add details and extract the geometry in the second stage. Our +experiments demonstrate that Interactive3D markedly improves the +controllability and quality of 3D generation. Our project webpage is available +at \url{https://interactive-3d.github.io/}. + +&#x0D;
+
+ comment: project page: https://interactive-3d.github.io/ +
+
+
+
+
+ + ☆ Semantic-aware Next-Best-View for Multi-DoFs Mobile System in + Search-and-Acquisition based Visual Perception + + +
+ Efficient visual perception using mobile systems is crucial, particularly in +unknown environments such as search and rescue operations, where swift and +comprehensive perception of objects of interest is essential. In such +real-world applications, objects of interest are often situated in complex +environments, making the selection of the 'Next Best' view based solely on +maximizing visibility gain suboptimal. Semantics, providing a higher-level +interpretation of perception, should significantly contribute to the selection +of the next viewpoint for various perception tasks. In this study, we formulate +a novel information gain that integrates both visibility gain and semantic gain +in a unified form to select the semantic-aware Next-Best-View. Additionally, we +design an adaptive strategy with termination criterion to support a two-stage +search-and-acquisition manoeuvre on multiple objects of interest aided by a +multi-degree-of-freedoms (Multi-DoFs) mobile system. Several semantically +relevant reconstruction metrics, including perspective directivity and region +of interest (ROI)-to-full reconstruction volume ratio, are introduced to +evaluate the performance of the proposed approach. Simulation experiments +demonstrate the advantages of the proposed approach over existing methods, +achieving improvements of up to 27.13% for the ROI-to-full reconstruction +volume ratio and a 0.88234 average perspective directivity. Furthermore, the +planned motion trajectory exhibits better perceiving coverage toward the +target. + +
+
+
+
+
+ + ☆ 360SFUDA++: Towards Source-free UDA for Panoramic Segmentation by + Learning Reliable Category Prototypes + + +
+ In this paper, we address the challenging source-free unsupervised domain +adaptation (SFUDA) for pinhole-to-panoramic semantic segmentation, given only a +pinhole image pre-trained model (i.e., source) and unlabeled panoramic images +(i.e., target). Tackling this problem is non-trivial due to three critical +challenges: 1) semantic mismatches from the distinct Field-of-View (FoV) +between domains, 2) style discrepancies inherent in the UDA problem, and 3) +inevitable distortion of the panoramic images. To tackle these problems, we +propose 360SFUDA++ that effectively extracts knowledge from the source pinhole +model with only unlabeled panoramic images and transfers the reliable knowledge +to the target panoramic domain. Specifically, we first utilize Tangent +Projection (TP) as it has less distortion and meanwhile splits the +equirectangular projection (ERP) into patches with a fixed FoV projection (FFP) to +mimic the pinhole images. Both projections are shown effective in extracting +knowledge from the source model. However, as the distinct projections make it +less possible to directly transfer knowledge between domains, we then propose the +Reliable Panoramic Prototype Adaptation Module (RP2AM) to transfer knowledge at +both prediction and prototype levels. RP2AM selects the confident knowledge +and integrates panoramic prototypes for reliable knowledge adaptation. +Moreover, we introduce Cross-projection Dual Attention Module (CDAM), which +better aligns the spatial and channel characteristics across projections at the +feature level between domains. Both knowledge extraction and transfer processes +are synchronously updated to reach the best performance. Extensive experiments +on the synthetic and real-world benchmarks, including outdoor and indoor +scenarios, demonstrate that our 360SFUDA++ achieves significantly better +performance than prior SFUDA methods. + +&#x0D;
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.12505 +
+
+
+
+
+ + ☆ Commonsense Prototype for Outdoor Unsupervised 3D Object Detection CVPR 2024 + + +
+ The prevalent approaches of unsupervised 3D object detection follow +cluster-based pseudo-label generation and iterative self-training processes. +However, the challenge arises due to the sparsity of LiDAR scans, which leads +to pseudo-labels with erroneous size and position, resulting in subpar +detection performance. To tackle this problem, this paper introduces a +Commonsense Prototype-based Detector, termed CPD, for unsupervised 3D object +detection. CPD first constructs Commonsense Prototype (CProto) characterized by +high-quality bounding box and dense points, based on commonsense intuition. +Subsequently, CPD refines the low-quality pseudo-labels by leveraging the size +prior from CProto. Furthermore, CPD enhances the detection accuracy of sparsely +scanned objects by the geometric knowledge from CProto. CPD outperforms +state-of-the-art unsupervised 3D detectors on Waymo Open Dataset (WOD), +PandaSet, and KITTI datasets by a large margin. Besides, by training CPD on WOD +and testing on KITTI, CPD attains 90.85% and 81.01% 3D Average Precision on +easy and moderate car classes, respectively. These achievements position CPD in +close proximity to fully supervised detectors, highlighting the significance of +our method. The code will be available at https://github.com/hailanyi/CPD. + +
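As a rough illustration of the size-prior refinement idea, the sketch below replaces a pseudo-label's noisy box size with a class prototype's size while keeping the estimated center and heading; the prototype table and box layout are assumptions, not CPD's actual update rule:

```python
import numpy as np

# Hypothetical commonsense prototypes: class -> (length, width, height) in metres.
CPROTO_SIZE = {"car": (4.6, 1.9, 1.7), "pedestrian": (0.8, 0.8, 1.7)}

def refine_pseudo_label(box, cls):
    """Replace the noisy size of a cluster-based pseudo-label with the
    prototype size, keeping the estimated center and heading.

    `box` is (x, y, z, l, w, h, yaw); the prototype table above is made up
    for illustration and is not taken from the paper.
    """
    x, y, z, _, _, _, yaw = box
    l, w, h = CPROTO_SIZE[cls]
    return np.array([x, y, z, l, w, h, yaw])

print(refine_pseudo_label((10.0, 2.0, -1.0, 3.1, 1.2, 1.4, 0.3), "car"))
```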
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 + Challenge Survey CVPR 2024 + + +
+ This paper introduces a novel benchmark as part of the AIS 2024 Real-Time
+Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed
+images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs.
+For this, we use a diverse test set containing a variety of 4K images ranging
+from digital art to gaming and photography. The images are compressed using the
+modern AVIF codec, instead of JPEG. All the proposed methods improve PSNR
+fidelity over Lanczos interpolation and process images in under 10 ms. Out of
+the 160 participants, 25 teams submitted their code and models. The solutions
+present novel designs tailored for memory-efficiency and runtime on edge
+devices. This survey describes the best solutions for real-time SR of
+compressed high-resolution images.
+
+
+
+ comment: CVPR 2024, AI for Streaming (AIS) Workshop +
+
+
+
+
+ + ☆ CoCoG: Controllable Visual Stimuli Generation based on Human Concept + Representations + + +
+ A central question for cognitive science is to understand how humans process
+visual objects, i.e., to uncover the human low-dimensional concept
+representation space from high-dimensional visual stimuli. Generating visual
+stimuli with controllable concepts is the key. However, there are currently no
+generative models in AI to solve this problem. Here, we present the Concept
+based Controllable Generation (CoCoG) framework. CoCoG consists of two
+components, a simple yet efficient AI agent for extracting interpretable
+concepts and predicting human decision-making in visual similarity judgment
+tasks, and a conditional generation model for generating visual stimuli given
+the concepts. We quantify the performance of CoCoG from two aspects, the human
+behavior prediction accuracy and the controllable generation ability. The
+experiments with CoCoG indicate that 1) the reliable concept embeddings in
+CoCoG allow human behavior to be predicted with 64.07\% accuracy on the
+THINGS-similarity dataset; 2) CoCoG can generate diverse objects through the
+control of concepts; 3) CoCoG can manipulate human similarity judgment behavior
+by intervening on key concepts. CoCoG offers visual objects generated under
+concept control to advance our understanding of causality in human cognition.
+The code of CoCoG is available at \url{https://github.com/ncclab-sustech/CoCoG}.
+
+
+
+
+
+
+ + ☆ DiffSeg: A Segmentation Model for Skin Lesions Based on Diffusion + Difference + + +
+ Weakly supervised medical image segmentation (MIS) using generative models is
+crucial for clinical diagnosis. However, the accuracy of the segmentation
+results is often limited by insufficient supervision and the complex nature of
+medical imaging. Existing models also only provide a single outcome, which does
+not allow for the measurement of uncertainty. In this paper, we introduce
+DiffSeg, a segmentation model for skin lesions based on diffusion difference,
+which exploits diffusion model principles to extract noise-based features from
+images with diverse semantic information. By discerning differences between
+these noise features, the model identifies diseased areas. Moreover, its
+multi-output capability mimics doctors' annotation behavior, facilitating the
+visualization of segmentation result consistency and ambiguity. Additionally,
+it quantifies output uncertainty using Generalized Energy Distance (GED),
+aiding interpretability and decision-making for physicians. Finally, the model
+integrates outputs through the Dense Conditional Random Field (DenseCRF)
+algorithm to refine the segmentation boundaries by considering inter-pixel
+correlations, which improves the accuracy and optimizes the segmentation
+results. We demonstrate the effectiveness of DiffSeg on the ISIC 2018 Challenge
+dataset, outperforming state-of-the-art U-Net-based methods.
+
+
+
+
+
+
+ + ☆ COBRA -- COnfidence score Based on shape Regression Analysis for + method-independent quality assessment of object pose estimation from single + images + + +
+ We present a generic algorithm for scoring pose estimation methods that rely +on single image semantic analysis. The algorithm employs a lightweight putative +shape representation using a combination of multiple Gaussian Processes. Each +Gaussian Process (GP) yields distance normal distributions from multiple +reference points in the object's coordinate system to its surface, thus +providing a geometric evaluation framework for scoring predicted poses. Our +confidence measure comprises the average mixture probability of pixel +back-projections onto the shape template. In the reported experiments, we +compare the accuracy of our GP based representation of objects versus the +actual geometric models and demonstrate the ability of our method to capture +the influence of outliers as opposed to the corresponding intrinsic measures +that ship with the segmentation and pose estimation methods. + +
+
+
+
+
+ + ☆ Correlation-Decoupled Knowledge Distillation for Multimodal Sentiment + Analysis with Incomplete Modalities + + +
+ Multimodal sentiment analysis (MSA) aims to understand human sentiment +through multimodal data. Most MSA efforts are based on the assumption of +modality completeness. However, in real-world applications, some practical +factors cause uncertain modality missingness, which drastically degrades the +model's performance. To this end, we propose a Correlation-decoupled Knowledge +Distillation (CorrKD) framework for the MSA task under uncertain missing +modalities. Specifically, we present a sample-level contrastive distillation +mechanism that transfers comprehensive knowledge containing cross-sample +correlations to reconstruct missing semantics. Moreover, a category-guided +prototype distillation mechanism is introduced to capture cross-category +correlations using category prototypes to align feature distributions and +generate favorable joint representations. Eventually, we design a +response-disentangled consistency distillation strategy to optimize the +sentiment decision boundaries of the student network through response +disentanglement and mutual information maximization. Comprehensive experiments +on three datasets indicate that our framework can achieve favorable +improvements compared with several baselines. + +
+
+
+
+
+ + ☆ PAD: Patch-Agnostic Defense against Adversarial Patch Attacks CVPR 2024 + + +
+ Adversarial patch attacks present a significant threat to real-world object
+detectors due to their practical feasibility. Existing defense methods, which
+rely on attack data or prior knowledge, struggle to effectively address a wide
+range of adversarial patches. In this paper, we show two inherent
+characteristics of adversarial patches, semantic independence and spatial
+heterogeneity, independent of their appearance, shape, size, quantity, and
+location. Semantic independence indicates that adversarial patches operate
+autonomously within their semantic context, while spatial heterogeneity
+manifests as the distinct image quality of the patch area, which differs from
+the original clean image due to the independent generation process. Based on
+these observations, we propose PAD, a novel adversarial patch localization and
+removal method that does not require prior knowledge or additional training.
+PAD offers patch-agnostic defense against various adversarial patches,
+compatible with any pre-trained object detectors. Our comprehensive digital and
+physical experiments involving diverse patch types, such as localized noise,
+printable, and naturalistic patches, exhibit notable improvements over
+state-of-the-art works. Our code is available at
+https://github.com/Lihua-Jing/PAD.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Latent Modulated Function for Computational Optimal Continuous Image + Representation + + +
+ The recent work Local Implicit Image Function (LIIF) and subsequent Implicit
+Neural Representation (INR) based works have achieved remarkable success in
+Arbitrary-Scale Super-Resolution (ASSR) by using an MLP to decode
+Low-Resolution (LR) features. However, these continuous image representations
+typically implement decoding in High-Resolution (HR) High-Dimensional (HD)
+space, leading to a quadratic increase in computational cost and seriously
+hindering the practical applications of ASSR. To tackle this problem, we
+propose a novel Latent Modulated Function (LMF), which decouples the HR-HD
+decoding process into shared latent decoding in LR-HD space and independent
+rendering in HR Low-Dimensional (LD) space, thereby realizing the first
+computationally optimal paradigm of continuous image representation.
+Specifically, LMF utilizes an HD MLP in latent space to generate latent
+modulations of each LR feature vector. This enables a modulated LD MLP in
+render space to quickly adapt to any input feature vector and perform rendering
+at arbitrary resolution. Furthermore, we leverage the positive correlation
+between modulation intensity and input image complexity to design a
+Controllable Multi-Scale Rendering (CMSR) algorithm, offering the flexibility
+to adjust the decoding efficiency based on the rendering precision. Extensive
+experiments demonstrate that converting existing INR-based ASSR methods to LMF
+can reduce the computational cost by up to 99.9%, accelerate inference by up to
+57 times, and save up to 76% of parameters, while maintaining competitive
+performance. The code is available at https://github.com/HeZongyao/LMF.
+
+
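The decoupling can be pictured as a large MLP run once per low-resolution feature to produce a modulation, and a small MLP evaluated per high-resolution query. A minimal PyTorch sketch under that reading; the layer sizes, the FiLM-style scale/shift modulation, and the coordinate input are assumptions for illustration, not the paper's exact design:

```python
import torch
import torch.nn as nn

class LatentModulatedFunction(nn.Module):
    """Sketch: a high-dimensional latent MLP emits per-feature modulations
    that condition a small low-dimensional render MLP evaluated per pixel."""
    def __init__(self, feat_dim=64, latent_dim=256, render_dim=16):
        super().__init__()
        # Run once per LR feature vector (LR-HD space).
        self.latent_mlp = nn.Sequential(
            nn.Linear(feat_dim, latent_dim), nn.ReLU(),
            nn.Linear(latent_dim, 2 * render_dim))
        # Run per HR query coordinate (HR-LD space).
        self.render_mlp = nn.Sequential(nn.Linear(2, render_dim), nn.ReLU())
        self.to_rgb = nn.Linear(render_dim, 3)

    def forward(self, lr_feat, coords):
        # lr_feat: (N, feat_dim); coords: (N, Q, 2) local HR coordinates.
        scale, shift = self.latent_mlp(lr_feat).chunk(2, dim=-1)
        h = self.render_mlp(coords)                       # (N, Q, render_dim)
        h = h * scale.unsqueeze(1) + shift.unsqueeze(1)   # FiLM-style modulation
        return self.to_rgb(h)                             # (N, Q, 3) RGB values

rgb = LatentModulatedFunction()(torch.randn(8, 64), torch.rand(8, 100, 2))
```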
+
+
+
+
+ + ☆ Point-JEPA: A Joint Embedding Predictive Architecture for + Self-Supervised Learning on Point Cloud + + +
+ Recent advancements in self-supervised learning in the point cloud domain
+have demonstrated significant potential. However, these methods often suffer
+from drawbacks, including lengthy pre-training time, the necessity of
+reconstruction in the input space, or the necessity of additional modalities.
+In order to address these issues, we introduce Point-JEPA, a joint embedding
+predictive architecture designed specifically for point cloud data. To this
+end, we introduce a sequencer that orders point cloud tokens to efficiently
+compute and utilize token proximity based on their indices during target and
+context selection. The sequencer also allows shared computation of token
+proximity between context and target selection, further improving the
+efficiency. Experimentally, our method achieves competitive results with
+state-of-the-art methods while avoiding reconstruction in the input space or
+additional modalities.
+
+
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Depth Supervised Neural Surface Reconstruction from Airborne Imagery + + +
+ While originally developed for novel view synthesis, Neural Radiance Fields +(NeRFs) have recently emerged as an alternative to multi-view stereo (MVS). +Triggered by a manifold of research activities, promising results have been +gained especially for texture-less, transparent, and reflecting surfaces, while +such scenarios remain challenging for traditional MVS-based approaches. +However, most of these investigations focus on close-range scenarios, with +studies for airborne scenarios still missing. For this task, NeRFs face +potential difficulties at areas of low image redundancy and weak data evidence, +as often found in street canyons, facades or building shadows. Furthermore, +training such networks is computationally expensive. Thus, the aim of our work +is twofold: First, we investigate the applicability of NeRFs for aerial image +blocks representing different characteristics like nadir-only, oblique and +high-resolution imagery. Second, during these investigations we demonstrate the +benefit of integrating depth priors from tie-point measures, which are provided +during presupposed Bundle Block Adjustment. Our work is based on the +state-of-the-art framework VolSDF, which models 3D scenes by signed distance +functions (SDFs), since this is more applicable for surface reconstruction +compared to the standard volumetric representation in vanilla NeRFs. For +evaluation, the NeRF-based reconstructions are compared to results of a +publicly available benchmark dataset for airborne images. + +
+
+
+
+
+ + ☆ Neural Assembler: Learning to Generate Fine-Grained Robotic Assembly + Instructions from Multi-View Images + + +
+ Image-guided object assembly represents a burgeoning research topic in +computer vision. This paper introduces a novel task: translating multi-view +images of a structural 3D model (for example, one constructed with building +blocks drawn from a 3D-object library) into a detailed sequence of assembly +instructions executable by a robotic arm. Fed with multi-view images of the +target 3D model for replication, the model designed for this task must address +several sub-tasks, including recognizing individual components used in +constructing the 3D model, estimating the geometric pose of each component, and +deducing a feasible assembly order adhering to physical rules. Establishing +accurate 2D-3D correspondence between multi-view images and 3D objects is +technically challenging. To tackle this, we propose an end-to-end model known +as the Neural Assembler. This model learns an object graph where each vertex +represents recognized components from the images, and the edges specify the +topology of the 3D model, enabling the derivation of an assembly plan. We +establish benchmarks for this task and conduct comprehensive empirical +evaluations of Neural Assembler and alternative solutions. Our experiments +clearly demonstrate the superiority of Neural Assembler. + +
+
+
+
+
+ + ☆ Robust Fine-tuning for Pre-trained 3D Point Cloud Models + + +
+ This paper presents a robust fine-tuning method designed for pre-trained 3D +point cloud models, to enhance feature robustness in downstream fine-tuned +models. We highlight the limitations of current fine-tuning methods and the +challenges of learning robust models. The proposed method, named Weight-Space +Ensembles for Fine-Tuning then Linear Probing (WiSE-FT-LP), integrates the +original pre-training and fine-tuning models through weight space integration +followed by Linear Probing. This approach significantly enhances the +performance of downstream fine-tuned models under distribution shifts, +improving feature robustness while maintaining high performance on the target +distribution. We apply this robust fine-tuning method to mainstream 3D point +cloud pre-trained models and evaluate the quality of model parameters and the +degradation of downstream task performance. Experimental results demonstrate +the effectiveness of WiSE-FT-LP in enhancing model robustness, effectively +balancing downstream task performance and model feature robustness without +altering the model structures. + +
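The weight-space integration step is easy to make concrete. A minimal sketch, assuming it amounts to linearly interpolating matching parameters of the pre-trained and fine-tuned models before a linear probe is trained on top; the mixing coefficient and helper name are illustrative:

```python
import torch

def interpolate_weights(pretrained_sd, finetuned_sd, alpha=0.5):
    """Linearly interpolate two compatible state dicts, parameter by parameter.

    alpha=0 keeps the pre-trained weights, alpha=1 the fine-tuned ones.
    The 0.5 default is an arbitrary illustrative choice; integer buffers
    (e.g., batch-norm counters) are taken from the fine-tuned model as-is.
    """
    return {k: ((1.0 - alpha) * pretrained_sd[k] + alpha * finetuned_sd[k])
            if pretrained_sd[k].is_floating_point() else finetuned_sd[k]
            for k in pretrained_sd}

# Usage (assuming `pre` and `ft` are models sharing one architecture):
# merged_sd = interpolate_weights(pre.state_dict(), ft.state_dict(), alpha=0.5)
# backbone.load_state_dict(merged_sd)  # then train only a linear probe on top
```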
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ SynCellFactory: Generative Data Augmentation for Cell Tracking + + +
+ Cell tracking remains a pivotal yet challenging task in biomedical research.
+The full potential of deep learning for this purpose is often untapped due to
+the limited availability of comprehensive and varied training data sets. In
+this paper, we present SynCellFactory, a generative data augmentation method
+for cell videos. At the heart of SynCellFactory lies the ControlNet
+architecture, which has been fine-tuned to synthesize cell imagery with
+photorealistic accuracy in style and motion patterns. This technique enables
+the creation of synthetic yet realistic cell videos that mirror the complexity
+of authentic microscopy time-lapses. Our experiments demonstrate that
+SynCellFactory boosts the performance of well-established deep learning models
+for cell tracking, particularly when original training data is sparse.
+
+
+
+
+
+
+ + ☆ Learning Discriminative Spatio-temporal Representations for + Semi-supervised Action Recognition + + +
+ Semi-supervised action recognition aims to improve spatio-temporal reasoning
+ability with a few labeled data in conjunction with a large amount of unlabeled
+data. Albeit recent advancements, existing powerful methods are still prone to
+making ambiguous predictions under scarce labeled data, embodied as the
+limitation of distinguishing different actions with similar spatio-temporal
+information. In this paper, we approach this problem by empowering the model
+with two capabilities, namely discriminative spatial modeling and temporal
+structure modeling, for learning discriminative spatio-temporal
+representations. Specifically, we propose an Adaptive Contrastive
+Learning~(ACL) strategy. It assesses the confidence of all unlabeled samples by
+the class prototypes of the labeled data, and adaptively selects
+positive-negative samples from a pseudo-labeled sample bank to construct
+contrastive learning. Additionally, we introduce a Multi-scale Temporal
+Learning~(MTL) strategy. It highlights informative semantics from long-term
+clips and integrates them into the short-term clip while suppressing noisy
+information. Afterwards, both of these techniques are integrated in a unified
+framework to encourage the model to make accurate predictions. Extensive
+experiments on UCF101, HMDB51 and Kinetics400 show the superiority of our
+method over prior state-of-the-art approaches.
+
+
+
+ comment: 10 pages, 6 figures, 6 tables, 56 conferences +
+
+
+
+
+ + ☆ Cross-sensor super-resolution of irregularly sampled Sentinel-2 time + series + + +
+ Satellite imaging generally presents a trade-off between the frequency of +acquisitions and the spatial resolution of the images. Super-resolution is +often advanced as a way to get the best of both worlds. In this work, we +investigate multi-image super-resolution of satellite image time series, i.e. +how multiple images of the same area acquired at different dates can help +reconstruct a higher resolution observation. In particular, we extend +state-of-the-art deep single and multi-image super-resolution algorithms, such +as SRDiff and HighRes-net, to deal with irregularly sampled Sentinel-2 time +series. We introduce BreizhSR, a new dataset for 4x super-resolution of +Sentinel-2 time series using very high-resolution SPOT-6 imagery of Brittany, a +French region. We show that using multiple images significantly improves +super-resolution performance, and that a well-designed temporal positional +encoding allows us to perform super-resolution for different times of the +series. In addition, we observe a trade-off between spectral fidelity and +perceptual quality of the reconstructed HR images, questioning future +directions for super-resolution of Earth Observation data. + +
+
+
+
+
+ + ☆ Revisiting Relevance Feedback for CLIP-based Interactive Image Retrieval + + +
+ Many image retrieval studies use metric learning to train an image encoder. +However, metric learning cannot handle differences in users' preferences, and +requires data to train an image encoder. To overcome these limitations, we +revisit relevance feedback, a classic technique for interactive retrieval +systems, and propose an interactive CLIP-based image retrieval system with +relevance feedback. Our retrieval system first executes the retrieval, collects +each user's unique preferences through binary feedback, and returns images the +user prefers. Even when users have various preferences, our retrieval system +learns each user's preference through the feedback and adapts to the +preference. Moreover, our retrieval system leverages CLIP's zero-shot +transferability and achieves high accuracy without training. We empirically +show that our retrieval system competes well with state-of-the-art metric +learning in category-based image retrieval, despite not training image encoders +specifically for each dataset. Furthermore, we set up two additional +experimental settings where users have various preferences: one-label-based +image retrieval and conditioned image retrieval. In both cases, our retrieval +system effectively adapts to each user's preferences, resulting in improved +accuracy compared to image retrieval without feedback. Overall, our work +highlights the potential benefits of integrating CLIP with classic relevance +feedback techniques to enhance image retrieval. + +
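One classic way to fold binary relevance feedback into a CLIP retrieval loop is a Rocchio-style query update; the sketch below uses that as a stand-in, since the abstract does not specify the paper's exact update rule. Embeddings are assumed to be L2-normalized CLIP features.

```python
import numpy as np

def rocchio_update(query, relevant, irrelevant, alpha=1.0, beta=0.75, gamma=0.25):
    """Rocchio-style relevance feedback: move the query embedding toward
    images the user marked relevant and away from those marked irrelevant.
    The coefficients are conventional defaults, not values from the paper."""
    q = alpha * query
    if len(relevant):
        q += beta * np.mean(relevant, axis=0)
    if len(irrelevant):
        q -= gamma * np.mean(irrelevant, axis=0)
    return q / (np.linalg.norm(q) + 1e-8)

def retrieve(query, gallery, k=10):
    """Return indices of the top-k gallery embeddings by cosine similarity."""
    sims = gallery @ query
    return np.argsort(-sims)[:k]
```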
+
+ comment: 20 pages, 8 figures
+
+
+
+
+
+ + ☆ Deep Learning-based Prediction of Breast Cancer Tumor and Immune + Phenotypes from Histopathology AAAI-24 + + +
+ The interactions between tumor cells and the tumor microenvironment (TME)
+dictate therapeutic efficacy of radiation and many systemic therapies in breast
+cancer. However, to date, there is not a widely available method to
+reproducibly measure tumor and immune phenotypes for each patient's tumor.
+Given this unmet clinical need, we applied multiple instance learning (MIL)
+algorithms to assess activity of ten biologically relevant pathways from the
+hematoxylin and eosin (H&E) slide of primary breast tumors. We employed
+different feature extraction approaches and state-of-the-art model
+architectures. Using binary classification, our models attained area under the
+receiver operating characteristic (AUROC) scores above 0.70 for nearly all gene
+expression pathways and in some cases exceeded 0.80. Attention maps suggest
+that our trained models recognize biologically relevant spatial patterns of
+cell sub-populations from H&E. These efforts represent a first step towards
+developing computational H&E biomarkers that reflect facets of the TME and hold
+promise for augmenting precision oncology.
+
+
+
+ comment: Paper accepted at the First Workshop on Imageomics + (Imageomics-AAAI-24) - Discovering Biological Knowledge from Images using AI + (https://sites.google.com/vt.edu/imageomics-aaai-24/home), held as part of + the 38th Annual AAAI Conference on Artificial Intelligence + (https://aaai.org/aaai-conference/) +
+
+
+
+
+ + ☆ Promoting CNNs with Cross-Architecture Knowledge Distillation for + Efficient Monocular Depth Estimation + + +
+ Recently, the performance of monocular depth estimation (MDE) has been
+significantly boosted with the integration of transformer models. However,
+transformer models are usually computationally expensive, and their
+effectiveness in light-weight models is limited compared to convolutions. This
+limitation hinders their deployment on resource-limited devices. In this paper,
+we propose a cross-architecture knowledge distillation method for MDE, dubbed
+DisDepth, to enhance efficient CNN models with the supervision of
+state-of-the-art transformer models. Concretely, we first build a simple
+framework of convolution-based MDE, which is then enhanced with a novel
+local-global convolution module to capture both local and global information in
+the image. To effectively distill valuable information from the transformer
+teacher and bridge the gap between convolution and transformer features, we
+introduce a method to acclimate the teacher with a ghost decoder. The ghost
+decoder is a copy of the student's decoder, and adapting the teacher with the
+ghost decoder aligns the features to be student-friendly while preserving their
+original performance. Furthermore, we propose an attentive knowledge
+distillation loss that adaptively identifies features valuable for depth
+estimation. This loss guides the student to focus more on attentive regions,
+improving its performance. Extensive experiments on the KITTI and NYU Depth V2
+datasets demonstrate the effectiveness of DisDepth. Our method achieves
+significant improvements on various efficient backbones, showcasing its
+potential for efficient monocular depth estimation.
+
+
+
+
+
+
+ + ☆ Efficiency in Focus: LayerNorm as a Catalyst for Fine-tuning Medical + Visual Language Pre-trained Models + + +
+ In the realm of Medical Visual Language Models (Med-VLMs), the quest for
+universal efficient fine-tuning mechanisms remains paramount yet largely
+unexplored, especially given that researchers in interdisciplinary fields are
+often extremely short of training resources. Given the unique challenges in the
+medical domain, such as limited data scope and significant domain-specific
+requirements, evaluating and adapting Parameter-Efficient Fine-Tuning (PEFT)
+methods specifically for Med-VLMs is essential. Most current PEFT methods for
+Med-VLMs have yet to be comprehensively investigated and mainly focus on adding
+components to the model's structure or input. However, fine-tuning intrinsic
+model components often yields better generality and consistency, and its impact
+on the ultimate performance of Med-VLMs has been widely overlooked and remains
+understudied. In this paper, we endeavour to explore an alternative to
+traditional PEFT methods, especially the impact of fine-tuning LayerNorm
+layers, FFNs and Attention layers on Med-VLMs. Our comprehensive studies span
+both small-scale and large-scale Med-VLMs, evaluating their performance under
+various fine-tuning paradigms across tasks such as Medical Visual Question
+Answering and Medical Imaging Report Generation. The findings reveal unique
+insights into the effects of intrinsic parameter fine-tuning methods on
+adapting Med-VLMs to downstream tasks and show that fine-tuning solely the
+LayerNorm layers not only surpasses the efficiency of traditional PEFT methods
+but also retains the model's accuracy and generalization capabilities across a
+spectrum of medical downstream tasks. The experiments show LayerNorm
+fine-tuning's superior adaptability and scalability, particularly in the
+context of large-scale Med-VLMs.
+
+
+
+
+
+
+ + ☆ Efficient Higher-order Convolution for Small Kernels in Deep Learning + + +
+ Deep convolutional neural networks (DCNNs) are a class of artificial neural
+networks used primarily for computer vision tasks such as segmentation and
+classification. Many nonlinear operations, such as activation functions and
+pooling strategies, are used in DCNNs to enhance their ability to process
+different signals for different tasks. Conventional convolution, a linear
+filter, is the essential component of DCNNs, while nonlinear convolution is
+generally implemented as higher-order Volterra filters. However, the
+significant memory and computational costs of Volterra filtering pose a primary
+limitation to its widespread use in DCNNs. In this study, we propose a novel
+method to perform higher-order Volterra filtering with lower memory and
+computation cost in the forward and backward passes of DCNN training. The
+proposed method demonstrates computational advantages compared with
+conventional Volterra filter implementations. Furthermore, based on the
+proposed method, a new attention module called the Higher-order Local Attention
+Block (HLA) is proposed and tested on the CIFAR-100 dataset, showing
+competitive improvement for the classification task. Source code is available
+at:
+https://github.com/WinterWen666/Efficient-High-Order-Volterra-Convolution.git
+
+
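For reference, the naive second-order Volterra filter on a 1D signal, the kind of nonlinear convolution whose quadratic kernel cost motivates this work, can be written directly; this sketch shows the baseline formulation, not the paper's efficient method:

```python
import numpy as np

def volterra2(x, h1, h2):
    """Naive second-order Volterra filter:
    y[n] = sum_i h1[i] x[n-i] + sum_{i,j} h2[i,j] x[n-i] x[n-j].
    Memory and compute for h2 grow quadratically with the kernel size,
    which is the bottleneck this paper targets."""
    k = len(h1)
    x_pad = np.concatenate([np.zeros(k - 1), x])
    y = np.zeros_like(x, dtype=float)
    for n in range(len(x)):
        window = x_pad[n:n + k][::-1]            # x[n], x[n-1], ..., x[n-k+1]
        y[n] = h1 @ window + window @ h2 @ window
    return y

y = volterra2(np.random.randn(32), np.random.randn(5), np.random.randn(5, 5))
```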
+
+
+
+
+ + ☆ List Items One by One: A New Data Source and Learning Paradigm for + Multimodal LLMs + + +
+ Set-of-Mark (SoM) Prompting unleashes the visual grounding capability of +GPT-4V, by enabling the model to associate visual objects with tags inserted on +the image. These tags, marked with alphanumerics, can be indexed via text +tokens for easy reference. Despite the extraordinary performance from GPT-4V, +we observe that other Multimodal Large Language Models (MLLMs) struggle to +understand these visual tags. To promote the learning of SoM prompting for +open-source models, we propose a new learning paradigm: "list items one by +one," which asks the model to enumerate and describe all visual tags placed on +the image following the alphanumeric orders of tags. By integrating our curated +dataset with other visual instruction tuning datasets, we are able to equip +existing MLLMs with the SoM prompting ability. Furthermore, we evaluate our +finetuned SoM models on five MLLM benchmarks. We find that this new dataset, +even in a relatively small size (10k-30k images with tags), significantly +enhances visual reasoning capabilities and reduces hallucinations for MLLMs. +Perhaps surprisingly, these improvements persist even when the visual tags are +omitted from input images during inference. This suggests the potential of +"list items one by one" as a new paradigm for training MLLMs, which strengthens +the object-text alignment through the use of visual tags in the training stage. +Finally, we conduct analyses by probing trained models to understand the +working mechanism of SoM. Our code and data are available at +\url{https://github.com/zzxslp/SoM-LLaVA}. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Multimodal Information Interaction for Medical Image Segmentation + + +
+ The use of multimodal data in assisted diagnosis and segmentation has emerged +as a prominent area of interest in current research. However, one of the +primary challenges is how to effectively fuse multimodal features. Most of the +current approaches focus on the integration of multimodal features while +ignoring the correlation and consistency between different modal features, +leading to the inclusion of potentially irrelevant information. To address this +issue, we introduce an innovative Multimodal Information Cross Transformer +(MicFormer), which employs a dual-stream architecture to simultaneously extract +features from each modality. Leveraging the Cross Transformer, it queries +features from one modality and retrieves corresponding responses from another, +facilitating effective communication between bimodal features. Additionally, we +incorporate a deformable Transformer architecture to expand the search space. +We conducted experiments on the MM-WHS dataset, and in the CT-MRI multimodal +image segmentation task, we successfully improved the whole-heart segmentation +DICE score to 85.57 and MIoU to 75.51. Compared to other multimodal +segmentation techniques, our method outperforms by margins of 2.83 and 4.23, +respectively. This demonstrates the efficacy of MicFormer in integrating +relevant information between different modalities in multimodal tasks. These +findings hold significant implications for multimodal image tasks, and we +believe that MicFormer possesses extensive potential for broader applications +across various domains. Access to our method is available at +https://github.com/fxxJuses/MICFormer + +
+
+
+
+
+ + ☆ An Improved Graph Pooling Network for Skeleton-Based Action Recognition + + +
+ Pooling is a crucial operation in computer vision, yet the unique structure
+of skeletons hinders the application of existing pooling strategies to skeleton
+graph modelling. In this paper, we propose an Improved Graph Pooling Network,
+referred to as IGPN. The main innovations are as follows. Our method
+incorporates a region-aware pooling strategy based on structural partitioning.
+The correlation matrix of the original feature is used to adaptively adjust the
+weight of information in different regions of the newly generated features,
+resulting in more flexible and effective processing. To prevent the
+irreversible loss of discriminative information, we propose a cross fusion
+module and an information supplement module to provide block-level and
+input-level information respectively. As a plug-and-play structure, the
+proposed operation can be seamlessly combined with existing GCN-based models.
+We conducted extensive evaluations on several challenging benchmarks, and the
+experimental results indicate the effectiveness of our proposed solutions. For
+example, in the cross-subject evaluation of the NTU-RGB+D 60 dataset, IGPN
+achieves a significant improvement in accuracy compared to the baseline while
+reducing FLOPs by nearly 70%; a heavier version has also been introduced to
+further boost accuracy.
+
+
+
+
+
+
+ + ☆ Dual Expert Distillation Network for Generalized Zero-Shot Learning + + +
+ Zero-shot learning has consistently yielded remarkable progress via modeling
+nuanced one-to-one visual-attribute correlations. Existing studies resort to
+refining a uniform mapping function to align and correlate the sample regions
+and subattributes, ignoring two crucial issues: 1) the inherent asymmetry of
+attributes; and 2) the unutilized channel information. This paper addresses
+these issues by introducing a simple yet effective approach, dubbed Dual Expert
+Distillation Network (DEDN), where two experts are dedicated to coarse- and
+fine-grained visual-attribute modeling, respectively. Concretely, the coarse
+expert, namely cExp, has a complete perceptual scope to coordinate
+visual-attribute similarity metrics across dimensions, while the fine expert,
+namely fExp, consists of multiple specialized subnetworks, each corresponding
+to an exclusive set of attributes. The two experts cooperatively distill from
+each other to reach a mutual agreement during training. Meanwhile, we further
+equip DEDN with a newly designed backbone network, i.e., the Dual Attention
+Network (DAN), which incorporates both region and channel attention information
+to fully exploit and leverage visual semantic knowledge. Experiments on various
+benchmark datasets indicate a new state-of-the-art.
+
+
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Light-weight Retinal Layer Segmentation with Global Reasoning + + +
+ Automatic retinal layer segmentation with medical images, such as optical
+coherence tomography (OCT) images, serves as an important tool for diagnosing
+ophthalmic diseases. However, it is challenging to achieve accurate
+segmentation due to low contrast and blood flow noise present in the images. In
+addition, the algorithm should be light-weight to be deployed for practical
+clinical applications. Therefore, it is desired to design a light-weight
+network with high performance for retinal layer segmentation. In this paper, we
+propose LightReSeg for retinal layer segmentation which can be applied to OCT
+images. Specifically, our approach follows an encoder-decoder structure, where
+the encoder part employs multi-scale feature extraction and a Transformer block
+to fully exploit the semantic information of feature maps at all scales and
+give the features better global reasoning capabilities, while in the decoder
+part, we design a multi-scale asymmetric attention (MAA) module for preserving
+the semantic information at each encoder scale. The experiments show that our
+approach achieves a better segmentation performance compared to the current
+state-of-the-art method TransUnet with 105.7M parameters on both our collected
+dataset and two other public datasets, with only 3.3M parameters.
+
+
+
+ comment: IEEE Transactions on Instrumentation & Measurement +
+
+
+
+
+ + ☆ Training-Free Unsupervised Prompt for Vision-Language Models + + +
+ Prompt learning has become the most effective paradigm for adapting large
+pre-trained vision-language models (VLMs) to downstream tasks. Recently,
+unsupervised prompt tuning methods, such as UPL and POUF, directly leverage
+pseudo-labels as supervisory information to fine-tune additional adaptation
+modules on unlabeled data. However, inaccurate pseudo-labels easily misguide
+the tuning process and result in poor representation capabilities. In light of
+this, we propose Training-Free Unsupervised Prompts (TFUP), which maximally
+preserves the inherent representation capabilities and enhances them with a
+residual connection to similarity-based prediction probabilities in a
+training-free and labeling-free manner. Specifically, we integrate both
+instance confidence and prototype scores to select representative samples,
+which are used to customize a reliable Feature Cache Model (FCM) for
+training-free inference. Then, we design a Multi-level Similarity Measure (MSM)
+that considers both feature-level and semantic-level similarities to calculate
+the distance between each test image and the cached sample as the weight of the
+corresponding cached label to generate similarity-based prediction
+probabilities. In this way, TFUP achieves surprising performance, even
+surpassing training-based methods on multiple classification datasets. Based
+on our TFUP, we propose a training-based approach (TFUP-T) to further boost the
+adaptation performance. In addition to the standard cross-entropy loss, TFUP-T
+adopts an additional marginal distribution entropy loss to constrain the model
+from a global perspective. Our TFUP-T achieves new state-of-the-art
+classification performance compared to unsupervised and few-shot adaptation
+approaches on multiple benchmarks. In particular, TFUP-T improves the
+classification accuracy of POUF by 3.3% on the most challenging Domain-Net
+dataset.
+
+
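The training-free prediction can be pictured as blending CLIP's zero-shot logits with logits derived from similarity to a small cache of confident samples through a residual connection. A minimal sketch under that reading; the cache construction, similarity measure, and hyper-parameters in TFUP are more elaborate than shown here:

```python
import numpy as np

def tfup_like_logits(test_feat, text_feats, cache_feats, cache_labels,
                     alpha=1.0, beta=5.0):
    """Blend zero-shot logits with cache-based logits.

    test_feat: (D,) L2-normalized image feature; text_feats: (C, D) class
    text features; cache_feats: (M, D) cached image features; cache_labels:
    (M, C) one-hot labels. alpha and beta are illustrative hyper-parameters."""
    zero_shot = text_feats @ test_feat                          # (C,) CLIP logits
    affinity = np.exp(-beta * (1.0 - cache_feats @ test_feat))  # (M,) similarities
    cache_logits = affinity @ cache_labels                      # (C,)
    return zero_shot + alpha * cache_logits                     # residual combination
```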
+
+
+
+
+ + ☆ FedStyle: Style-Based Federated Learning Crowdsourcing Framework for Art + Commissions ICME 2024 + + +
+ The unique artistic style is crucial to artists' occupational +competitiveness, yet prevailing Art Commission Platforms rarely support +style-based retrieval. Meanwhile, the fast-growing generative AI techniques +aggravate artists' concerns about releasing personal artworks to public +platforms. To achieve artistic style-based retrieval without exposing personal +artworks, we propose FedStyle, a style-based federated learning crowdsourcing +framework. It allows artists to train local style models and share model +parameters rather than artworks for collaboration. However, most artists +possess a unique artistic style, resulting in severe model drift among them. +FedStyle addresses such extreme data heterogeneity by having artists learn +their abstract style representations and align with the server, rather than +merely aggregating model parameters lacking semantics. Besides, we introduce +contrastive learning to meticulously construct the style representation space, +pulling artworks with similar styles closer and keeping different ones apart in +the embedding space. Extensive experiments on the proposed datasets demonstrate +the superiority of FedStyle. + +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ☆ IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced + Learning Tasks + + +
+ Model Weight Averaging (MWA) is a technique that seeks to enhance model's +performance by averaging the weights of multiple trained models. This paper +first empirically finds that 1) the vanilla MWA can benefit the +class-imbalanced learning, and 2) performing model averaging in the early +epochs of training yields a greater performance improvement than doing that in +later epochs. Inspired by these two observations, in this paper we propose a +novel MWA technique for class-imbalanced learning tasks named Iterative Model +Weight Averaging (IMWA). Specifically, IMWA divides the entire training stage +into multiple episodes. Within each episode, multiple models are concurrently +trained from the same initialized model weight, and subsequently averaged into +a singular model. Then, the weight of this average model serves as a fresh +initialization for the ensuing episode, thus establishing an iterative learning +paradigm. Compared to vanilla MWA, IMWA achieves higher performance +improvements with the same computational cost. Moreover, IMWA can further +enhance the performance of those methods employing EMA strategy, demonstrating +that IMWA and EMA can complement each other. Extensive experiments on various +class-imbalanced learning tasks, i.e., class-imbalanced image classification, +semi-supervised class-imbalanced image classification and semi-supervised +object detection tasks showcase the effectiveness of our IMWA. + +
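The episode structure of IMWA lends itself to a short sketch. The helper names, the number of parallel models, and the number of episodes below are placeholders, and `train_one_episode` is assumed to train a model in place:

```python
import copy
import torch

def average_state_dicts(state_dicts):
    """Element-wise average of compatible state dicts."""
    return {k: torch.stack([sd[k].float() for sd in state_dicts]).mean(0)
            for k in state_dicts[0]}

def imwa(make_model, train_one_episode, num_episodes=5, num_models=3):
    """Iterative Model Weight Averaging: in each episode, train several
    copies from the same initialization, average them, and use the average
    as the next episode's initialization."""
    current = make_model().state_dict()
    for _ in range(num_episodes):
        trained = []
        for _ in range(num_models):
            model = make_model()
            model.load_state_dict(current)
            train_one_episode(model)   # e.g., different data order / augmentation
            trained.append(copy.deepcopy(model.state_dict()))
        current = average_state_dicts(trained)
    final = make_model()
    final.load_state_dict(current)
    return final
```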
+
+
+
+
+ + ☆ Semantic Segmentation Refiner for Ultrasound Applications with Zero-Shot + Foundation Models + + +
+ Despite the remarkable success of deep learning in medical imaging analysis, +medical image segmentation remains challenging due to the scarcity of +high-quality labeled images for supervision. Further, the significant domain +gap between natural and medical images in general and ultrasound images in +particular hinders fine-tuning models trained on natural images to the task at +hand. In this work, we address the performance degradation of segmentation +models in low-data regimes and propose a prompt-less segmentation method +harnessing the ability of segmentation foundation models to segment abstract +shapes. We do that via our novel prompt point generation algorithm which uses +coarse semantic segmentation masks as input and a zero-shot prompt-able +foundation model as an optimization target. We demonstrate our method on a +segmentation findings task (pathologic anomalies) in ultrasound images. Our +method's advantages are brought to light in varying degrees of low-data regime +experiments on a small-scale musculoskeletal ultrasound images dataset, +yielding a larger performance gain as the training set size decreases. + +
+
+
+
+
+ + ☆ DIG3D: Marrying Gaussian Splatting with Deformable Transformer for + Single Image 3D Reconstruction + + +
+ In this paper, we study the problem of 3D reconstruction from a single-view
+RGB image and propose a novel approach called DIG3D for 3D object
+reconstruction and novel view synthesis. Our method utilizes an encoder-decoder
+framework which generates 3D Gaussians in the decoder with the guidance of
+depth-aware image features from the encoder. In particular, we introduce the
+use of a deformable transformer, allowing efficient and effective decoding
+through 3D reference points and multi-layer refinement adaptations. By
+harnessing the benefits of 3D Gaussians, our approach offers an efficient and
+accurate solution for 3D reconstruction from single-view images. We evaluate
+our method on the ShapeNet SRN dataset, achieving PSNRs of 24.21 and 24.98 on
+the car and chair datasets, respectively. These results outperform the recent
+method by around 2.25%, demonstrating the effectiveness of our method in
+achieving superior results.
+
+
+
+
+
+
+ + ☆ Boosting Model Resilience via Implicit Adversarial Data Augmentation IJCAI 2024 + + +
+ Data augmentation plays a pivotal role in enhancing and diversifying training +data. Nonetheless, consistently improving model performance in varied learning +scenarios, especially those with inherent data biases, remains challenging. To +address this, we propose to augment the deep features of samples by +incorporating their adversarial and anti-adversarial perturbation +distributions, enabling adaptive adjustment in the learning difficulty tailored +to each sample's specific characteristics. We then theoretically reveal that +our augmentation process approximates the optimization of a surrogate loss +function as the number of augmented copies increases indefinitely. This insight +leads us to develop a meta-learning-based framework for optimizing classifiers +with this novel loss, introducing the effects of augmentation while bypassing +the explicit augmentation process. We conduct extensive experiments across four +common biased learning scenarios: long-tail learning, generalized long-tail +learning, noisy label learning, and subpopulation shift learning. The empirical +results demonstrate that our method consistently achieves state-of-the-art +performance, highlighting its broad adaptability. + +
+
+ comment: 9 pages, 6 figures, accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion + Models CVPR 2024 + + +
+ Text-conditioned image-to-video generation (TI2V) aims to synthesize a +realistic video starting from a given image (e.g., a woman's photo) and a text +description (e.g., "a woman is drinking water."). Existing TI2V frameworks +often require costly training on video-text datasets and specific model designs +for text and image conditioning. In this paper, we propose TI2V-Zero, a +zero-shot, tuning-free method that empowers a pretrained text-to-video (T2V) +diffusion model to be conditioned on a provided image, enabling TI2V generation +without any optimization, fine-tuning, or introducing external modules. Our +approach leverages a pretrained T2V diffusion foundation model as the +generative prior. To guide video generation with the additional image input, we +propose a "repeat-and-slide" strategy that modulates the reverse denoising +process, allowing the frozen diffusion model to synthesize a video +frame-by-frame starting from the provided image. To ensure temporal continuity, +we employ a DDPM inversion strategy to initialize Gaussian noise for each newly +synthesized frame and a resampling technique to help preserve visual details. +We conduct comprehensive experiments on both domain-specific and open-domain +datasets, where TI2V-Zero consistently outperforms a recent open-domain TI2V +model. Furthermore, we show that TI2V-Zero can seamlessly extend to other tasks +such as video infilling and prediction when provided with more images. Its +autoregressive design also supports long video generation. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ BezierFormer: A Unified Architecture for 2D and 3D Lane Detection ICME 2024 + + +
+ Lane detection has made significant progress in recent years, but there is
+not a unified architecture for its two sub-tasks: 2D lane detection and 3D lane
+detection. To fill this gap, we introduce B\'{e}zierFormer, a unified 2D and 3D
+lane detection architecture based on a B\'{e}zier curve lane representation.
+B\'{e}zierFormer formulates queries as B\'{e}zier control points and
+incorporates a novel B\'{e}zier curve attention mechanism. This attention
+mechanism enables comprehensive and accurate feature extraction for slender
+lane curves via sampling and fusing multiple reference points on each curve. In
+addition, we propose a novel Chamfer IoU-based loss which is more suitable for
+the B\'{e}zier control points regression. The state-of-the-art performance of
+B\'{e}zierFormer on widely-used 2D and 3D lane detection benchmarks verifies
+its effectiveness and suggests the worthiness of further exploration.
+
+
+
+ comment: ICME 2024, 11 pages, 8 figures +
+
+
+
+
+ + ☆ CFMW: Cross-modality Fusion Mamba for Multispectral Object Detection + under Adverse Weather Conditions + + +
+ Cross-modality images that integrate visible-infrared spectra cues can
+provide richer complementary information for object detection. Despite this,
+existing visible-infrared object detection methods severely degrade in severe
+weather conditions. This failure stems from the pronounced sensitivity of
+visible images to environmental perturbations, such as rain, haze, and snow,
+which frequently cause false negatives and false positives in detection. To
+address this issue, we introduce a novel and challenging task, termed
+visible-infrared object detection under adverse weather conditions. To foster
+this task, we have constructed a new Severe Weather Visible-Infrared Dataset
+(SWVID) with diverse severe weather scenes. Furthermore, we introduce the
+Cross-modality Fusion Mamba with Weather-removal (CFMW) to augment detection
+accuracy in adverse weather conditions. Thanks to the proposed Weather Removal
+Diffusion Model (WRDM) and Cross-modality Fusion Mamba (CFM) modules, CFMW is
+able to mine more essential pedestrian features during cross-modality fusion,
+transfers efficiently to rarer scenarios, and remains usable on platforms with
+low computing power. To the best of our knowledge, this is the first study to
+integrate both Diffusion and Mamba modules in cross-modality object detection,
+expanding the practical application of this type of model through higher
+accuracy and a more advanced architecture. Extensive experiments on both
+well-recognized and self-created datasets conclusively demonstrate that our
+CFMW achieves state-of-the-art detection performance, surpassing existing
+benchmarks. The dataset and source code will be made publicly available at
+https://github.com/lhy-zjut/CFMW.
+
+
+
+ comment: The dataset and source code will be made publicly available at + https://github.com/lhy-zjut/CFMW +
+
+
+
+
+ + ☆ Style Adaptation for Domain-adaptive Semantic Segmentation + + +
+ Unsupervised Domain Adaptation (UDA) refers to the method that utilizes +annotated source domain data and unlabeled target domain data to train a model +capable of generalizing to the target domain data. Domain discrepancy leads to +a significant decrease in the performance of general network models trained on +the source domain data when applied to the target domain. We introduce a +straightforward approach to mitigate the domain discrepancy, which necessitates +no additional parameter calculations and seamlessly integrates with +self-training-based UDA methods. Through the transfer of the target domain +style to the source domain in the latent feature space, the model is trained to +prioritize the target domain style during the decision-making process. We +tackle the problem at both the image-level and shallow feature map level by +transferring the style information from the target domain to the source domain +data. As a result, we obtain a model that exhibits superior performance on the +target domain. Our method yields remarkable enhancements in the +state-of-the-art performance for synthetic-to-real UDA tasks. For example, our +proposed method attains a noteworthy UDA performance of 76.93 mIoU on the +GTA->Cityscapes dataset, representing a notable improvement of +1.03 percentage +points over the previous state-of-the-art results. + +
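Transferring target-domain style to source features at the feature-map level is commonly done by swapping channel-wise statistics (AdaIN-style); the sketch below uses that as a stand-in for the paper's mechanism, which the abstract does not spell out:

```python
import torch

def transfer_style(source_feat, target_feat, eps=1e-5):
    """Re-normalize source feature maps with the target's channel-wise
    mean and std (AdaIN-style). Tensors are (N, C, H, W); this is an
    illustrative stand-in, not necessarily the paper's exact operation."""
    s_mean = source_feat.mean(dim=(2, 3), keepdim=True)
    s_std = source_feat.std(dim=(2, 3), keepdim=True) + eps
    t_mean = target_feat.mean(dim=(2, 3), keepdim=True)
    t_std = target_feat.std(dim=(2, 3), keepdim=True) + eps
    return (source_feat - s_mean) / s_std * t_std + t_mean

styled = transfer_style(torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32))
```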
+
+
+
+
+ + ☆ Reinforcement Learning with Generative Models for Compact Support Sets + + +
+ Foundation models contain a wealth of information from their vast number of
+training samples. However, most prior arts fail to extract this information in
+a precise and efficient way for small sample sizes. In this work, we propose a
+framework utilizing reinforcement learning as a control for foundation models,
+allowing for the granular generation of small, focused synthetic support sets
+to augment the performance of neural network models on real data classification
+tasks. We first allow a reinforcement learning agent access to a novel
+context-based dictionary; the agent then uses this dictionary with a novel
+prompt structure to form and optimize prompts as inputs to generative models,
+receiving feedback based on a reward function combining the change in
+validation accuracy and entropy. A support set is formed this way over several
+exploration steps. Our framework produced excellent results, increasing
+classification accuracy by significant margins at no additional labelling or
+data cost.
+
+
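The reward signal described above is straightforward to write down. A minimal sketch of one exploration step's reward, assuming it combines the change in validation accuracy with the entropy of the model's class probabilities; the weighting is a placeholder:

```python
import numpy as np

def step_reward(val_acc_before, val_acc_after, class_probs, entropy_weight=0.1):
    """Reward for adding a batch of synthetic samples to the support set:
    improvement in validation accuracy plus a weighted entropy term over the
    model's class probabilities (the weighting is illustrative, not the paper's)."""
    probs = np.clip(class_probs, 1e-12, 1.0)
    entropy = -np.sum(probs * np.log(probs))
    return (val_acc_after - val_acc_before) + entropy_weight * entropy

print(step_reward(0.71, 0.74, np.array([0.2, 0.5, 0.3])))
```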
+
+ comment: 4 pages, 2 figures. Code available at: + https://github.com/mesophil/deeprl +
+
+
+
+
+ + ☆ Research on Splicing Image Detection Algorithms Based on Natural Image + Statistical Characteristics + + +
+ With the development and widespread application of digital image processing +technology, image splicing has become a common method of image manipulation, +raising numerous security and legal issues. This paper introduces a new +splicing image detection algorithm based on the statistical characteristics of +natural images, aimed at improving the accuracy and efficiency of splicing +image detection. By analyzing the limitations of traditional methods, we have +developed a detection framework that integrates advanced statistical analysis +techniques and machine learning methods. The algorithm has been validated using +multiple public datasets, showing high accuracy in detecting spliced edges and +locating tampered areas, as well as good robustness. Additionally, we explore +the potential applications and challenges faced by the algorithm in real-world +scenarios. This research not only provides an effective technological means for +the field of image tampering detection but also offers new ideas and methods +for future related research. + +
+
+
+
+
+ + ☆ One Noise to Rule Them All: Learning a Unified Model of + Spatially-Varying Noise Patterns SIGGRAPH + + +
+ Procedural noise is a fundamental component of computer graphics pipelines, +offering a flexible way to generate textures that exhibit "natural" random +variation. Many different types of noise exist, each produced by a separate +algorithm. In this paper, we present a single generative model which can learn +to generate multiple types of noise as well as blend between them. In addition, +it is capable of producing spatially-varying noise blends despite not having +access to such data for training. These features are enabled by training a +denoising diffusion model using a novel combination of data augmentation and +network conditioning techniques. Like procedural noise generators, the model's +behavior is controllable via interpretable parameters and a source of +randomness. We use our model to produce a variety of visually compelling noise +textures. We also present an application of our model to improving inverse +procedural material design; using our model in place of fixed-type noise nodes +in a procedural material graph results in higher-fidelity material +reconstructions without needing to know the type of noise in advance. + +
+
+ comment: In ACM Transactions on Graphics (Proceedings of SIGGRAPH) 2024, 21 + pages +
+
+
+
+
+ + ☆ Lacunarity Pooling Layers for Plant Image Classification using Texture + Analysis + + +
+ Pooling layers (e.g., max and average) may overlook important information +encoded in the spatial arrangement of pixel intensity and/or feature values. We +propose a novel lacunarity pooling layer that aims to capture the spatial +heterogeneity of the feature maps by evaluating the variability within local +windows. The layer operates at multiple scales, allowing the network to +adaptively learn hierarchical features. The lacunarity pooling layer can be +seamlessly integrated into any artificial neural network architecture. +Experimental results demonstrate the layer's effectiveness in capturing +intricate spatial patterns, leading to improved feature extraction +capabilities. The proposed approach holds promise in various domains, +especially in agricultural image analysis tasks. This work contributes to the +evolving landscape of artificial neural network architectures by introducing a +novel pooling layer that enriches the representation of spatial features. Our +code is publicly available. + +
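Lacunarity is a standard texture statistic computable per local window from first and second moments; a pooling layer built on it might look like the sketch below. The multi-scale aggregation and exact normalization used in the paper are not reproduced:

```python
import torch
import torch.nn.functional as F

def lacunarity_pool(x, kernel_size=2, stride=2, eps=1e-6):
    """Pool feature maps by local lacunarity, Lambda = E[m^2] / (E[m])^2,
    computed over each pooling window. x is (N, C, H, W) and assumed
    non-negative (e.g., post-ReLU); this is an illustrative definition."""
    mean = F.avg_pool2d(x, kernel_size, stride)
    mean_sq = F.avg_pool2d(x * x, kernel_size, stride)
    return mean_sq / (mean * mean + eps)

pooled = lacunarity_pool(torch.rand(1, 8, 16, 16))
```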
+
+ comment: 9 pages, 7 figures, accepted at 2024 IEEE/CVF Computer Vision and + Pattern Recognition Vision for Agriculture Workshop +
+
+
+
+
+ + ☆ A Multi-objective Optimization Benchmark Test Suite for Real-time + Semantic Segmentation GECCO 2024 + + +
+ As one of the emerging challenges in Automated Machine Learning, the +Hardware-aware Neural Architecture Search (HW-NAS) tasks can be treated as +black-box multi-objective optimization problems (MOPs). An important +application of HW-NAS is real-time semantic segmentation, which plays a pivotal +role in autonomous driving scenarios. The HW-NAS for real-time semantic +segmentation inherently needs to balance multiple optimization objectives, +including model accuracy, inference speed, and hardware-specific +considerations. Despite its importance, benchmarks have yet to be developed to +frame such a challenging task as multi-objective optimization. To bridge the +gap, we introduce a tailored streamline to transform the task of HW-NAS for +real-time semantic segmentation into standard MOPs. Building upon the +streamline, we present a benchmark test suite, CitySeg/MOP, comprising fifteen +MOPs derived from the Cityscapes dataset. The CitySeg/MOP test suite is +integrated into the EvoXBench platform to provide seamless interfaces with +various programming languages (e.g., Python and MATLAB) for instant fitness +evaluations. We comprehensively assessed the CitySeg/MOP test suite on various +multi-objective evolutionary algorithms, showcasing its versatility and +practicality. Source codes are available at +https://github.com/EMI-Group/evoxbench. + +
+
+ comment: 8 pages, 16 figures, GECCO 2024 +
+
+
+
+
+ + ☆ Calculation of Femur Caput Collum Diaphyseal angle for X-Rays images + using Semantic Segmentation + + +
+ This paper investigates the use of deep learning approaches to estimate the +femur caput-collum-diaphyseal (CCD) angle from X-ray images. The CCD angle is +an important measurement in the diagnosis of hip problems, and correct +prediction can help in the planning of surgical procedures. Manual measurement +of this angle, however, can be time-intensive and vulnerable to inter-observer +variability. In this paper, we present a deep-learning algorithm that can +reliably estimate the femur CCD angle from X-ray images. To train and test the +performance of our model, we employed an X-ray image dataset with associated +femur CCD angle measurements. Furthermore, we built a prototype to display the +resulting predictions and to allow the user to interact with them. Because this +takes place in a sterile setting during surgery, we extended the interface so +that it can be operated by voice commands alone. Our results show that our deep +learning model predicts the femur CCD angle on X-ray images accurately, with a +mean absolute error of 4.3 degrees on the left femur and 4.9 degrees on the +right femur on the test dataset. Our results suggest that deep learning has the +potential to provide a more efficient and accurate technique for predicting the +femur CCD angle, which might have substantial therapeutic implications for the +diagnosis and management of hip problems. +
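The abstract does not spell out how the angle is computed from the network output. Purely as an illustration, the sketch below derives an angle from the PCA axes of two hypothetical segmentation masks (femoral neck and shaft); the two-mask assumption and the function names are ours, not the paper's pipeline.

```python
import numpy as np

def principal_axis(mask):
    """Dominant orientation (unit vector) of a binary region via PCA of its pixel coordinates."""
    ys, xs = np.nonzero(mask)
    pts = np.stack([xs, ys], axis=1).astype(float)
    pts -= pts.mean(axis=0)
    eigvals, eigvecs = np.linalg.eigh(np.cov(pts, rowvar=False))
    return eigvecs[:, np.argmax(eigvals)]        # eigenvector of the largest eigenvalue

def ccd_angle(neck_mask, shaft_mask):
    """Angle (degrees) between the femoral-neck axis and the shaft axis."""
    v1, v2 = principal_axis(neck_mask), principal_axis(shaft_mask)
    cos = abs(float(np.dot(v1, v2)))             # axis orientation is sign-ambiguous
    acute = np.degrees(np.arccos(np.clip(cos, -1.0, 1.0)))
    return 180.0 - acute                          # report the obtuse (CCD-like) angle

# toy masks: a roughly horizontal "neck" and a vertical "shaft"
neck = np.zeros((100, 100), bool)
neck[40:45, 50:90] = True
shaft = np.zeros((100, 100), bool)
shaft[45:95, 48:53] = True
print(round(ccd_angle(neck, shaft), 1))
```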
+
+
+
+
+ + ☆ Detection of Peri-Pancreatic Edema using Deep Learning and Radiomics + Techniques + + +
+ Peri-pancreatic edema is a pivotal indicator of disease progression and +prognosis, emphasizing the critical need for accurate detection and assessment +in pancreatitis diagnosis and management. This study introduces a novel CT +dataset sourced from 255 patients with pancreatic diseases, featuring annotated +pancreas segmentation masks and corresponding diagnostic labels for the +peri-pancreatic edema condition. With the novel dataset, we first evaluate the +efficacy of the LinTransUNet model, a linear Transformer based segmentation +algorithm, to segment the pancreas accurately from CT imaging data. Then, we +use segmented pancreas regions with two distinctive machine learning +classifiers to identify the existence of peri-pancreatic edema: deep +learning-based models and a radiomics-based eXtreme Gradient Boosting +(XGBoost). The LinTransUNet achieved promising results, with a dice coefficient +of 80.85% and mIoU of 68.73%. Among the nine benchmarked classification models +for peri-pancreatic edema detection, the Swin-Tiny transformer model +demonstrated the highest recall of 98.85 ± 0.42 and precision of 98.38 ± 0.17. +Comparatively, the radiomics-based XGBoost model achieved an accuracy of +79.61 ± 4.04 and recall of 91.05 ± 3.28, showcasing its potential as a +supplementary diagnostic tool given its rapid processing speed and reduced +training time. Our code is available at +https://github.com/NUBagciLab/Peri-Pancreatic-Edema-Detection. +
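For the radiomics branch, the classification step reduces to a standard tabular pipeline. A hedged sketch follows; the random placeholder features and untuned hyperparameters stand in for the real radiomics features and the paper's configuration.

```python
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# X: radiomics features extracted from the segmented pancreas region (n_cases, n_features)
# y: peri-pancreatic edema labels (0/1); random placeholders are used here so the sketch runs
rng = np.random.default_rng(0)
X, y = rng.normal(size=(255, 100)), rng.integers(0, 2, size=255)

clf = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, eval_metric="logloss")
scores = cross_val_score(clf, X, y, cv=5, scoring="recall")
print("recall: %.3f +/- %.3f" % (scores.mean(), scores.std()))
```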
+
+
+
+
+ + ☆ WheelPose: Data Synthesis Techniques to Improve Pose Estimation + Performance on Wheelchair Users + + +
+ Existing pose estimation models perform poorly on wheelchair users due to a +lack of representation in training data. We present a data synthesis pipeline +to address this disparity in data collection and subsequently improve pose +estimation performance for wheelchair users. Our configurable pipeline +generates synthetic data of wheelchair users using motion capture data and +motion generation outputs simulated in the Unity game engine. We validated our +pipeline by conducting a human evaluation of perceived realism and diversity, +as well as an AI performance evaluation, on a set of synthetic datasets from +our pipeline that synthesized different backgrounds, models, and postures. We +found that our generated datasets were perceived as realistic by human +evaluators, had more diversity than existing image datasets, and improved +person detection and pose estimation performance when used to fine-tune +existing pose estimation models. Through this work, we hope to create a +foothold for future efforts in tackling the inclusiveness of AI in a +data-centric and human-centric manner with the data synthesis techniques +demonstrated in this work. Finally, for future work to extend upon, we open +source all code in this research and provide a fully configurable Unity +environment used to generate our datasets. In the case of any models we are +unable to share due to redistribution and licensing policies, we provide +detailed instructions on how to source and replace said models. +
+
+ comment: Published for ACM CHI 2024. For source files, see + https://github.com/hilab-open-source/wheelpose +
+
+
+
+
+ + ☆ Nuclei-Location Based Point Set Registration of Multi-Stained Whole + Slide Images + + +
+ Whole Slide Images (WSIs) provide exceptional detail for studying tissue +architecture at the cell level. To study the tumour microenvironment (TME) in +the context of various protein biomarkers and cell sub-types, analysis and +registration of features using multi-stained WSIs is often required. +Multi-stained WSI pairs normally suffer from rigid and non-rigid deformities, +in addition to slide artefacts and control tissue, which present challenges for +precise registration. Traditional registration methods mainly focus on global +rigid/non-rigid registration but struggle with aligning slides with complex +tissue deformations at the nuclei level. However, nuclei-level non-rigid +registration is essential for downstream tasks such as cell sub-type analysis +in the context of protein biomarker signatures. This paper focuses on +local-level non-rigid registration using a nuclei-location based point set +registration approach for aligning multi-stained WSIs. We exploit the spatial +distribution of nuclei, which is prominent and largely consistent across +different stains, to establish a spatial correspondence. We evaluate our +approach using the HYRECO dataset consisting of 54 re-stained image pairs of +H&E and PHH3. The approach can be extended to other IHC- and IF-stained WSIs +provided a good nuclei detection algorithm is available. The performance of the +model is tested against established registration algorithms and is shown to +outperform them for nuclei-level registration. +
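As a baseline stand-in for the (potentially non-rigid) point-set registration evaluated in the paper, nucleus centroids can be aligned with a simple iterative-closest-point style loop; the affine model and function names below are our assumptions for illustration only.

```python
import numpy as np
from scipy.spatial import cKDTree

def icp_affine(src, dst, n_iters=50):
    """Iteratively match each source nucleus to its nearest target nucleus
    and re-fit a 2D affine transform by least squares.
    src: (N, 2) source centroids, dst: (M, 2) target centroids."""
    tree = cKDTree(dst)
    P = np.c_[src, np.ones(len(src))]         # homogeneous source coordinates (N, 3)
    moved = src.copy()
    for _ in range(n_iters):
        _, idx = tree.query(moved)            # nearest-neighbour correspondences
        A, _, _, _ = np.linalg.lstsq(P, dst[idx], rcond=None)   # (3, 2) affine
        moved = P @ A
    return A                                   # apply with np.c_[pts, np.ones(len(pts))] @ A

# toy example: target nuclei are a slightly rotated and shifted copy of the source
src = np.random.rand(300, 2) * 1000
dst = src @ np.array([[0.98, 0.05], [-0.05, 0.98]]) + np.array([3.0, -2.0])
print(icp_affine(src, dst))
```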
+
+ comment: 15 pages, 5 figures, Submitted to Medical Image Understanding and + Analysis Conference 2024 +
+
+
+
+
+ + ☆ Auto-Generating Weak Labels for Real & Synthetic Data to Improve + Label-Scarce Medical Image Segmentation + + +
+ The high cost of creating pixel-by-pixel gold-standard labels, limited expert +availability, and the presence of diverse tasks make it challenging to generate +segmentation labels to train deep learning models for medical imaging tasks. In +this work, we present a new approach to overcome the hurdle of costly medical +image labeling by leveraging foundation models like the Segment Anything Model +(SAM) and its medical alternative MedSAM. Our pipeline can generate weak labels +for any unlabeled medical image and subsequently use them to augment +label-scarce datasets. We do this by leveraging a model trained on a few +gold-standard labels and using it to intelligently prompt MedSAM for weak label +generation. This automation eliminates the manual prompting step in MedSAM, +creating a streamlined process for generating labels for both real and +synthetic images, regardless of quantity. We conduct experiments in +label-scarce settings for multiple tasks spanning modalities such as +ultrasound, dermatology, and X-rays to demonstrate the usefulness of our +pipeline. The code is available at +https://github.com/stanfordmlgroup/Auto-Generate-WLs/. +
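The automated prompting step can be pictured roughly as below; `small_model` and `medsam_predict` are hypothetical hooks standing in for the few-label model and the MedSAM inference call (the real MedSAM API is not shown here).

```python
import numpy as np

def weak_label(image, small_model, medsam_predict):
    """Sketch: a model trained on a few gold-standard masks proposes a rough mask,
    its bounding box becomes the prompt for MedSAM, and MedSAM's output is kept
    as a weak label. Both callables are hypothetical hooks."""
    rough = small_model(image) > 0.5                 # coarse mask from the few-label model
    ys, xs = np.nonzero(rough)
    if len(xs) == 0:
        return rough                                 # nothing found: fall back to the rough mask
    box = [xs.min(), ys.min(), xs.max(), ys.max()]   # box prompt derived automatically
    return medsam_predict(image, box)
```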
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Motor Focus: Ego-Motion Prediction with All-Pixel Matching + + +
+ Motion analysis plays a critical role in various applications, from virtual +reality and augmented reality to assistive visual navigation. Traditional +self-driving technologies, while advanced, typically do not translate directly +to pedestrian applications due to their reliance on extensive sensor arrays and +computational frameworks that are infeasible on portable devices. This +highlights a significant gap in applying these solutions to human users, since +human navigation introduces unique challenges, including the unpredictable +nature of human movement, the limited processing capabilities of portable +devices, and the need for directional responsiveness given the limited +perception range of humans. In this project, we introduce an image-only method +that applies motion analysis using optical flow with ego-motion compensation to +predict Motor Focus: where and how humans or machines focus their movement +intentions. This paper also addresses the camera-shake issue in handheld and +body-mounted devices, which can severely degrade performance and accuracy, by +applying a Gaussian aggregation to stabilize the predicted motor focus area and +enhance the prediction accuracy of movement direction. This provides a robust, +real-time solution that adapts to the user's immediate environment. +Furthermore, in the experiments, we show a qualitative analysis of motor focus +estimation comparing the conventional dense optical flow-based method and the +proposed method. In quantitative tests, we report the performance of the +proposed method on a small collected dataset specialized for motor focus +estimation tasks. +
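A heavily simplified, image-only sketch of the general recipe (dense optical flow, a crude ego-motion compensation by removing the global median flow, then Gaussian aggregation of the residual motion) is shown below; the paper's all-pixel matching is more involved, and the parameter choices here are placeholders.

```python
import cv2
import numpy as np

def motor_focus(prev_gray, gray, sigma=25):
    """Return a (x, y) focus estimate and its heatmap from two grayscale frames."""
    flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    flow -= np.median(flow.reshape(-1, 2), axis=0)   # crude removal of the ego-motion component
    mag = np.linalg.norm(flow, axis=2)
    heat = cv2.GaussianBlur(mag, (0, 0), sigma)      # spatial aggregation against camera shake
    y, x = np.unravel_index(np.argmax(heat), heat.shape)
    return (x, y), heat

prev_gray = np.random.randint(0, 255, (240, 320), np.uint8)   # toy frames for illustration
gray = np.roll(prev_gray, 2, axis=1)
print(motor_focus(prev_gray, gray)[0])
```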
+
+
+
+
+ + ☆ Dr-SAM: An End-to-End Framework for Vascular Segmentation, Diameter + Estimation, and Anomaly Detection on Angiography Images + + +
+ Recent advancements in AI have significantly transformed medical imaging, +particularly angiography, by enhancing diagnostic precision and patient care. +However, existing works are limited in their analysis of the aorta and iliac +arteries, above all for vascular anomaly detection and characterization. To +close this gap, we propose Dr-SAM, a comprehensive multi-stage framework for +vessel segmentation, diameter estimation, and anomaly analysis, aiming to +examine the peripheral vessels through angiography images. For segmentation we +introduce a customized positive/negative point selection mechanism applied on +top of the Segment Anything Model (SAM), specifically for medical (angiography) +images. Then we propose a morphological approach to determine the vessel +diameters, followed by our histogram-driven anomaly detection approach. +Moreover, we introduce a new benchmark dataset for the comprehensive analysis +of peripheral vessel angiography images, which we hope can boost upcoming +research in this direction, leading to enhanced diagnostic precision and +ultimately better health outcomes for individuals facing vascular issues. +
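The diameter-estimation stage can be approximated morphologically by sampling a distance transform along the vessel centreline; this is our guess at what such a morphological approach could look like, not the released Dr-SAM code.

```python
import numpy as np
from scipy.ndimage import distance_transform_edt
from skimage.morphology import skeletonize

def vessel_diameters(vessel_mask):
    """The Euclidean distance transform gives the radius of the largest inscribed
    disc at every vessel pixel; sampling it along the skeleton (centreline)
    yields a per-point diameter profile in pixels."""
    dist = distance_transform_edt(vessel_mask)
    skeleton = skeletonize(vessel_mask.astype(bool))
    return skeleton, 2.0 * dist[skeleton]      # diameter = 2 x inscribed radius

mask = np.zeros((64, 64), dtype=np.uint8)
mask[30:34, 5:60] = 1                          # toy "vessel", 4 px wide
_, d = vessel_diameters(mask)
print(d.mean())                                # anomaly detection could then inspect this profile
```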
+
+
+
+
+ + ☆ PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video + Dense Captioning + + +
+ Vision-language pre-training has significantly elevated performance across a +wide range of image-language applications. Yet, the pre-training process for +video-related tasks demands exceptionally large computational and data +resources, which hinders the progress of video-language models. This paper +investigates a straightforward, highly efficient, and resource-light approach +to adapting an existing image-language pre-trained model for dense video +understanding. Our preliminary experiments reveal that directly fine-tuning +pre-trained image-language models with multiple frames as inputs on video +datasets leads to performance saturation or even a drop. Our further +investigation reveals that this is largely attributable to the bias of learned +high-norm visual features. Motivated by this finding, we propose a simple but +effective pooling strategy to smooth the feature distribution along the +temporal dimension and thus reduce the dominant impacts from the extreme +features. The new model is termed Pooling LLaVA, or PLLaVA in short. PLLaVA +achieves new state-of-the-art performance on modern benchmark datasets for both +video question-answering and captioning tasks. Notably, on the recent popular +Video ChatGPT benchmark, PLLaVA achieves a score of 3.48 out of 5, averaged +over five evaluated dimensions, exceeding the previous SOTA results from GPT4V +(IG-VLM) by 9%. On the latest multi-choice benchmark MVBench, PLLaVA achieves +58.1% accuracy on average across 20 sub-tasks, 14.5% higher than GPT4V +(IG-VLM). Code is available at https://github.com/magic-research/PLLaVA. +
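The pooling itself is the lightweight part of such a design; below is a sketch of temporal average pooling over frame-level visual tokens. The shapes and the `target_t` choice are illustrative and not PLLaVA's exact configuration.

```python
import torch
import torch.nn.functional as F

def pool_frame_features(frame_feats, target_t=4):
    """Smooth visual features along the temporal axis with adaptive average
    pooling so that extreme high-norm tokens do not dominate the LLM input.
    frame_feats: (T, N_tokens, D)."""
    x = frame_feats.permute(1, 2, 0)            # (N_tokens, D, T)
    x = F.adaptive_avg_pool1d(x, target_t)      # pool T frames down to target_t
    return x.permute(2, 0, 1)                   # (target_t, N_tokens, D)

feats = torch.randn(16, 576, 1024)              # 16 frames of CLIP-like patch tokens
print(pool_frame_features(feats).shape)         # torch.Size([4, 576, 1024])
```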
+
+
+
+
+ + ☆ CriSp: Leveraging Tread Depth Maps for Enhanced Crime-Scene Shoeprint + Matching + + +
+ Shoeprints are a common type of evidence found at crime scenes and are used +regularly in forensic investigations. However, existing methods cannot +effectively employ deep learning techniques to match noisy and occluded +crime-scene shoeprints to a shoe database due to a lack of training data. +Moreover, all existing methods match crime-scene shoeprints to clean reference +prints, yet our analysis shows matching to more informative tread depth maps +yields better retrieval results. The matching task is further complicated by +the necessity to identify similarities only in corresponding regions (heels, +toes, etc) of prints and shoe treads. To overcome these challenges, we leverage +shoe tread images from online retailers and utilize an off-the-shelf predictor +to estimate depth maps and clean prints. Our method, named CriSp, matches +crime-scene shoeprints to tread depth maps by training on this data. CriSp +incorporates data augmentation to simulate crime-scene shoeprints, an encoder +to learn spatially-aware features, and a masking module to ensure only visible +regions of crime-scene prints affect retrieval results. To validate our +approach, we introduce two validation sets by reprocessing existing datasets of +crime-scene shoeprints and establish a benchmarking protocol for comparison. On +this benchmark, CriSp significantly outperforms state-of-the-art methods in +both automated shoeprint matching and image retrieval tailored to this task. + +
+
+
+
+
+ + ☆ Constellation Dataset: Benchmarking High-Altitude Object Detection for + an Urban Intersection + + +
+ We introduce Constellation, a dataset of 13K images suitable for research on +detection of objects in dense urban streetscapes observed from high-elevation +cameras, collected for a variety of temporal conditions. The dataset addresses +the need for curated data to explore problems in small object detection +exemplified by the limited pixel footprint of pedestrians observed tens of +meters from above. It enables the testing of object detection models for +variations in lighting, building shadows, weather, and scene dynamics. We +evaluate contemporary object detection architectures on the dataset, observing +that state-of-the-art methods have lower performance in detecting small +pedestrians compared to vehicles, corresponding to a 10% difference in average +precision (AP). Using structurally similar datasets for pretraining the models +results in an increase of 1.8% mean AP (mAP). We further find that +incorporating domain-specific data augmentations helps improve model +performance. Using pseudo-labeled data, obtained from inference outcomes of the +best-performing models, improves the performance of the models. Finally, +comparing the models trained using the data collected in two different time +intervals, we find a performance drift in models due to the changes in +intersection conditions over time. The best-performing model achieves a +pedestrian AP of 92.0% with 11.5 ms inference time on NVIDIA A100 GPUs, and an +mAP of 95.4%. + +
+
+
+
+
+ + ☆ Grad Queue : A probabilistic framework to reinforce sparse gradients + + +
+ Informative gradients are often lost in large batch updates. We propose a +robust mechanism to reinforce the sparse components within a random batch of +data points. A finite queue of online gradients is used to determine their +expected instantaneous statistics. We propose a function that measures the +scarcity of incoming gradients using these statistics and establish the +theoretical grounds of this mechanism. To minimize conflicting components +within large mini-batches, samples are grouped by aligned objectives through +clustering in the inherent feature space. Sparsity is measured for each +centroid and weighted accordingly. A strong intuitive criterion for squeezing +out redundant information from each cluster is the backbone of the system. It +makes rare information resistant to aggressive momentum and also exhibits +superior performance with a larger mini-batch horizon. The effective length of +the queue is kept variable to follow the local loss pattern. The contribution +of our method is to restore intra-mini-batch diversity while widening the +optimal batch boundary. Both of these collectively drive the optimization +deeper towards the minima. Our method shows superior performance on the +CIFAR10, MNIST, and Reuters News category datasets compared to mini-batch +gradient descent. +
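One way to read the mechanism (our interpretation only; the paper additionally clusters samples and varies the queue length) is as a running-statistics reweighting of gradients, sketched below. The class name and the bounded-boost formula are assumptions.

```python
from collections import deque
import torch

class GradQueue:
    """Keep a finite queue of recent gradients, estimate their running statistics,
    and up-weight components that look rare (far from the running mean)."""
    def __init__(self, maxlen=32, eps=1e-8):
        self.queue = deque(maxlen=maxlen)
        self.eps = eps

    def reweight(self, grad):
        self.queue.append(grad.detach().clone())
        stack = torch.stack(list(self.queue))            # (Q, *grad.shape)
        mean = stack.mean(dim=0)
        std = stack.std(dim=0, unbiased=False) + self.eps
        rarity = (grad - mean).abs() / std               # z-score as a scarcity measure
        return grad * (1.0 + rarity / (1.0 + rarity))    # bounded boost in [1, 2)

gq = GradQueue()
for _ in range(5):
    g = gq.reweight(torch.randn(10))
print(g)
```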
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ Exploring Learngene via Stage-wise Weight Sharing for Initializing + Variable-sized Models + + +
+ In practice, we usually need to build variable-sized models that adapt to +diverse resource constraints in different application scenarios, where weight +initialization is an important step prior to training. The recently introduced +Learngene framework first learns a compact part, termed the learngene, from a +large well-trained model, after which the learngene is expanded to initialize +variable-sized models. In this paper, we start by analysing the importance of +guidance for the expansion of well-trained learngene layers, inspiring the +design of a simple but highly effective Learngene approach termed SWS +(Stage-wise Weight Sharing), where both the learngene layers and their learning +process critically contribute to providing knowledge and guidance for +initializing models at varying scales. Specifically, to learn the learngene +layers, we build an auxiliary model comprising multiple stages in which the +layer weights within each stage are shared, and then train it through +distillation. Subsequently, we expand these learngene layers, which contain +stage information, at their corresponding stages to initialize models of +variable depths. Extensive experiments on ImageNet-1K demonstrate that SWS +achieves consistently better performance than many models trained from scratch, +while reducing total training costs by around 6.6x. In some cases, SWS performs +better after only 1 epoch of tuning. When initializing variable-sized models +that adapt to different resource constraints, SWS achieves better results while +reducing the parameters stored to initialize these models by around 20x and +pre-training costs by around 10x, in contrast to the pre-training and +fine-tuning approach. +
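The weight-sharing and expansion mechanics can be illustrated with a toy MLP; the real SWS operates on convolutional/transformer stages with distillation, and all dimensions, depths, and names below are placeholders.

```python
import copy
import torch
import torch.nn as nn

class StageSharedNet(nn.Module):
    """Each stage owns ONE block whose weights are reused `depth` times,
    so the learned block can later be expanded to any target depth."""
    def __init__(self, dims=(64, 128, 256), depth=4):
        super().__init__()
        self.blocks = nn.ModuleList([nn.Sequential(nn.Linear(d, d), nn.ReLU()) for d in dims])
        self.proj = nn.ModuleList([nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)])
        self.depth = depth

    def forward(self, x):
        for i, block in enumerate(self.blocks):
            for _ in range(self.depth):          # the same weights applied repeatedly
                x = block(x)
            if i < len(self.proj):
                x = self.proj[i](x)
        return x

def expand(shared, depths=(2, 3, 2)):
    """Initialise a variable-depth model by copying each stage's shared block."""
    layers = []
    for i, block in enumerate(shared.blocks):
        layers += [copy.deepcopy(block) for _ in range(depths[i])]
        if i < len(shared.proj):
            layers.append(copy.deepcopy(shared.proj[i]))
    return nn.Sequential(*layers)

teacher = StageSharedNet()
student = expand(teacher, depths=(2, 2, 2))      # shallower model, same initial weights
print(student(torch.randn(2, 64)).shape)         # torch.Size([2, 256])
```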
+
+
+
+
+ + ☆ Synthesizing Audio from Silent Video using Sequence to Sequence Modeling + + +
+ Generating audio from a video's visual context has multiple practical +applications in improving how we interact with audio-visual media - for +example, enhancing CCTV footage analysis, restoring historical videos (e.g., +silent movies), and improving video generation models. We propose a novel +method to generate audio from video using a sequence-to-sequence model, +improving on prior work that used CNNs and WaveNet and faced sound diversity +and generalization challenges. Our approach employs a 3D Vector Quantized +Variational Autoencoder (VQ-VAE) to capture the video's spatial and temporal +structures, decoding with a custom audio decoder for a broader range of sounds. +Trained on a segment of the Youtube8M dataset focused on specific domains, our +model aims to enhance applications like CCTV footage analysis, silent movie +restoration, and video generation models. +
+
+
+
+
+ + ♻ ☆ GaussCtrl: Multi-View Consistent Text-Driven 3D Gaussian Splatting + Editing + + +
+ We propose GaussCtrl, a text-driven method to edit a 3D scene reconstructed +by the 3D Gaussian Splatting (3DGS). + Our method first renders a collection of images by using the 3DGS and edits +them by using a pre-trained 2D diffusion model (ControlNet) based on the input +prompt, which is then used to optimise the 3D model. + Our key contribution is multi-view consistent editing, which enables editing +all images together instead of iteratively editing one image while updating the +3D model as in previous works. + It leads to faster editing as well as higher visual quality. + This is achieved by the two terms: + (a) depth-conditioned editing that enforces geometric consistency across +multi-view images by leveraging naturally consistent depth maps. + (b) attention-based latent code alignment that unifies the appearance of +edited images by conditioning their editing to several reference views through +self and cross-view attention between images' latent representations. + Experiments demonstrate that our method achieves faster editing and better +visual results than previous state-of-the-art methods. + +
+
+ comment: Our Project Website: https://gaussctrl.active.vision/ +
+
+
+
+
+ + ♻ ☆ Confidence-Triggered Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
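The control flow of the idea fits in a few lines. In the sketch below, `detector` and `tracker` are hypothetical interfaces (not a specific library), and the threshold value is a placeholder.

```python
def track_video(frames, detector, tracker, conf_threshold=0.6):
    """Confidence-Triggered Detection sketch: the expensive detector is only
    invoked when the tracker's confidence drops below a threshold; otherwise
    the cheaper tracker propagates the previous boxes."""
    boxes = detector(frames[0])
    tracker.init(frames[0], boxes)
    results = [boxes]
    for frame in frames[1:]:
        boxes, confidence = tracker.update(frame)
        if confidence < conf_threshold:       # trigger: the tracker is no longer sure
            boxes = detector(frame)           # re-detect and re-initialise the tracker
            tracker.init(frame, boxes)
        results.append(boxes)
    return results
```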
+
+ comment: To appear in the 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Guided Interpretable Facial Expression Recognition via Spatial Action + Unit Cues + + +
+ Although state-of-the-art classifiers for facial expression recognition (FER) +can achieve a high level of accuracy, they lack interpretability, an important +feature for end-users. Experts typically associate spatial action units (AUs) +from a codebook to facial regions for the visual interpretation of expressions. +In this paper, the same expert steps are followed. A new learning strategy is +proposed to explicitly incorporate AU cues into classifier training, allowing +deep interpretable models to be trained. During training, this AU codebook is +used, along with the input image expression label and facial landmarks, to +construct an AU heatmap that indicates the most discriminative image regions of +interest w.r.t. the facial expression. This valuable spatial cue is leveraged +to train a deep interpretable classifier for FER. This is achieved by +constraining the spatial layer features of a classifier to be correlated with +AU heatmaps. Using a composite loss, the classifier is trained to correctly +classify an image while yielding interpretable visual layer-wise attention +correlated with AU maps, simulating the expert decision process. Our strategy +relies only on image expression class labels for supervision, without +additional manual annotations. Our new strategy is generic, and can be applied +to any deep CNN- or transformer-based classifier without requiring any +architectural change or significant additional training time. Our extensive +evaluation on two public benchmarks, the RAF-DB and AffectNet datasets, shows +that our proposed strategy can improve layer-wise interpretability without +degrading classification performance. In addition, we explore a common type of +interpretable classifier that relies on class activation mapping (CAM) methods, +and show that our approach can also improve CAM interpretability. +
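A composite loss of this kind can be sketched as cross-entropy plus a correlation term between the classifier's spatial attention and the AU heatmap; the exact form and weighting used in the paper may differ, and the shapes below are illustrative.

```python
import torch
import torch.nn.functional as F

def au_guided_loss(logits, labels, feat_maps, au_heatmaps, lam=0.1):
    """Cross-entropy plus a term encouraging the spatial activations to
    correlate with the action-unit (AU) heatmap.
    feat_maps: (B, C, H, W) last conv features; au_heatmaps: (B, H, W)."""
    ce = F.cross_entropy(logits, labels)
    attn = feat_maps.mean(dim=1)                    # (B, H, W) layer-wise attention proxy
    a = F.normalize(attn.flatten(1), dim=1)
    h = F.normalize(au_heatmaps.flatten(1), dim=1)
    corr = (a * h).sum(dim=1).mean()                # cosine similarity in [-1, 1]
    return ce + lam * (1.0 - corr)

logits = torch.randn(4, 7, requires_grad=True)
loss = au_guided_loss(logits, torch.randint(0, 7, (4,)),
                      torch.rand(4, 256, 7, 7), torch.rand(4, 7, 7))
loss.backward()
```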
+
+ comment: 15 pages, 11 figures, 3 tables, International Conference on Automatic + Face and Gesture Recognition (FG 2024) +
+
+
+
+
+ + ♻ ☆ Pix2HDR -- A pixel-wise acquisition and deep learning-based synthesis + approach for high-speed HDR videos + + +
+ Accurately capturing dynamic scenes with wide-ranging motion and light +intensity is crucial for many vision applications. However, acquiring +high-speed high dynamic range (HDR) video is challenging because the camera's +frame rate restricts its dynamic range. Existing methods sacrifice speed to +acquire multi-exposure frames. Yet, misaligned motion in these frames can still +pose complications for HDR fusion algorithms, resulting in artifacts. Instead +of frame-based exposures, we sample the videos using individual pixels at +varying exposures and phase offsets. Implemented on a monochrome pixel-wise +programmable image sensor, our sampling pattern simultaneously captures fast +motion at a high dynamic range. We then transform pixel-wise outputs into an +HDR video using end-to-end learned weights from deep neural networks, achieving +high spatiotemporal resolution with minimized motion blurring. We demonstrate +aliasing-free HDR video acquisition at 1000 FPS, resolving fast motion under +low-light conditions and against bright backgrounds - both challenging +conditions for conventional cameras. By combining the versatility of pixel-wise +sampling patterns with the strength of deep neural networks at decoding complex +scenes, our method greatly enhances the vision system's adaptability and +performance in dynamic conditions. + +
+
+ comment: 17 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced + Analysis of Kidney Abnormalities in CT Scan Images using ensemble based + Machine Learning Approach + + +
+ The shortage of nephrologists and the growing public health concern over +renal failure have spurred the demand for AI systems capable of autonomously +detecting kidney abnormalities. Renal failure, marked by a gradual decline in +kidney function, can result from factors like cysts, stones, and tumors. +Chronic kidney disease may go unnoticed initially, leading to untreated cases +until they reach an advanced stage. The dataset, comprising 12,427 images from +multiple hospitals in Dhaka, was categorized into four groups: cyst, tumor, +stone, and normal. Our methodology aims to enhance CT scan image quality using +cropping, resizing, and CLAHE techniques, followed by feature extraction with +our proposed Adaptive Local Binary Pattern (A-LBP) method, compared against the +state-of-the-art local binary pattern (LBP) method. The proposed features are +fed into classifiers such as Random Forest, Decision Tree, Naive Bayes, +K-Nearest Neighbor, and SVM. We explored an ensemble model with soft voting to +obtain a more robust model for our task. We achieved an accuracy of more than +99% using our feature descriptor and an ensemble of five classifiers (Random +Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor, Support Vector Machine) +with the soft voting method. +
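The ensembling step maps directly onto scikit-learn's soft voting. Synthetic features stand in for the A-LBP histograms so the sketch runs on its own; all hyperparameters are defaults rather than the paper's settings.

```python
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# placeholder for A-LBP histograms extracted from the preprocessed CT slices
X, y = make_classification(n_samples=400, n_features=59, n_classes=4,
                           n_informative=20, random_state=0)

ensemble = VotingClassifier(
    estimators=[("rf", RandomForestClassifier()),
                ("dt", DecisionTreeClassifier()),
                ("nb", GaussianNB()),
                ("knn", KNeighborsClassifier()),
                ("svm", SVC(probability=True))],   # probability=True is required for soft voting
    voting="soft")

print(cross_val_score(ensemble, X, y, cv=5).mean())
```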
+
+ comment: 17 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ Semantic Positive Pairs for Enhancing Visual Representation Learning of + Instance Discrimination methods + + +
+ Self-supervised learning algorithms (SSL) based on instance discrimination +have shown promising results, performing competitively or even outperforming +supervised learning counterparts in some downstream tasks. Such approaches +employ data augmentation to create two views of the same instance (i.e., +positive pairs) and encourage the model to learn good representations by +attracting these views closer in the embedding space without collapsing to the +trivial solution. However, data augmentation is limited in representing +positive pairs, and the repulsion process between the instances during +contrastive learning may discard important features for instances that have +similar categories. To address this issue, we propose an approach to identify +those images with similar semantic content and treat them as positive +instances, thereby reducing the chance of discarding important features during +representation learning and increasing the richness of the latent +representation. Our approach is generic and could work with any self-supervised +instance discrimination frameworks such as MoCo and SimSiam. To evaluate our +method, we run experiments on three benchmark datasets: ImageNet, STL-10 and +CIFAR-10 with different instance discrimination SSL approaches. The +experimental results show that our approach consistently outperforms the +baseline methods across all three datasets; for instance, we improve upon the +vanilla MoCo-v2 by 4.1% on ImageNet under a linear evaluation protocol over 800 +epochs. We also report results on semi-supervised learning, transfer learning +on downstream tasks, and object detection. + +
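The selection of semantic positives can be approximated by a nearest-neighbour search in an embedding space; the k-NN criterion below is an assumption for illustration and not necessarily the paper's selection rule.

```python
import torch
import torch.nn.functional as F

def semantic_positive_pairs(embeddings, k=1):
    """Treat each image's nearest neighbours in an embedding space as extra
    positives for instance-discrimination training, instead of relying only on
    two augmented views of the same image. embeddings: (N, D)."""
    z = F.normalize(embeddings, dim=1)
    sim = z @ z.t()                                  # cosine similarity matrix
    sim.fill_diagonal_(-float("inf"))                # exclude the image itself
    return sim.topk(k, dim=1).indices                # indices of candidate semantic positives

z = torch.randn(128, 256)                            # placeholder features
pos_idx = semantic_positive_pairs(z, k=2)            # (128, 2) positive indices per image
print(pos_idx.shape)
```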
+
+ comment: 17 pages, 6 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to + Enhance Visio-Linguistic Compositional Understanding CVPR 2024 + + +
+ Vision-Language Models (VLMs), such as CLIP, exhibit strong image-text +comprehension abilities, facilitating advances in several downstream tasks such +as zero-shot image classification, image-text retrieval, and text-to-image +generation. However, the compositional reasoning abilities of existing VLMs +remain subpar. The root of this limitation lies in the inadequate alignment +between the images and captions in the pretraining datasets. Additionally, the +current contrastive learning objective fails to focus on fine-grained grounding +components like relations, actions, and attributes, resulting in "bag-of-words" +representations. We introduce a simple and effective method to improve +compositional reasoning in VLMs. Our method better leverages available datasets +by refining and expanding the standard image-text contrastive learning +framework. Our approach does not require specific annotations and does not +incur extra parameters. When integrated with CLIP, our technique yields notable +improvement over state-of-the-art baselines across five vision-language +compositional benchmarks. We open-source our code at +https://github.com/lezhang7/Enhance-FineGrained. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ OMEGAS: Object Mesh Extraction from Large Scenes Guided by Gaussian + Segmentation + + +
+ Recent advancements in 3D reconstruction technologies have paved the way for +high-quality and real-time rendering of complex 3D scenes. Despite these +achievements, a notable challenge persists: it is difficult to precisely +reconstruct specific objects from large scenes. Current scene reconstruction +techniques frequently result in the loss of object detail textures and are +unable to reconstruct object portions that are occluded or unseen in views. To +address this challenge, we delve into the meticulous 3D reconstruction of +specific objects within large scenes and propose a framework termed OMEGAS: +Object Mesh Extraction from Large Scenes Guided by GAussian Segmentation. +OMEGAS employs a multi-step approach, grounded in several excellent +off-the-shelf methodologies. Specifically, initially, we utilize the Segment +Anything Model (SAM) to guide the segmentation of 3D Gaussian Splatting (3DGS), +thereby creating a basic 3DGS model of the target object. Then, we leverage +large-scale diffusion priors to further refine the details of the 3DGS model, +especially aimed at addressing invisible or occluded object portions from the +original scene views. Subsequently, by re-rendering the 3DGS model onto the +scene views, we achieve accurate object segmentation and effectively remove the +background. Finally, these target-only images are used to improve the 3DGS +model further and extract the definitive 3D object mesh by the SuGaR model. In +various scenarios, our experiments demonstrate that OMEGAS significantly +surpasses existing scene reconstruction methods. Our project page is at: +https://github.com/CrystalWlz/OMEGAS + +
+
+ comment: arXiv admin note: text overlap with arXiv:2311.17061 by other authors +
+
+
+
+
+ + ♻ ☆ GaussianTalker: Real-Time High-Fidelity Talking Head Synthesis with + Audio-Driven 3D Gaussian Splatting + + +
+ We propose GaussianTalker, a novel framework for real-time generation of +pose-controllable talking heads. It leverages the fast rendering capabilities +of 3D Gaussian Splatting (3DGS) while addressing the challenges of directly +controlling 3DGS with speech audio. GaussianTalker constructs a canonical 3DGS +representation of the head and deforms it in sync with the audio. A key insight +is to encode the 3D Gaussian attributes into a shared implicit feature +representation, where it is merged with audio features to manipulate each +Gaussian attribute. This design exploits the spatial-aware features and +enforces interactions between neighboring points. The feature embeddings are +then fed to a spatial-audio attention module, which predicts frame-wise offsets +for the attributes of each Gaussian. It is more stable than previous +concatenation or multiplication approaches for manipulating the numerous +Gaussians and their intricate parameters. Experimental results showcase +GaussianTalker's superiority in facial fidelity, lip synchronization accuracy, +and rendering speed compared to previous methods. Specifically, GaussianTalker +achieves a remarkable rendering speed up to 120 FPS, surpassing previous +benchmarks. Our code is made available at +https://github.com/KU-CVLAB/GaussianTalker/ . + +
+
+ comment: Project Page: https://ku-cvlab.github.io/GaussianTalker +
+
+
+
+
+ + ♻ ☆ SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems + + +
+ Data imbalance in training data often leads to biased predictions from +trained models, which in turn causes ethical and social issues. A +straightforward solution is to carefully curate training data, but given the +enormous scale of modern neural networks, this is prohibitively labor-intensive +and thus impractical. Inspired by recent developments in generative models, +this paper explores the potential of synthetic data to address the data +imbalance problem. To be specific, our method, dubbed SYNAuG, leverages +synthetic data to equalize the unbalanced distribution of training data. Our +experiments demonstrate that, although a domain gap between real and synthetic +data exists, training with SYNAuG followed by fine-tuning with a few real +samples makes it possible to achieve impressive performance on diverse tasks +with different data imbalance issues, surpassing existing task-specific methods +for the same purpose. +
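The balancing step itself is simple bookkeeping, as in the sketch below; `generate(cls, n)` is a hypothetical hook into whatever class-conditional generator is used, and the top-up-to-the-largest-class policy is our assumption.

```python
from collections import Counter

def synaug_plan(labels, generate):
    """For every class, generate enough synthetic samples to top the class up
    to the size of the largest class."""
    counts = Counter(labels)
    target = max(counts.values())
    synthetic = {}
    for cls, n in counts.items():
        if n < target:
            synthetic[cls] = generate(cls, target - n)   # e.g. diffusion samples for `cls`
    return synthetic

plan = synaug_plan(["cat"] * 90 + ["dog"] * 10,
                   generate=lambda cls, n: [f"synthetic_{cls}_{i}" for i in range(n)])
print({k: len(v) for k, v in plan.items()})              # {'dog': 80}
```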
+
+ comment: The paper is under consideration at Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Deep Learning-based Research on Radiology Report + Generation + + +
+ Radiology report generation (RRG) aims to automatically generate free-text +descriptions from clinical radiographs, e.g., chest X-ray images. RRG plays an +essential role in promoting clinical automation, providing practical assistance +to inexperienced doctors and alleviating radiologists' workloads. Given this +meaningful potential, research on RRG has experienced explosive growth over the +past half-decade, especially with the rapid development of deep learning +approaches. Existing studies perform RRG from the perspective of enhancing +different modalities, provide insights on optimizing the report generation +process with elaborated features from both visual and textual information, and +further facilitate RRG with cross-modal interactions among them. In this paper, +we present a comprehensive review of deep learning-based RRG from various +perspectives. Specifically, we first cover pivotal RRG approaches based on the +task-specific features of radiographs, reports, and the cross-modal relations +between them, then illustrate the benchmark datasets conventionally used for +this task along with their evaluation metrics, subsequently analyze the +performance of different approaches, and finally offer our summary of the +challenges and trends in future directions. Overall, the goal of this paper is +to serve as a tool for understanding the existing literature and inspiring +potential valuable research in the field of RRG. +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Accurate Spatial Gene Expression Prediction by integrating + Multi-resolution features CVPR 2024 + + +
+ Recent advancements in Spatial Transcriptomics (ST) technology have +facilitated detailed gene expression analysis within tissue contexts. However, +the high costs and methodological limitations of ST necessitate a more robust +predictive model. In response, this paper introduces TRIPLEX, a novel deep +learning framework designed to predict spatial gene expression from Whole Slide +Images (WSIs). TRIPLEX uniquely harnesses multi-resolution features, capturing +cellular morphology at individual spots, the local context around these spots, +and the global tissue organization. By integrating these features through an +effective fusion strategy, TRIPLEX achieves accurate gene expression +prediction. Our comprehensive benchmark study, conducted on three public ST +datasets and supplemented with Visium data from 10X Genomics, demonstrates that +TRIPLEX outperforms current state-of-the-art models in Mean Squared Error +(MSE), Mean Absolute Error (MAE), and Pearson Correlation Coefficient (PCC). +The model's predictions align closely with ground truth gene expression +profiles and tumor annotations, underscoring TRIPLEX's potential in advancing +cancer diagnosis and treatment. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LuViRA Dataset Validation and Discussion: Comparing Vision, Radio, and + Audio Sensors for Indoor Localization + + +
+ We present a unique comparative analysis, and evaluation of vision, radio, +and audio based localization algorithms. We create the first baseline for the +aforementioned sensors using the recently published Lund University Vision, +Radio, and Audio (LuViRA) dataset, where all the sensors are synchronized and +measured in the same environment. Some of the challenges of using each specific +sensor for indoor localization tasks are highlighted. Each sensor is paired +with a current state-of-the-art localization algorithm and evaluated for +different aspects: localization accuracy, reliability and sensitivity to +environment changes, calibration requirements, and potential system complexity. +Specifically, the evaluation covers the ORB-SLAM3 algorithm for vision-based +localization with an RGB-D camera, a machine-learning algorithm for radio-based +localization with massive MIMO technology, and the SFS2 algorithm for +audio-based localization with distributed microphones. The results can serve as +a guideline and basis for further development of robust and high-precision +multi-sensory localization systems, e.g., through sensor fusion, context, and +environment-aware adaptation. + +
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Impersonation and Dodging Attacks on Face Recognition Systems + + +
+ Face Recognition (FR) systems can be easily deceived by adversarial examples +that manipulate benign face images through imperceptible perturbations. +Adversarial attacks on FR encompass two types: impersonation (targeted) attacks +and dodging (untargeted) attacks. Previous methods often achieve a successful +impersonation attack on FR; however, this does not necessarily guarantee a +successful dodging attack on FR in the black-box setting. In this paper, our +key insight is that the generation of adversarial examples should perform both +impersonation and dodging attacks simultaneously. To this end, we propose a +novel attack method termed Adversarial Pruning (Adv-Pruning), which fine-tunes +existing adversarial examples to enhance their dodging capabilities while +preserving their impersonation capabilities. Adv-Pruning consists of Priming, +Pruning, and Restoration stages. Concretely, we propose Adversarial Priority +Quantification to measure the region-wise priority of original adversarial +perturbations, identifying and releasing those with minimal impact on absolute +model output variances. Then, Biased Gradient Adaptation is presented to adapt +the adversarial examples to traverse the decision boundaries of both the +attacker and victim by adding perturbations favoring dodging attacks on the +vacated regions, preserving the prioritized features of the original +perturbations while boosting dodging performance. As a result, we can maintain +the impersonation capabilities of original adversarial examples while +effectively enhancing dodging capabilities. Comprehensive experiments +demonstrate the superiority of our method compared with state-of-the-art +adversarial attacks. +
+
+
+
+
+ + ♻ ☆ HDBN: A Novel Hybrid Dual-branch Network for Robust Skeleton-based + Action Recognition + + +
+ Skeleton-based action recognition has gained considerable traction thanks to +its utilization of succinct and robust skeletal representations. Nonetheless, +current methodologies often lean towards utilizing a solitary backbone to model +skeleton modality, which can be limited by inherent flaws in the network +backbone. To address this and fully leverage the complementary characteristics +of various network architectures, we propose a novel Hybrid Dual-Branch Network +(HDBN) for robust skeleton-based action recognition, which benefits from the +graph convolutional network's proficiency in handling graph-structured data and +the powerful modeling capabilities of Transformers for global information. In +detail, our proposed HDBN is divided into two trunk branches: MixGCN and +MixFormer. The two branches utilize GCNs and Transformers to model both 2D and +3D skeletal modalities respectively. Our proposed HDBN emerged as one of the +top solutions in the Multi-Modal Video Reasoning and Analyzing Competition +(MMVRAC) of 2024 ICME Grand Challenge, achieving accuracies of 47.95% and +75.36% on two benchmarks of the UAV-Human dataset by outperforming most +existing methods. Our code will be publicly available at: +https://github.com/liujf69/ICMEW2024-Track10. + +
+
+
+
+
+ + ♻ ☆ Collaborative Semantic Occupancy Prediction with Hybrid Feature Fusion + in Connected Automated Vehicles CVPR2024 + + +
+ Collaborative perception in automated vehicles leverages the exchange of +information between agents, aiming to elevate perception results. Previous +camera-based collaborative 3D perception methods typically employ 3D bounding +boxes or bird's eye views as representations of the environment. However, these +approaches fall short in offering a comprehensive 3D environmental prediction. +To bridge this gap, we introduce the first method for collaborative 3D semantic +occupancy prediction. Particularly, it improves local 3D semantic occupancy +predictions by hybrid fusion of (i) semantic and occupancy task features, and +(ii) compressed orthogonal attention features shared between vehicles. +Additionally, due to the lack of a collaborative perception dataset designed +for semantic occupancy prediction, we augment a current collaborative +perception dataset to include 3D collaborative semantic occupancy labels for a +more robust evaluation. The experimental findings highlight that: (i) our +collaborative semantic occupancy predictions excel above the results from +single vehicles by over 30%, and (ii) models anchored on semantic occupancy +outpace state-of-the-art collaborative 3D detection techniques in subsequent +perception applications, showcasing enhanced accuracy and enriched +semantic-awareness in road environments. + +
+
+ comment: Accepted by CVPR2024. Website link: + https://rruisong.github.io/publications/CoHFF +
+
+
+
+
+ + ♻ ☆ Domain adaptive pose estimation via multi-level alignment + + +
+ Domain adaptive pose estimation aims to enable deep models trained on source +domain (synthesized) datasets to produce similar results on target domain +(real-world) datasets. Existing methods have made significant progress by +conducting image-level or feature-level alignment. However, aligning at only a +single level is not sufficient to fully bridge the domain gap and achieve +excellent domain adaptive results. In this paper, we propose a multi-level +domain adaptation approach, which aligns different domains at the image, +feature, and pose levels. Specifically, we first utilize image style transfer +to ensure that images from the source and target domains have a similar +distribution. Subsequently, at the feature level, we employ adversarial +training to make the features from the source and target domains preserve +domain-invariant characteristics as much as possible. Finally, at the pose +level, a self-supervised approach is utilized to enable the model to learn +diverse knowledge, implicitly addressing the domain gap. Experimental results +demonstrate that significant improvement can be achieved by the proposed +multi-level alignment method in pose estimation, which outperforms the previous +state of the art in human pose estimation by up to 2.4% and in animal pose +estimation by up to 3.1% for dogs and 1.4% for sheep. +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Rank Patches for Unbiased Image Redundancy Reduction CVPR 2024 + + +
+ Images suffer from heavy spatial redundancy because pixels in neighboring +regions are spatially correlated. Existing approaches strive to overcome this +limitation by reducing less meaningful image regions. However, current leading +methods rely on supervisory signals. They may compel models to preserve content +that aligns with labeled categories and discard content belonging to unlabeled +categories. This categorical inductive bias makes these methods less effective +in real-world scenarios. To address this issue, we propose a self-supervised +framework for image redundancy reduction called Learning to Rank Patches +(LTRP). We observe that image reconstruction of masked image modeling models is +sensitive to the removal of visible patches when the masking ratio is high +(e.g., 90\%). Building upon it, we implement LTRP via two steps: inferring the +semantic density score of each patch by quantifying variation between +reconstructions with and without this patch, and learning to rank the patches +with the pseudo score. The entire process is self-supervised, thus getting out +of the dilemma of categorical inductive bias. We design extensive experiments +on different datasets and tasks. The results demonstrate that LTRP outperforms +both supervised and other self-supervised methods due to the fair assessment of +image content. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs + + +
+ Is vision good enough for language? Recent advancements in multimodal models +primarily stem from the powerful reasoning abilities of large language models +(LLMs). However, the visual component typically depends only on the +instance-level contrastive language-image pre-training (CLIP). Our research +reveals that the visual capabilities in recent multimodal LLMs (MLLMs) still +exhibit systematic shortcomings. To understand the roots of these errors, we +explore the gap between the visual embedding space of CLIP and vision-only +self-supervised learning. We identify "CLIP-blind pairs" - images that CLIP +perceives as similar despite their clear visual differences. With these pairs, +we construct the Multimodal Visual Patterns (MMVP) benchmark. MMVP exposes +areas where state-of-the-art systems, including GPT-4V, struggle with +straightforward questions across nine basic visual patterns, often providing +incorrect answers and hallucinated explanations. We further evaluate various +CLIP-based vision-and-language models and find a notable correlation between +visual patterns that challenge CLIP models and those problematic for multimodal +LLMs. As an initial effort to address these issues, we propose a Mixture of +Features (MoF) approach, demonstrating that integrating vision self-supervised +learning features with MLLMs can significantly enhance their visual grounding +capabilities. Together, our research suggests visual representation learning +remains an open challenge, and accurate visual grounding is crucial for future +successful multimodal systems. +
+
+ comment: Project page: https://tsb0601.github.io/mmvp_blog/ +
+
+
+
+
+ + ♻ ☆ Revisiting Neural Networks for Continual Learning: An Architectural + Perspective + + +
+ Efforts to overcome catastrophic forgetting have primarily centered around +developing more effective Continual Learning (CL) methods. In contrast, less +attention was devoted to analyzing the role of network architecture design +(e.g., network depth, width, and components) in contributing to CL. This paper +seeks to bridge this gap between network architecture design and CL, and to +present a holistic study on the impact of network architectures on CL. This +work considers architecture design at the network scaling level, i.e., width +and depth, and also at the network components, i.e., skip connections, global +pooling layers, and down-sampling. In both cases, we first derive insights +through systematically exploring how architectural designs affect CL. Then, +grounded in these insights, we craft a specialized search space for CL and +further propose a simple yet effective ArchCraft method to steer a CL-friendly +architecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC. +Experimental validation across various CL settings and scenarios demonstrates +that improved architectures are parameter-efficient, achieving state-of-the-art +performance of CL while being 86%, 61%, and 97% more compact in terms of +parameters than the naive CL architecture in Task IL and Class IL. Code is +available at https://github.com/byyx666/ArchCraft. + +
+
+
+
+
+ + ♻ ☆ TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And + Image-Prompts + + +
+ Text-driven 3D scene editing has gained significant attention owing to its +convenience and user-friendliness. However, existing methods still lack +accurate control of the specified appearance and location of the editing result +due to the inherent limitations of the text description. To this end, we +propose a 3D scene editing framework, TIP-Editor, that accepts both text and +image prompts and a 3D bounding box to specify the editing region. With the +image prompt, users can conveniently specify the detailed appearance/style of +the target content as a complement to the text description, enabling accurate +control of the appearance. Specifically, TIP-Editor employs a stepwise 2D +personalization strategy to better learn the representation of the existing +scene and the reference image, in which a localization loss is proposed to +encourage correct object placement as specified by the bounding box. +Additionally, TIP-Editor utilizes explicit and flexible 3D Gaussian splatting +as the 3D representation to facilitate local editing while keeping the +background unchanged. Extensive experiments have demonstrated that TIP-Editor +conducts accurate editing following the text and image prompts in the specified +bounding box region, consistently outperforming the baselines in editing +quality and alignment to the prompts, both qualitatively and quantitatively. +
+
+ comment: Accepted by SIGGRAPH 2024 & ACM Transactions on Graphics +
+
+
+
+
+ + ♻ ☆ DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage + CJK Character Generation + + +
+ Chinese, Japanese, and Korean (CJK), with a vast number of native speakers, +have profound influence on society and culture. The typesetting of CJK +languages carries a wide range of requirements due to the complexity of their +scripts and unique literary traditions. A critical aspect of this typesetting +process is that CJK fonts need to provide a set of consistent-looking glyphs +for approximately one hundred thousand characters. However, creating such a +font is inherently labor-intensive and expensive, which significantly hampers +the development of new CJK fonts for typesetting, historical, aesthetic, or +artistic purposes. To bridge this gap, we are motivated by recent advancements +in diffusion-based generative models and propose a novel diffusion method for +generating glyphs in a targeted style from a single conditioned, standard glyph +form. Our experiments show that our method is capable of generating fonts of +both printed and hand-written styles, the latter of which presents a greater +challenge. Moreover, our approach shows remarkable zero-shot generalization +capabilities for non-CJK but Chinese-inspired scripts. We also show our method +facilitates smooth style interpolation and generates bitmap images suitable for +vectorization, which is crucial in the font creation process. In summary, our +proposed method opens the door to high-quality, generative model-assisted font +creation for CJK characters, for both typesetting and artistic endeavors. + +
+
+ comment: Accepted in 15th International Conference on Computational + Creativity, ICCC'24 +
+
+
+
+
+ + ♻ ☆ OneChart: Purify the Chart Structural Extraction via One Auxiliary Token + + +
+ Chart parsing poses a significant challenge due to the diversity of styles, +values, texts, and so forth. Even advanced large vision-language models (LVLMs) +with billions of parameters struggle to handle such tasks satisfactorily. To +address this, we propose OneChart: a reliable agent specifically devised for +the structural extraction of chart information. Similar to popular LVLMs, +OneChart incorporates an autoregressive main body. Uniquely, to enhance the +reliability of the numerical parts of the output, we introduce an auxiliary +token placed at the beginning of the total tokens along with an additional +decoder. The numerically optimized (auxiliary) token allows subsequent tokens +for chart parsing to capture enhanced numerical features through causal +attention. Furthermore, with the aid of the auxiliary token, we have devised a +self-evaluation mechanism that enables the model to gauge the reliability of +its chart parsing results by providing confidence scores for the generated +content. Compared to current state-of-the-art (SOTA) chart parsing models, +e.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average +Precision (AP) for chart structural extraction across multiple public +benchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart +parsing agent, it also brings 10%+ accuracy gains for the popular LVLM +(LLaVA-1.6) in the downstream ChartQA benchmark. + +
+
+ comment: 14 pages, 9 figures and 6 tables +
+
+
+
+
+ + ♻ ☆ What Makes Multimodal In-Context Learning Work? CVPR 2024 + + +
+ Large Language Models have demonstrated remarkable performance across various +tasks, exhibiting the capacity to swiftly acquire new skills, such as through +In-Context Learning (ICL) with minimal demonstration examples. In this work, we +present a comprehensive framework for investigating Multimodal ICL (M-ICL) in +the context of Large Multimodal Models. We consider the best open-source +multimodal models (e.g., IDEFICS, OpenFlamingo) and a wide range of multimodal +tasks. Our study unveils several noteworthy findings: (1) M-ICL primarily +relies on text-driven mechanisms, showing little to no influence from the image +modality. (2) When used with advanced-ICL strategy (like RICES), M-ICL is not +better than a simple strategy based on majority voting over context examples. +Moreover, we identify several biases and limitations of M-ICL that warrant +consideration prior to deployment. Code available at +https://gitlab.com/folbaeni/multimodal-icl + +
+
+ comment: 20 pages, 16 figures. Accepted to CVPR 2024 Workshop on Prompting in + Vision. Project page: https://folbaeni.gitlab.io/multimodal-icl +
+
+
+
+
+ + ♻ ☆ Unexplored Faces of Robustness and Out-of-Distribution: Covariate Shifts + in Environment and Sensor Domains CVPR 2024 + + +
+ Computer vision applications predict on digital images acquired by a camera +from physical scenes through light. However, conventional robustness benchmarks +rely on perturbations in digitized images, diverging from distribution shifts +occurring in the image acquisition process. To bridge this gap, we introduce a +new distribution shift dataset, ImageNet-ES, comprising variations in +environmental and camera sensor factors by directly capturing 202k images with +a real camera in a controllable testbed. With the new dataset, we evaluate +out-of-distribution (OOD) detection and model robustness. We find that existing +OOD detection methods do not cope with the covariate shifts in ImageNet-ES, +implying that the definition and detection of OOD should be revisited to +embrace real-world distribution shifts. We also observe that the model becomes +more robust in both ImageNet-C and -ES by learning environment and sensor +variations in addition to existing digital augmentations. Lastly, our results +suggest that effective shift mitigation via camera sensor control can +significantly improve performance without increasing model size. With these +findings, our benchmark may aid future research on robustness, OOD, and camera +sensor control for computer vision. Our code and dataset are available at +https://github.com/Edw2n/ImageNet-ES. + +
+
+ comment: Published as a conference paper at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Delocate: Detection and Localization for Deepfake Videos with + Randomly-Located Tampered Traces + + +
+ Deepfake videos are becoming increasingly realistic, showing subtle tampering
+traces on facial areas that vary between frames. Consequently, many existing
+Deepfake detection methods struggle to detect unknown domain Deepfake videos
+while accurately locating the tampered region. To address this limitation, we
+propose Delocate, a novel Deepfake detection model that can both recognize
+and localize unknown domain Deepfake videos. Our method consists of two stages
+named recovering and localization. In the recovering stage, the model randomly
+masks regions of interest (ROIs) and reconstructs real faces without tampering
+traces, resulting in a relatively good recovery effect for real faces and a poor
+recovery effect for fake faces. In the localization stage, the output of the
+recovery phase and the forgery ground truth mask serve as supervision to guide
+the forgery localization process. This process strategically emphasizes the
+recovery phase of fake faces with poor recovery, facilitating the localization
+of tampered regions. Our extensive experiments on four widely used benchmark
+datasets demonstrate that Delocate not only excels in localizing tampered areas
+but also enhances cross-domain detection performance.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.09921, + arXiv:2305.05943 +
+
+
+
+
+ + ♻ ☆ FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation + + +
+ LiDAR segmentation has become a crucial component in advanced autonomous +driving systems. Recent range-view LiDAR segmentation approaches show promise +for real-time processing. However, they inevitably suffer from corrupted +contextual information and rely heavily on post-processing techniques for +prediction refinement. In this work, we propose FRNet, a simple yet powerful +method aimed at restoring the contextual information of range image pixels +using corresponding frustum LiDAR points. Firstly, a frustum feature encoder +module is used to extract per-point features within the frustum region, which +preserves scene consistency and is crucial for point-level predictions. Next, a +frustum-point fusion module is introduced to update per-point features +hierarchically, enabling each point to extract more surrounding information via +the frustum features. Finally, a head fusion module is used to fuse features at +different levels for final semantic prediction. Extensive experiments conducted +on four popular LiDAR segmentation benchmarks under various task setups +demonstrate the superiority of FRNet. Notably, FRNet achieves 73.3% and 82.5% +mIoU scores on the testing sets of SemanticKITTI and nuScenes. While achieving +competitive performance, FRNet operates 5 times faster than state-of-the-art +approaches. Such high efficiency opens up new possibilities for more scalable +LiDAR segmentation. The code has been made publicly available at +https://github.com/Xiangxu-0103/FRNet. + +
+
+ comment: Preprint; 16 pages, 8 figures, 10 tables; Code at + https://github.com/Xiangxu-0103/FRNet +
+
+
+
+
+ + ♻ ☆ Backpropagation-free Network for 3D Test-time Adaptation CVPR 2024 + + +
+ Real-world systems often encounter new data over time, which leads to +experiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods +tend to apply computationally heavy and memory-intensive backpropagation-based +approaches to handle this. Here, we propose a novel method that uses a +backpropagation-free approach for TTA for the specific case of 3D data. Our +model uses a two-stream architecture to maintain knowledge about the source +domain as well as complementary target-domain-specific information. The +backpropagation-free property of our model helps address the well-known +forgetting problem and mitigates the error accumulation issue. The proposed +method also eliminates the need for the usually noisy process of +pseudo-labeling and reliance on costly self-supervised training. Moreover, our +method leverages subspace learning, effectively reducing the distribution +variance between the two domains. Furthermore, the source-domain-specific and +the target-domain-specific streams are aligned using a novel entropy-based +adaptive fusion strategy. Extensive experiments on popular benchmarks +demonstrate the effectiveness of our method. The code will be available at +\url{https://github.com/abie-e/BFTT3D}. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ InfoMatch: Entropy Neural Estimation for Semi-Supervised Image + Classification IJCAI 2024 + + +
+ Semi-supervised image classification, leveraging pseudo supervision and +consistency regularization, has demonstrated remarkable success. However, the +ongoing challenge lies in fully exploiting the potential of unlabeled data. To +address this, we employ information entropy neural estimation to utilize the +potential of unlabeled samples. Inspired by contrastive learning, the entropy +is estimated by maximizing a lower bound on mutual information across different +augmented views. Moreover, we theoretically analyze that the information +entropy of the posterior of an image classifier is approximated by maximizing +the likelihood function of the softmax predictions. Guided by these insights, +we optimize our model from both perspectives to ensure that the predicted +probability distribution closely aligns with the ground-truth distribution. +Given the theoretical connection to information entropy, we name our method +InfoMatch. Through extensive experiments, we show its superior performance. The +source code is available at https://github.com/kunzhan/InfoMatch. + +
+
+ comment: IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Optimizing Calibration by Gaining Aware of Prediction Correctness + + +
+ Model calibration aims to align confidence with prediction correctness. The
+Cross-Entropy (CE) loss is widely used for calibrator training, which enforces
+the model to increase confidence on the ground truth class. However, we find
+the CE loss has intrinsic limitations. For example, for a narrow
+misclassification, a calibrator trained by the CE loss often produces high
+confidence on the wrongly predicted class (e.g., a test sample is wrongly
+classified and its softmax score on the ground truth class is around 0.4),
+which is undesirable. In this paper, we propose a new post-hoc calibration
+objective derived from the aim of calibration. Intuitively, the proposed
+objective function asks that the calibrator decrease model confidence on
+wrongly predicted samples and increase confidence on correctly predicted
+samples. Because a sample itself has insufficient ability to indicate
+correctness, we use its transformed versions (e.g., rotated, greyscaled and
+color-jittered) during calibrator training. Trained on an in-distribution
+validation set and tested with isolated, individual test samples, our method
+achieves competitive calibration performance on both in-distribution and
+out-of-distribution test sets compared with the state of the art. Further, our
+analysis points out the difference between our method and commonly used
+objectives such as the CE loss and mean square error loss, which sometimes
+deviate from the calibration aim.
+
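+ To make the proposed objective concrete, here is a minimal, hedged sketch of a
+correctness-aware post-hoc calibration loss: it pushes the calibrated confidence
+toward 1 on correctly predicted samples and toward 0 on misclassified ones. The
+temperature-scaling calibrator, the binary cross-entropy form, and the random
+logits are illustrative assumptions, not the paper's exact objective or code.
+
+ import torch
+ import torch.nn.functional as F
+
+ def correctness_aware_loss(logits, labels, temperature):
+     # Calibrated confidence = max softmax probability after temperature scaling.
+     probs = F.softmax(logits / temperature, dim=1)
+     conf, pred = probs.max(dim=1)
+     correct = (pred == labels).float()          # 1 if the prediction is right
+     # Align confidence with correctness rather than with the ground-truth class.
+     return F.binary_cross_entropy(conf, correct)
+
+ # Toy calibration loop on random validation logits (placeholder data).
+ torch.manual_seed(0)
+ logits = torch.randn(512, 10)
+ labels = torch.randint(0, 10, (512,))
+ log_t = torch.zeros(1, requires_grad=True)      # optimize log-temperature
+ opt = torch.optim.Adam([log_t], lr=0.05)
+ for _ in range(100):
+     opt.zero_grad()
+     loss = correctness_aware_loss(logits, labels, log_t.exp())
+     loss.backward()
+     opt.step()
+ print("learned temperature:", float(log_t.exp()))
+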
+
+
+
+
+ + ♻ ☆ MyriadAL: Active Few Shot Learning for Histopathology + + +
+ Active Learning (AL) and Few Shot Learning (FSL) are two label-efficient +methods which have achieved excellent results recently. However, most prior +arts in both learning paradigms fail to explore the wealth of the vast +unlabelled data. In this study, we address this issue in the scenario where the +annotation budget is very limited, yet a large amount of unlabelled data for +the target task is available. We frame this work in the context of +histopathology where labelling is prohibitively expensive. To this end, we +introduce an active few shot learning framework, Myriad Active Learning (MAL), +including a contrastive-learning encoder, pseudo-label generation, and novel +query sample selection in the loop. Specifically, we propose to massage +unlabelled data in a self-supervised manner, where the obtained data +representations and clustering knowledge form the basis to activate the AL +loop. With feedback from the oracle in each AL cycle, the pseudo-labels of the +unlabelled data are refined by optimizing a shallow task-specific net on top of +the encoder. These updated pseudo-labels serve to inform and improve the active +learning query selection process. Furthermore, we introduce a novel recipe to +combine existing uncertainty measures and utilize the entire uncertainty list +to reduce sample redundancy in AL. Extensive experiments on two public +histopathology datasets show that MAL has superior test accuracy, macro +F1-score, and label efficiency compared to prior works, and can achieve a +comparable test accuracy to a fully supervised algorithm while labelling only +5% of the dataset. + +
+
+ comment: Accepted to IEEE CAI 2024. 8 pages, 2 figures. Code available at: + https://github.com/mesophil/MyriadAL +
+
+
+
+
+ + ♻ ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs) +that focuses on core visual perception abilities not found in other +evaluations. Most of the Blink tasks can be solved by humans "within a blink" +(e.g., relative depth estimation, visual correspondence, forensics detection, +and multi-view reasoning). However, we find these perception-demanding tasks +cast significant challenges for current multimodal LLMs because they resist +mediation through natural language. Blink reformats 14 classic computer vision +tasks into 3,807 multiple-choice questions, paired with single or multiple +images and visual prompting. While humans get 95.70% accuracy on average, Blink +is surprisingly challenging for existing multimodal LLMs: even the +best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only +13.17% and 7.63% higher than random guessing, indicating that such perception +abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also +highlights that specialist CV models could solve these problems much better, +suggesting potential pathways for future improvements. We believe Blink will +stimulate the community to help multimodal LLMs catch up with human-level +visual perception. + +
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/ +
+
+
+
+
+ + ♻ ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, +their black-box nature makes uncertainty quantification challenging. We +investigate the presentation of conformal prediction sets--a distribution-free +class of methods for generating prediction sets with specified coverage--to +express uncertainty in AI-advised decision-making. Through a large online +experiment, we compare the utility of conformal prediction sets to displays of +Top-1 and Top-k predictions for AI-advised image labeling. In a pre-registered +analysis, we find that the utility of prediction sets for accuracy varies with +the difficulty of the task: while they result in accuracy on par with or less +than Top-1 and Top-k displays for easy images, prediction sets offer some +advantage in assisting humans in labeling out-of-distribution (OOD) images in +the setting that we studied, especially when the set size is small. Our results +empirically pinpoint practical challenges of conformal prediction sets and +provide implications on how to incorporate them for real-world decision-making. + +
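+ For readers unfamiliar with conformal prediction sets, the split-conformal recipe
+below is a minimal sketch of how such sets are built with a user-specified
+coverage level. The nonconformity score and the toy Dirichlet "classifier"
+outputs are illustrative assumptions, not the study's implementation.
+
+ import numpy as np
+
+ def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+     """Split conformal prediction: label sets with ~(1 - alpha) coverage."""
+     n = len(cal_labels)
+     # Nonconformity score: 1 - softmax probability of the true class.
+     scores = 1.0 - cal_probs[np.arange(n), cal_labels]
+     # Finite-sample corrected quantile of the calibration scores.
+     q = np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n, method="higher")
+     # Include every label whose score does not exceed the threshold.
+     return [np.where(1.0 - p <= q)[0] for p in test_probs]
+
+ rng = np.random.default_rng(0)
+ cal_probs = rng.dirichlet(np.ones(5), size=200)   # placeholder classifier outputs
+ cal_labels = rng.integers(0, 5, size=200)
+ test_probs = rng.dirichlet(np.ones(5), size=3)
+ print(conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1))
+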
+
+ comment: 19 pages, 11 figures, 10 tables. Accepted by ACM CHI 2024 +
+
+
+
+
+ + ♻ ☆ Label Delay in Online Continual Learning + + +
+ Online continual learning, the process of training models on streaming data, +has gained increasing attention in recent years. However, a critical aspect +often overlooked is the label delay, where new data may not be labeled due to +slow and costly annotation processes. We introduce a new continual learning +framework with explicit modeling of the label delay between data and label +streams over time steps. In each step, the framework reveals both unlabeled +data from the current time step $t$ and labels delayed with $d$ steps, from the +time step $t-d$. In our extensive experiments amounting to 1060 GPU days, we +show that merely augmenting the computational resources is insufficient to +tackle this challenge. Our findings underline a notable performance decline +when solely relying on labeled data when the label delay becomes significant. +More surprisingly, when using state-of-the-art SSL and TTA techniques to +utilize the newer, unlabeled data, they fail to surpass the performance of a +na\"ive method that simply trains on the delayed supervised stream. To this +end, we introduce a simple, efficient baseline that rehearses from the labeled +memory samples that are most similar to the new unlabeled samples. This method +bridges the accuracy gap caused by label delay without significantly increasing +computational complexity. We show experimentally that our method is the least +affected by the label delay factor and in some cases successfully recovers the +accuracy of the non-delayed counterpart. We conduct various ablations and +sensitivity experiments, demonstrating the effectiveness of our approach. + +
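+ The baseline described above can be pictured with a short sketch: from a labeled
+memory buffer, replay the samples whose features are most similar to the incoming
+unlabeled batch. The cosine-similarity selection rule and the random feature
+placeholders are assumptions for illustration, not the paper's implementation.
+
+ import numpy as np
+
+ def select_rehearsal(memory_feats, new_unlabeled_feats, k=32):
+     """Return indices of the k memory samples closest to the new unlabeled batch."""
+     mem = memory_feats / np.linalg.norm(memory_feats, axis=1, keepdims=True)
+     new = new_unlabeled_feats / np.linalg.norm(new_unlabeled_feats, axis=1, keepdims=True)
+     sim = mem @ new.T                      # cosine similarity, shape (M, B)
+     best = sim.max(axis=1)                 # each memory sample's best match
+     return np.argsort(-best)[:k]           # most similar memory samples first
+
+ rng = np.random.default_rng(0)
+ memory = rng.normal(size=(1000, 128))      # features of labeled memory samples
+ incoming = rng.normal(size=(64, 128))      # features of unlabeled step-t data
+ print(select_rehearsal(memory, incoming, k=8))
+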
+
+ comment: 17 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Iris-SAM: Iris Segmentation Using a Foundation Model + + +
+ Iris segmentation is a critical component of an iris biometric system and it +involves extracting the annular iris region from an ocular image. In this work, +we develop a pixel-level iris segmentation model from a foundational model, +viz., Segment Anything Model (SAM), that has been successfully used for +segmenting arbitrary objects. The primary contribution of this work lies in the +integration of different loss functions during the fine-tuning of SAM on ocular +images. In particular, the importance of Focal Loss is borne out in the +fine-tuning process since it strategically addresses the class imbalance +problem (i.e., iris versus non-iris pixels). Experiments on ND-IRIS-0405, +CASIA-Iris-Interval-v3, and IIT-Delhi-Iris datasets convey the efficacy of the +trained model for the task of iris segmentation. For instance, on the +ND-IRIS-0405 dataset, an average segmentation accuracy of 99.58% was achieved, +compared to the best baseline performance of 89.75%. + +
+
+
+
+
+ + ♻ ☆ Model-agnostic explainable artificial intelligence for object detection + in image data + + +
+ In recent years, deep neural networks have been widely used for building
+high-performance Artificial Intelligence (AI) systems for computer vision
+applications. Object detection is a fundamental task in computer vision, which
+has progressed greatly through the development of large and intricate deep
+learning models. However, the lack of transparency is a big challenge that may
+not allow the widespread adoption of these models. Explainable artificial
+intelligence is a field of research where methods are developed to help users
+understand the behavior, decision logics, and vulnerabilities of AI systems.
+Previously, few explanation methods were developed for object detection, based
+on the idea of random masks. However, random masks may raise some issues
+regarding the actual importance of pixels within an image. In this paper, we
+design and implement a black-box explanation method named Black-box Object
+Detection Explanation by Masking (BODEM) through adopting a hierarchical random
+masking approach for AI-based object detection systems. We propose a
+hierarchical random masking framework in which coarse-grained masks are used at
+lower levels to find salient regions within an image, and fine-grained masks are
+used to refine the salient regions at higher levels. Experiments on various
+object detection datasets and models showed that BODEM can be effectively used
+to explain the behavior of object detectors. Moreover, our method outperformed
+Detector Randomized Input Sampling for Explanation (D-RISE) with respect to
+different quantitative measures of explanation effectiveness. The experimental
+results demonstrate that BODEM can be an effective method for explaining and
+validating object detection systems in black-box testing scenarios.
+
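+ The hierarchical random masking idea can be illustrated with a small sketch:
+coarse masks first localize salient regions via the drop in a black-box
+detector's score, and finer masks then refine the map. The detector_score
+function and grid sizes below are placeholders (assumptions), not the BODEM
+implementation.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ def detector_score(image):
+     """Placeholder black-box score; imagine the confidence of one detection."""
+     return image[20:40, 30:50].mean()       # pretend the object lives here
+
+ def masked_saliency(image, cell, n_masks=200, keep_prob=0.5):
+     H, W = image.shape
+     sal = np.zeros((H, W))
+     base = detector_score(image)
+     for _ in range(n_masks):
+         grid = rng.random((H // cell, W // cell)) < keep_prob
+         mask = np.kron(grid, np.ones((cell, cell)))[:H, :W]
+         drop = max(base - detector_score(image * mask), 0.0)
+         sal += drop * (1 - mask)            # blame the hidden cells for the drop
+     return sal / n_masks
+
+ img = rng.random((64, 64))
+ coarse = masked_saliency(img, cell=16)      # coarse level: locate salient regions
+ fine = masked_saliency(img, cell=4)         # finer level (applied globally here for brevity)
+ print(coarse.max(), fine.max())
+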
+
+
+
+
+ + ♻ ☆ Deep Image Composition Meets Image Forgery + + +
+ Image forgery is a topic that has been studied for many years. Before the +breakthrough of deep learning, forged images were detected using handcrafted +features that did not require training. These traditional methods failed to +perform satisfactorily even on datasets much worse in quality than real-life +image manipulations. Advances in deep learning have impacted image forgery +detection as much as they have impacted other areas of computer vision and have +improved the state of the art. Deep learning models require large amounts of +labeled data for training. In the case of image forgery, labeled data at the +pixel level is a very important factor for the models to learn. None of the +existing datasets have sufficient size, realism and pixel-level labeling at the +same time. This is due to the high cost of producing and labeling quality +images. It can take hours for an image editing expert to manipulate just one +image. To bridge this gap, we automate data generation using image composition +techniques that are very related to image forgery. Unlike other automated data +generation frameworks, we use state of the art image composition deep learning +models to generate spliced images close to the quality of real-life +manipulations. Finally, we test the generated dataset on the SOTA image +manipulation detection model and show that its prediction performance is lower +compared to existing datasets, i.e. we produce realistic images that are more +difficult to detect. Dataset will be available at +https://github.com/99eren99/DIS25k . + +
+
+
+
+
+ + ♻ ☆ Distilling Privileged Multimodal Information for Expression Recognition + using Optimal Transport + + +
+ Deep learning models for multimodal expression recognition have reached
+remarkable performance in controlled laboratory environments because of their
+ability to learn complementary and redundant semantic information. However,
+these models struggle in the wild, mainly because of the unavailability and
+quality of modalities used for training. In practice, only a subset of the
+training-time modalities may be available at test time. Learning with
+privileged information enables models to exploit data from additional
+modalities that are only available during training. State-of-the-art knowledge
+distillation (KD) methods have been proposed to distill information from
+multiple teacher models (each trained on a modality) to a common student model.
+These privileged KD methods typically utilize point-to-point matching, yet have
+no explicit mechanism to capture the structural information in the teacher
+representation space formed by introducing the privileged modality. To address
+this, we propose to distill privileged multimodal information via optimal
+transport (PKDOT), so that this structural information is preserved during
+distillation. Experiments were performed on two challenging problems - pain
+estimation on the Biovid dataset (ordinal classification) and arousal-valence
+prediction on the Affwild2 dataset (regression). Results show that our proposed
+method can outperform state-of-the-art privileged KD methods on these problems.
+The diversity among modalities and fusion architectures indicates that PKDOT is
+modality- and model-agnostic.
+
+
+
+
+
+ + ♻ ☆ ODIN: A Single Model for 2D and 3D Segmentation CVPR 2024 + + +
+ State-of-the-art models on contemporary 3D segmentation benchmarks like +ScanNet consume and label dataset-provided 3D point clouds, obtained through +post processing of sensed multiview RGB-D images. They are typically trained +in-domain, forego large-scale 2D pre-training and outperform alternatives that +featurize the posed RGB-D multiview images instead. The gap in performance +between methods that consume posed images versus post-processed 3D point clouds +has fueled the belief that 2D and 3D perception require distinct model +architectures. In this paper, we challenge this view and propose ODIN +(Omni-Dimensional INstance segmentation), a model that can segment and label +both 2D RGB images and 3D point clouds, using a transformer architecture that +alternates between 2D within-view and 3D cross-view information fusion. Our +model differentiates 2D and 3D feature operations through the positional +encodings of the tokens involved, which capture pixel coordinates for 2D patch +tokens and 3D coordinates for 3D feature tokens. ODIN achieves state-of-the-art +performance on ScanNet200, Matterport3D and AI2THOR 3D instance segmentation +benchmarks, and competitive performance on ScanNet, S3DIS and COCO. It +outperforms all previous works by a wide margin when the sensed 3D point cloud +is used in place of the point cloud sampled from 3D mesh. When used as the 3D +perception engine in an instructable embodied agent architecture, it sets a new +state-of-the-art on the TEACh action-from-dialogue benchmark. Our code and +checkpoints can be found at the project website (https://odin-seg.github.io). + +
+
+ comment: Camera Ready (CVPR 2024, Highlight) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Understanding and Improving CNNs with Complex Structure Tensor: A + Biometrics Study + + +
+ Our study provides evidence that CNNs struggle to effectively extract +orientation features. We show that the use of Complex Structure Tensor, which +contains compact orientation features with certainties, as input to CNNs +consistently improves identification accuracy compared to using grayscale +inputs alone. Experiments also demonstrated that our inputs, which were +provided by mini complex conv-nets, combined with reduced CNN sizes, +outperformed full-fledged, prevailing CNN architectures. This suggests that the +upfront use of orientation features in CNNs, a strategy seen in mammalian +vision, not only mitigates their limitations but also enhances their +explainability and relevance to thin-clients. Experiments were done on publicly +available data sets comprising periocular images for biometric identification +and verification (Close and Open World) using 6 State of the Art CNN +architectures. We reduced SOA Equal Error Rate (EER) on the PolyU dataset by +5-26% depending on data and scenario. + +
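+ As background for the orientation features mentioned above, the snippet below is
+a generic sketch of a complex structure tensor: square the complex gradient
+(gx + i*gy) and smooth it, so the argument encodes a double-angle orientation and
+the magnitude a certainty. The filters and toy image are illustrative choices,
+not the paper's pipeline.
+
+ import numpy as np
+ from scipy.ndimage import gaussian_filter, sobel
+
+ def complex_structure_tensor(img, sigma=2.0):
+     gx = sobel(img.astype(float), axis=1)        # horizontal gradient
+     gy = sobel(img.astype(float), axis=0)        # vertical gradient
+     z = (gx + 1j * gy) ** 2                      # double-angle representation
+     z = gaussian_filter(z.real, sigma) + 1j * gaussian_filter(z.imag, sigma)
+     orientation = 0.5 * np.angle(z)              # dominant local orientation
+     certainty = np.abs(z)                        # strength of that orientation
+     return orientation, certainty
+
+ img = np.outer(np.sin(np.linspace(0, 6, 64)), np.ones(64))  # toy striped image
+ ori, cert = complex_structure_tensor(img)
+ print(ori.shape, float(cert.max()))
+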
+
+ comment: preprint manuscript +
+
+
+
+
+ + ☆ ImplicitAVE: An Open-Source Dataset and Multimodal LLMs Benchmark for + Implicit Attribute Value Extraction + + +
+ Existing datasets for attribute value extraction (AVE) predominantly focus on +explicit attribute values while neglecting the implicit ones, lack product +images, are often not publicly available, and lack an in-depth human inspection +across diverse domains. To address these limitations, we present ImplicitAVE, +the first, publicly available multimodal dataset for implicit attribute value +extraction. ImplicitAVE, sourced from the MAVE dataset, is carefully curated +and expanded to include implicit AVE and multimodality, resulting in a refined +dataset of 68k training and 1.6k testing data across five domains. We also +explore the application of multimodal large language models (MLLMs) to implicit +AVE, establishing a comprehensive benchmark for MLLMs on the ImplicitAVE +dataset. Six recent MLLMs with eleven variants are evaluated across diverse +settings, revealing that implicit value extraction remains a challenging task +for MLLMs. The contributions of this work include the development and release +of ImplicitAVE, and the exploration and benchmarking of various MLLMs for +implicit AVE, providing valuable insights and potential future research +directions. Dataset and code are available at +https://github.com/HenryPengZou/ImplicitAVE + +
+
+
+
+
+ + ☆ Domain Adaptation for Learned Image Compression with Supervised Adapters + + +
+ In Learned Image Compression (LIC), a model is trained at encoding and +decoding images sampled from a source domain, often outperforming traditional +codecs on natural images; yet its performance may be far from optimal on images +sampled from different domains. In this work, we tackle the problem of adapting +a pre-trained model to multiple target domains by plugging into the decoder an +adapter module for each of them, including the source one. Each adapter +improves the decoder performance on a specific domain, without the model +forgetting about the images seen at training time. A gate network computes the +weights to optimally blend the contributions from the adapters when the +bitstream is decoded. We experimentally validate our method over two +state-of-the-art pre-trained models, observing improved rate-distortion +efficiency on the target domains without penalties on the source domain. +Furthermore, the gate's ability to find similarities with the learned target +domains enables better encoding efficiency also for images outside them. + +
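+ The adapter-plus-gate design can be sketched in a few lines: several lightweight
+adapters transform a decoder feature and a small gate network predicts blending
+weights from that same feature. Module sizes and the residual blend below are
+illustrative assumptions, not the paper's architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class GatedAdapters(nn.Module):
+     def __init__(self, dim, num_domains):
+         super().__init__()
+         self.adapters = nn.ModuleList(
+             [nn.Sequential(nn.Linear(dim, dim // 4), nn.ReLU(), nn.Linear(dim // 4, dim))
+              for _ in range(num_domains)])
+         self.gate = nn.Linear(dim, num_domains)       # predicts per-domain blend weights
+
+     def forward(self, feat):                          # feat: (B, dim)
+         w = torch.softmax(self.gate(feat), dim=-1)    # (B, num_domains)
+         outs = torch.stack([a(feat) for a in self.adapters], dim=1)  # (B, D, dim)
+         return feat + (w.unsqueeze(-1) * outs).sum(dim=1)            # gated residual blend
+
+ x = torch.randn(4, 256)                               # placeholder decoder features
+ print(GatedAdapters(256, num_domains=3)(x).shape)
+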
+
+ comment: 10 pages, published to Data compression conference 2024 (DCC2024) +
+
+
+
+
+ + ☆ MiM: Mask in Mask Self-Supervised Pre-Training for 3D Medical Image + Analysis + + +
+ The Vision Transformer (ViT) has demonstrated remarkable performance in
+Self-Supervised Learning (SSL) for 3D medical image analysis. Mask AutoEncoder
+(MAE) for feature pre-training can further unleash the potential of ViT on
+various medical vision tasks. However, due to the large spatial sizes and higher
+dimensionality of 3D medical images, the lack of hierarchical design for MAE
+may hinder the performance of downstream tasks. In this paper, we propose a
+novel \textit{Mask in Mask (MiM)} pre-training framework for 3D medical images,
+which aims to advance MAE by learning discriminative representation from
+hierarchical visual tokens across varying scales. We introduce multiple levels
+of granularity for masked inputs from the volume, which are then reconstructed
+simultaneously at both fine and coarse levels. Additionally, a cross-level
+alignment mechanism is applied to adjacent level volumes to enforce anatomical
+similarity hierarchically. Furthermore, we adopt a hybrid backbone to enhance
+the hierarchical representation learning efficiently during the pre-training.
+MiM was pre-trained on a large-scale collection of available 3D volumetric
+images, \textit{i.e.,} Computed Tomography (CT) images containing various body
+parts. Extensive experiments on thirteen public datasets demonstrate the
+superiority of MiM over other SSL methods in organ/lesion/tumor segmentation
+and disease classification. We further scale up MiM to larger pre-training
+datasets with more than 10k volumes, showing that large-scale pre-training can
+further enhance the performance of downstream tasks. These improvements also
+suggest that the research community should pay more attention to the scale of
+the pre-training dataset when building healthcare foundation models for 3D
+medical images.
+
+
+ comment: submitted to journal +
+
+
+
+
+ + ☆ Enhancing Privacy in Face Analytics Using Fully Homomorphic Encryption + + +
+ Modern face recognition systems utilize deep neural networks to extract +salient features from a face. These features denote embeddings in latent space +and are often stored as templates in a face recognition system. These +embeddings are susceptible to data leakage and, in some cases, can even be used +to reconstruct the original face image. To prevent compromising identities, +template protection schemes are commonly employed. However, these schemes may +still not prevent the leakage of soft biometric information such as age, gender +and race. To alleviate this issue, we propose a novel technique that combines +Fully Homomorphic Encryption (FHE) with an existing template protection scheme +known as PolyProtect. We show that the embeddings can be compressed and +encrypted using FHE and transformed into a secure PolyProtect template using +polynomial transformation, for additional protection. We demonstrate the +efficacy of the proposed approach through extensive experiments on multiple +datasets. Our proposed approach ensures irreversibility and unlinkability, +effectively preventing the leakage of soft biometric attributes from face +embeddings without compromising recognition accuracy. + +
+
+
+
+
+ + ☆ Deep RAW Image Super-Resolution. A NTIRE 2024 Challenge Survey CVPR 2024 + + +
+ This paper reviews the NTIRE 2024 RAW Image Super-Resolution Challenge,
+highlighting the proposed solutions and results. New methods for RAW
+Super-Resolution could be essential in modern Image Signal Processing (ISP)
+pipelines; however, this problem is not as well explored as in the RGB domain.
+The goal of this challenge is to upscale RAW Bayer images by 2x, considering
+unknown degradations such as noise and blur. In the challenge, a total of 230
+participants registered, and 45 submitted results during the challenge period.
+The performance of the top-5 submissions is reviewed and provided here as a
+gauge for the current state-of-the-art in RAW Image Super-Resolution.
+
+
+ comment: CVPR 2024 - NTIRE Workshop +
+
+
+
+
+ + ☆ Step Differences in Instructional Video + + +
+ Comparing a user video to a reference how-to video is a key requirement for +AR/VR technology delivering personalized assistance tailored to the user's +progress. However, current approaches for language-based assistance can only +answer questions about a single video. We propose an approach that first +automatically generates large amounts of visual instruction tuning data +involving pairs of videos from HowTo100M by leveraging existing step +annotations and accompanying narrations, and then trains a video-conditioned +language model to jointly reason across multiple raw videos. Our model achieves +state-of-the-art performance at identifying differences between video pairs and +ranking videos based on the severity of these differences, and shows promising +ability to perform general reasoning over multiple videos. + +
+
+
+
+
+ + ☆ NeRF-XL: Scaling NeRFs with Multiple GPUs + + +
+ We present NeRF-XL, a principled method for distributing Neural Radiance +Fields (NeRFs) across multiple GPUs, thus enabling the training and rendering +of NeRFs with an arbitrarily large capacity. We begin by revisiting existing +multi-GPU approaches, which decompose large scenes into multiple independently +trained NeRFs, and identify several fundamental issues with these methods that +hinder improvements in reconstruction quality as additional computational +resources (GPUs) are used in training. NeRF-XL remedies these issues and +enables the training and rendering of NeRFs with an arbitrary number of +parameters by simply using more hardware. At the core of our method lies a +novel distributed training and rendering formulation, which is mathematically +equivalent to the classic single-GPU case and minimizes communication between +GPUs. By unlocking NeRFs with arbitrarily large parameter counts, our approach +is the first to reveal multi-GPU scaling laws for NeRFs, showing improvements +in reconstruction quality with larger parameter counts and speed improvements +with more GPUs. We demonstrate the effectiveness of NeRF-XL on a wide variety +of datasets, including the largest open-source dataset to date, MatrixCity, +containing 258K images covering a 25km^2 city area. + +
+
+ comment: Webpage: https://research.nvidia.com/labs/toronto-ai/nerfxl/ +
+
+
+
+
+ + ☆ ActiveRIR: Active Audio-Visual Exploration for Acoustic Environment + Modeling + + +
+ An environment acoustic model represents how sound is transformed by the +physical characteristics of an indoor environment, for any given +source/receiver location. Traditional methods for constructing acoustic models +involve expensive and time-consuming collection of large quantities of acoustic +data at dense spatial locations in the space, or rely on privileged knowledge +of scene geometry to intelligently select acoustic data sampling locations. We +propose active acoustic sampling, a new task for efficiently building an +environment acoustic model of an unmapped environment in which a mobile agent +equipped with visual and acoustic sensors jointly constructs the environment +acoustic model and the occupancy map on-the-fly. We introduce ActiveRIR, a +reinforcement learning (RL) policy that leverages information from audio-visual +sensor streams to guide agent navigation and determine optimal acoustic data +sampling positions, yielding a high quality acoustic model of the environment +from a minimal set of acoustic samples. We train our policy with a novel RL +reward based on information gain in the environment acoustic model. Evaluating +on diverse unseen indoor environments from a state-of-the-art acoustic +simulation platform, ActiveRIR outperforms an array of methods--both +traditional navigation agents based on spatial novelty and visual exploration +as well as existing state-of-the-art methods. + +
+
+ comment: Project page: https://vision.cs.utexas.edu/projects/active_rir/ +
+
+
+
+
+ + ☆ An Analysis of Recent Advances in Deepfake Image Detection in an + Evolving Threat Landscape + + +
+ Deepfake or synthetic images produced using deep generative models pose +serious risks to online platforms. This has triggered several research efforts +to accurately detect deepfake images, achieving excellent performance on +publicly available deepfake datasets. In this work, we study 8 state-of-the-art +detectors and argue that they are far from being ready for deployment due to +two recent developments. First, the emergence of lightweight methods to +customize large generative models, can enable an attacker to create many +customized generators (to create deepfakes), thereby substantially increasing +the threat surface. We show that existing defenses fail to generalize well to +such \emph{user-customized generative models} that are publicly available +today. We discuss new machine learning approaches based on content-agnostic +features, and ensemble modeling to improve generalization performance against +user-customized models. Second, the emergence of \textit{vision foundation +models} -- machine learning models trained on broad data that can be easily +adapted to several downstream tasks -- can be misused by attackers to craft +adversarial deepfakes that can evade existing defenses. We propose a simple +adversarial attack that leverages existing foundation models to craft +adversarial samples \textit{without adding any adversarial noise}, through +careful semantic manipulation of the image content. We highlight the +vulnerabilities of several defenses against our attack, and explore directions +leveraging advanced foundation models and adversarial training to defend +against this new threat. + +
+
+ comment: Accepted to IEEE S&P 2024; 19 pages, 10 figures +
+
+
+
+
+ + ☆ AIS 2024 Challenge on Video Quality Assessment of User-Generated + Content: Methods and Results CVPR 2024 + + +
+ This paper reviews the AIS 2024 Video Quality Assessment (VQA) Challenge, +focused on User-Generated Content (UGC). The aim of this challenge is to gather +deep learning-based methods capable of estimating the perceptual quality of UGC +videos. The user-generated videos from the YouTube UGC Dataset include diverse +content (sports, games, lyrics, anime, etc.), quality and resolutions. The +proposed methods must process 30 FHD frames under 1 second. In the challenge, a +total of 102 participants registered, and 15 submitted code and models. The +performance of the top-5 submissions is reviewed and provided here as a survey +of diverse deep models for efficient video quality assessment of user-generated +content. + +
+
+ comment: CVPR 2024 Workshop -- AI for Streaming (AIS) Video Quality Assessment + Challenge +
+
+
+
+
+ + ☆ Improving Multi-label Recognition using Class Co-Occurrence + Probabilities + + +
+ Multi-label Recognition (MLR) involves the identification of multiple objects
+within an image. To address the additional complexity of this problem, recent
+works have leveraged information from vision-language models (VLMs) trained on
+large text-image datasets for the task. These methods learn an independent
+classifier for each object (class), overlooking correlations in their
+occurrences. Such co-occurrences can be captured from the training data as
+conditional probabilities between a pair of classes. We propose a framework that
+extends the independent classifiers by incorporating co-occurrence information
+for object pairs, improving their performance. We use a Graph Convolutional
+Network (GCN) to enforce the conditional probabilities between classes, by
+refining the initial estimates derived from image and text sources obtained
+using VLMs. We validate our method on four MLR datasets, where our approach
+outperforms all state-of-the-art methods.
+
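+ The co-occurrence statistics referred to above can be estimated directly from
+the multi-hot training labels, as in the short sketch below; the GCN refinement
+step is omitted, and the toy labels are an assumption for illustration.
+
+ import numpy as np
+
+ def conditional_cooccurrence(labels, eps=1e-8):
+     """labels: (N, C) binary multi-hot matrix -> (C, C) matrix of P(j | i)."""
+     counts = labels.T @ labels                  # counts[i, j] = #images with both i and j
+     class_counts = np.diag(counts)              # #images containing class i
+     return counts / (class_counts[:, None] + eps)
+
+ rng = np.random.default_rng(0)
+ Y = (rng.random((500, 6)) < 0.3).astype(float)  # toy multi-label annotations
+ print(np.round(conditional_cooccurrence(Y), 2))
+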
+
+
+
+
+ + ☆ Fusion of Domain-Adapted Vision and Language Models for Medical Visual + Question Answering NAACL 2024 + + +
+ Vision-language models, while effective in general domains and showing strong +performance in diverse multi-modal applications like visual question-answering +(VQA), struggle to maintain the same level of effectiveness in more specialized +domains, e.g., medical. We propose a medical vision-language model that +integrates large vision and language models adapted for the medical domain. +This model goes through three stages of parameter-efficient training using +three separate biomedical and radiology multi-modal visual and text datasets. +The proposed model achieves state-of-the-art performance on the SLAKE 1.0 +medical VQA (MedVQA) dataset with an overall accuracy of 87.5% and demonstrates +strong performance on another MedVQA dataset, VQA-RAD, achieving an overall +accuracy of 73.2%. + +
+
+ comment: Clinical NLP @ NAACL 2024 +
+
+
+
+
+ + ☆ MiMICRI: Towards Domain-centered Counterfactual Explanations of + Cardiovascular Image Classification Models + + +
+ The recent prevalence of publicly accessible, large medical imaging datasets +has led to a proliferation of artificial intelligence (AI) models for +cardiovascular image classification and analysis. At the same time, the +potentially significant impacts of these models have motivated the development +of a range of explainable AI (XAI) methods that aim to explain model +predictions given certain image inputs. However, many of these methods are not +developed or evaluated with domain experts, and explanations are not +contextualized in terms of medical expertise or domain knowledge. In this +paper, we propose a novel framework and python library, MiMICRI, that provides +domain-centered counterfactual explanations of cardiovascular image +classification models. MiMICRI helps users interactively select and replace +segments of medical images that correspond to morphological structures. From +the counterfactuals generated, users can then assess the influence of each +segment on model predictions, and validate the model against known medical +facts. We evaluate this library with two medical experts. Our evaluation +demonstrates that a domain-centered XAI approach can enhance the +interpretability of model explanations, and help experts reason about models in +terms of relevant domain knowledge. However, concerns were also surfaced about +the clinical plausibility of the counterfactuals generated. We conclude with a +discussion on the generalizability and trustworthiness of the MiMICRI +framework, as well as the implications of our findings on the development of +domain-centered XAI methods for model interpretability in healthcare contexts. + +
+
+ comment: 14 pages, 6 figures, ACM FAccT 2024 +
+
+
+
+
+ + ☆ Does SAM dream of EIG? Characterizing Interactive Segmenter Performance + using Expected Information Gain + + +
+ We introduce an assessment procedure for interactive segmentation models. +Based on concepts from Bayesian Experimental Design, the procedure measures a +model's understanding of point prompts and their correspondence with the +desired segmentation mask. We show that Oracle Dice index measurements are +insensitive or even misleading in measuring this property. We demonstrate the +use of the proposed procedure on three interactive segmentation models and +subsets of two large image segmentation datasets. + +
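+ As background for the proposed assessment, the snippet below sketches the generic
+expected-information-gain quantity (mutual information between the predicted label
+and the model), estimated here from an ensemble of per-pixel foreground
+probabilities; the ensemble and data are placeholders, not the paper's procedure.
+
+ import numpy as np
+
+ def binary_entropy(p, eps=1e-12):
+     return -(p * np.log(p + eps) + (1 - p) * np.log(1 - p + eps))
+
+ def expected_information_gain(member_probs):
+     """member_probs: (M, H, W) foreground probabilities from M model samples."""
+     mean_p = member_probs.mean(axis=0)                    # predictive distribution
+     return binary_entropy(mean_p) - binary_entropy(member_probs).mean(axis=0)
+
+ rng = np.random.default_rng(0)
+ probs = rng.beta(2, 2, size=(8, 32, 32))                  # 8 hypothetical model samples
+ print(float(expected_information_gain(probs).mean()))
+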
+
+
+
+
+ + ☆ A Survey on Intermediate Fusion Methods for Collaborative Perception + Categorized by Real World Challenges + + +
+ This survey analyzes intermediate fusion methods in collaborative perception +for autonomous driving, categorized by real-world challenges. We examine +various methods, detailing their features and the evaluation metrics they +employ. The focus is on addressing challenges like transmission efficiency, +localization errors, communication disruptions, and heterogeneity. Moreover, we +explore strategies to counter adversarial attacks and defenses, as well as +approaches to adapt to domain shifts. The objective is to present an overview +of how intermediate fusion methods effectively meet these diverse challenges, +highlighting their role in advancing the field of collaborative perception in +autonomous driving. + +
+
+ comment: 8 pages, 6 tables +
+
+
+
+
+ + ☆ 3D Human Pose Estimation with Occlusions: Introducing BlendMimic3D + Dataset and GCN Refinement CVPR 2024 + + +
+ In the field of 3D Human Pose Estimation (HPE), accurately estimating human +pose, especially in scenarios with occlusions, is a significant challenge. This +work identifies and addresses a gap in the current state of the art in 3D HPE +concerning the scarcity of data and strategies for handling occlusions. We +introduce our novel BlendMimic3D dataset, designed to mimic real-world +situations where occlusions occur for seamless integration in 3D HPE +algorithms. Additionally, we propose a 3D pose refinement block, employing a +Graph Convolutional Network (GCN) to enhance pose representation through a +graph model. This GCN block acts as a plug-and-play solution, adaptable to +various 3D HPE frameworks without requiring retraining them. By training the +GCN with occluded data from BlendMimic3D, we demonstrate significant +improvements in resolving occluded poses, with comparable results for +non-occluded ones. Project web page is available at +https://blendmimic3d.github.io/BlendMimic3D/. + +
+
+ comment: Accepted at 6th Workshop and Competition on Affective Behavior + Analysis in-the-wild - CVPR 2024 Workshop +
+
+
+
+
+ + ☆ Quantitative Characterization of Retinal Features in Translated OCTA + + +
+ Purpose: This study explores the feasibility of using generative machine
+learning (ML) to translate Optical Coherence Tomography (OCT) images into
+Optical Coherence Tomography Angiography (OCTA) images, potentially bypassing
+the need for specialized OCTA hardware. Methods: The method involved
+implementing a generative adversarial network framework that includes a 2D
+vascular segmentation model and a 2D OCTA image translation model. The study
+utilizes a public dataset of 500 patients, divided into subsets based on
+resolution and disease status, to validate the quality of translated OCTA
+(TR-OCTA) images. The validation employs several quality and quantitative
+metrics to compare the translated images with ground truth OCTAs (GT-OCTA). We
+then quantitatively characterize vascular features generated in TR-OCTAs
+against GT-OCTAs to assess the feasibility of using TR-OCTA for objective
+disease diagnosis. Results: TR-OCTAs showed high image quality in both the 3
+and 6 mm datasets (high resolution, with moderate structural similarity and
+contrast quality compared to GT-OCTAs). There were slight discrepancies in
+vascular metrics, especially in diseased patients. Blood vessel features like
+tortuosity and vessel perimeter index showed a better trend compared to density
+features, which are affected by local vascular distortions. Conclusion: This
+study presents a promising solution to the limitations of OCTA adoption in
+clinical practice by using vascular features from TR-OCTA for disease
+detection. Translational relevance: This study has the potential to
+significantly enhance the diagnostic process for retinal diseases by making
+detailed vascular imaging more widely available and reducing dependency on
+costly OCTA equipment.
+
+
+ comment: The article has been revised and edited +
+
+
+
+
+ + ☆ FairDeDup: Detecting and Mitigating Vision-Language Fairness Disparities + in Semantic Dataset Deduplication CVPR 2024 + + +
+ Recent dataset deduplication techniques have demonstrated that content-aware +dataset pruning can dramatically reduce the cost of training Vision-Language +Pretrained (VLP) models without significant performance losses compared to +training on the original dataset. These results have been based on pruning +commonly used image-caption datasets collected from the web -- datasets that +are known to harbor harmful social biases that may then be codified in trained +models. In this work, we evaluate how deduplication affects the prevalence of +these biases in the resulting trained models and introduce an easy-to-implement +modification to the recent SemDeDup algorithm that can reduce the negative +effects that we observe. When examining CLIP-style models trained on +deduplicated variants of LAION-400M, we find our proposed FairDeDup algorithm +consistently leads to improved fairness metrics over SemDeDup on the FairFace +and FACET datasets while maintaining zero-shot performance on CLIP benchmarks. + +
+
+ comment: Conference paper at CVPR 2024. 6 pages, 8 figures. Project Page: + https://ericslyman.com/fairdedup/ +
+
+
+
+
+ + ☆ Mamba-360: Survey of State Space Models as Transformer Alternative for + Long Sequence Modelling: Methods, Applications, and Challenges + + +
+ Sequence modeling is a crucial area across various domains, including Natural
+Language Processing (NLP), speech recognition, time series forecasting, music
+generation, and bioinformatics. Recurrent Neural Networks (RNNs) and Long Short
+Term Memory Networks (LSTMs) have historically dominated sequence modeling
+tasks like Machine Translation, Named Entity Recognition (NER), etc. However,
+the advancement of transformers has led to a shift in this paradigm, given
+their superior performance. Yet, transformers suffer from $O(N^2)$ attention
+complexity and challenges in handling inductive bias. Several variations have
+been proposed to address these issues which use spectral networks or
+convolutions and have performed well on a range of tasks. However, they still
+have difficulty in dealing with long sequences. State Space Models (SSMs) have
+emerged as promising alternatives for sequence modeling paradigms in this
+context, especially with the advent of S4 and its variants, such as S4nd,
+Hippo, Hyena, Diagonal State Spaces (DSS), Gated State Spaces (GSS), Linear
+Recurrent Unit (LRU), Liquid-S4, Mamba, etc. In this survey, we categorize the
+foundational SSMs based on three paradigms namely, Gating architectures,
+Structural architectures, and Recurrent architectures. This survey also
+highlights diverse applications of SSMs across domains such as vision, video,
+audio, speech, language (especially long sequence modeling), medical (including
+genomics), chemical (like drug design), recommendation systems, and time series
+analysis, including tabular data. Moreover, we consolidate the performance of
+SSMs on benchmark datasets like Long Range Arena (LRA), WikiText, GLUE, Pile,
+ImageNet, Kinetics-400, sstv2, as well as video datasets such as Breakfast,
+COIN, LVU, and various time series datasets. The project page for the Mamba-360
+work is available at \url{https://github.com/badripatro/mamba360}.
+
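+ For readers new to the models surveyed above, the snippet below shows the basic
+discrete linear state space recurrence that S4-style layers build on,
+x_k = A x_{k-1} + B u_k and y_k = C x_k; the random matrices are only an
+illustration, since real SSM layers parameterize and discretize A, B, C carefully.
+
+ import numpy as np
+
+ def ssm_scan(A, B, C, u):
+     """Run a linear state space model over an input sequence u of shape (T, d_in)."""
+     x = np.zeros(A.shape[0])
+     ys = []
+     for u_k in u:
+         x = A @ x + B @ u_k           # state update
+         ys.append(C @ x)              # readout
+     return np.stack(ys)
+
+ rng = np.random.default_rng(0)
+ N, d_in, d_out, T = 16, 4, 4, 100
+ A = 0.95 * np.eye(N) + 0.01 * rng.normal(size=(N, N))     # near-stable dynamics
+ B, C = rng.normal(size=(N, d_in)), rng.normal(size=(d_out, N))
+ print(ssm_scan(A, B, C, rng.normal(size=(T, d_in))).shape)
+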
+
+
+
+
+ + ☆ MaGGIe: Masked Guided Gradual Human Instance Matting CVPR 2024 + + +
+ Human matting is a foundation task in image and video processing, where human
+foreground pixels are extracted from the input. Prior works either improve the
+accuracy by additional guidance or improve the temporal consistency of a single
+instance across frames. We propose a new framework MaGGIe, Masked Guided
+Gradual Human Instance Matting, which predicts alpha mattes progressively for
+each human instance while maintaining the computational cost, precision, and
+consistency. Our method leverages modern architectures, including transformer
+attention and sparse convolution, to output all instance mattes simultaneously
+without exploding memory and latency. While keeping inference costs constant
+in the multiple-instance scenario, our framework achieves robust and versatile
+performance on our proposed synthesized benchmarks. With the higher quality
+image and video matting benchmarks, the novel multi-instance synthesis approach
+from publicly available sources is introduced to increase the generalization of
+models in real-world scenarios.
+
+
+ comment: CVPR 2024. Project link: https://maggie-matt.github.io +
+
+
+
+
+ + ☆ Cantor: Inspiring Multimodal Chain-of-Thought of MLLM + + +
+ With the advent of large language models (LLMs) enhanced by the
+chain-of-thought (CoT) methodology, visual reasoning problems are usually
+decomposed into manageable sub-tasks and tackled sequentially with various
+external tools. However, such a paradigm faces the challenge of potential
+"determining hallucinations" in decision-making due to insufficient visual
+information and the limitation of low-level perception tools that fail to
+provide abstract summaries necessary for comprehensive reasoning. We argue that
+converging visual context acquisition and logical reasoning is pivotal for
+tackling visual reasoning tasks. This paper delves into the realm of multimodal
+CoT to solve intricate visual reasoning tasks with multimodal large language
+models (MLLMs) and their cognitive capability. To this end, we propose an
+innovative multimodal CoT framework, termed Cantor, characterized by a
+perception-decision architecture. Cantor first acts as a decision generator and
+integrates visual inputs to analyze the image and problem, ensuring a closer
+alignment with the actual context. Furthermore, Cantor leverages the advanced
+cognitive functions of MLLMs to perform as multifaceted experts for deriving
+higher-level information, enhancing the CoT generation process. Our extensive
+experiments demonstrate the efficacy of the proposed framework, showing
+significant improvements in multimodal CoT performance across two complex
+visual reasoning datasets, without necessitating fine-tuning or ground-truth
+rationales. Project Page: https://ggg0919.github.io/cantor/ .
+
+
+ comment: The project page is available at https://ggg0919.github.io/cantor/ +
+
+
+
+
+ + ☆ MoDE: CLIP Data Experts via Clustering CVPR 2024 + + +
+ The success of contrastive language-image pretraining (CLIP) relies on the
+supervision from the pairing between images and captions, which tends to be
+noisy in web-crawled data. We present Mixture of Data Experts (MoDE) and learn
+a system of CLIP data experts via clustering. Each data expert is trained on
+one data cluster, being less sensitive to false negative noises in other
+clusters. At inference time, we ensemble their outputs by applying weights
+determined through the correlation between task metadata and cluster
+conditions. To estimate the correlation precisely, the samples in one cluster
+should be semantically similar, but the number of data experts should still be
+reasonable for training and inference. As such, we consider the ontology in
+human language and propose to use fine-grained cluster centers to represent
+each data expert at a coarse-grained level. Experimental studies show that four
+CLIP data experts on ViT-B/16 outperform the ViT-L/14 by OpenAI CLIP and
+OpenCLIP on zero-shot image classification but with less ($<$35\%) training
+cost. Meanwhile, MoDE can train all data experts asynchronously and can flexibly
+include new data experts. The code is available at
+https://github.com/facebookresearch/MetaCLIP/tree/main/mode.
+
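+ The clustering-and-ensembling recipe described above can be sketched as follows:
+cluster caption-style embeddings, assign one expert per cluster, and weight expert
+outputs by the proximity between task metadata embeddings and cluster centers.
+The experts, embeddings, and softmax-over-distance weighting are placeholders
+(assumptions), not the MoDE training code.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ rng = np.random.default_rng(0)
+ caption_emb = rng.normal(size=(5000, 64))        # stand-in for caption features
+ kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(caption_emb)
+ centers = kmeans.cluster_centers_                # one data expert per cluster center
+
+ def ensemble_weights(task_meta_emb, centers, tau=1.0):
+     """Softmax over negative distances between task metadata and cluster centers."""
+     d = np.linalg.norm(centers - task_meta_emb[None, :], axis=1)
+     w = np.exp(-d / tau)
+     return w / w.sum()
+
+ task_meta = rng.normal(size=64)                  # e.g. an embedded class-name prompt
+ w = ensemble_weights(task_meta, centers)
+ expert_logits = rng.normal(size=(4, 10))         # placeholder per-expert outputs
+ print("weights:", np.round(w, 3))
+ print("ensembled logits:", (w[:, None] * expert_logits).sum(axis=0))
+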
+
+ comment: IEEE CVPR 2024 Camera Ready. Code Link: + https://github.com/facebookresearch/MetaCLIP/tree/main/mode +
+
+
+
+
+ + ☆ Editable Image Elements for Controllable Synthesis + + +
+ Diffusion models have made significant advances in text-guided synthesis +tasks. However, editing user-provided images remains challenging, as the high +dimensional noise input space of diffusion models is not naturally suited for +image inversion or spatial editing. In this work, we propose an image +representation that promotes spatial editing of input images using a diffusion +model. Concretely, we learn to encode an input into "image elements" that can +faithfully reconstruct an input image. These elements can be intuitively edited +by a user, and are decoded by a diffusion model into realistic images. We show +the effectiveness of our representation on various image editing tasks, such as +object resizing, rearrangement, dragging, de-occlusion, removal, variation, and +image composition. Project page: +https://jitengmu.github.io/Editable_Image_Elements/ + +
+
+ comment: Project page: https://jitengmu.github.io/Editable_Image_Elements/ +
+
+
+
+
+ + ☆ PuLID: Pure and Lightning ID Customization via Contrastive Alignment + + +
+ We propose Pure and Lightning ID customization (PuLID), a novel tuning-free +ID customization method for text-to-image generation. By incorporating a +Lightning T2I branch with a standard diffusion one, PuLID introduces both +contrastive alignment loss and accurate ID loss, minimizing disruption to the +original model and ensuring high ID fidelity. Experiments show that PuLID +achieves superior performance in both ID fidelity and editability. Another +attractive property of PuLID is that the image elements (e.g., background, +lighting, composition, and style) before and after the ID insertion are kept as +consistent as possible. Codes and models will be available at +https://github.com/ToTheBeginning/PuLID + +
+
+ comment: Tech Report. Codes and models will be available at + https://github.com/ToTheBeginning/PuLID +
+
+
+
+
+ + ☆ RetinaRegNet: A Versatile Approach for Retinal Image Registration + + +
+ We introduce the RetinaRegNet model, which can achieve state-of-the-art +performance across various retinal image registration tasks. RetinaRegNet does +not require training on any retinal images. It begins by establishing point +correspondences between two retinal images using image features derived from +diffusion models. This process involves the selection of feature points from +the moving image using the SIFT algorithm alongside random point sampling. For +each selected feature point, a 2D correlation map is computed by assessing the +similarity between the feature vector at that point and the feature vectors of +all pixels in the fixed image. The pixel with the highest similarity score in +the correlation map corresponds to the feature point in the moving image. To +remove outliers in the estimated point correspondences, we first applied an +inverse consistency constraint, followed by a transformation-based outlier +detector. This method proved to outperform the widely used random sample +consensus (RANSAC) outlier detector by a significant margin. To handle large +deformations, we utilized a two-stage image registration framework. A +homography transformation was used in the first stage and a more accurate +third-order polynomial transformation was used in the second stage. The model's +effectiveness was demonstrated across three retinal image datasets: color +fundus images, fluorescein angiography images, and laser speckle flowgraphy +images. RetinaRegNet outperformed current state-of-the-art methods in all three +datasets. It was especially effective for registering image pairs with large +displacement and scaling deformations. This innovation holds promise for +various applications in retinal image analysis. Our code is publicly available +at https://github.com/mirthAI/RetinaRegNet. + +
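+ The correspondence step described above can be pictured with a compact sketch:
+for a feature vector at a keypoint in the moving image, build a cosine-similarity
+map against all fixed-image features, take its argmax, and keep only matches that
+survive an inverse-consistency check. The random feature maps stand in for
+diffusion features and are an assumption, not the RetinaRegNet code.
+
+ import numpy as np
+
+ def match(feat_src, feat_dst, pt):
+     """pt=(y, x) in src; return the best-matching (y, x) in dst via cosine similarity."""
+     v = feat_src[:, pt[0], pt[1]]
+     v = v / (np.linalg.norm(v) + 1e-8)
+     dst = feat_dst / (np.linalg.norm(feat_dst, axis=0, keepdims=True) + 1e-8)
+     corr = np.tensordot(v, dst, axes=([0], [0]))       # (H, W) correlation map
+     return np.unravel_index(np.argmax(corr), corr.shape)
+
+ def inverse_consistent(feat_a, feat_b, pt, tol=2):
+     fwd = match(feat_a, feat_b, pt)                    # moving -> fixed
+     back = match(feat_b, feat_a, fwd)                  # fixed -> moving
+     return np.hypot(back[0] - pt[0], back[1] - pt[1]) <= tol
+
+ rng = np.random.default_rng(0)
+ fa, fb = rng.normal(size=(2, 32, 48, 48))              # (C, H, W) feature maps
+ print(inverse_consistent(fa, fb, (10, 20)))
+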
+
+
+
+
+ + ☆ MMT-Bench: A Comprehensive Multimodal Benchmark for Evaluating Large + Vision-Language Models Towards Multitask AGI + + +
+ Large Vision-Language Models (LVLMs) show significant strides in +general-purpose multimodal applications such as visual dialogue and embodied +navigation. However, existing multimodal evaluation benchmarks cover a limited +number of multimodal tasks testing rudimentary capabilities, falling short in +tracking LVLM development. In this study, we present MMT-Bench, a comprehensive +benchmark designed to assess LVLMs across massive multimodal tasks requiring +expert knowledge and deliberate visual recognition, localization, reasoning, +and planning. MMT-Bench comprises $31,325$ meticulously curated multi-choice +visual questions from various multimodal scenarios such as vehicle driving and +embodied navigation, covering $32$ core meta-tasks and $162$ subtasks in +multimodal understanding. Due to its extensive task coverage, MMT-Bench enables +the evaluation of LVLMs using a task map, facilitating the discovery of in- and +out-of-domain tasks. Evaluation results involving $30$ LVLMs such as the +proprietary GPT-4V, GeminiProVision, and open-sourced InternVL-Chat, underscore +the significant challenges posed by MMT-Bench. We anticipate that MMT-Bench +will inspire the community to develop next-generation multimodal foundation +models aimed at achieving general-purpose multimodal intelligence. + +
+
+ comment: 77 pages, 41 figures +
+
+
+
+
+ + ☆ A comprehensive and easy-to-use multi-domain multi-task medical imaging + meta-dataset (MedIMeta) + + +
+ While the field of medical image analysis has undergone a transformative +shift with the integration of machine learning techniques, the main challenge +of these techniques is often the scarcity of large, diverse, and well-annotated +datasets. Medical images vary in format, size, and other parameters and +therefore require extensive preprocessing and standardization, for usage in +machine learning. Addressing these challenges, we introduce the Medical Imaging +Meta-Dataset (MedIMeta), a novel multi-domain, multi-task meta-dataset. +MedIMeta contains 19 medical imaging datasets spanning 10 different domains and +encompassing 54 distinct medical tasks, all of which are standardized to the +same format and readily usable in PyTorch or other ML frameworks. We perform a +technical validation of MedIMeta, demonstrating its utility through fully +supervised and cross-domain few-shot learning baselines. + +
+
+
+
+
+ + ☆ HDDGAN: A Heterogeneous Dual-Discriminator Generative Adversarial + Network for Infrared and Visible Image Fusion + + +
+ Infrared and visible image fusion (IVIF) aims to preserve thermal radiation +information from infrared images while integrating texture details from visible +images, enabling the capture of important features and hidden details of +subjects in complex scenes and disturbed environments. Consequently, IVIF +offers distinct advantages in practical applications such as video +surveillance, night navigation, and target recognition. However, prevailing +methods often face challenges in simultaneously capturing thermal region +features and detailed information due to the disparate characteristics of +infrared and visible images. Consequently, fusion outcomes frequently entail a +compromise between thermal target area information and texture details. In this +study, we introduce a novel heterogeneous dual-discriminator generative +adversarial network (HDDGAN) to address this issue. Specifically, the generator +is structured as a multi-scale skip-connected structure, facilitating the +extraction of essential features from different source images. To enhance the +information representation ability of the fusion result, an attention mechanism +is employed to construct the information fusion layer within the generator, +leveraging the disparities between the source images. Moreover, recognizing the +distinct learning requirements of information in infrared and visible images, +we design two discriminators with differing structures. This approach aims to +guide the model to learn salient information from infrared images while +simultaneously capturing detailed information from visible images. Extensive +experiments conducted on various public datasets demonstrate the superiority of +our proposed HDDGAN over other state-of-the-art (SOTA) algorithms, highlighting +its enhanced potential for practical applications. + +
+
+
+
+
+ + ☆ On the Fourier analysis in the SO(3) space : EquiLoPO Network + + +
+ Analyzing volumetric data with rotational invariance or equivariance is an +active topic in current research. Existing deep-learning approaches utilize +either group convolutional networks limited to discrete rotations or steerable +convolutional networks with constrained filter structures. This work proposes a +novel equivariant neural network architecture that achieves analytical +Equivariance to Local Pattern Orientation on the continuous SO(3) group while +allowing unconstrained trainable filters - EquiLoPO Network. Our key +innovations are a group convolutional operation leveraging irreducible +representations as the Fourier basis and a local activation function in the +SO(3) space that provides a well-defined mapping from input to output +functions, preserving equivariance. By integrating these operations into a +ResNet-style architecture, we propose a model that overcomes the limitations of +prior methods. A comprehensive evaluation on diverse 3D medical imaging +datasets from MedMNIST3D demonstrates the effectiveness of our approach, which +consistently outperforms state of the art. This work suggests the benefits of +true rotational equivariance on SO(3) and flexible unconstrained filters +enabled by the local activation function, providing a flexible framework for +equivariant deep learning on volumetric data with potential applications across +domains. Our code is publicly available at +\url{https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPO/-/tree/main/EquiLoPO}. + +
+
+
+
+
+ + ☆ A Survey on Visual Mamba + + +
+ State space models (SSMs) with selection mechanisms and hardware-aware architectures, namely Mamba, have recently demonstrated significant promise in long-sequence modeling. Because the self-attention mechanism in transformers scales quadratically with image size and carries growing computational demands, researchers are now exploring how to adapt Mamba for computer vision tasks. This paper is the first comprehensive survey aiming to provide an in-depth analysis of Mamba models in the field of computer vision. It begins by exploring the foundational concepts contributing to Mamba's success, including the state space model framework, selection mechanisms, and hardware-aware design. Next, we review vision Mamba models, categorizing them into foundational architectures and variants enhanced with techniques such as convolution, recurrence, and attention. We further delve into the widespread applications of Mamba in vision tasks, including its use as a backbone at various levels of vision processing. This encompasses general visual tasks, medical visual tasks (e.g., 2D/3D segmentation, classification, and image registration), and remote sensing visual tasks. We specifically introduce general visual tasks at two levels: high/mid-level vision (e.g., object detection, segmentation, video classification) and low-level vision (e.g., image super-resolution, image restoration, visual generation). We hope this endeavor will spark additional interest within the community to address current challenges and further apply Mamba models in computer vision. +
+
+
+
+
+ + ☆ Beyond Deepfake Images: Detecting AI-Generated Videos CVPR + + +
+ Recent advances in generative AI have led to the development of techniques to +generate visually realistic synthetic video. While a number of techniques have +been developed to detect AI-generated synthetic images, in this paper we show +that synthetic image detectors are unable to detect synthetic videos. We +demonstrate that this is because synthetic video generators introduce +substantially different traces than those left by image generators. Despite +this, we show that synthetic video traces can be learned, and used to perform +reliable synthetic video detection or generator source attribution even after +H.264 re-compression. Furthermore, we demonstrate that while detecting videos +from new generators through zero-shot transferability is challenging, accurate +detection of videos from a new generator can be achieved through few-shot +learning. + +
+
+ comment: To be published in CVPRW24 +
+
+
+
+
+ + ☆ Mammo-CLIP: Leveraging Contrastive Language-Image Pre-training (CLIP) + for Enhanced Breast Cancer Diagnosis with Multi-view Mammography + + +
+ Although fusion of information from multiple views of mammograms plays an important role in increasing the accuracy of breast cancer detection, developing multi-view mammogram-based computer-aided diagnosis (CAD) schemes still faces challenges, and no such CAD schemes have been used in clinical practice. To overcome the challenges, we investigate a new approach based on Contrastive Language-Image Pre-training (CLIP), which has sparked interest across various medical imaging tasks. By solving the challenges in (1) effectively adapting the single-view CLIP for multi-view feature fusion and (2) efficiently fine-tuning this parameter-dense model with limited samples and computational resources, we introduce Mammo-CLIP, the first multi-modal framework to process multi-view mammograms and corresponding simple texts. Mammo-CLIP uses an early feature fusion strategy to learn multi-view relationships in four mammograms acquired from the CC and MLO views of the left and right breasts. To enhance learning efficiency, plug-and-play adapters are added into the CLIP image and text encoders for fine-tuning parameters and limiting updates to about 1% of the parameters. For framework evaluation, we assembled two datasets retrospectively. The first dataset, comprising 470 malignant and 479 benign cases, was used for few-shot fine-tuning and internal evaluation of the proposed Mammo-CLIP via 5-fold cross-validation. The second dataset, including 60 malignant and 294 benign cases, was used to test the generalizability of Mammo-CLIP. Study results show that Mammo-CLIP outperforms the state-of-the-art cross-view transformer in AUC (0.841 vs. 0.817, 0.837 vs. 0.807) on both datasets. It also surpasses the two previous CLIP-based methods by 20.3% and 14.3%. This study highlights the potential of applying fine-tuned vision-language models to develop next-generation, image-text-based CAD schemes for breast cancer. +
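+ The adapter idea can be illustrated with a generic residual bottleneck module; the sketch below is a hedged, generic example (the actual Mammo-CLIP adapter design is not specified in this abstract) showing how freezing the backbone and training only small adapters keeps the trainable share of parameters near 1%.
+ import torch
+ import torch.nn as nn
+
+ class BottleneckAdapter(nn.Module):
+     """Residual bottleneck adapter: down-project, nonlinearity, up-project, skip connection."""
+     def __init__(self, dim, reduction=16):
+         super().__init__()
+         self.down = nn.Linear(dim, dim // reduction)
+         self.act = nn.GELU()
+         self.up = nn.Linear(dim // reduction, dim)
+         nn.init.zeros_(self.up.weight)   # start as an identity mapping for stable fine-tuning
+         nn.init.zeros_(self.up.bias)
+
+     def forward(self, x):
+         return x + self.up(self.act(self.down(x)))
+
+ # A stand-in transformer backbone; in practice one adapter would be inserted into each
+ # frozen CLIP image/text encoder block and only the adapters would receive gradients.
+ backbone = nn.TransformerEncoder(
+     nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True), num_layers=12)
+ adapters = nn.ModuleList([BottleneckAdapter(768) for _ in range(12)])
+ for p in backbone.parameters():
+     p.requires_grad = False
+ trainable = sum(p.numel() for p in adapters.parameters())
+ total = trainable + sum(p.numel() for p in backbone.parameters())
+ print(f"trainable share: {trainable / total:.2%}")   # roughly 1% with these sizes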
+
+
+
+
+ + ☆ An Element-Wise Weights Aggregation Method for Federated Learning ICDM + + +
+ Federated learning (FL) is a powerful Machine Learning (ML) paradigm that +enables distributed clients to collaboratively learn a shared global model +while keeping the data on the original device, thereby preserving privacy. A +central challenge in FL is the effective aggregation of local model weights +from disparate and potentially unbalanced participating clients. Existing +methods often treat each client indiscriminately, applying a single proportion +to the entire local model. However, it is empirically advantageous for each +weight to be assigned a specific proportion. This paper introduces an +innovative Element-Wise Weights Aggregation Method for Federated Learning +(EWWA-FL) aimed at optimizing learning performance and accelerating convergence +speed. Unlike traditional FL approaches, EWWA-FL aggregates local weights to +the global model at the level of individual elements, thereby allowing each +participating client to make element-wise contributions to the learning +process. By taking into account the unique dataset characteristics of each +client, EWWA-FL enhances the robustness of the global model to different +datasets while also achieving rapid convergence. The method is flexible enough +to employ various weighting strategies. Through comprehensive experiments, we +demonstrate the advanced capabilities of EWWA-FL, showing significant +improvements in both accuracy and convergence speed across a range of backbones +and benchmarks. + +
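+ The element-wise aggregation idea can be sketched as follows; how the per-element scores are produced is the method's own design choice, so the sketch simply takes non-negative score tensors as given (uniform scores reduce to plain FedAvg). This is an illustration, not the paper's code.
+ import torch
+
+ def elementwise_aggregate(client_states, client_scores):
+     """Aggregate client models with a separate mixing weight for every tensor element.
+
+     client_states: list of state_dicts with identical keys/shapes.
+     client_scores: list of state_dicts holding a non-negative score per element.
+     """
+     global_state = {}
+     for key in client_states[0]:
+         scores = torch.stack([s[key].float() for s in client_scores])   # (K, ...)
+         weights = scores / scores.sum(dim=0).clamp_min(1e-12)           # normalize per element
+         params = torch.stack([c[key].float() for c in client_states])   # (K, ...)
+         global_state[key] = (weights * params).sum(dim=0)
+     return global_state
+
+ # Two toy clients with uniform scores: the result is the ordinary element mean.
+ a, b = {"w": torch.ones(2, 2)}, {"w": torch.zeros(2, 2)}
+ uniform = {"w": torch.ones(2, 2)}
+ print(elementwise_aggregate([a, b], [uniform, uniform])["w"])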
+
+ comment: 2023 IEEE International Conference on Data Mining Workshops (ICDMW) +
+
+
+
+
+ + ☆ Perception and Localization of Macular Degeneration Applying + Convolutional Neural Network, ResNet and Grad-CAM + + +
+ Macular degeneration is a well-known retinal disease that causes blurred vision in affected patients. This research classifies healthy and macular-degeneration fundus images and localizes the affected region of the fundus. A plain CNN architecture and CNNs with ResNet backbones (ResNet50, ResNet50v2, ResNet101, ResNet101v2, ResNet152, ResNet152v2) are used to classify the two types of fundus. The data are split three ways: (a) 90% training and 10% testing, (b) 80% training and 20% testing, and (c) 50% training and 50% testing. After training, the best model is selected based on the evaluation metrics. Among the models, the CNN with a ResNet50 backbone performs best, giving a training accuracy of 98.7% for the 90%/10% train/test split. With this model, we perform Grad-CAM visualization to localize the affected region of the fundus. +
+
+ comment: 12 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Learning Long-form Video Prior via Generative Pre-Training + + +
+ Concepts involved in long-form videos such as people, objects, and their +interactions, can be viewed as following an implicit prior. They are notably +complex and continue to pose challenges to be comprehensively learned. In +recent years, generative pre-training (GPT) has exhibited versatile capacities +in modeling any kind of text content even visual locations. Can this manner +work for learning long-form video prior? Instead of operating on pixel space, +it is efficient to employ visual locations like bounding boxes and keypoints to +represent key information in videos, which can be simply discretized and then +tokenized for consumption by GPT. Due to the scarcity of suitable data, we +create a new dataset called \textbf{Storyboard20K} from movies to serve as a +representative. It includes synopses, shot-by-shot keyframes, and fine-grained +annotations of film sets and characters with consistent IDs, bounding boxes, +and whole body keypoints. In this way, long-form videos can be represented by a +set of tokens and be learned via generative pre-training. Experimental results +validate that our approach has great potential for learning long-form video +prior. Code and data will be released at +\url{https://github.com/showlab/Long-form-Video-Prior}. + +
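+ Discretizing visual locations into tokens, as mentioned above, can be done by quantizing normalized box coordinates into a fixed number of bins; the vocabulary layout below is illustrative and not necessarily the scheme used for Storyboard20K.
+ def box_to_tokens(box, image_w, image_h, num_bins=1000, offset=0):
+     """Discretize an (x1, y1, x2, y2) box into four location tokens."""
+     x1, y1, x2, y2 = box
+     norm = [x1 / image_w, y1 / image_h, x2 / image_w, y2 / image_h]
+     # Shift by `offset` so location tokens occupy their own region of the vocabulary.
+     return [offset + min(int(v * num_bins), num_bins - 1) for v in norm]
+
+ def tokens_to_box(tokens, image_w, image_h, num_bins=1000, offset=0):
+     """Invert the quantization (up to bin resolution) when decoding GPT outputs."""
+     x1, y1, x2, y2 = [(t - offset + 0.5) / num_bins for t in tokens]
+     return (x1 * image_w, y1 * image_h, x2 * image_w, y2 * image_h)
+
+ toks = box_to_tokens((48, 120, 512, 600), image_w=1280, image_h=720)
+ print(toks, tokens_to_box(toks, 1280, 720))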
+
+
+
+
+ + ☆ Drawing the Line: Deep Segmentation for Extracting Art from Ancient + Etruscan Mirrors ICDAR2024 + + +
+ Etruscan mirrors constitute a significant category within Etruscan art and, therefore, undergo systematic examinations to obtain insights into ancient times. A crucial aspect of their analysis involves the labor-intensive task of manually tracing engravings from the backside. Additionally, this task is inherently challenging due to the damage these mirrors have sustained, introducing subjectivity into the process. We address these challenges by automating the process through photometric-stereo scanning in conjunction with deep segmentation networks, which, however, requires effective use of the limited data at hand. We accomplish this by incorporating predictions on a per-patch level and various data augmentations, as well as by exploring self-supervised learning. Compared to our baseline, we improve predictive performance w.r.t. the pseudo-F-Measure by around 16%. When assessing performance on complete mirrors against a human baseline, our approach yields quantitatively similar performance to a human annotator and significantly outperforms existing binarization methods. With our proposed methodology, we streamline the annotation process, enhance its objectivity, and reduce the overall workload, offering a valuable contribution to the examination of these historical artifacts and other non-traditional documents. +
+
+ comment: 19 pages, accepted at ICDAR2024 +
+
+
+
+
+ + ☆ Sketch2Human: Deep Human Generation with Disentangled Geometry and + Appearance Control + + +
+ Geometry- and appearance-controlled full-body human image generation is an +interesting but challenging task. Existing solutions are either unconditional +or dependent on coarse conditions (e.g., pose, text), thus lacking explicit +geometry and appearance control of body and garment. Sketching offers such +editing ability and has been adopted in various sketch-based face generation +and editing solutions. However, directly adapting sketch-based face generation +to full-body generation often fails to produce high-fidelity and diverse +results due to the high complexity and diversity in the pose, body shape, and +garment shape and texture. Recent geometrically controllable diffusion-based +methods mainly rely on prompts to generate appearance and it is hard to balance +the realism and the faithfulness of their results to the sketch when the input +is coarse. This work presents Sketch2Human, the first system for controllable +full-body human image generation guided by a semantic sketch (for geometry +control) and a reference image (for appearance control). Our solution is based +on the latent space of StyleGAN-Human with inverted geometry and appearance +latent codes as input. Specifically, we present a sketch encoder trained with a +large synthetic dataset sampled from StyleGAN-Human's latent space and directly +supervised by sketches rather than real images. Considering the entangled +information of partial geometry and texture in StyleGAN-Human and the absence +of disentangled datasets, we design a novel training scheme that creates +geometry-preserved and appearance-transferred training data to tune a generator +to achieve disentangled geometry and appearance control. Although our method is +trained with synthetic data, it can handle hand-drawn sketches as well. +Qualitative and quantitative evaluations demonstrate the superior performance +of our method to state-of-the-art methods. + +
+
+
+
+
+ + ☆ Steal Now and Attack Later: Evaluating Robustness of Object Detection + against Black-box Adversarial Attacks + + +
+ Latency attacks against object detection represent a variant of adversarial attacks that aim to inflate the inference time by generating additional ghost objects in a target image. However, generating ghost objects in the black-box scenario remains a challenge since information about these unqualified objects remains opaque. In this study, we demonstrate the feasibility of generating ghost objects in adversarial examples by extending the concept of "steal now, decrypt later" attacks. These adversarial examples, once produced, can be employed to exploit potential vulnerabilities in the AI service, giving rise to significant security concerns. The experimental results demonstrate that the proposed attack succeeds across various commonly used models and the Google Vision API without any prior knowledge about the target model. Additionally, the average cost of each attack is less than $1, posing a significant threat to AI security. +
+
+
+
+
+ + ☆ Revisiting Out-of-Distribution Detection in LiDAR-based 3D Object + Detection + + +
+ LiDAR-based 3D object detection has become an essential part of automated +driving due to its ability to localize and classify objects precisely in 3D. +However, object detectors face a critical challenge when dealing with unknown +foreground objects, particularly those that were not present in their original +training data. These out-of-distribution (OOD) objects can lead to +misclassifications, posing a significant risk to the safety and reliability of +automated vehicles. Currently, LiDAR-based OOD object detection has not been +well studied. We address this problem by generating synthetic training data for +OOD objects by perturbing known object categories. Our idea is that these +synthetic OOD objects produce different responses in the feature map of an +object detector compared to in-distribution (ID) objects. We then extract +features using a pre-trained and fixed object detector and train a simple +multilayer perceptron (MLP) to classify each detection as either ID or OOD. In +addition, we propose a new evaluation protocol that allows the use of existing +datasets without modifying the point cloud, ensuring a more authentic +evaluation of real-world scenarios. The effectiveness of our method is +validated through experiments on the newly proposed nuScenes OOD benchmark. The +source code is available at https://github.com/uulm-mrm/mmood3d. + +
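+ The ID/OOD classifier described above can be as small as a few linear layers on top of pooled features from the frozen detector; the sketch below uses random placeholder features and an assumed feature dimension, purely to illustrate the training step.
+ import torch
+ import torch.nn as nn
+
+ class OODHead(nn.Module):
+     """Small MLP that scores a per-detection feature vector as ID (0) or OOD (1)."""
+     def __init__(self, feat_dim=256, hidden=128):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(feat_dim, hidden), nn.ReLU(),
+             nn.Linear(hidden, hidden), nn.ReLU(),
+             nn.Linear(hidden, 1))
+
+     def forward(self, feats):                       # feats: (N, feat_dim)
+         return self.net(feats).squeeze(-1)          # one logit per detection
+
+ head = OODHead()
+ optim = torch.optim.Adam(head.parameters(), lr=1e-3)
+ feats = torch.randn(64, 256)                        # placeholder for pooled detector features
+ labels = torch.randint(0, 2, (64,)).float()         # 1 = synthetically perturbed (OOD) sample
+ loss = nn.functional.binary_cross_entropy_with_logits(head(feats), labels)
+ loss.backward()
+ optim.step()
+ print(float(loss))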
+
+ comment: Accepted for publication at the 2024 35th IEEE Intelligent Vehicles + Symposium (IV 2024), June 2-5, 2024, in Jeju Island, Korea +
+
+
+
+
+ + ☆ Enhancing Diagnosis through AI-driven Analysis of Reflectance Confocal + Microscopy + + +
+ Reflectance Confocal Microscopy (RCM) is a non-invasive imaging technique +used in biomedical research and clinical dermatology. It provides virtual +high-resolution images of the skin and superficial tissues, reducing the need +for physical biopsies. RCM employs a laser light source to illuminate the +tissue, capturing the reflected light to generate detailed images of +microscopic structures at various depths. Recent studies explored AI and +machine learning, particularly CNNs, for analyzing RCM images. Our study +proposes a segmentation strategy based on textural features to identify +clinically significant regions, empowering dermatologists in effective image +interpretation and boosting diagnostic confidence. This approach promises to +advance dermatological diagnosis and treatment. + +
+
+
+
+
+ + ☆ Porting Large Language Models to Mobile Devices for Question Answering SP + + +
+ Deploying Large Language Models (LLMs) on mobile devices makes all the +capabilities of natural language processing available on the device. An +important use case of LLMs is question answering, which can provide accurate +and contextually relevant answers to a wide array of user queries. We describe +how we managed to port state of the art LLMs to mobile devices, enabling them +to operate natively on the device. We employ the llama.cpp framework, a +flexible and self-contained C++ framework for LLM inference. We selected a +6-bit quantized version of the Orca-Mini-3B model with 3 billion parameters and +present the correct prompt format for this model. Experimental results show +that LLM inference runs in interactive speed on a Galaxy S21 smartphone and +that the model delivers high-quality answers to user queries related to +questions from different subjects like politics, geography or history. + +
+
+ comment: Accepted for ASPAI 2024 Conference +
+
+
+
+
+ + ☆ 3D Freehand Ultrasound using Visual Inertial and Deep Inertial Odometry + for Measuring Patellar Tracking + + +
+ Patellofemoral joint (PFJ) issues affect one in four people, with 20% +experiencing chronic knee pain despite treatment. Poor outcomes and pain after +knee replacement surgery are often linked to patellar mal-tracking. Traditional +imaging methods like CT and MRI face challenges, including cost and metal +artefacts, and there's currently no ideal way to observe joint motion without +issues such as soft tissue artefacts or radiation exposure. A new system to +monitor joint motion could significantly improve understanding of PFJ dynamics, +aiding in better patient care and outcomes. Combining 2D ultrasound with motion +tracking for 3D reconstruction of the joint using semantic segmentation and +position registration can be a solution. However, the need for expensive +external infrastructure to estimate the trajectories of the scanner remains the +main limitation to implementing 3D bone reconstruction from handheld ultrasound +scanning clinically. We proposed the Visual-Inertial Odometry (VIO) and the +deep learning-based inertial-only odometry methods as alternatives to motion +capture for tracking a handheld ultrasound scanner. The 3D reconstruction +generated by these methods has demonstrated potential for assessing the PFJ and +for further measurements from free-hand ultrasound scans. The results show that +the VIO method performs as well as the motion capture method, with average +reconstruction errors of 1.25 mm and 1.21 mm, respectively. The VIO method is +the first infrastructure-free method for 3D reconstruction of bone from +wireless handheld ultrasound scanning with an accuracy comparable to methods +that require external infrastructure. + +
+
+ comment: Accepted to IEEE Medical Measurements & Applications (MeMeA) 2024 +
+
+
+
+
+ + ☆ Vision Transformer-based Adversarial Domain Adaptation + + +
+ Unsupervised domain adaptation (UDA) aims to transfer knowledge from a +labeled source domain to an unlabeled target domain. The most recent UDA +methods always resort to adversarial training to yield state-of-the-art results +and a dominant number of existing UDA methods employ convolutional neural +networks (CNNs) as feature extractors to learn domain invariant features. +Vision transformer (ViT) has attracted tremendous attention since its emergence +and has been widely used in various computer vision tasks, such as image +classification, object detection, and semantic segmentation, yet its potential +in adversarial domain adaptation has never been investigated. In this paper, we +fill this gap by employing the ViT as the feature extractor in adversarial +domain adaptation. Moreover, we empirically demonstrate that ViT can be a +plug-and-play component in adversarial domain adaptation, which means directly +replacing the CNN-based feature extractor in existing UDA methods with the +ViT-based feature extractor can easily obtain performance improvement. The code +is available at https://github.com/LluckyYH/VT-ADA. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Single-View Scene Point Cloud Human Grasp Generation + + +
+ In this work, we explore a novel task of generating human grasps based on +single-view scene point clouds, which more accurately mirrors the typical +real-world situation of observing objects from a single viewpoint. Due to the +incompleteness of object point clouds and the presence of numerous scene +points, the generated hand is prone to penetrating into the invisible parts of +the object and the model is easily affected by scene points. Thus, we introduce +S2HGrasp, a framework composed of two key modules: the Global Perception module +that globally perceives partial object point clouds, and the DiffuGrasp module +designed to generate high-quality human grasps based on complex inputs that +include scene points. Additionally, we introduce S2HGD dataset, which comprises +approximately 99,000 single-object single-view scene point clouds of 1,668 +unique objects, each annotated with one human grasp. Our extensive experiments +demonstrate that S2HGrasp can not only generate natural human grasps regardless +of scene points, but also effectively prevent penetration between the hand and +invisible parts of the object. Moreover, our model showcases strong +generalization capability when applied to unseen objects. Our code and dataset +are available at https://github.com/iSEE-Laboratory/S2HGrasp. + +
+
+
+
+
+ + ☆ Facilitating Advanced Sentinel-2 Analysis Through a Simplified + Computation of Nadir BRDF Adjusted Reflectance + + +
+ The Sentinel-2 (S2) mission from the European Space Agency's Copernicus +program provides essential data for Earth surface analysis. Its Level-2A +products deliver high-to-medium resolution (10-60 m) surface reflectance (SR) +data through the MultiSpectral Instrument (MSI). To enhance the accuracy and +comparability of SR data, adjustments simulating a nadir viewing perspective +are essential. These corrections address the anisotropic nature of SR and the +variability in sun and observation angles, ensuring consistent image +comparisons over time and under different conditions. The $c$-factor method, a +simple yet effective algorithm, adjusts observed S2 SR by using the MODIS BRDF +model to achieve Nadir BRDF Adjusted Reflectance (NBAR). Despite the +straightforward application of the $c$-factor to individual images, a cohesive +Python framework for its application across multiple S2 images and Earth System +Data Cubes (ESDCs) from cloud-stored data has been lacking. Here we introduce +sen2nbar, a Python package crafted to convert S2 SR data to NBAR, supporting +both individual images and ESDCs derived from cloud-stored data. This package +simplifies the conversion of S2 SR data to NBAR via a single function, +organized into modules for efficient process management. By facilitating NBAR +conversion for both SAFE files and ESDCs from SpatioTemporal Asset Catalogs +(STAC), sen2nbar is developed as a flexible tool that can handle diverse data +format requirements. We anticipate that sen2nbar will considerably contribute +to the standardization and harmonization of S2 data, offering a robust solution +for a diverse range of users across various applications. sen2nbar is an +open-source tool available at https://github.com/ESDS-Leipzig/sen2nbar. + +
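+ The underlying c-factor correction can be written in a few lines once the MODIS BRDF kernel weights (f_iso, f_vol, f_geo) and the volumetric/geometric kernel values at the observed and nadir geometries are available; the sketch below shows only the formula and is not the sen2nbar API.
+ import numpy as np
+
+ def c_factor(f_iso, f_vol, f_geo, kvol_obs, kgeo_obs, kvol_nadir, kgeo_nadir):
+     """c = BRDF-modeled reflectance at nadir view / modeled reflectance at the sensed geometry."""
+     modeled_nadir = f_iso + f_vol * kvol_nadir + f_geo * kgeo_nadir
+     modeled_obs = f_iso + f_vol * kvol_obs + f_geo * kgeo_obs
+     return modeled_nadir / modeled_obs
+
+ def to_nbar(surface_reflectance, c):
+     """NBAR is the observed surface reflectance scaled by the per-band c-factor."""
+     return surface_reflectance * c
+
+ # Toy band: kernel weights would come from the MODIS BRDF product and kernel values from
+ # evaluating the volumetric/geometric kernels at the Sentinel-2 sun/view angles.
+ sr = np.array([0.12, 0.15, 0.18])
+ c = c_factor(f_iso=0.10, f_vol=0.05, f_geo=0.02,
+              kvol_obs=0.3, kgeo_obs=-1.1, kvol_nadir=0.1, kgeo_nadir=-1.3)
+ print(to_nbar(sr, c))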
+
+ comment: Submitted to FOSS4G Europe 2024 +
+
+
+
+
+ + ☆ Raformer: Redundancy-Aware Transformer for Video Wire Inpainting + + +
+ Video Wire Inpainting (VWI) is a prominent application in video inpainting, +aimed at flawlessly removing wires in films or TV series, offering significant +time and labor savings compared to manual frame-by-frame removal. However, wire +removal poses greater challenges due to the wires being longer and slimmer than +objects typically targeted in general video inpainting tasks, and often +intersecting with people and background objects irregularly, which adds +complexity to the inpainting process. Recognizing the limitations posed by +existing video wire datasets, which are characterized by their small size, poor +quality, and limited variety of scenes, we introduce a new VWI dataset with a +novel mask generation strategy, namely Wire Removal Video Dataset 2 (WRV2) and +Pseudo Wire-Shaped (PWS) Masks. WRV2 dataset comprises over 4,000 videos with +an average length of 80 frames, designed to facilitate the development and +efficacy of inpainting models. Building upon this, our research proposes the +Redundancy-Aware Transformer (Raformer) method that addresses the unique +challenges of wire removal in video inpainting. Unlike conventional approaches +that indiscriminately process all frame patches, Raformer employs a novel +strategy to selectively bypass redundant parts, such as static background +segments devoid of valuable information for inpainting. At the core of Raformer +is the Redundancy-Aware Attention (RAA) module, which isolates and accentuates +essential content through a coarse-grained, window-based attention mechanism. +This is complemented by a Soft Feature Alignment (SFA) module, which refines +these features and achieves end-to-end feature alignment. Extensive experiments +on both the traditional video inpainting datasets and our proposed WRV2 dataset +demonstrate that Raformer outperforms other state-of-the-art methods. + +
+
+
+
+
+ + ☆ Leveraging Large Language Models for Multimodal Search CVPR + + +
+ Multimodal search has become increasingly important in providing users with a natural and effective way to express their search intentions. Images offer fine-grained details of the desired products, while text allows for easily incorporating search modifications. However, some existing multimodal search systems are unreliable and fail to address simple queries. The problem becomes harder with the large variability of natural language text queries, which may contain ambiguous, implicit, and irrelevant information. Addressing these issues may require systems with enhanced matching capabilities, reasoning abilities, and context-aware query parsing and rewriting. This paper introduces a novel multimodal search model that achieves a new performance milestone on the Fashion200K dataset. Additionally, we propose a novel search interface integrating Large Language Models (LLMs) to facilitate natural language interaction. This interface routes queries to search systems while conversationally engaging with users and considering previous searches. When coupled with our multimodal search model, it heralds a new era of shopping assistants capable of offering human-like interaction and enhancing the overall search experience. +
+
+ comment: Published at CVPRW 2024 +
+
+
+
+
+ + ☆ MotionMaster: Training-free Camera Motion Transfer For Video Generation + + +
+ The emergence of diffusion models has greatly propelled the progress in image and video generation. Recently, some efforts have been made in controllable video generation, including text-to-video generation and video motion control, among which camera motion control is an important topic. However, existing camera motion control methods rely on training a temporal camera module and necessitate substantial computational resources due to the large number of parameters in video generation models. Moreover, existing methods pre-define camera motion types during training, which limits their flexibility in camera control. Therefore, to reduce training costs and achieve flexible camera control, we propose COMD, a novel training-free video motion transfer model, which disentangles camera motions and object motions in source videos and transfers the extracted camera motions to new videos. We first propose a one-shot camera motion disentanglement method to extract camera motion from a single source video, which separates the moving objects from the background and estimates the camera motion in the moving-object regions from the motion in the background by solving a Poisson equation. Furthermore, we propose a few-shot camera motion disentanglement method to extract the common camera motion from multiple videos with similar camera motions, which employs a window-based clustering technique to extract the common features in the temporal attention maps of multiple videos. Finally, we propose a motion combination method to combine different types of camera motions, giving our model more controllable and flexible camera control. Extensive experiments demonstrate that our training-free approach can effectively decouple camera-object motion and apply the decoupled camera motion to a wide range of controllable video generation tasks, achieving flexible and diverse camera motion control. +
+
+
+
+
+ + ☆ Rethinking Model Prototyping through the MedMNIST+ Dataset Collection + + +
+ The integration of deep learning based systems in clinical practice is often +impeded by challenges rooted in limited and heterogeneous medical datasets. In +addition, prioritization of marginal performance improvements on a few, +narrowly scoped benchmarks over clinical applicability has slowed down +meaningful algorithmic progress. This trend often results in excessive +fine-tuning of existing methods to achieve state-of-the-art performance on +selected datasets rather than fostering clinically relevant innovations. In +response, this work presents a comprehensive benchmark for the MedMNIST+ +database to diversify the evaluation landscape and conduct a thorough analysis +of common convolutional neural networks (CNNs) and Transformer-based +architectures, for medical image classification. Our evaluation encompasses +various medical datasets, training methodologies, and input resolutions, aiming +to reassess the strengths and limitations of widely used model variants. Our +findings suggest that computationally efficient training schemes and modern +foundation models hold promise in bridging the gap between expensive end-to-end +training and more resource-refined approaches. Additionally, contrary to +prevailing assumptions, we observe that higher resolutions may not consistently +improve performance beyond a certain threshold, advocating for the use of lower +resolutions, particularly in prototyping stages, to expedite processing. +Notably, our analysis reaffirms the competitiveness of convolutional models +compared to ViT-based architectures emphasizing the importance of comprehending +the intrinsic capabilities of different model architectures. Moreover, we hope +that our standardized evaluation framework will help enhance transparency, +reproducibility, and comparability on the MedMNIST+ dataset collection as well +as future research within the field. Code will be released soon. + +
+
+
+
+
+ + ☆ Seeing Beyond Classes: Zero-Shot Grounded Situation Recognition via + Language Explainer + + +
+ Benefiting from strong generalization ability, pre-trained vision language models (VLMs), e.g., CLIP, have been widely utilized in zero-shot scene understanding. Unlike simple recognition tasks, grounded situation recognition (GSR) requires the model not only to classify the salient activity (verb) in the image, but also to detect all semantic roles that participate in the action. This complex task usually involves three steps: verb recognition, semantic role grounding, and noun recognition. Directly employing class-based prompts with VLMs and grounding models for this task suffers from several limitations, e.g., it struggles to distinguish ambiguous verb concepts, accurately localize roles with fixed verb-centric template input, and achieve context-aware noun predictions. In this paper, we argue that these limitations stem from the model's poor understanding of verb/noun classes. To this end, we introduce a new approach for zero-shot GSR via Language EXplainer (LEX), which significantly boosts the model's comprehensive capabilities through three explainers: 1) a verb explainer, which generates general verb-centric descriptions to enhance the discriminability of different verb classes; 2) a grounding explainer, which rephrases verb-centric templates for clearer understanding, thereby enhancing precise semantic role localization; and 3) a noun explainer, which creates scene-specific noun descriptions to ensure context-aware noun recognition. By equipping each step of the GSR process with an auxiliary explainer, LEX facilitates complex scene understanding in real-world scenarios. Our extensive validations on the SWiG dataset demonstrate LEX's effectiveness and interoperability in zero-shot GSR. +
+
+
+
+
+ + ☆ Real-Time Compressed Sensing for Joint Hyperspectral Image Transmission + and Restoration for CubeSat + + +
+ This paper addresses the challenges associated with hyperspectral image (HSI) +reconstruction from miniaturized satellites, which often suffer from stripe +effects and are computationally resource-limited. We propose a Real-Time +Compressed Sensing (RTCS) network designed to be lightweight and require only +relatively few training samples for efficient and robust HSI reconstruction in +the presence of the stripe effect and under noisy transmission conditions. The +RTCS network features a simplified architecture that reduces the required +training samples and allows for easy implementation on integer-8-based +encoders, facilitating rapid compressed sensing for stripe-like HSI, which +exactly matches the moderate design of miniaturized satellites on push broom +scanning mechanism. This contrasts optimization-based models that demand +high-precision floating-point operations, making them difficult to deploy on +edge devices. Our encoder employs an integer-8-compatible linear projection for +stripe-like HSI data transmission, ensuring real-time compressed sensing. +Furthermore, based on the novel two-streamed architecture, an efficient HSI +restoration decoder is proposed for the receiver side, allowing for edge-device +reconstruction without needing a sophisticated central server. This is +particularly crucial as an increasing number of miniaturized satellites +necessitates significant computing resources on the ground station. Extensive +experiments validate the superior performance of our approach, offering new and +vital capabilities for existing miniaturized satellite systems. + +
+
+ comment: Accepted by TGRS 2024 +
+
+
+
+
+ + ☆ Toward Physics-Aware Deep Learning Architectures for LiDAR Intensity + Simulation + + +
+ Autonomous vehicles (AVs) heavily rely on LiDAR perception for environment +understanding and navigation. LiDAR intensity provides valuable information +about the reflected laser signals and plays a crucial role in enhancing the +perception capabilities of AVs. However, accurately simulating LiDAR intensity +remains a challenge due to the unavailability of material properties of the +objects in the environment, and complex interactions between the laser beam and +the environment. The proposed method aims to improve the accuracy of intensity +simulation by incorporating physics-based modalities within the deep learning +framework. One of the key entities that captures the interaction between the +laser beam and the objects is the angle of incidence. In this work we +demonstrate that the addition of the LiDAR incidence angle as a separate input +to the deep neural networks significantly enhances the results. We present a +comparative study between two prominent deep learning architectures: U-NET a +Convolutional Neural Network (CNN), and Pix2Pix a Generative Adversarial +Network (GAN). We implemented these two architectures for the intensity +prediction task and used SemanticKITTI and VoxelScape datasets for experiments. +The comparative analysis reveals that both architectures benefit from the +incidence angle as an additional input. Moreover, the Pix2Pix architecture +outperforms U-NET, especially when the incidence angle is incorporated. + +
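+ Feeding the incidence angle to the network can be as simple as computing the cosine between each beam direction and the estimated surface normal and stacking it as an extra input channel; the channel layout and range-image size below are illustrative assumptions.
+ import numpy as np
+
+ def incidence_cosine(ray_dirs, normals):
+     """Cosine of the LiDAR incidence angle per range-image pixel (unit vectors in, (H, W) out)."""
+     return np.clip(np.abs(np.sum(ray_dirs * normals, axis=-1)), 0.0, 1.0)
+
+ def build_input(range_img, ray_dirs, normals):
+     """Stack range and incidence-angle channels into an (H, W, 2) network input."""
+     return np.stack([range_img, incidence_cosine(ray_dirs, normals)], axis=-1)
+
+ H, W = 64, 1024                                     # a typical spinning-LiDAR range image
+ rays = np.random.randn(H, W, 3); rays /= np.linalg.norm(rays, axis=-1, keepdims=True)
+ normals = np.random.randn(H, W, 3); normals /= np.linalg.norm(normals, axis=-1, keepdims=True)
+ x = build_input(np.random.rand(H, W) * 80.0, rays, normals)
+ print(x.shape)                                      # ready for a U-NET or Pix2Pix generator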
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ DVF: Advancing Robust and Accurate Fine-Grained Image Retrieval with + Retrieval Guidelines + + +
+ Fine-grained image retrieval (FGIR) is to learn visual representations that +distinguish visually similar objects while maintaining generalization. Existing +methods propose to generate discriminative features, but rarely consider the +particularity of the FGIR task itself. This paper presents a meticulous +analysis leading to the proposal of practical guidelines to identify +subcategory-specific discrepancies and generate discriminative features to +design effective FGIR models. These guidelines include emphasizing the object +(G1), highlighting subcategory-specific discrepancies (G2), and employing +effective training strategy (G3). Following G1 and G2, we design a novel Dual +Visual Filtering mechanism for the plain visual transformer, denoted as DVF, to +capture subcategory-specific discrepancies. Specifically, the dual visual +filtering mechanism comprises an object-oriented module and a semantic-oriented +module. These components serve to magnify objects and identify discriminative +regions, respectively. Following G3, we implement a discriminative model +training strategy to improve the discriminability and generalization ability of +DVF. Extensive analysis and ablation studies confirm the efficacy of our +proposed guidelines. Without bells and whistles, the proposed DVF achieves +state-of-the-art performance on three widely-used fine-grained datasets in +closed-set and open-set settings. + +
+
+
+
+
+ + ☆ ChEX: Interactive Localization and Region Description in Chest X-rays + + +
+ Report generation models offer fine-grained textual interpretations of +medical images like chest X-rays, yet they often lack interactivity (i.e. the +ability to steer the generation process through user queries) and localized +interpretability (i.e. visually grounding their predictions), which we deem +essential for future adoption in clinical practice. While there have been +efforts to tackle these issues, they are either limited in their interactivity +by not supporting textual queries or fail to also offer localized +interpretability. Therefore, we propose a novel multitask architecture and +training paradigm integrating textual prompts and bounding boxes for diverse +aspects like anatomical regions and pathologies. We call this approach the +Chest X-Ray Explainer (ChEX). Evaluations across a heterogeneous set of 9 chest +X-ray tasks, including localized image interpretation and report generation, +showcase its competitiveness with SOTA models while additional analysis +demonstrates ChEX's interactive capabilities. + +
+
+
+
+
+ + ☆ 3D Face Morphing Attack Generation using Non-Rigid Registration + + +
+ Face Recognition Systems (FRS) are widely used in commercial environments, +such as e-commerce and e-banking, owing to their high accuracy in real-world +conditions. However, these systems are vulnerable to facial morphing attacks, +which are generated by blending face color images of different subjects. This +paper presents a new method for generating 3D face morphs from two bona fide +point clouds. The proposed method first selects bona fide point clouds with +neutral expressions. The two input point clouds were then registered using a +Bayesian Coherent Point Drift (BCPD) without optimization, and the geometry and +color of the registered point clouds were averaged to generate a face morphing +point cloud. The proposed method generates 388 face-morphing point clouds from +200 bona fide subjects. The effectiveness of the method was demonstrated +through extensive vulnerability experiments, achieving a Generalized Morphing +Attack Potential (G-MAP) of 97.93%, which is superior to the existing +state-of-the-art (SOTA) with a G-MAP of 81.61%. + +
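+ Once the two scans are registered into point-wise correspondence (the BCPD step is not reproduced here), the morph itself reduces to averaging positions and colors; the sketch below uses random stand-in data.
+ import numpy as np
+
+ def morph_point_clouds(xyz_a, rgb_a, xyz_b, rgb_b, alpha=0.5):
+     """Blend two registered, point-wise corresponding colored point clouds.
+
+     xyz_*: (N, 3) coordinates, rgb_*: (N, 3) colors in [0, 1]; alpha=0.5 gives a symmetric morph.
+     """
+     assert xyz_a.shape == xyz_b.shape and rgb_a.shape == rgb_b.shape
+     return alpha * xyz_a + (1 - alpha) * xyz_b, alpha * rgb_a + (1 - alpha) * rgb_b
+
+ xyz_m, rgb_m = morph_point_clouds(np.random.rand(1000, 3), np.random.rand(1000, 3),
+                                   np.random.rand(1000, 3), np.random.rand(1000, 3))
+ print(xyz_m.shape, rgb_m.shape)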
+
+ comment: Accepted to 2024 18th International Conference on Automatic Face and + Gesture Recognition (FG) as short paper +
+
+
+
+
+ + ☆ SRAGAN: Saliency Regularized and Attended Generative Adversarial Network + for Chinese Ink-wash Painting Generation + + +
+ This paper handles the problem of converting real pictures into traditional Chinese ink-wash paintings, i.e., Chinese ink-wash painting style transfer. Though this problem could be addressed by a wide range of image-to-image translation models, a notable issue with all these methods is that the original image content details could be easily erased or corrupted due to the transfer of ink-wash style elements. To solve or ameliorate this issue, we propose to incorporate saliency detection into the unpaired image-to-image translation framework to regularize the content information of the generated paintings. The saliency map is utilized for content regularization from two aspects, both explicitly and implicitly: (i) we propose a saliency IOU (SIOU) loss to explicitly regularize saliency consistency before and after stylization; (ii) we propose saliency adaptive normalization (SANorm), which implicitly enhances content integrity of the generated paintings by injecting saliency information into the generator network to guide painting generation. Besides, we also propose a saliency-attended discriminator network which harnesses the saliency mask to focus generative adversarial attention onto salient image regions, contributing to a finer ink-wash stylization effect for the salient objects of images. Qualitative and quantitative experiments consistently demonstrate the superiority of our model over related advanced methods for Chinese ink-wash painting style transfer. +
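+ One plausible form of a saliency-IOU consistency term is a soft IoU between the saliency map of the source photo and that of the generated painting; the exact SIOU loss used by SRAGAN may differ, so treat this as an assumption-laden sketch.
+ import torch
+
+ def saliency_iou_loss(sal_src, sal_gen, eps=1e-6):
+     """1 - soft IoU between two saliency maps in [0, 1] of shape (B, 1, H, W)."""
+     inter = (sal_src * sal_gen).sum(dim=(1, 2, 3))
+     union = (sal_src + sal_gen - sal_src * sal_gen).sum(dim=(1, 2, 3))
+     return (1.0 - (inter + eps) / (union + eps)).mean()
+
+ a = torch.rand(4, 1, 64, 64)
+ print(saliency_iou_loss(a, a))          # near zero when saliency is preserved
+ print(saliency_iou_loss(a, 1.0 - a))    # large when salient content is destroyed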
+
+ comment: 25 pages, 14 figures +
+
+
+
+
+ + ☆ Fine-grained Spatial-temporal MLP Architecture for Metro + Origin-Destination Prediction + + +
+ Accurate prediction of metro traffic is crucial for optimizing metro +scheduling and enhancing overall transport efficiency. Analyzing fine-grained +and comprehensive relations among stations effectively is imperative for metro +Origin-Destination (OD) prediction. However, existing metro OD models either +mix information from multiple OD pairs from the station's perspective or +exclusively focus on a subset of OD pairs. These approaches may overlook +fine-grained relations among OD pairs, leading to difficulties in predicting +potential anomalous conditions. To address these challenges, we analyze traffic +variations from the perspective of all OD pairs and propose a fine-grained +spatial-temporal MLP architecture for metro OD prediction, namely ODMixer. +Specifically, our ODMixer has double-branch structure and involves the Channel +Mixer, the Multi-view Mixer, and the Bidirectional Trend Learner. The Channel +Mixer aims to capture short-term temporal relations among OD pairs, the +Multi-view Mixer concentrates on capturing relations from both origin and +destination perspectives. To model long-term temporal relations, we introduce +the Bidirectional Trend Learner. Extensive experiments on two large-scale metro +OD prediction datasets HZMOD and SHMO demonstrate the advantages of our +ODMixer. The code will be available. + +
+
+
+
+
+ + ☆ SPARO: Selective Attention for Robust and Compositional Transformer + Encodings for Vision + + +
+ Selective attention helps us focus on task-relevant aspects in the constant +flood of our sensory input. This constraint in our perception allows us to +robustly generalize under distractions and to new compositions of perceivable +concepts. Transformers employ a similar notion of attention in their +architecture, but representation learning models with transformer backbones +like CLIP and DINO often fail to demonstrate robustness and compositionality. +We highlight a missing architectural prior: unlike human perception, +transformer encodings do not separately attend over individual concepts. In +response, we propose SPARO, a read-out mechanism that partitions encodings into +separately-attended slots, each produced by a single attention head. Using +SPARO with CLIP imparts an inductive bias that the vision and text modalities +are different views of a shared compositional world with the same corresponding +concepts. Using SPARO, we demonstrate improvements on downstream recognition, +robustness, retrieval, and compositionality benchmarks with CLIP (up to +14% +for ImageNet, +4% for SugarCrepe), and on nearest neighbors and linear probe +for ImageNet with DINO (+3% each). We also showcase a powerful ability to +intervene and select individual SPARO concepts to further improve downstream +task performance (up from +4% to +9% for SugarCrepe) and use this ability to +study the robustness of SPARO's representation structure. Finally, we provide +insights through ablation experiments and visualization of learned concepts. + +
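+ A minimal read-out in the spirit described above: each slot is produced by its own single-head cross-attention with a learned query over the backbone's token sequence; dimensions, slot count, and the projection are illustrative assumptions rather than the SPARO configuration.
+ import torch
+ import torch.nn as nn
+
+ class SlotReadout(nn.Module):
+     """Partition an encoding into slots, each produced by one single-head attention read-out."""
+     def __init__(self, token_dim=768, num_slots=16, slot_dim=64):
+         super().__init__()
+         self.queries = nn.Parameter(torch.randn(num_slots, token_dim) * 0.02)
+         self.heads = nn.ModuleList([
+             nn.MultiheadAttention(token_dim, num_heads=1, batch_first=True)
+             for _ in range(num_slots)])
+         self.proj = nn.ModuleList([nn.Linear(token_dim, slot_dim) for _ in range(num_slots)])
+
+     def forward(self, tokens):                          # tokens: (B, N, token_dim) from a ViT
+         B = tokens.shape[0]
+         slots = []
+         for i, (attn, proj) in enumerate(zip(self.heads, self.proj)):
+             q = self.queries[i].expand(B, 1, -1)        # one learned query per slot
+             out, _ = attn(q, tokens, tokens)            # (B, 1, token_dim)
+             slots.append(proj(out.squeeze(1)))          # (B, slot_dim)
+         return torch.cat(slots, dim=-1)                 # concatenated slot encoding
+
+ x = torch.randn(2, 197, 768)                            # e.g. ViT-B/16 token sequence
+ print(SlotReadout()(x).shape)                           # torch.Size([2, 1024])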
+
+
+
+
+ + ☆ Mitigating False Predictions In Unreasonable Body Regions + + +
+ Despite considerable strides in developing deep learning models for 3D +medical image segmentation, the challenge of effectively generalizing across +diverse image distributions persists. While domain generalization is +acknowledged as vital for robust application in clinical settings, the +challenges stemming from training with a limited Field of View (FOV) remain +unaddressed. This limitation leads to false predictions when applied to body +regions beyond the FOV of the training data. In response to this problem, we +propose a novel loss function that penalizes predictions in implausible body +regions, applicable in both single-dataset and multi-dataset training schemes. +It is realized with a Body Part Regression model that generates axial slice +positional scores. Through comprehensive evaluation using a test set featuring +varying FOVs, our approach demonstrates remarkable improvements in +generalization capabilities. It effectively mitigates false positive tumor +predictions up to 85% and significantly enhances overall segmentation +performance. + +
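+ The described penalty can be sketched as a masked loss on axial slices whose body-part-regression score falls outside a plausible interval for the target structure; the interval and tensor shapes below are assumptions for illustration, not the paper's exact formulation.
+ import torch
+
+ def implausible_region_loss(pred_probs, slice_scores, valid_range):
+     """Penalize foreground predicted on axial slices outside a plausible body region.
+
+     pred_probs:   (B, D, H, W) predicted foreground probabilities.
+     slice_scores: (B, D) axial position scores from a body-part regression model.
+     valid_range:  (low, high) score interval where the target structure can occur.
+     """
+     low, high = valid_range
+     implausible = ((slice_scores < low) | (slice_scores > high)).float()  # (B, D) mask
+     per_slice = pred_probs.mean(dim=(2, 3))                               # mean foreground per slice
+     denom = implausible.sum().clamp_min(1.0)
+     return (per_slice * implausible).sum() / denom
+
+ probs = torch.rand(2, 16, 32, 32)
+ scores = torch.linspace(0, 100, 16).expand(2, 16)
+ print(implausible_region_loss(probs, scores, valid_range=(40.0, 70.0)))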
+
+
+
+
+ + ☆ Ada-DF: An Adaptive Label Distribution Fusion Network For Facial + Expression Recognition + + +
+ Facial expression recognition (FER) plays a significant role in our daily +life. However, annotation ambiguity in the datasets could greatly hinder the +performance. In this paper, we address FER task via label distribution learning +paradigm, and develop a dual-branch Adaptive Distribution Fusion (Ada-DF) +framework. One auxiliary branch is constructed to obtain the label +distributions of samples. The class distributions of emotions are then computed +through the label distributions of each emotion. Finally, those two +distributions are adaptively fused according to the attention weights to train +the target branch. Extensive experiments are conducted on three real-world +datasets, RAF-DB, AffectNet and SFEW, where our Ada-DF shows advantages over +the state-of-the-art works. + +
+
+
+
+
+ + ☆ ViViDex: Learning Vision-based Dexterous Manipulation from Human Videos + + +
+ In this work, we aim to learn a unified vision-based policy for a +multi-fingered robot hand to manipulate different objects in diverse poses. +Though prior work has demonstrated that human videos can benefit policy +learning, performance improvement has been limited by physically implausible +trajectories extracted from videos. Moreover, reliance on privileged object +information such as ground-truth object states further limits the applicability +in realistic scenarios. To address these limitations, we propose a new +framework ViViDex to improve vision-based policy learning from human videos. It +first uses reinforcement learning with trajectory guided rewards to train +state-based policies for each video, obtaining both visually natural and +physically plausible trajectories from the video. We then rollout successful +episodes from state-based policies and train a unified visual policy without +using any privileged information. A coordinate transformation method is +proposed to significantly boost the performance. We evaluate our method on +three dexterous manipulation tasks and demonstrate a large improvement over +state-of-the-art algorithms. + +
+
+ comment: Project Page: https://zerchen.github.io/projects/vividex.html +
+
+
+
+
+ + ☆ ESR-NeRF: Emissive Source Reconstruction Using LDR Multi-view Images CVPR 2024 + + +
+ Existing NeRF-based inverse rendering methods suppose that scenes are +exclusively illuminated by distant light sources, neglecting the potential +influence of emissive sources within a scene. In this work, we confront this +limitation using LDR multi-view images captured with emissive sources turned on +and off. Two key issues must be addressed: 1) ambiguity arising from the +limited dynamic range along with unknown lighting details, and 2) the expensive +computational cost in volume rendering to backtrace the paths leading to final +object colors. We present a novel approach, ESR-NeRF, leveraging neural +networks as learnable functions to represent ray-traced fields. By training +networks to satisfy light transport segments, we regulate outgoing radiances, +progressively identifying emissive sources while being aware of reflection +areas. The results on scenes encompassing emissive sources with various +properties demonstrate the superiority of ESR-NeRF in qualitative and +quantitative ways. Our approach also extends its applicability to the scenes +devoid of emissive sources, achieving lower CD metrics on the DTU dataset. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MAS-SAM: Segment Any Marine Animal with Aggregated Features IJCAI2024 + + +
+ Recently, the Segment Anything Model (SAM) has shown exceptional performance in generating high-quality object masks and achieving zero-shot image segmentation. However, as a versatile vision model, SAM is primarily trained on large-scale natural light images. In underwater scenes, it exhibits substantial performance degradation due to light scattering and absorption. Meanwhile, the simplicity of the SAM decoder might lead to the loss of fine-grained object details. To address the above issues, we propose a novel feature learning framework named MAS-SAM for marine animal segmentation, which involves integrating effective adapters into the SAM encoder and constructing a pyramidal decoder. More specifically, we first build a new SAM encoder with effective adapters for underwater scenes. Then, we introduce a Hypermap Extraction Module (HEM) to generate multi-scale features for comprehensive guidance. Finally, we propose a Progressive Prediction Decoder (PPD) to aggregate the multi-scale features and predict the final segmentation results. When grafted with the Fusion Attention Module (FAM), our method is able to extract richer marine information, from global contextual cues to fine-grained local details. Extensive experiments on four public MAS datasets demonstrate that our MAS-SAM can obtain better results than other typical segmentation methods. The source code is available at https://github.com/Drchip61/MAS-SAM. +
+
+ comment: Accepted by IJCAI2024. More modifications may be performed +
+
+
+
+
+ + ☆ DeepFeatureX Net: Deep Features eXtractors based Network for + discriminating synthetic from real images + + +
+ Deepfakes, synthetic images generated by deep learning algorithms, represent one of the biggest challenges in the field of Digital Forensics. The scientific community is working to develop approaches that can discriminate the origin of digital images (real or AI-generated). However, these methodologies face the challenge of generalization, that is, the ability to discern the nature of an image even when it is generated by an architecture not seen during training, which usually leads to a drop in performance. In this context, we propose a novel approach based on three blocks called Base Models, each of which is responsible for extracting the discriminative features of a specific image class (Diffusion Model-generated, GAN-generated, or real), as each block is trained on a deliberately unbalanced dataset. The features extracted from each block are then concatenated and processed to discriminate the origin of the input image. Experimental results show that this approach is not only robust to JPEG compression but also outperforms state-of-the-art methods in several generalization tests. Code, models and dataset are available at https://github.com/opontorno/block-based_deepfake-detection.
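A minimal PyTorch sketch of the three-branch idea described above: one class-specific feature extractor per image class, with their features concatenated and fed to a small classifier head. The backbone choice, feature size, and head are illustrative assumptions, not the paper's exact architecture:

```python
import torch
import torch.nn as nn

class ThreeBranchDetector(nn.Module):
    """Concatenate features from three class-specific extractors, then classify."""
    def __init__(self, feat_dim: int = 128, num_classes: int = 3):
        super().__init__()
        # stand-ins for the diffusion / GAN / real "Base Models"
        self.branches = nn.ModuleList([
            nn.Sequential(nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
                          nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                          nn.Linear(16, feat_dim))
            for _ in range(3)
        ])
        self.head = nn.Sequential(nn.Linear(3 * feat_dim, 256), nn.ReLU(),
                                  nn.Linear(256, num_classes))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feats = torch.cat([branch(x) for branch in self.branches], dim=1)
        return self.head(feats)

logits = ThreeBranchDetector()(torch.randn(4, 3, 224, 224))
print(logits.shape)   # torch.Size([4, 3]) -> diffusion / GAN / real scores
```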
+
+
+
+
+ + ☆ AnoFPDM: Anomaly Segmentation with Forward Process of Diffusion Models + for Brain MRI + + +
+ Weakly-supervised diffusion models (DM) in anomaly segmentation, leveraging +image-level labels, have attracted significant attention for their superior +performance compared to unsupervised methods. It eliminates the need for +pixel-level labels in training, offering a more cost-effective alternative to +supervised methods. However, existing methods are not fully weakly-supervised +because they heavily rely on costly pixel-level labels for hyperparameter +tuning in inference. To tackle this challenge, we introduce Anomaly +Segmentation with Forward Process of Diffusion Models (AnoFPDM), a fully +weakly-supervised framework that operates without the need for pixel-level +labels. Leveraging the unguided forward process as a reference, we identify +suitable hyperparameters, i.e., noise scale and threshold, for each input +image. We aggregate anomaly maps from each step in the forward process, +enhancing the signal strength of anomalous regions. Remarkably, our proposed +method outperforms recent state-of-the-art weakly-supervised approaches, even +without utilizing pixel-level labels. + +
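One way to read the aggregation step described above: compare a guided and an unguided reconstruction at each forward-process step and accumulate the per-step differences into a single anomaly map. A hedged numpy sketch of that aggregation alone (the reconstructions are taken as given; the absolute-difference heuristic and threshold are illustrative, not the paper's exact hyperparameter-selection procedure):

```python
import numpy as np

def aggregate_anomaly_maps(guided_recons, unguided_recons, threshold=0.5):
    """Sum per-step discrepancies into one anomaly map, then threshold.

    guided_recons, unguided_recons: lists of (H, W) reconstructions, one per
    forward-process step; discrepancies accumulate in anomalous regions.
    """
    acc = np.zeros_like(guided_recons[0], dtype=np.float64)
    for g, u in zip(guided_recons, unguided_recons):
        acc += np.abs(g - u)
    acc = (acc - acc.min()) / (acc.max() - acc.min() + 1e-8)   # normalize to [0, 1]
    return acc, acc > threshold                                 # soft map, binary mask

steps = [np.random.rand(64, 64) for _ in range(10)]
soft, mask = aggregate_anomaly_maps(steps, [s * 0.9 for s in steps])
print(soft.shape, mask.mean())
```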
+
+
+
+
+ + ☆ CharacterFactory: Sampling Consistent Characters with GANs for Diffusion + Models + + +
+ Recent advances in text-to-image models have opened new frontiers in +human-centric generation. However, these models cannot be directly employed to +generate images with consistent newly coined identities. In this work, we +propose CharacterFactory, a framework that allows sampling new characters with +consistent identities in the latent space of GANs for diffusion models. More +specifically, we consider the word embeddings of celeb names as ground truths +for the identity-consistent generation task and train a GAN model to learn the +mapping from a latent space to the celeb embedding space. In addition, we +design a context-consistent loss to ensure that the generated identity +embeddings can produce identity-consistent images in various contexts. +Remarkably, the whole model only takes 10 minutes for training, and can sample +infinite characters end-to-end during inference. Extensive experiments +demonstrate excellent performance of the proposed CharacterFactory on character +creation in terms of identity consistency and editability. Furthermore, the +generated characters can be seamlessly combined with the off-the-shelf +image/video/3D diffusion models. We believe that the proposed CharacterFactory +is an important step for identity-consistent character generation. Project page +is available at: https://qinghew.github.io/CharacterFactory/. + +
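A minimal sketch of the mapping idea described above: a small generator maps random latents into the word-embedding space of a text encoder, and a context-consistency term encourages the generated identity embedding to behave the same across different prompt contexts. The module sizes, the stand-in "context encoder", and the MSE-based consistency term are assumptions for illustration, not the released CharacterFactory implementation:

```python
import torch
import torch.nn as nn

latent_dim, embed_dim = 64, 768           # embed_dim matches a text encoder's word embeddings

generator = nn.Sequential(                # maps GAN latents into the word-embedding space
    nn.Linear(latent_dim, 256), nn.ReLU(), nn.Linear(256, embed_dim))

context_encoder = nn.Linear(embed_dim, embed_dim)   # stand-in for a frozen text encoder

def context_consistent_loss(identity_emb, context_embs):
    """Encourage one identity embedding to yield consistent features when
    placed in different prompt contexts (here: simply added to them)."""
    feats = [context_encoder(identity_emb + c) for c in context_embs]
    ref = feats[0].detach()
    return sum(nn.functional.mse_loss(f, ref) for f in feats[1:])

z = torch.randn(8, latent_dim)
identity = generator(z)                                   # pseudo identity embeddings
contexts = [torch.randn(8, embed_dim) for _ in range(3)]  # e.g. "a photo of", "a painting of"
print(context_consistent_loss(identity, contexts).item())
```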
+
+ comment: Code will be released very soon: + https://github.com/qinghew/CharacterFactory +
+
+
+
+
+ + ☆ Representing Part-Whole Hierarchies in Foundation Models by Learning + Localizability, Composability, and Decomposability from Anatomy via + Self-Supervision CVPR 2024 + + +
+ Humans effortlessly interpret images by parsing them into part-whole +hierarchies; deep learning excels in learning multi-level feature spaces, but +they often lack explicit coding of part-whole relations, a prominent property +of medical imaging. To overcome this limitation, we introduce Adam-v2, a new +self-supervised learning framework extending Adam [79] by explicitly +incorporating part-whole hierarchies into its learning objectives through three +key branches: (1) Localizability, acquiring discriminative representations to +distinguish different anatomical patterns; (2) Composability, learning each +anatomical structure in a parts-to-whole manner; and (3) Decomposability, +comprehending each anatomical structure in a whole-to-parts manner. +Experimental results across 10 tasks, compared to 11 baselines in zero-shot, +few-shot transfer, and full fine-tuning settings, showcase Adam-v2's superior +performance over large-scale medical models and existing SSL methods across +diverse downstream tasks. The higher generality and robustness of Adam-v2's +representations originate from its explicit construction of hierarchies for +distinct anatomical structures from unlabeled medical images. Adam-v2 preserves +a semantic balance of anatomical diversity and harmony in its embedding, +yielding representations that are both generic and semantically meaningful, yet +overlooked in existing SSL methods. All code and pretrained models are +available at https://github.com/JLiangLab/Eden. + +
+
+ comment: Accepted at CVPR 2024 [main conference] +
+
+
+
+
+ + ☆ CWF: Consolidating Weak Features in High-quality Mesh Simplification + + +
+ In mesh simplification, common requirements like accuracy, triangle quality, +and feature alignment are often considered as a trade-off. Existing algorithms +concentrate on just one or a few specific aspects of these requirements. For +example, the well-known Quadric Error Metrics (QEM) approach prioritizes +accuracy and can preserve strong feature lines/points as well but falls short +in ensuring high triangle quality and may degrade weak features that are not as +distinctive as strong ones. In this paper, we propose a smooth functional that +simultaneously considers all of these requirements. The functional comprises a +normal anisotropy term and a Centroidal Voronoi Tessellation (CVT) energy term, +with the variables being a set of movable points lying on the surface. The +former inherits the spirit of QEM but operates in a continuous setting, while +the latter encourages even point distribution, allowing various surface +metrics. We further introduce a decaying weight to automatically balance the +two terms. We selected 100 CAD models from the ABC dataset, along with 21 +organic models, to compare the existing mesh simplification algorithms with +ours. Experimental results reveal an important observation: the introduction of +a decaying weight effectively reduces the conflict between the two terms and +enables the alignment of weak features. This distinctive feature sets our +approach apart from most existing mesh simplification methods and demonstrates +significant potential in shape understanding. + +
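The functional described above combines a normal-anisotropy term with a CVT energy under a decaying weight; a toy numpy sketch of how such a decaying weight trades off the two terms over iterations (the exponential decay schedule, the point-sampled energy surrogates, and placing the decay on the CVT term are illustrative assumptions, not the paper's exact formulation):

```python
import numpy as np

def combined_energy(points, normals, centroids, step, w0=1.0, tau=50.0):
    """Toy surrogate of a CWF-style objective: anisotropy term + decaying CVT term.

    points    : (N, 3) movable points on the surface
    normals   : (N, 3) unit surface normals at those points
    centroids : (N, 3) centroids of each point's Voronoi cell (precomputed)
    step      : current iteration, used by the decaying weight
    """
    offset = centroids - points
    # penalize displacement along the normal (a QEM-like anisotropy surrogate)
    e_aniso = np.sum(np.einsum("ij,ij->i", offset, normals) ** 2)
    # the CVT energy encourages points to sit at their cell centroids
    e_cvt = np.sum(offset ** 2)
    w = w0 * np.exp(-step / tau)          # decaying weight balancing the two terms
    return e_aniso + w * e_cvt

pts = np.random.rand(100, 3)
nrm = np.tile([0.0, 0.0, 1.0], (100, 1))
cen = pts + 0.01 * np.random.randn(100, 3)
print(combined_energy(pts, nrm, cen, step=0), combined_energy(pts, nrm, cen, step=200))
```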
+
+ comment: 14 pages, 22 figures +
+
+
+
+
+ + ☆ Multi-Modal Proxy Learning Towards Personalized Visual Multiple + Clustering CVPR 2024 + + +
+ Multiple clustering has gained significant attention in recent years due to +its potential to reveal multiple hidden structures of data from different +perspectives. The advent of deep multiple clustering techniques has notably +advanced the performance by uncovering complex patterns and relationships +within large datasets. However, a major challenge arises as users often do not +need all the clusterings that algorithms generate, and figuring out the one +needed requires a substantial understanding of each clustering result. +Traditionally, aligning a user's brief keyword of interest with the +corresponding vision components was challenging, but the emergence of +multi-modal and large language models (LLMs) has begun to bridge this gap. In +response, given unlabeled target visual data, we propose Multi-MaP, a novel +method employing a multi-modal proxy learning process. It leverages CLIP +encoders to extract coherent text and image embeddings, with GPT-4 integrating +users' interests to formulate effective textual contexts. Moreover, reference +word constraint and concept-level constraint are designed to learn the optimal +text proxy according to the user's interest. Multi-MaP not only adeptly +captures a user's interest via a keyword but also facilitates identifying +relevant clusterings. Our extensive experiments show that Multi-MaP +consistently outperforms state-of-the-art methods in all benchmark +multi-clustering vision tasks. Our code is available at +https://github.com/Alexander-Yao/Multi-MaP. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://github.com/Alexander-Yao/Multi-MaP +
+
+
+
+
+ + ☆ CatLIP: CLIP-level Visual Recognition Accuracy with 2.7x Faster + Pre-training on Web-scale Image-Text Data + + +
+ Contrastive learning has emerged as a transformative method for learning +effective visual representations through the alignment of image and text +embeddings. However, pairwise similarity computation in contrastive loss +between image and text pairs poses computational challenges. This paper +presents a novel weakly supervised pre-training of vision models on web-scale +image-text data. The proposed method reframes pre-training on image-text data +as a classification task. Consequently, it eliminates the need for pairwise +similarity computations in contrastive loss, achieving a remarkable $2.7\times$ +acceleration in training speed compared to contrastive learning on web-scale +data. Through extensive experiments spanning diverse vision tasks, including +detection and segmentation, we demonstrate that the proposed method maintains +high representation quality. Our source code along with pre-trained model +weights and training recipes is available at +\url{https://github.com/apple/corenet}. + +
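A hedged sketch of the reframing described above: instead of a pairwise contrastive loss, each caption is turned into a multi-label target over a fixed vocabulary (e.g., extracted nouns), and the image encoder is trained with binary cross-entropy. The tiny vocabulary, whitespace tokenization, and linear encoder below are toy stand-ins, not CatLIP's actual pipeline:

```python
import torch
import torch.nn as nn

vocab = ["dog", "cat", "car", "tree", "person"]        # toy label vocabulary
word_to_idx = {w: i for i, w in enumerate(vocab)}

def caption_to_multilabel(caption: str) -> torch.Tensor:
    """Turn a caption into a multi-hot target over the vocabulary."""
    target = torch.zeros(len(vocab))
    for token in caption.lower().split():
        if token in word_to_idx:
            target[word_to_idx[token]] = 1.0
    return target

encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, len(vocab)))  # stand-in image encoder
criterion = nn.BCEWithLogitsLoss()     # classification loss: no pairwise similarities needed

images = torch.randn(2, 3, 32, 32)
captions = ["a dog chasing a car", "a person under a tree"]
targets = torch.stack([caption_to_multilabel(c) for c in captions])
print(criterion(encoder(images), targets).item())
```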
+
+
+
+
+ + ☆ Building-PCC: Building Point Cloud Completion Benchmarks + + +
+ With the rapid advancement of 3D sensing technologies, obtaining 3D shape +information of objects has become increasingly convenient. Lidar technology, +with its capability to accurately capture the 3D information of objects at long +distances, has been widely applied in the collection of 3D data in urban +scenes. However, the collected point cloud data often exhibit incompleteness +due to factors such as occlusion, signal absorption, and specular reflection. +This paper explores the application of point cloud completion technologies in +processing these incomplete data and establishes a new real-world benchmark +Building-PCC dataset, to evaluate the performance of existing deep learning +methods in the task of urban building point cloud completion. Through a +comprehensive evaluation of different methods, we analyze the key challenges +faced in building point cloud completion, aiming to promote innovation in the +field of 3D geoinformation applications. Our source code is available at +https://github.com/tudelft3d/Building-PCC-Building-Point-Cloud-Completion-Benchmarks.git. + +
+
+
+
+
+ + ☆ PriorNet: A Novel Lightweight Network with Multidimensional Interactive + Attention for Efficient Image Dehazing + + +
+ Hazy images degrade visual quality, and dehazing is a crucial prerequisite for subsequent processing tasks. Most current dehazing methods rely on neural networks and face challenges such as heavy computational and parameter overhead and weak generalization capabilities. This paper introduces PriorNet, a novel, lightweight, and highly applicable dehazing network designed to significantly improve the clarity and visual quality of hazy images while avoiding excessive detail-extraction issues. The core of PriorNet is a novel Multi-Dimensional Interactive Attention (MIA) mechanism, which effectively captures a wide range of haze characteristics, substantially reducing the computational load and generalization difficulties associated with complex systems. By utilizing a uniform convolutional kernel size and incorporating skip connections, we have streamlined the feature extraction process; simplifying the number of layers and the architecture not only enhances dehazing efficiency but also facilitates deployment on edge devices. Extensive testing across multiple datasets has demonstrated PriorNet's exceptional performance in dehazing and clarity restoration, maintaining image detail and color fidelity in single-image dehazing tasks. Notably, with a model size of just 18 KB, PriorNet shows superior dehazing generalization compared to other methods. Our research makes a significant contribution to advancing image dehazing technology, providing new perspectives and tools for the field and related domains, and particularly emphasizes the importance of improving universality and deployability.
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ A Real-time Evaluation Framework for Pedestrian's Potential Risk at + Non-Signalized Intersections Based on Predicted Post-Encroachment Time + + +
+ Addressing pedestrian safety at intersections is one of the paramount concerns in transportation research, driven by the urgency of reducing traffic-related injuries and fatalities. With advances in computer vision technologies and predictive models, developing real-time proactive protection systems is increasingly recognized as vital to improving pedestrian safety at intersections. The core of these protection systems lies in the prediction-based evaluation of pedestrians' potential risks, which plays a significant role in preventing accidents. The major challenges in current prediction-based potential risk evaluation research can be summarized in three aspects: the inadequate progress in creating a real-time framework for evaluating pedestrians' potential risks, the absence of accurate and explainable safety indicators that can represent the potential risk, and the lack of tailor-made evaluation criteria specifically for each category of pedestrians. To address these challenges, in this study, a framework with computer vision technologies and predictive models is developed to evaluate the potential risk of pedestrians in real time. Integral to this framework is a novel surrogate safety measure, the Predicted Post-Encroachment Time (P-PET), derived from deep learning models capable of predicting the arrival times of pedestrians and vehicles at intersections. To further improve the effectiveness and reliability of pedestrian risk evaluation, we classify pedestrians into distinct categories and apply specific evaluation criteria for each group. The results demonstrate the framework's ability to effectively identify potential risks through the use of P-PET, indicating its feasibility for real-time applications and its improved performance in risk evaluation across different categories of pedestrians.
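Post-encroachment time is conventionally the time gap between one road user leaving a conflict area and another entering it; the predicted variant sketched below simply applies that definition to predicted arrival times and flags small gaps as risky. The threshold and the input format are illustrative assumptions, not the paper's calibrated, category-specific criteria:

```python
def predicted_post_encroachment_time(t_pedestrian: float, t_vehicle: float) -> float:
    """P-PET surrogate: absolute gap (seconds) between the predicted arrival
    times of a pedestrian and a vehicle at the same conflict point."""
    return abs(t_pedestrian - t_vehicle)

def is_high_risk(t_pedestrian: float, t_vehicle: float, threshold: float = 2.0) -> bool:
    """Flag the interaction as risky when the predicted gap is below a threshold."""
    return predicted_post_encroachment_time(t_pedestrian, t_vehicle) < threshold

# toy usage: predictions say the pedestrian arrives at t=4.1 s, the vehicle at t=5.0 s
print(predicted_post_encroachment_time(4.1, 5.0))   # 0.9 s
print(is_high_risk(4.1, 5.0))                       # True -> trigger a warning
```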
+
+
+
+
+ + ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian splat renderings and fine-tuned diffusion features + + +
+ Existing deep learning methods for the reconstruction and denoising of point clouds rely on small datasets of 3D shapes. We circumvent this problem by leveraging deep learning methods trained on billions of images. We propose a method to reconstruct point clouds from few images and to denoise point clouds from their renderings by exploiting prior knowledge distilled from image-based deep learning models. To improve reconstruction in constrained settings, we regularize the training of a differentiable renderer with hybrid surface and appearance representations by introducing semantic consistency supervision. In addition, we propose a pipeline to fine-tune Stable Diffusion to denoise renderings of noisy point clouds, and we demonstrate how these learned filters can be used to remove point cloud noise without any 3D supervision. We compare our method with DSS and PointRadiance and achieve higher-quality 3D reconstruction on the Sketchfab Testset and the SCUT Dataset.
+
+ comment: An author was not informed in a timely manner before the submission was released
+
+
+
+
+ + ♻ ☆ 3D scene generation from scene graphs and self-attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable fashion opens up applications in simulated navigation and virtual reality. As concise and robust representations of a scene, scene graphs have proven to be well-suited as the semantic control for the generated layout. We present a variant of the conditional variational autoencoder (cVAE) model to synthesize 3D scenes from scene graphs and floor plans. We exploit the properties of self-attention layers to capture high-level relationships between objects in a scene and use these as the building blocks of our model. Our model leverages graph transformers to estimate the size, dimension and orientation of the objects in a room while satisfying the relationships in the given scene graph. Our experiments show that self-attention layers lead to sparser (7.9x compared to Graph-to-3D) and more diverse (16%) scenes.
+
+ comment: Some authors were not informed of the submission in a timely manner
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style transfer. Existing methods often suffer from the drawback of applying style homogeneously across the entire image, leading to stylistic inconsistencies or distorted foreground objects when applied to images with foreground elements such as human figures. To address this limitation, we propose a new approach that leverages a segmentation network to precisely isolate foreground objects within the input image. Subsequently, style transfer is applied exclusively to the background region. The isolated foreground objects are then carefully reintegrated into the style-transferred background. To enhance the visual coherence between foreground and background, a color transfer step is applied to the foreground elements prior to their reincorporation. Finally, we utilize feathering techniques to achieve a seamless amalgamation of foreground and background, resulting in a visually unified and aesthetically pleasing final composition. Extensive evaluations demonstrate that our proposed approach yields significantly more natural stylistic transformations compared to conventional methods.
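The final feathering step described above amounts to alpha-compositing the untouched foreground over the stylized background with a softened mask; a minimal numpy/scipy sketch of that compositing step alone (segmentation, style transfer, and color transfer are taken as given; the Gaussian-blur feathering radius is an illustrative choice):

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def feathered_composite(foreground, stylized_background, mask, feather_sigma=5.0):
    """Blend an untouched foreground over a stylized background.

    foreground, stylized_background: (H, W, 3) float images in [0, 1]
    mask: (H, W) binary foreground mask from a segmentation network
    """
    alpha = gaussian_filter(mask.astype(np.float64), sigma=feather_sigma)  # soften edges
    alpha = np.clip(alpha, 0.0, 1.0)[..., None]                            # (H, W, 1)
    return alpha * foreground + (1.0 - alpha) * stylized_background

h, w = 128, 128
fg, bg = np.random.rand(h, w, 3), np.random.rand(h, w, 3)
mask = np.zeros((h, w)); mask[32:96, 32:96] = 1
print(feathered_composite(fg, bg, mask).shape)   # (128, 128, 3)
```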
+
+
+
+
+ + ♻ ☆ A Lightweight Randomized Nonlinear Dictionary Learning Method using + Random Vector Functional Link + + +
+ Kernel-based nonlinear dictionary learning methods operate in a feature space obtained by an implicit feature map, and they depend on computationally expensive operations like Singular Value Decomposition (SVD). This paper presents an SVD-free, lightweight approach to learning a nonlinear dictionary using a randomized functional link called a Random Vector Functional Link (RVFL). The proposed RVFL-based nonlinear Dictionary Learning (RVFLDL) method learns a dictionary as a sparse-to-dense feature map from nonlinear sparse coefficients to the dense input features. Sparse coefficients with respect to an initial random dictionary, derived by assuming a Horseshoe prior, are used as inputs, making it a lightweight network. Training the RVFL-based dictionary is free from SVD computation, as RVFL generates the weights from the input to the output layer analytically. Higher-order dependencies between the input sparse coefficients and the dictionary atoms are incorporated into the training process by nonlinearly transforming the sparse coefficients and adding them as enhanced features. The method thus projects sparse coefficients into a higher-dimensional space while inducing nonlinearities into the dictionary. For classification using RVFL-net, a classifier matrix is learned as a transform that maps nonlinear sparse coefficients to the labels. Empirical evidence on image classification and reconstruction applications shows that RVFLDL is scalable and provides a better solution than other nonlinear dictionary learning methods.
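The "analytic" training mentioned above is the standard RVFL recipe: random, fixed input-to-hidden weights, then a closed-form ridge-regression solve for the output weights over the concatenation of the raw inputs (direct links) and their random nonlinear expansion. A minimal numpy sketch of that recipe only (the dictionary-learning specifics, the Horseshoe-prior sparse coding, and the hyperparameters are not reproduced here):

```python
import numpy as np

def train_rvfl(X, Y, hidden=128, reg=1e-2, seed=0):
    """Random Vector Functional Link trained in closed form.

    X: (n, d) inputs, Y: (n, k) targets. Hidden weights are random and fixed;
    only the output weights are learned, via a ridge-regression solve.
    """
    rng = np.random.default_rng(seed)
    W_in = rng.normal(size=(X.shape[1], hidden))
    b = rng.normal(size=hidden)
    H = np.hstack([X, np.tanh(X @ W_in + b)])      # direct links + random features
    # beta = (H^T H + reg * I)^{-1} H^T Y
    beta = np.linalg.solve(H.T @ H + reg * np.eye(H.shape[1]), H.T @ Y)
    return W_in, b, beta

def predict_rvfl(X, W_in, b, beta):
    H = np.hstack([X, np.tanh(X @ W_in + b)])
    return H @ beta

X = np.random.randn(200, 20)
Y = np.eye(3)[np.random.randint(0, 3, size=200)]   # one-hot targets
params = train_rvfl(X, Y)
print(predict_rvfl(X, *params).shape)               # (200, 3)
```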
+
+
+
+
+ + ♻ ☆ A Multilevel Guidance-Exploration Network and Behavior-Scene Matching + Method for Human Behavior Anomaly Detection + + +
+ Human behavior anomaly detection aims to identify unusual human actions, +playing a crucial role in intelligent surveillance and other areas. The current +mainstream methods still adopt reconstruction or future frame prediction +techniques. However, reconstructing or predicting low-level pixel features +easily enables the network to achieve overly strong generalization ability, +allowing anomalies to be reconstructed or predicted as effectively as normal +data. Different from their methods, inspired by the Student-Teacher Network, we +propose a novel framework called the Multilevel Guidance-Exploration +Network(MGENet), which detects anomalies through the difference in high-level +representation between the Guidance and Exploration network. Specifically, we +first utilize the pre-trained Normalizing Flow that takes skeletal keypoints as +input to guide an RGB encoder, which takes unmasked RGB frames as input, to +explore motion latent features. Then, the RGB encoder guides the mask encoder, +which takes masked RGB frames as input, to explore the latent appearance +feature. Additionally, we design a Behavior-Scene Matching Module(BSMM) to +detect scene-related behavioral anomalies. Extensive experiments demonstrate +that our proposed method achieves state-of-the-art performance on ShanghaiTech +and UBnormal datasets, with AUC of 86.9 % and 73.5 %, respectively. The code +will be available on https://github.com/molu-ggg/GENet. + +
+
+ comment: The experimental methods and results are incorrect and need to be + revised +
+
+
+
+
+ + ♻ ☆ Comparison of Methods in Human Skin Decomposition + + +
+ Decomposition of skin pigment plays an important role in medical fields. +Human skin can be decomposed into two primitive components, hemoglobin and +melanin. It is our goal to apply these results for diagnosis of skin cancer. In +this paper, various methods for skin pigment decomposition are reviewed +comparatively and the performance of each method is evaluated both +theoretically and experimentally. In addition, isometric feature mapping +(Isomap) is introduced in order to improve the dimensionality reduction +performance in context of skin decomposition. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty + Modeling for Universal Multi-Source Domain Adaptation ICME2024 + + +
+ Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from +multiple labeled source domains to an unlabeled target domain under domain +shifts (different data distribution) and class shifts (unknown target classes). +Existing solutions focus on excavating image features to detect unknown +samples, ignoring abundant information contained in textual semantics. In this +paper, we propose an Adaptive Prompt learning with Negative textual semantics +and uncErtainty modeling method based on Contrastive Language-Image +Pre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we +utilize the CLIP with adaptive prompts to leverage textual information of class +semantics and domain representations, helping the model identify unknown +samples and address domain shifts. Additionally, we design a novel global +instance-level alignment objective by utilizing negative textual semantics to +achieve more precise image-text pair alignment. Furthermore, we propose an +energy-based uncertainty modeling strategy to enlarge the margin distance +between known and unknown samples. Extensive experiments demonstrate the +superiority of our proposed method. + +
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ♻ ☆ ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Image CVPR 2024 + + +
+ We introduce a 3D-aware diffusion model, ZeroNVS, for single-image novel view +synthesis for in-the-wild scenes. While existing methods are designed for +single objects with masked backgrounds, we propose new techniques to address +challenges introduced by in-the-wild multi-object scenes with complex +backgrounds. Specifically, we train a generative prior on a mixture of data +sources that capture object-centric, indoor, and outdoor scenes. To address +issues from data mixture such as depth-scale ambiguity, we propose a novel +camera conditioning parameterization and normalization scheme. Further, we +observe that Score Distillation Sampling (SDS) tends to truncate the +distribution of complex backgrounds during distillation of 360-degree scenes, +and propose "SDS anchoring" to improve the diversity of synthesized novel +views. Our model sets a new state-of-the-art result in LPIPS on the DTU dataset +in the zero-shot setting, even outperforming methods specifically trained on +DTU. We further adapt the challenging Mip-NeRF 360 dataset as a new benchmark +for single-image novel view synthesis, and demonstrate strong performance in +this setting. Our code and data are at http://kylesargent.github.io/zeronvs/ + +
+
+ comment: Accepted to CVPR 2024. 12 pages +
+
+
+
+
+ + ♻ ☆ Learning with Unmasked Tokens Drives Stronger Vision Learners + + +
+ Masked image modeling (MIM) has become a leading self-supervised learning +strategy. MIMs such as Masked Autoencoder (MAE) learn strong representations by +randomly masking input tokens for the encoder to process, with the decoder +reconstructing the masked tokens to the input. However, MIM pre-trained +encoders often exhibit a limited attention span, attributed to MIM's sole focus +on regressing masked tokens only, which may impede the encoder's broader +context learning. To tackle the limitation, we improve MIM by explicitly +incorporating unmasked tokens into the training process. Specifically, our +method enables the encoder to learn from broader context supervision, allowing +unmasked tokens to experience broader contexts while the decoder reconstructs +masked tokens. Thus, the encoded unmasked tokens are equipped with extensive +contextual information, empowering masked tokens to leverage the enhanced +unmasked tokens for MIM. As a result, our simple remedy trains more +discriminative representations revealed by achieving 84.2% top-1 accuracy with +ViT-B on ImageNet-1K with 0.6%p gain. We attribute the success to the enhanced +pre-training method, as evidenced by the singular value spectrum and attention +analyses. Finally, our models achieve significant performance gains at the +downstream semantic segmentation and fine-grained visual classification tasks; +and on diverse robust evaluation metrics. Code is available at +https://github.com/naver-ai/lut + +
+
+
+
+
+ + ♻ ☆ Seeing Text in the Dark: Algorithm and Benchmark + + +
+ Localizing text in low-light environments is challenging due to visual degradations. Although a straightforward solution involves a two-stage pipeline with low-light image enhancement (LLE) as the initial step followed by a detector, LLE is primarily designed for human vision rather than machine vision and can accumulate errors. In this work, we propose an efficient and effective single-stage approach for localizing text in the dark that circumvents the need for LLE. We introduce a constrained learning module as an auxiliary mechanism during the training stage of the text detector. This module is designed to guide the text detector in preserving textual spatial features amidst feature map resizing, thus minimizing the loss of spatial information in text under low-light visual degradations. Specifically, we incorporate spatial reconstruction and spatial semantic constraints within this module to ensure the text detector acquires essential positional and contextual range knowledge. Our approach enhances the original text detector's ability to identify the local topological features of text using a dynamic snake feature pyramid network and adopts a bottom-up contour shaping strategy with a novel rectangular accumulation technique for accurate delineation of streamlined text features. In addition, we present a comprehensive low-light dataset for arbitrary-shaped text, encompassing diverse scenes and languages. Notably, our method achieves state-of-the-art results on this low-light dataset and exhibits comparable performance on standard normal-light datasets. The code and dataset will be released.
+
+
+
+
+ + ♻ ☆ Structure-Guided Image Completion with Image-level and Object-level + Semantic Discriminators + + +
+ Structure-guided image completion aims to inpaint a local region of an image +according to an input guidance map from users. While such a task enables many +practical applications for interactive editing, existing methods often struggle +to hallucinate realistic object instances in complex natural scenes. Such a +limitation is partially due to the lack of semantic-level constraints inside +the hole region as well as the lack of a mechanism to enforce realistic object +generation. In this work, we propose a learning paradigm that consists of +semantic discriminators and object-level discriminators for improving the +generation of complex semantics and objects. Specifically, the semantic +discriminators leverage pretrained visual features to improve the realism of +the generated visual concepts. Moreover, the object-level discriminators take +aligned instances as inputs to enforce the realism of individual objects. Our +proposed scheme significantly improves the generation quality and achieves +state-of-the-art results on various tasks, including segmentation-guided +completion, edge-guided manipulation and panoptically-guided manipulation on +Places2 datasets. Furthermore, our trained model is flexible and can support +multiple editing use cases, such as object insertion, replacement, removal and +standard inpainting. In particular, our trained model combined with a novel +automatic image completion pipeline achieves state-of-the-art results on the +standard inpainting task. + +
+
+ comment: 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Utility-Fairness Trade-Offs and How to Find Them + + +
+ When building classification systems with demographic fairness +considerations, there are two objectives to satisfy: 1) maximizing utility for +the specific task and 2) ensuring fairness w.r.t. a known demographic +attribute. These objectives often compete, so optimizing both can lead to a +trade-off between utility and fairness. While existing works acknowledge the +trade-offs and study their limits, two questions remain unanswered: 1) What are +the optimal trade-offs between utility and fairness? and 2) How can we +numerically quantify these trade-offs from data for a desired prediction task +and demographic attribute of interest? This paper addresses these questions. We +introduce two utility-fairness trade-offs: the Data-Space and Label-Space +Trade-off. The trade-offs reveal three regions within the utility-fairness +plane, delineating what is fully and partially possible and impossible. We +propose U-FaTE, a method to numerically quantify the trade-offs for a given +prediction task and group fairness definition from data samples. Based on the +trade-offs, we introduce a new scheme for evaluating representations. An +extensive evaluation of fair representation learning methods and +representations from over 1000 pre-trained models revealed that most current +approaches are far from the estimated and achievable fairness-utility +trade-offs across multiple datasets and prediction tasks. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024 +
+
+
+
+
+ + ♻ ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation + + +
+ Legged navigation is typically examined within open-world, off-road, and challenging environments. In these scenarios, estimating external disturbances requires a complex synthesis of multi-modal information. This underlines a major limitation in existing works that primarily focus on avoiding obstacles. In this work, we propose TOP-Nav, a novel legged navigation framework that integrates a comprehensive path planner with Terrain awareness, Obstacle avoidance and closed-loop Proprioception. TOP-Nav underscores the synergies between vision and proprioception in both path and motion planning. Within the path planner, we present and integrate a terrain estimator that enables the robot to select waypoints on terrains with higher traversability while effectively avoiding obstacles. At the motion planning level, we not only implement a locomotion controller to track the navigation commands, but also construct a proprioception advisor that provides motion evaluations for the path planner. Based on this closed-loop motion feedback, we make online corrections to the vision-based terrain and obstacle estimations. Consequently, TOP-Nav achieves open-world navigation in which the robot can handle terrains or disturbances beyond the distribution of prior knowledge and overcome constraints imposed by visual conditions. Building upon extensive experiments conducted in both simulation and real-world environments, TOP-Nav demonstrates superior performance in open-world navigation compared to existing methods.
+
+
+
+
+ + ♻ ☆ DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via + Cross-Task Interactions + + +
+ Weakly supervised segmentation methods have gained significant attention due +to their ability to reduce the reliance on costly pixel-level annotations +during model training. However, the current weakly supervised nuclei +segmentation approaches typically follow a two-stage pseudo-label generation +and network training process. The performance of the nuclei segmentation +heavily relies on the quality of the generated pseudo-labels, thereby limiting +its effectiveness. This paper introduces a novel domain-adaptive weakly +supervised nuclei segmentation framework using cross-task interaction +strategies to overcome the challenge of pseudo-label generation. Specifically, +we utilize weakly annotated data to train an auxiliary detection task, which +assists the domain adaptation of the segmentation network. To enhance the +efficiency of domain adaptation, we design a consistent feature constraint +module integrating prior knowledge from the source domain. Furthermore, we +develop pseudo-label optimization and interactive training methods to improve +the domain transfer capability. To validate the effectiveness of our proposed +method, we conduct extensive comparative and ablation experiments on six +datasets. The results demonstrate the superiority of our approach over existing +weakly supervised approaches. Remarkably, our method achieves comparable or +even better performance than fully supervised methods. Our code will be +released in https://github.com/zhangye-zoe/DAWN. + +
+
+ comment: 13 pages, 11 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ A sensitivity analysis to quantify the impact of neuroimaging + preprocessing strategies on subsequent statistical analyses + + +
+ Even though novel imaging techniques have been successful in studying brain +structure and function, the measured biological signals are often contaminated +by multiple sources of noise, arising due to e.g. head movements of the +individual being scanned, limited spatial/temporal resolution, or other issues +specific to each imaging technology. Data preprocessing (e.g. denoising) is +therefore critical. Preprocessing pipelines have become increasingly complex +over the years, but also more flexible, and this flexibility can have a +significant impact on the final results and conclusions of a given study. This +large parameter space is often referred to as multiverse analyses. Here, we +provide conceptual and practical tools for statistical analyses that can +aggregate multiple pipeline results along with a new sensitivity analysis +testing for hypotheses across pipelines such as "no effect across all +pipelines" or "at least one pipeline with no effect". The proposed framework is +generic and can be applied to any multiverse scenario, but we illustrate its +use based on positron emission tomography data. + +
+
+
+
+
+ + ♻ ☆ Effective Decision Boundary Learning for Class Incremental Learning + + +
+ Rehearsal approaches in class incremental learning (CIL) suffer from decision boundary overfitting to new classes, which is mainly caused by two factors: insufficient old-class data for knowledge distillation and imbalanced data learning between the learned and new classes because of the limited storage memory. In this work, we present a simple but effective approach to tackle these two factors. First, we employ a re-sampling strategy with Mixup Knowledge Distillation (Re-MKD) to improve the performance of KD, which greatly alleviates the overfitting problem. Specifically, we combine mixup and re-sampling strategies to synthesize adequate data for KD training that are more consistent with the latent distribution between the learned and new classes. Second, we propose a novel incremental influence balance (IIB) method for CIL to tackle the classification of imbalanced data by extending the influence balance method to the CIL setting, which re-weights samples by their influence to create a proper decision boundary. With these two improvements, we present the effective decision boundary learning algorithm (EDBL), which improves the performance of KD and deals with imbalanced data learning simultaneously. Experiments show that the proposed EDBL achieves state-of-the-art performance on several CIL benchmarks.
+
+
+
+
+ + ♻ ☆ Exploring Feedback Generation in Automated Skeletal Movement Assessment: + A Comprehensive Overview + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection and analysis from 2D or 3D videos. While the primary +objective of automatic assessment tasks is to score movements, the automatic +generation of feedback highlighting key movement issues has the potential to +significantly enhance and accelerate the rehabilitation process. While numerous +research works exist in the field of automatic movement assessment, only a +handful address feedback generation. In this study, we explain the types of +feedback that can be generated, review existing solutions for automatic +feedback generation, and discuss future research directions. To our knowledge, +this is the first comprehensive review of feedback generation in skeletal +movement assessment. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Character Identification and Speaker Prediction in Comics via + Iterative Multimodal Fusion + + +
+ Recognizing characters and predicting the speakers of dialogue are critical for comic processing tasks, such as voice generation or translation. However, because characters vary by comic title, supervised learning approaches like training character classifiers, which require specific annotations for each comic title, are infeasible. This motivates us to propose a novel zero-shot approach, allowing machines to identify characters and predict speaker names based solely on unannotated comic images. In spite of their importance in real-world applications, these tasks have largely remained unexplored due to challenges in story comprehension and multimodal integration. Recent large language models (LLMs) have shown great capability for text understanding and reasoning, while their application to multimodal content analysis is still an open problem. To address this problem, we propose an iterative multimodal framework, the first to employ multimodal information for both character identification and speaker prediction tasks. Our experiments demonstrate the effectiveness of the proposed framework, establishing a robust baseline for these tasks. Furthermore, since our method requires no training data or annotations, it can be used as-is on any comic series.
+
+
+
+
+ + ♻ ☆ MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets + + +
+ Driven by powerful image diffusion models, recent research has achieved the +automatic creation of 3D objects from textual or visual guidance. By performing +score distillation sampling (SDS) iteratively across different views, these +methods succeed in lifting 2D generative prior to the 3D space. However, such a +2D generative image prior bakes the effect of illumination and shadow into the +texture. As a result, material maps optimized by SDS inevitably involve +spurious correlated components. The absence of precise material definition +makes it infeasible to relight the generated assets reasonably in novel scenes, +which limits their application in downstream scenarios. In contrast, humans can +effortlessly circumvent this ambiguity by deducing the material of the object +from its appearance and semantics. Motivated by this insight, we propose +MaterialSeg3D, a 3D asset material generation framework to infer underlying +material from the 2D semantic prior. Based on such a prior model, we devise a +mechanism to parse material in 3D space. We maintain a UV stack, each map of +which is unprojected from a specific viewpoint. After traversing all +viewpoints, we fuse the stack through a weighted voting scheme and then employ +region unification to ensure the coherence of the object parts. To fuel the +learning of semantics prior, we collect a material dataset, named Materialized +Individual Objects (MIO), which features abundant images, diverse categories, +and accurate annotations. Extensive quantitative and qualitative experiments +demonstrate the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Bracketing Image Restoration and Enhancement with High-Low Frequency + Decomposition CVPR 2024 + + +
+ In real-world scenarios, due to a series of image degradations, obtaining +high-quality, clear content photos is challenging. While significant progress +has been made in synthesizing high-quality images, previous methods for image +restoration and enhancement often overlooked the characteristics of different +degradations. They applied the same structure to address various types of +degradation, resulting in less-than-ideal restoration outcomes. Inspired by the +notion that high/low frequency information is applicable to different +degradations, we introduce HLNet, a Bracketing Image Restoration and +Enhancement method based on high-low frequency decomposition. Specifically, we +employ two modules for feature extraction: shared weight modules and non-shared +weight modules. In the shared weight modules, we use SCConv to extract common +features from different degradations. In the non-shared weight modules, we +introduce the High-Low Frequency Decomposition Block (HLFDB), which employs +different methods to handle high-low frequency information, enabling the model +to address different degradations more effectively. Compared to other networks, +our method takes into account the characteristics of different degradations, +thus achieving higher-quality image restoration. + +
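A common way to realize the high-low frequency split described above is to take a blurred (low-pass) copy of the feature map as the low-frequency branch and the residual as the high-frequency branch, then process the two with different operators. A hedged PyTorch sketch of that split (the average-pooling low-pass filter and the branch operators are illustrative, not HLNet's actual HLFDB):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class HighLowSplitBlock(nn.Module):
    """Split features into low/high frequency parts and process them differently."""
    def __init__(self, channels: int):
        super().__init__()
        self.low_branch = nn.Conv2d(channels, channels, 3, padding=1)   # smooth content
        self.high_branch = nn.Conv2d(channels, channels, 1)             # edges / detail

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        low = F.avg_pool2d(x, kernel_size=4)                  # crude low-pass filter
        low = F.interpolate(low, size=x.shape[-2:], mode="bilinear", align_corners=False)
        high = x - low                                        # residual = high frequencies
        return self.low_branch(low) + self.high_branch(high)

feats = torch.randn(1, 32, 64, 64)
print(HighLowSplitBlock(32)(feats).shape)   # torch.Size([1, 32, 64, 64])
```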
+
+ comment: This paper is accepted by CVPR 2024 Workshop, code: + https://github.com/chengeng0613/HLNet +
+
+
+
+
+ + ♻ ☆ ULIP-2: Towards Scalable Multimodal Pre-training for 3D Understanding CVPR2024 + + +
+ Recent advancements in multimodal pre-training have shown promising efficacy +in 3D representation learning by aligning multimodal features across 3D shapes, +their 2D counterparts, and language descriptions. However, the methods used by +existing frameworks to curate such multimodal data, in particular language +descriptions for 3D shapes, are not scalable, and the collected language +descriptions are not diverse. To address this, we introduce ULIP-2, a simple +yet effective tri-modal pre-training framework that leverages large multimodal +models to automatically generate holistic language descriptions for 3D shapes. +It only needs 3D data as input, eliminating the need for any manual 3D +annotations, and is therefore scalable to large datasets. ULIP-2 is also +equipped with scaled-up backbones for better multimodal representation +learning. We conduct experiments on two large-scale 3D datasets, Objaverse and +ShapeNet, and augment them with tri-modal datasets of 3D point clouds, images, +and language for training ULIP-2. Experiments show that ULIP-2 demonstrates +substantial benefits in three downstream tasks: zero-shot 3D classification, +standard 3D classification with fine-tuning, and 3D captioning (3D-to-language +generation). It achieves a new SOTA of 50.6% (top-1) on Objaverse-LVIS and +84.7% (top-1) on ModelNet40 in zero-shot classification. In the ScanObjectNN +benchmark for standard fine-tuning, ULIP-2 reaches an overall accuracy of 91.5% +with a compact model of only 1.4 million parameters. ULIP-2 sheds light on a +new paradigm for scalable multimodal 3D representation learning without human +annotations and shows significant improvements over existing baselines. The +code and datasets are released at https://github.com/salesforce/ULIP. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Discffusion: Discriminative Diffusion Models as Few-shot Vision and + Language Learners + + +
+ Diffusion models, such as Stable Diffusion, have shown incredible performance +on text-to-image generation. Since text-to-image generation often requires +models to generate visual concepts with fine-grained details and attributes +specified in text prompts, can we leverage the powerful representations learned +by pre-trained diffusion models for discriminative tasks such as image-text +matching? To answer this question, we propose a novel approach, Discriminative +Stable Diffusion (DSD), which turns pre-trained text-to-image diffusion models +into few-shot discriminative learners. Our approach mainly uses the +cross-attention score of a Stable Diffusion model to capture the mutual +influence between visual and textual information and fine-tune the model via +efficient attention-based prompt learning to perform image-text matching. By +comparing DSD with state-of-the-art methods on several benchmark datasets, we +demonstrate the potential of using pre-trained diffusion models for +discriminative tasks with superior results on few-shot image-text matching. + +
+
+
+
+
+ + ♻ ☆ NiNformer: A Network in Network Transformer with Token Mixing Generated + Gating Function + + +
+ The Attention mechanism is the main component of the Transformer architecture, and since its introduction, it has led to significant advancements in Deep Learning that span many domains and multiple tasks. The Attention mechanism was utilized in Computer Vision as the Vision Transformer (ViT), and its usage has expanded into many tasks in the vision domain, such as classification, segmentation, object detection, and image generation. While this mechanism is very expressive and capable, it comes with the drawbacks of being computationally expensive and requiring datasets of considerable size for effective optimization. To address these shortcomings, many designs have been proposed in the literature to reduce the computational burden and alleviate the data size requirements. Examples of such attempts in the vision domain are the MLP-Mixer, the Conv-Mixer, the Perceiver-IO, and many more. This paper introduces a new computational block as an alternative to the standard ViT block that reduces the computational burden by replacing the normal Attention layers with a Network in Network structure that enhances the static approach of the MLP-Mixer with a dynamic system that learns an element-wise gating function via a token mixing process. Extensive experimentation shows that the proposed design provides better performance than the baseline architectures on multiple datasets for the image classification task in the vision domain.
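A hedged PyTorch sketch of the kind of block described above: a token-mixing MLP produces an element-wise sigmoid gate that modulates a channel MLP's output, replacing attention entirely. The layer sizes, normalization placement, and residual wiring are assumptions for illustration, not the paper's exact block:

```python
import torch
import torch.nn as nn

class GatedTokenMixingBlock(nn.Module):
    """Attention-free block: a token-mixing MLP generates an element-wise gate."""
    def __init__(self, num_tokens: int, dim: int, hidden: int = 256):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.token_mix = nn.Sequential(      # mixes information across tokens
            nn.Linear(num_tokens, hidden), nn.GELU(), nn.Linear(hidden, num_tokens))
        self.channel_mlp = nn.Sequential(    # mixes information across channels
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (batch, tokens, dim)
        y = self.norm(x)
        gate = torch.sigmoid(self.token_mix(y.transpose(1, 2)).transpose(1, 2))
        return x + gate * self.channel_mlp(y)              # gated, residual update

tokens = torch.randn(2, 196, 128)
print(GatedTokenMixingBlock(num_tokens=196, dim=128)(tokens).shape)   # (2, 196, 128)
```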
+
+
+
+
+ + ♻ ☆ A voxel-level approach to brain age prediction: A method to assess + regional brain aging + + +
+ Brain aging is a regional phenomenon, a facet that remains relatively +under-explored within the realm of brain age prediction research using machine +learning methods. Voxel-level predictions can provide localized brain age +estimates that can provide granular insights into the regional aging processes. +This is essential to understand the differences in aging trajectories in +healthy versus diseased subjects. In this work, a deep learning-based multitask +model is proposed for voxel-level brain age prediction from T1-weighted +magnetic resonance images. The proposed model outperforms the models existing +in the literature and yields valuable clinical insights when applied to both +healthy and diseased populations. Regional analysis is performed on the +voxel-level brain age predictions to understand aging trajectories of known +anatomical regions in the brain and show that there exist disparities in +regional aging trajectories of healthy subjects compared to ones with +underlying neurological disorders such as Dementia and more specifically, +Alzheimer's disease. Our code is available at +https://github.com/nehagianchandani/Voxel-level-brain-age-prediction. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:007 +
+
+
+
+
+ + ♻ ☆ MARVEL: Multidimensional Abstraction and Reasoning through Visual + Evaluation and Learning + + +
+ While multi-modal large language models (MLLMs) have shown significant +progress on many popular visual reasoning benchmarks, whether they possess +abstract visual reasoning abilities remains an open question. Similar to the +Sudoku puzzles, abstract visual reasoning (AVR) problems require finding +high-level patterns (e.g., repetition constraints) that control the input +shapes (e.g., digits) in a specific task configuration (e.g., matrix). However, +existing AVR benchmarks only considered a limited set of patterns (addition, +conjunction), input shapes (rectangle, square), and task configurations (3 by 3 +matrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce +MARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core +knowledge patterns, geometric and abstract shapes, and five different task +configurations. To inspect whether the model accuracy is grounded in perception +and reasoning, MARVEL complements the general AVR question with perception +questions in a hierarchical evaluation framework. We conduct comprehensive +experiments on MARVEL with nine representative MLLMs in zero-shot and few-shot +settings. Our experiments reveal that all models show near-random performance +on the AVR question, with significant performance gaps (40%) compared to humans +across all patterns and task configurations. Further analysis of perception +questions reveals that MLLMs struggle to comprehend the visual features +(near-random performance) and even count the panels in the puzzle ( <45%), +hindering their ability for abstract reasoning. We release our entire code and +dataset. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Representation Learning for 3D MRI Super Resolution with + Degradation Adaptation + + +
+ High-resolution (HR) magnetic resonance imaging is critical in aiding doctors +in their diagnoses and image-guided treatments. However, acquiring HR images +can be time-consuming and costly. Consequently, deep learning-based +super-resolution reconstruction (SRR) has emerged as a promising solution for +generating super-resolution (SR) images from low-resolution (LR) images. +Unfortunately, training such neural networks requires aligned authentic HR and +LR image pairs, which are challenging to obtain due to patient movements during +and between image acquisitions. While rigid movements of hard tissues can be +corrected with image registration, aligning deformed soft tissues is complex, +making it impractical to train neural networks with authentic HR and LR image +pairs. Previous studies have focused on SRR using authentic HR images and +down-sampled synthetic LR images. However, the difference in degradation +representations between synthetic and authentic LR images suppresses the +quality of SR images reconstructed from authentic LR images. To address this +issue, we propose a novel Unsupervised Degradation Adaptation Network (UDEAN). +Our network consists of a degradation learning network and an SRR network. The +degradation learning network downsamples the HR images using the degradation +representation learned from the misaligned or unpaired LR images. The SRR +network then learns the mapping from the down-sampled HR images to the original +ones. Experimental results show that our method outperforms state-of-the-art +networks and is a promising solution to the challenges in clinical settings. + +
+
+ comment: Accepted by IEEE Transactions on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus + on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs) + + +
+ Pediatric tumors of the central nervous system are the most common cause of +cancer-related death in children. The five-year survival rate for high-grade +gliomas in children is less than 20%. Due to their rarity, the diagnosis of +these entities is often delayed, their treatment is mainly based on historic +treatment concepts, and clinical trials require multi-institutional +collaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs +challenge, focused on pediatric brain tumors with data acquired across multiple +international consortia dedicated to pediatric neuro-oncology and clinical +trials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together +clinicians and AI/imaging scientists to lead to faster development of automated +segmentation techniques that could benefit clinical trials, and ultimately the +care of children with brain tumors. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2305.17033 +
+
+
+
+
+ + ♻ ☆ You Only Look at Once for Real-time and Generic Multi-Task + + +
+ High precision, lightweight design, and real-time responsiveness are three essential requirements for implementing autonomous driving. In this study, we present A-YOLOM, an adaptive, real-time, and lightweight multi-task model designed to concurrently address object detection, drivable area segmentation, and lane line segmentation tasks. Specifically, we develop an end-to-end multi-task model with a unified and streamlined segmentation structure. We introduce a learnable parameter that adaptively concatenates features between the necks and the backbone in segmentation tasks, using the same loss function for all segmentation tasks. This eliminates the need for customization and enhances the model's generalization capabilities. We also introduce a segmentation head composed only of a series of convolutional layers, which reduces the number of parameters and the inference time. We achieve competitive results on the BDD100k dataset, particularly in visualization outcomes. The performance results show a mAP50 of 81.1% for object detection, a mIoU of 91.0% for drivable area segmentation, and an IoU of 28.8% for lane line segmentation. Additionally, we introduce real-world scenarios to evaluate our model's performance in a real scene, where it significantly outperforms competitors. This demonstrates that our model not only exhibits competitive performance but is also more flexible and faster than existing multi-task models. The source codes and pre-trained models are released at https://github.com/JiayuanWang-JW/YOLOv8-multi-task
+
+
+
+
+ + ♻ ☆ Real-Time Simulated Avatar from Head-Mounted Sensors CVPR 2024 + + +
+ We present SimXR, a method for controlling a simulated avatar from +information (headset pose and cameras) obtained from AR / VR headsets. Due to +the challenging viewpoint of head-mounted cameras, the human body is often +clipped out of view, making traditional image-based egocentric pose estimation +challenging. On the other hand, headset poses provide valuable information +about overall body motion, but lack fine-grained details about the hands and +feet. To synergize headset poses with cameras, we control a humanoid to track +headset movement while analyzing input images to decide body movement. When +body parts are seen, the movements of hands and feet will be guided by the +images; when unseen, the laws of physics guide the controller to generate +plausible motion. We design an end-to-end method that does not rely on any +intermediate representations and learns to directly map from images and headset +poses to humanoid control signals. To train our method, we also propose a +large-scale synthetic dataset created using camera configurations compatible +with a commercially available VR headset (Quest 2) and show promising results +on real-world captures. To demonstrate the applicability of our framework, we +also test it on an AR headset with a forward-facing camera. + +
+
+ comment: CVPR 2024 Highlight. Website: https://www.zhengyiluo.com/SimXR/ +
+
+
+
+
+ + ♻ ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as full paper at FG 2024 main track +
+
+
+
+
+ + ♻ ☆ EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS + + +
+ Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view +scene synthesis. It addresses the challenges of lengthy training times and slow +rendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid, +differentiable rasterization of 3D Gaussians, 3D-GS achieves real-time +rendering and accelerated training. These methods, however, demand substantial memory +resources for both training and storage, as they require millions of Gaussians +in their point cloud representation for each scene. We present a technique +utilizing quantized embeddings to significantly reduce per-point memory storage +requirements and a coarse-to-fine training strategy for a faster and more +stable optimization of the Gaussian point clouds. Our approach includes a +pruning stage which results in scene representations with fewer Gaussians, +leading to faster training times and rendering speeds for real-time rendering +of high resolution scenes. We reduce storage memory by more than an order of +magnitude all while preserving the reconstruction quality. We validate the +effectiveness of our approach on a variety of datasets and scenes, preserving +the visual quality while consuming 10-20x less memory and achieving faster +training/inference speed. The project page and code are available at +https://efficientgaussian.github.io + +
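+ The quantized-embedding idea above can be pictured with the following rough sketch (not the paper's actual scheme or code; the latent dimensionality, bit width, and decoder are assumptions): per-Gaussian attributes are stored as low-bit latent codes and decoded on the fly, with a straight-through estimator letting gradients flow through the rounding step.
+ import torch
+ import torch.nn as nn
+ 
+ class QuantizedAttribute(nn.Module):
+     """Hypothetical per-point attribute stored as a quantized latent code
+     and decoded by a tiny MLP (e.g. into per-Gaussian RGB)."""
+     def __init__(self, num_points, dim=8, bits=8):
+         super().__init__()
+         self.latent = nn.Parameter(torch.zeros(num_points, dim))
+         self.levels = 2 ** bits - 1
+         self.decoder = nn.Sequential(nn.Linear(dim, 32), nn.ReLU(), nn.Linear(32, 3))
+ 
+     def forward(self):
+         z = torch.sigmoid(self.latent)
+         q = torch.round(z * self.levels) / self.levels  # low-bit quantization
+         z_q = z + (q - z).detach()                      # straight-through gradient
+         return self.decoder(z_q)
+ 
+ colors = QuantizedAttribute(num_points=100_000)()
+ print(colors.shape)  # torch.Size([100000, 3])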
+
+ comment: Website: https://efficientgaussian.github.io Code: + https://github.com/Sharath-girish/efficientgaussian +
+
+
+
+
+ + ♻ ☆ SiMBA: Simplified Mamba-Based Architecture for Vision and Multivariate + Time series + + +
+ Transformers have widely adopted attention networks for sequence mixing and +MLPs for channel mixing, playing a pivotal role in achieving breakthroughs +across domains. However, recent literature highlights issues with attention +networks, including low inductive bias and quadratic complexity concerning +input sequence length. State Space Models (SSMs) like S4 and others (Hippo, +Global Convolutions, liquid S4, LRU, Mega, and Mamba) have emerged to address +these issues and help handle longer sequence lengths. Mamba, while being the +state-of-the-art SSM, has a stability issue when scaled to large networks for +computer vision datasets. We propose SiMBA, a new architecture that introduces +Einstein FFT (EinFFT) for channel modeling by specific eigenvalue computations +and uses the Mamba block for sequence modeling. Extensive performance studies +across image and time-series benchmarks demonstrate that SiMBA outperforms +existing SSMs, bridging the performance gap with state-of-the-art transformers. +Notably, SiMBA establishes itself as the new state-of-the-art SSM on ImageNet, +on transfer learning benchmarks such as Stanford Cars and Flowers, on task +learning benchmarks, and on seven time series benchmark datasets. The +project page is available at https://github.com/badripatro/Simba. + +
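+ For intuition only, the snippet below shows a generic frequency-domain channel mixer (this is not EinFFT, which relies on specific eigenvalue computations; the sketch merely illustrates the broader pattern of mixing channels in the Fourier domain that the abstract alludes to).
+ import torch
+ import torch.nn as nn
+ 
+ class FFTChannelMixer(nn.Module):
+     """Illustrative channel mixing in frequency space: FFT over the token axis,
+     a learned complex channel-mixing matrix, then inverse FFT."""
+     def __init__(self, dim):
+         super().__init__()
+         self.w_real = nn.Parameter(torch.randn(dim, dim) * 0.02)
+         self.w_imag = nn.Parameter(torch.randn(dim, dim) * 0.02)
+ 
+     def forward(self, x):                  # x: (batch, tokens, dim)
+         xf = torch.fft.rfft(x, dim=1)      # complex spectrum over tokens
+         w = torch.complex(self.w_real, self.w_imag)
+         yf = xf @ w                        # per-frequency channel mixing
+         return torch.fft.irfft(yf, n=x.shape[1], dim=1)
+ 
+ print(FFTChannelMixer(96)(torch.randn(2, 196, 96)).shape)  # torch.Size([2, 196, 96])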
+
+
+
+
+ + ♻ ☆ Achieving More Human Brain-Like Vision via Human EEG Representational + Alignment + + +
+ Despite advancements in artificial intelligence, object recognition models +still lag behind in emulating visual information processing in human brains. +Recent studies have highlighted the potential of using neural data to mimic +brain processing; however, these often rely on invasive neural recordings from +non-human subjects, leaving a critical gap in understanding human visual +perception. Addressing this gap, we present, for the first time, +'Re(presentational)Al(ignment)net', a vision model aligned with human brain +activity based on non-invasive EEG, demonstrating a significantly higher +similarity to human brain representations. Our innovative image-to-brain +multi-layer encoding framework advances human neural alignment by optimizing +multiple model layers and enabling the model to efficiently learn and mimic +human brain's visual representational patterns across object categories and +different modalities. Our findings suggest that ReAlnet represents a +breakthrough in bridging the gap between artificial and human vision, and +paving the way for more brain-like artificial intelligence systems. + +
+
+
+
+
+ + ♻ ☆ PTT: Point-Trajectory Transformer for Efficient Temporal 3D Object + Detection CVPR 2024 + + +
+ Recent temporal LiDAR-based 3D object detectors achieve promising performance +based on the two-stage proposal-based approach. They generate 3D box candidates +from the first-stage dense detector, followed by different temporal aggregation +methods. However, these approaches require per-frame objects or whole point +clouds, posing challenges related to memory bank utilization. Moreover, point +clouds and trajectory features are combined solely based on concatenation, +which may neglect effective interactions between them. In this paper, we +propose a point-trajectory transformer with long short-term memory for +efficient temporal 3D object detection. To this end, we only utilize point +clouds of current-frame objects and their historical trajectories as input to +minimize the memory bank storage requirement. Furthermore, we introduce modules +to encode trajectory features, focusing on long short-term and future-aware +perspectives, and then effectively aggregate them with point cloud features. We +conduct extensive experiments on the large-scale Waymo dataset to demonstrate +that our approach performs well against state-of-the-art methods. Code and +models will be made publicly available at https://github.com/kuanchihhuang/PTT. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://github.com/kuanchihhuang/PTT +
+
+
+
+
+ + ♻ ☆ Specialty-Oriented Generalist Medical AI for Chest CT Screening + + +
+ Modern medical records include a vast amount of multimodal free text clinical +data and imaging data from radiology, cardiology, and digital pathology. Fully +mining such big data requires multitasking; otherwise, occult but important +aspects may be overlooked, adversely affecting clinical management and +population healthcare. Despite remarkable successes of AI in individual tasks +with single-modal data, the progress in developing generalist medical AI +remains relatively slow to combine multimodal data for multitasks because of +the dual challenges of data curation and model architecture. The data challenge +involves querying and curating multimodal structured and unstructured text, +alphanumeric, and especially 3D tomographic scans on an individual patient +level for real-time decisions and on a scale to estimate population health +statistics. The model challenge demands a scalable and adaptable network +architecture to integrate multimodal datasets for diverse clinical tasks. Here +we propose the first-of-its-kind medical multimodal-multitask foundation model +(M3FM) with application in lung cancer screening and related tasks. After we +curated a comprehensive multimodal multitask dataset consisting of 49 clinical +data types including 163,725 chest CT series and 17 medical tasks involved in +LCS, we develop a multimodal question-answering framework as a unified training +and inference strategy to synergize multimodal information and perform multiple +tasks via free-text prompting. M3FM consistently outperforms the +state-of-the-art single-modal task-specific models, identifies multimodal data +elements informative for clinical tasks and flexibly adapts to new tasks with a +small out-of-distribution dataset. As a specialty-oriented generalist medical +AI model, M3FM paves the way for similar breakthroughs in other areas of +medicine, closing the gap between specialists and the generalist. + +
+
+
+
+
+ + ♻ ☆ Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis SIGGRAPH 2024 + + +
+ Designing a 3D representation of a dynamic scene for fast optimization and +rendering is a challenging task. While recent explicit representations enable +fast learning and rendering of dynamic radiance fields, they require a dense +set of input viewpoints. In this work, we focus on learning a fast +representation for dynamic radiance fields with sparse input viewpoints. +However, the optimization with sparse input is under-constrained and +necessitates the use of motion priors to constrain the learning. Existing fast +dynamic scene models do not explicitly model the motion, making them difficult +to be constrained with motion priors. We design an explicit motion model as a +factorized 4D representation that is fast and can exploit the spatio-temporal +correlation of the motion field. We then introduce reliable flow priors +including a combination of sparse flow priors across cameras and dense flow +priors within cameras to regularize our motion model. Our model is fast, +compact and achieves very good performance on popular multi-view dynamic scene +datasets with sparse input viewpoints. The source code for our model can be +found on our project page: +https://nagabhushansn95.github.io/publications/2024/RF-DeRF.html. + +
+
+ comment: Accepted at SIGGRAPH 2024 +
+
+
+
+
+ + ♻ ☆ LatentForensics: Towards frugal deepfake detection in the StyleGAN + latent space + + +
+ The classification of forged videos has been a challenge for the past few +years. Deepfake classifiers can now reliably predict whether or not video +frames have been tampered with. However, their performance is tied to both the +dataset used for training and the analyst's computational power. We propose a +deepfake detection method that operates in the latent space of a +state-of-the-art generative adversarial network (GAN) trained on high-quality +face images. The proposed method leverages the structure of the latent space of +StyleGAN to learn a lightweight binary classification model. Experimental +results on standard datasets reveal that the proposed approach outperforms +other state-of-the-art deepfake classification methods, especially in contexts +where the data available to train the models is rare, such as when a new +manipulation method is introduced. To the best of our knowledge, this is the +first study showing the interest of the latent space of StyleGAN for deepfake +classification. Combined with other recent studies on the interpretation and +manipulation of this latent space, we believe that the proposed approach can +further help in developing frugal deepfake classification methods based on +interpretable high-level properties of face images. + +
+
+ comment: 7 pages, 3 figures, 5 tables, submitted to IPAI 2024 +
+
+
+
+
+ + ♻ ☆ Removing Reflections from RAW Photos + + +
+ We describe a system to remove real-world reflections from images for +consumer photography. Our system operates on linear (RAW) photos, with the +(optional) addition of a contextual photo looking in the opposite direction, +e.g., using the selfie camera on a mobile device, which helps disambiguate what +should be considered the reflection. The system is trained using synthetic +mixtures of real-world RAW images, which are combined using a reflection +simulation that is photometrically and geometrically accurate. Our system +consists of a base model that accepts the captured photo and optional +contextual photo as input, and runs at 256p, followed by an up-sampling model +that transforms output 256p images to full resolution. The system can produce +images for review at 1K in 4.5 to 6.5 seconds on a MacBook or iPhone 14 Pro. We +test on RAW photos that were captured in the field and embody typical consumer +photographs. + +
+
+ comment: 14 pages plus 22 pages of supplemental material +
+
+
+
+
+ + ♻ ☆ NU-Class Net: A Novel Deep Learning-based Approach for Video Quality + Enhancement + + +
+ Video content has experienced a surge in popularity, asserting its dominance +over internet traffic and Internet of Things (IoT) networks. Video compression +has long been regarded as the primary means of efficiently managing the +substantial multimedia traffic generated by video-capturing devices. +Nevertheless, video compression algorithms entail significant computational +demands in order to achieve substantial compression ratios. This complexity +presents a formidable challenge when implementing efficient video coding +standards in resource-constrained embedded systems, such as IoT edge node +cameras. To tackle this challenge, this paper introduces NU-Class Net, an +innovative deep-learning model designed to mitigate compression artifacts +stemming from lossy compression codecs. This enhancement significantly elevates +the perceptible quality of low-bit-rate videos. By employing the NU-Class Net, +the video encoder within the video-capturing node can reduce output quality, +thereby generating low-bit-rate videos and effectively curtailing both +computation and bandwidth requirements at the edge. On the decoder side, which +is typically less encumbered by resource limitations, NU-Class Net is applied +after the video decoder to compensate for artifacts and approximate the quality +of the original video. Experimental results affirm the efficacy of the proposed +model in enhancing the perceptible quality of videos, especially those streamed +at low bit rates. + +
+
+
+
+
+ + ♻ ☆ MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video + Understanding CVPR 2024 + + +
+ With the success of large language models (LLMs), integrating the vision +model into LLMs to build vision-language foundation models has gained much more +interest recently. However, existing LLM-based large multimodal models (e.g., +Video-LLaMA, VideoChat) can only take in a limited number of frames for short +video understanding. In this study, we mainly focus on designing an efficient +and effective model for long-term video understanding. Instead of trying to +process more frames simultaneously like most existing work, we propose to +process videos in an online manner and store past video information in a memory +bank. This allows our model to reference historical video content for long-term +analysis without exceeding LLMs' context length constraints or GPU memory +limits. Our memory bank can be seamlessly integrated into current multimodal +LLMs in an off-the-shelf manner. We conduct extensive experiments on various +video understanding tasks, such as long-video understanding, video question +answering, and video captioning, and our model can achieve state-of-the-art +performances across multiple datasets. Code available at +https://boheumd.github.io/MA-LMM/. + +
+
+ comment: Accepted at CVPR 2024. Project Page https://boheumd.github.io/MA-LMM/ +
+
+
+
+
+ + ♻ ☆ NToP: NeRF-Powered Large-scale Dataset Generation for 2D and 3D Human + Pose Estimation in Top-View Fisheye Images + + +
+ Human pose estimation (HPE) in the top-view using fisheye cameras presents a +promising and innovative application domain. However, the availability of +datasets capturing this viewpoint is extremely limited, especially those with +high-quality 2D and 3D keypoint annotations. Addressing this gap, we leverage +the capabilities of Neural Radiance Fields (NeRF) technique to establish a +comprehensive pipeline for generating human pose datasets from existing 2D and +3D datasets, specifically tailored for the top-view fisheye perspective. +Through this pipeline, we create a novel dataset NToP570K (NeRF-powered +Top-view human Pose dataset for fisheye cameras with over 570 thousand images), +and conduct an extensive evaluation of its efficacy in enhancing neural +networks for 2D and 3D top-view human pose estimation. A pretrained ViTPose-B +model achieves an improvement in AP of 33.3 % on our validation set for 2D HPE +after finetuning on our training set. A similarly finetuned HybrIK-Transformer +model gains 53.7 mm reduction in PA-MPJPE for 3D HPE on the validation set. + +
+
+
+
+
+ + ♻ ☆ A Hierarchical Architecture for Neural Materials + + +
+ Neural reflectance models are capable of reproducing the spatially-varying +appearance of many real-world materials at different scales. Unfortunately, +existing techniques such as NeuMIP have difficulties handling materials with +strong shadowing effects or detailed specular highlights. In this paper, we +introduce a neural appearance model that offers a new level of accuracy. +Central to our model is an inception-based core network structure that captures +material appearances at multiple scales using parallel-operating kernels and +ensures multi-stage features through specialized convolution layers. +Furthermore, we encode the inputs into frequency space, introduce a +gradient-based loss, and employ it adaptive to the progress of the learning +phase. We demonstrate the effectiveness of our method using a variety of +synthetic and real examples. + +
+
+
+
+
+ + ♻ ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is +crucial in modern spatial data analysis. Volumetric data applications seek +techniques ensuring invariance not only to shifts but also to pattern +rotations. While traditional methods can readily achieve translational +invariance, rotational invariance possesses multiple challenges and remains an +active area of research. Here, we present ILPO-Net (Invariant to Local Patterns +Orientation Network), a novel approach that handles arbitrarily shaped patterns +with the convolutional operation inherently invariant to local spatial pattern +orientations using the Wigner matrix expansions. Our architecture seamlessly +integrates the new convolution operator and, when benchmarked on diverse +volumetric datasets such as MedMNIST and CATH, demonstrates superior +performance over the baselines with significantly reduced parameter counts - up +to 1000 times fewer in the case of MedMNIST. Beyond these demonstrations, +ILPO-Net's rotational invariance paves the way for other applications across +multiple disciplines. Our code is publicly available at +https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPO/-/tree/main/ILPONet. + +
+
+
+
+
+ + ♻ ☆ NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural + Cellular Automata + + +
+ Neural Cellular Automata (NCA) is a class of Cellular Automata where the +update rule is parameterized by a neural network that can be trained using +gradient descent. In this paper, we focus on NCA models used for texture +synthesis, where the update rule is inspired by partial differential equations +(PDEs) describing reaction-diffusion systems. To train the NCA model, the +spatio-temporal domain is discretized, and Euler integration is used to +numerically simulate the PDE. However, whether a trained NCA truly learns the +continuous dynamics described by the corresponding PDE or merely overfits the +discretization used in training remains an open question. We study NCA models +at the limit where space-time discretization approaches continuity. We find +that existing NCA models tend to overfit the training discretization, +especially in the proximity of the initial condition, also called "seed". To +address this, we propose a solution that utilizes uniform noise as the initial +condition. We demonstrate the effectiveness of our approach in preserving the +consistency of NCA dynamics across a wide range of spatio-temporal +granularities. Our improved NCA model enables two new test-time interactions by +allowing continuous control over the speed of pattern formation and the scale +of the synthesized patterns. We demonstrate this new NCA feature in our +interactive online demo. Our work reveals that NCA models can learn continuous +dynamics and opens new avenues for NCA research from a dynamical systems +perspective. + +
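+ A self-contained toy version of the texture-NCA setup with a uniform-noise seed is sketched below (for intuition only, not the authors' implementation; the perception filters, channel count, and step sizes are assumptions). The step_size argument plays the role of the Euler time discretization discussed above.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ class TinyNCA(nn.Module):
+     """Toy texture NCA: fixed Sobel/Laplacian perception filters followed by a
+     small 1x1-conv update network, advanced with explicit Euler steps."""
+     def __init__(self, channels=12, hidden=96):
+         super().__init__()
+         ident = torch.tensor([[0., 0, 0], [0, 1, 0], [0, 0, 0]])
+         sobel_x = torch.tensor([[-1., 0, 1], [-2, 0, 2], [-1, 0, 1]]) / 8
+         lap = torch.tensor([[1., 2, 1], [2, -12, 2], [1, 2, 1]]) / 16
+         kernels = torch.stack([ident, sobel_x, sobel_x.t(), lap])        # (4, 3, 3)
+         self.register_buffer("filters", kernels.repeat(channels, 1, 1).unsqueeze(1))
+         self.update = nn.Sequential(nn.Conv2d(4 * channels, hidden, 1), nn.ReLU(),
+                                     nn.Conv2d(hidden, channels, 1, bias=False))
+         self.channels = channels
+ 
+     def forward(self, x, step_size=1.0):
+         y = F.conv2d(F.pad(x, (1, 1, 1, 1), mode="circular"), self.filters,
+                      groups=self.channels)
+         return x + step_size * self.update(y)       # one Euler integration step
+ 
+ nca = TinyNCA()
+ state = torch.rand(1, 12, 64, 64) * 0.1   # uniform-noise seed instead of a constant seed
+ for _ in range(32):
+     state = nca(state, step_size=0.5)     # smaller step ~= finer time discretization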
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture + Detection + + +
+ Wrist trauma and even fractures occur frequently in daily life, particularly +among children, who account for a significant proportion of fracture cases. +Before performing surgery, surgeons often request patients to undergo X-ray +imaging first and prepare for it based on the analysis of the radiologist. With +the development of neural networks, You Only Look Once (YOLO) series models +have been widely used in fracture detection as computer-assisted diagnosis +(CAD). In 2023, Ultralytics presented the latest version of the YOLO models, +which has been employed for detecting fractures across various parts of the +body. The attention mechanism is one of the most widely used techniques for improving model +performance. This research work proposes YOLOv8-AM, which incorporates the +attention mechanism into the original YOLOv8 architecture. Specifically, we +employ four attention modules, the Convolutional Block Attention +Module (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention +(ECA), and Shuffle Attention (SA), to design the improved models and train them +on the GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean +Average Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock + +CBAM (ResCBAM) increased from 63.6% to 65.8%, achieving +state-of-the-art (SOTA) performance. In contrast, the YOLOv8-AM model incorporating +GAM obtains an mAP 50 value of 64.2%, which is not a satisfactory enhancement. +Therefore, we combine ResBlock and GAM, introducing ResGAM to design another +new YOLOv8-AM model, whose mAP 50 value is increased to 65.0%. The +implementation code for this study is available on GitHub at +https://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8. + +
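+ For reference, CBAM, the first of the four attention modules listed above, follows a standard two-step recipe (channel attention, then spatial attention). The sketch below is a generic PyTorch rendition of that recipe, not the code from the linked repository, and where it is inserted into YOLOv8 is left open.
+ import torch
+ import torch.nn as nn
+ 
+ class CBAM(nn.Module):
+     """Convolutional Block Attention Module: channel attention from pooled
+     descriptors, followed by spatial attention from channel-wise statistics."""
+     def __init__(self, channels, reduction=16, spatial_kernel=7):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Conv2d(channels, channels // reduction, 1, bias=False), nn.ReLU(),
+             nn.Conv2d(channels // reduction, channels, 1, bias=False))
+         self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2, bias=False)
+ 
+     def forward(self, x):
+         # Channel attention: shared MLP over average- and max-pooled descriptors.
+         avg = self.mlp(x.mean(dim=(2, 3), keepdim=True))
+         mx = self.mlp(x.amax(dim=(2, 3), keepdim=True))
+         x = x * torch.sigmoid(avg + mx)
+         # Spatial attention: 7x7 conv over channel-wise mean and max maps.
+         s = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1)
+         return x * torch.sigmoid(self.spatial(s))
+ 
+ print(CBAM(256)(torch.randn(1, 256, 40, 40)).shape)  # torch.Size([1, 256, 40, 40])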
+
+
+
+
+ + ♻ ☆ Overload: Latency Attacks on Object Detection for Edge Devices + + +
+ Nowadays, the deployment of deep learning-based applications is an essential +task owing to the increasing demands on intelligent services. In this paper, we +investigate latency attacks on deep learning applications. Unlike common +adversarial attacks for misclassification, the goal of latency attacks is to +increase the inference time, which may stop applications from responding to +requests within a reasonable time. This kind of attack is ubiquitous for +various applications, and we use object detection to demonstrate how such +attacks work. We also design a framework named Overload to generate latency +attacks at scale. Our method is based on a newly formulated optimization +problem and a novel technique, called spatial attention. This attack serves to +escalate the required computing costs during the inference time, consequently +leading to an extended inference time for object detection. It presents a +significant threat, especially to systems with limited computing resources. We +conducted experiments using YOLOv5 models on Nvidia NX. Compared to existing +methods, our method is simpler and more effective. The experimental results +show that with latency attacks, the inference time of a single image can be +increased to ten times that of the normal setting. Moreover, our +findings pose a potential new threat to all object detection tasks requiring +non-maximum suppression (NMS), as our attack is NMS-agnostic. + +
+
+
+
+
+ + ♻ ☆ FuseFormer: A Transformer for Visual and Thermal Image Fusion + + +
+ Due to the lack of a definitive ground truth for the image fusion problem, +the loss functions are structured based on evaluation metrics, such as the +structural similarity index measure (SSIM). However, in doing so, a bias is +introduced toward the SSIM and, consequently, the input visual band image. The +objective of this study is to propose a novel methodology for the image fusion +problem that mitigates the limitations associated with using classical +evaluation metrics as loss functions. Our approach integrates a +transformer-based multi-scale fusion strategy that adeptly addresses local and +global context information. This integration not only refines the individual +components of the image fusion process but also significantly enhances the +overall efficacy of the method. Our proposed method follows a two-stage +training approach, where an auto-encoder is initially trained to extract deep +features at multiple scales in the first stage. For the second stage, we +integrate our fusion block and change the loss function as mentioned. The +multi-scale features are fused using a combination of Convolutional Neural +Networks (CNNs) and Transformers. The CNNs are utilized to capture local +features, while the Transformer handles the integration of general context +features. Through extensive experiments on various benchmark datasets, our +proposed method, along with the novel loss function definition, demonstrates +superior performance compared to other competitive fusion algorithms. + +
+
+ comment: 8 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Progressive Multi-modal Conditional Prompt Tuning + + +
+ Pre-trained vision-language models (VLMs) have shown remarkable +generalization capabilities via prompting, which leverages VLMs as knowledge +bases to extract information beneficial for downstream tasks. However, existing +methods primarily employ uni-modal prompting, which only engages a uni-modal +branch, failing to simultaneously adjust vision-language (V-L) features. +Additionally, the one-pass forward pipeline in VLM encoding struggles to align +V-L features that have a huge gap. Confronting these challenges, we propose a +novel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT). +ProMPT exploits a recurrent structure, optimizing and aligning V-L features by +iteratively utilizing image and current encoding information. It comprises an +initialization and a multi-modal iterative evolution (MIE) module. +Initialization is responsible for encoding images and text using a VLM, +followed by a feature filter that selects text features similar to image. MIE +then facilitates multi-modal prompting through class-conditional vision +prompting, instance-conditional text prompting, and feature filtering. In each +MIE iteration, vision prompts are obtained from filtered text features via a +vision generator, promoting image features to focus more on target object +during vision prompting. The encoded image features are fed into a text +generator to produce text prompts that are more robust to class shifts. Thus, +V-L features are progressively aligned, enabling advance from coarse to exact +prediction. Extensive experiments are conducted in three settings to evaluate +the efficacy of ProMPT. The results indicate that ProMPT outperforms existing +methods on average across all settings, demonstrating its superior +generalization and robustness. Code is available at +https://github.com/qiuxiaoyu9954/ProMPT. + +
+
+
+
+
+ + ♻ ☆ UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale + Transformer IJCNN 2024 + + +
+ Underwater images often exhibit poor quality, distorted color balance and low +contrast due to the complex and intricate interplay of light, water, and +objects. Despite the significant contributions of previous underwater +enhancement techniques, there exist several problems that demand further +improvement: (i) The current deep learning methods rely on Convolutional Neural +Networks (CNNs) that lack the multi-scale enhancement, and global perception +field is also limited. (ii) The scarcity of paired real-world underwater +datasets poses a significant challenge, and the utilization of synthetic image +pairs could lead to overfitting. To address the aforementioned problems, this +paper introduces a Multi-scale Transformer-based Network called UWFormer for +enhancing images at multiple frequencies via semi-supervised learning, in which +we propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale +Fusion Feed-forward Network for low-frequency enhancement. Besides, we +introduce a special underwater semi-supervised training strategy, where we +propose a Subaqueous Perceptual Loss function to generate reliable pseudo +labels. Experiments using full-reference and non-reference underwater +benchmarks demonstrate that our method outperforms state-of-the-art methods in +terms of both quantity and visual quality. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ Learning Geometry-Guided Depth via Projective Modeling for Monocular 3D + Object Detection + + +
+ As a crucial task of autonomous driving, 3D object detection has made great +progress in recent years. However, monocular 3D object detection remains a +challenging problem due to the unsatisfactory performance in depth estimation. +Most existing monocular methods typically directly regress the scene depth +while ignoring important relationships between the depth and various geometric +elements (e.g. bounding box sizes, 3D object dimensions, and object poses). In +this paper, we propose to learn geometry-guided depth estimation with +projective modeling to advance monocular 3D object detection. Specifically, a +principled geometry formula with projective modeling of 2D and 3D depth +predictions in the monocular 3D object detection network is devised. We further +implement and embed the proposed formula to enable geometry-aware deep +representation learning, allowing effective 2D and 3D interactions for boosting +the depth estimation. Moreover, we provide a strong baseline through addressing +substantial misalignment between 2D annotation and projected boxes to ensure +robust learning with the proposed geometric formula. Experiments on the KITTI +dataset show that our method remarkably improves the detection performance of +the state-of-the-art monocular-based method without extra data by 2.80% on the +moderate test setting. The model and code will be released at +https://github.com/YinminZhang/MonoGeo. + +
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation + + +
+ With the explosive popularity of AI-generated content (AIGC), video +generation has recently received a lot of attention. Generating videos guided +by text instructions poses significant challenges, such as modeling the complex +relationship between space and time, and the lack of large-scale text-video +paired data. Existing text-video datasets suffer from limitations in both +content quality and scale, or they are not open-source, rendering them +inaccessible for study and use. For model design, previous approaches extend +pretrained text-to-image generation models by adding temporal 1D +convolution/attention modules for video generation. However, these approaches +overlook the importance of jointly modeling space and time, inevitably leading +to temporal distortions and misalignment between texts and videos. In this +paper, we propose a novel approach that strengthens the interaction between +spatial and temporal perceptions. In particular, we utilize a swapped +cross-attention mechanism in 3D windows that alternates the "query" role +between spatial and temporal blocks, enabling mutual reinforcement for each +other. Moreover, to fully unlock model capabilities for high-quality video +generation and promote the development of the field, we curate a large-scale +and open-source video dataset called HD-VG-130M. This dataset comprises 130 +million text-video pairs from the open-domain, ensuring high-definition, +widescreen and watermark-free characters. A smaller-scale yet more meticulously +cleaned subset further enhances the data quality, aiding models in achieving +superior performance. Experimental quantitative and qualitative results +demonstrate the superiority of our approach in terms of per-frame quality, +temporal correlation, and text-video alignment, with clear margins. + +
+
+
+
+
+ + ♻ ☆ Beyond Score Changes: Adversarial Attack on No-Reference Image Quality + Assessment from Two Perspectives + + +
+ Deep neural networks have demonstrated impressive success in No-Reference +Image Quality Assessment (NR-IQA). However, recent researches highlight the +vulnerability of NR-IQA models to subtle adversarial perturbations, leading to +inconsistencies between model predictions and subjective ratings. Current +adversarial attacks, however, focus on perturbing predicted scores of +individual images, neglecting the crucial aspect of inter-score correlation +relationships within an entire image set. Meanwhile, it is important to note +that the correlation, like ranking correlation, plays a significant role in +NR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we +introduce a new framework of correlation-error-based attacks that perturb both +the correlation within an image set and score changes on individual images. Our +research primarily focuses on ranking-related correlation metrics like +Spearman's Rank-Order Correlation Coefficient (SROCC) and prediction +error-related metrics like Mean Squared Error (MSE). As an instantiation, we +propose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes +target attack scores for the entire image set and then generates adversarial +examples guided by these scores. Experimental results demonstrate that our SMA +method not only significantly disrupts the SROCC to negative values but also +maintains a considerable change in the scores of individual images. Meanwhile, +it exhibits state-of-the-art performance across metrics with different +categories. Our method provides a new perspective on the robustness of NR-IQA +models. + +
+
+ comment: Submitted to a conference +
+
+
+
+
+ + ♻ ☆ Hybrid Open-set Segmentation with Synthetic Negative Data + + +
+ Open-set segmentation can be conceived by complementing closed-set +classification with anomaly detection. Many of the existing dense anomaly +detectors operate through generative modelling of regular data or by +discriminating with respect to negative data. These two approaches optimize +different objectives and therefore exhibit different failure modes. +Consequently, we propose a novel anomaly score that fuses generative and +discriminative cues. Our score can be implemented by upgrading any closed-set +segmentation model with dense estimates of dataset posterior and unnormalized +data likelihood. The resulting dense hybrid open-set models require negative +training images that can be sampled from an auxiliary negative dataset, from a +jointly trained generative model, or from a mixture of both sources. We +evaluate our contributions on benchmarks for dense anomaly detection and +open-set segmentation. The experiments reveal strong open-set performance in +spite of negligible computational overhead. + +
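+ The fusion of generative and discriminative cues can be pictured with the toy score below (an illustrative stand-in, assuming max-softmax confidence as the discriminative cue and an externally supplied log-likelihood map as the generative cue; the paper defines its score via dense dataset-posterior and unnormalized-likelihood estimates).
+ import torch
+ 
+ def hybrid_anomaly_score(logits, log_px, weight=1.0):
+     """Toy dense anomaly score: low closed-set confidence (discriminative cue)
+     and low data log-likelihood (generative cue) both raise the score.
+     logits: (B, K, H, W) class logits; log_px: (B, H, W) log-likelihood map."""
+     p_inlier = torch.softmax(logits, dim=1).amax(dim=1)
+     return -(torch.log(p_inlier + 1e-8) + weight * log_px)  # higher = more anomalous
+ 
+ logits = torch.randn(1, 19, 64, 128)   # e.g. 19 Cityscapes-style classes
+ log_px = torch.randn(1, 64, 128)       # placeholder likelihood estimate
+ print(hybrid_anomaly_score(logits, log_px).shape)  # torch.Size([1, 64, 128])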
+
+ comment: Published in IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Mosaic-SDF for 3D Generative Models + + +
+ Current diffusion or flow-based generative models for 3D shapes divide into +two categories: distilling pre-trained 2D image diffusion models, and training directly on +3D shapes. When training a diffusion or flow model on 3D shapes, a crucial +design choice is the shape representation. An effective shape representation +needs to adhere to three design principles: it should allow an efficient +conversion of large 3D datasets to the representation form; it should provide a +good tradeoff of approximation power versus number of parameters; and it should +have a simple tensorial form that is compatible with existing powerful neural +architectures. While standard 3D shape representations such as volumetric grids +and point clouds do not adhere to all these principles simultaneously, we +advocate in this paper a new representation that does. We introduce Mosaic-SDF +(M-SDF): a simple 3D shape representation that approximates the Signed Distance +Function (SDF) of a given shape by using a set of local grids spread near the +shape's boundary. The M-SDF representation is fast to compute for each shape +individually, making it readily parallelizable; it is parameter efficient as it +only covers the space around the shape's boundary; and it has a simple matrix +form, compatible with Transformer-based architectures. We demonstrate the +efficacy of the M-SDF representation by using it to train a 3D generative flow +model, including class-conditioned generation with the 3D Warehouse dataset, and +text-to-3D generation using a dataset of about 600k caption-shape pairs. + +
+
+ comment: More results and details can be found at + https://lioryariv.github.io/msdf +
+
+
+
+
+ + ♻ ☆ Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of + Multi-modal Large Language Models + + +
+ Counterfactual reasoning, as a crucial manifestation of human intelligence, +refers to making presuppositions based on established facts and extrapolating +potential outcomes. Existing multimodal large language models (MLLMs) have +exhibited impressive cognitive and reasoning capabilities, which have been +examined across a wide range of Visual Question Answering (VQA) benchmarks. +Nevertheless, how will existing MLLMs perform when faced with counterfactual +questions? To answer this question, we first curate a novel +CounterFactual MultiModal reasoning +benchmark, abbreviated as CFMM, to systematically assess the +counterfactual reasoning capabilities of MLLMs. Our CFMM comprises six +challenging tasks, each including hundreds of carefully human-labeled +counterfactual questions, to evaluate MLLM's counterfactual reasoning +capabilities across diverse aspects. Through experiments, interestingly, we +find that existing MLLMs prefer to believe what they see, but ignore the +counterfactual presuppositions presented in the question, thereby leading to +inaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs +on our proposed CFMM. The significant gap between their performance on our CFMM +and that on several VQA benchmarks indicates that there is still considerable +room for improvement in existing MLLMs toward approaching human-level +intelligence. On the other hand, through boosting MLLMs performances on our +CFMM in the future, potential avenues toward developing MLLMs with advanced +intelligence can be explored. + +
+
+
+
+
+ + ♻ ☆ G3Reg: Pyramid Graph-based Global Registration using Gaussian Ellipsoid + Model + + +
+ This study introduces a novel framework, G3Reg, for fast and robust global +registration of LiDAR point clouds. In contrast to conventional complex +keypoints and descriptors, we extract fundamental geometric primitives, +including planes, clusters, and lines (PCL) from the raw point cloud to obtain +low-level semantic segments. Each segment is represented as a unified Gaussian +Ellipsoid Model (GEM), using a probability ellipsoid to ensure the ground truth +centers are encompassed with a certain degree of probability. Utilizing these +GEMs, we present a distrust-and-verify scheme based on a Pyramid Compatibility +Graph for Global Registration (PAGOR). Specifically, we establish an upper +bound, which can be traversed based on the confidence level for compatibility +testing to construct the pyramid graph. Then, we solve multiple maximum cliques +(MAC) for each level of the pyramid graph, thus generating the corresponding +transformation candidates. In the verification phase, we adopt a precise and +efficient metric for point cloud alignment quality, founded on geometric +primitives, to identify the optimal candidate. The algorithm's performance is +validated on three publicly available datasets and a self-collected +multi-session dataset. Parameter settings remained unchanged during the +experiment evaluations. The results exhibit superior robustness and real-time +performance of the G3Reg framework compared to state-of-the-art methods. +Furthermore, we demonstrate the potential for integrating individual GEM and +PAGOR components into other registration frameworks to enhance their efficacy. +Code: https://github.com/HKUST-Aerial-Robotics/G3Reg + +
+
+ comment: Accepted to 2024 IEEE Transactions on Automation Science and + Engineering (IEEE TASE) +
+
+
+
+
+ + ♻ ☆ Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image + Translation IJCNN 2024 + + +
+ Multi-modal brain images from MRI scans are widely used in clinical diagnosis +to provide complementary information from different modalities. However, +obtaining fully paired multi-modal images in practice is challenging due to +various factors, such as time, cost, and artifacts, resulting in +modality-missing brain images. To address this problem, unsupervised +multi-modal brain image translation has been extensively studied. Existing +methods suffer from the problem of brain tumor deformation during translation, +as they fail to focus on the tumor areas when translating the whole images. In +this paper, we propose an unsupervised tumor-aware distillation teacher-student +network called UTAD-Net, which is capable of perceiving and translating tumor +areas precisely. Specifically, our model consists of two parts: a teacher +network and a student network. The teacher network learns an end-to-end mapping +from source to target modality using unpaired images and corresponding tumor +masks first. Then, the translation knowledge is distilled into the student +network, enabling it to generate more realistic tumor areas and whole images +without masks. Experiments show that our model achieves competitive performance +on both quantitative and qualitative evaluations of image quality compared with +state-of-the-art methods. Furthermore, we demonstrate the effectiveness of the +generated images on downstream segmentation tasks. Our code is available at +https://github.com/scut-HC/UTAD-Net. + +
+
+ comment: 8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, most approaches improve segmentation performance by adding more complex modules. This is not suitable +for the medical field, especially for mobile medical devices, where +computationally heavy models cannot be deployed in real clinical environments +due to computational resource constraints. Recently, state-space models (SSMs), +represented by Mamba, have become a strong competitor to traditional CNNs and +Transformers. In this paper, we deeply explore the key elements of parameter +influence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight +VM-UNet) based on this. Specifically, we propose a method for processing +features in parallel Vision Mamba, named the PVM Layer, which achieves excellent +performance with the lowest computational load while keeping the overall number +of processing channels constant. We conducted comparisons and ablation +experiments with several state-of-the-art lightweight models on three public skin +lesion datasets and demonstrated that the UltraLight VM-UNet exhibits +the same strong performance competitiveness with only 0.049M parameters and +0.060 GFLOPs. In addition, this study deeply explores the key elements of +parameter influence in Mamba, which will lay a theoretical foundation for Mamba +to possibly become a new mainstream module for lightweight models in the future. +The code is available at https://github.com/wurenkai/UltraLight-VM-UNet . + +
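+ The parallel channel-splitting idea can be sketched roughly as follows (a toy illustration, not the released PVM Layer: the shared mixer below is a depthwise-convolution placeholder standing in for the Vision Mamba block, and the group count is an assumption). Applying one weight-tied mixer to every channel chunk reuses parameters while the total channel count stays constant.
+ import torch
+ import torch.nn as nn
+ 
+ class ParallelSplitMixer(nn.Module):
+     """Split channels into groups, run each group through one shared mixer,
+     concatenate, normalize, and add a residual connection."""
+     def __init__(self, channels, groups=4):
+         super().__init__()
+         assert channels % groups == 0
+         g = channels // groups
+         self.groups = groups
+         # Placeholder mixer; the actual PVM Layer uses a Vision Mamba (SSM) block here.
+         self.shared_mixer = nn.Sequential(
+             nn.Conv2d(g, g, 3, padding=1, groups=g), nn.GELU(), nn.Conv2d(g, g, 1))
+         self.norm = nn.GroupNorm(1, channels)
+ 
+     def forward(self, x):
+         chunks = torch.chunk(x, self.groups, dim=1)
+         mixed = torch.cat([self.shared_mixer(c) for c in chunks], dim=1)
+         return self.norm(mixed) + x
+ 
+ print(ParallelSplitMixer(64)(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 64, 32, 32])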
+
+
+
+
+ + ♻ ☆ Boosting Audio-visual Zero-shot Learning with Large Language Models + + +
+ Audio-visual zero-shot learning aims to recognize unseen classes based on +paired audio-visual sequences. Recent methods mainly focus on learning +multi-modal features aligned with class names to enhance the generalization +ability to unseen categories. However, these approaches ignore the obscure +event concepts in class names and may inevitably introduce complex network +structures with difficult training objectives. In this paper, we introduce a +straightforward yet efficient framework called KnowleDge-Augmented audio-visual +learning (KDA), which aids the model in more effectively learning novel event +content by leveraging an external knowledge base. Specifically, we first +propose to utilize the knowledge contained in large language models (LLMs) to +generate numerous descriptive sentences that include important distinguishing +audio-visual features of event classes, which helps to better understand unseen +categories. Furthermore, we propose a knowledge-aware adaptive margin loss to +help distinguish similar events, further improving the generalization ability +towards unseen classes. Extensive experimental results demonstrate that our +proposed KDA can outperform state-of-the-art methods on three popular +audio-visual zero-shot learning datasets. Our code will be available at +https://github.com/chenhaoxing/KDA. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with + Vision-Language Models CVPR 2024 + + +
+ Scene graph generation (SGG) aims to parse a visual scene into an +intermediate graph representation for downstream reasoning tasks. Despite +recent advancements, existing methods struggle to generate scene graphs with +novel visual relation concepts. To address this challenge, we introduce a new +open-vocabulary SGG framework based on sequence generation. Our framework +leverages vision-language pre-trained models (VLM) by incorporating an +image-to-graph generation paradigm. Specifically, we generate scene graph +sequences via image-to-text generation with VLM and then construct scene graphs +from these sequences. By doing so, we harness the strong capabilities of VLM +for open-vocabulary SGG and seamlessly integrate explicit relational modeling +for enhancing the VL tasks. Experimental results demonstrate that our design +not only achieves superior performance with an open vocabulary but also +enhances downstream vision-language task performance through explicit relation +modeling knowledge. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Genixer: Empowering Multimodal Large Language Models as a Powerful Data + Generator + + +
+ Instruction tuning data is essential for training Multimodal Large +Language Models (MLLMs). However, the creation of high-quality instruction +tuning data presents significant challenges. Asking humans to label +instruction tuning data is labor-intensive and time-consuming. Some works +that prompted GPT-4 for data generation were not only costly but also lacked +satisfactory performance in complex tasks (i.e., grounding-based reasoning +tasks). To address the challenges of data creation, we are the first to explore +the potential of empowering MLLMs with the ability to generate +instruction-tuning data by following user instructions. Specifically, we +developed an innovative data generation pipeline, Genixer, to generate various +high-quality instruction tuning data, covering nine representative tasks, +e.g., Common VQA, REC, REG, and PointQ. Genixer provides a unified solution for +data generation with four key steps: (i) instruction data collection, (ii) +instruction template design, (iii) empowering the MLLM, and (iv) data generation +and filtering. To validate the effectiveness of the generated data, we conducted +a human evaluation and a user preference study to assess the quality of the +generated data. Subsequently, we generated two instruction-tuning datasets for +the training of two representative MLLMs, LLaVA1.5 and Shikra, and noted +consistent improvements across various VQA tasks and multimodal benchmarks. For +instance, performance on the VizWiz benchmark improved from 50.0% to 53.8%, and +on ScienceQA, it increased from 66.8% to 69.7%, reconfirming the quality of the +generated instruction tuning data. The data, code, and models will be released. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ OPTiML: Dense Semantic Invariance Using Optimal Transport for + Self-Supervised Medical Image Representation + + +
+ Self-supervised learning (SSL) has emerged as a promising technique for +medical image analysis due to its ability to learn without annotations. +However, despite the promising potential, conventional SSL methods encounter +limitations, including challenges in achieving semantic alignment and capturing +subtle details. This leads to suboptimal representations, which fail to +accurately capture the underlying anatomical structures and pathological +details. In response to these constraints, we introduce a novel SSL framework, +OPTiML, employing optimal transport (OT), to capture the dense semantic +invariance and fine-grained details, thereby enhancing the overall +effectiveness of SSL in medical image representation learning. The core idea is +to integrate OT with a cross-viewpoint semantics infusion module (CV-SIM), +which effectively captures complex, fine-grained details inherent in medical +images across different viewpoints. In addition to the CV-SIM module, OPTiML +imposes variance and covariance regularizations within the OT framework to +force the model to focus on clinically relevant information while discarding less +informative features. Through these, the proposed framework demonstrates its +capacity to learn semantically rich representations that can be applied to +various medical imaging tasks. To validate its effectiveness, we conduct +experimental studies on three publicly available datasets from the chest X-ray +modality. Our empirical results reveal OPTiML's superiority over +state-of-the-art methods across all evaluated tasks. + +
+
+
+
+
+ + ♻ ☆ Lightweight Facial Attractiveness Prediction Using Dual Label + Distribution + + +
+ Facial attractiveness prediction (FAP) aims to assess facial attractiveness +automatically based on human aesthetic perception. Previous methods using deep +convolutional neural networks have improved the performance, but their +large-scale models have led to a deficiency in flexibility. In addition, most +methods fail to take full advantage of the dataset. In this paper, we present a +novel end-to-end FAP approach that integrates dual label distribution and +lightweight design. The manual ratings, attractiveness score, and standard +deviation are aggregated explicitly to construct a dual-label distribution to +make the best use of the dataset, including the attractiveness distribution and +the rating distribution. Such distributions, as well as the attractiveness +score, are optimized under a joint learning framework based on the label +distribution learning (LDL) paradigm. The data processing is simplified to a +minimum for a lightweight design, and MobileNetV2 is selected as our backbone. +Extensive experiments are conducted on two benchmark datasets, where our +approach achieves promising results and succeeds in balancing performance and +efficiency. Ablation studies demonstrate that our delicately designed learning +modules are indispensable and correlated. Additionally, the visualization +indicates that our approach can perceive facial attractiveness and capture +attractive facial regions to facilitate semantic predictions. The code is +available at https://github.com/enquan/2D_FAP. + +
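+ A simplified sketch of the label-distribution idea (for intuition only; the bin layout and the single-Gaussian target are assumptions, whereas the paper builds a dual distribution and optimizes it jointly with the score): the mean rating and its standard deviation define a discrete target distribution over rating bins, matched with a KL loss.
+ import torch
+ import torch.nn.functional as F
+ 
+ def gaussian_label_distribution(mean, std, bins):
+     """Turn a mean attractiveness score and rater standard deviation into a
+     discrete distribution over rating bins."""
+     logits = -0.5 * ((bins - mean) / std.clamp(min=1e-3)) ** 2
+     return torch.softmax(logits, dim=-1)
+ 
+ def ldl_loss(pred_logits, mean, std, bins):
+     target = gaussian_label_distribution(mean, std, bins)
+     return F.kl_div(F.log_softmax(pred_logits, dim=-1), target, reduction="batchmean")
+ 
+ bins = torch.linspace(1.0, 5.0, 9)                       # 9 rating bins on a 1-5 scale
+ pred = torch.randn(4, 9)                                 # model outputs for 4 faces
+ mean = torch.tensor([2.8, 3.5, 4.1, 1.9]).unsqueeze(-1)  # mean ratings
+ std = torch.tensor([0.4, 0.6, 0.3, 0.5]).unsqueeze(-1)   # rater disagreement
+ print(ldl_loss(pred, mean, std, bins))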
+
+
+
+
+ + ♻ ☆ WOUAF: Weight Modulation for User Attribution and Fingerprinting in + Text-to-Image Diffusion Models CVPR 2024 + + +
+ The rapid advancement of generative models, facilitating the creation of +hyper-realistic images from textual descriptions, has concurrently escalated +critical societal concerns such as misinformation. Although providing some +mitigation, traditional fingerprinting mechanisms fall short in attributing +responsibility for the malicious use of synthetic images. This paper introduces +a novel approach to model fingerprinting that assigns responsibility for the +generated images, thereby serving as a potential countermeasure to model +misuse. Our method modifies generative models based on each user's unique +digital fingerprint, imprinting a unique identifier onto the resultant content +that can be traced back to the user. This approach, incorporating fine-tuning +into Text-to-Image (T2I) tasks using the Stable Diffusion Model, demonstrates +near-perfect attribution accuracy with a minimal impact on output quality. +Through extensive evaluation, we show that our method outperforms baseline +methods with an average improvement of 11% in handling image post-processing. +Our method presents a promising and novel avenue for accountable model +distribution and responsible use. Our code is available at +https://github.com/kylemin/WOUAF. + +
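+ A rough sketch of the weight-modulation mechanism (an illustration of the general idea only, not the paper's implementation; the layer, the affine mapping, and where modulation is applied in Stable Diffusion are assumptions): a per-user fingerprint embedding rescales the convolution weights, so generated content carries a traceable imprint.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ class FingerprintModulatedConv(nn.Module):
+     """Hypothetical conv layer whose weights are scaled per output channel by
+     an affine mapping of a user fingerprint embedding."""
+     def __init__(self, in_ch, out_ch, fp_dim, kernel=3):
+         super().__init__()
+         self.weight = nn.Parameter(torch.randn(out_ch, in_ch, kernel, kernel) * 0.02)
+         self.affine = nn.Linear(fp_dim, out_ch)
+ 
+     def forward(self, x, fingerprint):
+         scale = 1.0 + self.affine(fingerprint)            # one scale per output channel
+         w = self.weight * scale.view(-1, 1, 1, 1)
+         return F.conv2d(x, w, padding=self.weight.shape[-1] // 2)
+ 
+ conv = FingerprintModulatedConv(64, 64, fp_dim=128)
+ fingerprint = torch.randn(128)                            # per-user fingerprint code
+ print(conv(torch.randn(1, 64, 32, 32), fingerprint).shape)  # torch.Size([1, 64, 32, 32])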
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Visual Attention Prompted Prediction and Learning + + +
+ Visual explanation (attention)-guided learning uses not only labels but also +explanations to guide model reasoning process. While visual attention-guided +learning has shown promising results, it requires a large number of explanation +annotations that are time-consuming to prepare. However, in many real-world +situations, it is usually desired to prompt the model with visual attention +without model retraining. For example, when doing AI-assisted cancer +classification on a medical image, users (e.g., clinicians) can provide the AI +model with visual attention prompt on which areas are indispensable and which +are precluded. Despite its promising objectives, achieving visual +attention-prompted prediction presents several major challenges: 1) How can the +visual prompt be effectively integrated into the model's reasoning process? 2) +How should the model handle samples that lack visual prompts? 3) What is the +impact on the model's performance when a visual prompt is imperfect? This paper +introduces a novel framework for attention-prompted prediction and learning, +utilizing visual prompts to steer the model's reasoning process. To improve +performance in non-prompted situations and align it with prompted scenarios, we +propose a co-training approach for both non-prompted and prompted models, +ensuring they share similar parameters and activations. Additionally, for +instances where the visual prompt does not encompass the entire input image, we +have developed innovative attention prompt refinement methods. These methods +interpolate the incomplete prompts while maintaining alignment with the model's +explanations. Extensive experiments on four datasets demonstrate the +effectiveness of our proposed framework in enhancing predictions for samples +both with and without prompt. + +
+
+
+
+
+ + ♻ ☆ Extending global-local view alignment for self-supervised learning with + remote sensing imagery + + +
+ Since a large number of high-quality remote sensing images are readily +accessible, exploiting the corpus of images with less manual annotation draws +increasing attention. Self-supervised models acquire general feature +representations by formulating a pretext task that generates pseudo-labels for +massive unlabeled data to provide supervision for training. While prior studies +have explored multiple self-supervised learning techniques in the remote sensing +domain, pretext tasks based on local-global view alignment remain +underexplored, despite achieving state-of-the-art results on natural imagery. +Inspired by DINO, which employs an effective representation learning structure +with knowledge distillation based on global-local view alignment, we formulate +two pretext tasks for self-supervised learning on remote sensing imagery +(SSLRS). Using these tasks, we explore the effectiveness of positive temporal +contrast as well as multi-sized views on SSLRS. We extend DINO and propose +DINO-MC, which uses local views of variously sized crops instead of a single fixed +size in order to alleviate the limited variation in object size observed in +remote sensing imagery. Our experiments demonstrate that even when pre-trained +on only 10% of the dataset, DINO-MC performs on par with or better than existing +state-of-the-art SSLRS methods on multiple remote sensing tasks, while using +fewer computational resources. All code, models, and results are released at +https://github.com/WennyXY/DINO-MC. + +
+
+
+
+
+ + ♻ ☆ Appearance-based Gaze Estimation With Deep Learning: A Review and + Benchmark + + +
+ Human gaze provides valuable information on human focus and intentions, +making it a crucial area of research. Recently, deep learning has +revolutionized appearance-based gaze estimation. However, due to the unique +features of gaze estimation research, such as the unfair comparison between 2D +gaze positions and 3D gaze vectors and the different pre-processing and +post-processing methods, there is a lack of a definitive guideline for +developing deep learning-based gaze estimation algorithms. In this paper, we +present a systematic review of the appearance-based gaze estimation methods +using deep learning. Firstly, we survey the existing gaze estimation algorithms +along the typical gaze estimation pipeline: deep feature extraction, deep +learning model design, personal calibration and platforms. Secondly, to fairly +compare the performance of different approaches, we summarize the data +pre-processing and post-processing methods, including face/eye detection, data +rectification, 2D/3D gaze conversion and gaze origin conversion. Finally, we +set up a comprehensive benchmark for deep learning-based gaze estimation. We +characterize all the public datasets and provide the source code of typical +gaze estimation algorithms. This paper serves not only as a reference to +develop deep learning-based gaze estimation methods, but also a guideline for +future gaze estimation research. The project web page can be found at +https://phi-ai.buaa.edu.cn/Gazehub. + +
+
+ comment: Accepted by TPAMI +
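As an illustration of the 2D/3D gaze conversion step mentioned in the pipeline above, the following NumPy sketch converts between a 3D gaze direction vector and (pitch, yaw) angles. The axis convention assumed here (camera looking along -z, y pointing down) is only one of several used in the literature, which is precisely why the survey treats this conversion as a pre/post-processing concern.

```python
import numpy as np

# Sketch of 2D/3D gaze conversion: mapping a 3D gaze direction vector to
# (pitch, yaw) angles and back, under an assumed axis convention.

def vector_to_pitchyaw(g):
    """3D gaze vector -> (pitch, yaw) in radians."""
    g = g / np.linalg.norm(g)
    pitch = np.arcsin(-g[1])           # vertical angle
    yaw = np.arctan2(-g[0], -g[2])     # horizontal angle
    return pitch, yaw

def pitchyaw_to_vector(pitch, yaw):
    """(pitch, yaw) in radians -> unit 3D gaze vector."""
    return np.array([
        -np.cos(pitch) * np.sin(yaw),
        -np.sin(pitch),
        -np.cos(pitch) * np.cos(yaw),
    ])

g = pitchyaw_to_vector(0.1, -0.2)
assert np.allclose(vector_to_pitchyaw(g), (0.1, -0.2))
```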
+
+
+
+
+ + ♻ ☆ Deep Variational Network Toward Blind Image Restoration + + +
+ Blind image restoration (IR) is a common yet challenging problem in computer vision. Classical model-based methods and recent deep learning (DL)-based methods represent two different methodologies for this problem, each with its own merits and drawbacks. In this paper, we propose a novel blind image restoration method that aims to integrate the advantages of both. Specifically, we construct a general Bayesian generative model for blind IR, which explicitly depicts the degradation process. In this proposed model, a pixel-wise non-i.i.d. Gaussian distribution is employed to fit the image noise. It offers more flexibility than the simple i.i.d. Gaussian or Laplacian distributions adopted in most conventional methods, and can thus handle the more complicated noise types contained in the image degradation. To solve the model, we design a variational inference algorithm in which all the posterior distributions are parameterized as deep neural networks to increase their modeling capability. Notably, such an inference algorithm induces a unified framework to jointly deal with the tasks of degradation estimation and image restoration. Further, the degradation information estimated in the former task is utilized to guide the latter IR process. Experiments on two typical blind IR tasks, namely image denoising and super-resolution, demonstrate that the proposed method achieves superior performance over current state-of-the-art methods.
+
+ comment: Accepted by TPAMI@2024. Code: https://github.com/zsyOAOA/VIRNet +
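The pixel-wise non-i.i.d. Gaussian noise assumption can be illustrated with a heteroscedastic Gaussian negative log-likelihood, where the network predicts a per-pixel log-variance alongside the restored image. This is only a generic sketch of that noise model, not the paper's full variational inference algorithm; all shapes and tensors below are placeholders.

```python
import torch

# Generic illustration of a pixel-wise (non-i.i.d.) Gaussian noise model:
# the network predicts a per-pixel log-variance map in addition to the
# restored image, and the loss is the Gaussian negative log-likelihood.

def heteroscedastic_gaussian_nll(pred, target, log_var):
    """pred, target, log_var: tensors of shape (B, C, H, W)."""
    inv_var = torch.exp(-log_var)
    nll = 0.5 * (inv_var * (pred - target) ** 2 + log_var)
    return nll.mean()

pred = torch.rand(4, 3, 64, 64)            # restored image estimate
target = torch.rand(4, 3, 64, 64)          # clean reference
log_var = torch.zeros(4, 3, 64, 64)        # sigma = 1 everywhere (placeholder)
loss = heteroscedastic_gaussian_nll(pred, target, log_var)
```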
+
+
+ + Computer Vision and Pattern Recognition 171 + +
+
+
+ + ☆ SMPLer: Taming Transformers for Monocular 3D Human Shape and Pose + Estimation + + +
+ Existing Transformers for monocular 3D human shape and pose estimation +typically have a quadratic computation and memory complexity with respect to +the feature length, which hinders the exploitation of fine-grained information +in high-resolution features that is beneficial for accurate reconstruction. In +this work, we propose an SMPL-based Transformer framework (SMPLer) to address +this issue. SMPLer incorporates two key ingredients: a decoupled attention +operation and an SMPL-based target representation, which allow effective +utilization of high-resolution features in the Transformer. In addition, based +on these two designs, we also introduce several novel modules including a +multi-scale attention and a joint-aware attention to further boost the +reconstruction performance. Extensive experiments demonstrate the effectiveness +of SMPLer against existing 3D human shape and pose estimation methods both +quantitatively and qualitatively. Notably, the proposed algorithm achieves an +MPJPE of 45.2 mm on the Human3.6M dataset, improving upon Mesh Graphormer by +more than 10% with fewer than one-third of the parameters. Code and pretrained +models are available at https://github.com/xuxy09/SMPLer. + +
+
+ comment: Published at TPAMI 2024 +
+
+
+
+
+ + ☆ ID-Animator: Zero-Shot Identity-Preserving Human Video Generation + + +
+ Generating high-fidelity human videos with specified identities has attracted significant attention in the content generation community. However, existing techniques struggle to strike a balance between training efficiency and identity preservation, either requiring tedious case-by-case finetuning or usually missing the identity details in the video generation process. In this study, we present ID-Animator, a zero-shot human-video generation approach that can perform personalized video generation given a single reference facial image without further training. ID-Animator inherits existing diffusion-based video generation backbones with a face adapter to encode the ID-relevant embeddings from learnable facial latent queries. To facilitate the extraction of identity information in video generation, we introduce an ID-oriented dataset construction pipeline, which incorporates a decoupled human attribute and action captioning technique applied to a constructed facial image pool. Based on this pipeline, a random face reference training method is further devised to precisely capture the ID-relevant embeddings from reference images, thus improving the fidelity and generalization capacity of our model for ID-specific video generation. Extensive experiments demonstrate the superiority of ID-Animator over previous models in generating personalized human videos. Moreover, our method is highly compatible with popular pre-trained T2V models like AnimateDiff and various community backbone models, showing high extendability in real-world applications for video generation where identity preservation is highly desired. Our codes and checkpoints will be released at https://github.com/ID-Animator/ID-Animator.
+
+ comment: Project Page: https://id-animator.github.io/ +
+
+
+
+
+ + ☆ Metric-guided Image Reconstruction Bounds via Conformal Prediction + + +
+ Recent advancements in machine learning have led to novel imaging systems and +algorithms that address ill-posed problems. Assessing their trustworthiness and +understanding how to deploy them safely at test time remains an important and +open problem. We propose a method that leverages conformal prediction to +retrieve upper/lower bounds and statistical inliers/outliers of reconstructions +based on the prediction intervals of downstream metrics. We apply our method to +sparse-view CT for downstream radiotherapy planning and show 1) that +metric-guided bounds have valid coverage for downstream metrics while +conventional pixel-wise bounds do not and 2) anatomical differences of +upper/lower bounds between metric-guided and pixel-wise methods. Our work paves +the way for more meaningful reconstruction bounds. Code available at +https://github.com/matthewyccheung/conformal-metric + +
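For readers unfamiliar with conformal prediction, the sketch below shows the basic split-conformal recipe for a scalar downstream metric: calibrate a quantile of absolute residuals on held-out cases, then widen test predictions by that quantile. The synthetic numbers and the 1D setting are illustrative; the paper's contribution is mapping such metric intervals back to upper/lower-bound reconstructions.

```python
import numpy as np

# Split-conformal interval for a scalar downstream metric (illustrative data).

def conformal_interval(cal_pred, cal_true, test_pred, alpha=0.1):
    """Return (lower, upper) bounds with ~(1 - alpha) marginal coverage."""
    n = len(cal_pred)
    scores = np.abs(cal_pred - cal_true)                     # nonconformity scores
    q_level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)   # finite-sample correction
    qhat = np.quantile(scores, q_level, method="higher")
    return test_pred - qhat, test_pred + qhat

rng = np.random.default_rng(0)
cal_pred = rng.normal(0.0, 1.0, 500)
cal_true = cal_pred + rng.normal(0.0, 0.3, 500)
lower, upper = conformal_interval(cal_pred, cal_true, test_pred=np.array([0.2, -1.0]))
```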
+
+
+
+
+ + ☆ CT-GLIP: 3D Grounded Language-Image Pretraining with CT Scans and + Radiology Reports for Full-Body Scenarios + + +
+ Medical Vision-Language Pretraining (Med-VLP) establishes a connection +between visual content from medical images and the relevant textual +descriptions. Existing Med-VLP methods primarily focus on 2D images depicting a +single body part, notably chest X-rays. In this paper, we extend the scope of +Med-VLP to encompass 3D images, specifically targeting full-body scenarios, by +using a multimodal dataset of CT images and reports. Compared with the 2D +counterpart, 3D VLP is required to effectively capture essential semantics from +significantly sparser representation in 3D imaging. In this paper, we introduce +CT-GLIP (Grounded Language-Image Pretraining with CT scans), a novel method +that constructs organ-level image-text pairs to enhance multimodal contrastive +learning, aligning grounded visual features with precise diagnostic text. +Additionally, we developed an abnormality dictionary to augment contrastive +learning with diverse negative samples. Our method, trained on a multimodal CT +dataset comprising 44,011 organ-level vision-text pairs from 17,702 patients +across 104 organs, demonstrates it can identify organs and abnormalities in a +zero-shot manner using natural languages. The performance of CT-GLIP is +validated on a separate test set of 1,130 patients, focusing on the 16 most +frequent abnormalities across 7 organs. The experimental results show our +model's superior performance over the standard CLIP framework across zero-shot +and fine-tuning scenarios, using both CNN and ViT architectures. + +
+
+ comment: 12 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Automatic Layout Planning for Visually-Rich Documents with + Instruction-Following Models + + +
+ Recent advancements in instruction-following models have made user +interactions with models more user-friendly and efficient, broadening their +applicability. In graphic design, non-professional users often struggle to +create visually appealing layouts due to limited skills and resources. In this +work, we introduce a novel multimodal instruction-following framework for +layout planning, allowing users to easily arrange visual elements into tailored +layouts by specifying canvas size and design purpose, such as for book covers, +posters, brochures, or menus. We developed three layout reasoning tasks to +train the model in understanding and executing layout instructions. Experiments +on two benchmarks show that our method not only simplifies the design process +for non-professionals but also surpasses the performance of few-shot GPT-4V +models, with mIoU higher by 12% on Crello. This progress highlights the +potential of multimodal instruction-following models to automate and simplify +the design process, providing an approachable solution for a wide range of +design tasks on visually-rich documents. + +
+
+
+
+
+ + ☆ From Parts to Whole: A Unified Reference Framework for Controllable + Human Image Generation + + +
+ Recent advancements in controllable human image generation have led to +zero-shot generation using structural signals (e.g., pose, depth) or facial +appearance. Yet, generating human images conditioned on multiple parts of human +appearance remains challenging. Addressing this, we introduce Parts2Whole, a +novel framework designed for generating customized portraits from multiple +reference images, including pose images and various aspects of human +appearance. To achieve this, we first develop a semantic-aware appearance +encoder to retain details of different human parts, which processes each image +based on its textual label to a series of multi-scale feature maps rather than +one image token, preserving the image dimension. Second, our framework supports +multi-image conditioned generation through a shared self-attention mechanism +that operates across reference and target features during the diffusion +process. We enhance the vanilla attention mechanism by incorporating mask +information from the reference human images, allowing for the precise selection +of any part. Extensive experiments demonstrate the superiority of our approach +over existing alternatives, offering advanced capabilities for multi-part +controllable human image customization. See our project page at +https://huanngzh.github.io/Parts2Whole/. + +
+
+
+
+
+ + ☆ TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via + Gaussian Splatting + + +
+ Radiance fields have demonstrated impressive performance in synthesizing lifelike 3D talking heads. However, due to the difficulty in fitting steep appearance changes, the prevailing paradigm that presents facial motions by directly modifying point appearance may lead to distortions in dynamic regions. To tackle this challenge, we introduce TalkingGaussian, a deformation-based radiance fields framework for high-fidelity talking head synthesis. Leveraging point-based Gaussian Splatting, facial motions can be represented in our method by applying smooth and continuous deformations to persistent Gaussian primitives, without requiring the difficult appearance changes to be learned as in previous methods. Owing to this simplification, precise facial motions can be synthesized while keeping facial features highly intact. Under such a deformation paradigm, we further identify a face-mouth motion inconsistency that would affect the learning of detailed speaking motions. To address this conflict, we decompose the model into two branches, one for the face and one for the inside-mouth area, thereby simplifying the learning tasks and helping reconstruct more accurate motion and structure of the mouth region. Extensive experiments demonstrate that our method renders high-quality lip-synchronized talking head videos, with better facial fidelity and higher efficiency compared with previous methods.
+
+ comment: Project page: https://fictionarry.github.io/TalkingGaussian/ +
+
+
+
+
+ + ☆ Multi-Session SLAM with Differentiable Wide-Baseline Pose Optimization CVPR 2024 + + +
+ We introduce a new system for Multi-Session SLAM, which tracks camera motion +across multiple disjoint videos under a single global reference. Our approach +couples the prediction of optical flow with solver layers to estimate camera +pose. The backbone is trained end-to-end using a novel differentiable solver +for wide-baseline two-view pose. The full system can connect disjoint +sequences, perform visual odometry, and global optimization. Compared to +existing approaches, our design is accurate and robust to catastrophic +failures. Code is available at github.com/princeton-vl/MultiSlam_DiffPose + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FlowMap: High-Quality Camera Poses, Intrinsics, and Depth via Gradient + Descent + + +
+ This paper introduces FlowMap, an end-to-end differentiable method that +solves for precise camera poses, camera intrinsics, and per-frame dense depth +of a video sequence. Our method performs per-video gradient-descent +minimization of a simple least-squares objective that compares the optical flow +induced by depth, intrinsics, and poses against correspondences obtained via +off-the-shelf optical flow and point tracking. Alongside the use of point +tracks to encourage long-term geometric consistency, we introduce +differentiable re-parameterizations of depth, intrinsics, and pose that are +amenable to first-order optimization. We empirically show that camera +parameters and dense depth recovered by our method enable photo-realistic novel +view synthesis on 360-degree trajectories using Gaussian Splatting. Our method +not only far outperforms prior gradient-descent based bundle adjustment +methods, but surprisingly performs on par with COLMAP, the state-of-the-art SfM +method, on the downstream task of 360-degree novel view synthesis (even though +our method is purely gradient-descent based, fully differentiable, and presents +a complete departure from conventional SfM). + +
+
+ comment: Project website: https://cameronosmith.github.io/flowmap/ +
+
+
+
+
+ + ☆ TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and + Proprioception Estimation + + +
+ Legged navigation is typically examined within open-world, off-road, and challenging environments. In these scenarios, estimating external disturbances requires a complex synthesis of multi-modal information. This underlines a major limitation in existing works that primarily focus on avoiding obstacles. In this work, we propose TOP-Nav, a novel legged navigation framework that integrates a comprehensive path planner with Terrain awareness, Obstacle avoidance and closed-loop Proprioception. TOP-Nav underscores the synergies between vision and proprioception in both path and motion planning. Within the path planner, we present and integrate a terrain estimator that enables the robot to select waypoints on terrains with higher traversability while effectively avoiding obstacles. At the motion planning level, we not only implement a locomotion controller to track the navigation commands, but also construct a proprioception advisor to provide motion evaluations for the path planner. Based on the closed-loop motion feedback, we make online corrections for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav achieves open-world navigation in which the robot can handle terrains or disturbances beyond the distribution of prior knowledge and overcome constraints imposed by visual conditions. Building upon extensive experiments conducted in both simulation and real-world environments, TOP-Nav demonstrates superior performance in open-world navigation compared to existing methods.
+
+
+
+
+ + ☆ UniMERNet: A Universal Network for Real-World Mathematical Expression + Recognition + + +
+ This paper presents the UniMER dataset to provide the first study on +Mathematical Expression Recognition (MER) towards complex real-world scenarios. +The UniMER dataset consists of a large-scale training set UniMER-1M offering an +unprecedented scale and diversity with one million training instances and a +meticulously designed test set UniMER-Test that reflects a diverse range of +formula distributions prevalent in real-world scenarios. Therefore, the UniMER +dataset enables the training of a robust and high-accuracy MER model and +comprehensive evaluation of model performance. Moreover, we introduce the +Universal Mathematical Expression Recognition Network (UniMERNet), an +innovative framework designed to enhance MER in practical scenarios. UniMERNet +incorporates a Length-Aware Module to process formulas of varied lengths +efficiently, thereby enabling the model to handle complex mathematical +expressions with greater accuracy. In addition, UniMERNet employs our UniMER-1M +data and image augmentation techniques to improve the model's robustness under +different noise conditions. Our extensive experiments demonstrate that +UniMERNet outperforms existing MER models, setting a new benchmark in various +scenarios and ensuring superior recognition quality in real-world applications. +The dataset and model are available at +https://github.com/opendatalab/UniMERNet. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ Source-free Domain Adaptation for Video Object Detection Under Adverse + Image Conditions CVPR 2024 + + +
+ When deploying pre-trained video object detectors in real-world scenarios, +the domain gap between training and testing data caused by adverse image +conditions often leads to performance degradation. Addressing this issue +becomes particularly challenging when only the pre-trained model and degraded +videos are available. Although various source-free domain adaptation (SFDA) +methods have been proposed for single-frame object detectors, SFDA for video +object detection (VOD) remains unexplored. Moreover, most unsupervised domain +adaptation works for object detection rely on two-stage detectors, while SFDA +for one-stage detectors, which are more vulnerable to fine-tuning, is not well +addressed in the literature. In this paper, we propose Spatial-Temporal +Alternate Refinement with Mean Teacher (STAR-MT), a simple yet effective SFDA +method for VOD. Specifically, we aim to improve the performance of the +one-stage VOD method, YOLOV, under adverse image conditions, including noise, +air turbulence, and haze. Extensive experiments on the ImageNetVOD dataset and +its degraded versions demonstrate that our method consistently improves video +object detection performance in challenging imaging conditions, showcasing its +potential for real-world applications. + +
+
+ comment: accepted by the UG2+ workshop at CVPR 2024 +
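Mean Teacher-style adaptation, which the method above builds on, keeps a slowly moving teacher whose weights are an exponential moving average (EMA) of the student. The sketch below shows only that generic EMA update with a stand-in model; the spatial-temporal alternate refinement of STAR-MT itself is not reproduced.

```python
import copy
import torch

# Generic Mean Teacher-style EMA update used by many SFDA methods (a sketch).

@torch.no_grad()
def update_teacher(teacher, student, momentum=0.999):
    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
        t_param.mul_(momentum).add_(s_param, alpha=1.0 - momentum)

student = torch.nn.Linear(16, 4)       # stand-in for the one-stage detector
teacher = copy.deepcopy(student)
for p in teacher.parameters():
    p.requires_grad_(False)

# ... adaptation step: teacher produces pseudo-labels, student is optimized ...
update_teacher(teacher, student)
```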
+
+
+
+
+ + ☆ Efficient Transformer Encoders for Mask2Former-style models + + +
+ Vision transformer based models bring significant improvements for image segmentation tasks. Although these architectures offer powerful capabilities irrespective of specific segmentation tasks, their use of computational resources can be taxing on deployed devices. One way to overcome this challenge is by adapting the computation level to the specific needs of the input image rather than the current one-size-fits-all approach. To this end, we introduce ECO-M2F, or EffiCient TransfOrmer Encoders for Mask2Former-style models. Noting that the encoder module of M2F-style models incurs resource-intensive computations, ECO-M2F provides a strategy to self-select the number of hidden layers in the encoder, conditioned on the input image. To enable this self-selection ability while balancing performance and computational efficiency, we present a three-step recipe. The first step is to train the parent architecture to enable early exiting from the encoder. The second step is to create a derived dataset of the ideal number of encoder layers required for each training example. The third step is to use the aforementioned derived dataset to train a gating network that predicts the number of encoder layers to be used, conditioned on the input image. Additionally, to change the computation-accuracy tradeoff, only steps two and three need to be repeated, which significantly reduces retraining time. Experiments on public datasets show that the proposed approach reduces the expected encoder computational cost while maintaining performance, adapts to various user compute resources, is flexible in architecture configurations, and can be extended beyond the segmentation task to object detection.
+
+
+
+
+ + ☆ Massively Annotated Datasets for Assessment of Synthetic and Real Data + in Face Recognition + + +
+ Face recognition applications have grown in parallel with the size of +datasets, complexity of deep learning models and computational power. However, +while deep learning models evolve to become more capable and computational +power keeps increasing, the datasets available are being retracted and removed +from public access. Privacy and ethical concerns are relevant topics within +these domains. Through generative artificial intelligence, researchers have put +efforts into the development of completely synthetic datasets that can be used +to train face recognition systems. Nonetheless, the recent advances have not +been sufficient to achieve performance comparable to the state-of-the-art +models trained on real data. To study the drift between the performance of +models trained on real and synthetic datasets, we leverage a massive attribute +classifier (MAC) to create annotations for four datasets: two real and two +synthetic. From these annotations, we conduct studies on the distribution of +each attribute within all four datasets. Additionally, we further inspect the +differences between real and synthetic datasets on the attribute set. When +comparing through the Kullback-Leibler divergence we have found differences +between real and synthetic samples. Interestingly enough, we have verified that +while real samples suffice to explain the synthetic distribution, the opposite +could not be further from being true. + +
+
+ comment: Accepted at FG 2024 +
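A minimal version of the distribution comparison described above is a KL divergence between the empirical histograms of one discrete attribute on real versus synthetic faces. The attribute counts below are made up; the asymmetry between the two directions is what statements like "real samples explain the synthetic distribution but not vice versa" refer to.

```python
import numpy as np

# KL divergence between attribute histograms of real vs. synthetic data.

def kl_divergence(p_counts, q_counts, eps=1e-12):
    p = np.asarray(p_counts, dtype=float); p /= p.sum()
    q = np.asarray(q_counts, dtype=float); q /= q.sum()
    return float(np.sum(p * np.log((p + eps) / (q + eps))))

real_hist = [120, 340, 280, 60]     # e.g., predicted age-bin counts, real data
synth_hist = [200, 310, 150, 40]    # same attribute, synthetic data
print("KL(real || synthetic) =", kl_divergence(real_hist, synth_hist))
print("KL(synthetic || real) =", kl_divergence(synth_hist, real_hist))
```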
+
+
+
+
+ + ☆ Re-Thinking Inverse Graphics With Large Language Models + + +
+ Inverse graphics -- the task of inverting an image into physical variables +that, when rendered, enable reproduction of the observed scene -- is a +fundamental challenge in computer vision and graphics. Disentangling an image +into its constituent elements, such as the shape, color, and material +properties of the objects of the 3D scene that produced it, requires a +comprehensive understanding of the environment. This requirement limits the +ability of existing carefully engineered approaches to generalize across +domains. Inspired by the zero-shot ability of large language models (LLMs) to +generalize to novel contexts, we investigate the possibility of leveraging the +broad world knowledge encoded in such models in solving inverse-graphics +problems. To this end, we propose the Inverse-Graphics Large Language Model +(IG-LLM), an inverse-graphics framework centered around an LLM, that +autoregressively decodes a visual embedding into a structured, compositional +3D-scene representation. We incorporate a frozen pre-trained visual encoder and +a continuous numeric head to enable end-to-end training. Through our +investigation, we demonstrate the potential of LLMs to facilitate inverse +graphics through next-token prediction, without the use of image-space +supervision. Our analysis opens up new possibilities for precise spatial +reasoning about images that exploit the visual knowledge of LLMs. We will +release our code and data to ensure the reproducibility of our investigation +and to facilitate future research at https://ig-llm.is.tue.mpg.de/ + +
+
+ comment: 31 pages; project page: https://ig-llm.is.tue.mpg.de/ +
+
+
+
+
+ + ☆ Deep Models for Multi-View 3D Object Recognition: A Review + + +
+ Human decision-making often relies on visual information from multiple +perspectives or views. In contrast, machine learning-based object recognition +utilizes information from a single image of the object. However, the +information conveyed by a single image may not be sufficient for accurate +decision-making, particularly in complex recognition problems. The utilization +of multi-view 3D representations for object recognition has thus far +demonstrated the most promising results for achieving state-of-the-art +performance. This review paper comprehensively covers recent progress in +multi-view 3D object recognition methods for 3D classification and retrieval +tasks. Specifically, we focus on deep learning-based and transformer-based +techniques, as they are widely utilized and have achieved state-of-the-art +performance. We provide detailed information about existing deep learning-based +and transformer-based multi-view 3D object recognition models, including the +most commonly used 3D datasets, camera configurations and number of views, view +selection strategies, pre-trained CNN architectures, fusion strategies, and +recognition performance on 3D classification and 3D retrieval tasks. +Additionally, we examine various computer vision applications that use +multi-view classification. Finally, we highlight key findings and future +directions for developing multi-view 3D object recognition methods to provide +readers with a comprehensive understanding of the field. + +
+
+
+
+
+ + ☆ Closed Loop Interactive Embodied Reasoning for Robot Manipulation + + +
+ Embodied reasoning systems integrate robotic hardware and cognitive processes to perform complex tasks, typically in response to a natural language query about a specific physical environment. This usually involves changing the belief about the scene or physically interacting with and changing the scene (e.g. 'Sort the objects from lightest to heaviest'). In order to facilitate the development of such systems we introduce a new simulation environment that makes use of the MuJoCo physics engine and the high-quality renderer Blender to provide realistic visual observations that are also accurate to the physical state of the scene. Together with the simulator we propose a new benchmark composed of 10 classes of multi-step reasoning scenarios that require simultaneous visual and physical measurements. Finally, we develop a new modular Closed Loop Interactive Reasoning (CLIER) approach that takes into account the measurements of non-visual object properties, changes in the scene caused by external disturbances, as well as uncertain outcomes of robotic actions. We extensively evaluate our reasoning approach in simulation and in real-world manipulation tasks, with success rates above 76% and 64%, respectively.
+
+
+
+
+ + ☆ Fourier-enhanced Implicit Neural Fusion Network for Multispectral and + Hyperspectral Image Fusion + + +
+ Recently, implicit neural representations (INR) have made significant strides in various vision-related domains, providing a novel solution for Multispectral and Hyperspectral Image Fusion (MHIF) tasks. However, INR is prone to losing high-frequency information and lacks global perceptual capabilities. To address these issues, this paper introduces a Fourier-enhanced Implicit Neural Fusion Network (FeINFN) specifically designed for the MHIF task, motivated by the following observation: the Fourier amplitudes of the HR-HSI latent code and the LR-HSI are remarkably similar, whereas their phases exhibit different patterns. In FeINFN, we propose a spatial and frequency implicit fusion function (Spa-Fre IFF), helping INR capture high-frequency information and expanding the receptive field. In addition, a new decoder employing a complex Gabor wavelet activation function, called the Spatial-Frequency Interactive Decoder (SFID), is introduced to enhance the interaction of INR features. In particular, we theoretically prove that the Gabor wavelet activation possesses a time-frequency tightness property that favors learning the optimal bandwidths in the decoder. Experiments on two benchmark MHIF datasets verify the state-of-the-art (SOTA) performance of the proposed method, both visually and quantitatively. Ablation studies further validate the individual contributions. The code will be available on Anonymous GitHub (https://anonymous.4open.science/r/FeINFN-15C9/) after possible acceptance.
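For context, a complex Gabor wavelet activation of the kind referred to above (popularized by WIRE-style implicit networks) multiplies a complex sinusoid by a Gaussian envelope. The module below is a sketch with illustrative omega0/sigma0 values, not the decoder actually used in SFID.

```python
import torch
import torch.nn as nn

# Sketch of a complex Gabor wavelet activation: complex sinusoid times a
# Gaussian envelope. omega0 and sigma0 are illustrative hyperparameters.

class ComplexGaborActivation(nn.Module):
    def __init__(self, omega0=10.0, sigma0=10.0):
        super().__init__()
        self.omega0 = omega0
        self.sigma0 = sigma0

    def forward(self, x):
        omega = self.omega0 * x
        scale = self.sigma0 * x
        return torch.exp(1j * omega - scale.abs().square())   # complex output

act = ComplexGaborActivation()
feat = act(torch.randn(8, 256))      # complex-valued features of shape (8, 256)
```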
+
+
+
+
+ + ☆ Adaptive Mixed-Scale Feature Fusion Network for Blind AI-Generated Image + Quality Assessment + + +
+ With the increasing maturity of text-to-image and image-to-image generative models, AI-generated images (AGIs) have shown great application potential in advertisement, entertainment, education, social media, etc. Although remarkable advancements have been achieved in generative models, little effort has been devoted to designing relevant quality assessment models. In this paper, we propose a novel blind image quality assessment (IQA) network, named AMFF-Net, for AGIs. AMFF-Net evaluates AGI quality from three dimensions, i.e., "visual quality", "authenticity", and "consistency". Specifically, inspired by the characteristics of the human visual system and motivated by the observation that "visual quality" and "authenticity" are characterized by both local and global aspects, AMFF-Net scales the image up and down and takes the scaled images and the original-sized image as inputs to obtain multi-scale features. After that, an Adaptive Feature Fusion (AFF) block is used to adaptively fuse the multi-scale features with learnable weights. In addition, considering the correlation between the image and the prompt, AMFF-Net compares the semantic features from the text encoder and the image encoder to evaluate the text-to-image alignment. We carry out extensive experiments on three AGI quality assessment databases, and the experimental results show that our AMFF-Net obtains better performance than nine state-of-the-art blind IQA methods. The results of ablation experiments further demonstrate the effectiveness of the proposed multi-scale input strategy and AFF block.
+
+ comment: IEEE Transactions on Broadcasting (TBC) +
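The "adaptive fusion with learnable weights" idea can be sketched as a small gating module that predicts softmax weights over the multi-scale features and returns their weighted sum. The dimensions and the simple gate below are assumptions for illustration; the actual AFF block may use richer interactions.

```python
import torch
import torch.nn as nn

# Minimal sketch of adaptively fusing multi-scale features with learnable weights.

class AdaptiveFeatureFusion(nn.Module):
    def __init__(self, num_scales, dim):
        super().__init__()
        self.gate = nn.Linear(num_scales * dim, num_scales)

    def forward(self, feats):
        """feats: list of (B, dim) feature vectors, one per scale."""
        stacked = torch.stack(feats, dim=1)                        # (B, S, dim)
        weights = self.gate(stacked.flatten(1)).softmax(dim=-1)    # (B, S)
        return (weights.unsqueeze(-1) * stacked).sum(dim=1)        # (B, dim)

aff = AdaptiveFeatureFusion(num_scales=3, dim=512)
fused = aff([torch.randn(2, 512) for _ in range(3)])
```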
+
+
+
+
+ + ☆ Combating Missing Modalities in Egocentric Videos at Test Time + + +
+ Understanding videos that contain multiple modalities is crucial, especially +in egocentric videos, where combining various sensory inputs significantly +improves tasks like action recognition and moment localization. However, +real-world applications often face challenges with incomplete modalities due to +privacy concerns, efficiency needs, or hardware issues. Current methods, while +effective, often necessitate retraining the model entirely to handle missing +modalities, making them computationally intensive, particularly with large +training datasets. In this study, we propose a novel approach to address this +issue at test time without requiring retraining. We frame the problem as a +test-time adaptation task, where the model adjusts to the available unlabeled +data at test time. Our method, MiDl~(Mutual information with +self-Distillation), encourages the model to be insensitive to the specific +modality source present during testing by minimizing the mutual information +between the prediction and the available modality. Additionally, we incorporate +self-distillation to maintain the model's original performance when both +modalities are available. MiDl represents the first self-supervised, online +solution for handling missing modalities exclusively at test time. Through +experiments with various pretrained models and datasets, MiDl demonstrates +substantial performance improvement without the need for retraining. + +
+
+
+
+
+ + ☆ CutDiffusion: A Simple, Fast, Cheap, and Strong Diffusion Extrapolation + Method + + +
+ Transforming large pre-trained low-resolution diffusion models to cater to higher-resolution demands, i.e., diffusion extrapolation, significantly improves diffusion adaptability. We propose tuning-free CutDiffusion, aimed at simplifying and accelerating the diffusion extrapolation process, making it more affordable and improving performance. CutDiffusion abides by the existing patch-wise extrapolation but cuts a standard patch diffusion process into an initial phase focused on comprehensive structure denoising and a subsequent phase dedicated to specific detail refinement. Comprehensive experiments highlight the advantages of CutDiffusion: (1) simple method construction that enables a concise higher-resolution diffusion process without third-party engagement; (2) fast inference speed achieved through a single-step higher-resolution diffusion process and fewer required inference patches; (3) cheap GPU cost resulting from patch-wise inference and fewer patches during the comprehensive structure denoising; (4) strong generation performance, stemming from the emphasis on specific detail refinement.
+
+
+
+
+ + ☆ Gallbladder Cancer Detection in Ultrasound Images based on YOLO and + Faster R-CNN + + +
+ Medical image analysis is a significant application of artificial intelligence for disease diagnosis. A crucial step in this process is the identification of regions of interest within the images. This task can be automated using object detection algorithms, of which YOLO and Faster R-CNN are renowned examples, each with its own strengths and weaknesses. This study aims to explore the advantages of both techniques to select more accurate bounding boxes for gallbladder detection from ultrasound images, thereby enhancing gallbladder cancer classification. A fusion method that leverages the benefits of both techniques is presented in this study. The proposed method demonstrated superior classification performance, with an accuracy of 92.62%, compared to the individual use of Faster R-CNN and YOLOv8, which yielded accuracies of 90.16% and 82.79%, respectively.
+
+ comment: Published in 2024 10th International Conference on Artificial + Intelligence and Robotics (QICAR) +
+
+
+
+
+ + ☆ MedDr: Diagnosis-Guided Bootstrapping for Large-Scale Medical + Vision-Language Learning + + +
+ The rapid advancement of large-scale vision-language models has showcased +remarkable capabilities across various tasks. However, the lack of extensive +and high-quality image-text data in medicine has greatly hindered the +development of large-scale medical vision-language models. In this work, we +present a diagnosis-guided bootstrapping strategy that exploits both image and +label information to construct vision-language datasets. Based on the +constructed dataset, we developed MedDr, a generalist foundation model for +healthcare capable of handling diverse medical data modalities, including +radiology, pathology, dermatology, retinography, and endoscopy. Moreover, +during inference, we propose a simple but effective retrieval-augmented medical +diagnosis strategy, which enhances the model's generalization ability. +Extensive experiments on visual question answering, medical report generation, +and medical image diagnosis demonstrate the superiority of our method. + +
+
+
+
+
+ + ☆ Taming Diffusion Probabilistic Models for Character Control SIGGRAPH 2024 + + +
+ We present a novel character control framework that effectively utilizes +motion diffusion probabilistic models to generate high-quality and diverse +character animations, responding in real-time to a variety of dynamic +user-supplied control signals. At the heart of our method lies a +transformer-based Conditional Autoregressive Motion Diffusion Model (CAMDM), +which takes as input the character's historical motion and can generate a range +of diverse potential future motions conditioned on high-level, coarse user +control. To meet the demands for diversity, controllability, and computational +efficiency required by a real-time controller, we incorporate several key +algorithmic designs. These include separate condition tokenization, +classifier-free guidance on past motion, and heuristic future trajectory +extension, all designed to address the challenges associated with taming motion +diffusion probabilistic models for character control. As a result, our work +represents the first model that enables real-time generation of high-quality, +diverse character animations based on user interactive control, supporting +animating the character in multiple styles with a single unified model. We +evaluate our method on a diverse set of locomotion skills, demonstrating the +merits of our method over existing character controllers. Project page and +source codes: https://aiganimation.github.io/CAMDM/ + +
+
+ comment: Accepted by SIGGRAPH 2024 (Conference Track). Project page and source + codes: https://aiganimation.github.io/CAMDM/ +
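Classifier-free guidance on a condition such as the past-motion context follows the usual recipe: during training the condition is randomly dropped, and at sampling time the conditional and unconditional predictions are extrapolated. The function below sketches only the sampling-time combination; `model`, its call signature, and the guidance scale are placeholders rather than CAMDM's actual interface.

```python
import torch

# Sampling-time classifier-free guidance on a past-motion condition (sketch).

def cfg_predict(model, x_t, t, past_motion, guidance_scale=2.0):
    eps_cond = model(x_t, t, past_motion)    # conditioned on past motion
    eps_uncond = model(x_t, t, None)         # condition dropped
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)
```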
+
+
+
+
+ + ☆ Multimodal Large Language Model is a Human-Aligned Annotator for + Text-to-Image Generation + + +
+ Recent studies have demonstrated the exceptional potential of leveraging human preference datasets to refine text-to-image generative models, enhancing the alignment between generated images and textual prompts. Despite these advances, current human preference datasets are either prohibitively expensive to construct or suffer from a lack of diversity in preference dimensions, resulting in limited applicability for instruction tuning in open-source text-to-image generative models and hindering further exploration. To address these challenges and promote the alignment of generative models through instruction tuning, we leverage multimodal large language models to create VisionPrefer, a high-quality and fine-grained preference dataset that captures multiple preference aspects. We aggregate feedback from AI annotators across four aspects: prompt-following, aesthetics, fidelity, and harmlessness to construct VisionPrefer. To validate the effectiveness of VisionPrefer, we train a reward model VP-Score over VisionPrefer to guide the training of text-to-image generative models, and the preference prediction accuracy of VP-Score is comparable to that of human annotators. Furthermore, we use two reinforcement learning methods to fine-tune generative models and evaluate the performance of VisionPrefer, and extensive experimental results demonstrate that VisionPrefer significantly improves text-image alignment in compositional image generation across diverse aspects, e.g., aesthetics, and generalizes better than previous human-preference metrics across various image distributions. Moreover, VisionPrefer indicates that the integration of AI-generated synthetic data as a supervisory signal is a promising avenue for achieving improved alignment with human preferences in vision generative models.
+
+
+
+
+ + ☆ Harnessing Optical Imaging Limit through Atmospheric Scattering Media + + +
+ Recording and identifying faint objects through atmospheric scattering media with an optical system is fundamentally interesting and technologically important. In this work, we introduce a comprehensive model that incorporates contributions from target characteristics, atmospheric effects, the imaging system, digital processing, and visual perception to assess the ultimate perceptible limit of geometrical imaging, specifically the angular resolution at the boundary of visible distance. The model allows us to reevaluate the effectiveness of conventional imaging recording, processing, and perception and to analyze the limiting factors that constrain image recognition capabilities in atmospheric media. The simulations were compared with experimental results measured in a fog chamber and in outdoor settings. The results reveal generally good agreement between analysis and experiment, pointing the way toward harnessing the physical limit for optical imaging in scattering media. An immediate application of the study is the extension of the imaging range by a factor of 1.2 through noise reduction via multi-frame averaging, hence greatly enhancing the capability of optical imaging in the atmosphere.
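The multi-frame averaging mentioned at the end relies on the fact that averaging N independently noisy frames of a static scene reduces the noise standard deviation by roughly sqrt(N). The NumPy sketch below verifies this with purely synthetic numbers.

```python
import numpy as np

# Averaging N independently noisy frames reduces noise std by ~sqrt(N).
rng = np.random.default_rng(0)
scene = np.full((128, 128), 0.2)                           # faint static target
frames = scene + rng.normal(0.0, 0.5, size=(16, 128, 128))

single_noise = (frames[0] - scene).std()
avg_noise = (frames.mean(axis=0) - scene).std()
print(f"noise reduction from 16-frame averaging: {single_noise / avg_noise:.1f}x (expected ~4)")
```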
+
+
+
+
+ + ☆ Perturbing Attention Gives You More Bang for the Buck: Subtle Imaging + Perturbations That Efficiently Fool Customized Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) have ushered in a new era of generative modeling and offer more opportunities for efficiently generating high-quality and realistic data samples. However, their widespread use has also brought forth new challenges in model security, which motivates the creation of more effective adversarial attackers on DMs to understand their vulnerability. We propose CAAT, a simple but generic and efficient approach that does not require costly training to effectively fool latent diffusion models (LDMs). The approach is based on the observation that cross-attention layers exhibit higher sensitivity to gradient change, allowing for leveraging subtle perturbations on published images to significantly corrupt the generated images. We show that a subtle perturbation on an image can significantly impact the cross-attention layers, thus changing the mapping between text and image during the fine-tuning of customized diffusion models. Extensive experiments demonstrate that CAAT is compatible with diverse diffusion models and outperforms baseline attack methods in a more effective (more noise) and efficient (twice as fast as Anti-DreamBooth and Mist) manner.
+
+ comment: Published at CVPR 2024 +
+
+
+
+
+ + ☆ LEAF: Unveiling Two Sides of the Same Coin in Semi-supervised Facial + Expression Recognition + + +
+ Semi-supervised learning has emerged as a promising approach to tackle the +challenge of label scarcity in facial expression recognition (FER) task. +However, current state-of-the-art methods primarily focus on one side of the +coin, i.e., generating high-quality pseudo-labels, while overlooking the other +side: enhancing expression-relevant representations. In this paper, we unveil +both sides of the coin by proposing a unified framework termed hierarchicaL +dEcoupling And Fusing (LEAF) to coordinate expression-relevant representations +and pseudo-labels for semi-supervised FER. LEAF introduces a hierarchical +expression-aware aggregation strategy that operates at three levels: semantic, +instance, and category. (1) At the semantic and instance levels, LEAF decouples +representations into expression-agnostic and expression-relevant components, +and adaptively fuses them using learnable gating weights. (2) At the category +level, LEAF assigns ambiguous pseudo-labels by decoupling predictions into +positive and negative parts, and employs a consistency loss to ensure agreement +between two augmented views of the same image. Extensive experiments on +benchmark datasets demonstrate that by unveiling and harmonizing both sides of +the coin, LEAF outperforms state-of-the-art semi-supervised FER methods, +effectively leveraging both labeled and unlabeled data. Moreover, the proposed +expression-aware aggregation strategy can be seamlessly integrated into +existing semi-supervised frameworks, leading to significant performance gains. + +
+
+
+
+
+ + ☆ DP-Net: Learning Discriminative Parts for image recognition ICIP 2023 + + +
+ This paper presents the Discriminative Part Network (DP-Net), a deep architecture with strong interpretation capabilities, which exploits a pretrained Convolutional Neural Network (CNN) combined with a part-based recognition module. This system learns and detects parts in the images that are discriminative among categories, without the need for fine-tuning the CNN, making it more scalable than other part-based models. While part-based approaches naturally offer interpretable representations, we propose explanations at image and category levels and introduce specific constraints on the part learning process to make them more discriminative.
+
+ comment: IEEE ICIP 2023 +
+
+
+
+
+ + ☆ IPAD: Industrial Process Anomaly Detection Dataset + + +
+ Video anomaly detection (VAD) is a challenging task aiming to recognize anomalies in video frames, and existing large-scale VAD research primarily focuses on road traffic and human activity scenes. In industrial scenes, there are often a variety of unpredictable anomalies, and VAD methods can play a significant role in these scenarios. However, there is a lack of applicable datasets and methods specifically tailored for industrial production scenarios due to concerns regarding privacy and security. To bridge this gap, we propose a new dataset, IPAD, specifically designed for VAD in industrial scenarios. The industrial processes in our dataset are chosen through on-site factory research and discussions with engineers. This dataset covers 16 different industrial devices and contains over 6 hours of both synthetic and real-world video footage. Moreover, we annotate the key feature of the industrial process, i.e., periodicity. Based on the proposed dataset, we introduce a period memory module and a sliding window inspection mechanism to effectively investigate the periodic information in a basic reconstruction model. Our framework leverages a LoRA adapter to explore the effective migration of pretrained models, which are initially trained using synthetic data, into real-world scenarios. Our proposed dataset and method will fill the gap in the field of industrial video anomaly detection and advance video understanding tasks as well as smart factory deployment.
+
+
+
+
+ + ☆ PRISM: A Promptable and Robust Interactive Segmentation Model with + Visual Prompts + + +
+ In this paper, we present PRISM, a Promptable and Robust Interactive +Segmentation Model, aiming for precise segmentation of 3D medical images. PRISM +accepts various visual inputs, including points, boxes, and scribbles as sparse +prompts, as well as masks as dense prompts. Specifically, PRISM is designed +with four principles to achieve robustness: (1) Iterative learning. The model +produces segmentations by using visual prompts from previous iterations to +achieve progressive improvement. (2) Confidence learning. PRISM employs +multiple segmentation heads per input image, each generating a continuous map +and a confidence score to optimize predictions. (3) Corrective learning. +Following each segmentation iteration, PRISM employs a shallow corrective +refinement network to reassign mislabeled voxels. (4) Hybrid design. PRISM +integrates hybrid encoders to better capture both the local and global +information. Comprehensive validation of PRISM is conducted using four public +datasets for tumor segmentation in the colon, pancreas, liver, and kidney, +highlighting challenges caused by anatomical variations and ambiguous +boundaries in accurate tumor identification. Compared to state-of-the-art +methods, both with and without prompt engineering, PRISM significantly improves +performance, achieving results that are close to human levels. The code is +publicly available at https://github.com/MedICL-VU/PRISM. + +
+
+
+
+
+ + ☆ A Learning Paradigm for Interpretable Gradients + + +
+ This paper studies interpretability of convolutional networks by means of +saliency maps. Most approaches based on Class Activation Maps (CAM) combine +information from fully connected layers and gradient through variants of +backpropagation. However, it is well understood that gradients are noisy and +alternatives like guided backpropagation have been proposed to obtain better +visualization at inference. In this work, we present a novel training approach +to improve the quality of gradients for interpretability. In particular, we +introduce a regularization loss such that the gradient with respect to the +input image obtained by standard backpropagation is similar to the gradient +obtained by guided backpropagation. We find that the resulting gradient is +qualitatively less noisy and improves quantitatively the interpretability +properties of different networks, using several interpretability methods. + +
+
+ comment: VISAPP 2024 +
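Under simplifying assumptions, the regularization idea above can be sketched as follows: compute the standard input gradient with a double-backward graph so it can be penalized, compute a guided-backpropagation-style gradient by clamping the gradients flowing through ReLUs, and penalize their dissimilarity with a cosine loss. The toy model, the hook-based guided-backprop emulation, and the loss form are all illustrative, not the paper's exact training recipe.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def input_gradient(model, x, y, guided=False):
    """Gradient of the loss w.r.t. the input; optionally guided-backprop style."""
    handles = []
    if guided:
        # Emulate guided backprop: zero out negative gradients at every ReLU.
        def clamp_hook(module, grad_in, grad_out):
            return tuple(g.clamp(min=0) if g is not None else None for g in grad_in)
        handles = [m.register_full_backward_hook(clamp_hook)
                   for m in model.modules() if isinstance(m, nn.ReLU)]
    x = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    grad, = torch.autograd.grad(loss, x, create_graph=not guided)
    for h in handles:
        h.remove()
    return grad

# Toy network and batch, purely for illustration.
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                      nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))
x, y = torch.randn(4, 3, 32, 32), torch.randint(0, 10, (4,))

g_std = input_gradient(model, x, y, guided=False)          # differentiable w.r.t. weights
g_gui = input_gradient(model, x, y, guided=True).detach()  # target gradient
align_loss = 1 - F.cosine_similarity(g_std.flatten(1), g_gui.flatten(1)).mean()
```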
+
+
+
+
+ + ☆ A review of deep learning-based information fusion techniques for + multimodal medical image classification + + +
+ Multimodal medical imaging plays a pivotal role in clinical diagnosis and +research, as it combines information from various imaging modalities to provide +a more comprehensive understanding of the underlying pathology. Recently, deep +learning-based multimodal fusion techniques have emerged as powerful tools for +improving medical image classification. This review offers a thorough analysis +of the developments in deep learning-based multimodal fusion for medical +classification tasks. We explore the complementary relationships among +prevalent clinical modalities and outline three main fusion schemes for +multimodal classification networks: input fusion, intermediate fusion +(encompassing single-level fusion, hierarchical fusion, and attention-based +fusion), and output fusion. By evaluating the performance of these fusion +techniques, we provide insight into the suitability of different network +architectures for various multimodal fusion scenarios and application domains. +Furthermore, we delve into challenges related to network architecture +selection, handling incomplete multimodal data management, and the potential +limitations of multimodal fusion. Finally, we spotlight the promising future of +Transformer-based multimodal fusion techniques and give recommendations for +future research in this rapidly evolving field. + +
+
+
+
+
+ + ☆ OccGen: Generative Multi-modal 3D Occupancy Prediction for Autonomous + Driving + + +
+ Existing solutions for 3D semantic occupancy prediction typically treat the task as a one-shot 3D voxel-wise segmentation perception problem. These discriminative methods focus on learning the mapping between the inputs and the occupancy map in a single step, lacking the ability to gradually refine the occupancy map and the imaginative capacity to complete partially observed local regions of the scene. In this paper, we introduce OccGen, a simple yet powerful generative perception model for the task of 3D semantic occupancy prediction. OccGen adopts a ''noise-to-occupancy'' generative paradigm, progressively inferring and refining the occupancy map by predicting and eliminating noise originating from a random Gaussian distribution. OccGen consists of two main components: a conditional encoder that is capable of processing multi-modal inputs, and a progressive refinement decoder that applies diffusion denoising using the multi-modal features as conditions. A key insight of this generative pipeline is that the diffusion denoising process is naturally able to model the coarse-to-fine refinement of the dense 3D occupancy map, therefore producing more detailed predictions. Extensive experiments on several occupancy benchmarks demonstrate the effectiveness of the proposed method compared to state-of-the-art methods. For instance, OccGen relatively enhances the mIoU by 9.5%, 6.3%, and 13.3% on the nuScenes-Occupancy dataset under the multi-modal, LiDAR-only, and camera-only settings, respectively. Moreover, as a generative perception model, OccGen exhibits desirable properties that discriminative models cannot achieve, such as providing uncertainty estimates alongside its multiple-step predictions.
+
+
+
+
+ + ☆ X-3D: Explicit 3D Structure Modeling for Point Cloud Recognition + + +
+ Numerous prior studies predominantly emphasize constructing relation vectors for individual neighborhood points and generating dynamic kernels for each vector and embedding these into high-dimensional spaces to capture implicit local structures. However, we contend that such an implicit high-dimensional structure modeling approach inadequately represents the local geometric structure of point clouds due to the absence of explicit structural information. Hence, we introduce X-3D, an explicit 3D structure modeling approach. X-3D functions by capturing the explicit local structural information within the input 3D space and employing it to produce dynamic kernels with shared weights for all neighborhood points within the current local region. This modeling approach introduces an effective geometric prior and significantly diminishes the disparity between the local structure of the embedding space and the original input point cloud, thereby improving the extraction of local features. Experiments show that our method can be combined with a variety of methods and achieves state-of-the-art performance on segmentation, classification, and detection tasks with lower extra computational cost, such as \textbf{90.7\%} on ScanObjectNN for classification, \textbf{79.2\%} on S3DIS 6-fold and \textbf{74.3\%} on S3DIS Area 5 for segmentation, \textbf{76.3\%} on ScanNetV2 for segmentation, and \textbf{64.5\%} and \textbf{46.9\%} mAP on SUN RGB-D and \textbf{69.0\%} and \textbf{51.1\%} mAP on ScanNetV2 for detection. Our code is available at \href{https://github.com/sunshuofeng/X-3D}{https://github.com/sunshuofeng/X-3D}.
+
+
+
+
+ + ☆ The Brain Tumor Segmentation in Pediatrics (BraTS-PEDs) Challenge: Focus + on Pediatrics (CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs) + + +
+ Pediatric tumors of the central nervous system are the most common cause of +cancer-related death in children. The five-year survival rate for high-grade +gliomas in children is less than 20%. Due to their rarity, the diagnosis of +these entities is often delayed, their treatment is mainly based on historic +treatment concepts, and clinical trials require multi-institutional +collaborations. Here we present the CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs +challenge, focused on pediatric brain tumors with data acquired across multiple +international consortia dedicated to pediatric neuro-oncology and clinical +trials. The CBTN-CONNECT-DIPGR-ASNR-MICCAI BraTS-PEDs challenge brings together +clinicians and AI/imaging scientists to lead to faster development of automated +segmentation techniques that could benefit clinical trials, and ultimately the +care of children with brain tumors. + +
+
+
+
+
+ + ☆ External Prompt Features Enhanced Parameter-efficient Fine-tuning for + Salient Object Detection + + +
+ Salient object detection (SOD) aims at finding the most salient objects in images and outputs pixel-level binary masks. Transformer-based methods achieve promising performance due to their global semantic understanding, which is crucial for identifying salient objects. However, these models tend to be large and require numerous training parameters. To better harness the potential of transformers for SOD, we propose a novel parameter-efficient fine-tuning method aimed at reducing the number of training parameters while enhancing salient object detection capability. Our model, termed EXternal Prompt features Enhanced adapteR Tuning (ExPert), features an encoder-decoder structure with adapters and injectors interspersed between the layers of a frozen transformer encoder. The adapter modules adapt the pre-trained backbone to SOD while the injector modules incorporate external prompt features to enhance the awareness of salient objects. Comprehensive experiments demonstrate the superiority of our method. Surpassing former state-of-the-art (SOTA) models across five SOD datasets, ExPert achieves a mean absolute error (MAE) of 0.215 on the ECSSD dataset with 80.2M trained parameters, 21% better than the transformer-based SOTA model and 47% better than the CNN-based SOTA model.
+
+
+
+
+ + ☆ CA-Stream: Attention-based pooling for interpretable image recognition CVPR + + +
+ Explanations obtained from transformer-based architectures in the form of raw
+attention can be seen as a class-agnostic saliency map. Additionally,
+attention-based pooling serves as a form of masking in feature space. Motivated
+by this observation, we design an attention-based pooling mechanism intended to
+replace Global Average Pooling (GAP) at inference. This mechanism, called
+Cross-Attention Stream (CA-Stream), comprises a stream of cross-attention
+blocks interacting with features at different network depths. CA-Stream
+enhances interpretability in models, while preserving recognition performance.
+
+
+ comment: CVPR XAI4CV workshop 2024 +
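+ A minimal PyTorch sketch of the general mechanism, a learnable query
+cross-attending to backbone tokens in place of global average pooling (class,
+variable names, and dimensions are illustrative assumptions, not the paper's
+code):
+
+ import torch
+ import torch.nn as nn
+
+ class CrossAttentionPool(nn.Module):
+     # Pool a (B, C, H, W) feature map with a learnable query instead of GAP;
+     # the attention weights double as a saliency map for interpretation.
+     def __init__(self, dim: int, num_heads: int = 8):
+         super().__init__()
+         self.query = nn.Parameter(torch.zeros(1, 1, dim))
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.norm = nn.LayerNorm(dim)
+
+     def forward(self, feats):
+         b, c, h, w = feats.shape
+         tokens = feats.flatten(2).transpose(1, 2)        # (B, H*W, C)
+         q = self.query.expand(b, -1, -1)                 # one query per image
+         pooled, weights = self.attn(q, tokens, tokens)   # cross-attention stream
+         return self.norm(pooled.squeeze(1)), weights.reshape(b, h, w)
+
+ vec, saliency = CrossAttentionPool(dim=256)(torch.randn(2, 256, 7, 7))
+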
+
+
+
+
+ + ☆ Other Tokens Matter: Exploring Global and Local Features of Vision + Transformers for Object Re-Identification + + +
+ Object Re-Identification (Re-ID) aims to identify and retrieve specific
+objects from images captured at different places and times. Recently, object
+Re-ID has achieved great success with the advances of Vision Transformers
+(ViT). However, the effects of the global-local relation have not been fully
+explored in Transformers for object Re-ID. In this work, we first explore the
+influence of global and local features of ViT and then further propose a novel
+Global-Local Transformer (GLTrans) for high-performance object Re-ID. We find
+that the features from the last few layers of ViT already have strong
+representational ability, and that the global and local information can
+mutually enhance each other. Based on this observation, we propose a Global
+Aggregation Encoder (GAE) to utilize the class tokens of the last few
+Transformer layers and learn comprehensive global features effectively.
+Meanwhile, we propose Local Multi-layer Fusion (LMF), which leverages both the
+global cues from GAE and multi-layer patch tokens to explore discriminative
+local representations. Extensive experiments demonstrate that our proposed
+method achieves superior performance on four object Re-ID benchmarks.
+
+
+ comment: Accepted by CVIU2024. More modifications may be performed +
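+ A minimal sketch of aggregating the class tokens of the last few ViT layers
+into one global feature, as the abstract describes (layer count, projection
+design, and names are assumptions, not the released code):
+
+ import torch
+ import torch.nn as nn
+
+ class GlobalAggregation(nn.Module):
+     # Concatenate the class tokens of the last K transformer layers and
+     # project them back to the embedding dimension.
+     def __init__(self, dim: int, num_layers: int = 4):
+         super().__init__()
+         self.proj = nn.Sequential(
+             nn.Linear(dim * num_layers, dim),
+             nn.LayerNorm(dim),
+             nn.GELU(),
+         )
+
+     def forward(self, cls_tokens):
+         # cls_tokens: list of (B, dim) class tokens, one per selected layer
+         return self.proj(torch.cat(cls_tokens, dim=-1))
+
+ # hypothetical per-layer CLS tokens from a ViT backbone
+ global_feat = GlobalAggregation(768)([torch.randn(8, 768) for _ in range(4)])  # (8, 768)
+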
+
+
+
+
+ + ☆ SGFormer: Spherical Geometry Transformer for 360 Depth Estimation + + +
+ Panoramic distortion poses a significant challenge in 360 depth estimation, +particularly pronounced at the north and south poles. Existing methods either +adopt a bi-projection fusion strategy to remove distortions or model long-range +dependencies to capture global structures, which can result in either unclear +structure or insufficient local perception. In this paper, we propose a +spherical geometry transformer, named SGFormer, to address the above issues, +with an innovative step to integrate spherical geometric priors into vision +transformers. To this end, we retarget the transformer decoder to a spherical +prior decoder (termed SPDecoder), which endeavors to uphold the integrity of +spherical structures during decoding. Concretely, we leverage bipolar +re-projection, circular rotation, and curve local embedding to preserve the +spherical characteristics of equidistortion, continuity, and surface distance, +respectively. Furthermore, we present a query-based global conditional position +embedding to compensate for spatial structure at varying resolutions. It not +only boosts the global perception of spatial position but also sharpens the +depth structure across different patches. Finally, we conduct extensive +experiments on popular benchmarks, demonstrating our superiority over +state-of-the-art solutions. + +
+
+
+
+
+ + ☆ CAGE: Circumplex Affect Guided Expression Inference CVPR2024 + + +
+ Understanding emotions and expressions is a task of interest across multiple
+disciplines, especially for improving user experiences. Contrary to the common
+perception, it has been shown that emotions are not discrete entities but
+instead exist along a continuum. People understand discrete emotions
+differently due to a variety of factors, including cultural background,
+individual experiences, and cognitive biases. Therefore, most approaches to
+expression understanding, particularly those relying on discrete categories,
+are inherently biased. In this paper, we present a comparative in-depth
+analysis of two common datasets (AffectNet and EMOTIC) equipped with the
+components of the circumplex model of affect. Further, we propose a model for
+the prediction of facial expressions tailored for lightweight applications.
+Using a small-scale MaxViT-based model architecture, we evaluate the impact of
+training with discrete expression category labels alongside the continuous
+valence and arousal labels. We show that considering valence and arousal in
+addition to discrete category labels helps to significantly improve expression
+inference. The proposed model outperforms the current state-of-the-art models
+on AffectNet, establishing it as the best-performing model for inferring
+valence and arousal, achieving a 7% lower RMSE. Training scripts and trained
+weights to reproduce our results can be found here:
+https://github.com/wagner-niklas/CAGE_expression_inference.
+
+
+ comment: Accepted for publication at ABAW Workshop at CVPR2024 +
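+ A sketch of the kind of joint objective the abstract suggests, a
+classification loss on discrete categories plus a regression loss on the
+continuous valence/arousal coordinates (head design, activation, and loss
+weighting are assumptions):
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ExpressionHead(nn.Module):
+     # Predict discrete expression logits plus continuous (valence, arousal).
+     def __init__(self, feat_dim: int, num_classes: int = 8):
+         super().__init__()
+         self.cls_head = nn.Linear(feat_dim, num_classes)
+         self.va_head = nn.Linear(feat_dim, 2)
+
+     def forward(self, feats):
+         return self.cls_head(feats), torch.tanh(self.va_head(feats))  # VA in [-1, 1]
+
+ def combined_loss(logits, va_pred, labels, va_true, alpha=1.0):
+     # cross-entropy on categories + MSE on the circumplex coordinates
+     return F.cross_entropy(logits, labels) + alpha * F.mse_loss(va_pred, va_true)
+
+ head = ExpressionHead(512)
+ logits, va = head(torch.randn(4, 512))
+ loss = combined_loss(logits, va, torch.randint(0, 8, (4,)), torch.rand(4, 2) * 2 - 1)
+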
+
+
+
+
+ + ☆ CenterArt: Joint Shape Reconstruction and 6-DoF Grasp Estimation of + Articulated Objects ICRA 2024 + + +
+ Precisely grasping and reconstructing articulated objects is key to enabling +general robotic manipulation. In this paper, we propose CenterArt, a novel +approach for simultaneous 3D shape reconstruction and 6-DoF grasp estimation of +articulated objects. CenterArt takes RGB-D images of the scene as input and +first predicts the shape and joint codes through an encoder. The decoder then +leverages these codes to reconstruct 3D shapes and estimate 6-DoF grasp poses +of the objects. We further develop a mechanism for generating a dataset of +6-DoF grasp ground truth poses for articulated objects. CenterArt is trained on +realistic scenes containing multiple articulated objects with randomized +designs, textures, lighting conditions, and realistic depths. We perform +extensive experiments demonstrating that CenterArt outperforms existing methods +in accuracy and robustness. + +
+
+ comment: 4 pages, 2 figures, accepted to the ICRA 2024 Workshop on 3D Visual + Representations for Robot Manipulation +
+
+
+
+
+ + ☆ CoARF: Controllable 3D Artistic Style Transfer for Radiance Fields + + +
+ Creating artistic 3D scenes can be time-consuming and requires specialized +knowledge. To address this, recent works such as ARF, use a radiance +field-based approach with style constraints to generate 3D scenes that resemble +a style image provided by the user. However, these methods lack fine-grained +control over the resulting scenes. In this paper, we introduce Controllable +Artistic Radiance Fields (CoARF), a novel algorithm for controllable 3D scene +stylization. CoARF enables style transfer for specified objects, compositional +3D style transfer and semantic-aware style transfer. We achieve controllability +using segmentation masks with different label-dependent loss functions. We also +propose a semantic-aware nearest neighbor matching algorithm to improve the +style transfer quality. Our extensive experiments demonstrate that CoARF +provides user-specified controllability of style transfer and superior style +transfer quality with more precise feature matching. + +
+
+ comment: International Conference on 3D Vision 2024 +
+
+
+
+
+ + ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ DAWN: Domain-Adaptive Weakly Supervised Nuclei Segmentation via + Cross-Task Interactions + + +
+ Weakly supervised segmentation methods have gained significant attention due +to their ability to reduce the reliance on costly pixel-level annotations +during model training. However, the current weakly supervised nuclei +segmentation approaches typically follow a two-stage pseudo-label generation +and network training process. The performance of the nuclei segmentation +heavily relies on the quality of the generated pseudo-labels, thereby limiting +its effectiveness. This paper introduces a novel domain-adaptive weakly +supervised nuclei segmentation framework using cross-task interaction +strategies to overcome the challenge of pseudo-label generation. Specifically, +we utilize weakly annotated data to train an auxiliary detection task, which +assists the domain adaptation of the segmentation network. To enhance the +efficiency of domain adaptation, we design a consistent feature constraint +module integrating prior knowledge from the source domain. Furthermore, we +develop pseudo-label optimization and interactive training methods to improve +the domain transfer capability. To validate the effectiveness of our proposed +method, we conduct extensive comparative and ablation experiments on six +datasets. The results demonstrate the superiority of our approach over existing +weakly supervised approaches. Remarkably, our method achieves comparable or +even better performance than fully supervised methods. Our code will be +released in https://github.com/zhangye-zoe/DAWN. + +
+
+ comment: 13 pages, 11 figures, 8 tables +
+
+
+
+
+ + ☆ Traditional to Transformers: A Survey on Current Trends and Future + Prospects for Hyperspectral Image Classification + + +
+ Hyperspectral image classification is a challenging task due to the high +dimensionality and complex nature of hyperspectral data. In recent years, deep +learning techniques have emerged as powerful tools for addressing these +challenges. This survey provides a comprehensive overview of the current trends +and future prospects in hyperspectral image classification, focusing on the +advancements from deep learning models to the emerging use of transformers. We +review the key concepts, methodologies, and state-of-the-art approaches in deep +learning for hyperspectral image classification. Additionally, we discuss the +potential of transformer-based models in this field and highlight the +advantages and challenges associated with these approaches. Comprehensive +experimental results have been undertaken using three Hyperspectral datasets to +verify the efficacy of various conventional deep-learning models and +Transformers. Finally, we outline future research directions and potential +applications that can further enhance the accuracy and efficiency of +hyperspectral image classification. + The Source code is available at +https://github.com/mahmad00/Conventional-to-Transformer-for-Hyperspectral-Image-Classification-Survey-2024. + +
+
+
+
+
+ + ☆ Leveraging Speech for Gesture Detection in Multimodal Communication + + +
+ Gestures are inherent to human interaction and often complement speech in +face-to-face communication, forming a multimodal communication system. An +important task in gesture analysis is detecting a gesture's beginning and end. +Research on automatic gesture detection has primarily focused on visual and +kinematic information to detect a limited set of isolated or silent gestures +with low variability, neglecting the integration of speech and vision signals +to detect gestures that co-occur with speech. This work addresses this gap by +focusing on co-speech gesture detection, emphasising the synchrony between +speech and co-speech hand gestures. We address three main challenges: the +variability of gesture forms, the temporal misalignment between gesture and +speech onsets, and differences in sampling rate between modalities. We +investigate extended speech time windows and employ separate backbone models +for each modality to address the temporal misalignment and sampling rate +differences. We utilize Transformer encoders in cross-modal and early fusion +techniques to effectively align and integrate speech and skeletal sequences. +The study results show that combining visual and speech information +significantly enhances gesture detection performance. Our findings indicate +that expanding the speech buffer beyond visual time segments improves +performance and that multimodal integration using cross-modal and early fusion +techniques outperforms baseline methods using unimodal and late fusion methods. +Additionally, we find a correlation between the models' gesture prediction +confidence and low-level speech frequency features potentially associated with +gestures. Overall, the study provides a better understanding and detection +methods for co-speech gestures, facilitating the analysis of multimodal +communication. + +
+
+
+
+
+ + ☆ Streamlining the Image Stitching Pipeline: Integrating Fusion and + Rectangling into a Unified Model + + +
+ Learning-based image stitching techniques typically involve three distinct +stages: registration, fusion, and rectangling. These stages are often performed +sequentially, each trained independently, leading to potential cascading error +propagation and complex parameter tuning challenges. In rethinking the +mathematical modeling of the fusion and rectangling stages, we discovered that +these processes can be effectively combined into a single, variety-intensity +inpainting problem. Therefore, we propose the Simple and Robust Stitcher +(SRStitcher), an efficient training-free image stitching method that merges the +fusion and rectangling stages into a unified model. By employing the weighted +mask and large-scale generative model, SRStitcher can solve the fusion and +rectangling problems in a single inference, without additional training or +fine-tuning of other models. Our method not only simplifies the stitching +pipeline but also enhances fault tolerance towards misregistration errors. +Extensive experiments demonstrate that SRStitcher outperforms state-of-the-art +(SOTA) methods in both quantitative assessments and qualitative evaluations. +The code is released at https://github.com/yayoyo66/SRStitcher + +
+
+
+
+
+ + ☆ Multi-Modal Prompt Learning on Blind Image Quality Assessment + + +
+ Image Quality Assessment (IQA) models benefit significantly from semantic
+information, which allows them to treat different types of objects distinctly.
+Currently, leveraging semantic information to enhance IQA is a crucial research
+direction. Traditional methods, hindered by a lack of sufficiently annotated
+data, have employed the CLIP image-text pretraining model as their backbone to
+gain semantic awareness. However, the generalist nature of these pre-trained
+Vision-Language (VL) models often renders them suboptimal for IQA-specific
+tasks. Recent approaches have attempted to address this mismatch using prompt
+technology, but these solutions have shortcomings. Existing prompt-based VL
+models overly focus on incremental semantic information from text, neglecting
+the rich insights available from visual data analysis. This imbalance limits
+their performance improvements in IQA tasks. This paper introduces an
+innovative multi-modal prompt-based methodology for IQA. Our approach employs
+carefully crafted prompts that synergistically mine incremental semantic
+information from both visual and linguistic data. Specifically, in the visual
+branch, we introduce a multi-layer prompt structure to enhance the VL model's
+adaptability. In the text branch, we deploy a dual-prompt scheme that steers
+the model to recognize and differentiate between scene category and distortion
+type, thereby refining the model's capacity to assess image quality. Our
+experimental findings underscore the effectiveness of our method over existing
+Blind Image Quality Assessment (BIQA) approaches. Notably, it demonstrates
+competitive performance across various datasets. Our method achieves Spearman
+Rank Correlation Coefficient (SRCC) values of 0.961 (surpassing 0.946 in CSIQ)
+and 0.941 (exceeding 0.930 in KADID), illustrating its robustness and accuracy
+in diverse contexts.
+
+
+
+
+
+ + ☆ Pyramid Hierarchical Transformer for Hyperspectral Image Classification + + +
+ The traditional Transformer model encounters challenges with variable-length +input sequences, particularly in Hyperspectral Image Classification (HSIC), +leading to efficiency and scalability concerns. To overcome this, we propose a +pyramid-based hierarchical transformer (PyFormer). This innovative approach +organizes input data hierarchically into segments, each representing distinct +abstraction levels, thereby enhancing processing efficiency for lengthy +sequences. At each level, a dedicated transformer module is applied, +effectively capturing both local and global context. Spatial and spectral +information flow within the hierarchy facilitates communication and abstraction +propagation. Integration of outputs from different levels culminates in the +final input representation. Experimental results underscore the superiority of +the proposed method over traditional approaches. Additionally, the +incorporation of disjoint samples augments robustness and reliability, thereby +highlighting the potential of our approach in advancing HSIC. + The source code is available at https://github.com/mahmad00/PyFormer. + +
+
+
+
+
+ + ☆ Importance of Disjoint Sampling in Conventional and Transformer Models + for Hyperspectral Image Classification + + +
+ Disjoint sampling is critical for rigorous and unbiased evaluation of +state-of-the-art (SOTA) models. When training, validation, and test sets +overlap or share data, it introduces a bias that inflates performance metrics +and prevents accurate assessment of a model's true ability to generalize to new +examples. This paper presents an innovative disjoint sampling approach for +training SOTA models on Hyperspectral image classification (HSIC) tasks. By +separating training, validation, and test data without overlap, the proposed +method facilitates a fairer evaluation of how well a model can classify pixels +it was not exposed to during training or validation. Experiments demonstrate +the approach significantly improves a model's generalization compared to +alternatives that include training and validation data in test data. By +eliminating data leakage between sets, disjoint sampling provides reliable +metrics for benchmarking progress in HSIC. Researchers can have confidence that +reported performance truly reflects a model's capabilities for classifying new +scenes, not just memorized pixels. This rigorous methodology is critical for +advancing SOTA models and their real-world application to large-scale land +mapping with Hyperspectral sensors. + The source code is available at +https://github.com/mahmad00/Disjoint-Sampling-for-Hyperspectral-Image-Classification. + +
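+ A minimal NumPy sketch of the basic idea, stratified, strictly non-overlapping
+train/validation/test pixel sets (the split fractions and the treatment of label
+0 as background are assumptions; the paper's full protocol may differ):
+
+ import numpy as np
+
+ def disjoint_split(labels, train_frac=0.6, val_frac=0.2, seed=0):
+     # Split labelled pixel indices into disjoint sets, stratified per class;
+     # label 0 is treated as unlabelled background and ignored.
+     rng = np.random.default_rng(seed)
+     splits = {"train": [], "val": [], "test": []}
+     for cls in np.unique(labels):
+         if cls == 0:
+             continue
+         idx = np.flatnonzero(labels == cls)
+         rng.shuffle(idx)
+         n_tr, n_va = int(len(idx) * train_frac), int(len(idx) * val_frac)
+         splits["train"].append(idx[:n_tr])
+         splits["val"].append(idx[n_tr:n_tr + n_va])
+         splits["test"].append(idx[n_tr + n_va:])
+     return {k: np.concatenate(v) for k, v in splits.items()}
+
+ gt = np.random.randint(0, 5, size=(145, 145))        # toy ground-truth map
+ parts = disjoint_split(gt.ravel())
+ assert not set(parts["train"]) & set(parts["test"])  # no pixel leaks across sets
+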
+
+
+
+
+ + ☆ G3R: Generating Rich and Fine-grained mmWave Radar Data from 2D Videos + for Generalized Gesture Recognition + + +
+ Millimeter wave radar is gaining traction recently as a promising modality
+for enabling pervasive and privacy-preserving gesture recognition. However, the
+lack of rich and fine-grained radar datasets hinders progress in developing
+generalized deep learning models for gesture recognition across various user
+postures (e.g., standing, sitting), positions, and scenes. To remedy this, we
+design a software pipeline that exploits the wealth of available 2D videos to
+generate realistic radar data, which requires addressing the challenge of
+simulating diversified and fine-grained reflection properties of user gestures.
+To this end, we design G3R with three key components: (i) a gesture reflection
+point generator expands the arm's skeleton points to form human reflection
+points; (ii) a signal simulation model simulates the multipath reflection and
+attenuation of radar signals to output the human intensity map; (iii) an
+encoder-decoder model combines a sampling module and a fitting module to
+address the differences in number and distribution of points between generated
+and real-world radar data for generating realistic radar data. We implement and
+evaluate G3R using 2D videos from public data sources and self-collected
+real-world radar data, demonstrating its superiority over other
+state-of-the-art approaches for gesture recognition.
+
+
+ comment: 18 pages, 29 figures +
+
+
+
+
+ + ☆ Mining Supervision for Dynamic Regions in Self-Supervised Monocular + Depth Estimation CVPR2024 + + +
+ This paper focuses on self-supervised monocular depth estimation in dynamic
+scenes trained on monocular videos. Existing methods jointly estimate
+pixel-wise depth and motion, relying mainly on an image reconstruction loss.
+Dynamic regions remain a critical challenge for these methods due to the
+inherent ambiguity in depth and motion estimation, resulting in inaccurate
+depth estimation. This paper proposes a self-supervised training framework
+exploiting pseudo depth labels for dynamic regions from training data. The key
+contribution of our framework is to decouple depth estimation for static and
+dynamic regions of images in the training data. We start with an unsupervised
+depth estimation approach, which provides reliable depth estimates for static
+regions and motion cues for dynamic regions and allows us to extract moving
+object information at the instance level. In the next stage, we use an object
+network to estimate the depth of those moving objects assuming rigid motions.
+Then, we propose a new scale alignment module to address the scale ambiguity
+between estimated depths for static and dynamic regions. We can then use the
+depth labels generated to train an end-to-end depth estimation network and
+improve its performance. Extensive experiments on the Cityscapes and KITTI
+datasets show that our self-training strategy consistently outperforms existing
+self/unsupervised depth estimation methods.
+
+
+ comment: Accepted to CVPR2024 +
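+ Scale ambiguity between two depth maps is commonly resolved by median scaling
+over a shared support; a generic sketch of that step (not the paper's exact
+scale alignment module, and the support mask choice is an assumption):
+
+ import torch
+
+ def median_scale_align(depth_a, depth_b, mask):
+     # Rescale depth_a so its median over `mask` matches depth_b's median there,
+     # e.g. aligning per-object depth with the surrounding static-region depth.
+     scale = depth_b[mask].median() / depth_a[mask].median().clamp(min=1e-6)
+     return depth_a * scale
+
+ static = torch.rand(192, 640) * 80.0   # depth from the main network
+ dynamic = torch.rand(192, 640) * 3.0   # object-network depth, arbitrary scale
+ support = torch.rand(192, 640) > 0.5   # pixels where both estimates are trusted
+ aligned = median_scale_align(dynamic, static, support)
+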
+
+
+
+
+ + ☆ Driver Activity Classification Using Generalizable Representations from + Vision-Language Models + + +
+ Driver activity classification is crucial for ensuring road safety, with +applications ranging from driver assistance systems to autonomous vehicle +control transitions. In this paper, we present a novel approach leveraging +generalizable representations from vision-language models for driver activity +classification. Our method employs a Semantic Representation Late Fusion Neural +Network (SRLF-Net) to process synchronized video frames from multiple +perspectives. Each frame is encoded using a pretrained vision-language encoder, +and the resulting embeddings are fused to generate class probability +predictions. By leveraging contrastively-learned vision-language +representations, our approach achieves robust performance across diverse driver +activities. We evaluate our method on the Naturalistic Driving Action +Recognition Dataset, demonstrating strong accuracy across many classes. Our +results suggest that vision-language representations offer a promising avenue +for driver monitoring systems, providing both accuracy and interpretability +through natural language descriptors. + +
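+ A generic late-fusion sketch of the described pipeline, per-view embeddings
+from a frozen vision-language encoder fused into class probabilities (embedding
+dimension, hidden sizes, and names are assumptions, not the SRLF-Net
+implementation):
+
+ import torch
+ import torch.nn as nn
+
+ class LateFusionHead(nn.Module):
+     # Fuse per-camera embeddings (e.g. CLIP image features) and classify.
+     def __init__(self, embed_dim: int, num_views: int, num_classes: int):
+         super().__init__()
+         self.per_view = nn.Sequential(nn.Linear(embed_dim, 256), nn.ReLU())
+         self.classifier = nn.Linear(256 * num_views, num_classes)
+
+     def forward(self, view_embeddings):
+         # view_embeddings: (B, num_views, embed_dim)
+         b, v, _ = view_embeddings.shape
+         h = self.per_view(view_embeddings)               # (B, V, 256)
+         return self.classifier(h.reshape(b, v * 256)).softmax(dim=-1)
+
+ probs = LateFusionHead(512, num_views=3, num_classes=16)(torch.randn(2, 3, 512))
+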
+
+
+
+
+ + ☆ DENOISER: Rethinking the Robustness for Open-Vocabulary Action + Recognition + + +
+ As one of the fundamental video tasks in computer vision, Open-Vocabulary
+Action Recognition (OVAR) has recently gained increasing attention with the
+development of vision-language pre-training. To enable generalization to
+arbitrary classes, existing methods treat class labels as text descriptions,
+then formulate OVAR as evaluating embedding similarity between visual samples
+and textual classes. However, one crucial issue is completely ignored: the
+class descriptions given by users may be noisy, e.g., misspellings and typos,
+limiting the real-world practicality of vanilla OVAR. To fill this research
+gap, this paper is the first to evaluate existing methods by simulating
+multi-level noises of various types, and reveals their poor robustness. To
+tackle the noisy OVAR task, we further propose a novel DENOISER framework
+covering two parts: generation and discrimination. Concretely, the generative
+part denoises noisy class-text names via a decoding process: it proposes text
+candidates and then utilizes inter-modal and intra-modal information to vote
+for the best one. In the discriminative part, we use vanilla OVAR models to
+assign visual samples to class-text names, thus obtaining more semantics. For
+optimization, we alternately iterate between the generative and discriminative
+parts for progressive refinement. The denoised text classes help OVAR models
+classify visual samples more accurately; in return, the classified visual
+samples help better denoising. On three datasets, we carry out extensive
+experiments to show our superior robustness, and thorough ablations to dissect
+the effectiveness of each component.
+
+
+
+
+
+ + ☆ Domain adaptive pose estimation via multi-level alignment + + +
+ Domain adaptive pose estimation aims to enable deep models trained on
+source-domain (synthesized) datasets to produce similar results on
+target-domain (real-world) datasets. Existing methods have made significant
+progress by conducting image-level or feature-level alignment. However,
+aligning at only a single level is not sufficient to fully bridge the domain
+gap and achieve excellent domain adaptive results. In this paper, we propose a
+multi-level domain adaptation approach, which aligns different domains at the
+image, feature, and pose levels. Specifically, we first utilize image style
+transfer to ensure that images from the source and target domains have a
+similar distribution. Subsequently, at the feature level, we employ adversarial
+training to make the features from the source and target domains preserve
+domain-invariant characteristics as much as possible. Finally, at the pose
+level, a self-supervised approach is utilized to enable the model to learn
+diverse knowledge, implicitly addressing the domain gap. Experimental results
+demonstrate that significant improvement can be achieved by the proposed
+multi-level alignment method in pose estimation, which outperforms the previous
+state-of-the-art in human pose estimation by up to 2.4% and in animal pose
+estimation by up to 3.1% for dogs and 1.4% for sheep.
+
+
+
+
+
+ + ☆ A sensitivity analysis to quantify the impact of neuroimaging + preprocessing strategies on subsequent statistical analyses + + +
+ Even though novel imaging techniques have been successful in studying brain +structure and function, the measured biological signals are often contaminated +by multiple sources of noise, arising due to e.g. head movements of the +individual being scanned, limited spatial/temporal resolution, or other issues +specific to each imaging technology. Data preprocessing (e.g. denoising) is +therefore critical. Preprocessing pipelines have become increasingly complex +over the years, but also more flexible, and this flexibility can have a +significant impact on the final results and conclusions of a given study. This +large parameter space is often referred to as multiverse analyses. Here, we +provide conceptual and practical tools for statistical analyses that can +aggregate multiple pipeline results along with a new sensitivity analysis +testing for hypotheses across pipelines such as "no effect across all +pipelines" or "at least one pipeline with no effect". The proposed framework is +generic and can be applied to any multiverse scenario, but we illustrate its +use based on positron emission tomography data. + +
+
+
+
+
+ + ☆ Ultrasound Nodule Segmentation Using Asymmetric Learning with Simple + Clinical Annotation + + +
+ Recent advances in deep learning have greatly facilitated the automated
+segmentation of ultrasound images, which is essential for nodule morphological
+analysis. Nevertheless, most existing methods depend on extensive and precise
+annotations by domain experts, which are labor-intensive and time-consuming. In
+this study, we suggest using simple aspect ratio annotations directly from
+ultrasound clinical diagnoses for automated nodule segmentation. Specifically,
+an asymmetric learning framework is developed by extending the aspect ratio
+annotations with two types of pseudo labels, i.e., conservative labels and
+radical labels, to train two asymmetric segmentation networks simultaneously.
+Subsequently, a conservative-radical-balance strategy (CRBS) is proposed to
+complementarily combine the radical and conservative labels. An
+inconsistency-aware dynamically mixed pseudo-labels supervision (IDMPS) module
+is introduced to address the challenges of over-segmentation and
+under-segmentation caused by the two types of labels. To further leverage the
+spatial prior knowledge provided by clinical annotations, we also present a
+novel loss function, namely the clinical anatomy prior loss. Extensive
+experiments on two clinically collected ultrasound datasets (thyroid and
+breast) demonstrate the superior performance of our proposed method, which can
+achieve comparable and even better performance than fully supervised methods
+using ground truth annotations.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ☆ Ultrasound SAM Adapter: Adapting SAM for Breast Lesion Segmentation in + Ultrasound Images + + +
+ The Segment Anything Model (SAM) has recently achieved amazing results in the
+field of natural image segmentation. However, it is not effective for medical
+image segmentation, owing to the large domain gap between natural and medical
+images. In this paper, we mainly focus on ultrasound image segmentation, where
+it is very difficult to train a foundation model due to the lack of large-scale
+annotated ultrasound image data. To address these issues, we develop a novel
+Breast Ultrasound SAM Adapter, termed Breast Ultrasound Segment Anything Model
+(BUSSAM), which migrates SAM to the field of breast ultrasound image
+segmentation by using the adapter technique. To be specific, we first design a
+novel CNN image encoder, which is fully trained on the BUS dataset. Our CNN
+image encoder is more lightweight and focuses more on features of the local
+receptive field, which provides complementary information to the ViT branch in
+SAM. Then, we design a novel Cross-Branch Adapter to allow the CNN image
+encoder to fully interact with the ViT image encoder in the SAM module.
+Finally, we add both the Position Adapter and the Feature Adapter to the ViT
+branch to fine-tune the original SAM. The experimental results on the AMUBUS
+and BUSI datasets demonstrate that our proposed model significantly outperforms
+other medical image segmentation models. Our code will be available at:
+https://github.com/bscs12/BUSSAM.
+
+
+
+
+
+ + ☆ Semi-supervised 2D Human Pose Estimation via Adaptive Keypoint Masking + + +
+ Human pose estimation is a fundamental and challenging task in computer
+vision. Larger-scale and more accurate keypoint annotations, while helpful for
+improving the accuracy of supervised pose estimation, are often expensive and
+difficult to obtain. Semi-supervised pose estimation tries to leverage a large
+amount of unlabeled data to improve model performance, which can alleviate the
+problem of insufficient labeled samples. The latest semi-supervised learning
+methods usually adopt a teacher-student framework with strong and weak data
+augmentation to deal with the challenge of "human postural diversity and its
+long-tailed distribution". An appropriate data augmentation method is one of
+the key factors affecting the accuracy and generalization of semi-supervised
+models. To address the fact that fixed keypoint masking ignores differences in
+how individual samples are learned, this paper proposes an adaptive keypoint
+masking method, which can fully mine the information in the samples and obtain
+better estimation performance. To further improve the generalization and
+robustness of the model, this paper also proposes a dual-branch data
+augmentation scheme, which can perform Mixup on samples and features on the
+basis of adaptive keypoint masking. The effectiveness of the proposed method is
+verified on COCO and MPII, outperforming the state-of-the-art semi-supervised
+pose estimation approaches by 5.2% and 0.3%, respectively.
+
+
+ comment: China Multimedia 2023 +
+
+
+
+
+ + ☆ CoProNN: Concept-based Prototypical Nearest Neighbors for Explaining + Vision Models + + +
+ Mounting evidence in explainability for artificial intelligence (XAI) +research suggests that good explanations should be tailored to individual tasks +and should relate to concepts relevant to the task. However, building task +specific explanations is time consuming and requires domain expertise which can +be difficult to integrate into generic XAI methods. A promising approach +towards designing useful task specific explanations with domain experts is +based on compositionality of semantic concepts. Here, we present a novel +approach that enables domain experts to quickly create concept-based +explanations for computer vision tasks intuitively via natural language. +Leveraging recent progress in deep generative methods we propose to generate +visual concept-based prototypes via text-to-image methods. These prototypes are +then used to explain predictions of computer vision models via a simple +k-Nearest-Neighbors routine. The modular design of CoProNN is simple to +implement, it is straightforward to adapt to novel tasks and allows for +replacing the classification and text-to-image models as more powerful models +are released. The approach can be evaluated offline against the ground-truth of +predefined prototypes that can be easily communicated also to domain experts as +they are based on visual concepts. We show that our strategy competes very well +with other concept-based XAI approaches on coarse grained image classification +tasks and may even outperform those methods on more demanding fine grained +tasks. We demonstrate the effectiveness of our method for human-machine +collaboration settings in qualitative and quantitative user studies. All code +and experimental data can be found in our GitHub +$\href{https://github.com/TeodorChiaburu/beexplainable}{repository}$. + +
+
+ comment: 24 pages, 9 figures, 2 tables, accepted at WCXAI 2024 Valletta +
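+ A minimal sketch of the k-Nearest-Neighbors explanation step over generated
+concept prototypes (the random embeddings below stand in for real image
+features, e.g. from CLIP; function and variable names are illustrative):
+
+ import numpy as np
+
+ def knn_concept_explanation(query_emb, proto_embs, proto_concepts, k=5):
+     # Return the k concept prototypes closest to the query by cosine similarity.
+     q = query_emb / np.linalg.norm(query_emb)
+     p = proto_embs / np.linalg.norm(proto_embs, axis=1, keepdims=True)
+     sims = p @ q
+     top = np.argsort(-sims)[:k]
+     return [(proto_concepts[i], float(sims[i])) for i in top]
+
+ protos = np.random.rand(100, 512)                       # text-to-image prototype embeddings
+ names = [f"concept_{i % 10}" for i in range(100)]       # concept name per prototype
+ explanation = knn_concept_explanation(np.random.rand(512), protos, names)
+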
+
+
+
+
+ + ☆ Revisiting Neural Networks for Continual Learning: An Architectural + Perspective + + +
+ Efforts to overcome catastrophic forgetting have primarily centered around +developing more effective Continual Learning (CL) methods. In contrast, less +attention was devoted to analyzing the role of network architecture design +(e.g., network depth, width, and components) in contributing to CL. This paper +seeks to bridge this gap between network architecture design and CL, and to +present a holistic study on the impact of network architectures on CL. This +work considers architecture design at the network scaling level, i.e., width +and depth, and also at the network components, i.e., skip connections, global +pooling layers, and down-sampling. In both cases, we first derive insights +through systematically exploring how architectural designs affect CL. Then, +grounded in these insights, we craft a specialized search space for CL and +further propose a simple yet effective ArchCraft method to steer a CL-friendly +architecture, namely, this method recrafts AlexNet/ResNet into AlexAC/ResAC. +Experimental validation across various CL settings and scenarios demonstrates +that improved architectures are parameter-efficient, achieving state-of-the-art +performance of CL while being 86%, 61%, and 97% more compact in terms of +parameters than the naive CL architecture in Class IL and Task IL. Code is +available at https://github.com/byyx666/ArchCraft. + +
+
+
+
+
+ + ☆ CNN2GNN: How to Bridge CNN with GNN + + +
+ Although the convolutional neural network (CNN) has achieved excellent
+performance in vision tasks by extracting intra-sample representations, it
+incurs a high training cost because it stacks numerous convolutional layers.
+Recently, graph neural networks (GNN), as bilinear models, have succeeded in
+exploring the underlying topological relationships among graph data with only a
+few graph neural layers. Unfortunately, GNNs cannot be directly utilized on
+non-graph data due to the lack of graph structure and suffer high inference
+latency in large-scale scenarios. Inspired by these complementary strengths and
+weaknesses, \textit{we discuss a natural question: how can these two
+heterogeneous networks be bridged?} In this paper, we propose a novel CNN2GNN
+framework to unify CNN and GNN via distillation. First, to break the
+limitations of GNN, a differentiable sparse graph learning module is designed
+as the head of the network to dynamically learn the graph for inductive
+learning. Then, response-based distillation is introduced to transfer knowledge
+from the CNN to the GNN and bridge the two heterogeneous networks. Notably,
+because it simultaneously extracts the intra-sample representation of each
+instance and the topological relationships across the dataset, the distilled
+``boosted'' two-layer GNN achieves much higher performance on Mini-ImageNet
+than CNNs containing dozens of layers, such as ResNet152.
+
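+ Response-based distillation corresponds to the standard knowledge-distillation
+objective on output logits; a sketch of that generic step (the temperature and
+weighting are assumptions, not values from the paper):
+
+ import torch
+ import torch.nn.functional as F
+
+ def response_distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+     # Soft KL divergence to the CNN teacher plus hard cross-entropy to labels.
+     soft = F.kl_div(
+         F.log_softmax(student_logits / T, dim=-1),
+         F.softmax(teacher_logits / T, dim=-1),
+         reduction="batchmean",
+     ) * (T * T)
+     hard = F.cross_entropy(student_logits, labels)
+     return alpha * soft + (1 - alpha) * hard
+
+ loss = response_distillation_loss(torch.randn(8, 100), torch.randn(8, 100),
+                                   torch.randint(0, 100, (8,)))
+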
+
+
+
+
+ + ☆ Visual-Augmented Dynamic Semantic Prototype for Generative Zero-Shot + Learning + + +
+ Generative Zero-shot learning (ZSL) learns a generator to synthesize visual
+samples for unseen classes, which is an effective way to advance ZSL. However,
+existing generative methods rely on the conditions of Gaussian noise and a
+predefined semantic prototype, which limit the generator to being optimized
+only on specific seen classes rather than characterizing each visual instance,
+resulting in poor generalization (\textit{e.g.}, overfitting to seen classes).
+To address this issue, we propose a novel Visual-Augmented Dynamic Semantic
+prototype method (termed VADS) that boosts the generator to learn accurate
+semantic-visual mappings by fully incorporating visual-augmented knowledge into
+the semantic conditions. In detail, VADS consists of two modules: (1) the
+Visual-aware Domain Knowledge Learning module (VDKL) learns the local bias and
+global prior of the visual features (referred to as domain visual knowledge),
+which replace pure Gaussian noise to provide richer prior noise information;
+(2) the Vision-Oriented Semantic Updation module (VOSU) updates the semantic
+prototype according to the visual representations of the samples. Ultimately,
+we concatenate their outputs as a dynamic semantic prototype, which serves as
+the condition of the generator. Extensive experiments demonstrate that our VADS
+achieves superior CZSL and GZSL performance on three prominent datasets and
+outperforms other state-of-the-art methods with average gains of 6.4\%, 5.9\%
+and 4.2\% on SUN, CUB and AWA2, respectively.
+
+
+
+
+
+ + ☆ Reference-Free Multi-Modality Volume Registration of X-Ray Microscopy + and Light-Sheet Fluorescence Microscopy + + +
+ Recently, X-ray microscopy (XRM) and light-sheet fluorescence microscopy +(LSFM) have emerged as two pivotal imaging tools in preclinical research on +bone remodeling diseases, offering micrometer-level resolution. Integrating +these complementary modalities provides a holistic view of bone +microstructures, facilitating function-oriented volume analysis across +different disease cycles. However, registering such independently acquired +large-scale volumes is extremely challenging under real and reference-free +scenarios. This paper presents a fast two-stage pipeline for volume +registration of XRM and LSFM. The first stage extracts the surface features and +employs two successive point cloud-based methods for coarse alignment. The +second stage fine-tunes the initial alignment using a modified +cross-correlation method, ensuring precise volumetric registration. Moreover, +we propose residual similarity as a novel metric to assess the alignment of two +complementary modalities. The results imply robust gradual improvement across +the stages. In the end, all correlating microstructures, particularly lacunae +in XRM and bone cells in LSFM, are precisely matched, enabling new insights +into bone diseases like osteoporosis which are a substantial burden in aging +societies. + +
+
+
+
+
+ + ☆ DesignProbe: A Graphic Design Benchmark for Multimodal Large Language + Models + + +
+ A well-executed graphic design typically achieves harmony at two levels, from
+the fine-grained design elements (color, font and layout) to the overall
+design. This complexity makes the comprehension of graphic design challenging,
+for it requires the capability to both recognize the design elements and
+understand the design. With the rapid development of Multimodal Large Language
+Models (MLLMs), we establish DesignProbe, a benchmark to investigate the
+capability of MLLMs in design. Our benchmark includes eight tasks in total,
+across both the fine-grained element level and the overall design level. At the
+design element level, we consider both attribute recognition and semantic
+understanding tasks. At the overall design level, we include style and
+metaphor. Nine MLLMs are tested, and we apply GPT-4 as the evaluator. Further
+experiments indicate that refining prompts can enhance the performance of
+MLLMs: when we rewrite the prompts with different LLMs, performance increases
+most for models whose prompts are self-refined by their own LLMs. We then add
+extra task knowledge in two different ways (text descriptions and image
+examples), finding that adding images boosts performance much more than adding
+text.
+
+
+ comment: work in progress +
+
+
+
+
+ + ☆ ContextualFusion: Context-Based Multi-Sensor Fusion for 3D Object + Detection in Adverse Operating Conditions + + +
+ The fusion of multimodal sensor data streams such as camera images and lidar
+point clouds plays an important role in the operation of autonomous vehicles
+(AVs). Robust perception across a range of adverse weather and lighting
+conditions is specifically required for AVs to be deployed widely. While
+multi-sensor fusion networks have been previously developed for perception in
+sunny and clear weather conditions, these methods show a significant
+degradation in performance under night-time and poor weather conditions. In
+this paper, we propose a simple yet effective technique called ContextualFusion
+to incorporate the domain knowledge about cameras and lidars behaving
+differently across lighting and weather variations into 3D object detection
+models. Specifically, we design a Gated Convolutional Fusion (GatedConv)
+approach for the fusion of sensor streams based on the operational context. To
+aid in our evaluation, we use the open-source simulator CARLA to create a
+multimodal adverse-condition dataset called AdverseOp3D to address the
+shortcomings of existing datasets being biased towards daytime and good-weather
+conditions. Our ContextualFusion approach yields an mAP improvement of 6.2%
+over state-of-the-art methods on our context-balanced synthetic dataset.
+Finally, our method enhances state-of-the-art 3D object detection performance
+at night on the real-world NuScenes dataset with a significant mAP improvement
+of 11.7%.
+
+
+ comment: 8 pages, 8 figures +
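+ An illustrative stand-in for gated convolutional fusion, a learned per-location
+gate deciding how much to trust each sensor stream (channel sizes and structure
+are assumptions, not the released GatedConv implementation):
+
+ import torch
+ import torch.nn as nn
+
+ class GatedFusion(nn.Module):
+     # Fuse camera and lidar BEV features with a context-dependent gate.
+     def __init__(self, cam_ch: int, lidar_ch: int, out_ch: int):
+         super().__init__()
+         self.gate = nn.Sequential(
+             nn.Conv2d(cam_ch + lidar_ch, out_ch, kernel_size=3, padding=1),
+             nn.Sigmoid(),
+         )
+         self.cam_proj = nn.Conv2d(cam_ch, out_ch, kernel_size=1)
+         self.lidar_proj = nn.Conv2d(lidar_ch, out_ch, kernel_size=1)
+
+     def forward(self, cam_feat, lidar_feat):
+         # the gate decides, per location, how much to trust each sensor
+         g = self.gate(torch.cat([cam_feat, lidar_feat], dim=1))
+         return g * self.cam_proj(cam_feat) + (1 - g) * self.lidar_proj(lidar_feat)
+
+ fused = GatedFusion(64, 64, 128)(torch.randn(1, 64, 180, 180), torch.randn(1, 64, 180, 180))
+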
+
+
+
+
+ + ☆ Enhancing Prompt Following with Visual Control Through Training-Free + Mask-Guided Diffusion + + +
+ Recently, integrating visual controls into text-to-image~(T2I) models, such
+as the ControlNet method, has received significant attention for finer control
+capabilities. While various training-free methods make efforts to enhance
+prompt following in T2I models, the issue with visual control is still rarely
+studied, especially in the scenario where visual controls are misaligned with
+text prompts. In this paper, we address the challenge of ``Prompt Following
+With Visual Control" and propose a training-free approach named Mask-guided
+Prompt Following (MGPF). Object masks are introduced to distinguish the aligned
+and misaligned parts of visual controls and prompts. Meanwhile, a network,
+dubbed Masked ControlNet, is designed to utilize these object masks for object
+generation in the misaligned visual control region. Further, to improve
+attribute matching, a simple yet efficient loss is designed to align the
+attention maps of attributes with object regions constrained by ControlNet and
+object masks. The efficacy and superiority of MGPF are validated through
+comprehensive quantitative and qualitative experiments.
+
+
+
+
+
+ + ☆ Unified Unsupervised Salient Object Detection via Knowledge Transfer + + +
+ Recently, unsupervised salient object detection (USOD) has gained increasing +attention due to its annotation-free nature. However, current methods mainly +focus on specific tasks such as RGB and RGB-D, neglecting the potential for +task migration. In this paper, we propose a unified USOD framework for generic +USOD tasks. Firstly, we propose a Progressive Curriculum Learning-based +Saliency Distilling (PCL-SD) mechanism to extract saliency cues from a +pre-trained deep network. This mechanism starts with easy samples and +progressively moves towards harder ones, to avoid initial interference caused +by hard samples. Afterwards, the obtained saliency cues are utilized to train a +saliency detector, and we employ a Self-rectify Pseudo-label Refinement (SPR) +mechanism to improve the quality of pseudo-labels. Finally, an adapter-tuning +method is devised to transfer the acquired saliency knowledge, leveraging +shared knowledge to attain superior transferring performance on the target +tasks. Extensive experiments on five representative SOD tasks confirm the +effectiveness and feasibility of our proposed method. Code and supplement +materials are available at https://github.com/I2-Multimedia-Lab/A2S-v3. + +
+
+
+
+
+ + ☆ SkinGEN: an Explainable Dermatology Diagnosis-to-Generation Framework + with Interactive Vision-Language Models + + +
+ With the continuous advancement of vision language models (VLMs) technology, +remarkable research achievements have emerged in the dermatology field, the +fourth most prevalent human disease category. However, despite these +advancements, VLM still faces "hallucination" in dermatological diagnosis, and +due to the inherent complexity of dermatological conditions, existing tools +offer relatively limited support for user comprehension. We propose SkinGEN, a +diagnosis-to-generation framework that leverages the stable diffusion (SD) +method to generate reference demonstrations from diagnosis results provided by +VLM, thereby enhancing the visual explainability for users. Through extensive +experiments with Low-Rank Adaptation (LoRA), we identify optimal strategies for +skin condition image generation. We conduct a user study with 32 participants +evaluating both the system performance and explainability. Results demonstrate +that SkinGEN significantly improves users' comprehension of VLM predictions and +fosters increased trust in the diagnostic process. This work paves the way for +more transparent and user-centric VLM applications in dermatology and beyond. + +
+
+
+
+
+ + ☆ Grounded Knowledge-Enhanced Medical VLP for Chest X-Ray + + +
+ Medical vision-language pre-training has emerged as a promising approach for
+learning domain-general representations of medical image and text. Current
+algorithms that exploit the global and local alignment between medical image
+and text could however be marred by the redundant information in medical data.
+To address this issue, we propose a grounded knowledge-enhanced medical
+vision-language pre-training (GK-MVLP) framework for chest X-ray. In this
+framework, medical knowledge is grounded to the appropriate anatomical regions
+by using a transformer-based grounded knowledge-enhanced module for
+fine-grained alignment between anatomical region-level visual features and the
+textual features of medical knowledge. The performance of GK-MVLP is
+competitive with or exceeds the state of the art on downstream chest X-ray
+disease classification, disease localization, report generation, and medical
+visual question-answering tasks. Our results show the advantage of
+incorporating a grounding mechanism to remove biases and improve the alignment
+between chest X-ray images and radiology reports.
+
+
+
+
+
+ + ☆ Differentiable Score-Based Likelihoods: Learning CT Motion Compensation + From Clean Images + + +
+ Motion artifacts can compromise the diagnostic value of computed tomography +(CT) images. Motion correction approaches require a per-scan estimation of +patient-specific motion patterns. In this work, we train a score-based model to +act as a probability density estimator for clean head CT images. Given the +trained model, we quantify the deviation of a given motion-affected CT image +from the ideal distribution through likelihood computation. We demonstrate that +the likelihood can be utilized as a surrogate metric for motion artifact +severity in the CT image facilitating the application of an iterative, +gradient-based motion compensation algorithm. By optimizing the underlying +motion parameters to maximize likelihood, our method effectively reduces motion +artifacts, bringing the image closer to the distribution of motion-free scans. +Our approach achieves comparable performance to state-of-the-art methods while +eliminating the need for a representative data set of motion-affected samples. +This is particularly advantageous in real-world applications, where patient +motion patterns may exhibit unforeseen variability, ensuring robustness without +implicit assumptions about recoverable motion types. + +
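+ A sketch of the iterative, gradient-based compensation loop described above,
+assuming a differentiable reconstruction operator and a differentiable
+log-likelihood from the trained density model (both passed in as callables; the
+toy stand-ins at the end exist only to make the snippet runnable):
+
+ import torch
+
+ def compensate_motion(recon_fn, log_likelihood_fn, init_params, steps=100, lr=1e-2):
+     # Adjust per-scan motion parameters so the reconstructed CT image becomes
+     # more likely under a density model of clean scans.
+     params = init_params.clone().requires_grad_(True)
+     opt = torch.optim.Adam([params], lr=lr)
+     for _ in range(steps):
+         opt.zero_grad()
+         image = recon_fn(params)
+         loss = -log_likelihood_fn(image)   # maximise likelihood = minimise negative LL
+         loss.backward()
+         opt.step()
+     return params.detach()
+
+ # toy stand-ins for the real reconstruction and likelihood model
+ dummy_recon = lambda p: p.sum() + torch.zeros(1, 1, 64, 64)
+ dummy_ll = lambda img: -(img ** 2).mean()
+ best_params = compensate_motion(dummy_recon, dummy_ll, torch.zeros(6))
+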
+
+
+
+
+ + ☆ TAAT: Think and Act from Arbitrary Texts in Text2Motion + + +
+ Text2Motion aims to generate human motions from texts. Existing datasets rely
+on the assumption that texts include action labels (such as "walk, bend, and
+pick up"), which is not flexible for practical scenarios. This paper redefines
+this problem with a more realistic assumption that the texts are arbitrary.
+Specifically, arbitrary texts include existing action texts composed of action
+labels (e.g., A person walks and bends to pick up something) and scene texts
+without explicit action labels (e.g., A person notices his wallet on the ground
+ahead).
+ To bridge the gaps between this realistic setting and existing datasets, we
+expand the action texts of the HumanML3D dataset with more scene texts, thereby
+creating a new HumanML3D++ dataset that includes arbitrary texts. On this
+challenging dataset, we benchmark existing state-of-the-art methods and propose
+a novel two-stage framework that first extracts action labels from arbitrary
+texts with a Large Language Model (LLM) and then generates motions from the
+action labels. Extensive experiments are conducted under different application
+scenarios to validate the effectiveness of the proposed framework on existing
+and proposed datasets. The results indicate that Text2Motion in this realistic
+setting is very challenging, fostering new research in this practical
+direction. Our dataset and code will be released.
+
+
+
+
+
+ + ☆ BMapOpt: Optimization of Brain Tissue Probability Maps using a + Differentiable MRI Simulator + + +
+ Reconstructing digital brain phantoms in the form of multi-channeled brain +tissue probability maps for individual subjects is essential for capturing +brain anatomical variability, understanding neurological diseases, as well as +for testing image processing methods. We demonstrate the first framework that +optimizes brain tissue probability maps (Gray Matter - GM, White Matter - WM, +and Cerebrospinal fluid - CSF) with the help of a Physics-based differentiable +MRI simulator that models the magnetization signal at each voxel in the image. +Given an observed $T_1$/$T_2$-weighted MRI scan, the corresponding clinical MRI +sequence, and the MRI differentiable simulator, we optimize the simulator's +input probability maps by back-propagating the L2 loss between the simulator's +output and the $T_1$/$T_2$-weighted scan. This approach has the significant +advantage of not relying on any training data, and instead uses the strong +inductive bias of the MRI simulator. We tested the model on 20 scans from the +BrainWeb database and demonstrate a highly accurate reconstruction of GM, WM, +and CSF. + +
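+ A minimal sketch of the optimization loop, back-propagating an L2 loss through
+a differentiable simulator to the tissue probability maps (the softmax
+parameterisation and the toy linear simulator below are assumptions, not the
+paper's physics model):
+
+ import torch
+
+ def optimize_tissue_maps(simulator, observed, init_maps, steps=200, lr=0.05):
+     # simulator : differentiable function, (3, D, H, W) probability maps -> simulated scan
+     # observed  : acquired T1/T2-weighted volume to match
+     # init_maps : initial GM/WM/CSF probability maps, e.g. a uniform 1/3 prior
+     logits = torch.log(init_maps.clamp(min=1e-6)).clone().requires_grad_(True)
+     opt = torch.optim.Adam([logits], lr=lr)
+     for _ in range(steps):
+         opt.zero_grad()
+         maps = torch.softmax(logits, dim=0)     # keep the three maps summing to one
+         loss = ((simulator(maps) - observed) ** 2).mean()
+         loss.backward()
+         opt.step()
+     return torch.softmax(logits, dim=0).detach()
+
+ # toy usage with a linear stand-in for the physics-based simulator
+ weights = torch.tensor([1.0, 0.7, 0.2]).view(3, 1, 1, 1)
+ toy_sim = lambda m: (m * weights).sum(dim=0)
+ target = toy_sim(torch.softmax(torch.randn(3, 8, 8, 8), dim=0))
+ fitted_maps = optimize_tissue_maps(toy_sim, target, torch.full((3, 8, 8, 8), 1 / 3))
+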
+
+
+
+
+ + ☆ Bayesian Example Selection Improves In-Context Learning for Speech, + Text, and Visual Modalities + + +
+ Large language models (LLMs) can adapt to new tasks through in-context +learning (ICL) based on a few examples presented in dialogue history without +any model parameter update. Despite such convenience, the performance of ICL +heavily depends on the quality of the in-context examples presented, which +makes the in-context example selection approach a critical choice. This paper +proposes a novel Bayesian in-Context example Selection method (ByCS) for ICL. +Extending the inference probability conditioned on in-context examples based on +Bayes' theorem, ByCS focuses on the inverse inference conditioned on test +input. Following the assumption that accurate inverse inference probability +(likelihood) will result in accurate inference probability (posterior), +in-context examples are selected based on their inverse inference results. +Diverse and extensive cross-tasking and cross-modality experiments are +performed with speech, text, and image examples. Experimental results show the +efficacy and robustness of our ByCS method on various models, tasks and +modalities. + +
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ☆ FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection + and Correction + + +
+ Recent progress in large-scale pre-training has led to the development of +advanced vision-language models (VLMs) with remarkable proficiency in +comprehending and generating multimodal content. Despite the impressive ability +to perform complex reasoning for VLMs, current models often struggle to +effectively and precisely capture the compositional information on both the +image and text sides. To address this, we propose FineMatch, a new aspect-based +fine-grained text and image matching benchmark, focusing on text and image +mismatch detection and correction. This benchmark introduces a novel task for +boosting and evaluating the VLMs' compositionality for aspect-based +fine-grained text and image matching. In this task, models are required to +identify mismatched aspect phrases within a caption, determine the aspect's +class, and propose corrections for an image-text pair that may contain between +0 and 3 mismatches. To evaluate the models' performance on this new task, we +propose a new evaluation metric named ITM-IoU for which our experiments show a +high correlation to human evaluation. In addition, we also provide a +comprehensive experimental analysis of existing mainstream VLMs, including +fully supervised learning and in-context learning settings. We have found that +models trained on FineMatch demonstrate enhanced proficiency in detecting +fine-grained text and image mismatches. Moreover, models (e.g., GPT-4V, Gemini +Pro Vision) with strong abilities to perform multimodal in-context learning are +not as skilled at fine-grained compositional image and text matching analysis. +With FineMatch, we are able to build a system for text-to-image generation +hallucination detection and correction. + +
+
+
+
+
+ + ☆ SC-HVPPNet: Spatial and Channel Hybrid-Attention Video Post-Processing + Network with CNN and Transformer + + +
+ Convolutional Neural Network (CNN) and Transformer have attracted much +attention recently for video post-processing (VPP). However, the interaction +between CNN and Transformer in existing VPP methods is not fully explored, +leading to inefficient communication between the local and global extracted +features. In this paper, we explore the interaction between CNN and Transformer +in the task of VPP, and propose a novel Spatial and Channel Hybrid-Attention +Video Post-Processing Network (SC-HVPPNet), which can cooperatively exploit the +image priors in both spatial and channel domains. Specifically, in the spatial +domain, a novel spatial attention fusion module is designed, in which two +attention weights are generated to fuse the local and global representations +collaboratively. In the channel domain, a novel channel attention fusion module +is developed, which can blend the deep representations at the channel dimension +dynamically. Extensive experiments show that SC-HVPPNet notably boosts video +restoration quality, with average bitrate savings of 5.29%, 12.42%, and 13.09% +for Y, U, and V components in the VTM-11.0-NNVC RA configuration. + +
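+ As a generic illustration of spatial attention fusion between a local (CNN)
+branch and a global (Transformer) branch (layer sizes and details below are
+assumptions, not the SC-HVPPNet specification), a per-pixel convex combination
+of the two representations could look like this:
+
+```python
+import torch
+import torch.nn as nn
+
+class SpatialAttentionFusion(nn.Module):
+    """Fuse local (CNN) and global (Transformer) feature maps with two
+    spatially varying attention weights. Illustrative sketch only."""
+
+    def __init__(self, channels: int):
+        super().__init__()
+        # Predict two spatial attention logits from the concatenated branches.
+        self.attn = nn.Conv2d(2 * channels, 2, kernel_size=3, padding=1)
+
+    def forward(self, local_feat: torch.Tensor, global_feat: torch.Tensor) -> torch.Tensor:
+        logits = self.attn(torch.cat([local_feat, global_feat], dim=1))
+        weights = torch.softmax(logits, dim=1)        # two weights summing to 1 per pixel
+        return weights[:, 0:1] * local_feat + weights[:, 1:2] * global_feat
+```
+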
+
+
+
+
+ + ☆ Think-Program-reCtify: 3D Situated Reasoning with Large Language Models + + +
+ This work addresses the 3D situated reasoning task, which aims to answer
+questions given egocentric observations in a 3D environment. The task remains
+challenging as it requires comprehensive 3D perception and complex reasoning
+skills. End-to-end models trained on supervised data for 3D situated reasoning
+suffer from data scarcity and limited generalization ability. Inspired by the
+recent success of leveraging large language models (LLMs) for visual reasoning,
+we propose LLM-TPC, a novel framework that leverages the planning, tool usage,
+and reflection capabilities of LLMs through a Think-Program-reCtify loop. The
+Think phase first decomposes the compositional question into a sequence of
+steps, and then the Program phase grounds each step to a piece of code and
+calls carefully designed 3D visual perception modules. Finally, the Rectify
+phase adjusts the plan and code if the program fails to execute. Experiments
+and analysis on the SQA3D benchmark demonstrate the effectiveness,
+interpretability and robustness of our method. Our code is publicly available
+at https://qingrongh.github.io/LLM-TPC/.
+
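+ A minimal sketch of such a Think-Program-reCtify control loop is given below;
+the llm callable, the perception_api dictionary, and the prompt wording are
+assumed interfaces, not the released LLM-TPC code:
+
+```python
+def think_program_rectify(question, scene, llm, perception_api, max_rounds=3):
+    """Illustrative Think-Program-reCtify loop around an LLM and 3D perception tools."""
+    plan = llm(f"Decompose this question into reasoning steps: {question}")   # Think
+    feedback = ""
+    for _ in range(max_rounds):
+        program = llm(                                                        # Program
+            "Write Python that sets a variable `answer`, using the provided "
+            f"perception functions.\nPlan: {plan}\nPrevious error: {feedback}"
+        )
+        namespace = {"scene": scene, **perception_api}
+        try:
+            exec(program, namespace)          # ground each step in executable code
+            return namespace.get("answer")
+        except Exception as err:              # Rectify: revise plan/code on failure
+            feedback = f"{type(err).__name__}: {err}"
+            plan = llm(f"Revise the plan given this error: {feedback}\nPlan: {plan}")
+    return None
+```
+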
+
+
+
+
+ + ☆ Unsupervised Domain Adaptation Architecture Search with Self-Training + for Land Cover Mapping CVPR + + +
+ Unsupervised domain adaptation (UDA) is a challenging open problem in land
+cover mapping. Previous studies show encouraging progress in addressing
+cross-domain distribution shifts on remote sensing benchmarks for land cover
+mapping. The existing works are mainly built on large neural network
+architectures, which makes them resource-hungry systems, limiting their
+practical impact for many real-world applications in resource-constrained
+environments. Thus, we propose a simple yet effective framework to search for
+lightweight neural networks automatically for land cover mapping tasks under
+domain shifts. This is achieved by integrating Markov random field neural
+architecture search (MRF-NAS) into a self-training UDA framework to search for
+efficient and effective networks under a limited computation budget. This is
+the first attempt to combine NAS with self-training UDA as a single framework
+for land cover mapping. We also investigate two different pseudo-labelling
+approaches (confidence-based and energy-based) in the self-training scheme.
+Experimental results on two recent datasets (OpenEarthMap & FLAIR #1) for
+remote sensing UDA demonstrate satisfactory performance. With fewer than 2M
+parameters and 30.16 GFLOPs, the best-discovered lightweight network reaches
+state-of-the-art performance on the regional target domain of OpenEarthMap
+(59.38% mIoU) and the considered target domain of FLAIR #1 (51.19% mIoU). The
+code is at https://github.com/cliffbb/UDA-NAS.
+
+
+ comment: Accepted at CVPRW 2024 +
+
+
+
+
+ + ☆ Adaptive Prompt Learning with Negative Textual Semantics and Uncertainty + Modeling for Universal Multi-Source Domain Adaptation ICME2024 + + +
+ Universal Multi-source Domain Adaptation (UniMDA) transfers knowledge from +multiple labeled source domains to an unlabeled target domain under domain +shifts (different data distribution) and class shifts (unknown target classes). +Existing solutions focus on excavating image features to detect unknown +samples, ignoring abundant information contained in textual semantics. In this +paper, we propose an Adaptive Prompt learning with Negative textual semantics +and uncErtainty modeling method based on Contrastive Language-Image +Pre-training (APNE-CLIP) for UniMDA classification tasks. Concretely, we +utilize the CLIP with adaptive prompts to leverage textual information of class +semantics and domain representations, helping the model identify unknown +samples and address domain shifts. Additionally, we design a novel global +instance-level alignment objective by utilizing negative textual semantics to +achieve more precise image-text pair alignment. Furthermore, we propose an +energy-based uncertainty modeling strategy to enlarge the margin distance +between known and unknown samples. Extensive experiments demonstrate the +superiority of our proposed method. + +
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ Double Privacy Guard: Robust Traceable Adversarial Watermarking against + Face Recognition + + +
+ The wide deployment of Face Recognition (FR) systems poses risks of privacy
+leakage. One countermeasure to address this issue is adversarial attacks, which
+deceive malicious FR searches but simultaneously interfere with the normal
+identity verification of trusted authorizers. In this paper, we propose the
+first Double Privacy Guard (DPG) scheme based on traceable adversarial
+watermarking. DPG employs a one-time watermark embedding to deceive
+unauthorized FR models and allows authorizers to perform identity verification
+by extracting the watermark. Specifically, we propose an information-guided
+adversarial attack against FR models. The encoder embeds an identity-specific
+watermark into the deep feature space of the carrier, guiding recognizable
+features of the image to deviate from the source identity. We further adopt a
+collaborative meta-optimization strategy compatible with sub-tasks, which
+regularizes the joint optimization direction of the encoder and decoder. This
+strategy enhances the representation of universal carrier features, mitigating
+multi-objective optimization conflicts in watermarking. Experiments confirm
+that DPG achieves significant attack success rates and traceability accuracy on
+state-of-the-art FR models, exhibiting remarkable robustness that outperforms
+existing privacy protection methods based on adversarial attacks and deep
+watermarking, or simple combinations of the two. Our work potentially opens up
+new insights into proactive protection for FR privacy.
+
+
+
+
+
+ + ☆ Pegasus-v1 Technical Report + + +
+ This technical report introduces Pegasus-1, a multimodal language model
+specialized in video content understanding and interaction through natural
+language. Pegasus-1 is designed to address the unique challenges posed by video
+data, such as interpreting spatiotemporal information, to offer nuanced video
+content comprehension across various lengths. This technical report overviews
+Pegasus-1's architecture, training strategies, and its performance in
+benchmarks on video conversation, zero-shot video question answering, and video
+summarization. We also explore qualitative characteristics of Pegasus-1,
+demonstrating its capabilities as well as its limitations, in order to provide
+readers with a balanced view of its current state and its future direction.
+
+
+
+
+
+ + ☆ 3DBench: A Scalable 3D Benchmark and Instruction-Tuning Dataset + + +
+ Evaluating the performance of Multi-modal Large Language Models (MLLMs), +integrating both point cloud and language, presents significant challenges. The +lack of a comprehensive assessment hampers determining whether these models +truly represent advancements, thereby impeding further progress in the field. +Current evaluations heavily rely on classification and caption tasks, falling +short in providing a thorough assessment of MLLMs. A pressing need exists for a +more sophisticated evaluation method capable of thoroughly analyzing the +spatial understanding and expressive capabilities of these models. To address +these issues, we introduce a scalable 3D benchmark, accompanied by a +large-scale instruction-tuning dataset known as 3DBench, providing an +extensible platform for a comprehensive evaluation of MLLMs. Specifically, we +establish the benchmark that spans a wide range of spatial and semantic scales, +from object-level to scene-level, addressing both perception and planning +tasks. Furthermore, we present a rigorous pipeline for automatically +constructing scalable 3D instruction-tuning datasets, covering 10 diverse +multi-modal tasks with more than 0.23 million QA pairs generated in total. +Thorough experiments evaluating trending MLLMs, comparisons against existing +datasets, and variations of training protocols demonstrate the superiority of +3DBench, offering valuable insights into current limitations and potential +research directions. + +
+
+
+
+
+ + ☆ DreamPBR: Text-driven Generation of High-resolution SVBRDF with + Multi-modal Guidance + + +
+ Prior material creation methods had limitations in producing diverse results +mainly because reconstruction-based methods relied on real-world measurements +and generation-based methods were trained on relatively small material +datasets. To address these challenges, we propose DreamPBR, a novel +diffusion-based generative framework designed to create spatially-varying +appearance properties guided by text and multi-modal controls, providing high +controllability and diversity in material generation. Key to achieving diverse +and high-quality PBR material generation lies in integrating the capabilities +of recent large-scale vision-language models trained on billions of text-image +pairs, along with material priors derived from hundreds of PBR material +samples. We utilize a novel material Latent Diffusion Model (LDM) to establish +the mapping between albedo maps and the corresponding latent space. The latent +representation is then decoded into full SVBRDF parameter maps using a +rendering-aware PBR decoder. Our method supports tileable generation through +convolution with circular padding. Furthermore, we introduce a multi-modal +guidance module, which includes pixel-aligned guidance, style image guidance, +and 3D shape guidance, to enhance the control capabilities of the material LDM. +We demonstrate the effectiveness of DreamPBR in material creation, showcasing +its versatility and user-friendliness on a wide range of controllable +generation and editing applications. + +
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ HOIN: High-Order Implicit Neural Representations + + +
+ Implicit neural representations (INR) suffer from worsening spectral bias, +which results in overly smooth solutions to the inverse problem. To deal with +this problem, we propose a universal framework for processing inverse problems +called \textbf{High-Order Implicit Neural Representations (HOIN)}. By refining +the traditional cascade structure to foster high-order interactions among +features, HOIN enhances the model's expressive power and mitigates spectral +bias through its neural tangent kernel's (NTK) strong diagonal properties, +accelerating and optimizing inverse problem resolution. By analyzing the +model's expression space, high-order derivatives, and the NTK matrix, we +theoretically validate the feasibility of HOIN. HOIN realizes 1 to 3 dB +improvements in most inverse problems, establishing a new state-of-the-art +recovery quality and training efficiency, thus providing a new general paradigm +for INR and paving the way for it to solve the inverse problem. + +
+
+
+
+
+ + ☆ LaneCorrect: Self-supervised Lane Detection + + +
+ Lane detection has evolved into a key capability of highly functional
+autonomous driving systems, enabling them to understand driving scenes even in
+complex environments. In this paper, we work towards developing a generalized
+computer vision system able to detect lanes without using any annotation. We
+make the following contributions: (i) We illustrate how to perform unsupervised
+3D lane segmentation by leveraging the distinctive intensity of lanes on the
+LiDAR point cloud frames, and then obtain the noisy lane labels in the 2D plane
+by projecting the 3D points; (ii) We propose a novel self-supervised training
+scheme, dubbed LaneCorrect, that automatically corrects the lane label by
+learning geometric consistency and instance awareness from the adversarial
+augmentations; (iii) With the self-supervised pre-trained model, we distill to
+train a student network for arbitrary target lane (e.g., TuSimple) detection
+without any human labels; (iv) We thoroughly evaluate our self-supervised
+method on four major lane detection benchmarks (including TuSimple, CULane,
+CurveLanes and LLAMAS) and demonstrate excellent performance compared with
+existing supervised counterparts, whilst showing more effective results in
+alleviating the domain gap, i.e., training on CULane and testing on TuSimple.
+
+
+
+
+
+ + ☆ 3DFlowRenderer: One-shot Face Re-enactment via Dense 3D Facial Flow + Estimation + + +
+ Performing facial expression transfer under the one-shot setting has been
+gaining popularity in the research community, with a focus on precise control
+of expressions. Existing techniques showcase compelling results in perceiving
+expressions, but they lack robustness with extreme head poses. They also
+struggle to accurately reconstruct background details, thus hindering realism.
+In this paper, we propose a novel warping technology which integrates the
+advantages of both 2D and 3D methods to achieve robust face re-enactment. We
+generate dense 3D facial flow fields in feature space to warp an input image
+based on target expressions without depth information. This enables explicit 3D
+geometric control for re-enacting misaligned source and target faces. We
+regularize the motion estimation capability of the 3D flow prediction network
+through the proposed "Cyclic warp loss" by converting warped 3D features back
+into 2D RGB space. To ensure the generation of finer facial regions with a
+natural background, our framework first renders only the facial foreground
+region and learns to inpaint the blank area that needs to be filled due to
+source face translation, thus reconstructing the detailed background without
+any unwanted pixel motion. Extensive evaluation reveals that our method
+outperforms state-of-the-art techniques in rendering artifact-free facial
+images.
+
+
+
+
+
+ + ☆ First Mapping the Canopy Height of Primeval Forests in the Tallest Tree + Area of Asia + + +
+ We have developed the world's first canopy height map of the distribution
+area of world-level giant trees. This mapping is crucial for discovering more
+individual and community world-level giant trees, and for analyzing and
+quantifying the effectiveness of biodiversity conservation measures in the
+Yarlung Tsangpo Grand Canyon (YTGC) National Nature Reserve. We proposed a
+method to map the canopy height of the primeval forest within the world-level
+giant tree distribution area by using deep learning modeling driven by the
+fusion of spaceborne LiDAR (Global Ecosystem Dynamics Investigation (GEDI) and
+ICESat-2) with Sentinel-2 satellite imagery. We customized a pyramid receptive
+field depthwise-separable CNN (PRFXception), an architecture specifically
+tailored for mapping primeval forest canopy height, to infer the canopy height
+at the footprint level of GEDI and ICESat-2 from Sentinel-2 optical imagery
+with a 10-meter spatial resolution. We conducted a field survey of 227
+permanent plots using a stratified sampling method and measured several giant
+trees using UAV-LS. The predicted canopy height was compared with ICESat-2 and
+GEDI validation data (RMSE = 7.56 m, MAE = 6.07 m, ME = -0.98 m, R^2 = 0.58),
+UAV-LS point clouds (RMSE = 5.75 m, MAE = 3.72 m, ME = 0.82 m, R^2 = 0.65), and
+ground survey data (RMSE = 6.75 m, MAE = 5.56 m, ME = 2.14 m, R^2 = 0.60). We
+mapped the potential distribution of world-level giant trees and discovered two
+previously undetected giant tree communities with an 89% probability of having
+trees 80-100 m tall, potentially taller than Asia's tallest tree. This paper
+provides scientific evidence confirming southeastern Tibet--northwestern Yunnan
+as the fourth global distribution center of world-level giant trees, supporting
+initiatives to include the YTGC giant tree distribution area within the scope
+of China's national park conservation.
+
+
+
+
+
+ + ☆ Progressive Token Length Scaling in Transformer Encoders for Efficient + Universal Segmentation + + +
+ A powerful architecture for universal segmentation relies on transformers
+that encode multi-scale image features and decode object queries into mask
+predictions. With efficiency being a high priority for scaling such models, we
+observed that the state-of-the-art method Mask2Former uses ~50% of its compute
+only on the transformer encoder. This is due to the retention of a full-length
+token-level representation of all backbone feature scales at each encoder
+layer. With this observation, we propose a strategy termed PROgressive Token
+Length SCALing for Efficient transformer encoders (PRO-SCALE) that can be
+plugged into Mask2Former-style segmentation architectures to significantly
+reduce the computational cost. The underlying principle of PRO-SCALE is:
+progressively scale the length of the tokens with the layers of the encoder.
+This allows PRO-SCALE to reduce computations by a large margin with minimal
+sacrifice in performance (~52% GFLOPs reduction with no drop in performance on
+the COCO dataset). We validate our framework on multiple public benchmarks.
+
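+ To make the progressive token-length idea concrete, the sketch below grows
+the token set with encoder depth; the schedule and layer counts are
+illustrative assumptions, not the PRO-SCALE design:
+
+```python
+import torch
+import torch.nn as nn
+
+class ProgressiveTokenEncoder(nn.Module):
+    """Encoder whose early layers attend only over coarse-scale tokens,
+    appending finer scales at later layers. Illustrative sketch only."""
+
+    def __init__(self, dim: int, num_layers: int = 6, num_heads: int = 8):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            nn.TransformerEncoderLayer(dim, num_heads, batch_first=True)
+            for _ in range(num_layers)
+        ])
+
+    def forward(self, tokens_per_scale):          # list of (B, N_s, dim), coarsest first
+        active = 1
+        tokens = tokens_per_scale[0]
+        step = max(1, len(self.layers) // len(tokens_per_scale))
+        for i, layer in enumerate(self.layers):
+            if i > 0 and i % step == 0 and active < len(tokens_per_scale):
+                # Append the next finer scale, lengthening the token sequence.
+                tokens = torch.cat([tokens, tokens_per_scale[active]], dim=1)
+                active += 1
+            tokens = layer(tokens)                # early layers are cheap: fewer tokens
+        return tokens
+```
+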
+
+
+
+
+ + ☆ Machine Vision Based Assessment of Fall Color Changes in Apple Trees: + Exploring Relationship with Leaf Nitrogen Concentration + + +
+ Apple trees, being deciduous, shed their leaves each year, which is preceded
+by a change in leaf color from green to yellow (also known as senescence)
+during the fall season. The rate and timing of the color change are affected by
+a number of factors, including nitrogen (N) deficiencies. The green color of
+leaves is highly dependent on the chlorophyll content, which in turn depends on
+the nitrogen concentration in the leaves. The assessment of leaf color can give
+vital information on the nutrient status of the tree. A machine vision based
+system to capture and quantify these timings and changes in leaf color can be a
+great tool for that purpose. This study is based on data collected during the
+fall of 2021 and 2023 at a commercial orchard using a ground-based
+stereo-vision sensor for five weeks. The point cloud obtained from the sensor
+was segmented to get just the tree in the foreground. The study involved the
+segmentation of the trees in a natural background using point cloud data and
+quantification of the color using a custom-defined metric, \textit{yellowness
+index}, varying from $-1$ to $+1$ ($-1$ being completely green and $+1$ being
+completely yellow), which gives the proportion of yellow leaves on a tree. The
+performances of a K-means based algorithm and a gradient boosting algorithm
+were compared for \textit{yellowness index} calculation. The segmentation
+method proposed in the study was able to estimate the \textit{yellowness index}
+on the trees with $R^2 = 0.72$. The results showed that the metric was able to
+capture the gradual color transition from green to yellow over the study
+duration. It was also observed that trees with lower nitrogen showed the color
+transition to yellow earlier than trees with higher nitrogen. The onset of
+color transition during both years aligned with the $29^{th}$ week post-full
+bloom.
+
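+ The exact leaf-classification rule behind the yellowness index is not spelled
+out above, so the snippet below is only an assumed illustration of how a
+[-1, +1] index of this kind could be computed from colored tree points (the
+color thresholds are hypothetical):
+
+```python
+import numpy as np
+
+def yellowness_index(points_rgb: np.ndarray) -> float:
+    """Illustrative yellowness index on segmented tree points (N x 3 RGB array)."""
+    r = points_rgb[:, 0].astype(float)
+    g = points_rgb[:, 1].astype(float)
+    b = points_rgb[:, 2].astype(float)
+    # Crude color rule (assumption): yellow leaves have red at least as strong as
+    # green with low blue; green leaves are dominated by the green channel.
+    yellow = (r >= g) & (g > b)
+    green = (g > r) & (g > b)
+    n_yellow, n_green = int(yellow.sum()), int(green.sum())
+    if n_yellow + n_green == 0:
+        return 0.0
+    # -1 when all classified leaf points are green, +1 when all are yellow.
+    return (n_yellow - n_green) / (n_yellow + n_green)
+```
+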
+
+
+
+
+ + ☆ UPose3D: Uncertainty-Aware 3D Human Pose Estimation with Cross-View and + Temporal Cues + + +
+ We introduce UPose3D, a novel approach for multi-view 3D human pose +estimation, addressing challenges in accuracy and scalability. Our method +advances existing pose estimation frameworks by improving robustness and +flexibility without requiring direct 3D annotations. At the core of our method, +a pose compiler module refines predictions from a 2D keypoints estimator that +operates on a single image by leveraging temporal and cross-view information. +Our novel cross-view fusion strategy is scalable to any number of cameras, +while our synthetic data generation strategy ensures generalization across +diverse actors, scenes, and viewpoints. Finally, UPose3D leverages the +prediction uncertainty of both the 2D keypoint estimator and the pose compiler +module. This provides robustness to outliers and noisy data, resulting in +state-of-the-art performance in out-of-distribution settings. In addition, for +in-distribution settings, UPose3D yields a performance rivaling methods that +rely on 3D annotated data, while being the state-of-the-art among methods +relying only on 2D supervision. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Guided AbsoluteGrad: Magnitude of Gradients Matters to Explanation's + Localization and Saliency + + +
+ This paper proposes a new gradient-based XAI method called Guided
+AbsoluteGrad for saliency map explanations. We utilize both positive and
+negative gradient magnitudes and employ gradient variance to distinguish the
+important areas for noise reduction. We also introduce a novel evaluation
+metric named ReCover And Predict (RCAP), which considers the Localization and
+Visual Noise Level objectives of the explanations. We propose two propositions
+for these two objectives and prove the necessity of evaluating them. We
+evaluate Guided AbsoluteGrad against seven gradient-based XAI methods using the
+RCAP metric and other SOTA metrics in three case studies: (1) the ImageNet
+dataset with a ResNet50 model; (2) the International Skin Imaging Collaboration
+(ISIC) dataset with an EfficientNet model; (3) the Places365 dataset with a
+DenseNet161 model. Our method surpasses other gradient-based approaches,
+showcasing the quality of enhanced saliency map explanations through gradient
+magnitude.
+
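+ A rough sketch of a gradient-magnitude saliency map in this spirit follows;
+the noisy-sample averaging and the variance weighting are assumptions for
+illustration, not the paper's exact formulation:
+
+```python
+import torch
+
+def absolute_grad_saliency(model, image, target_class, n_samples=16, sigma=0.1):
+    """Average absolute gradients over noisy copies of the input and weight
+    them by gradient variance. Illustrative sketch only."""
+    grads = []
+    for _ in range(n_samples):
+        noisy = (image + sigma * torch.randn_like(image)).requires_grad_(True)
+        score = model(noisy.unsqueeze(0))[0, target_class]
+        grad = torch.autograd.grad(score, noisy)[0]
+        grads.append(grad.abs())                 # keep both positive and negative magnitudes
+    grads = torch.stack(grads)                   # (n_samples, C, H, W)
+    variance = grads.var(dim=0)                  # assumed guidance signal
+    saliency = (grads.mean(dim=0) * variance).sum(dim=0)   # collapse channels
+    return saliency / (saliency.max() + 1e-8)
+```
+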
+
+ comment: CAI2024 Camera-ready Submission +
+
+
+
+
+ + ☆ Cross-Temporal Spectrogram Autoencoder (CTSAE): Unsupervised + Dimensionality Reduction for Clustering Gravitational Wave Glitches + + +
+ The advancement of the Laser Interferometer Gravitational-Wave Observatory
+(LIGO) has significantly enhanced the feasibility and reliability of
+gravitational wave detection. However, LIGO's high sensitivity makes it
+susceptible to transient noises known as glitches, which necessitate effective
+differentiation from real gravitational wave signals. Traditional approaches
+predominantly employ fully supervised or semi-supervised algorithms for the
+task of glitch classification and clustering. In the future task of identifying
+and classifying glitches across main and auxiliary channels, it is impractical
+to build a dataset with manually labeled ground-truth. In addition, the
+patterns of glitches can vary with time, generating new glitches without manual
+labels. In response to this challenge, we introduce the Cross-Temporal
+Spectrogram Autoencoder (CTSAE), a pioneering unsupervised method for the
+dimensionality reduction and clustering of gravitational wave glitches. CTSAE
+integrates a novel four-branch autoencoder with a hybrid of Convolutional
+Neural Networks (CNN) and Vision Transformers (ViT). To further extract
+features across multiple branches, we introduce a novel multi-branch fusion
+method using the CLS (Class) token. Our model, trained and evaluated on the
+GravitySpy O3 dataset on the main channel, demonstrates superior performance in
+clustering tasks when compared to state-of-the-art semi-supervised learning
+methods. To the best of our knowledge, CTSAE represents the first unsupervised
+approach tailored specifically for clustering LIGO data, marking a significant
+step forward in the field of gravitational wave research. The code of this
+paper is available at https://github.com/Zod-L/CTSAE.
+
+
+
+
+
+ + ☆ BattleAgent: Multi-modal Dynamic Emulation on Historical Battles to + Complement Historical Analysis + + +
+ This paper presents BattleAgent, an emulation system that combines the Large
+Vision-Language Model and Multi-agent System. This novel system aims to
+simulate complex dynamic interactions among multiple agents, as well as between
+agents and their environments, over a period of time. It emulates both the
+decision-making processes of leaders and the viewpoints of ordinary
+participants, such as soldiers. The emulation showcases the current
+capabilities of agents, featuring fine-grained multi-modal interactions between
+agents and landscapes. It develops customizable agent structures to meet
+specific situational requirements, for example, a variety of battle-related
+activities like scouting and trench digging. These components collaborate to
+recreate historical events in a lively and comprehensive manner while offering
+insights into the thoughts and feelings of individuals from diverse viewpoints.
+The technological foundations of BattleAgent establish detailed and immersive
+settings for historical battles, enabling individual agents to partake in,
+observe, and dynamically respond to evolving battle scenarios. This methodology
+holds the potential to substantially deepen our understanding of historical
+events, particularly through individual accounts. Such initiatives can also aid
+historical research, as conventional historical narratives often lack
+documentation and prioritize the perspectives of decision-makers, thereby
+overlooking the experiences of ordinary individuals. BattleAgent illustrates
+AI's potential to revitalize the human aspect in crucial social events, thereby
+fostering a more nuanced collective understanding and driving the progressive
+development of human society.
+
+
+ comment: 26 pages, 14 figures. The data and code for this project are
+  accessible at https://github.com/agiresearch/battleagent
+
+
+
+
+ + ☆ Understanding Hyperbolic Metric Learning through Hard Negative Sampling + + +
+ In recent years, there has been a growing trend of incorporating hyperbolic +geometry methods into computer vision. While these methods have achieved +state-of-the-art performance on various metric learning tasks using hyperbolic +distance measurements, the underlying theoretical analysis supporting this +superior performance remains under-exploited. In this study, we investigate the +effects of integrating hyperbolic space into metric learning, particularly when +training with contrastive loss. We identify a need for a comprehensive +comparison between Euclidean and hyperbolic spaces regarding the temperature +effect in the contrastive loss within the existing literature. To address this +gap, we conduct an extensive investigation to benchmark the results of Vision +Transformers (ViTs) using a hybrid objective function that combines loss from +Euclidean and hyperbolic spaces. Additionally, we provide a theoretical +analysis of the observed performance improvement. We also reveal that +hyperbolic metric learning is highly related to hard negative sampling, +providing insights for future work. This work will provide valuable data points +and experience in understanding hyperbolic image embeddings. To shed more light +on problem-solving and encourage further investigation into our approach, our +code is available online (https://github.com/YunYunY/HypMix). + +
+
+ comment: published in Proceedings of the IEEE/CVF Winter Conference on + Applications of Computer Vision. 2024. arXiv admin note: text overlap with + arXiv:2203.10833 by other authors +
+
+
+
+
+ + ☆ Visual Delta Generator with Large Multi-modal Models for Semi-supervised + Composed Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a task that retrieves images similar to a
+query, based on a provided textual modification. Current techniques rely on
+supervised learning for CIR models using labeled triplets of (reference image,
+text, target image). These specific triplets are not as commonly available as
+simple image-text pairs, limiting the widespread use of CIR and its
+scalability. On the other hand, zero-shot CIR can be relatively easily trained
+with image-caption pairs without considering the image-to-image relation, but
+this approach tends to yield lower accuracy. We propose a new semi-supervised
+CIR approach where we search for a reference and its related target images in
+auxiliary data and learn our large language model-based Visual Delta Generator
+(VDG) to generate text describing the visual difference (i.e., visual delta)
+between the two. VDG, equipped with fluent language knowledge and being model
+agnostic, can generate pseudo triplets to boost the performance of CIR models.
+Our approach significantly improves on existing supervised learning approaches
+and achieves state-of-the-art results on the CIR benchmarks.
+
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ CFPFormer: Feature-pyramid like Transformer Decoder for Segmentation and + Detection + + +
+ Feature pyramids have been widely adopted in convolutional neural networks
+(CNNs) and transformers for tasks like medical image segmentation and object
+detection. However, existing models generally focus on the encoder-side
+Transformer to extract features, while a well-designed decoder can bring
+further potential. We propose CFPFormer, a novel decoder block that integrates
+feature pyramids and transformers. Specifically, by leveraging patch embedding,
+cross-layer feature concatenation, and Gaussian attention mechanisms, CFPFormer
+enhances feature extraction capabilities while promoting generalization across
+diverse tasks. Benefiting from the Transformer structure and U-shaped
+connections, our model gains the ability to capture long-range dependencies and
+effectively up-sample feature maps. Our model achieves superior performance in
+detecting small objects compared to existing methods. We evaluate CFPFormer on
+medical image segmentation datasets and object detection benchmarks (VOC 2007,
+VOC 2012, MS-COCO), demonstrating its effectiveness and versatility. On the
+ACDC post-2017-MICCAI-Challenge online test set, our model reaches impressive
+accuracy, and performs well compared with the original decoder setting on the
+Synapse multi-organ segmentation dataset.
+
+
+
+
+
+ + ☆ ID-Aligner: Enhancing Identity-Preserving Text-to-Image Generation with + Reward Feedback Learning + + +
+ The rapid development of diffusion models has triggered diverse applications.
+Identity-preserving text-to-image generation (ID-T2I) particularly has received
+significant attention due to its wide range of application scenarios like AI
+portrait and advertising. While existing ID-T2I methods have demonstrated
+impressive results, several key challenges remain: (1) it is hard to maintain
+the identity characteristics of reference portraits accurately, (2) the
+generated images lack aesthetic appeal, especially while enforcing identity
+retention, and (3) existing methods cannot be compatible with both LoRA-based
+and Adapter-based methods simultaneously. To address these issues, we present
+\textbf{ID-Aligner}, a general feedback learning framework to enhance ID-T2I
+performance. To address the loss of identity features, we introduce identity
+consistency reward fine-tuning to utilize the feedback from face detection and
+recognition models to improve generated identity preservation. Furthermore, we
+propose identity aesthetic reward fine-tuning leveraging rewards from
+human-annotated preference data and automatically constructed feedback on
+character structure generation to provide aesthetic tuning signals. Thanks to
+its universal feedback fine-tuning framework, our method can be readily applied
+to both LoRA and Adapter models, achieving consistent performance gains.
+Extensive experiments on SD1.5 and SDXL diffusion models validate the
+effectiveness of our approach. \textbf{Project Page:
+\url{https://idaligner.github.io/}}
+
+
+
+
+
+ + ☆ GLoD: Composing Global Contexts and Local Details in Image Generation + + +
+ Diffusion models have demonstrated their capability to synthesize +high-quality and diverse images from textual prompts. However, simultaneous +control over both global contexts (e.g., object layouts and interactions) and +local details (e.g., colors and emotions) still remains a significant +challenge. The models often fail to understand complex descriptions involving +multiple objects and reflect specified visual attributes to wrong targets or +ignore them. This paper presents Global-Local Diffusion (\textit{GLoD}), a +novel framework which allows simultaneous control over the global contexts and +the local details in text-to-image generation without requiring training or +fine-tuning. It assigns multiple global and local prompts to corresponding +layers and composes their noises to guide a denoising process using pre-trained +diffusion models. Our framework enables complex global-local compositions, +conditioning objects in the global prompt with the local prompts while +preserving other unspecified identities. Our quantitative and qualitative +evaluations demonstrate that GLoD effectively generates complex images that +adhere to both user-provided object interactions and object details. + +
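+ As a sketch of what composing global and local denoising signals can look
+like (assuming a diffusers-style epsilon-predicting UNet and a simple
+mask-weighted blend, which is not necessarily GLoD's exact rule):
+
+```python
+import torch
+
+def composed_noise(unet, latent, t, global_emb, local_prompts):
+    """Blend a global noise prediction with locally prompted predictions.
+    `local_prompts` is a list of (text embedding, (1, 1, H, W) mask, weight)."""
+    noise = unet(latent, t, encoder_hidden_states=global_emb).sample    # global context
+    for local_emb, region_mask, weight in local_prompts:
+        local_noise = unet(latent, t, encoder_hidden_states=local_emb).sample
+        # Pull the prediction toward the local prompt only inside its region.
+        noise = noise + weight * region_mask * (local_noise - noise)
+    return noise
+```
+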
+
+
+
+
+ + ☆ Deep multi-prototype capsule networks + + +
+ Capsule networks are a type of neural network that identify image parts and +form the instantiation parameters of a whole hierarchically. The goal behind +the network is to perform an inverse computer graphics task, and the network +parameters are the mapping weights that transform parts into a whole. The +trainability of capsule networks in complex data with high intra-class or +intra-part variation is challenging. This paper presents a multi-prototype +architecture for guiding capsule networks to represent the variations in the +image parts. To this end, instead of considering a single capsule for each +class and part, the proposed method employs several capsules (co-group +capsules), capturing multiple prototypes of an object. In the final layer, +co-group capsules compete, and their soft output is considered the target for a +competitive cross-entropy loss. Moreover, in the middle layers, the most active +capsules map to the next layer with a shared weight among the co-groups. +Consequently, due to the reduction in parameters, implicit weight-sharing makes +it possible to have more deep capsule network layers. The experimental results +on MNIST, SVHN, C-Cube, CEDAR, MCYT, and UTSig datasets reveal that the +proposed model outperforms others regarding image classification accuracy. + +
+
+
+
+
+ + ☆ Iterative Cluster Harvesting for Wafer Map Defect Patterns + + +
+ Unsupervised clustering of wafer map defect patterns is challenging because +the appearance of certain defect patterns varies significantly. This includes +changing shape, location, density, and rotation of the defect area on the +wafer. We present a harvesting approach, which can cluster even challenging +defect patterns of wafer maps well. Our approach makes use of a well-known, +three-step procedure: feature extraction, dimension reduction, and clustering. +The novelty in our approach lies in repeating dimensionality reduction and +clustering iteratively while filtering out one cluster per iteration according +to its silhouette score. This method leads to an improvement of clustering +performance in general and is especially useful for difficult defect patterns. +The low computational effort allows for a quick assessment of large datasets +and can be used to support manual labeling efforts. We benchmark against +related approaches from the literature and show improved results on a +real-world industrial dataset. + +
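+ The harvesting loop itself is simple to sketch; the choices of PCA, K-means,
+and the parameter values below are illustrative assumptions rather than the
+paper's configuration:
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from sklearn.metrics import silhouette_samples
+
+def iterative_cluster_harvesting(features, n_clusters=5, n_components=10, n_rounds=8):
+    """Repeat (reduce, cluster), harvesting the best-silhouette cluster each round."""
+    remaining = np.arange(len(features))
+    harvested_label = -np.ones(len(features), dtype=int)   # -1 = never harvested
+    for round_id in range(n_rounds):
+        if len(remaining) < 2 * n_clusters:
+            break
+        n_comp = min(n_components, features.shape[1], len(remaining) - 1)
+        reduced = PCA(n_components=n_comp).fit_transform(features[remaining])
+        labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(reduced)
+        sil = silhouette_samples(reduced, labels)
+        # Filter out the most compact, well-separated cluster this round.
+        best = max(range(n_clusters), key=lambda c: sil[labels == c].mean())
+        harvested_label[remaining[labels == best]] = round_id
+        remaining = remaining[labels != best]
+    return harvested_label
+```
+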
+
+
+
+
+ + ☆ Wiki-LLaVA: Hierarchical Retrieval-Augmented Generation for Multimodal + LLMs CVPR 2024 + + +
+ Multimodal LLMs are the natural evolution of LLMs, and enlarge their +capabilities so as to work beyond the pure textual modality. As research is +being carried out to design novel architectures and vision-and-language +adapters, in this paper we concentrate on endowing such models with the +capability of answering questions that require external knowledge. Our +approach, termed Wiki-LLaVA, aims at integrating an external knowledge source +of multimodal documents, which is accessed through a hierarchical retrieval +pipeline. Relevant passages, using this approach, are retrieved from the +external knowledge source and employed as additional context for the LLM, +augmenting the effectiveness and precision of generated dialogues. We conduct +extensive experiments on datasets tailored for visual question answering with +external data and demonstrate the appropriateness of our approach. + +
+
+ comment: CVPR 2024 Workshop on What is Next in Multimodal Foundation Models +
+
+
+
+
+ + ☆ On Generating Cancelable Biometric Template using Reverse of Boolean XOR + + +
+ A cancelable biometric template is a repeatable distortion embedded in the
+original biometric image to keep it secure from unauthorized access. In this
+paper, we generate cancelable biometric templates with a reverse Boolean XOR
+technique. Three different methods are proposed for the generation of
+cancelable biometric templates based on a Visual Secret Sharing scheme. In each
+method, one Secret image and n-1 Cover images are used as follows: (M1) one
+original biometric image (Secret) with n-1 randomly chosen gray Cover images;
+(M2) one original Secret image with n-1 Cover images, which are randomly
+permuted versions of the original Secret image; (M3) one Secret image with n-1
+Cover images, where both the Secret image and the Cover images are randomly
+permuted versions of the original biometric image. Experiments were performed
+on the publicly available ORL Face database and the IIT Delhi Iris database.
+The performance of the proposed methods is compared in terms of Correlation
+Coefficient (Cr), Mean Square Error (MSE), Mean Absolute Error (MAE),
+Structural Similarity (SSIM), Peak Signal to Noise Ratio (PSNR), Number of
+Pixel Change Rate (NPCR), and Unified Average Changing Intensity (UACI). Among
+the three proposed methods, M3 generates good-quality cancelable templates and
+gives the best performance in terms of quality. M3 is also better in
+quantitative terms on the ORL dataset, while M2 and M3 are comparable on the
+IIT Delhi Iris dataset.
+
+
+
+
+
+ + ☆ Sum of Group Error Differences: A Critical Examination of Bias + Evaluation in Biometric Verification and a Dual-Metric Measure + + +
+ Biometric Verification (BV) systems often exhibit accuracy disparities across +different demographic groups, leading to biases in BV applications. Assessing +and quantifying these biases is essential for ensuring the fairness of BV +systems. However, existing bias evaluation metrics in BV have limitations, such +as focusing exclusively on match or non-match error rates, overlooking bias on +demographic groups with performance levels falling between the best and worst +performance levels, and neglecting the magnitude of the bias present. + This paper presents an in-depth analysis of the limitations of current bias +evaluation metrics in BV and, through experimental analysis, demonstrates their +contextual suitability, merits, and limitations. Additionally, it introduces a +novel general-purpose bias evaluation measure for BV, the ``Sum of Group Error +Differences (SEDG)''. Our experimental results on controlled synthetic datasets +demonstrate the effectiveness of demographic bias quantification when using +existing metrics and our own proposed measure. We discuss the applicability of +the bias evaluation metrics in a set of simulated demographic bias scenarios +and provide scenario-based metric recommendations. Our code is publicly +available under \url{https://github.com/alaaobeid/SEDG}. + +
+
+
+
+
+ + ☆ WANDR: Intention-guided Human Motion Generation + + +
+ Synthesizing natural human motions that enable a 3D human avatar to walk and +reach for arbitrary goals in 3D space remains an unsolved problem with many +applications. Existing methods (data-driven or using reinforcement learning) +are limited in terms of generalization and motion naturalness. A primary +obstacle is the scarcity of training data that combines locomotion with goal +reaching. To address this, we introduce WANDR, a data-driven model that takes +an avatar's initial pose and a goal's 3D position and generates natural human +motions that place the end effector (wrist) on the goal location. To solve +this, we introduce novel intention features that drive rich goal-oriented +movement. Intention guides the agent to the goal, and interactively adapts the +generation to novel situations without needing to define sub-goals or the +entire motion path. Crucially, intention allows training on datasets that have +goal-oriented motions as well as those that do not. WANDR is a conditional +Variational Auto-Encoder (c-VAE), which we train using the AMASS and CIRCLE +datasets. We evaluate our method extensively and demonstrate its ability to +generate natural and long-term motions that reach 3D goals and generalize to +unseen goal locations. Our models and code are available for research purposes +at wandr.is.tue.mpg.de. + +
+
+
+
+
+ + ☆ Hierarchical Hybrid Sliced Wasserstein: A Scalable Metric for + Heterogeneous Joint Distributions + + +
+ Sliced Wasserstein (SW) and Generalized Sliced Wasserstein (GSW) have been
+widely used in applications due to their computational and statistical
+scalability. However, the SW and the GSW are only defined between distributions
+supported on a homogeneous domain. This limitation prevents their usage in
+applications with heterogeneous joint distributions whose marginal
+distributions are supported on multiple different domains. Using SW and GSW
+directly on the joint domains cannot make a meaningful comparison since their
+homogeneous slicing operators, i.e., the Radon Transform (RT) and the
+Generalized Radon Transform (GRT), are not expressive enough to capture the
+structure of the joint support set. To address the issue, we propose two new
+slicing operators, i.e., the Partial Generalized Radon Transform (PGRT) and the
+Hierarchical Hybrid Radon Transform (HHRT). In greater detail, PGRT is the
+generalization of the Partial Radon Transform (PRT), which transforms a subset
+of function arguments non-linearly, while HHRT is the composition of PRT and
+multiple domain-specific PGRTs on marginal domain arguments. By using HHRT, we
+extend the SW into the Hierarchical Hybrid Sliced Wasserstein (H2SW) distance,
+which is designed specifically for comparing heterogeneous joint distributions.
+We then discuss the topological, statistical, and computational properties of
+H2SW. Finally, we demonstrate the favorable performance of H2SW in 3D mesh
+deformation, deep 3D mesh autoencoders, and dataset comparison.
+
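+ For context, the standard sliced Wasserstein distance that these operators
+generalize compares distributions through one-dimensional projections:
+
+```latex
+\mathrm{SW}_p(\mu,\nu)
+  = \left( \int_{\mathbb{S}^{d-1}}
+      W_p^p\big(\langle\theta,\cdot\rangle_{\#}\mu,\,
+                \langle\theta,\cdot\rangle_{\#}\nu\big)\,
+      \mathrm{d}\sigma(\theta) \right)^{1/p}
+```
+
+ Here $\langle\theta,\cdot\rangle_{\#}\mu$ denotes the pushforward of $\mu$
+under projection onto the direction $\theta$, and $\sigma$ is the uniform
+measure on the unit sphere; as described above, H2SW keeps this outer structure
+but replaces the linear slicing with the HHRT operator so that heterogeneous
+marginal domains can be compared.
+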
+
+ comment: 24 pages, 11 figures, 4 tables +
+
+
+
+
+ + ☆ Photometry of Saturated Stars with Machine Learning + + +
+ We develop a deep neural network (DNN) to obtain photometry of saturated +stars in the All-Sky Automated Survey for Supernovae (ASAS-SN). The DNN can +obtain unbiased photometry for stars from g=4 to 14 mag with a dispersion +(15%-85% 1sigma range around median) of 0.12 mag for saturated (g<11.5 mag) +stars. More importantly, the light curve of a non-variable saturated star has a +median dispersion of only 0.037 mag. The DNN light curves are, in many cases, +spectacularly better than provided by the standard ASAS-SN pipelines. While the +network was trained on g band data from only one of ASAS-SN's 20 cameras, +initial experiments suggest that it can be used for any camera and the older +ASAS-SN V band data as well. The dominant problems seem to be associated with +correctable issues in the ASAS-SN data reduction pipeline for saturated stars +more than the DNN itself. The method is publicly available as a light curve +option on ASAS-SN Sky Patrol v1.0. + +
+
+ comment: submitted to ApJ +
+
+
+
+
+ + ☆ Adapting an Artificial Intelligence Sexually Transmitted Diseases + Symptom Checker Tool for Mpox Detection: The HeHealth Experience + + +
+ Artificial Intelligence applications have shown promise in the management of
+pandemics and have been widely used to assist the identification,
+classification, and diagnosis of medical images. In response to the global
+outbreak of Monkeypox (Mpox), the HeHealth.ai team leveraged an existing tool
+to screen for sexually transmitted diseases to develop a digital screening test
+for symptomatic Mpox through AI approaches. Prior to the global outbreak of
+Mpox, the team developed a smartphone app, where app users can use their own
+smartphone cameras to take pictures of their own penises to screen for
+symptomatic STDs. The AI model was initially developed using 5000 cases and
+uses a modified convolutional neural network to output prediction scores across
+visually diagnosable penis pathologies including Syphilis, Herpes Simplex
+Virus, and Human Papilloma Virus. From June 2022 to October 2022, a total of
+about 22,000 users downloaded the HeHealth app, and about 21,000 images were
+analyzed using HeHealth AI technology. We then engaged in formative research,
+stakeholder engagement, rapid consolidation of images, a validation study, and
+implementation of the tool from July 2022. From July 2022 to October 2022, a
+total of 1000 Mpox-related images were used to train the Mpox symptom checker
+tool. Our digital symptom checker tool showed an accuracy of 87% to rule in
+Mpox and 90% to rule out symptomatic Mpox. Several hurdles identified included
+issues of data privacy and security for app users, an initial lack of data to
+train the AI tool, and the generalizability of input data. We offer several
+suggestions to help others get started on similar projects in emergency
+situations, including engaging a wide range of stakeholders, having a
+multidisciplinary team, prioritizing pragmatism, as well as the concept that
+big data in fact is made up of small data.
+
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ ThermoPore: Predicting Part Porosity Based on Thermal Images Using Deep + Learning + + +
+ We present a deep learning approach for quantifying and localizing ex-situ +porosity within Laser Powder Bed Fusion fabricated samples utilizing in-situ +thermal image monitoring data. Our goal is to build the real time porosity map +of parts based on thermal images acquired during the build. The quantification +task builds upon the established Convolutional Neural Network model +architecture to predict pore count and the localization task leverages the +spatial and temporal attention mechanisms of the novel Video Vision Transformer +model to indicate areas of expected porosity. Our model for porosity +quantification achieved a $R^2$ score of 0.57 and our model for porosity +localization produced an average IoU score of 0.32 and a maximum of 1.0. This +work is setting the foundations of part porosity "Digital Twins" based on +additive manufacturing monitoring data and can be applied downstream to reduce +time-intensive post-inspection and testing activities during part qualification +and certification. In addition, we seek to accelerate the acquisition of +crucial insights normally only available through ex-situ part evaluation by +means of machine learning analysis of in-situ process monitoring data. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised 3D Object Detection via Multi-Level Visual Guidance + + +
+ Weakly supervised 3D object detection aims to learn a 3D detector with lower +annotation cost, e.g., 2D labels. Unlike prior work which still relies on few +accurate 3D annotations, we propose a framework to study how to leverage +constraints between 2D and 3D domains without requiring any 3D labels. +Specifically, we employ visual data from three perspectives to establish +connections between 2D and 3D domains. First, we design a feature-level +constraint to align LiDAR and image features based on object-aware regions. +Second, the output-level constraint is developed to enforce the overlap between +2D and projected 3D box estimations. Finally, the training-level constraint is +utilized by producing accurate and consistent 3D pseudo-labels that align with +the visual data. We conduct extensive experiments on the KITTI dataset to +validate the effectiveness of the proposed three constraints. Without using any +3D labels, our method achieves favorable performance against state-of-the-art +approaches and is competitive with the method that uses 500-frame 3D +annotations. Code and models will be made publicly available at +https://github.com/kuanchihhuang/VG-W3D. + +
+
+ comment: Project page: https://github.com/kuanchihhuang/VG-W3D +
+
+
+
+
+ + ♻ ☆ VideoXum: Cross-modal Visual and Textural Summarization of Videos + + +
+ Video summarization aims to distill the most important information from a +source video to produce either an abridged clip or a textual narrative. +Traditionally, different methods have been proposed depending on whether the +output is a video or text, thus ignoring the correlation between the two +semantically related tasks of visual summarization and textual summarization. +We propose a new joint video and text summarization task. The goal is to +generate both a shortened video clip along with the corresponding textual +summary from a long video, collectively referred to as a cross-modal summary. +The generated shortened video clip and text narratives should be semantically +well aligned. To this end, we first build a large-scale human-annotated dataset +-- VideoXum (X refers to different modalities). The dataset is reannotated +based on ActivityNet. After we filter out the videos that do not meet the +length requirements, 14,001 long videos remain in our new dataset. Each video +in our reannotated dataset has human-annotated video summaries and the +corresponding narrative summaries. We then design a novel end-to-end model -- +VTSUM-BILP to address the challenges of our proposed task. Moreover, we propose +a new metric called VT-CLIPScore to help evaluate the semantic consistency of +cross-modality summary. The proposed model achieves promising performance on +this new task and establishes a benchmark for future research. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for + Highway Surveillance through Graph Isomorphism and Transformer + + +
+ Enhancing roadway safety has become an essential computer vision focus area +for Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle +Trajectory Prediction (VTP) aims to forecast a vehicle's future positions based +on its past and current movements. VTP is a pivotal element for road safety, +aiding in applications such as traffic management, accident prevention, +work-zone safety, and energy optimization. While most works in this field focus +on autonomous driving, with the growing number of surveillance cameras, another +sub-field emerges for surveillance VTP with its own set of challenges. In this +paper, we introduce VT-Former, a novel transformer-based VTP approach for +highway safety and surveillance. In addition to utilizing transformers to +capture long-range temporal patterns, a new Graph Attentive Tokenization (GAT) +module has been proposed to capture intricate social interactions among +vehicles. This study seeks to explore both the advantages and the limitations +inherent in combining transformer architecture with graphs for VTP. Our +investigation, conducted across three benchmark datasets from diverse +surveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable +performance of VT-Former in predicting vehicle trajectories. This study +underscores the potential of VT-Former and its architecture, opening new +avenues for future research and exploration. + +
+
+ comment: Completely updated based on the reviews received for the paper +
+
+
+
+
+ + ♻ ☆ Co-Speech Gesture Detection through Multi-Phase Sequence Labeling + + +
+ Gestures are integral components of face-to-face communication. They unfold +over time, often following predictable movement phases of preparation, stroke, +and retraction. Yet, the prevalent approach to automatic gesture detection +treats the problem as binary classification, classifying a segment as either +containing a gesture or not, thus failing to capture its inherently sequential +and contextual nature. To address this, we introduce a novel framework that +reframes the task as a multi-phase sequence labeling problem rather than binary +classification. Our model processes sequences of skeletal movements over time +windows, uses Transformer encoders to learn contextual embeddings, and +leverages Conditional Random Fields to perform sequence labeling. We evaluate +our proposal on a large dataset of diverse co-speech gestures in task-oriented +face-to-face dialogues. The results consistently demonstrate that our method +significantly outperforms strong baseline models in detecting gesture strokes. +Furthermore, applying Transformer encoders to learn contextual embeddings from +movement sequences substantially improves gesture unit detection. These results +highlight our framework's capacity to capture the fine-grained dynamics of +co-speech gesture phases, paving the way for more nuanced and accurate gesture +detection and analysis. + +
+
+
+
+
+ + ♻ ☆ CLIP-QDA: An Explainable Concept Bottleneck Model + + +
+ In this paper, we introduce an explainable algorithm designed from a
+multi-modal foundation model that performs fast and explainable image
+classification. Drawing inspiration from CLIP-based Concept Bottleneck Models
+(CBMs), our method creates a latent space where each neuron is linked to a
+specific word. Observing that this latent space can be modeled with simple
+distributions, we use a Mixture of Gaussians (MoG) formalism to enhance the
+interpretability of this latent space. Then, we introduce CLIP-QDA, a
+classifier that only uses statistical values to infer labels from the concepts.
+In addition, this formalism allows for both local and global explanations.
+These explanations come from the inner design of our architecture; our work is
+thus part of a new family of greybox models, combining the performance of
+opaque foundation models with the interpretability of transparent models. Our
+empirical findings show that in instances where the MoG assumption holds,
+CLIP-QDA achieves accuracy similar to state-of-the-art CBM methods. Our
+explanations compete with existing XAI methods while being faster to compute.
+
+
+
+
+
+ + ♻ ☆ Improving Video Corpus Moment Retrieval with Partial Relevance + Enhancement ICMR 2024 + + +
+ Video Corpus Moment Retrieval (VCMR) is a new video retrieval task aimed at
+retrieving a relevant moment from a large corpus of untrimmed videos using a
+text query. The relevance between the video and query is partial, mainly
+evident in two aspects: (1) Scope: The untrimmed video contains many frames,
+but not all are relevant to the query. Strong relevance is typically observed
+only within the relevant moment. (2) Modality: The relevance of the query
+varies with different modalities. Action descriptions align more with visual
+elements, while character conversations are more related to textual
+information. Existing methods often treat all video contents equally, leading
+to sub-optimal moment retrieval. We argue that effectively capturing the
+partial relevance between the query and video is essential for the VCMR task.
+To this end, we propose a Partial Relevance Enhanced Model (PREM) to improve
+VCMR. VCMR involves two sub-tasks: video retrieval and moment localization. To
+align with their distinct objectives, we implement specialized partial
+relevance enhancement strategies. For video retrieval, we introduce a
+multi-modal collaborative video retriever, generating different query
+representations for the two modalities by modality-specific pooling, ensuring a
+more effective match. For moment localization, we propose the focus-then-fuse
+moment localizer, utilizing modality-specific gates to capture essential
+content. We also introduce relevant content-enhanced training methods for both
+retriever and localizer to enhance the ability of the model to capture relevant
+content. Experimental results on TVR and DiDeMo datasets show that the proposed
+model outperforms the baselines, achieving a new state-of-the-art of VCMR. The
+code is available at \url{https://github.com/hdy007007/PREM}.
+
+
+ comment: camera-ready version of ACM ICMR 2024 +
+
+
+
+
+ + ♻ ☆ Subobject-level Image Tokenization + + +
+ Transformer-based vision models typically tokenize images into fixed-size
+square patches as input units, which lacks adaptability to image content and
+overlooks the inherent pixel grouping structure. Inspired by the subword
+tokenization widely adopted in language models, we propose an image tokenizer
+at a subobject level, where the subobjects are represented by semantically
+meaningful image segments obtained by segmentation models (e.g., segment
+anything models). To implement a learning system based on subobject
+tokenization, we first introduce a Direct Segment Anything Model (DirectSAM)
+that efficiently produces comprehensive segmentations of subobjects, then embed
+the subobjects into compact latent vectors and feed them into a large language
+model for vision language learning. Empirical results demonstrate that our
+subobject-level tokenization significantly facilitates efficient learning of
+translating images into object and attribute descriptions compared to the
+traditional patch-level tokenization. Codes and models are open-sourced at
+https://github.com/ChenDelong1999/subobjects.
+
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Visual Grounding Methods for VQA are Working for the Wrong Reasons! ACL 2020 + + +
+ Existing Visual Question Answering (VQA) methods tend to exploit dataset +biases and spurious statistical correlations, instead of producing right +answers for the right reasons. To address this issue, recent bias mitigation +methods for VQA propose to incorporate visual cues (e.g., human attention maps) +to better ground the VQA models, showcasing impressive gains. However, we show +that the performance improvements are not a result of improved visual +grounding, but a regularization effect which prevents over-fitting to +linguistic priors. For instance, we find that it is not actually necessary to +provide proper, human-based cues; random, insensible cues also result in +similar improvements. Based on this observation, we propose a simpler +regularization scheme that does not require any external annotations and yet +achieves near state-of-the-art performance on VQA-CPv2. + +
+
+ comment: Published in ACL 2020 under the title "A negative case analysis of + visual grounding methods for VQA" +
+
+
+
+
+ + ♻ ☆ Attention-Map Augmentation for Hypercomplex Breast Cancer Classification + + +
+ Breast cancer is the most widespread neoplasm among women and early detection +of this disease is critical. Deep learning techniques have become of great +interest to improve diagnostic performance. However, distinguishing between +malignant and benign masses in whole mammograms poses a challenge, as they +appear nearly identical to an untrained eye, and the region of interest (ROI) +constitutes only a small fraction of the entire image. In this paper, we +propose a framework, parameterized hypercomplex attention maps (PHAM), to +overcome these problems. Specifically, we deploy an augmentation step based on +computing attention maps. Then, the attention maps are used to condition the +classification step by constructing a multi-dimensional input comprised of the +original breast cancer image and the corresponding attention map. In this step, +a parameterized hypercomplex neural network (PHNN) is employed to perform +breast cancer classification. The framework offers two main advantages. First, +attention maps provide critical information regarding the ROI and allow the +neural model to concentrate on it. Second, the hypercomplex architecture has +the ability to model local relations between input dimensions thanks to +hypercomplex algebra rules, thus properly exploiting the information provided +by the attention map. We demonstrate the efficacy of the proposed framework on +both mammography images as well as histopathological ones. We surpass +attention-based state-of-the-art networks and the real-valued counterpart of +our approach. The code of our work is available at +https://github.com/ispamm/AttentionBCS. + +
+
+ comment: Published in Elsevier Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ Diagnosis of Multiple Fundus Disorders Amidst a Scarcity of Medical + Experts Via Self-supervised Machine Learning + + +
+ Fundus diseases are major causes of visual impairment and blindness +worldwide, especially in underdeveloped regions, where the shortage of +ophthalmologists hinders timely diagnosis. AI-assisted fundus image analysis +has several advantages, such as high accuracy, reduced workload, and improved +accessibility, but it requires a large amount of expert-annotated data to build +reliable models. To address this dilemma, we propose a general self-supervised +machine learning framework that can handle diverse fundus diseases from +unlabeled fundus images. Our method's AUC surpasses existing supervised +approaches by 15.7%, and even exceeds performance of a single human expert. +Furthermore, our model adapts well to various datasets from different regions, +races, and heterogeneous image sources or qualities from multiple cameras or +devices. Our method offers a label-free general framework to diagnose fundus +diseases, which could potentially benefit telehealth programs for early +screening of people at risk of vision loss. + +
+
+
+
+
+ + ♻ ☆ RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key + Identification + + +
+ We revisit Tree-Ring Watermarking, a recent diffusion model watermarking +method that demonstrates great robustness to various attacks. We conduct an +in-depth study on it and reveal that the distribution shift unintentionally +introduced by the watermarking process, apart from watermark pattern matching, +contributes to its exceptional robustness. Our investigation further exposes +inherent flaws in its original design, particularly in its ability to identify +multiple distinct keys, where distribution shift offers no assistance. Based on +these findings and analysis, we present RingID for enhanced multi-key +identification. It consists of a novel multi-channel heterogeneous watermarking +approach designed to seamlessly amalgamate distinctive advantages from diverse +watermarks. Coupled with a series of suggested enhancements, RingID exhibits +substantial advancements in multi-key identification. Github Page: +https://github.com/showlab/RingID + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ CT-NeRF: Incremental Optimizing Neural Radiance Field and Poses with + Complex Trajectory + + +
+ Neural radiance field (NeRF) has achieved impressive results in high-quality
+3D scene reconstruction. However, NeRF heavily relies on precise camera poses.
+While recent works like BARF have introduced camera pose optimization within
+NeRF, their applicability is limited to simple trajectory scenes. Existing
+methods struggle when tackling complex trajectories involving large rotations.
+To address this limitation, we propose CT-NeRF, an incremental reconstruction
+optimization pipeline using only RGB images without pose and depth input. In
+this pipeline, we first propose a local-global bundle adjustment over a pose
+graph connecting neighboring frames, which enforces consistency between poses
+and helps escape the local minima caused by enforcing only the consistency
+between poses and the scene structure. Further, we instantiate the consistency
+between poses as a reprojected geometric image distance constraint derived from
+pixel-level correspondences between input image pairs. Through the incremental
+reconstruction, CT-NeRF enables the recovery of both camera poses and scene
+structure and is capable of handling scenes with complex trajectories. We
+evaluate the performance of CT-NeRF on two real-world datasets, NeRFBuster and
+Free-Dataset, which feature complex trajectories. Results show that CT-NeRF
+outperforms existing methods in novel view synthesis and pose estimation
+accuracy.
+
+
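The "reprojected geometric image distance" mentioned above can be illustrated in a few lines of PyTorch; the intrinsics, correspondences, and depths below are toy values for illustration, not from CT-NeRF:

```python
# Pixels in frame A with estimated depth are back-projected, transformed by the
# relative pose (R, t), projected into frame B, and compared to their matched
# pixels; the mean pixel distance is the penalized quantity.
import torch

def reprojection_distance(uv_a, uv_b, depth_a, K, R, t):
    """uv_a, uv_b: (N, 2) matched pixels; depth_a: (N,); K: (3, 3); R: (3, 3); t: (3,)."""
    ones = torch.ones(uv_a.shape[0], 1)
    rays = torch.linalg.solve(K, torch.cat([uv_a, ones], dim=1).T)   # (3, N)
    pts_a = rays * depth_a                                           # 3D points in camera A
    pts_b = R @ pts_a + t[:, None]                                   # into camera B frame
    proj = K @ pts_b
    uv_b_hat = (proj[:2] / proj[2:]).T                               # (N, 2)
    return (uv_b_hat - uv_b).norm(dim=1).mean()

K = torch.tensor([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
uv_a = torch.tensor([[100., 120.], [300., 200.]])
depth = torch.tensor([2.0, 3.5])
R, t = torch.eye(3), torch.tensor([0.1, 0.0, 0.0])
uv_b = uv_a.clone()                        # fake matches; real ones come from a matcher
print(float(reprojection_distance(uv_a, uv_b, depth, K, R, t)))
```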
+
+
+
+
+ + ♻ ☆ RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM + + +
+ Multi-modal 3D object detectors are dedicated to exploring secure and
+reliable perception systems for autonomous driving (AD). Although they achieve
+state-of-the-art (SOTA) performance on clean benchmark datasets, they tend to
+overlook the complexity and harsh conditions of real-world environments. With
+the emergence of visual foundation models (VFMs), opportunities and challenges
+are presented for improving the robustness and generalization of multi-modal 3D
+object detection in AD. Therefore, we propose RoboFusion, a robust framework
+that leverages VFMs like SAM to tackle out-of-distribution (OOD) noise
+scenarios. We first adapt the original SAM to AD scenarios, yielding SAM-AD. To
+align SAM or SAM-AD with multi-modal methods, we then introduce AD-FPN for
+upsampling the image features extracted by SAM. We employ wavelet decomposition
+to denoise the depth-guided images, further reducing noise and weather
+interference. Finally, we employ self-attention mechanisms to adaptively
+reweight the fused features, enhancing informative features while suppressing
+excess noise. In summary, RoboFusion significantly reduces noise by leveraging
+the generalization and robustness of VFMs, thereby enhancing the resilience of
+multi-modal 3D object detection. Consequently, RoboFusion achieves SOTA
+performance in noisy scenarios, as demonstrated by the KITTI-C and nuScenes-C
+benchmarks. Code is available at https://github.com/adept-thu/RoboFusion.
+
+
+
+
+
+
+ + ♻ ☆ Are Semi-Dense Detector-Free Methods Good at Matching Local Features? + + +
+ Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among
+the most popular image matching methods. While SDF methods are trained to
+establish correspondences between two images, their performance is almost
+exclusively evaluated using relative pose estimation metrics. The link between
+their ability to establish correspondences and the quality of the resulting
+estimated pose has thus far received little attention. This paper is a first
+attempt to study this link. We start by proposing a novel structured
+attention-based image matching architecture (SAM). It allows us to show a
+counter-intuitive result on two datasets (MegaDepth and HPatches): on the one
+hand, SAM either outperforms or is on par with SDF methods in terms of
+pose/homography estimation metrics; on the other hand, SDF approaches are
+significantly better than SAM in terms of matching accuracy. We then propose to
+limit the computation of the matching accuracy to textured regions, and show
+that in this case SAM often surpasses SDF methods. Our findings highlight a
+strong correlation between the ability to establish accurate correspondences in
+textured regions and the accuracy of the resulting estimated pose/homography.
+Our code will be made available.
+
+
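A hedged sketch of the evaluation idea in the last part of the abstract, restricting matching accuracy to textured regions; the gradient-magnitude texture mask and the pixel tolerance are assumptions, not the paper's exact protocol:

```python
# Score correspondence accuracy only on keypoints lying in "textured" areas,
# using local gradient magnitude as a simple texture criterion.
import numpy as np

def texture_mask(gray, thresh=10.0):
    gy, gx = np.gradient(gray.astype(np.float32))
    return np.hypot(gx, gy) > thresh                 # True where the image is textured

def matching_accuracy(kpts_a, kpts_b_pred, kpts_b_gt, gray_a, px_tol=3.0):
    mask = texture_mask(gray_a)
    keep = mask[kpts_a[:, 1].astype(int), kpts_a[:, 0].astype(int)]
    if not keep.any():
        return float("nan")
    err = np.linalg.norm(kpts_b_pred[keep] - kpts_b_gt[keep], axis=1)
    return float((err < px_tol).mean())

gray = np.zeros((240, 320), np.float32)
gray[:, 160:] = 255.0                                # one strong vertical edge
kpts_a = np.array([[50, 50], [160, 100]], float)     # flat region vs. edge
pred = np.array([[52, 50], [161, 100]], float)
gt = np.array([[50, 50], [160, 100]], float)
print(matching_accuracy(kpts_a, pred, gt, gray))     # scored on textured points only
```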
+
+
+
+
+ + ♻ ☆ GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via + Blender-Oriented GPT Planning + + +
+ Recent advances in text-to-video generation have harnessed the power of
+diffusion models to create visually compelling content conditioned on text
+prompts. However, they usually encounter high computational costs and often
+struggle to produce videos with coherent physical motions. To tackle these
+issues, we propose GPT4Motion, a training-free framework that leverages the
+planning capability of large language models such as GPT, the physical
+simulation strength of Blender, and the excellent image generation ability of
+text-to-image diffusion models to enhance the quality of video synthesis.
+Specifically, GPT4Motion employs GPT-4 to generate a Blender script based on a
+user textual prompt, which commands Blender's built-in physics engine to craft
+fundamental scene components that encapsulate coherent physical motions across
+frames. These components are then fed into Stable Diffusion to generate a
+video aligned with the textual prompt. Experimental results on three basic
+physical motion scenarios, including rigid object drop and collision, cloth
+draping and swinging, and liquid flow, demonstrate that GPT4Motion can
+efficiently generate high-quality videos while maintaining motion coherency and
+entity consistency. GPT4Motion offers new insights into text-to-video research,
+enhancing its quality and broadening the horizon for further exploration.
+
+
+
+
+
+
+ + ♻ ☆ StreakNet-Arch: An Anti-scattering Network-based Architecture for + Underwater Carrier LiDAR-Radar Imaging + + +
+ In this paper, we introduce StreakNet-Arch, a novel signal processing
+architecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging
+systems, to address the limitations in scatter suppression and real-time
+imaging. StreakNet-Arch formulates the signal processing as a real-time,
+end-to-end binary classification task, enabling real-time image acquisition. To
+achieve this, we leverage Self-Attention networks and propose a novel Double
+Branch Cross Attention (DBC-Attention) mechanism that surpasses the performance
+of traditional methods. Furthermore, we present a method for embedding
+streak-tube camera images into attention networks, effectively acting as a
+learned bandpass filter. To facilitate further research, we contribute a
+publicly available streak-tube camera image dataset containing 2,695,168
+real-world underwater 3D point cloud data points. These advancements
+significantly improve UCLR capabilities, enhancing its performance and
+applicability in underwater imaging tasks. The source code and dataset can be
+found at https://github.com/BestAnHongjun/StreakNet .
+
+
+
+ comment: Reduce the number of pages to 13 +
+
+
+
+
+ + ♻ ☆ Continual Learning with Pre-Trained Models: A Survey IJCAI 2024 + + +
+ Nowadays, real-world applications often face streaming data, which requires +the learning system to absorb new knowledge as data evolves. Continual Learning +(CL) aims to achieve this goal and meanwhile overcome the catastrophic +forgetting of former knowledge when learning new ones. Typical CL methods build +the model from scratch to grow with incoming data. However, the advent of the +pre-trained model (PTM) era has sparked immense research interest, particularly +in leveraging PTMs' robust representational capabilities. This paper presents a +comprehensive survey of the latest advancements in PTM-based CL. We categorize +existing methodologies into three distinct groups, providing a comparative +analysis of their similarities, differences, and respective advantages and +disadvantages. Additionally, we offer an empirical study contrasting various +state-of-the-art methods to highlight concerns regarding fairness in +comparisons. The source code to reproduce these evaluations is available at: +https://github.com/sun-hailong/LAMDA-PILOT + +
+
+ comment: Accepted to IJCAI 2024. Code is available at: + https://github.com/sun-hailong/LAMDA-PILOT +
+
+
+
+
+ + ♻ ☆ ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context SIGGRAPH + + +
+ Neural Radiance Fields (NeRFs) have recently emerged as a popular option for +photo-realistic object capture due to their ability to faithfully capture +high-fidelity volumetric content even from handheld video input. Although much +research has been devoted to efficient optimization leading to real-time +training and rendering, options for interactive editing NeRFs remain limited. +We present a very simple but effective neural network architecture that is fast +and efficient while maintaining a low memory footprint. This architecture can +be incrementally guided through user-friendly image-based edits. Our +representation allows straightforward object selection via semantic feature +distillation at the training stage. More importantly, we propose a local +3D-aware image context to facilitate view-consistent image editing that can +then be distilled into fine-tuned NeRFs, via geometric and appearance +adjustments. We evaluate our setup on a variety of examples to demonstrate +appearance and geometric edits and report 10-30x speedup over concurrent work +focusing on text-guided NeRF editing. Video results can be seen on our project +webpage at https://proteusnerf.github.io. + +
+
+ comment: Accepted at I3D'24 (ACM SIGGRAPH SYMPOSIUM ON INTERACTIVE 3D GRAPHICS + AND GAMES) +
+
+
+
+
+ + ♻ ☆ Adaptive Hybrid Masking Strategy for Privacy-Preserving Face Recognition + Against Model Inversion Attack + + +
+ The utilization of personal sensitive data in training face recognition (FR)
+models poses significant privacy concerns, as adversaries can employ model
+inversion attacks (MIA) to infer the original training data. Existing defense
+methods, such as data augmentation and differential privacy, have been employed
+to mitigate this issue. However, these methods often fail to strike an optimal
+balance between privacy and accuracy. To address this limitation, this paper
+introduces an adaptive hybrid masking algorithm against MIA. Specifically, face
+images are masked in the frequency domain using an adaptive MixUp strategy.
+Unlike the traditional MixUp algorithm, which is predominantly used for data
+augmentation, our modified approach incorporates frequency domain mixing.
+Previous studies have shown that increasing the number of images mixed in MixUp
+can enhance privacy preservation but at the expense of reduced face recognition
+accuracy. To overcome this trade-off, we develop an enhanced adaptive MixUp
+strategy based on reinforcement learning, which enables us to mix a larger
+number of images while maintaining satisfactory recognition accuracy. To
+optimize privacy protection, we propose maximizing the reward function (i.e.,
+the loss function of the FR system) during the training of the strategy
+network, while the loss function of the FR network is minimized when training
+the FR network. The strategy network and the face recognition network can thus
+be viewed as antagonistic entities in the training process, ultimately reaching
+a more balanced trade-off. Experimental results demonstrate that our proposed
+hybrid masking scheme outperforms existing defense algorithms in terms of
+privacy preservation and recognition accuracy against MIA.
+
+
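A loose sketch of frequency-domain mixing (the mixing rule here, blending amplitude spectra while keeping the primary image's phase, is an assumption for illustration; the paper's adaptive, RL-chosen mixing policy is not reproduced):

```python
# Blend several face images in the Fourier domain rather than in pixel space.
import numpy as np

def freq_mixup(primary, others, lam=0.6):
    """Keep the primary image's phase, blend amplitude spectra (assumed mixing rule)."""
    spec_p = np.fft.fft2(primary.astype(np.float64))
    amp = lam * np.abs(spec_p)
    weight = (1.0 - lam) / len(others)
    for img in others:
        amp += weight * np.abs(np.fft.fft2(img.astype(np.float64)))
    mixed = amp * np.exp(1j * np.angle(spec_p))
    return np.real(np.fft.ifft2(mixed))

rng = np.random.default_rng(0)
faces = [rng.random((112, 112)) for _ in range(3)]    # stand-ins for aligned face crops
mixed = freq_mixup(faces[0], faces[1:], lam=0.6)
print(mixed.shape, float(mixed.mean()))
```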
+
+
+
+
+ + ♻ ☆ DreamMatcher: Appearance Matching Self-Attention for + Semantically-Consistent Text-to-Image Personalization + + +
+ The objective of text-to-image (T2I) personalization is to customize a +diffusion model to a user-provided reference concept, generating diverse images +of the concept aligned with the target prompts. Conventional methods +representing the reference concepts using unique text embeddings often fail to +accurately mimic the appearance of the reference. To address this, one solution +may be explicitly conditioning the reference images into the target denoising +process, known as key-value replacement. However, prior works are constrained +to local editing since they disrupt the structure path of the pre-trained T2I +model. To overcome this, we propose a novel plug-in method, called +DreamMatcher, which reformulates T2I personalization as semantic matching. +Specifically, DreamMatcher replaces the target values with reference values +aligned by semantic matching, while leaving the structure path unchanged to +preserve the versatile capability of pre-trained T2I models for generating +diverse structures. We also introduce a semantic-consistent masking strategy to +isolate the personalized concept from irrelevant regions introduced by the +target prompts. Compatible with existing T2I models, DreamMatcher shows +significant improvements in complex scenarios. Intensive analyses demonstrate +the effectiveness of our approach. + +
+
+ comment: Project page is available at https://ku-cvlab.github.io/DreamMatcher/ +
+
+
+
+
+ + ♻ ☆ Seeing is Believing: Mitigating Hallucination in Large Vision-Language + Models via CLIP-Guided Decoding + + +
+ Large Vision-Language Models (LVLMs) are susceptible to object +hallucinations, an issue in which their generated text contains non-existent +objects, greatly limiting their reliability and practicality. Current +approaches often rely on the model's token likelihoods or other internal +information, instruction tuning on additional datasets, or incorporating +complex external tools. We first perform empirical analysis on sentence-level +LVLM hallucination, finding that CLIP similarity to the image acts as a +stronger and more robust indicator of hallucination compared to token +likelihoods. Motivated by this, we introduce our CLIP-Guided Decoding (CGD) +approach, a straightforward but effective training-free approach to reduce +object hallucination at decoding time. CGD uses CLIP to guide the model's +decoding process by enhancing visual grounding of generated text with the +image. Experiments demonstrate that CGD effectively mitigates object +hallucination across multiple LVLM families while preserving the utility of +text generation. Codes are available at +https://github.com/d-ailin/CLIP-Guided-Decoding. + +
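A minimal sketch of the underlying signal, reranking candidate sentences by CLIP image-text similarity with a Hugging Face CLIP checkpoint; this is not the CGD decoding algorithm itself, only the grounding score it builds on:

```python
# Rank candidate captions by CLIP similarity to the image and keep the
# best-grounded one (the model weights are downloaded on first use).
import numpy as np
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # stand-in image
candidates = [
    "A black, empty image.",
    "A dog playing with a red ball in a park.",   # likely hallucinated content
]
inputs = processor(text=candidates, images=image, return_tensors="pt", padding=True)
sims = model(**inputs).logits_per_image[0]        # CLIP similarity per candidate
print(candidates[int(sims.argmax())])
```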
+
+ comment: Code URL: https://github.com/d-ailin/CLIP-Guided-Decoding +
+
+
+
+
+ + ♻ ☆ A Survey on Autonomous Driving Datasets: Statistics, Annotation Quality, + and a Future Outlook + + +
+ Autonomous driving has rapidly developed and shown promising performance due +to recent advances in hardware and deep learning techniques. High-quality +datasets are fundamental for developing reliable autonomous driving algorithms. +Previous dataset surveys either focused on a limited number or lacked detailed +investigation of dataset characteristics. To this end, we present an exhaustive +study of 265 autonomous driving datasets from multiple perspectives, including +sensor modalities, data size, tasks, and contextual conditions. We introduce a +novel metric to evaluate the impact of datasets, which can also be a guide for +creating new datasets. Besides, we analyze the annotation processes, existing +labeling tools, and the annotation quality of datasets, showing the importance +of establishing a standard annotation pipeline. On the other hand, we +thoroughly analyze the impact of geographical and adversarial environmental +conditions on the performance of autonomous driving systems. Moreover, we +exhibit the data distribution of several vital datasets and discuss their pros +and cons accordingly. Finally, we discuss the current challenges and the +development trend of the future autonomous driving datasets. + +
+
+
+
+
+ + ♻ ☆ Fine-tuning vision foundation model for crack segmentation in civil + infrastructures + + +
+ Large-scale foundation models have become the mainstream deep learning +method, while in civil engineering, the scale of AI models is strictly limited. +In this work, a vision foundation model is introduced for crack segmentation. +Two parameter-efficient fine-tuning methods, adapter and low-rank adaptation, +are adopted to fine-tune the foundation model in semantic segmentation: the +Segment Anything Model (SAM). The fine-tuned CrackSAM shows excellent +performance on different scenes and materials. To test the zero-shot +performance of the proposed method, two unique datasets related to road and +exterior wall cracks are collected, annotated and open-sourced, for a total of +810 images. Comparative experiments are conducted with twelve mature semantic +segmentation models. On datasets with artificial noise and previously unseen +datasets, the performance of CrackSAM far exceeds that of all state-of-the-art +models. CrackSAM exhibits remarkable superiority, particularly under +challenging conditions such as dim lighting, shadows, road markings, +construction joints, and other interference factors. These cross-scenario +results demonstrate the outstanding zero-shot capability of foundation models +and provide new ideas for developing vision models in civil engineering. + +
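As a reference point for the parameter-efficient fine-tuning mentioned above, a minimal low-rank adaptation (LoRA) wrapper around a frozen linear layer; ranks, sizes, and initialization are illustrative, not the CrackSAM configuration:

```python
# A frozen pretrained linear layer plus a trainable low-rank update B @ A.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=4, alpha=8):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                       # frozen pretrained weight
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(256, 256))
x = torch.randn(2, 256)
print(layer(x).shape)                                     # torch.Size([2, 256])
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print("trainable parameters:", trainable)                 # only the A/B factors
```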
+
+
+
+
+ + ♻ ☆ PaddingFlow: Improving Normalizing Flows with Padding-Dimensional Noise + + +
+ Normalizing flow is a generative modeling approach with efficient sampling.
+However, flow-based models suffer from two issues: 1) if the target
+distribution lies on a manifold, the mismatch between the dimensions of the
+latent target distribution and the data distribution can cause flow-based
+models to perform badly; 2) discrete data can make flow-based models collapse
+into a degenerate mixture of point masses. To sidestep these two issues, we
+propose PaddingFlow, a novel dequantization method, which improves normalizing
+flows with padding-dimensional noise. To implement PaddingFlow, only the
+dimension of normalizing flows needs to be modified. Thus, our method is easy
+to implement and computationally cheap. Moreover, the padding-dimensional noise
+is only added to the padding dimension, which means PaddingFlow can dequantize
+without changing data distributions. Existing dequantization methods, in
+contrast, need to change the data distribution, which might degrade
+performance. We validate our method on the main benchmarks of unconditional
+density estimation, including five tabular datasets and four image datasets for
+Variational Autoencoder (VAE) models, and on Inverse Kinematics (IK)
+experiments, which are conditional density estimation tasks. The results show
+that PaddingFlow performs better in all experiments in this paper, indicating
+that it is widely suitable for various tasks. The code is available at:
+https://github.com/AdamQLMeng/PaddingFlow.
+
+
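A minimal sketch of the padding-dimensional dequantization idea (toy data; the noise scale and the number of padded dimensions are arbitrary choices, not the paper's):

```python
# Pad each sample with extra noise dimensions instead of perturbing the data
# dimensions, so the original data distribution is left untouched.
import torch

def pad_with_noise(x, n_pad=2, sigma=0.1):
    """x: (batch, dim) data; returns (batch, dim + n_pad) flow input."""
    noise = sigma * torch.randn(x.shape[0], n_pad)
    return torch.cat([x, noise], dim=1)

x = torch.tensor([[0.0, 1.0], [2.0, 3.0]])       # e.g. discrete tabular samples
x_padded = pad_with_noise(x)
print(x_padded.shape)                             # torch.Size([2, 4])
# Only the padded columns are noisy; the first two columns equal x exactly.
print(torch.allclose(x_padded[:, :2], x))
```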
+
+
+
+
+ + ♻ ☆ Effective Decision Boundary Learning for Class Incremental Learning + + +
+ Rehearsal approaches in class incremental learning (CIL) suffer from decision
+boundary overfitting to new classes, which is mainly caused by two factors:
+insufficient old-class data for knowledge distillation and imbalanced data
+learning between the learned and new classes because of the limited storage
+memory. In this work, we present a simple but effective approach to tackle
+these two factors. First, we employ a re-sampling strategy with Mixup Knowledge
+Distillation (Re-MKD) to improve the performance of KD, which greatly
+alleviates the overfitting problem. Specifically, we combine mixup and
+re-sampling strategies to synthesize adequate data used in KD training that are
+more consistent with the latent distribution between the learned and new
+classes. Second, we propose a novel incremental influence balance (IIB) method
+for CIL to tackle the classification of imbalanced data by extending the
+influence balance method into the CIL setting, which re-weights samples by
+their influences to create a proper decision boundary. With these two
+improvements, we present the effective decision boundary learning algorithm
+(EDBL), which improves the performance of KD and deals with imbalanced data
+learning simultaneously. Experiments show that the proposed EDBL achieves
+state-of-the-art performance on several CIL benchmarks.
+
+
+
+
+
+
+ + ♻ ☆ Towards Effective Multi-Moving-Camera Tracking: A New Dataset and + Lightweight Link Model + + +
+ Ensuring driving safety for autonomous vehicles has become increasingly
+crucial, highlighting the need for systematic tracking of on-road pedestrians.
+Most vehicles are equipped with visual sensors; however, such large-scale
+visual data has not yet been well studied. Multi-target multi-camera (MTMC)
+tracking systems are composed of two modules: single-camera tracking (SCT) and
+inter-camera tracking (ICT). Reliably coordinating between these modules makes
+MTMC tracking a very complicated task, and tracking across multiple moving
+cameras makes it even more challenging. In this paper, we focus on multi-target
+multi-moving-camera (MTMMC) tracking, which is attracting increasing attention
+from the research community. Observing that there are few datasets for MTMMC
+tracking, we collect a new dataset, called Multi-Moving-Camera Track (MMCT),
+which contains sequences under various driving scenarios. To address the
+identity-switch problem commonly faced by existing SCT trackers, which is
+aggravated for moving cameras by the ego-motion between the camera and the
+targets, a lightweight appearance-free global link model, called Linker, is
+proposed; it mitigates identity switches by associating two disjoint tracklets
+of the same target into a complete trajectory within the same camera. When
+incorporated with Linker, existing SCT trackers generally obtain a significant
+improvement. Moreover, to alleviate the impact of the image style variations
+caused by different cameras, a color transfer module is incorporated to extract
+cross-camera consistent appearance features for pedestrian association across
+moving cameras for ICT, resulting in a much-improved MTMMC tracking system and
+a further step towards coordinated mining of multiple moving cameras. The
+project page is available at https://dhu-mmct.github.io/.
+
+
+
+
+
+
+ + ♻ ☆ FG-MDM: Towards Zero-Shot Human Motion Generation via Fine-Grained + Descriptions + + +
+ Recently, significant progress has been made in text-based motion generation, +enabling the generation of diverse and high-quality human motions that conform +to textual descriptions. However, generating motions beyond the distribution of +original datasets remains challenging, i.e., zero-shot generation. By adopting +a divide-and-conquer strategy, we propose a new framework named Fine-Grained +Human Motion Diffusion Model (FG-MDM) for zero-shot human motion generation. +Specifically, we first parse previous vague textual annotations into +fine-grained descriptions of different body parts by leveraging a large +language model. We then use these fine-grained descriptions to guide a +transformer-based diffusion model, which further adopts a design of part +tokens. FG-MDM can generate human motions beyond the scope of original datasets +owing to descriptions that are closer to motion essence. Our experimental +results demonstrate the superiority of FG-MDM over previous methods in +zero-shot settings. We will release our fine-grained textual annotations for +HumanML3D and KIT. + +
+
+ comment: Project Page: https://sx0207.github.io/fg-mdm/ +
+
+
+
+
+ + ♻ ☆ Remembering Transformer for Continual Learning + + +
+ Neural networks encounter the challenge of Catastrophic Forgetting (CF) in +continual learning, where new task knowledge interferes with previously learned +knowledge. We propose Remembering Transformer, inspired by the brain's +Complementary Learning Systems (CLS), to tackle this issue. Remembering +Transformer employs a mixture-of-adapters and a generative model-based routing +mechanism to alleviate CF by dynamically routing task data to relevant +adapters. Our approach demonstrated a new SOTA performance in various vision +continual learning tasks and great parameter efficiency. + +
+
+
+
+
+ + ♻ ☆ X-Adapter: Adding Universal Compatibility of Plugins for Upgraded + Diffusion Model + + +
+ We introduce X-Adapter, a universal upgrader to enable the pretrained +plug-and-play modules (e.g., ControlNet, LoRA) to work directly with the +upgraded text-to-image diffusion model (e.g., SDXL) without further retraining. +We achieve this goal by training an additional network to control the frozen +upgraded model with the new text-image data pairs. In detail, X-Adapter keeps a +frozen copy of the old model to preserve the connectors of different plugins. +Additionally, X-Adapter adds trainable mapping layers that bridge the decoders +from models of different versions for feature remapping. The remapped features +will be used as guidance for the upgraded model. To enhance the guidance +ability of X-Adapter, we employ a null-text training strategy for the upgraded +model. After training, we also introduce a two-stage denoising strategy to +align the initial latents of X-Adapter and the upgraded model. Thanks to our +strategies, X-Adapter demonstrates universal compatibility with various plugins +and also enables plugins of different versions to work together, thereby +expanding the functionalities of diffusion community. To verify the +effectiveness of the proposed method, we conduct extensive experiments and the +results show that X-Adapter may facilitate wider application in the upgraded +foundational diffusion model. + +
+
+ comment: Project page: https://showlab.github.io/X-Adapter/ +
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/atonderski/neuro-ncap + +
+
+
+
+
+ + ♻ ☆ LASER: Tuning-Free LLM-Driven Attention Control for Efficient + Text-conditioned Image-to-Animation + + +
+ Revolutionary advancements in text-to-image models have unlocked new +dimensions for sophisticated content creation, e.g., text-conditioned image +editing, allowing us to edit the diverse images that convey highly complex +visual concepts according to the textual guidance. Despite being promising, +existing methods focus on texture- or non-rigid-based visual manipulation, +which struggles to produce the fine-grained animation of smooth +text-conditioned image morphing without fine-tuning, i.e., due to their highly +unstructured latent space. In this paper, we introduce a tuning-free LLM-driven +attention control framework, encapsulated by the progressive process of LLM +planning, prompt-Aware editing, StablE animation geneRation, abbreviated as +LASER. LASER employs a large language model (LLM) to refine coarse descriptions +into detailed prompts, guiding pre-trained text-to-image models for subsequent +image generation. We manipulate the model's spatial features and self-attention +mechanisms to maintain animation integrity and enable seamless morphing +directly from text prompts, eliminating the need for additional fine-tuning or +annotations. Our meticulous control over spatial features and self-attention +ensures structural consistency in the images. This paper presents a novel +framework integrating LLMs with text-to-image models to create high-quality +animations from a single text input. We also propose a Text-conditioned +Image-to-Animation Benchmark to validate the effectiveness and efficacy of +LASER. Extensive experiments demonstrate that LASER produces impressive, +consistent, and efficient results in animation generation, positioning it as a +powerful tool for advanced digital content creation. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have +markedly enhanced the explanations of black box classifier models. These +methods operate either through post-analysis or by integrating concept learning +during model training. Although being effective in bridging the semantic gap +between a model's latent space and human interpretation, these explanation +methods only partially reveal the model's decision-making process. The outcome +is typically limited to high-level semantics derived from the last feature map. +We argue that the explanations lacking insights into the decision processes at +low and mid-level features are neither fully faithful nor useful. Addressing +this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet), +an inherently interpretable model. MCPNet autonomously learns meaningful +concept prototypes across multiple feature map levels using Centered Kernel +Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so +without reliance on predefined concept labels. Further, we propose a novel +classifier paradigm that learns and aligns multi-level concept prototype +distributions for classification purposes via Class-aware Concept Distribution +(CCD) loss. Our experiments reveal that our proposed MCPNet while being +adaptable to various model architectures, offers comprehensive multi-level +explanations while maintaining classification accuracy. Additionally, its +concept distribution-based classification approach shows improved +generalization capabilities in few-shot classification scenarios. + +
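For readers unfamiliar with the CKA term in this abstract, a minimal implementation of standard linear Centered Kernel Alignment; this is the textbook formula, not MCPNet's full CKA-based loss:

```python
# Linear CKA between two feature matrices over the same batch of samples.
import torch

def linear_cka(X, Y):
    """X: (n, d1), Y: (n, d2) feature matrices for the same n samples."""
    X = X - X.mean(dim=0, keepdim=True)
    Y = Y - Y.mean(dim=0, keepdim=True)
    cross = (Y.T @ X).norm() ** 2
    return cross / (torch.norm(X.T @ X) * torch.norm(Y.T @ Y))

feats_low = torch.randn(128, 64)         # e.g. pooled low-level feature map
feats_high = torch.randn(128, 256)       # e.g. pooled high-level feature map
print(float(linear_cka(feats_low, feats_high)))        # near 0 for random features
print(float(linear_cka(feats_low, 2.0 * feats_low)))   # 1.0 for identical features up to scale
```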
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Deep learning (DL) has become one of the mainstream and effective methods for +point cloud analysis tasks such as detection, segmentation and classification. +To reduce overfitting during training DL models and improve model performance +especially when the amount and/or diversity of training data are limited, +augmentation is often crucial. Although various point cloud data augmentation +methods have been widely used in different point cloud processing tasks, there +are currently no published systematic surveys or reviews of these methods. +Therefore, this article surveys these methods, categorizing them into a +taxonomy framework that comprises basic and specialized point cloud data +augmentation methods. Through a comprehensive evaluation of these augmentation +methods, this article identifies their potentials and limitations, serving as a +useful reference for choosing appropriate augmentation methods. In addition, +potential directions for future research are recommended. This survey +contributes to providing a holistic overview of the current state of point +cloud data augmentation, promoting its wider application and development. + +
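A small sketch of the "basic" augmentation family such surveys cover (rotation, scaling, jitter); the parameter ranges are typical defaults, not recommendations from the survey:

```python
# Random rotation about the up axis, global scaling, and per-point jitter.
import numpy as np

def augment(points, rng):
    """points: (N, 3) xyz array; returns an augmented copy."""
    theta = rng.uniform(0, 2 * np.pi)                  # rotation about z
    c, s = np.cos(theta), np.sin(theta)
    rot = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
    out = points @ rot.T
    out *= rng.uniform(0.9, 1.1)                       # global scaling
    out += rng.normal(scale=0.01, size=out.shape)      # per-point jitter
    return out

rng = np.random.default_rng(0)
cloud = rng.random((1024, 3))
print(augment(cloud, rng).shape)                       # (1024, 3)
```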
+
+ comment: Accepted by Pattern Recognition +
+
+
+
+
+ + ♻ ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of Optical Character Recognition (OCR) and +the expansion of application fields, text recognition in complex scenes has +become a key challenge. Factors such as multiple fonts, mixed scenes and +complex layouts seriously affect the recognition accuracy of traditional OCR +models. Although OCR models based on deep learning have performed well in +specific fields or similar datasets in recent years, the generalization ability +and robustness of the model are still a big challenge when facing complex +environments with multiple scenes. Furthermore, training an OCR model from +scratch or fine-tuning all parameters is very demanding on computing resources +and inference time, which limits the flexibility of its application. This study +focuses on a fundamental aspect of mixed text recognition in response to the +challenges mentioned above, which involves effectively fine-tuning the +pre-trained basic OCR model to demonstrate exceptional performance across +various downstream tasks. To this end, we propose a parameter-efficient mixed +text recognition method based on pre-trained OCR Transformer, namely +DLoRA-TrOCR. This method embeds DoRA into the image encoder and LoRA into the +internal structure of the text decoder, enabling efficient parameter +fine-tuning for downstream tasks. Experiments show that compared to similar +parameter adjustment methods, our model DLoRA-TrOCR has the smallest number of +parameters and performs better. It can achieve state-of-the-art performance on +complex scene datasets involving simultaneous recognition of mixed handwritten, +printed and street view texts. + +
+
+
+
+
+ + ♻ ☆ Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object + Detection CVPR2024 + + +
+ We delve into pseudo-labeling for semi-supervised monocular 3D object +detection (SSM3OD) and discover two primary issues: a misalignment between the +prediction quality of 3D and 2D attributes and the tendency of depth +supervision derived from pseudo-labels to be noisy, leading to significant +optimization conflicts with other reliable forms of supervision. We introduce a +novel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach +features a Decoupled Pseudo-label Generation (DPG) module, designed to +efficiently generate pseudo-labels by separately processing 2D and 3D +attributes. This module incorporates a unique homography-based method for +identifying dependable pseudo-labels in BEV space, specifically for 3D +attributes. Additionally, we present a DepthGradient Projection (DGP) module to +mitigate optimization conflicts caused by noisy depth supervision of +pseudo-labels, effectively decoupling the depth gradient and removing +conflicting gradients. This dual decoupling strategy-at both the pseudo-label +generation and gradient levels-significantly improves the utilization of +pseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark +demonstrate the superiority of our method over existing approaches. + +
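The gradient-decoupling idea can be illustrated with a standard conflicting-gradient projection; the exact DGP rule is the paper's, so treat this as a generic sketch:

```python
# When the gradient from noisy pseudo-label depth supervision points against
# the gradient of the reliable losses, drop its conflicting component.
import torch

def project_conflicting(g_depth, g_reliable, eps=1e-12):
    dot = torch.dot(g_depth, g_reliable)
    if dot < 0:                                         # conflict: remove the opposing part
        g_depth = g_depth - dot / (g_reliable.norm() ** 2 + eps) * g_reliable
    return g_depth

g_reliable = torch.tensor([1.0, 0.0])
g_depth = torch.tensor([-1.0, 1.0])                     # partially opposes g_reliable
print(project_conflicting(g_depth, g_reliable))         # tensor([0., 1.])
```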
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Fine-Grained Information and Noise Decoupling for Remote + Sensing Change Detection + + +
+ Change detection aims to identify remote sensing object changes by analyzing
+data between bitemporal image pairs. Due to the large temporal and spatial span
+of data collection in change detection image pairs, there is often a
+significant amount of task-specific and task-agnostic noise. Previous efforts
+have focused excessively on denoising, at the cost of losing a great deal of
+fine-grained information. In this paper, we revisit the importance of
+fine-grained features in change detection and propose a series of operations
+for fine-grained information compensation and noise decoupling (FINO). First,
+the context is utilized to compensate for the fine-grained information in the
+feature space. Next, a shape-aware and a brightness-aware module are designed
+to improve the capacity for representation learning. The shape-aware module
+guides the backbone towards more precise shape estimation, helping the backbone
+network extract object shape features. The brightness-aware module learns an
+overall brightness estimation to improve the model's robustness to
+task-agnostic noise. Finally, a task-specific noise decoupling structure is
+designed to improve the model's ability to separate noise interference from
+feature similarity. With these training schemes, our proposed method achieves
+new state-of-the-art (SOTA) results on multiple change detection benchmarks.
+The code will be made available.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Representations through Heterogeneous Self-Supervised Learning + + +
+ Incorporating heterogeneous representations from different architectures has +facilitated various vision tasks, e.g., some hybrid networks combine +transformers and convolutions. However, complementarity between such +heterogeneous architectures has not been well exploited in self-supervised +learning. Thus, we propose Heterogeneous Self-Supervised Learning (HSSL), which +enforces a base model to learn from an auxiliary head whose architecture is +heterogeneous from the base model. In this process, HSSL endows the base model +with new characteristics in a representation learning way without structural +changes. To comprehensively understand the HSSL, we conduct experiments on +various heterogeneous pairs containing a base model and an auxiliary head. We +discover that the representation quality of the base model moves up as their +architecture discrepancy grows. This observation motivates us to propose a +search strategy that quickly determines the most suitable auxiliary head for a +specific base model to learn and several simple but effective methods to +enlarge the model discrepancy. The HSSL is compatible with various +self-supervised methods, achieving superior performances on various downstream +tasks, including image classification, semantic segmentation, instance +segmentation, and object detection. Our source code will be made publicly +available. + +
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing +methods require training models separately for each dataset, which leads to +poor domain generalization. Moreover, these methods rely heavily on large +amounts of high-quality pair-labelled data for training, which is expensive and +impractical. In this paper, we propose a multimodal contrastive learning +(ChangeCLIP) based on visual-language pre-training for change detection domain +generalization. Additionally, we propose a dynamic context optimization for +prompt learning. Meanwhile, to address the data dependency issue of existing +methods, we introduce a single-temporal and controllable AI-generated training +strategy (SAIN). This allows us to train the model using a large number of +single-temporal images without image pairs in the real world, achieving +excellent generalization. Extensive experiments on series of real change +detection datasets validate the superiority and strong generalization of +ChangeCLIP, outperforming state-of-the-art change detection methods. Code will +be available. + +
+
+
+
+
+ + ♻ ☆ Latent-based Diffusion Model for Long-tailed Recognition CVPR2024 + + +
+ Long-tailed imbalance distribution is a common issue in practical computer +vision applications. Previous works proposed methods to address this problem, +which can be categorized into several classes: re-sampling, re-weighting, +transfer learning, and feature augmentation. In recent years, diffusion models +have shown an impressive generation ability in many sub-problems of deep +computer vision. However, its powerful generation has not been explored in +long-tailed problems. We propose a new approach, the Latent-based Diffusion +Model for Long-tailed Recognition (LDMLR), as a feature augmentation method to +tackle the issue. First, we encode the imbalanced dataset into features using +the baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM) +using these encoded features to generate pseudo-features. Finally, we train the +classifier using the encoded and pseudo-features from the previous two steps. +The model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT +datasets by using the proposed method. + +
+
+ comment: 8 pages, 3 figures. Accepted by L3DIVU-CVPR2024 +
+
+
+
+
+ + ♻ ☆ PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for + RGB-D Cameras with Limited Co-visibility + + +
+ RGB-D cameras are crucial in robotic perception, given their ability to +produce images augmented with depth data. However, their limited FOV often +requires multiple cameras to cover a broader area. In multi-camera RGB-D +setups, the goal is typically to reduce camera overlap, optimizing spatial +coverage with as few cameras as possible. The extrinsic calibration of these +systems introduces additional complexities. Existing methods for extrinsic +calibration either necessitate specific tools or highly depend on the accuracy +of camera motion estimation. To address these issues, we present PeLiCal, a +novel line-based calibration approach for RGB-D camera systems exhibiting +limited overlap. Our method leverages long line features from surroundings, and +filters out outliers with a novel convergence voting algorithm, achieving +targetless, real-time, and outlier-robust performance compared to existing +methods. We open source our implementation on +https://github.com/joomeok/PeLiCal.git. + +
+
+
+
+
+ + ♻ ☆ Point Clouds Are Specialized Images: A Knowledge Transfer Approach for + 3D Understanding + + +
+ Self-supervised representation learning (SSRL) has gained increasing +attention in point cloud understanding, in addressing the challenges posed by +3D data scarcity and high annotation costs. This paper presents PCExpert, a +novel SSRL approach that reinterprets point clouds as "specialized images". +This conceptual shift allows PCExpert to leverage knowledge derived from +large-scale image modality in a more direct and deeper manner, via extensively +sharing the parameters with a pre-trained image encoder in a multi-way +Transformer architecture. The parameter sharing strategy, combined with a novel +pretext task for pre-training, i.e., transformation estimation, empowers +PCExpert to outperform the state of the arts in a variety of tasks, with a +remarkable reduction in the number of trainable parameters. Notably, PCExpert's +performance under LINEAR fine-tuning (e.g., yielding a 90.02% overall accuracy +on ScanObjectNN) has already approached the results obtained with FULL model +fine-tuning (92.66%), demonstrating its effective and robust representation +capability. + +
+
+
+
+
+ + ♻ ☆ TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer + + +
+ Estimating the 6D object pose is an essential task in many applications. Due
+to the lack of depth information, existing RGB-based methods are sensitive to
+occlusion and illumination changes. Extracting and utilizing the geometric
+features in depth information is crucial for achieving accurate predictions. To
+this end, we propose TransPose, a novel 6D pose framework that exploits a
+Transformer encoder with a geometry-aware module to learn better point cloud
+feature representations. Specifically, we first uniformly sample the point
+cloud and extract local geometric features with a designed local feature
+extractor based on a graph convolution network. To improve robustness to
+occlusion, we adopt a Transformer to exchange global information, so that each
+local feature contains global information. Finally, we introduce a
+geometry-aware module into the Transformer encoder, which forms an effective
+constraint for point cloud feature learning and makes the global information
+exchange more tightly coupled with point cloud tasks. Extensive experiments
+indicate the effectiveness of TransPose; our pose estimation pipeline achieves
+competitive results on three benchmark datasets.
+
+
+
+ comment: Accepted by NEUROCOMPUTING +
+
+
+
+
+ + ♻ ☆ FlowVQTalker: High-Quality Emotional Talking Face Generation through + Normalizing Flow and Quantization + + +
+ Generating emotional talking faces is a practical yet challenging endeavor. +To create a lifelike avatar, we draw upon two critical insights from a human +perspective: 1) The connection between audio and the non-deterministic facial +dynamics, encompassing expressions, blinks, poses, should exhibit synchronous +and one-to-many mapping. 2) Vibrant expressions are often accompanied by +emotion-aware high-definition (HD) textures and finely detailed teeth. However, +both aspects are frequently overlooked by existing methods. To this end, this +paper proposes using normalizing Flow and Vector-Quantization modeling to +produce emotional talking faces that satisfy both insights concurrently +(FlowVQTalker). Specifically, we develop a flow-based coefficient generator +that encodes the dynamics of facial emotion into a multi-emotion-class latent +space represented as a mixture distribution. The generation process commences +with random sampling from the modeled distribution, guided by the accompanying +audio, enabling both lip-synchronization and the uncertain nonverbal facial +cues generation. Furthermore, our designed vector-quantization image generator +treats the creation of expressive facial images as a code query task, utilizing +a learned codebook to provide rich, high-quality textures that enhance the +emotional perception of the results. Extensive experiments are conducted to +showcase the effectiveness of our approach. + +
+
+ comment: 11 pages, 11 figures, conference +
+
+
+
+
+ + ♻ ☆ Fixation-based Self-calibration for Eye Tracking in VR Headsets + + +
+ This study proposes a novel self-calibration method for eye tracking in a +virtual reality (VR) headset. The proposed method is based on the assumptions +that the user's viewpoint can freely move and that the points of regard (PoRs) +from different viewpoints are distributed within a small area on an object +surface during visual fixation. In the method, fixations are first detected +from the time-series data of uncalibrated gaze directions using an extension of +the I-VDT (velocity and dispersion threshold identification) algorithm to a +three-dimensional (3D) scene. Then, the calibration parameters are optimized by +minimizing the sum of a dispersion metrics of the PoRs. The proposed method can +potentially identify the optimal calibration parameters representing the +user-dependent offset from the optical axis to the visual axis without explicit +user calibration, image processing, or marker-substitute objects. For the gaze +data of 18 participants walking in two VR environments with many occlusions, +the proposed method achieved an accuracy of 2.1$^\circ$, which was +significantly lower than the average offset. Our method is the first +self-calibration method with an average error lower than 3$^\circ$ in 3D +environments. Further, the accuracy of the proposed method can be improved by +up to 1.2$^\circ$ by refining the fixation detection or optimization algorithm. + +
+
+
+
+
+ + ♻ ☆ Feature Imitating Networks Enhance The Performance, Reliability And + Speed Of Deep Learning On Biomedical Image Processing Tasks + + +
+ Feature-Imitating-Networks (FINs) are neural networks that are first trained +to approximate closed-form statistical features (e.g. Entropy), and then +embedded into other networks to enhance their performance. In this work, we +perform the first evaluation of FINs for biomedical image processing tasks. We +begin by training a set of FINs to imitate six common radiomics features, and +then compare the performance of larger networks (with and without embedding the +FINs) for three experimental tasks: COVID-19 detection from CT scans, brain +tumor classification from MRI scans, and brain-tumor segmentation from MRI +scans. We found that models embedded with FINs provided enhanced performance +for all three tasks when compared to baseline networks without FINs, even when +those baseline networks had more parameters. Additionally, we found that models +embedded with FINs converged faster and more consistently compared to baseline +networks with similar or greater representational capacity. The results of our +experiments provide evidence that FINs may offer state-of-the-art performance +for a variety of other biomedical image processing tasks. + +
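A toy example of the FIN idea, training a small MLP to imitate a closed-form feature (here the Shannon entropy of a probability histogram); the architecture and training budget are arbitrary, and the radiomics features used in the paper are not reproduced:

```python
# Train an MLP to reproduce Shannon entropy, then it could be embedded as a
# frozen or fine-tunable module inside a larger network.
import torch
import torch.nn as nn

def shannon_entropy(p, eps=1e-12):
    return -(p * (p + eps).log()).sum(dim=1, keepdim=True)

fin = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 64),
                    nn.ReLU(), nn.Linear(64, 1))
opt = torch.optim.Adam(fin.parameters(), lr=1e-3)

for step in range(2000):
    p = torch.softmax(torch.randn(256, 16) * 3, dim=1)    # random histograms
    loss = nn.functional.mse_loss(fin(p), shannon_entropy(p))
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():
    p = torch.softmax(torch.randn(4, 16), dim=1)
    print(torch.cat([fin(p), shannon_entropy(p)], dim=1))  # imitated vs. exact entropy
```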
+
+
+
+
+ + ♻ ☆ Leveraging Systematic Knowledge of 2D Transformations + + +
+ The existing deep learning models suffer from out-of-distribution (o.o.d.) +performance drop in computer vision tasks. In comparison, humans have a +remarkable ability to interpret images, even if the scenes in the images are +rare, thanks to the systematicity of acquired knowledge. This work focuses on +1) the acquisition of systematic knowledge of 2D transformations, and 2) +architectural components that can leverage the learned knowledge in image +classification tasks in an o.o.d. setting. With a new training methodology +based on synthetic datasets that are constructed under the causal framework, +the deep neural networks acquire knowledge from semantically different domains +(e.g. even from noise), and exhibit certain level of systematicity in parameter +estimation experiments. Based on this, a novel architecture is devised +consisting of a classifier, an estimator and an identifier (abbreviated as +"CED"). By emulating the "hypothesis-verification" process in human visual +perception, CED improves the classification accuracy significantly on test sets +under covariate shift. + +
+
+
+
+
+ + ♻ ☆ A Dataset and Model for Realistic License Plate Deblurring IJCAI 2024 + + +
+ Vehicle license plate recognition is a crucial task in intelligent traffic +management systems. However, the challenge of achieving accurate recognition +persists due to motion blur from fast-moving vehicles. Despite the widespread +use of image synthesis approaches in existing deblurring and recognition +algorithms, their effectiveness in real-world scenarios remains unproven. To +address this, we introduce the first large-scale license plate deblurring +dataset named License Plate Blur (LPBlur), captured by a dual-camera system and +processed through a post-processing pipeline to avoid misalignment issues. +Then, we propose a License Plate Deblurring Generative Adversarial Network +(LPDGAN) to tackle the license plate deblurring: 1) a Feature Fusion Module to +integrate multi-scale latent codes; 2) a Text Reconstruction Module to restore +structure through textual modality; 3) a Partition Discriminator Module to +enhance the model's perception of details in each letter. Extensive experiments +validate the reliability of the LPBlur dataset for both model training and +testing, showcasing that our proposed model outperforms other state-of-the-art +motion deblurring methods in realistic license plate deblurring scenarios. The +dataset and code are available at https://github.com/haoyGONG/LPDGAN. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Improved Cryo-EM Pose Estimation and 3D Classification through + Latent-Space Disentanglement + + +
+ Due to the extremely low signal-to-noise ratio (SNR) and unknown poses +(projection angles and image shifts) in cryo-electron microscopy (cryo-EM) +experiments, reconstructing 3D volumes from 2D images is very challenging. In +addition to these challenges, heterogeneous cryo-EM reconstruction requires +conformational classification. In popular cryo-EM reconstruction algorithms, +poses and conformation classification labels must be predicted for every input +cryo-EM image, which can be computationally costly for large datasets. An +emerging class of methods adopted the amortized inference approach. In these +methods, only a subset of the input dataset is needed to train neural networks +for the estimation of poses and conformations. Once trained, these neural +networks can make pose/conformation predictions and 3D reconstructions at low +cost for the entire dataset during inference. Unfortunately, when facing +heterogeneous reconstruction tasks, it is hard for current +amortized-inference-based methods to effectively estimate the conformational +distribution and poses from entangled latent variables. Here, we propose a +self-supervised variational autoencoder architecture called "HetACUMN" based on +amortized inference. We employed an auxiliary conditional pose prediction task +by inverting the order of encoder-decoder to explicitly enforce the +disentanglement of conformation and pose predictions. Results on simulated +datasets show that HetACUMN generated more accurate conformational +classifications than other amortized or non-amortized methods. Furthermore, we +show that HetACUMN is capable of performing heterogeneous 3D reconstructions of +a real experimental dataset. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Accelerating Image Generation with Sub-path Linear Approximation Model + + +
+ Diffusion models have significantly advanced the state of the art in image, +audio, and video generation tasks. However, their applications in practical +scenarios are hindered by slow inference speed. Drawing inspiration from the +approximation strategies utilized in consistency models, we propose the +Sub-path Linear Approximation Model (SLAM), which accelerates diffusion models +while maintaining high-quality image generation. SLAM treats the PF-ODE +trajectory as a series of PF-ODE sub-paths divided by sampled points, and +harnesses sub-path linear (SL) ODEs to form a progressive and continuous error +estimation along each individual PF-ODE sub-path. The optimization on such +SL-ODEs allows SLAM to construct denoising mappings with smaller cumulative +approximated errors. An efficient distillation method is also developed to +facilitate the incorporation of more advanced diffusion models, such as latent +diffusion models. Our extensive experimental results demonstrate that SLAM +achieves an efficient training regimen, requiring only 6 A100 GPU days to +produce a high-quality generative model capable of 2 to 4-step generation with +high performance. Comprehensive evaluations on LAION, MS COCO 2014, and MS COCO +2017 datasets also illustrate that SLAM surpasses existing acceleration methods +in few-step generation tasks, achieving state-of-the-art performance both on +FID and the quality of the generated images. + +
+
+
+
+
+ + ♻ ☆ Learning to Recover Spectral Reflectance from RGB Images + + +
+ This paper tackles spectral reflectance recovery (SRR) from RGB images. Since +capturing ground-truth spectral reflectance and camera spectral sensitivity is +challenging and costly, most existing approaches are trained on synthetic +images and utilize the same parameters for all unseen testing images, which is +suboptimal especially when the trained models are tested on real images because +they never exploit the internal information of the testing images. To address +this issue, we adopt a self-supervised meta-auxiliary learning (MAXL) strategy +that fine-tunes the well-trained network parameters with each testing image to +combine external with internal information. To the best of our knowledge, this +is the first work that successfully adapts the MAXL strategy to this problem. +Instead of relying on naive end-to-end training, we also propose a novel +architecture that integrates the physical relationship between the spectral +reflectance and the corresponding RGB images into the network based on our +mathematical analysis. Besides, since the spectral reflectance of a scene is +independent of its illumination while the corresponding RGB images are not, we +recover the spectral reflectance of a scene from its RGB images captured under +multiple illuminations to further reduce the unknowns. Qualitative and +quantitative evaluations demonstrate the effectiveness of our proposed network +and of the MAXL. Our code and data are available at +https://github.com/Dong-Huo/SRR-MAXL. + 
+
+ comment: IEEE Transactions on Image Processing (TIP), 2024 +
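+ A rough sketch of the test-time adaptation step described above (an assumption
+ about the mechanism, not the released SRR-MAXL code): copy the externally
+ trained network, take a few gradient steps on a self-supervised loss computed
+ from the test image alone -- here, re-projecting the predicted reflectance back
+ to RGB through an assumed known camera response matrix -- and then predict.
+
+ import copy
+ import torch
+
+ def test_time_adapt(model, rgb, camera_matrix, steps=5, lr=1e-4):
+     """Fine-tune a copy of `model` on one test image with an RGB
+     reconstruction loss, then predict its spectral reflectance."""
+     adapted = copy.deepcopy(model)          # keep the pretrained weights intact
+     opt = torch.optim.Adam(adapted.parameters(), lr=lr)
+     for _ in range(steps):
+         reflectance = adapted(rgb)          # (B, 31, H, W) spectral estimate
+         # Re-render RGB through a (3 x 31) camera response matrix.
+         rgb_hat = torch.einsum('cs,bshw->bchw', camera_matrix, reflectance)
+         loss = torch.nn.functional.l1_loss(rgb_hat, rgb)
+         opt.zero_grad(); loss.backward(); opt.step()
+     with torch.no_grad():
+         return adapted(rgb)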
+
+
+
+
+ + ♻ ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation CVPR 2024 + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. Our project page is at +https://adi-t2i.github.io/ADI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient-Regularized Out-of-Distribution Detection + + +
+ One of the challenges for neural networks in real-life applications is the +overconfident errors these models make when the data is not from the original +training distribution. + Addressing this issue is known as Out-of-Distribution (OOD) detection. + Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate +for OOD data during training to achieve improved performance. + However, these methods fail to fully exploit the local information embedded +in the auxiliary dataset. + In this work, we propose the idea of leveraging the information embedded in +the gradient of the loss function during training to enable the network to not +only learn a desired OOD score for each sample but also to exhibit similar +behavior in a local neighborhood around each sample. + We also develop a novel energy-based sampling method to allow the network to +be exposed to more informative OOD samples during the training phase. This is +especially important when the auxiliary dataset is large. We demonstrate the +effectiveness of our method through extensive experiments on several OOD +benchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet +experiment. + We further provide a theoretical analysis through the lens of certified +robustness and Lipschitz analysis to showcase the theoretical foundation of our +work. We will publicly release our code after the review process. + +
+
+ comment: Under review +
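+ The core idea -- penalizing how fast the OOD score changes in a neighborhood of
+ auxiliary outlier samples -- can be sketched as below. The energy score and the
+ weighting are assumptions for illustration, not the paper's exact objective.
+
+ import torch
+ import torch.nn.functional as F
+
+ def energy_score(logits, T=1.0):
+     # Negative free energy, a commonly used OOD score.
+     return -T * torch.logsumexp(logits / T, dim=1)
+
+ def gradient_regularized_loss(model, x_in, y_in, x_out, lam=0.1):
+     """Cross-entropy on in-distribution data plus a penalty on the
+     input-gradient of the OOD score at auxiliary outliers, encouraging the
+     score to behave consistently around each sample."""
+     loss_cls = F.cross_entropy(model(x_in), y_in)
+     x_out = x_out.clone().requires_grad_(True)
+     score = energy_score(model(x_out)).mean()
+     grad = torch.autograd.grad(score, x_out, create_graph=True)[0]
+     loss_grad = grad.flatten(1).norm(dim=1).mean()
+     return loss_cls + lam * loss_grad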
+
+
+
+
+ + ♻ ☆ Rapid-INR: Storage Efficient CPU-free DNN Training Using Implicit Neural + Representation + + +
+ Implicit Neural Representation (INR) is an innovative approach for +representing complex shapes or objects without explicitly defining their +geometry or surface structure. Instead, INR represents objects as continuous +functions. Previous research has demonstrated the effectiveness of using neural +networks as INR for image compression, showcasing comparable performance to +traditional methods such as JPEG. However, INR holds potential for various +applications beyond image compression. This paper introduces Rapid-INR, a novel +approach that utilizes INR for encoding and compressing images, thereby +accelerating neural network training in computer vision tasks. Our methodology +involves storing the whole dataset directly in INR format on a GPU, mitigating +the significant data communication overhead between the CPU and GPU during +training. Additionally, the decoding process from INR to RGB format is highly +parallelized and executed on-the-fly. To further enhance compression, we +propose iterative and dynamic pruning, as well as layer-wise quantization, +building upon previous work. We evaluate our framework on the image +classification task, utilizing the ResNet-18 backbone network and three +commonly used datasets with varying image sizes. Rapid-INR reduces memory +consumption to only about 5% of the original dataset size in RGB format and +achieves a maximum 6$\times$ speedup over the PyTorch training pipeline, as +well as a maximum 1.2x speedup over the DALI training pipeline, with only a +marginal decrease in accuracy. Importantly, Rapid-INR can be readily applied to +other computer vision tasks and backbone networks with reasonable engineering +efforts. Our implementation code is publicly available at +https://github.com/sharc-lab/Rapid-INR. + +
+
+ comment: Accepted by ICCAD 2023 +
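+ A toy sketch of the decode-on-the-fly idea (illustrative only; the SIREN-style
+ network, image size, and omission of pruning/quantization are assumptions):
+ each image is stored as the weights of a small coordinate MLP and rendered to
+ an RGB tensor right before it enters the training pipeline.
+
+ import torch
+ import torch.nn as nn
+
+ class SirenINR(nn.Module):
+     """Tiny coordinate MLP mapping (x, y) in [-1, 1]^2 to RGB."""
+     def __init__(self, hidden=64, layers=3, w0=30.0):
+         super().__init__()
+         dims = [2] + [hidden] * layers + [3]
+         self.linears = nn.ModuleList(nn.Linear(a, b) for a, b in zip(dims[:-1], dims[1:]))
+         self.w0 = w0
+
+     def forward(self, coords):
+         x = coords
+         for lin in self.linears[:-1]:
+             x = torch.sin(self.w0 * lin(x))
+         return torch.sigmoid(self.linears[-1](x))
+
+ def decode_batch(inrs, size=32, device='cpu'):
+     """Render a batch of stored INRs to (B, 3, H, W) tensors; in practice the
+     INR weights live on the GPU so no CPU-GPU transfer is needed."""
+     ys, xs = torch.meshgrid(torch.linspace(-1, 1, size, device=device),
+                             torch.linspace(-1, 1, size, device=device), indexing='ij')
+     coords = torch.stack([xs, ys], dim=-1).reshape(-1, 2)
+     imgs = [inr(coords).reshape(size, size, 3).permute(2, 0, 1) for inr in inrs]
+     return torch.stack(imgs)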
+
+
+
+
+ + ♻ ☆ SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation CVPR + + +
+ The adoption of Vision Transformer (ViT)-based architectures represents a +significant advancement in 3D Medical Image (MI) segmentation, surpassing +traditional Convolutional Neural Network (CNN) models by enhancing global +contextual understanding. While this paradigm shift has significantly enhanced +3D segmentation performance, state-of-the-art methods require extremely large +and complex architectures with large-scale computing resources for training and +deployment. Furthermore, in the context of limited datasets, often encountered +in medical imaging, larger models can present hurdles in both model +generalization and convergence. In response to these challenges and to +demonstrate that lightweight models are a valuable area of research in 3D +medical imaging, we present SegFormer3D, a hierarchical Transformer that +calculates attention across multiscale volumetric features. Additionally, +SegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate +local and global attention features to produce highly accurate segmentation +masks. The proposed memory-efficient Transformer preserves the performance +characteristics of a significantly larger model in a compact design. +SegFormer3D democratizes deep learning for 3D medical image segmentation by +offering a model with 33x fewer parameters and a 13x reduction in GFLOPS +compared to the current state-of-the-art (SOTA). We benchmark SegFormer3D +against the current SOTA models on three widely used datasets, Synapse, BraTS, +and ACDC, achieving competitive results. Code: +https://github.com/OSUPCVLab/SegFormer3D.git + 
+
+ comment: Accepted at CVPR Workshop 2024 +
+
+
+
+
+ + ♻ ☆ Runtime Stealthy Perception Attacks against DNN-based Adaptive Cruise + Control Systems + + +
+ Adaptive Cruise Control (ACC) is a widely used driver assistance technology +for maintaining the desired speed and safe distance to the leading vehicle. +This paper evaluates the security of the deep neural network (DNN) based ACC +systems under runtime stealthy perception attacks that strategically inject +perturbations into camera data to cause forward collisions. We present a +context-aware strategy for the selection of the most critical times for +triggering the attacks and a novel optimization-based method for the adaptive +generation of image perturbations at runtime. We evaluate the effectiveness of +the proposed attack using an actual vehicle, a publicly available driving +dataset, and a realistic simulation platform with the control software from a +production ACC system, a physical-world driving simulator, and interventions by +the human driver and safety features such as Advanced Emergency Braking System +(AEBS). Experimental results show that the proposed attack achieves 142.9 times +higher success rate in causing hazards and 89.6% higher evasion rate than +baselines, while being stealthy and robust to real-world factors and dynamic +changes in the environment. This study highlights the role of human drivers and +basic safety mechanisms in preventing attacks. + +
+
+ comment: 19 pages, 23 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ AV-RIR: Audio-Visual Room Impulse Response Estimation CVPR 2024 + + +
+ Accurate estimation of Room Impulse Response (RIR), which captures an +environment's acoustic properties, is important for speech processing and AR/VR +applications. We propose AV-RIR, a novel multi-modal multi-task learning +approach to accurately estimate the RIR from a given reverberant speech signal +and the visual cues of its corresponding environment. AV-RIR builds on a novel +neural codec-based architecture that effectively captures environment geometry +and materials properties and solves speech dereverberation as an auxiliary task +by using multi-task learning. We also propose Geo-Mat features that augment +material information into visual cues and CRIP that improves late reverberation +components in the estimated RIR via image-to-RIR retrieval by 86%. Empirical +results show that AV-RIR quantitatively outperforms previous audio-only and +visual-only approaches by achieving 36% - 63% improvement across various +acoustic metrics in RIR estimation. Additionally, it also achieves higher +preference scores in human evaluation. As an auxiliary benefit, dereverbed +speech from AV-RIR shows competitive performance with the state-of-the-art in +various spoken language processing tasks and outperforms reverberation time +error score in the real-world AVSpeech dataset. Qualitative examples of both +synthesized reverberant speech and enhanced speech can be found at +https://www.youtube.com/watch?v=tTsKhviukAE. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ KDAS: Knowledge Distillation via Attention Supervision Framework for + Polyp Segmentation + + +
+ Polyp segmentation, a contentious issue in medical imaging, has seen numerous +proposed methods aimed at improving the quality of segmented masks. While +current state-of-the-art techniques yield impressive results, the size and +computational cost of these models create challenges for practical industry +applications. To address this challenge, we present KDAS, a Knowledge +Distillation framework that incorporates attention supervision, and our +proposed Symmetrical Guiding Module. This framework is designed to facilitate a +compact student model with fewer parameters, allowing it to learn the strengths +of the teacher model and mitigate the inconsistency between teacher features +and student features, a common challenge in Knowledge Distillation, via the +Symmetrical Guiding Module. Through extensive experiments, our compact models +demonstrate their strength by achieving competitive results with +state-of-the-art methods, offering a promising approach to creating compact +models with high accuracy for polyp segmentation and in the medical imaging +field. The implementation is available on https://github.com/huyquoctrinh/KDAS. + +
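+ The distillation-with-attention-supervision idea can be sketched as a combined
+ loss: soft-label distillation on the segmentation logits plus a match between
+ the teacher's and student's spatial attention maps. The exact losses and the
+ Symmetrical Guiding Module are not reproduced here; this is an assumption-laden
+ illustration.
+
+ import torch
+ import torch.nn.functional as F
+
+ def attention_map(feat):
+     """Spatial attention: channel-wise mean of squared activations, L2-normalized."""
+     att = feat.pow(2).mean(dim=1, keepdim=True)          # (B, 1, H, W)
+     return F.normalize(att.flatten(1), dim=1)
+
+ def kd_with_attention(student_logits, teacher_logits,
+                       student_feat, teacher_feat, T=4.0, alpha=0.5, beta=1.0):
+     # Soft-label distillation on the logits.
+     kd = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                   F.softmax(teacher_logits / T, dim=1),
+                   reduction='batchmean') * T * T
+     # Attention supervision: match the teacher's spatial attention pattern.
+     if student_feat.shape[-2:] != teacher_feat.shape[-2:]:
+         teacher_feat = F.interpolate(teacher_feat, size=student_feat.shape[-2:],
+                                      mode='bilinear', align_corners=False)
+     att = F.mse_loss(attention_map(student_feat), attention_map(teacher_feat))
+     return alpha * kd + beta * att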
+
+
+
+
+ + ♻ ☆ IDD-X: A Multi-View Dataset for Ego-relative Important Object + Localization and Explanation in Dense and Unstructured Traffic ICRA 2024 + + +
+ Intelligent vehicle systems require a deep understanding of the interplay +between road conditions, surrounding entities, and the ego vehicle's driving +behavior for safe and efficient navigation. This is particularly critical in +developing countries where traffic situations are often dense and unstructured +with heterogeneous road occupants. Existing datasets, predominantly geared +towards structured and sparse traffic scenarios, fall short of capturing the +complexity of driving in such environments. To fill this gap, we present IDD-X, +a large-scale dual-view driving video dataset. With 697K bounding boxes, 9K +important object tracks, and 1-12 objects per video, IDD-X offers comprehensive +ego-relative annotations for multiple important road objects covering 10 +categories and 19 explanation label categories. The dataset also incorporates +rearview information to provide a more complete representation of the driving +environment. We also introduce custom-designed deep networks aimed at multiple +important object localization and per-object explanation prediction. Overall, +our dataset and introduced prediction models form the foundation for studying +how road conditions and surrounding entities affect driving behavior in complex +traffic situations. + +
+
+ comment: Accepted at ICRA 2024; Project page: https://idd-x.github.io/ +
+
+
+
+
+ + ♻ ☆ PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained + Image Classification Accuracy for AIs and Humans + + +
+ Nearest neighbors (NN) are traditionally used to compute final decisions, +e.g., in Support Vector Machines or k-NN classifiers, and to provide users with +explanations for the model's decision. In this paper, we show a novel utility +of nearest neighbors: to improve predictions of a frozen, pretrained classifier +C. We leverage an image comparator S that (1) compares the input image with NN +images from the top-K most probable classes; and (2) uses S's output scores to +weight the confidence scores of C. Our method consistently improves +fine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120. +Also, a human study finds that showing lay users our probable-class nearest +neighbors (PCNN) improves their decision accuracy over prior work, which shows +only the top-1 class examples. + 
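+ The reweighting step can be illustrated in a few lines (the multiplicative
+ combination rule and score range below are assumptions about the general idea,
+ not the paper's exact formulation): the comparator's agreement score for each
+ of the top-K probable classes rescales the frozen classifier's confidence
+ before re-ranking.
+
+ import numpy as np
+
+ def reweight_with_comparator(probs, comparator_scores, top_k=10):
+     """probs: (C,) softmax output of the frozen classifier C.
+     comparator_scores: {class_idx: score in [0, 1]} from the image comparator S,
+     computed against nearest-neighbor images of the top-K classes."""
+     reweighted = probs.copy()
+     top_classes = np.argsort(probs)[::-1][:top_k]
+     for c in top_classes:
+         reweighted[c] = probs[c] * comparator_scores[c]
+     return reweighted   # classes outside the top-K keep their original confidence
+
+ # Example: the comparator overturns the classifier's top-1 guess.
+ probs = np.array([0.05, 0.48, 0.40, 0.07])
+ scores = {1: 0.2, 2: 0.9, 3: 0.1}
+ print(reweight_with_comparator(probs, scores, top_k=3).argmax())   # -> 2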
+
+
+
+
+ + ♻ ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing "image hallucination" +and risking misdiagnosis. We hypothesize such hallucinations result from local +OOD regions in the conditional images. We verify that partitioning the OOD +region and conducting separate image generations alleviates hallucinations in +several applications. From this, we propose a training-free diffusion framework +that reduces hallucination with multiple Local Diffusion processes. Our +approach involves OOD estimation followed by two modules: a "branching" module +generates locally both within and outside OOD regions, and a "fusion" module +integrates these predictions into one. Our evaluation shows our method +mitigates hallucination over baseline models quantitatively and qualitatively, +reducing misdiagnosis by 40% and 25% in the real-world medical and natural +image datasets, respectively. It also demonstrates compatibility with various +pre-trained diffusion models. + +
+
+
+
+
+ + ♻ ☆ ChatPose: Chatting about 3D Human Pose + + +
+ We introduce ChatPose, a framework employing Large Language Models (LLMs) to +understand and reason about 3D human poses from images or textual descriptions. +Our work is motivated by the human ability to intuitively understand postures +from a single image or a brief description, a process that intertwines image +interpretation, world knowledge, and an understanding of body language. +Traditional human pose estimation and generation methods often operate in +isolation, lacking semantic understanding and reasoning abilities. ChatPose +addresses these limitations by embedding SMPL poses as distinct signal tokens +within a multimodal LLM, enabling the direct generation of 3D body poses from +both textual and visual inputs. Leveraging the powerful capabilities of +multimodal LLMs, ChatPose unifies classical 3D human pose and generation tasks +while offering user interactions. Additionally, ChatPose empowers LLMs to apply +their extensive world knowledge in reasoning about human poses, leading to two +advanced tasks: speculative pose generation and reasoning about pose +estimation. These tasks involve reasoning about humans to generate 3D poses +from subtle text queries, possibly accompanied by images. We establish +benchmarks for these tasks, moving beyond traditional 3D pose generation and +estimation methods. Our results show that ChatPose outperforms existing +multimodal LLMs and task-specific methods on these newly proposed tasks. +Furthermore, ChatPose's ability to understand and generate 3D human poses based +on complex reasoning opens new directions in human pose analysis. + +
+
+ comment: Home page: https://yfeng95.github.io/ChatPose/ +
+
+
+
+
+ + ♻ ☆ Are Bias Mitigation Techniques for Deep Learning Effective? WACV 2022 + + +
+ A critical problem in deep learning is that systems learn inappropriate +biases, resulting in their inability to perform well on minority groups. This +has led to the creation of multiple algorithms that endeavor to mitigate bias. +However, it is not clear how effective these methods are. This is because study +protocols differ among papers, systems are tested on datasets that fail to test +many forms of bias, and systems have access to hidden knowledge or are tuned +specifically to the test set. To address this, we introduce an improved +evaluation protocol, sensible metrics, and a new dataset, which enables us to +ask and answer critical questions about bias mitigation algorithms. We evaluate +seven state-of-the-art algorithms using the same network architecture and +hyperparameter selection policy across three benchmark datasets. We introduce a +new dataset called Biased MNIST that enables assessment of robustness to +multiple bias sources. We use Biased MNIST and a visual question answering +(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning +to the test set distribution, we study robustness across different tuning +distributions, which is critical because for many applications the test +distribution may not be known during development. We find that algorithms +exploit hidden biases, are unable to scale to multiple forms of bias, and are +highly sensitive to the choice of tuning set. Based on our findings, we implore +the community to adopt more rigorous assessment of future bias mitigation +methods. All data, code, and results are publicly available at: +https://github.com/erobic/bias-mitigators. + +
+
+ comment: Published in WACV 2022 under the title "An Investigation of Critical + Issues in Bias Mitigation Techniques" +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 156 + +
+
+
+ + ☆ AutoAD III: The Prequel -- Back to the Pixels CVPR2024 + + +
+ Generating Audio Description (AD) for movies is a challenging task that +requires fine-grained visual understanding and an awareness of the characters +and their names. Currently, visual language models for AD generation are +limited by a lack of suitable training data, and also their evaluation is +hampered by using performance measures not specialized to the AD domain. In +this paper, we make three contributions: (i) We propose two approaches for +constructing AD datasets with aligned video data, and build training and +evaluation datasets using these. These datasets will be publicly released; (ii) +We develop a Q-former-based architecture which ingests raw video and generates +AD, using frozen pre-trained visual encoders and large language models; and +(iii) We provide new evaluation metrics to benchmark AD quality that are +well-matched to human performance. Taken together, we improve the state of the +art on AD generation. + +
+
+ comment: CVPR2024. Project page: + https://www.robots.ox.ac.uk/~vgg/research/autoad/ +
+
+
+
+
+ + ☆ Guess The Unseen: Dynamic 3D Scene Reconstruction from Partial 2D + Glimpses + + +
+ In this paper, we present a method to reconstruct the world and multiple +dynamic humans in 3D from a monocular video input. As a key idea, we represent +both the world and multiple humans via the recently emerging 3D Gaussian +Splatting (3D-GS) representation, enabling to conveniently and efficiently +compose and render them together. In particular, we address the scenarios with +severely limited and sparse observations in 3D human reconstruction, a common +challenge encountered in the real world. To tackle this challenge, we introduce +a novel approach to optimize the 3D-GS representation in a canonical space by +fusing the sparse cues in the common space, where we leverage a pre-trained 2D +diffusion model to synthesize unseen views while keeping the consistency with +the observed 2D appearances. We demonstrate our method can reconstruct +high-quality animatable 3D humans in various challenging examples, in the +presence of occlusion, image crops, few-shot, and extremely sparse +observations. After reconstruction, our method is capable of not only rendering +the scene in any novel views at arbitrary time instances, but also editing the +3D scene by removing individual humans or applying different motions for each +human. Through various experiments, we demonstrate the quality and efficiency +of our methods over alternative existing approaches. + +
+
+ comment: The project page is available at https://snuvclab.github.io/gtu/ +
+
+
+
+
+ + ☆ CrossScore: Towards Multi-View Image Evaluation and Scoring + + +
+ We introduce a novel cross-reference image quality assessment method that +effectively fills the gap in the image assessment landscape, complementing the +array of established evaluation schemes -- ranging from full-reference metrics +like SSIM, no-reference metrics such as NIQE, to general-reference metrics +including FID, and Multi-modal-reference metrics, e.g., CLIPScore. Utilising a +neural network with the cross-attention mechanism and a unique data collection +pipeline from NVS optimisation, our method enables accurate image quality +assessment without requiring ground truth references. By comparing a query +image against multiple views of the same scene, our method addresses the +limitations of existing metrics in novel view synthesis (NVS) and similar tasks +where direct reference images are unavailable. Experimental results show that +our method is closely correlated to the full-reference metric SSIM, while not +requiring ground truth references. + +
+
+ comment: Project page see https://crossscore.active.vision +
+
+
+
+
+ + ☆ Hyp-OC: Hyperbolic One Class Classification for Face Anti-Spoofing + + +
+ Face recognition technology has become an integral part of modern security +systems and user authentication processes. However, these systems are +vulnerable to spoofing attacks and can easily be circumvented. Most prior +research in face anti-spoofing (FAS) approaches it as a two-class +classification task where models are trained on real samples and known spoof +attacks and tested for detection performance on unknown spoof attacks. However, +in practice, FAS should be treated as a one-class classification task where, +while training, one cannot assume any knowledge regarding the spoof samples a +priori. In this paper, we reformulate the face anti-spoofing task from a +one-class perspective and propose a novel hyperbolic one-class classification +framework. To train our network, we use a pseudo-negative class sampled from +the Gaussian distribution with a weighted running mean and propose two novel +loss functions: (1) Hyp-PC: Hyperbolic Pairwise Confusion loss, and (2) Hyp-CE: +Hyperbolic Cross Entropy loss, which operate in the hyperbolic space. +Additionally, we employ Euclidean feature clipping and gradient clipping to +stabilize the training in the hyperbolic space. To the best of our knowledge, +this is the first work extending hyperbolic embeddings for face anti-spoofing +in a one-class manner. With extensive experiments on five benchmark datasets: +Rose-Youtu, MSU-MFSD, CASIA-MFSD, Idiap Replay-Attack, and OULU-NPU, we +demonstrate that our method significantly outperforms the state-of-the-art, +achieving better spoof detection performance. + +
+
+ comment: Accepted in FG2024, Project Page - + https://kartik-3004.github.io/hyp-oc/ +
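+ A small sketch of the pseudo-negative sampling described above (the exact
+ parameterization is an assumption): keep a weighted running mean of the real
+ (bonafide) embeddings and draw the pseudo-negative class from a Gaussian
+ around it at every step.
+
+ import torch
+
+ class PseudoNegativeSampler:
+     """Weighted running mean of real-face embeddings; pseudo-negatives are
+     sampled from a Gaussian centered on that mean."""
+     def __init__(self, dim, momentum=0.9, sigma=1.0):
+         self.mean = torch.zeros(dim)
+         self.momentum = momentum
+         self.sigma = sigma
+
+     @torch.no_grad()
+     def update(self, real_embeddings):
+         batch_mean = real_embeddings.mean(dim=0)
+         self.mean = self.momentum * self.mean + (1 - self.momentum) * batch_mean
+
+     def sample(self, n):
+         return self.mean + self.sigma * torch.randn(n, self.mean.shape[0])
+
+ # Usage inside a training step (embeddings come from the face encoder):
+ sampler = PseudoNegativeSampler(dim=512)
+ real = torch.randn(32, 512)          # stand-in for encoder outputs
+ sampler.update(real)
+ pseudo_neg = sampler.sample(32)      # negatives for the one-class losses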
+
+
+
+
+ + ☆ GeoDiffuser: Geometry-Based Image Editing with Diffusion Models + + +
+ The success of image generative models has enabled us to build methods that +can edit images based on text or other user input. However, these methods are +bespoke, imprecise, require additional information, or are limited to only 2D +image edits. We present GeoDiffuser, a zero-shot optimization-based method that +unifies common 2D and 3D image-based object editing capabilities into a single +method. Our key insight is to view image editing operations as geometric +transformations. We show that these transformations can be directly +incorporated into the attention layers in diffusion models to implicitly +perform editing operations. Our training-free optimization method uses an +objective function that seeks to preserve object style but generate plausible +images, for instance with accurate lighting and shadows. It also inpaints +disoccluded parts of the image where the object was originally located. Given a +natural image and user input, we segment the foreground object using SAM and +estimate a corresponding transform which is used by our optimization approach +for editing. GeoDiffuser can perform common 2D and 3D edits like object +translation, 3D rotation, and removal. We present quantitative results, +including a perceptual study, that shows how our approach is better than +existing methods. Visit https://ivl.cs.brown.edu/research/geodiffuser.html for +more information. + +
+
+
+
+
+ + ☆ SEED-X: Multimodal Models with Unified Multi-granularity Comprehension + and Generation + + +
+ The rapid evolution of multimodal foundation models has demonstrated +significant progress in vision-language understanding and generation, e.g., +our previous work SEED-LLaMA. However, there remains a gap between their +capability and real-world applicability, primarily due to their limited +capacity to effectively respond to various user instructions and interact with +diverse visual data. In this work, we focus on bridging this gap through +integrating two enhanced features: (1) comprehending images of arbitrary sizes +and ratios, and (2) enabling multi-granularity image generation. We present a +unified and versatile foundation model, namely, SEED-X, which is able to model +multi-granularity visual semantics for comprehension and generation tasks. +Besides the competitive results on public benchmarks, SEED-X demonstrates its +effectiveness in handling real-world applications across various domains after +instruction tuning. We hope that our work will inspire future research into +what can be achieved by versatile multimodal foundation models in real-world +applications. The models, codes, and datasets will be released at +https://github.com/AILab-CVC/SEED-X. +
+
+ comment: Project released at: https://github.com/AILab-CVC/SEED-X +
+
+
+
+
+ + ☆ A Multimodal Automated Interpretability Agent + + +
+ This paper describes MAIA, a Multimodal Automated Interpretability Agent. +MAIA is a system that uses neural models to automate neural model understanding +tasks like feature interpretation and failure mode discovery. It equips a +pre-trained vision-language model with a set of tools that support iterative +experimentation on subcomponents of other models to explain their behavior. +These include tools commonly used by human interpretability researchers: for +synthesizing and editing inputs, computing maximally activating exemplars from +real-world datasets, and summarizing and describing experimental results. +Interpretability experiments proposed by MAIA compose these tools to describe +and explain system behavior. We evaluate applications of MAIA to computer +vision models. We first characterize MAIA's ability to describe (neuron-level) +features in learned representations of images. Across several trained models +and a novel dataset of synthetic vision neurons with paired ground-truth +descriptions, MAIA produces descriptions comparable to those generated by +expert human experimenters. We then show that MAIA can aid in two additional +interpretability tasks: reducing sensitivity to spurious features, and +automatically identifying inputs likely to be mis-classified. + +
+
+ comment: 25 pages, 13 figures +
+
+
+
+
+ + ☆ STROOBnet Optimization via GPU-Accelerated Proximal Recurrence + Strategies + + +
+ Spatiotemporal networks' observational capabilities are crucial for accurate +data gathering and informed decisions across multiple sectors. This study +focuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network +(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events +within defined geographical regions, enabling efficient monitoring. Using data +from Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New +Orleans, where RTCC combats rising crime amidst reduced police presence, we +address the network's initial observational imbalances. Aiming for uniform +observational efficacy, we propose the Proximal Recurrence approach. It +outperformed traditional clustering methods like k-means and DBSCAN by offering +holistic event frequency and spatial consideration, enhancing observational +coverage. + +
+
+ comment: 10 pages, 17 figures, 2023 IEEE International Conference on Big Data + (BigData) +
+
+
+
+
+ + ☆ TAVGBench: Benchmarking Text to Audible-Video Generation + + +
+ The Text to Audible-Video Generation (TAVG) task involves generating videos +with accompanying audio based on text descriptions. Achieving this requires +skillful alignment of both audio and video elements. To support research in +this field, we have developed a comprehensive Text to Audible-Video Generation +Benchmark (TAVGBench), which contains over 1.7 million clips with a total +duration of 11.8 thousand hours. We propose an automatic annotation pipeline to +ensure each audible video has detailed descriptions for both its audio and +video contents. We also introduce the Audio-Visual Harmoni score (AVHScore) to +provide a quantitative measure of the alignment between the generated audio and +video modalities. Additionally, we present a baseline model for TAVG called +TAVDiffusion, which uses a two-stream latent diffusion model to provide a +fundamental starting point for further research in this area. We achieve the +alignment of audio and video by employing cross-attention and contrastive +learning. Through extensive experiments and evaluations on TAVGBench, we +demonstrate the effectiveness of our proposed model under both conventional +metrics and our proposed metrics. + +
+
+ comment: Technical Report. Project + page:https://github.com/OpenNLPLab/TAVGBench +
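+ One simple way to compute such an audio-video alignment score (a sketch of the
+ general idea only; the encoders and the exact AVHScore formula are not
+ specified here) is the mean cosine similarity between paired clip embeddings
+ from any joint audio-visual embedding space.
+
+ import torch
+ import torch.nn.functional as F
+
+ def av_alignment_score(video_emb, audio_emb):
+     """Mean cosine similarity between paired (N, D) clip embeddings."""
+     v = F.normalize(video_emb, dim=1)
+     a = F.normalize(audio_emb, dim=1)
+     return (v * a).sum(dim=1).mean().item()
+
+ # Toy usage with random tensors standing in for encoder outputs.
+ score = av_alignment_score(torch.randn(8, 512), torch.randn(8, 512))
+ print(f"alignment score: {score:.3f}")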
+
+
+
+
+ + ☆ Graphic Design with Large Multimodal Model + + +
+ In the field of graphic design, automating the integration of design elements +into a cohesive multi-layered artwork not only boosts productivity but also +paves the way for the democratization of graphic design. One existing practice +is Graphic Layout Generation (GLG), which aims to layout sequential design +elements. It has been constrained by the necessity for a predefined correct +sequence of layers, thus limiting creative potential and increasing user +workload. In this paper, we present Hierarchical Layout Generation (HLG) as a +more flexible and pragmatic setup, which creates graphic composition from +unordered sets of design elements. To tackle the HLG task, we introduce +Graphist, the first layout generation model based on large multimodal models. +Graphist efficiently reframes the HLG as a sequence generation problem, +utilizing RGB-A images as input, outputs a JSON draft protocol, indicating the +coordinates, size, and order of each element. We develop new evaluation metrics +for HLG. Graphist outperforms prior arts and establishes a strong baseline for +this field. Project homepage: https://github.com/graphic-design-ai/graphist + +
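+ To make the output format concrete, a JSON layout draft of the kind described
+ above could look like the following (the field names and values are purely
+ illustrative, not the released Graphist schema):
+
+ import json
+
+ # A z-ordered list of elements, each with its position and size on the canvas.
+ layout_draft = {
+     "canvas": {"width": 1080, "height": 1920},
+     "elements": [
+         {"id": "background", "x": 0,   "y": 0,    "width": 1080, "height": 1920, "order": 0},
+         {"id": "hero_image", "x": 90,  "y": 300,  "width": 900,  "height": 900,  "order": 1},
+         {"id": "headline",   "x": 120, "y": 1320, "width": 840,  "height": 160,  "order": 2},
+         {"id": "logo",       "x": 840, "y": 60,   "width": 180,  "height": 180,  "order": 3},
+     ],
+ }
+
+ print(json.dumps(layout_draft, indent=2))   # the model's textual output would resemble this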
+
+
+
+
+ + ☆ Scene Coordinate Reconstruction: Posing of Image Collections via + Incremental Learning of a Relocalizer + + +
+ We address the task of estimating camera parameters from a set of images +depicting a scene. Popular feature-based structure-from-motion (SfM) tools +solve this task by incremental reconstruction: they repeat triangulation of +sparse 3D points and registration of more camera views to the sparse point +cloud. We re-interpret incremental structure-from-motion as an iterated +application and refinement of a visual relocalizer, that is, of a method that +registers new views to the current state of the reconstruction. This +perspective allows us to investigate alternative visual relocalizers that are +not rooted in local feature matching. We show that scene coordinate regression, +a learning-based relocalization approach, allows us to build implicit, neural +scene representations from unposed images. Different from other learning-based +reconstruction methods, we do not require pose priors nor sequential inputs, +and we optimize efficiently over thousands of images. Our method, ACE0 (ACE +Zero), estimates camera poses to an accuracy comparable to feature-based SfM, +as demonstrated by novel view synthesis. Project page: +https://nianticlabs.github.io/acezero/ + +
+
+ comment: Project page: https://nianticlabs.github.io/acezero/ +
+
+
+
+
+ + ☆ Automatic Discovery of Visual Circuits + + +
+ To date, most discoveries of network subcomponents that implement +human-interpretable computations in deep vision models have involved close +study of single units and large amounts of human labor. We explore scalable +methods for extracting the subgraph of a vision model's computational graph +that underlies recognition of a specific visual concept. We introduce a new +method for identifying these subgraphs: specifying a visual concept using a few +examples, and then tracing the interdependence of neuron activations across +layers, or their functional connectivity. We find that our approach extracts +circuits that causally affect model output, and that editing these circuits can +defend large pretrained models from adversarial attacks. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ On-the-Fly Point Annotation for Fast Medical Video Labeling + + +
+ Purpose: In medical research, deep learning models rely on high-quality +annotated data, the creation of which is often laborious and time-consuming. +This is particularly true for detection tasks where bounding box annotations +are required. The need to adjust two corners makes the process inherently +frame-by-frame. Given the scarcity of experts' time, efficient annotation +methods suitable for clinicians are needed. Methods: We propose an on-the-fly +method for live video annotation to enhance the annotation efficiency. In this +approach, a continuous single-point annotation is maintained by keeping the +cursor on the object in a live video, mitigating the need for tedious pausing +and repetitive navigation inherent in traditional annotation methods. This +novel annotation paradigm inherits the point annotation's ability to generate +pseudo-labels using a point-to-box teacher model. We empirically evaluate this +approach by developing a dataset and comparing on-the-fly annotation time +against the traditional annotation method. Results: Using our method, +annotation speed was 3.2x faster than the traditional annotation technique. We +achieved a mean improvement of 6.51 ± 0.98 AP@50 over the conventional method +at equivalent annotation budgets on the developed dataset. Conclusion: Without +bells and whistles, our approach offers a significant speed-up in annotation +tasks. It can be easily implemented on any annotation platform to accelerate +the integration of deep learning in video-based medical research. + 
+
+ comment: 7 pages, 5 figures. Int J CARS (2024) +
+
+
+
+
+ + ☆ Heterogeneous Face Recognition Using Domain Invariant Units ICASSP 2024 + + +
+ Heterogeneous Face Recognition (HFR) aims to expand the applicability of Face +Recognition (FR) systems to challenging scenarios, enabling the matching of +face images across different domains, such as matching thermal images to +visible spectra. However, the development of HFR systems is challenging because +of the significant domain gap between modalities and the lack of availability +of large-scale paired multi-channel data. In this work, we leverage a +pretrained face recognition model as a teacher network to learn domain-invariant +network layers called Domain-Invariant Units (DIU) to reduce the domain gap. +The proposed DIU can be trained effectively even with a limited amount of +paired training data, in a contrastive distillation framework. The proposed +approach has the potential to enhance pretrained models, making them more +adaptable to a wider range of variations in data. We extensively evaluate our +approach on multiple challenging benchmarks, demonstrating superior performance +compared to state-of-the-art methods. + 
+
+ comment: 6 pages, Accepted ICASSP 2024 +
+
+
+
+
+ + ☆ X-Ray: A Sequential 3D Representation for Generation + + +
+ In this paper, we introduce X-Ray, an innovative approach to 3D generation +that employs a new sequential representation, drawing inspiration from the +depth-revealing capabilities of X-Ray scans to meticulously capture both the +external and internal features of objects. Central to our method is the +utilization of ray casting techniques originating from the camera's viewpoint, +meticulously recording the geometric and textural details encountered across +all intersected surfaces. This process efficiently condenses complete objects +or scenes into a multi-frame format, just like videos. Such a structure ensures +the 3D representation is composed solely of critical surface information. +Highlighting the practicality and adaptability of our X-Ray representation, we +showcase its utility in synthesizing 3D objects, employing a network +architecture akin to that used in video diffusion models. The outcomes reveal +our representation's superior performance in enhancing both the accuracy and +efficiency of 3D synthesis, heralding new directions for ongoing research and +practical implementations in the field. + +
+
+
+
+
+ + ☆ Machine Learning Techniques for MRI Data Processing at Expanding Scale + + +
+ Imaging sites around the world generate growing amounts of medical scan data +with ever more versatile and affordable technology. Large-scale studies acquire +MRI for tens of thousands of participants, together with metadata ranging from +lifestyle questionnaires to biochemical assays, genetic analyses and more. +These large datasets encode substantial information about human health and hold +considerable potential for machine learning training and analysis. This chapter +examines ongoing large-scale studies and the challenge of distribution shifts +between them. Transfer learning for overcoming such shifts is discussed, +together with federated learning for safe access to distributed training data +securely held at multiple institutions. Finally, representation learning is +reviewed as a methodology for encoding embeddings that express abstract +relationships in multi-modal input formats. + +
+
+ comment: Book chapter pre-print +
+
+
+
+
+ + ☆ A Novel Approach to Chest X-ray Lung Segmentation Using U-net and + Modified Convolutional Block Attention Module + + +
+ Lung segmentation in chest X-ray images is of paramount importance as it +plays a crucial role in the diagnosis and treatment of various lung diseases. +This paper presents a novel approach for lung segmentation in chest X-ray +images by integrating U-net with attention mechanisms. The proposed method +enhances the U-net architecture by incorporating a Convolutional Block +Attention Module (CBAM), which unifies three distinct attention mechanisms: +channel attention, spatial attention, and pixel attention. The channel +attention mechanism enables the model to concentrate on the most informative +features across various channels. The spatial attention mechanism enhances the +model's precision in localization by focusing on significant spatial locations. +Lastly, the pixel attention mechanism empowers the model to focus on individual +pixels, further refining the model's focus and thereby improving the accuracy +of segmentation. The adoption of the proposed CBAM in conjunction with the +U-net architecture marks a significant advancement in the field of medical +imaging, with potential implications for improving diagnostic precision and +patient outcomes. The efficacy of this method is validated against contemporary +state-of-the-art techniques, showcasing its superiority in segmentation +performance. + +
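+ For reference, the channel and spatial attention branches of CBAM are commonly
+ implemented as below; the additional pixel attention mentioned above is shown
+ only as a simple 1x1-convolution gate, which is an assumption rather than the
+ paper's exact design.
+
+ import torch
+ import torch.nn as nn
+
+ class ChannelAttention(nn.Module):
+     def __init__(self, channels, reduction=16):
+         super().__init__()
+         hidden = max(channels // reduction, 1)
+         self.mlp = nn.Sequential(nn.Linear(channels, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, channels))
+     def forward(self, x):
+         gate = torch.sigmoid(self.mlp(x.mean(dim=(2, 3))) + self.mlp(x.amax(dim=(2, 3))))
+         return gate[:, :, None, None] * x
+
+ class SpatialAttention(nn.Module):
+     def __init__(self, kernel_size=7):
+         super().__init__()
+         self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)
+     def forward(self, x):
+         pooled = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1)
+         return torch.sigmoid(self.conv(pooled)) * x
+
+ class ModifiedCBAM(nn.Module):
+     """Channel -> spatial -> (assumed) per-pixel gate on a U-net feature map."""
+     def __init__(self, channels):
+         super().__init__()
+         self.ca, self.sa = ChannelAttention(channels), SpatialAttention()
+         self.pixel_gate = nn.Conv2d(channels, channels, kernel_size=1)
+     def forward(self, x):
+         x = self.sa(self.ca(x))
+         return torch.sigmoid(self.pixel_gate(x)) * x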
+
+
+
+
+ + ☆ Towards Better Adversarial Purification via Adversarial Denoising + Diffusion Training + + +
+ Recently, diffusion-based purification (DBP) has emerged as a promising +approach for defending against adversarial attacks. However, previous studies +have used questionable methods to evaluate the robustness of DBP models, and +their explanations of DBP robustness also lack experimental support. We +re-examine DBP robustness using precise gradients, and discuss the impact of +stochasticity on DBP robustness. To better explain DBP robustness, we assess +DBP robustness under a novel attack setting, Deterministic White-box, and +pinpoint stochasticity as the main factor in DBP robustness. Our results +suggest that DBP models rely on stochasticity to evade the most effective +attack direction, rather than directly countering adversarial perturbations. To +improve the robustness of DBP models, we propose Adversarial Denoising +Diffusion Training (ADDT). This technique uses Classifier-Guided Perturbation +Optimization (CGPO) to generate adversarial perturbations through guidance from +a pre-trained classifier, and uses Rank-Based Gaussian Mapping (RBGM) to +convert adversarial perturbations into a normal Gaussian distribution. +Empirical results show that ADDT improves the robustness of DBP models. Further +experiments confirm that ADDT equips DBP models with the ability to directly +counter adversarial perturbations. + 
+
+
+
+
+ + ☆ Fast and Robust Normal Estimation for Sparse LiDAR Scans + + +
+ Light Detection and Ranging (LiDAR) technology has proven to be an important +part of many robotics systems. Surface normals estimated from LiDAR data are +commonly used for a variety of tasks in such systems. As most of today's +mechanical LiDAR sensors produce sparse data, estimating normals from a single +scan in a robust manner poses difficulties. + In this paper, we address the problem of estimating normals for sparse LiDAR +data while avoiding the typical issue of smoothing out the normals in +high-curvature areas. + Mechanical LiDARs rotate a set of rigidly mounted lasers. One firing of such +a set of lasers produces an array of points where each point's neighbor is +known due to the known firing pattern of the scanner. We use this knowledge to +connect these points to their neighbors and label them using the angles of the +lines connecting them. When estimating normals at these points, we only +consider points with the same label as neighbors. This allows us to avoid +estimating normals across high-curvature areas. + We evaluate our approach on various data, both self-recorded and publicly +available, acquired using various sparse LiDAR sensors. We show that using our +method for normal estimation leads to normals that are more robust in areas +with high curvature, which leads to maps of higher quality. We also show that +our method only incurs a constant-factor runtime overhead with respect to a +lightweight baseline normal estimation procedure and is therefore suited for +operation in computationally demanding environments. + 
+
+
+
+
+ + ☆ RESFM: Robust Equivariant Multiview Structure from Motion + + +
+ Multiview Structure from Motion is a fundamental and challenging computer +vision problem. A recent deep-based approach was proposed utilizing matrix +equivariant architectures for the simultaneous recovery of camera pose and 3D +scene structure from large image collections. This work however made the +unrealistic assumption that the point tracks given as input are clean of +outliers. Here we propose an architecture suited to dealing with outliers by +adding an inlier/outlier classifying module that respects the model +equivariance and by adding a robust bundle adjustment step. Experiments +demonstrate that our method can be successfully applied in realistic settings +that include large image collections and point tracks extracted with common +heuristics and include many outliers. + +
+
+
+
+
+ + ☆ Co-designing a Sub-millisecond Latency Event-based Eye Tracking System + with Submanifold Sparse CNN CVPR 2024 + + +
+ Eye-tracking technology is integral to numerous consumer electronics +applications, particularly in the realm of virtual and augmented reality +(VR/AR). These applications demand solutions that excel in three crucial +aspects: low-latency, low-power consumption, and precision. Yet, achieving +optimal performance across all these fronts presents a formidable challenge, +necessitating a balance between sophisticated algorithms and efficient backend +hardware implementations. In this study, we tackle this challenge through a +synergistic software/hardware co-design of the system with an event camera. +Leveraging the inherent sparsity of event-based input data, we integrate a +novel sparse FPGA dataflow accelerator customized for submanifold sparse +convolution neural networks (SCNN). The SCNN implemented on the accelerator can +efficiently extract the embedding feature vector from each representation of +event slices by only processing the non-zero activations. Subsequently, these +vectors undergo further processing by a gated recurrent unit (GRU) and a fully +connected layer on the host CPU to generate the eye centers. Deployment and +evaluation of our system reveal outstanding performance metrics. On the +Event-based Eye-Tracking-AIS2024 dataset, our system achieves 81% p5 accuracy, +99.5% p10 accuracy, and 3.71 Mean Euclidean Distance with 0.7 ms latency while +only consuming 2.29 mJ per inference. Notably, our solution opens up +opportunities for future eye-tracking systems. Code is available at +https://github.com/CASR-HKU/ESDA/tree/eye_tracking. + +
+
+ comment: Accepted to CVPR 2024 workshop, AIS: Vision, Graphics, and AI for + Streaming +
+
+
+
+
+ + ☆ CLIP-GS: CLIP-Informed Gaussian Splatting for Real-time and + View-consistent 3D Semantic Understanding + + +
+ The recent 3D Gaussian Splatting (GS) exhibits high-quality and real-time +synthesis of novel views in 3D scenes. Currently, it primarily focuses on +geometry and appearance modeling, while lacking the semantic understanding of +scenes. To bridge this gap, we present CLIP-GS, which integrates semantics from +Contrastive Language-Image Pre-Training (CLIP) into Gaussian Splatting to +efficiently comprehend 3D environments without annotated semantic data. +Specifically, rather than straightforwardly learning and rendering +high-dimensional semantic features of 3D Gaussians, which significantly +diminishes the efficiency, we propose a Semantic Attribute Compactness (SAC) +approach. SAC exploits the inherent unified semantics within objects to learn +compact yet effective semantic representations of 3D Gaussians, enabling highly +efficient rendering (>100 FPS). Additionally, to address the semantic ambiguity +caused by utilizing view-inconsistent 2D CLIP semantics to supervise Gaussians, +we introduce a 3D Coherent Self-training (3DCS) strategy, resorting to the +multi-view consistency originating from the 3D model. 3DCS imposes cross-view +semantic consistency constraints by leveraging refined, self-predicted +pseudo-labels derived from the trained 3D Gaussian model, thereby enhancing +precise and view-consistent segmentation results. Extensive experiments +demonstrate that our method remarkably outperforms existing state-of-the-art +approaches, achieving improvements of 17.29% and 20.81% in mIoU metric on the +Replica and ScanNet datasets, respectively, while maintaining real-time +rendering speed. Furthermore, our approach exhibits superior performance even +with sparse input data, verifying the robustness of our method. + 
+
+ comment: https://github.com/gbliao/CLIP-GS +
+
+
+
+
+ + ☆ NTIRE 2024 Challenge on Low Light Image Enhancement: Methods and Results + + +
+ This paper reviews the NTIRE 2024 low light image enhancement challenge, +highlighting the proposed solutions and results. The aim of this challenge is +to discover an effective network design or solution capable of generating +brighter, clearer, and visually appealing results when dealing with a variety +of conditions, including ultra-high resolution (4K and beyond), non-uniform +illumination, backlighting, extreme darkness, and night scenes. A notable total +of 428 participants registered for the challenge, with 22 teams ultimately +making valid submissions. This paper meticulously evaluates the +state-of-the-art advancements in enhancing low-light images, reflecting the +significant progress and creativity in this field. + +
+
+ comment: NTIRE 2024 Challenge Report +
+
+
+
+
+ + ☆ From Modalities to Styles: Rethinking the Domain Gap in Heterogeneous + Face Recognition + + +
+ Heterogeneous Face Recognition (HFR) focuses on matching faces from different +domains, for instance, thermal to visible images, making Face Recognition (FR) +systems more versatile for challenging scenarios. However, the domain gap +between these domains and the limited availability of large-scale datasets in +the target HFR modalities make it challenging to develop robust HFR models from +scratch. In our work, we view different modalities as distinct styles and +propose a method to modulate feature maps of the target modality to address the +domain gap. We present a new Conditional Adaptive Instance Modulation (CAIM) +module that seamlessly fits into existing FR networks, turning them into +HFR-ready systems. The CAIM block modulates intermediate feature maps, +efficiently adapting to the style of the source modality and bridging the +domain gap. Our method enables end-to-end training using a small set of paired +samples. We extensively evaluate the proposed approach on various challenging +HFR benchmarks, showing that it outperforms state-of-the-art methods. The +source code and protocols for reproducing the findings will be made publicly +available. + 
+
+ comment: Accepted for publication in IEEE TBIOM +
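+ The style-modulation view above maps naturally onto a FiLM/AdaIN-style block;
+ the sketch below is a generic stand-in for such a module (the real CAIM
+ parameterization is not reproduced): instance-normalize the feature map, then
+ scale and shift it with parameters predicted from a modality-condition vector.
+
+ import torch
+ import torch.nn as nn
+
+ class ConditionalInstanceModulation(nn.Module):
+     def __init__(self, channels, cond_dim):
+         super().__init__()
+         self.norm = nn.InstanceNorm2d(channels, affine=False)
+         self.to_gamma_beta = nn.Linear(cond_dim, 2 * channels)
+
+     def forward(self, feat, cond):
+         gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=1)   # (B, C) each
+         return self.norm(feat) * (1 + gamma[:, :, None, None]) + beta[:, :, None, None]
+
+ # Usage: inserted after a backbone stage; `cond` could be a learned per-modality
+ # embedding (e.g., thermal vs. visible).
+ block = ConditionalInstanceModulation(channels=256, cond_dim=64)
+ out = block(torch.randn(4, 256, 28, 28), torch.randn(4, 64))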
+
+
+
+
+ + ☆ UrbanCross: Enhancing Satellite Image-Text Retrieval with Cross-Domain + Adaptation + + +
+ Urbanization challenges underscore the necessity for effective satellite +image-text retrieval methods to swiftly access specific information enriched +with geographic semantics for urban applications. However, existing methods +often overlook significant domain gaps across diverse urban landscapes, +primarily focusing on enhancing retrieval performance within single domains. To +tackle this issue, we present UrbanCross, a new framework for cross-domain +satellite image-text retrieval. UrbanCross leverages a high-quality, +cross-domain dataset enriched with extensive geo-tags from three countries to +highlight domain diversity. It employs the Large Multimodal Model (LMM) for +textual refinement and the Segment Anything Model (SAM) for visual +augmentation, achieving a fine-grained alignment of images, segments and texts, +yielding a 10% improvement in retrieval performance. Additionally, UrbanCross +incorporates an adaptive curriculum-based source sampler and a weighted +adversarial cross-domain fine-tuning module, progressively enhancing +adaptability across various domains. Extensive experiments confirm UrbanCross's +superior efficiency in retrieval and adaptation to new urban environments, +demonstrating an average performance increase of 15% over its version without +domain adaptation mechanisms, effectively bridging the domain gap. + +
+
+
+
+
+ + ☆ MultiBooth: Towards Generating All Your Concepts in an Image from Text + + +
+ This paper introduces MultiBooth, a novel and efficient technique for +multi-concept customization in image generation from text. Despite the +significant advancements in customized generation methods, particularly with +the success of diffusion models, existing methods often struggle with +multi-concept scenarios due to low concept fidelity and high inference cost. +MultiBooth addresses these issues by dividing the multi-concept generation +process into two phases: a single-concept learning phase and a multi-concept +integration phase. During the single-concept learning phase, we employ a +multi-modal image encoder and an efficient concept encoding technique to learn +a concise and discriminative representation for each concept. In the +multi-concept integration phase, we use bounding boxes to define the generation +area for each concept within the cross-attention map. This method enables the +creation of individual concepts within their specified regions, thereby +facilitating the formation of multi-concept images. This strategy not only +improves concept fidelity but also reduces additional inference cost. +MultiBooth surpasses various baselines in both qualitative and quantitative +evaluations, showcasing its superior performance and computational efficiency. +Project Page: https://multibooth.github.io/ + +
+
+ comment: Project Page: https://multibooth.github.io/ . Github Page: + https://github.com/chenyangzhu1/MultiBooth +
+
+
+
+
+ + ☆ Detecting and Mitigating Hallucination in Large Vision Language Models + via Fine-Grained AI Feedback + + +
+ The rapidly developing Large Vision Language Models (LVLMs) have shown
+notable capabilities on a range of multi-modal tasks, but still face the
+hallucination phenomenon where the generated texts do not align with the given
+contexts, significantly restricting the usage of LVLMs. Most previous work
+detects and mitigates hallucination at the coarse-grained level or requires
+expensive annotation (e.g., labeling by proprietary models or human experts).
+To address these issues, we propose detecting and mitigating hallucinations in
+LVLMs via fine-grained AI feedback. The basic idea is that we generate a
+small-size sentence-level hallucination annotation dataset by proprietary
+models, whereby we train a hallucination detection model which can perform
+sentence-level hallucination detection, covering primary hallucination types
+(i.e., object, attribute, and relationship). Then, we propose a
+detect-then-rewrite pipeline to automatically construct a preference dataset
+for training the hallucination-mitigating model. Furthermore, we propose
+differentiating the severity of hallucinations and introduce a Hallucination
+Severity-Aware Direct Preference Optimization (HSA-DPO) for mitigating
+hallucination in LVLMs by incorporating the severity of hallucinations into
+preference learning. Extensive experiments demonstrate the effectiveness of our
+method.
+
+
+
+
+
+ + ☆ Generalizable Neural Human Renderer + + +
+ While recent advancements in animatable human rendering have achieved
+remarkable results, they require test-time optimization for each subject, which
+can be a significant limitation for real-world applications. To address this,
+we tackle the challenging task of learning a Generalizable Neural Human
+Renderer (GNH), a novel method for rendering animatable humans from monocular
+video without any test-time optimization. Our core method focuses on
+transferring appearance information from the input video to the output image
+plane by utilizing explicit body priors and multi-view geometry. To render the
+subject in the intended pose, we utilize a straightforward CNN-based image
+renderer, foregoing the more common ray-sampling or rasterizing-based rendering
+modules. Our GNH achieves remarkably generalizable, photorealistic rendering of
+unseen subjects through a three-stage process. We quantitatively and
+qualitatively demonstrate that GNH significantly surpasses current
+state-of-the-art methods, notably achieving a 31.3% improvement in LPIPS.
+
+
+
+
+
+ + ☆ BCFPL: Binary classification ConvNet based Fast Parking space + recognition with Low resolution image + + +
+ The automobile plays an important role in economic activity, especially in
+metropolitan areas. Under these circumstances, quickly finding available
+parking spaces has become a major concern for drivers. Meanwhile, public
+awareness of privacy is also growing, yet image-based parking space recognition
+methods pay little attention to privacy protection. In this paper, we propose
+BCFPL, a binary convolutional neural network with a lightweight design that can
+be trained on low-resolution parking space images and still offer reasonable
+recognition results. The parking space images were collected from various
+complex environments, covering different weather, occlusion conditions, and
+various camera angles. We conducted training and testing across different
+datasets and partial subsets. The experimental results show that the accuracy
+of BCFPL does not decrease compared with using the original-resolution images
+directly, and reaches the average level of existing mainstream methods. BCFPL
+also has low hardware requirements and fast recognition speed while meeting
+privacy requirements, so it has application potential in intelligent city
+construction and the autonomous driving field.
+
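+
+ As a rough illustration of the kind of lightweight binary classifier described
+above, a minimal PyTorch sketch could look as follows; the layer sizes and the
+32x32 grayscale input are assumptions for illustration, not the actual BCFPL
+design.
+
+    import torch
+    import torch.nn as nn
+
+    class TinyParkingNet(nn.Module):
+        """Illustrative lightweight binary classifier for low-resolution
+        parking-space crops; not the BCFPL architecture itself."""
+        def __init__(self):
+            super().__init__()
+            self.features = nn.Sequential(
+                nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),   # 32 -> 16
+                nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 16 -> 8
+                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
+            )
+            self.classifier = nn.Linear(32, 2)  # occupied vs. empty
+
+        def forward(self, x):
+            return self.classifier(self.features(x).flatten(1))
+
+    logits = TinyParkingNet()(torch.randn(4, 1, 32, 32))  # shape (4, 2)
+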
+
+
+
+
+ + ☆ Face2Face: Label-driven Facial Retouching Restoration + + +
+ With the popularity of social media platforms such as Instagram and TikTok, +and the widespread availability and convenience of retouching tools, an +increasing number of individuals are utilizing these tools to beautify their +facial photographs. This poses challenges for fields that place high demands on +the authenticity of photographs, such as identity verification and social +media. By altering facial images, users can easily create deceptive images, +leading to the dissemination of false information. This may pose challenges to +the reliability of identity verification systems and social media, and even +lead to online fraud. To address this issue, some work has proposed makeup +removal methods, but they still lack the ability to restore images involving +geometric deformations caused by retouching. To tackle the problem of facial +retouching restoration, we propose a framework, dubbed Face2Face, which +consists of three components: a facial retouching detector, an image +restoration model named FaceR, and a color correction module called +Hierarchical Adaptive Instance Normalization (H-AdaIN). Firstly, the facial +retouching detector predicts a retouching label containing three integers, +indicating the retouching methods and their corresponding degrees. Then FaceR +restores the retouched image based on the predicted retouching label. Finally, +H-AdaIN is applied to address the issue of color shift arising from diffusion +models. Extensive experiments demonstrate the effectiveness of our framework +and each module. + +
+
+
+
+
+ + ☆ FLDM-VTON: Faithful Latent Diffusion Model for Virtual Try-on IJCAI 2024 + + +
+ Despite their impressive generative performance, latent diffusion model-based +virtual try-on (VTON) methods lack faithfulness to crucial details of the +clothes, such as style, pattern, and text. To alleviate these issues caused by +the diffusion stochastic nature and latent supervision, we propose a novel +Faithful Latent Diffusion Model for VTON, termed FLDM-VTON. FLDM-VTON improves +the conventional latent diffusion process in three major aspects. First, we +propose incorporating warped clothes as both the starting point and local +condition, supplying the model with faithful clothes priors. Second, we +introduce a novel clothes flattening network to constrain generated try-on +images, providing clothes-consistent faithful supervision. Third, we devise a +clothes-posterior sampling for faithful inference, further enhancing the model +performance over conventional clothes-agnostic Gaussian sampling. Extensive +experimental results on the benchmark VITON-HD and Dress Code datasets +demonstrate that our FLDM-VTON outperforms state-of-the-art baselines and is +able to generate photo-realistic try-on images with faithful clothing details. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Text in the Dark: Extremely Low-Light Text Image Enhancement + + +
+ Extremely low-light text images are common in natural scenes, making scene +text detection and recognition challenging. One solution is to enhance these +images using low-light image enhancement methods before text extraction. +However, previous methods often do not try to particularly address the +significance of low-level features, which are crucial for optimal performance +on downstream scene text tasks. Further research is also hindered by the lack +of extremely low-light text datasets. To address these limitations, we propose +a novel encoder-decoder framework with an edge-aware attention module to focus +on scene text regions during enhancement. Our proposed method uses novel text +detection and edge reconstruction losses to emphasize low-level scene text +features, leading to successful text extraction. Additionally, we present a +Supervised Deep Curve Estimation (Supervised-DCE) model to synthesize extremely +low-light images based on publicly available scene text datasets such as +ICDAR15 (IC15). We also labeled texts in the extremely low-light See In the +Dark (SID) and ordinary LOw-Light (LOL) datasets to allow for objective +assessment of extremely low-light image enhancement through scene text tasks. +Extensive experiments show that our model outperforms state-of-the-art methods +in terms of both image quality and scene text metrics on the widely-used LOL, +SID, and synthetic IC15 datasets. Code and dataset will be released publicly at +https://github.com/chunchet-ng/Text-in-the-Dark. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ CRNet: A Detail-Preserving Network for Unified Image Restoration and + Enhancement Task CVPR2024 + + +
+ In real-world scenarios, images captured often suffer from blurring, noise, +and other forms of image degradation, and due to sensor limitations, people +usually can only obtain low dynamic range images. To achieve high-quality +images, researchers have attempted various image restoration and enhancement +operations on photographs, including denoising, deblurring, and high dynamic +range imaging. However, merely performing a single type of image enhancement +still cannot yield satisfactory images. In this paper, to deal with the +challenge above, we propose the Composite Refinement Network (CRNet) to address +this issue using multiple exposure images. By fully integrating +information-rich multiple exposure inputs, CRNet can perform unified image +restoration and enhancement. To improve the quality of image details, CRNet +explicitly separates and strengthens high and low-frequency information through +pooling layers, using specially designed Multi-Branch Blocks for effective +fusion of these frequencies. To increase the receptive field and fully +integrate input features, CRNet employs the High-Frequency Enhancement Module, +which includes large kernel convolutions and an inverted bottleneck ConvFFN. +Our model secured third place in the first track of the Bracketing Image +Restoration and Enhancement Challenge, surpassing previous SOTA models in both +testing metrics and visual quality. + +
+
+ comment: This paper is accepted by CVPR2024 Workshop, Code: + https://github.com/CalvinYang0/CRNet +
+
+
+
+
+ + ☆ Hierarchical localization with panoramic views and triplet loss + functions + + +
+ The main objective of this paper is to address the mobile robot localization
+problem with Triplet Convolutional Neural Networks and test their robustness
+against changes in lighting conditions. We have used omnidirectional images
+from real indoor environments captured in dynamic conditions that have been
+converted to panoramic format. Two approaches are proposed to address
+localization by means of triplet neural networks. The first is hierarchical
+localization, which estimates the robot position in two stages: a coarse
+localization, which involves a room retrieval task, followed by a fine
+localization addressed by means of image retrieval in the previously selected
+room. The second is global localization, which estimates the position of the
+robot within the entire map in a single step. In addition, an exhaustive study
+of the influence of the loss function on the network learning process has been
+carried out. The experimental section proves that triplet neural networks are
+an efficient and robust tool to address the localization of mobile robots in
+indoor environments, considering real operation conditions.
+
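+
+ For reference, the triplet loss that such networks are trained with pulls an
+anchor embedding towards a positive sample (an image of the same room or place)
+and pushes it away from a negative one; a minimal sketch, with the embedding
+size and margin chosen arbitrarily for illustration:
+
+    import torch
+    import torch.nn.functional as F
+
+    def triplet_loss(anchor, positive, negative, margin=1.0):
+        """Standard triplet margin loss over batches of embeddings."""
+        d_pos = F.pairwise_distance(anchor, positive)
+        d_neg = F.pairwise_distance(anchor, negative)
+        return F.relu(d_pos - d_neg + margin).mean()
+
+    emb = torch.randn(8, 128), torch.randn(8, 128), torch.randn(8, 128)
+    print(triplet_loss(*emb))
+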
+
+ comment: This work has been submitted to the Artificial Intelligence Journal + (Ed. Elsevier) for possible publication. Copyright may be transferred without + notice, after which this version may no longer be accessible +
+
+
+
+
+ + ☆ CKD: Contrastive Knowledge Distillation from A Sample-wise Perspective + + +
+ In this paper, we present a simple yet effective contrastive knowledge +distillation approach, which can be formulated as a sample-wise alignment +problem with intra- and inter-sample constraints. Unlike traditional knowledge +distillation methods that concentrate on maximizing feature similarities or +preserving class-wise semantic correlations between teacher and student +features, our method attempts to recover the "dark knowledge" by aligning +sample-wise teacher and student logits. Specifically, our method first +minimizes logit differences within the same sample by considering their +numerical values, thus preserving intra-sample similarities. Next, we bridge +semantic disparities by leveraging dissimilarities across different samples. +Note that constraints on intra-sample similarities and inter-sample +dissimilarities can be efficiently and effectively reformulated into a +contrastive learning framework with newly designed positive and negative pairs. +The positive pair consists of the teacher's and student's logits derived from +an identical sample, while the negative pairs are formed by using logits from +different samples. With this formulation, our method benefits from the +simplicity and efficiency of contrastive learning through the optimization of +InfoNCE, yielding a run-time complexity that is far less than $O(n^2)$, where +$n$ represents the total number of training samples. Furthermore, our method +can eliminate the need for hyperparameter tuning, particularly related to +temperature parameters and large batch sizes. We conduct comprehensive +experiments on three datasets including CIFAR-100, ImageNet-1K, and MS COCO. +Experimental results clearly confirm the effectiveness of the proposed method +on both image classification and object detection tasks. Our source codes will +be publicly available at https://github.com/wencheng-zhu/CKD. + +
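+
+ A minimal sketch of the sample-wise contrastive idea described above, assuming
+the positive pair is formed by the teacher and student logits of the same
+sample and the negatives by teacher logits of other samples in the batch; the
+normalization and temperature below are illustrative choices, not necessarily
+those of CKD:
+
+    import torch
+    import torch.nn.functional as F
+
+    def sample_wise_infonce(student_logits, teacher_logits, tau=0.1):
+        """InfoNCE with same-sample teacher/student logits as positives."""
+        s = F.normalize(student_logits, dim=-1)
+        t = F.normalize(teacher_logits, dim=-1)
+        sim = s @ t.t() / tau                    # (B, B) similarity matrix
+        labels = torch.arange(s.size(0))         # positives on the diagonal
+        return F.cross_entropy(sim, labels)
+
+    loss = sample_wise_infonce(torch.randn(16, 100), torch.randn(16, 100))
+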
+
+
+
+
+ + ☆ DynaMMo: Dynamic Model Merging for Efficient Class Incremental Learning + for Medical Images + + +
+ Continual learning, the ability to acquire knowledge from new data while +retaining previously learned information, is a fundamental challenge in machine +learning. Various approaches, including memory replay, knowledge distillation, +model regularization, and dynamic network expansion, have been proposed to +address this issue. Thus far, dynamic network expansion methods have achieved +state-of-the-art performance at the cost of incurring significant computational +overhead. This is due to the need for additional model buffers, which makes it +less feasible in resource-constrained settings, particularly in the medical +domain. To overcome this challenge, we propose Dynamic Model Merging, DynaMMo, +a method that merges multiple networks at different stages of model training to +achieve better computational efficiency. Specifically, we employ lightweight +learnable modules for each task and combine them into a unified model to +minimize computational overhead. DynaMMo achieves this without compromising +performance, offering a cost-effective solution for continual learning in +medical applications. We evaluate DynaMMo on three publicly available datasets, +demonstrating its effectiveness compared to existing approaches. DynaMMo offers +around 10-fold reduction in GFLOPS with a small drop of 2.76 in average +accuracy when compared to state-of-the-art dynamic-based approaches. The code +implementation of this work will be available upon the acceptance of this work +at https://github.com/BioMedIA-MBZUAI/DynaMMo. + +
+
+
+
+
+ + ☆ Research on Robot Path Planning Based on Reinforcement Learning + + +
+ This project has conducted research on robot path planning based on Visual +SLAM. The main work of this project is as follows: (1) Construction of Visual +SLAM system. Research has been conducted on the basic architecture of Visual +SLAM. A Visual SLAM system is developed based on ORB-SLAM3 system, which can +conduct dense point cloud mapping. (2) The map suitable for two-dimensional +path planning is obtained through map conversion. This part converts the dense +point cloud map obtained by Visual SLAM system into an octomap and then +performs projection transformation to the grid map. The map conversion converts +the dense point cloud map containing a large amount of redundant map +information into an extremely lightweight grid map suitable for path planning. +(3) Research on path planning algorithm based on reinforcement learning. This +project has conducted experimental comparisons between the Q-learning +algorithm, the DQN algorithm, and the SARSA algorithm, and found that DQN is +the algorithm with the fastest convergence and best performance in +high-dimensional complex environments. This project has conducted experimental +verification of the Visual SLAM system in a simulation environment. The +experimental results obtained based on open-source dataset and self-made +dataset prove the feasibility and effectiveness of the designed Visual SLAM +system. At the same time, this project has also conducted comparative +experiments on the three reinforcement learning algorithms under the same +experimental condition to obtain the optimal algorithm under the experimental +condition. + +
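+
+ For reference, the tabular Q-learning and SARSA updates compared in (3) differ
+only in how the bootstrap target is chosen; a minimal sketch, with grid-world
+state/action indices and hyperparameters chosen arbitrarily:
+
+    import numpy as np
+
+    def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
+        # Off-policy: bootstrap with the greedy action in the next state.
+        Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])
+
+    def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=0.99):
+        # On-policy: bootstrap with the action actually taken next.
+        Q[s, a] += alpha * (r + gamma * Q[s_next, a_next] - Q[s, a])
+
+    Q = np.zeros((100, 4))          # e.g. a 10x10 grid map with 4 moves
+    q_learning_update(Q, s=0, a=1, r=-1.0, s_next=10)
+    sarsa_update(Q, s=0, a=1, r=-1.0, s_next=10, a_next=2)
+
+ DQN replaces the table with a neural network trained against the same kind of
+bootstrap target, which is what allows it to cope with the high-dimensional
+environments mentioned above.
+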
+
+ comment: My undergrad final year project report, 44 pages and 15 figures +
+
+
+
+
+ + ☆ Noise contrastive estimation with soft targets for conditional models + + +
+ Soft targets combined with the cross-entropy loss have been shown to improve
+generalization performance of deep neural networks on supervised classification
+tasks. The standard cross-entropy loss however assumes data to be categorically
+distributed, which may often not be the case in practice. In contrast, InfoNCE
+does not rely on such an explicit assumption but instead implicitly estimates
+the true conditional through negative sampling. Unfortunately, it cannot be
+combined with soft targets in its standard formulation, hindering its use in
+combination with sophisticated training strategies. In this paper, we address
+this limitation by proposing a principled loss function that is compatible with
+probabilistic targets. Our new soft target InfoNCE loss is conceptually simple,
+efficient to compute, and can be derived within the framework of noise
+contrastive estimation. Using a toy example, we demonstrate shortcomings of the
+categorical distribution assumption of cross-entropy, and discuss implications
+of sampling from soft distributions. We observe that soft target InfoNCE
+performs on par with strong soft target cross-entropy baselines and outperforms
+hard target NLL and InfoNCE losses on popular benchmarks, including ImageNet.
+Finally, we provide a simple implementation of our loss, geared towards
+supervised classification and fully compatible with deep classification models
+trained with cross-entropy.
+
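+
+ The soft target InfoNCE loss itself is defined in the paper; for orientation,
+the soft-target cross-entropy baseline it is compared against can be sketched
+as follows (the shapes and the way the soft targets are produced are purely
+illustrative):
+
+    import torch
+    import torch.nn.functional as F
+
+    def soft_target_cross_entropy(logits, soft_targets):
+        """Cross-entropy against a full target distribution, not a hard label."""
+        log_probs = F.log_softmax(logits, dim=-1)
+        return -(soft_targets * log_probs).sum(dim=-1).mean()
+
+    logits = torch.randn(8, 10)
+    soft = F.softmax(torch.randn(8, 10), dim=-1)  # e.g. smoothed or distilled targets
+    print(soft_target_cross_entropy(logits, soft))
+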
+
+
+
+
+ + ☆ SHE-Net: Syntax-Hierarchy-Enhanced Text-Video Retrieval + + +
+ The user base of short video apps has experienced unprecedented growth in +recent years, resulting in a significant demand for video content analysis. In +particular, text-video retrieval, which aims to find the top matching videos +given text descriptions from a vast video corpus, is an essential function, the +primary challenge of which is to bridge the modality gap. Nevertheless, most +existing approaches treat texts merely as discrete tokens and neglect their +syntax structures. Moreover, the abundant spatial and temporal clues in videos +are often underutilized due to the lack of interaction with text. To address +these issues, we argue that using texts as guidance to focus on relevant +temporal frames and spatial regions within videos is beneficial. In this paper, +we propose a novel Syntax-Hierarchy-Enhanced text-video retrieval method +(SHE-Net) that exploits the inherent semantic and syntax hierarchy of texts to +bridge the modality gap from two perspectives. First, to facilitate a more +fine-grained integration of visual content, we employ the text syntax +hierarchy, which reveals the grammatical structure of text descriptions, to +guide the visual representations. Second, to further enhance the multi-modal +interaction and alignment, we also utilize the syntax hierarchy to guide the +similarity calculation. We evaluated our method on four public text-video +retrieval datasets of MSR-VTT, MSVD, DiDeMo, and ActivityNet. The experimental +results and ablation studies confirm the advantages of our proposed method. + +
+
+
+
+
+ + ☆ Multi-view Disentanglement for Reinforcement Learning with Multiple + Cameras + + +
+ The performance of image-based Reinforcement Learning (RL) agents can vary +depending on the position of the camera used to capture the images. Training on +multiple cameras simultaneously, including a first-person egocentric camera, +can leverage information from different camera perspectives to improve the +performance of RL. However, hardware constraints may limit the availability of +multiple cameras in real-world deployment. Additionally, cameras may become +damaged in the real-world preventing access to all cameras that were used +during training. To overcome these hardware constraints, we propose Multi-View +Disentanglement (MVD), which uses multiple cameras to learn a policy that +achieves zero-shot generalisation to any single camera from the training set. +Our approach is a self-supervised auxiliary task for RL that learns a +disentangled representation from multiple cameras, with a shared representation +that is aligned across all cameras to allow generalisation to a single camera, +and a private representation that is camera-specific. We show experimentally +that an RL agent trained on a single third-person camera is unable to learn an +optimal policy in many control tasks; but, our approach, benefiting from +multiple cameras during training, is able to solve the task using only the same +single third-person camera. + +
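+
+ A rough sketch of the shared/private split described above: each camera's
+feature vector is divided into a shared part, aligned across all cameras, and a
+private, camera-specific part; the cosine alignment loss below is only an
+illustration, not necessarily the objective used in MVD:
+
+    import torch
+    import torch.nn.functional as F
+
+    def split_shared_private(features, shared_dim):
+        """Split per-camera features into shared and private parts."""
+        return features[..., :shared_dim], features[..., shared_dim:]
+
+    def shared_alignment_loss(per_camera_features, shared_dim):
+        """Encourage the shared parts of all cameras to agree."""
+        shared = [split_shared_private(f, shared_dim)[0] for f in per_camera_features]
+        loss = 0.0
+        for i in range(len(shared)):
+            for j in range(i + 1, len(shared)):
+                loss = loss + (1 - F.cosine_similarity(shared[i], shared[j], dim=-1)).mean()
+        return loss
+
+    cams = [torch.randn(32, 64) for _ in range(3)]  # 3 cameras, batch of 32
+    print(shared_alignment_loss(cams, shared_dim=32))
+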
+
+
+
+
+ + ☆ GatedLexiconNet: A Comprehensive End-to-End Handwritten Paragraph Text + Recognition System + + +
+ The Handwritten Text Recognition problem has been a challenge for researchers
+for the last few decades, especially in the domain of computer vision, a
+subdomain of pattern recognition. Variability of texts amongst writers,
+cursiveness, and different font styles of handwritten texts, together with the
+degradation of historical text images, make it a challenging problem.
+Recognizing scanned document images in neural network-based systems typically
+involves a two-step approach: segmentation and recognition. However, this
+method has several drawbacks. These shortcomings encompass challenges in
+identifying text regions, analyzing layout diversity within pages, and
+establishing accurate ground truth segmentation. Consequently, these processes
+are prone to errors, leading to bottlenecks in achieving high recognition
+accuracies. Thus, in this study, we present an end-to-end paragraph recognition
+system that incorporates internal line segmentation and a gated convolutional
+layer-based encoder. Gating is a mechanism that controls the flow of
+information and allows adaptive selection of the most relevant features in
+handwritten text recognition models. The attention module plays an important
+role in performing internal line segmentation, allowing the page to be
+processed line-by-line. During the decoding step, we have integrated a
+connectionist temporal classification-based word beam search decoder as a
+post-processing step. In this work, we have extended the existing LexiconNet by
+carefully applying and utilizing gated convolutional layers in the existing
+deep neural network. Our results at line and page levels also favour our new
+GatedLexiconNet. This study reported character error rates of 2.27% on IAM,
+0.9% on RIMES, and 2.13% on READ-2016, and word error rates of 5.73% on IAM,
+2.76% on RIMES, and 6.52% on READ-2016 datasets.
+
+
+
+
+
+ + ☆ RingID: Rethinking Tree-Ring Watermarking for Enhanced Multi-Key + Identification + + +
+ We revisit Tree-Ring Watermarking, a recent diffusion model watermarking +method that demonstrates great robustness to various attacks. We conduct an +in-depth study on it and reveal that the distribution shift unintentionally +introduced by the watermarking process, apart from watermark pattern matching, +contributes to its exceptional robustness. Our investigation further exposes +inherent flaws in its original design, particularly in its ability to identify +multiple distinct keys, where distribution shift offers no assistance. Based on +these findings and analysis, we present RingID for enhanced multi-key +identification. It consists of a novel multi-channel heterogeneous watermarking +approach designed to seamlessly amalgamate distinctive advantages from diverse +watermarks. Coupled with a series of suggested enhancements, RingID exhibits +substantial advancements in multi-key identification. + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ☆ HashPoint: Accelerated Point Searching and Sampling for Neural Rendering CVPR2024 + + +
+ In this paper, we address the problem of efficient point searching and +sampling for volume neural rendering. Within this realm, two typical approaches +are employed: rasterization and ray tracing. The rasterization-based methods +enable real-time rendering at the cost of increased memory and lower fidelity. +In contrast, the ray-tracing-based methods yield superior quality but demand +longer rendering time. We solve this problem by our HashPoint method combining +these two strategies, leveraging rasterization for efficient point searching +and sampling, and ray marching for rendering. Our method optimizes point +searching by rasterizing points within the camera's view, organizing them in a +hash table, and facilitating rapid searches. Notably, we accelerate the +rendering process by adaptive sampling on the primary surface encountered by +the ray. Our approach yields substantial speed-up for a range of +state-of-the-art ray-tracing-based methods, maintaining equivalent or superior +accuracy across synthetic and real test datasets. The code will be available at +https://jiahao-ma.github.io/hashpoint/. + +
+
+ comment: CVPR2024 Highlight +
+
+
+
+
+ + ☆ CloudFort: Enhancing Robustness of 3D Point Cloud Classification Against + Backdoor Attacks via Spatial Partitioning and Ensemble Prediction + + +
+ The increasing adoption of 3D point cloud data in various applications, such +as autonomous vehicles, robotics, and virtual reality, has brought about +significant advancements in object recognition and scene understanding. +However, this progress is accompanied by new security challenges, particularly +in the form of backdoor attacks. These attacks involve inserting malicious +information into the training data of machine learning models, potentially +compromising the model's behavior. In this paper, we propose CloudFort, a novel +defense mechanism designed to enhance the robustness of 3D point cloud +classifiers against backdoor attacks. CloudFort leverages spatial partitioning +and ensemble prediction techniques to effectively mitigate the impact of +backdoor triggers while preserving the model's performance on clean data. We +evaluate the effectiveness of CloudFort through extensive experiments, +demonstrating its strong resilience against the Point Cloud Backdoor Attack +(PCBA). Our results show that CloudFort significantly enhances the security of +3D point cloud classification models without compromising their accuracy on +benign samples. Furthermore, we explore the limitations of CloudFort and +discuss potential avenues for future research in the field of 3D point cloud +security. The proposed defense mechanism represents a significant step towards +ensuring the trustworthiness and reliability of point-cloud-based systems in +real-world applications. + +
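+
+ As a toy illustration of combining spatial partitioning with ensemble
+prediction, one can split the cloud into octants around its centroid, classify
+it repeatedly with one octant left out (so a localized trigger is dropped in at
+least one vote), and take a majority vote; the actual CloudFort partitioning
+and voting scheme may differ from this sketch:
+
+    import numpy as np
+
+    def partition_point_cloud(points):
+        """Split an (N, 3) point cloud into octants around its centroid."""
+        c = points.mean(axis=0)
+        masks = []
+        for sx in (points[:, 0] < c[0], points[:, 0] >= c[0]):
+            for sy in (points[:, 1] < c[1], points[:, 1] >= c[1]):
+                for sz in (points[:, 2] < c[2], points[:, 2] >= c[2]):
+                    masks.append(sx & sy & sz)
+        return [points[m] for m in masks if m.any()]
+
+    def ensemble_predict(classifier, points):
+        """Classify with one octant removed at a time, then majority-vote."""
+        parts = partition_point_cloud(points)
+        votes = []
+        for i in range(len(parts)):
+            kept = np.concatenate([p for j, p in enumerate(parts) if j != i], axis=0)
+            votes.append(classifier(kept))
+        return int(np.bincount(votes).argmax())
+
+    pred = ensemble_predict(lambda pts: int(pts[:, 2].mean() > 0), np.random.randn(1024, 3))
+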
+
+
+
+
+ + ☆ Surgical-DeSAM: Decoupling SAM for Instrument Segmentation in Robotic + Surgery + + +
+ Purpose: The recent Segment Anything Model (SAM) has demonstrated impressive
+performance with point, text or bounding box prompts in various applications.
+However, in safety-critical surgical tasks, prompting is not possible due to
+(i) the lack of per-frame prompts for supervised learning, (ii) the
+impracticality of prompting frame-by-frame in a real-time tracking application,
+and (iii) the expense of annotating prompts for offline applications.
+ Methods: We develop Surgical-DeSAM to generate automatic bounding box prompts
+for decoupling SAM to obtain instrument segmentation in real-time robotic
+surgery. We utilise a commonly used detection architecture, DETR, and fine-tune
+it to obtain bounding box prompts for the instruments. We then employ
+decoupling SAM (DeSAM) by replacing the image encoder with the DETR encoder and
+fine-tuning the prompt encoder and mask decoder to obtain instance segmentation
+for the surgical instruments. To improve detection performance, we adopt the
+Swin Transformer for better feature representation.
+ Results: The proposed method has been validated on two publicly available
+datasets from the MICCAI surgical instruments segmentation challenge, EndoVis
+2017 and 2018. The performance of our method is also compared with SOTA
+instrument segmentation methods and demonstrates significant improvements, with
+dice metrics of 89.62 and 90.70 for EndoVis 2017 and 2018, respectively.
+ Conclusion: Our extensive experiments and validations demonstrate that
+Surgical-DeSAM enables real-time instrument segmentation without any additional
+prompting and outperforms other SOTA segmentation methods.
+
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian + Splatting + + +
+ Recent works on audio-driven talking head synthesis using Neural Radiance +Fields (NeRF) have achieved impressive results. However, due to inadequate pose +and expression control caused by NeRF implicit representation, these methods +still have some limitations, such as unsynchronized or unnatural lip movements, +and visual jitter and artifacts. In this paper, we propose GaussianTalker, a +novel method for audio-driven talking head synthesis based on 3D Gaussian +Splatting. With the explicit representation property of 3D Gaussians, intuitive +control of the facial motion is achieved by binding Gaussians to 3D facial +models. GaussianTalker consists of two modules, Speaker-specific Motion +Translator and Dynamic Gaussian Renderer. Speaker-specific Motion Translator +achieves accurate lip movements specific to the target speaker through +universalized audio feature extraction and customized lip motion generation. +Dynamic Gaussian Renderer introduces Speaker-specific BlendShapes to enhance +facial detail representation via a latent pose, delivering stable and realistic +rendered videos. Extensive experimental results suggest that GaussianTalker +outperforms existing state-of-the-art methods in talking head synthesis, +delivering precise lip synchronization and exceptional visual quality. Our +method achieves rendering speeds of 130 FPS on NVIDIA RTX4090 GPU, +significantly exceeding the threshold for real-time rendering performance, and +can potentially be deployed on other hardware platforms. + +
+
+
+
+
+ + ☆ PointDifformer: Robust Point Cloud Registration With Neural Diffusion + and Transformer + + +
+ Point cloud registration is a fundamental technique in 3-D computer vision +with applications in graphics, autonomous driving, and robotics. However, +registration tasks under challenging conditions, under which noise or +perturbations are prevalent, can be difficult. We propose a robust point cloud +registration approach that leverages graph neural partial differential +equations (PDEs) and heat kernel signatures. Our method first uses graph neural +PDE modules to extract high dimensional features from point clouds by +aggregating information from the 3-D point neighborhood, thereby enhancing the +robustness of the feature representations. Then, we incorporate heat kernel +signatures into an attention mechanism to efficiently obtain corresponding +keypoints. Finally, a singular value decomposition (SVD) module with learnable +weights is used to predict the transformation between two point clouds. +Empirical experiments on a 3-D point cloud dataset demonstrate that our +approach not only achieves state-of-the-art performance for point cloud +registration but also exhibits better robustness to additive noise or 3-D shape +perturbations. + +
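+
+ For context, mapping weighted correspondences to a rigid transform via SVD is
+the classical weighted Procrustes/Kabsch solution; a compact sketch is given
+below (the learnable weighting network itself is omitted, and the weights are
+just placeholders):
+
+    import torch
+
+    def weighted_kabsch(src, dst, w):
+        """Estimate R, t minimising sum_i w_i * ||R @ src_i + t - dst_i||^2."""
+        w = w / w.sum()
+        mu_src = (w[:, None] * src).sum(0)
+        mu_dst = (w[:, None] * dst).sum(0)
+        src_c, dst_c = src - mu_src, dst - mu_dst
+        H = src_c.t() @ (w[:, None] * dst_c)        # 3x3 weighted cross-covariance
+        U, S, Vt = torch.linalg.svd(H)
+        d = torch.sign(torch.det(Vt.t() @ U.t())).item()
+        R = Vt.t() @ torch.diag(torch.tensor([1.0, 1.0, d])) @ U.t()  # no reflection
+        t = mu_dst - R @ mu_src
+        return R, t
+
+    src, dst, w = torch.randn(100, 3), torch.randn(100, 3), torch.rand(100)
+    R, t = weighted_kabsch(src, dst, w)
+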
+
+ comment: Accepted by IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ☆ 1st Place Solution to the 1st SkatingVerse Challenge + + +
+ This paper presents the winning solution for the 1st SkatingVerse Challenge. +We propose a method that involves several steps. To begin, we leverage the DINO +framework to extract the Region of Interest (ROI) and perform precise cropping +of the raw video footage. Subsequently, we employ three distinct models, namely +Unmasked Teacher, UniformerV2, and InfoGCN, to capture different aspects of the +data. By ensembling the prediction results based on logits, our solution +attains an impressive leaderboard score of 95.73%. + +
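+
+ Ensembling on logits, as in the final step above, amounts to averaging the
+pre-softmax outputs of the individual models before taking the argmax; a
+minimal sketch with placeholder logits for the three models:
+
+    import numpy as np
+
+    # Hypothetical per-clip logits over C = 10 classes for the three models.
+    logits_umt, logits_uniformerv2, logits_infogcn = (np.random.randn(10) for _ in range(3))
+    ensembled = (logits_umt + logits_uniformerv2 + logits_infogcn) / 3.0
+    prediction = int(np.argmax(ensembled))
+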
+
+ comment: 3 pages, 1st SkatingVerse Challenge, 18th IEEE International + Conference on Automatic Face and Gesture Recognition workshop +
+
+
+
+
+ + ☆ OccFeat: Self-supervised Occupancy Feature Prediction for Pretraining + BEV Segmentation Networks + + +
+ We introduce a self-supervised pretraining method, called OccFeat, for
+camera-only Bird's-Eye-View (BEV) segmentation networks. With OccFeat, we
+pretrain a BEV network via occupancy prediction and feature distillation tasks.
+Occupancy prediction provides a 3D geometric understanding of the scene to the
+model. However, the geometry learned is class-agnostic. Hence, we add semantic
+information to the model in the 3D space through distillation from a
+self-supervised pretrained image foundation model. Models pretrained with our
+method exhibit improved BEV semantic segmentation performance, particularly in
+low-data scenarios. Moreover, empirical results affirm the efficacy of
+integrating feature distillation with 3D occupancy prediction in our
+pretraining approach.
+
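+
+ At a high level, the pretraining objective combines an occupancy term with a
+feature-distillation term; the sketch below shows one way such a combined loss
+could be written, with the loss weight, tensor shapes, and distillation target
+all being assumptions rather than the OccFeat specification:
+
+    import torch
+    import torch.nn.functional as F
+
+    def occupancy_plus_distillation_loss(pred_occ, gt_occ, pred_feat, teacher_feat, lam=1.0):
+        """Binary occupancy prediction plus cosine feature distillation."""
+        occ_loss = F.binary_cross_entropy_with_logits(pred_occ, gt_occ)
+        distill_loss = 1 - F.cosine_similarity(pred_feat, teacher_feat, dim=-1).mean()
+        return occ_loss + lam * distill_loss
+
+    loss = occupancy_plus_distillation_loss(
+        torch.randn(2, 200, 200, 16), torch.randint(0, 2, (2, 200, 200, 16)).float(),
+        torch.randn(2, 1000, 384), torch.randn(2, 1000, 384))
+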
+
+
+
+
+ + ☆ DHRNet: A Dual-Path Hierarchical Relation Network for Multi-Person Pose + Estimation + + +
+ Multi-person pose estimation (MPPE) presents a formidable yet crucial +challenge in computer vision. Most existing methods predominantly concentrate +on isolated interaction either between instances or joints, which is inadequate +for scenarios demanding concurrent localization of both instances and joints. +This paper introduces a novel CNN-based single-stage method, named Dual-path +Hierarchical Relation Network (DHRNet), to extract instance-to-joint and +joint-to-instance interactions concurrently. Specifically, we design a +dual-path interaction modeling module (DIM) that strategically organizes +cross-instance and cross-joint interaction modeling modules in two +complementary orders, enriching interaction information by integrating merits +from different correlation modeling branches. Notably, DHRNet excels in joint +localization by leveraging information from other instances and joints. +Extensive evaluations on challenging datasets, including COCO, CrowdPose, and +OCHuman datasets, showcase DHRNet's state-of-the-art performance. The code will +be released at https://github.com/YHDang/dhrnet-multi-pose-estimation. + +
+
+
+
+
+ + ☆ Collaborative Perception Datasets in Autonomous Driving: A Survey + + +
+ This survey offers a comprehensive examination of collaborative perception +datasets in the context of Vehicle-to-Infrastructure (V2I), Vehicle-to-Vehicle +(V2V), and Vehicle-to-Everything (V2X). It highlights the latest developments +in large-scale benchmarks that accelerate advancements in perception tasks for +autonomous vehicles. The paper systematically analyzes a variety of datasets, +comparing them based on aspects such as diversity, sensor setup, quality, +public availability, and their applicability to downstream tasks. It also +highlights the key challenges such as domain shift, sensor setup limitations, +and gaps in dataset diversity and availability. The importance of addressing +privacy and security concerns in the development of datasets is emphasized, +regarding data sharing and dataset creation. The conclusion underscores the +necessity for comprehensive, globally accessible datasets and collaborative +efforts from both technological and research communities to overcome these +challenges and fully harness the potential of autonomous driving. + +
+
+ comment: 8 pages,3 figures +
+
+
+
+
+ + ☆ A Multimodal Feature Distillation with CNN-Transformer Network for Brain + Tumor Segmentation with Incomplete Modalities + + +
+ Existing brain tumor segmentation methods usually utilize multiple Magnetic
+Resonance Imaging (MRI) modalities in brain tumor images for segmentation,
+which can achieve better segmentation performance. However, in clinical
+applications, some modalities are missing due to resource constraints, leading
+to severe degradation in the performance of methods that assume complete
+modalities. In this paper, we propose a Multimodal feature distillation with
+Convolutional Neural Network (CNN)-Transformer hybrid network (MCTSeg) for
+accurate brain tumor segmentation with missing modalities. We first design a
+Multimodal Feature Distillation (MFD) module to distill feature-level
+multimodal knowledge into the different unimodal branches in order to extract
+complete modality information. We further develop a Unimodal Feature
+Enhancement (UFE) module to model the relationship between global and local
+information semantically. Finally, we build a Cross-Modal Fusion (CMF) module
+to explicitly align the global correlations among different modalities even
+when some modalities are missing. Complementary features within and across
+different modalities are refined via the CNN-Transformer hybrid architectures
+in both the UFE and CMF modules, where local and global dependencies are both
+captured. Our ablation study demonstrates the importance of the proposed
+modules with CNN-Transformer networks and the convolutional blocks in
+Transformer for improving the performance of brain tumor segmentation with
+missing modalities. Extensive experiments on the BraTS2018 and BraTS2020
+datasets show that the proposed MCTSeg framework outperforms the
+state-of-the-art methods in missing modalities cases. Our code is available at:
+https://github.com/mkang315/MCTSeg.
+
+
+
+
+
+ + ☆ Ungeneralizable Examples CVPR2024 + + +
+ The training of contemporary deep learning models heavily relies on publicly +available data, posing a risk of unauthorized access to online data and raising +concerns about data privacy. Current approaches to creating unlearnable data +involve incorporating small, specially designed noises, but these methods +strictly limit data usability, overlooking its potential usage in authorized +scenarios. In this paper, we extend the concept of unlearnable data to +conditional data learnability and introduce \textbf{U}n\textbf{G}eneralizable +\textbf{E}xamples (UGEs). UGEs exhibit learnability for authorized users while +maintaining unlearnability for potential hackers. The protector defines the +authorized network and optimizes UGEs to match the gradients of the original +data and its ungeneralizable version, ensuring learnability. To prevent +unauthorized learning, UGEs are trained by maximizing a designated distance +loss in a common feature space. Additionally, to further safeguard the +authorized side from potential attacks, we introduce additional undistillation +optimization. Experimental results on multiple datasets and various networks +demonstrate that the proposed UGEs framework preserves data usability while +reducing training performance on hacker networks, even under different types of +attacks. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Infusion: Preventing Customized Text-to-Image Diffusion from Overfitting + + +
+ Text-to-image (T2I) customization aims to create images that embody specific
+visual concepts delineated in textual descriptions. However, existing works
+still face a major challenge: concept overfitting. To tackle this challenge, we
+first analyze overfitting, categorizing it into concept-agnostic overfitting,
+which undermines non-customized concept knowledge, and concept-specific
+overfitting, which confines customization to limited modalities, i.e.,
+backgrounds, layouts, and styles. To evaluate the overfitting degree, we
+further introduce two metrics, i.e., the Latent Fisher divergence and the
+Wasserstein metric, to measure the distribution changes of non-customized and
+customized concepts, respectively. Drawing from the analysis, we propose
+Infusion, a T2I customization method that enables the learning of target
+concepts without being constrained by limited training modalities, while
+preserving non-customized knowledge. Notably, Infusion achieves this with
+remarkable efficiency, requiring a mere 11KB of trained parameters. Extensive
+experiments also demonstrate that our approach outperforms state-of-the-art
+methods in both single and multi-concept customized generation.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Distilled Datamodel with Reverse Gradient Matching CVPR2024 + + +
+ The proliferation of large-scale AI models trained on extensive datasets has +revolutionized machine learning. With these models taking on increasingly +central roles in various applications, the need to understand their behavior +and enhance interpretability has become paramount. To investigate the impact of +changes in training data on a pre-trained model, a common approach is +leave-one-out retraining. This entails systematically altering the training +dataset by removing specific samples to observe resulting changes within the +model. However, retraining the model for each altered dataset presents a +significant computational challenge, given the need to perform this operation +for every dataset variation. In this paper, we introduce an efficient framework +for assessing data impact, comprising offline training and online evaluation +stages. During the offline training phase, we approximate the influence of +training data on the target model through a distilled synset, formulated as a +reversed gradient matching problem. For online evaluation, we expedite the +leave-one-out process using the synset, which is then utilized to compute the +attribution matrix based on the evaluation objective. Experimental evaluations, +including training data attribution and assessments of data quality, +demonstrate that our proposed method achieves comparable model behavior +evaluation while significantly speeding up the process compared to the direct +retraining method. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ CoFInAl: Enhancing Action Quality Assessment with Coarse-to-Fine + Instruction Alignment IJCAI 2024 + + +
+ Action Quality Assessment (AQA) is pivotal for quantifying actions across +domains like sports and medical care. Existing methods often rely on +pre-trained backbones from large-scale action recognition datasets to boost +performance on smaller AQA datasets. However, this common strategy yields +suboptimal results due to the inherent struggle of these backbones to capture +the subtle cues essential for AQA. Moreover, fine-tuning on smaller datasets +risks overfitting. To address these issues, we propose Coarse-to-Fine +Instruction Alignment (CoFInAl). Inspired by recent advances in large language +model tuning, CoFInAl aligns AQA with broader pre-trained tasks by +reformulating it as a coarse-to-fine classification task. Initially, it learns +grade prototypes for coarse assessment and then utilizes fixed sub-grade +prototypes for fine-grained assessment. This hierarchical approach mirrors the +judging process, enhancing interpretability within the AQA framework. +Experimental results on two long-term AQA datasets demonstrate CoFInAl achieves +state-of-the-art performance with significant correlation gains of 5.49% and +3.55% on Rhythmic Gymnastics and Fis-V, respectively. Our code is available at +https://github.com/ZhouKanglei/CoFInAl_AQA. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Challenges in automatic and selective plant-clearing + + +
+ With the advent of multispectral imagery and AI, there have been numerous +works on automatic plant segmentation for purposes such as counting, picking, +health monitoring, localized pesticide delivery, etc. In this paper, we tackle +the related problem of automatic and selective plant-clearing in a sustainable +forestry context, where an autonomous machine has to detect and avoid specific +plants while clearing any weeds which may compete with the species being +cultivated. Such an autonomous system requires a high level of robustness to +weather conditions, plant variability, terrain and weeds while remaining cheap +and easy to maintain. We notably discuss the lack of robustness of spectral +imagery, investigate the impact of the reference database's size and discuss +issues specific to AI systems operating in uncontrolled environments. + +
+
+
+
+
+ + ☆ Zero-Shot Character Identification and Speaker Prediction in Comics via + Iterative Multimodal Fusion + + +
+ Recognizing characters and predicting speakers of dialogue are critical for
+comic processing tasks, such as voice generation or translation. However,
+because characters vary by comic title, supervised learning approaches like
+training character classifiers, which require specific annotations for each
+comic title, are infeasible. This motivates us to propose a novel zero-shot
+approach, allowing machines to identify characters and predict speaker names
+based solely on unannotated comic images. In spite of their importance in
+real-world applications, these tasks have largely remained unexplored due to
+challenges in story comprehension and multimodal integration. Recent large
+language models (LLMs) have shown great capability for text understanding and
+reasoning, while their application to multimodal content analysis is still an
+open problem. To address this problem, we propose an iterative multimodal
+framework, the first to employ multimodal information for both character
+identification and speaker prediction tasks. Our experiments demonstrate the
+effectiveness of the proposed framework, establishing a robust baseline for
+these tasks. Furthermore, since our method requires no training data or
+annotations, it can be used as-is on any comic series.
+
+
+
+
+
+ + ☆ Dynamic Proxy Domain Generalizes the Crowd Localization by Better Binary + Segmentation + + +
+ Crowd localization aims at predicting the precise location of each instance
+within an image. Current advanced methods propose pixel-wise binary
+classification to tackle the congested prediction, in which pixel-level
+thresholds binarize the prediction confidence of being a pedestrian head.
+Since crowd scenes vary greatly in content, count and scale, the
+confidence-threshold learner is fragile and generalizes poorly under domain
+shift. Moreover, in most cases, the target domain is unknown during training.
+Hence, it is imperative to explore how to enhance the generalization of the
+confidence-threshold locator to the latent target domain. In this paper, we
+propose a Dynamic Proxy Domain (DPD) method to generalize the learner under
+domain shift. Concretely, based on a theoretical analysis of the upper bound of
+the generalization error risk of a binary classifier on the latent target
+domain, we propose introducing a generated proxy domain to facilitate
+generalization. Then, based on the theory, we design a DPD algorithm composed
+of a training paradigm and a proxy domain generator to enhance the domain
+generalization of the confidence-threshold learner. Besides, we evaluate our
+method on five kinds of domain shift scenarios, demonstrating its effectiveness
+in generalizing crowd localization. Our code will be available at
+https://github.com/zhangda1018/DPD.
+
+
+
+
+
+ + ☆ RHanDS: Refining Malformed Hands for Generated Images with Decoupled + Structure and Style Guidance + + +
+ Although diffusion models can generate high-quality human images, their
+applications are limited by the instability in generating hands with correct
+structures. Some previous works mitigate the problem by considering hand
+structure yet struggle to maintain style consistency between refined malformed
+hands and other image regions. In this paper, we aim to solve the problem of
+inconsistency regarding hand structure and style. We propose a conditional
+diffusion-based framework, RHanDS, to refine the hand region with the help of
+decoupled structure and style guidance. Specifically, the structure guidance is
+the hand mesh reconstructed from the malformed hand, serving to correct the
+hand structure. The style guidance is a hand image, e.g., the malformed hand
+itself, and is employed to furnish the style reference for hand refining. In
+order to suppress the structure leakage when referencing hand style and
+effectively utilize hand data to improve the capability of the model, we build
+a multi-style hand dataset and introduce a two-stage training strategy. In the
+first stage, we use paired hand images for training to generate hands with the
+same style as the reference. In the second stage, various hand images generated
+based on the human mesh are used for training to enable the model to gain
+control over the hand structure. We evaluate our method and counterparts on the
+test dataset of the proposed multi-style hand dataset. The experimental results
+show that RHanDS can refine hands with correct structure and style more
+effectively than previous methods. The codes and datasets will be available
+soon.
+
+
+
+
+
+ + ☆ Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph + Network + + +
+ Given a source portrait, the automatic human body reshaping task aims at
+editing it to an aesthetic body shape. As the technology has been widely used
+in media, several methods have been proposed mainly focusing on generating
+optical flow to warp the body shape. However, those previous works only
+consider the local transformation of different body parts (arms, torso, and
+legs), ignoring the global affinity, and limiting the capacity to ensure
+consistency and quality across the entire body. In this paper, we propose a
+novel Adaptive Affinity-Graph Network (AAGN), which extracts the global
+affinity between different body parts to enhance the quality of the generated
+optical flow. Specifically, our AAGN primarily introduces the following
+designs: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages
+the characteristic of a fully connected graph. AAG represents different body
+parts as nodes in an adaptive fully connected graph and captures all the
+affinities between nodes to obtain a global affinity map. This design better
+improves the consistency between body parts. (2) Besides, since high-frequency
+details are crucial for photo aesthetics, a Body Shape Discriminator (BSD) is
+designed to extract information from both the high-frequency and spatial
+domains. Particularly, an SRM filter is utilized to extract high-frequency
+details, which are combined with spatial features as input to the BSD. With
+this design, BSD guides the Flow Generator (FG) to pay attention to various
+fine details rather than rigid pixel-level fitting. Extensive experiments
+conducted on the BR-5K dataset demonstrate that our framework significantly
+enhances the aesthetic appeal of reshaped photos, marginally surpassing all
+previous work to achieve state-of-the-art results in all evaluation metrics.
+
+
+ comment: 11 pages; +
+
+
+
+
+ + ☆ Non-Uniform Exposure Imaging via Neuromorphic Shutter Control + + +
+ By leveraging the blur-noise trade-off, imaging with non-uniform exposures +largely extends the image acquisition flexibility in harsh environments. +However, the limitation of conventional cameras in perceiving intra-frame +dynamic information prevents existing methods from being implemented in the +real-world frame acquisition for real-time adaptive camera shutter control. To +address this challenge, we propose a novel Neuromorphic Shutter Control (NSC) +system to avoid motion blurs and alleviate instant noises, where the extremely +low latency of events is leveraged to monitor the real-time motion and +facilitate the scene-adaptive exposure. Furthermore, to stabilize the +inconsistent Signal-to-Noise Ratio (SNR) caused by the non-uniform exposure +times, we propose an event-based image denoising network within a +self-supervised learning paradigm, i.e., SEID, exploring the statistics of +image noises and inter-frame motion information of events to obtain artificial +supervision signals for high-quality imaging in real-world scenes. To +illustrate the effectiveness of the proposed NSC, we implement it in hardware +by building a hybrid-camera imaging prototype system, with which we collect a +real-world dataset containing well-synchronized frames and events in diverse +scenarios with different target scenes and motion patterns. Experiments on the +synthetic and real-world datasets demonstrate the superiority of our method +over state-of-the-art approaches. + +
+
+
+
+
+ + ☆ 360VOTS: Visual Object Tracking and Segmentation in Omnidirectional + Videos + + +
+ Visual object tracking and segmentation in omnidirectional videos are +challenging due to the wide field-of-view and large spherical distortion +brought by 360{\deg} images. To alleviate these problems, we introduce a novel +representation, extended bounding field-of-view (eBFoV), for target +localization and use it as the foundation of a general 360 tracking framework +which is applicable for both omnidirectional visual object tracking and +segmentation tasks. Building upon our previous work on omnidirectional visual +object tracking (360VOT), we propose a comprehensive dataset and benchmark that +incorporates a new component called omnidirectional video object segmentation +(360VOS). The 360VOS dataset includes 290 sequences accompanied by dense +pixel-wise masks and covers a broader range of target categories. To support +both the development and evaluation of algorithms in this domain, we divide the +dataset into a training subset with 170 sequences and a testing subset with 120 +sequences. Furthermore, we tailor evaluation metrics for both omnidirectional +tracking and segmentation to ensure rigorous assessment. Through extensive +experiments, we benchmark state-of-the-art approaches and demonstrate the +effectiveness of our proposed 360 tracking framework and training dataset. +Homepage: https://360vots.hkustvgd.com/ + +
+
+
+
+
+ + ☆ PeLiCal: Targetless Extrinsic Calibration via Penetrating Lines for + RGB-D Cameras with Limited Co-visibility + + +
+ RGB-D cameras are crucial in robotic perception, given their ability to +produce images augmented with depth data. However, their limited FOV often +requires multiple cameras to cover a broader area. In multi-camera RGB-D +setups, the goal is typically to reduce camera overlap, optimizing spatial +coverage with as few cameras as possible. The extrinsic calibration of these +systems introduces additional complexities. Existing methods for extrinsic +calibration either necessitate specific tools or highly depend on the accuracy +of camera motion estimation. To address these issues, we present PeLiCal, a +novel line-based calibration approach for RGB-D camera systems exhibiting +limited overlap. Our method leverages long line features from surroundings, and +filters out outliers with a novel convergence voting algorithm, achieving +targetless, real-time, and outlier-robust performance compared to existing +methods. We open source our implementation on +\url{https://github.com/joomeok/PeLiCal.git}. + +
+
+
+
+
+ + ☆ Boter: Bootstrapping Knowledge Selection and Question Answering for + Knowledge-based VQA + + +
+ Knowledge-based Visual Question Answering (VQA) requires models to +incorporate external knowledge to respond to questions about visual content. +Previous methods mostly follow the "retrieve and generate" paradigm. Initially, +they utilize a pre-trained retriever to fetch relevant knowledge documents, +subsequently employing them to generate answers. While these methods have +demonstrated commendable performance in the task, they possess limitations: (1) +they employ an independent retriever to acquire knowledge solely based on the +similarity between the query and knowledge embeddings, without assessing +whether the knowledge document is truly conducive to helping answer the +question; (2) they convert the image into text and then conduct retrieval and +answering in natural language space, which may not ensure comprehensive +acquisition of all image information. To address these limitations, we propose +Boter, a novel framework designed to bootstrap knowledge selection and question +answering by leveraging the robust multimodal perception capabilities of the +Multimodal Large Language Model (MLLM). The framework consists of two modules: +Selector and Answerer, where both are initialized by the MLLM and +parameter-efficiently finetuned in a simple cycle: find key knowledge in the +retrieved knowledge documents using the Selector, and then use them to finetune +the Answerer to predict answers; obtain the pseudo-labels of key knowledge +documents based on the predictions of the Answerer and weak supervision labels, +and then finetune the Selector to select key knowledge; repeat. Our framework +significantly enhances the performance of the baseline on the challenging +open-domain Knowledge-based VQA benchmark, OK-VQA, achieving a state-of-the-art +accuracy of 62.83%. + +
+
+
+
+
+ + ☆ Gorgeous: Create Your Desired Character Facial Makeup from Any Ideas + + +
+ Contemporary makeup transfer methods primarily focus on replicating makeup +from one face to another, considerably limiting their use in creating diverse +and creative character makeup essential for visual storytelling. Such methods +typically fail to address the need for uniqueness and contextual relevance, +specifically aligning with character and story settings as they depend heavily +on existing facial makeup in reference images. This approach also presents a +significant challenge when attempting to source a perfectly matched facial +makeup style, further complicating the creation of makeup designs inspired by +various story elements, such as theme, background, and props that do not +necessarily feature faces. To address these limitations, we introduce +$Gorgeous$, a novel diffusion-based makeup application method that goes beyond +simple transfer by innovatively crafting unique and thematic facial makeup. +Unlike traditional methods, $Gorgeous$ does not require the presence of a face +in the reference images. Instead, it draws artistic inspiration from a minimal +set of three to five images, which can be of any type, and transforms these +elements into practical makeup applications directly on the face. Our +comprehensive experiments demonstrate that $Gorgeous$ can effectively generate +distinctive character facial makeup inspired by the chosen thematic reference +images. This approach opens up new possibilities for integrating broader story +elements into character makeup, thereby enhancing the narrative depth and +visual impact in storytelling. + +
+
+ comment: Project page: https://github.com/JiaWeiSii/gorgeous/ +
+
+
+
+
+ + ☆ Exploring Kinetic Curves Features for the Classification of Benign and + Malignant Breast Lesions in DCE-MRI + + +
+ Breast cancer is the most common malignant tumor among women and the second
+leading cause of cancer-related death. Early diagnosis in clinical practice is
+crucial for timely treatment and prognosis. Dynamic contrast-enhanced magnetic
+resonance imaging (DCE-MRI) has shown great utility in preoperative diagnosis
+and in assessing therapy effects thanks to its capability to reflect the
+morphology and dynamic characteristics of breast lesions. However, most
+existing computer-assisted diagnosis algorithms only consider conventional
+radiomic features when classifying benign and malignant lesions in DCE-MRI. In
+this study, we propose to fully leverage the dynamic characteristics from the
+kinetic curves as well as the radiomic features to boost the classification
+accuracy of benign and malignant breast lesions. The proposed method is a fully
+automated solution that directly analyzes the 3D features from the DCE-MRI. It
+is evaluated on an in-house dataset including 200 DCE-MRI scans with 298 breast
+tumors (172 benign and 126 malignant tumors), achieving favorable
+classification accuracy with an area under the curve (AUC) of 0.94.
+Simultaneously considering the dynamic and radiomic features makes it possible
+to effectively distinguish between benign and malignant breast lesions.
+
+
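+ To make the kinetic-curve idea concrete, the sketch below computes standard
+semi-quantitative features (time to peak, wash-in rate, wash-out slope) from a
+single DCE-MRI time-intensity curve with NumPy. The feature definitions and the
+toy curve are illustrative assumptions, not the exact features used in the
+paper.
+
+```python
+import numpy as np
+
+def kinetic_features(intensity, times):
+    """Semi-quantitative features of a DCE-MRI time-intensity curve.
+
+    `intensity` and `times` are 1D arrays of equal length; the first sample is
+    assumed to be the pre-contrast baseline. Definitions are illustrative.
+    """
+    baseline = intensity[0]
+    enhancement = (intensity - baseline) / max(baseline, 1e-6)  # relative enhancement
+    peak_idx = int(np.argmax(enhancement))
+    time_to_peak = times[peak_idx] - times[0]
+    wash_in = enhancement[peak_idx] / max(time_to_peak, 1e-6)
+    # Wash-out slope: linear fit of enhancement after the peak (negative = wash-out).
+    if peak_idx < len(times) - 1:
+        wash_out = np.polyfit(times[peak_idx:], enhancement[peak_idx:], 1)[0]
+    else:
+        wash_out = 0.0
+    return {"time_to_peak": time_to_peak, "wash_in_rate": wash_in,
+            "wash_out_slope": wash_out, "peak_enhancement": enhancement[peak_idx]}
+
+# Example: a malignant-like curve (fast wash-in, clear wash-out).
+t = np.array([0.0, 60.0, 120.0, 180.0, 240.0, 300.0])     # seconds
+s = np.array([100.0, 260.0, 240.0, 220.0, 205.0, 195.0])  # signal intensity
+print(kinetic_features(s, t))
+```
+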
+
+ comment: 6 pages, 8 figures, conference +
+
+
+
+
+ + ☆ MaterialSeg3D: Segmenting Dense Materials from 2D Priors for 3D Assets + + +
+ Driven by powerful image diffusion models, recent research has achieved the +automatic creation of 3D objects from textual or visual guidance. By performing +score distillation sampling (SDS) iteratively across different views, these +methods succeed in lifting 2D generative prior to the 3D space. However, such a +2D generative image prior bakes the effect of illumination and shadow into the +texture. As a result, material maps optimized by SDS inevitably involve +spurious correlated components. The absence of precise material definition +makes it infeasible to relight the generated assets reasonably in novel scenes, +which limits their application in downstream scenarios. In contrast, humans can +effortlessly circumvent this ambiguity by deducing the material of the object +from its appearance and semantics. Motivated by this insight, we propose +MaterialSeg3D, a 3D asset material generation framework to infer underlying +material from the 2D semantic prior. Based on such a prior model, we devise a +mechanism to parse material in 3D space. We maintain a UV stack, each map of +which is unprojected from a specific viewpoint. After traversing all +viewpoints, we fuse the stack through a weighted voting scheme and then employ +region unification to ensure the coherence of the object parts. To fuel the +learning of semantics prior, we collect a material dataset, named Materialized +Individual Objects (MIO), which features abundant images, diverse categories, +and accurate annotations. Extensive quantitative and qualitative experiments +demonstrate the effectiveness of our method. + +
+
+
+
+
+ + ☆ NeRF-DetS: Enhancing Multi-View 3D Object Detection with + Sampling-adaptive Network of Continuous NeRF-based Representation + + +
+ As a preliminary work, NeRF-Det unifies the tasks of novel view synthesis and
+3D perception, demonstrating that perceptual tasks can benefit from novel view
+synthesis methods like NeRF, significantly improving the performance of indoor
+multi-view 3D object detection. Using the geometry MLP of NeRF to direct the
+attention of the detection head to crucial parts and incorporating a
+self-supervised loss from novel view rendering contribute to the achieved
+improvement. To better leverage the notable advantages of the continuous
+representation through neural rendering in space, we introduce a novel 3D
+perception network structure, NeRF-DetS. The key component of NeRF-DetS is the
+Multi-level Sampling-Adaptive Network, making the sampling process adaptive
+from coarse to fine. Also, we propose a superior multi-view information fusion
+method, known as Multi-head Weighted Fusion. This fusion approach efficiently
+addresses the challenge of losing multi-view information when using the
+arithmetic mean, while keeping low computational costs. NeRF-DetS outperforms
+the competitive NeRF-Det on the ScanNetV2 dataset, achieving +5.02% and +5.92%
+improvements in mAP@.25 and mAP@.50, respectively.
+
+
+
+
+
+
+ + ☆ Cross-Task Multi-Branch Vision Transformer for Facial Expression and + Mask Wearing Classification + + +
+ With wearing masks becoming a new cultural norm, facial expression +recognition (FER) while taking masks into account has become a significant +challenge. In this paper, we propose a unified multi-branch vision transformer +for facial expression recognition and mask wearing classification tasks. Our +approach extracts shared features for both tasks using a dual-branch +architecture that obtains multi-scale feature representations. Furthermore, we +propose a cross-task fusion phase that processes tokens for each task with +separate branches, while exchanging information using a cross attention module. +Our proposed framework reduces the overall complexity compared with using +separate networks for both tasks by the simple yet effective cross-task fusion +phase. Extensive experiments demonstrate that our proposed model performs +better than or on par with different state-of-the-art methods on both facial +expression recognition and facial mask wearing classification task. + +
+
+
+
+
+ + ☆ Brain-Inspired Continual Learning-Robust Feature Distillation and + Re-Consolidation for Class Incremental Learning + + +
+ Artificial intelligence (AI) and neuroscience share a rich history, with +advancements in neuroscience shaping the development of AI systems capable of +human-like knowledge retention. Leveraging insights from neuroscience and +existing research in adversarial and continual learning, we introduce a novel +framework comprising two core concepts: feature distillation and +re-consolidation. Our framework, named Robust Rehearsal, addresses the +challenge of catastrophic forgetting inherent in continual learning (CL) +systems by distilling and rehearsing robust features. Inspired by the mammalian +brain's memory consolidation process, Robust Rehearsal aims to emulate the +rehearsal of distilled experiences during learning tasks. Additionally, it +mimics memory re-consolidation, where new experiences influence the integration +of past experiences to mitigate forgetting. Extensive experiments conducted on +CIFAR10, CIFAR100, and real-world helicopter attitude datasets showcase the +superior performance of CL models trained with Robust Rehearsal compared to +baseline methods. Furthermore, examining different optimization training +objectives-joint, continual, and adversarial learning-we highlight the crucial +role of feature learning in model performance. This underscores the +significance of rehearsing CL-robust samples in mitigating catastrophic +forgetting. In conclusion, aligning CL approaches with neuroscience insights +offers promising solutions to the challenge of catastrophic forgetting, paving +the way for more robust and human-like AI systems. + +
+
+
+
+
+ + ☆ The Adversarial AI-Art: Understanding, Generation, Detection, and + Benchmarking + + +
+ Generative AI models can produce high-quality images based on text prompts.
+The generated images often appear indistinguishable from images generated by
+conventional optical photography devices or created by human artists (i.e.,
+real images). While the outstanding performance of such generative models is
+generally well received, security concerns arise. For instance, such image
+generators could be used to facilitate fraud or scam schemes, generate and
+spread misinformation, or produce fabricated artworks. In this paper, we
+present a systematic attempt at understanding and detecting AI-generated images
+(AI-art) in adversarial scenarios. First, we collect and share a dataset of
+real images and their corresponding artificial counterparts generated by four
+popular AI image generators. The dataset, named ARIA, contains over 140K images
+in five categories: artworks (painting), social media images, news photos,
+disaster scenes, and anime pictures. This dataset can be used as a foundation
+to support future research on adversarial AI-art. Next, we present a user study
+that employs the ARIA dataset to evaluate whether real-world users can
+distinguish real images from AI-generated ones, with or without reference
+images. In a benchmarking study, we further evaluate whether state-of-the-art
+open-source and commercial AI image detectors can effectively identify the
+images in the ARIA dataset. Finally, we present a ResNet-50 classifier and
+evaluate its accuracy and transferability on the ARIA dataset.
+
+
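+ The ResNet-50 baseline mentioned above is a standard binary fine-tuning setup.
+A minimal sketch with torchvision is shown below; the folder layout
+("aria/train/{real,ai}"), batch size, learning rate, and epoch count are
+assumptions for illustration, not the paper's official training recipe.
+
+```python
+import torch
+import torch.nn as nn
+from torchvision import datasets, models, transforms
+
+# Hypothetical folder layout: aria/train/{real,ai}/*.jpg -- not the official loader.
+tf = transforms.Compose([
+    transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(),
+    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+])
+train_set = datasets.ImageFolder("aria/train", transform=tf)
+loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
+
+model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+model.fc = nn.Linear(model.fc.in_features, 2)   # real vs. AI-generated
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+
+opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
+loss_fn = nn.CrossEntropyLoss()
+for epoch in range(5):                           # epochs/lr are illustrative
+    for x, y in loader:
+        x, y = x.to(device), y.to(device)
+        opt.zero_grad()
+        loss = loss_fn(model(x), y)
+        loss.backward()
+        opt.step()
+```
+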
+
+
+
+
+ + ☆ UVMap-ID: A Controllable and Personalized UV Map Generative Model + + +
+ Recently, diffusion models have made significant strides in synthesizing +realistic 2D human images based on provided text prompts. Building upon this, +researchers have extended 2D text-to-image diffusion models into the 3D domain +for generating human textures (UV Maps). However, some important problems about +UV Map Generative models are still not solved, i.e., how to generate +personalized texture maps for any given face image, and how to define and +evaluate the quality of these generated texture maps. To solve the above +problems, we introduce a novel method, UVMap-ID, which is a controllable and +personalized UV Map generative model. Unlike traditional large-scale training +methods in 2D, we propose to fine-tune a pre-trained text-to-image diffusion +model which is integrated with a face fusion module for achieving ID-driven +customized generation. To support the finetuning strategy, we introduce a +small-scale attribute-balanced training dataset, including high-quality +textures with labeled text and Face ID. Additionally, we introduce some metrics +to evaluate the multiple aspects of the textures. Finally, both quantitative +and qualitative analyses demonstrate the effectiveness of our method in +controllable and personalized UV Map generation. Code is publicly available via +https://github.com/twowwj/UVMap-ID. + +
+
+
+
+
+ + ☆ "Where am I?" Scene Retrieval with Language + + +
+ Natural language interfaces to embodied AI are becoming more ubiquitous in +our daily lives. This opens further opportunities for language-based +interaction with embodied agents, such as a user instructing an agent to +execute some task in a specific location. For example, "put the bowls back in +the cupboard next to the fridge" or "meet me at the intersection under the red +sign." As such, we need methods that interface between natural language and map +representations of the environment. To this end, we explore the question of +whether we can use an open-set natural language query to identify a scene +represented by a 3D scene graph. We define this task as "language-based +scene-retrieval" and it is closely related to "coarse-localization," but we are +instead searching for a match from a collection of disjoint scenes and not +necessarily a large-scale continuous map. Therefore, we present +Text2SceneGraphMatcher, a "scene-retrieval" pipeline that learns joint +embeddings between text descriptions and scene graphs to determine if they are +matched. The code, trained models, and datasets will be made public. + +
+
+
+
+
+ + ☆ Adaptive Local Binary Pattern: A Novel Feature Descriptor for Enhanced + Analysis of Kidney Abnormalities in CT Scan Images using ensemble based + Machine Learning Approach + + +
+ The shortage of nephrologists and the growing public health concern over
+renal failure have spurred the demand for AI systems capable of autonomously
+detecting kidney abnormalities. Renal failure, marked by a gradual decline in
+kidney function, can result from factors like cysts, stones, and tumors.
+Chronic kidney disease may go unnoticed initially, leading to untreated cases
+until they reach an advanced stage. The dataset, comprising 12,427 images from
+multiple hospitals in Dhaka, was categorized into four groups: cyst, tumor,
+stone, and normal. Our methodology aims to enhance CT scan image quality using
+Cropping, Resizing, and CLAHE techniques, followed by feature extraction with
+our proposed Adaptive Local Binary Pattern (A-LBP) method, compared with the
+state-of-the-art local binary pattern (LBP) method. The proposed features are
+fed into classifiers such as Random Forest, Decision Tree, Naive Bayes,
+K-Nearest Neighbor, and SVM. We explored an ensemble model with soft voting to
+get a more robust model for our task. We achieved the highest accuracy, over
+99%, using our feature descriptor and an ensemble of five classifiers (Random
+Forest, Decision Tree, Naive Bayes, K-Nearest Neighbor, Support Vector Machine)
+with the soft voting method.
+
+
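+ The soft-voting ensemble of the five named classifiers maps directly onto
+scikit-learn's VotingClassifier. A minimal sketch follows; the random
+stand-in features, feature dimensionality, and hyperparameters are assumptions,
+and the A-LBP feature extraction itself is not shown.
+
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.model_selection import cross_val_score
+
+# X would hold A-LBP histograms per CT image; y the {cyst, tumor, stone, normal}
+# labels. Random data is used here as a stand-in for the real features.
+rng = np.random.default_rng(0)
+X, y = rng.normal(size=(400, 59)), rng.integers(0, 4, size=400)
+
+ensemble = VotingClassifier(
+    estimators=[
+        ("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
+        ("dt", DecisionTreeClassifier(random_state=0)),
+        ("nb", GaussianNB()),
+        ("knn", KNeighborsClassifier(n_neighbors=5)),
+        ("svm", SVC(probability=True, random_state=0)),  # probability=True enables soft voting
+    ],
+    voting="soft",  # average predicted class probabilities across the five models
+)
+print(cross_val_score(ensemble, X, y, cv=5).mean())
+```
+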
+
+ comment: 17 pages, 5 tables, 4 figures +
+
+
+
+
+ + ☆ UVEB: A Large-scale Benchmark and Baseline Towards Real-World Underwater + Video Enhancement CVPR2024 + + +
+ Learning-based underwater image enhancement (UIE) methods have made great
+progress. However, the lack of large-scale and high-quality paired training
+samples has become the main bottleneck hindering the development of UIE. The
+inter-frame information in underwater videos can accelerate or optimize the UIE
+process. Thus, we constructed the first large-scale high-resolution underwater
+video enhancement benchmark (UVEB) to promote the development of underwater
+vision. It contains 1,308 pairs of video sequences and more than 453,000
+high-resolution frame pairs, 38% of which are Ultra-High-Definition (UHD) 4K.
+UVEB comes from multiple countries, containing various scenes and video
+degradation types to adapt to diverse and complex underwater environments. We
+also propose the first supervised underwater video enhancement method, UVE-Net.
+UVE-Net converts the current frame information into convolutional kernels and
+passes them to adjacent frames for efficient inter-frame information exchange.
+By fully utilizing the redundant degraded information of underwater videos,
+UVE-Net performs video enhancement better. Experiments demonstrate the
+effective network design and good performance of UVE-Net.
+
+
+
+ comment: 10 pages,CVPR2024 accept +
+
+
+
+
+ + ☆ SwinFuSR: an image fusion-inspired model for RGB-guided thermal image + super-resolution CVPR 2024 + + +
+ Thermal imaging plays a crucial role in various applications, but the
+inherent low resolution of commonly available infrared (IR) cameras limits its
+effectiveness. Conventional super-resolution (SR) methods often struggle with
+thermal images due to their lack of high-frequency details. Guided SR leverages
+information from a high-resolution image, typically in the visible spectrum, to
+enhance the reconstruction of a high-res IR image from the low-res input.
+Inspired by SwinFusion, we propose SwinFuSR, a guided SR architecture based on
+Swin transformers. In real-world scenarios, however, the guiding modality (e.g.,
+an RGB image) may be missing, so we propose a training method that improves the
+robustness of the model in this case. Our method has few parameters and
+outperforms state-of-the-art models in terms of Peak Signal to Noise Ratio
+(PSNR) and Structural SIMilarity (SSIM). In Track 2 of the PBVS 2024 Thermal
+Image Super-Resolution Challenge, it achieves 3rd place in the PSNR metric. Our
+code and pretrained weights are available at
+https://github.com/VisionICLab/SwinFuSR.
+
+
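+ One common way to make a guided network robust to a missing guide, and only a
+guess at what the training method above might look like, is to randomly drop
+the RGB guide during training so the network also learns to rely on the thermal
+input alone. A minimal PyTorch sketch of such guide dropout follows; the drop
+rate, zero-filled placeholder, and usage are assumptions, not SwinFuSR's exact
+recipe.
+
+```python
+import torch
+
+def maybe_drop_guide(rgb_guide, p_drop=0.3):
+    # rgb_guide: (B, 3, H, W). Per-sample Bernoulli mask; dropped samples get a
+    # zero guide so the SR network must rely on the thermal input alone.
+    keep = (torch.rand(rgb_guide.shape[0], 1, 1, 1,
+                       device=rgb_guide.device) > p_drop).float()
+    return rgb_guide * keep
+
+# Inside a training loop (model, lr_thermal, hr_target assumed to exist):
+#   guide = maybe_drop_guide(rgb_guide)
+#   sr = model(lr_thermal, guide)
+#   loss = torch.nn.functional.l1_loss(sr, hr_target)
+```
+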
+
+ comment: Accepted at 20th IEEE Workshop on Perception Beyond the Visible + Spectrum, CVPR 2024 +
+
+
+
+
+ + ☆ Align Your Steps: Optimizing Sampling Schedules in Diffusion Models + + +
+ Diffusion models (DMs) have established themselves as the state-of-the-art +generative modeling approach in the visual domain and beyond. A crucial +drawback of DMs is their slow sampling speed, relying on many sequential +function evaluations through large neural networks. Sampling from DMs can be +seen as solving a differential equation through a discretized set of noise +levels known as the sampling schedule. While past works primarily focused on +deriving efficient solvers, little attention has been given to finding optimal +sampling schedules, and the entire literature relies on hand-crafted +heuristics. In this work, for the first time, we propose a general and +principled approach to optimizing the sampling schedules of DMs for +high-quality outputs, called $\textit{Align Your Steps}$. We leverage methods +from stochastic calculus and find optimal schedules specific to different +solvers, trained DMs and datasets. We evaluate our novel approach on several +image, video as well as 2D toy data synthesis benchmarks, using a variety of +different samplers, and observe that our optimized schedules outperform +previous hand-crafted schedules in almost all experiments. Our method +demonstrates the untapped potential of sampling schedule optimization, +especially in the few-step synthesis regime. + +
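+ For context on what "hand-crafted heuristics" means here, the snippet below
+builds the widely used EDM (Karras et al.) sigma schedule, which interpolates
+between sigma_max and sigma_min in sigma**(1/rho) space. Align Your Steps would
+replace such a fixed schedule with one optimized per solver, model, and dataset;
+the defaults shown are the common EDM values, not an output of the paper.
+
+```python
+import numpy as np
+
+def karras_sigmas(n_steps, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+    """Hand-crafted EDM noise schedule: interpolate in sigma**(1/rho) space."""
+    ramp = np.linspace(0.0, 1.0, n_steps)
+    inv_rho = 1.0 / rho
+    sigmas = (sigma_max**inv_rho
+              + ramp * (sigma_min**inv_rho - sigma_max**inv_rho)) ** rho
+    return np.append(sigmas, 0.0)  # final step reaches sigma = 0
+
+# A 10-step schedule, as would be passed to an ODE/SDE sampler.
+print(karras_sigmas(10))
+```
+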
+
+ comment: Project page: + https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/ +
+
+
+
+
+ + ☆ Narrative Action Evaluation with Prompt-Guided Multimodal Interaction CVPR 2024 + + +
+ In this paper, we investigate a new problem called narrative action
+evaluation (NAE). NAE aims to generate professional commentary that evaluates
+the execution of an action. Unlike traditional tasks such as score-based action
+quality assessment and video captioning involving superficial sentences, NAE
+focuses on creating detailed narratives in natural language. These narratives
+provide intricate descriptions of actions along with objective evaluations. NAE
+is a more challenging task because it requires both narrative flexibility and
+evaluation rigor. One existing possible solution is to use multi-task learning,
+where narrative language and evaluative information are predicted separately.
+However, this approach results in reduced performance for individual tasks
+because of variations between tasks and differences in modality between
+language information and evaluation information. To address this, we propose a
+prompt-guided multimodal interaction framework. This framework utilizes a pair
+of transformers to facilitate the interaction between different modalities of
+information. It also uses prompts to transform the score regression task into a
+video-text matching task, thus enabling task interactivity. To support further
+research in this field, we re-annotate the MTL-AQA and FineGym datasets with
+high-quality and comprehensive action narration. Additionally, we establish
+benchmarks for NAE. Extensive experiment results prove that our method
+outperforms separate learning methods and naive multi-task learning methods.
+Data and code are released at https://github.com/shiyi-zh0408/NAE_CVPR2024.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Global OpenBuildingMap -- Unveiling the Mystery of Global Buildings + + +
+ Understanding how buildings are distributed globally is crucial to revealing +the human footprint on our home planet. This built environment affects local +climate, land surface albedo, resource distribution, and many other key factors +that influence well-being and human health. Despite this, quantitative and +comprehensive data on the distribution and properties of buildings worldwide is +lacking. To this end, by using a big data analytics approach and nearly 800,000 +satellite images, we generated the highest resolution and highest accuracy +building map ever created: the Global OpenBuildingMap (Global OBM). A joint +analysis of building maps and solar potentials indicates that rooftop solar +energy can supply the global energy consumption need at a reasonable cost. +Specifically, if solar panels were placed on the roofs of all buildings, they +could supply 1.1-3.3 times -- depending on the efficiency of the solar device +-- the global energy consumption in 2020, which is the year with the highest +consumption on record. We also identified a clear geospatial correlation +between building areas and key socioeconomic variables, which indicates our +global building map can serve as an important input to modeling global +socioeconomic needs and drivers. + +
+
+
+
+
+ + ☆ Deep Regression Representation Learning with Topology + + +
+ Most works studying representation learning focus only on classification and
+neglect regression. Yet, the learning objectives and therefore the
+representation topologies of the two tasks are fundamentally different:
+classification targets class separation, leading to disconnected
+representations, whereas regression requires ordinality with respect to the
+target, leading to continuous representations. We thus wonder how the
+effectiveness of a regression representation is influenced by its topology,
+with evaluation based on the Information Bottleneck (IB) principle.
+ The IB principle is an important framework that provides principles for
+learning effective representations. We establish two connections between it
+and the topology of regression representations. The first connection reveals
+that a lower intrinsic dimension of the feature space implies a reduced
+complexity of the representation Z. This complexity can be quantified as the
+conditional entropy of Z given the target space Y and serves as an upper bound
+on the generalization error. The second connection suggests that learning a
+feature space that is topologically similar to the target space will better
+align with the IB principle. Based on these two connections, we introduce
+PH-Reg, a regularizer specific to regression that matches the intrinsic
+dimension and topology of the feature space with the target space. Experiments
+on synthetic and real-world regression tasks demonstrate the benefits of
+PH-Reg.
+
+
+
+
+
+
+ + ☆ MambaUIE&SR: Unraveling the Ocean's Secrets with Only 2.8 FLOPs + + +
+ Underwater Image Enhancement (UIE) techniques aim to address the problem of
+underwater image degradation due to light absorption and scattering. In recent
+years, both Convolutional Neural Network (CNN)-based and Transformer-based
+methods have been widely explored. In addition, combining CNN and Transformer
+can effectively combine global and local information for enhancement. However,
+this approach is still affected by the quadratic complexity of the Transformer
+and cannot maximize the performance. Recently, the state-space model (SSM)-based
+architecture Mamba has been proposed, which excels at modeling long-range
+dependencies while maintaining linear complexity. This paper explores the
+potential of this SSM-based model for UIE from both efficiency and
+effectiveness perspectives. However, the performance of directly applying Mamba
+is poor because local fine-grained features, which are crucial for image
+enhancement, cannot be fully utilized. To this end, we customize the MambaUIE
+architecture for efficient UIE. Specifically, we introduce visual state space
+(VSS) blocks to capture global contextual information at the macro level while
+mining local information at the micro level. Also, for these two kinds of
+information, we propose a Dynamic Interaction Block (DIB) and Spatial
+feed-forward Network (SGFN) for intra-block feature aggregation. MambaUIE is
+able to efficiently synthesize global and local information and maintains a
+very small number of parameters with high accuracy. Experiments on the UIEB
+dataset show that our method reduces GFLOPs by 67.4% (2.715G) relative to the
+SOTA method. To the best of our knowledge, this is the first UIE model
+constructed based on SSM that breaks the limitation of FLOPs on accuracy in
+UIE. The official repository of MambaUIE is available at
+https://github.com/1024AILab/MambaUIE.
+
+
+
+
+
+
+ + ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style
+transfer. Existing methods often suffer from the drawback of applying style
+homogeneously across the entire image, leading to stylistic inconsistencies or
+distorted foreground objects when applied to images with foreground elements
+such as human figures. To address this limitation, we propose a new approach
+that leverages a segmentation network to precisely isolate foreground objects
+within the input image. Subsequently, style transfer is applied exclusively to
+the background region. The isolated foreground objects are then carefully
+reintegrated into the style-transferred background. To enhance the visual
+coherence between foreground and background, a color transfer step is employed
+on the foreground elements prior to their reincorporation. Finally, we utilize
+feathering techniques to achieve a seamless amalgamation of foreground and
+background, resulting in a visually unified and aesthetically pleasing final
+composition. Extensive evaluations demonstrate that our proposed approach
+yields significantly more natural stylistic transformations compared to
+conventional methods.
+
+
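+ The compositing and feathering steps of such a pipeline can be sketched in a
+few lines of NumPy/SciPy, assuming the segmentation mask and the
+style-transferred background already exist (those networks are not shown). The
+Gaussian-blur feathering, the per-channel mean/std color transfer, and the
+sigma value are illustrative assumptions, not the paper's exact operators.
+
+```python
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def match_color(source, reference):
+    """Per-channel mean/std color transfer of `source` toward `reference`
+    (a simple stand-in for the paper's color-transfer step)."""
+    out = np.empty_like(source)
+    for c in range(3):
+        s, r = source[..., c], reference[..., c]
+        out[..., c] = (s - s.mean()) / (s.std() + 1e-6) * r.std() + r.mean()
+    return np.clip(out, 0.0, 1.0)
+
+def feathered_composite(foreground, styled_background, mask, feather_sigma=5.0):
+    """Blend the color-matched foreground over the stylized background.
+    foreground/styled_background: float arrays in [0, 1], shape (H, W, 3).
+    mask: binary foreground mask, shape (H, W). Feathering = blurring the mask
+    so the seam fades smoothly."""
+    fg = match_color(foreground, styled_background)
+    soft = gaussian_filter(mask.astype(np.float32), sigma=feather_sigma)
+    soft = np.clip(soft, 0.0, 1.0)[..., None]
+    return soft * fg + (1.0 - soft) * styled_background
+```
+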
+
+
+
+
+ + ☆ VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large + Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) suffer from hallucination issues,
+wherein the models generate plausible-sounding but factually incorrect outputs,
+undermining their reliability. A comprehensive quantitative evaluation is
+necessary to identify and understand the extent of hallucinations in these
+models. However, existing benchmarks are often limited in scope, focusing
+mainly on object hallucinations. Furthermore, current evaluation methods
+struggle to effectively address the subtle semantic distinctions between model
+outputs and reference data, as well as the balance between hallucination and
+informativeness. To address these issues, we introduce a multi-dimensional
+benchmark covering objects, attributes, and relations, with challenging images
+selected based on associative biases. Moreover, we propose a large language
+model (LLM)-based two-stage evaluation framework that generalizes the popular
+CHAIR metric and incorporates both faithfulness and coverage into the
+evaluation. Experiments on 10 established LVLMs demonstrate that our evaluation
+metric is more comprehensive and better correlated with human judgments than
+existing work when evaluated on our challenging human-annotated benchmark
+dataset. Our work also highlights the critical balance between faithfulness and
+coverage of model outputs, and encourages future work to address hallucinations
+in LVLMs while keeping their outputs informative.
+
+
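+ As background on the CHAIR metric that the framework generalizes, the snippet
+below computes the classic instance-level CHAIR score: the fraction of
+mentioned object instances that do not appear in the image's ground-truth
+object set. The LLM-based object extraction, the faithfulness/coverage
+extensions, and the example object lists are not from the paper; the exact
+string matching is a simplification.
+
+```python
+def chair_instance(mentioned_objects, ground_truth_objects):
+    """Classic CHAIR_i: hallucinated object mentions / all object mentions."""
+    gt = set(ground_truth_objects)
+    if not mentioned_objects:
+        return 0.0
+    hallucinated = [o for o in mentioned_objects if o not in gt]
+    return len(hallucinated) / len(mentioned_objects)
+
+# Example: the caption mentions a dog that is not in the image annotations.
+print(chair_instance(["person", "surfboard", "dog"],
+                     {"person", "surfboard", "wave"}))  # -> 0.333...
+```
+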
+
+ comment: Work in process +
+
+
+
+
+ + ☆ Texture-aware and Shape-guided Transformer for Sequential DeepFake + Detection + + +
+ Sequential DeepFake detection is an emerging task that aims to predict the +manipulation sequence in order. Existing methods typically formulate it as an +image-to-sequence problem, employing conventional Transformer architectures for +detection. However, these methods lack dedicated design and consequently result +in limited performance. In this paper, we propose a novel Texture-aware and +Shape-guided Transformer to enhance detection performance. Our method features +four major improvements. Firstly, we describe a texture-aware branch that +effectively captures subtle manipulation traces with the Diversiform Pixel +Difference Attention module. Then we introduce a Bidirectional Interaction +Cross-attention module that seeks deep correlations among spatial and +sequential features, enabling effective modeling of complex manipulation +traces. To further enhance the cross-attention, we describe a Shape-guided +Gaussian mapping strategy, providing initial priors of the manipulation shape. +Finally, observing that the latter manipulation in a sequence may influence +traces left in the earlier one, we intriguingly invert the prediction order +from forward to backward, leading to notable gains as expected. Extensive +experimental results demonstrate that our method outperforms others by a large +margin, highlighting the superiority of our method. + +
+
+
+
+
+ + ☆ FreqBlender: Enhancing DeepFake Detection by Blending Frequency + Knowledge + + +
+ Generating synthetic fake faces, known as pseudo-fake faces, is an effective
+way to improve the generalization of DeepFake detection. Existing methods
+typically generate these faces by blending real or fake faces in color space.
+While these methods have shown promise, they overlook the simulation of the
+frequency distribution in pseudo-fake faces, limiting in-depth learning of
+generic forgery traces. To address this, this paper introduces FreqBlender, a
+new method that can generate pseudo-fake faces by blending frequency
+knowledge. Specifically, we investigate the major frequency components and
+propose a Frequency Parsing Network to adaptively partition frequency
+components related to forgery traces. Then we blend this frequency knowledge
+from fake faces into real faces to generate pseudo-fake faces. Since there is
+no ground truth for frequency components, we describe a dedicated training
+strategy by leveraging the inner correlations among different frequency
+knowledge to instruct the learning process. Experimental results demonstrate
+the effectiveness of our method in enhancing DeepFake detection, making it a
+potential plug-and-play strategy for other methods.
+
+
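+ To illustrate the general idea of frequency-space blending (not the learned
+Frequency Parsing Network itself), the sketch below copies a fixed radial
+frequency band from a fake face into a real face via the FFT. The hard-coded
+cutoffs, grayscale inputs, and radial band shape are assumptions; FreqBlender
+learns which components to transfer rather than using a fixed band.
+
+```python
+import numpy as np
+
+def blend_frequencies(real_img, fake_img, low_cut=0.1, high_cut=0.5):
+    """Copy a fixed radial frequency band from `fake_img` into `real_img`
+    (grayscale float arrays of equal shape) to form a pseudo-fake image."""
+    F_real = np.fft.fftshift(np.fft.fft2(real_img))
+    F_fake = np.fft.fftshift(np.fft.fft2(fake_img))
+    h, w = real_img.shape
+    yy, xx = np.mgrid[-h // 2:h - h // 2, -w // 2:w - w // 2]
+    radius = np.sqrt((yy / (h / 2)) ** 2 + (xx / (w / 2)) ** 2)
+    band = (radius >= low_cut) & (radius < high_cut)  # normalized frequency band
+    F_blend = np.where(band, F_fake, F_real)          # swap the band
+    return np.real(np.fft.ifft2(np.fft.ifftshift(F_blend)))
+```
+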
+
+
+
+
+ + ☆ TeamTrack: A Dataset for Multi-Sport Multi-Object Tracking in Full-pitch + Videos + + +
+ Multi-object tracking (MOT) is a critical and challenging task in computer
+vision, particularly in situations involving objects with similar appearances
+but diverse movements, as seen in team sports. Current methods, largely reliant
+on object detection and appearance, often fail to track targets in such complex
+scenarios accurately. This limitation is further exacerbated by the lack of
+comprehensive and diverse datasets covering the full view of sports pitches.
+Addressing these issues, we introduce TeamTrack, a pioneering benchmark dataset
+specifically designed for MOT in sports. TeamTrack is an extensive collection
+of full-pitch video data from various sports, including soccer, basketball, and
+handball. Furthermore, we perform a comprehensive analysis and benchmarking
+effort to underscore TeamTrack's utility and potential impact. Our work
+signifies a crucial step forward, promising to elevate the precision and
+effectiveness of MOT in complex, dynamic settings such as team sports. The
+dataset, project code, and competition are released at:
+https://atomscott.github.io/TeamTrack/.
+
+
+
+
+
+
+ + ☆ Plug-and-Play Algorithm Convergence Analysis From The Standpoint of + Stochastic Differential Equation + + +
+ The Plug-and-Play (PnP) algorithm is popular for solving inverse imaging
+problems. However, this algorithm lacks theoretical analysis of its convergence
+with more advanced plug-in denoisers. We demonstrate that the discrete PnP
+iteration can be described by a continuous stochastic differential equation
+(SDE). We can also achieve this transformation through a Markov process
+formulation of PnP. This allows us to view PnP algorithms from the higher
+standpoint of stochastic differential equations and to give a unified framework
+for the convergence property of PnP according to the solvability condition of
+its corresponding SDE. We reveal that a much weaker condition, namely a bounded
+denoiser with a Lipschitz continuous measurement function, is enough for its
+convergence guarantee, instead of the previous Lipschitz continuous denoiser
+condition.
+
+
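+ For readers unfamiliar with the iteration being analyzed, a minimal PnP
+proximal-gradient loop is sketched below for a toy masked-recovery problem: a
+gradient step on the data term followed by a plug-in denoiser. A Gaussian
+filter stands in for the learned (bounded) denoiser, and the toy image, mask
+ratio, and step size are illustrative choices, not the paper's setting.
+
+```python
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def pnp_proximal_gradient(y, mask, n_iters=50, step=1.0, denoise_sigma=1.0):
+    """Plug-and-Play proximal gradient for masked image recovery:
+        x_{k+1} = D( x_k - step * mask * (x_k - y) )
+    where D is any plug-in denoiser (here a Gaussian filter)."""
+    x = y.copy()
+    for _ in range(n_iters):
+        grad = mask * (x - y)           # gradient of 0.5 * ||mask * (x - y)||^2
+        x = gaussian_filter(x - step * grad, sigma=denoise_sigma)
+    return x
+
+# Toy example: recover a smooth image from 60% of its noisy pixels.
+rng = np.random.default_rng(0)
+clean = np.outer(np.linspace(0, 1, 64), np.linspace(0, 1, 64))
+mask = (rng.random(clean.shape) < 0.6).astype(float)
+y = mask * (clean + 0.05 * rng.normal(size=clean.shape))
+x_hat = pnp_proximal_gradient(y, mask)
+print(float(np.mean((x_hat - clean) ** 2)))  # reconstruction MSE
+```
+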
+
+ comment: 17pages, Preprint, Under review +
+
+
+
+
+ + ☆ PM-VIS: High-Performance Box-Supervised Video Instance Segmentation + + +
+ Labeling pixel-wise object masks in videos is a resource-intensive and
+laborious process. Box-supervised Video Instance Segmentation (VIS) methods
+have emerged as a viable solution to mitigate the labor-intensive annotation
+process. In practical applications, a two-step approach, first generating
+pseudo masks from box annotations and then training a fully supervised model,
+is not only more flexible but also exhibits higher recognition accuracy.
+Inspired by the recent success of the Segment Anything Model (SAM), we
+introduce a novel approach that aims at harnessing instance box annotations
+from multiple perspectives to generate high-quality instance pseudo masks, thus
+enriching the information contained in instance annotations. We leverage
+ground-truth boxes to create three types of pseudo masks using the HQ-SAM
+model, the box-supervised VIS model (IDOL-BoxInst), and the VOS model (DeAOT)
+separately, along with three corresponding optimization mechanisms.
+Additionally, we introduce two ground-truth data filtering methods, assisted by
+high-quality pseudo masks, to further enhance the training dataset quality and
+improve the performance of fully supervised VIS methods. To fully capitalize on
+the obtained high-quality pseudo masks, we introduce a novel algorithm, PM-VIS,
+to integrate mask losses into IDOL-BoxInst. Our PM-VIS model, trained with
+high-quality pseudo mask annotations, demonstrates strong ability in instance
+mask prediction, achieving state-of-the-art performance on the YouTube-VIS
+2019, YouTube-VIS 2021, and OVIS validation sets, notably narrowing the gap
+between box-supervised and fully supervised VIS methods.
+
+
+
+
+
+
+ + ☆ PGAHum: Prior-Guided Geometry and Appearance Learning for High-Fidelity + Animatable Human Reconstruction + + +
+ Recent techniques on implicit geometry representation learning and neural +rendering have shown promising results for 3D clothed human reconstruction from +sparse video inputs. However, it is still challenging to reconstruct detailed +surface geometry and even more difficult to synthesize photorealistic novel +views with animated human poses. In this work, we introduce PGAHum, a +prior-guided geometry and appearance learning framework for high-fidelity +animatable human reconstruction. We thoroughly exploit 3D human priors in three +key modules of PGAHum to achieve high-quality geometry reconstruction with +intricate details and photorealistic view synthesis on unseen poses. First, a +prior-based implicit geometry representation of 3D human, which contains a +delta SDF predicted by a tri-plane network and a base SDF derived from the +prior SMPL model, is proposed to model the surface details and the body shape +in a disentangled manner. Second, we introduce a novel prior-guided sampling +strategy that fully leverages the prior information of the human pose and body +to sample the query points within or near the body surface. By avoiding +unnecessary learning in the empty 3D space, the neural rendering can recover +more appearance details. Last, we propose a novel iterative backward +deformation strategy to progressively find the correspondence for the query +point in observation space. A skinning weights prediction model is learned +based on the prior provided by the SMPL model to achieve the iterative backward +LBS deformation. Extensive quantitative and qualitative comparisons on various +datasets are conducted and the results demonstrate the superiority of our +framework. Ablation studies also verify the effectiveness of each scheme for +geometry and appearance learning. + +
+
+
+
+
+ + ☆ Unveiling and Mitigating Generalized Biases of DNNs through the + Intrinsic Dimensions of Perceptual Manifolds + + +
+ Building fair deep neural networks (DNNs) is a crucial step towards achieving +trustworthy artificial intelligence. Delving into deeper factors that affect +the fairness of DNNs is paramount and serves as the foundation for mitigating +model biases. However, current methods are limited in accurately predicting DNN +biases, relying solely on the number of training samples and lacking more +precise measurement tools. Here, we establish a geometric perspective for +analyzing the fairness of DNNs, comprehensively exploring how DNNs internally +shape the intrinsic geometric characteristics of datasets-the intrinsic +dimensions (IDs) of perceptual manifolds, and the impact of IDs on the fairness +of DNNs. Based on multiple findings, we propose Intrinsic Dimension +Regularization (IDR), which enhances the fairness and performance of models by +promoting the learning of concise and ID-balanced class perceptual manifolds. +In various image recognition benchmark tests, IDR significantly mitigates model +bias while improving its performance. + +
+
+ comment: 8pages, 6figures, Submitted to TPAMI +
+
+
+
+
+ + ☆ Self-Supervised Monocular Depth Estimation in the Dark: Towards Data + Distribution Compensation IJCAI2024 + + +
+ Nighttime self-supervised monocular depth estimation has received increasing
+attention in recent years. However, using night images for self-supervision is
+unreliable because the photometric consistency assumption is usually violated
+in the videos taken under complex lighting conditions. Even with domain
+adaptation or photometric loss repair, performance is still limited by the poor
+supervision of night images on trainable networks. In this paper, we propose a
+self-supervised nighttime monocular depth estimation method that does not use
+any night images during training. Our framework utilizes day images as a stable
+source for self-supervision and applies physical priors (e.g., wave optics, a
+reflection model, and a read-shot noise model) to compensate for some key
+day-night differences. With day-to-night data distribution compensation, our
+framework can be trained in an efficient one-stage self-supervised manner.
+Though no nighttime images are considered during training, qualitative and
+quantitative results demonstrate that our method achieves state-of-the-art
+depth estimation results on the challenging nuScenes-Night and RobotCar-Night
+benchmarks compared with existing methods.
+
+
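+ As a rough illustration of a read-shot noise compensation (only one of the
+physical priors mentioned, and not necessarily implemented this way in the
+paper), the sketch below synthesizes a low-light, noisy frame from a clean day
+image by darkening it and adding Poisson shot noise plus Gaussian read noise.
+The brightness scale, full-well count, and read-noise level are illustrative
+assumptions rather than calibrated sensor values.
+
+```python
+import numpy as np
+
+def simulate_low_light(day_img, brightness=0.05, full_well=1000.0,
+                       read_std=2.0, rng=None):
+    """Darken a clean day image (float array in [0, 1]) and add shot + read noise."""
+    rng = np.random.default_rng() if rng is None else rng
+    photons = day_img * brightness * full_well            # expected photon counts
+    shot = rng.poisson(photons).astype(np.float64)         # shot (Poisson) noise
+    read = rng.normal(0.0, read_std, size=day_img.shape)   # read (Gaussian) noise
+    noisy = (shot + read) / (brightness * full_well)       # back to [0, 1] range
+    return np.clip(noisy, 0.0, 1.0)
+```
+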
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ☆ DSDRNet: Disentangling Representation and Reconstruct Network for Domain + Generalization IJCNN 2024 + + +
+ Domain generalization faces challenges due to the distribution shift between +training and testing sets, and the presence of unseen target domains. Common +solutions include domain alignment, meta-learning, data augmentation, or +ensemble learning, all of which rely on domain labels or domain adversarial +techniques. In this paper, we propose a Dual-Stream Separation and +Reconstruction Network, dubbed DSDRNet. It is a disentanglement-reconstruction +approach that integrates features of both inter-instance and intra-instance +through dual-stream fusion. The method introduces novel supervised signals by +combining inter-instance semantic distance and intra-instance similarity. +Incorporating Adaptive Instance Normalization (AdaIN) into a two-stage cyclic +reconstruction process enhances self-disentangled reconstruction signals to +facilitate model convergence. Extensive experiments on four benchmark datasets +demonstrate that DSDRNet outperforms other popular methods in terms of domain +generalization capabilities. + +
+
+ comment: This paper is accepted to IJCNN 2024 +
+
+
+
+
+ + ☆ EventLens: Leveraging Event-Aware Pretraining and Cross-modal Linking + Enhances Visual Commonsense Reasoning + + +
+ Visual Commonsense Reasoning (VCR) is a cognitive task, challenging models to +answer visual questions requiring human commonsense, and to provide rationales +explaining why the answers are correct. With emergence of Large Language Models +(LLMs), it is natural and imperative to explore their applicability to VCR. +However, VCR task demands more external knowledge to tackle its challenging +questions, necessitating special designs to activate LLMs' commonsense +reasoning abilities. Also, most existing Multimodal LLMs adopted an abstraction +of entire input image, which makes it difficult to comprehend VCR's unique +co-reference tags between image regions and text, posing challenges for +fine-grained alignment. To address these issues, we propose EventLens that +leverages Event-Aware Pretraining and Cross-modal Linking and EnhanceS VCR. +First, by emulating the cognitive process of human reasoning, an Event-Aware +Pretraining auxiliary task is introduced to better activate LLM's global +comprehension of intricate scenarios. Second, during fine-tuning, we further +utilize reference tags to bridge RoI features with texts, while preserving both +modality semantics. Finally, we use instruct-style prompts to narrow the gap +between pretraining and fine-tuning, and task-specific adapters to better +integrate LLM's inherent knowledge with new commonsense. Experimental results +show the effectiveness of our proposed auxiliary task and fine-grained linking +strategy. + +
+
+
+
+
+ + ☆ On Support Relations Inference and Scene Hierarchy Graph Construction + from Point Cloud in Clustered Environments + + +
+ Over the years, scene understanding has attracted a growing interest in +computer vision, providing the semantic and physical scene information +necessary for robots to complete some particular tasks autonomously. In 3D +scenes, rich spatial geometric and topological information are often ignored by +RGB-based approaches for scene understanding. In this study, we develop a +bottom-up approach for scene understanding that infers support relations +between objects from a point cloud. Our approach utilizes the spatial topology +information of the plane pairs in the scene, consisting of three major steps. +1) Detection of pairwise spatial configuration: dividing primitive pairs into +local support connection and local inner connection; 2) primitive +classification: a combinatorial optimization method applied to classify +primitives; and 3) support relations inference and hierarchy graph +construction: bottom-up support relations inference and scene hierarchy graph +construction containing primitive level and object level. Through experiments, +we demonstrate that the algorithm achieves excellent performance in primitive +classification and support relations inference. Additionally, we show that the +scene hierarchy graph contains rich geometric and topological information of +objects, and it possesses great scalability for scene understanding. + +
+
+
+
+
+ + ☆ C2F-SemiCD: A Coarse-to-Fine Semi-Supervised Change Detection Method + Based on Consistency Regularization in High-Resolution Remote Sensing Images + + +
+ A high-precision feature extraction model is crucial for change detection +(CD). In the past, many deep learning-based supervised CD methods learned to +recognize change feature patterns from a large number of labelled bi-temporal +images, whereas labelling bi-temporal remote sensing images is very expensive +and often time-consuming; therefore, we propose a coarse-to-fine +semi-supervised CD method based on consistency regularization (C2F-SemiCD), +which includes a coarse-to-fine CD network with a multiscale attention +mechanism (C2FNet) and a semi-supervised update method. Among them, the C2FNet +network gradually completes the extraction of change features from +coarse-grained to fine-grained through multiscale feature fusion, channel +attention mechanism, spatial attention mechanism, global context module, +feature refine module, initial aggregation module, and final aggregation +module. The semi-supervised update method uses the mean teacher method. The +parameters of the student model are updated to the parameters of the teacher +Model by using the exponential moving average (EMA) method. Through extensive +experiments on three datasets and meticulous ablation studies, including +crossover experiments across datasets, we verify the significant effectiveness +and efficiency of the proposed C2F-SemiCD method. The code will be open at: +https://github.com/ChengxiHAN/C2F-SemiCDand-C2FNet. + +
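+ The mean-teacher update described above, where the teacher's parameters are an
+exponential moving average (EMA) of the student's, is a standard recipe. A
+minimal PyTorch sketch follows; the decay value of 0.99 is a common choice, not
+necessarily the paper's, and the surrounding semi-supervised losses are not
+shown.
+
+```python
+import torch
+
+@torch.no_grad()
+def ema_update(teacher, student, decay=0.99):
+    """Mean-teacher update: theta_teacher <- decay * theta_teacher
+    + (1 - decay) * theta_student, applied to parameters; buffers are copied."""
+    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+        t_param.mul_(decay).add_(s_param, alpha=1.0 - decay)
+    for t_buf, s_buf in zip(teacher.buffers(), student.buffers()):
+        t_buf.copy_(s_buf)
+
+# Called once per training step, after the optimizer updates the student:
+#   optimizer.step()
+#   ema_update(teacher_model, student_model)
+```
+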
+
+
+
+
+ + ☆ A Comprehensive Survey and Taxonomy on Point Cloud Registration Based on + Deep Learning IJCAI 2024 + + +
+ Point cloud registration (PCR) involves determining a rigid transformation +that aligns one point cloud to another. Despite the plethora of outstanding +deep learning (DL)-based registration methods proposed, comprehensive and +systematic studies on DL-based PCR techniques are still lacking. In this paper, +we present a comprehensive survey and taxonomy of recently proposed PCR +methods. Firstly, we conduct a taxonomy of commonly utilized datasets and +evaluation metrics. Secondly, we classify the existing research into two main +categories: supervised and unsupervised registration, providing insights into +the core concepts of various influential PCR models. Finally, we highlight open +challenges and potential directions for future research. A curated collection +of valuable resources is made available at https://github.com/yxzhang15/PCR. + +
+
+ comment: This paper is accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Swap It Like Its Hot: Segmentation-based spoof attacks on eye-tracking + images + + +
+ Video-based eye trackers capture the iris biometric and enable authentication +to secure user identity. However, biometric authentication is susceptible to +spoofing another user's identity through physical or digital manipulation. The +current standard to identify physical spoofing attacks on eye-tracking sensors +uses liveness detection. Liveness detection classifies gaze data as real or +fake, which is sufficient to detect physical presentation attacks. However, +such defenses cannot detect a spoofing attack when real eye image inputs are +digitally manipulated to swap the iris pattern of another person. We propose +IrisSwap as a novel attack on gaze-based liveness detection. IrisSwap allows +attackers to segment and digitally swap in a victim's iris pattern to fool iris +authentication. Both offline and online attacks produce gaze data that deceives +the current state-of-the-art defense models at rates up to 58% and motivates +the need to develop more advanced authentication methods for eye trackers. + +
+
+
+
+
+ + ☆ HOIST-Former: Hand-held Objects Identification, Segmentation, and + Tracking in the Wild + + +
+ We address the challenging task of identifying, segmenting, and tracking +hand-held objects, which is crucial for applications such as human action +segmentation and performance evaluation. This task is particularly challenging +due to heavy occlusion, rapid motion, and the transitory nature of objects +being hand-held, where an object may be held, released, and subsequently picked +up again. To tackle these challenges, we have developed a novel +transformer-based architecture called HOIST-Former. HOIST-Former is adept at +spatially and temporally segmenting hands and objects by iteratively pooling +features from each other, ensuring that the processes of identification, +segmentation, and tracking of hand-held objects depend on the hands' positions +and their contextual appearance. We further refine HOIST-Former with a contact +loss that focuses on areas where hands are in contact with objects. Moreover, +we also contribute an in-the-wild video dataset called HOIST, which comprises +4,125 videos complete with bounding boxes, segmentation masks, and tracking IDs +for hand-held objects. Through experiments on the HOIST dataset and two +additional public datasets, we demonstrate the efficacy of HOIST-Former in +segmenting and tracking hand-held objects. + +
+
+
+
+
+ + ☆ Neural Radiance Field in Autonomous Driving: A Survey + + +
+ Neural Radiance Field (NeRF) has garnered significant attention from both +academia and industry due to its intrinsic advantages, particularly its +implicit representation and novel view synthesis capabilities. With the rapid +advancements in deep learning, a multitude of methods have emerged to explore +the potential applications of NeRF in the domain of Autonomous Driving (AD). +However, a conspicuous void is apparent within the current literature. To +bridge this gap, this paper conducts a comprehensive survey of NeRF's +applications in the context of AD. Our survey is structured to categorize +NeRF's applications in Autonomous Driving (AD), specifically encompassing +perception, 3D reconstruction, simultaneous localization and mapping (SLAM), +and simulation. We delve into in-depth analysis and summarize the findings for +each application category, and conclude by providing insights and discussions +on future directions in this field. We hope this paper serves as a +comprehensive reference for researchers in this domain. To the best of our +knowledge, this is the first survey specifically focused on the applications of +NeRF in the Autonomous Driving domain. + +
+
+
+
+
+ + ☆ FaceFolds: Meshed Radiance Manifolds for Efficient Volumetric Rendering + of Dynamic Faces + + +
+ 3D rendering of dynamic face captures is a challenging problem, and it
+demands improvements on several fronts: photorealism, efficiency,
+compatibility, and configurability. We present a novel representation that
+enables high-quality volumetric rendering of an actor's dynamic facial
+performances with minimal compute and memory footprint. It runs natively on
+commodity graphics soft- and hardware, and allows for a graceful trade-off
+between quality and efficiency. Our method utilizes recent advances in neural
+rendering, particularly learning discrete radiance manifolds to sparsely sample
+the scene to model volumetric effects. We achieve efficient modeling by
+learning a single set of manifolds for the entire dynamic sequence, while
+implicitly modeling appearance changes as temporal canonical texture. We export
+a single layered mesh and view-independent RGBA texture video that is
+compatible with legacy graphics renderers without additional ML integration. We
+demonstrate our method by rendering dynamic face captures of real actors in a
+game engine, at comparable photorealism to state-of-the-art neural rendering
+techniques at previously unseen frame rates.
+
+
+
+ comment: In Proceedings of the ACM in Computer Graphics and Interactive + Techniques, 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular + Videos + + +
+ Modern 3D engines and graphics pipelines require meshes as a memory-efficient
+representation, which allows efficient rendering, geometry processing, texture
+editing, and many other downstream operations. However, it is still highly
+difficult to obtain high-quality meshes in terms of structure and detail from
+monocular visual observations. The problem becomes even more challenging for
+dynamic scenes and objects. To this end, we introduce Dynamic Gaussians Mesh
+(DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh
+given a single monocular video. Our work leverages the recent advancement in 3D
+Gaussian Splatting to construct the mesh sequence with temporal consistency
+from a video. Building on top of this representation, DG-Mesh recovers
+high-quality meshes from the Gaussian points and can track the mesh vertices
+over time, which enables applications such as texture editing on dynamic
+objects. We introduce Gaussian-Mesh Anchoring, which encourages evenly
+distributed Gaussians, resulting in better mesh reconstruction through
+mesh-guided densification and pruning on the deformed Gaussians. By applying
+cycle-consistent deformation between the canonical and the deformed space, we
+can project the anchored Gaussians back to the canonical space and optimize
+Gaussians across all time frames. During the evaluation on different datasets,
+DG-Mesh provides significantly better mesh reconstruction and rendering than
+baselines. Project page: https://www.liuisabella.com/DG-Mesh/
+
+
+
+ comment: Project page: https://www.liuisabella.com/DG-Mesh/ +
+
+
+
+
+ + ♻ ☆ Trends, Applications, and Challenges in Human Attention Modelling IJCAI 2024 + + +
+ Human attention modelling has proven, in recent years, to be particularly +useful not only for understanding the cognitive processes underlying visual +exploration, but also for providing support to artificial intelligence models +that aim to solve problems in various domains, including image and video +processing, vision-and-language applications, and language modelling. This +survey offers a reasoned overview of recent efforts to integrate human +attention mechanisms into contemporary deep learning models and discusses +future research directions and challenges. For a comprehensive overview on the +ongoing research refer to our dedicated repository available at +https://github.com/aimagelab/awesome-human-visual-attention. + +
+
+ comment: Accepted at IJCAI 2024 Survey Track +
+
+
+
+
+ + ♻ ☆ GeoAI Reproducibility and Replicability: a computational and spatial + perspective + + +
+ GeoAI has emerged as an exciting interdisciplinary research area that +combines spatial theories and data with cutting-edge AI models to address +geospatial problems in a novel, data-driven manner. While GeoAI research has +flourished in the GIScience literature, its reproducibility and replicability +(R&R), fundamental principles that determine the reusability, reliability, and +scientific rigor of research findings, have rarely been discussed. This paper +aims to provide an in-depth analysis of this topic from both computational and +spatial perspectives. We first categorize the major goals for reproducing GeoAI +research, namely, validation (repeatability), learning and adapting the method +for solving a similar or new problem (reproducibility), and examining the +generalizability of the research findings (replicability). Each of these goals +requires different levels of understanding of GeoAI, as well as different +methods to ensure its success. We then discuss the factors that may cause the +lack of R&R in GeoAI research, with an emphasis on (1) the selection and use of +training data; (2) the uncertainty that resides in the GeoAI model design, +training, deployment, and inference processes; and more importantly (3) the +inherent spatial heterogeneity of geospatial data and processes. We use a deep +learning-based image analysis task as an example to demonstrate the results' +uncertainty and spatial variance caused by different factors. The findings +reiterate the importance of knowledge sharing, as well as the generation of a +"replicability map" that incorporates spatial autocorrelation and spatial +heterogeneity into consideration in quantifying the spatial replicability of +GeoAI research. + +
+
+ comment: Accepted by Annals of the American Association of Geographers +
+
+
+
+
+ + ♻ ☆ Does Gaussian Splatting need SFM Initialization? + + +
+ 3D Gaussian Splatting has recently been embraced as a versatile and effective +method for scene reconstruction and novel view synthesis, owing to its +high-quality results and compatibility with hardware rasterization. Despite its +advantages, Gaussian Splatting's reliance on high-quality point cloud +initialization by Structure-from-Motion (SFM) algorithms is a significant +limitation to be overcome. To this end, we investigate various initialization +strategies for Gaussian Splatting and delve into how volumetric reconstructions +from Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on +SFM data. Our findings demonstrate that random initialization can perform much +better if carefully designed and that by employing a combination of improved +initialization strategies and structure distillation from low-cost NeRF models, +it is possible to achieve equivalent results, or at times even superior, to +those obtained from SFM initialization. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ CoGS: Controllable Gaussian Splatting CVPR 2024 + + +
+ Capturing and re-animating the 3D structure of articulated objects presents
+significant barriers. On one hand, methods requiring extensively calibrated
+multi-view setups are prohibitively complex and resource-intensive, limiting
+their practical applicability. On the other hand, while single-camera Neural
+Radiance Fields (NeRFs) offer a more streamlined approach, they have excessive
+training and rendering costs. 3D Gaussian Splatting would be a suitable
+alternative but for two limitations: first, existing methods for 3D dynamic
+Gaussians require synchronized multi-view cameras, and second, dynamic
+scenarios lack controllability. We present CoGS, a method for Controllable
+Gaussian Splatting that enables the direct manipulation of scene elements,
+offering real-time control of dynamic scenes without the prerequisite of
+pre-computing control signals. We evaluated CoGS using both synthetic and
+real-world datasets that include dynamic objects differing in degree of
+difficulty. In our evaluations, CoGS consistently outperformed existing dynamic
+and controllable neural representations in terms of visual fidelity.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning A Physical-aware Diffusion Model Based on Transformer for + Underwater Image Enhancement + + +
+ Underwater visuals undergo various complex degradations, inevitably
+influencing the performance of underwater vision tasks. Recently, diffusion
+models have been applied to underwater image enhancement (UIE) tasks and have
+gained SOTA performance. However, these methods fail to consider the physical
+properties and underwater imaging mechanisms in the diffusion process, limiting
+the information completion capacity of diffusion models. In this paper, we
+introduce a novel UIE framework, named PA-Diff, designed to exploit the
+knowledge of physics to guide the diffusion process.
+ PA-Diff consists of a Physics Prior Generation (PPG) branch, an Implicit
+Neural Reconstruction (INR) branch, and a Physics-aware Diffusion Transformer
+(PDT) branch. Our PPG branch aims to produce the prior knowledge of physics.
+By utilizing this physics prior knowledge to guide the diffusion process, the
+PDT branch can obtain underwater-aware ability and model the complex
+distribution in real-world underwater scenes. The INR branch can learn robust
+feature representations from diverse underwater images via implicit neural
+representation, which reduces the difficulty of restoration for the PDT branch.
+Extensive experiments show that our method achieves the best performance on UIE
+tasks.
+
+
+
+
+
+
+ + ♻ ☆ Unsupervised Learning of the Total Variation Flow + + +
+ The total variation (TV) flow generates a scale-space representation of an
+image based on the TV functional. This gradient flow exhibits desirable
+properties for images, such as preserving sharp edges, and enables spectral,
+scale, and texture analysis. Solving the TV flow is challenging; one reason is
+the non-uniqueness of the subgradients. The standard numerical approach for TV
+flow requires solving multiple non-smooth optimisation problems. Even with
+state-of-the-art convex optimisation techniques, this is often prohibitively
+expensive and strongly motivates the use of alternative, faster approaches.
+Inspired by and extending the framework of physics-informed neural networks
+(PINNs), we propose the TVflowNET, an unsupervised neural network approach, to
+approximate the solution of the TV flow given an initial image and a time
+instance. The TVflowNET requires no ground truth data but rather makes use of
+the PDE for optimisation of the network parameters. We circumvent the
+challenges related to the non-uniqueness of the subgradients by additionally
+learning the related diffusivity term. Our approach significantly speeds up the
+computation, and we show that the TVflowNET approximates the TV flow
+solution with high fidelity for different image sizes and image types.
+Additionally, we give a full comparison of different network architecture
+designs as well as training regimes to underscore the effectiveness of our
+approach.
+
+
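+ For readers who want a concrete handle on the PDE being approximated, the
+following is a minimal sketch (not the authors' TVflowNET code) of a PINN-style
+residual for the TV flow u_t = div(grad u / |grad u|), using finite differences
+and an epsilon-regularised diffusivity; the tensor shapes, padding choices, and
+epsilon value are illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def tv_flow_residual(u_t0, u_t1, dt, eps=1e-6):
+    """u_t0, u_t1: (B, 1, H, W) network outputs at times t and t + dt."""
+    def grad2d(u):
+        # forward differences, zero-padded back to the original shape
+        ux = F.pad(u[..., :, 1:] - u[..., :, :-1], (0, 1))
+        uy = F.pad(u[..., 1:, :] - u[..., :-1, :], (0, 0, 0, 1))
+        return ux, uy
+
+    ux, uy = grad2d(u_t1)
+    norm = torch.sqrt(ux ** 2 + uy ** 2 + eps)   # regularised |grad u|
+    px, py = ux / norm, uy / norm                # unit-length gradient field
+    # backward differences give the divergence of (px, py)
+    div = (px - F.pad(px[..., :, :-1], (1, 0))) + (py - F.pad(py[..., :-1, :], (0, 0, 1, 0)))
+    u_time = (u_t1 - u_t0) / dt                  # forward difference in time
+    return ((u_time - div) ** 2).mean()          # squared PDE residual as a loss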
+
+
+
+
+ + ♻ ☆ Versatile Backdoor Attack with Visible, Semantic, Sample-Specific, and + Compatible Triggers + + +
+ Deep neural networks (DNNs) can be manipulated to exhibit specific behaviors
+when exposed to specific trigger patterns, without affecting their performance
+on benign samples, dubbed \textit{backdoor attack}. Currently, implementing
+backdoor attacks in physical scenarios still faces significant challenges.
+Physical attacks are labor-intensive and time-consuming, and the triggers are
+selected in a manual and heuristic way. Moreover, expanding digital attacks to
+physical scenarios faces many challenges due to their sensitivity to visual
+distortions and the absence of counterparts in the real world. To address these
+challenges, we define a novel trigger called the \textbf{V}isible,
+\textbf{S}emantic, \textbf{S}ample-Specific, and \textbf{C}ompatible (VSSC)
+trigger, which is effective, stealthy, and robust simultaneously and can
+also be effectively deployed in the physical scenario using corresponding
+objects. To implement the VSSC trigger, we propose an automated pipeline
+comprising three modules: a trigger selection module that systematically
+identifies suitable triggers leveraging large language models, a trigger
+insertion module that employs generative models to seamlessly integrate
+triggers into images, and a quality assessment module that ensures the natural
+and successful insertion of triggers through vision-language models. Extensive
+experimental results and analysis validate the effectiveness, stealthiness, and
+robustness of the VSSC trigger. It not only maintains robustness under
+visual distortions but also demonstrates strong practicality in the physical
+scenario. We hope that the proposed VSSC trigger and implementation approach
+could inspire future studies on designing more practical triggers in backdoor
+attacks.
+
+
+
+
+
+
+ + ♻ ☆ Advancing Graph Neural Networks with HL-HGAT: A Hodge-Laplacian and + Attention Mechanism Approach for Heterogeneous Graph-Structured Data + + +
+ Graph neural networks (GNNs) have proven effective in capturing relationships +among nodes in a graph. This study introduces a novel perspective by +considering a graph as a simplicial complex, encompassing nodes, edges, +triangles, and $k$-simplices, enabling the definition of graph-structured data +on any $k$-simplices. Our contribution is the Hodge-Laplacian heterogeneous +graph attention network (HL-HGAT), designed to learn heterogeneous signal +representations across $k$-simplices. The HL-HGAT incorporates three key +components: HL convolutional filters (HL-filters), simplicial projection (SP), +and simplicial attention pooling (SAP) operators, applied to $k$-simplices. +HL-filters leverage the unique topology of $k$-simplices encoded by the +Hodge-Laplacian (HL) operator, operating within the spectral domain of the +$k$-th HL operator. To address computation challenges, we introduce a +polynomial approximation for HL-filters, exhibiting spatial localization +properties. Additionally, we propose a pooling operator to coarsen +$k$-simplices, combining features through simplicial attention mechanisms of +self-attention and cross-attention via transformers and SP operators, capturing +topological interconnections across multiple dimensions of simplices. The +HL-HGAT is comprehensively evaluated across diverse graph applications, +including NP-hard problems, graph multi-label and classification challenges, +and graph regression tasks in logistics, computer vision, biology, chemistry, +and neuroscience. The results demonstrate the model's efficacy and versatility +in handling a wide range of graph-based scenarios. + +
+
+
+
+
+ + ♻ ☆ Neuromorphic Face Analysis: a Survey + + +
+ Neuromorphic sensors, also known as event cameras, are a class of imaging
+devices mimicking the function of biological visual systems. Unlike traditional
+frame-based cameras, which capture fixed images at discrete intervals,
+neuromorphic sensors continuously generate events that represent changes in
+light intensity or motion in the visual field with high temporal resolution and
+low latency. These properties have proven to be interesting in modeling human
+faces, both from an effectiveness and a privacy-preserving point of view.
+Neuromorphic face analysis, however, is still a raw and unstructured field of
+research, with several attempts at addressing different tasks with no clear
+standard or benchmark. This survey paper presents a comprehensive overview of
+capabilities, challenges and emerging applications in the domain of
+neuromorphic face analysis, to outline promising directions and open issues.
+After discussing the fundamental working principles of neuromorphic vision and
+presenting an in-depth overview of the related research, we explore the current
+state of available data, standard data representations, emerging challenges,
+and limitations that require further investigation. This paper aims to
+highlight the recent progress in this evolving field and to provide both
+experienced and new researchers with an all-encompassing analysis of the
+state of the art along with its problems and shortcomings.
+
+
+
+ comment: Submitted to Pattern Recognition Letters
+
+
+
+
+
+ + ♻ ☆ NeLF-Pro: Neural Light Field Probes for Multi-Scale Novel View Synthesis CVPR 2024 + + +
+ We present NeLF-Pro, a novel representation to model and reconstruct light +fields in diverse natural scenes that vary in extent and spatial granularity. +In contrast to previous fast reconstruction methods that represent the 3D scene +globally, we model the light field of a scene as a set of local light field +feature probes, parameterized with position and multi-channel 2D feature maps. +Our central idea is to bake the scene's light field into spatially varying +learnable representations and to query point features by weighted blending of +probes close to the camera - allowing for mipmap representation and rendering. +We introduce a novel vector-matrix-matrix (VMM) factorization technique that +effectively represents the light field feature probes as products of core +factors (i.e., VM) shared among local feature probes, and a basis factor (i.e., +M) - efficiently encoding internal relationships and patterns within the scene. +Experimentally, we demonstrate that NeLF-Pro significantly boosts the +performance of feature grid-based representations, and achieves fast +reconstruction with better rendering quality while maintaining compact +modeling. Project webpage https://sinoyou.github.io/nelf-pro/. + +
+
+ comment: CVPR 2024 Conference Paper, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ Robustness and Visual Explanation for Black Box Image, Video, and ECG + Signal Classification with Reinforcement Learning AAAI + + +
+ We present a generic Reinforcement Learning (RL) framework optimized for
+crafting adversarial attacks on different model types spanning ECG signal
+analysis (1D), image classification (2D), and video classification (3D). The
+framework focuses on identifying sensitive regions and inducing
+misclassifications with minimal distortions and various distortion types. The
+novel RL method outperforms state-of-the-art methods for all three
+applications, proving its efficiency. Our RL approach produces superior
+localization masks, enhancing interpretability for image classification and ECG
+analysis models. For applications such as ECG analysis, our platform highlights
+critical ECG segments for clinicians while ensuring resilience against
+prevalent distortions. This comprehensive tool aims to bolster both resilience
+with adversarial training and transparency across varied applications and data
+types.
+
+
+
+ comment: AAAI Proceedings reference: + https://ojs.aaai.org/index.php/AAAI/article/view/30579 +
+
+
+
+
+ + ♻ ☆ SPINEPS -- Automatic Whole Spine Segmentation of T2-weighted MR images + using a Two-Phase Approach to Multi-class Semantic and Instance Segmentation + + +
+ Purpose. To present SPINEPS, an open-source deep learning approach for
+semantic and instance segmentation of 14 spinal structures (ten vertebra
+substructures, intervertebral discs, spinal cord, spinal canal, and sacrum) in
+whole body T2w MRI.
+ Methods. During this HIPAA-compliant, retrospective study, we utilized the
+public SPIDER dataset (218 subjects, 63% female) and a subset of the German
+National Cohort (1423 subjects, mean age 53, 49% female) for training and
+evaluation. We combined CT and T2w segmentations to train models that segment
+14 spinal structures in T2w sagittal scans both semantically and instance-wise.
+Performance evaluation metrics included Dice similarity coefficient, average
+symmetrical surface distance, panoptic quality, segmentation quality, and
+recognition quality. Statistical significance was assessed using the Wilcoxon
+signed-rank test. An in-house dataset was used to qualitatively evaluate
+out-of-distribution samples.
+ Results. On the public dataset, our approach outperformed the baseline
+(instance-wise vertebra dice score 0.929 vs. 0.907, p-value<0.001). Training on
+auto-generated annotations and evaluating on manually corrected test data from
+the GNC yielded global dice scores of 0.900 for vertebrae, 0.960 for
+intervertebral discs, and 0.947 for the spinal canal. Incorporating the SPIDER
+dataset during training increased these scores to 0.920, 0.967, and 0.958,
+respectively.
+ Conclusions. The proposed segmentation approach offers robust segmentation of
+14 spinal structures in T2w sagittal images, including the spinal cord, spinal
+canal, intervertebral discs, endplate, sacrum, and vertebrae. The approach
+yields both a semantic and instance mask as output, thus being easy to utilize.
+This marks the first publicly available algorithm for whole spine segmentation
+in sagittal T2w MR imaging.
+
+
+
+ comment: https://github.com/Hendrik-code/spineps +
+
+
+
+
+ + ♻ ☆ YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel + Class Discovery + + +
+ Because of its practical use, open-world object detection (OWOD) has received
+considerable attention recently. The challenge is how a model can detect novel
+classes and then incrementally learn them without forgetting previously known
+classes. Previous approaches hinge on strongly or weakly supervised
+novel-class data for novel-class detection, which may not apply to real
+applications. We construct a new benchmark in which novel classes are
+encountered only at the inference stage, and we propose a new OWOD detector,
+YOLOOC, based on the YOLO architecture but designed for the open-class setup.
+We introduce label smoothing to prevent the detector from over-confidently
+mapping novel classes to known classes and to discover novel classes. Extensive
+experiments conducted on our more realistic setup demonstrate the effectiveness
+of our method for discovering novel classes in our new benchmark.
+
+
+
+ comment: Withdrawn because it was submitted without consent of the first + author. In addition, this submission has some errors +
+
+
+
+
+ + ♻ ☆ Multilevel Geometric Optimization for Regularised Constrained Linear + Inverse Problems + + +
+ We present a geometric multilevel optimization approach that smoothly
+incorporates box constraints. Given a box-constrained optimization problem, we
+consider a hierarchy of models with varying discretization levels. Finer models
+are accurate but expensive to compute, while coarser models are less accurate
+but cheaper to compute. When working at the fine level, multilevel optimization
+computes the search direction based on a coarser model, which speeds up updates
+at the fine level. Moreover, by exploiting the geometry induced by the
+hierarchy, the feasibility of the updates is preserved. In particular, our
+approach extends classical components of multigrid methods, such as restriction
+and prolongation, to the Riemannian structure of our constraints.
+
+
+
+ comment: 25 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Cross Attention for Audio-Visual Person Verification + + +
+ Although person or identity verification has been predominantly explored
+using individual modalities such as face and voice, audio-visual fusion has
+recently shown immense potential to outperform unimodal approaches. Audio and
+visual modalities are often expected to exhibit strong complementary
+relationships, which play a crucial role in effective audio-visual fusion.
+However, they may not always strongly complement each other; they may instead
+exhibit weak complementary relationships, resulting in poor audio-visual
+feature representations. In this paper, we propose a Dynamic Cross-Attention
+(DCA) model that can dynamically select the cross-attended or unattended
+features on the fly based on the strong or weak complementary relationships,
+respectively, across audio and visual modalities. In particular, a conditional
+gating layer is designed to evaluate the contribution of the cross-attention
+mechanism and to choose cross-attended features only when they exhibit strong
+complementary relationships, and unattended features otherwise. Extensive
+experiments are conducted on the Voxceleb1 dataset to demonstrate the
+robustness of the proposed model. Results indicate that the proposed model
+consistently improves the performance on multiple variants of cross-attention
+while outperforming the state-of-the-art methods.
+
+
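+ As an illustration of the conditional gating idea described above (a sketch
+under assumed feature shapes, not the authors' implementation), a per-sample
+gate can decide how much of the cross-attended feature to keep versus the
+unattended one:
+
+import torch
+import torch.nn as nn
+
+class DynamicCrossAttentionGate(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.gate = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(),
+                                  nn.Linear(dim, 1), nn.Sigmoid())
+
+    def forward(self, unattended, cross_attended):
+        # unattended, cross_attended: (B, dim) features of one modality
+        g = self.gate(torch.cat([unattended, cross_attended], dim=1))  # (B, 1)
+        return g * cross_attended + (1.0 - g) * unattended             # gated mixture
+
+# usage (hypothetical names): fused = DynamicCrossAttentionGate(512)(a_feat, a_cross)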
+
+ comment: Accepted to FG2024 +
+
+
+
+
+ + ♻ ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown a powerful ability in conditional
+image synthesis. With large-scale vision-language pre-training, diffusion
+models are able to generate high-quality images with rich texture and
+reasonable structure under different text prompts. However, it is an open
+problem to adapt the pre-trained diffusion model for visual perception. In this
+paper, we propose an implicit and explicit language guidance framework for
+diffusion-based perception, named IEDP. Our IEDP comprises an implicit language
+guidance branch and an explicit language guidance branch. The implicit branch
+employs a frozen CLIP image encoder to directly generate implicit text
+embeddings that are fed to the diffusion model, without using explicit text
+prompts. The explicit branch utilizes the ground-truth labels of corresponding
+images as text prompts to condition feature extraction of the diffusion model.
+During training, we jointly train the diffusion model by sharing the model
+weights of these two branches. As a result, the implicit and explicit branches
+can jointly guide feature learning. During inference, we only employ the
+implicit branch for final prediction, which does not require any ground-truth
+labels. Experiments are performed on two typical perception tasks, including
+semantic segmentation and depth estimation. Our IEDP achieves promising
+performance on both tasks. For semantic segmentation, our IEDP achieves an
+mIoU$^\text{ss}$ score of 55.9% on the ADE20K validation set, which outperforms
+the baseline method VPD by 2.2%. For depth estimation, our IEDP outperforms the
+baseline method VPD with a relative gain of 11.0%.
+
+
+
+
+
+
+ + ♻ ☆ SE(3)-Equivariant and Noise-Invariant 3D Rigid Motion Tracking in Brain + MRI + + +
+ Rigid motion tracking is paramount in many medical imaging applications where +movements need to be detected, corrected, or accounted for. Modern strategies +rely on convolutional neural networks (CNN) and pose this problem as rigid +registration. Yet, CNNs do not exploit natural symmetries in this task, as they +are equivariant to translations (their outputs shift with their inputs) but not +to rotations. Here we propose EquiTrack, the first method that uses recent +steerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable +E-CNNs can extract corresponding features across different poses, testing them +on noisy medical images reveals that they do not have enough learning capacity +to learn noise invariance. Thus, we introduce a hybrid architecture that pairs +a denoiser with an E-CNN to decouple the processing of anatomically irrelevant +intensity features from the extraction of equivariant spatial features. Rigid +transforms are then estimated in closed-form. EquiTrack outperforms +state-of-the-art learning and optimisation methods for motion tracking in adult +brain MRI and fetal MRI time series. Our code is available at +https://github.com/BBillot/EquiTrack. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Think Twice Before Selection: Federated Evidential Active Learning for + Medical Image Analysis with Domain Shifts CVPR 2024 + + +
+ Federated learning facilitates the collaborative learning of a global model +across multiple distributed medical institutions without centralizing data. +Nevertheless, the expensive cost of annotation on local clients remains an +obstacle to effectively utilizing local data. To mitigate this issue, federated +active learning methods suggest leveraging local and global model predictions +to select a relatively small amount of informative local data for annotation. +However, existing methods mainly focus on all local data sampled from the same +domain, making them unreliable in realistic medical scenarios with domain +shifts among different clients. In this paper, we make the first attempt to +assess the informativeness of local data derived from diverse domains and +propose a novel methodology termed Federated Evidential Active Learning (FEAL) +to calibrate the data evaluation under domain shift. Specifically, we introduce +a Dirichlet prior distribution in both local and global models to treat the +prediction as a distribution over the probability simplex and capture both +aleatoric and epistemic uncertainties by using the Dirichlet-based evidential +model. Then we employ the epistemic uncertainty to calibrate the aleatoric +uncertainty. Afterward, we design a diversity relaxation strategy to reduce +data redundancy and maintain data diversity. Extensive experiments and analysis +on five real multi-center medical image datasets demonstrate the superiority of +FEAL over the state-of-the-art active learning methods in federated scenarios +with domain shifts. The code will be available at +https://github.com/JiayiChen815/FEAL. + +
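+ The Dirichlet-based quantities mentioned above can be written down compactly;
+the sketch below uses the standard evidential-deep-learning formulas and does
+not reproduce FEAL's calibration or diversity-relaxation steps.
+
+import torch
+
+def dirichlet_uncertainties(evidence):
+    """evidence: non-negative (B, K) outputs of an evidential head."""
+    alpha = evidence + 1.0                       # Dirichlet concentration parameters
+    s = alpha.sum(dim=1, keepdim=True)           # Dirichlet strength
+    prob = alpha / s                             # expected class probabilities
+    # aleatoric: expected entropy of the categorical likelihood under the Dirichlet
+    aleatoric = (prob * (torch.digamma(s + 1.0) - torch.digamma(alpha + 1.0))).sum(dim=1)
+    # epistemic: vacuity, large when the total collected evidence is small
+    epistemic = alpha.shape[1] / s.squeeze(1)
+    return prob, aleatoric, epistemic
+
+# usage (hypothetical): p, al, ep = dirichlet_uncertainties(torch.relu(model(x)))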
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid + Transformer and Contrastive Learning CVPR 2024 + + +
+ Defocus blur is a persistent problem in microscope imaging that poses harm to +pathology interpretation and medical intervention in cell microscopy and +microscope surgery. To address this problem, a unified framework including the +multi-pyramid transformer (MPT) and extended frequency contrastive +regularization (EFCR) is proposed to tackle two outstanding challenges in +microscopy deblur: longer attention span and data deficiency. The MPT employs +an explicit pyramid structure at each network stage that integrates the +cross-scale window attention (CSWA), the intra-scale channel attention (ISCA), +and the feature-enhancing feed-forward network (FEFN) to capture long-range +cross-scale spatial interaction and global channel context. The EFCR addresses +the data deficiency problem by exploring latent deblur signals from different +frequency bands. It also enables deblur knowledge transfer to learn +cross-domain information from extra data, improving deblur performance for +labeled and unlabeled data. Extensive experiments and downstream task +validation show the framework achieves state-of-the-art performance across +multiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Feedback Generation in Automated Skeletal Movement Assessment: + A Comprehensive Overview + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection and analysis from 2D or 3D videos. While the primary +objective of automatic assessment tasks is to score movements, the automatic +generation of feedback highlighting key movement issues has the potential to +significantly enhance and accelerate the rehabilitation process. While numerous +research works exist in the field of automatic movement assessment, only a +handful address feedback generation. In this study, we explain the types of +feedback that can be generated, review existing solutions for automatic +feedback generation, and discuss future research directions. To our knowledge, +this is the first comprehensive review of feedback generation in skeletal +movement assessment. + +
+
+
+
+
+ + ♻ ☆ Manga109Dialog: A Large-scale Dialogue Dataset for Comics Speaker + Detection ICME2024 + + +
+ The expanding market for e-comics has spurred interest in the development of +automated methods to analyze comics. For further understanding of comics, an +automated approach is needed to link text in comics to characters speaking the +words. Comics speaker detection research has practical applications, such as +automatic character assignment for audiobooks, automatic translation according +to characters' personalities, and inference of character relationships and +stories. + To deal with the problem of insufficient speaker-to-text annotations, we +created a new annotation dataset Manga109Dialog based on Manga109. +Manga109Dialog is the world's largest comics speaker annotation dataset, +containing 132,692 speaker-to-text pairs. We further divided our dataset into +different levels by prediction difficulties to evaluate speaker detection +methods more appropriately. Unlike existing methods mainly based on distances, +we propose a deep learning-based method using scene graph generation models. +Due to the unique features of comics, we enhance the performance of our +proposed model by considering the frame reading order. We conducted experiments +using Manga109Dialog and other datasets. Experimental results demonstrate that +our scene-graph-based approach outperforms existing methods, achieving a +prediction accuracy of over 75%. + +
+
+ comment: Accepted to ICME2024 +
+
+
+
+
+ + ♻ ☆ OccFusion: A Straightforward and Effective Multi-Sensor Fusion Framework + for 3D Occupancy Prediction + + +
+ This paper introduces OccFusion, a straightforward and efficient sensor +fusion framework for predicting 3D occupancy. A comprehensive understanding of +3D scenes is crucial in autonomous driving, and recent models for 3D semantic +occupancy prediction have successfully addressed the challenge of describing +real-world objects with varied shapes and classes. However, existing methods +for 3D occupancy prediction heavily rely on surround-view camera images, making +them susceptible to changes in lighting and weather conditions. By integrating +features from additional sensors, such as lidar and surround view radars, our +framework enhances the accuracy and robustness of occupancy prediction, +resulting in top-tier performance on the nuScenes benchmark. Furthermore, +extensive experiments conducted on the nuScenes dataset, including challenging +night and rainy scenarios, confirm the superior performance of our sensor +fusion strategy across various perception ranges. The code for this framework +will be made available at https://github.com/DanielMing123/OCCFusion. + +
+
+
+
+
+ + ♻ ☆ Seeing Text in the Dark: Algorithm and Benchmark + + +
+ Localizing text in low-light environments is challenging due to visual
+degradations. Although a straightforward solution involves a two-stage pipeline
+with low-light image enhancement (LLE) as the initial step followed by a
+detector, LLE is primarily designed for human vision instead of machine vision
+and can accumulate errors. In this work, we propose an efficient and effective
+single-stage approach for localizing text in the dark that circumvents the need
+for LLE. We introduce a constrained learning module as an auxiliary mechanism
+during the training stage of the text detector. This module is designed to
+guide the text detector in preserving textual spatial features amidst feature
+map resizing, thus minimizing the loss of spatial information in texts under
+low-light visual degradations. Specifically, we incorporate spatial
+reconstruction and spatial semantic constraints within this module to ensure
+the text detector acquires essential positional and contextual range knowledge.
+Our approach enhances the original text detector's ability to identify text's
+local topological features using a dynamic snake feature pyramid network and
+adopts a bottom-up contour shaping strategy with a novel rectangular
+accumulation technique for accurate delineation of streamlined text features.
+In addition, we present a comprehensive low-light dataset for arbitrary-shaped
+text, encompassing diverse scenes and languages. Notably, our method achieves
+state-of-the-art results on this low-light dataset and exhibits comparable
+performance on standard normal-light datasets. The code and dataset will be
+released.
+
+
+
+
+
+
+ + ♻ ☆ Bridging Stereo Geometry and BEV Representation with Reliable Mutual + Interaction for Semantic Scene Completion IJCAI2024 + + +
+ 3D semantic scene completion (SSC) is an ill-posed perception task that +requires inferring a dense 3D scene from limited observations. Previous +camera-based methods struggle to predict accurate semantic scenes due to +inherent geometric ambiguity and incomplete observations. In this paper, we +resort to stereo matching technique and bird's-eye-view (BEV) representation +learning to address such issues in SSC. Complementary to each other, stereo +matching mitigates geometric ambiguity with epipolar constraint while BEV +representation enhances the hallucination ability for invisible regions with +global semantic context. However, due to the inherent representation gap +between stereo geometry and BEV features, it is non-trivial to bridge them for +dense prediction task of SSC. Therefore, we further develop a unified +occupancy-based framework dubbed BRGScene, which effectively bridges these two +representations with dense 3D volumes for reliable semantic scene completion. +Specifically, we design a novel Mutual Interactive Ensemble (MIE) block for +pixel-level reliable aggregation of stereo geometry and BEV features. Within +the MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced +with confidence re-weighting, is employed to encourage fine-grained interaction +through mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is +introduced to facilitate complementary aggregation through channel-wise +recalibration and multi-group voting. Our method outperforms all published +camera-based methods on SemanticKITTI for semantic scene completion. Our code +is available on \url{https://github.com/Arlo0o/StereoScene}. + +
+
+ comment: IJCAI2024 (https://github.com/Arlo0o/StereoScene) +
+
+
+
+
+ + ♻ ☆ Unifying Feature and Cost Aggregation with Transformers for Semantic and + Visual Correspondence ICLR'24 + + +
+ This paper introduces a Transformer-based integrative feature and cost +aggregation network designed for dense matching tasks. In the context of dense +matching, many works benefit from one of two forms of aggregation: feature +aggregation, which pertains to the alignment of similar features, or cost +aggregation, a procedure aimed at instilling coherence in the flow estimates +across neighboring pixels. In this work, we first show that feature aggregation +and cost aggregation exhibit distinct characteristics and reveal the potential +for substantial benefits stemming from the judicious use of both aggregation +processes. We then introduce a simple yet effective architecture that harnesses +self- and cross-attention mechanisms to show that our approach unifies feature +aggregation and cost aggregation and effectively harnesses the strengths of +both techniques. Within the proposed attention layers, the features and cost +volume both complement each other, and the attention layers are interleaved +through a coarse-to-fine design to further promote accurate correspondence +estimation. Finally at inference, our network produces multi-scale predictions, +computes their confidence scores, and selects the most confident flow for final +prediction. Our framework is evaluated on standard benchmarks for semantic +matching, and also applied to geometric matching, where we show that our +approach achieves significant improvements compared to existing methods. + +
+
+ comment: Accepted by ICLR'24 +
+
+
+
+
+ + ♻ ☆ EGGS: Edge Guided Gaussian Splatting for Radiance Fields + + +
+ Gaussian splatting methods are becoming increasingly popular. However, their
+loss function only contains the $\ell_1$ norm and the structural similarity
+between the rendered and input images, without considering the edges in these
+images. It is well known that the edges in an image provide important
+information. Therefore, in this paper, we propose an Edge Guided Gaussian
+Splatting (EGGS) method that leverages the edges in the input images. More
+specifically, we give the edge region a higher weight than the flat region.
+With such edge guidance, the resulting Gaussian particles focus more on the
+edges instead of the flat regions. Moreover, such edge guidance does not
+increase the computation cost during the training and rendering stages. The
+experiments confirm that such a simple edge-weighted loss function indeed
+improves results by about $1\sim2$ dB on several different datasets. By simply
+plugging in the edge guidance, the proposed method can improve all Gaussian
+splatting methods in different scenarios, such as human head modeling and
+building 3D reconstruction.
+
+
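+ A minimal sketch of an edge-weighted photometric loss in this spirit (the
+Sobel edge map and the weight scale `lam` are illustrative assumptions, not
+the paper's exact formulation):
+
+import torch
+import torch.nn.functional as F
+
+def edge_weighted_l1(rendered, target, lam=2.0):
+    """rendered, target: (B, 3, H, W) images in [0, 1]."""
+    gray = target.mean(dim=1, keepdim=True)
+    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]],
+                      device=target.device).view(1, 1, 3, 3)
+    gx = F.conv2d(gray, kx, padding=1)
+    gy = F.conv2d(gray, kx.transpose(2, 3), padding=1)
+    edge = torch.sqrt(gx ** 2 + gy ** 2)                          # Sobel magnitude
+    weight = 1.0 + lam * edge / (edge.amax(dim=(2, 3), keepdim=True) + 1e-8)
+    return (weight * (rendered - target).abs()).mean()            # higher weight on edges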
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/atonderski/neuro-ncap + +
+
+
+
+
+ + ♻ ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of OCR technology and the expansion of +application fields, text recognition in complex scenes has become a key +challenge. Factors such as multiple fonts, mixed scenes and complex layouts +seriously affect the recognition accuracy of traditional OCR models. Although +OCR models based on deep learning have performed well in specific fields or +similar datasets in recent years, the generalization ability and robustness of +the model are still a big challenge when facing complex environments with +multiple scenes. Furthermore, training an OCR model from scratch or fine-tuning +all parameters is very demanding on computing resources and inference time, +which limits the flexibility of its application. This study focuses on a +fundamental aspect of mixed text recognition in response to the challenges +mentioned above, which involves effectively fine-tuning the pre-trained basic +OCR model to demonstrate exceptional performance across various downstream +tasks. To this end, we propose a parameter-efficient mixed text recognition +method based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method +embeds DoRA into the image encoder and LoRA into the internal structure of the +text decoder, enabling efficient parameter fine-tuning for downstream tasks. +Experimental results show that compared to similar parameter adjustment +methods, our model DLoRA-TrOCR has the smallest number of parameters and +performs better. It can achieve state-of-the-art performance on complex scene +datasets involving simultaneous recognition of mixed handwritten, printed and +street view texts. + +
+
+
+
+
+ + ♻ ☆ SAFDNet: A Simple and Effective Network for Fully Sparse 3D Object + Detection CVPR 2024 + + +
+ LiDAR-based 3D object detection plays an essential role in autonomous +driving. Existing high-performing 3D object detectors usually build dense +feature maps in the backbone network and prediction head. However, the +computational costs introduced by the dense feature maps grow quadratically as +the perception range increases, making these models hard to scale up to +long-range detection. Some recent works have attempted to construct fully +sparse detectors to solve this issue; nevertheless, the resulting models either +rely on a complex multi-stage pipeline or exhibit inferior performance. In this +work, we propose SAFDNet, a straightforward yet highly effective architecture, +tailored for fully sparse 3D object detection. In SAFDNet, an adaptive feature +diffusion strategy is designed to address the center feature missing problem. +We conducted extensive experiments on Waymo Open, nuScenes, and Argoverse2 +datasets. SAFDNet performed slightly better than the previous SOTA on the first +two datasets but much better on the last dataset, which features long-range +detection, verifying the efficacy of SAFDNet in scenarios where long-range +detection is required. Notably, on Argoverse2, SAFDNet surpassed the previous +best hybrid detector HEDNet by 2.6% mAP while being 2.1x faster, and yielded +2.1% mAP gains over the previous best sparse detector FSDv2 while being 1.3x +faster. The code will be available at https://github.com/zhanggang001/HEDNet. + +
+
+ comment: Accepted by CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Improved cryo-EM Pose Estimation and 3D Classification through + Latent-Space Disentanglement + + +
+ Due to the extremely low signal-to-noise ratio (SNR) and unknown poses +(projection angles and image shifts) in cryo-electron microscopy (cryo-EM) +experiments, reconstructing 3D volumes from 2D images is very challenging. In +addition to these challenges, heterogeneous cryo-EM reconstruction requires +conformational classification. In popular cryo-EM reconstruction algorithms, +poses and conformation classification labels must be predicted for every input +cryo-EM image, which can be computationally costly for large datasets. An +emerging class of methods adopted the amortized inference approach. In these +methods, only a subset of the input dataset is needed to train neural networks +for the estimation of poses and conformations. Once trained, these neural +networks can make pose/conformation predictions and 3D reconstructions at low +cost for the entire dataset during inference. Unfortunately, when facing +heterogeneous reconstruction tasks, it is hard for current +amortized-inference-based methods to effectively estimate the conformational +distribution and poses from entangled latent variables. Here, we propose a +self-supervised variational autoencoder architecture called "HetACUMN" based on +amortized inference. We employed an auxiliary conditional pose prediction task +by inverting the order of encoder-decoder to explicitly enforce the +disentanglement of conformation and pose predictions. Results on simulated +datasets show that HetACUMN generated more accurate conformational +classifications than other amortized or non-amortized methods. Furthermore, we +show that HetACUMN is capable of performing heterogeneous 3D reconstructions of +a real experimental dataset. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as full paper at FG 2024 main track +
+
+
+
+
+ + ♻ ☆ Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine + + +
+ Recent studies indicate that Generative Pre-trained Transformer 4 with Vision
+(GPT-4V) outperforms human physicians in medical challenge tasks. However,
+these evaluations primarily focused on the accuracy of multi-choice questions
+alone. Our study extends the current scope by conducting a comprehensive
+analysis of GPT-4V's rationales of image comprehension, recall of medical
+knowledge, and step-by-step multimodal reasoning when solving New England
+Journal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test
+the knowledge and diagnostic capabilities of medical professionals. Evaluation
+results confirmed that GPT-4V performs comparably to human physicians
+in multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in
+cases where physicians answer incorrectly, with over 78% accuracy. However, we
+discovered that GPT-4V frequently presents flawed rationales in cases where it
+makes the correct final choices (35.5%), most prominently in image comprehension
+(27.2%). Despite GPT-4V's high accuracy in multi-choice questions, our
+findings emphasize the necessity for further in-depth evaluations of its
+rationales before integrating such multimodal AI models into clinical
+workflows.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Multichannel Orthogonal Transform-Based Perceptron Layers for Efficient + ResNets + + +
+ In this paper, we propose a set of transform-based neural network layers as +an alternative to the $3\times3$ Conv2D layers in Convolutional Neural Networks +(CNNs). The proposed layers can be implemented based on orthogonal transforms +such as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and +biorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of +the convolution theorems, convolutional filtering operations are performed in +the transform domain using element-wise multiplications. Trainable +soft-thresholding layers, that remove noise in the transform domain, bring +nonlinearity to the transform domain layers. Compared to the Conv2D layer, +which is spatial-agnostic and channel-specific, the proposed layers are +location-specific and channel-specific. Moreover, these proposed layers reduce +the number of parameters and multiplications significantly while improving the +accuracy results of regular ResNets on the ImageNet-1K classification task. +Furthermore, they can be inserted with a batch normalization layer before the +global average pooling layer in the conventional ResNets as an additional layer +to improve classification accuracy. + +
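+ To make the transform-domain idea concrete, here is a minimal sketch of a
+Hadamard-transform layer with element-wise trainable scaling and trainable
+soft-thresholding (the exact layer composition, initialisation, and the DCT/BWT
+variants in the paper may differ; the spatial size must be a power of two):
+
+import torch
+import torch.nn as nn
+
+def hadamard(n):
+    h = torch.ones(1, 1)
+    while h.shape[0] < n:                                   # Sylvester construction
+        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
+    return h / (n ** 0.5)                                   # orthonormal HT matrix
+
+class HTPerceptronLayer(nn.Module):
+    def __init__(self, channels, size):
+        super().__init__()
+        self.register_buffer("h", hadamard(size))
+        self.scale = nn.Parameter(torch.ones(channels, size, size))  # transform-domain weights
+        self.thresh = nn.Parameter(torch.zeros(channels, 1, 1))      # soft-threshold levels
+
+    def forward(self, x):
+        # x: (B, C, N, N); filtering becomes element-wise multiplication in the HT domain
+        xf = self.h @ x @ self.h.t()
+        yf = xf * self.scale
+        yf = torch.sign(yf) * torch.relu(yf.abs() - self.thresh)     # soft-thresholding
+        return self.h.t() @ yf @ self.h                              # inverse transform
+
+# usage: out = HTPerceptronLayer(64, 32)(torch.randn(2, 64, 32, 32))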
+
+ comment: This work is accepted to IEEE Transactions on Neural Networks and + Learning Systems. The initial title is "Orthogonal Transform Domain + Approaches for the Convolutional Layer". We changed it to "Multichannel + Orthogonal Transform-Based Perceptron Layers for Efficient ResNets" based on + reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577 +
+
+
+
+
+ + ♻ ☆ Non-negative Contrastive Learning ICLR 2024 + + +
+ Deep representations have shown promising performance when transferred to +downstream tasks in a black-box manner. Yet, their inherent lack of +interpretability remains a significant challenge, as these features are often +opaque to human understanding. In this paper, we propose Non-negative +Contrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization +(NMF) aimed at deriving interpretable features. The power of NCL lies in its +enforcement of non-negativity constraints on features, reminiscent of NMF's +capability to extract features that align closely with sample clusters. NCL not +only aligns mathematically well with an NMF objective but also preserves NMF's +interpretability attributes, resulting in a more sparse and disentangled +representation compared to standard contrastive learning (CL). Theoretically, +we establish guarantees on the identifiability and downstream generalization of +NCL. Empirically, we show that these advantages enable NCL to outperform CL +significantly on feature disentanglement, feature selection, as well as +downstream classification tasks. At last, we show that NCL can be easily +extended to other learning scenarios and benefit supervised learning as well. +Code is available at https://github.com/PKU-ML/non_neg. + +
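+ The core constraint is easy to state in code: keep the contrastive objective
+and simply force the features to be non-negative. A minimal sketch follows (the
+ReLU placement and temperature are assumptions, not necessarily the paper's
+exact recipe):
+
+import torch
+import torch.nn.functional as F
+
+def non_negative_infonce(z1, z2, tau=0.5):
+    """z1, z2: (B, D) features of two augmented views of the same batch."""
+    z1 = F.normalize(torch.relu(z1), dim=1)       # non-negative, unit-norm features
+    z2 = F.normalize(torch.relu(z2), dim=1)
+    logits = z1 @ z2.t() / tau                    # (B, B) cosine similarities
+    labels = torch.arange(z1.shape[0], device=z1.device)
+    return F.cross_entropy(logits, labels)        # positives sit on the diagonal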
+
+ comment: 22 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Representing Anatomical Trees by Denoising Diffusion of Implicit Neural + Fields + + +
+ Anatomical trees play a central role in clinical diagnosis and treatment +planning. However, accurately representing anatomical trees is challenging due +to their varying and complex topology and geometry. Traditional methods for +representing tree structures, captured using medical imaging, while invaluable +for visualizing vascular and bronchial networks, exhibit drawbacks in terms of +limited resolution, flexibility, and efficiency. Recently, implicit neural +representations (INRs) have emerged as a powerful tool for representing shapes +accurately and efficiently. We propose a novel approach for representing +anatomical trees using INR, while also capturing the distribution of a set of +trees via denoising diffusion in the space of INRs. We accurately capture the +intricate geometries and topologies of anatomical trees at any desired +resolution. Through extensive qualitative and quantitative evaluation, we +demonstrate high-fidelity tree reconstruction with arbitrary resolution yet +compact storage, and versatility across anatomical sites and tree complexities. + +
+
+ comment: Preprint. In review. Code: https://github.com/sinAshish/TreeDiffusion +
+
+
+
+
+ + ♻ ☆ Solutions to Elliptic and Parabolic Problems via Finite Difference Based + Unsupervised Small Linear Convolutional Neural Networks + + +
+ In recent years, there has been a growing interest in leveraging deep +learning and neural networks to address scientific problems, particularly in +solving partial differential equations (PDEs). However, many neural +network-based methods like PINNs rely on auto differentiation and sampling +collocation points, leading to a lack of interpretability and lower accuracy +than traditional numerical methods. As a result, we propose a fully +unsupervised approach, requiring no training data, to estimate finite +difference solutions for PDEs directly via small linear convolutional neural +networks. Our proposed approach uses substantially fewer parameters than +similar finite difference-based approaches while also demonstrating comparable +accuracy to the true solution for several selected elliptic and parabolic +problems compared to the finite difference method. + +
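+ A minimal sketch of the idea for a 2D Poisson problem -Δu = f with zero
+Dirichlet boundary values: a small convolutional network proposes u on the
+grid, and the training loss is the residual of the 5-point finite-difference
+stencil, with no labelled data. The grid size, network width, and f are
+illustrative assumptions.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+n, h = 64, 1.0 / 63                                   # grid resolution and spacing
+net = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.Tanh(),
+                    nn.Conv2d(16, 16, 3, padding=1), nn.Tanh(),
+                    nn.Conv2d(16, 1, 3, padding=1))
+f = torch.ones(1, 1, n, n)                            # right-hand side
+lap = torch.tensor([[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]]).view(1, 1, 3, 3)
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+
+for _ in range(2000):
+    u = net(f)                                        # network maps f to a candidate u
+    u = F.pad(u[:, :, 1:-1, 1:-1], (1, 1, 1, 1))      # enforce u = 0 on the boundary
+    residual = -F.conv2d(u, lap, padding=1) / h ** 2 - f
+    loss = (residual[:, :, 1:-1, 1:-1] ** 2).mean()   # interior residual only
+    opt.zero_grad(); loss.backward(); opt.step()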
+
+ comment: Submitted to CMA, under review +
+
+
+
+
+ + ♻ ☆ Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by + Factoring the Real World CVPR + 2024 + + +
+ To achieve strong real world performance, neural networks must be trained on +large, diverse datasets; however, obtaining and annotating such datasets is +costly and time-consuming, particularly for 3D point clouds. In this paper, we +describe Paved2Paradise, a simple, cost-effective approach for generating fully +labeled, diverse, and realistic lidar datasets from scratch, all while +requiring minimal human annotation. Our key insight is that, by deliberately +collecting separate "background" and "object" datasets (i.e., "factoring the +real world"), we can intelligently combine them to produce a combinatorially +large and diverse training set. The Paved2Paradise pipeline thus consists of +four steps: (1) collecting copious background data, (2) recording individuals +from the desired object class(es) performing different behaviors in an isolated +environment (like a parking lot), (3) bootstrapping labels for the object +dataset, and (4) generating samples by placing objects at arbitrary locations +in backgrounds. To demonstrate the utility of Paved2Paradise, we generated +synthetic datasets for two tasks: (1) human detection in orchards (a task for +which no public data exists) and (2) pedestrian detection in urban +environments. Qualitatively, we find that a model trained exclusively on +Paved2Paradise synthetic data is highly effective at detecting humans in +orchards, including when individuals are heavily occluded by tree branches. +Quantitatively, a model trained on Paved2Paradise data that sources backgrounds +from KITTI performs comparably to a model trained on the actual dataset. These +results suggest the Paved2Paradise synthetic data pipeline can help accelerate +point cloud model development in sectors where acquiring lidar datasets has +previously been cost-prohibitive. + +
+
+ comment: Accepted to the Synthetic Data for Computer Vision workshop at CVPR + 2024 +
+
+
+
+
+ + ♻ ☆ FlashTex: Fast Relightable Mesh Texturing with LightControlNet + + +
+ Manually creating textures for 3D meshes is time-consuming, even for expert +visual content creators. We propose a fast approach for automatically texturing +an input 3D mesh based on a user-provided text prompt. Importantly, our +approach disentangles lighting from surface material/reflectance in the +resulting texture so that the mesh can be properly relit and rendered in any +lighting environment. We introduce LightControlNet, a new text-to-image model +based on the ControlNet architecture, which allows the specification of the +desired lighting as a conditioning image to the model. Our text-to-texture +pipeline then constructs the texture in two stages. The first stage produces a +sparse set of visually consistent reference views of the mesh using +LightControlNet. The second stage applies a texture optimization based on Score +Distillation Sampling (SDS) that works with LightControlNet to increase the +texture quality while disentangling surface material from lighting. Our +algorithm is significantly faster than previous text-to-texture methods, while +producing high-quality and relightable textures. + +
+
+ comment: Project page: https://flashtex.github.io/ +
+
+
+
+
+ + ♻ ☆ Holodeck: Language Guided Generation of 3D Embodied AI Environments CVPR 2024 + + +
+ 3D simulated environments play a critical role in Embodied AI, but their
+creation requires expertise and extensive manual effort, restricting their
+diversity and scope. To mitigate this limitation, we present Holodeck, a system
+that generates 3D environments to match a user-supplied prompt fully
+automatically. Holodeck can generate diverse scenes, e.g., arcades, spas, and
+museums, adjust designs for different styles, and can capture the semantics of
+complex queries such as "apartment for a researcher with a cat" and "office of
+a professor who is a fan of Star Wars". Holodeck leverages a large language
+model (i.e., GPT-4) for common sense knowledge about what the scene might look
+like and uses a large collection of 3D assets from Objaverse to populate the
+scene with diverse objects. To address the challenge of positioning objects
+correctly, we prompt GPT-4 to generate spatial relational constraints between
+objects and then optimize the layout to satisfy those constraints. Our
+large-scale human evaluation shows that annotators prefer Holodeck over
+manually designed procedural baselines in residential scenes and that Holodeck
+can produce high-quality outputs for diverse scene types. We also demonstrate
+an exciting application of Holodeck in Embodied AI, training agents to navigate
+in novel scenes like music rooms and daycares without human-constructed data,
+which is a significant step forward in developing general-purpose embodied
+agents.
+
+
+
+ comment: Published in CVPR 2024, 21 pages, 27 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ DAM: Dynamic Adapter Merging for Continual Video QA Learning + + +
+ We present a parameter-efficient method for continual video +question-answering (VidQA) learning. Our method, named DAM, uses the proposed +Dynamic Adapter Merging to (i) mitigate catastrophic forgetting, (ii) enable +efficient adaptation to continually arriving datasets, (iii) handle inputs from +unknown datasets during inference, and (iv) enable knowledge sharing across +similar dataset domains. Given a set of continually streaming VidQA datasets, +we sequentially train dataset-specific adapters for each dataset while freezing +the parameters of a large pretrained video-language backbone. During inference, +given a video-question sample from an unknown domain, our method first uses the +proposed non-parametric router function to compute a probability for each +adapter, reflecting how relevant that adapter is to the current video-question +input instance. Subsequently, the proposed dynamic adapter merging scheme +aggregates all the adapter weights into a new adapter instance tailored for +that particular test sample to compute the final VidQA prediction, mitigating +the impact of inaccurate router predictions and facilitating knowledge sharing +across domains. Our DAM model outperforms prior state-of-the-art continual +learning approaches by 9.1% while exhibiting 1.9% less forgetting on 6 VidQA +datasets spanning various domains. We further extend DAM to continual image +classification and image QA and outperform prior methods by a large margin. The +code is publicly available at: https://github.com/klauscc/DAM + +
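+ The per-sample merging step can be pictured as a probability-weighted average
+of adapter weights. The sketch below is an illustration under assumed shapes;
+the router shown (a softmax over negative distances to per-dataset feature
+centroids) is an assumption, not DAM's exact non-parametric router.
+
+import torch
+
+def merge_adapters(adapter_states, router_probs):
+    """adapter_states: list of state_dicts with identical keys; router_probs: (N,)."""
+    merged = {}
+    for key in adapter_states[0]:
+        stacked = torch.stack([sd[key] for sd in adapter_states], dim=0)  # (N, ...)
+        w = router_probs.view(-1, *([1] * (stacked.dim() - 1)))
+        merged[key] = (w * stacked).sum(dim=0)                            # weighted average
+    return merged
+
+def route(sample_feat, centroids, tau=1.0):
+    """sample_feat: (D,); centroids: (N, D) mean features of the N seen datasets."""
+    dists = ((centroids - sample_feat) ** 2).sum(dim=1)
+    return torch.softmax(-dists / tau, dim=0)                             # (N,) probabilities
+
+# usage (hypothetical): adapter.load_state_dict(merge_adapters(states, route(feat, cents)))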
+
+ comment: The first two authors contribute equally +
+
+
+
+
+ + ♻ ☆ IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under + Unknown Illumination + + +
+ This paper aims to recover object materials from posed images captured under +an unknown static lighting condition. Recent methods solve this task by +optimizing material parameters through differentiable physically based +rendering. However, due to the coupling between object geometry, materials, and +environment lighting, there is inherent ambiguity during the inverse rendering +process, preventing previous methods from obtaining accurate results. To +overcome this ill-posed problem, our key idea is to learn the material prior +with a generative model for regularizing the optimization process. We observe +that the general rendering equation can be split into diffuse and specular +shading terms, and thus formulate the material prior as diffusion models of +albedo and specular. Thanks to this design, our model can be trained using the +existing abundant 3D object data, and naturally acts as a versatile tool to +resolve the ambiguity when recovering material representations from RGB images. +In addition, we develop a coarse-to-fine training strategy that leverages +estimated materials to guide diffusion models to satisfy multi-view consistent +constraints, leading to more stable and accurate results. Extensive experiments +on real-world and synthetic datasets demonstrate that our approach achieves +state-of-the-art performance on material recovery. The code will be available +at https://zju3dv.github.io/IntrinsicAnything. + +
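+ For reference, the diffuse/specular split alluded to above can be written in
+standard rendering-equation notation (the paper's exact material
+parameterisation may differ):
+
+\[
+L_o(\omega_o) = \int_{\Omega} \Big( \underbrace{\tfrac{\mathbf{a}}{\pi}}_{\text{diffuse (albedo)}}
+  + \underbrace{f_s(\omega_i, \omega_o)}_{\text{specular}} \Big)\, L_i(\omega_i)\,
+  (n \cdot \omega_i)\, \mathrm{d}\omega_i,
+\]
+
+so that a separate diffusion prior can be learned for the albedo term and for
+the specular term.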
+
+ comment: Project page: https://zju3dv.github.io/IntrinsicAnything +
+
+
+
+
+ + ♻ ☆ SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient + Channels + + +
+ Pre-trained vision transformers provide strong representation benefits for
+various downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT)
+methods have been proposed, and their experiments demonstrate that tuning only
+1% of extra parameters can surpass full fine-tuning in low-data resource
+scenarios. However, these methods overlook the task-specific information when
+fine-tuning diverse downstream tasks. In this paper, we propose a simple yet
+effective method called "Salient Channel Tuning" (SCT) that leverages the
+task-specific information by forwarding the model with the task images to
+select partial channels in a feature map, which enables us to tune only 1/8 of
+the channels, leading to significantly lower parameter costs. SCT outperforms
+full fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark while adding
+only 0.11M parameters to the ViT-B, which is 780x fewer than its full
+fine-tuning counterpart. Furthermore, SCT surpasses other PEFT methods on
+domain generalization and few-shot learning with lower parameter costs,
+demonstrating our proposed tuning technique's strong capability and
+effectiveness in the low-data regime.
+
+
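+ A rough sketch of the channel-selection idea described above, assuming
+saliency is estimated from activation statistics of the task images and only
+the selected channels receive a learnable offset; the saliency criterion,
+backbone, and tuning ratio are illustrative assumptions, not the paper's exact
+procedure.
+
+import torch
+import torch.nn as nn
+
+@torch.no_grad()
+def select_salient_channels(backbone, task_images, ratio=0.125):
+    """Forward task images once; keep the top `ratio` channels by mean |activation|."""
+    feats = backbone(task_images)           # (N, C) pooled features, assumed
+    saliency = feats.abs().mean(dim=0)      # per-channel saliency score
+    k = max(1, int(ratio * feats.shape[1]))
+    return torch.topk(saliency, k).indices
+
+class SalientChannelTuner(nn.Module):
+    """Learnable offset applied only to the selected (salient) channels."""
+    def __init__(self, salient_idx):
+        super().__init__()
+        self.register_buffer("idx", salient_idx)
+        self.delta = nn.Parameter(torch.zeros(len(salient_idx)))
+
+    def forward(self, feats):               # feats: (N, C)
+        feats = feats.clone()
+        feats[:, self.idx] = feats[:, self.idx] + self.delta
+        return feats
+
+# Toy usage with a frozen linear "backbone" standing in for a ViT feature map.
+backbone = nn.Linear(32, 64).eval()
+images = torch.randn(16, 32)
+idx = select_salient_channels(backbone, images)
+tuner = SalientChannelTuner(idx)
+out = tuner(backbone(images))
+print(idx.numel(), out.shape)               # only idx.numel() scalars are tuned
+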
+
+ comment: This work has been accepted by IJCV2023 +
+
+
+
+
+ + ♻ ☆ Making Images Real Again: A Comprehensive Survey on Deep Image + Composition + + +
+ As a common image editing operation, image composition aims to combine the
+foreground from one image with another background image, resulting in a
+composite image. However, there are many issues that could make the composite
+images unrealistic. These issues can be summarized as the inconsistency between
+foreground and background, which includes appearance inconsistency (e.g.,
+incompatible illumination), geometry inconsistency (e.g., unreasonable size),
+and semantic inconsistency (e.g., mismatched semantic context). The image
+composition task can be decomposed into multiple sub-tasks, in which each
+sub-task targets one or more issues. Specifically, object placement aims to
+find a reasonable scale, location, and shape for the foreground. Image blending
+aims to address the unnatural boundary between foreground and background. Image
+harmonization aims to adjust the illumination statistics of the foreground.
+Shadow generation aims to generate a plausible shadow for the foreground. These
+sub-tasks can be executed sequentially or in parallel to acquire realistic
+composite images. To the best of our knowledge, there is no previous survey on
+image composition. In this paper, we conduct a comprehensive survey of the
+sub-tasks and the combinatorial task of image composition. For each one, we
+summarize the existing methods, available datasets, and common evaluation
+metrics. Datasets and codes for image composition are summarized at
+https://github.com/bcmi/Awesome-Image-Composition. We have also contributed the
+first image composition toolbox: libcom https://github.com/bcmi/libcom, which
+assembles 10+ image composition related functions (e.g., image blending, image
+harmonization, object placement, shadow generation, generative composition).
+The ultimate goal of this toolbox is solving all the problems related to image
+composition with a simple `import libcom`.
+
+
+
+
+
+
+ + ♻ ☆ A Concise but High-performing Network for Image Guided Depth Completion + in Autonomous Driving + + +
+ Depth completion is a crucial task in autonomous driving, aiming to convert a
+sparse depth map into a dense depth prediction. Due to its potentially rich
+semantic information, the RGB image is commonly fused to enhance the completion
+effect. Image-guided depth completion involves three key challenges: 1) how to
+effectively fuse the two modalities; 2) how to better recover depth
+information; and 3) how to achieve real-time prediction for practical
+autonomous driving. To solve the above problems, we propose a concise but
+effective network, named CENet, to achieve high-performance depth completion
+with a simple and elegant structure. Firstly, we use a fast guidance module to
+fuse the two sensor features, utilizing abundant auxiliary features extracted
+from the color space. Unlike other commonly used complicated guidance modules,
+our approach is intuitive and low-cost. In addition, we find and analyze the
+optimization inconsistency problem for observed and unobserved positions, and a
+decoupled depth prediction head is proposed to alleviate the issue. The
+proposed decoupled head can better output the depth of valid and invalid
+positions with very little extra inference time. Based on the simple structure
+of dual-encoder and single-decoder, our CENet achieves a superior balance
+between accuracy and efficiency. In the KITTI depth completion benchmark, our
+CENet attains competitive performance and inference speed compared with the
+state-of-the-art methods. To validate the generalization of our method, we also
+evaluate it on the indoor NYUv2 dataset, where CENet still achieves impressive
+results. The code of this work will be available at
+https://github.com/lmomoy/CHNet.
+
+
+
+
+
+
+ + ♻ ☆ HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances + + +
+ Text-to-image generative models can generate high-quality humans, but realism +is lost when generating hands. Common artifacts include irregular hand poses, +shapes, incorrect numbers of fingers, and physically implausible finger +orientations. To generate images with realistic hands, we propose a novel +diffusion-based architecture called HanDiffuser that achieves realism by +injecting hand embeddings in the generative process. HanDiffuser consists of +two components: a Text-to-Hand-Params diffusion model to generate SMPL-Body and +MANO-Hand parameters from input text prompts, and a Text-Guided +Hand-Params-to-Image diffusion model to synthesize images by conditioning on +the prompts and hand parameters generated by the previous component. We +incorporate multiple aspects of hand representation, including 3D shapes and +joint-level finger positions, orientations and articulations, for robust +learning and reliable performance during inference. We conduct extensive +quantitative and qualitative experiments and perform user studies to +demonstrate the efficacy of our method in generating images with high-quality +hands. + +
+
+ comment: Revisions: 1. Added a link to project page in the abstract, 2. + Updated references and related work, 3. Fixed some grammatical errors +
+
+
+
+
+ + ♻ ☆ Choosing Wisely and Learning Deeply: Selective Cross-Modality + Distillation via CLIP for Domain Generalization + + +
+ Domain Generalization (DG), a crucial research area, seeks to train models +across multiple domains and test them on unseen ones. In this paper, we +introduce a novel approach, namely, Selective Cross-Modality Distillation for +Domain Generalization (SCMD). SCMD leverages the capabilities of large +vision-language models, specifically CLIP, to train a more efficient model, +ensuring it acquires robust generalization capabilities across unseen domains. +Our primary contribution is a unique selection framework strategically designed +to identify hard-to-learn samples for distillation. In parallel, we introduce a +novel cross-modality module that seamlessly combines the projected features of +the student model with the text embeddings from CLIP, ensuring the alignment of +similarity distributions. We assess SCMD's performance on various benchmarks, +where it empowers a ResNet50 to deliver state-of-the-art performance, +surpassing existing domain generalization methods. Furthermore, we provide a +theoretical analysis of our selection strategy, offering deeper insight into +its effectiveness and potential in the field of DG. + +
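+ A condensed sketch of the cross-modality alignment described above, assuming
+the student's projected features and CLIP text embeddings are compared through
+softmax similarity distributions matched with a KL term; the projector,
+temperature, and omitted hard-sample selection are simplified stand-ins rather
+than the paper's exact design.
+
+import torch
+import torch.nn.functional as F
+
+def similarity_distribution(features, text_embeddings, temperature=0.07):
+    """Log-softmax over cosine similarities between samples and class texts."""
+    features = F.normalize(features, dim=-1)
+    text_embeddings = F.normalize(text_embeddings, dim=-1)
+    logits = features @ text_embeddings.t() / temperature   # (N, num_classes)
+    return F.log_softmax(logits, dim=-1)
+
+def cross_modality_distillation_loss(student_feats, projector,
+                                     clip_image_feats, clip_text_embeds):
+    """Align the student's text-similarity distribution with CLIP's (teacher)."""
+    student_log_p = similarity_distribution(projector(student_feats), clip_text_embeds)
+    with torch.no_grad():
+        teacher_log_p = similarity_distribution(clip_image_feats, clip_text_embeds)
+    return F.kl_div(student_log_p, teacher_log_p, log_target=True,
+                    reduction="batchmean")
+
+# Toy shapes: ResNet-50 features (2048-d), CLIP space (512-d), 7 classes.
+projector = torch.nn.Linear(2048, 512)
+loss = cross_modality_distillation_loss(
+    torch.randn(8, 2048), projector, torch.randn(8, 512), torch.randn(7, 512))
+print(loss.item())
+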
+
+
+
+
+ + ♻ ☆ GhostNetV3: Exploring the Training Strategies for Compact Models + + +
+ Compact neural networks are specially designed for applications on edge
+devices with faster inference speed yet modest performance. However, training
+strategies for compact models are currently borrowed from those of conventional
+models, which ignores their difference in model capacity and thus may impede
+the performance of compact models. In this paper, by systematically
+investigating the impact of different training ingredients, we introduce a
+strong training strategy for compact models. We find that the appropriate
+designs of re-parameterization and knowledge distillation are crucial for
+training high-performance compact models, while some commonly used data
+augmentations for training conventional models, such as Mixup and CutMix, lead
+to worse performance. Our experiments on the ImageNet-1K dataset demonstrate
+that our specialized training strategy for compact models is applicable to
+various architectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2.
+Specifically, equipped with our strategy, GhostNetV3 1.3x achieves a top-1
+accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile
+devices, surpassing its ordinarily trained counterpart by a large margin.
+Moreover, our observation can also be extended to object detection scenarios.
+PyTorch code and checkpoints can be found at
+https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch.
+
+
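+ The distillation ingredient mentioned above is the standard temperature-scaled
+formulation; a minimal version (illustrative, not the exact recipe or
+hyper-parameters used for GhostNetV3) looks like this:
+
+import torch
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.9):
+    """Soft-target distillation mixed with the usual cross-entropy."""
+    soft = F.kl_div(
+        F.log_softmax(student_logits / T, dim=-1),
+        F.softmax(teacher_logits / T, dim=-1),
+        reduction="batchmean",
+    ) * (T * T)
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1.0 - alpha) * hard
+
+# Toy batch: 8 samples, 1000 ImageNet classes.
+s, t = torch.randn(8, 1000), torch.randn(8, 1000)
+y = torch.randint(0, 1000, (8,))
+print(kd_loss(s, t, y).item())
+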
+
+
+
+
+ + ♻ ☆ Bayesian Diffusion Models for 3D Shape Reconstruction CVPR 2024 + + +
+ We present Bayesian Diffusion Models (BDM), a prediction algorithm that +performs effective Bayesian inference by tightly coupling the top-down (prior) +information with the bottom-up (data-driven) procedure via joint diffusion +processes. We show the effectiveness of BDM on the 3D shape reconstruction +task. Compared to prototypical deep learning data-driven approaches trained on +paired (supervised) data-labels (e.g. image-point clouds) datasets, our BDM +brings in rich prior information from standalone labels (e.g. point clouds) to +improve the bottom-up 3D reconstruction. As opposed to the standard Bayesian +frameworks where explicit prior and likelihood are required for the inference, +BDM performs seamless information fusion via coupled diffusion processes with +learned gradient computation networks. The specialty of our BDM lies in its +capability to engage the active and effective information exchange and fusion +of the top-down and bottom-up processes where each itself is a diffusion +process. We demonstrate state-of-the-art results on both synthetic and +real-world benchmarks for 3D shape reconstruction. + +
+
+ comment: Accepted to CVPR 2024; Project Page: https://mlpc-ucsd.github.io/BDM/ +
+
+
+
+
+ + ♻ ☆ IterInv: Iterative Inversion for Pixel-Level T2I Models ICME 2024 + + +
+ Large-scale text-to-image diffusion models have been a ground-breaking
+development in generating convincing images following an input text prompt. The
+goal of image editing research is to give users control over the generated
+images by modifying the text prompt. Current image editing techniques
+predominantly hinge on DDIM inversion as a prevalent practice rooted in Latent
+Diffusion Models (LDM). However, the large pretrained T2I models working on the
+latent space suffer from losing details due to the first compression stage with
+an autoencoder mechanism. Instead, other mainstream T2I pipelines working at
+the pixel level, such as Imagen and DeepFloyd-IF, circumvent the above problem.
+They are commonly composed of multiple stages, typically starting with a
+text-to-image stage and followed by several super-resolution stages. In this
+pipeline, the DDIM inversion fails to find the initial noise and generate the
+original image given that the super-resolution diffusion models are not
+compatible with the DDIM technique. According to our experimental findings,
+iteratively concatenating the noisy image as the condition is the root of this
+problem. Based on this observation, we develop an iterative inversion (IterInv)
+technique for this category of T2I models and verify IterInv with the
+open-source DeepFloyd-IF model. Specifically, IterInv employs NTI for the
+inversion and reconstruction of the low-resolution image generation stage. In
+stages 2 and 3, we update the latent variance at each timestep to find the
+deterministic inversion trace and promote the reconstruction process. By
+combining our method with a popular image editing method, we demonstrate the
+application prospects of IterInv. The code is available at
+https://github.com/Tchuanm/IterInv.git.
+
+
+
+ comment: Accepted paper at ICME 2024 +
+
+
+
+
+ + ♻ ☆ DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images + + +
+ In recent years, deep learning (DL) has shown great potential in the field of +dermatological image analysis. However, existing datasets in this domain have +significant limitations, including a small number of image samples, limited +disease conditions, insufficient annotations, and non-standardized image +acquisitions. To address these shortcomings, we propose a novel framework +called DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured +meshes of human subjects using a differentiable renderer and generates 2D +images from various camera viewpoints under chosen lighting conditions in +diverse background scenes. Our method adheres to top-down rules that constrain +the blending and rendering process to create 2D images with skin conditions +that mimic in-the-wild acquisitions, ensuring more meaningful results. The +framework generates photo-realistic 2D dermoscopy images and the corresponding +dense annotations for semantic segmentation of the skin, skin conditions, body +parts, bounding boxes around lesions, depth maps, and other 3D scene +parameters, such as camera position and lighting conditions. DermSynth3D allows +for the creation of custom datasets for various dermatology tasks. We +demonstrate the effectiveness of data generated using DermSynth3D by training +DL models on synthetic data and evaluating them on various dermatology tasks +using real 2D dermatological images. We make our code publicly available at +https://github.com/sfu-mial/DermSynth3D. + +
+
+ comment: Accepted to Medical Image Analysis (MedIA) 2024 +
+
+
+
+
+ + ♻ ☆ End-to-end Autonomous Driving: Challenges and Frontiers + + +
+ The autonomous driving community has witnessed a rapid growth in approaches
+that embrace an end-to-end algorithm framework, utilizing raw sensor input to
+generate vehicle motion plans, instead of concentrating on individual tasks
+such as detection and motion prediction. End-to-end systems, in comparison to
+modular pipelines, benefit from joint feature optimization for perception and
+planning. This field has flourished due to the availability of large-scale
+datasets, closed-loop evaluation, and the increasing need for autonomous
+driving algorithms to perform effectively in challenging scenarios. In this
+survey, we provide a comprehensive analysis of more than 270 papers, covering
+the motivation, roadmap, methodology, challenges, and future trends in
+end-to-end autonomous driving. We delve into several critical challenges,
+including multi-modality, interpretability, causal confusion, robustness, and
+world models, amongst others. Additionally, we discuss current advancements in
+foundation models and visual pre-training, as well as how to incorporate these
+techniques within the end-to-end driving framework. We maintain an active
+repository that contains up-to-date literature and open-source projects at
+https://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.
+
+
+
+
+
+
+ + ♻ ☆ Deep Feature Statistics Mapping for Generalized Screen Content Image + Quality Assessment + + +
+ The statistical regularities of natural images, referred to as natural scene +statistics, play an important role in no-reference image quality assessment. +However, it has been widely acknowledged that screen content images (SCIs), +which are typically computer generated, do not hold such statistics. Here we +make the first attempt to learn the statistics of SCIs, based upon which the +quality of SCIs can be effectively determined. The underlying mechanism of the +proposed approach is based upon the mild assumption that the SCIs, which are +not physically acquired, still obey certain statistics that could be understood +in a learning fashion. We empirically show that the statistics deviation could +be effectively leveraged in quality assessment, and the proposed method is +superior when evaluated in different settings. Extensive experimental results +demonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA) +model delivers promising performance compared with existing NR-IQA models and +shows a high generalization capability in the cross-dataset settings. The +implementation of our method is publicly available at +https://github.com/Baoliang93/DFSS-IQA. + +
+
+
+
+
+ + ♻ ☆ Generative Modelling with High-Order Langevin Dynamics + + +
+ Diffusion generative modelling (DGM) based on stochastic differential
+equations (SDEs) with score matching has achieved unprecedented results in data
+generation. In this paper, we propose a novel fast high-quality generative
+modelling method based on high-order Langevin dynamics (HOLD) with score
+matching. This design is motivated by an analysis of third-order Langevin
+dynamics. By augmenting the previous SDEs, e.g., variance-exploding or
+variance-preserving SDEs for single-data variable processes, HOLD can
+simultaneously model position, velocity, and acceleration, thereby improving
+the quality and speed of the data generation at the same time. HOLD is composed
+of one Ornstein-Uhlenbeck process and two Hamiltonians, which reduce the mixing
+time by two orders of magnitude. Empirical experiments for unconditional image
+generation on the public datasets CIFAR-10 and CelebA-HQ show significant gains
+in both Frechet inception distance (FID) and negative log-likelihood, achieving
+a state-of-the-art FID of 1.85 on CIFAR-10.
+
+
+
+ comment: Some of the results in this paper have been published or accepted at + conferences such as wacv2024, icassp2024, and icme2024 +
+
+
+
+
+ + ♻ ☆ GestaltMML: Enhancing Rare Genetic Disease Diagnosis through Multimodal + Machine Learning Combining Facial Images and Clinical Texts + + +
+ Individuals with suspected rare genetic disorders often undergo multiple +clinical evaluations, imaging studies, laboratory tests and genetic tests, to +find a possible answer over a prolonged period of time. Addressing this +"diagnostic odyssey" thus has substantial clinical, psychosocial, and economic +benefits. Many rare genetic diseases have distinctive facial features, which +can be used by artificial intelligence algorithms to facilitate clinical +diagnosis, in prioritizing candidate diseases to be further examined by lab +tests or genetic assays, or in helping the phenotype-driven reinterpretation of +genome/exome sequencing data. Existing methods using frontal facial photos were +built on conventional Convolutional Neural Networks (CNNs), rely exclusively on +facial images, and cannot capture non-facial phenotypic traits and demographic +information essential for guiding accurate diagnoses. Here we introduce +GestaltMML, a multimodal machine learning (MML) approach solely based on the +Transformer architecture. It integrates facial images, demographic information +(age, sex, ethnicity), and clinical notes (optionally, a list of Human +Phenotype Ontology terms) to improve prediction accuracy. Furthermore, we also +evaluated GestaltMML on a diverse range of datasets, including 528 diseases +from the GestaltMatcher Database, several in-house datasets of +Beckwith-Wiedemann syndrome (BWS, over-growth syndrome with distinct facial +features), Sotos syndrome (overgrowth syndrome with overlapping features with +BWS), NAA10-related neurodevelopmental syndrome, Cornelia de Lange syndrome +(multiple malformation syndrome), and KBG syndrome (multiple malformation +syndrome). Our results suggest that GestaltMML effectively incorporates +multiple modalities of data, greatly narrowing candidate genetic diagnoses of +rare diseases and may facilitate the reinterpretation of genome/exome +sequencing data. + +
+
+ comment: Significant revisions +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 72 + +
+
+
+ + ☆ Enforcing Conditional Independence for Fair Representation Learning and + Causal Image Generation CVPR + + +
+ Conditional independence (CI) constraints are critical for defining and +evaluating fairness in machine learning, as well as for learning unconfounded +or causal representations. Traditional methods for ensuring fairness either +blindly learn invariant features with respect to a protected variable (e.g., +race when classifying sex from face images) or enforce CI relative to the +protected attribute only on the model output (e.g., the sex label). Neither of +these methods are effective in enforcing CI in high-dimensional feature spaces. +In this paper, we focus on a nascent approach characterizing the CI constraint +in terms of two Jensen-Shannon divergence terms, and we extend it to +high-dimensional feature spaces using a novel dynamic sampling strategy. In +doing so, we introduce a new training paradigm that can be applied to any +encoder architecture. We are able to enforce conditional independence of the +diffusion autoencoder latent representation with respect to any protected +attribute under the equalized odds constraint and show that this approach +enables causal image generation with controllable latent spaces. Our +experimental results demonstrate that our approach can achieve high accuracy on +downstream tasks while upholding equality of odds. + +
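+ For a binary protected attribute $A$, the conditional independence
+$Z \perp A \mid Y$ required above is equivalent to $p(z \mid a, y) = p(z \mid y)$
+for all $a$ and $y$. A generic divergence-based relaxation (not necessarily the
+paper's exact objective) penalizes two Jensen-Shannon terms:
+
+  \mathcal{L}_{\mathrm{CI}} = \mathbb{E}_{y}\big[ D_{\mathrm{JS}}\!\left(p(z \mid a{=}0, y)\,\|\,p(z \mid y)\right)
+                            + D_{\mathrm{JS}}\!\left(p(z \mid a{=}1, y)\,\|\,p(z \mid y)\right) \big],
+
+ which vanishes exactly when the representation satisfies the equalized-odds
+style constraint; the dynamic sampling strategy mentioned above targets the
+difficulty of estimating such terms in a high-dimensional latent space.
+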
+
+ comment: To appear at the 2024 IEEE CVPR Workshop on Fair, Data-Efficient, and + Trusted Computer Vision +
+
+
+
+
+ + ☆ Universal Fingerprint Generation: Controllable Diffusion Model with + Multimodal Conditions + + +
+ The utilization of synthetic data for fingerprint recognition has garnered
+increased attention due to its potential to alleviate privacy concerns
+surrounding sensitive biometric data. However, current methods for generating
+fingerprints have limitations in creating impressions of the same finger with
+useful intra-class variations. To tackle this challenge, we present GenPrint, a
+framework to produce fingerprint images of various types while maintaining
+identity and offering humanly understandable control over different appearance
+factors such as fingerprint class, acquisition type, sensor device, and quality
+level. Unlike previous fingerprint generation approaches, GenPrint is not
+confined to replicating style characteristics from the training dataset alone:
+it enables the generation of novel styles from unseen devices without requiring
+additional fine-tuning. To accomplish these objectives, we developed GenPrint
+using latent diffusion models with multimodal conditions (text and image) for
+consistent generation of style and identity. Our experiments leverage a variety
+of publicly available datasets for training and evaluation. Results demonstrate
+the benefits of GenPrint in terms of identity preservation, explainable
+control, and universality of generated images. Importantly, models trained on
+GenPrint-generated images yield comparable or even superior accuracy to models
+trained solely on real data, and the generated images further enhance
+performance when used to augment the diversity of existing real fingerprint
+datasets.
+
+
+
+
+
+
+ + ☆ AnyPattern: Towards In-context Image Copy Detection + + +
+ This paper explores in-context learning for image copy detection (ICD), i.e., +prompting an ICD model to identify replicated images with new tampering +patterns without the need for additional training. The prompts (or the +contexts) are from a small set of image-replica pairs that reflect the new +patterns and are used at inference time. Such in-context ICD has good realistic +value, because it requires no fine-tuning and thus facilitates fast reaction +against the emergence of unseen patterns. To accommodate the "seen +$\rightarrow$ unseen" generalization scenario, we construct the first +large-scale pattern dataset named AnyPattern, which has the largest number of +tamper patterns ($90$ for training and $10$ for testing) among all the existing +ones. We benchmark AnyPattern with popular ICD methods and reveal that existing +methods barely generalize to novel tamper patterns. We further propose a simple +in-context ICD method named ImageStacker. ImageStacker learns to select the +most representative image-replica pairs and employs them as the pattern prompts +in a stacking manner (rather than the popular concatenation manner). +Experimental results show (1) training with our large-scale dataset +substantially benefits pattern generalization ($+26.66 \%$ $\mu AP$), (2) the +proposed ImageStacker facilitates effective in-context ICD (another round of +$+16.75 \%$ $\mu AP$), and (3) AnyPattern enables in-context ICD, i.e. without +such a large-scale dataset, in-context learning does not emerge even with our +ImageStacker. The project (including the proposed dataset AnyPattern and the +code for ImageStacker) is publicly available at https://anypattern.github.io +under the MIT Licence. + +
+
+
+
+
+ + ☆ Iteratively Prompting Multimodal LLMs to Reproduce Natural and + AI-Generated Images + + +
+ With the digital imagery landscape rapidly evolving, image stocks and +AI-generated image marketplaces have become central to visual media. +Traditional stock images now exist alongside innovative platforms that trade in +prompts for AI-generated visuals, driven by sophisticated APIs like DALL-E 3 +and Midjourney. This paper studies the possibility of employing multi-modal +models with enhanced visual understanding to mimic the outputs of these +platforms, introducing an original attack strategy. Our method leverages +fine-tuned CLIP models, a multi-label classifier, and the descriptive +capabilities of GPT-4V to create prompts that generate images similar to those +available in marketplaces and from premium stock image providers, yet at a +markedly lower expense. In presenting this strategy, we aim to spotlight a new +class of economic and security considerations within the realm of digital +imagery. Our findings, supported by both automated metrics and human +assessment, reveal that comparable visual content can be produced for a +fraction of the prevailing market prices ($0.23 - $0.27 per image), emphasizing +the need for awareness and strategic discussions about the integrity of digital +media in an increasingly AI-integrated landscape. Our work also contributes to +the field by assembling a dataset consisting of approximately 19 million +prompt-image pairs generated by the popular Midjourney platform, which we plan +to release publicly. + +
+
+
+
+
+ + ☆ EncodeNet: A Framework for Boosting DNN Accuracy with Entropy-driven + Generalized Converting Autoencoder + + +
+ Image classification is a fundamental task in computer vision, and the quest
+to enhance DNN accuracy without inflating model size or latency remains a
+pressing concern. We make a couple of advances in this regard, leading to a
+novel EncodeNet design and training framework. The first advancement involves
+Converting Autoencoders, a novel approach that transforms an image into an
+easy-to-classify image of its class. Our prior work that applied the Converting
+Autoencoder and a simple classifier in tandem achieved moderate accuracy over
+simple datasets, such as MNIST and FMNIST. However, on more complex datasets
+like CIFAR-10, the Converting Autoencoder has a large reconstruction loss,
+making it unsuitable for enhancing DNN accuracy. To address these limitations,
+we generalize the design of Converting Autoencoders by leveraging a larger
+class of DNNs, those with architectures comprising feature extraction layers
+followed by classification layers. We incorporate a generalized algorithmic
+design of the Converting Autoencoder and intraclass clustering to identify
+representative images, leading to optimized image feature learning. Next, we
+demonstrate the effectiveness of our EncodeNet design and training framework,
+improving the accuracy of well-trained baseline DNNs while maintaining the
+overall model size. EncodeNet's building blocks comprise the trained encoder
+from our generalized Converting Autoencoders transferring knowledge to a
+lightweight classifier network - also extracted from the baseline DNN. Our
+experimental results demonstrate that EncodeNet improves the accuracy of VGG16
+from 92.64% to 94.05% on CIFAR-10 and ResNet20 from 74.56% to 76.04% on
+CIFAR-100. It outperforms state-of-the-art techniques that rely on knowledge
+distillation and attention mechanisms, delivering higher accuracy for models of
+comparable size.
+
+
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Autonomous Robot for Disaster Mapping and Victim Localization + + +
+ In response to the critical need for effective reconnaissance in disaster +scenarios, this research article presents the design and implementation of a +complete autonomous robot system using the Turtlebot3 with Robotic Operating +System (ROS) Noetic. Upon deployment in closed, initially unknown environments, +the system aims to generate a comprehensive map and identify any present +'victims' using AprilTags as stand-ins. We discuss our solution for search and +rescue missions, while additionally exploring more advanced algorithms to +improve search and rescue functionalities. We introduce a Cubature Kalman +Filter to help reduce the mean squared error [m] for AprilTag localization and +an information-theoretic exploration algorithm to expedite exploration in +unknown environments. Just like turtles, our system takes it slow and steady, +but when it's time to save the day, it moves at ninja-like speed! Despite +Donatello's shell, he's no slowpoke - he zips through obstacles with the +agility of a teenage mutant ninja turtle. So, hang on tight to your shells and +get ready for a whirlwind of reconnaissance! + Full pipeline code https://github.com/rzhao5659/MRProject/tree/main + Exploration code https://github.com/rzhao5659/MRProject/tree/main + +
+
+ comment: Class final project for Northeastern University EECE 5550 Mobile + Robotics Course +
+
+
+
+
+ + ☆ Object-Attribute Binding in Text-to-Image Generation: Evaluation and + Control + + +
+ Current diffusion models create photorealistic images given a text prompt as
+input but struggle to correctly bind attributes mentioned in the text to the
+right objects in the image. This is evidenced by our novel image-graph
+alignment model called EPViT (Edge Prediction Vision Transformer) for the
+evaluation of image-text alignment. To alleviate the above problem, we propose
+focused cross-attention (FCA) that controls the visual attention maps by
+syntactic constraints found in the input sentence. Additionally, the syntax
+structure of the prompt helps to disentangle the multimodal CLIP embeddings
+that are commonly used in T2I generation. The resulting DisCLIP embeddings and
+FCA are easily integrated in state-of-the-art diffusion models without
+additional training of these models. We show substantial improvements in T2I
+generation and especially its attribute-object binding on several datasets.
+Code and data will be made available upon acceptance.
+
+
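+ A toy sketch of the mechanism behind focused cross-attention: a binary mask
+derived from the prompt's syntax (supplied by hand here) restricts which text
+tokens each image query may attend to. The mask construction, shapes, and the
+two-noun-phrase example are illustrative assumptions only.
+
+import torch
+
+def focused_cross_attention(q, k, v, syntax_mask):
+    """q: (N, Lq, d) image queries; k, v: (N, Lt, d) text tokens.
+    syntax_mask: (N, Lq, Lt) with 1 where attention is allowed."""
+    d = q.shape[-1]
+    logits = q @ k.transpose(-2, -1) / d ** 0.5            # (N, Lq, Lt)
+    logits = logits.masked_fill(syntax_mask == 0, float("-inf"))
+    attn = torch.softmax(logits, dim=-1)
+    return attn @ v
+
+# Toy example: 16 image queries, 6 text tokens covering two noun phrases.
+N, Lq, Lt, d = 1, 16, 6, 32
+q, k, v = torch.randn(N, Lq, d), torch.randn(N, Lt, d), torch.randn(N, Lt, d)
+mask = torch.ones(N, Lq, Lt)
+mask[:, :8, 3:] = 0   # first half of the queries: only tokens of noun phrase 1
+mask[:, 8:, :3] = 0   # second half: only tokens of noun phrase 2
+print(focused_cross_attention(q, k, v, mask).shape)        # (1, 16, 32)
+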
+
+
+
+
+ + ☆ BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark + + +
+ Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI) +data is typically trained and evaluated on private medical data, which makes +comparing deep learning approaches difficult. We propose a benchmark +(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly +available MRI datasets. The benchmark consists of four datasets in total, where +two datasets are used for supervised training and evaluation, and two are used +for zero-shot evaluation. Additionally we compare state-of-the-art (SOTA) +approaches on our benchmark and provide an exhaustive list of available public +breast cancer MRI datasets. The source code has been made available at +https://irulenot.github.io/BC_MRI_SEG_Benchmark. + +
+
+
+
+
+ + ☆ A Nasal Cytology Dataset for Object Detection and Deep Learning + + +
+ Nasal Cytology is a new and efficient clinical technique to diagnose rhinitis
+and allergies that is not yet widespread due to the time-consuming nature of
+cell counting; that is why AI-aided counting could be a turning point for the
+diffusion of this technique. In this article we present the first dataset of
+rhino-cytological field images: the NCD (Nasal Cytology Dataset), aimed at
+training and deploying Object Detection models to support physicians and
+biologists during clinical practice. The real distribution of the cytotypes
+populating the nasal mucosa has been replicated, sampling images from slides of
+clinical patients, and manually annotating each cell found on them. The
+corresponding object detection task presents non-trivial issues associated with
+the strong class imbalance involving the rarest cell types. This work addresses
+some of these open challenges by presenting a novel machine learning-based
+approach to aid the automated detection and classification of nasal mucosa
+cells: the DETR and YOLO models showed good performance in detecting cells and
+classifying them correctly, revealing great potential to accelerate the work of
+rhinology experts.
+
+
+
+ comment: Pre Print almost ready to be submitted +
+
+
+
+
+ + ☆ Interpreting COVID Lateral Flow Tests' Results with Foundation Models + + +
+ Lateral flow tests (LFTs) enable rapid, low-cost testing for health
+conditions including Covid, pregnancy, HIV, and malaria. Automated readers of
+LFT results can yield many benefits including empowering blind people to
+independently learn about their health and accelerating data entry for
+large-scale monitoring (e.g., for pandemics such as Covid) by using only a
+single photograph per LFT test. Accordingly, we explore the abilities of modern
+foundation vision language models (VLMs) in interpreting such tests. To enable
+this analysis, we first create a new labeled dataset with hierarchical
+segmentations of each LFT test and its nested test result window. We call this
+dataset LFT-Grounding. Next, we benchmark eight modern VLMs in zero-shot
+settings for analyzing these images. We demonstrate that current VLMs
+frequently fail to correctly identify the type of LFT test, interpret the test
+results, locate the nested result window of the LFT tests, and recognize LFT
+tests when they are partially obfuscated. To facilitate community-wide progress
+towards automated LFT reading, we publicly release our dataset at
+https://iamstuti.github.io/lft_grounding_foundation_models/.
+
+
+
+
+
+
+ + ☆ Elucidating the Design Space of Dataset Condensation + + +
+ Dataset condensation, a concept within data-centric learning, efficiently +transfers critical attributes from an original dataset to a synthetic version, +maintaining both diversity and realism. This approach significantly improves +model training efficiency and is adaptable across multiple application areas. +Previous methods in dataset condensation have faced challenges: some incur high +computational costs which limit scalability to larger datasets (e.g., MTT, +DREAM, and TESLA), while others are restricted to less optimal design spaces, +which could hinder potential improvements, especially in smaller datasets +(e.g., SRe2L, G-VBSM, and RDED). To address these limitations, we propose a +comprehensive design framework that includes specific, effective strategies +like implementing soft category-aware matching and adjusting the learning rate +schedule. These strategies are grounded in empirical evidence and theoretical +backing. Our resulting approach, Elucidate Dataset Condensation (EDC), +establishes a benchmark for both small and large-scale dataset condensation. In +our testing, EDC achieves state-of-the-art accuracy, reaching 48.6% on +ImageNet-1k with a ResNet-18 model at an IPC of 10, which corresponds to a +compression ratio of 0.78%. This performance exceeds those of SRe2L, G-VBSM, +and RDED by margins of 27.3%, 17.2%, and 6.6%, respectively. + +
+
+
+
+
+ + ☆ ArtNeRF: A Stylized Neural Field for 3D-Aware Cartoonized Face Synthesis + + +
+ Recent advances in generative visual models and neural radiance fields have
+greatly boosted 3D-aware image synthesis and stylization tasks. However,
+previous NeRF-based work is limited to single-scene stylization, and training a
+model to generate 3D-aware cartoon faces with arbitrary styles remains
+unsolved. We propose ArtNeRF, a novel face stylization framework derived from a
+3D-aware GAN to tackle this problem. In this framework, we utilize an
+expressive generator to synthesize stylized faces and a triple-branch
+discriminator module to improve the visual quality and style consistency of the
+generated faces. Specifically, a style encoder based on contrastive learning is
+leveraged to extract robust low-dimensional embeddings of style images,
+empowering the generator with the knowledge of various styles. To smooth the
+training process of cross-domain transfer learning, we propose an adaptive
+style blending module which helps inject style information and allows users to
+freely tune the level of stylization. We further introduce a neural rendering
+module to achieve efficient real-time rendering of images with higher
+resolutions. Extensive experiments demonstrate that ArtNeRF is versatile in
+generating high-quality 3D-aware cartoon faces with arbitrary styles.
+
+
+
+
+
+
+ + ☆ SVGEditBench: A Benchmark Dataset for Quantitative Assessment of LLM's + SVG Editing Capabilities CVPR2024 + + +
+ Text-to-image models have shown progress in recent years. Along with this +progress, generating vector graphics from text has also advanced. SVG is a +popular format for vector graphics, and SVG represents a scene with XML text. +Therefore, Large Language Models can directly process SVG code. Taking this +into account, we focused on editing SVG with LLMs. For quantitative evaluation +of LLMs' ability to edit SVG, we propose SVGEditBench. SVGEditBench is a +benchmark for assessing the LLMs' ability to edit SVG code. We also show the +GPT-4 and GPT-3.5 results when evaluated on the proposed benchmark. In the +experiments, GPT-4 showed superior performance to GPT-3.5 both quantitatively +and qualitatively. The dataset is available at +https://github.com/mti-lab/SVGEditBench. + +
+
+ comment: Accepted to Workshop on Graphic Design Understanding and Generation + (GDUG), a CVPR2024 workshop. Dataset: https://github.com/mti-lab/SVGEditBench +
+
+
+
+
+ + ☆ Concept Arithmetics for Circumventing Concept Inhibition in Diffusion + Models + + +
+ Motivated by ethical and legal concerns, the scientific community is actively
+developing methods to limit the misuse of Text-to-Image diffusion models for
+reproducing copyrighted, violent, explicit, or personal information in the
+generated images. Simultaneously, researchers put these newly developed safety
+measures to the test by assuming the role of an adversary to find
+vulnerabilities and backdoors in them. We use the compositional property of
+diffusion models, which allows us to leverage multiple prompts in a single
+image generation. This property allows us to combine other concepts that should
+not have been affected by the inhibition to reconstruct the vector responsible
+for target concept generation, even though the direct computation of this
+vector is no longer accessible. We provide theoretical and empirical evidence
+for why the proposed attacks are possible and discuss the implications of these
+findings for safe model deployment. We argue that it is essential to consider
+all possible approaches to image generation with diffusion models that can be
+employed by an adversary. Our work opens up the discussion about the
+implications of concept arithmetics and compositional inference for safety
+mechanisms in diffusion models.
+ Content Advisory: This paper contains discussions and model-generated content
+that may be considered offensive. Reader discretion is advised.
+ Project page: https://cs-people.bu.edu/vpetsiuk/arc
+
+
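+ The compositional property used here is typically written as
+classifier-free-guidance-style arithmetic on noise predictions: with auxiliary
+concepts $c_1, \dots, c_k$ and free weights $w_i$ (a generic formulation, not
+the paper's exact attack),
+
+  \hat{\epsilon}(x_t) = \epsilon_\theta(x_t, \varnothing)
+      + \sum_{i=1}^{k} w_i\,\big(\epsilon_\theta(x_t, c_i) - \epsilon_\theta(x_t, \varnothing)\big),
+
+ so a direction close to an inhibited target concept can be reassembled from
+concepts that the inhibition left untouched.
+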
+
+
+
+
+ + ☆ PEMMA: Parameter-Efficient Multi-Modal Adaptation for Medical Image + Segmentation + + +
+ Imaging modalities such as Computed Tomography (CT) and Positron Emission +Tomography (PET) are key in cancer detection, inspiring Deep Neural Networks +(DNN) models that merge these scans for tumor segmentation. When both CT and +PET scans are available, it is common to combine them as two channels of the +input to the segmentation model. However, this method requires both scan types +during training and inference, posing a challenge due to the limited +availability of PET scans, thereby sometimes limiting the process to CT scans +only. Hence, there is a need to develop a flexible DNN architecture that can be +trained/updated using only CT scans but can effectively utilize PET scans when +they become available. In this work, we propose a parameter-efficient +multi-modal adaptation (PEMMA) framework for lightweight upgrading of a +transformer-based segmentation model trained only on CT scans to also +incorporate PET scans. The benefits of the proposed approach are two-fold. +Firstly, we leverage the inherent modularity of the transformer architecture +and perform low-rank adaptation (LoRA) of the attention weights to achieve +parameter-efficient adaptation. Secondly, since the PEMMA framework attempts to +minimize cross modal entanglement, it is possible to subsequently update the +combined model using only one modality, without causing catastrophic forgetting +of the other modality. Our proposed method achieves comparable results with the +performance of early fusion techniques with just 8% of the trainable +parameters, especially with a remarkable +28% improvement on the average dice +score on PET scans when trained on a single modality. + +
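+ A minimal sketch of the low-rank adaptation of attention weights mentioned
+above (generic LoRA, not the full PEMMA framework); the rank, scaling, and
+which projections are wrapped are assumptions.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base projection W plus a trainable low-rank update (alpha/r)*B@A."""
+    def __init__(self, base: nn.Linear, r=8, alpha=16):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))
+        self.scale = alpha / r
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())
+
+# Wrap e.g. the query/value projections of a transformer attention block.
+q_proj = LoRALinear(nn.Linear(768, 768))
+x = torch.randn(2, 196, 768)
+print(q_proj(x).shape)                        # torch.Size([2, 196, 768])
+trainable = sum(p.numel() for p in q_proj.parameters() if p.requires_grad)
+print(trainable)                              # only the low-rank factors
+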
+
+
+
+
+ + ☆ Semantic-Rearrangement-Based Multi-Level Alignment for Domain + Generalized Segmentation + + +
+ Domain generalized semantic segmentation is an essential computer vision
+task, for which models only leverage source data to learn the capability of
+generalized semantic segmentation towards the unseen target domains. Previous
+works typically address this challenge by global style randomization or feature
+regularization. In this paper, we observe that different local semantic regions
+exhibit different visual characteristics from the source domain to the target
+domain, and we argue that methods focusing on global operations struggle to
+capture such regional discrepancies, thus failing to construct domain-invariant
+representations that are consistent from the local to the global level.
+Therefore, we propose the Semantic-Rearrangement-based Multi-Level Alignment
+(SRMA) to overcome this problem. SRMA first incorporates a Semantic
+Rearrangement Module (SRM), which conducts semantic region randomization to
+enhance the diversity of the source domain sufficiently. A Multi-Level
+Alignment module (MLA) is subsequently proposed with the help of such diversity
+to establish the global-regional-local consistent domain-invariant
+representations. By aligning features across randomized samples with
+domain-neutral knowledge at multiple levels, SRMA provides a more robust way to
+handle the source-target domain gap. Extensive experiments demonstrate the
+superiority of SRMA over the current state-of-the-art works on various
+benchmarks.
+
+
+
+
+
+
+ + ☆ PV-S3: Advancing Automatic Photovoltaic Defect Detection using + Semi-Supervised Semantic Segmentation of Electroluminescence Images + + +
+ Photovoltaic (PV) systems allow us to tap into all abundant solar energy, +however they require regular maintenance for high efficiency and to prevent +degradation. Traditional manual health check, using Electroluminescence (EL) +imaging, is expensive and logistically challenging making automated defect +detection essential. Current automation approaches require extensive manual +expert labeling, which is time-consuming, expensive, and prone to errors. We +propose PV-S3 (Photovoltaic-Semi Supervised Segmentation), a Semi-Supervised +Learning approach for semantic segmentation of defects in EL images that +reduces reliance on extensive labeling. PV-S3 is a Deep learning model trained +using a few labeled images along with numerous unlabeled images. We introduce a +novel Semi Cross-Entropy loss function to train PV-S3 which addresses the +challenges specific to automated PV defect detection, such as diverse defect +types and class imbalance. We evaluate PV-S3 on multiple datasets and +demonstrate its effectiveness and adaptability. With merely 20% labeled +samples, we achieve an absolute improvement of 9.7% in IoU, 29.9% in Precision, +12.75% in Recall, and 20.42% in F1-Score over prior state-of-the-art supervised +method (which uses 100% labeled samples) on UCF-EL dataset (largest dataset +available for semantic segmentation of EL images) showing improvement in +performance while reducing the annotation costs by 80%. + +
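+ A simplified sketch of a semi-supervised segmentation objective in the spirit
+described above: supervised cross-entropy on labeled pixels plus a
+confidence-thresholded pseudo-label term on unlabeled images. This is a generic
+stand-in, not the paper's exact Semi Cross-Entropy loss; the threshold and
+weighting are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def semi_supervised_seg_loss(logits_l, masks_l, logits_u_weak, logits_u_strong,
+                             threshold=0.9, unlabeled_weight=1.0):
+    """logits_*: (N, C, H, W); masks_l: (N, H, W) integer class labels."""
+    sup = F.cross_entropy(logits_l, masks_l)
+
+    with torch.no_grad():
+        probs = torch.softmax(logits_u_weak, dim=1)
+        conf, pseudo = probs.max(dim=1)                  # (N, H, W)
+    unsup = F.cross_entropy(logits_u_strong, pseudo, reduction="none")
+    unsup = (unsup * (conf >= threshold)).mean()         # keep confident pixels only
+
+    return sup + unlabeled_weight * unsup
+
+# Toy shapes: background + 2 defect classes, 64x64 EL image crops.
+C, H, W = 3, 64, 64
+loss = semi_supervised_seg_loss(
+    torch.randn(2, C, H, W), torch.randint(0, C, (2, H, W)),
+    torch.randn(2, C, H, W), torch.randn(2, C, H, W))
+print(loss.item())
+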
+
+
+
+
+ + ☆ A sustainable development perspective on urban-scale roof greening + priorities and benefits + + +
+ Greenspaces are tightly linked to human well-being. Yet, rapid urbanization
+has exacerbated greenspace exposure inequality and the decline in human life
+quality. Roof greening has been recognized as an effective strategy to mitigate
+these negative impacts. Understanding priorities and benefits is crucial to
+promoting green roofs. Here, using geospatial big data, we conduct an
+urban-scale assessment of roof greening at a single building level in Hong Kong
+from a sustainable development perspective. We identify that 85.3% of buildings
+reveal potential and urgent demand for roof greening. We further find green
+roofs could increase greenspace exposure by ~61% and produce hundreds of
+millions (HK$) in economic benefits annually but play a small role in urban
+heat mitigation (~0.15°C) and annual carbon emission offsets (~0.8%). Our study
+offers a comprehensive assessment of roof greening, which could provide a
+reference for sustainable development in cities worldwide, from data
+utilization to solutions and findings.
+
+
+
+
+
+
+ + ☆ A Complete System for Automated 3D Semantic-Geometric Mapping of + Corrosion in Industrial Environments + + +
+ Corrosion, a naturally occurring process leading to the deterioration of
+metallic materials, demands diligent detection for quality control and the
+preservation of metal-based objects, especially within industrial contexts.
+Traditional techniques for corrosion identification, including ultrasonic
+testing, radiographic testing, and magnetic flux leakage, necessitate the
+deployment of expensive and bulky equipment on-site for effective data
+acquisition. An unexplored alternative involves employing lightweight,
+conventional camera systems and state-of-the-art computer vision methods for
+its identification.
+ In this work, we propose a complete system for semi-automated corrosion
+identification and mapping in industrial environments. We leverage recent
+advances in LiDAR-based methods for localization and mapping, with vision-based
+semantic segmentation deep learning techniques, in order to build
+semantic-geometric maps of industrial environments. Unlike previous corrosion
+identification systems available in the literature, our designed multi-modal
+system is low-cost, portable, semi-autonomous and allows collecting large
+datasets by untrained personnel.
+ A set of experiments in an indoor laboratory environment demonstrates
+quantitatively the high accuracy of the employed LiDAR-based 3D mapping and
+localization system, with less than 0.05 m and 0.02 m average absolute and
+relative pose errors. Also, our data-driven semantic segmentation model
+achieves around 70% precision when trained with our pixel-wise manually
+annotated dataset.
+
+
+
+
+
+
+ + ☆ Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image + Synthesis + + +
+ Recently, a series of diffusion-aware distillation algorithms have emerged to +alleviate the computational overhead associated with the multi-step inference +process of Diffusion Models (DMs). Current distillation techniques often +dichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii) +ODE Trajectory Reformulation. However, these approaches suffer from severe +performance degradation or domain shifts. To address these limitations, we +propose Hyper-SD, a novel framework that synergistically amalgamates the +advantages of ODE Trajectory Preservation and Reformulation, while maintaining +near-lossless performance during step compression. Firstly, we introduce +Trajectory Segmented Consistency Distillation to progressively perform +consistent distillation within pre-defined time-step segments, which +facilitates the preservation of the original ODE trajectory from a higher-order +perspective. Secondly, we incorporate human feedback learning to boost the +performance of the model in a low-step regime and mitigate the performance loss +incurred by the distillation process. Thirdly, we integrate score distillation +to further improve the low-step generation capability of the model and offer +the first attempt to leverage a unified LoRA to support the inference process +at all steps. Extensive experiments and user studies demonstrate that Hyper-SD +achieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5. +For example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and ++0.51 in Aes Score in the 1-step inference. + +
+
+
+
+
+ + ☆ PoseAnimate: Zero-shot high fidelity pose controllable character + animation + + +
+ Image-to-video (I2V) generation aims to create a video sequence from a single
+image, which requires high temporal coherence and visual fidelity with the
+source image. However, existing approaches suffer from character appearance
+inconsistency and poor preservation of fine details. Moreover, they require a
+large amount of video data for training, which can be computationally
+demanding. To address these limitations, we propose PoseAnimate, a novel
+zero-shot I2V framework for character animation. PoseAnimate contains three key
+components: 1) Pose-Aware Control Module (PACM) incorporates diverse pose
+signals into conditional embeddings, to preserve character-independent content
+and maintain precise alignment of actions. 2) Dual Consistency Attention Module
+(DCAM) enhances temporal consistency, and retains character identity and
+intricate background details. 3) Mask-Guided Decoupling Module (MGDM) refines
+distinct feature perception, improving animation fidelity by decoupling the
+character and background. We also propose a Pose Alignment Transition Algorithm
+(PATA) to ensure smooth action transition. Extensive experiment results
+demonstrate that our approach outperforms the state-of-the-art training-based
+methods in terms of character consistency and detail fidelity. Moreover, it
+maintains a high level of temporal coherence throughout the generated
+animations.
+
+
+
+
+
+
+ + ☆ GScream: Learning 3D Geometry and Feature Consistent Gaussian Splatting + for Object Removal + + +
+ This paper tackles the intricate challenge of object removal to update the +radiance field using the 3D Gaussian Splatting. The main challenges of this +task lie in the preservation of geometric consistency and the maintenance of +texture coherence in the presence of the substantial discrete nature of +Gaussian primitives. We introduce a robust framework specifically designed to +overcome these obstacles. The key insight of our approach is the enhancement of +information exchange among visible and invisible areas, facilitating content +restoration in terms of both geometry and texture. Our methodology begins with +optimizing the positioning of Gaussian primitives to improve geometric +consistency across both removed and visible areas, guided by an online +registration process informed by monocular depth estimation. Following this, we +employ a novel feature propagation mechanism to bolster texture coherence, +leveraging a cross-attention design that bridges sampling Gaussians from both +uncertain and certain areas. This innovative approach significantly refines the +texture coherence within the final radiance field. Extensive experiments +validate that our method not only elevates the quality of novel view synthesis +for scenes undergoing object removal but also showcases notable efficiency +gains in training and rendering speeds. + +
+
+ comment: Project Page: https://w-ted.github.io/publications/gscream +
+
+
+
+
+ + ☆ FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and + High-Quality Localization + + +
+ Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies +directly without access to any known normal or abnormal samples within the +target item categories. Existing approaches typically rely on the robust +generalization capabilities of multimodal pretrained models, computing +similarities between manually crafted textual features representing "normal" or +"abnormal" semantics and image features to detect anomalies and localize +anomalous patches. However, the generic descriptions of "abnormal" often fail +to precisely match diverse types of anomalies across different object +categories. Additionally, computing feature similarities for single patches +struggles to pinpoint specific locations of anomalies with various sizes and +scales. To address these issues, we propose a novel ZSAD method called FiLo, +comprising two components: adaptively learned Fine-Grained Description (FG-Des) +and position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces +fine-grained anomaly descriptions for each category using Large Language Models +(LLMs) and employs adaptively learned textual templates to enhance the accuracy +and interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for +preliminary localization, position-enhanced text prompts, and Multi-scale +Multi-shape Cross-modal Interaction (MMCI) module, facilitates more accurate +localization of anomalies of different sizes and shapes. Experimental results +on datasets like MVTec and VisA demonstrate that FiLo significantly improves +the performance of ZSAD in both detection and localization, achieving +state-of-the-art performance with an image-level AUC of 83.9% and a pixel-level +AUC of 95.9% on the VisA dataset. + +
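+ A compact sketch of the patch-text similarity scoring that zero-shot anomaly
+detectors of this kind build on; the fine-grained description generation,
+Grounding DINO step, and MMCI module are omitted, and the embeddings below are
+random stand-ins for CLIP outputs.
+
+import torch
+import torch.nn.functional as F
+
+def anomaly_map(patch_embeds, normal_text, abnormal_text, temperature=0.07):
+    """patch_embeds: (P, d) image patch features; *_text: (K, d) prompt embeddings.
+    Returns a per-patch anomaly probability in [0, 1]."""
+    patch_embeds = F.normalize(patch_embeds, dim=-1)
+    normal_text = F.normalize(normal_text, dim=-1)
+    abnormal_text = F.normalize(abnormal_text, dim=-1)
+
+    s_norm = (patch_embeds @ normal_text.t()).max(dim=-1).values    # best normal prompt
+    s_abn = (patch_embeds @ abnormal_text.t()).max(dim=-1).values   # best abnormal prompt
+    logits = torch.stack([s_norm, s_abn], dim=-1) / temperature
+    return torch.softmax(logits, dim=-1)[..., 1]                    # P(abnormal) per patch
+
+# Toy example: 14x14 patches, 512-d embeddings, 5 prompts per class.
+scores = anomaly_map(torch.randn(196, 512), torch.randn(5, 512), torch.randn(5, 512))
+print(scores.shape, scores.max().item())   # an image-level score could be scores.max()
+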
+
+
+
+
+ + ☆ MathNet: A Data-Centric Approach for Printed Mathematical Expression + Recognition + + +
+ Printed mathematical expression recognition (MER) models are usually trained +and tested using LaTeX-generated mathematical expressions (MEs) as input and +the LaTeX source code as ground truth. As the same ME can be generated by +various different LaTeX source codes, this leads to unwanted variations in the +ground truth data that bias test performance results and hinder efficient +learning. In addition, the use of only one font to generate the MEs heavily +limits the generalization of the reported results to realistic scenarios. We +propose a data-centric approach to overcome this problem, and present +convincing experimental results: Our main contribution is an enhanced LaTeX +normalization to map any LaTeX ME to a canonical form. Based on this process, +we developed an improved version of the benchmark dataset im2latex-100k, +featuring 30 fonts instead of one. Second, we introduce the real-world dataset +realFormula, with MEs extracted from papers. Third, we developed a MER model, +MathNet, based on a convolutional vision transformer, with superior results on +all four test sets (im2latex-100k, im2latexv2, realFormula, and InftyMDB-1), +outperforming the previous state of the art by up to 88.3%. + +
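+ The normalization step can be pictured with a toy example: superficially
+different LaTeX source strings are mapped to one canonical form. The rules
+below are illustrative only and far simpler than the enhanced normalization
+proposed in the paper.
+
+import re
+
+def normalize_latex(src: str) -> str:
+    """Toy canonicalization: unify synonymous commands, braces, and whitespace."""
+    rules = [
+        (r"\\dfrac|\\tfrac", r"\\frac"),       # size variants -> \frac
+        (r"\\left\s*([([|])", r"\1"),          # drop \left / \right sizing
+        (r"\\right\s*([)\]|])", r"\1"),
+        (r"\{\s*([a-zA-Z0-9])\s*\}", r"\1"),   # { x } -> x for single tokens
+        (r"\s+", ""),                          # drop whitespace (fine for this toy grammar)
+    ]
+    for pattern, repl in rules:
+        src = re.sub(pattern, repl, src)
+    return src
+
+# Two different source codes for the same expression map to the same form.
+a = r"\dfrac{ 1 }{ x } + \left( y \right)"
+b = r"\frac{1}{x}+(y)"
+print(normalize_latex(a))   # \frac1x+(y)
+print(normalize_latex(b))   # \frac1x+(y)
+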
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ LMFNet: An Efficient Multimodal Fusion Approach for Semantic + Segmentation in High-Resolution Remote Sensing + + +
+ Despite the rapid evolution of semantic segmentation for land cover
+classification in high-resolution remote sensing imagery, integrating multiple
+data modalities such as Digital Surface Model (DSM), RGB, and Near-infrared
+(NIR) remains a challenge. Current methods often process only two types of
+data, missing out on the rich information that additional modalities can
+provide. Addressing this gap, we propose a novel Lightweight Multimodal data
+Fusion Network (LMFNet) to accomplish the tasks of fusion and semantic
+segmentation of multimodal remote sensing images. LMFNet uniquely accommodates
+various data types simultaneously, including RGB, NirRG, and DSM, through a
+weight-sharing, multi-branch vision transformer that minimizes parameter count
+while ensuring robust feature extraction. Our proposed multimodal fusion module
+integrates a Multimodal Feature Fusion Reconstruction Layer and Multimodal
+Feature Self-Attention Fusion Layer, which can reconstruct and fuse multimodal
+features. Extensive testing on public datasets such as US3D, ISPRS Potsdam, and
+ISPRS Vaihingen demonstrates the effectiveness of LMFNet. Specifically, it
+achieves a mean Intersection over Union (mIoU) of 85.09% on the US3D dataset,
+marking a significant improvement over existing methods. Compared to unimodal
+approaches, LMFNet shows a 10% enhancement in mIoU with only a 0.5M increase in
+parameter count. Furthermore, against bimodal methods, our approach with
+trilateral inputs enhances mIoU by 0.46 percentage points.
+
+
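+ A skeletal sketch of a weight-sharing multi-branch design with attention-based
+fusion in the spirit described above; the encoder, token shapes, and fusion
+head are simplified placeholders rather than LMFNet's actual layers.
+
+import torch
+import torch.nn as nn
+
+class SharedBranchFusion(nn.Module):
+    """One shared encoder applied to each modality, then self-attention across
+    the stacked per-modality tokens followed by a light reconstruction layer."""
+    def __init__(self, in_dim=64, embed_dim=128, num_modalities=3):
+        super().__init__()
+        self.encoder = nn.Sequential(nn.Linear(in_dim, embed_dim), nn.GELU())
+        self.fuse_attn = nn.MultiheadAttention(embed_dim, num_heads=4, batch_first=True)
+        self.reconstruct = nn.Linear(num_modalities * embed_dim, embed_dim)
+
+    def forward(self, modalities):
+        # modalities: list of (N, L, in_dim) token tensors, e.g. RGB / NirRG / DSM.
+        feats = [self.encoder(m) for m in modalities]       # weight sharing
+        stacked = torch.stack(feats, dim=2)                 # (N, L, M, D)
+        N, L, M, D = stacked.shape
+        tokens = stacked.reshape(N * L, M, D)               # attend across modalities
+        fused, _ = self.fuse_attn(tokens, tokens, tokens)
+        fused = fused.reshape(N, L, M * D)
+        return self.reconstruct(fused)                      # (N, L, D) fused tokens
+
+rgb, nirrg, dsm = (torch.randn(2, 196, 64) for _ in range(3))
+print(SharedBranchFusion()([rgb, nirrg, dsm]).shape)        # torch.Size([2, 196, 128])
+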
+
+
+
+
+ + ☆ MLP: Motion Label Prior for Temporal Sentence Localization in Untrimmed + 3D Human Motions + + +
+ In this paper, we address the unexplored question of temporal sentence localization in human motions (TSLM), aiming to locate a target moment from a 3D human motion that semantically corresponds to a text query. Considering that 3D human motions are captured using specialized motion capture devices, motions with only a few joints lack complex scene information like objects and lighting. Due to this characteristic, motion data has low contextual richness and semantic ambiguity between frames, which limits the accuracy of predictions made by current video localization frameworks extended to TSLM to only a rough level. To refine this, we devise two novel label-prior-assisted training schemes: one embeds prior knowledge of foreground and background to highlight the localization chances of target moments, and the other forces the originally rough predictions to overlap with the more accurate predictions obtained from the flipped start/end prior label sequences during recovery training. We show that injecting label-prior knowledge into the model is crucial for improving performance at high IoU. In our constructed TSLM benchmark, our model, termed MLP, achieves a recall of 44.13 at IoU@0.7 on the BABEL dataset and 71.17 on HumanML3D (Restore), outperforming prior works. Finally, we showcase the potential of our approach in corpus-level moment retrieval. Our source code is openly accessible at https://github.com/eanson023/mlp. +
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Data-independent Module-aware Pruning for Hierarchical Vision + Transformers ICLR 2024 + + +
+ Hierarchical vision transformers (ViTs) have two advantages over conventional +ViTs. First, hierarchical ViTs achieve linear computational complexity with +respect to image size by local self-attention. Second, hierarchical ViTs create +hierarchical feature maps by merging image patches in deeper layers for dense +prediction. However, existing pruning methods ignore the unique properties of +hierarchical ViTs and use the magnitude value as the weight importance. This +approach leads to two main drawbacks. First, the "local" attention weights are +compared at a "global" level, which may cause some "locally" important weights +to be pruned due to their relatively small magnitude "globally". The second +issue with magnitude pruning is that it fails to consider the distinct weight +distributions of the network, which are essential for extracting coarse to +fine-grained features at various hierarchical levels. + To solve the aforementioned issues, we have developed a Data-independent +Module-Aware Pruning method (DIMAP) to compress hierarchical ViTs. To ensure +that "local" attention weights at different hierarchical levels are compared +fairly in terms of their contribution, we treat them as a module and examine +their contribution by analyzing their information distortion. Furthermore, we +introduce a novel weight metric that is solely based on weights and does not +require input images, thereby eliminating the dependence on the patch merging +process. Our method validates its usefulness and strengths on Swin Transformers +of different sizes on ImageNet-1k classification. Notably, the top-5 accuracy +drop is only 0.07% when we remove 52.5% FLOPs and 52.7% parameters of Swin-B. +When we reduce 33.2% FLOPs and 33.2% parameters of Swin-S, we can even achieve +a 0.8% higher relative top-5 accuracy than the original model. Code is +available at: https://github.com/he-y/Data-independent-Module-Aware-Pruning + +
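The drawback of comparing "local" weights at a "global" level can be made concrete with a small experiment: under a single global magnitude threshold, a stage whose weights are small overall is pruned almost entirely, while a module-wise threshold keeps the target sparsity inside every module. This is only a sketch of the contrast the abstract draws; DIMAP's actual criterion is based on information distortion, not raw magnitude.

```python
# Global vs. module-wise magnitude pruning masks (illustrative comparison only).
import torch

def global_prune_masks(modules, sparsity=0.5):
    all_w = torch.cat([w.abs().flatten() for w in modules.values()])
    thr = all_w.quantile(sparsity)                       # one threshold for everything
    return {name: w.abs() > thr for name, w in modules.items()}

def modulewise_prune_masks(modules, sparsity=0.5):
    return {name: w.abs() > w.abs().quantile(sparsity) for name, w in modules.items()}

stages = {"stage1.attn": 0.01 * torch.randn(64, 64),     # small-magnitude early stage
          "stage4.attn": 1.00 * torch.randn(64, 64)}     # large-magnitude deep stage
kept_global = {k: v.float().mean().item() for k, v in global_prune_masks(stages).items()}
kept_module = {k: v.float().mean().item() for k, v in modulewise_prune_masks(stages).items()}
print(kept_global)   # stage1 is almost fully pruned
print(kept_module)   # each stage keeps roughly 50% of its weights
```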
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Beyond Alignment: Blind Video Face Restoration via Parsing-Guided + Temporal-Coherent Transformer + + +
+ Multiple complex degradations are coupled in low-quality video faces in the +real world. Therefore, blind video face restoration is a highly challenging +ill-posed problem, requiring not only hallucinating high-fidelity details but +also enhancing temporal coherence across diverse pose variations. Restoring +each frame independently in a naive manner inevitably introduces temporal +incoherence and artifacts from pose changes and keypoint localization errors. +To address this, we propose the first blind video face restoration approach +with a novel parsing-guided temporal-coherent transformer (PGTFormer) without +pre-alignment. PGTFormer leverages semantic parsing guidance to select optimal +face priors for generating temporally coherent artifact-free results. +Specifically, we pre-train a temporal-spatial vector quantized auto-encoder on +high-quality video face datasets to extract expressive context-rich priors. +Then, the temporal parse-guided codebook predictor (TPCP) restores faces in +different poses based on face parsing context cues without performing face +pre-alignment. This strategy reduces artifacts and mitigates jitter caused by +cumulative errors from face pre-alignment. Finally, the temporal fidelity +regulator (TFR) enhances fidelity through temporal feature interaction and +improves video temporal consistency. Extensive experiments on face videos show +that our method outperforms previous face restoration baselines. The code will +be released on +\href{https://github.com/kepengxu/PGTFormer}{https://github.com/kepengxu/PGTFormer}. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Attack on Scene Flow using Point Clouds + + +
+ Deep neural networks have made significant advancements in accurately estimating scene flow using point clouds, which is vital for many applications like video analysis, action recognition, and navigation. The robustness of these techniques, however, remains a concern, particularly in the face of adversarial attacks that have been proven to deceive state-of-the-art deep neural networks in many domains. Surprisingly, the robustness of scene flow networks against such attacks has not been thoroughly investigated. To bridge this gap, the proposed approach introduces adversarial white-box attacks specifically tailored for scene flow networks. Experimental results show that the generated adversarial examples obtain up to 33.7 relative degradation in average end-point error on the KITTI and FlyingThings3D datasets. The study also reveals the significant impact that attacks targeting point clouds in only one dimension or color channel have on the average end-point error. Analyzing the success and failure of these attacks on the scene flow networks and their 2D optical flow network variants shows a higher vulnerability for the optical flow networks. +
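As a rough illustration of the style of white-box attack described above, the sketch below runs projected gradient ascent on the end-point error of a differentiable scene-flow model; `model`, the point clouds `p1`/`p2`, and `gt_flow` are placeholders, and the paper's exact attack formulation may differ.

```python
# Generic PGD-style attack on a scene-flow model (illustrative, not the paper's attack).
import torch

def pgd_attack_scene_flow(model, p1, p2, gt_flow, eps=0.05, alpha=0.01, steps=10):
    """p1, p2: [N, 3] point clouds; gt_flow: [N, 3]; model(p1, p2) -> predicted flow."""
    delta = torch.zeros_like(p1, requires_grad=True)
    for _ in range(steps):
        pred = model(p1 + delta, p2)
        epe = (pred - gt_flow).norm(dim=-1).mean()    # average end-point error
        epe.backward()
        with torch.no_grad():
            delta += alpha * delta.grad.sign()        # ascend on EPE
            delta.clamp_(-eps, eps)                   # stay inside the L_inf ball
            delta.grad.zero_()
    return (p1 + delta).detach()                      # adversarial point cloud
```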
+
+
+
+
+ + ☆ Video sentence grounding with temporally global textual knowledge + + +
+ Temporal sentence grounding involves the retrieval of a video moment with a +natural language query. Many existing works directly incorporate the given +video and temporally localized query for temporal grounding, overlooking the +inherent domain gap between different modalities. In this paper, we utilize +pseudo-query features containing extensive temporally global textual knowledge +sourced from the same video-query pair, to enhance the bridging of domain gaps +and attain a heightened level of similarity between multi-modal features. +Specifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve +an improved alignment of visual and comprehensive pseudo-query features within +the feature space through contrastive learning. Subsequently, we utilize +learnable prompts to encapsulate the knowledge of pseudo-queries, propagating +them into the textual encoder and multi-modal fusion module, further enhancing +the feature alignment between visual and language for better temporal +grounding. Extensive experiments conducted on the Charades-STA and +ActivityNet-Captions datasets demonstrate the effectiveness of our method. + +
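The alignment between visual features and pseudo-query features via contrastive learning can be sketched with a standard symmetric InfoNCE objective, shown below. This is illustrative of the kind of loss the abstract describes, not the PIN implementation; the feature extractors are assumed to be defined elsewhere, and batch index i pairs visual[i] with pseudo_query[i].

```python
# Symmetric InfoNCE alignment between visual and pseudo-query features (sketch).
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(visual, pseudo_query, tau=0.07):
    v = F.normalize(visual, dim=-1)            # [B, D]
    q = F.normalize(pseudo_query, dim=-1)      # [B, D]
    logits = v @ q.T / tau                     # [B, B]; the diagonal holds positives
    targets = torch.arange(v.size(0), device=v.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))

loss = contrastive_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))
```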
+
+
+
+
+ + ☆ Turb-Seg-Res: A Segment-then-Restore Pipeline for Dynamic Videos with + Atmospheric Turbulence CVPR 2024 + + +
+ Tackling image degradation due to atmospheric turbulence, particularly in dynamic environments, remains a challenge for long-range imaging systems. Existing techniques have been primarily designed for static scenes or scenes with small motion. This paper presents the first segment-then-restore pipeline for restoring videos of dynamic scenes in turbulent environments. We leverage mean optical flow with an unsupervised motion segmentation method to separate dynamic and static scene components prior to restoration. After camera shake compensation and segmentation, we introduce foreground/background enhancement leveraging the statistics of turbulence strength and a transformer model trained on a novel noise-based procedural turbulence generator for fast dataset augmentation. Benchmarked against existing restoration methods, our approach restores most of the geometric distortion and enhances sharpness in videos. We make our code, simulator, and data publicly available to advance the field of video restoration from turbulence: riponcs.github.io/TurbSegRes +
+
+ comment: CVPR 2024 Paper +
+
+
+
+
+ + ☆ Lost in Space: Probing Fine-grained Spatial Understanding in Vision and + Language Resamplers NAACL 2024 + + +
+ An effective method for combining frozen large language models (LLMs) and visual encoders involves a resampler module that creates a `visual prompt' which is provided to the LLM, along with the textual prompt. While this approach has enabled impressive performance across many coarse-grained tasks like image captioning and visual question answering, more fine-grained tasks that require spatial understanding have not been thoroughly examined. In this paper, we use \textit{diagnostic classifiers} to measure the extent to which the visual prompt produced by the resampler encodes spatial information. Our results show that this information is largely absent from the resampler output when it is kept frozen during training of the classifiers. However, when the resampler and classifier are trained jointly, we observe a significant performance boost. This shows that the compression achieved by the resamplers can in principle encode the requisite spatial information, but that more object-aware objectives are needed at the pretraining stage to facilitate this capability. +
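A diagnostic classifier in this sense is simply a small probe trained on frozen representations. The sketch below trains a linear probe on pooled resampler outputs to predict a binary spatial-relation label; all tensors, shapes, and the pooling choice are illustrative placeholders rather than the paper's setup.

```python
# Linear probe on frozen visual-prompt features (illustrative diagnostic classifier).
import torch
import torch.nn as nn

visual_prompts = torch.randn(1024, 64, 768)    # [examples, resampler tokens, dim], frozen
labels = torch.randint(0, 2, (1024,))          # e.g., left-of vs. right-of

probe = nn.Linear(768, 2)                      # only the probe's parameters are trained
opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
for _ in range(100):
    feats = visual_prompts.mean(dim=1)         # pool tokens; the prompt itself stays frozen
    loss = nn.functional.cross_entropy(probe(feats), labels)
    opt.zero_grad()
    loss.backward()
    opt.step()
```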
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ☆ MARVEL: Multidimensional Abstraction and Reasoning through Visual + Evaluation and Learning + + +
+ While multi-modal large language models (MLLMs) have shown significant +progress on many popular visual reasoning benchmarks, whether they possess +abstract visual reasoning abilities remains an open question. Similar to the +Sudoku puzzles, abstract visual reasoning (AVR) problems require finding +high-level patterns (e.g., repetition constraints) that control the input +shapes (e.g., digits) in a specific task configuration (e.g., matrix). However, +existing AVR benchmarks only considered a limited set of patterns (addition, +conjunction), input shapes (rectangle, square), and task configurations (3 by 3 +matrices). To evaluate MLLMs' reasoning abilities comprehensively, we introduce +MARVEL, a multidimensional AVR benchmark with 770 puzzles composed of six core +knowledge patterns, geometric and abstract shapes, and five different task +configurations. To inspect whether the model accuracy is grounded in perception +and reasoning, MARVEL complements the general AVR question with perception +questions in a hierarchical evaluation framework. We conduct comprehensive +experiments on MARVEL with nine representative MLLMs in zero-shot and few-shot +settings. Our experiments reveal that all models show near-random performance +on the AVR question, with significant performance gaps (40%) compared to humans +across all patterns and task configurations. Further analysis of perception +questions reveals that MLLMs struggle to comprehend the visual features +(near-random performance) and even count the panels in the puzzle ( <45%), +hindering their ability for abstract reasoning. We release our entire code and +dataset. + +
+
+
+
+
+ + ☆ Rethink Arbitrary Style Transfer with Transformer and Contrastive + Learning + + +
+ Arbitrary style transfer has attracted widespread attention in research and boasts numerous practical applications. The existing methods, which either employ cross-attention to incorporate deep style attributes into content attributes or use adaptive normalization to adjust content features, fail to generate high-quality stylized images. In this paper, we introduce an innovative technique to improve the quality of stylized images. Firstly, we propose Style Consistency Instance Normalization (SCIN), a method to refine the alignment between content and style features. In addition, we have developed an Instance-based Contrastive Learning (ICL) approach designed to understand the relationships among various styles, thereby enhancing the quality of the resulting stylized images. Recognizing that VGG networks are more adept at extracting classification features and less well suited to capturing style features, we have also introduced the Perception Encoder (PE) to capture style features. Extensive experiments demonstrate that our proposed method generates high-quality stylized images and effectively prevents artifacts compared with the existing state-of-the-art methods. +
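For context, the sketch below shows plain adaptive instance normalization (AdaIN), the classic member of the "adaptive normalization" family the abstract mentions; SCIN is presented as a refinement of this kind of content-style alignment, and its exact formulation is not reproduced here.

```python
# Classic AdaIN: re-normalize content features with style statistics (background sketch).
import torch

def adain(content, style, eps=1e-5):
    """content, style: [B, C, H, W] feature maps."""
    c_mu = content.mean((2, 3), keepdim=True)
    c_std = content.std((2, 3), keepdim=True) + eps
    s_mu = style.mean((2, 3), keepdim=True)
    s_std = style.std((2, 3), keepdim=True) + eps
    return s_std * (content - c_mu) / c_std + s_mu

stylized = adain(torch.randn(1, 512, 32, 32), torch.randn(1, 512, 32, 32))
```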
+
+ comment: Accepted by CVIU +
+
+
+
+
+ + ☆ LTOS: Layout-controllable Text-Object Synthesis via Adaptive + Cross-attention Fusions + + +
+ Controllable text-to-image generation synthesizes visual text and objects in images under certain conditions, and is frequently applied to emoji and poster generation. Visual text rendering and layout-to-image generation tasks have been popular in controllable text-to-image generation. However, each of these tasks typically focuses on single-modality generation or rendering, leaving yet-to-be-bridged gaps between the approaches correspondingly designed for each task. In this paper, we combine the text rendering and layout-to-image generation tasks into a single task: the layout-controllable text-object synthesis (LTOS) task, aiming at synthesizing images with objects and visual text based on a predefined object layout and text contents. As compliant datasets are not readily available for our LTOS task, we construct a layout-aware text-object synthesis dataset containing elaborate, well-aligned labels of visual text and object information. Based on the dataset, we propose a layout-controllable text-object adaptive fusion (TOF) framework, which generates images with clear, legible visual text and plausible objects. We construct a visual-text rendering module to synthesize text and employ an object-layout control module to generate objects, while integrating the two modules to harmoniously generate and integrate text content and objects in images. To improve image-text integration, we propose a self-adaptive cross-attention fusion module that helps image generation attend more to important text information. Within this fusion module, we use a self-adaptive learnable factor to flexibly control the influence of cross-attention outputs on image generation. Experimental results show that our method outperforms the state of the art in LTOS, text rendering, and layout-to-image tasks, enabling harmonious visual text rendering and object generation. +
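The "self-adaptive learnable factor" on cross-attention outputs can be sketched as a gated cross-attention block, as below; this is an illustrative module with assumed dimensions, not the TOF framework's implementation.

```python
# Cross-attention from image tokens to text tokens with a learned fusion gate (sketch).
import torch
import torch.nn as nn

class GatedTextCrossAttention(nn.Module):
    def __init__(self, dim=320, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(1))           # learned influence of text context

    def forward(self, image_tokens, text_tokens):
        ctx, _ = self.attn(image_tokens, text_tokens, text_tokens)
        return image_tokens + torch.tanh(self.gate) * ctx  # adaptively scaled fusion

fused = GatedTextCrossAttention()(torch.randn(2, 64, 320), torch.randn(2, 77, 320))
```

Initializing the gate at zero lets the image branch start unaffected and learn how much text context to admit.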
+
+
+
+
+ + ☆ I2CANSAY:Inter-Class Analogical Augmentation and Intra-Class + Significance Analysis for Non-Exemplar Online Task-Free Continual Learning + + +
+ Online task-free continual learning (OTFCL) is a more challenging variant of continual learning that emphasizes the gradual shift of task boundaries and learns in an online mode. Existing methods rely on a memory buffer composed of old samples to prevent forgetting. However, the use of memory buffers not only raises privacy concerns but also hinders the efficient learning of new samples. To address this problem, we propose a novel framework called I2CANSAY that gets rid of the dependence on memory buffers and efficiently learns the knowledge of new data from one-shot samples. Concretely, our framework comprises two main modules. Firstly, the Inter-Class Analogical Augmentation (ICAN) module generates diverse pseudo-features for old classes based on the inter-class analogy of feature distributions for different new classes, serving as a substitute for the memory buffer. Secondly, the Intra-Class Significance Analysis (ISAY) module analyzes the significance of attributes for each class via its distribution standard deviation, and generates the importance vector as a correction bias for the linear classifier, thereby enhancing the capability of learning from new samples. We run our experiments on four popular image classification datasets: CoRe50, CIFAR-10, CIFAR-100, and CUB-200; our approach outperforms the prior state of the art by a large margin. +
+
+
+
+
+ + ☆ Exploring AIGC Video Quality: A Focus on Visual Harmony, Video-Text + Consistency and Domain Distribution Gap CVPR2024 + + +
+ The recent advancements in Text-to-Video Artificial Intelligence Generated Content (AIGC) have been remarkable. Compared with traditional videos, the assessment of AIGC videos encounters various challenges: visual inconsistencies that defy common sense, discrepancies between content and the textual prompt, and the distribution gap between various generative models, among others. Targeting these challenges, in this work we categorize the assessment of AIGC video quality into three dimensions: visual harmony, video-text consistency, and domain distribution gap. For each dimension, we design specific modules to provide a comprehensive quality assessment of AIGC videos. Furthermore, our research identifies significant variations in visual quality, fluidity, and style among videos generated by different text-to-video models. Predicting the source generative model can make the AIGC video features more discriminative, which enhances the quality assessment performance. The proposed method won third place in the NTIRE 2024 Quality Assessment for AI-Generated Content - Track 2 Video, demonstrating its effectiveness. +
+
+ comment: 9 pages, 3 figures, 3 tables. Accepted by CVPR2024 Workshop (3rd + place of NTIRE2024 Quality Assessment for AI-Generated Content - Track 2 + Video) +
+
+
+
+
+ + ☆ Socratic Planner: Inquiry-Based Zero-Shot Planning for Embodied + Instruction Following + + +
+ Embodied Instruction Following (EIF) is the task of executing natural language instructions by navigating and interacting with objects in 3D environments. One of the primary challenges in EIF is compositional task planning, which is often addressed with supervised or in-context learning with labeled data. To this end, we introduce the Socratic Planner, the first zero-shot planning method that infers without the need for any training data. The Socratic Planner first decomposes the instructions into substructural information of the task through self-questioning and answering, translating it into a high-level plan, i.e., a sequence of subgoals. Subgoals are executed sequentially, with our visually grounded re-planning mechanism adjusting plans dynamically through dense visual feedback. We also introduce an evaluation metric for high-level plans, RelaxedHLP, for a more comprehensive evaluation. Experiments demonstrate the effectiveness of the Socratic Planner, achieving competitive performance on both zero-shot and few-shot task planning in the ALFRED benchmark, particularly excelling in tasks requiring higher-dimensional inference. Additionally, precise adjustments to the plan were achieved by incorporating environmental visual information. +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +comparable results with GAN due to better ability on complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+
+
+
+ + ☆ Masked Latent Transformer with the Random Masking Ratio to Advance the + Diagnosis of Dental Fluorosis + + +
+ Dental fluorosis is a chronic disease caused by long-term overconsumption of +fluoride, which leads to changes in the appearance of tooth enamel. It is an +important basis for early non-invasive diagnosis of endemic fluorosis. However, +even dental professionals may not be able to accurately distinguish dental +fluorosis and its severity based on tooth images. Currently, there is still a +gap in research on applying deep learning to diagnosing dental fluorosis. +Therefore, we construct the first open-source dental fluorosis image dataset +(DFID), laying the foundation for deep learning research in this field. To +advance the diagnosis of dental fluorosis, we propose a pioneering deep +learning model called masked latent transformer with the random masking ratio +(MLTrMR). MLTrMR introduces a mask latent modeling scheme based on Vision +Transformer to enhance contextual learning of dental fluorosis lesion +characteristics. Consisting of a latent embedder, encoder, and decoder, MLTrMR +employs the latent embedder to extract latent tokens from the original image, +whereas the encoder and decoder comprising the latent transformer (LT) block +are used to process unmasked tokens and predict masked tokens, respectively. To +mitigate the lack of inductive bias in Vision Transformer, which may result in +performance degradation, the LT block introduces latent tokens to enhance the +learning capacity of latent lesion features. Furthermore, we design an +auxiliary loss function to constrain the parameter update direction of the +model. MLTrMR achieves 80.19% accuracy, 75.79% F1, and 81.28% quadratic +weighted kappa on DFID, making it state-of-the-art (SOTA). + +
+
+
+
+
+ + ☆ Cell Phone Image-Based Persian Rice Detection and Classification Using + Deep Learning Techniques + + +
+ This study introduces an innovative approach to classifying various types of +Persian rice using image-based deep learning techniques, highlighting the +practical application of everyday technology in food categorization. +Recognizing the diversity of Persian rice and its culinary significance, we +leveraged the capabilities of convolutional neural networks (CNNs), +specifically by fine-tuning a ResNet model for accurate identification of +different rice varieties and employing a U-Net architecture for precise +segmentation of rice grains in bulk images. This dual-methodology framework +allows for both individual grain classification and comprehensive analysis of +bulk rice samples, addressing two crucial aspects of rice quality assessment. +Utilizing images captured with consumer-grade cell phones reflects a realistic +scenario in which individuals can leverage this technology for assistance with +grocery shopping and meal preparation. The dataset, comprising various rice +types photographed under natural conditions without professional lighting or +equipment, presents a challenging yet practical classification problem. Our +findings demonstrate the feasibility of using non-professional images for food +classification and the potential of deep learning models, like ResNet and +U-Net, to adapt to the nuances of everyday objects and textures. This study +contributes to the field by providing insights into the applicability of +image-based deep learning in daily life, specifically for enhancing consumer +experiences and knowledge in food selection. Furthermore, it opens avenues for +extending this approach to other food categories and practical applications, +emphasizing the role of accessible technology in bridging the gap between +sophisticated computational methods and everyday tasks. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Pointsoup: High-Performance and Extremely Low-Decoding-Latency Learned + Geometry Codec for Large-Scale Point Cloud Scenes + + +
+ Despite considerable progress being achieved in point cloud geometry compression, there still remains a challenge in effectively compressing large-scale scenes with sparse surfaces. Another key challenge lies in reducing decoding latency, a crucial requirement in real-world applications. In this paper, we propose Pointsoup, an efficient learning-based geometry codec that attains high performance and extremely low decoding latency simultaneously. Inspired by the conventional Trisoup codec, a point model-based strategy is devised to characterize local surfaces. Specifically, skin features are embedded from local windows via an attention-based encoder, and dilated windows are introduced as cross-scale priors to infer the distribution of quantized features in parallel. During decoding, features undergo fast refinement, followed by a folding-based point generator that reconstructs point coordinates at fairly high speed. Experiments show that Pointsoup achieves state-of-the-art performance on multiple benchmarks with significantly lower decoding complexity, i.e., up to 90$\sim$160$\times$ faster than the G-PCCv23 Trisoup decoder on a comparatively low-end platform (e.g., one RTX 2080Ti). Furthermore, it offers variable-rate control with a single neural model (2.9MB), which is attractive for industrial practitioners. +
+
+
+
+
+ + ☆ Generalizable Novel-View Synthesis using a Stereo Camera CVPR 2024 + + +
+ In this paper, we propose the first generalizable view synthesis approach +that specifically targets multi-view stereo-camera images. Since recent stereo +matching has demonstrated accurate geometry prediction, we introduce stereo +matching into novel-view synthesis for high-quality geometry reconstruction. To +this end, this paper proposes a novel framework, dubbed StereoNeRF, which +integrates stereo matching into a NeRF-based generalizable view synthesis +approach. StereoNeRF is equipped with three key components to effectively +exploit stereo matching in novel-view synthesis: a stereo feature extractor, a +depth-guided plane-sweeping, and a stereo depth loss. Moreover, we propose the +StereoNVS dataset, the first multi-view dataset of stereo-camera images, +encompassing a wide variety of both real and synthetic scenes. Our experimental +results demonstrate that StereoNeRF surpasses previous approaches in +generalizable view synthesis. + +
+
+ comment: Accepted to CVPR 2024. Project page URL: + https://jinwonjoon.github.io/stereonerf/ +
+
+
+
+
+ + ☆ Bracketing Image Restoration and Enhancement with High-Low Frequency + Decomposition CVPR 2024 + + +
+ In real-world scenarios, due to a series of image degradations, obtaining +high-quality, clear content photos is challenging. While significant progress +has been made in synthesizing high-quality images, previous methods for image +restoration and enhancement often overlooked the characteristics of different +degradations. They applied the same structure to address various types of +degradation, resulting in less-than-ideal restoration outcomes. Inspired by the +notion that high/low frequency information is applicable to different +degradations, we introduce HLNet, a Bracketing Image Restoration and +Enhancement method based on high-low frequency decomposition. Specifically, we +employ two modules for feature extraction: shared weight modules and non-shared +weight modules. In the shared weight modules, we use SCConv to extract common +features from different degradations. In the non-shared weight modules, we +introduce the High-Low Frequency Decomposition Block (HLFDB), which employs +different methods to handle high-low frequency information, enabling the model +to address different degradations more effectively. Compared to other networks, +our method takes into account the characteristics of different degradations, +thus achieving higher-quality image restoration. + +
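A minimal sketch of the underlying high/low-frequency split: a Gaussian low-pass branch plus the residual high-frequency detail, which separate sub-networks can then process. The HLFDB itself is not reproduced here, and the kernel size and sigma are arbitrary illustrative choices.

```python
# Split features into low-frequency (blurred) and high-frequency (residual) parts.
import torch
import torch.nn.functional as F

def high_low_decompose(x, kernel_size=5, sigma=1.0):
    """x: [B, C, H, W] -> (low, high) with x == low + high."""
    coords = torch.arange(kernel_size, dtype=torch.float32) - kernel_size // 2
    g = torch.exp(-coords ** 2 / (2 * sigma ** 2))
    g = g / g.sum()
    kernel = (g[:, None] * g[None, :]).expand(x.size(1), 1, kernel_size, kernel_size).contiguous()
    low = F.conv2d(x, kernel.to(x), padding=kernel_size // 2, groups=x.size(1))  # depthwise blur
    return low, x - low

low, high = high_low_decompose(torch.randn(1, 3, 64, 64))
```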
+
+ comment: This paper is accepted by CVPR 2024 Workshop +
+
+
+
+
+ + ☆ Motion-aware Latent Diffusion Models for Video Frame Interpolation + + +
+ With the advancement of AIGC, video frame interpolation (VFI) has become a +crucial component in existing video generation frameworks, attracting +widespread research interest. For the VFI task, the motion estimation between +neighboring frames plays a crucial role in avoiding motion ambiguity. However, +existing VFI methods always struggle to accurately predict the motion +information between consecutive frames, and this imprecise estimation leads to +blurred and visually incoherent interpolated frames. In this paper, we propose +a novel diffusion framework, motion-aware latent diffusion models (MADiff), +which is specifically designed for the VFI task. By incorporating motion priors +between the conditional neighboring frames with the target interpolated frame +predicted throughout the diffusion sampling procedure, MADiff progressively +refines the intermediate outcomes, culminating in generating both visually +smooth and realistic results. Extensive experiments conducted on benchmark +datasets demonstrate that our method achieves state-of-the-art performance +significantly outperforming existing approaches, especially under challenging +scenarios involving dynamic textures with complex motion. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.09508 by + other authors +
+
+
+
+
+ + ☆ Listen Then See: Video Alignment with Speaker Attention + + +
+ Video-based Question Answering (Video QA) is a challenging task and becomes +even more intricate when addressing Socially Intelligent Question Answering +(SIQA). SIQA requires context understanding, temporal reasoning, and the +integration of multimodal information, but in addition, it requires processing +nuanced human behavior. Furthermore, the complexities involved are exacerbated +by the dominance of the primary modality (text) over the others. Thus, there is +a need to help the task's secondary modalities to work in tandem with the +primary modality. In this work, we introduce a cross-modal alignment and +subsequent representation fusion approach that achieves state-of-the-art +results (82.06\% accuracy) on the Social IQ 2.0 dataset for SIQA. Our approach +exhibits an improved ability to leverage the video modality by using the audio +modality as a bridge with the language modality. This leads to enhanced +performance by reducing the prevalent issue of language overfitting and +resultant video modality bypassing encountered by current existing techniques. +Our code and models are publicly available at +https://github.com/sts-vlcc/sts-vlcc + +
+
+
+
+
+ + ☆ Graph4GUI: Graph Neural Networks for Representing Graphical User + Interfaces + + +
+ Present-day graphical user interfaces (GUIs) exhibit diverse arrangements of +text, graphics, and interactive elements such as buttons and menus, but +representations of GUIs have not kept up. They do not encapsulate both semantic +and visuo-spatial relationships among elements. To seize machine learning's +potential for GUIs more efficiently, Graph4GUI exploits graph neural networks +to capture individual elements' properties and their semantic-visuo-spatial +constraints in a layout. The learned representation demonstrated its +effectiveness in multiple tasks, especially generating designs in a challenging +GUI autocompletion task, which involved predicting the positions of remaining +unplaced elements in a partially completed GUI. The new model's suggestions +showed alignment and visual appeal superior to the baseline method and received +higher subjective ratings for preference. Furthermore, we demonstrate the +practical benefits and efficiency advantages designers perceive when utilizing +our model as an autocompletion plug-in. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Dynamic in Static: Hybrid Visual Correspondence for Self-Supervised + Video Object Segmentation + + +
+ Conventional video object segmentation (VOS) methods usually necessitate a +substantial volume of pixel-level annotated video data for fully supervised +learning. In this paper, we present HVC, a \textbf{h}ybrid static-dynamic +\textbf{v}isual \textbf{c}orrespondence framework for self-supervised VOS. HVC +extracts pseudo-dynamic signals from static images, enabling an efficient and +scalable VOS model. Our approach utilizes a minimalist fully-convolutional +architecture to capture static-dynamic visual correspondence in image-cropped +views. To achieve this objective, we present a unified self-supervised approach +to learn visual representations of static-dynamic feature similarity. Firstly, +we establish static correspondence by utilizing a priori coordinate information +between cropped views to guide the formation of consistent static feature +representations. Subsequently, we devise a concise convolutional layer to +capture the forward / backward pseudo-dynamic signals between two views, +serving as cues for dynamic representations. Finally, we propose a hybrid +visual correspondence loss to learn joint static and dynamic consistency +representations. Our approach, without bells and whistles, necessitates only +one training session using static image data, significantly reducing memory +consumption ($\sim$16GB) and training time ($\sim$\textbf{2h}). Moreover, HVC +achieves state-of-the-art performance in several self-supervised VOS benchmarks +and additional video label propagation tasks. + +
+
+
+
+
+ + ☆ Authentic Emotion Mapping: Benchmarking Facial Expressions in Real News + + +
+ In this paper, we present a novel benchmark for Emotion Recognition using +facial landmarks extracted from realistic news videos. Traditional methods +relying on RGB images are resource-intensive, whereas our approach with Facial +Landmark Emotion Recognition (FLER) offers a simplified yet effective +alternative. By leveraging Graph Neural Networks (GNNs) to analyze the +geometric and spatial relationships of facial landmarks, our method enhances +the understanding and accuracy of emotion recognition. We discuss the +advancements and challenges in deep learning techniques for emotion +recognition, particularly focusing on Graph Neural Networks (GNNs) and +Transformers. Our experimental results demonstrate the viability and potential +of our dataset as a benchmark, setting a new direction for future research in +emotion recognition technologies. The codes and models are at: +https://github.com/wangzhifengharrison/benchmark_real_news + +
+
+
+
+
+ + ♻ ☆ Prototype-based Interpretable Breast Cancer Prediction Models: Analysis + and Challenges + + +
+ Deep learning models have achieved high performance in medical applications; however, their adoption in clinical practice is hindered by their black-box nature. Self-explainable models, like prototype-based models, can be especially beneficial as they are interpretable by design. However, if the learnt prototypes are of low quality, then the prototype-based models are no better than black-box models. Having high-quality prototypes is a prerequisite for a truly interpretable model. In this work, we propose a prototype evaluation framework for coherence (PEF-C) for quantitatively evaluating the quality of the prototypes based on domain knowledge. We show the use of PEF-C in the context of breast cancer prediction using mammography. Existing works on prototype-based models for breast cancer prediction using mammography have focused on improving the classification performance of prototype-based models compared to black-box models and have evaluated prototype quality through anecdotal evidence. We are the first to go beyond anecdotal evidence and evaluate the quality of the mammography prototypes systematically using our PEF-C. Specifically, we apply three state-of-the-art prototype-based models, ProtoPNet, BRAIxProtoPNet++ and PIP-Net, on mammography images for breast cancer prediction and evaluate these models w.r.t. i) classification performance, and ii) quality of the prototypes, on three public datasets. Our results show that prototype-based models are competitive with black-box models in terms of classification performance, and achieve a higher score in detecting ROIs. However, the quality of the prototypes is not yet sufficient and can be improved in aspects of relevance, purity and learning a variety of prototypes. We call on the XAI community to systematically evaluate the quality of prototypes to check their true usability in high-stakes decisions and to improve such models further. +
+
+ comment: Accepted at World Conference on Explainable Artificial Intelligence; + 21 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning + + +
+ Using single-task deep learning methods to reconstruct Magnetic Resonance +Imaging (MRI) data acquired with different imaging sequences is inherently +challenging. The trained deep learning model typically lacks generalizability, +and the dissimilarity among image datasets with different types of contrast +leads to suboptimal learning performance. This paper proposes a meta-learning +approach to efficiently learn image features from multiple MR image datasets. +Our algorithm can perform multi-task learning to simultaneously reconstruct MR +images acquired using different imaging sequences with different image +contrasts. The experiment results demonstrate the ability of our new +meta-learning reconstruction method to successfully reconstruct +highly-undersampled k-space data from multiple MRI datasets simultaneously, +outperforming other compelling reconstruction methods previously developed for +single-task learning. + +
+
+
+
+
+ + ♻ ☆ Progressive Feature Learning for Realistic Cloth-Changing Gait + Recognition + + +
+ Gait recognition is instrumental in crime prevention and social security, because it can be conducted at a long distance to determine a person's identity. However, existing datasets and methods cannot satisfactorily deal with the most challenging cloth-changing problem in practice. Specifically, practical gait models are usually trained on automatically labeled data, in which the sequences' views and cloth conditions of each person have some restrictions. To be concrete, the cross-view sub-dataset only contains normal walking conditions without cloth changes, while the cross-cloth sub-dataset has cloth-changing sequences but only in front views. As a result, the cloth-changing accuracy cannot meet practical requirements. In this work, we formulate the problem as Realistic Cloth-Changing Gait Recognition (abbreviated as RCC-GR) and construct two benchmarks, CASIA-BN-RCC and OUMVLP-RCC, to simulate the above setting. Furthermore, we propose a new framework called Progressive Feature Learning that can be applied with off-the-shelf backbones to improve their performance in RCC-GR. Specifically, in our framework, we design Progressive Mapping and Progressive Uncertainty to extract cross-view features and then extract cross-cloth features on that basis. In this way, the features from the cross-view sub-dataset can first dominate the feature space and relieve the uneven distribution caused by the adverse effect of the cross-cloth sub-dataset. The experiments on our benchmarks show that our framework can effectively improve recognition performance, especially under cloth-changing conditions. +
+
+
+
+
+ + ♻ ☆ Unsupervised Gait Recognition with Selective Fusion + + +
+ Previous gait recognition methods were primarily trained on labeled datasets, which require painstaking labeling effort. However, using a pre-trained model on a new dataset without fine-tuning can lead to significant performance degradation. To enable the pre-trained gait recognition model to be fine-tuned on unlabeled datasets, we propose a new task: Unsupervised Gait Recognition (UGR). We introduce a new cluster-based baseline to solve UGR with cluster-level contrastive learning. However, we find that this task poses further challenges. First, sequences of the same person in different clothes tend to cluster separately due to the significant appearance changes. Second, sequences taken from 0{\deg} and 180{\deg} views lack walking postures and do not cluster with sequences taken from other views. To address these challenges, we propose a Selective Fusion method, which includes Selective Cluster Fusion (SCF) and Selective Sample Fusion (SSF). With SCF, we merge matched clusters of the same person wearing different clothes by updating the cluster-level memory bank with a multi-cluster update strategy. In SSF, we gradually merge sequences taken from front/back views with curriculum learning. Extensive experiments show the effectiveness of our method in improving rank-1 accuracy under the walking-with-different-coats condition and the front/back view conditions. +
+
+
+
+
+ + ♻ ☆ VidProM: A Million-scale Real Prompt-Gallery Dataset for Text-to-Video + Diffusion Models + + +
+ The arrival of Sora marks a new era for text-to-video diffusion models, +bringing significant advancements in video generation and potential +applications. However, Sora, along with other text-to-video diffusion models, +is highly reliant on prompts, and there is no publicly available dataset that +features a study of text-to-video prompts. In this paper, we introduce VidProM, +the first large-scale dataset comprising 1.67 Million unique text-to-Video +Prompts from real users. Additionally, this dataset includes 6.69 million +videos generated by four state-of-the-art diffusion models, alongside some +related data. We initially discuss the curation of this large-scale dataset, a +process that is both time-consuming and costly. Subsequently, we underscore the +need for a new prompt dataset specifically designed for text-to-video +generation by illustrating how VidProM differs from DiffusionDB, a large-scale +prompt-gallery dataset for image generation. Our extensive and diverse dataset +also opens up many exciting new research areas. For instance, we suggest +exploring text-to-video prompt engineering, efficient video generation, and +video copy detection for diffusion models to develop better, more efficient, +and safer models. The project (including the collected dataset VidProM and +related code) is publicly available at https://vidprom.github.io under the +CC-BY-NC 4.0 License. + +
+
+ comment: The project (including the collected dataset VidProM and related + code) is publicly available at https://vidprom.github.io under the CC-BY-NC + 4.0 License +
+
+
+
+
+ + ♻ ☆ Pre-training with Random Orthogonal Projection Image Modeling ICLR + + +
+ Masked Image Modeling (MIM) is a powerful self-supervised strategy for visual pre-training without the use of labels. MIM applies random crops to input images, processes them with an encoder, and then recovers the masked inputs with a decoder, which encourages the network to capture and learn structural information about objects and scenes. The intermediate feature representations obtained from MIM are suitable for fine-tuning on downstream tasks. In this paper, we propose an Image Modeling framework based on random orthogonal projection instead of binary masking as in MIM. Our proposed Random Orthogonal Projection Image Modeling (ROPIM) reduces spatial token information under a guaranteed bound on the noise variance, and can be considered as masking the entire spatial image area under locally varying masking degrees. Since ROPIM uses a random subspace for the projection that realizes the masking step, the readily available complement of the subspace can be used during unmasking to promote recovery of the removed information. We show that using random orthogonal projection leads to superior performance compared to crop-based masking, and we demonstrate state-of-the-art results on several popular benchmarks. +
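One way to read the projection-as-masking idea is sketched below: token features are projected onto a random low-dimensional subspace, and the orthogonal complement, which carries the removed information, remains available for the recovery step. This is an interpretation for illustration only; the paper's noise-variance bound and locally varying masking degrees are omitted.

```python
# Masking token features by random orthogonal projection (illustrative sketch).
import torch

def random_orthogonal_projection(tokens, keep_dim):
    """tokens: [B, N, D]; keeps a random keep_dim-dimensional subspace of R^D."""
    d = tokens.size(-1)
    q, _ = torch.linalg.qr(torch.randn(d, d))    # random orthogonal basis of R^D
    basis = q[:, :keep_dim]                      # [D, keep_dim]
    proj = basis @ basis.T                       # projector onto the kept subspace
    return tokens @ proj, torch.eye(d) - proj    # projected tokens, complement projector

masked_tokens, complement = random_orthogonal_projection(torch.randn(4, 196, 768), keep_dim=192)
```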
+
+ comment: Published as a conference paper at the International Conference on + Learning Representations (ICLR) 2024. 19 pages +
+
+
+
+
+ + ♻ ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason +step-by-step, and can motivate their logical reasoning ability. While effective +for logical tasks, CoT is not conducive to creative problem-solving which often +requires out-of-box thoughts and is crucial for innovation advancements. In +this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a +non-sequential, creative paradigm involving strong associations and knowledge +leaps. To this end, we study LLMs on the popular Oogiri game which needs +participants to have good creativity and strong associative thinking for +responding unexpectedly and humorously to the given image, text, or both, and +thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the +Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset +which contains over 130,000 samples from the Oogiri game, and observe the +insufficient LoT ability or failures of most existing LLMs on the Oogiri game. +Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve +LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into +LoT-oriented instruction tuning data to train pretrained LLM for achieving +certain LoT humor generation and discrimination abilities. Then CLoT designs an +explorative self-refinement that encourages the LLM to generate more creative +LoT data via exploring parallels between seemingly unrelated concepts and +selects high-quality data to train itself for self-refinement. CLoT not only +excels in humor generation in the Oogiri game but also boosts creative +abilities in various tasks like cloud guessing game and divergent association +task. These findings advance our understanding and offer a pathway to improve +LLMs' creative capacities for innovative applications across domains. The +dataset, code, and models will be released online. +https://zhongshsh.github.io/CLoT/. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Analyzing Decades-Long Environmental Changes in Namibia Using Archival + Aerial Photography and Deep Learning + + +
+ This study explores object detection in historical aerial photographs of +Namibia to identify long-term environmental changes. Specifically, we aim to +identify key objects -- Waterholes, Omuti homesteads, and Big trees -- around +Oshikango in Namibia using sub-meter gray-scale aerial imagery from 1943 and +1972. In this work, we propose a workflow for analyzing historical aerial +imagery using a deep semantic segmentation model on sparse hand-labels. To this +end, we employ a number of strategies including class-weighting, +pseudo-labeling and empirical p-value-based filtering to balance skewed and +sparse representations of objects in the ground truth data. Results demonstrate +the benefits of these different training strategies resulting in an average +$F_1=0.661$ and $F_1=0.755$ over the three objects of interest for the 1943 and +1972 imagery, respectively. We also identified that the average size of +Waterhole and Big trees increased while the average size of Omuti homesteads +decreased between 1943 and 1972 reflecting some of the local effects of the +massive post-Second World War economic, agricultural, demographic, and +environmental changes. This work also highlights the untapped potential of +historical aerial photographs in understanding long-term environmental changes +beyond Namibia (and Africa). With the lack of adequate satellite technology in +the past, archival aerial photography offers a great alternative to uncover +decades-long environmental changes. + +
+
+
+
+
+ + ♻ ☆ EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention + + +
+ Thanks to the advancement of deep learning technology, vision transformers have demonstrated competitive performance in various computer vision tasks. Unfortunately, vision transformers still face some challenges, such as high computational complexity and the absence of a desirable inductive bias. To alleviate these issues, we propose a novel Bi-Fovea Self-Attention (BFSA) inspired by the physiological structure and visual properties of eagle eyes. This BFSA is used to simulate the shallow and deep fovea of eagle vision, prompting the network to learn the feature representation of targets from coarse to fine. Additionally, we design a Bionic Eagle Vision (BEV) block based on BFSA. It combines the advantages of convolution and introduces a novel Bi-Fovea Feedforward Network (BFFN) to mimic the way the biological visual cortex processes information hierarchically and in parallel. Furthermore, we develop a unified and efficient pyramid backbone network family called Eagle Vision Transformers (EViTs) by stacking BEV blocks. Experimental results show that EViTs exhibit highly competitive performance in various computer vision tasks such as image classification, object detection and semantic segmentation. Especially in terms of performance and computational efficiency, EViTs show significant advantages over their counterparts. Code is available at https://github.com/nkusyl/EViT +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ OmniMedVQA: A New Large-Scale Comprehensive Evaluation Benchmark for + Medical LVLM + + +
+ Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities in various multimodal tasks. However, their potential in the medical domain remains largely unexplored. A significant challenge arises from the scarcity of diverse medical images spanning various modalities and anatomical regions, which is essential in real-world medical applications. To solve this problem, in this paper, we introduce OmniMedVQA, a novel comprehensive medical Visual Question Answering (VQA) benchmark. This benchmark is collected from 73 different medical datasets, including 12 different modalities and covering more than 20 distinct anatomical regions. Importantly, all images in this benchmark are sourced from authentic medical scenarios, ensuring alignment with the requirements of the medical field and suitability for evaluating LVLMs. Through our extensive experiments, we have found that existing LVLMs struggle to address these medical VQA problems effectively. Moreover, what surprises us is that medical-specialized LVLMs even exhibit inferior performance to those general-domain models, calling for a more versatile and robust LVLM in the biomedical field. The evaluation results not only reveal the current limitations of LVLMs in understanding real medical images but also highlight our dataset's significance. Our code and dataset are available at https://github.com/OpenGVLab/Multi-Modality-Arena. +
+
+
+
+
+ + ♻ ☆ FakeTracer: Catching Face-swap DeepFakes via Implanting Traces in + Training + + +
+ Face-swap DeepFake is an emerging AI-based face forgery technique that can +replace the original face in a video with a generated face of the target +identity while retaining consistent facial attributes such as expression and +orientation. Due to the high privacy of faces, the misuse of this technique can +raise severe social concerns, drawing tremendous attention to defend against +DeepFakes recently. In this paper, we describe a new proactive defense method +called FakeTracer to expose face-swap DeepFakes via implanting traces in +training. Compared to general face-synthesis DeepFake, the face-swap DeepFake +is more complex as it involves identity change, is subjected to the +encoding-decoding process, and is trained unsupervised, increasing the +difficulty of implanting traces into the training phase. To effectively defend +against face-swap DeepFake, we design two types of traces, sustainable trace +(STrace) and erasable trace (ETrace), to be added to training faces. During the +training, these manipulated faces affect the learning of the face-swap DeepFake +model, enabling it to generate faces that only contain sustainable traces. In +light of these two traces, our method can effectively expose DeepFakes by +identifying them. Extensive experiments corroborate the efficacy of our method +on defending against face-swap DeepFake. + +
+
+
+
+
+ + ♻ ☆ Generalizable Face Landmarking Guided by Conditional Face Warping CVPR 2024 + + +
+ As a significant step for human face modeling, editing, and generation, face +landmarking aims at extracting facial keypoints from images. A generalizable +face landmarker is required in practice because real-world facial images, e.g., +the avatars in animations and games, are often stylized in various ways. +However, achieving generalizable face landmarking is challenging due to the +diversity of facial styles and the scarcity of labeled stylized faces. In this +study, we propose a simple but effective paradigm to learn a generalizable face +landmarker based on labeled real human faces and unlabeled stylized faces. Our +method learns the face landmarker as the key module of a conditional face +warper. Given a pair of real and stylized facial images, the conditional face +warper predicts a warping field from the real face to the stylized one, in +which the face landmarker predicts the ending points of the warping field and +provides us with high-quality pseudo landmarks for the corresponding stylized +facial images. Applying an alternating optimization strategy, we learn the face +landmarker to minimize $i)$ the discrepancy between the stylized faces and the +warped real ones and $ii)$ the prediction errors of both real and pseudo +landmarks. Experiments on various datasets show that our method outperforms +existing state-of-the-art domain adaptation methods in face landmarking tasks, +leading to a face landmarker with better generalizability. Code is available at +https://plustwo0.github.io/project-face-landmarker. + +
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Large-scale Dataset Pruning with Dynamic Uncertainty + + +
+ The state of the art in many learning tasks, e.g., image classification, is advanced by collecting larger datasets and then training larger models on them. As a result, the increasing computational cost is becoming unaffordable. In this paper, we investigate how to prune large-scale datasets and thus produce an informative subset for training sophisticated deep models with a negligible performance drop. We propose a simple yet effective dataset pruning method by exploring both the prediction uncertainty and the training dynamics. We study dataset pruning by measuring the variation of predictions during the whole training process on large-scale datasets, i.e., ImageNet-1K and ImageNet-21K, and advanced models, i.e., Swin Transformer and ConvNeXt. Extensive experimental results indicate that our method outperforms the state of the art and achieves a 25% lossless pruning ratio on both ImageNet-1K and ImageNet-21K. The code and pruned datasets are available at https://github.com/BAAI-DCAI/Dataset-Pruning. +
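A bare-bones sketch of scoring examples by how much their predictions vary over training, the general idea behind the uncertainty-and-dynamics criterion described above; the paper's exact score and selection rule are not reproduced, and `probs_over_epochs` is an assumed precomputed tensor of true-class probabilities per checkpoint.

```python
# Keep the examples whose predictions fluctuate most across checkpoints (sketch).
import torch

def dynamic_uncertainty_scores(probs_over_epochs):
    """probs_over_epochs: [E, N] -> per-example score; higher = more informative."""
    return probs_over_epochs.std(dim=0)

def prune_dataset(probs_over_epochs, keep_ratio=0.75):
    scores = dynamic_uncertainty_scores(probs_over_epochs)
    k = int(keep_ratio * scores.numel())
    return scores.topk(k).indices                # indices of examples to keep

keep_idx = prune_dataset(torch.rand(20, 10000), keep_ratio=0.75)
```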
+
+
+
+
+ + ♻ ☆ PI3D: Efficient Text-to-3D Generation with Pseudo-Image Diffusion CVPR 2024 + + +
+ Diffusion models trained on large-scale text-image datasets have demonstrated +a strong capability of controllable high-quality image generation from +arbitrary text prompts. However, the generation quality and generalization +ability of 3D diffusion models is hindered by the scarcity of high-quality and +large-scale 3D datasets. In this paper, we present PI3D, a framework that fully +leverages the pre-trained text-to-image diffusion models' ability to generate +high-quality 3D shapes from text prompts in minutes. The core idea is to +connect the 2D and 3D domains by representing a 3D shape as a set of Pseudo RGB +Images. We fine-tune an existing text-to-image diffusion model to produce such +pseudo-images using a small number of text-3D pairs. Surprisingly, we find that +it can already generate meaningful and consistent 3D shapes given complex text +descriptions. We further take the generated shapes as the starting point for a +lightweight iterative refinement using score distillation sampling to achieve +high-quality generation under a low budget. PI3D generates a single 3D shape +from text in only 3 minutes and the quality is validated to outperform existing +3D generative models by a large margin. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Revisiting Adversarial Training at Scale CVPR 2024 + + +
+ The machine learning community has witnessed a drastic change in the training
+pipeline, pivoted by those ''foundation models'' with unprecedented scales.
+However, the field of adversarial training is lagging behind, predominantly
+centered around small model sizes like ResNet-50, and tiny and low-resolution
+datasets like CIFAR-10. To bridge this transformation gap, this paper provides
+a modern re-examination of adversarial training, investigating its potential
+benefits when applied at scale. Additionally, we introduce an efficient and
+effective training strategy to enable adversarial training with giant models
+and web-scale data at an affordable computing cost. We denote this newly
+introduced framework as AdvXL.
+ Empirical results demonstrate that AdvXL establishes new state-of-the-art
+robust accuracy records under AutoAttack on ImageNet-1K. For example, by
+training on the DataComp-1B dataset, our AdvXL empowers a vanilla ViT-g model to
+substantially surpass the previous records of $l_{\infty}$-, $l_{2}$-, and
+$l_{1}$-robust accuracy by margins of 11.4%, 14.2% and 12.9%, respectively.
+This achievement posits AdvXL as a pioneering approach, charting a new
+trajectory for the efficient training of robust visual representations at
+significantly larger scales. Our code is available at
+https://github.com/UCSC-VLAA/AdvXL.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled + Examples in Semi-supervised Learning IJCAI 2024 + + +
+ Despite the progress of Semi-supervised Learning (SSL), existing methods fail
+to utilize unlabeled data effectively and efficiently. Many pseudo-label-based
+methods select unlabeled examples based on inaccurate confidence scores from
+the classifier. Most prior work also uses all available unlabeled data without
+pruning, making it difficult to handle large amounts of unlabeled data. To
+address these issues, we propose two methods: Variational Confidence
+Calibration (VCC) and Influence-Function-based Unlabeled Sample Elimination
+(INFUSE). VCC is a universal plugin for SSL confidence calibration, using a
+variational autoencoder to select more accurate pseudo labels based on three
+types of consistency scores. INFUSE is a data pruning method that constructs a
+core dataset of unlabeled examples under SSL. Our methods are effective on
+multiple datasets and settings, reducing classification error rates and saving
+training time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the
+CIFAR-100 dataset by 1.08% while saving nearly half of the training time.
+
+
+
+ comment: Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng + contributed equally to this paper. New version, some problems and typos are + fixed +
+
+
+
+
+ + ♻ ☆ How to Evaluate Semantic Communications for Images with ViTScore Metric? + + +
+ Semantic communications (SC) are expected to be a new paradigm that catalyzes
+next-generation communication, shifting the main concern from accurate bit
+transmission to effective semantic information exchange. However, the previous
+and widely-used metrics for images are not applicable to evaluate the image
+semantic similarity in SC. Classical metrics to measure the similarity between
+two images usually rely on the pixel level or the structural level, such as the
+PSNR and the MS-SSIM. Straightforwardly using some tailored metrics based on
+deep-learning methods in the CV community, such as the LPIPS, is infeasible for
+SC. To tackle this, inspired by BERTScore in the NLP community, we propose a
+novel metric for evaluating image semantic similarity, named Vision Transformer
+Score (ViTScore). We prove theoretically that ViTScore has 3 important
+properties, including symmetry, boundedness, and normalization, which make
+ViTScore convenient and intuitive for image measurement. To evaluate the
+performance of ViTScore, we compare ViTScore with 3 typical metrics (PSNR,
+MS-SSIM, and LPIPS) through 4 classes of experiments: (i) correlation with
+BERTScore through evaluation of the image captioning downstream CV task, (ii)
+evaluation in classical image communications, (iii) evaluation in image
+semantic communication systems, and (iv) evaluation in image semantic
+communication systems with semantic attack. Experimental results demonstrate
+that ViTScore is robust and efficient in evaluating the semantic similarity of
+images. Particularly, ViTScore outperforms the other 3 typical metrics in
+evaluating the image semantic changes caused by semantic attacks, such as image
+inversion with Generative Adversarial Networks (GANs). This indicates that
+ViTScore is an effective performance metric when deployed in SC scenarios.
+
+
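+ The BERTScore-style matching carries over to image patches roughly as follows;
+ this is a generic sketch that assumes pre-extracted ViT patch embeddings and a
+ simple greedy matching, not the paper's exact ViTScore definition.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def vitscore_like(emb_a: torch.Tensor, emb_b: torch.Tensor) -> float:
+     """emb_a, emb_b: (Na, D) and (Nb, D) patch embeddings from a ViT encoder.
+     Each patch is greedily matched to its most similar patch in the other
+     image; precision/recall are the mean best cosine similarities, combined
+     into an F1-style score (symmetric and bounded by construction)."""
+     a = F.normalize(emb_a, dim=-1)
+     b = F.normalize(emb_b, dim=-1)
+     sim = a @ b.t()                            # (Na, Nb) cosine similarities
+     recall = sim.max(dim=1).values.mean()      # best match for each patch of A
+     precision = sim.max(dim=0).values.mean()   # best match for each patch of B
+     return (2 * precision * recall / (precision + recall)).item()
+
+ score = vitscore_like(torch.randn(196, 768), torch.randn(196, 768))
+ print(f"ViTScore-like similarity: {score:.3f}")
+ ```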
+
+
+
+
+ + ♻ ☆ Spiking Structured State Space Model for Monaural Speech Enhancement + + +
+ Speech enhancement seeks to extract clean speech from noisy signals. +Traditional deep learning methods face two challenges: efficiently using +information in long speech sequences and high computational costs. To address +these, we introduce the Spiking Structured State Space Model (Spiking-S4). This +approach merges the energy efficiency of Spiking Neural Networks (SNN) with the +long-range sequence modeling capabilities of Structured State Space Models +(S4), offering a compelling solution. Evaluation on the DNS Challenge and +VoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial +Neural Network (ANN) methods but with fewer computational resources, as +evidenced by reduced parameters and Floating Point Operations (FLOPs). + +
+
+
+
+
+ + ♻ ☆ UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with + Fine-Grained Feature Representation IJCAI 2024 + + +
+ 3D open-vocabulary scene understanding aims to recognize arbitrary novel
+categories beyond the base label space. However, existing works not only fail
+to fully utilize all the available modal information in the 3D domain but also
+lack sufficient granularity in representing the features of each modality. In
+this paper, we propose a unified multimodal 3D open-vocabulary scene
+understanding network, namely UniM-OV3D, which aligns point clouds with image,
+language and depth. To better integrate global and local features of the point
+clouds, we design a hierarchical point cloud feature extraction module that
+learns comprehensive fine-grained feature representations. Further, to
+facilitate the learning of coarse-to-fine point-semantic representations from
+captions, we propose the utilization of hierarchical 3D caption pairs,
+capitalizing on geometric constraints across various viewpoints of 3D scenes.
+Extensive experimental results demonstrate the effectiveness and superiority of
+our method in open-vocabulary semantic and instance segmentation, which
+achieves state-of-the-art performance on both indoor and outdoor benchmarks
+such as ScanNet, ScanNet200, S3DIS and nuScenes. Code is available at
+https://github.com/hithqd/UniM-OV3D.
+
+
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided + Reinforcement Learning + + +
+ From a visual perception perspective, modern graphical user interfaces (GUIs) +comprise a complex graphics-rich two-dimensional visuospatial arrangement of +text, images, and interactive objects such as buttons and menus. While existing +models can accurately predict regions and objects that are likely to attract +attention ``on average'', so far there is no scanpath model capable of +predicting scanpaths for an individual. To close this gap, we introduce +EyeFormer, which leverages a Transformer architecture as a policy network to +guide a deep reinforcement learning algorithm that controls gaze locations. Our +model has the unique capability of producing personalized predictions when +given a few user scanpath samples. It can predict full scanpath information, +including fixation positions and duration, across individuals and various +stimulus types. Additionally, we demonstrate applications in GUI layout +optimization driven by our model. Our software and models will be publicly +available. + +
+
+
+
+
+ + ♻ ☆ UCM-Net: A Lightweight and Efficient Solution for Skin Lesion + Segmentation using MLP and CNN + + +
+ Skin cancer poses a significant public health challenge, necessitating
+efficient diagnostic tools. We introduce UCM-Net, a novel skin lesion
+segmentation model combining Multi-Layer Perceptrons (MLP) and Convolutional
+Neural Networks (CNN). This lightweight, efficient architecture, deviating from
+traditional UNet designs, dramatically reduces computational demands, making it
+ideal for mobile health applications. Evaluated on PH2, ISIC 2017, and ISIC
+2018 datasets, UCM-Net demonstrates robust performance with fewer than 50KB
+parameters and requires less than 0.05 Giga Operations Per Second (GLOPs).
+Moreover, its minimal memory requirement is just 1.19MB in a CPU environment,
+positioning it as a potential benchmark for efficiency in skin lesion
+segmentation, suitable for deployment in resource-constrained settings. In
+order to facilitate accessibility and further research in the field, the
+UCM-Net source code is available at https://github.com/chunyuyuan/UCM-Net.
+
+
+
+ comment: 17 pages, under review +
+
+
+
+
+ + ♻ ☆ Motion-Guided Dual-Camera Tracker for Low-Cost Skill Evaluation of + Gastric Endoscopy + + +
+ Gastric simulators with objective educational feedback have been proven +useful for endoscopy training. Existing electronic simulators with feedback are +however not commonly adopted due to their high cost. In this work, a +motion-guided dual-camera tracker is proposed to provide reliable endoscope tip +position feedback at a low cost inside a mechanical simulator for endoscopy +skill evaluation, tackling several unique challenges. To address the issue of +significant appearance variation of the endoscope tip while keeping dual-camera +tracking consistency, the cross-camera mutual template strategy (CMT) is +proposed to introduce dynamic transient mutual templates to dual-camera +tracking. To alleviate disturbance from large occlusion and distortion by the +light source from the endoscope tip, the Mamba-based motion-guided prediction +head (MMH) is presented to aggregate historical motion with visual tracking. It +is the first application of Mamba for object tracking. The proposed tracker was +evaluated on datasets captured by low-cost camera pairs during endoscopy +procedures performed inside the mechanical simulator. The tracker achieves SOTA +performance with robust and consistent tracking on dual cameras. Further +downstream evaluation proves that the 3D tip position determined by the +proposed tracker enables reliable skill differentiation. The code and dataset +are available at https://github.com/PieceZhang/MotionDCTrack + +
+
+
+
+
+ + ♻ ☆ Interpretation of Neural Networks is Susceptible to Universal + Adversarial Perturbations + + +
+ Interpreting neural network classifiers using gradient-based saliency maps +has been extensively studied in the deep learning literature. While the +existing algorithms manage to achieve satisfactory performance in application +to standard image recognition datasets, recent works demonstrate the +vulnerability of widely-used gradient-based interpretation schemes to +norm-bounded perturbations adversarially designed for every individual input +sample. However, such adversarial perturbations are commonly designed using the +knowledge of an input sample, and hence perform sub-optimally in application to +an unknown or constantly changing data point. In this paper, we show the +existence of a Universal Perturbation for Interpretation (UPI) for standard +image datasets, which can alter a gradient-based feature map of neural networks +over a significant fraction of test samples. To design such a UPI, we propose a +gradient-based optimization method as well as a principal component analysis +(PCA)-based approach to compute a UPI which can effectively alter a neural +network's gradient-based interpretation on different samples. We support the +proposed UPI approaches by presenting several numerical results of their +successful applications to standard image datasets. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 70 + +
+
+
+ + ☆ Joint Quality Assessment and Example-Guided Image Processing by + Disentangling Picture Appearance from Content + + +
+ The deep learning revolution has strongly impacted low-level image processing +tasks such as style/domain transfer, enhancement/restoration, and visual +quality assessments. Despite often being treated separately, the aforementioned +tasks share a common theme of understanding, editing, or enhancing the +appearance of input images without modifying the underlying content. We +leverage this observation to develop a novel disentangled representation +learning method that decomposes inputs into content and appearance features. +The model is trained in a self-supervised manner and we use the learned +features to develop a new quality prediction model named DisQUE. We demonstrate +through extensive evaluations that DisQUE achieves state-of-the-art accuracy +across quality prediction tasks and distortion types. Moreover, we demonstrate +that the same features may also be used for image processing tasks such as HDR +tone mapping, where the desired output characteristics may be tuned using +example input-output pairs. + +
+
+
+
+
+ + ☆ Deep SE(3)-Equivariant Geometric Reasoning for Precise Placement Tasks ICLR 2024 + + +
+ Many robot manipulation tasks can be framed as geometric reasoning tasks,
+where an agent must be able to precisely manipulate an object into a position
+that satisfies the task from a set of initial conditions. Often, task success
+is defined based on the relationship between two objects - for instance,
+hanging a mug on a rack. In such cases, the solution should be equivariant to
+the initial position of the objects as well as the agent, and invariant to the
+pose of the camera. This poses a challenge for learning systems which attempt
+to solve this task by learning directly from high-dimensional demonstrations:
+the agent must learn to be both equivariant as well as precise, which can be
+challenging without any inductive biases about the problem. In this work, we
+propose a method for precise relative pose prediction which is provably
+SE(3)-equivariant, can be learned from only a few demonstrations, and can
+generalize across variations in a class of objects. We accomplish this by
+factoring the problem into learning an SE(3) invariant task-specific
+representation of the scene and then interpreting this representation with
+novel geometric reasoning layers which are provably SE(3) equivariant. We
+demonstrate that our method can yield substantially more precise placement
+predictions in simulated placement tasks than previous methods trained with the
+same amount of data, and can accurately represent relative placement
+relationships in data collected from real-world demonstrations. Supplementary
+information and videos can be found at
+https://sites.google.com/view/reldist-iclr-2023.
+
+
+
+ comment: Published at International Conference on Representation Learning + (ICLR 2024) +
+
+
+
+
+ + ☆ Composing Pre-Trained Object-Centric Representations for Robotics From + "What" and "Where" Foundation Models ICRA 2024 + + +
+ There have recently been large advances both in pre-training visual +representations for robotic control and segmenting unknown category objects in +general images. To leverage these for improved robot learning, we propose +$\textbf{POCR}$, a new framework for building pre-trained object-centric +representations for robotic control. Building on theories of "what-where" +representations in psychology and computer vision, we use segmentations from a +pre-trained model to stably locate across timesteps, various entities in the +scene, capturing "where" information. To each such segmented entity, we apply +other pre-trained models that build vector descriptions suitable for robotic +control tasks, thus capturing "what" the entity is. Thus, our pre-trained +object-centric representations for control are constructed by appropriately +combining the outputs of off-the-shelf pre-trained models, with no new +training. On various simulated and real robotic tasks, we show that imitation +policies for robotic manipulators trained on POCR achieve better performance +and systematic generalization than state of the art pre-trained representations +for robotics, as well as prior object-centric representations that are +typically trained from scratch. + +
+
+ comment: ICRA 2024. Project website: https://sites.google.com/view/pocr +
+
+
+
+
+ + ☆ Cut-FUNQUE: An Objective Quality Model for Compressed Tone-Mapped High + Dynamic Range Videos + + +
+ High Dynamic Range (HDR) videos have enjoyed a surge in popularity in recent
+years due to their ability to represent a wider range of contrast and color
+than Standard Dynamic Range (SDR) videos. Although HDR video capture has seen
+increasing popularity because of recent flagship mobile phones such as Apple
+iPhones, Google Pixels, and Samsung Galaxy phones, a broad swath of consumers
+still utilize legacy SDR displays that are unable to display HDR videos. As a
+result, HDR videos must be processed, i.e., tone-mapped, before streaming to a
+large section of SDR-capable video consumers. However, server-side tone-mapping
+involves automating decisions regarding the choices of tone-mapping operators
+(TMOs) and their parameters to yield high-fidelity outputs. Moreover, these
+choices must be balanced against the effects of lossy compression, which is
+ubiquitous in streaming scenarios. In this work, we develop a novel, efficient
+model of objective video quality named Cut-FUNQUE that is able to accurately
+predict the visual quality of tone-mapped and compressed HDR videos. Finally,
+we evaluate Cut-FUNQUE on a large-scale crowdsourced database of such videos
+and show that it achieves state-of-the-art accuracy.
+
+
+
+
+
+
+ + ☆ SiNC+: Adaptive Camera-Based Vitals with Unsupervised Learning of + Periodic Signals CVPR2023 + + +
+ Subtle periodic signals, such as blood volume pulse and respiration, can be +extracted from RGB video, enabling noncontact health monitoring at low cost. +Advancements in remote pulse estimation -- or remote photoplethysmography +(rPPG) -- are currently driven by deep learning solutions. However, modern +approaches are trained and evaluated on benchmark datasets with ground truth +from contact-PPG sensors. We present the first non-contrastive unsupervised +learning framework for signal regression to mitigate the need for labelled +video data. With minimal assumptions of periodicity and finite bandwidth, our +approach discovers the blood volume pulse directly from unlabelled videos. We +find that encouraging sparse power spectra within normal physiological +bandlimits and variance over batches of power spectra is sufficient for +learning visual features of periodic signals. We perform the first experiments +utilizing unlabelled video data not specifically created for rPPG to train +robust pulse rate estimators. Given the limited inductive biases, we +successfully applied the same approach to camera-based respiration by changing +the bandlimits of the target signal. This shows that the approach is general +enough for unsupervised learning of bandlimited quasi-periodic signals from +different domains. Furthermore, we show that the framework is effective for +finetuning models on unlabelled video from a single subject, allowing for +personalized and adaptive signal regressors. + +
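+ A minimal PyTorch sketch of such an unsupervised spectral loss is shown below;
+ the band limits, the particular sparsity and variance terms, and their equal
+ weighting are assumptions for illustration, not the paper's exact objective.
+
+ ```python
+ import torch
+
+ def unsupervised_periodic_loss(pred: torch.Tensor, fs: float = 30.0,
+                                band=(0.66, 3.0)) -> torch.Tensor:
+     """pred: (batch, T) signals predicted from unlabelled video. Encourages
+     (i) little power outside the physiological band, (ii) a sparse, peaky
+     spectrum inside the band, and (iii) spectral variance over the batch."""
+     pred = pred - pred.mean(dim=-1, keepdim=True)
+     spec = torch.fft.rfft(pred, dim=-1).abs() ** 2
+     spec = spec / (spec.sum(dim=-1, keepdim=True) + 1e-8)
+     freqs = torch.fft.rfftfreq(pred.shape[-1], d=1.0 / fs)
+     in_band = (freqs >= band[0]) & (freqs <= band[1])
+     bandwidth = spec[:, ~in_band].sum(dim=-1).mean()          # out-of-band power
+     sparsity = -spec[:, in_band].max(dim=-1).values.mean()    # want one dominant peak
+     variance = -spec[:, in_band].std(dim=0).mean()            # spectra differ across the batch
+     return bandwidth + sparsity + variance
+
+ loss = unsupervised_periodic_loss(torch.randn(8, 300))  # 8 clips of 10 s at 30 fps
+ print(loss.item())
+ ```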
+
+ comment: Extension of CVPR2023 highlight paper. arXiv admin note: substantial + text overlap with arXiv:2303.07944 +
+
+
+
+
+ + ☆ DMesh: A Differentiable Representation for General Meshes + + +
+ We present a differentiable representation, DMesh, for general 3D triangular
+meshes. DMesh considers both the geometry and connectivity information of a
+mesh. In our design, we first get a set of convex tetrahedra that compactly
+tessellates the domain based on Weighted Delaunay Triangulation (WDT), and
+formulate the probability of faces existing on our desired mesh in a
+differentiable manner based on the WDT. This enables DMesh to represent meshes
+of various topologies in a differentiable way, and allows us to reconstruct the
+mesh under various observations, such as point clouds and multi-view images,
+using gradient-based optimization. The source code and full paper are available
+at: https://sonsang.github.io/dmesh-project.
+
+
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ FisheyeDetNet: Object Detection on Fisheye Surround View Camera Systems + for Automated Driving + + +
+ Object detection is a mature problem in autonomous driving with pedestrian
+detection being one of the first deployed algorithms. It has been
+comprehensively studied in the literature. However, object detection is
+relatively less explored for fisheye cameras used for surround-view near field
+sensing. The standard bounding box representation fails in fisheye cameras due
+to heavy radial distortion, particularly in the periphery. To mitigate this, we
+explore extending the standard object detection output representation of the
+bounding box. We design rotated bounding box, ellipse, and generic polygon (as
+polar arc/angle) representations and define an instance segmentation mIoU
+metric to analyze these representations. The proposed model FisheyeDetNet with
+polygon outperforms others and achieves a mAP score of 49.5% on the Valeo
+fisheye surround-view dataset for automated driving applications. This dataset
+has 60K images captured from 4 surround-view cameras across Europe, North
+America and Asia. To the best of our knowledge, this is the first detailed
+study on object detection on fisheye cameras for autonomous driving scenarios.
+
+
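+ As a rough illustration of the polar polygon output mentioned above, the sketch
+ below decodes a centre plus per-angle radii into Cartesian vertices; the number
+ of sample points and the centre-plus-radii parameterization are assumptions,
+ not the exact FisheyeDetNet head.
+
+ ```python
+ import numpy as np
+
+ def polar_polygon_to_vertices(cx: float, cy: float, radii: np.ndarray) -> np.ndarray:
+     """Decode a polygon parameterized as an object centre (cx, cy) plus K radii
+     sampled at equally spaced angles, a representation better suited to heavy
+     radial distortion than an axis-aligned box. Returns (K, 2) vertices."""
+     k = len(radii)
+     angles = np.linspace(0.0, 2.0 * np.pi, k, endpoint=False)
+     return np.stack([cx + radii * np.cos(angles),
+                      cy + radii * np.sin(angles)], axis=1)
+
+ def polygon_area(vertices: np.ndarray) -> float:
+     """Shoelace formula; a building block for IoU-style comparisons."""
+     x, y = vertices[:, 0], vertices[:, 1]
+     return 0.5 * abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+ verts = polar_polygon_to_vertices(320, 240, np.full(24, 50.0))  # ~ a circle of radius 50
+ print(polygon_area(verts))  # close to pi * 50**2
+ ```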
+
+
+
+
+ + ☆ High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided + Neural Surfaces + + +
+ In surgical oncology, screening colonoscopy plays a pivotal role in providing +diagnostic assistance, such as biopsy, and facilitating surgical navigation, +particularly in polyp detection. Computer-assisted endoscopic surgery has +recently gained attention and amalgamated various 3D computer vision +techniques, including camera localization, depth estimation, surface +reconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit +Surfaces (NeuS) have emerged as promising methodologies for deriving accurate +3D surface models from sets of registered images, addressing the limitations of +existing colon reconstruction approaches stemming from constrained camera +movement. + However, the inadequate tissue texture representation and confused scale +problem in monocular colonoscopic image reconstruction still impede the +progress of the final rendering results. In this paper, we introduce a novel +method for colon section reconstruction by leveraging NeuS applied to +endoscopic images, supplemented by a single frame of depth map. Notably, we +pioneered the exploration of utilizing only one frame depth map in +photorealistic reconstruction and neural rendering applications while this +single depth map can be easily obtainable from other monocular depth estimation +networks with an object scale. Through rigorous experimentation and validation +on phantom imagery, our approach demonstrates exceptional accuracy in +completely rendering colon sections, even capturing unseen portions of the +surface. This breakthrough opens avenues for achieving stable and consistently +scaled reconstructions, promising enhanced quality in cancer screening +procedures and treatment interventions. + +
+
+
+
+
+ + ☆ Nested-TNT: Hierarchical Vision Transformers with Multi-Scale Feature + Processing + + +
+ Transformer has been applied in the field of computer vision due to its
+excellent performance in natural language processing, surpassing traditional
+convolutional neural networks and achieving new state-of-the-art results. ViT
+divides an image into several local patches, known as "visual sentences".
+However, the information contained in the image is vast and complex, and
+focusing only on the features at the "visual sentence" level is not enough. The
+features between local patches should also be taken into consideration. In
+order to achieve further improvement, the TNT model was proposed, which further
+divides the image into smaller patches, namely "visual words," achieving more
+accurate results. The core of Transformer is the Multi-Head Attention
+mechanism, and traditional attention mechanisms ignore interactions across
+different attention heads. In order to reduce redundancy and improve
+utilization, we introduce the nested algorithm and apply the Nested-TNT to
+image classification tasks. The experiments confirm that the proposed model
+achieves better classification performance than ViT and TNT, exceeding them by
+2.25% and 1.1% on CIFAR10 and by 2.78% and 0.25% on FLOWERS102, respectively.
+
+
+
+
+
+
+ + ☆ AdvLoRA: Adversarial Low-Rank Adaptation of Vision-Language Models + + +
+ Vision-Language Models (VLMs) are a significant technique for Artificial
+General Intelligence (AGI). With the fast growth of AGI, the security problem
+has become one of the most important challenges for VLMs. In this paper,
+through extensive experiments, we demonstrate the vulnerability of the
+conventional adaptation methods for VLMs, which may bring significant security
+risks. In addition, as the size of the VLMs increases, performing conventional
+adversarial adaptation techniques on VLMs results in high computational costs.
+To solve these problems, we propose a parameter-efficient Adversarial
+adaptation method named AdvLoRA based on Low-Rank Adaptation. At first, we
+investigate and reveal the intrinsic low-rank property during the adversarial
+adaptation for VLMs. Different from LoRA, we improve the efficiency and
+robustness of adversarial adaptation by designing a novel reparameterizing
+method based on parameter clustering and parameter alignment. In addition, an
+adaptive parameter update strategy is proposed to further improve the
+robustness. With these designs, our proposed AdvLoRA alleviates both the model
+security and the resource waste problems. Extensive experiments demonstrate the
+effectiveness and efficiency of AdvLoRA.
+
+
+
+
+
+
+ + ☆ NeurCADRecon: Neural Representation for Reconstructing CAD Surfaces by + Enforcing Zero Gaussian Curvature SIGGRAPH 2024 + + +
+ Despite recent advances in reconstructing an organic model with the neural +signed distance function (SDF), the high-fidelity reconstruction of a CAD model +directly from low-quality unoriented point clouds remains a significant +challenge. In this paper, we address this challenge based on the prior +observation that the surface of a CAD model is generally composed of piecewise +surface patches, each approximately developable even around the feature line. +Our approach, named NeurCADRecon, is self-supervised, and its loss includes a +developability term to encourage the Gaussian curvature toward 0 while ensuring +fidelity to the input points. Noticing that the Gaussian curvature is non-zero +at tip points, we introduce a double-trough curve to tolerate the existence of +these tip points. Furthermore, we develop a dynamic sampling strategy to deal +with situations where the given points are incomplete or too sparse. Since our +resulting neural SDFs can clearly manifest sharp feature points/lines, one can +easily extract the feature-aligned triangle mesh from the SDF and then +decompose it into smooth surface patches, greatly reducing the difficulty of +recovering the parametric CAD design. A comprehensive comparison with existing +state-of-the-art methods shows the significant advantage of our approach in +reconstructing faithful CAD shapes. + +
+
+ comment: ACM Transactions on Graphics (SIGGRAPH 2024) +
+
+
+
+
+ + ☆ Efficient and Concise Explanations for Object Detection with + Gaussian-Class Activation Mapping Explainer + + +
+ To address the challenges of providing quick and plausible explanations in +Explainable AI (XAI) for object detection models, we introduce the Gaussian +Class Activation Mapping Explainer (G-CAME). Our method efficiently generates +concise saliency maps by utilizing activation maps from selected layers and +applying a Gaussian kernel to emphasize critical image regions for the +predicted object. Compared with other Region-based approaches, G-CAME +significantly reduces explanation time to 0.5 seconds without compromising the +quality. Our evaluation of G-CAME, using Faster-RCNN and YOLOX on the MS-COCO +2017 dataset, demonstrates its ability to offer highly plausible and faithful +explanations, especially in reducing the bias on tiny object detection. + +
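+ The core Gaussian-weighting step can be pictured with the short sketch below;
+ the activation map source, the centre, and the sigma value are assumptions for
+ illustration rather than the method's exact configuration.
+
+ ```python
+ import numpy as np
+
+ def gaussian_weighted_cam(activation_map: np.ndarray, center_xy, sigma: float = 20.0) -> np.ndarray:
+     """activation_map: (H, W) class activation map for one detection. Multiplies
+     it by a 2D Gaussian centred on the predicted object centre so that only the
+     region around that object contributes to the saliency explanation."""
+     h, w = activation_map.shape
+     ys, xs = np.mgrid[0:h, 0:w]
+     cx, cy = center_xy
+     gauss = np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * sigma ** 2))
+     weighted = activation_map * gauss
+     return weighted / (weighted.max() + 1e-8)   # normalise for visualisation
+
+ saliency = np.random.rand(120, 160)             # toy activation map
+ explanation = gaussian_weighted_cam(saliency, center_xy=(80, 60), sigma=15.0)
+ print(explanation.shape, float(explanation.max()))
+ ```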
+
+ comment: Canadian AI 2024 +
+
+
+
+
+ + ☆ AMMUNet: Multi-Scale Attention Map Merging for Remote Sensing Image + Segmentation + + +
+ The advancement of deep learning has driven notable progress in remote
+sensing semantic segmentation. Attention mechanisms, while enabling global
+modeling and utilizing contextual information, face the challenges of high
+computational costs and window-based operations that weaken the capture of
+long-range dependencies, hindering their effectiveness for remote sensing image
+processing. In this letter, we propose AMMUNet, a UNet-based framework that
+employs multi-scale attention map merging, comprising two key innovations: the
+granular multi-head self-attention (GMSA) module and the attention map merging
+mechanism (AMMM). GMSA efficiently acquires global information while
+substantially mitigating computational costs in contrast to the global
+multi-head self-attention mechanism. This is accomplished through the strategic
+utilization of dimension correspondence to align granularity and the reduction
+of relative position bias parameters, thereby optimizing computational
+efficiency. The proposed AMMM effectively combines multi-scale attention maps
+into a unified representation using a fixed mask template, enabling the
+modeling of a global attention mechanism. Experimental evaluations highlight
+the superior performance of our approach, achieving remarkable mean
+intersection over union (mIoU) scores of 75.48% on the challenging Vaihingen
+dataset and an exceptional 77.90% on the Potsdam dataset, demonstrating the
+superiority of our method in precise remote sensing semantic segmentation.
+Codes are available at https://github.com/interpretty/AMMUNet.
+
+
+
+
+
+
+ + ☆ HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual + Grounding + + +
+ Visual grounding, which aims to ground a visual region via natural language, +is a task that heavily relies on cross-modal alignment. Existing works utilized +uni-modal pre-trained models to transfer visual/linguistic knowledge separately +while ignoring the multimodal corresponding information. Motivated by recent +advancements in contrastive language-image pre-training and low-rank adaptation +(LoRA) methods, we aim to solve the grounding task based on multimodal +pre-training. However, there exists significant task gaps between pre-training +and grounding. Therefore, to address these gaps, we propose a concise and +efficient hierarchical multimodal fine-grained modulation framework, namely +HiVG. Specifically, HiVG consists of a multi-layer adaptive cross-modal bridge +and a hierarchical multimodal low-rank adaptation (Hi LoRA) paradigm. The +cross-modal bridge can address the inconsistency between visual features and +those required for grounding, and establish a connection between multi-level +visual and text features. Hi LoRA prevents the accumulation of perceptual +errors by adapting the cross-modal features from shallow to deep layers in a +hierarchical manner. Experimental results on five datasets demonstrate the +effectiveness of our approach and showcase the significant grounding +capabilities as well as promising energy efficiency advantages. The project +page: https://github.com/linhuixiao/HiVG. + +
+
+ comment: The project page: https://github.com/linhuixiao/HiVG +
+
+
+
+
+ + ☆ SSVT: Self-Supervised Vision Transformer For Eye Disease Diagnosis Based + On Fundus Images + + +
+ Machine learning-based fundus image diagnosis technologies trigger worldwide
+interest owing to their benefits such as reducing the demand on medical
+resources and providing objective evaluation results. However, current methods
+are commonly based on supervised learning, which imposes a heavy workload on
+biomedical staff and hence limits the expansion of effective databases. To
+address this issue, in this article, we established a label-free method, named
+SSVT, which can automatically analyze unlabeled fundus images and achieve a
+high evaluation accuracy of 97.0% on four main eye diseases, based on six
+public datasets and two datasets collected by Beijing Tongren Hospital. The
+promising results showcased the effectiveness of the proposed unsupervised
+learning method, and the strong application potential in biomedical resource
+shortage regions to improve global eye health.
+
+
+
+ comment: ISBI 2024 +
+
+
+
+
+ + ☆ HybridFlow: Infusing Continuity into Masked Codebook for Extreme + Low-Bitrate Image Compression + + +
+ This paper investigates the challenging problem of learned image compression
+(LIC) with extremely low bitrates. Previous LIC methods based on transmitting
+quantized continuous features often yield blurry and noisy reconstruction due
+to the severe quantization loss. Previous LIC methods based on learned
+codebooks that discretize the visual space, in contrast, usually give
+poor-fidelity reconstruction due to the insufficient representation power of
+limited codewords in capturing faithful details. We propose a novel dual-stream
+framework, HybridFlow, which combines the continuous-feature-based and
+codebook-based streams to achieve both high perceptual quality and high
+fidelity under extremely low bitrates. The codebook-based stream benefits from
+the high-quality learned codebook priors to provide high quality and clarity in
+reconstructed images. The continuous feature stream focuses on maintaining
+fidelity details. To achieve ultra-low bitrates, a masked token-based
+transformer is further proposed, where we only transmit a masked portion of
+codeword indices and recover the missing indices through token generation
+guided by information from the continuous feature stream. We also develop a
+bridging correction network to merge the two streams in pixel decoding for
+final image reconstruction, where the continuous stream features rectify biases
+of the codebook-based pixel decoder to impose reconstructed fidelity details.
+Experimental results demonstrate superior performance across several datasets
+under extremely low bitrates, compared with existing single-stream
+codebook-based or continuous-feature-based LIC methods.
+
+
+
+
+
+
+ + ☆ Movie101v2: Improved Movie Narration Benchmark + + +
+ Automatic movie narration targets at creating video-aligned plot descriptions +to assist visually impaired audiences. It differs from standard video +captioning in that it requires not only describing key visual details but also +inferring the plots developed across multiple movie shots, thus posing unique +and ongoing challenges. To advance the development of automatic movie narrating +systems, we first revisit the limitations of existing datasets and develop a +large-scale, bilingual movie narration dataset, Movie101v2. Second, taking into +account the essential difficulties in achieving applicable movie narration, we +break the long-term goal into three progressive stages and tentatively focus on +the initial stages featuring understanding within individual clips. We also +introduce a new narration assessment to align with our staged task goals. +Third, using our new dataset, we baseline several leading large vision-language +models, including GPT-4V, and conduct in-depth investigations into the +challenges current models face for movie narration generation. Our findings +reveal that achieving applicable movie narration generation is a fascinating +goal that requires thorough research. + +
+
+
+
+
+ + ☆ Generating Daylight-driven Architectural Design via Diffusion Models + + +
+ In recent years, the rapid development of large-scale models has opened up new
+possibilities for interdisciplinary fields such as architecture. In this paper,
+we present a novel daylight-driven AI-aided architectural design method.
+Firstly, we formulate a method for generating massing models, quickly producing
+architectural massing models from random parameters. Subsequently, we
+integrate a daylight-driven facade design strategy, accurately determining
+window layouts and applying them to the massing models. Finally, we seamlessly
+combine a large-scale language model with a text-to-image model, enhancing the
+efficiency of generating visual architectural design renderings. Experimental
+results demonstrate that our approach supports architects' creative
+inspirations and pioneers novel avenues for architectural design development.
+Project page: https://zrealli.github.io/DDADesign/.
+
+
+
+ comment: Project page: https://zrealli.github.io/DDADesign/ +
+
+
+
+
+ + ☆ Hyperspectral Anomaly Detection with Self-Supervised Anomaly Prior + + +
+ The majority of existing hyperspectral anomaly detection (HAD) methods use +the low-rank representation (LRR) model to separate the background and anomaly +components, where the anomaly component is optimized by handcrafted sparse +priors (e.g., $\ell_{2,1}$-norm). However, this may not be ideal since they +overlook the spatial structure present in anomalies and make the detection +result largely dependent on manually set sparsity. To tackle these problems, we +redefine the optimization criterion for the anomaly component in the LRR model +with a self-supervised network called self-supervised anomaly prior (SAP). This +prior is obtained by the pretext task of self-supervised learning, which is +customized to learn the characteristics of hyperspectral anomalies. +Specifically, this pretext task is a classification task to distinguish the +original hyperspectral image (HSI) and the pseudo-anomaly HSI, where the +pseudo-anomaly is generated from the original HSI and designed as a prism with +arbitrary polygon bases and arbitrary spectral bands. In addition, a +dual-purified strategy is proposed to provide a more refined background +representation with an enriched background dictionary, facilitating the +separation of anomalies from complex backgrounds. Extensive experiments on +various hyperspectral datasets demonstrate that the proposed SAP offers a more +accurate and interpretable solution than other advanced HAD methods. + +
+
+
+
+
+ + ☆ SEGSRNet for Stereo-Endoscopic Image Super-Resolution and Surgical + Instrument Segmentation + + +
+ SEGSRNet addresses the challenge of precisely identifying surgical
+instruments in low-resolution stereo endoscopic images, a common issue in
+medical imaging and robotic surgery. Our innovative framework enhances image
+clarity and segmentation accuracy by applying state-of-the-art super-resolution
+techniques before segmentation. This ensures higher-quality inputs for more
+precise segmentation. SEGSRNet combines advanced feature extraction and
+attention mechanisms with spatial processing to sharpen image details, which is
+significant for accurate tool identification in medical images. Our proposed
+model outperforms current models on metrics including Dice, IoU, PSNR, and
+SSIM, producing clearer and more accurate images for stereo endoscopic surgical
+imaging. SEGSRNet can provide high image resolution and precise segmentation,
+which can significantly enhance surgical accuracy and patient care outcomes.
+
+
+
+ comment: Paper accepted for Presentation in 46th Annual International + Conference of the IEEE Engineering in Medicine and Biology Society (EMBS), + Orlando, Florida, USA +
+
+
+
+
+ + ☆ Collaborative Visual Place Recognition through Federated Learning CVPR + + +
+ Visual Place Recognition (VPR) aims to estimate the location of an image by +treating it as a retrieval problem. VPR uses a database of geo-tagged images +and leverages deep neural networks to extract a global representation, called +descriptor, from each image. While the training data for VPR models often +originates from diverse, geographically scattered sources (geo-tagged images), +the training process itself is typically assumed to be centralized. This +research revisits the task of VPR through the lens of Federated Learning (FL), +addressing several key challenges associated with this adaptation. VPR data +inherently lacks well-defined classes, and models are typically trained using +contrastive learning, which necessitates a data mining step on a centralized +database. Additionally, client devices in federated systems can be highly +heterogeneous in terms of their processing capabilities. The proposed FedVPR +framework not only presents a novel approach for VPR but also introduces a new, +challenging, and realistic task for FL research, paving the way to other image +retrieval tasks in FL. + +
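+ The federated training loop implied above aggregates client updates on a
+ server; the sketch below shows plain FedAvg weighted by client dataset size,
+ which is a generic baseline and not the specific FedVPR aggregation rule.
+
+ ```python
+ import copy
+ import torch
+
+ def fedavg(client_states, client_sizes):
+     """Weighted average of client model state_dicts, weighting each client by
+     the number of geo-tagged images it holds. All state_dicts share keys."""
+     total = float(sum(client_sizes))
+     avg = copy.deepcopy(client_states[0])
+     for key in avg:
+         avg[key] = sum(state[key].float() * (n / total)
+                        for state, n in zip(client_states, client_sizes))
+     return avg
+
+ # Toy usage with two tiny stand-ins for VPR descriptor heads.
+ net_a, net_b = torch.nn.Linear(8, 4), torch.nn.Linear(8, 4)
+ global_state = fedavg([net_a.state_dict(), net_b.state_dict()], client_sizes=[1200, 300])
+ net_a.load_state_dict(global_state)
+ ```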
+
+ comment: 13 pages, 7 figures, CVPR - The 3rd International Workshop on + Federated Learning for Computer Vision (FedVision-2024) +
+
+
+
+
+ + ☆ Pixel is a Barrier: Diffusion Models Are More Adversarially Robust Than + We Think + + +
+ Adversarial examples for diffusion models are widely used as solutions for
+safety concerns. By adding adversarial perturbations to personal images,
+attackers cannot edit or imitate them easily. However, it is essential to note
+that all these protections target latent diffusion models (LDMs), while
+adversarial examples for diffusion models in the pixel space (PDMs) are largely
+overlooked. This may mislead us to think that the diffusion models are
+vulnerable to adversarial attacks like most deep models. In this paper, we show
+the novel finding that, even though gradient-based white-box attacks can be
+used to attack LDMs, they fail to attack PDMs. This finding is supported by
+extensive experiments with a wide range of attack methods on various PDMs and
+LDMs with different model structures, which means diffusion models are indeed
+much more robust against adversarial attacks. We also find that PDMs can be
+used as an off-the-shelf purifier to effectively remove the adversarial
+patterns that were generated on LDMs to protect the images, which means that
+most protection methods nowadays, to some extent, cannot protect our images
+from malicious attacks. We hope that our insights will inspire the community to
+rethink the adversarial samples for diffusion models as protection methods and
+move forward to more effective protection. Codes are available at
+https://github.com/xavihart/PDM-Pure.
+
+
+
+
+
+
+ + ☆ STAT: Towards Generalizable Temporal Action Localization + + +
+ Weakly-supervised temporal action localization (WTAL) aims to recognize and
+localize action instances with only video-level labels. Despite the significant
+progress, existing methods suffer from severe performance degradation when
+transferring to different distributions and thus can hardly adapt to real-world
+scenarios. To address this problem, we propose the Generalizable Temporal
+Action Localization task (GTAL), which focuses on improving the
+generalizability of action localization methods. We observed that the
+performance decline can be primarily attributed to the lack of generalizability
+to different action scales. To this end, we propose STAT
+(Self-supervised Temporal Adaptive Teacher), which leverages a teacher-student
+structure for iterative refinement. Our STAT features a refinement module and
+an alignment module. The former iteratively refines the model's output by
+leveraging contextual information and helps adapt to the target scale. The
+latter improves the refinement process by promoting a consensus between the
+student and teacher models. We conduct extensive experiments on three datasets,
+THUMOS14, ActivityNet1.2, and HACS, and the results show that our method
+significantly improves over the baseline methods under the cross-distribution
+evaluation setting, even approaching the same-distribution evaluation
+performance.
+
+
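+ Teacher-student consensus of this kind is typically maintained with an
+ exponential-moving-average teacher; the sketch below shows that generic update,
+ with the momentum value being an assumption rather than the paper's setting.
+
+ ```python
+ import torch
+
+ @torch.no_grad()
+ def update_ema_teacher(student: torch.nn.Module, teacher: torch.nn.Module,
+                        momentum: float = 0.999) -> None:
+     """Update the teacher as an exponential moving average of the student so it
+     provides slowly evolving targets for the refinement/alignment steps."""
+     for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+         t_param.mul_(momentum).add_(s_param, alpha=1.0 - momentum)
+
+ student = torch.nn.Linear(16, 2)
+ teacher = torch.nn.Linear(16, 2)
+ teacher.load_state_dict(student.state_dict())   # initialise teacher from student
+ update_ema_teacher(student, teacher)
+ ```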
+
+ comment: 14 pages, LaTeX; +
+
+
+
+
+ + ☆ FakeBench: Uncover the Achilles' Heels of Fake Images with Large + Multimodal Models + + +
+ Recently, fake images generated by artificial intelligence (AI) models have
+become indistinguishable from the real, posing new challenges for fake image
+detection models. As a result, simple binary judgments of real or fake seem
+less convincing and credible due to the absence of human-understandable
+explanations. Fortunately, Large Multimodal Models (LMMs) bring possibilities
+to materialize the judgment process while their performance remains
+undetermined. Therefore, we propose FakeBench, the first-of-its-kind benchmark
+towards transparent defake, consisting of fake images with human language
+descriptions on forgery signs. FakeBench probes two open questions about LMMs:
+(1) can LMMs distinguish fake images generated by AI, and (2) how do LMMs
+distinguish fake images? Specifically, we construct the FakeClass dataset with
+6k diverse-sourced fake and real images, each equipped with a Question&Answer
+pair concerning the authenticity of images, which are utilized to benchmark the
+detection ability. To examine the reasoning and interpretation abilities of
+LMMs, we present the FakeClue dataset, consisting of 15k pieces of descriptions
+on the telltale clues revealing the falsification of fake images. Besides, we
+construct the FakeQA to measure the LMMs' open-question answering ability on
+fine-grained authenticity-relevant aspects. Our experimental results discover
+that current LMMs possess moderate identification ability, preliminary
+interpretation and reasoning ability, and passable open-question answering
+ability for image defake. The FakeBench will be made publicly available soon.
+
+
+
+
+
+
+ + ☆ PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt + Condition CVPR-2024 + + +
+ The development of Large Language Models (LLM) and Diffusion Models brings
+the boom of Artificial Intelligence Generated Content (AIGC). It is essential
+to build an effective quality assessment framework to provide a quantifiable
+evaluation of different images or videos based on the AIGC technologies. The
+content generated by AIGC methods is driven by the crafted prompts. Therefore,
+it is intuitive that the prompts can also serve as the foundation of the AIGC
+quality assessment. This study proposes an effective AIGC quality assessment
+(QA) framework. First, we propose a hybrid prompt encoding method based on a
+dual-source CLIP (Contrastive Language-Image Pre-Training) text encoder to
+understand and respond to the prompt conditions. Second, we propose an
+ensemble-based feature mixer module to effectively blend the adapted prompt and
+vision features. The empirical study is conducted on two datasets: AIGIQA-20K
+(AI-Generated Image Quality Assessment database) and T2VQA-DB (Text-to-Video
+Quality Assessment DataBase), which validates the effectiveness of our proposed
+method: Prompt Condition Quality Assessment (PCQA). Our proposed simple and
+feasible framework may promote research development in the multimodal
+generation field.
+
+
+
+ comment: Published in CVPR-2024's NTIRE: New Trends in Image Restoration and + Enhancement workshop and challenges +
+
+
+
+
+ + ☆ PoseINN: Realtime Visual-based Pose Regression and Localization with + Invertible Neural Networks + + +
+ Estimating ego-pose from cameras is an important problem in robotics with +applications ranging from mobile robotics to augmented reality. While SOTA +models are becoming increasingly accurate, they can still be unwieldy due to +high computational costs. In this paper, we propose to solve the problem by +using invertible neural networks (INN) to find the mapping between the latent +space of images and poses for a given scene. Our model achieves similar +performance to the SOTA while being faster to train and only requiring offline +rendering of low-resolution synthetic data. By using normalizing flows, the +proposed method also provides uncertainty estimation for the output. We also +demonstrated the efficiency of this method by deploying the model on a mobile +robot. + +
+
+
+
+
+ + ☆ Wills Aligner: A Robust Multi-Subject Brain Representation Learner + + +
+ Decoding visual information from human brain activity has seen remarkable
+advancements in recent research. However, due to the significant variability in
+cortical parcellation and cognition patterns across subjects, current
+approaches personalize deep models for each subject, constraining the
+practicality of this technology in real-world contexts. To tackle these
+challenges, we introduce Wills Aligner, a robust multi-subject brain
+representation learner. Our Wills Aligner initially aligns different subjects'
+brains at the anatomical level. Subsequently, it incorporates a mixture of
+brain experts to learn individual cognition patterns. Additionally, it
+decouples the multi-subject learning task into a two-stage training, propelling
+the deep model and its plugin network to learn inter-subject commonality
+knowledge and various cognition patterns, respectively. Wills Aligner enables
+us to overcome anatomical differences and to efficiently leverage a single
+model for multi-subject brain representation learning. We meticulously evaluate
+the performance of our approach across coarse-grained and fine-grained visual
+decoding tasks. The experimental results demonstrate that our Wills Aligner
+achieves state-of-the-art performance.
+
+
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Beyond Score Changes: Adversarial Attack on No-Reference Image Quality + Assessment from Two Perspectives + + +
+ Deep neural networks have demonstrated impressive success in No-Reference +Image Quality Assessment (NR-IQA). However, recent researches highlight the +vulnerability of NR-IQA models to subtle adversarial perturbations, leading to +inconsistencies between model predictions and subjective ratings. Current +adversarial attacks, however, focus on perturbing predicted scores of +individual images, neglecting the crucial aspect of inter-score correlation +relationships within an entire image set. Meanwhile, it is important to note +that the correlation, like ranking correlation, plays a significant role in +NR-IQA tasks. To comprehensively explore the robustness of NR-IQA models, we +introduce a new framework of correlation-error-based attacks that perturb both +the correlation within an image set and score changes on individual images. Our +research primarily focuses on ranking-related correlation metrics like +Spearman's Rank-Order Correlation Coefficient (SROCC) and prediction +error-related metrics like Mean Squared Error (MSE). As an instantiation, we +propose a practical two-stage SROCC-MSE-Attack (SMA) that initially optimizes +target attack scores for the entire image set and then generates adversarial +examples guided by these scores. Experimental results demonstrate that our SMA +method not only significantly disrupts the SROCC to negative values but also +maintains a considerable change in the scores of individual images. Meanwhile, +it exhibits state-of-the-art performance across metrics with different +categories. Our method provides a new perspective on the robustness of NR-IQA +models. + +
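+ As a toy illustration of the first stage (choosing target scores that disrupt
+ the ranking of an image set while bounding per-image change), the sketch below
+ simply mirrors the ranking and clips the change; the real method optimizes the
+ targets, so treat this only as a rough stand-in.
+
+ ```python
+ import numpy as np
+ from scipy.stats import spearmanr
+
+ def inverted_targets(scores: np.ndarray, max_change: float) -> np.ndarray:
+     """Give each image the score of its mirror in the sorted order (reversing
+     the ranking), then clip so no score moves by more than max_change."""
+     order = np.argsort(scores)
+     targets = np.empty_like(scores)
+     targets[order] = scores[order[::-1]]
+     return np.clip(targets, scores - max_change, scores + max_change)
+
+ rng = np.random.default_rng(0)
+ orig = rng.uniform(20, 80, size=50)              # predicted quality scores
+ tgt = inverted_targets(orig, max_change=10.0)
+ print("SROCC original vs. target:", spearmanr(orig, tgt).correlation)
+ ```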
+
+ comment: Submitted to a conference +
+
+
+
+
+ + ☆ Multi-feature Reconstruction Network using Crossed-mask Restoration for + Unsupervised Anomaly Detection + + +
+ Unsupervised anomaly detection using only normal samples is of great
+significance for quality inspection in industrial manufacturing. Although
+existing reconstruction-based methods have achieved promising results, they
+still face two problems: poorly distinguishable information in the image
+reconstruction and unwanted regeneration of anomalies caused by the model's
+over-generalization ability. To overcome the above issues, we convert the image
+reconstruction into a combination of parallel feature restorations and propose
+a multi-feature reconstruction network, MFRNet, using crossed-mask restoration
+in this paper. Specifically, a multi-scale feature aggregator is first
+developed to generate more discriminative hierarchical representations of the
+input images from a pre-trained model. Subsequently, a crossed-mask generator
+is adopted to randomly cover the extracted feature map, followed by a
+restoration network based on the transformer structure for high-quality repair
+of the missing regions. Finally, a hybrid loss is employed to guide model
+training and anomaly estimation, which takes both pixel and structural
+similarity into consideration. Extensive experiments show that our method is
+highly competitive with or significantly outperforms other state-of-the-art
+methods on four publicly available datasets and one self-made dataset.
+
+
+
+
+
+
+ + ☆ StrideNET: Swin Transformer for Terrain Recognition with Dynamic + Roughness Extraction + + +
+ Advancements in deep learning are revolutionizing the classification of +remote-sensing images. Transformer-based architectures, utilizing +self-attention mechanisms, have emerged as alternatives to conventional +convolution methods, enabling the capture of long-range dependencies along with +global relationships in the image. Motivated by these advancements, this paper +presents StrideNET, a novel dual-branch architecture designed for terrain +recognition and implicit properties estimation. The terrain recognition branch +utilizes the Swin Transformer, leveraging its hierarchical representation and +low computational cost to efficiently capture both local and global features. +The terrain properties branch focuses on the extraction of surface properties +such as roughness and slipperiness using a statistical texture analysis method. +By computing surface terrain properties, an enhanced environmental perception +can be obtained. The StrideNET model is trained on a dataset comprising four +target terrain classes: Grassy, Marshy, Sandy, and Rocky. StrideNET attains +competitive performance compared to contemporary methods. The implications of +this work extend to various applications, including environmental monitoring, +land use and land cover (LULC) classification, disaster response, precision +agriculture, and much more. + +
+
+
+
+
+ + ☆ Multi-Cell Decoder and Mutual Learning for Table Structure and Character + Recognition ICDAR 2024 + + +
+ Extracting table contents from documents such as scientific papers and +financial reports and converting them into a format that can be processed by +large language models is an important task in knowledge information processing. +End-to-end approaches, which recognize not only table structure but also cell +contents, achieved performance comparable to state-of-the-art models using +external character recognition systems, and have potential for further +improvements. In addition, these models can now recognize long tables with +hundreds of cells by introducing local attention. However, the models recognize +table structure in one direction from the header to the footer, and cell +content recognition is performed independently for each cell, so there is no +opportunity to retrieve useful information from the neighbor cells. In this +paper, we propose a multi-cell content decoder and bidirectional mutual +learning mechanism to improve the end-to-end approach. The effectiveness is +demonstrated on two large datasets, and the experimental results show +comparable performance to state-of-the-art models, even for long tables with +large numbers of cells. + +
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ FilterPrompt: Guiding Image Transfer in Diffusion Models + + +
+ In controllable generation tasks, flexibly manipulating the generated images +to attain a desired appearance or structure based on a single input image cue +remains a critical and longstanding challenge. Achieving this requires the +effective decoupling of key attributes within the input image data, aiming to +get representations accurately. Previous research has predominantly +concentrated on disentangling image attributes within feature space. However, +the complex distribution present in real-world data often makes the application +of such decoupling algorithms to other datasets challenging. Moreover, the +granularity of control over feature encoding frequently fails to meet specific +task requirements. Upon scrutinizing the characteristics of various generative +models, we have observed that the input sensitivity and dynamic evolution +properties of the diffusion model can be effectively fused with the explicit +decomposition operation in pixel space. This integration enables the image +processing operations performed in pixel space for a specific feature +distribution of the input image, and can achieve the desired control effect in +the generated results. Therefore, we propose FilterPrompt, an approach to +enhance the model control effect. It can be universally applied to any +diffusion model, allowing users to adjust the representation of specific image +features in accordance with task requirements, thereby facilitating more +precise and controllable generation outcomes. In particular, our designed +experiments demonstrate that the FilterPrompt optimizes feature correlation, +mitigates content conflicts during the generation process, and enhances the +model's control capability. + +
+
+
+
+
+ + ☆ 3D-Convolution Guided Spectral-Spatial Transformer for Hyperspectral + Image Classification + + +
+ In recent years, Vision Transformers (ViTs) have shown promising +classification performance over Convolutional Neural Networks (CNNs) due to +their self-attention mechanism. Many researchers have incorporated ViTs for +Hyperspectral Image (HSI) classification. HSIs are characterised by narrow +contiguous spectral bands, providing rich spectral data. Although ViTs excel +with sequential data, they cannot extract spectral-spatial information like +CNNs. Furthermore, to have high classification performance, there should be a +strong interaction between the HSI token and the class (CLS) token. To solve +these issues, we propose a 3D-Convolution guided Spectral-Spatial Transformer +(3D-ConvSST) for HSI classification that utilizes a 3D-Convolution Guided +Residual Module (CGRM) in-between encoders to "fuse" the local spatial and +spectral information and to enhance the feature propagation. Furthermore, we +forego the class token and instead apply Global Average Pooling, which +effectively encodes more discriminative and pertinent high-level features for +classification. Extensive experiments have been conducted on three public HSI +datasets to show the superiority of the proposed model over state-of-the-art +traditional, convolutional, and Transformer models. The code is available at +https://github.com/ShyamVarahagiri/3D-ConvSST. + +
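A minimal sketch of the class-token-free head described above, pooling the encoder's output tokens with Global Average Pooling before a linear classifier; the dimensions are illustrative only.

```python
import torch
import torch.nn as nn

class GAPClassifierHead(nn.Module):
    def __init__(self, embed_dim: int, num_classes: int):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch, num_tokens, embed_dim) -- no CLS token is used
        pooled = self.norm(tokens).mean(dim=1)   # global average pooling over tokens
        return self.fc(pooled)

head = GAPClassifierHead(embed_dim=64, num_classes=16)
print(head(torch.randn(8, 49, 64)).shape)  # torch.Size([8, 16])
```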
+
+ comment: Accepted in IEEE Conference on Artificial Intelligence, 2024 +
+
+
+
+
+ + ☆ Beyond Pixel-Wise Supervision for Medical Image Segmentation: From + Traditional Models to Foundation Models + + +
+ Medical image segmentation plays an important role in many image-guided clinical approaches. However, existing segmentation algorithms mostly rely on the availability of fully annotated images with pixel-wise annotations for training, which can be both labor-intensive and expertise-demanding, especially in the medical imaging domain where only experts can provide reliable and accurate annotations. To alleviate this challenge, there has been a growing focus on developing segmentation methods that can train deep models with weak annotations, such as image-level labels, bounding boxes, scribbles, and points. The emergence of vision foundation models, notably the Segment Anything Model (SAM), has introduced innovative capabilities for segmentation tasks using weak annotations, with promptable segmentation enabled by large-scale pre-training. Adopting foundation models together with traditional learning methods has gained increasing interest in the research community and shown potential for real-world applications. In this paper, we present a comprehensive survey of recent progress on annotation-efficient learning for medical image segmentation utilizing weak annotations, both before and in the era of foundation models. Furthermore, we analyze and discuss several challenges of existing approaches, which we believe will provide valuable guidance for shaping the trajectory of foundation models to further advance the field of medical image segmentation. + +
+
+
+
+
+ + ☆ PAFedFV: Personalized and Asynchronous Federated Learning for Finger + Vein Recognition + + +
+ With the increasing emphasis on user privacy protection, biometric recognition based on federated learning has become the latest research hotspot. However, traditional federated learning methods cannot be directly applied to finger vein recognition due to the heterogeneity of the data and the open-set verification setting, so only a few application cases have been proposed, and these methods still have two drawbacks. (1) A uniform model results in poor performance on some clients, as finger vein data are highly heterogeneous and non-Independently Identically Distributed (non-IID). (2) On individual clients, a large amount of time is underutilized, such as the time spent waiting for the model to be returned from the server. To address these problems, this paper proposes a Personalized and Asynchronous Federated Learning for Finger Vein Recognition (PAFedFV) framework. PAFedFV designs a personalized model aggregation method to handle the heterogeneity among non-IID data. Meanwhile, it employs an asynchronous training module that allows clients to make use of their waiting time. Finally, extensive experiments on six finger vein datasets are conducted. Based on the experimental results, the impact of non-IID finger vein data on the performance of federated learning is analyzed, and the superiority of PAFedFV in accuracy and robustness is demonstrated. + +
+
+
+
+
+ + ☆ Vim4Path: Self-Supervised Vision Mamba for Histopathology Images CVPR2023 + + +
+ Representation learning from Gigapixel Whole Slide Images (WSI) poses a significant challenge in computational pathology due to the complicated nature of tissue structures and the scarcity of labeled data. Multiple-instance learning (MIL) methods address this challenge by leveraging image patches to classify slides, relying on feature encoders pretrained with Self-Supervised Learning (SSL) approaches. The performance of both SSL and MIL methods depends on the architecture of the feature encoder. This paper proposes leveraging the Vision Mamba (Vim) architecture, inspired by state space models, within the DINO framework for representation learning in computational pathology. We evaluate the performance of Vim against Vision Transformers (ViT) on the Camelyon16 dataset for both patch-level and slide-level classification. Our findings highlight Vim's enhanced performance compared to ViT, particularly at smaller scales, where Vim achieves an 8.21 increase in ROC AUC for models of similar size. An explainability analysis further highlights Vim's capabilities, revealing that, unlike ViT, Vim emulates the pathologist's workflow. This alignment with human expert analysis highlights Vim's potential in practical diagnostic settings and contributes significantly to developing effective representation-learning algorithms in computational pathology. We release the code and pretrained weights at https://github.com/AtlasAnalyticsLab/Vim4Path. + +
+
+ comment: Accepted in CVPR2023 (9th Workshop on Computer Vision for Microscopy + Image Analysis) +
+
+
+
+
+ + ☆ Optimizing Contrail Detection: A Deep Learning Approach with + EfficientNet-b4 Encoding + + +
+ In the pursuit of environmental sustainability, the aviation industry faces the challenge of minimizing its ecological footprint. Among the key solutions is contrail avoidance, targeting the linear ice-crystal clouds produced by aircraft exhaust. These contrails exacerbate global warming by trapping atmospheric heat, necessitating precise segmentation and comprehensive analysis of contrail images to gauge their environmental impact. However, this segmentation task is complex due to the varying appearances of contrails under different atmospheric conditions and potential misalignment issues in predictive modeling. This paper presents an innovative deep-learning approach utilizing an EfficientNet-b4 encoder for feature extraction, seamlessly integrating misalignment correction, soft labeling, and pseudo-labeling techniques to enhance the accuracy and efficiency of contrail detection in satellite imagery. The proposed methodology aims to redefine contrail image analysis and contribute to the objectives of sustainable aviation by providing a robust framework for precise contrail detection and analysis in satellite imagery, thus aiding in the mitigation of aviation's environmental impact. + +
+
+
+
+
+ + ♻ ☆ ELODI: Ensemble Logit Difference Inhibition for Positive-Congruent + Training + + +
+ Negative flips are errors introduced in a classification system when a legacy +model is updated. Existing methods to reduce the negative flip rate (NFR) +either do so at the expense of overall accuracy by forcing a new model to +imitate the old models, or use ensembles, which multiply inference cost +prohibitively. We analyze the role of ensembles in reducing NFR and observe +that they remove negative flips that are typically not close to the decision +boundary, but often exhibit large deviations in the distance among their +logits. Based on the observation, we present a method, called Ensemble Logit +Difference Inhibition (ELODI), to train a classification system that achieves +paragon performance in both error rate and NFR, at the inference cost of a +single model. The method distills a homogeneous ensemble to a single student +model which is used to update the classification system. ELODI also introduces +a generalized distillation objective, Logit Difference Inhibition (LDI), which +only penalizes the logit difference of a subset of classes with the highest +logit values. On multiple image classification benchmarks, model updates with +ELODI demonstrate superior accuracy retention and NFR reduction. + +
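A hedged sketch of an LDI-style objective is given below: it penalizes the squared student-teacher logit difference only on the k classes with the highest teacher (ensemble) logits. The value of k and the squared-error form are assumptions rather than the paper's exact settings.

```python
import torch

def ldi_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, k: int = 10):
    """Both tensors are (batch, num_classes); the teacher comes from the ensemble reference."""
    topk = teacher_logits.topk(k, dim=1).indices                    # classes with highest teacher logits
    diff = student_logits.gather(1, topk) - teacher_logits.gather(1, topk)
    return diff.pow(2).mean()

s = torch.randn(4, 1000)
t = torch.randn(4, 1000)
print(ldi_loss(s, t))
```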
+
+ comment: Accepted as a Regular Paper in TPAMI. Code is at + https://github.com/amazon-science/regression-constraint-model-upgrade +
+
+
+
+
+ + ♻ ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with an arm, namely legged loco-manipulation. The robot legs, while usually utilized for mobility, offer an opportunity to amplify the manipulation capabilities by conducting whole-body control. That is, the robot can control the legs and the arm at the same time to extend its workspace. We propose a framework that can conduct whole-body control autonomously from visual observations. Our approach, namely Visual Whole-Body Control (VBC), is composed of a low-level policy that uses all degrees of freedom to track the target end-effector position and a high-level policy that proposes the end-effector position based on visual inputs. We train both levels of policies in simulation and perform Sim2Real transfer for real robot deployment. We perform extensive experiments and show significant improvements over baselines in picking up diverse objects in different configurations (heights, locations, orientations) and environments. Project page: https://wholebody-b1.github.io + +
+
+ comment: Add more details. The first two authors contribute equally. Project + page: https://wholebody-b1.github.io +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Crowd Counting with Contextual Modeling: Facilitating + Holistic Understanding of Crowd Scenes + + +
+ To alleviate the heavy annotation burden of training a reliable crowd counting model, and thus make the model more practical and accurate by allowing it to benefit from more data, this paper presents a new semi-supervised method based on the mean teacher framework. When labeled data are scarce, the model is prone to overfitting local patches, and in such contexts the conventional approach of solely improving the accuracy of local patch predictions through unlabeled data proves inadequate. Consequently, we propose a more nuanced approach: fostering the model's intrinsic 'subitizing' capability. This ability allows the model to accurately estimate the count in regions by leveraging its understanding of the crowd scene, mirroring the human cognitive process. To achieve this goal, we apply masking to unlabeled data, guiding the model to make predictions for the masked patches based on holistic cues. Furthermore, to aid feature learning, we incorporate a fine-grained density classification task. Our method is general and applicable to most existing crowd counting methods, as it does not impose strict structural or loss constraints. In addition, we observe that a model trained with our framework exhibits 'subitizing'-like behavior: it accurately predicts low-density regions with only a 'glance', while incorporating local details to predict high-density regions. Our method achieves state-of-the-art performance, surpassing previous approaches by a large margin on challenging benchmarks such as ShanghaiTech A and UCF-QNRF. The code is available at: https://github.com/cha15yq/MRC-Crowd. + +
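The masking step on unlabeled images could look roughly like the sketch below, which zeroes out random patches so that predictions for those regions must come from holistic context; the patch size and mask ratio are illustrative assumptions.

```python
import torch

def mask_patches(images: torch.Tensor, patch: int = 32, ratio: float = 0.5):
    """images: (B, C, H, W) with H and W divisible by `patch`. Returns masked images and the mask."""
    b, _, h, w = images.shape
    gh, gw = h // patch, w // patch
    keep = (torch.rand(b, 1, gh, gw, device=images.device) > ratio).float()
    mask = keep.repeat_interleave(patch, dim=2).repeat_interleave(patch, dim=3)
    return images * mask, mask

x = torch.randn(2, 3, 256, 256)
masked, mask = mask_patches(x)
print(masked.shape, mask.mean().item())
```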
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect the driving performance of autonomous vehicles. Vision-based online road reconstruction promisingly captures road information in advance, but existing solutions such as monocular depth estimation and stereo matching suffer from modest performance. The recent technique of Bird's-Eye-View (BEV) perception offers immense potential for more reliable and accurate reconstruction. This paper proposes two simple yet effective models for road elevation reconstruction in BEV, named RoadBEV-mono and RoadBEV-stereo, which estimate road elevation from monocular and stereo images, respectively. The former directly fits elevation values based on voxel features queried from the image view, while the latter efficiently recognizes road elevation patterns based on a BEV volume representing the discrepancy between left and right voxel features. Insightful analyses reveal their consistency with, and differences from, the perspective view. Experiments on a real-world dataset verify the models' effectiveness and superiority: the elevation errors of RoadBEV-mono and RoadBEV-stereo reach 1.83 cm and 0.50 cm, respectively, and estimating in BEV improves monocular performance by 50%. Our models are promising for practical applications, providing valuable references for vision-based BEV perception in autonomous driving. The code is released at https://github.com/ztsrxh/RoadBEV. + +
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ♻ ☆ Revisiting Few-Shot Object Detection with Vision-Language Models + + +
+ Few-shot object detection (FSOD) benchmarks have advanced techniques for +detecting new categories with limited annotations. Existing benchmarks +repurpose well-established datasets like COCO by partitioning categories into +base and novel classes for pre-training and fine-tuning respectively. However, +these benchmarks do not reflect how FSOD is deployed in practice. Rather than +only pre-training on a small number of base categories, we argue that it is +more practical to fine-tune a foundation model (e.g., a vision-language model +(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find +that zero-shot inference from VLMs like GroundingDINO significantly outperforms +the state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models +can still be misaligned to target concepts of interest. For example, trailers +on the web may be different from trailers in the context of autonomous +vehicles. In this work, we propose Foundational FSOD, a new benchmark protocol +that evaluates detectors pre-trained on any external datasets and fine-tuned on +K-shots per target class. Further, we note that current FSOD benchmarks are +actually federated datasets containing exhaustive annotations for each category +on a subset of the data. We leverage this insight to propose simple strategies +for fine-tuning VLMs with federated losses. We demonstrate the effectiveness of +our approach on LVIS and nuImages, improving over prior work by 5.9 AP. Our +code is available at https://github.com/anishmadan23/foundational_fsod + +
+
+
+
+
+ + ♻ ☆ Unsupervised Video Domain Adaptation with Masked Pre-Training and + Collaborative Self-Training CVPR 2024 + + +
+ In this work, we tackle the problem of unsupervised domain adaptation (UDA) +for video action recognition. Our approach, which we call UNITE, uses an image +teacher model to adapt a video student model to the target domain. UNITE first +employs self-supervised pre-training to promote discriminative feature learning +on target domain videos using a teacher-guided masked distillation objective. +We then perform self-training on masked target data, using the video student +model and image teacher model together to generate improved pseudolabels for +unlabeled target videos. Our self-training process successfully leverages the +strengths of both models to achieve strong transfer performance across domains. +We evaluate our approach on multiple video domain adaptation benchmarks and +observe significant improvements upon previously reported results. + +
+
+ comment: Accepted at CVPR 2024. 13 pages, 4 figures. Approved for public + release: distribution unlimited +
+
+
+
+
+ + ♻ ☆ Pixel to Elevation: Learning to Predict Elevation Maps at Long Range + using Images for Autonomous Offroad Navigation + + +
+ Understanding terrain topology at long-range is crucial for the success of +off-road robotic missions, especially when navigating at high-speeds. LiDAR +sensors, which are currently heavily relied upon for geometric mapping, provide +sparse measurements when mapping at greater distances. To address this +challenge, we present a novel learning-based approach capable of predicting +terrain elevation maps at long-range using only onboard egocentric images in +real-time. Our proposed method is comprised of three main elements. First, a +transformer-based encoder is introduced that learns cross-view associations +between the egocentric views and prior bird-eye-view elevation map predictions. +Second, an orientation-aware positional encoding is proposed to incorporate the +3D vehicle pose information over complex unstructured terrain with multi-view +visual image features. Lastly, a history-augmented learn-able map embedding is +proposed to achieve better temporal consistency between elevation map +predictions to facilitate the downstream navigational tasks. We experimentally +validate the applicability of our proposed approach for autonomous offroad +robotic navigation in complex and unstructured terrain using real-world offroad +driving data. Furthermore, the method is qualitatively and quantitatively +compared against the current state-of-the-art methods. Extensive field +experiments demonstrate that our method surpasses baseline models in accurately +predicting terrain elevation while effectively capturing the overall terrain +topology at long-ranges. Finally, ablation studies are conducted to highlight +and understand the effect of key components of the proposed approach and +validate their suitability to improve offroad robotic navigation capabilities. + +
+
+ comment: 8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ♻ ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: Accepted version of the article, 18 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ End-to-End Temporal Action Detection with 1B Parameters Across 1000 + Frames CVPR 2024 + + +
+ Recently, temporal action detection (TAD) has seen significant performance +improvement with end-to-end training. However, due to the memory bottleneck, +only models with limited scales and limited data volumes can afford end-to-end +training, which inevitably restricts TAD performance. In this paper, we reduce +the memory consumption for end-to-end training, and manage to scale up the TAD +backbone to 1 billion parameters and the input video to 1,536 frames, leading +to significant detection performance. The key to our approach lies in our +proposed temporal-informative adapter (TIA), which is a novel lightweight +module that reduces training memory. Using TIA, we free the humongous backbone +from learning to adapt to the TAD task by only updating the parameters in TIA. +TIA also leads to better TAD representation by temporally aggregating context +from adjacent frames throughout the backbone. We evaluate our model across four +representative datasets. Owing to our efficient design, we are able to train +end-to-end on VideoMAEv2-giant and achieve 75.4% mAP on THUMOS14, being the +first end-to-end model to outperform the best feature-based methods. Code is +available at https://github.com/sming256/AdaTAD. + +
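As a rough sketch of a temporal-informative adapter, the module below adds a small trainable bottleneck with a depth-wise temporal convolution on top of frozen per-frame backbone features, so only the adapter parameters are updated; the bottleneck width and kernel size are assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class TemporalAdapter(nn.Module):
    def __init__(self, dim: int, bottleneck: int = 64, kernel: int = 3):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.temporal = nn.Conv1d(bottleneck, bottleneck, kernel,
                                  padding=kernel // 2, groups=bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, dim) -- per-frame features from the frozen backbone
        h = self.down(x).transpose(1, 2)      # (B, bottleneck, T)
        h = self.temporal(h).transpose(1, 2)  # aggregate context from adjacent frames
        return x + self.up(h)                 # residual: the backbone itself stays frozen

adapter = TemporalAdapter(dim=768)
print(adapter(torch.randn(2, 1536, 768)).shape)  # torch.Size([2, 1536, 768])
```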
+
+ comment: Accepted to CVPR 2024. Camera-Ready Version +
+
+
+
+
+ + ♻ ☆ Learning SO(3)-Invariant Semantic Correspondence via Local Shape + Transform CVPR 2024 + + +
+ Establishing accurate 3D correspondences between shapes stands as a pivotal +challenge with profound implications for computer vision and robotics. However, +existing self-supervised methods for this problem assume perfect input shape +alignment, restricting their real-world applicability. In this work, we +introduce a novel self-supervised Rotation-Invariant 3D correspondence learner +with Local Shape Transform, dubbed RIST, that learns to establish dense +correspondences between shapes even under challenging intra-class variations +and arbitrary orientations. Specifically, RIST learns to dynamically formulate +an SO(3)-invariant local shape transform for each point, which maps the +SO(3)-equivariant global shape descriptor of the input shape to a local shape +descriptor. These local shape descriptors are provided as inputs to our decoder +to facilitate point cloud self- and cross-reconstruction. Our proposed +self-supervised training pipeline encourages semantically corresponding points +from different shapes to be mapped to similar local shape descriptors, enabling +RIST to establish dense point-wise correspondences. RIST demonstrates +state-of-the-art performances on 3D part label transfer and semantic keypoint +transfer given arbitrarily rotated point cloud pairs, outperforming existing +methods by significant margins. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion + with Mamba + + +
+ Multi-modal image fusion aims to combine information from different modalities to create a single image with comprehensive information and detailed textures. However, fusion models based on convolutional neural networks encounter limitations in capturing global image features due to their focus on local convolution operations, while Transformer-based models, despite excelling at global feature modeling, face computational challenges stemming from their quadratic complexity. Recently, the Selective Structured State Space Model has exhibited significant potential for long-range dependency modeling with linear complexity, offering a promising avenue to address this dilemma. In this paper, we propose FusionMamba, a novel dynamic feature enhancement method for multimodal image fusion with Mamba. Specifically, we devise an improved, efficient Mamba model for image fusion that integrates an efficient visual state space model with dynamic convolution and channel attention. This refined model not only retains the performance and global modeling capability of Mamba but also reduces channel redundancy while strengthening local feature representation. Additionally, we devise a dynamic feature fusion module (DFFM) comprising two dynamic feature enhancement modules (DFEM) and a cross-modality fusion Mamba module (CMFM). The former serves for dynamic texture enhancement and dynamic difference perception, whereas the latter enhances correlation features between modalities and suppresses redundant inter-modal information. FusionMamba achieves state-of-the-art (SOTA) performance across various multimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), an infrared and visible image fusion task (IR-VIS), and a multimodal biomedical image fusion dataset (GFP-PC), demonstrating the generalization ability of our model. The code for FusionMamba is available at https://github.com/millieXie/FusionMamba. + +
+
+
+
+
+ + ♻ ☆ Joint Multimodal Transformer for Emotion Recognition in the Wild CVPR + + +
+ Multimodal emotion recognition (MMER) systems typically outperform unimodal +systems by leveraging the inter- and intra-modal relationships between, e.g., +visual, textual, physiological, and auditory modalities. This paper proposes an +MMER method that relies on a joint multimodal transformer (JMT) for fusion with +key-based cross-attention. This framework can exploit the complementary nature +of diverse modalities to improve predictive accuracy. Separate backbones +capture intra-modal spatiotemporal dependencies within each modality over video +sequences. Subsequently, our JMT fusion architecture integrates the individual +modality embeddings, allowing the model to effectively capture inter- and +intra-modal relationships. Extensive experiments on two challenging expression +recognition tasks -- (1) dimensional emotion recognition on the Affwild2 +dataset (with face and voice) and (2) pain estimation on the Biovid dataset +(with face and biosensors) -- indicate that our JMT fusion can provide a +cost-effective solution for MMER. Empirical results show that MMER systems with +our proposed fusion allow us to outperform relevant baseline and +state-of-the-art methods. + +
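A minimal sketch of cross-attention fusion between two modality streams, roughly in the spirit of the joint multimodal transformer described above, is shown here; the single-block layout, the bidirectional attention arrangement, and the feature sizes are illustrative rather than the paper's architecture.

```python
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.a_to_b = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.b_to_a = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, feat_a: torch.Tensor, feat_b: torch.Tensor) -> torch.Tensor:
        # feat_*: (batch, time, dim) per-modality embeddings (e.g. face and voice)
        a_attn, _ = self.a_to_b(feat_a, feat_b, feat_b)   # A queries attend to B
        b_attn, _ = self.b_to_a(feat_b, feat_a, feat_a)   # B queries attend to A
        return torch.cat([feat_a + a_attn, feat_b + b_attn], dim=-1)

fusion = CrossModalFusion()
print(fusion(torch.randn(2, 16, 256), torch.randn(2, 16, 256)).shape)  # (2, 16, 512)
```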
+
+ comment: 10 pages, 4 figures, 6 tables, CVPRw 2024 +
+
+
+
+
+ + ♻ ☆ Allowing humans to interactively guide machines where to look does not + always improve human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps \cite{vaswani2017attention} and feature importance maps \cite{bansal2020sam} have been established as a common means for finding how important each input feature is to an AI's decisions. It is an interesting, unexplored question whether allowing users to edit the feature importance at test time would improve a human-AI team's accuracy on downstream tasks. In this paper, we address this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc explainable classifier \cite{taesiri2022visual} that first predicts patch-wise correspondences between the input and training-set images, and then bases its classification decisions on them. We build CHM-Corr++, an interactive interface for CHM-Corr that enables users to edit the feature importance map provided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users can gain insights into if, when, and how the model changes its outputs, improving their understanding beyond static explanations. However, our study with 18 expert users who performed 1,400 decisions finds no statistically significant evidence that our interactive approach improves user accuracy on CUB-200 bird image classification over static explanations. This challenges the hypothesis that interactivity can boost human-AI team accuracy and raises the need for future research. We open-source CHM-Corr++, an interactive tool for editing image classifier attention (see an interactive demo here: http://137.184.82.109:7080/). We release code and data on GitHub: https://github.com/anguyen8/chm-corr-interactive. + +
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ♻ ☆ CT Liver Segmentation via PVT-based Encoding and Refined Decoding + + +
+ Accurate liver segmentation from CT scans is essential for effective diagnosis and treatment planning, and computer-aided diagnosis systems promise to improve the precision of liver disease diagnosis, disease progression assessment, and treatment planning. In response to this need, we propose a novel deep learning approach, PVTFormer, built upon a pretrained pyramid vision transformer (PVT v2) combined with advanced residual upsampling and decoder blocks. By integrating a refined feature channel approach with a hierarchical decoding strategy, PVTFormer generates high-quality segmentation masks by enhancing semantic features. Rigorous evaluation of the proposed method on the Liver Tumor Segmentation Benchmark (LiTS) 2017 demonstrates that our architecture not only achieves a high Dice coefficient of 86.78% and an mIoU of 78.46%, but also obtains a low HD of 3.50. These results underscore PVTFormer's efficacy in setting a new benchmark for state-of-the-art liver segmentation methods. The source code of the proposed PVTFormer is available at https://github.com/DebeshJha/PVTFormer. + +
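The Dice coefficient and mIoU quoted above are standard overlap metrics; a small reference implementation for binary masks (not the authors' code) is sketched here.

```python
import numpy as np

def dice_and_iou(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-7):
    """Compute Dice coefficient and IoU between two binary masks."""
    pred, gt = pred.astype(bool), gt.astype(bool)
    inter = np.logical_and(pred, gt).sum()
    dice = (2 * inter + eps) / (pred.sum() + gt.sum() + eps)
    iou = (inter + eps) / (np.logical_or(pred, gt).sum() + eps)
    return dice, iou

pred = np.random.rand(256, 256) > 0.5
gt = np.random.rand(256, 256) > 0.5
print(dice_and_iou(pred, gt))
```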
+
+
+
+
+ + ♻ ☆ Action-slot: Visual Action-centric Representations for Multi-label + Atomic Activity Recognition in Traffic Scenes + + +
+ In this paper, we study multi-label atomic activity recognition. Despite the +notable progress in action recognition, it is still challenging to recognize +atomic activities due to a deficiency in a holistic understanding of both +multiple road users' motions and their contextual information. In this paper, +we introduce Action-slot, a slot attention-based approach that learns visual +action-centric representations, capturing both motion and contextual +information. Our key idea is to design action slots that are capable of paying +attention to regions where atomic activities occur, without the need for +explicit perception guidance. To further enhance slot attention, we introduce a +background slot that competes with action slots, aiding the training process in +avoiding unnecessary focus on background regions devoid of activities. Yet, the +imbalanced class distribution in the existing dataset hampers the assessment of +rare activities. To address the limitation, we collect a synthetic dataset +called TACO, which is four times larger than OATS and features a balanced +distribution of atomic activities. To validate the effectiveness of our method, +we conduct comprehensive experiments and ablation studies against various +action recognition baselines. We also show that the performance of multi-label +atomic activity recognition on real-world datasets can be improved by +pretraining representations on TACO. We will release our source code and +dataset. See the videos of visualization on the project page: +https://hcis-lab.github.io/Action-slot/ + +
+
+
+
+
+ + ♻ ☆ TransNeXt: Robust Foveal Visual Perception for Vision Transformers CVPR 2024 + + +
+ Due to the depth degradation effect in residual connections, many efficient Vision Transformer models that rely on stacking layers for information exchange often fail to form sufficient information mixing, leading to unnatural visual perception. To address this issue, in this paper we propose Aggregated Attention, a token mixer based on biomimetic design that simulates biological foveal vision and continuous eye movement while enabling each token on the feature map to have a global perception. Furthermore, we incorporate learnable tokens that interact with conventional queries and keys, which further diversifies the generation of affinity matrices beyond merely relying on the similarity between queries and keys. Our approach does not rely on stacking for information exchange, thus effectively avoiding depth degradation and achieving natural visual perception. Additionally, we propose Convolutional GLU, a channel mixer that bridges the gap between the GLU and SE mechanisms and empowers each token to have channel attention based on its nearest-neighbor image features, enhancing local modeling capability and model robustness. We combine Aggregated Attention and Convolutional GLU to create a new visual backbone called TransNeXt. Extensive experiments demonstrate that TransNeXt achieves state-of-the-art performance across multiple model sizes. At a resolution of $224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing ConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet accuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of $384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic segmentation mIoU of 54.7. + +
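A hedged sketch of a Convolutional-GLU-style channel mixer follows, where the gating branch sees nearest-neighbor context through a depth-wise 3x3 convolution before gating the value branch; layer sizes and the sigmoid gate follow common practice rather than the TransNeXt configuration.

```python
import torch
import torch.nn as nn

class ConvGLU(nn.Module):
    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.value = nn.Linear(dim, hidden)
        self.gate = nn.Sequential(
            nn.Conv2d(dim, dim, 3, padding=1, groups=dim),  # depth-wise local context
            nn.Conv2d(dim, hidden, 1),
            nn.Sigmoid(),
        )
        self.out = nn.Linear(hidden, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, dim, height, width) feature map
        v = self.value(x.permute(0, 2, 3, 1))               # (B, H, W, hidden)
        g = self.gate(x).permute(0, 2, 3, 1)                # (B, H, W, hidden)
        return self.out(v * g).permute(0, 3, 1, 2) + x      # gated channels + residual

mixer = ConvGLU(dim=96, hidden=192)
print(mixer(torch.randn(2, 96, 14, 14)).shape)  # torch.Size([2, 96, 14, 14])
```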
+
+ comment: CVPR 2024 Camera-ready Version. Project Page: + https://github.com/DaiShiResearch/TransNeXt +
+
+
+
+
+ + ♻ ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST + +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at: + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ♻ ☆ OpenPack: A Large-scale Dataset for Recognizing Packaging Works in + IoT-enabled Logistic Environments + + +
+ Unlike human daily activities, existing publicly available sensor datasets +for work activity recognition in industrial domains are limited by difficulties +in collecting realistic data as close collaboration with industrial sites is +required. This also limits research on and development of methods for +industrial applications. To address these challenges and contribute to research +on machine recognition of work activities in industrial domains, in this study, +we introduce a new large-scale dataset for packaging work recognition called +OpenPack. OpenPack contains 53.8 hours of multimodal sensor data, including +acceleration data, keypoints, depth images, and readings from IoT-enabled +devices (e.g., handheld barcode scanners), collected from 16 distinct subjects +with different levels of packaging work experience. We apply state-of-the-art +human activity recognition techniques to the dataset and provide future +directions of complex work activity recognition studies in the pervasive +computing community based on the results. We believe that OpenPack will +contribute to the sensor-based action/activity recognition community by +providing challenging tasks. The OpenPack dataset is available at +https://open-pack.github.io. + +
+
+
+
+
+ + ♻ ☆ Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of + Orthogonal Diffusion Models + + +
+ Recent advancements in 3D generation are predominantly propelled by +improvements in 3D-aware image diffusion models which are pretrained on +Internet-scale image data and fine-tuned on massive 3D data, offering the +capability of producing highly consistent multi-view images. However, due to +the scarcity of synchronized multi-view video data, it is impractical to adapt +this paradigm to 4D generation directly. Despite that, the available video and +3D data are adequate for training video and multi-view diffusion models that +can provide satisfactory dynamic and geometric priors respectively. In this +paper, we present Diffusion$^2$, a novel framework for dynamic 3D content +creation that leverages the knowledge about geometric consistency and temporal +smoothness from these models to directly sample dense multi-view and +multi-frame images which can be employed to optimize continuous 4D +representation. Specifically, we design a simple yet effective denoising +strategy via score composition of video and multi-view diffusion models based +on the probability structure of the images to be generated. Owing to the high +parallelism of the image generation and the efficiency of the modern 4D +reconstruction pipeline, our framework can generate 4D content within few +minutes. Furthermore, our method circumvents the reliance on 4D data, thereby +having the potential to benefit from the scalability of the foundation video +and multi-view diffusion models. Extensive experiments demonstrate the efficacy +of our proposed framework and its capability to flexibly adapt to various types +of prompts. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for + Few-shot Action Recognition + + +
+ Adapting large pre-trained image models to few-shot action recognition has +proven to be an effective and efficient strategy for learning robust feature +extractors, which is essential for few-shot learning. Typical fine-tuning based +adaptation paradigm is prone to overfitting in the few-shot learning scenarios +and offers little modeling flexibility for learning temporal features in video +data. In this work we present the Disentangled-and-Deformable Spatio-Temporal +Adapter (D$^2$ST-Adapter), which is a novel adapter tuning framework +well-suited for few-shot action recognition due to lightweight design and low +parameter-learning overhead. It is designed in a dual-pathway architecture to +encode spatial and temporal features in a disentangled manner. In particular, +we devise the anisotropic Deformable Spatio-Temporal Attention module as the +core component of D$^2$ST-Adapter, which can be tailored with anisotropic +sampling densities along spatial and temporal domains to learn spatial and +temporal features specifically in corresponding pathways, allowing our +D$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space +while maintaining a lightweight design. Extensive experiments with +instantiations of our method on both pre-trained ResNet and ViT demonstrate the +superiority of our method over state-of-the-art methods for few-shot action +recognition. Our method is particularly well-suited to challenging scenarios +where temporal dynamics are critical for action recognition. + +
+
+
+
+
+ + ♻ ☆ Delocate: Detection and Localization for Deepfake Videos with + Randomly-Located Tampered Traces + + +
+ Deepfake videos are becoming increasingly realistic, showing subtle tampering traces on facial areas that vary between frames. Consequently, many existing Deepfake detection methods struggle to detect unknown-domain Deepfake videos while accurately locating the tampered region. To address this limitation, we propose Delocate, a novel Deepfake detection model that can both recognize and localize unknown-domain Deepfake videos. Our method consists of two stages, named recovering and localization. In the recovering stage, the model randomly masks regions of interest (ROIs) and reconstructs real faces without tampering traces, resulting in a relatively good recovery effect for real faces and a poor recovery effect for fake faces. In the localization stage, the output of the recovery phase and the forgery ground truth mask serve as supervision to guide the forgery localization process. This process strategically emphasizes the recovery phase of fake faces with poor recovery, facilitating the localization of tampered regions. Our extensive experiments on four widely used benchmark datasets demonstrate that Delocate not only excels in localizing tampered areas but also enhances cross-domain detection performance. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.09921, + arXiv:2305.05943 +
+
+
+
+
+ + ♻ ☆ CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding ICLR 2024 + + +
+ 3D visual grounding is the ability to localize objects in 3D scenes conditioned on utterances. Most existing methods devote the referring head to localizing the referred object directly, causing failure in complex scenarios, and they do not illustrate how or why the network reaches its final decision. In this paper, we address the question: can we design an interpretable 3D visual grounding framework that has the potential to mimic the human perception system? To this end, we formulate the 3D visual grounding problem as a sequence-to-sequence (Seq2Seq) task by first predicting a chain of anchors and then the final target. Interpretability not only improves the overall performance but also helps us identify failure cases. Following the chain-of-thoughts approach enables us to decompose the referring task into interpretable intermediate steps, boosting the performance and making our framework extremely data-efficient. Moreover, our proposed framework can be easily integrated into any existing architecture. We validate our approach through comprehensive experiments on the Nr3D, Sr3D, and ScanRefer benchmarks and show consistent performance gains compared to existing methods without requiring manually annotated data. Furthermore, our proposed framework, dubbed CoT3DRef, is significantly data-efficient: on the Sr3D dataset, when trained on only 10% of the data, we match the SOTA performance of models trained on the entire dataset. The code is available at https://eslambakr.github.io/cot3dref.github.io/. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of + LiDAR-Camera Fusion for 3D Object Detection + + +
+ Multi-modal 3D object detection models for automated driving have +demonstrated exceptional performance on computer vision benchmarks like +nuScenes. However, their reliance on densely sampled LiDAR point clouds and +meticulously calibrated sensor arrays poses challenges for real-world +applications. Issues such as sensor misalignment, miscalibration, and disparate +sampling frequencies lead to spatial and temporal misalignment in data from +LiDAR and cameras. Additionally, the integrity of LiDAR and camera data is +often compromised by adverse environmental conditions such as inclement +weather, leading to occlusions and noise interference. To address this +challenge, we introduce MultiCorrupt, a comprehensive benchmark designed to +evaluate the robustness of multi-modal 3D object detectors against ten distinct +types of corruptions. We evaluate five state-of-the-art multi-modal detectors +on MultiCorrupt and analyze their performance in terms of their resistance +ability. Our results show that existing methods exhibit varying degrees of +robustness depending on the type of corruption and their fusion strategy. We +provide insights into which multi-modal design choices make such models robust +against certain perturbations. The dataset generation code and benchmark are +open-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt. + +
+
+ comment: Code: https://github.com/ika-rwth-aachen/MultiCorrupt +
+
+
+
+
+ + ♻ ☆ Improving 2D Human Pose Estimation in Rare Camera Views with Synthetic + Data + + +
+ Methods and datasets for human pose estimation focus predominantly on side- +and front-view scenarios. We overcome the limitation by leveraging synthetic +data and introduce RePoGen (RarE POses GENerator), an SMPL-based method for +generating synthetic humans with comprehensive control over pose and view. +Experiments on top-view datasets and a new dataset of real images with diverse +poses show that adding the RePoGen data to the COCO dataset outperforms +previous approaches to top- and bottom-view pose estimation without harming +performance on common views. An ablation study shows that anatomical +plausibility, a property prior research focused on, is not a prerequisite for +effective performance. The introduced dataset and the corresponding code are +available on https://mirapurkrabek.github.io/RePoGen-paper/ . + +
+
+ comment: https://mirapurkrabek.github.io/RePoGen-paper/ +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ Deepfake is a technology dedicated to creating highly realistic facial images and videos under specific conditions, which has significant application potential in fields such as entertainment, movie production, and digital human creation. With the advancements in deep learning, techniques primarily represented by Variational Autoencoders and Generative Adversarial Networks have achieved impressive generation results, and more recently, the emergence of diffusion models with powerful generation capabilities has sparked a renewed wave of research. In addition to deepfake generation, corresponding detection technologies continuously evolve to regulate the potential misuse of deepfakes, such as for privacy invasion and phishing attacks. This survey comprehensively reviews the latest developments in deepfake generation and detection, summarizing and analyzing the current state of the art in this rapidly evolving field. We first unify task definitions, comprehensively introduce datasets and metrics, and discuss developing technologies. Then, we discuss the development of several related sub-fields and focus on four representative deepfake fields: face swapping, face reenactment, talking face generation, and facial attribute editing, as well as forgery detection. Subsequently, we comprehensively benchmark representative methods on popular datasets for each field, fully evaluating the latest and most influential published works. Finally, we analyze the challenges and future research directions of the discussed fields. + +
+
+ comment: We closely follow the latest developments in + https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection +
+
+
+
+
+ + ♻ ☆ Task-Aware Encoder Control for Deep Video Compression CVPR 2024 + + +
+ Prior research on deep video compression (DVC) for machine tasks typically +necessitates training a unique codec for each specific task, mandating a +dedicated decoder per task. In contrast, traditional video codecs employ a +flexible encoder controller, enabling the adaptation of a single codec to +different tasks through mechanisms like mode prediction. Drawing inspiration +from this, we introduce an innovative encoder controller for deep video +compression for machines. This controller features a mode prediction and a +Group of Pictures (GoP) selection module. Our approach centralizes control at +the encoding stage, allowing for adaptable encoder adjustments across different +tasks, such as detection and tracking, while maintaining compatibility with a +standard pre-trained DVC decoder. Empirical evidence demonstrates that our +method is applicable across multiple tasks with various existing pre-trained +DVCs. Moreover, extensive experiments demonstrate that our method outperforms +previous DVC by about 25% bitrate for different tasks, with only one +pre-trained decoder. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection CVPR2024 + + +
+ Conventional Computed Tomography (CT) imaging recognition faces two significant challenges: (1) there is often considerable variability in the resolution and size of each CT scan, necessitating strict requirements for the input size and adaptability of models; (2) a CT scan contains a large number of out-of-distribution (OOD) slices, and the crucial features may be present only in specific spatial regions and slices of the entire scan. How can we effectively figure out where these are located? To deal with this, we introduce an enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically designed for CT scans. It aims to filter out OOD data within the whole CT scan, enabling us to select the crucial spatial slices for analysis while reducing overall redundancy by 70%. Meanwhile, we propose a Kernel-Density-based slice Sampling (KDS) method to improve stability during training and inference, thereby speeding up convergence and boosting performance. As a result, the experiments demonstrate the promising performance of our model using a simple EfficientNet-2D (E2D) model, even with only 1% of the training data. The efficacy of our approach has been validated on the COVID-19-CT-DB datasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024. Our source code is available at https://github.com/ming053l/E2D. + +
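One plausible reading of kernel-density-based slice sampling is sketched below: fit a KDE over slice positions weighted by a per-slice relevance score, then draw a fixed budget of slice indices from it. This is an illustration under stated assumptions (a recent SciPy with weighted `gaussian_kde`, a hypothetical relevance score, and an arbitrary budget), not the paper's procedure.

```python
import numpy as np
from scipy.stats import gaussian_kde

def kde_slice_sampling(scores: np.ndarray, n_samples: int = 16) -> np.ndarray:
    """scores: non-negative relevance per slice, shape (num_slices,). Returns sampled slice indices."""
    idx = np.arange(len(scores), dtype=float)
    kde = gaussian_kde(idx, weights=scores + 1e-8)          # density over slice positions
    drawn = kde.resample(n_samples).ravel()
    picked = np.clip(np.round(drawn), 0, len(scores) - 1).astype(int)
    return np.unique(picked)

scores = np.exp(-0.5 * ((np.arange(120) - 60) / 15.0) ** 2)  # toy relevance profile peaking mid-volume
print(kde_slice_sampling(scores))
```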
+
+ comment: Camera-ready version, accepted by DEF-AI-MIA workshop, in conjunted + with CVPR2024 +
+
+
+
+
+ + ♻ ☆ SPIRiT-Diffusion: Self-Consistency Driven Diffusion Model for + Accelerated MRI + + +
+ Diffusion models have emerged as a leading methodology for image generation +and have proven successful in the realm of magnetic resonance imaging (MRI) +reconstruction. However, existing reconstruction methods based on diffusion +models are primarily formulated in the image domain, making the reconstruction +quality susceptible to inaccuracies in coil sensitivity maps (CSMs). k-space +interpolation methods can effectively address this issue but conventional +diffusion models are not readily applicable in k-space interpolation. To +overcome this challenge, we introduce a novel approach called SPIRiT-Diffusion, +which is a diffusion model for k-space interpolation inspired by the iterative +self-consistent SPIRiT method. Specifically, we utilize the iterative solver of +the self-consistent term (i.e., k-space physical prior) in SPIRiT to formulate +a novel stochastic differential equation (SDE) governing the diffusion process. +Subsequently, k-space data can be interpolated by executing the diffusion +process. This innovative approach highlights the optimization model's role in +designing the SDE in diffusion models, enabling the diffusion process to align +closely with the physics inherent in the optimization model, a concept referred +to as model-driven diffusion. We evaluated the proposed SPIRiT-Diffusion method +using a 3D joint intracranial and carotid vessel wall imaging dataset. The +results convincingly demonstrate its superiority over image-domain +reconstruction methods, achieving high reconstruction quality even at a +substantial acceleration rate of 10. + +
+
+
+
+
+ + ♻ ☆ A Single Simple Patch is All You Need for AI-generated Image Detection + + +
+ The recent development of generative models unleashes the potential of generating hyper-realistic fake images. To prevent the malicious use of fake images, AI-generated image detection aims to distinguish fake images from real ones. However, existing methods suffer from a severe performance drop when detecting images produced by unseen generators. We find that generative models tend to focus on generating patches with rich textures to make images more realistic, while neglecting the hidden noise caused by camera capture that is present in simple patches. In this paper, we propose to exploit the noise pattern of a single simple patch to identify fake images. Furthermore, because performance declines when handling low-quality generated images, we introduce an enhancement module and a perception module to remove the interfering information. Extensive experiments demonstrate that our method achieves state-of-the-art performance on public benchmarks. + +
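A minimal sketch of selecting a "simple" patch follows, taking the patch with the lowest intensity variance as the one where capture noise is easiest to expose; the patch size and the variance criterion are assumptions for illustration.

```python
import numpy as np

def simplest_patch(gray: np.ndarray, patch: int = 32) -> np.ndarray:
    """Return the non-overlapping patch of a grayscale image with the lowest variance."""
    h, w = gray.shape
    best, best_var = None, np.inf
    for y in range(0, h - patch + 1, patch):
        for x in range(0, w - patch + 1, patch):
            tile = gray[y:y + patch, x:x + patch]
            if tile.var() < best_var:
                best, best_var = tile, tile.var()
    return best

print(simplest_patch(np.random.rand(256, 256)).shape)  # (32, 32)
```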
+
+
+
+
+ + ♻ ☆ ProTA: Probabilistic Token Aggregation for Text-Video Retrieval + + +
+ Text-video retrieval aims to find the most relevant cross-modal samples for a +given query. Recent methods focus on modeling the whole spatial-temporal +relations. However, since video clips contain more diverse content than +captions, the model aligning these asymmetric video-text pairs has a high risk +of retrieving many false positive results. In this paper, we propose +Probabilistic Token Aggregation (ProTA) to handle cross-modal interaction with +content asymmetry. Specifically, we propose dual partial-related aggregation to +disentangle and re-aggregate token representations in both low-dimension and +high-dimension spaces. We propose token-based probabilistic alignment to +generate token-level probabilistic representation and maintain the feature +representation diversity. In addition, an adaptive contrastive loss is proposed +to learn compact cross-modal distribution space. Based on extensive +experiments, ProTA achieves significant improvements on MSR-VTT (50.9%), LSMDC +(25.8%), and DiDeMo (47.2%). + +
+
+
+
+
+ + ♻ ☆ End-to-End Autonomous Driving through V2X Cooperation + + +
+ Cooperatively utilizing both ego-vehicle and infrastructure sensor data via +V2X communication has emerged as a promising approach for advanced autonomous +driving. However, current research mainly focuses on improving individual +modules, rather than taking end-to-end learning to optimize final planning +performance, resulting in underutilized data potential. In this paper, we +introduce UniV2X, a pioneering cooperative autonomous driving framework that +seamlessly integrates all key driving modules across diverse views into a +unified network. We propose a sparse-dense hybrid data transmission and fusion +mechanism for effective vehicle-infrastructure cooperation, offering three +advantages: 1) Effective for simultaneously enhancing agent perception, online +mapping, and occupancy prediction, ultimately improving planning performance. +2) Transmission-friendly for practical and limited communication conditions. 3) +Reliable data fusion with interpretability of this hybrid data. We implement +UniV2X, as well as reproducing several benchmark methods, on the challenging +DAIR-V2X, the real-world cooperative driving dataset. Experimental results +demonstrate the effectiveness of UniV2X in significantly enhancing planning +performance, as well as all intermediate output performance. Code is at +https://github.com/AIR-THU/UniV2X. + +
+
+
+
+
+ + ♻ ☆ Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation + Guided by the Characteristic Dance Primitives CVPR2024 + + +
+ We propose Lodge, a network capable of generating extremely long dance
+sequences conditioned on given music. We design Lodge as a two-stage coarse to
+fine diffusion architecture, and propose the characteristic dance primitives
+that possess significant expressiveness as intermediate representations between
+the two diffusion models. The first stage is global diffusion, which focuses on
+comprehending the coarse-level music-dance correlation and producing the
+characteristic dance primitives. In contrast, the second stage is local
+diffusion, which generates detailed motion sequences in parallel under the
+guidance of the dance primitives and choreographic rules. In addition, we
+propose a Foot Refine Block to optimize the contact between the feet and the
+ground, enhancing the physical realism of the motion. Our approach can generate
+dance sequences of extremely long length in parallel, striking a balance
+between global choreographic patterns and local motion quality and
+expressiveness. Extensive experiments validate the efficacy of our method.
+
+
+ comment: Accepted by CVPR2024, Project page: + https://li-ronghui.github.io/lodge +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 152 + +
+
+
+ + ☆ MoVA: Adapting Mixture of Vision Experts to Multimodal Context + + +
+ As the key component in multimodal large language models (MLLMs), the ability
+of the visual encoder greatly affects the MLLM's understanding of diverse image
+content. Although some large-scale pretrained vision encoders such as the
+vision encoders in CLIP and DINOv2 have brought promising performance, we found
+that no single vision encoder can dominate the understanding of various image
+content, e.g., the CLIP vision encoder leads to outstanding results on general
+image understanding but poor performance on document or chart content. To
+alleviate the bias of the CLIP vision encoder, we first delve into the inherent
+behavior of different pre-trained vision encoders and then propose MoVA, a
+powerful and novel MLLM that adaptively routes and fuses task-specific vision
+experts with a coarse-to-fine mechanism. In the coarse-grained stage, we design
+a context-aware expert routing strategy to dynamically select the most suitable
+vision experts according to the user instruction, input image, and expertise of
+the vision experts. This benefits from the powerful model function
+understanding ability of the large language model (LLM) equipped with
+expert-routing low-rank adaptation (LoRA). In the fine-grained stage, we
+carefully design the mixture-of-vision-expert adapter (MoV-Adapter) to extract
+and fuse task-specific knowledge from various experts. This coarse-to-fine
+paradigm effectively leverages representations from experts based on multimodal
+context and model expertise, further enhancing the generalization ability. We
+conduct extensive experiments to evaluate the effectiveness of the proposed
+approach. Without any bells and whistles, MoVA achieves significant performance
+gains over current state-of-the-art methods on a wide range of challenging
+multimodal benchmarks. Codes and models will be available at
+https://github.com/TempleX98/MoVA.
+
+
+
+
+
+ + ☆ Unified Scene Representation and Reconstruction for 3D Large Language + Models + + +
+ Enabling Large Language Models (LLMs) to interact with 3D environments is
+challenging. Existing approaches extract point clouds either from ground truth
+(GT) geometry or 3D scenes reconstructed by auxiliary models. Text-image
+aligned 2D features from CLIP are then lifted to point clouds, which serve as
+inputs for LLMs. However, this solution lacks the establishment of 3D
+point-to-point connections, leading to a deficiency of spatial structure
+information. Concurrently, the absence of integration and unification between
+the geometric and semantic representations of the scene culminates in a
+diminished level of 3D scene understanding. In this paper, we demonstrate the
+importance of having a unified scene representation and reconstruction
+framework, which is essential for LLMs in 3D scenes. Specifically, we introduce
+Uni3DR^2, which extracts 3D geometric and semantic-aware representation
+features via frozen pre-trained 2D foundation models (e.g., CLIP and SAM) and a
+multi-scale aggregate 3D decoder. Our learned 3D representations not only
+contribute to the reconstruction process but also provide valuable knowledge
+for LLMs. Experimental results validate that our Uni3DR^2 yields convincing
+gains over the baseline on the 3D reconstruction dataset ScanNet (increasing
+F-Score by +1.8%). When applied to LLMs, our Uni3DR^2-LLM exhibits superior
+performance over the baseline on the 3D vision-language understanding dataset
+ScanQA (increasing BLEU-1 by +4.0% and +4.2% on the val set and test set,
+respectively). Furthermore, it outperforms the state-of-the-art method that
+uses additional GT point clouds on both ScanQA and 3DMV-VQA.
+
+
+ comment: Project Page: https://chtsy.github.io/uni3drr-page/ +
+
+
+
+
+ + ☆ Data Alignment for Zero-Shot Concept Generation in Dermatology AI + + +
+ AI in dermatology is evolving at a rapid pace but the major limitation to
+training trustworthy classifiers is the scarcity of data with ground-truth
+concept level labels, which are meta-labels semantically meaningful to humans.
+Foundation models like CLIP providing zero-shot capabilities can help alleviate
+this challenge by leveraging vast amounts of image-caption pairs available on
+the internet. CLIP can be fine-tuned using domain specific image-caption pairs
+to improve classification performance. However, CLIP's pre-training data is not
+well-aligned with the medical jargon that clinicians use to perform diagnoses.
+The development of large language models (LLMs) in recent years has led to the
+possibility of leveraging the expressive nature of these models to generate
+rich text. Our goal is to use these models to generate caption text that aligns
+well with both the clinical lexicon and with the natural human language used in
+CLIP's pre-training data. Starting with captions used for images in PubMed
+articles, we extend them by passing the raw captions through an LLM fine-tuned
+on several of the field's textbooks. We find that using captions generated by
+an expressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept
+classification performance.
+
+
+
+
+
+ + ☆ Analysis of Classifier-Free Guidance Weight Schedulers + + +
+ Classifier-Free Guidance (CFG) enhances the quality and condition adherence +of text-to-image diffusion models. It operates by combining the conditional and +unconditional predictions using a fixed weight. However, recent works vary the +weights throughout the diffusion process, reporting superior results but +without providing any rationale or analysis. By conducting comprehensive +experiments, this paper provides insights into CFG weight schedulers. Our +findings suggest that simple, monotonically increasing weight schedulers +consistently lead to improved performances, requiring merely a single line of +code. In addition, more complex parametrized schedulers can be optimized for +further improvement, but do not generalize across different models and tasks. + +
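+ The "single line of code" claim plausibly amounts to replacing a constant
+guidance weight with a monotonically increasing schedule inside the sampling
+loop; the linear form and the model interface below are assumptions for
+illustration only:
+
+```python
+def cfg_weight(step: int, total: int, w_max: float = 7.5) -> float:
+    """Monotonically increasing (linear) guidance weight: 0 -> w_max."""
+    return w_max * step / max(total - 1, 1)
+
+def guided_eps(model, x_t, t, cond, step, total):
+    """Classifier-free guidance with a scheduled weight instead of a constant."""
+    eps_cond = model(x_t, t, cond)    # conditional noise prediction
+    eps_uncond = model(x_t, t, None)  # unconditional noise prediction
+    w = cfg_weight(step, total)
+    return eps_uncond + w * (eps_cond - eps_uncond)
+```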
+
+
+
+
+ + ☆ LaPA: Latent Prompt Assist Model For Medical Visual Question Answering CVPR + + +
+ Medical visual question answering (Med-VQA) aims to automate the prediction
+of correct answers for medical images and questions, thereby assisting
+physicians in reducing repetitive tasks and alleviating their workload.
+Existing approaches primarily focus on pre-training models using additional and
+comprehensive datasets, followed by fine-tuning to enhance performance in
+downstream tasks. However, there is also significant value in exploring
+existing models to extract clinically relevant information. In this paper, we
+propose the Latent Prompt Assist model (LaPA) for medical visual question
+answering. Firstly, we design a latent prompt generation module to generate the
+latent prompt with the constraint of the target answer. Subsequently, we
+propose a multi-modal fusion block with a latent prompt fusion module that
+utilizes the latent prompt to extract clinically relevant information from
+uni-modal and multi-modal features. Additionally, we introduce a prior
+knowledge fusion module to integrate the relationship between diseases and
+organs with the clinically relevant information. Finally, we combine the final
+integrated information with image-language cross-modal information to predict
+the final answers. Experimental results on three publicly available Med-VQA
+datasets demonstrate that LaPA outperforms the state-of-the-art model ARL,
+achieving improvements of 1.83%, 0.63%, and 1.80% on VQA-RAD, SLAKE, and
+VQA-2019, respectively. The code is publicly available at
+https://github.com/GaryGuTC/LaPA_model.
+
+
+ comment: 10 pages, 4 figures, Accepted by CVPRW2024 +
+
+
+
+
+ + ☆ PhysDreamer: Physics-Based Interaction with 3D Objects via Video + Generation + + +
+ Realistic object interactions are crucial for creating immersive virtual +experiences, yet synthesizing realistic 3D object dynamics in response to novel +interactions remains a significant challenge. Unlike unconditional or +text-conditioned dynamics generation, action-conditioned dynamics requires +perceiving the physical material properties of objects and grounding the 3D +motion prediction on these properties, such as object stiffness. However, +estimating physical material properties is an open problem due to the lack of +material ground-truth data, as measuring these properties for real objects is +highly difficult. We present PhysDreamer, a physics-based approach that endows +static 3D objects with interactive dynamics by leveraging the object dynamics +priors learned by video generation models. By distilling these priors, +PhysDreamer enables the synthesis of realistic object responses to novel +interactions, such as external forces or agent manipulations. We demonstrate +our approach on diverse examples of elastic objects and evaluate the realism of +the synthesized interactions through a user study. PhysDreamer takes a step +towards more engaging and realistic virtual experiences by enabling static 3D +objects to dynamically respond to interactive stimuli in a physically plausible +manner. See our project page at https://physdreamer.github.io/. + +
+
+ comment: Project website at: https://physdreamer.github.io/ +
+
+
+
+
+ + ☆ BANF: Band-limited Neural Fields for Levels of Detail Reconstruction + + +
+ Largely due to their implicit nature, neural fields lack a direct mechanism +for filtering, as Fourier analysis from discrete signal processing is not +directly applicable to these representations. Effective filtering of neural +fields is critical to enable level-of-detail processing in downstream +applications, and support operations that involve sampling the field on regular +grids (e.g. marching cubes). Existing methods that attempt to decompose neural +fields in the frequency domain either resort to heuristics or require extensive +modifications to the neural field architecture. We show that via a simple +modification, one can obtain neural fields that are low-pass filtered, and in +turn show how this can be exploited to obtain a frequency decomposition of the +entire signal. We demonstrate the validity of our technique by investigating +level-of-detail reconstruction, and showing how coarser representations can be +computed effectively. + +
+
+ comment: Project Page: https://theialab.github.io/banf +
+
+
+
+
+ + ☆ Optimizing Calibration by Gaining Aware of Prediction Correctness + + +
+ Model calibration aims to align confidence with prediction correctness. The
+Cross-Entropy (CE) loss is widely used for calibrator training, which enforces
+the model to increase confidence on the ground truth class. However, we find
+the CE loss has intrinsic limitations. For example, for a narrow
+misclassification, a calibrator trained by the CE loss often produces high
+confidence on the wrongly predicted class (e.g., a test sample is wrongly
+classified and its softmax score on the ground truth class is around 0.4),
+which is undesirable. In this paper, we propose a new post-hoc calibration
+objective derived from the aim of calibration. Intuitively, the proposed
+objective function asks that the calibrator decrease model confidence on
+wrongly predicted samples and increase confidence on correctly predicted
+samples. Because a sample itself has insufficient ability to indicate
+correctness, we use its transformed versions (e.g., rotated, greyscaled and
+color-jittered) during calibrator training. Trained on an in-distribution
+validation set and tested with isolated, individual test samples, our method
+achieves competitive calibration performance on both in-distribution and
+out-of-distribution test sets compared with the state of the art. Further, our
+analysis points out the difference between our method and commonly used
+objectives such as the CE loss and mean squared error loss, where the latter
+sometimes deviate from the calibration aim.
+
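+ One plausible instantiation of the stated objective (confidence pushed toward
+1 on correctly predicted samples and toward 0 on wrongly predicted ones, with
+transformed views supplying the training signal); the binary cross-entropy form
+and the frozen-backbone interface are assumptions, not the paper's exact loss:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def correctness_aware_loss(calibrated_logits, labels):
+    """BCE between the max softmax confidence and a 0/1 correctness target."""
+    probs = calibrated_logits.softmax(dim=-1)
+    conf, pred = probs.max(dim=-1)
+    correct = (pred == labels).float()  # 1 if the prediction is right
+    return F.binary_cross_entropy(conf, correct)
+
+def training_step(calibrator, backbone, images, labels, transforms):
+    """Use transformed copies (rotated, greyscaled, ...) of each sample."""
+    views = [t(images) for t in transforms]
+    with torch.no_grad():                              # backbone stays frozen
+        logits = torch.cat([backbone(v) for v in views])
+    labels = labels.repeat(len(transforms))
+    return correctness_aware_loss(calibrator(logits), labels)
+```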
+
+
+
+
+ + ☆ Groma: Localized Visual Tokenization for Grounding Multimodal Large + Language Models + + +
+ We introduce Groma, a Multimodal Large Language Model (MLLM) with grounded +and fine-grained visual perception ability. Beyond holistic image +understanding, Groma is adept at region-level tasks such as region captioning +and visual grounding. Such capabilities are built upon a localized visual +tokenization mechanism, where an image input is decomposed into regions of +interest and subsequently encoded into region tokens. By integrating region +tokens into user instructions and model responses, we seamlessly enable Groma +to understand user-specified region inputs and ground its textual output to +images. Besides, to enhance the grounded chat ability of Groma, we curate a +visually grounded instruction dataset by leveraging the powerful GPT-4V and +visual prompting techniques. Compared with MLLMs that rely on the language +model or external module for localization, Groma consistently demonstrates +superior performances in standard referring and grounding benchmarks, +highlighting the advantages of embedding localization into image tokenization. +Project page: https://groma-mllm.github.io/. + +
+
+
+
+
+ + ☆ Towards Robust Ferrous Scrap Material Classification with Deep Learning + and Conformal Prediction + + +
+ In the steel production domain, recycling ferrous scrap is essential for +environmental and economic sustainability, as it reduces both energy +consumption and greenhouse gas emissions. However, the classification of scrap +materials poses a significant challenge, requiring advancements in automation +technology. Additionally, building trust among human operators is a major +obstacle. Traditional approaches often fail to quantify uncertainty and lack +clarity in model decision-making, which complicates acceptance. In this +article, we describe how conformal prediction can be employed to quantify +uncertainty and add robustness in scrap classification. We have adapted the +Split Conformal Prediction technique to seamlessly integrate with +state-of-the-art computer vision models, such as the Vision Transformer (ViT), +Swin Transformer, and ResNet-50, while also incorporating Explainable +Artificial Intelligence (XAI) methods. We evaluate the approach using a +comprehensive dataset of 8147 images spanning nine ferrous scrap classes. The +application of the Split Conformal Prediction method allowed for the +quantification of each model's uncertainties, which enhanced the understanding +of predictions and increased the reliability of the results. Specifically, the +Swin Transformer model demonstrated more reliable outcomes than the others, as +evidenced by its smaller average size of prediction sets and achieving an +average classification accuracy exceeding 95%. Furthermore, the Score-CAM +method proved highly effective in clarifying visual features, significantly +enhancing the explainability of the classification decisions. + +
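+ A minimal sketch of split conformal prediction as commonly formulated: compute
+nonconformity scores on a held-out calibration split, take a (1 - alpha)
+quantile, and threshold test-time softmax scores into prediction sets. The
+score function and alpha are illustrative, not necessarily the paper's exact
+configuration:
+
+```python
+import numpy as np
+
+def calibrate(cal_probs, cal_labels, alpha: float = 0.05) -> float:
+    """Conformal quantile q_hat from calibration softmax outputs (numpy >= 1.22)."""
+    n = len(cal_labels)
+    scores = 1.0 - cal_probs[np.arange(n), cal_labels]   # nonconformity scores
+    q_level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
+    return float(np.quantile(scores, q_level, method="higher"))
+
+def prediction_set(test_probs, q_hat: float):
+    """Boolean mask of classes whose softmax score clears the threshold."""
+    return test_probs >= 1.0 - q_hat
+```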
+
+
+
+
+ + ☆ RadRotator: 3D Rotation of Radiographs with Diffusion Models + + +
+ Transforming two-dimensional (2D) images into three-dimensional (3D) volumes +is a well-known yet challenging problem for the computer vision community. In +the medical domain, a few previous studies attempted to convert two or more +input radiographs into computed tomography (CT) volumes. Following their +effort, we introduce a diffusion model-based technology that can rotate the +anatomical content of any input radiograph in 3D space, potentially enabling +the visualization of the entire anatomical content of the radiograph from any +viewpoint in 3D. Similar to previous studies, we used CT volumes to create +Digitally Reconstructed Radiographs (DRRs) as the training data for our model. +However, we addressed two significant limitations encountered in previous +studies: 1. We utilized conditional diffusion models with classifier-free +guidance instead of Generative Adversarial Networks (GANs) to achieve higher +mode coverage and improved output image quality, with the only trade-off being +slower inference time, which is often less critical in medical applications; +and 2. We demonstrated that the unreliable output of style transfer deep +learning (DL) models, such as Cycle-GAN, to transfer the style of actual +radiographs to DRRs could be replaced with a simple yet effective training +transformation that randomly changes the pixel intensity histograms of the +input and ground-truth imaging data during training. This transformation makes +the diffusion model agnostic to any distribution variations of the input data +pixel intensity, enabling the reliable training of a DL model on input DRRs and +applying the exact same model to conventional radiographs (or DRRs) during +inference. + +
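+ A sketch of the kind of random pixel-intensity remapping described, applied
+during training so the model becomes agnostic to the intensity distribution of
+the input; the gamma range and windowing are illustrative assumptions:
+
+```python
+import numpy as np
+
+def random_intensity_transform(img: np.ndarray,
+                               rng: np.random.Generator) -> np.ndarray:
+    """Randomly reshape the intensity histogram of an image scaled to [0, 1]."""
+    gamma = rng.uniform(0.5, 2.0)                    # random gamma curve
+    lo, hi = np.sort(rng.uniform(0.0, 1.0, size=2))
+    hi = max(hi, lo + 1e-3)                          # random intensity window
+    return np.clip((img - lo) / (hi - lo), 0.0, 1.0) ** gamma
+```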
+
+ comment: Website: https://pouriarouzrokh.github.io/RadRotator Online demo: + https://huggingface.co/spaces/Pouriarouzrokh/RadRotator Article information: + 16 pages, 11 figures +
+
+
+
+
+ + ☆ Nuclei Instance Segmentation of Cryosectioned H&E Stained Histological + Images using Triple U-Net Architecture + + +
+ Nuclei instance segmentation is crucial in oncological diagnosis and cancer
+pathology research. H&E stained images are commonly used for medical diagnosis,
+but pre-processing is necessary before using them for image processing tasks.
+Two principal pre-processing methods are formalin-fixed paraffin-embedded
+samples (FFPE) and frozen tissue samples (FS). FFPE is widely used but
+time-consuming, whereas FS samples can be processed quickly. Analyzing H&E
+stained images derived from fast sample preparation, staining, and scanning can
+pose difficulties due to the swift process, which can result in the degradation
+of image quality. This paper proposes a method that leverages the unique
+optical characteristics of H&E stained images. A three-branch U-Net
+architecture has been implemented, where each branch contributes to the final
+segmentation results. The process includes applying the watershed algorithm to
+separate overlapping regions and enhance accuracy. The Triple U-Net
+architecture comprises an RGB branch, a Hematoxylin branch, and a Segmentation
+branch. This study focuses on a novel dataset named CryoNuSeg. The results
+obtained through robust experiments outperform the state-of-the-art results
+across various metrics. The benchmark score for this dataset is an AJI of 52.5
+and a PQ of 47.7, achieved through the implementation of a U-Net architecture.
+However, the proposed Triple U-Net architecture achieves an AJI score of 67.41
+and a PQ of 50.56. The proposed architecture improves more on AJI than on other
+evaluation metrics, which further justifies the superiority of the Triple U-Net
+architecture over the baseline U-Net model, as AJI is a stricter evaluation
+metric. The use of the three-branch U-Net model, followed by watershed
+post-processing, significantly surpasses the benchmark scores, showing
+substantial improvement in the AJI score.
+
+
+ comment: To be published in "6th IVPR & 11th ICIEV" +
+
+
+
+
+ + ☆ Cross-modal Diffusion Modelling for Super-resolved Spatial + Transcriptomics + + +
+ The recent advancement of spatial transcriptomics (ST) makes it possible to
+characterize spatial gene expression within tissue for discovery research.
+However, current ST platforms suffer from low resolution, hindering in-depth
+understanding of spatial gene expression. Super-resolution approaches promise
+to enhance ST maps by integrating histology images with gene expressions of
+profiled tissue spots. However, current super-resolution methods are limited by
+restoration uncertainty and mode collapse. Although diffusion models have shown
+promise in capturing complex interactions between multi-modal conditions, it
+remains a challenge to integrate histology images and gene expression for
+super-resolved ST maps. This paper proposes a cross-modal conditional diffusion
+model for super-resolving ST maps with the guidance of histology images.
+Specifically, we design a multi-modal disentangling network with cross-modal
+adaptive modulation to utilize complementary information from histology images
+and spatial gene expression. Moreover, we propose a dynamic cross-attention
+modelling strategy to extract hierarchical cell-to-tissue information from
+histology images. Lastly, we propose a co-expression-based gene-correlation
+graph network to model the co-expression relationship of multiple genes.
+Experiments show that our method outperforms other state-of-the-art methods in
+ST super-resolution on three public datasets.
+
+
+
+
+
+ + ☆ Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of + Multi-modal Large Language Models + + +
+ Counterfactual reasoning, as a crucial manifestation of human intelligence, +refers to making presuppositions based on established facts and extrapolating +potential outcomes. Existing multimodal large language models (MLLMs) have +exhibited impressive cognitive and reasoning capabilities, which have been +examined across a wide range of Visual Question Answering (VQA) benchmarks. +Nevertheless, how will existing MLLMs perform when faced with counterfactual +questions? To answer this question, we first curate a novel +\textbf{C}ounter\textbf{F}actual \textbf{M}ulti\textbf{M}odal reasoning +benchmark, abbreviated as \textbf{CFMM}, to systematically assess the +counterfactual reasoning capabilities of MLLMs. Our CFMM comprises six +challenging tasks, each including hundreds of carefully human-labeled +counterfactual questions, to evaluate MLLM's counterfactual reasoning +capabilities across diverse aspects. Through experiments, interestingly, we +find that existing MLLMs prefer to believe what they see, but ignore the +counterfactual presuppositions presented in the question, thereby leading to +inaccurate responses. Furthermore, we evaluate a wide range of prevalent MLLMs +on our proposed CFMM. The significant gap between their performance on our CFMM +and that on several VQA benchmarks indicates that there is still considerable +room for improvement in existing MLLMs toward approaching human-level +intelligence. On the other hand, through boosting MLLMs performances on our +CFMM in the future, potential avenues toward developing MLLMs with advanced +intelligence can be explored. + +
+
+
+
+
+ + ☆ Improving Pediatric Pneumonia Diagnosis with Adult Chest X-ray Images + Utilizing Contrastive Learning and Embedding Similarity + + +
+ Despite the advancement of deep learning-based computer-aided diagnosis (CAD)
+methods for pneumonia from adult chest x-ray (CXR) images, the performance of
+CAD methods applied to pediatric images remains suboptimal, mainly due to the
+lack of large-scale annotated pediatric imaging datasets. Establishing a proper
+framework to leverage existing adult large-scale CXR datasets can thus enhance
+pediatric pneumonia detection performance. In this paper, we propose a
+three-branch parallel path learning-based framework that utilizes both adult
+and pediatric datasets to improve the performance of deep learning models on
+pediatric test datasets. The paths are trained with pediatric only, adult only,
+and both types of CXRs, respectively. Our proposed framework utilizes the
+multi-positive contrastive loss to cluster the classwise embeddings and the
+embedding similarity loss among these three parallel paths to make the
+classwise embeddings as close as possible to reduce the effect of domain shift.
+Experimental evaluations on open-access adult and pediatric CXR datasets show
+that the proposed method achieves a superior AUROC score of 0.8464 compared to
+0.8348 obtained using the conventional approach of joint training on both
+datasets. The proposed approach thus paves the way for generalized CAD models
+that are effective for both adult and pediatric age groups.
+
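+ A generic multi-positive (supervised) contrastive loss in the spirit of the
+description, where every same-class embedding in the batch acts as a positive;
+this is a standard SupCon-style formulation, not necessarily the authors' exact
+loss:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def multi_positive_contrastive(embeddings, labels, temperature: float = 0.1):
+    """Pull together all embeddings that share a class label."""
+    z = F.normalize(embeddings, dim=-1)
+    sim = z @ z.T / temperature
+    eye = torch.eye(z.size(0), dtype=torch.bool, device=z.device)
+    sim = sim.masked_fill(eye, float("-inf"))            # drop self-similarity
+    pos = ((labels[:, None] == labels[None, :]) & ~eye).float()
+    log_prob = sim - sim.logsumexp(dim=1, keepdim=True)
+    loss = -(log_prob * pos).sum(1) / pos.sum(1).clamp(min=1)
+    return loss.mean()
+```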
+
+ comment: Accepted to International Conference of IEEE Engineering in Medicine + and Biology Society (EMBC), 2024 +
+
+
+
+
+ + ☆ Next Generation Loss Function for Image Classification + + +
+ Neural networks are trained by minimizing a loss function that defines the +discrepancy between the predicted model output and the target value. The +selection of the loss function is crucial to achieve task-specific behaviour +and highly influences the capability of the model. A variety of loss functions +have been proposed for a wide range of tasks affecting training and model +performance. For classification tasks, the cross entropy is the de-facto +standard and usually the first choice. Here, we try to experimentally challenge +the well-known loss functions, including cross entropy (CE) loss, by utilizing +the genetic programming (GP) approach, a population-based evolutionary +algorithm. GP constructs loss functions from a set of operators and leaf nodes +and these functions are repeatedly recombined and mutated to find an optimal +structure. Experiments were carried out on different small-sized datasets +CIFAR-10, CIFAR-100 and Fashion-MNIST using an Inception model. The 5 best +functions found were evaluated for different model architectures on a set of +standard datasets ranging from 2 to 102 classes and very different sizes. One +function, denoted as Next Generation Loss (NGL), clearly stood out showing same +or better performance for all tested datasets compared to CE. To evaluate the +NGL function on a large-scale dataset, we tested its performance on the +Imagenet-1k dataset where it showed improved top-1 accuracy compared to models +trained with identical settings and other losses. Finally, the NGL was trained +on a segmentation downstream task for Pascal VOC 2012 and COCO-Stuff164k +datasets improving the underlying model performance. + +
+
+
+
+
+ + ☆ Purposer: Putting Human Motion Generation in Context + + +
+ We present a novel method to generate human motion to populate 3D indoor
+scenes. It can be controlled with various combinations of conditioning signals
+such as a path in a scene, target poses, past motions, and scenes represented
+as 3D point clouds. State-of-the-art methods are either specialized to a single
+setting, require vast amounts of high-quality and diverse training data, or are
+unconditional models that do not integrate scene or other contextual
+information. As a consequence, they have limited applicability and rely on
+costly training data. To address these limitations, we propose a new method,
+dubbed Purposer, based on neural discrete representation learning. Our model is
+capable of exploiting, in a flexible manner, different types of information
+already present in open access large-scale datasets such as AMASS. First, we
+encode unconditional human motion into a discrete latent space. Second, an
+autoregressive generative model, conditioned with key contextual information,
+either with prompting or additive tokens, and trained for next-step prediction
+in this space, synthesizes sequences of latent indices. We further design a
+novel conditioning block to handle future conditioning information in such a
+causal model by using a network with two branches to compute separate stacks of
+features. In this manner, Purposer can generate realistic motion sequences in
+diverse test scenes. Through exhaustive evaluation, we demonstrate that our
+multi-contextual solution outperforms existing specialized approaches for
+specific contextual information, both in terms of quality and diversity. Our
+model is trained with short sequences, but a byproduct of being able to use
+various conditioning signals is that at test time different combinations can be
+used to chain short sequences together and generate long motions within a
+context scene.
+
+
+
+
+
+ + ☆ Neural Flow Diffusion Models: Learnable Forward Process for Improved + Diffusion Modelling + + +
+ Conventional diffusion models typically rely on a fixed forward process,
+which implicitly defines complex marginal distributions over latent variables.
+This can often complicate the reverse process's task of learning generative
+trajectories, and results in costly inference for diffusion models. To address
+these limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel
+framework that enhances diffusion models by supporting a broader range of
+forward processes beyond the fixed linear Gaussian. We also propose a novel
+parameterization technique for learning the forward process. Our framework
+provides an end-to-end, simulation-free optimization objective, effectively
+minimizing a variational upper bound on the negative log-likelihood.
+Experimental results demonstrate NFDM's strong performance, evidenced by
+state-of-the-art likelihood estimation. Furthermore, we investigate NFDM's
+capacity for learning generative dynamics with specific characteristics, such
+as deterministic straight-line trajectories. This exploration underscores
+NFDM's versatility and its potential for a wide range of applications.
+
+
+
+
+
+ + ☆ A Hybrid Generative and Discriminative PointNet on Unordered Point Sets + + +
+ As point clouds provide a natural and flexible representation usable in
+myriad applications (e.g., robotics and self-driving cars), the ability to
+synthesize point clouds for analysis becomes crucial. Recently, Xie et al.
+proposed a generative model for unordered point sets in the form of an
+energy-based model (EBM). Despite the model achieving impressive performance
+for point cloud generation, one separate model needs to be trained for each
+category to capture the complex point set distributions. Besides, their method
+is unable to classify point clouds directly and requires additional fine-tuning
+for classification. One interesting question is: Can we train a single network
+for a hybrid generative and discriminative model of point clouds? A similar
+question has recently been answered in the affirmative for images, introducing
+the framework of Joint Energy-based Model (JEM), which achieves high
+performance in image classification and generation simultaneously. This paper
+proposes GDPNet, the first hybrid Generative and Discriminative PointNet that
+extends JEM for point cloud classification and generation. Our GDPNet retains
+the strong discriminative power of modern PointNet classifiers, while
+generating point cloud samples rivaling state-of-the-art generative approaches.
+
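+ The JEM idea this abstract extends reinterprets a classifier's logits as an
+energy function over inputs; a minimal sketch of that reinterpretation is below
+(the generative term, which requires SGLD sampling from the energy, is omitted,
+and the PointNet backbone is a placeholder assumption):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def jem_terms(logits: torch.Tensor, labels: torch.Tensor):
+    """JEM view: E(x) = -logsumexp(logits), p(y|x) = softmax(logits)."""
+    energy = -torch.logsumexp(logits, dim=-1)   # unnormalised -log p(x)
+    clf_loss = F.cross_entropy(logits, labels)  # discriminative term
+    return energy, clf_loss
+
+# Hybrid training (sketch): minimise clf_loss plus a generative term that
+# contrasts the energy of real point clouds against samples drawn from the
+# model with stochastic gradient Langevin dynamics (SGLD) -- omitted here.
+```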
+
+
+
+
+ + ☆ Is Retain Set All You Need in Machine Unlearning? Restoring Performance + of Unlearned Models with Out-Of-Distribution Images + + +
+ In this paper, we introduce Selective-distillation for Class and
+Architecture-agnostic unleaRning (SCAR), a novel approximate unlearning method.
+SCAR efficiently eliminates specific information while preserving the model's
+test accuracy without using a retain set, which is a key component in
+state-of-the-art approximate unlearning algorithms. Our approach utilizes a
+modified Mahalanobis distance to guide the unlearning of the feature vectors of
+the instances to be forgotten, aligning them to the nearest wrong class
+distribution. Moreover, we propose a distillation-trick mechanism that distills
+the knowledge of the original model into the unlearning model with
+out-of-distribution images for retaining the original model's test performance
+without using any retain set. Importantly, we propose a self-forget version of
+SCAR that unlearns without having access to the forget set. We experimentally
+verified the effectiveness of our method on three public datasets, comparing it
+with state-of-the-art methods. Our method achieves higher performance than
+methods that operate without the retain set and performance comparable to the
+best methods that rely on the retain set.
+
+
+
+
+
+ + ☆ Zero-Shot Medical Phrase Grounding with Off-the-shelf Diffusion Models + + +
+ Localizing the exact pathological regions in a given medical scan is an
+important imaging problem that requires a large amount of bounding box ground
+truth annotations to be accurately solved. However, there exist alternative,
+potentially weaker, forms of supervision, such as accompanying free-text
+reports, which are readily available. The task of performing localization with
+textual guidance is commonly referred to as phrase grounding. In this work, we
+use a publicly available Foundation Model, namely the Latent Diffusion Model,
+to solve this challenging task. This choice is supported by the fact that the
+Latent Diffusion Model, despite being generative in nature, contains mechanisms
+(cross-attention) that implicitly align visual and textual features, thus
+leading to intermediate representations that are suitable for the task at hand.
+In addition, we aim to perform this task in a zero-shot manner, i.e., without
+any further training on target data, meaning that the model's weights remain
+frozen. To this end, we devise strategies to select features and also refine
+them via post-processing without extra learnable parameters. We compare our
+proposed method with state-of-the-art approaches which explicitly enforce
+image-text alignment in a joint embedding space via contrastive learning.
+Results on a popular chest X-ray benchmark indicate that our method is
+competitive with SOTA approaches on different types of pathology, and even
+outperforms them on average in terms of two metrics (mean IoU and AUC-ROC).
+Source code will be released upon acceptance.
+
+
+ comment: 8 pages, 3 figures, submitted to IEEE J-BHI Special Issue on + Foundation Models in Medical Imaging +
+
+
+
+
+ + ☆ Zero-Shot Stitching in Reinforcement Learning using Relative + Representations + + +
+ Visual Reinforcement Learning is a popular and powerful framework that takes +full advantage of the Deep Learning breakthrough. However, it is also known +that variations in the input (e.g., different colors of the panorama due to the +season of the year) or the task (e.g., changing the speed limit for a car to +respect) could require complete retraining of the agents. In this work, we +leverage recent developments in unifying latent representations to demonstrate +that it is possible to combine the components of an agent, rather than retrain +it from scratch. We build upon the recent relative representations framework +and adapt it for Visual RL. This allows us to create completely new agents +capable of handling environment-task combinations never seen during training. +Our work paves the road toward a more accessible and flexible use of +reinforcement learning. + +
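+ A minimal sketch of relative representations as usually defined: each
+embedding is re-expressed as its cosine similarities to a fixed set of anchor
+embeddings, which is what lets components trained in different absolute latent
+spaces be stitched together. Anchor selection is an assumption here:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def relative_representation(z: torch.Tensor, anchors: torch.Tensor):
+    """Map absolute embeddings (N, D) to similarities with K anchors -> (N, K)."""
+    z = F.normalize(z, dim=-1)
+    anchors = F.normalize(anchors, dim=-1)
+    return z @ anchors.T
+
+# Stitching sketch: an encoder and a policy head trained separately can be
+# combined as long as both consumed relative_representation(encoder(obs),
+# anchors) computed over a shared set of anchor observations.
+```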
+
+ comment: 13 pages, 10 figures, 4 tables +
+
+
+
+
+ + ☆ Robust CLIP-Based Detector for Exposing Diffusion Model-Generated Images + + +
+ Diffusion models (DMs) have revolutionized image generation, producing
+high-quality images with applications spanning various fields. However, their
+ability to create hyper-realistic images poses significant challenges in
+distinguishing between real and synthetic content, raising concerns about
+digital authenticity and potential misuse in creating deepfakes. This work
+introduces a robust detection framework that integrates image and text features
+extracted by the CLIP model with a Multilayer Perceptron (MLP) classifier. We
+propose a novel loss that can improve the detector's robustness and handle
+imbalanced datasets. Additionally, we flatten the loss landscape during the
+model training to improve the detector's generalization capabilities. The
+effectiveness of our method, which outperforms traditional detection
+techniques, is demonstrated through extensive experiments, underscoring its
+potential to set a new state-of-the-art approach in DM-generated image
+detection. The code is available at
+https://github.com/Purdue-M2/Robust_DM_Generated_Image_Detection.
+
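+ A minimal sketch of the detector head: an MLP over concatenated CLIP image
+and text features, assumed to be precomputed with a frozen CLIP model. The
+layer sizes and the use of plain cross-entropy instead of the paper's
+robustness-oriented loss and loss-landscape flattening are simplifying
+assumptions:
+
+```python
+import torch
+import torch.nn as nn
+
+class CLIPFeatureDetector(nn.Module):
+    """MLP classifier over concatenated CLIP image + text embeddings."""
+    def __init__(self, img_dim: int = 768, txt_dim: int = 768, hidden: int = 512):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(img_dim + txt_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 2),  # real vs. generated
+        )
+
+    def forward(self, img_feat, txt_feat):
+        return self.mlp(torch.cat([img_feat, txt_feat], dim=-1))
+
+# loss = nn.CrossEntropyLoss()(detector(img_feat, txt_feat), labels)
+```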
+
+
+
+
+ + ☆ Training-and-prompt-free General Painterly Harmonization Using + Image-wise Attention Sharing + + +
+ Painterly Image Harmonization aims at seamlessly blending disparate visual
+elements within a single coherent image. However, previous approaches often
+encounter significant limitations due to training data constraints, the need
+for time-consuming fine-tuning, or reliance on additional prompts. To surmount
+these hurdles, we design a Training-and-prompt-Free General Painterly
+Harmonization method using image-wise attention sharing (TF-GPH), which
+integrates a novel "share-attention module". This module redefines the
+traditional self-attention mechanism by allowing for comprehensive image-wise
+attention, facilitating the use of a state-of-the-art pretrained latent
+diffusion model without the typical training data limitations. Additionally, we
+introduce a "similarity reweighting" mechanism that enhances performance by
+effectively harnessing cross-image information, surpassing the capabilities of
+fine-tuning or prompt-based approaches. Finally, we recognize the deficiencies
+in existing benchmarks and propose the "General Painterly Harmonization
+Benchmark", which employs range-based evaluation metrics to more accurately
+reflect real-world applications. Extensive experiments demonstrate the superior
+efficacy of our method across various benchmarks. The code and web demo are
+available at https://github.com/BlueDyee/TF-GPH.
+
+
+
+
+
+ + ☆ Learn2Talk: 3D Talking Face Learns from 2D Talking Face + + +
+ Speech-driven facial animation methods usually fall into two main classes, 3D
+and 2D talking face, both of which have attracted considerable research
+attention in recent years. However, to the best of our knowledge, research on
+3D talking face has not gone as deep as that on 2D talking face in the aspects
+of lip-synchronization (lip-sync) and speech perception. To bridge the gap
+between the two sub-fields, we propose a learning framework named Learn2Talk,
+which can construct a better 3D talking face network by exploiting two
+expertise points from the field of 2D talking face. Firstly, inspired by the
+audio-video sync network, a 3D sync-lip expert model is devised for the pursuit
+of lip-sync between audio and 3D facial motion. Secondly, a teacher model
+selected from 2D talking face methods is used to guide the training of the
+audio-to-3D-motion regression network to yield higher 3D vertex accuracy.
+Extensive experiments show the advantages of the proposed framework in terms of
+lip-sync, vertex accuracy and speech perception, compared with the state of the
+art. Finally, we show two applications of the proposed framework: audio-visual
+speech recognition and speech-driven 3D Gaussian Splatting based avatar
+animation.
+
+
+
+
+
+ + ☆ 3D Multi-frame Fusion for Video Stabilization CVPR 2024 + + +
+ In this paper, we present RStab, a novel framework for video stabilization
+that integrates 3D multi-frame fusion through volume rendering. Departing from
+conventional methods, we introduce a 3D multi-frame perspective to generate
+stabilized images, addressing the challenge of full-frame generation while
+preserving structure. The core of our RStab framework lies in Stabilized
+Rendering (SR), a volume rendering module that fuses multi-frame information in
+3D space and extends beyond image fusion by incorporating feature fusion.
+Specifically, SR involves warping features and colors from multiple frames by
+projection, fusing them into descriptors to render the stabilized image.
+However, the precision of warped information depends on the projection
+accuracy, a factor significantly influenced by dynamic regions. In response, we
+introduce the Adaptive Ray Range (ARR) module to integrate depth priors,
+adaptively defining the sampling range for the projection process.
+Additionally, we propose Color Correction (CC) assisting geometric constraints
+with optical flow for accurate color aggregation. Thanks to these three
+modules, our RStab demonstrates superior performance compared with previous
+stabilizers in terms of field of view (FOV), image quality, and video stability
+across various datasets.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ MCM: Multi-condition Motion Synthesis Framework + + +
+ Conditional human motion synthesis (HMS) aims to generate human motion +sequences that conform to specific conditions. Text and audio represent the two +predominant modalities employed as HMS control conditions. While existing +research has primarily focused on single conditions, the multi-condition human +motion synthesis remains underexplored. In this study, we propose a +multi-condition HMS framework, termed MCM, based on a dual-branch structure +composed of a main branch and a control branch. This framework effectively +extends the applicability of the diffusion model, which is initially predicated +solely on textual conditions, to auditory conditions. This extension +encompasses both music-to-dance and co-speech HMS while preserving the +intrinsic quality of motion and the capabilities for semantic association +inherent in the original model. Furthermore, we propose the implementation of a +Transformer-based diffusion model, designated as MWNet, as the main branch. +This model adeptly apprehends the spatial intricacies and inter-joint +correlations inherent in motion sequences, facilitated by the integration of +multi-wise self-attention modules. Extensive experiments show that our method +achieves competitive results in single-condition and multi-condition HMS tasks. + +
+
+
+
+
+ + ☆ A Large-scale Medical Visual Task Adaptation Benchmark + + +
+ Visual task adaptation has been demonstrated to be effective in adapting
+pre-trained Vision Transformers (ViTs) to general downstream visual tasks using
+specialized learnable layers or tokens. However, there is not yet a large-scale
+benchmark to fully explore the effect of visual task adaptation on the
+realistic and important medical domain, particularly across diverse medical
+visual modalities, such as color images, X-ray, and CT. To close this gap, we
+present Med-VTAB, a large-scale Medical Visual Task Adaptation Benchmark
+consisting of 1.68 million medical images for diverse organs, modalities, and
+adaptation approaches. Based on Med-VTAB, we explore the scaling law of medical
+prompt tuning concerning tunable parameters and the generalizability of medical
+visual adaptation using non-medical/medical pre-train weights. Besides, we
+study the impact of patient ID out-of-distribution on medical visual
+adaptation, which is a real and challenging scenario. Furthermore, results from
+Med-VTAB indicate that a single pre-trained model falls short in medical task
+adaptation. Therefore, we introduce GMoE-Adapter, a novel method that combines
+medical and general pre-training weights through a gated mixture-of-experts
+adapter, achieving state-of-the-art results in medical visual task adaptation.
+
+
+
+
+
+ + ☆ FipTR: A Simple yet Effective Transformer Framework for Future Instance + Prediction in Autonomous Driving + + +
+ The future instance prediction from a Bird's Eye View(BEV) perspective is a +vital component in autonomous driving, which involves future instance +segmentation and instance motion prediction. Existing methods usually rely on a +redundant and complex pipeline which requires multiple auxiliary outputs and +post-processing procedures. Moreover, estimated errors on each of the auxiliary +predictions will lead to degradation of the prediction performance. In this +paper, we propose a simple yet effective fully end-to-end framework named +Future Instance Prediction Transformer(FipTR), which views the task as BEV +instance segmentation and prediction for future frames. We propose to adopt +instance queries representing specific traffic participants to directly +estimate the corresponding future occupied masks, and thus get rid of complex +post-processing procedures. Besides, we devise a flow-aware BEV predictor for +future BEV feature prediction composed of a flow-aware deformable attention +that takes backward flow guiding the offset sampling. A novel future instance +matching strategy is also proposed to further improve the temporal coherence. +Extensive experiments demonstrate the superiority of FipTR and its +effectiveness under different temporal BEV encoders. + +
+
+
+
+
+ + ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+
+
+
+ + ☆ Foundation Model assisted Weakly Supervised LiDAR Semantic Segmentation + + +
+ Current point cloud semantic segmentation has achieved great advances when
+given sufficient labels. However, the dense annotation of LiDAR point clouds
+remains prohibitively expensive and time-consuming, unable to keep up with the
+continuously growing volume of data. In this paper, we propose annotating
+images with scattered points, followed by utilizing SAM (a Foundation model) to
+generate semantic segmentation labels for the images. Finally, by mapping the
+segmentation labels of the images to the LiDAR space using the intrinsic and
+extrinsic parameters of the camera and LiDAR, we obtain labels for point cloud
+semantic segmentation, and release Scatter-KITTI and Scatter-nuScenes, the
+first datasets to utilize image segmentation-based SAM for weakly supervised
+point cloud semantic segmentation. Furthermore, to mitigate the influence of
+erroneous pseudo labels obtained from sparse annotations on point cloud
+features, we propose a multi-modal weakly supervised network for LiDAR semantic
+segmentation, called MM-ScatterNet. This network combines features from both
+point cloud and image modalities, enhancing the representation learning of
+point clouds by introducing consistency constraints between multi-modal
+features and point cloud features. On the SemanticKITTI dataset, we achieve 66%
+of fully supervised performance using only 0.02% of annotated data, and on the
+NuScenes dataset, we achieve 95% of fully supervised performance using only
+0.1% of labeled points.
+
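+ A sketch of the label-transfer step described: project LiDAR points into the
+image with the camera extrinsics and intrinsics, then read off the SAM-derived
+segmentation label at each projected pixel. The matrix conventions are an
+assumption:
+
+```python
+import numpy as np
+
+def project_labels(points_lidar: np.ndarray,    # (N, 3) xyz in the LiDAR frame
+                   seg_mask: np.ndarray,         # (H, W) semantic labels
+                   T_cam_from_lidar: np.ndarray, # (4, 4) extrinsics
+                   K: np.ndarray) -> np.ndarray: # (3, 3) intrinsics
+    """Per-point semantic label; -1 for points outside the camera image."""
+    n = points_lidar.shape[0]
+    pts_h = np.hstack([points_lidar, np.ones((n, 1))])
+    cam = (T_cam_from_lidar @ pts_h.T)[:3]                 # camera frame
+    in_front = cam[2] > 0
+    uv = (K @ cam)[:2] / np.clip(cam[2], 1e-6, None)
+    u, v = np.round(uv).astype(np.int64)
+    h, w = seg_mask.shape
+    inside = in_front & (u >= 0) & (u < w) & (v >= 0) & (v < h)
+    labels = np.full(n, -1, dtype=np.int64)
+    labels[inside] = seg_mask[v[inside], u[inside]]
+    return labels
+```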
+
+
+
+
+ + ☆ Language-Driven Active Learning for Diverse Open-Set 3D Object Detection + + +
+ Object detection is crucial for ensuring safe autonomous driving. However, +data-driven approaches face challenges when encountering minority or novel +objects in the 3D driving scene. In this paper, we propose VisLED, a +language-driven active learning framework for diverse open-set 3D Object +Detection. Our method leverages active learning techniques to query diverse and +informative data samples from an unlabeled pool, enhancing the model's ability +to detect underrepresented or novel objects. Specifically, we introduce the +Vision-Language Embedding Diversity Querying (VisLED-Querying) algorithm, which +operates in both open-world exploring and closed-world mining settings. In +open-world exploring, VisLED-Querying selects data points most novel relative +to existing data, while in closed-world mining, it mines new instances of known +classes. We evaluate our approach on the nuScenes dataset and demonstrate its +effectiveness compared to random sampling and entropy-querying methods. Our +results show that VisLED-Querying consistently outperforms random sampling and +offers competitive performance compared to entropy-querying despite the +latter's model-optimality, highlighting the potential of VisLED for improving +object detection in autonomous driving scenarios. + +
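+ A minimal sketch of novelty-driven querying in an embedding space (the
+open-world setting): greedily pick the unlabeled samples farthest from
+everything already labeled or selected. The greedy max-min rule and Euclidean
+distance are assumptions about one way such a query could look:
+
+```python
+import numpy as np
+
+def diversity_query(unlabeled: np.ndarray, labeled: np.ndarray, budget: int):
+    """Greedy max-min selection of the most novel unlabeled embeddings."""
+    selected = []
+    ref = labeled.copy()                       # assumed non-empty
+    for _ in range(budget):
+        d = np.linalg.norm(unlabeled[:, None, :] - ref[None, :, :], axis=-1)
+        novelty = d.min(axis=1)                # distance to nearest reference
+        novelty[selected] = -np.inf            # never pick the same item twice
+        idx = int(novelty.argmax())
+        selected.append(idx)
+        ref = np.vstack([ref, unlabeled[idx:idx + 1]])
+    return selected
+```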
+
+
+
+
+ + ☆ LSP Framework: A Compensatory Model for Defeating Trigger Reverse + Engineering via Label Smoothing Poisoning + + +
+ Deep neural networks are vulnerable to backdoor attacks. Among the existing +backdoor defense methods, trigger reverse engineering based approaches, which +reconstruct the backdoor triggers via optimizations, are the most versatile and +effective ones compared to other types of methods. In this paper, we summarize +and construct a generic paradigm for the typical trigger reverse engineering +process. Based on this paradigm, we propose a new perspective to defeat trigger +reverse engineering by manipulating the classification confidence of backdoor +samples. To determine the specific modifications of classification confidence, +we propose a compensatory model to compute the lower bound of the modification. +With proper modifications, the backdoor attack can easily bypass the trigger +reverse engineering based methods. To achieve this objective, we propose a +Label Smoothing Poisoning (LSP) framework, which leverages label smoothing to +specifically manipulate the classification confidences of backdoor samples. +Extensive experiments demonstrate that the proposed work can defeat the +state-of-the-art trigger reverse engineering based methods, and possess good +compatibility with a variety of existing backdoor attacks. + +
+
+
+
+
+ + ☆ Explainable Deepfake Video Detection using Convolutional Neural Network + and CapsuleNet + + +
+ Deepfake technology, derived from deep learning, seamlessly inserts
+individuals into digital media, irrespective of their actual participation. Its
+foundation lies in machine learning and Artificial Intelligence (AI).
+Initially, deepfakes served research, industry, and entertainment. While the
+concept has existed for decades, recent advancements render deepfakes nearly
+indistinguishable from reality. Accessibility has soared, empowering even
+novices to create convincing deepfakes. However, this accessibility raises
+security concerns. The primary deepfake creation algorithm, GAN (Generative
+Adversarial Network), employs machine learning to craft realistic images or
+videos. Our objective is to utilize CNN (Convolutional Neural Network) and
+CapsuleNet with LSTM to differentiate between deepfake-generated frames and
+originals. Furthermore, we aim to elucidate our model's decision-making process
+through Explainable AI, fostering transparent human-AI relationships and
+offering practical examples for real-life scenarios.
+
+
+
+
+
+ + ☆ ECOR: Explainable CLIP for Object Recognition + + +
+ Large Vision Language Models (VLMs), such as CLIP, have significantly +contributed to various computer vision tasks, including object recognition and +object detection. Their open vocabulary feature enhances their value. However, +their black-box nature and lack of explainability in predictions make them less +trustworthy in critical domains. Recently, some work has been done to force +VLMs to provide reasonable rationales for object recognition, but this often +comes at the expense of classification accuracy. In this paper, we first +propose a mathematical definition of explainability in the object recognition +task based on the joint probability distribution of categories and rationales, +then leverage this definition to fine-tune CLIP in an explainable manner. +Through evaluations of different datasets, our method demonstrates +state-of-the-art performance in explainable classification. Notably, it excels +in zero-shot settings, showcasing its adaptability. This advancement improves +explainable object recognition, enhancing trust across diverse applications. +The code will be made available online upon publication. + +
+
+
+
+
+ + ☆ COIN: Counterfactual inpainting for weakly supervised semantic + segmentation for medical images + + +
+ Deep learning is dramatically transforming the field of medical imaging and +radiology, enabling the identification of pathologies in medical images, +including computed tomography (CT) and X-ray scans. However, the performance of +deep learning models, particularly in segmentation tasks, is often limited by +the need for extensive annotated datasets. To address this challenge, the +capabilities of weakly supervised semantic segmentation are explored through +the lens of Explainable AI and the generation of counterfactual explanations. +The scope of this research is development of a novel counterfactual inpainting +approach (COIN) that flips the predicted classification label from abnormal to +normal by using a generative model. For instance, if the classifier deems an +input medical image X as abnormal, indicating the presence of a pathology, the +generative model aims to inpaint the abnormal region, thus reversing the +classifier's original prediction label. The approach enables us to produce +precise segmentations for pathologies without depending on pre-existing +segmentation masks. Crucially, image-level labels are utilized, which are +substantially easier to acquire than creating detailed segmentation masks. The +effectiveness of the method is demonstrated by segmenting synthetic targets and +actual kidney tumors from CT images acquired from Tartu University Hospital in +Estonia. The findings indicate that COIN greatly surpasses established +attribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an +alternative counterfactual explanation method introduced by Singla et al. This +evidence suggests that COIN is a promising approach for semantic segmentation +of tumors in CT images, and presents a step forward in making deep learning +applications more accessible and effective in healthcare, where annotated data +is scarce. + +
+
+ comment: This work has been accepted to be presented to The 2nd World + Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19, + 2024 - Valletta, Malta +
+
+
+
+
+ + ☆ Unveiling the Ambiguity in Neural Inverse Rendering: A Parameter + Compensation Analysis + + +
+ Inverse rendering aims to reconstruct the scene properties of objects solely +from multiview images. However, it is an ill-posed problem prone to producing +ambiguous estimations deviating from physically accurate representations. In +this paper, we utilize Neural Microfacet Fields (NMF), a state-of-the-art +neural inverse rendering method to illustrate the inherent ambiguity. We +propose an evaluation framework to assess the degree of compensation or +interaction between the estimated scene properties, aiming to explore the +mechanisms behind this ill-posed problem and potential mitigation strategies. +Specifically, we introduce artificial perturbations to one scene property and +examine how adjusting another property can compensate for these perturbations. +To facilitate such experiments, we introduce a disentangled NMF where material +properties are independent. The experimental findings underscore the intrinsic +ambiguity present in neural inverse rendering and highlight the importance of +providing additional guidance through geometry, material, and illumination +priors. + +
+
+
+
+
+ + ☆ Generative Modelling with High-Order Langevin Dynamics + + +
+ Diffusion generative modelling (DGM) based on stochastic differential
+equations (SDEs) with score matching has achieved unprecedented results in
+data generation. In this paper, we propose a novel fast, high-quality
+generative modelling method based on high-order Langevin dynamics (HOLD) with
+score matching. The approach is grounded in third-order Langevin dynamics. By
+augmenting previous SDEs, e.g. the variance-exploding or variance-preserving
+SDEs for single-data-variable processes, HOLD can simultaneously model
+position, velocity, and acceleration, thereby improving both the quality and
+the speed of data generation. HOLD is composed of one Ornstein-Uhlenbeck
+process and two Hamiltonians, which reduce the mixing time by two orders of
+magnitude. Empirical experiments on unconditional image generation with the
+public datasets CIFAR-10 and CelebA-HQ show that the effect is significant in
+both Frechet inception distance (FID) and negative log-likelihood, achieving a
+state-of-the-art FID of 1.85 on CIFAR-10.
+
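+ As a rough illustration of the dynamics described above, the toy sketch below
+simulates a generic third-order Langevin system with Euler-Maruyama updates:
+position is augmented with velocity and acceleration, and noise enters only
+through the acceleration channel. The coefficients, step sizes, and the simple
+standard-normal target are illustrative assumptions and are not taken from the
+HOLD paper.
+
+    import numpy as np
+
+    def third_order_langevin(score, x0, n_steps=2000, dt=1e-2, gamma=2.0, seed=0):
+        """Toy Euler-Maruyama simulation of third-order Langevin dynamics.
+        `score` approximates grad log p(x) of the target distribution."""
+        rng = np.random.default_rng(seed)
+        x, v, a = x0.copy(), np.zeros_like(x0), np.zeros_like(x0)
+        for _ in range(n_steps):
+            x = x + v * dt                         # position follows velocity
+            v = v + a * dt                         # velocity follows acceleration
+            drift = score(x) - v - gamma * a       # coupling plus friction
+            a = a + drift * dt + np.sqrt(2 * gamma * dt) * rng.standard_normal(x.shape)
+        return x
+
+    # usage on a standard normal target, for which score(x) = -x
+    sample = third_order_langevin(lambda x: -x, x0=np.full(5, 3.0))
+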
+
+ comment: Some of the results in this paper have been published or accepted at
+  conferences such as WACV 2024, ICASSP 2024, and ICME 2024
+
+
+
+
+ + ☆ Linearly-evolved Transformer for Pan-sharpening + + +
+ The vision transformer family has dominated the satellite pan-sharpening
+field, driven by the global spatial information modeling mechanism of its core
+self-attention ingredient. The standard modeling rule within these promising
+pan-sharpening methods is to roughly stack transformer variants in a cascaded
+manner. Despite the remarkable advancement, their success may come at the huge
+cost of model parameters and FLOPs, thus preventing their application on
+low-resource satellites. To address this trade-off between favorable
+performance and expensive computation, we tailor an efficient linearly-evolved
+transformer variant and employ it to construct a lightweight pan-sharpening
+framework. In detail, we delve into the popular cascaded transformer modeling
+of cutting-edge methods and develop an alternative 1-order linearly-evolved
+transformer variant with a 1-dimensional linear convolution chain to achieve
+the same function. In this way, our proposed method benefits from the cascaded
+modeling rule while achieving favorable performance in an efficient manner.
+Extensive experiments over multiple satellite datasets suggest that our
+proposed method achieves competitive performance against other
+state-of-the-art methods with fewer computational resources. Furthermore, the
+consistently favorable performance has been verified on the hyper-spectral
+image fusion task. Our main focus is to provide an alternative global modeling
+framework with an efficient structure. The code will be publicly available.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ TextSquare: Scaling up Text-Centric Visual Instruction Tuning + + +
+ Text-centric visual question answering (VQA) has made great strides with the
+development of Multimodal Large Language Models (MLLMs), yet open-source models
+still fall short of leading models like GPT4V and Gemini, partly due to a lack
+of extensive, high-quality instruction tuning data. To this end, we introduce a
+new approach for creating a massive, high-quality instruction-tuning dataset,
+Square-10M, which is generated using closed-source MLLMs. The data construction
+process, termed Square, consists of four steps: Self-Questioning, Answering,
+Reasoning, and Evaluation. Our experiments with Square-10M led to three key
+findings: 1) Our model, TextSquare, considerably surpasses previous open-source
+state-of-the-art text-centric MLLMs and sets a new standard on OCRBench
+(62.2%). It even outperforms top-tier models like GPT4V and Gemini in 6 of 10
+text-centric benchmarks. 2) Additionally, we demonstrate the critical role of
+VQA reasoning data in offering comprehensive contextual insights for specific
+questions. This not only improves accuracy but also significantly mitigates
+hallucinations. Specifically, TextSquare scores an average of 75.1% across four
+general VQA and hallucination evaluation datasets, outperforming previous
+state-of-the-art models. 3) Notably, the phenomenon observed in scaling
+text-centric VQA datasets reveals a vivid pattern: the exponential increase of
+instruction tuning data volume is directly proportional to the improvement in
+model performance, thereby validating the necessity of the dataset scale and
+the high quality of Square-10M.
+
+
+
+
+
+ + ☆ A Point-Based Approach to Efficient LiDAR Multi-Task Perception + + +
+ Multi-task networks can potentially improve performance and computational
+efficiency compared to single-task networks, facilitating online deployment.
+However, current multi-task architectures in point cloud perception combine
+multiple task-specific point cloud representations, each requiring a separate
+feature encoder and making the network structures bulky and slow. We propose
+PAttFormer, an efficient multi-task architecture for joint semantic
+segmentation and object detection in point clouds that only relies on a
+point-based representation. The network builds on transformer-based feature
+encoders using neighborhood attention and grid-pooling and a query-based
+detection decoder using a novel 3D deformable-attention detection head design.
+Unlike other LiDAR-based multi-task architectures, our proposed PAttFormer does
+not require separate feature encoders for multiple task-specific point cloud
+representations, resulting in a network that is 3x smaller and 1.4x faster
+while achieving competitive performance on the nuScenes and KITTI benchmarks
+for autonomous driving perception. Our extensive evaluations show substantial
+gains from multi-task learning, improving LiDAR semantic segmentation by +1.7%
+in mIoU and 3D object detection by +1.7% in mAP on the nuScenes benchmark
+compared to the single-task models.
+
+
+ comment: 8 pages, 3 figures, 8 tables +
+
+
+
+
+ + ☆ MambaMOS: LiDAR-based 3D Moving Object Segmentation with Motion-aware + State Space Model + + +
+ LiDAR-based Moving Object Segmentation (MOS) aims to locate and segment
+moving objects in point clouds of the current scan using motion information
+from previous scans. Despite the promising results achieved by previous MOS
+methods, several key issues, such as the weak coupling of temporal and spatial
+information, still need further study. In this paper, we propose a novel
+LiDAR-based 3D Moving Object Segmentation with Motion-aware State Space Model,
+termed MambaMOS. Firstly, we develop a novel embedding module, the Time Clue
+Bootstrapping Embedding (TCBE), to enhance the coupling of temporal and spatial
+information in point clouds and alleviate the issue of overlooked temporal
+clues. Secondly, we introduce the Motion-aware State Space Model (MSSM) to
+endow the model with the capacity to understand the temporal correlations of
+the same object across different time steps. Specifically, MSSM emphasizes the
+motion states of the same object at different time steps through two distinct
+temporal modeling and correlation steps. We utilize an improved state space
+model to represent these motion differences, thereby modeling the motion
+states more effectively. Finally, extensive experiments on the
+SemanticKITTI-MOS and KITTI-Road benchmarks demonstrate that the proposed
+MambaMOS achieves state-of-the-art performance. The source code of this work
+will be made publicly available at https://github.com/Terminal-K/MambaMOS.
+
+
+ comment: The source code will be made publicly available at + https://github.com/Terminal-K/MambaMOS +
+
+
+
+
+ + ☆ Contrastive Gaussian Clustering: Weakly Supervised 3D Scene Segmentation + + +
+ We introduce Contrastive Gaussian Clustering, a novel approach capable of
+providing segmentation masks from any viewpoint and of enabling 3D segmentation
+of the scene. Recent works in novel-view synthesis have shown how to model the
+appearance of a scene via a cloud of 3D Gaussians, and how to generate accurate
+images from a given viewpoint by projecting on it the Gaussians before $\alpha$
+blending their color. Following this example, we train a model to also include
+a segmentation feature vector for each Gaussian. These can then be used for 3D
+scene segmentation, by clustering Gaussians according to their feature vectors;
+and to generate 2D segmentation masks, by projecting the Gaussians on a plane
+and $\alpha$ blending over their segmentation features. Using a combination of
+contrastive learning and spatial regularization, our method can be trained on
+inconsistent 2D segmentation masks, and still learn to generate segmentation
+masks consistent across all views. Moreover, the resulting model is extremely
+accurate, improving the IoU accuracy of the predicted masks by $+8\%$ over the
+state of the art. Code and trained models will be released soon.
+
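+ The alpha blending of per-Gaussian segmentation features mentioned above can
+be illustrated with the short sketch below: along one ray, each Gaussian
+contributes its feature vector weighted by its alpha and the accumulated
+transmittance, exactly as colours are composited in Gaussian splatting. The
+per-ray arrays are hypothetical toy inputs, not the authors' implementation.
+
+    import numpy as np
+
+    def composite_features(alphas, features):
+        """Front-to-back alpha compositing of per-Gaussian feature vectors:
+        weight_i = alpha_i * prod_{j<i} (1 - alpha_j)."""
+        transmittance = 1.0
+        out = np.zeros(features.shape[1])
+        for alpha, feat in zip(alphas, features):
+            out += transmittance * alpha * feat
+            transmittance *= (1.0 - alpha)
+        return out
+
+    # three Gaussians hit by the ray, each with a 4-d segmentation feature
+    blended = composite_features(np.array([0.6, 0.3, 0.8]), np.random.rand(3, 4))
+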
+
+
+
+
+ + ☆ Sentiment-oriented Transformer-based Variational Autoencoder Network for + Live Video Commenting + + +
+ Automatic live video commenting has attracted increasing attention due to its
+significance in narration generation, topic explanation, etc. However, the
+diverse sentiment consideration of the generated comments is missing from
+current methods. Sentimental factors are critical in interactive commenting,
+yet they have received little research attention so far. Thus, in this paper,
+we propose a Sentiment-oriented Transformer-based Variational Autoencoder
+(So-TVAE) network which consists of a sentiment-oriented diversity encoder
+module and a batch attention module, to achieve diverse video commenting with
+multiple sentiments and multiple semantics. Specifically, our
+sentiment-oriented diversity encoder elegantly combines VAE and a random mask
+mechanism to achieve semantic diversity under sentiment guidance, which is then
+fused with cross-modal features to generate live video comments. Furthermore, a
+batch attention module is also proposed in this paper to alleviate the problem
+of missing sentimental samples, caused by the data imbalance, which is common
+in live videos as the popularity of videos varies. Extensive experiments on
+Livebot and VideoIC datasets demonstrate that the proposed So-TVAE outperforms
+the state-of-the-art methods in terms of the quality and diversity of generated
+comments. Related code is available at https://github.com/fufy1024/So-TVAE.
+
+
+ comment: 27 pages, 10 figures, ACM Transactions on Multimedia Computing, + Communications and Applications, 2024 +
+
+
+
+
+ + ☆ EfficientGS: Streamlining Gaussian Splatting for Large-Scale + High-Resolution Scene Representation + + +
+ In the domain of 3D scene representation, 3D Gaussian Splatting (3DGS) has +emerged as a pivotal technology. However, its application to large-scale, +high-resolution scenes (exceeding 4k$\times$4k pixels) is hindered by the +excessive computational requirements for managing a large number of Gaussians. +Addressing this, we introduce 'EfficientGS', an advanced approach that +optimizes 3DGS for high-resolution, large-scale scenes. We analyze the +densification process in 3DGS and identify areas of Gaussian +over-proliferation. We propose a selective strategy, limiting Gaussian increase +to key primitives, thereby enhancing the representational efficiency. +Additionally, we develop a pruning mechanism to remove redundant Gaussians, +those that are merely auxiliary to adjacent ones. For further enhancement, we +integrate a sparse order increment for Spherical Harmonics (SH), designed to +alleviate storage constraints and reduce training overhead. Our empirical +evaluations, conducted on a range of datasets including extensive 4K+ aerial +images, demonstrate that 'EfficientGS' not only expedites training and +rendering times but also achieves this with a model size approximately tenfold +smaller than conventional 3DGS while maintaining high rendering fidelity. + +
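+ As a concrete picture of the selective densification and pruning described
+above, the following sketch applies two simple rules to a set of Gaussians:
+drop those whose opacity contributes too little, and allow densification only
+for the small fraction with the largest accumulated view-space gradients. The
+criteria and thresholds are illustrative assumptions, not the paper's exact
+rules.
+
+    import numpy as np
+
+    def select_gaussians(opacities, grad_norms, min_opacity=0.01, densify_top=0.1):
+        """Return (keep, densify) boolean masks over the Gaussians."""
+        keep = opacities > min_opacity                     # prune negligible ones
+        cutoff = np.quantile(grad_norms, 1.0 - densify_top)
+        densify = keep & (grad_norms >= cutoff)            # only "key" primitives grow
+        return keep, densify
+
+    keep, densify = select_gaussians(np.random.rand(1000), np.random.rand(1000))
+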
+
+
+
+
+ + ☆ Camera Agnostic Two-Head Network for Ego-Lane Inference + + +
+ Vision-based ego-lane inference using High-Definition (HD) maps is essential
+in autonomous driving and advanced driver assistance systems. The traditional
+approach necessitates well-calibrated cameras, which constrains variation in
+camera configuration, since the algorithm relies on intrinsic and extrinsic
+calibration. In this paper, we propose a learning-based ego-lane inference
+method that directly estimates the ego-lane index from a single image. To
+enhance robust performance, our model incorporates a two-head structure that
+infers the ego-lane from two perspectives simultaneously. Furthermore, we
+utilize an attention mechanism guided by vanishing point-and-line to adapt to
+changes in viewpoint without requiring accurate calibration. The high
+adaptability of our model was validated in diverse environments, devices, and
+camera mounting points and orientations.
+
+
+
+
+
+ + ☆ MixLight: Borrowing the Best of both Spherical Harmonics and Gaussian + Models + + +
+ Accurately estimating scene lighting is critical for applications such as +mixed reality. Existing works estimate illumination by generating illumination +maps or regressing illumination parameters. However, the method of generating +illumination maps has poor generalization performance and parametric models +such as Spherical Harmonic (SH) and Spherical Gaussian (SG) fall short in +capturing high-frequency or low-frequency components. This paper presents +MixLight, a joint model that utilizes the complementary characteristics of SH +and SG to achieve a more complete illumination representation, which uses SH +and SG to capture low-frequency ambient and high-frequency light sources +respectively. In addition, a special spherical light source sparsemax +(SLSparsemax) module that refers to the position and brightness relationship +between spherical light sources is designed to improve their sparsity, which is +significant but omitted by prior works. Extensive experiments demonstrate that +MixLight surpasses state-of-the-art (SOTA) methods on multiple metrics. In +addition, experiments on Web Dataset also show that MixLight as a parametric +method has better generalization performance than non-parametric methods. + +
+
+
+
+
+ + ☆ Continual Learning on a Diet: Learning from Sparsely Labeled Streams + Under Constrained Computation + + +
+ We propose and study a realistic Continual Learning (CL) setting where +learning algorithms are granted a restricted computational budget per time step +while training. We apply this setting to large-scale semi-supervised Continual +Learning scenarios with sparse label rates. Previous proficient CL methods +perform very poorly in this challenging setting. Overfitting to the sparse +labeled data and insufficient computational budget are the two main culprits +for such a poor performance. Our new setting encourages learning methods to +effectively and efficiently utilize the unlabeled data during training. To that +end, we propose a simple but highly effective baseline, DietCL, which utilizes +both unlabeled and labeled data jointly. DietCL meticulously allocates +computational budget for both types of data. We validate our baseline, at +scale, on several datasets, e.g., CLOC, ImageNet10K, and CGLM, under constraint +budget setups. DietCL outperforms, by a large margin, all existing supervised +CL algorithms as well as more recent continual semi-supervised methods. Our +extensive analysis and ablations demonstrate that DietCL is stable under a full +spectrum of label sparsity, computational budget, and various other ablations. + +
+
+
+
+
+ + ☆ The Solution for the CVPR2024 NICE Image Captioning Challenge + + +
+ This report introduces a solution to Topic 1 Zero-shot Image Captioning of
+the 2024 NICE challenge: New frontiers for zero-shot Image Captioning
+Evaluation. In contrast to the NICE 2023 datasets, this challenge involves new
+annotations by humans with significant differences in caption style and
+content. Therefore, we enhance image captions effectively through retrieval
+augmentation and caption grading methods. At the data level, we utilize
+high-quality captions generated by image caption models as training data to
+address the gap in text styles. At the model level, we employ OFA (a
+large-scale visual-language pre-training model based on handcrafted templates)
+to perform the image captioning task. Subsequently, we propose a caption-level
+strategy for the high-quality caption data generated by the image caption
+models and integrate it with a retrieval augmentation strategy into the
+template to compel the model to generate higher quality, more matching, and
+semantically enriched captions based on the retrieval augmentation prompts.
+Our approach ranks first on the leaderboard, achieving a CIDEr score of 234.11
+and ranking first in all other metrics.
+
+
+
+
+
+ + ☆ DLoRA-TrOCR: Mixed Text Mode Optical Character Recognition Based On + Transformer + + +
+ With the continuous development of OCR technology and the expansion of +application fields, text recognition in complex scenes has become a key +challenge. Factors such as multiple fonts, mixed scenes and complex layouts +seriously affect the recognition accuracy of traditional OCR models. Although +OCR models based on deep learning have performed well in specific fields or +similar data sets in recent years, the generalization ability and robustness of +the model are still a big challenge when facing complex environments with +multiple scenes. Furthermore, training an OCR model from scratch or fine-tuning +all parameters is very demanding on computing resources and inference time, +which limits the flexibility of its application. This study focuses on a +fundamental aspect of mixed text recognition in response to the challenges +mentioned above, which involves effectively fine-tuning the pre-trained basic +OCR model to demonstrate exceptional performance across various downstream +tasks. To this end, we propose a parameter-efficient hybrid text recognition +method based on pre-trained OCR Transformer, namely DLoRA-TrOCR. This method +embeds DoRA into the image encoder and LoRA into the internal structure of the +text decoder, enabling efficient parameter fine-tuning for downstream tasks. +Experimental results show that compared to similar parameter adjustment +methods, our model DLoRA-TrOCR has the smallest number of parameters and +performs better. It can achieve state-of-the-art performance on complex scene +data sets involving simultaneous recognition of mixed handwritten, printed and +street view texts. + +
+
+
+
+
+ + ☆ PATE-TripleGAN: Privacy-Preserving Image Synthesis with Gaussian + Differential Privacy + + +
+ Conditional Generative Adversarial Networks (CGANs) exhibit significant
+potential in supervised learning model training by virtue of their ability to
+generate realistic labeled images. However, numerous studies have indicated the
+privacy leakage risk in CGAN models. The solution DPCGAN, incorporating the
+differential privacy framework, faces challenges such as heavy reliance on
+labeled data for model training and potential disruptions to original gradient
+information due to excessive gradient clipping, making it difficult to ensure
+model accuracy. To address these challenges, we present a privacy-preserving
+training framework called PATE-TripleGAN. This framework incorporates a
+classifier to pre-classify unlabeled data, establishing a three-party min-max
+game to reduce dependence on labeled data. Furthermore, we present a hybrid
+gradient desensitization algorithm based on the Private Aggregation of Teacher
+Ensembles (PATE) framework and the Differentially Private Stochastic Gradient
+Descent (DPSGD) method. This algorithm allows the model to retain gradient
+information more effectively while ensuring privacy protection, thereby
+enhancing the model's utility. Privacy analysis and extensive experiments
+affirm that the PATE-TripleGAN model can generate a higher quality labeled
+image dataset while ensuring the privacy of the training data.
+
+
+
+
+
+ + ☆ Separate in the Speech Chain: Cross-Modal Conditional Audio-Visual + Target Speech Extraction IJCAI 2024 + + +
+ The integration of visual cues has revitalized the performance of the target +speech extraction task, elevating it to the forefront of the field. +Nevertheless, this multi-modal learning paradigm often encounters the challenge +of modality imbalance. In audio-visual target speech extraction tasks, the +audio modality tends to dominate, potentially overshadowing the importance of +visual guidance. To tackle this issue, we propose AVSepChain, drawing +inspiration from the speech chain concept. Our approach partitions the +audio-visual target speech extraction task into two stages: speech perception +and speech production. In the speech perception stage, audio serves as the +dominant modality, while visual information acts as the conditional modality. +Conversely, in the speech production stage, the roles are reversed. This +transformation of modality status aims to alleviate the problem of modality +imbalance. Additionally, we introduce a contrastive semantic matching loss to +ensure that the semantic information conveyed by the generated speech aligns +with the semantic information conveyed by lip movements during the speech +production stage. Through extensive experiments conducted on multiple benchmark +datasets for audio-visual target speech extraction, we showcase the superior +performance achieved by our proposed method. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Generalized Few-Shot Meets Remote Sensing: Discovering Novel Classes in + Land Cover Mapping via Hybrid Semantic Segmentation Framework CVPR 2024 + + +
+ Land-cover mapping is one of the vital applications in Earth observation,
+aiming at classifying each pixel's land-cover type of remote-sensing images. As
+natural and human activities change the landscape, the land-cover map needs to
+be rapidly updated. However, discovering newly appeared land-cover types in
+existing classification systems is still a non-trivial task hindered by various
+scales of complex land objects and insufficient labeled data over a wide-span
+geographic area. In this paper, we propose a generalized few-shot
+segmentation-based framework, named SegLand, to update novel classes in
+high-resolution land-cover mapping. Specifically, the proposed framework is
+designed in three parts: (a) Data pre-processing: the base training set and the
+few-shot support sets of novel classes are analyzed and augmented; (b) Hybrid
+segmentation structure: multiple base learners and a modified Projection onto
+Orthogonal Prototypes (POP) network are combined to enhance the base-class
+recognition and to mine novel classes from insufficient labeled data; (c)
+Ultimate fusion: the semantic segmentation results of the base learners and POP
+network are reasonably fused. The proposed framework won first place on the
+leaderboard of the OpenEarthMap Land Cover Mapping Few-Shot Challenge.
+Experiments demonstrate the superiority of the framework for automatically
+updating novel land-cover classes with limited labeled data.
+
+
+ comment: 11 pages, 11 figures, accepted by CVPR 2024 L3D-IVU Workshop +
+
+
+
+
+ + ☆ PDF-MVQA: A Dataset for Multimodal Information Retrieval in PDF-based + Visual Question Answering IJCAI 2024 + + +
+ Document Question Answering (QA) presents a challenge in understanding +visually-rich documents (VRD), particularly those dominated by lengthy textual +content like research journal articles. Existing studies primarily focus on +real-world documents with sparse text, while challenges persist in +comprehending the hierarchical semantic relations among multiple pages to +locate multimodal components. To address this gap, we propose PDF-MVQA, which +is tailored for research journal articles, encompassing multiple pages and +multimodal information retrieval. Unlike traditional machine reading +comprehension (MRC) tasks, our approach aims to retrieve entire paragraphs +containing answers or visually rich document entities like tables and figures. +Our contributions include the introduction of a comprehensive PDF Document VQA +dataset, allowing the examination of semantically hierarchical layout +structures in text-dominant documents. We also present new VRD-QA frameworks +designed to grasp textual contents and relations among document layouts +simultaneously, extending page-level understanding to the entire multi-page +document. Through this work, we aim to enhance the capabilities of existing +vision-and-language models in handling challenges posed by text-dominant +documents in VRD-QA. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ☆ Improving Prediction Accuracy of Semantic Segmentation Methods Using + Convolutional Autoencoder Based Pre-processing Layers + + +
+ In this paper, we propose a method to improve the prediction accuracy of
+semantic segmentation methods as follows: (1) construct a neural network that
+has pre-processing layers based on a convolutional autoencoder ahead of a
+semantic segmentation network, and (2) train the entire network initialized by
+the weights of the pre-trained autoencoder. We applied this method to the fully
+convolutional network (FCN) and experimentally compared its prediction accuracy
+on the Cityscapes dataset. The mean IoU of the proposed target model with the
+He normal initialization is 18.7% higher than that of FCN with the He normal
+initialization. In addition, those of the modified models of the target model
+are significantly higher than that of FCN with the He normal initialization.
+The accuracy and loss curves during training show that these gains result from
+an improvement in generalization ability. All of these results provide strong
+evidence that the proposed method is significantly effective in improving the
+prediction accuracy of FCN. The proposed method has the following features: it
+is comparatively simple, whereas its effect on improving the generalization
+ability and prediction accuracy of FCN is significant; the increase in the
+number of parameters it introduces is very small, although the increase in
+computation time is substantial. In principle, the proposed method can be
+applied to other semantic segmentation methods. For semantic segmentation,
+there is at present no established way to improve the prediction accuracy of
+existing methods; to our knowledge, no method the same as or similar to ours
+has been published or used in practice. Therefore, we believe that our method
+is useful in practice and worthy of being widely known and used.
+
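+ The construction in step (1) can be sketched in a few lines of PyTorch: a
+small convolutional autoencoder is placed ahead of an arbitrary segmentation
+network, and its weights are loaded from a reconstruction pre-training run
+before the whole model is fine-tuned end to end. Layer sizes, the checkpoint
+name, and the stand-in segmentation head are illustrative assumptions.
+
+    import torch
+    import torch.nn as nn
+
+    class ConvAutoencoder(nn.Module):
+        """Small convolutional autoencoder used as pre-processing layers."""
+        def __init__(self, channels=3):
+            super().__init__()
+            self.encoder = nn.Sequential(
+                nn.Conv2d(channels, 32, 3, padding=1), nn.ReLU(),
+                nn.Conv2d(32, 32, 3, padding=1), nn.ReLU())
+            self.decoder = nn.Sequential(
+                nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
+                nn.Conv2d(32, channels, 3, padding=1))
+        def forward(self, x):
+            return self.decoder(self.encoder(x))
+
+    class PreprocessedSegNet(nn.Module):
+        """Autoencoder layers placed ahead of a segmentation network."""
+        def __init__(self, autoencoder, seg_net):
+            super().__init__()
+            self.pre = autoencoder   # initialized from the pre-trained autoencoder
+            self.seg = seg_net       # e.g. an FCN; any nn.Module works here
+        def forward(self, x):
+            return self.seg(self.pre(x))
+
+    ae = ConvAutoencoder()
+    # ae.load_state_dict(torch.load("autoencoder_pretrained.pt"))  # hypothetical file
+    model = PreprocessedSegNet(ae, seg_net=nn.Conv2d(3, 19, 1))    # 1x1 conv stands in for an FCN
+    out = model(torch.randn(1, 3, 64, 64))
+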
+
+ comment: 13 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ uTRAND: Unsupervised Anomaly Detection in Traffic Trajectories + + +
+ Deep learning-based approaches have achieved significant improvements on
+public video anomaly datasets, but often do not perform well in real-world
+applications. This paper addresses two issues: the lack of labeled data and the
+difficulty of explaining the predictions of a neural network. To this end, we
+present a framework called uTRAND, which shifts the problem of anomalous
+trajectory prediction from the pixel space to a semantic-topological domain.
+The framework detects and tracks all types of traffic agents in bird's-eye-view
+videos of traffic cameras mounted at an intersection. By conceptualizing the
+intersection as a patch-based graph, it is shown that the framework learns and
+models the normal behaviour of traffic agents without costly manual labeling.
+Furthermore, uTRAND makes it possible to formulate simple rules for classifying
+anomalous trajectories in a way suited to human interpretation. We show that
+uTRAND outperforms other state-of-the-art approaches on a dataset of anomalous
+trajectories collected in a real-world setting, while producing explainable
+detection results.
+
+
+
+
+
+ + ☆ Dynamic Temperature Knowledge Distillation + + +
+ Temperature plays a pivotal role in moderating label softness in the realm of
+knowledge distillation (KD). Traditional approaches often employ a static
+temperature throughout the KD process, which fails to address the nuanced
+complexities of samples with varying levels of difficulty and overlooks the
+distinct capabilities of different teacher-student pairings. This leads to a
+less-than-ideal transfer of knowledge. To improve the process of knowledge
+propagation, we propose Dynamic Temperature Knowledge Distillation (DTKD),
+which introduces a dynamic, cooperative temperature control for both teacher
+and student models simultaneously within each training iteration. In
+particular, we propose "sharpness" as a metric to quantify the smoothness of a
+model's output distribution. By minimizing the sharpness difference between
+the teacher and the student, we can derive sample-specific temperatures for
+them respectively. Extensive experiments on CIFAR-100 and ImageNet-2012
+demonstrate that DTKD performs comparably to leading KD techniques, with added
+robustness in Target Class KD and Non-target Class KD scenarios. The code is
+available at https://github.com/JinYu1998/DTKD.
+
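+ A minimal sketch of the idea reads as follows. Since the abstract does not
+give DTKD's exact definition of "sharpness", the code approximates it with the
+negative entropy of the softened outputs and adjusts only the student's
+per-sample temperature via a simple grid search, which is an assumption rather
+than the authors' formula; the teacher temperature is kept fixed.
+
+    import torch
+    import torch.nn.functional as F
+
+    def entropy(logits, T):
+        p = F.softmax(logits / T, dim=-1)
+        return -(p * p.clamp_min(1e-12).log()).sum(-1)
+
+    def dynamic_temperature_kd(student_logits, teacher_logits, base_T=4.0):
+        """Sample-wise dynamic-temperature distillation sketch: pick a student
+        temperature per sample so its sharpness matches the teacher's, then
+        apply the usual KL distillation loss."""
+        with torch.no_grad():
+            target_sharpness = -entropy(teacher_logits, base_T)        # per sample
+            candidate_T = torch.linspace(1.0, 8.0, steps=29)
+            sharp = torch.stack([-entropy(student_logits, t) for t in candidate_T])
+            idx = (sharp - target_sharpness).abs().argmin(dim=0)
+            student_T = candidate_T[idx]                               # per-sample T
+        log_p = F.log_softmax(student_logits / student_T[:, None], dim=-1)
+        q = F.softmax(teacher_logits / base_T, dim=-1)
+        return F.kl_div(log_p, q, reduction="batchmean") * base_T ** 2
+
+    loss = dynamic_temperature_kd(torch.randn(8, 100), torch.randn(8, 100))
+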
+
+
+
+
+ + ☆ Modeling Multi-Granularity Context Information Flow for Pavement Crack + Detection + + +
+ Crack detection has become an indispensable, interesting yet challenging task
+in the computer vision community. In particular, pavement cracks have a highly
+complex spatial structure, a low-contrast background and weak spatial
+continuity, posing a significant challenge to effective crack detection
+methods. In this paper, we address these problems from a perspective that
+utilizes the context of the cracks and propose an end-to-end deep learning
+method to model the context information flow. To precisely localize cracks in
+an image, it is critical to effectively extract and aggregate
+multi-granularity context, including the fine-grained local context around the
+cracks (at the spatial level) and the coarse-grained semantics (at the segment
+level). Concretely, in a Convolutional Neural Network (CNN), low-level features
+extracted by the shallow layers represent the local information, while the deep
+layers extract the semantic features. A second main insight of this work is
+that the semantic context should serve as guidance for the local context
+features. Guided by these insights, we first apply dilated convolution as the
+backbone feature extractor to model the local context, and then build a context
+guidance module that leverages semantic context to guide local feature
+extraction at multiple stages. To handle label alignment between stages, we
+apply the Multiple Instance Learning (MIL) strategy to align the high-level
+features with the low-level ones in the stage-wise context flow. In addition,
+compared with existing public crack datasets, the Bitumen Pavement Crack (BPC)
+dataset we release is, to the best of our knowledge, the largest, most complex
+and most challenging. The experimental results on the three crack datasets
+demonstrate that the proposed method performs well and outperforms the current
+state-of-the-art methods.
+
+
+
+
+
+ + ☆ ESC: Evolutionary Stitched Camera Calibration in the Wild CEC 2024 + + +
+ This work introduces a novel end-to-end approach for estimating extrinsic +parameters of cameras in multi-camera setups on real-life sports fields. We +identify the source of significant calibration errors in multi-camera +environments and address the limitations of existing calibration methods, +particularly the disparity between theoretical models and actual sports field +characteristics. We propose the Evolutionary Stitched Camera calibration (ESC) +algorithm to bridge this gap. It consists of image segmentation followed by +evolutionary optimization of a novel loss function, providing a unified and +accurate multi-camera calibration solution with high visual fidelity. The +outcome allows the creation of virtual stitched views from multiple video +sources, being as important for practical applications as numerical accuracy. +We demonstrate the superior performance of our approach compared to +state-of-the-art methods across diverse real-life football fields with varying +physical characteristics. + +
+
+ comment: Accepted for IEEE CEC 2024 +
+
+
+
+
+ + ☆ Improving Chinese Character Representation with Formation Tree + + +
+ Learning effective representations for Chinese characters presents unique
+challenges, primarily due to the vast number of characters and their continuous
+growth, which requires models to handle an expanding category space.
+Additionally, the inherent sparsity of character usage complicates the
+generalization of learned representations. Prior research has explored
+radical-based sequences to overcome these issues, achieving progress in
+recognizing unseen characters. However, these approaches fail to fully exploit
+the inherent tree structure of such sequences. To address these limitations and
+leverage established data properties, we propose Formation Tree-CLIP (FT-CLIP).
+This model utilizes formation trees to represent characters and incorporates a
+dedicated tree encoder, significantly improving performance in both seen and
+unseen character recognition tasks. We further introduce masking for both
+character images and tree nodes, enabling efficient and effective training.
+This approach accelerates training significantly (by a factor of 2 or more)
+while enhancing accuracy. Extensive experiments show that processing characters
+through formation trees aligns better with their inherent properties than
+direct sequential methods, significantly enhancing the generality and usability
+of the representations.
+
+
+
+
+
+ + ☆ VoxAtnNet: A 3D Point Clouds Convolutional Neural Network for + Generalizable Face Presentation Attack Detection + + +
+ Facial biometrics are an essential component of smartphones to ensure
+reliable and trustworthy authentication. However, face biometric systems are
+vulnerable to Presentation Attacks (PAs), and the availability of more
+sophisticated presentation attack instruments such as 3D silicone face masks
+will allow attackers to deceive face recognition systems easily. In this work,
+we propose a novel Presentation Attack Detection (PAD) algorithm based on 3D
+point clouds captured using the frontal camera of a smartphone to detect
+presentation attacks. The proposed PAD algorithm, VoxAtnNet, voxelizes 3D point
+clouds to preserve their spatial structure. The voxelized 3D samples are then
+used to train the novel convolutional attention network to detect PAs on the
+smartphone. Extensive experiments were carried out on the newly constructed 3D
+face point cloud dataset comprising bona fide samples and two different 3D PAIs
+(3D silicone face mask and wrap photo mask), resulting in 3480 samples. The
+performance of the proposed method was compared with existing methods to
+benchmark the detection performance using three different evaluation protocols.
+The experimental results demonstrate the improved performance of the proposed
+method in detecting both known and unknown face presentation attacks.
+
+
+ comment: Accepted in 2024 18th International Conference on Automatic Face and + Gesture Recognition (FG) +
+
+
+
+
+ + ☆ MLSD-GAN -- Generating Strong High Quality Face Morphing Attacks using + Latent Semantic Disentanglement + + +
+ Face-morphing attacks are a growing concern for biometric researchers, as +they can be used to fool face recognition systems (FRS). These attacks can be +generated at the image level (supervised) or representation level +(unsupervised). Previous unsupervised morphing attacks have relied on +generative adversarial networks (GANs). More recently, researchers have used +linear interpolation of StyleGAN-encoded images to generate morphing attacks. +In this paper, we propose a new method for generating high-quality morphing +attacks using StyleGAN disentanglement. Our approach, called MLSD-GAN, +spherically interpolates the disentangled latents to produce realistic and +diverse morphing attacks. We evaluate the vulnerability of MLSD-GAN on two +deep-learning-based FRS techniques. The results show that MLSD-GAN poses a +significant threat to FRS, as it can generate morphing attacks that are highly +effective at fooling these systems. + +
+
+
+
+
+ + ☆ Exploring Interactive Semantic Alignment for Efficient HOI Detection + with Vision-language Model ICME2024 + + +
+ Human-Object Interaction (HOI) detection aims to localize human-object pairs
+and comprehend their interactions. Recently, two-stage transformer-based
+methods have demonstrated competitive performance. However, these methods
+frequently focus on object appearance features and ignore global contextual
+information. Besides, the vision-language model CLIP, which effectively aligns
+visual and text embeddings, has shown great potential in zero-shot HOI
+detection. Building on these facts, we introduce a novel HOI detector named
+ISA-HOI, which extensively leverages knowledge from CLIP, aligning interactive
+semantics between visual and textual features. We first extract the global
+context of the image and local features of objects to Improve interaction
+Features in images (IF). On the other hand, we propose a Verb Semantic
+Improvement (VSI) module to enhance textual features of verb labels via
+cross-modal fusion. Ultimately, our method achieves competitive results on the
+HICO-DET and V-COCO benchmarks with much fewer training epochs, and outperforms
+the state-of-the-art under zero-shot settings.
+
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ Detecting Out-Of-Distribution Earth Observation Images with Diffusion + Models CVPR + + +
+ Earth Observation imagery can capture rare and unusual events, such as +disasters and major landscape changes, whose visual appearance contrasts with +the usual observations. Deep models trained on common remote sensing data will +output drastically different features for these out-of-distribution samples, +compared to those closer to their training dataset. Detecting them could +therefore help anticipate changes in the observations, either geographical or +environmental. In this work, we show that the reconstruction error of diffusion +models can effectively serve as unsupervised out-of-distribution detectors for +remote sensing images, using them as a plausibility score. Moreover, we +introduce ODEED, a novel reconstruction-based scorer using the probability-flow +ODE of diffusion models. We validate it experimentally on SpaceNet 8 with +various scenarios, such as classical OOD detection with geographical shift and +near-OOD setups: pre/post-flood and non-flooded/flooded image recognition. We +show that our ODEED scorer significantly outperforms other diffusion-based and +discriminative baselines on the more challenging near-OOD scenarios of flood +image detection, where OOD images are close to the distribution tail. We aim to +pave the way towards better use of generative models for anomaly detection in +remote sensing. + +
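+ The scoring principle described above can be sketched in a few lines: each
+image is reconstructed by a generative model (for ODEED this would be the
+probability-flow ODE encode/decode of a diffusion model) and the per-image
+reconstruction error serves as the out-of-distribution score. The reconstruct
+callable below is a hypothetical stand-in for that routine.
+
+    import numpy as np
+
+    def ood_scores(images, reconstruct):
+        """Reconstruction-based OOD scoring: higher error means more likely OOD."""
+        scores = []
+        for x in images:
+            x_hat = reconstruct(x)                 # encode then decode the image
+            scores.append(float(np.mean((x - x_hat) ** 2)))
+        return np.array(scores)
+
+    # toy usage with an identity-plus-noise "reconstruction" as a stand-in model
+    imgs = [np.random.rand(32, 32, 3) for _ in range(4)]
+    print(ood_scores(imgs, lambda x: x + 0.01 * np.random.randn(*x.shape)))
+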
+
+ comment: EARTHVISION 2024 IEEE/CVF CVPR Workshop. Large Scale Computer Vision + for Remote Sensing Imagery, Jun 2024, Seattle, United States +
+
+
+
+
+ + ☆ Pre-trained Vision-Language Models Learn Discoverable Visual Concepts + + +
+ Do vision-language models (VLMs) pre-trained to caption an image of a +"durian" learn visual concepts such as "brown" (color) and "spiky" (texture) at +the same time? We aim to answer this question as visual concepts learned "for +free" would enable wide applications such as neuro-symbolic reasoning or +human-interpretable object classification. We assume that the visual concepts, +if captured by pre-trained VLMs, can be extracted by their vision-language +interface with text-based concept prompts. We observe that recent works +prompting VLMs with concepts often differ in their strategies to define and +evaluate the visual concepts, leading to conflicting conclusions. We propose a +new concept definition strategy based on two observations: First, certain +concept prompts include shortcuts that recognize correct concepts for wrong +reasons; Second, multimodal information (e.g. visual discriminativeness, and +textual knowledge) should be leveraged when selecting the concepts. Our +proposed concept discovery and learning (CDL) framework is thus designed to +identify a diverse list of generic visual concepts (e.g. "spiky" as opposed to +"spiky durian"), which are ranked and selected based on visual and language +mutual information. We carefully design quantitative and human evaluations of +the discovered concepts on six diverse visual recognition datasets, which +confirm that pre-trained VLMs do learn visual concepts that provide accurate +and thorough descriptions for the recognized objects. All code and models are +publicly released. + +
+
+
+
+
+ + ☆ F2FLDM: Latent Diffusion Models with Histopathology Pre-Trained + Embeddings for Unpaired Frozen Section to FFPE Translation + + +
+ The Frozen Section (FS) technique is a rapid and efficient method, taking +only 15-30 minutes to prepare slides for pathologists' evaluation during +surgery, enabling immediate decisions on further surgical interventions. +However, FS process often introduces artifacts and distortions like folds and +ice-crystal effects. In contrast, these artifacts and distortions are absent in +the higher-quality formalin-fixed paraffin-embedded (FFPE) slides, which +require 2-3 days to prepare. While Generative Adversarial Network (GAN)-based +methods have been used to translate FS to FFPE images (F2F), they may leave +morphological inaccuracies with remaining FS artifacts or introduce new +artifacts, reducing the quality of these translations for clinical assessments. +In this study, we benchmark recent generative models, focusing on GANs and +Latent Diffusion Models (LDMs), to overcome these limitations. We introduce a +novel approach that combines LDMs with Histopathology Pre-Trained Embeddings to +enhance restoration of FS images. Our framework leverages LDMs conditioned by +both text and pre-trained embeddings to learn meaningful features of FS and +FFPE histopathology images. Through diffusion and denoising techniques, our +approach not only preserves essential diagnostic attributes like color staining +and tissue morphology but also proposes an embedding translation mechanism to +better predict the targeted FFPE representation of input FS images. As a +result, this work achieves a significant improvement in classification +performance, with the Area Under the Curve rising from 81.99% to 94.64%, +accompanied by an advantageous CaseFD. This work establishes a new benchmark +for FS to FFPE image translation quality, promising enhanced reliability and +accuracy in histopathology FS image analysis. Our work is available at +https://minhmanho.github.io/f2f_ldm/. + +
+
+ comment: Preprint. Our work is available at + https://minhmanho.github.io/f2f_ldm/ +
+
+
+
+
+ + ☆ Cooperative Sentiment Agents for Multimodal Sentiment Analysis + + +
+ In this paper, we propose a new Multimodal Representation Learning (MRL)
+method for Multimodal Sentiment Analysis (MSA), which facilitates the adaptive
+interaction between modalities through Cooperative Sentiment Agents, named
+Co-SA. Co-SA comprises two critical components: the Sentiment Agents
+Establishment (SAE) phase and the Sentiment Agents Cooperation (SAC) phase.
+During the SAE phase, each sentiment agent deals with an unimodal signal and
+highlights explicit dynamic sentiment variations within the modality via the
+Modality-Sentiment Disentanglement (MSD) and Deep Phase Space Reconstruction
+(DPSR) modules. Subsequently, in the SAC phase, Co-SA meticulously designs
+task-specific interaction mechanisms for the sentiment agents so that
+multimodal signals are coordinated to learn the joint representation.
+Specifically, Co-SA equips an independent policy model for each sentiment agent
+that captures significant properties within the modality. These policies are
+optimized mutually through a unified reward adaptive to downstream tasks.
+Benefiting from the rewarding mechanism, Co-SA transcends the limitation of
+pre-defined fusion modes and adaptively captures unimodal properties for MRL in
+the multimodal interaction setting. To demonstrate the effectiveness of Co-SA,
+we apply it to address the Multimodal Sentiment Analysis (MSA) and Multimodal
+Emotion Recognition (MER) tasks. Our comprehensive experimental results
+demonstrate that Co-SA excels at discovering diverse cross-modal features,
+encompassing both common and complementary aspects. The code is available at
+https://github.com/smwanghhh/Co-SA.
+
+
+
+
+
+ + ☆ AED-PADA:Improving Generalizability of Adversarial Example Detection via + Principal Adversarial Domain Adaptation + + +
+ Adversarial example detection, which can be conveniently applied in many
+scenarios, is important in the area of adversarial defense. Unfortunately,
+existing detection methods suffer from poor generalization performance, because
+their training process usually relies on the examples generated from a single
+known adversarial attack and there exists a large discrepancy between the
+training and unseen testing adversarial examples. To address this issue, we
+propose a novel method, named Adversarial Example Detection via Principal
+Adversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies
+the Principal Adversarial Domains (PADs), i.e., a combination of features of
+the adversarial examples from different attacks, which possesses large coverage
+of the entire adversarial feature space. Then, we are the first to exploit
+multi-source domain adaptation in adversarial example detection, with PADs as
+the source domains. Experiments demonstrate the superior generalization ability
+of our proposed AED-PADA. Note that this superiority is particularly achieved
+in challenging scenarios characterized by employing the minimal magnitude
+constraint for the perturbations.
+
+
+
+
+
+ + ☆ Transformer-Based Classification Outcome Prediction for Multimodal + Stroke Treatment + + +
+ This study proposes a multi-modal fusion framework, Multitrans, based on the
+Transformer architecture and self-attention mechanism. This architecture
+combines the study of non-contrast computed tomography (NCCT) images and
+discharge diagnosis reports of patients undergoing stroke treatment, using a
+variety of Transformer-based approaches to predict the functional outcomes of
+stroke treatment. The results show that the performance of single-modal text
+classification is significantly better than single-modal image classification,
+but the effect of the multi-modal combination is better than any single
+modality. Although the Transformer model performs worse on imaging data alone,
+when combined with clinical meta-diagnostic information the two modalities
+learn complementary information and contribute to accurately predicting the
+effects of stroke treatment.
+
+
+
+
+
+ + ☆ MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and + Semantic Correction + + +
+ Decoding natural visual scenes from brain activity has flourished, with
+extensive research on single-subject tasks but much less on cross-subject
+tasks. Reconstructing high-quality images in cross-subject tasks is a
+challenging problem due to profound individual differences between subjects and
+the scarcity of data annotation. In this work, we propose MindTuner for
+cross-subject visual decoding, which achieves high-quality and rich-semantic
+reconstructions using only 1 hour of fMRI training data, benefiting from the
+phenomenon of visual fingerprints in the human visual system and a novel
+fMRI-to-text alignment paradigm. Firstly, we pre-train a multi-subject model
+among 7 subjects and fine-tune it with scarce data on new subjects, where LoRAs
+with Skip-LoRAs are utilized to learn the visual fingerprint. Then, we take the
+image modality as the intermediate pivot modality to achieve fMRI-to-text
+alignment, which achieves impressive fMRI-to-text retrieval performance and
+corrects fMRI-to-image reconstruction with fine-tuned semantics. The results of
+both qualitative and quantitative analyses demonstrate that MindTuner surpasses
+state-of-the-art cross-subject visual decoding models on the Natural Scenes
+Dataset (NSD), whether using training data of 1 hour or 40 hours.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SkelFormer: Markerless 3D Pose and Shape Estimation using Skeletal + Transformers + + +
+ We introduce SkelFormer, a novel markerless motion capture pipeline for
+multi-view human pose and shape estimation. Our method first uses off-the-shelf
+2D keypoint estimators, pre-trained on large-scale in-the-wild data, to obtain
+3D joint positions. Next, we design a regression-based inverse-kinematic
+skeletal transformer that maps the joint positions to pose and shape
+representations from heavily noisy observations. This module integrates prior
+knowledge about pose space and infers the full pose state at runtime.
+Separating the 3D keypoint detection and inverse-kinematic problems, along with
+the expressive representations learned by our skeletal transformer, enhances
+the generalization of our method to unseen noisy data. We evaluate our method
+on three public datasets in both in-distribution and out-of-distribution
+settings, and observe strong performance with respect to prior works. Moreover,
+ablation experiments demonstrate the impact of each of the modules of our
+architecture. Finally, we study the performance of our method in dealing with
+noise and heavy occlusions and find considerable robustness with respect to
+other solutions.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Dragtraffic: A Non-Expert Interactive and Point-Based Controllable + Traffic Scene Generation Framework + + +
+ The evaluation and training of autonomous driving systems require diverse and +scalable corner cases. However, most existing scene generation methods lack +controllability, accuracy, and versatility, resulting in unsatisfactory +generation results. To address this problem, we propose Dragtraffic, a +generalized, point-based, and controllable traffic scene generation framework +based on conditional diffusion. Dragtraffic enables non-experts to generate a +variety of realistic driving scenarios for different types of traffic agents +through an adaptive mixture expert architecture. We use a regression model to +provide a general initial solution and a refinement process based on the +conditional diffusion model to ensure diversity. User-customized context is +introduced through cross-attention to ensure high controllability. Experiments +on a real-world driving dataset show that Dragtraffic outperforms existing +methods in terms of authenticity, diversity, and freedom. + +
+
+
+
+
+ + ☆ SA-Attack: Speed-adaptive stealthy adversarial attack on trajectory + prediction + + +
+ Trajectory prediction is critical for the safe planning and navigation of +automated vehicles. The trajectory prediction models based on the neural +networks are vulnerable to adversarial attacks. Previous attack methods have +achieved high attack success rates but overlook the adaptability to realistic +scenarios and the concealment of the deceits. To address this problem, we +propose a speed-adaptive stealthy adversarial attack method named SA-Attack. +This method searches the sensitive region of trajectory prediction models and +generates the adversarial trajectories by using the vehicle-following method +and incorporating information about forthcoming trajectories. Our method has +the ability to adapt to different speed scenarios by reconstructing the +trajectory from scratch. Fusing future trajectory trends and curvature +constraints can guarantee the smoothness of adversarial trajectories, further +ensuring the stealthiness of attacks. The empirical study on the datasets of +nuScenes and Apolloscape demonstrates the attack performance of our proposed +method. Finally, we also demonstrate the adaptability and stealthiness of +SA-Attack for different speed scenarios. Our code is available at the +repository: https://github.com/eclipse-bot/SA-Attack. + +
+
+ comment: This work is published in IEEE IV Symposium +
+
+
+
+
+ + ☆ Rethinking Clothes Changing Person ReID: Conflicts, Synthesis, and + Optimization + + +
+ Clothes-changing person re-identification (CC-ReID) aims to retrieve images
+of the same person wearing different outfits. Mainstream research focuses on
+designing advanced model structures and strategies to capture identity
+information independent of clothing. However, the same-clothes discrimination
+as the standard ReID learning objective in CC-ReID has been persistently
+ignored in previous research. In this study, we dive into the relationship
+between the standard and clothes-changing (CC) learning objectives, and bring
+the inner conflicts between these two objectives to the fore. We try to magnify
+the proportion of CC training pairs by supplementing high-fidelity
+clothes-varying synthesis, produced by our proposed Clothes-Changing Diffusion
+model. By incorporating the synthetic images into CC-ReID model training, we
+observe a significant improvement under the CC protocol. However, such
+improvement sacrifices the performance under the standard protocol, caused by
+the inner conflict between the standard and CC objectives. For conflict
+mitigation, we decouple these objectives and re-formulate CC-ReID learning as a
+multi-objective optimization (MOO) problem. By effectively regularizing the
+gradient curvature across multiple objectives and introducing preference
+restrictions, our MOO solution surpasses the single-task training paradigm. Our
+framework is model-agnostic, and demonstrates superior performance under both
+CC and standard ReID protocols.
+
+
+
+
+
+ + ☆ ELEV-VISION-SAM: Integrated Vision Language and Foundation Model for + Automated Estimation of Building Lowest Floor Elevation + + +
+ Street view imagery, aided by advancements in image quality and
+accessibility, has emerged as a valuable resource for urban analytics research.
+Recent studies have explored its potential for estimating lowest floor
+elevation (LFE), offering a scalable alternative to traditional on-site
+measurements, crucial for assessing properties' flood risk and damage extent.
+While existing methods rely on object detection, the introduction of image
+segmentation has broadened street view images' utility for LFE estimation,
+although challenges remain in segmentation quality and in the capability to
+distinguish front doors from other doors. To address these challenges in LFE
+estimation, this study integrates the Segment Anything model, a segmentation
+foundation model, with vision language models to conduct text-prompt image
+segmentation on street view images for LFE estimation. By evaluating various
+vision language models, integration methods, and text prompts, we identify the
+most suitable model for street view image analytics and LFE estimation tasks,
+thereby improving the availability of the current LFE estimation model based on
+image segmentation from 33% to 56% of properties. Remarkably, our proposed
+method significantly enhances the availability of LFE estimation to almost all
+properties in which the front door is visible in the street view image. The
+findings also present the first baseline and comparison of various vision
+models for street view image-based LFE estimation. The model and findings not
+only contribute to advancing street view image segmentation for urban analytics
+but also provide a novel approach to image segmentation for other civil
+engineering and infrastructure analytics tasks.
+
+
+
+
+
+ + ☆ A visualization method for data domain changes in CNN networks and the + optimization method for selecting thresholds in classification tasks + + +
+ In recent years, Face Anti-Spoofing (FAS) has played a crucial role in +preserving the security of face recognition technology. With the rise of +counterfeit face generation techniques, the challenge posed by digitally edited +faces to face anti-spoofing is escalating. Existing FAS technologies primarily +focus on intercepting physically forged faces and lack a robust solution for +cross-domain FAS challenges. Moreover, determining an appropriate threshold to +achieve optimal deployment results remains an issue for intra-domain FAS. To +address these issues, we propose a visualization method that intuitively +reflects the training outcomes of models by visualizing the prediction results +on datasets. Additionally, we demonstrate that employing data augmentation +techniques, such as downsampling and Gaussian blur, can effectively enhance +performance on cross-domain tasks. Building upon our data visualization +approach, we also introduce a methodology for setting threshold values based on +the distribution of the training dataset. Ultimately, our methods secured us +second place in both the Unified Physical-Digital Face Attack Detection +competition and the Snapshot Spectral Imaging Face Anti-spoofing contest. The +training code is available at https://github.com/SeaRecluse/CVPRW2024. + +
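+ The distribution-based threshold setting mentioned above can be illustrated
+with the small sketch below: the operating threshold is read off the training
+score distribution so that at most a target fraction of bona fide samples is
+rejected, and the resulting attack acceptance rate is reported. The criterion
+and score convention are illustrative assumptions, not the authors' exact rule.
+
+    import numpy as np
+
+    def pick_threshold(bonafide_scores, attack_scores, target_bpcer=0.01):
+        """Choose a threshold from training scores (higher score = more 'live')."""
+        thr = np.quantile(bonafide_scores, target_bpcer)   # reject <=1% bona fide
+        apcer = float(np.mean(attack_scores >= thr))       # attacks wrongly accepted
+        return thr, apcer
+
+    thr, apcer = pick_threshold(np.random.beta(8, 2, 5000), np.random.beta(2, 8, 5000))
+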
+
+
+
+
+ + ☆ QUTE: Quantifying Uncertainty in TinyML models with Early-exit-assisted + ensembles + + +
+ Existing methods for uncertainty quantification incur massive memory and +compute overhead, often requiring multiple models/inferences. Hence they are +impractical on ultra-low-power KB-sized TinyML devices. To reduce overhead, +prior works have proposed the use of early-exit networks as ensembles to +quantify uncertainty in a single forward-pass. However, they still have a +prohibitive cost for tinyML. To address these challenges, we propose QUTE, a +novel resource-efficient early-exit-assisted ensemble architecture optimized +for tinyML models. QUTE adds additional output blocks at the final exit of the +base network and distills the knowledge of early-exits into these blocks to +create a diverse and lightweight ensemble architecture. Our results show that +QUTE outperforms popular prior works, and improves the quality of uncertainty +estimates by 6% with 3.1x lower model size on average compared to the most +relevant prior work. Furthermore, we demonstrate that QUTE is also effective in +detecting co-variate shifted and out-of-distribution inputs, and shows +competitive performance relative to G-ODIN, a state-of-the-art generalized OOD +detector. + +
+
+
+
+
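The QUTE abstract above describes adding lightweight output blocks at the final exit of a base network and treating their predictions as an ensemble for single-pass uncertainty estimation. The sketch below is a minimal, hypothetical illustration of that ensembling idea in PyTorch; the module sizes, head count, and the predictive-entropy measure are assumptions, and the early-exit knowledge distillation step from the paper is omitted.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyHeadEnsemble(nn.Module):
    """Toy backbone with several cheap heads ensembled at the final exit."""

    def __init__(self, num_classes=10, num_heads=4):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        # Several lightweight output blocks share the same backbone features.
        self.heads = nn.ModuleList(
            [nn.Linear(16, num_classes) for _ in range(num_heads)]
        )

    def forward(self, x):
        feats = self.backbone(x)
        # Stack per-head class probabilities: (num_heads, batch, classes).
        probs = torch.stack([F.softmax(h(feats), dim=-1) for h in self.heads])
        mean_probs = probs.mean(dim=0)
        # Predictive entropy of the averaged distribution as an uncertainty proxy.
        entropy = -(mean_probs * mean_probs.clamp_min(1e-12).log()).sum(dim=-1)
        return mean_probs, entropy

model = TinyHeadEnsemble()
mean_probs, uncertainty = model(torch.randn(2, 3, 32, 32))
print(mean_probs.shape, uncertainty)
```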
+ + ☆ Cross-Modal Adapter: Parameter-Efficient Transfer Learning Approach for + Vision-Language Models ICME 2024 + + +
+ Adapter-based parameter-efficient transfer learning has achieved exciting +results in vision-language models. Traditional adapter methods often require +training or fine-tuning, facing challenges such as insufficient samples or +resource limitations. While some methods overcome the need for training by +leveraging image modality cache and retrieval, they overlook the text +modality's importance and cross-modal cues for the efficient adaptation of +parameters in visual-language models. This work introduces a cross-modal +parameter-efficient approach named XMAdapter. XMAdapter establishes cache +models for both text and image modalities. It then leverages retrieval through +visual-language bimodal information to gather clues for inference. By +dynamically adjusting the affinity ratio, it achieves cross-modal fusion, +decoupling different modal similarities to assess their respective +contributions. Additionally, it explores hard samples based on differences in +cross-modal affinity and enhances model performance through adaptive adjustment +of sample learning intensity. Extensive experimental results on benchmark +datasets demonstrate that XMAdapter outperforms previous adapter-based methods +significantly regarding accuracy, generalization, and efficiency. + +
+
+ comment: This paper is accepted to ICME 2024 +
+
+
+
+
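The XMAdapter abstract mentions cache models for both text and image modalities, retrieval through bimodal affinities, and a dynamically adjusted affinity ratio. The snippet below sketches only the generic cache-and-blend mechanism under assumed feature shapes and a fixed blending weight; it is not the paper's implementation, in which the ratio and sample weighting are adaptive.

```python
import torch
import torch.nn.functional as F

def cache_logits(query, keys, values, beta=5.0):
    """Training-free cache model: cosine affinities between the query and the
    cached keys are turned into weights over the cached one-hot labels."""
    sim = F.normalize(query, dim=-1) @ F.normalize(keys, dim=-1).T
    affinity = ((-beta) * (1.0 - sim)).exp()
    return affinity @ values  # (batch, num_classes)

# Hypothetical cached features and one-hot labels for both modalities.
num_cached, dim, num_classes = 64, 512, 10
img_keys, txt_keys = torch.randn(num_cached, dim), torch.randn(num_cached, dim)
labels = F.one_hot(torch.randint(0, num_classes, (num_cached,)), num_classes).float()

query_img = torch.randn(2, dim)   # image embedding of a test sample
query_txt = torch.randn(2, dim)   # paired text embedding

ratio = 0.6  # affinity ratio between modalities (fixed here, adaptive in the paper)
logits = ratio * cache_logits(query_img, img_keys, labels) \
       + (1 - ratio) * cache_logits(query_txt, txt_keys, labels)
print(logits.argmax(dim=-1))
```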
+ + ☆ Privacy-Preserving Debiasing using Data Augmentation and Machine + Unlearning + + +
+ Data augmentation is widely used to mitigate data bias in the training +dataset. However, data augmentation exposes machine learning models to privacy +attacks, such as membership inference attacks. In this paper, we propose an +effective combination of data augmentation and machine unlearning, which can +reduce data bias while providing a provable defense against known attacks. +Specifically, we maintain the fairness of the trained model with +diffusion-based data augmentation, and then utilize multi-shard unlearning to +remove identifying information of original data from the ML model for +protection against privacy attacks. Experimental evaluation across diverse +datasets demonstrates that our approach can achieve significant improvements in +bias reduction as well as robustness against state-of-the-art privacy attacks. + +
+
+
+
+
+ + ☆ Unlocking Robust Segmentation Across All Age Groups via Continual + Learning + + +
+ Most deep learning models in medical imaging are trained on adult data with +unclear performance on pediatric images. In this work, we aim to address this +challenge in the context of automated anatomy segmentation in whole-body +Computed Tomography (CT). We evaluate the performance of CT organ segmentation +algorithms trained on adult data when applied to pediatric CT volumes and +identify substantial age-dependent underperformance. We subsequently propose +and evaluate strategies, including data augmentation and continual learning +approaches, to achieve good segmentation accuracy across all age groups. Our +best-performing model, trained using continual learning, achieves high +segmentation accuracy on both adult and pediatric data (Dice scores of 0.90 and +0.84 respectively). + +
+
+
+
+
+ + ☆ Equivariant Imaging for Self-supervised Hyperspectral Image Inpainting + + +
+ Hyperspectral imaging (HSI) is a key technology for earth observation, +surveillance, medical imaging and diagnostics, astronomy and space exploration. +The conventional technology for HSI in remote sensing applications is based on +the push-broom scanning approach in which the camera records the spectral image +of a stripe of the scene at a time, while the image is generated by the +aggregation of measurements through time. In real-world airborne and spaceborne +HSI instruments, some empty stripes would appear at certain locations, because +platforms do not always maintain a constant programmed attitude, or have access +to accurate digital elevation maps (DEM), and the travelling track is not +necessarily aligned with the hyperspectral cameras at all times. This makes the +enhancement of the acquired HS images from incomplete or corrupted observations +an essential task. We introduce a novel HSI inpainting algorithm here, called +Hyperspectral Equivariant Imaging (Hyper-EI). Hyper-EI is a self-supervised +learning-based method which does not require training on extensive datasets or +access to a pre-trained model. Experimental results show that the proposed +method achieves state-of-the-art inpainting performance compared to the +existing methods. + +
+
+ comment: 5 Pages, 4 Figures, 2 Tables +
+
+
+
+
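Hyper-EI is described as self-supervised hyperspectral inpainting built on equivariant imaging. As a rough illustration of the equivariant-imaging training signal, measurement consistency plus invariance of the reconstruction under a group transform, here is a schematic PyTorch loss; the masking operator, the cyclic-shift transform, and the tiny network are placeholders, not the authors' design.

```python
import torch

def ei_losses(net, y, mask):
    """y: observed image with missing stripes (zeros where mask == 0)."""
    x_hat = net(y)                                 # reconstruction from the measurement
    loss_mc = ((mask * x_hat - y) ** 2).mean()     # measurement consistency

    # Group action: a cyclic spatial shift (stand-in for the EI transform group).
    shift = int(torch.randint(1, x_hat.shape[-1], (1,)))
    x_t = torch.roll(x_hat, shifts=shift, dims=-1)
    y_t = mask * x_t                               # re-measure the transformed estimate
    loss_ei = ((net(y_t) - x_t) ** 2).mean()       # equivariance constraint
    return loss_mc, loss_ei

net = torch.nn.Sequential(torch.nn.Conv2d(1, 16, 3, padding=1), torch.nn.ReLU(),
                          torch.nn.Conv2d(16, 1, 3, padding=1))
x = torch.rand(1, 1, 32, 32)
mask = (torch.rand_like(x) > 0.3).float()          # simulated missing pixels/stripes
loss_mc, loss_ei = ei_losses(net, mask * x, mask)
(loss_mc + loss_ei).backward()
```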
+ + ☆ Motion-adaptive Separable Collaborative Filters for Blind Motion + Deblurring CVPR 2024 + + +
+ Eliminating image blur produced by various kinds of motion has been a +challenging problem. Dominant approaches rely heavily on model capacity to +remove blurring by reconstructing residual from blurry observation in feature +space. These practices not only prevent the capture of spatially variable +motion in the real world but also ignore the tailored handling of various +motions in image space. In this paper, we propose a novel real-world deblurring +filtering model called the Motion-adaptive Separable Collaborative (MISC) +Filter. In particular, we use a motion estimation network to capture motion +information from neighborhoods, thereby adaptively estimating spatially-variant +motion flow, mask, kernels, weights, and offsets to obtain the MISC Filter. The +MISC Filter first aligns the motion-induced blurring patterns to the motion +middle along the predicted flow direction, and then collaboratively filters the +aligned image through the predicted kernels, weights, and offsets to generate +the output. This design can handle more generalized and complex motion in a +spatially differentiated manner. Furthermore, we analyze the relationships +between the motion estimation network and the residual reconstruction network. +Extensive experiments on four widely used benchmarks demonstrate that our +method provides an effective solution for real-world motion blur removal and +achieves state-of-the-art performance. Code is available at +https://github.com/ChengxuLiu/MISCFilter + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ BACS: Background Aware Continual Semantic Segmentation + + +
+ Semantic segmentation plays a crucial role in enabling comprehensive scene +understanding for robotic systems. However, generating annotations is +challenging, requiring labels for every pixel in an image. In scenarios like +autonomous driving, there's a need to progressively incorporate new classes as +the operating environment of the deployed agent becomes more complex. For +enhanced annotation efficiency, ideally, only pixels belonging to new classes +would be annotated. This approach is known as Continual Semantic Segmentation +(CSS). Besides the common problem of classical catastrophic forgetting in the +continual learning setting, CSS suffers from the inherent ambiguity of the +background, a phenomenon we refer to as the "background shift'', since pixels +labeled as background could correspond to future classes (forward background +shift) or previous classes (backward background shift). As a result, continual +learning approaches tend to fail. This paper proposes a Backward Background +Shift Detector (BACS) to detect previously observed classes based on their +distance in the latent space from the foreground centroids of previous steps. +Moreover, we propose a modified version of the cross-entropy loss function, +incorporating the BACS detector to down-weight background pixels associated +with formerly observed classes. To combat catastrophic forgetting, we employ +masked feature distillation alongside dark experience replay. Additionally, our +approach includes a transformer decoder capable of adjusting to new classes +without necessitating an additional classification head. We validate BACS's +superior performance over existing state-of-the-art methods on standard CSS +benchmarks. + +
+
+ comment: 8 pages, 4 figures, CRV 2024 +
+
+
+
+
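The BACS abstract describes detecting previously observed classes among background-labelled pixels by their latent-space distance to foreground centroids from earlier steps. A minimal version of that distance test might look like the following; the feature dimensions, the threshold, and the centroid store are illustrative assumptions rather than the paper's exact detector.

```python
import torch

def detect_old_classes(features, centroids, threshold):
    """features:  (num_pixels, dim) latent features of background-labelled pixels.
    centroids: (num_old_classes, dim) foreground centroids from previous steps.
    Returns, per pixel, the nearest old class or -1 if it stays background."""
    dists = torch.cdist(features, centroids)        # (num_pixels, num_old_classes)
    min_dist, nearest = dists.min(dim=1)
    return torch.where(min_dist < threshold, nearest, torch.full_like(nearest, -1))

features = torch.randn(1000, 256)
centroids = torch.randn(5, 256)                     # 5 classes seen in previous steps
assignment = detect_old_classes(features, centroids, threshold=20.0)
# Pixels flagged as an old class can then be down-weighted in the cross-entropy loss.
print((assignment >= 0).float().mean())
```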
+ + ☆ DeepFake-O-Meter v2.0: An Open Platform for DeepFake Detection + + +
+ Deepfakes, as AI-generated media, have increasingly threatened media +integrity and personal privacy with realistic yet fake digital content. In this +work, we introduce an open-source and user-friendly online platform, +DeepFake-O-Meter v2.0, that integrates state-of-the-art methods for detecting +Deepfake images, videos, and audio. Built upon DeepFake-O-Meter v1.0, we have +made significant upgrades and improvements in platform architecture design, +including user interaction, detector integration, job balancing, and security +management. The platform aims to offer everyday users a convenient service for +analyzing DeepFake media using multiple state-of-the-art detection algorithms. +It ensures secure and private delivery of the analysis results. Furthermore, it +serves as an evaluation and benchmarking platform for researchers in digital +media forensics to compare the performance of multiple algorithms on the same +input. We have also conducted detailed usage analysis based on the collected +data to gain deeper insights into our platform's statistics. This involves +analyzing two-month trends in user activity and evaluating the processing +efficiency of each detector. + +
+
+
+
+
+ + ☆ Deep Learning-based Text-in-Image Watermarking + + +
+ In this work, we introduce a novel deep learning-based approach to +text-in-image watermarking, a method that embeds and extracts textual +information within images to enhance data security and integrity. Leveraging +the capabilities of deep learning, specifically through the use of +Transformer-based architectures for text processing and Vision Transformers for +image feature extraction, our method sets new benchmarks in the domain. The +proposed method represents the first application of deep learning in +text-in-image watermarking that improves adaptivity, allowing the model to +intelligently adjust to specific image characteristics and emerging threats. +Through testing and evaluation, our method has demonstrated superior robustness +compared to traditional watermarking techniques, achieving enhanced +imperceptibility that ensures the watermark remains undetectable across various +image contents. + +
+
+
+
+
+ + ☆ On-board classification of underwater images using hybrid + classical-quantum CNN based method + + +
+ Underwater images taken from autonomous underwater vehicles (AUVs) often suffer from low light, high turbidity, poor contrast, motion blur and excessive light scattering, and hence require image enhancement techniques for object recognition. Machine learning methods are increasingly used for object recognition under such adverse conditions. Enhanced object recognition on images taken from AUVs has potential applications in underwater pipeline and optical fibre surveillance, ocean bed resource extraction, ocean floor mapping, underwater species exploration, etc. While classical machine learning methods are very efficient in terms of accuracy, they require large datasets and high computational time for image classification. In the current work, we use quantum-classical hybrid machine learning methods for real-time underwater object recognition on-board an AUV for the first time. We use real-time motion-blurred and low-light images taken from the on-board camera of an AUV built in-house and apply existing hybrid machine learning methods for object recognition. Our hybrid methods consist of quantum encoding and flattening of classical images using quantum circuits, whose outputs are sent to classical neural networks for image classification. The results of the hybrid methods, obtained using Pennylane-based quantum simulators on GPU and with pre-trained models on an on-board NVIDIA GPU chipset, are compared with results from corresponding classical machine learning methods. We observe that the hybrid quantum machine learning methods show an efficiency greater than 65%, reduce run-time by one-third, and require 50% smaller dataset sizes for training the models compared to classical machine learning methods. We hope that our work opens up further possibilities in quantum-enhanced real-time computer vision in autonomous vehicles. +
+
+
+
+
+ + ☆ FreSeg: Frenet-Frame-based Part Segmentation for 3D Curvilinear + Structures + + +
+ Part segmentation is a crucial task for 3D curvilinear structures like neuron +dendrites and blood vessels, enabling the analysis of dendritic spines and +aneurysms with scientific and clinical significance. However, their diversely +winded morphology poses a generalization challenge to existing deep learning +methods, which leads to labor-intensive manual correction. In this work, we +propose FreSeg, a framework of part segmentation tasks for 3D curvilinear +structures. With Frenet-Frame-based point cloud transformation, it enables the +models to learn more generalizable features and have significant performance +improvements on tasks involving elongated and curvy geometries. We evaluate +FreSeg on 2 datasets: 1) DenSpineEM, an in-house dataset for dendritic spine +segmentation, and 2) IntrA, a public 3D dataset for intracranial aneurysm +segmentation. Further, we will release the DenSpineEM dataset, which includes +roughly 6,000 spines from 69 dendrites from 3 public electron microscopy (EM) +datasets, to foster the development of effective dendritic spine instance +extraction methods and, consequently, large-scale connectivity analysis to +better understand mammalian brains. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ RegWSI: Whole Slide Image Registration using Combined Deep Feature- and + Intensity-Based Methods: Winner of the ACROBAT 2023 Challenge + + +
+ The automatic registration of differently stained whole slide images (WSIs) is crucial for improving diagnosis and prognosis by fusing complementary information emerging from different visible structures. It is also useful to quickly transfer annotations between consecutive or restained slides, thus significantly reducing the annotation time and associated costs. Nevertheless, the slide preparation is different for each stain and the tissue undergoes complex and large deformations. Therefore, a robust, efficient, and accurate registration method is highly desired by the scientific community and hospitals specializing in digital pathology. We propose a two-step hybrid method consisting of (i) a deep learning- and feature-based initial alignment algorithm, and (ii) intensity-based nonrigid registration using instance optimization. The proposed method does not require any fine-tuning to a particular dataset and can be used directly for any desired tissue type and stain. The method scored 1st place in the ACROBAT 2023 challenge. We evaluated it using three open datasets: (i) ANHIR, (ii) ACROBAT, and (iii) HyReCo, and performed several ablation studies concerning the resolution used for registration and the initial alignment robustness and stability. The method achieves the most accurate results for the ACROBAT dataset and cell-level registration accuracy for the restained slides from the HyReCo dataset, and is among the best methods evaluated on the ANHIR dataset. The method does not require any fine-tuning to new datasets and can be used out-of-the-box for other types of microscopic images. The method is incorporated into the DeeperHistReg framework, allowing others to directly use it to register, transform, and save the WSIs at any desired pyramid level. The proposed method is a significant contribution to WSI registration, thus advancing the field of digital pathology. +
+
+
+
+
+ + ☆ DeeperHistReg: Robust Whole Slide Images Registration Framework + + +
+ DeeperHistReg is a software framework dedicated to registering whole slide +images (WSIs) acquired using multiple stains. It allows one to perform the +preprocessing, initial alignment, and nonrigid registration of WSIs acquired +using multiple stains (e.g. hematoxylin \& eosin, immunochemistry). The +framework implements several state-of-the-art registration algorithms and +provides an interface to operate on arbitrary resolution of the WSIs (up to +200k x 200k). The framework is extensible and new algorithms can be easily +integrated by other researchers. The framework is available both as a PyPI +package and as a Docker container. + +
+
+
+
+
+ + ☆ Automatic Cranial Defect Reconstruction with Self-Supervised Deep + Deformable Masked Autoencoders + + +
+ Thousands of people suffer from cranial injuries every year. They require personalized implants that need to be designed and manufactured before the reconstruction surgery. Manual design is expensive and time-consuming, motivating the search for algorithms that automate the process. The problem can be formulated as volumetric shape completion and solved by deep neural networks dedicated to supervised image segmentation. However, such an approach requires annotating ground-truth defects, which is costly and time-consuming. Usually, the process is replaced with synthetic defect generation. However, even synthetic ground-truth generation is time-consuming and limits the data heterogeneity, and thus the deep models' generalizability. In our work, we propose an alternative, simple approach that uses a self-supervised masked autoencoder to solve the problem. This approach by design increases the heterogeneity of the training set and can be seen as a form of data augmentation. We compare the proposed method with several state-of-the-art deep neural networks and show both quantitative and qualitative improvements on the SkullBreak and SkullFix datasets. The proposed method can be used to efficiently reconstruct cranial defects in real time. +
+
+
+
+
+ + ☆ On-Demand Earth System Data Cubes + + +
+ Advancements in Earth system science have seen a surge in diverse datasets. +Earth System Data Cubes (ESDCs) have been introduced to efficiently handle this +influx of high-dimensional data. ESDCs offer a structured, intuitive framework +for data analysis, organising information within spatio-temporal grids. The +structured nature of ESDCs unlocks significant opportunities for Artificial +Intelligence (AI) applications. By providing well-organised data, ESDCs are +ideally suited for a wide range of sophisticated AI-driven tasks. An automated +framework for creating AI-focused ESDCs with minimal user input could +significantly accelerate the generation of task-specific training data. Here we +introduce cubo, an open-source Python tool designed for easy generation of +AI-focused ESDCs. Utilising collections in SpatioTemporal Asset Catalogs (STAC) +that are stored as Cloud Optimised GeoTIFFs (COGs), cubo efficiently creates +ESDCs, requiring only central coordinates, spatial resolution, edge size, and +time range. + +
+
+ comment: Accepted at IGARSS24 +
+
+
+
+
+ + ☆ ToNNO: Tomographic Reconstruction of a Neural Network's Output for + Weakly Supervised Segmentation of 3D Medical Images CVPR 2024 + + +
+ Annotating large numbers of 3D medical images for training segmentation models is time-consuming. The goal of weakly supervised semantic segmentation is to train segmentation models without using any ground truth segmentation masks. Our work addresses the case where only image-level categorical labels, indicating the presence or absence of a particular region of interest (such as tumours or lesions), are available. Most existing methods rely on class activation mapping (CAM). We propose a novel approach, ToNNO, which is based on the Tomographic reconstruction of a Neural Network's Output. Our technique extracts stacks of slices with different angles from the input 3D volume, feeds these slices to a 2D encoder, and applies the inverse Radon transform in order to reconstruct a 3D heatmap of the encoder's predictions. This generic method allows dense prediction tasks to be performed on 3D volumes using any 2D image encoder. We apply it to weakly supervised medical image segmentation by training the 2D encoder to output high values for slices containing the regions of interest. We test it on four large-scale medical image datasets and outperform 2D CAM methods. We then extend ToNNO by combining tomographic reconstruction with CAM methods, proposing Averaged CAM and Tomographic CAM, which obtain even better results. +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
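ToNNO reconstructs a heatmap by applying the inverse Radon transform to an encoder's per-slice outputs. The 2D toy below illustrates the same tomography idea with scikit-image: each "slice" score is simply the mean intensity of a row of the rotated image (a stand-in for a trained 2D encoder), and filtered back-projection turns the collected scores into a coarse localisation map. The projection convention and the dummy encoder are assumptions, not the paper's 3D pipeline.

```python
import numpy as np
from scipy.ndimage import rotate
from skimage.transform import iradon

# Synthetic 2D "volume" with a bright region of interest.
image = np.zeros((64, 64))
image[20:30, 35:50] = 1.0

angles = np.arange(0.0, 180.0, 4.0)
sinogram = np.zeros((image.shape[0], len(angles)))
for j, theta in enumerate(angles):
    rotated = rotate(image, angle=theta, reshape=False, order=1)
    # Placeholder "encoder": one scalar prediction per horizontal slice.
    sinogram[:, j] = rotated.mean(axis=1)

# Tomographic reconstruction of the per-slice predictions.
heatmap = iradon(sinogram, theta=angles)
print(heatmap.shape, np.unravel_index(heatmap.argmax(), heatmap.shape))
```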
+ + ☆ Single-sample image-fusion upsampling of fluorescence lifetime images + + +
+ Fluorescence lifetime imaging microscopy (FLIM) provides detailed information +about molecular interactions and biological processes. A major bottleneck for +FLIM is image resolution at high acquisition speeds, due to the engineering and +signal-processing limitations of time-resolved imaging technology. Here we +present single-sample image-fusion upsampling (SiSIFUS), a data-fusion approach +to computational FLIM super-resolution that combines measurements from a +low-resolution time-resolved detector (that measures photon arrival time) and a +high-resolution camera (that measures intensity only). To solve this otherwise +ill-posed inverse retrieval problem, we introduce statistically informed priors +that encode local and global dependencies between the two single-sample +measurements. This bypasses the risk of out-of-distribution hallucination as in +traditional data-driven approaches and delivers enhanced images compared for +example to standard bilinear interpolation. The general approach laid out by +SiSIFUS can be applied to other image super-resolution problems where two +different datasets are available. + +
+
+ comment: 18 pages, 11 figures. To be published in Science Advances +
+
+
+
+
+ + ☆ DensePANet: An improved generative adversarial network for photoacoustic + tomography image reconstruction from sparse data + + +
+ Image reconstruction is an essential step of every medical imaging method, including Photoacoustic Tomography (PAT), a promising imaging modality that unites the benefits of both ultrasound and optical imaging. Reconstruction of PAT images using conventional methods results in rough artifacts, especially when applied directly to sparse PAT data. In recent years, generative adversarial networks (GANs) have shown powerful performance in image generation as well as translation, rendering them a smart choice for reconstruction tasks. In this study, we propose an end-to-end method called DensePANet to solve the problem of PAT image reconstruction from sparse data. The proposed model employs a novel modification of UNet in its generator, called FD-UNet++, which considerably improves the reconstruction performance. We evaluate the method on various in-vivo and simulated datasets. Quantitative and qualitative results show that our model outperforms other prevalent deep learning techniques. +
+
+
+
+
+ + ☆ DISC: Latent Diffusion Models with Self-Distillation from Separated + Conditions for Prostate Cancer Grading CVPR 2024 + + +
+ Latent Diffusion Models (LDMs) can generate high-fidelity images from noise, +offering a promising approach for augmenting histopathology images for training +cancer grading models. While previous works successfully generated +high-fidelity histopathology images using LDMs, the generation of image tiles +to improve prostate cancer grading has not yet been explored. Additionally, +LDMs face challenges in accurately generating admixtures of multiple cancer +grades in a tile when conditioned by a tile mask. In this study, we train +specific LDMs to generate synthetic tiles that contain multiple Gleason Grades +(GGs) by leveraging pixel-wise annotations in input tiles. We introduce a novel +framework named Self-Distillation from Separated Conditions (DISC) that +generates GG patterns guided by GG masks. Finally, we deploy a training +framework for pixel-level and slide-level prostate cancer grading, where +synthetic tiles are effectively utilized to improve the cancer grading +performance of existing models. As a result, this work surpasses previous works +in two domains: 1) our LDMs enhanced with DISC produce more accurate tiles in +terms of GG patterns, and 2) our training scheme, incorporating synthetic data, +significantly improves the generalization of the baseline model for prostate +cancer grading, particularly in challenging cases of rare GG5, demonstrating +the potential of generative models to enhance cancer grading when data is +limited. + +
+
+ comment: Abstract accepted for ISBI 2024. Extended version to be presented at + SynData4CV @ CVPR 2024. See more at https://minhmanho.github.io/disc/ +
+
+
+
+
+ + ☆ Leveraging Visibility Graphs for Enhanced Arrhythmia Classification with + Graph Convolutional Networks + + +
+ Arrhythmias, detectable via electrocardiograms (ECGs), pose significant +health risks, emphasizing the need for robust automated identification +techniques. Although traditional deep learning methods have shown potential, +recent advances in graph-based strategies are aimed at enhancing arrhythmia +detection performance. However, effectively representing ECG signals as graphs +remains a challenge. This study explores graph representations of ECG signals +using Visibility Graph (VG) and Vector Visibility Graph (VVG), coupled with +Graph Convolutional Networks (GCNs) for arrhythmia classification. Through +experiments on the MIT-BIH dataset, we investigated various GCN architectures +and preprocessing parameters. The results reveal that GCNs, when integrated +with VG and VVG for signal graph mapping, can classify arrhythmias without the +need for preprocessing or noise removal from ECG signals. While both VG and VVG +methods show promise, VG is notably more efficient. The proposed approach was +competitive compared to baseline methods, although classifying the S class +remains challenging, especially under the inter-patient paradigm. Computational +complexity, particularly with the VVG method, required data balancing and +sophisticated implementation strategies. The source code is publicly available +for further research and development at +https://github.com/raffoliveira/VG_for_arrhythmia_classification_with_GCN. + +
+
+
+
+
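For readers unfamiliar with how a 1D signal such as an ECG beat becomes a graph, the following sketch builds a natural visibility graph with NetworkX: two samples are connected if the straight line between them stays above every intermediate sample. The O(n^2) loop and the toy signal are for illustration only and are not taken from the paper's code.

```python
import numpy as np
import networkx as nx

def natural_visibility_graph(signal):
    """Each sample is a node; (a, b) is an edge if every sample between them
    lies strictly below the line segment joining (a, y_a) and (b, y_b)."""
    n = len(signal)
    g = nx.Graph()
    g.add_nodes_from(range(n))
    for a in range(n):
        for b in range(a + 1, n):
            visible = all(
                signal[c] < signal[b] + (signal[a] - signal[b]) * (b - c) / (b - a)
                for c in range(a + 1, b)
            )
            if visible:
                g.add_edge(a, b)
    return g

beat = np.sin(np.linspace(0, 4 * np.pi, 50)) + 0.1 * np.random.randn(50)
g = natural_visibility_graph(beat)
# The adjacency matrix (plus node features) can then be fed to a GCN.
print(g.number_of_nodes(), g.number_of_edges())
```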
+ + ♻ ☆ QGen: On the Ability to Generalize in Quantization Aware Training + + +
+ Quantization lowers memory usage, computational requirements, and latency by +utilizing fewer bits to represent model weights and activations. In this work, +we investigate the generalization properties of quantized neural networks, a +characteristic that has received little attention despite its implications on +model performance. In particular, first, we develop a theoretical model for +quantization in neural networks and demonstrate how quantization functions as a +form of regularization. Second, motivated by recent work connecting the +sharpness of the loss landscape and generalization, we derive an approximate +bound for the generalization of quantized models conditioned on the amount of +quantization noise. We then validate our hypothesis by experimenting with over +2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on +convolutional and transformer-based models. + +
+
+
+
+
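QGen studies quantization-aware training and argues that quantization acts like a regularizer whose strength depends on the quantization noise. As a minimal, assumed illustration (not the paper's formulation), the snippet below fake-quantizes weights to k bits with a straight-through estimator, which is the standard way such noise enters quantization-aware training.

```python
import torch

def fake_quantize(w, num_bits=4):
    """Uniform symmetric fake quantization with a straight-through estimator."""
    qmax = 2 ** (num_bits - 1) - 1
    scale = w.detach().abs().max().clamp_min(1e-8) / qmax
    w_q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax) * scale
    # Straight-through: the forward pass uses w_q, the backward pass sees identity.
    return w + (w_q - w).detach()

layer = torch.nn.Linear(16, 4)
x = torch.randn(8, 16)
w_q = fake_quantize(layer.weight, num_bits=4)
out = torch.nn.functional.linear(x, w_q, layer.bias)
out.sum().backward()                        # gradients still flow to layer.weight
noise = (w_q - layer.weight).detach()       # the quantization noise the bound depends on
print(noise.abs().mean().item())
```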
+ + ♻ ☆ One-shot skill assessment in high-stakes domains with limited data via + meta learning + + +
+ Deep Learning (DL) has achieved robust competency assessment in various +high-stakes fields. However, the applicability of DL models is often hampered +by their substantial data requirements and confinement to specific training +domains. This prevents them from transitioning to new tasks where data is +scarce. Therefore, domain adaptation emerges as a critical element for the +practical implementation of DL in real-world scenarios. Herein, we introduce +A-VBANet, a novel meta-learning model capable of delivering domain-agnostic +skill assessment via one-shot learning. Our methodology has been tested by +assessing surgical skills on five laparoscopic and robotic simulators and +real-life laparoscopic cholecystectomy. Our model successfully adapted with +accuracies up to 99.5% in one-shot and 99.9% in few-shot settings for simulated +tasks and 89.7% for laparoscopic cholecystectomy. This study marks the first +instance of a domain-agnostic methodology for skill assessment in critical +fields setting a precedent for the broad application of DL across diverse +real-life domains with limited data. + +
+
+ comment: 23 pages (Main Manuscript + Supplementary Materials + Arxiv Logs), 4 + figures (+2 Supplementary Figures), 2 tables (+5 Supplementary Tables) +
+
+
+
+
+ + ♻ ☆ HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and + Low-Frequency Information of Parametric Models CVPR 2024 + + +
+ Reconstructing 3D clothed human involves creating a detailed geometry of +individuals in clothing, with applications ranging from virtual try-on, movies, +to games. To enable practical and widespread applications, recent advances +propose to generate a clothed human from an RGB image. However, they struggle +to reconstruct detailed and robust avatars simultaneously. We empirically find +that the high-frequency (HF) and low-frequency (LF) information from a +parametric model has the potential to enhance geometry details and improve +robustness to noise, respectively. Based on this, we propose HiLo, namely +clothed human reconstruction with high- and low-frequency information, which +contains two components. 1) To recover detailed geometry using HF information, +we propose a progressive HF Signed Distance Function to enhance the detailed 3D +geometry of a clothed human. We analyze that our progressive learning manner +alleviates large gradients that hinder model convergence. 2) To achieve robust +reconstruction against inaccurate estimation of the parametric model by using +LF information, we propose a spatial interaction implicit function. This +function effectively exploits the complementary spatial information from a +low-resolution voxel grid of the parametric model. Experimental results +demonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and +9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets, +respectively. Additionally, HiLo demonstrates robustness to noise from the +parametric model, challenging poses, and various clothing styles. + +
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ♻ ☆ QDFormer: Towards Robust Audiovisual Segmentation in Complex + Environments with Quantization-based Semantic Decomposition + + +
+ Audiovisual segmentation (AVS) is a challenging task that aims to segment +visual objects in videos according to their associated acoustic cues. With +multiple sound sources and background disturbances involved, establishing +robust correspondences between audio and visual contents poses unique +challenges due to (1) complex entanglement across sound sources and (2) +frequent changes in the occurrence of distinct sound events. Assuming sound +events occur independently, the multi-source semantic space can be represented +as the Cartesian product of single-source sub-spaces. We are motivated to +decompose the multi-source audio semantics into single-source semantics for +more effective interactions with visual content. We propose a semantic +decomposition method based on product quantization, where the multi-source +semantics can be decomposed and represented by several disentangled and +noise-suppressed single-source semantics. Furthermore, we introduce a +global-to-local quantization mechanism, which distills knowledge from stable +global (clip-level) features into local (frame-level) ones, to handle frequent +changes in audio semantics. Extensive experiments demonstrate that our +semantically decomposed audio representation significantly improves AVS +performance, e.g., +21.2% mIoU on the challenging AVS-Semantic benchmark with +ResNet50 backbone. https://github.com/lxa9867/QSD. + +
+
+
+
+
+ + ♻ ☆ An Embodied Generalist Agent in 3D World + + +
+ Leveraging massive knowledge and learning schemes from large language models +(LLMs), recent machine learning models show notable successes in building +generalist agents that exhibit the capability of general-purpose task solving +in diverse domains, including natural language processing, computer vision, and +robotics. However, a significant challenge remains as these models exhibit +limited ability in understanding and interacting with the 3D world. We argue +this limitation significantly hinders the current models from performing +real-world tasks and further achieving general intelligence. To this end, we +introduce an embodied multi-modal and multi-task generalist agent that excels +in perceiving, grounding, reasoning, planning, and acting in the 3D world. Our +proposed agent, referred to as LEO, is trained with shared LLM-based model +architectures, objectives, and weights in two stages: (i) 3D vision-language +alignment and (ii) 3D vision-language-action instruction tuning. To facilitate +the training, we meticulously curate and generate an extensive dataset +comprising object-level and scene-level multi-modal tasks with exceeding scale +and complexity, necessitating a deep understanding of and interaction with the +3D world. Through rigorous experiments, we demonstrate LEO's remarkable +proficiency across a wide spectrum of tasks, including 3D captioning, question +answering, embodied reasoning, embodied navigation, and robotic manipulation. +Our ablation results further provide valuable insights for the development of +future embodied generalist agents. + +
+
+ comment: The first four authors contribute equally. Project page: + https://embodied-generalist.github.io +
+
+
+
+
+ + ♻ ☆ Mitigating Open-Vocabulary Caption Hallucinations + + +
+ While recent years have seen rapid progress in image-conditioned text +generation, image captioning still suffers from the fundamental issue of +hallucinations, namely, the generation of spurious details that cannot be +inferred from the given image. Existing methods largely use closed-vocabulary +object lists to mitigate or evaluate hallucinations in image captioning, +ignoring the long-tailed nature of hallucinations that occur in practice. To +this end, we propose a framework for addressing hallucinations in image +captioning in the open-vocabulary setting. Our framework includes a new +benchmark, OpenCHAIR, that leverages generative foundation models to evaluate +open-vocabulary object hallucinations for image captioning, surpassing the +popular and similarly-sized CHAIR benchmark in both diversity and accuracy. +Furthermore, to mitigate open-vocabulary hallucinations without using a closed +object list, we propose MOCHa, an approach harnessing advancements in +reinforcement learning. Our multi-objective reward function explicitly targets +the trade-off between fidelity and adequacy in generations without requiring +any strong supervision. MOCHa improves a large variety of image captioning +models, as captured by our OpenCHAIR benchmark and other existing metrics. We +will release our code and models. + +
+
+ comment: Website Link: https://assafbk.github.io/mocha/ +
+
+
+
+
+ + ♻ ☆ Feature Corrective Transfer Learning: End-to-End Solutions to Object + Detection in Non-Ideal Visual Conditions CVPR + + +
+ A significant challenge in the field of object detection lies in the system's performance under non-ideal imaging conditions, such as rain, fog, low illumination, or raw Bayer images that lack ISP processing. Our study introduces "Feature Corrective Transfer Learning", a novel approach that leverages transfer learning and a bespoke loss function to facilitate the end-to-end detection of objects in these challenging scenarios without the need to convert non-ideal images into their RGB counterparts. In our methodology, we initially train a comprehensive model on a pristine RGB image dataset. Subsequently, non-ideal images are processed by comparing their feature maps against those from the initial ideal RGB model. This comparison employs the Extended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function designed to quantify similarities and integrate them into the detection loss. This approach refines the model's ability to perform object detection across varying conditions through direct feature map correction, encapsulating the essence of Feature Corrective Transfer Learning. Experimental validation on variants of the KITTI dataset demonstrates a significant improvement in mean Average Precision (mAP): a 3.8-8.1% relative enhancement in detection under non-ideal conditions compared to the baseline model, and a marginal performance difference, within 1.3% of the mAP@[0.5:0.95] achieved under ideal conditions by the standard Faster R-CNN algorithm. +
+
+ comment: 2024 CVPR UG2+ Workshop +
+
+
+
+
+ + ♻ ☆ RANRAC: Robust Neural Scene Representations via Random Ray Consensus + + +
+ Learning-based scene representations such as neural radiance fields or light +field networks, that rely on fitting a scene model to image observations, +commonly encounter challenges in the presence of inconsistencies within the +images caused by occlusions, inaccurately estimated camera parameters or +effects like lens flare. To address this challenge, we introduce RANdom RAy +Consensus (RANRAC), an efficient approach to eliminate the effect of +inconsistent data, thereby taking inspiration from classical RANSAC based +outlier detection for model fitting. In contrast to the down-weighting of the +effect of outliers based on robust loss formulations, our approach reliably +detects and excludes inconsistent perspectives, resulting in clean images +without floating artifacts. For this purpose, we formulate a fuzzy adaption of +the RANSAC paradigm, enabling its application to large scale models. We +interpret the minimal number of samples to determine the model parameters as a +tunable hyperparameter, investigate the generation of hypotheses with +data-driven models, and analyze the validation of hypotheses in noisy +environments. We demonstrate the compatibility and potential of our solution +for both photo-realistic robust multi-view reconstruction from real-world +images based on neural radiance fields and for single-shot reconstruction based +on light-field networks. In particular, the results indicate significant +improvements compared to state-of-the-art robust methods for novel-view +synthesis on both synthetic and captured scenes with various inconsistencies +including occlusions, noisy camera pose estimates, and unfocused perspectives. +The results further indicate significant improvements for single-shot +reconstruction from occluded images. Project Page: +https://bennobuschmann.com/ranrac/ + +
+
+
+
+
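RANRAC adapts the classical RANSAC idea of fitting a model to random minimal sample sets and scoring it by consensus. The generic skeleton below (plain least-squares line fitting on synthetic data with injected outliers) shows only that loop structure; the fuzzy hypothesis scoring and the radiance-field or light-field models used in the paper are far more involved.

```python
import numpy as np

def ransac_line(points, n_iters=200, sample_size=2, inlier_thresh=0.1, rng=None):
    rng = rng or np.random.default_rng(0)
    best_model, best_inliers = None, 0
    for _ in range(n_iters):
        idx = rng.choice(len(points), size=sample_size, replace=False)
        x, y = points[idx, 0], points[idx, 1]
        # Fit y = a*x + b to the minimal sample.
        a, b = np.polyfit(x, y, deg=1)
        residuals = np.abs(points[:, 1] - (a * points[:, 0] + b))
        inliers = int((residuals < inlier_thresh).sum())
        if inliers > best_inliers:
            best_model, best_inliers = (a, b), inliers
    return best_model, best_inliers

rng = np.random.default_rng(0)
x = rng.uniform(0, 1, 100)
y = 2.0 * x + 0.5 + 0.02 * rng.standard_normal(100)
y[:20] += rng.uniform(1, 3, 20)             # inconsistent observations ("floaters")
model, inliers = ransac_line(np.stack([x, y], axis=1))
print(model, inliers)
```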
+ + ♻ ☆ RefinedFields: Radiance Fields Refinement for Unconstrained Scenes + + +
+ Modeling large scenes from unconstrained images has proven to be a major +challenge in computer vision. Existing methods tackling in-the-wild scene +modeling operate in closed-world settings, where no conditioning on priors +acquired from real-world images is present. We propose RefinedFields, which is, +to the best of our knowledge, the first method leveraging pre-trained models to +improve in-the-wild scene modeling. We employ pre-trained networks to refine +K-Planes representations via optimization guidance using an alternating +training procedure. We carry out extensive experiments and verify the merit of +our method on synthetic data and real tourism photo collections. RefinedFields +enhances rendered scenes with richer details and improves upon its base +representation on the task of novel view synthesis in the wild. Our project +page can be found at https://refinedfields.github.io. + +
+
+ comment: Corrected Table 2, where some comparisons were done among models + trained at different resolutions +
+
+
+
+
+ + ♻ ☆ On the Pitfalls of Batch Normalization for End-to-End Video Learning: A + Study on Surgical Workflow Analysis + + +
+ Batch Normalization's (BN) unique property of depending on other samples in a batch is known to cause problems in several tasks, including sequence modeling. Yet, BN-related issues are hardly studied for long video understanding, despite the ubiquitous use of BN in CNNs (Convolutional Neural Networks) for feature extraction. Especially in surgical workflow analysis, where the lack of pretrained feature extractors has led to complex, multi-stage training pipelines, limited awareness of BN issues may have hidden the benefits of training CNNs and temporal models end to end. In this paper, we analyze pitfalls of BN in video learning, including issues specific to online tasks such as a 'cheating' effect in anticipation. We observe that BN's properties create major obstacles for end-to-end learning. However, using BN-free backbones, even simple CNN-LSTMs beat the state of the art on three surgical workflow benchmarks by utilizing adequate end-to-end training strategies which maximize temporal context. We conclude that awareness of BN's pitfalls is crucial for effective end-to-end learning in surgical tasks. By reproducing results on natural-video datasets, we hope our insights will benefit other areas of video learning as well. Code is available at: https://gitlab.com/nct_tso_public/pitfalls_bn +
+
+ comment: Accepted at Medical Image Analysis (MedIA). Publication link: + https://www.sciencedirect.com/science/article/pii/S1361841524000513 +
+
+
+
+
+ + ♻ ☆ Leveraging Automatic Personalised Nutrition: Food Image Recognition + Benchmark and Dataset based on Nutrition Taxonomy + + +
+ Maintaining a healthy lifestyle has become increasingly challenging in +today's sedentary society marked by poor eating habits. To address this issue, +both national and international organisations have made numerous efforts to +promote healthier diets and increased physical activity. However, implementing +these recommendations in daily life can be difficult, as they are often generic +and not tailored to individuals. This study presents the AI4Food-NutritionDB +database, the first nutrition database that incorporates food images and a +nutrition taxonomy based on recommendations by national and international +health authorities. The database offers a multi-level categorisation, +comprising 6 nutritional levels, 19 main categories (e.g., "Meat"), 73 +subcategories (e.g., "White Meat"), and 893 specific food products (e.g., +"Chicken"). The AI4Food-NutritionDB opens the doors to new food computing +approaches in terms of food intake frequency, quality, and categorisation. +Also, we present a standardised experimental protocol and benchmark including +three tasks based on the nutrition taxonomy (i.e., category, subcategory, and +final product recognition). These resources are available to the research +community, including our deep learning models trained on AI4Food-NutritionDB, +which can serve as pre-trained models, achieving accurate recognition results +for challenging food image databases. + +
+
+ comment: 12 pages, 4 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Multi-modal vision-language model for generalizable annotation-free + pathological lesions localization and clinical diagnosis + + +
+ Defining pathologies automatically from medical images aids the understanding of the emergence and progression of diseases, and such an ability is crucial in clinical diagnostics. However, existing deep learning models heavily rely on expert annotations and lack generalization capabilities in open clinical environments. In this study, we present a generalizable vision-language pre-training model for Annotation-Free pathological lesions Localization (AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic structure-based contrastive learning, which comprehensively aligns multi-granularity medical concepts from reports with abundant image features, to adapt to the diverse expressions of pathologies and unseen pathologies without relying on image annotations from experts. We demonstrate a proof of concept on CXR images, with extensive experimental validation across 4 distinct external datasets, encompassing 11 types of chest pathologies. The results demonstrate that AFLoc surpasses state-of-the-art methods in pathological lesion localization and disease classification, and even outperforms the human benchmark in locating 5 different pathologies. Additionally, we further verify its generalization ability by applying it to retinal fundus images. Our approach showcases AFLoc's versatility and underscores its suitability for clinical diagnosis in complex clinical environments. +
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis + + +
+ Artificial intelligence (AI) in healthcare, especially in medical imaging, +faces challenges due to data scarcity and privacy concerns. Addressing these, +we introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI +synthesis. This model effectively tackles data scarcity and privacy issues by +integrating semantic conditioning. This involves the channel-wise concatenation +of a conditioning image to the model input, enabling control in image +generation. Med-DDPM demonstrates superior stability and performance compared +to existing 3D brain imaging synthesis methods. It generates diverse, +anatomically coherent images with high visual fidelity. In terms of dice score +accuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the +0.6531 accuracy of real images, and outperforms baseline models. Combined with +real images, it further increases segmentation accuracy to 0.6675, showing the +potential of our proposed method for data augmentation. This model represents +the first use of a diffusion model in 3D semantic brain MRI synthesis, +producing high-quality images. Its semantic conditioning feature also shows +potential for image anonymization in biomedical imaging, addressing data and +privacy issues. We provide the code and model weights for Med-DDPM on our +GitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support +reproducibility. + +
+
+ comment: This document is a preprint and has been accepted for publication in + the IEEE Journal of Biomedical and Health Informatics. The final, published + version can be accessed using the following DOI: 10.1109/JBHI.2024.3385504. + Copyright for this article has been transferred to IEEE +
+
+
+
+
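Med-DDPM's semantic conditioning is described as a channel-wise concatenation of the conditioning image with the model input. The fragment below shows only that concatenation step for a 3D denoiser, with made-up tensor shapes and a trivial network; it is a sketch of the conditioning mechanism, not the released Med-DDPM code.

```python
import torch
import torch.nn as nn

class ConditionedDenoiser(nn.Module):
    """Denoiser whose first conv sees the noisy volume stacked with the label map."""

    def __init__(self, image_channels=1, cond_channels=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv3d(image_channels + cond_channels, 32, 3, padding=1), nn.SiLU(),
            nn.Conv3d(32, image_channels, 3, padding=1),
        )

    def forward(self, noisy_volume, condition):
        # Channel-wise concatenation of the conditioning segmentation map.
        return self.net(torch.cat([noisy_volume, condition], dim=1))

noisy = torch.randn(1, 1, 32, 32, 32)             # noisy MRI volume at some timestep
labels = torch.randint(0, 4, (1, 32, 32, 32))     # semantic mask with 4 classes
condition = torch.nn.functional.one_hot(labels, 4).permute(0, 4, 1, 2, 3).float()
pred_noise = ConditionedDenoiser()(noisy, condition)
print(pred_noise.shape)
```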
+ + ♻ ☆ Modeling Hierarchical Structural Distance for Unsupervised Domain + Adaptation + + +
+ Unsupervised domain adaptation (UDA) aims to estimate a transferable model for unlabeled target domains by exploiting labeled source data. Optimal Transport (OT) based methods have recently been proven to be a promising solution for UDA, with a solid theoretical foundation and competitive performance. However, most of these methods solely focus on domain-level OT alignment, leveraging the geometry of domains to obtain domain-invariant features based on the global embeddings of images. Such global representations may destroy image structure, leading to the loss of local details that offer category-discriminative information. This study proposes an end-to-end Deep Hierarchical Optimal Transport method (DeepHOT), which aims to learn both domain-invariant and category-discriminative representations by mining hierarchical structural relations among domains. The main idea is to incorporate a domain-level OT and an image-level OT into a unified OT framework, hierarchical optimal transport, to model the underlying geometry in both domain space and image space. In the DeepHOT framework, the image-level OT serves as the ground distance metric for the domain-level OT, leading to the hierarchical structural distance. Compared with the ground distance of the conventional domain-level OT, the image-level OT captures structural associations among local regions of images that are beneficial to classification. In this way, DeepHOT, a unified OT framework, not only aligns domains by domain-level OT, but also enhances the discriminative power through image-level OT. Moreover, to overcome the limitation of high computational complexity, we propose a robust and efficient implementation of DeepHOT by approximating the original OT with the sliced Wasserstein distance in the image-level OT and accomplishing mini-batch unbalanced domain-level OT. +
+
+ comment: accepted by TCVST, code: https://github.com/Innse/DeepHOT +
+
+
+
+
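DeepHOT's efficient variant approximates the image-level optimal transport with the sliced Wasserstein distance. A self-contained version of that distance between two equal-sized sets of local features (random 1D projections, then sorted comparison) is sketched here under assumed shapes; it is the standard sliced Wasserstein estimator rather than the paper's full hierarchical OT.

```python
import torch

def sliced_wasserstein(x, y, num_projections=64):
    """x, y: (n, dim) feature sets with equal n for this simple sketch."""
    dim = x.shape[1]
    theta = torch.randn(num_projections, dim)
    theta = theta / theta.norm(dim=1, keepdim=True)   # random unit directions
    x_proj = x @ theta.T                              # (n, num_projections)
    y_proj = y @ theta.T
    # 1D optimal transport between sorted projections, averaged over directions.
    x_sorted, _ = torch.sort(x_proj, dim=0)
    y_sorted, _ = torch.sort(y_proj, dim=0)
    return ((x_sorted - y_sorted) ** 2).mean()

# Hypothetical local-region features of a source image and a target image.
src_regions = torch.randn(49, 256)
tgt_regions = torch.randn(49, 256)
print(sliced_wasserstein(src_regions, tgt_regions).item())
```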
+ + ♻ ☆ Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection is essential in autonomous driving as it extracts +structural and traffic information from the road in three-dimensional space, +aiding self-driving cars in logical, safe, and comfortable path planning and +motion control. Given the cost of sensors and the advantages of visual data in +color information, 3D lane detection based on monocular vision is an important +research direction in the realm of autonomous driving, increasingly gaining +attention in both industry and academia. Regrettably, recent advancements in +visual perception seem inadequate for the development of fully reliable 3D lane +detection algorithms, which also hampers the progress of vision-based fully +autonomous vehicles. We believe that there is still considerable room for +improvement in 3D lane detection algorithms for autonomous vehicles using +visual sensors, and significant enhancements are needed. This review looks back +and analyzes the current state of achievements in the field of 3D lane +detection research. It covers all current monocular-based 3D lane detection +processes, discusses the performance of these cutting-edge algorithms, analyzes +the time complexity of various algorithms, and highlights the main achievements +and limitations of ongoing research efforts. The survey also includes a +comprehensive discussion of available 3D lane detection datasets and the +challenges that researchers face but have not yet resolved. Finally, our work +outlines future research directions and invites researchers and practitioners +to join this exciting field. + +
+
+
+
+
+ + ♻ ☆ Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis SIGGRAPH 2024 + + +
+ Designing a 3D representation of a dynamic scene for fast optimization and +rendering is a challenging task. While recent explicit representations enable +fast learning and rendering of dynamic radiance fields, they require a dense +set of input viewpoints. In this work, we focus on learning a fast +representation for dynamic radiance fields with sparse input viewpoints. +However, the optimization with sparse input is under-constrained and +necessitates the use of motion priors to constrain the learning. Existing fast +dynamic scene models do not explicitly model the motion, making them difficult +to be constrained with motion priors. We design an explicit motion model as a +factorized 4D representation that is fast and can exploit the spatio-temporal +correlation of the motion field. We then introduce reliable flow priors +including a combination of sparse flow priors across cameras and dense flow +priors within cameras to regularize our motion model. Our model is fast, +compact and achieves very good performance on popular multi-view dynamic scene +datasets with sparse input viewpoints. The source code for our model can be +found on our project page: +https://nagabhushansn95.github.io/publications/2024/RF-DeRF.html. + +
+
+ comment: Accepted at SIGGRAPH 2024 +
+
+
+
+
+ + ♻ ☆ Overcoming Generic Knowledge Loss with Selective Parameter Update + + +
+ Foundation models encompass an extensive knowledge base and offer remarkable +transferability. However, this knowledge becomes outdated or insufficient over +time. The challenge lies in continuously updating foundation models to +accommodate novel information while retaining their original capabilities. +Leveraging the fact that foundation models have initial knowledge on various +tasks and domains, we propose a novel approach that, instead of updating all +parameters equally, localizes the updates to a sparse set of parameters +relevant to the task being learned. We strike a balance between efficiency and +new task performance, while maintaining the transferability and +generalizability of foundation models. We extensively evaluate our method on +foundational vision-language models with a diverse spectrum of continual +learning tasks. Our method achieves improvements on the accuracy of the newly +learned tasks up to 7% while preserving the pretraining knowledge with a +negligible decrease of 0.9% on a representative control set accuracy. + +
+
+
+
+
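The selective-update idea above, localising continual-learning updates to a sparse set of task-relevant parameters, can be prototyped by masking gradients outside the top-k most relevant entries. The relevance score used here (gradient magnitude on a batch of the new task) and the sparsity level are illustrative choices, not the paper's selection criterion.

```python
import torch

def sparse_update_masks(model, loss, sparsity=0.99):
    """Keep only the top (1 - sparsity) fraction of parameters, scored by |grad|."""
    params = [p for p in model.parameters()]
    grads = torch.autograd.grad(loss, params, retain_graph=True)
    masks = []
    for g in grads:
        k = max(1, int(g.numel() * (1 - sparsity)))
        thresh = g.abs().flatten().topk(k).values.min()
        masks.append((g.abs() >= thresh).float())
    return masks

model = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(),
                            torch.nn.Linear(64, 5))
x, y = torch.randn(16, 32), torch.randint(0, 5, (16,))
loss = torch.nn.functional.cross_entropy(model(x), y)
masks = sparse_update_masks(model, loss, sparsity=0.99)

# During training on the new task, zero out gradients of non-selected parameters.
loss.backward()
with torch.no_grad():
    for p, m in zip(model.parameters(), masks):
        p.grad.mul_(m)
print([int(m.sum()) for m in masks])
```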
+ + ♻ ☆ Koala: Key frame-conditioned long video-LLM CVPR 2024 + + +
+ Long video question answering is a challenging task that involves recognizing +short-term activities and reasoning about their fine-grained relationships. +State-of-the-art video Large Language Models (vLLMs) hold promise as a viable +solution due to their demonstrated emergent capabilities on new tasks. However, +despite being trained on millions of short seconds-long videos, vLLMs are +unable to understand minutes-long videos and accurately answer questions about +them. To address this limitation, we propose a lightweight and self-supervised +approach, Key frame-conditioned long video-LLM (Koala), that introduces +learnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to +longer videos. Our approach introduces two new tokenizers that condition on +visual tokens computed from sparse video key frames for understanding short and +long video moments. We train our proposed approach on HowTo100M and demonstrate +its effectiveness on zero-shot long video understanding benchmarks, where it +outperforms state-of-the-art large models by 3 - 6% in absolute accuracy across +all tasks. Surprisingly, we also empirically show that our approach not only +helps a pretrained vLLM to understand long videos but also improves its +accuracy on short-term action recognition. + +
+
+ comment: Accepted at CVPR 2024 as a poster highlight +
+
+
+
+
+ + ♻ ☆ Efficient Backdoor Attacks for Deep Neural Networks in Real-world + Scenarios ICLR 2024 + + +
+ Recent deep neural networks (DNNs) have come to rely on vast amounts of training data, providing an opportunity for malicious attackers to exploit and contaminate the data to carry out backdoor attacks. However, existing backdoor attack methods rest on the unrealistic assumptions that all training data comes from a single source and that attackers have full access to the training data. In this paper, we introduce a more realistic attack scenario where victims collect data from multiple sources and attackers cannot access the complete training data. We refer to this scenario as data-constrained backdoor attacks. In such cases, previous attack methods suffer from severe efficiency degradation due to the entanglement between benign and poisoning features during the backdoor injection process. To tackle this problem, we introduce three CLIP-based technologies from two distinct streams, Clean Feature Suppression and Poisoning Feature Augmentation, which together provide an effective solution for data-constrained backdoor attacks. The results demonstrate remarkable improvements, with some settings achieving over 100% improvement compared to existing attacks in data-constrained scenarios. Code is available at https://github.com/sunh1113/Efficient-backdoor-attacks-for-deep-neural-networks-in-real-world-scenarios
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental + Semantic Segmentation + + +
+ Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a +pre-trained segmentation model to segment new classes using cost-effective and +readily available image-level labels. A prevailing way to solve WILSS is the +generation of seed areas for each new class, serving as a form of pixel-level +supervision. However, a scenario usually arises where a pixel is concurrently +predicted as an old class by the pre-trained segmentation model and a new class +by the seed areas. Such a scenario becomes particularly problematic in WILSS, +as the lack of pixel-level annotations on new classes makes it intractable to +ascertain whether the pixel pertains to the new class or not. To surmount this +issue, we propose an innovative, tendency-driven relationship of mutual +exclusivity, meticulously tailored to govern the behavior of the seed areas and +the predictions generated by the pre-trained segmentation model. This +relationship stipulates that predictions for the new and old classes must not +conflict whilst prioritizing the preservation of predictions for the old +classes, which not only addresses the conflicting prediction issue but also +effectively mitigates the inherent challenge of incremental learning - +catastrophic forgetting. Furthermore, under the auspices of this +tendency-driven mutual exclusivity relationship, we generate pseudo masks for +the new classes, allowing for concurrent execution with model parameter +updating via the resolution of a bi-level optimization problem. Extensive +experiments substantiate the effectiveness of our framework, resulting in the +establishment of new benchmarks and paving the way for further research in this +field. + +
+
+
+
+
+ + ♻ ☆ EATFormer: Improving Vision Transformer Inspired by Evolutionary + Algorithm + + +
+ Motivated by biological evolution, this paper explains the rationality of the Vision Transformer by analogy with the proven, practical Evolutionary Algorithm (EA) and shows that the two share a consistent mathematical formulation. Then, inspired by effective EA variants, we propose a novel pyramid EATFormer backbone that only contains the proposed EA-based Transformer (EAT) block, which consists of three residual parts, i.e., Multi-Scale Region Aggregation (MSRA), Global and Local Interaction (GLI), and Feed-Forward Network (FFN) modules, to model multi-scale, interactive, and individual information separately. Moreover, we design a Task-Related Head (TRH) docked with the transformer backbone to complete final information fusion more flexibly, and improve a Modulated Deformable MSA (MD-MSA) to dynamically model irregular locations. Extensive quantitative and qualitative experiments on image classification, downstream tasks, and explanatory studies demonstrate the effectiveness and superiority of our approach over State-Of-The-Art (SOTA) methods. For example, our Mobile (1.8M), Tiny (6.1M), Small (24.3M), and Base (49.0M) models achieve 69.4, 78.4, 83.1, and 83.9 Top-1 accuracy when trained only on ImageNet-1K with a naive training recipe; Mask R-CNN armed with EATFormer-Tiny/Small/Base obtains 45.4/47.4/49.0 box AP and 41.4/42.9/44.2 mask AP on COCO detection, surpassing contemporary MPViT-T, Swin-T, and Swin-S by 0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP respectively with fewer FLOPs; and our EATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K with UperNet, exceeding Swin-T/S by 2.8/1.7. Code is available at https://github.com/zhangzjn/EATFormer.
+
+
+
+
+ + ♻ ☆ Heterogeneous Federated Learning with Splited Language Model + + +
+ Federated Split Learning (FSL) is a promising distributed learning paradigm that combines the strengths of Federated Learning (FL) and Split Learning (SL) to ensure model privacy while reducing the resource overhead of each client, especially for large transformer models in resource-constrained environments such as the Internet of Things (IoT). However, almost all existing work investigates FSL only with simple neural network models. The few efforts that incorporate Vision Transformers (ViTs) as the model architecture train them from scratch, leading to enormous training overhead on resource-limited devices. Therefore, in this paper, we harness Pre-trained Image Transformers (PITs) as the initial model, coined FedV, to accelerate training and improve model robustness. Furthermore, we propose FedVZ to hinder gradient inversion attacks, in particular remaining compatible with black-box scenarios where gradient information is unavailable. Concretely, FedVZ approximates the server gradient with zeroth-order (ZO) optimization, which replaces backward propagation with a single forward pass. Empirically, we are the first to provide a systematic evaluation of FSL methods with PITs on real-world datasets, different partial device participation rates, and heterogeneous data splits. Our experiments verify the effectiveness of our algorithms.
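The zeroth-order trick mentioned in the FedVZ abstract above, estimating a gradient from forward passes alone, is a standard technique that can be sketched in a few lines. The snippet below is a generic two-point random-direction estimator, not the authors' implementation (FedVZ reportedly needs only one forward process per query); the function name, the Gaussian perturbations, and the toy objective are illustrative assumptions.

```python
import numpy as np

def zo_gradient(f, x, eps=1e-3, num_samples=16, rng=None):
    """Estimate grad f(x) with two-point zeroth-order finite differences.

    Only forward evaluations of f are required, so no backpropagation graph
    has to be kept -- the property that makes ZO estimates attractive when
    the true gradient is unavailable (black-box setting).
    """
    rng = np.random.default_rng() if rng is None else rng
    grad = np.zeros_like(x)
    for _ in range(num_samples):
        u = rng.standard_normal(size=x.shape)      # random probing direction
        delta = f(x + eps * u) - f(x - eps * u)    # two forward evaluations
        grad += (delta / (2.0 * eps)) * u          # directional derivative times direction
    return grad / num_samples

# toy check: the estimate for sum(x^2) should be close to 2*x
f = lambda x: float(np.sum(x ** 2))
print(zo_gradient(f, np.array([1.0, -2.0, 0.5]), num_samples=200))
```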
+
+
+
+
+ + ♻ ☆ Joint Coordinate Regression and Association For Multi-Person Pose + Estimation, A Pure Neural Network Approach + + +
+ We introduce a novel one-stage end-to-end multi-person 2D pose estimation algorithm, known as Joint Coordinate Regression and Association (JCRA), that produces human pose joints and associations without requiring any post-processing. The proposed algorithm is fast, accurate, effective, and simple. The one-stage end-to-end network architecture significantly improves the inference speed of JCRA. Meanwhile, we devised a symmetric network structure for both the encoder and decoder, which ensures high accuracy in identifying keypoints. It follows an architecture that directly outputs part positions via a transformer network, resulting in a significant improvement in performance. Extensive experiments on the MS COCO and CrowdPose benchmarks demonstrate that JCRA outperforms state-of-the-art approaches in both accuracy and efficiency. Moreover, JCRA achieves 69.2 mAP while being 78% faster at inference than previous state-of-the-art bottom-up algorithms. The code for this algorithm will be publicly available.
+
+ comment: This paper has been accepted by MMAsia 2023 as an oral presentation
+
+
+
+
+ + ♻ ☆ Exploring Radar Data Representations in Autonomous Driving: A + Comprehensive Review + + +
+ With the rapid advancement of sensor technology and deep learning, autonomous driving systems are making safe and efficient intelligent vehicles and intelligent transportation possible. Among the equipped sensors, the radar sensor plays a crucial role in providing robust perception information in diverse environmental conditions. This review focuses on exploring different radar data representations utilized in autonomous driving systems. Firstly, we introduce the capabilities and limitations of the radar sensor by examining the working principles of radar perception and the signal processing of radar measurements. Then, we delve into the generation process of five radar representations: the ADC signal, radar tensor, point cloud, grid map, and micro-Doppler signature. For each radar representation, we examine the related datasets, methods, advantages, and limitations. Furthermore, we discuss the challenges faced in these data representations and propose potential research directions. Overall, this comprehensive review offers an in-depth insight into how these representations enhance autonomous system capabilities, providing guidance for radar perception researchers. To facilitate retrieval and comparison of different data representations, datasets, and methods, we provide an interactive website at https://radar-camera-fusion.github.io/radar.
+
+ comment: 24 pages, 10 figures, 5 tables. arXiv admin note: text overlap with + arXiv:2304.10410 +
+
+
+
+
+ + ♻ ☆ MARIS: Referring Image Segmentation via Mutual-Aware Attention Features + + +
+ Referring image segmentation (RIS) aims to segment a particular region based +on a language expression prompt. Existing methods incorporate linguistic +features into visual features and obtain multi-modal features for mask +decoding. However, these methods may segment the visually salient entity +instead of the correct referring region, as the multi-modal features are +dominated by the abundant visual context. In this paper, we propose MARIS, a +referring image segmentation method that leverages the Segment Anything Model +(SAM) and introduces a mutual-aware attention mechanism to enhance the +cross-modal fusion via two parallel branches. Specifically, our mutual-aware +attention mechanism consists of Vision-Guided Attention and Language-Guided +Attention, which bidirectionally model the relationship between visual and +linguistic features. Correspondingly, we design a Mask Decoder to enable +explicit linguistic guidance for more consistent segmentation with the language +expression. To this end, a multi-modal query token is proposed to integrate +linguistic information and interact with visual information simultaneously. +Extensive experiments on three benchmark datasets show that our method +outperforms the state-of-the-art RIS methods. Our code will be publicly +available. + +
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of autonomous agents, requiring flexibly adapting the underlying decision-making strategies and, as we argue in this work, also adapting the perception modules. An analogous example is the human visual system, which uses top-down signals to focus attention as determined by the current task. Similarly, we adapt pre-trained large vision models conditioned on specific downstream tasks in the context of multi-task policy learning. We introduce task-conditioned adapters that do not require finetuning any pre-trained weights, combined with a single policy trained with behavior cloning and capable of addressing multiple tasks. We condition the visual adapters on task embeddings, which can be selected at inference if the task is known, or alternatively inferred from a set of example demonstrations. To this end, we propose a new optimization-based estimator. We evaluate the method on a wide variety of tasks from the CortexBench benchmark and show that, in contrast to existing work, they can all be addressed with a single policy. In particular, we demonstrate that adapting visual features is a key design choice and that the method generalizes to unseen tasks given a few demonstrations.
+
+
+
+
+ + ♻ ☆ MovePose: A High-performance Human Pose Estimation Algorithm on Mobile + and Edge Devices + + +
+ We present MovePose, an optimized lightweight convolutional neural network designed specifically for real-time body pose estimation on CPU-based mobile devices. Current solutions do not provide satisfactory accuracy and speed for human posture estimation, and MovePose addresses this gap, aiming to maintain real-time performance while improving the accuracy of human posture estimation on mobile devices. Our MovePose algorithm has attained a Mean Average Precision (mAP) score of 68.0 on the COCO validation dataset. The MovePose algorithm runs at 69+ frames per second (fps) on an Intel i9-10920x CPU and at 452+ fps on an NVIDIA RTX3090 GPU. On an Android phone equipped with a Snapdragon 8 + 4G processor, the fps reached above 11. To enhance accuracy, we incorporated three techniques: deconvolution, large kernel convolution, and coordinate classification methods. Compared to basic upsampling, deconvolution is trainable, improves model capacity, and enhances the receptive field. Large kernel convolution strengthens these properties at a decreased computational cost. In summary, MovePose provides high accuracy and real-time performance, making it a potential tool for a variety of applications, including those focused on mobile-side human posture estimation. The code and models for this algorithm will be made publicly accessible.
+
+
+
+
+ + ♻ ☆ List-Mode PET Image Reconstruction Using Dykstra-Like Splitting + + +
+ Convergence of the block iterative method in image reconstruction for positron emission tomography (PET) requires careful control of relaxation parameters, which is a challenging task. The automatic determination of relaxation parameters for list-mode reconstructions also remains challenging. Therefore, a different approach would be desirable. In this study, we propose a list-mode maximum likelihood Dykstra-like splitting PET reconstruction (LM-MLDS). LM-MLDS makes the list-mode block iterative method converge by adding the distance from an initial image as a penalty term to the objective function. LM-MLDS takes a two-step approach because its performance depends on the quality of the initial image. The first step uses a uniform image as the initial image, and the second step uses the image reconstructed after one main iteration as the initial image. In a simulation study, LM-MLDS provided a better tradeoff curve between noise and contrast than the other methods. In a clinical study, LM-MLDS removed false hotspots at the edge of the axial field of view and improved the image quality of slices covering the top of the head to the cerebellum. List-mode proximal splitting reconstruction is useful not only for optimizing nondifferentiable functions but also for making block iterative methods converge without controlling relaxation parameters.
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A Multimodal Fusion Network For Student Emotion Recognition Based on + Transformer and Tensor Product + + +
+ This paper introduces a new multi-modal model based on the Transformer +architecture and tensor product fusion strategy, combining BERT's text vectors +and ViT's image vectors to classify students' psychological conditions, with an +accuracy of 93.65%. The purpose of the study is to accurately analyze the +mental health status of students from various data sources. This paper +discusses modal fusion methods, including early, late and intermediate fusion, +to overcome the challenges of integrating multi-modal information. Ablation +studies compare the performance of different models and fusion techniques, +showing that the proposed model outperforms existing methods such as CLIP and +ViLBERT in terms of accuracy and inference speed. Conclusions indicate that +while this model has significant advantages in emotion recognition, its +potential to incorporate other data modalities provides areas for future +research. + +
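For readers unfamiliar with the tensor-product fusion this abstract describes, the outer product of a text vector and an image vector yields a joint feature containing every pairwise interaction between the two modalities, which is then fed to a classifier. The sketch below is a minimal PyTorch illustration under assumed dimensions (768-d BERT and ViT vectors, four classes); it is not the paper's actual architecture.

```python
import torch
import torch.nn as nn

class TensorProductFusion(nn.Module):
    """Fuse a text vector and an image vector via their outer product."""

    def __init__(self, text_dim=768, image_dim=768, num_classes=4):
        super().__init__()
        self.classifier = nn.Linear(text_dim * image_dim, num_classes)

    def forward(self, text_vec, image_vec):
        # (B, T) x (B, I) -> (B, T, I): every pairwise interaction of the two modalities
        fused = torch.einsum("bt,bi->bti", text_vec, image_vec)
        return self.classifier(fused.flatten(start_dim=1))

# toy usage with random features standing in for BERT/ViT outputs
model = TensorProductFusion()
logits = model(torch.randn(2, 768), torch.randn(2, 768))
print(logits.shape)  # torch.Size([2, 4])
```

Note that the flattened outer product grows as text_dim * image_dim, so practical systems often compress it (e.g., with low-rank factorization) before classification.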
+
+
+
+
+ + ♻ ☆ RanLayNet: A Dataset for Document Layout Detection used for Domain + Adaptation and Generalization + + +
+ Large ground-truth datasets and recent advances in deep learning techniques have been useful for layout detection. However, because of the restricted layout diversity of these datasets, training on them requires a sizable number of annotated instances, which is both expensive and time-consuming. As a result, differences between the source and target domains may significantly impact how well these models function. To solve this problem, domain adaptation approaches have been developed that use a small quantity of labeled data to adjust the model to the target domain. In this research, we introduce a synthetic document dataset called RanLayNet, enriched with automatically assigned labels denoting spatial positions, ranges, and types of layout elements. The primary aim of this endeavor is to develop a versatile dataset capable of training models with robustness and adaptability to diverse document formats. Through empirical experimentation, we demonstrate that a deep layout identification model trained on our dataset exhibits enhanced performance compared to a model trained solely on actual documents. Moreover, we conduct a comparative analysis by fine-tuning inference models using both the PubLayNet and IIIT-AR-13K datasets on the DocLayNet dataset. Our findings emphasize that models trained with our dataset are well suited to such tasks, achieving mAP95 scores of 0.398 and 0.588 for the TABLE class in the scientific document domain.
+
+ comment: 8 pages, 6 figures, MMAsia 2023 Proceedings of the 5th ACM + International Conference on Multimedia in Asia +
+
+
+
+
+ + ♻ ☆ TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table + Structure & Content + + +
+ The automatic recognition of tabular data in document images presents a +significant challenge due to the diverse range of table styles and complex +structures. Tables offer valuable content representation, enhancing the +predictive capabilities of various systems such as search engines and Knowledge +Graphs. Addressing the two main problems, namely table detection (TD) and table +structure recognition (TSR), has traditionally been approached independently. +In this research, we propose an end-to-end pipeline that integrates deep +learning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve +comprehensive image-based table recognition. This integrated approach +effectively handles diverse table styles, complex structures, and image +distortions, resulting in improved accuracy and efficiency compared to existing +methods like Table Transformers. Our system achieves simultaneous table +detection (TD), table structure recognition (TSR), and table content +recognition (TCR), preserving table structures and accurately extracting +tabular data from document images. The integration of multiple models addresses +the intricacies of table recognition, making our approach a promising solution +for image-based table understanding, data extraction, and information retrieval +applications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy +of 78%, showcasing a remarkable improvement of approximately 25% in the OCR +Accuracy compared to the previous Table Transformer approach. + +
+
+ comment: 8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for + Information Retrieval +
+
+
+
+
+ + ♻ ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled image recognition has emerged as a potent tool for addressing challenges in traditional environmental monitoring. This study focuses on the detection of floating objects in river and lake environments, exploring an innovative approach based on deep learning. By intricately analyzing the technical pathways for detecting static and dynamic features and considering the characteristics of river and lake debris, a comprehensive image acquisition and processing workflow has been developed. The study highlights the application and performance comparison of three mainstream deep learning models - SSD, Faster-RCNN, and YOLOv5 - in debris identification. Additionally, a detection system for floating objects has been designed and implemented, encompassing both hardware platform construction and software framework development. Through rigorous experimental validation, the proposed system has demonstrated its ability to significantly enhance the accuracy and efficiency of debris detection, thus offering a new technological avenue for water quality monitoring in rivers and lakes.
+
+
+
+
+ + ♻ ☆ Unified Human-Scene Interaction via Prompted Chain-of-Contacts + + +
+ Human-Scene Interaction (HSI) is a vital component of fields like embodied AI +and virtual reality. Despite advancements in motion quality and physical +plausibility, two pivotal factors, versatile interaction control and the +development of a user-friendly interface, require further exploration before +the practical application of HSI. This paper presents a unified HSI framework, +UniHSI, which supports unified control of diverse interactions through language +commands. This framework is built upon the definition of interaction as Chain +of Contacts (CoC): steps of human joint-object part pairs, which is inspired by +the strong correlation between interaction types and human-object contact +regions. Based on the definition, UniHSI constitutes a Large Language Model +(LLM) Planner to translate language prompts into task plans in the form of CoC, +and a Unified Controller that turns CoC into uniform task execution. To +facilitate training and evaluation, we collect a new dataset named ScenePlan +that encompasses thousands of task plans generated by LLMs based on diverse +scenarios. Comprehensive experiments demonstrate the effectiveness of our +framework in versatile task execution and generalizability to real scanned +scenes. The project page is at https://github.com/OpenRobotLab/UniHSI . + +
+
+ comment: A unified Human-Scene Interaction framework that supports versatile interactions through language commands. Project URL: https://xizaoqu.github.io/unihsi/. Code: https://github.com/OpenRobotLab/UniHSI
+
+
+
+
+ + ♻ ☆ Lite-Mind: Towards Efficient and Robust Brain Representation Network + + +
+ The limited data availability and the low signal-to-noise ratio of fMRI signals make fMRI-to-image retrieval a challenging task. The state-of-the-art MindEye remarkably improves fMRI-to-image retrieval performance by leveraging a large model, i.e., a 996M-parameter MLP backbone per subject, to align fMRI embeddings to the final hidden layer of CLIP's Vision Transformer (ViT). However, significant individual variations exist among subjects, even under identical experimental setups, mandating the training of large subject-specific models. The substantial parameter counts pose significant challenges for deploying fMRI decoding on practical devices. To this end, we propose Lite-Mind, a lightweight, efficient, and robust brain representation learning paradigm based on the Discrete Fourier Transform (DFT), which efficiently aligns fMRI voxels to fine-grained information of CLIP. We elaborately design a DFT backbone with Spectrum Compression and Frequency Projector modules to learn informative and robust voxel embeddings. Our experiments demonstrate that Lite-Mind achieves an impressive 94.6% fMRI-to-image retrieval accuracy on the NSD dataset for Subject 1, with 98.7% fewer parameters than MindEye. Lite-Mind also migrates readily to smaller fMRI datasets and establishes a new state of the art for zero-shot classification on the GOD dataset.
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Watermark-embedded Adversarial Examples for Copyright Protection against + Diffusion Models + + +
+ Diffusion Models (DMs) have shown remarkable capabilities in various +image-generation tasks. However, there are growing concerns that DMs could be +used to imitate unauthorized creations and thus raise copyright issues. To +address this issue, we propose a novel framework that embeds personal +watermarks in the generation of adversarial examples. Such examples can force +DMs to generate images with visible watermarks and prevent DMs from imitating +unauthorized images. We construct a generator based on conditional adversarial +networks and design three losses (adversarial loss, GAN loss, and perturbation +loss) to generate adversarial examples that have subtle perturbation but can +effectively attack DMs to prevent copyright violations. Training a generator +for a personal watermark by our method only requires 5-10 samples within 2-3 +minutes, and once the generator is trained, it can generate adversarial +examples with that watermark significantly fast (0.2s per image). We conduct +extensive experiments in various conditional image-generation scenarios. +Compared to existing methods that generate images with chaotic textures, our +method adds visible watermarks on the generated images, which is a more +straightforward way to indicate copyright violations. We also observe that our +adversarial examples exhibit good transferability across unknown generative +models. Therefore, this work provides a simple yet powerful way to protect +copyright from DM-based imitation. + +
+
+ comment: updated references +
+
+
+
+
+ + ♻ ☆ EVI-SAM: Robust, Real-time, Tightly-coupled Event-Visual-Inertial State + Estimation and 3D Dense Mapping + + +
+ Event cameras are bio-inspired, motion-activated sensors that demonstrate substantial potential in handling challenging situations, such as motion blur and high dynamic range. In this paper, we propose EVI-SAM to tackle the problem of 6-DoF pose tracking and 3D reconstruction using a monocular event camera. A novel event-based hybrid tracking framework is designed to estimate the pose, leveraging the robustness of feature matching and the precision of direct alignment. Specifically, we develop an event-based 2D-2D alignment to construct the photometric constraint and tightly integrate it with the event-based reprojection constraint. The mapping module recovers the dense and colorful depth of the scene through an image-guided event-based mapping method. Subsequently, the appearance, texture, and surface mesh of the 3D scene can be reconstructed by fusing the dense depth maps from multiple viewpoints using truncated signed distance function (TSDF) fusion. To the best of our knowledge, this is the first non-learning work to realize event-based dense mapping. Numerical evaluations are performed on both publicly available and self-collected datasets, which qualitatively and quantitatively demonstrate the superior performance of our method. Our EVI-SAM effectively balances accuracy and robustness while maintaining computational efficiency, showcasing superior pose tracking and dense mapping performance in challenging scenarios. Video Demo: https://youtu.be/Nn40U4e5Si8.
+
+
+
+
+ + ♻ ☆ Region-Based Representations Revisited CVPR 2024 + + +
+ We investigate whether region-based representations are effective for +recognition. Regions were once a mainstay in recognition approaches, but pixel +and patch-based features are now used almost exclusively. We show that recent +class-agnostic segmenters like SAM can be effectively combined with strong +unsupervised representations like DINOv2 and used for a wide variety of tasks, +including semantic segmentation, object-based image retrieval, and multi-image +analysis. Once the masks and features are extracted, these representations, +even with linear decoders, enable competitive performance, making them well +suited to applications that require custom queries. The compactness of the +representation also makes it well-suited to video analysis and other problems +requiring inference across many images. + +
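The core recipe this abstract describes, pooling a strong dense feature map (e.g., DINOv2) inside class-agnostic masks (e.g., from SAM) to obtain one descriptor per region, is simple to sketch. The snippet below is a hedged illustration of such mask-average pooling; the tensor shapes and the choice of plain averaging are assumptions, not the paper's exact pipeline.

```python
import torch

def region_features(feature_map, masks):
    """Average-pool a dense feature map inside each class-agnostic mask.

    feature_map: (D, H, W) dense features resampled to a common resolution
    masks:       (R, H, W) boolean region masks, e.g. produced by SAM
    returns:     (R, D) one descriptor per region, ready for a linear decoder
    """
    masks = masks.float()
    # sum the features inside each mask, then normalize by the mask area
    pooled = torch.einsum("rhw,dhw->rd", masks, feature_map)
    area = masks.sum(dim=(1, 2)).clamp(min=1.0).unsqueeze(1)
    return pooled / area

# toy usage: 4 regions over a 16x16 feature map with 32 channels
feats = region_features(torch.randn(32, 16, 16), torch.rand(4, 16, 16) > 0.5)
print(feats.shape)  # torch.Size([4, 32])
```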
+
+ comment: CVPR 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ High-Degrees-of-Freedom Dynamic Neural Fields for Robot Self-Modeling + and Motion Planning ICRA + + +
+ A robot self-model is a task-agnostic representation of the robot's physical +morphology that can be used for motion planning tasks in the absence of a +classical geometric kinematic model. In particular, when the latter is hard to +engineer or the robot's kinematics change unexpectedly, human-free +self-modeling is a necessary feature of truly autonomous agents. In this work, +we leverage neural fields to allow a robot to self-model its kinematics as a +neural-implicit query model learned only from 2D images annotated with camera +poses and configurations. This enables significantly greater applicability than +existing approaches which have been dependent on depth images or geometry +knowledge. To this end, alongside a curricular data sampling strategy, we +propose a new encoder-based neural density field architecture for dynamic +object-centric scenes conditioned on high numbers of degrees of freedom (DOFs). +In a 7-DOF robot test setup, the learned self-model achieves a Chamfer-L2 +distance of 2% of the robot's workspace dimension. We demonstrate the +capabilities of this model on motion planning tasks as an exemplary downstream +application. + +
+
+ comment: International Conference on Robotics and Automation (ICRA) 2024; ICCV + 2023 Workshop on Neural Fields for Autonomous Driving and Robotics (oral) +
+
+
+
+
+ + ♻ ☆ Flatten Long-Range Loss Landscapes for Cross-Domain Few-Shot Learning + + +
+ Cross-domain few-shot learning (CDFSL) aims to acquire knowledge from limited +training data in the target domain by leveraging prior knowledge transferred +from source domains with abundant training samples. CDFSL faces challenges in +transferring knowledge across dissimilar domains and fine-tuning models with +limited training data. To address these challenges, we initially extend the +analysis of loss landscapes from the parameter space to the representation +space, which allows us to simultaneously interpret the transferring and +fine-tuning difficulties of CDFSL models. We observe that sharp minima in the +loss landscapes of the representation space result in representations that are +hard to transfer and fine-tune. Moreover, existing flatness-based methods have +limited generalization ability due to their short-range flatness. To enhance +the transferability and facilitate fine-tuning, we introduce a simple yet +effective approach to achieve long-range flattening of the minima in the loss +landscape. This approach considers representations that are differently +normalized as minima in the loss landscape and flattens the high-loss region in +the middle by randomly sampling interpolated representations. We implement this +method as a new normalization layer that replaces the original one in both CNNs +and ViTs. This layer is simple and lightweight, introducing only a minimal +number of additional parameters. Experimental results on 8 datasets demonstrate +that our approach outperforms state-of-the-art methods in terms of average +accuracy. Moreover, our method achieves performance improvements of up to 9\% +compared to the current best approaches on individual datasets. Our code will +be released. + +
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing methods require training models separately for each dataset, which leads to poor domain generalization. Moreover, these methods rely heavily on large amounts of high-quality pair-labelled data for training, which is expensive and impractical. In this paper, we propose ChangeCLIP, a multimodal contrastive learning framework based on visual-language pre-training for change detection domain generalization. Additionally, we propose a dynamic context optimization for prompt learning. Meanwhile, to address the data dependency issue of existing methods, we introduce a single-temporal and controllable AI-generated training strategy (SAIN). This allows us to train the model using a large number of single-temporal images without real-world image pairs, achieving excellent generalization. Extensive experiments on a series of real change detection datasets validate the superiority and strong generalization of ChangeCLIP, outperforming state-of-the-art change detection methods. Code will be available.
+
+
+
+
+ + ♻ ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications of diffusion models has emerged. In terms of compressing the Stable Diffusion models (SDMs), several approaches have been proposed, most of which leverage handcrafted layer removal to obtain smaller U-Nets, along with knowledge distillation to recover the network performance. However, such handcrafted layer removal is inefficient and lacks scalability and generalization, and the feature distillation employed in the retraining phase faces an imbalance issue in which a few numerically large feature loss terms dominate the others throughout the retraining process. To this end, we propose LAPTOP-Diff, layer pruning and normalized distillation for compressing diffusion models. We 1) introduce a layer pruning method to compress the SDM's U-Net automatically and propose an effective one-shot pruning criterion whose one-shot performance is guaranteed by its good additivity property, surpassing other layer pruning and handcrafted layer removal methods, and 2) propose normalized feature distillation for retraining, which alleviates the imbalance issue. Using the proposed LAPTOP-Diff, we compressed the U-Nets of SDXL and SDM-v1.5 for the most advanced performance, achieving a minimal 4.0% decline in PickScore at a pruning ratio of 50%, while the comparative methods' minimal PickScore decline is 8.2%. We will release our code.
+
+
+
+
+ + ♻ ☆ Beyond Average: Individualized Visual Scanpath Prediction CVPR2024 + + +
+ Understanding how attention varies across individuals has significant +scientific and societal impacts. However, existing visual scanpath models treat +attention uniformly, neglecting individual differences. To bridge this gap, +this paper focuses on individualized scanpath prediction (ISP), a new attention +modeling task that aims to accurately predict how different individuals shift +their attention in diverse visual tasks. It proposes an ISP method featuring +three novel technical components: (1) an observer encoder to characterize and +integrate an observer's unique attention traits, (2) an observer-centric +feature integration approach that holistically combines visual features, task +guidance, and observer-specific characteristics, and (3) an adaptive fixation +prioritization mechanism that refines scanpath predictions by dynamically +prioritizing semantic feature maps based on individual observers' attention +traits. These novel components allow scanpath models to effectively address the +attention variations across different observers. Our method is generally +applicable to different datasets, model architectures, and visual tasks, +offering a comprehensive tool for transforming general scanpath models into +individualized ones. Comprehensive evaluations using value-based and +ranking-based metrics verify the method's effectiveness and generalizability. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ♻ ☆ The Devil is in the Few Shots: Iterative Visual Knowledge Completion for + Few-shot Learning + + +
+ Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot learning performance. Few-shot learning aims to further enhance the transfer capability of CLIP by giving a few images in each class, aka 'few shots'. Most existing methods either implicitly learn from the few shots by incorporating learnable prompts or adapters, or explicitly embed them in a cache model for inference. However, the narrow distribution of few shots often contains incomplete class information, leading to biased visual knowledge with a high risk of misclassification. To tackle this problem, recent methods propose to supplement visual knowledge with generative models or extra databases, which can be costly and time-consuming. In this paper, we propose an Iterative Visual Knowledge CompLetion (KCL) method to complete visual knowledge by properly taking advantage of unlabeled samples without access to any auxiliary or synthetic data. Specifically, KCL first measures the similarities between unlabeled samples and each category. Then, the samples with the highest confidence for each category are selected using a designed confidence criterion. Finally, the collected samples are treated as labeled ones and added to the few shots to jointly re-estimate the remaining unlabeled ones. The above procedure is repeated for a number of iterations, with more and more samples being collected until convergence, ensuring a progressive and robust knowledge completion process. Extensive experiments on 11 benchmark datasets demonstrate the effectiveness and efficiency of KCL as a plug-and-play module under both few-shot and zero-shot learning settings. Code is available at https://github.com/Mark-Sky/KCL.
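The iterative loop described above, measure similarities, keep the most confident samples per class, add them to the cache, and re-estimate, can be sketched compactly. The code below is a simplified illustration using cosine-similarity prototypes and a fixed per-class quota; the paper's actual confidence criterion and feature pipeline are more elaborate, so every name and threshold here is an assumption. It also assumes every class appears in the initial few-shot cache.

```python
import numpy as np

def kcl_rounds(image_feats, few_shot_feats, few_shot_labels, num_classes,
               rounds=5, per_class=4):
    """Simplified sketch of an iterative knowledge-completion loop.

    image_feats:     (N, D) L2-normalized features of the unlabeled pool
    few_shot_feats:  (K, D) features of the initial few shots
    few_shot_labels: (K,)   labels of the few shots (every class must appear)
    """
    feats, labels = few_shot_feats.copy(), few_shot_labels.copy()
    unlabeled = np.arange(len(image_feats))
    for _ in range(rounds):
        # class prototypes from the current (pseudo-)labeled cache
        protos = np.stack([feats[labels == c].mean(0) for c in range(num_classes)])
        protos /= np.linalg.norm(protos, axis=1, keepdims=True)
        sims = image_feats[unlabeled] @ protos.T            # (U, C) similarities
        preds, conf = sims.argmax(1), sims.max(1)
        picked = []
        for c in range(num_classes):                        # top-confidence samples per class
            idx = np.where(preds == c)[0]
            picked.extend(idx[np.argsort(-conf[idx])[:per_class]])
        if not picked:
            break
        chosen = unlabeled[picked]
        feats = np.concatenate([feats, image_feats[chosen]])
        labels = np.concatenate([labels, preds[picked]])
        unlabeled = np.setdiff1d(unlabeled, chosen)
        if unlabeled.size == 0:
            break
    return feats, labels
```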
+
+
+
+
+ + ♻ ☆ Channel Vision Transformers: An Image Is Worth 1 x 16 x 16 Words + + +
+ Vision Transformer (ViT) has emerged as a powerful architecture in the realm +of modern computer vision. However, its application in certain imaging fields, +such as microscopy and satellite imaging, presents unique challenges. In these +domains, images often contain multiple channels, each carrying semantically +distinct and independent information. Furthermore, the model must demonstrate +robustness to sparsity in input channels, as they may not be densely available +during training or testing. In this paper, we propose a modification to the ViT +architecture that enhances reasoning across the input channels and introduce +Hierarchical Channel Sampling (HCS) as an additional regularization technique +to ensure robustness when only partial channels are presented during test time. +Our proposed model, ChannelViT, constructs patch tokens independently from each +input channel and utilizes a learnable channel embedding that is added to the +patch tokens, similar to positional embeddings. We evaluate the performance of +ChannelViT on ImageNet, JUMP-CP (microscopy cell imaging), and So2Sat +(satellite imaging). Our results show that ChannelViT outperforms ViT on +classification tasks and generalizes well, even when a subset of input channels +is used during testing. Across our experiments, HCS proves to be a powerful +regularizer, independent of the architecture employed, suggesting itself as a +straightforward technique for robust ViT training. Lastly, we find that +ChannelViT generalizes effectively even when there is limited access to all +channels during training, highlighting its potential for multi-channel imaging +under real-world conditions with sparse sensors. Our code is available at +https://github.com/insitro/ChannelViT. + +
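The key architectural change in ChannelViT, building patch tokens independently for each input channel and adding a learnable channel embedding on top of the positional embedding, can be illustrated with a small patch-embedding module. The sketch below uses illustrative sizes and a single shared one-channel projection; it approximates the idea and is not the official implementation (which is linked in the abstract above).

```python
import torch
import torch.nn as nn

class ChannelPatchEmbed(nn.Module):
    """Per-channel patch tokens plus a learnable channel embedding (a sketch)."""

    def __init__(self, img_size=224, patch=16, max_channels=12, dim=384):
        super().__init__()
        self.proj = nn.Conv2d(1, dim, kernel_size=patch, stride=patch)  # shared across channels
        self.channel_embed = nn.Parameter(torch.zeros(max_channels, dim))
        num_patches = (img_size // patch) ** 2
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, dim))

    def forward(self, x, channel_ids):
        # x: (B, C, H, W); channel_ids: (C,) indices of the channels actually present
        B, C, _, _ = x.shape
        tokens = self.proj(x.reshape(B * C, 1, *x.shape[-2:]))          # (B*C, dim, h, w)
        tokens = tokens.flatten(2).transpose(1, 2)                      # (B*C, P, dim)
        tokens = tokens.reshape(B, C, -1, tokens.shape[-1])             # (B, C, P, dim)
        tokens = tokens + self.pos_embed.unsqueeze(1)                   # positional embedding
        tokens = tokens + self.channel_embed[channel_ids][None, :, None, :]
        return tokens.reshape(B, C * tokens.shape[2], -1)               # sequence of C*P tokens

embed = ChannelPatchEmbed()
out = embed(torch.randn(2, 3, 224, 224), torch.tensor([0, 1, 2]))
print(out.shape)  # torch.Size([2, 588, 384])
```

Because each channel contributes its own tokens, dropping channels at test time simply shortens the token sequence, which is what makes hierarchical channel sampling straightforward to apply.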
+
+
+
+
+ + ♻ ☆ Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using + hybrid CNN-Transformer Architecture + + +
+ Medical imaging has been used for the diagnosis of various conditions, making it one of the most powerful resources for effective patient care. Due to widespread availability, low cost, and low radiation, chest X-ray is one of the most sought-after radiology examinations for the diagnosis of various thoracic diseases. Due to advancements in medical imaging technologies and increasing patient load, the current radiology workflow faces various challenges, including increasing backlogs, long working hours, and a rise in diagnostic errors. An automated computer-aided diagnosis system that can interpret chest X-rays to augment radiologists by providing actionable insights has the potential to provide a second opinion to radiologists and highlight relevant regions in the image, in turn expediting the clinical workflow, reducing diagnostic errors, and improving patient care. In this study, we applied a novel architecture augmenting the DenseNet121 Convolutional Neural Network (CNN) with a multi-head self-attention mechanism using a transformer, namely SA-DenseNet121, that can identify multiple thoracic diseases in chest X-rays. We conducted experiments on four of the largest chest X-ray datasets, namely ChestX-ray14, CheXpert, MIMIC-CXR-JPG, and IU-CXR. Experimental results in terms of the area under the receiver operating characteristic curve (AUC-ROC) show that augmenting CNNs with self-attention has potential in diagnosing different thoracic diseases from chest X-rays. The proposed methodology has the potential to support the reading workflow, improve efficiency, and reduce diagnostic errors.
+
+ comment: 24 pages, 13 Figures, 13 Tables. This article heavily draws from + arXiv:1904.09925 where authors originally proposed attention-augmented + convolutional network. arXiv admin note: text overlap with arXiv:1904.09925 + by other authors +
+
+
+
+
+ + ♻ ☆ Revealing the structure-property relationships of copper alloys with + FAGC + + +
+ Understanding how the structure of materials affects their properties is a +cornerstone of materials science and engineering. However, traditional methods +have struggled to accurately describe the quantitative structure-property +relationships for complex structures. In our study, we bridge this gap by +leveraging machine learning to analyze images of materials' microstructures, +thus offering a novel way to understand and predict the properties of materials +based on their microstructures. We introduce a method known as FAGC (Feature +Augmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr +alloys. This approach utilizes machine learning to examine the shapes within +images of the alloys' microstructures and predict their mechanical and +electronic properties. This generative FAGC approach can effectively expand the +relatively small training datasets due to the limited availability of materials +images labeled with quantitative properties. The process begins with extracting +features from the images using neural networks. These features are then mapped +onto the Pre-shape space to construct the Geodesic curves. Along these curves, +new features are generated, effectively increasing the dataset. Moreover, we +design a pseudo-labeling mechanism for these newly generated features to +further enhance the training dataset. Our FAGC method has shown remarkable +results, significantly improving the accuracy of predicting the electronic +conductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978 +and 0.998, respectively. These outcomes underscore the potential of FAGC to +address the challenge of limited image data in materials science, providing a +powerful tool for establishing detailed and quantitative relationships between +complex microstructures and material properties. + +
+
+
+
+
+ + ♻ ☆ GazeHTA: End-to-end Gaze Target Detection with Head-Target Association + + +
+ We propose an end-to-end approach for gaze target detection: predicting a +head-target connection between individuals and the target image regions they +are looking at. Most of the existing methods use independent components such as +off-the-shelf head detectors or have problems in establishing associations +between heads and gaze targets. In contrast, we investigate an end-to-end +multi-person Gaze target detection framework with Heads and Targets Association +(GazeHTA), which predicts multiple head-target instances based solely on input +scene image. GazeHTA addresses challenges in gaze target detection by (1) +leveraging a pre-trained diffusion model to extract scene features for rich +semantic understanding, (2) re-injecting a head feature to enhance the head +priors for improved head understanding, and (3) learning a connection map as +the explicit visual associations between heads and gaze targets. Our extensive +experimental results demonstrate that GazeHTA outperforms state-of-the-art gaze +target detection methods and two adapted diffusion-based baselines on two +standard datasets. + +
+
+
+
+
+ + ♻ ☆ Unified Negative Pair Generation toward Well-discriminative Feature + Space for Face Recognition BMVC22 + + +
+ The goal of face recognition (FR) can be viewed as a pair similarity +optimization problem, maximizing a similarity set $\mathcal{S}^p$ over positive +pairs, while minimizing similarity set $\mathcal{S}^n$ over negative pairs. +Ideally, it is expected that FR models form a well-discriminative feature space +(WDFS) that satisfies $\inf{\mathcal{S}^p} > \sup{\mathcal{S}^n}$. With regard +to WDFS, the existing deep feature learning paradigms (i.e., metric and +classification losses) can be expressed as a unified perspective on different +pair generation (PG) strategies. Unfortunately, in the metric loss (ML), it is +infeasible to generate negative pairs taking all classes into account in each +iteration because of the limited mini-batch size. In contrast, in +classification loss (CL), it is difficult to generate extremely hard negative +pairs owing to the convergence of the class weight vectors to their center. +This leads to a mismatch between the two similarity distributions of the +sampled pairs and all negative pairs. Thus, this paper proposes a unified +negative pair generation (UNPG) by combining two PG strategies (i.e., MLPG and +CLPG) from a unified perspective to alleviate the mismatch. UNPG introduces +useful information about negative pairs using MLPG to overcome the CLPG +deficiency. Moreover, it includes filtering the similarities of noisy negative +pairs to guarantee reliable convergence and improved performance. Exhaustive +experiments show the superiority of UNPG by achieving state-of-the-art +performance across recent loss functions on public benchmark datasets. Our code +and pretrained models are publicly available. + +
+
+ comment: 9 pages, 6 figures, Published at BMVC22 +
+
+
+
+
+ + ♻ ☆ LLM4SGG: Large Language Models for Weakly Supervised Scene Graph + Generation CVPR 2024 + + +
+ Weakly-Supervised Scene Graph Generation (WSSGG) research has recently +emerged as an alternative to the fully-supervised approach that heavily relies +on costly annotations. In this regard, studies on WSSGG have utilized image +captions to obtain unlocalized triplets while primarily focusing on grounding +the unlocalized triplets over image regions. However, they have overlooked the +two issues involved in the triplet formation process from the captions: 1) +Semantic over-simplification issue arises when extracting triplets from +captions, where fine-grained predicates in captions are undesirably converted +into coarse-grained predicates, resulting in a long-tailed predicate +distribution, and 2) Low-density scene graph issue arises when aligning the +triplets in the caption with entity/predicate classes of interest, where many +triplets are discarded and not used in training, leading to insufficient +supervision. To tackle the two issues, we propose a new approach, i.e., Large +Language Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two +issues by leveraging the LLM's in-depth understanding of language and reasoning +ability during the extraction of triplets from captions and alignment of +entity/predicate classes with target data. To further engage the LLM in these +processes, we adopt the idea of Chain-of-Thought and the in-context few-shot +learning strategy. To validate the effectiveness of LLM4SGG, we conduct +extensive experiments on Visual Genome and GQA datasets, showing significant +improvements in both Recall@K and mean Recall@K compared to the +state-of-the-art WSSGG methods. A further appeal is that LLM4SGG is +data-efficient, enabling effective model training with a small amount of +training images. + +
+
+ comment: 8 pages; CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PEEKABOO: Interactive Video Generation via Masked-Diffusion + + +
+ Modern video generation models like Sora have achieved remarkable success in +producing high-quality videos. However, a significant limitation is their +inability to offer interactive control to users, a feature that promises to +open up unprecedented applications and creativity. In this work, we introduce +the first solution to equip diffusion-based video generation models with +spatio-temporal control. We present Peekaboo, a novel masked attention module, +which seamlessly integrates with current video generation models offering +control without the need for additional training or inference overhead. To +facilitate future research, we also introduce a comprehensive benchmark for +interactive video generation. This benchmark offers a standardized framework +for the community to assess the efficacy of emerging interactive video +generation models. Our extensive qualitative and quantitative assessments +reveal that Peekaboo achieves up to a 3.8x improvement in mIoU over baseline +models, all while maintaining the same latency. Code and benchmark are +available on the webpage. + +
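The masked-attention idea, restricting attention to a user-specified spatial region by biasing the attention logits, can be sketched independently of any particular video diffusion backbone. The function below is a generic illustration, not Peekaboo's exact formulation; the tensor layout and the additive-bias masking are assumptions.

```python
import torch
import torch.nn.functional as F

def masked_attention(q, k, v, region_mask):
    """Attention restricted to a user-specified region (a generic sketch).

    q, k, v:     (B, N, D) queries/keys/values over N spatial tokens
    region_mask: (B, N) 1 for tokens inside the user's region, 0 outside
    """
    logits = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)          # (B, N, N)
    # push logits of keys outside the region to a very large negative value
    bias = (1.0 - region_mask)[:, None, :] * torch.finfo(logits.dtype).min
    attn = F.softmax(logits + bias, dim=-1)                          # out-of-region keys get ~0 weight
    return attn @ v

# toy usage: 2 samples, 8 tokens, 16-dim heads, first half of the tokens allowed
q = k = v = torch.randn(2, 8, 16)
mask = torch.cat([torch.ones(2, 4), torch.zeros(2, 4)], dim=1)
print(masked_attention(q, k, v, mask).shape)  # torch.Size([2, 8, 16])
```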
+
+ comment: Project webpage - https://jinga-lala.github.io/projects/Peekaboo/ +
+
+
+
+
+ + ♻ ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, +their black-box nature makes uncertainty quantification challenging. We +investigate the effects of presenting conformal prediction sets--a +distribution-free class of methods for generating prediction sets with +specified coverage--to express uncertainty in AI-advised decision-making. +Through a large online experiment, we compare the utility of conformal +prediction sets to displays of Top-1 and Top-k predictions for AI-advised image +labeling. In a pre-registered analysis, we find that the utility of prediction +sets for accuracy varies with the difficulty of the task: while they result in +accuracy on par with or less than Top-1 and Top-k displays for easy images, +prediction sets excel at assisting humans in labeling out-of-distribution (OOD) +images, especially when the set size is small. Our results empirically pinpoint +practical challenges of conformal prediction sets and provide implications on +how to incorporate them for real-world decision-making. + +
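For context, a split-conformal prediction set for classification is built by calibrating a nonconformity score on held-out data and then including every class whose score falls under the calibrated quantile. The snippet below shows this standard recipe; it is not necessarily the exact construction used in the study above.

```python
import numpy as np

def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    """Split-conformal prediction sets for classification (standard recipe).

    cal_probs:  (n, C) softmax outputs on a held-out calibration set
    cal_labels: (n,)   true labels of the calibration set
    test_probs: (m, C) softmax outputs on test images
    alpha:      target miscoverage, e.g. 0.1 for ~90% coverage
    """
    n = len(cal_labels)
    # nonconformity score: one minus the probability assigned to the true class
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]
    # finite-sample-corrected quantile of the calibration scores
    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
    q = np.quantile(scores, level, method="higher")
    # include every class whose score does not exceed the threshold
    return [np.where(1.0 - p <= q)[0] for p in test_probs]
```

Smaller sets signal higher model confidence, which matches the observation above that small sets are where human labelers benefit most.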
+
+ comment: 19 pages, 11 figures, 10 tables. Accepted by ACM CHI 2024 +
+
+
+
+
+ + ♻ ☆ SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic + Perception + + +
+ Recently, event-based vision sensors have gained attention for autonomous driving applications, as conventional RGB cameras face limitations in handling challenging dynamic conditions. However, the availability of real-world and synthetic event-based vision datasets remains limited. In response to this gap, we present SEVD, a first-of-its-kind multi-view ego and fixed perception synthetic event-based dataset recorded with multiple dynamic vision sensors within the CARLA simulator. Data sequences are recorded across diverse lighting (noon, nighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy) with domain shifts (discrete and continuous). SEVD spans urban, suburban, rural, and highway scenes featuring various classes of objects (car, truck, van, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes RGB imagery, depth maps, optical flow, and semantic and instance segmentation, facilitating a comprehensive understanding of the scene. Furthermore, we evaluate the dataset using state-of-the-art event-based (RED, RVT) and frame-based (YOLOv8) methods for traffic participant detection tasks and provide baseline benchmarks for assessment. Additionally, we conduct experiments to assess the synthetic event-based dataset's generalization capabilities. The dataset is available at https://eventbasedvision.github.io/SEVD
+
+
+
+
+ + ♻ ☆ PCNN: Probable-Class Nearest-Neighbor Explanations Improve Fine-Grained + Image Classification Accuracy for AIs and Humans + + +
+ Nearest neighbors (NN) are traditionally used to compute final decisions, e.g., in Support Vector Machines or k-NN classifiers, and to provide users with explanations for the model's decision. In this paper, we show a novel utility of nearest neighbors: to improve predictions of a frozen, pretrained classifier C. We leverage an image comparator S that (1) compares the input image with NN images from the top-K most probable classes and (2) uses S's output scores to weight the confidence scores of C. Our method consistently improves fine-grained image classification accuracy on CUB-200, Cars-196, and Dogs-120. Also, a human study finds that showing lay users our probable-class nearest neighbors (PCNN) improves their decision accuracy over prior work, which shows only the top-1 class examples.
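A minimal way to realize "use S's output scores to weight the confidence scores of C" is to multiply the classifier's top-K probabilities by the comparator's similarity scores and renormalize. The sketch below does exactly that; the product rule and the renormalization are assumptions about the combination, not the paper's verified formula.

```python
import torch

def pcnn_rerank(clf_probs, comparator_scores, k=5):
    """Re-weight a frozen classifier's confidences with an image comparator.

    clf_probs:         (B, C) softmax output of the frozen classifier C
    comparator_scores: (B, C) comparator similarity between the input and NN
                       images of each class; only the top-k classes matter
    """
    topk_prob, topk_idx = clf_probs.topk(k, dim=1)
    s = torch.gather(comparator_scores, 1, topk_idx)
    reweighted = topk_prob * s                       # weight confidence by comparator score
    new_probs = torch.zeros_like(clf_probs)
    new_probs.scatter_(1, topk_idx, reweighted)
    return new_probs / new_probs.sum(dim=1, keepdim=True).clamp(min=1e-12)

# toy usage: 2 images, 10 classes
probs = torch.softmax(torch.randn(2, 10), dim=1)
scores = torch.rand(2, 10)
print(pcnn_rerank(probs, scores).argmax(dim=1))
```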
+
+
+
+
+ + ♻ ☆ MatAtlas: Text-driven Consistent Geometry Texturing and Material + Assignment + + +
+ We present MatAtlas, a method for consistent text-guided 3D model texturing. Following recent progress, we leverage a large-scale text-to-image generation model (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully design an RGB texturing pipeline that leverages a grid-pattern diffusion driven by depth and edges. By proposing a multi-step texture refinement process, we significantly improve the quality and 3D consistency of the texturing output. To further address the problem of baked-in lighting, we move beyond RGB colors and pursue assigning parametric materials to the assets. Given the high-quality initial RGB texture, we propose a novel material retrieval method capitalizing on Large Language Models (LLMs), enabling editability and relightability. We evaluate our method on a wide variety of geometries and show that it significantly outperforms prior art. We also analyze the role of each component through a detailed ablation study.
+
+
+
+
+ + ♻ ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) generates new images by combining the style of one image with the content of another. However, unauthorized NST can exploit artwork, raising concerns about artists' rights and motivating the development of proactive protection methods. We propose the Locally Adaptive Adversarial Color Attack (LAACA), empowering artists to protect their artwork from unauthorized style transfer by processing it before public release. By delving into the intricacies of human visual perception and the role of different frequency components, our method strategically introduces frequency-adaptive perturbations into the image. These perturbations significantly degrade the generation quality of NST while maintaining an acceptable level of visual change in the original image, so that potential infringers are discouraged from using the protected artworks by the poor quality of the resulting style transfer. Additionally, existing metrics often overlook the importance of color fidelity when evaluating color-centered tasks, such as the quality of NST-generated images, which is crucial in the context of artistic works. To comprehensively assess such color-centered tasks, we propose the Adversarial Color Distance Metric (ACDM), designed to quantify the color difference of images pre- and post-manipulation. Experimental results confirm that attacking NST using LAACA results in visually inferior style transfer, and that ACDM can efficiently measure color-centered tasks. By providing artists with a tool to safeguard their intellectual property, our work helps address the socio-technical challenges posed by the misuse of NST in the art community.
+
+ comment: 9 pages, 5 figures, 4 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 165 + +
+
+
+ + ☆ On the Content Bias in Fréchet Video Distance CVPR 2024 + + +
+ Fréchet Video Distance (FVD), a prominent metric for evaluating video generation models, is known to occasionally conflict with human perception. In this paper, we aim to explore the extent of FVD's bias toward per-frame quality over temporal realism and identify its sources. We first quantify FVD's sensitivity to the temporal axis by decoupling frame and motion quality and find that FVD increases only slightly under large temporal corruption. We then analyze the generated videos and show that, by carefully sampling from a large set of generated videos that contain no motion, one can drastically decrease FVD without improving the temporal quality. Both studies suggest FVD's bias towards the quality of individual frames. We further observe that the bias can be attributed to the features extracted from a supervised video classifier trained on a content-biased dataset. We show that FVD with features extracted from recent large-scale self-supervised video models is less biased toward image quality. Finally, we revisit a few real-world examples to validate our hypothesis.
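As background for the discussion above, FVD is an instance of the Fréchet distance between two Gaussians fitted to feature statistics of real and generated videos; the paper's point is that the choice of feature extractor, not the formula, drives the content bias. A generic implementation of the distance itself looks like the following (feature extraction is assumed to have happened upstream).

```python
import numpy as np
from scipy import linalg

def frechet_distance(feats_real, feats_fake):
    """Frechet distance between Gaussians fitted to two sets of video features.

    feats_real, feats_fake: (N, D) feature matrices from some video feature
    extractor (I3D for classic FVD, or a self-supervised video model).
    """
    mu_r, mu_f = feats_real.mean(0), feats_fake.mean(0)
    cov_r = np.cov(feats_real, rowvar=False)
    cov_f = np.cov(feats_fake, rowvar=False)
    covmean = linalg.sqrtm(cov_r @ cov_f)
    if np.iscomplexobj(covmean):          # numerical noise can add tiny imaginary parts
        covmean = covmean.real
    diff = mu_r - mu_f
    return float(diff @ diff + np.trace(cov_r + cov_f - 2.0 * covmean))

# toy usage with random features (real data would come from a feature extractor)
print(frechet_distance(np.random.randn(256, 64), np.random.randn(256, 64)))
```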
+
+ comment: CVPR 2024. Project webpage: https://content-debiased-fvd.github.io/ +
+
+
+
+
+ + ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs) that focuses on core visual perception abilities not found in other evaluations.
+Most of the Blink tasks can be solved by humans "within a blink" (e.g., relative depth estimation, visual correspondence, forensics detection, and multi-view reasoning).
+However, we find these perception-demanding tasks pose significant challenges for current multimodal LLMs because they resist mediation through natural language.
+Blink reformats 14 classic computer vision tasks into 3,807 multiple-choice questions, paired with single or multiple images and visual prompting.
+While humans get 95.70% accuracy on average, Blink is surprisingly challenging for existing multimodal LLMs: even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not "emerged" yet in recent multimodal LLMs.
+Our analysis also highlights that specialist CV models could solve these problems much better, suggesting potential pathways for future improvements.
+We believe Blink will stimulate the community to help multimodal LLMs catch up with human-level visual perception.
+
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/ +
+
+
+
+
+ + ☆ VideoGigaGAN: Towards Detail-rich Video Super-Resolution + + +
+ Video super-resolution (VSR) approaches have shown impressive temporal +consistency in upsampled videos. However, these approaches tend to generate +blurrier results than their image counterparts as they are limited in their +generative capability. This raises a fundamental question: can we extend the +success of a generative image upsampler to the VSR task while preserving the +temporal consistency? We introduce VideoGigaGAN, a new generative VSR model +that can produce videos with high-frequency details and temporal consistency. +VideoGigaGAN builds upon a large-scale image upsampler -- GigaGAN. Simply +inflating GigaGAN to a video model by adding temporal modules produces severe +temporal flickering. We identify several key issues and propose techniques that +significantly improve the temporal consistency of upsampled videos. Our +experiments show that, unlike previous VSR methods, VideoGigaGAN generates +temporally consistent videos with more fine-grained appearance details. We +validate the effectiveness of VideoGigaGAN by comparing it with +state-of-the-art VSR models on public datasets and showcasing video results +with $8\times$ super-resolution. + +
+
+ comment: project page: https://videogigagan.github.io/ +
+
+
+
+
+ + ☆ Moving Object Segmentation: All You Need Is SAM (and Flow) + + +
+ The objective of this paper is motion segmentation -- discovering and segmenting the moving objects in a video.
+This is a much-studied area with numerous careful, and sometimes complex, approaches and training schemes including: self-supervised learning, learning from synthetic datasets, object-centric representations, amodal representations, and many more.
+Our interest in this paper is to determine if the Segment Anything model (SAM) can contribute to this task.
+We investigate two models for combining SAM with optical flow that harness the segmentation power of SAM with the ability of flow to discover and group moving objects.
+In the first model, we adapt SAM to take optical flow, rather than RGB, as an input.
+In the second, SAM takes RGB as an input, and flow is used as a segmentation prompt.
+These surprisingly simple methods, without any further modifications, outperform all previous approaches by a considerable margin in both single and multi-object benchmarks.
+We also extend these frame-level segmentations to sequence-level segmentations that maintain object identity.
+Again, this simple model outperforms previous methods on multiple video object segmentation benchmarks.
+
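+ As a rough illustration of the second variant (flow used only as a prompt), the sketch below uses the public segment_anything API; the peak-flow-magnitude point heuristic and the checkpoint path are assumptions for illustration, not the authors' exact recipe:
+
+```python
+import numpy as np
+from segment_anything import SamPredictor, sam_model_registry
+
+def segment_moving_object(rgb: np.ndarray, flow: np.ndarray, checkpoint: str):
+    """rgb: HxWx3 uint8 image, flow: HxWx2 optical flow field."""
+    sam = sam_model_registry["vit_h"](checkpoint=checkpoint)  # checkpoint path assumed
+    predictor = SamPredictor(sam)
+    predictor.set_image(rgb)
+
+    # Simple heuristic: prompt SAM at the pixel with the largest flow magnitude.
+    magnitude = np.linalg.norm(flow, axis=-1)
+    y, x = np.unravel_index(np.argmax(magnitude), magnitude.shape)
+    masks, scores, _ = predictor.predict(
+        point_coords=np.array([[x, y]]),  # SAM expects (x, y) order
+        point_labels=np.array([1]),       # 1 = foreground point
+        multimask_output=True,
+    )
+    return masks[np.argmax(scores)]
+```
+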
+
+ comment: Project Page: https://www.robots.ox.ac.uk/~vgg/research/flowsam/ +
+
+
+
+
+ + ☆ Reka Core, Flash, and Edge: A Series of Powerful Multimodal Language + Models + + +
+ We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka.
+Reka models are able to process and reason with text, images, video, and audio inputs.
+This technical report discusses details of training some of these models and provides comprehensive evaluation results.
+We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized value for their respective compute classes.
+Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations.
+On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively with GPT4-V.
+Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus.
+On text benchmarks, Core not only performs competitively with other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation.
+On video question answering (Perception-Test), Core outperforms Gemini Ultra.
+Models are shipped in production at http://chat.reka.ai .
+A showcase of non-cherry-picked qualitative examples can also be found at http://showcase.reka.ai .
+
+
+
+
+
+ + ☆ SOHES: Self-supervised Open-world Hierarchical Entity Segmentation ICLR 2024 + + +
+ Open-world entity segmentation, as an emerging computer vision task, aims at +segmenting entities in images without being restricted by pre-defined classes, +offering impressive generalization capabilities on unseen images and concepts. +Despite its promise, existing entity segmentation methods like Segment Anything +Model (SAM) rely heavily on costly expert annotators. This work presents +Self-supervised Open-world Hierarchical Entity Segmentation (SOHES), a novel +approach that eliminates the need for human annotations. SOHES operates in +three phases: self-exploration, self-instruction, and self-correction. Given a +pre-trained self-supervised representation, we produce abundant high-quality +pseudo-labels through visual feature clustering. Then, we train a segmentation +model on the pseudo-labels, and rectify the noises in pseudo-labels via a +teacher-student mutual-learning procedure. Beyond segmenting entities, SOHES +also captures their constituent parts, providing a hierarchical understanding +of visual entities. Using raw images as the sole training data, our method +achieves unprecedented performance in self-supervised open-world segmentation, +marking a significant milestone towards high-quality open-world entity +segmentation in the absence of human-annotated masks. Project page: +https://SOHES.github.io. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ MeshLRM: Large Reconstruction Model for High-Quality Mesh + + +
+ We propose MeshLRM, a novel LRM-based approach that can reconstruct a +high-quality mesh from merely four input images in less than one second. +Different from previous large reconstruction models (LRMs) that focus on +NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction +and rendering within the LRM framework. This allows for end-to-end mesh +reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. +Moreover, we improve the LRM architecture by simplifying several complex +designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained +with low- and high-resolution images; this new LRM training strategy enables +significantly faster convergence and thereby leads to better quality with less +compute. Our approach achieves state-of-the-art mesh reconstruction from +sparse-view inputs and also allows for many downstream applications, including +text-to-3D and single-image-to-3D generation. Project page: +https://sarahweiii.github.io/meshlrm/ + +
+
+
+
+
+ + ☆ G-HOP: Generative Hand-Object Prior for Interaction Reconstruction and + Grasp Synthesis CVPR2024 + + +
+ We propose G-HOP, a denoising diffusion based generative prior for +hand-object interactions that allows modeling both the 3D object and a human +hand, conditioned on the object category. To learn a 3D spatial diffusion model +that can capture this joint distribution, we represent the human hand via a +skeletal distance field to obtain a representation aligned with the (latent) +signed distance field for the object. We show that this hand-object prior can +then serve as generic guidance to facilitate other tasks like reconstruction +from interaction clip and human grasp synthesis. We believe that our model, +trained by aggregating seven diverse real-world interaction datasets spanning +across 155 categories, represents a first approach that allows jointly +generating both hand and object. Our empirical evaluations demonstrate the +benefit of this joint prior in video-based reconstruction and human grasp +synthesis, outperforming current task-specific baselines. + Project website: https://judyye.github.io/ghop-www + +
+
+ comment: accepted to CVPR2024; project page at + https://judyye.github.io/ghop-www +
+
+
+
+
+ + ☆ Lazy Diffusion Transformer for Interactive Image Editing + + +
+ We introduce a novel diffusion transformer, LazyDiffusion, that generates +partial image updates efficiently. Our approach targets interactive image +editing applications in which, starting from a blank canvas or an image, a user +specifies a sequence of localized image modifications using binary masks and +text prompts. Our generator operates in two phases. First, a context encoder +processes the current canvas and user mask to produce a compact global context +tailored to the region to generate. Second, conditioned on this context, a +diffusion-based transformer decoder synthesizes the masked pixels in a "lazy" +fashion, i.e., it only generates the masked region. This contrasts with +previous works that either regenerate the full canvas, wasting time and +computation, or confine processing to a tight rectangular crop around the mask, +ignoring the global image context altogether. Our decoder's runtime scales with +the mask size, which is typically small, while our encoder introduces +negligible overhead. We demonstrate that our approach is competitive with +state-of-the-art inpainting methods in terms of quality and fidelity while +providing a 10x speedup for typical user interactions, where the editing mask +represents 10% of the image. + +
+
+
+
+
+ + ☆ 6Img-to-3D: Few-Image Large-Scale Outdoor Driving Scene Reconstruction + + +
+ Current 3D reconstruction techniques struggle to infer unbounded scenes from +a few images faithfully. Specifically, existing methods have high computational +demands, require detailed pose information, and cannot reconstruct occluded +regions reliably. We introduce 6Img-to-3D, an efficient, scalable +transformer-based encoder-renderer method for single-shot image to 3D +reconstruction. Our method outputs a 3D-consistent parameterized triplane from +only six outward-facing input images for large-scale, unbounded outdoor driving +scenarios. We take a step towards resolving existing shortcomings by combining +contracted custom cross- and self-attention mechanisms for triplane +parameterization, differentiable volume rendering, scene contraction, and image +feature projection. We showcase that six surround-view vehicle images from a +single timestamp without global pose information are enough to reconstruct +360$^{\circ}$ scenes during inference time, taking 395 ms. Our method allows, +for example, rendering third-person images and birds-eye views. Our code is +available at https://github.com/continental/6Img-to-3D, and more examples can +be found at our website here https://6Img-to-3D.GitHub.io/. + +
+
+ comment: Joint first authorship. Project page: https://6Img-to-3D.GitHub.io/ + Code https://github.com/continental/6Img-to-3D +
+
+
+
+
+ + ☆ Dynamic Gaussians Mesh: Consistent Mesh Reconstruction from Monocular + Videos + + +
+ Modern 3D engines and graphics pipelines require mesh as a memory-efficient representation, which allows efficient rendering, geometry processing, texture editing, and many other downstream operations.
+However, it is still highly difficult to obtain a high-quality mesh in terms of structure and detail from monocular visual observations.
+The problem becomes even more challenging for dynamic scenes and objects.
+To this end, we introduce Dynamic Gaussians Mesh (DG-Mesh), a framework to reconstruct a high-fidelity and time-consistent mesh given a single monocular video.
+Our work leverages the recent advancement in 3D Gaussian Splatting to construct the mesh sequence with temporal consistency from a video.
+Building on top of this representation, DG-Mesh recovers high-quality meshes from the Gaussian points and can track the mesh vertices over time, which enables applications such as texture editing on dynamic objects.
+We introduce the Gaussian-Mesh Anchoring, which encourages evenly distributed Gaussians, resulting in better mesh reconstruction through mesh-guided densification and pruning on the deformed Gaussians.
+By applying cycle-consistent deformation between the canonical and the deformed space, we can project the anchored Gaussian back to the canonical space and optimize Gaussians across all time frames.
+During the evaluation on different datasets, DG-Mesh provides significantly better mesh reconstruction and rendering than baselines.
+
+
+ comment: Project page: https://www.liuisabella.com/DG-Mesh/ +
+
+
+
+
+ + ☆ MedThink: Explaining Medical Visual Question Answering via Multimodal + Decision-Making Rationale + + +
+ Medical Visual Question Answering (MedVQA), which offers language responses to image-based medical inquiries, represents a challenging task and significant advancement in healthcare.
+It assists medical experts in swiftly interpreting medical images, thereby enabling faster and more accurate diagnoses.
+However, the model interpretability and transparency of existing MedVQA solutions are often limited, posing challenges in understanding their decision-making processes.
+To address this issue, we devise a semi-automated annotation process to streamline data preparation and build new benchmark MedVQA datasets, R-RAD and R-SLAKE.
+The R-RAD and R-SLAKE datasets provide intermediate medical decision-making rationales generated by multimodal large language models and human annotations for question-answering pairs in existing MedVQA datasets, i.e., VQA-RAD and SLAKE.
+Moreover, we design a novel framework which finetunes lightweight pretrained generative models by incorporating medical decision-making rationales into the training process.
+The framework includes three distinct strategies to generate decision outcomes and corresponding rationales, thereby clearly showcasing the medical decision-making process during reasoning.
+Extensive experiments demonstrate that our method can achieve an accuracy of 83.5% on R-RAD and 86.3% on R-SLAKE, significantly outperforming existing state-of-the-art baselines.
+Dataset and code will be released.
+
+
+
+
+
+ + ☆ Gradient-Regularized Out-of-Distribution Detection ECCV + + +
+ One of the challenges for neural networks in real-life applications is the +overconfident errors these models make when the data is not from the original +training distribution. + Addressing this issue is known as Out-of-Distribution (OOD) detection. + Many state-of-the-art OOD methods employ an auxiliary dataset as a surrogate +for OOD data during training to achieve improved performance. + However, these methods fail to fully exploit the local information embedded +in the auxiliary dataset. + In this work, we propose the idea of leveraging the information embedded in +the gradient of the loss function during training to enable the network to not +only learn a desired OOD score for each sample but also to exhibit similar +behavior in a local neighborhood around each sample. + We also develop a novel energy-based sampling method to allow the network to +be exposed to more informative OOD samples during the training phase. This is +especially important when the auxiliary dataset is large. We demonstrate the +effectiveness of our method through extensive experiments on several OOD +benchmarks, improving the existing state-of-the-art FPR95 by 4% on our ImageNet +experiment. + We further provide a theoretical analysis through the lens of certified +robustness and Lipschitz analysis to showcase the theoretical foundation of our +work. We will publicly release our code after the review process. + +
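+ To make the two ingredients concrete, here is a minimal, generic sketch of an energy-style OOD score together with a regularizer that penalizes sharp local changes of that score; this is an illustrative reading of the idea, not the paper's exact loss:
+
+```python
+import torch
+
+def energy_score(logits: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
+    """Negative free energy; lower values typically indicate OOD inputs."""
+    return temperature * torch.logsumexp(logits / temperature, dim=1)
+
+def gradient_regularizer(model, x: torch.Tensor) -> torch.Tensor:
+    """Penalize the input-gradient norm of the OOD score (local smoothness)."""
+    x = x.clone().requires_grad_(True)
+    score = energy_score(model(x)).sum()
+    (grad,) = torch.autograd.grad(score, x, create_graph=True)
+    return grad.flatten(1).norm(dim=1).mean()
+
+# Training-time usage (sketch):
+# total = ce_loss + lambda_ood * ood_margin_loss + lambda_grad * gradient_regularizer(model, x)
+```
+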
+
+ comment: Under review for the 18th European Conference on Computer Vision + (ECCV) 2024 +
+
+
+
+
+ + ☆ Inverse Neural Rendering for Explainable Multi-Object Tracking + + +
+ Today, most methods for image understanding tasks rely on feed-forward neural networks.
+While this approach has allowed for empirical accuracy, efficiency, and task adaptation via fine-tuning, it also comes with fundamental disadvantages.
+Existing networks often struggle to generalize across different datasets, even on the same task.
+By design, these networks ultimately reason about high-dimensional scene features, which are challenging to analyze.
+This is true especially when attempting to predict 3D information based on 2D images.
+We propose to recast 3D multi-object tracking from RGB cameras as an \emph{Inverse Rendering (IR)} problem: we optimize, via a differentiable rendering pipeline, over the latent space of pre-trained 3D object representations and retrieve the latents that best represent object instances in a given input image.
+To this end, we optimize an image loss over generative latent spaces that inherently disentangle shape and appearance properties.
+Our method not only offers an alternate take on tracking; it also enables examining the generated objects, reasoning about failure situations, and resolving ambiguous cases.
+We validate the generalization and scaling capabilities of our method by learning the generative prior exclusively from synthetic data and assessing camera-based 3D tracking on the nuScenes and Waymo datasets.
+Both these datasets are completely unseen to our method and do not require fine-tuning.
+Videos and code are available at https://light.princeton.edu/inverse-rendering-tracking/.
+
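+ The test-time optimization at the heart of such an inverse-rendering formulation can be sketched generically as fitting latent codes through a differentiable renderer; `generator` and `render` below are placeholders for the pre-trained object representation and rendering pipeline, not the authors' actual components:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def fit_latent(generator, render, target_crop, steps=200, lr=5e-2):
+    """Optimize shape/appearance latents so the rendered object matches an image crop."""
+    latent = torch.zeros(1, generator.latent_dim, requires_grad=True)  # latent_dim assumed
+    pose = torch.zeros(1, 6, requires_grad=True)   # e.g., translation + rotation parameters
+    opt = torch.optim.Adam([latent, pose], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        rendered = render(generator(latent), pose)  # differentiable rendering step
+        loss = F.l1_loss(rendered, target_crop)
+        loss.backward()
+        opt.step()
+    return latent.detach(), pose.detach()
+```
+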
+
+
+
+
+ + ☆ V2Xum-LLM: Cross-Modal Video Summarization with Temporal Prompt + Instruction Tuning + + +
+ Video summarization aims to create short, accurate, and cohesive summaries of +longer videos. Despite the existence of various video summarization datasets, a +notable limitation is their limited amount of source videos, which hampers the +effective fine-tuning of advanced large vision-language models (VLMs). +Additionally, most existing datasets are created for video-to-video +summarization, overlooking the contemporary need for multimodal video content +summarization. Recent efforts have been made to expand from unimodal to +multimodal video summarization, categorizing the task into three sub-tasks +based on the summary's modality: video-to-video (V2V), video-to-text (V2T), and +a combination of video and text summarization (V2VT). However, the textual +summaries in previous multimodal datasets are inadequate. To address these +issues, we introduce Instruct-V2Xum, a cross-modal video summarization dataset +featuring 30,000 diverse videos sourced from YouTube, with lengths ranging from +40 to 940 seconds and an average summarization ratio of 16.39\%. Each video +summary in Instruct-V2Xum is paired with a textual summary that references +specific frame indexes, facilitating the generation of aligned video and +textual summaries. In addition, we propose a new video summarization framework +named V2Xum-LLM. V2Xum-LLM, specifically V2Xum-LLaMA in this study, is the +first framework that unifies different video summarization tasks into one large +language model's (LLM) text decoder and achieves task-controllable video +summarization with temporal prompts and task instructions. Experiments show +that V2Xum-LLaMA outperforms strong baseline models on multiple video +summarization tasks. Furthermore, we propose an enhanced evaluation metric for +V2V and V2VT summarization tasks. + +
+
+
+
+
+ + ☆ Point-In-Context: Understanding Point Cloud via In-Context Learning + + +
+ With the emergence of large-scale models trained on diverse datasets, in-context learning has emerged as a promising paradigm for multitasking, notably in natural language processing and image processing.
+However, its application in 3D point cloud tasks remains largely unexplored.
+In this work, we introduce Point-In-Context (PIC), a novel framework for 3D point cloud understanding via in-context learning.
+We address the technical challenge of effectively extending masked point modeling to 3D point clouds by introducing a Joint Sampling module and proposing a vanilla version of PIC called Point-In-Context-Generalist (PIC-G).
+PIC-G is designed as a generalist model for various 3D point cloud tasks, with inputs and outputs modeled as coordinates.
+In this paradigm, the challenging segmentation task is achieved by assigning label points with XYZ coordinates for each category; the final prediction is then chosen based on the label point closest to the predictions.
+To break the limitation of the fixed label-coordinate assignment, which generalizes poorly to novel classes, we propose two novel training strategies, In-Context Labeling and In-Context Enhancing, forming an extended version of PIC named Point-In-Context-Segmenter (PIC-S), targeting improved dynamic context labeling and model training.
+By utilizing dynamic in-context labels and extra in-context pairs, PIC-S achieves enhanced performance and generalization capability in and across part segmentation datasets.
+PIC is a general framework, so other tasks or datasets can be seamlessly introduced through a unified data format.
+We conduct extensive experiments to validate the versatility and adaptability of our proposed methods in handling a wide range of tasks and segmenting multiple datasets.
+Our PIC-S is capable of generalizing to unseen datasets and performing novel part segmentation by customizing prompts.
+
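+ The label-point decoding convention described above reduces to a nearest-neighbour lookup; a small sketch under that convention (one XYZ anchor per category) might be:
+
+```python
+import numpy as np
+
+def decode_labels(pred_points: np.ndarray, label_anchors: np.ndarray) -> np.ndarray:
+    """pred_points: (N, 3) predicted label points; label_anchors: (C, 3) per-class XYZ anchors.
+    Returns the index of the closest class anchor for each predicted point."""
+    # Pairwise squared distances between predictions and class anchors.
+    d2 = ((pred_points[:, None, :] - label_anchors[None, :, :]) ** 2).sum(-1)
+    return d2.argmin(axis=1)
+```
+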
+
+ comment: Project page: https://fanglaosi.github.io/Point-In-Context_Pages. + arXiv admin note: text overlap with arXiv:2306.08659 +
+
+
+
+
+ + ☆ AniClipart: Clipart Animation with Text-to-Video Priors + + +
+ Clipart, a pre-made graphic art form, offers a convenient and efficient way +of illustrating visual content. Traditional workflows to convert static clipart +images into motion sequences are laborious and time-consuming, involving +numerous intricate steps like rigging, key animation and in-betweening. Recent +advancements in text-to-video generation hold great potential in resolving this +problem. Nevertheless, direct application of text-to-video generation models +often struggles to retain the visual identity of clipart images or generate +cartoon-style motions, resulting in unsatisfactory animation outcomes. In this +paper, we introduce AniClipart, a system that transforms static clipart images +into high-quality motion sequences guided by text-to-video priors. To generate +cartoon-style and smooth motion, we first define B\'{e}zier curves over +keypoints of the clipart image as a form of motion regularization. We then +align the motion trajectories of the keypoints with the provided text prompt by +optimizing the Video Score Distillation Sampling (VSDS) loss, which encodes +adequate knowledge of natural motion within a pretrained text-to-video +diffusion model. With a differentiable As-Rigid-As-Possible shape deformation +algorithm, our method can be end-to-end optimized while maintaining deformation +rigidity. Experimental results show that the proposed AniClipart consistently +outperforms existing image-to-video generation models, in terms of text-video +alignment, visual identity preservation, and motion consistency. Furthermore, +we showcase the versatility of AniClipart by adapting it to generate a broader +array of animation formats, such as layered animation, which allows topological +changes. + +
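+ The motion regularization rests on sampling Bezier trajectories for each keypoint; a minimal cubic Bezier evaluation (the cubic degree and step count are illustrative assumptions) is:
+
+```python
+import numpy as np
+
+def cubic_bezier(p0, p1, p2, p3, num_steps: int = 16) -> np.ndarray:
+    """Sample a cubic Bezier trajectory for one keypoint; each p_i is an (x, y) control point."""
+    t = np.linspace(0.0, 1.0, num_steps)[:, None]
+    return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
+            + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)
+
+# Example: a keypoint easing from (0, 0) to (1, 0) with a slight arc.
+trajectory = cubic_bezier(np.array([0, 0]), np.array([0.3, 0.2]),
+                          np.array([0.7, 0.2]), np.array([1, 0]))
+```
+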
+
+ comment: Project Page: https://aniclipart.github.io/ +
+
+
+
+
+ + ☆ Measuring Feature Dependency of Neural Networks by Collapsing Feature + Dimensions in the Data Manifold + + +
+ This paper introduces a new technique to measure the feature dependency of +neural network models. The motivation is to better understand a model by +querying whether it is using information from human-understandable features, +e.g., anatomical shape, volume, or image texture. Our method is based on the +principle that if a model is dependent on a feature, then removal of that +feature should significantly harm its performance. A targeted feature is +"removed" by collapsing the dimension in the data distribution that corresponds +to that feature. We perform this by moving data points along the feature +dimension to a baseline feature value while staying on the data manifold, as +estimated by a deep generative model. Then we observe how the model's +performance changes on the modified test data set, with the target feature +dimension removed. We test our method on deep neural network models trained on +synthetic image data with known ground truth, an Alzheimer's disease prediction +task using MRI and hippocampus segmentations from the OASIS-3 dataset, and a +cell nuclei classification task using the Lizard dataset. + +
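+ Abstracting away the specific generative model, the "collapse" operation amounts to encoding a sample, overwriting the latent coordinate tied to the target feature with a baseline value, and decoding back to data space. A hedged sketch, where the encoder/decoder pair and the feature index are placeholders rather than the authors' components:
+
+```python
+import torch
+
+@torch.no_grad()
+def collapse_feature(encoder, decoder, x: torch.Tensor,
+                     feature_dim: int, baseline_value: float = 0.0) -> torch.Tensor:
+    """Project x back onto the data manifold with one latent feature dimension collapsed."""
+    z = encoder(x)                      # map data into the learned latent space
+    z[:, feature_dim] = baseline_value  # remove variation along the target feature
+    return decoder(z)                   # decode back to data space, staying on the manifold
+
+# Dependency score (sketch): accuracy(model, test) - accuracy(model, collapse_feature(...)).
+```
+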
+
+ comment: Accepted and will be published at the International Symposium on
+ Biomedical Imaging (ISBI) 2024
+
+
+
+
+
+ + ☆ SPOT: Point Cloud Based Stereo Visual Place Recognition for Similar and + Opposing Viewpoints ICRA 2024 + + +
+ Recognizing places from an opposing viewpoint during a return trip is a +common experience for human drivers. However, the analogous robotics +capability, visual place recognition (VPR) with limited field of view cameras +under 180 degree rotations, has proven to be challenging to achieve. To address +this problem, this paper presents Same Place Opposing Trajectory (SPOT), a +technique for opposing viewpoint VPR that relies exclusively on structure +estimated through stereo visual odometry (VO). The method extends recent +advances in lidar descriptors and utilizes a novel double (similar and +opposing) distance matrix sequence matching method. We evaluate SPOT on a +publicly available dataset with 6.7-7.6 km routes driven in similar and +opposing directions under various lighting conditions. The proposed algorithm +demonstrates remarkable improvement over the state-of-the-art, achieving up to +91.7% recall at 100% precision in opposing viewpoint cases, while requiring +less storage than all baselines tested and running faster than all but one. +Moreover, the proposed method assumes no a priori knowledge of whether the +viewpoint is similar or opposing, and also demonstrates competitive performance +in similar viewpoint cases. + +
+
+ comment: Accepted to ICRA 2024, project website: + https://umautobots.github.io/spot +
+
+
+
+
+ + ☆ Customizing Text-to-Image Diffusion with Camera Viewpoint Control + + +
+ Model customization introduces new concepts to existing text-to-image models, +enabling the generation of the new concept in novel contexts. However, such +methods lack accurate camera view control w.r.t the object, and users must +resort to prompt engineering (e.g., adding "top-view") to achieve coarse view +control. In this work, we introduce a new task -- enabling explicit control of +camera viewpoint for model customization. This allows us to modify object +properties amongst various background scenes via text prompts, all while +incorporating the target camera pose as additional control. This new task +presents significant challenges in merging a 3D representation from the +multi-view images of the new concept with a general, 2D text-to-image model. To +bridge this gap, we propose to condition the 2D diffusion process on rendered, +view-dependent features of the new object. During training, we jointly adapt +the 2D diffusion modules and 3D feature predictions to reconstruct the object's +appearance and geometry while reducing overfitting to the input multi-view +images. Our method outperforms existing image editing and model personalization +baselines in preserving the custom object's identity while following the input +text prompt and the object's camera pose. + +
+
+ comment: project page: https://customdiffusion360.github.io +
+
+
+
+
+ + ☆ A Perspective on Deep Vision Performance with Standard Image and Video + Codecs CVPR 2024 + + +
+ Resource-constrained hardware, such as edge devices or cell phones, often +rely on cloud servers to provide the required computational resources for +inference in deep vision models. However, transferring image and video data +from an edge or mobile device to a cloud server requires coding to deal with +network constraints. The use of standardized codecs, such as JPEG or H.264, is +prevalent and required to ensure interoperability. This paper aims to examine +the implications of employing standardized codecs within deep vision pipelines. +We find that using JPEG and H.264 coding significantly deteriorates the +accuracy across a broad range of vision tasks and models. For instance, strong +compression rates reduce semantic segmentation accuracy by more than 80% in +mIoU. In contrast to previous findings, our analysis extends beyond image and +action classification to localization and dense prediction tasks, thus +providing a more comprehensive perspective. + +
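+ The degradation studied here can be reproduced in miniature by re-encoding inputs at different JPEG qualities before inference; a small sketch (the downstream model and evaluation routine are placeholders, not the paper's pipeline):
+
+```python
+import io
+from PIL import Image
+
+def jpeg_roundtrip(image: Image.Image, quality: int) -> Image.Image:
+    """Encode and decode an image with a standard JPEG codec at a given quality."""
+    buffer = io.BytesIO()
+    image.save(buffer, format="JPEG", quality=quality)
+    buffer.seek(0)
+    return Image.open(buffer).convert("RGB")
+
+# Sweep: run the same vision model on progressively stronger compression.
+# for q in (95, 75, 50, 25, 10):
+#     degraded = jpeg_roundtrip(original, q)
+#     score = evaluate(model, degraded)   # placeholder evaluation routine
+```
+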
+
+ comment: Accepted at CVPR 2024 Workshop on AI for Streaming (AIS) +
+
+
+
+
+ + ☆ Generalizable Face Landmarking Guided by Conditional Face Warping CVPR 2024 + + +
+ As a significant step for human face modeling, editing, and generation, face landmarking aims at extracting facial keypoints from images.
+A generalizable face landmarker is required in practice because real-world facial images, e.g., the avatars in animations and games, are often stylized in various ways.
+However, achieving generalizable face landmarking is challenging due to the diversity of facial styles and the scarcity of labeled stylized faces.
+In this study, we propose a simple but effective paradigm to learn a generalizable face landmarker based on labeled real human faces and unlabeled stylized faces.
+Our method learns the face landmarker as the key module of a conditional face warper.
+Given a pair of real and stylized facial images, the conditional face warper predicts a warping field from the real face to the stylized one, in which the face landmarker predicts the ending points of the warping field and provides us with high-quality pseudo landmarks for the corresponding stylized facial images.
+Applying an alternating optimization strategy, we learn the face landmarker to minimize $i)$ the discrepancy between the stylized faces and the warped real ones and $ii)$ the prediction errors of both real and pseudo landmarks.
+Experiments on various datasets show that our method outperforms existing state-of-the-art domain adaptation methods in face landmarking tasks, leading to a face landmarker with better generalizability.
+Code is available at https://plustwo0.github.io/project-face-landmarker.
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ iRAG: An Incremental Retrieval Augmented Generation System for Videos + + +
+ Retrieval augmented generation (RAG) systems combine the strengths of language generation and information retrieval to power many real-world applications like chatbots.
+Use of RAG for combined understanding of multimodal data such as text, images and videos is appealing but two critical limitations exist: one-time, upfront capture of all content in large multimodal data as text descriptions entails high processing times, and not all information in the rich multimodal data is typically in the text descriptions.
+Since the user queries are not known a priori, developing a system for multimodal to text conversion and interactive querying of multimodal data is challenging.
+ To address these limitations, we propose iRAG, which augments RAG with a novel incremental workflow to enable interactive querying of a large corpus of multimodal data.
+Unlike traditional RAG, iRAG quickly indexes large repositories of multimodal data, and in the incremental workflow, it uses the index to opportunistically extract more details from select portions of the multimodal data to retrieve context relevant to an interactive user query.
+Such an incremental workflow avoids long multimodal to text conversion times, overcomes information loss issues by doing on-demand query-specific extraction of details in multimodal data, and ensures high quality of responses to interactive user queries that are often not known a priori.
+To the best of our knowledge, iRAG is the first system to augment RAG with an incremental workflow to support efficient interactive querying of large, real-world multimodal data.
+Experimental results on real-world long videos demonstrate 23x to 25x faster video to text ingestion, while ensuring that quality of responses to interactive user queries is comparable to responses from a traditional RAG where all video data is converted to text upfront before any querying.
+
+
+
+
+
+ + ☆ When Medical Imaging Met Self-Attention: A Love Story That Didn't Quite + Work Out + + +
+ A substantial body of research has focused on developing systems that assist +medical professionals during labor-intensive early screening processes, many +based on convolutional deep-learning architectures. Recently, multiple studies +explored the application of so-called self-attention mechanisms in the vision +domain. These studies often report empirical improvements over fully +convolutional approaches on various datasets and tasks. To evaluate this trend +for medical imaging, we extend two widely adopted convolutional architectures +with different self-attention variants on two different medical datasets. With +this, we aim to specifically evaluate the possible advantages of additional +self-attention. We compare our models with similarly sized convolutional and +attention-based baselines and evaluate performance gains statistically. +Additionally, we investigate how including such layers changes the features +learned by these models during the training. Following a hyperparameter search, +and contrary to our expectations, we observe no significant improvement in +balanced accuracy over fully convolutional models. We also find that important +features, such as dermoscopic structures in skin lesion images, are still not +learned by employing self-attention. Finally, analyzing local explanations, we +confirm biased feature usage. We conclude that merely incorporating attention +is insufficient to surpass the performance of existing fully convolutional +methods. + +
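+ A typical way to extend a convolutional backbone with a self-attention variant, in the spirit of the models compared here (layer placement and sizes are assumptions), is to attend over the flattened feature map before pooling:
+
+```python
+import torch
+import torch.nn as nn
+
+class AttentionAugmentedHead(nn.Module):
+    """Multi-head self-attention over flattened CNN feature maps, followed by pooling."""
+    def __init__(self, channels: int, num_heads: int = 4, num_classes: int = 2):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)
+        self.norm = nn.LayerNorm(channels)
+        self.fc = nn.Linear(channels, num_classes)
+
+    def forward(self, feats: torch.Tensor) -> torch.Tensor:  # feats: (B, C, H, W)
+        tokens = feats.flatten(2).transpose(1, 2)             # (B, H*W, C)
+        attended, _ = self.attn(tokens, tokens, tokens)
+        tokens = self.norm(tokens + attended)                 # residual connection
+        return self.fc(tokens.mean(dim=1))                    # global average pool + classifier
+```
+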
+
+ comment: 10 pages, 2 figures, 5 tables, presented at VISAPP 2024 +
+
+
+
+
+ + ☆ Reducing Bias in Pre-trained Models by Tuning while Penalizing Change + + +
+ Deep models trained on large amounts of data often incorporate implicit +biases present during training time. If later such a bias is discovered during +inference or deployment, it is often necessary to acquire new data and retrain +the model. This behavior is especially problematic in critical areas such as +autonomous driving or medical decision-making. In these scenarios, new data is +often expensive and hard to come by. In this work, we present a method based on +change penalization that takes a pre-trained model and adapts the weights to +mitigate a previously detected bias. We achieve this by tuning a +zero-initialized copy of a frozen pre-trained network. Our method needs very +few, in extreme cases only a single, examples that contradict the bias to +increase performance. Additionally, we propose an early stopping criterion to +modify baselines and reduce overfitting. We evaluate our approach on a +well-known bias in skin lesion classification and three other datasets from the +domain shift literature. We find that our approach works especially well with +very few images. Simple fine-tuning combined with our early stopping also leads +to performance benefits for a larger number of tuning samples. + +
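+ One way to read "tuning a zero-initialized copy of a frozen pre-trained network while penalizing change" is sketched below; this is a generic interpretation applied to a single head, not the authors' implementation:
+
+```python
+import copy
+import torch
+import torch.nn as nn
+
+class PenalizedTuner(nn.Module):
+    """Frozen pre-trained head plus a trainable, zero-initialized residual copy."""
+    def __init__(self, pretrained_head: nn.Linear):
+        super().__init__()
+        self.frozen = pretrained_head.requires_grad_(False)
+        self.delta = copy.deepcopy(pretrained_head)
+        nn.init.zeros_(self.delta.weight)   # start as an exact no-op
+        nn.init.zeros_(self.delta.bias)
+
+    def forward(self, x):
+        return self.frozen(x) + self.delta(x)
+
+    def change_penalty(self) -> torch.Tensor:
+        # Penalize deviation from the original behaviour (size of the residual weights).
+        return sum(p.pow(2).sum() for p in self.delta.parameters())
+
+# Training loss (sketch): task_loss(counterexample_batch) + lambda_reg * model.change_penalty()
+```
+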
+
+ comment: 12 pages, 12 figures, presented at VISAPP 2024 +
+
+
+
+
+ + ☆ Performance Evaluation of Segment Anything Model with Variational + Prompting for Application to Non-Visible Spectrum Imagery + + +
+ The Segment Anything Model (SAM) is a deep neural network foundational model +designed to perform instance segmentation which has gained significant +popularity given its zero-shot segmentation ability. SAM operates by generating +masks based on various input prompts such as text, bounding boxes, points, or +masks, introducing a novel methodology to overcome the constraints posed by +dataset-specific scarcity. While SAM is trained on an extensive dataset, +comprising ~11M images, it mostly consists of natural photographic images with +only very limited images from other modalities. Whilst the rapid progress in +visual infrared surveillance and X-ray security screening imaging technologies, +driven forward by advances in deep learning, has significantly enhanced the +ability to detect, classify and segment objects with high accuracy, it is not +evident if the SAM zero-shot capabilities can be transferred to such +modalities. This work assesses SAM capabilities in segmenting objects of +interest in the X-ray/infrared modalities. Our approach reuses the pre-trained +SAM with three different prompts: bounding box, centroid and random points. We +present quantitative/qualitative results to showcase the performance on +selected datasets. Our results show that SAM can segment objects in the X-ray +modality when given a box prompt, but its performance varies for point prompts. +Specifically, SAM performs poorly in segmenting slender objects and organic +materials, such as plastic bottles. We find that infrared objects are also +challenging to segment with point prompts given the low-contrast nature of this +modality. This study shows that while SAM demonstrates outstanding zero-shot +capabilities with box prompts, its performance ranges from moderate to poor for +point prompts, indicating that special consideration on the cross-modal +generalisation of SAM is needed when considering use on X-ray/infrared imagery. + +
+
+
+
+
+ + ☆ Alleviating Catastrophic Forgetting in Facial Expression Recognition + with Emotion-Centered Models + + +
+ Facial expression recognition is a pivotal component in machine learning, +facilitating various applications. However, convolutional neural networks +(CNNs) are often plagued by catastrophic forgetting, impeding their +adaptability. The proposed method, emotion-centered generative replay (ECgr), +tackles this challenge by integrating synthetic images from generative +adversarial networks. Moreover, ECgr incorporates a quality assurance algorithm +to ensure the fidelity of generated images. This dual approach enables CNNs to +retain past knowledge while learning new tasks, enhancing their performance in +emotion recognition. The experimental results on four diverse facial expression +datasets demonstrate that incorporating images generated by our +pseudo-rehearsal method enhances training on the targeted dataset and the +source dataset while making the CNN retain previously learned knowledge. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ DeepLocalization: Using change point detection for Temporal Action + Localization + + +
+ In this study, we introduce DeepLocalization, an innovative framework devised for the real-time localization of actions tailored explicitly for monitoring driver behavior.
+Utilizing the power of advanced deep learning methodologies, our objective is to tackle the critical issue of distracted driving, a significant factor contributing to road accidents.
+Our strategy employs a dual approach: leveraging Graph-Based Change-Point Detection for pinpointing actions in time alongside a Video Large Language Model (Video-LLM) for precisely categorizing activities.
+Through careful prompt engineering, we customize the Video-LLM to adeptly handle driving activities' nuances, ensuring its classification efficacy even with sparse data.
+Engineered to be lightweight, our framework is optimized for consumer-grade GPUs, making it vastly applicable in practical scenarios.
+We subjected our method to rigorous testing on the SynDD2 dataset, a complex benchmark for distracted driving behaviors, where it demonstrated commendable performance, achieving 57.5% accuracy in event classification and 51% in event detection.
+These outcomes underscore the substantial promise of DeepLocalization in accurately identifying diverse driver behaviors and their temporal occurrences, all within the bounds of limited computational resources.
+
+
+
+
+
+ + ☆ Food Portion Estimation via 3D Object Scaling + + +
+ Image-based methods to analyze food images have alleviated the user burden +and biases associated with traditional methods. However, accurate portion +estimation remains a major challenge due to the loss of 3D information in the +2D representation of foods captured by smartphone cameras or wearable devices. +In this paper, we propose a new framework to estimate both food volume and +energy from 2D images by leveraging the power of 3D food models and physical +reference in the eating scene. Our method estimates the pose of the camera and +the food object in the input image and recreates the eating occasion by +rendering an image of a 3D model of the food with the estimated poses. We also +introduce a new dataset, SimpleFood45, which contains 2D images of 45 food +items and associated annotations including food volume, weight, and energy. Our +method achieves an average error of 31.10 kCal (17.67%) on this dataset, +outperforming existing portion estimation methods. + +
+
+
+
+
+ + ☆ Deep Gaussian mixture model for unsupervised image segmentation + + +
+ The recent emergence of deep learning has led to a great deal of work on designing supervised deep semantic segmentation algorithms.
+As in many tasks sufficient pixel-level labels are very difficult to obtain, we propose a method which combines a Gaussian mixture model (GMM) with unsupervised deep learning techniques.
+In the standard GMM, the pixel values within each sub-region are modelled by a Gaussian distribution.
+In order to identify the different regions, the parameter vector that minimizes the negative log-likelihood (NLL) function of the GMM has to be approximated.
+For this task, usually iterative optimization methods such as the expectation-maximization (EM) algorithm are used.
+In this paper, we propose to estimate these parameters directly from the image using a convolutional neural network (CNN).
+We thus change the iterative procedure in the EM algorithm, replacing the expectation-step by a gradient-step with respect to the network's parameters.
+This means that the network is trained to minimize the NLL function of the GMM, which comes with at least two advantages.
+First, once trained, the network is able to predict label probabilities very quickly compared with time-consuming iterative optimization methods.
+Second, due to the deep image prior, our method is able to partially overcome one of the main disadvantages of the GMM, namely that it does not take correlations between neighboring pixels into account, as it assumes independence between them.
+We demonstrate the advantages of our method in various experiments on the example of myocardial infarct segmentation on multi-sequence MRI images.
+
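+ The training objective can be written directly as the GMM negative log-likelihood, with the CNN supplying per-pixel mixture weights; a simplified single-channel sketch (component means and variances are kept as learnable tensors here for brevity, which condenses the parameter-update scheme described above):
+
+```python
+import math
+import torch
+import torch.nn.functional as F
+
+def gmm_nll(image, logits, mu, log_sigma):
+    """image: (B, 1, H, W) intensities; logits: (B, K, H, W) CNN output;
+    mu, log_sigma: (K,) per-component Gaussian parameters."""
+    log_pi = F.log_softmax(logits, dim=1)                  # per-pixel mixture weights
+    mu = mu.view(1, -1, 1, 1)
+    sigma = log_sigma.exp().view(1, -1, 1, 1)
+    log_gauss = (-0.5 * ((image - mu) / sigma) ** 2
+                 - log_sigma.view(1, -1, 1, 1) - 0.5 * math.log(2 * math.pi))
+    # Per-pixel mixture log-likelihood via logsumexp, averaged into a scalar NLL.
+    return -torch.logsumexp(log_pi + log_gauss, dim=1).mean()
+```
+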
+
+
+
+
+ + ☆ Dynamic Modality and View Selection for Multimodal Emotion Recognition + with Missing Modalities + + +
+ The study of human emotions, traditionally a cornerstone in fields like +psychology and neuroscience, has been profoundly impacted by the advent of +artificial intelligence (AI). Multiple channels, such as speech (voice) and +facial expressions (image), are crucial in understanding human emotions. +However, AI's journey in multimodal emotion recognition (MER) is marked by +substantial technical challenges. One significant hurdle is how AI models +manage the absence of a particular modality - a frequent occurrence in +real-world situations. This study's central focus is assessing the performance +and resilience of two strategies when confronted with the lack of one modality: +a novel multimodal dynamic modality and view selection and a cross-attention +mechanism. Results on the RECOLA dataset show that dynamic selection-based +methods are a promising approach for MER. In the missing modalities scenarios, +all dynamic selection-based methods outperformed the baseline. The study +concludes by emphasizing the intricate interplay between audio and video +modalities in emotion prediction, showcasing the adaptability of dynamic +selection methods in handling missing modalities. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Blind Localization and Clustering of Anomalies in Textures + + +
+ Anomaly detection and localization in images is a growing field in computer +vision. In this area, a seemingly understudied problem is anomaly clustering, +i.e., identifying and grouping different types of anomalies in a fully +unsupervised manner. In this work, we propose a novel method for clustering +anomalies in largely stationary images (textures) in a blind setting. That is, +the input consists of normal and anomalous images without distinction and +without labels. What contributes to the difficulty of the task is that +anomalous regions are often small and may present only subtle changes in +appearance, which can be easily overshadowed by the genuine variance in the +texture. Moreover, each anomaly type may have a complex appearance +distribution. We introduce a novel scheme for solving this task using a +combination of blind anomaly localization and contrastive learning. By +identifying the anomalous regions with high fidelity, we can restrict our focus +to those regions of interest; then, contrastive learning is employed to +increase the separability of different anomaly types and reduce the intra-class +variation. Our experiments show that the proposed solution yields significantly +better results compared to prior work, setting a new state of the art. Project +page: https://reality.tf.fau.de/pub/ardelean2024blind.html. + +
+
+
+
+
+ + ☆ Beyond Average: Individualized Visual Scanpath Prediction CVPR2024 + + +
+ Understanding how attention varies across individuals has significant +scientific and societal impacts. However, existing visual scanpath models treat +attention uniformly, neglecting individual differences. To bridge this gap, +this paper focuses on individualized scanpath prediction (ISP), a new attention +modeling task that aims to accurately predict how different individuals shift +their attention in diverse visual tasks. It proposes an ISP method featuring +three novel technical components: (1) an observer encoder to characterize and +integrate an observer's unique attention traits, (2) an observer-centric +feature integration approach that holistically combines visual features, task +guidance, and observer-specific characteristics, and (3) an adaptive fixation +prioritization mechanism that refines scanpath predictions by dynamically +prioritizing semantic feature maps based on individual observers' attention +traits. These novel components allow scanpath models to effectively address the +attention variations across different observers. Our method is generally +applicable to different datasets, model architectures, and visual tasks, +offering a comprehensive tool for transforming general scanpath models into +individualized ones. Comprehensive evaluations using value-based and +ranking-based metrics verify the method's effectiveness and generalizability. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ ProTA: Probabilistic Token Aggregation for Text-Video Retrieval + + +
+ Text-video retrieval aims to find the most relevant cross-modal samples for a +given query. Recent methods focus on modeling the whole spatial-temporal +relations. However, since video clips contain more diverse content than +captions, the model aligning these asymmetric video-text pairs has a high risk +of retrieving many false positive results. In this paper, we propose +Probabilistic Token Aggregation (\textit{ProTA}) to handle cross-modal +interaction with content asymmetry. Specifically, we propose dual +partial-related aggregation to disentangle and re-aggregate token +representations in both low-dimension and high-dimension spaces. We propose +token-based probabilistic alignment to generate token-level probabilistic +representation and maintain the feature representation diversity. In addition, +an adaptive contrastive loss is proposed to learn compact cross-modal +distribution space. Based on extensive experiments, \textit{ProTA} achieves +significant improvements on MSR-VTT (50.9%), LSMDC (25.8%), and DiDeMo (47.2%). + +
+
+
+
+
+ + ☆ Observation, Analysis, and Solution: Exploring Strong Lightweight Vision + Transformers via Masked Image Modeling Pre-Training + + +
+ Masked image modeling (MIM) pre-training for large-scale vision transformers (ViTs) in computer vision has enabled promising downstream performance on top of the learned self-supervised ViT features.
+In this paper, we ask whether extremely simple, small-scale ViTs can also benefit from this pre-training paradigm when fine-tuned, a regime that remains considerably less studied than the well-established lightweight architecture design methodology built around sophisticated, hand-crafted components.
+By carefully adapting various typical MIM pre-training methods to this lightweight regime and comparing them with the contrastive learning (CL) pre-training on various downstream image classification and dense prediction tasks, we systematically observe different behaviors between MIM and CL with respect to the downstream fine-tuning data scales.
+Furthermore, we analyze the frozen features under linear probing evaluation and also the layer representation similarities and attention maps across the obtained models, which clearly show the inferior learning of MIM pre-training on higher layers, leading to unsatisfactory fine-tuning performance on data-insufficient downstream tasks.
+This finding naturally guides the choice of appropriate distillation strategies during pre-training to solve the above deterioration problem.
+Extensive experiments on various vision tasks demonstrate the effectiveness of our observation-analysis-solution flow.
+In particular, our pre-training with distillation on pure lightweight ViTs with vanilla/hierarchical design (5.7M/6.5M) can achieve 79.4%/78.9% top-1 accuracy on ImageNet-1K.
+It also enables SOTA performance on the ADE20K semantic segmentation task (42.8% mIoU) and LaSOT visual tracking task (66.1% AUC) in the lightweight regime.
+The latter even surpasses all the current SOTA lightweight CPU-realtime trackers.
+
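+ For context, the MIM objective being adapted to the lightweight regime boils down to reconstructing randomly masked patches and scoring the loss only at masked positions; a compact, generic sketch (masking ratio and tensor shapes are illustrative):
+
+```python
+import torch
+
+def masked_reconstruction_loss(encoder_decoder, patches: torch.Tensor,
+                               mask_ratio: float = 0.75) -> torch.Tensor:
+    """patches: (B, N, D) patchified images; encoder_decoder predicts all N patches."""
+    B, N, _ = patches.shape
+    mask = torch.rand(B, N, device=patches.device) < mask_ratio   # True = masked
+    corrupted = patches.masked_fill(mask.unsqueeze(-1), 0.0)      # drop masked content
+    pred = encoder_decoder(corrupted)                             # (B, N, D) reconstruction
+    # Mean-squared error computed only on the masked patches.
+    return ((pred - patches) ** 2).mean(dim=-1)[mask].mean()
+```
+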
+
+
+
+
+ + ☆ Partial-to-Partial Shape Matching with Geometric Consistency + + +
+ Finding correspondences between 3D shapes is an important and long-standing +problem in computer vision, graphics and beyond. A prominent challenge are +partial-to-partial shape matching settings, which occur when the shapes to +match are only observed incompletely (e.g. from 3D scanning). Although +partial-to-partial matching is a highly relevant setting in practice, it is +rarely explored. Our work bridges the gap between existing (rather artificial) +3D full shape matching and partial-to-partial real-world settings by exploiting +geometric consistency as a strong constraint. We demonstrate that it is indeed +possible to solve this challenging problem in a variety of settings. For the +first time, we achieve geometric consistency for partial-to-partial matching, +which is realized by a novel integer non-linear program formalism building on +triangle product spaces, along with a new pruning algorithm based on linear +integer programming. Further, we generate a new inter-class dataset for +partial-to-partial shape-matching. We show that our method outperforms current +SOTA methods on both an established intra-class dataset and our novel +inter-class dataset. + +
+
+
+
+
+ + ☆ GraFIQs: Face Image Quality Assessment Using Gradient Magnitudes CVPR + + +
+ Face Image Quality Assessment (FIQA) estimates the utility of face images for automated face recognition (FR) systems.
+We propose in this work a novel approach to assess the quality of face images based on inspecting the required changes in the pre-trained FR model weights to minimize differences between testing samples and the distribution of the FR training dataset.
+To achieve that, we propose quantifying the discrepancy in Batch Normalization statistics (BNS), including mean and variance, between those recorded during FR training and those obtained by processing testing samples through the pretrained FR model.
+We then generate gradient magnitudes of pretrained FR weights by backpropagating the BNS through the pretrained model.
+The cumulative absolute sum of these gradient magnitudes serves as the FIQ for our approach.
+Through comprehensive experimentation, we demonstrate the effectiveness of our training-free and quality labeling-free approach, achieving competitive performance to recent state-of-the-art FIQA approaches without relying on quality labeling, the need to train regression networks, specialized architectures, or designing and optimizing specific loss functions.
+
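+ A rough sketch of the described scoring recipe for a PyTorch FR backbone is below; the hook placement and the exact form of the discrepancy term are simplifications of the paper's description, not its reference code:
+
+```python
+import torch
+import torch.nn as nn
+
+def graf_iq_score(fr_model: nn.Module, face_batch: torch.Tensor) -> float:
+    """Sum of |weight gradients| obtained by backpropagating a BN-statistics discrepancy."""
+    bn_inputs = []
+    hooks = [m.register_forward_hook(lambda m, inp, out: bn_inputs.append(inp[0]))
+             for m in fr_model.modules() if isinstance(m, nn.BatchNorm2d)]
+    fr_model.zero_grad()
+    fr_model(face_batch)   # forward pass populates bn_inputs via the hooks
+
+    loss = 0.0
+    # Assumes BN layers execute in module-registration order (true for typical backbones).
+    bn_layers = [m for m in fr_model.modules() if isinstance(m, nn.BatchNorm2d)]
+    for layer, activation in zip(bn_layers, bn_inputs):
+        mean = activation.mean(dim=(0, 2, 3))
+        var = activation.var(dim=(0, 2, 3), unbiased=False)
+        loss = loss + (mean - layer.running_mean).pow(2).sum() \
+                    + (var - layer.running_var).pow(2).sum()
+    loss.backward()
+    for h in hooks:
+        h.remove()
+    return sum(p.grad.abs().sum().item() for p in fr_model.parameters() if p.grad is not None)
+```
+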
+
+ comment: Accepted at CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Aligning Actions and Walking to LLM-Generated Textual Descriptions + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +various domains, including data augmentation and synthetic data generation. +This work explores the use of LLMs to generate rich textual descriptions for +motion sequences, encompassing both actions and walking patterns. We leverage +the expressive power of LLMs to align motion representations with high-level +linguistic cues, addressing two distinct tasks: action recognition and +retrieval of walking sequences based on appearance attributes. For action +recognition, we employ LLMs to generate textual descriptions of actions in the +BABEL-60 dataset, facilitating the alignment of motion sequences with +linguistic representations. In the domain of gait analysis, we investigate the +impact of appearance attributes on walking patterns by generating textual +descriptions of motion sequences from the DenseGait dataset using LLMs. These +descriptions capture subtle variations in walking styles influenced by factors +such as clothing choices and footwear. Our approach demonstrates the potential +of LLMs in augmenting structured motion attributes and aligning multi-modal +representations. The findings contribute to the advancement of comprehensive +motion understanding and open up new avenues for leveraging LLMs in multi-modal +alignment and data augmentation for motion analysis. We make the code publicly +available at https://github.com/Radu1999/WalkAndText + +
+
+ comment: Accepted at 2nd Workshop on Learning with Few or without Annotated + Face, Body and Gesture Data +
+
+
+
+
+ + ☆ Gait Recognition from Highly Compressed Videos + + +
+ Surveillance footage represents a valuable resource and opportunities for +conducting gait analysis. However, the typical low quality and high noise +levels in such footage can severely impact the accuracy of pose estimation +algorithms, which are foundational for reliable gait analysis. Existing +literature suggests a direct correlation between the efficacy of pose +estimation and the subsequent gait analysis results. A common mitigation +strategy involves fine-tuning pose estimation models on noisy data to improve +robustness. However, this approach may degrade the downstream model's +performance on the original high-quality data, leading to a trade-off that is +undesirable in practice. We propose a processing pipeline that incorporates a +task-targeted artifact correction model specifically designed to pre-process +and enhance surveillance footage before pose estimation. Our artifact +correction model is optimized to work alongside a state-of-the-art pose +estimation network, HRNet, without requiring repeated fine-tuning of the pose +estimation model. Furthermore, we propose a simple and robust method for +obtaining low quality videos that are annotated with poses in an automatic +manner with the purpose of training the artifact correction model. We +systematically evaluate the performance of our artifact correction model +against a range of noisy surveillance data and demonstrate that our approach +not only achieves improved pose estimation on low-quality surveillance footage, +but also preserves the integrity of the pose estimation on high resolution +footage. Our experiments show a clear enhancement in gait analysis performance, +supporting the viability of the proposed method as a superior alternative to +direct fine-tuning strategies. Our contributions pave the way for more reliable +gait analysis using surveillance data in real-world applications, regardless of +data quality. + +
+
+ comment: Accepted at 2nd Workshop on Learning with Few or without Annotated + Face, Body and Gesture Data +
+
+
+
+
+ + ☆ How to Benchmark Vision Foundation Models for Semantic Segmentation? CVPR 2024 + + +
+ Recent vision foundation models (VFMs) have demonstrated proficiency in +various tasks but require supervised fine-tuning to perform the task of +semantic segmentation effectively. Benchmarking their performance is essential +for selecting current models and guiding future model developments for this +task. The lack of a standardized benchmark complicates comparisons. Therefore, +the primary objective of this paper is to study how VFMs should be benchmarked +for semantic segmentation. To do so, various VFMs are fine-tuned under various +settings, and the impact of individual settings on the performance ranking and +training time is assessed. Based on the results, the recommendation is to +fine-tune the ViT-B variants of VFMs with a 16x16 patch size and a linear +decoder, as these settings are representative of using a larger model, more +advanced decoder and smaller patch size, while reducing training time by more +than 13 times. Using multiple datasets for training and evaluation is also +recommended, as the performance ranking across datasets and domain shifts +varies. Linear probing, a common practice for some VFMs, is not recommended, as +it is not representative of end-to-end fine-tuning. The benchmarking setup +recommended in this paper enables a performance analysis of VFMs for semantic +segmentation. The findings of such an analysis reveal that pretraining with +promptable segmentation is not beneficial, whereas masked image modeling (MIM) +with abstract representations is crucial, even more important than the type of +supervision used. The code for efficiently fine-tuning VFMs for semantic +segmentation can be accessed through the project page at: +https://tue-mps.github.io/benchmark-vfm-ss/. + +
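+
+ The recommended protocol (ViT-B, 16x16 patches, linear decoder) reduces to a single
+linear layer over patch tokens followed by upsampling. Below is a minimal sketch under
+the assumption of a generic backbone returning patch tokens of shape [B, N, C]; the
+`backbone` stub stands in for a real VFM and is not one of the benchmarked models.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LinearSegDecoder(nn.Module):
+    """Linear decoder over ViT patch tokens, upsampled to pixel resolution."""
+    def __init__(self, embed_dim=768, num_classes=19, patch=16):
+        super().__init__()
+        self.patch = patch
+        self.head = nn.Linear(embed_dim, num_classes)
+    def forward(self, tokens, img_hw):
+        B, N, C = tokens.shape
+        h, w = img_hw[0] // self.patch, img_hw[1] // self.patch
+        logits = self.head(tokens)                        # [B, N, K]
+        logits = logits.transpose(1, 2).reshape(B, -1, h, w)
+        return F.interpolate(logits, size=img_hw, mode="bilinear")
+
+def backbone(images):
+    """Stub standing in for a ViT-B/16 VFM: returns [B, N, 768] patch tokens."""
+    B, _, H, W = images.shape
+    return torch.randn(B, (H // 16) * (W // 16), 768)
+
+images = torch.rand(2, 3, 512, 512)
+decoder = LinearSegDecoder()
+masks = decoder(backbone(images), (512, 512))   # [2, 19, 512, 512]
+```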
+
+ comment: CVPR 2024 Workshop Proceedings for the Second Workshop on Foundation + Models +
+
+
+
+
+ + ☆ Real-World Efficient Blind Motion Deblurring via Blur Pixel + Discretization CVPR2024 + + +
+ As recent advances in mobile camera technology have enabled the capability to +capture high-resolution images, such as 4K images, the demand for an efficient +deblurring model handling large motion has increased. In this paper, we +discover that the image residual errors, i.e., blur-sharp pixel differences, +can be grouped into some categories according to their motion blur type and how +complex their neighboring pixels are. Inspired by this, we decompose the +deblurring (regression) task into blur pixel discretization (pixel-level blur +classification) and discrete-to-continuous conversion (regression with blur +class map) tasks. Specifically, we generate the discretized image residual +errors by identifying the blur pixels and then transform them to a continuous +form, which is computationally more efficient than naively solving the original +regression problem with continuous values. Here, we found that the +discretization result, i.e., blur segmentation map, remarkably exhibits visual +similarity with the image residual errors. As a result, our efficient model +shows comparable performance to state-of-the-art methods in realistic +benchmarks, while our method is up to 10 times computationally more efficient. + +
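+
+ A toy sketch of the decomposition described above, under the assumption that blur
+pixels can be binned by residual magnitude: a classifier predicts a discrete blur-class
+map, and a light regressor converts that class map (plus the blurry input) back into a
+continuous correction. The bin edges and tiny networks are illustrative only.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+NUM_BINS = 8
+EDGES = torch.linspace(-0.5, 0.5, NUM_BINS - 1)    # illustrative residual bin edges
+
+def discretize_residual(blurry, sharp):
+    """Pixel-level blur classification target from blur-sharp differences."""
+    residual = (sharp - blurry).mean(dim=1)         # [B, H, W]
+    return torch.bucketize(residual, EDGES)         # integer blur-class map
+
+classifier = nn.Conv2d(3, NUM_BINS, 3, padding=1)        # blur pixel discretization
+regressor = nn.Conv2d(3 + NUM_BINS, 3, 3, padding=1)     # discrete-to-continuous conversion
+
+blurry = torch.rand(2, 3, 64, 64)
+sharp = torch.rand(2, 3, 64, 64)
+
+target_cls = discretize_residual(blurry, sharp)          # [B, H, W]
+pred_cls = classifier(blurry)                            # [B, K, H, W]
+cls_loss = F.cross_entropy(pred_cls, target_cls)
+
+class_map = F.one_hot(pred_cls.argmax(1), NUM_BINS).permute(0, 3, 1, 2).float()
+deblurred = blurry + regressor(torch.cat([blurry, class_map], dim=1))
+reg_loss = F.l1_loss(deblurred, sharp)
+```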
+
+ comment: CVPR2024 Camera-Ready +
+
+
+
+
+ + ☆ StyleBooth: Image Style Editing with Multimodal Instruction + + +
+ Given an original image, image editing aims to generate an image that aligns
+with the provided instruction. The challenges lie in accepting multimodal inputs
+as instructions and in the scarcity of high-quality training data, including
+crucial triplets of source/target image pairs and multimodal (text and image)
+instructions. In this paper, we focus on image style editing and present
+StyleBooth, a method that proposes a comprehensive framework for image editing
+and a feasible strategy for building a high-quality style editing dataset. We
+integrate the encoded textual instruction and image exemplar as a unified
+condition for the diffusion model, enabling the editing of the original image
+following multimodal instructions. Furthermore, through iterative style-destyle
+tuning and editing and usability filtering, the StyleBooth dataset provides
+content-consistent stylized/plain image pairs in various categories of styles.
+To show the flexibility of StyleBooth, we conduct experiments on diverse tasks,
+such as text-based style editing, exemplar-based style editing and
+compositional style editing. The results demonstrate that the quality and
+variety of training data significantly enhance the ability to preserve content
+and improve the overall quality of generated images in editing tasks. The
+project page can be found at https://ali-vilab.github.io/stylebooth-page/.
+
+
+
+
+ + ☆ Omniview-Tuning: Boosting Viewpoint Invariance of Vision-Language + Pre-training Models + + +
+ Vision-Language Pre-training (VLP) models like CLIP have achieved remarkable +success in computer vision and particularly demonstrated superior robustness to +distribution shifts of 2D images. However, their robustness under 3D viewpoint +variations is still limited, which can hinder the development for real-world +applications. This paper successfully addresses this concern while keeping +VLPs' original performance by breaking through two primary obstacles: 1) the +scarcity of training data and 2) the suboptimal fine-tuning paradigms. To +combat data scarcity, we build the Multi-View Caption (MVCap) dataset -- a +comprehensive collection of over four million multi-view image-text pairs +across more than 100K objects, providing more potential for VLP models to +develop generalizable viewpoint-invariant representations. To address the +limitations of existing paradigms in performance trade-offs and training +efficiency, we design a novel fine-tuning framework named Omniview-Tuning +(OVT). Specifically, OVT introduces a Cross-Viewpoint Alignment objective +through a minimax-like optimization strategy, which effectively aligns +representations of identical objects from diverse viewpoints without causing +overfitting. Additionally, OVT fine-tunes VLP models in a parameter-efficient +manner, leading to minimal computational cost. Extensive experiments on various +VLP models with different architectures validate that OVT significantly +improves the models' resilience to viewpoint shifts and keeps the original +performance, establishing a pioneering standard for boosting the viewpoint +invariance of VLP models. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ One-Shot Sequential Federated Learning for Non-IID Data by Enhancing + Local Model Diversity + + +
+ Traditional federated learning mainly focuses on parallel settings (PFL), +which can suffer significant communication and computation costs. In contrast, +one-shot and sequential federated learning (SFL) have emerged as innovative +paradigms to alleviate these costs. However, the issue of non-IID (Independent +and Identically Distributed) data persists as a significant challenge in +one-shot and SFL settings, exacerbated by the restricted communication between +clients. In this paper, we improve the one-shot sequential federated learning +for non-IID data by proposing a local model diversity-enhancing strategy. +Specifically, to leverage the potential of local model diversity for improving +model performance, we introduce a local model pool for each client that +comprises diverse models generated during local training, and propose two +distance measurements to further enhance the model diversity and mitigate the +effect of non-IID data. Consequently, our proposed framework can improve the +global model performance while maintaining low communication costs. Extensive +experiments demonstrate that our method exhibits superior performance to +existing one-shot PFL methods and achieves better accuracy compared with +state-of-the-art one-shot SFL methods on both label-skew and domain-shift tasks +(e.g., 6%+ accuracy improvement on the CIFAR-10 dataset). + +
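+
+ A rough sketch of the sequential, one-shot flow described above, under the assumption
+that the local model pool holds snapshots taken during local training and that the
+distance measurements simply keep the most mutually distant snapshots; this is a generic
+illustration, not the authors' exact strategy, and all helper names are hypothetical.
+
+```python
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def flat_params(model):
+    return torch.cat([p.detach().flatten() for p in model.parameters()])
+
+def most_diverse(pool, k):
+    """Keep the k snapshots farthest from their nearest neighbour (simple diversity proxy)."""
+    if len(pool) <= k:
+        return pool
+    vecs = [flat_params(m) for m in pool]
+    scores = [min((v - o).norm() for j, o in enumerate(vecs) if j != i)
+              for i, v in enumerate(vecs)]
+    order = sorted(range(len(pool)), key=lambda i: -scores[i])
+    return [pool[i] for i in order[:k]]
+
+def local_train(model, data, epochs=4):
+    """Collect one snapshot per epoch into the client's local model pool."""
+    opt = torch.optim.SGD(model.parameters(), lr=0.05)
+    pool = []
+    for _ in range(epochs):
+        for x, y in data:
+            opt.zero_grad()
+            F.cross_entropy(model(x), y).backward()
+            opt.step()
+        pool.append(copy.deepcopy(model))
+    return most_diverse(pool, k=2)
+
+def average(models):
+    """Parameter-average the pooled snapshots into the model passed onward."""
+    avg = copy.deepcopy(models[0])
+    with torch.no_grad():
+        for name, p in avg.named_parameters():
+            p.copy_(torch.stack([dict(m.named_parameters())[name] for m in models]).mean(0))
+    return avg
+
+# one-shot sequential pass: each client trains once, then hands the model to the next client
+global_model = nn.Linear(10, 4)
+clients = [[(torch.randn(8, 10), torch.randint(0, 4, (8,)))] for _ in range(3)]
+for data in clients:
+    global_model = average(local_train(copy.deepcopy(global_model), data))
+```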
+
+
+
+
+ + ☆ Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors + + +
+ This paper presents RADAR (Robust Adversarial Detection via Adversarial
+Retraining), an approach designed to enhance the robustness of adversarial
+detectors against adaptive attacks, while maintaining classifier performance.
+An adaptive attack is one where the attacker is aware of the defenses and
+adapts their strategy accordingly. Our proposed method leverages adversarial
+training to reinforce the ability to detect attacks, without compromising clean
+accuracy. During the training phase, we integrate into the dataset adversarial
+examples that were optimized to fool both the classifier and the adversarial
+detector, enabling the adversarial detector to learn and adapt to potential
+attack scenarios. Experimental evaluations on the CIFAR-10 and SVHN datasets
+demonstrate that our proposed algorithm significantly improves a detector's
+ability to accurately identify adaptive adversarial attacks -- without
+sacrificing clean accuracy.
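+
+ A compact sketch of the retraining loop described above, assuming an FGSM-style attack
+crafted against a combined classifier-plus-detector objective; the placeholder linear
+models and single-step attack are assumptions for illustration, not the paper's setup.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+classifier = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))   # frozen placeholder
+detector = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 2))      # clean vs adversarial
+opt = torch.optim.Adam(detector.parameters(), lr=1e-4)
+
+def adaptive_attack(x, y, eps=8 / 255):
+    """One FGSM step against BOTH the classifier and the detector ('adaptive' attacker)."""
+    x_adv = x.clone().requires_grad_(True)
+    objective = F.cross_entropy(classifier(x_adv), y) \
+              - F.cross_entropy(detector(x_adv), torch.zeros_like(y))   # look clean
+    objective.backward()
+    return (x + eps * x_adv.grad.sign()).clamp(0, 1).detach()
+
+x = torch.rand(16, 3, 32, 32)
+y = torch.randint(0, 10, (16,))
+x_adv = adaptive_attack(x, y)
+
+# retrain the detector on a mix of clean (label 0) and adaptive adversarial (label 1) inputs
+inputs = torch.cat([x, x_adv])
+labels = torch.cat([torch.zeros(16), torch.ones(16)]).long()
+loss = F.cross_entropy(detector(inputs), labels)
+opt.zero_grad(); loss.backward(); opt.step()
+```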
+
+
+
+
+ + ☆ Ethical-Lens: Curbing Malicious Usages of Open-Source Text-to-Image + Models + + +
+ The burgeoning landscape of text-to-image models, exemplified by innovations +such as Midjourney and DALLE 3, has revolutionized content creation across +diverse sectors. However, these advancements bring forth critical ethical +concerns, particularly with the misuse of open-source models to generate +content that violates societal norms. Addressing this, we introduce +Ethical-Lens, a framework designed to facilitate the value-aligned usage of +text-to-image tools without necessitating internal model revision. Ethical-Lens +ensures value alignment in text-to-image models across toxicity and bias +dimensions by refining user commands and rectifying model outputs. Systematic +evaluation metrics, combining GPT4-V, HEIM, and FairFace scores, assess +alignment capability. Our experiments reveal that Ethical-Lens enhances +alignment capabilities to levels comparable with or superior to commercial +models like DALLE 3, ensuring user-generated content adheres to ethical +standards while maintaining image quality. This study indicates the potential +of Ethical-Lens to ensure the sustainable development of open-source +text-to-image tools and their beneficial integration into society. Our code is +available at https://github.com/yuzhu-cai/Ethical-Lens. + +
+
+ comment: 42 pages, 17 figures, 29 tables +
+
+
+
+
+ + ☆ S3R-Net: A Single-Stage Approach to Self-Supervised Shadow Removal CVPR 2024 + + +
+ In this paper we present S3R-Net, the Self-Supervised Shadow Removal Network.
+The two-branch WGAN model achieves self-supervision relying on the
+unify-and-adapt phenomenon - it unifies the style of the output data and infers
+its characteristics from a database of unaligned shadow-free reference images.
+This approach stands in contrast to the large body of supervised frameworks.
+S3R-Net also differentiates itself from the few existing self-supervised models
+operating in a cycle-consistent manner, as it is a non-cyclic, unidirectional
+solution. The proposed framework achieves comparable numerical scores to recent
+self-supervised shadow removal models while exhibiting superior qualitative
+performance and keeping the computational cost low.
+
+ comment: NTIRE workshop @ CVPR 2024. Code & models available at + https://github.com/n-kubiak/S3R-Net +
+
+
+
+
+ + ☆ Harnessing Joint Rain-/Detail-aware Representations to Eliminate + Intricate Rains + + +
+ Recent advances in image deraining have focused on training powerful models +on mixed multiple datasets comprising diverse rain types and backgrounds. +However, this approach tends to overlook the inherent differences among rainy +images, leading to suboptimal results. To overcome this limitation, we focus on +addressing various rainy images by delving into meaningful representations that +encapsulate both the rain and background components. Leveraging these +representations as instructive guidance, we put forth a Context-based +Instance-level Modulation (CoI-M) mechanism adept at efficiently modulating +CNN- or Transformer-based models. Furthermore, we devise a rain-/detail-aware +contrastive learning strategy to help extract joint rain-/detail-aware +representations. By integrating CoI-M with the rain-/detail-aware Contrastive +learning, we develop CoIC, an innovative and potent algorithm tailored for +training models on mixed datasets. Moreover, CoIC offers insight into modeling +relationships of datasets, quantitatively assessing the impact of rain and +details on restoration, and unveiling distinct behaviors of models given +diverse inputs. Extensive experiments validate the efficacy of CoIC in boosting +the deraining ability of CNN and Transformer models. CoIC also enhances the +deraining prowess remarkably when real-world dataset is included. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ MambaPupil: Bidirectional Selective Recurrent model for Event-based Eye + tracking CVPR 2024 + + +
+ Event-based eye tracking has shown great promise with the high temporal +resolution and low redundancy provided by the event camera. However, the +diversity and abruptness of eye movement patterns, including blinking, +fixating, saccades, and smooth pursuit, pose significant challenges for eye +localization. To achieve a stable event-based eye-tracking system, this paper +proposes a bidirectional long-term sequence modeling and time-varying state +selection mechanism to fully utilize contextual temporal information in +response to the variability of eye movements. Specifically, the MambaPupil +network is proposed, which consists of the multi-layer convolutional encoder to +extract features from the event representations, a bidirectional Gated +Recurrent Unit (GRU), and a Linear Time-Varying State Space Module (LTV-SSM), +to selectively capture contextual correlation from the forward and backward +temporal relationship. Furthermore, the Bina-rep is utilized as a compact event +representation, and the tailor-made data augmentation, called as Event-Cutout, +is proposed to enhance the model's robustness by applying spatial random +masking to the event image. The evaluation on the ThreeET-plus benchmark shows +the superior performance of the MambaPupil, which secured the 1st place in +CVPR'2024 AIS Event-based Eye Tracking challenge. + +
+
+ comment: Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for + Streaming), top solution of challenge Event-based Eye Tracking, see + https://www.kaggle.com/competitions/event-based-eye-tracking-ais2024 +
+
+
+
+
+ + ☆ MaskCD: A Remote Sensing Change Detection Network Based on Mask + Classification + + +
+ Change detection (CD) from remote sensing (RS) images using deep learning has +been widely investigated in the literature. It is typically regarded as a +pixel-wise labeling task that aims to classify each pixel as changed or +unchanged. Although per-pixel classification networks in encoder-decoder +structures have shown dominance, they still suffer from imprecise boundaries +and incomplete object delineation at various scenes. For high-resolution RS +images, partly or totally changed objects are more worthy of attention rather +than a single pixel. Therefore, we revisit the CD task from the mask prediction +and classification perspective and propose MaskCD to detect changed areas by +adaptively generating categorized masks from input image pairs. Specifically, +it utilizes a cross-level change representation perceiver (CLCRP) to learn +multiscale change-aware representations and capture spatiotemporal relations +from encoded features by exploiting deformable multihead self-attention +(DeformMHSA). Subsequently, a masked-attention-based detection transformers +(MA-DETR) decoder is developed to accurately locate and identify changed +objects based on masked attention and self-attention mechanisms. It +reconstructs the desired changed objects by decoding the pixel-wise +representations into learnable mask proposals and making final predictions from +these candidates. Experimental results on five benchmark datasets demonstrate +the proposed approach outperforms other state-of-the-art models. Codes and +pretrained models are available online (https://github.com/EricYu97/MaskCD). + +
+
+
+
+
+ + ☆ PureForest: A Large-scale Aerial Lidar and Aerial Imagery Dataset for + Tree Species Classification in Monospecific Forests + + +
+ Knowledge of tree species distribution is fundamental to managing forests. +New deep learning approaches promise significant accuracy gains for forest +mapping, and are becoming a critical tool for mapping multiple tree species at +scale. To advance the field, deep learning researchers need large benchmark +datasets with high-quality annotations. To this end, we present the PureForest +dataset: a large-scale, open, multimodal dataset designed for tree species +classification from both Aerial Lidar Scanning (ALS) point clouds and Very High +Resolution (VHR) aerial images. Most current public Lidar datasets for tree +species classification have low diversity as they only span a small area of a +few dozen annotated hectares at most. In contrast, PureForest has 18 tree +species grouped into 13 semantic classes, and spans 339 km$^2$ across 449 +distinct monospecific forests, and is to date the largest and most +comprehensive Lidar dataset for the identification of tree species. By making +PureForest publicly available, we hope to provide a challenging benchmark +dataset to support the development of deep learning approaches for tree species +identification from Lidar and/or aerial imagery. In this data paper, we +describe the annotation workflow, the dataset, the recommended evaluation +methodology, and establish a baseline performance from both 3D and 2D +modalities. + +
+
+ comment: 14 pages | 5 figures | Dataset is available at + http://huggingface.co/datasets/IGNF/PureForest +
+
+
+
+
+ + ☆ MIDGET: Music Conditioned 3D Dance Generation + + +
+ In this paper, we introduce MIDGET, a MusIc conditioned 3D Dance GEneraTion
+model built on a Dance motion Vector Quantised Variational AutoEncoder (VQ-VAE)
+model and a Motion Generative Pre-Training (GPT) model, to generate vibrant and
+high-quality dances that match the music rhythm. To tackle challenges in the
+field, we introduce three new components: 1) a pre-trained memory codebook based
+on the Motion VQ-VAE model to store different human pose codes, 2) a Motion GPT
+model that generates pose codes from music and motion encoders, 3) a simple
+framework for music feature extraction. We compare with existing
+state-of-the-art models and perform ablation experiments on AIST++, the largest
+publicly available music-dance dataset. Experiments demonstrate that our
+proposed framework achieves state-of-the-art performance on motion quality and
+its alignment with the music.
+
+ comment: 12 pages, 6 figures Published in AI 2023: Advances in Artificial + Intelligence +
+
+
+
+
+ + ☆ Improving the perception of visual fiducial markers in the field using + Adaptive Active Exposure Control + + +
+ Accurate localization is fundamental for autonomous underwater vehicles
+(AUVs) to carry out precise tasks, such as manipulation and construction.
+Vision-based solutions using fiducial markers are promising, but extremely
+challenging underwater because of harsh lighting conditions. This paper
+introduces a gradient-based active camera exposure control method to tackle
+sharp lighting variations during image acquisition, which can establish a
+better foundation for subsequent image enhancement procedures. Considering a
+typical scenario for underwater operations where visual tags are used, we
+conducted several experiments comparing our method with other state-of-the-art
+exposure control methods, including Active Exposure Control (AEC) and
+Gradient-based Exposure Control (GEC). Results show a significant improvement
+in the accuracy of robot localization. This method is an important component
+that can be used in a vision-based state estimation pipeline to improve the
+overall localization accuracy.
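+
+ A hedged sketch of the general gradient-based principle behind the controllers compared
+above: exposure is nudged in the direction that increases a gradient-magnitude metric of
+the captured frame. The metric, step size, and camera interface here are placeholders
+and do not reproduce the proposed method.
+
+```python
+import numpy as np
+
+def gradient_score(gray):
+    """Sum of image gradient magnitudes: a common sharpness/information proxy."""
+    gx = np.diff(gray, axis=1)
+    gy = np.diff(gray, axis=0)
+    return np.abs(gx).sum() + np.abs(gy).sum()
+
+def update_exposure(capture, exposure, delta=0.1):
+    """Finite-difference step on exposure toward higher gradient information.
+    `capture(exposure)` is a placeholder for the camera interface."""
+    lo = gradient_score(capture(exposure * (1 - delta)))
+    hi = gradient_score(capture(exposure * (1 + delta)))
+    step = delta if hi > lo else -delta
+    return float(np.clip(exposure * (1 + step), 1e-4, 1.0))
+
+# toy camera model: brightness scales with exposure, saturating at 1.0
+scene = np.random.rand(120, 160)
+capture = lambda e: np.clip(scene * e * 10, 0, 1)
+
+exposure = 0.02
+for _ in range(20):
+    exposure = update_exposure(capture, exposure)
+```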
+
+ comment: Paper accepted by ISER 2023 +
+
+
+
+
+ + ☆ Data-free Knowledge Distillation for Fine-grained Visual Categorization + + +
+ Data-free knowledge distillation (DFKD) is a promising approach for +addressing issues related to model compression, security privacy, and +transmission restrictions. Although the existing methods exploiting DFKD have +achieved inspiring achievements in coarse-grained classification, in practical +applications involving fine-grained classification tasks that require more +detailed distinctions between similar categories, sub-optimal results are +obtained. To address this issue, we propose an approach called DFKD-FGVC that +extends DFKD to fine-grained visual categorization~(FGVC) tasks. Our approach +utilizes an adversarial distillation framework with attention generator, mixed +high-order attention distillation, and semantic feature contrast learning. +Specifically, we introduce a spatial-wise attention mechanism to the generator +to synthesize fine-grained images with more details of discriminative parts. We +also utilize the mixed high-order attention mechanism to capture complex +interactions among parts and the subtle differences among discriminative +features of the fine-grained categories, paying attention to both local +features and semantic context relationships. Moreover, we leverage the teacher +and student models of the distillation framework to contrast high-level +semantic feature maps in the hyperspace, comparing variances of different +categories. We evaluate our approach on three widely-used FGVC benchmarks +(Aircraft, Cars196, and CUB200) and demonstrate its superior performance. + +
+
+
+
+
+ + ☆ MLS-Track: Multilevel Semantic Interaction in RMOT + + +
+ The new trend in the multi-object tracking task is to track objects of
+interest using natural language. However, the scarcity of paired
+prompt-instance data hinders its progress. To address this challenge, we
+propose a high-quality yet low-cost data generation method based on Unreal
+Engine 5 and construct a brand-new benchmark dataset, named Refer-UE-City,
+which primarily includes scenes from intersection surveillance videos,
+detailing the appearance and actions of people and vehicles. Specifically, it
+provides 14 videos with a total of 714 expressions, and is comparable in scale
+to the Refer-KITTI dataset. Additionally, we propose a multi-level
+semantic-guided multi-object framework called MLS-Track, where the interaction
+between the model and text is enhanced layer by layer through the introduction
+of a Semantic Guidance Module (SGM) and a Semantic Correlation Branch (SCB).
+Extensive experiments on the Refer-UE-City and Refer-KITTI datasets demonstrate
+the effectiveness of our proposed framework, and it achieves state-of-the-art
+performance. Code and datasets will be made available.
+
+ comment: 17 pages 8 figures +
+
+
+
+
+ + ☆ Meta-Auxiliary Learning for Micro-Expression Recognition + + +
+ Micro-expressions (MEs) are involuntary movements revealing people's hidden
+feelings, which have attracted considerable interest owing to their objectivity
+in emotion detection. However, despite its wide applications in various
+scenarios, micro-expression recognition (MER) remains a challenging problem in
+real life due to three reasons, including (i) data-level: lack of data and
+imbalanced classes, (ii) feature-level: subtle, rapidly changing, and complex
+features of MEs, and (iii) decision-making-level: impact of individual
+differences. To address these issues, we propose a dual-branch meta-auxiliary
+learning method, called LightmanNet, for fast and robust micro-expression
+recognition. Specifically, LightmanNet learns general MER knowledge from
+limited data through a dual-branch bi-level optimization process: (i) In the
+first level, it obtains task-specific MER knowledge by learning in two
+branches, where the first branch learns MER features via primary MER tasks,
+while the other branch guides the model to obtain discriminative features via
+an auxiliary task, i.e., image alignment between micro-expressions and
+macro-expressions, given their resemblance in both spatial and temporal
+behavioral patterns. The two learning branches jointly constrain the model to
+learn meaningful task-specific MER knowledge while avoiding learning noise or
+superficial connections between MEs and emotions that may damage its
+generalization ability. (ii) In the second level, LightmanNet further refines
+the learned task-specific knowledge, improving model generalization and
+efficiency. Extensive experiments on various benchmark datasets demonstrate the
+superior robustness and efficiency of LightmanNet.
+
+ comment: 10 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Look, Listen, and Answer: Overcoming Biases for Audio-Visual Question + Answering + + +
+ Audio-Visual Question Answering (AVQA) is a complex multi-modal reasoning +task, demanding intelligent systems to accurately respond to natural language +queries based on audio-video input pairs. Nevertheless, prevalent AVQA +approaches are prone to overlearning dataset biases, resulting in poor +robustness. Furthermore, current datasets may not provide a precise diagnostic +for these methods. To tackle these challenges, firstly, we propose a novel +dataset, \textit{MUSIC-AVQA-R}, crafted in two steps: rephrasing questions +within the test split of a public dataset (\textit{MUSIC-AVQA}) and +subsequently introducing distribution shifts to split questions. The former +leads to a large, diverse test space, while the latter results in a +comprehensive robustness evaluation on rare, frequent, and overall questions. +Secondly, we propose a robust architecture that utilizes a multifaceted cycle +collaborative debiasing strategy to overcome bias learning. Experimental +results show that this architecture achieves state-of-the-art performance on +both datasets, especially obtaining a significant improvement of 9.68\% on the +proposed dataset. Extensive ablation experiments are conducted on these two +datasets to validate the effectiveness of the debiasing strategy. Additionally, +we highlight the limited robustness of existing multi-modal QA methods through +the evaluation on our dataset. + +
+
+ comment: 16 pages, 9 figures,5 Tables +
+
+
+
+
+ + ☆ What does CLIP know about peeling a banana? CVPR2024 + + +
+ Humans show an innate capability to identify tools to support specific
+actions. The association between object parts and the actions they facilitate
+is usually named affordance. Being able to segment object parts depending on
+the tasks they afford is crucial to enable intelligent robots to use objects of
+daily living. Traditional supervised learning methods for affordance
+segmentation require costly pixel-level annotations, while weakly supervised
+approaches, though less demanding, still rely on object-interaction examples
+and support a closed set of actions. These limitations hinder scalability, may
+introduce biases, and usually restrict models to a limited set of predefined
+actions. This paper proposes AffordanceCLIP, to overcome these limitations by
+leveraging the implicit affordance knowledge embedded within large pre-trained
+Vision-Language models like CLIP. We experimentally demonstrate that CLIP,
+although not explicitly trained for affordance detection, retains valuable
+information for the task. Our AffordanceCLIP achieves competitive zero-shot
+performance compared to methods with specialized training, while offering
+several advantages: i) it works with any action prompt, not just a predefined
+set; ii) it requires training only a small number of additional parameters
+compared to existing solutions; and iii) it eliminates the need for direct
+supervision on action-object pairs, opening new perspectives for
+functionality-based reasoning of models.
+
+ comment: Accepted to MAR Workshop at CVPR2024 +
+
+
+
+
+ + ☆ Curriculum Point Prompting for Weakly-Supervised Referring Image + Segmentation CVPR 2024 + + +
+ Referring image segmentation (RIS) aims to precisely segment referents in +images through corresponding natural language expressions, yet relying on +cost-intensive mask annotations. Weakly supervised RIS thus learns from +image-text pairs to pixel-level semantics, which is challenging for segmenting +fine-grained masks. A natural approach to enhancing segmentation precision is +to empower weakly supervised RIS with the image segmentation foundation model +SAM. Nevertheless, we observe that simply integrating SAM yields limited +benefits and can even lead to performance regression due to the inevitable +noise issues and challenges in excessive focus on object parts. In this paper, +we present an innovative framework, Point PrompTing (PPT), incorporated with +the proposed multi-source curriculum learning strategy to address these +challenges. Specifically, the core of PPT is a point generator that not only +harnesses CLIP's text-image alignment capability and SAM's powerful mask +generation ability but also generates negative point prompts to address the +noisy and excessive focus issues inherently and effectively. In addition, we +introduce a curriculum learning strategy with object-centric images to help PPT +gradually learn from simpler yet precise semantic alignment to more complex +RIS. Experiments demonstrate that our PPT significantly and consistently +outperforms prior weakly supervised techniques on mIoU by 11.34%, 14.14%, and +6.97% across RefCOCO, RefCOCO+, and G-Ref, respectively. + +
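+
+ A sketch of the point-prompt generation idea only (no CLIP or SAM weights are loaded
+here): given a text-image similarity heatmap, the strongest location becomes a positive
+point and strong but distant responses become negative points meant to suppress
+background distractors. The random heatmap, distance threshold, and helper names are
+stand-ins for the real pipeline.
+
+```python
+import numpy as np
+
+def point_prompts(heatmap, num_neg=2, min_dist=32):
+    """Positive point at the heatmap peak; negative points at strong but distant
+    responses (a crude proxy for the negative point prompts described above)."""
+    H, W = heatmap.shape
+    ys, xs = np.unravel_index(np.argsort(heatmap, axis=None)[::-1], (H, W))
+    pos = np.array([xs[0], ys[0]])
+    negatives = []
+    for x, y in zip(xs[1:], ys[1:]):
+        if len(negatives) == num_neg:
+            break
+        if np.hypot(x - pos[0], y - pos[1]) > min_dist:
+            negatives.append([x, y])
+    coords = np.array([pos.tolist()] + negatives)
+    labels = np.array([1] + [0] * len(negatives))   # 1 = foreground, 0 = background
+    return coords, labels
+
+# `heatmap` stands in for a CLIP text-image similarity map resized to image resolution;
+# the (coords, labels) pair is the kind of input a promptable segmenter such as SAM consumes.
+heatmap = np.random.rand(256, 256)
+coords, labels = point_prompts(heatmap)
+```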
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ MultiPhys: Multi-Person Physics-aware 3D Motion Estimation + + +
+ We introduce MultiPhys, a method designed for recovering multi-person motion +from monocular videos. Our focus lies in capturing coherent spatial placement +between pairs of individuals across varying degrees of engagement. MultiPhys, +being physically aware, exhibits robustness to jittering and occlusions, and +effectively eliminates penetration issues between the two individuals. We +devise a pipeline in which the motion estimated by a kinematic-based method is +fed into a physics simulator in an autoregressive manner. We introduce distinct +components that enable our model to harness the simulator's properties without +compromising the accuracy of the kinematic estimates. This results in final +motion estimates that are both kinematically coherent and physically compliant. +Extensive evaluations on three challenging datasets characterized by +substantial inter-person interaction show that our method significantly reduces +errors associated with penetration and foot skating, while performing +competitively with the state-of-the-art on motion accuracy and smoothness. +Results and code can be found on our project page +(http://www.iri.upc.edu/people/nugrinovic/multiphys/). + +
+
+
+
+
+ + ☆ Tendency-driven Mutual Exclusivity for Weakly Supervised Incremental + Semantic Segmentation + + +
+ Weakly Incremental Learning for Semantic Segmentation (WILSS) leverages a +pre-trained segmentation model to segment new classes using cost-effective and +readily available image-level labels. A prevailing way to solve WILSS is the +generation of seed areas for each new class, serving as a form of pixel-level +supervision. However, a scenario usually arises where a pixel is concurrently +predicted as an old class by the pre-trained segmentation model and a new class +by the seed areas. Such a scenario becomes particularly problematic in WILSS, +as the lack of pixel-level annotations on new classes makes it intractable to +ascertain whether the pixel pertains to the new class or not. To surmount this +issue, we propose an innovative, tendency-driven relationship of mutual +exclusivity, meticulously tailored to govern the behavior of the seed areas and +the predictions generated by the pre-trained segmentation model. This +relationship stipulates that predictions for the new and old classes must not +conflict whilst prioritizing the preservation of predictions for the old +classes, which not only addresses the conflicting prediction issue but also +effectively mitigates the inherent challenge of incremental learning - +catastrophic forgetting. Furthermore, under the auspices of this +tendency-driven mutual exclusivity relationship, we generate pseudo masks for +the new classes, allowing for concurrent execution with model parameter +updating via the resolution of a bi-level optimization problem. Extensive +experiments substantiate the effectiveness of our framework, resulting in the +establishment of new benchmarks and paving the way for further research in this +field. + +
+
+
+
+
+ + ☆ MTGA: Multi-view Temporal Granularity aligned Aggregation for + Event-based Lip-reading + + +
+ Lip-reading utilizes the visual information of the speaker's lip movements to
+recognize words and sentences. Existing event-based lip-reading solutions
+integrate different frame rate branches to learn spatio-temporal features of
+varying granularities. However, aggregating events into event frames inevitably
+leads to the loss of fine-grained temporal information within frames. To remedy
+this drawback, we propose a novel framework termed Multi-view Temporal
+Granularity aligned Aggregation (MTGA). Specifically, we first present a novel
+event representation method, namely the time-segmented voxel graph list, where
+the most significant local voxels are temporally connected into a graph list.
+Then we design a spatio-temporal fusion module based on temporal granularity
+alignment, where the global spatial features extracted from event frames,
+together with the local relative spatial and temporal features contained in the
+voxel graph list, are effectively aligned and integrated. Finally, we design a
+temporal aggregation module that incorporates positional encoding, which
+enables the capture of local absolute spatial and global temporal information.
+Experiments demonstrate that our method outperforms both the event-based and
+video-based lip-reading counterparts. Our code will be publicly available.
+
+
+
+
+ + ☆ Device (In)Dependence of Deep Learning-based Image Age Approximation ICPR + + +
+ The goal of temporal image forensics is to approximate the age of a digital
+image relative to images from the same device. Usually, this is based on traces
+left during the image acquisition pipeline. For example, several methods exist
+that exploit the presence of in-field sensor defects for this purpose. In
+addition to these 'classical' methods, there is also an approach in which a
+Convolutional Neural Network (CNN) is trained to approximate the image age. One
+advantage of a CNN is that it independently learns the age features used. This
+would make it possible to exploit other (different) age traces in addition to
+the known ones (i.e., in-field sensor defects). In a previous work, we have
+shown that the presence of strong in-field sensor defects is irrelevant for a
+CNN to predict the age class. Based on this observation, the question arises
+how device (in)dependent the learned features are. In this work, we empirically
+assess this by training a network on images from a single device and then
+applying the trained model to images from different devices. This evaluation is
+performed on 14 different devices, including 10 devices from the publicly
+available 'Northumbria Temporal Image Forensics' database. These 10 devices
+form five device pairs (i.e., pairs with identical camera models).
+
+ comment: This work was accepted and presented in: 2022 ICPR-Workshop on + Artificial Intelligence for Multimedia Forensics and Disinformation + Detection. Montreal, Quebec, Canada. However, due to a technical issue on the + publishing companies' side, the work does not appear in the workshop + proceedings +
+
+
+
+
+ + ☆ ©Plug-in Authorization for Human Content Copyright Protection + in Text-to-Image Model + + +
+ This paper addresses the contentious issue of copyright infringement in +images generated by text-to-image models, sparking debates among AI developers, +content creators, and legal entities. State-of-the-art models create +high-quality content without crediting original creators, causing concern in +the artistic community. To mitigate this, we propose the \copyright Plug-in +Authorization framework, introducing three operations: addition, extraction, +and combination. Addition involves training a \copyright plug-in for specific +copyright, facilitating proper credit attribution. Extraction allows creators +to reclaim copyright from infringing models, and combination enables users to +merge different \copyright plug-ins. These operations act as permits, +incentivizing fair use and providing flexibility in authorization. We present +innovative approaches,"Reverse LoRA" for extraction and "EasyMerge" for +seamless combination. Experiments in artist-style replication and cartoon IP +recreation demonstrate \copyright plug-ins' effectiveness, offering a valuable +solution for human copyright protection in the age of generative AIs. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ☆ Not All Voxels Are Equal: Hardness-Aware Semantic Scene Completion with + Self-Distillation CVPR2024 + + +
+ Semantic scene completion, also known as semantic occupancy prediction, can
+provide dense geometric and semantic information for autonomous vehicles, and
+has attracted increasing attention from both academia and industry.
+Unfortunately, existing methods usually formulate this task as a voxel-wise
+classification problem and treat each voxel equally in 3D space during
+training. As the hard voxels have not been paid enough attention, the
+performance in some challenging regions is limited. The 3D dense space
+typically contains a large number of empty voxels, which are easy to learn but
+require large amounts of computation because existing models handle all voxels
+uniformly. Furthermore, the voxels in the boundary region are more challenging
+to differentiate than those in the interior. In this paper, we propose the
+HASSC approach to train semantic scene completion models with a hardness-aware
+design. The global hardness from the network optimization process is defined
+for dynamic hard voxel selection. Then, the local hardness with geometric
+anisotropy is adopted for voxel-wise refinement. In addition, a
+self-distillation strategy is introduced to make the training process stable
+and consistent. Extensive experiments show that our HASSC scheme can
+effectively promote the accuracy of the baseline model without incurring extra
+inference cost. Source code is available at:
+https://github.com/songw-zju/HASSC.
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ The devil is in the object boundary: towards annotation-free instance + segmentation using Foundation Models ICLR2024 + + +
+ Foundation models, pre-trained on a large amount of data have demonstrated +impressive zero-shot capabilities in various downstream tasks. However, in +object detection and instance segmentation, two fundamental computer vision +tasks heavily reliant on extensive human annotations, foundation models such as +SAM and DINO struggle to achieve satisfactory performance. In this study, we +reveal that the devil is in the object boundary, \textit{i.e.}, these +foundation models fail to discern boundaries between individual objects. For +the first time, we probe that CLIP, which has never accessed any instance-level +annotations, can provide a highly beneficial and strong instance-level boundary +prior in the clustering results of its particular intermediate layer. Following +this surprising observation, we propose $\textbf{Zip}$ which $\textbf{Z}$ips up +CL$\textbf{ip}$ and SAM in a novel classification-first-then-discovery +pipeline, enabling annotation-free, complex-scene-capable, open-vocabulary +object detection and instance segmentation. Our Zip significantly boosts SAM's +mask AP on COCO dataset by 12.5% and establishes state-of-the-art performance +in various settings, including training-free, self-training, and +label-efficient finetuning. Furthermore, annotation-free Zip even achieves +comparable performance to the best-performing open-vocabulary object detecters +using base annotations. Code is released at +https://github.com/ChengShiest/Zip-Your-CLIP + +
+
+ comment: ICLR2024, Code is released at + https://github.com/ChengShiest/Zip-Your-CLIP +
+
+
+
+
+ + ☆ Sketch-guided Image Inpainting with Partial Discrete Diffusion Process CVPR 2024 + + +
+ In this work, we study the task of sketch-guided image inpainting. Unlike the +well-explored natural language-guided image inpainting, which excels in +capturing semantic details, the relatively less-studied sketch-guided +inpainting offers greater user control in specifying the object's shape and +pose to be inpainted. As one of the early solutions to this task, we introduce +a novel partial discrete diffusion process (PDDP). The forward pass of the PDDP +corrupts the masked regions of the image and the backward pass reconstructs +these masked regions conditioned on hand-drawn sketches using our proposed +sketch-guided bi-directional transformer. The proposed novel transformer module +accepts two inputs -- the image containing the masked region to be inpainted +and the query sketch to model the reverse diffusion process. This strategy +effectively addresses the domain gap between sketches and natural images, +thereby, enhancing the quality of inpainting results. In the absence of a +large-scale dataset specific to this task, we synthesize a dataset from the +MS-COCO to train and extensively evaluate our proposed framework against +various competent approaches in the literature. The qualitative and +quantitative results and user studies establish that the proposed method +inpaints realistic objects that fit the context in terms of the visual +appearance of the provided sketch. To aid further research, we have made our +code publicly available at https://github.com/vl2g/Sketch-Inpainting . + +
+
+ comment: Accepted to NTIRE Workshop @ CVPR 2024 +
+
+
+
+
+ + ☆ VCC-INFUSE: Towards Accurate and Efficient Selection of Unlabeled + Examples in Semi-supervised Learning IJCAI 2024 + + +
+ Despite the progress of Semi-supervised Learning (SSL), existing methods fail
+to utilize unlabeled data effectively and efficiently. Many pseudo-label-based
+methods select unlabeled examples based on inaccurate confidence scores from
+the classifier. Most prior work also uses all available unlabeled data without
+pruning, making it difficult to handle large amounts of unlabeled data. To
+address these issues, we propose two methods: Variational Confidence
+Calibration (VCC) and Influence-Function-based Unlabeled Sample Elimination
+(INFUSE). VCC is a universal plugin for SSL confidence calibration, using a
+variational autoencoder to select more accurate pseudo labels based on three
+types of consistency scores. INFUSE is a data pruning method that constructs a
+core dataset of unlabeled examples under SSL. Our methods are effective across
+multiple datasets and settings, reducing classification error rates and saving
+training time. Together, VCC-INFUSE reduces the error rate of FlexMatch on the
+CIFAR-100 dataset by 1.08% while saving nearly half of the training time.
+
+ comment: Accepted paper of IJCAI 2024. Shijie Fang and Qianhan Feng + contributed equally to this paper +
+
+
+
+
+ + ☆ S4TP: Social-Suitable and Safety-Sensitive Trajectory Planning for + Autonomous Vehicles + + +
+ On public roads, autonomous vehicles (AVs) face the challenge of frequent
+interactions with human-driven vehicles (HDVs), whose driving behavior is
+uncertain due to varying social characteristics among humans. To effectively
+assess the risks prevailing in the vicinity of AVs in social interactive
+traffic scenarios and achieve safe autonomous driving, this article proposes a
+social-suitable and safety-sensitive trajectory planning (S4TP) framework.
+Specifically, S4TP integrates the Social-Aware Trajectory Prediction (SATP) and
+Social-Aware Driving Risk Field (SADRF) modules. SATP utilizes Transformers to
+effectively encode the driving scene and incorporates an AV's planned
+trajectory during the prediction decoding process. SADRF assesses the expected
+surrounding risk degrees during AV-HDV interactions, each with different
+social characteristics, visualized as two-dimensional heat maps centered on the
+AV. SADRF models the driving intentions of the surrounding HDVs and predicts
+trajectories based on the representation of vehicular interactions. S4TP
+employs an optimization-based approach for motion planning, utilizing the
+predicted HDVs' trajectories as input. With the integration of SADRF, S4TP
+executes real-time online optimization of the planned trajectory of the AV
+within low-risk regions, thus improving the safety and the interpretability of
+the planned trajectory. We have conducted comprehensive tests of the proposed
+method using the SMARTS simulator. Experimental results in complex social
+scenarios, such as unprotected left-turn intersections, merging, cruising, and
+overtaking, validate the superiority of our proposed S4TP in terms of safety
+and rationality. S4TP achieves a pass rate of 100% across all scenarios,
+surpassing the current state-of-the-art methods Fanta (98.25%) and
+Predictive-Decision (94.75%).
+
+ comment: 12 pages,4 figures, published to IEEE Transactions on Intelligent + Vehicles +
+
+
+
+
+ + ☆ LD-Pruner: Efficient Pruning of Latent Diffusion Models using + Task-Agnostic Insights CVPR24 + + +
+ Latent Diffusion Models (LDMs) have emerged as powerful generative models, +known for delivering remarkable results under constrained computational +resources. However, deploying LDMs on resource-limited devices remains a +complex issue, presenting challenges such as memory consumption and inference +speed. To address this issue, we introduce LD-Pruner, a novel +performance-preserving structured pruning method for compressing LDMs. +Traditional pruning methods for deep neural networks are not tailored to the +unique characteristics of LDMs, such as the high computational cost of training +and the absence of a fast, straightforward and task-agnostic method for +evaluating model performance. Our method tackles these challenges by leveraging +the latent space during the pruning process, enabling us to effectively +quantify the impact of pruning on model performance, independently of the task +at hand. This targeted pruning of components with minimal impact on the output +allows for faster convergence during training, as the model has less +information to re-learn, thereby addressing the high computational cost of +training. Consequently, our approach achieves a compressed model that offers +improved inference speed and reduced parameter count, while maintaining minimal +performance degradation. We demonstrate the effectiveness of our approach on +three different tasks: text-to-image (T2I) generation, Unconditional Image +Generation (UIG) and Unconditional Audio Generation (UAG). Notably, we reduce +the inference time of Stable Diffusion (SD) by 34.9% while simultaneously +improving its FID by 5.2% on MS-COCO T2I benchmark. This work paves the way for +more efficient pruning methods for LDMs, enhancing their applicability. + +
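+
+ A generic sketch of the task-agnostic scoring idea: each candidate block is temporarily
+ablated and ranked by how far the model's latent output moves, so the least influential
+blocks are pruned first. The toy residual model and zero-ablation are assumptions for
+illustration, not the paper's operator set.
+
+```python
+import copy
+import torch
+import torch.nn as nn
+
+class ZeroBlock(nn.Module):
+    def forward(self, z):                 # ablated block contributes nothing
+        return torch.zeros_like(z)
+
+class ToyLatentModel(nn.Module):
+    """Stand-in for a latent backbone made of prunable residual blocks."""
+    def __init__(self, width=64, depth=6):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            nn.Sequential(nn.Linear(width, width), nn.SiLU()) for _ in range(depth))
+    def forward(self, z):
+        for blk in self.blocks:
+            z = z + blk(z)
+        return z
+
+@torch.no_grad()
+def latent_impact(model, z):
+    """Score each block by the latent-space shift caused by removing its contribution."""
+    reference = model(z)
+    scores = []
+    for i in range(len(model.blocks)):
+        pruned = copy.deepcopy(model)
+        pruned.blocks[i] = ZeroBlock()
+        scores.append((reference - pruned(z)).norm().item())
+    return scores
+
+model = ToyLatentModel()
+z = torch.randn(8, 64)
+scores = latent_impact(model, z)
+prune_order = sorted(range(len(scores)), key=lambda i: scores[i])   # least impact first
+```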
+
+ comment: 8 pages, accepted to CVPR24 First Workshop on Efficient and On-Device + Generation (EDGE) +
+
+
+
+
+ + ☆ A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine + Transporter Uptake in Parkinson's Disease + + +
+ Dopamine transporter (DAT) imaging is commonly used for monitoring
+Parkinson's disease (PD), where the striatal DAT uptake amount is computed to
+assess PD severity. However, DAT imaging has a high cost and a risk of
+radiation exposure, and is not available in general clinics. Recently, MRI
+patches of the nigral region have been proposed as a safer and easier
+alternative. This paper proposes a symmetric regressor for predicting the DAT
+uptake amount from the nigral MRI patch. Acknowledging the symmetry between the
+right and left nigrae, the proposed regressor incorporates a paired
+input-output model that simultaneously predicts the DAT uptake amounts for both
+the right and left striata. Moreover, it employs a symmetric loss that imposes
+a constraint on the difference between right-to-left predictions, reflecting
+the high correlation between DAT uptake amounts on the two lateral sides.
+Additionally, we propose a symmetric Monte-Carlo (MC) dropout method for
+providing a fruitful uncertainty estimate of the DAT uptake prediction, which
+utilizes the above symmetry. We evaluated the proposed approach on 734 nigral
+patches, and the symmetric regressor demonstrated significantly improved
+performance compared with standard regressors while giving better
+explainability and feature representation. The symmetric MC dropout also gave
+precise uncertainty ranges with a high probability of including the true DAT
+uptake amounts within the range.
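+
+ A small sketch of the paired-output design with one plausible form of the symmetric
+loss: the network predicts right and left DAT uptake jointly, and an extra term ties the
+predicted right-left gap to the observed one, reflecting the strong lateral correlation.
+MC-dropout uncertainty is approximated by keeping dropout active at inference. The
+architecture, loss weight, and random data are illustrative assumptions.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SymmetricRegressor(nn.Module):
+    """Paired input-output model: shared encoder, two heads (right/left striatum)."""
+    def __init__(self, in_dim=256, hidden=128, p_drop=0.2):
+        super().__init__()
+        self.encoder = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(p_drop))
+        self.head_r = nn.Linear(hidden, 1)
+        self.head_l = nn.Linear(hidden, 1)
+    def forward(self, x):
+        h = self.encoder(x)
+        return self.head_r(h).squeeze(-1), self.head_l(h).squeeze(-1)
+
+def symmetric_loss(pred_r, pred_l, y_r, y_l, lam=0.5):
+    fit = F.mse_loss(pred_r, y_r) + F.mse_loss(pred_l, y_l)
+    # one plausible symmetry term: the predicted R-L gap should match the true gap
+    sym = F.mse_loss(pred_r - pred_l, y_r - y_l)
+    return fit + lam * sym
+
+model = SymmetricRegressor()
+x = torch.randn(16, 256)                       # placeholder features of a nigral MRI patch
+y_r, y_l = torch.rand(16), torch.rand(16)
+loss = symmetric_loss(*model(x), y_r, y_l)
+
+# MC-dropout style uncertainty: keep dropout on and aggregate repeated predictions
+model.train()
+with torch.no_grad():
+    samples = torch.stack([torch.stack(model(x), dim=-1) for _ in range(30)])
+mean, std = samples.mean(0), samples.std(0)    # per-side prediction and uncertainty
+```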
+
+
+
+
+ + ☆ EdgeFusion: On-Device Text-to-Image Generation CVPR24 + + +
+ The intensive computational burden of Stable Diffusion (SD) for text-to-image +generation poses a significant hurdle for its practical application. To tackle +this challenge, recent research focuses on methods to reduce sampling steps, +such as Latent Consistency Model (LCM), and on employing architectural +optimizations, including pruning and knowledge distillation. Diverging from +existing approaches, we uniquely start with a compact SD variant, BK-SDM. We +observe that directly applying LCM to BK-SDM with commonly used crawled +datasets yields unsatisfactory results. It leads us to develop two strategies: +(1) leveraging high-quality image-text pairs from leading generative models and +(2) designing an advanced distillation process tailored for LCM. Through our +thorough exploration of quantization, profiling, and on-device deployment, we +achieve rapid generation of photo-realistic, text-aligned images in just two +steps, with latency under one second on resource-limited edge devices. + +
+
+ comment: 4 pages, accepted to CVPR24 First Workshop on Efficient and On-Device + Generation (EDGE) +
+
+
+
+
+ + ☆ Simultaneous Detection and Interaction Reasoning for Object-Centric + Action Recognition + + +
+ The interactions between human and objects are important for recognizing +object-centric actions. Existing methods usually adopt a two-stage pipeline, +where object proposals are first detected using a pretrained detector, and then +are fed to an action recognition model for extracting video features and +learning the object relations for action recognition. However, since the action +prior is unknown in the object detection stage, important objects could be +easily overlooked, leading to inferior action recognition performance. In this +paper, we propose an end-to-end object-centric action recognition framework +that simultaneously performs Detection And Interaction Reasoning in one stage. +Particularly, after extracting video features with a base network, we create +three modules for concurrent object detection and interaction reasoning. First, +a Patch-based Object Decoder generates proposals from video patch tokens. Then, +an Interactive Object Refining and Aggregation identifies important objects for +action recognition, adjusts proposal scores based on position and appearance, +and aggregates object-level info into a global video representation. Lastly, an +Object Relation Modeling module encodes object relations. These three modules +together with the video feature extractor can be trained jointly in an +end-to-end fashion, thus avoiding the heavy reliance on an off-the-shelf object +detector, and reducing the multi-stage training burden. We conduct experiments +on two datasets, Something-Else and Ikea-Assembly, to evaluate the performance +of our proposed approach on conventional, compositional, and few-shot action +recognition tasks. Through in-depth experimental analysis, we show the crucial +role of interactive objects in learning for action recognition, and we can +outperform state-of-the-art methods on both datasets. + +
+
+ comment: 12 pages, 5 figures, submitted to IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ AG-NeRF: Attention-guided Neural Radiance Fields for Multi-height + Large-scale Outdoor Scene Rendering + + +
+ Existing neural radiance fields (NeRF)-based novel view synthesis methods for +large-scale outdoor scenes are mainly built on a single altitude. Moreover, +they often require a priori camera shooting height and scene scope, leading to +inefficient and impractical applications when camera altitude changes. In this +work, we propose an end-to-end framework, termed AG-NeRF, and seek to reduce +the training cost of building good reconstructions by synthesizing +free-viewpoint images based on varying altitudes of scenes. Specifically, to +tackle the detail variation problem from low altitude (drone-level) to high +altitude (satellite-level), a source image selection method and an +attention-based feature fusion approach are developed to extract and fuse the +most relevant features of target view from multi-height images for +high-fidelity rendering. Extensive experiments demonstrate that AG-NeRF +achieves SOTA performance on 56 Leonard and Transamerica benchmarks and only +requires a half hour of training time to reach the competitive PSNR as compared +to the latest BungeeNeRF. + +
+
+
+
+
+ + ☆ FreeDiff: Progressive Frequency Truncation for Image Editing with + Diffusion Models + + +
+ Precise image editing with text-to-image models has attracted increasing +interest due to their remarkable generative capabilities and user-friendly +nature. However, such attempts face the pivotal challenge of misalignment +between the intended precise editing target regions and the broader area +impacted by the guidance in practice. Despite excellent methods leveraging +attention mechanisms that have been developed to refine the editing guidance, +these approaches necessitate modifications through complex network architecture +and are limited to specific editing tasks. In this work, we re-examine the +diffusion process and misalignment problem from a frequency perspective, +revealing that, due to the power law of natural images and the decaying noise +schedule, the denoising network primarily recovers low-frequency image +components during the earlier timesteps and thus brings excessive low-frequency +signals for editing. Leveraging this insight, we introduce a novel fine-tuning +free approach that employs progressive $\textbf{Fre}$qu$\textbf{e}$ncy +truncation to refine the guidance of $\textbf{Diff}$usion models for universal +editing tasks ($\textbf{FreeDiff}$). Our method achieves comparable results +with state-of-the-art methods across a variety of editing tasks and on a +diverse set of images, highlighting its potential as a versatile tool in image +editing applications. + +
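+
+ A hedged sketch of the core operation: a timestep-dependent low-frequency truncation
+applied to a guidance tensor in the Fourier domain, so early (low-frequency-dominated)
+steps contribute less unwanted global change. The cutoff schedule and where the mask is
+applied are assumptions for illustration, not the paper's exact design.
+
+```python
+import torch
+
+def truncate_low_freq(guidance, cutoff):
+    """Zero out spatial frequencies below `cutoff` (fraction of the normalized band)."""
+    B, C, H, W = guidance.shape
+    spec = torch.fft.fftshift(torch.fft.fft2(guidance), dim=(-2, -1))
+    yy, xx = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
+    radius = torch.sqrt(xx ** 2 + yy ** 2)
+    mask = (radius >= cutoff).to(spec.dtype)           # keep mid/high frequencies only
+    spec = spec * mask
+    return torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).real
+
+def cutoff_schedule(t, t_max, max_cut=0.5):
+    """Progressive truncation: stronger low-frequency removal at earlier (noisier) steps."""
+    return max_cut * (t / t_max)
+
+# toy usage: `guidance` stands in for the editing guidance term at diffusion step t
+guidance = torch.randn(1, 4, 64, 64)
+t, t_max = 800, 1000
+filtered = truncate_low_freq(guidance, cutoff_schedule(t, t_max))
+```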
+
+
+
+
+ + ☆ Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement + from CT Scans + + +
+ X-ray images play a vital role in intraoperative procedures due to their high
+resolution and fast imaging speed, and greatly facilitate subsequent
+segmentation, registration and reconstruction. However, excessive X-ray doses
+pose potential risks to human health. Data-driven algorithms mapping volume
+scans to X-ray images are restricted by the scarcity of paired X-ray and volume
+data. Existing methods are mainly realized by modelling the whole X-ray imaging
+procedure. In this study, we propose a learning-based approach termed CT2X-GAN
+to synthesize X-ray images in an end-to-end manner using content and style
+disentanglement from three different image domains. Our method decouples the
+anatomical structure information from CT scans and style information from
+unpaired real X-ray images / digitally reconstructed radiography (DRR) images
+via a series of decoupling encoders. Additionally, we introduce a novel
+consistency regularization term to improve the stylistic resemblance between
+synthesized X-ray images and real X-ray images. Meanwhile, we also impose a
+supervised process by computing the similarity between computed real DRR and
+synthesized DRR images. We further develop a pose attention module to fully
+strengthen the comprehensive information in the decoupled content code from CT
+scans, facilitating high-quality multi-view image synthesis in the lower 2D
+space. Extensive experiments were conducted on the publicly available
+CTSpine1K dataset, achieving 97.8350, 0.0842 and 3.0938 in terms of FID, KID
+and a user-scored X-ray similarity, respectively. In comparison with 3D-aware
+methods ($\pi$-GAN, EG3D), CT2X-GAN is superior in synthesis quality and
+realism with respect to real X-ray images.
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Seeing Motion at Nighttime with an Event Camera CVPR 2024 + + +
+ We focus on a very challenging task: imaging nighttime dynamic scenes. Most
+previous methods rely on the low-light enhancement of a conventional RGB
+camera. However, they inevitably face a dilemma between the long exposure time
+required at nighttime and the motion blur of dynamic scenes. Event cameras
+react to dynamic changes with higher temporal resolution (microsecond) and
+higher dynamic range (120dB), offering an alternative solution. In this work,
+we present a novel nighttime dynamic imaging method with an event camera.
+Specifically, we discover that events at nighttime exhibit temporal trailing
+characteristics and a spatially non-stationary distribution. Consequently, we
+propose a nighttime event reconstruction network (NER-Net) which mainly
+includes a learnable event timestamp calibration module (LETC) to align the
+temporal trailing events and a non-uniform illumination aware module (NIAM) to
+stabilize the spatiotemporal distribution of events. Moreover, we construct a
+paired real low-light event dataset (RLED) through a co-axial imaging system,
+including 64,200 spatially and temporally aligned image GTs and low-light
+events. Extensive experiments demonstrate that the proposed method outperforms
+state-of-the-art methods in terms of visual quality and generalization ability
+on real-world nighttime datasets. The project is available at:
+https://github.com/Liu-haoyue/NER-Net.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ SNP: Structured Neuron-level Pruning to Preserve Attention Scores + + +
+ Multi-head self-attention (MSA) is a key component of Vision Transformers
+(ViTs), which have achieved great success in various vision tasks. However,
+their high computational cost and memory footprint hinder their deployment on
+resource-constrained devices. Conventional pruning approaches can only compress
+and accelerate the MSA module using head pruning, although the head is not an
+atomic unit. To address this issue, we propose a novel graph-aware neuron-level
+pruning method, Structured Neuron-level Pruning (SNP). SNP prunes neurons with
+less informative attention scores and eliminates redundancy among heads.
+Specifically, it prunes graphically connected query and key layers having the
+least informative attention scores while preserving the overall attention
+scores. Value layers, which can be pruned independently, are pruned to
+eliminate inter-head redundancy. Our proposed method effectively compresses and
+accelerates Transformer-based models for both edge devices and server
+processors. For instance, DeiT-Small with SNP runs 3.1$\times$ faster than the
+original model, and runs 21.94\% faster with 1.12\% higher accuracy than
+DeiT-Tiny. Additionally, SNP combines successfully with conventional head or
+block pruning approaches. SNP with head pruning can remove 80\% of the
+parameters and computational cost of DeiT-Base and achieve 3.85$\times$ faster
+inference on an RTX3090 and 4.93$\times$ on a Jetson Nano.
+
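+ The graphically coupled query/key pruning can be sketched as follows; the
+per-dimension importance score and the calibration procedure are illustrative
+assumptions, not the SNP implementation.
+
+import torch
+
+@torch.no_grad()
+def prune_qk_dims(w_q, w_k, calib_x, keep: int):
+    # w_q, w_k: (d_head, d_model) projection weights of a single attention head
+    # calib_x:  (N, d_model) calibration tokens
+    q, k = calib_x @ w_q.T, calib_x @ w_k.T              # (N, d_head)
+    # rough contribution of each shared dimension to the attention logits q_d * k_d
+    score = (q.mean(0) * k.mean(0)).abs() + q.std(0) * k.std(0)
+    idx = score.argsort(descending=True)[:keep]
+    # drop the same dimensions from both projections (graphically connected pruning)
+    return w_q[idx], w_k[idx], idx
+
+w_q, w_k = torch.randn(64, 192), torch.randn(64, 192)
+w_q_p, w_k_p, kept = prune_qk_dims(w_q, w_k, torch.randn(256, 192), keep=48)
+print(w_q_p.shape, w_k_p.shape)  # torch.Size([48, 192]) torch.Size([48, 192])
+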
+
+
+
+
+ + ☆ Group-On: Boosting One-Shot Segmentation with Supportive Query + + +
+ One-shot semantic segmentation aims to segment query images given only ONE
+annotated support image of the same class. This task is challenging because
+target objects in the support and query images can be largely different in
+appearance and pose (i.e., intra-class variation). Prior works suggested that
+incorporating more annotated support images in few-shot settings boosts
+performance but increases costs due to additional manual labeling. In this
+paper, we propose a novel approach for ONE-shot semantic segmentation, called
+Group-On, which packs multiple query images in batches for the benefit of
+mutual knowledge support within the same category. Specifically, after coarse
+segmentation masks of the batch of queries are predicted, query-mask pairs act
+as pseudo support data to enhance mask predictions mutually, under the guidance
+of a simple Group-On Voting module. Comprehensive experiments on three standard
+benchmarks show that, in the ONE-shot setting, our Group-On approach
+significantly outperforms previous works by considerable margins. For example,
+on the COCO-20i dataset, we increase mIoU scores by 8.21% and 7.46% on the
+ASNet and HSNet baselines, respectively. With only one support image, Group-On
+can even be competitive with counterparts using 5 annotated support images.
+
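+ The mutual-support idea can be sketched as a simple cross-query voting step;
+the prototype pooling and the averaging rule below are illustrative
+assumptions rather than the authors' Group-On Voting module.
+
+import torch
+import torch.nn.functional as F
+
+def group_on_vote(feats, coarse_masks):
+    # feats: (B, C, H, W) query features; coarse_masks: (B, 1, H, W) in [0, 1]
+    B, C, H, W = feats.shape
+    fg = (feats * coarse_masks).sum(dim=(2, 3)) / coarse_masks.sum(dim=(2, 3)).clamp(min=1e-6)
+    refined = []
+    for i in range(B):
+        others = torch.cat([fg[:i], fg[i + 1:]])         # prototypes from the rest of the batch
+        proto = others.mean(0, keepdim=True)             # (1, C) pooled pseudo-support prototype
+        sim = F.cosine_similarity(feats[i].flatten(1).T, proto).view(1, 1, H, W)
+        refined.append((coarse_masks[i:i + 1] + sim.clamp(min=0)) / 2)   # simple vote
+    return torch.cat(refined)
+
+masks = group_on_vote(torch.randn(4, 64, 32, 32), torch.rand(4, 1, 32, 32))
+print(masks.shape)  # torch.Size([4, 1, 32, 32])
+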
+
+
+
+
+ + ☆ OPTiML: Dense Semantic Invariance Using Optimal Transport for + Self-Supervised Medical Image Representation + + +
+ Self-supervised learning (SSL) has emerged as a promising technique for
+medical image analysis due to its ability to learn without annotations.
+However, despite this promising potential, conventional SSL methods encounter
+limitations, including challenges in achieving semantic alignment and capturing
+subtle details. This leads to suboptimal representations, which fail to
+accurately capture the underlying anatomical structures and pathological
+details. In response to these constraints, we introduce OPTiML, a novel SSL
+framework employing optimal transport (OT) to capture dense semantic invariance
+and fine-grained details, thereby enhancing the overall effectiveness of SSL in
+medical image representation learning. The core idea is to integrate OT with a
+cross-viewpoint semantics infusion module (CV-SIM), which effectively captures
+complex, fine-grained details inherent in medical images across different
+viewpoints. In addition to the CV-SIM module, OPTiML imposes variance and
+covariance regularizations within the OT framework to force the model to focus
+on clinically relevant information while discarding less informative features.
+Through these components, the proposed framework demonstrates its capacity to
+learn semantically rich representations that can be applied to various medical
+imaging tasks. To validate its effectiveness, we conduct experimental studies
+on three publicly available datasets from the chest X-ray modality. Our
+empirical results reveal OPTiML's superiority over state-of-the-art methods
+across all evaluated tasks.
+
+
+
+
+
+ + ☆ From Image to Video, what do we need in multimodal LLMs? + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated profound
+capabilities in understanding multimodal information, ranging from Image LLMs
+to the more complex Video LLMs. Numerous studies have illustrated their
+exceptional cross-modal comprehension. Recently, integrating video foundation
+models with large language models to build a comprehensive video understanding
+system has been proposed to overcome the limitations of specific pre-defined
+vision tasks. However, the current advancements in Video LLMs tend to overlook
+the foundational contributions of Image LLMs, often opting for more complicated
+structures and a wide variety of multimodal data for pre-training. This
+approach significantly increases the costs associated with these methods. In
+response to these challenges, this work introduces an efficient method that
+strategically leverages the priors of Image LLMs, facilitating a
+resource-efficient transition from Image to Video LLMs. We propose RED-VILLM, a
+Resource-Efficient Development pipeline for Video LLMs from Image LLMs, which
+utilizes a temporal adaptation plug-and-play structure within the image fusion
+module of Image LLMs. This adaptation extends their understanding capabilities
+to include temporal information, enabling the development of Video LLMs that
+not only surpass baseline performances but also do so with minimal
+instructional data and training resources. Our approach highlights the
+potential for a more cost-effective and scalable advancement in multimodal
+models, effectively building upon the foundational work of Image LLMs.
+
+
+
+
+
+ + ☆ Progressive Multi-modal Conditional Prompt Tuning + + +
+ Pre-trained vision-language models (VLMs) have shown remarkable
+generalization capabilities via prompting, which leverages VLMs as knowledge
+bases to extract information beneficial for downstream tasks. However, existing
+methods primarily employ uni-modal prompting, which only engages a uni-modal
+branch, failing to simultaneously adjust vision-language (V-L) features.
+Additionally, the one-pass forward pipeline in VLM encoding struggles to align
+V-L features that have a huge gap. Confronting these challenges, we propose a
+novel method, Progressive Multi-modal conditional Prompt Tuning (ProMPT).
+ProMPT exploits a recurrent structure, optimizing and aligning V-L features by
+iteratively utilizing image and current encoding information. It comprises an
+initialization and a multi-modal iterative evolution (MIE) module.
+Initialization is responsible for encoding the image and text using a VLM,
+followed by a feature filter that selects text features similar to the image.
+MIE then facilitates multi-modal prompting through class-conditional vision
+prompting, instance-conditional text prompting, and feature filtering. In each
+MIE iteration, vision prompts are obtained from the filtered text features via
+a vision generator, promoting image features to focus more on the target
+object during vision prompting. The encoded image features are fed into a text
+generator to produce text prompts that are more robust to class shift. Thus,
+V-L features are progressively aligned, enabling progression from coarse to
+exact classification. Extensive experiments are conducted in three settings to
+evaluate the efficacy of ProMPT. The results indicate that ProMPT outperforms
+existing methods on average across all settings, demonstrating its superior
+generalization.
+
+
+
+
+
+ + ☆ Partial Large Kernel CNNs for Efficient Super-Resolution + + +
+ Recently, in the super-resolution (SR) domain, transformers have outperformed +CNNs with fewer FLOPs and fewer parameters since they can deal with long-range +dependency and adaptively adjust weights based on instance. In this paper, we +demonstrate that CNNs, although less focused on in the current SR domain, +surpass Transformers in direct efficiency measures. By incorporating the +advantages of Transformers into CNNs, we aim to achieve both computational +efficiency and enhanced performance. However, using a large kernel in the SR +domain, which mainly processes large images, incurs a large computational +overhead. To overcome this, we propose novel approaches to employing the large +kernel, which can reduce latency by 86\% compared to the naive large kernel, +and leverage an Element-wise Attention module to imitate instance-dependent +weights. As a result, we introduce Partial Large Kernel CNNs for Efficient +Super-Resolution (PLKSR), which achieves state-of-the-art performance on four +datasets at a scale of $\times$4, with reductions of 68.1\% in latency and +80.2\% in maximum GPU memory occupancy compared to SRFormer-light. + +
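+ The partial large-kernel idea above (paying the large-kernel cost on only a
+fraction of the channels, plus an element-wise attention branch imitating
+instance-dependent weights) can be sketched as follows; the split ratio,
+kernel size, and block layout are illustrative assumptions rather than the
+PLKSR architecture.
+
+import torch
+import torch.nn as nn
+
+class PartialLargeKernelBlock(nn.Module):
+    def __init__(self, channels: int, kernel_size: int = 17, split: float = 0.25):
+        super().__init__()
+        self.n_large = int(channels * split)
+        self.large = nn.Conv2d(self.n_large, self.n_large, kernel_size,
+                               padding=kernel_size // 2)
+        self.attn = nn.Sequential(nn.Conv2d(channels, channels, 1), nn.Sigmoid())
+        self.mix = nn.Conv2d(channels, channels, 1)
+
+    def forward(self, x):
+        a, b = x[:, :self.n_large], x[:, self.n_large:]   # only `a` pays the large-kernel cost
+        y = torch.cat([self.large(a), b], dim=1)
+        return self.mix(y * self.attn(x)) + x             # element-wise attention + residual
+
+y = PartialLargeKernelBlock(64)(torch.randn(1, 64, 48, 48))
+print(y.shape)  # torch.Size([1, 64, 48, 48])
+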
+
+
+
+
+ + ☆ Computer-Aided Diagnosis of Thoracic Diseases in Chest X-rays using + hybrid CNN-Transformer Architecture + + +
+ Medical imaging has been used for diagnosis of various conditions, making it
+one of the most powerful resources for effective patient care. Due to
+widespread availability, low cost, and low radiation, chest X-ray is one of
+the most sought-after radiology examinations for the diagnosis of various
+thoracic diseases. Due to advancements in medical imaging technologies and
+increasing patient load, current radiology workflows face various challenges,
+including increasing backlogs, long working hours, and an increase in
+diagnostic errors. An automated computer-aided diagnosis system that can
+interpret chest X-rays to augment radiologists by providing actionable insights
+has the potential to provide a second opinion to radiologists and highlight
+relevant regions in the image, in turn expediting the clinical workflow,
+reducing diagnostic errors, and improving patient care. In this study, we
+applied a novel architecture augmenting the DenseNet121 Convolutional Neural
+Network (CNN) with a multi-head self-attention mechanism from the transformer,
+namely SA-DenseNet121, that can identify multiple thoracic diseases in chest
+X-rays. We conducted experiments on four of the largest chest X-ray datasets,
+namely, ChestX-ray14, CheXpert, MIMIC-CXR-JPG, and IU-CXR. Experimental results
+in terms of the area under the receiver operating characteristic curve
+(AUC-ROC) show that augmenting a CNN with self-attention has potential in
+diagnosing different thoracic diseases from chest X-rays. The proposed
+methodology has the potential to support the reading workflow, improve
+efficiency, and reduce diagnostic errors.
+
+
+ comment: 24 pages, 13 Figures, 13 Tables. arXiv admin note: text overlap with + arXiv:1904.09925 by other authors +
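+ A hybrid of this kind can be sketched by attaching multi-head self-attention
+to DenseNet-121 feature-map tokens before a multi-label classifier; the
+pooling, head count, and classifier below are illustrative assumptions, not
+the paper's exact architecture (num_classes=14 matches ChestX-ray14).
+
+import torch
+import torch.nn as nn
+from torchvision.models import densenet121
+
+class SADenseNet(nn.Module):
+    def __init__(self, num_classes: int = 14, num_heads: int = 8):
+        super().__init__()
+        self.backbone = densenet121(weights=None).features    # (B, 1024, H/32, W/32)
+        self.attn = nn.MultiheadAttention(1024, num_heads, batch_first=True)
+        self.head = nn.Linear(1024, num_classes)
+
+    def forward(self, x):
+        f = self.backbone(x)                                   # (B, 1024, h, w)
+        tokens = f.flatten(2).transpose(1, 2)                  # (B, h*w, 1024)
+        attended, _ = self.attn(tokens, tokens, tokens)        # self-attention over spatial tokens
+        return self.head(attended.mean(dim=1))                 # logits; use BCEWithLogitsLoss (multi-label)
+
+print(SADenseNet()(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 14])
+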
+
+
+
+
+ + ☆ TextCenGen: Attention-Guided Text-Centric Background Adaptation for + Text-to-Image Generation + + +
+ Recent advancements in Text-to-image (T2I) generation have witnessed a shift
+from adapting text to fixed backgrounds to creating images around text.
+Traditional approaches are often limited to generating layouts within static
+images for effective text placement. Our proposed approach, TextCenGen,
+introduces a dynamic adaptation of the blank region for text-friendly image
+generation, emphasizing text-centric design and visual harmony generation. Our
+method employs force-directed attention guidance in T2I models to generate
+images that strategically reserve whitespace for pre-defined text areas, even
+for text or icons at the golden ratio. Observing how cross-attention maps
+affect object placement, we detect and repel conflicting objects using a
+force-directed graph approach, combined with a Spatial Excluding
+Cross-Attention Constraint for smooth attention in whitespace areas. On this
+novel graphic design task, experiments indicate that TextCenGen outperforms
+existing methods with more harmonious compositions. Furthermore, our method
+significantly enhances T2I model outcomes on our specially collected prompt
+datasets, catering to varied text positions. These results demonstrate the
+efficacy of TextCenGen in creating more harmonious and integrated text-image
+compositions.
+
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Utilizing Adversarial Examples for Bias Mitigation and Accuracy + Enhancement + + +
+ We propose a novel approach to mitigate biases in computer vision models by +utilizing counterfactual generation and fine-tuning. While counterfactuals have +been used to analyze and address biases in DNN models, the counterfactuals +themselves are often generated from biased generative models, which can +introduce additional biases or spurious correlations. To address this issue, we +propose using adversarial images, that is images that deceive a deep neural +network but not humans, as counterfactuals for fair model training. + Our approach leverages a curriculum learning framework combined with a +fine-grained adversarial loss to fine-tune the model using adversarial +examples. By incorporating adversarial images into the training data, we aim to +prevent biases from propagating through the pipeline. We validate our approach +through both qualitative and quantitative assessments, demonstrating improved +bias mitigation and accuracy compared to existing methods. Qualitatively, our +results indicate that post-training, the decisions made by the model are less +dependent on the sensitive attribute and our model better disentangles the +relationship between sensitive attributes and classification variables. + +
+
+
+
+
+ + ☆ Cross-model Mutual Learning for Exemplar-based Medical Image + Segmentation AISTATS 2024 + + +
+ Medical image segmentation typically demands extensive dense annotations for +model training, which is both time-consuming and skill-intensive. To mitigate +this burden, exemplar-based medical image segmentation methods have been +introduced to achieve effective training with only one annotated image. In this +paper, we introduce a novel Cross-model Mutual learning framework for +Exemplar-based Medical image Segmentation (CMEMS), which leverages two models +to mutually excavate implicit information from unlabeled data at multiple +granularities. CMEMS can eliminate confirmation bias and enable collaborative +training to learn complementary information by enforcing consistency at +different granularities across models. Concretely, cross-model image +perturbation based mutual learning is devised by using weakly perturbed images +to generate high-confidence pseudo-labels, supervising predictions of strongly +perturbed images across models. This approach enables joint pursuit of +prediction consistency at the image granularity. Moreover, cross-model +multi-level feature perturbation based mutual learning is designed by letting +pseudo-labels supervise predictions from perturbed multi-level features with +different resolutions, which can broaden the perturbation space and enhance the +robustness of our framework. CMEMS is jointly trained using exemplar data, +synthetic data, and unlabeled data in an end-to-end manner. Experimental +results on two medical image datasets indicate that the proposed CMEMS +outperforms the state-of-the-art segmentation methods with extremely limited +supervision. + +
+
+ comment: AISTATS 2024 +
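+ The cross-model image-perturbation mutual learning described above follows a
+familiar weak-to-strong consistency pattern, which can be sketched as below;
+the confidence threshold and the toy models are illustrative assumptions, not
+the CMEMS code.
+
+import torch
+import torch.nn.functional as F
+
+def cross_model_consistency(model_a, model_b, weak_img, strong_img, thresh=0.95):
+    with torch.no_grad():
+        prob_a = torch.softmax(model_a(weak_img), dim=1)   # (B, K, H, W)
+        prob_b = torch.softmax(model_b(weak_img), dim=1)
+    conf_a, pseudo_a = prob_a.max(dim=1)                   # pseudo-masks from model A
+    conf_b, pseudo_b = prob_b.max(dim=1)                   # pseudo-masks from model B
+    loss_b = (F.cross_entropy(model_b(strong_img), pseudo_a, reduction="none")
+              * (conf_a >= thresh)).mean()                 # A supervises B on the strong view
+    loss_a = (F.cross_entropy(model_a(strong_img), pseudo_b, reduction="none")
+              * (conf_b >= thresh)).mean()                 # B supervises A on the strong view
+    return loss_a + loss_b
+
+net_a = torch.nn.Conv2d(1, 2, 3, padding=1)
+net_b = torch.nn.Conv2d(1, 2, 3, padding=1)
+x_weak, x_strong = torch.randn(2, 1, 32, 32), torch.randn(2, 1, 32, 32)
+print(cross_model_consistency(net_a, net_b, x_weak, x_strong))
+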
+
+
+
+
+ + ☆ Does Gaussian Splatting need SFM Initialization? + + +
+ 3D Gaussian Splatting has recently been embraced as a versatile and effective +method for scene reconstruction and novel view synthesis, owing to its +high-quality results and compatibility with hardware rasterization. Despite its +advantages, Gaussian Splatting's reliance on high-quality point cloud +initialization by Structure-from-Motion (SFM) algorithms is a significant +limitation to be overcome. To this end, we investigate various initialization +strategies for Gaussian Splatting and delve into how volumetric reconstructions +from Neural Radiance Fields (NeRF) can be utilized to bypass the dependency on +SFM data. Our findings demonstrate that random initialization can perform much +better if carefully designed and that by employing a combination of improved +initialization strategies and structure distillation from low-cost NeRF models, +it is possible to achieve equivalent results, or at times even superior, to +those obtained from SFM initialization. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ GenVideo: One-shot Target-image and Shape Aware Video Editing using T2I + Diffusion Models CVPR + + +
+ Video editing methods based on diffusion models that rely solely on a text
+prompt for the edit are hindered by the limited expressive power of text
+prompts. Thus, incorporating a reference target image as a visual guide becomes
+desirable for precise control over the edit. Also, most existing methods
+struggle to accurately edit a video when the shape and size of the object in
+the target image differ from the source object. To address these challenges, we
+propose "GenVideo" for editing videos leveraging target-image aware T2I models.
+Our approach handles edits with target objects of varying shapes and sizes
+while maintaining the temporal consistency of the edit using our novel target
+and shape aware InvEdit masks. Further, we propose a novel target-image aware
+latent noise correction strategy during inference to improve the temporal
+consistency of the edits. Experimental analyses indicate that GenVideo can
+effectively handle edits with objects of varying shapes, where existing
+approaches fail.
+
+
+ comment: CVPRw 2024 +
+
+
+
+
+ + ☆ TrACT: A Training Dynamics Aware Contrastive Learning Framework for + Long-tail Trajectory Prediction + + +
+ As a safety critical task, autonomous driving requires accurate predictions +of road users' future trajectories for safe motion planning, particularly under +challenging conditions. Yet, many recent deep learning methods suffer from a +degraded performance on the challenging scenarios, mainly because these +scenarios appear less frequently in the training data. To address such a +long-tail issue, existing methods force challenging scenarios closer together +in the feature space during training to trigger information sharing among them +for more robust learning. These methods, however, primarily rely on the motion +patterns to characterize scenarios, omitting more informative contextual +information, such as interactions and scene layout. We argue that exploiting +such information not only improves prediction accuracy but also scene +compliance of the generated trajectories. In this paper, we propose to +incorporate richer training dynamics information into a prototypical +contrastive learning framework. More specifically, we propose a two-stage +process. First, we generate rich contextual features using a baseline +encoder-decoder framework. These features are split into clusters based on the +model's output errors, using the training dynamics information, and a prototype +is computed within each cluster. Second, we retrain the model using the +prototypes in a contrastive learning framework. We conduct empirical +evaluations of our approach using two large-scale naturalistic datasets and +show that our method achieves state-of-the-art performance by improving +accuracy and scene compliance on the long-tail samples. Furthermore, we perform +experiments on a subset of the clusters to highlight the additional benefit of +our approach in reducing training bias. + +
+
+ comment: 2024 IEEE Intelligent Vehicles Symposium (IV) +
+
+
+
+
+ + ☆ Adaptive Memory Replay for Continual Learning CVPR + + +
+ Foundation Models (FMs) have become the hallmark of modern AI, however, these +models are trained on massive data, leading to financially expensive training. +Updating FMs as new data becomes available is important, however, can lead to +`catastrophic forgetting', where models underperform on tasks related to data +sub-populations observed too long ago. This continual learning (CL) phenomenon +has been extensively studied, but primarily in a setting where only a small +amount of past data can be stored. We advocate for the paradigm where memory is +abundant, allowing us to keep all previous data, but computational resources +are limited. In this setting, traditional replay-based CL approaches are +outperformed by a simple baseline which replays past data selected uniformly at +random, indicating that this setting necessitates a new approach. We address +this by introducing a framework of adaptive memory replay for continual +learning, where sampling of past data is phrased as a multi-armed bandit +problem. We utilize Bolzmann sampling to derive a method which dynamically +selects past data for training conditioned on the current task, assuming full +data access and emphasizing training efficiency. Through extensive evaluations +on both vision and language pre-training tasks, we demonstrate the +effectiveness of our approach, which maintains high performance while reducing +forgetting by up to 10% at no training efficiency cost. + +
+
+ comment: CVPR-W 2024 (Spotlight) +
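+ The bandit view of replay can be sketched with a buffer-per-task Boltzmann
+sampler; the value estimate (a running replay loss) and the temperature are
+illustrative assumptions, not the paper's exact formulation.
+
+import math
+import random
+
+class BoltzmannReplay:
+    def __init__(self, buffers, temperature: float = 1.0):
+        self.buffers = buffers                      # dict: task_id -> list of samples
+        self.value = {t: 1.0 for t in buffers}      # running loss estimate per task (arm)
+        self.tau = temperature
+
+    def sample_task(self):
+        logits = {t: v / self.tau for t, v in self.value.items()}
+        m = max(logits.values())
+        weights = {t: math.exp(l - m) for t, l in logits.items()}
+        r, acc = random.random() * sum(weights.values()), 0.0
+        for t, w in weights.items():
+            acc += w
+            if r <= acc:
+                return t
+        return t
+
+    def sample_batch(self, batch_size: int):
+        t = self.sample_task()
+        return t, random.choices(self.buffers[t], k=batch_size)
+
+    def update(self, task_id, observed_loss, momentum: float = 0.9):
+        # tasks whose replayed data still produce high loss get sampled more often
+        self.value[task_id] = momentum * self.value[task_id] + (1 - momentum) * observed_loss
+
+replay = BoltzmannReplay({0: list(range(100)), 1: list(range(100, 200))})
+task, batch = replay.sample_batch(8)
+replay.update(task, observed_loss=2.3)
+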
+
+
+
+
+ + ☆ DoughNet: A Visual Predictive Model for Topological Manipulation of + Deformable Objects + + +
+ Manipulation of elastoplastic objects like dough often involves topological
+changes such as splitting and merging. The ability to accurately predict these
+topological changes that a specific action might incur is critical for planning
+interactions with elastoplastic objects. We present DoughNet, a
+Transformer-based architecture for handling these challenges, consisting of two
+components. First, a denoising autoencoder represents deformable objects of
+varying topology as sets of latent codes. Second, a visual predictive model
+performs autoregressive set prediction to determine long-horizon geometrical
+deformation and topological changes purely in latent space. Given a partial
+initial state and desired manipulation trajectories, it infers all resulting
+object geometries and topologies at each step. DoughNet thereby allows planning
+robotic manipulation: selecting a suitable tool, its pose and opening width to
+recreate robot- or human-made goals. Our experiments in simulated and real
+environments show that DoughNet is able to significantly outperform related
+approaches that consider deformation only as geometrical change.
+
+
+ comment: Under review. 17 pages, 14 figures +
+
+
+
+
+ + ☆ Compositional Neural Textures + + +
+ Texture plays a vital role in enhancing visual richness in both real +photographs and computer-generated imagery. However, the process of editing +textures often involves laborious and repetitive manual adjustments of textons, +which are the small, recurring local patterns that define textures. In this +work, we introduce a fully unsupervised approach for representing textures +using a compositional neural model that captures individual textons. We +represent each texton as a 2D Gaussian function whose spatial support +approximates its shape, and an associated feature that encodes its detailed +appearance. By modeling a texture as a discrete composition of Gaussian +textons, the representation offers both expressiveness and ease of editing. +Textures can be edited by modifying the compositional Gaussians within the +latent space, and new textures can be efficiently synthesized by feeding the +modified Gaussians through a generator network in a feed-forward manner. This +approach enables a wide range of applications, including transferring +appearance from an image texture to another image, diversifying textures, +texture interpolation, revealing/modifying texture variations, edit +propagation, texture animation, and direct texton manipulation. The proposed +approach contributes to advancing texture analysis, modeling, and editing +techniques, and opens up new possibilities for creating visually appealing +images with controllable textures. + +
+
+
+
+
+ + ☆ SPIdepth: Strengthened Pose Information for Self-supervised Monocular + Depth Estimation + + +
+ Self-supervised monocular depth estimation has garnered considerable +attention for its applications in autonomous driving and robotics. While recent +methods have made strides in leveraging techniques like the Self Query Layer +(SQL) to infer depth from motion, they often overlook the potential of +strengthening pose information. In this paper, we introduce SPIdepth, a novel +approach that prioritizes enhancing the pose network for improved depth +estimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the +importance of pose information in capturing fine-grained scene structures. By +enhancing the pose network's capabilities, SPIdepth achieves remarkable +advancements in scene understanding and depth estimation. Experimental results +on benchmark datasets such as KITTI and Cityscapes showcase SPIdepth's +state-of-the-art performance, surpassing previous methods by significant +margins. Notably, SPIdepth's performance exceeds that of unsupervised models +and, after finetuning on metric data, outperforms all existing methods. +Remarkably, SPIdepth achieves these results using only a single image for +inference, surpassing even methods that utilize video sequences for inference, +thus demonstrating its efficacy and efficiency in real-world applications. Our +approach represents a significant leap forward in self-supervised monocular +depth estimation, underscoring the importance of strengthening pose information +for advancing scene understanding in real-world applications. + +
+
+
+
+
+ + ☆ Global Counterfactual Directions + + +
+ Despite increasing progress in the development of methods for generating
+visual counterfactual explanations, especially with the recent rise of
+Denoising Diffusion Probabilistic Models, previous works consider them as an
+entirely local technique. In this work, we take the first step at globalizing
+them. Specifically, we discover that the latent space of Diffusion Autoencoders
+encodes the inference process of a given classifier in the form of global
+directions. We propose a novel proxy-based approach that discovers two types of
+these directions with the use of only a single image in an entirely black-box
+manner. Precisely, g-directions allow for flipping the decision of a given
+classifier on an entire dataset of images, while h-directions further increase
+the diversity of explanations. We refer to them in general as Global
+Counterfactual Directions (GCDs). Moreover, we show that GCDs can be naturally
+combined with Latent Integrated Gradients resulting in a new black-box
+attribution method, while simultaneously enhancing the understanding of
+counterfactual explanations. We validate our approach on existing benchmarks
+and show that it generalizes to real-world use-cases.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Advancing Applications of Satellite Photogrammetry: Novel Approaches for + Built-up Area Modeling and Natural Environment Monitoring using + Stereo/Multi-view Satellite Image-derived 3D Data + + +
+ With the development of remote sensing technology in recent decades,
+spaceborne sensors with sub-meter and meter spatial resolution (Worldview and
+PlanetScope) have achieved a considerable image quality to generate 3D
+geospatial data via a stereo matching pipeline. These achievements have
+significantly increased the data accessibility in 3D, necessitating adapting
+these 3D geospatial data to analyze human and natural environments. This
+dissertation explores several novel approaches based on stereo and multi-view
+satellite image-derived 3D geospatial data, to deal with remote sensing
+application issues for built-up area modeling and natural environment
+monitoring, including building model 3D reconstruction, glacier dynamics
+tracking, and lake algae monitoring. Specifically, the dissertation introduces
+four parts of novel approaches that deal with the spatial and temporal
+challenges with satellite-derived 3D data. The first study advances LoD-2
+building modeling from satellite-derived orthophotos and DSMs with a novel
+approach employing a model-driven workflow that generates rectangular 3D
+building geometry models. Secondly, to further enhance our building
+reconstruction framework for dense urban areas and non-rectangular buildings,
+we implemented deep learning for unit-level segmentation and introduced a
+gradient-based circle reconstruction for circular buildings, developing a
+polygon composition technique for advanced LoD-2 building reconstruction. Our
+third study utilizes high-spatiotemporal-resolution PlanetScope satellite
+imagery for glacier tracking at the 3D level in mid-latitude regions. Finally,
+we proposed a term, the "Algal Behavior Function", to refine the quantification
+of chlorophyll-a concentrations from satellite imagery in water quality
+monitoring, addressing algae fluctuations and timing discrepancies between
+satellite observations and field measurements, thus enhancing the precision of
+underwater algae volume estimates. Overall, this dissertation demonstrates the
+extensive potential of satellite photogrammetry applications in addressing
+urban and environmental challenges. It further showcases innovative analytical
+methodologies that enhance the applicability of adapting stereo and multi-view
+very high-resolution satellite-derived 3D data. (See full abstract in the
+document)
+
+
+ comment: Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State + University, 2024, offical version is available in OhioLINK +
+
+
+
+
+ + ☆ Towards Multi-modal Transformers in Federated Learning + + +
+ Multi-modal transformers mark significant progress in different domains, but +siloed high-quality data hinders their further improvement. To remedy this, +federated learning (FL) has emerged as a promising privacy-preserving paradigm +for training models without direct access to the raw data held by different +clients. Despite its potential, a considerable research direction regarding the +unpaired uni-modal clients and the transformer architecture in FL remains +unexplored. To fill this gap, this paper explores a transfer multi-modal +federated learning (MFL) scenario within the vision-language domain, where +clients possess data of various modalities distributed across different +datasets. We systematically evaluate the performance of existing methods when a +transformer architecture is utilized and introduce a novel framework called +Federated modality complementary and collaboration (FedCola) by addressing the +in-modality and cross-modality gaps among clients. Through extensive +experiments across various FL settings, FedCola demonstrates superior +performance over previous approaches, offering new perspectives on future +federated training of multi-modal transformers. + +
+
+
+
+
+ + ☆ Enhancing AI Diagnostics: Autonomous Lesion Masking via Semi-Supervised + Deep Learning + + +
+ This study presents an unsupervised domain adaptation method aimed at +autonomously generating image masks outlining regions of interest (ROIs) for +differentiating breast lesions in breast ultrasound (US) imaging. Our +semi-supervised learning approach utilizes a primitive model trained on a small +public breast US dataset with true annotations. This model is then iteratively +refined for the domain adaptation task, generating pseudo-masks for our +private, unannotated breast US dataset. The dataset, twice the size of the +public one, exhibits considerable variability in image acquisition perspectives +and demographic representation, posing a domain-shift challenge. Unlike typical +domain adversarial training, we employ downstream classification outcomes as a +benchmark to guide the updating of pseudo-masks in subsequent iterations. We +found the classification precision to be highly correlated with the +completeness of the generated ROIs, which promotes the explainability of the +deep learning classification model. Preliminary findings demonstrate the +efficacy and reliability of this approach in streamlining the ROI annotation +process, thereby enhancing the classification and localization of breast +lesions for more precise and interpretable diagnoses. + +
+
+
+
+
+ + ☆ Spot-Compose: A Framework for Open-Vocabulary Object Retrieval and + Drawer Manipulation in Point Clouds ICRA 2024 + + +
+ In recent years, modern techniques in deep learning and large-scale datasets +have led to impressive progress in 3D instance segmentation, grasp pose +estimation, and robotics. This allows for accurate detection directly in 3D +scenes, object- and environment-aware grasp prediction, as well as robust and +repeatable robotic manipulation. This work aims to integrate these recent +methods into a comprehensive framework for robotic interaction and manipulation +in human-centric environments. Specifically, we leverage 3D reconstructions +from a commodity 3D scanner for open-vocabulary instance segmentation, +alongside grasp pose estimation, to demonstrate dynamic picking of objects, and +opening of drawers. We show the performance and robustness of our model in two +sets of real-world experiments including dynamic object retrieval and drawer +opening, reporting a 51% and 82% success rate respectively. Code of our +framework as well as videos are available on: https://spot-compose.github.io/. + +
+
+ comment: Accepted at ICRA 2024 Workshops. Code and videos available at + https://spot-compose.github.io/ +
+
+
+
+
+ + ☆ UIClip: A Data-driven Model for Assessing User Interface Design + + +
+ User interface (UI) design is a difficult yet important task for ensuring the +usability, accessibility, and aesthetic qualities of applications. In our +paper, we develop a machine-learned model, UIClip, for assessing the design +quality and visual relevance of a UI given its screenshot and natural language +description. To train UIClip, we used a combination of automated crawling, +synthetic augmentation, and human ratings to construct a large-scale dataset of +UIs, collated by description and ranked by design quality. Through training on +the dataset, UIClip implicitly learns properties of good and bad designs by i) +assigning a numerical score that represents a UI design's relevance and quality +and ii) providing design suggestions. In an evaluation that compared the +outputs of UIClip and other baselines to UIs rated by 12 human designers, we +found that UIClip achieved the highest agreement with ground-truth rankings. +Finally, we present three example applications that demonstrate how UIClip can +facilitate downstream applications that rely on instantaneous assessment of UI +design quality: i) UI code generation, ii) UI design tips generation, and iii) +quality-aware UI example search. + +
+
+
+
+
+ + ☆ MP-DPD: Low-Complexity Mixed-Precision Neural Networks for + Energy-Efficient Digital Predistortion of Wideband Power Amplifiers + + +
+ Digital Pre-Distortion (DPD) enhances signal quality in wideband RF power +amplifiers (PAs). As signal bandwidths expand in modern radio systems, DPD's +energy consumption increasingly impacts overall system efficiency. Deep Neural +Networks (DNNs) offer promising advancements in DPD, yet their high complexity +hinders their practical deployment. This paper introduces open-source +mixed-precision (MP) neural networks that employ quantized low-precision +fixed-point parameters for energy-efficient DPD. This approach reduces +computational complexity and memory footprint, thereby lowering power +consumption without compromising linearization efficacy. Applied to a 160MHz-BW +1024-QAM OFDM signal from a digital RF PA, MP-DPD gives no performance loss +against 32-bit floating-point precision DPDs, while achieving -43.75 (L)/-45.27 +(R) dBc in Adjacent Channel Power Ratio (ACPR) and -38.72 dB in Error Vector +Magnitude (EVM). A 16-bit fixed-point-precision MP-DPD enables a 2.8X reduction +in estimated inference power. The PyTorch learning and testing code is publicly +available at \url{https://github.com/lab-emi/OpenDPD}. + +
+
+ comment: Accepted to IEEE Microwave and Wireless Technology Letters (MWTL) +
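+ Fixed-point quantization of DPD model parameters can be sketched with a
+fake-quantizer and a straight-through gradient; the Q3.12 split (16 bits with
+sign) and the autograd wrapper below are illustrative assumptions, not the
+released OpenDPD code.
+
+import torch
+
+class FixedPointQuant(torch.autograd.Function):
+    """Fake-quantize to signed fixed point with a straight-through gradient."""
+    @staticmethod
+    def forward(ctx, x, int_bits, frac_bits):
+        scale = 2.0 ** frac_bits
+        qmax = 2.0 ** (int_bits + frac_bits) - 1
+        return torch.clamp(torch.round(x * scale), -qmax - 1, qmax) / scale
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        return grad_out, None, None        # straight-through estimator
+
+w = torch.randn(8, requires_grad=True)
+w_q = FixedPointQuant.apply(w, 3, 12)      # quantized weights used in the DPD forward pass
+w_q.sum().backward()
+print(w_q)
+print(w.grad)                              # all ones: the gradient passes straight through
+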
+
+
+
+
+ + ♻ ☆ NeRF-MAE: Masked AutoEncoders for Self-Supervised 3D Representation + Learning for Neural Radiance Fields + + +
+ Neural fields excel in computer vision and robotics due to their ability to +understand the 3D visual world such as inferring semantics, geometry, and +dynamics. Given the capabilities of neural fields in densely representing a 3D +scene from 2D images, we ask the question: Can we scale their self-supervised +pretraining, specifically using masked autoencoders, to generate effective 3D +representations from posed RGB images. Owing to the astounding success of +extending transformers to novel data modalities, we employ standard 3D Vision +Transformers to suit the unique formulation of NeRFs. We leverage NeRF's +volumetric grid as a dense input to the transformer, contrasting it with other +3D representations such as pointclouds where the information density can be +uneven, and the representation is irregular. Due to the difficulty of applying +masked autoencoders to an implicit representation, such as NeRF, we opt for +extracting an explicit representation that canonicalizes scenes across domains +by employing the camera trajectory for sampling. Our goal is made possible by +masking random patches from NeRF's radiance and density grid and employing a +standard 3D Swin Transformer to reconstruct the masked patches. In doing so, +the model can learn the semantic and spatial structure of complete scenes. We +pretrain this representation at scale on our proposed curated posed-RGB data, +totaling over 1.6 million images. Once pretrained, the encoder is used for +effective 3D transfer learning. Our novel self-supervised pretraining for +NeRFs, NeRF-MAE, scales remarkably well and improves performance on various +challenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining, +NeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF +scene understanding baselines on Front3D and ScanNet datasets with an absolute +performance improvement of over 20% AP50 and 8% AP25 for 3D object detection. + +
+
+ comment: 29 pages, 13 figures. Project Page: https://nerf-mae.github.io/ +
+
+
+
+
+ + ♻ ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to
+unlabelled data partially based on knowledge learned from labelled data, where
+the unlabelled data may come from known or novel classes. The prevailing
+approach generally involves clustering across all data and learning conceptions
+by prototypical contrastive learning. However, existing methods largely hinge
+on the performance of clustering algorithms and are thus subject to their
+inherent limitations. Firstly, the estimated cluster number is often smaller
+than the ground truth, causing the existing methods to suffer from a lack of
+prototypes for comprehensive conception learning. To address this issue, we
+propose an adaptive probing mechanism that introduces learnable potential
+prototypes to expand the cluster prototypes (centers). As there is no ground
+truth for the potential prototypes, we develop a self-supervised prototype
+learning framework to optimize the potential prototypes in an end-to-end
+fashion. Secondly, clustering is computationally intensive, and the
+conventional strategy of clustering both labelled and unlabelled instances
+exacerbates this issue. To counteract this inefficiency, we opt to cluster only
+the unlabelled instances and subsequently expand the cluster prototypes with
+our introduced potential prototypes to quickly explore novel classes. Despite
+the simplicity of our proposed method, extensive empirical analysis on a wide
+range of datasets confirms that our method consistently delivers
+state-of-the-art results. Specifically, our method surpasses the nearest
+competitor by a significant margin of \textbf{9.7}$\%$ on the Stanford Cars
+dataset and improves clustering efficiency by \textbf{12$\times$} on the
+Herbarium 19 dataset. We will make the code and checkpoints publicly available
+at \url{https://github.com/xjtuYW/PNP.git}.
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Low-rank tensor completion via tensor joint rank with logarithmic + composite norm + + +
+ Low-rank tensor completion (LRTC) aims to recover a complete low-rank tensor
+from an incompletely observed tensor, attracting extensive attention in various
+practical applications such as image processing and computer vision. However,
+current methods often perform well only when there is a sufficient amount of
+observed information, and they perform poorly or may fail when the observed
+information is less than 5\%. In order to improve the utilization of observed
+information, a new method called tensor joint rank with logarithmic composite
+norm (TJLC) is proposed. This method simultaneously exploits two types of
+tensor low-rank structures, namely tensor Tucker rank and tubal rank, thereby
+enhancing the inherent correlations between known and missing elements. To
+address the challenge of directly applying two significantly different tensor
+ranks to LRTC, a new tensor logarithmic composite norm is further proposed.
+Subsequently, the TJLC model and algorithm for the LRTC problem are proposed.
+Additionally, theoretical convergence guarantees for the TJLC method are
+provided. Experiments on various real datasets demonstrate that the proposed
+method outperforms state-of-the-art methods significantly. Particularly, the
+proposed method achieves satisfactory recovery even when the observed
+information is as low as 1\%, and the recovery performance improves
+significantly as the observed information increases.
+
+
+
+
+
+ + ♻ ☆ Struggle with Adversarial Defense? Try Diffusion + + +
+ Adversarial attacks induce misclassification by introducing subtle +perturbations. Recently, diffusion models are applied to the image classifiers +to improve adversarial robustness through adversarial training or by purifying +adversarial noise. However, diffusion-based adversarial training often +encounters convergence challenges and high computational expenses. +Additionally, diffusion-based purification inevitably causes data shift and is +deemed susceptible to stronger adaptive attacks. To tackle these issues, we +propose the Truth Maximization Diffusion Classifier (TMDC), a generative +Bayesian classifier that builds upon pre-trained diffusion models and the +Bayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian +principles, utilizes the conditional likelihood from diffusion models to +determine the class probabilities of input images, thereby insulating against +the influences of data shift and the limitations of adversarial training. +Moreover, to enhance TMDC's resilience against more potent adversarial attacks, +we propose an optimization strategy for diffusion classifiers. This strategy +involves post-training the diffusion model on perturbed datasets with +ground-truth labels as conditions, guiding the diffusion model to learn the +data distribution and maximizing the likelihood under the ground-truth labels. +The proposed method achieves state-of-the-art performance on the CIFAR10 +dataset against heavy white-box attacks and strong adaptive attacks. +Specifically, TMDC achieves robust accuracies of 82.81% against $l_{\infty}$ +norm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded +perturbations, respectively, with $\epsilon=0.05$. + +
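+ The Bayesian use of a conditional diffusion model can be sketched by scoring
+each class with the model's expected denoising error and applying Bayes' rule
+under a uniform prior; the interface eps_model(x_t, t, y) and the Monte Carlo
+budget are illustrative assumptions, not the TMDC implementation.
+
+import torch
+
+@torch.no_grad()
+def diffusion_classify(eps_model, x0, num_classes, alphas_cumprod, n_samples=32):
+    errs = torch.zeros(num_classes)
+    for y in range(num_classes):
+        for _ in range(n_samples):
+            t = torch.randint(0, len(alphas_cumprod), (1,))
+            a = alphas_cumprod[t]
+            noise = torch.randn_like(x0)
+            x_t = a.sqrt() * x0 + (1 - a).sqrt() * noise      # forward diffusion
+            errs[y] += ((eps_model(x_t, t, y) - noise) ** 2).mean()
+    # lower class-conditional denoising error <=> higher conditional likelihood
+    return torch.softmax(-errs / n_samples, dim=0)            # p(y | x) under a uniform prior
+
+# toy usage: a fake eps-model that ignores its conditioning
+fake_eps = lambda x_t, t, y: torch.zeros_like(x_t)
+probs = diffusion_classify(fake_eps, torch.randn(3, 8, 8), 10, torch.linspace(0.99, 0.01, 50))
+print(probs.sum())  # ~ tensor(1.)
+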
+
+
+
+
+ + ♻ ☆ A new dataset for measuring the performance of blood vessel segmentation + methods under distribution shifts + + +
+ Creating a dataset for training supervised machine learning algorithms can be +a demanding task. This is especially true for medical image segmentation since +one or more specialists are usually required for image annotation, and creating +ground truth labels for just a single image can take up to several hours. In +addition, it is paramount that the annotated samples represent well the +different conditions that might affect the imaged tissues as well as possible +changes in the image acquisition process. This can only be achieved by +considering samples that are typical in the dataset as well as atypical, or +even outlier, samples. We introduce VessMAP, a heterogeneous blood vessel +segmentation dataset acquired by carefully sampling relevant images from a +larger non-annotated dataset. A methodology was developed to select both +prototypical and atypical samples from the base dataset, thus defining an +assorted set of images that can be used for measuring the performance of +segmentation algorithms on samples that are highly distinct from each other. To +demonstrate the potential of the new dataset, we show that the validation +performance of a neural network changes significantly depending on the splits +used for training the network. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Can We Edit Multimodal Large Language Models? EMNLP 2023 + + +
+ In this paper, we focus on editing Multimodal Large Language Models (MLLMs). +Compared to editing single-modal LLMs, multimodal model editing is more +challenging, which demands a higher level of scrutiny and careful consideration +in the editing process. To facilitate research in this area, we construct a new +benchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite +of innovative metrics for evaluation. We conduct comprehensive experiments +involving various model editing baselines and analyze the impact of editing +different components for multimodal LLMs. Empirically, we notice that previous +baselines can implement editing multimodal LLMs to some extent, but the effect +is still barely satisfactory, indicating the potential difficulty of this task. +We hope that our work can provide the NLP community with insights. Code and +dataset are available in https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023. Add the Exact Match/Accuracy results of Reliability and + T-Generality +
+
+
+
+
+ + ♻ ☆ Exposing Image Splicing Traces in Scientific Publications via + Uncertainty-guided Refinement + + +
+ Recently, a surge in scientific publications suspected of image manipulation +has led to numerous retractions, bringing the issue of image integrity into +sharp focus. Although research on forensic detectors for image plagiarism and +image synthesis exists, the detection of image splicing traces in scientific +publications remains unexplored. Compared to image duplication and synthesis, +image splicing detection is more challenging due to the lack of reference +images and the typically small tampered areas. Furthermore, disruptive factors +in scientific images, such as artifacts from digital compression, abnormal +patterns, and noise from physical operations, present misleading features like +splicing traces, significantly increasing the difficulty of this task. +Moreover, the scarcity of high-quality datasets of spliced scientific images +limits potential advancements. In this work, we propose an Uncertainty-guided +Refinement Network (URN) to mitigate the impact of these disruptive factors. +Our URN can explicitly suppress the propagation of unreliable information flow +caused by disruptive factors between regions, thus obtaining robust splicing +features. Additionally, the URN is designed to concentrate improvements in +uncertain prediction areas during the decoding phase. We also construct a +dataset for image splicing detection (SciSp) containing 1,290 spliced images. +Compared to existing datasets, SciSp includes the largest number of spliced +images and the most diverse sources. Comprehensive experiments conducted on +three benchmark datasets demonstrate the superiority of our approach. We also +validate the URN's generalisability in resisting cross-dataset domain shifts +and its robustness against various post-processing techniques, including +advanced deep-learning-based inpainting. + +
+
+
+
+
+ + ♻ ☆ State Space Models for Event Cameras CVPR 2024 + + +
+ Today, state-of-the-art deep neural networks that process event-camera data +first convert a temporal window of events into dense, grid-like input +representations. As such, they exhibit poor generalizability when deployed at +higher inference frequencies (i.e., smaller temporal windows) than the ones +they were trained on. We address this challenge by introducing state-space +models (SSMs) with learnable timescale parameters to event-based vision. This +design adapts to varying frequencies without the need to retrain the network at +different frequencies. Additionally, we investigate two strategies to +counteract aliasing effects when deploying the model at higher frequencies. We +comprehensively evaluate our approach against existing methods based on RNN and +Transformer architectures across various benchmarks, including Gen1 and 1 Mpx +event camera datasets. Our results demonstrate that SSM-based models train 33% +faster and also exhibit minimal performance degradation when tested at higher +frequencies than the training input. Traditional RNN and Transformer models +exhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.76 +mAP, highlighting the effectiveness of SSMs in event-based vision tasks. + +
+
+ comment: 18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper +
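+ A state-space layer with a learnable timescale can be sketched as below;
+re-discretizing with a smaller dt at test time is what lets the same weights
+run at higher inference frequencies. The diagonal parameterization and the
+plain recurrent scan are illustrative simplifications, not the paper's
+architecture.
+
+import torch
+import torch.nn as nn
+
+class DiagonalSSM(nn.Module):
+    def __init__(self, dim: int, state: int = 16):
+        super().__init__()
+        self.log_dt = nn.Parameter(torch.log(torch.full((dim,), 1e-2)))  # learnable timescale
+        self.log_neg_a = nn.Parameter(torch.zeros(dim, state))           # A = -exp(.) keeps the SSM stable
+        self.b = nn.Parameter(torch.randn(dim, state) * 0.1)
+        self.c = nn.Parameter(torch.randn(dim, state) * 0.1)
+
+    def forward(self, u, dt_scale: float = 1.0):
+        # u: (B, T, dim); dt_scale < 1 emulates running at a higher event rate
+        dt = torch.exp(self.log_dt) * dt_scale
+        a = -torch.exp(self.log_neg_a)
+        a_bar = torch.exp(dt[:, None] * a)                               # zero-order-hold discretization
+        b_bar = (a_bar - 1) / a * self.b
+        x = torch.zeros(u.shape[0], u.shape[2], a.shape[1], device=u.device)
+        ys = []
+        for t in range(u.shape[1]):                                      # recurrent scan over time
+            x = a_bar * x + b_bar * u[:, t, :, None]
+            ys.append((x * self.c).sum(-1))
+        return torch.stack(ys, dim=1)                                    # (B, T, dim)
+
+out = DiagonalSSM(8)(torch.randn(2, 50, 8), dt_scale=0.5)
+print(out.shape)  # torch.Size([2, 50, 8])
+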
+
+
+
+
+ + ♻ ☆ Reciprocal Attention Mixing Transformer for Lightweight Image + Restoration CVPR 2024 + + +
+ Although many recent works have made advancements in the image restoration +(IR) field, they often suffer from an excessive number of parameters. Another +issue is that most Transformer-based IR methods focus only on either local or +global features, leading to limited receptive fields or deficient parameter +issues. To address these problems, we propose a lightweight IR network, +Reciprocal Attention Mixing Transformer (RAMiT). It employs our proposed +dimensional reciprocal attention mixing Transformer (D-RAMiT) blocks, which +compute bi-dimensional (spatial and channel) self-attentions in parallel with +different numbers of multi-heads. The bi-dimensional attentions help each other +to complement their counterpart's drawbacks and are then mixed. Additionally, +we introduce a hierarchical reciprocal attention mixing (H-RAMi) layer that +compensates for pixel-level information losses and utilizes semantic +information while maintaining an efficient hierarchical structure. Furthermore, +we revisit and modify MobileNet V1 and V2 to attach efficient convolutions to +our proposed components. The experimental results demonstrate that RAMiT +achieves state-of-the-art performance on multiple lightweight IR tasks, +including super-resolution, color denoising, grayscale denoising, low-light +enhancement, and deraining. Codes are available at +https://github.com/rami0205/RAMiT. + +
+
+ comment: CVPR 2024 Workshop - NTIRE. Codes are available at + https://github.com/rami0205/RAMiT +
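+ Parallel spatial and channel self-attention followed by mixing can be
+sketched as below; the head split, the simple channel-attention branch, and
+the linear mixing layer are illustrative assumptions rather than the D-RAMiT
+block.
+
+import torch
+import torch.nn as nn
+
+class BiDimensionalAttention(nn.Module):
+    def __init__(self, dim: int, spatial_heads: int = 2):
+        super().__init__()
+        self.spatial = nn.MultiheadAttention(dim, spatial_heads, batch_first=True)
+        self.qkv_c = nn.Linear(dim, 3 * dim)           # projections for the channel branch
+        self.mix = nn.Linear(2 * dim, dim)
+
+    def forward(self, x):
+        # x: (B, N, C) tokens of an H*W feature map
+        s, _ = self.spatial(x, x, x)                                     # attention across the N positions
+        q, k, v = self.qkv_c(x).chunk(3, dim=-1)                         # each (B, N, C)
+        attn = torch.softmax(q.transpose(1, 2) @ k / q.shape[1] ** 0.5, dim=-1)  # (B, C, C)
+        c = (attn @ v.transpose(1, 2)).transpose(1, 2)                   # channel attention, back to (B, N, C)
+        return self.mix(torch.cat([s, c], dim=-1)) + x                   # reciprocal mixing + residual
+
+y = BiDimensionalAttention(64)(torch.randn(2, 256, 64))
+print(y.shape)  # torch.Size([2, 256, 64])
+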
+
+
+
+
+ + ♻ ☆ Post-Training Network Compression for 3D Medical Image Segmentation: + Reducing Computational Efforts via Tucker Decomposition + + +
+ We address the computational barrier of deploying advanced deep learning +segmentation models in clinical settings by studying the efficacy of network +compression through tensor decomposition. We propose a post-training Tucker +factorization that enables the decomposition of pre-existing models to reduce +computational requirements without impeding segmentation accuracy. We applied +Tucker decomposition to the convolutional kernels of the TotalSegmentator (TS) +model, an nnU-Net model trained on a comprehensive dataset for automatic +segmentation of 117 anatomical structures. Our approach reduced the +floating-point operations (FLOPs) and memory required during inference, +offering an adjustable trade-off between computational efficiency and +segmentation quality. This study utilized the publicly available TS dataset, +employing various downsampling factors to explore the relationship between +model size, inference speed, and segmentation performance. The application of +Tucker decomposition to the TS model substantially reduced the model parameters +and FLOPs across various compression rates, with limited loss in segmentation +accuracy. We removed up to 88% of the model's parameters with no significant +performance changes in the majority of classes after fine-tuning. Practical +benefits varied across different graphics processing unit (GPU) architectures, +with more distinct speed-ups on less powerful hardware. Post-hoc network +compression via Tucker decomposition presents a viable strategy for reducing +the computational demand of medical image segmentation models without +substantially sacrificing accuracy. This approach enables the broader adoption +of advanced deep learning technologies in clinical practice, offering a way to +navigate the constraints of hardware capabilities. + +
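+ The post-training factorization can be sketched with a plain HOSVD-style
+partial Tucker decomposition of a convolution kernel over its two channel
+modes, the factorization that lets one k x k conv be replaced by a
+1x1 -> small k x k -> 1x1 sequence. The ranks and the NumPy implementation are
+illustrative; the paper applies this to the nnU-Net-based TotalSegmentator
+model rather than a toy kernel.
+
+import numpy as np
+
+def partial_tucker_conv(weight: np.ndarray, rank_out: int, rank_in: int):
+    o, i, kh, kw = weight.shape
+    # truncated bases from the mode-0 (output-channel) and mode-1 (input-channel) unfoldings
+    u_out, _, _ = np.linalg.svd(weight.reshape(o, -1), full_matrices=False)
+    u_in, _, _ = np.linalg.svd(weight.transpose(1, 0, 2, 3).reshape(i, -1),
+                               full_matrices=False)
+    u_out, u_in = u_out[:, :rank_out], u_in[:, :rank_in]
+    # core = weight x_0 u_out^T x_1 u_in^T
+    core = np.einsum("oikl,or,is->rskl", weight, u_out, u_in)
+    return core, u_out, u_in   # 1x1 (u_in), small kh x kw (core), 1x1 (u_out)
+
+w = np.random.randn(128, 64, 3, 3)
+core, u_out, u_in = partial_tucker_conv(w, rank_out=32, rank_in=16)
+approx = np.einsum("rskl,or,is->oikl", core, u_out, u_in)
+print(core.shape, np.linalg.norm(w - approx) / np.linalg.norm(w))
+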
+
+
+
+
+ + ♻ ☆ Efficiently Adversarial Examples Generation for Visual-Language Models + under Targeted Transfer Scenarios using Diffusion Models + + +
+ Targeted transfer-based attacks involving adversarial examples pose a +significant threat to large visual-language models (VLMs). However, the +state-of-the-art (SOTA) transfer-based attacks incur high costs due to +excessive iteration counts. Furthermore, the generated adversarial examples +exhibit pronounced adversarial noise and demonstrate limited efficacy in +evading defense methods such as DiffPure. To address these issues, inspired by +score matching, we introduce AdvDiffVLM, which utilizes diffusion models to +generate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM +employs Adaptive Ensemble Gradient Estimation to modify the score during the +diffusion model's reverse generation process, ensuring the adversarial examples +produced contain natural adversarial semantics and thus possess enhanced +transferability. Simultaneously, to enhance the quality of adversarial examples +further, we employ the GradCAM-guided Mask method to disperse adversarial +semantics throughout the image, rather than concentrating them in a specific +area. Experimental results demonstrate that our method achieves a speedup +ranging from 10X to 30X compared to existing transfer-based attack methods, +while maintaining superior quality of adversarial examples. Additionally, the +generated adversarial examples possess strong transferability and exhibit +increased robustness against adversarial defense methods. Notably, AdvDiffVLM +can successfully attack commercial VLMs, including GPT-4V, in a black-box +manner. + +
+
+
+
+
+ + ♻ ☆ Multi-Level Aggregation and Recursive Alignment Architecture for + Efficient Parallel Inference Segmentation Network + + +
+ Real-time semantic segmentation is a crucial research area for real-world applications. However, many methods lay particular emphasis on reducing computational complexity and model size, while largely sacrificing accuracy. To tackle this problem, we propose a parallel inference network customized for semantic segmentation tasks to achieve a good trade-off between speed and accuracy. We employ a shallow backbone to ensure real-time speed, and propose three core components to compensate for the reduced model capacity and improve accuracy. Specifically, we first design a dual-pyramidal path architecture (Multi-level Feature Aggregation Module, MFAM) to aggregate multi-level features from the encoder to each scale, providing hierarchical clues for subsequent spatial alignment and corresponding in-network inference. Then, we build a Recursive Alignment Module (RAM) by combining a flow-based alignment module with a recursive upsampling architecture for accurate spatial alignment between multi-scale feature maps, at half the computational complexity of the straightforward alignment method. Finally, we perform independent parallel inference on the aligned features to obtain multi-scale scores, and adaptively fuse them through an attention-based Adaptive Scores Fusion Module (ASFM) so that the final prediction can favor objects of multiple scales. Our framework shows a better balance between speed and accuracy than state-of-the-art real-time methods on the Cityscapes and CamVid datasets. We also conducted systematic ablation studies to gain insight into our motivation and architectural design. Code is available at: https://github.com/Yanhua-Zhang/MFARANet.
+
+ comment: 15 pages, 9 figures and 12 Tables. Manuscript completed on April 30, + 2022 +
+
+
+
+
+ + ♻ ☆ REF$^2$-NeRF: Reflection and Refraction aware Neural Radiance Field + + +
+ Recently, significant progress has been made in the study of methods for 3D +reconstruction from multiple images using implicit neural representations, +exemplified by the neural radiance field (NeRF) method. Such methods, which are +based on volume rendering, can model various light phenomena, and various +extended methods have been proposed to accommodate different scenes and +situations. However, when handling scenes with multiple glass objects, e.g., +objects in a glass showcase, modeling the target scene accurately has been +challenging due to the presence of multiple reflection and refraction effects. +Thus, this paper proposes a NeRF-based modeling method for scenes containing a +glass case. In the proposed method, refraction and reflection are modeled using +elements that are dependent and independent of the viewer's perspective. This +approach allows us to estimate the surfaces where refraction occurs, i.e., +glass surfaces, and enables the separation and modeling of both direct and +reflected light components. The proposed method requires predetermined camera +poses, but accurately estimating these poses in scenes with glass objects is +difficult. Therefore, we used a robotic arm with an attached camera to acquire +images with known poses. Compared to existing methods, the proposed method +enables more accurate modeling of both glass refraction and the overall scene. + +
+
+ comment: 10 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ NeuRAD: Neural Rendering for Autonomous Driving + + +
+ Neural radiance fields (NeRFs) have gained popularity in the autonomous +driving (AD) community. Recent methods show NeRFs' potential for closed-loop +simulation, enabling testing of AD systems, and as an advanced training data +augmentation technique. However, existing methods often require long training +times, dense semantic supervision, or lack generalizability. This, in turn, +hinders the application of NeRFs for AD at scale. In this paper, we propose +NeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our +method features simple network design, extensive sensor modeling for both +camera and lidar -- including rolling shutter, beam divergence and ray dropping +-- and is applicable to multiple datasets out of the box. We verify its +performance on five popular AD datasets, achieving state-of-the-art performance +across the board. To encourage further development, we will openly release the +NeuRAD source code. See https://github.com/georghess/NeuRAD . + +
+
+
+
+
+ + ♻ ☆ Back to Basics: Fast Denoising Iterative Algorithm + + +
+ We introduce Back to Basics (BTB), a fast iterative algorithm for noise +reduction. Our method is computationally efficient, does not require training +or ground truth data, and can be applied in the presence of independent noise, +as well as correlated (coherent) noise, where the noise level is unknown. We +examine three study cases: natural image denoising in the presence of additive +white Gaussian noise, Poisson-distributed image denoising, and speckle +suppression in optical coherence tomography (OCT). Experimental results +demonstrate that the proposed approach can effectively improve image quality, +in challenging noise settings. Theoretical guarantees are provided for +convergence stability. + +
+
+
+
+
+ + ♻ ☆ XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness + Evaluation CVPR 2024 + + +
+ Despite the promising performance of existing visual models on public +benchmarks, the critical assessment of their robustness for real-world +applications remains an ongoing challenge. To bridge this gap, we propose an +explainable visual dataset, XIMAGENET-12, to evaluate the robustness of visual +models. XIMAGENET-12 consists of over 200K images with 15,410 manual semantic +annotations. Specifically, we deliberately selected 12 categories from +ImageNet, representing objects commonly encountered in practical life. To +simulate real-world situations, we incorporated six diverse scenarios, such as +overexposure, blurring, and color changes, etc. We further develop a +quantitative criterion for robustness assessment, allowing for a nuanced +understanding of how visual models perform under varying conditions, notably in +relation to the background. We make the XIMAGENET-12 dataset and its +corresponding code openly accessible at +\url{https://sites.google.com/view/ximagenet-12/home}. We expect the +introduction of the XIMAGENET-12 dataset will empower researchers to thoroughly +evaluate the robustness of their visual models under challenging conditions. + +
+
+ comment: Paper accepted by Synthetic Data for Computer Vision Workshop @ IEEE + CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR) +Systems. In real-world scenarios, FRs are confronted with both physical and +digital attacks. However, existing algorithms often address only one type of +attack at a time, which poses significant limitations in real-world scenarios +where FR systems face hybrid physical-digital threats. To facilitate the +research of Unified Attack Detection (UAD) algorithms, a large-scale +UniAttackData dataset has been collected. UniAttackData is the largest public +dataset for Unified Attack Detection, with a total of 28,706 videos, where each +unique identity encompasses all advanced attack types. Based on this dataset, +we organized a Unified Physical-Digital Face Attack Detection Challenge to +boost the research in Unified Attack Detections. It attracted 136 teams for the +development phase, with 13 qualifying for the final round. The results +re-verified by the organizing team were used for the final ranking. This paper +comprehensively reviews the challenge, detailing the dataset introduction, +protocol definition, evaluation criteria, and a summary of published results. +Finally, we focus on the detailed analysis of the highest-performing algorithms +and offer potential directions for unified physical-digital attack detection +inspired by this competition. Challenge Website: +https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Bridging Stereo Geometry and BEV Representation with Reliable Mutual + Interaction for Semantic Scene Completion IJCAI2024 + + +
+ 3D semantic scene completion (SSC) is an ill-posed perception task that +requires inferring a dense 3D scene from limited observations. Previous +camera-based methods struggle to predict accurate semantic scenes due to +inherent geometric ambiguity and incomplete observations. In this paper, we +resort to stereo matching technique and bird's-eye-view (BEV) representation +learning to address such issues in SSC. Complementary to each other, stereo +matching mitigates geometric ambiguity with epipolar constraint while BEV +representation enhances the hallucination ability for invisible regions with +global semantic context. However, due to the inherent representation gap +between stereo geometry and BEV features, it is non-trivial to bridge them for +dense prediction task of SSC. Therefore, we further develop a unified +occupancy-based framework dubbed BRGScene, which effectively bridges these two +representations with dense 3D volumes for reliable semantic scene completion. +Specifically, we design a novel Mutual Interactive Ensemble (MIE) block for +pixel-level reliable aggregation of stereo geometry and BEV features. Within +the MIE block, a Bi-directional Reliable Interaction (BRI) module, enhanced +with confidence re-weighting, is employed to encourage fine-grained interaction +through mutual guidance. Besides, a Dual Volume Ensemble (DVE) module is +introduced to facilitate complementary aggregation through channel-wise +recalibration and multi-group voting. Our method outperforms all published +camera-based methods on SemanticKITTI for semantic scene completion. Our code +is available on \url{https://github.com/Arlo0o/StereoScene}. + +
+
+ comment: IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Low-resolution Prior Equilibrium Network for CT Reconstruction + + +
+ The unrolling method has been investigated for learning variational models in X-ray computed tomography. However, it has been observed that directly unrolling the regularization model through gradient descent does not produce satisfactory results. In this paper, we present a novel deep learning-based CT reconstruction model, where the low-resolution image is introduced to obtain an effective regularization term for improving the network's robustness. Our approach involves constructing the backbone network architecture by algorithm unrolling that is realized using the deep equilibrium architecture. We theoretically discuss the convergence of the proposed low-resolution prior equilibrium model and provide the conditions to guarantee convergence. Experimental results on both sparse-view and limited-angle reconstruction problems are provided, demonstrating that our end-to-end low-resolution prior equilibrium model outperforms other state-of-the-art methods in terms of noise reduction, contrast-to-noise ratio, and preservation of edge details.
+
+
+
+
+ + ♻ ☆ Bootstrapping Autonomous Driving Radars with Self-Supervised Learning + + +
+ The perception of autonomous vehicles using radars has attracted increased research interest due to its ability to operate in fog and bad weather. However, training radar models is hindered by the cost and difficulty of annotating large-scale radar data. To overcome this bottleneck, we propose a self-supervised learning framework to leverage the large amount of unlabeled radar data to pre-train radar-only embeddings for self-driving perception tasks. The proposed method combines radar-to-radar and radar-to-vision contrastive losses to learn a general representation from unlabeled radar heatmaps paired with their corresponding camera images. When used for downstream object detection, we demonstrate that the proposed self-supervision framework can improve the accuracy of state-of-the-art supervised baselines by 5.8% in mAP. Code is available at https://github.com/yiduohao/Radical.
+
+ comment: 12 pages, 5 figures, to be published in Proceedings of the IEEE/CVF + Conference on Computer Vision and Pattern Recognition 2024 +
+
+
+
+
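As a rough illustration of the training objective described in the abstract above (not the released Radical code; the loss form, temperature, and equal weighting are assumptions), the combined radar-to-radar and radar-to-vision terms can be written as two InfoNCE losses over paired embeddings:

```python
import torch
import torch.nn.functional as F

def info_nce(a: torch.Tensor, b: torch.Tensor, tau: float = 0.07) -> torch.Tensor:
    """Standard InfoNCE over a batch of paired embeddings a[i] <-> b[i]."""
    a = F.normalize(a, dim=1)
    b = F.normalize(b, dim=1)
    logits = a @ b.t() / tau                               # (N, N) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)     # positives lie on the diagonal
    return F.cross_entropy(logits, targets)

def radar_ssl_loss(radar_view1, radar_view2, radar_emb, image_emb, w: float = 0.5):
    l_rr = info_nce(radar_view1, radar_view2)              # radar-to-radar term (two augmented views)
    l_rv = info_nce(radar_emb, image_emb)                  # radar-to-vision term (paired camera image)
    return w * l_rr + (1.0 - w) * l_rv
```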
+ + ♻ ☆ DualFluidNet: an Attention-based Dual-pipeline Network for FLuid + Simulation + + +
+ Fluid motion can be considered as a point cloud transformation when using the +SPH method. Compared to traditional numerical analysis methods, using machine +learning techniques to learn physics simulations can achieve near-accurate +results, while significantly increasing efficiency. In this paper, we propose +an innovative approach for 3D fluid simulations utilizing an Attention-based +Dual-pipeline Network, which employs a dual-pipeline architecture, seamlessly +integrated with an Attention-based Feature Fusion Module. Unlike previous +methods, which often make difficult trade-offs between global fluid control and +physical law constraints, we find a way to achieve a better balance between +these two crucial aspects with a well-designed dual-pipeline approach. +Additionally, we design a Type-aware Input Module to adaptively recognize +particles of different types and perform feature fusion afterward, such that +fluid-solid coupling issues can be better dealt with. Furthermore, we propose a +new dataset, Tank3D, to further explore the network's ability to handle more +complicated scenes. The experiments demonstrate that our approach not only +attains a quantitative enhancement in various metrics, surpassing the +state-of-the-art methods but also signifies a qualitative leap in neural +network-based simulation by faithfully adhering to the physical laws. Code and +video demonstrations are available at +https://github.com/chenyu-xjtu/DualFluidNet. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics + Perception + + +
+ The highly abstract nature of image aesthetics perception (IAP) poses a significant challenge for current multimodal large language models (MLLMs). The lack of human-annotated multi-modality aesthetic data further exacerbates this dilemma, resulting in MLLMs falling short of aesthetics perception capabilities. To address the above challenge, we first introduce a comprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT) dataset, which serves as the cornerstone for building multi-modality aesthetics foundation models. Specifically, to align MLLMs with human aesthetics perception, we construct a corpus-rich aesthetic critique database with 21,904 diverse-sourced images and 88K items of natural language feedback from humans, collected via progressive questions ranging from coarse-grained aesthetic grades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle diverse queries, we further prompt GPT to refine the aesthetic critiques and assemble the large-scale aesthetic instruction tuning dataset, i.e., AesMMIT, which consists of 409K multi-typed instructions to activate stronger aesthetic capabilities. Based on the AesMMIT database, we fine-tune open-sourced general foundation models, obtaining multi-modality Aesthetic Expert models, dubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert models deliver significantly better aesthetic perception performance than the state-of-the-art MLLMs, including the most advanced GPT-4V and Gemini-Pro-Vision. Source data will be available at https://github.com/yipoh/AesExpert.
+
+
+
+
+ + ♻ ☆ FaceFilterSense: A Filter-Resistant Face Recognition and Facial + Attribute Analysis Framework + + +
+ With the advent of social media, fun selfie filters have come into tremendous mainstream use, affecting the functioning of facial biometric systems as well as image recognition systems. These filters vary from beautification filters and Augmented Reality (AR)-based filters to filters that modify facial landmarks. Hence, there is a need to assess the impact of such filters on the performance of existing face recognition systems. The limitation of existing solutions is that they focus mainly on beautification filters. However, current AR-based filters and filters that distort facial key points have recently come into vogue and make faces highly unrecognizable even to the naked eye. Also, the filters considered in prior work are mostly obsolete, with limited variations. To mitigate these limitations, we perform a holistic impact analysis of the latest filters and propose a user recognition model for the filtered images. We utilize a benchmark dataset for baseline images and apply the latest filters over them to generate a beautified/filtered dataset. Next, we introduce FaceFilterNet, a model for beautified user recognition. Within this framework, we also utilize our model to predict various attributes of the person, including age, gender, and ethnicity. In addition, we present a filter-wise impact analysis on face recognition, age estimation, gender, and ethnicity prediction. The proposed method affirms the efficacy of our dataset with an accuracy of 87.25% and an optimal accuracy for facial attribute analysis.
+
+
+
+
+ + ♻ ☆ Cross-view and Cross-pose Completion for 3D Human Understanding CVPR 2024 + + +
+ Human perception and understanding is a major domain of computer vision +which, like many other vision subdomains recently, stands to gain from the use +of large models pre-trained on large datasets. We hypothesize that the most +common pre-training strategy of relying on general purpose, object-centric +image datasets such as ImageNet, is limited by an important domain shift. On +the other hand, collecting domain-specific ground truth such as 2D or 3D labels +does not scale well. Therefore, we propose a pre-training approach based on +self-supervised learning that works on human-centric data using only images. +Our method uses pairs of images of humans: the first is partially masked and +the model is trained to reconstruct the masked parts given the visible ones and +a second image. It relies on both stereoscopic (cross-view) pairs, and temporal +(cross-pose) pairs taken from videos, in order to learn priors about 3D as well +as human motion. We pre-train a model for body-centric tasks and one for +hand-centric tasks. With a generic transformer architecture, these models +outperform existing self-supervised pre-training methods on a wide set of +human-centric downstream tasks, and obtain state-of-the-art performance for +instance when fine-tuning for model-based and model-free human mesh recovery. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Predicting and Enhancing the Fairness of DNNs with the Curvature of + Perceptual Manifolds CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have proposed several approaches to reduce model bias, most of which assume that classes with few samples are weak classes. However, recent studies have shown that tail classes are not always hard to learn, and model bias has been observed on sample-balanced datasets, suggesting the existence of other factors that affect model bias. In this work, we first establish a geometric perspective for analyzing model fairness and then systematically propose a series of geometric measurements for perceptual manifolds in deep neural networks. Subsequently, we comprehensively explore the effect of the geometric characteristics of perceptual manifolds on classification difficulty and how learning shapes the geometric characteristics of perceptual manifolds. An unanticipated finding is that the correlation between class accuracy and the separation degree of perceptual manifolds gradually decreases during training, while the negative correlation with the curvature gradually increases, implying that curvature imbalance leads to model bias. Building upon these observations, we propose curvature regularization to encourage the model to learn curvature-balanced and flatter perceptual manifolds. Evaluations on multiple long-tailed and non-long-tailed datasets show the excellent performance and exciting generality of our approach, especially in achieving significant performance improvements on top of current state-of-the-art techniques. Our work opens up a geometric analysis perspective on model bias and reminds researchers to pay attention to model bias on non-long-tailed and even sample-balanced datasets.
+
+ comment: 17pages, Accepted by CVPR 2023, Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ MARformer: An Efficient Metal Artifact Reduction Transformer for Dental + CBCT Images + + +
+ Cone Beam Computed Tomography (CBCT) plays a key role in dental diagnosis and surgery. However, metal tooth implants can introduce annoying metal artifacts during the CBCT imaging process, interfering with diagnosis and downstream processing such as tooth segmentation. In this paper, we develop an efficient Transformer to perform metal artifact reduction (MAR) on dental CBCT images. The proposed MAR Transformer (MARformer) reduces the computational complexity of multihead self-attention with a new Dimension-Reduced Self-Attention (DRSA) module, exploiting the fact that CBCT images have a globally similar structure. A Patch-wise Perceptive Feed Forward Network (P2FFN) is also proposed to perceive local image information for fine-grained restoration. Experimental results on CBCT images with synthetic and real-world metal artifacts show that our MARformer is efficient and outperforms previous MAR methods and two restoration Transformers.
+
+ comment: under consideration of Computer Vision and Image Understanding + journal +
+
+
+
+
+ + ♻ ☆ PDE-CNNs: Axiomatic Derivations and Applications + + +
+ PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of +geometrically meaningful evolution PDEs as substitutes for the conventional +components in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer +parameters, inherent equivariance, better performance, data efficiency, and +geometric interpretability. + In this article we focus on Euclidean equivariant PDE-G-CNNs where the +feature maps are two dimensional throughout. We call this variant of the +framework a PDE-CNN. + From a machine learning perspective, we list several practically desirable +axioms and derive from these which PDEs should be used in a PDE-CNN. Here our +approach to geometric learning via PDEs is inspired by the axioms of classical +linear and morphological scale-space theory, which we generalize by introducing +semifield-valued signals. + Furthermore, we experimentally confirm for small networks that PDE-CNNs offer +fewer parameters, increased performance, and better data efficiency when +compared to CNNs. We also investigate what effect the use of different +semifields has on the performance of the models. + +
+
+
+
+
+ + ♻ ☆ Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for + Domain Generalized Semantic Segmentation + + +
+ In this paper, we first assess and harness various Vision Foundation Models (VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS). Driven by the motivation of Leveraging Stronger pre-trained models and Fewer trainable parameters for Superior generalizability, we introduce a robust fine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for DGSS. Built upon a set of trainable tokens, each linked to distinct instances, Rein precisely refines and forwards the feature maps from each layer to the next layer within the backbone. This process produces diverse refinements for different categories within a single image. With fewer trainable parameters, Rein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full parameter fine-tuning. Extensive experiments across various settings demonstrate that Rein significantly outperforms state-of-the-art methods. Remarkably, with just an extra 1% of trainable parameters within the frozen backbone, Rein achieves a mIoU of 78.4% on Cityscapes, without accessing any real urban-scene datasets. Code is available at https://github.com/w1oves/Rein.git.
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes/rephrasing +
+
+
+
+
+ + ♻ ☆ JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on + Long-Tailed OCTA + + +
+ The oxygen saturation level in the blood (SaO2) is crucial for health, +particularly in relation to sleep-related breathing disorders. However, +continuous monitoring of SaO2 is time-consuming and highly variable depending +on patients' conditions. Recently, optical coherence tomography angiography +(OCTA) has shown promising development in rapidly and effectively screening +eye-related lesions, offering the potential for diagnosing sleep-related +disorders. To bridge this gap, our paper presents three key contributions. +Firstly, we propose JointViT, a novel model based on the Vision Transformer +architecture, incorporating a joint loss function for supervision. Secondly, we +introduce a balancing augmentation technique during data preprocessing to +improve the model's performance, particularly on the long-tail distribution +within the OCTA dataset. Lastly, through comprehensive experiments on the OCTA +dataset, our proposed method significantly outperforms other state-of-the-art +methods, achieving improvements of up to 12.28% in overall accuracy. This +advancement lays the groundwork for the future utilization of OCTA in +diagnosing sleep-related disorders. See project website +https://steve-zeyu-zhang.github.io/JointViT + +
+
+
+
+
+ + ♻ ☆ Quantifying and Enhancing Multi-modal Robustness with Modality + Preference ICLR 2024 + + +
+ Multi-modal models have shown a promising capability to effectively integrate information from various sources, yet they are found to be vulnerable to pervasive perturbations, such as uni-modal attacks and missing conditions. To counter these perturbations, robust multi-modal representations that are positioned well away from the discriminative multi-modal decision boundary are highly desirable. In this paper, different from conventional empirical studies, we focus on a commonly used joint multi-modal framework and theoretically discover that larger uni-modal representation margins and more reliable integration of modalities are essential components for achieving higher robustness. This discovery can further explain the limitation of multi-modal robustness and the phenomenon that multi-modal models are often vulnerable to attacks on a specific modality. Moreover, our analysis reveals how a widespread issue, namely that the model has different preferences for modalities, limits multi-modal robustness by influencing these essential components and can make attacks on a specific modality highly effective. Inspired by our theoretical findings, we introduce a training procedure called Certifiable Robust Multi-modal Training (CRMT), which alleviates the influence of modality preference and explicitly regulates the essential components to significantly improve robustness in a certifiable manner. Our method demonstrates substantial improvements in performance and robustness compared with existing methods. Furthermore, our training procedure can be easily extended to enhance other robust training strategies, highlighting its credibility and flexibility.
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Relaxed forced choice improves performance of visual quality assessment + methods + + +
+ In image quality assessment, a collective visual quality score for an image +or video is obtained from the individual ratings of many subjects. One commonly +used format for these experiments is the two-alternative forced choice method. +Two stimuli with the same content but differing visual quality are presented +sequentially or side-by-side. Subjects are asked to select the one of better +quality, and when uncertain, they are required to guess. The relaxed +alternative forced choice format aims to reduce the cognitive load and the +noise in the responses due to the guessing by providing a third response +option, namely, ``not sure''. This work presents a large and comprehensive +crowdsourcing experiment to compare these two response formats: the one with +the ``not sure'' option and the one without it. To provide unambiguous ground +truth for quality evaluation, subjects were shown pairs of images with +differing numbers of dots and asked each time to choose the one with more dots. +Our crowdsourcing study involved 254 participants and was conducted using a +within-subject design. Each participant was asked to respond to 40 pair +comparisons with and without the ``not sure'' response option and completed a +questionnaire to evaluate their cognitive load for each testing condition. The +experimental results show that the inclusion of the ``not sure'' response +option in the forced choice method reduced mental load and led to models with +better data fit and correspondence to ground truth. We also tested for the +equivalence of the models and found that they were different. The dataset is +available at http://database.mmsp-kn.de/cogvqa-database.html. + +
+
+ comment: 6 pages, 3 figures, accepted at the 2023 15th International + Conference on Quality of Multimedia Experience (QoMEX). Database is publicly + accessible at http://database.mmsp-kn.de/cogvqa-database.html +
+
+
+
+
+ + ♻ ☆ Terrain-Informed Self-Supervised Learning: Enhancing Building Footprint + Extraction from LiDAR Data with Limited Annotations + + +
+ Estimating building footprint maps from geospatial data is of paramount +importance in urban planning, development, disaster management, and various +other applications. Deep learning methodologies have gained prominence in +building segmentation maps, offering the promise of precise footprint +extraction without extensive post-processing. However, these methods face +challenges in generalization and label efficiency, particularly in remote +sensing, where obtaining accurate labels can be both expensive and +time-consuming. To address these challenges, we propose terrain-aware +self-supervised learning, tailored to remote sensing, using digital elevation +models from LiDAR data. We propose to learn a model to differentiate between +bare Earth and superimposed structures enabling the network to implicitly learn +domain-relevant features without the need for extensive pixel-level +annotations. We test the effectiveness of our approach by evaluating building +segmentation performance on test datasets with varying label fractions. +Remarkably, with only 1% of the labels (equivalent to 25 labeled examples), our +method improves over ImageNet pre-training, showing the advantage of leveraging +unlabeled data for feature extraction in the domain of remote sensing. The +performance improvement is more pronounced in few-shot scenarios and gradually +closes the gap with ImageNet pre-training as the label fraction increases. We +test on a dataset characterized by substantial distribution shifts and labeling +errors to demonstrate the generalizability of our approach. When compared to +other baselines, including ImageNet pretraining and more complex architectures, +our approach consistently performs better, demonstrating the efficiency and +effectiveness of self-supervised terrain-aware feature learning. + +
+
+
+
+
+ + ♻ ☆ Octopus v3: Technical Report for On-device Sub-billion Multimodal AI + Agent + + +
+ A multimodal AI agent is characterized by its ability to process and learn from various types of data, including natural language, visual, and audio inputs, to inform its actions. Despite advancements in large language models that incorporate visual data, such as GPT-4V, effectively translating image-based data into actionable outcomes for AI agents continues to be challenging. In this paper, we introduce a multimodal model that incorporates the concept of a functional token, specifically designed for AI agent applications. To ensure compatibility with edge devices, our model is optimized to a compact size of less than 1B parameters. Like GPT-4, our model can process both English and Chinese. We demonstrate that this model is capable of operating efficiently on a wide range of edge devices, including ones as constrained as a Raspberry Pi.
+
+
+
+
+ + ♻ ☆ Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual + Perception ICLR 2024 + + +
+ Mobile device agent based on Multimodal Large Language Models (MLLM) is +becoming a popular application. In this paper, we introduce Mobile-Agent, an +autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual +perception tools to accurately identify and locate both the visual and textual +elements within the app's front-end interface. Based on the perceived vision +context, it then autonomously plans and decomposes the complex operation task, +and navigates the mobile Apps through operations step by step. Different from +previous solutions that rely on XML files of Apps or mobile system metadata, +Mobile-Agent allows for greater adaptability across diverse mobile operating +environments in a vision-centric way, thereby eliminating the necessity for +system-specific customizations. To assess the performance of Mobile-Agent, we +introduced Mobile-Eval, a benchmark for evaluating mobile device operations. +Based on Mobile-Eval, we conducted a comprehensive evaluation of Mobile-Agent. +The experimental results indicate that Mobile-Agent achieved remarkable +accuracy and completion rates. Even with challenging instructions, such as +multi-app operations, Mobile-Agent can still complete the requirements. Code +and model will be open-sourced at https://github.com/X-PLUG/MobileAgent. + +
+
+ comment: Accepted by ICLR 2024 Workshop in Large Language Model (LLM) Agents +
+
+
+
+
+ + ♻ ☆ SCT: A Simple Baseline for Parameter-Efficient Fine-Tuning via Salient + Channels + + +
+ Pre-trained vision transformers provide strong representation benefits for various downstream tasks. Recently, many parameter-efficient fine-tuning (PEFT) methods have been proposed, and their experiments demonstrate that tuning only 1% of extra parameters can surpass full fine-tuning in low-data resource scenarios. However, these methods overlook task-specific information when fine-tuning diverse downstream tasks. In this paper, we propose a simple yet effective method called "Salient Channel Tuning" (SCT) that leverages task-specific information by forwarding the model with task images to select partial channels in a feature map, which enables us to tune only 1/8 of the channels and leads to significantly lower parameter costs. Our experiments show that SCT outperforms full fine-tuning on 18 out of 19 tasks in the VTAB-1K benchmark by adding only 0.11M parameters to the ViT-B backbone, 780x fewer than its full fine-tuning counterpart. Furthermore, experiments on domain generalization and few-shot learning show that SCT surpasses other PEFT methods with lower parameter costs, demonstrating the strong capability and effectiveness of our tuning technique in the low-data regime.
+
+ comment: This work has been accepted by IJCV2023 +
+
+
+
+
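A minimal sketch of the salient-channel idea above, under assumptions (the actual SCT selection criterion and tuning parameterization may differ): forward a few task images, rank channels by mean activation magnitude, and learn an offset only on the selected channels while the backbone stays frozen.

```python
import torch
import torch.nn as nn

@torch.no_grad()
def select_salient_channels(features: torch.Tensor, k: int) -> torch.Tensor:
    """features: (N, C, ...) activations collected from task images; returns top-k channel indices."""
    reduce_dims = [0] + list(range(2, features.dim()))
    saliency = features.abs().mean(dim=reduce_dims)        # (C,) per-channel magnitude
    return saliency.topk(k).indices

class SalientChannelTuner(nn.Module):
    """Adds a learnable offset only on the selected channels; all other parameters stay frozen."""
    def __init__(self, channel_idx: torch.Tensor):
        super().__init__()
        self.register_buffer('idx', channel_idx)
        self.delta = nn.Parameter(torch.zeros(channel_idx.numel()))

    def forward(self, x: torch.Tensor) -> torch.Tensor:    # x: (N, C, ...)
        x = x.clone()
        shape = (1, -1) + (1,) * (x.dim() - 2)
        x[:, self.idx] = x[:, self.idx] + self.delta.view(shape)
        return x
```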
+ + ♻ ☆ Streaming Anchor Loss: Augmenting Supervision with Temporal Significance ICASSP 2024 + + +
+ Streaming neural network models for fast frame-wise responses to various +speech and sensory signals are widely adopted on resource-constrained +platforms. Hence, increasing the learning capacity of such streaming models +(i.e., by adding more parameters) to improve the predictive power may not be +viable for real-world tasks. In this work, we propose a new loss, Streaming +Anchor Loss (SAL), to better utilize the given learning capacity by encouraging +the model to learn more from essential frames. More specifically, our SAL and +its focal variations dynamically modulate the frame-wise cross entropy loss +based on the importance of the corresponding frames so that a higher loss +penalty is assigned for frames within the temporal proximity of semantically +critical events. Therefore, our loss ensures that the model training focuses on +predicting the relatively rare but task-relevant frames. Experimental results +with standard lightweight convolutional and recurrent streaming networks on +three different speech based detection tasks demonstrate that SAL enables the +model to learn the overall task more effectively with improved accuracy and +latency, without any additional data, model parameters, or architectural +changes. + +
+
+ comment: Published at IEEE ICASSP 2024, please see + https://ieeexplore.ieee.org/abstract/document/10447222 +
+
+
+
+
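To make the frame-weighting idea above concrete, here is a hedged sketch (the paper's exact modulation and its focal variants may differ): frame-wise cross entropy is re-weighted by a Gaussian bump around annotated anchor frames, so frames near semantically critical events contribute more to the loss.

```python
import torch
import torch.nn.functional as F

def streaming_anchor_loss(logits, targets, anchor_mask, sigma: float = 5.0):
    """logits: (B, T, C); targets: (B, T) long; anchor_mask: (B, T) with 1 at event frames."""
    B, T, C = logits.shape
    idx = torch.arange(T, device=logits.device, dtype=torch.float32)

    # Distance of every frame to its nearest anchor frame (simple O(T^2) version).
    dist = torch.full((B, T), float('inf'), device=logits.device)
    for b in range(B):
        anchors = idx[anchor_mask[b] > 0]
        if anchors.numel() > 0:
            dist[b] = (idx[None, :] - anchors[:, None]).abs().min(dim=0).values

    # Weight peaks at 2.0 on anchor frames and decays to 1.0 far from any event.
    weights = 1.0 + torch.exp(-(dist ** 2) / (2.0 * sigma ** 2))

    ce = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T),
                         reduction='none').reshape(B, T)
    return (weights * ce).mean()
```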
+ + ♻ ☆ Dynamic Typography: Bringing Text to Life via Video Diffusion Prior + + +
+ Text animation serves as an expressive medium, transforming static +communication into dynamic experiences by infusing words with motion to evoke +emotions, emphasize meanings, and construct compelling narratives. Crafting +animations that are semantically aware poses significant challenges, demanding +expertise in graphic design and animation. We present an automated text +animation scheme, termed "Dynamic Typography", which combines two challenging +tasks. It deforms letters to convey semantic meaning and infuses them with +vibrant movements based on user prompts. Our technique harnesses vector +graphics representations and an end-to-end optimization-based framework. This +framework employs neural displacement fields to convert letters into base +shapes and applies per-frame motion, encouraging coherence with the intended +textual concept. Shape preservation techniques and perceptual loss +regularization are employed to maintain legibility and structural integrity +throughout the animation process. We demonstrate the generalizability of our +approach across various text-to-video models and highlight the superiority of +our end-to-end methodology over baseline methods, which might comprise separate +tasks. Through quantitative and qualitative evaluations, we demonstrate the +effectiveness of our framework in generating coherent text animations that +faithfully interpret user prompts while maintaining readability. Our code is +available at: https://animate-your-word.github.io/demo/. + +
+
+ comment: Our demo page is available at: + https://animate-your-word.github.io/demo/ +
+
+
+
+
+ + ♻ ☆ AID: Attention Interpolation of Text-to-Image Diffusion + + +
+ Conditional diffusion models can create unseen images in various settings, +aiding image interpolation. Interpolation in latent spaces is well-studied, but +interpolation with specific conditions like text or poses is less understood. +Simple approaches, such as linear interpolation in the space of conditions, +often result in images that lack consistency, smoothness, and fidelity. To that +end, we introduce a novel training-free technique named Attention Interpolation +via Diffusion (AID). Our key contributions include 1) proposing an inner/outer +interpolated attention layer; 2) fusing the interpolated attention with +self-attention to boost fidelity; and 3) applying beta distribution to +selection to increase smoothness. We also present a variant, Prompt-guided +Attention Interpolation via Diffusion (PAID), that considers interpolation as a +condition-dependent generative process. This method enables the creation of new +images with greater consistency, smoothness, and efficiency, and offers control +over the exact path of interpolation. Our approach demonstrates effectiveness +for conceptual and spatial interpolation. Code and demo are available at +https://github.com/QY-H00/attention-interpolation-diffusion. + +
+
+
+
+
+ + ♻ ☆ A Survey on 3D Egocentric Human Pose Estimation + + +
+ Egocentric human pose estimation aims to estimate human body poses and +develop body representations from a first-person camera perspective. It has +gained vast popularity in recent years because of its wide range of +applications in sectors like XR-technologies, human-computer interaction, and +fitness tracking. However, to the best of our knowledge, there is no systematic +literature review based on the proposed solutions regarding egocentric 3D human +pose estimation. To that end, the aim of this survey paper is to provide an +extensive overview of the current state of egocentric pose estimation research. +In this paper, we categorize and discuss the popular datasets and the different +pose estimation models, highlighting the strengths and weaknesses of different +methods by comparative analysis. This survey can be a valuable resource for +both researchers and practitioners in the field, offering insights into key +concepts and cutting-edge solutions in egocentric pose estimation, its +wide-ranging applications, as well as the open problems with future scope. + +
+
+
+
+
+ + ♻ ☆ ViGoR: Improving Visual Grounding of Large Vision Language Models with + Fine-Grained Reward Modeling + + +
+ By combining the natural language understanding, generation capabilities, and breadth of knowledge of large language models with image perception, recent large vision language models (LVLMs) have shown unprecedented visual reasoning capabilities. However, the generated text often suffers from inaccurate grounding in the visual input, resulting in errors such as hallucination of nonexistent scene elements, missing significant parts of the scene, and inferring incorrect attributes of and relationships between objects. To address these issues, we introduce a novel framework, ViGoR (Visual Grounding Through Fine-Grained Reward Modeling), that utilizes fine-grained reward modeling to significantly enhance the visual grounding of LVLMs over pre-trained baselines. This improvement is efficiently achieved using much cheaper human evaluations instead of full supervision, as well as automated methods. We show the effectiveness of our approach through a variety of evaluation methods and benchmarks. Additionally, we plan to release our human annotations, comprising approximately 16,000 images and generated text pairs with fine-grained evaluations, to contribute to related research in the community.
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing methods require training models separately for each dataset, which leads to poor domain generalization. Moreover, these methods rely heavily on large amounts of high-quality pair-labelled data for training, which is expensive and impractical. In this paper, we propose a multimodal contrastive learning method (ChangeCLIP) based on visual-language pre-training for change detection domain generalization. Additionally, we propose a dynamic context optimization scheme for prompt learning. Meanwhile, to address the data dependency issue of existing methods, we introduce a single-temporal and controllable AI-generated training strategy (SAIN). This allows us to train the model using a large number of single-temporal images without real-world image pairs, achieving excellent generalization. Extensive experiments on a series of real change detection datasets validate the superiority and strong generalization of ChangeCLIP, outperforming state-of-the-art change detection methods. Code will be available.
+
+
+
+
+ + ♻ ☆ MVDream: Multi-view Diffusion for 3D Generation + + +
+ We introduce MVDream, a diffusion model that is able to generate consistent +multi-view images from a given text prompt. Learning from both 2D and 3D data, +a multi-view diffusion model can achieve the generalizability of 2D diffusion +models and the consistency of 3D renderings. We demonstrate that such a +multi-view diffusion model is implicitly a generalizable 3D prior agnostic to +3D representations. It can be applied to 3D generation via Score Distillation +Sampling, significantly enhancing the consistency and stability of existing +2D-lifting methods. It can also learn new concepts from a few 2D examples, akin +to DreamBooth, but for 3D generation. + +
+
+ comment: Reorganized for arXiv; Our project page is https://MV-Dream.github.io +
+
+
+
+
+ + ♻ ☆ Explaining latent representations of generative models with large + multimodal models ICLR 2024 + + +
+ Learning interpretable representations of data generative latent factors is an important topic for the development of artificial intelligence. With the rise of large multimodal models, which can align images with text to generate answers, it becomes possible to explain such representations in natural language. In this work, we propose a framework to comprehensively explain each latent variable in generative models using a large multimodal model. We further measure the uncertainty of our generated explanations, quantitatively evaluate the performance of explanation generation among multiple large multimodal models, and qualitatively visualize the variations of each latent variable to learn the disentanglement effects of different generative models on explanations. Finally, we discuss the explanatory capabilities and limitations of state-of-the-art large multimodal models.
+
+ comment: ICLR 2024 Workshop on Reliable and Responsible Foundation Models +
+
+
+
+
+ + ♻ ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. Project: +https://chain-of-reasoning-and-segmentation.github.io/. + +
+
+
+
+
+ + ♻ ☆ DeblurGS: Gaussian Splatting for Camera Motion Blur + + +
+ Although significant progress has been made in reconstructing sharp 3D scenes +from motion-blurred images, a transition to real-world applications remains +challenging. The primary obstacle stems from the severe blur which leads to +inaccuracies in the acquisition of initial camera poses through +Structure-from-Motion, a critical aspect often overlooked by previous +approaches. To address this challenge, we propose DeblurGS, a method to +optimize sharp 3D Gaussian Splatting from motion-blurred images, even with the +noisy camera pose initialization. We restore a fine-grained sharp scene by +leveraging the remarkable reconstruction capability of 3D Gaussian Splatting. +Our approach estimates the 6-Degree-of-Freedom camera motion for each blurry +observation and synthesizes corresponding blurry renderings for the +optimization process. Furthermore, we propose Gaussian Densification Annealing +strategy to prevent the generation of inaccurate Gaussians at erroneous +locations during the early training stages when camera motion is still +imprecise. Comprehensive experiments demonstrate that our DeblurGS achieves +state-of-the-art performance in deblurring and novel view synthesis for +real-world and synthetic benchmark datasets, as well as field-captured blurry +smartphone videos. + +
+
+
+
+
+ + ♻ ☆ Supervised Contrastive Vision Transformer for Breast Histopathological + Image Classification + + +
+ Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer. +Breast tissue histopathological examination is critical in diagnosing and +classifying breast cancer. Although existing methods have shown promising +results, there is still room for improvement in the classification accuracy and +generalization of IDC using histopathology images. We present a novel approach, +Supervised Contrastive Vision Transformer (SupCon-ViT), for improving the +classification of invasive ductal carcinoma in terms of accuracy and +generalization by leveraging the inherent strengths and advantages of both +transfer learning, i.e., pre-trained vision transformer, and supervised +contrastive learning. Our results on a benchmark breast cancer dataset +demonstrate that SupCon-Vit achieves state-of-the-art performance in IDC +classification, with an F1-score of 0.8188, precision of 0.7692, and +specificity of 0.8971, outperforming existing methods. In addition, the +proposed model demonstrates resilience in scenarios with minimal labeled data, +making it highly efficient in real-world clinical settings where labelled data +is limited. Our findings suggest that supervised contrastive learning in +conjunction with pre-trained vision transformers appears to be a viable +strategy for an accurate classification of IDC, thus paving the way for a more +efficient and reliable diagnosis of breast cancer through histopathological +image analysis. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
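For readers unfamiliar with the supervised contrastive objective underlying SupCon-ViT, a generic SupCon-style loss over labeled embeddings looks as follows. This is a sketch of the standard formulation, not the paper's exact implementation or hyperparameters.

```python
import torch
import torch.nn.functional as F

def supcon_loss(features: torch.Tensor, labels: torch.Tensor, tau: float = 0.07) -> torch.Tensor:
    """features: (N, D) embeddings; labels: (N,) integer class labels."""
    z = F.normalize(features, dim=1)
    n = z.size(0)
    sim = z @ z.t() / tau                                            # (N, N) similarities
    self_mask = torch.eye(n, dtype=torch.bool, device=z.device)
    pos_mask = (labels[:, None] == labels[None, :]) & ~self_mask     # same-class pairs, no self

    sim = sim.masked_fill(self_mask, float('-inf'))                  # exclude self-comparisons
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)

    pos_counts = pos_mask.sum(dim=1)
    valid = pos_counts > 0                                           # anchors with >= 1 positive
    mean_log_prob_pos = log_prob.masked_fill(~pos_mask, 0.0).sum(dim=1)[valid] / pos_counts[valid]
    return -mean_log_prob_pos.mean()
```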
+ + ♻ ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, demand has emerged for low-budget or even on-device applications of diffusion models. For compressing the Stable Diffusion models (SDMs), several approaches have been proposed, and most of them leverage handcrafted layer removal to obtain smaller U-Nets, along with knowledge distillation to recover network performance. However, such handcrafted layer removal is inefficient and lacks scalability and generalization, and the feature distillation employed in the retraining phase faces an imbalance issue in which a few numerically significant feature loss terms dominate the others throughout the retraining process. To this end, we propose layer pruning and normalized distillation for compressing diffusion models (LAPTOP-Diff). We 1) introduce a layer pruning method to compress the SDM's U-Net automatically and propose an effective one-shot pruning criterion whose one-shot performance is guaranteed by its good additivity property, surpassing other layer pruning and handcrafted layer removal methods, and 2) propose normalized feature distillation for retraining, which alleviates the imbalance issue. Using the proposed LAPTOP-Diff, we compress the U-Nets of SDXL and SDM-v1.5 with state-of-the-art results, achieving a minimal 4.0% decline in PickScore at a pruning ratio of 50%, whereas the comparative methods' minimal PickScore decline is 8.2%. We will release our code.
+
+
+
+
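The imbalance issue mentioned above is easiest to see in code. Below is a hedged sketch of one way to normalize per-layer feature distillation terms (the paper's actual normalization may be defined differently): each layer's MSE is divided by the magnitude of the corresponding teacher features so that no single layer dominates the retraining loss.

```python
import torch
import torch.nn.functional as F

def normalized_feature_distillation(student_feats, teacher_feats, eps: float = 1e-6):
    """student_feats, teacher_feats: lists of feature maps from matching U-Net layers."""
    total = 0.0
    for fs, ft in zip(student_feats, teacher_feats):
        scale = ft.detach().pow(2).mean() + eps            # per-layer feature magnitude
        total = total + F.mse_loss(fs, ft) / scale         # keeps every layer on a comparable scale
    return total / len(student_feats)
```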
+ + ♻ ☆ Methods and strategies for improving the novel view synthesis quality of + neural radiation field + + +
+ Neural Radiance Field (NeRF) technology can learn a 3D implicit model of a scene from 2D images and synthesize realistic novel view images. This technology has received widespread attention from industry and has good application prospects. In response to the problem that the rendering quality of NeRF images needs improvement, many researchers have proposed various methods to improve rendering quality over the past three years. The latest relevant papers are classified and reviewed, the technical principles behind quality improvement are analyzed, and the future evolution direction of quality improvement methods is discussed. This study can help researchers quickly understand the current state and evolutionary context of technology in this field, which is helpful in inspiring the development of more efficient algorithms and promoting the application of NeRF technology in related fields.
+
+
+
+
+ + ♻ ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of robots in their applications heavily depends on the +quality of sensory input. However, designing sensor payloads and their +parameters for specific robotic tasks is an expensive process that requires +well-established sensor knowledge and extensive experiments with physical +hardware. With cameras playing a pivotal role in robotic perception, we +introduce a novel end-to-end optimization approach for co-designing a camera +with specific robotic tasks by combining derivative-free and gradient-based +optimizers. The proposed method leverages recent computer graphics techniques +and physical camera characteristics to prototype the camera in software, +simulate operational environments and tasks for robots, and optimize the camera +design based on the desired tasks in a cost-effective way. We validate the +accuracy of our camera simulation by comparing it with physical cameras, and +demonstrate the design of cameras with stronger performance than common +off-the-shelf alternatives. Our approach supports the optimization of both +continuous and discrete camera parameters, manufacturing constraints, and can +be generalized to a broad range of camera design scenarios including multiple +cameras and unconventional cameras. This work advances the fully automated +design of cameras for specific robotics tasks. + +
+
+
+
+
+ + ♻ ☆ CogME: A Cognition-Inspired Multi-Dimensional Evaluation Metric for + Story Understanding + + +
+ We introduce CogME, a cognition-inspired, multi-dimensional evaluation metric +designed for AI models focusing on story understanding. CogME is a framework +grounded in human thinking strategies and story elements that involve story +understanding. With a specific breakdown of the questions, this approach +provides a nuanced assessment revealing not only AI models' particular +strengths and weaknesses but also the characteristics of the benchmark dataset. +Our case study with the DramaQA dataset demonstrates a refined analysis of the +model and the benchmark dataset. We argue the need for metrics based on +understanding the nature of tasks and designed to align closely with human +cognitive processes. This approach provides insights beyond traditional overall +scores and paves the way for more sophisticated AI development targeting higher +cognitive functions. + +
+
+ comment: 9 pages with 4 figures and 3 tables. This work has been accepted for + presentation at CogSci 2024 and is currently under revision +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning of Rotation-invariant 3D Point Set Features + using Transformer and its Self-distillation + + +
+ Invariance against rotations of 3D objects is an important property in +analyzing 3D point set data. Conventional 3D point set DNNs having rotation +invariance typically obtain accurate 3D shape features via supervised learning +by using labeled 3D point sets as training samples. However, due to the rapid +increase in 3D point set data and the high cost of labeling, a framework to +learn rotation-invariant 3D shape features from numerous unlabeled 3D point +sets is required. This paper proposes a novel self-supervised learning +framework for acquiring accurate and rotation-invariant 3D point set features +at object-level. Our proposed lightweight DNN architecture decomposes an input +3D point set into multiple global-scale regions, called tokens, that preserve +the spatial layout of partial shapes composing the 3D object. We employ a +self-attention mechanism to refine the tokens and aggregate them into an +expressive rotation-invariant feature per 3D point set. Our DNN is effectively +trained by using pseudo-labels generated by a self-distillation framework. To +facilitate the learning of accurate features, we propose to combine multi-crop +and cut-mix data augmentation techniques to diversify 3D point sets for +training. Through a comprehensive evaluation, we empirically demonstrate that, +(1) existing rotation-invariant DNN architectures designed for supervised +learning do not necessarily learn accurate 3D shape features under a +self-supervised learning scenario, and (2) our proposed algorithm learns +rotation-invariant 3D point set features that are more accurate than those +learned by existing algorithms. Code is available at +https://github.com/takahikof/RIPT_SDMM + +
+
+ comment: Accepted to the CVIU journal +
+
+
+
+
+ + ♻ ☆ HR-APR: APR-agnostic Framework with Uncertainty Estimation and + Hierarchical Refinement for Camera Relocalisation ICRA + + +
+ Absolute Pose Regressors (APRs) directly estimate camera poses from monocular +images, but their accuracy is unstable for different queries. Uncertainty-aware +APRs provide uncertainty information on the estimated pose, alleviating the +impact of these unreliable predictions. However, existing uncertainty modelling +techniques are often coupled with a specific APR architecture, resulting in +suboptimal performance compared to state-of-the-art (SOTA) APR methods. This +work introduces a novel APR-agnostic framework, HR-APR, that formulates +uncertainty estimation as cosine similarity estimation between the query and +database features. It neither relies on nor alters the APR network architecture, +making the framework flexible and computationally efficient. In addition, we take advantage +of the uncertainty for pose refinement to enhance the performance of APR. +Extensive experiments demonstrate the effectiveness of our framework, reducing +computational overhead by 27.4\% and 15.2\% on the 7Scenes and Cambridge +Landmarks datasets, respectively, while maintaining SOTA accuracy among single-image APRs. + +
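A minimal sketch of the retrieval-style uncertainty idea described above (cosine similarity between a query feature and mapped database features); the function and variable names are illustrative assumptions, not HR-APR's actual API:

```python
import torch
import torch.nn.functional as F

def retrieval_uncertainty(query_feat, db_feats, top_k=5):
    """Score how unreliable an APR pose estimate is likely to be by comparing
    the query image feature against features of mapped database images.
    Low similarity to the map suggests an unreliable regression."""
    q = F.normalize(query_feat, dim=-1)            # (D,)
    db = F.normalize(db_feats, dim=-1)             # (N, D)
    sims = db @ q                                  # (N,) cosine similarities
    confidence = sims.topk(min(top_k, sims.numel())).values.mean()
    return 1.0 - confidence                        # higher value = more uncertain
```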
+
+ comment: Accepted at the 2024 IEEE International Conference on Robotics and + Automation (ICRA). Code: https://github.com/lck666666/HR-APR +
+
+
+
+
+ + ♻ ☆ Non-negative Contrastive Learning ICLR 2024 + + +
+ Deep representations have shown promising performance when transferred to +downstream tasks in a black-box manner. Yet, their inherent lack of +interpretability remains a significant challenge, as these features are often +opaque to human understanding. In this paper, we propose Non-negative +Contrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization +(NMF) aimed at deriving interpretable features. The power of NCL lies in its +enforcement of non-negativity constraints on features, reminiscent of NMF's +capability to extract features that align closely with sample clusters. NCL not +only aligns mathematically well with an NMF objective but also preserves NMF's +interpretability attributes, resulting in a more sparse and disentangled +representation compared to standard contrastive learning (CL). Theoretically, +we establish guarantees on the identifiability and downstream generalization of +NCL. Empirically, we show that these advantages enable NCL to outperform CL +significantly on feature disentanglement, feature selection, as well as +downstream classification tasks. At last, we show that NCL can be easily +extended to other learning scenarios and benefit supervised learning as well. +Code is available at https://github.com/PKU-ML/non_neg. + +
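As a rough illustration of the core constraint (not the paper's exact objective), non-negativity can be imposed by passing encoder outputs through a ReLU before a standard InfoNCE loss; all names here are hypothetical:

```python
import torch
import torch.nn.functional as F

def nonneg_info_nce(z1, z2, temperature=0.5):
    """InfoNCE over two augmented views whose features are constrained to be
    non-negative (ReLU), loosely mirroring NMF-style part-based factors."""
    z1 = F.normalize(F.relu(z1), dim=-1)
    z2 = F.normalize(F.relu(z2), dim=-1)
    logits = z1 @ z2.t() / temperature                 # (B, B) similarity matrix
    labels = torch.arange(z1.size(0), device=z1.device)
    return F.cross_entropy(logits, labels)             # positives on the diagonal
```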
+
+ comment: 22 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis + + +
+ This paper considers the problem of generative novel view synthesis (GNVS), +generating novel, plausible views of a scene given a limited number of known +views. Here, we propose a set-based generative model that can simultaneously +generate multiple, self-consistent new views, conditioned on any number of +views. Our approach is not limited to generating a single image at a time and +can condition on a variable number of views. As a result, when generating a +large number of views, our method is not restricted to a low-order +autoregressive generation approach and is better able to maintain generated +image quality over large sets of images. We evaluate our model on standard NVS +datasets and show that it outperforms the state-of-the-art image-based GNVS +baselines. Further, we show that the model is capable of generating sets of +views that have no natural sequential ordering, like loops and binocular +trajectories, and significantly outperforms other methods on such tasks. + +
+
+
+
+
+ + ♻ ☆ WHAM: Reconstructing World-grounded Humans with Accurate 3D Motion + + +
+ The estimation of 3D human motion from video has progressed rapidly but +current methods still have several key limitations. First, most methods +estimate the human in camera coordinates. Second, prior work on estimating +humans in global coordinates often assumes a flat ground plane and produces +foot sliding. Third, the most accurate methods rely on computationally +expensive optimization pipelines, limiting their use to offline applications. +Finally, existing video-based methods are surprisingly less accurate than +single-frame methods. We address these limitations with WHAM (World-grounded +Humans with Accurate Motion), which accurately and efficiently reconstructs 3D +human motion in a global coordinate system from video. WHAM learns to lift 2D +keypoint sequences to 3D using motion capture data and fuses this with video +features, integrating motion context and visual information. WHAM exploits +camera angular velocity estimated from a SLAM method together with human motion +to estimate the body's global trajectory. We combine this with a contact-aware +trajectory refinement method that lets WHAM capture human motion in diverse +conditions, such as climbing stairs. WHAM outperforms all existing 3D human +motion recovery methods across multiple in-the-wild benchmarks. Code will be +available for research purposes at http://wham.is.tue.mpg.de/ + +
+
+
+
+
+ + ♻ ☆ Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person + Images + + +
+ Most existing methods for virtual try-on focus on studio person images with a +limited range of poses and clean backgrounds. They can achieve plausible +results for this studio try-on setting by learning to warp a garment image to +fit a person's body from paired training data, i.e., garment images paired with +images of people wearing the same garment. Such data is often collected from +commercial websites, where each garment is demonstrated both by itself and on +several models. By contrast, it is hard to collect paired data for in-the-wild +scenes, and therefore, virtual try-on for casual images of people with more +diverse poses against cluttered backgrounds is rarely studied. + In this work, we fill the gap by introducing a StreetTryOn benchmark to +evaluate in-the-wild virtual try-on performance and proposing a novel method +that can learn it without paired data, from a set of in-the-wild person images +directly. Our method achieves robust performance across shop and street domains +using a novel DensePose warping correction method combined with diffusion-based +conditional inpainting. Our experiments show competitive performance for +standard studio try-on tasks and SOTA performance for street try-on and +cross-domain try-on tasks. + +
+
+
+
+
+ + ♻ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A + Real-World Benchmark Dataset + + +
+ Despite the significant progress in image denoising, it is still challenging +to restore fine-scale details while removing noise, especially in extremely +low-light environments. Leveraging near-infrared (NIR) images to assist visible +RGB image denoising shows the potential to address this issue, becoming a +promising technology. Nonetheless, existing works still struggle with taking +advantage of NIR information effectively for real-world image denoising, due to +the content inconsistency between NIR-RGB images and the scarcity of real-world +paired datasets. To alleviate the problem, we propose an efficient Selective +Fusion Module (SFM), which can be plug-and-played into the advanced denoising +networks to merge the deep NIR-RGB features. Specifically, we sequentially +perform the global and local modulation for NIR and RGB features, and then +integrate the two modulated features. Furthermore, we present a Real-world +NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse +scenarios as well as various noise levels. Extensive experiments on both +synthetic and our real-world datasets demonstrate that the proposed method +achieves better results than state-of-the-art ones. The dataset, codes, and +pre-trained models will be publicly available at +https://github.com/ronjonxu/NAID. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Towards Realistic Scene Generation with LiDAR Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) excel in photo-realistic image synthesis, but their +adaptation to LiDAR scene generation poses a substantial hurdle. This is +primarily because DMs operating in the point space struggle to preserve the +curve-like patterns and 3D geometry of LiDAR scenes, which consumes much of +their representation power. In this paper, we propose LiDAR Diffusion Models +(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to +capture the realism of LiDAR scenes by incorporating geometric priors into the +learning pipeline. Our method targets three major desiderata: pattern realism, +geometry realism, and object realism. Specifically, we introduce curve-wise +compression to simulate real-world LiDAR patterns, point-wise coordinate +supervision to learn scene geometry, and patch-wise encoding for a full 3D +object context. With these three core designs, our method achieves competitive +performance on unconditional LiDAR generation in 64-beam scenario and state of +the art on conditional LiDAR generation, while maintaining high efficiency +compared to point-based DMs (up to 107$\times$ faster). Furthermore, by +compressing LiDAR scenes into a latent space, we enable the controllability of +DMs with various conditions such as semantic maps, camera views, and text +prompts. + +
+
+ comment: CVPR 2024. Project link: https://lidar-diffusion.github.io +
+
+
+
+
+ + ♻ ☆ MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training + + +
+ In this work, we discuss building performant Multimodal Large Language Models +(MLLMs). In particular, we study the importance of various architecture +components and data choices. Through careful and comprehensive ablations of the +image encoder, the vision language connector, and various pre-training data +choices, we identified several crucial design lessons. For example, we +demonstrate that for large-scale multimodal pre-training using a careful mix of +image-caption, interleaved image-text, and text-only data is crucial for +achieving state-of-the-art (SOTA) few-shot results across multiple benchmarks, +compared to other published pre-training results. Further, we show that the +image encoder together with image resolution and the image token count has +substantial impact, while the vision-language connector design is of +comparatively negligible importance. By scaling up the presented recipe, we +build MM1, a family of multimodal models up to 30B parameters, including both +dense models and mixture-of-experts (MoE) variants, that are SOTA in +pre-training metrics and achieve competitive performance after supervised +fine-tuning on a range of established multimodal benchmarks. Thanks to +large-scale pre-training, MM1 enjoys appealing properties such as enhanced +in-context learning, and multi-image reasoning, enabling few-shot +chain-of-thought prompting. + +
+
+
+
+
+ + ♻ ☆ Cross Domain Early Crop Mapping using CropSTGAN + + +
+ Driven by abundant satellite imagery, machine learning-based approaches have +recently been promoted to generate high-resolution crop cultivation maps to +support many agricultural applications. One of the major challenges faced by +these approaches is the limited availability of ground truth labels. In the +absence of ground truth, existing work usually adopts the "direct transfer +strategy" that trains a classifier using historical labels collected from other +regions and then applies the trained model to the target region. Unfortunately, +the spectral features of crops exhibit inter-region and inter-annual +variability due to changes in soil composition, climate conditions, and crop +progress, so the resultant models perform poorly on new and unseen regions or +years. Despite recent efforts, such as the application of the deep adaptation +neural network (DANN) model structure in the deep adaptation crop +classification network (DACCN), to tackle the above cross-domain challenges, +their effectiveness diminishes significantly when there is a large +dissimilarity between the source and target regions. This paper introduces the +Crop Mapping Spectral-temporal Generative Adversarial Neural Network +(CropSTGAN), a novel solution for cross-domain challenges that does not require +target domain labels. CropSTGAN learns to transform the target domain's +spectral features to those of the source domain, effectively bridging large +dissimilarities. Additionally, it employs an identity loss to maintain the +intrinsic local structure of the data. Comprehensive experiments across various +regions and years demonstrate the benefits and effectiveness of the proposed +approach. In experiments, CropSTGAN is benchmarked against various +state-of-the-art (SOTA) methods. Notably, CropSTGAN significantly outperforms +these methods in scenarios with large data distribution dissimilarities between +the target and source domains. + +
+
+
+
+
+ + ♻ ☆ Routers in Vision Mixture of Experts: An Empirical Study + + +
+ Mixture-of-Experts (MoE) models are a promising way to scale up model +capacity without significantly increasing computational cost. A key component +of MoEs is the router, which decides which subset of parameters (experts) +process which feature embeddings (tokens). In this paper, we present a +comprehensive study of routers in MoEs for computer vision tasks. We introduce +a unified MoE formulation that subsumes different MoEs with two parametric +routing tensors. This formulation covers both sparse MoE, which uses a binary +or hard assignment between experts and tokens, and soft MoE, which uses a soft +assignment between experts and weighted combinations of tokens. Routers for +sparse MoEs can be further grouped into two variants: Token Choice, which +matches experts to each token, and Expert Choice, which matches tokens to each +expert. We conduct head-to-head experiments with 6 different routers, including +existing routers from prior work and new ones we introduce. We show that (i) +many routers originally developed for language modeling can be adapted to +perform strongly in vision tasks, (ii) in sparse MoE, Expert Choice routers +generally outperform Token Choice routers, and (iii) soft MoEs generally +outperform sparse MoEs with a fixed compute budget. These results provide new +insights regarding the crucial role of routers in vision MoE models. + +
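A compact sketch of the two sparse routing variants compared in the study; tensor shapes and function names are assumptions for illustration, and real implementations add capacity limits and load-balancing terms:

```python
import torch

def token_choice(router_logits, k=1):
    """Token Choice: every token selects its top-k experts."""
    probs = router_logits.softmax(dim=-1)             # (tokens, experts)
    weights, expert_idx = probs.topk(k, dim=-1)       # per-token expert assignment
    return weights, expert_idx

def expert_choice(router_logits, capacity=4):
    """Expert Choice: every expert selects the `capacity` tokens it scores highest."""
    probs = router_logits.softmax(dim=-1)             # (tokens, experts)
    weights, token_idx = probs.topk(capacity, dim=0)  # top tokens per expert
    return weights.t(), token_idx.t()                 # (experts, capacity)
```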
+
+
+
+
+ + ♻ ☆ D4C Glove-train: Solving the RPM and Bongard-logo Problem by + Circumscribing and Building Distribution for Concepts + + +
+ This paper achieves noteworthy progress in the realm of abstract reasoning, +particularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo +challenges. Initially, we introduce Lico-Net, a novel baseline model that +resolves RPM problems with remarkable accuracy. Leveraging this foundation, we +advance with the D3C approach, which advocates representing the underlying +concepts in abstract reasoning problems through distributions. This perspective +enhances the performance of both Lico-Net and a baseline model excelling in +Bongard-Logo tasks. To bolster the computational efficiency of D3C, we present +the D3C-cos variant, offering a streamlined yet precise solution. Furthermore, +we propose the D2C method, redefining conceptual boundaries within these +domains and bridging the divide between high-level abstractions and their +lower-dimensional counterparts. Finally, we extend our methodology to D4C, +employing adversarial techniques to refine conceptual boundaries further and +demonstrate substantial improvements in both RPM and Bongard-Logo challenges. +Overall, our contributions present a fresh outlook and practical advancements +in the field of abstract reasoning. + +
+
+ comment: 18 pages, 19 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ You Only Need One Color Space: An Efficient Network for Low-light Image + Enhancement + + +
+ The Low-Light Image Enhancement (LLIE) task aims to restore the details and +visual information from corrupted low-light images. Most existing methods learn +the mapping function between low/normal-light images by Deep Neural Networks +(DNNs) on sRGB and HSV color space. Nevertheless, enhancement involves +amplifying image signals, and applying these color spaces to low-light images +with a low signal-to-noise ratio can introduce sensitivity and instability into +the enhancement process. Consequently, this results in the presence of color +artifacts and brightness artifacts in the enhanced images. To alleviate this +problem, we propose a novel trainable color space, named +Horizontal/Vertical-Intensity (HVI). It not only decouples brightness and color +from RGB channels to mitigate the instability during enhancement but also +adapts to low-light images in different illumination ranges due to the +trainable parameters. Further, we design a novel Color and Intensity Decoupling +Network (CIDNet) with two branches dedicated to processing the decoupled image +brightness and color in the HVI space. Within CIDNet, we introduce the +Lightweight Cross-Attention (LCA) module to facilitate interaction between +image structure and content information in both branches, while also +suppressing noise in low-light images. Finally, we conducted 22 quantitative +and qualitative experiments to show that the proposed CIDNet outperforms the +state-of-the-art methods on 11 datasets. The code is available at +https://github.com/Fediory/HVI-CIDNet. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 183 + +
+
+
+ + ☆ Factorized Diffusion: Perceptual Illusions by Noise Decomposition + + +
+ Given a factorization of an image into a sum of linear components, we present +a zero-shot method to control each individual component through diffusion model +sampling. For example, we can decompose an image into low and high spatial +frequencies and condition these components on different text prompts. This +produces hybrid images, which change appearance depending on viewing distance. +By decomposing an image into three frequency subbands, we can generate hybrid +images with three prompts. We also use a decomposition into grayscale and color +components to produce images whose appearance changes when they are viewed in +grayscale, a phenomenon that naturally occurs under dim lighting. And we explore +a decomposition by a motion blur kernel, which produces images that change +appearance under motion blurring. Our method works by denoising with a +composite noise estimate, built from the components of noise estimates +conditioned on different prompts. We also show that for certain decompositions, +our method recovers prior approaches to compositional generation and spatial +control. Finally, we show that we can extend our approach to generate hybrid +images from real images. We do this by holding one component fixed and +generating the remaining components, effectively solving an inverse problem. + +
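The composite noise estimate can be sketched as follows, assuming a `decompose` function that splits a tensor into linear components which sum back to the input (a low/high spatial-frequency split is shown as one example); this is an illustration, not the authors' code:

```python
import torch
import torch.nn.functional as F

def low_high_freq(x, kernel=9):
    """Example linear decomposition: a blurred (low-frequency) part plus its residual."""
    low = F.avg_pool2d(x, kernel, stride=1, padding=kernel // 2)
    return [low, x - low]                    # components sum back to x

def composite_noise(eps_per_prompt, decompose=low_high_freq):
    """Combine prompt-conditioned noise predictions so that each prompt controls
    exactly one component of the decomposition (one prompt per component)."""
    total = torch.zeros_like(eps_per_prompt[0])
    for i, eps in enumerate(eps_per_prompt):
        total = total + decompose(eps)[i]    # keep only the i-th component
    return total
```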
+
+
+
+
+ + ☆ Dynamic Typography: Bringing Words to Life + + +
+ Text animation serves as an expressive medium, transforming static +communication into dynamic experiences by infusing words with motion to evoke +emotions, emphasize meanings, and construct compelling narratives. Crafting +animations that are semantically aware poses significant challenges, demanding +expertise in graphic design and animation. We present an automated text +animation scheme, termed "Dynamic Typography", which combines two challenging +tasks. It deforms letters to convey semantic meaning and infuses them with +vibrant movements based on user prompts. Our technique harnesses vector +graphics representations and an end-to-end optimization-based framework. This +framework employs neural displacement fields to convert letters into base +shapes and applies per-frame motion, encouraging coherence with the intended +textual concept. Shape preservation techniques and perceptual loss +regularization are employed to maintain legibility and structural integrity +throughout the animation process. We demonstrate the generalizability of our +approach across various text-to-video models and highlight the superiority of +our end-to-end methodology over baseline methods, which might comprise separate +tasks. Through quantitative and qualitative evaluations, we demonstrate the +effectiveness of our framework in generating coherent text animations that +faithfully interpret user prompts while maintaining readability. Our code is +available at: https://animate-your-word.github.io/demo/. + +
+
+ comment: Our demo page is available at: + https://animate-your-word.github.io/demo/ +
+
+
+
+
+ + ☆ InFusion: Inpainting 3D Gaussians via Learning Depth Completion from + Diffusion Prior + + +
+ 3D Gaussians have recently emerged as an efficient representation for novel +view synthesis. This work studies their editability with a particular focus on +the inpainting task, which aims to supplement an incomplete set of 3D Gaussians +with additional points for visually harmonious rendering. Compared to 2D +inpainting, the crux of inpainting 3D Gaussians is to figure out the +rendering-relevant properties of the introduced points, whose optimization +largely benefits from their initial 3D positions. To this end, we propose to +guide the point initialization with an image-conditioned depth completion +model, which learns to directly restore the depth map based on the observed +image. Such a design allows our model to fill in depth values at an aligned +scale with the original depth, and also to harness strong generalizability from a +large-scale diffusion prior. Thanks to the more accurate depth completion, our +approach, dubbed InFusion, surpasses existing alternatives with sufficiently +better fidelity and efficiency under various complex scenarios. We further +demonstrate the effectiveness of InFusion with several practical applications, +such as inpainting with user-specific texture or with novel object insertion. + +
+
+ comment: Project page: https://johanan528.github.io/Infusion +
+
+
+
+
+ + ☆ VG4D: Vision-Language Model Goes 4D Video Recognition ICRA 2024 + + +
+ Understanding the real world through point cloud video is a crucial aspect of +robotics and autonomous driving systems. However, prevailing methods for 4D +point cloud recognition have limitations due to sensor resolution, which leads +to a lack of detailed information. Recent advances have shown that +Vision-Language Models (VLM) pre-trained on web-scale text-image datasets can +learn fine-grained visual concepts that can be transferred to various +downstream tasks. However, effectively integrating VLM into the domain of 4D +point clouds remains an unresolved problem. In this work, we propose the +Vision-Language Models Goes 4D (VG4D) framework to transfer VLM knowledge from +visual-text pre-trained models to a 4D point cloud network. Our approach +involves aligning the 4D encoder's representation with a VLM to learn a shared +visual and text space from training on large-scale image-text pairs. By +transferring the knowledge of the VLM to the 4D encoder and combining the VLM, +our VG4D achieves improved recognition performance. To enhance the 4D encoder, +we modernize the classic dynamic point cloud backbone and propose an improved +version of PSTNet, im-PSTNet, which can efficiently model point cloud videos. +Experiments demonstrate that our method achieves state-of-the-art performance +for action recognition on both the NTU RGB+D 60 dataset and the NTU RGB+D 120 +dataset. Code is available at \url{https://github.com/Shark0-0/VG4D}. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ Variational Bayesian Last Layers ICLR + + +
+ We introduce a deterministic variational formulation for training Bayesian +last layer neural networks. This yields a sampling-free, single-pass model and +loss that effectively improves uncertainty estimation. Our variational Bayesian +last layer (VBLL) can be trained and evaluated with only quadratic complexity +in last layer width, and is thus (nearly) computationally free to add to +standard architectures. We experimentally investigate VBLLs, and show that they +improve predictive accuracy, calibration, and out of distribution detection +over baselines across both regression and classification. Finally, we +investigate combining VBLL layers with variational Bayesian feature learning, +yielding a lower variance collapsed variational inference method for Bayesian +neural networks. + +
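For intuition only, a toy sampling-free Bayesian last layer for regression might look like the sketch below, with a diagonal Gaussian over the last-layer weights and a closed-form predictive variance; this omits VBLL's variational/KL terms and is not the paper's formulation:

```python
import torch
import torch.nn as nn

class GaussianLastLayer(nn.Module):
    """Toy deterministic Bayesian last layer: diagonal Gaussian posterior over
    the weight vector, closed-form Gaussian predictive distribution (no sampling)."""
    def __init__(self, feat_dim):
        super().__init__()
        self.w_mean = nn.Parameter(torch.zeros(feat_dim))
        self.w_logvar = nn.Parameter(torch.zeros(feat_dim))
        self.noise_logvar = nn.Parameter(torch.zeros(()))

    def forward(self, phi):                        # phi: (B, feat_dim) features
        mean = phi @ self.w_mean                   # predictive mean
        var = (phi ** 2) @ self.w_logvar.exp() + self.noise_logvar.exp()
        return mean, var

    def nll(self, phi, y):
        mean, var = self(phi)
        return 0.5 * (torch.log(var) + (y - mean) ** 2 / var).mean()
```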
+
+ comment: International Conference on Learning Representations (ICLR) 2024 +
+
+
+
+
+ + ☆ IntrinsicAnything: Learning Diffusion Priors for Inverse Rendering Under + Unknown Illumination + + +
+ This paper aims to recover object materials from posed images captured under +an unknown static lighting condition. Recent methods solve this task by +optimizing material parameters through differentiable physically based +rendering. However, due to the coupling between object geometry, materials, and +environment lighting, there is inherent ambiguity during the inverse rendering +process, preventing previous methods from obtaining accurate results. To +overcome this ill-posed problem, our key idea is to learn the material prior +with a generative model for regularizing the optimization process. We observe +that the general rendering equation can be split into diffuse and specular +shading terms, and thus formulate the material prior as diffusion models of +albedo and specular. Thanks to this design, our model can be trained using the +existing abundant 3D object data, and naturally acts as a versatile tool to +resolve the ambiguity when recovering material representations from RGB images. +In addition, we develop a coarse-to-fine training strategy that leverages +estimated materials to guide diffusion models to satisfy multi-view consistent +constraints, leading to more stable and accurate results. Extensive experiments +on real-world and synthetic datasets demonstrate that our approach achieves +state-of-the-art performance on material recovery. The code will be available +at https://zju3dv.github.io/IntrinsicAnything. + +
+
+ comment: Project page: https://zju3dv.github.io/IntrinsicAnything +
+
+
+
+
+ + ☆ A Subspace-Constrained Tyler's Estimator and its Applications to + Structure from Motion CVPR 24 + + +
+ We present the subspace-constrained Tyler's estimator (STE) designed for +recovering a low-dimensional subspace within a dataset that may be highly +corrupted with outliers. STE is a fusion of the Tyler's M-estimator (TME) and a +variant of the fast median subspace. Our theoretical analysis suggests that, +under a common inlier-outlier model, STE can effectively recover the underlying +subspace, even when it contains a smaller fraction of inliers relative to other +methods in the field of robust subspace recovery. We apply STE in the context +of Structure from Motion (SfM) in two ways: for robust estimation of the +fundamental matrix and for the removal of outlying cameras, enhancing the +robustness of the SfM pipeline. Numerical experiments confirm the +state-of-the-art performance of our method in these applications. This research +makes significant contributions to the field of robust subspace recovery, +particularly in the context of computer vision and 3D reconstruction. + +
+
+ comment: 23 pages, accepted by CVPR 24 +
+
+
+
+
+ + ☆ Prompt Optimizer of Text-to-Image Diffusion Models for Abstract Concept + Understanding WWW 2024 + + +
+ The rapid evolution of text-to-image diffusion models has opened the door to +generative AI, enabling the translation of textual descriptions into visually +compelling images with remarkable quality. However, a persistent challenge +within this domain is the optimization of prompts to effectively translate +abstract concepts into concrete objects. For example, text encoders can hardly +express "peace", while they can easily illustrate olive branches and white doves. +This paper introduces a novel approach named Prompt Optimizer for Abstract +Concepts (POAC) specifically designed to enhance the performance of +text-to-image diffusion models in interpreting and generating images from +abstract concepts. We propose a Prompt Language Model (PLM), which is +initialized from a pre-trained language model, and then fine-tuned with a +curated dataset of abstract concept prompts. The dataset is created with GPT-4 +to extend the abstract concept to a scene and concrete objects. Our framework +employs a Reinforcement Learning (RL)-based optimization strategy, focusing on +the alignment between the images generated by a stable diffusion model and the +optimized prompts. Through extensive experiments, we demonstrate that our +proposed POAC significantly improves the accuracy and aesthetic quality of +generated images, particularly in the description of abstract concepts and +alignment with optimized prompts. We also present a comprehensive analysis of +our model's performance across diffusion models under different settings, +showcasing its versatility and effectiveness in enhancing abstract concept +representation. + +
+
+ comment: WWW 2024 Companion +
+
+
+
+
+ + ☆ State-space Decomposition Model for Video Prediction Considering + Long-term Motion Trend + + +
+ Stochastic video prediction enables the consideration of uncertainty in +future motion, thereby providing a better reflection of the dynamic nature of +the environment. Stochastic video prediction methods based on image +auto-regressive recurrent models need to feed their predictions back into the +latent space. Conversely, state-space models, which decouple frame +synthesis and temporal prediction, prove to be more efficient. However, +inferring long-term temporal information about motion and generalizing to +dynamic scenarios under non-stationary assumptions remains an unresolved +challenge. In this paper, we propose a state-space decomposition stochastic +video prediction model that decomposes the overall video frame generation into +deterministic appearance prediction and stochastic motion prediction. Through +adaptive decomposition, the model's generalization capability to dynamic +scenarios is enhanced. In the context of motion prediction, obtaining a prior +on the long-term trend of future motion is crucial. Thus, in the stochastic +motion prediction branch, we infer the long-term motion trend from conditional +frames to guide the generation of future frames that exhibit high consistency +with the conditional frames. Experimental results demonstrate that our model +outperforms baselines on multiple datasets. + +
+
+
+
+
+ + ☆ Simple Image Signal Processing using Global Context Guidance + + +
+ In modern smartphone cameras, the Image Signal Processor (ISP) is the core +element that converts the RAW readings from the sensor into perceptually +pleasant RGB images for the end users. The ISP is typically proprietary and +handcrafted and consists of several blocks such as white balance, color +correction, and tone mapping. Deep learning-based ISPs aim to transform RAW +images into DSLR-like RGB images using deep neural networks. However, most +learned ISPs are trained using patches (small regions) due to computational +limitations. Such methods lack global context, which limits their efficacy on +full-resolution images and harms their ability to capture global properties +such as color constancy or illumination. First, we propose a novel module that +can be integrated into any neural ISP to capture the global context information +from the full RAW images. Second, we propose an efficient and simple neural ISP +that utilizes our proposed module. Our model achieves state-of-the-art results +on different benchmarks using diverse and real smartphone images. + +
+
+ comment: Preprint under review +
+
+
+
+
+ + ☆ MoA: Mixture-of-Attention for Subject-Context Disentanglement in + Personalized Image Generation + + +
+ We introduce a new architecture for personalization of text-to-image +diffusion models, coined Mixture-of-Attention (MoA). Inspired by the +Mixture-of-Experts mechanism utilized in large language models (LLMs), MoA +distributes the generation workload between two attention pathways: a +personalized branch and a non-personalized prior branch. MoA is designed to +retain the original model's prior by fixing its attention layers in the prior +branch, while minimally intervening in the generation process with the +personalized branch that learns to embed subjects in the layout and context +generated by the prior branch. A novel routing mechanism manages the +distribution of pixels in each layer across these branches to optimize the +blend of personalized and generic content creation. Once trained, MoA +facilitates the creation of high-quality, personalized images featuring +multiple subjects with compositions and interactions as diverse as those +generated by the original model. Crucially, MoA enhances the distinction +between the model's pre-existing capability and the newly augmented +personalized intervention, thereby offering a more disentangled subject-context +control that was previously unattainable. Project page: +https://snap-research.github.io/mixture-of-attention + +
+
+ comment: Project Website: https://snap-research.github.io/mixture-of-attention +
+
+
+
+
+ + ☆ Predicting Long-horizon Futures by Conditioning on Geometry and Time + + +
+ Our work explores the task of generating future sensor observations +conditioned on the past. We are motivated by `predictive coding' concepts from +neuroscience as well as robotic applications such as self-driving vehicles. +Predictive video modeling is challenging because the future may be multi-modal +and learning at scale remains computationally expensive for video processing. +To address both challenges, our key insight is to leverage the large-scale +pretraining of image diffusion models which can handle multi-modality. We +repurpose image models for video prediction by conditioning on new frame +timestamps. Such models can be trained with videos of both static and dynamic +scenes. To allow them to be trained with modestly-sized datasets, we introduce +invariances by factoring out illumination and texture by forcing the model to +predict (pseudo) depth, readily obtained for in-the-wild videos via +off-the-shelf monocular depth networks. In fact, we show that simply modifying +networks to predict grayscale pixels already improves the accuracy of video +prediction. Given the extra controllability with timestamp conditioning, we +propose sampling schedules that work better than the traditional autoregressive +and hierarchical sampling strategies. Motivated by probabilistic metrics from +the object forecasting literature, we create a benchmark for video prediction +on a diverse set of videos spanning indoor and outdoor scenes and a large +vocabulary of objects. Our experiments illustrate the effectiveness of learning +to condition on timestamps, and show the importance of predicting the future +with invariant modalities. + +
+
+ comment: Project page: http://www.cs.cmu.edu/~tkhurana/depthforecasting/ +
+
+
+
+
+ + ☆ SSDiff: Spatial-spectral Integrated Diffusion Model for Remote Sensing + Pansharpening + + +
+ Pansharpening is a significant image fusion technique that merges the spatial +content and spectral characteristics of remote sensing images to generate +high-resolution multispectral images. Recently, denoising diffusion +probabilistic models have been gradually applied to visual tasks, enhancing +controllable image generation through low-rank adaptation (LoRA). In this +paper, we introduce a spatial-spectral integrated diffusion model for the +remote sensing pansharpening task, called SSDiff, which considers the +pansharpening process as the fusion process of spatial and spectral components +from the perspective of subspace decomposition. Specifically, SSDiff utilizes +spatial and spectral branches to learn spatial details and spectral features +separately, then employs a designed alternating projection fusion module (APFM) +to accomplish the fusion. Furthermore, we propose a frequency modulation +inter-branch module (FMIM) to modulate the frequency distribution between +branches. The two components of SSDiff can perform favorably against the APFM +when utilizing a LoRA-like branch-wise alternative fine-tuning method. It +refines SSDiff to capture component-discriminating features more sufficiently. +Finally, extensive experiments on four commonly used datasets, i.e., +WorldView-3, WorldView-2, GaoFen-2, and QuickBird, demonstrate the superiority +of SSDiff both visually and quantitatively. The code will be made open source +after possible acceptance. + +
+
+
+
+
+ + ☆ JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on + Long-Tailed OCTA + + +
+ The oxygen saturation level in the blood (SaO2) is crucial for health, +particularly in relation to sleep-related breathing disorders. However, +continuous monitoring of SaO2 is time-consuming and highly variable depending +on patients' conditions. Recently, optical coherence tomography angiography +(OCTA) has shown promising development in rapidly and effectively screening +eye-related lesions, offering the potential for diagnosing sleep-related +disorders. To bridge this gap, our paper presents three key contributions. +Firstly, we propose JointViT, a novel model based on the Vision Transformer +architecture, incorporating a joint loss function for supervision. Secondly, we +introduce a balancing augmentation technique during data preprocessing to +improve the model's performance, particularly on the long-tail distribution +within the OCTA dataset. Lastly, through comprehensive experiments on the OCTA +dataset, our proposed method significantly outperforms other state-of-the-art +methods, achieving improvements of up to 12.28% in overall accuracy. This +advancement lays the groundwork for the future utilization of OCTA in +diagnosing sleep-related disorders. See project website +https://steve-zeyu-zhang.github.io/JointViT + +
+
+
+
+
+ + ☆ Event Cameras Meet SPADs for High-Speed, Low-Bandwidth Imaging + + +
+ Traditional cameras face a trade-off between low-light performance and +high-speed imaging: longer exposure times to capture sufficient light result +in motion blur, whereas shorter exposures result in Poisson-corrupted noisy +images. While burst photography techniques help mitigate this tradeoff, +conventional cameras are fundamentally limited in their sensor noise +characteristics. Event cameras and single-photon avalanche diode (SPAD) sensors +have emerged as promising alternatives to conventional cameras due to their +desirable properties. SPADs are capable of single-photon sensitivity with +microsecond temporal resolution, and event cameras can measure brightness +changes up to 1 MHz with low bandwidth requirements. We show that these +properties are complementary, and can help achieve low-light, high-speed image +reconstruction with low bandwidth requirements. We introduce a sensor fusion +framework to combine SPADs with event cameras to improve the reconstruction of +high-speed, low-light scenes while reducing the high bandwidth cost associated +with using every SPAD frame. Our evaluation, on both synthetic and real sensor +data, demonstrates significant enhancements (> 5 dB PSNR) in reconstructing +low-light scenes at high temporal resolution (100 kHz) compared to conventional +cameras. Event-SPAD fusion shows great promise for real-world applications, +such as robotics or medical imaging. + +
+
+
+
+
+ + ☆ arcjetCV: an open-source software to analyze material ablation + + +
+ arcjetCV is an open-source Python software designed to automate time-resolved +measurements of heatshield material recession and recession rates from arcjet +test video footage. This new automated and accessible capability greatly +exceeds previous manual extraction methods, enabling rapid and detailed +characterization of material recession for any sample with a profile video. +arcjetCV automates the video segmentation process using machine learning +models, including a one-dimensional (1D) Convolutional Neural Network (CNN) to +infer the time-window of interest, a two-dimensional (2D) CNN for image and +edge segmentation, and a Local Outlier Factor (LOF) for outlier filtering. A +graphical user interface (GUI) simplifies the user experience and an +application programming interface (API) allows users to call the core functions +from scripts, enabling video batch processing. arcjetCV's capability to measure +time-resolved recession in turn enables characterization of non-linear +processes (shrinkage, swelling, melt flows, etc.), contributing to higher +fidelity validation and improved modeling of heatshield material performance. +The source code associated with this article can be found at +https://github.com/magnus-haw/arcjetCV. + +
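The outlier-filtering stage can be illustrated with scikit-learn's LocalOutlierFactor; the assumed data layout (time, edge position) and function name are placeholders, not arcjetCV's actual interface:

```python
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def filter_edge_samples(times, positions, n_neighbors=20):
    """Drop (time, edge-position) samples flagged as outliers before estimating
    recession rates from the remaining edge measurements."""
    times, positions = np.asarray(times), np.asarray(positions)
    pts = np.column_stack([times, positions])
    inlier = LocalOutlierFactor(n_neighbors=n_neighbors).fit_predict(pts) == 1
    return times[inlier], positions[inlier]
```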
+
+
+
+
+ + ☆ Multi-resolution Rescored ByteTrack for Video Object Detection on + Ultra-low-power Embedded Systems + + +
+ This paper introduces Multi-Resolution Rescored Byte-Track (MR2-ByteTrack), a +novel video object detection framework for ultra-low-power embedded processors. +This method reduces the average compute load of an off-the-shelf Deep Neural +Network (DNN) based object detector by up to 2.25$\times$ by alternating the +processing of high-resolution images (320$\times$320 pixels) with multiple +down-sized frames (192$\times$192 pixels). To tackle the accuracy degradation +due to the reduced image input size, MR2-ByteTrack correlates the output +detections over time using the ByteTrack tracker and corrects potential +misclassification using a novel probabilistic Rescore algorithm. By +interleaving two down-sized images for every high-resolution one as the input +of different state-of-the-art DNN object detectors with our MR2-ByteTrack, we +demonstrate an average accuracy increase of 2.16% and a latency reduction of +43% on the GAP9 microcontroller compared to a baseline frame-by-frame inference +scheme using exclusively full-resolution images. Code available at: +https://github.com/Bomps4/Multi_Resolution_Rescored_ByteTrack + +
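The interleaving policy amounts to a simple schedule like the sketch below (resolutions taken from the abstract; the function name is illustrative):

```python
def input_resolution(frame_idx, period=3):
    """One 320x320 frame followed by two 192x192 frames, repeating."""
    return (320, 320) if frame_idx % period == 0 else (192, 192)

# Example: frames 0, 3, 6, ... run at full resolution; the rest run down-sized,
# and the tracker plus rescoring step recovers detections missed at low resolution.
```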
+
+ comment: 9 pages, 3 figures Accepted for publication at the Embedded Vision + Workshop of the Computer Vision and Pattern Recognition conference, Seattle, + 2024 +
+
+
+
+
+ + ☆ AdaIR: Exploiting Underlying Similarities of Image Restoration Tasks + with Adapters + + +
+ Existing image restoration approaches typically employ extensive networks +specifically trained for designated degradations. Despite being effective, such +methods inevitably entail considerable storage costs and computational +overheads due to the reliance on task-specific networks. In this work, we go +beyond this well-established framework and exploit the inherent commonalities +among image restoration tasks. The primary objective is to identify components +that are shareable across restoration tasks and augment the shared components +with modules specifically trained for individual tasks. Towards this goal, we +propose AdaIR, a novel framework that enables low storage cost and efficient +training without sacrificing performance. Specifically, a generic restoration +network is first constructed through self-supervised pre-training using +synthetic degradations. Subsequent to the pre-training phase, adapters are +trained to adapt the pre-trained network to specific degradations. AdaIR +requires solely the training of lightweight, task-specific modules, ensuring a +more efficient storage and training regimen. We have conducted extensive +experiments to validate the effectiveness of AdaIR and analyze the influence of +the pre-training strategy on discovering shareable components. Extensive +experimental results show that AdaIR achieves outstanding results on multi-task +restoration while utilizing significantly fewer parameters (1.9 MB) and less +training time (7 hours) for each restoration task. The source codes and trained +models will be released. + +
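A generic bottleneck adapter conveys the idea of training only small task-specific modules on top of a frozen shared restoration backbone; this is a standard adapter sketch under that assumption, not AdaIR's exact design:

```python
import torch
import torch.nn as nn

class ConvAdapter(nn.Module):
    """Lightweight residual adapter: only these parameters are updated for a new
    degradation type; the shared pre-trained backbone stays frozen."""
    def __init__(self, channels, bottleneck=16):
        super().__init__()
        self.down = nn.Conv2d(channels, bottleneck, kernel_size=1)
        self.up = nn.Conv2d(bottleneck, channels, kernel_size=1)
        nn.init.zeros_(self.up.weight)          # start as an identity mapping
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))
```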
+
+
+
+
+ + ☆ Towards Highly Realistic Artistic Style Transfer via Stable Diffusion + with Step-aware and Layer-aware Prompt IJCAI2024 + + +
+ Artistic style transfer aims to transfer the learned artistic style onto an +arbitrary content image, generating artistic stylized images. Existing +generative adversarial network-based methods fail to generate highly realistic +stylized images and always introduce obvious artifacts and disharmonious +patterns. Recently, large-scale pre-trained diffusion models opened up a new +way for generating highly realistic artistic stylized images. However, +diffusion model-based methods generally fail to preserve the content structure +of input content images well, introducing some undesired content structure and +style patterns. To address the above problems, we propose a novel pre-trained +diffusion-based artistic style transfer method, called LSAST, which can +generate highly realistic artistic stylized images while preserving the content +structure of input content images well, without bringing obvious artifacts and +disharmonious style patterns. Specifically, we introduce a Step-aware and +Layer-aware Prompt Space, a set of learnable prompts, which can learn the style +information from the collection of artworks and dynamically adjust the input +images' content structure and style pattern. To train our prompt space, we +propose a novel inversion method, called Step-aware and Layer-aware Prompt +Inversion, which allows the prompt space to learn the style information of the +artworks collection. In addition, we inject a pre-trained conditional branch of +ControlNet into our LSAST, which further improves our framework's ability to +maintain content structure. Extensive experiments demonstrate that our proposed +method can generate more highly realistic artistic stylized images than the +state-of-the-art artistic style transfer methods. + +
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ☆ Using Game Engines and Machine Learning to Create Synthetic Satellite + Imagery for a Tabletop Verification Exercise + + +
+ Satellite imagery is regarded as a great opportunity for citizen-based +monitoring of activities of interest. Relevant imagery may however not be +available at sufficiently high resolution, quality, or cadence -- let alone be +uniformly accessible to open-source analysts. This limits an assessment of the +true long-term potential of citizen-based monitoring of nuclear activities +using publicly available satellite imagery. In this article, we demonstrate how +modern game engines combined with advanced machine-learning techniques can be +used to generate synthetic imagery of sites of interest with the ability to +choose relevant parameters upon request; these include time of day, cloud +cover, season, or level of activity onsite. At the same time, resolution and +off-nadir angle can be adjusted to simulate different characteristics of the +satellite. While there are several possible use-cases for synthetic imagery, +here we focus on its usefulness to support tabletop exercises in which simple +monitoring scenarios can be examined to better understand verification +capabilities enabled by new satellite constellations and very short revisit +times. + +
+
+ comment: Annual Meeting of the Institute of Nuclear Materials Management + (INMM), Vienna +
+
+
+
+
+ + ☆ Octopus v3: Technical Report for On-device Sub-billion Multimodal AI + Agent + + +
+ A multimodal AI agent is characterized by its ability to process and learn +from various types of data, including natural language, visual, and audio +inputs, to inform its actions. Despite advancements in large language models +that incorporate visual data, such as GPT-4V, effectively translating +image-based data into actionable outcomes for AI agents continues to be +challenging. In this paper, we introduce a multimodal model that incorporates +the concept of functional tokens, specifically designed for AI agent +applications. To ensure compatibility with edge devices, our model is optimized +to a compact size of less than 1B parameters. Like GPT-4, our model can process +both English and Chinese. We demonstrate that this model is capable of +operating efficiently on a wide range of edge devices, including devices as constrained +as a Raspberry Pi. + +
+
+
+
+
+ + ☆ CarcassFormer: An End-to-end Transformer-based Framework for + Simultaneous Localization, Segmentation and Classification of Poultry Carcass + Defect + + +
+ In the food industry, assessing the quality of poultry carcasses during +processing is a crucial step. This study proposes an effective approach for +automating the assessment of carcass quality without requiring skilled labor or +inspector involvement. The proposed system is based on machine learning (ML) +and computer vision (CV) techniques, enabling automated defect detection and +carcass quality assessment. To this end, an end-to-end framework called +CarcassFormer is introduced. It is built upon a Transformer-based architecture +designed to effectively extract visual representations while simultaneously +detecting, segmenting, and classifying poultry carcass defects. Our proposed +framework is capable of analyzing imperfections resulting from production and +transport welfare issues, as well as processing plant stunner, scalder, picker, +and other equipment malfunctions. To benchmark the framework, a dataset of +7,321 images was initially acquired, which contained both single and multiple +carcasses per image. In this study, the performance of the CarcassFormer system +is compared with other state-of-the-art (SOTA) approaches for +classification, detection, and segmentation tasks. Through extensive +quantitative experiments, our framework consistently outperforms existing +methods, demonstrating remarkable improvements across various evaluation +metrics such as AP, AP@50, and AP@75. Furthermore, the qualitative results +highlight the strengths of CarcassFormer in capturing fine details, including +feathers, and accurately localizing and segmenting carcasses with high +precision. To facilitate further research and collaboration, the pre-trained +model and source code of CarcassFormer are available for research purposes at: +\url{https://github.com/UARK-AICV/CarcassFormer}. + +
+
+ comment: Accepted to Poultry Science Journal +
+
+
+
+
+ + ☆ Explainable Lung Disease Classification from Chest X-Ray Images + Utilizing Deep Learning and XAI + + +
+ Lung diseases remain a critical global health concern, and it's crucial to +have accurate and quick ways to diagnose them. This work focuses on classifying +different lung diseases into five groups: viral pneumonia, bacterial pneumonia, +COVID, tuberculosis, and normal lungs. Employing advanced deep learning +techniques, we explore a diverse range of models including CNNs, hybrid models, +ensembles, transformers, and Big Transfer. The research encompasses +comprehensive methodologies such as hyperparameter tuning, stratified k-fold +cross-validation, and transfer learning with fine-tuning. Remarkably, our +findings reveal that the Xception model, fine-tuned through 5-fold +cross-validation, achieves the highest accuracy of 96.21\%. This success shows +that our methods work well in accurately identifying different lung diseases. +The exploration of explainable artificial intelligence (XAI) methodologies +further enhances our understanding of the decision-making processes employed by +these models, contributing to increased trust in their clinical applications. + +
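The stratified 5-fold protocol mentioned above can be set up with scikit-learn; the label array and fold usage shown here are placeholders for illustration:

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

def make_folds(labels, n_splits=5, seed=42):
    """Stratified folds preserving the balance of the five disease classes."""
    labels = np.asarray(labels)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # X is only used for its number of samples, so a dummy array suffices here.
    return list(skf.split(np.zeros(len(labels)), labels))

# Example: fine-tune and evaluate the model once per (train_idx, val_idx) pair,
# then report the mean accuracy across the five folds.
```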
+
+
+
+
+ + ☆ SPAMming Labels: Efficient Annotations for the Trackers of Tomorrow + + +
+ Increasing the annotation efficiency of trajectory annotations from videos +has the potential to enable the next generation of data-hungry tracking +algorithms to thrive on large-scale datasets. Despite the importance of this +task, there are currently very few works exploring how to efficiently label +tracking datasets comprehensively. In this work, we introduce SPAM, a tracking +data engine that provides high-quality labels with minimal human intervention. +SPAM is built around two key insights: i) most tracking scenarios can be easily +resolved. To take advantage of this, we utilize a pre-trained model to generate +high-quality pseudo-labels, reserving human involvement for a smaller subset of +more difficult instances; ii) handling the spatiotemporal dependencies of track +annotations across time can be elegantly and efficiently formulated through +graphs. Therefore, we use a unified graph formulation to address the annotation +of both detections and identity association for tracks across time. Based on +these insights, SPAM produces high-quality annotations with a fraction of +ground truth labeling cost. We demonstrate that trackers trained on SPAM labels +achieve comparable performance to those trained on human annotations while +requiring only 3-20% of the human labeling effort. Hence, SPAM paves the way +towards highly efficient labeling of large-scale tracking datasets. Our code +and models will be available upon acceptance. + +
+
+
+
+
+ + ☆ SLAIM: Robust Dense Neural SLAM for Online Tracking and Mapping + + +
+ We present SLAIM - Simultaneous Localization and Implicit Mapping. We propose +a novel coarse-to-fine tracking model tailored for Neural Radiance Field SLAM +(NeRF-SLAM) to achieve state-of-the-art tracking performance. Notably, existing +NeRF-SLAM systems consistently exhibit inferior tracking performance compared +to traditional SLAM algorithms. NeRF-SLAM methods solve camera tracking via +image alignment and photometric bundle-adjustment. Such objectives +are difficult to optimize due to the narrow basin of attraction of the +loss in image space (local minima) and the lack of initial +correspondences. We mitigate these limitations by implementing a Gaussian +pyramid filter on top of NeRF, facilitating a coarse-to-fine tracking +optimization strategy. Furthermore, NeRF systems encounter challenges in +converging to the right geometry with limited input views. While prior +approaches use a Signed-Distance Function (SDF)-based NeRF and directly +supervise SDF values by approximating ground truth SDF through depth +measurements, this often results in suboptimal geometry. In contrast, our +method employs a volume density representation and introduces a novel KL +regularizer on the ray termination distribution, constraining scene geometry to +consist of empty space and opaque surfaces. Our solution implements both local +and global bundle-adjustment to produce a robust (coarse-to-fine) and accurate +(KL regularizer) SLAM solution. We conduct experiments on multiple datasets +(ScanNet, TUM, Replica) showing state-of-the-art results in tracking and in +reconstruction accuracy. + +
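The coarse-to-fine idea can be illustrated with a simple image pyramid (average pooling standing in for a Gaussian blur); names and pyramid depth are illustrative assumptions, not SLAIM's implementation:

```python
import torch.nn.functional as F

def image_pyramid(img, levels=3):
    """Blurred, progressively downsampled copies of rendered and observed images;
    tracking starts at the coarsest level to widen the basin of attraction of the
    photometric alignment loss, then refines at finer levels."""
    pyramid = [img]                              # img: (B, C, H, W)
    for _ in range(levels - 1):
        pyramid.append(F.avg_pool2d(pyramid[-1], kernel_size=2))
    return pyramid[::-1]                         # coarsest level first
```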
+
+
+
+
+ + ☆ Neural Shrödinger Bridge Matching for Pansharpening + + +
+ Recent diffusion probabilistic models (DPM) in the field of pansharpening +have been gradually gaining attention and have achieved state-of-the-art (SOTA) +performance. In this paper, we identify shortcomings in directly applying DPMs +to the task of pansharpening as an inverse problem: 1) initiating sampling +directly from Gaussian noise neglects the low-resolution multispectral image +(LRMS) as a prior; 2) low sampling efficiency often necessitates a higher +number of sampling steps. We first reformulate pansharpening into the +stochastic differential equation (SDE) form of an inverse problem. Building +upon this, we propose a Schrödinger bridge matching method that addresses +both issues. + We design an efficient deep neural network architecture tailored for the +proposed SB matching. + In comparison to the well-established DL-regressive-based framework and the +recent DPM framework, our method demonstrates SOTA performance with fewer +sampling steps. Moreover, we discuss the relationship between SB matching and +other methods based on SDEs and ordinary differential equations (ODEs), as well +as its connection with optimal transport. + Code will be available. + +
+
+
+
+
+ + ☆ RainyScape: Unsupervised Rainy Scene Reconstruction using Decoupled + Neural Rendering + + +
+ We propose RainyScape, an unsupervised framework for reconstructing clean +scenes from a collection of multi-view rainy images. RainyScape consists of two +main modules: a neural rendering module and a rain-prediction module that +incorporates a predictor network and a learnable latent embedding that captures +the rain characteristics of the scene. Specifically, based on the spectral bias +property of neural networks, we first optimize the neural rendering pipeline to +obtain a low-frequency scene representation. Subsequently, we jointly optimize +the two modules, driven by the proposed adaptive direction-sensitive +gradient-based reconstruction loss, which encourages the network to distinguish +between scene details and rain streaks, facilitating the propagation of +gradients to the relevant components. Extensive experiments on both the classic +neural radiance field and the recently proposed 3D Gaussian splatting +demonstrate the superiority of our method in effectively eliminating rain +streaks and rendering clean images, achieving state-of-the-art performance. The +constructed high-quality dataset and source code will be publicly available. + +
+
+
+
+
+ + ☆ Text-controlled Motion Mamba: Text-Instructed Temporal Grounding of + Human Motion + + +
+ Human motion understanding is a fundamental task with diverse practical +applications, facilitated by the availability of large-scale motion capture +datasets. Recent studies focus on text-motion tasks, such as text-based motion +generation, editing and question answering. In this study, we introduce the +novel task of text-based human motion grounding (THMG), aimed at precisely +localizing temporal segments corresponding to given textual descriptions within +untrimmed motion sequences. Capturing global temporal information is crucial +for the THMG task. However, transformer-based models that rely on global +temporal self-attention face challenges when handling long untrimmed sequences +due to the quadratic computational cost. We address these challenges by +proposing Text-controlled Motion Mamba (TM-Mamba), a unified model that +integrates temporal global context, language query control, and spatial graph +topology with only linear memory cost. The core of the model is a +text-controlled selection mechanism which dynamically incorporates global +temporal information based on text query. The model is further enhanced to be +topology-aware through the integration of relational embeddings. For +evaluation, we introduce BABEL-Grounding, the first text-motion dataset that +provides detailed textual descriptions of human actions along with their +corresponding temporal segments. Extensive evaluations demonstrate the +effectiveness of TM-Mamba on BABEL-Grounding. + +
+
+
+
+
+ + ☆ Boosting Medical Image Segmentation Performance with Adaptive + Convolution Layer + + +
+ Medical image segmentation plays a vital role in various clinical applications, enabling accurate delineation and analysis of anatomical structures or pathological regions. Traditional CNNs have achieved remarkable success in this field. However, they often rely on fixed kernel sizes, which can limit their performance and adaptability in medical images where features exhibit diverse scales and configurations due to variability in equipment, target sizes, and expert interpretations. In this paper, we propose an adaptive layer placed ahead of leading deep-learning models such as UCTransNet, which dynamically adjusts the kernel size based on the local context of the input image. By adaptively capturing and fusing features at multiple scales, our approach enhances the network's ability to handle diverse anatomical structures and subtle image details, even for recent high-performing architectures that internally implement intra-scale modules, such as UCTransNet. Extensive experiments are conducted on benchmark medical image datasets to evaluate the effectiveness of our proposal. It consistently outperforms traditional CNNs with fixed kernel sizes while using a similar number of parameters, achieving superior segmentation Accuracy, Dice, and IoU on popular datasets such as SegPC2021 and ISIC2018. The model and data are published in an open-source repository, ensuring transparency and reproducibility of our promising results. + &#10;
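+ The abstract does not detail how the adaptive layer adjusts its kernel size. One plausible, hedged sketch is a layer that runs several kernel sizes in parallel and fuses them with a per-pixel gate predicted from the local context; the branch sizes and gating design below are assumptions, not the paper's layer.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class AdaptiveKernelConv(nn.Module):
+     """Illustrative sketch: parallel convolutions with different kernel sizes,
+     fused by a per-pixel softmax gate predicted from the input context."""
+     def __init__(self, in_ch, out_ch, kernel_sizes=(3, 5, 7)):
+         super().__init__()
+         self.branches = nn.ModuleList(
+             nn.Conv2d(in_ch, out_ch, k, padding=k // 2) for k in kernel_sizes
+         )
+         # The gate predicts one weight map per branch from the local context.
+         self.gate = nn.Conv2d(in_ch, len(kernel_sizes), kernel_size=3, padding=1)
+ 
+     def forward(self, x):
+         weights = torch.softmax(self.gate(x), dim=1)                # (B, n_branches, H, W)
+         feats = torch.stack([b(x) for b in self.branches], dim=1)   # (B, n_branches, C, H, W)
+         return (weights.unsqueeze(2) * feats).sum(dim=1)            # weighted fusion
+ ```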
+
+
+
+
+ + ☆ DeblurGS: Gaussian Splatting for Camera Motion Blur + + +
+ Although significant progress has been made in reconstructing sharp 3D scenes +from motion-blurred images, a transition to real-world applications remains +challenging. The primary obstacle stems from the severe blur which leads to +inaccuracies in the acquisition of initial camera poses through +Structure-from-Motion, a critical aspect often overlooked by previous +approaches. To address this challenge, we propose DeblurGS, a method to +optimize sharp 3D Gaussian Splatting from motion-blurred images, even with the +noisy camera pose initialization. We restore a fine-grained sharp scene by +leveraging the remarkable reconstruction capability of 3D Gaussian Splatting. +Our approach estimates the 6-Degree-of-Freedom camera motion for each blurry +observation and synthesizes corresponding blurry renderings for the +optimization process. Furthermore, we propose Gaussian Densification Annealing +strategy to prevent the generation of inaccurate Gaussians at erroneous +locations during the early training stages when camera motion is still +imprecise. Comprehensive experiments demonstrate that our DeblurGS achieves +state-of-the-art performance in deblurring and novel view synthesis for +real-world and synthetic benchmark datasets, as well as field-captured blurry +smartphone videos. + +
+
+
+
+
+ + ☆ Detector Collapse: Backdooring Object Detection to Catastrophic Overload + or Blindness IJCAI-24 + + +
+ Object detection tasks, crucial in safety-critical systems like autonomous driving, focus on pinpointing object locations. These detectors are known to be susceptible to backdoor attacks. However, existing backdoor techniques have primarily been adapted from classification tasks, overlooking deeper vulnerabilities specific to object detection. This paper is dedicated to bridging this gap by introducing Detector Collapse (DC), a brand-new backdoor attack paradigm tailored for object detection. DC is designed to instantly incapacitate detectors (i.e., severely impairing the detector's performance and culminating in a denial-of-service). To this end, we develop two innovative attack schemes: Sponge for triggering widespread misidentifications and Blinding for rendering objects invisible. Remarkably, we introduce a novel poisoning strategy exploiting natural objects, enabling DC to act as a practical backdoor in real-world environments. Our experiments on different detectors across several benchmarks show a significant improvement ($\sim$10\%-60\% absolute and $\sim$2-7$\times$ relative) in attack efficacy over state-of-the-art attacks. + &#10;
+
+ comment: Accepted by IJCAI-24 +
+
+
+
+
+ + ☆ Consisaug: A Consistency-based Augmentation for Polyp Detection in + Endoscopy Image Analysis + + +
+ Colorectal cancer (CRC), which frequently originates from initially benign polyps, remains a significant contributor to global cancer-related mortality. Early and accurate detection of these polyps via colonoscopy is crucial for CRC prevention. However, traditional colonoscopy methods depend heavily on the operator's experience, leading to suboptimal polyp detection rates. Moreover, public databases are limited in polyp size and shape diversity. To enhance the available data for polyp detection, we introduce Consisaug, an innovative and effective methodology to augment data that leverages deep learning. We utilize the constraint that when the image is flipped, the class labels should remain equal and the bounding boxes should be consistent. We implement our Consisaug on five public polyp datasets with three backbones, and the results show the effectiveness of our method. + &#10;
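+ A minimal sketch of the flip constraint described above: the image is flipped horizontally, the class labels stay identical, and the bounding boxes are mirrored consistently. The (x1, y1, x2, y2) pixel-coordinate box format is an assumption.
+ ```python
+ import numpy as np
+ 
+ def hflip_with_boxes(image, boxes, labels):
+     """Horizontally flip an (H, W, C) image and mirror its boxes so labels are
+     unchanged and boxes remain consistent with the flipped content."""
+     h, w = image.shape[:2]
+     flipped = image[:, ::-1].copy()
+     boxes = np.asarray(boxes, dtype=np.float32)
+     x1, x2 = boxes[:, 0].copy(), boxes[:, 2].copy()
+     boxes[:, 0] = w - x2           # new left edge after mirroring
+     boxes[:, 2] = w - x1           # new right edge after mirroring
+     return flipped, boxes, labels  # labels are unchanged by the flip
+ ```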
+
+ comment: MLMI 2023 +
+
+
+
+
+ + ☆ Best Practices for a Handwritten Text Recognition System + + +
+ Handwritten text recognition has developed rapidly in recent years, following the rise of deep learning and its applications. Though deep learning methods provide a notable boost in performance concerning text recognition, non-trivial deviations in performance can be detected even when small pre-processing or architectural/optimization elements are changed. This work follows a ``best practice'' rationale, highlighting simple yet effective empirical practices that can further help training and provide well-performing handwritten text recognition systems. Specifically, we consider three basic aspects of a deep HTR system and propose simple yet effective solutions: 1) retain the aspect ratio of the images in the preprocessing step, 2) use max-pooling for converting the 3D feature map of the CNN output into a sequence of features and 3) assist the training procedure via an additional CTC loss which acts as a shortcut on the max-pooled sequential features. Using these proposed simple modifications, one can attain close to state-of-the-art results, while considering a basic convolutional-recurrent (CNN+LSTM) architecture, for both the IAM and RIMES datasets. Code is available at https://github.com/georgeretsi/HTR-best-practices/. + &#10;
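+ As a hedged sketch of practices 2) and 3): the backbone's feature map is max-pooled over its height to produce a width-wise sequence, and an auxiliary classifier on that pooled sequence provides a second CTC-trained output that acts as a shortcut. Channel sizes and head shapes below are illustrative assumptions, not the repository's exact code.
+ ```python
+ import torch.nn as nn
+ 
+ class CNNLSTMWithCTCShortcut(nn.Module):
+     """Sketch: max-pool (B, C, H, W) features over H into a (B, W, C) sequence,
+     run a BiLSTM head, and attach an auxiliary head on the pooled sequence so an
+     extra CTC loss can supervise it directly."""
+     def __init__(self, cnn, feat_ch=256, hidden=256, num_classes=80):
+         super().__init__()
+         self.cnn = cnn                                            # any backbone -> (B, C, H, W)
+         self.shortcut_head = nn.Linear(feat_ch, num_classes)      # CTC shortcut on pooled features
+         self.rnn = nn.LSTM(feat_ch, hidden, num_layers=2,
+                            bidirectional=True, batch_first=True)
+         self.head = nn.Linear(2 * hidden, num_classes)
+ 
+     def forward(self, images):
+         fmap = self.cnn(images)                                   # (B, C, H, W)
+         seq = fmap.max(dim=2).values.permute(0, 2, 1)             # max over H -> (B, W, C)
+         main_logits = self.head(self.rnn(seq)[0])                 # (B, W, num_classes)
+         shortcut_logits = self.shortcut_head(seq)                 # (B, W, num_classes)
+         return main_logits, shortcut_logits                       # both trained with CTC loss
+ ```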
+
+
+
+
+ + ☆ Vision-based control for landing an aerial vehicle on a marine vessel + + +
+ This work addresses the landing problem of an aerial vehicle, exemplified by +a simple quadrotor, on a moving platform using image-based visual servo +control. First, the mathematical model of the quadrotor aircraft is introduced, +followed by the design of the inner-loop control. At the second stage, the +image features on the textured target plane are exploited to derive a +vision-based control law. The image of the spherical centroid of a set of +landmarks present in the landing target is used as a position measurement, +whereas the translational optical flow is used as velocity measurement. The +kinematics of the vision-based system is expressed in terms of the observable +features, and the proposed control law guarantees convergence without +estimating the unknown distance between the vision system and the target, which +is also guaranteed to remain strictly positive, avoiding undesired collisions. +The performance of the proposed control law is evaluated in MATLAB and 3-D +simulation software Gazebo. Simulation results for a quadrotor UAV are provided +for different velocity profiles of the moving target, showcasing the robustness +of the proposed controller. + +
+
+
+
+
+ + ☆ SoccerNet Game State Reconstruction: End-to-End Athlete Tracking and + Identification on a Minimap + + +
+ Tracking and identifying athletes on the pitch holds a central role in +collecting essential insights from the game, such as estimating the total +distance covered by players or understanding team tactics. This tracking and +identification process is crucial for reconstructing the game state, defined by +the athletes' positions and identities on a 2D top-view of the pitch, (i.e. a +minimap). However, reconstructing the game state from videos captured by a +single camera is challenging. It requires understanding the position of the +athletes and the viewpoint of the camera to localize and identify players +within the field. In this work, we formalize the task of Game State +Reconstruction and introduce SoccerNet-GSR, a novel Game State Reconstruction +dataset focusing on football videos. SoccerNet-GSR is composed of 200 video +sequences of 30 seconds, annotated with 9.37 million line points for pitch +localization and camera calibration, as well as over 2.36 million athlete +positions on the pitch with their respective role, team, and jersey number. +Furthermore, we introduce GS-HOTA, a novel metric to evaluate game state +reconstruction methods. Finally, we propose and release an end-to-end baseline +for game state reconstruction, bootstrapping the research on this task. Our +experiments show that GSR is a challenging novel task, which opens the field +for future research. Our dataset and codebase are publicly available at +https://github.com/SoccerNet/sn-gamestate. + +
+
+
+
+
+ + ☆ Following the Human Thread in Social Navigation + + +
+ The success of collaboration between humans and robots in shared environments +relies on the robot's real-time adaptation to human motion. Specifically, in +Social Navigation, the agent should be close enough to assist but ready to back +up to let the human move freely, avoiding collisions. Human trajectories emerge +as crucial cues in Social Navigation, but they are partially observable from +the robot's egocentric view and computationally complex to process. + We propose the first Social Dynamics Adaptation model (SDA) based on the +robot's state-action history to infer the social dynamics. We propose a +two-stage Reinforcement Learning framework: the first learns to encode the +human trajectories into social dynamics and learns a motion policy conditioned +on this encoded information, the current status, and the previous action. Here, +the trajectories are fully visible, i.e., assumed as privileged information. In +the second stage, the trained policy operates without direct access to +trajectories. Instead, the model infers the social dynamics solely from the +history of previous actions and statuses in real-time. Tested on the novel +Habitat 3.0 platform, SDA sets a novel state of the art (SoA) performance in +finding and following humans. + +
+
+
+
+
+ + ☆ Single-temporal Supervised Remote Change Detection for Domain + Generalization + + +
+ Change detection is widely applied in remote sensing image analysis. Existing methods require training models separately for each dataset, which leads to poor domain generalization. Moreover, these methods rely heavily on large amounts of high-quality pair-labelled data for training, which is expensive and impractical. In this paper, we propose a multimodal contrastive learning framework (ChangeCLIP) based on visual-language pre-training for change detection domain generalization. Additionally, we propose a dynamic context optimization for prompt learning. Meanwhile, to address the data dependency issue of existing methods, we introduce a single-temporal and controllable AI-generated training strategy (SAIN). This allows us to train the model using a large number of single-temporal images without image pairs in the real world, achieving excellent generalization. Extensive experiments on a series of real change detection datasets validate the superiority and strong generalization of ChangeCLIP, outperforming state-of-the-art change detection methods. Code will be available. + &#10;
+
+
+
+
+ + ☆ VBR: A Vision Benchmark in Rome ICRA 2024 + + +
+ This paper presents a vision and perception research dataset collected in +Rome, featuring RGB data, 3D point clouds, IMU, and GPS data. We introduce a +new benchmark targeting visual odometry and SLAM, to advance the research in +autonomous robotics and computer vision. This work complements existing +datasets by simultaneously addressing several issues, such as environment +diversity, motion patterns, and sensor frequency. It uses up-to-date devices +and presents effective procedures to accurately calibrate the intrinsic and +extrinsic of the sensors while addressing temporal synchronization. During +recording, we cover multi-floor buildings, gardens, urban and highway +scenarios. Combining handheld and car-based data collections, our setup can +simulate any robot (quadrupeds, quadrotors, autonomous vehicles). The dataset +includes an accurate 6-dof ground truth based on a novel methodology that +refines the RTK-GPS estimate with LiDAR point clouds through Bundle Adjustment. +All sequences divided in training and testing are accessible through our +website. + +
+
+ comment: Accepted at IEEE ICRA 2024 Website: + https://rvp-group.net/datasets/slam.html +
+
+
+
+
+ + ☆ Leveraging Fine-Grained Information and Noise Decoupling for Remote + Sensing Change Detection + + +
+ Change detection aims to identify remote sensing object changes by analyzing data between bitemporal image pairs. Due to the large temporal and spatial span of data collection in change detection image pairs, there is often a significant amount of task-specific and task-agnostic noise. Previous efforts have focused excessively on denoising, at the cost of a great deal of fine-grained information. In this paper, we revisit the importance of fine-grained features in change detection and propose a series of operations for fine-grained information compensation and noise decoupling (FINO). First, the context is utilized to compensate for the fine-grained information in the feature space. Next, a shape-aware and a brightness-aware module are designed to improve the capacity for representation learning. The shape-aware module guides the backbone network in extracting object shape features for more precise shape estimation. The brightness-aware module learns an overall brightness estimation to improve the model's robustness to task-agnostic noise. Finally, a task-specific noise decoupling structure is designed as a way to improve the model's ability to separate noise interference from feature similarity. With these training schemes, our proposed method achieves new state-of-the-art (SOTA) results on multiple change detection benchmarks. The code will be made available. + &#10;
+
+
+
+
+ + ☆ Improving Composed Image Retrieval via Contrastive Learning with Scaling + Positives and Negatives + + +
+ The Composed Image Retrieval (CIR) task aims to retrieve target images using +a composed query consisting of a reference image and a modified text. Advanced +methods often utilize contrastive learning as the optimization objective, which +benefits from adequate positive and negative examples. However, the triplet for +CIR incurs high manual annotation costs, resulting in limited positive +examples. Furthermore, existing methods commonly use in-batch negative +sampling, which reduces the negative number available for the model. To address +the problem of lack of positives, we propose a data generation method by +leveraging a multi-modal large language model to construct triplets for CIR. To +introduce more negatives during fine-tuning, we design a two-stage fine-tuning +framework for CIR, whose second stage introduces plenty of static +representations of negatives to optimize the representation space rapidly. The +above two improvements can be effectively stacked and designed to be +plug-and-play, easily applied to existing CIR models without changing their +original architectures. Extensive experiments and ablation analysis demonstrate +that our method effectively scales positives and negatives and achieves +state-of-the-art results on both FashionIQ and CIRR datasets. In addition, our +methods also perform well in zero-shot composed image retrieval, providing a +new CIR solution for the low-resources scenario. + +
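+ The abstract leaves the second-stage objective at a high level. One way to read "plenty of static representations of negatives" is an InfoNCE-style loss whose negative set concatenates the in-batch targets with a frozen bank of precomputed embeddings; the sketch below is that reading only, not the paper's exact loss.
+ ```python
+ import torch
+ import torch.nn.functional as F
+ 
+ def contrastive_with_static_negatives(query, target, static_neg_bank, tau=0.07):
+     """Illustrative InfoNCE-style loss: positives are the matched targets on the
+     diagonal, negatives are the other in-batch targets plus a large static bank
+     of frozen embeddings (an assumption about the method)."""
+     query = F.normalize(query, dim=-1)              # (B, D) composed query features
+     target = F.normalize(target, dim=-1)            # (B, D) target image features
+     bank = F.normalize(static_neg_bank, dim=-1)     # (N, D) precomputed negatives
+     logits_batch = query @ target.t() / tau         # (B, B): diagonal = positives
+     logits_bank = query @ bank.t() / tau            # (B, N): all negatives
+     logits = torch.cat([logits_batch, logits_bank], dim=1)
+     labels = torch.arange(query.size(0), device=query.device)
+     return F.cross_entropy(logits, labels)
+ ```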
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Achieving Rotation Invariance in Convolution Operations: Shifting from + Data-Driven to Mechanism-Assured + + +
+ Achieving rotation invariance in deep neural networks without relying on data has always been a hot research topic. Intrinsic rotation invariance can enhance the model's feature representation capability, enabling better performance in tasks such as multi-orientation object recognition and detection. Based on various types of non-learnable operators, including gradient, sort, local binary pattern, maximum, etc., this paper designs a set of new convolution operations that are naturally invariant to arbitrary rotations. Unlike most previous studies, these rotation-invariant convolutions (RIConvs) have the same number of learnable parameters and a similar computational process as conventional convolution operations, allowing them to be interchangeable. Using the MNIST-Rot dataset, we first verify the invariance of these RIConvs under various rotation angles and compare their performance with previous rotation-invariant convolutional neural networks (RI-CNNs). Two types of RIConvs based on gradient operators achieve state-of-the-art results. Subsequently, we combine RIConvs with different types and depths of classic CNN backbones. Using the OuTex_00012, MTARSI, and NWPU-RESISC-45 datasets, we test their performance on texture recognition, aircraft type recognition, and remote sensing image classification tasks. The results show that RIConvs significantly improve the accuracy of these CNN backbones, especially when the training data is limited. Furthermore, we find that even with data augmentation, RIConvs can further enhance model performance. + &#10;
+
+
+
+
+ + ☆ A Semantic Segmentation-guided Approach for Ground-to-Aerial Image + Matching + + +
+ Nowadays the accurate geo-localization of ground-view images has an important role across domains as diverse as journalism, forensic analysis, transport, and Earth Observation. This work addresses the problem of matching a query ground-view image with the corresponding satellite image without GPS data. This is done by comparing the features from a ground-view image and a satellite one, innovatively leveraging the segmentation mask of the latter through a three-stream Siamese-like network. The proposed method, Semantic Align Net (SAN), focuses on limited Field-of-View (FoV) and ground panorama images (images with a FoV of 360°). The novelty lies in the fusion of satellite images in combination with their semantic segmentation masks, aimed at ensuring that the model can extract useful features and focus on the significant parts of the images. This work shows how SAN, through semantic analysis of images, improves performance on the unlabelled CVUSA dataset for all the tested FoVs. + &#10;
+
+ comment: 6 pages, 2 figures, 2 tables, Submitted to IGARSS 2024 +
+
+
+
+
+ + ☆ Learning from Unlabelled Data with Transformers: Domain Adaptation for + Semantic Segmentation of High Resolution Aerial Images + + +
+ Data from satellites or aerial vehicles are most of the times unlabelled. +Annotating such data accurately is difficult, requires expertise, and is costly +in terms of time. Even if Earth Observation (EO) data were correctly labelled, +labels might change over time. Learning from unlabelled data within a +semi-supervised learning framework for segmentation of aerial images is +challenging. In this paper, we develop a new model for semantic segmentation of +unlabelled images, the Non-annotated Earth Observation Semantic Segmentation +(NEOS) model. NEOS performs domain adaptation as the target domain does not +have ground truth semantic segmentation masks. The distribution inconsistencies +between the target and source domains are due to differences in acquisition +scenes, environment conditions, sensors, and times. Our model aligns the +learned representations of the different domains to make them coincide. The +evaluation results show that NEOS is successful and outperforms other models +for semantic segmentation of unlabelled data. + +
+
+ comment: 6 pages, 7 figures, Submitted to IGARSS 2024 +
+
+
+
+
+ + ☆ Closely Interactive Human Reconstruction with Proxemics and + Physics-Guided Adaption CVPR2024 + + +
+ Existing multi-person human reconstruction approaches mainly focus on +recovering accurate poses or avoiding penetration, but overlook the modeling of +close interactions. In this work, we tackle the task of reconstructing closely +interactive humans from a monocular video. The main challenge of this task +comes from insufficient visual information caused by depth ambiguity and severe +inter-person occlusion. In view of this, we propose to leverage knowledge from +proxemic behavior and physics to compensate the lack of visual information. +This is based on the observation that human interaction has specific patterns +following the social proxemics. Specifically, we first design a latent +representation based on Vector Quantised-Variational AutoEncoder (VQ-VAE) to +model human interaction. A proxemics and physics guided diffusion model is then +introduced to denoise the initial distribution. We design the diffusion model +as dual branch with each branch representing one individual such that the +interaction can be modeled via cross attention. With the learned priors of +VQ-VAE and physical constraint as the additional information, our proposed +approach is capable of estimating accurate poses that are also proxemics and +physics plausible. Experimental results on Hi4D, 3DPW, and CHI3D demonstrate +that our method outperforms existing approaches. The code is available at +\url{https://github.com/boycehbz/HumanInteraction}. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Training Transformer Models by Wavelet Losses Improves Quantitative and + Visual Performance in Single Image Super-Resolution + + +
+ Transformer-based models have achieved remarkable results in low-level vision tasks including image super-resolution (SR). However, early Transformer-based approaches that rely on self-attention within non-overlapping windows encounter challenges in acquiring global information. To activate more input pixels globally, hybrid attention models have been proposed. Moreover, training by solely minimizing pixel-wise RGB losses, such as L1, has been found inadequate for capturing essential high-frequency details. This paper presents two contributions: i) We introduce convolutional non-local sparse attention (NLSA) blocks to extend the hybrid transformer architecture in order to further enhance its receptive field. ii) We employ wavelet losses to train Transformer models to improve quantitative and subjective performance. While wavelet losses have been explored previously, showing their power in training Transformer-based SR models is novel. Our experimental results demonstrate that the proposed model provides state-of-the-art PSNR results as well as superior visual performance across various benchmark datasets. + &#10;
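+ Wavelet losses of the kind referred to above can be sketched as L1 distances between wavelet sub-bands of the prediction and the ground truth. The single-level Haar decomposition and the band weighting below are assumptions of this sketch, not the paper's exact loss.
+ ```python
+ import torch
+ import torch.nn.functional as F
+ 
+ def haar_dwt(x):
+     """One-level Haar decomposition of a (B, C, H, W) tensor (H, W assumed even)
+     into (LL, LH, HL, HH) sub-bands using fixed 2x2 filters."""
+     ll = torch.tensor([[0.5, 0.5], [0.5, 0.5]])
+     lh = torch.tensor([[0.5, 0.5], [-0.5, -0.5]])
+     hl = torch.tensor([[0.5, -0.5], [0.5, -0.5]])
+     hh = torch.tensor([[0.5, -0.5], [-0.5, 0.5]])
+     kernels = torch.stack([ll, lh, hl, hh]).unsqueeze(1).to(x)      # (4, 1, 2, 2)
+     b, c, h, w = x.shape
+     out = F.conv2d(x.reshape(b * c, 1, h, w), kernels, stride=2)    # (B*C, 4, H/2, W/2)
+     return out.reshape(b, c, 4, h // 2, w // 2).unbind(dim=2)
+ 
+ def wavelet_loss(sr, hr, band_weights=(0.5, 1.0, 1.0, 1.0)):
+     """L1 between Haar sub-bands, weighting high-frequency bands more heavily
+     (the weighting scheme is an assumption)."""
+     return sum(w * F.l1_loss(s, t)
+                for w, s, t in zip(band_weights, haar_dwt(sr), haar_dwt(hr)))
+ ```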
+
+ comment: total of 10 pages including references, 5 tables and 5 figures, + accepted for NTIRE 2024 Single Image Super Resolution (x4) challenge +
+
+
+
+
+ + ☆ Criteria for Uncertainty-based Corner Cases Detection in Instance + Segmentation + + +
+ The operating environment of a highly automated vehicle is subject to change, e.g., weather, illumination, or the scenario containing different objects and other participants in which the highly automated vehicle has to navigate its passengers safely. These situations must be considered when developing and validating highly automated driving functions. This already poses a problem for training and evaluating deep learning models: without the costly labeling of thousands of recordings, one does not know whether the data contains relevant, interesting cases for further model training, and it remains a guess under which conditions and situations the model performs poorly. For this purpose, we present corner case criteria based on the predictive uncertainty. With our corner case criteria, we are able to detect uncertainty-based corner cases of an object instance segmentation model without relying on ground truth (GT) data. We evaluated each corner case criterion using the COCO and the NuImages datasets to analyze the potential of our approach. We also provide a corner case decision function that allows us to distinguish each object into True Positive (TP), localization and/or classification corner case, or False Positive (FP). We also present our first results of an iterative training cycle that outperforms the baseline and where the data added to the training dataset is selected based on the corner case decision function. + &#10;
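+ The paper's concrete criteria and thresholds are not reproduced here; the following is only a hypothetical shape of such a decision function, mapping per-instance classification and localization uncertainties to TP, corner case, or FP. All thresholds and the rule ordering are assumptions.
+ ```python
+ def corner_case_decision(cls_uncertainty, loc_uncertainty,
+                          cls_thresh=0.3, loc_thresh=0.3, fp_thresh=0.8):
+     """Hypothetical uncertainty-based decision function: returns one of
+     'TP', 'classification_corner_case', 'localization_corner_case', 'FP'."""
+     if cls_uncertainty >= fp_thresh and loc_uncertainty >= fp_thresh:
+         return "FP"                          # uncertain in every respect: likely false positive
+     if cls_uncertainty >= cls_thresh:
+         return "classification_corner_case"  # unsure about the class
+     if loc_uncertainty >= loc_thresh:
+         return "localization_corner_case"    # unsure about the mask/box location
+     return "TP"                              # confident prediction, treated as a true positive
+ ```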
+
+
+
+
+ + ☆ The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a + Clean Model on Poisoned Data ICCV + + +
+ Recently, backdoor attacks have posed a serious security threat to the +training process of deep neural networks (DNNs). The attacked model behaves +normally on benign samples but outputs a specific result when the trigger is +present. However, compared with the rocketing progress of backdoor attacks, +existing defenses are difficult to deal with these threats effectively or +require benign samples to work, which may be unavailable in real scenarios. In +this paper, we find that the poisoned samples and benign samples can be +distinguished with prediction entropy. This inspires us to propose a novel +dual-network training framework: The Victim and The Beneficiary (V&B), which +exploits a poisoned model to train a clean model without extra benign samples. +Firstly, we sacrifice the Victim network to be a powerful poisoned sample +detector by training on suspicious samples. Secondly, we train the Beneficiary +network on the credible samples selected by the Victim to inhibit backdoor +injection. Thirdly, a semi-supervised suppression strategy is adopted for +erasing potential backdoors and improving model performance. Furthermore, to +better inhibit missed poisoned samples, we propose a strong data augmentation +method, AttentionMix, which works well with our proposed V&B framework. +Extensive experiments on two widely used datasets against 6 state-of-the-art +attacks demonstrate that our framework is effective in preventing backdoor +injection and robust to various attacks while maintaining the performance on +benign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB. + +
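+ A hedged sketch of the entropy-based separation mentioned above: compute the prediction entropy of the (victim) model for each training sample and split the set by a threshold. Which side of the threshold counts as suspicious, and the threshold value itself, are assumptions here rather than the paper's rule.
+ ```python
+ import torch
+ 
+ def split_by_prediction_entropy(logits, threshold):
+     """Split samples into (suspicious, credible) index masks by prediction entropy.
+     Here low entropy (overly confident predictions) is treated as suspicious,
+     which is an assumption about the direction of the criterion."""
+     probs = torch.softmax(logits, dim=-1)
+     entropy = -(probs * torch.log(probs + 1e-12)).sum(dim=-1)   # (N,)
+     suspicious = entropy < threshold
+     return suspicious, ~suspicious
+ ```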
+
+ comment: 13 pages, 6 figures, published to ICCV +
+
+
+
+
+ + ☆ MMCBE: Multi-modality Dataset for Crop Biomass Estimation and Beyond + + +
+ Crop biomass, a critical indicator of plant growth, health, and productivity, +is invaluable for crop breeding programs and agronomic research. However, the +accurate and scalable quantification of crop biomass remains inaccessible due +to limitations in existing measurement methods. One of the obstacles impeding +the advancement of current crop biomass prediction methodologies is the +scarcity of publicly available datasets. Addressing this gap, we introduce a +new dataset in this domain, i.e. Multi-modality dataset for crop biomass +estimation (MMCBE). Comprising 216 sets of multi-view drone images, coupled +with LiDAR point clouds, and hand-labelled ground truth, MMCBE represents the +first multi-modality one in the field. This dataset aims to establish benchmark +methods for crop biomass quantification and foster the development of +vision-based approaches. We have rigorously evaluated state-of-the-art crop +biomass estimation methods using MMCBE and ventured into additional potential +applications, such as 3D crop reconstruction from drone imagery and novel-view +rendering. With this publication, we are making our comprehensive dataset +available to the broader community. + +
+
+ comment: 10 pages, 10 figures, 3 tables +
+
+
+
+
+ + ☆ A Progressive Framework of Vision-language Knowledge Distillation and + Alignment for Multilingual Scene + + +
+ Pre-trained vision-language (V-L) models such as CLIP have shown excellent +performance in many downstream cross-modal tasks. However, most of them are +only applicable to the English context. Subsequent research has focused on this +problem and proposed improved models, such as CN-CLIP and AltCLIP, to +facilitate their applicability to Chinese and even other languages. +Nevertheless, these models suffer from high latency and a large memory +footprint in inference, which limits their further deployment on +resource-constrained edge devices. In this work, we propose a conceptually +simple yet effective multilingual CLIP Compression framework and train a +lightweight multilingual vision-language model, called DC-CLIP, for both +Chinese and English context. In this framework, we collect high-quality Chinese +and English text-image pairs and design two training stages, including +multilingual vision-language feature distillation and alignment. During the +first stage, lightweight image/text student models are designed to learn robust +visual/multilingual textual feature representation ability from corresponding +teacher models, respectively. Subsequently, the multilingual vision-language +alignment stage enables effective alignment of visual and multilingual textual +features to further improve the model's multilingual performance. Comprehensive +experiments in zero-shot image classification, conducted based on the ELEVATER +benchmark, showcase that DC-CLIP achieves superior performance in the English +context and competitive performance in the Chinese context, even with less +training data, when compared to existing models of similar parameter magnitude. +The evaluation demonstrates the effectiveness of our designed training +mechanism. + +
+
+
+
+
+ + ☆ Optical Image-to-Image Translation Using Denoising Diffusion Models: + Heterogeneous Change Detection as a Use Case + + +
+ We introduce an innovative deep learning-based method that uses a denoising +diffusion-based model to translate low-resolution images to high-resolution +ones from different optical sensors while preserving the contents and avoiding +undesired artifacts. The proposed method is trained and tested on a large and +diverse data set of paired Sentinel-II and Planet Dove images. We show that it +can solve serious image generation issues observed when the popular +classifier-free guided Denoising Diffusion Implicit Model (DDIM) framework is +used in the task of Image-to-Image Translation of multi-sensor optical remote +sensing images and that it can generate large images with highly consistent +patches, both in colors and in features. Moreover, we demonstrate how our +method improves heterogeneous change detection results in two urban areas: +Beirut, Lebanon, and Austin, USA. Our contributions are: i) a new training and +testing algorithm based on denoising diffusion models for optical image +translation; ii) a comprehensive image quality evaluation and ablation study; +iii) a comparison with the classifier-free guided DDIM framework; and iv) +change detection experiments on heterogeneous data. + +
+
+
+
+
+ + ☆ ONOT: a High-Quality ICAO-compliant Synthetic Mugshot Dataset + + +
+ Nowadays, state-of-the-art AI-based generative models represent a viable +solution to overcome privacy issues and biases in the collection of datasets +containing personal information, such as faces. Following this intuition, in +this paper we introduce ONOT, a synthetic dataset specifically focused on the +generation of high-quality faces in adherence to the requirements of the +ISO/IEC 39794-5 standards that, following the guidelines of the International +Civil Aviation Organization (ICAO), defines the interchange formats of face +images in electronic Machine-Readable Travel Documents (eMRTD). The strictly +controlled and varied mugshot images included in ONOT are useful in research +fields related to the analysis of face images in eMRTD, such as Morphing Attack +Detection and Face Quality Assessment. The dataset is publicly released, in +combination with the generation procedure details in order to improve the +reproducibility and enable future extensions. + +
+
+ comment: Paper accepted in IEEE FG 2024 +
+
+
+
+
+ + ☆ Energy-Efficient Uncertainty-Aware Biomass Composition Prediction at the + Edge CVPR 2024 + + +
+ Clover fixes nitrogen from the atmosphere into the soil, making grass-clover mixtures highly desirable to reduce external nitrogen fertilization. Herbage containing clover additionally promotes higher food intake, resulting in higher milk production. Herbage probing however remains largely unused as it requires a time-intensive manual laboratory analysis. Without this information, farmers are unable to perform localized clover sowing or take targeted fertilization decisions. Deep learning algorithms have been proposed with the goal to estimate the dry biomass composition from images of the grass directly in the fields. The energy-intensive nature of deep learning however limits deployment to practical edge devices such as smartphones. This paper proposes to fill this gap by applying filter pruning to reduce the energy requirement of existing deep learning solutions. We report that although pruned networks are accurate on controlled, high-quality images of the grass, they struggle to generalize to real-world smartphone images that are blurry or taken from challenging angles. We address this challenge by training filter-pruned models using a variance attenuation loss so they can predict the uncertainty of their predictions. When the uncertainty exceeds a threshold, we re-infer using a more accurate unpruned model. This hybrid approach allows us to reduce energy consumption while retaining a high accuracy. We evaluate our algorithm on two datasets, GrassClover and Irish clover, using an NVIDIA Jetson Nano edge device. We find that we reduce energy consumption with respect to state-of-the-art solutions by 50% on average, with only a 4% accuracy loss. + &#10;
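+ A variance attenuation loss is commonly instantiated as a heteroscedastic (Gaussian negative log-likelihood style) regression loss, and the hybrid inference rule can be a simple uncertainty threshold. Both appear below as a sketch under those assumptions, not as the paper's exact formulation; the (mean, log-variance) output convention is also assumed.
+ ```python
+ import torch
+ 
+ def variance_attenuation_loss(mean, log_var, target):
+     """Heteroscedastic regression loss: the network predicts a mean and a
+     log-variance per output and is penalized less where it declares uncertainty."""
+     return (0.5 * torch.exp(-log_var) * (target - mean) ** 2 + 0.5 * log_var).mean()
+ 
+ @torch.no_grad()
+ def hybrid_predict(pruned_model, full_model, x, uncertainty_threshold):
+     """Hybrid inference sketch: use the cheap pruned model, and fall back to the
+     accurate unpruned model only when predicted uncertainty exceeds a threshold."""
+     mean, log_var = pruned_model(x)               # assumed (mean, log_var) outputs
+     if log_var.exp().mean() > uncertainty_threshold:
+         mean, log_var = full_model(x)             # re-infer with the accurate model
+     return mean
+ ```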
+
+ comment: The paper has been accepted to CVPR 2024 5th Workshop on Vision for + Agriculture +
+
+
+
+
+ + ☆ Simple In-place Data Augmentation for Surveillance Object Detection CVPR + + +
+ Motivated by the need to improve model performance in traffic monitoring +tasks with limited labeled samples, we propose a straightforward augmentation +technique tailored for object detection datasets, specifically designed for +stationary camera-based applications. Our approach focuses on placing objects +in the same positions as the originals to ensure its effectiveness. By applying +in-place augmentation on objects from the same camera input image, we address +the challenge of overlapping with original and previously selected objects. +Through extensive testing on two traffic monitoring datasets, we illustrate the +efficacy of our augmentation strategy in improving model performance, +particularly in scenarios with limited labeled samples and imbalanced class +distributions. Notably, our method achieves comparable performance to models +trained on the entire dataset while utilizing only 8.5 percent of the original +data. Moreover, we report significant improvements, with mAP@.5 increasing from +0.4798 to 0.5025, and the mAP@.5:.95 rising from 0.29 to 0.3138 on the +FishEye8K dataset. These results highlight the potential of our augmentation +approach in enhancing object detection models for traffic monitoring +applications. + +
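+ A minimal sketch of in-place augmentation for a stationary camera: paste an object crop taken from the same camera back at its original position, but only when it would not overlap existing or previously pasted boxes. The IoU threshold and the (x1, y1, x2, y2) box format are assumptions.
+ ```python
+ import numpy as np
+ 
+ def iou(a, b):
+     """IoU of two (x1, y1, x2, y2) boxes."""
+     x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+     x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+     inter = max(0, x2 - x1) * max(0, y2 - y1)
+     area = lambda box: (box[2] - box[0]) * (box[3] - box[1])
+     return inter / (area(a) + area(b) - inter + 1e-9)
+ 
+ def inplace_paste(frame, frame_boxes, donor_crop, donor_box, max_iou=0.05):
+     """Paste a same-camera object crop at its original position if it does not
+     overlap existing (or previously pasted) boxes; frame_boxes is a list of boxes."""
+     if any(iou(donor_box, b) > max_iou for b in frame_boxes):
+         return frame, frame_boxes                  # skip: would overlap another object
+     x1, y1, x2, y2 = map(int, donor_box)
+     frame = frame.copy()
+     frame[y1:y2, x1:x2] = donor_crop               # same position as in the donor frame
+     return frame, frame_boxes + [donor_box]
+ ```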
+
+ comment: CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Feature Corrective Transfer Learning: End-to-End Solutions to Object + Detection in Non-Ideal Visual Conditions CVPR + + +
+ A significant challenge in the field of object detection lies in the system's +performance under non-ideal imaging conditions, such as rain, fog, low +illumination, or raw Bayer images that lack ISP processing. Our study +introduces "Feature Corrective Transfer Learning", a novel approach that +leverages transfer learning and a bespoke loss function to facilitate the +end-to-end detection of objects in these challenging scenarios without the need +to convert non-ideal images into their RGB counterparts. In our methodology, we +initially train a comprehensive model on a pristine RGB image dataset. +Subsequently, non-ideal images are processed by comparing their feature maps +against those from the initial ideal RGB model. This comparison employs the +Extended Area Novel Structural Discrepancy Loss (EANSDL), a novel loss function +designed to quantify similarities and integrate them into the detection loss. +This approach refines the model's ability to perform object detection across +varying conditions through direct feature map correction, encapsulating the +essence of Feature Corrective Transfer Learning. Experimental validation on +variants of the KITTI dataset demonstrates a significant improvement in mean +Average Precision (mAP), resulting in a 3.8-8.1% relative enhancement in +detection under non-ideal conditions compared to the baseline model, and a less +marginal performance difference within 1.3% of the mAP@[0.5:0.95] achieved +under ideal conditions by the standard Faster RCNN algorithm. + +
+
+ comment: 10 pages, 3 figures, accepted by 2024 CVPR UG2 Workshop +
+
+
+
+
+ + ☆ Prompt-Guided Generation of Structured Chest X-Ray Report Using a + Pre-trained LLM + + +
+ Medical report generation automates radiology descriptions from images, +easing the burden on physicians and minimizing errors. However, current methods +lack structured outputs and physician interactivity for clear, clinically +relevant reports. Our method introduces a prompt-guided approach to generate +structured chest X-ray reports using a pre-trained large language model (LLM). +First, we identify anatomical regions in chest X-rays to generate focused +sentences that center on key visual elements, thereby establishing a structured +report foundation with anatomy-based sentences. We also convert the detected +anatomy into textual prompts conveying anatomical comprehension to the LLM. +Additionally, the clinical context prompts guide the LLM to emphasize +interactivity and clinical requirements. By integrating anatomy-focused +sentences and anatomy/clinical prompts, the pre-trained LLM can generate +structured chest X-ray reports tailored to prompted anatomical regions and +clinical contexts. We evaluate using language generation and clinical +effectiveness metrics, demonstrating strong performance. + +
+
+ comment: Accepted by IEEE Conference on Multimedia Expo 2024 +
+
+
+
+
+ + ☆ Exploring the Transferability of Visual Prompting for Multimodal Large + Language Models CVPR 2024 + + +
+ Although Multimodal Large Language Models (MLLMs) have demonstrated promising +versatile capabilities, their performance is still inferior to specialized +models on downstream tasks, which makes adaptation necessary to enhance their +utility. However, fine-tuning methods require independent training for every +model, leading to huge computation and memory overheads. In this paper, we +propose a novel setting where we aim to improve the performance of diverse +MLLMs with a group of shared parameters optimized for a downstream task. To +achieve this, we propose Transferable Visual Prompting (TVP), a simple and +effective approach to generate visual prompts that can transfer to different +models and improve their performance on downstream tasks after trained on only +one model. We introduce two strategies to address the issue of cross-model +feature corruption of existing visual prompting methods and enhance the +transferability of the learned prompts, including 1) Feature Consistency +Alignment: which imposes constraints to the prompted feature changes to +maintain task-agnostic knowledge; 2) Task Semantics Enrichment: which +encourages the prompted images to contain richer task-specific semantics with +language guidance. We validate the effectiveness of TVP through extensive +experiments with 6 modern MLLMs on a wide variety of tasks ranging from object +recognition and counting to multimodal reasoning and hallucination correction. + +
+
+ comment: Accepted in CVPR 2024 as Poster (Highlight) +
+
+
+
+
+ + ☆ Kathakali Hand Gesture Recognition With Minimal Data + + +
+ The Indian classical dance-drama Kathakali has a set of hand gestures called +Mudras, which form the fundamental units of all its dance moves and postures. +Recognizing the depicted mudra becomes one of the first steps in its digital +processing. The work treats the problem as a 24-class classification task and +proposes a vector-similarity-based approach using pose estimation, eliminating +the need for further training or fine-tuning. This approach overcomes the +challenge of data scarcity that limits the application of AI in similar +domains. The method attains 92% accuracy which is a similar or better +performance as other model-training-based works existing in the domain, with +the added advantage that the method can still work with data sizes as small as +1 or 5 samples with a slightly reduced performance. Working with images, +videos, and even real-time streams is possible. The system can work with +hand-cropped or full-body images alike. We have developed and made public a +dataset for the Kathakali Mudra Recognition as part of this work. + +
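+ A hedged sketch of the training-free, vector-similarity classification described above: normalize the hand keypoints produced by an off-the-shelf pose estimator and compare them against one prototype vector per mudra class (e.g. the mean of 1-5 reference samples). The normalization and cosine-similarity choices are assumptions about the method.
+ ```python
+ import numpy as np
+ 
+ def normalize_pose(keypoints):
+     """Flatten and center/scale-normalize a (K, 2) array of hand keypoints so the
+     comparison is translation- and scale-invariant."""
+     pts = np.asarray(keypoints, dtype=np.float32)
+     pts = pts - pts.mean(axis=0)
+     vec = pts.flatten()
+     return vec / (np.linalg.norm(vec) + 1e-9)
+ 
+ def classify_mudra(keypoints, prototypes):
+     """Return the class whose prototype (a dict label -> normalized vector, e.g.
+     the mean of a few reference samples) is most similar to the query pose."""
+     query = normalize_pose(keypoints)
+     scores = {label: float(query @ proto) for label, proto in prototypes.items()}
+     return max(scores, key=scores.get)
+ ```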
+
+
+
+
+ + ☆ GhostNetV3: Exploring the Training Strategies for Compact Models + + +
+ Compact neural networks are specially designed for applications on edge +devices with faster inference speed yet modest performance. However, training +strategies of compact models are borrowed from that of conventional models at +present, which ignores their difference in model capacity and thus may impede +the performance of compact models. In this paper, by systematically +investigating the impact of different training ingredients, we introduce a +strong training strategy for compact models. We find that the appropriate +designs of re-parameterization and knowledge distillation are crucial for +training high-performance compact models, while some commonly used data +augmentations for training conventional models, such as Mixup and CutMix, lead +to worse performance. Our experiments on ImageNet-1K dataset demonstrate that +our specialized training strategy for compact models is applicable to various +architectures, including GhostNetV2, MobileNetV2 and ShuffleNetV2. +Specifically, equipped with our strategy, GhostNetV3 1.3$\times$ achieves a +top-1 accuracy of 79.1% with only 269M FLOPs and a latency of 14.46ms on mobile +devices, surpassing its ordinarily trained counterpart by a large margin. +Moreover, our observation can also be extended to object detection scenarios. +PyTorch code and checkpoints can be found at +https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv3_pytorch. + +
+
+
+
+
+ + ☆ Pre-processing matters: A segment search method for WSI classification + + +
+ Pre-processing for whole slide images can affect classification performance +both in the training and inference stages. Our study analyzes the impact of +pre-processing parameters on inference and training across single- and +multiple-domain datasets. However, searching for an optimal parameter set is +time-consuming. To overcome this, we propose a novel Similarity-based Simulated +Annealing approach for fast parameter tuning to enhance inference performance +on single-domain data. Our method demonstrates significant performance +improvements in accuracy, which raise accuracy from 0.512 to 0.847 in a single +domain. We further extend our insight into training performance in multi-domain +data by employing a novel Bayesian optimization to search optimal +pre-processing parameters, resulting in a high AUC of 0.967. We highlight that +better pre-processing for WSI can contribute to further accuracy improvement in +the histology area. + +
+
+
+
+
+ + ☆ Deep Portrait Quality Assessment. A NTIRE 2024 Challenge Survey CVPR + + +
+ This paper reviews the NTIRE 2024 Portrait Quality Assessment Challenge, +highlighting the proposed solutions and results. This challenge aims to obtain +an efficient deep neural network capable of estimating the perceptual quality +of real portrait photos. The methods must generalize to diverse scenes and +diverse lighting conditions (indoor, outdoor, low-light), movement, blur, and +other challenging conditions. In the challenge, 140 participants registered, +and 35 submitted results during the challenge period. The performance of the +top 5 submissions is reviewed and provided here as a gauge for the current +state-of-the-art in Portrait Quality Assessment. + +
+
+ comment: CVPRW - NTIRE 2024 +
+
+
+
+
+ + ☆ Learning SO(3)-Invariant Semantic Correspondence via Local Shape + Transform CVPR 2024 + + +
+ Establishing accurate 3D correspondences between shapes stands as a pivotal +challenge with profound implications for computer vision and robotics. However, +existing self-supervised methods for this problem assume perfect input shape +alignment, restricting their real-world applicability. In this work, we +introduce a novel self-supervised Rotation-Invariant 3D correspondence learner +with Local Shape Transform, dubbed RIST, that learns to establish dense +correspondences between shapes even under challenging intra-class variations +and arbitrary orientations. Specifically, RIST learns to dynamically formulate +an SO(3)-invariant local shape transform for each point, which maps the +SO(3)-equivariant global shape descriptor of the input shape to a local shape +descriptor. These local shape descriptors are provided as inputs to our decoder +to facilitate point cloud self- and cross-reconstruction. Our proposed +self-supervised training pipeline encourages semantically corresponding points +from different shapes to be mapped to similar local shape descriptors, enabling +RIST to establish dense point-wise correspondences. RIST demonstrates +state-of-the-art performances on 3D part label transfer and semantic keypoint +transfer given arbitrarily rotated point cloud pairs, outperforming existing +methods by significant margins. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ HybriMap: Hybrid Clues Utilization for Effective Vectorized HD Map + Construction + + +
+ Constructing vectorized high-definition maps from surround-view cameras has +garnered significant attention in recent years. However, the commonly employed +multi-stage sequential workflow in prevailing approaches often leads to the +loss of early-stage information, particularly in perspective-view features. +Usually, such loss is observed as an instance missing or shape mismatching in +the final birds-eye-view predictions. To address this concern, we propose a +novel approach, namely \textbf{HybriMap}, which effectively exploits clues from +hybrid features to ensure the delivery of valuable information. Specifically, +we design the Dual Enhancement Module, to enable both explicit integration and +implicit modification under the guidance of hybrid features. Additionally, the +perspective keypoints are utilized as supervision, further directing the +feature enhancement process. Extensive experiments conducted on existing +benchmarks have demonstrated the state-of-the-art performance of our proposed +approach. + +
+
+
+
+
+ + ☆ Multi-target and multi-stage liver lesion segmentation and detection in + multi-phase computed tomography scans + + +
+ Multi-phase computed tomography (CT) scans use contrast agents to highlight different anatomical structures within the body to improve the probability of identifying and detecting anatomical structures of interest and abnormalities such as liver lesions. Yet, detecting these lesions remains a challenging task as these lesions vary significantly in their size, shape, texture, and contrast with respect to surrounding tissue. Therefore, radiologists need to have extensive experience to be able to identify and detect these lesions. Segmentation-based neural networks can assist radiologists with this task. Current state-of-the-art lesion segmentation networks use the encoder-decoder design paradigm based on the UNet architecture where the multi-phase CT scan volume is fed to the network as a multi-channel input. Although this approach utilizes information from all the phases and outperforms single-phase segmentation networks, we demonstrate that their performance is not optimal and can be further improved by incorporating the learning from models trained on each single phase individually. Our approach comprises three stages. The first stage identifies the regions within the liver where there might be lesions at three different scales (4, 8, and 16 mm). The second stage includes the main segmentation model trained using all the phases as well as a segmentation model trained on each of the phases individually. The third stage uses the multi-phase CT volumes together with the predictions from each of the segmentation models to generate the final segmentation map. Overall, our approach improves relative liver lesion segmentation performance by 1.6% while reducing performance variability across subjects by 8% when compared to the current state-of-the-art models. + &#10;
+
+
+
+
+ + ☆ REACTO: Reconstructing Articulated Objects from a Single Video + + +
+ In this paper, we address the challenge of reconstructing general articulated +3D objects from a single video. Existing works employing dynamic neural +radiance fields have advanced the modeling of articulated objects like humans +and animals from videos, but face challenges with piece-wise rigid general +articulated objects due to limitations in their deformation models. To tackle +this, we propose Quasi-Rigid Blend Skinning, a novel deformation model that +enhances the rigidity of each part while maintaining flexible deformation of +the joints. Our primary insight combines three distinct approaches: 1) an +enhanced bone rigging system for improved component modeling, 2) the use of +quasi-sparse skinning weights to boost part rigidity and reconstruction +fidelity, and 3) the application of geodesic point assignment for precise +motion and seamless deformation. Our method outperforms previous works in +producing higher-fidelity 3D reconstructions of general articulated objects, as +demonstrated on both real and synthetic datasets. Project page: +https://chaoyuesong.github.io/REACTO. + +
+
+
+
+
+ + ☆ GeoReF: Geometric Alignment Across Shape Variation for Category-level + Object Pose Refinement + + +
+ Object pose refinement is essential for robust object pose estimation. +Previous work has made significant progress towards instance-level object pose +refinement. Yet, category-level pose refinement is a more challenging problem +due to large shape variations within a category and the discrepancies between +the target object and the shape prior. To address these challenges, we +introduce a novel architecture for category-level object pose refinement. Our +approach integrates an HS-layer and learnable affine transformations, which +aims to enhance the extraction and alignment of geometric information. +Additionally, we introduce a cross-cloud transformation mechanism that +efficiently merges diverse data sources. Finally, we push the limits of our +model by incorporating the shape prior information for translation and size +error prediction. We conducted extensive experiments to demonstrate the +effectiveness of the proposed framework. Through extensive quantitative +experiments, we demonstrate significant improvement over the baseline method by +a large margin across all metrics. + +
+
+ comment: The IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 +
+
+
+
+
+ + ☆ Fact: Teaching MLLMs with Faithful, Concise and Transferable Rationales + + +&#10;
+ The remarkable performance of Multimodal Large Language Models (MLLMs) has +unequivocally demonstrated their proficient understanding capabilities in +handling a wide array of visual tasks. Nevertheless, the opaque nature of their +black-box reasoning processes persists as an enigma, rendering them +uninterpretable and struggling with hallucination. Their ability to execute +intricate compositional reasoning tasks is also constrained, culminating in a +stagnation of learning progression for these models. In this work, we introduce +Fact, a novel paradigm designed to generate multimodal rationales that are +faithful, concise, and transferable for teaching MLLMs. This paradigm utilizes +verifiable visual programming to generate executable code guaranteeing +faithfulness and precision. Subsequently, through a series of operations +including pruning, merging, and bridging, the rationale enhances its +conciseness. Furthermore, we filter rationales that can be transferred to +end-to-end paradigms from programming paradigms to guarantee transferability. +Empirical evidence from experiments demonstrates the superiority of our method +across models of varying parameter sizes, significantly enhancing their +compositional reasoning and generalization ability. Our approach also reduces +hallucinations owing to its high correlation between images and text. + +
+
+
+
+
+ + ☆ D-Aug: Enhancing Data Augmentation for Dynamic LiDAR Scenes + + +
+ Creating large LiDAR datasets with pixel-level labeling poses significant +challenges. While numerous data augmentation methods have been developed to +reduce the reliance on manual labeling, these methods predominantly focus on +static scenes and they overlook the importance of data augmentation for dynamic +scenes, which is critical for autonomous driving. To address this issue, we +propose D-Aug, a LiDAR data augmentation method tailored for augmenting dynamic +scenes. D-Aug extracts objects and inserts them into dynamic scenes, +considering the continuity of these objects across consecutive frames. For +seamless insertion into dynamic scenes, we propose a reference-guided method +that involves dynamic collision detection and rotation alignment. Additionally, +we present a pixel-level road identification strategy to efficiently determine +suitable insertion positions. We validated our method using the nuScenes +dataset with various 3D detection and tracking methods. Comparative experiments +demonstrate the superiority of D-Aug. + +
+
+ comment: 4pages, 4 figures +
+
+
+
+
+ + ☆ TiNO-Edit: Timestep and Noise Optimization for Robust Diffusion-Based + Image Editing CVPR + + +
+ Despite many attempts to leverage pre-trained text-to-image models (T2I) like +Stable Diffusion (SD) for controllable image editing, producing good +predictable results remains a challenge. Previous approaches have focused on +either fine-tuning pre-trained T2I models on specific datasets to generate +certain kinds of images (e.g., with a specific object or person), or on +optimizing the weights, text prompts, and/or learning features for each input +image in an attempt to coax the image generator to produce the desired result. +However, these approaches all have shortcomings and fail to produce good +results in a predictable and controllable manner. To address this problem, we +present TiNO-Edit, an SD-based method that focuses on optimizing the noise +patterns and diffusion timesteps during editing, something previously +unexplored in the literature. With this simple change, we are able to generate +results that both better align with the original images and reflect the desired +result. Furthermore, we propose a set of new loss functions that operate in the +latent domain of SD, greatly speeding up the optimization when compared to +prior approaches, which operate in the pixel domain. Our method can be easily +applied to variations of SD including Textual Inversion and DreamBooth that +encode new concepts and incorporate them into the edited results. We present a +host of image-editing capabilities enabled by our approach. Our code is +publicly available at https://github.com/SherryXTChen/TiNO-Edit. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ MHLR: Moving Haar Learning Rate Scheduler for Large-scale Face + Recognition Training with One GPU + + +
+ Face recognition (FR) has seen significant advancements due to the
+utilization of large-scale datasets. Training deep FR models on large-scale
+datasets with multiple GPUs is now a common practice. In fact, computing power
+has evolved into a foundational and indispensable resource in the area of deep
+learning. It is nearly impossible to train a deep FR model without holding
+adequate hardware resources. Recognizing this challenge, some FR approaches
+have started exploring ways to reduce the time complexity of the
+fully-connected layer in FR models. Unlike other approaches, this paper
+introduces a simple yet highly effective approach, the Moving Haar Learning
+Rate (MHLR) scheduler, for scheduling the learning rate promptly and accurately
+in the training process. MHLR supports large-scale FR training with only one
+GPU and is able to cut training time to 1/4 of the original without sacrificing
+more than 1% accuracy. More specifically, MHLR only needs $30$ hours to train
+the model ResNet100 on the dataset WebFace12M containing more than 12M face
+images with 0.6M identities. Extensive experiments validate the efficiency and
+effectiveness of MHLR.
+
+
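+
+ As an illustration only (the abstract does not spell out the MHLR update rule),
+the Python sketch below implements a moving-window, Haar-style plateau test that
+decides when to drop the learning rate; the window size, threshold, and drop
+factor are hypothetical values, not the paper's.
+
+  from collections import deque
+
+  class MovingHaarLRScheduler:
+      """Drop the learning rate when a Haar-like step filter over the recent
+      loss window indicates that the loss has stopped decreasing."""
+      def __init__(self, base_lr, window=200, drop_factor=0.1,
+                   threshold=1e-3, min_lr=1e-6):
+          self.lr, self.window = base_lr, window
+          self.drop_factor, self.threshold, self.min_lr = drop_factor, threshold, min_lr
+          self.losses = deque(maxlen=window)
+
+      def step(self, loss):
+          self.losses.append(float(loss))
+          if len(self.losses) == self.window:
+              half = self.window // 2
+              first = sum(list(self.losses)[:half]) / half
+              second = sum(list(self.losses)[half:]) / half
+              # Haar response: mean of the older half minus mean of the newer half.
+              if first - second < self.threshold:   # loss has plateaued
+                  self.lr = max(self.lr * self.drop_factor, self.min_lr)
+                  self.losses.clear()
+          return self.lr
+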
+
+
+
+
+ + ☆ CorrNet+: Sign Language Recognition and Translation via Spatial-Temporal + Correlation + + +
+ In sign language, the conveyance of human body trajectories predominantly
+relies upon the coordinated movements of hands and facial expressions across
+successive frames. Despite recent advancements, sign language understanding
+methods often focus solely on individual frames, inevitably overlooking the
+inter-frame correlations that are essential for effectively modeling human body
+trajectories. To address this limitation, this paper introduces a
+spatial-temporal correlation network, denoted as CorrNet+, which explicitly
+identifies body trajectories across multiple frames. Specifically, CorrNet+
+employs a correlation module and an identification module to build human body
+trajectories. A temporal attention module then adaptively evaluates the
+contributions of different frames. The resultant features offer a holistic
+perspective on human body movements, facilitating a deeper understanding of
+sign language. As a unified model, CorrNet+ achieves new state-of-the-art
+performance on two extensive sign language understanding tasks, including
+continuous sign language recognition (CSLR) and sign language translation
+(SLT). Notably, CorrNet+ surpasses previous methods equipped with
+resource-intensive pose-estimation networks or pre-extracted heatmaps for hand
+and facial feature extraction. Compared with CorrNet, CorrNet+ achieves a
+significant performance boost across all benchmarks while halving the
+computational overhead. A comprehensive comparison with previous
+spatial-temporal reasoning methods verifies the superiority of CorrNet+. Code
+is available at https://github.com/hulianyuyy/CorrNet_Plus.
+
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.03202 +
+
+
+
+
+ + ☆ LADDER: An Efficient Framework for Video Frame Interpolation + + +
+ Video Frame Interpolation (VFI) is a crucial technique in various
+applications such as slow-motion generation, frame rate conversion, and video
+frame restoration. This paper introduces an efficient video frame interpolation
+framework that aims to strike a favorable balance between efficiency and
+quality. Our framework follows a general paradigm consisting of a flow
+estimator and a refinement module, while incorporating carefully designed
+components. First, we adopt depth-wise convolution with large kernels in the
+flow estimator, which simultaneously reduces the parameter count and enlarges
+the receptive field for encoding rich context and handling complex motion.
+Second, diverging from the common UNet-style (encoder-decoder) refinement
+module, which we find redundant, our decoder-only refinement module directly
+refines the result from coarse to fine features, offering a more efficient
+process. In addition, to address the challenge of handling high-definition
+frames, we also introduce an innovative HD-aware augmentation strategy during
+training, leading to consistent enhancement on HD images. Extensive experiments
+are conducted on diverse datasets, Vimeo90K, UCF101, Xiph and SNU-FILM. The
+results demonstrate that our approach achieves state-of-the-art performance
+with clear improvement while requiring far fewer FLOPs and parameters, reaching
+a better operating point in the efficiency-quality trade-off.
+
+
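+
+ A minimal PyTorch sketch of the kind of flow-estimator building block described
+above: a depth-wise convolution with a large kernel followed by a point-wise
+convolution. The kernel size, channel count, and residual form are illustrative
+assumptions, not the paper's exact design.
+
+  import torch
+  import torch.nn as nn
+
+  class LargeKernelDWBlock(nn.Module):
+      def __init__(self, channels, kernel_size=7):
+          super().__init__()
+          self.dw = nn.Conv2d(channels, channels, kernel_size,
+                              padding=kernel_size // 2, groups=channels)  # depth-wise
+          self.pw = nn.Conv2d(channels, channels, 1)                      # point-wise
+          self.act = nn.GELU()
+
+      def forward(self, x):
+          return x + self.pw(self.act(self.dw(x)))  # residual connection
+
+  x = torch.randn(1, 64, 128, 128)
+  print(LargeKernelDWBlock(64)(x).shape)  # torch.Size([1, 64, 128, 128])
+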
+
+
+
+
+ + ☆ Object Remover Performance Evaluation Methods using Class-wise Object + Removal Images + + +
+ Object removal refers to the process of erasing designated objects from an
+image while preserving the overall appearance, and it is one area where image
+inpainting is widely used in real-world applications. The performance of an
+object remover is quantitatively evaluated by measuring the quality of object
+removal results, similar to how the performance of an image inpainter is
+gauged. Current works reporting quantitative performance evaluations utilize
+original images as references. In this letter, to show that the current
+evaluation methods cannot properly evaluate the performance of an object
+remover, we create a dataset with object removal ground truth and compare the
+evaluations made by the current methods using original images to those
+utilizing object removal ground truth images. The disparities between the two
+evaluation sets confirm that the current methods are not suitable for
+measuring the performance of an object remover. Additionally, we propose new
+evaluation methods tailored to gauge the performance of an object remover. The
+proposed methods evaluate the performance through class-wise object removal
+results and utilize images without the target class objects as a comparison
+set. We confirm that the proposed methods can make judgments consistent with
+human evaluators on the COCO dataset, and that they can produce measurements
+aligning with those using object removal ground truth on the self-acquired
+dataset.
+
+
+
+
+
+
+ + ☆ Synthesizing Realistic Data for Table Recognition ICDAR 2024 + + +
+ To overcome the limitations and challenges of current automatic table data +annotation methods and random table data synthesis approaches, we propose a +novel method for synthesizing annotation data specifically designed for table +recognition. This method utilizes the structure and content of existing complex +tables, facilitating the efficient creation of tables that closely replicate +the authentic styles found in the target domain. By leveraging the actual +structure and content of tables from Chinese financial announcements, we have +developed the first extensive table annotation dataset in this domain. We used +this dataset to train several recent deep learning-based end-to-end table +recognition models. Additionally, we have established the inaugural benchmark +for real-world complex tables in the Chinese financial announcement domain, +using it to assess the performance of models trained on our synthetic data, +thereby effectively validating our method's practicality and effectiveness. +Furthermore, we applied our synthesis method to augment the FinTabNet dataset, +extracted from English financial announcements, by increasing the proportion of +tables with multiple spanning cells to introduce greater complexity. Our +experiments show that models trained on this augmented dataset achieve +comprehensive improvements in performance, especially in the recognition of +tables with multiple spanning cells. + +
+
+ comment: ICDAR 2024 +
+
+
+
+
+ + ☆ LAPTOP-Diff: Layer Pruning and Normalized Distillation for Compressing + Diffusion Models + + +
+ In the era of AIGC, the demand for low-budget or even on-device applications
+of diffusion models has emerged. In terms of compressing the Stable Diffusion
+models (SDMs), several approaches have been proposed, and most of them leverage
+handcrafted layer removal to obtain smaller U-Nets, along with knowledge
+distillation to recover the network performance. However, such handcrafted
+layer removal is inefficient and lacks scalability and generalization, and the
+feature distillation employed in the retraining phase faces an imbalance issue
+in which a few numerically large feature loss terms dominate the others
+throughout the retraining process. To this end, we propose LAPTOP-Diff: layer
+pruning and normalized distillation for compressing diffusion models. We 1)
+introduce a layer pruning method to compress the SDM's U-Net automatically and
+propose an effective one-shot pruning criterion whose one-shot performance is
+guaranteed by its good additivity property, surpassing other layer pruning and
+handcrafted layer removal methods, and 2) propose normalized feature
+distillation for retraining to alleviate the imbalance issue. Using the
+proposed LAPTOP-Diff, we compressed the U-Nets of SDXL and SDM-v1.5 for the
+most advanced performance, achieving a minimal 4.0% decline in PickScore at a
+pruning ratio of 50% while the comparative methods' minimal PickScore decline
+is 8.2%. We will release our code.
+
+
+
+
+
+
+ + ☆ Sky-GVIO: an enhanced GNSS/INS/Vision navigation with FCN-based + sky-segmentation in urban canyon + + +
+ Accurate, continuous, and reliable positioning is a critical component of
+achieving autonomous driving. However, in complex urban canyon environments,
+the vulnerability of a stand-alone sensor and non-line-of-sight (NLOS) effects
+caused by high buildings, trees, and elevated structures seriously affect
+positioning results. To address these challenges, a sky-view image segmentation
+algorithm based on a Fully Convolutional Network (FCN) is proposed for GNSS
+NLOS detection. Building upon this, a novel NLOS detection and mitigation
+algorithm (named S-NDM) is extended to the tightly coupled Global Navigation
+Satellite Systems (GNSS), Inertial Measurement Units (IMU), and visual feature
+system, called Sky-GVIO, with the aim of achieving continuous and accurate
+positioning in urban canyon environments. Furthermore, the system harmonizes
+Single Point Positioning (SPP) with Real-Time Kinematic (RTK) methodologies to
+bolster its operational versatility and resilience. In urban canyon
+environments, the positioning performance of the S-NDM algorithm proposed in
+this paper is evaluated under different tightly coupled SPP-related and
+RTK-related models. The results show that the Sky-GVIO system achieves
+meter-level accuracy under SPP mode and sub-decimeter precision with RTK,
+surpassing the performance of GNSS/INS/Vision frameworks devoid of S-NDM.
+Additionally, the sky-view image dataset, inclusive of training and evaluation
+subsets, has been made publicly accessible for scholarly exploration at
+https://github.com/whuwangjr/sky-view-images.
+
+
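+
+ A hypothetical sketch of the NLOS check implied above: project a satellite's
+azimuth/elevation into a sky-view image under an equidistant fisheye model and
+look up the FCN sky mask. The camera model, image geometry, and mask convention
+are assumptions for illustration, not the paper's implementation.
+
+  import numpy as np
+
+  def sat_to_pixel(az_deg, el_deg, cx, cy, radius):
+      # Zenith maps to the image centre; the horizon maps to the circle border.
+      r = (90.0 - el_deg) / 90.0 * radius
+      az = np.deg2rad(az_deg)
+      return int(cx + r * np.sin(az)), int(cy - r * np.cos(az))
+
+  def is_los(sky_mask, az_deg, el_deg):
+      h, w = sky_mask.shape
+      u, v = sat_to_pixel(az_deg, el_deg, w / 2, h / 2, min(h, w) / 2)
+      inside = 0 <= u < w and 0 <= v < h
+      return bool(inside and sky_mask[v, u])  # True -> line-of-sight, keep the satellite
+
+  sky_mask = np.ones((480, 480), dtype=bool)   # stand-in for the FCN output
+  print(is_los(sky_mask, az_deg=135.0, el_deg=40.0))
+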
+
+
+
+
+ + ☆ Rethinking 3D Dense Caption and Visual Grounding in A Unified Framework + through Prompt-based Localization + + +
+ 3D Visual Grounding (3DVG) and 3D Dense Captioning (3DDC) are two crucial +tasks in various 3D applications, which require both shared and complementary +information in localization and visual-language relationships. Therefore, +existing approaches adopt the two-stage "detect-then-describe/discriminate" +pipeline, which relies heavily on the performance of the detector, resulting in +suboptimal performance. Inspired by DETR, we propose a unified framework, +3DGCTR, to jointly solve these two distinct but closely related tasks in an +end-to-end fashion. The key idea is to reconsider the prompt-based localization +ability of the 3DVG model. In this way, the 3DVG model with a well-designed +prompt as input can assist the 3DDC task by extracting localization information +from the prompt. In terms of implementation, we integrate a Lightweight Caption +Head into the existing 3DVG network with a Caption Text Prompt as a connection, +effectively harnessing the existing 3DVG model's inherent localization +capacity, thereby boosting 3DDC capability. This integration facilitates +simultaneous multi-task training on both tasks, mutually enhancing their +performance. Extensive experimental results demonstrate the effectiveness of +this approach. Specifically, on the ScanRefer dataset, 3DGCTR surpasses the +state-of-the-art 3DDC method by 4.3% in CIDEr@0.5IoU in MLE training and +improves upon the SOTA 3DVG method by 3.16% in Acc@0.25IoU. + +
+
+
+
+
+ + ☆ Multilateral Temporal-view Pyramid Transformer for Video Inpainting + Detection + + +
+ The task of video inpainting detection is to expose the pixel-level inpainted
+regions within a video sequence. Existing methods usually focus on leveraging
+spatial and temporal inconsistencies. However, these methods typically employ
+fixed operations to combine spatial and temporal clues, limiting their
+applicability in different scenarios. In this paper, we introduce a novel
+Multilateral Temporal-view Pyramid Transformer (MumPy) that combines
+spatial-temporal clues flexibly. Our method utilizes a newly designed
+multilateral temporal-view encoder to extract various combinations of
+spatial-temporal clues and introduces a deformable window-based temporal-view
+interaction module to enhance the diversity of these combinations.
+Subsequently, we develop a multi-pyramid decoder to aggregate the various types
+of features and generate detection maps. By adjusting the contribution strength
+of spatial and temporal clues, our method can effectively identify inpainted
+regions. We validate our method on existing datasets and also introduce a new
+challenging and large-scale Video Inpainting dataset based on the YouTube-VOS
+dataset, which employs several more recent inpainting methods. The results
+demonstrate the superiority of our method in both in-domain and cross-domain
+evaluation scenarios.
+
+
+
+
+
+
+ + ☆ Supervised Contrastive Vision Transformer for Breast Histopathological + Image Classification + + +
+ Invasive ductal carcinoma (IDC) is the most prevalent form of breast cancer.
+Breast tissue histopathological examination is critical in diagnosing and
+classifying breast cancer. Although existing methods have shown promising
+results, there is still room for improvement in the classification accuracy and
+generalization of IDC using histopathology images. We present a novel approach,
+Supervised Contrastive Vision Transformer (SupCon-ViT), for improving the
+classification of invasive ductal carcinoma in terms of accuracy and
+generalization by leveraging the inherent strengths and advantages of both
+transfer learning, i.e., a pre-trained vision transformer, and supervised
+contrastive learning. Our results on a benchmark breast cancer dataset
+demonstrate that SupCon-ViT achieves state-of-the-art performance in IDC
+classification, with an F1-score of 0.8188, precision of 0.7692, and
+specificity of 0.8971, outperforming existing methods. In addition, the
+proposed model demonstrates resilience in scenarios with minimal labeled data,
+making it highly efficient in real-world clinical settings where labeled data
+is limited. Our findings suggest that supervised contrastive learning in
+conjunction with pre-trained vision transformers appears to be a viable
+strategy for accurate classification of IDC, thus paving the way for a more
+efficient and reliable diagnosis of breast cancer through histopathological
+image analysis.
+
+
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ WPS-Dataset: A benchmark for wood plate segmentation in bark removal + processing + + +
+ Using deep learning methods is a promising approach to improving bark removal
+efficiency and enhancing the quality of wood products. However, the lack of
+publicly available datasets for wood plate segmentation in bark removal
+processing poses challenges for researchers in this field. To address this
+issue, a benchmark for wood plate segmentation in bark removal processing named
+WPS-dataset is proposed in this study, which consists of 4863 images. We
+designed an image acquisition device and mounted it on bark removal equipment
+to capture images in real industrial settings. We evaluated the WPS-dataset
+using six typical segmentation models. The models effectively learn and
+understand the WPS-dataset characteristics during training, resulting in high
+performance and accuracy in wood plate segmentation tasks. We believe that our
+dataset can lay a solid foundation for future research in bark removal
+processing and contribute to advancements in this field.
+
+
+
+
+
+
+ + ☆ Lightweight Unsupervised Federated Learning with Pretrained Vision + Language Model + + +
+ Federated learning aims to tackle the "isolated data island" problem, where
+it trains a collective model from physically isolated clients while
+safeguarding the privacy of users' data. However, supervised federated learning
+necessitates that each client labels their data for training, which can be both
+time-consuming and resource-intensive, and may even be impractical for edge
+devices. Moreover, the training and transmission of deep models present
+challenges to the computation and communication capabilities of the clients. To
+address these two inherent challenges in supervised federated learning, we
+propose a novel lightweight unsupervised federated learning approach that
+leverages unlabeled data on each client to perform lightweight model training
+and communication by harnessing pretrained vision-language models, such as
+CLIP. By capitalizing on the zero-shot prediction capability and the
+well-trained image encoder of the pre-trained CLIP model, we have carefully
+crafted an efficient and resilient self-training approach. This method refines
+the initial zero-shot predicted pseudo-labels of unlabeled instances through
+the sole training of a linear classifier on top of the fixed image encoder.
+Additionally, to address data heterogeneity within each client, we propose a
+class-balanced text feature sampling strategy for generating synthetic
+instances in the feature space to support local training. Experiments are
+conducted on multiple benchmark datasets. The experimental results demonstrate
+that our proposed method greatly enhances model performance in comparison to
+CLIP's zero-shot predictions and even outperforms supervised federated learning
+benchmark methods given limited computational and communication overhead.
+
+
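+
+ A minimal sketch of the local client step described above, assuming precomputed
+features from frozen CLIP-style image and text encoders (random placeholders
+below): zero-shot pseudo-labels are refined by training only a linear classifier,
+and only that classifier would be communicated. Sizes and hyperparameters are
+illustrative.
+
+  import torch
+  import torch.nn.functional as F
+
+  num_classes, dim, n = 10, 512, 256
+  image_feats = F.normalize(torch.randn(n, dim), dim=-1)           # frozen image encoder output
+  text_feats = F.normalize(torch.randn(num_classes, dim), dim=-1)  # frozen class-prompt embeddings
+
+  pseudo = (image_feats @ text_feats.t()).argmax(dim=-1)           # zero-shot pseudo-labels
+
+  head = torch.nn.Linear(dim, num_classes)                         # the only trainable part
+  opt = torch.optim.SGD(head.parameters(), lr=0.1)
+  for _ in range(20):
+      opt.zero_grad()
+      loss = F.cross_entropy(head(image_feats), pseudo)
+      loss.backward()
+      opt.step()
+  # Only head.state_dict() (a few thousand parameters) would be sent to the server.
+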
+
+
+
+
+ + ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of robots in their applications heavily depends on the +quality of sensory input. However, designing sensor payloads and their +parameters for specific robotic tasks is an expensive process that requires +well-established sensor knowledge and extensive experiments with physical +hardware. With cameras playing a pivotal role in robotic perception, we +introduce a novel end-to-end optimization approach for co-designing a camera +with specific robotic tasks by combining derivative-free and gradient-based +optimizers. The proposed method leverages recent computer graphics techniques +and physical camera characteristics to prototype the camera in software, +simulate operational environments and tasks for robots, and optimize the camera +design based on the desired tasks in a cost-effective way. We validate the +accuracy of our camera simulation by comparing it with physical cameras, and +demonstrate the design of cameras with stronger performance than common +off-the-shelf alternatives. Our approach supports the optimization of both +continuous and discrete camera parameters, manufacturing constraints, and can +be generalized to a broad range of camera design scenarios including multiple +cameras and unconventional cameras. This work advances the fully automated +design of cameras for specific robotics tasks. + +
+
+
+
+
+ + ☆ Spatial-Aware Image Retrieval: A Hyperdimensional Computing Approach for + Efficient Similarity Hashing + + +
+ In the face of burgeoning image data, efficiently retrieving similar images
+poses a formidable challenge. Past research has focused on refining hash
+functions to distill images into compact indicators of resemblance. Initial
+attempts used shallow models, which evolved from Convolutional Neural Networks
+(CNNs) to advanced attention-based architectures. Recognizing limitations in
+gradient-based models for spatial information embedding, we propose NeuroHash,
+an innovative image hashing method leveraging Hyperdimensional Computing (HDC).
+HDC symbolically encodes spatial information into high-dimensional vectors,
+reshaping image representation. Our approach combines pre-trained large vision
+models with HDC operations, enabling spatially encoded feature representations.
+Hashing with locality-sensitive hashing (LSH) ensures swift and efficient image
+retrieval. Notably, our framework allows dynamic hash manipulation for
+conditional image retrieval. Our work introduces a transformative image hashing
+framework enabling spatial-aware conditional retrieval. By seamlessly combining
+DNN-based neural and HDC-based symbolic models, our methodology breaks from
+traditional training, offering flexible and conditional image retrieval.
+Performance evaluations signify a paradigm shift in image-hashing
+methodologies, demonstrating enhanced retrieval accuracy.
+
+
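+
+ A rough NumPy sketch of spatially-aware hyperdimensional hashing in the spirit
+described above: bind each patch feature with a position hypervector, bundle
+the results into one hypervector, and hash it with sign random projections
+(LSH). The dimensions and the binding/bundling scheme are assumptions, not the
+NeuroHash specification.
+
+  import numpy as np
+
+  rng = np.random.default_rng(0)
+  D, grid, feat_dim, bits = 8192, 4, 256, 64
+
+  pos_hv = rng.choice([-1.0, 1.0], size=(grid * grid, D))   # bipolar position codes
+  proj = rng.standard_normal((feat_dim, D)) / np.sqrt(feat_dim)
+  lsh_planes = rng.standard_normal((D, bits))
+
+  def encode(patch_feats):            # patch_feats: (grid*grid, feat_dim)
+      bound = np.tanh(patch_feats @ proj) * pos_hv          # bind feature with position
+      return bound.sum(axis=0)                              # bundle into one hypervector
+
+  def hash_image(patch_feats):
+      return (encode(patch_feats) @ lsh_planes > 0).astype(np.uint8)
+
+  feats = rng.standard_normal((grid * grid, feat_dim))
+  print(hash_image(feats))            # 64-bit binary code used for retrieval
+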
+
+
+
+
+ + ☆ MaeFuse: Transferring Omni Features with Pretrained Masked Autoencoders + for Infrared and Visible Image Fusion via Guided Training + + +
+ In this research, we introduce MaeFuse, a novel autoencoder model designed
+for infrared and visible image fusion (IVIF). Existing approaches for image
+fusion often rely on training combined with downstream tasks to obtain
+high-level visual information, which is effective in emphasizing target objects
+and delivering impressive results in visual quality and task-specific
+applications. MaeFuse, however, deviates from the norm. Instead of being driven
+by downstream tasks, our model utilizes a pretrained encoder from Masked
+Autoencoders (MAE), which facilitates omni feature extraction for low-level
+reconstruction and high-level vision tasks, to obtain perception-friendly
+features at low cost. In order to eliminate the domain gap of different modal
+features and the block effect caused by the MAE encoder, we further develop a
+guided training strategy. This strategy is meticulously crafted to ensure that
+the fusion layer seamlessly adjusts to the feature space of the encoder,
+gradually enhancing the fusion effect. It facilitates the comprehensive
+integration of feature vectors from both infrared and visible modalities,
+preserving the rich details inherent in each. MaeFuse not only introduces a
+novel perspective in the realm of fusion techniques but also stands out with
+impressive performance across various public datasets.
+
+
+
+
+
+
+ + ☆ AKGNet: Attribute Knowledge-Guided Unsupervised Lung-Infected Area + Segmentation + + +
+ Lung-infected area segmentation is crucial for assessing the severity of lung +diseases. However, existing image-text multi-modal methods typically rely on +labour-intensive annotations for model training, posing challenges regarding +time and expertise. To address this issue, we propose a novel attribute +knowledge-guided framework for unsupervised lung-infected area segmentation +(AKGNet), which achieves segmentation solely based on image-text data without +any mask annotation. AKGNet facilitates text attribute knowledge learning, +attribute-image cross-attention fusion, and high-confidence-based pseudo-label +exploration simultaneously. It can learn statistical information and capture +spatial correlations between image and text attributes in the embedding space, +iteratively refining the mask to enhance segmentation. Specifically, we +introduce a text attribute knowledge learning module by extracting attribute +knowledge and incorporating it into feature representations, enabling the model +to learn statistical information and adapt to different attributes. Moreover, +we devise an attribute-image cross-attention module by calculating the +correlation between attributes and images in the embedding space to capture +spatial dependency information, thus selectively focusing on relevant regions +while filtering irrelevant areas. Finally, a self-training mask improvement +process is employed by generating pseudo-labels using high-confidence +predictions to iteratively enhance the mask and segmentation. Experimental +results on a benchmark medical image dataset demonstrate the superior +performance of our method compared to state-of-the-art segmentation techniques +in unsupervised scenarios. + +
+
+
+
+
+ + ☆ InfoMatch: Entropy Neural Estimation for Semi-Supervised Image + Classification IJCAI 2024 + + +
+ Semi-supervised image classification, leveraging pseudo supervision and +consistency regularization, has demonstrated remarkable success. However, the +ongoing challenge lies in fully exploiting the potential of unlabeled data. To +address this, we employ information entropy neural estimation to harness the +potential of unlabeled samples. Inspired by contrastive learning, the entropy +is estimated by maximizing a lower bound on mutual information across different +augmented views. Moreover, we theoretically analyze that the information +entropy of the posterior of an image classifier is approximated by maximizing +the likelihood function of the softmax predictions. Guided by these insights, +we optimize our model from both perspectives to ensure that the predicted +probability distribution closely aligns with the ground-truth distribution. +Given the theoretical connection to information entropy, we name our method +\textit{InfoMatch}. Through extensive experiments, we show its superior +performance. + +
+
+ comment: IJCAI 2024 +
+
+
+
+
+ + ☆ How to deal with glare for improved perception of Autonomous Vehicles + + +
+ Vision sensors are versatile and can capture a wide range of visual cues, +such as color, texture, shape, and depth. This versatility, along with the +relatively inexpensive availability of machine vision cameras, played an +important role in adopting vision-based environment perception systems in +autonomous vehicles (AVs). However, vision-based perception systems can be +easily affected by glare in the presence of a bright source of light, such as +the sun or the headlights of the oncoming vehicle at night or simply by light +reflecting off snow or ice-covered surfaces; scenarios encountered frequently +during driving. In this paper, we investigate various glare reduction +techniques, including the proposed saturated pixel-aware glare reduction +technique for improved performance of the computer vision (CV) tasks employed +by the perception layer of AVs. We evaluate these glare reduction methods based +on various performance metrics of the CV algorithms used by the perception +layer. Specifically, we considered object detection, object recognition, object +tracking, depth estimation, and lane detection which are crucial for autonomous +driving. The experimental findings validate the efficacy of the proposed glare +reduction approach, showcasing enhanced performance across diverse perception +tasks and remarkable resilience against varying levels of glare. + +
+
+ comment: 14 pages, 9 figures, Accepted IEEE TIV +
+
+
+
+
+ + ☆ FairSSD: Understanding Bias in Synthetic Speech Detectors CVPR 2024 + + +
+ Methods that can generate synthetic speech perceptually indistinguishable
+from speech recorded by a human speaker are easily available. Several incidents
+report misuse of synthetic speech generated from these methods to commit fraud.
+To counter such misuse, many methods have been proposed to detect synthetic
+speech. Some of these detectors are more interpretable, can generalize to
+detect synthetic speech in the wild and are robust to noise. However, limited
+work has been done on understanding bias in these detectors. In this work, we
+examine bias in existing synthetic speech detectors to determine if they will
+unfairly target a particular gender, age and accent group. We also inspect
+whether these detectors will have a higher misclassification rate for bona fide
+speech from speech-impaired speakers compared to fluent speakers. Extensive
+experiments on 6 existing synthetic speech detectors using more than 0.9
+million speech signals demonstrate that most detectors are gender, age and
+accent biased, and future work is needed to ensure fairness. To support future
+research, we release our evaluation dataset, models used in our study and
+source code at https://gitlab.com/viper-purdue/fairssd.
+
+
+
+ comment: Accepted at CVPR 2024 (WMF) +
+
+
+
+
+ + ☆ Pixel-Wise Symbol Spotting via Progressive Points Location for Parsing + CAD Images + + +
+ Parsing Computer-Aided Design (CAD) drawings is a fundamental step for CAD
+revision, semantic-based management, and the generation of 3D prototypes in
+both the architecture and engineering industries. Labeling symbols in a CAD
+drawing is a challenging and notoriously laborious task from a practical point
+of view. In this work, we propose to label and spot symbols from CAD images
+that are converted from CAD drawings. The advantage of spotting symbols from
+CAD images lies in the low skill requirement for labelers and the low
+annotation cost. However, pixel-wise symbol spotting from CAD images is
+challenging. We propose pixel-wise point location via Progressive Gaussian
+Kernels (PGK) to balance training efficiency and location accuracy. Besides, we
+introduce a local offset to the heatmap-based point location method. Based on
+the keypoint detection, we propose a symbol grouping method to redraw the
+rectangle symbols in CAD images. We have released a dataset containing CAD
+images of equipment rooms from telecommunication industrial CAD drawings.
+Extensive experiments on this real-world dataset show that the proposed method
+has good generalization ability.
+
+
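+
+ An illustrative sketch (not the paper's exact PGK formulation) of heatmap point
+supervision with a Gaussian kernel whose sigma shrinks as training progresses,
+trading early training ease for late localization accuracy; the linear schedule
+and the values below are made up.
+
+  import numpy as np
+
+  def gaussian_heatmap(h, w, cx, cy, sigma):
+      ys, xs = np.mgrid[0:h, 0:w]
+      return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * sigma ** 2))
+
+  def progressive_sigma(epoch, total_epochs, sigma_max=8.0, sigma_min=1.5):
+      t = epoch / max(total_epochs - 1, 1)
+      return sigma_max + t * (sigma_min - sigma_max)   # linearly tighten the kernel
+
+  for epoch in (0, 25, 49):
+      hm = gaussian_heatmap(64, 64, cx=40, cy=20, sigma=progressive_sigma(epoch, 50))
+      print(epoch, round(progressive_sigma(epoch, 50), 2), hm.max())
+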
+
+ comment: 10 pages, 10 figures, 6 tables
+
+
+
+
+
+ + ☆ Hyper Evidential Deep Learning to Quantify Composite Classification + Uncertainty ICLR 2024 + + +
+ Deep neural networks (DNNs) have been shown to perform well on exclusive, +multi-class classification tasks. However, when different classes have similar +visual features, it becomes challenging for human annotators to differentiate +them. This scenario necessitates the use of composite class labels. In this +paper, we propose a novel framework called Hyper-Evidential Neural Network +(HENN) that explicitly models predictive uncertainty due to composite class +labels in training data in the context of the belief theory called Subjective +Logic (SL). By placing a grouped Dirichlet distribution on the class +probabilities, we treat predictions of a neural network as parameters of +hyper-subjective opinions and learn the network that collects both single and +composite evidence leading to these hyper-opinions by a deterministic DNN from +data. We introduce a new uncertainty type called vagueness originally designed +for hyper-opinions in SL to quantify composite classification uncertainty for +DNNs. Our results demonstrate that HENN outperforms its state-of-the-art +counterparts based on four image datasets. The code and datasets are available +at: https://github.com/Hugo101/HyperEvidentialNN. + +
+
+ comment: In Proceedings of The Twelfth International Conference on Learning + Representations, ICLR 2024 +
+
+
+
+
+ + ☆ Leveraging 3D LiDAR Sensors to Enable Enhanced Urban Safety and Public + Health: Pedestrian Monitoring and Abnormal Activity Detection + + +
+ The integration of Light Detection and Ranging (LiDAR) and Internet of Things
+(IoT) technologies offers transformative opportunities for public health
+informatics in urban safety and pedestrian well-being. This paper proposes a
+novel framework utilizing these technologies for enhanced 3D object detection
+and activity classification in urban traffic scenarios. By employing elevated
+LiDAR, we obtain detailed 3D point cloud data, enabling precise pedestrian
+activity monitoring. To overcome urban data scarcity, we create a specialized
+dataset through simulated traffic environments in Blender, facilitating
+targeted model training. Our approach employs a modified Point
+Voxel-Region-based Convolutional Neural Network (PV-RCNN) for robust 3D
+detection and PointNet for classifying pedestrian activities. This dual-model
+approach not only enhances urban traffic management but also contributes
+significantly to public health by providing insights into pedestrian behavior
+and promoting safer urban environments.
+
+
+
+
+
+
+ + ☆ Domain-Specific Block Selection and Paired-View Pseudo-Labeling for + Online Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims to adapt a pre-trained model to a new test
+domain without access to source data after deployment. Existing approaches
+typically rely on self-training with pseudo-labels since ground truth cannot be
+obtained from test data. Although the quality of pseudo-labels is important for
+stable and accurate long-term adaptation, it has not been previously addressed.
+In this work, we propose DPLOT, a simple yet effective TTA framework that
+consists of two components: (1) domain-specific block selection and (2)
+pseudo-label generation using paired-view images. Specifically, we select
+blocks that involve domain-specific feature extraction and train these blocks
+by entropy minimization. After the blocks are adjusted for the current test
+domain, we generate pseudo-labels by averaging the predictions for given test
+images and their corresponding flipped counterparts. By simply using flip
+augmentation, we prevent a decrease in the quality of the pseudo-labels, which
+can be caused by the domain gap resulting from strong augmentation. Our
+experimental results demonstrate that DPLOT outperforms previous TTA methods on
+the CIFAR10-C, CIFAR100-C, and ImageNet-C benchmarks, reducing error by up to
+5.4%, 9.1%, and 2.9%, respectively. Also, we provide an extensive analysis to
+demonstrate the effectiveness of our framework. Code is available at
+https://github.com/gist-ailab/domain-specific-block-selection-and-paired-view-pseudo-labeling-for-online-TTA.
+
+
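+
+ A minimal PyTorch sketch of the paired-view pseudo-labelling step: average the
+predictions of a test image and its horizontally flipped counterpart and keep
+only confident samples. The toy model, confidence threshold, and loss wiring
+are placeholders, not DPLOT's actual configuration.
+
+  import torch
+  import torch.nn.functional as F
+
+  def paired_view_pseudo_labels(model, x, threshold=0.9):
+      logits = model(x)
+      logits_flip = model(torch.flip(x, dims=[-1]))        # horizontal flip
+      probs = 0.5 * (logits.softmax(dim=-1) + logits_flip.softmax(dim=-1))
+      conf, labels = probs.max(dim=-1)
+      return labels, conf >= threshold                     # keep only confident samples
+
+  model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+  x = torch.randn(8, 3, 32, 32)
+  labels, keep = paired_view_pseudo_labels(model, x)
+  loss = F.cross_entropy(model(x)[keep], labels[keep]) if keep.any() else torch.tensor(0.0)
+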
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ TempBEV: Improving Learned BEV Encoders with Combined Image and BEV + Space Temporal Aggregation + + +
+ Autonomous driving requires an accurate representation of the environment. A +strategy toward high accuracy is to fuse data from several sensors. Learned +Bird's-Eye View (BEV) encoders can achieve this by mapping data from individual +sensors into one joint latent space. For cost-efficient camera-only systems, +this provides an effective mechanism to fuse data from multiple cameras with +different views. Accuracy can further be improved by aggregating sensor +information over time. This is especially important in monocular camera systems +to account for the lack of explicit depth and velocity measurements. Thereby, +the effectiveness of developed BEV encoders crucially depends on the operators +used to aggregate temporal information and on the used latent representation +spaces. We analyze BEV encoders proposed in the literature and compare their +effectiveness, quantifying the effects of aggregation operators and latent +representations. While most existing approaches aggregate temporal information +either in image or in BEV latent space, our analyses and performance +comparisons suggest that these latent representations exhibit complementary +strengths. Therefore, we develop a novel temporal BEV encoder, TempBEV, which +integrates aggregated temporal information from both latent spaces. We consider +subsequent image frames as stereo through time and leverage methods from +optical flow estimation for temporal stereo encoding. Empirical evaluation on +the NuScenes dataset shows a significant improvement by TempBEV over the +baseline for 3D object detection and BEV segmentation. The ablation uncovers a +strong synergy of joint temporal aggregation in the image and BEV latent space. +These results indicate the overall effectiveness of our approach and make a +strong case for aggregating temporal information in both image and BEV latent +spaces. + +
+
+
+
+
+ + ☆ Establishing a Baseline for Gaze-driven Authentication Performance in + VR: A Breadth-First Investigation on a Very Large Dataset + + +
+ This paper performs the crucial work of establishing a baseline for +gaze-driven authentication performance to begin answering fundamental research +questions using a very large dataset of gaze recordings from 9202 people with a +level of eye tracking (ET) signal quality equivalent to modern consumer-facing +virtual reality (VR) platforms. The size of the employed dataset is at least an +order-of-magnitude larger than any other dataset from previous related work. +Binocular estimates of the optical and visual axes of the eyes and a minimum +duration for enrollment and verification are required for our model to achieve +a false rejection rate (FRR) of below 3% at a false acceptance rate (FAR) of 1 +in 50,000. In terms of identification accuracy which decreases with gallery +size, we estimate that our model would fall below chance-level accuracy for +gallery sizes of 148,000 or more. Our major findings indicate that gaze +authentication can be as accurate as required by the FIDO standard when driven +by a state-of-the-art machine learning architecture and a sufficiently large +training dataset. + +
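+
+ A worked NumPy example of the reported operating point: choose the decision
+threshold that gives a false acceptance rate of 1 in 50,000 on impostor scores,
+then measure the false rejection rate on genuine scores. The score
+distributions below are synthetic stand-ins, not the paper's data.
+
+  import numpy as np
+
+  rng = np.random.default_rng(0)
+  genuine = rng.normal(2.0, 1.0, 20_000)      # same-person similarity scores
+  impostor = rng.normal(0.0, 1.0, 2_000_000)  # different-person similarity scores
+
+  target_far = 1.0 / 50_000
+  threshold = np.quantile(impostor, 1.0 - target_far)   # accept if score >= threshold
+  far = np.mean(impostor >= threshold)
+  frr = np.mean(genuine < threshold)
+  print(f"threshold={threshold:.3f}  FAR={far:.2e}  FRR={frr:.2%}")
+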
+
+ comment: 28 pages, 18 figures, 5 tables, includes supplementary material +
+
+
+
+
+ + ☆ When are Foundation Models Effective? Understanding the Suitability for + Pixel-Level Classification Using Multispectral Imagery + + +
+ Foundation models, i.e., very large deep learning models, have demonstrated
+impressive performance in various language and vision tasks that are otherwise
+difficult to reach using smaller-size models. The major success of GPT-type
+language models is particularly exciting and raises expectations on the
+potential of foundation models in other domains including satellite remote
+sensing. In this context, great efforts have been made to build foundation
+models to test their capabilities in broader applications, and examples include
+Prithvi by NASA-IBM, Segment-Anything-Model, ViT, etc. This leads to an
+important question: Are foundation models always a suitable choice for
+different remote sensing tasks, and when or when not? This work aims to enhance
+the understanding of the status and suitability of foundation models for
+pixel-level classification using multispectral imagery at moderate resolution,
+through comparisons with traditional machine learning (ML) and regular-size
+deep learning models. Interestingly, the results reveal that in many scenarios
+traditional ML models still have similar or better performance compared to
+foundation models, especially for tasks where texture is less useful for
+classification. On the other hand, deep learning models did show more promising
+results for tasks where labels partially depend on texture (e.g., burn scar),
+while the difference in performance between foundation models and deep learning
+models is not obvious. The results conform with our analysis: The suitability
+of foundation models depends on the alignment between the self-supervised
+learning tasks and the real downstream tasks, and the typical masked
+autoencoder paradigm is not necessarily suitable for many remote sensing
+problems.
+
+
+
+
+
+
+ + ☆ Prompt-Driven Feature Diffusion for Open-World Semi-Supervised Learning + + +
+ In this paper, we present a novel approach termed Prompt-Driven Feature +Diffusion (PDFD) within a semi-supervised learning framework for Open World +Semi-Supervised Learning (OW-SSL). At its core, PDFD deploys an efficient +feature-level diffusion model with the guidance of class-specific prompts to +support discriminative feature representation learning and feature generation, +tackling the challenge of the non-availability of labeled data for unseen +classes in OW-SSL. In particular, PDFD utilizes class prototypes as prompts in +the diffusion model, leveraging their class-discriminative and semantic +generalization ability to condition and guide the diffusion process across all +the seen and unseen classes. Furthermore, PDFD incorporates a class-conditional +adversarial loss for diffusion model training, ensuring that the features +generated via the diffusion process can be discriminatively aligned with the +class-conditional features of the real data. Additionally, the class prototypes +of the unseen classes are computed using only unlabeled instances with +confident predictions within a semi-supervised learning framework. We conduct +extensive experiments to evaluate the proposed PDFD. The empirical results show +PDFD exhibits remarkable performance enhancements over many state-of-the-art +existing methods. + +
+
+
+
+
+ + ☆ CU-Mamba: Selective State Space Models with Channel Learning for Image + Restoration + + +
+ Reconstructing degraded images is a critical task in image processing. +Although CNN and Transformer-based models are prevalent in this field, they +exhibit inherent limitations, such as inadequate long-range dependency modeling +and high computational costs. To overcome these issues, we introduce the +Channel-Aware U-Shaped Mamba (CU-Mamba) model, which incorporates a dual State +Space Model (SSM) framework into the U-Net architecture. CU-Mamba employs a +Spatial SSM module for global context encoding and a Channel SSM component to +preserve channel correlation features, both in linear computational complexity +relative to the feature map size. Extensive experimental results validate +CU-Mamba's superiority over existing state-of-the-art methods, underscoring the +importance of integrating both spatial and channel contexts in image +restoration. + +
+
+
+
+
+ + ☆ 3D object quality prediction for Metal Jet Printer with Multimodal + thermal encoder + + +
+ With the advancements in 3D printing technologies, it is extremely important
+that the quality and dimensional accuracy of 3D printed objects meet the
+customer's specifications. Various factors during metal printing affect the
+printed parts' quality, including the powder quality, the printing stage
+parameters, the print part's location inside the print bed, the curing stage
+parameters, and the metal sintering process. With the large data gathered from
+HP's MetJet printing process, AI techniques can be used to analyze, learn, and
+effectively infer the printed part quality metrics, as well as assist in
+improving the print yield. In-situ thermal sensing data captured by
+printer-installed thermal sensors contains the part thermal signature of fusing
+layers. Such a part thermal signature reflects the convoluted impact of various
+factors. In this paper, we use a multimodal thermal encoder network to fuse
+data of different natures, including the video data, vectorized printer control
+data, and exact part thermal signatures, with a trained encoder-decoder module.
+We explored data fusion techniques and the stages at which to fuse; the
+optimized end-to-end model architecture yields improved part quality prediction
+accuracy.
+
+
+
+
+
+
+ + ☆ Event-Based Eye Tracking. AIS 2024 Challenge Survey + + +
+ This survey reviews the AIS 2024 Event-Based Eye Tracking (EET) Challenge.
+The task of the challenge focuses on processing eye movement recorded with
+event cameras and predicting the pupil center of the eye. The challenge
+emphasizes efficient eye tracking with event cameras to achieve a good
+trade-off between task accuracy and efficiency. During the challenge period, 38
+participants registered for the Kaggle competition, and 8 teams submitted a
+challenge factsheet. The novel and diverse methods from the submitted
+factsheets are reviewed and analyzed in this survey to advance future
+event-based eye tracking research.
+
+
+
+ comment: Qinyu Chen is the corresponding author +
+
+
+
+
+ + ☆ QGen: On the Ability to Generalize in Quantization Aware Training + + +
+ Quantization lowers memory usage, computational requirements, and latency by +utilizing fewer bits to represent model weights and activations. In this work, +we investigate the generalization properties of quantized neural networks, a +characteristic that has received little attention despite its implications on +model performance. In particular, first, we develop a theoretical model for +quantization in neural networks and demonstrate how quantization functions as a +form of regularization. Second, motivated by recent work connecting the +sharpness of the loss landscape and generalization, we derive an approximate +bound for the generalization of quantized models conditioned on the amount of +quantization noise. We then validate our hypothesis by experimenting with over +2000 models trained on CIFAR-10, CIFAR-100, and ImageNet datasets on +convolutional and transformer-based models. + +
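+
+ A small PyTorch sketch of the regularization view mentioned above: uniform fake
+quantization with a straight-through estimator, where the difference w_q - w is
+the quantization noise acting as a weight perturbation. The bit-width and
+per-tensor scaling are illustrative choices, not the paper's setup.
+
+  import torch
+
+  def fake_quantize(w, num_bits=4):
+      qmax = 2 ** (num_bits - 1) - 1
+      scale = w.abs().max().clamp(min=1e-8) / qmax
+      w_q = (w / scale).round().clamp(-qmax - 1, qmax) * scale
+      return w + (w_q - w).detach()   # straight-through: forward w_q, backward identity
+
+  w = torch.randn(256, 256, requires_grad=True)
+  w_q = fake_quantize(w)
+  print("mean |quantization noise|:", (w_q - w).abs().mean().item())
+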
+
+
+
+
+ + ☆ Multimodal 3D Object Detection on Unseen Domains + + +
+ LiDAR datasets for autonomous driving exhibit biases in properties such as +point cloud density, range, and object dimensions. As a result, object +detection networks trained and evaluated in different environments often +experience performance degradation. Domain adaptation approaches assume access +to unannotated samples from the test distribution to address this problem. +However, in the real world, the exact conditions of deployment and access to +samples representative of the test dataset may be unavailable while training. +We argue that the more realistic and challenging formulation is to require +robustness in performance to unseen target domains. We propose to address this +problem in a two-pronged manner. First, we leverage paired LiDAR-image data +present in most autonomous driving datasets to perform multimodal object +detection. We suggest that working with multimodal features by leveraging both +images and LiDAR point clouds for scene understanding tasks results in object +detectors more robust to unseen domain shifts. Second, we train a 3D object +detector to learn multimodal object features across different distributions and +promote feature invariance across these source domains to improve +generalizability to unseen target domains. To this end, we propose +CLIX$^\text{3D}$, a multimodal fusion and supervised contrastive learning +framework for 3D object detection that performs alignment of object features +from same-class samples of different domains while pushing the features from +different classes apart. We show that CLIX$^\text{3D}$ yields state-of-the-art +domain generalization performance under multiple dataset shifts. + +
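+
+ A generic supervised contrastive loss in the spirit described above: pull
+together normalized features of same-class samples (possibly drawn from
+different source domains) and push apart different classes. This is a standard
+SupCon-style objective written as a sketch, not the exact CLIX3D formulation.
+
+  import torch
+  import torch.nn.functional as F
+
+  def supervised_contrastive_loss(features, labels, temperature=0.1):
+      # features: (N, d) object embeddings; labels: (N,) class ids
+      z = F.normalize(features, dim=-1)
+      sim = z @ z.t() / temperature
+      n = z.size(0)
+      self_mask = torch.eye(n, dtype=torch.bool, device=z.device)
+      pos_mask = labels.unsqueeze(0).eq(labels.unsqueeze(1)) & ~self_mask
+
+      sim = sim.masked_fill(self_mask, float("-inf"))
+      log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+      loss = -(log_prob * pos_mask.float()).sum(dim=1) / pos_mask.sum(dim=1).clamp(min=1)
+      return loss[pos_mask.any(dim=1)].mean()   # ignore anchors with no positives
+
+  feats = torch.randn(16, 128, requires_grad=True)
+  labels = torch.randint(0, 4, (16,))
+  print(supervised_contrastive_loss(feats, labels).item())
+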
+
+ comment: technical report +
+
+
+
+
+ + ☆ IrrNet: Advancing Irrigation Mapping with Incremental Patch Size + Training on Remote Sensing Imagery CVPR + + +
+ Irrigation mapping plays a crucial role in effective water management,
+essential for preserving both water quality and quantity, and is key to
+mitigating the global issue of water scarcity. The complexity of agricultural
+fields, adorned with diverse irrigation practices, especially when multiple
+systems coexist in close quarters, poses a unique challenge. This complexity is
+further compounded by the nature of Landsat's remote sensing data, where each
+pixel is rich with densely packed information, complicating the task of
+accurate irrigation mapping. In this study, we introduce an innovative approach
+that employs a progressive training method, which strategically increases patch
+sizes throughout the training process, utilizing datasets from Landsat 5 and 7
+labeled with the WRLU dataset. The initial focus on small patches allows the
+model to capture detailed features, progressively shifting to broader, more
+general features as the patch size enlarges. Remarkably, our method enhances
+the performance of existing state-of-the-art models by approximately 20%.
+Furthermore, our analysis delves into the significance of incorporating various
+spectral bands into the model, assessing their impact on performance. The
+findings reveal that additional bands are instrumental in enabling the model to
+discern finer details more effectively. This work sets a new standard for
+leveraging remote sensing imagery in irrigation mapping.
+
+
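+
+ A schematic sketch of the progressive patch-size idea: train in stages,
+increasing the crop size fed to the model at each stage. The stage sizes, epoch
+counts, band count, and the commented-out train_step placeholder are
+hypothetical, not the paper's schedule.
+
+  import numpy as np
+
+  def random_crop(image, label, size, rng):
+      h, w = image.shape[:2]
+      y = rng.integers(0, h - size + 1)
+      x = rng.integers(0, w - size + 1)
+      return image[y:y + size, x:x + size], label[y:y + size, x:x + size]
+
+  rng = np.random.default_rng(0)
+  image = rng.random((512, 512, 6))          # e.g. stacked Landsat bands
+  label = rng.integers(0, 3, (512, 512))     # irrigation-type map
+
+  stages = [(64, 10), (128, 10), (256, 5)]   # (patch size, epochs) -- hypothetical
+  for size, epochs in stages:
+      for epoch in range(epochs):
+          patch, target = random_crop(image, label, size, rng)
+          # train_step(model, patch, target)  # fine details first, wider context later
+      print(f"finished stage with patch size {size}")
+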
+
+ comment: Full version of the paper will be appearing in Proceedings of the + IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + Workshops, 2024 +
+
+
+
+
+ + ☆ Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT + Synthesis for Head and Neck Proton Treatment Planning + + +
+ In recent advancements in proton therapy, MR-based treatment planning is +gaining momentum to minimize additional radiation exposure compared to +traditional CT-based methods. This transition highlights the critical need for +accurate MR-to-CT image synthesis, which is essential for precise proton dose +calculations. Our research introduces the Diffusion Schr\"odinger Bridge Models +(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns +the nonlinear diffusion processes between MR and CT data distributions. This +method improves upon traditional diffusion models by initiating synthesis from +the prior distribution rather than the Gaussian distribution, enhancing both +generation quality and efficiency. We validated the effectiveness of DSBM on a +head and neck cancer dataset, demonstrating its superiority over traditional +image synthesis methods through both image-level and dosimetric-level +evaluations. The effectiveness of DSBM in MR-based proton treatment planning +highlights its potential as a valuable tool in various clinical scenarios. + +
+
+ comment: International Conference on the use of Computers in Radiation therapy + (ICCR) +
+
+
+
+
+ + ☆ Equivariant Spatio-Temporal Self-Supervision for LiDAR Object Detection + + +
+ Popular representation learning methods encourage feature invariance under
+transformations applied at the input. However, in 3D perception tasks like
+object localization and segmentation, outputs are naturally equivariant to some
+transformations, such as rotation. Using pre-training loss functions that
+encourage equivariance of features under certain transformations provides a
+strong self-supervision signal while also retaining information of geometric
+relationships between transformed feature representations. This can enable
+improved performance in downstream tasks that are equivariant to such
+transformations. In this paper, we propose a spatio-temporal equivariant
+learning framework by considering both spatial and temporal augmentations
+jointly. Our experiments show that the best performance arises with a
+pre-training approach that encourages equivariance to translation, scaling,
+flipping, rotation, and scene flow. For spatial augmentations, we find that
+depending on the transformation, either a contrastive objective or an
+equivariance-by-classification objective yields best results. To leverage
+real-world object deformations and motion, we consider sequential LiDAR scene
+pairs and develop a novel 3D scene flow-based equivariance objective that leads
+to improved performance overall. We show that our pre-training method for 3D
+object detection outperforms existing equivariant and invariant approaches in
+many settings.
+
+
+
+ comment: technical report +
+
+
+
+
+ + ☆ Learning with 3D rotations, a hitchhiker's guide to SO(3) + + +
+ Many settings in machine learning require the selection of a rotation +representation. However, choosing a suitable representation from the many +available options is challenging. This paper acts as a survey and guide through +rotation representations. We walk through their properties that harm or benefit +deep learning with gradient-based optimization. By consolidating insights from +rotation-based learning, we provide a comprehensive overview of learning +functions with rotation representations. We provide guidance on selecting +representations based on whether rotations are in the model's input or output +and whether the data primarily comprises small angles. + +
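+
+ As one concrete example of a representation such a guide covers, the NumPy
+snippet below maps the continuous 6D parameterization (two 3-vectors) to a
+rotation matrix via Gram-Schmidt, a choice often recommended for gradient-based
+learning when rotations are in the model's output; the input vector here is an
+arbitrary illustration.
+
+  import numpy as np
+
+  def rotation_from_6d(sixd):
+      a, b = sixd[:3], sixd[3:]
+      x = a / np.linalg.norm(a)
+      b = b - np.dot(x, b) * x
+      y = b / np.linalg.norm(b)
+      z = np.cross(x, y)
+      return np.stack([x, y, z], axis=1)   # columns form an orthonormal, right-handed frame
+
+  R = rotation_from_6d(np.array([1.0, 0.2, -0.1, 0.0, 1.0, 0.3]))
+  print(np.allclose(R.T @ R, np.eye(3)), np.isclose(np.linalg.det(R), 1.0))
+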
+
+
+
+
+ + ☆ Visual Prompting for Generalized Few-shot Segmentation: A Multi-scale + Approach CVPR 2024 + + +
+ The emergence of attention-based transformer models has led to their +extensive use in various tasks, due to their superior generalization and +transfer properties. Recent research has demonstrated that such models, when +prompted appropriately, are excellent for few-shot inference. However, such +techniques are under-explored for dense prediction tasks like semantic +segmentation. In this work, we examine the effectiveness of prompting a +transformer-decoder with learned visual prompts for the generalized few-shot +segmentation (GFSS) task. Our goal is to achieve strong performance not only on +novel categories with limited examples, but also to retain performance on base +categories. We propose an approach to learn visual prompts with limited +examples. These learned visual prompts are used to prompt a multiscale +transformer decoder to facilitate accurate dense predictions. Additionally, we +introduce a unidirectional causal attention mechanism between the novel +prompts, learned with limited examples, and the base prompts, learned with +abundant data. This mechanism enriches the novel prompts without deteriorating +the base class performance. Overall, this form of prompting helps us achieve +state-of-the-art performance for GFSS on two different benchmark datasets: +COCO-$20^i$ and Pascal-$5^i$, without the need for test-time optimization (or +transduction). Furthermore, test-time optimization leveraging unlabelled test +data can be used to improve the prompts, which we refer to as transductive +prompt tuning. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Deep Learning for Video-Based Assessment of Endotracheal Intubation + Skills + + +
+ Endotracheal intubation (ETI) is an emergency procedure performed in civilian
+and combat casualty care settings to establish an airway. Objective and
+automated assessment of ETI skills is essential for the training and
+certification of healthcare providers. However, the current approach is based
+on manual feedback by an expert, which is subjective, time- and
+resource-intensive, and is prone to poor inter-rater reliability and halo
+effects. This work proposes a framework to evaluate ETI skills using single and
+multi-view videos. The framework consists of two stages. First, a 2D
+convolutional autoencoder (AE) and a pre-trained self-supervision network
+extract features from videos. Second, a 1D convolutional network enhanced with
+a cross-view attention module takes the features from the AE as input and
+outputs predictions for skill evaluation. The ETI datasets were collected in
+two phases. In the first phase, ETI is performed by two subject cohorts:
+Experts and Novices. In the second phase, novice subjects perform ETI under
+time pressure, and the outcome is either Successful or Unsuccessful. A third
+dataset of videos from a single head-mounted camera for Experts and Novices is
+also analyzed. The study achieved an accuracy of 100% in identifying
+Expert/Novice trials in the initial phase. In the second phase, the model
+showed 85% accuracy in classifying Successful/Unsuccessful procedures. Using
+head-mounted cameras alone, the model showed a 96% accuracy on Expert and
+Novice classification while maintaining an accuracy of 85% on classifying
+successful and unsuccessful procedures. In addition, GradCAMs are presented to
+explain the differences between Expert and Novice behavior and Successful and
+Unsuccessful trials. The approach offers a reliable and objective method for
+automated assessment of ETI skills.
+
+
+
+
+
+
+ + ☆ Postoperative glioblastoma segmentation: Development of a fully + automated pipeline using deep convolutional neural networks and comparison + with currently available models + + +
+ Accurately assessing tumor removal is paramount in the management of +glioblastoma. We developed a pipeline using MRI scans and neural networks to +segment tumor subregions and the surgical cavity in postoperative images. Our +model excels in accurately classifying the extent of resection, offering a +valuable tool for clinicians in assessing treatment effectiveness. + +
+
+
+
+
+ + ☆ Unifying Scene Representation and Hand-Eye Calibration with 3D + Foundation Models + + +
+ Representing the environment is a central challenge in robotics, and is +essential for effective decision-making. Traditionally, before capturing images +with a manipulator-mounted camera, users need to calibrate the camera using a +specific external marker, such as a checkerboard or AprilTag. However, recent +advances in computer vision have led to the development of \emph{3D foundation +models}. These are large, pre-trained neural networks that can establish fast +and accurate multi-view correspondences with very few images, even in the +absence of rich visual features. This paper advocates for the integration of 3D +foundation models into scene representation approaches for robotic systems +equipped with manipulator-mounted RGB cameras. Specifically, we propose the +Joint Calibration and Representation (JCR) method. JCR uses RGB images, +captured by a manipulator-mounted camera, to simultaneously construct an +environmental representation and calibrate the camera relative to the robot's +end-effector, in the absence of specific calibration markers. The resulting 3D +environment representation is aligned with the robot's coordinate frame and +maintains physically accurate scales. We demonstrate that JCR can build +effective scene representations using a low-cost RGB camera attached to a +manipulator, without prior calibration. + +
+
+
+
+
+ + ☆ Factorized Motion Fields for Fast Sparse Input Dynamic View Synthesis SIGGRAPH 2024 + + +
+ Designing a 3D representation of a dynamic scene for fast optimization and +rendering is a challenging task. While recent explicit representations enable +fast learning and rendering of dynamic radiance fields, they require a dense +set of input viewpoints. In this work, we focus on learning a fast +representation for dynamic radiance fields with sparse input viewpoints. +However, the optimization with sparse input is under-constrained and +necessitates the use of motion priors to constrain the learning. Existing fast +dynamic scene models do not explicitly model the motion, making them difficult +to constrain with motion priors. We design an explicit motion model as a +factorized 4D representation that is fast and can exploit the spatio-temporal +correlation of the motion field. We then introduce reliable flow priors, +including a combination of sparse flow priors across cameras and dense flow +priors within cameras, to regularize our motion model. Our model is fast, +compact, and achieves very good performance on popular multi-view dynamic scene +datasets with sparse input viewpoints. The source code for our model can be +found on our project page: +https://nagabhushansn95.github.io/publications/2024/RF-DeRF.html. + +
+
+ comment: Accepted at SIGGRAPH 2024 +
+
+
+
+
+ + ☆ Deep Dependency Networks and Advanced Inference Schemes for Multi-Label + Classification AISTATS 2024 + + +
+ We present a unified framework called deep dependency networks (DDNs) that +combines dependency networks and deep learning architectures for multi-label +classification, with a particular emphasis on image and video data. The primary +advantage of dependency networks is their ease of training, in contrast to +other probabilistic graphical models like Markov networks. In particular, when +combined with deep learning architectures, they provide an intuitive, +easy-to-use loss function for multi-label classification. A drawback of DDNs +compared to Markov networks is their lack of advanced inference schemes, +necessitating the use of Gibbs sampling. To address this challenge, we propose +novel inference schemes based on local search and integer linear programming +for computing the most likely assignment to the labels given observations. We +evaluate our novel methods on three video datasets (Charades, TACoS, Wetlab) +and three image datasets (MS-COCO, PASCAL VOC, NUS-WIDE), comparing their +performance with (a) basic neural architectures and (b) neural architectures +combined with Markov networks equipped with advanced inference and learning +techniques. Our results demonstrate the superiority of our new DDN methods over +the two competing approaches. + +
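+
+ A generic sketch of local-search MAP inference over binary labels (NumPy; the scoring function and initialization are illustrative, not the paper's exact scheme):
+
+    import numpy as np
+
+    def local_search_map(unary, pairwise, n_iters=50, seed=0):
+        """Greedy local search for a high-scoring binary label assignment.
+        unary[i]: score for label i being on; pairwise[i, j]: pairwise score."""
+        rng = np.random.default_rng(seed)
+        n = len(unary)
+        y = (unary > 0).astype(float)              # simple initialization
+        score = lambda y: unary @ y + 0.5 * y @ pairwise @ y
+        best = score(y)
+        for _ in range(n_iters):
+            improved = False
+            for i in rng.permutation(n):           # try flipping each label
+                y[i] = 1.0 - y[i]
+                s = score(y)
+                if s > best:
+                    best, improved = s, True
+                else:
+                    y[i] = 1.0 - y[i]              # revert the flip
+            if not improved:
+                break
+        return y.astype(int), best
+
+    labels, s = local_search_map(np.random.randn(20), 0.1 * np.random.randn(20, 20))
+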
+
+ comment: Will appear in AISTATS 2024. arXiv admin note: substantial text + overlap with arXiv:2302.00633 +
+
+
+
+
+ + ☆ Unsupervised Microscopy Video Denoising CVPR + + +
+ In this paper, we introduce a novel unsupervised network to denoise +microscopy videos consisting of image sequences captured by a fixed-location +microscopy camera. Specifically, we propose a DeepTemporal Interpolation +method, leveraging a temporal signal filter integrated into the bottom CNN +layers, to restore microscopy videos corrupted by unknown noise types. Our +unsupervised denoising architecture is distinguished by its ability to adapt to +multiple noise conditions without the need for pre-existing noise distribution +knowledge, addressing a significant challenge in real-world medical +applications. Furthermore, we evaluate our denoising framework using both real +microscopy recordings and simulated data, validating its strong video +denoising performance across a broad spectrum of noise scenarios. Extensive +experiments demonstrate that our unsupervised model consistently outperforms +state-of-the-art supervised and unsupervised video denoising techniques, +proving especially effective for microscopy videos. + +
+
+ comment: Accepted at CVPRW 2024 +
+
+
+
+
+ + ☆ SDIP: Self-Reinforcement Deep Image Prior Framework for Image Processing + + +
+ Deep image prior (DIP) proposed in recent research has revealed the inherent +trait of convolutional neural networks (CNN) for capturing substantial +low-level image statistics priors. This framework efficiently addresses the +inverse problems in image processing and has found extensive applications in +various domains. However, as the whole algorithm is initialized randomly, the +DIP algorithm often lacks stability. Thus, this method still has room for +further improvement. In this paper, we propose the self-reinforcement deep +image prior (SDIP) as an improved version of the original DIP. We observed that +the changes in the DIP networks' input and output are highly correlated during +each iteration. SDIP efficiently utilizes this trait in a reinforcement +learning manner, where the current iteration's output is used by a steering +algorithm to update the network input for the next iteration, guiding the +algorithm toward improved results. Experimental results across multiple +applications demonstrate that our proposed SDIP framework offers improvements +over the original DIP method and other state-of-the-art methods. + +
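+
+ A compact sketch of the self-reinforcement loop on top of a standard DIP network (PyTorch; the steering rule and the `alpha` strength are assumptions made for illustration):
+
+    import torch
+    import torch.nn as nn
+
+    def sdip_restore(net: nn.Module, degraded: torch.Tensor,
+                     steps: int = 2000, lr: float = 1e-3, alpha: float = 0.1):
+        """Deep-image-prior fitting where, after each update, the network
+        input is nudged toward the current output (the steering step)."""
+        z = torch.randn_like(degraded)              # random DIP input
+        opt = torch.optim.Adam(net.parameters(), lr=lr)
+        mse = nn.MSELoss()
+        for _ in range(steps):
+            out = net(z)
+            loss = mse(out, degraded)               # data-fidelity term
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+            with torch.no_grad():                   # steer the input with the output
+                z = (1 - alpha) * z + alpha * out.detach()
+        return net(z).detach()
+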
+
+
+
+
+ + ☆ Mushroom Segmentation and 3D Pose Estimation from Point Clouds using + Fully Convolutional Geometric Features and Implicit Pose Encoding + + +
+ Modern agricultural applications rely more and more on deep learning +solutions. However, training well-performing deep networks requires a large +amount of annotated data that may not be available and in the case of 3D +annotation may not even be feasible for human annotators. In this work, we +develop a deep learning approach to segment mushrooms and estimate their pose +on 3D data, in the form of point clouds acquired by depth sensors. To +circumvent the annotation problem, we create a synthetic dataset of mushroom +scenes, where we are fully aware of 3D information, such as the pose of each +mushroom. The proposed network has a fully convolutional backbone, that parses +sparse 3D data, and predicts pose information that implicitly defines both the +instance segmentation and pose estimation tasks. We have validated the +effectiveness of the proposed implicit-based approach on a synthetic test set, +and provided qualitative results for a small set of real point clouds acquired +with depth sensors. Code is publicly available at +https://github.com/georgeretsi/mushroom-pose. + +
+
+
+
+
+ + ☆ Soil Fertility Prediction Using Combined USB-microscope Based Soil + Image, Auxiliary Variables, and Portable X-Ray Fluorescence Spectrometry + + +
+ This study explored the application of portable X-ray fluorescence (PXRF) +spectrometry and soil image analysis to rapidly assess soil fertility, focusing +on critical parameters such as available B, organic carbon (OC), available Mn, +available S, and the sulfur availability index (SAI). Analyzing 1,133 soil +samples from various agro-climatic zones in Eastern India, the research +combined color and texture features from microscopic soil images, PXRF data, +and auxiliary soil variables (AVs) using a Random Forest model. Results +indicated that integrating image features (IFs) with auxiliary variables (AVs) +significantly enhanced prediction accuracy for available B (R^2 = 0.80) and OC +(R^2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF data, +further improved predictions for available Mn and SAI with R^2 values of 0.72 +and 0.70, respectively. The study demonstrated how these integrated +technologies have the potential to provide quick and affordable options for +soil testing, opening up access to more sophisticated prediction models and a +better understanding of soil fertility and health. Future research +should focus on the application of deep learning models on a larger dataset of +soil images, developed using soils from a broader range of agro-climatic zones +under field conditions. + +
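+
+ A minimal sketch of the fusion-then-Random-Forest idea (scikit-learn; the feature blocks and their sizes are hypothetical):
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.model_selection import cross_val_score
+
+    n = 500                                # hypothetical number of soil samples
+    image_feats = np.random.rand(n, 24)    # color/texture features from soil images
+    aux_vars    = np.random.rand(n, 6)     # auxiliary soil variables
+    pxrf_feats  = np.random.rand(n, 12)    # PXRF elemental readings
+    target_oc   = np.random.rand(n)        # e.g. organic carbon
+
+    # data fusion here is simple feature concatenation before the Random Forest
+    X = np.hstack([image_feats, aux_vars, pxrf_feats])
+    model = RandomForestRegressor(n_estimators=500, random_state=0)
+    r2 = cross_val_score(model, X, target_oc, cv=5, scoring="r2").mean()
+    print(f"cross-validated R^2: {r2:.2f}")
+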
+
+ comment: 37 pages, 10 figures; manuscript under peer review for publication in + the journal 'Computers and Electronics in Agriculture' +
+
+
+
+
+ + ☆ MoA: Mixture-of-Attention for Subject-Context Disentanglement in + Personalized Image Generation + + +
+ We introduce a new architecture for personalization of text-to-image +diffusion models, coined Mixture-of-Attention (MoA). Inspired by the +Mixture-of-Experts mechanism utilized in large language models (LLMs), MoA +distributes the generation workload between two attention pathways: a +personalized branch and a non-personalized prior branch. MoA is designed to +retain the original model's prior by fixing its attention layers in the prior +branch, while minimally intervening in the generation process with the +personalized branch that learns to embed subjects in the layout and context +generated by the prior branch. A novel routing mechanism manages the +distribution of pixels in each layer across these branches to optimize the +blend of personalized and generic content creation. Once trained, MoA +facilitates the creation of high-quality, personalized images featuring +multiple subjects with compositions and interactions as diverse as those +generated by the original model. Crucially, MoA enhances the distinction +between the model's pre-existing capability and the newly augmented +personalized intervention, thereby offering a more disentangled subject-context +control that was previously unattainable. Project page: +https://snap-research.github.io/mixture-of-attention + +
+
+ comment: Project Website: https://snap-research.github.io/mixture-of-attention +
+
+
+
+
+ + ♻ ☆ VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle + Re-identification + + +
+ Vehicle Re-identification (Re-ID) has been broadly studied in the last +decade; however, differing camera view angles lead to confused +discrimination in the feature subspace for vehicles of various poses, which is +still challenging for Vehicle Re-ID models in the real world. To promote +the Vehicle Re-ID models, this paper proposes to synthesize a large number of +vehicle images in the target pose, the idea being to project vehicles of +diverse poses into a unified target pose so as to enhance feature +discrimination. Considering that paired data of the same vehicles in +different traffic surveillance cameras might not be available in the real +world, we propose the first Pair-flexible Pose Guided Image Synthesis method +for Vehicle Re-ID, named VehicleGAN in this paper, which works for both +supervised and unsupervised settings without the knowledge of geometric 3D +models. Because of the feature distribution difference between real and +synthetic data, simply training a traditional metric learning based Re-ID model +with data-level fusion (i.e., data augmentation) is not satisfactory, therefore +we propose a new Joint Metric Learning (JML) via effective feature-level fusion +from both real and synthetic data. Extensive experimental results on the public +VeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our +proposed VehicleGAN and JML. + +
+
+
+
+
+ + ♻ ☆ The Brain Tumor Sequence Registration (BraTS-Reg) Challenge: + Establishing Correspondence Between Pre-Operative and Follow-up MRI Scans of + Diffuse Glioma Patients + + +
+ Registration of longitudinal brain MRI scans containing pathologies is +challenging due to dramatic changes in tissue appearance. Although there has +been progress in developing general-purpose medical image registration +techniques, they have not yet attained the requisite precision and reliability +for this task, highlighting its inherent complexity. Here we describe the Brain +Tumor Sequence Registration (BraTS-Reg) challenge, as the first public +benchmark environment for deformable registration algorithms focusing on +estimating correspondences between pre-operative and follow-up scans of the +same patient diagnosed with a diffuse brain glioma. The BraTS-Reg data comprise +de-identified multi-institutional multi-parametric MRI (mpMRI) scans, curated +for size and resolution according to a canonical anatomical template, and +divided into training, validation, and testing sets. Clinical experts annotated +ground truth (GT) landmark points of anatomical locations distinct across the +temporal domain. Quantitative evaluation and ranking were based on the Median +Euclidean Error (MEE), Robustness, and the determinant of the Jacobian of the +displacement field. The top-ranked methodologies yielded similar performance +across all evaluation metrics and shared several methodological commonalities, +including pre-alignment, deep neural networks, inverse consistency analysis, +and per-case test-time instance optimization as a post-processing step. +The top-ranked method attained the MEE at or below that of the inter-rater +variability for approximately 60% of the evaluated landmarks, underscoring the +scope for further accuracy and robustness improvements, especially relative to +human experts. The aim of BraTS-Reg is to continue to serve as an active +resource for research, with the data and online evaluation tools accessible at +https://bratsreg.github.io/. + +
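+
+ The Median Euclidean Error used for ranking can be computed directly from landmark coordinates; a small NumPy sketch (the landmark arrays below are illustrative):
+
+    import numpy as np
+
+    def median_euclidean_error(pred_landmarks, gt_landmarks):
+        """MEE between predicted and ground-truth landmarks of shape (N, 3), e.g. in mm."""
+        errors = np.linalg.norm(pred_landmarks - gt_landmarks, axis=1)
+        return float(np.median(errors))
+
+    gt = np.random.rand(20, 3) * 100.0
+    pred = gt + 0.5 * np.random.randn(20, 3)
+    print(median_euclidean_error(pred, gt))
+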
+
+
+
+
+ + ♻ ☆ Re-Nerfing: Improving Novel Views Synthesis through Novel Views + Synthesis + + +
+ Neural Radiance Fields (NeRFs) have shown remarkable novel view synthesis +capabilities even in large-scale, unbounded scenes, albeit requiring hundreds +of views or introducing artifacts in sparser settings. Their optimization +suffers from shape-radiance ambiguities wherever only a small visual overlap is +available. This leads to erroneous scene geometry and artifacts. In this paper, +we propose Re-Nerfing, a simple and general multi-stage data augmentation +approach that leverages NeRF's own view synthesis ability to address these +limitations. With Re-Nerfing, we enhance the geometric consistency of novel +views as follows: First, we train a NeRF with the available views. Then, we use +the optimized NeRF to synthesize pseudo-views around the original ones with a +view selection strategy to improve coverage and preserve view quality. Finally, +we train a second NeRF with both the original images and the pseudo views +masking out uncertain regions. Extensive experiments applying Re-Nerfing on +various pipelines on the mip-NeRF 360 dataset, including Gaussian Splatting, +provide valuable insights into the improvements achievable without external +data or supervision, on denser and sparser input scenarios. Project page: +https://renerfing.github.io + +
+
+ comment: Code will be released upon acceptance +
+
+
+
+
+ + ♻ ☆ Segmenting the motion components of a video: A long-term unsupervised + model + + +
+ Human beings have the ability to continuously analyze a video and immediately +extract the motion components. We want to adopt this paradigm to provide a +coherent and stable motion segmentation over the video sequence. In this +perspective, we propose a novel long-term spatio-temporal model operating in a +totally unsupervised way. It takes as input the volume of consecutive optical +flow (OF) fields, and delivers a volume of segments of coherent motion over the +video. More specifically, we have designed a transformer-based network, where +we leverage a mathematically well-founded framework, the Evidence Lower Bound +(ELBO), to derive the loss function. The loss function combines a flow +reconstruction term involving spatio-temporal parametric motion models +combining, in a novel way, polynomial (quadratic) motion models for the spatial +dimensions and B-splines for the time dimension of the video sequence, and a +regularization term enforcing temporal consistency on the segments. We report +experiments on four VOS benchmarks, demonstrating competitive quantitative +results, while performing motion segmentation on a whole sequence in one go. We +also highlight through visual results the key contributions on temporal +consistency brought by our method. + +
+
+
+
+
+ + ♻ ☆ TCJA-SNN: Temporal-Channel Joint Attention for Spiking Neural Networks + + +
+ Spiking Neural Networks (SNNs) are attracting widespread interest due to +their biological plausibility, energy efficiency, and powerful spatio-temporal +information representation ability. Given the critical role of attention +mechanisms in enhancing neural network performance, the integration of SNNs and +attention mechanisms exhibits potential to deliver energy-efficient and +high-performance computing paradigms. We present a novel Temporal-Channel Joint +Attention mechanism for SNNs, referred to as TCJA-SNN. The proposed TCJA-SNN +framework can effectively assess the significance of the spike sequence from both +spatial and temporal dimensions. More specifically, our essential technical +contribution lies in: 1) We employ the squeeze operation to compress the spike +stream into an average matrix. Then, we leverage two local attention mechanisms +based on efficient 1D convolutions to facilitate comprehensive feature +extraction at the temporal and channel levels independently. 2) We introduce +the Cross Convolutional Fusion (CCF) layer as a novel approach to model the +inter-dependencies between the temporal and channel scopes. This layer breaks +the independence of these two dimensions and enables the interaction between +features. Experimental results demonstrate that the proposed TCJA-SNN +outperforms SOTA by up to 15.7% accuracy on standard static and neuromorphic +datasets, including Fashion-MNIST, CIFAR10-DVS, N-Caltech 101, and DVS128 +Gesture. Furthermore, we apply the TCJA-SNN framework to image generation tasks +by leveraging a variational autoencoder. To the best of our knowledge, this study +is the first instance where the SNN-attention mechanism has been employed for +image classification and generation tasks. Notably, our approach has achieved +SOTA performance in both domains, establishing a significant advancement in the +field. Codes are available at https://github.com/ridgerchu/TCJA. + +
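+
+ A rough PyTorch sketch of the squeeze-then-1D-convolution idea (the fusion here is a plain element-wise product, not the paper's CCF layer; shapes and kernel sizes are assumptions):
+
+    import torch
+    import torch.nn as nn
+
+    class TemporalChannelAttention(nn.Module):
+        """For a spike tensor (T, B, C, H, W): average over spatial dims to get
+        an average matrix, then run 1D convolutions along time and channels."""
+        def __init__(self, T: int, C: int, k: int = 3):
+            super().__init__()
+            self.conv_t = nn.Conv1d(C, C, kernel_size=k, padding=k // 2)  # along time
+            self.conv_c = nn.Conv1d(T, T, kernel_size=k, padding=k // 2)  # along channels
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            avg = x.mean(dim=(3, 4)).permute(1, 2, 0)               # (B, C, T)
+            t_attn = self.conv_t(avg)                               # (B, C, T)
+            c_attn = self.conv_c(avg.transpose(1, 2))               # (B, T, C)
+            score = torch.sigmoid(t_attn * c_attn.transpose(1, 2))  # (B, C, T)
+            score = score.permute(2, 0, 1)[..., None, None]         # (T, B, C, 1, 1)
+            return x * score
+
+    layer = TemporalChannelAttention(T=8, C=64)
+    out = layer(torch.rand(8, 2, 64, 32, 32))
+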
+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems +
+
+
+
+
+ + ♻ ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN2024 + + +
+ Amodal Instance Segmentation (AIS) presents a challenging task as it involves +predicting both visible and occluded parts of objects within images. Existing +AIS methods rely on a bidirectional approach, encompassing both the transition +from amodal features to visible features (amodal-to-visible) and from visible +features to amodal features (visible-to-amodal). Our observation shows that +utilizing amodal features through the amodal-to-visible transition can confuse the +visible features due to the extra information from occluded/hidden segments that is +not present in the visible view. Consequently, this compromises the quality of the +visible features during the subsequent visible-to-amodal transition. To tackle this +issue, we introduce ShapeFormer, a decoupled Transformer-based model with a +visible-to-amodal transition. It models the explicit relationship between the +output segmentations and avoids the need for amodal-to-visible transitions. +ShapeFormer comprises three key modules: (i) Visible-Occluding Mask Head for +predicting visible segmentation with occlusion awareness, (ii) Shape-Prior +Amodal Mask Head for predicting amodal and occluded masks, and (iii) +Category-Specific Shape Prior Retriever, which provides shape prior knowledge. +Comprehensive experiments and extensive ablation studies across various AIS +benchmarks demonstrate the effectiveness of our ShapeFormer. The code is +available at: \url{https://github.com/UARK-AICV/ShapeFormer} + +
+
+ comment: Accepted to IJCNN2024 +
+
+
+
+
+ + ♻ ☆ Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching CVPR 2024 + + +
+ Non-isometric shape correspondence remains a fundamental challenge in +computer vision. Traditional methods using Laplace-Beltrami operator (LBO) +eigenmodes face limitations in characterizing high-frequency extrinsic shape +changes like bending and creases. We propose a novel approach of combining the +non-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell +hessian with the intrinsic ones of the LBO, creating a hybrid spectral space in +which we construct functional maps. To this end, we present a theoretical +framework to effectively integrate non-orthogonal basis functions into +descriptor- and learning-based functional map methods. Our approach can be +incorporated easily into existing functional map pipelines across varying +applications and is able to handle complex deformations beyond isometries. We +show extensive evaluations across various supervised and unsupervised settings +and demonstrate significant improvements. Notably, our approach achieves up to +15% better mean geodesic error for non-isometric correspondence settings and up +to 45% improvement in scenarios with topological noise. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SuperPrimitive: Scene Reconstruction at a Primitive Level CVPR2024 + + +
+ Joint camera pose and dense geometry estimation from a set of images or a +monocular video remains a challenging problem due to its computational +complexity and inherent visual ambiguities. Most dense incremental +reconstruction systems operate directly on image pixels and solve for their 3D +positions using multi-view geometry cues. Such pixel-level approaches suffer +from ambiguities or violations of multi-view consistency (e.g. caused by +textureless or specular surfaces). + We address this issue with a new image representation which we call a +SuperPrimitive. SuperPrimitives are obtained by splitting images into +semantically correlated local regions and enhancing them with estimated surface +normal directions, both of which are predicted by state-of-the-art single image +neural networks. This provides a local geometry estimate per SuperPrimitive, +while their relative positions are adjusted based on multi-view observations. + We demonstrate the versatility of our new representation by addressing three +3D reconstruction tasks: depth completion, few-view structure from motion, and +monocular dense visual odometry. + +
+
+ comment: CVPR2024. Project Page: https://makezur.github.io/SuperPrimitive/ +
+
+
+
+
+ + ♻ ☆ Leveraging Foundation Models for Content-Based Medical Image Retrieval + in Radiology + + +
+ Content-based image retrieval (CBIR) has the potential to significantly +improve diagnostic aid and medical research in radiology. Current CBIR systems +face limitations due to their specialization to certain pathologies, limiting +their utility. In response, we propose using vision foundation models as +powerful and versatile off-the-shelf feature extractors for content-based +medical image retrieval. By benchmarking these models on a comprehensive +dataset of 1.6 million 2D radiological images spanning four modalities and 161 +pathologies, we identify weakly-supervised models as superior, achieving a P@1 +of up to 0.594. This performance not only competes with a specialized model but +does so without the need for fine-tuning. Our analysis further explores the +challenges in retrieving pathological versus anatomical structures, indicating +that accurate retrieval of pathological features presents greater difficulty. +Despite these challenges, our research underscores the vast potential of +foundation models for CBIR in radiology, proposing a shift towards versatile, +general-purpose medical image retrieval systems that do not require specific +tuning. + +
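+
+ The reported P@1 can be reproduced with plain nearest-neighbour retrieval over frozen embeddings; a NumPy sketch (random features stand in for the foundation-model embeddings):
+
+    import numpy as np
+
+    def precision_at_1(query_feats, gallery_feats, query_labels, gallery_labels):
+        """A query counts as correct if its nearest gallery image shares its label."""
+        q = query_feats / np.linalg.norm(query_feats, axis=1, keepdims=True)
+        g = gallery_feats / np.linalg.norm(gallery_feats, axis=1, keepdims=True)
+        nearest = (q @ g.T).argmax(axis=1)        # cosine-similarity ranking
+        return float((gallery_labels[nearest] == query_labels).mean())
+
+    q_f, g_f = np.random.rand(100, 768), np.random.rand(1000, 768)
+    q_y, g_y = np.random.randint(0, 10, 100), np.random.randint(0, 10, 1000)
+    print(precision_at_1(q_f, g_f, q_y, g_y))
+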
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training +dataset of a deep neural network, the network can be induced to exhibit +malicious behavior during inference, which poses potential threats to +real-world applications. While they have been intensively studied in +classification, backdoor attacks on semantic segmentation have been largely +overlooked. Unlike classification, semantic segmentation aims to classify every +pixel within a given image. In this work, we explore backdoor attacks on +segmentation models to misclassify all pixels of a victim class by injecting a +specific trigger on non-victim pixels during inference, which is dubbed +Influencer Backdoor Attack (IBA). IBA is expected to maintain the +classification accuracy of non-victim pixels and mislead classifications of all +victim pixels in every single inference and could be easily applied to +real-world scenes. Based on the context aggregation ability of segmentation +models, we propose a simple yet effective Nearest-Neighbor trigger injection +strategy. We also introduce an innovative Pixel Random Labeling strategy which +maintains optimal performance even when the trigger is placed far from the +victim pixels. Our extensive experiments reveal that current segmentation +models do suffer from backdoor attacks, demonstrate IBA's real-world +applicability, and show that our proposed techniques can further increase +attack performance. + +
+
+
+
+
+ + ♻ ☆ The LuViRA Dataset: Measurement Description ICRA 2024 + + +
+ We present a dataset to evaluate localization algorithms, which utilizes +vision, audio, and radio sensors: the Lund University Vision, Radio, and Audio +(LuViRA) Dataset. The dataset includes RGB images, corresponding depth maps, +IMU readings, channel response between a massive MIMO channel sounder and a +user equipment, audio recorded by 12 microphones, and 0.5 mm accurate 6DoF pose +ground truth. We synchronize these sensors to make sure that all data are +recorded simultaneously. A camera, speaker, and transmit antenna are placed on +top of a slowly moving service robot and 88 trajectories are recorded. Each +trajectory includes 20 to 50 seconds of recorded sensor data and ground truth +labels. The data from different sensors can be used separately or jointly to +conduct localization tasks and a motion capture system is used to verify the +results obtained by the localization algorithms. The main aim of this dataset +is to enable research on fusing the most commonly used sensors for localization +tasks. However, the full dataset or some parts of it can also be used for other +research areas such as channel estimation, image classification, etc. Fusing +sensor data can lead to increased localization accuracy and reliability, as +well as decreased latency and power consumption. The created dataset will be +made public at a later date. + +
+
+ comment: 7 pages, 7 figures, Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ ChatCAD+: Towards a Universal and Reliable Interactive CAD using LLMs + + +
+ The integration of Computer-Aided Diagnosis (CAD) with Large Language Models +(LLMs) presents a promising frontier in clinical applications, notably in +automating diagnostic processes akin to those performed by radiologists and +providing consultations similar to a virtual family doctor. Despite the +promising potential of this integration, current works face at least two +limitations: (1) From the perspective of a radiologist, existing studies +typically have a restricted scope of applicable imaging domains, failing to +meet the diagnostic needs of different patients. Also, the insufficient +diagnostic capability of LLMs further undermines the quality and reliability of +the generated medical reports. (2) Current LLMs lack the requisite depth in +medical expertise, rendering them less effective as virtual family doctors due +to the potential unreliability of the advice provided during patient +consultations. To address these limitations, we introduce ChatCAD+, designed to be +universal and reliable. Specifically, it features two main modules: (1) +Reliable Report Generation and (2) Reliable Interaction. The Reliable Report +Generation module is capable of interpreting medical images from diverse +domains and generating high-quality medical reports via our proposed hierarchical +in-context learning. Concurrently, the interaction module leverages up-to-date +information from reputable medical websites to provide reliable medical advice. +Together, these designed modules synergize to closely align with the expertise +of human medical professionals, offering enhanced consistency and reliability +for interpretation and advice. The source code is available at +https://github.com/zhaozh10/ChatCAD. + +
+
+ comment: Authors Zihao Zhao, Sheng Wang, Jinchen Gu, Yitao Zhu contributed + equally to this work and should be considered co-first authors +
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth +estimation (SIDE) model relies heavily on shading and contextual cues in the +image. While this simplicity is attractive, it is necessary to train such +models on large and varied datasets, which are difficult to capture. It has +been shown that using embeddings from pre-trained foundational models, such as +CLIP, improves zero-shot transfer in several applications. Taking inspiration +from this, in our paper we explore the use of global image priors generated +from a pre-trained ViT model to provide more detailed contextual information. +We argue that the embedding vector from a ViT model, pre-trained on a large +dataset, captures more relevant information for SIDE than the usual route of +generating pseudo image captions, followed by CLIP based text embeddings. Based +on this idea, we propose a new SIDE model using a diffusion backbone which is +conditioned on ViT embeddings. Our proposed design establishes a new +state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel error of +0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). On the +KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement) compared to +0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model +trained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%) +over NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%, +18%, 45%, 9%) by ZoeDepth. The project page is available at +https://ecodepth-iitd.github.io + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Distance and Collision Probability Estimation from Gaussian Surface + Models + + +
+ This paper describes continuous-space methodologies to estimate the collision +probability, Euclidean distance and gradient between an ellipsoidal robot model +and an environment surface modeled as a set of Gaussian distributions. +Continuous-space collision probability estimation is critical for +uncertainty-aware motion planning. Most collision detection and avoidance +approaches assume the robot is modeled as a sphere, but ellipsoidal +representations provide tighter approximations and enable navigation in +cluttered and narrow spaces. State-of-the-art methods derive the Euclidean +distance and gradient by processing raw point clouds, which is computationally +expensive for large workspaces. Recent advances in Gaussian surface modeling +(e.g. mixture models, splatting) enable compressed and high-fidelity surface +representations. Few methods exist to estimate continuous-space occupancy from +such models. They require Gaussians to model free space and are unable to +estimate the collision probability, Euclidean distance and gradient for an +ellipsoidal robot. The proposed methods bridge this gap by extending prior work +in ellipsoid-to-ellipsoid Euclidean distance and collision probability +estimation to Gaussian surface models. A geometric blending approach is also +proposed to improve collision probability estimation. The approaches are +evaluated with numerical 2D and 3D experiments using real-world point cloud +data. Methods for efficient calculation of these quantities are demonstrated to +execute within a few microseconds per ellipsoid pair using a single thread on the +low-power CPUs of modern embedded computers. + +
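+
+ A brute-force Monte-Carlo reference for the ellipsoid-vs-Gaussian collision probability (NumPy); the paper derives far faster estimators, so this is only a sanity-check sketch with made-up shapes:
+
+    import numpy as np
+
+    def collision_probability_mc(center, shape, mu, cov, n_samples=100_000, seed=0):
+        """P[ (p - center)^T shape^{-1} (p - center) <= 1 ] for p ~ N(mu, cov),
+        i.e. the chance a Gaussian surface point falls inside the robot ellipsoid."""
+        rng = np.random.default_rng(seed)
+        pts = rng.multivariate_normal(mu, cov, size=n_samples)
+        d = pts - center
+        m = np.einsum("ni,ij,nj->n", d, np.linalg.inv(shape), d)
+        return float((m <= 1.0).mean())
+
+    center = np.zeros(3)
+    shape = np.diag([0.2, 0.2, 0.5]) ** 2            # robot semi-axes squared
+    mu, cov = np.array([0.3, 0.0, 0.0]), 0.05 * np.eye(3)
+    print(collision_probability_mc(center, shape, mu, cov))
+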
+
+
+
+
+ + ♻ ☆ MISC: Ultra-low Bitrate Image Semantic Compression Driven by Large + Multimodal Model + + +
+ With the evolution of storage and communication protocols, ultra-low bitrate +image compression has become a highly demanding topic. However, existing +compression algorithms must sacrifice either consistency with the ground truth +or perceptual quality at ultra-low bitrate. In recent years, the rapid +development of the Large Multimodal Model (LMM) has made it possible to balance +these two goals. To solve this problem, this paper proposes a method called +Multimodal Image Semantic Compression (MISC), which consists of an LMM encoder +for extracting the semantic information of the image, a map encoder for locating +the regions corresponding to the semantics, an image encoder for generating an +extremely compressed bitstream, and a decoder for reconstructing the image based on +the above information. Experimental results show that our proposed MISC is +suitable for compressing both traditional Natural Sense Images (NSIs) and +emerging AI-Generated Images (AIGIs) content. It can achieve optimal +consistency and perception results while saving 50% bitrate, which gives it strong +potential for applications in the next generation of storage and communication. The +code will be released on https://github.com/lcysyzxdxc/MISC. + +
+
+ comment: 13 page, 11 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ EgoPlan-Bench: Benchmarking Egocentric Embodied Planning with Multimodal + Large Language Models + + +
+ Multimodal Large Language Models, combining the remarkable reasoning and +generalization capabilities of Large Language Models (LLMs) with the ability to +comprehend visual inputs, have opened up new avenues for embodied task +planning. Given diverse environmental inputs, including real-time task +progress, visual observations, and open-form language instructions, a +proficient task planner is expected to predict feasible actions, which is a +feat inherently achievable by Multimodal Large Language Models (MLLMs). In this +paper, we aim to quantitatively investigate the potential of MLLMs as embodied +task planners in real-world scenarios by introducing a benchmark with human +annotations named EgoPlan-Bench. Our benchmark is distinguished by realistic +tasks derived from real-world videos, a diverse set of actions involving +interactions with hundreds of different objects, and complex visual +observations from varied scenes. We evaluate a wide range of MLLMs, revealing +that these models have not yet evolved into embodied planning generalists (even +GPT-4V). We further construct an instruction-tuning dataset EgoPlan-IT from +videos with human-object interactions, to facilitate the learning of high-level +task planning in intricate real-world situations. The experiment results +demonstrate that the model tuned on EgoPlan-IT not only significantly improves +performance on our benchmark, but can also be applied as a task planner for +guiding embodied agents in simulations. + +
+
+ comment: Project released at: https://github.com/ChenYi99/EgoPlan +
+
+
+
+
+ + ♻ ☆ Deepfake detection by exploiting surface anomalies: the SurFake approach + + +
+ The ever-increasing use of synthetically generated content in different +sectors of our everyday life, above all in media information, poses a strong +need for deepfake detection tools in order to avoid the proliferation of +altered messages. The process to identify manipulated content, in particular +images and videos, is basically performed by looking for the presence of some +inconsistencies and/or anomalies specifically due to the fake generation +process. Different techniques exist in the scientific literature that exploit +diverse ad-hoc features in order to highlight possible modifications. In this +paper, we propose to investigate how deepfake creation can impact on the +characteristics that the whole scene had at the time of the acquisition. In +particular, when an image (video) is captured the overall geometry of the scene +(e.g. surfaces) and the acquisition process (e.g. illumination) determine a +univocal environment that is directly represented by the image pixel values; +all these intrinsic relations are possibly changed by the deepfake generation +process. By resorting to the analysis of the characteristics of the surfaces +depicted in the image, it is possible to obtain a descriptor that can be used to train a +CNN for deepfake detection: we refer to such an approach as SurFake. +Experimental results carried out on the FF++ dataset for different kinds of +deepfake forgeries and diverse deep learning models confirm that such a feature +can be adopted to discriminate between pristine and altered images; +furthermore, experiments show that it can also be combined with visual data +to further improve detection accuracy. + +
+
+
+
+
+ + ♻ ☆ High-throughput Visual Nano-drone to Nano-drone Relative Localization + using Onboard Fully Convolutional Networks ICRA 2024 + + +
+ Relative drone-to-drone localization is a fundamental building block for any +swarm operations. We address this task in the context of miniaturized +nano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to +novel use cases enabled by their reduced form factor. The price for their +versatility comes with limited onboard resources, i.e., sensors, processing +units, and memory, which limits the complexity of the onboard algorithms. A +traditional solution to overcome these limitations is represented by +lightweight deep learning models directly deployed aboard nano-drones. This +work tackles the challenging relative pose estimation between nano-drones using +only a gray-scale low-resolution camera and an ultra-low-power System-on-Chip +(SoC) hosted onboard. We present a vertically integrated system based on a +novel vision-based fully convolutional neural network (FCNN), which runs at +39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8 +SoC. We compare our FCNN against three State-of-the-Art (SoA) systems. +Considering the best-performing SoA approach, our model results in an R-squared +improvement from 32 to 47% on the horizontal image coordinate and from 18 to +55% on the vertical image coordinate, on a real-world dataset of 30k images. +Finally, our in-field tests show a reduction of the average tracking error of +37% compared to a previous SoA work and an endurance performance up to the +entire battery lifetime of 4 minutes. + +
+
+ comment: ICRA 2024, IEEE Conference +
+
+
+
+
+ + ♻ ☆ Exploring Missing Modality in Multimodal Egocentric Datasets + + +
+ Multimodal video understanding is crucial for analyzing egocentric videos, +where integrating multiple sensory signals significantly enhances action +recognition and moment localization. However, practical applications often +grapple with incomplete modalities due to factors like privacy concerns, +efficiency demands, or hardware malfunctions. Addressing this, our study delves +into the impact of missing modalities on egocentric action recognition, +particularly within transformer-based models. We introduce a novel concept +-Missing Modality Token (MMT)-to maintain performance even when modalities are +absent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and +Epic-Sounds datasets. Our method mitigates the performance loss, reducing it +from its original $\sim 30\%$ drop to only $\sim 10\%$ when half of the test +set is modal-incomplete. Through extensive experimentation, we demonstrate the +adaptability of MMT to different training scenarios and its superiority in +handling missing modalities compared to current methods. Our research +contributes a comprehensive analysis and an innovative approach, opening +avenues for more resilient multimodal systems in real-world settings. + +
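+
+ The core of a missing-modality token can be sketched in a few lines of PyTorch (dimensions, sequence length, and the swap-in rule are assumptions, not the paper's exact design):
+
+    from typing import Optional
+    import torch
+    import torch.nn as nn
+
+    class MissingModalityToken(nn.Module):
+        """Replace the token sequence of an absent modality with a learned
+        placeholder so the fusion model always receives both inputs."""
+        def __init__(self, dim: int, seq_len: int):
+            super().__init__()
+            self.token = nn.Parameter(torch.zeros(1, seq_len, dim))
+            nn.init.trunc_normal_(self.token, std=0.02)
+
+        def forward(self, feats: Optional[torch.Tensor], batch_size: int) -> torch.Tensor:
+            if feats is None:                        # modality missing at test time
+                return self.token.expand(batch_size, -1, -1)
+            return feats
+
+    audio_filler = MissingModalityToken(dim=256, seq_len=16)
+    audio_tokens = audio_filler(None, batch_size=8)  # (8, 16, 256) learned placeholder
+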
+
+
+
+
+ + ♻ ☆ Video shutter angle estimation using optical flow and linear blur + + +
+ We present a method for estimating the shutter angle, a.k.a. the exposure +fraction, i.e., the ratio of the exposure time to the reciprocal of the frame rate, of +video clips containing motion. The approach exploits the relation of the +exposure fraction, optical flow, and linear motion blur. Robustness is achieved +by selecting image patches where both the optical flow and blur estimates are +reliable, checking their consistency. The method was evaluated on the publicly +available Beam-Splitter Dataset with a range of exposure fractions from 0.015 +to 0.36. The best achieved mean absolute error of the estimates was 0.039. We +successfully test the suitability of the method for a forensic application of +detection of video tampering by frame removal or insertion. + +
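+
+ The underlying relation is simply exposure fraction ≈ blur extent / per-frame motion; a NumPy sketch of the patch-wise estimate and a robust aggregation (the thresholds are illustrative):
+
+    import numpy as np
+
+    def exposure_fraction(blur_lengths, flow_magnitudes, min_flow=1.0):
+        """Per-patch estimates: linear blur extent (pixels) divided by optical
+        flow magnitude (pixels/frame); patches with tiny flow are discarded."""
+        blur = np.asarray(blur_lengths, dtype=float)
+        flow = np.asarray(flow_magnitudes, dtype=float)
+        keep = flow > min_flow
+        estimates = np.clip(blur[keep] / flow[keep], 0.0, 1.0)
+        return float(np.median(estimates))           # robust aggregation over patches
+
+    print(exposure_fraction([2.1, 1.9, 0.4], [10.0, 9.5, 0.2]))
+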
+
+
+
+
+ + ♻ ☆ D$^2$ST-Adapter: Disentangled-and-Deformable Spatio-Temporal Adapter for + Few-shot Action Recognition + + +
+ Adapting large pre-trained image models to few-shot action recognition has +proven to be an effective and efficient strategy for learning robust feature +extractors, which is essential for few-shot learning. Typical fine-tuning based +adaptation paradigm is prone to overfitting in the few-shot learning scenarios +and offers little modeling flexibility for learning temporal features in video +data. In this work we present the Disentangled-and-Deformable Spatio-Temporal +Adapter (D$^2$ST-Adapter), which is a novel adapter tuning framework +well-suited for few-shot action recognition due to lightweight design and low +parameter-learning overhead. It is designed in a dual-pathway architecture to +encode spatial and temporal features in a disentangled manner. In particular, +we devise the anisotropic Deformable Spatio-Temporal Attention module as the +core component of D$^2$ST-Adapter, which can be tailored with anisotropic +sampling densities along spatial and temporal domains to learn spatial and +temporal features specifically in corresponding pathways, allowing our +D$^2$ST-Adapter to encode features in a global view in 3D spatio-temporal space +while maintaining a lightweight design. Extensive experiments with +instantiations of our method on both pre-trained ResNet and ViT demonstrate the +superiority of our method over state-of-the-art methods for few-shot action +recognition. Our method is particularly well-suited to challenging scenarios +where temporal dynamics are critical for action recognition. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Learning Pace Synchronization for Open-World + Semi-Supervised Learning + + +
+ In open-world semi-supervised learning, a machine learning model is tasked +with uncovering novel categories from unlabeled data while maintaining +performance on seen categories from labeled data. The central challenge is the +substantial learning gap between seen and novel categories, as the model learns +the former faster due to accurate supervisory information. Moreover, capturing +the semantics of unlabeled novel category samples is also challenging due to +the missing label information. To address the above issues, we introduce 1) the +adaptive synchronizing marginal loss which imposes class-specific negative +margins to alleviate the model bias towards seen classes, and 2) the +pseudo-label contrastive clustering which exploits pseudo-labels predicted by +the model to group unlabeled data from the same category together in the output +space. Extensive experiments on benchmark datasets demonstrate that previous +approaches may significantly hinder novel class learning, whereas our method +strikingly balances the learning pace between seen and novel classes, achieving +a remarkable 3% average accuracy increase on the ImageNet dataset. Importantly, +we find that fine-tuning the self-supervised pre-trained model significantly +boosts the performance, which is overlooked in prior literature. Our code is +available at https://github.com/yebo0216best/LPS-main. + +
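+
+ A generic sketch of a cross-entropy loss with per-class negative margins on seen classes (PyTorch; the margin values and their schedule are placeholders, not the paper's adaptive rule):
+
+    import torch
+    import torch.nn.functional as F
+
+    def margin_cross_entropy(logits, targets, margins):
+        """margins[c] is added to the logit of class c before the softmax; a
+        negative margin on seen classes slows their learning pace relative to
+        novel classes."""
+        return F.cross_entropy(logits + margins.unsqueeze(0), targets)
+
+    num_seen, num_novel = 50, 10
+    margins = torch.cat([torch.full((num_seen,), -0.5), torch.zeros(num_novel)])
+    logits = torch.randn(32, num_seen + num_novel)
+    targets = torch.randint(0, num_seen + num_novel, (32,))
+    loss = margin_cross_entropy(logits, targets, margins)
+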
+
+
+
+
+ + ♻ ☆ Do Counterfactual Examples Complicate Adversarial Training? CVPR'24 + + +
+ We leverage diffusion models to study the robustness-performance tradeoff of +robust classifiers. Our approach introduces a simple, pretrained diffusion +method to generate low-norm counterfactual examples (CEs): semantically altered +data which results in different true class membership. We report that the +confidence and accuracy of robust models on their clean training data are +associated with the proximity of the data to their CEs. Moreover, robust models +perform very poorly when evaluated on the CEs directly, as they become +increasingly invariant to the low-norm, semantic changes brought by CEs. The +results indicate a significant overlap between non-robust and semantic +features, countering the common assumption that non-robust features are not +interpretable. + +
+
+ comment: Accepted as a short paper to the GCV Workshop at CVPR'24 +
+
+
+
+
+ + ♻ ☆ ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text + Detection and Spotting CVPR2024 + + +
+ In recent years, text-image joint pre-training techniques have shown +promising results in various tasks. However, in Optical Character Recognition +(OCR) tasks, aligning text instances with their corresponding text regions in +images poses a challenge, as it requires effective alignment between text and +OCR-Text (referring to the text in images as OCR-Text to distinguish from the +text in natural language) rather than a holistic understanding of the overall +image content. In this paper, we propose a new pre-training method called +OCR-Text Destylization Modeling (ODM) that transfers diverse styles of text +found in images to a uniform style based on the text prompt. With ODM, we +achieve better alignment between text and OCR-Text and enable pre-trained +models to adapt to the complex and diverse styles of scene text detection and +spotting tasks. Additionally, we have designed a new labeling generation method +specifically for ODM and combined it with our proposed Text-Controller module +to address the challenge of annotation costs in OCR tasks, allowing a larger +amount of unlabeled data to participate in pre-training. Extensive experiments +on multiple public datasets demonstrate that our method significantly improves +performance and outperforms current pre-training methods in scene text +detection and spotting tasks. Code is available at +https://github.com/PriNing/ODM. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ ICSVR: Investigating Compositional and Syntactic Understanding in Video + Retrieval Models + + +
+ Video retrieval (VR) involves retrieving the ground truth video from the +video database given a text caption or vice-versa. The two important components +of compositionality: objects & attributes and actions are joined using correct +syntax to form a proper text query. These components (objects & attributes, +actions and syntax) each play an important role to help distinguish among +videos and retrieve the correct ground truth video. However, it is unclear what +is the effect of these components on the video retrieval performance. We +therefore, conduct a systematic study to evaluate the compositional and +syntactic understanding of video retrieval models on standard benchmarks such +as MSRVTT, MSVD and DIDEMO. The study is performed on two categories of video +retrieval models: (i) which are pre-trained on video-text pairs and fine-tuned +on downstream video retrieval datasets (Eg. Frozen-in-Time, Violet, MCQ etc.) +(ii) which adapt pre-trained image-text representations like CLIP for video +retrieval (Eg. CLIP4Clip, XCLIP, CLIP2Video etc.). Our experiments reveal that +actions and syntax play a minor role compared to objects & attributes in video +understanding. Moreover, video retrieval models that use pre-trained image-text +representations (CLIP) have better syntactic and compositional understanding as +compared to models pre-trained on video-text data. The code is available at +https://github.com/IntelLabs/multimodal_cognitive_ai/tree/main/ICSVR + +
+
+
+
+
+ + ♻ ☆ PE-MVCNet: Multi-view and Cross-modal Fusion Network for Pulmonary + Embolism Prediction + + +
+ The early detection of a pulmonary embolism (PE) is critical for enhancing +patient survival rates. Both image-based and non-image-based features are of +utmost importance in medical classification tasks. In a clinical setting, +physicians tend to rely on the contextual information provided by Electronic +Medical Records (EMR) to interpret medical imaging. However, very few models +effectively integrate clinical information with imaging data. To address this +shortcoming, we suggest a multimodal fusion methodology, termed PE-MVCNet, +which capitalizes on Computed Tomography Pulmonary Angiography imaging and EMR +data. This method comprises the Image-only module with an integrated multi-view +block, the EMR-only module, and the Cross-modal Attention Fusion (CMAF) module. +These modules cooperate to extract comprehensive features that subsequently +generate predictions for PE. We conducted experiments using the publicly +accessible Stanford University Medical Center dataset, achieving an AUROC of +94.1%, an accuracy rate of 90.2%, and an F1 score of 90.6%. Our proposed model +outperforms existing methodologies, corroborating that our multimodal fusion +model excels compared to models that use a single data modality. Our source +code is available at https://github.com/LeavingStarW/PE-MVCNET. + +
+
+
+
+
+ + ♻ ☆ One-Prompt to Segment All Medical Images + + +
+ Large foundation models, known for their strong zero-shot generalization, +have excelled in visual and language applications. However, applying them to +medical image segmentation, a domain with diverse imaging types and target +labels, remains an open challenge. Current approaches, such as adapting +interactive segmentation models like Segment Anything Model (SAM), require user +prompts for each sample during inference. Alternatively, transfer learning +methods like few/one-shot models demand labeled samples, leading to high costs. +This paper introduces a new paradigm toward the universal medical image +segmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation +combines the strengths of one-shot and interactive methods. In the inference +stage, with just \textbf{one prompted sample}, it can adeptly handle the unseen +task in a single forward pass. We train One-Prompt Model on 64 open-source +medical datasets, accompanied by the collection of over 3,000 clinician-labeled +prompts. Tested on 14 previously unseen datasets, the One-Prompt Model +showcases superior zero-shot segmentation capabilities, outperforming a wide +range of related methods. The code and data is released as +https://github.com/KidsWithTokens/one-prompt. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2304.12620 +
+
+
+
+
+ + ♻ ☆ AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile + Platform Real-Time RGB-D Semantic Segmentation + + +
+ Understanding indoor scenes is crucial for urban studies. Considering the +dynamic nature of indoor environments, effective semantic segmentation requires +both real-time operation and high accuracy. To address this, we propose +AsymFormer, a novel network that improves real-time semantic segmentation +accuracy using RGB-D multi-modal information without substantially increasing +network complexity. AsymFormer uses an asymmetrical backbone for multimodal +feature extraction, reducing redundant parameters by optimizing computational +resource distribution. To fuse asymmetric multimodal features, a Local +Attention-Guided Feature Selection (LAFS) module is used to selectively fuse +features from different modalities by leveraging their dependencies. +Subsequently, a Cross-Modal Attention-Guided Feature Correlation Embedding +(CMA) module is introduced to further extract cross-modal representations. The +AsymFormer demonstrates competitive results with 54.1% mIoU on NYUv2 and 49.1% +mIoU on SUNRGBD. Notably, AsymFormer achieves an inference speed of 65 FPS (79 +FPS after implementing mixed precision quantization) on RTX3090, demonstrating +that AsymFormer can strike a balance between high accuracy and efficiency. + +
+
+
+
+
+ + ♻ ☆ A2XP: Towards Private Domain Generalization CVPR 2024 + + +
+ Deep Neural Networks (DNNs) have become pivotal in various fields, especially +in computer vision, outperforming previous methodologies. A critical challenge +in their deployment is the bias inherent in data across different domains, such +as image style and environmental conditions, leading to domain gaps. This +necessitates techniques for learning general representations from biased +training data, known as domain generalization. This paper presents Attend to +eXpert Prompts (A2XP), a novel approach for domain generalization that +preserves the privacy and integrity of the network architecture. A2XP consists +of two phases: Expert Adaptation and Domain Generalization. In the first phase, +prompts for each source domain are optimized to guide the model towards the +optimal direction. In the second phase, two embedder networks are trained to +effectively amalgamate these expert prompts, aiming for an optimal output. Our +extensive experiments demonstrate that A2XP achieves state-of-the-art results +over existing non-private domain generalization methods. The experimental +results validate that the proposed approach not only tackles the domain +generalization challenge in DNNs but also offers a privacy-preserving, +efficient solution to the broader field of computer vision. + +
+
+ comment: Accepted to CVPR 2024. Our code is available at + https://github.com/AIRLABkhu/A2XP +
+
+
+
+
+ + ♻ ☆ T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation + + +
+ Recent methods in text-to-3D leverage powerful pretrained diffusion models to +optimize NeRF. Notably, these methods are able to produce high-quality 3D +scenes without training on 3D data. Due to the open-ended nature of the task, +most studies evaluate their results with subjective case studies and user +experiments, thereby presenting a challenge in quantitatively addressing the +question: How has current progress in Text-to-3D gone so far? In this paper, we +introduce T$^3$Bench, the first comprehensive text-to-3D benchmark containing +diverse text prompts of three increasing complexity levels that are specially +designed for 3D generation. To assess both the subjective quality and the text +alignment, we propose two automatic metrics based on multi-view images produced +by the 3D contents. The quality metric combines multi-view text-image scores +and regional convolution to detect quality and view inconsistency. The +alignment metric uses multi-view captioning and GPT-4 evaluation to measure +text-3D consistency. Both metrics closely correlate with different dimensions +of human judgments, providing a paradigm for efficiently evaluating text-to-3D +models. The benchmarking results, shown in Fig. 1, reveal performance +differences among an extensive 10 prevalent text-to-3D methods. Our analysis +further highlights the common struggles for current methods on generating +surroundings and multi-object scenes, as well as the bottleneck of leveraging +2D guidance for 3D generation. Our project page is available at: +https://t3bench.com. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ MV-CLIP: Multi-View CLIP for Zero-shot 3D Shape Recognition + + +
+ Large-scale pre-trained models have demonstrated impressive performance in +vision and language tasks within open-world scenarios. Due to the lack of +comparable pre-trained models for 3D shapes, recent methods utilize +language-image pre-training to realize zero-shot 3D shape recognition. However, +due to the modality gap, pre-trained language-image models are not confident +enough when generalizing to 3D shape recognition. Consequently, this paper +aims to improve confidence with view selection and hierarchical prompts. +Leveraging the CLIP model as an example, we employ view selection on the vision +side by identifying views with high prediction confidence from multiple +rendered views of a 3D shape. On the textual side, the strategy of hierarchical +prompts is proposed for the first time. The first layer proposes several +classification candidates using traditional class-level descriptions, while the +second layer refines the prediction based on function-level descriptions or +further distinctions between the candidates. Remarkably, without the need for +additional training, our proposed method achieves impressive zero-shot 3D +classification accuracies of 84.44%, 91.51%, and 66.17% on ModelNet40, +ModelNet10, and ShapeNet Core55, respectively. Furthermore, we will make the +code publicly available to facilitate reproducibility and further research in +this area. +
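+ A minimal sketch of the view-selection idea, using the Hugging Face CLIP API: score every rendered view zero-shot, keep the views whose top-1 softmax confidence is highest, and fuse their logits by averaging. The prompt template, number of kept views, and averaging rule are assumptions; the paper's hierarchical prompting is not reproduced here.

```python
# Hedged sketch of confidence-based view selection with off-the-shelf CLIP.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def classify_shape(view_paths, class_names, keep_k=4):
    """Zero-shot shape recognition from multiple rendered views.

    Keeps only the views with the highest top-1 softmax confidence,
    then averages their image-text logits."""
    images = [Image.open(p) for p in view_paths]
    prompts = [f"a 3D rendering of a {c}" for c in class_names]  # assumed template
    inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits_per_image       # (num_views, num_classes)
    probs = logits.softmax(dim=-1)
    confidence = probs.max(dim=-1).values               # top-1 confidence per view
    keep_k = min(keep_k, len(images))
    keep = confidence.topk(keep_k).indices              # select the confident views
    fused = logits[keep].mean(dim=0)                    # fuse selected views
    return class_names[fused.argmax().item()]
```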
+
+
+
+
+ + ♻ ☆ Dual Modalities of Text: Visual and Textual Generative Pre-training + + +
+ Harnessing visual texts represents a burgeoning frontier in the evolution of +language modeling. In this paper, we introduce a novel pre-training framework +for a suite of pixel-based autoregressive language models, pre-training on a +corpus of over 400 million documents rendered as RGB images. Our approach is +characterized by a dual-modality training regimen, engaging both visual data +through next patch prediction with a regression head and textual data via next +token prediction with a classification head. This study is particularly focused +on investigating the synergistic interplay between visual and textual +modalities of language. Our comprehensive evaluation across a diverse array of +benchmarks reveals that the confluence of visual and textual data substantially +augments the efficacy of pixel-based language models. Notably, our findings +show that a unidirectional pixel-based model, devoid of textual data during +training, can match the performance levels of advanced bidirectional +pixel-based models on various language understanding benchmarks. This work +highlights the considerable untapped potential of integrating visual and +textual information for language modeling purposes. We will release our code, +data, and checkpoints to inspire further research advancement. + +
+
+
+
+
+ + ♻ ☆ 3D Face Reconstruction with the Geometric Guidance of Facial Part + Segmentation CVPR2024 + + +
+ 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in +various applications. However, existing methods struggle to reconstruct faces +with extreme expressions due to deficiencies in supervisory signals, such as +sparse or inaccurate landmarks. Segmentation information contains effective +geometric contexts for face reconstruction. Certain attempts intuitively depend +on differentiable renderers to compare the rendered silhouettes of +reconstruction with segmentation, which is prone to issues like local optima +and gradient instability. In this paper, we fully utilize the facial part +segmentation geometry by introducing Part Re-projection Distance Loss (PRDL). +Specifically, PRDL transforms facial part segmentation into 2D points and +re-projects the reconstruction onto the image plane. Subsequently, by +introducing grid anchors and computing different statistical distances from +these anchors to the point sets, PRDL establishes geometry descriptors to +optimize the distribution of the point sets for face reconstruction. PRDL +exhibits a clear gradient compared to the renderer-based methods and presents +state-of-the-art reconstruction performance in extensive quantitative and +qualitative experiments. Our project is available at +https://github.com/wang-zidu/3DDFA-V3 . + +
+
+ comment: CVPR2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ KDAS: Knowledge Distillation via Attention Supervision Framework for + Polyp Segmentation + + +
+ Polyp segmentation, a challenging task in medical imaging, has seen numerous +proposed methods aimed at improving the quality of segmented masks. While +current state-of-the-art techniques yield impressive results, the size and +computational cost of these models create challenges for practical industry +applications. To address this challenge, we present KDAS, a Knowledge +Distillation framework that incorporates attention supervision, and our +proposed Symmetrical Guiding Module. This framework is designed to facilitate a +compact student model with fewer parameters, allowing it to learn the strengths +of the teacher model and mitigate the inconsistency between teacher features +and student features, a common challenge in Knowledge Distillation, via the +Symmetrical Guiding Module. Through extensive experiments, our compact models +demonstrate their strength by achieving competitive results with +state-of-the-art methods, offering a promising approach to creating compact, +high-accuracy models for polyp segmentation and the broader medical imaging +field. The implementation is available at https://github.com/huyquoctrinh/KDAS. +
+
+
+
+
+ + ♻ ☆ ConsistencyDet: A Robust Object Detector with a Denoising Paradigm of + Consistency Model + + +
+ Object detection, a quintessential task in the realm of perceptual computing, +can be tackled using a generative methodology. In the present study, we +introduce a novel framework designed to articulate object detection as a +denoising diffusion process, which operates on the perturbed bounding boxes of +annotated entities. This framework, termed ConsistencyDet, leverages an +innovative denoising concept known as the Consistency Model. The hallmark of +this model is its self-consistency feature, which empowers the model to map +distorted information from any temporal stage back to its pristine state, +thereby realizing a "one-step denoising" mechanism. Such an attribute markedly +elevates the operational efficiency of the model, setting it apart from the +conventional Diffusion Model. Throughout the training phase, ConsistencyDet +initiates the diffusion sequence with noise-infused boxes derived from the +ground-truth annotations and conditions the model to perform the denoising +task. Subsequently, in the inference stage, the model employs a denoising +sampling strategy that commences with bounding boxes randomly sampled from a +normal distribution. Through iterative refinement, the model transforms an +assortment of arbitrarily generated boxes into definitive detections. +Comprehensive evaluations employing standard benchmarks, such as MS-COCO and +LVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in +performance metrics. Our code is available at +https://github.com/Tankowa/ConsistencyDet. + +
+
+
+
+
+ + ♻ ☆ Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision + Transformers + + +
+ Few-shot knowledge distillation recently emerged as a viable approach to +harness the knowledge of large-scale pre-trained models, using limited data and +computational resources. In this paper, we propose a novel few-shot feature +distillation approach for vision transformers. Our approach is based on two key +steps. Leveraging the fact that vision transformers have a consistent +depth-wise structure, we first copy the weights from intermittent layers of +existing pre-trained vision transformers (teachers) into shallower +architectures (students), where the intermittence factor controls the +complexity of the student transformer with respect to its teacher. Next, we +employ an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge +into the student in a few-shot scenario, aiming to recover the information +processing carried out by the skipped teacher layers. We present comprehensive +experiments with supervised and self-supervised transformers as teachers, on +five data sets from various domains, including natural, medical and satellite +images. The empirical results confirm the superiority of our approach over +competitive baselines. Moreover, the ablation results demonstrate the +usefulness of each component of the proposed pipeline. + +
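+ The weight-copy step can be pictured with the hedged sketch below, which builds a shallower student by keeping every k-th transformer block of a timm ViT teacher. The intermittence factor k, the set of non-block weights copied, and the omission of the LoRA-based few-shot distillation stage are simplifications for illustration, not the paper's exact procedure.

```python
# Hedged sketch: shallower ViT student via intermittent weight copying (timm layout).
import copy
import timm
import torch.nn as nn

def make_student_from_teacher(teacher_name="vit_base_patch16_224", k=2):
    teacher = timm.create_model(teacher_name, pretrained=True)
    student = timm.create_model(teacher_name, pretrained=False)
    # Keep every k-th block of the teacher; the student ends up roughly len/k deep.
    kept = [copy.deepcopy(block) for block in list(teacher.blocks)[::k]]
    student.blocks = nn.Sequential(*kept)
    # Copy the remaining (non-block) weights directly from the teacher.
    student.patch_embed.load_state_dict(teacher.patch_embed.state_dict())
    student.norm.load_state_dict(teacher.norm.state_dict())
    student.head.load_state_dict(teacher.head.state_dict())
    student.cls_token.data.copy_(teacher.cls_token.data)
    student.pos_embed.data.copy_(teacher.pos_embed.data)
    return teacher, student

teacher, student = make_student_from_teacher(k=2)
# The student would then be fine-tuned with a LoRA-style adapter on the few-shot
# data to recover the processing of the skipped teacher layers.
```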
+
+
+
+
+ + ♻ ☆ Low-light Image Enhancement via CLIP-Fourier Guided Wavelet Diffusion + + +
+ Low-light image enhancement techniques have significantly progressed, but +unstable image quality recovery and unsatisfactory visual perception are still +significant challenges. To solve these problems, we propose a novel and robust +low-light image enhancement method via CLIP-Fourier Guided Wavelet Diffusion, +abbreviated as CFWD. Specifically, CFWD leverages multimodal visual-language +information in the frequency domain space created by multiple wavelet +transforms to guide the enhancement process. Multi-scale supervision across +different modalities facilitates the alignment of image features with semantic +features during the wavelet diffusion process, effectively bridging the gap +between degraded and normal domains. Moreover, to further promote the effective +recovery of the image details, we combine the Fourier transform based on the +wavelet transform and construct a Hybrid High Frequency Perception Module +(HFPM) with a significant perception of the detailed features. This module +avoids the diversity confusion of the wavelet diffusion process by guiding the +fine-grained structure recovery of the enhancement results to achieve +favourable metric and perceptually oriented enhancement. Extensive quantitative +and qualitative experiments on publicly available real-world benchmarks show +that our approach outperforms existing state-of-the-art methods, achieving +significant progress in image quality and noise suppression. The project code +is available at https://github.com/hejh8/CFWD. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models Meet Remote Sensing: Principles, Methods, and + Perspectives + + +
+ As a newly emerging advance in deep generative models, diffusion models have +achieved state-of-the-art results in many fields, including computer vision, +natural language processing, and molecule design. The remote sensing community +has also noticed the powerful ability of diffusion models and quickly applied +them to a variety of tasks for image processing. Given the rapid increase in +research on diffusion models in the field of remote sensing, it is necessary to +conduct a comprehensive review of existing diffusion model-based remote sensing +papers, to help researchers recognize the potential of diffusion models and +provide some directions for further exploration. Specifically, this paper first +introduces the theoretical background of diffusion models, and then +systematically reviews the applications of diffusion models in remote sensing, +including image generation, enhancement, and interpretation. Finally, the +limitations of existing remote sensing diffusion models and worthy research +directions for further exploration are discussed and summarized. + +
+
+
+
+
+ + ♻ ☆ Representation Alignment Contrastive Regularization for Multi-Object + Tracking + + +
+ Achieving high performance in multi-object tracking heavily relies +on modeling spatio-temporal relationships during the data association stage. +Mainstream approaches encompass rule-based and deep learning-based methods for +spatio-temporal relationship modeling. While the former relies on physical +motion laws, offering wider applicability but yielding suboptimal results for +complex object movements, the latter, though achieving high performance, lacks +interpretability and involves complex module designs. This work aims to +simplify deep learning-based spatio-temporal relationship models and introduce +interpretability into features for data association. Specifically, a +lightweight single-layer transformer encoder is utilized to model +spatio-temporal relationships. To make features more interpretable, two +contrastive regularization losses based on representation alignment are +proposed, derived from spatio-temporal consistency rules. By applying weighted +summation to affinity matrices, the aligned features can seamlessly integrate +into the data association stage of the original tracking workflow. Experimental +results showcase that our model enhances the majority of existing tracking +networks' performance without excessive complexity, with minimal increase in +training overhead and nearly negligible computational and storage costs. +
+
+
+
+
+ + ♻ ☆ OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable + Diffusion Model + + +
+ Omnidirectional images (ODIs) are commonly used in real-world visual tasks, +and high-resolution ODIs help improve the performance of related visual tasks. +Most existing super-resolution methods for ODIs use end-to-end learning +strategies, resulting in inferior realness of generated images and a lack of +effective out-of-domain generalization capabilities in training methods. Image +generation methods represented by diffusion models provide strong priors for +visual tasks and have been proven to be effectively applied to image +restoration tasks. Leveraging the image priors of the Stable Diffusion (SD) +model, we achieve omnidirectional image super-resolution with both fidelity and +realness, dubbed OmniSSR. Firstly, we transform the equirectangular +projection (ERP) images into tangent projection (TP) images, whose distribution +approximates the planar image domain. Then, we use SD to iteratively sample +initial high-resolution results. At each denoising iteration, we further +correct and update the initial results using the proposed Octadecaplex Tangent +Information Interaction (OTII) and Gradient Decomposition (GD) technique to +ensure better consistency. Finally, the TP images are transformed back to +obtain the final high-resolution results. Our method is zero-shot, requiring no +training or fine-tuning. Experiments on two benchmark datasets +demonstrate the effectiveness of our method. +
+
+
+
+
+ + ♻ ☆ Retina : Low-Power Eye Tracking with Event Camera and Spiking Hardware + + +
+ This paper introduces a neuromorphic methodology for eye tracking, harnessing +pure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework +integrates a directly trained Spiking Neural Network (SNN) regression model and +leverages a state-of-the-art low-power edge neuromorphic processor, Speck, +collectively aiming to advance the precision and efficiency of eye-tracking +systems. First, we introduce a representative event-based eye-tracking dataset, +"Ini-30", which was collected with two glass-mounted DVS cameras from thirty +volunteers. Then, an SNN model based on Integrate-and-Fire (IAF) neurons, named +"Retina", is described, featuring only 64k parameters (6.63x fewer than the +latest) and achieving a pupil tracking error of only 3.24 pixels on a 64x64 DVS +input. The continuous regression output is obtained by means of convolution +using a non-spiking temporal 1D filter slid across the output spiking layer. +Finally, we evaluate Retina on the neuromorphic processor, showing an +end-to-end power consumption between 2.89 and 4.8 mW and a latency of 5.57-8.01 ms depending on +the time window. We also benchmark our model against the latest event-based +eye-tracking method, "3ET", which was built upon event frames. Results show +that Retina achieves superior precision with 1.24px less pupil centroid error +and reduced computational complexity with 35 times fewer MAC operations. We +hope this work will open avenues for further investigation of closed-loop +neuromorphic solutions and true event-based training pursuing edge performance. +
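+ A hedged sketch of the continuous readout described above: a non-spiking, depthwise 1D convolution slid over the output spike train to produce smooth pupil coordinates. The kernel size, causal padding, and two-channel output are illustrative assumptions, not the exact Retina readout.

```python
# Hedged sketch of a non-spiking temporal 1D readout over a spike train.
import torch
import torch.nn as nn

class TemporalReadout(nn.Module):
    def __init__(self, channels=2, kernel_size=8):
        super().__init__()
        # One depthwise filter per output coordinate (x, y), slid over time.
        self.filter = nn.Conv1d(channels, channels, kernel_size,
                                groups=channels, padding=kernel_size - 1)

    def forward(self, spikes):
        """spikes: (B, T, 2) outputs of the final spiking layer over T steps."""
        x = spikes.transpose(1, 2)                     # (B, 2, T) for Conv1d
        y = self.filter(x)[..., :spikes.shape[1]]      # crop to keep the filter causal
        return y.transpose(1, 2)                       # continuous (B, T, 2) pupil estimate

readout = TemporalReadout()
pupil_xy = readout(torch.randint(0, 2, (1, 100, 2)).float())  # toy spike train
```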
+
+
+
+
+ + ♻ ☆ Digging into contrastive learning for robust depth estimation with + diffusion models + + +
+ Recently, diffusion-based depth estimation methods have drawn widespread +attention due to their elegant denoising patterns and promising performance. +However, they are typically unreliable under adverse conditions prevalent in +real-world scenarios, such as rain and snow. In this paper, we propose a +novel robust depth estimation method called D4RD, featuring a custom +contrastive learning mode tailored for diffusion models to mitigate performance +degradation in complex environments. Concretely, we integrate the strength of +knowledge distillation into contrastive learning, building the "trinity" +contrastive scheme. This scheme utilizes the sampled noise of the forward +diffusion process as a natural reference, guiding the predicted noise in +diverse scenes toward a more stable and precise optimum. Moreover, we extend the +noise-level trinity to encompass more generic feature and image levels, +establishing a multi-level contrast to distribute the burden of robust +perception across the overall network. Before addressing complex scenarios, we +enhance the stability of the baseline diffusion model with three +straightforward yet effective improvements, which facilitate convergence and +remove depth outliers. Extensive experiments demonstrate that D4RD surpasses +existing state-of-the-art solutions on synthetic corruption datasets and +real-world weather conditions. The code for D4RD will be made available for +further exploration and adoption. +
+
+ comment: 8 pages,6 figures +
+
+
+
+
+ + ♻ ☆ The All-Seeing Project V2: Towards General Relation Comprehension of the + Open World + + +
+ We present the All-Seeing Project V2: a new model and dataset designed for +understanding object relations in images. Specifically, we propose the +All-Seeing Model V2 (ASMv2) that integrates the formulation of text generation, +object localization, and relation comprehension into a relation conversation +(ReC) task. Leveraging this unified task, our model excels not only in +perceiving and recognizing all objects within the image but also in grasping +the intricate relation graph between them, diminishing the relation +hallucination often encountered by Multi-modal Large Language Models (MLLMs). +To facilitate training and evaluation of MLLMs in relation understanding, we +created the first high-quality ReC dataset (AS-V2), which is aligned with the +format of standard instruction tuning data. In addition, we design a new +benchmark, termed Circular-based Relation Probing Evaluation (CRPE), for +comprehensively evaluating the relation comprehension capabilities of MLLMs. +Notably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware +benchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that +our work can inspire more future research and contribute to the evolution +towards artificial general intelligence. Our project is released at +https://github.com/OpenGVLab/all-seeing. +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have +markedly enhanced the explanations of black box classifier models. These +methods operate either through post-analysis or by integrating concept learning +during model training. Although effective in bridging the semantic gap +between a model's latent space and human interpretation, these explanation +methods only partially reveal the model's decision-making process. The outcome +is typically limited to high-level semantics derived from the last feature map. +We argue that explanations lacking insights into the decision processes at +low and mid-level features are neither fully faithful nor useful. Addressing +this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet), +an inherently interpretable model. MCPNet autonomously learns meaningful +concept prototypes across multiple feature map levels using Centered Kernel +Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so +without reliance on predefined concept labels. Further, we propose a novel +classifier paradigm that learns and aligns multi-level concept prototype +distributions for classification purposes via Class-aware Concept Distribution +(CCD) loss. Our experiments reveal that the proposed MCPNet, while being +adaptable to various model architectures, offers comprehensive multi-level +explanations while maintaining classification accuracy. Additionally, its +concept distribution-based classification approach shows improved +generalization capabilities in few-shot classification scenarios. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Anomaly Detection with Budget Annotation Using Semi-Supervised + Residual Transformer + + +
+ Anomaly Detection is challenging as usually only the normal samples are seen +during training and the detector needs to discover anomalies on-the-fly. The +recently proposed deep-learning-based approaches could somehow alleviate the +problem but there is still a long way to go in obtaining an industrial-class +anomaly detector for real-world applications. On the other hand, in some +particular AD tasks, a few anomalous samples are labeled manually for achieving +higher accuracy. However, this performance gain is at the cost of considerable +annotation efforts, which can be intractable in many practical scenarios. + In this work, the above two problems are addressed in a unified framework. +Firstly, inspired by the success of the patch-matching-based AD algorithms, we +train a sliding vision transformer over the residuals generated by a novel +position-constrained patch-matching. Secondly, the conventional pixel-wise +segmentation problem is cast into a block-wise classification problem. Thus the +sliding transformer can attain even higher accuracy with much less annotation +labor. Thirdly, to further reduce the labeling cost, we propose to label the +anomalous regions using only bounding boxes. The unlabeled regions caused by +the weak labels are effectively exploited using a highly-customized +semi-supervised learning scheme equipped with two novel data augmentation +methods. The proposed method outperforms all the state-of-the-art approaches +using all the evaluation metrics in both the unsupervised and supervised +scenarios. On the popular MVTec-AD dataset, our SemiREST algorithm obtains the +Average Precision (AP) of 81.2% in the unsupervised condition and 84.4% AP for +supervised anomaly detection. Surprisingly, with the bounding-box-based +semi-supervisions, SemiREST still outperforms the SOTA methods with full +supervision (83.8% AP) on MVTec-AD. + +
+
+ comment: 20 pages,6 figures +
+
+
+
+
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) +have taken the world by storm with impressive abilities in complex reasoning +and linguistic comprehension. While there is a plethora of work on +Vietnamese Large Language Models, the lack of high-quality multimodal resources +limits the progress of Vietnamese MLLMs. In this paper, we +pioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese +MLLM, and we also introduce LaVy-Bench, a benchmark designated for evaluating +MLLMs' understanding of Vietnamese visual language tasks. Our project is +public at https://github.com/baochi0212/LaVy +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ GBSD: Generative Bokeh with Stage Diffusion ICASSP + + +
+ The bokeh effect is an artistic technique that blurs out-of-focus areas in a +photograph and has gained interest due to recent developments in text-to-image +synthesis and the ubiquity of smartphone cameras and photo-sharing apps. Prior +work on rendering bokeh effects has focused on post hoc image manipulation to +produce similar blurring effects in existing photographs using classical +computer graphics or neural rendering techniques, but either suffers from depth +discontinuity artifacts or is restricted to reproducing bokeh effects that are +present in the training data. More recent diffusion-based models can synthesize +images with an artistic style, but either require the generation of +high-dimensional masks, expensive fine-tuning, or affect global image +characteristics. In this paper, we present GBSD, the first generative +text-to-image model that synthesizes photorealistic images with a bokeh style. +Motivated by how image synthesis occurs progressively in diffusion models, our +approach combines latent diffusion models with a 2-stage conditioning algorithm +to render bokeh effects on semantically defined objects. Since we can focus the +effect on objects, this semantic bokeh effect is more versatile than classical +rendering techniques. We evaluate GBSD both quantitatively and qualitatively +and demonstrate its ability to be applied in both text-to-image and +image-to-image settings. +
+
+ comment: Short Version is accepted by International Conference on Acoustics, + Speech, and Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ♻ ☆ RoboFusion: Towards Robust Multi-Modal 3D Object Detection via SAM + + +
+ Multi-modal 3D object detectors are dedicated to exploring secure and +reliable perception systems for autonomous driving (AD). However, while +achieving state-of-the-art (SOTA) performance on clean benchmark datasets, they +tend to overlook the complexity and harsh conditions of real-world +environments. Meanwhile, with the emergence of visual foundation models (VFMs), +opportunities and challenges are presented for improving the robustness and +generalization of multi-modal 3D object detection in autonomous driving. +Therefore, we propose RoboFusion, a robust framework that leverages VFMs like +SAM to tackle out-of-distribution (OOD) noise scenarios. We first adapt the +original SAM to autonomous driving scenarios, yielding SAM-AD. To align SAM or +SAM-AD with multi-modal methods, we then introduce AD-FPN for upsampling the +image features extracted by SAM. We employ wavelet decomposition to denoise the +depth-guided images, further reducing noise and weather interference. +Lastly, we employ self-attention mechanisms to adaptively reweight the fused +features, enhancing informative features while suppressing excess noise. In +summary, our RoboFusion gradually reduces noise by leveraging the +generalization and robustness of VFMs, thereby enhancing the resilience of +multi-modal 3D object detection. Consequently, our RoboFusion achieves +state-of-the-art performance in noisy scenarios, as demonstrated by the KITTI-C +and nuScenes-C benchmarks. +
+
+
+
+
+ + ♻ ☆ Transformer-based Multimodal Change Detection with Multitask Consistency + Constraints + + +
+ Change detection plays a fundamental role in Earth observation for analyzing +temporal variations over time. However, recent studies have largely neglected +the utilization of multimodal data that presents significant practical and +technical advantages compared to single-modal approaches. This research focuses +on leveraging pre-event digital surface model (DSM) data and post-event +digital aerial images captured at different times for detecting change beyond +2D. We observe that the current change detection methods struggle with the +multitask conflicts between semantic and height change detection tasks. To +address this challenge, we propose an efficient Transformer-based network that +learns shared representation between cross-dimensional inputs through +cross-attention. It adopts a consistency constraint to establish the +multimodal relationship. Initially, pseudo-changes are derived by employing +height change thresholding. Subsequently, the $L_2$ distance between semantic +and pseudo-changes within their overlapping regions is minimized. This +explicitly endows the height change detection (regression task) and semantic +change detection (classification task) with representation consistency. A +DSM-to-image multimodal dataset encompassing three cities in the Netherlands +was constructed. It lays a new foundation for beyond-2D change detection from +cross-dimensional inputs. Compared to five state-of-the-art change detection +methods, our model demonstrates consistent multitask superiority in terms of +semantic and height change detection. Furthermore, the consistency strategy can +be seamlessly adapted to other methods, yielding promising improvements. +
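+ One way to read the consistency constraint is sketched below: threshold the predicted height change to obtain pseudo-changes, then minimize an L2 penalty between them and the semantic change probabilities on their overlapping regions. The threshold value, masking rule, and normalization are assumptions rather than the paper's exact loss.

```python
# Hedged sketch of a height/semantic consistency penalty (illustrative only).
import torch

def consistency_loss(height_change, semantic_change_logits, tau=0.5):
    """height_change: (B, 1, H, W) regression output in metres (assumed);
    semantic_change_logits: (B, 1, H, W) change/no-change logits."""
    pseudo = (height_change.abs() > tau).float()           # pseudo-changes from thresholding
    prob = torch.sigmoid(semantic_change_logits)           # semantic change probabilities
    overlap = pseudo * (prob > 0.5).float()                # regions both heads flag as changed
    squared_err = (prob - pseudo) ** 2
    return (squared_err * overlap).sum() / overlap.sum().clamp(min=1.0)
```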
+
+
+
+
+ + ♻ ☆ SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions + + +
+ Recent advancements in diffusion models have positioned them at the forefront +of image generation. Despite their superior performance, diffusion models are +not without drawbacks; they are characterized by complex architectures and +substantial computational demands, resulting in significant latency due to +their iterative sampling process. To mitigate these limitations, we introduce a +dual approach involving model miniaturization and a reduction in sampling +steps, aimed at significantly decreasing model latency. Our methodology +leverages knowledge distillation to streamline the U-Net and image decoder +architectures, and introduces an innovative one-step DM training technique that +utilizes feature matching and score distillation. We present two models, +SDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS +(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU, +respectively. Moreover, our training approach offers promising applications in +image-conditioned control, facilitating efficient image-to-image translation. + +
+
+
+
+
+ + ♻ ☆ Optimization of Prompt Learning via Multi-Knowledge Representation for + Vision-Language Models + + +
+ Vision-Language Models (VLMs), such as CLIP, play a foundational role in +various cross-modal applications. To fully leverage VLMs' potential in adapting +to downstream tasks, context optimization methods like Prompt Tuning are +essential. However, one key limitation is the lack of diversity in prompt +templates, whether they are hand-crafted or learned through additional modules. +This limitation restricts the capabilities of pretrained VLMs and can result in +incorrect predictions in downstream tasks. To address this challenge, we +propose Context Optimization with Multi-Knowledge Representation (CoKnow), a +framework that enhances Prompt Learning for VLMs with rich contextual +knowledge. To facilitate CoKnow during inference, we trained lightweight +semantic knowledge mappers, which are capable of generating Multi-Knowledge +Representation for an input image without requiring additional priors. We +conducted extensive experiments on 11 publicly available +datasets, demonstrating that CoKnow outperforms a series of previous methods. +We will make all resources open-source: https://github.com/EMZucas/CoKnow. +
+
+
+
+
+ + ♻ ☆ Kinematics Modeling Network for Video-based Human Pose Estimation + + +
+ Estimating human poses from videos is critical in human-computer interaction. +Joints cooperate rather than move independently during human movement. There +are both spatial and temporal correlations between joints. Despite the positive +results of previous approaches, most focus on modeling the spatial correlation +between joints while only straightforwardly integrating features along the +temporal dimension, ignoring the temporal correlation between joints. In this +work, we propose a plug-and-play kinematics modeling module (KMM) to explicitly +model temporal correlations between joints across different frames by +calculating their temporal similarity. In this way, KMM can capture motion cues +of the current joint relative to all joints at different times. Besides, we +formulate video-based human pose estimation as a Markov Decision Process and +design a novel kinematics modeling network (KIMNet) to simulate the Markov +Chain, allowing KIMNet to locate joints recursively. Our approach achieves +state-of-the-art results on two challenging benchmarks. In particular, KIMNet +shows robustness to occlusion. The code will be released at +https://github.com/YHDang/KIMNet. +
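+ A hedged sketch of the kinematics-modeling idea: compute temporal similarity between each joint's current feature and all joint features from previous frames, then use the resulting weights to aggregate motion cues. Tensor shapes and the scaled dot-product similarity are illustrative assumptions, not the exact KMM design.

```python
# Hedged sketch of temporal-similarity-based motion cue aggregation.
import torch
import torch.nn.functional as F

def kinematic_attention(curr, past):
    """curr: (B, J, C) joint features at the current frame.
    past: (B, T, J, C) joint features from T previous frames."""
    B, T, J, C = past.shape
    past_flat = past.reshape(B, T * J, C)
    sim = torch.einsum("bjc,bkc->bjk", curr, past_flat) / C ** 0.5
    weights = F.softmax(sim, dim=-1)                 # temporal similarity weights
    motion_cues = torch.einsum("bjk,bkc->bjc", weights, past_flat)
    return curr + motion_cues                        # fuse cues into current features

fused = kinematic_attention(torch.randn(2, 17, 64), torch.randn(2, 4, 17, 64))
```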
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used +imaging modality in broad clinical applications. One of the most important +tradeoffs in PET imaging is between image quality and radiation dose: high +image quality comes with high radiation exposure. Improving image quality is +desirable for all clinical applications while minimizing radiation exposure is +needed to reduce risk to patients. Approach: We introduce PET Consistency Model +(PET-CM), an efficient diffusion-based method for generating high-quality +full-dose PET images from low-dose PET images. It employs a two-step process, +adding Gaussian noise to full-dose PET images in the forward diffusion, and +then denoising them using a PET Shifted-window Vision Transformer (PET-VIT) +network in the reverse diffusion. The PET-VIT network learns a consistency +function that enables direct denoising of Gaussian noise into clean full-dose +PET images. PET-CM achieves state-of-the-art image quality while requiring +significantly less computation time than other methods. Results: In experiments +comparing eighth-dose to full-dose images, PET-CM demonstrated impressive +performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of +0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of +0.255+/-0.318%, with an average generation time of 62 seconds per patient. This +is a significant improvement compared to the state-of-the-art diffusion-based +model, with PET-CM reaching this result 12x faster. Similarly, in the +quarter-dose to full-dose image experiments, PET-CM delivered competitive +outcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM +of 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of +0.151+/-0.192% using the same generation process, underlining its high +quantitative and clinical precision in both denoising scenarios. +
+
+
+
+
+ + ♻ ☆ Vision Augmentation Prediction Autoencoder with Attention Design + (VAPAAD) + + +
+ Recent advancements in sequence prediction have significantly improved the +accuracy of video data interpretation; however, existing models often overlook +the potential of attention-based mechanisms for next-frame prediction. This +study introduces the Vision Augmentation Prediction Autoencoder with Attention +Design (VAPAAD), an innovative approach that integrates attention mechanisms +into sequence prediction, enabling nuanced analysis and understanding of +temporal dynamics in video sequences. Utilizing the Moving MNIST dataset, we +demonstrate VAPAAD's robust performance and superior handling of complex +temporal data compared to traditional methods. VAPAAD combines data +augmentation, ConvLSTM2D layers, and a custom-built self-attention mechanism to +effectively focus on salient features within a sequence, enhancing predictive +accuracy and context-aware analysis. This methodology not only adheres to human +cognitive processes during video interpretation but also addresses limitations +in conventional models, which often struggle with the variability inherent in +video sequences. The experimental results confirm that VAPAAD outperforms +existing models, especially in integrating attention mechanisms, which +significantly improve predictive performance. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Alpha Invariance: On Inverse Scaling Between Distance and Volume Density + in Neural Radiance Fields CVPR 2024 + + +
+ Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of +volumetric densities in neural radiance fields, i.e., the densities double when +scene size is halved, and vice versa. We call this property alpha invariance. +For NeRFs to better maintain alpha invariance, we recommend 1) parameterizing +both distance and volume densities in log space, and 2) a +discretization-agnostic initialization strategy to guarantee high ray +transmittance. We revisit a few popular radiance field models and find that +these systems use various heuristics to deal with issues arising from scene +scaling. We test their behaviors and show our recipe to be more robust. + +
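+ The invariance can be checked numerically: per-sample opacity under the standard compositing rule is alpha = 1 - exp(-sigma * delta), so halving the scene (and thus delta) while doubling the density sigma leaves alpha unchanged, and the log-space parameterization recommended above turns this rescaling into a simple additive shift. A tiny sketch under these standard assumptions:

```python
# Numerical check of alpha invariance with log-space parameterization.
import torch

def alpha(log_sigma, log_delta):
    # alpha = 1 - exp(-sigma * delta), computed from log-parameterized inputs
    return 1.0 - torch.exp(-torch.exp(log_sigma + log_delta))

log_sigma = torch.tensor(1.5)
log_delta = torch.tensor(-2.0)
shift = torch.log(torch.tensor(0.5))                 # halve the scene size ...
print(alpha(log_sigma, log_delta))
print(alpha(log_sigma - shift, log_delta + shift))   # ... and double the density: same alpha
```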
+
+ comment: CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance +
+
+
+
+
+ + ♻ ☆ Iterated Learning Improves Compositionality in Large Vision-Language + Models CVPR 2024 + + +
+ A fundamental characteristic common to both human vision and natural language +is their compositional nature. Yet, despite the performance gains contributed +by large vision and language pretraining, recent investigations find that +most, if not all, of our state-of-the-art vision-language models struggle at +compositionality. They are unable to distinguish between images of "a girl in +white facing a man in black" and "a girl in black facing a man in white". +Moreover, prior work suggests that compositionality doesn't arise with scale: +larger model sizes or training data don't help. This paper develops a new +iterated training algorithm that incentivizes compositionality. We draw on +decades of cognitive science research that identifies cultural transmission, the +need to teach a new generation, as a necessary inductive prior that incentivizes +humans to develop compositional languages. Specifically, we reframe +vision-language contrastive learning as the Lewis Signaling Game between a +vision agent and a language agent, and operationalize cultural transmission by +iteratively resetting one of the agents' weights during training. After every +iteration, this training paradigm induces representations that become "easier +to learn", a property of compositional languages: e.g. our model trained on +CC3M and CC12M improves standard CLIP by 4.7% and 4.0%, respectively, on the +SugarCrepe benchmark. +
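+ A hedged sketch of the iterated-learning loop: train the two agents with a standard CLIP-style contrastive loss, and at the start of each generation re-initialize one agent (here the language agent) so it must be re-taught. The encoder interfaces, reset schedule, and optimizer settings are illustrative assumptions, not the paper's exact recipe.

```python
# Hedged sketch of iterated contrastive training with periodic agent resets.
import torch
import torch.nn.functional as F

def clip_loss(img_emb, txt_emb, temperature=0.07):
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.t() / temperature
    targets = torch.arange(logits.size(0), device=logits.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

def iterated_training(vision_agent, language_agent, loader,
                      generations=5, steps_per_generation=10_000, lr=1e-4):
    for _ in range(generations):
        # "Cultural transmission": a fresh language agent must be re-taught.
        for module in language_agent.modules():
            if hasattr(module, "reset_parameters"):
                module.reset_parameters()
        opt = torch.optim.AdamW(
            list(vision_agent.parameters()) + list(language_agent.parameters()), lr=lr)
        for _, (images, tokens) in zip(range(steps_per_generation), loader):
            loss = clip_loss(vision_agent(images), language_agent(tokens))
            opt.zero_grad()
            loss.backward()
            opt.step()
```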
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Score Sign Language with Two-stage Method + + +
+ Human action recognition and performance assessment have been hot research +topics in recent years. Recognition problems have mature solutions in the field +of sign language, but past research in performance analysis has focused on +competitive sports and medical training, overlooking scoring assessment, +which is an important part of digitalizing sign language teaching. In this +paper, we analyze the existing technologies for performance assessment and +adopt methods that perform well in human pose reconstruction tasks, combined +with embedded motion-rotation representations, proposing a two-stage sign language +performance evaluation pipeline. Our analysis shows that choosing +reconstruction tasks in the first stage can provide more expressive features, +and using smoothing methods can provide an effective reference for assessment. +Experiments show that our method provides good score feedback mechanisms and +high consistency with professional assessments compared to end-to-end +evaluations. +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Runner re-identification from single-view running video in the + open-world setting + + +
+ In many sports, player re-identification is crucial for automatic video +processing and analysis. However, most of the current studies on player +re-identification in multi- or single-view sports videos focus on +re-identification in the closed-world setting using labeled image datasets, and +player re-identification in the open-world setting for automatic video analysis +is not well developed. In this paper, we propose a runner re-identification +system that directly processes single-view video to address the open-world +setting, where no labeled dataset is available and the video must be +processed directly. The proposed system automatically processes raw video +as input to identify runners, and it can identify runners even when they leave the +frame multiple times. For the automatic processing, we first detect the +runners in the video using the pre-trained YOLOv8 and the fine-tuned +EfficientNet. We then track the runners using ByteTrack and detect their shoes +with the fine-tuned YOLOv8. Finally, we extract the image features of the +runners using an unsupervised method with a gated recurrent unit autoencoder +and a mixture of global and local features. To improve the accuracy of runner +re-identification, we use shoe images as local image features and dynamic +features of running sequence images. We evaluated the system on a running +practice video dataset and showed that the proposed method identified runners +with higher accuracy than some state-of-the-art models in unsupervised +re-identification. We also showed that our proposed local image feature and +running dynamic feature were effective for runner re-identification. Our runner +re-identification system can be useful for the automatic analysis of running +videos. +
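+ The detection-and-tracking front end of such a pipeline can be assembled from off-the-shelf components; the hedged sketch below uses Ultralytics YOLOv8 with its built-in ByteTrack tracker on a video file. The model weights, video path, and downstream hand-off are placeholders, and the paper's fine-tuned detectors and shoe/appearance features are not reproduced.

```python
# Hedged sketch of the detection + tracking stage with YOLOv8 + ByteTrack.
from collections import defaultdict
from ultralytics import YOLO

detector = YOLO("yolov8n.pt")              # off-the-shelf person detector
frames_seen = defaultdict(int)             # track id -> number of frames observed

for result in detector.track(
    source="running_practice.mp4",         # hypothetical input video
    tracker="bytetrack.yaml",              # ByteTrack association
    classes=[0],                           # keep only the 'person' class
    stream=True,
):
    if result.boxes.id is None:            # no tracked runners in this frame
        continue
    for track_id in result.boxes.id.int().tolist():
        frames_seen[track_id] += 1
        # Crops from result.boxes.xyxy would be handed to the appearance /
        # shoe feature extractor for open-world re-identification downstream.

print(dict(frames_seen))
```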
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Robust Analysis of Multi-Task Learning Efficiency: New Benchmarks on + Light-Weighed Backbones and Effective Measurement of Multi-Task Learning + Challenges by Feature Disentanglement + + +
+ One of the main motivations of multi-task learning (MTL) is to develop neural networks capable of +inferring multiple tasks simultaneously. While countless methods have been +proposed in the past decade investigating robust model architectures and +efficient training algorithms, there is still a lack of understanding of how these +methods behave when applied to smaller feature extraction backbones, of the +generalizability of the commonly used fast approximation technique of replacing +parameter-level gradients with feature-level gradients, and of what the MTL challenges are and how one can efficiently and +effectively identify them. In this paper, we focus on the +aforementioned efficiency aspects of existing MTL methods. We first carry out +large-scale experiments with the methods on smaller backbones and on the +MetaGraspNet dataset as a new test ground. We also compare the existing methods +with and without using the fast gradient surrogate and empirically study the +generalizability of this technique. Lastly, we propose the Feature Disentanglement +measure as a novel and efficient identifier of the challenges in MTL, and +propose the Ranking Similarity score as an evaluation metric for different +identifiers to prove the faithfulness of our method. +
+
+
+
+
+ + ♻ ☆ Automated mapping of virtual environments with visual predictive coding + + +
+ Humans construct internal cognitive maps of their environment directly from +sensory inputs without access to a system of explicit coordinates or distance +measurements. While machine learning algorithms like SLAM utilize specialized +visual inference procedures to identify visual features and construct spatial +maps from visual and odometry data, the general nature of cognitive maps in the +brain suggests a unified mapping algorithmic strategy that can generalize to +auditory, tactile, and linguistic inputs. Here, we demonstrate that predictive +coding provides a natural and versatile neural network algorithm for +constructing spatial maps using sensory data. We introduce a framework in which +an agent navigates a virtual environment while engaging in visual predictive +coding using a self-attention-equipped convolutional neural network. While +learning a next image prediction task, the agent automatically constructs an +internal representation of the environment that quantitatively reflects +distances. The internal map enables the agent to pinpoint its location relative +to landmarks using only visual information. The predictive coding network +generates a vectorized encoding of the environment that supports vector +navigation where individual latent space units delineate localized, overlapping +neighborhoods in the environment. Broadly, our work introduces predictive +coding as a unified algorithmic framework for constructing cognitive maps that +can naturally extend to the mapping of auditory, sensorimotor, and linguistic +inputs. +
+
+
+
+
+ + ♻ ☆ Framework-agnostic Semantically-aware Global Reasoning for Segmentation WACV 2024 + + +
+ Recent advances in pixel-level tasks (e.g. segmentation) illustrate the +benefit of long-range interactions between aggregated region-based +representations that can enhance local features. However, such aggregated +representations, often in the form of attention, fail to model the underlying +semantics of the scene (e.g. individual objects and, by extension, their +interactions). In this work, we address the issue by proposing a component that +learns to project image features into latent representations and reason between +them using a transformer encoder to generate contextualized and +scene-consistent representations which are fused with original image features. +Our design encourages the latent regions to represent semantic concepts by +ensuring that the activated regions are spatially disjoint and the union of +such regions corresponds to a connected object segment. The proposed semantic +global reasoning (SGR) component is end-to-end trainable and can be easily +added to a wide variety of backbones (CNN or transformer-based) and +segmentation heads (per-pixel or mask classification) to consistently improve +the segmentation results on different datasets. In addition, our latent tokens +are semantically interpretable and diverse and provide a rich set of features +that can be transferred to downstream tasks like object detection and +segmentation, with improved performance. Furthermore, we also propose metrics +to quantify the semantics of latent tokens at both the class and instance levels. +
+
+ comment: Published in WACV 2024 +
+
+
+
+
+ + ♻ ☆ Training point-based deep learning networks for forest segmentation with + synthetic data ICPR + + +
+ Remote sensing through unmanned aerial systems (UAS) has been increasing in +forestry in recent years, along with using machine learning for data +processing. Deep learning architectures, extensively applied in natural +language and image processing, have recently been extended to the point cloud +domain. However, the availability of point cloud datasets for training and +testing remains limited. Creating forested environment point cloud datasets is +expensive, requires high-precision sensors, and is time-consuming as manual +point classification is required. Moreover, forest areas could be inaccessible +or dangerous for humans, further complicating data collection. Then, a question +arises whether it is possible to use synthetic data to train deep learning +networks without the need to rely on large volumes of real forest data. To +answer this question, we developed a realistic simulator that procedurally +generates synthetic forest scenes. Thanks to this, we have conducted a +comparative study of different state-of-the-art point-based deep learning +networks for forest segmentation. Using created datasets, we determined the +feasibility of using synthetic data to train deep learning networks to classify +point clouds from real forest datasets. Both the simulator and the datasets are +released as part of this work. + +
+
+ comment: 15 pages, 4 figures. Submitted to the International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Thrombectomy Recanalization from CT Imaging Using Deep + Learning Models + + +
+ For acute ischemic stroke (AIS) patients with large vessel occlusions, +clinicians must decide if the benefit of mechanical thrombectomy (MTB) +outweighs the risks and potential complications following an invasive +procedure. Pre-treatment computed tomography (CT) and angiography (CTA) are +widely used to characterize occlusions in the brain vasculature. If a patient +is deemed eligible, a modified treatment in cerebral ischemia (mTICI) score +will be used to grade how well blood flow is reestablished throughout and +following the MTB procedure. An estimation of the likelihood of successful +recanalization can support treatment decision-making. In this study, we +proposed a fully automated prediction of a patient's recanalization score using +pre-treatment CT and CTA imaging. We designed a spatial cross attention network +(SCANet) that utilizes vision transformers to localize to pertinent slices and +brain regions. Our top model achieved an average cross-validated ROC-AUC of +77.33 $\pm$ 3.9\%. This is a promising result that supports future applications +of deep learning on CT and CTA for the identification of eligible AIS patients +for MTB. + +
+
+ comment: Medical Imaging with Deep Learning 2022 accepted short paper Jun 2022 +
+
+
+
+
+ + ♻ ☆ Boomerang: Local sampling on image manifolds using diffusion models + + +
+ The inference stage of diffusion models can be seen as running a reverse-time +diffusion stochastic differential equation, where samples from a Gaussian +latent distribution are transformed into samples from a target distribution +that usually reside on a low-dimensional manifold, e.g., an image manifold. The +intermediate values between the initial latent space and the image manifold can +be interpreted as noisy images, with the amount of noise determined by the +forward diffusion process noise schedule. We utilize this interpretation to +present Boomerang, an approach for local sampling of image manifolds. As +implied by its name, Boomerang local sampling involves adding noise to an input +image, moving it closer to the latent space, and then mapping it back to the +image manifold through a partial reverse diffusion process. Thus, Boomerang +generates images on the manifold that are "similar," but nonidentical, to the +original input image. We can control the proximity of the generated images to +the original by adjusting the amount of noise added. Furthermore, due to the +stochastic nature of the reverse diffusion process in Boomerang, the generated +images display a certain degree of stochasticity, allowing us to obtain local +samples from the manifold without encountering any duplicates. Boomerang offers +the flexibility to work seamlessly with any pretrained diffusion model, such as +Stable Diffusion, without necessitating any adjustments to the reverse +diffusion process. We present three applications for Boomerang. First, we +provide a framework for constructing privacy-preserving datasets having +controllable degrees of anonymity. Second, we show that using Boomerang for +data augmentation increases generalization performance and outperforms +state-of-the-art synthetic data augmentation. Lastly, we introduce a perceptual +image enhancement framework, which enables resolution enhancement. +
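+ Because Boomerang works with any pretrained diffusion model without modifying the reverse process, a local sample can be approximated with an off-the-shelf Stable Diffusion img2img pipeline, where the strength argument plays the role of the added noise and therefore controls how far the generated image drifts from the input. The model id, prompt, and strength value below are placeholder assumptions, not the authors' configuration.

```python
# Hedged sketch of a Boomerang-style local sample via SD img2img.
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("input.png").convert("RGB").resize((512, 512))
local_sample = pipe(
    prompt="a photo",          # weak text conditioning
    image=init_image,
    strength=0.3,              # small strength => sample stays close to the input
    guidance_scale=5.0,
).images[0]
local_sample.save("boomerang_sample.png")
```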
+
+ comment: Published in Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ AffordanceLLM: Grounding Affordance from Vision Language Models + + +
+ Affordance grounding refers to the task of finding the area of an object with +which one can interact. It is a fundamental but challenging task, as a +successful solution requires the comprehensive understanding of a scene in +multiple aspects including detection, localization, and recognition of objects +with their parts, of geo-spatial configuration/layout of the scene, of 3D +shapes and physics, as well as of the functionality and potential interaction +of the objects and humans. Much of the knowledge is hidden and beyond the image +content with the supervised labels from a limited training set. In this paper, +we make an attempt to improve the generalization capability of the current +affordance grounding by taking the advantage of the rich world, abstract, and +human-object-interaction knowledge from pretrained large-scale vision language +models. Under the AGD20K benchmark, our proposed model demonstrates a +significant performance gain over the competing methods for in-the-wild object +affordance grounding. We further demonstrate it can ground affordance for +objects from random Internet images, even if both objects and actions are +unseen during training. Project site: https://jasonqsy.github.io/AffordanceLLM/ + +
+
+
+
+
+ + ♻ ☆ A Hybrid ANN-SNN Architecture for Low-Power and Low-Latency Visual + Perception + + +
+ Spiking Neural Networks (SNN) are a class of bio-inspired neural networks +that promise to bring low-power and low-latency inference to edge devices +through asynchronous and sparse processing. However, being temporal models, +SNNs depend heavily on expressive states to generate predictions on par with +classical artificial neural networks (ANNs). These states converge only after +long transient periods, and quickly decay without input data, leading to higher +latency, power consumption, and lower accuracy. This work addresses this issue +by initializing the state with an auxiliary ANN running at a low rate. The SNN +then uses the state to generate predictions with high temporal resolution until +the next initialization phase. Our hybrid ANN-SNN model thus combines the best +of both worlds: It does not suffer from long state transients and state decay +thanks to the ANN, and can generate predictions with high temporal resolution, +low latency, and low power thanks to the SNN. We show for the task of +event-based 2D and 3D human pose estimation that our method consumes 88% less +power with only a 4% decrease in performance compared to its fully ANN +counterparts when run at the same inference rate. Moreover, when compared to +SNNs, our method achieves a 74% lower error. This research thus provides a new +understanding of how ANNs and SNNs can be used to maximize their respective +benefits. + +
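+ The hybrid scheme described above can be summarized as a simple inference loop: a slow ANN pass periodically re-initializes the recurrent SNN state, and the SNN then produces high-rate predictions from event chunks in between. The module interfaces and the initialization period in the sketch below are illustrative assumptions, not the paper's implementation.

```python
# Hedged pseudo-loop for hybrid ANN-SNN inference (interfaces are assumed).
import torch

def hybrid_inference(ann, snn, frames, event_chunks, init_every=10):
    """frames[i] is assumed to arrive once every `init_every` event chunks."""
    state, outputs = None, []
    for t, events in enumerate(event_chunks):
        if t % init_every == 0:
            # Low-rate ANN pass re-initializes the recurrent SNN state.
            state = ann(frames[t // init_every])
        with torch.no_grad():
            pred, state = snn(events, state)   # sparse, low-latency SNN update
        outputs.append(pred)
    return outputs
```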
+
+
+
+
+ + ♻ ☆ Read Between the Layers: Leveraging Intra-Layer Representations for + Rehearsal-Free Continual Learning with Pre-Trained Models + + +
+ We address the Continual Learning (CL) problem, wherein a model must learn a +sequence of tasks from non-stationary distributions while preserving prior +knowledge upon encountering new experiences. With the advancement of foundation +models, CL research has pivoted from the initial learning-from-scratch paradigm +towards utilizing generic features from large-scale pre-training. However, +existing approaches to CL with pre-trained models primarily focus on separating +class-specific features from the final representation layer and neglect the +potential of intermediate representations to capture low- and mid-level +features, which are more invariant to domain shifts. In this work, we propose +LayUP, a new prototype-based approach to continual learning that leverages +second-order feature statistics from multiple intermediate layers of a +pre-trained network. Our method is conceptually simple, does not require access +to prior data, and works out of the box with any foundation model. LayUP +surpasses the state of the art in four of the seven class-incremental learning +benchmarks, all three domain-incremental learning benchmarks and in six of the +seven online continual learning benchmarks, while significantly reducing memory +and computational requirements compared to existing baselines. Our results +demonstrate that fully exhausting the representational capacities of +pre-trained models in CL goes well beyond their final embeddings. + +
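+ A hedged sketch of a prototype classifier built on second-order statistics of concatenated intermediate-layer features: accumulate a Gram matrix and class-wise feature sums as tasks arrive, then classify with a ridge-regularized solve. The ridge term and update rule are generic assumptions and not LayUP's exact formulation.

```python
# Hedged sketch of a second-order (Gram) prototype classifier over multi-layer features.
import torch

class LayerwisePrototypes:
    def __init__(self, feat_dim, num_classes, ridge=1e-3):
        self.G = torch.zeros(feat_dim, feat_dim)       # second-order feature statistics
        self.C = torch.zeros(feat_dim, num_classes)    # class-wise feature sums
        self.ridge = ridge

    def update(self, feats, labels):
        """feats: (N, D) concatenated intermediate-layer features; labels: (N,)."""
        self.G += feats.t() @ feats
        onehot = torch.nn.functional.one_hot(labels, self.C.shape[1]).float()
        self.C += feats.t() @ onehot

    def predict(self, feats):
        eye = torch.eye(self.G.shape[0])
        weights = torch.linalg.solve(self.G + self.ridge * eye, self.C)
        return (feats @ weights).argmax(dim=1)

clf = LayerwisePrototypes(feat_dim=256, num_classes=10)
clf.update(torch.randn(32, 256), torch.randint(0, 10, (32,)))
print(clf.predict(torch.randn(4, 256)))
```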
+
+ comment: Preprint under review +
+
+
+
+
+ + ♻ ☆ Sora: A Review on Background, Technology, Limitations, and Opportunities + of Large Vision Models + + +
+ Sora is a text-to-video generative AI model, released by OpenAI in February +2024. The model is trained to generate videos of realistic or imaginative +scenes from text instructions and show potential in simulating the physical +world. Based on public technical reports and reverse engineering, this paper +presents a comprehensive review of the model's background, related +technologies, applications, remaining challenges, and future directions of +text-to-video AI models. We first trace Sora's development and investigate the +underlying technologies used to build this "world simulator". Then, we describe +in detail the applications and potential impact of Sora in multiple industries +ranging from film-making and education to marketing. We discuss the main +challenges and limitations that need to be addressed to widely deploy Sora, +such as ensuring safe and unbiased video generation. Lastly, we discuss the +future development of Sora and video generation models in general, and how +advancements in the field could enable new ways of human-AI interaction, +boosting productivity and creativity of video generation. + +
+
+ comment: 37 pages, 18 figures; GitHub: + https://github.com/lichao-sun/SoraReview +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 177 + +
+
+
+ + ☆ COMBO: Compositional World Models for Embodied Multi-Agent Cooperation + + +
+ In this paper, we investigate the problem of embodied multi-agent +cooperation, where decentralized agents must cooperate given only partial +egocentric views of the world. To effectively plan in this setting, in contrast +to learning world dynamics in a single-agent scenario, we must simulate world +dynamics conditioned on an arbitrary number of agents' actions given only +partial egocentric visual observations of the world. To address this issue of +partial observability, we first train generative models to estimate the overall +world state given partial egocentric observations. To enable accurate +simulation of multiple sets of actions on this world state, we then propose to +learn a compositional world model for multi-agent cooperation by factorizing +the naturally composable joint actions of multiple agents and compositionally +generating the video. By leveraging this compositional world model, in +combination with Vision Language Models to infer the actions of other agents, +we can use a tree search procedure to integrate these modules and facilitate +online cooperative planning. To evaluate the efficacy of our methods, we create +two challenging embodied multi-agent long-horizon cooperation tasks using the +ThreeDWorld simulator and conduct experiments with 2-4 agents. The results show +our compositional world model is effective and the framework enables the +embodied agents to cooperate efficiently with different agents across various +tasks and an arbitrary number of agents, showing the promising future of our +proposed framework. More videos can be found at +https://vis-www.cs.umass.edu/combo/. + +
+
+ comment: 23 pages. The first three authors contributed equally +
+
+
+
+
+ + ☆ Gaussian Opacity Fields: Efficient and Compact Surface Reconstruction in + Unbounded Scenes + + +
+ Recently, 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view +synthesis results, while allowing the rendering of high-resolution images in +real-time. However, leveraging 3D Gaussians for surface reconstruction poses +significant challenges due to the explicit and disconnected nature of 3D +Gaussians. In this work, we present Gaussian Opacity Fields (GOF), a novel +approach for efficient, high-quality, and compact surface reconstruction in +unbounded scenes. Our GOF is derived from ray-tracing-based volume rendering of +3D Gaussians, enabling direct geometry extraction from 3D Gaussians by +identifying its levelset, without resorting to Poisson reconstruction or TSDF +fusion as in previous work. We approximate the surface normal of Gaussians as +the normal of the ray-Gaussian intersection plane, enabling the application of +regularization that significantly enhances geometry. Furthermore, we develop an +efficient geometry extraction method utilizing marching tetrahedra, where the +tetrahedral grids are induced from 3D Gaussians and thus adapt to the scene's +complexity. Our evaluations reveal that GOF surpasses existing 3DGS-based +methods in surface reconstruction and novel view synthesis. Further, it +compares favorably to, or even outperforms, neural implicit methods in both +quality and speed. + +
+
+ comment: Project page: + https://niujinshuchong.github.io/gaussian-opacity-fields +
+
+
+
+
+ + ☆ RapidVol: Rapid Reconstruction of 3D Ultrasound Volumes from Sensorless + 2D Scans + + +
+ Two-dimensional (2D) freehand ultrasonography is one of the most commonly
+used medical imaging modalities, particularly in obstetrics and gynaecology.
+However, it only captures 2D cross-sectional views of inherently 3D anatomies,
+losing valuable contextual information. As an alternative to requiring costly
+and complex 3D ultrasound scanners, 3D volumes can be constructed from 2D scans
+using machine learning. However, this usually requires a long computation time.
+Here, we propose RapidVol: a neural representation framework to speed up
+slice-to-volume ultrasound reconstruction. We use tensor-rank decomposition to
+decompose the typical 3D volume into sets of tri-planes, and store those,
+together with a small neural network, instead. A set of 2D ultrasound scans,
+with their ground-truth (or estimated) 3D position and orientation (pose), is
+all that is required to form a complete 3D reconstruction. Reconstructions are
+formed from real fetal brain scans, and then evaluated by requesting novel
+cross-sectional views. When compared to prior approaches based on fully
+implicit representations (e.g. neural radiance fields), our method is over 3x
+quicker, 46% more accurate, and more robust to inaccurate poses. Further
+speed-up is also possible by reconstructing from a structural prior rather
+than from scratch.
+
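+ To make the tri-plane factorization mentioned above concrete, here is a minimal
+sketch, under assumed resolutions and channel counts, of how three learnable 2D
+feature planes plus a small MLP can replace a dense 3D grid; the product-style
+combination of plane features is also an assumption, not the paper's exact design.
+import torch
+import torch.nn.functional as F
+
+R, C = 128, 16                                   # plane resolution and channels
+planes = torch.nn.ParameterList(
+    [torch.nn.Parameter(torch.randn(1, C, R, R) * 0.1) for _ in range(3)])
+decoder = torch.nn.Sequential(torch.nn.Linear(C, 64), torch.nn.ReLU(),
+                              torch.nn.Linear(64, 1))   # predicts voxel intensity
+
+def sample_volume(xyz):                           # xyz in [-1, 1], shape (N, 3)
+    pairs = [[0, 1], [0, 2], [1, 2]]              # XY, XZ, YZ projections
+    feat = None
+    for plane, pair in zip(planes, pairs):
+        grid = xyz[:, pair].view(1, -1, 1, 2)     # (1, N, 1, 2) sample locations
+        f = F.grid_sample(plane, grid, align_corners=True)   # (1, C, N, 1)
+        f = f.squeeze(0).squeeze(-1).T            # (N, C) per-plane features
+        feat = f if feat is None else feat * f    # assumed multiplicative fusion
+    return decoder(feat)                          # (N, 1) reconstructed intensity
+
+print(sample_volume(torch.rand(8, 3) * 2 - 1).shape)   # torch.Size([8, 1])
+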
+
+
+
+
+ + ☆ RefFusion: Reference Adapted Diffusion Models for 3D Scene Inpainting + + +
+ Neural reconstruction approaches are rapidly emerging as the preferred +representation for 3D scenes, but their limited editability is still posing a +challenge. In this work, we propose an approach for 3D scene inpainting -- the +task of coherently replacing parts of the reconstructed scene with desired +content. Scene inpainting is an inherently ill-posed task as there exist many +solutions that plausibly replace the missing content. A good inpainting method +should therefore not only enable high-quality synthesis but also a high degree +of control. Based on this observation, we focus on enabling explicit control +over the inpainted content and leverage a reference image as an efficient means +to achieve this goal. Specifically, we introduce RefFusion, a novel 3D +inpainting method based on a multi-scale personalization of an image inpainting +diffusion model to the given reference view. The personalization effectively +adapts the prior distribution to the target scene, resulting in a lower +variance of score distillation objective and hence significantly sharper +details. Our framework achieves state-of-the-art results for object removal +while maintaining high controllability. We further demonstrate the generality +of our formulation on other downstream tasks such as object insertion, scene +outpainting, and sparse view reconstruction. + +
+
+ comment: Project page: https://reffusion.github.io +
+
+
+
+
+ + ☆ LaDiC: Are Diffusion Models Really Inferior to Autoregressive + Counterparts for Image-to-Text Generation? + + +
+ Diffusion models have exhibited remarkable capabilities in text-to-image +generation. However, their performance in image-to-text generation, +specifically image captioning, has lagged behind Auto-Regressive (AR) models, +casting doubt on their applicability for such tasks. In this work, we revisit +diffusion models, highlighting their capacity for holistic context modeling and +parallel decoding. With these benefits, diffusion models can alleviate the +inherent limitations of AR methods, including their slow inference speed, error +propagation, and unidirectional constraints. Furthermore, we identify the prior +underperformance of diffusion models stemming from the absence of an effective +latent space for image-text alignment, and the discrepancy between continuous +diffusion processes and discrete textual data. In response, we introduce a +novel architecture, LaDiC, which utilizes a split BERT to create a dedicated +latent space for captions and integrates a regularization module to manage +varying text lengths. Our framework also includes a diffuser for semantic +image-to-text conversion and a Back&Refine technique to enhance token +interactivity during inference. LaDiC achieves state-of-the-art performance for +diffusion-based methods on the MS COCO dataset with 38.2 BLEU@4 and 126.2 +CIDEr, demonstrating exceptional performance without pre-training or ancillary +modules. This indicates strong competitiveness with AR models, revealing the +previously untapped potential of diffusion models in image-to-text generation. + +
+
+
+
+
+ + ☆ Learning Feature Inversion for Multi-class Anomaly Detection under + General-purpose COCO-AD Benchmark + + +
+ Anomaly detection (AD) is often focused on detecting anomaly areas for
+industrial quality inspection and medical lesion examination. However, due to
+the specific scenario targets, the data scale for AD is relatively small, and
+evaluation metrics are still deficient compared to classic vision tasks, such
+as object detection and semantic segmentation. To fill these gaps, this work
+first constructs a large-scale and general-purpose COCO-AD dataset by extending
+COCO to the AD field. This enables fair evaluation and sustainable development
+for different methods on this challenging benchmark. Moreover, current metrics
+such as AU-ROC have nearly reached saturation on simple datasets, which
+prevents a comprehensive evaluation of different methods. Inspired by the
+metrics in the segmentation field, we further propose several more practical
+threshold-dependent AD-specific metrics, i.e., m$F_1$$^{.2}_{.8}$,
+mAcc$^{.2}_{.8}$, mIoU$^{.2}_{.8}$, and mIoU-max. Motivated by GAN inversion's
+high-quality reconstruction capability, we propose a simple but more powerful
+InvAD framework to achieve high-quality feature reconstruction. Our method
+improves the effectiveness of reconstruction-based methods on the popular MVTec
+AD, VisA, and our newly proposed COCO-AD datasets under a multi-class
+unsupervised setting, where only a single detection model is trained to detect
+anomalies from different classes. Extensive ablation experiments have
+demonstrated the effectiveness of each component of our InvAD. Full codes and
+models are available at https://github.com/zhangzjn/ader.
+
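+ A hypothetical sketch of a threshold-averaged metric in the spirit of the
+m$F_1$$^{.2}_{.8}$-style scores mentioned above: rather than a single ROC-style
+number, a segmentation metric is averaged over binarisation thresholds between
+0.2 and 0.8. The 0.1 threshold step is an assumption, not the paper's definition.
+import numpy as np
+
+def f1_at_threshold(anomaly_map, gt_mask, thr):
+    pred = anomaly_map >= thr
+    tp = np.logical_and(pred, gt_mask).sum()
+    fp = np.logical_and(pred, ~gt_mask).sum()
+    fn = np.logical_and(~pred, gt_mask).sum()
+    return 2 * tp / max(2 * tp + fp + fn, 1)
+
+def mean_f1_02_08(anomaly_map, gt_mask, step=0.1):
+    thresholds = np.arange(0.2, 0.8 + 1e-9, step)
+    return float(np.mean([f1_at_threshold(anomaly_map, gt_mask, t)
+                          for t in thresholds]))
+
+scores = np.random.rand(256, 256)           # predicted anomaly probabilities
+mask = np.zeros((256, 256), dtype=bool)     # ground-truth anomaly mask
+mask[100:150, 100:150] = True
+print(mean_f1_02_08(scores, mask))
+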
+
+
+
+
+ + ☆ Watch Your Step: Optimal Retrieval for Continual Learning at Scale + + +
+ One of the most widely used approaches in continual learning is referred to +as replay. Replay methods support interleaved learning by storing past +experiences in a replay buffer. Although there are methods for selectively +constructing the buffer and reprocessing its contents, there is limited +exploration of the problem of selectively retrieving samples from the buffer. +Current solutions have been tested in limited settings and, more importantly, +in isolation. Existing work has also not explored the impact of duplicate +replays on performance. In this work, we propose a framework for evaluating +selective retrieval strategies, categorized by simple, independent class- and +sample-selective primitives. We evaluated several combinations of existing +strategies for selective retrieval and present their performances. Furthermore, +we propose a set of strategies to prevent duplicate replays and explore whether +new samples with low loss values can be learned without replay. In an effort to +match our problem setting to a realistic continual learning pipeline, we +restrict our experiments to a setting involving a large, pre-trained, open +vocabulary object detection model, which is fully fine-tuned on a sequence of +15 datasets. + +
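+ The following is an illustrative sketch (not the paper's code) of the two
+ingredients discussed above: a replay buffer with a pluggable, sample-selective
+retrieval strategy and a simple guard that avoids replaying the same stored
+sample in consecutive steps. The reservoir-style insertion and the loss-based
+strategy are assumptions for illustration.
+import random
+
+class ReplayBuffer:
+    def __init__(self, capacity=1000):
+        self.capacity = capacity
+        self.samples = []                # list of [x, y, last_loss]
+        self.recently_replayed = set()   # indices replayed in the previous step
+
+    def add(self, x, y, loss):
+        if len(self.samples) < self.capacity:
+            self.samples.append([x, y, loss])
+        else:                            # reservoir-style replacement (assumed)
+            self.samples[random.randrange(self.capacity)] = [x, y, loss]
+
+    def retrieve(self, k, strategy="high_loss"):
+        candidates = [i for i in range(len(self.samples))
+                      if i not in self.recently_replayed]    # no duplicate replays
+        if strategy == "high_loss":      # prefer samples the model struggles with
+            candidates.sort(key=lambda i: self.samples[i][2], reverse=True)
+            chosen = candidates[:k]
+        else:                            # uniform fallback
+            chosen = random.sample(candidates, min(k, len(candidates)))
+        self.recently_replayed = set(chosen)
+        return [self.samples[i][:2] for i in chosen]
+
+buf = ReplayBuffer()
+for step in range(50):
+    buf.add(x=step, y=step % 5, loss=random.random())
+print(buf.retrieve(4))
+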
+
+
+
+
+ + ☆ GazeHTA: End-to-end Gaze Target Detection with Head-Target Association + + +
+ We propose an end-to-end approach for gaze target detection: predicting a +head-target connection between individuals and the target image regions they +are looking at. Most of the existing methods use independent components such as +off-the-shelf head detectors or have problems in establishing associations +between heads and gaze targets. In contrast, we investigate an end-to-end +multi-person Gaze target detection framework with Heads and Targets Association +(GazeHTA), which predicts multiple head-target instances based solely on input +scene image. GazeHTA addresses challenges in gaze target detection by (1) +leveraging a pre-trained diffusion model to extract scene features for rich +semantic understanding, (2) re-injecting a head feature to enhance the head +priors for improved head understanding, and (3) learning a connection map as +the explicit visual associations between heads and gaze targets. Our extensive +experimental results demonstrate that GazeHTA outperforms state-of-the-art gaze +target detection methods and two adapted diffusion-based baselines on two +standard datasets. + +
+
+
+
+
+ + ☆ Mixed Prototype Consistency Learning for Semi-supervised Medical Image + Segmentation + + +
+ Recently, prototype learning has emerged in semi-supervised medical image +segmentation and achieved remarkable performance. However, the scarcity of +labeled data limits the expressiveness of prototypes in previous methods, +potentially hindering the complete representation of prototypes for class +embedding. To address this problem, we propose the Mixed Prototype Consistency +Learning (MPCL) framework, which includes a Mean Teacher and an auxiliary +network. The Mean Teacher generates prototypes for labeled and unlabeled data, +while the auxiliary network produces additional prototypes for mixed data +processed by CutMix. Through prototype fusion, mixed prototypes provide extra +semantic information to both labeled and unlabeled prototypes. High-quality +global prototypes for each class are formed by fusing two enhanced prototypes, +optimizing the distribution of hidden embeddings used in consistency learning. +Extensive experiments on the left atrium and type B aortic dissection datasets +demonstrate MPCL's superiority over previous state-of-the-art approaches, +confirming the effectiveness of our framework. The code will be released soon. + +
+
+ comment: 15 pages, 2 figures +
+
+
+
+
+ + ☆ MOWA: Multiple-in-One Image Warping Model + + +
+ While recent image warping approaches achieved remarkable success on existing +benchmarks, they still require training separate models for each specific task +and cannot generalize well to different camera models or customized +manipulations. To address diverse types of warping in practice, we propose a +Multiple-in-One image WArping model (named MOWA) in this work. Specifically, we +mitigate the difficulty of multi-task learning by disentangling the motion +estimation at both the region level and pixel level. To further enable dynamic +task-aware image warping, we introduce a lightweight point-based classifier +that predicts the task type, serving as prompts to modulate the feature maps +for better estimation. To our knowledge, this is the first work that solves +multiple practical warping tasks in one single model. Extensive experiments +demonstrate that our MOWA, which is trained on six tasks for multiple-in-one +image warping, outperforms state-of-the-art task-specific models across most +tasks. Moreover, MOWA also exhibits promising potential to generalize into +unseen scenes, as evidenced by cross-domain and zero-shot evaluations. The code +will be made publicly available. + +
+
+ comment: Project page: https://kangliao929.github.io/projects/mowa/ +
+
+
+
+
+ + ☆ AV-GAN: Attention-Based Varifocal Generative Adversarial Network for + Uneven Medical Image Translation + + +
+ Different types of staining highlight different structures in organs, thereby +assisting in diagnosis. However, due to the impossibility of repeated staining, +we cannot obtain different types of stained slides of the same tissue area. +Translating the slide that is easy to obtain (e.g., H&E) to slides of staining +types difficult to obtain (e.g., MT, PAS) is a promising way to solve this +problem. However, some regions are closely connected to other regions, and to +maintain this connection, they often have complex structures and are difficult +to translate, which may lead to wrong translations. In this paper, we propose +the Attention-Based Varifocal Generative Adversarial Network (AV-GAN), which +solves multiple problems in pathologic image translation tasks, such as uneven +translation difficulty in different regions, mutual interference of multiple +resolution information, and nuclear deformation. Specifically, we develop an +Attention-Based Key Region Selection Module, which can attend to regions with +higher translation difficulty. We then develop a Varifocal Module to translate +these regions at multiple resolutions. Experimental results show that our +proposed AV-GAN outperforms existing image translation methods with two virtual +kidney tissue staining tasks and improves FID values by 15.9 and 4.16 +respectively in the H&E-MT and H&E-PAS tasks. + +
+
+
+
+
+ + ☆ A Plausibility Study of Using Augmented Reality in the + Ventriculoperitoneal Shunt Operations + + +
+ The field of augmented reality (AR) has undergone substantial growth, finding
+diverse applications in the medical industry. This paper delves into various
+techniques employed in medical surgeries, scrutinizing factors such as cost,
+implementation, and accessibility. The focus of this exploration is on AR-based
+solutions, with a particular emphasis on addressing challenges and proposing an
+innovative solution for ventriculoperitoneal shunt (VP) operations. The
+proposed solution introduces a novel flow in the pre-surgery phase, aiming to
+substantially reduce setup time and operation duration by creating 3D models of
+the skull and ventricles. Experiments are conducted where the models are
+visualized on a 3D-printed skull through an AR device, specifically the
+Microsoft HoloLens 2. The paper then conducts an in-depth analysis of this
+proposed solution, discussing its feasibility, advantages, limitations, and
+future implications.
+
+
+ comment: Accepted for the 2024 - 16th International Conference on Knowledge + and Smart Technology (KST). To be published in IEEEXplore Digital Library + (#61284), ISBN: 979-8-3503-7073-7 +
+
+
+
+
+ + ☆ Dual Modalities of Text: Visual and Textual Generative Pre-training + + +
+ Harnessing visual texts represents a burgeoning frontier in the evolution of +language modeling. In this paper, we introduce a novel pre-training framework +for a suite of pixel-based autoregressive language models, pre-training on a +corpus of over 400 million documents rendered as RGB images. Our approach is +characterized by a dual-modality training regimen, engaging both visual data +through next patch prediction with a regression head and textual data via next +token prediction with a classification head. This study is particularly focused +on investigating the synergistic interplay between visual and textual +modalities of language. Our comprehensive evaluation across a diverse array of +benchmarks reveals that the confluence of visual and textual data substantially +augments the efficacy of pixel-based language models. Notably, our findings +show that a unidirectional pixel-based model, devoid of textual data during +training, can match the performance levels of advanced bidirectional +pixel-based models on various language understanding benchmarks. This work +highlights the considerable untapped potential of integrating visual and +textual information for language modeling purposes. We will release our code, +data, and checkpoints to inspire further research advancement. + +
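+ A minimal sketch, under assumed sizes, of the dual-modality training signal
+described above: one shared autoregressive trunk, a regression head for
+next-patch prediction on rendered text images, and a classification head for
+next-token prediction on ordinary text. The causal attention mask is omitted
+for brevity; all dimensions are illustrative assumptions.
+import torch
+import torch.nn as nn
+
+d_model, patch_dim, vocab = 256, 16 * 16 * 3, 1000    # assumed sizes
+trunk = nn.TransformerEncoder(
+    nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True), num_layers=2)
+patch_in = nn.Linear(patch_dim, d_model)
+token_in = nn.Embedding(vocab, d_model)
+patch_head = nn.Linear(d_model, patch_dim)             # regression head
+token_head = nn.Linear(d_model, vocab)                 # classification head
+
+def visual_loss(patches):                              # (B, T, patch_dim) RGB patches
+    h = trunk(patch_in(patches[:, :-1]))
+    return nn.functional.mse_loss(patch_head(h), patches[:, 1:])
+
+def textual_loss(tokens):                              # (B, T) integer token ids
+    h = trunk(token_in(tokens[:, :-1]))
+    logits = token_head(h)
+    return nn.functional.cross_entropy(
+        logits.reshape(-1, vocab), tokens[:, 1:].reshape(-1))
+
+loss = visual_loss(torch.randn(2, 8, patch_dim)) + textual_loss(
+    torch.randint(vocab, (2, 8)))
+loss.backward()
+print(float(loss))
+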
+
+
+
+
+ + ☆ Rawformer: Unpaired Raw-to-Raw Translation for Learnable Camera ISPs + + +
+ Modern smartphone camera quality heavily relies on the image signal processor +(ISP) to enhance captured raw images, utilizing carefully designed modules to +produce final output images encoded in a standard color space (e.g., sRGB). +Neural-based end-to-end learnable ISPs offer promising advancements, +potentially replacing traditional ISPs with their ability to adapt without +requiring extensive tuning for each new camera model, as is often the case for +nearly every module in traditional ISPs. However, the key challenge with the +recent learning-based ISPs is the urge to collect large paired datasets for +each distinct camera model due to the influence of intrinsic camera +characteristics on the formation of input raw images. This paper tackles this +challenge by introducing a novel method for unpaired learning of raw-to-raw +translation across diverse cameras. Specifically, we propose Rawformer, an +unsupervised Transformer-based encoder-decoder method for raw-to-raw +translation. It accurately maps raw images captured by a certain camera to the +target camera, facilitating the generalization of learnable ISPs to new unseen +cameras. Our method demonstrates superior performance on real camera datasets, +achieving higher accuracy compared to previous state-of-the-art techniques, and +preserving a more robust correlation between the original and translated raw +images. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ ECLAIR: A High-Fidelity Aerial LiDAR Dataset for Semantic Segmentation + + +
+ We introduce ECLAIR (Extended Classification of Lidar for AI Recognition), a +new outdoor large-scale aerial LiDAR dataset designed specifically for +advancing research in point cloud semantic segmentation. As the most extensive +and diverse collection of its kind to date, the dataset covers a total area of +10$km^2$ with close to 600 million points and features eleven distinct object +categories. To guarantee the dataset's quality and utility, we have thoroughly +curated the point labels through an internal team of experts, ensuring accuracy +and consistency in semantic labeling. The dataset is engineered to move forward +the fields of 3D urban modeling, scene understanding, and utility +infrastructure management by presenting new challenges and potential +applications. As a benchmark, we report qualitative and quantitative analysis +of a voxel-based point cloud segmentation approach based on the Minkowski +Engine. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ MathWriting: A Dataset For Handwritten Mathematical Expression + Recognition + + +
+ We introduce MathWriting, the largest online handwritten mathematical +expression dataset to date. It consists of 230k human-written samples and an +additional 400k synthetic ones. MathWriting can also be used for offline HME +recognition and is larger than all existing offline HME datasets like +IM2LATEX-100K. We introduce a benchmark based on MathWriting data in order to +advance research on both online and offline HME recognition. + +
+
+
+
+
+ + ☆ Efficient Conditional Diffusion Model with Probability Flow Sampling for + Image Super-resolution AAAI 2024 + + +
+ Image super-resolution is a fundamentally ill-posed problem because multiple +valid high-resolution images exist for one low-resolution image. +Super-resolution methods based on diffusion probabilistic models can deal with +the ill-posed nature by learning the distribution of high-resolution images +conditioned on low-resolution images, avoiding the problem of blurry images in +PSNR-oriented methods. However, existing diffusion-based super-resolution +methods have high time consumption with the use of iterative sampling, while +the quality and consistency of generated images are less than ideal due to +problems like color shifting. In this paper, we propose Efficient Conditional +Diffusion Model with Probability Flow Sampling (ECDP) for image +super-resolution. To reduce the time consumption, we design a continuous-time +conditional diffusion model for image super-resolution, which enables the use +of probability flow sampling for efficient generation. Additionally, to improve +the consistency of generated images, we propose a hybrid parametrization for +the denoiser network, which interpolates between the data-predicting +parametrization and the noise-predicting parametrization for different noise +scales. Moreover, we design an image quality loss as a complement to the score +matching loss of diffusion models, further improving the consistency and +quality of super-resolution. Extensive experiments on DIV2K, ImageNet, and +CelebA demonstrate that our method achieves higher super-resolution quality +than existing diffusion-based image super-resolution methods while having lower +time consumption. Our code is available at https://github.com/Yuan-Yutao/ECDP. + +
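+ Below is a hedged sketch of one simple way to express the hybrid parametrization
+described above: the network output is read both as a direct data prediction and
+as a noise prediction, and the two resulting clean-image estimates are blended
+with a noise-level-dependent weight. The toy network and the blending function
+c(sigma) are assumptions for illustration, not the paper's exact choices.
+import torch
+import torch.nn as nn
+
+net = nn.Sequential(nn.Conv2d(4, 32, 3, padding=1), nn.SiLU(),
+                    nn.Conv2d(32, 3, 3, padding=1))     # toy conditional denoiser
+
+def denoise(x_noisy, lowres_cond, sigma):
+    inp = torch.cat([x_noisy, lowres_cond], dim=1)       # condition on the LR image
+    out = net(inp)
+    x0_from_data = out                                   # data-predicting reading
+    x0_from_noise = x_noisy - sigma * out                # noise-predicting reading
+    c = sigma ** 2 / (sigma ** 2 + 1.0)                  # assumed blend weight
+    return c * x0_from_data + (1.0 - c) * x0_from_noise
+
+x = torch.randn(1, 3, 64, 64)
+cond = torch.randn(1, 1, 64, 64)
+print(denoise(x, cond, sigma=0.5).shape)
+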
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Generating Human Interaction Motions in Scenes with Text Control + + +
+ We present TeSMo, a method for text-controlled scene-aware motion generation +based on denoising diffusion models. Previous text-to-motion methods focus on +characters in isolation without considering scenes due to the limited +availability of datasets that include motion, text descriptions, and +interactive scenes. Our approach begins with pre-training a scene-agnostic +text-to-motion diffusion model, emphasizing goal-reaching constraints on +large-scale motion-capture datasets. We then enhance this model with a +scene-aware component, fine-tuned using data augmented with detailed scene +information, including ground plane and object shapes. To facilitate training, +we embed annotated navigation and interaction motions within scenes. The +proposed method produces realistic and diverse human-object interactions, such +as navigation and sitting, in different scenes with various object shapes, +orientations, initial body positions, and poses. Extensive experiments +demonstrate that our approach surpasses prior techniques in terms of the +plausibility of human-scene interactions, as well as the realism and variety of +the generated motions. Code will be released upon publication of this work at +https://research.nvidia.com/labs/toronto-ai/tesmo. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/toronto-ai/tesmo/ +
+
+
+
+
+ + ☆ StyleCity: Large-Scale 3D Urban Scenes Stylization with Vision-and-Text + Reference via Progressive Optimization + + +
+ Creating large-scale virtual urban scenes with variant styles is inherently +challenging. To facilitate prototypes of virtual production and bypass the need +for complex materials and lighting setups, we introduce the first +vision-and-text-driven texture stylization system for large-scale urban scenes, +StyleCity. Taking an image and text as references, StyleCity stylizes a 3D +textured mesh of a large-scale urban scene in a semantics-aware fashion and +generates a harmonic omnidirectional sky background. To achieve that, we +propose to stylize a neural texture field by transferring 2D vision-and-text +priors to 3D globally and locally. During 3D stylization, we progressively +scale the planned training views of the input 3D scene at different levels in +order to preserve high-quality scene content. We then optimize the scene style +globally by adapting the scale of the style image with the scale of the +training views. Moreover, we enhance local semantics consistency by the +semantics-aware style loss which is crucial for photo-realistic stylization. +Besides texture stylization, we further adopt a generative diffusion model to +synthesize a style-consistent omnidirectional sky image, which offers a more +immersive atmosphere and assists the semantic stylization process. The stylized +neural texture field can be baked into an arbitrary-resolution texture, +enabling seamless integration into conventional rendering pipelines and +significantly easing the virtual production prototyping process. Extensive +experiments demonstrate our stylized scenes' superiority in qualitative and +quantitative performance and user preferences. + +
+
+ comment: project page: https://chenyingshu.github.io/stylecity3d/ +
+
+
+
+
+ + ☆ VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time + + +
+ We introduce VASA, a framework for generating lifelike talking faces with +appealing visual affective skills (VAS) given a single static image and a +speech audio clip. Our premiere model, VASA-1, is capable of not only producing +lip movements that are exquisitely synchronized with the audio, but also +capturing a large spectrum of facial nuances and natural head motions that +contribute to the perception of authenticity and liveliness. The core +innovations include a holistic facial dynamics and head movement generation +model that works in a face latent space, and the development of such an +expressive and disentangled face latent space using videos. Through extensive +experiments including evaluation on a set of new metrics, we show that our +method significantly outperforms previous methods along various dimensions +comprehensively. Our method not only delivers high video quality with realistic +facial and head dynamics but also supports the online generation of 512x512 +videos at up to 40 FPS with negligible starting latency. It paves the way for +real-time engagements with lifelike avatars that emulate human conversational +behaviors. + +
+
+ comment: Tech Report. Project webpage: + https://www.microsoft.com/en-us/research/project/vasa-1/ +
+
+
+
+
+ + ☆ Assessing The Impact of CNN Auto Encoder-Based Image Denoising on Image + Classification Tasks + + +
+ Images captured from the real world are often affected by different types of +noise, which can significantly impact the performance of Computer Vision +systems and the quality of visual data. This study presents a novel approach +for defect detection in casting product noisy images, specifically focusing on +submersible pump impellers. The methodology involves utilizing deep learning +models such as VGG16, InceptionV3, and other models in both the spatial and +frequency domains to identify noise types and defect status. The research +process begins with preprocessing images, followed by applying denoising +techniques tailored to specific noise categories. The goal is to enhance the +accuracy and robustness of defect detection by integrating noise detection and +denoising into the classification pipeline. The study achieved remarkable +results using VGG16 for noise type classification in the frequency domain, +achieving an accuracy of over 99%. Removal of salt and pepper noise resulted in +an average SSIM of 87.9, while Gaussian noise removal had an average SSIM of +64.0, and periodic noise removal yielded an average SSIM of 81.6. This +comprehensive approach showcases the effectiveness of the deep AutoEncoder +model and median filter, for denoising strategies in real-world industrial +applications. Finally, our study reports significant improvements in binary +classification accuracy for defect detection compared to previous methods. For +the VGG16 classifier, accuracy increased from 94.6% to 97.0%, demonstrating the +effectiveness of the proposed noise detection and denoising approach. +Similarly, for the InceptionV3 classifier, accuracy improved from 84.7% to +90.0%, further validating the benefits of integrating noise analysis into the +classification pipeline. + +
+
+ comment: 13 pages, 13 figures, 13th International conference on innovative + technologies in the field of science, engineering and technology +
+
+
+
+
+ + ☆ Contextrast: Contextual Contrastive Learning for Semantic Segmentation + + +
+ Despite great improvements in semantic segmentation, challenges persist
+because of the lack of local/global contexts and the relationship between them.
+In this paper, we propose Contextrast, a contrastive learning-based semantic
+segmentation method that can capture local/global contexts and comprehend
+their relationships. Our proposed method comprises two parts: a) contextual
+contrastive learning (CCL) and b) boundary-aware negative (BANE) sampling.
+Contextual contrastive learning obtains local/global context from multi-scale
+feature aggregation and the inter/intra-relationships of features for better
+discrimination capabilities. Meanwhile, BANE sampling selects embedding
+features along the boundaries of incorrectly predicted regions to employ them
+as harder negative samples in our contrastive learning, resolving segmentation
+issues along the boundary region by exploiting fine-grained details. We
+demonstrate that our Contextrast substantially enhances the performance of
+semantic segmentation networks, outperforming state-of-the-art contrastive
+learning approaches on diverse public datasets, e.g., Cityscapes, CamVid,
+PASCAL-C, COCO-Stuff, and ADE20K, without an increase in computational cost
+during inference.
+
+
+
+
+
+ + ☆ Exploring selective image matching methods for zero-shot and few-sample + unsupervised domain adaptation of urban canopy prediction ICLR 2024 + + +
+ We explore simple methods for adapting a trained multi-task UNet which +predicts canopy cover and height to a new geographic setting using remotely +sensed data without the need of training a domain-adaptive classifier and +extensive fine-tuning. Extending previous research, we followed a selective +alignment process to identify similar images in the two geographical domains +and then tested an array of data-based unsupervised domain adaptation +approaches in a zero-shot setting as well as with a small amount of +fine-tuning. We find that the selective aligned data-based image matching +methods produce promising results in a zero-shot setting, and even more so with +a small amount of fine-tuning. These methods outperform both an untransformed +baseline and a popular data-based image-to-image translation model. The best +performing methods were pixel distribution adaptation and fourier domain +adaptation on the canopy cover and height tasks respectively. + +
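+ For reference, here is a sketch of the Fourier domain adaptation idea named
+above: the low-frequency amplitude spectrum of a source image is replaced with
+that of a target-domain image while the phase is kept, which transfers global
+style (illumination, colour statistics) without touching structure. The band
+size beta is an assumed hyper-parameter.
+import numpy as np
+
+def fourier_domain_adaptation(source, target, beta=0.05):
+    # source, target: float arrays of shape (H, W, C) in [0, 1]
+    fft_src = np.fft.fft2(source, axes=(0, 1))
+    fft_trg = np.fft.fft2(target, axes=(0, 1))
+    amp_src, pha_src = np.abs(fft_src), np.angle(fft_src)
+    amp_trg = np.abs(fft_trg)
+
+    amp_src = np.fft.fftshift(amp_src, axes=(0, 1))      # centre the low frequencies
+    amp_trg = np.fft.fftshift(amp_trg, axes=(0, 1))
+    h, w = source.shape[:2]
+    b = int(min(h, w) * beta)
+    cy, cx = h // 2, w // 2
+    amp_src[cy - b:cy + b, cx - b:cx + b] = amp_trg[cy - b:cy + b, cx - b:cx + b]
+    amp_src = np.fft.ifftshift(amp_src, axes=(0, 1))
+
+    adapted = np.fft.ifft2(amp_src * np.exp(1j * pha_src), axes=(0, 1))
+    return np.clip(np.real(adapted), 0.0, 1.0)
+
+src = np.random.rand(128, 128, 3)
+trg = np.random.rand(128, 128, 3)
+print(fourier_domain_adaptation(src, trg).shape)
+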
+
+ comment: ICLR 2024 Machine Learning for Remote Sensing (ML4RS) Workshop +
+
+
+
+
+ + ☆ Gaussian Splatting Decoder for 3D-aware Generative Adversarial Networks CVPR + + +
+ NeRF-based 3D-aware Generative Adversarial Networks (GANs) like EG3D or +GIRAFFE have shown very high rendering quality under large representational +variety. However, rendering with Neural Radiance Fields poses challenges for 3D +applications: First, the significant computational demands of NeRF rendering +preclude its use on low-power devices, such as mobiles and VR/AR headsets. +Second, implicit representations based on neural networks are difficult to +incorporate into explicit 3D scenes, such as VR environments or video games. 3D +Gaussian Splatting (3DGS) overcomes these limitations by providing an explicit +3D representation that can be rendered efficiently at high frame rates. In this +work, we present a novel approach that combines the high rendering quality of +NeRF-based 3D-aware GANs with the flexibility and computational advantages of +3DGS. By training a decoder that maps implicit NeRF representations to explicit +3D Gaussian Splatting attributes, we can integrate the representational +diversity and quality of 3D GANs into the ecosystem of 3D Gaussian Splatting +for the first time. Additionally, our approach allows for a high resolution GAN +inversion and real-time GAN editing with 3D Gaussian Splatting scenes. + +
+
+ comment: CVPRW +
+
+
+
+
+ + ☆ PyTorchGeoNodes: Enabling Differentiable Shape Programs for 3D Shape + Reconstruction + + +
+ We propose PyTorchGeoNodes, a differentiable module for reconstructing 3D +objects from images using interpretable shape programs. In comparison to +traditional CAD model retrieval methods, the use of shape programs for 3D +reconstruction allows for reasoning about the semantic properties of +reconstructed objects, editing, low memory footprint, etc. However, the +utilization of shape programs for 3D scene understanding has been largely +neglected in past works. As our main contribution, we enable gradient-based +optimization by introducing a module that translates shape programs designed in +Blender, for example, into efficient PyTorch code. We also provide a method +that relies on PyTorchGeoNodes and is inspired by Monte Carlo Tree Search +(MCTS) to jointly optimize discrete and continuous parameters of shape programs +and reconstruct 3D objects for input scenes. In our experiments, we apply our +algorithm to reconstruct 3D objects in the ScanNet dataset and evaluate our +results against CAD model retrieval-based reconstructions. Our experiments +indicate that our reconstructions match well the input scenes while enabling +semantic reasoning about reconstructed objects. + +
+
+ comment: In Submission +
+
+
+
+
+ + ☆ Private Attribute Inference from Images with Vision-Language Models + + +
+ As large language models (LLMs) become ubiquitous in our daily tasks and +digital interactions, associated privacy risks are increasingly in focus. While +LLM privacy research has primarily focused on the leakage of model training +data, it has recently been shown that the increase in models' capabilities has +enabled LLMs to make accurate privacy-infringing inferences from previously +unseen texts. With the rise of multimodal vision-language models (VLMs), +capable of understanding both images and text, a pertinent question is whether +such results transfer to the previously unexplored domain of benign images +posted online. To investigate the risks associated with the image reasoning +capabilities of newly emerging VLMs, we compile an image dataset with +human-annotated labels of the image owner's personal attributes. In order to +understand the additional privacy risk posed by VLMs beyond traditional human +attribute recognition, our dataset consists of images where the inferable +private attributes do not stem from direct depictions of humans. On this +dataset, we evaluate the inferential capabilities of 7 state-of-the-art VLMs, +finding that they can infer various personal attributes at up to 77.6% +accuracy. Concerningly, we observe that accuracy scales with the general +capabilities of the models, implying that future models can be misused as +stronger adversaries, establishing an imperative for the development of +adequate defenses. + +
+
+
+
+
+ + ☆ Enhancing 3D Fidelity of Text-to-3D using Cross-View Correspondences CVPR 2024 + + +
+ Leveraging multi-view diffusion models as priors for 3D optimization has
+alleviated the problem of 3D consistency, e.g., the Janus face problem or the
+content drift problem, in zero-shot text-to-3D models. However, the 3D
+geometric fidelity of the output remains an unresolved issue; although the
+rendered 2D views are realistic, the underlying geometry may contain errors
+such as unreasonable concavities. In this work, we propose CorrespondentDream,
+an effective method to leverage annotation-free, cross-view correspondences
+yielded from the diffusion U-Net to provide an additional 3D prior to the NeRF
+optimization process. We find that these correspondences are strongly
+consistent with human perception, and by adopting them in our loss design, we
+are able to produce NeRF models with geometries that are more coherent with
+common sense, e.g., smoother object surfaces, yielding higher 3D fidelity. We
+demonstrate the efficacy of our approach through various comparative
+qualitative results and a solid user study.
+
+
+ comment: 25 pages, 22 figures, accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Intra-operative tumour margin evaluation in breast-conserving surgery + with deep learning + + +
+ A positive margin may result in an increased risk of local recurrence after
+breast-conserving surgery for any malignant tumour. Reducing the number of
+positive margins requires offering the surgeon real-time intra-operative
+information on the presence of positive resection margins. This study aims to
+design an intra-operative tumour margin evaluation scheme using specimen
+mammography in breast-conserving surgery. A total of 30 cases were evaluated
+and compared with contours manually determined by experienced physicians and
+with the pathology reports. The proposed method utilizes image thresholding to
+extract regions of interest and then applies a deep learning model, SegNet, to
+segment the tumour tissue; the width of the margin of normal tissue
+surrounding the tumour is evaluated as the result. The desired margin around
+the tumour was set to 10 mm, and the smallest average difference to the
+manually sketched margin was 6.53 mm +- 5.84. In all cases, the SegNet
+architecture was used to obtain the tissue specimen boundary and the tumour
+contour, respectively. The results indicate that this technology is helpful
+for discriminating positive from negative margins in the intra-operative
+setting and that the proposed scheme is a potential component of an
+intra-operative measurement system. The experimental results reveal that deep
+learning techniques can produce results that are consistent with pathology
+reports.
+
+
+ comment: 1 pages, 6 figures and 2 tables +
+
+
+
+
+ + ☆ Automated Evaluation of Large Vision-Language Models on Self-driving + Corner Cases + + +
+ Large Vision-Language Models (LVLMs), due to the remarkable visual reasoning +ability to understand images and videos, have received widespread attention in +the autonomous driving domain, which significantly advances the development of +interpretable end-to-end autonomous driving. However, current evaluations of +LVLMs primarily focus on the multi-faceted capabilities in common scenarios, +lacking quantifiable and automated assessment in autonomous driving contexts, +let alone severe road corner cases that even the state-of-the-art autonomous +driving perception systems struggle to handle. In this paper, we propose +CODA-LM, a novel vision-language benchmark for self-driving, which provides the +first automatic and quantitative evaluation of LVLMs for interpretable +autonomous driving including general perception, regional perception, and +driving suggestions. CODA-LM utilizes the texts to describe the road images, +exploiting powerful text-only large language models (LLMs) without image inputs +to assess the capabilities of LVLMs in autonomous driving scenarios, which +reveals stronger alignment with human preferences than LVLM judges. Experiments +demonstrate that even the closed-sourced commercial LVLMs like GPT-4V cannot +deal with road corner cases well, suggesting that we are still far from a +strong LVLM-powered intelligent driving agent, and we hope our CODA-LM can +become the catalyst to promote future development. + +
+
+ comment: Project Page: https://coda-dataset.github.io/coda-lm/ +
+
+
+
+
+ + ☆ Do Counterfactual Examples Complicate Adversarial Training? CVPR'24 + + +
+ We leverage diffusion models to study the robustness-performance tradeoff of +robust classifiers. Our approach introduces a simple, pretrained diffusion +method to generate low-norm counterfactual examples (CEs): semantically altered +data which results in different true class membership. We report that the +confidence and accuracy of robust models on their clean training data are +associated with the proximity of the data to their CEs. Moreover, robust models +perform very poorly when evaluated on the CEs directly, as they become +increasingly invariant to the low-norm, semantic changes brought by CEs. The +results indicate a significant overlap between non-robust and semantic +features, countering the common assumption that non-robust features are not +interpretable. + +
+
+ comment: Accepted as a short paper to the GCV Workshop at CVPR'24 +
+
+
+
+
+ + ☆ ReWiTe: Realistic Wide-angle and Telephoto Dual Camera Fusion Dataset + via Beam Splitter Camera Rig + + +
+ The fusion of images from dual camera systems featuring a wide-angle and a
+telephoto camera has become a hotspot problem recently. By integrating
+simultaneously captured wide-angle and telephoto images from these systems, the
+resulting fused image achieves a wide field of view (FOV) coupled with
+high-definition quality. Existing approaches are mostly deep learning methods
+and predominantly rely on supervised learning, where the training dataset plays
+a pivotal role. However, current datasets typically adopt a data synthesis
+approach to generate input pairs of wide-angle and telephoto images alongside
+ground-truth images. Notably, the wide-angle inputs are synthesized rather than
+captured using real wide-angle cameras, and the ground-truth image is captured
+by a wide-angle camera whose quality is substantially lower than that of the
+input telephoto images captured by telephoto cameras. To address these
+limitations, we introduce a novel hardware setup utilizing a beam splitter to
+simultaneously capture three images, i.e., the input pair and the ground-truth
+image, from two authentic cellphones equipped with wide-angle and telephoto
+dual cameras. Specifically, the wide-angle and telephoto images captured by
+cellphone 2 serve as the input pair, while the telephoto image captured by
+cellphone 1, which is calibrated to match the optical path of the wide-angle
+image from cellphone 2, serves as the ground-truth image, maintaining quality
+on par with the input telephoto image. Experiments validate that our newly
+introduced dataset, named ReWiTe, significantly enhances the performance of
+various existing methods for real-world wide-angle and telephoto dual image
+fusion tasks.
+
+
+
+
+
+ + ☆ EMC$^2$: Efficient MCMC Negative Sampling for Contrastive Learning with + Global Convergence + + +
+ A key challenge in contrastive learning is to generate negative samples from +a large sample set to contrast with positive samples, for learning better +encoding of the data. These negative samples often follow a softmax +distribution which are dynamically updated during the training process. +However, sampling from this distribution is non-trivial due to the high +computational costs in computing the partition function. In this paper, we +propose an Efficient Markov Chain Monte Carlo negative sampling method for +Contrastive learning (EMC$^2$). We follow the global contrastive learning loss +as introduced in SogCLR, and propose EMC$^2$ which utilizes an adaptive +Metropolis-Hastings subroutine to generate hardness-aware negative samples in +an online fashion during the optimization. We prove that EMC$^2$ finds an +$\mathcal{O}(1/\sqrt{T})$-stationary point of the global contrastive loss in +$T$ iterations. Compared to prior works, EMC$^2$ is the first algorithm that +exhibits global convergence (to stationarity) regardless of the choice of batch +size while exhibiting low computation and memory cost. Numerical experiments +validate that EMC$^2$ is effective with small batch training and achieves +comparable or better performance than baseline algorithms. We report the +results for pre-training image encoders on STL-10 and Imagenet-100. + +
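+ A hedged sketch of the core idea above: a Metropolis-Hastings chain over
+candidate negatives whose stationary distribution is the softmax of similarity
+scores, so "hard" negatives are sampled without ever computing the partition
+function. The temperature, chain length, and uniform proposal are assumptions,
+not the paper's exact algorithm.
+import torch
+
+def mh_negative_sampling(anchor, bank, num_steps=20, tau=0.1):
+    # anchor: (d,) embedding; bank: (N, d) candidate negative embeddings
+    scores = bank @ anchor / tau                      # unnormalised log-weights
+    idx = torch.randint(len(bank), (1,)).item()       # initial chain state
+    for _ in range(num_steps):
+        proposal = torch.randint(len(bank), (1,)).item()
+        # Accept with prob min(1, w(proposal)/w(current)); the partition cancels.
+        accept_logprob = (scores[proposal] - scores[idx]).item()
+        if torch.rand(1).log().item() < accept_logprob:
+            idx = proposal
+    return idx
+
+torch.manual_seed(0)
+anchor = torch.nn.functional.normalize(torch.randn(128), dim=0)
+bank = torch.nn.functional.normalize(torch.randn(1000, 128), dim=1)
+hard_negative = bank[mh_negative_sampling(anchor, bank)]
+print(hard_negative.shape)
+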
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Uncertainty-guided Open-Set Source-Free Unsupervised Domain Adaptation + with Target-private Class Segregation + + +
+ Standard Unsupervised Domain Adaptation (UDA) aims to transfer knowledge from +a labeled source domain to an unlabeled target but usually requires +simultaneous access to both source and target data. Moreover, UDA approaches +commonly assume that source and target domains share the same labels space. +Yet, these two assumptions are hardly satisfied in real-world scenarios. This +paper considers the more challenging Source-Free Open-set Domain Adaptation +(SF-OSDA) setting, where both assumptions are dropped. We propose a novel +approach for SF-OSDA that exploits the granularity of target-private categories +by segregating their samples into multiple unknown classes. Starting from an +initial clustering-based assignment, our method progressively improves the +segregation of target-private samples by refining their pseudo-labels with the +guide of an uncertainty-based sample selection module. Additionally, we propose +a novel contrastive loss, named NL-InfoNCELoss, that, integrating negative +learning into self-supervised contrastive learning, enhances the model +robustness to noisy pseudo-labels. Extensive experiments on benchmark datasets +demonstrate the superiority of the proposed method over existing approaches, +establishing new state-of-the-art performance. Notably, additional analyses +show that our method is able to learn the underlying semantics of novel +classes, opening the possibility to perform novel class discovery. + +
+
+
+
+
+ + ☆ Label merge-and-split: A graph-colouring approach for memory-efficient + brain parcellation + + +
+ Whole brain parcellation requires inferring hundreds of segmentation labels +in large image volumes and thus presents significant practical challenges for +deep learning approaches. We introduce label merge-and-split, a method that +first greatly reduces the effective number of labels required for +learning-based whole brain parcellation and then recovers original labels. +Using a greedy graph colouring algorithm, our method automatically groups and +merges multiple spatially separate labels prior to model training and +inference. The merged labels may be semantically unrelated. A deep learning +model is trained to predict merged labels. At inference time, original labels +are restored using atlas-based influence regions. In our experiments, the +proposed approach reduces the number of labels by up to 68% while achieving +segmentation accuracy comparable to the baseline method without label merging +and splitting. Moreover, model training and inference times as well as GPU +memory requirements were reduced significantly. The proposed method can be +applied to all semantic segmentation tasks with a large number of spatially +separate classes within an atlas-based prior. + +
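+ The following is a minimal sketch of the merge step described above: labels
+whose atlas regions come spatially close are connected in a graph, and a greedy
+colouring assigns each label a merged id so that labels sharing a colour never
+touch. The toy 2D "atlas" and the dilation-based adjacency test are illustrative
+assumptions; at inference, the original labels would be restored from atlas
+influence regions as the abstract describes.
+import numpy as np
+from scipy.ndimage import binary_dilation
+
+def build_adjacency(atlas, labels, margin=2):
+    adj = {l: set() for l in labels}
+    for l in labels:
+        grown = binary_dilation(atlas == l, iterations=margin)
+        for m in labels:
+            if m != l and np.any(grown & (atlas == m)):
+                adj[l].add(m)
+    return adj
+
+def greedy_colouring(adj):
+    colour = {}
+    for l in sorted(adj, key=lambda k: -len(adj[k])):    # high-degree labels first
+        used = {colour[n] for n in adj[l] if n in colour}
+        colour[l] = next(c for c in range(len(adj)) if c not in used)
+    return colour                                        # label -> merged label id
+
+atlas = np.zeros((64, 64), dtype=int)                    # toy 2D stand-in for an atlas
+atlas[5:20, 5:20], atlas[5:20, 40:60], atlas[40:60, 5:20] = 1, 2, 3
+merged = greedy_colouring(build_adjacency(atlas, labels=[1, 2, 3]))
+print(merged)      # spatially separate labels can share a merged id
+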
+
+
+
+
+ + ☆ CMU-Flownet: Exploring Point Cloud Scene Flow Estimation in Occluded + Scenario + + +
+ Occlusions hinder point cloud frame alignment in LiDAR data, a challenge
+inadequately addressed by scene flow models tested mainly on occlusion-free
+datasets. Attempts to integrate occlusion handling within networks often suffer
+accuracy issues due to two main limitations: a) the inadequate use of occlusion
+information, often merging it with flow estimation without an effective
+integration strategy, and b) reliance on distance-weighted upsampling that
+falls short in correcting occlusion-related errors. To address these
+challenges, we introduce the Correlation Matrix Upsampling Flownet
+(CMU-Flownet), incorporating an occlusion estimation module within its cost
+volume layer, alongside an Occlusion-aware Cost Volume (OCV) mechanism.
+Specifically, we propose an enhanced upsampling approach that expands the
+sensory field of the sampling process and integrates a Correlation Matrix
+designed to evaluate point-level similarity. Meanwhile, our model robustly
+integrates occlusion data within the context of scene flow, deploying this
+information strategically during the refinement phase of the flow estimation.
+The efficacy of this approach is demonstrated through subsequent experimental
+validation. Empirical assessments reveal that CMU-Flownet establishes
+state-of-the-art performance on the occluded FlyingThings3D and KITTI
+datasets, surpassing previous methodologies across a majority of evaluated
+metrics.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Classification of Prostate Cancer in 3D Magnetic Resonance Imaging Data + based on Convolutional Neural Networks + + +
+ Prostate cancer is a commonly diagnosed cancerous disease among men +world-wide. Even with modern technology such as multi-parametric magnetic +resonance tomography and guided biopsies, the process for diagnosing prostate +cancer remains time consuming and requires highly trained professionals. In +this paper, different convolutional neural networks (CNN) are evaluated on +their abilities to reliably classify whether an MRI sequence contains malignant +lesions. Implementations of a ResNet, a ConvNet and a ConvNeXt for 3D image +data are trained and evaluated. The models are trained using different data +augmentation techniques, learning rates, and optimizers. The data is taken from +a private dataset, provided by Cantonal Hospital Aarau. The best result was +achieved by a ResNet3D, yielding an average precision score of 0.4583 and AUC +ROC score of 0.6214. + +
+
+ comment: Previous version published in Buzug T.M., Handels H., M\"uller S., + H\"ubner C., Mertins A., Rostalski P.: Student Conference Proceedings 2023, + Infinite Science Publishing, 2023 (ISBN/EAN 978-3-945954-72-0). 7 pages, 2 + figures +
+
+
+
+
+ + ☆ SPVLoc: Semantic Panoramic Viewport Matching for 6D Camera Localization + in Unseen Environments + + +
+ In this paper, we present SPVLoc, a global indoor localization method that +accurately determines the six-dimensional (6D) camera pose of a query image and +requires minimal scene-specific prior knowledge and no scene-specific training. +Our approach employs a novel matching procedure to localize the perspective +camera's viewport, given as an RGB image, within a set of panoramic semantic +layout representations of the indoor environment. The panoramas are rendered +from an untextured 3D reference model, which only comprises approximate +structural information about room shapes, along with door and window +annotations. We demonstrate that a straightforward convolutional network +structure can successfully achieve image-to-panorama and ultimately +image-to-model matching. Through a viewport classification score, we rank +reference panoramas and select the best match for the query image. Then, a 6D +relative pose is estimated between the chosen panorama and query image. Our +experiments demonstrate that this approach not only efficiently bridges the +domain gap but also generalizes well to previously unseen scenes that are not +part of the training data. Moreover, it achieves superior localization accuracy +compared to the state of the art methods and also estimates more degrees of +freedom of the camera pose. We will make our source code publicly available at +https://github.com/fraunhoferhhi/spvloc . + +
+
+ comment: This submission includes the paper and supplementary material. 24 + pages, 11 figures +
+
+
+
+
+ + ☆ MobileNetV4 - Universal Models for the Mobile Ecosystem + + +
+ We present the latest generation of MobileNets, known as MobileNetV4 (MNv4), +featuring universally efficient architecture designs for mobile devices. At its +core, we introduce the Universal Inverted Bottleneck (UIB) search block, a +unified and flexible structure that merges Inverted Bottleneck (IB), ConvNext, +Feed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant. +Alongside UIB, we present Mobile MQA, an attention block tailored for mobile +accelerators, delivering a significant 39% speedup. An optimized neural +architecture search (NAS) recipe is also introduced which improves MNv4 search +effectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe +results in a new suite of MNv4 models that are mostly Pareto optimal across +mobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural +Engine and Google Pixel EdgeTPU - a characteristic not found in any other +models tested. Finally, to further boost accuracy, we introduce a novel +distillation technique. Enhanced by this technique, our MNv4-Hybrid-Large model +delivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just +3.8ms. + +
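+ To make the "UIB merges IB, ConvNext, FFN and an Extra Depthwise variant"
+sentence above concrete, here is a rough, unofficial sketch of an
+inverted-bottleneck-style block with optional depthwise convolutions; channel
+sizes, ordering, normalisation, and activation are assumptions, not the
+released MNv4 definition.
+import torch
+import torch.nn as nn
+
+class UIBSketch(nn.Module):
+    def __init__(self, c_in, c_out, expand=4, start_dw=True, mid_dw=True):
+        super().__init__()
+        c_mid = c_in * expand
+        layers = []
+        if start_dw:                     # ConvNext-like leading depthwise conv
+            layers += [nn.Conv2d(c_in, c_in, 3, padding=1, groups=c_in),
+                       nn.BatchNorm2d(c_in)]
+        layers += [nn.Conv2d(c_in, c_mid, 1), nn.BatchNorm2d(c_mid), nn.ReLU()]
+        if mid_dw:                       # classic IB / ExtraDW-style depthwise conv
+            layers += [nn.Conv2d(c_mid, c_mid, 3, padding=1, groups=c_mid),
+                       nn.BatchNorm2d(c_mid), nn.ReLU()]
+        layers += [nn.Conv2d(c_mid, c_out, 1), nn.BatchNorm2d(c_out)]
+        self.block = nn.Sequential(*layers)
+        self.skip = (c_in == c_out)
+
+    def forward(self, x):
+        out = self.block(x)
+        return x + out if self.skip else out
+
+# With both flags off the block degenerates to an FFN-style 1x1 MLP; toggling
+# them recovers IB- and ConvNext-like variants, which is the kind of flexibility
+# a unified search block can expose to NAS.
+print(UIBSketch(32, 32)(torch.randn(1, 32, 56, 56)).shape)
+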
+
+
+
+
+ + ☆ Self-Supervised Visual Preference Alignment + + +
+ This paper makes the first attempt towards unsupervised preference alignment
+in Vision-Language Models (VLMs). We generate chosen and rejected responses
+with regard to the original and augmented image pairs, and conduct preference
+alignment with direct preference optimization. It is based on a core idea:
+properly designed augmentation to the image input will induce the VLM to
+generate false but hard negative responses, which helps the model to learn from
+and produce more robust and powerful answers. The whole pipeline no longer
+hinges on supervision from GPT4 or human involvement during alignment, and is
+highly efficient with few lines of code. With only 8k randomly sampled
+unsupervised data, it achieves a 90\% relative score to GPT-4 on complex
+reasoning in LLaVA-Bench, and improves LLaVA-7B/13B by 6.7\%/5.6\% score on the
+complex multi-modal benchmark MM-Vet. Visualizations show its improved ability
+to align with user intentions. A series of ablations is conducted to reveal the
+latent mechanism of the approach, which also indicates its potential towards
+further scaling. Code will be available.
+
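+ For context, a minimal sketch of the direct preference optimization objective
+that such a pipeline builds on, written for pre-computed sequence
+log-probabilities. In the setting described above, the "chosen" response would
+come from the original image and the "rejected" one from a heavily augmented
+image; the temperature beta is an assumed value.
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected,
+             beta=0.1):
+    # Each argument: (B,) summed log-probabilities of the full response.
+    chosen_reward = beta * (logp_chosen - ref_logp_chosen)
+    rejected_reward = beta * (logp_rejected - ref_logp_rejected)
+    return -F.logsigmoid(chosen_reward - rejected_reward).mean()
+
+# Toy check: the loss is lower when the policy prefers the chosen response.
+ref_c, ref_r = torch.tensor([-40.0]), torch.tensor([-42.0])
+print(dpo_loss(torch.tensor([-38.0]), torch.tensor([-45.0]), ref_c, ref_r))
+print(dpo_loss(torch.tensor([-44.0]), torch.tensor([-41.0]), ref_c, ref_r))
+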
+
+
+
+
+ + ☆ Robust Noisy Label Learning via Two-Stream Sample Distillation + + +
+ Noisy label learning aims to learn robust networks under the supervision of +noisy labels, which plays a critical role in deep learning. Existing work +either conducts sample selection or label correction to deal with noisy labels +during the model training process. In this paper, we design a simple yet +effective sample selection framework, termed Two-Stream Sample Distillation +(TSSD), for noisy label learning, which can extract more high-quality samples +with clean labels to improve the robustness of network training. Firstly, a +novel Parallel Sample Division (PSD) module is designed to generate a certain +training set with sufficient reliable positive and negative samples by jointly +considering the sample structure in feature space and the human prior in loss +space. Secondly, a novel Meta Sample Purification (MSP) module is further +designed to mine adequate semi-hard samples from the remaining uncertain +training set by learning a strong meta classifier with extra golden data. As a +result, more and more high-quality samples will be distilled from the noisy +training set to train networks robustly in every iteration. Extensive +experiments on four benchmark datasets, including CIFAR-10, CIFAR-100, +Tiny-ImageNet, and Clothing-1M, show that our method has achieved +state-of-the-art results over its competitors. + +
+
+
+
+
+ + ☆ LAECIPS: Large Vision Model Assisted Adaptive Edge-Cloud Collaboration + for IoT-based Perception System + + +
+ Recent large vision models (e.g., SAM) hold great potential to facilitate intelligent perception with high accuracy. Yet, the resource constraints of the IoT environment tend to prevent such large vision models from being deployed locally, incurring considerable inference latency and thereby making it difficult to support real-time applications such as autonomous driving and robotics. Edge-cloud collaboration with large-small model co-inference offers a promising approach to achieving high inference accuracy and low latency. However, existing edge-cloud collaboration methods are tightly coupled with the model architecture and cannot adapt to the dynamic data drifts of heterogeneous IoT environments. To address these issues, we propose LAECIPS, a new edge-cloud collaboration framework. In LAECIPS, both the large vision model on the cloud and the lightweight model on the edge are plug-and-play. We design an edge-cloud collaboration strategy based on hard input mining, optimized for both high accuracy and low latency. We propose to update the edge model and its collaboration strategy with the cloud under the supervision of the large vision model, so as to adapt to dynamic IoT data streams. Theoretical analysis of LAECIPS proves its feasibility. Experiments conducted in a robotic semantic segmentation system using real-world datasets show that LAECIPS outperforms its state-of-the-art competitors in accuracy, latency, and communication overhead, while adapting better to dynamic environments.
+
+
+
+
+ + ☆ Teaching Chinese Sign Language with Feedback in Mixed Reality + + +
+ Traditional sign language teaching methods face challenges such as limited feedback and diverse learning scenarios. 2D resources lack real-time feedback, classroom teaching is constrained by a scarcity of teachers, and methods based on VR and AR offer relatively primitive interaction feedback mechanisms. This study proposes an innovative teaching model that uses real-time monocular vision and mixed reality technology. First, we introduce an improved hand-posture reconstruction method to achieve sign language semantic retention and real-time feedback. Second, a ternary system evaluation algorithm is proposed for comprehensive assessment, maintaining good consistency with sign language experts. Furthermore, we use mixed reality technology to construct a scenario-based 3D sign language classroom and explore the user experience of scenario teaching. Overall, this paper presents a novel teaching method that provides an immersive learning experience, advanced posture reconstruction, and precise feedback, achieving positive results on user experience and learning effectiveness.
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ AbsGS: Recovering Fine Details for 3D Gaussian Splatting + + +
+ The 3D Gaussian Splatting (3D-GS) technique couples 3D Gaussian primitives with differentiable rasterization to achieve high-quality novel view synthesis while providing advanced real-time rendering performance. However, due to a flaw in its adaptive density control strategy, 3D-GS frequently suffers from over-reconstruction in intricate scenes containing high-frequency details, leading to blurry rendered images. The underlying reason for this flaw remains under-explored. In this work, we present a comprehensive analysis of the cause of the aforementioned artifacts, namely gradient collision, which prevents large Gaussians in over-reconstructed regions from splitting. To address this issue, we propose the novel homodirectional view-space positional gradient as the criterion for densification. Our strategy efficiently identifies large Gaussians in over-reconstructed regions and recovers fine details by splitting them. We evaluate our proposed method on various challenging datasets. The experimental results indicate that our approach achieves the best rendering quality with reduced or similar memory consumption. Our method is easy to implement and can be incorporated into a wide variety of recent Gaussian Splatting-based methods. We will open-source our code upon formal publication. Our project page is available at: https://ty424.github.io/AbsGS.github.io/
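+ The densification criterion replaces signed, summed view-space positional gradients (which can cancel each other) with an accumulation of their magnitudes; a small PyTorch sketch of that bookkeeping is given below, with the threshold value chosen arbitrarily for illustration rather than taken from the paper.

```python
import torch

def accumulate_abs_grad(grad_xy: torch.Tensor, abs_accum: torch.Tensor,
                        denom: torch.Tensor) -> None:
    """Accumulate the magnitude of each Gaussian's view-space positional
    gradient. Summing signed gradients lets opposite directions cancel
    ("gradient collision"); accumulating absolute values avoids that.
    Sketch of the idea, not the authors' implementation."""
    abs_accum += grad_xy.abs().sum(dim=-1)   # homodirectional magnitude per Gaussian
    denom += 1.0                             # number of accumulation steps

def split_mask(abs_accum: torch.Tensor, denom: torch.Tensor,
               threshold: float = 2e-4) -> torch.Tensor:
    """Gaussians whose average absolute gradient exceeds the threshold become
    candidates for splitting."""
    return (abs_accum / denom.clamp(min=1.0)) > threshold
```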
+
+
+
+
+ + ☆ Efficient optimal dispersed Haar-like filters for face detection + + +
+ This paper introduces a new dispersed Haar-like filter for efficient face detection. The basic idea for finding the filter is to maximise between-class variance while minimising within-class variance. The proposed filters can be considered an optimal configuration of dispersed Haar-like filters, i.e., filters with disjoint black and white parts.
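+ The selection criterion amounts to a Fisher-style ratio of between-class to within-class variance of a candidate filter's responses; a minimal sketch of such a scoring function (an assumption about the objective, not the paper's exact optimisation) is:

```python
import numpy as np

def fisher_score(face_responses: np.ndarray, nonface_responses: np.ndarray) -> float:
    """Between-class over within-class variance of a candidate dispersed
    Haar-like filter's responses on face vs. non-face samples."""
    mu_f, mu_n = face_responses.mean(), nonface_responses.mean()
    var_f, var_n = face_responses.var(), nonface_responses.var()
    return float((mu_f - mu_n) ** 2 / (var_f + var_n + 1e-12))
```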
+
+
+
+
+ + ☆ Toward a Realistic Benchmark for Out-of-Distribution Detection + + +
+ Deep neural networks are increasingly used in a wide range of technologies and services, but remain highly susceptible to out-of-distribution (OOD) samples, that is, samples drawn from a distribution different from that of the original training set. A common approach to address this issue is to endow deep neural networks with the ability to detect OOD samples. Several benchmarks have been proposed to design and validate OOD detection techniques. However, many of them are based on far-OOD samples drawn from very different distributions, and thus lack the complexity needed to capture the nuances of real-world scenarios. In this work, we introduce a comprehensive benchmark for OOD detection, based on ImageNet and Places365, that assigns individual classes as in-distribution or out-of-distribution depending on their semantic similarity with the training set. Several techniques can be used to determine which classes should be considered in-distribution, yielding benchmarks with varying properties. Experimental results on different OOD detection techniques show how their measured efficacy depends on the selected benchmark and how confidence-based techniques may outperform classifier-based ones on near-OOD samples.
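+ A hedged sketch of the class-assignment step follows: each candidate class is marked in- or out-of-distribution according to its maximum semantic similarity to the training classes. The use of cosine similarity over embeddings and the threshold value are illustrative assumptions; the benchmark's actual similarity measure may differ.

```python
import numpy as np

def assign_in_distribution(candidate_emb: np.ndarray, train_emb: np.ndarray,
                           threshold: float = 0.5) -> np.ndarray:
    """Return a boolean mask: True if a candidate class is semantically close
    enough to the training classes to be treated as in-distribution."""
    a = candidate_emb / np.linalg.norm(candidate_emb, axis=1, keepdims=True)
    b = train_emb / np.linalg.norm(train_emb, axis=1, keepdims=True)
    max_sim = (a @ b.T).max(axis=1)          # closest training class per candidate
    return max_sim >= threshold
```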
+
+
+
+
+ + ☆ A Computer Vision-Based Quality Assessment Technique for the automatic + control of consumables for analytical laboratories + + +
+ The rapid growth of the Industry 4.0 paradigm is increasing the pressure to +develop effective automated monitoring systems. Artificial Intelligence (AI) is +a convenient tool to improve the efficiency of industrial processes while +reducing errors and waste. In fact, it allows the use of real-time data to +increase the effectiveness of monitoring systems, minimize errors, make the +production process more sustainable, and save costs. In this paper, a novel +automatic monitoring system is proposed in the context of production process of +plastic consumables used in analysis laboratories, with the aim to increase the +effectiveness of the control process currently performed by a human operator. +In particular, we considered the problem of classifying the presence or absence +of a transparent anticoagulant substance inside test tubes. Specifically, a +hand-designed deep network model is used and compared with some +state-of-the-art models for its ability to categorize different images of vials +that can be either filled with the anticoagulant or empty. Collected results +indicate that the proposed approach is competitive with state-of-the-art models +in terms of accuracy. Furthermore, we increased the complexity of the task by +training the models on the ability to discriminate not only the presence or +absence of the anticoagulant inside the vial, but also the size of the test +tube. The analysis performed in the latter scenario confirms the +competitiveness of our approach. Moreover, our model is remarkably superior in +terms of its generalization ability and requires significantly fewer resources. +These results suggest the possibility of successfully implementing such a model +in the production process of a plastic consumables company. + +
+
+ comment: 31 pages, 13 figures, 10 tables +
+
+
+
+
+ + ☆ 1st Place Solution for ICCV 2023 OmniObject3D Challenge: Sparse-View + Reconstruction + + +
+ In this report, we present the 1st place solution for ICCV 2023 OmniObject3D +Challenge: Sparse-View Reconstruction. The challenge aims to evaluate +approaches for novel view synthesis and surface reconstruction using only a few +posed images of each object. We utilize Pixel-NeRF as the basic model, and +apply depth supervision as well as coarse-to-fine positional encoding. The +experiments demonstrate the effectiveness of our approach in improving +sparse-view reconstruction quality. We ranked first in the final test with a +PSNR of 25.44614. + +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Pre-Trained Features for Camera Pose + Refinement CVPR2024 + + +
+ Pose refinement is an interesting and practically relevant research +direction. Pose refinement can be used to (1) obtain a more accurate pose +estimate from an initial prior (e.g., from retrieval), (2) as pre-processing, +i.e., to provide a better starting point to a more expensive pose estimator, +(3) as post-processing of a more accurate localizer. Existing approaches focus +on learning features / scene representations for the pose refinement task. This +involves training an implicit scene representation or learning features while +optimizing a camera pose-based loss. A natural question is whether training +specific features / representations is truly necessary or whether similar +results can be already achieved with more generic features. In this work, we +present a simple approach that combines pre-trained features with a particle +filter and a renderable representation of the scene. Despite its simplicity, it +achieves state-of-the-art results, demonstrating that one can easily build a +pose refiner without the need for specific training. The code is at +https://github.com/ga1i13o/mcloc_poseref + +
+
+ comment: Accepted to CVPR2024 (Highlight) +
+
+
+
+
+ + ☆ Explainable concept mappings of MRI: Revealing the mechanisms underlying + deep learning-based brain disease classification + + +
+ Motivation. While recent studies show high accuracy in the classification of +Alzheimer's disease using deep neural networks, the underlying learned concepts +have not been investigated. + Goals. To systematically identify changes in brain regions through concepts +learned by the deep neural network for model validation. + Approach. Using quantitative R2* maps we separated Alzheimer's patients +(n=117) from normal controls (n=219) by using a convolutional neural network +and systematically investigated the learned concepts using Concept Relevance +Propagation and compared these results to a conventional region of +interest-based analysis. + Results. In line with established histological findings and the region of +interest-based analyses, highly relevant concepts were primarily found in and +adjacent to the basal ganglia. + Impact. The identification of concepts learned by deep neural networks for +disease classification enables validation of the models and could potentially +improve reliability. + +
+
+
+
+
+ + ☆ Camera clustering for scalable stream-based active distillation + + +
+ We present a scalable framework designed to craft efficient lightweight +models for video object detection utilizing self-training and knowledge +distillation techniques. We scrutinize methodologies for the ideal selection of +training images from video streams and the efficacy of model sharing across +numerous cameras. By advocating for a camera clustering methodology, we aim to +diminish the requisite number of models for training while augmenting the +distillation dataset. The findings affirm that proper camera clustering notably +amplifies the accuracy of distilled models, eclipsing the methodologies that +employ distinct models for each camera or a universal model trained on the +aggregate camera data. + +
+
+ comment: This manuscript is currently under review at IEEE Transactions on + Circuits and Systems for Video Technology +
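+ One plausible reading of the clustering step is to group cameras by an aggregate feature of their streams and train one distilled student per group; the sketch below uses mean frame features and k-means, both of which are assumptions rather than the paper's exact procedure.

```python
import numpy as np
from sklearn.cluster import KMeans

def cluster_cameras(per_camera_features: list, n_clusters: int = 4) -> np.ndarray:
    """Assign each camera to a cluster based on the mean feature vector of its
    frames; cameras in the same cluster share one distilled student model."""
    camera_means = np.stack([feats.mean(axis=0) for feats in per_camera_features])
    return KMeans(n_clusters=n_clusters, n_init=10).fit_predict(camera_means)
```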
+
+
+
+
+ + ☆ Adversarial Identity Injection for Semantic Face Image Synthesis CVPR 2024 + + +
+ Nowadays, deep learning models have reached incredible performance in the task of image generation. Plenty of works in the literature address the task of face generation and editing, with humans and automatic systems struggling to distinguish what is real from what is generated. Whereas most systems reach excellent visual generation quality, they still face difficulties in preserving the identity of the starting input subject. Among all the explored techniques, Semantic Image Synthesis (SIS) methods, whose goal is to generate an image conditioned on a semantic segmentation mask, are the most promising, even though preserving the perceived identity of the input subject is not their main concern. Therefore, in this paper, we investigate the problem of identity preservation in face image generation and present an SIS architecture that exploits a cross-attention mechanism to merge identity, style, and semantic features to generate faces whose identities are as similar as possible to the input ones. Experimental results reveal that the proposed method is not only suitable for preserving the identity but is also effective as a face recognition adversarial attack, i.e., hiding a second identity in the generated faces.
+
+ comment: Paper accepted at CVPR 2024 Biometrics Workshop +
+
+
+
+
+ + ☆ Comprehensive Survey of Model Compression and Speed up for Vision + Transformers + + +
+ Vision Transformers (ViT) have marked a paradigm shift in computer vision, +outperforming state-of-the-art models across diverse tasks. However, their +practical deployment is hampered by high computational and memory demands. This +study addresses the challenge by evaluating four primary model compression +techniques: quantization, low-rank approximation, knowledge distillation, and +pruning. We methodically analyze and compare the efficacy of these techniques +and their combinations in optimizing ViTs for resource-constrained +environments. Our comprehensive experimental evaluation demonstrates that these +methods facilitate a balanced compromise between model accuracy and +computational efficiency, paving the way for wider application in edge +computing devices. + +
+
+
+
+
+ + ☆ Integration of Self-Supervised BYOL in Semi-Supervised Medical Image + Recognition CCS 2024 + + +
+ Image recognition techniques heavily rely on abundant labeled data, +particularly in medical contexts. Addressing the challenges associated with +obtaining labeled data has led to the prominence of self-supervised learning +and semi-supervised learning, especially in scenarios with limited annotated +data. In this paper, we proposed an innovative approach by integrating +self-supervised learning into semi-supervised models to enhance medical image +recognition. Our methodology commences with pre-training on unlabeled data +utilizing the BYOL method. Subsequently, we merge pseudo-labeled and labeled +datasets to construct a neural network classifier, refining it through +iterative fine-tuning. Experimental results on three different datasets +demonstrate that our approach optimally leverages unlabeled data, outperforming +existing methods in terms of accuracy for medical image recognition. + +
+
+ comment: Accepted by ICCS 2024 +
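+ The pipeline described in the abstract above (BYOL pre-training, then merging pseudo-labeled and labeled data for iterative fine-tuning) can be sketched as follows; the confidence threshold and data format are assumptions for illustration.

```python
import torch

@torch.no_grad()
def add_pseudo_labels(classifier: torch.nn.Module, unlabeled_loader,
                      labeled_pairs: list, threshold: float = 0.95) -> list:
    """After BYOL pre-training, keep confidently predicted unlabeled images as
    pseudo-labeled samples and merge them with the labeled (image, label) pairs."""
    classifier.eval()
    pseudo = []
    for images in unlabeled_loader:                  # loader yields image batches
        probs = classifier(images).softmax(dim=-1)
        conf, labels = probs.max(dim=-1)
        keep = conf >= threshold
        pseudo.extend(zip(images[keep], labels[keep]))
    return labeled_pairs + pseudo
```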
+
+
+
+
+ + ☆ Portrait3D: Text-Guided High-Quality 3D Portrait Generation Using + Pyramid Representation and GANs Prior + + +
+ Existing neural rendering-based text-to-3D-portrait generation methods +typically make use of human geometry prior and diffusion models to obtain +guidance. However, relying solely on geometry information introduces issues +such as the Janus problem, over-saturation, and over-smoothing. We present +Portrait3D, a novel neural rendering-based framework with a novel joint +geometry-appearance prior to achieve text-to-3D-portrait generation that +overcomes the aforementioned issues. To accomplish this, we train a 3D portrait +generator, 3DPortraitGAN-Pyramid, as a robust prior. This generator is capable +of producing 360{\deg} canonical 3D portraits, serving as a starting point for +the subsequent diffusion-based generation process. To mitigate the "grid-like" +artifact caused by the high-frequency information in the feature-map-based 3D +representation commonly used by most 3D-aware GANs, we integrate a novel +pyramid tri-grid 3D representation into 3DPortraitGAN-Pyramid. To generate 3D +portraits from text, we first project a randomly generated image aligned with +the given prompt into the pre-trained 3DPortraitGAN-Pyramid's latent space. The +resulting latent code is then used to synthesize a pyramid tri-grid. Beginning +with the obtained pyramid tri-grid, we use score distillation sampling to +distill the diffusion model's knowledge into the pyramid tri-grid. Following +that, we utilize the diffusion model to refine the rendered images of the 3D +portrait and then use these refined images as training data to further optimize +the pyramid tri-grid, effectively eliminating issues with unrealistic color and +unnatural artifacts. Our experimental results show that Portrait3D can produce +realistic, high-quality, and canonical 3D portraits that align with the prompt. + +
+
+
+
+
+ + ☆ CNN-based explanation ensembling for dataset, representation and + explanations evaluation + + +
+ Explainable Artificial Intelligence has gained significant attention due to the widespread use of complex deep learning models in high-stakes domains such as medicine, finance, and autonomous cars. However, different explanations often present different aspects of the model's behavior. In this research manuscript, we explore the potential of ensembling explanations generated by deep classification models using a convolutional model. Through experimentation and analysis, we aim to investigate the implications of combining explanations to uncover more coherent and reliable patterns of the model's behavior, leading to the possibility of evaluating the representation learned by the model. With our method, we can uncover problems of under-representation of images in a certain class. Moreover, we discuss other side benefits, such as feature reduction, achieved by replacing the original image with its explanations and thereby removing some sensitive information. Through the use of carefully selected evaluation metrics from the Quantus library, we demonstrate the method's superior performance in terms of Localisation and Faithfulness compared to individual explanations.
+
+ comment: accepted at 2nd World Conference on eXplainable Artificial + Intelligence +
+
+
+
+
+ + ☆ Learning to Score Sign Language with Two-stage Method + + +
+ Human action recognition and performance assessment have been hot research topics in recent years. Recognition problems have mature solutions in the field of sign language, but past research on performance analysis has focused on competitive sports and medical training, overlooking scoring assessment, which is an important part of digitalizing sign language teaching. In this paper, we analyze existing technologies for performance assessment and adopt methods that perform well in human pose reconstruction tasks, combined with motion-rotation embedded expressions, to propose a two-stage sign language performance evaluation pipeline. Our analysis shows that choosing a reconstruction task in the first stage provides more expressive features, and that using smoothing methods provides an effective reference for assessment. Experiments show that, compared to end-to-end evaluation, our method provides a good score feedback mechanism and high consistency with professional assessments.
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Second Edition FRCSyn Challenge at CVPR 2024: Face Recognition Challenge + in the Era of Synthetic Data + + +
+ Synthetic data is gaining increasing relevance for training machine learning models. This is mainly motivated by several factors, such as the lack of real data and intra-class variability, the time and errors produced in manual labeling, and, in some cases, privacy concerns. This paper presents an overview of the 2nd edition of the Face Recognition Challenge in the Era of Synthetic Data (FRCSyn) organized at CVPR 2024. FRCSyn aims to investigate the use of synthetic data in face recognition to address current technological limitations, including data privacy concerns, demographic biases, generalization to novel scenarios, and performance constraints in challenging situations such as aging, pose variations, and occlusions. Unlike the 1st edition, in which synthetic data from the DCFace and GANDiffFace methods was only allowed to train face recognition systems, in this 2nd edition we propose new sub-tasks that allow participants to explore novel face generative methods. The outcomes of the 2nd FRCSyn Challenge, along with the proposed experimental protocol and benchmarking, contribute significantly to the application of synthetic data to face recognition.
+
+ comment: arXiv admin note: text overlap with arXiv:2311.10476 +
+
+
+
+
+ + ☆ Know Yourself Better: Diverse Discriminative Feature Learning Improves + Open Set Recognition + + +
+ Open set recognition (OSR) is a critical aspect of machine learning, +addressing the challenge of detecting novel classes during inference. Within +the realm of deep learning, neural classifiers trained on a closed set of data +typically struggle to identify novel classes, leading to erroneous predictions. +To address this issue, various heuristic methods have been proposed, allowing +models to express uncertainty by stating "I don't know." However, a gap in the +literature remains, as there has been limited exploration of the underlying +mechanisms of these methods. In this paper, we conduct an analysis of open set +recognition methods, focusing on the aspect of feature diversity. Our research +reveals a significant correlation between learning diverse discriminative +features and enhancing OSR performance. Building on this insight, we propose a +novel OSR approach that leverages the advantages of feature diversity. The +efficacy of our method is substantiated through rigorous evaluation on a +standard OSR testbench, demonstrating a substantial improvement over +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Improving Bracket Image Restoration and Enhancement with Flow-guided + Alignment and Enhanced Feature Aggregation + + +
+ In this paper, we address the Bracket Image Restoration and Enhancement +(BracketIRE) task using a novel framework, which requires restoring a +high-quality high dynamic range (HDR) image from a sequence of noisy, blurred, +and low dynamic range (LDR) multi-exposure RAW inputs. To overcome this +challenge, we present the IREANet, which improves the multiple exposure +alignment and aggregation with a Flow-guide Feature Alignment Module (FFAM) and +an Enhanced Feature Aggregation Module (EFAM). Specifically, the proposed FFAM +incorporates the inter-frame optical flow as guidance to facilitate the +deformable alignment and spatial attention modules for better feature +alignment. The EFAM further employs the proposed Enhanced Residual Block (ERB) +as a foundational component, wherein a unidirectional recurrent network +aggregates the aligned temporal features to better reconstruct the results. To +improve model generalization and performance, we additionally employ the Bayer +preserving augmentation (BayerAug) strategy to augment the multi-exposure RAW +inputs. Our experimental evaluations demonstrate that the proposed IREANet +shows state-of-the-art performance compared with previous methods. + +
+
+
+
+
+ + ☆ Optimization of Prompt Learning via Multi-Knowledge Representation for + Vision-Language Models + + +
+ Vision-Language Models (VLMs), such as CLIP, play a foundational role in various cross-modal applications. To fully leverage VLMs' potential in adapting to downstream tasks, context optimization methods like Prompt Tuning are essential. However, one key limitation is the lack of diversity in prompt templates, whether they are hand-crafted or learned through additional modules. This limitation restricts the capabilities of pretrained VLMs and can result in incorrect predictions in downstream tasks. To address this challenge, we propose Context Optimization with Multi-Knowledge Representation (CoKnow), a framework that enhances Prompt Learning for VLMs with rich contextual knowledge. To facilitate CoKnow during inference, we train lightweight semantic knowledge mappers, which are capable of generating a Multi-Knowledge Representation for an input image without requiring additional priors. Experimentally, we conducted extensive experiments on 11 publicly available datasets, demonstrating that CoKnow outperforms a series of previous methods. We will make all resources open-source: https://github.com/EMZucas/CoKnow.
+
+
+
+
+ + ☆ The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report CVPR + + +
+ This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low- and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks: the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (i.e., runtime, FLOPs, and parameter count) were considered, and the main-track ranking is calculated as a weighted sum of the scores of the other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered, and the score calculated from it determined the ranking. In sub-track 3, the number of parameters was considered, and the score calculated from it determined the ranking. RLFN is set as the baseline for efficiency measurement. The challenge had 262 registered participants, and 34 teams made valid submissions, gauging the state of the art in efficient single-image super-resolution. To facilitate reproducibility and enable other researchers to build upon these findings, the code and the pre-trained models of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/.
+
+ comment: The report paper of NTIRE2024 Efficient Super-resolution, accepted by + CVPRW2024 +
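+ The main-track ranking combines the sub-track scores relative to the RLFN baseline; a hedged sketch of such a weighted sum is shown below, with placeholder score definitions and weights rather than the official challenge formula.

```python
def main_track_score(runtime_s: float, flops_g: float, params_m: float,
                     baseline: dict, weights: tuple = (0.7, 0.15, 0.15)) -> float:
    """Combine runtime, FLOPs, and parameter sub-scores (each measured relative
    to the RLFN baseline) into a single main-track score by a weighted sum."""
    s_runtime = baseline["runtime_s"] / runtime_s   # lower runtime -> higher score
    s_flops = baseline["flops_g"] / flops_g
    s_params = baseline["params_m"] / params_m
    w_r, w_f, w_p = weights
    return w_r * s_runtime + w_f * s_flops + w_p * s_params
```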
+
+
+
+
+ + ☆ Referring Flexible Image Restoration + + +
+ In reality, images often exhibit multiple degradations, such as rain and fog +at night (triple degradations). However, in many cases, individuals may not +want to remove all degradations, for instance, a blurry lens revealing a +beautiful snowy landscape (double degradations). In such scenarios, people may +only desire to deblur. These situations and requirements shed light on a new +challenge in image restoration, where a model must perceive and remove specific +degradation types specified by human commands in images with multiple +degradations. We term this task Referring Flexible Image Restoration (RFIR). To +address this, we first construct a large-scale synthetic dataset called RFIR, +comprising 153,423 samples with the degraded image, text prompt for specific +degradation removal and restored image. RFIR consists of five basic degradation +types: blur, rain, haze, low light and snow while six main sub-categories are +included for varying degrees of degradation removal. To tackle the challenge, +we propose a novel transformer-based multi-task model named TransRFIR, which +simultaneously perceives degradation types in the degraded image and removes +specific degradation upon text prompt. TransRFIR is based on two devised +attention modules, Multi-Head Agent Self-Attention (MHASA) and Multi-Head Agent +Cross Attention (MHACA), where MHASA and MHACA introduce the agent token and +reach the linear complexity, achieving lower computation cost than vanilla +self-attention and cross-attention and obtaining competitive performances. Our +TransRFIR achieves state-of-the-art performances compared with other +counterparts and is proven as an effective architecture for image restoration. +We release our project at https://github.com/GuanRunwei/FIR-CP. + +
+
+ comment: 15 pages, 19 figures +
+
+
+
+
+ + ☆ Efficiently Adversarial Examples Generation for Visual-Language Models + under Targeted Transfer Scenarios using Diffusion Models + + +
+ Targeted transfer-based attacks involving adversarial examples pose a +significant threat to large visual-language models (VLMs). However, the +state-of-the-art (SOTA) transfer-based attacks incur high costs due to +excessive iteration counts. Furthermore, the generated adversarial examples +exhibit pronounced adversarial noise and demonstrate limited efficacy in +evading defense methods such as DiffPure. To address these issues, inspired by +score matching, we introduce AdvDiffVLM, which utilizes diffusion models to +generate natural, unrestricted adversarial examples. Specifically, AdvDiffVLM +employs Adaptive Ensemble Gradient Estimation to modify the score during the +diffusion model's reverse generation process, ensuring the adversarial examples +produced contain natural adversarial semantics and thus possess enhanced +transferability. Simultaneously, to enhance the quality of adversarial examples +further, we employ the GradCAM-guided Mask method to disperse adversarial +semantics throughout the image, rather than concentrating them in a specific +area. Experimental results demonstrate that our method achieves a speedup +ranging from 10X to 30X compared to existing transfer-based attack methods, +while maintaining superior quality of adversarial examples. Additionally, the +generated adversarial examples possess strong transferability and exhibit +increased robustness against adversarial defense methods. Notably, AdvDiffVLM +can successfully attack commercial VLMs, including GPT-4V, in a black-box +manner. + +
+
+
+
+
+ + ☆ Prescribing the Right Remedy: Mitigating Hallucinations in Large + Vision-Language Models via Targeted Instruction Tuning + + +
+ Despite achieving outstanding performance on various cross-modal tasks, current large vision-language models (LVLMs) still suffer from hallucination issues, manifesting as inconsistencies between their generated responses and the corresponding images. Prior research has indicated that the low quality of instruction data, particularly the skewed balance between positive and negative samples, is a significant contributor to model hallucinations. Recently, researchers have proposed high-quality instruction datasets, such as LRV-Instruction, to mitigate model hallucination. Nonetheless, our investigation reveals that hallucinatory concepts from different LVLMs exhibit specificity, i.e., the distribution of hallucinatory concepts varies significantly across models. Existing datasets did not consider the hallucination specificity of different models in their design processes, thereby diminishing their efficacy in mitigating model hallucination. In this paper, we propose a targeted instruction data generation framework named DFTG that is tailored to the hallucination specificity of different models. Concretely, DFTG consists of two stages: hallucination diagnosis, which extracts the necessary information from the model's responses and images for diagnosis; and targeted data generation, which generates targeted instruction data based on the diagnostic results. Experimental results on hallucination benchmarks demonstrate that the targeted instruction data generated by our method are more effective in mitigating hallucinations than previous datasets.
+
+
+
+
+ + ☆ Domain-Rectifying Adapter for Cross-Domain Few-Shot Segmentation CVPR 2024 + + +
+ Few-shot semantic segmentation (FSS) has achieved great success in segmenting objects of novel classes, supported by only a few annotated samples. However, existing FSS methods often underperform in the presence of domain shifts, especially when encountering new domain styles that are unseen during training. It is suboptimal to directly adapt or generalize the entire model to new domains in the few-shot scenario. Instead, our key idea is to adapt a small adapter that rectifies diverse target domain styles to the source domain. Consequently, the rectified target domain features can fittingly benefit from the well-optimized source domain segmentation model, which is intently trained on sufficient source domain data. Training the domain-rectifying adapter requires sufficiently diverse target domains. We thus propose a novel local-global style perturbation method to simulate diverse potential target domains by perturbing the feature channel statistics of individual images and the collective statistics of the entire source domain, respectively. Additionally, we propose a cyclic domain alignment module that helps the adapter rectify domains effectively using a reverse domain rectification supervision. The adapter is trained to rectify image features from diverse synthesized target domains to align with the source domain. During testing on target domains, we first rectify the image features and then conduct few-shot segmentation on the domain-rectified features. Extensive experiments demonstrate the effectiveness of our method, achieving promising results on cross-domain few-shot semantic segmentation tasks. Our code is available at https://github.com/Matt-Su/DR-Adapter.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Application of Deep Learning Methods to Processing of Noisy Medical + Video Data + + +
+ Cell counting becomes a challenging problem when cells move in a continuous stream and their boundaries are difficult to detect visually. To resolve this problem, we modified the training and decision-making processes using curriculum learning and multi-view prediction techniques, respectively.
+
+
+
+
+ + ☆ SRGS: Super-Resolution 3D Gaussian Splatting ACM MM 2024 + + +
+ Recently, 3D Gaussian Splatting (3DGS) has gained popularity as a novel +explicit 3D representation. This approach relies on the representation power of +Gaussian primitives to provide a high-quality rendering. However, primitives +optimized at low resolution inevitably exhibit sparsity and texture deficiency, +posing a challenge for achieving high-resolution novel view synthesis (HRNVS). +To address this problem, we propose Super-Resolution 3D Gaussian Splatting +(SRGS) to perform the optimization in a high-resolution (HR) space. The +sub-pixel constraint is introduced for the increased viewpoints in HR space, +exploiting the sub-pixel cross-view information of the multiple low-resolution +(LR) views. The gradient accumulated from more viewpoints will facilitate the +densification of primitives. Furthermore, a pre-trained 2D super-resolution +model is integrated with the sub-pixel constraint, enabling these dense +primitives to learn faithful texture features. In general, our method focuses +on densification and texture learning to effectively enhance the representation +ability of primitives. Experimentally, our method achieves high rendering +quality on HRNVS only with LR inputs, outperforming state-of-the-art methods on +challenging datasets such as Mip-NeRF 360 and Tanks & Temples. Related codes +will be released upon acceptance. + +
+
+ comment: submit ACM MM 2024 +
+
+
+
+
+ + ☆ Awareness of uncertainty in classification using a multivariate model + and multi-views + + +
+ One of the ways to make artificial intelligence more natural is to give it some room for doubt. Two main questions must then be resolved. First, how can a model be trained to estimate the uncertainty of its own predictions? And then, what should be done with uncertain predictions when they appear? First, we propose an uncertainty-aware negative log-likelihood loss for an N-dimensional multivariate normal distribution with a spherical covariance matrix, applied to N-class classification tasks. The loss is similar to the heteroscedastic regression loss. The proposed model regularizes uncertain predictions and learns to produce both the predictions and their uncertainty estimates. The model fits well with the label smoothing technique. Second, we expand the limits of data augmentation at the training and test stages, and make the trained model give multiple predictions for a number of augmented versions of each test sample. Given the multi-view predictions together with their uncertainties and confidences, we propose several methods to calculate the final predictions, including mode values and bin counts with soft and hard weights. For the latter method, we formalize the model tuning task as multimodal optimization with the non-differentiable criterion of maximum accuracy, and apply particle swarm optimization to solve it. The proposed methodology was tested on the CIFAR-10 dataset with clean and noisy labels and demonstrated good results in comparison with other uncertainty estimation methods related to sample selection, co-teaching, and label smoothing.
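+ For an N-dimensional normal with spherical covariance sigma^2 I, the negative log-likelihood is (N/2) log(2 pi sigma^2) + ||y - mu||^2 / (2 sigma^2); a hedged PyTorch sketch of such a loss, with the network predicting log sigma alongside the class scores, follows (a sketch of the described loss, not the authors' exact formulation).

```python
import math
import torch

def spherical_gaussian_nll(mu: torch.Tensor, log_sigma: torch.Tensor,
                           target: torch.Tensor) -> torch.Tensor:
    """Negative log-likelihood of an N-dimensional normal with covariance
    sigma^2 * I. Predicting log_sigma lets the network express uncertainty."""
    n = mu.shape[-1]
    var = torch.exp(2.0 * log_sigma)                  # sigma^2, one value per sample
    sq_err = (target - mu).pow(2).sum(dim=-1)         # ||y - mu||^2
    nll = 0.5 * n * torch.log(2.0 * math.pi * var) + sq_err / (2.0 * var)
    return nll.mean()
```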
+
+
+
+
+ + ☆ OmniSSR: Zero-shot Omnidirectional Image Super-Resolution using Stable + Diffusion Model + + +
+ Omnidirectional images (ODIs) are commonly used in real-world visual tasks, +and high-resolution ODIs help improve the performance of related visual tasks. +Most existing super-resolution methods for ODIs use end-to-end learning +strategies, resulting in inferior realness of generated images and a lack of +effective out-of-domain generalization capabilities in training methods. Image +generation methods represented by diffusion model provide strong priors for +visual tasks and have been proven to be effectively applied to image +restoration tasks. Leveraging the image priors of the Stable Diffusion (SD) +model, we achieve omnidirectional image super-resolution with both fidelity and +realness, dubbed as OmniSSR. Firstly, we transform the equirectangular +projection (ERP) images into tangent projection (TP) images, whose distribution +approximates the planar image domain. Then, we use SD to iteratively sample +initial high-resolution results. At each denoising iteration, we further +correct and update the initial results using the proposed Octadecaplex Tangent +Information Interaction (OTII) and Gradient Decomposition (GD) technique to +ensure better consistency. Finally, the TP images are transformed back to +obtain the final high-resolution results. Our method is zero-shot, requiring no +training or fine-tuning. Experiments of our method on two benchmark datasets +demonstrate the effectiveness of our proposed method. + +
+
+
+
+
+ + ☆ Learnable Prompt for Few-Shot Semantic Segmentation in Remote Sensing + Domain CVPR + + +
+ Few-shot segmentation is the task of segmenting objects or regions of novel classes within an image, given only a few annotated examples. In the generalized setting, the task extends to segmenting both the base and the novel classes. The main challenge is how to train the model such that the addition of novel classes does not hurt performance on the base classes, a problem known as catastrophic forgetting. To mitigate this issue, we use SegGPT as our base model and train it on the base classes. Then, we use separate learnable prompts to handle predictions for each novel class. To handle the various object sizes typically present in the remote sensing domain, we perform patch-based prediction. To address the discontinuities along patch boundaries, we propose a patch-and-stitch technique that re-frames the problem as an image inpainting task. During inference, we also utilize image similarity search over image embeddings for prompt selection and novel class filtering to reduce false positive predictions. Based on our experiments, our proposed method boosts the weighted mIoU of a simple fine-tuned SegGPT from 15.96 to 35.08 on the validation set of the few-shot OpenEarthMap dataset given in the challenge.
+
+ comment: Accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ TC-OCR: TableCraft OCR for Efficient Detection & Recognition of Table + Structure & Content + + +
+ The automatic recognition of tabular data in document images presents a +significant challenge due to the diverse range of table styles and complex +structures. Tables offer valuable content representation, enhancing the +predictive capabilities of various systems such as search engines and Knowledge +Graphs. Addressing the two main problems, namely table detection (TD) and table +structure recognition (TSR), has traditionally been approached independently. +In this research, we propose an end-to-end pipeline that integrates deep +learning models, including DETR, CascadeTabNet, and PP OCR v2, to achieve +comprehensive image-based table recognition. This integrated approach +effectively handles diverse table styles, complex structures, and image +distortions, resulting in improved accuracy and efficiency compared to existing +methods like Table Transformers. Our system achieves simultaneous table +detection (TD), table structure recognition (TSR), and table content +recognition (TCR), preserving table structures and accurately extracting +tabular data from document images. The integration of multiple models addresses +the intricacies of table recognition, making our approach a promising solution +for image-based table understanding, data extraction, and information retrieval +applications. Our proposed approach achieves an IOU of 0.96 and an OCR Accuracy +of 78%, showcasing a remarkable improvement of approximately 25% in the OCR +Accuracy compared to the previous Table Transformer approach. + +
+
+ comment: 8 pages, 2 figures, Workshop of 1st MMIR Deep Multimodal Learning for + Information Retrieval +
+
+
+
+
+ + ☆ From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for + Efficient Text-based Person Search + + +
+ In text-based person search, data generation has emerged as a prevailing practice, addressing concerns over privacy preservation and the arduous task of manual annotation. Although the amount of synthesized data can be infinite in theory, the scientific conundrum persists of how much generated data optimally fuels subsequent model training. We observe that only a subset of the data in these constructed datasets plays a decisive role. Therefore, we introduce a new Filtering-WoRA paradigm, which contains a filtering algorithm to identify this crucial data subset and a WoRA (Weighted Low-Rank Adaptation) learning strategy for light fine-tuning. The filtering algorithm is based on cross-modality relevance and removes the many coarsely matched synthetic pairs. As the amount of data decreases, we do not need to fine-tune the entire model; we therefore propose the WoRA learning strategy to efficiently update a minimal portion of the model parameters. WoRA streamlines the learning process, enabling heightened efficiency in extracting knowledge from fewer, yet potent, data instances. Extensive experimentation validates the efficacy of pretraining, where our model achieves advanced and efficient retrieval performance on challenging real-world benchmarks. Notably, on the CUHK-PEDES dataset, we achieve a competitive mAP of 67.02% while reducing model training time by 19.82%.
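+ WoRA is described only as a weighted low-rank adaptation for light fine-tuning; one plausible reading is a frozen linear layer plus a low-rank update scaled by a learnable weight, sketched below (the weighting scheme and rank are assumptions, not the paper's definition).

```python
import torch
import torch.nn as nn

class WoRALinear(nn.Module):
    """Frozen base linear layer plus a learnably weighted low-rank update,
    sketching the Weighted Low-Rank Adaptation idea from the abstract."""
    def __init__(self, base: nn.Linear, rank: int = 8):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                        # backbone stays frozen
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.update_weight = nn.Parameter(torch.ones(1))   # learnable scale of the update

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.update_weight * (x @ self.A.t() @ self.B.t())
```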
+
+
+
+
+ + ☆ NeuroMorphix: A Novel Brain MRI Asymmetry-specific Feature Construction + Approach For Seizure Recurrence Prediction + + +
+ Seizure recurrence is an important concern after an initial unprovoked +seizure; without drug treatment, it occurs within 2 years in 40-50% of cases. +The decision to treat currently relies on predictors of seizure recurrence risk +that are inaccurate, resulting in unnecessary, possibly harmful, treatment in +some patients and potentially preventable seizures in others. Because of the +link between brain lesions and seizure recurrence, we developed a recurrence +prediction tool using machine learning and clinical 3T brain MRI. We developed +NeuroMorphix, a feature construction approach based on MRI brain anatomy. Each +of seven NeuroMorphix features measures the absolute or relative difference +between corresponding regions in each cerebral hemisphere. FreeSurfer was used +to segment brain regions and to generate values for morphometric parameters (8 +for each cortical region and 5 for each subcortical region). The parameters +were then mapped to whole brain NeuroMorphix features, yielding a total of 91 +features per subject. Features were generated for a first seizure patient +cohort (n = 169) categorised into seizure recurrence and non-recurrence +subgroups. State-of-the-art classification algorithms were trained and tested +using NeuroMorphix features to predict seizure recurrence. Classification +models using the top 5 features, ranked by sequential forward selection, +demonstrated excellent performance in predicting seizure recurrence, with area +under the ROC curve of 88-93%, accuracy of 83-89%, and F1 score of 83-90%. +Highly ranked features aligned with structural alterations known to be +associated with epilepsy. This study highlights the potential for targeted, +data-driven approaches to aid clinical decision-making in brain disorders. + +
+
+ comment: This work has been submitted to the IEEE TMI for possible publication +
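+ Each NeuroMorphix feature is an absolute or relative difference between corresponding left- and right-hemisphere regions; a simple sketch of such asymmetry features is given below (the exact seven feature definitions in the paper may differ).

```python
import numpy as np

def asymmetry_features(left: np.ndarray, right: np.ndarray, eps: float = 1e-8):
    """Absolute and relative inter-hemispheric differences for corresponding
    regional morphometric parameters (e.g., FreeSurfer volumes or thicknesses)."""
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    absolute = np.abs(left - right)
    relative = absolute / (np.abs(left) + np.abs(right) + eps)   # normalized asymmetry
    return absolute, relative
```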
+
+
+
+
+ + ☆ Tripod: Three Complementary Inductive Biases for Disentangled + Representation Learning + + +
+ Inductive biases are crucial in disentangled representation learning for +narrowing down an underspecified solution set. In this work, we consider +endowing a neural network autoencoder with three select inductive biases from +the literature: data compression into a grid-like latent space via +quantization, collective independence amongst latents, and minimal functional +influence of any latent on how other latents determine data generation. In +principle, these inductive biases are deeply complementary: they most directly +specify properties of the latent space, encoder, and decoder, respectively. In +practice, however, naively combining existing techniques instantiating these +inductive biases fails to yield significant benefits. To address this, we +propose adaptations to the three techniques that simplify the learning problem, +equip key regularization terms with stabilizing invariances, and quash +degenerate incentives. The resulting model, Tripod, achieves state-of-the-art +results on a suite of four image disentanglement benchmarks. We also verify +that Tripod significantly improves upon its naive incarnation and that all +three of its "legs" are necessary for best performance. + +
+
+ comment: 22 pages, 10 figures, code available at + https://github.com/kylehkhsu/tripod +
+
+
+
+
+ + ☆ EucliDreamer: Fast and High-Quality Texturing for 3D Models with + Depth-Conditioned Stable Diffusion + + +
+ We present EucliDreamer, a simple and effective method to generate textures for 3D models given text prompts and meshes. The texture is parametrized as an implicit function on the 3D surface, which is optimized with the Score Distillation Sampling (SDS) process and differentiable rendering. To generate high-quality textures, we leverage a depth-conditioned Stable Diffusion model guided by the depth image rendered from the mesh. We test our approach on 3D models in Objaverse and conduct a user study, which shows its superior quality compared to existing texturing methods such as Text2Tex. In addition, our method converges 2 times faster than DreamFusion. Through text prompting, textures of diverse art styles can be produced. We hope EucliDreamer provides a viable solution to automate a labor-intensive stage in 3D content creation.
+
+ comment: Short version of arXiv:2311.15573 +
+
+
+
+
+ + ☆ Plug-and-Play Acceleration of Occupancy Grid-based NeRF Rendering using + VDB Grid and Hierarchical Ray Traversal CVPR + + +
+ Transmittance estimators such as the Occupancy Grid (OG) can accelerate the training and rendering of Neural Radiance Fields (NeRF) by predicting the important samples that contribute much to the generated image. However, OG manages occupied regions in the form of a dense binary grid, in which many blocks share the same values, causing redundant examinations of voxels' emptiness during ray-tracing. In our work, we introduce two techniques to improve the efficiency of ray-tracing in a trained OG without fine-tuning. First, we replace the dense grids with VDB grids to reduce spatial redundancy. Second, we use a hierarchical digital differential analyzer (HDDA) to efficiently trace voxels in the VDB grids. Our experiments on the NeRF-Synthetic and Mip-NeRF 360 datasets show that our proposed method accelerates rendering of the NeRF-Synthetic dataset by 12% on average and the Mip-NeRF 360 dataset by 4% on average, compared to a fast implementation of OG, NerfAcc, without losing the quality of rendered images.
+
+ comment: Short paper for CVPR Neural Rendering Intelligence Workshop 2024. + Code: https://github.com/Yosshi999/faster-occgrid +
+
+
+
+
+ + ☆ OneActor: Consistent Character Generation via Cluster-Conditioned + Guidance + + +
+ Text-to-image diffusion models benefit artists with high-quality image generation. Yet their stochastic nature prevents artists from creating consistent images of the same character. Existing methods try to tackle this challenge and generate consistent content in various ways. However, they either depend on external data or require expensive tuning of the diffusion model. For this issue, we argue that a lightweight but intricate guidance is enough. To this end, we formalize the objective of consistent generation, derive a clustering-based score function, and propose a novel paradigm, OneActor. We design a cluster-conditioned model that incorporates posterior samples to guide the denoising trajectories towards the target cluster. To overcome the overfitting challenge shared by one-shot tuning pipelines, we devise auxiliary components to simultaneously augment the tuning and regulate the inference. This technique is later verified to significantly enhance the content diversity of generated images. Comprehensive experiments show that our method outperforms a variety of baselines with satisfactory character consistency, superior prompt conformity, and high image quality, and is at least 4 times faster than tuning-based baselines. Furthermore, to the best of our knowledge, we are the first to show that the semantic space has the same interpolation property as the latent space does. This property can serve as another promising tool for fine generation control.
+
+
+
+
+ + ☆ PreGSU-A Generalized Traffic Scene Understanding Model for Autonomous + Driving based on Pre-trained Graph Attention Network + + +
+ Scene understanding, defined as learning, extraction, and representation of +interactions among traffic elements, is one of the critical challenges toward +high-level autonomous driving (AD). Current scene understanding methods mainly +focus on one concrete single task, such as trajectory prediction and risk level +evaluation. Although they perform well on specific metrics, the generalization +ability is insufficient to adapt to the real traffic complexity and downstream +demand diversity. In this study, we propose PreGSU, a generalized pre-trained +scene understanding model based on graph attention network to learn the +universal interaction and reasoning of traffic scenes to support various +downstream tasks. After the feature engineering and sub-graph module, all +elements are embedded as nodes to form a dynamic weighted graph. Then, four +graph attention layers are applied to learn the relationships among agents and +lanes. In the pre-train phase, the understanding model is trained on two +self-supervised tasks: Virtual Interaction Force (VIF) modeling and Masked Road +Modeling (MRM). Based on the artificial potential field theory, VIF modeling +enables PreGSU to capture the agent-to-agent interactions while MRM extracts +agent-to-road connections. In the fine-tuning process, the pre-trained +parameters are loaded to derive detailed understanding outputs. We conduct +validation experiments on two downstream tasks, i.e., trajectory prediction in +urban scenario, and intention recognition in highway scenario, to verify the +generalized ability and understanding ability. Results show that compared with +the baselines, PreGSU achieves better accuracy on both tasks, indicating the +potential to be generalized to various scenes and targets. Ablation study shows +the effectiveness of pre-train task design. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Masked Autoencoders for Microscopy are Scalable Learners of Cellular + Biology CVPR 2024 + + +
+ Featurizing microscopy images for use in biological research remains a significant challenge, especially for large-scale experiments spanning millions of images. This work explores the scaling properties of weakly supervised classifiers and self-supervised masked autoencoders (MAEs) when training with increasingly larger model backbones and microscopy datasets. Our results show that ViT-based MAEs outperform weakly supervised classifiers on a variety of tasks, achieving as much as an 11.5% relative improvement when recalling known biological relationships curated from public databases. Additionally, we develop a new channel-agnostic MAE architecture (CA-MAE) that allows for inputting images with different numbers and orders of channels at inference time. We demonstrate that CA-MAEs effectively generalize by inferring and evaluating on a microscopy image dataset (JUMP-CP) generated under different experimental conditions and with a different channel structure than our pretraining data (RPI-93M). Our findings motivate continued research into scaling self-supervised learning on microscopy data in order to create powerful foundation models of cellular biology that have the potential to catalyze advancements in drug discovery and beyond.
+
+ comment: CVPR 2024 Highlight. arXiv admin note: text overlap with + arXiv:2309.16064 +
+
+
+
+
+ + ☆ Vision-and-Language Navigation via Causal Learning + + +
+ In the pursuit of robust and generalizable environment perception and +language understanding, the ubiquitous challenge of dataset bias continues to +plague vision-and-language navigation (VLN) agents, hindering their performance +in unseen environments. This paper introduces the generalized cross-modal +causal transformer (GOAT), a pioneering solution rooted in the paradigm of +causal inference. By delving into both observable and unobservable confounders +within vision, language, and history, we propose the back-door and front-door +adjustment causal learning (BACL and FACL) modules to promote unbiased learning +by comprehensively mitigating potential spurious correlations. Additionally, to +capture global confounder features, we propose a cross-modal feature pooling +(CFP) module supervised by contrastive learning, which is also shown to be +effective in improving cross-modal representations during pre-training. +Extensive experiments across multiple VLN datasets (R2R, REVERIE, RxR, and +SOON) underscore the superiority of our proposed method over previous +state-of-the-art approaches. Code is available at +https://github.com/CrystalSixone/VLN-GOAT. + +
+
+
+
+
+ + ☆ MoE-TinyMed: Mixture of Experts for Tiny Medical Large Vision-Language + Models + + +
+ Mixture of Expert Tuning (MoE-Tuning) has effectively enhanced the +performance of general MLLMs with fewer parameters, yet its application in +resource-limited medical settings has not been fully explored. To address this +gap, we developed MoE-TinyMed, a model tailored for medical applications that +significantly lowers parameter demands. In evaluations on the VQA-RAD, SLAKE, +and Path-VQA datasets, MoE-TinyMed outperformed LLaVA-Med in all Med-VQA closed +settings with just 3.6B parameters. Additionally, a streamlined version with 2B +parameters surpassed LLaVA-Med's performance in PathVQA, showcasing its +effectiveness in resource-limited healthcare settings. + +
+
+
+
+
+ + ☆ Compressible and Searchable: AI-native Multi-Modal Retrieval System with + Learned Image Compression + + +
+ The burgeoning volume of digital content across diverse modalities +necessitates efficient storage and retrieval methods. Conventional approaches +struggle to cope with the escalating complexity and scale of multimedia data. +In this paper, we propose a framework that addresses this challenge by fusing +AI-native multi-modal search capabilities with neural image compression. First, +we analyze the intricate relationship between compressibility and +searchability, recognizing the pivotal role each plays in the efficiency of +storage and retrieval systems. A simple adapter is then used to bridge +the features of Learned Image Compression (LIC) and Contrastive Language-Image +Pretraining (CLIP) while retaining semantic fidelity and enabling retrieval of +multi-modal data. Experimental evaluations on the Kodak dataset demonstrate the +efficacy of our approach, showcasing significant enhancements in compression +efficiency and search accuracy compared to existing methodologies. Our work +marks a significant advancement towards scalable and efficient multi-modal +search systems in the era of big data. + +
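To make the adapter idea concrete, a minimal sketch is shown below: a small MLP maps pooled LIC latent features into a CLIP-style embedding space and is trained to align with a frozen CLIP image embedding. The dimensions, architecture, and cosine-alignment objective are assumptions for illustration, not the paper's exact recipe.

```python
# Illustrative sketch only: a tiny MLP adapter that maps features taken from a
# learned-image-compression (LIC) latent onto a CLIP-style embedding space, so
# retrieval can run directly on compressed representations.
import torch
import torch.nn as nn
import torch.nn.functional as F

class LICToCLIPAdapter(nn.Module):
    def __init__(self, lic_channels=192, clip_dim=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(lic_channels, 1024), nn.GELU(),
            nn.Linear(1024, clip_dim),
        )

    def forward(self, lic_latent):             # (B, C, h, w) quantized LIC latent
        return F.normalize(self.net(lic_latent), dim=-1)

adapter = LICToCLIPAdapter()
latent = torch.randn(4, 192, 16, 16)                       # stand-in for an LIC latent
clip_target = F.normalize(torch.randn(4, 512), dim=-1)     # stand-in frozen CLIP embedding
loss = 1 - F.cosine_similarity(adapter(latent), clip_target).mean()
loss.backward()
```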
+
+
+
+
+ + ☆ MS-MANO: Enabling Hand Pose Tracking with Biomechanical Constraints CVPR 2024 + + +
+ This work proposes a novel learning framework for visual hand dynamics +analysis that takes into account the physiological aspects of hand motion. The +existing models, which are simplified joint-actuated systems, often produce +unnatural motions. To address this, we integrate a musculoskeletal system with +a learnable parametric hand model, MANO, to create a new model, MS-MANO. This +model emulates the dynamics of muscles and tendons to drive the skeletal +system, imposing physiologically realistic constraints on the resulting torque +trajectories. We further propose a simulation-in-the-loop pose refinement +framework, BioPR, that refines the initial estimated pose through a multi-layer +perceptron (MLP) network. Our evaluation of the accuracy of MS-MANO and the +efficacy of the BioPR is conducted in two separate parts. The accuracy of +MS-MANO is compared with MyoSuite, while the efficacy of BioPR is benchmarked +against two large-scale public datasets and two recent state-of-the-art +methods. The results demonstrate that our approach consistently improves the +baseline methods both quantitatively and qualitatively. + +
+
+ comment: 11 pages, 5 figures; CVPR 2024 +
+
+
+
+
+ + ☆ Find The Gap: Knowledge Base Reasoning For Visual Question Answering + + +
+ We analyze knowledge-based visual question answering, in which, given a +question, models need to ground it in the visual modality and retrieve +the relevant knowledge from a given large knowledge base (KB) to be able to +answer. Our analysis is twofold: one part is based on designing neural architectures +and training them from scratch, and the other on large pre-trained language +models (LLMs). Our research questions are: 1) Can we effectively augment models +by explicit supervised retrieval of the relevant KB information to solve the +KB-VQA problem? 2) How do task-specific and LLM-based models perform in the +integration of visual and external knowledge, and in multi-hop reasoning over both +sources of information? 3) Is the implicit knowledge of LLMs sufficient for +KB-VQA, and to what extent can it replace the explicit KB? Our results +demonstrate the positive impact of empowering task-specific and LLM models with +supervised external and visual knowledge retrieval models. Our findings show +that though LLMs are stronger in 1-hop reasoning, they suffer in 2-hop +reasoning in comparison with our fine-tuned NN model, even if the relevant +information from both modalities is available to the model. Moreover, we +observed that LLM models outperform the NN model on KB-related questions, which +confirms the effectiveness of implicit knowledge in LLMs; however, this does not +alleviate the need for an external KB. + +
+
+
+
+
+ + ☆ Closed-Loop Open-Vocabulary Mobile Manipulation with GPT-4V + + +
+ Autonomous robot navigation and manipulation in open environments require +reasoning and replanning with closed-loop feedback. We present COME-robot, the +first closed-loop framework utilizing the GPT-4V vision-language foundation +model for open-ended reasoning and adaptive planning in real-world scenarios. +We meticulously construct a library of action primitives for robot exploration, +navigation, and manipulation, serving as callable execution modules for GPT-4V +in task planning. On top of these modules, GPT-4V serves as the brain that can +accomplish multimodal reasoning, generate action policy with code, verify the +task progress, and provide feedback for replanning. Such design enables +COME-robot to (i) actively perceive the environments, (ii) perform situated +reasoning, and (iii) recover from failures. Through comprehensive experiments +involving 8 challenging real-world tabletop and manipulation tasks, COME-robot +demonstrates a significant improvement in task success rate (~25%) compared to +state-of-the-art baseline methods. We further conduct comprehensive analyses to +elucidate how COME-robot's design facilitates failure recovery, free-form +instruction following, and long-horizon task planning. + +
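A high-level control-flow sketch of a closed perceive-plan-act-verify loop built on callable action primitives, in the spirit of the framework described above, is given below. The `ask_vlm` callable, the primitive registry, and the feedback format are hypothetical stand-ins; this is not the authors' code or any real GPT-4V API.

```python
# Conceptual loop: query a vision-language planner, execute a named primitive,
# and feed success/failure back so the next query can replan or recover.
def closed_loop_episode(task, ask_vlm, get_observation, primitives, max_steps=20):
    history = []
    for _ in range(max_steps):
        obs = get_observation()                       # e.g. wrist/head camera images
        plan = ask_vlm(task=task, observation=obs, history=history)  # returns a dict
        if plan.get("done"):
            return True, history
        fn = primitives[plan["action"]]               # e.g. "navigate_to", "grasp", "place"
        ok, feedback = fn(**plan.get("arguments", {}))
        history.append({"action": plan["action"], "success": ok, "feedback": feedback})
        # failures stay in the history so the planner can attempt recovery next step
    return False, history
```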
+
+
+
+
+ + ☆ GaitPoint+: A Gait Recognition Network Incorporating Point Cloud + Analysis and Recycling + + +
+ Gait is a behavioral biometric modality that can be used to recognize +individuals by the way they walk from a far distance. Most existing gait +recognition approaches rely on either silhouettes or skeletons, while their +joint use is underexplored. Features from silhouettes and skeletons can provide +complementary information for more robust recognition against appearance +changes or pose estimation errors. To exploit the benefits of both silhouette +and skeleton features, we propose a new gait recognition network, referred to +as the GaitPoint+. Our approach models skeleton key points as a 3D point cloud, +and employs a computational complexity-conscious 3D point processing approach +to extract skeleton features, which are then combined with silhouette features +for improved accuracy. Since silhouette- or CNN-based methods already require +considerable amount of computational resources, it is preferable that the key +point learning module is faster and more lightweight. We present a detailed +analysis of the utilization of every human key point after the use of +traditional max-pooling, and show that while elbow and ankle points are used +most commonly, many useful points are discarded by max-pooling. Thus, we +present a method to recycle some of the discarded points by a Recycling +Max-Pooling module, during processing of skeleton point clouds, and achieve +further performance improvement. We provide a comprehensive set of experimental +results showing that (i) incorporating skeleton features obtained by a +point-based 3D point cloud processing approach boosts the performance of three +different state-of-the-art silhouette- and CNN-based baselines; (ii) recycling +the discarded points increases the accuracy further. Ablation studies are also +provided to show the effectiveness and contribution of different components of +our approach. + +
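One way to picture the point-recycling idea is sketched below: after the usual channel-wise max over per-point features, points that were never selected by any channel receive a second pooling pass whose result is concatenated to the first. This mirrors the mechanism described above but is an assumption-based sketch, not the authors' exact Recycling Max-Pooling module.

```python
import torch

def recycling_max_pool(feats):
    # feats: (B, N, C) per-point features, e.g. from skeleton key points
    B, N, C = feats.shape
    pooled, argmax = feats.max(dim=1)                    # (B, C): winning point per channel
    used = torch.zeros(B, N, dtype=torch.bool, device=feats.device)
    used.scatter_(1, argmax, torch.ones_like(argmax, dtype=torch.bool))  # mark selected points
    masked = feats.masked_fill(used.unsqueeze(-1), float('-inf'))
    recycled, _ = masked.max(dim=1)                      # max over previously discarded points
    recycled = torch.where(torch.isinf(recycled), torch.zeros_like(recycled), recycled)
    return torch.cat([pooled, recycled], dim=-1)         # (B, 2C)

x = torch.randn(2, 17, 64)          # e.g. 17 skeleton key points with 64-d features
print(recycling_max_pool(x).shape)  # torch.Size([2, 128])
```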
+
+
+
+
+ + ☆ LWIRPOSE: A novel LWIR Thermal Image Dataset and Benchmark ICIP2024 + + +
+ Human pose estimation faces hurdles in real-world applications due to factors +like lighting changes, occlusions, and cluttered environments. We introduce a +unique RGB-Thermal Nearly Paired and Annotated 2D Pose Dataset, comprising over +2,400 high-quality LWIR (thermal) images. Each image is meticulously annotated +with 2D human poses, offering a valuable resource for researchers and +practitioners. This dataset, captured from seven actors performing diverse +everyday activities like sitting, eating, and walking, facilitates pose +estimation on occlusion and other challenging scenarios. We benchmark +state-of-the-art pose estimation methods on the dataset to showcase its +potential, establishing a strong baseline for future research. Our results +demonstrate the dataset's effectiveness in promoting advancements in pose +estimation for various applications, including surveillance, healthcare, and +sports analytics. The dataset and code are available at +https://github.com/avinres/LWIRPOSE + +
+
+ comment: Submitted in ICIP2024 +
+
+
+
+
+ + ☆ MK-SGN: A Spiking Graph Convolutional Network with Multimodal Fusion and + Knowledge Distillation for Skeleton-based Action Recognition + + +
+ In recent years, skeleton-based action recognition, leveraging multimodal +Graph Convolutional Networks (GCN), has achieved remarkable results. However, +due to their deep structure and reliance on continuous floating-point +operations, GCN-based methods are energy-intensive. To address this issue, we +propose an innovative Spiking Graph Convolutional Network with Multimodal +Fusion and Knowledge Distillation (MK-SGN). By merging the energy efficiency of +Spiking Neural Network (SNN) with the graph representation capability of GCN, +the proposed MK-SGN reduces energy consumption while maintaining recognition +accuracy. Firstly, we convert GCN into Spiking Graph Convolutional Network +(SGN) and construct a foundational Base-SGN for skeleton-based action +recognition, establishing a new benchmark and paving the way for future +research exploration. Secondly, we further propose a Spiking Multimodal Fusion +module (SMF), leveraging mutual information to process multimodal data more +efficiently. Additionally, we introduce a spiking attention mechanism and +design a Spatio Graph Convolution module with a Spatial Global Spiking +Attention mechanism (SA-SGC), enhancing feature learning capability. +Furthermore, we delve into knowledge distillation methods from multimodal GCN +to SGN and propose a novel, integrated method that simultaneously focuses on +both intermediate layer distillation and soft label distillation to improve the +performance of SGN. On two challenging datasets for skeleton-based action +recognition, MK-SGN outperforms the state-of-the-art GCN-like frameworks in +reducing computational load and energy consumption. In contrast, typical GCN +methods typically consume more than 35mJ per action sample, while MK-SGN +reduces energy consumption by more than 98%. + +
+
+
+
+
+ + ☆ Consistency and Uncertainty: Identifying Unreliable Responses From + Black-Box Vision-Language Models for Selective Visual Question Answering CVPR 2024 + + +
+ The goal of selective prediction is to allow a model to abstain when it +may not be able to deliver a reliable prediction, which is important in +safety-critical contexts. Existing approaches to selective prediction typically +require access to the internals of a model, require retraining a model, or study +only unimodal models. However, the most powerful models (e.g. GPT-4) are +typically only available as black boxes with inaccessible internals, are not +retrainable by end-users, and are frequently used for multimodal tasks. We +study the possibility of selective prediction for vision-language models in a +realistic, black-box setting. We propose using the principle of +\textit{neighborhood consistency} to identify unreliable responses from a +black-box vision-language model in question answering tasks. We hypothesize +that given only a visual question and model response, the consistency of the +model's responses over the neighborhood of a visual question will indicate +reliability. It is impossible to directly sample neighbors in feature space in +a black-box setting. Instead, we show that it is possible to use a smaller +proxy model to approximately sample from the neighborhood. We find that +neighborhood consistency can be used to identify model responses to visual +questions that are likely unreliable, even in adversarial settings or settings +that are out-of-distribution to the proxy model. + +
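A minimal sketch of the abstention logic is shown below: neighbors of the visual question are sampled with a proxy model, the black-box model is queried on each, and the answer is returned only if the responses agree often enough. `query_blackbox_vqa` and `proxy_neighbors` are hypothetical callables, and the agreement threshold is an arbitrary illustration, not the paper's calibration.

```python
from collections import Counter

def selective_answer(image, question, query_blackbox_vqa, proxy_neighbors,
                     k=8, min_agreement=0.6):
    # Query the black-box VQA model on the original question and on k neighbors
    # sampled by a smaller proxy model; abstain when answers are inconsistent.
    answers = [query_blackbox_vqa(image, question)]
    for img_n, q_n in proxy_neighbors(image, question, k=k):
        answers.append(query_blackbox_vqa(img_n, q_n))
    top_answer, count = Counter(a.strip().lower() for a in answers).most_common(1)[0]
    agreement = count / len(answers)
    if agreement < min_agreement:
        return None, agreement      # abstain: low neighborhood consistency
    return top_answer, agreement
```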
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Residual Connections Harm Self-Supervised Abstract Feature Learning + + +
+ We demonstrate that adding a weighting factor to decay the strength of +identity shortcuts within residual networks substantially improves semantic +feature learning in the state-of-the-art self-supervised masked autoencoding +(MAE) paradigm. Our modification to the identity shortcuts within a VIT-B/16 +backbone of an MAE boosts linear probing accuracy on ImageNet from 67.3% to +72.3%. This significant gap suggests that, while residual connection structure +serves an essential role in facilitating gradient propagation, it may have a +harmful side effect of reducing capacity for abstract learning by virtue of +injecting an echo of shallower representations into deeper layers. We +ameliorate this downside via a fixed formula for monotonically decreasing the +contribution of identity connections as layer depth increases. Our design +promotes the gradual development of feature abstractions, without impacting +network trainability. Analyzing the representations learned by our modified +residual networks, we find correlation between low effective feature rank and +downstream task performance. + +
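A minimal sketch of a residual block whose identity shortcut is scaled by a fixed, depth-dependent factor is given below. The linear schedule from 1.0 toward 0.5 is an illustrative assumption, not necessarily the exact formula used in the paper.

```python
import torch
import torch.nn as nn

class DecayedResidualBlock(nn.Module):
    def __init__(self, dim, layer_idx, num_layers):
        super().__init__()
        self.body = nn.Sequential(
            nn.LayerNorm(dim), nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim)
        )
        # shortcut weight shrinks monotonically with depth (shallow ~1.0, deep ~0.5)
        self.alpha = 1.0 - 0.5 * layer_idx / max(num_layers - 1, 1)

    def forward(self, x):
        return self.alpha * x + self.body(x)

blocks = nn.Sequential(*[DecayedResidualBlock(256, i, 12) for i in range(12)])
print(blocks(torch.randn(4, 256)).shape)  # torch.Size([4, 256])
```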
+
+
+
+
+ + ☆ Neuromorphic Vision-based Motion Segmentation with Graph Transformer + Neural Network + + +
+ Moving object segmentation is critical for interpreting scene dynamics for +robotic navigation systems in challenging environments. Neuromorphic vision +sensors are tailored for motion perception due to their asynchronous nature, +high temporal resolution, and reduced power consumption. However, their +unconventional output requires novel perception paradigms to leverage their +spatially sparse and temporally dense nature. In this work, we propose a novel +event-based motion segmentation algorithm using a Graph Transformer Neural +Network, dubbed GTNN. Our proposed algorithm processes event streams as 3D +graphs by a series of nonlinear transformations to unveil local and global +spatiotemporal correlations between events. Based on these correlations, events +belonging to moving objects are segmented from the background without prior +knowledge of the dynamic scene geometry. The algorithm is trained on publicly +available datasets including MOD, EV-IMO, and EV-IMO2 using +the proposed training scheme to facilitate efficient training on extensive +datasets. Moreover, we introduce the Dynamic Object Mask-aware Event Labeling +(DOMEL) approach for generating approximate ground-truth labels for event-based +motion segmentation datasets. We use DOMEL to label our own recorded Event +dataset for Motion Segmentation (EMS-DOMEL), which we release to the public for +further research and benchmarking. Rigorous experiments are conducted on +several unseen publicly available datasets, where the results reveal that GTNN +outperforms state-of-the-art methods in the presence of dynamic background +variations, motion patterns, and multiple dynamic objects with varying sizes +and velocities. GTNN achieves significant performance gains with an average +increase of 9.4% and 4.5% in terms of motion segmentation accuracy (IoU%) and +detection rate (DR%), respectively. + +
+
+
+
+
+ + ☆ A Concise Tiling Strategy for Preserving Spatial Context in Earth + Observation Imagery ICLR 2024 + + +
+ We propose a new tiling strategy, Flip-n-Slide, which has been developed for +specific use with large Earth observation satellite images when the location of +objects-of-interest (OoI) is unknown and spatial context can be necessary for +class disambiguation. Flip-n-Slide is a concise and minimalistic approach that +allows OoI to be represented at multiple tile positions and orientations. This +strategy introduces multiple views of spatio-contextual information, without +introducing redundancies into the training set. By maintaining distinct +transformation permutations for each tile overlap, we enhance the +generalizability of the training set without misrepresenting the true data +distribution. Our experiments validate the effectiveness of Flip-n-Slide in the +task of semantic segmentation, a necessary data product in geophysical studies. +We find that Flip-n-Slide outperforms the previous state-of-the-art +augmentation routines for tiled data in all evaluation metrics. For +underrepresented classes, Flip-n-Slide increases precision by as much as 15.8%. + +
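The sketch below captures the gist of the tiling strategy: slide a tile window with overlap and tag each overlapping pass with a distinct flip or rotation, so content seen by several tiles never appears twice under the same orientation. Tile size, stride, and the particular transform cycle here are illustrative assumptions, not the published configuration.

```python
import numpy as np

TRANSFORMS = [
    lambda t: t,
    lambda t: np.flip(t, axis=1),              # horizontal flip
    lambda t: np.flip(t, axis=0),              # vertical flip
    lambda t: np.rot90(t, k=1, axes=(0, 1)),   # 90-degree rotation
]

def flip_n_slide(image, tile=256, stride=128):
    # 50% overlap; each window gets the next transform in the cycle
    tiles, k = [], 0
    for y in range(0, image.shape[0] - tile + 1, stride):
        for x in range(0, image.shape[1] - tile + 1, stride):
            window = image[y:y + tile, x:x + tile]
            tiles.append(TRANSFORMS[k % len(TRANSFORMS)](window).copy())
            k += 1
    return tiles

scene = np.random.rand(1024, 1024, 3)   # stand-in for a satellite scene
print(len(flip_n_slide(scene)))         # 49 overlapping, transform-tagged tiles
```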
+
+ comment: Accepted to the Machine Learning for Remote Sensing (ML4RS) Workshop + at ICLR 2024 +
+
+
+
+
+ + ☆ Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression + Recognition CVPR 2024 + + +
+ Human communication is multi-modal; e.g., face-to-face interaction involves +auditory signals (speech) and visual signals (face movements and hand +gestures). Hence, it is essential to exploit multiple modalities when designing +machine learning-based facial expression recognition systems. In addition, +given the ever-growing quantities of video data that capture human facial +expressions, such systems should utilize raw unlabeled videos without requiring +expensive annotations. Therefore, in this work, we employ a multitask +multi-modal self-supervised learning method for facial expression recognition +from in-the-wild video data. Our model combines three self-supervised objective +functions: First, a multi-modal contrastive loss, that pulls diverse data +modalities of the same video together in the representation space. Second, a +multi-modal clustering loss that preserves the semantic structure of input data +in the representation space. Finally, a multi-modal data reconstruction loss. +We conduct a comprehensive study on this multimodal multi-task self-supervised +learning method on three facial expression recognition benchmarks. To that end, +we examine the performance of learning through different combinations of +self-supervised tasks on the facial expression recognition downstream task. Our +model ConCluGen outperforms several multi-modal self-supervised and fully +supervised baselines on the CMU-MOSEI dataset. Our results generally show that +multi-modal self-supervision tasks offer large performance gains for +challenging tasks such as facial expression recognition, while also reducing +the amount of manual annotations required. We release our pre-trained models as +well as source code publicly + +
+
+ comment: The paper will appear in the CVPR 2024 workshops proceedings +
+
+
+
+
+ + ☆ From a Lossless (~1.5:1) Compression Algorithm for Llama2 7B Weights to + Variable Precision, Variable Range, Compressed Numeric Data Types for CNNs + and LLMs + + +
+ This paper starts with a simple lossless ~1.5:1 compression algorithm for the +weights of the Large Language Model (LLM) Llama2 7B [1] that can be implemented +in ~200 LUTs in AMD FPGAs, processing over 800 million bfloat16 numbers per +second. This framework is then extended to variable precision, variable range, +compressed numerical data types that are a user-defined superset of both +floats and posits [2]. The paper then discusses a simple hardware +implementation of such a format based on ANS (Asymmetrical Numeral Systems) [3] +that acts as a bridge between this flexible data format and a computational +engine while, at the same time, achieving bandwidth reduction. An example of a +token factory using weight compression and sharing is also given. + +
+
+
+
+
+ + ☆ Semantics-Aware Attention Guidance for Diagnosing Whole Slide Images + + +
+ Accurate cancer diagnosis remains a critical challenge in digital pathology, +largely due to the gigapixel size and complex spatial relationships present in +whole slide images. Traditional multiple instance learning (MIL) methods often +struggle with these intricacies, especially in preserving the necessary context +for accurate diagnosis. In response, we introduce a novel framework named +Semantics-Aware Attention Guidance (SAG), which includes 1) a technique for +converting diagnostically relevant entities into attention signals, and 2) a +flexible attention loss that efficiently integrates various semantically +significant information, such as tissue anatomy and cancerous regions. Our +experiments on two distinct cancer datasets demonstrate consistent improvements +in accuracy, precision, and recall with two state-of-the-art baseline models. +Qualitative analysis further reveals that the incorporation of heuristic +guidance enables the model to focus on regions critical for diagnosis. SAG is +not only effective for the models discussed here, but its adaptability extends +to any attention-based diagnostic model. This opens up exciting possibilities +for further improving the accuracy and efficiency of cancer diagnostics. + +
+
+
+
+
+ + ☆ Automatic classification of prostate MR series type using image content + and metadata + + +
+ With the wealth of medical image data, efficient curation is essential. +Assigning the sequence type to magnetic resonance images is necessary for +scientific studies and artificial intelligence-based analysis. However, +incomplete or missing metadata prevents effective automation. We therefore +propose a deep-learning method for classification of prostate cancer scanning +sequences based on a combination of image data and DICOM metadata. We +demonstrate superior results compared to metadata or image data alone, and make +our code publicly available at +https://github.com/deepakri201/DICOMScanClassification. + +
+
+
+
+
+ + ☆ HumMUSS: Human Motion Understanding using State Space Models CVPR 24 + + +
+ Understanding human motion from video is essential for a range of +applications, including pose estimation, mesh recovery and action recognition. +While state-of-the-art methods predominantly rely on transformer-based +architectures, these approaches have limitations in practical scenarios. +Transformers are slower when sequentially predicting on a continuous stream of +frames in real-time, and do not generalize to new frame rates. In light of +these constraints, we propose a novel attention-free spatiotemporal model for +human motion understanding building upon recent advancements in state space +models. Our model not only matches the performance of transformer-based models +in various motion understanding tasks but also brings added benefits like +adaptability to different video frame rates and enhanced training speed when +working with longer sequence of keypoints. Moreover, the proposed model +supports both offline and real-time applications. For real-time sequential +prediction, our model is both memory efficient and several times faster than +transformer-based approaches while maintaining their high accuracy. + +
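To illustrate why a state-space layer supports both offline and streaming use, the toy example below runs a diagonal recurrence h_t = a*h_{t-1} + b*x_t, y_t = c*h_t over a keypoint sequence, once as a full scan and once frame by frame with constant memory. The diagonal parameterization and sequential loop are deliberate simplifications of real SSM layers, shown only to illustrate the mechanism.

```python
import torch
import torch.nn as nn

class TinySSM(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.a = nn.Parameter(torch.rand(dim) * 0.9)   # per-channel decay, kept < 1 for stability
        self.b = nn.Parameter(torch.randn(dim) * 0.1)
        self.c = nn.Parameter(torch.randn(dim) * 0.1)

    def forward(self, x):            # offline: x is (T, dim); real models use a parallel scan
        h, ys = torch.zeros_like(x[0]), []
        for xt in x:
            h = self.a * h + self.b * xt
            ys.append(self.c * h)
        return torch.stack(ys)

    def step(self, xt, h):           # streaming: constant memory per incoming frame
        h = self.a * h + self.b * xt
        return self.c * h, h

ssm = TinySSM(dim=34)                # e.g. 17 keypoints x (x, y)
clip = torch.randn(120, 34)
offline = ssm(clip)
h, online = torch.zeros(34), []
for frame in clip:                   # frame-by-frame prediction gives identical outputs
    y, h = ssm.step(frame, h)
    online.append(y)
print(torch.allclose(offline, torch.stack(online)))  # True
```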
+
+ comment: CVPR 24 +
+
+
+
+
+ + ☆ OSR-ViT: A Simple and Modular Framework for Open-Set Object Detection + and Discovery + + +
+ An object detector's ability to detect and flag \textit{novel} objects during +open-world deployments is critical for many real-world applications. +Unfortunately, much of the work in open object detection today is disjointed +and fails to adequately address applications that prioritize unknown object +recall \textit{in addition to} known-class accuracy. To close this gap, we +present a new task called Open-Set Object Detection and Discovery (OSODD) and +as a solution propose the Open-Set Regions with ViT features (OSR-ViT) +detection framework. OSR-ViT combines a class-agnostic proposal network with a +powerful ViT-based classifier. Its modular design simplifies optimization and +allows users to easily swap proposal solutions and feature extractors to best +suit their application. Using our multifaceted evaluation protocol, we show +that OSR-ViT obtains performance levels that far exceed state-of-the-art +supervised methods. Our method also excels in low-data settings, outperforming +supervised baselines using a fraction of the training data. + +
+
+ comment: 28 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Vocabulary-free Image Classification and Semantic Segmentation + + +
+ Large vision-language models revolutionized image classification and semantic +segmentation paradigms. However, they typically assume a pre-defined set of +categories, or vocabulary, at test time for composing textual prompts. This +assumption is impractical in scenarios with unknown or evolving semantic +context. Here, we address this issue and introduce the Vocabulary-free Image +Classification (VIC) task, which aims to assign a class from an unconstrained +language-induced semantic space to an input image without needing a known +vocabulary. VIC is challenging due to the vastness of the semantic space, which +contains millions of concepts, including fine-grained categories. To address +VIC, we propose Category Search from External Databases (CaSED), a +training-free method that leverages a pre-trained vision-language model and an +external database. CaSED first extracts the set of candidate categories from +the most semantically similar captions in the database and then assigns the +image to the best-matching candidate category according to the same +vision-language model. Furthermore, we demonstrate that CaSED can be applied +locally to generate a coarse segmentation mask that classifies image regions, +introducing the task of Vocabulary-free Semantic Segmentation. CaSED and its +variants outperform other more complex vision-language models, on +classification and semantic segmentation benchmarks, while using much fewer +parameters. + +
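A schematic version of the retrieve-then-match idea is sketched below: find the captions most similar to the image, harvest candidate category names from them, then score the candidates with the same vision-language model. `encode_image`, `encode_text`, and `extract_nouns` are hypothetical callables standing in for a CLIP-style encoder and a noun extractor; the real CaSED pipeline differs in its scoring details.

```python
import numpy as np

def vocabulary_free_classify(image, captions, caption_embs,
                             encode_image, encode_text, extract_nouns, top_k=10):
    # captions: list of strings from an external database; caption_embs: (M, d), L2-normalized
    img = encode_image(image)                        # (d,), L2-normalized image embedding
    nearest = np.argsort(-(caption_embs @ img))[:top_k]
    candidates = sorted({noun for i in nearest for noun in extract_nouns(captions[i])})
    cand_embs = np.stack([encode_text(f"a photo of a {c}") for c in candidates])
    return candidates[int(np.argmax(cand_embs @ img))]
```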
+
+ comment: Under review, 22 pages, 10 figures, code is available at + https://github.com/altndrr/vicss. arXiv admin note: text overlap with + arXiv:2306.00917 +
+
+
+
+
+ + ☆ UruDendro, a public dataset of cross-section images of Pinus taeda + + +
+ The automatic detection of tree-ring boundaries and other anatomical features +using image analysis has progressed substantially over the past decade with +advances in machine learning and imagery technology, as well as increasing +demands from the dendrochronology community. This paper presents a publicly +available database of 64 scanned images of transverse sections of commercially +grown Pinus taeda trees from northern Uruguay, ranging from 17 to 24 years old. +The collection contains several challenging features for automatic ring +detection, including illumination and surface preparation variation, fungal +infection (blue stains), knot formation, missing cortex or interruptions in +outer rings, and radial cracking. This dataset can be used to develop and test +automatic tree ring detection algorithms. This paper presents to the +dendrochronology community one such method, Cross-Section Tree-Ring Detection +(CS-TRD), which identifies and marks complete annual rings in cross-sections +for tree species presenting a clear definition between early and latewood. We +compare the CS-TRD performance against the ground truth manual delineation of +all rings over the UruDendro dataset. The CS-TRD software identified rings with +an average F-score of 89% and RMSE error of 5.27px for the entire database in +less than 20 seconds per image. Finally, we propose a robust measure of the +ring growth using the \emph{equivalent radius} of a circle having the same area +enclosed by the detected tree ring. Overall, this study contributes to the +dendrochronologist's toolbox of fast and low-cost methods to automatically +detect rings in conifer species, particularly for measuring diameter growth +rates and stem transverse area using entire cross-sections. + +
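For reference, the equivalent-radius measure mentioned above is simply the radius of a circle with the same area as the region enclosed by a detected ring; a tiny worked example follows (the pixel-to-millimetre conversion is an optional assumption added for illustration).

```python
import math

def equivalent_radius(ring_area_px, px_per_mm=None):
    # radius of a circle whose area equals the area enclosed by the detected ring
    r = math.sqrt(ring_area_px / math.pi)
    return r if px_per_mm is None else r / px_per_mm   # optionally convert to mm

print(equivalent_radius(31416))  # ~100.0 px for an enclosed area of ~pi * 100^2
```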
+
+ comment: Submitted to Dendrochronologia. arXiv admin note: text overlap with + arXiv:2305.10809 +
+
+
+
+
+ + ☆ Gasformer: A Transformer-based Architecture for Segmenting Methane + Emissions from Livestock in Optical Gas Imaging CVPR + + +
+ Methane emissions from livestock, particularly cattle, significantly +contribute to climate change. Effective methane emission mitigation strategies +are crucial as the global population and demand for livestock products +increase. We introduce Gasformer, a novel semantic segmentation architecture +for detecting low-flow rate methane emissions from livestock, and controlled +release experiments using optical gas imaging. We present two unique datasets +captured with a FLIR GF77 OGI camera. Gasformer leverages a Mix Vision +Transformer encoder and a Light-Ham decoder to generate multi-scale features +and refine segmentation maps. Gasformer outperforms other state-of-the-art +models on both datasets, demonstrating its effectiveness in detecting and +segmenting methane plumes in controlled and real-world scenarios. On the +livestock dataset, Gasformer achieves mIoU of 88.56%, surpassing other +state-of-the-art models. Materials are available at: +github.com/toqitahamid/Gasformer. + +
+
+ comment: 9 pages, 5 figures, this paper has been submitted and accepted for + publication at CVPRW 2024 +
+
+
+
+
+ + ☆ Dynamic Self-adaptive Multiscale Distillation from Pre-trained + Multimodal Large Model for Efficient Cross-modal Representation Learning + + +
+ In recent years, pre-trained multimodal large models have attracted +widespread attention due to their outstanding performance in various multimodal +applications. Nonetheless, the extensive computational resources and vast +datasets required for their training present significant hurdles for deployment +in environments with limited computational resources. To address this +challenge, we propose, for the first time, a novel dynamic self-adaptive multiscale +distillation from a pre-trained multimodal large model for efficient cross-modal +representation learning. Unlike existing distillation methods, our strategy +employs a multiscale perspective, enabling the extraction of structural knowledge +from the pre-trained multimodal large model and ensuring that the student model +inherits a comprehensive and nuanced understanding of the teacher's knowledge. +To optimize each distillation loss in a balanced and efficient manner, we propose +a dynamic self-adaptive distillation loss balancer, a novel component that +eliminates the need for manual loss weight adjustments and dynamically balances +each loss item during the distillation process. Our methodology streamlines +pre-trained multimodal large models using only their output features and original +image-level information, requiring minimal computational resources. This +efficient approach is suited for various applications and allows the deployment +of advanced multimodal technologies even in resource-limited settings. Extensive +experiments have demonstrated that our method maintains high performance while +significantly reducing model complexity and training costs. Moreover, our +distilled student model utilizes only image-level information to achieve +state-of-the-art performance on cross-modal retrieval tasks, surpassing previous +methods that relied on region-level information. + +
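One common way to balance several distillation losses without hand-tuned weights is to learn a log-variance per loss term (uncertainty-style weighting), sketched below. This is a stand-in illustration of a dynamic loss balancer; the paper's balancer may use a different rule.

```python
import torch
import torch.nn as nn

class DynamicLossBalancer(nn.Module):
    def __init__(self, num_losses):
        super().__init__()
        self.log_vars = nn.Parameter(torch.zeros(num_losses))  # one learnable weight per loss

    def forward(self, losses):
        # losses: list of scalar loss tensors; higher learned variance downweights a term,
        # while the additive log-variance penalizes inflating it indefinitely.
        total = 0.0
        for i, loss in enumerate(losses):
            total = total + torch.exp(-self.log_vars[i]) * loss + self.log_vars[i]
        return total

balancer = DynamicLossBalancer(num_losses=3)
scale_losses = [torch.rand(1, requires_grad=True).sum() for _ in range(3)]
total = balancer(scale_losses)
total.backward()
```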
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Semantic-Based Active Perception for Humanoid Visual Tasks with Foveal + Sensors + + +
+ The aim of this work is to establish how accurately a recent semantic-based +foveal active perception model is able to complete visual tasks that are +regularly performed by humans, namely, scene exploration and visual search. +This model exploits the ability of current object detectors to localize and +classify a large number of object classes and to update a semantic description +of a scene across multiple fixations. It has been used previously in scene +exploration tasks. In this paper, we revisit the model and extend its +application to visual search tasks. To illustrate the benefits of using +semantic information in scene exploration and visual search tasks, we compare +its performance against traditional saliency-based models. In the task of scene +exploration, the semantic-based method demonstrates superior performance +compared to the traditional saliency-based model in accurately representing the +semantic information present in the visual scene. In visual search experiments, +searching for instances of a target class in a visual field containing multiple +distractors shows superior performance compared to the saliency-driven model +and a random gaze selection algorithm. Our results demonstrate that semantic +information, from the top-down, influences visual exploration and search tasks +significantly, suggesting a potential area of research for integrating it with +traditional bottom-up cues. + +
+
+
+
+
+ + ☆ MobileNetV4 -- Universal Models for the Mobile Ecosystem + + +
+ We present the latest generation of MobileNets, known as MobileNetV4 (MNv4), +featuring universally efficient architecture designs for mobile devices. At its +core, we introduce the Universal Inverted Bottleneck (UIB) search block, a +unified and flexible structure that merges Inverted Bottleneck (IB), ConvNext, +Feed Forward Network (FFN), and a novel Extra Depthwise (ExtraDW) variant. +Alongside UIB, we present Mobile MQA, an attention block tailored for mobile +accelerators, delivering a significant 39% speedup. An optimized neural +architecture search (NAS) recipe is also introduced which improves MNv4 search +effectiveness. The integration of UIB, Mobile MQA and the refined NAS recipe +results in a new suite of MNv4 models that are mostly Pareto optimal across +mobile CPUs, DSPs, GPUs, as well as specialized accelerators like Apple Neural +Engine and Google Pixel EdgeTPU - a characteristic not found in any other +models tested. Finally, to further boost accuracy, we introduce a novel +distillation technique. Enhanced by this technique, our MNv4-Hybrid-Large model +delivers 87% ImageNet-1K accuracy, with a Pixel 8 EdgeTPU runtime of just +3.8ms. + +
+
+
+
+
+ + ☆ TV100: A TV Series Dataset that Pre-Trained CLIP Has Not Seen + + +
+ The era of pre-trained models has ushered in a wealth of new insights for the +machine learning community. Among the myriad of questions that arise, one of +paramount importance is: 'Do pre-trained models possess comprehensive +knowledge?' This paper seeks to address this crucial inquiry. In line with our +objective, we have made publicly available a novel dataset comprised of images +from TV series released post-2021. This dataset holds significant potential for +use in various research areas, including the evaluation of incremental +learning, novel class discovery, and long-tailed learning, among others. +Project page: https://tv-100.github.io/ + +
+
+ comment: Project page: https://tv-100.github.io/ +
+
+
+
+
+ + ♻ ☆ GROUNDHOG: Grounding Large Language Models to Holistic Segmentation CVPR 2024 + + +
+ Most multimodal large language models (MLLMs) learn language-to-object +grounding through causal language modeling where grounded objects are captured +by bounding boxes as sequences of location tokens. This paradigm lacks +pixel-level representations that are important for fine-grained visual +understanding and diagnosis. In this work, we introduce GROUNDHOG, an MLLM +developed by grounding Large Language Models to holistic segmentation. +GROUNDHOG incorporates a masked feature extractor and converts extracted +features into visual entity tokens for the MLLM backbone, which then connects +groundable phrases to unified grounding masks by retrieving and merging the +entity masks. To train GROUNDHOG, we carefully curated M3G2, a grounded visual +instruction tuning dataset with Multi-Modal Multi-Grained Grounding, by +harvesting a collection of segmentation-grounded datasets with rich +annotations. Our experimental results show that GROUNDHOG achieves superior +performance on various language grounding tasks without task-specific +fine-tuning, and significantly reduces object hallucination. GROUNDHOG also +demonstrates better grounding towards complex forms of visual input and +provides easy-to-understand diagnosis in failure cases. + +
+
+ comment: Accepted to CVPR 2024. Website: https://groundhog-mllm.github.io/ +
+
+
+
+
+ + ♻ ☆ Splatter Image: Ultra-Fast Single-View 3D Reconstruction CVPR 2024 + + +
+ We introduce the Splatter Image, an ultra-efficient approach for monocular 3D object +reconstruction. Splatter Image is based on Gaussian Splatting, which allows +fast and high-quality reconstruction of 3D scenes from multiple images. We +apply Gaussian Splatting to monocular reconstruction by learning a neural +network that, at test time, performs reconstruction in a feed-forward manner, +at 38 FPS. Our main innovation is the surprisingly straightforward design of +this network, which, using 2D operators, maps the input image to one 3D +Gaussian per pixel. The resulting set of Gaussians thus has the form of an image, +the Splatter Image. We further extend the method to take several images as input +via cross-view attention. Owing to the speed of the renderer (588 FPS), we use +a single GPU for training while generating entire images at each iteration to +optimize perceptual metrics like LPIPS. On several synthetic, real, +multi-category and large-scale benchmark datasets, we achieve better results in +terms of PSNR, LPIPS, and other metrics while training and evaluating much +faster than prior works. Code, models, demo and more results are available at +https://szymanowiczs.github.io/splatter-image. + +
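The "one 3D Gaussian per pixel" output can be pictured as a purely 2D head that predicts a parameter map, as in the sketch below (3D offset, 3 log-scales, a quaternion, opacity, and RGB per pixel). The channel layout and tiny backbone are illustrative assumptions, not the authors' network.

```python
import torch
import torch.nn as nn

class SplatterHead(nn.Module):
    def __init__(self, in_ch=3, feat=64, params_per_gaussian=14):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch, feat, 3, padding=1), nn.ReLU(),
            nn.Conv2d(feat, feat, 3, padding=1), nn.ReLU(),
            nn.Conv2d(feat, params_per_gaussian, 1),
        )

    def forward(self, img):                    # (B, 3, H, W)
        p = self.net(img)                      # (B, 14, H, W): an "image" of Gaussian params
        offset, log_scale, quat, rest = p.split([3, 3, 4, 4], dim=1)
        quat = nn.functional.normalize(quat, dim=1)     # valid unit-quaternion rotation
        opacity, color = rest.split([1, 3], dim=1)
        return offset, log_scale.exp(), quat, opacity.sigmoid(), color.sigmoid()

head = SplatterHead()
outs = head(torch.randn(1, 3, 128, 128))
print([o.shape for o in outs])   # one Gaussian parameter set per pixel
```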
+
+ comment: CVPR 2024. Project page: + https://szymanowiczs.github.io/splatter-image.html . Code: + https://github.com/szymanowiczs/splatter-image , Demo: + https://huggingface.co/spaces/szymanowiczs/splatter_image +
+
+
+
+
+ + ♻ ☆ Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the + AIIB23 challenge + + +
+ Airway-related quantitative imaging biomarkers are crucial for examination, +diagnosis, and prognosis in pulmonary diseases. However, the manual delineation +of airway trees remains prohibitively time-consuming. While significant efforts +have been made towards enhancing airway modelling, current public-available +datasets concentrate on lung diseases with moderate morphological variations. +The intricate honeycombing patterns present in the lung tissues of fibrotic +lung disease patients exacerbate the challenges, often leading to various +prediction errors. To address this issue, the 'Airway-Informed Quantitative CT +Imaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was +organized in conjunction with the official 2023 International Conference on +Medical Image Computing and Computer Assisted Intervention (MICCAI). The airway +structures were meticulously annotated by three experienced radiologists. +Competitors were encouraged to develop automatic airway segmentation models +with high robustness and generalization abilities, followed by exploring the +most correlated QIB of mortality prediction. A training set of 120 +high-resolution computerised tomography (HRCT) scans were publicly released +with expert annotations and mortality status. The online validation set +incorporated 52 HRCT scans from patients with fibrotic lung disease and the +offline test set included 140 cases from fibrosis and COVID-19 patients. The +results have shown that the capacity of extracting airway trees from patients +with fibrotic lung disease could be enhanced by introducing voxel-wise weighted +general union loss and continuity loss. In addition to the competitive image +biomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5, +p<0.0001) was revealed for survival prognostication compared with existing +clinical measurements, clinician assessment and AI-based biomarkers. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Pixel to Elevation: Learning to Predict Elevation Maps at Long Range + using Images for Autonomous Offroad Navigation + + +
+ Understanding terrain topology at long-range is crucial for the success of +off-road robotic missions, especially when navigating at high-speeds. LiDAR +sensors, which are currently heavily relied upon for geometric mapping, provide +sparse measurements when mapping at greater distances. To address this +challenge, we present a novel learning-based approach capable of predicting +terrain elevation maps at long-range using only onboard egocentric images in +real-time. Our proposed method is comprised of three main elements. First, a +transformer-based encoder is introduced that learns cross-view associations +between the egocentric views and prior bird-eye-view elevation map predictions. +Second, an orientation-aware positional encoding is proposed to incorporate the +3D vehicle pose information over complex unstructured terrain with multi-view +visual image features. Lastly, a history-augmented learn-able map embedding is +proposed to achieve better temporal consistency between elevation map +predictions to facilitate the downstream navigational tasks. We experimentally +validate the applicability of our proposed approach for autonomous offroad +robotic navigation in complex and unstructured terrain using real-world offroad +driving data. Furthermore, the method is qualitatively and quantitatively +compared against the current state-of-the-art methods. Extensive field +experiments demonstrate that our method surpasses baseline models in accurately +predicting terrain elevation while effectively capturing the overall terrain +topology at long-ranges. Finally, ablation studies are conducted to highlight +and understand the effect of key components of the proposed approach and +validate their suitability to improve offroad robotic navigation capabilities. + +
+
+ comment: 8 pages, 6 figures, Accepted in IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ A Survey and Benchmark of Automatic Surface Reconstruction from Point + Clouds + + +
+ We present a comprehensive survey and benchmark of both traditional and +learning-based methods for surface reconstruction from point clouds. This task +is particularly challenging for real-world acquisitions due to factors like +noise, outliers, non-uniform sampling, and missing data. Traditional approaches +often simplify the problem by imposing handcrafted priors on either the input +point clouds or the resulting surface, a process that can necessitate tedious +hyperparameter tuning. Conversely, deep learning models have the capability to +directly learn the properties of input point clouds and desired surfaces from +data. We study the influence of these handcrafted and learned priors on the +precision and robustness of surface reconstruction techniques. We evaluate +various time-tested and contemporary methods in a standardized manner. When +both trained and evaluated on point clouds with identical characteristics, the +learning-based models consistently produce superior surfaces compared to their +traditional counterparts, even in scenarios involving novel +shape categories. However, traditional methods demonstrate greater resilience +to the diverse array of point cloud anomalies commonly found in real-world 3D +acquisitions. For the benefit of the research community, we make our code and +datasets available, inviting further enhancements to learning-based surface +reconstruction. This can be accessed at +https://github.com/raphaelsulzer/dsr-benchmark . + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Ghost-dil-NetVLAD: A Lightweight Neural Network for Visual Place + Recognition + + +
+ Visual place recognition (VPR) is a challenging task with an imbalance +between enormous computational cost and high recognition performance. Thanks to +the practical feature extraction ability of lightweight convolutional neural +networks (CNNs) and the trainability of the vector of locally aggregated +descriptors (VLAD) layer, we propose a lightweight weakly supervised end-to-end +neural network consisting of a front-end perception model called GhostCNN +and a learnable VLAD layer as a back-end. GhostCNN is based on Ghost modules that +are lightweight CNN-based architectures. They can generate redundant feature +maps using linear operations instead of the traditional convolution process, +making a good trade-off between computational resources and recognition accuracy. +To enhance our proposed lightweight model further, we add dilated convolutions +to the Ghost module to get features containing more spatial semantic +information, improving accuracy. Finally, extensive experiments conducted on a +commonly used public benchmark and our private dataset validate that the +proposed neural network reduces the FLOPs and parameters of VGG16-NetVLAD by +99.04% and 80.16%, respectively. Besides, both models achieve similar accuracy. + +
+
+
+
+
+ + ♻ ☆ VehicleGAN: Pair-flexible Pose Guided Image Synthesis for Vehicle + Re-identification + + +
+ Vehicle Re-identification (Re-ID) has been broadly studied in the last +decade; however, the different camera view angle leading to confused +discrimination in the feature subspace for the vehicles of various poses, is +still challenging for the Vehicle Re-ID models in the real world. To promote +the Vehicle Re-ID models, this paper proposes to synthesize a large number of +vehicle images in the target pose, whose idea is to project the vehicles of +diverse poses into the unified target pose so as to enhance feature +discrimination. Considering that the paired data of the same vehicles in +different traffic surveillance cameras might be not available in the real +world, we propose the first Pair-flexible Pose Guided Image Synthesis method +for Vehicle Re-ID, named as VehicleGAN in this paper, which works for both +supervised and unsupervised settings without the knowledge of geometric 3D +models. Because of the feature distribution difference between real and +synthetic data, simply training a traditional metric learning based Re-ID model +with data-level fusion (i.e., data augmentation) is not satisfactory, therefore +we propose a new Joint Metric Learning (JML) via effective feature-level fusion +from both real and synthetic data. Intensive experimental results on the public +VeRi-776 and VehicleID datasets prove the accuracy and effectiveness of our +proposed VehicleGAN and JML. + +
+
+
+
+
+ + ♻ ☆ SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM CVPR 2024 + + +
+ Dense simultaneous localization and mapping (SLAM) is crucial for robotics +and augmented reality applications. However, current methods are often hampered +by the non-volumetric or implicit way they represent a scene. This work +introduces SplaTAM, an approach that, for the first time, leverages explicit +volumetric representations, i.e., 3D Gaussians, to enable high-fidelity +reconstruction from a single unposed RGB-D camera, surpassing the capabilities +of existing methods. SplaTAM employs a simple online tracking and mapping +system tailored to the underlying Gaussian representation. It utilizes a +silhouette mask to elegantly capture the presence of scene density. This +combination enables several benefits over prior representations, including fast +rendering and dense optimization, quickly determining if areas have been +previously mapped, and structured map expansion by adding more Gaussians. +Extensive experiments show that SplaTAM achieves up to 2x superior performance +in camera pose estimation, map construction, and novel-view synthesis over +existing methods, paving the way for more immersive high-fidelity SLAM +applications. + +
+
+ comment: CVPR 2024. Website: https://spla-tam.github.io/ +
+
+
+
+
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) +have taken the world by storm with impressive abilities in complex reasoning +and linguistic comprehension. While there is a plethora of works related to +Vietnamese Large Language Models, the lack of high-quality resources in +multimodality limits the progress of Vietnamese MLLMs. In this paper, we +pioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese +MLLM, and we also introduce LaVy-Bench, a benchmark designed for evaluating +MLLMs' understanding of Vietnamese visual language tasks. Our project is +public at https://github.com/baochi0212/LaVy + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ LoopAnimate: Loopable Salient Object Animation + + +
+ Research on diffusion model-based video generation has advanced rapidly. +However, limitations in object fidelity and generation length hinder its +practical applications. Additionally, specific domains like animated wallpapers +require seamless looping, where the first and last frames of the video match +seamlessly. To address these challenges, this paper proposes LoopAnimate, a +novel method for generating videos with consistent start and end frames. To +enhance object fidelity, we introduce a framework that decouples multi-level +image appearance and textual semantic information. Building upon an +image-to-image diffusion model, our approach incorporates both pixel-level and +feature-level information from the input image, injecting image appearance and +textual semantic embeddings at different positions of the diffusion model. +Existing UNet-based video generation models require the entire video to be input +during training to encode temporal and positional information at once. However, +due to limitations in GPU memory, the number of frames is typically restricted +to 16. To address this, this paper proposes a three-stage training strategy +with progressively increasing frame numbers and reducing fine-tuning modules. +Additionally, we introduce the Temporal Enhanced Motion Module (TEMM) to extend +the capacity for encoding temporal and positional information up to 36 frames. +The proposed LoopAnimate thus, for the first time, extends the single-pass +generation length of UNet-based video generation models to 35 frames while +maintaining high-quality video generation. Experiments demonstrate that +LoopAnimate achieves state-of-the-art performance in both objective metrics, +such as fidelity and temporal consistency, and subjective evaluation results. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), still remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only face of human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., entire body parts of human but also with a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Low-Rank and Local Low-Rank Matrix Approximation + in Big Data Medical Imaging + + +
+ The large volume and complexity of medical imaging datasets are bottlenecks +for storage, transmission, and processing. To tackle these challenges, the +application of low-rank matrix approximation (LRMA) and its derivative, local +LRMA (LLRMA) has demonstrated potential. + A detailed analysis of the literature identifies LRMA and LLRMA methods +applied to various imaging modalities, and the challenges and limitations +associated with existing LRMA and LLRMA methods are addressed. + We note a significant shift towards a preference for LLRMA in the medical +imaging field since 2015, demonstrating its potential and effectiveness in +capturing complex structures in medical data compared to LRMA. Acknowledging +the limitations of shallow similarity methods used with LLRMA, we suggest +advanced semantic image segmentation for similarity measure, explaining in +detail how it can measure similar patches and their feasibility. + We note that LRMA and LLRMA are mainly applied to unstructured medical data, +and we propose extending their application to different medical data types, +including structured and semi-structured. This paper also discusses how LRMA +and LLRMA can be applied to regular data with missing entries and the impact of +inaccuracies in predicting missing values and their effects. We discuss the +impact of patch size and propose the use of random search (RS) to determine the +optimal patch size. To enhance feasibility, a hybrid approach using Bayesian +optimization and RS is proposed, which could improve the application of LRMA +and LLRMA in medical imaging. + +
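For a quick reference to the core (L)LRMA operation discussed above, the snippet below computes a rank-k approximation of a matrix, or of a local image patch in the "local" variant, via truncated SVD.

```python
import numpy as np

def low_rank_approx(X, k):
    # best rank-k approximation of X in the Frobenius norm (Eckart-Young)
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    return (U[:, :k] * s[:k]) @ Vt[:k]

patch = np.random.rand(64, 64)          # stand-in for a local image patch
approx = low_rank_approx(patch, k=8)
print(np.linalg.norm(patch - approx) / np.linalg.norm(patch))  # relative error
```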
+
+
+
+
+ + ♻ ☆ Slide-SAM: Medical SAM Meets Sliding Window + + +
+ The Segment Anything Model (SAM) has achieved notable success in +two-dimensional segmentation of natural images. However, the substantial +gap between medical and natural images hinders its direct application to +medical image segmentation tasks. Particularly in 3D medical images, SAM +struggles to learn contextual relationships between slices, limiting its +practical applicability. Moreover, applying 2D SAM to 3D images requires +prompting the entire volume, which is time- and label-consuming. To address +these problems, we propose Slide-SAM, which treats a stack of three adjacent +slices as a prediction window. It first takes three slices from a 3D volume +and point- or bounding box prompts on the central slice as inputs to predict +segmentation masks for all three slices. Subsequently, the masks of the top and +bottom slices are used to generate new prompts for adjacent slices. +Finally, step-wise prediction can be achieved by sliding the prediction window +forward or backward through the entire volume. Our model is trained on multiple +public and private medical datasets and demonstrates its effectiveness through +extensive 3D segmentation experiments, with the help of minimal prompts. Code +is available at \url{https://github.com/Curli-quan/Slide-SAM}.
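The sliding-window propagation can be summarized by the control-flow sketch below, where masks from the edge slices of each three-slice window become the prompts for the next window. `predict_three_slices(window, prompt)` (one mask per slice) and `mask_to_prompt(mask)` are hypothetical stand-ins for the model's actual interfaces, not its real API.

```python
def propagate(volume, center_idx, center_prompt, predict_three_slices, mask_to_prompt):
    # volume: sequence of 2D slices; center_idx-1 .. center_idx+1 must be valid indices
    masks = {}
    lo, mid, hi = predict_three_slices(volume[center_idx - 1:center_idx + 2], center_prompt)
    masks[center_idx - 1], masks[center_idx], masks[center_idx + 1] = lo, mid, hi

    # slide forward: the top mask of each window prompts the window one slice later
    c = center_idx + 1
    while c + 1 < len(volume):
        prompt = mask_to_prompt(masks[c])
        _, masks[c], masks[c + 1] = predict_three_slices(volume[c - 1:c + 2], prompt)
        c += 1

    # slide backward symmetrically using the bottom mask
    c = center_idx - 1
    while c - 1 >= 0:
        prompt = mask_to_prompt(masks[c])
        masks[c - 1], masks[c], _ = predict_three_slices(volume[c - 1:c + 2], prompt)
        c -= 1
    return masks
```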
+
+
+
+
+ + ♻ ☆ E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors + to New Generators Using Limited Data CVPR + + +
+ As generative AI progresses rapidly, new synthetic image generators continue +to emerge at a swift pace. Traditional detection methods face two main +challenges in adapting to these generators: the forensic traces of synthetic +images from new techniques can vastly differ from those learned during +training, and access to data for these new generators is often limited. To +address these issues, we introduce the Ensemble of Expert Embedders (E3), a +novel continual learning framework for updating synthetic image detectors. E3 +enables the accurate detection of images from newly emerged generators using +minimal training data. Our approach does this by first employing transfer +learning to develop a suite of expert embedders, each specializing in the +forensic traces of a specific generator. Then, all embeddings are jointly +analyzed by an Expert Knowledge Fusion Network to produce accurate and reliable +detection decisions. Our experiments demonstrate that E3 outperforms existing +continual learning methods, including those developed specifically for +synthetic image detection. + +
+
+ comment: 11 pages, 4 figures, To be published in CVPRWMF24 +
+
+
+
+
+ + ♻ ☆ DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning + + +
+ Text-to-image diffusion models have been shown to suffer from sample-level +memorization, possibly reproducing near-perfect replica of images that they are +trained on, which may be undesirable. To remedy this issue, we develop the +first differentially private (DP) retrieval-augmented generation algorithm that +is capable of generating high-quality image samples while providing provable +privacy guarantees. Specifically, we assume access to a text-to-image diffusion +model trained on a small amount of public data, and design a DP retrieval +mechanism to augment the text prompt with samples retrieved from a private +retrieval dataset. Our \emph{differentially private retrieval-augmented +diffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to +adapt to another domain, and can use state-of-the-art generative models to +generate high-quality image samples while satisfying rigorous DP guarantees. +For instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a +privacy budget of $\epsilon=10$, while providing a $3.5$ point improvement in +FID compared to public-only retrieval for up to $10,000$ queries. + +
+
+
+
+
+ + ♻ ☆ LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via + Eulerian Motion Field + + +
+ Cinemagraph is a unique form of visual media that combines elements of still photography and subtle motion to create a captivating experience. However, the majority of videos generated by recent works lack depth information and are confined to the constraints of 2D image space. In this paper, inspired by significant progress in the field of novel view synthesis (NVS) achieved by 3D Gaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraphs from 2D image space to 3D space using 3D Gaussian modeling. To achieve this, we first employ the 3D-GS method to reconstruct 3D Gaussian point clouds from multi-view images of static scenes, incorporating shape regularization terms to prevent blurring or artifacts caused by object deformation. We then adopt an autoencoder tailored to 3D Gaussians to project them into feature space. To maintain the local continuity of the scene, we devise SuperGaussian for clustering based on the acquired features. By calculating the similarity between clusters and employing a two-stage estimation method, we derive an Eulerian motion field to describe velocities across the entire scene. The 3D Gaussian points then move within the estimated Eulerian motion field. Through bidirectional animation techniques, we ultimately generate a 3D cinemagraph that exhibits natural and seamlessly loopable dynamics. Experimental results validate the effectiveness of our approach, demonstrating high-quality and visually appealing scene generation. The project is available at https://pokerlishao.github.io/LoopGaussian/.
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Using Multi-scale SwinTransformer-HTC with Data augmentation in CoNIC + Challenge + + +
+ Colorectal cancer is one of the most common cancers worldwide, so early pathological examination is very important. However, it is time-consuming and labor-intensive to identify the number and type of cells on H&E images in clinical practice. Therefore, the CoNIC Challenge 2022 proposed the task of automatic segmentation, classification, and counting of the cellular composition of H&E images from pathological sections. We proposed a multi-scale Swin Transformer with HTC for this challenge, and also applied known normalization methods to generate more augmented data. Finally, our results showed that the multi-scale design played a crucial role in identifying features at different scales, and that the augmentation improved the model's recognition.
+
+ comment: Errors have been identified in the analysis +
+
+
+
+
+ + ♻ ☆ 2S-UDF: A Novel Two-stage UDF Learning Method for Robust Non-watertight + Model Reconstruction from Multi-view Images CVPR 2024 + + +
+ Recently, building on the foundation of neural radiance field, various +techniques have emerged to learn unsigned distance fields (UDF) to reconstruct +3D non-watertight models from multi-view images. Yet, a central challenge in +UDF-based volume rendering is formulating a proper way to convert unsigned +distance values into volume density, ensuring that the resulting weight +function remains unbiased and sensitive to occlusions. Falling short on these +requirements often results in incorrect topology or large reconstruction errors +in resulting models. This paper addresses this challenge by presenting a novel +two-stage algorithm, 2S-UDF, for learning a high-quality UDF from multi-view +images. Initially, the method applies an easily trainable density function +that, while slightly biased and transparent, aids in coarse reconstruction. The +subsequent stage then refines the geometry and appearance of the object to +achieve a high-quality reconstruction by directly adjusting the weight function +used in volume rendering to ensure that it is unbiased and occlusion-aware. +Decoupling density and weight in two stages makes our training stable and +robust, distinguishing our technique from existing UDF learning approaches. +Evaluations on the DeepFashion3D, DTU, and BlendedMVS datasets validate the +robustness and effectiveness of our proposed approach. In both quantitative +metrics and visual quality, the results indicate our superior performance over +other UDF learning techniques in reconstructing 3D non-watertight models from +multi-view images. Our code is available at +https://bitbucket.org/jkdeng/2sudf/. + +
+
+ comment: accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control for Vision Models CVPR 2024 + + +
+ Standardized lossy video coding is at the core of almost all real-world video +processing pipelines. Rate control is used to enable standard codecs to adapt +to different network bandwidth conditions or storage constraints. However, +standard video codecs (e.g., H.264) and their rate control modules aim to +minimize video distortion w.r.t. human quality assessment. We demonstrate +empirically that standard-coded videos vastly deteriorate the performance of +deep vision models. To overcome the deterioration of vision performance, this +paper presents the first end-to-end learnable deep video codec control that +considers both bandwidth constraints and downstream deep vision performance, +while adhering to existing standardization. We demonstrate that our approach +better preserves downstream deep vision performance than traditional standard +video coding. + +
+
+ comment: Accepted at CVPR 2024 Workshop on AI for Streaming (AIS) +
+
+
+
+
+ + ♻ ☆ Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic + Distribution Alignment + + +
+ Conventional unsupervised anomaly detection (UAD) methods build separate +models for each object category. Recent studies have proposed to train a +unified model for multiple classes, namely model-unified UAD. However, such +methods still implement the unified model separately on each class during +inference with respective anomaly decision thresholds, which hinders their +application when the image categories are entirely unavailable. In this work, +we present a simple yet powerful method to address multi-class anomaly +detection without any class information, namely \textit{absolute-unified} UAD. +We target the crux of prior works in this challenging setting: different +objects have mismatched anomaly score distributions. We propose Class-Agnostic +Distribution Alignment (CADA) to align the mismatched score distribution of +each implicit class without knowing class information, which enables unified +anomaly detection for all classes and samples. The essence of CADA is to +predict each class's score distribution of normal samples given any image, +normal or anomalous, of this class. As a general component, CADA can activate +the potential of nearly all UAD methods under absolute-unified setting. Our +approach is extensively evaluated under the proposed setting on two popular UAD +benchmark datasets, MVTec AD and VisA, where we exceed previous +state-of-the-art by a large margin. + +
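+
+ One way to picture the alignment idea: a small head predicts, from image features alone, the normal-score statistics of the image's (unknown) class and standardizes raw anomaly scores with them, so one threshold serves all classes. The PyTorch sketch below is an illustrative stand-in, not the paper's CADA architecture:
+
+import torch
+import torch.nn as nn
+
+class ScoreDistributionAligner(nn.Module):
+    """Predict per-image (mean, log-std) of the normal-score distribution and
+    standardize raw anomaly scores; the MLP head and sizes are assumptions."""
+
+    def __init__(self, feat_dim=512):
+        super().__init__()
+        self.head = nn.Sequential(nn.Linear(feat_dim, 128), nn.ReLU(), nn.Linear(128, 2))
+
+    def forward(self, feats, raw_scores):
+        stats = self.head(feats)                    # (B, 2): predicted mean, log-std
+        mu, log_sigma = stats[:, 0], stats[:, 1]
+        return (raw_scores - mu) / log_sigma.exp()  # class-agnostic, aligned scores
+
+# Usage: aligned scores from different object classes become comparable,
+# so a single decision threshold can be applied to all of them.
+aligner = ScoreDistributionAligner(feat_dim=512)
+feats = torch.randn(8, 512)   # image features from any backbone
+raw = torch.rand(8)           # raw anomaly scores from any UAD method
+aligned = aligner(feats, raw)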
+
+
+
+
+ + ♻ ☆ Pixel-Wise Contrastive Distillation ICCV 2023 + + +
+ We present a simple but effective pixel-level self-supervised distillation +framework friendly to dense prediction tasks. Our method, called Pixel-Wise +Contrastive Distillation (PCD), distills knowledge by attracting the +corresponding pixels from student's and teacher's output feature maps. PCD +includes a novel design called SpatialAdaptor which ``reshapes'' a part of the +teacher network while preserving the distribution of its output features. Our +ablation experiments suggest that this reshaping behavior enables more +informative pixel-to-pixel distillation. Moreover, we utilize a plug-in +multi-head self-attention module that explicitly relates the pixels of +student's feature maps to enhance the effective receptive field, leading to a +more competitive student. PCD \textbf{outperforms} previous self-supervised +distillation methods on various dense prediction tasks. A backbone of +\mbox{ResNet-18-FPN} distilled by PCD achieves $37.4$ AP$^\text{bbox}$ and +$34.0$ AP$^\text{mask}$ on COCO dataset using the detector of \mbox{Mask +R-CNN}. We hope our study will inspire future research on how to pre-train a +small model friendly to dense prediction tasks in a self-supervised fashion. + +
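+
+ The core pixel-to-pixel attraction can be written as a dense InfoNCE-style loss in which the teacher pixel at the same location is the positive and all other locations are negatives. The PyTorch sketch below illustrates that idea only; PCD's exact positive/negative construction and temperature may differ:
+
+import torch
+import torch.nn.functional as F
+
+def pixelwise_contrastive_loss(student_feat, teacher_feat, temperature=0.2):
+    """Attract each student pixel to the teacher pixel at the same location
+    and repel it from teacher pixels at other locations (per image)."""
+    b, c, h, w = student_feat.shape
+    s = F.normalize(student_feat.flatten(2).transpose(1, 2), dim=-1)  # (B, HW, C)
+    t = F.normalize(teacher_feat.flatten(2).transpose(1, 2), dim=-1)  # (B, HW, C)
+    logits = torch.bmm(s, t.transpose(1, 2)) / temperature            # (B, HW, HW)
+    target = torch.arange(h * w, device=s.device).expand(b, -1)       # positives on the diagonal
+    return F.cross_entropy(logits.reshape(b * h * w, h * w), target.reshape(-1))
+
+# Example with dummy feature maps of matching resolution.
+loss = pixelwise_contrastive_loss(torch.randn(2, 64, 16, 16), torch.randn(2, 64, 16, 16))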
+
+ comment: ICCV 2023 camera-ready +
+
+
+
+
+ + ♻ ☆ About latent roles in forecasting players in team sports + + +
+ Forecasting players in sports has grown in popularity due to the potential for a tactical advantage and the applicability of such research to multi-agent interaction systems. Team sports contain a significant social component that influences interactions between teammates and opponents. However, this component has yet to be fully exploited. In this work, we hypothesize that each participant has a specific function in each action and that role-based interaction is critical for predicting players' future moves. We create RolFor, a novel end-to-end model for Role-based Forecasting. RolFor uses a new module we developed called Ordering Neural Networks (OrderNN) to permute the order of the players such that each player is assigned to a latent role. The latent role is then modeled with a RoleGCN. Thanks to its graph representation, it provides a fully learnable adjacency matrix that captures the relationships between roles and is subsequently used to forecast the players' future trajectories. Extensive experiments on a challenging NBA basketball dataset back up the importance of roles and justify our goal of modeling them using optimizable models. When an oracle provides roles, the proposed RolFor compares favorably to the current state-of-the-art (it ranks first in terms of ADE and second in terms of FDE errors). However, training the end-to-end RolFor incurs the issues of differentiability of permutation methods, which we experimentally review. Finally, this work restates differentiable ranking as a difficult open problem and highlights its great potential in conjunction with graph-based interaction models. The project is available at: https://www.pinlab.org/aboutlatentroles
+
+
+
+
+ + ♻ ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
+ The recent advent of diffusion models has led to significant progress in solving inverse problems, leveraging these models as effective generative priors. Nonetheless, there remain challenges related to the ill-posed nature of such problems, often due to inherent ambiguities in measurements or intrinsic system symmetries. To address this, drawing inspiration from the human ability to resolve visual ambiguities through perceptual biases, here we introduce a novel latent diffusion inverse solver with regularization by texts (TReg). Specifically, TReg applies a textual description of the preconceived solution during reverse diffusion sampling, and this description is dynamically reinforced through null-text optimization for adaptive negation. Our comprehensive experimental results demonstrate that TReg successfully mitigates ambiguity in inverse problems, enhancing their effectiveness and accuracy.
+
+
+
+
+ + ♻ ☆ GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-time Human Novel View Synthesis CVPR 2024 + + +
+ We present a new approach, termed GPS-Gaussian, for synthesizing novel views of a character in a real-time manner. The proposed method enables 2K-resolution rendering under a sparse-view camera setting. Unlike the original Gaussian Splatting or neural implicit rendering methods that necessitate per-subject optimizations, we introduce Gaussian parameter maps defined on the source views and directly regress Gaussian Splatting properties for instant novel view synthesis without any fine-tuning or optimization. To this end, we train our Gaussian parameter regression module on a large amount of human scan data, jointly with a depth estimation module to lift 2D parameter maps to 3D space. The proposed framework is fully differentiable, and experiments on several datasets demonstrate that our method outperforms state-of-the-art methods while achieving a much higher rendering speed.
+
+ comment: Accepted by CVPR 2024 (Highlight). Project page: + https://shunyuanzheng.github.io/GPS-Gaussian +
+
+
+
+
+ + ♻ ☆ Leveraging Image Matching Toward End-to-End Relative Camera Pose + Regression + + +
+ This paper proposes a generalizable, end-to-end deep learning-based method for relative pose regression between two images. Given two images of the same scene captured from different viewpoints, our method predicts the relative rotation and translation (including direction and scale) between the two respective cameras. Inspired by the classical pipeline, our method leverages Image Matching (IM) as a pre-trained task for relative pose regression. Specifically, we use LoFTR, an architecture that utilizes an attention-based network pre-trained on ScanNet, to extract semi-dense feature maps, which are then warped and fed into a pose regression network. Notably, we use a loss function that utilizes separate terms to account for the translation direction and scale. We believe such a separation is important because the translation direction is determined by point correspondences while the scale is inferred from a prior on shape sizes. Our ablations further support this choice. We evaluate our method on several datasets and show that it outperforms previous end-to-end methods. The method also generalizes well to unseen datasets.
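+
+ The separation of translation direction and scale argued for above can be expressed as a loss with independent terms, roughly as in the hedged PyTorch sketch below (the rotation term and the weights are placeholders, not the paper's exact formulation; rotations are 3x3 matrices here for simplicity):
+
+import torch
+import torch.nn.functional as F
+
+def relative_pose_loss(pred_t, gt_t, pred_R, gt_R, w_dir=1.0, w_scale=1.0, w_rot=1.0):
+    """Loss with separate translation-direction and scale terms."""
+    dir_loss = 1.0 - F.cosine_similarity(pred_t, gt_t, dim=-1).mean()  # direction only
+    scale_loss = F.l1_loss(pred_t.norm(dim=-1), gt_t.norm(dim=-1))     # magnitude only
+    rot_loss = F.mse_loss(pred_R, gt_R)                                # simple rotation term
+    return w_dir * dir_loss + w_scale * scale_loss + w_rot * rot_loss
+
+loss = relative_pose_loss(torch.randn(4, 3), torch.randn(4, 3),
+                          torch.eye(3).expand(4, 3, 3), torch.eye(3).expand(4, 3, 3))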
+
+ comment: Project webpage: https://fadikhatib.github.io/GRelPose +
+
+
+
+
+ + ♻ ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving computational simulations, such as Earth system models. Data assimilation is crucial for achieving this goal because it provides a systematic framework to calibrate model outputs with observations, which can include remote sensing imagery and ground station measurements, with uncertainty quantification. Conventional methods, including Kalman filters and variational approaches, inherently rely on simplifying linear and Gaussian assumptions, and can be computationally expensive. Nevertheless, with the rapid adoption of data-driven methods in many areas of computational sciences, we see the potential of emulating traditional data assimilation with deep learning, especially generative models. In particular, the diffusion-based probabilistic framework has large overlaps with data assimilation principles: both allow for conditional generation of samples within a Bayesian inverse framework. These models have shown remarkable success in text-conditioned image generation and image-controlled video synthesis. Likewise, one can frame data assimilation as observation-conditioned state calibration. In this work, we propose SLAMS: Score-based Latent Assimilation in Multimodal Setting. Specifically, we assimilate in-situ weather station data and ex-situ satellite imagery to calibrate vertical temperature profiles globally. Through extensive ablation, we demonstrate that SLAMS is robust even in low-resolution, noisy, and sparse data settings. To our knowledge, our work is the first to apply a deep generative framework to multimodal data assimilation using real-world datasets; an important step for building robust computational simulators, including the next-generation Earth system models. Our code is available at: https://github.com/yongquan-qu/SLAMS
+
+ comment: CVPR2024 EarthVision +
+
+
+
+
+ + ♻ ☆ Open-Pose 3D Zero-Shot Learning: Benchmark and Challenges + + +
+ With the explosive growth of 3D data, the urgency of utilizing zero-shot learning to facilitate data labeling becomes evident. Recently, methods transferring language or language-image pre-training models like Contrastive Language-Image Pre-training (CLIP) to 3D vision have made significant progress in the 3D zero-shot classification task. These methods primarily focus on 3D object classification with an aligned pose; such a setting is, however, rather restrictive, as it overlooks the recognition of 3D objects with open poses typically encountered in real-world scenarios, such as an overturned chair or a lying teddy bear. To this end, we propose a more realistic and challenging scenario named open-pose 3D zero-shot classification, focusing on the recognition of 3D objects regardless of their orientation. First, we revisit the current research on 3D zero-shot classification and propose two benchmark datasets specifically designed for the open-pose setting. We empirically validate many of the most popular methods on the proposed open-pose benchmark. Our investigations reveal that most current 3D zero-shot classification models suffer from poor performance, indicating substantial room for exploration in this new direction. Furthermore, we study a concise pipeline with an iterative angle refinement mechanism that automatically optimizes one ideal angle to classify these open-pose 3D objects. In particular, to make the validation more compelling and not just limited to existing CLIP-based methods, we also pioneer the exploration of knowledge transfer based on Diffusion models. While the proposed solutions can serve as a new benchmark for open-pose 3D zero-shot classification, we discuss the complexities and challenges of this scenario that remain for further research development. The code is available publicly at https://github.com/weiguangzhao/Diff-OP3D.
+
+
+
+
+ + ♻ ☆ PartDistill: 3D Shape Part Segmentation by Vision-Language Model + Distillation CVPR 2024 + + +
+ This paper proposes a cross-modal distillation framework, PartDistill, which +transfers 2D knowledge from vision-language models (VLMs) to facilitate 3D +shape part segmentation. PartDistill addresses three major challenges in this +task: the lack of 3D segmentation in invisible or undetected regions in the 2D +projections, inconsistent 2D predictions by VLMs, and the lack of knowledge +accumulation across different 3D shapes. PartDistill consists of a teacher +network that uses a VLM to make 2D predictions and a student network that +learns from the 2D predictions while extracting geometrical features from +multiple 3D shapes to carry out 3D part segmentation. A bi-directional +distillation, including forward and backward distillations, is carried out +within the framework, where the former forward distills the 2D predictions to +the student network, and the latter improves the quality of the 2D predictions, +which subsequently enhances the final 3D segmentation. Moreover, PartDistill +can exploit generative models that facilitate effortless 3D shape creation for +generating knowledge sources to be distilled. Through extensive experiments, +PartDistill boosts the existing methods with substantial margins on widely used +ShapeNetPart and PartNetE datasets, by more than 15% and 12% higher mIoU +scores, respectively. The code for this work is available at +https://github.com/ardianumam/PartDistill. + +
+
+ comment: CVPR 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and treatment within the medical domain. Traditional convolutional neural networks (CNNs) and Transformer models have made significant advancements in this realm, but they still encounter challenges because of limited receptive fields or high computational complexity. Recently, State Space Models (SSMs), particularly Mamba and its variants, have demonstrated notable performance in the field of vision. However, their feature extraction methods may not be sufficiently effective and retain some redundant structures, leaving room for parameter reduction. Motivated by previous spatial and channel attention methods, we propose Triplet Mamba-UNet. The method leverages residual VSS Blocks to extract intensive contextual features, while a Triplet SSM is employed to fuse features across spatial and channel dimensions. We conducted experiments on the ISIC17, ISIC18, CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, demonstrating the superior segmentation performance of our proposed TM-UNet. Additionally, compared to the previous VM-UNet, our model achieves a one-third reduction in parameters.
+
+
+
+
+ + ♻ ☆ GPT-4V-AD: Exploring Grounding Potential of VQA-oriented GPT-4V for + Zero-shot Anomaly Detection + + +
+ The Large Multimodal Model (LMM) GPT-4V(ision) endows GPT-4 with visual grounding capabilities, making it possible to handle certain tasks through the Visual Question Answering (VQA) paradigm. This paper explores the potential of VQA-oriented GPT-4V in the recently popular visual Anomaly Detection (AD) task and is the first to conduct qualitative and quantitative evaluations on the popular MVTec AD and VisA datasets. Considering that this task requires both image- and pixel-level evaluations, the proposed GPT-4V-AD framework contains three components: \textbf{\textit{1)}} Granular Region Division, \textbf{\textit{2)}} Prompt Designing, and \textbf{\textit{3)}} Text2Segmentation for easy quantitative evaluation, and we have made several different attempts for comparative analysis. The results show that GPT-4V can achieve certain results in the zero-shot AD task through a VQA paradigm, such as achieving image-level 77.1/88.0 and pixel-level 68.0/76.6 AU-ROCs on the MVTec AD and VisA datasets, respectively. However, its performance still has a certain gap compared to state-of-the-art zero-shot methods, e.g., WinCLIP and CLIP-AD, and further research is needed. This study provides a baseline reference for the research of VQA-oriented LMMs in the zero-shot AD task, and we also post several possible future works. Code is available at \url{https://github.com/zhangzjn/GPT-4V-AD}.
+
+
+
+
+ + ♻ ☆ RemoteCLIP: A Vision Language Foundation Model for Remote Sensing + + +
+ General-purpose foundation models have led to recent breakthroughs in +artificial intelligence. In remote sensing, self-supervised learning (SSL) and +Masked Image Modeling (MIM) have been adopted to build foundation models. +However, these models primarily learn low-level features and require annotated +data for fine-tuning. Moreover, they are inapplicable for retrieval and +zero-shot applications due to the lack of language understanding. To address +these limitations, we propose RemoteCLIP, the first vision-language foundation +model for remote sensing that aims to learn robust visual features with rich +semantics and aligned text embeddings for seamless downstream application. To +address the scarcity of pre-training data, we leverage data scaling which +converts heterogeneous annotations into a unified image-caption data format +based on Box-to-Caption (B2C) and Mask-to-Box (M2B) conversion. By further +incorporating UAV imagery, we produce a 12 $\times$ larger pretraining dataset +than the combination of all available datasets. RemoteCLIP can be applied to a +variety of downstream tasks, including zero-shot image classification, linear +probing, $\textit{k}$-NN classification, few-shot classification, image-text +retrieval, and object counting in remote sensing images. Evaluation on 16 +datasets, including a newly introduced RemoteCount benchmark to test the object +counting ability, shows that RemoteCLIP consistently outperforms baseline +foundation models across different model scales. Impressively, RemoteCLIP beats +the state-of-the-art method by 9.14% mean recall on the RSITMD dataset and +8.92% on the RSICD dataset. For zero-shot classification, our RemoteCLIP +outperforms the CLIP baseline by up to 6.39% average accuracy on 12 downstream +datasets. Project website: https://github.com/ChenDelong1999/RemoteCLIP + +
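+
+ The Box-to-Caption (B2C) conversion can be pictured as turning a detection sample's object annotations into a sentence usable for image-text pretraining; the template below is an illustrative assumption, not RemoteCLIP's actual caption set:
+
+from collections import Counter
+
+def boxes_to_caption(class_names):
+    """Turn the class labels of a detection sample's boxes into a caption."""
+    tally = Counter(class_names)
+    parts = [f"{n} {cls}{'s' if n > 1 else ''}" for cls, n in sorted(tally.items())]
+    return "A remote sensing image containing " + ", ".join(parts) + "."
+
+print(boxes_to_caption(["airplane", "airplane", "vehicle"]))
+# -> A remote sensing image containing 2 airplanes, 1 vehicle.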
+
+ comment: Accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS) +
+
+
+
+
+ + ♻ ☆ Face-voice Association in Multilingual Environments (FAME) Challenge + 2024 Evaluation Plan + + +
+ The advancements of technology have led to the use of multimodal systems in various real-world applications. Among them, audio-visual systems are one of the most widely used multimodal systems. In recent years, associating the face and voice of a person has gained attention due to the unique correlation between them. The Face-voice Association in Multilingual Environments (FAME) Challenge 2024 focuses on exploring face-voice association under the unique condition of a multilingual scenario. This condition is inspired by the fact that half of the world's population is bilingual and people most often communicate in multilingual scenarios. The challenge uses a dataset, namely Multilingual Audio-Visual (MAV-Celeb), for exploring face-voice association in multilingual environments. This report provides the details of the challenge, dataset, baselines and task details for the FAME Challenge.
+
+ comment: ACM Multimedia Conference - Grand Challenge +
+
+
+
+
+ + ♻ ☆ 3D Human Scan With A Moving Event Camera + + +
+ Capturing a 3D human body is one of the important tasks in computer vision, with a wide range of applications such as virtual reality and sports analysis. However, conventional frame cameras are limited by their temporal resolution and dynamic range, which imposes constraints in real-world application setups. Event cameras have the advantages of high temporal resolution and high dynamic range (HDR), but the development of event-based methods is necessary to handle data with different characteristics. This paper proposes a novel event-based method for 3D pose estimation and human mesh recovery. Prior work on event-based human mesh recovery requires frames (images) as well as event data. The proposed method solely relies on events; it carves 3D voxels by moving the event camera around a stationary body, reconstructs the human pose and mesh by attenuated rays, and fits statistical body models, preserving high-frequency details. The experimental results show that the proposed method outperforms conventional frame-based methods in the estimation accuracy of both pose and body mesh. We also demonstrate results in challenging situations where a conventional camera has motion blur. This is the first work to demonstrate event-only human mesh recovery, and we hope that it is the first step toward achieving robust and accurate 3D human body scanning from vision sensors. https://florpeng.github.io/event-based-human-scan/
+
+
+
+
+ + ♻ ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected during visual stimuli has made significant strides in the past decade, thanks to the availability of extensive fMRI datasets and advancements in generative models for image generation. However, the application of visual reconstruction has remained limited. Reconstructing visual imagination presents a greater challenge, with potentially revolutionary applications ranging from aiding individuals with disabilities to verifying witness accounts in court. The primary hurdles in this field are the absence of data collection protocols for visual imagery and the lack of datasets on the subject. Traditionally, fMRI-to-image relies on data collected from subjects exposed to visual stimuli, which poses issues for generating visual imagery because of the difference in brain activity between visual stimulation and visual imagery. For the first time, we have compiled a substantial dataset (around 6h of scans) on visual imagery along with a proposed data collection protocol. We then train a modified version of an fMRI-to-image model and demonstrate the feasibility of reconstructing images from two modes of imagination: from memory and from pure imagination. This marks an important step towards creating a technology that allows direct reconstruction of visual imagery.
+
+ comment: Pre-print to be updated. Work in progress +
+
+
+
+
+ + ♻ ☆ Theoretically Achieving Continuous Representation of Oriented Bounding + Boxes CVPR'24 + + +
+ Considerable efforts have been devoted to Oriented Object Detection (OOD). +However, one lasting issue regarding the discontinuity in Oriented Bounding Box +(OBB) representation remains unresolved, which is an inherent bottleneck for +extant OOD methods. This paper endeavors to completely solve this issue in a +theoretically guaranteed manner and puts an end to the ad-hoc efforts in this +direction. Prior studies typically can only address one of the two cases of +discontinuity: rotation and aspect ratio, and often inadvertently introduce +decoding discontinuity, e.g. Decoding Incompleteness (DI) and Decoding +Ambiguity (DA) as discussed in literature. Specifically, we propose a novel +representation method called Continuous OBB (COBB), which can be readily +integrated into existing detectors e.g. Faster-RCNN as a plugin. It can +theoretically ensure continuity in bounding box regression which to our best +knowledge, has not been achieved in literature for rectangle-based object +representation. For fairness and transparency of experiments, we have developed +a modularized benchmark based on the open-source deep learning framework +Jittor's detection toolbox JDet for OOD evaluation. On the popular DOTA +dataset, by integrating Faster-RCNN as the same baseline model, our new method +outperforms the peer method Gliding Vertex by 1.13% mAP50 (relative improvement +1.54%), and 2.46% mAP75 (relative improvement 5.91%), without any tricks. + +
+
+ comment: 17 pages, 12 tables, 8 figures. Accepted by CVPR'24. Code: + https://github.com/514flowey/JDet-COBB +
+
+
+
+
+ + ♻ ☆ Privacy Preserving Image Registration + + +
+ Image registration is a key task in medical imaging applications, allowing medical images to be represented in a common spatial reference frame. Current approaches to image registration are generally based on the assumption that the content of the images is usually accessible in clear form, from which the spatial transformation is subsequently estimated. This common assumption may not be met in practical applications, since the sensitive nature of medical images may ultimately require their analysis under privacy constraints, preventing the image content from being openly shared. In this work, we formulate the problem of image registration under a privacy preserving regime, where images are assumed to be confidential and cannot be disclosed in clear. We derive our privacy preserving image registration framework by extending classical registration paradigms to account for advanced cryptographic tools, such as secure multi-party computation and homomorphic encryption, that enable the execution of operations without leaking the underlying data. To overcome the problem of performance and scalability of cryptographic tools in high dimensions, we propose several techniques to optimize the image registration operations by using gradient approximations, and by revisiting the use of homomorphic encryption through packing, to allow the efficient encryption and multiplication of large matrices. We demonstrate our privacy preserving framework in linear and non-linear registration problems, evaluating its accuracy and scalability with respect to standard, non-private counterparts. Our results show that privacy preserving image registration is feasible and can be adopted in sensitive medical imaging applications.
+
+ comment: v4 Accepted at Medical Image Computing and Computer Assisted + Intervention (2022) 130-140 +
+
+
+
+
+ + ♻ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A Real-World Benchmark Dataset + + +
+ Despite the significant progress in image denoising, it is still challenging to restore fine-scale details while removing noise, especially in extremely low-light environments. Leveraging near-infrared (NIR) images to assist visible RGB image denoising shows the potential to address this issue, becoming a promising technology. Nonetheless, existing works still struggle to take advantage of NIR information effectively for real-world image denoising, due to the content inconsistency between NIR-RGB images and the scarcity of real-world paired datasets. To alleviate the problem, we propose an efficient Selective Fusion Module (SFM), which can be plugged into advanced denoising networks in a plug-and-play manner to merge the deep NIR-RGB features. Specifically, we sequentially perform global and local modulation for the NIR and RGB features, and then integrate the two modulated features. Furthermore, we present a Real-world NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse scenarios as well as various noise levels. Extensive experiments on both synthetic and our real-world datasets demonstrate that the proposed method achieves better results than state-of-the-art ones.
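+
+ A rough PyTorch sketch of the fusion pattern described above (sequential global and local modulation of the concatenated NIR-RGB features, then a merging convolution); the layer choices are assumptions rather than the paper's Selective Fusion Module:
+
+import torch
+import torch.nn as nn
+
+class SelectiveFusionSketch(nn.Module):
+    """Global (channel-wise) then local (spatial) gating of concatenated
+    NIR-RGB features, followed by a 1x1 convolution that merges them."""
+
+    def __init__(self, channels):
+        super().__init__()
+        self.global_gate = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1), nn.Conv2d(2 * channels, 2 * channels, 1), nn.Sigmoid())
+        self.local_gate = nn.Sequential(
+            nn.Conv2d(2 * channels, 2 * channels, 3, padding=1), nn.Sigmoid())
+        self.merge = nn.Conv2d(2 * channels, channels, 1)
+
+    def forward(self, rgb_feat, nir_feat):
+        x = torch.cat([rgb_feat, nir_feat], dim=1)
+        x = x * self.global_gate(x)   # global modulation
+        x = x * self.local_gate(x)    # local modulation
+        return self.merge(x)          # integrate the two modulated streams
+
+fused = SelectiveFusionSketch(64)(torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32))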
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Proposing an intelligent mesh smoothing method with graph neural + networks + + +
+ In CFD, mesh smoothing methods are commonly utilized to refine the mesh quality to achieve high-precision numerical simulations. Specifically, optimization-based smoothing is used for high-quality mesh smoothing, but it incurs significant computational overhead. Pioneering works improve its smoothing efficiency by adopting supervised learning to learn smoothing methods from high-quality meshes. However, they have difficulty smoothing mesh nodes with varying degrees and also need data augmentation to address the node input sequence problem. Additionally, the required labeled high-quality meshes further limit the applicability of the proposed methods. In this paper, we present GMSNet, a lightweight neural network model for intelligent mesh smoothing. GMSNet adopts graph neural networks to extract features of a node's neighbors and output the optimal node position. During smoothing, we also introduce a fault-tolerance mechanism to prevent GMSNet from generating negative-volume elements. With a lightweight model, GMSNet can effectively smooth mesh nodes with varying degrees and remains unaffected by the order of input data. A novel loss function, MetricLoss, is also developed to eliminate the need for high-quality meshes, providing stable and rapid convergence during training. We compare GMSNet with commonly used mesh smoothing methods on two-dimensional triangle meshes. The experimental results show that GMSNet achieves outstanding mesh smoothing performance with 5% of the model parameters of the previous model, and is 13.56 times faster than optimization-based smoothing.
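+
+ The fault-tolerance mechanism can be pictured as a validity check before accepting a predicted node position: reject any move that would invert an incident element. A 2D-triangle sketch in Python, with the fallback-to-old-position behaviour assumed:
+
+def accept_smoothed_position(node, new_node, opposite_edges):
+    """Accept the network's predicted position only if no incident triangle
+    becomes inverted. opposite_edges holds, for each incident triangle, the
+    other two vertices (a, b) ordered so that (node, a, b) is counter-clockwise."""
+    def signed_area(p, a, b):
+        return 0.5 * ((a[0] - p[0]) * (b[1] - p[1]) - (b[0] - p[0]) * (a[1] - p[1]))
+    for a, b in opposite_edges:
+        if signed_area(new_node, a, b) <= 0.0:
+            return node       # reject: would create a negative-area element
+    return new_node
+
+# Example: moving the node far past its neighbours is rejected.
+kept = accept_smoothed_position((0.0, 0.0), (5.0, 5.0), [((1.0, 0.0), (0.0, 1.0))])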
+
+
+
+
+ + ♻ ☆ Positive Label Is All You Need for Multi-Label Classification ICME 2024 + + +
+ Multi-label classification (MLC) faces challenges from label noise in +training data due to annotating diverse semantic labels for each image. Current +methods mainly target identifying and correcting label mistakes using trained +MLC models, but still struggle with persistent noisy labels during training, +resulting in imprecise recognition and reduced performance. Our paper addresses +label noise in MLC by introducing a positive and unlabeled multi-label +classification (PU-MLC) method. To counteract noisy labels, we directly discard +negative labels, focusing on the abundance of negative labels and the origin of +most noisy labels. PU-MLC employs positive-unlabeled learning, training the +model with only positive labels and unlabeled data. The method incorporates +adaptive re-balance factors and temperature coefficients in the loss function +to address label distribution imbalance and prevent over-smoothing of +probabilities during training. Additionally, we introduce a local-global +convolution module to capture both local and global dependencies in the image +without requiring backbone retraining. PU-MLC proves effective on MLC and MLC +with partial labels (MLC-PL) tasks, demonstrating significant improvements on +MS-COCO and PASCAL VOC datasets with fewer annotations. Code is available at: +https://github.com/TAKELAMAG/PU-MLC. + +
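+
+ The positive-unlabeled treatment can be pictured as a loss in which only observed positives contribute a positive term and all remaining (unlabeled) entries contribute a down-weighted "treated as negative" term; the fixed re-balance factor and temperature below are simple stand-ins for the adaptive quantities described in the abstract:
+
+import torch
+
+def pu_mlc_loss(logits, positive_mask, rebalance=0.1, temperature=1.0):
+    """Positive-unlabeled multi-label loss sketch over (B, num_classes) logits."""
+    probs = torch.sigmoid(logits / temperature)
+    pos_term = -(torch.log(probs + 1e-8) * positive_mask).sum()
+    unl_term = -(torch.log(1 - probs + 1e-8) * (1 - positive_mask)).sum()
+    n_pos = positive_mask.sum().clamp(min=1)
+    n_unl = (1 - positive_mask).sum().clamp(min=1)
+    return pos_term / n_pos + rebalance * unl_term / n_unl
+
+loss = pu_mlc_loss(torch.randn(4, 80), (torch.rand(4, 80) > 0.9).float())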
+
+ comment: ICME 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging edge detection and neural networks for better UAV + localization + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
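+
+ The preprocessing step is straightforward to prototype: extract an edge map (comparatively stable across seasons and illumination) and embed that instead of the raw view. A sketch with OpenCV Canny; the detector choice and thresholds are assumptions:
+
+import cv2
+import numpy as np
+
+def edge_preprocess(image_bgr, low=100, high=200):
+    """Convert a BGR view to a 3-channel edge image for a standard encoder."""
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    edges = cv2.Canny(gray, low, high)
+    return np.stack([edges] * 3, axis=-1)
+
+# The edge image is then fed to the offline-trained encoder in place of the
+# raw view, and its embedding is matched against edge-based reference embeddings.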
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Joining Forces for Pathology Diagnostics with AI Assistance: The EMPAIA + Initiative + + +
+ Over the past decade, artificial intelligence (AI) methods in pathology have +advanced substantially. However, integration into routine clinical practice has +been slow due to numerous challenges, including technical and regulatory +hurdles in translating research results into clinical diagnostic products and +the lack of standardized interfaces. The open and vendor-neutral EMPAIA +initiative addresses these challenges. Here, we provide an overview of EMPAIA's +achievements and lessons learned. EMPAIA integrates various stakeholders of the +pathology AI ecosystem, i.e., pathologists, computer scientists, and industry. +In close collaboration, we developed technical interoperability standards, +recommendations for AI testing and product development, and explainability +methods. We implemented the modular and open-source EMPAIA platform and +successfully integrated 14 AI-based image analysis apps from 8 different +vendors, demonstrating how different apps can use a single standardized +interface. We prioritized requirements and evaluated the use of AI in real +clinical settings with 14 different pathology laboratories in Europe and Asia. +In addition to technical developments, we created a forum for all stakeholders +to share information and experiences on digital pathology and AI. Commercial, +clinical, and academic stakeholders can now adopt EMPAIA's common open-source +interfaces, providing a unique opportunity for large-scale standardization and +streamlining of processes. Further efforts are needed to effectively and +broadly establish AI assistance in routine laboratory use. To this end, a +sustainable infrastructure, the non-profit association EMPAIA International, +has been established to continue standardization and support broad +implementation and advocacy for an AI-assisted digital pathology future. + +
+
+
+
+
+ + ♻ ☆ Objects as volumes: A stochastic geometry view of opaque solids + + +
+ We develop a theory for the representation of opaque solids as volumes. +Starting from a stochastic representation of opaque solids as random indicator +functions, we prove the conditions under which such solids can be modeled using +exponential volumetric transport. We also derive expressions for the volumetric +attenuation coefficient as a functional of the probability distributions of the +underlying indicator functions. We generalize our theory to account for +isotropic and anisotropic scattering at different parts of the solid, and for +representations of opaque solids as stochastic implicit surfaces. We derive our +volumetric representation from first principles, which ensures that it +satisfies physical constraints such as reciprocity and reversibility. We use +our theory to explain, compare, and correct previous volumetric +representations, as well as propose meaningful extensions that lead to improved +performance in 3D reconstruction tasks. + +
+
+ comment: project page: https://imaging.cs.cmu.edu/volumetric_opaque_solids +
+
+
+
+
+ + ♻ ☆ CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image + Arbitrary-Scale Super Resolution ICCV + + +
+ Medical image arbitrary-scale super-resolution (MIASSR) has recently gained widespread attention, aiming to super-sample medical volumes at arbitrary scales via a single model. However, existing MIASSR methods face two major limitations: (i) reliance on high-resolution (HR) volumes and (ii) limited generalization ability, which restricts their application in various scenarios. To overcome these limitations, we propose Cube-based Neural Radiance Field (CuNeRF), a zero-shot MIASSR framework that can yield medical images at arbitrary scales and viewpoints in a continuous domain. Unlike existing MIASSR methods that fit the mapping between low-resolution (LR) and HR volumes, CuNeRF focuses on building a coordinate-intensity continuous representation from LR volumes without the need for HR references. This is achieved by the proposed differentiable modules: cube-based sampling, isotropic volume rendering, and cube-based hierarchical rendering. Through extensive experiments on magnetic resonance imaging (MRI) and computed tomography (CT) modalities, we demonstrate that CuNeRF outperforms state-of-the-art MIASSR methods. CuNeRF yields better visual verisimilitude and reduces aliasing artifacts at various upsampling factors. Moreover, our CuNeRF does not need any LR-HR training pairs, making it more flexible and easier to use than other methods. Our code is released at https://github.com/NarcissusEx/CuNeRF.
+
+ comment: This paper is accepted by the International Conference on Computer + Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery CVPR2024 + + +
+ Broad-scale marine surveys performed by underwater vehicles significantly increase the availability of coral reef imagery; however, it is costly and time-consuming for domain experts to label images. Point label propagation is an approach used to leverage existing image data labeled with sparse point labels. The resulting augmented ground truth is then used to train a semantic segmentation model. Here, we first demonstrate that recent advances in foundation models enable the generation of multi-species coral augmented ground truth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN), without the need for any pre-training or custom-designed algorithms. For extremely sparsely labeled images, we propose a labeling regime based on human-in-the-loop principles, resulting in significant improvement in annotation efficiency: if only 5 point labels per image are available, our proposed human-in-the-loop approach improves on the state-of-the-art by 17.3% for pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point labels per image are available. Even if the human-in-the-loop labeling regime is not used, the denoised DINOv2 features with a KNN outperform the prior state-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points). We also provide a detailed analysis of how point labeling style and the quantity of points per image affect point label propagation quality, and provide general recommendations on maximizing point label efficiency.
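+
+ The label-propagation step itself can be sketched with off-the-shelf tools: fit a KNN on the dense features at the sparse point labels and predict a label for every pixel. The feature source (e.g. denoised DINOv2 features upsampled to pixel resolution) and k are assumptions here, not the paper's exact configuration:
+
+import numpy as np
+from sklearn.neighbors import KNeighborsClassifier
+
+def propagate_point_labels(pixel_features, point_coords, point_labels, k=5):
+    """pixel_features: (H, W, D) dense features; point_coords: list of (row, col);
+    returns an (H, W) augmented ground-truth mask for segmentation training."""
+    h, w, d = pixel_features.shape
+    feats_at_points = np.array([pixel_features[r, c] for r, c in point_coords])
+    knn = KNeighborsClassifier(n_neighbors=min(k, len(point_labels)))
+    knn.fit(feats_at_points, point_labels)
+    return knn.predict(pixel_features.reshape(-1, d)).reshape(h, w)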
+
+ comment: Accepted at the CVPR2024 3rd Workshop on Learning with Limited + Labelled Data for Image and Video Understanding (L3D-IVU), 10 pages, 6 + figures, an additional 4 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ Generative Active Learning for Image Synthesis Personalization + + +
+ This paper presents a pilot study that explores the application of active +learning, traditionally studied in the context of discriminative models, to +generative models. We specifically focus on image synthesis personalization +tasks. The primary challenge in conducting active learning on generative models +lies in the open-ended nature of querying, which differs from the closed form +of querying in discriminative models that typically target a single concept. We +introduce the concept of anchor directions to transform the querying process +into a semi-open problem. We propose a direction-based uncertainty sampling +strategy to enable generative active learning and tackle the +exploitation-exploration dilemma. Extensive experiments are conducted to +validate the effectiveness of our approach, demonstrating that an open-source +model can achieve superior performance compared to closed-source models +developed by large companies, such as Google's StyleDrop. The source code is +available at https://github.com/zhangxulu1996/GAL4Personalization. + +
+
+
+
+
+ + ♻ ☆ Learning Self-Prior for Mesh Inpainting Using Self-Supervised Graph + Convolutional Networks + + +
+ In this paper, we present a self-prior-based mesh inpainting framework that +requires only an incomplete mesh as input, without the need for any training +datasets. Additionally, our method maintains the polygonal mesh format +throughout the inpainting process without converting the shape format to an +intermediate one, such as a voxel grid, a point cloud, or an implicit function, +which are typically considered easier for deep neural networks to process. To +achieve this goal, we introduce two graph convolutional networks (GCNs): +single-resolution GCN (SGCN) and multi-resolution GCN (MGCN), both trained in a +self-supervised manner. Our approach refines a watertight mesh obtained from +the initial hole filling to generate a complete output mesh. Specifically, we +train the GCNs to deform an oversmoothed version of the input mesh into the +expected complete shape. The deformation is described by vertex displacements, +and the GCNs are supervised to obtain accurate displacements at vertices in +real holes. To this end, we specify several connected regions of the mesh as +fake holes, thereby generating meshes with various sets of fake holes. The +correct displacements of vertices are known in these fake holes, thus enabling +training GCNs with loss functions that assess the accuracy of vertex +displacements. We demonstrate that our method outperforms traditional +dataset-independent approaches and exhibits greater robustness compared with +other deep-learning-based methods for shapes that infrequently appear in shape +datasets. Our code and test data are available at +https://github.com/astaka-pe/SeMIGCN. + +
+
+ comment: 18 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ MetaCloak: Preventing Unauthorized Subject-driven Text-to-image + Diffusion-based Synthesis via Meta-learning CVPR 2024 + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" from malicious uses. We identify +two limitations of these defending approaches: i) sub-optimal due to the +hand-crafted heuristics for solving the intractable bilevel optimization and +ii) lack of robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework with an additional +transformation sampling process to craft transferable and robust perturbation. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbation. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in a personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate, in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST. + +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at: + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ♻ ☆ Orientation-conditioned Facial Texture Mapping for Video-based Facial + Remote Photoplethysmography Estimation + + +
+ Camera-based remote photoplethysmography (rPPG) enables contactless +measurement of important physiological signals such as pulse rate (PR). +However, dynamic and unconstrained subject motion introduces significant +variability into the facial appearance in video, confounding the ability of +video-based methods to accurately extract the rPPG signal. In this study, we +leverage the 3D facial surface to construct a novel orientation-conditioned +facial texture video representation which improves the motion robustness of +existing video-based facial rPPG estimation methods. Our proposed method +achieves a significant 18.2% performance improvement in cross-dataset testing +on MMPD over our baseline using the PhysNet model trained on PURE, highlighting +the efficacy and generalization benefits of our designed video representation. +We demonstrate significant performance improvements of up to 29.6% in all +tested motion scenarios in cross-dataset testing on MMPD, even in the presence +of dynamic and unconstrained subject motion, emphasizing the benefits of +disentangling motion through modeling the 3D facial surface for motion robust +facial rPPG estimation. We validate the efficacy of our design decisions and +the impact of different video processing steps through an ablation study. Our +findings illustrate the potential strengths of exploiting the 3D facial surface +as a general strategy for addressing dynamic and unconstrained subject motion +in videos. The code is available at +https://samcantrill.github.io/orientation-uv-rppg/. + +
+
+ comment: 12 pages, 8 figures, 6 tables; corrected abstract typo +
+
+
+
+
+ + ♻ ☆ Attention-based Shape-Deformation Networks for Artifact-Free Geometry + Reconstruction of Lumbar Spine from MR Images + + +
+ Lumbar disc degeneration, a progressive structural wear and tear of the lumbar intervertebral discs, is regarded as playing an essential role in low back pain, a significant global health concern. Automated lumbar spine geometry reconstruction from MR images will enable fast measurement of medical parameters to evaluate the lumbar status, in order to determine a suitable treatment. Existing image segmentation-based techniques often generate erroneous segments or unstructured point clouds, unsuitable for medical parameter measurement. In this work, we present TransDeformer: a novel attention-based deep learning approach that reconstructs the geometry of the lumbar spine with high spatial accuracy and mesh correspondence across patients, and we also present a variant of TransDeformer for error estimation. Specifically, we devise new attention modules with a new attention formula, which integrate image features and tokenized contour features to predict the displacements of the points on a shape template without the need for image segmentation. The deformed template reveals the lumbar spine geometry in an image. Experiment results show that our TransDeformer generates artifact-free geometry outputs, and its variant predicts the error of a reconstructed geometry. Our code is available at https://github.com/linchenq/TransDeformer-Mesh.
+
+
+
+
+ + ♻ ☆ A Simple Strategy for Body Estimation from Partial-View Images CVPR + + +
+ Virtual try-on and product personalization have become increasingly important +in modern online shopping, highlighting the need for accurate body measurement +estimation. Although previous research has advanced in estimating 3D body +shapes from RGB images, the task is inherently ambiguous as the observed scale +of human subjects in the images depends on two unknown factors: capture +distance and body dimensions. This ambiguity is particularly pronounced in +partial-view scenarios. To address this challenge, we propose a modular and +simple height normalization solution. This solution relocates the subject +skeleton to the desired position, thereby normalizing the scale and +disentangling the relationship between the two variables. Our experimental +results demonstrate that integrating this technique into state-of-the-art human +mesh reconstruction models significantly enhances partial body measurement +estimation. Additionally, we illustrate the applicability of this approach to +multi-view settings, showcasing its versatility. + +
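+
+ The height-normalization idea can be sketched as rescaling (and recentring) the detected joints to a canonical subject height, removing the distance/body-size ambiguity; the canonical height and the vertical-extent proxy below are illustrative assumptions, not the paper's exact procedure:
+
+import numpy as np
+
+def normalize_skeleton_height(joints, target_height=1.7):
+    """joints: (J, 2) or (J, 3) array with the vertical axis in column 1."""
+    joints = np.asarray(joints, dtype=float)
+    height = joints[:, 1].max() - joints[:, 1].min()   # vertical extent as height proxy
+    scale = target_height / max(height, 1e-8)
+    centred = joints - joints.mean(axis=0, keepdims=True)
+    return centred * scale
+
+normalized = normalize_skeleton_height(np.random.rand(17, 3))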
+
+ comment: Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design +
+
+
+
+
+ + ♻ ☆ Overcoming the Pitfalls of Vision-Language Model Finetuning for OOD + Generalization ICLR 2024 + + +
+ Existing vision-language models exhibit strong generalization on a variety of +visual domains and tasks. However, such models mainly perform zero-shot +recognition in a closed-set manner, and thus struggle to handle open-domain +visual concepts by design. There are recent finetuning methods, such as prompt +learning, that not only study the discrimination between in-distribution (ID) +and out-of-distribution (OOD) samples, but also show some improvements in both +ID and OOD accuracies. In this paper, we first demonstrate that vision-language +models, after long enough finetuning but without proper regularization, tend to +overfit the known classes in the given dataset, with degraded performance on +unknown classes. Then we propose a novel approach OGEN to address this pitfall, +with the main focus on improving the OOD GENeralization of finetuned models. +Specifically, a class-conditional feature generator is introduced to synthesize +OOD features using just the class name of any unknown class. Such synthesized +features will provide useful knowledge about unknowns and help regularize the +decision boundary between ID and OOD data when optimized jointly. Equally +important is our adaptive self-distillation mechanism to regularize our feature +generation model during joint optimization, i.e., adaptively transferring +knowledge between model states to further prevent overfitting. Experiments +validate that our method yields convincing gains in OOD generalization +performance in different settings. Code: https://github.com/apple/ml-ogen. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Improving the Robustness of 3D Human Pose Estimation: A Benchmark and + Learning from Noisy Input + + +
+ Despite the promising performance of current 3D human pose estimation +techniques, understanding and enhancing their generalization on challenging +in-the-wild videos remain an open problem. In this work, we focus on the +robustness of 2D-to-3D pose lifters. To this end, we develop two benchmark +datasets, namely Human3.6M-C and HumanEva-I-C, to examine the robustness of +video-based 3D pose lifters to a wide range of common video corruptions +including temporary occlusion, motion blur, and pixel-level noise. We observe +the poor generalization of state-of-the-art 3D pose lifters in the presence of +corruption and establish two techniques to tackle this issue. First, we +introduce Temporal Additive Gaussian Noise (TAGN) as a simple yet effective 2D +input pose data augmentation. Additionally, to incorporate the confidence +scores output by the 2D pose detectors, we design a confidence-aware +convolution (CA-Conv) block. Extensively tested on corrupted videos, the +proposed strategies consistently boost the robustness of 3D pose lifters and +serve as new baselines for future research. + +
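+
+ A TAGN-style augmentation is simple to sketch: perturb the 2D input poses of a training clip with additive Gaussian noise so the lifter sees corrupted detections. The noise scale, the application probability, and operating in normalized coordinates are assumptions:
+
+import torch
+
+def temporal_additive_gaussian_noise(pose_2d_seq, sigma=0.01, p=0.5):
+    """pose_2d_seq: (T, J, 2) clip of 2D joint positions; returns a noisy copy
+    with probability p, otherwise the clip unchanged."""
+    if torch.rand(()) < p:
+        return pose_2d_seq + torch.randn_like(pose_2d_seq) * sigma
+    return pose_2d_seq
+
+augmented = temporal_additive_gaussian_noise(torch.randn(243, 17, 2))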
+
+
+
+
+ + ♻ ☆ AVS-Net: Point Sampling with Adaptive Voxel Size for 3D Scene + Understanding + + +
+ The recent advancements in point cloud learning have enabled intelligent +vehicles and robots to comprehend 3D environments better. However, processing +large-scale 3D scenes remains a challenging problem, such that efficient +downsampling methods play a crucial role in point cloud learning. Existing +downsampling methods either incur a huge computational burden or sacrifice +fine-grained geometric information. To this end, this paper presents an +advanced sampler that achieves both high accuracy and efficiency. The proposed +method utilizes voxel centroid sampling as a foundation but effectively +addresses the challenges regarding voxel size determination and the +preservation of critical geometric cues. Specifically, we propose a Voxel +Adaptation Module that adaptively adjusts voxel sizes with reference to the +point-based downsampling ratio. This ensures that the sampling results exhibit +a favorable distribution for comprehending various 3D objects or scenes. +Meanwhile, we introduce a network compatible with arbitrary voxel sizes for +sampling and feature extraction while maintaining high efficiency. The proposed +approach is demonstrated with 3D object detection and 3D semantic segmentation. +Compared to existing state-of-the-art methods, our approach achieves better +accuracy on outdoor and indoor large-scale datasets, e.g. Waymo and ScanNet, +with promising efficiency. +
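The Voxel Adaptation Module itself is learned inside the network and is not detailed in the abstract; the sketch below only illustrates the non-learned core of the idea, searching for a voxel size whose centroid sampling matches a target point-based downsampling ratio (all names and the binary-search strategy are illustrative assumptions):

```python
import numpy as np

def voxel_centroid_sample(points, voxel_size):
    """Downsample a point cloud by averaging all points that fall in the same voxel."""
    coords = np.floor(points / voxel_size).astype(np.int64)
    _, inverse = np.unique(coords, axis=0, return_inverse=True)
    num_voxels = inverse.max() + 1
    counts = np.bincount(inverse, minlength=num_voxels).astype(np.float64)
    centroids = np.zeros((num_voxels, points.shape[1]))
    for d in range(points.shape[1]):
        centroids[:, d] = np.bincount(inverse, weights=points[:, d], minlength=num_voxels)
    return centroids / counts[:, None]

def adapt_voxel_size(points, target_ratio=0.25, lo=0.01, hi=2.0, iters=20):
    """Binary-search a voxel size whose centroid sampling keeps ~target_ratio of the points."""
    for _ in range(iters):
        mid = 0.5 * (lo + hi)
        ratio = len(voxel_centroid_sample(points, mid)) / len(points)
        if ratio > target_ratio:   # too many voxels survive -> enlarge the voxels
            lo = mid
        else:
            hi = mid
    return 0.5 * (lo + hi)

pts = np.random.rand(10000, 3) * 10.0
vs = adapt_voxel_size(pts, target_ratio=0.25)
sampled = voxel_centroid_sample(pts, vs)
print(vs, len(sampled) / len(pts))
```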
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Solving Inverse Problems with Latent Diffusion Models via Hard Data + Consistency + + +
+ Diffusion models have recently emerged as powerful generative priors for +solving inverse problems. However, training diffusion models in the pixel space +is both data-intensive and computationally demanding, which restricts their +applicability as priors for high-dimensional real-world data such as medical +images. Latent diffusion models, which operate in a much lower-dimensional +space, offer a solution to these challenges. However, incorporating latent +diffusion models to solve inverse problems remains challenging due to +the nonlinearity of the encoder and decoder. To address these issues, we +propose \textit{ReSample}, an algorithm that can solve general inverse problems +with pre-trained latent diffusion models. Our algorithm incorporates data +consistency by solving an optimization problem during the reverse sampling +process, a concept that we term hard data consistency. Upon solving this +optimization problem, we propose a novel resampling scheme to map the +measurement-consistent sample back onto the noisy data manifold and +theoretically demonstrate its benefits. Lastly, we apply our algorithm to solve +a wide range of linear and nonlinear inverse problems in both natural and +medical images, demonstrating that our approach outperforms existing +state-of-the-art approaches, including those based on pixel-space diffusion +models. +
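The hard data consistency step can be pictured as an inner optimization over the current latent so that the decoded sample agrees with the measurements; the toy sketch below uses a linear stand-in decoder and a subsampling operator, so it illustrates only the shape of that inner loop, not ReSample's actual resampling scheme:

```python
import torch

def hard_data_consistency_step(z, y, decoder, forward_op, num_steps=200, lr=5e-2):
    """Refine a latent estimate so that the decoded sample agrees with the measurements.

    z: current latent estimate from the reverse sampling process, shape (1, d).
    y: observed measurements; forward_op is the (possibly nonlinear) operator A.
    Only the inner data-consistency optimization is shown here; the full algorithm
    wraps it with a dedicated resampling scheme that is not reproduced.
    """
    z = z.clone().requires_grad_(True)
    opt = torch.optim.Adam([z], lr=lr)
    for _ in range(num_steps):
        opt.zero_grad()
        loss = torch.sum((forward_op(decoder(z)) - y) ** 2)
        loss.backward()
        opt.step()
    return z.detach()

# Toy stand-ins: a random linear "decoder" and a subsampling measurement operator.
torch.manual_seed(0)
decoder = torch.nn.Linear(16, 64)
forward_op = lambda x: x[:, ::4]                   # keep every 4th "pixel"
with torch.no_grad():
    y = forward_op(decoder(torch.randn(1, 16)))    # measurements from an unknown latent
z_refined = hard_data_consistency_step(torch.randn(1, 16), y, decoder, forward_op)
print(torch.norm(forward_op(decoder(z_refined)) - y).item())
```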
+
+ comment: 27 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Achieving Reliable and Fair Skin Lesion Diagnosis via Unsupervised + Domain Adaptation + + +
+ The development of reliable and fair diagnostic systems is often constrained +by the scarcity of labeled data. To address this challenge, our work explores +the feasibility of unsupervised domain adaptation (UDA) to integrate large +external datasets for developing reliable classifiers. The adoption of UDA with +multiple sources can simultaneously enrich the training set and bridge the +domain gap between different skin lesion datasets, which vary due to distinct +acquisition protocols. Particularly, UDA shows practical promise for improving +diagnostic reliability when training with a custom skin lesion dataset, where +only limited labeled data are available from the target domain. In this study, +we investigate three UDA training schemes based on source data utilization: +single-source, combined-source, and multi-source UDA. Our findings demonstrate +the effectiveness of applying UDA on multiple sources for binary and +multi-class classification. A strong correlation between test error and label +shift in multi-class tasks has been observed in the experiment. Crucially, our +study shows that UDA can effectively mitigate bias against minority groups and +enhance fairness in diagnostic systems, while maintaining superior +classification performance. This is achieved even without directly implementing +fairness-focused techniques. This success is potentially attributed to the +increased and well-adapted demographic information obtained from multiple +sources. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised MRI Reconstruction with Unrolled Diffusion Models + + +
+ Magnetic Resonance Imaging (MRI) produces excellent soft tissue contrast, +although it is an inherently slow imaging modality. Promising deep learning +methods have recently been proposed to reconstruct accelerated MRI scans. +However, existing methods still suffer from various limitations regarding image +fidelity, contextual sensitivity, and reliance on fully-sampled acquisitions +for model training. To comprehensively address these limitations, we propose a +novel self-supervised deep reconstruction model, named Self-Supervised +Diffusion Reconstruction (SSDiffRecon). SSDiffRecon expresses a conditional +diffusion process as an unrolled architecture that interleaves cross-attention +transformers for reverse diffusion steps with data-consistency blocks for +physics-driven processing. Unlike recent diffusion methods for MRI +reconstruction, a self-supervision strategy is adopted to train SSDiffRecon +using only undersampled k-space data. Comprehensive experiments on public brain +MR datasets demonstrate the superiority of SSDiffRecon against +state-of-the-art supervised and self-supervised baselines in terms of +reconstruction speed and quality. Implementation will be available at +https://github.com/yilmazkorkmaz1/SSDiffRecon. +
+
+
+
+
+ + ♻ ☆ Segment Anything in 3D with Radiance Fields NeurIPS 2023 + + +
+ The Segment Anything Model (SAM) emerges as a powerful vision foundation +model to generate high-quality 2D segmentation results. This paper aims to +generalize SAM to segment 3D objects. Rather than replicating the data +acquisition and annotation procedure which is costly in 3D, we design an +efficient solution, leveraging the radiance field as a cheap and off-the-shelf +prior that connects multi-view 2D images to the 3D space. We refer to the +proposed solution as SA3D, short for Segment Anything in 3D. With SA3D, the +user is only required to provide a 2D segmentation prompt (e.g., rough points) +for the target object in a single view, which is used to generate its +corresponding 2D mask with SAM. Next, SA3D alternately performs mask inverse +rendering and cross-view self-prompting across various views to iteratively +refine the 3D mask of the target object. For one view, mask inverse rendering +projects the 2D mask obtained by SAM into the 3D space with guidance of the +density distribution learned by the radiance field for 3D mask refinement; +Then, cross-view self-prompting extracts reliable prompts automatically as the +input to SAM from the rendered 2D mask of the inaccurate 3D mask for a new +view. We show in experiments that SA3D adapts to various scenes and achieves 3D +segmentation within seconds. Our research reveals a potential methodology to +lift the ability of a 2D segmentation model to 3D. Our code is available at +https://github.com/Jumpat/SegmentAnythingin3D. + +
+
+ comment: Extension version of SA3D (NeurIPS 2023). Project page: + https://jumpat.github.io/SA3D/ +
+
+
+
+
+ + ♻ ☆ GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape + Generative Models ICLR 2024 + + +
+ This paper introduces GenCorres, a novel unsupervised joint shape matching +(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized +deformable shape collection while constraining deformations between adjacent +synthetic shapes to preserve geometric structures such as local rigidity and +local conformality. GenCorres presents three appealing advantages over existing +JSM techniques. First, GenCorres performs JSM among a synthetic shape +collection whose size is much bigger than the input shapes and fully leverages +the data-driven power of JSM. Second, GenCorres unifies consistent shape +matching and pairwise matching (i.e., by enforcing deformation priors between +adjacent synthetic shapes). Third, the generator provides a concise encoding of +consistent shape correspondences. However, learning a mesh generator from an +unorganized shape collection is challenging, requiring a good initialization. +GenCorres addresses this issue by learning an implicit generator from the input +shapes, which provides intermediate shapes between two arbitrary shapes. We +introduce a novel approach for computing correspondences between adjacent +implicit surfaces, which we use to regularize the implicit generator. Synthetic +shapes of the implicit generator then guide initial fittings (i.e., via +template-based deformation) for learning the mesh generator. Experimental +results show that GenCorres considerably outperforms state-of-the-art JSM +techniques. The synthetic shapes of GenCorres also achieve salient performance +gains against state-of-the-art deformable shape generators. +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and + Training Strategies + + +
+ This paper investigates the performance of the Contrastive Language-Image +Pre-training (CLIP) when scaled down to limited computation budgets. We explore +CLIP along three dimensions: data, architecture, and training strategies. With +regards to data, we demonstrate the significance of high-quality training data +and show that a smaller dataset of high-quality data can outperform a larger +dataset with lower quality. We also examine how model performance varies with +different dataset sizes, suggesting that smaller ViT models are better suited +for smaller datasets, while larger models perform better on larger datasets +with fixed compute. Additionally, we provide guidance on when to choose a +CNN-based architecture or a ViT-based architecture for CLIP training. We +compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data +Augmentation - and show that the choice of training strategy depends on the +available compute resource. Our analysis reveals that CLIP+Data Augmentation +can achieve comparable performance to CLIP using only half of the training +data. This work provides practical insights into how to effectively train and +deploy CLIP models, making them more accessible and affordable for practical +use in various applications. + +
+
+
+
+
+ + ♻ ☆ ScribbleGen: Generative Data Augmentation Improves Scribble-supervised + Semantic Segmentation + + +
+ Recent advances in generative models, such as diffusion models, have made +generating high-quality synthetic images widely accessible. Prior works have +shown that training on synthetic images improves many perception tasks, such as +image classification, object detection, and semantic segmentation. We are the +first to explore generative data augmentations for scribble-supervised semantic +segmentation. We propose ScribbleGen, a generative data augmentation method +that leverages a ControlNet diffusion model conditioned on semantic scribbles +to produce high-quality training data. However, naive implementations of +generative data augmentations may inadvertently harm the performance of the +downstream segmentor rather than improve it. We leverage classifier-free +diffusion guidance to enforce class consistency and introduce encode ratios to +trade off data diversity for data realism. Using the guidance scale and encode +ratio, we can generate a spectrum of high-quality training images. We propose +multiple augmentation schemes and find that these schemes significantly impact +model performance, especially in the low-data regime. Our framework further +reduces the gap between the performance of scribble-supervised segmentation and +that of fully-supervised segmentation. We also show that our framework +significantly improves segmentation performance on small datasets, even +surpassing fully-supervised segmentation. The code is available at +https://github.com/mengtang-lab/scribblegen. + +
+
+
+
+
+ + ♻ ☆ Improving Semi-Supervised Semantic Segmentation with Dual-Level Siamese + Structure Network ACM MM 2023 + + +
+ Semi-supervised semantic segmentation (SSS) is an important task that +utilizes both labeled and unlabeled data to reduce expenses on labeling +training examples. However, the effectiveness of SSS algorithms is limited by +the difficulty of fully exploiting the potential of unlabeled data. To address +this, we propose a dual-level Siamese structure network (DSSN) for pixel-wise +contrastive learning. By aligning positive pairs with a pixel-wise contrastive +loss using strong augmented views in both low-level image space and high-level +feature space, the proposed DSSN is designed to maximize the utilization of +available unlabeled data. Additionally, we introduce a novel class-aware +pseudo-label selection strategy for weak-to-strong supervision, which addresses +the limitations of most existing methods that do not perform selection or apply +a predefined threshold for all classes. Specifically, our strategy selects the +top high-confidence prediction of the weak view for each class to generate +pseudo labels that supervise the strong augmented views. This strategy is +capable of taking into account the class imbalance and improving the +performance of long-tailed classes. Our proposed method achieves +state-of-the-art results on two datasets, PASCAL VOC 2012 and Cityscapes, +outperforming other SSS algorithms by a significant margin. The source code is +available at https://github.com/kunzhan/DSSN. + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ NARUTO: Neural Active Reconstruction from Uncertain Target Observations CVPR2024 + + +
+ We present NARUTO, a neural active reconstruction system that combines a +hybrid neural representation with uncertainty learning, enabling high-fidelity +surface reconstruction. Our approach leverages a multi-resolution hash-grid as +the mapping backbone, chosen for its exceptional convergence speed and capacity +to capture high-frequency local features. The centerpiece of our work is the +incorporation of an uncertainty learning module that dynamically quantifies +reconstruction uncertainty while actively reconstructing the environment. By +harnessing learned uncertainty, we propose a novel uncertainty aggregation +strategy for goal searching and efficient path planning. Our system +autonomously explores by targeting uncertain observations and reconstructs +environments with remarkable completeness and fidelity. We also demonstrate the +utility of this uncertainty-aware approach by enhancing SOTA neural SLAM +systems through an active ray sampling strategy. Extensive evaluations of +NARUTO in various environments, using an indoor scene simulator, confirm its +superior performance and state-of-the-art status in active reconstruction, as +evidenced by its impressive results on benchmark datasets like Replica and +MP3D. +
+
+ comment: Accepted to CVPR2024. Project page: + https://oppo-us-research.github.io/NARUTO-website/. Code: + https://github.com/oppo-us-research/NARUTO +
+
+
+
+
+ + ♻ ☆ BOP Challenge 2023 on Detection, Segmentation and Pose Estimation of + Seen and Unseen Rigid Objects + + +
+ We present the evaluation methodology, datasets and results of the BOP +Challenge 2023, the fifth in a series of public competitions organized to +capture the state of the art in model-based 6D object pose estimation from an +RGB/RGB-D image and related tasks. Besides the three tasks from 2022 +(model-based 2D detection, 2D segmentation, and 6D localization of objects seen +during training), the 2023 challenge introduced new variants of these tasks +focused on objects unseen during training. In the new tasks, methods were +required to learn new objects during a short onboarding stage (max 5 minutes, 1 +GPU) from provided 3D object models. The best 2023 method for 6D localization +of unseen objects (GenFlow) notably reached the accuracy of the best 2020 +method for seen objects (CosyPose), although being noticeably slower. The best +2023 method for seen objects (GPose) achieved a moderate accuracy improvement +but a significant 43% run-time improvement compared to the best 2022 +counterpart (GDRNPP). Since 2017, the accuracy of 6D localization of seen +objects has improved by more than 50% (from 56.9 to 85.6 AR_C). The online +evaluation system stays open and is available at: http://bop.felk.cvut.cz/. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2302.13075 +
+
+
+
+
+ + ♻ ☆ GenURL: A General Framework for Unsupervised Representation Learning + + +
+ Unsupervised representation learning (URL), which learns compact embeddings +of high-dimensional data without supervision, has made remarkable progress +recently. However, URL methods for different requirements have been developed +independently, which limits the generalization of the algorithms and becomes +prohibitive as the number of tasks grows. For example, dimension reduction +methods such as t-SNE and UMAP optimize pair-wise data relationships by preserving +the global geometric structure, while self-supervised learning methods such as SimCLR and +BYOL focus on mining the local statistics of instances under specific +augmentations. To address this dilemma, we summarize and propose a unified +similarity-based URL framework, GenURL, which can smoothly adapt to various URL +tasks. In this paper, we regard URL tasks as different implicit constraints on +the data geometric structure that help to seek optimal low-dimensional +representations, which boil down to data structural modeling (DSM) and +low-dimensional transformation (LDT). Specifically, DSM provides a +structure-based submodule to describe the global structures, and LDT learns +compact low-dimensional embeddings with given pretext tasks. Moreover, an +objective function, General Kullback-Leibler divergence (GKL), is proposed to +connect DSM and LDT naturally. Comprehensive experiments demonstrate that +GenURL achieves consistent state-of-the-art performance in self-supervised +visual learning, unsupervised knowledge distillation (KD), graph embeddings +(GE), and dimension reduction. +
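The abstract names the GKL objective but does not state its form; a generic similarity-matching loss in the same spirit, a KL divergence between row-normalized pairwise-similarity distributions in data space (DSM) and embedding space (LDT), can be sketched as follows (this is an illustrative t-SNE-style stand-in, not GenURL's actual GKL):

```python
import torch

def similarity_kl_loss(x_high, z_low, sigma=1.0, eps=1e-12):
    """KL divergence between pairwise-similarity distributions in data and embedding space.

    A generic similarity-matching objective, shown only to illustrate the kind of
    loss the abstract refers to as GKL; the paper's exact formulation may differ.
    """
    def row_softmax_similarities(x, scale):
        d2 = torch.cdist(x, x) ** 2
        logits = -d2 / scale - 1e9 * torch.eye(x.shape[0], device=x.device)  # drop self-pairs
        return torch.softmax(logits, dim=1)

    p = row_softmax_similarities(x_high, 2.0 * sigma ** 2)   # data-space structure (DSM side)
    q = row_softmax_similarities(z_low, 1.0)                 # embedding-space structure (LDT side)
    return torch.sum(p * (torch.log(p + eps) - torch.log(q + eps)))

x = torch.randn(128, 512)                       # high-dimensional features
z = torch.randn(128, 2, requires_grad=True)     # low-dimensional embedding being learned
loss = similarity_kl_loss(x, z)
loss.backward()
print(loss.item())
```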
+
+ comment: TNNLS 2024 version with 13 pages and 14 figures +
+
+
+
+
+ + ♻ ☆ Neural Language of Thought Models ICLR 2024 + + +
+ The Language of Thought Hypothesis suggests that human cognition operates on +a structured, language-like system of mental representations. While neural +language models can naturally benefit from the compositional structure +inherently and explicitly expressed in language data, learning such +representations from non-linguistic general observations, like images, remains +a challenge. In this work, we introduce the Neural Language of Thought Model +(NLoTM), a novel approach for unsupervised learning of LoTH-inspired +representation and generation. NLoTM comprises two key components: (1) the +Semantic Vector-Quantized Variational Autoencoder, which learns hierarchical, +composable discrete representations aligned with objects and their properties, +and (2) the Autoregressive LoT Prior, an autoregressive transformer that learns +to generate semantic concept tokens compositionally, capturing the underlying +data distribution. We evaluate NLoTM on several 2D and 3D image datasets, +demonstrating superior performance in downstream tasks, out-of-distribution +generalization, and image generation quality compared to patch-based VQ-VAE and +continuous object-centric representations. Our work presents a significant step +towards creating neural networks exhibiting more human-like understanding by +developing LoT-like representations and offers insights into the intersection +of cognitive science and machine learning. + +
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Orbital Polarimetric Tomography of a Flare Near the Sagittarius A* + Supermassive Black Hole + + +
+ The interaction between the supermassive black hole at the center of the +Milky Way, Sagittarius A*, and its accretion disk occasionally produces +high-energy flares seen in X-ray, infrared, and radio. One proposed mechanism +that produces flares is the formation of compact, bright regions that appear +within the accretion disk and close to the event horizon. Understanding these +flares provides a window into accretion processes. Although sophisticated +simulations predict the formation of these flares, their structure has yet to +be recovered by observations. Here we show the first three-dimensional (3D) +reconstruction of an emission flare recovered from ALMA light curves observed +on April 11, 2017. Our recovery shows compact, bright regions at a distance of +roughly six times the event horizon. Moreover, it suggests a clockwise rotation +in a low-inclination orbital plane, consistent with prior studies by GRAVITY +and EHT. To recover this emission structure, we solve an ill-posed tomography +problem by integrating a neural 3D representation with a gravitational model +for black holes. Although the recovery is subject to, and sometimes sensitive +to, the model assumptions, under physically motivated choices, our results are +stable, and our approach is successful on simulated data. + +
+
+
+
+
+ + ♻ ☆ Social-Transmotion: Promptable Human Trajectory Prediction ICLR 2024 + + +
+ Accurate human trajectory prediction is crucial for applications such as +autonomous vehicles, robotics, and surveillance systems. Yet, existing models +often fail to fully leverage the non-verbal social cues humans subconsciously +communicate when navigating space. To address this, we introduce +Social-Transmotion, a generic Transformer-based model that exploits diverse and +numerous visual cues to predict human behavior. We translate the idea of a +prompt from Natural Language Processing (NLP) to the task of human trajectory +prediction, where a prompt can be a sequence of x-y coordinates on the ground, +bounding boxes in the image plane, or body pose keypoints in either 2D or 3D. +This, in turn, augments trajectory data, leading to enhanced human trajectory +prediction. Using a masking technique, our model exhibits flexibility and +adaptability by capturing spatiotemporal interactions between agents based on +the available visual cues. We delve into the merits of using 2D versus 3D +poses, and a limited set of poses. Additionally, we investigate the spatial and +temporal attention map to identify which keypoints and time-steps in the +sequence are vital for optimizing human trajectory prediction. Our approach is +validated on multiple datasets, including JTA, JRDB, Pedestrians and Cyclists +in Road Traffic, and ETH-UCY. The code is publicly available: +https://github.com/vita-epfl/social-transmotion. +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Fooling Contrastive Language-Image Pre-trained Models with + CLIPMasterPrints + + +
+ Models leveraging both visual and textual data such as Contrastive +Language-Image Pre-training (CLIP), are the backbone of many recent advances in +artificial intelligence. In this work, we show that despite their versatility, +such models are vulnerable to what we refer to as fooling master images. +Fooling master images are capable of maximizing the confidence score of a CLIP +model for a significant number of widely varying prompts, while being either +unrecognizable or unrelated to the attacked prompts for humans. The existence +of such images is problematic as it could be used by bad actors to maliciously +interfere with CLIP-trained image retrieval models in production with +comparably small effort as a single image can attack many different prompts. We +demonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined +using stochastic gradient descent, projected gradient descent, or blackbox +optimization. Contrary to many common adversarial attacks, the blackbox +optimization approach allows us to mine CLIPMasterPrints even when the weights +of the model are not accessible. We investigate the properties of the mined +images, and find that images trained on a small number of image captions +generalize to a much larger number of semantically related captions. We +evaluate possible mitigation strategies, where we increase the robustness of +the model and introduce an approach to automatically detect CLIPMasterPrints to +sanitize the input of vulnerable models. Finally, we find that vulnerability to +CLIPMasterPrints is related to a modality gap in contrastive pre-trained +multi-modal networks. Code available at +https://github.com/matfrei/CLIPMasterPrints. + +
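Mining a fooling master image with plain gradient descent, one of the three strategies mentioned, can be sketched as maximizing the mean image-text similarity of a single optimized image against many prompts; the snippet below assumes OpenAI's `clip` package is installed and skips CLIP's input normalization and the paper's exact optimizer settings, so it only illustrates the attack objective:

```python
import torch
import clip  # assumes OpenAI's CLIP package: pip install git+https://github.com/openai/CLIP.git

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)
model.float().eval()
for p in model.parameters():
    p.requires_grad_(False)

prompts = ["a photo of a dog", "a photo of a car", "a photo of a mountain"]
with torch.no_grad():
    text_feat = model.encode_text(clip.tokenize(prompts).to(device))
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)

# Optimize one image so that it scores highly against *all* prompts simultaneously.
image = torch.rand(1, 3, 224, 224, device=device, requires_grad=True)
opt = torch.optim.Adam([image], lr=0.05)
for step in range(100):
    opt.zero_grad()
    img_feat = model.encode_image(image.clamp(0, 1))
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    loss = -(img_feat @ text_feat.T).mean()  # maximize mean similarity over prompts
    loss.backward()
    opt.step()
print("mean similarity:", -loss.item())
```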
+
+ comment: This work was supported by a research grant (40575) from VILLUM + FONDEN +
+
+
+
+
+ + ♻ ☆ Tunable Hybrid Proposal Networks for the Open World WACV 2024 + + +
+ Current state-of-the-art object proposal networks are trained with a +closed-world assumption, meaning they learn to only detect objects of the +training classes. These models fail to provide high recall in open-world +environments where important novel objects may be encountered. While a handful +of recent works attempt to tackle this problem, they fail to consider that the +optimal behavior of a proposal network can vary significantly depending on the +data and application. Our goal is to provide a flexible proposal solution that +can be easily tuned to suit a variety of open-world settings. To this end, we +design a Tunable Hybrid Proposal Network (THPN) that leverages an adjustable +hybrid architecture, a novel self-training procedure, and dynamic loss +components to optimize the tradeoff between known and unknown object detection +performance. To thoroughly evaluate our method, we devise several new +challenges which invoke varying degrees of label bias by altering known class +diversity and label count. We find that in every task, THPN easily outperforms +existing baselines (e.g., RPN, OLN). Our method is also highly data efficient, +surpassing baseline recall with a fraction of the labeled data. + +
+
+ comment: Published in WACV 2024. 22 pages, 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Multi-Level Feature Aggregation and Recursive Alignment Network for + Real-Time Semantic Segmentation + + +
+ Real-time semantic segmentation is a crucial research topic for real-world +applications. However, many methods lay particular emphasis on reducing the +computational complexity and model size, while largely sacrificing +accuracy. To tackle this problem, we propose a parallel inference network +customized for semantic segmentation tasks to achieve a good trade-off between +speed and accuracy. We employ a shallow backbone to ensure real-time speed, and +propose three core components to compensate for the reduced model capacity to +improve accuracy. Specifically, we first design a dual-pyramidal path +architecture (Multi-level Feature Aggregation Module, MFAM) to aggregate +multi-level features from the encoder to each scale, providing hierarchical +clues for subsequent spatial alignment and corresponding in-network inference. +Then, we build a Recursive Alignment Module (RAM) by combining the flow-based +alignment module with a recursive upsampling architecture for accurate spatial +alignment between multi-scale feature maps with half the computational +complexity of the straightforward alignment method. Finally, we perform +independent parallel inference on the aligned features to obtain multi-scale +scores, and adaptively fuse them through an attention-based Adaptive Scores +Fusion Module (ASFM) so that the final prediction can favor objects of multiple +scales. Our framework shows a better balance between speed and accuracy than +state-of-the-art real-time methods on Cityscapes and CamVid datasets. We also +conduct systematic ablation studies to gain insight into our motivation and +architectural design. Code is available at: +https://github.com/Yanhua-Zhang/MFARANet. +
+
+ comment: 15 pages, 9 figures and 12 Tables. Manuscript completed on April 30, + 2022 +
+
+
+
+
+ + ♻ ☆ DiffusionAvatars: Deferred Diffusion for High-fidelity 3D Head Avatars + + +
+ DiffusionAvatars synthesizes a high-fidelity 3D head avatar of a person, +offering intuitive control over both pose and expression. We propose a +diffusion-based neural renderer that leverages generic 2D priors to produce +compelling images of faces. For coarse guidance of the expression and head +pose, we render a neural parametric head model (NPHM) from the target +viewpoint, which acts as a proxy geometry of the person. Additionally, to +enhance the modeling of intricate facial expressions, we condition +DiffusionAvatars directly on the expression codes obtained from NPHM via +cross-attention. Finally, to synthesize consistent surface details across +different viewpoints and expressions, we rig learnable spatial features to the +head's surface via TriPlane lookup in NPHM's canonical space. We train +DiffusionAvatars on RGB videos and corresponding fitted NPHM meshes of a person +and test the obtained avatars in both self-reenactment and animation scenarios. +Our experiments demonstrate that DiffusionAvatars generates temporally +consistent and visually appealing videos for novel poses and expressions of a +person, outperforming existing approaches. + +
+
+ comment: Project Page: https://tobias-kirschstein.github.io/diffusion-avatars/ + , Video: https://youtu.be/nSjDiiTnp2E +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 212 + +
+
+
+ + ☆ Can We Break Free from Strong Data Augmentations in Self-Supervised + Learning? + + +
+ Self-supervised learning (SSL) has emerged as a promising solution for +addressing the challenge of limited labeled data in deep neural networks +(DNNs), offering scalability potential. However, the impact of design +dependencies within the SSL framework remains insufficiently investigated. In +this study, we comprehensively explore SSL behavior across a spectrum of +augmentations, revealing their crucial role in shaping SSL model performance +and learning mechanisms. Leveraging these insights, we propose a novel learning +approach that integrates prior knowledge, with the aim of curtailing the need +for extensive data augmentations and thereby amplifying the efficacy of learned +representations. Notably, our findings underscore that SSL models imbued with +prior knowledge exhibit reduced texture bias, diminished reliance on shortcuts +and augmentations, and improved robustness against both natural and adversarial +corruptions. These findings not only illuminate a new direction in SSL +research, but also pave the way for enhancing DNN performance while +concurrently alleviating the imperative for intensive data augmentation, +thereby enhancing scalability and real-world problem-solving capabilities. + +
+
+
+
+
+ + ☆ LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted + Gaussian Primitives + + +
+ Large garages are ubiquitous yet intricate scenes in our daily lives, posing +challenges characterized by monotonous colors, repetitive patterns, reflective +surfaces, and transparent vehicle glass. Conventional Structure from Motion +(SfM) methods for camera pose estimation and 3D reconstruction fail in these +environments due to poor correspondence construction. To address these +challenges, this paper introduces LetsGo, a LiDAR-assisted Gaussian splatting +approach for large-scale garage modeling and rendering. We develop a handheld +scanner, Polar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate +accurate LiDAR and image data scanning. With this Polar device, we present a +GarageWorld dataset consisting of five expansive garage scenes with diverse +geometric structures and will release the dataset to the community for further +research. We demonstrate that the collected LiDAR point cloud by the Polar +device enhances a suite of 3D Gaussian splatting algorithms for garage scene +modeling and rendering. We also propose a novel depth regularizer for 3D +Gaussian splatting algorithm training, effectively eliminating floating +artifacts in rendered images, and a lightweight Level of Detail (LOD) Gaussian +renderer for real-time viewing on web-based devices. Additionally, we explore a +hybrid representation that combines the advantages of traditional mesh in +depicting simple geometry and colors (e.g., walls and the ground) with modern +3D Gaussian representations capturing complex details and high-frequency +textures. This strategy achieves an optimal balance between memory performance +and rendering quality. Experimental results on our dataset, along with +ScanNet++ and KITTI-360, demonstrate the superiority of our method in rendering +quality and resource efficiency. + +
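The abstract mentions a depth regularizer for 3D Gaussian splatting without giving its form; one plausible shape of such a term, a masked L1 penalty between rendered depth and projected LiDAR depth, is sketched below purely as an assumption-labeled illustration:

```python
import torch

def lidar_depth_regularizer(rendered_depth, lidar_depth, valid_mask, weight=0.1):
    """L1 penalty tying splatting depth to projected LiDAR depth where LiDAR is valid.

    The LetsGo abstract only states that a depth regularizer suppresses floaters;
    this masked-L1 term is an illustrative stand-in, not the paper's exact loss.
    """
    diff = torch.abs(rendered_depth - lidar_depth)
    return weight * (diff * valid_mask).sum() / valid_mask.sum().clamp(min=1)

rendered = torch.rand(1, 480, 640, requires_grad=True)   # depth rendered from the Gaussians
lidar = torch.rand(1, 480, 640)                          # LiDAR depth projected into the view
mask = (torch.rand(1, 480, 640) > 0.7).float()           # sparse LiDAR coverage
loss = lidar_depth_regularizer(rendered, lidar, mask)
loss.backward()
print(loss.item())
```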
+
+ comment: Project Page: https://jdtsui.github.io/letsgo/ +
+
+
+
+
+ + ☆ FSRT: Facial Scene Representation Transformer for Face Reenactment from + Factorized Appearance, Head-pose, and Facial Expression Features CVPR 2024 + + +
+ The task of face reenactment is to transfer the head motion and facial +expressions from a driving video to the appearance of a source image, which may +be of a different person (cross-reenactment). Most existing methods are +CNN-based and estimate optical flow from the source image to the current +driving frame, which is then inpainted and refined to produce the output +animation. We propose a transformer-based encoder for computing a set-latent +representation of the source image(s). We then predict the output color of a +query pixel using a transformer-based decoder, which is conditioned on +keypoints and a facial expression vector extracted from the driving frame. +Latent representations of the source person are learned in a self-supervised +manner that factorizes their appearance, head pose, and facial expressions. +Thus, they are perfectly suited for cross-reenactment. In contrast to most +related work, our method naturally extends to multiple source images and can +thus adapt to person-specific facial dynamics. We also propose data +augmentation and regularization schemes that are necessary to prevent +overfitting and support generalizability of the learned representations. We +evaluated our approach in a randomized user study. The results indicate +superior performance compared to the state-of-the-art in terms of motion +transfer quality and temporal consistency. +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Equipping Diffusion Models with Differentiable Spatial Entropy for + Low-Light Image Enhancement CVPR + + +
+ Image restoration, which aims to recover high-quality images from their +corrupted counterparts, often faces the challenge of being an ill-posed problem +that allows multiple solutions for a single input. However, most deep learning +based works simply employ l1 loss to train their network in a deterministic +way, resulting in over-smoothed predictions with inferior perceptual quality. +In this work, we propose a novel method that shifts the focus from a +deterministic pixel-by-pixel comparison to a statistical perspective, +emphasizing the learning of distributions rather than individual pixel values. +The core idea is to introduce spatial entropy into the loss function to measure +the distribution difference between predictions and targets. To make this +spatial entropy differentiable, we employ kernel density estimation (KDE) to +approximate the probabilities for specific intensity values of each pixel with +their neighbor areas. Specifically, we equip the entropy with diffusion models +and aim for superior accuracy and enhanced perceptual quality over l1 based +noise matching loss. In the experiments, we evaluate the proposed method for +low light enhancement on two datasets and the NTIRE challenge 2024. All these +results illustrate the effectiveness of our statistic-based entropy loss. Code +is available at https://github.com/shermanlian/spatial-entropy-loss. + +
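The differentiable spatial entropy can be approximated by soft-binning each pixel's intensity with a Gaussian kernel (a simple KDE) and pooling the soft counts over a local window; the sketch below does exactly that and then compares the entropy maps of prediction and target with L1, which is a simplification of the paper's distribution-matching loss rather than its exact formulation:

```python
import torch
import torch.nn.functional as F

def spatial_entropy(img, num_bins=32, window=9, bandwidth=0.05):
    """Differentiable per-pixel entropy of local intensity distributions.

    Each pixel's intensity is soft-assigned to histogram bins with a Gaussian kernel,
    the soft counts are averaged over a local window, and the entropy of the resulting
    distribution is returned as an (N, 1, H, W) map.  Input: (N, 1, H, W) in [0, 1].
    """
    centers = torch.linspace(0.0, 1.0, num_bins, device=img.device).view(1, num_bins, 1, 1)
    weights = torch.exp(-0.5 * ((img - centers) / bandwidth) ** 2)       # soft bin assignment
    weights = weights / (weights.sum(dim=1, keepdim=True) + 1e-8)
    p = F.avg_pool2d(weights, window, stride=1, padding=window // 2)     # local densities
    p = p / (p.sum(dim=1, keepdim=True) + 1e-8)
    return -(p * torch.log(p + 1e-8)).sum(dim=1, keepdim=True)

def entropy_loss(pred, target):
    return F.l1_loss(spatial_entropy(pred), spatial_entropy(target))

pred = torch.rand(2, 1, 64, 64, requires_grad=True)
target = torch.rand(2, 1, 64, 64)
loss = entropy_loss(pred, target)
loss.backward()
print(loss.item())
```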
+
+ comment: CVPRW 2024, best LPIPS in the NTIRE low light enhancement challenge + 2024 +
+
+
+
+
+ + ☆ Photo-Realistic Image Restoration in the Wild with Controlled + Vision-Language Models CVPR + + +
+ Though diffusion models have been successfully applied to various image +restoration (IR) tasks, their performance is sensitive to the choice of +training datasets. Typically, diffusion models trained in specific datasets +fail to recover images that have out-of-distribution degradations. To address +this problem, this work leverages a capable vision-language model and a +synthetic degradation pipeline to learn image restoration in the wild (wild +IR). More specifically, all low-quality images are simulated with a synthetic +degradation pipeline that contains multiple common degradations such as blur, +resize, noise, and JPEG compression. Then we introduce robust training for a +degradation-aware CLIP model to extract enriched image content features to +assist high-quality image restoration. Our base diffusion model is the image +restoration SDE (IR-SDE). Built upon it, we further present a posterior +sampling strategy for fast noise-free image generation. We evaluate our model +on both synthetic and real-world degradation datasets. Moreover, experiments on +the unified image restoration task illustrate that the proposed posterior +sampling improves image generation quality for various degradations. + +
+
+ comment: CVPRW 2024; Code: https://github.com/Algolzw/daclip-uir +
+
+
+
+
+ + ☆ Adaptive Patching for High-resolution Image Segmentation with + Transformers + + +
+ Attention-based models are proliferating in the space of image analytics, +including segmentation. The standard method of feeding images to transformer +encoders is to divide the images into patches and then feed the patches to the +model as a linear sequence of tokens. For high-resolution images, e.g. +microscopic pathology images, the quadratic compute and memory cost prohibits +the use of an attention-based model, if we are to use smaller patch sizes that +are favorable in segmentation. The solution is to either use custom complex +multi-resolution models or approximate attention schemes. We take inspiration +from Adaptive Mesh Refinement (AMR) methods in HPC by adaptively patching the +images, as a pre-processing step, based on the image details to reduce the +number of patches fed to the model by orders of magnitude. This method +has a negligible overhead, and works seamlessly with any attention-based model, +i.e. it is a pre-processing step that can be adopted by any attention-based +model without friction. We demonstrate superior segmentation quality over SoTA +segmentation models for real-world pathology datasets while gaining a geomean +speedup of $6.9\times$ for resolutions up to $64K^2$, on up to $2,048$ GPUs. +
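AMR-inspired adaptive patching can be illustrated with a quadtree that keeps splitting a region while its pixel variance is high, so flat regions end up as a few large patches and detailed regions as many small ones; the thresholds and the variance criterion below are illustrative assumptions, not the paper's actual refinement rule:

```python
import numpy as np

def adaptive_patches(img, min_size=16, max_size=256, std_thresh=12.0):
    """Quadtree-style adaptive patching: split a region while its pixel std is high.

    Returns a list of (y, x, size) patches; detailed regions end up with small patches
    and flat regions with large ones, reducing the token count fed to a transformer.
    """
    patches = []

    def split(y, x, size):
        region = img[y:y + size, x:x + size]
        if size <= min_size or (size <= max_size and region.std() < std_thresh):
            patches.append((y, x, size))
            return
        half = size // 2
        for dy in (0, half):
            for dx in (0, half):
                split(y + dy, x + dx, half)

    # This sketch assumes a square image whose side is a power of two.
    split(0, 0, img.shape[0])
    return patches

img = np.random.rand(512, 512) * 255
img[:128, :128] = 128.0          # a flat region -> should stay a single large patch
print(len(adaptive_patches(img)), "patches vs", (512 // 16) ** 2, "uniform 16x16 patches")
```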
+
+
+
+
+ + ☆ HSIDMamba: Exploring Bidirectional State-Space Models for Hyperspectral + Denoising + + +
+ Effectively discerning spatial-spectral dependencies in HSI denoising is +crucial, but prevailing methods using convolution or transformers still face +computational efficiency limitations. Recently, the emerging Selective State +Space Model (Mamba) has offered nearly linear computational complexity in +processing natural language sequences, which inspired us to explore its +potential in handling long spectral sequences. In this paper, we propose +HSIDMamba (HSDM), tailored to exploit the linear complexity for effectively +capturing spatial-spectral dependencies in HSI denoising. In particular, HSDM +comprises multiple Hyperspectral Continuous Scan Blocks, incorporating +BCSM (Bidirectional Continuous Scanning Mechanism), scale residual, and spectral +attention mechanisms to enhance the capture of long-range and local +spatial-spectral information. BCSM strengthens spatial-spectral interactions by +linking forward and backward scans and enhancing information from eight +directions through SSM, significantly enhancing the perceptual capability of +HSDM and improving denoising performance. Extensive +evaluations against HSI denoising benchmarks validate the superior performance +of HSDM, achieving state-of-the-art results in performance and surpassing the +efficiency of the latest transformer architectures by $30\%$. +
+
+
+
+
+ + ☆ XoFTR: Cross-modal Feature Matching Transformer CVPR + + +
+ We introduce, XoFTR, a cross-modal cross-view method for local feature +matching between thermal infrared (TIR) and visible images. Unlike visible +images, TIR images are less susceptible to adverse lighting and weather +conditions but present difficulties in matching due to significant texture and +intensity differences. Current hand-crafted and learning-based methods for +visible-TIR matching fall short in handling viewpoint, scale, and texture +diversities. To address this, XoFTR incorporates masked image modeling +pre-training and fine-tuning with pseudo-thermal image augmentation to handle +the modality differences. Additionally, we introduce a refined matching +pipeline that adjusts for scale discrepancies and enhances match reliability +through sub-pixel level refinement. To validate our approach, we collect a +comprehensive visible-thermal dataset, and show that our method outperforms +existing methods on many benchmarks. + +
+
+ comment: CVPR Image Matching Workshop, 2024. 12 pages, 7 figures, 5 tables. + Codes and dataset are available at https://github.com/OnderT/XoFTR +
+
+
+
+
+ + ☆ Harnessing GPT-4V(ision) for Insurance: A Preliminary Exploration + + +
+ The emergence of Large Multimodal Models (LMMs) marks a significant milestone +in the development of artificial intelligence. Insurance, as a vast and complex +discipline, involves a wide variety of data forms in its operational processes, +including text, images, and videos, thereby giving rise to diverse multimodal +tasks. Despite this, there has been limited systematic exploration of +multimodal tasks specific to insurance, nor a thorough investigation into how +LMMs can address these challenges. In this paper, we explore GPT-4V's +capabilities in the insurance domain. We categorize multimodal tasks by +focusing primarily on visual aspects based on types of insurance (e.g., auto, +household/commercial property, health, and agricultural insurance) and +insurance stages (e.g., risk assessment, risk monitoring, and claims +processing). Our experiment reveals that GPT-4V exhibits remarkable abilities +in insurance-related tasks, demonstrating not only a robust understanding of +multimodal content in the insurance domain but also a comprehensive knowledge +of insurance scenarios. However, there are notable shortcomings: GPT-4V +struggles with detailed risk rating and loss assessment, suffers from +hallucination in image understanding, and shows variable support for different +languages. Through this work, we aim to bridge the insurance domain with +cutting-edge LMM technology, facilitate interdisciplinary exchange and +development, and provide a foundation for the continued advancement and +evolution of future research endeavors. + +
+
+
+
+
+ + ☆ Post-Training Network Compression for 3D Medical Image Segmentation: + Reducing Computational Efforts via Tucker Decomposition + + +
+ We address the computational barrier of deploying advanced deep learning +segmentation models in clinical settings by studying the efficacy of network +compression through tensor decomposition. We propose a post-training Tucker +factorization that enables the decomposition of pre-existing models to reduce +computational requirements without impeding segmentation accuracy. We applied +Tucker decomposition to the convolutional kernels of the TotalSegmentator (TS) +model, an nnU-Net model trained on a comprehensive dataset for automatic +segmentation of 117 anatomical structures. Our approach reduced the +floating-point operations (FLOPs) and memory required during inference, +offering an adjustable trade-off between computational efficiency and +segmentation quality. This study utilized the publicly available TS dataset, +employing various downsampling factors to explore the relationship between +model size, inference speed, and segmentation performance. The application of +Tucker decomposition to the TS model substantially reduced the model parameters +and FLOPs across various compression rates, with limited loss in segmentation +accuracy. We removed up to 88% of the model's parameters with no significant +performance changes in the majority of classes after fine-tuning. Practical +benefits varied across different graphics processing unit (GPU) architectures, +with more distinct speed-ups on less powerful hardware. Post-hoc network +compression via Tucker decomposition presents a viable strategy for reducing +the computational demand of medical image segmentation models without +substantially sacrificing accuracy. This approach enables the broader adoption +of advanced deep learning technologies in clinical practice, offering a way to +navigate the constraints of hardware capabilities. + +
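Post-training Tucker compression of a convolution kernel can be sketched with a single-pass HOSVD over the output- and input-channel modes, after which the layer can be replaced by a 1x1 / k x k / 1x1 sequence of smaller convolutions; the paper presumably uses a proper Tucker solver (e.g. alternating least squares as in TensorLy) on 3D kernels, so the 2D example below is only illustrative:

```python
import torch

def tucker2_conv_weight(weight, rank_out, rank_in):
    """Tucker-2 (single-pass HOSVD) approximation of a conv kernel (C_out, C_in, kH, kW).

    Returns channel factors (u_out, u_in) and a core tensor; the original layer can then
    be replaced by a 1x1 conv (C_in -> rank_in), a kHxkW conv (rank_in -> rank_out), and
    a 1x1 conv (rank_out -> C_out).
    """
    c_out, c_in, kh, kw = weight.shape
    u_out = torch.linalg.svd(weight.reshape(c_out, -1), full_matrices=False).U[:, :rank_out]
    u_in = torch.linalg.svd(weight.permute(1, 0, 2, 3).reshape(c_in, -1),
                            full_matrices=False).U[:, :rank_in]
    core = torch.einsum('oihw,or,is->rshw', weight, u_out, u_in)
    return u_out, u_in, core

w = torch.randn(128, 64, 3, 3)                      # a random stand-in kernel
u_out, u_in, core = tucker2_conv_weight(w, rank_out=32, rank_in=16)
approx = torch.einsum('rshw,or,is->oihw', core, u_out, u_in)
new_params = u_out.numel() + u_in.numel() + core.numel()
print("relative error:", (torch.norm(w - approx) / torch.norm(w)).item())
print("param ratio:", new_params / w.numel())
```

On a trained kernel the relative error at a given rank is usually far lower than for the random tensor used here, which is what makes large parameter reductions with limited accuracy loss plausible.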
+
+
+
+
+ + ☆ Deformable MRI Sequence Registration for AI-based Prostate Cancer + Diagnosis + + +
+ The PI-CAI (Prostate Imaging: Cancer AI) challenge led to expert-level +diagnostic algorithms for clinically significant prostate cancer detection. The +algorithms receive biparametric MRI scans as input, which consist of +T2-weighted and diffusion-weighted scans. These scans can be misaligned due to +multiple factors in the scanning process. Image registration can alleviate this +issue by predicting the deformation between the sequences. We investigate the +effect of image registration on the diagnostic performance of AI-based prostate +cancer diagnosis. First, the image registration algorithm, developed in +MeVisLab, is analyzed using a dataset with paired lesion annotations. Second, +the effect on diagnosis is evaluated by comparing case-level cancer diagnosis +performance between using the original dataset, rigidly aligned +diffusion-weighted scans, or deformably aligned diffusion-weighted scans. Rigid +registration showed no improvement. Deformable registration demonstrated a +substantial improvement in lesion overlap (+10% median Dice score) and a +positive yet non-significant improvement in diagnostic performance (+0.3% +AUROC, p=0.18). Our investigation shows that a substantial improvement in +lesion alignment does not directly lead to a significant improvement in +diagnostic performance. Qualitative analysis indicated that jointly developing +image registration methods and diagnostic AI algorithms could enhance +diagnostic accuracy and patient outcomes. + +
+
+
+
+
+ + ☆ Do LLMs Understand Visual Anomalies? Uncovering LLM Capabilities in + Zero-shot Anomaly Detection + + +
+ Large vision-language models (LVLMs) are markedly proficient in deriving +visual representations guided by natural language. Recent explorations have +utilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by +pairing images with textual descriptions indicative of normal and abnormal +conditions, referred to as anomaly prompts. However, existing approaches depend +on static anomaly prompts that are prone to cross-semantic ambiguity, and +prioritize global image-level representations over crucial local pixel-level +image-to-text alignment that is necessary for accurate anomaly localization. In +this paper, we present ALFA, a training-free approach designed to address these +challenges via a unified model. We propose a run-time prompt adaptation +strategy, which first generates informative anomaly prompts to leverage the +capabilities of a large language model (LLM). This strategy is enhanced by a +contextual scoring mechanism for per-image anomaly prompt adaptation and +cross-semantic ambiguity mitigation. We further introduce a novel fine-grained +aligner to fuse local pixel-level semantics for precise anomaly localization, +by projecting the image-text alignment from global to local semantic spaces. +Extensive evaluations on the challenging MVTec and VisA datasets confirm ALFA's +effectiveness in harnessing the language potential for zero-shot VAD, achieving +significant PRO improvements of 12.1% on MVTec AD and 8.9% on VisA compared to +state-of-the-art zero-shot VAD approaches. + +
+
+
+
+
+ + ☆ Real-world Instance-specific Image Goal Navigation for Service Robots: + Bridging the Domain Gap with Contrastive Learning IROS2024 + + +
+ Improving instance-specific image goal navigation (InstanceImageNav), which +locates the identical object in a real-world environment from a query image, is +essential for robotic systems to assist users in finding desired objects. The +challenge lies in the domain gap between low-quality images observed by the +moving robot, characterized by motion blur and low-resolution, and high-quality +query images provided by the user. Such domain gaps could significantly reduce +the task success rate but have not been the focus of previous work. To address +this, we propose a novel method called Few-shot Cross-quality Instance-aware +Adaptation (CrossIA), which employs contrastive learning with an instance +classifier to align features between massive low- and few high-quality images. +This approach effectively reduces the domain gap by bringing the latent +representations of cross-quality images closer on an instance basis. +Additionally, the system integrates an object image collection with a +pre-trained deblurring model to enhance the observed image quality. Our method +fine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We +evaluated our method's effectiveness through an InstanceImageNav task with 20 +different types of instances, where the robot identifies the same instance in a +real-world environment as a high-quality query image. Our experiments showed +that our method improves the task success rate by up to three times compared to +the baseline, a conventional approach based on SuperGlue. These findings +highlight the potential of leveraging contrastive learning and image +enhancement techniques to bridge the domain gap and improve object localization +in robotic applications. The project website is +https://emergentsystemlabstudent.github.io/DomainBridgingNav/. + +
+
+ comment: See website at + https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Submitted to + IROS2024 +
+
+
+
+
+ + ☆ CREST: Cross-modal Resonance through Evidential Deep Learning for + Enhanced Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) enables the recognition of novel classes by +leveraging semantic knowledge transfer from known to unknown categories. This +knowledge, typically encapsulated in attribute descriptions, aids in +identifying class-specific visual features, thus facilitating visual-semantic +alignment and improving ZSL performance. However, real-world challenges such as +distribution imbalances and attribute co-occurrence among instances often +hinder the discernment of local variances in images, a problem exacerbated by +the scarcity of fine-grained, region-specific attribute annotations. Moreover, +the variability in visual presentation within categories can also skew +attribute-category associations. In response, we propose a bidirectional +cross-modal ZSL approach CREST. It begins by extracting representations for +attribute and visual localization and employs Evidential Deep Learning (EDL) to +measure underlying epistemic uncertainty, thereby enhancing the model's +resilience against hard negatives. CREST incorporates dual learning pathways, +focusing on both visual-category and attribute-category alignments, to ensure +robust correlation between latent and observable spaces. Moreover, we introduce +an uncertainty-informed cross-modal fusion technique to refine visual-attribute +inference. Extensive experiments demonstrate our model's effectiveness and +unique explainability across multiple datasets. Our code and data are available +at: https://github.com/JethroJames/CREST. +
+
+ comment: Ongoing work; 10 pages, 2 Tables, 9 Figures; Repo is available at + https://github.com/JethroJames/CREST +
+
+
+
+
+ + ☆ In-Context Translation: Towards Unifying Image Recognition, Processing, + and Generation + + +
+ We propose In-Context Translation (ICT), a general learning framework to +unify visual recognition (e.g., semantic segmentation), low-level image +processing (e.g., denoising), and conditional image generation (e.g., +edge-to-image synthesis). Thanks to unification, ICT significantly reduces the +inherent inductive bias that comes with designing models for specific tasks, +and it maximizes mutual enhancement across similar tasks. However, the +unification across a large number of tasks is non-trivial due to various data +formats and training pipelines. To this end, ICT introduces two designs. +Firstly, it standardizes input-output data of different tasks into RGB image +pairs, e.g., semantic segmentation data pairs an RGB image with its +segmentation mask in the same RGB format. This turns different tasks into a +general translation task between two RGB images. Secondly, it standardizes the +training of different tasks into a general in-context learning, where +"in-context" means the input comprises an example input-output pair of the +target task and a query image. The learning objective is to generate the +"missing" data paired with the query. The implicit translation process is thus +between the query and the generated image. In experiments, ICT unifies ten +vision tasks and showcases impressive performance on their respective +benchmarks. Notably, compared to its competitors, e.g., Painter and +PromptDiffusion, ICT trained on only 4 RTX 3090 GPUs is shown to be more +efficient and less costly in training. + +
+
+
+
+
+ + ☆ Bridging Vision and Language Spaces with Assignment Prediction ICLR 2024 + + +
+ This paper introduces VLAP, a novel approach that bridges pretrained vision +models and large language models (LLMs) to make frozen LLMs understand the +visual world. VLAP transforms the embedding space of pretrained vision models +into the LLMs' word embedding space using a single linear layer for efficient +and general-purpose visual and language understanding. Specifically, we harness +well-established word embeddings to bridge two modality embedding spaces. The +visual and text representations are simultaneously assigned to a set of word +embeddings within pretrained LLMs by formulating the assigning procedure as an +optimal transport problem. We predict the assignment of one modality from the +representation of another modality data, enforcing consistent assignments for +paired multimodal data. This allows vision and language representations to +contain the same information, grounding the frozen LLMs' word embedding space +in visual data. Moreover, a robust semantic taxonomy of LLMs can be preserved +with visual data since the LLMs interpret and reason linguistic information +from correlations between word embeddings. Experimental results show that VLAP +achieves substantial improvements over the previous linear transformation-based +approaches across a range of vision-language tasks, including image captioning, +visual question answering, and cross-modal retrieval. We also demonstrate the +learned visual representations hold a semantic taxonomy of LLMs, making visual +semantic arithmetic possible. + +
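Formulating the assignment of visual features to LLM word embeddings as entropy-regularized optimal transport can be sketched with a few Sinkhorn iterations; the epsilon, iteration count, and balanced-column constraint below are generic choices and not necessarily VLAP's exact objective:

```python
import torch

def sinkhorn_assignment(features, word_embeds, epsilon=0.05, iters=50):
    """Soft-assign a batch of features to a fixed vocabulary of word embeddings.

    Solves an entropy-regularized optimal transport problem with Sinkhorn iterations,
    returning an (N, V) assignment matrix whose rows sum to 1.  This is a generic
    sketch of the assignment-prediction idea, not the paper's full training objective.
    """
    f = torch.nn.functional.normalize(features, dim=-1)
    w = torch.nn.functional.normalize(word_embeds, dim=-1)
    q = torch.exp(f @ w.T / epsilon)              # from cosine similarities (N, V)
    for _ in range(iters):
        q = q / q.sum(dim=1, keepdim=True)        # rows: each sample distributes unit mass
        q = q / q.sum(dim=0, keepdim=True)        # cols: encourage balanced vocabulary use
    return q / q.sum(dim=1, keepdim=True)

visual = torch.randn(8, 512)      # e.g. pooled features from a frozen vision encoder
vocab = torch.randn(1000, 512)    # e.g. a subset of the LLM's word-embedding table
assign = sinkhorn_assignment(visual, vocab)
print(assign.shape, assign.sum(dim=1))
```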
+
+ comment: ICLR 2024 Camera-ready +
+
+
+
+
+ + ☆ AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics + Perception + + +
+ The highly abstract nature of image aesthetics perception (IAP) poses +significant challenge for current multimodal large language models (MLLMs). The +lack of human-annotated multi-modality aesthetic data further exacerbates this +dilemma, resulting in MLLMs falling short of aesthetics perception +capabilities. To address the above challenge, we first introduce a +comprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT) +dataset, which serves as the footstone for building multi-modality aesthetics +foundation models. Specifically, to align MLLMs with human aesthetics +perception, we construct a corpus-rich aesthetic critique database with 21,904 +diverse-sourced images and 88K human natural language feedbacks, which are +collected via progressive questions, ranging from coarse-grained aesthetic +grades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle +diverse queries, we further prompt GPT to refine the aesthetic critiques and +assemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT, +which consists of 409K multi-typed instructions to activate stronger aesthetic +capabilities. Based on the AesMMIT database, we fine-tune the open-sourced +general foundation models, achieving multi-modality Aesthetic Expert models, +dubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert +models deliver significantly better aesthetic perception performances than the +state-of-the-art MLLMs, including the most advanced GPT-4V and +Gemini-Pro-Vision. Source data will be available at +https://github.com/yipoh/AesExpert. + +
+
+
+
+
+ + ☆ UNIAA: A Unified Multi-modal Image Aesthetic Assessment Baseline and + Benchmark + + +
+ As an alternative to expensive expert evaluation, Image Aesthetic Assessment +(IAA) stands out as a crucial task in computer vision. However, traditional IAA +methods are typically constrained to a single data source or task, restricting +the universality and broader application. In this work, to better align with +human aesthetics, we propose a Unified Multi-modal Image Aesthetic Assessment +(UNIAA) framework, including a Multi-modal Large Language Model (MLLM) named +UNIAA-LLaVA and a comprehensive benchmark named UNIAA-Bench. We choose MLLMs +with both visual perception and language ability for IAA and establish a +low-cost paradigm for transforming the existing datasets into unified and +high-quality visual instruction tuning data, from which the UNIAA-LLaVA is +trained. To further evaluate the IAA capability of MLLMs, we construct the +UNIAA-Bench, which consists of three aesthetic levels: Perception, Description, +and Assessment. Extensive experiments validate the effectiveness and +rationality of UNIAA. UNIAA-LLaVA achieves competitive performance on all +levels of UNIAA-Bench, compared with existing MLLMs. Specifically, our model +performs better than GPT-4V in aesthetic perception and even approaches the +junior-level human. We find MLLMs have great potential in IAA, yet there +remains plenty of room for further improvement. The UNIAA-LLaVA and UNIAA-Bench +will be released. + +
+
+
+
+
+ + ☆ A Review and Efficient Implementation of Scene Graph Generation Metrics + + +
+ Scene graph generation has emerged as a prominent research field in computer +vision, witnessing significant advancements in recent years. However, +despite these strides, precise and thorough definitions for the metrics used to +evaluate scene graph generation models are lacking. In this paper, we address +this gap in the literature by providing a review and precise definition of +commonly used metrics in scene graph generation. Our comprehensive examination +clarifies the underlying principles of these metrics and can serve as a +reference or introduction to scene graph metrics. + Furthermore, to facilitate the usage of these metrics, we introduce a +standalone Python package called SGBench that efficiently implements all +defined metrics, ensuring their accessibility to the research community. +Additionally, we present a scene graph benchmarking web service that enables +researchers to compare scene graph generation methods and increase the visibility +of new methods in a central place. + All of our code can be found at https://lorjul.github.io/sgbench/. + +
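For readers unfamiliar with these metrics, the snippet below shows the most common one, Recall@K over (subject, predicate, object) triplets, in its simplest form. This is a generic illustration of the metric's definition, not the API of the SGBench package.

```python
def recall_at_k(pred_triplets, gt_triplets, k=50):
    """Recall@K for scene graph generation: the fraction of ground-truth
    (subject, predicate, object) triplets found among the top-K predictions.
    pred_triplets must be sorted by confidence, highest first."""
    top_k = set(pred_triplets[:k])
    hits = sum(1 for t in gt_triplets if t in top_k)
    return hits / max(len(gt_triplets), 1)

gt = [("person", "riding", "horse"), ("horse", "on", "grass")]
pred = [("person", "riding", "horse"), ("person", "near", "horse"),
        ("horse", "on", "grass")]
print(recall_at_k(pred, gt, k=2))  # 0.5: only one GT triplet is in the top 2
```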
+
+
+
+
+ + ☆ Reactive Model Correction: Mitigating Harm to Task-Relevant Features via + Conditional Bias Suppression + + +
+ Deep Neural Networks are prone to learning and relying on spurious +correlations in the training data, which, for high-risk applications, can have +fatal consequences. Various approaches to suppress model reliance on harmful +features have been proposed that can be applied post-hoc without additional +training. While these methods can be applied efficiently, they also tend +to harm model performance by globally shifting the distribution of latent +features. To mitigate unintended overcorrection of model behavior, we propose a +reactive approach conditioned on model-derived knowledge and eXplainable +Artificial Intelligence (XAI) insights. While the reactive approach can be +applied to many post-hoc methods, we demonstrate the incorporation of +reactivity in particular for P-ClArC (Projective Class Artifact Compensation), +introducing a new method called R-ClArC (Reactive Class Artifact Compensation). +Through rigorous experiments in controlled settings (FunnyBirds) and with a +real-world dataset (ISIC2019), we show that introducing reactivity can minimize +the detrimental effect of the applied correction while simultaneously ensuring +low reliance on spurious features. + +
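The core intuition of "reactive" correction can be sketched as a conditional projection: the artifact direction is removed from the latent features only for samples where an XAI-derived signal indicates the spurious feature is actually in use. The threshold and the relevance signal below are placeholders, and this is a simplified view rather than the paper's exact R-ClArC procedure.

```python
import torch

def reactive_projection(features, artifact_dir, relevance, threshold=0.5):
    """Remove the artifact direction from latent features, but only for
    samples flagged by a relevance score (reactive, per-sample correction).
    features: (B, D), artifact_dir: (D,), relevance: (B,)"""
    d = artifact_dir / artifact_dir.norm()
    projection = (features @ d)[:, None] * d[None, :]  # component along d
    react = (relevance > threshold).float()[:, None]   # 1 = suppress, 0 = keep
    return features - react * projection

feats = torch.randn(4, 16)
direction = torch.randn(16)
scores = torch.tensor([0.9, 0.1, 0.7, 0.2])  # hypothetical XAI artifact relevance
cleaned = reactive_projection(feats, direction, scores)
```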
+
+
+
+
+ + ☆ 3D Gaussian Splatting as Markov Chain Monte Carlo + + +
+ While 3D Gaussian Splatting has recently become popular for neural rendering, +current methods rely on carefully engineered cloning and splitting strategies +for placing Gaussians, which do not always generalize and may lead to +poor-quality renderings. In addition, for real-world scenes, they rely on a +good initial point cloud to perform well. In this work, we rethink 3D Gaussians +as random samples drawn from an underlying probability distribution describing +the physical representation of the scene -- in other words, Markov Chain Monte +Carlo (MCMC) samples. Under this view, we show that the 3D Gaussian updates are +strikingly similar to a Stochastic Gradient Langevin Dynamics (SGLD) update. As +with MCMC, where samples are nothing but past visit locations, adding new Gaussians +under our framework can simply be realized without heuristics by placing +Gaussians at existing Gaussian locations. To encourage using fewer Gaussians +for efficiency, we introduce an L1-regularizer on the Gaussians. On various +standard evaluation scenes, we show that our method provides improved rendering +quality, easy control over the number of Gaussians, and robustness to +initialization. + +
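The SGLD view mentioned above amounts to a gradient step plus injected Gaussian noise on the Gaussian parameters. The toy sketch below shows that update shape on a set of Gaussian centers; the learning rate and relative noise scale are illustrative choices, not the paper's schedule.

```python
import torch

def sgld_step(params, loss, lr=1e-3, noise_scale=1e-3):
    """One SGLD-style update: gradient descent plus Gaussian noise, mirroring
    the interpretation of 3D Gaussian updates as MCMC sampling."""
    grads = torch.autograd.grad(loss, params)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p -= lr * g                          # deterministic gradient step
            p += noise_scale * torch.randn_like(p)  # stochastic exploration

# toy usage: nudge 100 Gaussian centers toward the origin
centers = torch.randn(100, 3, requires_grad=True)
loss = (centers ** 2).sum()
sgld_step([centers], loss)
```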
+
+
+
+
+ + ☆ Mitigating the Curse of Dimensionality for Certified Robustness via Dual + Randomized Smoothing + + +
+ Randomized Smoothing (RS) has been proven a promising method for endowing an +arbitrary image classifier with certified robustness. However, the substantial +uncertainty inherent in the high-dimensional isotropic Gaussian noise imposes +the curse of dimensionality on RS. Specifically, the upper bound of ${\ell_2}$ +certified robustness radius provided by RS exhibits a diminishing trend with +the expansion of the input dimension $d$, proportionally decreasing at a rate +of $1/\sqrt{d}$. This paper explores the feasibility of providing ${\ell_2}$ +certified robustness for high-dimensional input through the utilization of dual +smoothing in the lower-dimensional space. The proposed Dual Randomized +Smoothing (DRS) down-samples the input image into two sub-images and smooths +the two sub-images in lower dimensions. Theoretically, we prove that DRS +guarantees a tight ${\ell_2}$ certified robustness radius for the original +input and reveal that DRS attains a superior upper bound on the ${\ell_2}$ +robustness radius, which decreases proportionally at a rate of $(1/\sqrt m + +1/\sqrt n )$ with $m+n=d$. Extensive experiments demonstrate the +generalizability and effectiveness of DRS, which exhibits a notable capability +to integrate with established methodologies, yielding substantial improvements +in both accuracy and ${\ell_2}$ certified robustness baselines of RS on the +CIFAR-10 and ImageNet datasets. Code is available at +https://github.com/xiasong0501/DRS. + +
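To make the dimension argument above concrete, the snippet compares only the dimension-dependent scaling factors quoted in the abstract, $1/\sqrt{d}$ for standard RS versus $1/\sqrt{m} + 1/\sqrt{n}$ for DRS with $m + n = d$; the constants and classifier-dependent terms of the actual certified radii are omitted.

```python
import math

d = 3 * 224 * 224                 # dimensionality of an ImageNet-sized input
m = n = d // 2                    # DRS smooths two down-sampled halves

rs_factor = 1 / math.sqrt(d)                      # RS bound scales ~ 1/sqrt(d)
drs_factor = 1 / math.sqrt(m) + 1 / math.sqrt(n)  # DRS bound ~ 1/sqrt(m) + 1/sqrt(n)
print(f"RS scaling:  {rs_factor:.6f}")
print(f"DRS scaling: {drs_factor:.6f} ({drs_factor / rs_factor:.2f}x larger)")
# with m = n = d/2 the ratio is 2*sqrt(2) ~ 2.83, independent of d
```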
+
+
+
+
+ + ☆ Pseudo-label Learning with Calibrated Confidence Using an Energy-based + Model IJCNN 2024 + + +
+ In pseudo-labeling (PL), which is a type of semi-supervised learning, +pseudo-labels are assigned based on the confidence scores provided by the +classifier; therefore, accurate confidence is important for successful PL. In +this study, we propose a PL algorithm based on an energy-based model (EBM), +which is referred to as the energy-based PL (EBPL). In EBPL, a neural +network-based classifier and an EBM are jointly trained by sharing their +feature extraction parts. This approach enables the model to learn both the +class decision boundary and the input data distribution, enhancing confidence +calibration during network training. The experimental results demonstrate that +EBPL outperforms the existing PL method in semi-supervised image classification +tasks, with a lower confidence calibration error and higher recognition accuracy. + +
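A rough sketch of the shared-backbone idea follows: one feature extractor feeds both a classification head (for p(y|x)) and a JEM-style energy read-out (-logsumexp of the logits), and pseudo-labels are kept only where the calibrated confidence is high. Layer sizes, the threshold, and the energy definition are assumptions; the full EBM training loop of EBPL is not reproduced here.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SharedBackboneEBM(nn.Module):
    """Classifier and energy model sharing a feature extractor: class logits
    give p(y|x); the energy of x is taken as -logsumexp(logits)."""
    def __init__(self, in_dim=784, hidden=256, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, x):
        logits = self.head(self.features(x))
        energy = -torch.logsumexp(logits, dim=-1)
        return logits, energy

def select_pseudo_labels(logits, threshold=0.95):
    """Assign pseudo-labels only where the (calibrated) confidence is high."""
    conf, labels = F.softmax(logits, dim=-1).max(dim=-1)
    mask = conf >= threshold
    return labels[mask], mask

model = SharedBackboneEBM()
logits, energy = model(torch.rand(32, 784))
pseudo, mask = select_pseudo_labels(logits)
```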
+
+ comment: 8 pages, 8 figures, Accepted at IJCNN 2024 +
+
+
+
+
+ + ☆ MTKD: Multi-Teacher Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) has emerged as a promising technique in deep +learning, typically employed to enhance a compact student network through +learning from their high-performance but more complex teacher variant. When +applied in the context of image super-resolution, most KD approaches are +modified versions of methods developed for other computer vision tasks, which +are based on training strategies with a single teacher and simple loss +functions. In this paper, we propose a novel Multi-Teacher Knowledge +Distillation (MTKD) framework specifically for image super-resolution. It +exploits the advantages of multiple teachers by combining and enhancing the +outputs of these teacher models, which then guides the learning process of the +compact student network. To achieve more effective learning performance, we +have also developed a new wavelet-based loss function for MTKD, which can +better optimize the training process by observing differences in both the +spatial and frequency domains. We fully evaluate the effectiveness of the +proposed method by comparing it to five commonly used KD methods for image +super-resolution based on three popular network architectures. The results show +that the proposed MTKD method achieves evident improvements in super-resolution +performance, up to 0.46dB (based on PSNR), over state-of-the-art KD approaches +across different network structures. The source code of MTKD will be made +available here for public evaluation. + +
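The wavelet-based loss described above can be approximated with a one-level Haar transform: the student output is compared to the (fused) teacher output in both the spatial and the wavelet domains. The Haar filters and the equal weighting are illustrative assumptions; MTKD's teacher-fusion step and exact loss design are not reproduced.

```python
import torch
import torch.nn.functional as F

def haar_dwt(x):
    """One-level Haar wavelet transform of a (B, C, H, W) image (H, W even),
    returning the LL/LH/HL/HH subbands stacked along the channel axis."""
    ll = torch.tensor([[0.5, 0.5], [0.5, 0.5]])
    lh = torch.tensor([[0.5, 0.5], [-0.5, -0.5]])
    hl = torch.tensor([[0.5, -0.5], [0.5, -0.5]])
    hh = torch.tensor([[0.5, -0.5], [-0.5, 0.5]])
    k = torch.stack([ll, lh, hl, hh]).unsqueeze(1)   # (4, 1, 2, 2)
    c = x.shape[1]
    k = k.repeat(c, 1, 1, 1).to(x)                   # depthwise filters
    return F.conv2d(x, k, stride=2, groups=c)

def wavelet_distillation_loss(student_sr, teacher_sr, alpha=0.5):
    """Match the teacher in both the spatial and frequency (wavelet) domains."""
    spatial = F.l1_loss(student_sr, teacher_sr)
    frequency = F.l1_loss(haar_dwt(student_sr), haar_dwt(teacher_sr))
    return alpha * spatial + (1 - alpha) * frequency

s, t = torch.rand(2, 3, 64, 64), torch.rand(2, 3, 64, 64)
print(wavelet_distillation_loss(s, t))
```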
+
+
+
+
+ + ☆ The revenge of BiSeNet: Efficient Multi-Task Image Segmentation CVPR2024 + + +
+ Recent advancements in image segmentation have focused on enhancing the +efficiency of the models to meet the demands of real-time applications, +especially on edge devices. However, existing research has primarily +concentrated on single-task settings, especially on semantic segmentation, +leading to redundant efforts and specialized architectures for different tasks. +To address this limitation, we propose a novel architecture for efficient +multi-task image segmentation, capable of handling various segmentation tasks +without sacrificing efficiency or accuracy. We introduce BiSeNetFormer, which +leverages the efficiency of two-stream semantic segmentation architectures and +extends them into a mask classification framework. Our approach maintains +the efficient spatial and context paths to capture detailed and semantic +information, respectively, while leveraging an efficient transformer-based +segmentation head that computes the binary masks and class probabilities. By +seamlessly supporting multiple tasks, namely semantic and panoptic +segmentation, BiSeNetFormer offers a versatile solution for multi-task +segmentation. We evaluate our approach on popular datasets, Cityscapes and +ADE20K, demonstrating impressive inference speeds while maintaining competitive +accuracy compared to state-of-the-art architectures. Our results indicate that +BiSeNetFormer represents a significant advancement towards fast, efficient, and +multi-task segmentation networks, bridging the gap between model efficiency and +task adaptability. + +
+
+ comment: Accepted to ECV workshop at CVPR2024 +
+
+
+
+
+ + ☆ nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image + Segmentation + + +
+ The release of nnU-Net marked a paradigm shift in 3D medical image +segmentation, demonstrating that a properly configured U-Net architecture could +still achieve state-of-the-art results. Despite this, the pursuit of novel +architectures, and the respective claims of superior performance over the U-Net +baseline, continued. In this study, we demonstrate that many of these recent +claims fail to hold up when scrutinized for common validation shortcomings, +such as the use of inadequate baselines, insufficient datasets, and neglected +computational resources. By meticulously avoiding these pitfalls, we conduct a +thorough and comprehensive benchmarking of current segmentation methods +including CNN-based, Transformer-based, and Mamba-based approaches. In contrast +to current beliefs, we find that the recipe for state-of-the-art performance is +1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2) +using the nnU-Net framework, and 3) scaling models to modern hardware +resources. These results indicate an ongoing innovation bias towards novel +architectures in the field and underscore the need for more stringent +validation standards in the quest for scientific progress. + +
+
+
+
+
+ + ☆ AI-KD: Towards Alignment Invariant Face Image Quality Assessment Using + Knowledge Distillation + + +
+ Face Image Quality Assessment (FIQA) techniques have seen steady improvements +over recent years, but their performance still deteriorates if the input face +samples are not properly aligned. This alignment sensitivity comes from the +fact that most FIQA techniques are trained or designed using a specific face +alignment procedure. If the alignment technique changes, the performance of +most existing FIQA techniques quickly becomes suboptimal. To address this +problem, we present in this paper a novel knowledge distillation approach, +termed AI-KD that can extend on any existing FIQA technique, improving its +robustness to alignment variations and, in turn, performance with different +alignment procedures. To validate the proposed distillation approach, we +conduct comprehensive experiments on 6 face datasets with 4 recent face +recognition models and in comparison to 7 state-of-the-art FIQA techniques. Our +results show that AI-KD consistently improves performance of the initial FIQA +techniques not only with misaligned samples, but also with properly aligned +facial images. Furthermore, it leads to a new state-of-the-art, when used with +a competitive initial FIQA approach. The code for AI-KD is made publicly +available from: https://github.com/LSIbabnikz/AI-KD. + +
+
+ comment: IEEE International Workshop on Biometrics and Forensics (IWBF) 2024, + pp. 6 +
+
+
+
+
+ + ☆ Text-Driven Diverse Facial Texture Generation via Progressive + Latent-Space Refinement + + +
+ Automatic 3D facial texture generation has gained significant interest +recently. Existing approaches may not support the traditional physically based +rendering pipeline or rely on 3D data captured by Light Stage. Our key +contribution is a progressive latent space refinement approach that can +bootstrap from 3D Morphable Models (3DMMs)-based texture maps generated from +facial images to generate high-quality and diverse PBR textures, including +albedo, normal, and roughness. It starts with enhancing Generative Adversarial +Networks (GANs) for text-guided and diverse texture generation. To this end, we +design a self-supervised paradigm to overcome the reliance on ground truth 3D +textures and train the generative model with only entangled texture maps. +Besides, we foster mutual enhancement between GANs and Score Distillation +Sampling (SDS). SDS boosts GANs with more generative modes, while GANs promote +more efficient optimization of SDS. Furthermore, we introduce an edge-aware SDS +for multi-view consistent facial structure. Experiments demonstrate that our +method outperforms existing 3D texture generation methods regarding +photo-realistic quality, diversity, and efficiency. + +
+
+
+
+
+ + ☆ WiTUnet: A U-Shaped Architecture Integrating CNN and Transformer for + Improved Feature Alignment and Local Information Fusion + + +
+ Low-dose computed tomography (LDCT) has become the technology of choice for +diagnostic medical imaging, given its lower radiation dose compared to standard +CT, despite increased image noise that can potentially affect diagnostic +accuracy. To address this, advanced deep learning-based LDCT denoising +algorithms have been developed, primarily using Convolutional Neural Networks +(CNNs) or Transformer Networks with the Unet architecture. This architecture +enhances image detail by integrating feature maps from the encoder and decoder +via skip connections. However, current methods often overlook enhancements to +the Unet architecture itself, focusing instead on optimizing encoder and +decoder structures. This approach can be problematic due to the significant +differences in feature map characteristics between the encoder and decoder, +where simple fusion strategies may not effectively reconstruct images. In this +paper, we introduce WiTUnet, a novel LDCT image denoising method that utilizes +nested, dense skip pathways instead of traditional skip connections to improve +feature integration. WiTUnet also incorporates a windowed Transformer structure +to process images in smaller, non-overlapping segments, reducing computational +load. Additionally, the integration of a Local Image Perception Enhancement +(LiPe) module in both the encoder and decoder replaces the standard multi-layer +perceptron (MLP) in Transformers, enhancing local feature capture and +representation. Through extensive experimental comparisons, WiTUnet has +demonstrated superior performance over existing methods in key metrics such as +Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), and Root Mean +Square Error (RMSE), significantly improving noise removal and image quality. + +
+
+
+
+
+ + ☆ TMPQ-DM: Joint Timestep Reduction and Quantization Precision Selection + for Efficient Diffusion Models + + +
+ Diffusion models have emerged as preeminent contenders in the realm of +generative models. Distinguished by their distinctive sequential generative +processes, characterized by hundreds or even thousands of timesteps, diffusion +models progressively reconstruct images from pure Gaussian noise, with each +timestep necessitating full inference of the entire model. However, the +substantial computational demands inherent to these models present challenges +for deployment, quantization is thus widely used to lower the bit-width for +reducing the storage and computing overheads. Current quantization +methodologies primarily focus on model-side optimization, disregarding the +temporal dimension, such as the length of the timestep sequence, thereby +allowing redundant timesteps to continue consuming computational resources, +leaving substantial scope for accelerating the generative process. In this +paper, we introduce TMPQ-DM, which jointly optimizes timestep reduction and +quantization to achieve a superior performance-efficiency trade-off, addressing +both temporal and model optimization aspects. For timestep reduction, we devise +a non-uniform grouping scheme tailored to the non-uniform nature of the +denoising process, thereby mitigating the explosive combinations of timesteps. +In terms of quantization, we adopt a fine-grained layer-wise approach to +allocate varying bit-widths to different layers based on their respective +contributions to the final generative performance, thus rectifying performance +degradation observed in prior studies. To expedite the evaluation of +fine-grained quantization, we further devise a super-network to serve as a +precision solver by leveraging shared quantization results. These two design +components are seamlessly integrated within our framework, enabling rapid joint +exploration of the exponentially large decision space via a gradient-free +evolutionary search algorithm. + +
+
+
+
+
+ + ☆ Oblique-MERF: Revisiting and Improving MERF for Oblique Photography + + +
+ Neural implicit fields have established a new paradigm for scene +representation, with subsequent work achieving high-quality real-time +rendering. However, reconstructing 3D scenes from oblique aerial photography +presents unique challenges, such as varying spatial scale distributions and a +constrained range of tilt angles, often resulting in high memory consumption +and reduced rendering quality at extrapolated viewpoints. In this paper, we +enhance MERF to accommodate these data characteristics by introducing an +innovative adaptive occupancy plane optimized during the volume rendering +process and a smoothness regularization term for view-dependent color to +address these issues. Our approach, termed Oblique-MERF, surpasses +state-of-the-art real-time methods by approximately 0.7 dB, reduces VRAM usage +by about 40%, and achieves higher rendering frame rates with more realistic +rendering outcomes across most viewpoints. + +
+
+
+
+
+ + ☆ RanLayNet: A Dataset for Document Layout Detection used for Domain + Adaptation and Generalization + + +
+ Large ground-truth datasets and recent advances in deep learning techniques +have been useful for layout detection. However, because of the restricted +layout diversity of these datasets, training on them requires a sizable number +of annotated instances, which is both expensive and time-consuming. As a +result, differences between the source and target domains may significantly +impact how well these models function. To solve this problem, domain adaptation +approaches have been developed that use a small quantity of labeled data to +adjust the model to the target domain. In this research, we introduce a +synthetic document dataset called RanLayNet, enriched with automatically +assigned labels denoting spatial positions, ranges, and types of layout +elements. The primary aim of this endeavor is to develop a versatile dataset +capable of training models with robustness and adaptability to diverse document +formats. Through empirical experimentation, we demonstrate that a deep layout +identification model trained on our dataset exhibits enhanced performance +compared to a model trained solely on actual documents. Moreover, we conduct a +comparative analysis by fine-tuning inference models using both PubLayNet and +IIIT-AR-13K datasets on the Doclaynet dataset. Our findings emphasize that +models enriched with our dataset perform best on such tasks, achieving mAP95 scores of 0.398 +and 0.588 for the TABLE class in the scientific document domain. + +
+
+
+
+
+ + ☆ State Space Model for New-Generation Network Alternative to + Transformers: A Survey + + +
+ In the post-deep learning era, the Transformer architecture has demonstrated +its powerful performance across pre-trained big models and various downstream +tasks. However, the enormous computational demands of this architecture have +deterred many researchers. To further reduce the complexity of attention +models, numerous efforts have been made to design more efficient methods. Among +them, the State Space Model (SSM), as a possible replacement for the +self-attention based Transformer model, has drawn more and more attention in +recent years. In this paper, we give the first comprehensive review of these +works and also provide experimental comparisons and analysis to better +demonstrate the features and advantages of SSM. Specifically, we first give a +detailed description of principles to help the readers quickly capture the key +ideas of SSM. After that, we dive into the reviews of existing SSMs and their +various applications, including natural language processing, computer vision, +graph, multi-modal and multi-media, point cloud/event stream, time series data, +and other domains. In addition, we give statistical comparisons and analysis of +these models and hope it helps the readers to understand the effectiveness of +different structures on various tasks. Then, we propose possible research +points in this direction to better promote the development of the theoretical +model and application of SSM. More related works will be continuously updated +on the following GitHub: +https://github.com/Event-AHU/Mamba_State_Space_Model_Paper_List. + +
+
+ comment: The First review of State Space Model (SSM)/Mamba and their + applications in artificial intelligence, 33 pages +
+
+
+
+
+ ☆ Deep image learning of quantitative structure-property relationships of + copper alloys via feature augmentation on Geodesic curve in shape space + +
+ Understanding how the structure of materials affects their properties is a +cornerstone of materials science and engineering. However, traditional methods +have struggled to accurately describe the quantitative structure-property +relationships for complex structures. In our study, we bridge this gap by +leveraging machine learning to analyze images of materials' microstructures, +thus offering a novel way to understand and predict the properties of materials +based on their microstructures. We introduce a method known as FAGC (Feature +Augmentation on Geodesic Curves), specifically demonstrated for Cu-Cr-Zr +alloys. This approach utilizes machine learning to examine the shapes within +images of the alloys' microstructures and predict their mechanical and +electronic properties. This generative FAGC approach can effectively expand the +relatively small training datasets due to the limited availability of materials +images labeled with quantitative properties. The process begins with extracting +features from the images using neural networks. These features are then mapped +onto the Pre-shape space to construct the Geodesic curves. Along these curves, +new features are generated, effectively increasing the dataset. Moreover, we +design a pseudo-labeling mechanism for these newly generated features to +further enhance the training dataset. Our FAGC method has shown remarkable +results, significantly improving the accuracy of predicting the electronic +conductivity and hardness of Cu-Cr-Zr alloys, with R-squared values of 0.978 +and 0.998, respectively. These outcomes underscore the potential of FAGC to +address the challenge of limited image data in materials science, providing a +powerful tool for establishing detailed and quantitative relationships between +complex microstructures and material properties. + +
+
+
+
+
+ + ☆ Magic Clothing: Controllable Garment-Driven Image Synthesis + + +
+ We propose Magic Clothing, a latent diffusion model (LDM)-based network +architecture for an unexplored garment-driven image synthesis task. Aiming at +generating customized characters wearing the target garments with diverse text +prompts, the image controllability is the most critical issue, i.e., to +preserve the garment details and maintain faithfulness to the text prompts. To +this end, we introduce a garment extractor to capture the detailed garment +features, and employ self-attention fusion to incorporate them into the +pretrained LDMs, ensuring that the garment details remain unchanged on the +target character. Then, we leverage the joint classifier-free guidance to +balance the control of garment features and text prompts over the generated +results. Meanwhile, the proposed garment extractor is a plug-in module +applicable to various finetuned LDMs, and it can be combined with other +extensions like ControlNet and IP-Adapter to enhance the diversity and +controllability of the generated characters. Furthermore, we design +Matched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency +of the target image to the source garment. Extensive experiments demonstrate +that our Magic Clothing achieves state-of-the-art results under various +conditional controls for garment-driven image synthesis. Our source code is +available at https://github.com/ShineChen1024/MagicClothing. + +
+
+
+
+
+ + ☆ Fuse after Align: Improving Face-Voice Association Learning via + Multimodal Encoder + + +
+ Today, there have been many achievements in learning the association between +voice and face. However, most previous work models rely on cosine similarity or +L2 distance to evaluate the likeness of voices and faces following contrastive +learning, subsequently applied to retrieval and matching tasks. This method +only considers the embeddings as high-dimensional vectors, utilizing a minimal +scope of available information. This paper introduces a novel framework within +an unsupervised setting for learning voice-face associations. By employing a +multimodal encoder after contrastive learning and addressing the problem +through binary classification, we can learn the implicit information within the +embeddings in a more effective and varied manner. Furthermore, by introducing +an effective pair selection method, we enhance the learning outcomes of both +contrastive learning and the matching task. Empirical evidence demonstrates +that our framework achieves state-of-the-art results in voice-face matching, +verification, and retrieval tasks, improving verification by approximately 3%, +matching by about 2.5%, and retrieval by around 1.3%. + +
+
+
+
+
+ + ☆ Clothes-Changing Person Re-Identification with Feasibility-Aware + Intermediary Matching + + +
+ Current clothes-changing person re-identification (re-id) approaches usually +perform retrieval based on clothes-irrelevant features, while neglecting the +potential of clothes-relevant features. However, we observe that relying solely +on clothes-irrelevant features for clothes-changing re-id is limited, since +they often lack adequate identity information and suffer from large intra-class +variations. On the contrary, clothes-relevant features can be used to discover +same-clothes intermediaries that possess informative identity clues. Based on +this observation, we propose a Feasibility-Aware Intermediary Matching (FAIM) +framework to additionally utilize clothes-relevant features for retrieval. +Firstly, an Intermediary Matching (IM) module is designed to perform an +intermediary-assisted matching process. This process involves using +clothes-relevant features to find informative intermediates, and then using +clothes-irrelevant features of these intermediates to complete the matching. +Secondly, in order to reduce the negative effect of low-quality intermediaries, +an Intermediary-Based Feasibility Weighting (IBFW) module is designed to +evaluate the feasibility of intermediary matching process by assessing the +quality of intermediaries. Extensive experiments demonstrate that our method +outperforms state-of-the-art methods on several widely-used clothes-changing +re-id benchmarks. + +
+
+
+
+
+ + ☆ Learning Tracking Representations from Single Point Annotations CVPR2024 + + +
+ Existing deep trackers are typically trained with largescale video frames +with annotated bounding boxes. However, these bounding boxes are expensive and +time-consuming to annotate, in particular for large scale datasets. In this +paper, we propose to learn tracking representations from single point +annotations (i.e., 4.5x faster to annotate than the traditional bounding box) +in a weakly supervised manner. Specifically, we propose a soft contrastive +learning (SoCL) framework that incorporates target objectness prior into +end-to-end contrastive learning. Our SoCL consists of adaptive positive and +negative sample generation, which is memory-efficient and effective for +learning tracking representations. We apply the learned representation of SoCL +to visual tracking and show that our method can 1) achieve better performance +than the fully supervised baseline trained with box annotations under the same +annotation time cost; 2) achieve comparable performance of the fully supervised +baseline by using the same number of training frames and meanwhile reducing +annotation time cost by 78% and total fees by 85%; 3) be robust to annotation +noise. + +
+
+ comment: Accept to CVPR2024-L3DIVU +
+
+
+
+
+ + ☆ SparseOcc: Rethinking Sparse Latent Representation for Vision-Based + Semantic Occupancy Prediction CVPR 2024 + + +
+ Vision-based perception for autonomous driving requires an explicit modeling +of a 3D space, where 2D latent representations are mapped and subsequent 3D +operators are applied. However, operating on dense latent spaces introduces a +cubic time and space complexity, which limits scalability in terms of +perception range or spatial resolution. Existing approaches compress the dense +representation using projections like Bird's Eye View (BEV) or Tri-Perspective +View (TPV). Although efficient, these projections result in information loss, +especially for tasks like semantic occupancy prediction. To address this, we +propose SparseOcc, an efficient occupancy network inspired by sparse point +cloud processing. It utilizes a lossless sparse latent representation with +three key innovations. Firstly, a 3D sparse diffuser performs latent completion +using spatially decomposed 3D sparse convolutional kernels. Secondly, a feature +pyramid and sparse interpolation enhance scales with information from others. +Finally, the transformer head is redesigned as a sparse variant. SparseOcc +achieves a remarkable 74.9% reduction on FLOPs over the dense baseline. +Interestingly, it also improves accuracy, from 12.8% to 14.1% mIOU, which in +part can be attributed to the sparse representation's ability to avoid +hallucinations on empty voxels. + +
+
+ comment: 10 pages, 4 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Learning Human Motion from Monocular Videos via Cross-Modal Manifold + Alignment + + +
+ Learning 3D human motion from 2D inputs is a fundamental task in the realms +of computer vision and computer graphics. Many previous methods grapple with +this inherently ambiguous task by introducing motion priors into the learning +process. However, these approaches face difficulties in defining the complete +configurations of such priors or training a robust model. In this paper, we +present the Video-to-Motion Generator (VTM), which leverages motion priors +through cross-modal latent feature space alignment between 3D human motion and +2D inputs, namely videos and 2D keypoints. To reduce the complexity of modeling +motion priors, we model the motion data separately for the upper and lower body +parts. Additionally, we align the motion data with a scale-invariant virtual +skeleton to mitigate the interference of human skeleton variations to the +motion priors. Evaluated on AIST++, the VTM showcases state-of-the-art +performance in reconstructing 3D human motion from monocular videos. Notably, +our VTM exhibits the capabilities for generalization to unseen view angles and +in-the-wild videos. + +
+
+
+
+
+ + ☆ FusionMamba: Dynamic Feature Enhancement for Multimodal Image Fusion + with Mamba + + +
+ Multi-modal image fusion aims to combine information from different modalities to +create a single image with comprehensive information and detailed textures. +However, fusion models based on convolutional neural networks encounter +limitations in capturing global image features due to their focus on local +convolution operations. Transformer-based models, while excelling in global +feature modeling, confront computational challenges stemming from their +quadratic complexity. Recently, the Selective Structured State Space Model has +exhibited significant potential for long-range dependency modeling with linear +complexity, offering a promising avenue to address the aforementioned dilemma. +In this paper, we propose FusionMamba, a novel dynamic feature enhancement +method for multimodal image fusion with Mamba. Specifically, we devise an +improved efficient Mamba model for image fusion, integrating an efficient visual +state space model with dynamic convolution and channel attention. This refined +model not only upholds the performance and global modeling capability of Mamba +but also diminishes channel redundancy while improving local enhancement +capability. Additionally, we devise a dynamic feature fusion module (DFFM) +comprising two dynamic feature enhancement modules (DFEM) and a cross modality +fusion mamba module (CMFM). The former serves for dynamic texture enhancement +and dynamic difference perception, whereas the latter enhances correlation +features between modalities and suppresses redundant intermodal information. +FusionMamba has yielded state-of-the-art (SOTA) performance across various +multimodal medical image fusion tasks (CT-MRI, PET-MRI, SPECT-MRI), the infrared +and visible image fusion task (IR-VIS), and the multimodal biomedical image fusion +dataset (GFP-PC), which demonstrates the generalization ability of our model. +The code for FusionMamba is available at +https://github.com/millieXie/FusionMamba. + +
+
+
+
+
+ + ☆ Towards Collaborative Autonomous Driving: Simulation Platform and + End-to-End System + + +
+ Vehicle-to-everything-aided autonomous driving (V2X-AD) has a huge potential +to provide a safer driving solution. Despite extensive research in +transportation and communication to support V2X-AD, the actual utilization of +these infrastructures and communication resources in enhancing driving +performance remains largely unexplored. This highlights the necessity of +collaborative autonomous driving: a machine learning approach that optimizes +the information sharing strategy to improve the driving performance of each +vehicle. This effort necessitates two key foundations: a platform capable of +generating data to facilitate the training and testing of V2X-AD, and a +comprehensive system that integrates full driving-related functionalities with +mechanisms for information sharing. From the platform perspective, we present +V2Xverse, a comprehensive simulation platform for collaborative autonomous +driving. This platform provides a complete pipeline for collaborative driving. +From the system perspective, we introduce CoDriving, a novel end-to-end +collaborative driving system that properly integrates V2X communication over +the entire autonomous pipeline, promoting driving with shared perceptual +information. The core idea is a novel driving-oriented communication strategy. +Leveraging this strategy, CoDriving improves driving performance while +optimizing communication efficiency. We make comprehensive benchmarks with +V2Xverse, analyzing both modular performance and closed-loop driving +performance. Experimental results show that CoDriving: i) significantly +improves the driving score by 62.49% and drastically reduces the pedestrian +collision rate by 53.50% compared to the SOTA end-to-end driving method, and +ii) sustains its driving performance superiority under dynamically constrained +communication conditions. + +
+
+
+
+
+ + ☆ Leveraging Temporal Contextualization for Video Action Recognition + + +
+ Pretrained vision-language models have shown effectiveness in video +understanding. However, recent studies have not sufficiently leveraged +essential temporal information from videos, simply averaging frame-wise +representations or referencing consecutive frames. We introduce Temporally +Contextualized CLIP (TC-CLIP), a pioneering framework for video understanding +that effectively and efficiently leverages comprehensive video information. We +propose Temporal Contextualization (TC), a novel layer-wise temporal +information infusion mechanism for video that extracts core information from +each frame, interconnects relevant information across the video to summarize +into context tokens, and ultimately leverages the context tokens during the +feature encoding process. Furthermore, our Video-conditional Prompting (VP) +module manufactures context tokens to generate informative prompts in text +modality. We conduct extensive experiments in zero-shot, few-shot, +base-to-novel, and fully-supervised action recognition to validate the +superiority of our TC-CLIP. Ablation studies for TC and VP guarantee our design +choices. Code is available at https://github.com/naver-ai/tc-clip + +
+
+ comment: 24 pages, 10 figures, 12 tables +
+
+
+
+
+ + ☆ MMCode: Evaluating Multi-Modal Code Large Language Models with Visually + Rich Programming Problems + + +
+ Programming often involves converting detailed and complex specifications +into code, a process during which developers typically utilize visual aids to +more effectively convey concepts. While recent developments in Large Multimodal +Models have demonstrated remarkable abilities in visual reasoning and +mathematical tasks, there is little work on investigating whether these models +can effectively interpret visual elements for code generation. To this end, we +present MMCode, the first multi-modal coding dataset for evaluating algorithmic +problem-solving skills in visually rich contexts. MMCode contains 3,548 +questions and 6,620 images collected from real-world programming challenges +harvested from 10 code competition websites, presenting significant challenges +due to the extreme demand for reasoning abilities. Our experiment results show +that current state-of-the-art models struggle to solve these problems. The +results highlight the lack of powerful vision-code models, and we hope MMCode +can serve as an inspiration for future works in this domain. The data and code +are publicly available at https://github.com/happylkx/MMCode. + +
+
+ comment: 46 pages, 21 figures and 6 tables +
+
+
+
+
+ + ☆ FreqMamba: Viewing Mamba from a Frequency Perspective for Image + Deraining + + +
+ Images corrupted by rain streaks often lose vital frequency information for +perception, and image deraining aims to solve this issue, which relies on global +and local degradation modeling. Recent studies have witnessed the effectiveness +and efficiency of Mamba for perceiving global and local information based on +its exploitation of local correlations among patches; however, few attempts have +been made to extend it with frequency analysis for image deraining, +limiting its ability to perceive global degradation that is relevant to +frequency modeling (e.g. Fourier transform). In this paper, we propose +FreqMamba, an effective and efficient paradigm that leverages the complementarity +between Mamba and frequency analysis for image deraining. The core of our +method lies in extending Mamba with frequency analysis from two perspectives: +extending it with frequency bands for exploiting frequency correlation, and +connecting it with Fourier transform for global degradation modeling. +Specifically, FreqMamba introduces complementary triple interaction structures +including spatial Mamba, frequency band Mamba, and Fourier global modeling. +Frequency band Mamba decomposes the image into sub-bands of different +frequencies to allow 2D scanning from the frequency dimension. Furthermore, +leveraging Mamba's unique data-dependent properties, we use rainy images at +different scales to provide degradation priors to the network, thereby +facilitating efficient training. Extensive experiments show that our method +outperforms state-of-the-art methods both visually and quantitatively. + +
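A simple way to obtain the kind of frequency sub-bands mentioned above is to mask the 2D Fourier spectrum radially and invert each band separately. The radial partition below is an illustrative choice and may differ from FreqMamba's actual band design; it only shows that the bands form a lossless decomposition that downstream scanning can operate on.

```python
import torch

def frequency_bands(x, num_bands=3):
    """Decompose a (B, C, H, W) image into radial frequency sub-bands via the
    2D FFT so each band can be processed/scanned separately."""
    b, c, h, w = x.shape
    spec = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
    yy, xx = torch.meshgrid(torch.linspace(-1, 1, h),
                            torch.linspace(-1, 1, w), indexing="ij")
    radius = torch.sqrt(xx ** 2 + yy ** 2)
    radius = radius / radius.max()
    bands = []
    for i in range(num_bands):
        lo, hi = i / num_bands, (i + 1) / num_bands
        mask = (radius >= lo) & (radius < hi) if i < num_bands - 1 else radius >= lo
        masked = spec * mask.float()
        bands.append(torch.fft.ifft2(torch.fft.ifftshift(masked, dim=(-2, -1))).real)
    return bands  # list of (B, C, H, W) tensors, low to high frequency

x = torch.rand(1, 3, 64, 64)
low, mid, high = frequency_bands(x)
print(torch.allclose(low + mid + high, x, atol=1e-4))  # True: bands sum back to x
```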
+
+
+
+
+ + ☆ Improving Weakly-Supervised Object Localization Using Adversarial + Erasing and Pseudo Label + + +
+ Weakly-supervised learning approaches have gained significant attention due +to their ability to reduce the effort required for human annotations in +training neural networks. This paper investigates a framework for +weakly-supervised object localization, which aims to train a neural network +capable of predicting both the object class and its location using only images +and their image-level class labels. The proposed framework consists of a shared +feature extractor, a classifier, and a localizer. The localizer predicts +pixel-level class probabilities, while the classifier predicts the object class +at the image level. Since image-level class labels are insufficient for +training the localizer, weakly-supervised object localization methods often +encounter challenges in accurately localizing the entire object region. To +address this issue, the proposed method incorporates adversarial erasing and +pseudo labels to improve localization accuracy. Specifically, novel losses are +designed to utilize adversarially erased foreground features and adversarially +erased feature maps, reducing dependence on the most discriminative region. +Additionally, the proposed method employs pseudo labels to suppress activation +values in the background while increasing them in the foreground. The proposed +method is applied to two backbone networks (MobileNetV1 and InceptionV3) and is +evaluated on three publicly available datasets (ILSVRC-2012, CUB-200-2011, and +PASCAL VOC 2012). The experimental results demonstrate that the proposed method +outperforms previous state-of-the-art methods across all evaluated metrics. + +
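The adversarial-erasing step referenced above can be sketched as zeroing out the most discriminative spatial locations of a feature map, as ranked by a class activation map, so that training is pushed onto the remaining object regions. The erase ratio and the per-sample top-k thresholding are placeholder choices, not the paper's exact losses.

```python
import torch

def adversarial_erase(feature_map, cam, erase_ratio=0.3):
    """Zero out the top `erase_ratio` most activated spatial positions per
    sample. feature_map: (B, C, H, W), cam: (B, H, W) activation map."""
    b, _, h, w = feature_map.shape
    flat = cam.reshape(b, -1)
    k = max(1, int(erase_ratio * flat.shape[1]))
    thresh = flat.topk(k, dim=1).values[:, -1]                 # per-sample cutoff
    keep = (cam < thresh.view(b, 1, 1)).float().unsqueeze(1)   # 1 = keep, 0 = erase
    return feature_map * keep

fm = torch.randn(2, 256, 14, 14)
cam = torch.rand(2, 14, 14)
erased = adversarial_erase(fm, cam)  # features with discriminative region removed
```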
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ TCCT-Net: Two-Stream Network Architecture for Fast and Efficient + Engagement Estimation via Behavioral Feature Signals CVPR 2024 + + +
+ Engagement analysis finds various applications in healthcare, education, +advertisement, services. Deep Neural Networks, used for analysis, possess +complex architecture and need large amounts of input data, computational power, +inference time. These constraints challenge embedding systems into devices for +real-time use. To address these limitations, we present a novel two-stream +feature fusion "Tensor-Convolution and Convolution-Transformer Network" +(TCCT-Net) architecture. To better learn the meaningful patterns in the +temporal-spatial domain, we design a "CT" stream that integrates a hybrid +convolutional-transformer. In parallel, to efficiently extract rich patterns +from the temporal-frequency domain and boost processing speed, we introduce a +"TC" stream that uses Continuous Wavelet Transform (CWT) to represent +information in a 2D tensor form. Evaluated on the EngageNet dataset, the +proposed method outperforms existing baselines, utilizing only two behavioral +features (head pose rotations) compared to the 98 used in baseline models. +Furthermore, comparative analysis shows TCCT-Net's architecture offers an +order-of-magnitude improvement in inference speed compared to state-of-the-art +image-based Recurrent Neural Network (RNN) methods. The code will be released +at https://github.com/vedernikovphoto/TCCT_Net. + +
+
+ comment: Accepted for the CVPR 2024 workshop (ABAW) +
+
+
+
+
+ + ☆ Q2A: Querying Implicit Fully Continuous Feature Pyramid to Align + Features for Medical Image Segmentation + + +
+ Recent medical image segmentation methods apply implicit neural +representation (INR) to the decoder for achieving a continuous coordinate +decoding to tackle the drawback of conventional discrete grid-based data +representations. However, the INR-based decoder cannot well handle the feature +misalignment problem brought about by the naive latent code acquisition +strategy in INR. Although there exist many feature alignment works, they all +adopt a progressive multi-step aligning paradigm on a discrete feature pyramid, +which is incompatible with the continuous one-step characteristics of INR-based +decoder, and thus fails to be the solution. Therefore, we propose Q2A, a novel +one-step query-based aligning paradigm, to solve the feature misalignment +problem in the INR-based decoder. Specifically, for each target coordinate, Q2A +first generates several queries depicting the spatial offsets and the cell +resolutions of the contextual features aligned to the coordinate, then +calculates the corresponding aligned features by feeding the queries into a +novel implicit fully continuous feature pyramid (FCFP), finally fuses the +aligned features to predict the class distribution. In FCFP, we further propose +a novel universal partition-and-aggregate strategy (P&A) to replace the naive +interpolation strategy for latent code acquisition in INR, which mitigates the +information loss problem that occurs when the query cell resolution is +relatively large and achieves an effective feature decoding at arbitrary +continuous resolution. We conduct extensive experiments on two medical +datasets, i.e. Glas and Synapse, and a universal dataset, i.e. Cityscapes, and +they show the superiority of the proposed Q2A. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Virtually Enriched NYU Depth V2 Dataset for Monocular Depth Estimation: + Do We Need Artificial Augmentation? + + +
+ We present ANYU, a new virtually augmented version of the NYU depth v2 +dataset, designed for monocular depth estimation. In contrast to the well-known +approach where full 3D scenes of a virtual world are utilized to generate +artificial datasets, ANYU was created by incorporating RGB-D representations of +virtual reality objects into the original NYU depth v2 images. We specifically +did not match each generated virtual object with an appropriate texture and a +suitable location within the real-world image. Instead, an assignment of +texture, location, lighting, and other rendering parameters was randomized to +maximize a diversity of the training data, and to show that it is randomness +that can improve the generalizing ability of a dataset. By conducting extensive +experiments with our virtually modified dataset and validating on the original +NYU depth v2 and iBims-1 benchmarks, we show that ANYU improves the monocular +depth estimation performance and generalization of deep neural networks with +considerably different architectures, especially for the current +state-of-the-art VPD model. To the best of our knowledge, this is the first +work that augments a real-world dataset with randomly generated virtual 3D +objects for monocular depth estimation. We make our ANYU dataset publicly +available in two training configurations with 10% and 100% additional +synthetically enriched RGB-D pairs of training images, respectively, for +efficient training and empirical exploration of virtual augmentation at +https://github.com/ABrain-One/ANYU + +
+
+
+
+
+ + ☆ PhyScene: Physically Interactable 3D Scene Synthesis for Embodied AI CVPR 2024 + + +
+ With recent developments in Embodied Artificial Intelligence (EAI) research, +there has been a growing demand for high-quality, large-scale interactive scene +generation. While prior methods in scene synthesis have prioritized the +naturalness and realism of the generated scenes, the physical plausibility and +interactivity of scenes have been largely left unexplored. To address this +disparity, we introduce PhyScene, a novel method dedicated to generating +interactive 3D scenes characterized by realistic layouts, articulated objects, +and rich physical interactivity tailored for embodied agents. Based on a +conditional diffusion model for capturing scene layouts, we devise novel +physics- and interactivity-based guidance mechanisms that integrate constraints +from object collision, room layout, and object reachability. Through extensive +experiments, we demonstrate that PhyScene effectively leverages these guidance +functions for physically interactable scene synthesis, outperforming existing +state-of-the-art scene synthesis methods by a large margin. Our findings +suggest that the scenes generated by PhyScene hold considerable potential for +facilitating diverse skill acquisition among agents within interactive +environments, thereby catalyzing further advancements in embodied AI research. +Project website: http://physcene.github.io. + +
+
+ comment: Accepted by CVPR 2024, 18 pages +
+
+
+
+
+ + ☆ Improved Object-Based Style Transfer with Single Deep Network + + +
+ This research paper proposes a novel methodology for image-to-image style +transfer on objects utilizing a single deep convolutional neural network. The +proposed approach leverages the You Only Look Once version 8 (YOLOv8) +segmentation model and the backbone neural network of YOLOv8 for style +transfer. The primary objective is to enhance the visual appeal of objects in +images by seamlessly transferring artistic styles while preserving the original +object characteristics. The proposed approach's novelty lies in combining +segmentation and style transfer in a single deep convolutional neural network. +This approach omits the need for multiple stages or models, thus resulting in +simpler training and deployment of the model for practical applications. The +results of this approach are shown on two content images by applying different +style images. The paper also demonstrates the ability to apply style transfer +on multiple objects in the same image. + +
+
+ comment: In Proceedings of the Fourth International Conference on Innovations + in Computational Intelligence and Computer Vision +
+
+
+
+
+ + ☆ CompGS: Efficient 3D Scene Representation via Compressed Gaussian + Splatting + + +
+ Gaussian splatting, renowned for its exceptional rendering quality and +efficiency, has emerged as a prominent technique in 3D scene representation. +However, the substantial data volume of Gaussian splatting impedes its +practical utility in real-world applications. Herein, we propose an efficient +3D scene representation, named Compressed Gaussian Splatting (CompGS), which +harnesses compact Gaussian primitives for faithful 3D scene modeling with a +remarkably reduced data size. To ensure the compactness of Gaussian primitives, +we devise a hybrid primitive structure that captures predictive relationships +between each other. Then, we exploit a small set of anchor primitives for +prediction, allowing the majority of primitives to be encapsulated into highly +compact residual forms. Moreover, we develop a rate-constrained optimization +scheme to eliminate redundancies within such hybrid primitives, steering our +CompGS towards an optimal trade-off between bitrate consumption and +representation efficacy. Experimental results show that the proposed CompGS +significantly outperforms existing methods, achieving superior compactness in +3D scene representation without compromising model accuracy and rendering +quality. Our code will be released on GitHub for further research. + +
+
+ comment: Submitted to a conference +
+
+
+
+
+ + ☆ Utility-Fairness Trade-Offs and How to Find Them + + +
+ When building classification systems with demographic fairness +considerations, there are two objectives to satisfy: 1) maximizing utility for +the specific task and 2) ensuring fairness w.r.t. a known demographic +attribute. These objectives often compete, so optimizing both can lead to a +trade-off between utility and fairness. While existing works acknowledge the +trade-offs and study their limits, two questions remain unanswered: 1) What are +the optimal trade-offs between utility and fairness? and 2) How can we +numerically quantify these trade-offs from data for a desired prediction task +and demographic attribute of interest? This paper addresses these questions. We +introduce two utility-fairness trade-offs: the Data-Space and Label-Space +Trade-off. The trade-offs reveal three regions within the utility-fairness +plane, delineating what is fully and partially possible and impossible. We +propose U-FaTE, a method to numerically quantify the trade-offs for a given +prediction task and group fairness definition from data samples. Based on the +trade-offs, we introduce a new scheme for evaluating representations. An +extensive evaluation of fair representation learning methods and +representations from over 1000 pre-trained models revealed that most current +approaches are far from the estimated and achievable fairness-utility +trade-offs across multiple datasets and prediction tasks. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024 +
+
+
+
+
+ + ☆ Contrastive Mean-Shift Learning for Generalized Category Discovery CVPR 2024 + + +
+ We address the problem of generalized category discovery (GCD) that aims to +partition a partially labeled collection of images; only a small part of the +collection is labeled and the total number of target classes is unknown. To +address this generalized image clustering problem, we revisit the mean-shift +algorithm, i.e., a classic, powerful technique for mode seeking, and +incorporate it into a contrastive learning framework. The proposed method, +dubbed Contrastive Mean-Shift (CMS) learning, trains an image encoder to +produce representations with better clustering properties by an iterative +process of mean shift and contrastive update. Experiments demonstrate that our +method, both in settings with and without the total number of clusters being +known, achieves state-of-the-art performance on six public GCD benchmarks +without bells and whistles. + +
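One mean-shift iteration on normalized embeddings, of the kind alternated with contrastive updates in CMS, can be sketched as moving each point toward a kernel-weighted average of its nearest neighbors. The neighbor count and bandwidth below are illustrative values, and the contrastive-update half of the method is omitted.

```python
import torch
import torch.nn.functional as F

def mean_shift_step(emb, k=10, bandwidth=0.1):
    """One mean-shift iteration on L2-normalized embeddings: each point moves
    toward a softmax-weighted mean of its k nearest neighbors."""
    emb = F.normalize(emb, dim=-1)
    sim = emb @ emb.T                          # cosine similarities (N, N)
    knn_sim, knn_idx = sim.topk(k, dim=-1)     # neighbors include the point itself
    weights = torch.softmax(knn_sim / bandwidth, dim=-1)
    shifted = (weights.unsqueeze(-1) * emb[knn_idx]).sum(dim=1)
    return F.normalize(shifted, dim=-1)

emb = torch.randn(100, 128)
for _ in range(3):        # a few iterations sharpen the modes / cluster structure
    emb = mean_shift_step(emb)
```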
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually + Expanding Large Vocabularies + + +
+ Rapid advancements in continual segmentation have yet to bridge the gap of +scaling to large continually expanding vocabularies under compute-constrained +scenarios. We discover that traditional continual training leads to +catastrophic forgetting under compute constraints, unable to outperform +zero-shot segmentation methods. We introduce a novel strategy for semantic and +panoptic segmentation with zero forgetting, capable of adapting to continually +growing vocabularies without the need for retraining or large memory costs. Our +training-free approach, kNN-CLIP, leverages a database of instance embeddings +to enable open-vocabulary segmentation approaches to continually expand their +vocabulary on any given domain with a single-pass through data, while only +storing embeddings minimizing both compute and memory costs. This method +achieves state-of-the-art mIoU performance across large-vocabulary semantic and +panoptic segmentation datasets. We hope kNN-CLIP represents a step forward in +enabling more efficient and adaptable continual segmentation, paving the way +for advances in real-world large-vocabulary continual segmentation methods. + +
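The retrieval idea described above reduces to maintaining a store of labeled instance embeddings and classifying new region embeddings by nearest-neighbor vote, so the vocabulary can grow with a single pass over new data and no retraining. The class below is a simplified sketch under that assumption, not the paper's actual pipeline or API.

```python
import torch
import torch.nn.functional as F

class EmbeddingDatabase:
    """Minimal kNN retrieval store: add instance embeddings with labels once,
    then classify queries by majority vote over nearest neighbors."""
    def __init__(self):
        self.keys, self.labels = [], []

    def add(self, embeddings, labels):               # single pass over new data
        self.keys.append(F.normalize(embeddings, dim=-1))
        self.labels.extend(labels)

    def classify(self, queries, k=5):
        keys = torch.cat(self.keys, dim=0)
        sim = F.normalize(queries, dim=-1) @ keys.T
        idx = sim.topk(k, dim=-1).indices             # (Q, k) nearest instances
        votes = [[self.labels[j] for j in row] for row in idx.tolist()]
        return [max(set(v), key=v.count) for v in votes]

db = EmbeddingDatabase()
db.add(torch.randn(50, 512), ["cat"] * 25 + ["dog"] * 25)
print(db.classify(torch.randn(2, 512)))
```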
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Exploring Text-to-Motion Generation with Human Preference CVPR 2024 + + +
+ This paper presents an exploration of preference learning in text-to-motion +generation. We find that current improvements in text-to-motion generation +still rely on datasets requiring expert labelers with motion capture systems. +Instead, learning from human preference data does not require motion capture +systems; a labeler with no expertise simply compares two generated motions. +This is particularly efficient because evaluating the model's output is easier +than gathering the motion that performs a desired task (e.g. backflip). To +pioneer the exploration of this paradigm, we annotate 3,528 preference pairs +generated by MotionGPT, marking the first effort to investigate various +algorithms for learning from preference data. In particular, our exploration +highlights important design choices when using preference data. Additionally, +our experimental results show that preference learning has the potential to +greatly improve current text-to-motion generative models. Our code and dataset +are publicly available at +https://github.com/THU-LYJ-Lab/InstructMotion +to further facilitate research in this area. + +
+
+ comment: Accepted to CVPR 2024 HuMoGen Workshop +
+
+
+
+
+ + ☆ The 8th AI City Challenge CVPR 2024 + + +
+ The eighth AI City Challenge highlighted the convergence of computer vision +and artificial intelligence in areas like retail, warehouse settings, and +Intelligent Traffic Systems (ITS), presenting significant research +opportunities. The 2024 edition featured five tracks, attracting unprecedented +interest from 726 teams in 47 countries and regions. Track 1 dealt with +multi-target multi-camera (MTMC) people tracking, highlighting significant +enhancements in camera count, character number, 3D annotation, and camera +matrices, alongside new rules for 3D tracking and online tracking algorithm +encouragement. Track 2 introduced dense video captioning for traffic safety, +focusing on pedestrian accidents using multi-camera feeds to improve insights +for insurance and prevention. Track 3 required teams to classify driver actions +in a naturalistic driving analysis. Track 4 explored fish-eye camera analytics +using the FishEye8K dataset. Track 5 focused on motorcycle helmet rule +violation detection. The challenge utilized two leaderboards to showcase +methods, with participants setting new benchmarks, some surpassing existing +state-of-the-art achievements. + +
+
+ comment: Summary of the 8th AI City Challenge Workshop in conjunction with + CVPR 2024 +
+
+
+
+
+ + ☆ VFMM3D: Releasing the Potential of Image by Vision Foundation Model for + Monocular 3D Object Detection + + +
+ Due to its cost-effectiveness and widespread availability, monocular 3D +object detection, which relies solely on a single camera during inference, +holds significant importance across various applications, including autonomous +driving and robotics. Nevertheless, directly predicting the coordinates of +objects in 3D space from monocular images poses challenges. Therefore, an +effective solution involves transforming monocular images into LiDAR-like +representations and employing a LiDAR-based 3D object detector to predict the +3D coordinates of objects. The key step in this method is accurately converting +the monocular image into a reliable point cloud form. In this paper, we present +VFMM3D, an innovative approach that leverages the capabilities of Vision +Foundation Models (VFMs) to accurately transform single-view images into LiDAR +point cloud representations. VFMM3D utilizes the Segment Anything Model (SAM) +and Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data +enriched with rich foreground information. Specifically, the Depth Anything +Model (DAM) is employed to generate dense depth maps. Subsequently, the Segment +Anything Model (SAM) is utilized to differentiate foreground and background +regions by predicting instance masks. These predicted instance masks and depth +maps are then combined and projected into 3D space to generate pseudo-LiDAR +points. Finally, any object detectors based on point clouds can be utilized to +predict the 3D coordinates of objects. Comprehensive experiments are conducted +on the challenging 3D object detection dataset KITTI. Our VFMM3D establishes a +new state-of-the-art performance. Additionally, experimental results +demonstrate the generality of VFMM3D, showcasing its seamless integration into +various LiDAR-based 3D object detectors. + +
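+ The key step named above is converting a monocular depth map (plus instance masks) into pseudo-LiDAR points. A minimal sketch of that standard pinhole back-projection follows; the camera intrinsics are illustrative KITTI-like values and the masking logic is an assumption, not the paper's exact pipeline.

```python
import numpy as np

def depth_to_pseudo_lidar(depth, mask, fx, fy, cx, cy):
    """Back-project a dense depth map (H, W) into an (N, 3) camera-frame point cloud.

    mask selects the pixels to keep (e.g., foreground pixels from instance masks).
    """
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))    # pixel coordinates
    z = depth[mask]
    x = (u[mask] - cx) * z / fx                       # pinhole camera model
    y = (v[mask] - cy) * z / fy
    return np.stack([x, y, z], axis=1)

depth = np.random.uniform(1.0, 50.0, size=(375, 1242))    # KITTI-like resolution
mask = np.zeros(depth.shape, dtype=bool)
mask[100:200, 300:400] = True                             # pretend this is a predicted instance
points = depth_to_pseudo_lidar(depth, mask, fx=721.5, fy=721.5, cx=609.6, cy=172.9)
print(points.shape)                                       # (10000, 3)
```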
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ViFu: Multiple 360° Objects Reconstruction with Clean Background + via Visible Part Fusion + + +
+ In this paper, we propose a method to segment and recover a static, clean +background and multiple 360° objects from observations of scenes at +different timestamps. Recent works have used neural radiance fields to model 3D +scenes and improved the quality of novel view synthesis, while few studies have +focused on modeling the invisible or occluded parts of the training images. +These under-reconstructed parts constrain both scene editing and rendering +view selection, thereby limiting their utility for synthetic data generation +for downstream tasks. Our basic idea is that, by observing the same set of +objects in various arrangements, parts that are invisible in one scene +may become visible in others. By fusing the visible parts from each scene, +occlusion-free rendering of both background and foreground objects can be +achieved. + We decompose the multi-scene fusion task into two main components: (1) +objects/background segmentation and alignment, where we leverage point +cloud-based methods tailored to our novel problem formulation; (2) radiance +fields fusion, where we introduce a visibility field to quantify the visible +information of radiance fields, and propose visibility-aware rendering for the +fusion of a series of scenes, ultimately obtaining clean background and +360° object rendering. Comprehensive experiments were conducted on +synthetic and real datasets, and the results demonstrate the effectiveness of +our method. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Super-resolution of biomedical volumes with 2D supervision CVPR + + +
+ Volumetric biomedical microscopy has the potential to increase the diagnostic +information extracted from clinical tissue specimens and improve the diagnostic +accuracy of both human pathologists and computational pathology models. +Unfortunately, barriers to integrating 3-dimensional (3D) volumetric microscopy +into clinical medicine include long imaging times, poor depth / z-axis +resolution, and an insufficient amount of high-quality volumetric data. +Leveraging the abundance of high-resolution 2D microscopy data, we introduce +masked slice diffusion for super-resolution (MSDSR), which exploits the +inherent equivalence in the data-generating distribution across all spatial +dimensions of biological specimens. This intrinsic characteristic allows for +super-resolution models trained on high-resolution images from one plane (e.g., +XY) to effectively generalize to others (XZ, YZ), overcoming the traditional +dependency on orientation. We focus on the application of MSDSR to stimulated +Raman histology (SRH), an optical imaging modality for biological specimen +analysis and intraoperative diagnosis, characterized by its rapid acquisition +of high-resolution 2D images but slow and costly optical z-sectioning. To +evaluate MSDSR's efficacy, we introduce a new performance metric, SliceFID, and +demonstrate MSDSR's superior performance over baseline models through extensive +evaluations. Our findings reveal that MSDSR not only significantly enhances the +quality and resolution of 3D volumetric data, but also addresses major +obstacles hindering the broader application of 3D volumetric microscopy in +clinical diagnostics and biomedical research. + +
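+ The idea the abstract relies on is that a volume's XZ and YZ planes can be treated as ordinary 2D images for a super-resolution model trained on XY slices. The snippet below only shows that slicing convention on a toy (Z, Y, X) volume; the diffusion-based MSDSR model itself is not sketched here.

```python
import numpy as np

volume = np.random.rand(64, 256, 256)          # toy (Z, Y, X) stack: few, coarse z-sections

def orthogonal_slices(vol, z, y, x):
    """Return the XY, XZ and YZ slices passing through voxel (z, y, x)."""
    xy = vol[z, :, :]        # native high-resolution acquisition plane
    xz = vol[:, y, :]        # anisotropic plane a 2D model could super-resolve
    yz = vol[:, :, x]
    return xy, xz, yz

xy, xz, yz = orthogonal_slices(volume, 32, 128, 128)
print(xy.shape, xz.shape, yz.shape)            # (256, 256) (64, 256) (64, 256)
```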
+
+ comment: CVPR Workshop on Computer Vision for Microscopy Image Analysis 2024 +
+
+
+
+
+ + ☆ A Review on Machine Learning Algorithms for Dust Aerosol Detection using + Satellite Data + + +
+ Dust storms are associated with certain respiratory illnesses across +different areas in the world. Researchers have devoted time and resources to +study the elements surrounding dust storm phenomena. This paper reviews the +efforts of those who have investigated dust aerosols using sensors onboard +satellites together with machine learning-based approaches. We have reviewed the most +common issues revolving around dust aerosol modeling using different datasets and +different sensors from a historical perspective. Our findings suggest that +multi-spectral approaches based on linear and non-linear combinations of +spectral bands are some of the most successful for visualization and +quantitative analysis; however, when researchers have leveraged machine +learning, performance has improved and new opportunities to solve unique +problems have arisen. + +
+
+ comment: The 23rd International Conference on Artificial Intelligence (ICAI + 2021) +
+
+
+
+
+ + ☆ DeferredGS: Decoupled and Editable Gaussian Splatting with Deferred + Shading + + +
+ Reconstructing and editing 3D objects and scenes both play crucial roles in +computer graphics and computer vision. Neural radiance fields (NeRFs) can +achieve realistic reconstruction and editing results but suffer from +inefficiency in rendering. Gaussian splatting significantly accelerates +rendering by rasterizing Gaussian ellipsoids. However, Gaussian splatting +utilizes a single Spherical Harmonic (SH) function to model both texture and +lighting, limiting independent editing capabilities of these components. +Recently, attempts have been made to decouple texture and lighting with the +Gaussian splatting representation but may fail to produce plausible geometry +and decomposition results on reflective scenes. Additionally, the forward +shading technique they employ introduces noticeable blending artifacts during +relighting, as the geometry attributes of Gaussians are optimized under the +original illumination and may not be suitable for novel lighting conditions. To +address these issues, we introduce DeferredGS, a method for decoupling and +editing the Gaussian splatting representation using deferred shading. To +achieve successful decoupling, we model the illumination with a learnable +environment map and define additional attributes such as texture parameters and +normal direction on Gaussians, where the normal is distilled from a jointly +trained signed distance function. More importantly, we apply deferred shading, +resulting in more realistic relighting effects compared to previous methods. +Both qualitative and quantitative experiments demonstrate the superior +performance of DeferredGS in novel view synthesis and editing tasks. + +
+
+
+
+
+ + ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery + + +
+ Broad-scale marine surveys performed by underwater vehicles significantly +increase the availability of coral reef imagery; however, it is costly and +time-consuming for domain experts to label images. Point label propagation is +an approach used to leverage existing image data labeled with sparse point +labels. The resulting augmented ground truth is then used to train a +semantic segmentation model. Here, we first demonstrate that recent advances in +foundation models enable generation of multi-species coral augmented ground +truth masks using denoised DINOv2 features and K-Nearest Neighbors (KNN), +without the need for any pre-training or custom-designed algorithms. For +extremely sparsely labeled images, we propose a labeling regime based on +human-in-the-loop principles, resulting in significant improvement in +annotation efficiency: if only 5 point labels per image are available, our +proposed human-in-the-loop approach improves on the state-of-the-art by 17.3% +for pixel accuracy and 22.6% for mIoU; and by 10.6% and 19.1% when 10 point +labels per image are available. Even if the human-in-the-loop labeling regime +is not used, the denoised DINOv2 features with a KNN outperform the prior +state-of-the-art by 3.5% for pixel accuracy and 5.7% for mIoU (5 grid points). +We also provide a detailed analysis of how point labeling style and the +quantity of points per image affect the point label propagation quality and +provide general recommendations on maximizing point label efficiency. + +
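+ As a rough sketch of point label propagation in general (not the paper's denoised DINOv2 pipeline or its human-in-the-loop regime), the snippet below propagates sparse point labels to every pixel with a KNN classifier in a generic per-pixel feature space; the feature dimension and k are assumptions.

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def propagate_point_labels(features, point_coords, point_labels, k=5):
    """Propagate sparse point labels to every pixel via KNN in feature space.

    features: (H, W, D) per-pixel features from a frozen backbone.
    point_coords: (P, 2) (row, col) labeled positions; point_labels: (P,) class ids.
    Returns a dense (H, W) label map usable as augmented ground truth.
    """
    h, w, d = features.shape
    labeled = features[point_coords[:, 0], point_coords[:, 1]]
    knn = KNeighborsClassifier(n_neighbors=min(k, len(point_labels)))
    knn.fit(labeled, point_labels)
    return knn.predict(features.reshape(-1, d)).reshape(h, w)

feats = np.random.rand(64, 64, 32)                   # stand-in for dense backbone features
coords = np.random.randint(0, 64, size=(10, 2))      # 10 sparse point labels
labels = np.random.randint(0, 3, size=10)
print(propagate_point_labels(feats, coords, labels).shape)   # (64, 64)
```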
+
+ comment: 10 pages, 6 figures, an additional 4 pages of supplementary material +
+
+
+
+
+ + ☆ Watermark-embedded Adversarial Examples for Copyright Protection against + Diffusion Models + + +
+ Diffusion Models (DMs) have shown remarkable capabilities in various +image-generation tasks. However, there are growing concerns that DMs could be +used to imitate unauthorized creations and thus raise copyright issues. To +address this issue, we propose a novel framework that embeds personal +watermarks in the generation of adversarial examples. Such examples can force +DMs to generate images with visible watermarks and prevent DMs from imitating +unauthorized images. We construct a generator based on conditional adversarial +networks and design three losses (adversarial loss, GAN loss, and perturbation +loss) to generate adversarial examples that have subtle perturbation but can +effectively attack DMs to prevent copyright violations. Training a generator +for a personal watermark by our method only requires 5-10 samples within 2-3 +minutes, and once the generator is trained, it can generate adversarial +examples with that watermark significantly fast (0.2s per image). We conduct +extensive experiments in various conditional image-generation scenarios. +Compared to existing methods that generate images with chaotic textures, our +method adds visible watermarks on the generated images, which is a more +straightforward way to indicate copyright violations. We also observe that our +adversarial examples exhibit good transferability across unknown generative +models. Therefore, this work provides a simple yet powerful way to protect +copyright from DM-based imitation. + +
+
+
+
+
+ + ☆ Masked and Shuffled Blind Spot Denoising for Real-World Images + + +
+ We introduce a novel approach to single image denoising based on the Blind +Spot Denoising principle, which we call MAsked and SHuffled Blind Spot +Denoising (MASH). We focus on the case of correlated noise, which often plagues +real images. MASH is the result of a careful analysis to determine the +relationships between the level of blindness (masking) of the input and the +(unknown) noise correlation. Moreover, we introduce a shuffling technique to +weaken the local correlation of noise, which in turn yields an additional +denoising performance improvement. We evaluate MASH via extensive experiments +on real-world noisy image datasets. We demonstrate on par or better results +compared to existing self-supervised denoising methods. + +
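+ The exact masking level and shuffling schedule of MASH are not given in this summary; the snippet below is only a generic local-shuffle utility (window size is an assumption) illustrating how permuting pixels within small windows weakens short-range noise correlation while leaving global image content largely intact.

```python
import numpy as np

def local_shuffle(img, window=2, seed=0):
    """Randomly permute pixels inside non-overlapping window x window blocks of a 2D image."""
    rng = np.random.default_rng(seed)
    h, w = img.shape
    out = img.copy()
    for i in range(0, h - h % window, window):
        for j in range(0, w - w % window, window):
            block = out[i:i + window, j:j + window].ravel()
            out[i:i + window, j:j + window] = rng.permutation(block).reshape(window, window)
    return out

noisy = np.random.rand(128, 128)     # stand-in for a real image with spatially correlated noise
shuffled = local_shuffle(noisy)
print(shuffled.shape)
```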
+
+
+
+
+ + ☆ RankCLIP: Ranking-Consistent Language-Image Pretraining + + +
+ Among the ever-evolving development of vision-language models, contrastive +language-image pretraining (CLIP) has set new benchmarks in many downstream +tasks such as zero-shot classifications by leveraging self-supervised +contrastive learning on large amounts of text-image pairs. However, its +dependency on rigid one-to-one mappings overlooks the complex and often +multifaceted relationships between and within texts and images. To this end, we +introduce RankCLIP, a novel pretraining method that extends beyond the rigid +one-to-one matching framework of CLIP and its variants. By leveraging both +in-modal and cross-modal ranking consistency, RankCLIP improves the alignment +process, enabling it to capture the nuanced many-to-many relationships between +and within each modality. Through comprehensive experiments, we demonstrate the +enhanced capability of RankCLIP to effectively improve performance across +various downstream tasks, notably achieving significant gains in zero-shot +classifications over state-of-the-art methods, underscoring the potential of +RankCLIP in further advancing vision-language pretraining. + +
+
+ comment: 10 pages, 3 figures, 6 tables. Code and model checkpoints are + available at https://github.com/Jam1ezhang/RankCLIP +
+
+
+
+
+ + ☆ CryoMAE: Few-Shot Cryo-EM Particle Picking with Masked Autoencoders + + +
+ Cryo-electron microscopy (cryo-EM) emerges as a pivotal technology for +determining the architecture of cells, viruses, and protein assemblies at +near-atomic resolution. Traditional particle picking, a key step in cryo-EM, +struggles with manual effort and automated methods' sensitivity to low +signal-to-noise ratio (SNR) and varied particle orientations. Furthermore, +existing neural network (NN)-based approaches often require extensive labeled +datasets, limiting their practicality. To overcome these obstacles, we +introduce cryoMAE, a novel approach based on few-shot learning that harnesses +the capabilities of Masked Autoencoders (MAE) to enable efficient selection of +single particles in cryo-EM images. Contrary to conventional NN-based +techniques, cryoMAE requires only a minimal set of positive particle images for +training yet demonstrates high performance in particle detection. Furthermore, +the implementation of a self-cross similarity loss ensures distinct features +for particle and background regions, thereby enhancing the discrimination +capability of cryoMAE. Experiments on large-scale cryo-EM datasets show that +cryoMAE outperforms existing state-of-the-art (SOTA) methods, improving 3D +reconstruction resolution by up to 22.4%. + +
+
+
+
+
+ + ☆ PD-L1 Classification of Weakly-Labeled Whole Slide Images of Breast + Cancer + + +
+ Specific and effective breast cancer therapy relies on the accurate +quantification of PD-L1 positivity in tumors, which appears in the form of +brown stainings in high resolution whole slide images (WSIs). However, the +retrieval and extensive labeling of PD-L1 stained WSIs is a time-consuming and +challenging task for pathologists, resulting in low reproducibility, especially +for borderline images. This study aims to develop and compare models able to +classify PD-L1 positivity of breast cancer samples based on WSI analysis, +relying only on WSI-level labels. The task consists of two phases: identifying +regions of interest (ROI) and classifying tumors as PD-L1 positive or negative. +For the latter, two model categories were developed, with different feature +extraction methodologies. The first encodes images based on the colour distance +from a base color. The second uses a convolutional autoencoder to obtain +embeddings of WSI tiles, and aggregates them into a WSI-level embedding. For +both model types, features are fed into downstream ML classifiers. Two datasets +from different clinical centers were used in two different training +configurations: (1) training on one dataset and testing on the other; (2) +combining the datasets. We also tested the performance with or without human +preprocessing to remove brown artefacts. Colour-distance-based models achieve +the best performances on testing configuration (1) with artefact removal, while +autoencoder-based models are superior in the remaining cases, which are prone +to greater data variability. + +
+
+
+
+
+ + ☆ Forensic Iris Image-Based Post-Mortem Interval Estimation + + +
+ Post-mortem iris recognition is an emerging application of iris-based human +identification in a forensic setup. One factor that may be useful in +conditioning iris recognition methods is the tissue decomposition level, which +is correlated with the post-mortem interval (PMI), i.e., the number of hours +that have elapsed since death. PMI, however, is not always available, and its +precise estimation remains one of the core challenges in forensic examination. +This paper presents the first method known to us for PMI estimation directly +from forensic iris images. To assess the feasibility of iris-based PMI +estimation, convolutional neural network-based models (VGG19, DenseNet121, +ResNet152, and Inception_v3) were trained to predict the PMI from (a) +near-infrared (NIR), (b) visible (RGB), and (c) multispectral forensic iris +images. Models were evaluated following a 10-fold cross-validation in (S1) +sample-disjoint, (S2) subject-disjoint, and (S3) cross-dataset scenarios. We +found that using the multispectral data offers a remarkably low mean +absolute error (MAE) of approximately 3.5 hours in scenario (S1), a higher +MAE of approximately 17.5 hours in scenario (S2), and an MAE of approximately +69.0 hours in scenario (S3). This suggests that if the environmental +conditions are favorable (e.g., bodies are kept at low temperatures), forensic +iris images provide features that are indicative of the PMI, which can then be +automatically estimated. The source code and model weights are made available +with the paper. + +
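+ The models listed above are standard CNN backbones trained to regress a single PMI value. A hedged sketch of that setup follows, using a small torchvision backbone and an L1 (MAE) objective; the backbone choice, input size, and training details are assumptions and differ from the released models.

```python
import torch
import torch.nn as nn
from torchvision import models

class PMIRegressor(nn.Module):
    """CNN that maps an iris image to a single predicted PMI value in hours."""
    def __init__(self):
        super().__init__()
        backbone = models.resnet18(weights=None)        # small stand-in for VGG19/DenseNet121/...
        backbone.fc = nn.Linear(backbone.fc.in_features, 1)
        self.backbone = backbone

    def forward(self, x):
        return self.backbone(x).squeeze(1)              # (B,) predicted hours

model = PMIRegressor()
images = torch.randn(4, 3, 224, 224)                    # toy batch of iris crops
target_pmi = torch.tensor([12.0, 48.0, 3.5, 70.0])
loss = nn.functional.l1_loss(model(images), target_pmi) # MAE objective, matching the reported metric
loss.backward()
print(loss.item())
```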
+
+
+
+
+ + ☆ High-Resolution Detection of Earth Structural Heterogeneities from + Seismic Amplitudes using Convolutional Neural Networks with Attention layers + + +
+ Earth structural heterogeneities have a remarkable role in the petroleum +economy for both exploration and production projects. Automatic detection of +detailed structural heterogeneities is challenging when considering modern +machine learning techniques like deep neural networks. Typically, these +techniques can be an excellent tool for assisted interpretation of such +heterogeneities, but it heavily depends on the amount of data to be trained. + We propose an efficient and cost-effective architecture for detecting seismic +structural heterogeneities using Convolutional Neural Networks (CNNs) combined +with Attention layers. The attention mechanism reduces costs and enhances +accuracy, even in cases with relatively noisy data. Our model has half the +parameters compared to the state-of-the-art, and it outperforms previous +methods in terms of Intersection over Union (IoU) by 0.6% and precision by +0.4%. By leveraging synthetic data, we apply transfer learning to train and +fine-tune the model, addressing the challenge of limited annotated data +availability. + +
+
+
+
+
+ + ☆ Self-Supervised Learning Featuring Small-Scale Image Dataset for + Treatable Retinal Diseases Classification + + +
+ Automated medical diagnosis through image-based neural networks has increased +in popularity and matured over the years. Nevertheless, it is confined by the +scarcity of medical images and expensive labor annotation costs. +Self-Supervised Learning (SSL) is a good alternative to Transfer Learning (TL) +and is suitable for imbalanced image datasets. In this study, we assess four +pretrained SSL models and two TL models in treatable retinal diseases +classification using small-scale Optical Coherence Tomography (OCT) image sets +ranging from 125 to 4,000 images, with balanced or imbalanced distributions, for training. +The proposed SSL model achieves state-of-the-art accuracy of 98.84% using only +4,000 training images. Our results suggest the SSL models provide superior +performance under both the balanced and imbalanced training scenarios. The SSL +model with the MoCo-v2 scheme shows consistently good performance under the imbalanced +scenario and, in particular, surpasses the other models when the training set +contains fewer than 500 images. + +
+
+
+
+
+ + ☆ EyeFormer: Predicting Personalized Scanpaths with Transformer-Guided + Reinforcement Learning + + +
+ From a visual perception perspective, modern graphical user interfaces (GUIs) +comprise a complex graphics-rich two-dimensional visuospatial arrangement of +text, images, and interactive objects such as buttons and menus. While existing +models can accurately predict regions and objects that are likely to attract +attention ``on average'', so far there is no scanpath model capable of +predicting scanpaths for an individual. To close this gap, we introduce +EyeFormer, which leverages a Transformer architecture as a policy network to +guide a deep reinforcement learning algorithm that controls gaze locations. Our +model has the unique capability of producing personalized predictions when +given a few user scanpath samples. It can predict full scanpath information, +including fixation positions and duration, across individuals and various +stimulus types. Additionally, we demonstrate applications in GUI layout +optimization driven by our model. Our software and models will be publicly +available. + +
+
+
+
+
+ + ☆ Salient Object-Aware Background Generation using Text-Guided Diffusion + Models CVPR 2024 + + +
+ Generating background scenes for salient objects plays a crucial role across +various domains including creative design and e-commerce, as it enhances the +presentation and context of subjects by integrating them into tailored +environments. Background generation can be framed as a task of text-conditioned +outpainting, where the goal is to extend image content beyond a salient +object's boundaries on a blank background. Although popular diffusion models +for text-guided inpainting can also be used for outpainting by mask inversion, +they are trained to fill in missing parts of an image rather than to place an +object into a scene. Consequently, when used for background creation, +inpainting models frequently extend the salient object's boundaries and thereby +change the object's identity, which is a phenomenon we call "object expansion." +This paper introduces a model for adapting inpainting diffusion models to the +salient object outpainting task using Stable Diffusion and ControlNet +architectures. We present a series of qualitative and quantitative results +across models and datasets, including a newly proposed metric to measure object +expansion that does not require any human labeling. Compared to Stable +Diffusion 2.0 Inpainting, our proposed approach reduces object expansion by +3.6x on average with no degradation in standard visual metrics across multiple +datasets. + +
+
+ comment: Accepted for publication at CVPR 2024's Generative Models for + Computer Vision workshop +
+
+
+
+
+ + ☆ SegFormer3D: an Efficient Transformer for 3D Medical Image Segmentation CVPR + + +
+ The adoption of Vision Transformer (ViT)-based architectures represents a +significant advancement in 3D Medical Image (MI) segmentation, surpassing +traditional Convolutional Neural Network (CNN) models by enhancing global +contextual understanding. While this paradigm shift has significantly enhanced +3D segmentation performance, state-of-the-art models require extremely +large and complex architectures with large-scale computing resources for +training and deployment. Furthermore, in the context of limited datasets, often +encountered in medical imaging, larger models can present hurdles in both model +generalization and convergence. In response to these challenges and to +demonstrate that lightweight models are a valuable area of research in 3D +medical imaging, we present SegFormer3D, a hierarchical Transformer that +calculates attention across multiscale volumetric features. Additionally, +SegFormer3D avoids complex decoders and uses an all-MLP decoder to aggregate +local and global attention features to produce highly accurate segmentation +masks. The proposed memory-efficient Transformer preserves the performance +characteristics of a significantly larger model in a compact design. +SegFormer3D democratizes deep learning for 3D medical image segmentation by +offering a model with 33x fewer parameters and a 13x reduction in GFLOPS +compared to the current state-of-the-art (SOTA). We benchmark SegFormer3D +against the current SOTA models on three widely used datasets, Synapse, BraTS, +and ACDC, achieving competitive results. Code: +https://github.com/OSUPCVLab/SegFormer3D.git + +
+
+ comment: Accepted at CVPR Workshop 2024 +
+
+
+
+
+ + ☆ Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban + Crime Dynamics + + +
+ This study addresses the challenge of urban safety in New York City by +examining the relationship between the built environment and crime rates using +machine learning and a comprehensive dataset of street view images. We aim to +identify how urban landscapes correlate with crime statistics, focusing on the +characteristics of street views and their association with crime rates. The +findings offer insights for urban planning and crime prevention, highlighting +the potential of environmental design in enhancing public safety. + +
+
+
+
+
+ + ☆ Cross-Modal Self-Training: Aligning Images and Pointclouds to Learn + Classification without Labels CVPR 2024 + + +
+ Large-scale 2D vision-language models, such as CLIP, can be aligned +with a 3D encoder to learn generalizable (open-vocabulary) 3D vision models. +However, current methods require supervised pre-training for such alignment, +and the performance of such 3D zero-shot models remains sub-optimal for +real-world adaptation. In this work, we propose an optimization framework, +Cross-MoST: Cross-Modal Self-Training, to improve the label-free classification +performance of a zero-shot 3D vision model by simply leveraging unlabeled 3D +data and their accompanying 2D views. We propose a student-teacher framework to +simultaneously process 2D views and 3D point clouds and generate joint pseudo +labels to train a classifier and guide cross-modal feature alignment. We thereby +demonstrate that 2D vision-language models such as CLIP can be used to +complement 3D representation learning to improve classification performance +without the need for expensive class annotations. Using synthetic and +real-world 3D datasets, we further demonstrate that Cross-MoST enables +efficient cross-modal knowledge exchange, resulting in both image and point +cloud modalities learning from each other's rich representations. + +
+
+ comment: To be published in Workshop for Learning 3D with Multi-View + Supervision (3DMV) at CVPR 2024 +
+
+
+
+
+ + ☆ ANCHOR: LLM-driven News Subject Conditioning for Text-to-Image Synthesis + + +
+ Text-to-Image (T2I) Synthesis has made tremendous strides in enhancing +synthesized image quality, but current datasets evaluate model performance only +on descriptive, instruction-based prompts. Real-world news image captions take +a more pragmatic approach, providing high-level situational and Named-Entity +(NE) information and limited physical object descriptions, making them +abstractive. To evaluate the ability of T2I models to capture intended subjects +from news captions, we introduce the Abstractive News Captions with High-level +cOntext Representation (ANCHOR) dataset, containing 70K+ samples sourced from 5 +different news media organizations. With Large Language Models (LLM) achieving +success in language and commonsense reasoning tasks, we explore the ability of +different LLMs to identify and understand key subjects from abstractive +captions. Our proposed method Subject-Aware Finetuning (SAFE), selects and +enhances the representation of key subjects in synthesized images by leveraging +LLM-generated subject weights. It also adapts to the domain distribution of +news images and captions through custom Domain Fine-tuning, outperforming +current T2I baselines on ANCHOR. By launching the ANCHOR dataset, we hope to +motivate research in furthering the Natural Language Understanding (NLU) +capabilities of T2I models. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ WB LUTs: Contrastive Learning for White Balancing Lookup Tables + + +
+ Automatic white balancing (AWB), one of the first steps in an image +signal processing (ISP) pipeline, aims to correct the color cast induced by the +scene illuminant. An incorrect white balance (WB) setting or AWB failure can +lead to an undesired blue or red tint in the rendered sRGB image. To address +this, recent methods pose the post-capture WB correction problem as an +image-to-image translation task and train deep neural networks to learn the +necessary color adjustments at a lower resolution. These low resolution outputs +are post-processed to generate high resolution WB corrected images, forming a +bottleneck in the end-to-end run time. In this paper, we present a 3D Lookup +Table (LUT) based WB correction model called WB LUTs that can generate high +resolution outputs in real time. We introduce a contrastive learning framework +with a novel hard sample mining strategy, which improves the WB correction +quality of baseline 3D LUTs by 25.5%. Experimental results demonstrate that the +proposed WB LUTs perform competitively against state-of-the-art models on two +benchmark datasets while being 300 times faster and using 12.7 times less memory. +Our model and code are available at https://github.com/skrmanne/3DLUT_sRGB_WB. + +
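+ The learned LUTs, contrastive training, and hard sample mining are not reproduced here; the snippet below only sketches the underlying primitive, applying a 3D lookup table to an sRGB image with trilinear interpolation, using an identity LUT as a sanity check (the LUT size and [0, 1] value range are assumptions).

```python
import numpy as np
from scipy.interpolate import RegularGridInterpolator

def apply_3d_lut(image, lut):
    """Apply a 3D lookup table to an RGB image in [0, 1] with trilinear interpolation.

    image: (H, W, 3) float array; lut: (S, S, S, 3) array where lut[r, g, b] is the output color.
    """
    grid = np.linspace(0.0, 1.0, lut.shape[0])
    interp = RegularGridInterpolator((grid, grid, grid), lut)
    return interp(image.reshape(-1, 3)).reshape(image.shape)

size = 17                                               # assumed LUT resolution
axes = np.meshgrid(*([np.linspace(0, 1, size)] * 3), indexing="ij")
identity_lut = np.stack(axes, axis=-1)                  # maps every color to itself
img = np.random.rand(256, 256, 3)
out = apply_3d_lut(img, identity_lut)
print(np.allclose(out, img))                            # True: identity LUT changes nothing
```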
+
+
+
+
+ + ☆ NOISe: Nuclei-Aware Osteoclast Instance Segmentation for Mouse-to-Human + Domain Transfer + + +
+ Osteoclast cell image analysis plays a key role in osteoporosis research, but +it typically involves extensive manual image processing and hand annotations by +a trained expert. In the last few years, a handful of machine learning +approaches for osteoclast image analysis have been developed, but none have +addressed the full instance segmentation task required to produce the same +output as that of the human expert led process. Furthermore, none of the prior, +fully automated algorithms have publicly available code, pretrained models, or +annotated datasets, inhibiting reproduction and extension of their work. We +present a new dataset with ~2*10^5 expert annotated mouse osteoclast masks, +together with a deep learning instance segmentation method which works for both +in vitro mouse osteoclast cells on plastic tissue culture plates and human +osteoclast cells on bone chips. To our knowledge, this is the first work to +automate the full osteoclast instance segmentation task. Our method achieves a +performance of 0.82 mAP_0.5 (mean average precision at intersection-over-union +threshold of 0.5) in cross validation for mouse osteoclasts. We present a novel +nuclei-aware osteoclast instance segmentation training strategy (NOISe) based +on the unique biology of osteoclasts, to improve the model's generalizability +and boost the mAP_0.5 from 0.60 to 0.82 on human osteoclasts. We publish our +annotated mouse osteoclast image dataset, instance segmentation models, and +code at github.com/michaelwwan/noise to enable reproducibility and to provide a +public tool to accelerate osteoporosis research. + +
+
+
+
+
+ + ☆ Epistemic Uncertainty Quantification For Pre-trained Neural Network CVPR 2024 + + +
+ Epistemic uncertainty quantification (UQ) identifies where models lack +knowledge. Traditional UQ methods, often based on Bayesian neural networks, are +not suitable for pre-trained non-Bayesian models. Our study addresses +quantifying epistemic uncertainty for any pre-trained model, which does not +need the original training data or model modifications and can ensure broad +applicability regardless of network architectures or training techniques. +Specifically, we propose a gradient-based approach to assess epistemic +uncertainty, analyzing the gradients of outputs relative to model parameters, +and thereby indicating necessary model adjustments to accurately represent the +inputs. We first explore theoretical guarantees of gradient-based methods for +epistemic UQ, questioning the view that this uncertainty is only calculable +through differences between multiple models. We further improve gradient-driven +UQ by using class-specific weights for integrating gradients and emphasizing +distinct contributions from neural network layers. Additionally, we enhance UQ +accuracy by combining gradient and perturbation methods to refine the +gradients. We evaluate our approach on out-of-distribution detection, +uncertainty calibration, and active learning, demonstrating its superiority +over current state-of-the-art UQ methods for pre-trained models. + +
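+ As a minimal illustration of the gradient-based idea described above (not the paper's class-weighted, layer-aware, perturbation-refined estimator), the snippet below scores one input by the norm of the gradient of the predicted class's log-probability with respect to the parameters of a frozen pre-trained model; the toy model is an assumption.

```python
import torch
import torch.nn as nn

def gradient_uncertainty(model, x):
    """Epistemic score: parameter-gradient norm of the predicted class's log-probability."""
    model.zero_grad()
    logits = model(x.unsqueeze(0))
    log_prob = torch.log_softmax(logits, dim=1)[0, int(logits.argmax())]
    grads = torch.autograd.grad(log_prob, [p for p in model.parameters() if p.requires_grad])
    return torch.sqrt(sum(g.pow(2).sum() for g in grads)).item()

model = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10)).eval()
score = gradient_uncertainty(model, torch.randn(1, 28, 28))   # larger = model needs bigger adjustment
print(score)
```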
+
+ comment: Published at CVPR 2024 +
+
+
+
+
+ + ☆ GeoAI Reproducibility and Replicability: a computational and spatial + perspective + + +
+ GeoAI has emerged as an exciting interdisciplinary research area that +combines spatial theories and data with cutting-edge AI models to address +geospatial problems in a novel, data-driven manner. While GeoAI research has +flourished in the GIScience literature, its reproducibility and replicability +(R&R), fundamental principles that determine the reusability, reliability, and +scientific rigor of research findings, have rarely been discussed. This paper +aims to provide an in-depth analysis of this topic from both computational and +spatial perspectives. We first categorize the major goals for reproducing GeoAI +research, namely, validation (repeatability), learning and adapting the method +for solving a similar or new problem (reproducibility), and examining the +generalizability of the research findings (replicability). Each of these goals +requires different levels of understanding of GeoAI, as well as different +methods to ensure its success. We then discuss the factors that may cause the +lack of R&R in GeoAI research, with an emphasis on (1) the selection and use of +training data; (2) the uncertainty that resides in the GeoAI model design, +training, deployment, and inference processes; and more importantly (3) the +inherent spatial heterogeneity of geospatial data and processes. We use a deep +learning-based image analysis task as an example to demonstrate the results' +uncertainty and spatial variance caused by different factors. The findings +reiterate the importance of knowledge sharing, as well as the generation of a +"replicability map" that incorporates spatial autocorrelation and spatial +heterogeneity into consideration in quantifying the spatial replicability of +GeoAI research. + +
+
+ comment: Accepted by Annals of the American Association of Geographers +
+
+
+
+
+ + ☆ Vision Augmentation Prediction Autoencoder with Attention Design + (VAPAAD) + + +
+ Despite significant advancements in sequence prediction, current methods lack +attention-based mechanisms for next-frame prediction. Our work introduces +VAPAAD, or Vision Augmentation Prediction Autoencoder with Attention Design, an +innovative model that enhances predictive performance by integrating attention +designs, allowing for nuanced understanding and handling of temporal dynamics +in video sequences. Using the widely used Moving MNIST dataset, we demonstrate the +robust performance of the proposed model and the potential applicability of such +a design in the literature. + +
+
+
+
+
+ + ☆ Low-Light Image Enhancement Framework for Improved Object Detection in + Fisheye Lens Datasets + + +
+ This study addresses the evolving challenges in urban traffic monitoring +detection systems based on fisheye lens cameras by proposing a framework that +improves the efficacy and accuracy of these systems. In the context of urban +infrastructure and transportation management, advanced traffic monitoring +systems have become critical for managing the complexities of urbanization and +increasing vehicle density. Traditional monitoring methods, which rely on +static cameras with narrow fields of view, are ineffective in dynamic urban +environments, necessitating the installation of multiple cameras, which raises +costs. Fisheye lenses, which were recently introduced, provide wide and +omnidirectional coverage in a single frame, making them a transformative +solution. However, issues such as distorted views and blurriness arise, +preventing accurate object detection on these images. Motivated by these +challenges, this study proposes a novel approach that combines a +transformer-based image enhancement framework and an ensemble learning technique to +address these challenges and improve traffic monitoring accuracy, making +significant contributions to the future of intelligent traffic management +systems. Our proposed methodological framework won 5th place in the 2024 AI +City Challenge, Track 4, with an F1 score of 0.5965 on experimental validation +data. The experimental results demonstrate the effectiveness, efficiency, and +robustness of the proposed system. Our code is publicly available at +https://github.com/daitranskku/AIC2024-TRACK4-TEAM15. + +
+
+
+
+
+ + ☆ Explainable Light-Weight Deep Learning Pipeline for Improved Drought + Stress + + +
+ Early identification of drought stress in crops is vital for implementing +effective mitigation measures and reducing yield loss. Non-invasive imaging +techniques hold immense potential by capturing subtle physiological changes in +plants under water deficit. Sensor-based imaging data serve as a rich source +of information for machine learning and deep learning algorithms, facilitating +further analysis aimed at identifying drought stress. While these approaches +yield favorable results, real-time field applications require algorithms +specifically designed for the complexities of natural agricultural conditions. +Our work proposes a novel deep learning framework for classifying drought +stress in potato crops captured by UAVs in natural settings. The novelty lies +in the synergistic combination of a pretrained network with carefully designed +custom layers. This architecture leverages the feature extraction capabilities of +the pre-trained network while the custom layers enable targeted dimensionality +reduction and enhanced regularization, ultimately leading to improved +performance. A key innovation of our work involves the integration of +Gradient-weighted Class Activation Mapping (Grad-CAM), an explainability technique. +Grad-CAM sheds light on the internal workings of the deep learning model, +typically referred to as a black box. By visualizing the focus areas of the +model within the images, Grad-CAM fosters interpretability and builds trust in +the decision-making process of the model. Our proposed framework achieves +superior performance, particularly with the DenseNet121 pre-trained network, +reaching a precision of 98% for identifying the stressed class with an overall +accuracy of 90%. Comparative analysis with existing state-of-the-art object +detection algorithms reveals that our approach achieves significantly +higher precision and accuracy. + +
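+ Grad-CAM itself is a standard explainability technique; the sketch below shows the usual computation (gradient-weighted sum of a convolutional layer's feature maps, ReLU, upsampling) on a randomly initialized DenseNet121. The hooked layer, input size, and normalization are assumptions, not the authors' configuration.

```python
import torch
import torch.nn.functional as F
from torchvision import models

def grad_cam(model, layer, image, class_idx=None):
    """Return a Grad-CAM heatmap of shape (H, W) for one (C, H, W) image."""
    feats, grads = {}, {}
    h1 = layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
    h2 = layer.register_full_backward_hook(lambda m, gi, go: grads.update(a=go[0]))
    logits = model(image.unsqueeze(0))
    class_idx = int(logits.argmax()) if class_idx is None else class_idx
    logits[0, class_idx].backward()
    h1.remove(); h2.remove()
    weights = grads["a"].mean(dim=(2, 3), keepdim=True)      # GAP of gradients per channel
    cam = F.relu((weights * feats["a"]).sum(dim=1))          # weighted sum of feature maps
    cam = F.interpolate(cam.unsqueeze(1), size=image.shape[1:], mode="bilinear")
    return (cam / cam.max().clamp(min=1e-8)).squeeze().detach()

model = models.densenet121(weights=None).eval()              # randomly initialized stand-in
heatmap = grad_cam(model, model.features.denseblock4, torch.randn(3, 224, 224))
print(heatmap.shape)                                         # torch.Size([224, 224])
```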
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ AIGeN: An Adversarial Approach for Instruction Generation in VLN + + +
+ In the last few years, the research interest in Vision-and-Language +Navigation (VLN) has grown significantly. VLN is a challenging task that +involves an agent following human instructions and navigating in a previously +unknown environment to reach a specified goal. Recent work in literature +focuses on different ways to augment the available datasets of instructions for +improving navigation performance by exploiting synthetic training data. In this +work, we propose AIGeN, a novel architecture inspired by Generative Adversarial +Networks (GANs) that produces meaningful and well-formed synthetic instructions +to improve navigation agents' performance. The model is composed of a +Transformer decoder (GPT-2) and a Transformer encoder (BERT). During the +training phase, the decoder generates sentences for a sequence of images +describing the agent's path to a particular point while the encoder +discriminates between real and fake instructions. Experimentally, we evaluate +the quality of the generated instructions and perform extensive ablation +studies. Additionally, we generate synthetic instructions for 217K trajectories +using AIGeN on Habitat-Matterport 3D Dataset (HM3D) and show an improvement in +the performance of an off-the-shelf VLN method. The validation analysis of our +proposal is conducted on REVERIE and R2R and highlights the promising aspects +of our proposal, achieving state-of-the-art performance. + +
+
+ comment: Accepted to 7th Multimodal Learning and Applications Workshop (MULA + 2024) at the IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 +
+
+
+
+
+ + ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with diffusion prior, they remain struggling to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Project page: https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ☆ No More Ambiguity in 360° Room Layout via Bi-Layout Estimation CVPR 2024 + + +
+ Inherent ambiguity in layout annotations poses significant challenges to +developing accurate 360° room layout estimation models. To address this +issue, we propose a novel Bi-Layout model capable of predicting two distinct +layout types. One stops at ambiguous regions, while the other extends to +encompass all visible areas. Our model employs two global context embeddings, +where each embedding is designed to capture specific contextual information for +each layout type. With our novel feature guidance module, the image feature +retrieves relevant context from these embeddings, generating layout-aware +features for precise bi-layout predictions. A unique property of our Bi-Layout +model is its ability to inherently detect ambiguous regions by comparing the +two predictions. To circumvent the need for manual correction of ambiguous +annotations during testing, we also introduce a new metric for disambiguating +ground truth layouts. Our method demonstrates superior performance on benchmark +datasets, notably outperforming leading approaches. Specifically, on the +MatterportLayout dataset, it improves 3DIoU from 81.70% to 82.57% across the +full test set and notably from 54.80% to 59.97% in subsets with significant +ambiguity. Project page: https://liagm.github.io/Bi_Layout/ + +
+
+ comment: CVPR 2024, Project page: https://liagm.github.io/Bi_Layout/ +
+
+
+
+
+ + ☆ MMInA: Benchmarking Multihop Multimodal Internet Agents + + +
+ Autonomous embodied agents live on an Internet of multimedia websites. Can +they hop around multimodal websites to complete complex user tasks? Existing +benchmarks fail to assess them in a realistic, evolving environment for their +embodiment across websites. To answer this question, we present MMInA, a +multihop and multimodal benchmark to evaluate the embodied agents for +compositional Internet tasks, with several appealing properties: 1) Evolving +real-world multimodal websites. Our benchmark uniquely operates on evolving +real-world websites, ensuring a high degree of realism and applicability to +natural user tasks. Our data includes 1,050 human-written tasks covering +various domains such as shopping and travel, with each task requiring the agent +to autonomously extract multimodal information from web pages as observations; +2) Multihop web browsing. Our dataset features naturally compositional tasks +that require information from or actions on multiple websites to solve, to +assess long-range reasoning capabilities on web tasks; 3) Holistic evaluation. +We propose a novel protocol for evaluating an agent's progress in completing +multihop tasks. We experiment with both standalone (multimodal) language models +and heuristic-based web agents. Extensive experiments demonstrate that while +long-chain multihop web tasks are easy for humans, they remain challenging for +state-of-the-art web agents. We identify that agents are more likely to fail on +the early hops when solving tasks of more hops, which results in lower task +success rates. To address this issue, we propose a simple memory augmentation +approach replaying past action trajectories to reflect. Our method +significantly improved both the single-hop and multihop web browsing abilities +of agents. See our code and data at https://mmina.cliangyu.com + +
+
+
+
+
+ + EgoPet: Egomotion and Interaction Data from an Animal's Perspective + + +
+ Animals perceive the world to plan their actions and interact with other +agents to accomplish complex tasks, demonstrating capabilities that are still +unmatched by AI systems. To advance our understanding and reduce the gap +between the capabilities of animals and AI systems, we introduce a dataset of +pet egomotion imagery with diverse examples of simultaneous egomotion and +multi-agent interaction. Current video datasets separately contain egomotion +and interaction examples, but rarely both at the same time. In addition, EgoPet +offers a radically distinct perspective from existing egocentric datasets of +humans or vehicles. We define two in-domain benchmark tasks that capture animal +behavior, and a third benchmark to assess the utility of EgoPet as a +pretraining resource to robotic quadruped locomotion, showing that models +trained from EgoPet outperform those trained from prior datasets. + +
+
+ comment: https://www.amirbar.net/egopet +
+
+
+
+
+ + ☆ HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing + + +
+ This study introduces HQ-Edit, a high-quality instruction-based image editing +dataset with around 200,000 edits. Unlike prior approaches that rely on attribute +guidance or human feedback to build datasets, we devise a scalable data +collection pipeline leveraging advanced foundation models, namely GPT-4V and +DALL-E 3. To ensure its high quality, diverse examples are first collected +online, expanded, and then used to create high-quality diptychs featuring input +and output images with detailed text prompts, followed by precise alignment +ensured through post-processing. In addition, we propose two evaluation +metrics, Alignment and Coherence, to quantitatively assess the quality of image +edit pairs using GPT-4V. HQ-Edit's high-resolution images, rich in detail and +accompanied by comprehensive editing prompts, substantially enhance the +capabilities of existing image editing models. For example, an HQ-Edit +finetuned InstructPix2Pix can attain state-of-the-art image editing +performance, even surpassing those models fine-tuned with human-annotated data. +The project page is https://thefllood.github.io/HQEdit_web. + +
+
+ comment: Project Page: https://thefllood.github.io/HQEdit_web +
+
+
+
+
+ + ☆ in2IN: Leveraging individual Information to Generate Human INteractions + + +
+ Generating human-human motion interactions conditioned on textual +descriptions is a very useful application in many areas such as robotics, +gaming, animation, and the metaverse. Alongside this utility also comes a great +difficulty in modeling the highly dimensional inter-personal dynamics. In +addition, properly capturing the intra-personal diversity of interactions has a +lot of challenges. Current methods generate interactions with limited diversity +of intra-person dynamics due to the limitations of the available datasets and +conditioning strategies. For this, we introduce in2IN, a novel diffusion model +for human-human motion generation which is conditioned not only on the textual +description of the overall interaction but also on the individual descriptions +of the actions performed by each person involved in the interaction. To train +this model, we use a large language model to extend the InterHuman dataset with +individual descriptions. As a result, in2IN achieves state-of-the-art +performance in the InterHuman dataset. Furthermore, in order to increase the +intra-personal diversity on the existing interaction datasets, we propose +DualMDM, a model composition technique that combines the motions generated with +in2IN and the motions generated by a single-person motion prior pre-trained on +HumanML3D. As a result, DualMDM generates motions with higher individual +diversity and improves control over the intra-person dynamics while maintaining +inter-personal coherence. + +
+
+ comment: Project page: https://pabloruizponce.github.io/in2IN/ +
+
+
+
+
+ + ☆ OneChart: Purify the Chart Structural Extraction via One Auxiliary Token + + +
+ Chart parsing poses a significant challenge due to the diversity of styles, +values, texts, and so forth. Even advanced large vision-language models (LVLMs) +with billions of parameters struggle to handle such tasks satisfactorily. To +address this, we propose OneChart: a reliable agent specifically devised for +the structural extraction of chart information. Similar to popular LVLMs, +OneChart incorporates an autoregressive main body. Uniquely, to enhance the +reliability of the numerical parts of the output, we introduce an auxiliary +token placed at the beginning of the total tokens along with an additional +decoder. The numerically optimized (auxiliary) token allows subsequent tokens +for chart parsing to capture enhanced numerical features through causal +attention. Furthermore, with the aid of the auxiliary token, we have devised a +self-evaluation mechanism that enables the model to gauge the reliability of +its chart parsing results by providing confidence scores for the generated +content. Compared to current state-of-the-art (SOTA) chart parsing models, +e.g., DePlot, ChartVLM, ChartAst, OneChart significantly outperforms in Average +Precision (AP) for chart structural extraction across multiple public +benchmarks, despite enjoying only 0.2 billion parameters. Moreover, as a chart +parsing agent, it also brings 10%+ accuracy gains for the popular LVLM +(LLaVA-1.6) in the downstream ChartQA benchmark. + +
+
+ comment: 14 pages, 9 figures and 6 tables +
+
+
+
+
+ + ☆ One-Click Upgrade from 2D to 3D: Sandwiched RGB-D Video Compression for + Stereoscopic Teleconferencing CVPR 2024 + + +
+ Stereoscopic video conferencing is still challenging due to the need to +compress stereo RGB-D video in real-time. Though hardware implementations of +standard video codecs such as H.264 / AVC and HEVC are widely available, they +are not designed for stereoscopic videos and suffer from reduced quality and +performance. Specific multiview or 3D extensions of these codecs are complex +and lack efficient implementations. In this paper, we propose a new approach to +upgrade a 2D video codec to support stereo RGB-D video compression, by wrapping +it with a neural pre- and post-processor pair. The neural networks are +end-to-end trained with an image codec proxy, and shown to work with a more +sophisticated video codec. We also propose a geometry-aware loss function to +improve rendering quality. We train the neural pre- and post-processors on a +synthetic 4D people dataset, and evaluate it on both synthetic and +real-captured stereo RGB-D videos. Experimental results show that the neural +networks generalize well to unseen data and work out-of-box with various video +codecs. Our approach saves about 30% bit-rate compared to a conventional video +coding scheme and MV-HEVC at the same level of rendering quality from a novel +view, without the need of a task-specific hardware upgrade. + +
+
+ comment: Accepted by CVPR 2024 Workshop (AIS: Vision, Graphics and AI for + Streaming https://ai4streaming-workshop.github.io ) +
+
+
+
+
+ + ☆ MaxFusion: Plug&Play Multi-Modal Generation in Text-to-Image Diffusion + Models + + +
+ Large diffusion-based Text-to-Image (T2I) models have shown impressive +generative powers for text-to-image generation as well as spatially conditioned +image generation. For most applications, we can train the model end-to-end with +paired data to obtain photorealistic generation quality. However, to add an +additional task, one often needs to retrain the model from scratch using paired +data across all modalities to retain good generation performance. In this +paper, we tackle this issue and propose a novel strategy to scale a generative +model across new tasks with minimal compute. During our experiments, we +discovered that the variance maps of intermediate feature maps of diffusion +models capture the intensity of conditioning. Utilizing this prior information, +we propose MaxFusion, an efficient strategy to scale up text-to-image +generation models to accommodate new modality conditions. Specifically, we +combine aligned features of multiple models, thereby bringing about a compositional +effect. Our fusion strategy can be integrated into off-the-shelf models to +enhance their generative prowess. + +
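+ The abstract states that variance maps of intermediate features capture conditioning intensity and that aligned features of multiple models are combined, without giving the exact rule. The snippet below is only one plausible reading under that assumption, weighting two aligned feature maps by their per-location channel variance when fusing; it should not be read as the MaxFusion algorithm.

```python
import torch

def variance_weighted_fusion(feat_a, feat_b, eps=1e-6):
    """Fuse two aligned (B, C, H, W) feature maps using per-location channel variance as weight.

    Locations where a condition's features vary strongly across channels are treated as
    strongly conditioned there and receive a larger share of the fused output.
    """
    var_a = feat_a.var(dim=1, keepdim=True)       # (B, 1, H, W)
    var_b = feat_b.var(dim=1, keepdim=True)
    w_a = var_a / (var_a + var_b + eps)
    return w_a * feat_a + (1.0 - w_a) * feat_b

fa = torch.randn(1, 320, 64, 64)    # e.g., features from a depth-conditioned branch
fb = torch.randn(1, 320, 64, 64)    # e.g., features from an edge-conditioned branch
print(variance_weighted_fusion(fa, fb).shape)     # torch.Size([1, 320, 64, 64])
```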
+
+
+
+
+ + ☆ Diffscaler: Enhancing the Generative Prowess of Diffusion Transformers + + +
+ Recently, diffusion transformers have gained wide attention with their +excellent performance in text-to-image and text-to-video models, emphasizing +the need for transformers as the backbone for diffusion models. Transformer-based +models have shown better generalization capability compared to CNN-based models +for general vision tasks. However, much less has been explored in the existing +literature regarding the capabilities of transformer-based diffusion backbones +and expanding their generative prowess to other datasets. This paper focuses on +enabling a single pre-trained diffusion transformer model to scale across +multiple datasets swiftly, allowing for the completion of diverse generative +tasks using just one model. To this end, we propose DiffScaler, an efficient +scaling strategy for diffusion models where we train a minimal amount of +parameters to adapt to different tasks. In particular, we learn task-specific +transformations at each layer by incorporating the ability to utilize the +learned subspaces of the pre-trained model, as well as the ability to learn +additional task-specific subspaces, which may be absent in the pre-training +dataset. As these parameters are independent, a single diffusion model with +these task-specific parameters can be used to perform multiple tasks +simultaneously. Moreover, we find that transformer-based diffusion models +significantly outperform CNN-based diffusion methods when +fine-tuning on smaller datasets. We perform experiments on four unconditional +image generation datasets. We show that using our proposed method, a single +pre-trained model can scale up to perform these conditional and unconditional +tasks with minimal parameter tuning while performing nearly as well as +fine-tuning an entire diffusion model for that particular task. + 
+
+
+
+
+ + ☆ Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse + Controls to Any Diffusion Model + + +
+ ControlNets are widely used for adding spatial control in image generation +with different conditions, such as depth maps, canny edges, and human poses. +However, there are several challenges when leveraging the pretrained image +ControlNets for controlled video generation. First, pretrained ControlNet +cannot be directly plugged into new backbone models due to the mismatch of +feature spaces, and the cost of training ControlNets for new backbones is a big +burden. Second, ControlNet features for different frames might not effectively +handle the temporal consistency. To address these challenges, we introduce +Ctrl-Adapter, an efficient and versatile framework that adds diverse controls +to any image/video diffusion models, by adapting pretrained ControlNets (and +improving temporal alignment for videos). Ctrl-Adapter provides diverse +capabilities including image control, video control, video control with sparse +frames, multi-condition control, compatibility with different backbones, +adaptation to unseen control conditions, and video editing. In Ctrl-Adapter, we +train adapter layers that fuse pretrained ControlNet features to different +image/video diffusion models, while keeping the parameters of the ControlNets +and the diffusion models frozen. Ctrl-Adapter consists of temporal and spatial +modules so that it can effectively handle the temporal consistency of videos. +We also propose latent skipping and inverse timestep sampling for robust +adaptation and sparse control. Moreover, Ctrl-Adapter enables control from +multiple conditions by simply taking the (weighted) average of ControlNet +outputs. With diverse image/video diffusion backbones (SDXL, Hotshot-XL, +I2VGen-XL, and SVD), Ctrl-Adapter matches ControlNet for image control and +outperforms all baselines for video control (achieving the SOTA accuracy on the +DAVIS 2017 dataset) with significantly lower computational costs (less than 10 +GPU hours). + +
+
+ comment: First two authors contributed equally; Project page: + https://ctrl-adapter.github.io/ +
+
+
+
+
+ + ☆ Design and Analysis of Efficient Attention in Transformers for Social + Group Activity Recognition + + +
+ Social group activity recognition is a challenging task extended from group +activity recognition, where social groups must be recognized with their +activities and group members. Existing methods tackle this task by leveraging +region features of individuals following existing group activity recognition +methods. However, the effectiveness of region features is susceptible to person +localization and variable semantics of individual actions. To overcome these +issues, we propose leveraging attention modules in transformers to generate +social group features. In this method, multiple embeddings are used to +aggregate features for a social group, each of which is assigned to a group +member without duplication. Due to this non-duplicated assignment, the number +of embeddings must be significant to avoid missing group members and thus +renders attention in transformers ineffective. To find optimal attention +designs with a large number of embeddings, we explore several design choices of +queries for feature aggregation and self-attention modules in transformer +decoders. Extensive experimental results show that the proposed method achieves +state-of-the-art performance and verify that the proposed attention designs are +highly effective on social group activity recognition. + +
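The non-duplicated assignment of aggregation embeddings to group members can be viewed as a bipartite matching problem; the sketch below uses SciPy's Hungarian solver as an illustrative stand-in for the matching used during training, not the paper's actual code.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def assign_embeddings_to_members(cost):
    """cost[i, j]: matching cost between embedding i and group member j.
    Returns one member index per embedding (-1 if unmatched), with no member
    assigned twice -- the 'non-duplicated' constraint described above."""
    emb_idx, member_idx = linear_sum_assignment(cost)
    assignment = np.full(cost.shape[0], -1, dtype=int)
    assignment[emb_idx] = member_idx
    return assignment

# usage with random costs: many embeddings, few group members
cost = np.random.rand(16, 5)          # 16 embeddings, 5 group members
print(assign_embeddings_to_members(cost))
```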
+
+ comment: Accepted to IJCV, preprint version +
+
+
+
+
+ + ☆ Ti-Patch: Tiled Physical Adversarial Patch for no-reference video + quality metrics + + +
+ Objective no-reference image- and video-quality metrics are crucial in many +computer vision tasks. However, state-of-the-art no-reference metrics have +become learning-based and are vulnerable to adversarial attacks. The +vulnerability of quality metrics imposes restrictions on using such metrics in +quality control systems and comparing objective algorithms. Also, using +vulnerable metrics as a loss for deep learning model training can mislead +training to worsen visual quality. Because of that, testing quality metrics for +vulnerability is a task of current interest. This paper proposes a new method +for testing the vulnerability of quality metrics in the physical space. To our +knowledge, quality metrics were not previously tested for vulnerability to this +attack; they were only tested in the pixel space. We applied a physical +adversarial Ti-Patch (Tiled Patch) attack to quality metrics and did +experiments both in pixel and physical space. We also performed experiments on +the implementation of physical adversarial wallpaper. The proposed method can +be used as an additional test in quality-metric vulnerability evaluation, +complementing traditional subjective comparison and vulnerability tests in the +pixel space. We made our code and adversarial videos available on GitHub: +https://github.com/leonenkova/Ti-Patch. + 
+
+ comment: Accepted to WAIT AINL 2024 +
+
+
+
+
+ + ☆ How to build the best medical image segmentation algorithm using + foundation models: a comprehensive empirical study with Segment Anything + Model + + +
+ Automated segmentation is a fundamental medical image analysis task, which +enjoys significant advances due to the advent of deep learning. While +foundation models have been useful in natural language processing and some +vision tasks for some time, the foundation model developed with image +segmentation in mind - Segment Anything Model (SAM) - has been developed only +recently and has shown similar promise. However, there are still no systematic +analyses or ``best-practice'' guidelines for optimal fine-tuning of SAM for +medical image segmentation. This work summarizes existing fine-tuning +strategies with various backbone architectures, model components, and +fine-tuning algorithms across 18 combinations, and evaluates them on 17 +datasets covering all common radiology modalities. Our study reveals that (1) +fine-tuning SAM leads to slightly better performance than previous segmentation +methods, (2) fine-tuning strategies that use parameter-efficient learning in +both the encoder and decoder are superior to other strategies, (3) network +architecture has a small impact on final performance, (4) further training SAM +with self-supervised learning can improve final model performance. We also +demonstrate the ineffectiveness of some methods popular in the literature and +further expand our experiments into few-shot and prompt-based settings. Lastly, +we released our code and MRI-specific fine-tuned weights, which consistently +obtained superior performance over the original SAM, at +https://github.com/mazurowski-lab/finetune-SAM. + +
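One way to express the parameter-efficient fine-tuning finding in code is to freeze the whole model and re-enable gradients only for small, named parameter groups in the encoder and decoder. The keyword names below are placeholders, not SAM's actual module names.

```python
import torch.nn as nn

def mark_trainable(model: nn.Module,
                   trainable_keywords=("adapter", "lora", "mask_decoder")):
    """Freeze everything, then re-enable gradients only for parameters whose
    names contain one of the given keywords (hypothetical names)."""
    for name, param in model.named_parameters():
        param.requires_grad = any(k in name for k in trainable_keywords)
    n_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
    n_total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {n_train / 1e6:.2f}M / {n_total / 1e6:.2f}M parameters")
```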
+
+ comment: Code available at https://github.com/mazurowski-lab/finetune-SAM +
+
+
+
+
+ + ☆ Realistic Model Selection for Weakly Supervised Object Localization + + +
+ Weakly Supervised Object Localization (WSOL) allows for training deep +learning models for classification and localization, using only global +class-level labels. The lack of bounding box (bbox) supervision during training +represents a considerable challenge for hyper-parameter search and model +selection. Earlier WSOL works implicitly observed localization performance over +a test set which leads to biased performance evaluation. More recently, a +better WSOL protocol has been proposed, where a validation set with bbox +annotations is held out for model selection. Although it does not rely on the +test set, this protocol is unrealistic since bboxes are not available in +real-world applications, and when available, it is better to use them directly +to fit model weights. Our initial empirical analysis shows that the +localization performance of a model declines significantly when using only +image-class labels for model selection (compared to using bounding-box +annotations). This suggests that adding bounding-box labels is preferable for +selecting the best model for localization. In this paper, we introduce a new +WSOL validation protocol that provides a localization signal without the need +for manual bbox annotations. In particular, we leverage noisy pseudo boxes from +an off-the-shelf ROI proposal generator such as Selective-Search, CLIP, and RPN +pretrained models for model selection. Our experimental results with several +WSOL methods on ILSVRC and CUB-200-2011 datasets show that our noisy boxes +allow selecting models with performance close to those selected using ground +truth boxes, and better than models selected using only image-class labels. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Unifying Global and Local Scene Entities Modelling for Precise Action + Spotting IJCNN 2024 + + +
+ Sports videos pose complex challenges, including cluttered backgrounds, +camera angle changes, small action-representing objects, and imbalanced action +class distribution. Existing methods for detecting actions in sports videos +heavily rely on global features, utilizing a backbone network as a black box +that encompasses the entire spatial frame. However, these approaches tend to +overlook the nuances of the scene and struggle with detecting actions that +occupy a small portion of the frame. In particular, they face difficulties when +dealing with action classes involving small objects, such as balls or +yellow/red cards in soccer, which only occupy a fraction of the screen space. +To address these challenges, we introduce a novel approach that analyzes and +models scene entities using an adaptive attention mechanism. Particularly, our +model disentangles the scene content into the global environment feature and +local relevant scene entities feature. To efficiently extract environmental +features while considering temporal information with less computational cost, +we propose the use of a 2D backbone network with a time-shift mechanism. To +accurately capture relevant scene entities, we employ a Vision-Language model +in conjunction with the adaptive attention mechanism. Our model has +demonstrated outstanding performance, securing the 1st place in the +SoccerNet-v2 Action Spotting, FineDiving, and FineGym challenges with a +substantial performance improvement of 1.6, 2.0, and 1.3 points in avg-mAP +compared to the runner-up methods. Furthermore, our approach offers +interpretability capabilities in contrast to other deep learning models, which +are often designed as black boxes. Our code and models are released at: +https://github.com/Fsoft-AIC/unifying-global-local-feature. + 
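The time-shift mechanism mentioned for the 2D backbone can be sketched independently of the rest of the pipeline: a fraction of the channels is shifted one frame forward and another fraction one frame backward, letting a 2D network mix temporal information at almost no extra cost. This follows the common TSM formulation; the paper's exact variant may differ.

```python
import torch

def temporal_shift(x, n_frames, shift_div=8):
    """x: (batch * n_frames, channels, H, W). Shift 1/shift_div of the channels
    one frame backward in time and another 1/shift_div one frame forward."""
    bt, c, h, w = x.shape
    b = bt // n_frames
    x = x.view(b, n_frames, c, h, w)
    fold = c // shift_div
    out = torch.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                   # future -> current
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # past -> current
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # remaining channels untouched
    return out.view(bt, c, h, w)
```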
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ☆ Knowledge-enhanced Visual-Language Pretraining for Computational + Pathology + + +
+ In this paper, we consider the problem of visual representation learning for +computational pathology, by exploiting large-scale image-text pairs gathered +from public resources, along with the domain specific knowledge in pathology. +Specifically, we make the following contributions: (i) We curate a pathology +knowledge tree that consists of 50,470 informative attributes for 4,718 +diseases requiring pathology diagnosis from 32 human tissues. To our knowledge, +this is the first comprehensive structured pathology knowledge base; (ii) We +develop a knowledge-enhanced visual-language pretraining approach, where we +first project pathology-specific knowledge into latent embedding space via +language model, and use it to guide the visual representation learning; (iii) +We conduct thorough experiments to validate the effectiveness of our proposed +components, demonstrating significant performance improvement on various +downstream tasks, including cross-modal retrieval, zero-shot classification on +pathology patches, and zero-shot tumor subtyping on whole slide images (WSIs). +All codes, models and the pathology knowledge tree will be released to the +research community + +
+
+
+
+
+ + ☆ Evolving Interpretable Visual Classifiers with Large Language Models + + +
+ Multimodal pre-trained models, such as CLIP, are popular for zero-shot +classification due to their open-vocabulary flexibility and high performance. +However, vision-language models, which compute similarity scores between images +and class labels, are largely black-box, with limited interpretability, risk +for bias, and inability to discover new visual concepts not written down. +Moreover, in practical settings, the vocabulary for class names and attributes +of specialized concepts will not be known, preventing these methods from +performing well on images uncommon in large-scale vision-language datasets. To +address these limitations, we present a novel method that discovers +interpretable yet discriminative sets of attributes for visual recognition. We +introduce an evolutionary search algorithm that uses a large language model and +its in-context learning abilities to iteratively mutate a concept bottleneck of +attributes for classification. Our method produces state-of-the-art, +interpretable fine-grained classifiers. We outperform the latest baselines by +18.4% on five fine-grained iNaturalist datasets and by 22.2% on two KikiBouba +datasets, despite the baselines having access to privileged information about +class names. + +
+
+
+
+
+ + ☆ eMotion-GAN: A Motion-based GAN for Photorealistic and Facial Expression + Preserving Frontal View Synthesis + + +
+ Many existing facial expression recognition (FER) systems encounter +substantial performance degradation when faced with variations in head pose. +Numerous frontalization methods have been proposed to enhance these systems' +performance under such conditions. However, they often introduce undesirable +deformations, rendering them less suitable for precise facial expression +analysis. In this paper, we present eMotion-GAN, a novel deep learning approach +designed for frontal view synthesis while preserving facial expressions within +the motion domain. Considering the motion induced by head variation as noise +and the motion induced by facial expression as the relevant information, our +model is trained to filter out the noisy motion in order to retain only the +motion related to facial expression. The filtered motion is then mapped onto a +neutral frontal face to generate the corresponding expressive frontal face. We +conducted extensive evaluations using several widely recognized dynamic FER +datasets, which encompass sequences exhibiting various degrees of head pose +variations in both intensity and orientation. Our results demonstrate the +effectiveness of our approach in significantly reducing the FER performance gap +between frontal and non-frontal faces. Specifically, we achieved a FER +improvement of up to +5\% for small pose variations and up to +20\% improvement +for larger pose variations. Code available at +\url{https://github.com/o-ikne/eMotion-GAN.git}. + +
+
+
+
+
+ + ☆ HOI-Ref: Hand-Object Interaction Referral in Egocentric Vision + + +
+ Large Vision Language Models (VLMs) are now the de facto state-of-the-art for +a number of tasks including visual question answering, recognising objects, and +spatial referral. In this work, we propose the HOI-Ref task for egocentric +images that aims to understand interactions between hands and objects using +VLMs. To enable HOI-Ref, we curate the HOI-QA dataset that consists of 3.9M +question-answer pairs for training and evaluating VLMs. HOI-QA includes +questions relating to locating hands, objects, and critically their +interactions (e.g. referring to the object being manipulated by the hand). We +train the first VLM for HOI-Ref on this dataset and call it VLM4HOI. Our +results demonstrate that VLMs trained for referral on third person images fail +to recognise and refer hands and objects in egocentric images. When fine-tuned +on our egocentric HOI-QA dataset, performance improves by 27.9% for referring +hands and objects, and by 26.7% for referring interactions. + +
+
+ comment: Project Page: https://sid2697.github.io/hoi-ref/ +
+
+
+
+
+ + ☆ Zero-shot detection of buildings in mobile LiDAR using Language Vision + Model + + +
+ Recent advances have demonstrated that Language Vision Models (LVMs) surpass +the existing State-of-the-Art (SOTA) in two-dimensional (2D) computer vision +tasks, motivating attempts to apply LVMs to three-dimensional (3D) data. While +LVMs are efficient and effective in addressing various downstream 2D vision +tasks without training, they face significant challenges when it comes to point +clouds, a representative format for representing 3D data. It is more difficult +to extract features from 3D data and there are challenges due to large data +sizes and the cost of collection and labelling, resulting in a notably +limited availability of datasets. Moreover, constructing LVMs for point clouds +is even more challenging due to the requirements for large amounts of data and +training time. To address these issues, our research aims to 1) apply the +Grounded SAM through Spherical Projection to transfer 3D to 2D, and 2) +experiment with synthetic data to evaluate its effectiveness in bridging the +gap between synthetic and real-world data domains. Our approach exhibited high +performance with an accuracy of 0.96, an IoU of 0.85, precision of 0.92, recall +of 0.91, and an F1 score of 0.92, confirming its potential. However, challenges +such as occlusion problems and pixel-level overlaps of multi-label points +during spherical image generation remain to be addressed in future studies. + 
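The spherical projection used to transfer 3D points to a 2D image can be written down directly: each point is mapped to a pixel via its azimuth and elevation angles. The image size and vertical field of view below are assumptions.

```python
import numpy as np

def spherical_projection(points, width=1024, height=64,
                         fov_up_deg=15.0, fov_down_deg=-25.0):
    """points: (N, 3) array of x, y, z. Returns integer pixel coordinates (u, v)."""
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    depth = np.linalg.norm(points, axis=1) + 1e-8
    azimuth = np.arctan2(y, x)                      # [-pi, pi]
    elevation = np.arcsin(z / depth)                # [-pi/2, pi/2]
    fov_up, fov_down = np.radians(fov_up_deg), np.radians(fov_down_deg)
    u = 0.5 * (1.0 - azimuth / np.pi) * width       # azimuth -> horizontal pixel
    v = (1.0 - (elevation - fov_down) / (fov_up - fov_down)) * height
    u = np.clip(np.floor(u), 0, width - 1).astype(int)
    v = np.clip(np.floor(v), 0, height - 1).astype(int)
    return u, v
```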
+
+ comment: 7 pages, 6 figures, conference +
+
+
+
+
+ + ☆ Zero-shot Building Age Classification from Facade Image Using GPT-4 + + +
+ A building's age of construction is crucial for supporting many geospatial +applications. Much current research focuses on estimating building age from +facade images using deep learning. However, building an accurate deep learning +model requires a considerable amount of labelled training data, and the trained +models often have geographical constraints. Recently, large pre-trained vision +language models (VLMs) such as GPT-4 Vision, which demonstrate significant +generalisation capabilities, have emerged as potential training-free tools for +dealing with specific vision tasks, but their applicability and reliability for +building information remain unexplored. In this study, a zero-shot building age +classifier for facade images is developed using prompts that include logical +instructions. Taking London as a test case, we introduce a new dataset, +FI-London, comprising facade images and building age epochs. Although the +training-free classifier achieved a modest accuracy of 39.69%, the mean +absolute error of 0.85 decades indicates that the model can predict building +age epochs successfully albeit with a small bias. The ensuing discussion +reveals that the classifier struggles to predict the age of very old buildings +and is challenged by fine-grained predictions within 2 decades. Overall, the +classifier utilising GPT-4 Vision is capable of predicting the rough age epoch +of a building from a single facade image without any training. + +
+
+
+
+
+ + ☆ EdgeRelight360: Text-Conditioned 360-Degree HDR Image Generation for + Real-Time On-Device Video Portrait Relighting CVPR + + +
+ In this paper, we present EdgeRelight360, an approach for real-time video +portrait relighting on mobile devices, utilizing text-conditioned generation of +360-degree high dynamic range image (HDRI) maps. Our method proposes a +diffusion-based text-to-360-degree image generation in the HDR domain, taking +advantage of the HDR10 standard. This technique facilitates the generation of +high-quality, realistic lighting conditions from textual descriptions, offering +flexibility and control in portrait video relighting task. Unlike the previous +relighting frameworks, our proposed system performs video relighting directly +on-device, enabling real-time inference with real 360-degree HDRI maps. This +on-device processing ensures both privacy and guarantees low runtime, providing +an immediate response to changes in lighting conditions or user inputs. Our +approach paves the way for new possibilities in real-time video applications, +including video conferencing, gaming, and augmented reality, by allowing +dynamic, text-based control of lighting conditions. + +
+
+ comment: Camera-ready version (CVPR workshop - EDGE'24) +
+
+
+
+
+ + ☆ Evaluating the Explainability of Attributes and Prototypes for a Medical + Classification Model + + +
+ Due to the sensitive nature of medicine, it is particularly important and +highly demanded that AI methods are explainable. This need has been recognised +and there is great research interest in xAI solutions with medical +applications. However, there is a lack of user-centred evaluation regarding the +actual impact of the explanations. We evaluate attribute- and prototype-based +explanations with the Proto-Caps model. This xAI model reasons the target +classification with human-defined visual features of the target object in the +form of scores and attribute-specific prototypes. The model thus provides a +multimodal explanation that is intuitively understandable to humans thanks to +predefined attributes. A user study involving six radiologists shows that the +explanations are subjectively perceived as helpful, as they reflect their +decision-making process. The results of the model are considered a second +opinion that radiologists can discuss using the model's explanations. However, +it was shown that the inclusion and increased magnitude of model explanations +can objectively increase confidence in the model's predictions even when the model +is incorrect. We can conclude that attribute scores and visual prototypes +enhance confidence in the model. However, additional development and repeated +user studies are needed to tailor the explanation to the respective use case. + 
+
+ comment: Accepted at The 2nd World Conference on eXplainable Artificial + Intelligence +
+
+
+
+
+ + ☆ ReffAKD: Resource-efficient Autoencoder-based Knowledge Distillation + + +
+ In this research, we propose an innovative method to boost Knowledge +Distillation efficiency without the need for resource-heavy teacher models. +Knowledge Distillation trains a smaller ``student'' model with guidance from a +larger ``teacher'' model, which is computationally costly. However, the main +benefit comes from the soft labels provided by the teacher, helping the student +grasp nuanced class similarities. In our work, we propose an efficient method +for generating these soft labels, thereby eliminating the need for a large +teacher model. We employ a compact autoencoder to extract essential features +and calculate similarity scores between different classes. Afterward, we apply +the softmax function to these similarity scores to obtain a soft probability +vector. This vector serves as valuable guidance during the training of the +student model. Our extensive experiments on various datasets, including +CIFAR-100, Tiny Imagenet, and Fashion MNIST, demonstrate the superior resource +efficiency of our approach compared to traditional knowledge distillation +methods that rely on large teacher models. Importantly, our approach +consistently achieves similar or even superior performance in terms of model +accuracy. We also perform a comparative study with various techniques recently +developed for knowledge distillation, showing that our approach achieves competitive +performance while using significantly fewer resources. We also show that our +approach can be easily added to any logit-based knowledge distillation method. +This research contributes to making knowledge distillation more accessible and +cost-effective for practical applications, making it a promising avenue for +improving the efficiency of model training. The code for this work is available +at https://github.com/JEKimLab/ReffAKD. + 
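The soft-label generation step is small enough to sketch end to end: class prototypes are taken as mean autoencoder codes, pairwise cosine similarities are computed, and a temperature-scaled softmax turns each row into a soft target. The pooling and temperature choices here are assumptions, not the paper's exact settings.

```python
import torch
import torch.nn.functional as F

def soft_labels_from_features(latents, labels, num_classes, temperature=4.0):
    """latents: (N, d) autoencoder codes; labels: (N,) integer class labels.
    Returns a (num_classes, num_classes) matrix whose row c is the soft target
    used when distilling samples of class c."""
    # class prototypes: mean latent code per class
    protos = torch.stack([latents[labels == c].mean(dim=0) for c in range(num_classes)])
    sim = F.cosine_similarity(protos.unsqueeze(1), protos.unsqueeze(0), dim=-1)
    return F.softmax(sim / temperature, dim=-1)
```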
+
+
+
+
+ + ☆ Map-Relative Pose Regression for Visual Re-Localization CVPR + + +
+ Pose regression networks predict the camera pose of a query image relative to +a known environment. Within this family of methods, absolute pose regression +(APR) has recently shown promising accuracy in the range of a few centimeters +in position error. APR networks encode the scene geometry implicitly in their +weights. To achieve high accuracy, they require vast amounts of training data +that, realistically, can only be created using novel view synthesis in a +days-long process. This process has to be repeated for each new scene again and +again. We present a new approach to pose regression, map-relative pose +regression (marepo), that satisfies the data hunger of the pose regression +network in a scene-agnostic fashion. We condition the pose regressor on a +scene-specific map representation such that its pose predictions are relative +to the scene map. This allows us to train the pose regressor across hundreds of +scenes to learn the generic relation between a scene-specific map +representation and the camera pose. Our map-relative pose regressor can be +applied to new map representations immediately or after mere minutes of +fine-tuning for the highest accuracy. Our approach outperforms previous pose +regression methods by far on two public datasets, indoor and outdoor. Code is +available: https://nianticlabs.github.io/marepo + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) + 2024, Highlight Paper +
+
+
+
+
+ + ☆ Conditional Prototype Rectification Prompt Learning + + +
+ Pre-trained large-scale vision-language models (VLMs) have acquired profound +understanding of general visual concepts. Recent advancements in efficient +transfer learning (ETL) have shown remarkable success in fine-tuning VLMs +within the scenario of limited data, introducing only a few parameters to +harness task-specific insights from VLMs. Despite significant progress, current +leading ETL methods tend to overfit the narrow distributions of base classes +seen during training and encounter two primary challenges: (i) only utilizing +uni-modal information to model task-specific knowledge; and (ii) using +costly and time-consuming methods to supplement knowledge. To address these +issues, we propose a Conditional Prototype Rectification Prompt Learning (CPR) +method to correct the bias of base examples and augment limited data in an +effective way. Specifically, we alleviate overfitting on base classes from two +aspects. First, each input image acquires knowledge from both textual and +visual prototypes, and then generates sample-conditional text tokens. Second, +we extract utilizable knowledge from unlabeled data to further refine the +prototypes. These two strategies mitigate biases stemming from base classes, +yielding a more effective classifier. Extensive experiments on 11 benchmark +datasets show that our CPR achieves state-of-the-art performance on both +few-shot classification and base-to-new generalization tasks. Our code is +available at \url{https://github.com/chenhaoxing/CPR}. + 
+
+
+
+
+ + ☆ Table tennis ball spin estimation with an event camera CVPR + + +
+ Spin plays a pivotal role in ball-based sports. Estimating spin becomes a key +skill due to its impact on the ball's trajectory and bouncing behavior. Spin +cannot be observed directly, making it inherently challenging to estimate. In +table tennis, the combination of high velocity and spin renders traditional low +frame rate cameras inadequate for quickly and accurately observing the ball's +logo to estimate the spin due to the motion blur. Event cameras do not suffer +as much from motion blur, thanks to their high temporal resolution. Moreover, +the sparse nature of the event stream solves communication bandwidth +limitations many frame cameras face. To the best of our knowledge, we present +the first method for table tennis spin estimation using an event camera. We use +ordinal time surfaces to track the ball and then isolate the events generated +by the logo on the ball. Optical flow is then estimated from the extracted +events to infer the ball's spin. We achieved a spin magnitude mean error of +$10.7 \pm 17.3$ rps and a spin axis mean error of $32.9 \pm 38.2\deg$ in real +time for a flying ball. + +
+
+ comment: Accepted to CVsport (CVPRW 2024) +
+
+
+
+
+ + ☆ Empowering Embodied Visual Tracking with Visual Foundation Models and + Offline RL + + +
+ Embodied visual tracking is to follow a target object in dynamic 3D +environments using an agent's egocentric vision. This is a vital and +challenging skill for embodied agents. However, existing methods suffer from +inefficient training and poor generalization. In this paper, we propose a novel +framework that combines visual foundation models (VFM) and offline +reinforcement learning (offline RL) to empower embodied visual tracking. We use +a pre-trained VFM, such as ``Tracking Anything", to extract semantic +segmentation masks with text prompts. We then train a recurrent policy network +with offline RL, e.g., Conservative Q-Learning, to learn from the collected +demonstrations without online agent-environment interactions. To further +improve the robustness and generalization of the policy network, we also +introduce a mask re-targeting mechanism and a multi-level data collection +strategy. In this way, we can train a robust tracker within an hour on a +consumer-level GPU, e.g., Nvidia RTX 3090. Such efficiency is unprecedented for +RL-based visual tracking methods. We evaluate our tracker on several +high-fidelity environments with challenging situations, such as distraction and +occlusion. The results show that our agent outperforms state-of-the-art methods +in terms of sample efficiency, robustness to distractors, and generalization to +unseen scenarios and targets. We also demonstrate the transferability of the +learned tracker from the virtual world to real-world scenarios. + +
+
+
+
+
+ + ☆ A Diffusion-based Data Generator for Training Object Recognition Models + in Ultra-Range Distance + + +
+ Object recognition, commonly performed by a camera, is a fundamental +requirement for robots to complete complex tasks. Some tasks require +recognizing objects far from the robot's camera. A challenging example is +Ultra-Range Gesture Recognition (URGR) in human-robot interaction where the +user exhibits directive gestures at a distance of up to 25~m from the robot. +However, training a model to recognize hardly visible objects located in +ultra-range requires an exhaustive collection of a significant amount of +labeled samples. The generation of synthetic training datasets is a recent +solution to the lack of real-world data, while unable to properly replicate the +realistic visual characteristics of distant objects in images. In this letter, +we propose the Diffusion in Ultra-Range (DUR) framework based on a Diffusion +model to generate labeled images of distant objects in various scenes. The DUR +generator receives a desired distance and class (e.g., gesture) and outputs a +corresponding synthetic image. We apply DUR to train a URGR model with +directive gestures in which fine details of the gesturing hand are challenging +to distinguish. DUR is compared to other types of generative models showcasing +superiority both in fidelity and in recognition success rate when training a +URGR model. More importantly, training a DUR model on a limited amount of real +data and then using it to generate synthetic data for training a URGR model +outperforms directly training the URGR model on real data. The synthetic-based +URGR model is also demonstrated in gesture-based direction of a ground robot. + +
+
+
+
+
+ + ☆ STMixer: A One-Stage Sparse Action Detector CVPR + 2023 + + +
+ Traditional video action detectors typically adopt the two-stage pipeline, +where a person detector is first employed to generate actor boxes and then 3D +RoIAlign is used to extract actor-specific features for classification. This +detection paradigm requires multi-stage training and inference, and the feature +sampling is constrained inside the box, failing to effectively leverage richer +context information outside. Recently, a few query-based action detectors have +been proposed to predict action instances in an end-to-end manner. However, +they still lack adaptability in feature sampling and decoding, thus suffering +from the issues of inferior performance or slower convergence. In this paper, +we propose two core designs for a more flexible one-stage sparse action +detector. First, we present a query-based adaptive feature sampling module, +which endows the detector with the flexibility of mining a group of +discriminative features from the entire spatio-temporal domain. Second, we +devise a decoupled feature mixing module, which dynamically attends to and +mixes video features along the spatial and temporal dimensions respectively for +better feature decoding. Based on these designs, we instantiate two detection +pipelines, that is, STMixer-K for keyframe action detection and STMixer-T for +action tubelet detection. Without bells and whistles, our STMixer detectors +obtain state-of-the-art results on five challenging spatio-temporal action +detection benchmarks for keyframe action detection or action tube detection. + +
+
+ comment: Extended version of the paper arXiv:2303.15879 presented at CVPR + 2023. Accepted by TPAMI 2024 +
+
+
+
+
+ + ☆ Video2Game: Real-time, Interactive, Realistic and Browser-Compatible + Environment from a Single Video CVPR 2024 + + +
+ Creating high-quality and interactive virtual environments, such as games and +simulators, often involves complex and costly manual modeling processes. In +this paper, we present Video2Game, a novel approach that automatically converts +videos of real-world scenes into realistic and interactive game environments. +At the heart of our system are three core components: (i) a neural radiance +fields (NeRF) module that effectively captures the geometry and visual +appearance of the scene; (ii) a mesh module that distills the knowledge from +NeRF for faster rendering; and (iii) a physics module that models the +interactions and physical dynamics among the objects. By following the +carefully designed pipeline, one can construct an interactable and actionable +digital replica of the real world. We benchmark our system on both indoor and +large-scale outdoor scenes. We show that we can not only produce +highly-realistic renderings in real-time, but also build interactive games on +top. + 
+
+ comment: CVPR 2024. Project page (with code): https://video2game.github.io/ +
+
+
+
+
+ + ☆ Digging into contrastive learning for robust depth estimation with + diffusion models + + +
+ Recently, diffusion-based depth estimation methods have drawn widespread +attention due to their elegant denoising patterns and promising performance. +However, they are typically unreliable under adverse conditions prevalent in +real-world scenarios, such as rain and snow. In this paper, we propose a +novel robust depth estimation method called D4RD, featuring a custom +contrastive learning mode tailored for diffusion models to mitigate performance +degradation in complex environments. Concretely, we integrate the strength of +knowledge distillation into contrastive learning, building the `trinity' +contrastive scheme. This scheme utilizes the sampled noise of the forward +diffusion process as a natural reference, guiding the predicted noise in +diverse scenes toward a more stable and precise optimum. Moreover, we extend +noise-level trinity to encompass more generic feature and image levels, +establishing a multi-level contrast to distribute the burden of robust +perception across the overall network. Before addressing complex scenarios, we +enhance the stability of the baseline diffusion model with three +straightforward yet effective improvements, which facilitate convergence and +remove depth outliers. Extensive experiments demonstrate that D4RD surpasses +existing state-of-the-art solutions on synthetic corruption datasets and +real-world weather conditions. The code for D4RD will be made available for +further exploration and adoption. + 
+
+ comment: 8 pages,6 figures +
+
+
+
+
+ + ☆ Interaction as Explanation: A User Interaction-based Method for + Explaining Image Classification Models + + +
+ In computer vision, explainable AI (xAI) methods seek to mitigate the +'black-box' problem by making the decision-making process of deep learning +models more interpretable and transparent. Traditional xAI methods concentrate +on visualizing input features that influence model predictions, providing +insights primarily suited for experts. In this work, we present an +interaction-based xAI method that enhances user comprehension of image +classification models through their interaction. Thus, we developed a web-based +prototype allowing users to modify images via painting and erasing, thereby +observing changes in classification results. Our approach enables users to +discern critical features influencing the model's decision-making process, +aligning their mental models with the model's logic. Experiments conducted with +five images demonstrate the potential of the method to reveal feature +importance through user interaction. Our work contributes a novel perspective +to xAI by centering on end-user engagement and understanding, paving the way +for more intuitive and accessible explainability in AI systems. + +
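The paint-and-erase interaction reduces to editing a region of the input and re-running the classifier to observe the probability shift; a minimal sketch of that loop is below (the actual prototype is a web application, and the erase fill value and box-shaped region are assumptions).

```python
import torch

def probability_shift(model, image, region, fill_value=0.0, target_class=None):
    """image: (1, C, H, W) tensor; region: (y0, y1, x0, x1) box the user erased.
    Returns the change in the target class probability caused by the edit."""
    model.eval()
    with torch.no_grad():
        p_before = torch.softmax(model(image), dim=1)
        if target_class is None:
            target_class = int(p_before.argmax(dim=1))
        edited = image.clone()
        y0, y1, x0, x1 = region
        edited[:, :, y0:y1, x0:x1] = fill_value     # simulate the 'erase' interaction
        p_after = torch.softmax(model(edited), dim=1)
    return float(p_after[0, target_class] - p_before[0, target_class])
```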
+
+ comment: 5 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ A Recipe for CAC: Mosaic-based Generalized Loss for Improved + Class-Agnostic Counting + + +
+ Class agnostic counting (CAC) is a vision task that can be used to count the +total occurrence number of any given reference objects in the query image. The +task is usually formulated as a density map estimation problem through +similarity computation among a few image samples of the reference object and +the query image. In this paper, we point out a severe issue of the existing CAC +framework: Given a multi-class setting, models don't consider reference images +and instead blindly match all dominant objects in the query image. Moreover, +the current evaluation metrics and dataset cannot be used to faithfully assess +the model's generalization performance and robustness. To this end, we discover +that the combination of mosaic augmentation with generalized loss is essential +for addressing the aforementioned issue of CAC models counting majority +(i.e. dominant) objects regardless of the references. Furthermore, we +introduce a new evaluation protocol and metrics for resolving the problem +behind the existing CAC evaluation scheme and better benchmarking CAC models in +a fairer manner. Besides, extensive evaluation results demonstrate that our +proposed recipe can consistently improve the performance of different CAC +models. The code will be released upon acceptance. + 
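Mosaic augmentation itself is straightforward to sketch: four images, typically of different classes, are tiled into one query so that blindly counting the dominant object no longer works and the model must rely on the reference. The resize method and output size below are assumptions, and the corresponding density maps would need to be tiled the same way.

```python
import numpy as np

def mosaic(images, out_size=256):
    """images: list of four (H, W, C) uint8 arrays, ideally of different classes.
    Returns a 2x2 mosaic query image."""
    half = out_size // 2
    canvas = np.zeros((out_size, out_size, images[0].shape[2]), dtype=images[0].dtype)
    slots = [(0, 0), (0, half), (half, 0), (half, half)]
    for img, (y, x) in zip(images, slots):
        # naive nearest-neighbour resize to keep the sketch dependency-free
        ys = np.linspace(0, img.shape[0] - 1, half).astype(int)
        xs = np.linspace(0, img.shape[1] - 1, half).astype(int)
        canvas[y:y + half, x:x + half] = img[ys][:, xs]
    return canvas
```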
+
+
+
+
+ + ☆ 3D Face Tracking from 2D Video through Iterative Dense UV to Image Flow CVPR 2024 + + +
+ When working with 3D facial data, improving fidelity and avoiding the uncanny +valley effect is critically dependent on accurate 3D facial performance +capture. Because such methods are expensive and due to the widespread +availability of 2D videos, recent methods have focused on how to perform +monocular 3D face tracking. However, these methods often fall short in +capturing precise facial movements due to limitations in their network +architecture, training, and evaluation processes. Addressing these challenges, +we propose a novel face tracker, FlowFace, that introduces an innovative 2D +alignment network for dense per-vertex alignment. Unlike prior work, FlowFace +is trained on high-quality 3D scan annotations rather than weak supervision or +synthetic data. Our 3D model fitting module jointly fits a 3D face model from +one or many observations, integrating existing neutral shape priors for +enhanced identity and expression disentanglement and per-vertex deformations +for detailed facial feature reconstruction. Additionally, we propose a novel +metric and benchmark for assessing tracking accuracy. Our method exhibits +superior performance on both custom and publicly available benchmarks. We +further validate the effectiveness of our tracker by generating high-quality 3D +data from 2D videos, which leads to performance gains on downstream tasks. + +
+
+ comment: 22 pages, 25 figures, to be published in CVPR 2024 +
+
+
+
+
+ + ☆ Neighbour-level Message Interaction Encoding for Improved Representation + Learning on Graphs + + +
+ Message passing has become the dominant framework in graph representation +learning. The essential idea of the message-passing framework is to update node +embeddings based on the information aggregated from local neighbours. However, +most existing aggregation methods have not encoded neighbour-level message +interactions into the aggregated message, resulting in information loss in +embedding generation. This information loss can accumulate and become +more serious as more layers are added to the graph network model. To address +this issue, we propose a neighbour-level message interaction information +encoding method for improving graph representation learning. For messages that +are aggregated at a node, we explicitly generate an encoding between each +message and the remaining messages using an encoding function. Then we aggregate +these learned encodings and take the sum of the aggregated encoding and the +aggregated message to update the embedding for the node. In this way, +neighbour-level message interaction information is integrated into the +generated node embeddings. The proposed encoding method is a generic method +which can be integrated into message-passing graph convolutional networks. +Extensive experiments are conducted on six popular benchmark datasets across +four highly-demanded tasks. The results show that integrating neighbour-level +message interactions achieves improved performance of the base models, +advancing the state-of-the-art results for representation learning over graphs. + 
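A plain PyTorch sketch of the aggregation described above: for each incoming message, an encoding of its interaction with the remaining messages is generated (the remainder is summarized by a sum here, and the encoding function is a small MLP, both assumptions), the encodings are aggregated, and their sum is added to the aggregated message.

```python
import torch
import torch.nn as nn

class InteractionAggregator(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.encode = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, messages):
        """messages: (num_neighbours, dim) incoming messages for one target node."""
        total = messages.sum(dim=0, keepdim=True)      # aggregated message, (1, dim)
        rest = total - messages                        # sum of the other messages per row
        pair = torch.cat([messages, rest], dim=-1)     # (num_neighbours, 2*dim)
        interactions = self.encode(pair).sum(dim=0)    # aggregated interaction encodings
        return total.squeeze(0) + interactions         # message + interaction term
```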
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ A Universal Protocol to Benchmark Camera Calibration for Sports + + +
+ Camera calibration is a crucial component in the realm of sports analytics, +as it serves as the foundation to extract 3D information out of the broadcast +images. Despite the significance of camera calibration research in sports +analytics, progress is impeded by outdated benchmarking criteria. Indeed, the +annotation data and evaluation metrics provided by most currently available +benchmarks strongly favor and incite the development of sports field +registration methods, i.e. methods estimating homographies that map the sports +field plane to the image plane. However, such homography-based methods are +doomed to overlook the broader capabilities of camera calibration in bridging +the 3D world to the image. In particular, real-world non-planar sports field +elements (such as goals, corner flags, baskets, ...) and image distortion +caused by broadcast camera lenses are out of the scope of sports field +registration methods. To overcome these limitations, we designed a new +benchmarking protocol, named ProCC, based on two principles: (1) the protocol +should be agnostic to the camera model chosen for a camera calibration method, +and (2) the protocol should fairly evaluate camera calibration methods using +the reprojection of arbitrary yet accurately known 3D objects. Indirectly, we +also provide insights into the metric used in SoccerNet-calibration, which +solely relies on image annotation data of viewed 3D objects as ground truth, +thus implementing our protocol. With experiments on the World Cup 2014, CARWC, +and SoccerNet datasets, we show that our benchmarking protocol provides fairer +evaluations of camera calibration methods. By defining our requirements for +proper benchmarking, we hope to pave the way for a new stage in camera +calibration for sports applications with high accuracy standards. + +
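The protocol's core measurement, reprojecting accurately known 3D objects with a candidate calibration and comparing against annotated image points, can be sketched with a plain pinhole model (lens distortion omitted for brevity; a full camera model would add distortion terms).

```python
import numpy as np

def reprojection_error(points_3d, points_2d, K, R, t):
    """points_3d: (N, 3) world coordinates; points_2d: (N, 2) annotated pixels.
    K: (3, 3) intrinsics; R: (3, 3) rotation; t: (3,) translation.
    Returns the mean Euclidean reprojection error in pixels."""
    cam = R @ points_3d.T + t.reshape(3, 1)     # world -> camera coordinates
    proj = K @ cam                              # camera -> homogeneous pixels
    proj = (proj[:2] / proj[2]).T               # perspective divide -> (N, 2)
    return float(np.linalg.norm(proj - points_2d, axis=1).mean())
```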
+
+ comment: 12 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ TextCoT: Zoom In for Enhanced Multimodal Text-Rich Image Understanding + + +
+ The advent of Large Multimodal Models (LMMs) has sparked a surge in research +aimed at harnessing their remarkable reasoning abilities. However, for +understanding text-rich images, challenges persist in fully leveraging the +potential of LMMs, and existing methods struggle with effectively processing +high-resolution images. In this work, we propose TextCoT, a novel +Chain-of-Thought framework for text-rich image understanding. TextCoT utilizes +the captioning ability of LMMs to grasp the global context of the image and the +grounding capability to examine local textual regions. This allows for the +extraction of both global and local visual information, facilitating more +accurate question-answering. Technically, TextCoT consists of three stages, +including image overview, coarse localization, and fine-grained observation. +The image overview stage provides a comprehensive understanding of the global +scene information, and the coarse localization stage approximates the image +area containing the answer based on the question asked. Then, integrating the +obtained global image descriptions, the final stage further examines specific +regions to provide accurate answers. Our method is free of extra training, +offering immediate plug-and-play functionality. Extensive experiments are +conducted on a series of text-rich image question-answering benchmark datasets +based on several advanced LMMs, and the results demonstrate the effectiveness +and strong generalization ability of our method. Code is available at +https://github.com/bzluan/TextCoT. + +
+
+
+
+
+ + ☆ NTIRE 2024 Challenge on Image Super-Resolution ($\times$4): Methods and + Results + + +
+ This paper reviews the NTIRE 2024 challenge on image super-resolution +($\times$4), highlighting the solutions proposed and the outcomes obtained. The +challenge involves generating corresponding high-resolution (HR) images, +magnified by a factor of four, from low-resolution (LR) inputs using prior +information. The LR images originate from bicubic downsampling degradation. The +aim of the challenge is to obtain designs/solutions with the most advanced SR +performance, with no constraints on computational resources (e.g., model size +and FLOPs) or training data. The track of this challenge assesses performance +with the PSNR metric on the DIV2K testing dataset. The competition attracted +199 registrants, with 20 teams submitting valid entries. This collective +endeavour not only pushes the boundaries of performance in single-image SR but +also offers a comprehensive overview of current trends in this field. + +
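The track's degradation and metric are simple to reproduce: bicubic x4 downsampling to create the LR inputs and PSNR against the HR ground truth for scoring. The sketch below omits evaluation details such as border cropping or channel choice, which the official scripts may apply.

```python
import numpy as np
from PIL import Image

def bicubic_x4_down(img: Image.Image) -> Image.Image:
    """Bicubic downsampling degradation used to produce the LR inputs."""
    w, h = img.size
    return img.resize((w // 4, h // 4), resample=Image.BICUBIC)

def psnr(pred: np.ndarray, gt: np.ndarray, max_val=255.0) -> float:
    """Peak signal-to-noise ratio between a super-resolved image and ground truth."""
    mse = np.mean((pred.astype(np.float64) - gt.astype(np.float64)) ** 2)
    return float("inf") if mse == 0 else 10.0 * np.log10(max_val ** 2 / mse)

# usage: lr = bicubic_x4_down(Image.open("hr.png")); score = psnr(sr_array, hr_array)
```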
+
+ comment: NTIRE 2024 webpage: https://cvlai.net/ntire/2024. Code: + https://github.com/zhengchen1999/NTIRE2024_ImageSR_x4 +
+
+
+
+
+ + ☆ The Devil is in the Few Shots: Iterative Visual Knowledge Completion for + Few-shot Learning ECCV 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) has shown powerful zero-shot +learning performance. Few-shot learning aims to further enhance the transfer +capability of CLIP by giving few images in each class, aka 'few shots'. Most +existing methods either implicitly learn from the few shots by incorporating +learnable prompts or adapters, or explicitly embed them in a cache model for +inference. However, the narrow distribution of few shots often contains +incomplete class information, leading to biased visual knowledge with high risk +of misclassification. To tackle this problem, recent methods propose to +supplement visual knowledge by generative models or extra databases, which can +be costly and time-consuming. In this paper, we propose an Iterative Visual +Knowledge CompLetion (KCL) method to complement visual knowledge by properly +taking advantage of unlabeled samples without access to any auxiliary or +synthetic data. Specifically, KCL first measures the similarities between +unlabeled samples and each category. Then, the samples with the highest confidence for +each category are selected and collected by a designed confidence criterion. +Finally, the collected samples are treated as labeled ones and added to few +shots to jointly re-estimate the remaining unlabeled ones. The above procedures +will be repeated for a certain number of iterations with more and more samples +being collected until convergence, ensuring a progressive and robust knowledge +completion process. Extensive experiments on 11 benchmark datasets demonstrate +the effectiveness and efficiency of KCL as a plug-and-play module under both +few-shot and zero-shot learning settings. Code is available at +https://github.com/Mark-Sky/KCL. + 
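The iterative completion loop can be sketched on precomputed, L2-normalized image features: score unlabeled samples against class prototypes, keep the most confident ones per class, and re-estimate. The prototype-based confidence rule and per-iteration budget below are assumptions, not KCL's exact criterion.

```python
import numpy as np

def iterative_knowledge_completion(few_shot_feats, few_shot_labels, unlabeled_feats,
                                   num_classes, per_class=4, iterations=3):
    """All features are L2-normalized (N, d) arrays. Returns pseudo-labels (-1 = unused)."""
    feats, labels = few_shot_feats.copy(), few_shot_labels.copy()
    pseudo = -np.ones(len(unlabeled_feats), dtype=int)
    for _ in range(iterations):
        # class prototypes from the current (few-shot + collected) pool
        protos = np.stack([feats[labels == c].mean(axis=0) for c in range(num_classes)])
        protos /= np.linalg.norm(protos, axis=1, keepdims=True)
        sims = unlabeled_feats @ protos.T                  # cosine similarities
        for c in range(num_classes):
            order = np.argsort(-sims[:, c])
            picked = [i for i in order if pseudo[i] == -1][:per_class]
            pseudo[picked] = c
            feats = np.concatenate([feats, unlabeled_feats[picked]])
            labels = np.concatenate([labels, np.full(len(picked), c)])
    return pseudo
```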
+
+ comment: 26 pages, submitted to ECCV 2024 +
+
+
+
+
+ + ☆ RandAlign: A Parameter-Free Method for Regularizing Graph Convolutional + Networks + + +
+ Studies continually find that message-passing graph convolutional networks +suffer from the over-smoothing issue. Basically, the issue of over-smoothing +refers to the phenomenon that the learned embeddings for all nodes can become +very similar to one another and therefore are uninformative after repeatedly +applying message passing iterations. Intuitively, we can expect the generated +embeddings to become smoother layer by layer, that is, each layer of +graph convolution generates a smoothed version of the embeddings as compared to +that generated by the previous layer. Based on this intuition, we propose +RandAlign, a stochastic regularization method for graph convolutional networks. +The idea of RandAlign is to randomly align the learned embedding for each node +with that of the previous layer using random interpolation in each graph +convolution layer. Through alignment, the smoothness of the generated +embeddings is explicitly reduced. To better maintain the benefit yielded by the +graph convolution, in the alignment step we first scale the +embedding of the previous layer to the same norm as the generated embedding and +then perform random interpolation for aligning the generated embedding. +RandAlign is a parameter-free method and can be directly applied without +introducing additional trainable weights or hyper-parameters. We experimentally +evaluate RandAlign on different graph domain tasks on seven benchmark datasets. +The experimental results show that RandAlign is a general method that improves +the generalization performance of various graph convolutional network models +and also improves the numerical stability of optimization, advancing the state +of the art performance for graph representation learning. + 
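The alignment step itself is a few lines per layer: rescale the previous layer's embeddings to the norm of the newly generated ones, then take a random interpolation between the two. The uniform sampling of the mixing weight below is an assumption about the interpolation distribution.

```python
import torch

def rand_align(h_new, h_prev, eps=1e-8):
    """h_new, h_prev: (num_nodes, dim) embeddings of the current and previous layer."""
    scale = h_new.norm(dim=-1, keepdim=True) / (h_prev.norm(dim=-1, keepdim=True) + eps)
    h_prev_scaled = h_prev * scale                              # match norms node-wise
    alpha = torch.rand(h_new.size(0), 1, device=h_new.device)   # random mixing weight
    return alpha * h_new + (1.0 - alpha) * h_prev_scaled
```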
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Contrastive Pretraining for Visual Concept Explanations of Socioeconomic + Outcomes + + +
+ Predicting socioeconomic indicators from satellite imagery with deep learning +has become an increasingly popular research direction. Post-hoc concept-based +explanations can be an important step towards broader adoption of these models +in policy-making as they enable the interpretation of socioeconomic outcomes +based on visual concepts that are intuitive to humans. In this paper, we study +the interplay between representation learning using an additional task-specific +contrastive loss and post-hoc concept explainability for socioeconomic studies. +Our results on two different geographical locations and tasks indicate that the +task-specific pretraining imposes a continuous ordering of the latent space +embeddings according to the socioeconomic outcomes. This improves the model's +interpretability as it enables the latent space of the model to associate urban +concepts with continuous intervals of socioeconomic outcomes. Further, we +illustrate how analyzing the model's conceptual sensitivity for the intervals +of socioeconomic outcomes can shed light on new insights for urban studies. + +
+
+
+
+
+ + ☆ Deep Learning-Based Segmentation of Tumors in PET/CT Volumes: Benchmark + of Different Architectures and Training Strategies + + +
+ Cancer is one of the leading causes of death globally, and early diagnosis is +crucial for patient survival. Deep learning algorithms have great potential for +automatic cancer analysis. Artificial intelligence has achieved high +performance in recognizing and segmenting single lesions. However, diagnosing +multiple lesions remains a challenge. This study examines and compares various +neural network architectures and training strategies for automatic +segmentation of cancer lesions using PET/CT images from the head, neck, and +whole body. The authors analyzed datasets from the AutoPET and HECKTOR +challenges, exploring popular single-step segmentation architectures and +presenting a two-step approach. The results indicate that the V-Net and nnU-Net +models were the most effective for their respective datasets. The results for +the HECKTOR dataset ranged from 0.75 to 0.76 for the aggregated Dice +coefficient. Eliminating cancer-free cases from the AutoPET dataset was found +to improve the performance of most models. In the case of AutoPET data, the +average segmentation efficiency after training only on images containing cancer +lesions increased from 0.55 to 0.66 for the classic Dice coefficient and from +0.65 to 0.73 for the aggregated Dice coefficient. The research demonstrates the +potential of artificial intelligence in precise oncological diagnostics and may +contribute to the development of more targeted and effective cancer assessment +techniques. + 
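The two reported metrics, the classic per-case Dice coefficient and an aggregated Dice pooled over all cases, can be written down directly; the exact aggregation used by the AutoPET and HECKTOR challenges may differ in details.

```python
import numpy as np

def dice(pred, gt, eps=1e-8):
    """pred, gt: binary arrays of the same shape for a single case."""
    inter = np.logical_and(pred, gt).sum()
    return (2.0 * inter + eps) / (pred.sum() + gt.sum() + eps)

def aggregated_dice(preds, gts, eps=1e-8):
    """Pool intersections and volumes over all cases before dividing."""
    inter = sum(np.logical_and(p, g).sum() for p, g in zip(preds, gts))
    total = sum(p.sum() + g.sum() for p, g in zip(preds, gts))
    return (2.0 * inter + eps) / (total + eps)
```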
+
+
+
+
+ + ☆ Eyes on the Streets: Leveraging Street-Level Imaging to Model Urban + Crime Dynamics + + +
+ This study addresses the challenge of urban safety in New York City by +examining the relationship between the built environment and crime rates using +machine learning and a comprehensive dataset of street view images. We aim to +identify how urban landscapes correlate with crime statistics, focusing on the +characteristics of street views and their association with crime rates. The +findings offer insights for urban planning and crime prevention, highlighting +the potential of environmental design in enhancing public safety. + +
+
+
+
+
+ + ♻ ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/. + Fixed some typos in the supplementary material +
+
+
+
+
+ + ♻ ☆ Image-based Deep Learning for the time-dependent prediction of fresh + concrete properties + + +
+ Increasing the degree of digitisation and automation in the concrete production
+process can play a crucial role in reducing the CO$_2$ emissions that are
+associated with the production of concrete. In this paper, a method is
+presented that makes it possible to predict the properties of fresh concrete
+during the mixing process based on stereoscopic image sequences of the
+concrete's flow behaviour. A Convolutional Neural Network (CNN) is used for the
+prediction, which receives the images supported by information on the mix
+design as input. In addition, the network receives temporal information in the
+form of the time difference between the time at which the images are taken and
+the time at which the reference measurements of the concrete are carried out.
+With this temporal information, the network implicitly learns the
+time-dependent behaviour of the concrete's properties. The network predicts the
+slump flow diameter, the yield stress and the plastic viscosity. The
+time-dependent prediction potentially opens up the pathway to determine the
+temporal development of the fresh concrete properties already during mixing.
+This provides a huge advantage for the concrete industry. As a result,
+countermeasures can be taken in a timely manner. It is shown that an approach
+based on depth and optical flow images, supported by information on the mix
+design, achieves the best results.
+ 
+
+
+
+
+
+ + ♻ ☆ Human vs. LMMs: Exploring the Discrepancy in Emoji Interpretation and + Usage in Digital Communication + + +
+ Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when +processing multimodal information, especially in the context of social media, +has garnered immense interest due to its broad potential and far-reaching +implications. Emojis, as one of the most unique aspects of digital +communication, are pivotal in enriching and often clarifying the emotional and +tonal dimensions. Yet, there is a notable gap in understanding how these +advanced models, such as GPT-4V, interpret and employ emojis in the nuanced +context of online interaction. This study intends to bridge this gap by +examining the behavior of GPT-4V in replicating human-like use of emojis. The +findings reveal a discernible discrepancy between human and GPT-4V behaviors, +likely due to the subjective nature of human interpretation and the limitations +of GPT-4V's English-centric training, suggesting cultural biases and inadequate +representation of non-English cultures. + +
+
+ comment: Accepted for publication in ICWSM 2024 +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer
+graphics and becomes increasingly difficult the less information is available.
+Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry
+from RGB images or video sequences, often leveraging just a single monocular
+camera without depth information, such as regular smartphone recordings.
+Unfortunately, existing reconstruction methods are either unphysical and noisy
+or slow in optimization. To solve this problem, we propose a novel SfT
+reconstruction algorithm for cloth using a pre-trained neural surrogate model
+that is fast to evaluate, stable, and produces smooth reconstructions due to a
+regularizing physics simulation. Differentiable rendering of the simulated mesh
+enables pixel-wise comparisons between the reconstruction and a target video
+sequence that can be used for a gradient-based optimization procedure to
+extract not only shape information but also physical parameters such as
+stretching, shearing, or bending stiffness of the cloth. This allows us to
+retain a precise, stable, and smooth reconstructed geometry while reducing the
+runtime by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art
+physics-based SfT approach.
+ 
+
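+ 
+ A schematic sketch of the optimization loop described above (differentiable rendering of a
+surrogate-simulated mesh, pixel-wise loss, gradients into the physical parameters). The
+surrogate, renderer, and parameter names below are placeholders, not the authors' API:
+ 
+ import torch
+ 
+ def surrogate(params):                    # stand-in for the pre-trained cloth surrogate
+     return params.repeat(100, 1)          # dummy (100, 3) "mesh" that depends on params
+ 
+ def render(mesh):                         # stand-in for the differentiable renderer
+     return mesh.mean(dim=0).expand(64, 64, 3)
+ 
+ target_frame = torch.zeros(64, 64, 3)     # one frame of the target video
+ phys_params = torch.tensor([1.0, 1.0, 1.0], requires_grad=True)  # stretch/shear/bend stiffness
+ opt = torch.optim.Adam([phys_params], lr=1e-2)
+ 
+ for step in range(200):
+     opt.zero_grad()
+     mesh = surrogate(phys_params)                         # fast, stable surrogate simulation
+     loss = ((render(mesh) - target_frame) ** 2).mean()    # pixel-wise comparison
+     loss.backward()                                       # gradients flow through renderer + surrogate
+     opt.step()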
+
+
+
+
+ + ♻ ☆ Towards Variable and Coordinated Holistic Co-Speech Motion Generation CVPR 2024 + + +
+ This paper addresses the problem of generating lifelike holistic co-speech +motions for 3D avatars, focusing on two key aspects: variability and +coordination. Variability allows the avatar to exhibit a wide range of motions +even with similar speech content, while coordination ensures a harmonious +alignment among facial expressions, hand gestures, and body poses. We aim to +achieve both with ProbTalk, a unified probabilistic framework designed to +jointly model facial, hand, and body movements in speech. ProbTalk builds on +the variational autoencoder (VAE) architecture and incorporates three core +designs. First, we introduce product quantization (PQ) to the VAE, which +enriches the representation of complex holistic motion. Second, we devise a +novel non-autoregressive model that embeds 2D positional encoding into the +product-quantized representation, thereby preserving essential structure +information of the PQ codes. Last, we employ a secondary stage to refine the +preliminary prediction, further sharpening the high-frequency details. Coupling +these three designs enables ProbTalk to generate natural and diverse holistic +co-speech motions, outperforming several state-of-the-art methods in +qualitative and quantitative evaluations, particularly in terms of realism. Our +code and model will be released for research purposes at +https://feifeifeiliu.github.io/probtalk/. + +
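+ 
+ Product quantization (PQ) itself, independent of the rest of ProbTalk, can be sketched as
+follows: the latent vector is split into sub-vectors and each sub-vector is snapped to its
+nearest codeword in a small per-sub-space codebook. Codebook sizes and names are illustrative:
+ 
+ import torch
+ 
+ def product_quantize(z, codebooks):
+     # z: (B, D) latents; codebooks: list of m tensors of shape (K, D/m)
+     chunks = z.chunk(len(codebooks), dim=1)          # split D into m sub-vectors
+     quantized, codes = [], []
+     for sub, book in zip(chunks, codebooks):
+         idx = torch.cdist(sub, book).argmin(dim=1)   # nearest codeword per sample
+         quantized.append(book[idx])
+         codes.append(idx)
+     return torch.cat(quantized, dim=1), torch.stack(codes, dim=1)
+ 
+ z = torch.randn(4, 8)                                # toy: D=8, m=2 sub-spaces, K=16 codewords
+ z_q, codes = product_quantize(z, [torch.randn(16, 4), torch.randn(16, 4)])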
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dancing with Still Images: Video Distillation via Static-Dynamic + Disentanglement CVPR 2024 + + +
+ Recently, dataset distillation has paved the way towards efficient machine +learning, especially for image datasets. However, the distillation for videos, +characterized by an exclusive temporal dimension, remains an underexplored +domain. In this work, we provide the first systematic study of video +distillation and introduce a taxonomy to categorize temporal compression. Our +investigation reveals that the temporal information is usually not well learned +during distillation, and the temporal dimension of synthetic data contributes +little. The observations motivate our unified framework of disentangling the +dynamic and static information in the videos. It first distills the videos into +still images as static memory and then compensates the dynamic and motion +information with a learnable dynamic memory block. Our method achieves +state-of-the-art on video datasets at different scales, with a notably smaller +memory storage budget. Our code is available at +https://github.com/yuz1wan/video_distillation. + +
+
+ comment: CVPR 2024, project page: https://mvig-rhos.com/video-distill +
+
+
+
+
+ + ♻ ☆ A Survey of Neural Network Robustness Assessment in Image Recognition + + +
+ In recent years, there has been significant attention given to the robustness +assessment of neural networks. Robustness plays a critical role in ensuring +reliable operation of artificial intelligence (AI) systems in complex and +uncertain environments. Deep learning's robustness problem is particularly +significant, highlighted by the discovery of adversarial attacks on image +classification models. Researchers have dedicated efforts to evaluate +robustness in diverse perturbation conditions for image recognition tasks. +Robustness assessment encompasses two main techniques: robustness verification/ +certification for deliberate adversarial attacks and robustness testing for +random data corruptions. In this survey, we present a detailed examination of +both adversarial robustness (AR) and corruption robustness (CR) in neural +network assessment. Analyzing current research papers and standards, we provide +an extensive overview of robustness assessment in image recognition. Three +essential aspects are analyzed: concepts, metrics, and assessment methods. We +investigate the perturbation metrics and range representations used to measure +the degree of perturbations on images, as well as the robustness metrics +specifically for the robustness conditions of classification models. The +strengths and limitations of the existing methods are also discussed, and some +potential directions for future research are provided. + +
+
+ comment: Corrected typos and grammatical errors in Section 5 +
+
+
+
+
+ + ♻ ☆ SyncDreamer: Generating Multiview-consistent Images from a Single-view + Image ICLR 2024 + + +
+ In this paper, we present a novel diffusion model called SyncDreamer that
+generates multiview-consistent images from a single-view image. Using
+pretrained large-scale 2D diffusion models, the recent work Zero123
+demonstrates the ability to generate plausible novel views from a single-view
+image of an object. However, maintaining consistency in geometry and colors for
+the generated images remains a challenge. To address this issue, we propose a
+synchronized multiview diffusion model that models the joint probability
+distribution of multiview images, enabling the generation of
+multiview-consistent images in a single reverse process. SyncDreamer
+synchronizes the intermediate states of all the generated images at every step
+of the reverse process through a 3D-aware feature attention mechanism that
+correlates the corresponding features across different views. Experiments show
+that SyncDreamer generates images with high consistency across different views,
+thus making it well-suited for various 3D generation tasks such as
+novel-view-synthesis, text-to-3D, and image-to-3D.
+ 
+
+
+ comment: ICLR 2024 Spotlight. Project page: + https://liuyuan-pal.github.io/SyncDreamer/ Code: + https://github.com/liuyuan-pal/SyncDreamer +
+
+
+
+
+ + ♻ ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected
+during visual stimuli has made significant strides in the past decade, thanks
+to the availability of extensive fMRI datasets and advancements in generative
+models for image generation. However, the application of visual reconstruction
+has remained limited. Reconstructing visual imagination presents a greater
+challenge, with potentially revolutionary applications ranging from aiding
+individuals with disabilities to verifying witness accounts in court. The
+primary hurdles in this field are the absence of data collection protocols for
+visual imagery and the lack of datasets on the subject. Traditionally,
+fMRI-to-image relies on data collected from subjects exposed to visual stimuli,
+which is problematic for generating visual imagery, given the difference in
+brain activity between visual stimulation and visual imagery. For the first
+time, we have compiled a substantial dataset (around 6h of scans) on visual
+imagery along with a proposed data collection protocol. We then train a
+modified version of an fMRI-to-image model and demonstrate the feasibility of
+reconstructing images from two modes of imagination: from memory and from pure
+imagination. This marks an important step towards creating a technology that
+allows direct reconstruction of visual imagery.
+ 
+
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ♻ ☆ Are NeRFs ready for autonomous driving? Towards closing the + real-to-simulation gap CVPR 2024 + + +
+ Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing
+autonomous driving (AD) research, offering scalable closed-loop simulation and
+data augmentation capabilities. However, to trust the results achieved in
+simulation, one needs to ensure that AD systems perceive real and rendered data
+in the same way. Although the performance of rendering methods is increasing,
+many scenarios will remain inherently challenging to reconstruct faithfully. To
+this end, we propose a novel perspective for addressing the real-to-simulated
+data gap. Rather than solely focusing on improving rendering fidelity, we
+explore simple yet effective methods to enhance perception model robustness to
+NeRF artifacts without compromising performance on real data. Moreover, we
+conduct the first large-scale investigation into the real-to-simulated data gap
+in an AD setting using a state-of-the-art neural rendering technique.
+Specifically, we evaluate object detectors and an online mapping model on real
+and simulated data, and study the effects of different fine-tuning strategies.
+Our results show notable improvements in model robustness to simulated data,
+even improving real-world performance in some cases. Lastly, we delve into the
+correlation between the real-to-simulated gap and image reconstruction metrics,
+identifying FID and LPIPS as strong indicators. See
+https://research.zenseact.com/publications/closing-real2sim-gap for our project
+page.
+ 
+
+
+ comment: Accepted at Workshop on Autonomous Driving, CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Geometrically-driven Aggregation for Zero-shot 3D Point Cloud + Understanding CVPR 2024 + + +
+ Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language
+Models (VLMs). Existing strategies directly map Vision-Language Models from 2D
+pixels of rendered or captured views to 3D points, overlooking the inherent and
+expressible point cloud geometric structure. Geometrically similar or close
+regions can be exploited for bolstering point cloud understanding as they are
+likely to share semantic information. To this end, we introduce the first
+training-free aggregation technique that leverages the point cloud's 3D
+geometric structure to improve the quality of the transferred Vision-Language
+Models. Our approach operates iteratively, performing local-to-global
+aggregation based on geometric and semantic point-level reasoning. We benchmark
+our approach on three downstream tasks, including classification, part
+segmentation, and semantic segmentation, with a variety of datasets
+representing both synthetic/real-world and indoor/outdoor scenarios. Our
+approach achieves new state-of-the-art results in all benchmarks. Code and
+dataset are available at https://luigiriz.github.io/geoze-website/
+ 
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincar\'e ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
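+ 
+ The flat baseline above infers parent-class scores from its children rather than predicting
+them directly. A minimal sketch of that aggregation step, assuming a per-pixel softmax over
+leaf classes and a child-to-parent mapping (names are illustrative):
+ 
+ import torch
+ 
+ def parent_probs_from_children(leaf_logits, parent_of):
+     # leaf_logits: (B, C, H, W); parent_of[c] = parent id of leaf class c
+     leaf_p = leaf_logits.softmax(dim=1)
+     num_parents = max(parent_of) + 1
+     parent_p = leaf_p.new_zeros(leaf_p.shape[0], num_parents, *leaf_p.shape[2:])
+     for child, parent in enumerate(parent_of):
+         parent_p[:, parent] += leaf_p[:, child]      # parent probability = sum of its children
+     return parent_p
+ 
+ probs = parent_probs_from_children(torch.randn(1, 4, 8, 8), parent_of=[0, 0, 1, 1])  # (1, 2, 8, 8)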
+
+
+
+
+ + ♻ ☆ TTK is Getting MPI-Ready + + +
+ This system paper documents the technical foundations for the extension of +the Topology ToolKit (TTK) to distributed-memory parallelism with the Message +Passing Interface (MPI). While several recent papers introduced topology-based +approaches for distributed-memory environments, these were reporting +experiments obtained with tailored, mono-algorithm implementations. In +contrast, we describe in this paper a versatile approach (supporting both +triangulated domains and regular grids) for the support of topological analysis +pipelines, i.e. a sequence of topological algorithms interacting together. +While developing this extension, we faced several algorithmic and software +engineering challenges, which we document in this paper. We describe an MPI +extension of TTK's data structure for triangulation representation and +traversal, a central component to the global performance and generality of +TTK's topological implementations. We also introduce an intermediate interface +between TTK and MPI, both at the global pipeline level, and at the fine-grain +algorithmic level. We provide a taxonomy for the distributed-memory topological +algorithms supported by TTK, depending on their communication needs and provide +examples of hybrid MPI+thread parallelizations. Performance analyses show that +parallel efficiencies range from 20% to 80% (depending on the algorithms), and +that the MPI-specific preconditioning introduced by our framework induces a +negligible computation time overhead. We illustrate the new distributed-memory +capabilities of TTK with an example of advanced analysis pipeline, combining +multiple algorithms, run on the largest publicly available dataset we have +found (120 billion vertices) on a cluster with 64 nodes (for a total of 1536 +cores). Finally, we provide a roadmap for the completion of TTK's MPI +extension, along with generic recommendations for each algorithm communication +category. + +
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ CrossKD: Cross-Head Knowledge Distillation for Object Detection + + +
+ Knowledge Distillation (KD) has been validated as an effective model +compression technique for learning compact object detectors. Existing +state-of-the-art KD methods for object detection are mostly based on feature +imitation. In this paper, we present a general and effective prediction +mimicking distillation scheme, called CrossKD, which delivers the intermediate +features of the student's detection head to the teacher's detection head. The +resulting cross-head predictions are then forced to mimic the teacher's +predictions. This manner relieves the student's head from receiving +contradictory supervision signals from the annotations and the teacher's +predictions, greatly improving the student's detection performance. Moreover, +as mimicking the teacher's predictions is the target of KD, CrossKD offers more +task-oriented information in contrast with feature imitation. On MS COCO, with +only prediction mimicking losses applied, our CrossKD boosts the average +precision of GFL ResNet-50 with 1x training schedule from 40.2 to 43.7, +outperforming all existing KD methods. In addition, our method also works well +when distilling detectors with heterogeneous backbones. Code is available at +https://github.com/jbwang1997/CrossKD. + +
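+ 
+ A schematic sketch of the cross-head idea described above: the student's intermediate head
+features are passed through the (frozen) teacher head, and the resulting cross-head
+predictions are trained to mimic the teacher's own predictions. The KL mimicking loss and
+module names are illustrative stand-ins:
+ 
+ import torch
+ import torch.nn.functional as F
+ 
+ def crosskd_loss(student_feat, teacher_head, teacher_pred, T=1.0):
+     # student_feat: intermediate features from the student's detection head
+     # teacher_head: remaining (frozen) layers of the teacher's head
+     # teacher_pred: the teacher's own logits, detached from the graph
+     cross_pred = teacher_head(student_feat)                  # cross-head predictions
+     return F.kl_div(F.log_softmax(cross_pred / T, dim=-1),
+                     F.softmax(teacher_pred.detach() / T, dim=-1),
+                     reduction="batchmean") * (T * T)
+ 
+ teacher_head = torch.nn.Linear(16, 80)                       # toy stand-in for a class branch
+ for p in teacher_head.parameters():
+     p.requires_grad_(False)
+ loss = crosskd_loss(torch.randn(8, 16, requires_grad=True), teacher_head, torch.randn(8, 80))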
+
+
+
+
+ + ♻ ☆ Z-GMOT: Zero-shot Generic Multiple Object Tracking NAACL 2024 + + +
+ Despite recent significant progress, Multi-Object Tracking (MOT) faces +limitations such as reliance on prior knowledge and predefined categories and +struggles with unseen objects. To address these issues, Generic Multiple Object +Tracking (GMOT) has emerged as an alternative approach, requiring less prior +information. However, current GMOT methods often rely on initial bounding boxes +and struggle to handle variations in factors such as viewpoint, lighting, +occlusion, and scale, among others. Our contributions commence with the +introduction of the \textit{Referring GMOT dataset} a collection of videos, +each accompanied by detailed textual descriptions of their attributes. +Subsequently, we propose $\mathtt{Z-GMOT}$, a cutting-edge tracking solution +capable of tracking objects from \textit{never-seen categories} without the +need of initial bounding boxes or predefined categories. Within our +$\mathtt{Z-GMOT}$ framework, we introduce two novel components: (i) +$\mathtt{iGLIP}$, an improved Grounded language-image pretraining, for +accurately detecting unseen objects with specific characteristics. (ii) +$\mathtt{MA-SORT}$, a novel object association approach that adeptly integrates +motion and appearance-based matching strategies to tackle the complex task of +tracking objects with high similarity. Our contributions are benchmarked +through extensive experiments conducted on the Referring GMOT dataset for GMOT +task. Additionally, to assess the generalizability of the proposed +$\mathtt{Z-GMOT}$, we conduct ablation studies on the DanceTrack and MOT20 +datasets for the MOT task. Our dataset, code, and models are released at: +https://fsoft-aic.github.io/Z-GMOT. + +
+
+ comment: Accepted to NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Stimulating the Diffusion Model for Image Denoising via Adaptive + Embedding and Ensembling + + +
+ Image denoising is a fundamental problem in computational photography, where
+achieving high perceptual quality with low distortion is highly demanding.
+Current methods either struggle with perceptual quality or suffer from
+significant distortion. Recently, the emerging diffusion model has achieved
+state-of-the-art performance in various tasks and demonstrates great potential
+for image denoising. However, stimulating diffusion models for image denoising
+is not straightforward and requires solving several critical problems. For one
+thing, the input inconsistency hinders the connection between diffusion models
+and image denoising. For another, the content inconsistency between the
+generated image and the desired denoised image introduces distortion. To tackle
+these problems, we present a novel strategy called the Diffusion Model for
+Image Denoising (DMID) by understanding and rethinking the diffusion model from
+a denoising perspective. Our DMID strategy includes an adaptive embedding
+method that embeds the noisy image into a pre-trained unconditional diffusion
+model and an adaptive ensembling method that reduces distortion in the denoised
+image. Our DMID strategy achieves state-of-the-art performance on both
+distortion-based and perception-based metrics, for both Gaussian and real-world
+image denoising. The code is available at https://github.com/Li-Tong-621/DMID.
+ 
+
+
+ comment: 18 pages,15 figures +
+
+
+
+
+ + ♻ ☆ Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality + Metrics + + +
+ Recent advances in text-to-image synthesis enabled through a combination of +language and vision foundation models have led to a proliferation of the tools +available and an increased attention to the field. When conducting +text-to-image synthesis, a central goal is to ensure that the content between +text and image is aligned. As such, there exist numerous evaluation metrics +that aim to mimic human judgement. However, it is often unclear which metric to +use for evaluating text-to-image synthesis systems as their evaluation is +highly nuanced. In this work, we provide a comprehensive overview of existing +text-to-image evaluation metrics. Based on our findings, we propose a new +taxonomy for categorizing these metrics. Our taxonomy is grounded in the +assumption that there are two main quality criteria, namely compositionality +and generality, which ideally map to human preferences. Ultimately, we derive +guidelines for practitioners conducting text-to-image evaluation, discuss open +challenges of evaluation mechanisms, and surface limitations of current +metrics. + +
+
+ comment: preprint, 20 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ PEAN: A Diffusion-Based Prior-Enhanced Attention Network for Scene Text + Image Super-Resolution + + +
+ Scene text image super-resolution (STISR) aims at simultaneously increasing +the resolution and readability of low-resolution scene text images, thus +boosting the performance of the downstream recognition task. Two factors in +scene text images, visual structure and semantic information, affect the +recognition performance significantly. To mitigate the effects from these +factors, this paper proposes a Prior-Enhanced Attention Network (PEAN). +Specifically, an attention-based modulation module is leveraged to understand +scene text images by neatly perceiving the local and global dependence of +images, despite the shape of the text. Meanwhile, a diffusion-based module is +developed to enhance the text prior, hence offering better guidance for the SR +network to generate SR images with higher semantic accuracy. Additionally, a +multi-task learning paradigm is employed to optimize the network, enabling the +model to generate legible SR images. As a result, PEAN establishes new SOTA +results on the TextZoom benchmark. Experiments are also conducted to analyze +the importance of the enhanced text prior as a means of improving the +performance of the SR network. Code will be made available at +https://github.com/jdfxzzy/PEAN. + +
+
+
+
+
+ + ♻ ☆ Do More With What You Have: Transferring Depth-Scale from Labeled to + Unlabeled Domains + + +
+ Transferring the absolute depth prediction capabilities of an estimator to a
+new domain is a task with significant real-world applications. This task is
+specifically challenging when images from the new domain are collected without
+ground-truth depth measurements, and possibly with sensors of different
+intrinsics. To overcome such limitations, a recent zero-shot solution was
+trained on an extensive training dataset and encoded the various camera
+intrinsics. Other solutions generated synthetic data with depth labels that
+matched the intrinsics of the new target data to enable depth-scale transfer
+between the domains.
+ In this work, we present an alternative solution that can utilize any existing
+synthetic or real dataset that has a small number of images annotated with
+ground-truth depth labels. Specifically, we show that self-supervised depth
+estimators result in up-to-scale predictions that are linearly correlated to
+their absolute depth values across the domain, a property that we model in this
+work using a single scalar. In addition, aligning the field-of-view of two
+datasets prior to training results in a common linear relationship for both
+domains. We use this observed property to transfer the depth-scale from source
+datasets that have absolute depth labels to new target datasets that lack these
+measurements, enabling absolute depth predictions in the target domain.
+ The suggested method was successfully demonstrated on the KITTI, DDAD and
+nuScenes datasets, while using other existing real or synthetic source datasets
+that have a different field-of-view, image style, or structural content,
+achieving comparable or better accuracy than other existing methods that do not
+use target ground-truth depths.
+ 
+
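+ 
+ The single-scalar depth-scale mentioned above can be estimated in closed form by least
+squares on the few labeled pixels and then reused on the unlabeled target domain (assuming
+field-of-view alignment has already been applied); a minimal sketch:
+ 
+ import numpy as np
+ 
+ def fit_depth_scale(pred_up_to_scale, gt_abs_depth):
+     # least-squares scalar s minimizing ||s * pred - gt||^2 over valid labeled pixels
+     p, g = pred_up_to_scale.ravel(), gt_abs_depth.ravel()
+     valid = g > 0                                    # ignore pixels without ground truth
+     return float(p[valid] @ g[valid]) / float(p[valid] @ p[valid])
+ 
+ s = fit_depth_scale(np.random.rand(10, 10) + 0.5, 10 * np.random.rand(10, 10) + 1)
+ absolute_target_depth = s * (np.random.rand(10, 10) + 0.5)   # apply to target-domain predictions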
+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models +like neural networks by generating explanations for their predictions. +Explanations often take the form of a heatmap identifying input features (e.g. +pixels) that are relevant to the model's decision. These explanations, however, +entangle the potentially multiple factors that enter into the overall complex +decision strategy. We propose to disentangle explanations by extracting at some +intermediate layer of a neural network, subspaces that capture the multiple and +distinct activation patterns (e.g. visual concepts) that are relevant to the +prediction. To automatically extract these subspaces, we propose two new +analyses, extending principles found in PCA or ICA to explanations. These novel +analyses, which we call principal relevant component analysis (PRCA) and +disentangled relevant subspace analysis (DRSA), maximize relevance instead of +e.g. variance or kurtosis. This allows for a much stronger focus of the +analysis on what the ML model actually uses for predicting, ignoring +activations or concepts to which the model is invariant. Our approach is +general enough to work alongside common attribution techniques such as Shapley +Value, Integrated Gradients, or LRP. Our proposed methods show to be +practically useful and compare favorably to the state of the art as +demonstrated on benchmarks and three use cases. + +
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ CF-Font: Content Fusion for Few-shot Font Generation CVPR 2023 + + +
+ Content and style disentanglement is an effective way to achieve few-shot
+font generation. It allows the style of a font image in a source domain to be
+transferred to the style defined by a few reference images in a target domain.
+However, the content feature extracted using a representative font might not be
+optimal. In light of this, we propose a content fusion module (CFM) to project
+the content feature into a linear space defined by the content features of
+basis fonts, which can take the variation of content features caused by
+different fonts into consideration. Our method also allows us to optimize the
+style representation vector of reference images through a lightweight iterative
+style-vector refinement (ISR) strategy. Moreover, we treat the 1D projection of
+a character image as a probability distribution and leverage the distance
+between two distributions as the reconstruction loss (namely projected
+character loss, PCL). Compared to L2 or L1 reconstruction loss, the
+distribution distance pays more attention to the global shape of characters. We
+have evaluated our method on a dataset of 300 fonts with 6.5k characters each.
+Experimental results verify that our method outperforms existing
+state-of-the-art few-shot font generation methods by a large margin. The source
+code can be found at https://github.com/wangchi95/CF-Font.
+ 
+
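+ 
+ The projected character loss (PCL) treats a 1D projection of the character image as a
+probability distribution; below is a minimal sketch under that description, using the L1
+distance between the projections' CDFs (a 1D Wasserstein-style distance) as the distribution
+distance, which may differ from the paper's exact choice:
+ 
+ import torch
+ 
+ def projected_character_loss(img_a, img_b, dim=-1, eps=1e-8):
+     # img_*: (B, H, W) non-negative character images; project along one axis,
+     # normalize into distributions, then compare cumulative distributions
+     pa, pb = img_a.sum(dim=dim), img_b.sum(dim=dim)
+     pa = pa / (pa.sum(dim=-1, keepdim=True) + eps)
+     pb = pb / (pb.sum(dim=-1, keepdim=True) + eps)
+     return (pa.cumsum(-1) - pb.cumsum(-1)).abs().mean()
+ 
+ loss = projected_character_loss(torch.rand(2, 32, 32), torch.rand(2, 32, 32))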
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ ASH: Animatable Gaussian Splats for Efficient and Photoreal Human + Rendering + + +
+ Real-time rendering of photorealistic and controllable human avatars stands +as a cornerstone in Computer Vision and Graphics. While recent advances in +neural implicit rendering have unlocked unprecedented photorealism for digital +avatars, real-time performance has mostly been demonstrated for static scenes +only. To address this, we propose ASH, an animatable Gaussian splatting +approach for photorealistic rendering of dynamic humans in real-time. We +parameterize the clothed human as animatable 3D Gaussians, which can be +efficiently splatted into image space to generate the final rendering. However, +naively learning the Gaussian parameters in 3D space poses a severe challenge +in terms of compute. Instead, we attach the Gaussians onto a deformable +character model, and learn their parameters in 2D texture space, which allows +leveraging efficient 2D convolutional architectures that easily scale with the +required number of Gaussians. We benchmark ASH with competing methods on +pose-controllable avatars, demonstrating that our method outperforms existing +real-time methods by a large margin and shows comparable or even better results +than offline methods. + +
+
+ comment: For project page, see https://vcai.mpi-inf.mpg.de/projects/ash/ +
+
+
+
+
+ + ♻ ☆ Text-Driven Traffic Anomaly Detection with Temporal High-Frequency + Modeling in Driving Videos + + +
+ Traffic anomaly detection (TAD) in driving videos is critical for ensuring +the safety of autonomous driving and advanced driver assistance systems. +Previous single-stage TAD methods primarily rely on frame prediction, making +them vulnerable to interference from dynamic backgrounds induced by the rapid +movement of the dashboard camera. While two-stage TAD methods appear to be a +natural solution to mitigate such interference by pre-extracting +background-independent features (such as bounding boxes and optical flow) using +perceptual algorithms, they are susceptible to the performance of first-stage +perceptual algorithms and may result in error propagation. In this paper, we +introduce TTHF, a novel single-stage method aligning video clips with text +prompts, offering a new perspective on traffic anomaly detection. Unlike +previous approaches, the supervised signal of our method is derived from +languages rather than orthogonal one-hot vectors, providing a more +comprehensive representation. Further, concerning visual representation, we +propose to model the high frequency of driving videos in the temporal domain. +This modeling captures the dynamic changes of driving scenes, enhances the +perception of driving behavior, and significantly improves the detection of +traffic anomalies. In addition, to better perceive various types of traffic +anomalies, we carefully design an attentive anomaly focusing mechanism that +visually and linguistically guides the model to adaptively focus on the visual +context of interest, thereby facilitating the detection of traffic anomalies. +It is shown that our proposed TTHF achieves promising performance, +outperforming state-of-the-art competitors by +5.4% AUC on the DoTA dataset and +achieving high generalization on the DADA dataset. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Few Shot Part Segmentation Reveals Compositional Logic for Industrial + Anomaly Detection AAAI2024 + + +
+ Logical anomalies (LA) refer to data violating underlying logical constraints,
+e.g., the quantity, arrangement, or composition of components within an image.
+Accurately detecting such anomalies requires models to reason about various
+component types through segmentation. However, curation of pixel-level
+annotations for semantic segmentation is both time-consuming and expensive.
+Although there are some prior few-shot or unsupervised co-part segmentation
+algorithms, they often fail on images of industrial objects. These images have
+components with similar textures and shapes, and a precise differentiation
+proves challenging. In this study, we introduce a novel component segmentation
+model for LA detection that leverages a few labeled samples and unlabeled
+images sharing logical constraints. To ensure consistent segmentation across
+unlabeled images, we employ a histogram matching loss in conjunction with an
+entropy loss. As segmentation predictions play a crucial role, we propose to
+enhance both local and global sample validity detection by capturing key
+aspects from visual semantics via three memory banks: class histograms,
+component composition embeddings and patch-level representations. For effective
+LA detection, we propose an adaptive scaling strategy to standardize anomaly
+scores from different memory banks at inference time. Extensive experiments on
+the public benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA
+detection vs. 89.6% from competing methods.
+ 
+
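+ 
+ One way a histogram matching loss of the kind mentioned above can be formed is to compare
+soft class histograms of two images that share the same logical constraints; this is an
+interpretation for illustration, not the authors' code:
+ 
+ import torch
+ 
+ def soft_class_histogram(seg_logits):
+     # seg_logits: (B, C, H, W) -> (B, C) fraction of pixels assigned to each component class
+     return seg_logits.softmax(dim=1).mean(dim=(2, 3))
+ 
+ def histogram_matching_loss(logits_a, logits_b):
+     # images obeying the same logical constraints should have matching component histograms
+     return (soft_class_histogram(logits_a) - soft_class_histogram(logits_b)).abs().mean()
+ 
+ loss = histogram_matching_loss(torch.randn(2, 5, 64, 64), torch.randn(2, 5, 64, 64))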
+
+ comment: Accepted in AAAI2024 +
+
+
+
+
+ + ♻ ☆ Adapting Short-Term Transformers for Action Detection in Untrimmed + Videos CVPR2024 + + +
+ Vision Transformer (ViT) has shown high potential in video recognition, owing
+to its flexible design, adaptable self-attention mechanisms, and the efficacy
+of masked pre-training. Yet, it remains unclear how to adapt these pre-trained
+short-term ViTs for temporal action detection (TAD) in untrimmed videos. The
+existing works treat them as off-the-shelf feature extractors for each
+short-trimmed snippet without capturing the fine-grained relation among
+different snippets in a broader temporal context. To mitigate this issue, this
+paper focuses on designing a new mechanism for adapting these pre-trained ViT
+models as a unified long-form video transformer to fully unleash its modeling
+power in capturing inter-snippet relation, while still keeping low computation
+overhead and memory consumption for efficient TAD. To this end, we design
+effective cross-snippet propagation modules to gradually exchange short-term
+video information among different snippets from two levels. For inner-backbone
+information propagation, we introduce a cross-snippet propagation strategy to
+enable multi-snippet temporal feature interaction inside the backbone. For
+post-backbone information propagation, we propose temporal transformer layers
+for further clip-level modeling. With the plain ViT-B pre-trained with
+VideoMAE, our end-to-end temporal action detector (ViT-TAD) yields very
+competitive performance compared to previous temporal action detectors,
+reaching up to 69.5 average mAP on THUMOS14, 37.40 average mAP on
+ActivityNet-1.3 and 17.20 average mAP on FineAction.
+ 
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ On the Road to Portability: Compressing End-to-End Motion Planner for + Autonomous Driving CVPR 2024 + + +
+ End-to-end motion planning models equipped with deep neural networks have
+shown great potential for enabling full autonomous driving. However, the
+oversized neural networks render them impractical for deployment on
+resource-constrained systems, which unavoidably requires more computational
+time and resources during inference. To handle this, knowledge distillation
+offers a promising approach that compresses models by enabling a smaller
+student model to learn from a larger teacher model. Nevertheless, how to apply
+knowledge distillation to compress motion planners has not been explored so
+far. In this paper, we propose PlanKD, the first knowledge distillation
+framework tailored for compressing end-to-end motion planners. First,
+considering that driving scenes are inherently complex, often containing
+planning-irrelevant or even noisy information, transferring such information is
+not beneficial for the student planner. Thus, we design an information
+bottleneck based strategy to only distill planning-relevant information, rather
+than transfer all information indiscriminately. Second, different waypoints in
+an output planned trajectory may hold varying degrees of importance for motion
+planning, where a slight deviation in certain crucial waypoints might lead to a
+collision. Therefore, we devise a safety-aware waypoint-attentive distillation
+module that assigns adaptive weights to different waypoints based on their
+importance, to encourage the student to accurately mimic more crucial
+waypoints, thereby improving overall safety. Experiments demonstrate that our
+PlanKD can boost the performance of smaller planners by a large margin, and
+significantly reduce their inference time.
+ 
+
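+ 
+ A schematic sketch of a waypoint-attentive imitation loss of the kind described above,
+assuming per-waypoint importance weights are already available (how PlanKD computes them is
+the paper's contribution and is not reproduced here):
+ 
+ import torch
+ 
+ def waypoint_attentive_distill_loss(student_wp, teacher_wp, importance):
+     # student_wp, teacher_wp: (B, T, 2) planned waypoints
+     # importance: (B, T) adaptive weights, e.g. larger near safety-critical waypoints
+     w = importance / importance.sum(dim=1, keepdim=True).clamp(min=1e-8)
+     per_wp = ((student_wp - teacher_wp) ** 2).sum(dim=-1)    # (B, T) squared errors
+     return (w * per_wp).sum(dim=1).mean()
+ 
+ loss = waypoint_attentive_distill_loss(
+     torch.randn(4, 10, 2), torch.randn(4, 10, 2), torch.rand(4, 10))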
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Maintaining User Trust Through Multistage Uncertainty Aware Inference + + +
+ This paper describes and evaluates a multistage approach to AI deployment.
+Each stage involves a more accurate method of inference, yet engaging each
+comes with an increasing cost. In outlining the architecture, we present a
+method for quantifying model uncertainty that facilitates confident deferral
+decisions. The architecture is currently under active deployment to thousands
+of cotton farmers across India. The broader idea, however, is applicable to a
+growing sector of AI deployments in challenging low-resource settings.
+ 
+
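+ 
+ A minimal sketch of a multistage cascade with confidence-based deferral as described above;
+the confidence measure (maximum softmax probability) and the thresholds are illustrative
+stand-ins for the paper's uncertainty quantification:
+ 
+ import numpy as np
+ 
+ def cascade_predict(x, stages, thresholds):
+     # stages: list of (predict_proba, cost) pairs ordered cheap -> accurate
+     # thresholds: per-stage confidence required to stop (use 0.0 for the final stage)
+     total_cost = 0.0
+     for (predict_proba, cost), thr in zip(stages, thresholds):
+         probs = predict_proba(x)
+         total_cost += cost
+         if probs.max() >= thr:                       # confident enough: answer here
+             return int(probs.argmax()), total_cost
+     return int(probs.argmax()), total_cost           # fall-through: keep the last answer
+ 
+ cheap = (lambda x: np.array([0.55, 0.45]), 1.0)      # toy low-cost model
+ accurate = (lambda x: np.array([0.05, 0.95]), 10.0)  # toy expensive model
+ label, cost = cascade_predict(None, [cheap, accurate], thresholds=[0.9, 0.0])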
+
+
+
+
+ + ♻ ☆ Robust image segmentation model based on binary level set SC + + +
+ In order to improve the robustness of traditional image segmentation models +to noise, this paper models the illumination term in intensity inhomogeneity +images. Additionally, to enhance the model's robustness to noisy images, we +incorporate the binary level set model into the proposed model. Compared to the +traditional level set, the binary level set eliminates the need for continuous +reinitialization. Moreover, by introducing the variational operator GL, our +model demonstrates better capability in segmenting noisy images. Finally, we +employ the three-step splitting operator method for solving, and the +effectiveness of the proposed model is demonstrated on various images. + +
+
+ comment: SCI +
+
+
+
+
+ + ♻ ☆ LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and + Image Classification + + +
+ The fusion of hyperspectral and LiDAR data has been an active research topic. +Existing fusion methods have ignored the high-dimensionality and redundancy +challenges in hyperspectral images, despite that band selection methods have +been intensively studied for hyperspectral image (HSI) processing. This paper +addresses this significant gap by introducing a cross-attention mechanism from +the transformer architecture for the selection of HSI bands guided by LiDAR +data. LiDAR provides high-resolution vertical structural information, which can +be useful in distinguishing different types of land cover that may have similar +spectral signatures but different structural profiles. In our approach, the +LiDAR data are used as the "query" to search and identify the "key" from the +HSI to choose the most pertinent bands for LiDAR. This method ensures that the +selected HSI bands drastically reduce redundancy and computational requirements +while working optimally with the LiDAR data. Extensive experiments have been +undertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and +MUUFL. The results highlight the superiority of the cross-attention mechanism, +underlining the enhanced classification accuracy of the identified HSI bands +when fused with the LiDAR features. The results also show that the use of fewer +bands combined with LiDAR surpasses the performance of state-of-the-art fusion +models. + +
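+ 
+ A schematic sketch of the "LiDAR as query, HSI as key" scoring described above; the
+projection dimensions, pooling, and top-k selection rule are illustrative (in practice the
+projections would be learned jointly with the rest of the network):
+ 
+ import torch
+ 
+ def lidar_guided_band_scores(lidar_feat, hsi_band_feats, d_model=64):
+     # lidar_feat: (B, D_l) pooled LiDAR features; hsi_band_feats: (B, N_bands, D_h)
+     q_proj = torch.nn.Linear(lidar_feat.shape[-1], d_model)
+     k_proj = torch.nn.Linear(hsi_band_feats.shape[-1], d_model)
+     q = q_proj(lidar_feat).unsqueeze(1)                              # (B, 1, d)
+     k = k_proj(hsi_band_feats)                                       # (B, N, d)
+     attn = (q @ k.transpose(1, 2) / d_model ** 0.5).softmax(dim=-1)  # (B, 1, N)
+     return attn.squeeze(1).mean(dim=0)                               # per-band score
+ 
+ scores = lidar_guided_band_scores(torch.randn(8, 32), torch.randn(8, 144, 16))
+ selected_bands = scores.topk(30).indices              # keep the 30 highest-scoring bands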
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics CVPR 2024 + + +
+ We introduce PhysGaussian, a new method that seamlessly integrates physically +grounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel +motion synthesis. Employing a custom Material Point Method (MPM), our approach +enriches 3D Gaussian kernels with physically meaningful kinematic deformation +and mechanical stress attributes, all evolved in line with continuum mechanics +principles. A defining characteristic of our method is the seamless integration +between physical simulation and visual rendering: both components utilize the +same 3D Gaussian kernels as their discrete representations. This negates the +necessity for triangle/tetrahedron meshing, marching cubes, "cage meshes," or +any other geometry embedding, highlighting the principle of "what you see is +what you simulate (WS$^2$)." Our method demonstrates exceptional versatility +across a wide variety of materials--including elastic entities, metals, +non-Newtonian fluids, and granular materials--showcasing its strong +capabilities in creating diverse visual content with novel viewpoints and +movements. Our project page is at: https://xpandora.github.io/PhysGaussian/ + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks ICLR2024 + + +
+ Gradient inversion attacks aim to reconstruct local training data from +intermediate gradients exposed in the federated learning framework. Despite +successful attacks, all previous methods, starting from reconstructing a single +data point and then relaxing the single-image limit to batch level, are only +tested under hard label constraints. Even for single-image reconstruction, we +still lack an analysis-based algorithm to recover augmented soft labels. In +this work, we change the focus from enlarging batchsize to investigating the +hard label constraints, considering a more realistic circumstance where label +smoothing and mixup techniques are used in the training process. In particular, +we are the first to initiate a novel algorithm to simultaneously recover the +ground-truth augmented label and the input feature of the last fully-connected +layer from single-input gradients, and provide a necessary condition for any +analytical-based label recovery methods. Extensive experiments testify to the +label recovery accuracy, as well as the benefits to the following image +reconstruction. We believe soft labels in classification tasks are worth +further attention in gradient inversion attacks. + +
+
+ comment: ICLR2024 poster +
+
+
+
+
+ + ♻ ☆ Exploring Sparse Visual Prompt for Domain Adaptive Dense Prediction AAAI 2024 + + +
+ Visual prompts have provided an efficient means of addressing visual
+cross-domain problems. In previous works, Visual Domain Prompt (VDP) first
+introduces domain prompts to tackle the classification Test-Time Adaptation
+(TTA) problem by warping image-level prompts on the input and fine-tuning
+prompts for each target domain. However, since the image-level prompts mask out
+continuous spatial details in the prompt-allocated region, they suffer from
+inaccurate contextual information and limited domain knowledge extraction,
+particularly when dealing with dense prediction TTA problems. To overcome these
+challenges, we propose a novel Sparse Visual Domain Prompts (SVDP) approach,
+which holds minimal trainable parameters (e.g., 0.1\%) in the image-level
+prompt and reserves more spatial information of the input. To better apply SVDP
+in extracting domain-specific knowledge, we introduce the Domain Prompt
+Placement (DPP) method to adaptively allocate trainable parameters of SVDP on
+the pixels with large distribution shifts. Furthermore, recognizing that each
+target domain sample exhibits a unique domain shift, we design a Domain Prompt
+Updating (DPU) strategy to optimize prompt parameters differently for each
+sample, facilitating efficient adaptation to the target domain. Extensive
+experiments were conducted on widely-used TTA and continual TTA benchmarks, and
+our proposed method achieves state-of-the-art performance in both semantic
+segmentation and depth estimation tasks.
+ 
+
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: updated section II-C ("A-Frame"), updated references +
+
+
+
+
+ + ♻ ☆ Comment-aided Video-Language Alignment via Contrastive Pre-training for + Short-form Video Humor Detection ICMR 2024 + + +
+ The growing importance of multi-modal humor detection within affective +computing correlates with the expanding influence of short-form video sharing +on social media platforms. In this paper, we propose a novel two-branch +hierarchical model for short-form video humor detection (SVHD), named +Comment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal +contrastive pre-training. Notably, our CVLA not only operates on raw signals +across various modal channels but also yields an appropriate multi-modal +representation by aligning the video and language components within a +consistent semantic space. The experimental results on two humor detection +datasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically +outperforms state-of-the-art and several competitive baseline approaches. Our +dataset, code and model release at https://github.com/yliu-cs/CVLA. + +
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ♻ ☆ LadleNet: A Two-Stage UNet for Infrared Image to Visible Image + Translation Guided by Semantic Segmentation + + +
+ The translation of thermal infrared (TIR) images into visible light (VI) +images plays a critical role in enhancing model performance and generalization +capability, particularly in various fields such as registration and fusion of +TIR and VI images. However, current research in this field faces challenges of +insufficiently realistic image quality after translation and the difficulty of +existing models in adapting to unseen scenarios. In order to develop a more +generalizable image translation architecture, we conducted an analysis of +existing translation architectures. By exploring the interpretability of +intermediate modalities in existing translation architectures, we found that +the intermediate modality in the image translation process for street scene +images essentially performs semantic segmentation, distinguishing street images +based on background and foreground patterns before assigning color information. +Based on these principles, we propose an improved algorithm based on U-net +called LadleNet. This network utilizes a two-stage U-net concatenation +structure, consisting of Handle and Bowl modules. The Handle module is +responsible for constructing an abstract semantic space, while the Bowl module +decodes the semantic space to obtain the mapped VI image. Due to the +characteristic of semantic segmentation, the Handle module has strong +extensibility. Therefore, we also propose LadleNet+, which replaces the Handle +module in LadleNet with a pre-trained DeepLabv3+ network, enabling the model to +have a more powerful capability in constructing semantic space. The proposed +methods were trained and tested on the KAIST dataset, followed by quantitative +and qualitative analysis. Compared to existing methods, LadleNet and LadleNet+ +achieved an average improvement of 12.4% and 15.2% in SSIM metrics, and 37.9% +and 50.6% in MS-SSIM metrics, respectively. + +
+
+
+
+
+ + ♻ ☆ A Survey on Open-Vocabulary Detection and Segmentation: Past, Present, + and Future + + +
+ As the most fundamental scene understanding tasks, object detection and +segmentation have made tremendous progress in deep learning era. Due to the +expensive manual labeling cost, the annotated categories in existing datasets +are often small-scale and pre-defined, i.e., state-of-the-art fully-supervised +detectors and segmentors fail to generalize beyond the closed vocabulary. To +resolve this limitation, in the last few years, the community has witnessed an +increasing attention toward Open-Vocabulary Detection (OVD) and Segmentation +(OVS). By ``open-vocabulary'', we mean that the models can classify objects +beyond pre-defined categories. In this survey, we provide a comprehensive +review on recent developments of OVD and OVS. A taxonomy is first developed to +organize different tasks and methodologies. We find that the permission and +usage of weak supervision signals can well discriminate different +methodologies, including: visual-semantic space mapping, novel visual feature +synthesis, region-aware training, pseudo-labeling, knowledge distillation, and +transfer learning. The proposed taxonomy is universal across different tasks, +covering object detection, semantic/instance/panoptic segmentation, 3D and +video understanding. The main design principles, key challenges, development +routes, methodology strengths, and weaknesses are thoroughly analyzed. In +addition, we benchmark each task along with the vital components of each method +in appendix and updated online at +https://github.com/seanzhuh/awesome-open-vocabulary-detection-and-segmentation. +Finally, several promising directions are provided and discussed to stimulate +future research. + +
+
+
+
+
+ + ♻ ☆ Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton + Action Recognition + + +
+ Skeleton-based zero-shot action recognition aims to recognize unknown human +actions based on the learned priors of the known skeleton-based actions and a +semantic descriptor space shared by both known and unknown categories. However, +previous works focus on establishing the bridges between the known skeleton +representation space and semantic descriptions space at the coarse-grained +level for recognizing unknown action categories, ignoring the fine-grained +alignment of these two spaces, resulting in suboptimal performance in +distinguishing high-similarity action categories. To address these challenges, +we propose a novel method via Side information and dual-prompts learning for +skeleton-based zero-shot action recognition (STAR) at the fine-grained level. +Specifically, 1) we decompose the skeleton into several parts based on its +topology structure and introduce the side information concerning multi-part +descriptions of human body movements for alignment between the skeleton and the +semantic space at the fine-grained level; 2) we design the visual-attribute and +semantic-part prompts to improve the intra-class compactness within the +skeleton space and inter-class separability within the semantic space, +respectively, to distinguish the high-similarity actions. Extensive experiments +show that our method achieves state-of-the-art performance in ZSL and GZSL +settings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering + + +
+ Rendering dynamic 3D humans from monocular videos is crucial for various
+applications such as virtual reality and digital entertainment. Most methods
+assume the person is in an unobstructed scene, while in real-life scenarios
+various objects may occlude body parts. Previous methods utilize NeRF-based
+surface rendering to recover the occluded areas, but they require more than one
+day to train and several seconds to render, failing to meet the requirements of
+real-time interactive applications. To address these issues, we propose
+OccGaussian based on 3D Gaussian Splatting, which can be trained within 6
+minutes and produces high-quality human renderings up to 160 FPS with occluded
+input. OccGaussian initializes 3D Gaussian distributions in the canonical
+space, and we perform an occlusion feature query at occluded regions, where the
+aggregated pixel-aligned feature is extracted to compensate for the missing
+information. Then we use a Gaussian Feature MLP to further process the feature
+along with the occlusion-aware loss functions to better perceive the occluded
+area. Extensive experiments on both simulated and real-world occlusions
+demonstrate that our method achieves comparable or even superior performance
+compared to the state-of-the-art method. Moreover, we improve training and
+inference speeds by 250x and 800x, respectively. Our code will be available for
+research purposes.
+ 
+
+
+
+
+
+ + ♻ ☆ ParamISP: Learned Forward and Inverse ISPs using Camera Parameters + + +
+ RAW images are rarely shared mainly due to their excessive data size compared
+to their sRGB counterparts obtained by camera ISPs. Learning the forward and
+inverse processes of camera ISPs has been recently demonstrated, enabling
+physically-meaningful RAW-level image processing on input sRGB images. However,
+existing learning-based ISP methods fail to handle the large variations in the
+ISP processes with respect to camera parameters such as ISO and exposure time,
+and have limitations when used for various applications. In this paper, we
+propose ParamISP, a learning-based method for forward and inverse conversion
+between sRGB and RAW images, that adopts a novel neural-network module to
+utilize camera parameters, which is dubbed ParamNet. Given the camera
+parameters provided in the EXIF data, ParamNet converts them into a feature
+vector to control the ISP networks. Extensive experiments demonstrate that
+ParamISP achieves superior RAW and sRGB reconstruction results compared to
+previous methods and it can be effectively used for a variety of applications
+such as deblurring dataset synthesis, raw deblurring, HDR reconstruction, and
+camera-to-camera transfer.
+ 
+
+
+
+
+
+ + ♻ ☆ Investigating Low Data, Confidence Aware Image Prediction on Smooth + Repetitive Videos using Gaussian Processes + + +
+ The ability to predict future states is crucial to informed decision-making
+while interacting with dynamic environments. With cameras providing a prevalent
+and information-rich sensing modality, the problem of predicting future states
+from image sequences has garnered a lot of attention. Current state-of-the-art
+methods typically train large parametric models for their predictions. Though
+often able to predict accurately, these models often fail to provide
+interpretable confidence metrics for their predictions. Additionally, these
+methods rely on the availability of large training datasets to converge to
+useful solutions. In this paper, we focus on the problem of predicting future
+images of an image sequence with interpretable confidence bounds from very
+little training data. To approach this problem, we use non-parametric models to
+take a probabilistic approach to image prediction. We generate probability
+distributions over sequentially predicted images, and propagate uncertainty
+through time to generate a confidence metric for our predictions. Gaussian
+Processes are used for their data efficiency and ability to readily incorporate
+new training data online. Our method's predictions are evaluated in a smooth
+fluid simulation environment. We showcase the capabilities of our approach on
+real-world data by predicting pedestrian flows and weather patterns from
+satellite imagery.
+ 
+
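+ 
+ A minimal sketch of the non-parametric idea: fit a Gaussian Process per pixel on a short
+training sequence and predict the next frame together with a predictive standard deviation
+as the confidence signal. The per-pixel independence and the kernel choice are
+simplifications, not the authors' exact model:
+ 
+ import numpy as np
+ from sklearn.gaussian_process import GaussianProcessRegressor
+ from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+ 
+ T, H, W = 30, 8, 8                                   # toy smooth, repetitive "video"
+ t = np.arange(T, dtype=float)[:, None]
+ frames = np.sin(0.3 * t + np.linspace(0, 1, H * W)[None, :])   # (T, H*W)
+ 
+ gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), normalize_y=True)
+ mean_next, std_next = np.empty(H * W), np.empty(H * W)
+ for p in range(H * W):                               # one independent GP per pixel
+     gp.fit(t, frames[:, p])
+     m, s = gp.predict(np.array([[float(T)]]), return_std=True)
+     mean_next[p], std_next[p] = m[0], s[0]
+ 
+ predicted_frame = mean_next.reshape(H, W)
+ confidence_map = std_next.reshape(H, W)              # larger std = lower confidence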
+
+
+
+
+ + ♻ ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
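+
+ The sketch below illustrates the un-quantized-transformer idea on random
+data: non-overlapping patches are encoded into continuous tokens that feed the
+transformer, while nearest-codebook indices are used only as prediction
+targets. The patch size, embedding width, and codebook size are illustrative
+choices, not PUT's actual P-VQVAE configuration.
+<pre><code>
+import torch
+import torch.nn as nn
+
+B, C, H, W, P, D, K = 1, 3, 32, 32, 8, 128, 512        # K = codebook size
+img = torch.randn(B, C, H, W)
+
+# Non-overlapping patch tokens (a stand-in for the patch-based encoder).
+patches = img.unfold(2, P, P).unfold(3, P, P)          # B, C, 4, 4, P, P
+patches = patches.permute(0, 2, 3, 1, 4, 5).reshape(B, -1, C * P * P)
+tokens = nn.Linear(C * P * P, D)(patches)              # B, 16, D (continuous)
+
+# Quantization defines the prediction targets only, not the transformer input.
+codebook = nn.Embedding(K, D)
+targets = torch.cdist(tokens, codebook.weight.unsqueeze(0)).argmin(-1)  # B, 16
+
+layer = nn.TransformerEncoderLayer(d_model=D, nhead=8, batch_first=True)
+transformer = nn.TransformerEncoder(layer, num_layers=2)
+logits = nn.Linear(D, K)(transformer(tokens))          # predict token indices
+loss = nn.functional.cross_entropy(logits.reshape(-1, K), targets.reshape(-1))
+print(loss.item())
+</code></pre>
+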
+
+ comment: Accepted by TPAMI (2024). arXiv admin note: text overlap with + arXiv:2205.05076 +
+
+
+
+
+ + ♻ ☆ Are Bias Mitigation Techniques for Deep Learning Effective? WACV 2022 + + +
+ A critical problem in deep learning is that systems learn inappropriate +biases, resulting in their inability to perform well on minority groups. This +has led to the creation of multiple algorithms that endeavor to mitigate bias. +However, it is not clear how effective these methods are. This is because study +protocols differ among papers, systems are tested on datasets that fail to test +many forms of bias, and systems have access to hidden knowledge or are tuned +specifically to the test set. To address this, we introduce an improved +evaluation protocol, sensible metrics, and a new dataset, which enables us to +ask and answer critical questions about bias mitigation algorithms. We evaluate +seven state-of-the-art algorithms using the same network architecture and +hyperparameter selection policy across three benchmark datasets. We introduce a +new dataset called Biased MNIST that enables assessment of robustness to +multiple bias sources. We use Biased MNIST and a visual question answering +(VQA) benchmark to assess robustness to hidden biases. Rather than only tuning +to the test set distribution, we study robustness across different tuning +distributions, which is critical because for many applications the test +distribution may not be known during development. We find that algorithms +exploit hidden biases, are unable to scale to multiple forms of bias, and are +highly sensitive to the choice of tuning set. Based on our findings, we implore +the community to adopt more rigorous assessment of future bias mitigation +methods. All data, code, and results are publicly available at: +https://github.com/erobic/bias-mitigators. + +
+
+ comment: WACV 2022 +
+
+
+
+
+ + ♻ ☆ Direct May Not Be the Best: An Incremental Evolution View of Pose + Generation AAAI2024 + + +
+ Pose diversity is an inherent representative characteristic of 2D images. Due
+to the 3D-to-2D projection mechanism, there is evident content discrepancy
+among images of distinct poses, which is the main obstacle for
+pose-transformation-related research. To deal with this challenge, we propose a
+fine-grained, incremental-evolution-centered pose generation framework, rather
+than the traditional direct one-to-one generation. Since the proposed approach
+bypasses the theoretical difficulty of directly modeling dramatic non-linear
+variation, the incurred content distortion and blurring can be effectively
+constrained, while the various individual pose details, especially clothes
+texture, can be precisely maintained. In order to systematically guide the
+evolution course, both global and incremental evolution constraints are
+elaborately designed and merged into the overall framework, and a novel
+triple-path knowledge fusion structure is designed to take full advantage of
+all available valuable knowledge to conduct high-quality pose synthesis. In
+addition, our framework can generate a series of valuable byproducts, namely
+the various intermediate poses. Extensive experiments have been conducted to
+verify the effectiveness of the proposed approach. Code is available at
+https://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.
+ 

+
+ comment: Accepted at AAAI2024 +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based approaches for low-level vision
+tasks have achieved widespread success. Unlike CNN-based models, Transformers
+are more adept at capturing long-range dependencies, enabling the
+reconstruction of images utilizing non-local information. In the domain of
+super-resolution, Swin-transformer-based models have become mainstream due to
+their capability of global spatial information modeling and their
+shifting-window attention mechanism that facilitates the interchange of
+information between different windows. Many researchers have enhanced model
+performance by expanding the receptive fields or designing meticulous networks,
+yielding commendable results. However, we observed that it is a general
+phenomenon for the feature map intensity to be abruptly suppressed to small
+values towards the network's end. This implies an information bottleneck and a
+diminishment of spatial information, implicitly limiting the model's potential.
+To address this, we propose the Dense-residual-connected Transformer (DRCT),
+aimed at mitigating the loss of spatial information and stabilizing the
+information flow through dense-residual connections between layers, thereby
+unleashing the model's potential and saving the model from the information
+bottleneck. Experimental results indicate that our approach surpasses
+state-of-the-art methods on benchmark datasets and performs commendably at the
+NTIRE-2024 Image Super-Resolution (x4) Challenge. Our source code is available
+at https://github.com/ming053l/DRCT.
+ 

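+
+ The following minimal sketch shows the dense-residual connection pattern on a
+convolutional stand-in (DRCT itself builds on Swin-Transformer blocks): every
+layer sees the concatenation of all earlier features, and the block output is
+added back to its input. The channel counts are illustrative.
+<pre><code>
+import torch
+import torch.nn as nn
+
+class DenseResidualBlock(nn.Module):
+    def __init__(self, channels=64, growth=32, n_layers=4):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            nn.Sequential(
+                nn.Conv2d(channels + i * growth, growth, 3, padding=1),
+                nn.GELU(),
+            )
+            for i in range(n_layers)
+        )
+        self.fuse = nn.Conv2d(channels + n_layers * growth, channels, 1)
+
+    def forward(self, x):
+        feats = [x]
+        for layer in self.layers:
+            feats.append(layer(torch.cat(feats, dim=1)))   # dense connections
+        return x + self.fuse(torch.cat(feats, dim=1))       # residual connection
+
+x = torch.randn(1, 64, 48, 48)
+print(DenseResidualBlock()(x).shape)  # torch.Size([1, 64, 48, 48])
+</code></pre>
+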
+
+ comment: Camera-ready version, NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
+ + ♻ ☆ CADS: Unleashing the Diversity of Diffusion Models through + Condition-Annealed Sampling ICLR 2024 + + +
+ While conditional diffusion models are known to have good coverage of the +data distribution, they still face limitations in output diversity, +particularly when sampled with a high classifier-free guidance scale for +optimal image quality or when trained on small datasets. We attribute this +problem to the role of the conditioning signal in inference and offer an +improved sampling strategy for diffusion models that can increase generation +diversity, especially at high guidance scales, with minimal loss of sample +quality. Our sampling strategy anneals the conditioning signal by adding +scheduled, monotonically decreasing Gaussian noise to the conditioning vector +during inference to balance diversity and condition alignment. Our +Condition-Annealed Diffusion Sampler (CADS) can be used with any pretrained +model and sampling algorithm, and we show that it boosts the diversity of +diffusion models in various conditional generation tasks. Further, using an +existing pretrained diffusion model, CADS achieves a new state-of-the-art FID +of 1.70 and 2.31 for class-conditional ImageNet generation at 256$\times$256 +and 512$\times$512 respectively. + +
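+
+ A simplified sketch of condition annealing follows: scheduled, monotonically
+decreasing Gaussian noise is mixed into the conditioning vector as sampling
+proceeds. The piecewise-linear schedule and the omission of CADS's rescaling
+step are simplifications, not the exact published procedure.
+<pre><code>
+import torch
+
+def anneal_condition(cond, t, t_min=0.2, t_max=0.8, noise_scale=0.1):
+    """Mix scheduled Gaussian noise into the conditioning vector.
+
+    t runs from 1 at the start of sampling (condition mostly noised, more
+    diversity) down to 0 at the end (clean condition, full alignment).
+    """
+    if t >= t_max:
+        gamma = 0.0
+    elif t > t_min:
+        gamma = (t_max - t) / (t_max - t_min)
+    else:
+        gamma = 1.0
+    return gamma ** 0.5 * cond + noise_scale * (1 - gamma) ** 0.5 * torch.randn_like(cond)
+
+cond = torch.randn(1, 768)                    # e.g. a class or text embedding
+for step, t in enumerate(torch.linspace(1.0, 0.0, 5)):
+    c_t = anneal_condition(cond, float(t))
+    # eps = model(x_t, t, c_t)  ... plug c_t into any pretrained sampler here
+    print(step, round(float(t), 2), round(c_t.norm().item(), 3))
+</code></pre>
+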
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DistriFusion: Distributed Parallel Inference for High-Resolution + Diffusion Models CVPR 2024 + + +
+ Diffusion models have achieved great success in synthesizing high-quality +images. However, generating high-resolution images with diffusion models is +still challenging due to the enormous computational costs, resulting in a +prohibitive latency for interactive applications. In this paper, we propose +DistriFusion to tackle this problem by leveraging parallelism across multiple +GPUs. Our method splits the model input into multiple patches and assigns each +patch to a GPU. However, naively implementing such an algorithm breaks the +interaction between patches and loses fidelity, while incorporating such an +interaction will incur tremendous communication overhead. To overcome this +dilemma, we observe the high similarity between the input from adjacent +diffusion steps and propose displaced patch parallelism, which takes advantage +of the sequential nature of the diffusion process by reusing the pre-computed +feature maps from the previous timestep to provide context for the current +step. Therefore, our method supports asynchronous communication, which can be +pipelined by computation. Extensive experiments show that our method can be +applied to recent Stable Diffusion XL with no quality degradation and achieve +up to a 6.1$\times$ speedup on eight NVIDIA A100s compared to one. Our code is +publicly available at https://github.com/mit-han-lab/distrifuser. + +
+
+ comment: CVPR 2024 Highlight Code: https://github.com/mit-han-lab/distrifuser + Website: https://hanlab.mit.edu/projects/distrifusion Blog: + https://hanlab.mit.edu/blog/distrifusion +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Layout Transformer for Content-Aware Layout + Generation CVPR 2024 + + +
+ Content-aware graphic layout generation aims to automatically arrange visual +elements along with a given content, such as an e-commerce product image. In +this paper, we argue that the current layout generation approaches suffer from +the limited training data for the high-dimensional layout structure. We show +that a simple retrieval augmentation can significantly improve the generation +quality. Our model, which is named Retrieval-Augmented Layout Transformer +(RALF), retrieves nearest neighbor layout examples based on an input image and +feeds these results into an autoregressive generator. Our model can apply +retrieval augmentation to various controllable generation tasks and yield +high-quality layouts within a unified architecture. Our extensive experiments +show that RALF successfully generates content-aware layouts in both constrained +and unconstrained settings and significantly outperforms the baselines. + +
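+
+ The toy sketch below shows the retrieval-augmentation step in isolation:
+nearest-neighbour layouts are looked up by cosine similarity of image features
+and handed to a hypothetical autoregressive generator as extra context. The
+feature database, similarity measure, and generator interface are
+placeholders, not RALF's actual components.
+<pre><code>
+import numpy as np
+
+rng = np.random.default_rng(0)
+db_feats = rng.normal(size=(1000, 256))             # features of database images
+db_layouts = rng.integers(0, 100, size=(1000, 20))  # their tokenized layouts
+
+def retrieve(query_feat, k=3):
+    sims = db_feats @ query_feat / (
+        np.linalg.norm(db_feats, axis=1) * np.linalg.norm(query_feat) + 1e-8)
+    idx = np.argsort(-sims)[:k]
+    return db_layouts[idx]                           # k nearest-neighbour layouts
+
+query = rng.normal(size=256)                         # feature of the input image
+neighbours = retrieve(query)
+# An autoregressive generator would now condition on the image feature plus the
+# retrieved layout tokens, e.g. generator(image=query, context=neighbours).
+print(neighbours.shape)                              # (3, 20)
+</code></pre>
+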
+
+ comment: Accepted to CVPR 2024 (Oral), Project website: + https://udonda.github.io/RALF/ , GitHub: + https://github.com/CyberAgentAILab/RALF +
+
+
+
+
+ + ♻ ☆ A design of Convolutional Neural Network model for the Diagnosis of the + COVID-19 + + +
+ With the spread of COVID-19 around the globe over the past year, the use of
+artificial intelligence (AI) algorithms and image processing methods to analyze
+chest X-ray images of patients with COVID-19 has become essential. Recognizing
+the COVID-19 virus in the lung area of a patient is one of the basic and
+essential needs of clinical centers and hospitals. Most research in this field
+has been devoted to papers based on deep learning methods utilizing CNNs
+(Convolutional Neural Networks), which mainly deal with the screening of sick
+and healthy people. In this study, a new 19-layer CNN structure is recommended
+for accurate recognition of COVID-19 from chest X-ray images. The offered CNN
+is developed to serve as a precise diagnosis system for a three-class (viral
+pneumonia, Normal, COVID) and a four-class classification (Lung opacity,
+Normal, COVID-19, and pneumonia). A comparison is conducted between the
+outcomes of the offered procedure and some popular pretrained networks,
+including Inception, AlexNet, ResNet50, SqueezeNet, and VGG19, based on
+Specificity, Accuracy, Precision, Sensitivity, Confusion Matrix, and F1-score.
+The experimental results of the offered CNN method show its dominance over the
+existing published procedures. This method can be a useful tool for clinicians
+in making proper decisions about COVID-19.
+ 

+
+ comment: Important mistakes. Also, another author has contributed some to the + revised version. So it is not appropriate for it to be with only my name +
+
+
+
+
+ + ♻ ☆ Vision Transformer Computation and Resilience for Dynamic Inference + + +
+ State-of-the-art deep learning models for computer vision tasks are based on
+the transformer architecture and are often deployed in real-time applications.
+In this scenario, the resources available for every inference can vary, so it
+is useful to be able to dynamically adapt execution to trade accuracy for
+efficiency. To create dynamic models, we leverage the resilience of vision
+transformers to pruning and switch between different scaled versions of a
+model. Surprisingly, we find that most FLOPs are generated by convolutions, not
+attention. These relative FLOP counts are not a good predictor of GPU
+performance since GPUs have special optimizations for convolutions. Some models
+are fairly resilient and their model execution can be adapted without
+retraining, while all models achieve better accuracy when their alternative
+execution paths are retrained. These insights mean that we can leverage CNN
+accelerators and these alternative execution paths to enable efficient and
+dynamic vision transformer inference. Our analysis shows that leveraging this
+type of dynamic execution can lead to saving 28\% of energy with a 1.4\%
+accuracy drop for SegFormer (63 GFLOPs), with no additional training, and 53\%
+of energy for ResNet-50 (4 GFLOPs) with a 3.3\% accuracy drop by switching
+between pretrained Once-For-All models.
+ 

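+
+ A minimal sketch of the dynamic-execution idea follows: given a set of
+pre-scaled model variants, pick the most accurate one that fits the current
+compute budget. The GFLOP and accuracy numbers are placeholders, not
+measurements from the paper.
+<pre><code>
+from dataclasses import dataclass
+
+@dataclass
+class Variant:
+    name: str
+    gflops: float
+    accuracy: float   # placeholder validation accuracy
+
+variants = [
+    Variant("full", 63.0, 0.81),
+    Variant("pruned-medium", 45.0, 0.80),
+    Variant("pruned-small", 30.0, 0.78),
+]
+
+def pick_variant(budget_gflops):
+    """Pick the most accurate variant that fits the current compute budget."""
+    feasible = [v for v in variants if v.gflops &lt;= budget_gflops]
+    if not feasible:
+        return min(variants, key=lambda v: v.gflops)   # fall back to cheapest
+    return max(feasible, key=lambda v: v.accuracy)
+
+for budget in (70, 50, 20):
+    print(budget, "selects", pick_variant(budget).name)
+</code></pre>
+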
+
+
+
+
+ + ♻ ☆ Distilling Vision-Language Models on Millions of Videos CVPR 2024 + + +
+ The recent advance in vision-language models is largely attributed to the +abundance of image-text data. We aim to replicate this success for +video-language models, but there simply is not enough human-curated video-text +data available. We thus resort to fine-tuning a video-language model from a +strong image-language baseline with synthesized instructional data. The +resulting video model by video-instruction-tuning (VIIT) is then used to +auto-label millions of videos to generate high-quality captions. We show the +adapted video-language model performs well on a wide range of video-language +benchmarks. For instance, it surpasses the best prior result on open-ended +NExT-QA by 2.8%. Besides, our model generates detailed descriptions for +previously unseen videos, which provide better textual supervision than +existing methods. Experiments show that a video-language dual-encoder model +contrastively trained on these auto-generated captions is 3.8% better than the +strongest baseline that also leverages vision-language models. Our best model +outperforms state-of-the-art methods on MSR-VTT zero-shot text-to-video +retrieval by 6%. As a side product, we generate the largest video caption +dataset to date. + +
+
+ comment: CVPR 2024. Project page: + https://zhaoyue-zephyrus.github.io/video-instruction-tuning +
+
+
+
+
+ + ♻ ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: Accepted for publication at IEEE Transactions on Cognitive and + Developmental Systems (IEEE TCDS), 18 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ nnMobileNet: Rethinking CNN for Retinopathy Research CVPR + + +
+ Over the past few decades, convolutional neural networks (CNNs) have been at +the forefront of the detection and tracking of various retinal diseases (RD). +Despite their success, the emergence of vision transformers (ViT) in the 2020s +has shifted the trajectory of RD model development. The leading-edge +performance of ViT-based models in RD can be largely credited to their +scalability-their ability to improve as more parameters are added. As a result, +ViT-based models tend to outshine traditional CNNs in RD applications, albeit +at the cost of increased data and computational demands. ViTs also differ from +CNNs in their approach to processing images, working with patches rather than +local regions, which can complicate the precise localization of small, variably +presented lesions in RD. In our study, we revisited and updated the +architecture of a CNN model, specifically MobileNet, to enhance its utility in +RD diagnostics. We found that an optimized MobileNet, through selective +modifications, can surpass ViT-based models in various RD benchmarks, including +diabetic retinopathy grading, detection of multiple fundus diseases, and +classification of diabetic macular edema. The code is available at +https://github.com/Retinal-Research/NN-MOBILENET + +
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ Visual Grounding Methods for VQA are Working for the Wrong Reasons! ACL 2020 + + +
+ Existing Visual Question Answering (VQA) methods tend to exploit dataset +biases and spurious statistical correlations, instead of producing right +answers for the right reasons. To address this issue, recent bias mitigation +methods for VQA propose to incorporate visual cues (e.g., human attention maps) +to better ground the VQA models, showcasing impressive gains. However, we show +that the performance improvements are not a result of improved visual +grounding, but a regularization effect which prevents over-fitting to +linguistic priors. For instance, we find that it is not actually necessary to +provide proper, human-based cues; random, insensible cues also result in +similar improvements. Based on this observation, we propose a simpler +regularization scheme that does not require any external annotations and yet +achieves near state-of-the-art performance on VQA-CPv2. + +
+
+ comment: ACL 2020 +
+
+
+
+
+ + ♻ ☆ SAWEC: Sensing-Assisted Wireless Edge Computing + + +
+ Emerging mobile virtual reality (VR) systems will need to continuously
+perform complex computer vision tasks on ultra-high-resolution video frames
+through the execution of deep neural network (DNN)-based algorithms. Since
+state-of-the-art DNNs require computational power that is excessive for mobile
+devices, techniques based on wireless edge computing (WEC) have been recently
+proposed. However, existing WEC methods require the transmission and processing
+of a large amount of video data, which may ultimately saturate the wireless
+link. In this paper, we propose a novel Sensing-Assisted Wireless Edge
+Computing (SAWEC) paradigm to address this issue. SAWEC leverages knowledge
+about the physical environment to reduce the end-to-end latency and overall
+computational burden by transmitting to the edge server only the data relevant
+for the delivery of the service. Our intuition is that the transmission of the
+portion of the video frames where there are no changes with respect to previous
+frames can be avoided. Specifically, we leverage wireless sensing techniques to
+estimate the location of objects in the environment and obtain insights about
+the environment dynamics. Hence, only the part of the frames where any
+environmental change is detected is transmitted and processed. We evaluated
+SAWEC by using a 10K 360$^{\circ}$ camera with a Wi-Fi 6 sensing system
+operating at 160 MHz and performing localization and tracking. We considered
+instance segmentation and object detection as benchmarking tasks for
+performance evaluation. We carried out experiments in an anechoic chamber and
+an entrance hall with two human subjects in six different setups. Experimental
+results show that SAWEC reduces both the channel occupation and end-to-end
+latency by more than 90% while improving the instance segmentation and object
+detection performance with respect to state-of-the-art WEC approaches.
+ 

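+
+ Conceptually, the transmission saving can be sketched as below: a bounding box
+(which SAWEC would obtain from Wi-Fi sensing rather than the hard-coded values
+used here) selects the only region of the frame that is sent to the edge
+server.
+<pre><code>
+import numpy as np
+
+H, W = 1080, 1920
+frame = np.random.randint(0, 256, (H, W, 3), dtype=np.uint8)
+
+# Pretend wireless sensing localized activity inside this bounding box.
+x0, y0, x1, y1 = 600, 200, 900, 500
+
+roi = frame[y0:y1, x0:x1]                     # only this crop is transmitted
+full_bytes, roi_bytes = frame.nbytes, roi.nbytes
+print(f"transmitted {roi_bytes / full_bytes:.1%} of the frame")
+# The edge server runs detection/segmentation on the crop and maps results
+# back to full-frame coordinates using (x0, y0).
+</code></pre>
+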
+
+ comment: Submitted to ACM for possible publication +
+
+
+
+
+ + ♻ ☆ The Bias of Harmful Label Associations in Vision-Language Models + + +
+ Despite the remarkable performance of foundation vision-language models, the +shared representation space for text and vision can also encode harmful label +associations detrimental to fairness. While prior work has uncovered bias in +vision-language models' (VLMs) classification performance across geography, +work has been limited along the important axis of harmful label associations +due to a lack of rich, labeled data. In this work, we investigate harmful label +associations in the recently released Casual Conversations datasets containing +more than 70,000 videos. We study bias in the frequency of harmful label +associations across self-provided labels for age, gender, apparent skin tone, +and physical adornments across several leading VLMs. We find that VLMs are +$4-7$x more likely to harmfully classify individuals with darker skin tones. We +also find scaling transformer encoder model size leads to higher confidence in +harmful predictions. Finally, we find improvements on standard vision tasks +across VLMs does not address disparities in harmful label associations. + +
+
+
+
+
+ + ♻ ☆ Equivariant Multi-Modality Image Fusion CVPR 2024 + + +
+ Multi-modality image fusion is a technique that combines information from +different sensors or modalities, enabling the fused image to retain +complementary features from each modality, such as functional highlights and +texture details. However, effective training of such fusion models is +challenging due to the scarcity of ground truth fusion data. To tackle this +issue, we propose the Equivariant Multi-Modality imAge fusion (EMMA) paradigm +for end-to-end self-supervised learning. Our approach is rooted in the prior +knowledge that natural imaging responses are equivariant to certain +transformations. Consequently, we introduce a novel training paradigm that +encompasses a fusion module, a pseudo-sensing module, and an equivariant fusion +module. These components enable the net training to follow the principles of +the natural sensing-imaging process while satisfying the equivariant imaging +prior. Extensive experiments confirm that EMMA yields high-quality fusion +results for infrared-visible and medical images, concurrently facilitating +downstream multi-modal segmentation and detection tasks. The code is available +at https://github.com/Zhaozixiang1228/MMIF-EMMA. + +
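+
+ The hedged sketch below illustrates an equivariance-style self-supervised loss
+in the spirit of the paradigm described above: fusing transformed inputs should
+agree with transforming the fused output. The tiny fusion network and the
+choice of a 90-degree rotation as the transformation are stand-ins, not EMMA's
+actual modules.
+<pre><code>
+import torch
+import torch.nn as nn
+
+fuse = nn.Sequential(nn.Conv2d(2, 16, 3, padding=1), nn.ReLU(),
+                     nn.Conv2d(16, 1, 3, padding=1))
+
+def transform(x):
+    # A simple group action standing in for the imaging transformations.
+    return torch.rot90(x, k=1, dims=(-2, -1))
+
+ir = torch.randn(1, 1, 64, 64)    # infrared modality
+vis = torch.randn(1, 1, 64, 64)   # visible modality
+
+fused = fuse(torch.cat([ir, vis], dim=1))
+fused_of_transformed = fuse(torch.cat([transform(ir), transform(vis)], dim=1))
+equivariance_loss = nn.functional.mse_loss(fused_of_transformed, transform(fused))
+print(equivariance_loss.item())
+</code></pre>
+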
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ Clustering-based Image-Text Graph Matching for Domain Generalization + + +
+ Learning domain-invariant visual representations is important to train a
+model that can generalize well to unseen target task domains. Recent works
+demonstrate that text descriptions contain high-level class-discriminative
+information, and such auxiliary semantic cues can be used as effective pivot
+embeddings for the domain generalization problem. However, they use pivot
+embeddings in a global manner (i.e., aligning an image embedding with a
+sentence-level text embedding), not fully utilizing the semantic cues of the
+given text description. In this work, we advocate for the use of local
+alignment between image regions and corresponding textual descriptions. To this
+end, we first represent image and text inputs with graphs. We subsequently
+cluster nodes in those graphs and match the graph-based image node features to
+the textual graphs. This matching process is conducted globally and locally,
+tightly aligning visual and textual semantic sub-structures. We experiment with
+large-scale public datasets, such as CUB-DG and DomainBed, and our model
+achieves matched or better state-of-the-art performance on these datasets. Our
+code will be publicly available upon publication.
+ 

+
+
+
+
+ + ♻ ☆ Strategies to Improve Real-World Applicability of Laparoscopic Anatomy + Segmentation Models CVPR 2024 + + +
+ Accurate identification and localization of anatomical structures of varying +size and appearance in laparoscopic imaging are necessary to leverage the +potential of computer vision techniques for surgical decision support. +Segmentation performance of such models is traditionally reported using metrics +of overlap such as IoU. However, imbalanced and unrealistic representation of +classes in the training data and suboptimal selection of reported metrics have +the potential to skew nominal segmentation performance and thereby ultimately +limit clinical translation. In this work, we systematically analyze the impact +of class characteristics (i.e., organ size differences), training and test data +composition (i.e., representation of positive and negative examples), and +modeling parameters (i.e., foreground-to-background class weight) on eight +segmentation metrics: accuracy, precision, recall, IoU, F1 score (Dice +Similarity Coefficient), specificity, Hausdorff Distance, and Average Symmetric +Surface Distance. Our findings support two adjustments to account for data +biases in surgical data science: First, training on datasets that are similar +to the clinical real-world scenarios in terms of class distribution, and +second, class weight adjustments to optimize segmentation model performance +with regard to metrics of particular relevance in the respective clinical +setting. + +
+
+ comment: 14 pages, 5 figures, 4 tables; accepted for the workshop "Data + Curation and Augmentation in Medical Imaging" at CVPR 2024 (archival track) +
+
+
+
+
+ + ♻ ☆ LLM-driven Multimodal Target Volume Contouring in Radiation Oncology + + +
+ Target volume contouring for radiation therapy is considered significantly
+more challenging than normal organ segmentation tasks, as it necessitates the
+utilization of both image and text-based clinical information. Inspired by the
+recent advancement of large language models (LLMs) that can facilitate the
+integration of textual information and images, here we present a novel
+LLM-driven multimodal AI, namely LLMSeg, that utilizes clinical text
+information and is applicable to the challenging task of target volume
+contouring for radiation therapy; we validate it within the context of breast
+cancer radiation therapy target volume contouring. Using external validation
+and data-insufficient environments, attributes that are highly relevant to
+real-world applications, we demonstrate that the proposed model exhibits
+markedly improved performance compared to conventional unimodal AI models,
+particularly robust generalization performance and data efficiency. To the best
+of our knowledge, this is the first LLM-driven multimodal AI model that
+integrates clinical text information into target volume delineation for
+radiation oncology.
+ 

+
+
+
+
+ + ♻ ☆ SCott: Accelerating Diffusion Models with Stochastic Consistency + Distillation + + +
+ The iterative sampling procedure employed by diffusion models (DMs) often
+leads to significant inference latency. To address this, we propose Stochastic
+Consistency Distillation (SCott) to enable accelerated text-to-image
+generation, where high-quality generations can be achieved with just 1-2
+sampling steps, and further improvements can be obtained by adding additional
+steps. In contrast to vanilla consistency distillation (CD), which distills the
+ordinary differential equation solver-based sampling process of a pretrained
+teacher model into a student, SCott explores the possibility and validates the
+efficacy of integrating stochastic differential equation (SDE) solvers into CD
+to fully unleash the potential of the teacher. SCott is augmented with
+elaborate strategies to control the noise strength and sampling process of the
+SDE solver. An adversarial loss is further incorporated to strengthen the
+sample quality under very few sampling steps. Empirically, on the MSCOCO-2017
+5K dataset with a Stable Diffusion-V1.5 teacher, SCott achieves an FID (Frechet
+Inception Distance) of 22.1, surpassing that (23.4) of the 1-step InstaFlow
+(Liu et al., 2023) and matching that of 4-step UFOGen (Xue et al., 2023b).
+Moreover, SCott can yield more diverse samples than other consistency models
+for high-resolution image generation (Luo et al., 2023a), with up to 16%
+improvement in a qualified metric. The code and checkpoints are coming soon.
+ 

+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved +unprecedented generalizability. However, is it truly necessary to involve such +vast amounts of data in pre-training, consuming extensive computational +resources? This paper introduces data-effective learning, aiming to use data in +the most impactful way to pre-train foundation models. This involves strategies +that focus on data quality rather than quantity, ensuring the data used for +training has high informational value. Data-effective learning plays a profound +role in accelerating foundation model training, reducing computational costs, +and saving data storage, which is very important as the volume of medical data +in recent years has grown beyond many people's expectations. However, due to +the lack of standards and comprehensive benchmarks, research on medical +data-effective learning is poorly studied. To address this gap, our paper +introduces a comprehensive benchmark specifically for evaluating data-effective +learning in the medical field. This benchmark includes a dataset with millions +of data samples from 31 medical centers (DataDEL), a baseline method for +comparison (MedDEL), and a new evaluation metric (NormDEL) to objectively +measure data-effective learning performance. Our extensive experimental results +show the baseline MedDEL can achieve performance comparable to the original +large dataset with only 5% of the data. Establishing such an open +data-effective learning benchmark is crucial for the medical foundation model +research community because it facilitates efficient data use, promotes +collaborative breakthroughs, and fosters the development of cost-effective, +scalable, and impactful healthcare solutions. + +
+
+
+
+
+ + ♻ ☆ BAA-NGP: Bundle-Adjusting Accelerated Neural Graphics Primitives + + +
+ Implicit neural representations have become pivotal in robotic perception, +enabling robots to comprehend 3D environments from 2D images. Given a set of +camera poses and associated images, the models can be trained to synthesize +novel, unseen views. To successfully navigate and interact in dynamic settings, +robots require the understanding of their spatial surroundings driven by +unassisted reconstruction of 3D scenes and camera poses from real-time video +footage. Existing approaches like COLMAP and bundle-adjusting neural radiance +field methods take hours to days to process due to the high computational +demands of feature matching, dense point sampling, and training of a +multi-layer perceptron structure with a large number of parameters. To address +these challenges, we propose a framework called bundle-adjusting accelerated +neural graphics primitives (BAA-NGP) which leverages accelerated sampling and +hash encoding to expedite automatic pose refinement/estimation and 3D scene +reconstruction. Experimental results demonstrate 10 to 20 x speed improvement +compared to other bundle-adjusting neural radiance field methods without +sacrificing the quality of pose estimation. The github repository can be found +here https://github.com/IntelLabs/baa-ngp. + +
+
+
+
+
+ + ♻ ☆ Backdoor Federated Learning by Poisoning Backdoor-Critical Layers ICLR'24 + + +
+ Federated learning (FL) has been widely deployed to enable machine learning +training on sensitive data across distributed devices. However, the +decentralized learning paradigm and heterogeneity of FL further extend the +attack surface for backdoor attacks. Existing FL attack and defense +methodologies typically focus on the whole model. None of them recognizes the +existence of backdoor-critical (BC) layers-a small subset of layers that +dominate the model vulnerabilities. Attacking the BC layers achieves equivalent +effects as attacking the whole model but at a far smaller chance of being +detected by state-of-the-art (SOTA) defenses. This paper proposes a general +in-situ approach that identifies and verifies BC layers from the perspective of +attackers. Based on the identified BC layers, we carefully craft a new backdoor +attack methodology that adaptively seeks a fundamental balance between +attacking effects and stealthiness under various defense strategies. Extensive +experiments show that our BC layer-aware backdoor attacks can successfully +backdoor FL under seven SOTA defenses with only 10% malicious clients and +outperform the latest backdoor attack methods. + +
+
+ comment: Accepted to ICLR'24 +
+
+
+
+
+ + ♻ ☆ Post-processing of coronary and myocardial spatial data + + +
+ Numerical simulations of real-world phenomena are implemented with at least
+two parts: the computational scheme and the computational domain. In the
+context of hemodynamics, the computational domain of a simulation represents
+the blood vessel network through which blood flows. Such blood vessel networks
+can contain millions of individual vessels that are joined together in series
+and in parallel to form the network. It is computationally unfeasible to
+explicitly simulate blood flow in all blood vessels. Here, from imaged data of
+a single porcine left coronary arterial tree, we develop a data pipeline to
+obtain computational domains for hemodynamic simulations from a graph
+representing the coronary vascular tree. Further, we develop a method to
+ascertain which subregions of the left ventricle are most likely to be perfused
+via a given artery, using a comparison with the American Heart Association
+division of the left ventricle as a sense check.
+ 

+
+ comment: 21 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Transfer Learning for Cross-dataset Isolated Sign Language Recognition + in Under-Resourced Datasets + + +
+ Sign language recognition (SLR) has recently achieved a breakthrough in +performance thanks to deep neural networks trained on large annotated sign +datasets. Of the many different sign languages, these annotated datasets are +only available for a select few. Since acquiring gloss-level labels on sign +language videos is difficult, learning by transferring knowledge from existing +annotated sources is useful for recognition in under-resourced sign languages. +This study provides a publicly available cross-dataset transfer learning +benchmark from two existing public Turkish SLR datasets. We use a temporal +graph convolution-based sign language recognition approach to evaluate five +supervised transfer learning approaches and experiment with closed-set and +partial-set cross-dataset transfer learning. Experiments demonstrate that +improvement over finetuning based transfer learning is possible with +specialized supervised transfer learning methods. + +
+
+ comment: Accepted to The 18th IEEE International Conference on Automatic Face + and Gesture Recognition 2024, Code available in + https://github.com/alpk/tid-supervised-transfer-learning-dataset +
+
+
+
+
+ + ♻ ☆ MoDA: Leveraging Motion Priors from Videos for Advancing Unsupervised + Domain Adaptation in Semantic Segmentation CVPR 2024 + + +
+ Unsupervised domain adaptation (UDA) has been a potent technique to handle
+the lack of annotations in the target domain, particularly in the semantic
+segmentation task. This study introduces a different UDA scenario where the
+target domain contains unlabeled video frames. Drawing upon recent advancements
+in self-supervised learning of object motion from unlabeled videos with
+geometric constraints, we design a \textbf{Mo}tion-guided \textbf{D}omain
+\textbf{A}daptive semantic segmentation framework (MoDA). MoDA harnesses the
+self-supervised object motion cues to facilitate cross-domain alignment for the
+segmentation task. First, we present an object discovery module to localize and
+segment target moving objects using object motion information. Then, we propose
+a semantic mining module that takes the object masks to refine the pseudo
+labels in the target domain. Subsequently, these high-quality pseudo labels are
+used in the self-training loop to bridge the cross-domain gap. On domain
+adaptive video and image segmentation experiments, MoDA shows the effectiveness
+of utilizing object motion as guidance for domain alignment compared with
+optical flow information. Moreover, MoDA exhibits versatility as it can
+complement existing state-of-the-art UDA approaches. Code is available at
+https://github.com/feipanir/MoDA.
+ 

+
+ comment: CVPR 2024 Workshop on Learning with Limited Labelled Data for Image + and Video Understanding. Best Paper Award +
+
+
+
+
+ + ♻ ☆ Semantics-aware Motion Retargeting with Vision-Language Models CVPR2024 + + +
+ Capturing and preserving motion semantics is essential to motion retargeting +between animation characters. However, most of the previous works neglect the +semantic information or rely on human-designed joint-level representations. +Here, we present a novel Semantics-aware Motion reTargeting (SMT) method with +the advantage of vision-language models to extract and maintain meaningful +motion semantics. We utilize a differentiable module to render 3D motions. Then +the high-level motion semantics are incorporated into the motion retargeting +process by feeding the vision-language model with the rendered images and +aligning the extracted semantic embeddings. To ensure the preservation of +fine-grained motion details and high-level semantics, we adopt a two-stage +pipeline consisting of skeleton-aware pre-training and fine-tuning with +semantics and geometry constraints. Experimental results show the effectiveness +of the proposed method in producing high-quality motion retargeting results +while accurately preserving motion semantics. + +
+
+ comment: Accepted in CVPR2024 +
+
+
+
+
+ + ♻ ☆ Adversarial Nibbler: An Open Red-Teaming Method for Identifying Diverse + Harms in Text-to-Image Generation + + +
+ With the rise of text-to-image (T2I) generative AI models reaching wide +audiences, it is critical to evaluate model robustness against non-obvious +attacks to mitigate the generation of offensive images. By focusing on +``implicitly adversarial'' prompts (those that trigger T2I models to generate +unsafe images for non-obvious reasons), we isolate a set of difficult safety +issues that human creativity is well-suited to uncover. To this end, we built +the Adversarial Nibbler Challenge, a red-teaming methodology for crowdsourcing +a diverse set of implicitly adversarial prompts. We have assembled a suite of +state-of-the-art T2I models, employed a simple user interface to identify and +annotate harms, and engaged diverse populations to capture long-tail safety +issues that may be overlooked in standard testing. The challenge is run in +consecutive rounds to enable a sustained discovery and analysis of safety +pitfalls in T2I models. + In this paper, we present an in-depth account of our methodology, a +systematic study of novel attack strategies and discussion of safety failures +revealed by challenge participants. We also release a companion visualization +tool for easy exploration and derivation of insights from the dataset. The +first challenge round resulted in over 10k prompt-image pairs with machine +annotations for safety. A subset of 1.5k samples contains rich human +annotations of harm types and attack styles. We find that 14% of images that +humans consider harmful are mislabeled as ``safe'' by machines. We have +identified new attack strategies that highlight the complexity of ensuring T2I +model robustness. Our findings emphasize the necessity of continual auditing +and adaptation as new vulnerabilities emerge. We are confident that this work +will enable proactive, iterative safety assessments and promote responsible +development of T2I models. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ CONDA: Continual Unsupervised Domain Adaptation Learning in Visual + Perception for Self-Driving Cars CVPR + + +
+ Although unsupervised domain adaptation methods have achieved remarkable +performance in semantic scene segmentation in visual perception for +self-driving cars, these approaches remain impractical in real-world use cases. +In practice, the segmentation models may encounter new data that have not been +seen yet. Also, the previous data training of segmentation models may be +inaccessible due to privacy problems. Therefore, to address these problems, in +this work, we propose a Continual Unsupervised Domain Adaptation (CONDA) +approach that allows the model to continuously learn and adapt with respect to +the presence of the new data. Moreover, our proposed approach is designed +without the requirement of accessing previous training data. To avoid the +catastrophic forgetting problem and maintain the performance of the +segmentation models, we present a novel Bijective Maximum Likelihood loss to +impose the constraint of predicted segmentation distribution shifts. The +experimental results on the benchmark of continual unsupervised domain +adaptation have shown the advanced performance of the proposed CONDA method. + +
+
+ comment: Accepted to CVPRW 2024 +
+
+
+
+
+ + ♻ ☆ Language-guided Image Reflection Separation + + +
+ This paper studies the problem of language-guided reflection separation, +which aims at addressing the ill-posed reflection separation problem by +introducing language descriptions to provide layer content. We propose a +unified framework to solve this problem, which leverages the cross-attention +mechanism with contrastive learning strategies to construct the correspondence +between language descriptions and image layers. A gated network design and a +randomized training strategy are employed to tackle the recognizable layer +ambiguity. The effectiveness of the proposed method is validated by the +significant performance advantage over existing reflection separation methods +on both quantitative and qualitative comparisons. + +
+
+
+
+
+ + ♻ ☆ Vision-Language Models for Medical Report Generation and Visual Question + Answering: A Review + + +
+ Medical vision-language models (VLMs) combine computer vision (CV) and +natural language processing (NLP) to analyze visual and textual medical data. +Our paper reviews recent advancements in developing VLMs specialized for +healthcare, focusing on models designed for medical report generation and +visual question answering (VQA). We provide background on NLP and CV, +explaining how techniques from both fields are integrated into VLMs to enable +learning from multimodal data. Key areas we address include the exploration of +medical vision-language datasets, in-depth analyses of architectures and +pre-training strategies employed in recent noteworthy medical VLMs, and +comprehensive discussion on evaluation metrics for assessing VLMs' performance +in medical report generation and VQA. We also highlight current challenges and +propose future directions, including enhancing clinical validity and addressing +patient privacy concerns. Overall, our review summarizes recent progress in +developing VLMs to harness multimodal medical data for improved healthcare +applications. + +
+
+ comment: 43 pages; paper edited and restructured +
+
+
+
+
+ + ♻ ☆ Trajectory Consistency Distillation: Improved Latent Consistency + Distillation by Semi-Linear Consistency Function with Trajectory Mapping + + +
+ Latent Consistency Model (LCM) extends the Consistency Model to the latent
+space and leverages the guided consistency distillation technique to achieve
+impressive performance in accelerating text-to-image synthesis. However, we
+observed that LCM struggles to generate images with both clarity and detailed
+intricacy. Consequently, we introduce Trajectory Consistency Distillation
+(TCD), which encompasses a trajectory consistency function and strategic
+stochastic sampling. The trajectory consistency function diminishes the
+parameterisation and distillation errors by broadening the scope of the
+self-consistency boundary condition with trajectory mapping and endowing the
+TCD with the ability to accurately trace the entire trajectory of the
+Probability Flow ODE in semi-linear form with an Exponential Integrator.
+Additionally, strategic stochastic sampling provides explicit control of
+stochasticity and circumvents the accumulated errors inherent in multi-step
+consistency sampling. Experiments demonstrate that TCD not only significantly
+enhances image quality at low NFEs but also yields more detailed results
+compared to the teacher model at high NFEs.
+ 

+
+ comment: Project Page: https://mhh0318.github.io/tcd +
+
+
+
+
+ + ♻ ☆ EAMA : Entity-Aware Multimodal Alignment Based Approach for News Image + Captioning + + +
+ News image captioning requires a model to generate an informative caption
+rich in entities, given the news image and the associated news article. Though
+Multimodal Large Language Models (MLLMs) have demonstrated remarkable
+capabilities in addressing various vision-language tasks, our research finds
+that current MLLMs still bear limitations in handling entity information on the
+news image captioning task. Besides, while MLLMs have the ability to process
+long inputs, generating high-quality news image captions still requires a
+trade-off between sufficiency and conciseness of textual input information. To
+explore the potential of MLLMs and address the problems we discovered, we
+propose EAMA, an Entity-Aware Multimodal Alignment based approach for news
+image captioning. Our approach first aligns the MLLM through a Balance Training
+Strategy with two extra alignment tasks, the Entity-Aware Sentence Selection
+task and the Entity Selection task, together with the News Image Captioning
+task, to enhance its capability in handling multimodal entity information. The
+aligned MLLM then utilizes the additional entity-related information it
+explicitly extracts to supplement its textual input while generating news image
+captions. Our approach achieves better results than all previous models in
+CIDEr score on the GoodNews dataset (72.33 -> 88.39) and the NYTimes800k
+dataset (70.83 -> 85.61).
+ 

+
+
+
+
+ + ♻ ☆ FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based + Rendering of Dynamic Scenes CVPR 2024 + + +
+ We introduce FlowIBR, a novel approach for efficient monocular novel view +synthesis of dynamic scenes. Existing techniques already show impressive +rendering quality but tend to focus on optimization within a single scene +without leveraging prior knowledge, resulting in long optimization times per +scene. FlowIBR circumvents this limitation by integrating a neural image-based +rendering method, pre-trained on a large corpus of widely available static +scenes, with a per-scene optimized scene flow field. Utilizing this flow field, +we bend the camera rays to counteract the scene dynamics, thereby presenting +the dynamic scene as if it were static to the rendering network. The proposed +method reduces per-scene optimization time by an order of magnitude, achieving +comparable rendering quality to existing methods -- all on a single +consumer-grade GPU. + +
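+
+ The sketch below gives one way to picture the flow-based "bending": sample
+points at time t are displaced by a learned scene-flow field back towards a
+canonical time before being handed to the pre-trained static renderer. The
+flow MLP, its input encoding, and the canonical-time convention are
+illustrative assumptions, not FlowIBR's exact formulation.
+<pre><code>
+import torch
+import torch.nn as nn
+
+flow_field = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 3))
+
+def bend_to_canonical(points, t, t_canonical=0.0):
+    """Displace 3D sample points observed at time t back to a canonical time."""
+    dt = torch.full_like(points[..., :1], t - t_canonical)
+    return points + flow_field(torch.cat([points, dt], dim=-1))
+
+ray_samples = torch.rand(1024, 3)           # points sampled along camera rays
+static_like = bend_to_canonical(ray_samples, t=0.3)
+# static_like is what would be fed to the pre-trained static renderer.
+print(static_like.shape)                    # torch.Size([1024, 3])
+</code></pre>
+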
+
+ comment: Accepted to CVPR 2024 Workshop on Efficient Deep Learning for + Computer Vision. Project page: https://flowibr.github.io +
+
+
+
+
+ + ♻ ☆ 4D Facial Expression Diffusion Model + + +
+ Facial expression generation is one of the most challenging and long-sought +aspects of character animation, with many interesting applications. The +challenging task, traditionally having relied heavily on digital craftspersons, +remains yet to be explored. In this paper, we introduce a generative framework +for generating 3D facial expression sequences (i.e. 4D faces) that can be +conditioned on different inputs to animate an arbitrary 3D face mesh. It is +composed of two tasks: (1) Learning the generative model that is trained over a +set of 3D landmark sequences, and (2) Generating 3D mesh sequences of an input +facial mesh driven by the generated landmark sequences. The generative model is +based on a Denoising Diffusion Probabilistic Model (DDPM), which has achieved +remarkable success in generative tasks of other domains. While it can be +trained unconditionally, its reverse process can still be conditioned by +various condition signals. This allows us to efficiently develop several +downstream tasks involving various conditional generation, by using expression +labels, text, partial sequences, or simply a facial geometry. To obtain the +full mesh deformation, we then develop a landmark-guided encoder-decoder to +apply the geometrical deformation embedded in landmarks on a given facial mesh. +Experiments show that our model has learned to generate realistic, quality +expressions solely from the dataset of relatively small size, improving over +the state-of-the-art methods. Videos and qualitative comparisons with other +methods can be found at \url{https://github.com/ZOUKaifeng/4DFM}. + +
+
+
+
+
+ + ♻ ☆ Exploring Limits of Diffusion-Synthetic Training with Weakly Supervised + Semantic Segmentation + + +
+ The advance of generative models for images has inspired various training
+techniques for image recognition utilizing synthetic images. In semantic
+segmentation, one promising approach is extracting pseudo-masks from attention
+maps in text-to-image diffusion models, which enables
+real-image-and-annotation-free training. However, the pioneering training
+method using the diffusion-synthetic images and pseudo-masks, i.e., DiffuMask,
+has limitations in terms of mask quality, scalability, and the range of
+applicable domains. To overcome these limitations, this work introduces three
+techniques for diffusion-synthetic semantic segmentation training. First,
+reliability-aware robust training, originally used in weakly supervised
+learning, helps segmentation with insufficient synthetic mask quality. Second,
+we introduce prompt augmentation, data augmentation applied to the prompt text
+set to scale up and diversify training images with limited text resources.
+Finally, LoRA-based adaptation of Stable Diffusion enables the transfer to a
+distant domain, e.g., auto-driving images. Experiments on PASCAL VOC,
+ImageNet-S, and Cityscapes show that our method effectively closes the gap
+between real and synthetic training in semantic segmentation.
+ 

+
+
+
+
+ + ♻ ☆ Unbiased Image Synthesis via Manifold Guidance in Diffusion Models + + +
+ Diffusion Models are a potent class of generative models capable of producing
+high-quality images. However, they often inadvertently favor certain data
+attributes, undermining the diversity of generated images. This issue is
+starkly apparent in skewed datasets like CelebA, where the initial dataset
+disproportionately favors females over males by 57.9%, a bias that is amplified
+in the generated data, where female representation outstrips male
+representation by 148%. In response, we propose a plug-and-play method named
+Manifold Guidance Sampling, which is also the first unsupervised method to
+mitigate the bias issue in DDPMs. Leveraging the inherent structure of the data
+manifold, this method steers the sampling process towards a more uniform
+distribution, effectively dispersing the clustering of biased data. Without the
+need to modify the existing model or perform additional training, it
+significantly mitigates data bias and enhances the quality and unbiasedness of
+the generated images.
+ 

+
+
+
+
+ + ♻ ☆ Modeling Dense Multimodal Interactions Between Biological Pathways and + Histology for Survival Prediction CVPR 2024 + + +
+ Integrating whole-slide images (WSIs) and bulk transcriptomics for predicting +patient survival can improve our understanding of patient prognosis. However, +this multimodal task is particularly challenging due to the different nature of +these data: WSIs represent a very high-dimensional spatial description of a +tumor, while bulk transcriptomics represent a global description of gene +expression levels within that tumor. In this context, our work aims to address +two key challenges: (1) how can we tokenize transcriptomics in a semantically +meaningful and interpretable way?, and (2) how can we capture dense multimodal +interactions between these two modalities? Specifically, we propose to learn +biological pathway tokens from transcriptomics that can encode specific +cellular functions. Together with histology patch tokens that encode the +different morphological patterns in the WSI, we argue that they form +appropriate reasoning units for downstream interpretability analyses. We +propose fusing both modalities using a memory-efficient multimodal Transformer +that can model interactions between pathway and histology patch tokens. Our +proposed model, SURVPATH, achieves state-of-the-art performance when evaluated +against both unimodal and multimodal baselines on five datasets from The Cancer +Genome Atlas. Our interpretability framework identifies key multimodal +prognostic factors, and, as such, can provide valuable insights into the +interaction between genotype and phenotype, enabling a deeper understanding of +the underlying biological mechanisms at play. We make our code public at: +https://github.com/ajv012/SurvPath. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Latent Noise Segmentation: How Neural Noise Leads to the Emergence of + Segmentation and Grouping + + +
+ Humans are able to segment images effortlessly without supervision using +perceptual grouping. In this work, we propose a counter-intuitive computational +approach to solving unsupervised perceptual grouping and segmentation: that +they arise \textit{because} of neural noise, rather than in spite of it. We (1) +mathematically demonstrate that under realistic assumptions, neural noise can +be used to separate objects from each other; (2) that adding noise in a DNN +enables the network to segment images even though it was never trained on any +segmentation labels; and (3) that segmenting objects using noise results in +segmentation performance that aligns with the perceptual grouping phenomena +observed in humans, and is sample-efficient. We introduce the Good Gestalt (GG) +datasets -- six datasets designed to specifically test perceptual grouping, and +show that our DNN models reproduce many important phenomena in human +perception, such as illusory contours, closure, continuity, proximity, and +occlusion. Finally, we (4) show that our model improves performance on our GG +datasets compared to other tested unsupervised models by $24.9\%$. Together, +our results suggest a novel unsupervised segmentation method requiring few +assumptions, a new explanation for the formation of perceptual grouping, and a +novel potential benefit of neural noise. + +
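+ A toy sketch of the general mechanism described above (not the authors'
+model): perturb a reconstruction network's latent code with small noise
+several times and group pixels whose outputs co-vary across the noise draws.
+The tiny untrained autoencoder here is only a stand-in for a pretrained
+reconstruction model, and the clustering step is an assumed post-processing
+choice.
+```python
+import torch
+import torch.nn as nn
+from sklearn.cluster import KMeans
+
+class TinyAE(nn.Module):
+    """Stand-in autoencoder; in practice a pretrained reconstruction model."""
+    def __init__(self, dim=64):
+        super().__init__()
+        self.enc = nn.Sequential(nn.Conv2d(3, dim, 4, 2, 1), nn.ReLU(),
+                                 nn.Conv2d(dim, dim, 4, 2, 1), nn.ReLU())
+        self.dec = nn.Sequential(nn.ConvTranspose2d(dim, dim, 4, 2, 1), nn.ReLU(),
+                                 nn.ConvTranspose2d(dim, 3, 4, 2, 1))
+
+    def forward(self, x, noise_std=0.0):
+        z = self.enc(x)
+        if noise_std > 0:
+            z = z + noise_std * torch.randn_like(z)   # inject latent noise
+        return self.dec(z)
+
+@torch.no_grad()
+def noise_segmentation(model, img, n_samples=16, noise_std=0.1, n_segments=4):
+    """Cluster pixels by how their reconstruction shifts across noise draws."""
+    base = model(img, noise_std=0.0)
+    deltas = [(model(img, noise_std=noise_std) - base).squeeze(0)
+              for _ in range(n_samples)]
+    feats = torch.cat(deltas, dim=0).flatten(1).T     # (H*W, n_samples*3)
+    labels = KMeans(n_clusters=n_segments, n_init=5).fit_predict(feats.numpy())
+    h, w = img.shape[-2:]
+    return labels.reshape(h, w)
+
+if __name__ == "__main__":
+    seg = noise_segmentation(TinyAE().eval(), torch.rand(1, 3, 64, 64))
+    print(seg.shape)   # (64, 64) segment labels
+```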
+
+
+
+
+ + ♻ ☆ RoHM: Robust Human Motion Reconstruction via Diffusion + + +
+ We propose RoHM, an approach for robust 3D human motion reconstruction from +monocular RGB(-D) videos in the presence of noise and occlusions. Most previous +approaches either train neural networks to directly regress motion in 3D or +learn data-driven motion priors and combine them with optimization at test +time. The former do not recover globally coherent motion and fail under +occlusions; the latter are time-consuming, prone to local minima, and require +manual tuning. To overcome these shortcomings, we exploit the iterative, +denoising nature of diffusion models. RoHM is a novel diffusion-based motion +model that, conditioned on noisy and occluded input data, reconstructs +complete, plausible motions in consistent global coordinates. Given the +complexity of the problem -- requiring one to address different tasks +(denoising and infilling) in different solution spaces (local and global +motion) -- we decompose it into two sub-tasks and learn two models, one for +global trajectory and one for local motion. To capture the correlations between +the two, we then introduce a novel conditioning module, combining it with an +iterative inference scheme. We apply RoHM to a variety of tasks -- from motion +reconstruction and denoising to spatial and temporal infilling. Extensive +experiments on three popular datasets show that our method outperforms +state-of-the-art approaches qualitatively and quantitatively, while being +faster at test time. The code is available at +https://sanweiliti.github.io/ROHM/ROHM.html. + +
+
+ comment: With the appendix included +
+
+
+
+
+ + ♻ ☆ Neural Knitworks: Patched Neural Implicit Representation Networks + + +
+ Coordinate-based Multilayer Perceptron (MLP) networks, despite being capable +of learning neural implicit representations, are not performant for internal +image synthesis applications. Convolutional Neural Networks (CNNs) are +typically used instead for a variety of internal generative tasks, at the cost +of a larger model. We propose Neural Knitwork, an architecture for neural +implicit representation learning of natural images that achieves image +synthesis by optimizing the distribution of image patches in an adversarial +manner and by enforcing consistency between the patch predictions. To the best +of our knowledge, this is the first implementation of a coordinate-based MLP +tailored for synthesis tasks such as image inpainting, super-resolution, and +denoising. We demonstrate the utility of the proposed technique by training on +these three tasks. The results show that modeling natural images using patches, +rather than pixels, produces results of higher fidelity. The resulting model +requires 80% fewer parameters than alternative CNN-based solutions while +achieving comparable performance and training time. + +
+
+ comment: Published in Pattern Recognition +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 66 + +
+
+
+ + ☆ Orientation-conditioned Facial Texture Mapping for Video-based Facial + Remote Photoplethysmography Estimation + + +
+ Camera-based remote photoplethysmography (rPPG) enables contactless
+measurement of important physiological signals such as pulse rate (PR).
+However, dynamic and unconstrained subject motion introduces significant
+variability into the facial appearance in video, confounding the ability of
+video-based methods to accurately extract the rPPG signal. In this study, we
+leverage the 3D facial surface to construct a novel orientation-conditioned
+facial texture video representation which improves the motion robustness of
+existing video-based facial rPPG estimation methods. Our proposed method
+achieves a significant 18.2% performance improvement in cross-dataset testing
+on MMPD over our baseline using the PhysNet model trained on PURE, highlighting
+the efficacy and generalization benefits of our designed video representation.
+We demonstrate significant performance improvements of up to 29.6% in all
+tested motion scenarios in cross-dataset testing on MMPD, even in the presence
+of dynamic and unconstrained subject motion, emphasizing the benefits of
+disentangling motion through modeling the 3D facial surface for motion-robust
+facial rPPG estimation. We validate the efficacy of our design decisions and
+the impact of different video processing steps through an ablation study. Our
+findings illustrate the potential strengths of exploiting the 3D facial surface
+as a general strategy for addressing dynamic and unconstrained subject motion
+in videos. The code is available at
+https://samcantrill.github.io/orientation-uv-rppg/.
+
+
+
+ comment: 12 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ \textit{sweet} -- An Open Source Modular Platform for Contactless Hand + Vascular Biometric Experiments + + +
+ Current finger-vein or palm-vein recognition systems usually require direct
+contact of the subject with the apparatus. This can be problematic in
+environments where hygiene is of primary importance. In this work we present a
+contactless vascular biometrics sensor platform named sweet, which can be used
+for hand vascular biometrics studies (wrist-, palm- and finger-vein) and
+surface features such as palmprint. It supports several acquisition modalities
+such as multi-spectral Near-Infrared (NIR), RGB-color, Stereo Vision (SV) and
+Photometric Stereo (PS). Using this platform we collect a dataset consisting of
+the fingers, palm and wrist vascular data of 120 subjects and develop a
+powerful 3D pipeline for the pre-processing of this data. We then present
+biometric experimental results, focusing on Finger-Vein Recognition (FVR).
+Finally, we discuss the fusion of multiple modalities, such as palm-vein
+combined with palm-print biometrics. The acquisition software, parts of the
+hardware design, the new FV dataset, as well as source-code for our experiments
+are publicly available for research purposes.
+
+
+
+
+
+
+ + ☆ Exploring Feedback Generation in Automated Skeletal Movement Assessment: + A Comprehensive Overview + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection from 2D or 3D videos. While the primary objective of +automatic assessment tasks is to score movements, the automatic generation of +feedback highlighting key movement issues has the potential to significantly +enhance and accelerate the rehabilitation process. In this study, we explain +the types of feedback that can be generated, review existing solutions for +automatic feedback generation, and discuss future research directions. To our +knowledge, this is the first comprehensive review of feedback generation in +skeletal movement assessment. + +
+
+
+
+
+ + ☆ Adversarial Robustness Limits via Scaling-Law and Human-Alignment + Studies + + +
+ This paper revisits the simple, long-studied, yet still unsolved problem of +making image classifiers robust to imperceptible perturbations. Taking CIFAR10 +as an example, SOTA clean accuracy is about $100$%, but SOTA robustness to +$\ell_{\infty}$-norm bounded perturbations barely exceeds $70$%. To understand +this gap, we analyze how model size, dataset size, and synthetic data quality +affect robustness by developing the first scaling laws for adversarial +training. Our scaling laws reveal inefficiencies in prior art and provide +actionable feedback to advance the field. For instance, we discovered that SOTA +methods diverge notably from compute-optimal setups, using excess compute for +their level of robustness. Leveraging a compute-efficient setup, we surpass the +prior SOTA with $20$% ($70$%) fewer training (inference) FLOPs. We trained +various compute-efficient models, with our best achieving $74$% AutoAttack +accuracy ($+3$% gain). However, our scaling laws also predict robustness slowly +grows then plateaus at $90$%: dwarfing our new SOTA by scaling is impractical, +and perfect robustness is impossible. To better understand this predicted +limit, we carry out a small-scale human evaluation on the AutoAttack data that +fools our top-performing model. Concerningly, we estimate that human +performance also plateaus near $90$%, which we show to be attributable to +$\ell_{\infty}$-constrained attacks' generation of invalid images not +consistent with their original labels. Having characterized limiting +roadblocks, we outline promising paths for future research. + +
+
+
+
+
+ + ☆ Face-voice Association in Multilingual Environments (FAME) Challenge + 2024 Evaluation Plan + + +
+ Advancements in technology have led to the use of multimodal systems in
+various real-world applications. Among them, audio-visual systems are among the
+most widely used multimodal systems. In recent years, associating the face and
+voice of a person has gained attention due to the presence of a unique
+correlation between them. The Face-voice Association in Multilingual
+Environments (FAME) Challenge 2024 focuses on exploring face-voice association
+under the unique condition of a multilingual scenario. This condition is
+inspired by the fact that half of the world's population is bilingual and
+people most often communicate in multilingual scenarios. The challenge uses a
+dataset, namely Multilingual Audio-Visual (MAV-Celeb), for exploring face-voice
+association in multilingual environments. This report provides the details of
+the challenge, dataset, baselines and task details for the FAME Challenge.
+
+
+
+ comment: ACM Multimedia Conference - Grand Challenge +
+
+
+
+
+ + ☆ Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision + Transformers + + +
+ Few-shot knowledge distillation recently emerged as a viable approach to +harness the knowledge of large-scale pre-trained models, using limited data and +computational resources. In this paper, we propose a novel few-shot feature +distillation approach for vision transformers. Our approach is based on two key +steps. Leveraging the fact that vision transformers have a consistent +depth-wise structure, we first copy the weights from intermittent layers of +existing pre-trained vision transformers (teachers) into shallower +architectures (students), where the intermittence factor controls the +complexity of the student transformer with respect to its teacher. Next, we +employ an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge +into the student in a few-shot scenario, aiming to recover the information +processing carried out by the skipped teacher layers. We present comprehensive +experiments with supervised and self-supervised transformers as teachers, on +five data sets from various domains, including natural, medical and satellite +images. The empirical results confirm the superiority of our approach over +competitive baselines. Moreover, the ablation results demonstrate the +usefulness of each component of the proposed pipeline. + +
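+ A schematic of the two steps above on a generic ViT-style stack: copy every
+k-th teacher block into a shallower student, then freeze the copied weights
+and train only low-rank adapters on the student's linear layers. The toy
+block, layer names, and matching loss are assumptions for illustration, not
+the authors' implementation.
+```python
+import copy
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen linear layer plus a trainable low-rank update."""
+    def __init__(self, base: nn.Linear, rank=4, alpha=1.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.alpha = alpha
+
+    def forward(self, x):
+        return self.base(x) + self.alpha * (x @ self.A.T @ self.B.T)
+
+class Block(nn.Module):
+    """Toy stand-in for a transformer block (attention omitted for brevity)."""
+    def __init__(self, dim=64):
+        super().__init__()
+        self.fc1 = nn.Linear(dim, 4 * dim)
+        self.fc2 = nn.Linear(4 * dim, dim)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        return x + self.fc2(self.act(self.fc1(x)))
+
+def build_student(teacher_blocks, stride=2, rank=4):
+    """Copy every `stride`-th teacher block and wrap its linears with LoRA."""
+    student = nn.ModuleList(copy.deepcopy(teacher_blocks[i])
+                            for i in range(0, len(teacher_blocks), stride))
+    for block in student:
+        for name, module in list(block.named_children()):
+            if isinstance(module, nn.Linear):
+                setattr(block, name, LoRALinear(module, rank=rank))
+    return student
+
+if __name__ == "__main__":
+    teacher = nn.ModuleList(Block() for _ in range(12))
+    student = build_student(teacher, stride=2)    # 6 student blocks
+    x = torch.randn(8, 16, 64)                    # (batch, tokens, dim)
+    # Few-shot feature distillation target, shown schematically per block pair.
+    loss = sum(nn.functional.mse_loss(s(x), t(x))
+               for s, t in zip(student, teacher[::2]))
+    trainable = sum(p.numel() for p in student.parameters() if p.requires_grad)
+    print(len(student), "student blocks,", trainable, "trainable params")
+```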
+
+
+
+
+ + ☆ In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and + Action Recognition + + +
+ Action recognition is essential for egocentric video understanding, allowing +automatic and continuous monitoring of Activities of Daily Living (ADLs) +without user effort. Existing literature focuses on 3D hand pose input, which +requires computationally intensive depth estimation networks or wearing an +uncomfortable depth sensor. In contrast, there has been insufficient research +in understanding 2D hand pose for egocentric action recognition, despite the +availability of user-friendly smart glasses in the market capable of capturing +a single RGB image. Our study aims to fill this research gap by exploring the +field of 2D hand pose estimation for egocentric action recognition, making two +contributions. Firstly, we introduce two novel approaches for 2D hand pose +estimation, namely EffHandNet for single-hand estimation and EffHandEgoNet, +tailored for an egocentric perspective, capturing interactions between hands +and objects. Both methods outperform state-of-the-art models on H2O and FPHA +public benchmarks. Secondly, we present a robust action recognition +architecture from 2D hand and object poses. This method incorporates +EffHandEgoNet, and a transformer-based action recognition method. Evaluated on +H2O and FPHA datasets, our architecture has a faster inference time and +achieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of +the art, including 3D-based methods. Our work demonstrates that using 2D +skeletal data is a robust approach for egocentric action understanding. +Extensive evaluation and ablation studies show the impact of the hand pose +estimation approach, and how each input affects the overall performance. + +
+
+ comment: Accepted at: The 18th IEEE International Conference on Automatic Face + and Gesture Recognition +
+
+
+
+
+ + ☆ A Simple Strategy for Body Estimation from Partial-View Images CVPR + + +
+ Virtual try-on and product personalization have become increasingly important +in modern online shopping, highlighting the need for accurate body measurement +estimation. Although previous research has advanced in estimating 3D body +shapes from RGB images, the task is inherently ambiguous as the observed scale +of human subjects in the images depends on two unknown factors: capture +distance and body dimensions. This ambiguity is particularly pronounced in +partial-view scenarios. To address this challenge, we propose a modular and +simple height normalization solution. This solution relocates the subject +skeleton to the desired position, thereby normalizing the scale and +disentangling the relationship between the two variables. Our experimental +results demonstrate that integrating this technique into state-of-the-art human +mesh reconstruction models significantly enhances partial body measurement +estimation. Additionally, we illustrate the applicability of this approach to +multi-view settings, showcasing its versatility. + +
+
+ comment: Accepted to CVPRW 2024 Computer Vision for Fashion, Art, and Design +
+
+
+
+
+ + ☆ A Novel State Space Model with Local Enhancement and State Sharing for + Image Fusion + + +
+ In image fusion tasks, images from different sources possess distinct
+characteristics. This has driven the development of numerous methods to explore
+better ways of fusing them while preserving their respective characteristics.
+Mamba, as a state space model, has emerged in the field of natural language
+processing. Recently, many studies have attempted to extend Mamba to vision
+tasks. However, because images differ in nature from causal language sequences,
+the limited state capacity of Mamba weakens its ability to model image
+information. Additionally, Mamba's sequence modeling can only capture spatial
+information and cannot effectively capture the rich spectral information in
+images. Motivated by these challenges, we customize and improve the vision
+Mamba network designed for the image fusion task. Specifically, we propose the
+local-enhanced vision Mamba block, dubbed LEVM. The LEVM block can improve
+local information perception of the network and simultaneously learn local and
+global spatial information. Furthermore, we propose the state sharing technique
+to enhance spatial details and integrate spatial and spectral information.
+Finally, the overall network is a multi-scale structure based on vision Mamba,
+called LE-Mamba. Extensive experiments show the proposed methods achieve
+state-of-the-art results on multispectral pansharpening and multispectral and
+hyperspectral image fusion datasets, and demonstrate the effectiveness of the
+proposed approach. Code will be made available.
+
+
+
+
+
+
+ + ☆ Bridging Data Islands: Geographic Heterogeneity-Aware Federated Learning + for Collaborative Remote Sensing Semantic Segmentation + + +
+ Remote sensing semantic segmentation (RSS) is an essential task in Earth +Observation missions. Due to data privacy concerns, high-quality remote sensing +images with annotations cannot be well shared among institutions, making it +difficult to fully utilize RSS data to train a generalized model. Federated +Learning (FL), a privacy-preserving collaborative learning technology, is a +potential solution. However, the current research on how to effectively apply +FL in RSS is still scarce and requires further investigation. Remote sensing +images in various institutions often exhibit strong geographical heterogeneity. +More specifically, it is reflected in terms of class-distribution heterogeneity +and object-appearance heterogeneity. Unfortunately, most existing FL studies +show inadequate focus on geographical heterogeneity, thus leading to +performance degradation in the global model. Considering the aforementioned +issues, we propose a novel Geographic Heterogeneity-Aware Federated Learning +(GeoFed) framework to address privacy-preserving RSS. Through Global Feature +Extension and Tail Regeneration modules, class-distribution heterogeneity is +alleviated. Additionally, we design an Essential Feature Mining strategy to +alleviate object-appearance heterogeneity by constructing essential features. +Extensive experiments on three datasets (i.e., FBP, CASID, Inria) show that our +GeoFed consistently outperforms the current state-of-the-art methods. The code +will be available publicly. + +
+
+ comment: 13 pages,9 figures, 4 tables +
+
+
+
+
+ + ☆ RoofDiffusion: Constructing Roofs from Severely Corrupted Point Data via + Diffusion + + +
+ Accurate completion and denoising of roof height maps are crucial to
+reconstructing high-quality 3D buildings. Repairing sparse points can enhance
+low-cost sensor use and reduce UAV flight overlap. RoofDiffusion is a new
+end-to-end self-supervised diffusion technique for robustly completing even
+particularly difficult roof height maps. RoofDiffusion leverages
+widely-available curated footprints and can thus handle up to 99\% point
+sparsity and 80\% roof area occlusion (regional incompleteness). A variant,
+No-FP RoofDiffusion, simultaneously predicts building footprints and heights.
+Both quantitatively outperform state-of-the-art unguided depth completion and
+representative inpainting methods for Digital Elevation Models (DEM), on both a
+roof-specific benchmark and the BuildingNet dataset. Qualitative assessments
+show the effectiveness of RoofDiffusion for datasets with real-world scans
+including AHN3, Dales3D, and USGS 3DEP LiDAR. Tested with the leading City3D
+algorithm, preprocessing height maps with RoofDiffusion noticeably improves 3D
+building reconstruction. RoofDiffusion is complemented by a new dataset of 13k
+complex roof geometries, focusing on long-tail issues in remote sensing; a
+novel simulation of tree occlusion; and a wide variety of large-area roof
+cut-outs for data augmentation and benchmarking.
+
+
+
+
+
+
+ + ☆ SyntStereo2Real: Edge-Aware GAN for Remote Sensing Image-to-Image + Translation while Maintaining Stereo Constraint CVPR + + +
+ In the field of remote sensing, the scarcity of stereo-matched data, and
+particularly the lack of accurate ground truth, often hinders the training of
+deep neural networks. The use of synthetically generated images as an
+alternative alleviates this problem but suffers from the problem of domain
+generalization. Unifying the capabilities of image-to-image translation and
+stereo-matching presents an effective solution to address the issue of domain
+generalization. Current methods involve combining two networks, an unpaired
+image-to-image translation network and a stereo-matching network, while jointly
+optimizing them. We propose an edge-aware GAN-based network that effectively
+tackles both tasks simultaneously. We obtain edge maps of the input images with
+the Sobel operator and use them as an additional input to the encoder in the
+generator to enforce geometric consistency during translation. We additionally
+include a warping loss calculated from the translated images to maintain stereo
+consistency. We demonstrate that our model produces qualitatively and
+quantitatively superior results to existing models, and its applicability
+extends to diverse domains, including autonomous driving.
+
+
+
+ comment: Accepted to IEEE Conference on Computer Vision and Pattern + Recognition Workshop (CVPRW) EarthVision +
+
+
+
+
+ + ☆ TrafficVLM: A Controllable Visual Language Model for Traffic Video + Captioning + + +
+ Traffic video description and analysis have received much attention recently +due to the growing demand for efficient and reliable urban surveillance +systems. Most existing methods only focus on locating traffic event segments, +which severely lack descriptive details related to the behaviour and context of +all the subjects of interest in the events. In this paper, we present +TrafficVLM, a novel multi-modal dense video captioning model for vehicle ego +camera view. TrafficVLM models traffic video events at different levels of +analysis, both spatially and temporally, and generates long fine-grained +descriptions for the vehicle and pedestrian at different phases of the event. +We also propose a conditional component for TrafficVLM to control the +generation outputs and a multi-task fine-tuning paradigm to enhance +TrafficVLM's learning capability. Experiments show that TrafficVLM performs +well on both vehicle and overhead camera views. Our solution achieved +outstanding results in Track 2 of the AI City Challenge 2024, ranking us third +in the challenge standings. Our code is publicly available at +https://github.com/quangminhdinh/TrafficVLM. + +
+
+
+
+
+ + ☆ VRS-NeRF: Visual Relocalization with Sparse Neural Radiance Field + + +
+ Visual relocalization is a key technique for autonomous driving, robotics, and
+virtual/augmented reality. After decades of exploration, absolute pose
+regression (APR), scene coordinate regression (SCR), and hierarchical methods
+(HMs) have become the most popular frameworks. However, in spite of their high
+efficiency, APRs and SCRs have limited accuracy, especially in large-scale
+outdoor scenes; HMs are accurate but need to store a large number of 2D
+descriptors for matching, resulting in poor efficiency. In this paper, we
+propose an efficient and accurate framework, called VRS-NeRF, for visual
+relocalization with sparse neural radiance field. Precisely, we introduce an
+explicit geometric map (EGM) for 3D map representation and an implicit learning
+map (ILM) for sparse patch rendering. In the localization process, the EGM
+provides priors of sparse 2D points and the ILM utilizes these sparse points to
+render patches with sparse NeRFs for matching. This allows us to discard a
+large number of 2D descriptors so as to reduce the map size. Moreover,
+rendering patches only for useful points rather than all pixels in the whole
+image reduces the rendering time significantly. This framework inherits the
+accuracy of HMs while avoiding their low efficiency. Experiments on 7Scenes,
+CambridgeLandmarks, and Aachen datasets show that our method gives much better
+accuracy than APRs and SCRs, and performance close to that of HMs, while being
+much more efficient.
+
+
+
+ comment: source code https://github.com/feixue94/vrs-nerf +
+
+
+
+
+ + ☆ PANet: A Physics-guided Parametric Augmentation Net for Image Dehazing + by Hazing + + +
+ Image dehazing faces challenges when dealing with hazy images in real-world +scenarios. A huge domain gap between synthetic and real-world haze images +degrades dehazing performance in practical settings. However, collecting +real-world image datasets for training dehazing models is challenging since +both hazy and clean pairs must be captured under the same conditions. In this +paper, we propose a Physics-guided Parametric Augmentation Network (PANet) that +generates photo-realistic hazy and clean training pairs to effectively enhance +real-world dehazing performance. PANet comprises a Haze-to-Parameter Mapper +(HPM) to project hazy images into a parameter space and a Parameter-to-Haze +Mapper (PHM) to map the resampled haze parameters back to hazy images. In the +parameter space, we can pixel-wisely resample individual haze parameter maps to +generate diverse hazy images with physically-explainable haze conditions unseen +in the training set. Our experimental results demonstrate that PANet can +augment diverse realistic hazy images to enrich existing hazy image benchmarks +so as to effectively boost the performances of state-of-the-art image dehazing +models. + +
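+ The physics behind this kind of augmentation is the standard atmospheric
+scattering model, I(x) = J(x) t(x) + A (1 - t(x)) with t(x) = exp(-beta d(x)).
+The sketch below only shows the forward hazing step with a pixel-wise
+resampled beta map; learning PANet's haze-to-parameter and parameter-to-haze
+mappers is beyond this snippet, and the parameter ranges are assumed values.
+```python
+import numpy as np
+
+def synthesize_haze(clean, depth, beta_map, airlight):
+    """Atmospheric scattering model: I = J * t + A * (1 - t), t = exp(-beta * d)."""
+    t = np.exp(-beta_map * depth)[..., None]          # (H, W, 1) transmission
+    return clean * t + airlight * (1.0 - t)
+
+def resample_beta(shape, base_beta=1.0, jitter=0.5, cell=8, rng=None):
+    """Pixel-wise haze density: a coarse random field around a base value."""
+    rng = rng or np.random.default_rng(0)
+    coarse = rng.uniform(-jitter, jitter, (shape[0] // cell + 1,
+                                           shape[1] // cell + 1))
+    beta = np.kron(coarse, np.ones((cell, cell)))[:shape[0], :shape[1]]
+    return np.clip(base_beta + beta, 0.05, None)
+
+if __name__ == "__main__":
+    h, w = 256, 256
+    clean = np.random.rand(h, w, 3)                              # clean RGB in [0, 1]
+    depth = np.tile(np.linspace(0.5, 3.0, h)[:, None], (1, w))   # toy depth map
+    hazy = synthesize_haze(clean, depth, resample_beta((h, w)), airlight=0.9)
+    print(hazy.shape, float(hazy.min()), float(hazy.max()))
+```
+Resampling the beta map (and optionally the airlight) per image yields a
+family of physically plausible hazy variants of the same clean image.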
+
+
+
+
+ + ☆ Task-Driven Exploration: Decoupling and Inter-Task Feedback for Joint + Moment Retrieval and Highlight Detection + + +
+ Video moment retrieval and highlight detection are two highly valuable tasks
+in video understanding, but they have only recently been studied jointly.
+Although existing studies have made impressive advances recently, they
+predominantly follow the data-driven bottom-up paradigm. Such a paradigm
+overlooks task-specific and inter-task effects, resulting in poor model
+performance. In this paper, we propose a novel task-driven top-down framework,
+TaskWeave, for joint moment retrieval and highlight detection. The framework
+introduces a task-decoupled unit to capture task-specific and common
+representations. To investigate the interplay between the two tasks, we propose
+an inter-task feedback mechanism, which transforms the results of one task into
+guiding masks to assist the other task. Different from existing methods, we
+present a task-dependent joint loss function to optimize the model.
+Comprehensive experiments and in-depth ablation studies on the QVHighlights,
+TVSum, and Charades-STA datasets corroborate the effectiveness and flexibility
+of the proposed framework. Code is available at
+https://github.com/EdenGabriel/TaskWeave.
+
+
+
+
+
+
+ + ☆ FedCCL: Federated Dual-Clustered Feature Contrast Under Domain + Heterogeneity + + +
+ Federated learning (FL) facilitates a privacy-preserving neural network +training paradigm through collaboration between edge clients and a central +server. One significant challenge is that the distributed data is not +independently and identically distributed (non-IID), typically including both +intra-domain and inter-domain heterogeneity. However, recent research is +limited to simply using averaged signals as a form of regularization and only +focusing on one aspect of these non-IID challenges. Given these limitations, +this paper clarifies these two non-IID challenges and attempts to introduce +cluster representation to address them from both local and global perspectives. +Specifically, we propose a dual-clustered feature contrast-based FL framework +with dual focuses. First, we employ clustering on the local representations of +each client, aiming to capture intra-class information based on these local +clusters at a high level of granularity. Then, we facilitate cross-client +knowledge sharing by pulling the local representation closer to clusters shared +by clients with similar semantics while pushing them away from clusters with +dissimilar semantics. Second, since the sizes of local clusters belonging to +the same class may differ for each client, we further utilize clustering on the +global side and conduct averaging to create a consistent global signal for +guiding each local training in a contrastive manner. Experimental results on +multiple datasets demonstrate that our proposal achieves comparable or superior +performance gain under intra-domain and inter-domain heterogeneity. + +
+
+
+
+
+ + ☆ TEXT2TASTE: A Versatile Egocentric Vision System for Intelligent Reading + Assistance Using Large Language Model + + +
+ The ability to read, understand and find important information from written +text is a critical skill in our daily lives for our independence, comfort and +safety. However, a significant part of our society is affected by partial +vision impairment, which leads to discomfort and dependency in daily +activities. To address the limitations of this part of society, we propose an +intelligent reading assistant based on smart glasses with embedded RGB cameras +and a Large Language Model (LLM), whose functionality goes beyond corrective +lenses. The video recorded from the egocentric perspective of a person wearing +the glasses is processed to localise text information using object detection +and optical character recognition methods. The LLM processes the data and +allows the user to interact with the text and responds to a given query, thus +extending the functionality of corrective lenses with the ability to find and +summarize knowledge from the text. To evaluate our method, we create a +chat-based application that allows the user to interact with the system. The +evaluation is conducted in a real-world setting, such as reading menus in a +restaurant, and involves four participants. The results show robust accuracy in +text retrieval. The system not only provides accurate meal suggestions but also +achieves high user satisfaction, highlighting the potential of smart glasses +and LLMs in assisting people with special needs. + +
+
+ comment: Accepted at ICCHP 2024 +
+
+
+
+
+ + ☆ Arena: A Patch-of-Interest ViT Inference Acceleration System for + Edge-Assisted Video Analytics + + +
+ The advent of edge computing has made real-time intelligent video analytics +feasible. Previous works, based on traditional model architecture (e.g., CNN, +RNN, etc.), employ various strategies to filter out non-region-of-interest +content to minimize bandwidth and computation consumption but show inferior +performance in adverse environments. Recently, visual foundation models based +on transformers have shown great performance in adverse environments due to +their amazing generalization capability. However, they require a large amount +of computation power, which limits their applications in real-time intelligent +video analytics. In this paper, we find visual foundation models like Vision +Transformer (ViT) also have a dedicated acceleration mechanism for video +analytics. To this end, we introduce Arena, an end-to-end edge-assisted video +inference acceleration system based on ViT. We leverage the capability of ViT +that can be accelerated through token pruning by only offloading and feeding +Patches-of-Interest (PoIs) to the downstream models. Additionally, we employ +probability-based patch sampling, which provides a simple but efficient +mechanism for determining PoIs where the probable locations of objects are in +subsequent frames. Through extensive evaluations on public datasets, our +findings reveal that Arena can boost inference speeds by up to $1.58\times$ and +$1.82\times$ on average while consuming only 54% and 34% of the bandwidth, +respectively, all with high inference accuracy. + +
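+ A minimal sketch of the Patches-of-Interest idea: embed all patches, keep
+only the top-k patches ranked by an externally supplied probability map (for
+example, propagated from detections in previous frames), and run the
+transformer on the retained tokens plus the class token. The tiny encoder and
+the keep ratio are placeholders, not Arena's actual pipeline.
+```python
+import torch
+import torch.nn as nn
+
+class PoIViT(nn.Module):
+    """Toy ViT encoder that processes only a pruned set of patch tokens."""
+    def __init__(self, img=224, patch=16, dim=192, depth=4, heads=3):
+        super().__init__()
+        self.n_patches = (img // patch) ** 2
+        self.embed = nn.Conv2d(3, dim, kernel_size=patch, stride=patch)
+        self.cls = nn.Parameter(torch.zeros(1, 1, dim))
+        self.pos = nn.Parameter(torch.zeros(1, self.n_patches + 1, dim))
+        layer = nn.TransformerEncoderLayer(dim, heads, dim * 4, batch_first=True)
+        self.encoder = nn.TransformerEncoder(layer, depth)
+
+    def forward(self, x, poi_prob, keep_ratio=0.25):
+        b = x.size(0)
+        tokens = self.embed(x).flatten(2).transpose(1, 2) + self.pos[:, 1:]
+        k = max(1, int(self.n_patches * keep_ratio))
+        idx = poi_prob.topk(k, dim=1).indices                 # (B, k) PoI indices
+        idx = idx.unsqueeze(-1).expand(-1, -1, tokens.size(-1))
+        tokens = tokens.gather(1, idx)                        # keep PoIs only
+        cls = self.cls.expand(b, -1, -1) + self.pos[:, :1]
+        return self.encoder(torch.cat([cls, tokens], dim=1))
+
+if __name__ == "__main__":
+    model = PoIViT()
+    frames = torch.randn(2, 3, 224, 224)
+    # Assumed PoI probabilities, e.g. propagated from objects in prior frames.
+    prob = torch.rand(2, model.n_patches)
+    out = model(frames, prob)
+    print(out.shape)   # (2, 1 + 49, 192) when keeping 25% of 196 patches
+```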
+
+
+
+
+ + ☆ Tri-modal Confluence with Temporal Dynamics for Scene Graph Generation + in Operating Rooms + + +
+ A comprehensive understanding of surgical scenes allows for monitoring of the +surgical process, reducing the occurrence of accidents and enhancing efficiency +for medical professionals. Semantic modeling within operating rooms, as a scene +graph generation (SGG) task, is challenging since it involves consecutive +recognition of subtle surgical actions over prolonged periods. To address this +challenge, we propose a Tri-modal (i.e., images, point clouds, and language) +confluence with Temporal dynamics framework, termed TriTemp-OR. Diverging from +previous approaches that integrated temporal information via memory graphs, our +method embraces two advantages: 1) we directly exploit bi-modal temporal +information from the video streaming for hierarchical feature interaction, and +2) the prior knowledge from Large Language Models (LLMs) is embedded to +alleviate the class-imbalance problem in the operating theatre. Specifically, +our model performs temporal interactions across 2D frames and 3D point clouds, +including a scale-adaptive multi-view temporal interaction (ViewTemp) and a +geometric-temporal point aggregation (PointTemp). Furthermore, we transfer +knowledge from the biomedical LLM, LLaVA-Med, to deepen the comprehension of +intraoperative relations. The proposed TriTemp-OR enables the aggregation of +tri-modal features through relation-aware unification to predict relations so +as to generate scene graphs. Experimental results on the 4D-OR benchmark +demonstrate the superior performance of our model for long-term OR streaming. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation + Modeling + + +
+ Recent progress in text-to-3D creation has been propelled by integrating the
+potent prior of Diffusion Models from text-to-image generation into the 3D
+domain. Nevertheless, generating 3D scenes characterized by multiple instances
+and intricate arrangements remains challenging. In this study, we present
+DreamScape, a method for creating highly consistent 3D scenes solely from
+textual descriptions, leveraging the strong 3D representation capabilities of
+Gaussian Splatting and the complex arrangement abilities of large language
+models (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene
+representation, consisting of semantic primitives (objects) and their spatial
+transformations and relationships derived directly from text prompts using
+LLMs. This compositional representation allows for local-to-global optimization
+of the entire scene. A progressive scale control is tailored during local
+object generation, ensuring that objects of different sizes and densities adapt
+to the scene, which addresses the training instability issue arising from
+simple blending in the subsequent global optimization stage. To mitigate
+potential biases of LLM priors, we model collision relationships between
+objects at the global level, enhancing physical correctness and overall
+realism. Additionally, to generate pervasive objects like rain and snow
+distributed extensively across the scene, we introduce a sparse initialization
+and densification strategy. Experiments demonstrate that DreamScape offers high
+usability and controllability, enabling the generation of high-fidelity 3D
+scenes from only text prompts and achieving state-of-the-art performance
+compared to other methods.
+
+
+
+
+
+
+ + ☆ Breast Cancer Image Classification Method Based on Deep Transfer + Learning + + +
+ To address the issues of limited samples, time-consuming feature design, and
+low accuracy in the detection and classification of breast cancer pathological
+images, a breast cancer image classification algorithm combining deep learning
+and transfer learning is proposed. This algorithm is based on the DenseNet
+structure of deep neural networks, constructs a network model by introducing
+attention mechanisms, and trains on the enhanced dataset using multi-level
+transfer learning. Experimental results demonstrate that the algorithm achieves
+an efficiency of over 84.0\% on the test set, with a significantly improved
+classification accuracy compared to previous models, making it applicable to
+medical breast cancer detection tasks.
+
+
+
+
+
+
+ + ☆ DetCLIPv3: Towards Versatile Generative Open-vocabulary Object Detection CVPR2024 + + +
+ Existing open-vocabulary object detectors typically require a predefined set
+of categories from users, significantly confining their application scenarios.
+In this paper, we introduce DetCLIPv3, a high-performing detector that excels
+not only at open-vocabulary object detection but also at generating
+hierarchical labels for detected objects. DetCLIPv3 is characterized by three
+core designs: 1. Versatile model architecture: we derive a robust open-set
+detection framework which is further empowered with generation ability via the
+integration of a caption head. 2. High information density data: we develop an
+auto-annotation pipeline leveraging a visual large language model to refine
+captions for large-scale image-text pairs, providing rich, multi-granular
+object labels to enhance the training. 3. Efficient training strategy: we
+employ a pre-training stage with low-resolution inputs that enables the object
+captioner to efficiently learn a broad spectrum of visual concepts from
+extensive image-text paired data. This is followed by a fine-tuning stage that
+leverages a small number of high-resolution samples to further enhance
+detection performance. With these effective designs, DetCLIPv3 demonstrates
+superior open-vocabulary detection performance, e.g., our Swin-T backbone model
+achieves a notable 47.0 zero-shot fixed AP on the LVIS minival benchmark,
+outperforming GLIPv2, GroundingDINO, and DetCLIPv2 by 18.0/19.6/6.6 AP,
+respectively. DetCLIPv3 also achieves a state-of-the-art 19.7 AP on the dense
+captioning task on the VG dataset, showcasing its strong generative capability.
+
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ FedDistill: Global Model Distillation for Local Model De-Biasing in + Non-IID Federated Learning + + +
+ Federated Learning (FL) is a novel approach that allows for collaborative +machine learning while preserving data privacy by leveraging models trained on +decentralized devices. However, FL faces challenges due to non-uniformly +distributed (non-iid) data across clients, which impacts model performance and +its generalization capabilities. To tackle the non-iid issue, recent efforts +have utilized the global model as a teaching mechanism for local models. +However, our pilot study shows that their effectiveness is constrained by +imbalanced data distribution, which induces biases in local models and leads to +a 'local forgetting' phenomenon, where the ability of models to generalize +degrades over time, particularly for underrepresented classes. This paper +introduces FedDistill, a framework enhancing the knowledge transfer from the +global model to local models, focusing on the issue of imbalanced class +distribution. Specifically, FedDistill employs group distillation, segmenting +classes based on their frequency in local datasets to facilitate a focused +distillation process to classes with fewer samples. Additionally, FedDistill +dissects the global model into a feature extractor and a classifier. This +separation empowers local models with more generalized data representation +capabilities and ensures more accurate classification across all classes. +FedDistill mitigates the adverse effects of data imbalance, ensuring that local +models do not forget underrepresented classes but instead become more adept at +recognizing and classifying them accurately. Our comprehensive experiments +demonstrate FedDistill's effectiveness, surpassing existing baselines in +accuracy and convergence speed across several benchmark datasets. + +
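+ A rough sketch of the group-distillation idea: partition classes by their
+local frequency and apply a separate distillation term per group, so rare
+classes get a dedicated KL term instead of being drowned out by frequent ones.
+The grouping rule, temperature, and function names are illustrative choices,
+not FedDistill's exact recipe.
+```python
+import torch
+import torch.nn.functional as F
+
+def class_groups(class_counts, n_groups=3):
+    """Split class indices into groups of similar local frequency."""
+    order = torch.argsort(class_counts)                 # rarest classes first
+    return [g for g in torch.chunk(order, n_groups) if g.numel() > 0]
+
+def group_distillation_loss(student_logits, teacher_logits, groups, T=2.0):
+    """Per-group KL divergence between teacher and student class-subset distributions."""
+    loss = 0.0
+    for g in groups:
+        s = F.log_softmax(student_logits[:, g] / T, dim=1)
+        t = F.softmax(teacher_logits[:, g] / T, dim=1)
+        loss = loss + F.kl_div(s, t, reduction="batchmean") * (T * T)
+    return loss / len(groups)
+
+if __name__ == "__main__":
+    counts = torch.tensor([500., 480., 300., 250., 120., 90., 40., 25., 10., 5.])
+    groups = class_groups(counts, n_groups=3)
+    student = torch.randn(32, 10)    # local model logits
+    teacher = torch.randn(32, 10)    # downloaded global model logits
+    print(group_distillation_loss(student, teacher, groups).item())
+```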
+
+ comment: 13 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ TextHawk: Exploring Efficient Fine-Grained Perception of Multimodal + Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) have shown impressive results on
+various multimodal tasks. However, most existing MLLMs are not well suited for
+document-oriented tasks, which require fine-grained image perception and
+information compression. In this paper, we present TextHawk, an MLLM that is
+specifically designed for document-oriented tasks, while preserving the general
+capabilities of MLLMs. TextHawk aims to explore efficient fine-grained
+perception through four dedicated components. Firstly, a ReSampling and
+ReArrangement (ReSA) module is proposed to reduce the redundancy in the
+document texts and lower the computational cost of the MLLM. We explore
+encoding the positions of each local feature by presenting Scalable Positional
+Embeddings (SPEs), which can preserve the scalability of various image sizes. A
+Query Proposal Network (QPN) is then adopted to initialize the queries
+dynamically among different sub-images. To further enhance the fine-grained
+visual perceptual ability of the MLLM, we design a Multi-Level Cross-Attention
+(MLCA) mechanism that captures the hierarchical structure and semantic
+relations of document images. Furthermore, we create a new instruction-tuning
+dataset for document-oriented tasks by enriching the multimodal document data
+with Gemini Pro. We conduct extensive experiments on both general and
+document-oriented MLLM benchmarks, and show that TextHawk outperforms the
+state-of-the-art methods, demonstrating its effectiveness and superiority in
+fine-grained document perception and general abilities.
+
+
+
+
+
+
+ + ☆ FaceCat: Enhancing Face Recognition Security with a Unified Generative + Model Framework + + +
+ Face anti-spoofing (FAS) and adversarial detection (FAD) have been regarded
+as critical technologies to ensure the safety of face recognition systems.
+Because addressing these threats separately offers limited practicality and
+generalization, some existing methods aim to devise a framework capable of
+concurrently detecting both. Nevertheless, these methods still encounter
+challenges of insufficient generalization and suboptimal robustness,
+potentially owing to the inherent drawback of discriminative models. Motivated
+by the rich structural and detailed features of face generative models, we
+propose FaceCat, which utilizes a face generative model as a pre-trained model
+to improve the performance of FAS and FAD. Specifically, FaceCat elaborately
+designs a hierarchical fusion mechanism to capture rich face semantic features
+of the generative model. These features then serve as a robust foundation for a
+lightweight head, designed to execute FAS and FAD tasks simultaneously. As
+relying solely on single-modality data often leads to suboptimal performance,
+we further propose a novel text-guided multi-modal alignment strategy that
+utilizes text prompts to enrich feature representation, thereby enhancing
+performance. For fair evaluation, we build a comprehensive protocol with a wide
+range of 28 attack types to benchmark the performance. Extensive experiments
+validate that FaceCat generalizes significantly better and obtains excellent
+robustness against input transformations.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Change Guiding Network: Incorporating Change Prior to Guide Change + Detection in Remote Sensing Imagery + + +
+ The rapid advancement of automated artificial intelligence algorithms and
+remote sensing instruments has benefited change detection (CD) tasks. However,
+there is still much room for improvement in precise detection, especially
+regarding the edge integrity and internal holes of change features. To solve
+these problems, we design the Change Guiding Network (CGNet) to tackle the
+insufficient expression of change features in the conventional U-Net structure
+adopted in previous methods, which causes inaccurate edge detection and
+internal holes. Change maps from deep features with rich semantic information
+are generated and used as prior information to guide multi-scale feature
+fusion, which can improve the expression ability of change features. Meanwhile,
+we propose a self-attention module named the Change Guide Module (CGM), which
+can effectively capture the long-distance dependency among pixels and
+effectively overcome the problem of the insufficient receptive field of
+traditional convolutional neural networks. On four major CD datasets, we verify
+the usefulness and efficiency of CGNet, and extensive experiments and ablation
+studies demonstrate the effectiveness of CGNet. We will open-source our code at
+https://github.com/ChengxiHAN/CGNet-CD.
+
+
+
+
+
+
+ + ☆ HANet: A Hierarchical Attention Network for Change Detection With + Bitemporal Very-High-Resolution Remote Sensing Images + + +
+ Benefiting from the developments in deep learning technology,
+deep-learning-based algorithms employing automatic feature extraction have
+achieved remarkable performance on the change detection (CD) task. However, the
+performance of existing deep-learning-based CD methods is hindered by the
+imbalance between changed and unchanged pixels. To tackle this problem, a
+progressive foreground-balanced sampling strategy, which does not add change
+information, is proposed in this article to help the model accurately learn the
+features of the changed pixels during the early training process and thereby
+improve detection performance. Furthermore, we design a discriminative Siamese
+network, the hierarchical attention network (HANet), which can integrate
+multiscale features and refine detailed features. The main part of HANet is the
+HAN module, which is a lightweight and effective self-attention mechanism.
+Extensive experiments and ablation studies on two CD datasets with extremely
+unbalanced labels validate the effectiveness and efficiency of the proposed
+method.
+
+
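+ One way to realize such a progressive foreground-balanced sampling strategy
+is to oversample patches centred on changed pixels early in training and
+anneal that probability toward uniform sampling as training proceeds. The
+linear schedule and its endpoints below are assumptions used only to
+illustrate the idea.
+```python
+import random
+
+def foreground_prob(epoch, total_epochs, p_start=0.9, p_end=0.5):
+    """Linearly decay the chance of drawing a changed-pixel-centred patch."""
+    frac = min(epoch / max(total_epochs - 1, 1), 1.0)
+    return p_start + (p_end - p_start) * frac
+
+def sample_patch_center(changed_pixels, all_pixels, epoch, total_epochs):
+    """Pick a patch centre, biased toward changed pixels early in training."""
+    if changed_pixels and random.random() < foreground_prob(epoch, total_epochs):
+        return random.choice(changed_pixels)     # foreground-balanced draw
+    return random.choice(all_pixels)             # plain uniform draw
+
+if __name__ == "__main__":
+    changed = [(10, 12), (40, 41), (55, 7)]                  # sparse changed pixels
+    everywhere = [(r, c) for r in range(64) for c in range(64)]
+    for epoch in (0, 10, 19):
+        centers = [sample_patch_center(changed, everywhere, epoch, 20)
+                   for _ in range(1000)]
+        hit = sum(c in changed for c in centers) / len(centers)
+        print(f"epoch {epoch}: {hit:.2f} of patches centred on changed pixels")
+```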
+
+
+
+
+ + ☆ LoopAnimate: Loopable Salient Object Animation + + +
+ Research on diffusion model-based video generation has advanced rapidly.
+However, limitations in object fidelity and generation length hinder its
+practical applications. Additionally, specific domains like animated wallpapers
+require seamless looping, where the first and last frames of the video match
+seamlessly. To address these challenges, this paper proposes LoopAnimate, a
+novel method for generating videos with consistent start and end frames. To
+enhance object fidelity, we introduce a framework that decouples multi-level
+image appearance and textual semantic information. Building upon an
+image-to-image diffusion model, our approach incorporates both pixel-level and
+feature-level information from the input image, injecting image appearance and
+textual semantic embeddings at different positions of the diffusion model.
+Existing UNet-based video generation models require the entire video to be
+input during training to encode temporal and positional information at once.
+However, due to limitations in GPU memory, the number of frames is typically
+restricted to 16. To address this, this paper proposes a three-stage training
+strategy with progressively increasing frame numbers and reducing fine-tuning
+modules. Additionally, we introduce the Temporal Enhanced Motion Module (TEMM)
+to extend the capacity for encoding temporal and positional information up to
+36 frames. The proposed LoopAnimate is the first to extend the single-pass
+generation length of UNet-based video generation models to 35 frames while
+maintaining high-quality video generation. Experiments demonstrate that
+LoopAnimate achieves state-of-the-art performance in both objective metrics,
+such as fidelity and temporal consistency, and subjective evaluation results.
+
+
+
+
+
+
+ + ☆ Coreset Selection for Object Detection CVPR 2024 + + +
+ Coreset selection is a method for selecting a small, representative subset of +an entire dataset. It has been primarily researched in image classification, +assuming there is only one object per image. However, coreset selection for +object detection is more challenging as an image can contain multiple objects. +As a result, much research has yet to be done on this topic. Therefore, we +introduce a new approach, Coreset Selection for Object Detection (CSOD). CSOD +generates imagewise and classwise representative feature vectors for multiple +objects of the same class within each image. Subsequently, we adopt submodular +optimization for considering both representativeness and diversity and utilize +the representative vectors in the submodular optimization process to select a +subset. When we evaluated CSOD on the Pascal VOC dataset, CSOD outperformed +random selection by +6.4%p in AP$_{50}$ when selecting 200 images. + +
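+ Submodular coreset objectives are usually maximized greedily. The sketch
+below runs a facility-location-style greedy over per-image representative
+vectors (for instance, classwise-averaged object features): each step adds the
+image that most increases the dataset's total best-match similarity to the
+selected set, which trades off representativeness and diversity. This is a
+generic sketch under those assumptions, not CSOD's exact objective.
+```python
+import numpy as np
+
+def greedy_facility_location(features, budget):
+    """Greedily maximize sum_i max_{j in S} cos(f_i, f_j) over images i."""
+    feats = features / np.linalg.norm(features, axis=1, keepdims=True)
+    sim = feats @ feats.T                            # (N, N) cosine similarities
+    selected = []
+    best = np.zeros(sim.shape[0])                    # best[i] = max sim of i to S
+    for _ in range(budget):
+        gains = np.maximum(sim, best[:, None]).sum(axis=0) - best.sum()
+        gains[selected] = -np.inf                    # never pick an image twice
+        j = int(np.argmax(gains))
+        selected.append(j)
+        best = np.maximum(best, sim[:, j])
+    return selected
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    feats = rng.normal(size=(500, 128))   # stand-in representative vectors of 500 images
+    print(sorted(greedy_facility_location(feats, budget=20)))
+```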
+
+ comment: Accepted by CVPR 2024: 1st Workshop on Dataset Distillation for + Computer Vision +
+
+
+
+
+ + ☆ StreakNet-Arch: An Anti-scattering Network-based Architecture for + Underwater Carrier LiDAR-Radar Imaging + + +
+ In this paper, we introduce StreakNet-Arch, a novel signal processing +architecture designed for Underwater Carrier LiDAR-Radar (UCLR) imaging +systems, to address the limitations in scatter suppression and real-time +imaging. StreakNet-Arch formulates the signal processing as a real-time, +end-to-end binary classification task, enabling real-time image acquisition. To +achieve this, we leverage Self-Attention networks and propose a novel Double +Branch Cross Attention (DBC-Attention) mechanism that surpasses the performance +of traditional methods. Furthermore, we present a method for embedding +streak-tube camera images into attention networks, effectively acting as a +learned bandpass filter. To facilitate further research, we contribute a +publicly available streak-tube camera image dataset. The dataset contains +2,695,168 real-world underwater 3D point cloud data. These advancements +significantly improve UCLR capabilities, enhancing its performance and +applicability in underwater imaging tasks. The source code and dataset can be +found at https://github.com/BestAnHongjun/StreakNet . + +
+
+
+
+
+ + ☆ Fusion-Mamba for Cross-modality Object Detection + + +
+ Fusing complementary information from different modalities effectively
+improves object detection performance, making it more useful and robust for a
+wider range of applications. Existing fusion strategies combine different types
+of images or merge different backbone features through elaborated neural
+network modules. However, these methods neglect that modality disparities
+affect cross-modality fusion performance, as different modalities with
+different camera focal lengths, placements, and angles are hard to fuse. In
+this paper, we investigate cross-modality fusion by associating cross-modal
+features in a hidden state space based on an improved Mamba with a gating
+mechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features
+into a hidden state space for interaction, thereby reducing disparities between
+cross-modal features and enhancing the representation consistency of fused
+features. FMB contains two modules: the State Space Channel Swapping (SSCS)
+module facilitates shallow feature fusion, and the Dual State Space Fusion
+(DSSF) module enables deep fusion in a hidden state space. Through extensive
+experiments on public datasets, our proposed approach outperforms the
+state-of-the-art methods in $m$AP by 5.9% on $M^3FD$ and 4.9% on the
+FLIR-Aligned dataset, demonstrating superior object detection performance. To
+the best of our knowledge, this is the first work to explore the potential of
+Mamba for cross-modal fusion and establish a new baseline for cross-modality
+object detection.
+
+
+
+
+
+
+ + ☆ GCC: Generative Calibration Clustering + + +
+ Deep clustering as an important branch of unsupervised representation +learning focuses on embedding semantically similar samples into the identical +feature space. This core demand inspires the exploration of contrastive +learning and subspace clustering. However, these solutions always rely on the +basic assumption that there are sufficient and category-balanced samples for +generating valid high-level representation. This hypothesis actually is too +strict to be satisfied for real-world applications. To overcome such a +challenge, the natural strategy is utilizing generative models to augment +considerable instances. How to use these novel samples to effectively fulfill +clustering performance improvement is still difficult and under-explored. In +this paper, we propose a novel Generative Calibration Clustering (GCC) method +to delicately incorporate feature learning and augmentation into clustering +procedure. First, we develop a discriminative feature alignment mechanism to +discover intrinsic relationship across real and generated samples. Second, we +design a self-supervised metric learning to generate more reliable cluster +assignment to boost the conditional diffusion generation. Extensive +experimental results on three benchmarks validate the effectiveness and +advantage of our proposed method over the state-of-the-art methods. + +
+
+
+
+
+ + ☆ Exploring Generative AI for Sim2Real in Driving Data Synthesis + + +
+ Datasets are essential for training and testing vehicle perception
+algorithms. However, the collection and annotation of real-world images is
+time-consuming and expensive. Driving simulators offer a solution by
+automatically generating various driving scenarios with corresponding
+annotations, but the simulation-to-reality (Sim2Real) domain gap remains a
+challenge. While most generative Artificial Intelligence (AI) approaches follow
+the de facto Generative Adversarial Nets (GANs)-based methods, the recently
+emerging diffusion probabilistic models have not been fully explored in
+mitigating Sim2Real challenges for driving data synthesis. To explore their
+performance, this paper applies three different generative AI methods to
+leverage semantic label maps from a driving simulator as a bridge for the
+creation of realistic datasets. A comparative analysis of these methods is
+presented from the perspective of image quality and perception. New synthetic
+datasets, which include driving images and auto-generated high-quality
+annotations, are produced with low costs and high scene variability. The
+experimental results show that although GAN-based methods are adept at
+generating high-quality images when provided with manually annotated labels,
+ControlNet produces synthetic datasets with fewer artefacts and more structural
+fidelity when using simulator-generated labels. This suggests that the
+diffusion-based approach may provide improved stability and an alternative
+method for addressing Sim2Real challenges.
+
+
+
+
+
+
+ + ☆ EGGS: Edge Guided Gaussian Splatting for Radiance Fields + + +
+ Gaussian splatting methods are becoming popular. However, their loss function
+only contains the $\ell_1$ norm and the structural similarity between the
+rendered and input images, without considering the edges in these images. It is
+well known that the edges in an image provide important information. Therefore,
+in this paper, we propose an Edge Guided Gaussian Splatting (EGGS) method that
+leverages the edges in the input images. More specifically, we give the edge
+regions a higher weight than the flat regions. With such edge guidance, the
+resulting Gaussian particles focus more on the edges instead of the flat
+regions. Moreover, such edge guidance does not increase the computation cost
+during the training and rendering stages. The experiments confirm that such a
+simple edge-weighted loss function indeed improves results by about $1\sim2$ dB
+on several different datasets. By simply plugging in the edge guidance, the
+proposed method can improve all Gaussian splatting methods in different
+scenarios, such as human head modeling, building 3D reconstruction, etc.
+
+
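+ The edge-weighted loss can be sketched as follows: compute an edge map of the
+ground-truth view (a Sobel filter is used here as an assumed edge detector),
+turn it into a per-pixel weight map, and reweight the photometric term. The
+structural-similarity term of the usual 3DGS loss is omitted for brevity, and
+the weighting scheme is only one plausible reading of the abstract.
+```python
+import torch
+import torch.nn.functional as F
+
+def sobel_edges(img):
+    """Edge magnitude of a (B, 3, H, W) image via Sobel filters on grayscale."""
+    gray = img.mean(dim=1, keepdim=True)
+    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]).view(1, 1, 3, 3)
+    ky = kx.transpose(2, 3)
+    gx = F.conv2d(gray, kx, padding=1)
+    gy = F.conv2d(gray, ky, padding=1)
+    return torch.sqrt(gx ** 2 + gy ** 2 + 1e-8)
+
+def edge_weighted_l1(rendered, target, edge_weight=2.0):
+    """L1 photometric loss with higher weight on edge regions of the target."""
+    edges = sobel_edges(target)
+    edges = edges / (edges.amax(dim=(2, 3), keepdim=True) + 1e-8)   # to [0, 1]
+    weights = 1.0 + edge_weight * edges
+    return (weights * (rendered - target).abs()).mean()
+
+if __name__ == "__main__":
+    target = torch.rand(1, 3, 128, 128)
+    rendered = target + 0.05 * torch.randn_like(target)
+    print(edge_weighted_l1(rendered, target).item())
+```
+In a splatting pipeline this term would simply replace the plain L1 term,
+leaving the rest of the training loop unchanged.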
+
+
+
+
+ + ☆ VideoSAGE: Video Summarization with Graph Representation Learning + + +
+ We propose a graph-based representation learning framework for video
+summarization. First, we convert an input video to a graph where nodes
+correspond to each of the video frames. Then, we impose sparsity on the graph
+by connecting only those pairs of nodes that are within a specified temporal
+distance. We then formulate the video summarization task as a binary node
+classification problem, precisely classifying whether each video frame should
+belong to the output summary video. A graph constructed this way aims to
+capture long-range interactions among video frames, and the sparsity ensures
+the model trains without hitting the memory and compute bottleneck. Experiments
+on two datasets (SumMe and TVSum) demonstrate the effectiveness of the proposed
+nimble model compared to existing state-of-the-art summarization approaches,
+while being an order of magnitude more efficient in compute time and memory.
+
+
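+ The graph construction described above fits in a few lines: one node per
+frame, an edge between frames whose indices are within a temporal window, and
+a binary classifier over node representations aggregated from their
+neighbours. The single-layer mean-aggregation model and the feature size below
+are stand-ins for the paper's actual GNN.
+```python
+import torch
+import torch.nn as nn
+
+def temporal_adjacency(n_frames, window=8):
+    """Sparse adjacency: connect frames i, j when 0 < |i - j| <= window."""
+    idx = torch.arange(n_frames)
+    adj = ((idx[:, None] - idx[None, :]).abs() <= window).float()
+    adj.fill_diagonal_(0)
+    return adj / adj.sum(dim=1, keepdim=True)        # row-normalize for mean aggregation
+
+class FrameSelector(nn.Module):
+    """One-layer graph model scoring each frame for inclusion in the summary."""
+    def __init__(self, in_dim=1024, hidden=256):
+        super().__init__()
+        self.proj = nn.Linear(in_dim, hidden)
+        self.classify = nn.Linear(2 * hidden, 1)      # [own features, neighbour mean]
+
+    def forward(self, frame_feats, adj):
+        h = torch.relu(self.proj(frame_feats))        # (N, hidden)
+        neighbours = adj @ h                          # mean of neighbouring frames
+        return self.classify(torch.cat([h, neighbours], dim=1)).squeeze(1)
+
+if __name__ == "__main__":
+    n = 300                                           # frames in the video
+    feats = torch.randn(n, 1024)                      # assumed per-frame features
+    logits = FrameSelector()(feats, temporal_adjacency(n, window=8))
+    summary = (torch.sigmoid(logits) > 0.5).nonzero().squeeze(1)
+    print(logits.shape, summary.numel())
+```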
+
+ comment: arXiv admin note: text overlap with arXiv:2207.07783 +
+
+
+
+
+ + ♻ ☆ In-N-Out: Faithful 3D GAN Inversion with Volumetric Decomposition for + Face Editing + + +
+ 3D-aware GANs offer new capabilities for view synthesis while preserving the +editing functionalities of their 2D counterparts. GAN inversion is a crucial +step that seeks the latent code to reconstruct input images or videos, +subsequently enabling diverse editing tasks through manipulation of this latent +code. However, a model pre-trained on a particular dataset (e.g., FFHQ) often +has difficulty reconstructing images with out-of-distribution (OOD) objects +such as faces with heavy make-up or occluding objects. We address this issue by +explicitly modeling OOD objects from the input in 3D-aware GANs. Our core idea +is to represent the image using two individual neural radiance fields: one for +the in-distribution content and the other for the out-of-distribution object. +The final reconstruction is achieved by optimizing the composition of these two +radiance fields with carefully designed regularization. We demonstrate that our +explicit decomposition alleviates the inherent trade-off between reconstruction +fidelity and editability. We evaluate reconstruction accuracy and editability +of our method on challenging real face images and videos and showcase favorable +results against other baselines. + +
+
+ comment: Project page: https://in-n-out-3d.github.io/ +
+
+
+
+
+ + ♻ ☆ Gaussian Splatting SLAM CVPR2024 + + +
+ We present the first application of 3D Gaussian Splatting in monocular SLAM, the most fundamental yet hardest setup for Visual SLAM. Our method, which runs live at 3fps, utilises Gaussians as the only 3D representation, unifying the required representation for accurate, efficient tracking, mapping, and high-quality rendering. Designed for challenging monocular settings, our approach is seamlessly extendable to RGB-D SLAM when an external depth sensor is available. Several innovations are required to continuously reconstruct 3D scenes with high fidelity from a live camera. First, to move beyond the original 3DGS algorithm, which requires accurate poses from an offline Structure from Motion (SfM) system, we formulate camera tracking for 3DGS using direct optimisation against the 3D Gaussians, and show that this enables fast and robust tracking with a wide basin of convergence. Second, by utilising the explicit nature of the Gaussians, we introduce geometric verification and regularisation to handle the ambiguities occurring in incremental 3D dense reconstruction. Finally, we introduce a full SLAM system which not only achieves state-of-the-art results in novel view synthesis and trajectory estimation but also reconstructs tiny and even transparent objects.
+
+ comment: CVPR2024 Highlight. First two authors contributed equally to this + work. Project Page: https://rmurai.co.uk/projects/GaussianSplattingSLAM/ +
+
+
+
+
+ + ♻ ☆ OmniControl: Control Any Joint at Any Time for Human Motion Generation ICLR 2024 + + +
+ We present a novel approach named OmniControl for incorporating flexible +spatial control signals into a text-conditioned human motion generation model +based on the diffusion process. Unlike previous methods that can only control +the pelvis trajectory, OmniControl can incorporate flexible spatial control +signals over different joints at different times with only one model. +Specifically, we propose analytic spatial guidance that ensures the generated +motion can tightly conform to the input control signals. At the same time, +realism guidance is introduced to refine all the joints to generate more +coherent motion. Both the spatial and realism guidance are essential and they +are highly complementary for balancing control accuracy and motion realism. By +combining them, OmniControl generates motions that are realistic, coherent, and +consistent with the spatial constraints. Experiments on HumanML3D and KIT-ML +datasets show that OmniControl not only achieves significant improvement over +state-of-the-art methods on pelvis control but also shows promising results +when incorporating the constraints over other joints. + +
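As a rough illustration of gradient-based spatial guidance during sampling (a simplified stand-in, not OmniControl's analytic formulation or its realism guidance), the denoised motion estimate can be nudged toward sparse joint-position constraints; shapes and the step size below are assumptions.

```python
# Simplified spatial-guidance sketch: given the current denoised motion
# estimate x0_hat of shape (T, J, 3) and sparse constraints
# {(frame, joint): target_xyz}, take a gradient step pulling the constrained
# joints toward their targets.
import torch

def spatial_guidance(x0_hat: torch.Tensor,
                     constraints: dict[tuple[int, int], torch.Tensor],
                     step_size: float = 0.1) -> torch.Tensor:
    x = x0_hat.detach().clone().requires_grad_(True)
    loss = sum(((x[t, j] - target) ** 2).sum() for (t, j), target in constraints.items())
    loss.backward()
    return (x - step_size * x.grad).detach()

motion = torch.randn(120, 22, 3)                       # 120 frames, 22 joints (illustrative)
targets = {(0, 0): torch.tensor([0.0, 0.9, 0.0]),      # e.g. pelvis position at frame 0
           (60, 20): torch.tensor([0.3, 1.5, 0.2])}    # e.g. a wrist position at frame 60
motion = spatial_guidance(motion, targets)
```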
+
+ comment: ICLR 2024. Project page: https://neu-vi.github.io/omnicontrol/ +
+
+
+
+
+ + ♻ ☆ Analysis of the Two-Step Heterogeneous Transfer Learning for Laryngeal + Blood Vessel Classification: Issue and Improvement + + +
+ Accurate classification of laryngeal vessels as benign or malignant is crucial for early detection of laryngeal cancer. However, organizations with limited access to laryngeal vascular images face challenges due to the lack of large and homogeneous public datasets for effective learning. Unlike most familiar works, which directly transfer ImageNet pre-trained models to the target domain for fine-tuning, this work pioneers the exploration of two-step heterogeneous transfer learning (THTL) for laryngeal lesion classification with nine deep-learning models, utilizing diabetic retinopathy color fundus images, which are semantically non-identical yet vascular images, as the intermediate domain. The attention visualization technique Layer Class Activation Map (LayerCAM) reveals a novel finding: although both the intermediate and the target domains reflect vascular structure to a certain extent, the prevalent radial vascular pattern in the intermediate domain prevents the model from learning the features of twisted and tangled vessels that distinguish the malignant class in the target domain. This summarizes a vital rule for laryngeal lesion classification using THTL. To address this, we introduce an enhanced fine-tuning strategy in THTL called Step-Wise Fine-Tuning (SWFT) and apply it to the ResNet models. SWFT progressively refines model performance by accumulating fine-tuning layers from back to front, guided by the visualization results of LayerCAM. Comparison with the original THTL approach shows significant improvements. For ResNet18, the accuracy and malignant recall increase by 26.1% and 79.8%, respectively, while for ResNet50, these indicators improve by 20.4% and 62.2%, respectively.
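A minimal sketch of step-wise fine-tuning in the spirit described above: layers of a ResNet are unfrozen and trained progressively from back to front. The stage grouping, learning rate, and schedule are illustrative assumptions, not the authors' exact protocol.

```python
# Sketch: progressively unfreeze a torchvision ResNet18 from back to front,
# re-creating the optimizer and fine-tuning after each unfreezing step.
import torch
from torchvision import models

model = models.resnet18(weights="IMAGENET1K_V1")
model.fc = torch.nn.Linear(model.fc.in_features, 2)   # benign vs malignant head

# Back-to-front order: classifier first, then layer4, layer3, and so on.
stages = [[model.fc], [model.layer4], [model.layer3],
          [model.layer2], [model.layer1, model.conv1, model.bn1]]

for p in model.parameters():
    p.requires_grad = False

for step, stage in enumerate(stages, 1):
    for module in stage:                               # accumulate trainable layers
        for p in module.parameters():
            p.requires_grad = True
    optimizer = torch.optim.SGD(
        (p for p in model.parameters() if p.requires_grad), lr=1e-3, momentum=0.9)
    # train_one_stage(model, optimizer, train_loader)  # user-provided training loop
    print(f"step {step}: {sum(p.requires_grad for p in model.parameters())} trainable tensors")
```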
+
+
+
+
+ + ♻ ☆ VMambaMorph: a Multi-Modality Deformable Image Registration Framework + based on Visual State Space Model with Cross-Scan Module + + +
+ Image registration, a critical process in medical imaging, involves aligning different sets of medical imaging data into a single unified coordinate system. Deep learning networks, such as the Convolutional Neural Network (CNN)-based VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model (SSM)-based MambaMorph, have demonstrated effective performance in this domain. The recent Visual State Space Model (VMamba), which incorporates a cross-scan module with SSM, has exhibited promising improvements in modeling global-range dependencies with efficient computational cost in computer vision tasks. This paper therefore introduces VMambaMorph, an exploration of VMamba for image registration. This novel hybrid VMamba-CNN network is designed specifically for 3D image registration. Utilizing a U-shaped network architecture, VMambaMorph computes the deformation field based on target and source volumes. The VMamba-based block with a 2D cross-scan module is redesigned for 3D volumetric feature processing. To overcome the complex motion and structure in multi-modality images, we further propose a fine-tuning recursive registration framework. We validate VMambaMorph using a public benchmark brain MR-CT registration dataset, comparing its performance against current state-of-the-art methods. The results indicate that VMambaMorph achieves competitive registration quality. The code for VMambaMorph with all baseline methods is available on GitHub.
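Deformable registration networks of this kind typically warp the source volume with the predicted deformation field through a differentiable spatial transformer. The sketch below shows that generic warping step (VoxelMorph-style), not VMambaMorph's specific implementation; the displacement-channel ordering is an assumption.

```python
# Sketch: warp a 3D source volume with a dense displacement field using a
# differentiable spatial transformer (F.grid_sample).
import torch
import torch.nn.functional as F

def warp_volume(source: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
    """source: (B, C, D, H, W); flow: (B, 3, D, H, W) displacements in voxels,
    channels assumed ordered (x, y, z)."""
    B, _, D, H, W = source.shape
    zs, ys, xs = torch.meshgrid(
        torch.arange(D), torch.arange(H), torch.arange(W), indexing="ij")
    grid = torch.stack((xs, ys, zs), dim=-1).float().to(source.device)   # (D, H, W, 3)
    grid = grid.unsqueeze(0) + flow.permute(0, 2, 3, 4, 1)               # add displacement
    # Normalize to [-1, 1] in (x, y, z) order as grid_sample expects.
    sizes = torch.tensor([W, H, D], device=source.device).float()
    grid = 2.0 * grid / (sizes - 1) - 1.0
    return F.grid_sample(source, grid, align_corners=True)
```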
+
+
+
+
+ + ♻ ☆ Image Restoration by Denoising Diffusion Models with Iteratively + Preconditioned Guidance CVPR 2024 + + +
+ Training deep neural networks has become a common approach for addressing image restoration problems. An alternative to training a "task-specific" network for each observation model is to use pretrained deep denoisers for imposing only the signal's prior within iterative algorithms, without additional training. Recently, a sampling-based variant of this approach has become popular with the rise of diffusion/score-based generative models. Using denoisers for general-purpose restoration requires guiding the iterations to ensure agreement of the signal with the observations. In low-noise settings, guidance that is based on back-projection (BP) has been shown to be a promising strategy (used recently also under the names "pseudoinverse" or "range/null-space" guidance). However, the presence of noise in the observations hinders the gains from this approach. In this paper, we propose a novel guidance technique, based on preconditioning, that allows traversing from BP-based guidance to least-squares-based guidance along the restoration scheme. The proposed approach is robust to noise while still having a much simpler implementation than alternative methods (e.g., it does not require SVD or a large number of iterations). We use it within both an optimization scheme and a sampling-based scheme, and demonstrate its advantages over existing methods for image deblurring and super-resolution.
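For intuition only: for a linear observation model y = Ax + n, a regularized back-projection term interpolates between BP-style and least-squares-style guidance. The sketch below is a conceptual illustration of that traversal, not the paper's actual preconditioner, schedule, or algorithm.

```python
# Conceptual sketch: a guidance direction of the form
#   g = A^T (A A^T + delta I)^{-1} (A x_hat - y)
# behaves like back-projection (pseudoinverse) guidance when delta ~ 0 and
# like a scaled least-squares direction when delta is large. The choice of
# delta values here is an assumption for illustration only.
import numpy as np

def preconditioned_guidance(A: np.ndarray, x_hat: np.ndarray,
                            y: np.ndarray, delta: float) -> np.ndarray:
    residual = A @ x_hat - y
    return A.T @ np.linalg.solve(A @ A.T + delta * np.eye(A.shape[0]), residual)

rng = np.random.default_rng(0)
A = rng.standard_normal((32, 64))                   # toy linear degradation operator
x_true = rng.standard_normal(64)
y = A @ x_true + 0.05 * rng.standard_normal(32)     # noisy observations
x_hat = np.zeros(64)                                # placeholder denoiser estimate

g_bp = preconditioned_guidance(A, x_hat, y, delta=1e-6)   # ~ back-projection limit
g_ls = preconditioned_guidance(A, x_hat, y, delta=1e6)    # ~ scaled least-squares limit
print(np.linalg.norm(g_bp - np.linalg.pinv(A) @ (A @ x_hat - y)))   # ~ 0
print(np.linalg.norm(1e6 * g_ls - A.T @ (A @ x_hat - y)))           # small vs. the vector norm
```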
+
+ comment: CVPR 2024 (camera-ready). Code can be found at: + https://github.com/tirer-lab/DDPG +
+
+
+
+
+ + ♻ ☆ SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational + Score Distillation CVPR 2024 + + +
+ Despite their ability to generate high-resolution and diverse images from text prompts, text-to-image diffusion models often suffer from slow iterative sampling processes. Model distillation is one of the most effective directions to accelerate these models. However, previous distillation methods fail to retain the generation quality while requiring a significant amount of images for training, either from real data or synthetically generated by the teacher model. In response to this limitation, we present a novel image-free distillation scheme named $\textbf{SwiftBrush}$. Drawing inspiration from text-to-3D synthesis, in which a 3D neural radiance field that aligns with the input prompt can be obtained from a 2D text-to-image diffusion prior via a specialized loss without the use of any 3D data ground-truth, our approach re-purposes that same loss for distilling a pretrained multi-step text-to-image model to a student network that can generate high-fidelity images with just a single inference step. In spite of its simplicity, our model stands as one of the first one-step text-to-image generators that can produce images of comparable quality to Stable Diffusion without reliance on any training image data. Remarkably, SwiftBrush achieves an FID score of $\textbf{16.67}$ and a CLIP score of $\textbf{0.29}$ on the COCO-30K benchmark, which is competitive with or even substantially surpasses existing state-of-the-art distillation techniques.
+
+ comment: Accepted to CVPR 2024; Project Page: + https://thuanz123.github.io/swiftbrush/ +
+
+
+
+
+ + ♻ ☆ TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic + Segmentation CVPR2024 + + +
+ LiDAR semantic segmentation plays a crucial role in enabling autonomous +driving and robots to understand their surroundings accurately and robustly. A +multitude of methods exist within this domain, including point-based, +range-image-based, polar-coordinate-based, and hybrid strategies. Among these, +range-image-based techniques have gained widespread adoption in practical +applications due to their efficiency. However, they face a significant +challenge known as the ``many-to-one'' problem caused by the range image's +limited horizontal and vertical angular resolution. As a result, around 20% of +the 3D points can be occluded. In this paper, we present TFNet, a +range-image-based LiDAR semantic segmentation method that utilizes temporal +information to address this issue. Specifically, we incorporate a temporal +fusion layer to extract useful information from previous scans and integrate it +with the current scan. We then design a max-voting-based post-processing +technique to correct false predictions, particularly those caused by the +``many-to-one'' issue. We evaluated the approach on two benchmarks and +demonstrated that the plug-in post-processing technique is generic and can be +applied to various networks. + +
+
+ comment: accepted by CVPR2024 Workshop on Autonomous Driving +
+
+
+
+
+ + ♻ ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh generation from a single image, featuring state-of-the-art generation quality and significant training scalability. By synergizing the strengths of an off-the-shelf multiview diffusion model and a sparse-view reconstruction model based on the LRM architecture, InstantMesh is able to create diverse 3D assets within 10 seconds. To enhance training efficiency and exploit more geometric supervision, e.g., depths and normals, we integrate a differentiable iso-surface extraction module into our framework and directly optimize on the mesh representation. Experimental results on public datasets demonstrate that InstantMesh significantly outperforms other latest image-to-3D baselines, both qualitatively and quantitatively. We release all the code, weights, and demo of InstantMesh, with the intention that it can make substantial contributions to the community of 3D generative AI and empower both researchers and content creators.
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ♻ ☆ Specialty-Oriented Generalist Medical AI for Chest CT Screening + + +
+ Modern medical records include a vast amount of multimodal free-text clinical data and imaging data from radiology, cardiology, and digital pathology. Fully mining such big data requires multitasking; otherwise, occult but important aspects may be overlooked, adversely affecting clinical management and population healthcare. Despite remarkable successes of AI in individual tasks with single-modal data, progress in developing generalist medical AI that combines multimodal data for multiple tasks remains relatively slow because of the dual challenges of data curation and model architecture. The data challenge involves querying and curating multimodal structured and unstructured text, alphanumeric data, and especially 3D tomographic scans on an individual patient level for real-time decisions and at a scale to estimate population health statistics. The model challenge demands a scalable and adaptable network architecture to integrate multimodal datasets for diverse clinical tasks. Here we propose the first-of-its-kind medical multimodal-multitask foundation model (M3FM) with application in lung cancer screening (LCS) and related tasks. After curating a comprehensive multimodal multitask dataset consisting of 49 clinical data types, including 163,725 chest CT series, and 17 medical tasks involved in LCS, we develop a multimodal question-answering framework as a unified training and inference strategy to synergize multimodal information and perform multiple tasks via free-text prompting. M3FM consistently outperforms state-of-the-art single-modal task-specific models, identifies multimodal data elements informative for clinical tasks, and flexibly adapts to new tasks with a small out-of-distribution dataset. As a specialty-oriented generalist medical AI model, M3FM paves the way for similar breakthroughs in other areas of medicine, closing the gap between specialists and generalists.
+
+
+
+
+ + ♻ ☆ Domain Generalization for Crop Segmentation with Standardized Ensemble + Knowledge Distillation + + +
+ In recent years, precision agriculture has gradually oriented farming closer to automated processes to support all the activities related to field management. Service robotics plays a predominant role in this evolution by deploying autonomous agents that can navigate fields while performing tasks such as monitoring, spraying, and harvesting without human intervention. To execute these precise actions, mobile robots need a real-time perception system that understands their surroundings and identifies their targets in the wild. Existing methods, however, often fall short in generalizing to new crops and environmental conditions. This limitation is critical for practical applications where labeled samples are rarely available. In this paper, we investigate the problem of crop segmentation and propose a novel approach to enhance domain generalization using knowledge distillation. In the proposed framework, we transfer knowledge from a standardized ensemble of models individually trained on source domains to a student model that can adapt to unseen realistic scenarios. To support the proposed method, we present a synthetic multi-domain dataset for crop segmentation containing plants of varied species and covering different terrain styles, weather conditions, and light scenarios, with more than 70,000 samples. We demonstrate significant improvements in performance over state-of-the-art methods and superior sim-to-real generalization. Our approach provides a promising solution for domain generalization in crop segmentation and has the potential to enhance a wide variety of agricultural applications.
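A rough sketch of ensemble-to-student distillation for per-pixel segmentation, where teacher logits are standardized before being averaged. The standardization choice and temperature are assumptions for illustration, not necessarily the paper's exact formulation.

```python
# Sketch: distill a standardized ensemble of segmentation teachers into one
# student. Teacher logits are z-score standardized per sample before
# averaging; the student matches the resulting soft targets with a KL loss.
import torch
import torch.nn.functional as F

def ensemble_distillation_loss(student_logits: torch.Tensor,
                               teacher_logits_list: list[torch.Tensor],
                               T: float = 2.0) -> torch.Tensor:
    """All logits have shape (B, num_classes, H, W)."""
    standardized = []
    for t_logits in teacher_logits_list:
        mu = t_logits.mean(dim=(1, 2, 3), keepdim=True)
        sigma = t_logits.std(dim=(1, 2, 3), keepdim=True) + 1e-6
        standardized.append((t_logits - mu) / sigma)
    ensemble_logits = torch.stack(standardized).mean(dim=0)

    soft_targets = F.softmax(ensemble_logits / T, dim=1)
    log_probs = F.log_softmax(student_logits / T, dim=1)
    return F.kl_div(log_probs, soft_targets, reduction="batchmean") * T * T
```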
+
+
+
+
+ + ♻ ☆ RSBuilding: Towards General Remote Sensing Image Building Extraction and + Change Detection with Foundation Model + + +
+ The intelligent interpretation of buildings plays a significant role in urban +planning and management, macroeconomic analysis, population dynamics, etc. +Remote sensing image building interpretation primarily encompasses building +extraction and change detection. However, current methodologies often treat +these two tasks as separate entities, thereby failing to leverage shared +knowledge. Moreover, the complexity and diversity of remote sensing image +scenes pose additional challenges, as most algorithms are designed to model +individual small datasets, thus lacking cross-scene generalization. In this +paper, we propose a comprehensive remote sensing image building understanding +model, termed RSBuilding, developed from the perspective of the foundation +model. RSBuilding is designed to enhance cross-scene generalization and task +universality. Specifically, we extract image features based on the prior +knowledge of the foundation model and devise a multi-level feature sampler to +augment scale information. To unify task representation and integrate image +spatiotemporal clues, we introduce a cross-attention decoder with task prompts. +Addressing the current shortage of datasets that incorporate annotations for +both tasks, we have developed a federated training strategy to facilitate +smooth model convergence even when supervision for some tasks is missing, +thereby bolstering the complementarity of different tasks. Our model was +trained on a dataset comprising up to 245,000 images and validated on multiple +building extraction and change detection datasets. The experimental results +substantiate that RSBuilding can concurrently handle two structurally distinct +tasks and exhibits robust zero-shot generalization capabilities. + +
+
+
+
+
+ + ♻ ☆ AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains + Into One CVPR 2024 + + +
+ A handful of visual foundation models (VFMs) have recently emerged as the +backbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are +trained with distinct objectives, exhibiting unique characteristics for various +downstream tasks. We find that despite their conceptual differences, these +models can be effectively merged into a unified model through multi-teacher +distillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All +Domains Into One). This integrative approach not only surpasses the performance +of individual teacher models but also amalgamates their distinctive features, +such as zero-shot vision-language comprehension, detailed pixel-level +understanding, and open vocabulary segmentation capabilities. In pursuit of the +most hardware-efficient backbone, we evaluated numerous architectures in our +multi-teacher distillation pipeline using the same training recipe. This led to +the development of a novel architecture (E-RADIO) that exceeds the performance +of its predecessors and is at least 7x faster than the teacher models. Our +comprehensive benchmarking process covers downstream tasks including ImageNet +classification, ADE20k semantic segmentation, COCO object detection and +LLaVa-1.5 framework. + Code: https://github.com/NVlabs/RADIO + +
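A schematic of multi-teacher feature distillation in the spirit of the agglomerative approach above: one student backbone, one lightweight adaptor head per teacher, and a per-teacher matching loss. The adaptor design and the cosine loss are illustrative assumptions, not AM-RADIO's exact recipe.

```python
# Sketch: distill several vision foundation teachers into a single student.
# Each teacher gets its own linear adaptor on top of shared student features;
# per-teacher losses are summed.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiTeacherDistiller(nn.Module):
    def __init__(self, student: nn.Module, student_dim: int, teacher_dims: dict[str, int]):
        super().__init__()
        self.student = student
        self.adaptors = nn.ModuleDict(
            {name: nn.Linear(student_dim, d) for name, d in teacher_dims.items()})

    def forward(self, images: torch.Tensor,
                teacher_feats: dict[str, torch.Tensor]) -> torch.Tensor:
        feats = self.student(images)                 # (B, student_dim) summary features
        loss = 0.0
        for name, target in teacher_feats.items():   # e.g. {"clip": ..., "dinov2": ..., "sam": ...}
            pred = self.adaptors[name](feats)
            loss = loss + (1.0 - F.cosine_similarity(pred, target, dim=-1)).mean()
        return loss
```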
+
+ comment: CVPR 2024 Version 3: CVPR Camera Ready, reconfigured full paper, + table 1 is now more comprehensive Version 2: Added more acknowledgements and + updated table 7 with more recent results. Ensured that the link in the + abstract to our code is working properly Version 3: Fix broken hyperlinks +
+
+
+
+
+ + ♻ ☆ RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric + Stereo Network + + +
+ Predicting accurate normal maps of objects from two-dimensional images in regions of complex structure and spatial material variations is challenging for photometric stereo methods, due to the influence of surface reflection properties caused by variations in object geometry and surface materials. To address this issue, we propose a photometric stereo network called RMAFF-PSN that uses residual multiscale attentional feature fusion to handle the ``difficult'' regions of the object. Unlike previous approaches that only use stacked convolutional layers to extract deep features from the input image, our method integrates feature information from different resolution stages and scales of the image. This approach preserves more physical information, such as the texture and geometry of the object in complex regions, through shallow-deep stage feature extraction, double branching enhancement, and attention optimization. To test the network structure under real-world conditions, we propose a new real dataset called Simple PS data, which contains multiple objects with varying structures and materials. Experimental results on a publicly available benchmark dataset demonstrate that our method outperforms most existing calibrated photometric stereo methods for the same number of input images, especially in the case of highly non-convex object structures. Our method also obtains good results under sparse lighting conditions.
+
+ comment: 17 pages, 12 figures
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning AAAI2024 + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
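For background on the evidential component: evidential deep learning typically maps network outputs to non-negative evidence, parameterizes a Dirichlet distribution, and reads off a per-sample uncertainty. The sketch below shows that generic machinery, not ANEDL's adaptive negative optimization itself.

```python
# Sketch of standard evidential deep learning (EDL) quantities used by
# EDL-based outlier detectors: evidence -> Dirichlet parameters -> expected
# probabilities and total uncertainty.
import torch
import torch.nn.functional as F

def edl_outputs(logits: torch.Tensor):
    """logits: (B, K). Returns expected class probabilities and per-sample uncertainty."""
    evidence = F.softplus(logits)           # non-negative evidence per class
    alpha = evidence + 1.0                  # Dirichlet concentration parameters
    S = alpha.sum(dim=1, keepdim=True)      # Dirichlet strength
    probs = alpha / S                       # expected class probabilities
    K = logits.shape[1]
    uncertainty = K / S.squeeze(1)          # in (0, 1]; high values flag likely outliers
    return probs, uncertainty
```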
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ DeS3: Adaptive Attention-driven Self and Soft Shadow Removal using ViT + Similarity AAAI2024 + + +
+ Removing soft and self shadows that lack clear boundaries from a single image is still challenging. Self shadows are shadows that are cast on the object itself. Most existing methods rely on binary shadow masks, without considering the ambiguous boundaries of soft and self shadows. In this paper, we present DeS3, a method that removes hard, soft and self shadows based on adaptive attention and ViT similarity. Our novel ViT similarity loss utilizes features extracted from a pre-trained Vision Transformer. This loss helps guide the reverse sampling towards recovering scene structures. Our adaptive attention is able to differentiate shadow regions from the underlying objects, as well as shadow regions from the object casting the shadow. This capability enables DeS3 to better recover the structures of objects even when they are partially occluded by shadows. Different from existing methods that rely on constraints during the training phase, we incorporate the ViT similarity during the sampling stage. Our method outperforms state-of-the-art methods on the SRD, AISTD, LRSS, USR and UIUC datasets, removing hard, soft, and self shadows robustly. Specifically, our method outperforms the SOTA method by 16\% in whole-image RMSE on the LRSS dataset. Our data and code are available at: \url{https://github.com/jinyeying/DeS3_Deshadow}
+
+ comment: Accepted to AAAI2024, diffusion shadow removal, + \url{https://github.com/jinyeying/DeS3_Deshadow} +
+
+
+
+
+ + ♻ ☆ Allowing humans to interactively guide machines where to look does not + always improve human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps \cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam} have been established as a common means for finding how important each input feature is to an AI's decisions. It is an interesting, unexplored question whether allowing users to edit the feature importance at test time would improve a human-AI team's accuracy on downstream tasks. In this paper, we address this question by leveraging CHM-Corr, a state-of-the-art, ante-hoc explainable classifier \cite{taesiri2022visual} that first predicts patch-wise correspondences between the input and training-set images, and then bases its classification decisions on them. We build CHM-Corr++, an interactive interface for CHM-Corr, enabling users to edit the feature attribution map provided by CHM-Corr and observe updated model decisions. Via CHM-Corr++, users can gain insights into if, when, and how the model changes its outputs, improving their understanding beyond static explanations. However, our user study with 18 users who performed 1,400 decisions finds no statistically significant evidence that our interactive approach improves user accuracy on CUB-200 bird image classification over static explanations. This challenges the hypothesis that interactivity can boost human-AI team accuracy~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding} and raises needs for future research. We open-source CHM-Corr++, an interactive tool for editing image classifier attention (see an interactive demo \href{http://137.184.82.109:7080/}{here}). We release code and data on \href{https://github.com/anguyen8/chm-corr-interactive}{github}.
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ♻ ☆ High-quality Image Dehazing with Diffusion Model + + +
+ Image dehazing is quite challenging in dense-haze scenarios, where little of the original information remains in the hazy image. Though previous methods have made remarkable progress, they still suffer from loss of content and color information in dense-haze scenarios. The recently emerged Denoising Diffusion Probabilistic Model (DDPM) exhibits strong generation ability, showing potential for solving this problem. However, DDPM fails to consider the physics of the dehazing task, limiting its information completion capacity. In this work, we propose DehazeDDPM: a DDPM-based and physics-aware image dehazing framework that applies to complex hazy scenarios. Specifically, DehazeDDPM works in two stages. The former stage physically models the dehazing task with the Atmospheric Scattering Model (ASM), pulling the distribution closer to the clear data and endowing DehazeDDPM with fog-aware ability. The latter stage exploits the strong generation ability of DDPM to compensate for the huge haze-induced information loss, by working in conjunction with the physical modelling. Extensive experiments demonstrate that our method attains state-of-the-art performance on both synthetic and real-world hazy datasets.
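The Atmospheric Scattering Model referenced above relates a hazy image I to the clear scene J via the transmission map t and global atmospheric light A, I(x) = J(x) t(x) + A (1 - t(x)). A small sketch of the forward model and its inversion follows; the clipping thresholds are assumptions for numerical stability.

```python
# Sketch of the Atmospheric Scattering Model (ASM):
#   I(x) = J(x) * t(x) + A * (1 - t(x))
# and its inversion for dehazing once t and A have been estimated.
import numpy as np

def apply_asm(J: np.ndarray, t: np.ndarray, A: float) -> np.ndarray:
    """J: clear image (H, W, 3) in [0, 1]; t: transmission map (H, W, 1) in (0, 1]."""
    return J * t + A * (1.0 - t)

def invert_asm(I: np.ndarray, t: np.ndarray, A: float, t_min: float = 0.1) -> np.ndarray:
    """Recover J from the hazy image I; t is clipped to avoid amplifying noise."""
    t = np.clip(t, t_min, 1.0)
    return np.clip((I - A * (1.0 - t)) / t, 0.0, 1.0)
```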
+
+
+
+
+ + ♻ ☆ Images are Achilles' Heel of Alignment: Exploiting Visual + Vulnerabilities for Jailbreaking Multimodal Large Language Models + + +
+ In this paper, we study the harmlessness alignment problem of multimodal large language models (MLLMs). We conduct a systematic empirical analysis of the harmlessness performance of representative MLLMs and reveal that the image input poses an alignment vulnerability for MLLMs. Inspired by this, we propose a novel jailbreak method named HADES, which hides and amplifies the harmfulness of the malicious intent within the text input, using meticulously crafted images. Experimental results show that HADES can effectively jailbreak existing MLLMs, achieving an average Attack Success Rate (ASR) of 90.26% for LLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data will be publicly released.
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive +progress in the field of remote sensing change detection (CD). However, both +architectures have inherent shortcomings. Recently, the Mamba architecture, +based on state space models, has shown remarkable performance in a series of +natural language processing tasks, which can effectively compensate for the +shortcomings of the above two architectures. In this paper, we explore for the +first time the potential of the Mamba architecture for remote sensing CD tasks. +We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and +MambaBDA, for binary change detection (BCD), semantic change detection (SCD), +and building damage assessment (BDA), respectively. All three frameworks adopt +the cutting-edge Visual Mamba architecture as the encoder, which allows full +learning of global spatial contextual information from the input images. For +the change decoder, which is available in all three architectures, we propose +three spatio-temporal relationship modeling mechanisms, which can be naturally +combined with the Mamba architecture and fully utilize its attribute to achieve +spatio-temporal interaction of multi-temporal features, thereby obtaining +accurate change information. On five benchmark datasets, our proposed +frameworks outperform current CNN- and Transformer-based approaches without +using any complex training strategies or tricks, fully demonstrating the +potential of the Mamba architecture in CD tasks. Specifically, we obtained +83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+, +and WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA +dataset xBD, we obtained 81.41% overall F1 score. Further experiments show that +our architecture is quite robust to degraded data. The source code will be +available in https://github.com/ChenHongruixuan/MambaCD + +
+
+
+
+
+ + ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively coarse-grained text annotations. This hinders the model from comprehending the fine-grained semantics of query texts in real scenarios. To address this problem, we contribute a new benchmark named \textbf{UFineBench} for text-based person retrieval with ultra-fine granularity. Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a large number of person images and manually annotate each image with two detailed textual descriptions, averaging 80.8 words each. The average word count is three to four times that of previous datasets. In addition to standard in-domain evaluation, we also propose a special \textbf{evaluation paradigm} that is more representative of real scenarios. It contains a new evaluation set, named UFine3C, that spans cross domains, cross textual granularity and cross textual styles, and a new evaluation metric for accurately measuring retrieval ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a more efficient \textbf{algorithm} especially designed for text-based person retrieval with ultra fine-grained texts. It achieves fine-granularity mining by adopting a shared cross-modal granularity decoder and a hard negative match mechanism. With standard in-domain evaluation, CFAM establishes competitive performance across various datasets, especially on our ultra fine-grained UFine6926. Furthermore, by evaluating on UFine3C, we demonstrate that training on our UFine6926 significantly improves generalization to real scenarios compared with other coarse-grained datasets. The dataset and code will be made publicly available at \url{https://github.com/Zplusdragon/UFineBench}.
+
+
+
+
+ + ♻ ☆ Multisize Dataset Condensation ICLR 2024 + + +
+ While dataset condensation effectively enhances training efficiency, its +application in on-device scenarios brings unique challenges. 1) Due to the +fluctuating computational resources of these devices, there's a demand for a +flexible dataset size that diverges from a predefined size. 2) The limited +computational power on devices often prevents additional condensation +operations. These two challenges connect to the "subset degradation problem" in +traditional dataset condensation: a subset from a larger condensed dataset is +often unrepresentative compared to directly condensing the whole dataset to +that smaller size. In this paper, we propose Multisize Dataset Condensation +(MDC) by compressing N condensation processes into a single condensation +process to obtain datasets with multiple sizes. Specifically, we introduce an +"adaptive subset loss" on top of the basic condensation loss to mitigate the +"subset degradation problem". Our MDC method offers several benefits: 1) No +additional condensation process is required; 2) reduced storage requirement by +reusing condensed images. Experiments validate our findings on networks +including ConvNet, ResNet and DenseNet, and datasets including SVHN, CIFAR-10, +CIFAR-100 and ImageNet. For example, we achieved 5.22%-6.40% average accuracy +gains on condensing CIFAR-10 to ten images per class. Code is available at: +https://github.com/he-y/Multisize-Dataset-Condensation. + +
+
+ comment: Accepted by ICLR 2024 Oral +
+
+
+
+
+ + ♻ ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and transformer-based approaches. However, CNNs struggle with long-range dependencies, while transformers are burdened by quadratic computational complexity. Mamba-based models, with their superior long-range modeling and linear efficiency, have garnered substantial attention. This study pioneers the application of Mamba to multi-class unsupervised anomaly detection, presenting MambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring Locality-Enhanced State Space (LSS) modules at multiple scales. The proposed LSS module, integrating parallel cascaded Hybrid State Space (HSS) blocks and multi-kernel convolution operations, effectively captures both long-range and local information. The HSS block, utilizing Hybrid Scanning (HS) encoders, encodes feature maps into five scanning methods and eight directions, thereby strengthening global connections through the State Space Model (SSM). The use of Hilbert scanning and eight directions significantly improves feature sequence modeling. Comprehensive experiments on six diverse anomaly detection datasets and seven metrics demonstrate state-of-the-art performance, substantiating the method's effectiveness.
+
+
+
+
+ + ♻ ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting method for dynamic view synthesis. Existing neural radiance fields (NeRF) based solutions learn the deformation in an implicit manner, which cannot incorporate 3D scene geometry. Therefore, the learned deformation is not necessarily geometrically coherent, which results in unsatisfactory dynamic view synthesis and 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new representation of the 3D scene, building upon which the 3D geometry could be exploited in learning the complex 3D deformation. Specifically, the scenes are represented as a collection of 3D Gaussians, where each 3D Gaussian is optimized to move and rotate over time to model the deformation. To enforce the 3D scene geometry constraint during deformation, we explicitly extract 3D geometry features and integrate them in learning the 3D deformation. In this way, our solution achieves 3D geometry-aware deformation modeling, which enables improved dynamic view synthesis and 3D dynamic reconstruction. Extensive experimental results on both synthetic and real datasets prove the superiority of our solution, which achieves new state-of-the-art performance. The project is available at https://npucvr.github.io/GaGS/
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ♻ ☆ Learning Spatial Features from Audio-Visual Correspondence in Egocentric + Videos CVPR 2024 + + +
+ We propose a self-supervised method for learning representations based on +spatial audio-visual correspondences in egocentric videos. Our method uses a +masked auto-encoding framework to synthesize masked binaural (multi-channel) +audio through the synergy of audio and vision, thereby learning useful spatial +relationships between the two modalities. We use our pretrained features to +tackle two downstream video tasks requiring spatial understanding in social +scenarios: active speaker detection and spatial audio denoising. Through +extensive experiments, we show that our features are generic enough to improve +over multiple state-of-the-art baselines on both tasks on two challenging +egocentric video datasets that offer binaural audio, EgoCom and EasyCom. +Project: http://vision.cs.utexas.edu/projects/ego_av_corr. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A Survey + + +
+ Advancements in model algorithms, the growth of foundational models, and access to high-quality datasets have propelled the evolution of Artificial Intelligence Generated Content (AIGC). Despite its notable successes, AIGC still faces hurdles such as updating knowledge, handling long-tail data, mitigating data leakage, and managing high training and inference costs. Retrieval-Augmented Generation (RAG) has recently emerged as a paradigm to address such challenges. In particular, RAG introduces the information retrieval process, which enhances the generation process by retrieving relevant objects from available data stores, leading to higher accuracy and better robustness. In this paper, we comprehensively review existing efforts that integrate the RAG technique into AIGC scenarios. We first classify RAG foundations according to how the retriever augments the generator, distilling the fundamental abstractions of the augmentation methodologies for various retrievers and generators. This unified perspective encompasses all RAG scenarios, illuminating advancements and pivotal technologies that help with potential future progress. We also summarize additional enhancement methods for RAG, facilitating effective engineering and implementation of RAG systems. Then, from another view, we survey practical applications of RAG across different modalities and tasks, offering valuable references for researchers and practitioners. Furthermore, we introduce the benchmarks for RAG, discuss the limitations of current RAG systems, and suggest potential directions for future research. Github: https://github.com/PKU-DAIR/RAG-Survey.
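To make the retrieve-then-generate pattern concrete, a minimal query-time sketch is shown below. The `embed` and `generate` callables and the prompt template are placeholders for illustration; real RAG systems use dedicated retrievers, vector stores, and prompt engineering.

```python
# Minimal retrieve-then-generate sketch: embed documents, retrieve the top-k
# most similar ones for a query, and prepend them to the generator's prompt.
import numpy as np

def top_k_docs(query_vec: np.ndarray, doc_vecs: np.ndarray, k: int = 3) -> np.ndarray:
    # Cosine similarity between the query and every document embedding.
    q = query_vec / (np.linalg.norm(query_vec) + 1e-8)
    d = doc_vecs / (np.linalg.norm(doc_vecs, axis=1, keepdims=True) + 1e-8)
    return np.argsort(d @ q)[::-1][:k]

def rag_answer(query: str, docs: list[str], embed, generate) -> str:
    """`embed` maps text -> vector, `generate` maps prompt -> text (placeholders)."""
    doc_vecs = np.stack([embed(doc) for doc in docs])
    idx = top_k_docs(embed(query), doc_vecs)
    context = "\n\n".join(docs[i] for i in idx)
    prompt = f"Answer using the context below.\n\nContext:\n{context}\n\nQuestion: {query}"
    return generate(prompt)
```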
+
+ comment: Citing 377 papers, 28 pages, 1 table, 12 figures. Project: + https://github.com/PKU-DAIR/RAG-Survey +
+
+
+
+
+ + ♻ ☆ A Survey on 3D Gaussian Splatting + + +
+ 3D Gaussian splatting (GS) has recently emerged as a transformative technique +in the realm of explicit radiance field and computer graphics. This innovative +approach, characterized by the utilization of millions of learnable 3D +Gaussians, represents a significant departure from mainstream neural radiance +field approaches, which predominantly use implicit, coordinate-based models to +map spatial coordinates to pixel values. 3D GS, with its explicit scene +representation and differentiable rendering algorithm, not only promises +real-time rendering capability but also introduces unprecedented levels of +editability. This positions 3D GS as a potential game-changer for the next +generation of 3D reconstruction and representation. In the present paper, we +provide the first systematic overview of the recent developments and critical +contributions in the domain of 3D GS. We begin with a detailed exploration of +the underlying principles and the driving forces behind the emergence of 3D GS, +laying the groundwork for understanding its significance. A focal point of our +discussion is the practical applicability of 3D GS. By enabling unprecedented +rendering speed, 3D GS opens up a plethora of applications, ranging from +virtual reality to interactive media and beyond. This is complemented by a +comparative analysis of leading 3D GS models, evaluated across various +benchmark tasks to highlight their performance and practical utility. The +survey concludes by identifying current challenges and suggesting potential +avenues for future research in this domain. Through this survey, we aim to +provide a valuable resource for both newcomers and seasoned researchers, +fostering further exploration and advancement in applicable and explicit +radiance field representation. + +
+
+ comment: Ongoing project +
+
+
+
+
+ + ♻ ☆ The Curse of Recursion: Training on Generated Data Makes Models Forget + + +
+ Stable Diffusion revolutionised image creation from descriptive text. GPT-2, GPT-3(.5) and GPT-4 demonstrated astonishing performance across a variety of language tasks. ChatGPT introduced such language models to the general public. It is now clear that large language models (LLMs) are here to stay, and will bring about drastic change in the whole ecosystem of online text and images. In this paper we consider what the future might hold. What will happen to GPT-{n} once LLMs contribute much of the language found online? We find that use of model-generated content in training causes irreversible defects in the resulting models, where tails of the original content distribution disappear. We refer to this effect as Model Collapse and show that it can occur in Variational Autoencoders, Gaussian Mixture Models and LLMs. We build theoretical intuition behind the phenomenon and portray its ubiquity amongst all learned generative models. We demonstrate that it has to be taken seriously if we are to sustain the benefits of training from large-scale data scraped from the web. Indeed, data collected about genuine human interactions with systems will become increasingly valuable in the presence of content generated by LLMs in data crawled from the Internet.
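A toy numerical illustration of the tail-loss effect described above, using repeated fitting of a single Gaussian to samples drawn from the previous generation's fit. This is a deliberately simplified stand-in for the paper's VAE/GMM/LLM experiments, not a reproduction of them.

```python
# Toy illustration of model collapse: each generation fits a Gaussian to a
# finite sample drawn from the previous generation's fitted model. Sampling
# noise compounds, so the estimated spread drifts (typically shrinking) and
# tail events become progressively less likely.
import numpy as np

rng = np.random.default_rng(0)
mu, sigma, n = 0.0, 1.0, 50            # "real" data distribution and per-generation sample size

for generation in range(1, 31):
    samples = rng.normal(mu, sigma, size=n)    # train only on the previous model's output
    mu, sigma = samples.mean(), samples.std()  # refit the "generative model"
    if generation % 10 == 0:
        print(f"gen {generation}: mu={mu:+.3f}, sigma={sigma:.3f}")
```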
+
+ comment: Fixed typos in eqn 4,5 +
+
+
+
+
+ + ♻ ☆ Curvature-Balanced Feature Manifold Learning for Long-Tailed + Classification CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have +proposed several approaches to reduce model bias, most of which assume that +classes with few samples are weak classes. However, recent studies have shown +that tail classes are not always hard to learn, and model bias has been +observed on sample-balanced datasets, suggesting the existence of other factors +that affect model bias. In this work, we systematically propose a series of +geometric measurements for perceptual manifolds in deep neural networks, and +then explore the effect of the geometric characteristics of perceptual +manifolds on classification difficulty and how learning shapes the geometric +characteristics of perceptual manifolds. An unanticipated finding is that the +correlation between the class accuracy and the separation degree of perceptual +manifolds gradually decreases during training, while the negative correlation +with the curvature gradually increases, implying that curvature imbalance leads +to model bias. Therefore, we propose curvature regularization to facilitate the +model to learn curvature-balanced and flatter perceptual manifolds. Evaluations +on multiple long-tailed and non-long-tailed datasets show the excellent +performance and exciting generality of our approach, especially in achieving +significant performance improvements based on current state-of-the-art +techniques. Our work opens up a geometric analysis perspective on model bias +and reminds researchers to pay attention to model bias on non-long-tailed and +even sample-balanced datasets. The code and model will be made public. + +
+
+ comment: 20pages, Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Towards Reliable Medical Image Segmentation by utilizing Evidential + Calibrated Uncertainty + + +
+ Medical image segmentation is critical for disease diagnosis and treatment assessment. However, concerns regarding the reliability of segmentation regions persist among clinicians, mainly attributed to the absence of confidence assessment, robustness, and calibration to accuracy. To address this, we introduce DEviS, an easily implementable foundational model that seamlessly integrates into various medical image segmentation networks. DEviS not only enhances the calibration and robustness of baseline segmentation accuracy but also provides high-efficiency uncertainty estimation for reliable predictions. By leveraging subjective logic theory, we explicitly model probability and uncertainty for the problem of medical image segmentation. Here, the Dirichlet distribution parameterizes the distribution of probabilities for the different classes of the segmentation results. To generate calibrated predictions and uncertainty, we develop a trainable calibrated uncertainty penalty. Furthermore, DEviS incorporates an uncertainty-aware filtering module, which utilizes the metric of uncertainty-calibrated error to filter reliable data within the dataset. We conducted validation studies to assess both the accuracy and robustness of DEviS segmentation, along with evaluating the efficiency and reliability of uncertainty estimation. These evaluations were performed using publicly available datasets including ISIC2018, LiTS2017, and BraTS2019. Additionally, two potential clinical trials are being conducted on the Johns Hopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate its efficacy in filtering high-quality or out-of-distribution data. Our code has been released at https://github.com/Cocofeat/DEviS.
+
+ comment: 34 pages, 11 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 72 + +
+
+
+ + ☆ Probabilistic Directed Distance Fields for Ray-Based Shape + Representations + + +
+ In modern computer vision, the optimal representation of 3D shape continues +to be task-dependent. One fundamental operation applied to such representations +is differentiable rendering, as it enables inverse graphics approaches in +learning frameworks. Standard explicit shape representations (voxels, point +clouds, or meshes) are often easily rendered, but can suffer from limited +geometric fidelity, among other issues. On the other hand, implicit +representations (occupancy, distance, or radiance fields) preserve greater +fidelity, but suffer from complex or inefficient rendering processes, limiting +scalability. In this work, we devise Directed Distance Fields (DDFs), a novel +neural shape representation that builds upon classical distance fields. The +fundamental operation in a DDF maps an oriented point (position and direction) +to surface visibility and depth. This enables efficient differentiable +rendering, obtaining depth with a single forward pass per pixel, as well as +differential geometric quantity extraction (e.g., surface normals), with only +additional backward passes. Using probabilistic DDFs (PDDFs), we show how to +model inherent discontinuities in the underlying field. We then apply DDFs to +several applications, including single-shape fitting, generative modelling, and +single-image 3D reconstruction, showcasing strong performance with simple +architectural components via the versatility of our representation. Finally, +since the dimensionality of DDFs permits view-dependent geometric artifacts, we +conduct a theoretical investigation of the constraints necessary for view +consistency. We find a small set of field properties that are sufficient to +guarantee a DDF is consistent, without knowing, for instance, which shape the +field is expressing. + +
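The fundamental mapping described above, from an oriented point to visibility and depth, can be sketched as a small MLP. The architecture below is purely illustrative; the width, depth, output heads, and the absence of positional encoding are assumptions, not the paper's design.

```python
# Sketch of the core DDF interface: a network mapping an oriented point
# (3D position + unit view direction) to a visibility probability and a depth
# along the ray.
import torch
import torch.nn as nn

class DirectedDistanceField(nn.Module):
    def __init__(self, hidden: int = 256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(6, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 2))                        # [visibility logit, raw depth]

    def forward(self, position: torch.Tensor, direction: torch.Tensor):
        out = self.net(torch.cat([position, direction], dim=-1))
        visibility = torch.sigmoid(out[..., :1])         # probability the ray hits the surface
        depth = torch.nn.functional.softplus(out[..., 1:])   # non-negative distance along the ray
        return visibility, depth

ddf = DirectedDistanceField()
p = torch.randn(1024, 3)
d = torch.nn.functional.normalize(torch.randn(1024, 3), dim=-1)
vis, depth = ddf(p, d)   # one forward pass per query ray yields a depth value
```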
+
+ comment: Extension of arXiv:2112.05300 +
+
+
+
+
+ + ☆ Exploring Explainability in Video Action Recognition CVPR 2024 + + +
+ Image Classification and Video Action Recognition are perhaps the two most +foundational tasks in computer vision. Consequently, explaining the inner +workings of trained deep neural networks is of prime importance. While numerous +efforts focus on explaining the decisions of trained deep neural networks in +image classification, exploration in the domain of its temporal version, video +action recognition, has been scant. In this work, we take a deeper look at this +problem. We begin by revisiting Grad-CAM, one of the popular feature +attribution methods for Image Classification, and its extension to Video Action +Recognition tasks and examine the method's limitations. To address these, we +introduce Video-TCAV, by building on TCAV for Image Classification tasks, which +aims to quantify the importance of specific concepts in the decision-making +process of Video Action Recognition models. As the scalable generation of +concepts is still an open problem, we propose a machine-assisted approach to +generate spatial and spatiotemporal concepts relevant to Video Action +Recognition for testing Video-TCAV. We then establish the importance of +temporally-varying concepts by demonstrating the superiority of dynamic +spatiotemporal concepts over trivial spatial concepts. In conclusion, we +introduce a framework for investigating hypotheses in action recognition and +quantitatively testing them, thus advancing research in the explainability of +deep neural networks used in video action recognition. + +
+
+ comment: 6 pages, 10 figures, Accepted to the 3rd Explainable AI for Computer + Vision (XAI4CV) Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Rethinking Iterative Stereo Matching from Diffusion Bridge Model + Perspective + + +
+ Recently, iteration-based stereo matching has shown great potential. However, +these models optimize the disparity map using RNN variants. The discrete +optimization process poses a challenge of information loss, which restricts the +level of detail that can be expressed in the generated disparity map. In order +to address these issues, we propose a novel training approach that incorporates +diffusion models into the iterative optimization process. We designed a +Time-based Gated Recurrent Unit (T-GRU) to correlate temporal and disparity +outputs. Unlike standard recurrent units, we employ Agent Attention to generate +more expressive features. We also designed an attention-based context network +to capture a large amount of contextual information. Experiments on several +public benchmarks show that we have achieved competitive stereo matching +performance. Our model ranks first in the Scene Flow dataset, achieving over a +7% improvement compared to competing methods, and requires only 8 iterations to +achieve state-of-the-art results. + +
+
+ comment: tip. arXiv admin note: text overlap with arXiv:2303.06615 by other + authors +
+
+
+
+
+ + ☆ Improving Personalisation in Valence and Arousal Prediction using Data + Augmentation + + +
+ In the field of emotion recognition and Human-Machine Interaction (HMI), personalised approaches have exhibited their efficacy in capturing individual-specific characteristics and enhancing affective prediction accuracy. However, personalisation techniques often face the challenge of limited data for target individuals. This paper presents our work on an enhanced personalisation strategy that leverages data augmentation to develop tailored models for continuous valence and arousal prediction. Our proposed approach, Distance Weighting Augmentation (DWA), employs a weighting-based augmentation method that expands a target individual's dataset, leveraging distance metrics to identify similar samples at the segment level. Experimental results on the MuSe-Personalisation 2023 Challenge dataset demonstrate that our method significantly improves the test-set performance of feature sets that have low baseline performance. This improvement in poorly performing features comes without sacrificing performance on high-performing features. In particular, our method achieves a maximum combined testing CCC of 0.78, compared to the reported baseline score of 0.76 (reproduced at 0.72). It also achieves peak arousal and valence scores of 0.81 and 0.76, compared to reproduced baseline scores of 0.76 and 0.67, respectively. Through this work, we make significant contributions to the advancement of personalised affective computing models, enhancing the practicality and adaptability of data-level personalisation in real-world contexts.
+
+
+
+
+ + ☆ Theoretical research on generative diffusion models: an overview + + +
+ Generative diffusion models have shown great success in many fields and rest on a powerful theoretical background. They convert the data distribution to noise and then reverse this process to recover a similar distribution. Many existing reviews have focused on specific application areas without concentrating on research into the algorithms themselves. Unlike them, we investigate the theoretical developments of generative diffusion models. These approaches mainly divide into two groups: training-based and sampling-based. Recognising this division allowed us to build a clear and understandable categorization for researchers who will make new developments in the future.
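For readers new to the training-based family, the core DDPM-style training objective can be written in a few lines: sample a timestep, noise the data with the closed-form forward process, and regress the injected noise. The network and noise schedule below are generic placeholders, not a specific method from the survey.

```python
# Minimal DDPM-style training step:
#   x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * eps,
# and the network eps_theta(x_t, t) is trained to predict eps with MSE.
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)          # linear noise schedule (placeholder)
alpha_bar = torch.cumprod(1.0 - betas, dim=0)

def ddpm_loss(model, x0: torch.Tensor) -> torch.Tensor:
    """model(x_t, t) is any noise-prediction network (placeholder)."""
    b = x0.shape[0]
    t = torch.randint(0, T, (b,), device=x0.device)
    eps = torch.randn_like(x0)
    ab = alpha_bar.to(x0.device)[t].view(b, *([1] * (x0.dim() - 1)))
    x_t = ab.sqrt() * x0 + (1.0 - ab).sqrt() * eps
    return torch.nn.functional.mse_loss(model(x_t, t), eps)
```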
+
+
+
+
+ + ☆ PracticalDG: Perturbation Distillation on Vision-Language Models for + Hybrid Domain Generalization CVPR2024 + + +
+ Domain Generalization (DG) aims to resolve distribution shifts between source and target domains, and current DG methods default to the setting in which data from source and target domains share identical categories. Nevertheless, unseen classes from target domains exist in practical scenarios. To address this issue, Open Set Domain Generalization (OSDG) has emerged and several methods have been exclusively proposed. However, most existing methods adopt complex architectures with only slight improvement compared with DG methods. Recently, vision-language models (VLMs) have been introduced in DG following the fine-tuning paradigm, but they consume huge training overhead with large vision models. Therefore, in this paper, we innovate to transfer knowledge from VLMs to lightweight vision models and improve the robustness by introducing Perturbation Distillation (PD) from three perspectives, including Score, Class and Instance (SCI), named SCI-PD. Moreover, previous methods are oriented by benchmarks with identical and fixed splits, ignoring the divergence between source domains. These methods are revealed to suffer from sharp performance decay on our proposed new benchmark Hybrid Domain Generalization (HDG) and a novel metric $H^{2}$-CV, which construct various splits to comprehensively assess the robustness of algorithms. Extensive experiments demonstrate that our method outperforms state-of-the-art algorithms on multiple datasets, especially improving the robustness when confronting data scarcity.
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ MMA-DFER: MultiModal Adaptation of unimodal models for Dynamic Facial + Expression Recognition in-the-wild CVPR 2024 + + +
+ Dynamic Facial Expression Recognition (DFER) has received significant +interest in the recent years dictated by its pivotal role in enabling empathic +and human-compatible technologies. Achieving robustness towards in-the-wild +data in DFER is particularly important for real-world applications. One of the +directions aimed at improving such models is multimodal emotion recognition +based on audio and video data. Multimodal learning in DFER increases the model +capabilities by leveraging richer, complementary data representations. Within +the field of multimodal DFER, recent methods have focused on exploiting +advances of self-supervised learning (SSL) for pre-training of strong +multimodal encoders. Another line of research has focused on adapting +pre-trained static models for DFER. In this work, we propose a different +perspective on the problem and investigate the advancement of multimodal DFER +performance by adapting SSL-pre-trained disjoint unimodal encoders. We identify +main challenges associated with this task, namely, intra-modality adaptation, +cross-modal alignment, and temporal adaptation, and propose solutions to each +of them. As a result, we demonstrate improvement over current state-of-the-art +on two popular DFER benchmarks, namely DFEW and MFAW. + +
+
+ comment: accepted to CVPR 2024 ABAW Workshop +
+
+
+
+
+ + ☆ THQA: A Perceptual Quality Assessment Database for Talking Heads + + +
+ In the realm of media technology, digital humans have gained prominence due +to rapid advancements in computer technology. However, the manual modeling and +control required for the majority of digital humans pose significant obstacles +to efficient development. The speech-driven methods offer a novel avenue for +manipulating the mouth shape and expressions of digital humans. Despite the +proliferation of driving methods, the quality of many generated talking head +(TH) videos remains a concern, impacting user visual experiences. To tackle +this issue, this paper introduces the Talking Head Quality Assessment (THQA) +database, featuring 800 TH videos generated through 8 diverse speech-driven +methods. Extensive experiments affirm the THQA database's richness in character +and speech features. Subsequent subjective quality assessment experiments +analyze correlations between scoring results and speech-driven methods, ages, +and genders. In addition, experimental results show that mainstream image and +video quality assessment methods have limitations for the THQA database, +underscoring the imperative for further research to enhance TH video quality +assessment. The THQA database is publicly accessible at +https://github.com/zyj-2000/THQA. + +
+
+
+
+
+ + ☆ Smart Help: Strategic Opponent Modeling for Proactive and Adaptive Robot + Assistance in Households + + +
+ Despite the significant demand for assistive technology among vulnerable +groups (e.g., the elderly, children, and the disabled) in daily tasks, research +into advanced AI-driven assistive solutions that genuinely accommodate their +diverse needs remains sparse. Traditional human-machine interaction tasks often +require machines to simply help without nuanced consideration of human +abilities and feelings, such as their opportunity for practice and learning, +sense of self-improvement, and self-esteem. Addressing this gap, we define a +pivotal and novel challenge Smart Help, which aims to provide proactive yet +adaptive support to human agents with diverse disabilities and dynamic goals in +various tasks and environments. To establish this challenge, we leverage +AI2-THOR to build a new interactive 3D realistic household environment for the +Smart Help task. We introduce an innovative opponent modeling module that +provides a nuanced understanding of the main agent's capabilities and goals, in +order to optimize the assisting agent's helping policy. Rigorous experiments +validate the efficacy of our model components and show the superiority of our +holistic approach against established baselines. Our findings illustrate the +potential of AI-imbued assistive robots in improving the well-being of +vulnerable groups. + +
+
+
+
+
+ + ☆ MaSkel: A Model for Human Whole-body X-rays Generation from Human + Masking Images + + +
+ Human whole-body X-rays can offer a valuable reference for various
+applications, including medical diagnostics, digital animation modeling, and
+ergonomic design. The traditional way of obtaining such X-ray information
+requires CT (Computed Tomography) scanners, which emit potentially harmful
+radiation, so it is of limited use in realistic applications because it lacks
+adaptability and safety. In our work, we propose a new method to directly
+generate 2D human whole-body X-rays from human masking images. The predicted
+images are similar to real ones in both image style and anatomic structure. We
+employ a data-driven strategy: by leveraging advanced generative techniques,
+our model MaSkel (Masking image to Skeleton X-rays) can generate a high-quality
+X-ray image from a human masking image without the need for invasive and
+harmful radiation exposure, which not only provides a new path to generate
+highly anatomic and customized data but also reduces health risks. To our
+knowledge, MaSkel is the first work to predict whole-body X-rays. This paper
+covers two parts of the work. The first addresses the data limitation problem:
+diffusion-based techniques are used for data augmentation, providing two
+synthetic datasets for preliminary pretraining. We then design a two-stage
+training strategy to train MaSkel. Finally, we perform qualitative and
+quantitative evaluations of the generated X-rays and invite professional
+doctors to assess the predicted data. These evaluations demonstrate MaSkel's
+superior ability to generate anatomic X-rays from human masking images. The
+related code and dataset links are available at
+https://github.com/2022yingjie/MaSkel. + +
<br>
+
+
+
+
+ + ☆ Beyond Known Clusters: Probe New Prototypes for Efficient Generalized + Class Discovery + + +
+ Generalized Class Discovery (GCD) aims to dynamically assign labels to +unlabelled data partially based on knowledge learned from labelled data, where +the unlabelled data may come from known or novel classes. The prevailing +approach generally involves clustering across all data and learning conceptions +by prototypical contrastive learning. However, existing methods largely hinge +on the performance of clustering algorithms and are thus subject to their +inherent limitations. Firstly, the estimated cluster number is often smaller +than the ground truth, making the existing methods suffer from the lack of +prototypes for comprehensive conception learning. To address this issue, we +propose an adaptive probing mechanism that introduces learnable potential +prototypes to expand cluster prototypes (centers). As there is no ground truth +for the potential prototype, we develop a self-supervised prototype learning +framework to optimize the potential prototype in an end-to-end fashion. +Secondly, clustering is computationally intensive, and the conventional +strategy of clustering both labelled and unlabelled instances exacerbates this +issue. To counteract this inefficiency, we opt to cluster only the unlabelled +instances and subsequently expand the cluster prototypes with our introduced +potential prototypes to fast explore novel classes. Despite the simplicity of +our proposed method, extensive empirical analysis on a wide range of datasets +confirms that our method consistently delivers state-of-the-art results. +Specifically, our method surpasses the nearest competitor by a significant +margin of \textbf{9.7}$\%$ within the Stanford Cars dataset and +\textbf{12$\times$} clustering efficiency within the Herbarium 19 dataset. We +will make the code and checkpoints publicly available at +\url{https://github.com/xjtuYW/PNP.git}. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ A Fourier-enhanced multi-modal 3D small object optical mark recognition + and positioning method for percutaneous abdominal puncture surgical + navigation + + +
+ Navigation for thoracoabdominal puncture surgery is used to locate the needle
+entry point on the patient's body surface. The traditional reflective-ball
+navigation method struggles to position the needle entry point on the soft,
+irregular, smooth chest and abdomen. With structured-light technology, the lack
+of clear characteristic points on the body surface makes it difficult to
+identify and locate arbitrary needle insertion points. To meet the
+high-stability and high-accuracy requirements of surgical navigation, this
+paper proposes a novel multi-modal 3D small-object medical marker detection
+method, which identifies the center of a small single ring as the needle
+insertion point. Moreover, the method leverages Fourier-transform enhancement
+to augment the dataset, enrich image details, and strengthen the network's
+capability. The method extracts the Region of Interest (ROI) of the feature
+image from both the enhanced and original images and then generates a mask map.
+Subsequently, the point cloud of the ROI from the depth map is obtained through
+the registration of ROI point-cloud contour fitting. In addition, the method
+employs the Tukey loss for optimal precision. The experimental results show
+that the proposed method not only achieves high-precision and high-stability
+positioning, but also enables the positioning of arbitrary needle insertion
+points. + +
<br>
+
+ comment: 19 pages, 6 figures, +
+
+
+
+
+ + ☆ Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active + Image Classification + + +
+ Deep active learning (AL) seeks to minimize the annotation costs for training +deep neural networks. BAIT, a recently proposed AL strategy based on the Fisher +Information, has demonstrated impressive performance across various datasets. +However, BAIT's high computational and memory requirements hinder its +applicability on large-scale classification tasks, resulting in current +research neglecting BAIT in their evaluation. This paper introduces two methods +to enhance BAIT's computational efficiency and scalability. Notably, we +significantly reduce its time complexity by approximating the Fisher +Information. In particular, we adapt the original formulation by i) taking the +expectation over the most probable classes, and ii) constructing a binary +classification task, leading to an alternative likelihood for gradient +computations. Consequently, this allows the efficient use of BAIT on +large-scale datasets, including ImageNet. Our unified and comprehensive +evaluation across a variety of datasets demonstrates that our approximations +achieve strong performance with considerably reduced time complexity. +Furthermore, we provide an extensive open-source toolbox that implements recent +state-of-the-art AL strategies, available at +https://github.com/dhuseljic/dal-toolbox. + +
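+ A hedged sketch of approximating the Fisher Information by taking the
+expectation over only the top-k most probable classes, in the spirit of the
+approximation described above; restricting the computation to a linear head,
+the diagonal form, and the value of k are illustrative assumptions rather than
+the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def topk_diag_fisher(logits, feats, k=5):
+    """Per-sample diagonal Fisher of a linear head W (C x D), approximated
+    over the k most probable classes. logits: (B, C), feats: (B, D)."""
+    probs = F.softmax(logits, dim=-1)
+    topp, topc = probs.topk(k, dim=-1)                 # (B, k)
+    topp = topp / topp.sum(dim=-1, keepdim=True)       # renormalise over top-k
+    fisher = feats.new_zeros(feats.shape[0], logits.shape[1], feats.shape[1])
+    for j in range(k):
+        onehot = F.one_hot(topc[:, j], logits.shape[1]).float()      # (B, C)
+        # grad of the log-likelihood for class c w.r.t. W is (onehot - p) feats^T
+        g = (onehot - probs).unsqueeze(-1) * feats.unsqueeze(1)      # (B, C, D)
+        fisher += topp[:, j].view(-1, 1, 1) * g.pow(2)
+    return fisher   # (B, C, D) diagonal Fisher estimate per sample
+
<br>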
+
+
+
+
+ + ☆ BG-YOLO: A Bidirectional-Guided Method for Underwater Object Detection + + +
+ Degraded underwater images decrease the accuracy of underwater object
+detection. However, existing methods for underwater image enhancement mainly
+focus on improving visual quality indicators, which may not benefit underwater
+object detection and may even cause serious degradation in detection
+performance. To alleviate this problem, we propose a bidirectional-guided
+method for underwater object detection, referred to as BG-YOLO. In the proposed
+method, the network is organized into an enhancement branch and a detection
+branch arranged in parallel. The enhancement branch consists of a cascade of an
+image enhancement subnet and an object detection subnet, while the detection
+branch consists only of a detection subnet. A feature-guided module connects
+the shallow convolution layers of the two branches. When training the
+enhancement branch, the object detection subnet in the enhancement branch
+guides the image enhancement subnet to be optimized in the direction most
+conducive to the detection task. The shallow feature map of the trained
+enhancement branch is then passed to the feature-guided module, which
+constrains the optimization of the detection branch through a consistency loss
+and prompts the detection branch to learn more detailed information about the
+objects, thereby refining detection performance. At inference time, only the
+detection branch is retained, so no additional computational cost is
+introduced. Extensive experiments demonstrate that the proposed method
+significantly improves detector performance in severely degraded underwater
+scenes while maintaining a remarkable detection speed. + +
<br>
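+ A minimal sketch of the feature-level guidance described above, assuming both
+branches expose a shallow feature map of the same shape; the mean-squared-error
+form of the consistency loss and the loss weight are assumptions made for
+illustration.
+
+import torch.nn.functional as F
+
+def consistency_loss(det_feat, enh_feat):
+    """Pull the detection branch's shallow features towards the (frozen)
+    enhancement branch's shallow features."""
+    return F.mse_loss(det_feat, enh_feat.detach())
+
+# inside the detection-branch training loop (illustrative):
+# loss = detection_loss + lambda_cons * consistency_loss(det_feat, enh_feat)
+
<br>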
+
+ comment: 15 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ MCPNet: An Interpretable Classifier via Multi-Level Concept Prototypes CVPR 2024 + + +
+ Recent advancements in post-hoc and inherently interpretable methods have +markedly enhanced the explanations of black box classifier models. These +methods operate either through post-analysis or by integrating concept learning +during model training. Although being effective in bridging the semantic gap +between a model's latent space and human interpretation, these explanation +methods only partially reveal the model's decision-making process. The outcome +is typically limited to high-level semantics derived from the last feature map. +We argue that the explanations lacking insights into the decision processes at +low and mid-level features are neither fully faithful nor useful. Addressing +this gap, we introduce the Multi-Level Concept Prototypes Classifier (MCPNet), +an inherently interpretable model. MCPNet autonomously learns meaningful +concept prototypes across multiple feature map levels using Centered Kernel +Alignment (CKA) loss and an energy-based weighted PCA mechanism, and it does so +without reliance on predefined concept labels. Further, we propose a novel +classifier paradigm that learns and aligns multi-level concept prototype +distributions for classification purposes via Class-aware Concept Distribution +(CCD) loss. Our experiments reveal that our proposed MCPNet while being +adaptable to various model architectures, offers comprehensive multi-level +explanations while maintaining classification accuracy. Additionally, its +concept distribution-based classification approach shows improved +generalization capabilities in few-shot classification scenarios. + +
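+ Centered Kernel Alignment (CKA), used above as a training signal for concept
+prototypes, has a simple linear form; the sketch below is a generic linear-CKA
+implementation between two feature matrices, not MCPNet's exact loss.
+
+import torch
+
+def linear_cka(x, y):
+    """Linear CKA between feature matrices x (n, d1) and y (n, d2)."""
+    x = x - x.mean(dim=0, keepdim=True)   # center the features
+    y = y - y.mean(dim=0, keepdim=True)
+    hsic_xy = (y.t() @ x).norm(p='fro') ** 2
+    hsic_xx = (x.t() @ x).norm(p='fro') ** 2
+    hsic_yy = (y.t() @ y).norm(p='fro') ** 2
+    return hsic_xy / (hsic_xx.sqrt() * hsic_yy.sqrt())
+
+# a CKA-style alignment loss could then be written as 1 - linear_cka(f, g)
+
<br>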
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LoopGaussian: Creating 3D Cinemagraph with Multi-view Images via + Eulerian Motion Field + + +
+ Cinemagraph is a unique form of visual media that combines elements of still +photography and subtle motion to create a captivating experience. However, the +majority of videos generated by recent works lack depth information and are +confined to the constraints of 2D image space. In this paper, inspired by +significant progress in the field of novel view synthesis (NVS) achieved by 3D +Gaussian Splatting (3D-GS), we propose LoopGaussian to elevate cinemagraph from +2D image space to 3D space using 3D Gaussian modeling. To achieve this, we +first employ the 3D-GS method to reconstruct 3D Gaussian point clouds from +multi-view images of static scenes,incorporating shape regularization terms to +prevent blurring or artifacts caused by object deformation. We then adopt an +autoencoder tailored for 3D Gaussian to project it into feature space. To +maintain the local continuity of the scene, we devise SuperGaussian for +clustering based on the acquired features. By calculating the similarity +between clusters and employing a two-stage estimation method, we derive an +Eulerian motion field to describe velocities across the entire scene. The 3D +Gaussian points then move within the estimated Eulerian motion field. Through +bidirectional animation techniques, we ultimately generate a 3D Cinemagraph +that exhibits natural and seamlessly loopable dynamics. Experiment results +validate the effectiveness of our approach, demonstrating high-quality and +visually appealing scene generation. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Seeing Text in the Dark: Algorithm and Benchmark + + +
+ Localizing text in low-light environments is challenging due to visual +degradations. Although a straightforward solution involves a two-stage pipeline +with low-light image enhancement (LLE) as the initial step followed by +detector, LLE is primarily designed for human vision instead of machine and can +accumulate errors. In this work, we propose an efficient and effective +single-stage approach for localizing text in dark that circumvents the need for +LLE. We introduce a constrained learning module as an auxiliary mechanism +during the training stage of the text detector. This module is designed to +guide the text detector in preserving textual spatial features amidst feature +map resizing, thus minimizing the loss of spatial information in texts under +low-light visual degradations. Specifically, we incorporate spatial +reconstruction and spatial semantic constraints within this module to ensure +the text detector acquires essential positional and contextual range knowledge. +Our approach enhances the original text detector's ability to identify text's +local topological features using a dynamic snake feature pyramid network and +adopts a bottom-up contour shaping strategy with a novel rectangular +accumulation technique for accurate delineation of streamlined text features. +In addition, we present a comprehensive low-light dataset for arbitrary-shaped +text, encompassing diverse scenes and languages. Notably, our method achieves +state-of-the-art results on this low-light dataset and exhibits comparable +performance on standard normal light datasets. The code and dataset will be +released. + +
+
+
+
+
+ + ☆ Understanding Multimodal Deep Neural Networks: A Concept Selection View + + +
+ The multimodal deep neural networks, represented by CLIP, have generated rich +downstream applications owing to their excellent performance, thus making +understanding the decision-making process of CLIP an essential research topic. +Due to the complex structure and the massive pre-training data, it is often +regarded as a black-box model that is too difficult to understand and +interpret. Concept-based models map the black-box visual representations +extracted by deep neural networks onto a set of human-understandable concepts +and use the concepts to make predictions, enhancing the transparency of the +decision-making process. However, these methods involve the datasets labeled +with fine-grained attributes by expert knowledge, which incur high costs and +introduce excessive human prior knowledge and bias. In this paper, we observe +the long-tail distribution of concepts, based on which we propose a two-stage +Concept Selection Model (CSM) to mine core concepts without introducing any +human priors. The concept greedy rough selection algorithm is applied to +extract head concepts, and then the concept mask fine selection method performs +the extraction of core concepts. Experiments show that our approach achieves +comparable performance to end-to-end black-box models, and human evaluation +demonstrates that the concepts discovered by our method are interpretable and +comprehensible for humans. + +
+
+
+
+
+ + ☆ AMU-Tuning: Effective Logit Bias for CLIP-based Few-shot Learning CVPR 2024 + + +
+ Recently, pre-trained vision-language models (e.g., CLIP) have shown great +potential in few-shot learning and attracted a lot of research interest. +Although efforts have been made to improve few-shot ability of CLIP, key +factors on the effectiveness of existing methods have not been well studied, +limiting further exploration of CLIP's potential in few-shot learning. In this +paper, we first introduce a unified formulation to analyze CLIP-based few-shot +learning methods from a perspective of logit bias, which encourages us to learn +an effective logit bias for further improving performance of CLIP-based +few-shot learning methods. To this end, we disassemble three key components +involved in computation of logit bias (i.e., logit features, logit predictor, +and logit fusion) and empirically analyze the effect on performance of few-shot +classification. Based on analysis of key components, this paper proposes a +novel AMU-Tuning method to learn effective logit bias for CLIP-based few-shot +classification. Specifically, our AMU-Tuning predicts logit bias by exploiting +the appropriate $\underline{\textbf{A}}$uxiliary features, which are fed into +an efficient feature-initialized linear classifier with +$\underline{\textbf{M}}$ulti-branch training. Finally, an +$\underline{\textbf{U}}$ncertainty-based fusion is developed to incorporate +logit bias into CLIP for few-shot classification. The experiments are conducted +on several widely used benchmarks, and the results show AMU-Tuning clearly +outperforms its counterparts while achieving state-of-the-art performance of +CLIP-based few-shot learning without bells and whistles. + +
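+ The logit-bias view described above can be sketched in a few lines: zero-shot
+CLIP logits plus a bias predicted from auxiliary features by a small linear
+head, blended with a fusion weight. The class and variable names and the scalar
+fusion factor are assumptions for illustration, not the exact AMU-Tuning
+formulation.
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LogitBiasHead(nn.Module):
+    def __init__(self, aux_dim, num_classes, alpha=1.0):
+        super().__init__()
+        self.linear = nn.Linear(aux_dim, num_classes)  # trained on few-shot data
+        self.alpha = alpha                             # fusion weight
+
+    def forward(self, clip_image_feats, clip_text_feats, aux_feats):
+        # standard CLIP zero-shot logits: scaled cosine similarity
+        zs_logits = 100.0 * F.normalize(clip_image_feats, dim=-1) @ F.normalize(clip_text_feats, dim=-1).t()
+        bias = self.linear(aux_feats)                  # logit bias from auxiliary features
+        return zs_logits + self.alpha * bias
+
<br>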
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Constructing and Exploring Intermediate Domains in Mixed Domain + Semi-supervised Medical Image Segmentation + + +
+ Both limited annotation and domain shift are prevalent challenges in medical +image segmentation. Traditional semi-supervised segmentation and unsupervised +domain adaptation methods address one of these issues separately. However, the +coexistence of limited annotation and domain shift is quite common, which +motivates us to introduce a novel and challenging scenario: Mixed Domain +Semi-supervised medical image Segmentation (MiDSS). In this scenario, we handle +data from multiple medical centers, with limited annotations available for a +single domain and a large amount of unlabeled data from multiple domains. We +found that the key to solving the problem lies in how to generate reliable +pseudo labels for the unlabeled data in the presence of domain shift with +labeled data. To tackle this issue, we employ Unified Copy-Paste (UCP) between +images to construct intermediate domains, facilitating the knowledge transfer +from the domain of labeled data to the domains of unlabeled data. To fully +utilize the information within the intermediate domain, we propose a symmetric +Guidance training strategy (SymGD), which additionally offers direct guidance +to unlabeled data by merging pseudo labels from intermediate samples. +Subsequently, we introduce a Training Process aware Random Amplitude MixUp +(TP-RAM) to progressively incorporate style-transition components into +intermediate samples. Compared with existing state-of-the-art approaches, our +method achieves a notable 13.57% improvement in Dice score on Prostate dataset, +as demonstrated on three public datasets. Our code is available at +https://github.com/MQinghe/MiDSS . + +
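+ A minimal sketch of the copy-paste idea used above to construct intermediate
+domains, assuming image tensors of shape (C, H, W) with matching label and
+pseudo-label maps; the square region and its relative size are illustrative
+choices, not the exact Unified Copy-Paste recipe.
+
+import torch
+
+def copy_paste(lab_img, lab_mask, unlab_img, pseudo_mask, frac=0.5):
+    """Paste a random square from a labeled image (and its mask) into an
+    unlabeled image (and its pseudo-label map) to create an intermediate sample."""
+    _, H, W = lab_img.shape
+    h, w = int(H * frac), int(W * frac)
+    y = torch.randint(0, H - h + 1, (1,)).item()
+    x = torch.randint(0, W - w + 1, (1,)).item()
+    mixed_img = unlab_img.clone()
+    mixed_lbl = pseudo_mask.clone()
+    mixed_img[:, y:y + h, x:x + w] = lab_img[:, y:y + h, x:x + w]
+    mixed_lbl[y:y + h, x:x + w] = lab_mask[y:y + h, x:x + w]
+    return mixed_img, mixed_lbl
+
<br>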
+
+
+
+
+ + ☆ ChimpVLM: Ethogram-Enhanced Chimpanzee Behaviour Recognition + + +
+ We show that chimpanzee behaviour understanding from camera traps can be +enhanced by providing visual architectures with access to an embedding of text +descriptions that detail species behaviours. In particular, we present a +vision-language model which employs multi-modal decoding of visual features +extracted directly from camera trap videos to process query tokens representing +behaviours and output class predictions. Query tokens are initialised using a +standardised ethogram of chimpanzee behaviour, rather than using random or +name-based initialisations. In addition, the effect of initialising query +tokens using a masked language model fine-tuned on a text corpus of known +behavioural patterns is explored. We evaluate our system on the PanAf500 and +PanAf20K datasets and demonstrate the performance benefits of our multi-modal +decoding approach and query initialisation strategy on multi-class and +multi-label recognition tasks, respectively. Results and ablations corroborate +performance improvements. We achieve state-of-the-art performance over vision +and vision-language models in top-1 accuracy (+6.34%) on PanAf500 and overall +(+1.1%) and tail-class (+2.26%) mean average precision on PanAf20K. We share +complete source code and network weights for full reproducibility of results +and easy utilisation. + +
+
+
+
+
+ + ☆ Shifting Spotlight for Co-supervision: A Simple yet Efficient + Single-branch Network to See Through Camouflage + + +
+ Efficient and accurate camouflaged object detection (COD) poses a challenge in
+the field of computer vision. Recent approaches have explored the utility of
+edge information for network co-supervision, achieving notable advancements.
+However, these approaches introduce an extra branch for complex edge
+extraction, complicating the model architecture and increasing computational
+demands. Addressing this issue, our work replicates the effect that an animal's
+camouflage can be easily revealed under a shifting spotlight, and leverages it
+for network co-supervision to form a compact yet efficient single-branch
+network, the Co-Supervised Spotlight Shifting Network (CS$^3$Net). The
+spotlight shifting strategy allows CS$^3$Net to learn an additional prior
+within a single-branch framework, obviating the need for a resource-demanding
+multi-branch design. To leverage the prior from spotlight-shifting
+co-supervision, we propose the Shadow Refinement Module (SRM) and Projection
+Aware Attention (PAA) for feature refinement and enhancement. To ensure the
+continuity of multi-scale feature aggregation, we utilize the Extended Neighbor
+Connection Decoder (ENCD) to generate the final predictions. Empirical
+evaluations on public datasets confirm that CS$^3$Net offers an optimal balance
+between efficiency and performance: it achieves a 32.13% reduction in
+Multiply-Accumulate operations (MACs) compared to leading efficient COD models
+while also delivering superior performance. + +
<br>
+
+
+
+
+ + ☆ Label-free Anomaly Detection in Aerial Agricultural Images with Masked + Image Modeling CVPR 2024 + + +
+ Detecting various types of stresses (nutritional, water, nitrogen, etc.) in +agricultural fields is critical for farmers to ensure maximum productivity. +However, stresses show up in different shapes and sizes across different crop +types and varieties. Hence, this is posed as an anomaly detection task in +agricultural images. Accurate anomaly detection in agricultural UAV images is +vital for early identification of field irregularities. Traditional supervised +learning faces challenges in adapting to diverse anomalies, necessitating +extensive annotated data. In this work, we overcome this limitation with +self-supervised learning using a masked image modeling approach. Masked +Autoencoders (MAE) extract meaningful normal features from unlabeled image +samples which produces high reconstruction error for the abnormal pixels during +reconstruction. To remove the need of using only ``normal" data while training, +we use an anomaly suppression loss mechanism that effectively minimizes the +reconstruction of anomalous pixels and allows the model to learn anomalous +areas without explicitly separating ``normal" images for training. Evaluation +on the Agriculture-Vision data challenge shows a mIOU score improvement in +comparison to prior state of the art in unsupervised and self-supervised +methods. A single model generalizes across all the anomaly categories in the +Agri-Vision Challenge Dataset + +
+
+ comment: The paper has been accepted to CVPR 2024 5th Workshop on Vision for + Agriculture as an Oral Paper +
+
+
+
+
+ + ☆ DeDoDe v2: Analyzing and Improving the DeDoDe Keypoint Detector CVPR + + +
+ In this paper, we analyze and improve the recently proposed DeDoDe keypoint
+detector. We focus our analysis on several key issues. First, we find that
+DeDoDe keypoints tend to cluster together, which we fix by performing non-max
+suppression on the target distribution of the detector during training. Second,
+we address issues related to data augmentation. In particular, the DeDoDe
+detector is sensitive to large rotations. We fix this by including 90-degree
+rotations as well as horizontal flips. Finally, the decoupled nature of the
+DeDoDe detector makes evaluation of downstream usefulness problematic. We fix
+this by matching the keypoints with a pretrained dense matcher (RoMa) and
+evaluating two-view pose estimates. We find that the original long training is
+detrimental to performance, and therefore propose a much shorter training
+schedule. We integrate all these improvements into our proposed detector,
+DeDoDe v2, and evaluate it with the original DeDoDe descriptor on the
+MegaDepth-1500 and IMC2022 benchmarks. Our proposed detector significantly
+improves pose estimation results, notably from 75.9 to 78.3 mAA on the IMC2022
+challenge. Code and weights are available at https://github.com/Parskatt/DeDoDe + +
<br>
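+ Non-max suppression on a keypoint score map, as used above, can be implemented
+with a max-pooling trick; the window size and score threshold below are
+illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def heatmap_nms(scores, window=5, threshold=0.01):
+    """Keep only local maxima of a (B, 1, H, W) keypoint score map."""
+    local_max = F.max_pool2d(scores, kernel_size=window,
+                             stride=1, padding=window // 2)
+    keep = (scores == local_max) & (scores > threshold)
+    return scores * keep   # non-maxima are zeroed out
+
<br>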
+
+ comment: Accepted to Sixth Workshop on Image Matching - CVPRW 2024 +
+
+
+
+
+ + ☆ Diffusion Models Meet Remote Sensing: Principles, Methods, and + Perspectives + + +
+ As a newly emerging advance in deep generative models, diffusion models have +achieved state-of-the-art results in many fields, including computer vision, +natural language processing, and molecule design. The remote sensing community +has also noticed the powerful ability of diffusion models and quickly applied +them to a variety of tasks for image processing. Given the rapid increase in +research on diffusion models in the field of remote sensing, it is necessary to +conduct a comprehensive review of existing diffusion model-based remote sensing +papers, to help researchers recognize the potential of diffusion models and +provide some directions for further exploration. Specifically, this paper first +introduces the theoretical background of diffusion models, and then +systematically reviews the applications of diffusion models in remote sensing, +including image generation, enhancement, and interpretation. Finally, the +limitations of existing remote sensing diffusion models and worthy research +directions for further exploration are discussed and summarized. + +
+
+
+
+
+ + ☆ Trustworthy Multimodal Fusion for Sentiment Analysis in Ordinal + Sentiment Space + + +
+ Multimodal video sentiment analysis aims to integrate multiple modal +information to analyze the opinions and attitudes of speakers. Most previous +work focuses on exploring the semantic interactions of intra- and +inter-modality. However, these works ignore the reliability of multimodality, +i.e., modalities tend to contain noise, semantic ambiguity, missing modalities, +etc. In addition, previous multimodal approaches treat different modalities +equally, largely ignoring their different contributions. Furthermore, existing +multimodal sentiment analysis methods directly regress sentiment scores without +considering ordinal relationships within sentiment categories, with limited +performance. To address the aforementioned problems, we propose a trustworthy +multimodal sentiment ordinal network (TMSON) to improve performance in +sentiment analysis. Specifically, we first devise a unimodal feature extractor +for each modality to obtain modality-specific features. Then, an uncertainty +distribution estimation network is customized, which estimates the unimodal +uncertainty distributions. Next, Bayesian fusion is performed on the learned +unimodal distributions to obtain multimodal distributions for sentiment +prediction. Finally, an ordinal-aware sentiment space is constructed, where +ordinal regression is used to constrain the multimodal distributions. Our +proposed TMSON outperforms baselines on multimodal sentiment analysis tasks, +and empirical results demonstrate that TMSON is capable of reducing uncertainty +to obtain more robust predictions. + +
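+ Under a common simplifying assumption that each unimodal head outputs a
+Gaussian (mean, variance) over the sentiment score, the Bayesian fusion step
+described above reduces to precision-weighted averaging; this sketch shows that
+generic form, not necessarily TMSON's exact fusion rule.
+
+import torch
+
+def fuse_gaussians(means, variances):
+    """Precision-weighted fusion of per-modality Gaussian predictions.
+    means, variances: tensors of shape (num_modalities, batch)."""
+    precision = 1.0 / variances.clamp_min(1e-6)
+    fused_var = 1.0 / precision.sum(dim=0)
+    fused_mean = fused_var * (precision * means).sum(dim=0)
+    return fused_mean, fused_var   # lower fused variance = more confident prediction
+
<br>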
+
+ comment: 14 pages, 9 figures, Accepted by IEEE Transactions on Circuits and + Systems for Video Technology +
+
+
+
+
+ + ☆ PNeRV: Enhancing Spatial Consistency via Pyramidal Neural Representation + for Videos + + +
+ The primary focus of Neural Representation for Videos (NeRV) is to +effectively model its spatiotemporal consistency. However, current NeRV systems +often face a significant issue of spatial inconsistency, leading to decreased +perceptual quality. To address this issue, we introduce the Pyramidal Neural +Representation for Videos (PNeRV), which is built on a multi-scale information +connection and comprises a lightweight rescaling operator, Kronecker +Fully-connected layer (KFc), and a Benign Selective Memory (BSM) mechanism. The +KFc, inspired by the tensor decomposition of the vanilla Fully-connected layer, +facilitates low-cost rescaling and global correlation modeling. BSM merges +high-level features with granular ones adaptively. Furthermore, we provide an +analysis based on the Universal Approximation Theory of the NeRV system and +validate the effectiveness of the proposed PNeRV.We conducted comprehensive +experiments to demonstrate that PNeRV surpasses the performance of contemporary +NeRV models, achieving the best results in video regression on UVG and DAVIS +under various metrics (PSNR, SSIM, LPIPS, and FVD). Compared to vanilla NeRV, +PNeRV achieves a +4.49 dB gain in PSNR and a 231% increase in FVD on UVG, along +with a +3.28 dB PSNR and 634% FVD increase on DAVIS. + +
+
+
+
+
+ + ☆ MAProtoNet: A Multi-scale Attentive Interpretable Prototypical Part + Network for 3D Magnetic Resonance Imaging Brain Tumor Classification + + +
+ Automated diagnosis with artificial intelligence has emerged as a promising
+area in medical imaging, while the interpretability of the deep neural networks
+involved remains an urgent concern. Although contemporary works such as
+XProtoNet and MProtoNet have sought to design interpretable prediction models
+for this issue, the localization precision of their resulting attribution maps
+can be further improved. To this end, we propose a Multi-scale Attentive
+Prototypical part Network, termed MAProtoNet, to provide more precise
+attribution maps. Specifically, we introduce a concise multi-scale module that
+merges attentive features from quadruplet attention layers and produces
+attribution maps. The proposed quadruplet attention layers enhance the existing
+online class activation mapping loss by capturing interactions between the
+spatial and channel dimensions, while the multi-scale module fuses both
+fine-grained and coarse-grained information for precise map generation. We also
+apply a novel multi-scale mapping loss to supervise the proposed multi-scale
+module. Compared to existing interpretable prototypical part networks in
+medical imaging, MAProtoNet achieves state-of-the-art localization performance
+on the brain tumor segmentation (BraTS) datasets, with an approximately 4%
+overall improvement in activation precision score (best score 85.8%), without
+using additional annotated segmentation labels. Our code will be released at
+https://github.com/TUAT-Novice/maprotonet. + +
<br>
+
+
+
+
+ + ☆ Meply: A Large-scale Dataset and Baseline Evaluations for Metastatic + Perirectal Lymph Node Detection and Segmentation + + +
+ Accurate segmentation of metastatic lymph nodes in rectal cancer is crucial +for the staging and treatment of rectal cancer. However, existing segmentation +approaches face challenges due to the absence of pixel-level annotated datasets +tailored for lymph nodes around the rectum. Additionally, metastatic lymph +nodes are characterized by their relatively small size, irregular shapes, and +lower contrast compared to the background, further complicating the +segmentation task. To address these challenges, we present the first +large-scale perirectal metastatic lymph node CT image dataset called Meply, +which encompasses pixel-level annotations of 269 patients diagnosed with rectal +cancer. Furthermore, we introduce a novel lymph-node segmentation model named +CoSAM. The CoSAM utilizes sequence-based detection to guide the segmentation of +metastatic lymph nodes in rectal cancer, contributing to improved localization +performance for the segmentation model. It comprises three key components: +sequence-based detection module, segmentation module, and collaborative +convergence unit. To evaluate the effectiveness of CoSAM, we systematically +compare its performance with several popular segmentation methods using the +Meply dataset. Our code and dataset will be publicly available at: +https://github.com/kanydao/CoSAM. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ PM2: A New Prompting Multi-modal Model Paradigm for Few-shot Medical + Image Classification + + +
+ Few-shot learning has been successfully applied to medical image
+classification, since only very few medical examples are available for
+training. Owing to the limited number of annotated medical images, image
+representations should not be derived solely from a single image modality,
+which is insufficient for characterizing concept classes. In this paper, we
+propose a new prompting multi-modal model paradigm for medical image
+classification based on multi-modal foundation models, called PM2. Besides the
+image modality, PM2 introduces a supplementary text input, known as a prompt,
+to further describe the corresponding image or concept classes and to
+facilitate few-shot learning across diverse modalities. To better explore the
+potential of prompt engineering, we empirically investigate five distinct
+prompt schemes under the new paradigm. Furthermore, linear probing in
+multi-modal models acts as a linear classification head that takes only the
+class token as input, completely ignoring the rich statistics inherent in
+high-level visual tokens. We therefore perform linear classification on the
+feature distribution of visual tokens and on the class token simultaneously. To
+effectively mine such rich statistics, global covariance pooling with efficient
+matrix power normalization is used to aggregate the visual tokens. We then
+study and combine two classification heads: one is shared between the class
+token of the image from the vision encoder and the prompt representation
+encoded by the text encoder; the other classifies the feature distribution of
+visual tokens from the vision encoder. Extensive experiments on three medical
+datasets show that PM2 significantly outperforms its counterparts regardless of
+prompt scheme and achieves state-of-the-art performance. + +
<br>
+
+
+
+
+ + ☆ HEAT: Head-level Parameter Efficient Adaptation of Vision Transformers + with Taylor-expansion Importance Scores + + +
+ Prior computer vision research has extensively explored adapting pre-trained
+vision transformers (ViT) to downstream tasks. However, the substantial number
+of parameters requiring adaptation has led to a focus on Parameter Efficient
+Transfer Learning (PETL), which adapts large pre-trained models by training
+only a subset of parameters, achieving both parameter and storage efficiency.
+Although the significantly reduced parameter count has shown promising
+performance in transfer learning scenarios, the structural redundancy inherent
+in the model still leaves room for improvement and warrants further
+investigation. In this paper, we propose Head-level Efficient Adaptation with
+Taylor-expansion importance score (HEAT): a simple method that efficiently
+fine-tunes ViTs at the head level. In particular, a first-order Taylor
+expansion is employed to calculate each head's importance score, termed the
+Taylor-expansion Importance Score (TIS), indicating its contribution to
+specific tasks. Additionally, three strategies for calculating TIS are employed
+to maximize its effectiveness; these strategies compute TIS from different
+perspectives, reflecting the varying contributions of parameters. Besides ViT,
+HEAT has also been applied to hierarchical transformers such as the Swin
+Transformer, demonstrating its versatility across different transformer
+architectures. Through extensive experiments, HEAT demonstrates superior
+performance over state-of-the-art PETL methods on the VTAB-1K benchmark. + +
<br>
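+ A first-order Taylor importance score of the kind described above can be
+estimated per attention head from one backward pass; the sketch below assumes
+the head's weight tensors have already been collected in a list head_params
+(how they are gathered is left out) and simply accumulates |w * dL/dw| over
+them, which is one common variant rather than HEAT's exact TIS.
+
+import torch
+
+def taylor_importance(head_params, loss):
+    """First-order Taylor importance of one attention head:
+    sum over its parameters of |w * dL/dw| (illustrative variant)."""
+    grads = torch.autograd.grad(loss, head_params, retain_graph=True)
+    score = 0.0
+    for w, g in zip(head_params, grads):
+        score += (w * g).abs().sum().item()
+    return score
+
+# rank heads by their scores, then fine-tune only the most important ones
+
<br>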
+
+
+
+
+ + ☆ ChangeAnywhere: Sample Generation for Remote Sensing Change Detection + via Semantic Latent Diffusion Model + + +
+ Remote sensing change detection (CD) is a pivotal technique that pinpoints +changes on a global scale based on multi-temporal images. With the recent +expansion of deep learning, supervised deep learning-based CD models have shown +satisfactory performance. However, CD sample labeling is very time-consuming as +it is densely labeled and requires expert knowledge. To alleviate this problem, +we introduce ChangeAnywhere, a novel CD sample generation method using the +semantic latent diffusion model and single-temporal images. Specifically, +ChangeAnywhere leverages the relative ease of acquiring large single-temporal +semantic datasets to generate large-scale, diverse, and semantically annotated +bi-temporal CD datasets. ChangeAnywhere captures the two essentials of CD +samples, i.e., change implies semantically different, and non-change implies +reasonable change under the same semantic constraints. We generated +ChangeAnywhere-100K, the largest synthesis CD dataset with 100,000 pairs of CD +samples based on the proposed method. The ChangeAnywhere-100K significantly +improved both zero-shot and few-shot performance on two CD benchmark datasets +for various deep learning-based CD models, as demonstrated by transfer +experiments. This paper delineates the enormous potential of ChangeAnywhere for +CD sample generation and demonstrates the subsequent enhancement of model +performance. Therefore, ChangeAnywhere offers a potent tool for remote sensing +CD. All codes and pre-trained models will be available at +https://github.com/tangkai-RS/ChangeAnywhere. + +
+
+ comment: Concise manuscript version of ChangeAnywhere +
+
+
+
+
+ + ☆ EIVEN: Efficient Implicit Attribute Value Extraction using Multimodal + LLM NAACL 2024 + + +
+ In e-commerce, accurately extracting product attribute values from multimodal +data is crucial for improving user experience and operational efficiency of +retailers. However, previous approaches to multimodal attribute value +extraction often struggle with implicit attribute values embedded in images or +text, rely heavily on extensive labeled data, and can easily confuse similar +attribute values. To address these issues, we introduce EIVEN, a data- and +parameter-efficient generative framework that pioneers the use of multimodal +LLM for implicit attribute value extraction. EIVEN leverages the rich inherent +knowledge of a pre-trained LLM and vision encoder to reduce reliance on labeled +data. We also introduce a novel Learning-by-Comparison technique to reduce +model confusion by enforcing attribute value comparison and difference +identification. Additionally, we construct initial open-source datasets for +multimodal implicit attribute value extraction. Our extensive experiments +reveal that EIVEN significantly outperforms existing methods in extracting +implicit attribute values while requiring less labeled data. + +
+
+ comment: Accepted by NAACL 2024 Industry Track +
+
+
+
+
+ + ☆ A Lightweight Spatiotemporal Network for Online Eye Tracking with Event + Camera + + +
+ Event-based data are commonly encountered in edge computing environments +where efficiency and low latency are critical. To interface with such data and +leverage their rich temporal features, we propose a causal spatiotemporal +convolutional network. This solution targets efficient implementation on +edge-appropriate hardware with limited resources in three ways: 1) deliberately +targets a simple architecture and set of operations (convolutions, ReLU +activations) 2) can be configured to perform online inference efficiently via +buffering of layer outputs 3) can achieve more than 90% activation sparsity +through regularization during training, enabling very significant efficiency +gains on event-based processors. In addition, we propose a general affine +augmentation strategy acting directly on the events, which alleviates the +problem of dataset scarcity for event-based systems. We apply our model on the +AIS 2024 event-based eye tracking challenge, reaching a score of 0.9916 p10 +accuracy on the Kaggle private testset. + +
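+ Buffered online inference for a causal temporal convolution, as point 2) in
+the list above describes, can be sketched with a rolling buffer of past
+timesteps; the kernel size, channel count, and buffer layout are illustrative
+assumptions rather than the challenge entry's exact architecture.
+
+import torch
+import torch.nn as nn
+
+class StreamingCausalConv1d(nn.Module):
+    """Causal temporal convolution that processes one timestep at a time by
+    buffering the previous (kernel_size - 1) inputs."""
+    def __init__(self, channels, kernel_size=3):
+        super().__init__()
+        self.conv = nn.Conv1d(channels, channels, kernel_size)
+        self.register_buffer("buf", torch.zeros(1, channels, kernel_size - 1))
+
+    def forward_step(self, x_t):
+        # x_t: (1, channels, 1) -- the current timestep only
+        window = torch.cat([self.buf, x_t], dim=2)   # (1, C, kernel_size)
+        self.buf = window[:, :, 1:].detach()         # slide the buffer forward
+        return torch.relu(self.conv(window))         # (1, C, 1)
+
<br>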
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Multimodal Attack Detection for Action Recognition Models + + +
+ Adversarial machine learning attacks on video action recognition models are a
+growing research area, and many effective attacks have been introduced in
+recent years. These attacks show that action recognition models can be breached
+in many ways, so using them in practice raises significant security concerns.
+However, very few works focus on defending against or detecting such attacks.
+In this work, we propose a novel universal detection method that is compatible
+with any action recognition model. In extensive experiments, we show that our
+method consistently detects various attacks against different target models
+with high true positive rates while maintaining very low false positive rates.
+Tested against four state-of-the-art attacks targeting four action recognition
+models, the proposed detector achieves an average AUC of 0.911 over 16 test
+cases, whereas the best performance achieved by existing detectors is an
+average AUC of 0.645. This 41.2% improvement is enabled by the robustness of
+the proposed detector to varying attack methods and target models: the lowest
+AUC achieved by our detector across the 16 test cases is 0.837, while the
+competing detector's performance drops as low as 0.211. We also show that the
+proposed detector is robust to varying attack strengths. In addition, we
+analyze our method's real-time performance with different hardware setups to
+demonstrate its potential as a practical defense mechanism. + +
<br>
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over +recent years, the potential of rich synergic relationships across the +modalities is not fully exploited. In this paper, we introduce Recursive Joint +Cross-Modal Attention (RJCMA) to effectively capture both intra- and +inter-modal relationships across audio, visual, and text modalities for +dimensional emotion recognition. In particular, we compute the attention +weights based on cross-correlation between the joint audio-visual-text feature +representations and the feature representations of individual modalities to +simultaneously capture intra- and intermodal relationships across the +modalities. The attended features of the individual modalities are again fed as +input to the fusion model in a recursive mechanism to obtain more refined +feature representations. We have also explored Temporal Convolutional Networks +(TCNs) to improve the temporal modeling of the feature representations of +individual modalities. Extensive experiments are conducted to evaluate the +performance of the proposed fusion model on the challenging Affwild2 dataset. +By effectively capturing the synergic intra- and inter-modal relationships +across audio, visual, and text modalities, the proposed fusion model achieves a +Concordance Correlation Coefficient (CCC) of 0.585 (0.542) and 0.674 (0.619) +for valence and arousal respectively on the validation set(test set). This +shows a significant improvement over the baseline of 0.240 (0.211) and 0.200 +(0.191) for valence and arousal, respectively, in the validation set (test +set), achieving second place in the valence-arousal challenge of the 6th +Affective Behavior Analysis in-the-Wild (ABAW) competition. + +
+
+
+
+
+ + ♻ ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN 2024 + + +
+ Amodal Instance Segmentation (AIS) is a challenging task, as it involves
+predicting both the visible and occluded parts of objects within images.
+Existing AIS methods rely on a bidirectional approach, encompassing both the
+transition from amodal features to visible features (amodal-to-visible) and
+from visible features to amodal features (visible-to-amodal). Our observation
+is that using amodal features through the amodal-to-visible transition can
+confuse the visible features, because the extra information about
+occluded/hidden segments is not present in the visible appearance;
+consequently, the quality of the visible features is compromised during the
+subsequent visible-to-amodal transition. To tackle this issue, we introduce
+ShapeFormer, a decoupled Transformer-based model with a visible-to-amodal
+transition. It makes the relationship between the output segmentations explicit
+and avoids the need for amodal-to-visible transitions. ShapeFormer comprises
+three key modules: (i) a Visible-Occluding Mask Head for predicting visible
+segmentation with occlusion awareness, (ii) a Shape-Prior Amodal Mask Head for
+predicting amodal and occluded masks, and (iii) a Category-Specific Shape Prior
+Retriever that provides shape prior knowledge. Comprehensive experiments and
+extensive ablation studies across various AIS benchmarks demonstrate the
+effectiveness of ShapeFormer. The code is available at:
+https://github.com/UARK-AICV/ShapeFormer + +
<br>
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing ``image +hallucination'' and risking misdiagnosis. We hypothesize such hallucinations +result from local OOD regions in the conditional images. We verify that +partitioning the OOD region and conducting separate image generations +alleviates hallucinations in several applications. From this, we propose a +training-free diffusion framework that reduces hallucination with multiple +Local Diffusion processes. Our approach involves OOD estimation followed by two +modules: a ``branching'' module generates locally both within and outside OOD +regions, and a ``fusion'' module integrates these predictions into one. Our +evaluation shows our method mitigates hallucination over baseline models +quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the +real-world medical and natural image datasets, respectively. It also +demonstrates compatibility with various pre-trained diffusion models. + +
+
+
+
+
+ + ♻ ☆ Dynamic Clue Bottlenecks: Towards Interpretable-by-Design Visual + Question Answering + + +
+ Recent advances in multimodal large language models (LLMs) have shown great
+effectiveness in visual question answering (VQA). However, the end-to-end
+design of these models prevents them from being interpretable to humans,
+undermining trust and applicability in critical domains. While post-hoc
+rationales offer some insight into model behavior, these explanations are not
+guaranteed to be faithful to the model. In this paper, we address these
+shortcomings by introducing an interpretable-by-design model that factors model
+decisions into intermediate human-legible explanations, allowing people to
+easily understand why a model fails or succeeds. We propose the Dynamic Clue
+Bottleneck Model (DCLUB), a method designed towards an inherently interpretable
+VQA system. DCLUB provides an explainable intermediate space before the VQA
+decision and is faithful from the beginning, while maintaining performance
+comparable to black-box systems. Given a question, DCLUB first returns a set of
+visual clues: natural language statements of visually salient evidence from the
+image, and then generates the output based solely on the visual clues. To
+supervise and evaluate the generation of VQA explanations within DCLUB, we
+collect a dataset of 1.7k reasoning-focused questions with visual clues.
+Evaluations show that our inherently interpretable system improves by 4.64%
+over a comparable black-box system on reasoning-focused questions while
+preserving 99.43% of the performance on VQA-v2. + +
<br>
+
+ comment: Multimodal, Visual Question Answering, Vision and Language +
+
+
+
+
+ + ♻ ☆ When are Lemons Purple? The Concept Association Bias of Vision-Language + Models EMNLP 2023 + + +
+ Large-scale vision-language models such as CLIP have shown impressive +performance on zero-shot image classification and image-to-text retrieval. +However, such performance does not realize in tasks that require a +finer-grained correspondence between vision and language, such as Visual +Question Answering (VQA). As a potential cause of the difficulty of applying +these models to VQA and similar tasks, we report an interesting phenomenon of +vision-language models, which we call the Concept Association Bias (CAB). We +find that models with CAB tend to treat input as a bag of concepts and attempt +to fill in the other missing concept crossmodally, leading to an unexpected +zero-shot prediction. We demonstrate CAB by showing that CLIP's zero-shot +classification performance greatly suffers when there is a strong concept +association between an object (e.g. eggplant) and an attribute (e.g. color +purple). We also show that the strength of CAB predicts the performance on VQA. +We observe that CAB is prevalent in vision-language models trained with +contrastive losses, even when autoregressive losses are jointly employed. +However, a model that solely relies on autoregressive loss seems to exhibit +minimal or no signs of CAB. + +
+
+ comment: EMNLP 2023 main +
+
+
+
+
+ + ♻ ☆ Objects With Lighting: A Real-World Dataset for Evaluating + Reconstruction and Rendering for Object Relighting 3DV 2024 + + +
+ Reconstructing an object from photos and placing it virtually in a new +environment goes beyond the standard novel view synthesis task as the +appearance of the object has to not only adapt to the novel viewpoint but also +to the new lighting conditions and yet evaluations of inverse rendering methods +rely on novel view synthesis data or simplistic synthetic datasets for +quantitative analysis. This work presents a real-world dataset for measuring +the reconstruction and rendering of objects for relighting. To this end, we +capture the environment lighting and ground truth images of the same objects in +multiple environments allowing to reconstruct the objects from images taken in +one environment and quantify the quality of the rendered views for the unseen +lighting environments. Further, we introduce a simple baseline composed of +off-the-shelf methods and test several state-of-the-art methods on the +relighting task and show that novel view synthesis is not a reliable proxy to +measure performance. Code and dataset are available at +https://github.com/isl-org/objects-with-lighting . + +
+
+ comment: Accepted at 3DV 2024, Oral presentation. For the project page see + https://github.com/isl-org/objects-with-lighting +
+
+
+
+
+ + ♻ ☆ IRAD: Implicit Representation-driven Image Resampling against + Adversarial Attacks + + +
+ We introduce a novel approach to counter adversarial attacks, namely, image +resampling. Image resampling transforms a discrete image into a new one, +simulating the process of scene recapturing or rerendering as specified by a +geometrical transformation. The underlying rationale behind our idea is that +image resampling can alleviate the influence of adversarial perturbations while +preserving essential semantic information, thereby conferring an inherent +advantage in defending against adversarial attacks. To validate this concept, +we present a comprehensive study on leveraging image resampling to defend +against adversarial attacks. We have developed basic resampling methods that +employ interpolation strategies and coordinate shifting magnitudes. Our +analysis reveals that these basic methods can partially mitigate adversarial +attacks. However, they come with apparent limitations: the accuracy of clean +images noticeably decreases, while the improvement in accuracy on adversarial +examples is not substantial. We propose implicit representation-driven image +resampling (IRAD) to overcome these limitations. First, we construct an +implicit continuous representation that enables us to represent any input image +within a continuous coordinate space. Second, we introduce SampleNet, which +automatically generates pixel-wise shifts for resampling in response to +different inputs. Furthermore, we can extend our approach to the +state-of-the-art diffusion-based method, accelerating it with fewer time steps +while preserving its defense capability. Extensive experiments demonstrate that +our method significantly enhances the adversarial robustness of diverse deep +models against various attacks while maintaining high accuracy on clean images. + +
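+ A basic version of the resampling defense discussed above, i.e., re-sampling
+the image on slightly shifted coordinates with bilinear interpolation, can be
+written with grid_sample; the shift magnitude is an illustrative assumption,
+and this is the simple baseline variant rather than the learned SampleNet.
+
+import torch
+import torch.nn.functional as F
+
+def resample_image(img, max_shift=0.01):
+    """Bilinearly resample a (B, C, H, W) image on randomly jittered coordinates."""
+    B, _, H, W = img.shape
+    ys = torch.linspace(-1, 1, H, device=img.device)
+    xs = torch.linspace(-1, 1, W, device=img.device)
+    gy, gx = torch.meshgrid(ys, xs, indexing="ij")
+    grid = torch.stack((gx, gy), dim=-1).unsqueeze(0).expand(B, H, W, 2)
+    jitter = (torch.rand_like(grid) * 2 - 1) * max_shift   # small per-pixel shifts
+    return F.grid_sample(img, grid + jitter, mode="bilinear",
+                         padding_mode="border", align_corners=True)
+
<br>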
+
+
+
+
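+ A minimal PyTorch sketch of the basic resampling idea described in the IRAD entry above: the image is re-rendered by bilinear interpolation at slightly shifted coordinates before being passed to the classifier. The shift magnitude and the use of grid_sample are illustrative assumptions, not the authors' implementation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def resample_image(image: torch.Tensor, epsilon: float = 0.01) -> torch.Tensor:
+     """Re-render an image by sampling it at slightly shifted coordinates.
+
+     image: (N, C, H, W) tensor in [0, 1]; epsilon controls the shift magnitude
+     in normalized coordinates (both are illustrative choices).
+     """
+     n, _, h, w = image.shape
+     # Base sampling grid covering the image in normalized [-1, 1] coordinates.
+     ys = torch.linspace(-1.0, 1.0, h, device=image.device)
+     xs = torch.linspace(-1.0, 1.0, w, device=image.device)
+     grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")
+     grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0).expand(n, -1, -1, -1)
+     # Small random per-pixel coordinate shifts simulate re-capturing the scene.
+     shift = (torch.rand_like(grid) * 2.0 - 1.0) * epsilon
+     # Bilinear interpolation at the shifted coordinates.
+     return F.grid_sample(image, grid + shift, mode="bilinear",
+                          padding_mode="border", align_corners=True)
+
+ # Example usage: logits = classifier(resample_image(adversarial_batch))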
+ + ♻ ☆ PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection + Features and Variable Receptive Field Voxel Features + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous +driving. However, real-time inference from extremely sparse 3D data is a +formidable challenge. To address this problem, a typical class of approaches +transforms the point cloud into a regular data representation (voxels or +projection maps). Then, it performs feature extraction with convolutional +neural networks. However, such methods often result in a certain degree of +information loss due to down-sampling or over-compression of feature +information. This paper proposes a multi-modal point cloud feature fusion +method for projection features and variable receptive field voxel features +(PV-SSD) based on projection and variable voxelization to solve the information +loss problem. We design a two-branch feature extraction structure with a 2D +convolutional neural network to extract the point cloud's projection features +in bird's-eye view to focus on the correlation between local features. A voxel +feature extraction branch is used to extract local fine-grained features. +Meanwhile, we propose a voxel feature extraction method with variable receptive +fields to reduce the information loss of the voxel branch due to downsampling. It +avoids missing critical point information by selecting more useful feature +points based on feature point weights for the detection task. In addition, we +propose a multi-modal feature fusion module for point clouds. To validate the +effectiveness of our method, we tested it on the KITTI dataset and ONCE +dataset. +
+
+
+
+
+ + ♻ ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which +were originally designed for large language models (LLMs), can be adapted to +the computer vision field. We first "LLaMAfy" a standard ViT step-by-step to +align with LLaMA's architecture, and find that directly applying a causal mask +to the self-attention brings an attention collapse issue, resulting in the +failure of network training. We suggest repositioning the class token +behind the image tokens with a post-sequence class token technique to overcome +this challenge, enabling causal self-attention to efficiently capture the +entire image's information. Additionally, we develop a soft mask strategy that +gradually introduces a causal mask to the self-attention at the onset of +training to facilitate the optimization behavior. The tailored model, dubbed +image LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct +supervised learning. Its causal self-attention boosts computational efficiency +and learns complex representations by elevating attention map ranks. iLLaMA +rivals the performance of its encoder-only counterparts, achieving 75.1% +ImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M +and pre-training on ImageNet-21K further enhances the accuracy to 86.0%. +Extensive experiments demonstrate iLLaMA's reliable properties: calibration, +shape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR +transfer learning. We hope our study can kindle fresh views on visual model +design in the wave of LLMs. Pre-trained models and codes are available here. +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
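+ The post-sequence class token idea described in the iLLaMA entry above can be sketched as follows: the class token is appended after the image tokens so that, under a causal mask, it can still attend to every image token. The tensor shapes and mask construction are illustrative assumptions, not the paper's code.
+
+ import torch
+
+ def build_causal_sequence(image_tokens: torch.Tensor, cls_token: torch.Tensor):
+     """Append the class token AFTER the image tokens and build a causal mask.
+
+     image_tokens: (N, L, D); cls_token: (1, 1, D). Because the class token is
+     last, causal self-attention lets it see the entire image.
+     """
+     n, l, _ = image_tokens.shape
+     tokens = torch.cat([image_tokens, cls_token.expand(n, -1, -1)], dim=1)  # (N, L+1, D)
+     # True marks future positions that must be masked out.
+     future = torch.triu(torch.ones(l + 1, l + 1, dtype=torch.bool), diagonal=1)
+     return tokens, ~future  # ~future: True where attention is allowed
+
+ # tokens, allowed = build_causal_sequence(patch_embeddings, cls)
+ # out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=allowed)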
+ + ♻ ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) +have taken the world by storm with impressive abilities in complex reasoning +and linguistic comprehension. While there is a plethora of work related to +Vietnamese Large Language Models, the lack of high-quality resources in +multimodality limits the progress of Vietnamese MLLMs. In this paper, we +pioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese +MLLM, and we also introduce the LaVy-Bench benchmark, designed for evaluating +MLLMs' understanding of Vietnamese visual language tasks. All code and model +weights are publicly available at https://github.com/baochi0212/LaVy +
+
+ comment: 4 pages +
+
+
+
+
+ + ♻ ☆ Detoxifying Large Language Models via Knowledge Editing + + +
+ This paper investigates using knowledge editing techniques to detoxify Large +Language Models (LLMs). We construct a benchmark, SafeEdit, which covers nine +unsafe categories with various powerful attack prompts and equips comprehensive +metrics for systematic evaluation. We conduct experiments with several +knowledge editing approaches, indicating that knowledge editing has the +potential to efficiently detoxify LLMs with limited impact on general +performance. Then, we propose a simple yet effective baseline, dubbed +Detoxifying with Intraoperative Neural Monitoring (DINM), to diminish the +toxicity of LLMs within a few tuning steps via only one instance. We further +provide an in-depth analysis of the internal mechanism for various detoxifying +approaches, demonstrating that previous methods like SFT and DPO may merely +suppress the activations of toxic parameters, while DINM mitigates the toxicity +of the toxic parameters to a certain extent, making permanent adjustments. We +hope that these insights could shed light on future work of developing +detoxifying approaches and the underlying knowledge mechanisms of LLMs. Code +and benchmark are available at https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work. Project website: + https://zjunlp.github.io/project/SafeEdit Add and update experimental results + in Tables 1 and 3 +
+
+
+
+
+ + ♻ ☆ Inconsistency Masks: Removing the Uncertainty from Input-Pseudo-Label + Pairs + + +
+ Efficiently generating sufficient labeled data remains a major bottleneck in +deep learning, particularly for image segmentation tasks where labeling +requires significant time and effort. This study tackles this issue in a +resource-constrained environment, devoid of extensive datasets or pre-existing +models. We introduce Inconsistency Masks (IM), a novel approach that filters +uncertainty in image-pseudo-label pairs to substantially enhance segmentation +quality, surpassing traditional semi-supervised learning techniques. Employing +IM, we achieve strong segmentation results with as little as 10% labeled data +across four diverse datasets, and it further benefits from integration with +other techniques, indicating broad applicability. Notably, on the ISIC 2018 +dataset, three of our hybrid approaches even outperform models trained on the +fully labeled dataset. We also present a detailed comparative analysis of +prevalent semi-supervised learning strategies, all under uniform starting +conditions, to underline our approach's effectiveness and robustness. The full +code is available at: https://github.com/MichaelVorndran/InconsistencyMasks +
+
+
+
+
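+ A minimal sketch of the inconsistency-mask idea from the entry above: pixels where two models' pseudo-labels disagree are marked as "ignore" before the pseudo-labels are reused for training. The ignore index and the use of exactly two models are illustrative assumptions rather than the authors' exact procedure.
+
+ import torch
+
+ IGNORE_INDEX = 255  # assumed ignore label for the segmentation loss
+
+ def pseudo_labels_with_inconsistency_mask(logits_a: torch.Tensor,
+                                           logits_b: torch.Tensor) -> torch.Tensor:
+     """Combine two models' predictions into pseudo-labels, masking disagreement.
+
+     logits_*: (N, C, H, W). Pixels where the two argmax predictions differ are
+     treated as uncertain and excluded from the loss via IGNORE_INDEX.
+     """
+     pred_a = logits_a.argmax(dim=1)
+     pred_b = logits_b.argmax(dim=1)
+     pseudo = pred_a.clone()
+     pseudo[pred_a != pred_b] = IGNORE_INDEX  # the "inconsistency mask"
+     return pseudo
+
+ # loss = torch.nn.functional.cross_entropy(student_logits, pseudo, ignore_index=IGNORE_INDEX)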
+ + ♻ ☆ G-ACIL: Analytic Learning for Exemplar-Free Generalized Class + Incremental Learning + + +
+ Class incremental learning (CIL) trains a network on sequential tasks with +separated categories but suffers from catastrophic forgetting, where models +quickly lose previously learned knowledge when acquiring new tasks. The +generalized CIL (GCIL) aims to address the CIL problem in a more real-world +scenario, where incoming data have mixed data categories and unknown sample +size distribution, leading to intensified forgetting. Existing attempts for the +GCIL either have poor performance, or invade data privacy by saving historical +exemplars. To address this, in this paper, we propose an exemplar-free +generalized analytic class incremental learning (G-ACIL). The G-ACIL adopts +analytic learning (a gradient-free training technique), and delivers an +analytical solution (i.e., closed-form) to the GCIL scenario. This solution is +derived via decomposing the incoming data into exposed and unexposed classes, +allowing an equivalence between the incremental learning and its joint +training, i.e., the weight-invariant property. Such an equivalence is +theoretically validated through matrix analysis tools, and hence contributes +interpretability in GCIL. It is also empirically evidenced by experiments on +various datasets and settings of GCIL. The results show that the G-ACIL +exhibits leading performance with high robustness compared with existing +competitive GCIL methods. Codes will be ready at +\url{https://github.com/ZHUANGHP/Analytic-continual-learning}. + +
+
+
+
+
+ + ♻ ☆ CoLLaVO: Crayon Large Language and Vision mOdel + + +
+ The remarkable success of Large Language Models (LLMs) and instruction tuning +drives the evolution of Vision Language Models (VLMs) towards a versatile +general-purpose model. Yet, it remains unexplored whether current VLMs +genuinely possess quality object-level image understanding capabilities +determined from `what objects are in the image?' or `which object corresponds +to a specified bounding box?'. Our findings reveal that the image understanding +capabilities of current VLMs are strongly correlated with their zero-shot +performance on vision language (VL) tasks. This suggests that prioritizing +basic image understanding is crucial for VLMs to excel at VL tasks. To enhance +object-level image understanding, we propose Crayon Large Language and Vision +mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a +new visual prompt tuning scheme based on panoptic color maps. Furthermore, we +present a learning strategy of Dual QLoRA to preserve object-level image +understanding without forgetting it during visual instruction tuning, thereby +achieving a significant leap in numerous VL benchmarks in a zero-shot setting. + +
+
+ comment: Code available: https://github.com/ByungKwanLee/CoLLaVO +
+
+
+
+
+ + ♻ ☆ VeCAF: Vision-language Collaborative Active Finetuning with Training + Objective Awareness + + +
+ Finetuning a pretrained vision model (PVM) is a common technique for learning +downstream vision tasks. However, the conventional finetuning process with +randomly sampled data points results in diminished training efficiency. To +address this drawback, we propose a novel approach, Vision-language +Collaborative Active Finetuning (VeCAF). With the emerging availability of +labels and natural language annotations of images through web-scale crawling or +controlled generation, VeCAF makes use of this information to perform +parametric data selection for PVM finetuning. VeCAF incorporates the finetuning +objective to select significant data points that effectively guide the PVM +towards faster convergence to meet the performance goal. This process is +assisted by the inherent semantic richness of the text embedding space, which we +use to augment image features. Furthermore, the flexibility of text-domain +augmentation allows VeCAF to handle out-of-distribution scenarios without +external data. Extensive experiments show the leading performance and high +computational efficiency of VeCAF, which is superior to baselines in both +in-distribution and out-of-distribution image classification tasks. On +ImageNet, VeCAF uses up to 3.3x fewer training batches to reach the target +performance compared to full finetuning, and achieves an accuracy improvement +of 2.7% over the state-of-the-art active finetuning method with the same number +of batches. +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ FM-G-CAM: A Holistic Approach for Explainable AI in Computer Vision + + +
+ Explainability is an aspect of modern AI that is vital for impact and +usability in the real world. The main objective of this paper is to emphasise +the need to understand the predictions of Computer Vision models, specifically +Convolutional Neural Network (CNN) based models. Existing methods of explaining +CNN predictions are mostly based on Gradient-weighted Class Activation Maps +(Grad-CAM) and solely focus on a single target class. We show that from the +point of the target class selection, we make an assumption on the prediction +process, hence neglecting a large portion of the predictor CNN model's thinking +process. In this paper, we present an exhaustive methodology called Fused +Multi-class Gradient-weighted Class Activation Map (FM-G-CAM) that considers +multiple top predicted classes, which provides a holistic explanation of the +predictor CNN's thinking rationale. We also provide a detailed and +comprehensive mathematical and algorithmic description of our method. +Furthermore, along with a concise comparison of existing methods, we compare +FM-G-CAM with Grad-CAM, highlighting its benefits through real-world practical +use cases. Finally, we present an open-source Python library with FM-G-CAM +implementation to conveniently generate saliency maps for CNN-based model +predictions. + +
+
+
+
+
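+ The entry above describes fusing Grad-CAM style maps over several top predicted classes instead of a single target class. A simplified sketch follows; the softmax-based fusion at the end is an illustrative choice and may differ from the exact FM-G-CAM formulation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def multi_class_grad_cam(model, image, target_layer, top_k=3):
+     """Grad-CAM saliency for the top-k predicted classes, fused into one map.
+
+     model: CNN in eval mode; image: (1, C, H, W); target_layer: the conv layer
+     whose activations are explained.
+     """
+     feats = {}
+     handle = target_layer.register_forward_hook(
+         lambda module, inp, out: feats.update(act=out))
+     logits = model(image)
+     handle.remove()
+     act = feats["act"]                                        # (1, K, h, w)
+     top_scores, top_classes = logits[0].topk(top_k)
+
+     cams = []
+     for score in top_scores:
+         grads = torch.autograd.grad(score, act, retain_graph=True)[0]
+         weights = grads.mean(dim=(2, 3), keepdim=True)        # GAP over space
+         cams.append(F.relu((weights * act).sum(dim=1)))       # (1, h, w)
+     cams = torch.stack(cams, dim=1)                           # (1, top_k, h, w)
+     # Softly weight each class map by its per-pixel relative strength, then fuse.
+     fused = (cams.softmax(dim=1) * cams).sum(dim=1)
+     return fused, top_classes                                 # (1, h, w) saliency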
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific +pedestrians in occluded situations. However, occluded person ReID still suffers +from background clutter and low-quality local feature representations, which +limit model performance. In our research, we introduce a new framework called +PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms +to tackle the aforementioned issues effectively. Firstly, we introduce the +human parsing label to guide the generation of more accurate human part +attention maps. In addition, we propose a fine-grained feature focuser for +generating fine-grained human local feature representations while suppressing +background interference. Moreover, we also design a part triplet loss to +supervise the learning of human local features, which optimizes +intra/inter-class distance. We conducted extensive experiments on specialized +occlusion and regular ReID datasets, showcasing that our approach outperforms +the existing state-of-the-art methods. +
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
+ + ♻ ☆ Motion2VecSets: 4D Latent Vector Set Diffusion for Non-rigid Shape + Reconstruction and Tracking + + +
+ We introduce Motion2VecSets, a 4D diffusion model for dynamic surface +reconstruction from point cloud sequences. While existing state-of-the-art +methods have demonstrated success in reconstructing non-rigid objects using +neural field representations, conventional feed-forward networks encounter +challenges with ambiguous observations from noisy, partial, or sparse point +clouds. To address these challenges, we introduce a diffusion model that +explicitly learns the shape and motion distribution of non-rigid objects +through an iterative denoising process of compressed latent representations. +The diffusion-based priors enable more plausible and probabilistic +reconstructions when handling ambiguous inputs. We parameterize 4D dynamics +with latent sets instead of using global latent codes. This novel 4D +representation allows us to learn local shape and deformation patterns, leading +to more accurate non-linear motion capture and significantly improving +generalizability to unseen motions and identities. For more temporally-coherent +object tracking, we synchronously denoise deformation latent sets and exchange +information across multiple frames. To avoid computational overhead, we +designed an interleaved space and time attention block to alternately aggregate +deformation latents along spatial and temporal domains. Extensive comparisons +against state-of-the-art methods demonstrate the superiority of our +Motion2VecSets in 4D reconstruction from various imperfect observations. More +detailed information can be found at +https://vveicao.github.io/projects/Motion2VecSets/. + +
+
+
+
+
+ + ♻ ☆ Unraveling Batch Normalization for Realistic Test-Time Adaptation AAAI 2024 + + +
+ While recent test-time adaptations exhibit efficacy by adjusting batch +normalization to narrow domain disparities, their effectiveness diminishes with +realistic mini-batches due to inaccurate target estimation. As previous +attempts merely introduce source statistics to mitigate this issue, the +fundamental problem of inaccurate target estimation still persists, leaving the +intrinsic test-time domain shifts unresolved. This paper delves into the +problem of mini-batch degradation. By unraveling batch normalization, we +discover that the inexact target statistics largely stem from the substantially +reduced class diversity in batch. Drawing upon this insight, we introduce a +straightforward tool, Test-time Exponential Moving Average (TEMA), to bridge +the class diversity gap between training and testing batches. Importantly, our +TEMA adaptively extends the scope of typical methods beyond the current batch +to incorporate a diverse set of class information, which in turn boosts an +accurate target estimation. Built upon this foundation, we further design a +novel layer-wise rectification strategy to consistently promote test-time +performance. Our proposed method enjoys a unique advantage as it requires +neither training nor tuning parameters, offering a truly hassle-free solution. +It significantly enhances model robustness against shifted domains and +maintains resilience in diverse real-world scenarios with various batch sizes, +achieving state-of-the-art performance on several major benchmarks. Code is +available at \url{https://github.com/kiwi12138/RealisticTTA}. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
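+ A minimal sketch of the exponential-moving-average idea described in the entry above, assuming the statistics are accumulated over test batches and initialized from the source (training) statistics; the momentum value is an illustrative choice, not the paper's setting.
+
+ import torch
+ import torch.nn as nn
+
+ class TestTimeEMABatchNorm(nn.Module):
+     """Wrap a trained, affine BatchNorm2d and normalize test batches with an EMA
+     of test-time statistics rather than the current batch statistics alone."""
+
+     def __init__(self, bn: nn.BatchNorm2d, momentum: float = 0.1):
+         super().__init__()
+         self.bn = bn
+         self.momentum = momentum
+         self.register_buffer("ema_mean", bn.running_mean.clone())
+         self.register_buffer("ema_var", bn.running_var.clone())
+
+     @torch.no_grad()
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         batch_mean = x.mean(dim=(0, 2, 3))
+         batch_var = x.var(dim=(0, 2, 3), unbiased=False)
+         # Accumulate statistics across test batches to widen class diversity.
+         self.ema_mean.mul_(1 - self.momentum).add_(self.momentum * batch_mean)
+         self.ema_var.mul_(1 - self.momentum).add_(self.momentum * batch_var)
+         x_hat = (x - self.ema_mean[None, :, None, None]) / torch.sqrt(
+             self.ema_var[None, :, None, None] + self.bn.eps)
+         return x_hat * self.bn.weight[None, :, None, None] + self.bn.bias[None, :, None, None]
+
+ # Usage: replace each BatchNorm2d layer `bn` in the model with TestTimeEMABatchNorm(bn).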
+ + ♻ ☆ GauU-Scene V2: Assessing the Reliability of Image-Based Metrics with + Expansive Lidar Image Dataset Using 3DGS and NeRF + + +
+ We introduce a novel, multimodal large-scale scene reconstruction benchmark +that utilizes newly developed 3D representation approaches: Gaussian Splatting +and Neural Radiance Fields (NeRF). Our expansive U-Scene dataset surpasses any +previously existing real large-scale outdoor LiDAR and image dataset in both +area and point count. GauU-Scene encompasses over 6.5 square kilometers and +features a comprehensive RGB dataset coupled with LiDAR ground truth. +Additionally, we are the first to propose a LiDAR and image alignment method +for a drone-based dataset. Our assessment of GauU-Scene includes a detailed +analysis across various novel viewpoints, employing image-based metrics such as +SSIM, LPIPS, and PSNR on NeRF and Gaussian Splatting based methods. This +analysis reveals contradictory results when applying geometric-based metrics +like Chamfer distance. The experimental results on our multimodal dataset +highlight the unreliability of current image-based metrics and reveal +significant drawbacks in geometric reconstruction using the current Gaussian +Splatting-based method, further illustrating the necessity of our dataset for +assessing geometry reconstruction tasks. We also provide detailed supplementary +information on data collection protocols and make the dataset available on the +following anonymous project page + +
+
+
+
+
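+ To make the contrast drawn above between image-based and geometry-based metrics concrete, here is a minimal sketch of PSNR and a brute-force symmetric Chamfer distance; these are standard definitions, not the benchmark's evaluation code.
+
+ import torch
+
+ def psnr(pred: torch.Tensor, target: torch.Tensor, max_val: float = 1.0) -> torch.Tensor:
+     """Image-based metric: peak signal-to-noise ratio between a rendered view
+     and the ground-truth image, both in [0, max_val]."""
+     mse = torch.mean((pred - target) ** 2)
+     return 10.0 * torch.log10(max_val ** 2 / mse)
+
+ def chamfer_distance(points_a: torch.Tensor, points_b: torch.Tensor) -> torch.Tensor:
+     """Geometry-based metric: symmetric Chamfer distance between two point sets
+     of shape (N, 3) and (M, 3). Brute-force version for modest point counts."""
+     dists = torch.cdist(points_a, points_b)          # (N, M) pairwise distances
+     return dists.min(dim=1).values.mean() + dists.min(dim=0).values.mean()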
+ + ♻ ☆ Weakly-Supervised 3D Visual Grounding based on Visual Linguistic + Alignment + + +
+ Learning to ground natural language queries to target objects or regions in +3D point clouds is quite essential for 3D scene understanding. Nevertheless, +existing 3D visual grounding approaches require a substantial number of +bounding box annotations for text queries, which is time-consuming and +labor-intensive to obtain. In this paper, we propose \textbf{3D-VLA}, a weakly +supervised approach for \textbf{3D} visual grounding based on \textbf{V}isual +\textbf{L}inguistic \textbf{A}lignment. Our 3D-VLA exploits the superior +ability of current large-scale vision-language models (VLMs) on aligning the +semantics between texts and 2D images, as well as the naturally existing +correspondences between 2D images and 3D point clouds, and thus implicitly +constructs correspondences between texts and 3D point clouds with no need for +fine-grained box annotations in the training procedure. During the inference +stage, the learned text-3D correspondence will help us ground the text queries +to the 3D target objects even without 2D images. To the best of our knowledge, +this is the first work to investigate 3D visual grounding in a weakly +supervised manner by involving large scale vision-language models, and +extensive experiments on ReferIt3D and ScanRefer datasets demonstrate that our +3D-VLA achieves comparable and even superior results over the fully supervised +methods. + +
+
+
+
+
+ + ♻ ☆ Recent Advances in 3D Gaussian Splatting + + +
+ The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the +rendering speed of novel view synthesis. Unlike neural implicit representations +like Neural Radiance Fields (NeRF) that represent a 3D scene with position and +viewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of +Gaussian ellipsoids to model the scene so that efficient rendering can be +accomplished by rasterizing Gaussian ellipsoids into images. Apart from the +fast rendering speed, the explicit representation of 3D Gaussian Splatting +facilitates editing tasks like dynamic reconstruction, geometry editing, and +physical simulation. Considering the rapid change and growing number of works +in this field, we present a literature review of recent 3D Gaussian Splatting +methods, which can be roughly classified into 3D reconstruction, 3D editing, +and other downstream applications by functionality. Traditional point-based +rendering methods and the rendering formulation of 3D Gaussian Splatting are +also illustrated for a better understanding of this technique. This survey aims +to help beginners get into this field quickly and provide experienced +researchers with a comprehensive overview, which can stimulate the future +development of the 3D Gaussian Splatting representation. + +
+
+
+
+
+ + ♻ ☆ FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained + Diffusion Models and Monocular Depth Estimators ICLR 2024 + + +
+ Matching cross-modality features between images and point clouds is a +fundamental problem for image-to-point cloud registration. However, due to the +modality difference between images and points, it is difficult to learn robust +and discriminative cross-modality features by existing metric learning methods +for feature matching. Instead of applying metric learning on cross-modality +data, we propose to unify the modality between images and point clouds by +pretrained large-scale models first, and then establish robust correspondence +within the same modality. We show that the intermediate features, called +diffusion features, extracted by depth-to-image diffusion models are +semantically consistent between images and point clouds, which enables the +building of coarse but robust cross-modality correspondences. We further +extract geometric features on depth maps produced by the monocular depth +estimator. By matching such geometric features, we significantly improve the +accuracy of the coarse correspondences produced by diffusion features. +Extensive experiments demonstrate that without any task-specific training, +direct utilization of both features produces accurate image-to-point cloud +registration. On three public indoor and outdoor benchmarks, the proposed +method achieves, on average, a 20.6 percent improvement in Inlier Ratio, a +three-fold higher Inlier Number, and a 48.6 percent improvement in Registration +Recall over existing state-of-the-art methods. +
+
+ comment: CameraReady version for ICLR 2024. Project Page: + https://whu-usi3dv.github.io/FreeReg/ +
+
+
+
+
+ + ♻ ☆ EfficientDM: Efficient Quantization-Aware Fine-Tuning of Low-Bit + Diffusion Models ICLR 2024 + + +
+ Diffusion models have demonstrated remarkable capabilities in image synthesis +and related generative tasks. Nevertheless, their practicality for real-world +applications is constrained by substantial computational costs and latency +issues. Quantization is a dominant way to compress and accelerate diffusion +models, where post-training quantization (PTQ) and quantization-aware training +(QAT) are two main approaches, each bearing its own properties. While PTQ +exhibits efficiency in terms of both time and data usage, it may lead to +diminished performance at low bit-widths. On the other hand, QAT can alleviate +performance degradation but comes with substantial demands on computational and +data resources. In this paper, we introduce a data-free and parameter-efficient +fine-tuning framework for low-bit diffusion models, dubbed EfficientDM, to +achieve QAT-level performance with PTQ-like efficiency. Specifically, we +propose a quantization-aware variant of the low-rank adapter (QALoRA) that can +be merged with model weights and jointly quantized to low bit-width. The +fine-tuning process distills the denoising capabilities of the full-precision +model into its quantized counterpart, eliminating the requirement for training +data. We also introduce scale-aware optimization and temporal learned step-size +quantization to further enhance performance. Extensive experimental results +demonstrate that our method significantly outperforms previous PTQ-based +diffusion models while maintaining similar time and data efficiency. +Specifically, there is only a 0.05 sFID increase when quantizing both weights +and activations of LDM-4 to 4-bit on ImageNet 256x256. Compared to QAT-based +methods, our EfficientDM also boasts a 16.2x faster quantization speed with +comparable generation quality. Code is available at +\href{https://github.com/ThisisBillhe/EfficientDM}{this url}. +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ A Specific Task-oriented Semantic Image Communication System for + substation patrol inspection + + +
+ Intelligent inspection robots are widely used in substation patrol +inspection, where they can help check potential safety hazards by patrolling the +substation and sending back scene images. However, when patrolling some +marginal areas with weak signal, the scene images cannot be successfully +transmitted to be used for hidden danger elimination, which greatly reduces +the quality of the robots' daily work. To solve this problem, a Specific +Task-oriented Semantic Communication system for Images (STSCI) is designed, which +involves semantic feature extraction, transmission, restoration and +enhancement to get clearer images sent by intelligent robots under weak +signals. Inspired by the fact that only some specific details of the image are needed in +such a substation patrol inspection task, we propose a new paradigm of semantic +enhancement in such a specific task to ensure the clarity of key semantic +information when facing a lower bit rate or a low signal-to-noise ratio +situation. In reality-based simulations, experiments show our STSCI can +generally surpass traditional image-compression-based and channel-coding-based +or other semantic communication systems in the substation patrol inspection task +with a lower bit rate, even under a low signal-to-noise ratio situation. +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Visual Tuning + + +
+ Fine-tuning visual models has been widely shown to achieve promising performance on many +downstream visual tasks. With the surprising development of pre-trained visual +foundation models, visual tuning jumped out of the standard modus operandi that +fine-tunes the whole pre-trained model or just the fully connected layer. +Instead, recent advances can achieve performance superior to tuning all of the +pre-trained parameters by updating far fewer parameters, enabling edge +devices and downstream applications to reuse the increasingly large foundation +models deployed on the cloud. With the aim of helping researchers get the full +picture and future directions of visual tuning, this survey characterizes a +large and thoughtful selection of recent works, providing a systematic and +comprehensive overview of existing work and models. Specifically, it provides a +detailed background of visual tuning and categorizes recent visual tuning +techniques into five groups: prompt tuning, adapter tuning, parameter tuning, +and remapping tuning. Meanwhile, it offers some exciting research directions +for prospective pre-training and various interactions in visual tuning. +
+
+ comment: 37 pages. Accepted to ACM CSUR +
+
+
+
+
+ + ♻ ☆ The Method of Detecting Flying Birds in Surveillance Video Based on + Their Characteristics + + +
+ Aiming at the characteristics of flying bird objects in surveillance +video, namely that single-frame image features are not obvious, the size is small +in most cases, and the shape is asymmetric, this paper proposes a Flying Bird Object +Detection method for Surveillance Video (FBOD-SV). Firstly, a new feature +aggregation module, the Correlation Attention Feature Aggregation +(Co-Attention-FA) module, is designed to aggregate the features of the flying +bird object according to the bird object's correlation across multiple consecutive +frames of images. Secondly, a Flying Bird Object Detection Network (FBOD-Net) +with down-sampling and then up-sampling is designed, which uses a large feature +layer that fuses fine spatial information and large receptive field information +to detect special multi-scale (mostly small-scale) bird objects. Finally, the +SimOTA dynamic label allocation method is applied to One-Category object +detection, and the SimOTA-OC dynamic label strategy is proposed to solve the +difficult problem of label allocation caused by irregular flying bird objects. +The algorithm's performance is verified on an experimental dataset of +surveillance video of flying bird objects at a traction +substation. The experimental results show that the surveillance video flying +bird object detection method proposed in this paper effectively improves the +detection performance of flying bird objects. +
+
+
+
+
+ + ♻ ☆ UNK-VQA: A Dataset and a Probe into the Abstention Ability of + Multi-modal Large Models + + +
+ Teaching Visual Question Answering (VQA) models to refrain from answering +unanswerable questions is necessary for building a trustworthy AI system. +Existing studies, though having explored various aspects of VQA, have somewhat +ignored this particular attribute. This paper aims to bridge the research gap +by contributing a comprehensive dataset, called UNK-VQA. The dataset is +specifically designed to address the challenge of questions that models do not +know. To this end, we first augment the existing data via deliberate +perturbations on either the image or question. Specifically, we carefully ensure +that the question-image semantics remain close to the original unperturbed +distribution. By this means, the identification of unanswerable questions +becomes challenging, setting our dataset apart from others that involve mere +image replacement. We then extensively evaluate the zero- and few-shot +performance of several emerging multi-modal large models and discover their +significant limitations when applied to our dataset. Additionally, we also +propose a straightforward method to tackle these unanswerable questions. This +dataset, we believe, will serve as a valuable benchmark for enhancing the +abstention capability of VQA models, thereby leading to increased +trustworthiness of AI systems. We have made the dataset +(https://github.com/guoyang9/UNK-VQA) available to facilitate further +exploration in this area. +
+
+
+
+
+ + ♻ ☆ M$^{2}$Chat: Empowering VLM for Multimodal LLM Interleaved Text-Image + Generation + + +
+ While current LLM chatbots like GPT-4V bridge the gap between human +instructions and visual representations to enable text-image generations, they +still lack efficient alignment methods for high-fidelity performance on +multiple downstream tasks. In this paper, we propose \textbf{$M^{2}Chat$}, a +novel unified multimodal LLM framework for generating interleaved text-image +conversations across various scenarios. Specifically, we propose an +$M^{3}Adapter$ that efficiently integrates granular low-level visual +information and high-level semantic features from multi-modality prompts. Upon +the well-aligned fused feature, $M^{3}Adapter$ tailors a learnable gating +strategy to balance the model creativity and consistency across various tasks +adaptively. Moreover, to further enhance the effectiveness of $M^{3}Adapter$ +while preserving the coherence of semantic context comprehension, we introduce +a two-stage $M^{3}FT$ fine-tuning strategy. This strategy optimizes disjoint +groups of parameters for image-text alignment and visual instruction tuning, +respectively. Extensive experiments demonstrate that our $M^{2}Chat$ surpasses +state-of-the-art counterparts across diverse benchmarks, showcasing its prowess +in interleaving generation, storytelling, and multimodal dialogue systems. The +demo and code are available at +https://mattie-e.github.io/M2Chat.github.io. +
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct visibility of the +background scene and degrade image quality. Despite recent progress in image +deraining methods and datasets, there is a lack of focus on raindrop removal +from UAV aerial imagery due to the unique challenges posed by varying angles +and rapid movement during drone flight. To fill the gap in this research, we +first construct a new benchmark dataset for removing raindrops from UAV images, +called UAV-Rain1k. In this letter, we provide a dataset generation pipeline, +which includes modeling raindrop shapes using Blender, collecting background +images from various UAV angles, random sampling of rain masks, and so on. Based on +the proposed benchmark, we further present a comprehensive evaluation of +existing representative image deraining algorithms, and reveal future research +opportunities worth exploring. The proposed dataset is publicly available at +https://github.com/cschenxiang/UAV-Rain1k. +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Multi-scale Attention Network for Single Image Super-Resolution + + +
+ ConvNets can compete with transformers in high-level tasks by exploiting +larger receptive fields. To unleash the potential of ConvNets in +super-resolution, we propose a multi-scale attention network (MAN), by coupling +the classical multi-scale mechanism with emerging large kernel attention. In +particular, we propose multi-scale large kernel attention (MLKA) and a gated +spatial attention unit (GSAU). Through our MLKA, we modify large kernel +attention with multi-scale and gate schemes to obtain abundant attention +maps at various granularity levels, thereby aggregating global and local +information and avoiding potential blocking artifacts. In GSAU, we integrate a +gating mechanism and spatial attention to remove the unnecessary linear layer and +aggregate informative spatial context. To confirm the effectiveness of our +designs, we evaluate MAN with multiple complexities by simply stacking +different numbers of MLKA and GSAU. Experimental results illustrate that our +MAN can perform on par with SwinIR and achieve varied trade-offs between +state-of-the-art performance and computations. +
+
+
+
+
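+ As a rough illustration of the building block behind the entry above, the sketch below shows a single-scale large kernel attention module in the commonly used decomposed form (depthwise conv, dilated depthwise conv, pointwise conv); MLKA extends this with multi-scale branches and gating, and the kernel sizes here are illustrative assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ class LargeKernelAttention(nn.Module):
+     """Approximate a large-kernel attention map with a depthwise 5x5 conv, a
+     dilated depthwise 7x7 conv, and a 1x1 conv, then gate the input with it."""
+
+     def __init__(self, channels: int):
+         super().__init__()
+         self.dw = nn.Conv2d(channels, channels, 5, padding=2, groups=channels)
+         self.dw_dilated = nn.Conv2d(channels, channels, 7, padding=9,
+                                     dilation=3, groups=channels)
+         self.pw = nn.Conv2d(channels, channels, 1)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         attn = self.pw(self.dw_dilated(self.dw(x)))
+         return attn * x  # the attention map modulates the input features
+
+ # block = LargeKernelAttention(64); y = block(torch.randn(1, 64, 48, 48))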
+ + ♻ ☆ Generating Enhanced Negatives for Training Language-Based Object + Detectors CVPR 2024 + + +
+ The recent progress in language-based open-vocabulary object detection can be +largely attributed to finding better ways of leveraging large-scale data with +free-form text annotations. Training such models with a discriminative +objective function has proven successful, but requires good positive and +negative samples. However, the free-form nature and the open vocabulary of +object descriptions make the space of negatives extremely large. Prior works +randomly sample negatives or use rule-based techniques to build them. In +contrast, we propose to leverage the vast knowledge built into modern +generative models to automatically build negatives that are more relevant to +the original data. Specifically, we use large-language-models to generate +negative text descriptions, and text-to-image diffusion models to also generate +corresponding negative images. Our experimental analysis confirms the relevance +of the generated negative data, and its use in language-based detectors +improves performance on two complex benchmarks. Code is available at +\url{https://github.com/xiaofeng94/Gen-Enhanced-Negs}. + +
+
+ comment: Accepted to CVPR 2024. The supplementary document included +
+
+
+
+
+ + ♻ ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous +practical applications. Due to the expensive collection and labeling costs, +this community faces a major bottleneck: the species categories in +its datasets are limited to a small number of object species. However, the +existing camouflaged generation methods require specifying the background +manually, thus failing to extend the camouflaged sample diversity in a low-cost +manner. In this paper, we propose a Latent Background Knowledge +Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To +our knowledge, our contributions mainly include: (1) For the first time, we +propose a camouflaged generation paradigm that does not need to receive any +background inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented +method with interpretability for camouflaged generation, in which knowledge +retrieval and reasoning enhancement are explicitly separated to alleviate +task-specific challenges. Moreover, our method is +not restricted to specific foreground targets or backgrounds, offering a +potential for extending camouflaged vision perception to more diverse domains. +(3) Experimental results demonstrate that our method outperforms the existing +approaches, generating more realistic camouflage images. +
+
+ comment: Accepted by CVPR 2024, Fig.3 revised +
+
+
+
+
+ + ♻ ☆ Taming Self-Training for Open-Vocabulary Object Detection CVPR 2024 + + +
+ Recent studies have shown promising performance in open-vocabulary object +detection (OVD) by utilizing pseudo labels (PLs) from pretrained vision and +language models (VLMs). However, teacher-student self-training, a powerful and +widely used paradigm to leverage PLs, is rarely explored for OVD. This work +identifies two challenges of using self-training in OVD: noisy PLs from VLMs +and frequent distribution changes of PLs. To address these challenges, we +propose SAS-Det that tames self-training for OVD from two key perspectives. +First, we present a split-and-fusion (SAF) head that splits a standard +detection head into an open-branch and a closed-branch. This design can reduce noisy +supervision from pseudo boxes. Moreover, the two branches learn complementary +knowledge from different training data, significantly enhancing performance +when fused together. Second, in our view, unlike in closed-set tasks, the PL +distributions in OVD are solely determined by the teacher model. We introduce a +periodic update strategy to decrease the number of updates to the teacher, +thereby decreasing the frequency of changes in PL distributions, which +stabilizes the training process. Extensive experiments demonstrate SAS-Det is +both efficient and effective. SAS-Det outperforms recent models of the same +scale by a clear margin and achieves 37.4 AP50 and 29.1 APr on novel categories +of the COCO and LVIS benchmarks, respectively. Code is available at +\url{https://github.com/xiaofeng94/SAS-Det}. +
+
+ comment: Accepted to CVPR 2024. The supplementary document included +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Road Network Graph Extraction CVPR + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
+
+ comment: Accepted by IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR) 2024, 2nd Workshop on Scene Graphs and Graph Representation + Learning +
+
+
+
+
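+ The vertex-extraction step described above ("extracted via simple non-maximum suppression") can be sketched with a max-pooling based local-maximum filter on the predicted probability mask; the window size and threshold are illustrative assumptions, not the SAM-Road settings.
+
+ import torch
+ import torch.nn.functional as F
+
+ def extract_vertices(prob: torch.Tensor, threshold: float = 0.5, window: int = 7) -> torch.Tensor:
+     """Pick local maxima of an (H, W) probability mask as graph vertices.
+
+     A pixel is kept if it is the maximum of its window x window neighborhood
+     and its probability exceeds the threshold.
+     """
+     pooled = F.max_pool2d(prob[None, None], kernel_size=window,
+                           stride=1, padding=window // 2)[0, 0]
+     keep = (prob == pooled) & (prob > threshold)
+     ys, xs = torch.nonzero(keep, as_tuple=True)
+     return torch.stack([xs, ys], dim=1)  # (V, 2) vertex coordinates as (x, y)
+
+ # vertices = extract_vertices(intersection_probability_map)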
+ + ♻ ☆ ComCLIP: Training-Free Compositional Image and Text Matching + + +
+ Contrastive Language-Image Pretraining (CLIP) has demonstrated great +zero-shot performance for matching images and text. However, it is still +challenging to adapt vision-language pretrained models like CLIP to +compositional image and text matching -- a more challenging image and text +matching task requiring the model's understanding of compositional word concepts +and visual components. Towards better compositional generalization in zero-shot +image and text matching, in this paper, we study the problem from a causal +perspective: the erroneous semantics of individual entities are essentially +confounders that cause the matching failure. Therefore, we propose a novel +\textbf{\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP +disentangles input images into subjects, objects, and action sub-images and +composes CLIP's vision encoder and text encoder to perform evolving matching +over compositional text embedding and sub-image embeddings. In this way, +ComCLIP can mitigate spurious correlations introduced by the pretrained CLIP +models and dynamically evaluate the importance of each component. Experiments +on four compositional image-text matching datasets: SVO, ComVG, Winoground, and +VL-checklist, and two general image-text retrieval datasets: Flickr30K and +MSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts +the \textbf{\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even +without further training or fine-tuning. Our codes can be found at +https://github.com/eric-ai-lab/ComCLIP. +
+
+
+
+
+ + ♻ ☆ NICEST: Noisy Label Correction and Training for Robust Scene Graph + Generation CVPR'22 + + +
+ Nearly all existing scene graph generation (SGG) models have overlooked the +ground-truth annotation qualities of mainstream SGG datasets, i.e., they +assume: 1) all the manually annotated positive samples are equally correct; 2) +all the un-annotated negative samples are absolutely background. In this paper, +we argue that neither of the assumptions applies to SGG: there are numerous +noisy ground-truth predicate labels that break these two assumptions and harm +the training of unbiased SGG models. To this end, we propose a novel NoIsy +label CorrEction and Sample Training strategy for SGG: NICEST. Specifically, it +consists of two parts: NICE and NIST, which rule out these noisy label issues +by generating high-quality samples and the effective training strategy, +respectively. NICE first detects noisy samples and then reassigns them more +high-quality soft predicate labels. NIST is a multi-teacher knowledge +distillation based training strategy, which enables the model to learn unbiased +fusion knowledge. And a dynamic trade-off weighting strategy in NIST is +designed to penalize the bias of different teachers. Due to the model-agnostic +nature of both NICE and NIST, our NICEST can be seamlessly incorporated into +any SGG architecture to boost its performance on different predicate +categories. In addition, to better evaluate the generalization of SGG models, +we further propose a new benchmark VG-OOD, by re-organizing the prevalent VG +dataset and deliberately making the predicate distributions of the training and +test sets as different as possible for each subject-object category pair. This +new benchmark helps disentangle the influence of subject-object category based +frequency biases. Extensive ablations and results on different backbones and +tasks have attested to the effectiveness and generalization ability of each +component of NICEST. + +
+
+ comment: Extension of CVPR'22 work (The Devil is in the Labels: Noisy Label + Correction for Robust Scene Graph Generation). arXiv admin note: substantial + text overlap with arXiv:2206.03014 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 148 + +
+
+
+ + ☆ EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams CVPR + + +
+ Monocular egocentric 3D human motion capture is a challenging and actively +researched problem. Existing methods use synchronously operating visual sensors +(e.g. RGB cameras) and often fail under low lighting and fast motions, which +can be restricting in many applications involving head-mounted devices. In +response to the existing limitations, this paper 1) introduces a new problem, +i.e., 3D human motion capture from an egocentric monocular event camera with a +fisheye lens, and 2) proposes the first approach to it called EventEgo3D +(EE3D). Event streams have high temporal resolution and provide reliable cues +for 3D human motion capture under high-speed human motions and rapidly changing +illumination. The proposed EE3D framework is specifically tailored for learning +with event streams in the LNES representation, enabling high 3D reconstruction +accuracy. We also design a prototype of a mobile head-mounted device with an +event camera and record a real dataset with event observations and the +ground-truth 3D human poses (in addition to the synthetic dataset). Our EE3D +demonstrates robustness and superior 3D accuracy compared to existing solutions +across various challenging experiments while supporting real-time 3D pose +update rates of 140Hz. + +
+
+ comment: 14 pages, 11 figures and 6 tables; project page: + https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ COCONut: Modernizing COCO Segmentation CVPR2024 + + +
+ In recent decades, the vision community has witnessed remarkable progress in +visual recognition, partially owing to advancements in dataset benchmarks. +Notably, the established COCO benchmark has propelled the development of modern +detection and segmentation systems. However, the COCO segmentation benchmark +has seen comparatively slow improvement over the last decade. Originally +equipped with coarse polygon annotations for thing instances, it gradually +incorporated coarse superpixel annotations for stuff regions, which were +subsequently heuristically amalgamated to yield panoptic segmentation +annotations. These annotations, executed by different groups of raters, have +resulted not only in coarse segmentation masks but also in inconsistencies +between segmentation types. In this study, we undertake a comprehensive +reevaluation of the COCO segmentation annotations. By enhancing the annotation +quality and expanding the dataset to encompass 383K images with more than 5.18M +panoptic masks, we introduce COCONut, the COCO Next Universal segmenTation +dataset. COCONut harmonizes segmentation annotations across semantic, instance, +and panoptic segmentation with meticulously crafted high-quality masks, and +establishes a robust benchmark for all segmentation tasks. To our knowledge, +COCONut stands as the inaugural large-scale universal segmentation dataset, +verified by human raters. We anticipate that the release of COCONut will +significantly contribute to the community's ability to assess the progress of +novel neural networks. + +
+
+ comment: Accepted at CVPR2024, data available at + https://xdeng7.github.io/coconut.github.io/ +
+
+
+
+
+ + ☆ Probing the 3D Awareness of Visual Foundation Models CVPR 2024 + + +
+ Recent advances in large-scale pretraining have yielded visual foundation +models with strong capabilities. Not only can recent models generalize to +arbitrary images for their training task, their intermediate representations +are useful for other visual tasks such as detection and segmentation. Given +that such models can classify, delineate, and localize objects in 2D, we ask +whether they also represent their 3D structure? In this work, we analyze the 3D +awareness of visual foundation models. We posit that 3D awareness implies that +representations (1) encode the 3D structure of the scene and (2) consistently +represent the surface across views. We conduct a series of experiments using +task-specific probes and zero-shot inference procedures on frozen features. Our +experiments reveal several limitations of the current models. Our code and +analysis can be found at https://github.com/mbanani/probe3d. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://github.com/mbanani/probe3d +
+
+
+
+
+ + ☆ Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin + Lymphoma Patients Using a Longitudinally-Aware Segmentation Network + + +
+ $\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET +scans for lymphoma patients has proven challenging, as residual disease in +interim-therapy scans is often subtle and difficult to detect. Our goal was to +develop a longitudinally-aware segmentation network (LAS-Net) that can quantify +serial PET/CT images for pediatric Hodgkin lymphoma patients. +$\textbf{Materials and Methods}$: This retrospective study included baseline +(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two +Children's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net +incorporates longitudinal cross-attention, allowing relevant features from PET1 +to inform the analysis of PET2. Model performance was evaluated using Dice +coefficients for PET1 and detection F1 scores for PET2. Additionally, we +extracted and compared quantitative PET metrics, including metabolic tumor +volume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and +$\Delta$SUVmax in PET2, against physician measurements. We quantified their +agreement using Spearman's $\rho$ correlations and employed bootstrap +resampling for statistical analysis. $\textbf{Results}$: LAS-Net detected +residual lymphoma in PET2 with an F1 score of 0.606 (precision/recall: +0.615/0.600), outperforming all comparator methods (P<0.01). For baseline +segmentation, LAS-Net achieved a mean Dice score of 0.772. In PET +quantification, LAS-Net's measurements of qPET, $\Delta$SUVmax, MTV and TLG +were strongly correlated with physician measurements, with Spearman's $\rho$ of +0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a +slight decrease, in an external testing cohort. $\textbf{Conclusion}$: LAS-Net +achieved high performance in quantifying PET metrics across serial scans, +highlighting the value of longitudinal awareness in evaluating multi-time-point +imaging datasets. + +
+
+ comment: 6 figures, 4 tables in the main text +
+
+
+
+
+ + ☆ Training-free Boost for Open-Vocabulary Object Detection with Confidence + Aggregation + + +
+ Open-vocabulary object detection (OVOD) aims at localizing and recognizing +visual objects from novel classes unseen at training time. However, +empirical studies reveal that advanced detectors generally assign lower scores +to those novel instances, which are inadvertently suppressed during inference +by commonly adopted greedy strategies like Non-Maximum Suppression (NMS), +leading to sub-optimal detection performance for novel classes. This paper +systematically investigates this problem with the commonly-adopted two-stage +OVOD paradigm. Specifically, in the region-proposal stage, proposals that +contain novel instances showcase lower objectness scores, since they are +treated as background proposals during the training phase. Meanwhile, in the +object-classification stage, novel objects share lower region-text similarities +(i.e., classification scores) due to the biased visual-language alignment by +seen training samples. To alleviate this problem, this paper introduces two +advanced measures to adjust confidence scores and conserve erroneously +dismissed objects: (1) a class-agnostic localization quality estimate via +overlap degree of region/object proposals, and (2) a text-guided visual +similarity estimate with proxy prototypes for novel classes. Integrated with +adjusting techniques specifically designed for the region-proposal and +object-classification stages, this paper derives the aggregated confidence +estimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet +is a generic and training-free post-processing scheme, which consistently +bolsters open-vocabulary detectors across model scales and architecture +designs. For instance, AggDet achieves 3.3% and 1.5% gains on the OV-COCO and +OV-LVIS benchmarks respectively, without any training cost. +
+
+
+
+
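+ One illustrative way to realize the confidence aggregation sketched in the entry above is a weighted geometric mean of the region-text similarity and a class-agnostic localization-quality estimate, applied before NMS; the functional form and the weight alpha are assumptions, not the paper's exact formula.
+
+ import torch
+
+ def aggregate_confidence(cls_score: torch.Tensor,
+                          loc_quality: torch.Tensor,
+                          alpha: float = 0.5) -> torch.Tensor:
+     """Fuse region-text similarity with a class-agnostic localization-quality
+     estimate so novel-class boxes are not suppressed purely for low
+     classification scores. Both inputs are (num_boxes,) tensors in [0, 1]."""
+     cls_score = cls_score.clamp(min=1e-6)
+     loc_quality = loc_quality.clamp(min=1e-6)
+     return cls_score ** alpha * loc_quality ** (1.0 - alpha)
+
+ # scores = aggregate_confidence(region_text_similarity, proposal_overlap_quality)
+ # keep = torchvision.ops.nms(boxes, scores, iou_threshold=0.5)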
+ + ☆ Improving Referring Image Segmentation using Vision-Aware Text Features + + +
+ Referring image segmentation is a challenging task that involves generating +pixel-wise segmentation masks based on natural language descriptions. Existing +methods have relied mostly on visual features to generate the segmentation +masks while treating text features as supporting components. This over-reliance +on visual features can lead to suboptimal results, especially in complex +scenarios where text prompts are ambiguous or context-dependent. To overcome +these challenges, we present a novel framework VATEX to improve referring image +segmentation by enhancing object and context understanding with Vision-Aware +Text Feature. Our method involves using CLIP to derive a CLIP Prior that +integrates an object-centric visual heatmap with text description, which can be +used as the initial query in DETR-based architecture for the segmentation task. +Furthermore, by observing that there are multiple ways to describe an instance +in an image, we enforce feature similarity between text variations referring to +the same visual input by two components: a novel Contextual Multimodal Decoder +that turns text embeddings into vision-aware text features, and a Meaning +Consistency Constraint to ensure further the coherent and consistent +interpretation of language expressions with the context understanding obtained +from the image. Our method achieves a significant performance improvement on +three benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Code is available at: +https://nero1342.github.io/VATEX\_RIS. + +
+
+ comment: 30 pages including supplementary +
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Question-Driven Image + Captions as Prompts CVPR 2024 + + +
+ Visual question answering (VQA) is known as an AI-complete task as it +requires understanding, reasoning, and inferring about the vision and the +language content. Over the past few years, numerous neural architectures have +been suggested for the VQA problem. However, achieving success in zero-shot VQA +remains a challenge due to its requirement for advanced generalization and +reasoning skills. This study explores the impact of incorporating image +captioning as an intermediary process within the VQA pipeline. Specifically, we +explore the efficacy of utilizing image captions instead of images and +leveraging large language models (LLMs) to establish a zero-shot setting. Since +image captioning is the most crucial step in this process, we compare the +impact of state-of-the-art image captioning models on VQA performance across +various question types in terms of structure and semantics. We propose a +straightforward and efficient question-driven image captioning approach within +this pipeline to transfer contextual information into the question-answering +(QA) model. This method involves extracting keywords from the question, +generating a caption for each image-question pair using the keywords, and +incorporating the question-driven caption into the LLM prompt. We evaluate the +efficacy of using general-purpose and question-driven image captions in the VQA +pipeline. Our study highlights the potential of employing image captions and +harnessing the capabilities of LLMs to achieve competitive performance on GQA +under the zero-shot setting. Our code is available at +\url{https://github.com/ovguyo/captions-in-VQA}. + +
+
+ comment: The paper has been accepted for presentation at CVPR 2024 Workshop on + Prompting in Vision +
+
+
+
+
+ + ☆ Advanced wood species identification based on multiple anatomical + sections and using deep feature transfer and fusion + + +
+ In recent years, we have seen many advancements in wood species +identification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy, +and Direct Analysis in Real Time (DART) mass spectrometry complement the +long-established wood anatomical assessment of cell and tissue morphology. +However, most of these methods have some limitations such as high costs, the +need for skilled experts for data interpretation, and the lack of good datasets +for professional reference. Therefore, most of these methods, and certainly the +wood anatomical assessment, may benefit from tools based on Artificial +Intelligence. In this paper, we apply two transfer learning techniques with +Convolutional Neural Networks (CNNs) to a multi-view Congolese wood species +dataset including sections from different orientations and viewed at different +microscopic magnifications. We explore two feature extraction methods in +detail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated +Deep Activation Maps (RADAM), for efficient and accurate wood species +identification. Our results indicate superior accuracy on diverse datasets and +anatomical sections, surpassing the results of other methods. Our proposal +represents a significant advancement in wood species identification, offering a +robust tool to support the conservation of forest ecosystems and promote +sustainable forestry practices. + +
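+ For the Global Average Pooling (GAP) feature-transfer route, a minimal sketch
+ could look like the following; the torchvision ResNet-50 backbone and the
+ downstream linear classifier are assumptions and may differ from the CNNs
+ actually used in the paper.
+
+ import torch
+ import torchvision.models as models
+
+ # ImageNet-pretrained backbone with its classifier removed; the network's
+ # built-in global average pooling yields a 2048-D descriptor per image
+ backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+ backbone.fc = torch.nn.Identity()
+ backbone.eval()
+
+ @torch.no_grad()
+ def extract_gap_features(images):
+     """images: (B, 3, H, W) normalised tensor -> (B, 2048) GAP descriptors."""
+     return backbone(images)
+
+ # Descriptors from several anatomical sections and magnifications can then be
+ # concatenated (feature fusion) and fed to a lightweight classifier such as a
+ # linear SVM or logistic regression.
+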
+
+ comment: 33 pages, 7 tables, 9 figures +
+
+
+
+
+ + ☆ Pathological Primitive Segmentation Based on Visual Foundation Model + with Zero-Shot Mask Generation + + +
+ Medical image processing usually requires a model trained with carefully
+crafted datasets due to unique image characteristics and domain-specific
+challenges, especially in pathology. Primitive detection and segmentation in
+digitized tissue samples are essential for objective and automated diagnosis
+and prognosis of cancer. SAM (Segment Anything Model) has recently been
+developed to segment general objects from natural images with high accuracy,
+but it requires human prompts to generate masks. In this work, we present a
+novel approach that adapts pre-trained natural image encoders of SAM for
+detection-based region proposals. Regions proposed by a pre-trained encoder are
+sent to cascaded feature propagation layers for projection. Then, local
+semantic and global context are aggregated across multiple scales for bounding
+box localization and classification. Finally, the SAM decoder uses the
+identified bounding boxes as essential prompts to generate a comprehensive
+primitive segmentation map. The entire base framework, SAM, requires no
+additional training or fine-tuning yet produces an end-to-end result for two
+fundamental segmentation tasks in pathology. Our method is competitive with
+state-of-the-art models in F1 score for nuclei detection and in
+binary/multiclass panoptic quality (bPQ/mPQ) and mask quality (Dice) for
+segmentation on the PanNuke dataset, while offering end-to-end efficiency. Our
+model also achieves a remarkable Average Precision gain (+4.5%) on the
+secondary dataset (HuBMAP Kidney) compared to Faster R-CNN. The code is
+publicly available at https://github.com/learner-codec/autoprom_sam.
+
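+ The final step, prompting the SAM decoder with detected boxes, can be
+ sketched with the public segment-anything API as below. The checkpoint name
+ and the box-producing detector are placeholders, and the cascaded
+ feature-propagation head described above is omitted.
+
+ import numpy as np
+ from segment_anything import sam_model_registry, SamPredictor
+
+ # load an off-the-shelf SAM checkpoint (ViT-B here, as an example)
+ sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+ predictor = SamPredictor(sam)
+
+ def masks_from_boxes(image_rgb, boxes_xyxy):
+     """image_rgb: HxWx3 uint8 array; boxes_xyxy: (N, 4) pixel boxes."""
+     predictor.set_image(image_rgb)
+     masks = []
+     for box in boxes_xyxy:
+         m, _, _ = predictor.predict(box=box[None, :], multimask_output=False)
+         masks.append(m[0])            # (H, W) boolean mask for this primitive
+     return np.stack(masks)
+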
+
+ comment: 2024 IEEE International Symposium on Biomedical Imaging +
+
+
+
+
+ + ☆ FashionFail: Addressing Failure Cases in Fashion Object Detection and + Segmentation IJCNN + + +
+ In the realm of fashion object detection and segmentation for online shopping
+images, existing state-of-the-art fashion parsing models encounter limitations,
+particularly when exposed to non-model-worn apparel and close-up shots. To
+address these failures, we introduce FashionFail, a new fashion dataset with
+e-commerce images for object detection and segmentation. The dataset is
+efficiently curated using our novel annotation tool that leverages recent
+foundation models. The primary objective of FashionFail is to serve as a test
+bed for evaluating the robustness of models. Our analysis reveals the
+shortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer.
+Additionally, we propose a baseline approach using naive data augmentation to
+mitigate common failure cases and improve model robustness. Through this work,
+we aim to inspire and support further research in fashion item detection and
+segmentation for industrial applications. The dataset, annotation tool, code,
+and models are available at \url{https://rizavelioglu.github.io/fashionfail/}.
+
+
+ comment: to be published in 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ Lossy Image Compression with Foundation Diffusion Models + + +
+ Incorporating diffusion models in the image compression domain has the +potential to produce realistic and detailed reconstructions, especially at +extremely low bitrates. Previous methods focus on using diffusion models as +expressive decoders robust to quantization errors in the conditioning signals, +yet achieving competitive results in this manner requires costly training of +the diffusion model and long inference times due to the iterative generative +process. In this work we formulate the removal of quantization error as a +denoising task, using diffusion to recover lost information in the transmitted +image latent. Our approach allows us to perform less than 10\% of the full +diffusion generative process and requires no architectural changes to the +diffusion model, enabling the use of foundation models as a strong prior +without additional fine tuning of the backbone. Our proposed codec outperforms +previous methods in quantitative realism metrics, and we verify that our +reconstructions are qualitatively preferred by end users, even when other +methods use twice the bitrate. + +
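+ The core idea, running only a small tail of the reverse diffusion process to
+ remove quantization error from the transmitted latent, can be sketched as
+ follows. The epsilon-prediction `denoiser`, the `alphas_cumprod` schedule,
+ and the DDIM-style update are generic stand-ins, not the paper's codec or
+ foundation model.
+
+ import torch
+
+ def partial_denoise(latent_quantized, denoiser, alphas_cumprod, start_step):
+     """Run the reverse process from a small timestep `start_step` down to 0,
+     treating quantization error as noise on the latent."""
+     x = latent_quantized
+     for t in range(start_step, 0, -1):
+         a_t, a_prev = alphas_cumprod[t], alphas_cumprod[t - 1]
+         eps = denoiser(x, torch.tensor([t]))               # predicted noise
+         x0 = (x - (1 - a_t).sqrt() * eps) / a_t.sqrt()     # clean-latent estimate
+         x = a_prev.sqrt() * x0 + (1 - a_prev).sqrt() * eps  # deterministic DDIM step
+     return x
+
+ # With start_step chosen as a small fraction of the schedule (well under 10%
+ # of the steps), only a handful of network evaluations are needed.
+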
+
+
+
+
+ + ☆ IDD-X: A Multi-View Dataset for Ego-relative Important Object + Localization and Explanation in Dense and Unstructured Traffic ICRA 2024 + + +
+ Intelligent vehicle systems require a deep understanding of the interplay +between road conditions, surrounding entities, and the ego vehicle's driving +behavior for safe and efficient navigation. This is particularly critical in +developing countries where traffic situations are often dense and unstructured +with heterogeneous road occupants. Existing datasets, predominantly geared +towards structured and sparse traffic scenarios, fall short of capturing the +complexity of driving in such environments. To fill this gap, we present IDD-X, +a large-scale dual-view driving video dataset. With 697K bounding boxes, 9K +important object tracks, and 1-12 objects per video, IDD-X offers comprehensive +ego-relative annotations for multiple important road objects covering 10 +categories and 19 explanation label categories. The dataset also incorporates +rearview information to provide a more complete representation of the driving +environment. We also introduce custom-designed deep networks aimed at multiple +important object localization and per-object explanation prediction. Overall, +our dataset and introduced prediction models form the foundation for studying +how road conditions and surrounding entities affect driving behavior in complex +traffic situations. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ Scalability in Building Component Data Annotation: Enhancing Facade + Material Classification with Synthetic Data + + +
+ Computer vision models trained on Google Street View images can create +material cadastres. However, current approaches need manually annotated +datasets that are difficult to obtain and often have class imbalance. To +address these challenges, this paper fine-tuned a Swin Transformer model on a +synthetic dataset generated with DALL-E and compared the performance to a +similar manually annotated dataset. Although manual annotation remains the gold +standard, the synthetic dataset performance demonstrates a reasonable +alternative. The findings will ease annotation needed to develop material +cadastres, offering architects insights into opportunities for material reuse, +thus contributing to the reduction of demolition waste. + +
+
+ comment: 10 pages, 6 figures, submitted to 2024 European Conference of + Computing in Construction +
+
+
+
+
+ + ☆ Benchmarking the Cell Image Segmentation Models Robustness under the + Microscope Optical Aberrations + + +
+ Cell segmentation is essential in biomedical research for analyzing cellular +morphology and behavior. Deep learning methods, particularly convolutional +neural networks (CNNs), have revolutionized cell segmentation by extracting +intricate features from images. However, the robustness of these methods under +microscope optical aberrations remains a critical challenge. This study +comprehensively evaluates the performance of cell instance segmentation models +under simulated aberration conditions using the DynamicNuclearNet (DNN) and +LIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and +Trefoil, were simulated using Zernike polynomial equations. Various +segmentation models, such as Mask R-CNN with different network heads (FPN, C3) +and backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated +conditions. Results indicate that FPN combined with SwinS demonstrates superior +robustness in handling simple cell images affected by minor aberrations. +Conversely, Cellpose2.0 proves effective for complex cell images under similar +conditions. Our findings provide insights into selecting appropriate +segmentation models based on cell morphology and aberration severity, enhancing +the reliability of cell segmentation in biomedical applications. Further +research is warranted to validate these methods with diverse aberration types +and emerging segmentation models. Overall, this research aims to guide +researchers in effectively utilizing cell segmentation models in the presence +of minor optical aberrations. + +
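+ Aberrations of this kind are commonly injected by adding a Zernike phase term
+ to the pupil function and convolving images with the resulting PSF. The numpy
+ sketch below uses a few low-order, unnormalized Zernike modes and arbitrary
+ coefficients; the paper's exact simulation settings may differ.
+
+ import numpy as np
+
+ def aberrated_psf(n=256, coeff=0.5, mode="astigmatism"):
+     y, x = np.mgrid[-1:1:n*1j, -1:1:n*1j]
+     rho, theta = np.hypot(x, y), np.arctan2(y, x)
+     pupil = (rho <= 1.0).astype(float)
+     zern = {  # a few low-order Zernike polynomials
+         "astigmatism": rho**2 * np.cos(2 * theta),
+         "coma":        (3 * rho**3 - 2 * rho) * np.cos(theta),
+         "spherical":   6 * rho**4 - 6 * rho**2 + 1,
+         "trefoil":     rho**3 * np.cos(3 * theta),
+     }[mode]
+     field = pupil * np.exp(1j * 2 * np.pi * coeff * zern)
+     psf = np.abs(np.fft.fftshift(np.fft.fft2(np.fft.ifftshift(field))))**2
+     return psf / psf.sum()
+
+ def apply_aberration(image, psf):
+     # circular convolution via FFT; image and psf share the same shape here
+     return np.real(np.fft.ifft2(np.fft.fft2(image) *
+                                 np.fft.fft2(np.fft.ifftshift(psf))))
+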
+
+
+
+
+ + ☆ Analyzing Decades-Long Environmental Changes in Namibia Using Archival + Aerial Photography and Deep Learning + + +
+ This study explores object detection in historical aerial photographs of +Namibia to identify long-term environmental changes. Specifically, we aim to +identify key objects -- \textit{Waterholes}, \textit{Omuti homesteads}, and +\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale +aerial imagery from 1943 and 1972. In this work, we propose a workflow for +analyzing historical aerial imagery using a deep semantic segmentation model on +sparse hand-labels. To this end, we employ a number of strategies including +class-weighting, pseudo-labeling and empirical p-value-based filtering to +balance skewed and sparse representations of objects in the ground truth data. +Results demonstrate the benefits of these different training strategies +resulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of +interest for the 1943 and 1972 imagery, respectively. We also identified that +the average size of Waterhole and Big trees increased while the average size of +Omutis decreased between 1943 and 1972 reflecting some of the local effects of +the massive post-Second World War economic, agricultural, demographic, and +environmental changes. This work also highlights the untapped potential of +historical aerial photographs in understanding long-term environmental changes +beyond Namibia (and Africa). With the lack of adequate satellite technology in +the past, archival aerial photography offers a great alternative to uncover +decades-long environmental changes. + +
+
+
+
+
+ + ☆ On the Robustness of Language Guidance for Low-Level Vision Tasks: + Findings from Depth Estimation CVPR 2024 + + +
+ Recent advances in monocular depth estimation have been made by incorporating
+natural language as additional guidance. Although yielding impressive results,
+the impact of the language prior, particularly in terms of generalization and
+robustness, remains unexplored. In this paper, we address this gap by
+quantifying the impact of this prior and introducing methods to benchmark its
+effectiveness across various settings. We generate "low-level" sentences that
+convey object-centric, three-dimensional spatial relationships, incorporate
+them as additional language priors and evaluate their downstream impact on
+depth estimation. Our key finding is that current language-guided depth
+estimators perform optimally only with scene-level descriptions and
+counter-intuitively fare worse with low-level descriptions. Despite leveraging
+additional data, these methods are not robust to directed adversarial attacks
+and decline in performance with an increase in distribution shift. Finally, to
+provide a foundation for future research, we identify points of failure and
+offer insights to better understand these shortcomings. With an increasing
+number of methods using language for depth estimation, our findings highlight
+the opportunities and pitfalls that require careful consideration for effective
+deployment in real-world settings.
+
+
+ comment: Accepted to CVPR 2024. Project webpage: + https://agneetchatterjee.com/robustness_depth_lang/ +
+
+
+
+
+ + ☆ Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking + + +
+ Contrastive learning has gained widespread adoption for retrieval tasks due +to its minimal requirement for manual annotations. However, popular contrastive +frameworks typically learn from binary relevance, making them ineffective at +incorporating direct fine-grained rankings. In this paper, we curate a +large-scale dataset featuring detailed relevance scores for each query-document +pair to facilitate future research and evaluation. Subsequently, we propose +Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL), +which is designed to learn from fine-grained rankings beyond binary relevance +scores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for +in-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative +to the CLIP baseline and involving ground truth rankings. + +
+
+
+
+
+ + ☆ Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly + Detection CVPR2024 + + +
+ Weakly supervised video anomaly detection (WSVAD) is a challenging task.
+Generating fine-grained pseudo-labels from weak labels and then self-training a
+classifier is currently a promising solution. However, existing methods use
+only the RGB visual modality and neglect category text information, limiting
+the generation of more accurate pseudo-labels and affecting the performance of
+self-training. Inspired by the manual labeling process based on event
+descriptions, in this paper, we propose a novel pseudo-label generation and
+self-training framework based on Text Prompt with Normality Guidance (TPWNG)
+for WSVAD. Our idea is to transfer the rich language-visual knowledge of the
+contrastive language-image pre-training (CLIP) model for aligning the video
+event description text and corresponding video frames to generate
+pseudo-labels. Specifically, we first fine-tune CLIP for domain adaptation by
+designing two ranking losses and a distributional inconsistency loss. Further,
+we propose a learnable text prompt mechanism with the assistance of a normality
+visual prompt to further improve the matching accuracy between video event
+description text and video frames. Then, we design a pseudo-label generation
+module based on the normality guidance to infer reliable frame-level
+pseudo-labels. Finally, we introduce a temporal context self-adaptive learning
+module to learn the temporal dependencies of different video events more
+flexibly and accurately. Extensive experiments show that our method achieves
+state-of-the-art performance on two benchmark datasets, UCF-Crime and
+XD-Violence.
+
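+ The basic CLIP alignment step, scoring video frames against event description
+ prompts to obtain frame-level pseudo-labels, can be sketched as below. The
+ prompt wording is a placeholder, and the normality guidance, ranking losses,
+ and fine-tuning of TPWNG are omitted.
+
+ import torch, clip
+ from PIL import Image
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model, preprocess = clip.load("ViT-B/32", device=device)
+
+ @torch.no_grad()
+ def frame_pseudo_labels(frames, event_texts):
+     """frames: list of HxWx3 uint8 arrays; event_texts: list of descriptions."""
+     imgs = torch.stack([preprocess(Image.fromarray(f)) for f in frames]).to(device)
+     txts = clip.tokenize(event_texts).to(device)
+     img_feat = model.encode_image(imgs)
+     txt_feat = model.encode_text(txts)
+     img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+     txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+     sim = img_feat @ txt_feat.t()          # (num_frames, num_texts)
+     return sim.softmax(dim=-1)             # per-frame event probabilities
+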
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Masked Image Modeling as a Framework for Self-Supervised Learning across + Eye Movements + + +
+ To make sense of their surroundings, intelligent systems must transform
+complex sensory inputs to structured codes that are reduced to task-relevant
+information such as object category. Biological agents achieve this in a
+largely autonomous manner, presumably via self-supervised learning. Whereas
+previous attempts to model the underlying mechanisms were largely
+discriminative in nature, there is ample evidence that the brain employs a
+generative model of the world. Here, we propose that eye movements, in
+combination with the focused nature of primate vision, constitute a generative,
+self-supervised task of predicting and revealing visual information. We
+construct a proof-of-principle model starting from the framework of masked
+image modeling (MIM), a common approach in deep representation learning. To do
+so, we analyze how core components of MIM such as masking technique and data
+augmentation influence the formation of category-specific representations. This
+allows us not only to better understand the principles behind MIM, but to then
+reassemble a MIM more in line with the focused nature of biological perception.
+From a theoretical angle, we find that MIM disentangles neurons in latent
+space, a property that has been suggested to structure visual representations
+in primates, without explicit regulation. Together with previous findings of
+invariance learning, this highlights an interesting connection of MIM to latent
+regularization approaches for self-supervised learning. The source code is
+available under https://github.com/RobinWeiler/FocusMIM
+
+
+
+
+
+ + ☆ ChatGPT and general-purpose AI count fruits in pictures surprisingly + well + + +
+ Object counting is a popular task in deep learning applications in various
+domains, including agriculture. A conventional deep learning approach requires
+a large amount of training data, often a logistical problem in real-world
+applications. To address this issue, we examined how well ChatGPT (GPT4V) and a
+general-purpose AI (foundation model for object counting, T-Rex) can count the
+number of fruit bodies (coffee cherries) in 100 images. The foundation model
+with few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and
+0.900, respectively). ChatGPT also showed some interesting potential,
+especially when few-shot learning with human feedback was applied (R2 = 0.360
+and 0.460, respectively). Moreover, we examined the time required for
+implementation as a practical question. Obtaining results with the foundation
+model and ChatGPT took much less time than with the YOLOv8 model (0.83 hrs,
+1.75 hrs, and 161 hrs, respectively). We interpret these results as two
+surprises for deep learning users in applied domains: a foundation model with
+few-shot domain-specific learning can drastically save time and effort compared
+to the conventional approach, and ChatGPT can deliver relatively good
+performance. Neither approach requires coding skills, which can foster AI
+education and dissemination.
+
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+
+ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A
+ Real-World Benchmark Dataset
+
+
+ Despite the significant progress in image denoising, it is still challenging +to restore fine-scale details while removing noise, especially in extremely +low-light environments. Leveraging near-infrared (NIR) images to assist visible +RGB image denoising shows the potential to address this issue, becoming a +promising technology. Nonetheless, existing works still struggle with taking +advantage of NIR information effectively for real-world image denoising, due to +the content inconsistency between NIR-RGB images and the scarcity of real-world +paired datasets. To alleviate the problem, we propose an efficient Selective +Fusion Module (SFM), which can be plug-and-played into the advanced denoising +networks to merge the deep NIR-RGB features. Specifically, we sequentially +perform the global and local modulation for NIR and RGB features, and then +integrate the two modulated features. Furthermore, we present a Real-world +NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse +scenarios as well as various noise levels. Extensive experiments on both +synthetic and our real-world datasets demonstrate that the proposed method +achieves better results than state-of-the-art ones. The dataset, codes, and +pre-trained models will be publicly available at +https://github.com/ronjonxu/NAID. + +
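+ A rough PyTorch reading of the described global-then-local modulation and
+ merging could look like the module below; the layer choices and gating form
+ are assumptions rather than the released Selective Fusion Module.
+
+ import torch
+ import torch.nn as nn
+
+ class SelectiveFusion(nn.Module):
+     """Sketch: modulate NIR features channel-wise (global) and spatially
+     (local) using the concatenated NIR-RGB pair, then merge with RGB."""
+     def __init__(self, channels):
+         super().__init__()
+         self.global_gate = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(2 * channels, channels, 1), nn.Sigmoid())
+         self.local_gate = nn.Sequential(
+             nn.Conv2d(2 * channels, channels, 3, padding=1), nn.Sigmoid())
+         self.merge = nn.Conv2d(2 * channels, channels, 1)
+
+     def forward(self, rgb_feat, nir_feat):
+         pair = torch.cat([rgb_feat, nir_feat], dim=1)
+         nir_mod = nir_feat * self.global_gate(pair)   # channel-wise modulation
+         nir_mod = nir_mod * self.local_gate(pair)     # spatial modulation
+         return self.merge(torch.cat([rgb_feat, nir_mod], dim=1))
+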
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ LaSagnA: Language-based Segmentation Assistant for Complex Queries + + +
+ Recent advancements have empowered Large Language Models for Vision (vLLMs) +to generate detailed perceptual outcomes, including bounding boxes and masks. +Nonetheless, there are two constraints that restrict the further application of +these vLLMs: the incapability of handling multiple targets per query and the +failure to identify the absence of query objects in the image. In this study, +we acknowledge that the main cause of these problems is the insufficient +complexity of training queries. Consequently, we define the general sequence +format for complex queries. Then we incorporate a semantic segmentation task in +the current pipeline to fulfill the requirements of training data. Furthermore, +we present three novel strategies to effectively handle the challenges arising +from the direct integration of the proposed format. The effectiveness of our +model in processing complex queries is validated by the comparable results with +conventional methods on both close-set and open-set semantic segmentation +datasets. Additionally, we outperform a series of vLLMs in reasoning and +referring segmentation, showcasing our model's remarkable capabilities. We +release the code at https://github.com/congvvc/LaSagnA. + +
+
+
+
+
+ + ☆ 3D Human Scan With A Moving Event Camera + + +
+ Capturing the 3D human body is one of the important tasks in computer vision
+with a wide range of applications such as virtual reality and sports analysis.
+However, conventional frame cameras are limited by their temporal resolution
+and dynamic range, which imposes constraints in real-world application setups.
+Event cameras have the advantages of high temporal resolution and high dynamic
+range (HDR), but the development of event-based methods is necessary to handle
+data with different characteristics. This paper proposes a novel event-based
+method for 3D pose estimation and human mesh recovery. Prior work on
+event-based human mesh recovery requires frames (images) as well as event data.
+The proposed method solely relies on events; it carves 3D voxels by moving the
+event camera around a stationary body, reconstructs the human pose and mesh by
+attenuated rays, and fits statistical body models, preserving high-frequency
+details. The experimental results show that the proposed method outperforms
+conventional frame-based methods in the estimation accuracy of both pose and
+body mesh. We also demonstrate results in challenging situations where a
+conventional camera has motion blur. This is the first work to demonstrate
+event-only human mesh recovery, and we hope that it is the first step toward
+achieving robust and accurate 3D human body scanning from vision sensors.
+
+
+
+
+
+ + ☆ SpectralMamba: Efficient Mamba for Hyperspectral Image Classification + + +
+ Recurrent neural networks and Transformers have recently dominated most
+applications in hyperspectral (HS) imaging, owing to their capability to
+capture long-range dependencies from spectrum sequences. However, despite the
+success of these sequential architectures, the non-negligible inefficiency
+caused by either difficulty in parallelization or computationally prohibitive
+attention still hinders their practicality, especially for large-scale
+observation in remote sensing scenarios. To address this issue, we herein
+propose SpectralMamba -- a novel, efficient deep learning framework for HS
+image classification that incorporates a state space model. SpectralMamba
+features simplified but adequate modeling of HS data dynamics at two levels.
+First, in spatial-spectral space, a dynamical mask is learned by efficient
+convolutions to simultaneously encode spatial regularity and spectral
+peculiarity, thus attenuating the spectral variability and confusion in
+discriminative representation learning. Second, the merged spectrum can then be
+efficiently operated on in the hidden state space with all parameters learned
+input-dependently, yielding selectively focused responses without reliance on
+redundant attention or non-parallelizable recurrence. To explore the room for
+further computational downsizing, a piece-wise scanning mechanism is employed
+in-between, transferring the approximately continuous spectrum into sequences
+of squeezed length while maintaining short- and long-term contextual profiles
+among hundreds of bands. Through extensive experiments on four benchmark HS
+datasets acquired by satellite-, aircraft-, and UAV-borne imagers,
+SpectralMamba surprisingly creates promising win-wins from both performance and
+efficiency perspectives.
+
+
+
+
+
+ + ☆ New Efficient Visual OILU Markers + + +
+ Basic patterns are the source of a wide range of more or less complex
+geometric structures. We exploit such patterns to develop new efficient visual
+markers. Besides being projective invariants, the proposed markers allow
+producing a rich panel of unique identifiers, which are highly desirable for
+resource-intensive navigation and augmented reality applications. The spiral
+topology of our markers permits the validation of an accurate identification
+scheme, which is based on level set methods. The robustness of the markers
+against acquisition and geometric distortions is validated by extensive
+experimental tests.
+
+
+
+
+
+ + ☆ MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face + Forgery Detection + + +
+ Deepfakes have recently raised significant trust issues and security concerns +among the public. Compared to CNN face forgery detectors, ViT-based methods +take advantage of the expressivity of transformers, achieving superior +detection performance. However, these approaches still exhibit the following +limitations: (1). Fully fine-tuning ViT-based models from ImageNet weights +demands substantial computational and storage resources; (2). ViT-based methods +struggle to capture local forgery clues, leading to model bias and limited +generalizability. To tackle these challenges, this work introduces +Mixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized +yet parameter-efficient ViT-based approach. MoE-FFD only updates lightweight +Low-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone +frozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD +leverages the expressivity of transformers and local priors of CNNs to +simultaneously extract global and local forgery clues. Additionally, novel MoE +modules are designed to scale the model's capacity and select optimal forgery +experts, further enhancing forgery detection performance. The proposed MoE +learning scheme can be seamlessly adapted to various transformer backbones in a +plug-and-play manner. Extensive experimental results demonstrate that the +proposed method achieves state-of-the-art face forgery detection performance +with reduced parameter overhead. The code will be released upon acceptance. + +
+
+
+
+
+ + ☆ Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing + Clues CVPR + + +
+ Face recognition systems are frequently subjected to a variety of physical +and digital attacks of different types. Previous methods have achieved +satisfactory performance in scenarios that address physical attacks and digital +attacks, respectively. However, few methods are considered to integrate a model +that simultaneously addresses both physical and digital attacks, implying the +necessity to develop and maintain multiple models. To jointly detect physical +and digital attacks within a single model, we propose an innovative approach +that can adapt to any network architecture. Our approach mainly contains two +types of data augmentation, which we call Simulated Physical Spoofing Clues +augmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC). +SPSC and SDSC augment live samples into simulated attack samples by simulating +spoofing clues of physical and digital attacks, respectively, which +significantly improve the capability of the model to detect "unseen" attack +types. Extensive experiments show that SPSC and SDSC can achieve +state-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData +dataset, respectively. Our method won first place in "Unified Physical-Digital +Face Attack Detection" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our +final submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER, +respectively. Our code is available at +https://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge. + +
+
+ comment: 10 pages with 6 figures, Accepted by CVPRW 2024 +
+
+
+
+
+ + ☆ OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering + + +
+ Rendering a dynamic 3D human from monocular videos is crucial for various
+applications such as virtual reality and digital entertainment. Most methods
+assume the person is in an unobstructed scene, while in real-life scenarios
+various objects may occlude body parts. A previous method utilizes NeRF for
+surface rendering to recover the occluded areas, but it requires more than one
+day to train and several seconds to render, failing to meet the requirements of
+real-time interactive applications. To address these issues, we propose
+OccGaussian based on 3D Gaussian Splatting, which can be trained within 6
+minutes and produces high-quality human renderings up to 160 FPS with occluded
+input. OccGaussian initializes 3D Gaussian distributions in the canonical
+space, and we perform occlusion feature queries at occluded regions, extracting
+aggregated pixel-aligned features to compensate for the missing information.
+Then we use a Gaussian Feature MLP to further process the features along with
+occlusion-aware loss functions to better perceive the occluded area. Extensive
+experiments on both simulated and real-world occlusions demonstrate that our
+method achieves comparable or even superior performance compared to the
+state-of-the-art method. We also improve training and inference speeds by 250x
+and 800x, respectively. Our code will be available for research purposes.
+
+
+ comment: 12 April, 2024; originally announced April 2024 +
+
+
+
+
+ + ☆ MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for + Dynamic Facial Expression Recognition ICASSP 2024 + + +
+ Unlike typical video action recognition, Dynamic Facial Expression +Recognition (DFER) does not involve distinct moving targets but relies on +localized changes in facial muscles. Addressing this distinctive attribute, we +propose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our +approach takes spatial features of different scales extracted by CNN and feeds +them into a Multi-scale Embedding Layer (MELayer). The MELayer extracts +multi-scale spatial information and encodes these features before sending them +into a Temporal Transformer (T-Former). The T-Former simultaneously extracts +temporal information while continually integrating multi-scale spatial +information. This process culminates in the generation of multi-scale +spatio-temporal features that are utilized for the final classification. Our +method achieves state-of-the-art results on two in-the-wild datasets. +Furthermore, a series of ablation experiments and visualizations provide +further validation of our approach's proficiency in leveraging spatio-temporal +information within DFER. + +
+
+ comment: Accepted to 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Adapting the Segment Anything Model During Usage in Novel Situations + + +
+ The interactive segmentation task consists of creating object segmentation
+masks based on user interactions. The most common way to guide a model towards
+producing a correct segmentation consists of clicks on the object and
+background. The recently published Segment Anything Model (SAM) supports a
+generalized version of the interactive segmentation problem and has been
+trained on an object segmentation dataset which contains 1.1B masks. Although
+SAM has been trained extensively and with the explicit purpose of serving as a
+foundation model, we show significant limitations when it is applied to
+interactive segmentation on novel domains or object types. On the used
+datasets, SAM displays a failure rate $\text{FR}_{30}@90$ of up to $72.6 \%$.
+Since we still want such foundation models to be immediately applicable, we
+present a framework that can adapt SAM during immediate usage. For this, we
+leverage the user interactions and masks constructed during the interactive
+segmentation process. We use this information to generate pseudo-labels, which
+we use to compute a loss function and optimize a part of the SAM model. The
+presented method causes a relative reduction of up to $48.1 \%$ in the
+$\text{FR}_{20}@85$ and $46.6 \%$ in the $\text{FR}_{30}@90$ metrics.
+
+
+ comment: 11 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Direct May Not Be the Best: An Incremental Evolution View of Pose + Generation + + +
+ Pose diversity is an inherent representative characteristic of 2D images. Due
+to the 3D-to-2D projection mechanism, there is evident content discrepancy
+among distinct pose images. This is the main obstacle hindering
+pose-transformation research. To deal with this challenge, we propose a
+fine-grained, incremental-evolution-centered pose generation framework, rather
+than the traditional direct one-to-one generation. Since the proposed approach
+bypasses the theoretical difficulty of directly modeling dramatic non-linear
+variation, the incurred content distortion and blurring can be effectively
+constrained, while at the same time various individual pose details, especially
+clothing texture, can be precisely maintained. In order to systematically guide
+the evolution course, both global and incremental evolution constraints are
+elaborately designed and merged into the overall framework. A novel triple-path
+knowledge fusion structure is designed to take full advantage of all available
+valuable knowledge for high-quality pose synthesis. In addition, our framework
+can generate a series of valuable byproducts, namely the various intermediate
+poses. Extensive experiments have been conducted to verify the effectiveness of
+the proposed approach. Code is available at
+https://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.
+
+
+
+
+
+ + ☆ MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image + Fusion + + +
+ Multi-modality image fusion (MMIF) aims to integrate complementary +information from different modalities into a single fused image to represent +the imaging scene and facilitate downstream visual tasks comprehensively. In +recent years, significant progress has been made in MMIF tasks due to advances +in deep neural networks. However, existing methods cannot effectively and +efficiently extract modality-specific and modality-fused features constrained +by the inherent local reductive bias (CNN) or quadratic computational +complexity (Transformers). To overcome this issue, we propose a Mamba-based +Dual-phase Fusion (MambaDFuse) model. Firstly, a dual-level feature extractor +is designed to capture long-range features from single-modality images by +extracting low and high-level features from CNN and Mamba blocks. Then, a +dual-phase feature fusion module is proposed to obtain fusion features that +combine complementary information from different modalities. It uses the +channel exchange method for shallow fusion and the enhanced Multi-modal Mamba +(M3) blocks for deep fusion. Finally, the fused image reconstruction module +utilizes the inverse transformation of the feature extraction to generate the +fused result. Through extensive experiments, our approach achieves promising +fusion results in infrared-visible image fusion and medical image fusion. +Additionally, in a unified benchmark, MambaDFuse has also demonstrated improved +performance in downstream tasks such as object detection. Code with checkpoints +will be available after the peer-review process. + +
+
+
+
+
+ + ☆ No Bells, Just Whistles: Sports Field Registration by Leveraging + Geometric Properties CVPR + + +
+ Broadcast sports field registration is traditionally addressed as a
+homography estimation task, mapping the visible image area to a planar field
+model, predominantly focusing on the main camera shot. Addressing the
+shortcomings of previous approaches, we propose a novel calibration pipeline
+enabling camera calibration using a 3D soccer field model and extending the
+process to assess the multiple-view nature of broadcast videos. Our approach
+begins with a keypoint generation pipeline derived from SoccerNet dataset
+annotations, leveraging the geometric properties of the court. Subsequently, we
+execute classical camera calibration through the DLT algorithm in a minimalist
+fashion, without further refinement. Through extensive experimentation on
+real-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup
+2014 and TS-WorldCup, our method demonstrates superior performance in both
+multiple- and single-view 3D camera calibration while maintaining competitive
+results in homography estimation compared to state-of-the-art techniques.
+
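+ The "minimalist" calibration step referred to here is the classical Direct
+ Linear Transform. A plain numpy version for the homography case, shown without
+ point normalisation or refinement, is sketched below; the keypoint detection
+ and the full 3D calibration pipeline are not reproduced.
+
+ import numpy as np
+
+ def dlt_homography(src_pts, dst_pts):
+     """src_pts, dst_pts: (N, 2) arrays with N >= 4 field-model/image
+     correspondences -> 3x3 homography mapping src to dst."""
+     rows = []
+     for (x, y), (u, v) in zip(src_pts, dst_pts):
+         rows.append([-x, -y, -1, 0, 0, 0, u * x, u * y, u])
+         rows.append([0, 0, 0, -x, -y, -1, v * x, v * y, v])
+     A = np.asarray(rows)
+     _, _, vt = np.linalg.svd(A)
+     H = vt[-1].reshape(3, 3)   # null-space vector = homography up to scale
+     return H / H[2, 2]
+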
+
+ comment: Accepted in CVPRW 2024 +
+
+
+
+
+ + ☆ Mitigating Challenges of the Space Environment for Onboard Artificial + Intelligence: Design Overview of the Imaging Payload on SpIRIT CVPR 2024 + + +
+ Artificial intelligence (AI) and autonomous edge computing in space are +emerging areas of interest to augment capabilities of nanosatellites, where +modern sensors generate orders of magnitude more data than can typically be +transmitted to mission control. Here, we present the hardware and software +design of an onboard AI subsystem hosted on SpIRIT. The system is optimised for +on-board computer vision experiments based on visible light and long wave +infrared cameras. This paper highlights the key design choices made to maximise +the robustness of the system in harsh space conditions, and their motivation +relative to key mission requirements, such as limited compute resources, +resilience to cosmic radiation, extreme temperature variations, distribution +shifts, and very low transmission bandwidths. The payload, called Loris, +consists of six visible light cameras, three infrared cameras, a camera control +board and a Graphics Processing Unit (GPU) system-on-module. Loris enables the +execution of AI models with on-orbit fine-tuning as well as a next-generation +image compression algorithm, including progressive coding. This innovative +approach not only enhances the data processing capabilities of nanosatellites +but also lays the groundwork for broader applications to remote sensing from +space. + +
+
+ comment: AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024 +
+
+
+
+
+ + ☆ NC-TTT: A Noise Contrastive Approach for Test-Time Training + + +
+ Despite their exceptional performance in vision tasks, deep learning models
+often struggle when faced with domain shifts during testing. Test-Time Training
+(TTT) methods have recently gained popularity due to their ability to enhance
+the robustness of models through the addition of an auxiliary objective that is
+jointly optimized with the main task. Being strictly unsupervised, this
+auxiliary objective is used at test time to adapt the model without any access
+to labels. In this work, we propose Noise-Contrastive Test-Time Training
+(NC-TTT), a novel unsupervised TTT technique based on the discrimination of
+noisy feature maps. By learning to classify noisy views of projected feature
+maps, and then adapting the model accordingly on new domains, classification
+performance can be recovered by a significant margin. Experiments on several
+popular test-time adaptation baselines demonstrate the advantages of our method
+compared to recent approaches for this task. The code can be found at:
+https://github.com/GustavoVargasHakim/NCTTT.git
+
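+ A simplified version of such a noise-contrastive auxiliary objective is
+ sketched below: a small head learns to tell clean from noise-perturbed feature
+ maps, and the same loss is minimised at test time to adapt the encoder. The
+ architecture and noise level are illustrative guesses, not the released
+ NC-TTT.
+
+ import torch
+ import torch.nn as nn
+
+ class NoiseDiscriminator(nn.Module):
+     def __init__(self, channels, sigma=0.5):
+         super().__init__()
+         self.sigma = sigma
+         self.head = nn.Sequential(nn.Conv2d(channels, 64, 3, padding=1),
+                                   nn.ReLU(),
+                                   nn.AdaptiveAvgPool2d(1),
+                                   nn.Flatten(),
+                                   nn.Linear(64, 1))
+
+     def loss(self, feats):
+         # label clean feature maps as 1 and their noisy views as 0
+         noisy = feats + self.sigma * torch.randn_like(feats)
+         logits = torch.cat([self.head(feats), self.head(noisy)], dim=0).squeeze(1)
+         labels = torch.cat([torch.ones(len(feats)),
+                             torch.zeros(len(feats))]).to(feats.device)
+         return nn.functional.binary_cross_entropy_with_logits(logits, labels)
+
+ # At test time, one would minimise discriminator.loss(encoder(batch)) with a
+ # few gradient steps to adapt part of the encoder to the new domain.
+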
+
+
+
+
+ + ☆ Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering ECCV + + +
+ We study the problem of self-supervised 3D scene flow estimation from real
+large-scale raw point cloud sequences, which is crucial to various tasks like
+trajectory prediction or instance segmentation. In the absence of ground truth
+scene flow labels, contemporary approaches concentrate on optimizing flow
+across sequential pairs of point clouds by incorporating structure-based
+regularization on flow and object rigidity. The rigid objects are estimated by
+a variety of 3D spatial clustering methods. While state-of-the-art methods
+successfully capture overall scene motion using the Neural Prior structure,
+they encounter challenges in discerning multi-object motions. We identify the
+structural constraints and the use of large, strict rigid clusters as the main
+pitfalls of current approaches, and we propose a novel clustering approach that
+allows for a combination of overlapping soft clusters and a non-overlapping
+rigid cluster representation. Flow is then jointly estimated with progressively
+growing non-overlapping rigid clusters together with fixed-size overlapping
+soft clusters. We evaluate our method on multiple datasets with LiDAR point
+clouds, demonstrating superior performance over self-supervised baselines and
+reaching new state-of-the-art results. Our method especially excels in
+resolving flow in complicated dynamic scenes with multiple independently moving
+objects close to each other, including pedestrians, cyclists and other
+vulnerable road users. Our code will be publicly available.
+
+
+ comment: ECCV submission +
+
+
+
+
+ + ☆ TDANet: Target-Directed Attention Network For Object-Goal Visual + Navigation With Zero-Shot Ability + + +
+ The generalization of the end-to-end deep reinforcement learning (DRL) for +object-goal visual navigation is a long-standing challenge since object classes +and placements vary in new test environments. Learning domain-independent +visual representation is critical for enabling the trained DRL agent with the +ability to generalize to unseen scenes and objects. In this letter, a +target-directed attention network (TDANet) is proposed to learn the end-to-end +object-goal visual navigation policy with zero-shot ability. TDANet features a +novel target attention (TA) module that learns both the spatial and semantic +relationships among objects to help TDANet focus on the most relevant observed +objects to the target. With the Siamese architecture (SA) design, TDANet +distinguishes the difference between the current and target states and +generates the domain-independent visual representation. To evaluate the +navigation performance of TDANet, extensive experiments are conducted in the +AI2-THOR embodied AI environment. The simulation results demonstrate a strong +generalization ability of TDANet to unseen scenes and target objects, with +higher navigation success rate (SR) and success weighted by length (SPL) than +other state-of-the-art models. + +
+
+
+
+
+ + ☆ OmniSat: Self-Supervised Modality Fusion for Earth Observation + + +
+ The field of Earth Observations (EO) offers a wealth of data from diverse
+sensors, presenting a great opportunity for advancing self-supervised
+multimodal learning. However, current multimodal EO datasets and models focus
+on a single data type, either mono-date images or time series, which limits
+their expressivity. We introduce OmniSat, a novel architecture that exploits
+the spatial alignment between multiple EO modalities to learn expressive
+multimodal representations without labels. To demonstrate the advantages of
+combining modalities of different natures, we augment two existing datasets
+with new modalities. As demonstrated on three downstream tasks (forestry, land
+cover classification, and crop mapping), OmniSat can learn rich representations
+in an unsupervised manner, leading to improved performance in the semi- and
+fully-supervised settings, even when only one modality is available for
+inference. The code and dataset are available at github.com/gastruc/OmniSat.
+
+
+
+
+
+ + ☆ Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI + Using Neural Implicit k-Space Representation + + +
+ Neural implicit k-space representations have shown promising results for +dynamic MRI at high temporal resolutions. Yet, their exclusive training in +k-space limits the application of common image regularization methods to +improve the final reconstruction. In this work, we introduce the concept of +parallel imaging-inspired self-consistency (PISCO), which we incorporate as +novel self-supervised k-space regularization enforcing a consistent +neighborhood relationship. At no additional data cost, the proposed +regularization significantly improves neural implicit k-space reconstructions +on simulated data. Abdominal in-vivo reconstructions using PISCO result in +enhanced spatio-temporal image quality compared to state-of-the-art methods. +Code is available at https://github.com/vjspi/PISCO-NIK. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Learning to Rebalance Multi-Modal Optimization by Adaptively Masking + Subnetworks + + +
+ Multi-modal learning aims to enhance performance by unifying models from +various modalities but often faces the "modality imbalance" problem in real +data, leading to a bias towards dominant modalities and neglecting others, +thereby limiting its overall effectiveness. To address this challenge, the core +idea is to balance the optimization of each modality to achieve a joint +optimum. Existing approaches often employ a modal-level control mechanism for +adjusting the update of each modal parameter. However, such a global-wise +updating mechanism ignores the different importance of each parameter. Inspired +by subnetwork optimization, we explore a uniform sampling-based optimization +strategy and find it more effective than global-wise updating. According to the +findings, we further propose a novel importance sampling-based, element-wise +joint optimization method, called Adaptively Mask Subnetworks Considering Modal +Significance(AMSS). Specifically, we incorporate mutual information rates to +determine the modal significance and employ non-uniform adaptive sampling to +select foreground subnetworks from each modality for parameter updates, thereby +rebalancing multi-modal learning. Additionally, we demonstrate the reliability +of the AMSS strategy through convergence analysis. Building upon theoretical +insights, we further enhance the multi-modal mask subnetwork strategy using +unbiased estimation, referred to as AMSS+. Extensive experiments reveal the +superiority of our approach over comparison methods. + +
+
+ comment: 17 pages;6 figures +
+
+
+
+
+ + ☆ Counterfactual Explanations for Face Forgery Detection via Adversarial + Removal of Artifacts ICME2024 + + +
+ Highly realistic AI generated face forgeries known as deepfakes have raised +serious social concerns. Although DNN-based face forgery detection models have +achieved good performance, they are vulnerable to latest generative methods +that have less forgery traces and adversarial attacks. This limitation of +generalization and robustness hinders the credibility of detection results and +requires more explanations. In this work, we provide counterfactual +explanations for face forgery detection from an artifact removal perspective. +Specifically, we first invert the forgery images into the StyleGAN latent +space, and then adversarially optimize their latent representations with the +discrimination supervision from the target detection model. We verify the +effectiveness of the proposed explanations from two aspects: (1) Counterfactual +Trace Visualization: the enhanced forgery images are useful to reveal artifacts +by visually contrasting the original images and two different visualization +methods; (2) Transferable Adversarial Attacks: the adversarial forgery images +generated by attacking the detection model are able to mislead other detection +models, implying the removed artifacts are general. Extensive experiments +demonstrate that our method achieves over 90% attack success rate and superior +attack transferability. Compared with naive adversarial noise methods, our +method adopts both generative and discriminative model priors, and optimize the +latent representations in a synthesis-by-analysis way, which forces the search +of counterfactual explanations on the natural face manifold. Thus, more general +counterfactual traces can be found and better adversarial attack +transferability can be achieved. + +
+
+ comment: Accepted to ICME2024 +
+
+
+
+
+ + ☆ Emerging Property of Masked Token for Effective Pre-training + + +
+ Driven by the success of Masked Language Modeling (MLM), the realm of
+self-supervised learning for computer vision has been invigorated by the
+central role of Masked Image Modeling (MIM) in driving recent breakthroughs.
+Notwithstanding the achievements of MIM across various downstream tasks, its
+overall efficiency is occasionally hampered by the lengthy duration of the
+pre-training phase. This paper presents the perspective that optimizing masked
+tokens is a means of addressing this prevailing issue. Initially, we delve into
+an exploration of the inherent properties that a masked token ought to possess.
+Among these properties, we principally focus on articulating and emphasizing
+the `data singularity' attribute inherent in masked tokens. Through a
+comprehensive analysis of the heterogeneity between masked tokens and visible
+tokens within pre-trained models, we propose a novel approach termed masked
+token optimization (MTO), specifically designed to improve model efficiency
+through weight recalibration and the enhancement of the key property of masked
+tokens. The proposed method serves as an adaptable solution that seamlessly
+integrates into any MIM approach that leverages masked tokens. As a result, MTO
+achieves a considerable improvement in pre-training efficiency, resulting in an
+approximately 50% reduction in the pre-training epochs required to attain the
+converged performance of recent approaches.
+
+
+
+
+
+ + ☆ Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced + Pre-training + + +
+ In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel +and cost-effective approach that significantly enhances the pre-training +performance of Masked Image Modeling (MIM) approaches by prioritizing token +salience. Our method provides robustness against variations in masking ratios, +effectively mitigating the performance instability issues common in existing +methods. This relaxes the sensitivity of MIM-based pre-training to masking +ratios, which in turn allows us to propose an adaptive strategy for `tailored' +masking ratios for each data sample, which no existing method can provide. +Toward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that +dynamically adjusts the proportion of masking for the unique content of each +image based on token salience. We show that our method significantly improves +over the state-of-the-art in mask-based pre-training on the ImageNet-1K +dataset. + +
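+ To make the idea of a per-image, salience-dependent masking ratio concrete,
+ the sketch below turns per-token salience scores into a ratio and a boolean
+ mask. The concentration heuristic and the constants are invented for
+ illustration and are not the paper's AMR rule.
+
+ import torch
+
+ def adaptive_mask(salience, base_ratio=0.75, spread=0.15):
+     """salience: (B, N) per-token scores -> boolean mask (True = masked)."""
+     # images whose salience concentrates on few tokens get a lower ratio
+     concentration = (salience.softmax(dim=1) ** 2).sum(dim=1)          # (B,)
+     ratio = (base_ratio - spread * (concentration - concentration.mean())
+              ).clamp(0.5, 0.9)                                         # (B,)
+     order = salience.argsort(dim=1)        # ascending: least salient first
+     n_mask = (ratio * salience.shape[1]).long()
+     mask = torch.zeros_like(salience, dtype=torch.bool)
+     for i in range(salience.shape[0]):
+         k = int(n_mask[i])
+         mask[i, order[i, :k]] = True       # mask the k least salient tokens
+     return mask
+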
+
+
+
+
+ + ☆ GPN: Generative Point-based NeRF + + +
+ Scanning real-life scenes with modern registration devices typically gives
+incomplete point cloud representations, primarily due to the limitations of
+partial scanning, 3D occlusions, and dynamic light conditions. Recent works on
+processing incomplete point clouds have mostly focused on point cloud
+completion. However, these approaches do not ensure consistency between the
+completed point cloud and the captured images regarding color and geometry. We
+propose using Generative Point-based NeRF (GPN) to reconstruct and repair a
+partial cloud by fully utilizing the scanning images and the corresponding
+reconstructed cloud. The repaired point cloud can achieve multi-view
+consistency with the captured images at high spatial resolution. For
+fine-tuning on a single scene, we optimize the global latent condition by
+incorporating an Auto-Decoder architecture while retaining multi-view
+consistency. As a result, the generated point clouds are smooth, plausible, and
+geometrically consistent with the partial scanning images. Extensive
+experiments on ShapeNet demonstrate that our approach achieves performance
+competitive with other state-of-the-art point cloud-based neural scene
+rendering and editing methods.
+
+
+
+
+
+ + ☆ Interference Motion Removal for Doppler Radar Vital Sign Detection Using + Variational Encoder-Decoder Neural Network + + +
+ The treatment of interfering motion contributions remains one of the key
+challenges in the domain of radar-based vital sign monitoring. Removal of the
+interference to extract the vital sign contributions is demanding due to
+overlapping Doppler bands, the complex structure of the interference motions
+and significant variations in the power levels of their contributions. A novel
+approach to the removal of interference through the use of a probabilistic deep
+learning model is presented. Results show that a convolutional encoder-decoder
+neural network with a variational objective is capable of learning a meaningful
+representation space of the vital sign Doppler-time distribution, facilitating
+its extraction from a mixture signal. The approach is tested on
+semi-experimental data containing real vital sign signatures and simulated
+returns from interfering body motions. The application of the proposed network
+is demonstrated to enhance the extraction of the micro-Doppler frequency
+corresponding to the respiration rate.
+
+
+ comment: Presented at 2021 IEEE Radar Conference (RadarConf21) +
+
+
+
+
+ + ☆ Overcoming Scene Context Constraints for Object Detection in wild using + Defilters + + +
+ This paper focuses on improving object detection performance by addressing
+the issue of image distortions, commonly encountered in uncontrolled
+acquisition environments. High-level computer vision tasks such as object
+detection, recognition, and segmentation are particularly sensitive to image
+distortion. To address this issue, we propose a novel approach employing an
+image defilter to rectify image distortion prior to object detection. This
+method enhances object detection accuracy, as models perform optimally when
+trained on non-distorted images. Our experiments demonstrate that utilizing
+defiltered images significantly improves mean average precision compared to
+training object detection models on distorted images. Consequently, our
+proposed method offers considerable benefits for real-world applications
+plagued by image distortion. To our knowledge, the contribution lies in
+employing a distortion-removal paradigm for object detection on images captured
+in natural settings. We achieved improvements of 0.562 and 0.564 in mean
+average precision on validation and test data, respectively.
+
+
+
+
+
+ + ☆ AdaContour: Adaptive Contour Descriptor with Hierarchical Representation + + +
+ Existing angle-based contour descriptors suffer from lossy representation for
+non-star-convex shapes. By and large, this is the result of the shape being
+registered with a single global inner center and a set of radii corresponding
+to a polar coordinate parameterization. In this paper, we propose AdaContour,
+an adaptive contour descriptor that uses multiple local representations to
+desirably characterize complex shapes. After hierarchically encoding object
+shapes in a training set and constructing a contour matrix of all subdivided
+regions, we compute a robust low-rank subspace and approximate each local
+contour by linearly combining the shared basis vectors to represent an object.
+Experiments show that AdaContour is able to represent shapes more accurately
+and robustly than other descriptors while retaining effectiveness. We validate
+AdaContour by integrating it into off-the-shelf detectors to enable instance
+segmentation, which demonstrates faithful performance. The code is available at
+https://github.com/tding1/AdaContour.
+
+
+
+
+
+ + ☆ On Input Formats for Radar Micro-Doppler Signature Processing by + Convolutional Neural Networks + + +
+ Convolutional neural networks have often been proposed for processing radar
+Micro-Doppler signatures, most commonly with the goal of classifying the
+signals. The majority of works tend to disregard phase information from the
+complex time-frequency representation. Here, the utility of the phase
+information, as well as the optimal format of the Doppler-time input for a
+convolutional neural network, is analysed. It is found that the performance
+achieved by convolutional neural network classifiers is heavily influenced by
+the type of input representation, even across formats with equivalent
+information. Furthermore, it is demonstrated that the phase component of the
+Doppler-time representation contains rich information useful for classification
+and that unwrapping the phase in the temporal dimension can improve the results
+compared to a magnitude-only solution, improving accuracy from 0.920 to 0.938
+on the tested human activity dataset. A further improvement to 0.947 is
+achieved by training a linear classifier on embeddings from multiple formats.
+
+

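+ The preprocessing step the abstract highlights, temporal phase unwrapping of
+the complex Doppler-time representation, can be sketched as follows; the exact
+normalisation used in the paper may differ.
+
+   import numpy as np
+
+   def magnitude_phase_channels(spec):
+       """spec: complex Doppler-time array of shape (doppler_bins, time_steps).
+       Returns a 2-channel real input: log-magnitude and phase unwrapped along
+       the temporal axis."""
+       mag = np.log1p(np.abs(spec))
+       phase = np.unwrap(np.angle(spec), axis=1)    # unwrap along time
+       return np.stack([mag, phase], axis=0)        # (2, doppler_bins, time_steps)
+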
+
+ comment: Presented at International Conference on Radar Systems (RADAR 2022) +
+
+
+
+
+ + ☆ A Survey of Neural Network Robustness Assessment in Image Recognition + + +
+ In recent years, there has been significant attention given to the robustness +assessment of neural networks. Robustness plays a critical role in ensuring +reliable operation of artificial intelligence (AI) systems in complex and +uncertain environments. Deep learning's robustness problem is particularly +significant, highlighted by the discovery of adversarial attacks on image +classification models. Researchers have dedicated efforts to evaluate +robustness in diverse perturbation conditions for image recognition tasks. +Robustness assessment encompasses two main techniques: robustness verification/ +certification for deliberate adversarial attacks and robustness testing for +random data corruptions. In this survey, we present a detailed examination of +both adversarial robustness (AR) and corruption robustness (CR) in neural +network assessment. Analyzing current research papers and standards, we provide +an extensive overview of robustness assessment in image recognition. Three +essential aspects are analyzed: concepts, metrics, and assessment methods. We +investigate the perturbation metrics and range representations used to measure +the degree of perturbations on images, as well as the robustness metrics +specifically for the robustness conditions of classification models. The +strengths and limitations of the existing methods are also discussed, and some +potential directions for future research are provided. + +
+
+
+
+
+ + ☆ Calibration & Reconstruction: Deep Integrated Language for Referring + Image Segmentation ICMR2024 + + +
+ Referring image segmentation aims to segment an object referred to by natural
+language expression from an image. The primary challenge lies in the efficient
+propagation of fine-grained semantic information from textual features to
+visual features. Many recent works utilize a Transformer to address this
+challenge. However, conventional transformer decoders can distort linguistic
+information with deeper layers, leading to suboptimal results. In this paper,
+we introduce CRFormer, a model that iteratively calibrates multi-modal features
+in the transformer decoder. We start by generating language queries using
+vision features, emphasizing different aspects of the input language. Then, we
+propose a novel Calibration Decoder (CDec) wherein the multi-modal features can
+be iteratively calibrated by the input language features. In the Calibration
+Decoder, we use the output of each decoder layer and the original language
+features to generate new queries for continuous calibration, which gradually
+updates the language features. Based on CDec, we introduce a Language
+Reconstruction Module and a reconstruction loss. This module leverages queries
+from the final layer of the decoder to reconstruct the input language and
+compute the reconstruction loss. This can further prevent the language
+information from being lost or distorted. Our experiments consistently show the
+superior performance of our approach across RefCOCO, RefCOCO+, and G-Ref
+datasets compared to state-of-the-art methods.
+
+

+
+ comment: 9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with + arXiv:2305.14969 +
+
+
+
+
+ + ☆ Convolutional neural network classification of cancer cytopathology + images: taking breast cancer as an example + + +
+ Breast cancer is a relatively common cancer among gynecological cancers. Its
+diagnosis often relies on the pathology of cells in the lesion. The
+pathological diagnosis of breast cancer not only requires professionals and
+time, but also sometimes involves subjective judgment. To address the
+challenges of dependence on pathologists' expertise and the time-consuming
+nature of achieving accurate breast pathological image classification, this
+paper introduces an approach utilizing convolutional neural networks (CNNs) for
+the rapid categorization of pathological images, aiming to enhance the
+efficiency of breast pathological image detection. The approach enables the
+rapid and automatic classification of pathological images into benign and
+malignant groups. The methodology involves utilizing a convolutional neural
+network (CNN) model leveraging the Inceptionv3 architecture and transfer
+learning algorithm for extracting features from pathological images. A neural
+network with fully connected layers and a SoftMax function is then used for
+image classification. Additionally, the concept of image partitioning is
+introduced to handle high-resolution images. To achieve the ultimate
+classification outcome, the classification probabilities of each image block
+are aggregated using three algorithms: summation, product, and maximum.
+Experimental validation was conducted on the BreaKHis public dataset, resulting
+in accuracy rates surpassing 0.92 across all four magnification coefficients
+(40X, 100X, 200X, and 400X). It demonstrates that the proposed method
+effectively enhances the accuracy in classifying pathological images of breast
+cancer.
+
+

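+ The three aggregation rules mentioned above can be written compactly; this is
+a generic sketch with hypothetical array shapes, not code from the paper.
+
+   import numpy as np
+
+   def aggregate_patch_probs(patch_probs, mode="sum"):
+       """patch_probs: (n_patches, n_classes) softmax outputs for the blocks of
+       one high-resolution image. Combines them into an image-level prediction."""
+       if mode == "sum":
+           scores = patch_probs.sum(axis=0)
+       elif mode == "product":
+           scores = np.log(patch_probs + 1e-12).sum(axis=0)   # product in log space
+       elif mode == "max":
+           scores = patch_probs.max(axis=0)
+       else:
+           raise ValueError(mode)
+       return int(scores.argmax())    # e.g. 0 = benign, 1 = malignant
+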
+
+
+
+
+ + ☆ FaceFilterSense: A Filter-Resistant Face Recognition and Facial + Attribute Analysis Framework + + +
+ With the advent of social media, fun selfie filters have come into tremendous
+mainstream use affecting the functioning of facial biometric systems as well as
+image recognition systems. These filters vary from beautification filters and
+Augmented Reality (AR)-based filters to filters that modify facial landmarks.
+Hence, there is a need to assess the impact of such filters on the performance
+of existing face recognition systems. The limitation associated with existing
+solutions is that they focus mainly on beautification filters. However, current
+AR-based filters and filters that distort facial key points have recently come
+into vogue and make faces highly unrecognizable even to the naked eye. Also,
+the filters considered are mostly obsolete, with limited variations. To
+mitigate these limitations, we aim to perform a holistic impact analysis of the
+latest filters and propose a user recognition model for the filtered images. We
+have utilized a benchmark dataset for baseline images, and applied the latest
+filters over them to generate a beautified/filtered dataset. Next, we have
+introduced a model, FaceFilterNet, for beautified user recognition. In this
+framework, we also utilize our model to comment on various attributes of the
+person including age, gender, and ethnicity. In addition, we have also
+presented a filter-wise impact analysis on face recognition, age estimation,
+gender, and ethnicity prediction. The proposed method affirms the efficacy of
+our dataset with an accuracy of 87.25% and an optimal accuracy for facial
+attribute analysis.
+
+

+
+
+
+
+ + ☆ Struggle with Adversarial Defense? Try Diffusion + + +
+ Adversarial attacks induce misclassification by introducing subtle +perturbations. Recently, diffusion models are applied to the image classifiers +to improve adversarial robustness through adversarial training or by purifying +adversarial noise. However, diffusion-based adversarial training often +encounters convergence challenges and high computational expenses. +Additionally, diffusion-based purification inevitably causes data shift and is +deemed susceptible to stronger adaptive attacks. To tackle these issues, we +propose the Truth Maximization Diffusion Classifier (TMDC), a generative +Bayesian classifier that builds upon pre-trained diffusion models and the +Bayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian +principles, utilizes the conditional likelihood from diffusion models to +determine the class probabilities of input images, thereby insulating against +the influences of data shift and the limitations of adversarial training. +Moreover, to enhance TMDC's resilience against more potent adversarial attacks, +we propose an optimization strategy for diffusion classifiers. This strategy +involves post-training the diffusion model on perturbed datasets with +ground-truth labels as conditions, guiding the diffusion model to learn the +data distribution and maximizing the likelihood under the ground-truth labels. +The proposed method achieves state-of-the-art performance on the CIFAR10 +dataset against heavy white-box attacks and strong adaptive attacks. +Specifically, TMDC achieves robust accuracies of 82.81% against $l_{\infty}$ +norm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded +perturbations, respectively, with $\epsilon=0.05$. + +
+
+
+
+
+ + ☆ Guided Masked Self-Distillation Modeling for Distributed Multimedia + Sensor Event Analysis + + +
+ Observations with distributed sensors are essential in analyzing a series of +human and machine activities (referred to as 'events' in this paper) in complex +and extensive real-world environments. This is because the information obtained +from a single sensor is often missing or fragmented in such an environment; +observations from multiple locations and modalities should be integrated to +analyze events comprehensively. However, a learning method has yet to be +established to extract joint representations that effectively combine such +distributed observations. Therefore, we propose Guided Masked sELf-Distillation +modeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea +of Guided-MELD is to learn to supplement the information from the masked sensor +with information from other sensors needed to detect the event. Guided-MELD is +expected to enable the system to effectively distill the fragmented or +redundant target event information obtained by the sensors without being overly +dependent on any specific sensors. To validate the effectiveness of the +proposed method in novel tasks of distributed multimedia sensor event analysis, +we recorded two new datasets that fit the problem setting: MM-Store and +MM-Office. These datasets consist of human activities in a convenience store +and an office, recorded using distributed cameras and microphones. Experimental +results on these datasets show that the proposed Guided-MELD improves event +tagging and detection performance and outperforms conventional inter-sensor +relationship modeling methods. Furthermore, the proposed method performed +robustly even when sensors were reduced. + +
+
+ comment: 13 pages, 7 figures, under review
+

+
+
+
+
+ + ☆ Practical Region-level Attack against Segment Anything Models + + +
+ Segment Anything Models (SAM) have made significant advancements in image +segmentation, allowing users to segment target portions of an image with a +single click (i.e., user prompt). Given its broad applications, the robustness +of SAM against adversarial attacks is a critical concern. While recent works +have explored adversarial attacks against a pre-defined prompt/click, their +threat model is not yet realistic: (1) they often assume the user-click +position is known to the attacker (point-based attack), and (2) they often +operate under a white-box setting with limited transferability. In this paper, +we propose a more practical region-level attack where attackers do not need to +know the precise user prompt. The attack remains effective as the user clicks +on any point on the target object in the image, hiding the object from SAM. +Also, by adapting a spectrum transformation method, we make the attack more +transferable under a black-box setting. Both control experiments and testing +against real-world SAM services confirm its effectiveness. + +
+
+
+
+
+ + ☆ MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based + Monocular Guidance + + +
+ The latest regularized Neural Radiance Field (NeRF) approaches produce poor
+geometry and view extrapolation for multiview stereo (MVS) benchmarks such as
+ETH3D. In this paper, we aim to create 3D models that provide accurate geometry
+and view synthesis, partially closing the large geometric performance gap
+between NeRF and traditional MVS methods. We propose a patch-based approach
+that effectively leverages monocular surface normal and relative depth
+predictions. The patch-based ray sampling also enables the appearance
+regularization of normalized cross-correlation (NCC) and structural similarity
+(SSIM) between randomly sampled virtual and training views. We further show
+that "density restrictions" based on sparse structure-from-motion points can
+help greatly improve geometric accuracy with a slight drop in novel view
+synthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x
+that of FreeNeRF on average F1@2cm for the ETH3D MVS benchmark, suggesting a
+fruitful research direction for improving the geometric accuracy of NeRF-based
+models and shedding light on a potential future approach to enable NeRF-based
+optimization to eventually outperform traditional MVS.
+
+

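+ The NCC term used for appearance regularization can be sketched as a small
+PyTorch function over batches of flattened patches; patch extraction and the
+exact weighting in the paper are omitted here.
+
+   import torch
+
+   def ncc(patch_a, patch_b, eps=1e-8):
+       """Normalized cross-correlation between two batches of patches with
+       shape (B, N), where N is the number of pixels per patch."""
+       a = patch_a - patch_a.mean(dim=1, keepdim=True)
+       b = patch_b - patch_b.mean(dim=1, keepdim=True)
+       score = (a * b).sum(dim=1) / (a.norm(dim=1) * b.norm(dim=1) + eps)
+       return score                      # in [-1, 1]; use (1 - score).mean() as a loss
+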
+
+ comment: 26 pages, 15 figures +
+
+
+
+
+ + ☆ Simulation of a Vision Correction Display System + + +
+ Eyes serve as our primary sensory organs, responsible for processing up to +80\% of our sensory input. However, common visual aberrations like myopia and +hyperopia affect a significant portion of the global population. This paper +focuses on simulating a Vision Correction Display (VCD) to enhance the visual +experience of individuals with various visual impairments. Utilising Blender, +we digitally model the functionality of a VCD in correcting refractive errors +such as myopia and hyperopia. With these simulations we can see potential +improvements in visual acuity and comfort. These simulations provide valuable +insights for the design and development of future VCD technologies, ultimately +advancing accessibility and usability for individuals with visual challenges. + +
+
+
+
+
+ + ☆ IFViT: Interpretable Fixed-Length Representation for Fingerprint + Matching via Vision Transformer + + +
+ Determining dense feature points on fingerprints used in constructing deep +fixed-length representations for accurate matching, particularly at the pixel +level, is of significant interest. To explore the interpretability of +fingerprint matching, we propose a multi-stage interpretable fingerprint +matching network, namely Interpretable Fixed-length Representation for +Fingerprint Matching via Vision Transformer (IFViT), which consists of two +primary modules. The first module, an interpretable dense registration module, +establishes a Vision Transformer (ViT)-based Siamese Network to capture +long-range dependencies and the global context in fingerprint pairs. It +provides interpretable dense pixel-wise correspondences of feature points for +fingerprint alignment and enhances the interpretability in the subsequent +matching stage. The second module takes into account both local and global +representations of the aligned fingerprint pair to achieve an interpretable +fixed-length representation extraction and matching. It employs the ViTs +trained in the first module with the additional fully connected layer and +retrains them to simultaneously produce the discriminative fixed-length +representation and interpretable dense pixel-wise correspondences of feature +points. Extensive experimental results on diverse publicly available +fingerprint databases demonstrate that the proposed framework not only exhibits +superior performance on dense registration and matching but also significantly +promotes the interpretability in deep fixed-length representations-based +fingerprint matching. + +
+
+ comment: ready to submit to IEEE Transactions on Information Forensics and + Security (TIFS) +
+
+
+
+
+ + ☆ Enhancing Traffic Safety with Parallel Dense Video Captioning for + End-to-End Event Analysis + + +
+ This paper introduces our solution for Track 2 in AI City Challenge 2024. The
+task aims to solve traffic safety description and analysis with the dataset of
+Woven Traffic Safety (WTS), a real-world Pedestrian-Centric Traffic Video
+Dataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly
+focuses on the following points: 1) To solve dense video captioning, we
+leverage the framework of dense video captioning with parallel decoding (PDVC)
+to model visual-language sequences and generate dense captions by chapters for
+the video. 2) Our work leverages CLIP to extract visual features to more
+efficiently perform cross-modality training between visual and textual
+representations. 3) We conduct domain-specific model adaptation to mitigate the
+domain shift problem that poses a recognition challenge in video understanding.
+4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer
+for better understanding WTS videos and more accurate captioning. Our solution
+yielded competitive results on the test set, achieving 6th place in the
+competition. The open source code will be available at
+https://github.com/UCF-SST-Lab/AICity2024CVPRW
+
+

+
+
+
+
+ + ☆ Improving Continuous Sign Language Recognition with Adapted Image Models + + +
+ The increase of web-scale weakly labelled image-text pairs has greatly
+facilitated the development of large-scale vision-language models (e.g., CLIP),
+which have shown impressive generalization performance over a series of
+downstream tasks. However, the massive model size and scarcity of available
+data limit their applicability to fine-tuning the whole model on downstream
+tasks. Besides, fully fine-tuning the model easily forgets the generic
+essential knowledge acquired in the pretraining stage and overfits the
+downstream data. To enable high efficiency when adapting these large
+vision-language models (e.g., CLIP) to performing continuous sign language
+recognition (CSLR) while preserving their generalizability, we propose a novel
+strategy (AdaptSign). Specifically, CLIP is adopted as the visual backbone to
+extract frame-wise features whose parameters are fixed, and a set of learnable
+modules are introduced to model spatial sign variations or capture temporal
+sign movements. The introduced additional modules are quite lightweight, adding
+only 3.2% extra computation. The generic knowledge acquired in the pretraining
+stage is well-preserved in the frozen CLIP backbone in this process. Extensive
+experiments show that despite being efficient, AdaptSign is able to demonstrate
+superior performance across a series of CSLR benchmarks including PHOENIX14,
+PHOENIX14-T, CSL-Daily and CSL compared to existing methods. Visualizations
+show that AdaptSign could learn to dynamically pay major attention to the
+informative spatial regions and cross-frame trajectories in sign videos.
+
+

+
+
+
+
+ + ☆ A Mutual Inclusion Mechanism for Precise Boundary Segmentation in + Medical Images + + +
+ In medical imaging, accurate image segmentation is crucial for quantifying +diseases, assessing prognosis, and evaluating treatment outcomes. However, +existing methods lack an in-depth integration of global and local features, +failing to pay special attention to abnormal regions and boundary details in +medical images. To this end, we present a novel deep learning-based approach, +MIPC-Net, for precise boundary segmentation in medical images. Our approach, +inspired by radiologists' working patterns, features two distinct modules: (i) +\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To +enhance the precision of boundary segmentation in medical images, we introduce +the MIPC module, which enhances the focus on channel information when +extracting position features and vice versa; (ii) \textbf{GL-MIPC-Residue}: To +improve the restoration of medical images, we propose the GL-MIPC-Residue, a +global residual connection that enhances the integration of the encoder and +decoder by filtering out invalid information and restoring the most effective +information lost during the feature extraction process. We evaluate the +performance of the proposed model using metrics such as Dice coefficient (DSC) +and Hausdorff Distance (HD) on three publicly accessible datasets: Synapse, +ISIC2018-Task, and Segpc. Our ablation study shows that each module contributes +to improving the quality of segmentation results. Furthermore, with the +assistance of both modules, our approach outperforms state-of-the-art methods +across all metrics on the benchmark datasets, notably achieving a 2.23mm +reduction in HD on the Synapse dataset, strongly evidencing our model's +enhanced capability for precise image boundary segmentation. Codes will be +available at https://github.com/SUN-1024/MIPC-Net. + +
+
+
+
+
+ + ☆ Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and + Training Strategies + + +
+ This paper investigates the performance of the Contrastive Language-Image +Pre-training (CLIP) when scaled down to limited computation budgets. We explore +CLIP along three dimensions: data, architecture, and training strategies. With +regards to data, we demonstrate the significance of high-quality training data +and show that a smaller dataset of high-quality data can outperform a larger +dataset with lower quality. We also examine how model performance varies with +different dataset sizes, suggesting that smaller ViT models are better suited +for smaller datasets, while larger models perform better on larger datasets +with fixed compute. Additionally, we provide guidance on when to choose a +CNN-based architecture or a ViT-based architecture for CLIP training. We +compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data +Augmentation - and show that the choice of training strategy depends on the +available compute resource. Our analysis reveals that CLIP+Data Augmentation +can achieve comparable performance to CLIP using only half of the training +data. This work provides practical insights into how to effectively train and +deploy CLIP models, making them more accessible and affordable for practical +use in various applications. + +
+
+
+
+
+ + ☆ Tackling Ambiguity from Perspective of Uncertainty Inference and + Affinity Diversification for Weakly Supervised Semantic Segmentation + + +
+ Weakly supervised semantic segmentation (WSSS) with image-level labels aims to
+achieve dense prediction tasks without laborious annotations. However, due to
+the ambiguous contexts and fuzzy regions, the performance of WSSS, especially
+the stages of generating Class Activation Maps (CAMs) and refining pseudo
+masks, widely suffers from ambiguity while being barely noticed by previous
+literature. In this work, we propose UniA, a unified single-staged WSSS
+framework, to efficiently tackle this issue from the perspectives of
+uncertainty inference and affinity diversification, respectively. When
+activating class objects, we argue that the false activation stems from the
+bias to the ambiguous regions during the feature extraction. Therefore, we
+design a more robust feature representation with a probabilistic Gaussian
+distribution and introduce the uncertainty estimation to avoid the bias. A
+distribution loss is particularly proposed to supervise the process, which
+effectively captures the ambiguity and models the complex dependencies among
+features. When refining pseudo labels, we observe that the affinity from the
+prevailing refinement methods tends to be similar among ambiguous regions. To
+this end, an affinity diversification module is proposed to promote diversity
+among semantics. A mutually complementing refinement is proposed to initially
+rectify the ambiguous affinity with multiple inferred pseudo labels. More
+importantly, a contrastive affinity loss is further designed to diversify the
+relations among unrelated semantics, which reliably propagates the diversity
+into the whole feature representations and helps generate better pseudo masks.
+Extensive experiments are conducted on PASCAL VOC, MS COCO, and medical ACDC
+datasets, which validate the efficiency of UniA in tackling ambiguity and its
+superiority over recent single-staged or even most multi-staged competitors.
+
+

+
+
+
+
+ + ☆ Adapting CNNs for Fisheye Cameras without Retraining + + +
+ The majority of image processing approaches assume images are in or can be +rectified to a perspective projection. However, in many applications it is +beneficial to use non conventional cameras, such as fisheye cameras, that have +a larger field of view (FOV). The issue arises that these large-FOV images +can't be rectified to a perspective projection without significant cropping of +the original image. To address this issue we propose Rectified Convolutions +(RectConv); a new approach for adapting pre-trained convolutional networks to +operate with new non-perspective images, without any retraining. Replacing the +convolutional layers of the network with RectConv layers allows the network to +see both rectified patches and the entire FOV. We demonstrate RectConv adapting +multiple pre-trained networks to perform segmentation and detection on fisheye +imagery from two publicly available datasets. Our approach requires no +additional data or training, and operates directly on the native image as +captured from the camera. We believe this work is a step toward adapting the +vast resources available for perspective images to operate across a broad range +of camera geometries. + +
+
+ comment: Project page: https://roboticimaging.org/Projects/RectConv/ +
+
+
+
+
+ + ☆ Measuring Domain Shifts using Deep Learning Remote Photoplethysmography + Model Similarity + + +
+ Domain shift differences between training data for deep learning models and
+the deployment context can result in severe performance issues for models which
+fail to generalize. We study the domain shift problem under the context of
+remote photoplethysmography (rPPG), a technique for video-based heart rate
+inference. We propose metrics based on model similarity which may be used as a
+measure of domain shift, and we demonstrate high correlation between these
+metrics and empirical performance. One of the proposed metrics with viable
+correlations, DS-diff, does not assume access to the ground truth of the target
+domain, i.e. it may be applied to in-the-wild data. To that end, we investigate
+a model selection problem in which ground truth results for the evaluation
+domain are not known, demonstrating a 13.9% performance improvement over the
+average case baseline.
+
+

+
+
+
+
+ + ☆ Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic + Segmentation + + +
+ Despite the significant progress in deep learning for dense visual +recognition problems, such as semantic segmentation, traditional methods are +constrained by fixed class sets. Meanwhile, vision-language foundation models, +such as CLIP, have showcased remarkable effectiveness in numerous zero-shot +image-level tasks, owing to their robust generalizability. Recently, a body of +work has investigated utilizing these models in open-vocabulary semantic +segmentation (OVSS). However, existing approaches often rely on impractical +supervised pre-training or access to additional pre-trained networks. In this +work, we propose a strong baseline for training-free OVSS, termed +Neighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of +CLIP tailored for this scenario. Our method enforces localization of patches in +the self-attention of CLIP's vision transformer which, despite being crucial +for dense prediction tasks, has been overlooked in the OVSS literature. By +incorporating design choices favouring segmentation, our approach significantly +improves performance without requiring additional data, auxiliary pre-trained +networks, or extensive hyperparameter tuning, making it highly practical for +real-world applications. Experiments are performed on 8 popular semantic +segmentation benchmarks, yielding state-of-the-art performance on most +scenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP . + +
+
+
+
+
+ + ☆ Uncertainty Quantification in Detecting Choroidal Metastases on MRI via + Evolutionary Strategies + + +
+ Uncertainty quantification plays a vital role in facilitating the practical
+implementation of AI in radiology by addressing growing concerns around
+trustworthiness. Given the challenges associated with acquiring large,
+annotated datasets in this field, there is a need for methods that enable
+uncertainty quantification in small data AI approaches tailored to radiology
+images. In this study, we focused on uncertainty quantification within the
+context of the small data evolutionary strategies-based technique of deep
+neuroevolution (DNE). Specifically, we employed DNE to train a simple
+Convolutional Neural Network (CNN) with MRI images of the eyes for binary
+classification. The goal was to distinguish between normal eyes and those with
+metastatic tumors called choroidal metastases. The training set comprised 18
+images with choroidal metastases and 18 without tumors, while the testing set
+contained a tumor-to-normal ratio of 15:15.
+ We trained CNN model weights via DNE for approximately 40,000 episodes,
+ultimately reaching a convergence of 100% accuracy on the training set. We
+saved all models that achieved maximal training set accuracy. Then, by applying
+these models to the testing set, we established an ensemble method for
+uncertainty quantification. The saved set of models produced distributions for
+each testing set image between the two classes of normal and tumor-containing.
+The relative frequencies permitted uncertainty quantification of model
+predictions. Intriguingly, we found that subjective features appreciated by
+human radiologists explained images for which uncertainty was high,
+highlighting the significance of uncertainty quantification in AI-driven
+radiological analyses.
+
+

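+ The relative-frequency ensemble described above can be computed in a few
+lines; this is a generic sketch with hypothetical inputs, not the authors'
+code.
+
+   import numpy as np
+
+   def ensemble_uncertainty(votes):
+       """votes: (n_models, n_images) array of 0/1 predictions (normal vs.
+       tumor) from all saved models. Returns the per-image tumor frequency and
+       a simple entropy-based uncertainty score."""
+       p_tumor = votes.mean(axis=0)
+       p = np.clip(np.stack([1 - p_tumor, p_tumor], axis=1), 1e-12, 1.0)
+       entropy = -(p * np.log2(p)).sum(axis=1)   # 0 = unanimous, 1 = maximally uncertain
+       return p_tumor, entropy
+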
+
+
+
+
+ + ☆ Structured Model Pruning for Efficient Inference in Computational + Pathology + + +
+ Recent years have seen significant efforts to adopt Artificial Intelligence
+(AI) in healthcare for various use cases, from computer-aided diagnosis to ICU
+triage. However, the size of AI models has been rapidly growing due to scaling
+laws and the success of foundational models, which poses an increasing
+challenge to leverage advanced models in practical applications. It is thus
+imperative to develop efficient models, especially for deploying AI solutions
+under resource constraints or with time sensitivity. One potential solution is
+to perform model compression, a set of techniques that remove less important
+model components or reduce parameter precision, to reduce model computation
+demand. In this work, we demonstrate that model pruning, as a model compression
+technique, can effectively reduce inference cost for computational and digital
+pathology-based analysis with a negligible loss of analysis performance. To
+this end, we develop a methodology for pruning the widely used U-Net-style
+architectures in biomedical imaging, with which we evaluate multiple pruning
+heuristics on nuclei instance segmentation and classification, and empirically
+demonstrate that pruning can compress models by at least 70% with a negligible
+drop in performance.
+
+

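+ As a small illustration of structured pruning (not the paper's specific
+heuristics or U-Net handling), PyTorch's pruning utilities can zero out whole
+output channels of every convolution:
+
+   import torch.nn as nn
+   import torch.nn.utils.prune as prune
+
+   def prune_conv_channels(model, amount=0.7):
+       """L2 structured pruning over the output-channel dimension of each
+       Conv2d. Note this only zeroes channels; physically shrinking the layers
+       requires rebuilding the architecture afterwards."""
+       for module in model.modules():
+           if isinstance(module, nn.Conv2d):
+               prune.ln_structured(module, name="weight", amount=amount, n=2, dim=0)
+               prune.remove(module, "weight")   # make the pruning permanent
+       return model
+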
+
+
+
+
+ + ☆ "Don't forget to put the milk back!" Dataset for Enabling Embodied + Agents to Detect Anomalous Situations + + +
+ Home robots intend to make their users' lives easier. Our work assists in this
+goal by enabling robots to inform their users of dangerous or unsanitary
+anomalies in their home. Some examples of these anomalies include the user
+leaving their milk out, forgetting to turn off the stove, or leaving poison
+accessible to children. To move towards enabling home robots with these
+abilities, we have created a new dataset, which we call SafetyDetect. The
+SafetyDetect dataset consists of 1000 anomalous home scenes, each of which
+contains unsafe or unsanitary situations for an agent to detect. Our approach
+utilizes large language models (LLMs) alongside both a graph representation of
+the scene and the relationships between the objects in the scene. Our key
+insight is that this connected scene graph and the object relationships it
+encodes enable the LLM to better reason about the scene -- especially as it
+relates to detecting dangerous or unsanitary situations. Our most promising
+approach utilizes GPT-4 and pursues a categorization technique where object
+relations from the scene graph are classified as normal, dangerous, unsanitary,
+or dangerous for children. This method is able to correctly identify over 90%
+of anomalous scenarios in the SafetyDetect Dataset. Additionally, we conduct
+real world experiments on a ClearPath TurtleBot where we generate a scene graph
+from visuals of the real world scene, and run our approach with no
+modification. This setup resulted in little performance loss. The SafetyDetect
+Dataset and code will be released to the public upon this paper's publication.
+
+

+
+
+
+
+ + ☆ Single-image driven 3d viewpoint training data augmentation for + effective wine label recognition + + +
+ Confronting the critical challenge of insufficient training data in the field +of complex image recognition, this paper introduces a novel 3D viewpoint +augmentation technique specifically tailored for wine label recognition. This +method enhances deep learning model performance by generating visually +realistic training samples from a single real-world wine label image, +overcoming the challenges posed by the intricate combinations of text and +logos. Classical Generative Adversarial Network (GAN) methods fall short in +synthesizing such intricate content combination. Our proposed solution +leverages time-tested computer vision and image processing strategies to expand +our training dataset, thereby broadening the range of training samples for deep +learning applications. This innovative approach to data augmentation +circumvents the constraints of limited training resources. Using the augmented +training images through batch-all triplet metric learning on a Vision +Transformer (ViT) architecture, we can get the most discriminative embedding +features for every wine label, enabling us to perform one-shot recognition of +existing wine labels in the training classes or future newly collected wine +labels unavailable in the training. Experimental results show a significant +increase in recognition accuracy over conventional 2D data augmentation +techniques. + +
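+ A crude approximation of the viewpoint augmentation idea (ignoring bottle
+curvature and lighting, which the paper's 3D approach would handle) is a random
+planar homography over the flat label image:
+
+   import cv2
+   import numpy as np
+
+   def random_viewpoint(label_img, max_tilt=0.15, rng=None):
+       """Warp a flat wine-label image to a randomly perturbed quadrilateral,
+       simulating a change of camera viewpoint with a planar homography."""
+       rng = rng or np.random.default_rng()
+       h, w = label_img.shape[:2]
+       src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
+       jitter = rng.uniform(-max_tilt, max_tilt, size=(4, 2)) * [w, h]
+       dst = np.float32(src + jitter)
+       H = cv2.getPerspectiveTransform(src, dst)
+       return cv2.warpPerspective(label_img, H, (w, h),
+                                  borderMode=cv2.BORDER_REPLICATE)
+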
+
+
+
+
+ + ☆ E3: Ensemble of Expert Embedders for Adapting Synthetic Image Detectors + to New Generators Using Limited Data + + +
+ As generative AI progresses rapidly, new synthetic image generators continue +to emerge at a swift pace. Traditional detection methods face two main +challenges in adapting to these generators: the forensic traces of synthetic +images from new techniques can vastly differ from those learned during +training, and access to data for these new generators is often limited. To +address these issues, we introduce the Ensemble of Expert Embedders (E3), a +novel continual learning framework for updating synthetic image detectors. E3 +enables the accurate detection of images from newly emerged generators using +minimal training data. Our approach does this by first employing transfer +learning to develop a suite of expert embedders, each specializing in the +forensic traces of a specific generator. Then, all embeddings are jointly +analyzed by an Expert Knowledge Fusion Network to produce accurate and reliable +detection decisions. Our experiments demonstrate that E3 outperforms existing +continual learning methods, including those developed specifically for +synthetic image detection. + +
+
+
+
+
+ + ☆ Real-time guidewire tracking and segmentation in intraoperative x-ray + + +
+ During endovascular interventions, physicians have to perform accurate and +immediate operations based on the available real-time information, such as the +shape and position of guidewires observed on the fluoroscopic images, haptic +information and the patients' physiological signals. For this purpose, +real-time and accurate guidewire segmentation and tracking can enhance the +visualization of guidewires and provide visual feedback for physicians during +the intervention as well as for robot-assisted interventions. Nevertheless, +this task often comes with the challenge of elongated deformable structures +that present themselves with low contrast in the noisy fluoroscopic image +sequences. To address these issues, a two-stage deep learning framework for +real-time guidewire segmentation and tracking is proposed. In the first stage, +a Yolov5s detector is trained, using the original X-ray images as well as +synthetic ones, which is employed to output the bounding boxes of possible +target guidewires. More importantly, a refinement module based on +spatiotemporal constraints is incorporated to robustly localize the guidewire +and remove false detections. In the second stage, a novel and efficient network +is proposed to segment the guidewire in each detected bounding box. The network +contains two major modules, namely a hessian-based enhancement embedding module +and a dual self-attention module. Quantitative and qualitative evaluations on +clinical intra-operative images demonstrate that the proposed approach +significantly outperforms our baselines as well as the current state of the art +and, in comparison, shows higher robustness to low quality images. + +
+
+
+
+
+ + ☆ Semantic Approach to Quantifying the Consistency of Diffusion Model + Image Generation CVPR 3 + + +
+ In this study, we identify the need for an interpretable, quantitative score
+of the repeatability, or consistency, of image generation in diffusion models.
+We propose a semantic approach, using a pairwise mean CLIP (Contrastive
+Language-Image Pretraining) score as our semantic consistency score. We applied
+this metric to compare two state-of-the-art open-source image generation
+diffusion models, Stable Diffusion XL and PixArt-α, and we found statistically
+significant differences between the semantic consistency scores for the models.
+Agreement between the Semantic Consistency Score selected model and aggregated
+human annotations was 94%. We also explored the consistency of SDXL and a
+LoRA-fine-tuned version of SDXL and found that the fine-tuned model had
+significantly higher semantic consistency in generated images. The Semantic
+Consistency Score proposed here offers a measure of image generation alignment,
+facilitating the evaluation of model architectures for specific tasks and
+aiding in informed decision-making regarding model selection.
+
+

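+ A pairwise mean CLIP score of this kind can be sketched with the Hugging Face
+CLIP implementation; the checkpoint and preprocessing below are assumptions,
+not necessarily those used by the authors.
+
+   import itertools
+   import torch
+   from transformers import CLIPModel, CLIPProcessor
+
+   def semantic_consistency_score(images, name="openai/clip-vit-base-patch32"):
+       """images: a list of PIL images generated from the same prompt.
+       Returns the mean pairwise cosine similarity of their CLIP embeddings."""
+       model = CLIPModel.from_pretrained(name)
+       processor = CLIPProcessor.from_pretrained(name)
+       with torch.no_grad():
+           feats = model.get_image_features(
+               **processor(images=images, return_tensors="pt"))
+       feats = feats / feats.norm(dim=-1, keepdim=True)
+       pairs = itertools.combinations(range(len(images)), 2)
+       sims = [float(feats[i] @ feats[j]) for i, j in pairs]
+       return sum(sims) / len(sims)
+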
+
+ comment: Accepted to 2024 CVPR 3rd Explainable AI for Computer Vision (XAI4CV) + Workshop +
+
+
+
+
+ + ☆ Detecting AI-Generated Images via CLIP + + +
+ As AI-generated image (AIGI) methods become more powerful and accessible, it +has become a critical task to determine if an image is real or AI-generated. +Because AIGI lack the signatures of photographs and have their own unique +patterns, new models are needed to determine if an image is AI-generated. In +this paper, we investigate the ability of the Contrastive Language-Image +Pre-training (CLIP) architecture, pre-trained on massive internet-scale data +sets, to perform this differentiation. We fine-tune CLIP on real images and +AIGI from several generative models, enabling CLIP to determine if an image is +AI-generated and, if so, determine what generation method was used to create +it. We show that the fine-tuned CLIP architecture is able to differentiate AIGI +as well or better than models whose architecture is specifically designed to +detect AIGI. Our method will significantly increase access to AIGI-detecting +tools and reduce the negative effects of AIGI on society, as our CLIP +fine-tuning procedures require no architecture changes from publicly available +model repositories and consume significantly less GPU resources than other AIGI +detection models. + +
+
+ comment: submitted for publication in Machine Vision and Applications +
+
+
+
+
+ + ☆ Under pressure: learning-based analog gauge reading in the wild ICRA + + +
+ We propose an interpretable framework for reading analog gauges that is +deployable on real world robotic systems. Our framework splits the reading task +into distinct steps, such that we can detect potential failures at each step. +Our system needs no prior knowledge of the type of gauge or the range of the +scale and is able to extract the units used. We show that our gauge reading +algorithm is able to extract readings with a relative reading error of less +than 2%. + +
+
+ comment: 7 pages, 8 figures, accepted for presentation at the 2024 IEEE + International Conference on Robotics and Automation (ICRA) and for inclusion + in the conference proceedings, finalist for the IEEE ICRA 2024 Best Paper + Award in Automation, source code + https://github.com/ethz-asl/analog_gauge_reader, Autonomous Systems Lab, ETH + Zurich +
+
+
+
+
+ + ☆ Towards Sim-to-Real Industrial Parts Classification with Synthetic + Dataset CVPR + + +
+ This paper is about effectively utilizing synthetic data for training deep +neural networks for industrial parts classification, in particular, by taking +into account the domain gap against real-world images. To this end, we +introduce a synthetic dataset that may serve as a preliminary testbed for the +Sim-to-Real challenge; it contains 17 objects of six industrial use cases, +including isolated and assembled parts. A few subsets of objects exhibit large +similarities in shape and albedo for reflecting challenging cases of industrial +parts. All the sample images come with and without random backgrounds and +post-processing for evaluating the importance of domain randomization. We call +it Synthetic Industrial Parts dataset (SIP-17). We study the usefulness of +SIP-17 through benchmarking the performance of five state-of-the-art deep +network models, supervised and self-supervised, trained only on the synthetic +data while testing them on real data. By analyzing the results, we deduce some +insights on the feasibility and challenges of using synthetic data for +industrial parts classification and for further developing larger-scale +synthetic datasets. Our dataset and code are publicly available. + +
+
+ comment: Published in 2023 IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) +
+
+
+
+
+ + ☆ LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning + + +
+ Understanding human instructions to identify the target objects is vital for +perception systems. In recent years, the advancements of Large Language Models +(LLMs) have introduced new possibilities for image segmentation. In this work, +we delve into reasoning segmentation, a novel task that enables segmentation +system to reason and interpret implicit user intention via large language model +reasoning and then segment the corresponding target. Our work on reasoning +segmentation contributes on both the methodological design and dataset +labeling. For the model, we propose a new framework named LLM-Seg. LLM-Seg +effectively connects the current foundational Segmentation Anything Model and +the LLM by mask proposals selection. For the dataset, we propose an automatic +data generation pipeline and construct a new reasoning segmentation dataset +named LLM-Seg40K. Experiments demonstrate that our LLM-Seg exhibits competitive +performance compared with existing methods. Furthermore, our proposed pipeline +can efficiently produce high-quality reasoning segmentation datasets. The +LLM-Seg40K dataset, developed through this pipeline, serves as a new benchmark +for training and evaluating various reasoning segmentation approaches. Our +code, models and dataset are at https://github.com/wangjunchi/LLMSeg. + +
+
+ comment: Github: https://github.com/wangjunchi/LLMSeg +
+
+
+
+
+ + ☆ `Eyes of a Hawk and Ears of a Fox': Part Prototype Network for + Generalized Zero-Shot Learning CVPR 2024 + + +
+ Current approaches in Generalized Zero-Shot Learning (GZSL) are built upon +base models which consider only a single class attribute vector representation +over the entire image. This is an oversimplification of the process of novel +category recognition, where different regions of the image may have properties +from different seen classes and thus have different predominant attributes. +With this in mind, we take a fundamentally different approach: a pre-trained +Vision-Language detector (VINVL) sensitive to attribute information is employed +to efficiently obtain region features. A learned function maps the region +features to region-specific attribute attention used to construct class part +prototypes. We conduct experiments on a popular GZSL benchmark consisting of +the CUB, SUN, and AWA2 datasets where our proposed Part Prototype Network (PPN) +achieves promising results when compared with other popular base models. +Corresponding ablation studies and analysis show that our approach is highly +practical and has a distinct advantage over global attribute attention when +localized proposals are available. + +
+
+ comment: Accepted to the CVPR 2024 LIMIT Workshop +
+
+
+
+
+ + ☆ SCOUT+: Towards Practical Task-Driven Drivers' Gaze Prediction + + +
+ Accurate prediction of drivers' gaze is an important component of +vision-based driver monitoring and assistive systems. Of particular interest +are safety-critical episodes, such as performing maneuvers or crossing +intersections. In such scenarios, drivers' gaze distribution changes +significantly and becomes difficult to predict, especially if the task and +context information is represented implicitly, as is common in many +state-of-the-art models. However, explicit modeling of top-down factors +affecting drivers' attention often requires additional information and +annotations that may not be readily available. + In this paper, we address the challenge of effective modeling of task and +context with common sources of data for use in practical systems. To this end, +we introduce SCOUT+, a task- and context-aware model for drivers' gaze +prediction, which leverages route and map information inferred from commonly +available GPS data. We evaluate our model on two datasets, DR(eye)VE and BDD-A, +and demonstrate that using maps improves results compared to bottom-up models +and reaches performance comparable to the top-down model SCOUT which relies on +privileged ground truth information. Code is available at +https://github.com/ykotseruba/SCOUT. + +
+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+ + ☆ Training a Vision Language Model as Smartphone Assistant ICLR 2024 + + +
+ Addressing the challenge of a digital assistant capable of executing a wide +array of user tasks, our research focuses on the realm of instruction-based +mobile device control. We leverage recent advancements in large language models +(LLMs) and present a visual language model (VLM) that can fulfill diverse tasks +on mobile devices. Our model functions by interacting solely with the user +interface (UI). It uses the visual input from the device screen and mimics +human-like interactions, encompassing gestures such as tapping and swiping. +This generality in the input and output space allows our agent to interact with +any application on the device. Unlike previous methods, our model operates not +only on a single screen image but on vision-language sentences created from +sequences of past screenshots along with corresponding actions. Evaluating our +method on the challenging Android in the Wild benchmark demonstrates its +promising efficacy and potential. + +
+
+ comment: ICLR 2024 workshop on Generative Models for Decision Making +
+
+
+
+
+ + ☆ Data Limitations for Modeling Top-Down Effects on Drivers' Attention + + +
+ Driving is a visuomotor task, i.e., there is a connection between what
+drivers see and what they do. While some models of drivers' gaze account for
+top-down effects of drivers' actions, the majority learn only bottom-up
+correlations between human gaze and driving footage. The crux of the problem is
+the lack of public data with annotations that could be used to train top-down
+models and evaluate how well models of any kind capture effects of task on
+attention. As a result, top-down models are trained and evaluated on private
+data and public benchmarks measure only the overall fit to human data.
+ In this paper, we focus on data limitations by examining four large-scale
+public datasets, DR(eye)VE, BDD-A, MAAD, and LBW, used to train and evaluate
+algorithms for drivers' gaze prediction. We define a set of driving tasks
+(lateral and longitudinal maneuvers) and context elements (intersections and
+right-of-way) known to affect drivers' attention, augment the datasets with
+annotations based on the said definitions, and analyze the characteristics of
+data recording and processing pipelines w.r.t. capturing what the drivers see
+and do. In sum, the contributions of this work are: 1) quantifying biases of
+the public datasets, 2) examining performance of the SOTA bottom-up models on
+subsets of the data involving non-trivial drivers' actions, 3) linking
+shortcomings of the bottom-up models to data limitations, and 4)
+recommendations for future data collection and processing. The new annotations
+and code for reproducing the results are available at
+https://github.com/ykotseruba/SCOUT.
+
+

+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+ + ☆ Multi-Branch Generative Models for Multichannel Imaging with an + Application to PET/CT Joint Reconstruction + + +
+ This paper presents a proof-of-concept approach for learned synergistic +reconstruction of medical images using multi-branch generative models. +Leveraging variational autoencoders (VAEs) and generative adversarial networks +(GANs), our models learn from pairs of images simultaneously, enabling +effective denoising and reconstruction. Synergistic image reconstruction is +achieved by incorporating the trained models in a regularizer that evaluates +the distance between the images and the model, in a similar fashion to +multichannel dictionary learning (DiL). We demonstrate the efficacy of our +approach on both Modified National Institute of Standards and Technology +(MNIST) and positron emission tomography (PET)/computed tomography (CT) +datasets, showcasing improved image quality and information sharing between +modalities. Despite challenges such as patch decomposition and model +limitations, our results underscore the potential of generative models for +enhancing medical imaging reconstruction. + +
+
+ comment: 12 pages, 16 figures, submitted to IEEE TRPMS +
+
+
+
+
+ + ☆ Into the Fog: Evaluating Multiple Object Tracking Robustness + + +
+ State-of-the-art (SOTA) trackers have shown remarkable Multiple Object
+Tracking (MOT) performance when trained and evaluated on current benchmarks.
+However, these benchmarks primarily consist of clear scenarios, overlooking
+adverse atmospheric conditions such as fog, haze, smoke and dust. As a result,
+the robustness of SOTA trackers remains underexplored. To address these
+limitations, we propose a pipeline for physics-based volumetric fog simulation
+in arbitrary real-world MOT datasets utilizing frame-by-frame monocular depth
+estimation and a fog formation optical model. Moreover, we enhance our
+simulation by rendering both homogeneous and heterogeneous fog effects. We
+propose to use the dark channel prior method to estimate fog (smoke) color,
+which shows promising results even in night and indoor scenes. We present the
+leading tracking benchmark MOTChallenge (MOT17 dataset) overlaid by fog (smoke
+for indoor scenes) of various intensity levels and conduct a comprehensive
+evaluation of SOTA MOT methods, revealing their limitations under fog and
+fog-similar challenges.
+
+

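+ The dark-channel-prior estimate of the fog (atmospheric) color can be sketched
+as below; the patch size and pixel fraction are common defaults rather than
+values from the paper, and the loop is written for clarity, not speed.
+
+   import numpy as np
+
+   def estimate_fog_color(img, patch=15, top_frac=0.001):
+       """img: RGB image in [0, 1], shape (H, W, 3). Returns an estimate of the
+       atmospheric color from the brightest pixels among those with the highest
+       dark-channel values."""
+       h, w, _ = img.shape
+       pad = patch // 2
+       padded = np.pad(img, ((pad, pad), (pad, pad), (0, 0)), mode="edge")
+       dark = np.empty((h, w), dtype=img.dtype)
+       for y in range(h):
+           for x in range(w):
+               dark[y, x] = padded[y:y + patch, x:x + patch].min()
+       n = max(1, int(top_frac * h * w))
+       idx = np.argsort(dark.ravel())[-n:]          # highest dark-channel pixels
+       candidates = img.reshape(-1, 3)[idx]
+       return candidates[candidates.sum(axis=1).argmax()]
+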
+
+
+
+
+ + ☆ SEVD: Synthetic Event-based Vision Dataset for Ego and Fixed Traffic + Perception + + +
+ Recently, event-based vision sensors have gained attention for autonomous +driving applications, as conventional RGB cameras face limitations in handling +challenging dynamic conditions. However, the availability of real-world and +synthetic event-based vision datasets remains limited. In response to this gap, +we present SEVD, a first-of-its-kind multi-view ego, and fixed perception +synthetic event-based dataset using multiple dynamic vision sensors within the +CARLA simulator. Data sequences are recorded across diverse lighting (noon, +nighttime, twilight) and weather conditions (clear, cloudy, wet, rainy, foggy) +with domain shifts (discrete and continuous). SEVD spans urban, suburban, +rural, and highway scenes featuring various classes of objects (car, truck, +van, bicycle, motorcycle, and pedestrian). Alongside event data, SEVD includes +RGB imagery, depth maps, optical flow, semantic, and instance segmentation, +facilitating a comprehensive understanding of the scene. Furthermore, we +evaluate the dataset using state-of-the-art event-based (RED, RVT) and +frame-based (YOLOv8) methods for traffic participant detection tasks and +provide baseline benchmarks for assessment. Additionally, we conduct +experiments to assess the synthetic event-based dataset's generalization +capabilities. The dataset is available at +https://eventbasedvision.github.io/SEVD + +
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
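+ A rough sketch of similarity-based token reduction in this spirit (selection
+by similarity to the class token, then merging pruned tokens into their nearest
+kept token) is shown below; it is not the paper's exact selection or clustering
+procedure.
+
+   import torch
+   import torch.nn.functional as F
+
+   def reduce_visual_tokens(tokens, cls_token, keep=72):
+       """tokens: (N, D) visual tokens; cls_token: (D,). Returns (keep, D)."""
+       sim = F.normalize(tokens, dim=-1) @ F.normalize(cls_token, dim=0)
+       keep_idx = sim.topk(keep).indices
+       mask = torch.ones(tokens.shape[0], dtype=torch.bool)
+       mask[keep_idx] = False                       # True for pruned tokens
+       kept, pruned = tokens[keep_idx].clone(), tokens[mask]
+       if pruned.shape[0] > 0:
+           # assign each pruned token to its most similar kept token and average
+           assign = (F.normalize(pruned, dim=-1) @ F.normalize(kept, dim=-1).T).argmax(dim=1)
+           for k in range(keep):
+               group = pruned[assign == k]
+               if group.shape[0] > 0:
+                   kept[k] = torch.cat([kept[k:k + 1], group], dim=0).mean(dim=0)
+       return kept
+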
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a +comprehensive understanding of objects across scenes but also a method to +capture the temporal motions and interactions with different objects. Moreover, +the long-tailed distribution of visual relationships is a crucial bottleneck +for most dynamic SGG methods. This is because many of them focus on capturing +spatio-temporal context using complex architectures, leading to the generation +of biased scene graphs. To address these challenges, we propose FloCoDe: +Flow-aware Temporal Consistency and Correlation Debiasing with uncertainty +attenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping +using flow to detect temporally consistent objects across frames. To address +the long-tail issue of visual relationships, we propose correlation debiasing +and a label correlation-based loss to learn unbiased relation representations +for long-tailed classes. Specifically, we propose to incorporate label +correlations using contrastive loss to capture commonly co-occurring relations, +which aids in learning robust representations for long-tailed classes. Further, +we adopt the uncertainty attenuation-based classifier framework to handle noisy +annotations in the SGG data. Extensive experimental evaluation shows a +performance gain as high as 4.1%, demonstrating the superiority of generating +more unbiased scene graphs. + +
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models +such as CLIP has spurred their widespread adoption in addressing numerous +downstream tasks. Previous methods have employed test-time prompt tuning to +adapt the model to unseen domains, but they overlooked the issue of imbalanced +class distributions. In this study, we explicitly address this problem by +employing class-aware prototype alignment weighted by mean class probabilities +obtained for the test sample and filtered augmented views. Additionally, we +ensure that the class probabilities are as accurate as possible by performing +prototype discrimination using contrastive learning. The combination of +alignment and discriminative loss serves as a geometric regularizer, preventing +the prompt representation from collapsing onto a single class and effectively +bridging the distribution gap between the source and test domains. Our method, +named PromptSync, synchronizes the prompts for each test sample on both the +text and vision branches of the V-L model. In empirical evaluations on the +domain generalization benchmark, our method outperforms previous best methods +by 2.33% in overall performance, by 1% in base-to-novel generalization, and by +2.84% in cross-dataset transfer tasks. + +
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ♻ ☆ WonderJourney: Going from Anywhere to Everywhere + + +
+ We introduce WonderJourney, a modularized framework for perpetual 3D scene +generation. Unlike prior work on view generation that focuses on a single type +of scene, we start at any user-provided location (by a text description or an +image) and generate a journey through a long sequence of diverse yet coherently +connected 3D scenes. We leverage an LLM to generate textual descriptions of the +scenes in this journey, a text-driven point cloud generation pipeline to make a +compelling and coherent sequence of 3D scenes, and a large VLM to verify the +generated scenes. We show compelling, diverse visual results across various +scene types and styles, forming imaginary "wonderjourneys". Project website: +https://kovenyu.com/WonderJourney/ + +&#13;
+
+ comment: Project website with video results: + https://kovenyu.com/WonderJourney/ +
+
+
+
+
+ + ♻ ☆ ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label + Visual Classification ICASSP 2024 + + +
+ Multi-label image classification presents a challenging task in many domains, +including computer vision and medical imaging. Recent advancements have +introduced graph-based and transformer-based methods to improve performance and +capture label dependencies. However, these methods often include complex +modules that entail heavy computation and lack interpretability. In this paper, +we propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel +framework to address these challenges in multi-label image classification +tasks. Our simple yet effective approach employs supervised contrastive +learning, in which samples that share enough labels with an anchor image based +on a decision threshold are introduced as a positive set. This structure +captures label dependencies by pulling positive pair embeddings together and +pushing away negative samples that fall below the threshold. We enhance +representation learning by incorporating a mixture density network into +contrastive learning and generating Gaussian mixture distributions to explore +the epistemic uncertainty of the feature encoder. We validate the effectiveness +of our framework through experimentation with datasets from the computer vision +and medical imaging domains. Our method outperforms the existing +state-of-the-art methods while achieving a low computational footprint on both +datasets. Visualization analyses also demonstrate that ProbMCL-learned +classifiers maintain a meaningful semantic topology. + +
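A minimal sketch of the positive-set construction described above is given below, under the assumption that positives are pairs sharing at least a threshold number of labels; the threshold, temperature, and the plain InfoNCE-style loss are illustrative choices, not the paper's exact formulation (which additionally uses a mixture density network).

```python
import torch
import torch.nn.functional as F

def label_overlap_contrastive(z, labels, overlap_thresh=2, temperature=0.1):
    """z: embeddings (B, D); labels: multi-hot label vectors (B, C)."""
    z = F.normalize(z, dim=1)
    logits = z @ z.T / temperature                      # pairwise similarity logits
    shared = labels.float() @ labels.float().T          # number of shared labels per pair
    eye = torch.eye(z.size(0), dtype=torch.bool, device=z.device)

    pos = (shared >= overlap_thresh) & ~eye             # positives: enough shared labels
    logits = logits.masked_fill(eye, float("-inf"))     # never contrast a sample with itself
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)

    # Average log-probability over each anchor's positive set; anchors without
    # positives are skipped.
    pos_counts = pos.sum(dim=1)
    valid = pos_counts > 0
    loss = -(log_prob.masked_fill(~pos, 0.0)).sum(dim=1)[valid] / pos_counts[valid]
    return loss.mean()

# Usage: 16 samples, 128-d embeddings, 20 possible labels.
loss = label_overlap_contrastive(torch.randn(16, 128),
                                 torch.randint(0, 2, (16, 20)))
```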
+
+ comment: This paper has been accepted for the ICASSP 2024 - 2024 IEEE + International Conference on Acoustics, Speech and Signal Processing (ICASSP) +
+
+
+
+
+ + ♻ ☆ A Change Detection Reality Check + + +
+ In recent years, there has been an explosion of proposed change detection +deep learning architectures in the remote sensing literature. These approaches +claim to offer state-of-the-art performance on different standard benchmark +datasets. However, has the field truly made significant progress? In this paper +we perform experiments which conclude that a simple U-Net segmentation baseline +without training tricks or complicated architectural changes is still a top +performer for the task of change detection. + +&#13;
+
+
+
+
+ + ♻ ☆ Generalization in diffusion models arises from geometry-adaptive + harmonic representations ICLR + + +
+ Deep neural networks (DNNs) trained for image denoising are able to generate +high-quality samples with score-based reverse diffusion algorithms. These +impressive capabilities seem to imply an escape from the curse of +dimensionality, but recent reports of memorization of the training set raise +the question of whether these networks are learning the "true" continuous +density of the data. Here, we show that two DNNs trained on non-overlapping +subsets of a dataset learn nearly the same score function, and thus the same +density, when the number of training images is large enough. In this regime of +strong generalization, diffusion-generated images are distinct from the +training set, and are of high visual quality, suggesting that the inductive +biases of the DNNs are well-aligned with the data density. We analyze the +learned denoising functions and show that the inductive biases give rise to a +shrinkage operation in a basis adapted to the underlying image. Examination of +these bases reveals oscillating harmonic structures along contours and in +homogeneous regions. We demonstrate that trained denoisers are inductively +biased towards these geometry-adaptive harmonic bases since they arise not only +when the network is trained on photographic images, but also when it is trained +on image classes supported on low-dimensional manifolds for which the harmonic +basis is suboptimal. Finally, we show that when trained on regular image +classes for which the optimal basis is known to be geometry-adaptive and +harmonic, the denoising performance of the networks is near-optimal. + +
+
+ comment: Accepted for oral presentation at ICLR, Vienna, May 2024 +
+
+
+
+
+ + ♻ ☆ A novel Fourier neural operator framework for classification of + multi-sized images: Application to three dimensional digital porous media + + +
+ Fourier neural operators (FNOs) are invariant with respect to the size of +input images, and thus images with any size can be fed into FNO-based +frameworks without any modification of network architectures, in contrast to +traditional convolutional neural networks (CNNs). Leveraging the advantage of +FNOs, we propose a novel deep-learning framework for classifying images with +varying sizes. Particularly, we simultaneously train the proposed network on +multi-sized images. As a practical application, we consider the problem of +predicting the label (e.g., permeability) of three-dimensional digital porous +media. To construct the framework, an intuitive approach is to connect FNO +layers to a classifier using adaptive max pooling. First, we show that this +approach is only effective for porous media with fixed sizes, whereas it fails +for porous media of varying sizes. To overcome this limitation, we introduce +our approach: instead of using adaptive max pooling, we use static max pooling +with an output size equal to the channel width of the FNO layers. Since the channel width of the +FNO layers is independent of input image size, the introduced framework can +handle multi-sized images during training. We show the effectiveness of the +introduced framework and compare its performance with the intuitive approach +through the example of the classification of three-dimensional digital porous +media of varying sizes. + +&#13;
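The size-invariance argument above can be illustrated with a small PyTorch sketch: pooling the FNO feature maps over all spatial dimensions yields one value per channel, so the classifier input length equals the channel width regardless of the input volume's size. This is an interpretation of the described head, not the authors' code; the channel width and tensor shapes are assumptions.

```python
import torch
import torch.nn as nn

class SizeInvariantHead(nn.Module):
    def __init__(self, channel_width: int, num_classes: int):
        super().__init__()
        self.classifier = nn.Linear(channel_width, num_classes)

    def forward(self, fno_features: torch.Tensor) -> torch.Tensor:
        # (B, C, D, H, W) -> (B, C): one pooled value per channel,
        # independent of the spatial size of the input volume.
        pooled = torch.amax(fno_features, dim=(2, 3, 4))
        return self.classifier(pooled)

# Works unchanged for 64^3 and 128^3 porous-media volumes:
head = SizeInvariantHead(channel_width=32, num_classes=1)
print(head(torch.randn(2, 32, 64, 64, 64)).shape)      # torch.Size([2, 1])
print(head(torch.randn(2, 32, 128, 128, 128)).shape)   # torch.Size([2, 1])
```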
+
+
+
+
+ + ♻ ☆ View-Consistent 3D Editing with Gaussian Splatting + + +
+ The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing, +offering efficient, high-fidelity rendering and enabling precise local +manipulations. Currently, diffusion-based 2D editing models are harnessed to +modify multi-view rendered images, which then guide the editing of 3DGS models. +However, this approach faces a critical issue of multi-view inconsistency, +where the guidance images exhibit significant discrepancies across views, +leading to mode collapse and visual artifacts of 3DGS. To this end, we +introduce View-consistent Editing (VcEdit), a novel framework that seamlessly +incorporates 3DGS into image editing processes, ensuring multi-view consistency +in edited guidance images and effectively mitigating mode collapse issues. +VcEdit employs two innovative consistency modules: the Cross-attention +Consistency Module and the Editing Consistency Module, both designed to reduce +inconsistencies in edited images. By incorporating these consistency modules +into an iterative pattern, VcEdit proficiently resolves the issue of multi-view +inconsistency, facilitating high-quality 3DGS editing across a diverse range of +scenes. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis +(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D +Gaussian Splatting (3DGS) is the quality of the training images. However, +conventional RGB cameras are susceptible to motion blur. In contrast, +neuromorphic cameras like event and spike cameras inherently capture more +comprehensive temporal information, which can provide a sharp representation of +the scene as additional training data. Recent methods have explored the +integration of event cameras to improve the quality of NVS. The event-RGB +approaches have some limitations, such as high training costs and the inability +to work effectively in the background. Instead, our study introduces a new +method that uses the spike camera to overcome these limitations. By considering +texture reconstruction from spike streams as ground truth, we design the +Texture from Spike (TfS) loss. Since the spike camera relies on temporal +integration instead of temporal differentiation used by event cameras, our +proposed TfS loss maintains manageable training costs. It handles foreground +objects and backgrounds simultaneously. We also provide a real-world dataset +captured with our spike-RGB camera system to facilitate future research +endeavors. We conduct extensive experiments using synthetic and real-world +datasets to demonstrate that our design can enhance novel view synthesis across +NeRF and 3DGS. The code and dataset will be made available for public access. + +&#13;
+
+
+
+
+ + ♻ ☆ Identifying Important Group of Pixels using Interactions CVPR 2024 + + +
+ To better understand the behavior of image classifiers, it is useful to +visualize the contribution of individual pixels to the model prediction. In +this study, we propose a method, MoXI ($\textbf{Mo}$del e$\textbf{X}$planation +by $\textbf{I}$nteractions), that efficiently and accurately identifies a group +of pixels with high prediction confidence. The proposed method employs +game-theoretic concepts, Shapley values and interactions, taking into account +the effects of individual pixels and the cooperative influence of pixels on +model confidence. Theoretical analysis and experiments demonstrate that our +method better identifies the pixels that are highly contributing to the model +outputs than widely-used visualization by Grad-CAM, Attention rollout, and +Shapley value. While prior studies have suffered from the exponential +computational cost in the computation of Shapley value and interactions, we +show that this can be reduced to quadratic cost for our task. The code is +available at https://github.com/KosukeSumiyasu/MoXI. + +
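As a simplified illustration of the game-theoretic quantities mentioned above, the sketch below estimates a pairwise interaction between two pixel groups from four masked forward passes; `mask_keep` is a hypothetical helper (keep only the listed groups visible, mask the rest), and the use of softmax confidence as the value function is an assumption, not the MoXI implementation.

```python
import torch

@torch.no_grad()
def pairwise_interaction(model, image, target, i, j, mask_keep):
    """Interaction between pixel groups i and j for the target class."""
    def confidence(groups):
        # Model confidence when only `groups` are visible in the image.
        x = mask_keep(image, groups).unsqueeze(0)
        return model(x).softmax(dim=-1)[0, target].item()

    # I(i, j) = v({i, j}) - v({i}) - v({j}) + v({})
    return (confidence([i, j]) - confidence([i])
            - confidence([j]) + confidence([]))
```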
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FoodLMM: A Versatile Food Assistant using Large Multi-modal Model + + +
+ Large Multi-modal Models (LMMs) have made impressive progress in many +vision-language tasks. Nevertheless, the performance of general LMMs in +specific domains is still far from satisfactory. This paper proposes FoodLMM, a +versatile food assistant based on LMMs with various capabilities, including +food recognition, ingredient recognition, recipe generation, nutrition +estimation, food segmentation and multi-round conversation. To facilitate +FoodLMM to deal with tasks beyond pure text output, we introduce a series of +novel task-specific tokens and heads, enabling the model to predict food +nutritional values and multiple segmentation masks. We adopt a two-stage +training strategy. In the first stage, we utilize multiple public food +benchmarks for multi-task learning by leveraging the instruct-following +paradigm. In the second stage, we construct a multi-round conversation dataset +and a reasoning segmentation dataset to fine-tune the model, enabling it to +conduct professional dialogues and generate segmentation masks based on complex +reasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art +results across several food benchmarks. We will make our code, models and +datasets publicly available. + +
+
+
+
+
+ + ♻ ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024). arXiv admin note: text overlap with + arXiv:2205.05076 +
+
+
+
+
+ + ♻ ☆ WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space + + +
+ Modern learning-based approaches to 3D-aware image synthesis achieve high +photorealism and 3D-consistent viewpoint changes for the generated images. +Existing approaches represent instances in a shared canonical space. However, +for in-the-wild datasets a shared canonical system can be difficult to define +or might not even exist. In this work, we instead model instances in view +space, alleviating the need for posed images and learned camera distributions. +We find that in this setting, existing GAN-based methods are prone to +generating flat geometry and struggle with distribution coverage. We hence +propose WildFusion, a new approach to 3D-aware image synthesis based on latent +diffusion models (LDMs). We first train an autoencoder that infers a compressed +latent representation, which additionally captures the images' underlying 3D +structure and enables not only reconstruction but also novel view synthesis. To +learn a faithful 3D representation, we leverage cues from monocular depth +prediction. Then, we train a diffusion model in the 3D-aware latent space, +thereby enabling synthesis of high-quality 3D-consistent image samples, +outperforming recent state-of-the-art GAN-based methods. Importantly, our +3D-aware LDM is trained without any direct supervision from multiview images or +3D geometry and does not require posed images or learned pose or camera +distributions. It directly learns a 3D representation without relying on +canonical camera coordinates. This opens up promising research avenues for +scalable 3D-aware image synthesis and 3D content creation from in-the-wild +image data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D +results. + +
+
+
+
+
+ + ♻ ☆ Toward Reliable Human Pose Forecasting with Uncertainty + + +
+ Recently, there has been an arms race of pose forecasting methods aimed at +solving the spatio-temporal task of predicting a sequence of future 3D poses of +a person given a sequence of past observed ones. However, the lack of unified +benchmarks and limited uncertainty analysis have hindered progress in the +field. To address this, we first develop an open-source library for human pose +forecasting, including multiple models, supporting several datasets, and +employing standardized evaluation metrics, with the aim of promoting research +and moving toward a unified and consistent evaluation. Second, we devise two +types of uncertainty in the problem to increase performance and convey better +trust: 1) we propose a method for modeling aleatoric uncertainty by using +uncertainty priors to inject knowledge about the pattern of uncertainty. This +focuses the capacity of the model in the direction of more meaningful +supervision while reducing the number of learned parameters and improving +stability; 2) we introduce a novel approach for quantifying the epistemic +uncertainty of any model through clustering and measuring the entropy of its +assignments. Our experiments demonstrate up to $25\%$ improvements in +forecasting at short horizons, with no loss on longer horizons on Human3.6M, +AMASS, and 3DPW datasets, and better performance in uncertainty estimation. The +code is available online at https://github.com/vita-epfl/UnPOSed. + +&#13;
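The clustering-based epistemic uncertainty described above can be illustrated with a short sketch: cluster a set of sampled forecasts and take the entropy of the cluster-assignment distribution. The number of clusters, the use of k-means, and the flattened pose representation are illustrative assumptions, not the UnPOSed implementation.

```python
import numpy as np
from sklearn.cluster import KMeans

def epistemic_uncertainty(predictions: np.ndarray, n_clusters: int = 8) -> float:
    """predictions: sampled forecasts flattened to (num_samples, features)."""
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    assignments = kmeans.fit_predict(predictions)

    # Entropy of the empirical distribution over cluster assignments:
    # spread-out assignments (high entropy) indicate high epistemic uncertainty.
    counts = np.bincount(assignments, minlength=n_clusters).astype(float)
    probs = counts / counts.sum()
    probs = probs[probs > 0]
    return float(-(probs * np.log(probs)).sum())

# Example: 100 sampled 25-frame forecasts of 17 joints in 3D.
samples = np.random.randn(100, 25 * 17 * 3)
print(epistemic_uncertainty(samples))
```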
+
+ comment: Published in RA-L 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Masked Face Recognition Method during the COVID-19 Pandemic + + +
+ The coronavirus disease (COVID-19) is an unparalleled crisis leading to a +huge number of casualties and security problems. In order to reduce the spread +of coronavirus, people often wear masks to protect themselves. This makes face +recognition a very difficult task since certain parts of the face are hidden. A +primary focus of researchers during the ongoing coronavirus pandemic is to come +up with suggestions to handle this problem through rapid and efficient +solutions. In this paper, we propose a reliable method based on occlusion +removal and deep learning-based features in order to address the problem of +masked face recognition. The first step is to remove the masked face +region. Next, we apply three pre-trained deep Convolutional Neural Networks +(CNN) namely, VGG-16, AlexNet, and ResNet-50, and use them to extract deep +features from the obtained regions (mostly eyes and forehead regions). The +Bag-of-features paradigm is then applied to the feature maps of the last +convolutional layer in order to quantize them and to obtain a compact +representation compared to the fully connected layer of a classical CNN. +Finally, Multilayer Perceptron (MLP) is applied for the classification process. +Experimental results on the Real-World-Masked-Face-Dataset show high recognition +performance compared to other state-of-the-art methods. + +&#13;
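A simplified, hypothetical sketch of the described pipeline is given below, assuming occlusion removal has already produced upper-face crops: deep features from a pre-trained CNN (ResNet-50 here, one of the three backbones mentioned), a bag-of-features quantization of the last convolutional maps, and an MLP classifier. The codebook size, crop resolution, and placeholder labels are illustrative.

```python
import numpy as np
import torch
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier

# Last convolutional stage of a pre-trained ResNet-50 (drops avgpool and fc).
backbone = nn.Sequential(*list(resnet50(weights=ResNet50_Weights.DEFAULT).children())[:-2])
backbone.eval()

@torch.no_grad()
def local_descriptors(crops: torch.Tensor) -> np.ndarray:
    """(B, 3, 224, 224) upper-face crops -> (B * 49, 2048) local descriptors."""
    fmap = backbone(crops)                                # (B, 2048, 7, 7)
    return fmap.flatten(2).transpose(1, 2).reshape(-1, 2048).numpy()

def bag_of_features(descriptors, codebook, per_image=49):
    # Quantize each local descriptor to a visual word and build per-image histograms.
    words = codebook.predict(descriptors)
    hists = [np.bincount(words[i:i + per_image], minlength=codebook.n_clusters)
             for i in range(0, len(words), per_image)]
    return np.stack(hists).astype(float)

# Usage sketch: build a codebook on training descriptors, then train an MLP
# on the resulting compact histograms instead of full fc-layer features.
train_desc = local_descriptors(torch.randn(8, 3, 224, 224))
codebook = KMeans(n_clusters=64, n_init=10, random_state=0).fit(train_desc)
X = bag_of_features(train_desc, codebook)
y = np.random.randint(0, 4, size=len(X))                  # placeholder identities
clf = MLPClassifier(hidden_layer_sizes=(128,), max_iter=300).fit(X, y)
```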
+
+
+
+
+ + ♻ ☆ Impacts of Color and Texture Distortions on Earth Observation Data in + Deep Learning + + +
+ Land cover classification and change detection are two important applications +of remote sensing and Earth observation (EO) that have benefited greatly from +the advances of deep learning. Convolutional and transformer-based U-net models +are the state-of-the-art architectures for these tasks, and their performances +have been boosted by an increased availability of large-scale annotated EO +datasets. However, the influence of different visual characteristics of the +input EO data on a model's predictions is not well understood. In this work we +systematically examine model sensitivities with respect to several color- and +texture-based distortions on the input EO data during inference, given models +that have been trained without such distortions. We conduct experiments with +multiple state-of-the-art segmentation networks for land cover classification +and show that they are in general more sensitive to texture than to color +distortions. Beyond revealing intriguing characteristics of widely used land +cover classification models, our results can also be used to guide the +development of more robust models within the EO domain. + +
+
+
+
+
+ + ♻ ☆ Vision Transformers Need Registers + + +
+ Transformers have recently emerged as a powerful tool for learning visual +representations. In this paper, we identify and characterize artifacts in +feature maps of both supervised and self-supervised ViT networks. The artifacts +correspond to high-norm tokens appearing during inference primarily in +low-informative background areas of images, that are repurposed for internal +computations. We propose a simple yet effective solution based on providing +additional tokens to the input sequence of the Vision Transformer to fill that +role. We show that this solution fixes that problem entirely for both +supervised and self-supervised models, sets a new state of the art for +self-supervised visual models on dense visual prediction tasks, enables object +discovery methods with larger models, and most importantly leads to smoother +feature maps and attention maps for downstream visual processing. + +
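A minimal sketch of the mechanism described above: a few learnable register tokens are appended to the token sequence before the transformer blocks and discarded at the output, giving the model scratch space for internal computation. The module below wraps an arbitrary stack of ViT blocks and is an illustration, not the released code; the number of registers and the initialization are assumptions.

```python
import torch
import torch.nn as nn

class WithRegisters(nn.Module):
    def __init__(self, vit_blocks: nn.Module, dim: int, num_registers: int = 4):
        super().__init__()
        self.blocks = vit_blocks
        self.registers = nn.Parameter(torch.zeros(1, num_registers, dim))
        nn.init.trunc_normal_(self.registers, std=0.02)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (B, N, D) = [CLS] + patch tokens
        B, N, _ = tokens.shape
        reg = self.registers.expand(B, -1, -1)
        x = torch.cat([tokens, reg], dim=1)   # append register tokens as scratch space
        x = self.blocks(x)
        return x[:, :N]                       # discard the registers at the output
```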
+
+
+
+
+ + ♻ ☆ Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models + + +
+ Large-scale vision-and-language models, such as CLIP, are typically trained +on web-scale data, which can introduce inappropriate content and lead to the +development of unsafe and biased behavior. This, in turn, hampers their +applicability in sensitive and trustworthy contexts and could raise significant +concerns in their adoption. Our research introduces a novel approach to +enhancing the safety of vision-and-language models by diminishing their +sensitivity to NSFW (not safe for work) inputs. In particular, our methodology +seeks to sever "toxic" linguistic and visual concepts, unlearning the linkage +between unsafe linguistic or visual items and unsafe regions of the embedding +space. We show how this can be done by fine-tuning a CLIP model on synthetic +data obtained from a large language model trained to convert between safe and +unsafe sentences, and a text-to-image generator. We conduct extensive +experiments on the resulting embedding space for cross-modal retrieval, +text-to-image, and image-to-text generation, where we show that our model can +be remarkably employed with pre-trained generative models. Our source code and +trained models are available at: https://github.com/aimagelab/safe-clip. + +
+
+
+
+
+ + ♻ ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Which Transformer to Favor: A Comparative Analysis of Efficiency in + Vision Transformers + + +
+ Transformers come with a high computational cost, yet their effectiveness in +addressing problems in language and vision has sparked extensive research aimed +at enhancing their efficiency. However, diverse experimental conditions, +spanning multiple input domains, prevent a fair comparison based solely on +reported results, posing challenges for model selection. To address this gap in +comparability, we design a comprehensive benchmark of more than 30 models for +image classification, evaluating key efficiency aspects, including accuracy, +speed, and memory usage. This benchmark provides a standardized baseline across +the landscape of efficiency-oriented transformers and our framework of +analysis, based on Pareto optimality, reveals surprising insights. Despite +claims of other models being more efficient, ViT remains Pareto optimal across +multiple metrics. We observe that hybrid attention-CNN models exhibit +remarkable inference memory- and parameter-efficiency. Moreover, our benchmark +shows that using a larger model in general is more efficient than using higher +resolution images. Thanks to our holistic evaluation, we provide a centralized +resource for practitioners and researchers, facilitating informed decisions +when selecting transformers or measuring progress of the development of +efficient transformers. + +
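The Pareto-optimality analysis mentioned above boils down to a simple dominance check, sketched below: a model stays on the front unless some other model matches or beats it on every metric and strictly beats it on at least one. The metric names and numbers in the usage example are made up for illustration.

```python
import numpy as np

def pareto_front(accuracy, costs):
    """accuracy: (M,) higher is better; costs: (M, K) lower is better."""
    # Express everything as "higher is better" and compare models pairwise.
    scores = np.column_stack([accuracy, -np.asarray(costs, dtype=float)])
    dominated = np.zeros(len(scores), dtype=bool)
    for i in range(len(scores)):
        others = np.delete(scores, i, axis=0)
        dominated[i] = np.any(np.all(others >= scores[i], axis=1) &
                              np.any(others > scores[i], axis=1))
    return ~dominated

acc = np.array([81.2, 79.8, 80.5, 81.0])                          # top-1 accuracy (%)
cost = np.array([[4.6, 86], [1.3, 22], [4.1, 87], [17.6, 304]])   # GFLOPs, params (M)
print(pareto_front(acc, cost))   # boolean mask of Pareto-optimal models
```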
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ♻ ☆ ZONE: Zero-Shot Instruction-Guided Local Editing CVPR 2024 + + +
+ Recent advances in vision-language models like Stable Diffusion have shown +remarkable power in creative image synthesis and editing. However, most existing +text-to-image editing methods encounter two obstacles: First, the text prompt +needs to be carefully crafted to achieve good results, which is not intuitive +or user-friendly. Second, they are insensitive to local edits and can +irreversibly affect non-edited regions, leaving obvious editing traces. To +tackle these problems, we propose a Zero-shot instructiON-guided local image +Editing approach, termed ZONE. We first convert the editing intent from the +user-provided instruction (e.g., "make his tie blue") into specific image +editing regions through InstructPix2Pix. We then propose a Region-IoU scheme +for precise image layer extraction from an off-the-shelf segment model. We +further develop an edge smoother based on FFT for seamless blending between the +layer and the image. Our method allows for arbitrary manipulation of a specific +region with a single instruction while preserving the rest. Extensive +experiments demonstrate that our ZONE achieves remarkable local editing results +and user-friendliness, outperforming state-of-the-art methods. Code is +available at https://github.com/lsl001006/ZONE. + +&#13;
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Foundation Models for Content-Based Medical Image Retrieval + in Radiology + + +
+ Content-based image retrieval (CBIR) has the potential to significantly +improve diagnostic aid and medical research in radiology. Current CBIR systems +face limitations due to their specialization to certain pathologies, limiting +their utility. In response, we propose using vision foundation models as +powerful and versatile off-the-shelf feature extractors for content-based +medical image retrieval. By benchmarking these models on a comprehensive +dataset of 1.6 million 2D radiological images spanning four modalities and 161 +pathologies, we identify weakly-supervised models as superior, achieving a P@1 +of up to 0.594. This performance not only competes with a specialized model but +does so without the need for fine-tuning. Our analysis further explores the +challenges in retrieving pathological versus anatomical structures, indicating +that accurate retrieval of pathological features presents greater difficulty. +Despite these challenges, our research underscores the vast potential of +foundation models for CBIR in radiology, proposing a shift towards versatile, +general-purpose medical image retrieval systems that do not require specific +tuning. + +
+
+
+
+
+ + ♻ ☆ DUFOMap: Efficient Dynamic Awareness Mapping + + +
+ The dynamic nature of the real world is one of the main challenges in +robotics. The first step in dealing with it is to detect which parts of the +world are dynamic. A typical benchmark task is to create a map that contains +only the static part of the world to support, for example, localization and +planning. Current solutions are often applied in post-processing, where +parameter tuning allows the user to adjust the setting for a specific dataset. +In this paper, we propose DUFOMap, a novel dynamic awareness mapping framework +designed for efficient online processing. Despite having the same parameter +settings for all scenarios, it performs better or is on par with +state-of-the-art methods. Ray casting is utilized to identify and classify +fully observed empty regions. Since these regions have been observed empty, it +follows that anything inside them at another time must be dynamic. Evaluation +is carried out in various scenarios, including outdoor environments in KITTI +and Argoverse 2, open areas on the KTH campus, and with different sensor types. +DUFOMap outperforms the state of the art in terms of accuracy and computational +efficiency. The source code, benchmarks, and links to the datasets utilized are +provided. See https://kth-rpl.github.io/dufomap for more details. + +
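A coarse sketch of the core idea, that regions observed empty imply anything occupying them at another time is dynamic, is given below; it samples points along each sensor-to-endpoint ray instead of performing exact voxel traversal, and the voxel size and step are illustrative assumptions rather than the DUFOMap implementation.

```python
import numpy as np

VOXEL = 0.2  # voxel size in meters (illustrative)

def voxel_key(points):
    return set(map(tuple, np.floor(points / VOXEL).astype(int)))

def observed_empty_voxels(sensor_origin, endpoints, step=0.5 * VOXEL):
    """Mark voxels traversed by sensor-to-endpoint rays as observed empty."""
    empty = set()
    for p in endpoints:
        direction = p - sensor_origin
        dist = np.linalg.norm(direction)
        if dist < step:
            continue
        # Sample the ray up to just before the endpoint (the hit itself is occupied).
        ts = np.arange(step, dist - step, step)
        samples = sensor_origin + ts[:, None] * (direction / dist)
        empty |= voxel_key(samples)
    return empty

def dynamic_mask(points, empty_voxels):
    """Points falling inside observed-empty voxels are flagged as dynamic."""
    keys = np.floor(points / VOXEL).astype(int)
    return np.array([tuple(k) in empty_voxels for k in keys])
```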
+
+ comment: The first two authors hold equal contribution. 8 pages, 7 figures, + project page https://kth-rpl.github.io/dufomap +
+
+
+
+
+ + ♻ ☆ A Systematic Survey of Deep Learning-based Single-Image Super-Resolution + + +
+ Single-image super-resolution (SISR) is an important task in image +processing, which aims to enhance the resolution of imaging systems. Recently, +SISR has made a huge leap and has achieved promising results with the help of +deep learning (DL). In this survey, we give an overview of DL-based SISR +methods and group them according to their design targets. Specifically, we +first introduce the problem definition, research background, and the +significance of SISR. Secondly, we introduce some related works, including +benchmark datasets, upsampling methods, optimization objectives, and image +quality assessment methods. Thirdly, we provide a detailed investigation of +SISR and give some domain-specific applications of it. Fourthly, we present the +reconstruction results of some classic SISR methods to intuitively know their +performance. Finally, we discuss some issues that still exist in SISR and +summarize some new trends and future directions. This is an exhaustive survey +of SISR, which can help researchers better understand SISR and inspire more +exciting research in this field. An investigation project for SISR is provided +at https://github.com/CV-JunchengLi/SISR-Survey. + +
+
+ comment: 40 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks +in the fields of computer vision and image processing. In practical +applications, images are generally accompanied by various text descriptions, +however, few studies have explored the influence of text descriptions on visual +attention, let alone developed visual saliency prediction models considering +text guidance. In this paper, we conduct a comprehensive study on text-guided +image saliency (TIS) from both subjective and objective perspectives. +Specifically, we construct a TIS database named SJTU-TIS, which includes 1200 +text-image pairs and the corresponding collected eye-tracking data. Based on +the established SJTU-TIS database, we analyze the influence of various text +descriptions on visual attention. Then, to facilitate the development of +saliency prediction models considering text influence, we construct a benchmark +for the established SJTU-TIS database using state-of-the-art saliency models. +Finally, considering the effect of text descriptions on visual attention, while +most existing saliency models ignore this impact, we further propose a +text-guided saliency (TGSal) prediction model, which extracts and integrates +both image features and text features to predict the image saliency under +various text-description conditions. Our proposed model significantly +outperforms the state-of-the-art saliency models on both the SJTU-TIS database +and the pure image saliency databases in terms of various evaluation metrics. +The SJTU-TIS database and the code of the proposed TGSal model will be released +at: https://github.com/IntMeGroup/TGSal. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), still remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only face of human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., entire body parts of human but also with a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ Rapid post-disaster infrastructure damage characterisation enabled by + remote sensing and deep learning technologies -- a tiered approach + + +
+ Critical infrastructure, such as transport networks and bridges, is +systematically targeted during wars and suffers damage during extensive natural +disasters because it is vital for enabling connectivity and transportation of +people and goods, and hence, underpins national and international economic +growth. Mass destruction of transport assets, in conjunction with minimal or no +accessibility in the wake of natural and anthropogenic disasters, prevents us +from delivering rapid recovery and adaptation. As a result, systemic +operability is drastically reduced, leading to low levels of resilience. Thus, +there is a need for rapid assessment of its condition to allow for informed +decision-making for restoration prioritisation. A solution to this challenge is +to use technology that enables stand-off observations. Nevertheless, no methods +exist for automated characterisation of damage at multiple scales, i.e. +regional (e.g., network), asset (e.g., bridges), and structural (e.g., road +pavement) scales. We propose a methodology based on an integrated, multi-scale +tiered approach to fill this capability gap. In doing so, we demonstrate how +automated damage characterisation can be enabled by fit-for-purpose digital +technologies. Next, the methodology is applied to and validated on a case study in +Ukraine that includes 17 bridges, damaged by human-targeted interventions. From +regional to component scale, we deploy technology to integrate assessments +using Sentinel-1 SAR images, crowdsourced information, and high-resolution +images for deep learning to facilitate automatic damage detection and +characterisation. For the first time, the interferometric coherence difference +and semantic segmentation of images were deployed in a tiered multi-scale +approach to improve the reliability of damage characterisations at different +scales. + +&#13;
+
+ comment: 43 pages; 20 figures +
+
+
+
+
+ + ♻ ☆ Perceptual Assessment and Optimization of High Dynamic Range Image + Rendering + + +
+ High dynamic range (HDR) rendering has the ability to faithfully reproduce +the wide luminance ranges in natural scenes, but how to accurately assess the +rendering quality is relatively underexplored. Existing quality models are +mostly designed for low dynamic range (LDR) images, and do not align well with +human perception of HDR image quality. To fill this gap, we propose a family of +HDR quality metrics, in which the key step is employing a simple inverse +display model to decompose an HDR image into a stack of LDR images with varying +exposures. Subsequently, these decomposed images are assessed through +well-established LDR quality metrics. Our HDR quality models present three +distinct benefits. First, they directly inherit the recent advancements of LDR +quality metrics. Second, they do not rely on human perceptual data of HDR image +quality for re-calibration. Third, they facilitate the alignment and +prioritization of specific luminance ranges for more accurate and detailed +quality assessment. Experimental results show that our HDR quality metrics +consistently outperform existing models in terms of quality assessment on four +HDR image quality datasets and perceptual optimization of HDR novel view +synthesis. + +
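The recipe above can be illustrated with a short sketch: a simple inverse display model turns an HDR image into a stack of LDR exposures, each exposure is scored with an ordinary LDR metric (PSNR here as a stand-in for the LDR metrics the paper actually plugs in), and the scores are pooled. The exposure values and gamma are illustrative assumptions.

```python
import numpy as np

def to_ldr(hdr, exposure, gamma=2.2):
    """Linear HDR image (H, W, 3) -> display-referred LDR image in [0, 1]."""
    return np.clip((hdr * exposure) ** (1.0 / gamma), 0.0, 1.0)

def psnr(a, b, eps=1e-12):
    mse = np.mean((a - b) ** 2)
    return 10.0 * np.log10(1.0 / (mse + eps))

def hdr_quality(reference_hdr, test_hdr, exposures=(0.25, 1.0, 4.0)):
    # Score each decomposed exposure with an LDR metric, then pool over the stack.
    scores = [psnr(to_ldr(reference_hdr, e), to_ldr(test_hdr, e))
              for e in exposures]
    return float(np.mean(scores))
```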
+
+
+
+
+ + ♻ ☆ FairVision: Equitable Deep Learning for Eye Disease Screening via Fair + Identity Scaling + + +
+ Equity in AI for healthcare is crucial due to its direct impact on human +well-being. Despite advancements in 2D medical imaging fairness, the fairness +of 3D models remains underexplored, hindered by the small sizes of 3D fairness +datasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is +critical to understand the fairness of these 3D models. To address this +research gap, we conduct the first comprehensive study on the fairness of 3D +medical imaging models across multiple protected attributes. Our investigation +spans both 2D and 3D models and evaluates fairness across five architectures on +three common eye diseases, revealing significant biases across race, gender, +and ethnicity. To alleviate these biases, we propose a novel fair identity +scaling (FIS) method that improves both overall performance and fairness, +outperforming various SOTA fairness methods. Moreover, we release +Harvard-FairVision, the first large-scale medical fairness dataset with 30,000 +subjects featuring both 2D and 3D imaging data and six demographic identity +attributes. Harvard-FairVision provides labels for three major eye disorders +affecting about 380 million people worldwide, serving as a valuable resource +for both 2D and 3D fairness learning. Our code and dataset are publicly +accessible at +\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}. + +
+
+
+
+
+ + ♻ ☆ Deep Learning-Based MR Image Re-parameterization SC + + +
+ Magnetic resonance (MR) image re-parameterization refers to the process of +generating via simulations of an MR image with a new set of MRI scanning +parameters. Different parameter values generate distinct contrast between +different tissues, helping identify pathologic tissue. Typically, more than one +scan is required for diagnosis; however, acquiring repeated scans can be +costly, time-consuming, and difficult for patients. Thus, using MR image +re-parameterization to predict and estimate the contrast in these imaging scans +can be an effective alternative. In this work, we propose a novel deep learning +(DL) based convolutional model for MRI re-parameterization. Based on our +preliminary results, DL-based techniques hold the potential to learn the +non-linearities that govern the re-parameterization. + +
+
+ comment: A. Narang, A. Raj, M. Pop and M. Ebrahimi, "Deep Learning-Based MR + Image Re-parameterization," 2023 Congress in Computer Science, Computer + Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp. + 536-541, doi: 10.1109/CSCE60160.2023.00094 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks in Vision-Language Image Understanding: A Survey + + +
+ 2D image understanding is a complex problem within computer vision, but it +holds the key to providing human-level scene comprehension. It goes further +than identifying the objects in an image, and instead, it attempts to +understand the scene. Solutions to this problem form the underpinning of a +range of tasks, including image captioning, visual question answering (VQA), +and image retrieval. Graphs provide a natural way to represent the relational +arrangement between objects in an image, and thus, in recent years graph neural +networks (GNNs) have become a standard component of many 2D image understanding +pipelines, becoming a core architectural component, especially in the VQA group +of tasks. In this survey, we review this rapidly evolving field and we provide +a taxonomy of graph types used in 2D image understanding approaches, a +comprehensive list of the GNN models used in this domain, and a roadmap of +future potential developments. To the best of our knowledge, this is the first +comprehensive survey that covers image captioning, visual question answering, +and image retrieval techniques that focus on using GNNs as the main part of +their architecture. + +
+
+ comment: 20 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation + by Combining 3D GANs and Diffusion Priors CVPR2024 + + +
+ Text-guided domain adaptation and generation of 3D-aware portraits find many +applications in various fields. However, due to the lack of training data and +the challenges in handling the high variety of geometry and appearance, the +existing methods for these tasks suffer from issues like inflexibility, +instability, and low fidelity. In this paper, we propose a novel framework +DiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by +combining 3D GANs and diffusion priors. Specifically, we integrate the +pre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion +models. The former provides a strong foundation for stable and high-quality +avatar generation from text. And the diffusion models in turn offer powerful +priors and guide the 3D generator finetuning with informative direction to +achieve flexible and efficient text-guided domain adaptation. To enhance the +diversity in domain adaptation and the generation capability in text-to-avatar, +we introduce the relative distance loss and case-specific learnable triplane +respectively. Besides, we design a progressive texture refinement module to +improve the texture quality for both tasks above. Extensive experiments +demonstrate that the proposed framework achieves excellent results in both +domain adaptation and text-to-avatar tasks, outperforming existing methods in +terms of generation quality and efficiency. The project homepage is at +https://younglbw.github.io/DiffusionGAN3D-homepage/. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of +user-specified concepts and has achieved unprecedented progress in handling +individual concept. However, when extending to multiple customized concepts, +existing methods exhibit limitations in terms of flexibility and fidelity, only +accommodating the combination of limited types of models and potentially +resulting in a mix of characteristics from different concepts. In this paper, +we introduce the Multi-concept guidance for Multi-concept customization, termed +MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the +requirements for model architecture via inference time optimization, allowing +the integration of various heterogeneous single-concept customized models. It +adaptively refines the attention weights between visual and textual tokens, +directing image regions to focus on their associated words while diminishing +the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$ +even surpasses previous methods that require additional training in terms of +consistency with input prompt and reference images. Moreover, MC$^2$ can be +extended to elevate the compositional capabilities of text-to-image generation, +yielding appealing results. Code will be publicly available at +https://github.com/JIANGJiaXiu/MC-2. + +
+
+
+
+
+ + ♻ ☆ FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering + and Local to Global Optimization ICRA + + +
+ Cross-modality point cloud registration is confronted with significant +challenges due to inherent differences in modalities between different sensors. +We propose a cross-modality point cloud registration framework FF-LOGO: a +cross-modality point cloud registration method with feature filtering and +local-global optimization. The cross-modality feature correlation filtering +module extracts geometric transformation-invariant features from cross-modality +point clouds and achieves point selection by feature matching. We also +introduce a cross-modality optimization process, including a local adaptive key +region aggregation module and a global modality consistency fusion optimization +module. Experimental results demonstrate that our two-stage optimization +significantly improves the registration accuracy of the feature association and +selection module. Our method achieves a substantial increase in recall rate +compared to the current state-of-the-art methods on the 3DCSR dataset, +improving from 40.59% to 75.74%. Our code will be available at +https://github.com/wangmohan17/FFLOGO. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Robotics and + Automation (ICRA),7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, a general restoration pipeline that could handle +different blind image restoration tasks in a unified framework. DiffBIR +decouples blind image restoration problem into two stages: 1) degradation +removal: removing image-independent content; 2) information regeneration: +generating the lost image content. Each stage is developed independently but +they work seamlessly in a cascaded manner. In the first stage, we use +restoration modules to remove degradations and obtain high-fidelity restored +results. For the second stage, we propose IRControlNet that leverages the +generative ability of latent diffusion models to generate realistic details. +Specifically, IRControlNet is trained based on specially produced condition +images without distracting noisy content for stable generation performance. +Moreover, we design a region-adaptive restoration guidance that can modify the +denoising process during inference without model re-training, allowing users to +balance realness and fidelity through a tunable guidance scale. Extensive +experiments have demonstrated DiffBIR's superiority over state-of-the-art +approaches for blind image super-resolution, blind face restoration and blind +image denoising tasks on both synthetic and real-world datasets. The code is +available at https://github.com/XPixelGroup/DiffBIR. + +
+
+
+
+
+ + ♻ ☆ RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene + Parsing + + +
+ The recent advancements in deep convolutional neural networks have shown +significant promise in the domain of road scene parsing. Nevertheless, the +existing works focus primarily on freespace detection, with little attention +given to hazardous road defects that could compromise both driving safety and +comfort. In this paper, we introduce RoadFormer, a novel Transformer-based +data-fusion network developed for road scene parsing. RoadFormer utilizes a +duplex encoder architecture to extract heterogeneous features from both RGB +images and surface normal information. The encoded features are subsequently +fed into a novel heterogeneous feature synergy block for effective feature +fusion and recalibration. The pixel decoder then learns multi-scale long-range +dependencies from the fused and recalibrated heterogeneous features, which are +subsequently processed by a Transformer decoder to produce the final semantic +prediction. Additionally, we release SYN-UDTIRI, the first large-scale road +scene parsing dataset that contains over 10,407 RGB images, dense depth images, +and the corresponding pixel-level annotations for both freespace and road +defects of different shapes and sizes. Extensive experimental evaluations +conducted on our SYN-UDTIRI dataset, as well as on three public datasets, +including KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer +outperforms all other state-of-the-art networks for road scene parsing. +Specifically, RoadFormer ranks first on the KITTI road benchmark. Our source +code, created dataset, and demo video are publicly available at +mias.group/RoadFormer. + +
+
+ comment: 9 pages 7 figures. Accepted by Transactions on Intelligent Vehicles +
+
+
+
+
+ + ♻ ☆ Accelerating ViT Inference on FPGA through Static and Dynamic Pruning + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various +computer vision tasks. However, their high computational complexity prevents +them from being applied to many real-world applications. Weight and token +pruning are two well-known methods for reducing complexity: weight pruning +reduces the model size and associated computational demands, while token +pruning further dynamically reduces the computation based on the input. +Combining these two techniques should significantly reduce computation +complexity and model size; however, naively integrating them results in +irregular computation patterns, leading to significant accuracy drops and +difficulties in hardware acceleration. + Addressing the above challenges, we propose a comprehensive +algorithm-hardware codesign for accelerating ViT on FPGA through simultaneous +pruning, combining static weight pruning and dynamic token pruning. For +algorithm design, we systematically combine a hardware-aware structured +block-pruning method for pruning model parameters and a dynamic token pruning +method for removing unimportant token vectors. Moreover, we design a novel +training algorithm to recover the model's accuracy. For hardware design, we +develop a novel hardware accelerator for executing the pruned model. The +proposed hardware design employs multi-level parallelism with a load-balancing +strategy to efficiently deal with the irregular computation pattern introduced by the +two pruning approaches. Moreover, we develop an efficient hardware mechanism +for executing the on-the-fly token pruning. + +&#13;
+
+ comment: FCCM 2024 +
+
+
+
+
+ + ♻ ☆ Conv-Adapter: Exploring Parameter Efficient Transfer Learning for + ConvNets + + +
+ While parameter efficient tuning (PET) methods have shown great potential +with transformer architectures on Natural Language Processing (NLP) tasks, their +effectiveness with large-scale ConvNets is still under-studied on Computer +Vision (CV) tasks. This paper proposes Conv-Adapter, a PET module designed for +ConvNets. Conv-Adapter is light-weight, domain-transferable, and +architecture-agnostic with generalized performance on different tasks. When +transferring to downstream tasks, Conv-Adapter learns task-specific feature +modulation of the intermediate representations of backbones while keeping the +pre-trained parameters frozen. It introduces only a tiny amount of learnable +parameters, e.g., only 3.5% of the full fine-tuning parameters of ResNet50, and can +also be applied to transformer-based backbones. Conv-Adapter outperforms +previous PET baseline methods and achieves performance comparable to or surpassing +full fine-tuning on 23 classification tasks across various domains. +It also presents superior performance on few-shot classification with an +average margin of 3.39%. Beyond classification, Conv-Adapter can generalize to +detection and segmentation tasks with a more than 50% reduction in parameters but +comparable performance to traditional full fine-tuning. + +&#13;
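As an illustration of the kind of module described above (not the released Conv-Adapter code), the sketch below adds a small depthwise-separable bottleneck residually on top of a frozen ConvNet block; only the adapter parameters are trained. The reduction factor, activation, and zero-initialized up-projection are assumptions.

```python
import torch
import torch.nn as nn

class ConvAdapter(nn.Module):
    def __init__(self, channels: int, reduction: int = 4):
        super().__init__()
        hidden = max(1, channels // reduction)
        self.down = nn.Conv2d(channels, hidden, kernel_size=1)
        self.dw = nn.Conv2d(hidden, hidden, kernel_size=3, padding=1, groups=hidden)
        self.up = nn.Conv2d(hidden, channels, kernel_size=1)
        self.act = nn.GELU()
        nn.init.zeros_(self.up.weight)   # start as an identity mapping
        nn.init.zeros_(self.up.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual task-specific modulation of the frozen block's features.
        return x + self.up(self.act(self.dw(self.down(x))))

# Usage sketch: freeze the backbone, train only the adapter parameters, e.g.
#   for p in backbone.parameters():
#       p.requires_grad_(False)
#   adapted = nn.Sequential(backbone.layer1, ConvAdapter(256),
#                           backbone.layer2, ConvAdapter(512))
```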
+
+
+
+
+ + ♻ ☆ Robust Representation Learning with Self-Distillation for Domain + Generalization + + +
+ Despite the recent success of deep neural networks, there remains a need for +effective methods to enhance domain generalization using vision transformers. +In this paper, we propose a novel domain generalization technique called Robust +Representation Learning with Self-Distillation (RRLD) comprising i) +intermediate-block self-distillation and ii) augmentation-guided +self-distillation to improve the generalization capabilities of +transformer-based models on unseen domains. This approach enables the network +to learn robust and general features that are invariant to different +augmentations and domain shifts while effectively mitigating overfitting to +source domains. To evaluate the effectiveness of our proposed method, we +perform extensive experiments on PACS and OfficeHome benchmark datasets, as +well as an industrial wafer semiconductor defect dataset. The results +demonstrate that RRLD achieves robust and accurate generalization performance. +We observe an average accuracy improvement in the range of 1.2% to 2.3% over +the state-of-the-art on the three datasets. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) in point clouds follow the two-stream +multi-stage 3D Siamese or motion tracking paradigms, which process the template +and search area point clouds with two parallel branches, built on supervised +point cloud backbones. In this work, beyond typical 3D Siamese or motion +tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm +from a novel perspective, termed \textbf{EasyTrack}, which consists of +three special designs: 1) A 3D point cloud tracking feature pre-training +module is developed to exploit masked autoencoding for learning 3D point +cloud tracking representations. 2) A unified 3D tracking feature learning and +fusion network is proposed to simultaneously learn target-aware 3D features +and extensively capture mutual correlation through the flexible self-attention +mechanism. 3) A target location network in the dense bird's eye view (BEV) +feature space is constructed for target classification and regression. +Moreover, we develop an enhanced version named EasyTrack++, which designs a +center points interaction (CPI) strategy to reduce target ambiguity caused +by noisy point cloud background information. The proposed EasyTrack and +EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$, +$\textbf{40\%}$ and $\textbf{3\%}$ success gains) on KITTI, NuScenes, and Waymo +while running at \textbf{52.6 fps} with few parameters (\textbf{1.3M}). The code +will be available at https://github.com/KnightApple427/Easytrack. + +&#13;
+
+
+
+
+ + ♻ ☆ Universal Humanoid Motion Representations for Physics-Based Control ICLR 2024 + + +
+ We present a universal motion representation that encompasses a comprehensive +range of motor skills for physics-based humanoid control. Due to the high +dimensionality of humanoids and the inherent difficulties in reinforcement +learning, prior methods have focused on learning skill embeddings for a narrow +range of movement styles (e.g. locomotion, game characters) from specialized +motion datasets. This limited scope hampers their applicability in complex +tasks. We close this gap by significantly increasing the coverage of our motion +representation space. To achieve this, we first learn a motion imitator that +can imitate all of human motion from a large, unstructured motion dataset. We +then create our motion representation by distilling skills directly from the +imitator. This is achieved by using an encoder-decoder structure with a +variational information bottleneck. Additionally, we jointly learn a prior +conditioned on proprioception (humanoid's own pose and velocities) to improve +model expressiveness and sampling efficiency for downstream tasks. By sampling +from the prior, we can generate long, stable, and diverse human motions. Using +this latent space for hierarchical RL, we show that our policies solve tasks +using human-like behavior. We demonstrate the effectiveness of our motion +representation by solving generative tasks (e.g. strike, terrain traversal) and +motion tracking using VR controllers. + +
+
+ comment: ICLR 2024 Spotlight. Project page: + https://zhengyiluo.github.io/PULSE/ +
+
+
+
+
+ + ♻ ☆ Eye-gaze Guided Multi-modal Alignment Framework for Radiology + + +
+ In multi-modal frameworks, the alignment of cross-modal features presents a +significant challenge. The predominant approach in multi-modal pre-training +emphasizes either global or local alignment between modalities, utilizing +extensive datasets. This bottom-up driven method often suffers from a lack of +interpretability, a critical concern in radiology. Previous studies have +integrated high-level labels in medical images or text, but these still rely on +manual annotation, a costly and labor-intensive process. Our work introduces a +novel approach by using eye-gaze data, collected synchronously by radiologists +during diagnostic evaluations. This data, indicating radiologists' focus areas, +naturally links chest X-rays to diagnostic texts. We propose the Eye-gaze +Guided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for +better alignment of image and text features, aiming to reduce reliance on +manual annotations and thus cut training costs. Our model demonstrates robust +performance, outperforming other state-of-the-art methods in zero-shot +classification and retrieval tasks. The incorporation of easily-obtained +eye-gaze data during routine radiological diagnoses signifies a step towards +minimizing manual annotation dependency. Additionally, we explore the impact of +varying amounts of eye-gaze data on model performance, highlighting the +feasibility and utility of integrating this auxiliary data into multi-modal +pre-training. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Technique for Classifying Static Gestures Using UWB Radar + + +
+ Our paper presents a robust framework for UWB-based static gesture +recognition, leveraging proprietary UWB radar sensor technology. Extensive data +collection efforts were undertaken to compile datasets containing five commonly +used gestures. Our approach involves a comprehensive data pre-processing +pipeline that encompasses outlier handling, aspect ratio-preserving resizing, +and false-color image transformation. Both CNN and MobileNet models were +trained on the processed images. Remarkably, our best-performing model achieved +an accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework +to assess the model's system resource usage and processing times, which +revealed low memory utilization and real-time task completion in under one +second. This research marks a significant step towards enhancing static gesture +recognition using UWB technology, promising practical applications in various +domains. + +
+
+ comment: This is not a technical research paper, but an excerpt of what was + applied during a funded project for the promotion of Open Science +
+
+
+
+
+ + ♻ ☆ ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset ICASSP 2024 + + +
+ Change Detection (CD) has been attracting extensive interest with the
+availability of bi-temporal datasets. However, due to the huge cost of
+acquiring and labeling multi-temporal images, existing change detection
+datasets are small in quantity, cover short temporal spans, and have limited
+practicability. Therefore, a large-scale, practice-oriented dataset covering
+a wide range of temporal phases is urgently needed to facilitate the
+community. To this end, the ChangeNet dataset is presented especially for
+multi-temporal change detection, along with the new task of "Asymmetric
+Change Detection". Specifically, ChangeNet consists of 31,000 multi-temporal
+image pairs, a wide range of complex scenes from 100 cities, and 6
+pixel-level annotated categories, which is far superior to all existing
+change detection datasets, including LEVIR-CD, WHU Building CD, etc. In
+addition, ChangeNet contains a large amount of real-world perspective
+distortion across different temporal phases of the same areas, which promotes
+the practical application of change detection algorithms. The ChangeNet
+dataset is suitable for both binary change detection (BCD) and semantic
+change detection (SCD) tasks. Accordingly, we benchmark the ChangeNet dataset
+on six BCD methods and two SCD methods, and extensive experiments demonstrate
+its challenges and great significance. The dataset is available at
+https://github.com/jankyee/ChangeNet.
+
+
+ comment: Accepted to ICASSP 2024 Oral/Lecture +
+
+
+
+
+ + ♻ ☆ Comment-aided Video-Language Alignment via Contrastive Pre-training for + Short-form Video Humor Detection ICMR 2024 + + +
+ The growing importance of multi-modal humor detection within affective
+computing correlates with the expanding influence of short-form video sharing
+on social media platforms. In this paper, we propose a novel two-branch
+hierarchical model for short-form video humor detection (SVHD), named
+Comment-aided Video-Language Alignment (CVLA), via data-augmented multi-modal
+contrastive pre-training. Notably, our CVLA not only operates on raw signals
+across various modal channels but also yields an appropriate multi-modal
+representation by aligning the video and language components within a
+consistent semantic space. The experimental results on two humor detection
+datasets, DY11k and UR-FUNNY, demonstrate that CVLA dramatically outperforms
+state-of-the-art and several competitive baseline approaches. Our dataset,
+code, and model are released at https://github.com/yliu-cs/CVLA.
+
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ♻ ☆ CosalPure: Learning Concept from Group Images for Robust Co-Saliency + Detection CVPR 2024 + + +
+ Co-salient object detection (CoSOD) aims to identify the common and salient +(usually in the foreground) regions across a given group of images. Although +achieving significant progress, state-of-the-art CoSODs could be easily +affected by some adversarial perturbations, leading to substantial accuracy +reduction. The adversarial perturbations can mislead CoSODs but do not change +the high-level semantic information (e.g., concept) of the co-salient objects. +In this paper, we propose a novel robustness enhancement framework by first +learning the concept of the co-salient objects based on the input group images +and then leveraging this concept to purify adversarial perturbations, which are +subsequently fed to CoSODs for robustness enhancement. Specifically, we propose +CosalPure containing two modules, i.e., group-image concept learning and +concept-guided diffusion purification. For the first module, we adopt a +pre-trained text-to-image diffusion model to learn the concept of co-salient +objects within group images where the learned concept is robust to adversarial +examples. For the second module, we map the adversarial image to the latent +space and then perform diffusion generation by embedding the learned concept +into the noise prediction function as an extra condition. Our method can +effectively alleviate the influence of the SOTA adversarial attack containing +different adversarial patterns, including exposure and noise. The extensive +results demonstrate that our method could enhance the robustness of CoSODs +significantly. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion CVPR 2022 + + +
+ Raw depth images captured in indoor scenarios frequently exhibit extensive +missing values due to the inherent limitations of the sensors and environments. +For example, transparent materials frequently elude detection by depth sensors; +surfaces may introduce measurement inaccuracies due to their polished textures, +extended distances, and oblique incidence angles from the sensor. The presence +of incomplete depth maps imposes significant challenges for subsequent vision +applications, prompting the development of numerous depth completion techniques +to mitigate this problem. Numerous methods excel at reconstructing dense depth +maps from sparse samples, but they often falter when faced with extensive +contiguous regions of missing depth values, a prevalent and critical challenge +in indoor environments. To overcome these challenges, we design a novel +two-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB +and incomplete depth images as input to predict a dense and completed depth +map. The first branch employs an encoder-decoder structure, by adhering to the +Manhattan world assumption and utilizing normal maps from RGB-D information as +guidance, to regress the local dense depth values from the raw depth map. The +other branch applies an RGB-depth fusion CycleGAN, adept at translating RGB +imagery into detailed, textured depth maps while ensuring high fidelity through +cycle consistency. We fuse the two branches via adaptive fusion modules named +W-AdaIN and train the model with the help of pseudo depth maps. Comprehensive +evaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method +significantly enhances depth completion performance particularly in realistic +indoor settings. + +
+
+ comment: Haowen Wang and Zhengping Che are with equal contributions. Paper + accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence + (TPAMI). An earlier version has been accepted by CVPR 2022 + (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856 +
+
+
+
+
+ + ♻ ☆ HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic + Generalization Performance of Human-Object Interaction Detection Models + + +
+ Human-Object Interaction (HOI) detection is a task to localize humans and +objects in an image and predict the interactions in human-object pairs. In +real-world scenarios, HOI detection models need systematic generalization, +i.e., generalization to novel combinations of objects and interactions, because +the train data are expected to cover a limited portion of all possible +combinations. To evaluate the systematic generalization performance of HOI +detection models, we created two new sets of HOI detection data splits named +HICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets, +respectively. When evaluated on the new data splits, HOI detection models with +various characteristics performed much more poorly than when evaluated on the +original splits. This shows that systematic generalization is a challenging +goal in HOI detection. By analyzing the evaluation results, we also gain +insights for improving the systematic generalization performance and identify +four possible future research directions. We hope that our new data splits and +presented analysis will encourage further research on systematic generalization +in HOI detection. + +
+
+ comment: 19 pages, 3 figures, 4 tables +
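+
+ A minimal sketch of how a systematic-generalization split can be built: a
+subset of (object, interaction) combinations is held out entirely for
+testing, so evaluated combinations never appear during training. The field
+names and hold-out ratio below are illustrative assumptions, not the authors'
+actual data format.
+```python
+import random
+
+def make_sg_split(annotations, test_combo_ratio=0.2, seed=0):
+    """annotations: list of dicts with 'object' and 'interaction' keys."""
+    combos = sorted({(a["object"], a["interaction"]) for a in annotations})
+    random.Random(seed).shuffle(combos)
+    test_combos = set(combos[:int(len(combos) * test_combo_ratio)])
+    # test instances use only combinations never seen in training
+    train = [a for a in annotations
+             if (a["object"], a["interaction"]) not in test_combos]
+    test = [a for a in annotations
+            if (a["object"], a["interaction"]) in test_combos]
+    return train, test
+```
+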
+
+
+
+
+ + ♻ ☆ Enhanced Muscle and Fat Segmentation for CT-Based Body Composition + Analysis: A Comparative Study + + +
+ Purpose: Body composition measurements from routine abdominal CT can yield +personalized risk assessments for asymptomatic and diseased patients. In +particular, attenuation and volume measures of muscle and fat are associated +with important clinical outcomes, such as cardiovascular events, fractures, and +death. This study evaluates the reliability of an Internal tool for the +segmentation of muscle and fat (subcutaneous and visceral) as compared to the +well-established public TotalSegmentator tool. + Methods: We assessed the tools across 900 CT series from the publicly +available SAROS dataset, focusing on muscle, subcutaneous fat, and visceral +fat. The Dice score was employed to assess accuracy in subcutaneous fat and +muscle segmentation. Due to the lack of ground truth segmentations for visceral +fat, Cohen's Kappa was utilized to assess segmentation agreement between the +tools. + Results: Our Internal tool achieved a 3% higher Dice (83.8 vs. 80.8) for +subcutaneous fat and a 5% improvement (87.6 vs. 83.2) for muscle segmentation +respectively. A Wilcoxon signed-rank test revealed that our results were +statistically different with p<0.01. For visceral fat, the Cohen's kappa score +of 0.856 indicated near-perfect agreement between the two tools. Our internal +tool also showed very strong correlations for muscle volume (R^2=0.99), muscle +attenuation (R^2=0.93), and subcutaneous fat volume (R^2=0.99) with a moderate +correlation for subcutaneous fat attenuation (R^2=0.45). + Conclusion: Our findings indicated that our Internal tool outperformed +TotalSegmentator in measuring subcutaneous fat and muscle. The high Cohen's +Kappa score for visceral fat suggests a reliable level of agreement between the +two tools. These results demonstrate the potential of our tool in advancing the +accuracy of body composition analysis. + +
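+
+ For reference, the two agreement measures reported above can be computed as
+follows. This is a generic sketch of the standard formulas for binary masks,
+not the study's evaluation code.
+```python
+import numpy as np
+
+def dice_score(pred, gt):
+    """Overlap between a predicted and a ground-truth binary mask."""
+    pred, gt = pred.astype(bool), gt.astype(bool)
+    inter = np.logical_and(pred, gt).sum()
+    return 2.0 * inter / (pred.sum() + gt.sum() + 1e-8)
+
+def cohens_kappa(mask_a, mask_b):
+    """Chance-corrected agreement between two binary masks (no ground truth)."""
+    a, b = mask_a.astype(bool).ravel(), mask_b.astype(bool).ravel()
+    po = (a == b).mean()                                         # observed agreement
+    pe = a.mean() * b.mean() + (1 - a.mean()) * (1 - b.mean())   # chance agreement
+    return (po - pe) / (1 - pe + 1e-8)
+```
+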
+
+
+
+
+ + ♻ ☆ General surgery vision transformer: A video pre-trained foundation model + for general surgery + + +
+ The absence of openly accessible data and specialized foundation models is a +major barrier for computational research in surgery. Toward this, (i) we +open-source the largest dataset of general surgery videos to-date, consisting +of 680 hours of surgical videos, including data from robotic and laparoscopic +techniques across 28 procedures; (ii) we propose a technique for video +pre-training a general surgery vision transformer (GSViT) on surgical videos +based on forward video prediction that can run in real-time for surgical +applications, toward which we open-source the code and weights of GSViT; (iii) +we also release code and weights for procedure-specific fine-tuned versions of +GSViT across 10 procedures; (iv) we demonstrate the performance of GSViT on the +Cholec80 phase annotation task, displaying improved performance over +state-of-the-art single frame predictors. + +
+
+
+
+
+ + ♻ ☆ SatCLIP: Global, General-Purpose Location Embeddings with Satellite + Imagery + + +
+ Geographic information is essential for modeling tasks in fields ranging from +ecology to epidemiology. However, extracting relevant location characteristics +for a given task can be challenging, often requiring expensive data fusion or +distillation from massive global imagery datasets. To address this challenge, +we introduce Satellite Contrastive Location-Image Pretraining (SatCLIP). This +global, general-purpose geographic location encoder learns an implicit +representation of locations by matching CNN and ViT inferred visual patterns of +openly available satellite imagery with their geographic coordinates. The +resulting SatCLIP location encoder efficiently summarizes the characteristics +of any given location for convenient use in downstream tasks. In our +experiments, we use SatCLIP embeddings to improve prediction performance on +nine diverse location-dependent tasks including temperature prediction, animal +recognition, and population density estimation. Across tasks, SatCLIP +consistently outperforms alternative location encoders and improves geographic +generalization by encoding visual similarities of spatially distant +environments. These results demonstrate the potential of vision-location models +to learn meaningful representations of our planet from the vast, varied, and +largely untapped modalities of geospatial data. + +
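+
+ A toy sketch of the underlying idea: a coordinate encoder is trained
+CLIP-style against image embeddings so that matching (location, image) pairs
+score highest. The small MLP and symmetric loss below are placeholders for
+illustration, not the SatCLIP architecture; both embedding sets are assumed
+to be L2-normalized.
+```python
+import torch, torch.nn as nn, torch.nn.functional as F
+
+class CoordEncoder(nn.Module):
+    def __init__(self, dim=256):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(2, 128), nn.ReLU(), nn.Linear(128, dim))
+    def forward(self, lonlat):                       # (B, 2) coordinates
+        return F.normalize(self.net(lonlat), dim=-1)
+
+def contrastive_loss(loc_emb, img_emb, temperature=0.07):
+    """Symmetric InfoNCE between paired location and image embeddings."""
+    logits = loc_emb @ img_emb.t() / temperature     # (B, B) similarity matrix
+    targets = torch.arange(len(logits), device=logits.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+```
+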
+
+
+
+
+ + ♻ ☆ PrivImage: Differentially Private Synthetic Image Generation using + Diffusion Models with Semantic-Aware Pretraining USENIX Security 2024 + + +
+ Differential Privacy (DP) image data synthesis leverages the DP technique to
+generate synthetic data that replace sensitive data, allowing organizations
+to share and utilize synthetic images without privacy concerns. Previous
+methods incorporate advanced generative-model techniques and pre-training on
+a public dataset to produce exceptional DP image data, but suffer from
+unstable training and massive computational resource demands. This paper
+proposes a novel DP image synthesis method, termed PRIVIMAGE, which
+meticulously selects pre-training data, promoting the efficient creation of
+DP datasets with high fidelity and utility. PRIVIMAGE first establishes a
+semantic query function using a public dataset. Then, this function assists
+in querying the semantic distribution of the sensitive dataset, facilitating
+the selection of data from the public dataset with analogous semantics for
+pre-training. Finally, we pre-train an image generative model using the
+selected data and then fine-tune this model on the sensitive dataset using
+Differentially Private Stochastic Gradient Descent (DP-SGD). PRIVIMAGE allows
+us to train a lightly parameterized generative model, reducing the noise in
+the gradient during DP-SGD training and enhancing training stability.
+Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the public
+dataset for pre-training and 7.6% of the parameters of the generative model
+compared to the state-of-the-art method, while achieving superior synthetic
+performance and conserving more computational resources. On average,
+PRIVIMAGE achieves 30.1% lower FID and 12.6% higher classification accuracy
+than the state-of-the-art method. The replication package and datasets can be
+accessed online.
+
+
+ comment: Accepted at USENIX Security 2024. The first two authors contributed + equally +
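+
+ A hypothetical sketch of the data-selection step: score each public image
+with a semantic query function (here, any classifier producing class
+probabilities), compare the scores against a noisy class histogram of the
+sensitive data, and keep only the most semantically relevant fraction for
+pre-training. The names and the simple dot-product scoring rule are
+illustrative assumptions.
+```python
+import numpy as np
+
+def select_pretraining_data(public_probs, sensitive_class_hist, budget_ratio=0.01):
+    """public_probs: (N, C) class probabilities for public images.
+    sensitive_class_hist: (C,) noisy class-frequency estimate of sensitive data."""
+    relevance = public_probs @ sensitive_class_hist   # (N,) semantic relevance scores
+    k = max(1, int(len(relevance) * budget_ratio))
+    return np.argsort(-relevance)[:k]                 # indices of samples to keep
+```
+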
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy,
+impeding the real-life application of robust classification models.
+Training-based solutions for better trade-offs are limited by
+incompatibilities with already-trained high-performance large models,
+necessitating the exploration of training-free ensemble approaches. Observing
+that robust models are more confident in correct predictions than in
+incorrect ones on clean and adversarial data alike, we speculate that
+amplifying this "benign confidence property" can reconcile accuracy and
+robustness in an ensemble setting. To achieve this, we propose "MixedNUTS", a
+training-free method where the output logits of a robust classifier and a
+standard non-robust classifier are processed by nonlinear transformations
+with only three parameters, which are optimized through an efficient
+algorithm. MixedNUTS then converts the transformed logits into probabilities
+and mixes them as the overall output. On the CIFAR-10, CIFAR-100, and
+ImageNet datasets, experimental results with custom strong adaptive attacks
+demonstrate MixedNUTS's vastly improved accuracy and near-SOTA robustness --
+it boosts CIFAR-100 clean accuracy by 7.86 points, sacrificing merely 0.87
+points in robust accuracy.
+
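+
+ An illustrative-only sketch of the inference-time mixing: the robust
+classifier's logits pass through a small nonlinear transform before the two
+classifiers' probabilities are averaged. The clamp/power/scale form and its
+three parameters below are stand-ins for the paper's optimized
+transformation, not the actual MixedNUTS procedure.
+```python
+import torch, torch.nn.functional as F
+
+def mixed_predict(std_logits, rob_logits, s, p, c, alpha=0.5):
+    # nonlinear transform of the robust logits; s, p, c play the role of the
+    # three tunable parameters
+    transformed = s * torch.clamp(rob_logits + c, min=0.0) ** p
+    probs_std = F.softmax(std_logits, dim=-1)
+    probs_rob = F.softmax(transformed, dim=-1)
+    return (1 - alpha) * probs_std + alpha * probs_rob   # mixed output probabilities
+```
+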
+
+
+
+
+ + ♻ ☆ Exploring the Frontier of Vision-Language Models: A Survey of Current + Methodologies and Future Directions + + +
+ The advent of Large Language Models (LLMs) has significantly reshaped the
+trajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable
+limitation, as they are primarily adept at processing textual information. To
+address this constraint, researchers have endeavored to integrate visual
+capabilities with LLMs, resulting in the emergence of Vision-Language Models
+(VLMs). These advanced models are instrumental in tackling more intricate
+tasks such as image captioning and visual question answering. In our
+comprehensive survey paper, we delve into the key advancements within the
+realm of VLMs. Our classification organizes VLMs into three distinct
+categories: models dedicated to vision-language understanding, models that
+process multimodal inputs to generate unimodal (textual) outputs, and models
+that both accept and produce multimodal inputs and outputs. This
+classification is based on their respective capabilities and functionalities
+in processing and generating various modalities of data. We meticulously
+dissect each model, offering an extensive analysis of its foundational
+architecture, training data sources, as well as its strengths and limitations
+wherever possible, providing readers with a comprehensive understanding of
+its essential components. We also analyze the performance of VLMs on various
+benchmark datasets. By doing so, we aim to offer a nuanced understanding of
+the diverse landscape of VLMs. Additionally, we underscore potential avenues
+for future research in this dynamic domain, anticipating further
+breakthroughs and advancements.
+
+
+ comment: The most extensive and up to date Survey on Visual Language Models + covering 76 Visual Language Models +
+
+
+
+
+ + ♻ ☆ Paved2Paradise: Cost-Effective and Scalable LiDAR Simulation by + Factoring the Real World CVPR + 2024 + + +
+ To achieve strong real world performance, neural networks must be trained on +large, diverse datasets; however, obtaining and annotating such datasets is +costly and time-consuming, particularly for 3D point clouds. In this paper, we +describe Paved2Paradise, a simple, cost-effective approach for generating fully +labeled, diverse, and realistic lidar datasets from scratch, all while +requiring minimal human annotation. Our key insight is that, by deliberately +collecting separate "background" and "object" datasets (i.e., "factoring the +real world"), we can intelligently combine them to produce a combinatorially +large and diverse training set. The Paved2Paradise pipeline thus consists of +four steps: (1) collecting copious background data, (2) recording individuals +from the desired object class(es) performing different behaviors in an isolated +environment (like a parking lot), (3) bootstrapping labels for the object +dataset, and (4) generating samples by placing objects at arbitrary locations +in backgrounds. To demonstrate the utility of Paved2Paradise, we generated +synthetic datasets for two tasks: (1) human detection in orchards (a task for +which no public data exists) and (2) pedestrian detection in urban +environments. Qualitatively, we find that a model trained exclusively on +Paved2Paradise synthetic data is highly effective at detecting humans in +orchards, including when individuals are heavily occluded by tree branches. +Quantitatively, a model trained on Paved2Paradise data that sources backgrounds +from KITTI performs comparably to a model trained on the actual dataset. These +results suggest the Paved2Paradise synthetic data pipeline can help accelerate +point cloud model development in sectors where acquiring lidar datasets has +previously been cost-prohibitive. + +
+
+ comment: Accepted to the Synthetic Data for Computer Vision workshop at CVPR + 2024 +
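+
+ A simplified sketch of step (4) of the pipeline: drop an object point cloud
+into a background scan at an arbitrary pose to obtain a labeled training
+sample. Real pipelines additionally model occlusion and sensor geometry; the
+helper below is an illustrative assumption, not the authors' implementation.
+```python
+import numpy as np
+
+def compose_sample(background_pts, object_pts, xy_offset, yaw):
+    """background_pts, object_pts: (N, 3) and (M, 3) xyz points."""
+    c, s = np.cos(yaw), np.sin(yaw)
+    R = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
+    placed = object_pts @ R.T          # rotate the object about the z-axis
+    placed[:, :2] += xy_offset         # translate it in the ground plane
+    points = np.concatenate([background_pts, placed], axis=0)
+    labels = np.concatenate([np.zeros(len(background_pts)), np.ones(len(placed))])
+    return points, labels              # per-point object/background labels
+```
+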
+
+
+
+
+ + ♻ ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated +state-of-the-art generative performance and have been used as strong +pixel-level representation learners. This paper decomposes the interrelation +between the generative capability and representation learning ability inherent +in diffusion models. We present the masked diffusion model (MDM), a scalable +self-supervised representation learner for semantic segmentation, substituting +the conventional additive Gaussian noise of traditional diffusion with a +masking mechanism. Our proposed approach convincingly surpasses prior +benchmarks, demonstrating remarkable advancements in both medical and natural +image semantic segmentation tasks, particularly in few-shot scenarios. + +
+
+
+
+
+ + ♻ ☆ ScribblePrompt: Fast and Flexible Interactive Segmentation for Any + Biomedical Image + + +
+ Biomedical image segmentation is a crucial part of both scientific research +and clinical care. With enough labelled data, deep learning models can be +trained to accurately automate specific biomedical image segmentation tasks. +However, manually segmenting images to create training data is highly labor +intensive and requires domain expertise. We present ScribblePrompt, a flexible +neural network based interactive segmentation tool for biomedical imaging that +enables human annotators to segment previously unseen structures using +scribbles, clicks, and bounding boxes. Through rigorous quantitative +experiments, we demonstrate that given comparable amounts of interaction, +ScribblePrompt produces more accurate segmentations than previous methods on +datasets unseen during training. In a user study with domain experts, +ScribblePrompt reduced annotation time by 28% while improving Dice by 15% +compared to the next best method. ScribblePrompt's success rests on a set of +careful design decisions. These include a training strategy that incorporates +both a highly diverse set of images and tasks, novel algorithms for simulated +user interactions and labels, and a network that enables fast inference. We +showcase ScribblePrompt in an online demo and provide code at +https://scribbleprompt.csail.mit.edu + +
+
+ comment: Project Website: https://scribbleprompt.csail.mit.edu Keywords: + Interactive Segmentation, Medical Imaging, Segment Anything Model, SAM, + Scribble Annotations, Prompt +
+
+
+
+
+ + ♻ ☆ Generative AI-Based Effective Malware Detection for Embedded Computing + Systems + + +
+ One of the pivotal security threats to embedded computing systems is
+malicious software, a.k.a. malware. Owing to its efficiency and efficacy,
+Machine Learning (ML) has been widely adopted for malware detection in recent
+times. Despite being efficient, the existing techniques require a tremendous
+number of benign and malware samples for training and modeling an efficient
+malware detector. Furthermore, such constraints limit the detection of
+emerging malware samples due to the lack of sufficient malware samples
+required for efficient training. To address such concerns, we introduce a
+code-aware data generation technique that generates multiple mutated samples
+of the malware seen only limitedly by the devices. Loss minimization ensures
+that the generated samples closely mimic the limitedly seen malware and
+mitigates impractical samples. The generated malware is further incorporated
+into the training set to formulate a model that can efficiently detect
+emerging malware despite having limited exposure. The experimental results
+demonstrate that the proposed technique achieves an accuracy of 90% in
+detecting limitedly seen malware, which is approximately 3x higher than the
+accuracy attained by state-of-the-art techniques.
+
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck NAACL 2024 + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
+
+ comment: Findings of NAACL 2024 (long paper) +
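+
+ A conceptual sketch of the classification rule described above: each class
+is represented by one text descriptor per visual part, and the class logit is
+the summed similarity between detected part embeddings and that class's
+descriptor embeddings. Shapes and names are assumptions for illustration;
+editing a class then amounts to swapping its descriptor embeddings, with no
+re-training.
+```python
+import torch
+
+def class_logits(part_emb, class_descriptor_emb):
+    """part_emb: (P, D) embeddings of detected parts.
+    class_descriptor_emb: (C, P, D) text embeddings, one descriptor per part per class."""
+    sims = torch.einsum("pd,cpd->cp", part_emb, class_descriptor_emb)  # (C, P)
+    return sims.sum(dim=-1)   # (C,) one logit per class
+```
+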
+
+
+
+
+ + ♻ ☆ Coverage Axis++: Efficient Inner Point Selection for 3D Shape + Skeletonization + + +
+ We introduce Coverage Axis++, a novel and efficient approach to 3D shape +skeletonization. The current state-of-the-art approaches for this task often +rely on the watertightness of the input or suffer from substantial +computational costs, thereby limiting their practicality. To address this +challenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal +points, offering a high-accuracy approximation of the Medial Axis Transform +(MAT) while significantly mitigating computational intensity for various shape +representations. We introduce a simple yet effective strategy that considers +shape coverage, uniformity, and centrality to derive skeletal points. The +selection procedure enforces consistency with the shape structure while +favoring the dominant medial balls, which thus introduces a compact underlying +shape representation in terms of MAT. As a result, Coverage Axis++ allows for +skeletonization for various shape representations (e.g., water-tight meshes, +triangle soups, point clouds), specification of the number of skeletal points, +few hyperparameters, and highly efficient computation with improved +reconstruction accuracy. Extensive experiments across a wide range of 3D shapes +validate the efficiency and effectiveness of Coverage Axis++. The code will be +publicly available once the paper is published. + +
+
+
+
+
+ + ♻ ☆ Generating Illustrated Instructions CVPR 2024 + + +
+ We introduce the new task of generating Illustrated Instructions, i.e., +visual instructions customized to a user's needs. We identify desiderata unique +to this task, and formalize it through a suite of automatic and human +evaluation metrics, designed to measure the validity, consistency, and efficacy +of the generations. We combine the power of large language models (LLMs) +together with strong text-to-image generation diffusion models to propose a +simple approach called StackedDiffusion, which generates such illustrated +instructions given text as input. The resulting model strongly outperforms +baseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases, +users even prefer it to human-generated articles. Most notably, it enables +various new and exciting applications far beyond what static articles on the +web can provide, such as personalized instructions complete with intermediate +steps and pictures in response to a user's individual situation. + +
+
+ comment: Accepted to CVPR 2024. Project website: + http://facebookresearch.github.io/IllustratedInstructions. Code reproduction: + https://github.com/sachit-menon/generating-illustrated-instructions-reproduction +
+
+
+
+
+ + ♻ ☆ Understanding and Modeling the Effects of Task and Context on Drivers' + Gaze Allocation + + +
+ To further advance driver monitoring and assistance systems, it is important +to understand how drivers allocate their attention, in other words, where do +they tend to look and why. Traditionally, factors affecting human visual +attention have been divided into bottom-up (involuntary attraction to salient +regions) and top-down (driven by the demands of the task being performed). +Although both play a role in directing drivers' gaze, most of the existing +models for drivers' gaze prediction apply techniques developed for bottom-up +saliency and do not consider influences of the drivers' actions explicitly. +Likewise, common driving attention benchmarks lack relevant annotations for +drivers' actions and the context in which they are performed. Therefore, to +enable analysis and modeling of these factors for drivers' gaze prediction, we +propose the following: 1) we correct the data processing pipeline used in +DR(eye)VE to reduce noise in the recorded gaze data; 2) we then add per-frame +labels for driving task and context; 3) we benchmark a number of baseline and +SOTA models for saliency and driver gaze prediction and use new annotations to +analyze how their performance changes in scenarios involving different tasks; +and, lastly, 4) we develop a novel model that modulates drivers' gaze +prediction with explicit action and context information. While reducing noise +in the DR(eye)VE gaze data improves results of all models, we show that using +task information in our proposed model boosts performance even further compared +to bottom-up models on the cleaned up data, both overall (by 24% KLD and 89% +NSS) and on scenarios that involve performing safety-critical maneuvers and +crossing intersections (by up to 10--30% KLD). Extended annotations and code +are available at https://github.com/ykotseruba/SCOUT. + +
+
+ comment: Accepted at IEEE Intelligent Vehicles Symposium (IV), 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 168 + +
+
+
+ + ☆ GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo CVPR 2024 + + +
+ Matching cost aggregation plays a fundamental role in learning-based
+multi-view stereo networks. However, directly aggregating adjacent costs can
+lead to suboptimal results due to local geometric inconsistency. Related
+methods either seek selective aggregation or improve aggregated depth in the
+2D space, but both are unable to handle geometric inconsistency in the cost
+volume effectively. In this paper, we propose GoMVS to aggregate
+geometrically consistent costs, yielding better utilization of adjacent
+geometries. More specifically, we correspond and propagate adjacent costs to
+the reference pixel by leveraging local geometric smoothness in conjunction
+with surface normals. We achieve this via the geometric consistent
+propagation (GCP) module. It computes the correspondence from the adjacent
+depth hypothesis space to the reference depth space using surface normals,
+then uses the correspondence to propagate adjacent costs to the reference
+geometry, followed by a convolution for aggregation. Our method achieves new
+state-of-the-art performance on the DTU, Tanks & Temples, and ETH3D datasets.
+Notably, our method ranks 1st on the Tanks & Temples Advanced benchmark.
+
+
+ comment: CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code: + https://github.com/Wuuu3511/GoMVS +
+
+
+
+
+ + ☆ Connecting NeRFs, Images, and Text CVPR + + +
+ Neural Radiance Fields (NeRFs) have emerged as a standard framework for +representing 3D scenes and objects, introducing a novel data type for +information exchange and storage. Concurrently, significant progress has been +made in multimodal representation learning for text and image data. This paper +explores a novel research direction that aims to connect the NeRF modality with +other modalities, similar to established methodologies for images and text. To +this end, we propose a simple framework that exploits pre-trained models for +NeRF representations alongside multimodal models for text and image processing. +Our framework learns a bidirectional mapping between NeRF embeddings and those +obtained from corresponding images and text. This mapping unlocks several novel +and useful applications, including NeRF zero-shot classification and NeRF +retrieval from images or text. + +
+
+ comment: Accepted at CVPRW-INRV 2024 +
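+
+ A minimal sketch of the bidirectional mapping idea, assuming frozen
+pre-trained encoders that produce NeRF embeddings on one side and shared
+image/text (CLIP-like) embeddings on the other; the adapter below is a
+generic stand-in trained to translate between the two spaces, not the paper's
+exact architecture.
+```python
+import torch, torch.nn as nn, torch.nn.functional as F
+
+class BidirectionalAdapter(nn.Module):
+    def __init__(self, nerf_dim, clip_dim, hidden=512):
+        super().__init__()
+        self.to_clip = nn.Sequential(nn.Linear(nerf_dim, hidden), nn.GELU(),
+                                     nn.Linear(hidden, clip_dim))
+        self.to_nerf = nn.Sequential(nn.Linear(clip_dim, hidden), nn.GELU(),
+                                     nn.Linear(hidden, nerf_dim))
+
+    def forward(self, nerf_emb, clip_emb):
+        # train both directions so NeRFs can be retrieved from images/text and
+        # classified zero-shot by mapping into the shared space
+        return (F.mse_loss(self.to_clip(nerf_emb), clip_emb) +
+                F.mse_loss(self.to_nerf(clip_emb), nerf_emb))
+```
+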
+
+
+
+
+ + ☆ GoMAvatar: Efficient Animatable Human Modeling from Monocular Video + Using Gaussians-on-Mesh CVPR 2024 + + +
+ We introduce GoMAvatar, a novel approach for real-time, memory-efficient, +high-quality animatable human modeling. GoMAvatar takes as input a single +monocular video to create a digital avatar capable of re-articulation in new +poses and real-time rendering from novel viewpoints, while seamlessly +integrating with rasterization-based graphics pipelines. Central to our method +is the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering +quality and speed of Gaussian splatting with geometry modeling and +compatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and +various YouTube videos. GoMAvatar matches or surpasses current monocular human +modeling algorithms in rendering quality and significantly outperforms them in +computational efficiency (43 FPS) while being memory-efficient (3.63 MB per +subject). + +
+
+ comment: CVPR 2024; project page: https://wenj.github.io/GoMAvatar/ +
+
+
+
+
+ + ☆ OpenBias: Open-set Bias Detection in Text-to-Image Generative Models CVPR 2024 + + +
+ Text-to-image generative models are becoming increasingly popular and +accessible to the general public. As these models see large-scale deployments, +it is necessary to deeply investigate their safety and fairness to not +disseminate and perpetuate any kind of biases. However, existing works focus on +detecting closed sets of biases defined a priori, limiting the studies to +well-known concepts. In this paper, we tackle the challenge of open-set bias +detection in text-to-image generative models presenting OpenBias, a new +pipeline that identifies and quantifies the severity of biases agnostically, +without access to any precompiled set. OpenBias has three stages. In the first +phase, we leverage a Large Language Model (LLM) to propose biases given a set +of captions. Secondly, the target generative model produces images using the +same set of captions. Lastly, a Vision Question Answering model recognizes the +presence and extent of the previously proposed biases. We study the behavior of +Stable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated +before. Via quantitative experiments, we demonstrate that OpenBias agrees with +current closed-set bias detection methods and human judgement. + +
+
+ comment: CVPR 2024 Highlight - Code: + https://github.com/Picsart-AI-Research/OpenBias +
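+
+ A high-level sketch of the three-stage pipeline; `propose_biases`,
+`generate_image`, and `vqa_answer` are hypothetical wrappers around an LLM,
+the generative model under audit, and a VQA model, respectively.
+```python
+from collections import defaultdict
+
+def open_set_bias_audit(captions, propose_biases, generate_image, vqa_answer):
+    proposals = propose_biases(captions)            # stage 1: {bias_name: question}
+    counts = defaultdict(lambda: defaultdict(int))
+    for caption in captions:
+        image = generate_image(caption)             # stage 2: generate from the same captions
+        for bias, question in proposals.items():    # stage 3: query the VQA model
+            counts[bias][vqa_answer(image, question)] += 1
+    return counts   # answer distributions per proposed bias, used to quantify severity
+```
+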
+
+
+
+
+ + ☆ Any2Point: Empowering Any-modality Large Models for Efficient 3D + Understanding + + +
+ Large foundation models have recently emerged as a prominent focus of +interest, attaining superior performance in widespread scenarios. Due to the +scarcity of 3D data, many efforts have been made to adapt pre-trained +transformers from vision to 3D domains. However, such 2D-to-3D approaches are +still limited, due to the potential loss of spatial geometries and high +computation cost. More importantly, their frameworks are mainly designed for 2D +models, lacking a general any-to-3D paradigm. In this paper, we introduce +Any2Point, a parameter-efficient method to empower any-modality large models +(vision, language, audio) for 3D understanding. Given a frozen transformer from +any source modality, we propose a 3D-to-any (1D or 2D) virtual projection +strategy that correlates the input 3D points to the original 1D or 2D positions +within the source modality. This mechanism enables us to assign each 3D token +with a positional encoding paired with the pre-trained model, which avoids 3D +geometry loss caused by the true projection and better motivates the +transformer for 3D learning with 1D/2D positional priors. Then, within each +transformer block, we insert an any-to-3D guided adapter module for +parameter-efficient fine-tuning. The adapter incorporates prior spatial +knowledge from the source modality to guide the local feature aggregation of 3D +tokens, compelling the semantic adaption of any-modality transformers. We +conduct extensive experiments to showcase the effectiveness and efficiency of +our method. Code and models are released at +https://github.com/Ivan-Tang-3D/Any2Point. + +
+
+ comment: Code and models are released at + https://github.com/Ivan-Tang-3D/Any2Point +
+
+
+
+
+ + ☆ QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous + Manipulations Transfer + + +
+ We explore the dexterous manipulation transfer problem by designing
+simulators. The task aims to transfer human manipulations to dexterous robot
+hand simulations and is inherently difficult due to its intricate, highly
+constrained, and discontinuous dynamics and the need to control a dexterous
+hand with a high number of degrees of freedom (DoFs) to accurately replicate
+human manipulations. Previous approaches that optimize in high-fidelity
+black-box simulators, or in a modified simulator with relaxed constraints,
+only demonstrate limited capabilities or are restricted by insufficient
+simulation fidelity. We introduce parameterized quasi-physical simulators and
+a physics curriculum to overcome these limitations. The key ideas are 1)
+balancing between fidelity and optimizability of the simulation via a
+curriculum of parameterized simulators, and 2) solving the problem in each of
+the simulators from the curriculum, with properties ranging from high task
+optimizability to high fidelity. We successfully enable a dexterous hand to
+track complex and diverse manipulations in high-fidelity simulated
+environments, boosting the success rate by 11\%+ over the best-performing
+baseline. The project website is available at
+https://meowuu7.github.io/QuasiSim/.
+
+
+ comment: Project website: https://meowuu7.github.io/QuasiSim/ Code: + https://github.com/Meowuu7/QuasiSim Hugging Face Demo: + https://huggingface.co/spaces/xymeow7/quasi-physical-sims +
+
+
+
+
+ + ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. + +
+
+ comment: Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus +
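+
+ A schematic sketch of the efficient reward strategy described above: lightly
+noise the input image, denoise it in a single step, re-extract the condition
+with a frozen reward model, and penalize disagreement with the input
+condition. All callables passed in are placeholders rather than a specific
+library API.
+```python
+import torch.nn.functional as F
+
+def reward_consistency_loss(image, condition, add_noise, denoise_one_step,
+                            reward_model, t):
+    noisy = add_noise(image, t)                       # deliberately disturb the input image
+    denoised = denoise_one_step(noisy, condition, t)  # single-step denoising, no full sampling
+    extracted = reward_model(denoised)                # e.g., predicted mask / edges / depth
+    return F.mse_loss(extracted, condition)           # pixel-level cycle-consistency reward
+```
+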
+
+
+
+
+ + ☆ WaveMo: Learning Wavefront Modulations to See Through Scattering + + +
+ Imaging through scattering media is a fundamental and pervasive challenge in +fields ranging from medical diagnostics to astronomy. A promising strategy to +overcome this challenge is wavefront modulation, which induces measurement +diversity during image acquisition. Despite its importance, designing optimal +wavefront modulations to image through scattering remains under-explored. This +paper introduces a novel learning-based framework to address the gap. Our +approach jointly optimizes wavefront modulations and a computationally +lightweight feedforward "proxy" reconstruction network. This network is trained +to recover scenes obscured by scattering, using measurements that are modified +by these modulations. The learned modulations produced by our framework +generalize effectively to unseen scattering scenarios and exhibit remarkable +versatility. During deployment, the learned modulations can be decoupled from +the proxy network to augment other more computationally expensive restoration +algorithms. Through extensive experiments, we demonstrate our approach +significantly advances the state of the art in imaging through scattering +media. Our project webpage is at https://wavemo-2024.github.io/. + +
+
+
+
+
+ + ☆ View Selection for 3D Captioning via Diffusion Ranking + + +
+ Scalable annotation approaches are crucial for constructing extensive
+3D-text datasets, facilitating a broader range of applications. However,
+existing methods sometimes lead to the generation of hallucinated captions,
+compromising caption quality. This paper explores the issue of hallucination
+in 3D object captioning, with a focus on the Cap3D method, which renders 3D
+objects into 2D views for captioning using pre-trained models. We pinpoint a
+major challenge: certain rendered views of 3D objects are atypical, deviating
+from the training data of standard image captioning models and causing
+hallucinations. To tackle this, we present DiffuRank, a method that leverages
+a pre-trained text-to-3D model to assess the alignment between 3D objects and
+their 2D rendered views, where views with high alignment closely represent
+the object's characteristics. By ranking all rendered views and feeding the
+top-ranked ones into GPT4-Vision, we enhance the accuracy and detail of
+captions, enabling the correction of 200k captions in the Cap3D dataset and
+extending it to 1 million captions across the Objaverse and Objaverse-XL
+datasets. Additionally, we showcase the adaptability of DiffuRank by applying
+it to pre-trained text-to-image models for a Visual Question Answering task,
+where it outperforms the CLIP model.
+
+
+ comment: Dataset link: https://huggingface.co/datasets/tiange/Cap3D +
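+
+ A minimal sketch of the ranking step, assuming a scoring function (e.g.,
+derived from a pre-trained text-to-3D model) that measures object-view
+alignment; only the top-ranked views are passed on to the captioning model.
+```python
+def top_views(views, alignment_score, k=6):
+    """views: list of rendered images; alignment_score: view -> float."""
+    ranked = sorted(views, key=alignment_score, reverse=True)
+    return ranked[:k]   # feed these top-ranked views to the captioner
+```
+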
+
+
+
+
+ + ☆ Two Effects, One Trigger: On the Modality Gap, Object Bias, and + Information Imbalance in Contrastive Vision-Language Representation Learning + + +
+ Contrastive vision-language models like CLIP have gained popularity because
+their learned representations transfer well to a variety of downstream tasks.
+Despite their successes in some tasks, like zero-shot image recognition, they
+also perform surprisingly poorly on other tasks, like attribute detection.
+Previous work has attributed these challenges to the modality gap, a
+separation of image and text in the shared representation space, and to a
+bias towards objects over other factors, such as attributes. In this work, we
+investigate both phenomena. We find that only a few embedding dimensions
+drive the modality gap. Further, we propose a measure for object bias and
+find that object bias does not lead to worse performance on other concepts,
+such as attributes. But what leads to the emergence of the modality gap and
+object bias? To answer this question, we carefully design an experimental
+setting that allows us to control the amount of shared information between
+the modalities. This reveals that the driving factor behind both the modality
+gap and the object bias is the information imbalance between images and
+captions.
+
+
+
+
+
+ + ☆ Gaga: Group Any Gaussians via 3D-aware Memory Bank + + +
+ We introduce Gaga, a framework that reconstructs and segments open-world 3D
+scenes by leveraging inconsistent 2D masks predicted by zero-shot
+segmentation models. In contrast to prior 3D scene segmentation approaches
+that heavily rely on video object tracking, Gaga utilizes spatial information
+and effectively associates object masks across diverse camera poses. By
+eliminating the assumption of continuous view changes in training images,
+Gaga demonstrates robustness to variations in camera poses, which is
+particularly beneficial for sparsely sampled images, ensuring precise mask
+label consistency. Furthermore, Gaga accommodates 2D segmentation masks from
+diverse sources and demonstrates robust performance with different open-world
+zero-shot segmentation models, enhancing its versatility. Extensive
+qualitative and quantitative evaluations demonstrate that Gaga performs
+favorably against state-of-the-art methods, emphasizing its potential for
+real-world applications such as scene understanding and manipulation.
+
+
+ comment: Project Page: https://www.gaga.gallery +
+
+
+
+
+ + ☆ Self-supervised Dataset Distillation: A Good Compression Is All You Need + + +
+ Dataset distillation aims to compress information from a large-scale
+original dataset into a new compact dataset while striving to preserve the
+utmost degree of the original data's informational essence. Previous studies
+have predominantly concentrated on aligning intermediate statistics between
+the original and distilled data, such as weight trajectories, features,
+gradients, BatchNorm statistics, etc. In this work, we consider addressing
+this task through the new lens of model informativeness in the compression
+stage of pretraining on the original dataset. We observe that with the prior
+state-of-the-art SRe$^2$L, as model sizes increase, it becomes increasingly
+challenging for supervised pretrained models to recover learned information
+during data synthesis, as the channel-wise mean and variance inside the model
+become flatter and less informative. We further notice that larger variances
+in the BN statistics of self-supervised models enable larger loss signals to
+update the recovered data via gradients, enjoying more informativeness during
+synthesis. Building on this observation, we introduce SC-DD, a simple yet
+effective Self-supervised Compression framework for Dataset Distillation that
+facilitates diverse information compression and recovery compared to
+traditional supervised learning schemes and further reaps the potential of
+large pretrained models with enhanced capabilities. Extensive experiments are
+conducted on the CIFAR-100, Tiny-ImageNet, and ImageNet-1K datasets to
+demonstrate the superiority of our proposed approach. The proposed SC-DD
+outperforms all previous state-of-the-art supervised dataset distillation
+methods, such as SRe$^2$L, MTT, TESLA, DC, CAFE, etc., by large margins when
+employing larger models, under the same recovery and post-training budgets.
+Code is available at https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.
+
+
+
+
+
+ + ☆ Ferret-v2: An Improved Baseline for Referring and Grounding with Large + Language Models + + +
+ While Ferret seamlessly integrates regional understanding into the Large
+Language Model (LLM) to facilitate its referring and grounding capability, it
+has certain limitations: it is constrained by a fixed pre-trained visual
+encoder and fails to perform well on broader tasks. In this work, we unveil
+Ferret-v2, a significant upgrade to Ferret, with three key designs. (1)
+Any-resolution grounding and referring: a flexible approach that effortlessly
+handles higher image resolution, improving the model's ability to process and
+understand images in greater detail. (2) Multi-granularity visual encoding:
+by integrating the additional DINOv2 encoder, the model learns better and
+more diverse underlying contexts for global and fine-grained visual
+information. (3) A three-stage training paradigm: besides image-caption
+alignment, an additional stage is proposed for high-resolution dense
+alignment before the final instruction tuning. Experiments show that
+Ferret-v2 provides substantial improvements over Ferret and other
+state-of-the-art methods, thanks to its high-resolution scaling and
+fine-grained visual processing.
+
+
+ comment: Preprint. 14 pages, 4 figures +
+
+
+
+
+ + ☆ Taming Stable Diffusion for Text to 360° Panorama Image Generation CVPR 2024 + + +
+ Generative models, e.g., Stable Diffusion, have enabled the creation of +photorealistic images from text prompts. Yet, the generation of 360-degree +panorama images from text remains a challenge, particularly due to the dearth +of paired text-panorama data and the domain gap between panorama and +perspective images. In this paper, we introduce a novel dual-branch diffusion +model named PanFusion to generate a 360-degree image from a text prompt. We +leverage the stable diffusion model as one branch to provide prior knowledge in +natural image generation and register it to another panorama branch for +holistic image generation. We propose a unique cross-attention mechanism with +projection awareness to minimize distortion during the collaborative denoising +process. Our experiments validate that PanFusion surpasses existing methods +and, thanks to its dual-branch structure, can integrate additional constraints +like room layout for customized panorama outputs. Code is available at +https://chengzhag.github.io/publication/panfusion. + +
+
+ comment: CVPR 2024. Project Page: + https://chengzhag.github.io/publication/panfusion Code: + https://github.com/chengzhag/PanFusion +
+
+
+
+
+ + ☆ Boosting Self-Supervision for Single-View Scene Completion via Knowledge + Distillation + + +
+ Inferring scene geometry from images via Structure from Motion is a
+long-standing and fundamental problem in computer vision. While classical
+approaches and, more recently, depth map predictions only focus on the
+visible parts of a scene, the task of scene completion aims to reason about
+geometry even in occluded regions. With the popularity of neural radiance
+fields (NeRFs), implicit representations also became popular for scene
+completion by predicting so-called density fields. Unlike explicit
+approaches, e.g., voxel-based methods, density fields also allow for accurate
+depth prediction and novel-view synthesis via image-based rendering. In this
+work, we propose to fuse the scene reconstruction from multiple images and
+distill this knowledge into a more accurate single-view scene reconstruction.
+To this end, we propose Multi-View Behind the Scenes (MVBTS) to fuse density
+fields from multiple posed images, trained fully self-supervised only from
+image data. Using knowledge distillation, we use MVBTS to train a single-view
+scene completion network, called KDBTS, via direct supervision. It achieves
+state-of-the-art performance on occupancy prediction, especially in occluded
+regions.
+
+
+
+
+
+ + ☆ FusionMamba: Efficient Image Fusion with State Space Model + + +
+ Image fusion aims to generate a high-resolution multi/hyper-spectral image by +combining a high-resolution image with limited spectral information and a +low-resolution image with abundant spectral data. Current deep learning +(DL)-based methods for image fusion primarily rely on CNNs or Transformers to +extract features and merge different types of data. While CNNs are efficient, +their receptive fields are limited, restricting their capacity to capture +global context. Conversely, Transformers excel at learning global information +but are hindered by their quadratic complexity. Fortunately, recent +advancements in the State Space Model (SSM), particularly Mamba, offer a +promising solution to this issue by enabling global awareness with linear +complexity. However, there have been few attempts to explore the potential of +SSM in information fusion, which is a crucial ability in domains like image +fusion. Therefore, we propose FusionMamba, an innovative method for efficient +image fusion. Our contributions mainly focus on two aspects. Firstly, +recognizing that images from different sources possess distinct properties, we +incorporate Mamba blocks into two U-shaped networks, presenting a novel +architecture that extracts spatial and spectral features in an efficient, +independent, and hierarchical manner. Secondly, to effectively combine spatial +and spectral information, we extend the Mamba block to accommodate dual inputs. +This expansion leads to the creation of a new module called the FusionMamba +block, which outperforms existing fusion techniques such as concatenation and +cross-attention. To validate FusionMamba's effectiveness, we conduct a series +of experiments on five datasets related to three image fusion tasks. The +quantitative and qualitative evaluation results demonstrate that our method +achieves state-of-the-art (SOTA) performance, underscoring the superiority of +FusionMamba. + +
+
+
+
+
+ + ☆ Parameter Hierarchical Optimization for Visible-Infrared Person + Re-Identification + + +
+ Visible-infrared person re-identification (VI-ReID) aims at matching +cross-modality pedestrian images captured by disjoint visible or infrared +cameras. Existing methods alleviate the cross-modality discrepancies via +designing different kinds of network architectures. Different from available +methods, in this paper, we propose a novel parameter optimizing paradigm, the +parameter hierarchical optimization (PHO) method, for the task of VI-ReID. It +allows part of the parameters to be directly optimized without any training, +which narrows the search space of parameters and makes the whole network easier +to train. Specifically, we first divide the parameters into different +types, and then introduce a self-adaptive alignment strategy (SAS) to +automatically align the visible and infrared images through transformation. +Considering that features in different dimensions have varying importance, we +develop an auto-weighted alignment learning (AAL) module that can automatically +weight features according to their importance. Importantly, in the alignment +process of SAS and AAL, all the parameters are immediately optimized with +optimization principles rather than by training the whole network, which yields +a better parameter training scheme. Furthermore, we establish the cross-modality +consistent learning (CCL) loss to extract discriminative person representations +with translation consistency. We provide both theoretical justification and +empirical evidence that our proposed PHO method outperforms existing VI-ReID +approaches. + 
+
+
+
+
+ + ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) +have taken the world by storm with impressive abilities in complex reasoning +and linguistic comprehension. While there is a plethora of work on Vietnamese +Large Language Models, the lack of high-quality multimodal resources limits the +progress of Vietnamese MLLMs. In this paper, we pioneer in addressing this gap +by introducing LaVy, a state-of-the-art Vietnamese MLLM, and we also introduce +LaVy-Bench, a benchmark designed for evaluating MLLMs' understanding of +Vietnamese visual language tasks. All code and model +weights are publicly available at https://github.com/baochi0212/LaVy + 
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Context-aware Video Anomaly Detection in Long-Term Datasets + + +
+ Video anomaly detection research is generally evaluated on short, isolated +benchmark videos only a few minutes long. However, in real-world environments, +security cameras observe the same scene for months or years at a time, and the +notion of anomalous behavior critically depends on context, such as the time of +day, day of week, or schedule of events. Here, we propose a context-aware video +anomaly detection algorithm, Trinity, specifically targeted to these scenarios. +Trinity is especially well-suited to crowded scenes in which individuals cannot +be easily tracked, and anomalies are due to speed, direction, or absence of +group motion. Trinity is a contrastive learning framework that aims to learn +alignments between context, appearance, and motion, and uses alignment quality +to classify videos as normal or anomalous. We evaluate our algorithm on both +conventional benchmarks and a public webcam-based dataset we collected that +spans more than three months of activity. + +
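As a rough illustration of the alignment idea above, the sketch below shows a symmetric InfoNCE-style loss between two of the embeddings (context and motion); at test time a low context-motion similarity would flag an anomaly. The names, shapes, and temperature are assumptions, and the paper aligns context, appearance, and motion jointly rather than pairwise.

```python
import torch
import torch.nn.functional as F

def alignment_loss(context_emb, motion_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch: matching (context, motion) pairs are pulled
    together and mismatched pairs pushed apart. Inputs have shape (B, D)."""
    c = F.normalize(context_emb, dim=-1)
    m = F.normalize(motion_emb, dim=-1)
    logits = c @ m.t() / temperature                      # (B, B) pairwise similarities
    targets = torch.arange(c.size(0), device=c.device)    # diagonal entries are positives
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```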
+
+
+
+
+ + ☆ The Power of Properties: Uncovering the Influential Factors in Emotion + Classification ICPR + + +
+ Facial expression-based human emotion recognition is a critical research area +in psychology and medicine. State-of-the-art classification performance is only +reached by end-to-end trained neural networks. Nevertheless, such black-box +models lack transparency in their decision-making processes, prompting efforts +to ascertain the rules that underlie classifiers' decisions. Analyzing single +inputs alone fails to expose systematic learned biases. These biases can be +characterized as facial properties summarizing abstract information like age or +medical conditions. Therefore, understanding a model's prediction behavior +requires an analysis rooted in causality along such selected properties. We +demonstrate that up to 91.25% of classifier output behavior changes are +statistically significant concerning basic properties. Among those are age, +gender, and facial symmetry. Furthermore, the medical usage of surface +electromyography significantly influences emotion prediction. We introduce a +workflow to evaluate explicit properties and their impact. These insights might +help medical professionals select and apply classifiers regarding their +specialized data and properties. + +
+
+ comment: 8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024 +
+
+
+
+
+ + ☆ Resolve Domain Conflicts for Generalizable Remote Physiological + Measurement ACM MM 2023 + + +
+ Remote photoplethysmography (rPPG) technology has become increasingly popular +due to its non-invasive monitoring of various physiological indicators, making +it widely applicable in multimedia interaction, healthcare, and emotion +analysis. Existing rPPG methods utilize multiple datasets for training to +enhance the generalizability of models. However, they often overlook the +underlying conflict issues across different datasets, such as (1) label +conflict resulting from different phase delays between physiological signal +labels and face videos at the instance level, and (2) attribute conflict +stemming from distribution shifts caused by head movements, illumination +changes, skin types, etc. To address this, we introduce the DOmain-HArmonious +framework (DOHA). Specifically, we first propose a harmonious phase strategy to +eliminate uncertain phase delays and preserve the temporal variation of +physiological signals. Next, we design a harmonious hyperplane optimization +that reduces irrelevant attribute shifts and encourages the model's +optimization towards a global solution that fits more valid scenarios. Our +experiments demonstrate that DOHA significantly improves the performance of +existing methods under multiple protocols. Our code is available at +https://github.com/SWY666/rPPG-DOHA. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ MindBridge: A Cross-Subject Brain Decoding Framework CVPR 2024 + + +
+ Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli +from acquired brain signals, primarily utilizing functional magnetic resonance +imaging (fMRI). Currently, brain decoding is confined to a +per-subject-per-model paradigm, limiting its applicability to the same +individual for whom the decoding model is trained. This constraint stems from +three key challenges: 1) the inherent variability in input dimensions across +subjects due to differences in brain size; 2) the unique intrinsic neural +patterns, influencing how different individuals perceive and process sensory +information; 3) the limited data availability for new subjects in real-world +scenarios, which hampers the performance of decoding models. In this paper, we +present a novel approach, MindBridge, that achieves cross-subject brain decoding +by employing only one model. Our proposed framework establishes a generic paradigm +capable of addressing these challenges by introducing a biologically-inspired +aggregation function and a novel cyclic fMRI reconstruction mechanism for +subject-invariant representation learning. Notably, by cycle reconstruction of +fMRI, MindBridge can enable novel fMRI synthesis, which can also serve as +pseudo data augmentation. Within the framework, we also devise a novel +reset-tuning method for adapting a pretrained model to a new subject. +Experimental results demonstrate MindBridge's ability to reconstruct images for +multiple subjects, which is competitive with dedicated subject-specific models. +Furthermore, with limited data for a new subject, we achieve a high level of +decoding accuracy, surpassing that of subject-specific models. This advancement +in cross-subject brain decoding suggests promising directions for wider +applications in neuroscience and indicates potential for more efficient +utilization of limited fMRI data in real-world scenarios. Project page: +https://littlepure2333.github.io/MindBridge + 
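One way to picture how a single model can ingest fMRI vectors whose length differs per subject (challenge 1 above) is to pool them to a fixed size before a shared encoder. The sketch below is only an illustration of that idea under my own assumptions; the paper's actual aggregation function, cyclic reconstruction, and reset-tuning are not reproduced here.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SubjectInvariantEncoder(nn.Module):
    """Illustrative encoder that maps fMRI vectors of different lengths (one length
    per subject) onto a shared fixed-size representation by adaptive pooling
    followed by a shared MLP. Dimensions are placeholders."""
    def __init__(self, pooled_dim=8192, hidden=2048, out_dim=768):
        super().__init__()
        self.pooled_dim = pooled_dim
        self.mlp = nn.Sequential(
            nn.Linear(pooled_dim, hidden), nn.GELU(), nn.Linear(hidden, out_dim))

    def forward(self, voxels):  # voxels: (B, N_subject_voxels), N varies across subjects
        x = F.adaptive_avg_pool1d(voxels.unsqueeze(1), self.pooled_dim).squeeze(1)
        return self.mlp(x)      # (B, out_dim), identical size for every subject
```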
+
+ comment: CVPR 2024 highlight. Code is available at + https://github.com/littlepure2333/MindBridge +
+
+
+
+
+ + ☆ Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd + Counting + + +
+ In the field of crowd-counting research, many recent deep-learning-based +methods have demonstrated robust capabilities for accurately estimating crowd +sizes. However, the enhancement in their performance often arises from an +increase in the complexity of the model structure. This paper introduces the +Fuss-Free Network (FFNet), a crowd counting deep learning model that is +characterized by its simplicity and efficiency in terms of its structure. The +model comprises only a neural network backbone and a multi-scale feature +fusion structure. The multi-scale feature fusion structure is a simple +architecture consisting of three branches, each equipped only with a focus +transition module, and combines the features from these branches through a +concatenation operation. Our proposed crowd counting model is trained and +evaluated on four widely used public datasets, and it achieves accuracy that is +comparable to that of existing complex models. The experimental results further +indicate that excellent performance in crowd counting tasks can also be +achieved by utilizing a simple, low-parameter, and computationally efficient +neural network structure. + 
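A toy version of such a three-branch concatenation fusion head is sketched below. The channel sizes, the content of the transition block, and the 1x1 regression head are assumptions for illustration rather than the FFNet design.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleMultiScaleFusion(nn.Module):
    """Rough sketch of a three-branch multi-scale fusion head: each branch applies a
    light transition block to a backbone feature map at a different scale, outputs
    are resized to a common resolution and concatenated, and a 1x1 convolution
    regresses the crowd density map."""
    def __init__(self, in_channels=(256, 512, 1024), mid=128):
        super().__init__()
        self.transitions = nn.ModuleList([
            nn.Sequential(nn.Conv2d(c, mid, 3, padding=1), nn.ReLU(inplace=True))
            for c in in_channels])
        self.head = nn.Conv2d(mid * len(in_channels), 1, 1)

    def forward(self, feats):                       # list of (B, C_i, H_i, W_i)
        size = feats[0].shape[-2:]
        outs = [F.interpolate(t(f), size=size, mode='bilinear', align_corners=False)
                for t, f in zip(self.transitions, feats)]
        return self.head(torch.cat(outs, dim=1))    # predicted density map
```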
+
+
+
+
+ + ☆ TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image + Denoising + + +
+ Blind-spot networks (BSN) have been prevalent network architectures in +self-supervised image denoising (SSID). Existing BSNs are mostly conducted with +convolution layers. Although transformers offer potential solutions to the +limitations of convolutions and have demonstrated success in various image +restoration tasks, their attention mechanisms may violate the blind-spot +requirement, thus restricting their applicability in SSID. In this paper, we +present a transformer-based blind-spot network (TBSN) by analyzing and +redesigning the transformer operators that meet the blind-spot requirement. +Specifically, TBSN follows the architectural principles of dilated BSNs, and +incorporates spatial as well as channel self-attention layers to enhance the +network capability. For spatial self-attention, an elaborate mask is applied to +the attention matrix to restrict its receptive field, thus mimicking the +dilated convolution. For channel self-attention, we observe that it may leak +the blind-spot information when the channel number is greater than spatial size +in the deep layers of multi-scale architectures. To eliminate this effect, we +divide the channel into several groups and perform channel attention +separately. Furthermore, we introduce a knowledge distillation strategy that +distills TBSN into smaller denoisers to improve computational efficiency while +maintaining performance. Extensive experiments on real-world image denoising +datasets show that TBSN largely extends the receptive field and exhibits +favorable performance against state-of-the-art SSID methods. The code and +pre-trained models will be publicly available at +https://github.com/nagejacob/TBSN. + +
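The grouped channel self-attention idea described above can be illustrated with a small module that restricts each attention map to one channel group, so no single map mixes information across all channels. The projection layers, group count, and scaling are my assumptions, not the TBSN implementation.

```python
import torch
import torch.nn as nn

class GroupedChannelAttention(nn.Module):
    """Toy grouped channel self-attention: channels are split into groups and
    attention is computed within each group only, limiting how much information a
    single attention map can aggregate across channels."""
    def __init__(self, channels, groups=4):
        super().__init__()
        assert channels % groups == 0
        self.groups = groups
        self.qkv = nn.Conv2d(channels, channels * 3, 1)
        self.proj = nn.Conv2d(channels, channels, 1)

    def forward(self, x):                                   # (B, C, H, W)
        b, c, h, w = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=1)
        split = lambda t: t.reshape(b, self.groups, c // self.groups, h * w)
        q, k, v = split(q), split(k), split(v)
        attn = torch.softmax(q @ k.transpose(-2, -1) / (h * w) ** 0.5, dim=-1)
        out = (attn @ v).reshape(b, c, h, w)                # per-group channel attention
        return self.proj(out)
```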
+
+
+
+
+ + ☆ Streamlined Photoacoustic Image Processing with Foundation Models: A + Training-Free Solution + + +
+ Foundation models have rapidly evolved and have achieved significant +accomplishments in computer vision tasks. Specifically, the prompt mechanism +conveniently allows users to integrate image prior information into the model, +making it possible to apply models without any training. Therefore, we propose +a method based on foundation models and zero training to solve the tasks of +photoacoustic (PA) image segmentation. We employed the segment anything model +(SAM) by setting simple prompts and integrating the model's outputs with prior +knowledge of the imaged objects to accomplish various tasks, including: (1) +removing the skin signal in three-dimensional PA image rendering; (2) dual +speed-of-sound reconstruction, and (3) segmentation of finger blood vessels. +Through these demonstrations, we have concluded that deep learning can be +directly applied in PA imaging without the requirement for network design and +training. This potentially allows for a hands-on, convenient approach to +achieving efficient and accurate segmentation of PA images. This letter serves +as a comprehensive tutorial, facilitating the mastery of the technique through +the provision of code and sample datasets. + +
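As a hedged illustration of the prompting workflow, the snippet below segments a structure in a 2D photoacoustic slice with a single point prompt using Meta's publicly released segment-anything package. The checkpoint path, input file, and prompt coordinates are placeholders; the paper's exact prompts and post-processing are not reproduced.

```python
import numpy as np
from segment_anything import sam_model_registry, SamPredictor  # Meta's SAM package

# Load a SAM backbone and wrap it in a predictor (checkpoint path is a placeholder).
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")
predictor = SamPredictor(sam)

# Hypothetical (H, W, 3) uint8 RGB rendering of a photoacoustic slice.
image = np.load("pa_slice.npy")
predictor.set_image(image)

point = np.array([[128, 96]])          # a pixel assumed to lie on the structure of interest
masks, scores, _ = predictor.predict(point_coords=point,
                                     point_labels=np.array([1]),
                                     multimask_output=True)
segmentation = masks[scores.argmax()]  # keep the highest-scoring proposal
```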
+
+
+
+
+ + ☆ Heron-Bench: A Benchmark for Evaluating Vision Language Models in + Japanese + + +
+ Vision Language Models (VLMs) have undergone a rapid evolution, giving rise +to significant advancements in the realm of multimodal understanding tasks. +However, the majority of these models are trained and evaluated on +English-centric datasets, leaving a gap in the development and evaluation of +VLMs for other languages, such as Japanese. This gap can be attributed to the +lack of methodologies for constructing VLMs and the absence of benchmarks to +accurately measure their performance. To address this issue, we introduce a +novel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of +VLMs. The Japanese Heron-Bench consists of a variety of image-question-answer +pairs tailored to the Japanese context. Additionally, we present a baseline +Japanese VLM that has been trained with Japanese visual instruction tuning +datasets. Our Heron-Bench reveals the strengths and limitations of the proposed +VLM across various ability dimensions. Furthermore, we clarify the capability +gap between strong closed models like GPT-4V and the baseline model, providing +valuable insights for future research in this domain. We release the benchmark +dataset and training code to facilitate further developments in Japanese VLM +research. + 
+
+
+
+
+ + ☆ Sparse Laneformer + + +
+ Lane detection is a fundamental task in autonomous driving, and has achieved +great progress as deep learning emerges. Previous anchor-based methods often +design dense anchors, which highly depend on the training dataset and remain +fixed during inference. We analyze that dense anchors are not necessary for +lane detection, and propose a transformer-based lane detection framework based +on a sparse anchor mechanism. To this end, we generate sparse anchors with +position-aware lane queries and angle queries instead of traditional explicit +anchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane +features along the horizontal direction, and adopt Lane-Angle Cross Attention +(LACA) to perform interactions between lane queries and angle queries. We also +propose Lane Perceptual Attention (LPA) based on deformable cross attention to +further refine the lane predictions. Our method, named Sparse Laneformer, is +easy-to-implement and end-to-end trainable. Extensive experiments demonstrate +that Sparse Laneformer performs favorably against the state-of-the-art methods, +e.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score +with fewer MACs on CULane with the same ResNet-34 backbone. + +
+
+
+
+
+ + ☆ Voice-Assisted Real-Time Traffic Sign Recognition System Using + Convolutional Neural Network + + +
+ Traffic signs are important in communicating information to drivers. Thus, +comprehension of traffic signs is essential for road safety and ignorance may +result in road accidents. Traffic sign detection has been a research spotlight +over the past few decades. Real-time and accurate detections are the +preliminaries of robust traffic sign detection system which is yet to be +achieved. This study presents a voice-assisted real-time traffic sign +recognition system which is capable of assisting drivers. This system functions +under two subsystems. Initially, the detection and recognition of the traffic +signs are carried out using a trained Convolutional Neural Network (CNN). After +recognizing the specific traffic sign, it is narrated to the driver as a voice +message using a text-to-speech engine. An efficient CNN model for a benchmark +dataset is developed for real-time detection and recognition using Deep +Learning techniques. The advantage of this system is that even if the driver +misses a traffic sign, or does not look at the traffic sign, or is unable to +comprehend the sign, the system detects it and narrates it to the driver. A +system of this type is also important in the development of autonomous +vehicles. + +
+
+
+
+
+ + ☆ DGMamba: Domain Generalization via Generalized State Space Model + + +
+ Domain generalization (DG) aims at solving distribution shift problems in +various scenes. Existing approaches are based on Convolutional Neural Networks +(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive +fields or quadratic complexity issues. Mamba, as an emerging state space +model (SSM), possesses superior linear complexity and global receptive fields. +Despite this, it can hardly be applied to DG to address distribution shifts, +due to the hidden state issues and inappropriate scan mechanisms. In this +paper, we propose a novel framework for DG, named DGMamba, that excels in +strong generalizability toward unseen domains and meanwhile has the advantages +of global receptive fields and efficient linear complexity. Our DGMamba +comprises two core components: Hidden State Suppressing (HSS) and +Semantic-aware Patch Refining (SPR). In particular, HSS is introduced to +mitigate the influence of hidden states associated with domain-specific +features during output prediction. SPR strives to encourage the model to +concentrate more on objects rather than context, and consists of two designs: +Prior-Free Scanning (PFS) and Domain Context Interchange (DCI). Concretely, +PFS aims to shuffle the non-semantic patches within images, creating more +flexible and effective sequences from images, and DCI is designed to regularize +Mamba with the combination of mismatched non-semantic and semantic information +by fusing patches among domains. Extensive experiments on four commonly used DG +benchmarks demonstrate that the proposed DGMamba achieves remarkably superior +results to state-of-the-art models. The code will be made publicly available. + 
+
+
+
+
+ + ☆ VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing + + +
+ Image dehazing poses significant challenges in environmental perception. +Recent research mainly focuses on single-modality deep learning-based methods, +which may result in severe information loss, especially in dense-haze +scenarios. Infrared images are robust to haze; however, existing methods have +primarily treated the infrared modality as auxiliary information, failing to +fully explore its rich information for dehazing. To address this challenge, the +key insight of this study is to design +a visible-infrared fusion network for image dehazing. In particular, we propose +a multi-scale Deep Structure Feature Extraction (DSFE) module, which +incorporates the Channel-Pixel Attention Block (CPAB) to restore more spatial +and marginal information within the deep structural features. Additionally, we +introduce an inconsistency weighted fusion strategy to merge the two modalities +by leveraging the more reliable information. To validate this, we construct a +visible-infrared multimodal dataset called AirSim-VID based on the AirSim +simulation platform. Extensive experiments performed on challenging real and +simulated image datasets demonstrate that VIFNet can outperform many +state-of-the-art competing methods. The code and dataset are available at +https://github.com/mengyu212/VIFNet_dehazing. + 
+
+
+
+
+ + ☆ AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene + Graph Generation + + +
+ Scene graph generation (SGG) aims to understand the visual objects and their +semantic relationships from a given image. Until now, many SGG datasets with an +eye-level view have been released, but SGG from an overhead view has scarcely +been studied. In contrast to the eye-level view, where object occlusion impedes +SGG, the overhead view provides a new +perspective that helps to promote SGG by providing a clear perception of +the spatial relationships of objects in the ground scene. To fill the gap in +overhead-view datasets, this paper constructs and releases an aerial image +urban scene graph generation (AUG) dataset. Images from the AUG dataset are +captured from a low-altitude overhead view. In the AUG dataset, 25,594 +objects, 16,970 relationships, and 27,175 attributes are manually annotated. To +avoid the local context being overwhelmed in the complex aerial urban scene, +this paper proposes a new locality-preserving graph convolutional network +(LPG). Different from the traditional graph convolutional network, which has +the natural advantage of capturing the global context for SGG, the +convolutional layer in the LPG integrates the non-destructive initial features +of the objects with dynamically updated neighborhood information to preserve +the local context under the premise of mining the global context. To address +the problem that there exists an extra-large number of potential object +relationship pairs but only a small part of them is meaningful in AUG, we +propose the adaptive bounding box scaling factor for potential relationship +detection (ABS-PRD) to intelligently prune the meaningless relationship pairs. +Extensive experiments on the AUG dataset show that our LPG can significantly +outperform the state-of-the-art methods and demonstrate the effectiveness of +the proposed locality-preserving strategy. + 
+
+
+
+
+ + ☆ PRAM: Place Recognition Anywhere Model for Efficient Visual Localization + + +
+ Humans localize themselves efficiently in known environments by first +recognizing landmarks defined on certain objects and their spatial +relationships, and then verifying the location by aligning detailed structures +of recognized objects with those in the memory. Inspired by this, we propose +the place recognition anywhere model (PRAM) to perform visual localization as +efficiently as humans do. PRAM consists of two main components - recognition +and registration. In detail, first of all, a self-supervised map-centric +landmark definition strategy is adopted, making places in either indoor or +outdoor scenes act as unique landmarks. Then, sparse keypoints extracted from +images, are utilized as the input to a transformer-based deep neural network +for landmark recognition; these keypoints enable PRAM to recognize hundreds of +landmarks with high time and memory efficiency. Keypoints along with recognized +landmark labels are further used for registration between query images and the +3D landmark map. Different from previous hierarchical methods, PRAM discards +global and local descriptors, and reduces over 90% storage. Since PRAM utilizes +recognition and landmark-wise verification to replace global reference search +and exhaustive matching respectively, it runs 2.4 times faster than prior +state-of-the-art approaches. Moreover, PRAM opens new directions for visual +localization including multi-modality localization, map-centric feature +learning, and hierarchical scene coordinate regression. + +
+
+ comment: project page: https://feixue94.github.io/pram-project/ +
+
+
+
+
+ + ☆ ConsistencyDet: Robust Object Detector with Denoising Paradigm of + Consistency Model + + +
+ Object detection, a quintessential task in the realm of perceptual computing, +can be tackled using a generative methodology. In the present study, we +introduce a novel framework designed to articulate object detection as a +denoising diffusion process, which operates on perturbed bounding boxes of +annotated entities. This framework, termed ConsistencyDet, leverages an +innovative denoising concept known as the Consistency Model. The hallmark of +this model is its self-consistency feature, which empowers the model to map +distorted information from any temporal stage back to its pristine state, +thereby realizing a ``one-step denoising'' mechanism. Such an attribute +markedly elevates the operational efficiency of the model, setting it apart +from the conventional Diffusion Model. Throughout the training phase, +ConsistencyDet initiates the diffusion sequence with noise-infused boxes +derived from the ground-truth annotations and conditions the model to perform +the denoising task. Subsequently, in the inference stage, the model employs a +denoising sampling strategy that commences with bounding boxes randomly sampled +from a normal distribution. Through iterative refinement, the model transforms +an assortment of arbitrarily generated boxes into the definitive detections. +Comprehensive evaluations employing standard benchmarks, such as MS-COCO and +LVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in +performance metrics. + +
+
+
+
+
+ + ☆ Joint Conditional Diffusion Model for Image Restoration with Mixed + Degradations + + +
+ Image restoration is rather challenging in adverse weather conditions, +especially when multiple degradations occur simultaneously. Blind image +decomposition was proposed to tackle this issue, however, its effectiveness +heavily relies on the accurate estimation of each component. Although +diffusion-based models exhibit strong generative abilities in image restoration +tasks, they may generate irrelevant contents when the degraded images are +severely corrupted. To address these issues, we leverage physical constraints +to guide the whole restoration process, where a mixed degradation model based +on atmosphere scattering model is constructed. Then we formulate our Joint +Conditional Diffusion Model (JCDM) by incorporating the degraded image and +degradation mask to provide precise guidance. To achieve better color and +detail recovery results, we further integrate a refinement network to +reconstruct the restored image, where Uncertainty Estimation Block (UEB) is +employed to enhance the features. Extensive experiments performed on both +multi-weather and weather-specific datasets demonstrate the superiority of our +method over state-of-the-art competing methods. + +
+
+
+
+
+ + ☆ RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric + Stereo Network + + +
+ Predicting accurate normal maps of objects from two-dimensional images in +regions of complex structure and spatial material variations is challenging +using photometric stereo methods due to the influence of surface reflection +properties caused by variations in object geometry and surface materials. To +address this issue, we propose a photometric stereo network called a RMAFF-PSN +that uses residual multiscale attentional feature fusion to handle the +``difficult'' regions of the object. Unlike previous approaches that only use +stacked convolutional layers to extract deep features from the input image, our +method integrates feature information from different resolution stages and +scales of the image. This approach preserves more physical information, such as +texture and geometry of the object in complex regions, through shallow-deep +stage feature extraction, double branching enhancement, and attention +optimization. To test the network structure under real-world conditions, we +propose a new real dataset called Simple PS data, which contains multiple +objects with varying structures and materials. Experimental results on a +publicly available benchmark dataset demonstrate that our method outperforms +most existing calibrated photometric stereo methods for the same number of +input images, especially in the case of highly non-convex object structures. +Our method also obtains good results under sparse lighting conditions. + +
+
+ comment: 17 pages,12 figures +
+
+
+
+
+ + ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ☆ Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image + Models -- Technical Challenges and Implications for Monitoring and + Verification + + +
+ Novel deep-learning (DL) architectures have reached a level where they can +generate digital media, including photorealistic images, that are difficult to +distinguish from real data. These technologies have already been used to +generate training data for Machine Learning (ML) models, and large +text-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving +remarkable results in realistic high-resolution image generation. Given these +developments, issues of data authentication in monitoring and verification +deserve a careful and systematic analysis: How realistic are synthetic images? +How easily can they be generated? How useful are they for ML researchers, and +what is their potential for Open Science? In this work, we use novel DL models +to explore how synthetic satellite images can be created using conditioning +mechanisms. We investigate the challenges of synthetic satellite image +generation and evaluate the results based on authenticity and state-of-the-art +metrics. Furthermore, we investigate how synthetic data can alleviate the lack +of data in the context of ML methods for remote-sensing. Finally we discuss +implications of synthetic satellite imagery in the context of monitoring and +verification. + +
+
+ comment: https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models +
+
+
+
+
+ + ☆ 3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing + Surfaces + + +
+ The surface quality inspection of manufacturing parts based on 3D point cloud +data has attracted increasing attention in recent years. The reason is that the +3D point cloud can capture the entire surface of manufacturing parts, unlike +the previous practices that focus on some key product characteristics. However, +achieving accurate 3D anomaly detection is challenging, due to the complex +surfaces of manufacturing parts and the difficulty of collecting sufficient +anomaly samples. To address these challenges, we propose a novel untrained +anomaly detection method based on 3D point cloud data for complex manufacturing +parts, which can achieve accurate anomaly detection in a single sample without +training data. In the proposed framework, we transform an input sample into two +sets of profiles along different directions. Based on one set of the profiles, +a novel segmentation module is devised to segment the complex surface into +multiple basic and simple components. In each component, another set of +profiles, which have the nature of similar shapes, can be modeled as a low-rank +matrix. Thus, accurate 3D anomaly detection can be achieved by using Robust +Principal Component Analysis (RPCA) on these low-rank matrices. Extensive +numerical experiments on different types of parts show that our method achieves +promising results compared with the benchmark methods. + +
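The low-rank-plus-sparse decomposition at the heart of this approach can be carried out with a textbook Principal Component Pursuit routine such as the one below; the parameter defaults and stopping rule follow common PCP heuristics and are not taken from the paper.

```python
import numpy as np

def rpca_pcp(M, lam=None, mu=None, tol=1e-6, max_iter=300):
    """Basic Principal Component Pursuit by inexact ALM: decompose the profile
    matrix M into a low-rank part L (nominal shape) and a sparse part S
    (candidate surface anomalies). Generic textbook routine, not the authors' code."""
    m, n = M.shape
    lam = lam or 1.0 / np.sqrt(max(m, n))
    mu = mu or 0.25 * m * n / np.abs(M).sum()
    S = np.zeros_like(M)
    Y = np.zeros_like(M)
    shrink = lambda X, tau: np.sign(X) * np.maximum(np.abs(X) - tau, 0.0)
    for _ in range(max_iter):
        U, sig, Vt = np.linalg.svd(M - S + Y / mu, full_matrices=False)
        L = (U * shrink(sig, 1.0 / mu)) @ Vt          # singular value thresholding
        S = shrink(M - L + Y / mu, lam / mu)          # entrywise soft thresholding
        residual = M - L - S
        Y += mu * residual
        if np.linalg.norm(residual) <= tol * np.linalg.norm(M):
            break
    return L, S                                       # large |S| entries flag anomalies
```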
+
+
+
+
+ + ☆ Exploiting Object-based and Segmentation-based Semantic Features for + Deep Learning-based Indoor Scene Classification + + +
+ Indoor scenes are usually characterized by scattered objects and their +relationships, which turns the indoor scene classification task into a +challenging computer vision task. Despite the significant performance boost in +classification tasks achieved in recent years, provided by the use of +deep-learning-based methods, limitations such as inter-category ambiguity and +intra-category variation have been holding back their performance. To overcome +such issues, gathering semantic information has been shown to be a promising +source of information towards a more complete and discriminative feature +representation of indoor scenes. Therefore, the work described in this paper +uses both semantic information, obtained from object detection, and semantic +segmentation techniques. While object detection techniques provide the 2D +location of objects, allowing spatial distributions between objects to be +obtained, semantic segmentation techniques provide pixel-level information that +allows obtaining, at the pixel level, a spatial distribution and shape-related +features of the segmentation categories. Hence, a novel approach that uses a +semantic segmentation mask to provide Hu-moments-based segmentation categories' +shape characterization, designated Segmentation-based Hu-Moments Features +(SHMFs), is proposed. Moreover, a three-main-branch network, designated +GOS$^2$F$^2$App, that exploits deep-learning-based global features, +object-based features, and semantic segmentation-based features, is also +proposed. GOS$^2$F$^2$App was evaluated on two indoor scene benchmark datasets: +SUN RGB-D and NYU Depth V2, where, to the best of our knowledge, +state-of-the-art results were achieved on both datasets, which provides +evidence of the effectiveness of the proposed approach. + 
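A minimal sketch of per-category Hu-moment shape features, in the spirit of the SHMFs described above, is given below using OpenCV; the log-scaling of the moments and the handling of absent classes are my own assumptions rather than details from the paper.

```python
import cv2
import numpy as np

def shape_features(seg_mask, num_classes):
    """Per-class Hu-moment shape descriptor from a semantic segmentation mask
    (H, W) of integer labels; returns a vector of 7 * num_classes values."""
    feats = []
    for c in range(num_classes):
        binary = (seg_mask == c).astype(np.uint8)
        if binary.sum() == 0:
            feats.append(np.zeros(7))                 # class absent from the image
            continue
        hu = cv2.HuMoments(cv2.moments(binary)).flatten()
        feats.append(-np.sign(hu) * np.log10(np.abs(hu) + 1e-30))  # compress dynamic range
    return np.concatenate(feats)
```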
+
+ comment: This preprint was submitted at IEEE Transactions on Image Processing +
+
+
+
+
+ + ☆ Realistic Continual Learning Approach using Pre-trained Models + + +
+ Continual learning (CL) is crucial for evaluating adaptability in learning +solutions to retain knowledge. Our research addresses the challenge of +catastrophic forgetting, where models lose proficiency in previously learned +tasks as they acquire new ones. While numerous solutions have been proposed, +existing experimental setups often rely on idealized class-incremental learning +scenarios. We introduce Realistic Continual Learning (RealCL), a novel CL +paradigm where class distributions across tasks are random, departing from +structured setups. + We also present CLARE (Continual Learning Approach with pRE-trained models +for RealCL scenarios), a pre-trained model-based solution designed to integrate +new knowledge while preserving past learning. Our contributions include +pioneering RealCL as a generalization of traditional CL setups, proposing CLARE +as an adaptable approach for RealCL tasks, and conducting extensive experiments +demonstrating its effectiveness across various RealCL scenarios. Notably, CLARE +outperforms existing models on RealCL benchmarks, highlighting its versatility +and robustness in unpredictable learning environments. + +
+
+
+
+
+ + ☆ Applying Guidance in a Limited Interval Improves Sample and Distribution + Quality in Diffusion Models + + +
+ Guidance is a crucial technique for extracting the best performance out of +image-generating diffusion models. Traditionally, a constant guidance weight +has been applied throughout the sampling chain of an image. We show that +guidance is clearly harmful toward the beginning of the chain (high noise +levels), largely unnecessary toward the end (low noise levels), and only +beneficial in the middle. We thus restrict it to a specific range of noise +levels, improving both the inference speed and result quality. This limited +guidance interval improves the record FID in ImageNet-512 significantly, from +1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial +across different sampler parameters, network architectures, and datasets, +including the large-scale setting of Stable Diffusion XL. We thus suggest +exposing the guidance interval as a hyperparameter in all diffusion models that +use guidance. + +
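Restricting classifier-free guidance to an interval of noise levels, as proposed above, can be sketched as follows; the denoiser interface, guidance weight, and interval endpoints are placeholders rather than values from the paper.

```python
def guided_denoise(model, x, sigma, cond, uncond, w=3.0, sigma_lo=0.3, sigma_hi=5.0):
    """Classifier-free guidance applied only inside [sigma_lo, sigma_hi]: outside
    that noise-level interval the conditional prediction is used unmodified.
    `model(x, sigma, c)` is assumed to return a denoised estimate; `sigma` is a
    scalar noise level for the current sampling step."""
    d_cond = model(x, sigma, cond)
    if not (sigma_lo <= sigma <= sigma_hi):
        return d_cond                                  # no guidance at very high/low noise
    d_uncond = model(x, sigma, uncond)
    return d_uncond + w * (d_cond - d_uncond)          # standard CFG inside the interval
```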
+
+
+
+
+ + ☆ Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning CVPR'24 + + +
+ Zero-shot learning (ZSL) recognizes the unseen classes by conducting +visual-semantic interactions to transfer semantic knowledge from seen classes +to unseen ones, supported by semantic information (e.g., attributes). However, +existing ZSL methods simply extract visual features using a pre-trained network +backbone (i.e., CNN or ViT), which fails to learn matched visual-semantic +correspondences for representing semantic-related visual features, as it lacks +the guidance of semantic information, resulting in undesirable visual-semantic +interactions. To tackle this issue, we propose a progressive semantic-guided +vision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly +considers two properties in the whole network: i) discovering the +semantic-related visual representations explicitly, and ii) discarding the +semantic-unrelated visual information. Specifically, we first introduce +semantic-embedded token learning to improve the visual-semantic correspondences +via semantic enhancement and to discover the semantic-related visual tokens +explicitly with semantic-guided token attention. Then, we fuse visual tokens +with low semantic-visual correspondence to discard the semantic-unrelated +visual information for visual enhancement. These two operations are integrated +into various encoders to progressively learn semantic-related visual +representations for accurate visual-semantic interactions in ZSL. Extensive +experiments show that our ZSLViT achieves significant performance gains on +three popular benchmark datasets, i.e., CUB, SUN, and AWA2. + 
+
+ comment: Accepted to CVPR'24 +
+
+
+
+
+ + ☆ OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic + Segmentation of Underground Utilities + + +
+ Identifying and classifying underground utilities is an important task for +efficient and effective urban planning and infrastructure maintenance. We +present OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point +cloud dataset, designed to advance research and development in underground +utility surveying and mapping. OpenTrench3D covers a completely novel domain +for public 3D point cloud datasets and is unique in its focus, scope, and +cost-effective capturing method. The dataset consists of 310 point clouds +collected across 7 distinct areas. These include 5 water utility areas and 2 +district heating utility areas. The inclusion of different geographical areas +and main utilities (water and district heating utilities) makes OpenTrench3D +particularly valuable for inter-domain transfer learning experiments. We +provide benchmark results for the dataset using three state-of-the-art semantic +segmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are +conducted by training on data from water areas, fine-tuning on district heating +area 1 and evaluating on district heating area 2. The dataset is publicly +available. With OpenTrench3D, we seek to foster innovation and progress in the +field of 3D semantic segmentation in applications related to detection and +documentation of underground utilities as well as in transfer learning methods +in general. + +
+
+
+
+
+ + ☆ ViM-UNet: Vision Mamba for Biomedical Segmentation + + +
+ CNNs, most notably the UNet, are the default architecture for biomedical +segmentation. Transformer-based approaches, such as UNETR, have been proposed +to replace them, benefiting from a global field of view, but suffering from +larger runtimes and higher parameter counts. The recent Vision Mamba +architecture offers a compelling alternative to transformers, also providing a +global field of view, but at higher efficiency. Here, we introduce ViM-UNet, a +novel segmentation architecture based on it and compare it to UNet and UNETR +for two challenging microscopy instance segmentation tasks. We find that it +performs similarly or better than UNet, depending on the task, and outperforms +UNETR while being more efficient. Our code is open source and documented at +https://github.com/constantinpape/torch-em/blob/main/vimunet.md. + +
+
+
+
+
+ + ☆ Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents + Probability Estimator ICIP 2024 + + +
+ The widespread usage of point clouds (PC) for immersive visual applications +has resulted in the use of very heterogeneous receiving conditions and devices, +notably in terms of network, hardware, and display capabilities. In this +scenario, quality scalability, i.e., the ability to reconstruct a signal at +different qualities by progressively decoding a single bitstream, is a major +requirement that has yet to be conveniently addressed, notably in most +learning-based PC coding solutions. This paper proposes a quality scalability +scheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based +static point cloud geometry codecs, which uses a Quality-conditioned Latents +Probability Estimator (QuLPE) to decode a high-quality version of a PC +learning-based representation, based on an available lower quality base layer. +SQH is integrated in the future JPEG PC coding standard, allowing to create a +layered bitstream that can be used to progressively decode the PC geometry with +increasing quality and fidelity. Experimental results show that SQH offers the +quality scalability feature with very limited or no compression performance +penalty at all when compared with the corresponding non-scalable solution, thus +preserving the significant compression gains over other state-of-the-art PC +codecs. + +
+
+ comment: Submitted at ICIP 2024 +
+
+
+
+
+ + ☆ Flatness Improves Backbone Generalisation in Few-shot Classification + + +
+ Deployment of deep neural networks in real-world settings typically requires +adaptation to new tasks with few examples. Few-shot classification (FSC) +provides a solution to this problem by leveraging pre-trained backbones for +fast adaptation to new classes. Surprisingly, most efforts have only focused on +developing architectures for easing the adaptation to the target domain without +considering the importance of backbone training for good generalisation. We +show that flatness-aware backbone training with vanilla fine-tuning results in +a simpler yet competitive baseline compared to the state-of-the-art. Our +results indicate that for in- and cross-domain FSC, backbone training is +crucial to achieving good generalisation across different adaptation methods. +We advocate more care should be taken when training these models. + +
+
+
+
+
+ + ☆ Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement + through Brain-Inspired Skin Tracking + + +
+ Heart rate is an important physiological indicator of human health status. +Existing remote heart rate measurement methods typically involve facial +detection followed by signal extraction from the region of interest (ROI). +These SOTA methods have three serious problems: (a) inaccuracies or even +failures in detection caused by environmental influences or subject movement; +(b) failures for special patients such as infants and burn victims; (c) privacy +leakage issues resulting from collecting face video. To address these issues, +we regard remote heart rate measurement as the process of analyzing the +spatiotemporal characteristics of the optical flow signal in the video. We +apply chaos theory to computer vision tasks for the first time, thus designing +a brain-inspired framework. First, an artificial primary visual cortex model is +used to extract the skin regions in the videos, and the heart rate is then +calculated by time-frequency analysis over all skin pixels. We call the +resulting method Robust Skin Tracking for Heart Rate measurement (HR-RST). The +experimental results show that HR-RST overcomes the difficulty of environmental +influences and effectively tracks subject movement. Moreover, the method can be +extended to other body parts. Consequently, the method can be applied to +special patients and effectively protects individual privacy, offering an +innovative solution. + 
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Depth Estimation using Weighted-loss and Transfer Learning + + +
+ Depth estimation from 2D images is a common computer vision task that has +applications in many fields including autonomous vehicles, scene understanding +and robotics. The accuracy of a supervised depth estimation method mainly +relies on the chosen loss function, the model architecture, the quality of the +data and the performance metrics. In this study, we propose a simplified and +adaptable approach to improve depth estimation accuracy using transfer learning +and an optimized loss function. The optimized loss function is a combination of +weighted losses that enhance robustness and generalization: Mean Absolute +Error (MAE), Edge Loss and Structural Similarity Index (SSIM). We use a grid +search and a random search method to find optimized weights for the losses, +which leads to an improved model. We explore multiple encoder-decoder-based +models including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for +the supervised depth estimation model on NYU Depth Dataset v2. We observe that +the EfficientNet model, pre-trained on ImageNet for classification, used as an +encoder with a simple upsampling decoder, gives the best results in terms +of RMSE, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a +qualitative analysis which illustrates that our model produces depth maps that +closely resemble ground truth, even in cases where the ground truth is flawed. +The results indicate significant improvements in accuracy and robustness, with +EfficientNet being the most successful architecture. + 
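A weighted combination of MAE, edge, and SSIM terms of the kind described above could look like the sketch below. The SSIM here is a crude mean/variance version over 3x3 windows, and the default weights are placeholders; the weights found by the authors' grid/random search are not reproduced.

```python
import torch
import torch.nn.functional as F

def image_gradients(d):
    """Forward differences along y and x of a (B, 1, H, W) depth map."""
    dy = d[..., 1:, :] - d[..., :-1, :]
    dx = d[..., :, 1:] - d[..., :, :-1]
    return dx, dy

def depth_loss(pred, target, w_mae=1.0, w_edge=1.0, w_ssim=1.0):
    """Weighted sum of MAE, gradient (edge) loss, and a simplified SSIM term."""
    mae = F.l1_loss(pred, target)
    px, py = image_gradients(pred)
    tx, ty = image_gradients(target)
    edge = F.l1_loss(px, tx) + F.l1_loss(py, ty)
    mu_p, mu_t = F.avg_pool2d(pred, 3, 1, 1), F.avg_pool2d(target, 3, 1, 1)
    var_p = F.avg_pool2d(pred * pred, 3, 1, 1) - mu_p ** 2
    var_t = F.avg_pool2d(target * target, 3, 1, 1) - mu_t ** 2
    cov = F.avg_pool2d(pred * target, 3, 1, 1) - mu_p * mu_t
    c1, c2 = 0.01 ** 2, 0.03 ** 2
    ssim = ((2 * mu_p * mu_t + c1) * (2 * cov + c2) /
            ((mu_p ** 2 + mu_t ** 2 + c1) * (var_p + var_t + c2))).mean()
    return w_mae * mae + w_edge * edge + w_ssim * (1 - ssim) / 2
```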
+
+
+
+
+ + ☆ Run-time Monitoring of 3D Object Detection in Automated Driving Systems + Using Early Layer Neural Activation Patterns CVPR 2024 + + +
+ Monitoring the integrity of object detection for errors within the perception +module of automated driving systems (ADS) is paramount for ensuring safety. +Despite recent advancements in deep neural network (DNN)-based object +detectors, their susceptibility to detection errors, particularly in the +less-explored realm of 3D object detection, remains a significant concern. +State-of-the-art integrity monitoring (also known as introspection) mechanisms +in 2D object detection mainly utilise the activation patterns in the final +layer of the DNN-based detector's backbone. However, that may not sufficiently +address the complexities and sparsity of data in 3D object detection. To this +end, we conduct, in this article, an extensive investigation into the effects +of activation patterns extracted from various layers of the backbone network +for introspecting the operation of 3D object detectors. Through a comparative +analysis using Kitti and NuScenes datasets with PointPillars and CenterPoint +detectors, we demonstrate that using earlier layers' activation patterns +enhances the error detection performance of the integrity monitoring system, +yet increases computational complexity. To address the real-time operation +requirements in ADS, we also introduce a novel introspection method that +combines activation patterns from multiple layers of the detector's backbone +and report its performance. + +
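Collecting activation patterns from earlier backbone layers, as investigated above, can be done with standard PyTorch forward hooks. The layer names and the channel-wise pooling in this sketch are illustrative assumptions, not the paper's introspection design.

```python
import torch

def register_activation_taps(backbone, layer_names):
    """Attach forward hooks that record a compact summary (channel-wise mean) of
    selected backbone layers, for a downstream introspection head to consume.
    Assumes the named layers produce (B, C, H, W) feature maps."""
    store, handles = {}, []
    modules = dict(backbone.named_modules())
    for name in layer_names:
        def hook(_module, _inp, out, key=name):
            store[key] = out.detach().mean(dim=(-2, -1))   # (B, C) per layer
        handles.append(modules[name].register_forward_hook(hook))
    return store, handles

# Usage sketch (layer names are hypothetical):
# store, handles = register_activation_taps(detector.backbone, ["blocks.0", "blocks.1"])
# _ = detector(point_cloud_batch)                # forward pass fills `store`
# monitor_input = torch.cat(list(store.values()), dim=1)
```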
+
+ comment: Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains + (SAIAD) +
+
+
+
+
+ + ☆ Model-based Cleaning of the QUILT-1M Pathology Dataset for + Text-Conditional Image Synthesis + + +
+ The QUILT-1M dataset is the first openly available dataset containing images +harvested from various online sources. While it provides a huge data variety, +the image quality and composition is highly heterogeneous, impacting its +utility for text-conditional image synthesis. We propose an automatic pipeline +that provides predictions of the most common impurities within the images, +e.g., visibility of narrators, desktop environment and pathology software, or +text within the image. Additionally, we propose to use semantic alignment +filtering of the image-text pairs. Our findings demonstrate that by rigorously +filtering the dataset, there is a substantial enhancement of image fidelity in +text-to-image tasks. + +
+
+ comment: 4 pages (short paper) +
+
+
+
+
+ + ☆ Deep learning-driven pulmonary arteries and veins segmentation reveals + demography-associated pulmonary vasculature anatomy + + +
+ Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary +diseases and surgical planning, and is traditionally achieved by Computed +Tomography Pulmonary Angiography (CTPA). However, concerns regarding adverse +health effects from contrast agents used in CTPA have constrained its clinical +utility. In contrast, identifying arteries and veins using non-contrast CT, a +conventional and low-cost clinical examination routine, has long been +considered impossible. Here we propose a High-abundant Pulmonary Artery-vein +Segmentation (HiPaS) framework achieving accurate artery-vein segmentation on +both non-contrast CT and CTPA across various spatial resolutions. HiPaS first +performs spatial normalization on raw CT scans via a super-resolution module, +and then iteratively achieves segmentation results at different branch levels +by utilizing the low-level vessel segmentation as a prior for high-level vessel +segmentation. We trained and validated HiPaS on our established multi-centric +dataset comprising 1,073 CT volumes with meticulous manual annotation. Both +quantitative experiments and clinical evaluation demonstrated the superior +performance of HiPaS, achieving a dice score of 91.8% and a sensitivity of +98.0%. Further experiments demonstrated the non-inferiority of HiPaS +segmentation on non-contrast CT compared to segmentation on CTPA. Employing +HiPaS, we have conducted an anatomical study of pulmonary vasculature on 10,613 +participants in China (five sites), discovering a new association between +pulmonary vessel abundance and sex and age: vessel abundance is significantly +higher in females than in males, and slightly decreases with age, under the +controlling of lung volumes (p < 0.0001). HiPaS realizing accurate artery-vein +segmentation delineates a promising avenue for clinical diagnosis and +understanding pulmonary physiology in a non-invasive manner. + +
+
+
+
+
+ + ☆ Shape Completion in the Dark: Completing Vertebrae Morphology from 3D + Ultrasound + + +
+ Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free +nature, is challenging to interpret due to only partially visible organs and a +lack of complete 3D information. While performing US-based diagnosis or +investigation, medical professionals therefore create a mental map of the 3D +anatomy. In this work, we aim to replicate this process and enhance the visual +representation of anatomical structures. + Methods: We introduce a point-cloud-based probabilistic DL method to complete +occluded anatomical structures through 3D shape completion and choose US-based +spine examinations as our application. To enable training, we generate +synthetic 3D representations of partially occluded spinal views by mimicking US +physics and accounting for inherent artifacts. + Results: The proposed model performs consistently on synthetic and patient +data, with mean and median differences of 2.02 and 0.03 in CD, respectively. +Our ablation study demonstrates the importance of US physics-based data +generation, reflected in the large mean and median difference of 11.8 CD and +9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks, +such as the spinous process (with reconstruction CD of 4.73) and the facet +joints (mean distance to GT of 4.96mm) are preserved in the 3D completion. + Conclusion: Our work establishes the feasibility of 3D shape completion for +lumbar vertebrae, ensuring the preservation of level-wise characteristics and +successful generalization from synthetic to real data. The incorporation of US +physics contributes to more accurate patient data completions. Notably, our +method preserves essential anatomic landmarks and reconstructs crucial +injections sites at their correct locations. The generated data and source code +will be made publicly available +(https://github.com/miruna20/Shape-Completion-in-the-Dark). + +
+
+
+
+
+ + ☆ Dealing with Subject Similarity in Differential Morphing Attack + Detection + + +
+ The advent of morphing attacks has posed significant security concerns for +automated Face Recognition systems, raising the pressing need for robust and +effective Morphing Attack Detection (MAD) methods able to effectively address +this issue. In this paper, we focus on Differential MAD (D-MAD), where a +trusted live capture, usually representing the criminal, is compared with the +document image to classify it as morphed or bona fide. We show these approaches +based on identity features are effective when the morphed image and the live +one are sufficiently diverse; unfortunately, the effectiveness is significantly +reduced when the same approaches are applied to look-alike subjects or in all +those cases when the similarity between the two compared images is high (e.g. +comparison between the morphed image and the accomplice). Therefore, in this +paper, we propose ACIdA, a modular D-MAD system, consisting of a module for the +attempt type classification, and two modules for the identity and artifacts +analysis on input images. Successfully addressing this task would allow +broadening the D-MAD applications including, for instance, the document +enrollment stage, which currently relies entirely on human evaluation, thus +limiting the possibility of releasing ID documents with manipulated images, as +well as the automated gates to detect both accomplices and criminals. An +extensive cross-dataset experimental evaluation conducted on the introduced +scenario shows that ACIdA achieves state-of-the-art results, outperforming +literature competitors, while maintaining good performance in traditional D-MAD +benchmarks. + +
+
+
+
+
+ + ☆ Finding Dino: A plug-and-play framework for unsupervised detection of + out-of-distribution objects using prototypes + + +
+ Detecting and localising unknown or Out-of-distribution (OOD) objects in any +scene can be a challenging task in vision. Particularly, in safety-critical +cases involving autonomous systems like automated vehicles or trains. +Supervised anomaly segmentation or open-world object detection models depend on +training on exhaustively annotated datasets for every domain and still struggle +in distinguishing between background and OOD objects. In this work, we present +a plug-and-play generalised framework - PRototype-based zero-shot OOD detection +Without Labels (PROWL). It is an inference-based method that does not require +training on the domain dataset and relies on extracting relevant features from +self-supervised pre-trained models. PROWL can be easily adapted to detect OOD +objects in any operational design domain by specifying a list of known classes +from this domain. PROWL, as an unsupervised method, outperforms other +supervised methods trained without auxiliary OOD data on the RoadAnomaly and +RoadObstacle datasets provided in SegmentMeIfYouCan (SMIYC) benchmark. We also +demonstrate its suitability for other domains such as rail and maritime scenes. + +
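A bare-bones version of prototype-based zero-shot OOD scoring looks like the following; the upstream self-supervised feature extractor (e.g. a DINO-style backbone) and the final decision threshold are left as assumptions.

```python
import torch
import torch.nn.functional as F

def build_prototypes(features, labels, num_classes):
    """Class prototypes = mean of L2-normalised features per known class.
    Assumes every class index appears at least once in `labels`."""
    feats = F.normalize(features, dim=-1)
    return torch.stack([feats[labels == c].mean(0) for c in range(num_classes)])

def ood_score(pixel_features, prototypes):
    """Per-pixel OOD score: 1 minus the maximum cosine similarity to any
    known-class prototype; high scores mark pixels unlike every known class."""
    feats = F.normalize(pixel_features, dim=-1)     # (N, D)
    protos = F.normalize(prototypes, dim=-1)        # (K, D)
    sim = feats @ protos.t()                        # (N, K)
    return 1.0 - sim.max(dim=-1).values
```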
+
+
+
+
+ + ☆ Separated Attention: An Improved Cycle GAN Based Under Water Image + Enhancement Method + + +
+ In this paper we present an improved Cycle GAN based model for underwater +image enhancement. We utilize the cycle consistent learning technique of the +state-of-the-art Cycle GAN model with a modified loss function incorporating +depth-oriented attention, which enhances the contrast of the overall image +while keeping global content, color, local texture, and style information +intact. We trained the Cycle GAN model with the modified loss +functions on the benchmark Enhancing Underwater Visual Perception (EUPV) +dataset, a large dataset including paired and unpaired sets of underwater +images (of poor and good quality) taken with seven distinct cameras in a range +of visibility situations during research on ocean exploration and human-robot +cooperation. In addition, we perform qualitative and quantitative evaluations +which support the proposed technique and show that it provides better contrast +enhancement of underwater imagery. More significantly, the enhanced images give +better results than conventional models and are further useful for underwater +navigation, pose estimation, saliency prediction, object detection and +tracking. The results validate the suitability of the model for autonomous +underwater vehicles (AUVs) in visual navigation. + 
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in + Videos + + +
+ Skeleton Action Recognition (SAR) involves identifying human actions using
+skeletal joint coordinates and their interconnections. While plain
+Transformers have been attempted for this task, they still fall short of the
+current leading methods, which are rooted in Graph Convolutional Networks
+(GCNs), due to their lack of structural priors. Recently, a novel selective
+state space model, Mamba, has surfaced as a compelling alternative to the
+attention mechanism in Transformers, offering efficient modeling of long
+sequences. In this work, to the best of our knowledge, we present the first
+SAR framework incorporating Mamba. Each fundamental block of our model adopts
+a novel U-ShiftGCN architecture with Mamba as its core component. The encoder
+segment of the U-ShiftGCN is devised to extract spatial features from the
+skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial
+features then undergo intermediate temporal modeling facilitated by the Mamba
+block before progressing to the decoder section, which comprises vanilla
+upsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal
+modeling unit is employed before the exit of each fundamental block to refine
+temporal representations. This particular integration of downsampling spatial,
+intermediate temporal, upsampling spatial, and ultimate temporal subunits
+yields promising results for skeleton action recognition. We dub the resulting
+model \textbf{Simba}, which attains state-of-the-art performance across three
+well-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU
+RGB+D 120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without the
+intermediate Mamba block) by itself is capable of performing reasonably well
+and surpasses our baseline.
+
+
+ comment: 20 pages, 6 tables, 1 figure +
+
+
+
+
+ + ☆ Homography Guided Temporal Fusion for Road Line and Marking Segmentation ICCV 2023 + + +
+ Reliable segmentation of road lines and markings is critical to autonomous +driving. Our work is motivated by the observations that road lines and markings +are (1) frequently occluded in the presence of moving vehicles, shadow, and +glare and (2) highly structured with low intra-class shape variance and overall +high appearance consistency. To solve these issues, we propose a Homography +Guided Fusion (HomoFusion) module to exploit temporally-adjacent video frames +for complementary cues facilitating the correct classification of the partially +occluded road lines or markings. To reduce computational complexity, a novel +surface normal estimator is proposed to establish spatial correspondences +between the sampled frames, allowing the HomoFusion module to perform a +pixel-to-pixel attention mechanism in updating the representation of the +occluded road lines or markings. Experiments on ApolloScape, a large-scale lane +mark segmentation dataset, and ApolloScape Night with artificial simulated +night-time road conditions, demonstrate that our method outperforms other +existing SOTA lane mark segmentation models with less than 9\% of their +parameters and computational complexity. We show that exploiting available +camera intrinsic data and ground plane assumption for cross-frame +correspondence can lead to a light-weight network with significantly improved +performances in speed and accuracy. We also prove the versatility of our +HomoFusion approach by applying it to the problem of water puddle segmentation +and achieving SOTA performance. + +
+
+ comment: Accepted by ICCV 2023 +
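The temporal-fusion idea underlying the abstract above can be illustrated by warping a neighbouring frame into the current view with a homography and blending where the warp is valid. The homography is assumed to be given (in the paper it follows from camera intrinsics and a ground-plane assumption); this sketch is not the HomoFusion attention module itself.

```python
import cv2
import numpy as np

def fuse_with_previous_frame(curr: np.ndarray, prev: np.ndarray,
                             homography: np.ndarray) -> np.ndarray:
    """Warp `prev` into the current view and blend it with `curr`.

    curr, prev : HxWx3 uint8 frames.
    homography : 3x3 matrix mapping previous-frame pixels to the current frame,
                 e.g. derived from intrinsics and a ground-plane assumption.
    """
    h, w = curr.shape[:2]
    warped = cv2.warpPerspective(prev, homography, (w, h))
    # Pixels that fall outside the previous frame warp to black; keep the current frame there.
    valid = cv2.warpPerspective(np.ones((h, w), np.uint8) * 255, homography, (w, h)) > 0
    fused = curr.copy()
    fused[valid] = ((curr[valid].astype(np.float32) +
                     warped[valid].astype(np.float32)) / 2).astype(np.uint8)
    return fused
```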
+
+
+
+
+ + ☆ Multi-Image Visual Question Answering for Unsupervised Anomaly Detection + + +
+ Unsupervised anomaly detection enables the identification of potential +pathological areas by juxtaposing original images with their pseudo-healthy +reconstructions generated by models trained exclusively on normal images. +However, the clinical interpretation of resultant anomaly maps presents a +challenge due to a lack of detailed, understandable explanations. Recent +advancements in language models have shown the capability of mimicking +human-like understanding and providing detailed descriptions. This raises an +interesting question: \textit{How can language models be employed to make the +anomaly maps more explainable?} To the best of our knowledge, we are the first +to leverage a language model for unsupervised anomaly detection, for which we +construct a dataset with different questions and answers. Additionally, we +present a novel multi-image visual question answering framework tailored for +anomaly detection, incorporating diverse feature fusion strategies to enhance +visual knowledge extraction. Our experiments reveal that the framework, +augmented by our new Knowledge Q-Former module, adeptly answers questions on +the anomaly detection dataset. Besides, integrating anomaly maps as inputs +distinctly aids in improving the detection of unseen pathologies. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Diffusion Probabilistic Multi-cue Level Set for Reducing Edge + Uncertainty in Pancreas Segmentation + + +
+ Accurately segmenting the pancreas remains a huge challenge. Traditional +methods encounter difficulties in semantic localization due to the small volume +and distorted structure of the pancreas, while deep learning methods encounter +challenges in obtaining accurate edges because of low contrast and organ +overlapping. To overcome these issues, we propose a multi-cue level set method +based on the diffusion probabilistic model, namely Diff-mcs. Our method adopts +a coarse-to-fine segmentation strategy. We use the diffusion probabilistic +model in the coarse segmentation stage, with the obtained probability +distribution serving as both the initial localization and prior cues for the +level set method. In the fine segmentation stage, we combine the prior cues +with grayscale cues and texture cues to refine the edge by maximizing the +difference between probability distributions of the cues inside and outside the +level set curve. The method is validated on three public datasets and achieves +state-of-the-art performance, which can obtain more accurate segmentation +results with lower uncertainty segmentation edges. In addition, we conduct +ablation studies and uncertainty analysis to verify that the diffusion +probability model provides a more appropriate initialization for the level set +method. Furthermore, when combined with multiple cues, the level set method can +better obtain edges and improve the overall accuracy. Our code is available at +https://github.com/GOUYUEE/Diff-mcs. + +
+
+
+
+
+ + ☆ Do You Remember? Dense Video Captioning with Cross-Modal Memory + Retrieval CVPR 2024 + + +
+ There has been significant attention to the research on dense video +captioning, which aims to automatically localize and caption all events within +untrimmed video. Several studies introduce methods by designing dense video +captioning as a multitasking problem of event localization and event captioning +to consider inter-task relations. However, addressing both tasks using only +visual input is challenging due to the lack of semantic content. In this study, +we address this by proposing a novel framework inspired by the cognitive +information processing of humans. Our model utilizes external memory to +incorporate prior knowledge. The memory retrieval method is proposed with +cross-modal video-to-text matching. To effectively incorporate retrieved text +features, the versatile encoder and the decoder with visual and textual +cross-attention modules are designed. Comparative experiments have been +conducted to show the effectiveness of the proposed method on ActivityNet +Captions and YouCook2 datasets. Experimental results show promising performance +of our model without extensive pretraining from a large video dataset. + +
+
+ comment: CVPR 2024 +
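The cross-modal memory retrieval step amounts to nearest-neighbour search from a pooled video embedding into a bank of text embeddings. A minimal sketch, with embedding sizes and memory contents as placeholders:

```python
import torch
import torch.nn.functional as F

def retrieve_from_memory(video_query: torch.Tensor,
                         text_memory: torch.Tensor,
                         k: int = 5):
    """Return the top-k memory entries most similar to the video query.

    video_query : (D,) pooled embedding of the video segment.
    text_memory : (N, D) embeddings of the external text memory.
    """
    q = F.normalize(video_query, dim=-1)
    m = F.normalize(text_memory, dim=-1)
    scores = m @ q                      # (N,) cosine similarities
    topk = torch.topk(scores, k)
    return topk.indices, topk.values

# Toy usage: a 256-d query against 1000 memory entries.
idx, sim = retrieve_from_memory(torch.randn(256), torch.randn(1000, 256), k=5)
print(idx.tolist(), sim.tolist())
```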
+
+
+
+
+ + ☆ Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning + and Satellite Imagery + + +
+ Despite extensive research into ship detection via remote sensing, no
+studies identify ship-to-ship transfers in satellite imagery. Given the
+importance of transshipment in illicit shipping practices, this is a
+significant gap. In what follows, I train a convolutional neural network to
+accurately detect four different types of cargo vessel and two different types
+of ship-to-ship transfer in PlanetScope satellite imagery. I then elaborate a
+pipeline for the automatic detection of suspected illicit ship-to-ship
+transfers by cross-referencing satellite detections with vessel-borne GPS
+data. Finally, I apply this method to the Kerch Strait between Ukraine and
+Russia to identify over 400 dark transshipment events since 2022.
+
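The cross-referencing step can be illustrated as a simple space-time join: a satellite detection is treated as "dark" if no vessel-borne GPS/AIS report falls within a distance and time window. The 1 km radius and 2-hour window below are illustrative values, not those used in the paper.

```python
from math import radians, sin, cos, asin, sqrt
from datetime import timedelta

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points in kilometres."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371.0 * asin(sqrt(a))

def is_dark_detection(detection, gps_reports, max_km=1.0, max_dt=timedelta(hours=2)):
    """True if no GPS report is close to the detection in both space and time.

    detection   : dict with 'lat', 'lon', 'time' (datetime) from satellite imagery.
    gps_reports : iterable of dicts with the same keys from vessel-borne GPS.
    """
    for report in gps_reports:
        close_in_time = abs(report["time"] - detection["time"]) <= max_dt
        close_in_space = haversine_km(detection["lat"], detection["lon"],
                                      report["lat"], report["lon"]) <= max_km
        if close_in_time and close_in_space:
            return False
    return True
```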
+
+
+
+
+ + ☆ Contrastive-Based Deep Embeddings for Label Noise-Resilient + Histopathology Image Classification + + +
+ Recent advancements in deep learning have proven highly effective in medical +image classification, notably within histopathology. However, noisy labels +represent a critical challenge in histopathology image classification, where +accurate annotations are vital for training robust deep learning models. +Indeed, deep neural networks can easily overfit label noise, leading to severe +degradations in model performance. While numerous public pathology foundation +models have emerged recently, none have evaluated their resilience to label +noise. Through thorough empirical analyses across multiple datasets, we exhibit +the label noise resilience property of embeddings extracted from foundation +models trained in a self-supervised contrastive manner. We demonstrate that +training with such embeddings substantially enhances label noise robustness +when compared to non-contrastive-based ones as well as commonly used +noise-resilient methods. Our results unequivocally underline the superiority of +contrastive learning in effectively mitigating the label noise challenge. Code +is publicly available at +https://github.com/LucasDedieu/NoiseResilientHistopathology. + +
+
+ comment: 16 pages +
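The evaluation protocol implied above, fitting a lightweight classifier on frozen embeddings while the training labels are noisy and the test labels are clean, can be sketched with synthetic features standing in for the pathology foundation-model embeddings:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)

# Stand-in for frozen contrastive embeddings of patches from two tissue classes.
X = np.vstack([rng.normal(0.0, 1.0, (500, 128)), rng.normal(1.5, 1.0, (500, 128))])
y = np.array([0] * 500 + [1] * 500)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# Inject symmetric label noise into the training split only.
noise_rate = 0.3
flip = rng.random(len(y_tr)) < noise_rate
y_noisy = np.where(flip, 1 - y_tr, y_tr)

# A linear probe on the frozen embeddings; accuracy is measured on clean test labels.
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_noisy)
print("test accuracy under 30% label noise:", clf.score(X_te, y_te))
```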
+
+
+
+
+ + ☆ GLID: Pre-training a Generalist Encoder-Decoder Vision Model CVPR 2024 + + +
+ This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method
+for better handling various downstream computer vision tasks. While
+self-supervised pre-training approaches, e.g., Masked Autoencoder, have shown
+success in transfer learning, task-specific sub-architectures still need to be
+appended for different downstream tasks, which cannot enjoy the benefits of
+large-scale pre-training. GLID overcomes this challenge by allowing the
+pre-trained generalist encoder-decoder to be fine-tuned on various vision
+tasks with minimal task-specific architecture modifications. In the GLID
+training scheme, the pre-training pretext task and the downstream tasks are
+all modeled as "query-to-answer" problems. We pre-train a task-agnostic
+encoder-decoder with query-mask pairs. During fine-tuning, GLID maintains the
+pre-trained encoder-decoder and queries, only replacing the topmost linear
+transformation layer with task-specific linear heads. This minimizes the
+pretrain-finetune architecture inconsistency and enables the pre-trained model
+to better adapt to downstream tasks. GLID achieves competitive performance on
+various vision tasks, including object detection, image segmentation, pose
+estimation, and depth estimation, outperforming or matching specialist models
+such as Mask2Former, DETR, ViTPose, and BinsFormer.
+
+
+ comment: CVPR 2024 +
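The fine-tuning recipe, keeping the pre-trained encoder-decoder and queries and swapping only the topmost linear layer per task, can be sketched generically in PyTorch. The toy module below is not GLID's architecture; it only illustrates the head-replacement step.

```python
import torch
import torch.nn as nn

class QueryDecoderModel(nn.Module):
    """Toy stand-in for a pre-trained generalist encoder-decoder with learned queries."""
    def __init__(self, dim=256, num_queries=100, out_dim=256):
        super().__init__()
        layer = nn.TransformerEncoderLayer(dim, 8, batch_first=True)
        self.backbone = nn.TransformerEncoder(layer, 2)
        self.queries = nn.Parameter(torch.randn(num_queries, dim))
        self.head = nn.Linear(dim, out_dim)        # only this layer is task-specific

    def forward(self, tokens):                     # tokens: (B, N, dim)
        feats = self.backbone(tokens)
        q = self.queries.unsqueeze(0).expand(tokens.size(0), -1, -1)
        attn = torch.softmax(q @ feats.transpose(1, 2) / feats.size(-1) ** 0.5, dim=-1)
        return self.head(attn @ feats)             # (B, num_queries, out_dim)

def adapt_to_task(model: QueryDecoderModel, task_out_dim: int) -> QueryDecoderModel:
    """Keep the pre-trained encoder-decoder and queries; replace only the top linear head."""
    model.head = nn.Linear(model.head.in_features, task_out_dim)
    return model

model = QueryDecoderModel()
model = adapt_to_task(model, task_out_dim=80)      # e.g. 80 detection classes
print(model(torch.randn(2, 196, 256)).shape)       # torch.Size([2, 100, 80])
```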
+
+
+
+
+ + ☆ Attention based End to end network for Offline Writer Identification on + Word level data + + +
+ Writer identification due to its widespread application in various fields has +gained popularity over the years. In scenarios where optimum handwriting +samples are available, whether they be in the form of a single line, a +sentence, or an entire page, writer identification algorithms have demonstrated +noteworthy levels of accuracy. However, in scenarios where only a limited +number of handwritten samples are available, particularly in the form of word +images, there is a significant scope for improvement. + In this paper, we propose a writer identification system based on an +attention-driven Convolutional Neural Network (CNN). The system is trained +utilizing image segments, known as fragments, extracted from word images, +employing a pyramid-based strategy. This methodology enables the system to +capture a comprehensive representation of the data, encompassing both +fine-grained details and coarse features across various levels of abstraction. +These extracted fragments serve as the training data for the convolutional +network, enabling it to learn a more robust representation compared to +traditional convolution-based networks trained on word images. Additionally, +the paper explores the integration of an attention mechanism to enhance the +representational power of the learned features. The efficacy of the proposed +algorithm is evaluated on three benchmark databases, demonstrating its +proficiency in writer identification tasks, particularly in scenarios with +limited access to handwriting data. + +
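The pyramid-based fragment extraction that produces the training data can be sketched as cropping fixed-size patches from a word image at several scales, so the network sees both coarse strokes and fine detail. Patch size, stride, and scales below are illustrative choices, not the paper's exact settings.

```python
import cv2
import numpy as np

def extract_fragments(word_image, scales=(1.0, 0.75, 0.5), patch=64, stride=32):
    """Return square fragments cropped from a grayscale word image at multiple scales."""
    fragments = []
    for s in scales:
        img = cv2.resize(word_image, None, fx=s, fy=s, interpolation=cv2.INTER_AREA)
        H, W = img.shape[:2]
        for y in range(0, max(1, H - patch + 1), stride):
            for x in range(0, max(1, W - patch + 1), stride):
                frag = img[y:y + patch, x:x + patch]
                if frag.shape[:2] == (patch, patch):   # keep only full-size crops
                    fragments.append(frag)
    return fragments

frags = extract_fragments(np.random.randint(0, 255, (96, 320), dtype=np.uint8))
print(len(frags), "fragments of size 64x64")
```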
+
+
+
+
+ + ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown powerful ability on conditional
+image synthesis. With large-scale vision-language pre-training, diffusion
+models are able to generate high-quality images with rich texture and
+reasonable structure under different text prompts. However, adapting a
+pre-trained diffusion model for visual perception remains an open problem. In
+this paper, we propose an implicit and explicit language guidance framework
+for diffusion-based perception, named IEDP. Our IEDP comprises an implicit
+language guidance branch and an explicit language guidance branch. The
+implicit branch employs a frozen CLIP image encoder to directly generate
+implicit text embeddings that are fed to the diffusion model, without using
+explicit text prompts. The explicit branch utilizes the ground-truth labels of
+corresponding images as text prompts to condition feature extraction of the
+diffusion model. During training, we jointly train the diffusion model by
+sharing the model weights of these two branches. As a result, the implicit and
+explicit branches can jointly guide feature learning. During inference, we
+only employ the implicit branch for final prediction, which does not require
+any ground-truth labels. Experiments are performed on two typical perception
+tasks, including semantic segmentation and depth estimation. Our IEDP achieves
+promising performance on both tasks. For semantic segmentation, our IEDP
+achieves an mIoU score of 55.9% on the ADE20K validation set, outperforming
+the baseline method VPD by 2.2%. For depth estimation, our IEDP outperforms
+the baseline method VPD with a relative gain of 10.2%.
+
+
+
+
+
+ + ☆ Weakly-Supervised Learning via Multi-Lateral Decoder Branching for + Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization + + +
+ Although robot-assisted cardiovascular catheterization is commonly performed
+for intervention of cardiovascular diseases, more studies are needed to
+support the procedure with automated tool segmentation. This can aid surgeons
+in tool tracking and visualization during intervention. Learning-based
+segmentation has recently offered state-of-the-art performance; however,
+generating ground-truth signals for fully-supervised methods is
+labor-intensive and time-consuming for the interventionists. In this study, a
+weakly-supervised learning method with multi-lateral pseudo labeling is
+proposed for tool segmentation in cardiac angiograms. The method includes a
+modified U-Net model with one encoder and multiple lateral-branched decoders
+that produce pseudo labels as supervision signals under different
+perturbations. The pseudo labels are self-generated through a mixed loss
+function and shared consistency in the decoders. We trained the model
+end-to-end with weakly-annotated data obtained during robotic cardiac
+catheterization. Experiments with the proposed model show that weakly
+annotated data achieves performance close to that of fully annotated data.
+Compared to three existing weakly-supervised methods, our approach yielded
+higher segmentation performance across three different cardiac angiogram
+datasets. An ablation study showed consistent performance under different
+parameters. Thus, we offer a less expensive method for real-time tool
+segmentation and tracking during robot-assisted cardiac catheterization.
+
+
+
+
+
+ + ☆ Multi-rater Prompting for Ambiguous Medical Image Segmentation + + +
+ Multi-rater annotations commonly occur when medical images are independently
+annotated by multiple experts (raters). In this paper, we tackle two
+challenges that arise in multi-rater annotations for medical image
+segmentation (called ambiguous medical image segmentation): (1) how to train a
+deep learning model when a group of raters produces a set of diverse but
+plausible annotations, and (2) how to fine-tune the model efficiently when
+computation resources are not available for re-training the entire model on a
+different dataset domain. We propose a multi-rater prompt-based approach to
+address these two challenges altogether. Specifically, we introduce a series
+of rater-aware prompts that can be plugged into the U-Net model for
+uncertainty estimation to handle multi-annotation cases. During the
+prompt-based fine-tuning process, only 0.3% of the learnable parameters need
+to be updated compared to training the entire model. Further, in order to
+integrate expert consensus and disagreement, we explore different multi-rater
+incorporation strategies and design a mix-training strategy for comprehensive
+insight learning. Extensive experiments verify the effectiveness of our new
+approach for ambiguous medical image segmentation on two public datasets while
+alleviating the heavy burden of model re-training.
+
+
+
+
+
+ + ☆ ObjBlur: A Curriculum Learning Approach With Progressive Object-Level + Blurring for Improved Layout-to-Image Generation + + +
+ We present ObjBlur, a novel curriculum learning approach to improve +layout-to-image generation models, where the task is to produce realistic +images from layouts composed of boxes and labels. Our method is based on +progressive object-level blurring, which effectively stabilizes training and +enhances the quality of generated images. This curriculum learning strategy +systematically applies varying degrees of blurring to individual objects or the +background during training, starting from strong blurring to progressively +cleaner images. Our findings reveal that this approach yields significant +performance improvements, stabilized training, smoother convergence, and +reduced variance between multiple runs. Moreover, our technique demonstrates +its versatility by being compatible with generative adversarial networks and +diffusion models, underlining its applicability across various generative +modeling paradigms. With ObjBlur, we reach new state-of-the-art results on the +complex COCO and Visual Genome datasets. + +
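A minimal sketch of the curriculum: blur the pixels inside each object box with a Gaussian whose strength decays as training progresses, so the generator first sees heavily blurred objects and later nearly clean ones. The linear decay schedule and sigma range are illustrative assumptions; the paper also considers blurring the background instead of the objects.

```python
import cv2
import numpy as np

def objblur_step(image, boxes, progress, max_sigma=15.0):
    """Blur object regions with a strength that decays linearly over training.

    image    : HxWx3 uint8 image.
    boxes    : list of (x1, y1, x2, y2) object boxes from the layout.
    progress : float in [0, 1], fraction of training completed.
    """
    sigma = max_sigma * (1.0 - progress)           # strong blur early, clean images late
    if sigma <= 0.1:
        return image.copy()
    blurred = cv2.GaussianBlur(image, (0, 0), sigma)
    out = image.copy()
    for x1, y1, x2, y2 in boxes:
        out[y1:y2, x1:x2] = blurred[y1:y2, x1:x2]  # replace only object pixels
    return out

img = np.random.randint(0, 255, (128, 128, 3), dtype=np.uint8)
early = objblur_step(img, [(10, 10, 60, 60)], progress=0.1)   # heavily blurred object
late = objblur_step(img, [(10, 10, 60, 60)], progress=0.95)   # nearly clean
```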
+
+
+
+
+ + ☆ Attention-Aware Laparoscopic Image Desmoking Network with Lightness + Embedding and Hybrid Guided Embedding + + +
+ This paper presents a novel method for smoke removal from laparoscopic
+images. Due to the heterogeneous nature of surgical smoke, a two-stage network
+is proposed to estimate the smoke distribution and reconstruct a clear,
+smoke-free surgical scene. The utilization of the lightness channel plays a
+pivotal role in providing vital information pertaining to smoke density. The
+reconstruction of the smoke-free image is guided by a hybrid embedding, which
+combines the estimated smoke mask with the initial image. Experimental results
+demonstrate that the proposed method achieves a Peak Signal to Noise Ratio
+that is $2.79\%$ higher than the state-of-the-art methods, while also
+exhibiting a remarkable $38.2\%$ reduction in run-time. Overall, the proposed
+method offers comparable or even superior performance in terms of both smoke
+removal quality and computational efficiency when compared to existing
+state-of-the-art methods. This work will be publicly available at
+http://homepage.hit.edu.cn/wpgao
+
+
+ comment: ISBI2024 +
+
+
+
+
+ + ☆ CAT: Contrastive Adapter Training for Personalized Image Generation CVPR + + +
+ The emergence of various adapters, including Low-Rank Adaptation (LoRA)
+applied from the field of natural language processing, has allowed diffusion
+models to personalize image generation at a low cost. However, due to various
+challenges including limited datasets and a shortage of regularization and
+computation resources, adapter training often results in unsatisfactory
+outcomes, leading to the corruption of the backbone model's prior knowledge.
+One well-known phenomenon is the loss of diversity in object generation,
+especially within the same class, which leads to generating almost identical
+objects with minor variations. This poses challenges for generation
+capabilities. To solve this issue, we present Contrastive Adapter Training
+(CAT), a simple yet effective strategy to enhance adapter training through the
+application of a CAT loss. Our approach facilitates the preservation of the
+base model's original knowledge when the model initiates adapters.
+Furthermore, we introduce the Knowledge Preservation Score (KPS) to evaluate
+CAT's ability to retain the prior information. We qualitatively and
+quantitatively demonstrate CAT's improvements. Finally, we discuss the
+potential of CAT in the context of multi-concept adapters and optimization.
+
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ SFSORT: Scene Features-based Simple Online Real-Time Tracker + + +
+ This paper introduces SFSORT, the world's fastest multi-object tracking +system based on experiments conducted on MOT Challenge datasets. To achieve an +accurate and computationally efficient tracker, this paper employs a +tracking-by-detection method, following the online real-time tracking approach +established in prior literature. By introducing a novel cost function called +the Bounding Box Similarity Index, this work eliminates the Kalman Filter, +leading to reduced computational requirements. Additionally, this paper +demonstrates the impact of scene features on enhancing object-track association +and improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the +proposed method achieves an HOTA of 61.7\% with a processing speed of 2242 Hz +on the MOT17 dataset and an HOTA of 60.9\% with a processing speed of 304 Hz on +the MOT20 dataset. The tracker's source code, fine-tuned object detection +model, and tutorials are available at +\url{https://github.com/gitmehrdad/SFSORT}. + +
+
+
+
+
+ + ☆ Event-Enhanced Snapshot Compressive Videography at 10K FPS + + +
+ Video snapshot compressive imaging (SCI) encodes the target dynamic scene +compactly into a snapshot and reconstructs its high-speed frame sequence +afterward, greatly reducing the required data footprint and transmission +bandwidth as well as enabling high-speed imaging with a low frame rate +intensity camera. In implementation, high-speed dynamics are encoded via +temporally varying patterns, and only frames at corresponding temporal +intervals can be reconstructed, while the dynamics occurring between +consecutive frames are lost. To unlock the potential of conventional snapshot +compressive videography, we propose a novel hybrid "intensity+event" imaging +scheme by incorporating an event camera into a video SCI setup. Our proposed +system consists of a dual-path optical setup to record the coded intensity +measurement and intermediate event signals simultaneously, which is compact and +photon-efficient by collecting the half photons discarded in conventional video +SCI. Correspondingly, we developed a dual-branch Transformer utilizing the +reciprocal relationship between two data modes to decode dense video frames. +Extensive experiments on both simulated and real-captured data demonstrate our +superiority to state-of-the-art video SCI and video frame interpolation (VFI) +methods. Benefiting from the new hybrid design leveraging both intrinsic +redundancy in videos and the unique feature of event cameras, we achieve +high-quality videography at 0.1ms time intervals with a low-cost CMOS image +sensor working at 24 FPS. + +
+
+
+
+
+ + ☆ Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned + Disparity-Depth Conversion ICRA 2024 + + +
+ Accurate and dense depth estimation with stereo cameras and LiDAR is an +important task for automatic driving and robotic perception. While sparse hints +from LiDAR points have improved cost aggregation in stereo matching, their +effectiveness is limited by the low density and non-uniform distribution. To +address this issue, we propose a novel stereo-LiDAR depth estimation network +with Semi-Dense hint Guidance, named SDG-Depth. Our network includes a +deformable propagation module for generating a semi-dense hint map and a +confidence map by propagating sparse hints using a learned deformable window. +These maps then guide cost aggregation in stereo matching. To reduce the +triangulation error in depth recovery from disparity, especially in distant +regions, we introduce a disparity-depth conversion module. Our method is both +accurate and efficient. The experimental results on benchmark tests show its +superior performance. Our code is available at +https://github.com/SJTU-ViSYS/SDG-Depth. + +
+
+ comment: Accepted in ICRA 2024. 8 pages, 6 figures +
+
+
+
+
+ + ☆ Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening CVPR 2024 + + +
+ Currently, machine learning-based methods for remote sensing pansharpening +have progressed rapidly. However, existing pansharpening methods often do not +fully exploit differentiating regional information in non-local spaces, thereby +limiting the effectiveness of the methods and resulting in redundant learning +parameters. In this paper, we introduce a so-called content-adaptive non-local +convolution (CANConv), a novel method tailored for remote sensing image +pansharpening. Specifically, CANConv employs adaptive convolution, ensuring +spatial adaptability, and incorporates non-local self-similarity through the +similarity relationship partition (SRP) and the partition-wise adaptive +convolution (PWAC) sub-modules. Furthermore, we also propose a corresponding +network architecture, called CANNet, which mainly utilizes the multi-scale +self-similarity. Extensive experiments demonstrate the superior performance of +CANConv, compared with recent promising fusion methods. Besides, we +substantiate the method's effectiveness through visualization, ablation +experiments, and comparison with existing methods on multiple test sets. The +source code is publicly available at https://github.com/duanyll/CANConv. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks +in the fields of computer vision and image processing. In practical +applications, images are generally accompanied by various text descriptions, +however, few studies have explored the influence of text descriptions on visual +attention, let alone developed visual saliency prediction models considering +text guidance. In this paper, we conduct a comprehensive study on text-guided +image saliency (TIS) from both subjective and objective perspectives. +Specifically, we construct a TIS database named SJTU-TIS, which includes 1200 +text-image pairs and the corresponding collected eye-tracking data. Based on +the established SJTU-TIS database, we analyze the influence of various text +descriptions on visual attention. Then, to facilitate the development of +saliency prediction models considering text influence, we construct a benchmark +for the established SJTU-TIS database using state-of-the-art saliency models. +Finally, considering the effect of text descriptions on visual attention, while +most existing saliency models ignore this impact, we further propose a +text-guided saliency (TGSal) prediction model, which extracts and integrates +both image features and text features to predict the image saliency under +various text-description conditions. Our proposed model significantly +outperforms the state-of-the-art saliency models on both the SJTU-TIS database +and the pure image saliency databases in terms of various evaluation metrics. +The SJTU-TIS database and the code of the proposed TGSal model will be released +at: https://github.com/IntMeGroup/TGSal. + +
+
+
+
+
+ + ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models +such as CLIP has spurred their widespread adoption in addressing numerous +downstream tasks. Previous methods have employed test-time prompt tuning to +adapt the model to unseen domains, but they overlooked the issue of imbalanced +class distributions. In this study, we explicitly address this problem by +employing class-aware prototype alignment weighted by mean class probabilities +obtained for the test sample and filtered augmented views. Additionally, we +ensure that the class probabilities are as accurate as possible by performing +prototype discrimination using contrastive learning. The combination of +alignment and discriminative loss serves as a geometric regularizer, preventing +the prompt representation from collapsing onto a single class and effectively +bridging the distribution gap between the source and test domains. Our method, +named PromptSync, synchronizes the prompts for each test sample on both the +text and vision branches of the V-L model. In empirical evaluations on the +domain generalization benchmark, our method outperforms previous best methods +by 2.33\% in overall performance, by 1\% in base-to-novel generalization, and +by 2.84\% in cross-dataset transfer tasks. + +
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ☆ Remembering Transformer for Continual Learning + + +
+ Neural networks encounter the challenge of Catastrophic Forgetting (CF) in +continual learning, where new task knowledge interferes with previously learned +knowledge. We propose Remembering Transformer, inspired by the brain's +Complementary Learning Systems (CLS), to tackle this issue. Remembering +Transformer employs a mixture-of-adapters and a generative model-based routing +mechanism to alleviate CF by dynamically routing task data to relevant +adapters. Our approach demonstrated a new SOTA performance in various vision +continual learning tasks and great parameter efficiency. + +
+
+
+
+
+ + ☆ Generalization Gap in Data Augmentation: Insights from Illumination + + +
+ In the field of computer vision, data augmentation is widely used to enrich +the feature complexity of training datasets with deep learning techniques. +However, regarding the generalization capabilities of models, the difference in +artificial features generated by data augmentation and natural visual features +has not been fully revealed. This study focuses on the visual representation +variable 'illumination', by simulating its distribution degradation and +examining how data augmentation techniques enhance model performance on a +classification task. Our goal is to investigate the differences in +generalization between models trained with augmented data and those trained +under real-world illumination conditions. Results indicate that after +undergoing various data augmentation methods, model performance has been +significantly improved. Yet, a noticeable generalization gap still exists after +utilizing various data augmentation methods, emphasizing the critical role of +feature diversity in the training set for enhancing model generalization. + +
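The kind of artificial illumination augmentation being compared against real lighting variation can be expressed with standard photometric jitter; the brightness/contrast ranges below are illustrative, not the paper's settings.

```python
import numpy as np
from PIL import Image
from torchvision import transforms

# Photometric augmentation standing in for illumination variation at train time.
illumination_aug = transforms.Compose([
    transforms.ColorJitter(brightness=0.6, contrast=0.4),  # simulated lighting changes
    transforms.ToTensor(),
])

img = Image.fromarray(np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8))
augmented = illumination_aug(img)   # tensor in [0, 1] with jittered brightness/contrast
print(augmented.shape)              # torch.Size([3, 64, 64])
```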
+
+
+
+
+ + ☆ Learning to Classify New Foods Incrementally Via Compressed Exemplars + + +
+ Food image classification systems play a crucial role in health monitoring +and diet tracking through image-based dietary assessment techniques. However, +existing food recognition systems rely on static datasets characterized by a +pre-defined fixed number of food classes. This contrasts drastically with the +reality of food consumption, which features constantly changing data. +Therefore, food image classification systems should adapt to and manage data +that continuously evolves. This is where continual learning plays an important +role. A challenge in continual learning is catastrophic forgetting, where ML +models tend to discard old knowledge upon learning new information. While +memory-replay algorithms have shown promise in mitigating this problem by +storing old data as exemplars, they are hampered by the limited capacity of +memory buffers, leading to an imbalance between new and previously learned +data. To address this, our work explores the use of neural image compression to +extend buffer size and enhance data diversity. We introduced the concept of +continuously learning a neural compression model to adaptively improve the +quality of compressed data and optimize the bitrates per pixel (bpp) to store +more exemplars. Our extensive experiments, including evaluations on +food-specific datasets including Food-101 and VFN-74, as well as the general +dataset ImageNet-100, demonstrate improvements in classification accuracy. This +progress is pivotal in advancing more realistic food recognition systems that +are capable of adapting to continually evolving data. Moreover, the principles +and methodologies we've developed hold promise for broader applications, +extending their benefits to other domains of continual machine learning +systems. + +
+
+
+
+
+ + ☆ Mitigating Object Dependencies: Improving Point Cloud Self-Supervised + Learning through Object Exchange + + +
+ In the realm of point cloud scene understanding, particularly in indoor +scenes, objects are arranged following human habits, resulting in objects of +certain semantics being closely positioned and displaying notable inter-object +correlations. This can create a tendency for neural networks to exploit these +strong dependencies, bypassing the individual object patterns. To address this +challenge, we introduce a novel self-supervised learning (SSL) strategy. Our +approach leverages both object patterns and contextual cues to produce robust +features. It begins with the formulation of an object-exchanging strategy, +where pairs of objects with comparable sizes are exchanged across different +scenes, effectively disentangling the strong contextual dependencies. +Subsequently, we introduce a context-aware feature learning strategy, which +encodes object patterns without relying on their specific context by +aggregating object features across various scenes. Our extensive experiments +demonstrate the superiority of our method over existing SSL techniques, further +showing its better robustness to environmental changes. Moreover, we showcase +the applicability of our approach by transferring pre-trained models to diverse +point cloud datasets. + +
+
+
+
+
+ + ☆ PillarTrack: Redesigning Pillar-based Transformer Network for Single + Object Tracking on Point Clouds + + +
+ LiDAR-based 3D single object tracking (3D SOT) is a critical issue in
+robotics and autonomous driving. It aims to obtain an accurate 3D BBox from
+the search area based on similarity or motion. However, existing 3D SOT
+methods usually follow the point-based pipeline, where the sampling operation
+inevitably leads to redundant or lost information, resulting in suboptimal
+performance. To address these issues, we propose PillarTrack, a pillar-based
+3D single object tracking framework. Firstly, we transform sparse point clouds
+into dense pillars to preserve the local and global geometry. Secondly, we
+introduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to
+help the feature representation of each pillar. Thirdly, we present an
+efficient Transformer-based backbone from the perspective of modality
+differences. Finally, we construct our PillarTrack tracker based on the above
+designs. Extensive experiments on the KITTI and nuScenes datasets demonstrate
+the superiority of our proposed method. Notably, our method achieves
+state-of-the-art performance on the KITTI and nuScenes datasets and enables
+real-time tracking speed. We hope our work will encourage the community to
+rethink existing 3D SOT tracker designs. We will open source our code to the
+research community at https://github.com/StiphyJay/PillarTrack.
+
+
+
+
+
+ + ☆ Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton + Action Recognition + + +
+ Skeleton-based zero-shot action recognition aims to recognize unknown human +actions based on the learned priors of the known skeleton-based actions and a +semantic descriptor space shared by both known and unknown categories. However, +previous works focus on establishing the bridges between the known skeleton +representation space and semantic descriptions space at the coarse-grained +level for recognizing unknown action categories, ignoring the fine-grained +alignment of these two spaces, resulting in suboptimal performance in +distinguishing high-similarity action categories. To address these challenges, +we propose a novel method via Side information and dual-prompts learning for +skeleton-based zero-shot action recognition (STAR) at the fine-grained level. +Specifically, 1) we decompose the skeleton into several parts based on its +topology structure and introduce the side information concerning multi-part +descriptions of human body movements for alignment between the skeleton and the +semantic space at the fine-grained level; 2) we design the visual-attribute and +semantic-part prompts to improve the intra-class compactness within the +skeleton space and inter-class separability within the semantic space, +respectively, to distinguish the high-similarity actions. Extensive experiments +show that our method achieves state-of-the-art performance in ZSL and GZSL +settings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images CVPR 2024 + + +
+ Novel view synthesis aims to generate new view images of a given view image
+collection. Recent attempts address this problem relying on 3D geometry priors
+(e.g., shapes, sizes, and positions) learned from multi-view images. However,
+such methods encounter the following limitations: 1) they require a set of
+multi-view images as training data for a specific scene (e.g., face, car or
+chair), which is often unavailable in many real-world scenarios; 2) they fail
+to extract the geometry priors from single-view images due to the lack of
+multi-view supervision. In this paper, we propose a Geometry-enhanced NeRF
+(G-NeRF), which seeks to enhance the geometry priors by a geometry-guided
+multi-view synthesis approach, followed by depth-aware training. In the
+synthesis process, inspired by the fact that existing 3D GAN models can
+unconditionally synthesize high-fidelity multi-view images, we adopt
+off-the-shelf 3D GAN models, such as EG3D, as a free source to provide
+geometry priors through synthesizing multi-view data. Simultaneously, to
+further improve the geometry quality of the synthetic data, we introduce a
+truncation method to effectively sample latent codes within 3D GAN models. To
+tackle the absence of multi-view supervision for single-view images, we design
+a depth-aware training approach, incorporating a depth-aware discriminator to
+guide geometry priors through depth maps. Experiments demonstrate the
+effectiveness of our method in terms of both qualitative and quantitative
+results.
+
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ☆ LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image + Segmentation + + +
+ In this study, the performance of existing U-shaped neural network
+architectures was enhanced for medical image segmentation by adding
+Transformer modules. Although Transformer architectures are powerful at
+extracting global information, their ability to capture local information is
+limited due to their high complexity. To address this challenge, we proposed a
+new lightweight U-shaped cascade fusion network (LUCF-Net) for medical image
+segmentation. It utilized an asymmetrical structural design and incorporated
+both local and global modules to enhance its capacity for local and global
+modeling. Additionally, a multi-layer cascade fusion decoding network was
+designed to further bolster the network's information fusion capabilities.
+Validation results achieved on multi-organ datasets in CT format, cardiac
+segmentation datasets in MRI format, and dermatology datasets in image format
+demonstrated that the proposed model outperformed other state-of-the-art
+methods in handling local-global information, achieving an improvement of
+1.54% in Dice coefficient and 2.6 mm in Hausdorff distance on multi-organ
+segmentation. Furthermore, as a network that combines Convolutional Neural
+Network and Transformer architectures, it achieves competitive segmentation
+performance with only 6.93 million parameters and 6.6 giga floating-point
+operations (GFLOPs), without the need for pre-training. In summary, the
+proposed method demonstrated enhanced performance while retaining a simpler
+model design compared to other Transformer-based segmentation networks.
+
+
+
+
+
+ + ☆ Trashbusters: Deep Learning Approach for Litter Detection and Tracking + + +
+ The illegal disposal of trash is a major public health and environmental +concern. Disposing of trash in unplanned places poses serious health and +environmental risks. We should try to restrict public trash cans as much as +possible. This research focuses on automating the penalization of litterbugs, +addressing the persistent problem of littering in public places. Traditional +approaches relying on manual intervention and witness reporting suffer from +delays, inaccuracies, and anonymity issues. To overcome these challenges, this +paper proposes a fully automated system that utilizes surveillance cameras and +advanced computer vision algorithms for litter detection, object tracking, and +face recognition. The system accurately identifies and tracks individuals +engaged in littering activities, attaches their identities through face +recognition, and enables efficient enforcement of anti-littering policies. By +reducing reliance on manual intervention, minimizing human error, and providing +prompt identification, the proposed system offers significant advantages in +addressing littering incidents. The primary contribution of this research lies +in the implementation of the proposed system, leveraging advanced technologies +to enhance surveillance operations and automate the penalization of litterbugs. + +
+
+
+
+
+ + ☆ Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs + + +
+ Integration of Large Language Models (LLMs) into visual domain tasks, +resulting in visual-LLMs (V-LLMs), has enabled exceptional performance in +vision-language tasks, particularly for visual question answering (VQA). +However, existing V-LLMs (e.g. BLIP-2, LLaVA) demonstrate weak spatial +reasoning and localization awareness. Despite generating highly descriptive and +elaborate textual answers, these models fail at simple tasks like +distinguishing a left vs right location. In this work, we explore how +image-space coordinate based instruction fine-tuning objectives could inject +spatial awareness into V-LLMs. We discover optimal coordinate representations, +data-efficient instruction fine-tuning objectives, and pseudo-data generation +strategies that lead to improved spatial awareness in V-LLMs. Additionally, our +resulting model improves VQA across image and video domains, reduces undesired +hallucination, and generates better contextual object descriptions. Experiments +across 5 vision-language tasks involving 14 different datasets establish the +clear performance improvements achieved by our proposed framework. + +
+
+
+
+
+ + ☆ Transferable and Principled Efficiency for Open-Vocabulary Segmentation + + +
+ Recent success of pre-trained foundation vision-language models makes +Open-Vocabulary Segmentation (OVS) possible. Despite the promising performance, +this approach introduces heavy computational overheads for two challenges: 1) +large model sizes of the backbone; 2) expensive costs during the fine-tuning. +These challenges hinder this OVS strategy from being widely applicable and +affordable in real-world scenarios. Although traditional methods such as model +compression and efficient fine-tuning can address these challenges, they often +rely on heuristics. This means that their solutions cannot be easily +transferred and necessitate re-training on different models, which comes at a +cost. In the context of efficient OVS, we target achieving performance that is +comparable to or even better than prior OVS works based on large +vision-language foundation models, by utilizing smaller models that incur lower +training costs. The core strategy is to make our efficiency principled and thus +seamlessly transferable from one OVS framework to others without further +customization. Comprehensive experiments on diverse OVS benchmarks demonstrate +our superior trade-off between segmentation accuracy and computation costs over +previous works. Our code is available on https://github.com/Xujxyang/OpenTrans + +
+
+
+
+
+ + ☆ Multi-view Aggregation Network for Dichotomous Image Segmentation CVPR2024 + + +
+ Dichotomous Image Segmentation (DIS) has recently emerged towards
+high-precision object segmentation from high-resolution natural images.
+ When designing an effective DIS model, the main challenge is how to balance
+the semantic dispersion of high-resolution targets in the small receptive
+field and the loss of high-precision details in the large receptive field.
+Existing methods rely on tedious multiple encoder-decoder streams and stages
+to gradually complete the global localization and local refinement.
+ The human visual system captures regions of interest by observing them from
+multiple views. Inspired by it, we model DIS as a multi-view object perception
+problem and provide a parsimonious multi-view aggregation network (MVANet),
+which unifies the feature fusion of the distant view and close-up view into a
+single stream with one encoder-decoder structure. With the help of the
+proposed multi-view complementary localization and refinement modules, our
+approach establishes long-range, profound visual interactions across multiple
+views, allowing the features of the detailed close-up view to focus on highly
+slender structures. Experiments on the popular DIS-5K dataset show that our
+MVANet significantly outperforms state-of-the-art methods in both accuracy and
+speed. The source code and datasets will be publicly available at
+\href{https://github.com/qianyu-dlut/MVANet}{MVANet}.
+
+
+ comment: Accepted by CVPR2024 as Highlight +
+
+
+
+
+ + ☆ Encoding Urban Ecologies: Automated Building Archetype Generation + through Self-Supervised Learning for Energy Modeling + + +
+ As the global population and urbanization expand, the building sector has +emerged as the predominant energy consumer and carbon emission contributor. The +need for innovative Urban Building Energy Modeling grows, yet existing building +archetypes often fail to capture the unique attributes of local buildings and +the nuanced distinctions between different cities, jeopardizing the precision +of energy modeling. This paper presents an alternative tool employing +self-supervised learning to distill complex geometric data into representative, +locale-specific archetypes. This study attempts to foster a new paradigm of +interaction with built environments, incorporating local parameters to conduct +bespoke energy simulations at the community level. The catered archetypes can +augment the precision and applicability of energy consumption modeling at +different scales across diverse building inventories. This tool provides a +potential solution that encourages the exploration of emerging local ecologies. +By integrating building envelope characteristics and cultural granularity into +the building archetype generation process, we seek a future where architecture +and urban design are intricately interwoven with the energy sector in shaping +our built environments. + +
+
+
+
+
+ + ☆ CopilotCAD: Empowering Radiologists with Report Completion Models and + Quantitative Evidence from Medical Image Foundation Models + + +
+ Computer-aided diagnosis systems hold great promise to aid radiologists and +clinicians in radiological clinical practice and enhance diagnostic accuracy +and efficiency. However, the conventional systems primarily focus on delivering +diagnostic results through text report generation or medical image +classification, positioning them as standalone decision-makers rather than +helpers and ignoring radiologists' expertise. This study introduces an +innovative paradigm to create an assistive co-pilot system for empowering +radiologists by leveraging Large Language Models (LLMs) and medical image +analysis tools. Specifically, we develop a collaborative framework to integrate +LLMs and quantitative medical image analysis results generated by foundation +models with radiologists in the loop, achieving efficient and safe generation +of radiology reports and effective utilization of computational power of AI and +the expertise of medical professionals. This approach empowers radiologists to +generate more precise and detailed diagnostic reports, enhancing patient +outcomes while reducing the burnout of clinicians. Our methodology underscores +the potential of AI as a supportive tool in medical diagnostics, promoting a +harmonious integration of technology and human expertise to advance the field +of radiology. + +
+
+
+
+
+ + ☆ Improving Shift Invariance in Convolutional Neural Networks with + Translation Invariant Polyphase Sampling + + +
+ Downsampling operators break the shift invariance of convolutional neural +networks (CNNs) and this affects the robustness of features learned by CNNs +when dealing with even small pixel-level shift. Through a large-scale +correlation analysis framework, we study shift invariance of CNNs by inspecting +existing downsampling operators in terms of their maximum-sampling bias (MSB), +and find that MSB is negatively correlated with shift invariance. Based on this +crucial insight, we propose a learnable pooling operator called Translation +Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate +feature maps of TIPS to reduce MSB and learn translation-invariant +representations. TIPS can be integrated into any CNN and can be trained +end-to-end with marginal computational overhead. Our experiments demonstrate +that TIPS results in consistent performance gains in terms of accuracy, shift +consistency, and shift fidelity on multiple benchmarks for image classification +and semantic segmentation compared to previous methods and also leads to +improvements in adversarial and distributional robustness. TIPS results in the +lowest MSB compared to all previous methods, thus explaining our strong +empirical results. + +
+
+
+
+
+ + ☆ Simplifying Two-Stage Detectors for On-Device Inference in Remote + Sensing + + +
+ Deep learning has been successfully applied to object detection from
+remotely sensed images. Images are typically processed on the ground rather
+than on-board due to the computation power of the ground system. Such
+offloaded processing causes delays in acquiring target mission information,
+which hinders its application to real-time use cases. For on-device object
+detection, research has been conducted on designing efficient detectors or on
+model compression to reduce inference latency. However, highly accurate
+two-stage detectors still require further work on acceleration. In this paper,
+we propose a model simplification method for two-stage object detectors.
+Instead of constructing a general feature pyramid, we utilize only one feature
+extraction in the two-stage detector. To compensate for the accuracy drop, we
+apply a high-pass filter to the RPN's score map. Our approach is applicable to
+any two-stage detector using a feature pyramid network. In experiments with
+state-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet,
+our method reduced computation costs by up to 61.2% with an accuracy loss
+within 2.1% on the DOTAv1.5 dataset. Source code will be released.
+
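The compensation step, high-pass filtering the RPN score map so that small, sharp responses survive the removal of the feature pyramid, can be illustrated with a Laplacian filter. The kernel and the way the filtered response is added back to the scores are assumptions for illustration, not the paper's exact formulation.

```python
import numpy as np
from scipy.ndimage import convolve

def highpass_score_map(score_map: np.ndarray, alpha: float = 0.5) -> np.ndarray:
    """Emphasise sharp peaks in an RPN objectness score map with a Laplacian high-pass.

    score_map : (H, W) objectness scores from the single-level RPN.
    alpha     : weight of the high-pass response added back to the original map.
    """
    laplacian = np.array([[0, -1, 0],
                          [-1, 4, -1],
                          [0, -1, 0]], dtype=np.float32)
    high = convolve(score_map.astype(np.float32), laplacian, mode="nearest")
    return score_map + alpha * np.clip(high, 0, None)

scores = np.random.rand(64, 64).astype(np.float32)
boosted = highpass_score_map(scores)
print(boosted.shape)
```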
+
+
+
+
+ + ☆ Post-hurricane building damage assessment using street-view imagery and + structured data: A multi-modal deep learning approach + + +
+ Accurately assessing building damage is critical for disaster response and
+recovery. However, many existing models for detecting building damage have
+poor prediction accuracy due to their limited capabilities of identifying
+detailed, comprehensive structural and/or non-structural damage from
+street-view images. Additionally, these models mainly rely on the imagery data
+for damage classification, failing to account for other critical information,
+such as wind speed, building characteristics, evacuation zones, and the
+distance of the building to the hurricane track. To address these limitations,
+in this study, we propose a novel multi-modal (i.e., imagery and structured
+data) approach for post-hurricane building damage classification, named the
+Multi-Modal Swin Transformer (MMST). We empirically train and evaluate the
+proposed MMST using data collected from the 2022 Hurricane Ian in Florida,
+USA. Results show that MMST outperforms all selected state-of-the-art
+benchmark models and achieves an accuracy of 92.67%, which is a 7.71%
+improvement in accuracy over Visual Geometry Group 16 (VGG-16). In addition to
+the street-view imagery data, building value, building age, and wind speed are
+the most important predictors for damage level classification. The proposed
+MMST can be deployed to assist in rapid damage assessment and guide
+reconnaissance efforts in future hurricanes.
+
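The multi-modal idea, combining a street-view image embedding with structured covariates such as wind speed or building age, can be sketched as a late-fusion classifier. The ResNet-18 backbone, feature sizes, and four-level damage scale below are placeholders, not the Swin-based MMST architecture.

```python
import torch
import torch.nn as nn
from torchvision import models

class LateFusionDamageClassifier(nn.Module):
    """Fuse an image embedding with tabular features for damage-level classification."""
    def __init__(self, num_tabular: int = 5, num_classes: int = 4):
        super().__init__()
        backbone = models.resnet18(weights=None)
        feat_dim = backbone.fc.in_features          # 512 for ResNet-18
        backbone.fc = nn.Identity()                 # keep the pooled image embedding
        self.backbone = backbone
        self.tabular = nn.Sequential(nn.Linear(num_tabular, 32), nn.ReLU())
        self.classifier = nn.Linear(feat_dim + 32, num_classes)

    def forward(self, image, tabular):
        img_feat = self.backbone(image)             # (B, 512)
        tab_feat = self.tabular(tabular)            # (B, 32)
        return self.classifier(torch.cat([img_feat, tab_feat], dim=1))

model = LateFusionDamageClassifier()
logits = model(torch.randn(2, 3, 224, 224), torch.randn(2, 5))
print(logits.shape)   # torch.Size([2, 4])
```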
+
+
+
+
+ + ☆ Global versus Local: Evaluating AlexNet Architectures for Tropical + Cyclone Intensity Estimation + + +
+ Given the destructive impacts of tropical cyclones, it is critical to have a
+reliable system for cyclone intensity detection. Various techniques are
+available for this purpose, each with differing levels of accuracy. In this
+paper, we introduce two ensemble-based models based on the AlexNet
+architecture to estimate tropical cyclone intensity using visible satellite
+images. The first model, trained on the entire dataset, is called the global
+AlexNet model. The second model is a distributed version of AlexNet in which
+multiple AlexNets are trained separately on subsets of the training data
+categorized according to the Saffir-Simpson wind speed scale prescribed by
+meteorologists. We evaluated the performance of both models against a deep
+learning benchmark model called \textit{Deepti} using a publicly available
+cyclone image dataset. Results indicate that both the global model (with a
+root mean square error (RMSE) of 9.03 knots) and the distributed model (with
+an RMSE of 9.3 knots) outperform the benchmark model (with an RMSE of 13.62
+knots). We provide a thorough discussion of our solution approach, including
+an explanation of AlexNet's performance using gradient class activation maps
+(Grad-CAM). Our proposed solution strategy allows future experimentation with
+various deep learning models in both single and multi-channel settings.
+
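Adapting AlexNet for wind-speed estimation, as both the global and distributed variants do, amounts to replacing its final classification layer with a single-output regression head. A hedged sketch with the torchvision AlexNet (training loop, loss, and any pre-trained weights are omitted assumptions); the distributed variant would simply instantiate one such model per Saffir-Simpson category.

```python
import torch
import torch.nn as nn
from torchvision import models

def build_intensity_regressor():
    """AlexNet with its last fully connected layer swapped for a 1-output regression head."""
    net = models.alexnet(weights=None)              # optionally start from ImageNet weights
    in_features = net.classifier[6].in_features     # final Linear layer of torchvision AlexNet
    net.classifier[6] = nn.Linear(in_features, 1)   # predict wind speed in knots
    return net

model = build_intensity_regressor()
pred = model(torch.randn(2, 3, 224, 224))
print(pred.shape)   # torch.Size([2, 1])
```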
+
+
+
+
+ + ☆ SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning + Iterations CVPR + + +
+ Optical flow estimation is crucial to a variety of vision tasks. Despite +substantial recent advancements, achieving real-time on-device optical flow +estimation remains a complex challenge. First, an optical flow model must be +sufficiently lightweight to meet computation and memory constraints to ensure +real-time performance on devices. Second, the necessity for real-time on-device +operation imposes constraints that weaken the model's capacity to adequately +handle ambiguities in flow estimation, thereby intensifying the difficulty of +preserving flow accuracy. This paper introduces two synergistic techniques, +Self-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to +enhance the capabilities of optical flow models, with a focus on addressing +optical flow regression ambiguities. These techniques prove particularly +effective in mitigating error propagation, a prevalent issue in optical flow +models that employ iterative refinement. Notably, these techniques add +negligible to zero overhead in model parameters and inference latency, thereby +preserving real-time on-device efficiency. The effectiveness of our proposed +SCI and RFL techniques, collectively referred to as SciFlow for brevity, is +demonstrated across two distinct lightweight optical flow model architectures +in our experiments. Remarkably, SciFlow enables substantial reduction in error +metrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for +in-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on +the Sintel and KITTI 2015 datasets, respectively. + +
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Color Constancy + + +
+ Color constancy (CC) describes the ability of the visual system to perceive +an object as having a relatively constant color despite changes in lighting +conditions. While CC and its limitations have been carefully characterized in +humans, it is still unclear how the visual system acquires this ability during +development. Here, we present a first study showing that CC develops in a +neural network trained in a self-supervised manner through an invariance +learning objective. During learning, objects are presented under changing +illuminations, while the network aims to map subsequent views of the same +object onto close-by latent representations. This gives rise to representations +that are largely invariant to the illumination conditions, offering a plausible +example of how CC could emerge during human cognitive development via a form of +self-supervised learning. + +
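+
+ A minimal sketch of the kind of invariance objective described above: latent
+codes of two illuminations of the same object are pulled together by
+penalising the distance between their L2-normalised embeddings. This is an
+illustrative stand-in, not the authors' exact loss or training setup.
+
+ import numpy as np
+
+ def invariance_loss(z_view1: np.ndarray, z_view2: np.ndarray) -> float:
+     """z_view*: (batch, dim) latents of the same objects under two illuminations."""
+     z1 = z_view1 / np.linalg.norm(z_view1, axis=1, keepdims=True)
+     z2 = z_view2 / np.linalg.norm(z_view2, axis=1, keepdims=True)
+     # 1 - cosine similarity, averaged over the batch
+     return float(np.mean(1.0 - np.sum(z1 * z2, axis=1)))
+
+ z = np.random.randn(8, 128)
+ print(invariance_loss(z, z))                        # identical views -> ~0.0
+ print(invariance_loss(z, np.random.randn(8, 128)))  # unrelated views -> ~1.0
+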
+
+ comment: 7 pages, 5 figures, submitted to the IEEE International Conference on + Development and Learning (ICDL 2024) +
+
+
+
+
+ + ☆ S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for + Face Video Editing + + +
+ Face attribute editing plays a pivotal role in various applications. However,
+existing methods encounter challenges in achieving high-quality results while
+preserving identity, editing faithfulness, and temporal consistency. These
+challenges are rooted in issues related to the training pipeline, including
+limited supervision, architecture design, and optimization strategy. In this
+work, we introduce S3Editor, a Sparse Semantic-disentangled Self-training
+framework for face video editing. S3Editor is a generic solution that
+comprehensively addresses these challenges with three key contributions.
+Firstly, S3Editor adopts a self-training paradigm to enhance the training
+process through semi-supervision. Secondly, we propose a semantic-disentangled
+architecture with a dynamic routing mechanism that accommodates diverse editing
+requirements. Thirdly, we present a structured sparse optimization schema that
+identifies and deactivates malicious neurons to further disentangle impacts
+from untargeted attributes. S3Editor is model-agnostic and compatible with
+various editing approaches. Our extensive qualitative and quantitative results
+affirm that our approach significantly enhances identity preservation, editing
+fidelity, and temporal consistency.
+
+
+
+
+
+ + ☆ Visual Context-Aware Person Fall Detection + + +
+ As the global population ages, the number of fall-related incidents is on the
+rise. Effective fall detection systems, specifically in the healthcare sector,
+are crucial to mitigate the risks associated with such events. This study
+evaluates the role of visual context, including background objects, on the
+accuracy of fall detection classifiers. We present a segmentation pipeline to
+semi-automatically separate individuals and objects in images. Well-established
+models like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and
+evaluated. During training, pixel-based transformations are applied to
+segmented objects, and the models are then evaluated on raw images without
+segmentation. Our findings highlight the significant influence of visual
+context on fall detection. The application of Gaussian blur to the image
+background notably improves the performance and generalization capabilities of
+all models. Background objects such as beds, chairs, or wheelchairs can
+challenge fall detection systems, leading to false positive alarms. However, we
+demonstrate that object-specific contextual transformations during training
+effectively mitigate this challenge. Further analysis using saliency maps
+supports our observation that visual context is crucial in classification
+tasks. We provide both a dataset processing API and a segmentation pipeline,
+available at https://github.com/A-NGJ/image-segmentation-cli.
+
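+
+ The background transformation mentioned above can be sketched as a simple
+mask-based composite; the kernel size and the use of OpenCV here are assumed
+details, not the authors' pipeline.
+
+ import cv2
+ import numpy as np
+
+ def blur_background(image: np.ndarray, person_mask: np.ndarray,
+                     ksize: int = 31) -> np.ndarray:
+     """image: HxWx3 uint8; person_mask: HxW array of {0, 1} with 1 on the person."""
+     blurred = cv2.GaussianBlur(image, (ksize, ksize), 0)
+     mask3 = np.repeat(person_mask[..., None], 3, axis=2).astype(np.uint8)
+     # keep the person sharp, replace the background with its blurred version
+     return image * mask3 + blurred * (1 - mask3)
+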
+
+ comment: 10 pages, 6 figures, KES IDT-24 conference +
+
+
+
+
+ + ☆ Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep + Learning + + +
+ Computer vision, particularly vehicle and pedestrian identification, is
+critical to the evolution of autonomous driving, artificial intelligence, and
+video surveillance. Current traffic monitoring systems confront major
+difficulties in recognizing small objects and pedestrians effectively in
+real-time, posing a serious risk to public safety and contributing to traffic
+inefficiency. Recognizing these difficulties, our project focuses on the
+creation and validation of an advanced deep-learning framework capable of
+processing complex visual input for precise, real-time recognition of cars and
+people in a variety of environmental situations. On a dataset representing
+complicated urban settings, we trained and evaluated different versions of the
+YOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most
+effective, especially in pedestrian recognition, with great precision and
+robustness. The results, which include Mean Average Precision and recall rates,
+demonstrate the model's ability to dramatically improve traffic monitoring and
+safety. This study makes an important addition to real-time, reliable detection
+in computer vision, establishing new benchmarks for traffic management systems.
+
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning + Models CVPR 2024 + + +
+ Recent advances in decentralized deep learning algorithms have demonstrated
+cutting-edge performance on various tasks with large pre-trained models.
+However, achieving this level of competitiveness entails significant
+communication and computation overheads when updating these models, which
+prohibits their application to real-world scenarios. To address this issue,
+drawing inspiration from advanced model merging techniques without requiring
+additional training, we introduce the Decentralized Iterative
+Merging-And-Training (DIMAT) paradigm--a novel decentralized deep learning
+framework. Within DIMAT, each agent is trained on its local data and
+periodically merged with its neighboring agents using advanced model merging
+techniques like activation matching until convergence is achieved. DIMAT
+provably converges with the best available rate for nonconvex functions with
+various first-order methods, while yielding tighter error bounds compared to
+popular existing approaches. We conduct a comprehensive empirical analysis
+to validate DIMAT's superiority over baselines across diverse computer vision
+tasks sourced from multiple datasets. Empirical results validate our
+theoretical claims by showing that DIMAT attains a faster and higher initial
+gain in accuracy with independent and identically distributed (IID) and
+non-IID data, while incurring lower communication overhead. The DIMAT paradigm
+presents a new opportunity for future decentralized learning, enhancing its
+adaptability to real-world settings with sparse and lightweight communication
+and computation.
+
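+
+ A highly simplified sketch of the periodic merge step is given below using
+plain parameter averaging with neighbours; the paper relies on more advanced
+merging such as activation matching, which is not reproduced here, and the
+local training loop is omitted.
+
+ import numpy as np
+
+ def merge_with_neighbors(local_params: dict, neighbor_params: list) -> dict:
+     """Average each named parameter array across the agent and its neighbours."""
+     merged = {}
+     for name, value in local_params.items():
+         stacked = np.stack([value] + [p[name] for p in neighbor_params])
+         merged[name] = stacked.mean(axis=0)
+     return merged
+
+ # In a DIMAT-style loop, each agent would alternate local training epochs
+ # with such a merge until convergence.
+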
+
+ comment: CVPR 2024 accepted paper, 22 pages, 12 figures +
+
+
+
+
+ + ☆ Latent Guard: a Safety Framework for Text-to-image Generation + + +
+ With the ability to generate high-quality images, text-to-image (T2I) models +can be exploited for creating inappropriate content. To prevent misuse, +existing safety measures are either based on text blacklists, which can be +easily circumvented, or harmful content classification, requiring large +datasets for training and offering low flexibility. Hence, we propose Latent +Guard, a framework designed to improve safety measures in text-to-image +generation. Inspired by blacklist-based approaches, Latent Guard learns a +latent space on top of the T2I model's text encoder, where it is possible to +check the presence of harmful concepts in the input text embeddings. Our +proposed framework is composed of a data generation pipeline specific to the +task using large language models, ad-hoc architectural components, and a +contrastive learning strategy to benefit from the generated data. The +effectiveness of our method is verified on three datasets and against four +baselines. Code and data will be shared at +https://github.com/rt219/LatentGuard. + +
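+
+ Conceptually, once concept embeddings have been learned in such a latent
+space, a prompt can be flagged by thresholding its similarity to them. The
+sketch below only illustrates that idea; the threshold and the encoders are
+placeholders, not the released Latent Guard components.
+
+ import numpy as np
+
+ def is_flagged(prompt_embedding: np.ndarray,
+                concept_embeddings: np.ndarray,
+                threshold: float = 0.8) -> bool:
+     """prompt_embedding: (dim,); concept_embeddings: (num_concepts, dim)."""
+     p = prompt_embedding / np.linalg.norm(prompt_embedding)
+     c = concept_embeddings / np.linalg.norm(concept_embeddings,
+                                             axis=1, keepdims=True)
+     # flag the prompt if it lies close to any harmful-concept embedding
+     return bool(np.max(c @ p) >= threshold)
+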
+
+ comment: under review +
+
+
+
+
+ + ☆ Rethinking Artistic Copyright Infringements in the Era of Text-to-Image + Generative Models + + +
+ Recent text-to-image generative models such as Stable Diffusion are extremely
+adept at mimicking and generating copyrighted content, raising concerns amongst
+artists that their unique styles may be improperly copied. Understanding how
+generative models copy "artistic style" is more complex than duplicating a
+single image, as style comprises a set of elements (or signature) that
+frequently co-occur across a body of work, where each individual work may vary
+significantly. In our paper, we first reformulate the problem of "artistic
+copyright infringement" as a classification problem over image sets, instead of
+probing image-wise similarities. We then introduce ArtSavant, a practical
+(i.e., efficient and easy to understand) tool to (i) determine the unique style
+of an artist by comparing it to a reference dataset of works from 372 artists
+curated from WikiArt, and (ii) recognize if the identified style reappears in
+generated images. We leverage two complementary methods to perform artistic
+style classification over image sets, including TagMatch, which is a novel
+inherently interpretable and attributable method, making it more suitable for
+broader use by non-technical stakeholders (artists, lawyers, judges, etc).
+Leveraging ArtSavant, we then perform a large-scale empirical study to provide
+quantitative insight on the prevalence of artistic style copying across 3
+popular text-to-image generative models. Namely, amongst a dataset of prolific
+artists (including many famous ones), only 20% of them appear to have their
+styles at risk of being copied via simple prompting of today's popular
+text-to-image generative models.
+
+
+
+
+
+ + ☆ SurvMamba: State Space Model with Multi-grained Multi-modal Interaction + for Survival Prediction + + +
+ Multi-modal learning that combines pathological images with genomic data has +significantly enhanced the accuracy of survival prediction. Nevertheless, +existing methods have not fully utilized the inherent hierarchical structure +within both whole slide images (WSIs) and transcriptomic data, from which +better intra-modal representations and inter-modal integration could be +derived. Moreover, many existing studies attempt to improve multi-modal +representations through attention mechanisms, which inevitably lead to high +complexity when processing high-dimensional WSIs and transcriptomic data. +Recently, a structured state space model named Mamba emerged as a promising +approach for its superior performance in modeling long sequences with low +complexity. In this study, we propose Mamba with multi-grained multi-modal +interaction (SurvMamba) for survival prediction. SurvMamba is implemented with +a Hierarchical Interaction Mamba (HIM) module that facilitates efficient +intra-modal interactions at different granularities, thereby capturing more +detailed local features as well as rich global representations. In addition, an +Interaction Fusion Mamba (IFM) module is used for cascaded inter-modal +interactive fusion, yielding more comprehensive features for survival +prediction. Comprehensive evaluations on five TCGA datasets demonstrate that +SurvMamba outperforms other existing methods in terms of performance and +computational cost. + +
+
+
+
+
+ + ☆ Synthetic Brain Images: Bridging the Gap in Brain Mapping With + Generative Adversarial Model + + +
+ Magnetic Resonance Imaging (MRI) is a vital modality for gaining precise
+anatomical information, and it plays a significant role in medical imaging for
+diagnosis and therapy planning. Image synthesis problems have seen a revolution
+in recent years due to the introduction of deep learning techniques,
+specifically Generative Adversarial Networks (GANs). This work investigates the
+use of Deep Convolutional Generative Adversarial Networks (DCGAN) for producing
+high-fidelity and realistic MRI image slices. The suggested approach uses a
+dataset with a variety of brain MRI scans to train a DCGAN architecture. While
+the discriminator network discerns between created and real slices, the
+generator network learns to synthesise realistic MRI image slices. The
+generator refines its capacity to generate slices that closely mimic real MRI
+data through an adversarial training approach. The outcomes demonstrate that
+the DCGAN holds promise for a range of uses in medical imaging research, since
+it can effectively produce MRI image slices when trained for a sufficient
+number of epochs. This work adds to the expanding corpus of research on the
+application of deep learning techniques for medical image synthesis. The
+produced slices can enhance datasets and provide data augmentation for training
+deep learning models; in addition, a number of functions are made available to
+ease MRI data cleaning, along with a ready-to-use and clean dataset covering
+the three major anatomical planes.
+
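+
+ For reference, a DCGAN-style generator can be sketched in a few lines of
+PyTorch; the layer sizes and the 64x64 single-channel output are illustrative
+choices and are not taken from the paper.
+
+ import torch
+ import torch.nn as nn
+
+ class Generator(nn.Module):
+     def __init__(self, latent_dim: int = 100, feat: int = 64):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.ConvTranspose2d(latent_dim, feat * 8, 4, 1, 0, bias=False),
+             nn.BatchNorm2d(feat * 8), nn.ReLU(True),           # 4x4
+             nn.ConvTranspose2d(feat * 8, feat * 4, 4, 2, 1, bias=False),
+             nn.BatchNorm2d(feat * 4), nn.ReLU(True),           # 8x8
+             nn.ConvTranspose2d(feat * 4, feat * 2, 4, 2, 1, bias=False),
+             nn.BatchNorm2d(feat * 2), nn.ReLU(True),           # 16x16
+             nn.ConvTranspose2d(feat * 2, feat, 4, 2, 1, bias=False),
+             nn.BatchNorm2d(feat), nn.ReLU(True),               # 32x32
+             nn.ConvTranspose2d(feat, 1, 4, 2, 1, bias=False),
+             nn.Tanh(),                                         # 64x64
+         )
+
+     def forward(self, z: torch.Tensor) -> torch.Tensor:
+         # z: (batch, latent_dim) -> (batch, 1, 64, 64) synthetic slice
+         return self.net(z.view(z.size(0), -1, 1, 1))
+
+ fake = Generator()(torch.randn(2, 100))  # shape: (2, 1, 64, 64)
+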
+
+
+
+
+ + ♻ ☆ Supervised Fine-tuning in turn Improves Visual Foundation Models + + +
+ Image-text training like CLIP has dominated the pretraining of vision
+foundation models in recent years. Subsequent efforts have been made to
+introduce region-level visual learning into CLIP's pretraining but face
+scalability challenges due to the lack of large-scale region-level datasets.
+Drawing inspiration from supervised fine-tuning (SFT) in natural language
+processing, such as instruction tuning, we explore the potential of
+fine-grained SFT in enhancing the generation of vision foundation models after
+their pretraining. Thus, a two-stage method, ViSFT (Vision SFT), is proposed to
+unleash the fine-grained knowledge of vision foundation models. In ViSFT, the
+vision foundation model is enhanced by performing visual joint learning on some
+in-domain tasks and then tested on out-of-domain benchmarks. After updating
+with ViSFT on 8 V100 GPUs for less than 2 days, a vision transformer with over
+4.4B parameters shows improvements across various out-of-domain benchmarks,
+including vision and vision-linguistic scenarios.
+
+
+ comment: 23 pages, 3 figures, Project page: + https://github.com/TencentARC/ViSFT/tree/main +
+
+
+
+
+ + ♻ ☆ Low-Resource Vision Challenges for Foundation Models CVPR2024 + + +
+ Low-resource settings are well-established in natural language processing, +where many languages lack sufficient data for deep learning at scale. However, +low-resource problems are under-explored in computer vision. In this paper, we +address this gap and explore the challenges of low-resource image tasks with +vision foundation models. We first collect a benchmark of genuinely +low-resource image data, covering historic maps, circuit diagrams, and +mechanical drawings. These low-resource settings all share three challenges: +data scarcity, fine-grained differences, and the distribution shift from +natural images to the specialized domain of interest. While existing foundation +models have shown impressive generalizability, we find they cannot transfer +well to our low-resource tasks. To begin to tackle the challenges of +low-resource vision, we introduce one simple baseline per challenge. +Specifically, we i) enlarge the data space by generative models, ii) adopt the +best sub-kernels to encode local regions for fine-grained difference discovery +and iii) learn attention for specialized domains. Experiments on our three +low-resource tasks demonstrate our proposals already provide a better baseline +than transfer learning, data augmentation, and fine-grained methods. This +highlights the unique characteristics and challenges of low-resource vision for +foundation models that warrant further investigation. Project page: +https://xiaobai1217.github.io/Low-Resource-Vision/. + +
+
+ comment: Accepted at CVPR2024 +
+
+
+
+
+ + ♻ ☆ EgoGen: An Egocentric Synthetic Data Generator CVPR 2024 + + +
+ Understanding the world in first-person view is fundamental in Augmented +Reality (AR). This immersive perspective brings dramatic visual changes and +unique challenges compared to third-person views. Synthetic data has empowered +third-person-view vision models, but its application to embodied egocentric +perception tasks remains largely unexplored. A critical challenge lies in +simulating natural human movements and behaviors that effectively steer the +embodied cameras to capture a faithful egocentric representation of the 3D +world. To address this challenge, we introduce EgoGen, a new synthetic data +generator that can produce accurate and rich ground-truth training data for +egocentric perception tasks. At the heart of EgoGen is a novel human motion +synthesis model that directly leverages egocentric visual inputs of a virtual +human to sense the 3D environment. Combined with collision-avoiding motion +primitives and a two-stage reinforcement learning approach, our motion +synthesis model offers a closed-loop solution where the embodied perception and +movement of the virtual human are seamlessly coupled. Compared to previous +works, our model eliminates the need for a pre-defined global path, and is +directly applicable to dynamic environments. Combined with our easy-to-use and +scalable data generation pipeline, we demonstrate EgoGen's efficacy in three +tasks: mapping and localization for head-mounted cameras, egocentric camera +tracking, and human mesh recovery from egocentric views. EgoGen will be fully +open-sourced, offering a practical solution for creating realistic egocentric +training data and aiming to serve as a useful tool for egocentric computer +vision research. Refer to our project page: https://ego-gen.github.io/. + +
+
+ comment: Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page: + https://ego-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and
+transformer-based approaches. However, CNNs struggle with long-range
+dependencies, while transformers are burdened by quadratic computational
+complexity. Mamba-based models, with their superior long-range modeling and
+linear efficiency, have garnered substantial attention. This study pioneers the
+application of Mamba to multi-class unsupervised anomaly detection, presenting
+MambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring
+Locality-Enhanced State Space (LSS) modules at multiple scales. The proposed
+LSS module, integrating parallel cascaded Hybrid State Space (HSS) blocks and
+multi-kernel convolution operations, effectively captures both long-range and
+local information. The HSS block, utilizing Hybrid Scanning (HS) encoders,
+encodes feature maps into five scanning methods and eight directions, thereby
+strengthening global connections through the State Space Model (SSM). The use
+of Hilbert scanning and eight directions significantly improves feature
+sequence modeling. Comprehensive experiments on six diverse anomaly detection
+datasets and seven metrics demonstrate state-of-the-art performance,
+substantiating the method's effectiveness.
+
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although existing uncertainty-based semi-supervised medical segmentation
+methods have achieved excellent performance, they usually consider only a
+single uncertainty evaluation, which often fails to fully address the problem
+of credibility. Therefore, based on the framework of evidential deep learning,
+this paper integrates the evidential predictive results in the cross-region of
+mixed and original samples to reallocate the confidence degree and uncertainty
+measure of each voxel, which is realized by a probability-assignment fusion
+rule from traditional evidence theory that emphasizes uncertain information.
+Furthermore, we design a voxel-level asymptotic learning strategy that combines
+information entropy with the fused uncertainty measure to estimate voxel
+predictions more precisely. During learning, the model gradually attends to
+predictions with high uncertainty in order to learn the features that are
+difficult to master. Experimental results on the LA, Pancreas-CT, ACDC and TBAD
+datasets demonstrate the superior performance of our proposed method in
+comparison with the existing state of the art.
+
+
+
+
+
+ + ♻ ☆ Deep Learning for Event-based Vision: A Comprehensive Survey and + Benchmarks + + +
+ Event cameras are bio-inspired sensors that capture the per-pixel intensity +changes asynchronously and produce event streams encoding the time, pixel +position, and polarity (sign) of the intensity changes. Event cameras possess a +myriad of advantages over canonical frame-based cameras, such as high temporal +resolution, high dynamic range, low latency, etc. Being capable of capturing +information in challenging visual conditions, event cameras have the potential +to overcome the limitations of frame-based cameras in the computer vision and +robotics community. In very recent years, deep learning (DL) has been brought +to this emerging field and inspired active research endeavors in mining its +potential. However, there is still a lack of taxonomies in DL techniques for +event-based vision. We first scrutinize the typical event representations with +quality enhancement methods as they play a pivotal role as inputs to the DL +models. We then provide a comprehensive survey of existing DL-based methods by +structurally grouping them into two major categories: 1) image/video +reconstruction and restoration; 2) event-based scene understanding and 3D +vision. We conduct benchmark experiments for the existing methods in some +representative research directions, i.e., image reconstruction, deblurring, and +object recognition, to identify some critical insights and problems. Finally, +we have discussions regarding the challenges and provide new perspectives for +inspiring more research studies. + +
+
+
+
+
+ + ♻ ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress.
+However, existing methods inevitably lose geometrical structure information
+during the feature channel generation process, resulting in edge detail
+mismatches. In this paper, the Motif Channel Attention Stereo Matching Network
+(MoCha-Stereo) is designed to address this problem. We provide the Motif
+Channel Correlation Volume (MCCV) to determine more accurate edge matching
+costs. MCCV is achieved by projecting motif channels, which capture common
+geometric structures in feature channels, onto feature maps and cost volumes.
+In addition, since edge variations in potential feature channels of the
+reconstruction error map also affect detail matching, we propose the
+Reconstruction Error Motif Penalty (REMP) module to further refine the
+full-resolution disparity estimation. REMP integrates the frequency information
+of typical channel features from the reconstruction error. MoCha-Stereo ranks
+1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure
+also shows excellent performance in Multi-View Stereo. Code is available at
+https://github.com/ZYangChen/MoCha-Stereo.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Time-step Curriculum for One Image to 3D Generation CVPR 2024 + + +
+ Score distillation sampling (SDS) has been widely adopted to overcome the
+absence of unseen views in reconstructing 3D objects from a single image. It
+leverages pre-trained 2D diffusion models as teachers to guide the
+reconstruction of student 3D models. Despite their remarkable success,
+SDS-based methods often encounter geometric artifacts and texture saturation.
+We find that the crux is the overlooked indiscriminate treatment of diffusion
+time-steps during optimization: it unreasonably treats student-teacher
+knowledge distillation as equal at all time-steps and thus entangles
+coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion
+Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the
+teacher and student models collaborating with the time-step curriculum in a
+coarse-to-fine manner. Extensive experiments on the NeRF4, RealFusion15, GSO
+and Level50 benchmarks demonstrate that DTC123 can produce multi-view
+consistent, high-quality, and diverse 3D assets. Codes and more generation
+demos will be released at https://github.com/yxymessi/DTC123.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in + Remote Sensing + + +
+ Self-supervised learning through masked autoencoders (MAEs) has recently +attracted great attention for remote sensing (RS) image representation +learning, and thus embodies a significant potential for content-based image +retrieval (CBIR) from ever-growing RS image archives. However, the existing +studies on MAEs in RS assume that the considered RS images are acquired by a +single image sensor, and thus are only suitable for uni-modal CBIR problems. +The effectiveness of MAEs for cross-sensor CBIR, which aims to search +semantically similar images across different image modalities, has not been +explored yet. In this paper, we take the first step to explore the +effectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a +systematic overview on the possible adaptations of the vanilla MAE to exploit +masked image modeling on multi-sensor RS image archives (denoted as +cross-sensor masked autoencoders [CSMAEs]). Based on different adjustments +applied to the vanilla MAE, we introduce different CSMAE models. We also +provide an extensive experimental analysis of these CSMAE models. We finally +derive a guideline to exploit masked image modeling for uni-modal and +cross-modal CBIR problems in RS. The code of this work is publicly available at +https://github.com/jakhac/CSMAE. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Our code is available at https://github.com/jakhac/CSMAE +
+
+
+
+
+ + ♻ ☆ An Autonomous Vision-Based Algorithm for Interplanetary Navigation + + +
+ The surge of deep-space probes makes it unsustainable to navigate them with +standard radiometric tracking. Self-driving interplanetary satellites represent +a solution to this problem. In this work, a full vision-based navigation +algorithm is built by combining an orbit determination method with an image +processing pipeline suitable for interplanetary transfers of autonomous +platforms. To increase the computational efficiency of the algorithm, a +non-dimensional extended Kalman filter is selected as state estimator, fed by +the positions of the planets extracted from deep-space images. An enhancement +of the estimation accuracy is performed by applying an optimal strategy to +select the best pair of planets to track. Moreover, a novel analytical +measurement model for deep-space navigation is developed providing a +first-order approximation of the light-aberration and light-time effects. +Algorithm performance is tested on a high-fidelity, Earth--Mars interplanetary +transfer, showing the algorithm applicability for deep-space navigation. + +
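+
+ The estimator described above can be pictured with a generic extended Kalman
+filter step; the non-dimensional formulation, the orbital dynamics, and the
+light-aberration/light-time measurement model of the paper are not reproduced
+here.
+
+ import numpy as np
+
+ def ekf_step(x, P, f, F, h, H, Q, R, z):
+     """x: state estimate, P: covariance, f/h: dynamics/measurement functions,
+     F/H: their Jacobians (already evaluated), Q/R: noise covariances,
+     z: measurement, e.g. observed planet directions."""
+     # predict
+     x_pred = f(x)
+     P_pred = F @ P @ F.T + Q
+     # update
+     y = z - h(x_pred)                       # innovation
+     S = H @ P_pred @ H.T + R
+     K = P_pred @ H.T @ np.linalg.inv(S)     # Kalman gain
+     x_new = x_pred + K @ y
+     P_new = (np.eye(len(x)) - K @ H) @ P_pred
+     return x_new, P_new
+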
+
+
+
+
+ + ♻ ☆ Attention Calibration for Disentangled Text-to-Image Personalization CVPR 2024 + + +
+ Recent thrilling progress in large-scale text-to-image (T2I) models has +unlocked unprecedented synthesis quality of AI-generated content (AIGC) +including image generation, 3D and video composition. Further, personalized +techniques enable appealing customized production of a novel concept given only +several images as reference. However, an intriguing problem persists: Is it +possible to capture multiple, novel concepts from one single reference image? +In this paper, we identify that existing approaches fail to preserve visual +consistency with the reference image and eliminate cross-influence from +concepts. To alleviate this, we propose an attention calibration mechanism to +improve the concept-level understanding of the T2I model. Specifically, we +first introduce new learnable modifiers bound with classes to capture +attributes of multiple concepts. Then, the classes are separated and +strengthened following the activation of the cross-attention operation, +ensuring comprehensive and self-contained concepts. Additionally, we suppress +the attention activation of different classes to mitigate mutual influence +among concepts. Together, our proposed method, dubbed DisenDiff, can learn +disentangled multiple concepts from one single image and produce novel +customized images with learned concepts. We demonstrate that our method +outperforms the current state of the art in both qualitative and quantitative +evaluations. More importantly, our proposed techniques are compatible with LoRA +and inpainting pipelines, enabling more interactive experiences. + +
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ A Deep Learning Method for Simultaneous Denoising and Missing Wedge + Reconstruction in Cryogenic Electron Tomography + + +
+ Cryogenic electron tomography is a technique for imaging biological samples
+in 3D. A microscope collects a series of 2D projections of the sample, and the
+goal is to reconstruct the 3D density of the sample, called the tomogram.
+Reconstruction is difficult as the 2D projections are noisy and cannot be
+recorded from all directions, resulting in a missing wedge of information.
+Tomograms conventionally reconstructed with filtered back-projection suffer
+from noise and strong artifacts due to the missing wedge. Here, we propose a
+deep-learning approach for simultaneous denoising and missing wedge
+reconstruction called DeepDeWedge. The algorithm requires no ground truth data
+and is based on fitting a neural network to the 2D projections using a
+self-supervised loss. DeepDeWedge performs better than CryoCARE and IsoNet,
+which are state-of-the-art methods for denoising and missing wedge
+reconstruction, and performs similarly to, and in some cases better than, the
+combination of the two methods. At the same time, DeepDeWedge is simpler than
+this two-step approach, as it does denoising and missing wedge reconstruction
+simultaneously rather than sequentially.
+
+
+
+
+
+ + ♻ ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. Code is available at +https://github.com/arturxe2/T-DEED. + +
+
+
+
+
+
+ ♻ ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the
+ Poincaré Ball
+
+
+ Hierarchy is a natural representation of semantic taxonomies, including the
+ones routinely used in image segmentation. Indeed, recent work on semantic
+segmentation reports improved accuracy from supervised training leveraging
+hierarchical label structures. Encouraged by these results, we revisit the
+fundamental assumptions behind that work. We postulate and then empirically
+verify that the reasons for the observed improvement in segmentation accuracy
+may be entirely unrelated to the use of the semantic hierarchy. To demonstrate
+this, we design a range of cross-domain experiments with a representative
+hierarchical approach. We find that on the new testing domains, a flat
+(non-hierarchical) segmentation network, in which the parents are inferred from
+the children, has superior segmentation accuracy to the hierarchical approach
+across the board. Complementing these findings and inspired by the intrinsic
+properties of hyperbolic spaces, we study a more principled approach to
+hierarchical segmentation using the Poincaré ball model. The hyperbolic
+representation largely outperforms the previous (Euclidean) hierarchical
+approach as well and is on par with our flat Euclidean baseline in terms of
+segmentation accuracy. However, it additionally exhibits surprisingly strong
+calibration quality of the parent nodes in the semantic hierarchy, especially
+on the more challenging domains. Our combined analysis suggests that the
+established practice of hierarchical segmentation may be limited to in-domain
+settings, whereas flat classifiers generalize substantially better, especially
+if they are modeled in the hyperbolic space.
+
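+
+ The flat baseline's "parents inferred from the children" rule admits a very
+short sketch: parent scores are obtained by summing the softmax probabilities
+of their child (leaf) classes. The hierarchy mapping below is illustrative.
+
+ import numpy as np
+
+ def parent_probs(leaf_logits: np.ndarray, children: dict) -> dict:
+     """leaf_logits: (num_leaves,); children: parent name -> list of leaf indices."""
+     e = np.exp(leaf_logits - leaf_logits.max())
+     p = e / e.sum()                          # softmax over leaf classes
+     return {parent: float(p[idx].sum()) for parent, idx in children.items()}
+
+ # e.g. parent_probs(logits, {"vehicle": [0, 1, 2], "animal": [3, 4]})
+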
+
+
+
+
+ + ♻ ☆ Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising + in Real-World Scenarios CVPR 2024 + + +
+ Self-supervised denoising has attracted widespread attention due to its +ability to train without clean images. However, noise in real-world scenarios +is often spatially correlated, which causes many self-supervised algorithms +that assume pixel-wise independent noise to perform poorly. Recent works have +attempted to break noise correlation with downsampling or neighborhood masking. +However, denoising on downsampled subgraphs can lead to aliasing effects and +loss of details due to a lower sampling rate. Furthermore, the neighborhood +masking methods either come with high computational complexity or do not +consider local spatial preservation during inference. Through the analysis of +existing methods, we point out that the key to obtaining high-quality and +texture-rich results in real-world self-supervised denoising tasks is to train +at the original input resolution structure and use asymmetric operations during +training and inference. Based on this, we propose Asymmetric Tunable Blind-Spot +Network (AT-BSN), where the blind-spot size can be freely adjusted, thus better +balancing noise correlation suppression and image local spatial destruction +during training and inference. In addition, we regard the pre-trained AT-BSN as +a meta-teacher network capable of generating various teacher networks by +sampling different blind-spots. We propose a blind-spot based multi-teacher +distillation strategy to distill a lightweight network, significantly improving +performance. Experimental results on multiple datasets prove that our method +achieves state-of-the-art, and is superior to other self-supervised algorithms +in terms of computational overhead and visual effects. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Is Medieval Distant Viewing Possible? : Extending and Enriching + Annotation of Legacy Image Collections using Visual Analytics + + +
+ Distant viewing approaches have typically used image datasets close to the +contemporary image data used to train machine learning models. To work with +images from other historical periods requires expert annotated data, and the +quality of labels is crucial for the quality of results. Especially when +working with cultural heritage collections that contain myriad uncertainties, +annotating data, or re-annotating, legacy data is an arduous task. In this +paper, we describe working with two pre-annotated sets of medieval manuscript +images that exhibit conflicting and overlapping metadata. Since a manual +reconciliation of the two legacy ontologies would be very expensive, we aim (1) +to create a more uniform set of descriptive labels to serve as a "bridge" in +the combined dataset, and (2) to establish a high quality hierarchical +classification that can be used as a valuable input for subsequent supervised +machine learning. To achieve these goals, we developed visualization and +interaction mechanisms, enabling medievalists to combine, regularize and extend +the vocabulary used to describe these, and other cognate, image datasets. The +visual interfaces provide experts an overview of relationships in the data +going beyond the sum total of the metadata. Word and image embeddings as well +as co-occurrences of labels across the datasets, enable batch re-annotation of +images, recommendation of label candidates and support composing a hierarchical +classification of labels. + +
+
+ comment: Revision after DSH Peer Review. Paper is now accepted at DSH +
+
+
+
+
+ + ♻ ☆ How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey + + +
+ Over the past two decades, research in the field of Simultaneous Localization +and Mapping (SLAM) has undergone a significant evolution, highlighting its +critical role in enabling autonomous exploration of unknown environments. This +evolution ranges from hand-crafted methods, through the era of deep learning, +to more recent developments focused on Neural Radiance Fields (NeRFs) and 3D +Gaussian Splatting (3DGS) representations. Recognizing the growing body of +research and the absence of a comprehensive survey on the topic, this paper +aims to provide the first comprehensive overview of SLAM progress through the +lens of the latest advancements in radiance fields. It sheds light on the +background, evolutionary path, inherent strengths and limitations, and serves +as a fundamental reference to highlight the dynamic progress and specific +challenges. + +
+
+
+
+
+ + ♻ ☆ 3D Human Reconstruction in the Wild with Synthetic Data Using Generative + Models + + +
+ In this work, we show that synthetic data created by generative models is +complementary to computer graphics (CG) rendered data for achieving remarkable +generalization performance on diverse real-world scenes for 3D human pose and +shape estimation (HPS). Specifically, we propose an effective approach based on +recent diffusion models, termed HumanWild, which can effortlessly generate +human images and corresponding 3D mesh annotations. We first collect a +large-scale human-centric dataset with comprehensive annotations, e.g., text +captions and surface normal images. Then, we train a customized ControlNet +model upon this dataset to generate diverse human images and initial +ground-truth labels. At the core of this step is that we can easily obtain +numerous surface normal images from a 3D human parametric model, e.g., SMPL-X, +by rendering the 3D mesh onto the image plane. As there exists inevitable noise +in the initial labels, we then apply an off-the-shelf foundation segmentation +model, i.e., SAM, to filter negative data samples. Our data generation pipeline +is flexible and customizable to facilitate different real-world tasks, e.g., +ego-centric scenes and perspective-distortion scenes. The generated dataset +comprises 0.79M images with corresponding 3D annotations, covering versatile +viewpoints, scenes, and human identities. We train various HPS regressors on +top of the generated data and evaluate them on a wide range of benchmarks +(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the +generated data. By exclusively employing generative models, we generate +large-scale in-the-wild human images and high-quality annotations, eliminating +the need for real-world data collection. + +
+
+ comment: project page: https://yongtaoge.github.io/projects/humanwild +
+
+
+
+
+ + ♻ ☆ NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose + Priors CVPR 2024 + + +
+ Faithfully modeling the space of articulations is a crucial task that allows +recovery and generation of realistic poses, and remains a notorious challenge. +To this end, we introduce Neural Riemannian Distance Fields (NRDFs), +data-driven priors modeling the space of plausible articulations, represented +as the zero-level-set of a neural field in a high-dimensional +product-quaternion space. To train NRDFs only on positive examples, we +introduce a new sampling algorithm, ensuring that the geodesic distances follow +a desired distribution, yielding a principled distance field learning paradigm. +We then devise a projection algorithm to map any random pose onto the level-set +by an adaptive-step Riemannian optimizer, adhering to the product manifold of +joint rotations at all times. NRDFs can compute the Riemannian gradient via +backpropagation and by mathematical analogy, are related to Riemannian flow +matching, a recent generative model. We conduct a comprehensive evaluation of +NRDF against other pose priors in various downstream tasks, i.e., pose +generation, image-based pose estimation, and solving inverse kinematics, +highlighting NRDF's superior performance. Besides humans, NRDF's versatility +extends to hand and animal poses, as it can effectively represent any +articulation. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://virtualhumans.mpi-inf.mpg.de/nrdf +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive +progress in the field of remote sensing change detection (CD). However, both +architectures have inherent shortcomings. Recently, the Mamba architecture, +based on state space models, has shown remarkable performance in a series of +natural language processing tasks, which can effectively compensate for the +shortcomings of the above two architectures. In this paper, we explore for the +first time the potential of the Mamba architecture for remote sensing CD tasks. +We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and +MambaBDA, for binary change detection (BCD), semantic change detection (SCD), +and building damage assessment (BDA), respectively. All three frameworks adopt +the cutting-edge Visual Mamba architecture as the encoder, which allows full +learning of global spatial contextual information from the input images. For +the change decoder, which is available in all three architectures, we propose +three spatio-temporal relationship modeling mechanisms, which can be naturally +combined with the Mamba architecture and fully utilize its attribute to achieve +spatio-temporal interaction of multi-temporal features, thereby obtaining +accurate change information. On five benchmark datasets, our proposed +frameworks outperform current CNN- and Transformer-based approaches without +using any complex training strategies or tricks, fully demonstrating the +potential of the Mamba architecture in CD tasks. Specifically, we obtained +83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+, +and WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA +dataset xBD, we obtained 81.41% overall F1 score. Further experiments show that +our architecture is quite robust to degraded data. The source code will be +available in https://github.com/ChenHongruixuan/MambaCD + +
+
+
+
+
+ + ♻ ☆ RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided + Image Synthesis CVPR 2024 + + +
+ Pose-guided person image synthesis task requires re-rendering a reference +image, which should have a photorealistic appearance and flawless pose +transfer. Since person images are highly structured, existing approaches +require dense connections for complex deformations and occlusions because these +are generally handled through multi-level warping and masking in latent space. +The feature maps generated by convolutional neural networks do not have +equivariance, and hence multi-level warping is required to perform pose +alignment. Inspired by the ability of the diffusion model to generate +photorealistic images from the given conditional guidance, we propose recurrent +pose alignment to provide pose-aligned texture features as conditional +guidance. Due to the leakage of the source pose in conditional guidance, we +propose gradient guidance from pose interaction fields, which output the +distance from the valid pose manifold given a predicted pose as input. This +helps in learning plausible pose transfer trajectories that result in +photorealism and undistorted texture details. Extensive results on two +large-scale benchmarks and a user study demonstrate the ability of our proposed +approach to generate photorealistic pose transfer under challenging scenarios. +Additionally, we demonstrate the efficiency of gradient guidance in pose-guided +image generation on the HumanArt dataset with fine-tuned stable diffusion. + +
+
+ comment: Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7 + figures +
+
+
+
+
+ + ♻ ☆ COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy + Prediction CVPR2024 + + +
+ The autonomous driving community has shown significant interest in 3D +occupancy prediction, driven by its exceptional geometric perception and +general object recognition capabilities. To achieve this, current works try to +construct a Tri-Perspective View (TPV) or Occupancy (OCC) representation +extending from the Bird-Eye-View perception. However, compressed views like TPV +representation lose 3D geometry information while raw and sparse OCC +representation requires heavy but redundant computational costs. To address the +above limitations, we propose Compact Occupancy TRansformer (COTR), with a +geometry-aware occupancy encoder and a semantic-aware group decoder to +reconstruct a compact 3D OCC representation. The occupancy encoder first +generates a compact geometrical OCC feature through efficient explicit-implicit +view transformation. Then, the occupancy decoder further enhances the semantic +discriminability of the compact OCC representation by a coarse-to-fine semantic +grouping strategy. Empirical experiments show that there are evident +performance gains across multiple baselines, e.g., COTR outperforms baselines +with a relative improvement of 8%-15%, demonstrating the superiority of our +method. + +
+
+ comment: CVPR2024. Code is available at https://github.com/NotACracker/COTR +
+
+
+
+
+ + ♻ ☆ IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain + Adaptation in Semantic Segmentation + + +
+ Despite recent advances in semantic segmentation, an inevitable challenge is +the performance degradation caused by the domain shift in real applications. +Current dominant approach to solve this problem is unsupervised domain +adaptation (UDA). However, the absence of labeled target data in UDA is overly +restrictive and limits performance. To overcome this limitation, a more +practical scenario called semi-supervised domain adaptation (SSDA) has been +proposed. Existing SSDA methods are derived from the UDA paradigm and primarily +focus on leveraging the unlabeled target data and source data. In this paper, +we highlight the significance of exploiting the intra-domain information +between the labeled target data and unlabeled target data. Instead of solely +using the scarce labeled target data for supervision, we propose a novel SSDA +framework that incorporates both Inter and Intra Domain Mixing (IIDM), where +inter-domain mixing mitigates the source-target domain gap and intra-domain +mixing enriches the available target domain information, and the network can +capture more domain-invariant features. We also explore different domain mixing +strategies to better exploit the target domain information. Comprehensive +experiments conducted on the GTA5 to Cityscapes and SYNTHIA to Cityscapes +benchmarks demonstrate the effectiveness of IIDM, surpassing previous methods +by a large margin. + +
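+
+ The mixing idea can be illustrated with a generic CutMix-style operation that
+pastes a rectangular region of one image and its label map into another; it
+can be applied between domains (inter-domain) or within the target domain
+(intra-domain). The region size and shape are assumptions here, not the exact
+IIDM strategy.
+
+ import numpy as np
+
+ def rect_mix(img_a, lbl_a, img_b, lbl_b, rng=None):
+     """img_*: HxWxC arrays, lbl_*: HxW label maps; returns a mixed pair."""
+     if rng is None:
+         rng = np.random.default_rng()
+     h, w = lbl_a.shape
+     ch, cw = h // 2, w // 2                  # mixed region size (assumed)
+     top = int(rng.integers(0, h - ch + 1))
+     left = int(rng.integers(0, w - cw + 1))
+     img, lbl = img_a.copy(), lbl_a.copy()
+     img[top:top + ch, left:left + cw] = img_b[top:top + ch, left:left + cw]
+     lbl[top:top + ch, left:left + cw] = lbl_b[top:top + ch, left:left + cw]
+     return img, lbl
+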
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ WWW: A Unified Framework for Explaining What, Where and Why of Neural + Networks by Interpretation of Neuron Concepts CVPR 2024 + + +
+ Recent advancements in neural networks have showcased their remarkable +capabilities across various domains. Despite these successes, the "black box" +problem still remains. Addressing this, we propose a novel framework, WWW, that +offers the 'what', 'where', and 'why' of the neural network decisions in +human-understandable terms. Specifically, WWW utilizes adaptive selection for +concept discovery, employing adaptive cosine similarity and thresholding +techniques to effectively explain 'what'. To address the 'where' and 'why', we +proposed a novel combination of neuron activation maps (NAMs) with Shapley +values, generating localized concept maps and heatmaps for individual inputs. +Furthermore, WWW introduces a method for predicting uncertainty, leveraging +heatmap similarities to estimate 'how' reliable the prediction is. Experimental +evaluations of WWW demonstrate superior performance in both quantitative and +qualitative metrics, outperforming existing methods in interpretability. WWW +provides a unified solution for explaining 'what', 'where', and 'why', +introducing a method for localized explanations from global interpretations and +offering a plug-and-play solution adaptable to various architectures. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Samba: Semantic Segmentation of Remotely Sensed Images with State Space + Model + + +
+ High-resolution remotely sensed images pose a challenge for commonly used +semantic segmentation methods such as Convolutional Neural Network (CNN) and +Vision Transformer (ViT). CNN-based methods struggle with handling such +high-resolution images due to their limited receptive field, while ViT faces +challenges in handling long sequences. Inspired by Mamba, which adopts a State +Space Model (SSM) to efficiently capture global semantic information, we +propose a semantic segmentation framework for high-resolution remotely sensed +images, named Samba. Samba utilizes an encoder-decoder architecture, with Samba +blocks serving as the encoder for efficient multi-level semantic information +extraction, and UperNet functioning as the decoder. We evaluate Samba on the +LoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance +against top-performing CNN and ViT methods. The results reveal that Samba +achieved unparalleled performance on commonly used remote sensing datasets for +semantic segmentation. Our proposed Samba demonstrates for the first time the +effectiveness of SSM in semantic segmentation of remotely sensed images, +setting a new benchmark in performance for Mamba-based techniques in this +specific application. The source code and baseline implementations are +available at https://github.com/zhuqinfeng1999/Samba. + +
+
+
+
+
+ + ♻ ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the overall network's performance, which can be +jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, having the mean prediction error of 29.69 pixels, +which is relatively small compared to the $1280{\times}720$ resolution of the +scene camera. + +
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis
+(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D
+Gaussian Splatting (3DGS) is the quality of the training images. However,
+conventional RGB cameras are susceptible to motion blur. In contrast,
+neuromorphic cameras like event and spike cameras inherently capture more
+comprehensive temporal information, which can provide a sharp representation of
+the scene as additional training data. Recent methods have explored the
+integration of event cameras to improve the quality of NVS. The event-RGB
+approaches have some limitations, such as high training costs and the inability
+to work effectively in the background. Instead, our study introduces a new
+method that uses the spike camera to overcome these limitations. By considering
+texture reconstruction from spike streams as ground truth, we design the
+Texture from Spike (TfS) loss. Since the spike camera relies on temporal
+integration instead of the temporal differentiation used by event cameras, our
+proposed TfS loss maintains manageable training costs. It handles foreground
+objects with backgrounds simultaneously. We also provide a real-world dataset
+captured with our spike-RGB camera system to facilitate future research
+endeavors. We conduct extensive experiments using synthetic and real-world
+datasets to demonstrate that our design can enhance novel view synthesis across
+NeRF and 3DGS. The code and dataset will be made available for public access.
+
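+
+ The texture reconstruction by temporal integration can be sketched as a
+per-pixel firing rate over a spike window, with an L1 comparison standing in
+for the TfS loss; this is illustrative only and not the paper's exact
+formulation.
+
+ import numpy as np
+
+ def texture_from_spikes(spikes: np.ndarray) -> np.ndarray:
+     """spikes: (T, H, W) binary spike stream -> HxW image in [0, 1]
+     proportional to the per-pixel firing rate over the window."""
+     return spikes.mean(axis=0)
+
+ def tfs_style_loss(rendered: np.ndarray, spikes: np.ndarray) -> float:
+     """L1 distance between a rendered view and the spike-based texture."""
+     return float(np.mean(np.abs(rendered - texture_from_spikes(spikes))))
+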
+
+
+
+
+ + ♻ ☆ Learning Object Permanence from Videos via Latent Imaginations + + +
+ While human infants exhibit knowledge about object permanence from two months +of age onwards, deep-learning approaches still largely fail to recognize +objects' continued existence. We introduce a slot-based autoregressive deep +learning system, the looped location and identity tracking model Loci-Looped, +which learns to adaptively fuse latent imaginations with pixel-space +observations into consistent latent object-specific what and where encodings +over time. The novel loop empowers Loci-Looped to learn the physical concepts +of object permanence, directional inertia, and object solidity through +observation alone. As a result, Loci-Looped tracks objects through occlusions, +anticipates their reappearance, and shows signs of surprise and internal +revisions when observing implausible object behavior. Notably, Loci-Looped +outperforms state-of-the-art baseline models in handling object occlusions and +temporary sensory interruptions while exhibiting more compositional, +interpretable internal activity patterns. Our work thus introduces the first +self-supervised interpretable learning model that learns about object +permanence directly from video data without supervision. + +
+
+
+
+
+ + ♻ ☆ VSCode: General Visual Salient and Camouflaged Object Detection with 2D + Prompt Learning CVPR2024 + + +
+ Salient object detection (SOD) and camouflaged object detection (COD) are
+related yet distinct binary mapping tasks. These tasks involve multiple
+modalities, sharing commonalities and unique cues. Existing research often
+employs intricate task-specific specialist models, potentially leading to
+redundancy and suboptimal results. We introduce VSCode, a generalist model with
+novel 2D prompt learning, to jointly address four SOD tasks and three COD
+tasks. We utilize VST as the foundation model and introduce 2D prompts within
+the encoder-decoder architecture to learn domain- and task-specific knowledge
+on two separate dimensions. A prompt discrimination loss helps disentangle
+peculiarities to benefit model optimization. VSCode outperforms
+state-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot
+generalization to unseen tasks by combining 2D prompts, such as RGB-D COD.
+Source code is available at https://github.com/Sssssuperior/VSCode.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Extended Reality for Mental Health Evaluation -A Scoping Review + + +
+ Mental health disorders are the leading cause of health-related problems
+globally. It is projected that mental health disorders will be the leading
+cause of morbidity among adults as the incidence rates of anxiety and
+depression grow globally. Recently, extended reality (XR), a general term
+covering virtual reality (VR), augmented reality (AR) and mixed reality (MR),
+is paving a new way to deliver mental health care. In this paper, we conduct a
+scoping review on the development and application of XR in the area of mental
+disorders. We performed a scoping database search to identify the relevant
+studies indexed in Google Scholar, PubMed, and the ACM Digital Library. A
+search period between August 2016 and December 2023 was defined to select
+articles related to the usage of VR, AR, and MR in a mental health context. We
+identified a total of 85 studies from 27 countries across the globe. By
+performing data analysis, we found that most of the studies focused on
+developed countries such as the US (16.47%) and Germany (12.94%). None of the
+studies were from African countries. The majority of the articles reported that
+XR techniques led to a significant reduction in symptoms of anxiety or
+depression. The largest number of studies was published in 2021, i.e., 31.76%
+(n = 31). This could indicate that mental disorder intervention received
+greater attention when COVID-19 emerged. Most studies (n = 65) focused on a
+population between 18 and 65 years old, while only a few studies focused on
+teenagers (n = 2). Also, more studies were done experimentally (n = 67, 78.82%)
+rather than by analytical and modeling approaches (n = 8, 9.41%). This shows
+that there is a rapid development of XR technology for mental health care.
+Furthermore, these studies showed that XR technology can be used to evaluate
+mental disorders as effectively as, or better than, conventional approaches.
+
+
+
+
+
+ + ♻ ☆ VST++: Efficient and Stronger Visual Saliency Transformer + + +
+ While previous CNN-based models have exhibited promising results for salient +object detection (SOD), their ability to explore global long-range dependencies +is restricted. Our previous work, the Visual Saliency Transformer (VST), +addressed this constraint from a transformer-based sequence-to-sequence +perspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task +transformer decoder that concurrently predicts saliency and boundary outcomes +in a pure transformer architecture. Moreover, we introduced a novel token +upsampling method called reverse T2T for predicting a high-resolution saliency +map effortlessly within transformer-based structures. Building upon the VST +model, we further propose an efficient and stronger VST version in this work, +i.e. VST++. To mitigate the computational costs of the VST model, we propose a +Select-Integrate Attention (SIA) module, partitioning foreground into +fine-grained segments and aggregating background information into a single +coarse-grained token. To incorporate 3D depth information with low cost, we +design a novel depth position encoding method tailored for depth maps. +Furthermore, we introduce a token-supervised prediction loss to provide +straightforward guidance for the task-related tokens. We evaluate our VST++ +model across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD +benchmark datasets. Experimental results show that our model outperforms +existing methods while achieving a 25% reduction in computational costs without +significant performance compromise. The demonstrated strong ability for +generalization, enhanced performance, and heightened efficiency of our VST++ +model highlight its potential. + +
+
+
+
+
+ + ♻ ☆ Towards Reliable Medical Image Segmentation by utilizing Evidential + Calibrated Uncertainty + + +
+ Medical image segmentation is critical for disease diagnosis and treatment
+assessment. However, concerns regarding the reliability of segmentation regions
+persist among clinicians, mainly attributed to the absence of confidence
+assessment, robustness, and calibration to accuracy. To address this, we
+introduce DEviS, an easily implementable foundational model that seamlessly
+integrates into various medical image segmentation networks. DEviS not only
+enhances the calibration and robustness of baseline segmentation accuracy but
+also provides high-efficiency uncertainty estimation for reliable predictions.
+By leveraging subjective logic theory, we explicitly model probability and
+uncertainty for the problem of medical image segmentation. Here, the Dirichlet
+distribution parameterizes the distribution of probabilities for different
+classes of the segmentation results. To generate calibrated predictions and
+uncertainty, we develop a trainable calibrated uncertainty penalty.
+Furthermore, DEviS incorporates an uncertainty-aware filtering module, which
+utilizes the metric of uncertainty-calibrated error to filter reliable data
+within the dataset. We conducted validation studies to assess both the accuracy
+and robustness of DEviS segmentation, along with evaluating the efficiency and
+reliability of uncertainty estimation. These evaluations were performed using
+publicly available datasets including ISIC2018, LiTS2017, and BraTS2019.
+Additionally, two potential clinical applications are evaluated on the Johns
+Hopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate its efficacy in
+filtering high-quality or out-of-distribution data. Our code has been released
+at https://github.com/Cocofeat/DEviS.
+
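+ The subjective-logic component has a standard closed form: non-negative
+evidence is mapped to Dirichlet concentration parameters, and the total
+Dirichlet strength yields both the expected class probabilities and a per-pixel
+uncertainty. The sketch below shows only that generic recipe; the calibrated
+uncertainty penalty and filtering module of DEviS are not reproduced.
+
+import torch
+import torch.nn.functional as F
+
+def evidential_outputs(logits):
+    """logits: (B, K, H, W) raw segmentation scores for K classes."""
+    evidence = F.softplus(logits)              # non-negative evidence per class
+    alpha = evidence + 1.0                     # Dirichlet concentration parameters
+    strength = alpha.sum(dim=1, keepdim=True)  # total Dirichlet strength S
+    prob = alpha / strength                    # expected class probabilities
+    uncertainty = logits.shape[1] / strength   # subjective-logic vacuity u = K / S
+    return prob, uncertainty
+
+prob, unc = evidential_outputs(torch.randn(1, 4, 8, 8))
+print(prob.sum(dim=1).mean().item(), unc.min().item(), unc.max().item())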
+
+ comment: 34 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Analyzing the Internals of Neural Radiance Fields CVPR + + +
+ Modern Neural Radiance Fields (NeRFs) learn a mapping from position to +volumetric density leveraging proposal network samplers. In contrast to the +coarse-to-fine sampling approach with two NeRFs, this offers significant +potential for acceleration using lower network capacity. Given that NeRFs +utilize most of their network capacity to estimate radiance, they could store +valuable density information in their parameters or their deep features. To +investigate this proposition, we take one step back and analyze large, trained +ReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation +visualization method, we find that trained NeRFs, Mip-NeRFs and proposal +network samplers map samples with high density to local minima along a ray in +activation feature space. We show how these large MLPs can be accelerated by +transforming intermediate activations to a weight estimate, without any +modifications to the training protocol or the network architecture. With our +approach, we can reduce the computational requirements of trained NeRFs by up +to 50% with only a slight hit in rendering quality. Extensive experimental +evaluation on a variety of datasets and architectures demonstrates the +effectiveness of our approach. Consequently, our methodology provides valuable +insight into the inner workings of NeRFs. + +
+
+ comment: Accepted to CVPRW'24! Project Page: + https://r4dl.github.io/nerfinternals/ +
+
+
+
+
+ + ♻ ☆ S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering CVPR2024 + + +
+ Anchor-based large-scale multi-view clustering has attracted considerable
+attention for its effectiveness in handling massive datasets. However, current
+methods mainly seek the consensus embedding feature for clustering by exploring
+global correlations between anchor graphs or projection matrices. In this
+paper, we propose a simple yet efficient scalable multi-view tensor clustering
+(S^2MVTC) approach, where our focus is on learning correlations of embedding
+features within and across views. Specifically, we first construct the
+embedding feature tensor by stacking the embedding features of different views
+into a tensor and rotating it. Additionally, we build a novel tensor
+low-frequency approximation (TLFA) operator, which incorporates graph
+similarity into embedding feature learning, efficiently achieving smooth
+representation of embedding features within different views. Furthermore,
+consensus constraints are applied to embedding features to ensure inter-view
+semantic consistency. Experimental results on six large-scale multi-view
+datasets demonstrate that S^2MVTC significantly outperforms state-of-the-art
+algorithms in terms of clustering performance and CPU execution time,
+especially when handling massive data. The code of S^2MVTC is publicly
+available at https://github.com/longzhen520/S2MVTC.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that +poses significant challenges. Previous studies have primarily focused on +scenarios such as Domain Incremental Learning and Class Incremental Learning, +which do not fully capture the complexity of real-world applications. In this +paper, we study the problem of classification of medical imaging in the +scenario termed New Instances and New Classes, which combines the challenges of +both new class arrivals and domain shifts in a single framework. Unlike +traditional scenarios, it reflects the realistic nature of CL in domains such +as medical imaging, where updates may introduce both new classes and changes in +domain characteristics. To address the unique challenges posed by this complex +scenario, we introduce a novel approach called Pseudo-Label Replay. This method +aims to mitigate forgetting while adapting to new classes and domain shifts by +combining the advantages of the Replay and Pseudo-Label methods and solving +their limitations in the proposed scenario. We evaluate our proposed approach +on a challenging benchmark consisting of two datasets, seven tasks, and +nineteen classes, modeling a realistic Continual Learning scenario. Our +experimental findings demonstrate the effectiveness of Pseudo-Label Replay in +addressing the challenges posed by the complex scenario proposed. Our method +surpasses existing approaches, exhibiting superior performance while showing +minimal forgetting. + +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a +comprehensive understanding of objects across scenes but also a method to +capture the temporal motions and interactions with different objects. Moreover, +the long-tailed distribution of visual relationships is a crucial bottleneck +for most dynamic SGG methods. This is because many of them focus on capturing +spatio-temporal context using complex architectures, leading to the generation +of biased scene graphs. To address these challenges, we propose +\textsc{FloCoDe}: \textbf{Flo}w-aware Temporal Consistency and +\textbf{Co}rrelation \textbf{De}biasing with uncertainty attenuation for +unbiased dynamic scene graphs. \textsc{FloCoDe} employs feature warping using +flow to detect temporally consistent objects across frames. To address the +long-tail issue of visual relationships, we propose correlation debiasing and a +label correlation-based loss to learn unbiased relation representations for +long-tailed classes. Specifically, we propose to incorporate label correlations +using contrastive loss to capture commonly co-occurring relations, which aids +in learning robust representations for long-tailed classes. Further, we adopt +the uncertainty attenuation-based classifier framework to handle noisy +annotations in the SGG data. Extensive experimental evaluation shows a +performance gain as high as 4.1\%, demonstrating the superiority of generating +more unbiased scene graphs. + +
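+ The flow-aware consistency step relies on a standard operation: warping the
+previous frame's features along the optical flow and comparing them with the
+current features. A minimal version with torch.nn.functional.grid_sample is
+shown below, assuming the flow is given in pixel units from frame t to frame
+t-1; FloCoDe's actual consistency objective is not reproduced here.
+
+import torch
+import torch.nn.functional as F
+
+def warp_features(feat_prev, flow):
+    """feat_prev: (B, C, H, W) features of frame t-1.
+    flow: (B, 2, H, W) flow from frame t to t-1, in pixels (x, y)."""
+    b, _, h, w = flow.shape
+    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+    base = torch.stack([xs, ys], dim=0).float().to(flow.device)   # (2, H, W)
+    coords = base.unsqueeze(0) + flow                             # where to sample in t-1
+    # normalize to [-1, 1] for grid_sample, channel order (x, y)
+    gx = 2.0 * coords[:, 0] / (w - 1) - 1.0
+    gy = 2.0 * coords[:, 1] / (h - 1) - 1.0
+    grid = torch.stack([gx, gy], dim=-1)                          # (B, H, W, 2)
+    return F.grid_sample(feat_prev, grid, align_corners=True)
+
+warped = warp_features(torch.randn(1, 16, 32, 32), torch.zeros(1, 2, 32, 32))
+print(warped.shape)  # zero flow returns (approximately) the input features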
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ Test-Time Zero-Shot Temporal Action Localization CVPR 2024 + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
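+ The first step, a video-level pseudo-label obtained by aggregating frame
+scores from a vision-language model, reduces to a few lines once frame and
+class-name embeddings are available. The sketch assumes CLIP-style,
+L2-normalized embeddings and simple mean aggregation; T3AL's later localization
+and captioning-based refinement steps are not shown.
+
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def video_pseudo_label(frame_feats, text_feats):
+    """frame_feats: (T, D) embeddings of the video frames.
+    text_feats: (C, D) embeddings of the candidate action classes."""
+    sims = frame_feats @ text_feats.t()   # (T, C) frame-to-class similarities
+    video_scores = sims.mean(dim=0)       # aggregate over the entire video
+    return video_scores.argmax().item(), video_scores
+
+frame_feats = F.normalize(torch.randn(120, 512), dim=-1)
+text_feats = F.normalize(torch.randn(20, 512), dim=-1)
+label, scores = video_pseudo_label(frame_feats, text_feats)
+print(label, scores.shape)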
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Finding Regions of Interest in Whole Slide Images Using Multiple + Instance Learning + + +
+ Whole Slide Images (WSI), obtained by high-resolution digital scanning of
+microscope slides at multiple scales, are the cornerstone of modern Digital
+Pathology. However, they represent a particular challenge to
+AI-based/AI-mediated analysis because pathology labeling is typically done at
+the slide level instead of the tile level. It is not just that medical
+diagnostics are recorded at the specimen level; the detection of oncogene
+mutations is also experimentally obtained, and recorded by initiatives like The
+Cancer Genome Atlas (TCGA), at the slide level. This configures a dual
+challenge: a) accurately predicting the overall cancer phenotype and b) finding
+out which cellular morphologies are associated with it at the tile level. To
+address these challenges, a weakly supervised Multiple Instance Learning (MIL)
+approach was explored for two prevalent cancer types, Invasive Breast Carcinoma
+(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was
+explored for tumor detection at low magnification levels and for TP53 mutations
+at various levels. Our results show that a novel additive implementation of MIL
+matched the performance of the reference implementation (AUC 0.96), and was
+only slightly outperformed by Attention MIL (AUC 0.97). More interestingly from
+the perspective of the molecular pathologist, these different AI architectures
+identify distinct sensitivities to morphological features (through the
+detection of Regions of Interest, RoI) at different amplification levels.
+Tellingly, TP53 mutation was most sensitive to features at the higher
+amplification levels, where cellular morphology is resolved.
+
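+ Attention-based MIL pooling over tile features follows a well-known
+formulation (Ilse et al.); the plain, ungated variant is sketched below with
+precomputed tile features, and the top-weighted tiles play the role of the
+Regions of Interest discussed above. The additive MIL variant is not shown.
+
+import torch
+import torch.nn as nn
+
+class AttentionMIL(nn.Module):
+    """Slide-level prediction from a bag of tile features."""
+    def __init__(self, in_dim=512, hidden=128, n_classes=2):
+        super().__init__()
+        self.attn = nn.Sequential(nn.Linear(in_dim, hidden), nn.Tanh(),
+                                  nn.Linear(hidden, 1))
+        self.classifier = nn.Linear(in_dim, n_classes)
+
+    def forward(self, tiles):                       # tiles: (N_tiles, in_dim)
+        a = torch.softmax(self.attn(tiles), dim=0)  # (N_tiles, 1) attention weights
+        slide_feat = (a * tiles).sum(dim=0)         # weighted average = slide embedding
+        return self.classifier(slide_feat), a.squeeze(-1)
+
+model = AttentionMIL()
+logits, weights = model(torch.randn(1000, 512))
+print(logits.shape, weights.topk(5).indices)  # top-weighted tiles ~ candidate RoIs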
+
+
+
+
+ + ♻ ☆ Hierarchical Invariance for Robust and Interpretable Vision Tasks at + Larger Scales + + +
+ Developing robust and interpretable vision systems is a crucial step towards +trustworthy artificial intelligence. In this regard, a promising paradigm +considers embedding task-required invariant structures, e.g., geometric +invariance, in the fundamental image representation. However, such invariant +representations typically exhibit limited discriminability, limiting their +applications in larger-scale trustworthy vision tasks. For this open problem, +we conduct a systematic investigation of hierarchical invariance, exploring +this topic from theoretical, practical, and application perspectives. At the +theoretical level, we show how to construct over-complete invariants with a +Convolutional Neural Networks (CNN)-like hierarchical architecture yet in a +fully interpretable manner. The general blueprint, specific definitions, +invariant properties, and numerical implementations are provided. At the +practical level, we discuss how to customize this theoretical framework into a +given task. With the over-completeness, discriminative features w.r.t. the task +can be adaptively formed in a Neural Architecture Search (NAS)-like manner. We +demonstrate the above arguments with accuracy, invariance, and efficiency +results on texture, digit, and parasite classification experiments. +Furthermore, at the application level, our representations are explored in +real-world forensics tasks on adversarial perturbations and Artificial +Intelligence Generated Content (AIGC). Such applications reveal that the +proposed strategy not only realizes the theoretically promised invariance, but +also exhibits competitive discriminability even in the era of deep learning. +For robust and interpretable vision tasks at larger scales, hierarchical +invariant representation can be considered as an effective alternative to +traditional CNN and invariants. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), still remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only face of human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., entire body parts of human but also with a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval + + +
+ State-of-the-art video-text retrieval (VTR) methods typically involve fully
+fine-tuning a pre-trained model (e.g. CLIP) on specific datasets. However, this
+can result in significant storage costs in practical applications as a separate
+model per task must be stored. To address this issue, we present our pioneering
+work that enables parameter-efficient VTR using a pre-trained model, with only
+a small number of tunable parameters during training. Towards this goal, we
+propose a new method dubbed Multimodal Video Adapter (MV-Adapter) for
+efficiently transferring the knowledge in the pre-trained CLIP from image-text
+to video-text. Specifically, MV-Adapter utilizes bottleneck structures in both
+video and text branches, along with two novel components. The first is a
+Temporal Adaptation Module that is incorporated in the video branch to
+introduce global and local temporal contexts. We also train weight calibrations
+to adjust to dynamic variations across frames. The second is Cross Modality
+Tying that generates weights for video/text branches through sharing cross
+modality factors, for better alignment between modalities. Thanks to the above
+innovations, MV-Adapter can achieve comparable or better performance than
+standard full fine-tuning with negligible parameter overhead. Notably,
+MV-Adapter consistently outperforms various competing methods in V2T/T2V tasks
+with large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC,
+DiDemo, and ActivityNet).
+
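+ The bottleneck structure that parameter-efficient adapters build on is small
+enough to show in full. The sketch below is a generic residual adapter with an
+assumed reduction factor and zero-initialized up-projection; MV-Adapter's
+Temporal Adaptation Module and Cross Modality Tying are not reproduced.
+
+import torch
+import torch.nn as nn
+
+class BottleneckAdapter(nn.Module):
+    """Down-project, non-linearity, up-project, residual add; only these weights train."""
+    def __init__(self, dim=768, reduction=4):
+        super().__init__()
+        hidden = dim // reduction
+        self.down = nn.Linear(dim, hidden)
+        self.up = nn.Linear(hidden, dim)
+        self.act = nn.GELU()
+        nn.init.zeros_(self.up.weight)  # start as identity so the frozen model is unchanged
+        nn.init.zeros_(self.up.bias)
+
+    def forward(self, x):
+        return x + self.up(self.act(self.down(x)))
+
+x = torch.randn(2, 77, 768)  # e.g. a sequence of frame or token features
+print(BottleneckAdapter()(x).shape)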
+
+
+
+
+ + ♻ ☆ Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks CVPR2024 + + +
+ Diffusion models trained on large-scale datasets have achieved remarkable +progress in image synthesis. However, due to the randomness in the diffusion +process, they often struggle with handling diverse low-level tasks that require +details preservation. To overcome this limitation, we present a new Diff-Plugin +framework to enable a single pre-trained diffusion model to generate +high-fidelity results across a variety of low-level tasks. Specifically, we +first propose a lightweight Task-Plugin module with a dual branch design to +provide task-specific priors, guiding the diffusion process in preserving image +content. We then propose a Plugin-Selector that can automatically select +different Task-Plugins based on the text instruction, allowing users to edit +images by indicating multiple low-level tasks with natural language. We conduct +extensive experiments on 8 low-level vision tasks. The results demonstrate the +superiority of Diff-Plugin over existing methods, particularly in real-world +scenarios. Our ablations further validate that Diff-Plugin is stable, +schedulable, and supports robust training across different dataset sizes. + +
+
+ comment: Accepted to CVPR2024. Replaced some celebrity images to avoid + copyright disputes +
+
+
+
+
+ + ♻ ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous
+driving systems. The recent mainstream methods follow a static paradigm, which
+predicts the future trajectory by using a fixed duration of historical frames.
+These methods make the predictions independently even at adjacent time steps,
+which leads to potential instability and temporal inconsistency. As successive
+time steps have largely overlapping historical frames, their forecasts should
+have intrinsic correlation; for example, overlapping predicted trajectories
+should be consistent, or differ but share the same motion goal depending on the
+road situation. Motivated by this, in this work, we introduce HPNet, a novel
+dynamic trajectory forecasting method. Aiming for stable and accurate
+trajectory forecasting, our method leverages not only historical frames
+including maps and agent states, but also historical predictions. Specifically,
+we newly design a Historical Prediction Attention module to automatically
+encode the dynamic relationship between successive predictions. Besides, it
+also extends the attention range beyond the currently visible window,
+benefitting from the use of historical predictions. The proposed Historical
+Prediction Attention together with the Agent Attention and Mode Attention is
+further formulated as the Triple Factorized Attention module, serving as the
+core design of HPNet. Experiments on the Argoverse and INTERACTION datasets
+show that HPNet achieves state-of-the-art performance, and generates accurate
+and stable future trajectories. Our code is available at
+https://github.com/XiaolongTang23/HPNet.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in + Conversations with Multimodal Language Models SemEval '24 + + +
+ This paper presents our winning submission to Subtask 2 of SemEval 2024 Task +3 on multimodal emotion cause analysis in conversations. We propose a novel +Multimodal Emotion Recognition and Multimodal Emotion Cause Extraction +(MER-MCE) framework that integrates text, audio, and visual modalities using +specialized emotion encoders. Our approach sets itself apart from +top-performing teams by leveraging modality-specific features for enhanced +emotion understanding and causality inference. Experimental evaluation +demonstrates the advantages of our multimodal approach, with our submission +achieving a competitive weighted F1 score of 0.3435, ranking third with a +margin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team. +Project: https://github.com/MIPS-COLT/MER-MCE.git + +
+
+ comment: Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st & + 2nd by 0.0339 & 0.0025 +
+
+
+
+
+ + ♻ ☆ DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video + Generation + + +
+ World models have demonstrated superiority in autonomous driving,
+particularly in the generation of multi-view driving videos. However,
+significant challenges still exist in generating customized driving videos. In
+this paper, we propose DriveDreamer-2, which builds upon the framework of
+DriveDreamer and incorporates a Large Language Model (LLM) to generate
+user-defined driving videos. Specifically, an LLM interface is initially
+incorporated to convert a user's query into agent trajectories. Subsequently,
+an HDMap, adhering to traffic regulations, is generated based on the
+trajectories. Ultimately, we propose the Unified Multi-View Model to enhance
+temporal and spatial coherence in the generated driving videos. DriveDreamer-2
+is the first world model to generate customized driving videos; it can generate
+uncommon driving videos (e.g., vehicles abruptly cutting in) in a user-friendly
+manner. Besides, experimental results demonstrate that the generated videos
+enhance the training of driving perception methods (e.g., 3D detection and
+tracking). Furthermore, the video generation quality of DriveDreamer-2
+surpasses that of other state-of-the-art methods, showcasing FID and FVD scores
+of 11.2 and 55.7, representing relative improvements of 30% and 50%.
+
+
+ comment: Project Page: https://drivedreamer2.github.io +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized
+for image processing tasks and is particularly suitable for deployment on
+neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for
+image processing, which combines the power of Spiking Neural Networks (SNNs)
+with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two
+primary challenges: ensuring high-fidelity information propagation through the
+network via spikes and formulating an effective training strategy. To address
+the issue of information loss, we introduce multi-threshold spiking neurons,
+which improve the efficiency of information transmission within the
+Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning
+pipeline that leverages pre-trained U-Net models. During the conversion process,
+significant variability in data distribution across different parts is observed
+when utilizing skip connections. Therefore, we propose a connection-wise
+normalization method to prevent inaccurate firing rates. Furthermore, we adopt
+a flow-based training method to fine-tune the converted models, reducing time
+steps while preserving performance. Experimental results show that, on image
+segmentation and denoising, our Spiking-UNet achieves comparable performance to
+its non-spiking counterpart, surpassing existing SNN methods. Compared with the
+converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference
+time by approximately 90\%. This research broadens the application scope of
+SNNs in image processing and is expected to inspire further exploration in the
+field of neuromorphic engineering. The code for our Spiking-UNet implementation
+is available at https://github.com/SNNresearch/Spiking-UNet.
+
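+ A multi-threshold spiking neuron can be pictured as an integrate-and-fire
+unit that may emit several spikes (a graded value) in one time step when its
+membrane potential crosses several thresholds. The sketch below is a simplified
+reading with assumed thresholds and a soft reset; it is not the neuron model or
+conversion pipeline of the paper.
+
+import torch
+
+def multi_threshold_if(inputs, thresholds=(1.0, 2.0, 3.0)):
+    """Integrate-and-fire over time with multiple firing levels.
+    inputs: (T, B, C) input currents; returns per-step spike counts of the same shape."""
+    th = torch.tensor(thresholds)
+    v = torch.zeros_like(inputs[0])
+    out = []
+    for x in inputs:
+        v = v + x
+        spikes = (v.unsqueeze(-1) >= th).sum(dim=-1).float()  # thresholds crossed this step
+        v = v - spikes * th[0]   # soft reset by the base threshold per emitted spike
+        out.append(spikes)
+    return torch.stack(out)
+
+spk = multi_threshold_if(torch.rand(8, 2, 4) * 2.0)
+print(spk.shape, spk.max().item())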
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis SIGGRAPH 2024 + + +
+ We introduce GEM3D -- a new deep, topology-aware generative model of 3D +shapes. The key ingredient of our method is a neural skeleton-based +representation encoding information on both shape topology and geometry. +Through a denoising diffusion probabilistic model, our method first generates +skeleton-based representations following the Medial Axis Transform (MAT), then +generates surfaces through a skeleton-driven neural implicit formulation. The +neural implicit takes into account the topological and geometric information +stored in the generated skeleton representations to yield surfaces that are +more topologically and geometrically accurate compared to previous neural field +formulations. We discuss applications of our method in shape synthesis and +point cloud reconstruction tasks, and evaluate our method both qualitatively +and quantitatively. We demonstrate significantly more faithful surface +reconstruction and diverse shape generation results compared to the +state-of-the-art, also involving challenging scenarios of reconstructing and +synthesizing structurally complex, high-genus shape surfaces from Thingi10K and +ShapeNet. + +
+
+ comment: Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to + SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to + sigconf; rearranged figures for readability; added missing citations; fixed + misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote + discussion; added categories averages to Tab. 8; added Tab. 10 with model + capacities +
+
+
+
+
+ + ♻ ☆ Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation + + +
+ Integrating information from multiple modalities enhances the robustness of
+scene perception systems in autonomous vehicles, providing a more comprehensive
+and reliable sensory framework. However, the modality incompleteness in
+multi-modal segmentation remains under-explored. In this work, we establish a
+task called Modality-Incomplete Scene Segmentation (MISS), which encompasses
+both system-level modality absence and sensor-level modality errors. To avoid
+the predominant modality reliance in multi-modal fusion, we introduce a
+Missing-aware Modal Switch (MMS) strategy to proactively manage missing
+modalities during training. Utilizing bit-level batch-wise sampling enhances
+the model's performance in both complete and incomplete testing scenarios.
+Furthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate
+representative spectral information into a limited number of learnable prompts
+that maintain robustness against all MISS scenarios, achieving an effect akin
+to fine-tuning but with far fewer tunable parameters (1.1%). Extensive
+experiments prove the efficacy of our proposed approach, showcasing an
+improvement of 5.84% mIoU over the prior state-of-the-art parameter-efficient
+methods in the modality-missing setting. The source code is publicly available
+at https://github.com/RuipingL/MISS.
+
+
+ comment: Accepted to IEEE IV 2024. The source code is publicly available at + https://github.com/RuipingL/MISS +
+
+
+
+
+ + ♻ ☆ Tensor Decomposition Based Attention Module for Spiking Neural Networks + + +
+ The attention mechanism has been proven to be an effective way to improve
+spiking neural networks (SNNs). However, based on the fact that the current SNN
+input data flow is split into tensors to process on GPUs, none of the previous
+works consider the properties of tensors to implement an attention module. This
+inspires us to rethink current SNNs from the perspective of tensor-relevant
+theories. Using tensor decomposition, we design the \textit{projected full
+attention} (PFA) module, which demonstrates excellent results with linearly
+growing parameters. Specifically, PFA is composed of the \textit{linear
+projection of spike tensor} (LPST) module and the \textit{attention map
+composing} (AMC) module. In LPST, we start by compressing the original spike
+tensor into three projected tensors using a single property-preserving strategy
+with learnable parameters for each dimension. Then, in AMC, we exploit the
+inverse procedure of the tensor decomposition process to combine the three
+tensors into the attention map using a so-called connecting factor. To validate
+the effectiveness of the proposed PFA module, we integrate it into the widely
+used VGG and ResNet architectures for classification tasks. Our method achieves
+state-of-the-art performance on both static and dynamic benchmark datasets,
+surpassing the existing SNN models with Transformer-based and CNN-based
+backbones.
+
+
+ comment: Accepted by Knowledge-Based Systems +
+
+
+
+
+ + ♻ ☆ TC4D: Trajectory-Conditioned Text-to-4D Generation + + +
+ Recent techniques for text-to-4D generation synthesize dynamic 3D scenes
+using supervision from pre-trained text-to-video models. However, existing
+representations for motion, such as deformation models or time-dependent neural
+representations, are limited in the amount of motion they can generate: they
+cannot synthesize motion extending far beyond the bounding box used for volume
+rendering. The lack of a more flexible motion model contributes to the gap in
+realism between 4D generation methods and recent, near-photorealistic video
+generation models. Here, we propose TC4D: trajectory-conditioned text-to-4D
+generation, which factors motion into global and local components. We represent
+the global motion of a scene's bounding box using rigid transformation along a
+trajectory parameterized by a spline. We learn local deformations that conform
+to the global trajectory using supervision from a text-to-video model. Our
+approach enables the synthesis of scenes animated along arbitrary trajectories,
+compositional scene generation, and significant improvements to the realism and
+amount of generated motion, which we evaluate qualitatively and through a user
+study. Video results can be viewed on our website:
+https://sherwinbahmani.github.io/tc4d.
+
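+ The global part of the factorization, a rigid motion of the scene's bounding
+box along a spline, can be sketched with a Catmull-Rom evaluator. The control
+points and the translation-only placement below are illustrative; orienting the
+box along the trajectory tangent and the learned local deformations are
+omitted.
+
+import torch
+
+def catmull_rom(ctrl, t):
+    """Evaluate a Catmull-Rom spline through control points ctrl: (N, 3), t in [0, 1]."""
+    n = ctrl.shape[0]
+    seg = min(int(t * (n - 3)), n - 4)   # which 4-point window to use
+    u = t * (n - 3) - seg
+    p0, p1, p2, p3 = ctrl[seg], ctrl[seg + 1], ctrl[seg + 2], ctrl[seg + 3]
+    return 0.5 * ((2 * p1) + (-p0 + p2) * u
+                  + (2 * p0 - 5 * p1 + 4 * p2 - p3) * u ** 2
+                  + (-p0 + 3 * p1 - 3 * p2 + p3) * u ** 3)
+
+def place_scene(points, ctrl, t):
+    """Rigidly translate a canonical scene (points: (M, 3)) along the trajectory at time t."""
+    return points + catmull_rom(ctrl, t)
+
+ctrl = torch.tensor([[0., 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [3, 1, 0], [3, 1, 0]])
+print(place_scene(torch.zeros(5, 3), ctrl, 0.5))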
+
+ comment: Project Page: https://sherwinbahmani.github.io/tc4d +
+
+
+
+
+ + ♻ ☆ Exploring Effective Priors and Efficient Models for Weakly-Supervised + Change Detection + + +
+ Weakly-supervised change detection (WSCD) aims to detect pixel-level changes
+with only image-level annotations. Owing to its label efficiency, WSCD is
+drawing increasing attention recently. However, current WSCD methods often
+encounter the challenge of change missing and fabricating, i.e., the
+inconsistency between image-level annotations and pixel-level predictions.
+Specifically, change missing refers to the situation in which the WSCD model
+fails to predict any changed pixels even though the image-level label indicates
+a change, and vice versa for change fabricating. To address this challenge, in
+this work, we leverage global-scale and local-scale priors in WSCD and propose
+two components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.
+The DP decoder decodes samples with the changed image-level label, skips
+samples with the unchanged label, and replaces them with an all-unchanged
+pixel-level label. The LG constraint is derived from the correspondence between
+changed representations and image-level labels, penalizing the model when it
+mispredicts the change status. Additionally, we develop TransWCD, a simple yet
+powerful transformer-based model, showcasing the potential of weakly-supervised
+learning in change detection. By integrating the DP decoder and LG constraint
+into TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL
+achieve significant +6.33% and +9.55% F1 score improvements over the
+state-of-the-art methods on the WHU-CD dataset, respectively. Some performance
+metrics even exceed several fully-supervised change detection (FSCD)
+competitors. Code will be available at
+https://github.com/zhenghuizhao/TransWCD.
+
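+ The label-gating idea, penalizing the model whenever its pixel predictions
+disagree with the image-level change label, can be written as a small auxiliary
+loss. The max-over-pixels aggregation and the log penalties below are assumed
+simplifications, not the LG constraint as formulated in the paper.
+
+import torch
+
+def label_gated_loss(change_probs, image_labels, eps=1e-6):
+    """change_probs: (B, H, W) per-pixel change probabilities.
+    image_labels: (B,) 1 if the image pair is labeled changed, else 0.
+    Penalizes change missing (label 1 but no changed pixel predicted) and
+    change fabricating (label 0 but some pixel predicted as changed)."""
+    max_prob = change_probs.flatten(1).amax(dim=1)   # strongest predicted change
+    pos = -torch.log(max_prob + eps)                 # want at least one changed pixel
+    neg = -torch.log(1.0 - max_prob + eps)           # want no changed pixel at all
+    labels = image_labels.float()
+    return (labels * pos + (1.0 - labels) * neg).mean()
+
+probs = torch.rand(4, 64, 64)
+print(label_gated_loss(probs, torch.tensor([1, 0, 1, 0])).item())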
+
+
+
+
+ + ♻ ☆ One-Prompt to Segment All Medical Images + + +
+ Large foundation models, known for their strong zero-shot generalization,
+have excelled in visual and language applications. However, applying them to
+medical image segmentation, a domain with diverse imaging types and target
+labels, remains an open challenge. Current approaches, such as adapting
+interactive segmentation models like the Segment Anything Model (SAM), require
+user prompts for each sample during inference. Alternatively, transfer learning
+methods like few/one-shot models demand labeled samples, leading to high costs.
+This paper introduces a new paradigm toward universal medical image
+segmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation
+combines the strengths of one-shot and interactive methods. In the inference
+stage, with just \textbf{one prompted sample}, it can adeptly handle the unseen
+task in a single forward pass. We train the One-Prompt Model on 64 open-source
+medical datasets, accompanied by the collection of over 3,000 clinician-labeled
+prompts. Tested on 14 previously unseen datasets, the One-Prompt Model
+showcases superior zero-shot segmentation capabilities, outperforming a wide
+range of related methods. The code and data are released at
+\url{https://github.com/KidsWithTokens/one-prompt}.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2304.12620 +
+
+
+
+
+ + ♻ ☆ Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours + Maps + + +
+ In this work, we describe a method for large-scale 3D cell-tracking through a
+segmentation selection approach. The proposed method is effective at tracking
+cells across large microscopy datasets on two fronts: (i) It can solve problems
+containing millions of segmentation instances in terabyte-scale 3D+t datasets;
+(ii) It achieves competitive results with or without deep learning, which
+requires 3D annotated data that is scarce in the fluorescence microscopy field.
+The proposed method computes cell tracks and segments using a hierarchy of
+segmentation hypotheses and selects disjoint segments by maximizing the overlap
+between adjacent frames. We show that this method achieves state-of-the-art
+results in 3D images from the cell tracking challenge and has a faster integer
+linear programming formulation. Moreover, our framework is flexible and
+supports segmentations from off-the-shelf cell segmentation models and can
+combine them into an ensemble that improves tracking. The code is available at
+https://github.com/royerlab/ultrack.
+
+
+ comment: 13 pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D + Pose Estimation + + +
+ In medical and industrial domains, providing guidance for assembly processes +is critical to ensure efficiency and safety. Errors in assembly can lead to +significant consequences such as extended surgery times, and prolonged +manufacturing or maintenance times in industry. Assembly scenarios can benefit +from in-situ AR visualization to provide guidance, reduce assembly times and +minimize errors. To enable in-situ visualization 6D pose estimation can be +leveraged. Existing 6D pose estimation techniques primarily focus on individual +objects and static captures. However, assembly scenarios have various dynamics +including occlusion during assembly and dynamics in the assembly objects +appearance. Existing work, combining object detection/6D pose estimation and +assembly state detection focuses either on pure deep learning-based approaches, +or limit the assembly state detection to building blocks. To address the +challenges of 6D pose estimation in combination with assembly state detection, +our approach ASDF builds upon the strengths of YOLOv8, a real-time capable +object detection framework. We extend this framework, refine the object pose +and fuse pose knowledge with network-detected pose information. Utilizing our +late fusion in our Pose2State module results in refined 6D pose estimation and +assembly state detection. By combining both pose and state information, our +Pose2State module predicts the final assembly state with precision. Our +evaluation on our ASDF dataset shows that our Pose2State module leads to an +improved assembly state detection and that the improvement of the assembly +state further leads to a more robust 6D pose estimation. Moreover, on the GBOT +dataset, we outperform the pure deep learning-based network, and even +outperform the hybrid and pure tracking-based approaches. + +
+
+
+
+
+ + ♻ ☆ Modality Translation for Object Detection Adaptation Without Forgetting + Prior Knowledge + + +
+ A common practice in deep learning consists of training large neural networks
+on massive datasets to perform accurately for different domains and tasks.
+While this methodology may work well in numerous application areas, it
+transfers poorly across modalities due to the larger distribution shift in data
+captured using different sensors. This paper focuses on the problem of adapting
+a large object detection model to one or multiple modalities while being
+efficient. To do so, we propose ModTr as an alternative to the common approach
+of fine-tuning large models. ModTr consists of adapting the input with a small
+transformation network trained to minimize the detection loss directly. The
+original model can therefore work on the translated inputs without any further
+change or fine-tuning to its parameters. Experimental results on translating
+from IR to RGB images on two well-known datasets show that this simple ModTr
+approach provides detectors that can perform comparably or better than the
+standard fine-tuning without forgetting the original knowledge. This opens the
+door to a more flexible and efficient service-based detection pipeline in
+which, instead of using a different detector for each modality, a unique and
+unaltered server is constantly running, where multiple modalities with the
+corresponding translations can query it. Code:
+https://github.com/heitorrapela/ModTr.
+
+
+
+
+
+ + ♻ ☆ Putting the Object Back into Video Object Segmentation CVPR 2024 + + +
+ We present Cutie, a video object segmentation (VOS) network with object-level +memory reading, which puts the object representation from memory back into the +video object segmentation result. Recent works on VOS employ bottom-up +pixel-level memory reading which struggles due to matching noise, especially in +the presence of distractors, resulting in lower performance in more challenging +data. In contrast, Cutie performs top-down object-level memory reading by +adapting a small set of object queries. Via those, it interacts with the +bottom-up pixel features iteratively with a query-based object transformer (qt, +hence Cutie). The object queries act as a high-level summary of the target +object, while high-resolution feature maps are retained for accurate +segmentation. Together with foreground-background masked attention, Cutie +cleanly separates the semantics of the foreground object from the background. +On the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a +similar running time and improves by 4.2 J&F over DeAOT while being three times +faster. Code is available at: https://hkchengrex.github.io/Cutie + +
+
+ comment: CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie +
+
+
+
+
+ + ♻ ☆ Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images + + +
+ We propose a weakly supervised approach for creating maps using free-form +textual descriptions. We refer to this work of creating textual maps as +zero-shot mapping. Prior works have approached mapping tasks by developing +models that predict a fixed set of attributes using overhead imagery. However, +these models are very restrictive as they can only solve highly specific tasks +for which they were trained. Mapping text, on the other hand, allows us to +solve a large variety of mapping problems with minimal restrictions. To achieve +this, we train a contrastive learning framework called Sat2Cap on a new +large-scale dataset with 6.1M pairs of overhead and ground-level images. For a +given location and overhead image, our model predicts the expected CLIP +embeddings of the ground-level scenery. The predicted CLIP embeddings are then +used to learn about the textual space associated with that location. Sat2Cap is +also conditioned on date-time information, allowing it to model temporally +varying concepts over a location. Our experimental results demonstrate that our +models successfully capture ground-level concepts and allow large-scale mapping +of fine-grained textual queries. Our approach does not require any text-labeled +data, making the training easily scalable. The code, dataset, and models will +be made publicly available. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Learning county from pixels: Corn yield prediction with + attention-weighted multiple instance learning + + +
+ Remote sensing technology has become a promising tool in yield prediction. +Most prior work employs satellite imagery for county-level corn yield +prediction by spatially aggregating all pixels within a county into a single +value, potentially overlooking the detailed information and valuable insights +offered by more granular data. To this end, this research examines each county +at the pixel level and applies multiple instance learning to leverage detailed +information within a county. In addition, our method addresses the "mixed +pixel" issue caused by the inconsistent resolution between feature datasets and +crop mask, which may introduce noise into the model and therefore hinder +accurate yield prediction. Specifically, the attention mechanism is employed to +automatically assign weights to different pixels, which can mitigate the +influence of mixed pixels. The experimental results show that the developed +model outperforms four other machine learning models over the past five years +in the U.S. corn belt and demonstrates its best performance in 2022, achieving +a coefficient of determination (R2) value of 0.84 and a root mean square error +(RMSE) of 0.83. This paper demonstrates the advantages of our approach from +both spatial and temporal perspectives. Furthermore, through an in-depth study +of the relationship between mixed pixels and attention, it is verified that our +approach can capture critical feature information while filtering out noise +from mixed pixels. + +
+
+ comment: I am writing to request the withdrawal of my paper submitted to + arXiv. Upon further review, I have identified an error in the paper that + significantly affects the results and conclusions. To maintain the integrity + of the scientific record and prevent the dissemination of incorrect + information, I believe it is necessary to withdraw the paper from the archive +
+
+
+
+
+ + ♻ ☆ Fooling Contrastive Language-Image Pre-trained Models with + CLIPMasterPrints + + +
+ Models leveraging both visual and textual data such as Contrastive +Language-Image Pre-training (CLIP), are the backbone of many recent advances in +artificial intelligence. In this work, we show that despite their versatility, +such models are vulnerable to what we refer to as fooling master images. +Fooling master images are capable of maximizing the confidence score of a CLIP +model for a significant number of widely varying prompts, while being either +unrecognizable or unrelated to the attacked prompts for humans. The existence +of such images is problematic as it could be used by bad actors to maliciously +interfere with CLIP-trained image retrieval models in production with +comparably small effort as a single image can attack many different prompts. We +demonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined +using stochastic gradient descent, projected gradient descent, or blackbox +optimization. Contrary to many common adversarial attacks, the blackbox +optimization approach allows us to mine CLIPMasterPrints even when the weights +of the model are not accessible. We investigate the properties of the mined +images, and find that images trained on a small number of image captions +generalize to a much larger number of semantically related captions. We +evaluate possible mitigation strategies, where we increase the robustness of +the model and introduce an approach to automatically detect CLIPMasterPrints to +sanitize the input of vulnerable models. Finally, we find that vulnerability to +CLIPMasterPrints is related to a modality gap in contrastive pre-trained +multi-modal networks. Code available at +https://github.com/matfrei/CLIPMasterPrints. + +
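+ In the white-box setting, mining such a fooling master image reduces to
+gradient ascent on the average image-text similarity over the attacked prompts.
+The sketch below assumes a CLIP-style image encoder passed in as a callable and
+uses a toy stand-in encoder; the step count, learning rate, and initialization
+are arbitrary, and the blackbox variant is not shown.
+
+import torch
+import torch.nn.functional as F
+
+def mine_fooling_image(encode_image, text_feats, steps=200, lr=0.05,
+                       size=(1, 3, 224, 224)):
+    """Gradient-ascent sketch of a fooling master image.
+    encode_image: callable mapping an image batch to embeddings (CLIP-style encoder assumed).
+    text_feats: (P, D) L2-normalized embeddings of the attacked prompts."""
+    img = torch.rand(size, requires_grad=True)
+    opt = torch.optim.Adam([img], lr=lr)
+    for _ in range(steps):
+        feats = F.normalize(encode_image(img.clamp(0, 1)), dim=-1)
+        loss = -(feats @ text_feats.t()).mean()   # maximize similarity to all prompts at once
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    return img.detach().clamp(0, 1)
+
+toy_encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 224 * 224, 64))
+prompts = F.normalize(torch.randn(10, 64), dim=-1)
+print(mine_fooling_image(toy_encoder, prompts, steps=5).shape)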
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ Utilizing an abstract information processing model based on minimal yet +realistic assumptions inspired by biological systems, we study how to achieve +the early visual system's two ultimate objectives: efficient information +transmission and accurate sensor probability distribution modeling. We prove +that optimizing for information transmission does not guarantee optimal +probability distribution modeling in general. We illustrate, using a two-pixel +(2D) system and image patches, that an efficient representation can be realized +through a nonlinear population code driven by two types of biologically +plausible loss functions that depend solely on output. After unsupervised +learning, our abstract information processing model bears remarkable +resemblances to biological systems, despite not mimicking many features of real +neurons, such as spiking activity. A preliminary comparison with a contemporary +deep learning model suggests that our model offers a significant efficiency +advantage. Our model provides novel insights into the computational theory of +early visual systems as well as a potential new approach to enhance the +efficiency of deep learning models. + +
+
+
+
+
+ + ♻ ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Although previous DETR-like methods have performed successfully in generic
+object detection, tiny object detection is still a challenging task for them
+since the positional information of object queries is not customized for
+detecting tiny objects, whose scale is extraordinarily smaller than that of
+general objects. Also, the fixed number of queries used by DETR-like methods
+makes them unsuitable for aerial datasets, which only contain tiny objects and
+in which the numbers of instances are imbalanced between different images.
+Thus, we present a simple yet effective model, named DQ-DETR, which consists of
+three components: a categorical counting module, counting-guided feature
+enhancement, and dynamic query selection, to solve the above-mentioned
+problems. DQ-DETR uses the prediction and density maps from the categorical
+counting module to dynamically adjust the number of object queries and improve
+the positional information of queries. Our model DQ-DETR outperforms previous
+CNN-based and DETR-like methods, achieving state-of-the-art mAP 30.2% on the
+AI-TOD-V2 dataset, which mostly consists of tiny objects.
+
+
+
+
+
+ + ♻ ☆ EFHQ: Multi-purpose ExtremePose-Face-HQ dataset + + +
+ The existing facial datasets, while having plentiful images at near frontal +views, lack images with extreme head poses, leading to the downgraded +performance of deep learning models when dealing with profile or pitched faces. +This work aims to address this gap by introducing a novel dataset named Extreme +Pose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k +high-quality images of faces at extreme poses. To produce such a massive +dataset, we utilize a novel and meticulous dataset processing pipeline to +curate two publicly available datasets, VFHQ and CelebV-HQ, which contain many +high-resolution face videos captured in various settings. Our dataset can +complement existing datasets on various facial-related tasks, such as facial +synthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation, +and face reenactment. Specifically, training with EFHQ helps models generalize +well across diverse poses, significantly improving performance in scenarios +involving extreme views, confirmed by extensive experiments. Additionally, we +utilize EFHQ to define a challenging cross-view face verification benchmark, in +which the performance of SOTA face recognition models drops 5-37% compared to +frontal-to-frontal scenarios, aiming to stimulate studies on face recognition +under severe pose conditions in the wild. + +
+
+ comment: Project Page: https://bomcon123456.github.io/efhq/ +
+
+
+
+
+ + ♻ ☆ IISAN: Efficiently Adapting Multimodal Representation for Sequential + Recommendation with Decoupled PEFT SIGIR2024 + + +
+ Multimodal foundation models are transformative in sequential recommender +systems, leveraging powerful representation learning capabilities. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation +models for recommendation tasks, most research prioritizes parameter +efficiency, often overlooking critical factors like GPU memory efficiency and +training speed. Addressing this gap, our paper introduces IISAN (Intra- and +Inter-modal Side Adapted Network for Multimodal Representation), a simple +plug-and-play architecture using a Decoupled PEFT structure and exploiting both +intra- and inter-modal adaptation. + IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art +PEFT. More importantly, it significantly reduces GPU memory usage - from 47GB +to just 3GB for multimodal sequential recommendation tasks. Additionally, it +accelerates training time per epoch from 443s to 22s compared to FFT. This is +also a notable improvement over the Adapter and LoRA, which require 37-39 GB +GPU memory and 350-380 seconds per epoch for training. + Furthermore, we propose a new composite efficiency metric, TPME +(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the +prevalent misconception that "parameter efficiency represents overall +efficiency". TPME provides more comprehensive insights into practical +efficiency comparisons between different methods. Besides, we give an +accessible efficiency analysis of all PEFT and FFT approaches, which +demonstrate the superiority of IISAN. We release our codes and other materials +at https://github.com/GAIR-Lab/IISAN. + +
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 140 + +
+
+
+ + ☆ GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models + + +
+ In this paper, we introduce GoodDrag, a novel approach to improve the +stability and image quality of drag editing. Unlike existing methods that +struggle with accumulated perturbations and often result in distortions, +GoodDrag introduces an AlDD framework that alternates between drag and +denoising operations within the diffusion process, effectively improving the +fidelity of the result. We also propose an information-preserving motion +supervision operation that maintains the original features of the starting +point for precise manipulation and artifact reduction. In addition, we +contribute to the benchmarking of drag editing by introducing a new dataset, +Drag100, and developing dedicated quality assessment metrics, Dragging Accuracy +Index and Gemini Score, utilizing Large Multimodal Models. Extensive +experiments demonstrate that the proposed GoodDrag compares favorably against +the state-of-the-art approaches both qualitatively and quantitatively. The +project page is https://gooddrag.github.io. + +
+
+
+
+
+ + ☆ BRAVE: Broadening the visual encoding of vision-language models + + +
+ Vision-language models (VLMs) are typically composed of a vision encoder,
+e.g. CLIP, and a language model (LM) that interprets the encoded features to
+solve downstream tasks. Despite remarkable progress, VLMs are subject to
+several shortcomings due to the limited capabilities of vision encoders, e.g.
+"blindness" to certain image features, visual hallucination, etc. To address
+these issues, we study broadening the visual encoding capabilities of VLMs.
+We first comprehensively benchmark several vision encoders with different
+inductive biases for solving VLM tasks. We observe that there is no single
+encoding configuration that consistently achieves top performance across
+different tasks, and encoders with different biases can perform surprisingly
+similarly. Motivated by this, we introduce a method, named BRAVE, that
+consolidates features from multiple frozen encoders into a more versatile
+representation that can be directly fed as the input to a frozen LM. BRAVE
+achieves state-of-the-art performance on a broad range of captioning and VQA
+benchmarks and significantly reduces the aforementioned issues of VLMs, while
+requiring a smaller number of trainable parameters than existing methods and
+having a more compressed representation. Our results highlight the potential
+of incorporating different visual biases for a broader and more
+contextualized visual understanding in VLMs.
+
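+ A minimal PyTorch-style sketch of the general idea of consolidating tokens
+from several frozen vision encoders into a fixed-length prefix for a frozen
+LM. The module below (per-encoder projections plus learned query tokens with
+cross-attention) is an assumed stand-in, not BRAVE's actual bridge
+architecture, which the abstract does not specify.
+
+```python
+import torch
+import torch.nn as nn
+
+class MultiEncoderBridge(nn.Module):
+    """Toy consolidation module: project and concatenate tokens from several
+    frozen vision encoders, then compress them to a fixed number of tokens."""
+
+    def __init__(self, encoder_dims, lm_dim, n_query_tokens=32):
+        super().__init__()
+        self.proj = nn.ModuleList([nn.Linear(d, lm_dim) for d in encoder_dims])
+        self.queries = nn.Parameter(torch.randn(n_query_tokens, lm_dim))
+        self.attn = nn.MultiheadAttention(lm_dim, num_heads=8, batch_first=True)
+
+    def forward(self, encoder_outputs):
+        # encoder_outputs: list of (B, N_i, D_i) token sequences, one per encoder
+        tokens = torch.cat([p(f) for p, f in zip(self.proj, encoder_outputs)], dim=1)
+        q = self.queries.unsqueeze(0).expand(tokens.size(0), -1, -1)
+        fused, _ = self.attn(q, tokens, tokens)  # compress to n_query_tokens tokens
+        return fused  # (B, n_query_tokens, lm_dim), to be prepended to the LM input
+
+bridge = MultiEncoderBridge(encoder_dims=[768, 1024], lm_dim=2048)
+feats = [torch.randn(2, 196, 768), torch.randn(2, 256, 1024)]
+prefix = bridge(feats)  # would be fed to the frozen LM alongside text embeddings
+```
+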
+
+ comment: Project page at https://brave-vlms.epfl.ch/ +
+
+
+
+
+ + ☆ UMBRAE: Unified Multimodal Decoding of Brain Signals + + +
+ We address prevailing challenges in brain-powered research, departing from
+the observation that existing methods rarely recover accurate spatial
+information and typically require subject-specific models. To address these
+challenges, we propose UMBRAE, a unified multimodal decoding framework for
+brain signals. First, to extract instance-level conceptual and spatial
+details from neural signals, we introduce an efficient universal brain
+encoder for multimodal-brain alignment and recover object descriptions at
+multiple levels of granularity from a subsequent multimodal large language
+model (MLLM). Second, we introduce a cross-subject training strategy that
+maps subject-specific features to a common feature space. This allows a model
+to be trained on multiple subjects without extra resources, even yielding
+superior results compared to subject-specific models. Further, we demonstrate
+that this supports weakly-supervised adaptation to new subjects with only a
+fraction of the total training data. Experiments demonstrate that UMBRAE not
+only achieves superior results in the newly introduced tasks but also
+outperforms existing methods in well-established tasks. To assess our method,
+we construct and share with the community a comprehensive brain understanding
+benchmark, BrainHub. Our code and benchmark are available at
+https://weihaox.github.io/UMBRAE.
+
+
+ comment: Project Page: https://weihaox.github.io/UMBRAE +
+
+
+
+
+ + ☆ RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth + Diffusion + + +
+ We introduce RealmDreamer, a technique for generation of general +forward-facing 3D scenes from text descriptions. Our technique optimizes a 3D +Gaussian Splatting representation to match complex text prompts. We initialize +these splats by utilizing the state-of-the-art text-to-image generators, +lifting their samples into 3D, and computing the occlusion volume. We then +optimize this representation across multiple views as a 3D inpainting task with +image-conditional diffusion models. To learn correct geometric structure, we +incorporate a depth diffusion model by conditioning on the samples from the +inpainting model, giving rich geometric structure. Finally, we finetune the +model using sharpened samples from image generators. Notably, our technique +does not require video or multi-view data and can synthesize a variety of +high-quality 3D scenes in different styles, consisting of multiple objects. Its +generality additionally allows 3D synthesis from a single image. + +
+
+ comment: Project Page: https://realmdreamer.github.io/ +
+
+
+
+
+ + ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh
+generation from a single image, featuring state-of-the-art generation quality
+and significant training scalability. By synergizing the strengths of an
+off-the-shelf multiview diffusion model and a sparse-view reconstruction
+model based on the LRM architecture, InstantMesh is able to create diverse 3D
+assets within 10 seconds. To enhance training efficiency and exploit more
+geometric supervision, e.g., depths and normals, we integrate a
+differentiable iso-surface extraction module into our framework and directly
+optimize on the mesh representation. Experimental results on public datasets
+demonstrate that InstantMesh significantly outperforms the latest image-to-3D
+baselines, both qualitatively and quantitatively. We release all the code,
+weights, and demo of InstantMesh, with the intention that it can make
+substantial contributions to the community of 3D generative AI and empower
+both researchers and content creators.
+
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ☆ GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on + FPGA + + +
+ Graph neural networks (GNNs) have recently empowered various novel computer
+vision (CV) tasks. In GNN-based CV tasks, either a combination of CNN and GNN
+layers or only GNN layers are employed. This paper introduces GCV-Turbo, a
+domain-specific accelerator on FPGA for end-to-end acceleration of GNN-based
+CV tasks. GCV-Turbo consists of two key components: (1) a novel hardware
+architecture optimized for the computation kernels in both CNNs and GNNs
+using the same set of computation resources, and (2) a PyTorch-compatible
+compiler that takes a user-defined model as input, performs end-to-end
+optimization for the computation graph of a given GNN-based CV task, and
+produces optimized code for hardware execution. The hardware architecture and
+the compiler work synergistically to support a variety of GNN-based CV tasks.
+We implement GCV-Turbo on a state-of-the-art FPGA and evaluate its
+performance across six representative GNN-based CV tasks with diverse input
+data modalities (e.g., image, human skeleton, point cloud). Compared with
+state-of-the-art CPU (GPU) implementations, GCV-Turbo achieves an average
+latency reduction of $68.4\times$ ($4.1\times$) on these six GNN-based CV
+tasks. Moreover, GCV-Turbo supports the execution of standalone CNNs or GNNs,
+achieving performance comparable to that of state-of-the-art CNN (GNN)
+accelerators for widely used CNN-only (GNN-only) models.
+
+
+
+
+
+ + ☆ Move Anything with Layered Scene Diffusion CVPR 2024 + + +
+ Diffusion models generate images with an unprecedented level of quality, but +how can we freely rearrange image layouts? Recent works generate controllable +scenes via learning spatially disentangled latent codes, but these methods do +not apply to diffusion models due to their fixed forward process. In this work, +we propose SceneDiffusion to optimize a layered scene representation during the +diffusion sampling process. Our key insight is that spatial disentanglement can +be obtained by jointly denoising scene renderings at different spatial layouts. +Our generated scenes support a wide range of spatial editing operations, +including moving, resizing, cloning, and layer-wise appearance editing +operations, including object restyling and replacing. Moreover, a scene can be +generated conditioned on a reference image, thus enabling object moving for +in-the-wild images. Notably, this approach is training-free, compatible with +general text-to-image diffusion models, and responsive in less than a second. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation on Water Scenes via Specular + Reflection Prior + + +
+ Monocular depth estimation from a single image is an ill-posed problem in
+computer vision due to the lack of reliable cues as prior knowledge. Besides
+inter-frame supervision, namely stereo and adjacent frames, extensive prior
+information is available within the same frame. Reflections from specular
+surfaces, which serve as informative intra-frame priors, enable us to
+reformulate the ill-posed depth estimation task as multi-view synthesis. This
+paper proposes the first self-supervision scheme for deep-learning depth
+estimation on water scenes via intra-frame priors, namely reflection
+supervision and geometrical constraints. In the first stage, a water
+segmentation network is applied to separate the reflection components from
+the entire image. Next, we construct a self-supervised framework to predict
+the target appearance from reflections, perceived as other perspectives. The
+photometric re-projection error, incorporating SmoothL1 and a novel
+photometric adaptive SSIM, is formulated to optimize pose and depth
+estimation by aligning the transformed virtual depths and the source ones. As
+a supplement, the water surface is determined from real and virtual camera
+positions, which complements the depth of the water area. Furthermore, to
+alleviate the need for laborious ground-truth annotations, we introduce a
+large-scale water reflection scene (WRS) dataset rendered from Unreal Engine
+4. Extensive experiments on the WRS dataset prove the feasibility of the
+proposed method compared to state-of-the-art depth estimation techniques.
+
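+ As a reference for the kind of objective involved, a minimal sketch of a
+standard SSIM-plus-SmoothL1 photometric re-projection loss is given below.
+The fixed weighting alpha and the simple mean-filter SSIM are assumptions;
+the paper's "photometric adaptive SSIM" is not specified in the abstract.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def ssim(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
+    # Simple 3x3 mean-filter SSIM, enough to illustrate the loss structure.
+    mu_x = F.avg_pool2d(x, 3, 1, 1)
+    mu_y = F.avg_pool2d(y, 3, 1, 1)
+    sigma_x = F.avg_pool2d(x * x, 3, 1, 1) - mu_x ** 2
+    sigma_y = F.avg_pool2d(y * y, 3, 1, 1) - mu_y ** 2
+    sigma_xy = F.avg_pool2d(x * y, 3, 1, 1) - mu_x * mu_y
+    num = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
+    den = (mu_x ** 2 + mu_y ** 2 + c1) * (sigma_x + sigma_y + c2)
+    return num / den
+
+def photometric_loss(pred, target, alpha=0.85):
+    # alpha balances the structural (SSIM) and robust (SmoothL1) terms; an
+    # adaptive weighting, as hinted at in the abstract, is not reproduced here.
+    ssim_term = (1 - ssim(pred, target)).clamp(0, 1) / 2
+    l1_term = F.smooth_l1_loss(pred, target, reduction="none")
+    return (alpha * ssim_term + (1 - alpha) * l1_term).mean()
+```
+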
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ☆ Unified Language-driven Zero-shot Domain Adaptation CVPR 2024 + + +
+ This paper introduces Unified Language-driven Zero-shot Domain Adaptation +(ULDA), a novel task setting that enables a single model to adapt to diverse +target domains without explicit domain-ID knowledge. We identify the +constraints in the existing language-driven zero-shot domain adaptation task, +particularly the requirement for domain IDs and domain-specific models, which +may restrict flexibility and scalability. To overcome these issues, we propose +a new framework for ULDA, consisting of Hierarchical Context Alignment (HCA), +Domain Consistent Representation Learning (DCRL), and Text-Driven Rectifier +(TDR). These components work synergistically to align simulated features with +target text across multiple visual levels, retain semantic correlations between +different regional representations, and rectify biases between simulated and +real target visual features, respectively. Our extensive empirical evaluations +demonstrate that this framework achieves competitive performance in both +settings, surpassing even the model that requires domain-ID, showcasing its +superiority and generalization ability. The proposed method is not only +effective but also maintains practicality and efficiency, as it does not +introduce additional computational costs during inference. Our project page is +https://senqiaoyang.com/project/ULDA . + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Lost in Translation: Modern Neural Networks Still Struggle With Small + Realistic Image Transformations + + +
+ Deep neural networks that achieve remarkable performance in image
+classification have previously been shown to be easily fooled by tiny
+transformations such as a one-pixel translation of the input image. In order
+to address this problem, two approaches have been proposed in recent years.
+The first approach suggests using huge datasets together with data
+augmentation in the hope that a highly varied training set will teach the
+network to learn to be invariant. The second approach suggests using
+architectural modifications based on sampling theory to deal explicitly with
+image translations. In this paper, we show that these approaches still fall
+short in robustly handling 'natural' image translations that simulate a
+subtle change in camera orientation. Our findings reveal that a mere
+one-pixel translation can result in a significant change in the predicted
+image representation for approximately 40% of the test images in
+state-of-the-art models (e.g. open-CLIP trained on LAION-2B or DINO-v2),
+while models that are explicitly constructed to be robust to cyclic
+translations can still be fooled by realistic (non-cyclic) one-pixel
+translations 11% of the time. We present Robust Inference by Crop Selection:
+a simple method that can be proven to achieve any desired level of
+consistency, although with a modest tradeoff with the model's accuracy.
+Importantly, we demonstrate how employing this method reduces the ability to
+fool state-of-the-art models with a one-pixel translation to less than 5%
+while suffering only a 1% drop in classification accuracy. Additionally, we
+show that our method can be easily adjusted to deal with circular shifts as
+well. In that case, we achieve 100% robustness to integer shifts with
+state-of-the-art accuracy, and with no need for any further training.
+
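+ A small sketch of the consistency measurement described above (how often a
+non-cyclic one-pixel shift changes the top-1 prediction). This is only the
+evaluation protocol, not the proposed crop-selection method, and the
+edge-replication padding used for the shift is an assumption.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def translation_consistency(model, images, shift=1):
+    """Fraction of images whose top-1 prediction survives a `shift`-pixel
+    horizontal translation. Non-cyclic: the vacated columns are edge-padded."""
+    with torch.no_grad():
+        base = model(images).argmax(dim=1)
+        padded = F.pad(images, (shift, 0, 0, 0), mode="replicate")
+        shifted = padded[..., :images.shape[-1]]  # content moved right by `shift`
+        moved = model(shifted).argmax(dim=1)
+    return (base == moved).float().mean().item()
+```
+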
+
+ comment: 14 pages, 6 appendices, 17 figures +
+
+
+
+
+ + ☆ Measuring proximity to standard planes during fetal brain ultrasound + scanning + + +
+ This paper introduces a novel pipeline designed to bring ultrasound (US) +plane pose estimation closer to clinical use for more effective navigation to +the standard planes (SPs) in the fetal brain. We propose a semi-supervised +segmentation model utilizing both labeled SPs and unlabeled 3D US volume +slices. Our model enables reliable segmentation across a diverse set of fetal +brain images. Furthermore, the model incorporates a classification mechanism to +identify the fetal brain precisely. Our model not only filters out frames +lacking the brain but also generates masks for those containing it, enhancing +the relevance of plane pose regression in clinical settings. We focus on fetal +brain navigation from 2D ultrasound (US) video analysis and combine this model +with a US plane pose regression network to provide sensorless proximity +detection to SPs and non-SPs planes; we emphasize the importance of proximity +detection to SPs for guiding sonographers, offering a substantial advantage +over traditional methods by allowing earlier and more precise adjustments +during scanning. We demonstrate the practical applicability of our approach +through validation on real fetal scan videos obtained from sonographers of +varying expertise levels. Our findings demonstrate the potential of our +approach to complement existing fetal US technologies and advance prenatal +diagnostic practices. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the overall network's performance, which can be +jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, having the mean prediction error of 29.69 pixels, +which is relatively small compared to the $1280{\times}720$ resolution of the +scene camera. + +
+
+
+
+
+ + ☆ Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images + + +
+ Deep subspace clustering methods are now prominent in clustering, typically
+using fully connected networks and a self-representation loss function.
+However, these methods often struggle with overfitting and lack
+interpretability. In this paper, we explore an alternative clustering
+approach based on deep unfolding. By unfolding iterative optimization methods
+into neural networks, this approach offers enhanced interpretability and
+reliability compared to data-driven deep learning methods, and greater
+adaptability and generalization than model-based approaches. Hence, unfolding
+has become widely used in inverse imaging problems, such as image
+restoration, reconstruction, and super-resolution, but has not yet been
+sufficiently explored in the context of clustering. In this work, we
+introduce an innovative clustering architecture for hyperspectral images
+(HSI) by unfolding an iterative solver based on the Alternating Direction
+Method of Multipliers (ADMM) for sparse subspace clustering. To our
+knowledge, this is the first attempt to apply unfolded ADMM for computing the
+self-representation matrix in subspace clustering. Moreover, our approach
+captures the structural characteristics of HSI data well by employing the K
+nearest neighbors algorithm as part of a structure preservation module.
+Experimental evaluation on three established HSI datasets clearly shows the
+potential of the unfolding approach in HSI clustering and even demonstrates
+superior performance compared to state-of-the-art techniques.
+
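+ For reference, the classical ADMM iteration for the l1 self-representation
+problem, whose steps would become the learnable layers of an unfolded
+network, can be sketched as follows; this is a generic textbook form, and the
+exact objective, constraints, and learnable parameters used in the paper may
+differ.
+
+```latex
+% Sparse subspace clustering objective and one ADMM iteration (generic sketch)
+\min_{\mathbf{C}}\ \|\mathbf{C}\|_{1}
+  + \tfrac{\lambda}{2}\,\|\mathbf{X}-\mathbf{X}\mathbf{C}\|_{F}^{2}
+  \quad \text{s.t.}\ \operatorname{diag}(\mathbf{C})=\mathbf{0},
+\qquad
+\begin{aligned}
+\mathbf{A}^{(k+1)} &= \bigl(\lambda\,\mathbf{X}^{\top}\mathbf{X}
+  + \rho\,\mathbf{I}\bigr)^{-1}
+  \bigl(\lambda\,\mathbf{X}^{\top}\mathbf{X}
+  + \rho\,(\mathbf{C}^{(k)}-\mathbf{U}^{(k)})\bigr),\\
+\mathbf{C}^{(k+1)} &= \mathcal{S}_{1/\rho}\!\bigl(\mathbf{A}^{(k+1)}
+  + \mathbf{U}^{(k)}\bigr) \ \text{with its diagonal set to zero},\\
+\mathbf{U}^{(k+1)} &= \mathbf{U}^{(k)} + \mathbf{A}^{(k+1)} - \mathbf{C}^{(k+1)},
+\end{aligned}
+```
+
+ where $\mathcal{S}_{\tau}$ is soft-thresholding, $\mathbf{U}$ the scaled
+dual variable, and $\rho$ the penalty. Unfolding typically fixes a small
+number of such iterations and makes $\lambda$, $\rho$, and the threshold
+learnable per layer.
+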
+
+
+
+
+ + ☆ Wild Visual Navigation: Fast Traversability Learning via Pre-Trained + Models and Online Self-Supervision + + +
+ Natural environments such as forests and grasslands are challenging for +robotic navigation because of the false perception of rigid obstacles from high +grass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN), +an online self-supervised learning system for visual traversability estimation. +The system is able to continuously adapt from a short human demonstration in +the field, only using onboard sensing and computing. One of the key ideas to +achieve this is the use of high-dimensional features from pre-trained +self-supervised models, which implicitly encode semantic information that +massively simplifies the learning task. Further, the development of an online +scheme for supervision generator enables concurrent training and inference of +the learned model in the wild. We demonstrate our approach through diverse +real-world deployments in forests, parks, and grasslands. Our system is able to +bootstrap the traversable terrain segmentation in less than 5 min of in-field +training time, enabling the robot to navigate in complex, previously unseen +outdoor terrains. Code: https://bit.ly/498b0CV - Project +page:https://bit.ly/3M6nMHH + +
+
+ comment: Extended version of arXiv:2305.08510 +
+
+
+
+
+ + ☆ 3DMambaComplete: Exploring Structured State Space Model for Point Cloud + Completion + + +
+ Point cloud completion aims to generate a complete and high-fidelity point +cloud from an initially incomplete and low-quality input. A prevalent strategy +involves leveraging Transformer-based models to encode global features and +facilitate the reconstruction process. However, the adoption of pooling +operations to obtain global feature representations often results in the loss +of local details within the point cloud. Moreover, the attention mechanism +inherent in Transformers introduces additional computational complexity, +rendering it challenging to handle long sequences effectively. To address these +issues, we propose 3DMambaComplete, a point cloud completion network built on +the novel Mamba framework. It comprises three modules: HyperPoint Generation +encodes point cloud features using Mamba's selection mechanism and predicts a +set of Hyperpoints. A specific offset is estimated, and the down-sampled points +become HyperPoints. The HyperPoint Spread module disperses these HyperPoints +across different spatial locations to avoid concentration. Finally, a +deformation method transforms the 2D mesh representation of HyperPoints into a +fine-grained 3D structure for point cloud reconstruction. Extensive experiments +conducted on various established benchmarks demonstrate that 3DMambaComplete +surpasses state-of-the-art point cloud completion methods, as confirmed by +qualitative and quantitative analyses. + +
+
+ comment: 10 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Learning Priors for Non Rigid SfM from Casual Videos + + +
+ We tackle the long-standing challenge of reconstructing 3D structures and +camera positions from videos. The problem is particularly hard when objects are +transformed in a non-rigid way. Current approaches to this problem make +unrealistic assumptions or require a long optimization time. + We present TracksTo4D, a novel deep learning-based approach that enables +inferring 3D structure and camera positions from dynamic content originating +from in-the-wild videos using a single feed-forward pass on a sparse point +track matrix. To achieve this, we leverage recent advances in 2D point tracking +and design an equivariant neural architecture tailored for directly processing +2D point tracks by leveraging their symmetries. TracksTo4D is trained on a +dataset of in-the-wild videos utilizing only the 2D point tracks extracted from +the videos, without any 3D supervision. Our experiments demonstrate that +TracksTo4D generalizes well to unseen videos of unseen semantic categories at +inference time, producing equivalent results to state-of-the-art methods while +significantly reducing the runtime compared to other baselines. + +
+
+
+
+
+ + ☆ MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation + from 2D Keypoints CVPR + + +
+ This paper presents Key2Mesh, a model that takes a set of 2D human pose +keypoints as input and estimates the corresponding body mesh. Since this +process does not involve any visual (i.e. RGB image) data, the model can be +trained on large-scale motion capture (MoCap) datasets, thereby overcoming the +scarcity of image datasets with 3D labels. To enable the model's application on +RGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D +keypoints, and then feed these 2D keypoints to Key2Mesh. To improve the +performance of our model on RGB images, we apply an adversarial domain +adaptation (DA) method to bridge the gap between the MoCap and visual domains. +Crucially, our DA method does not require 3D labels for visual data, which +enables adaptation to target sets without the need for costly labels. We +evaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints, +in the absence of RGB and mesh label pairs. Our results on widely used H3.6M +and 3DPW datasets show that Key2Mesh sets the new state-of-the-art by +outperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE +for the 3DPW dataset. Thanks to our model's simple architecture, it operates at +least 12x faster than the prior state-of-the-art model, LGD. Additional +qualitative samples and code are available on the project website: +https://key2mesh.github.io/. + +
+
+ comment: accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ VLLMs Provide Better Context for Emotion Understanding Through Common + Sense Reasoning + + +
+ Recognising emotions in context involves identifying the apparent emotions of +an individual, taking into account contextual cues from the surrounding scene. +Previous approaches to this task have involved the design of explicit +scene-encoding architectures or the incorporation of external scene-related +information, such as captions. However, these methods often utilise limited +contextual information or rely on intricate training pipelines. In this work, +we leverage the groundbreaking capabilities of Vision-and-Large-Language Models +(VLLMs) to enhance in-context emotion classification without introducing +complexity to the training process in a two-stage approach. In the first stage, +we propose prompting VLLMs to generate descriptions in natural language of the +subject's apparent emotion relative to the visual context. In the second stage, +the descriptions are used as contextual information and, along with the image +input, are used to train a transformer-based architecture that fuses text and +visual features before the final classification task. Our experimental results +show that the text and image features have complementary information, and our +fused architecture significantly outperforms the individual modalities without +any complex training methods. We evaluate our approach on three different +datasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or +comparable accuracy across all datasets and metrics compared to much more +complex approaches. The code will be made publicly available on github: +https://github.com/NickyFot/EmoCommonSense.git + +
+
+ comment: A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this + work; 14 pages, 5 figures +
+
+
+
+
+ + ☆ Implicit Multi-Spectral Transformer: An Lightweight and Effective + Visible to Infrared Image Translation Model IJCNN 2024 + + +
+ In the field of computer vision, visible light images often exhibit low +contrast in low-light conditions, presenting a significant challenge. While +infrared imagery provides a potential solution, its utilization entails high +costs and practical limitations. Recent advancements in deep learning, +particularly the deployment of Generative Adversarial Networks (GANs), have +facilitated the transformation of visible light images to infrared images. +However, these methods often experience unstable training phases and may +produce suboptimal outputs. To address these issues, we propose a novel +end-to-end Transformer-based model that efficiently converts visible light +images into high-fidelity infrared images. Initially, the Texture Mapping +Module and Color Perception Adapter collaborate to extract texture and color +features from the visible light image. The Dynamic Fusion Aggregation Module +subsequently integrates these features. Finally, the transformation into an +infrared image is refined through the synergistic action of the Color +Perception Adapter and the Enhanced Perception Attention mechanism. +Comprehensive benchmarking experiments confirm that our model outperforms +existing methods, producing infrared images of markedly superior quality, both +qualitatively and quantitatively. Furthermore, the proposed model enables more +effective downstream applications for infrared images than other methods. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ Identification of Fine-grained Systematic Errors via Controlled Scene + Generation + + +
+ Many safety-critical applications, especially in autonomous driving, require +reliable object detectors. They can be very effectively assisted by a method to +search for and identify potential failures and systematic errors before these +detectors are deployed. Systematic errors are characterized by combinations of +attributes such as object location, scale, orientation, and color, as well as +the composition of their respective backgrounds. To identify them, one must +rely on something other than real images from a test set because they do not +account for very rare but possible combinations of attributes. To overcome this +limitation, we propose a pipeline for generating realistic synthetic scenes +with fine-grained control, allowing the creation of complex scenes with +multiple objects. Our approach, BEV2EGO, allows for a realistic generation of +the complete scene with road-contingent control that maps 2D bird's-eye view +(BEV) scene configurations to a first-person view (EGO). In addition, we +propose a benchmark for controlled scene generation to select the most +appropriate generative outpainting model for BEV2EGO. We further use it to +perform a systematic analysis of multiple state-of-the-art object detection +models and discover differences between them. + +
+
+
+
+
+ + ☆ An Evidential-enhanced Tri-Branch Consistency Learning Method for + Semi-supervised Medical Image Segmentation + + +
+ Semi-supervised segmentation presents a promising approach for large-scale
+medical image analysis, effectively reducing annotation burdens while
+achieving comparable performance. This methodology holds substantial
+potential for streamlining the segmentation process and enhancing its
+feasibility within clinical settings for translational investigations. While
+cross-supervised training, based on distinct co-training sub-networks, has
+become a prevalent paradigm for this task, addressing critical issues such as
+prediction disagreement and label-noise suppression requires further
+attention and progress in cross-supervised training. In this paper, we
+introduce an Evidential Tri-Branch Consistency learning framework (ETC-Net)
+for semi-supervised medical image segmentation. ETC-Net employs three
+branches: an evidential conservative branch, an evidential progressive
+branch, and an evidential fusion branch. The first two branches exhibit
+complementary characteristics, allowing them to address prediction diversity
+and enhance training stability. We also integrate uncertainty estimation from
+evidential learning into cross-supervised training, mitigating the negative
+impact of erroneous supervision signals. Additionally, the evidential fusion
+branch capitalizes on the complementary attributes of the first two branches
+and leverages an evidence-based Dempster-Shafer fusion strategy, supervised
+by more reliable and accurate pseudo-labels of unlabeled data. Extensive
+experiments conducted on the LA, Pancreas-CT, and ACDC datasets demonstrate
+that ETC-Net surpasses other state-of-the-art methods for semi-supervised
+segmentation. The code will be made available in the near future at
+https://github.com/Medsemiseg.
+
+
+
+
+
+ + ☆ ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR + Domain Modeling + + +
+ Every day, countless surgeries are performed worldwide, each within the +distinct settings of operating rooms (ORs) that vary not only in their setups +but also in the personnel, tools, and equipment used. This inherent diversity +poses a substantial challenge for achieving a holistic understanding of the OR, +as it requires models to generalize beyond their initial training datasets. To +reduce this gap, we introduce ORacle, an advanced vision-language model +designed for holistic OR domain modeling, which incorporates multi-view and +temporal capabilities and can leverage external knowledge during inference, +enabling it to adapt to previously unseen surgical scenarios. This capability +is further enhanced by our novel data augmentation framework, which +significantly diversifies the training dataset, ensuring ORacle's proficiency +in applying the provided knowledge effectively. In rigorous testing, in scene +graph generation, and downstream tasks on the 4D-OR dataset, ORacle not only +demonstrates state-of-the-art performance but does so requiring less data than +existing models. Furthermore, its adaptability is displayed through its ability +to interpret unseen views, actions, and appearances of tools and equipment. +This demonstrates ORacle's potential to significantly enhance the scalability +and affordability of OR domain modeling and opens a pathway for future +advancements in surgical data science. We will release our code and data upon +acceptance. + +
+
+ comment: 11 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Diffusion-based inpainting of incomplete Euclidean distance matrices of + trajectories generated by a fractional Brownian motion + + +
+ Fractional Brownian trajectories (fBm) feature both randomness and strong
+scale-free correlations, challenging generative models to reproduce the
+intrinsic memory characterizing the underlying process. Here we test a
+diffusion probabilistic model on a specific dataset of corrupted images
+corresponding to incomplete Euclidean distance matrices of fBm at various
+memory exponents $H$. Our dataset implies uniqueness of the data imputation
+in the regime of low missing ratio, where the remaining partial graph is
+rigid, providing the ground truth for the inpainting. We find that the
+conditional diffusion generation stably reproduces the statistics of missing
+fBm-distributed distances for different values of the $H$ exponent.
+Furthermore, while diffusion models have recently been shown to memorize
+samples from the training database, we show that diffusion-based inpainting
+behaves qualitatively differently from database search as the database size
+increases. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for
+completion of chromosome distance matrices obtained in single-cell microscopy
+experiments, showing its superiority over standard bioinformatics algorithms.
+Our source code is available on GitHub at
+https://github.com/alobashev/diffusion_fbm.
+
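+ A minimal sketch of how such input data could be produced: sample an fBm
+trajectory exactly via the Cholesky factor of its covariance, build the
+Euclidean distance matrix, and hide a fraction of entries. Trajectory length,
+dimensionality, and missing ratio below are illustrative choices, not the
+paper's settings.
+
+```python
+import numpy as np
+
+def fbm_trajectory(n_steps, hurst, dim=3, rng=None):
+    """Exact fBm sampling per coordinate via Cholesky of the covariance
+    cov(t_i, t_j) = 0.5 * (t_i^2H + t_j^2H - |t_i - t_j|^2H)."""
+    rng = np.random.default_rng(rng)
+    t = np.arange(1, n_steps + 1, dtype=float)
+    cov = 0.5 * (t[:, None] ** (2 * hurst) + t[None, :] ** (2 * hurst)
+                 - np.abs(t[:, None] - t[None, :]) ** (2 * hurst))
+    L = np.linalg.cholesky(cov + 1e-10 * np.eye(n_steps))
+    return L @ rng.standard_normal((n_steps, dim))
+
+def masked_edm(points, missing_ratio, rng=None):
+    rng = np.random.default_rng(rng)
+    diff = points[:, None, :] - points[None, :, :]
+    edm = np.linalg.norm(diff, axis=-1)
+    mask = np.triu(rng.random(edm.shape) < missing_ratio, 1)
+    mask = mask | mask.T                  # keep the corruption symmetric
+    corrupted = edm.copy()
+    corrupted[mask] = np.nan              # missing entries to be inpainted
+    return edm, corrupted, mask
+
+traj = fbm_trajectory(64, hurst=1 / 3)
+edm, corrupted, mask = masked_edm(traj, missing_ratio=0.1)
+```
+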
+
+
+
+
+ + ☆ Ray-driven Spectral CT Reconstruction Based on Neural Base-Material + Fields + + +
+ In spectral CT reconstruction, the basis materials decomposition involves +solving a large-scale nonlinear system of integral equations, which is highly +ill-posed mathematically. This paper proposes a model that parameterizes the +attenuation coefficients of the object using a neural field representation, +thereby avoiding the complex calculations of pixel-driven projection +coefficient matrices during the discretization process of line integrals. It +introduces a lightweight discretization method for line integrals based on a +ray-driven neural field, enhancing the accuracy of the integral approximation +during the discretization process. The basis materials are represented as +continuous vector-valued implicit functions to establish a neural field +parameterization model for the basis materials. The auto-differentiation +framework of deep learning is then used to solve the implicit continuous +function of the neural base-material fields. This method is not limited by the +spatial resolution of reconstructed images, and the network has compact and +regular properties. Experimental validation shows that our method performs +exceptionally well in addressing the spectral CT reconstruction. Additionally, +it fulfils the requirements for the generation of high-resolution +reconstruction images. + +
+
+ comment: 14 pages,16 figures +
+
+
+
+
+ + ☆ Accurate Tennis Court Line Detection on Amateur Recorded Matches + + +
+ Typically, tennis court line detection is done by running +Hough-Line-Detection to find straight lines in the image, and then computing a +transformation matrix from the detected lines to create the final court +structure. We propose numerous improvements and enhancements to this algorithm, +including using pretrained State-of-the-Art shadow-removal and object-detection +ML models to make our line-detection more robust. Compared to the original +algorithm, our method can accurately detect lines on amateur, dirty courts. +When combined with a robust ball-tracking system, our method will enable +accurate, automatic refereeing for amateur and professional tennis matches +alike. + +
+
+ comment: Accepted to 5th International conference on Image, Video Processing + and Artificial Intelligence +
+
+
+
+
+ + ☆ TrajPRed: Trajectory Prediction with Region-based Relation Learning + + +
+ Forecasting human trajectories in traffic scenes is critical for safety +within mixed or fully autonomous systems. Human future trajectories are driven +by two major stimuli, social interactions, and stochastic goals. Thus, reliable +forecasting needs to capture these two stimuli. Edge-based relation modeling +represents social interactions using pairwise correlations from precise +individual states. Nevertheless, edge-based relations can be vulnerable under +perturbations. To alleviate these issues, we propose a region-based relation +learning paradigm that models social interactions via region-wise dynamics of +joint states, i.e., the changes in the density of crowds. In particular, +region-wise agent joint information is encoded within convolutional feature +grids. Social relations are modeled by relating the temporal changes of local +joint information from a global perspective. We show that region-based +relations are less susceptible to perturbations. In order to account for the +stochastic individual goals, we exploit a conditional variational autoencoder +to realize multi-goal estimation and diverse future prediction. Specifically, +we perform variational inference via the latent distribution, which is +conditioned on the correlation between input states and associated target +goals. Sampling from the latent distribution enables the framework to reliably +capture the stochastic behavior in test data. We integrate multi-goal +estimation and region-based relation learning to model the two stimuli, social +interactions, and stochastic goals, in a prediction framework. We evaluate our +framework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that +the diverse prediction better fits the ground truth when incorporating the +relation module. Our framework outperforms the state-of-the-art models on SDD +by $27.61\%$/$18.20\%$ of ADE/FDE metrics. + +
+
+
+
+
+ + ☆ V-MAD: Video-based Morphing Attack Detection in Operational Scenarios + + +
+ In response to the rising threat of the face morphing attack, this paper +introduces and explores the potential of Video-based Morphing Attack Detection +(V-MAD) systems in real-world operational scenarios. While current morphing +attack detection methods primarily focus on a single or a pair of images, V-MAD +is based on video sequences, exploiting the video streams often acquired by +face verification tools available, for instance, at airport gates. Through this +study, we show for the first time the advantages that the availability of +multiple probe frames can bring to the morphing attack detection task, +especially in scenarios where the quality of probe images is varied and might +be affected, for instance, by pose or illumination variations. Experimental +results on a real operational database demonstrate that video sequences +represent valuable information for increasing the robustness and performance of +morphing attack detection systems. + +
+
+
+
+
+ + ☆ Adversarial purification for no-reference image-quality metrics: + applicability study and new methods + + +
+ Recently, the area of adversarial attacks on image quality metrics has begun
+to be explored, whereas the area of defences remains under-researched. In
+this study, we aim to fill that gap and examine the transferability of
+adversarial purification defences from image classifiers to IQA methods. In
+this paper, we apply several widespread attacks on IQA models and examine the
+success of the defences against them. The purification methodologies cover
+different preprocessing techniques, including geometrical transformations,
+compression, denoising, and modern neural network-based methods. We also
+address the challenge of assessing the efficacy of a defensive methodology by
+proposing ways to estimate output visual quality and the success of
+neutralizing attacks. Defences were tested against attacks on three IQA
+metrics: Linearity, MetaIQA and SPAQ. The code for attacks and defences is
+available at: (link is hidden for a blind review).
+
+
+
+
+
+ + ☆ Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven + Approach + + +
+ Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark +modality for the comprehensive assessment of cardiac function. Nevertheless, +the acquisition process of cine CMR is considered as an impediment due to its +prolonged scanning time. One commonly used strategy to expedite the acquisition +process is through k-space undersampling, though it comes with a drawback of +introducing aliasing effects in the reconstructed image. Lately, deep +learning-based methods have shown remarkable results over traditional +approaches in rapidly achieving precise CMR reconstructed images. This study +aims to explore the untapped potential of attention mechanisms incorporated +with a deep learning model within the context of the CMR reconstruction +problem. We are motivated by the fact that attention has proven beneficial in +downstream tasks such as image classification and segmentation, but has not +been systematically analysed in the context of CMR reconstruction. Our primary +goal is to identify the strengths and potential limitations of attention +algorithms when integrated with a convolutional backbone model such as a U-Net. +To achieve this, we benchmark different state-of-the-art spatial and channel +attention mechanisms on the CMRxRecon dataset and quantitatively evaluate the +quality of reconstruction using objective metrics. Furthermore, inspired by the +best performing attention mechanism, we propose a new, simple yet effective, +attention pipeline specifically optimised for the task of cardiac image +reconstruction that outperforms other state-of-the-art attention methods. The +layer and model code will be made publicly available. + +
+
+ comment: This paper has been submitted for the 32nd European Signal Processing + Conference EUSIPCO 2024 in Lyon +
+
+
+
+
+ + ☆ Efficient and Generic Point Model for Lossless Point Cloud Attribute + Compression + + +
+ The past several years have witnessed the emergence of learned point cloud
+compression (PCC) techniques. However, current learning-based lossless point
+cloud attribute compression (PCAC) methods either suffer from high
+computational complexity or deteriorated compression performance. Moreover,
+the significant variations in point cloud scale and sparsity encountered in
+real-world applications make developing an all-in-one neural model a
+challenging task. In this paper, we propose PoLoPCAC, an efficient and
+generic lossless PCAC method that achieves high compression efficiency and
+strong generalizability simultaneously. We formulate lossless PCAC as the
+task of inferring explicit distributions of attributes from group-wise
+autoregressive priors. A progressive random grouping strategy is first
+devised to efficiently resolve the point cloud into groups, and then the
+attributes of each group are modeled sequentially from the accumulated
+antecedents. A locality-aware attention mechanism is utilized to exploit
+prior knowledge from context windows in parallel. Since our method directly
+operates on points, it naturally avoids distortion caused by voxelization and
+can be executed on point clouds of arbitrary scale and density. Experiments
+show that our method can be instantly deployed once trained on a Synthetic
+2k-ShapeNet dataset while enjoying continuous bit-rate reduction over the
+latest G-PCCv23 on various datasets (ShapeNet, ScanNet, MVUB, 8iVFB).
+Meanwhile, our method reports shorter coding time than G-PCCv23 on the
+majority of sequences with a lightweight model size (2.6MB), which is highly
+attractive for practical applications. The dataset, code and trained model
+are available at https://github.com/I2-Multimedia-Lab/PoLoPCAC.
+
+
+
+
+
+ + ☆ HRVDA: High-Resolution Visual Document Assistant CVPR 2024 + + +
+ Leveraging vast training data, multimodal large language models (MLLMs) have +demonstrated formidable general visual comprehension capabilities and achieved +remarkable performance across various tasks. However, their performance in +visual document understanding still leaves much room for improvement. This +discrepancy is primarily attributed to the fact that visual document +understanding is a fine-grained prediction task. In natural scenes, MLLMs +typically use low-resolution images, leading to a substantial loss of visual +information. Furthermore, general-purpose MLLMs do not excel in handling +document-oriented instructions. In this paper, we propose a High-Resolution +Visual Document Assistant (HRVDA), which bridges the gap between MLLMs and +visual document understanding. This model employs a content filtering mechanism +and an instruction filtering module to separately filter out the +content-agnostic visual tokens and instruction-agnostic visual tokens, thereby +achieving efficient model training and inference for high-resolution images. In +addition, we construct a document-oriented visual instruction tuning dataset +and apply a multi-stage training strategy to enhance the model's document +modeling capabilities. Extensive experiments demonstrate that our model +achieves state-of-the-art performance across multiple document understanding +datasets, while maintaining training efficiency and inference speed comparable +to low-resolution models. + +
+
+ comment: Accepted to CVPR 2024 main conference +
+
+
+
+
+ + ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/ +
+
+
+
+
+ + ☆ DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic + Gaussian Splatting + + +
+ The increasing demand for virtual reality applications has highlighted the +significance of crafting immersive 3D assets. We present a text-to-3D +360$^{\circ}$ scene generation pipeline that facilitates the creation of +comprehensive 360$^{\circ}$ scenes for in-the-wild environments in a matter of +minutes. Our approach utilizes the generative power of a 2D diffusion model and +prompt self-refinement to create a high-quality and globally coherent panoramic +image. This image acts as a preliminary "flat" (2D) scene representation. +Subsequently, it is lifted into 3D Gaussians, employing splatting techniques to +enable real-time exploration. To produce consistent 3D geometry, our pipeline +constructs a spatially coherent structure by aligning the 2D monocular depth +into a globally optimized point cloud. This point cloud serves as the initial +state for the centroids of 3D Gaussians. In order to address invisible issues +inherent in single-view inputs, we impose semantic and geometric constraints on +both synthesized and input camera views as regularizations. These guide the +optimization of Gaussians, aiding in the reconstruction of unseen regions. In +summary, our method offers a globally consistent 3D scene within a +360$^{\circ}$ perspective, providing an enhanced immersive experience over +existing techniques. Project website at: http://dreamscene360.github.io/ + +
+
+
+
+
+ + ☆ O-TALC: Steps Towards Combating Oversegmentation within Online Action + Segmentation + + +
+ Online temporal action segmentation shows a strong potential to facilitate +many HRI tasks where extended human action sequences must be tracked and +understood in real time. Traditional action segmentation approaches, however, +operate in an offline two stage approach, relying on computationally expensive +video wide features for segmentation, rendering them unsuitable for online HRI +applications. In order to facilitate online action segmentation on a stream of +incoming video data, we introduce two methods for improved training and +inference of backbone action recognition models, allowing them to be deployed +directly for online frame level classification. Firstly, we introduce surround +dense sampling whilst training to facilitate training vs. inference clip +matching and improve segment boundary predictions. Secondly, we introduce an +Online Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce +oversegmentation during online inference. As our methods are backbone +invariant, they can be deployed with computationally efficient spatio-temporal +action recognition models capable of operating in real time with a small +segmentation latency. We show our method outperforms similar online action +segmentation work as well as matches the performance of many offline models +with access to full temporal resolution when operating on challenging +fine-grained datasets. + +
+
+ comment: 5 pages, 3 figures. Accepted as a short (unindexed) paper at the + TAHRI conference +
+
+
+
+
+ + ☆ SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End + Autonomous Driving + + +
+ End-to-End paradigms use a unified framework to implement multi-tasks in an +autonomous driving system. Despite simplicity and clarity, the performance of +end-to-end autonomous driving methods on sub-tasks is still far behind the +single-task methods. Meanwhile, the widely used dense BEV features in previous +end-to-end methods make it costly to extend to more modalities or tasks. In +this paper, we propose a Sparse query-centric paradigm for end-to-end +Autonomous Driving (SparseAD), where the sparse queries completely represent +the whole driving scenario across space, time and tasks without any dense BEV +representation. Concretely, we design a unified sparse architecture for +perception tasks including detection, tracking, and online mapping. Moreover, +we revisit motion prediction and planning, and devise a more justifiable motion +planner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA +full-task performance among end-to-end methods and significantly narrows the +performance gap between end-to-end paradigms and single-task methods. Codes +will be released soon. + +
+
+
+
+
+ + ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled
+image recognition has emerged as a potent tool for addressing challenges in
+traditional environmental monitoring. This study focuses on the detection of
+floating objects in river and lake environments, exploring an innovative
+approach based on deep learning. By intricately analyzing the technical
+pathways for detecting static and dynamic features and considering the
+characteristics of river and lake debris, a comprehensive image acquisition
+and processing workflow has been developed. The study highlights the
+application and performance comparison of three mainstream deep learning
+models (SSD, Faster R-CNN, and YOLOv5) in debris identification.
+Additionally, a detection system for floating objects has been designed and
+implemented, encompassing both hardware platform construction and software
+framework development. Through rigorous experimental validation, the proposed
+system has demonstrated its ability to significantly enhance the accuracy and
+efficiency of debris detection, thus offering a new technological avenue for
+water quality monitoring in rivers and lakes.
+
+
+
+
+
+ + ☆ Fine color guidance in diffusion models and its application to image + compression at extremely low bitrates + + +
+ This study addresses the challenge of controlling, without training or
+fine-tuning, the global color aspect of images generated with a diffusion
+model. We rewrite the guidance equations to ensure that the outputs are
+closer to a known color map, without hindering the quality of the generation.
+Our method leads to new guidance equations. In the color guidance context, we
+show that the scaling of the guidance should not decrease but should remain
+high throughout the diffusion process. As a second contribution, we apply our
+guidance in a compression framework, combining semantic and general color
+information on the image to decode images at low cost. We show that our
+method is effective at improving the fidelity and realism of compressed
+images at extremely low bit rates, when compared to other classical or more
+semantic-oriented approaches.
+
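+ In generic classifier-guidance notation, steering a sampler toward a target
+color map $\mathbf{c}^{*}$ can be sketched as below. This is an illustrative
+form only, not the paper's rewritten equations, whose point is precisely that
+the scale $s(t)$ should stay high throughout sampling rather than decay.
+
+```latex
+% Generic color-guided sampling step (illustrative sketch)
+\hat{\mathbf{x}}_{0}(\mathbf{x}_{t}) =
+  \frac{\mathbf{x}_{t}-\sqrt{1-\bar{\alpha}_{t}}\,
+        \boldsymbol{\epsilon}_{\theta}(\mathbf{x}_{t},t)}
+       {\sqrt{\bar{\alpha}_{t}}},
+\qquad
+\tilde{\boldsymbol{\epsilon}}(\mathbf{x}_{t},t) =
+  \boldsymbol{\epsilon}_{\theta}(\mathbf{x}_{t},t)
+  + s(t)\,\nabla_{\mathbf{x}_{t}}
+    \bigl\|\mathcal{C}\bigl(\hat{\mathbf{x}}_{0}(\mathbf{x}_{t})\bigr)
+    -\mathbf{c}^{*}\bigr\|_{2}^{2},
+```
+
+ where $\mathcal{C}(\cdot)$ downsamples the current estimate to a coarse
+color map and $s(t)$ is the guidance scale.
+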
+
+ comment: Submitted to IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds ICIP + + +
+ While deep learning-based methods have demonstrated outstanding results in +numerous domains, some important functionalities are missing. Resolution +scalability is one of them. In this work, we introduce a novel architecture, +dubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of +point clouds. In contrast to existing works, the proposed method does not +require the whole point cloud to be available to start inference. Once a +low-resolution version of the input point cloud is available, first semantic +predictions can be generated in an extremely fast manner. This enables early +decision-making in subsequent processing steps. As additional points become +available, these are processed in parallel. To improve performance, features +from previously computed scales are employed as prior knowledge at the current +scale. Our experiments show that RESSCAL3D is 31-62% faster than the +non-scalable baseline while keeping a limited impact on performance. To the +best of our knowledge, the proposed method is the first to propose a +resolution-scalable approach for 3D semantic segmentation of point clouds based +on deep learning. + +
+
+ comment: Published at 2023 IEEE International Conference on Image Processing + (ICIP) +
+
+
+
+
+ + ☆ Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection plays a crucial role in autonomous driving by extracting +structural and traffic information from the road in 3D space to assist the +self-driving car in rational, safe, and comfortable path planning and motion +control. Due to the consideration of sensor costs and the advantages of visual +data in color information, in practical applications, 3D lane detection based +on monocular vision is one of the important research directions in the field of +autonomous driving, which has attracted more and more attention in both +industry and academia. Unfortunately, recent progress in visual perception +seems insufficient to develop completely reliable 3D lane detection algorithms, +which also hinders the development of vision-based fully autonomous +self-driving cars, i.e., achieving level 5 autonomous driving, driving like +human-controlled cars. This is one of the conclusions drawn from this review +paper: there is still a lot of room for improvement and significant +improvements are still needed in the 3D lane detection algorithm for autonomous +driving cars using visual sensors. Motivated by this, this review defines, +analyzes, and reviews the current achievements in the field of 3D lane +detection research, and the vast majority of the current progress relies +heavily on computationally complex deep learning models. In addition, this +review covers the 3D lane detection pipeline, investigates the performance of +state-of-the-art algorithms, analyzes the time complexity of cutting-edge +modeling choices, and highlights the main achievements and limitations of +current research efforts. The survey also includes a comprehensive discussion +of available 3D lane detection datasets and the challenges that researchers +have faced but have not yet resolved. Finally, our work outlines future +research directions and welcomes researchers and practitioners to enter this +exciting field. + +
+
+
+
+
+ + ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that
+poses significant challenges. Previous studies have primarily focused on
+scenarios such as Domain Incremental Learning and Class Incremental Learning,
+which do not fully capture the complexity of real-world applications. In this
+paper, we study the problem of classification of medical imaging in the
+scenario termed New Instances & New Classes, which combines the challenges of
+both new class arrivals and domain shifts in a single framework. Unlike
+traditional scenarios, it reflects the realistic nature of CL in domains such
+as medical imaging, where updates may introduce both new classes and changes
+in domain characteristics. To address the unique challenges posed by this
+complex scenario, we introduce a novel approach called Pseudo-Label Replay.
+This method aims to mitigate forgetting while adapting to new classes and
+domain shifts by combining the advantages of the Replay and Pseudo-Label
+methods and solving their limitations in the proposed scenario. We evaluate
+our proposed approach on a challenging benchmark consisting of two datasets,
+seven tasks, and nineteen classes, modeling a realistic Continual Learning
+scenario. Our experimental findings demonstrate the effectiveness of
+Pseudo-Label Replay in addressing the challenges posed by the proposed
+complex scenario. Our method surpasses existing approaches, exhibiting
+superior performance while showing minimal forgetting.
+
+
+
+
+
+ + ☆ UDiFF: Generating Conditional Unsigned Distance Fields with Optimal + Wavelet Diffusion CVPR2024 + + +
+ Diffusion models have shown remarkable results for image generation,
+editing and inpainting. Recent works explore diffusion models for 3D shape
+generation with neural implicit functions, i.e., signed distance function
+and occupancy function. However, they are limited to shapes with closed
+surfaces, which prevents them from generating diverse 3D real-world contents
+containing open surfaces. In this work, we present UDiFF, a 3D diffusion
+model for unsigned distance fields (UDFs) which is capable of generating
+textured 3D shapes with open surfaces from text conditions or
+unconditionally. Our key idea is to generate UDFs in the spatial-frequency
+domain with an optimal wavelet transformation, which produces a compact
+representation space for UDF generation. Specifically, instead of selecting
+an appropriate wavelet transformation, which requires expensive manual
+effort and still leads to large information loss, we propose a data-driven
+approach to learn the optimal wavelet transformation for UDFs. We evaluate
+UDiFF to show our advantages by numerical and visual comparisons with the
+latest methods on widely used benchmarks. Page:
+https://weiqi-zhang.github.io/UDiFF.
+
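+ As a rough illustration of representing a distance field in a wavelet
+(spatial-frequency) domain: the paper learns its wavelet filters, whereas
+the minimal Python sketch below uses a fixed Haar wavelet from PyWavelets
+purely to show how a UDF grid compresses into a coarse low-frequency band.
+
+```python
+import numpy as np
+import pywt  # PyWavelets
+
+# Toy unsigned distance field on a 64^3 grid: distance to a sphere of radius 0.5.
+res = 64
+grid = np.stack(np.meshgrid(*[np.linspace(-1, 1, res)] * 3, indexing="ij"), -1)
+udf = np.abs(np.linalg.norm(grid, axis=-1) - 0.5)
+
+# One level of a fixed 3D Haar transform; the "aaa" band is a 32^3 summary.
+coeffs = pywt.dwtn(udf, wavelet="haar")
+print(coeffs["aaa"].shape)  # (32, 32, 32)
+
+# Reconstructing from the coarse band alone shows what detail is discarded,
+# which is the information loss a learned transform would try to minimize.
+zeroed = {k: (v if k == "aaa" else np.zeros_like(v)) for k, v in coeffs.items()}
+approx = pywt.idwtn(zeroed, wavelet="haar")
+print(np.abs(approx - udf).mean())
+```
+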
+
+ comment: To appear at CVPR2024. Project page: + https://weiqi-zhang.github.io/UDiFF +
+
+
+
+
+ + ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress.
+However, existing methods inevitably lose geometrical structure information
+during the feature channel generation process, resulting in edge detail
+mismatches. In this paper, the Motif Channel Attention Stereo Matching
+Network (MoCha-Stereo) is designed to address this problem. We provide the
+Motif Channel Correlation Volume (MCCV) to determine more accurate edge
+matching costs. MCCV is achieved by projecting motif channels, which capture
+common geometric structures in feature channels, onto feature maps and cost
+volumes. In addition, since edge variations in feature channels of the
+reconstruction error map also affect detail matching, we propose the
+Reconstruction Error Motif Penalty (REMP) module to further refine the
+full-resolution disparity estimation. REMP integrates the frequency
+information of typical channel features from the reconstruction error.
+MoCha-Stereo ranks 1st on the KITTI-2015 and KITTI-2012 Reflective
+leaderboards. Our structure also shows excellent performance in Multi-View
+Stereo. Code is available at https://github.com/ZYangChen/MoCha-Stereo.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit + Representation + + +
+ Online construction of open-ended language scenes is crucial for robotic
+applications, where open-vocabulary interactive scene understanding is
+required. Recently, neural implicit representation has provided a promising
+direction for online interactive mapping. However, implementing
+open-vocabulary scene understanding capability into online neural implicit
+mapping still faces three challenges: lack of local scene updating ability,
+blurry spatial hierarchical semantic segmentation, and difficulty in
+maintaining multi-view consistency. To this end, we propose O2V-mapping,
+which utilizes voxel-based language and geometric features to create an
+open-vocabulary field, thus allowing for local updates during the online
+training process. Additionally, we leverage a foundational model for image
+segmentation to extract language features on object-level entities,
+achieving clear segmentation boundaries and hierarchical semantic features.
+For the purpose of preserving consistency in 3D object properties across
+different viewpoints, we propose a spatial adaptive voxel adjustment
+mechanism and a multi-view weight selection method. Extensive experiments on
+open-vocabulary object localization and semantic segmentation demonstrate
+that O2V-mapping achieves online construction of language scenes while
+enhancing accuracy, outperforming the previous SOTA method.
+
+
+
+
+
+ + ☆ Tuning-Free Adaptive Style Incorporation for Structure-Consistent + Text-Driven Style Transfer + + +
+ In this work, we target the task of text-driven style transfer in the
+context of text-to-image (T2I) diffusion models. The main challenge is
+consistent structure preservation while enabling effective style transfer
+effects. Past approaches in this field directly concatenate the content and
+style prompts for a prompt-level style injection, leading to unavoidable
+structure distortions. In this work, we propose a novel solution to the
+text-driven style transfer task, namely, Adaptive Style
+Incorporation~(ASI), to achieve fine-grained feature-level style
+incorporation. It consists of the Siamese Cross-Attention~(SiCA), which
+decouples the single-track cross-attention into a dual-track structure to
+obtain separate content and style features, and the Adaptive Content-Style
+Blending (AdaBlending) module, which couples the content and style
+information in a structure-consistent manner. Experimentally, our method
+exhibits much better performance in both structure preservation and
+stylized effects.
+
+
+
+
+
+ + ☆ SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection CVPR 2024 + + +
+ Detecting anomalies in images has become a well-explored problem in both +academia and industry. State-of-the-art algorithms are able to detect defects +in increasingly difficult settings and data modalities. However, most current +methods are not suited to address 3D objects captured from differing poses. +While solutions using Neural Radiance Fields (NeRFs) have been proposed, they +suffer from excessive computation requirements, which hinder real-world +usability. For this reason, we propose the novel 3D Gaussian splatting-based +framework SplatPose which, given multi-view images of a 3D object, accurately +estimates the pose of unseen views in a differentiable manner, and detects +anomalies in them. We achieve state-of-the-art results in both training and +inference speed, and detection performance, even when using less training data +than competing methods. We thoroughly evaluate our framework using the recently +proposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly +detection (MAD) data set. + +
+
+ comment: Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Zero-shot Point Cloud Completion Via 2D Priors + + +
+ 3D point cloud completion is designed to recover complete shapes from
+partially observed point clouds. Conventional completion methods typically
+depend on extensive point cloud data for training, with their effectiveness
+often constrained to object categories similar to those seen during
+training. In contrast, we propose a zero-shot framework aimed at completing
+partially observed point clouds across any unseen categories. Leveraging
+point rendering via Gaussian Splatting, we develop techniques of Point Cloud
+Colorization and Zero-shot Fractal Completion that utilize 2D priors from
+pre-trained diffusion models to infer missing regions. Experimental results
+on both synthetic and real-world scanned point clouds demonstrate that our
+approach outperforms existing methods in completing a variety of objects
+without any requirement for specific training data.
+
+
+
+
+
+ + ☆ MedRG: Medical Report Grounding with Multi-modal Large Language Model + + +
+ Medical Report Grounding is pivotal in identifying the most relevant
+regions in medical images based on a given phrase query, a critical aspect
+in medical image analysis and radiological diagnosis. However, prevailing
+visual grounding approaches necessitate the manual extraction of key phrases
+from medical reports, imposing substantial burdens on both system efficiency
+and physicians. In this paper, we introduce a novel framework, Medical
+Report Grounding (MedRG), an end-to-end solution that utilizes a multi-modal
+Large Language Model to predict key phrases by incorporating a unique token,
+BOX, into the vocabulary to serve as an embedding for unlocking detection
+capabilities. Subsequently, the vision encoder-decoder jointly decodes the
+hidden embedding and the input medical image, generating the corresponding
+grounding box. The experimental results validate the effectiveness of MedRG,
+surpassing the performance of the existing state-of-the-art medical phrase
+grounding methods. This study represents a pioneering exploration of the
+medical report grounding task, marking the first-ever endeavor in this
+domain.
+
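+ As a rough illustration of adding a detection token to an LLM vocabulary,
+the minimal Hugging Face sketch below uses a placeholder GPT-2 backbone; the
+actual MedRG model, token spelling, and box-regression head are not given in
+the abstract, so every name here is an assumption.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Placeholder backbone; the paper's multi-modal LLM is not specified here.
+name = "gpt2"
+tokenizer = AutoTokenizer.from_pretrained(name)
+model = AutoModelForCausalLM.from_pretrained(name)
+
+# Register a dedicated <BOX> token and grow the embedding matrix to match.
+tokenizer.add_special_tokens({"additional_special_tokens": ["<BOX>"]})
+model.resize_token_embeddings(len(tokenizer))
+box_id = tokenizer.convert_tokens_to_ids("<BOX>")
+
+# The hidden state at the <BOX> position is what a box-regression head
+# (not shown) would decode into a grounding box.
+ids = tokenizer("Finding: small opacity in the left lower lobe <BOX>",
+                return_tensors="pt").input_ids
+hidden = model(ids, output_hidden_states=True).hidden_states[-1]
+box_embedding = hidden[0, (ids[0] == box_id).nonzero(as_tuple=True)[0]]
+print(box_embedding.shape)
+```
+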
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior + + +
+ Text-to-3D generation has achieved remarkable success via large-scale +text-to-image diffusion models. Nevertheless, there is no paradigm for scaling +up the methodology to urban scale. Urban scenes, characterized by numerous +elements, intricate arrangement relationships, and vast scale, present a +formidable barrier to the interpretability of ambiguous textual descriptions +for effective model optimization. In this work, we surmount the limitations by +introducing a compositional 3D layout representation into text-to-3D paradigm, +serving as an additional prior. It comprises a set of semantic primitives with +simple geometric structures and explicit arrangement relationships, +complementing textual descriptions and enabling steerable generation. Upon +this, we propose two modifications -- (1) We introduce Layout-Guided +Variational Score Distillation to address model optimization inadequacies. It +conditions the score distillation sampling process with geometric and semantic +constraints of 3D layouts. (2) To handle the unbounded nature of urban scenes, +we represent 3D scene with a Scalable Hash Grid structure, incrementally +adapting to the growing scale of urban scenes. Extensive experiments +substantiate the capability of our framework to scale text-to-3D generation to +large-scale urban scenes that cover over 1000m driving distance for the first +time. We also present various scene editing demonstrations, showing the powers +of steerable urban scene generation. Website: https://urbanarchitect.github.io. + +
+
+ comment: Project page: https://urbanarchitect.github.io/ +
+
+
+
+
+ + ☆ Efficient and Scalable Chinese Vector Font Generation via Component + Composition + + +
+ Chinese vector font generation is challenging due to the complex structure
+and huge number of Chinese characters. Recent advances remain limited to
+generating a small set of characters with simple structure. In this work, we
+first observe that most Chinese characters can be disassembled into
+frequently-reused components. Therefore, we introduce the first efficient
+and scalable Chinese vector font generation approach via component
+composition, allowing numerous vector characters to be generated from a
+small set of components. To achieve this, we collect a large-scale dataset
+that contains over \textit{90K} Chinese characters with their components and
+layout information. Building on this dataset, we propose a simple yet
+effective framework based on spatial transformer networks (STN) and multiple
+losses tailored to font characteristics to learn the affine transformation
+of the components, which can be directly applied to the B\'ezier curves,
+resulting in Chinese characters in vector format. Our qualitative and
+quantitative experiments demonstrate that our method significantly surpasses
+state-of-the-art vector font generation methods in generating large-scale
+complex Chinese characters, in both font generation and zero-shot font
+extension.
+
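+ The reason an STN-predicted affine map can act directly on vector outlines
+is that Bezier curves are affine-invariant: transforming the control points
+transforms the whole curve. A minimal numpy sketch of this step follows,
+with made-up layout parameters (the paper's actual transformations are
+learned, not hand-set).
+
+```python
+import numpy as np
+
+def apply_affine(control_points, A, t):
+    """Affine-transform Bezier control points: p -> A @ p + t.
+
+    Because Bezier curves are affine-invariant, transforming the control
+    points transforms the curve, so a predicted affine can be applied
+    directly to a component's vector outline.
+    """
+    return control_points @ A.T + t
+
+# A toy "component": one cubic Bezier segment (4 control points in em units).
+component = np.array([[0.1, 0.2], [0.3, 0.9], [0.6, 0.9], [0.8, 0.2]])
+
+# Hypothetical layout: shrink to 45% width / 50% height, shear slightly,
+# and move the component toward the upper right of the character box.
+A = np.array([[0.45, 0.05],
+              [0.00, 0.50]])
+t = np.array([0.50, 0.45])
+
+placed = apply_affine(component, A, t)
+print(placed.round(3))
+```
+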
+
+ comment: 15 pages, 23 figures +
+
+
+
+
+ + ☆ Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image + Diffusion Models + + +
+ Text-to-image diffusion models have shown great success in generating
+high-quality text-guided images. Yet, these models may still fail to
+semantically align generated images with the provided text prompts, leading
+to problems like incorrect attribute binding and/or catastrophic object
+neglect. Given the pervasive object-oriented structure underlying text
+prompts, we introduce a novel object-conditioned Energy-Based Attention Map
+Alignment (EBAMA) method to address the aforementioned problems. We show
+that an object-centric attribute binding loss naturally emerges by
+approximately maximizing the log-likelihood of a $z$-parameterized
+energy-based model with the help of the negative sampling technique. We
+further propose an object-centric intensity regularizer to prevent excessive
+shifts of objects' attention towards their attributes. Extensive qualitative
+and quantitative experiments, including human evaluation, on several
+challenging benchmarks demonstrate the superior performance of our method
+over previous strong counterparts. With better aligned attention maps, our
+approach shows great promise in further enhancing the text-controlled image
+editing ability of diffusion models.
+
+
+
+
+
+ + ☆ Deep Generative Sampling in the Dual Divergence Space: A Data-efficient + & Interpretative Approach for Generative AI + + +
+ Building on the remarkable achievements in generative sampling of natural +images, we propose an innovative challenge, potentially overly ambitious, which +involves generating samples of entire multivariate time series that resemble +images. However, the statistical challenge lies in the small sample size, +sometimes consisting of a few hundred subjects. This issue is especially +problematic for deep generative models that follow the conventional approach of +generating samples from a canonical distribution and then decoding or denoising +them to match the true data distribution. In contrast, our method is grounded +in information theory and aims to implicitly characterize the distribution of +images, particularly the (global and local) dependency structure between +pixels. We achieve this by empirically estimating its KL-divergence in the dual +form with respect to the respective marginal distribution. This enables us to +perform generative sampling directly in the optimized 1-D dual divergence +space. Specifically, in the dual space, training samples representing the data +distribution are embedded in the form of various clusters between two end +points. In theory, any sample embedded between those two end points is +in-distribution w.r.t. the data distribution. Our key idea for generating novel +samples of images is to interpolate between the clusters via a walk as per +gradients of the dual function w.r.t. the data dimensions. In addition to the +data efficiency gained from direct sampling, we propose an algorithm that +offers a significant reduction in sample complexity for estimating the +divergence of the data distribution with respect to the marginal distribution. +We provide strong theoretical guarantees along with an extensive empirical +evaluation using many real-world datasets from diverse domains, establishing +the superiority of our approach w.r.t. state-of-the-art deep learning methods. + +
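+ For orientation, the "dual form" of the KL divergence referred to above is
+presumably a variational representation such as the Donsker-Varadhan bound
+(an assumption on our part, stated only to fix ideas):
+
+\[
+\mathrm{KL}(P \,\|\, Q) \;=\; \sup_{T}\;
+\mathbb{E}_{x \sim P}\big[T(x)\big] \;-\; \log \mathbb{E}_{x \sim Q}\big[e^{T(x)}\big],
+\]
+
+where the supremum runs over functions $T$ for which both expectations
+exist. Realizing $T$ as a neural network and reading off its scalar outputs
+gives a 1-D embedding, which matches the abstract's description of walking
+along gradients of the dual function to generate new samples.
+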
+
+
+
+
+ + ☆ Improving Multi-Center Generalizability of GAN-Based Fat Suppression + using Federated Learning + + +
+ Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS) +MRIs from non-FS proton density sequences has the potential to accelerate +acquisition of knee MRIs. However, GANs trained on single-site data have poor +generalizability to external data. We show that federated learning can improve +multi-center generalizability of GANs for synthesizing FS MRIs, while +facilitating privacy-preserving multi-institutional collaborations. + +
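+ The abstract does not spell out the aggregation rule; a plain federated
+averaging (FedAvg-style) round, the usual baseline for this kind of
+multi-site GAN training, can be sketched as follows (site names and
+weighting are illustrative assumptions):
+
+```python
+import copy
+import torch
+
+def fedavg(state_dicts, weights):
+    """Weighted average of per-site model state_dicts (FedAvg-style)."""
+    total = float(sum(weights))
+    avg = copy.deepcopy(state_dicts[0])
+    for key, ref in avg.items():
+        if ref.dtype.is_floating_point:
+            avg[key] = sum(w * sd[key] for sd, w in zip(state_dicts, weights)) / total
+        # integer buffers (e.g. BatchNorm counters) are kept from the first site
+    return avg
+
+# Toy demo: two sites contribute generator weights, weighted by dataset size.
+site_a = {"weight": torch.ones(2, 2), "steps": torch.tensor(10)}
+site_b = {"weight": torch.zeros(2, 2), "steps": torch.tensor(30)}
+print(fedavg([site_a, site_b], weights=[100, 300])["weight"])  # 0.25 everywhere
+```
+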
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic + Microplastics Data + + +
+ Microplastic particle ingestion or inhalation by humans is a problem of +growing concern. Unfortunately, current research methods that use machine +learning to understand their potential harms are obstructed by a lack of +available data. Deep learning techniques in particular are challenged by such +domains where only small or imbalanced data sets are available. Overcoming this +challenge often involves oversampling underrepresented classes or augmenting +the existing data to improve model performance. This paper proposes GANsemble: +a two-module framework connecting data augmentation with conditional generative +adversarial networks (cGANs) to generate class-conditioned synthetic data. +First, the data chooser module automates augmentation strategy selection by +searching for the best data augmentation strategy. Next, the cGAN module uses +this strategy to train a cGAN for generating enhanced synthetic data. We +experiment with the GANsemble framework on a small and imbalanced microplastics +data set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines +for synthetic microplastics (SYMP) data are established in terms of Frechet +Inception Distance (FID) and Inception Scores (IS). We also provide a synthetic +microplastics filter (SYMP-Filter) algorithm to increase the quality of +generated SYMP. Additionally, we show the best amount of oversampling with +augmentation to fix class imbalance in small microplastics data sets. To our +knowledge, this study is the first application of generative AI to +synthetically create microplastics data. + +
+
+ comment: Accepted to the 37th Canadian Artificial Intelligence Conference + (2024), 12 pages, 4 figures +
+
+
+
+
+ + ☆ A Transformer-Based Model for the Prediction of Human Gaze Behavior on + Videos + + +
+ Eye-tracking applications that utilize the human gaze in video understanding +tasks have become increasingly important. To effectively automate the process +of video analysis based on eye-tracking data, it is important to accurately +replicate human gaze behavior. However, this task presents significant +challenges due to the inherent complexity and ambiguity of human gaze patterns. +In this work, we introduce a novel method for simulating human gaze behavior. +Our approach uses a transformer-based reinforcement learning algorithm to train +an agent that acts as a human observer, with the primary role of watching +videos and simulating human gaze behavior. We employed an eye-tracking dataset +gathered from videos generated by the VirtualHome simulator, with a primary +focus on activity recognition. Our experimental results demonstrate the +effectiveness of our gaze prediction method by highlighting its capability to +replicate human gaze behavior and its applicability for downstream tasks where +real human-gaze is used as input. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on + Intention + + +
+ Humans utilize their gaze to concentrate on essential information while +perceiving and interpreting intentions in videos. Incorporating human gaze into +computational algorithms can significantly enhance model performance in video +understanding tasks. In this work, we address a challenging and innovative task +in video understanding: predicting the actions of an agent in a video based on +a partial video. We introduce the Gaze-guided Action Anticipation algorithm, +which establishes a visual-semantic graph from the video input. Our method +utilizes a Graph Neural Network to recognize the agent's intention and predict +the action sequence to fulfill this intention. To assess the efficiency of our +approach, we collect a dataset containing household activities generated in the +VirtualHome environment, accompanied by human gaze data of viewing videos. Our +method outperforms state-of-the-art techniques, achieving a 7\% improvement in +accuracy for 18-class intention recognition. This highlights the efficiency of +our method in learning important features from human gaze data. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in + Viewers' Opinion Scores + + +
+ Recent advancements in audio-visual generative modeling have been propelled
+by progress in deep learning and the availability of data-rich benchmarks.
+However, the growth is not attributed solely to models and benchmarks.
+Universally accepted evaluation metrics also play an important role in
+advancing the field. While there are many metrics available to evaluate
+audio and visual content separately, there is a lack of metrics that offer a
+quantitative and interpretable measure of audio-visual synchronization for
+videos "in the wild". To address this gap, we first created a large scale
+human annotated dataset (100+ hrs) representing nine types of
+synchronization errors in audio-visual content and how humans perceive them.
+We then developed PEAVS (Perceptual Evaluation of Audio-Visual Synchrony), a
+novel automatic metric with a 5-point scale that evaluates the quality of
+audio-visual synchronization. We validate PEAVS using a newly generated
+dataset, achieving a Pearson correlation of 0.79 at the set level and 0.54
+at the clip level when compared to human labels. In our experiments, we
+observe a relative gain of 50% over a natural extension of Fr\'echet based
+metrics for audio-visual synchrony, confirming PEAVS efficacy in objectively
+modeling subjective perceptions of audio-visual synchronization for videos
+"in the wild".
+
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Rethinking Perceptual Metrics for Medical Image Translation + + +
+ Modern medical image translation methods use generative models for tasks such +as the conversion of CT images to MRI. Evaluating these methods typically +relies on some chosen downstream task in the target domain, such as +segmentation. On the other hand, task-agnostic metrics are attractive, such as +the network feature-based perceptual metrics (e.g., FID) that are common to +image translation in general computer vision. In this paper, we investigate +evaluation metrics for medical image translation on two medical image +translation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to +CT), tested on various state-of-the-art translation methods. We show that +perceptual metrics do not generally correlate with segmentation metrics due to +them extending poorly to the anatomical constraints of this sub-field, with FID +being especially inconsistent. However, we find that the lesser-used +pixel-level SWD metric may be useful for subtle intra-modality translation. Our +results demonstrate the need for further research into helpful metrics for +medical image translation. + +
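+ For reference, the pixel-level sliced Wasserstein distance (SWD) highlighted
+above can be approximated with random 1-D projections; the numpy sketch
+below is a generic Monte-Carlo estimator for equal-sized sets of flattened
+patches, not the exact protocol used in the paper.
+
+```python
+import numpy as np
+
+def sliced_wasserstein(x, y, n_proj=128, seed=0):
+    """Monte-Carlo sliced 1-Wasserstein distance between two point sets.
+
+    x, y: arrays of shape (n, d) with the same n, e.g. flattened image
+    patches from a translated volume and from the reference volume.
+    """
+    rng = np.random.default_rng(seed)
+    d = x.shape[1]
+    dirs = rng.normal(size=(n_proj, d))
+    dirs /= np.linalg.norm(dirs, axis=1, keepdims=True)
+    # Project, sort each 1-D projection, and average the coordinate-wise gaps.
+    px = np.sort(x @ dirs.T, axis=0)
+    py = np.sort(y @ dirs.T, axis=0)
+    return np.abs(px - py).mean()
+
+a = np.random.rand(1000, 49)          # e.g. 7x7 patches from a generated CT
+b = np.random.rand(1000, 49) + 0.1    # and from the reference CT
+print(sliced_wasserstein(a, b))
+```
+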
+
+
+
+
+ + ☆ AI-Guided Defect Detection Techniques to Model Single Crystal Diamond + Growth + + +
+ From a process development perspective, diamond growth via chemical vapor
+deposition has made significant strides. However, challenges persist in
+achieving high quality and large-area material production. These
+difficulties include controlling conditions to maintain uniform growth rates
+for the entire growth surface. As growth progresses, various factors or
+defect states emerge, altering the uniform conditions. These changes affect
+the growth rate and result in the formation of crystalline defects at the
+microscale. However, there is a distinct lack of methods to identify these
+defect states and their geometry using images taken during the growth
+process. This paper details seminal work on a defect segmentation pipeline
+that uses in-situ optical images to identify features indicating defective
+states visible at the macroscale. Using a semantic segmentation approach as
+applied in our previous work, these defect states and corresponding
+derivative features are isolated and classified by their pixel masks. Using
+an annotation-focused human-in-the-loop software architecture to produce
+training datasets, with modules for selective data labeling using active
+learning, data augmentations, and model-assisted labeling, our approach
+achieves effective annotation accuracy and drastically reduces the time and
+cost of labeling by orders of magnitude. On the model development front, we
+found that deep learning-based algorithms are the most efficient. They can
+accurately learn complex representations from feature-rich datasets. Our
+best-performing model, based on the YOLOV3 and DeeplabV3plus architectures,
+achieved excellent accuracy for specific features of interest. Specifically,
+it reached 93.35% accuracy for center defects, 92.83% for polycrystalline
+defects, and 91.98% for edge defects.
+
+
+ comment: 12 pages,4 figures,ACMME 2024 +
+
+
+
+
+ + ☆ Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers + + +
+ Solving image and video jigsaw puzzles poses the challenging task of +rearranging image fragments or video frames from unordered sequences to restore +meaningful images and video sequences. Existing approaches often hinge on +discriminative models tasked with predicting either the absolute positions of +puzzle elements or the permutation actions applied to the original data. +Unfortunately, these methods face limitations in effectively solving puzzles +with a large number of elements. In this paper, we propose JPDVT, an innovative +approach that harnesses diffusion transformers to address this challenge. +Specifically, we generate positional information for image patches or video +frames, conditioned on their underlying visual content. This information is +then employed to accurately assemble the puzzle pieces in their correct +positions, even in scenarios involving missing pieces. Our method achieves +state-of-the-art performance on several datasets. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Logit Calibration and Feature Contrast for Robust Federated Learning on + Non-IID Data + + +
+ Federated learning (FL) is a privacy-preserving distributed framework for +collaborative model training on devices in edge networks. However, challenges +arise due to vulnerability to adversarial examples (AEs) and the +non-independent and identically distributed (non-IID) nature of data +distribution among devices, hindering the deployment of adversarially robust +and accurate learning models at the edge. While adversarial training (AT) is +commonly acknowledged as an effective defense strategy against adversarial +attacks in centralized training, we shed light on the adverse effects of +directly applying AT in FL that can severely compromise accuracy, especially in +non-IID challenges. Given this limitation, this paper proposes FatCC, which +incorporates local logit \underline{C}alibration and global feature +\underline{C}ontrast into the vanilla federated adversarial training +(\underline{FAT}) process from both logit and feature perspectives. This +approach can effectively enhance the federated system's robust accuracy (RA) +and clean accuracy (CA). First, we propose logit calibration, where the logits +are calibrated during local adversarial updates, thereby improving adversarial +robustness. Second, FatCC introduces feature contrast, which involves a global +alignment term that aligns each local representation with unbiased global +features, thus further enhancing robustness and accuracy in federated +adversarial environments. Extensive experiments across multiple datasets +demonstrate that FatCC achieves comparable or superior performance gains in +both CA and RA compared to other baselines. + +
+
+
+
+
+ + ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which
+were originally designed for large language models (LLMs), can be adapted to
+the computer vision field. We first "LLaMAfy" a standard ViT step-by-step to
+align with LLaMA's architecture, and find that directly applying a causal
+mask to the self-attention brings an attention collapse issue, resulting in
+the failure of network training. We suggest repositioning the class token
+behind the image tokens with a post-sequence class token technique to
+overcome this challenge, enabling causal self-attention to efficiently
+capture the entire image's information. Additionally, we develop a soft mask
+strategy that gradually introduces a causal mask to the self-attention at
+the onset of training to facilitate the optimization behavior. The tailored
+model, dubbed image LLaMA (iLLaMA), is akin to LLaMA in architecture and
+enables direct supervised learning. Its causal self-attention boosts
+computational efficiency and learns complex representation by elevating
+attention map ranks. iLLaMA rivals the performance of its encoder-only
+counterparts, achieving 75.1% ImageNet top-1 accuracy with only 5.7M
+parameters. Scaling the model to ~310M and pre-training on ImageNet-21K
+further enhances the accuracy to 86.0%. Extensive experiments demonstrate
+iLLaMA's reliable properties: calibration, shape-texture bias, quantization
+compatibility, ADE20K segmentation and CIFAR transfer learning. We hope our
+study can kindle fresh views to visual model design in the wave of LLMs.
+Pre-trained models and codes are available here.
+
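+ To make the two masking ideas above concrete, here is a minimal PyTorch
+sketch: the class token is appended after the image tokens, and the causal
+mask is introduced gradually by masking each forbidden position with
+probability p. This is a crude stand-in for the paper's soft-mask schedule,
+which the abstract does not specify.
+
+```python
+import torch
+
+def gradually_causal_mask(num_patches, p, generator=None):
+    """Additive attention mask over [patch_1..patch_N, cls] tokens.
+
+    p=0 gives bidirectional attention, p=1 the full causal mask. Because the
+    class token sits *after* the image tokens, even the full causal mask
+    still lets it attend to every patch.
+    """
+    n = num_patches + 1                                    # +1 post-sequence cls
+    causal = torch.triu(torch.ones(n, n), diagonal=1)      # 1 = would-be masked
+    keep = (torch.rand(n, n, generator=generator) < p).float()
+    return causal * keep * torch.finfo(torch.float32).min  # add to attn logits
+
+print(gradually_causal_mask(num_patches=4, p=1.0))  # last row all zeros: cls sees all
+```
+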
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D + Reconstruction of Indoor Scenes from Monocular RGB Views + + +
+ Current monocular 3D scene reconstruction (3DR) works are either
+fully-supervised, or not generalizable, or implicit in 3D representation. We
+propose a novel framework - MonoSelfRecon that for the first time achieves
+explicit 3D mesh reconstruction for generalizable indoor scenes with
+monocular RGB views by pure self-supervision on voxel-SDF (signed distance
+function). MonoSelfRecon follows an Autoencoder-based architecture and
+decodes voxel-SDF and a generalizable Neural Radiance Field (NeRF), which is
+used to guide the voxel-SDF in self-supervision. We propose novel
+self-supervised losses, which not only support pure self-supervision, but
+can be used together with supervised signals to further boost supervised
+training. Our experiments show that MonoSelfRecon trained with pure
+self-supervision outperforms the current best self-supervised indoor depth
+estimation models and is comparable to 3DR models trained with full
+supervision using depth annotations. MonoSelfRecon is not restricted to a
+specific model design and can be applied to any model with voxel-SDF in a
+purely self-supervised manner.
+
+
+
+
+
+ + ☆ YOLO based Ocean Eddy Localization with AWS SageMaker + + +
+ Ocean eddies play a significant role both on the sea surface and beneath
+it, contributing to the sustainability of marine life dependent on oceanic
+behaviors. Therefore, it is crucial to investigate ocean eddies to monitor
+changes in the Earth, particularly in the oceans, and their impact on
+climate. This study aims to pinpoint ocean eddies using AWS cloud services,
+specifically SageMaker. The primary objective is to detect small-scale
+(<20km) ocean eddies from satellite remote images and assess the feasibility
+of utilizing SageMaker, which offers tools for deploying AI applications.
+Moreover, this research not only explores the deployment of cloud-based
+services for remote sensing of Earth data but also evaluates several YOLO
+(You Only Look Once) models using single and multi-GPU-based services in the
+cloud. Furthermore, this study underscores the potential of these services,
+their limitations, challenges related to deployment and resource management,
+and their user-friendliness for Earth science projects.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video + + +
+ The study of action recognition has attracted considerable attention
+recently due to its broad applications in multiple areas. However, the issue
+of discontinuous training video, which not only decreases the performance of
+action recognition models but also complicates the data augmentation
+process, remains under-explored. In this study, we introduce 4A (Action
+Animation-based Augmentation Approach), an innovative pipeline for data
+augmentation to address this problem. The main contributions of our work
+include: (1) we investigate the problem of the severe decrease in the
+performance of action recognition tasks trained on discontinuous video, and
+the limitation of existing augmentation methods in solving this problem; (2)
+we propose a novel augmentation pipeline, 4A, to address the problem of
+discontinuous video for training, while achieving a smoother and more
+natural-looking action representation than the latest data augmentation
+methodology; (3) we achieve the same performance with only 10% of the
+original data for training as with all of the original data from the
+real-world dataset, and better performance on in-the-wild videos, by
+employing our data augmentation techniques.
+
+
+
+
+
+ + ☆ Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural + Radiance Fields + + +
+ We present the Bayesian Neural Radiance Field (NeRF), which explicitly +quantifies uncertainty in geometric volume structures without the need for +additional networks, making it adept for challenging observations and +uncontrolled images. NeRF diverges from traditional geometric methods by +offering an enriched scene representation, rendering color and density in 3D +space from various viewpoints. However, NeRF encounters limitations in relaxing +uncertainties by using geometric structure information, leading to inaccuracies +in interpretation under insufficient real-world observations. Recent research +efforts aimed at addressing this issue have primarily relied on empirical +methods or auxiliary networks. To fundamentally address this issue, we propose +a series of formulational extensions to NeRF. By introducing generalized +approximations and defining density-related uncertainty, our method seamlessly +extends to manage uncertainty not only for RGB but also for depth, without the +need for additional networks or empirical assumptions. In experiments we show +that our method significantly enhances performance on RGB and depth images in +the comprehensive dataset, demonstrating the reliability of the Bayesian NeRF +approach to quantifying uncertainty based on the geometric structure. + +
+
+
+
+
+ + ☆ Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR + Data + + +
+ 3D detection is a critical task that enables machines to identify and
+locate objects in three-dimensional space. It has a broad range of
+applications in several fields, including autonomous driving, robotics and
+augmented reality. Monocular 3D detection is attractive as it requires only
+a single camera; however, it lacks the accuracy and robustness required for
+real-world applications. High resolution LiDAR, on the other hand, can be
+expensive and lead to interference problems in heavy traffic given its
+active transmissions. We propose a balanced approach that combines the
+advantages of monocular and point cloud-based 3D detection. Our method
+requires only a small number of 3D points, which can be obtained from a
+low-cost, low-resolution sensor. Specifically, we use only 512 points, which
+is just 1% of a full LiDAR frame in the KITTI dataset. Our method
+reconstructs a complete 3D point cloud from this limited 3D information
+combined with a single image. The reconstructed 3D point cloud and
+corresponding image can be used by any multi-modal off-the-shelf detector
+for 3D object detection. By using the proposed network architecture with an
+off-the-shelf multi-modal 3D detector, the accuracy of 3D detection improves
+by 20% compared to the state-of-the-art monocular detection methods and 6%
+to 9% compared to the baseline multi-modal methods on the KITTI and
+JackRabbot datasets.
+
+
+
+
+
+ + ☆ Convolution-based Probability Gradient Loss for Semantic Segmentation + + +
+ In this paper, we introduce a novel Convolution-based Probability Gradient +(CPG) loss for semantic segmentation. It employs convolution kernels similar to +the Sobel operator, capable of computing the gradient of pixel intensity in an +image. This enables the computation of gradients for both ground-truth and +predicted category-wise probabilities. It enhances network performance by +maximizing the similarity between these two probability gradients. Moreover, to +specifically enhance accuracy near the object's boundary, we extract the object +boundary based on the ground-truth probability gradient and exclusively apply +the CPG loss to pixels belonging to boundaries. CPG loss proves to be highly +convenient and effective. It establishes pixel relationships through +convolution, calculating errors from a distinct dimension compared to +pixel-wise loss functions such as cross-entropy loss. We conduct qualitative +and quantitative analyses to evaluate the impact of the CPG loss on three +well-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and +LRASPP_MobileNet_V3_Large) across three standard segmentation datasets +(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results +consistently and significantly demonstrate that the CPG loss enhances the mean +Intersection over Union. + +
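+ A simplified sketch of such a loss -- Sobel kernels applied per class to
+both the predicted softmax map and the one-hot ground truth, with the
+difference penalized -- is given below in PyTorch. The boundary-only
+restriction and the exact loss form used in the paper are omitted and should
+be treated as unknowns here.
+
+```python
+import torch
+import torch.nn.functional as F
+
+SOBEL_X = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
+SOBEL_Y = SOBEL_X.t()
+
+def probability_gradient(prob):
+    """Per-class spatial gradients of a (B, C, H, W) probability map."""
+    c = prob.shape[1]
+    k = torch.stack([SOBEL_X, SOBEL_Y]).unsqueeze(1).repeat(c, 1, 1, 1)  # (2C,1,3,3)
+    return F.conv2d(prob, k.to(prob), padding=1, groups=c)
+
+def cpg_loss(logits, target, num_classes, ignore_index=255):
+    """Simplified convolution-based probability-gradient loss (no boundary mask)."""
+    pred = logits.softmax(dim=1)
+    valid = (target != ignore_index)
+    onehot = F.one_hot(target.clamp(0, num_classes - 1), num_classes)
+    onehot = onehot.permute(0, 3, 1, 2).float()
+    diff = probability_gradient(pred) - probability_gradient(onehot)
+    return (diff.abs().mean(dim=1) * valid).sum() / valid.sum().clamp(min=1)
+
+logits = torch.randn(2, 19, 64, 64, requires_grad=True)
+target = torch.randint(0, 19, (2, 64, 64))
+print(cpg_loss(logits, target, num_classes=19))
+```
+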
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Scaling Multi-Camera 3D Object Detection through Weak-to-Strong + Eliciting + + +
+ The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by +bird's-eye view (BEV) representation, signifies a notable progression in 3D +object detection. Scaling MC3D-Det training effectively accommodates varied +camera parameters and urban landscapes, paving the way for the MC3D-Det +foundation model. However, the multi-view fusion stage of the MC3D-Det method +relies on the ill-posed monocular perception during training rather than +surround refinement ability, leading to what we term "surround refinement +degradation". To this end, our study presents a weak-to-strong eliciting +framework aimed at enhancing surround refinement while maintaining robust +monocular perception. Specifically, our framework employs weakly tuned experts +trained on distinct subsets, and each is inherently biased toward specific +camera configurations and scenarios. These biased experts can learn the +perception of monocular degeneration, which can help the multi-view fusion +stage to enhance surround refinement abilities. Moreover, a composite +distillation strategy is proposed to integrate the universal knowledge of 2D +foundation models and task-specific information. Finally, for MC3D-Det joint +training, the elaborate dataset merge strategy is designed to solve the problem +of inconsistent camera numbers and camera parameters. We set up a multiple +dataset joint training benchmark for MC3D-Det and adequately evaluated existing +methods. Further, we demonstrate the proposed framework brings a generalized +and significant boost over multiple baselines. Our code is at +\url{https://github.com/EnVision-Research/Scale-BEV}. + +
+
+
+
+
+ + ☆ Binomial Self-compensation for Motion Error in Dynamic 3D Scanning + + +
+ Phase shifting profilometry (PSP) is favored in high-precision 3D scanning +due to its high accuracy, robustness, and pixel-wise property. However, a +fundamental assumption of PSP that the object should remain static is violated +in dynamic measurement, making PSP susceptible to object moving, resulting in +ripple-like errors in the point clouds. We propose a pixel-wise and frame-wise +loopable binomial self-compensation (BSC) algorithm to effectively and flexibly +eliminate motion error in the four-step PSP. Our mathematical model +demonstrates that by summing successive motion-affected phase frames weighted +by binomial coefficients, motion error exponentially diminishes as the binomial +order increases, accomplishing automatic error compensation through the +motion-affected phase sequence, without the assistance of any intermediate +variable. Extensive experiments show that our BSC outperforms the existing +methods in reducing motion error, while achieving a depth map frame rate equal +to the camera's acquisition rate (90 fps), enabling high-accuracy 3D +reconstruction with a quasi-single-shot frame rate. + +
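+ To illustrate the weighting scheme described above, the numpy sketch below
+recovers a wrapped phase from four pi/2-shifted fringe images and then
+averages successive phase maps with binomial-coefficient weights. Phase
+wrapping and the paper's exact formulation are deliberately glossed over, so
+treat this as a toy model only.
+
+```python
+import numpy as np
+from math import comb
+
+def four_step_phase(frames):
+    """Wrapped phase from four pi/2-shifted fringe images I_k, k = 0..3."""
+    i0, i1, i2, i3 = frames
+    return np.arctan2(i3 - i1, i0 - i2)
+
+def binomial_self_compensation(phases, order):
+    """Simplified BSC: binomially weighted average of successive phase maps.
+
+    Weighting the most recent order+1 phase maps of a moving scene by
+    C(order, k) suppresses the motion error in the paper's analysis; the
+    residual shrinks as the order grows. Wrapping is ignored for brevity.
+    """
+    w = np.array([comb(order, k) for k in range(order + 1)], dtype=float)
+    w /= w.sum()
+    stack = np.stack(phases[-(order + 1):])        # most recent order+1 maps
+    return np.tensordot(w, stack, axes=1)
+
+# Static sanity check: recover phase from four synthetic fringe images.
+true_phase = np.linspace(0, np.pi, 256)
+frames = [1 + 0.5 * np.cos(true_phase + k * np.pi / 2) for k in range(4)]
+assert np.allclose(four_step_phase(frames), true_phase)
+
+# Toy dynamic case: alternating per-frame error plus a small drift.
+phases = [true_phase + 0.05 * (-1) ** k + 0.001 * k for k in range(4)]
+print(np.abs(binomial_self_compensation(phases, order=3) - true_phase).max())
+```
+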
+
+
+
+
+ + ☆ Perception-Oriented Video Frame Interpolation via Asymmetric Blending CVPR 2024 + + +
+ Previous methods for Video Frame Interpolation (VFI) have encountered +challenges, notably the manifestation of blur and ghosting effects. These +issues can be traced back to two pivotal factors: unavoidable motion errors and +misalignment in supervision. In practice, motion estimates often prove to be +error-prone, resulting in misaligned features. Furthermore, the reconstruction +loss tends to bring blurry results, particularly in misaligned regions. To +mitigate these challenges, we propose a new paradigm called PerVFI +(Perception-oriented Video Frame Interpolation). Our approach incorporates an +Asymmetric Synergistic Blending module (ASB) that utilizes features from both +sides to synergistically blend intermediate features. One reference frame +emphasizes primary content, while the other contributes complementary +information. To impose a stringent constraint on the blending process, we +introduce a self-learned sparse quasi-binary mask which effectively mitigates +ghosting and blur artifacts in the output. Additionally, we employ a +normalizing flow-based generator and utilize the negative log-likelihood loss +to learn the conditional distribution of the output, which further facilitates +the generation of clear and fine details. Experimental results validate the +superiority of PerVFI, demonstrating significant improvements in perceptual +quality compared to existing methods. Codes are available at +\url{https://github.com/mulns/PerVFI} + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Unsupervised Visible-Infrared ReID via Pseudo-label Correction and + Modality-level Alignment + + +
+ Unsupervised visible-infrared person re-identification (UVI-ReID) has
+recently gained great attention due to its potential for enhancing human
+detection in diverse environments without labeling. Previous methods utilize
+intra-modality clustering and cross-modality feature matching to achieve
+UVI-ReID. However, there exist two challenges: 1) noisy pseudo labels might
+be generated in the clustering process, and 2) the cross-modality feature
+alignment via matching the marginal distribution of visible and infrared
+modalities may misalign the different identities from the two modalities. In
+this paper, we first conduct a theoretic analysis in which an interpretable
+generalization upper bound is introduced. Based on the analysis, we then
+propose a novel unsupervised cross-modality person re-identification
+framework (PRAISE). Specifically, to address the first challenge, we propose
+a pseudo-label correction strategy that utilizes a Beta Mixture Model to
+predict the probability of mis-clustering based on the network's memory
+effect and rectifies the correspondence by adding a perceptual term to
+contrastive learning. Next, we introduce a modality-level alignment strategy
+that generates paired visible-infrared latent features and reduces the
+modality gap by aligning the labeling functions of visible and infrared
+features to learn identity-discriminative and modality-invariant features.
+Experimental results on two benchmark datasets demonstrate that our method
+achieves state-of-the-art performance among unsupervised visible-infrared
+ReID methods.
+
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models + + +
+ Text-to-image (T2I) models, such as Stable Diffusion, have exhibited
+remarkable performance in generating high-quality images from text
+descriptions in recent years. However, text-to-image models may be tricked
+into generating not-safe-for-work (NSFW) content, particularly in sexual
+scenarios. Existing countermeasures mostly focus on filtering inappropriate
+inputs and outputs, or suppressing improper text embeddings, which can block
+explicit NSFW-related content (e.g., naked or sexy) but may still be
+vulnerable to adversarial prompt inputs that appear innocent but are
+ill-intended. In this paper, we present SafeGen, a framework to mitigate
+unsafe content generation by text-to-image models in a text-agnostic manner.
+The key idea is to eliminate unsafe visual representations from the model
+regardless of the text input. In this way, the text-to-image model is
+resistant to adversarial prompts since unsafe visual representations are
+obstructed from within. Extensive experiments conducted on four datasets
+demonstrate SafeGen's effectiveness in mitigating unsafe content generation
+while preserving the high fidelity of benign images. SafeGen outperforms
+eight state-of-the-art baseline methods and achieves 99.1% sexual content
+removal performance. Furthermore, our constructed benchmark of adversarial
+prompts provides a basis for future development and evaluation of
+anti-NSFW-generation methods.
+
+
+
+
+
+ + ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving
+computational simulations, such as Earth system models. Data assimilation is
+crucial for achieving this goal because it provides a systematic framework
+to calibrate model outputs with observations, which can include remote
+sensing imagery and ground station measurements, with uncertainty
+quantification. Conventional methods, including Kalman filters and
+variational approaches, inherently rely on simplifying linear and Gaussian
+assumptions, and can be computationally expensive. Nevertheless, with the
+rapid adoption of data-driven methods in many areas of computational
+sciences, we see the potential of emulating traditional data assimilation
+with deep learning, especially generative models. In particular, the
+diffusion-based probabilistic framework has large overlaps with data
+assimilation principles: both allow for conditional generation of samples
+with a Bayesian inverse framework. These models have shown remarkable
+success in text-conditioned image generation or image-controlled video
+synthesis. Likewise, one can frame data assimilation as
+observation-conditioned state calibration. In this work, we propose SLAMS:
+Score-based Latent Assimilation in Multimodal Setting. Specifically, we
+assimilate in-situ weather station data and ex-situ satellite imagery to
+calibrate the vertical temperature profiles, globally. Through extensive
+ablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,
+and sparse data settings. To our knowledge, our work is the first to apply a
+deep generative framework for multimodal data assimilation using real-world
+datasets; an important step for building robust computational simulators,
+including the next-generation Earth system models. Our code is available at:
+https://github.com/yongquan-qu/SLAMS
+
+
+ comment: Accepted to CVPR2024 EarthVision +
+
+
+
+
+ + ☆ Multi-modal Document Presentation Attack Detection With Forensics Trace + Disentanglement ICME 2024 + + +
+ Document Presentation Attack Detection (DPAD) is an important measure in +protecting the authenticity of a document image. However, recent DPAD methods +demand additional resources, such as manual effort in collecting additional +data or knowing the parameters of acquisition devices. This work proposes a +DPAD method based on multi-modal disentangled traces (MMDT) without the above +drawbacks. We first disentangle the recaptured traces by a self-supervised +disentanglement and synthesis network to enhance the generalization capacity in +document images with different contents and layouts. Then, unlike the existing +DPAD approaches that rely only on data in the RGB domain, we propose to +explicitly employ the disentangled recaptured traces as new modalities in the +transformer backbone through adaptive multi-modal adapters to fuse RGB/trace +features efficiently. Visualization of the disentangled traces confirms the +effectiveness of the proposed method in different document contents. Extensive +experiments on three benchmark datasets demonstrate the superiority of our MMDT +method on representing forensic traces of recapturing distortion. + +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ☆ Efficient Denoising using Score Embedding in Score-based Diffusion + Models + + +
+ It is well known that training denoising score-based diffusion models
+requires tens of thousands of epochs and a substantial amount of image data.
+In this paper, we propose to increase the efficiency of training score-based
+diffusion models. Our method allows us to decrease the number of epochs
+needed to train the diffusion model. We accomplish this by numerically
+solving the log-density Fokker-Planck (FP) equation to compute the score
+\textit{before} training. The pre-computed score is embedded into the image
+to encourage faster training under the sliced Wasserstein distance.
+Consequently, it also allows us to decrease the number of images we need to
+train the neural network to learn an accurate score. We demonstrate through
+our numerical experiments the improved performance of our proposed method
+compared to standard score-based diffusion models. Our proposed method
+achieves a similar quality to the standard method meaningfully faster.
+
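+ For context, for an SDE $dx = f(x,t)\,dt + g(t)\,dW$ the density obeys the
+Fokker-Planck equation, and substituting $u = \log p$ gives the log-density
+form whose spatial gradient is the score; presumably an equation of this
+kind is what is solved numerically before training:
+
+\[
+\partial_t p = -\nabla\cdot(f\,p) + \tfrac{1}{2}g^2\,\Delta p
+\quad\Longrightarrow\quad
+\partial_t \log p = -\nabla\cdot f - f\cdot\nabla\log p
+  + \tfrac{1}{2}g^2\big(\Delta\log p + \|\nabla\log p\|^2\big).
+\]
+
+Solving for $\log p$ on a grid and taking $\nabla\log p$ then yields a
+pre-computed score field of the kind the abstract describes.
+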
+
+
+
+
+ + ☆ AI-Guided Feature Segmentation Techniques to Model Features from Single + Crystal Diamond Growth + + +
+ Process refinement to consistently produce high-quality material over a large +area of the grown crystal, enabling various applications from optics crystals +to quantum detectors, has long been a goal for diamond growth. Machine learning +offers a promising path toward this goal, but faces challenges such as the +complexity of features within datasets, their time-dependency, and the volume +of data produced per growth run. Accurate spatial feature extraction from image +to image for real-time monitoring of diamond growth is crucial yet complicated +due to the low-volume and high feature complexity nature of the datasets. This +paper compares various traditional and machine learning-driven approaches for +feature extraction in the diamond growth domain, proposing a novel deep +learning-driven semantic segmentation approach to isolate and classify accurate +pixel masks of geometric features like diamond, pocket holder, and background, +along with their derivative features based on shape and size. Using an +annotation-focused human-in-the-loop software architecture for training +datasets, with modules for selective data labeling using active learning, data +augmentations, and model-assisted labeling, our approach achieves effective +annotation accuracy and drastically reduces labeling time and cost. Deep +learning algorithms prove highly efficient in accurately learning complex +representations from datasets with many features. Our top-performing model, +based on the DeeplabV3plus architecture, achieves outstanding accuracy in +classifying features of interest, with accuracies of 96.31% for pocket holder, +98.60% for diamond top, and 91.64% for diamond side features. + +
+
+ comment: 12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text + overlap with arXiv:2404.07306 +
+
+
+
+
+ + ☆ Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect + Communication + + +
+ Sharing and joint processing of camera feeds and sensor measurements, known
+as Cooperative Perception (CP), has emerged as a new technique to achieve
+higher perception quality. CP can enhance the safety of Autonomous Vehicles
+(AVs) when their individual visual perception quality is compromised by
+adverse weather conditions (e.g., haze and fog), low illumination, winding
+roads, and crowded traffic. To address the limitations of former methods, in
+this paper, we propose a novel approach to realize an optimized CP under
+constrained communications. At the core of our approach is recruiting the
+best helper from the available list of front vehicles to augment the visual
+range and enhance the Object Detection (OD) accuracy of the ego vehicle. In
+this two-step process, we first select the helper vehicles that contribute
+the most to CP based on their visual range and lowest motion blur. Next, we
+implement a radio block optimization among the candidate vehicles to further
+improve communication efficiency. We specifically focus on pedestrian
+detection as an exemplary scenario. To validate our approach, we used the
+CARLA simulator to create a dataset of annotated videos for different
+driving scenarios where pedestrian detection is challenging for an AV with
+compromised vision. Our results demonstrate the efficacy of our two-step
+optimization process in improving the overall performance of cooperative
+perception in challenging scenarios, substantially improving driving safety
+under adverse conditions. Finally, we note that the networking assumptions
+are adopted from LTE Release 14 Mode 4 side-link communication, commonly
+used for Vehicle-to-Vehicle (V2V) communication. Nonetheless, our method is
+flexible and applicable to arbitrary V2V communications.
+
+
+
+
+
+ + ☆ An inclusive review on deep learning techniques and their scope in + handwriting recognition + + +
+ Deep learning denotes a category of machine learning algorithms that can
+combine raw inputs into intermediate feature layers. These algorithms have
+demonstrated great results in different fields, and deep learning has in
+particular achieved human-level performance across a number of domains in
+computer vision and pattern recognition. To achieve state-of-the-art
+performance in diverse domains, deep learning relies on different
+architectures, which use activation functions to perform various
+computations between the hidden and output layers. This paper presents a
+survey of existing studies of deep learning in the handwriting recognition
+field. Even though recent progress indicates that deep learning methods
+provide valuable means for speeding up or producing accurate results in
+handwriting recognition, the extensive literature survey conducted here
+finds that deep learning has yet to revolutionize the field and must still
+resolve many of its most pressing challenges, although promising advances
+have been made over the prior state of the art. Additionally, the inadequate
+availability of labelled training data presents problems in this domain.
+Nevertheless, the present handwriting recognition survey foresees deep
+learning enabling changes with the potential to transform several domains
+such as image processing, speech recognition, computer vision, machine
+translation, robotics and control, medical imaging, medical information
+processing, bio-informatics, natural language processing, cyber security,
+and many others.
+
+
+
+
+
+ + ☆ A Transformer-Based Model for the Prediction of Human Gaze Behavior on + Videos + + +
+ Eye-tracking applications that utilize the human gaze in video understanding +tasks have become increasingly important. To effectively automate the process +of video analysis based on eye-tracking data, it is important to accurately +replicate human gaze behavior. However, this task presents significant +challenges due to the inherent complexity and ambiguity of human gaze patterns. +In this work, we introduce a novel method for simulating human gaze behavior. +Our approach uses a transformer-based reinforcement learning algorithm to train +an agent that acts as a human observer, with the primary role of watching +videos and simulating human gaze behavior. We employed an eye-tracking dataset +gathered from videos generated by the VirtualHome simulator, with a primary +focus on activity recognition. Our experimental results demonstrate the +effectiveness of our gaze prediction method by highlighting its capability to +replicate human gaze behavior and its applicability for downstream tasks where +real human-gaze is used as input. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on + Intention + + +
+ Humans utilize their gaze to concentrate on essential information while
+perceiving and interpreting intentions in videos. Incorporating human gaze
+into computational algorithms can significantly enhance model performance in
+video understanding tasks. In this work, we address a challenging and
+innovative task in video understanding: predicting the actions of an agent in
+a video based on a partial video. We introduce the Gaze-guided Action
+Anticipation algorithm, which establishes a visual-semantic graph from the
+video input. Our method utilizes a Graph Neural Network to recognize the
+agent's intention and predict the action sequence to fulfill this intention.
+To assess the efficiency of our approach, we collect a dataset containing
+household activities generated in the VirtualHome environment, accompanied by
+human gaze data recorded while viewing the videos. Our method outperforms
+state-of-the-art techniques, achieving a 7% improvement in accuracy for
+18-class intention recognition. This highlights the effectiveness of our
+method in learning important features from human gaze data.
+
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models
+like neural networks by generating explanations for their predictions.
+Explanations often take the form of a heatmap identifying input features (e.g.
+pixels) that are relevant to the model's decision. These explanations,
+however, entangle the potentially multiple factors that enter into the overall
+complex decision strategy. We propose to disentangle explanations by
+extracting, at some intermediate layer of a neural network, subspaces that
+capture the multiple and distinct activation patterns (e.g. visual concepts)
+that are relevant to the prediction. To automatically extract these subspaces,
+we propose two new analyses, extending principles found in PCA or ICA to
+explanations. These novel analyses, which we call principal relevant component
+analysis (PRCA) and disentangled relevant subspace analysis (DRSA), maximize
+relevance instead of e.g. variance or kurtosis. This allows for a much
+stronger focus of the analysis on what the ML model actually uses for
+predicting, ignoring activations or concepts to which the model is invariant.
+Our approach is general enough to work alongside common attribution techniques
+such as Shapley Value, Integrated Gradients, or LRP. Our proposed methods
+prove practically useful and compare favorably to the state of the art as
+demonstrated on benchmarks and three use cases.
+
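+
+ As a rough illustration of maximizing relevance rather than variance, one
+simplified reading of a relevance-guided projection is sketched below; the
+actual PRCA/DRSA objectives may differ in detail, and all names are
+illustrative:
+
+import numpy as np
+
+def relevance_maximizing_directions(acts, ctx, k):
+    """acts: (n, d) layer activations; ctx: (n, d) relevance 'context' flowing
+    through the same layer. Returns k directions that concentrate relevance,
+    taken from the symmetrized activation/context cross-covariance."""
+    cross = acts.T @ ctx / len(acts)
+    sym = 0.5 * (cross + cross.T)
+    eigvals, eigvecs = np.linalg.eigh(sym)
+    order = np.argsort(eigvals)[::-1]
+    return eigvecs[:, order[:k]]        # columns are the top-k directions
+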
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Sensor Alignment + + +
+ Accurate alignment of a fixed mobile device equipped with inertial sensors +inside a moving vehicle is important for navigation, activity recognition, and +other applications. Accurate estimation of the device mounting angle is +required to rotate the inertial measurement from the sensor frame to the moving +platform frame to standardize measurements and improve the performance of the +target task. In this work, a data-driven approach using deep neural networks +(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped +with an inertial measurement unit (IMU) and strapped to a car. The proposed +model uses only the accelerometer and gyroscope readings from an IMU as input +and, in contrast to existing solutions, does not require global position inputs +from global navigation satellite systems (GNSS). To train the model in a +supervised manner, IMU data is collected for training and validation with the +sensor mounted at a known yaw mounting angle, and a range of ground truth +labels is generated by applying a random rotation in a bounded range to the +measurements. The trained model is tested on data with real rotations showing +similar performance as with synthetic rotations. The trained model is deployed +on an Android device and evaluated in real-time to test the accuracy of the +estimated yaw mounting angle. The model is shown to find the mounting angle at +an accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An +experiment is conducted to compare the proposed model with an existing +off-the-shelf solution. + +
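+
+ The label-generation step above (rotate the horizontal IMU axes by a random,
+bounded yaw angle and use that angle as the regression target) can be sketched
+as follows; the array shapes and rotation convention are assumptions:
+
+import numpy as np
+
+def make_training_pair(acc, gyr, max_yaw_rad=np.pi / 2):
+    """acc, gyr: (T, 3) accelerometer / gyroscope windows in the sensor frame.
+    Returns rotated measurements plus the applied yaw angle as the label."""
+    psi = np.random.uniform(-max_yaw_rad, max_yaw_rad)
+    c, s = np.cos(psi), np.sin(psi)
+    R = np.array([[c, -s, 0.0],
+                  [s,  c, 0.0],
+                  [0.0, 0.0, 1.0]])      # rotation about the vertical axis
+    return acc @ R.T, gyr @ R.T, psi
+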
+
+ comment: 9 Pages, Preprint. Accepted IEEE +
+
+
+
+
+ + ♻ ☆ GLiDR: Topologically Regularized Graph Generative Network for Sparse + LiDAR Point Clouds CVPR + + +
+ Sparse LiDAR point clouds cause severe loss of detail of static structures +and reduce the density of static points available for navigation. Reduced +density can be detrimental to navigation under several scenarios. We observe +that despite high sparsity, in most cases, the global topology of LiDAR +outlining the static structures can be inferred. We utilize this property to +obtain a backbone skeleton of a LiDAR scan in the form of a single connected +component that is a proxy to its global topology. We utilize the backbone to +augment new points along static structures to overcome sparsity. Newly +introduced points could correspond to existing static structures or to static +points that were earlier obstructed by dynamic objects. To the best of our +knowledge, we are the first to use such a strategy for sparse LiDAR point +clouds. Existing solutions close to our approach fail to identify and preserve +the global static LiDAR topology and generate sub-optimal points. We propose +GLiDR, a Graph Generative network that is topologically regularized using +0-dimensional Persistent Homology ($\mathcal{PH}$) constraints. This enables +GLiDR to introduce newer static points along a topologically consistent global +static LiDAR backbone. GLiDR generates precise static points using $32\times$ +sparser dynamic scans and performs better than the baselines across three +datasets. GLiDR generates a valuable byproduct - an accurate binary +segmentation mask of static and dynamic objects that are helpful for navigation +planning and safety in constrained environments. The newly introduced static +points allow GLiDR to outperform LiDAR-based navigation using SLAM in several +settings. Source code is available at +$\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bias-Reduced Neural Networks for Parameter Estimation in Quantitative + MRI + + +
+ Purpose: To develop neural network (NN)-based quantitative MRI parameter +estimators with minimal bias and a variance close to the Cram\'er-Rao bound. + Theory and Methods: We generalize the mean squared error loss to control the +bias and variance of the NN's estimates, which involves averaging over multiple +noise realizations of the same measurements during training. Bias and variance +properties of the resulting NNs are studied for two neuroimaging applications. + Results: In simulations, the proposed strategy reduces the estimates' bias +throughout parameter space and achieves a variance close to the Cram\'er-Rao +bound. In vivo, we observe good concordance between parameter maps estimated +with the proposed NNs and traditional estimators, such as non-linear +least-squares fitting, while state-of-the-art NNs show larger deviations. + Conclusion: The proposed NNs have greatly reduced bias compared to those +trained using the mean squared error and offer significantly improved +computational efficiency over traditional estimators with comparable or better +accuracy. + +
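+
+ The generalized loss described above, which averages the estimator over
+several noise realizations of the same measurement and penalizes bias and
+variance separately, could look roughly like the sketch below; the noise model
+and weighting are placeholders rather than the paper's exact choices:
+
+import torch
+
+def bias_variance_loss(net, clean_meas, theta_true, noise_std,
+                       n_real=8, lam=1.0):
+    # Estimate the same underlying parameters from n_real noisy copies.
+    estimates = torch.stack([
+        net(clean_meas + noise_std * torch.randn_like(clean_meas))
+        for _ in range(n_real)
+    ])                                             # (n_real, batch, n_params)
+    bias_sq = (estimates.mean(dim=0) - theta_true).pow(2).mean()
+    variance = estimates.var(dim=0, unbiased=False).mean()
+    return bias_sq + lam * variance
+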
+
+
+
+
+ + ♻ ☆ MaskClustering: View Consensus based Mask Graph Clustering for + Open-Vocabulary 3D Instance Segmentation + + +
+ Open-vocabulary 3D instance segmentation is cutting-edge for its ability to +segment 3D instances without predefined categories. However, progress in 3D +lags behind its 2D counterpart due to limited annotated 3D data. To address +this, recent works first generate 2D open-vocabulary masks through 2D models +and then merge them into 3D instances based on metrics calculated between two +neighboring frames. In contrast to these local metrics, we propose a novel +metric, view consensus rate, to enhance the utilization of multi-view +observations. The key insight is that two 2D masks should be deemed part of the +same 3D instance if a significant number of other 2D masks from different views +contain both these two masks. Using this metric as edge weight, we construct a +global mask graph where each mask is a node. Through iterative clustering of +masks showing high view consensus, we generate a series of clusters, each +representing a distinct 3D instance. Notably, our model is training-free. +Through extensive experiments on publicly available datasets, including +ScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves +state-of-the-art performance in open-vocabulary 3D instance segmentation. Our +project page is at https://pku-epic.github.io/MaskClustering. + +
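+
+ The view-consensus idea can be written down compactly. The normalization
+below (dividing by the number of masks that see either of the two masks) and
+the containment predicate are assumptions; the paper defines the exact rate:
+
+def view_consensus_rate(mask_a, mask_b, other_masks, contains):
+    """contains(m, x) -> True if 2D mask m (from some view) contains the 3D
+    points of mask x. The edge weight between mask_a and mask_b is the
+    fraction of observing masks that contain both of them."""
+    observers = [m for m in other_masks
+                 if contains(m, mask_a) or contains(m, mask_b)]
+    if not observers:
+        return 0.0
+    both = sum(1 for m in observers
+               if contains(m, mask_a) and contains(m, mask_b))
+    return both / len(observers)
+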
+
+
+
+
+ + ♻ ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned
+representations is a fundamental challenge in computer vision. We present a
+new methodology for understanding such vision models, the Visual Concept
+Connectome (VCC), which discovers human interpretable concepts and their
+interlayer connections in a fully unsupervised manner. Our approach
+simultaneously reveals fine-grained concepts at a layer and connection
+weightings across all layers, and is amenable to global analysis of network
+structure (e.g., branching pattern of hierarchical concept assemblies).
+Previous work yielded ways to extract interpretable concepts from single
+layers and examine their impact on classification, but did not afford
+multilayer concept analysis across an entire network architecture.
+Quantitative and qualitative empirical results show the effectiveness of VCCs
+in the domain of image classification. Also, we leverage VCCs for the
+application of failure mode debugging to reveal where mistakes arise in deep
+networks.
+
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of
+transformer representations for videos. Concretely, we seek to explain the
+decision-making process of video transformers based on high-level,
+spatiotemporal concepts that are automatically discovered. Prior research on
+concept-based interpretability has concentrated solely on image-level tasks.
+Comparatively, video models deal with the added temporal dimension, increasing
+complexity and posing challenges in identifying dynamic concepts over time. In
+this work, we systematically address these challenges by introducing the first
+Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose
+an efficient approach for unsupervised identification of units of video
+transformer representations (concepts) and for ranking their importance to the
+output of a model. The resulting concepts are highly interpretable, revealing
+spatio-temporal reasoning mechanisms and object-centric representations in
+unstructured video models. Performing this analysis jointly over a diverse set
+of supervised and self-supervised representations, we discover that some of
+these mechanisms are universal in video transformers. Finally, we show that
+VTCD can be used for fine-grained action recognition and video object
+segmentation.
+
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Location-guided Head Pose Estimation for Fisheye Image + + +
+ A camera with a fisheye or ultra-wide lens covers a wide field of view that
+cannot be modeled by the perspective projection. Serious fisheye lens
+distortion in the peripheral region of the image leads to degraded performance
+of existing head pose estimation models trained on undistorted images. This
+paper presents a new approach for head pose estimation that uses the knowledge
+of head location in the image to reduce the negative effect of fisheye
+distortion. We develop an end-to-end convolutional neural network to estimate
+the head pose with the multi-task learning of head pose and head location. Our
+proposed network estimates the head pose directly from the fisheye image
+without the operation of rectification or calibration. We also created a
+fisheye-distorted version of three popular head pose estimation datasets
+(BIWI, 300W-LP, and AFLW2000) for our experiments. Experimental results show
+that our network remarkably improves the accuracy of head pose estimation
+compared with other state-of-the-art one-stage and two-stage methods.
+
+
+ comment: Revised Introduction and Related Work; Submitted to IEEE Transactions
+  on Cognitive and Developmental Systems for review
+
+
+
+
+
+ + ♻ ☆ VMamba: Visual State Space Model + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long +been the predominant backbone networks for visual representation learning. +While ViTs have recently gained prominence over CNNs due to their superior +fitting capabilities, their scalability is largely constrained by the quadratic +complexity of attention computation. Inspired by the capability of Mamba in +efficiently modeling long sequences, we propose VMamba, a generic vision +backbone model aiming to reduce the computational complexity to linear while +retaining ViTs' advantageous features. To enhance VMamba's adaptability in +processing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D +selective scanning in 2D image space with global receptive fields. +Additionally, we make further improvements in implementation details and +architectural designs to enhance VMamba's performance and boost its inference +speed. Extensive experimental results demonstrate VMamba's promising +performance across various visual perception tasks, highlighting its pronounced +advantages in input scaling efficiency compared to existing benchmark models. +Source code is available at https://github.com/MzeroMiko/VMamba. + +
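+
+ The Cross-Scan Module's core trick is to unfold the 2D feature map into
+several 1D traversal orders before selective scanning. A minimal sketch of the
+four orders is shown below; the tensor layout is an assumption, and the real
+CSM also runs the selective scan and merges the sequences back into 2D:
+
+import torch
+
+def cross_scan(x):
+    """Unfold a (B, C, H, W) feature map into four 1D sequences: row-major,
+    column-major, and their reverses."""
+    B, C, H, W = x.shape
+    row = x.flatten(2)                      # (B, C, H*W), row-major order
+    col = x.transpose(2, 3).flatten(2)      # column-major order
+    return torch.stack([row, col, row.flip(-1), col.flip(-1)], dim=1)  # (B, 4, C, H*W)
+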
+
+ comment: 21 pages, 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is
+shared between multimodal inputs. The most powerful models in this space have
+been trained using massive datasets of paired inputs and large-scale
+computational resources, making them prohibitively expensive to train in many
+practical scenarios. We surmise that existing unimodal encoders pre-trained on
+large amounts of unimodal data should provide an effective bootstrap to create
+multimodal models from unimodal ones at much lower costs. We therefore propose
+FuseMix, a multimodal augmentation scheme that operates on the latent spaces of
+arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal
+alignment, we achieve competitive performance -- and in certain cases
+outperform state-of-the-art methods -- in both image-text and audio-text
+retrieval, with orders of magnitude less compute and data: for example, we
+outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \!
+600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs.
+Additionally, we show how our method can be applied to convert pre-trained
+text-to-image generative models into audio-to-image ones. Code is available
+at: https://github.com/layer6ai-labs/fusemix.
+
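+
+ At its core the augmentation is mixup applied in the frozen encoders' latent
+spaces, with one interpolation coefficient shared across the paired
+modalities, roughly as sketched here (the Beta prior and names are
+assumptions):
+
+import torch
+
+def fusemix_style_augment(z_img, z_txt, alpha=1.0):
+    """z_img, z_txt: (B, d) latents of paired samples from frozen encoders."""
+    B = z_img.size(0)
+    lam = torch.distributions.Beta(alpha, alpha).sample((B, 1)).to(z_img)
+    perm = torch.randperm(B, device=z_img.device)
+    z_img_mix = lam * z_img + (1 - lam) * z_img[perm]   # same lam for both
+    z_txt_mix = lam * z_txt + (1 - lam) * z_txt[perm]   # modalities of a pair
+    return z_img_mix, z_txt_mix
+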
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Building-road Collaborative Extraction from Remotely Sensed Images via + Cross-Interaction + + +
+ Buildings are the basic carrier of social production and human life; roads
+are the links that interconnect social networks. Building and road information
+has important application value in the frontier fields of regional coordinated
+development, disaster prevention, auto-driving, etc. Mapping buildings and
+roads from very high-resolution (VHR) remote sensing images has become a hot
+research topic. However, the existing methods often ignore the strong spatial
+correlation between roads and buildings and extract them in isolation. To
+fully utilize the complementary advantages between buildings and roads, we
+propose a building-road collaborative extraction method based on multi-task
+and cross-scale feature interaction to improve the accuracy of both tasks in a
+complementary way. A multi-task interaction module is proposed to exchange
+information across tasks and preserve the unique information of each task,
+which tackles the seesaw phenomenon in multi-task learning. By considering the
+variation in appearance and structure between buildings and roads, a
+cross-scale interaction module is designed to automatically learn the optimal
+reception field for different tasks. Compared with many existing methods that
+train each task individually, the proposed collaborative extraction method can
+utilize the complementary advantages between buildings and roads by the
+proposed inter-task and inter-scale feature interactions, and automatically
+select the optimal reception field for different tasks. Experiments on a wide
+range of urban and rural scenarios show that the proposed algorithm can
+achieve building-road extraction with outstanding performance and efficiency.
+
+
+ comment: IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics CVPR 2024 + + +
+ The recently emerging text-to-motion advances have spurred numerous attempts
+for convenient and interactive human motion generation. Yet, existing methods
+are largely limited to generating body motions only, without considering the
+rich two-hand motions, let alone handling various conditions like body
+dynamics or texts. To break the data bottleneck, we propose BOTH57M, a novel
+multi-modal dataset for two-hand motion generation. Our dataset includes
+accurate motion tracking for the human body and hands and provides pair-wise
+finger-level hand annotations and body descriptions. We further provide a
+strong baseline method, BOTH2Hands, for the novel task: generating vivid
+two-hand motions from both implicit body dynamics and explicit text prompts.
+We first warm up two parallel body-to-hand and text-to-hand diffusion models
+and then utilize a cross-attention transformer for motion blending. Extensive
+experiments and cross-validations demonstrate the effectiveness of our
+approach and dataset for generating convincing two-hand motions from the
+hybrid body-and-textual conditions. Our dataset and code will be disseminated
+to the community for future research.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes (typos, URLs etc.) +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representation for MRI Parallel Imaging Reconstruction + + +
+ Magnetic resonance imaging (MRI) usually faces lengthy acquisition times, +prompting the exploration of strategies such as parallel imaging (PI) to +alleviate this problem by periodically skipping specific K-space lines and +subsequently reconstructing high-quality images from the undersampled K-space. +Implicit neural representation (INR) has recently emerged as a promising deep +learning technique, characterizing objects as continuous functions of spatial +coordinates typically parameterized by a multilayer perceptron (MLP). In this +study, we propose a novel MRI PI reconstruction method that uses INR. Our +approach represents reconstructed fully-sampled images as functions of voxel +coordinates and prior feature vectors from undersampled images, addressing the +generalization challenges of INR. Specifically, we introduce a scale-embedded +encoder to generate scale-independent, voxel-specific features from MR images +across various undersampling scales. These features are then concatenated with +coordinate vectors to reconstruct fully-sampled MR images, facilitating +multiple-scale reconstructions. To evaluate our method's performance, we +conducted experiments using publicly available MRI datasets, comparing it with +alternative reconstruction techniques. Our quantitative assessment demonstrates +the superiority of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Expediting Building Footprint Extraction from High-resolution Remote + Sensing Images via progressive lenient supervision + + +
+ The efficacy of building footprint segmentation from remotely sensed images +has been hindered by model transfer effectiveness. Many existing building +segmentation methods were developed upon the encoder-decoder architecture of +U-Net, in which the encoder is finetuned from the newly developed backbone +networks that are pre-trained on ImageNet. However, the heavy computational +burden of the existing decoder designs hampers the successful transfer of these +modern encoder networks to remote sensing tasks. Even the widely-adopted deep +supervision strategy fails to mitigate these challenges due to its invalid loss +in hybrid regions where foreground and background pixels are intermixed. In +this paper, we conduct a comprehensive evaluation of existing decoder network +designs for building footprint segmentation and propose an efficient framework +denoted as BFSeg to enhance learning efficiency and effectiveness. +Specifically, a densely-connected coarse-to-fine feature fusion decoder network +that facilitates easy and fast feature fusion across scales is proposed. +Moreover, considering the invalidity of hybrid regions in the down-sampled +ground truth during the deep supervision process, we present a lenient deep +supervision and distillation strategy that enables the network to learn proper +knowledge from deep supervision. Building upon these advancements, we have +developed a new family of building segmentation networks, which consistently +surpass prior works with outstanding performance and efficiency across a wide +range of newly developed encoder networks. + +
+
+
+
+
+ + ♻ ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) while minimizing
+radiation exposure, a range of methods have been designed to reconstruct
+standard-dose PET (SPET) from corresponding low-dose PET (LPET) images.
+However, most current methods merely learn the mapping between
+single-dose-level LPET and SPET images, but omit the dose disparity of LPET
+images in clinical scenarios. In this paper, to reconstruct high-quality SPET
+images from multi-dose-level LPET images, we design a novel two-phase
+multi-dose-level PET reconstruction algorithm with dose level awareness,
+containing a pre-training phase and a SPET prediction phase. Specifically, the
+pre-training phase is devised to explore both fine-grained discriminative
+features and effective semantic representation. The SPET prediction phase
+adopts a coarse prediction network that utilizes the pre-learned dose-level
+prior to generate a preliminary result, and a refinement network to precisely
+preserve the details. Experiments on the MICCAI 2022 Ultra-low Dose PET
+Imaging Challenge Dataset have demonstrated the superiority of our method.
+
+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ♻ ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the
+domain of multi-exposure image fusion. Nonetheless, prevailing approaches
+often involve directly feeding over-exposed and under-exposed images into the
+network, which leads to the under-utilization of inherent information present
+in the source images. Additionally, unsupervised techniques predominantly
+employ rudimentary weighted summation for color channel processing,
+culminating in an overall desaturated final image tone. To partially mitigate
+these issues, this study proposes a gamma correction module specifically
+designed to fully leverage latent information embedded within source images.
+Furthermore, a modified transformer block equipped with self-attention
+mechanisms is introduced to optimize the fusion process. Ultimately, a novel
+color enhancement algorithm is presented to augment image saturation while
+preserving intricate details. The source code is available at
+https://github.com/ZhiyingDu/BHFMEF.
+
+
+
+
+
+ + ♻ ☆ DREAM: Visual Decoding from Reversing Human Visual System + + +
+ In this work we present DREAM, an fMRI-to-image method for reconstructing
+viewed images from brain activities, grounded on fundamental knowledge of the
+human visual system. We craft reverse pathways that emulate the hierarchical
+and parallel nature of how humans perceive the visual world. These tailored
+pathways are specialized to decipher semantics, color, and depth cues from
+fMRI data, mirroring the forward pathways from visual stimuli to fMRI
+recordings. To do so, two components mimic the inverse processes within the
+human visual system: the Reverse Visual Association Cortex (R-VAC), which
+reverses the pathways of this brain region to extract semantics from fMRI
+data, and the Reverse Parallel PKM (R-PKM), which simultaneously predicts
+color and depth from fMRI signals. The experiments indicate that our method
+outperforms the current state-of-the-art models in terms of the consistency of
+appearance, structure, and semantics. Code will be made publicly available to
+facilitate further research in this field.
+
+
+ comment: Project Page: https://weihaox.github.io/DREAM +
+
+
+
+
+ + ♻ ☆ Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial + Robustness CVPR 2024 + + +
+ Large-scale pre-trained vision-language models like CLIP have demonstrated +impressive performance across various tasks, and exhibit remarkable zero-shot +generalization capability, while they are also vulnerable to imperceptible +adversarial examples. Existing works typically employ adversarial training +(fine-tuning) as a defense method against adversarial examples. However, direct +application to the CLIP model may result in overfitting, compromising the +model's capacity for generalization. In this paper, we propose Pre-trained +Model Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages +supervision from the original pre-trained model by carefully designing an +auxiliary branch, to enhance the model's zero-shot adversarial robustness. +Specifically, PMG-AFT minimizes the distance between the features of +adversarial examples in the target model and those in the pre-trained model, +aiming to preserve the generalization features already captured by the +pre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate +that PMG-AFT significantly outperforms the state-of-the-art method, improving +the top-1 robust accuracy by an average of 4.99%. Furthermore, our approach +consistently improves clean accuracy by an average of 8.72%. Our code is +available at +https://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness. + +
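+
+ The guiding term can be pictured as an auxiliary distance between the
+fine-tuned model's adversarial features and those of the frozen pre-trained
+model. The sketch below assumes a CLIP-style encode_image interface and an MSE
+distance; it illustrates the structure of the objective, not the official
+implementation:
+
+import torch
+import torch.nn.functional as F
+
+def guided_adv_finetune_loss(target_model, frozen_pretrained, images_adv,
+                             text_feats, labels, lam=1.0):
+    feat_t = target_model.encode_image(images_adv)
+    with torch.no_grad():
+        feat_p = frozen_pretrained.encode_image(images_adv)  # frozen reference
+    logits = feat_t @ text_feats.t()
+    ce = F.cross_entropy(logits, labels)       # adversarial training term
+    guide = F.mse_loss(feat_t, feat_p)         # stay close to pre-trained features
+    return ce + lam * guide
+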
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+
+
+
+ + ♻ ☆ ExpPoint-MAE: Better interpretability and performance for + self-supervised point cloud transformers + + +
+ In this paper we delve into the properties of transformers, attained through
+self-supervision, in the point cloud domain. Specifically, we evaluate the
+effectiveness of Masked Autoencoding as a pretraining scheme, and explore
+Momentum Contrast as an alternative. In our study we investigate the impact of
+data quantity on the learned features, and uncover similarities in the
+transformer's behavior across domains. Through comprehensive visualizations,
+we observe that the transformer learns to attend to semantically meaningful
+regions, indicating that pretraining leads to a better understanding of the
+underlying geometry. Moreover, we examine the finetuning process and its
+effect on the learned representations. Based on that, we devise an unfreezing
+strategy which consistently outperforms our baseline without introducing any
+other modifications to the model or the training pipeline, and achieve
+state-of-the-art results in the classification task among transformer models.
+
+
+
+
+
+ + ♻ ☆ AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation ICLR 2024 + + +
+ During interactive segmentation, a model and a user work together to +delineate objects of interest in a 3D point cloud. In an iterative process, the +model assigns each data point to an object (or the background), while the user +corrects errors in the resulting segmentation and feeds them back into the +model. The current best practice formulates the problem as binary +classification and segments objects one at a time. The model expects the user +to provide positive clicks to indicate regions wrongly assigned to the +background and negative clicks on regions wrongly assigned to the object. +Sequentially visiting objects is wasteful since it disregards synergies between +objects: a positive click for a given object can, by definition, serve as a +negative click for nearby objects. Moreover, a direct competition between +adjacent objects can speed up the identification of their common boundary. We +introduce AGILE3D, an efficient, attention-based model that (1) supports +simultaneous segmentation of multiple 3D objects, (2) yields more accurate +segmentation masks with fewer user clicks, and (3) offers faster inference. Our +core idea is to encode user clicks as spatial-temporal queries and enable +explicit interactions between click queries as well as between them and the 3D +scene through a click attention module. Every time new clicks are added, we +only need to run a lightweight decoder that produces updated segmentation +masks. In experiments with four different 3D point cloud datasets, AGILE3D sets +a new state-of-the-art. Moreover, we also verify its practicality in real-world +setups with real user studies. + +
+
+ comment: ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer
+graphics, and it becomes increasingly difficult the less information is
+available. Shape-from-Template (SfT) methods aim to reconstruct a
+template-based geometry from RGB images or video sequences, often leveraging
+just a single monocular camera without depth information, such as regular
+smartphone recordings. Unfortunately, existing reconstruction methods are
+either unphysical and noisy or slow in optimization. To solve this problem, we
+propose a novel SfT reconstruction algorithm for cloth using a pre-trained
+neural surrogate model that is fast to evaluate, stable, and produces smooth
+reconstructions due to a regularizing physics simulation. Differentiable
+rendering of the simulated mesh enables pixel-wise comparisons between the
+reconstruction and a target video sequence that can be used for a
+gradient-based optimization procedure to extract not only shape information
+but also physical parameters such as stretching, shearing, or bending
+stiffness of the cloth. This allows us to retain a precise, stable, and smooth
+reconstructed geometry while reducing the runtime by a factor of 400-500
+compared to $\phi$-SfT, a state-of-the-art physics-based SfT approach.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise. +This noise is usually signal-dependent and often additionally correlated along +rows or columns of pixels. Current self- and unsupervised denoisers can address +signal-dependent noise, but none can reliably remove noise that is also row- or +column-correlated. Here, we present the first fully unsupervised deep +learning-based denoiser capable of handling imaging noise that is +row-correlated as well as signal-dependent. Our approach uses a Variational +Autoencoder (VAE) with a specially designed autoregressive decoder. This +decoder is capable of modeling row-correlated and signal-dependent noise but is +incapable of independently modeling underlying clean signal. The VAE therefore +produces latent variables containing only clean signal information, and these +are mapped back into image space using a proposed second decoder network. Our +method does not require a pre-trained noise model and can be trained from +scratch using unpaired noisy data. We show that our approach achieves +competitive results when applied to a range of different sensor types and +imaging modalities. + +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey + + +
+ With the urgent demand for generalized deep models, many pre-trained big
+models are proposed, such as BERT, ViT, GPT, etc. Inspired by the success of
+these models in single domains (like computer vision and natural language
+processing), multi-modal pre-trained big models have also drawn more and more
+attention in recent years. In this work, we give a comprehensive survey of
+these models and hope this paper can provide new insights and help new
+researchers track the most cutting-edge works. Specifically, we first
+introduce the background of multi-modal pre-training by reviewing conventional
+deep learning and pre-training work in natural language processing, computer
+vision, and speech. Then, we introduce the task definition, key challenges,
+and advantages of multi-modal pre-training models (MM-PTMs), and discuss the
+MM-PTMs with a focus on data, objectives, network architectures, and knowledge
+enhanced pre-training. After that, we introduce the downstream tasks used for
+the validation of large-scale MM-PTMs, including generative, classification,
+and regression tasks. We also give visualization and analysis of the model
+parameters and results on representative downstream tasks. Finally, we point
+out possible research directions for this topic that may benefit future works.
+In addition, we maintain a continuously updated paper list for large-scale
+pre-trained multi-modal big models:
+https://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has
+been published by the journal Machine Intelligence Research (MIR),
+https://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:
+10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 447-482, 2023.
+
+
+ comment: Accepted by Machine Intelligence Research (MIR) +
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy, impeding
+the real-life application of robust classification models. Training-based
+solutions for better trade-offs are limited by incompatibilities with
+already-trained high-performance large models, necessitating the exploration
+of training-free ensemble approaches. Observing that robust models are more
+confident in correct predictions than in incorrect ones on clean and
+adversarial data alike, we speculate that amplifying this "benign confidence
+property" can reconcile accuracy and robustness in an ensemble setting. To
+this end, we propose "MixedNUTS", a training-free method where the output
+logits of a robust classifier and a standard non-robust classifier are
+processed by nonlinear transformations with only three parameters, which are
+optimized through an efficient algorithm. MixedNUTS then converts the
+transformed logits into probabilities and mixes them as the overall output. On
+the CIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with
+custom strong adaptive attacks demonstrate MixedNUTS's vastly improved
+accuracy and near-SOTA robustness -- it boosts CIFAR-100 clean accuracy by
+7.86 points, sacrificing merely 0.87 points in robust accuracy.
+
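+
+ Structurally, the method transforms the robust classifier's logits with a
+small parametric nonlinearity, converts both models' logits to probabilities,
+and mixes them. The clamp-and-power transform and fixed mixing weight below
+are illustrative stand-ins for the optimized three-parameter mapping:
+
+import torch
+
+def mixed_output(logits_std, logits_rob, s, p, c, alpha=0.5):
+    """Training-free mixing of a standard and a robust classifier."""
+    z = logits_rob - logits_rob.max(dim=-1, keepdim=True).values   # stabilize
+    z = s * torch.clamp(z + c, min=0.0) ** p   # 3-parameter nonlinearity (illustrative)
+    return (alpha * torch.softmax(logits_std, dim=-1)
+            + (1 - alpha) * torch.softmax(z, dim=-1))
+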
+
+
+
+
+ + ♻ ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ Context modeling is critical for remote sensing image dense prediction tasks. +Nowadays, the growing size of very-high-resolution (VHR) remote sensing images +poses challenges in effectively modeling context. While transformer-based +models possess global modeling capabilities, they encounter computational +challenges when applied to large VHR images due to their quadratic complexity. +The conventional practice of cropping large images into smaller patches results +in a notable loss of contextual information. To address these issues, we +propose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR +remote sensing images. RSM is specifically designed to capture the global +context of remote sensing images with linear complexity, facilitating the +effective processing of large VHR images. Considering that the land covers in +remote sensing images are distributed in arbitrary spatial directions due to +characteristics of remote sensing over-head imaging, the RSM incorporates an +omnidirectional selective scan module to globally model the context of images +in multiple directions, capturing large spatial features from various +directions. Extensive experiments on semantic segmentation and change detection +tasks across various land covers demonstrate the effectiveness of the proposed +RSM. We designed simple yet effective models based on RSM, achieving +state-of-the-art performance on dense prediction tasks in VHR remote sensing +images without fancy training strategies. Leveraging the linear complexity and +global modeling capabilities, RSM achieves better efficiency and accuracy than +transformer-based models on large remote sensing images. Interestingly, we also +demonstrated that our model generally performs better with a larger image size +on dense prediction tasks. Our code is available at +https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
+
+ comment: 15 pages,8 figures +
+
+
+
+
+ + ♻ ☆ Improving the Generalization of Segmentation Foundation Model under + Distribution Shift via Weakly Supervised Adaptation + + +
+ The success of large language models has inspired the computer vision +community to explore image segmentation foundation model that is able to +zero/few-shot generalize through prompt engineering. Segment-Anything(SAM), +among others, is the state-of-the-art image segmentation foundation model +demonstrating strong zero/few-shot generalization. Despite the success, recent +studies reveal the weakness of SAM under strong distribution shift. In +particular, SAM performs awkwardly on corrupted natural images, camouflaged +images, medical images, etc. Motivated by the observations, we aim to develop a +self-training based strategy to adapt SAM to target distribution. Given the +unique challenges of large source dataset, high computation cost and incorrect +pseudo label, we propose a weakly supervised self-training architecture with +anchor regularization and low-rank finetuning to improve the robustness and +computation efficiency of adaptation. We validate the effectiveness on 5 types +of downstream segmentation tasks including natural clean/corrupted images, +medical images, camouflaged images and robotic images. Our proposed method is +task-agnostic in nature and outperforms pre-trained SAM and state-of-the-art +domain adaptation methods on almost all downstream tasks with the same testing +prompt inputs. + +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing + Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset + + +
+ Deep learning-based ear disease diagnosis technology has proven effective and
+affordable. However, due to the lack of diverse ear endoscope datasets, the
+practical potential of deep learning models has not been thoroughly studied.
+Moreover, existing research failed to achieve a good trade-off between model
+inference speed and parameter size, rendering models inapplicable in
+real-world settings. To address these challenges, we constructed the first
+large-scale ear endoscopic dataset comprising eight types of ear diseases and
+disease-free samples from two institutions. Inspired by ShuffleNetV2, we
+proposed Best-EarNet, an ultrafast and ultralight network enabling real-time
+ear disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial
+Feature Fusion Module and a multi-scale supervision strategy, which help the
+model focus on global-local information within feature maps at various levels.
+Utilizing transfer learning, the accuracy of Best-EarNet with only 0.77M
+parameters achieves 95.23% (internal 22,581 images) and 92.14% (external 1,652
+images), respectively. In particular, it achieves an average of 80 frames per
+second on a CPU. From the perspective of model practicality, the proposed
+Best-EarNet is superior to state-of-the-art backbone models in ear lesion
+detection tasks. Most importantly, Ear-Keeper, an intelligent diagnosis system
+based on Best-EarNet, was developed successfully and deployed on common
+electronic devices (smartphone, tablet computer and personal computer). In the
+future, Ear-Keeper has the potential to assist the public and healthcare
+providers in performing comprehensive scanning and diagnosis of the ear canal
+in real-time video, thereby promptly detecting ear lesions.
+
+
+ comment: 18 pages,8 figures +
+
+
+
+
+ + ♻ ☆ GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual + Affective Computing + + +
+ Multimodal large language models (MLLMs) are designed to process and
+integrate information from multiple sources, such as text, speech, images, and
+videos. Despite their success in language understanding, it is critical to
+evaluate their performance on downstream tasks for better human-centric
+applications. This paper assesses the application of MLLMs with 5 crucial
+abilities for affective computing, spanning visual affective tasks and
+reasoning tasks. The results show that GPT-4V has high accuracy in facial
+action unit recognition and micro-expression detection, while its general
+facial expression recognition performance is not accurate. We also highlight
+the challenges of achieving fine-grained micro-expression recognition and the
+potential for further study. We further demonstrate the versatility and
+potential of GPT-4V for handling advanced tasks in emotion recognition and
+related fields by integrating it with task-related agents for more complex
+tasks, such as heart rate estimation through signal processing. In conclusion,
+this paper provides valuable insights into the potential applications and
+challenges of MLLMs in human-centric computing. Our interesting examples are
+at https://github.com/EnVision-Research/GPT4Affectivity.
+
+
+
+
+
+ + ♻ ☆ GaussianImage: 1000 FPS Image Representation and Compression by 2D + Gaussian Splatting + + +
+ Implicit neural representations (INRs) recently achieved great success in +image representation and compression, offering high visual quality and fast +rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are +available. However, this requirement often hinders their use on low-end devices +with limited memory. In response, we propose a groundbreaking paradigm of image +representation and compression by 2D Gaussian Splatting, named GaussianImage. +We first introduce 2D Gaussian to represent the image, where each Gaussian has +8 parameters including position, covariance and color. Subsequently, we unveil +a novel rendering algorithm based on accumulated summation. Remarkably, our +method with a minimum of 3$\times$ lower GPU memory usage and 5$\times$ faster +fitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation +performance, but also delivers a faster rendering speed of 1500-2000 FPS +regardless of parameter size. Furthermore, we integrate existing vector +quantization technique to build an image codec. Experimental results +demonstrate that our codec attains rate-distortion performance comparable to +compression-based INRs such as COIN and COIN++, while facilitating decoding +speeds of approximately 1000 FPS. Additionally, preliminary proof of concept +shows that our codec surpasses COIN and COIN++ in performance when using +partial bits-back coding. Code will be available at +https://github.com/Xinjie-Q/GaussianImage. + +
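+
+ The accumulated-summation renderer is easy to picture: every pixel simply
+sums the color contributions of all 2D Gaussians, weighted by their
+(unnormalized) densities. The dense pixel-by-Gaussian evaluation below is for
+illustration only; the actual renderer is tiled and far faster:
+
+import torch
+
+def render_gaussian_image(mu, inv_cov, color, H, W):
+    """mu: (N, 2) centers, inv_cov: (N, 2, 2) inverse covariances,
+    color: (N, 3) per-Gaussian colors. Returns an (H, W, 3) image."""
+    ys, xs = torch.meshgrid(torch.arange(H, dtype=torch.float32),
+                            torch.arange(W, dtype=torch.float32), indexing="ij")
+    pix = torch.stack([xs, ys], dim=-1).reshape(-1, 2)        # (H*W, 2)
+    d = pix[:, None, :] - mu[None, :, :]                      # (H*W, N, 2)
+    m = torch.einsum("pni,nij,pnj->pn", d, inv_cov, d)        # squared Mahalanobis
+    w = torch.exp(-0.5 * m)                                   # Gaussian weights
+    return (w @ color).reshape(H, W, 3)                       # accumulated sum
+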
+
+
+
+
+ + ♻ ☆ Re-DiffiNet: Modeling discrepancies in tumor segmentation using + diffusion models + + +
+ Identification of tumor margins is essential for surgical decision-making for
+glioblastoma patients and provides reliable assistance for neurosurgeons.
+Despite improvements in deep learning architectures for tumor segmentation
+over the years, creating a fully autonomous system suitable for clinical
+floors remains a formidable challenge because the model predictions have not
+yet reached the desired level of accuracy and generalizability for clinical
+applications. Generative modeling techniques have seen significant
+improvements in recent times. Specifically, Generative Adversarial Networks
+(GANs) and Denoising-diffusion-based models (DDPMs) have been used to generate
+higher-quality images with fewer artifacts and finer attributes. In this work,
+we introduce a framework called Re-DiffiNet for modeling the discrepancy
+between the outputs of a segmentation model like U-Net and the ground truth,
+using DDPMs. By explicitly modeling the discrepancy, the results show an
+average improvement of 0.55% in the Dice score and 16.28% in HD95 from
+cross-validation over 5 folds, compared to the state-of-the-art U-Net
+segmentation model.
+
+
+
+
+
+ + ♻ ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying +fine-grained expression behaviors, which is an effective condition for facial +expression manipulation. However, publicly available datasets containing +intensity annotations for multiple AUs remain severely limited, often featuring +a restricted number of subjects. This limitation places challenges to the AU +intensity manipulation in images due to disentanglement issues, leading +researchers to resort to other large datasets with pretrained AU intensity +estimators for pseudo labels. In addressing this constraint and fully +leveraging manual annotations of AU intensities for precise manipulation, we +introduce AUEditNet. Our proposed model achieves impressive intensity +manipulation across 12 AUs, trained effectively with only 18 subjects. +Utilizing a dual-branch architecture, our approach achieves comprehensive +disentanglement of facial attributes and identity without necessitating +additional loss functions or implementing with large batch sizes. This approach +offers a potential solution to achieve desired facial attribute editing despite +the dataset's limited subject count. Our experiments demonstrate AUEditNet's +superior accuracy in editing AU intensities, affirming its capability in +disentangling facial attributes and identity within a limited subject pool. +AUEditNet allows conditioning by either intensity values or target images, +eliminating the need for constructing AU combinations for specific facial +expression synthesis. Moreover, AU intensity estimation, as a downstream task, +validates the consistency between real and edited images, confirming the +effectiveness of our proposed AU intensity manipulation method. + +
+
+
+
+
+ + ♻ ☆ Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot + Interaction + + +
+ Hand gestures play a significant role in human interactions where non-verbal
+intentions, thoughts and commands are conveyed. In Human-Robot Interaction
+(HRI), hand gestures offer a similar and efficient medium for conveying clear
+and rapid directives to a robotic agent. However, state-of-the-art
+vision-based methods for gesture recognition have been shown to be effective
+only up to a user-camera distance of seven meters. Such a short distance range
+limits practical HRI with, for example, service robots, search and rescue
+robots and drones. In this work, we address the Ultra-Range Gesture
+Recognition (URGR) problem by aiming for a recognition distance of up to 25
+meters and in the context of HRI. We propose the URGR framework, a novel
+deep-learning approach that uses solely a simple RGB camera. Gesture inference
+is based on a single image. First, a novel super-resolution model termed
+High-Quality Network (HQ-Net) uses a set of self-attention and convolutional
+layers to enhance the low-resolution image of the user. Then, we propose a
+novel URGR classifier termed Graph Vision Transformer (GViT) which takes the
+enhanced image as input. GViT combines the benefits of a Graph Convolutional
+Network (GCN) and a modified Vision Transformer (ViT). Evaluation of the
+proposed framework over diverse test data yields a high recognition rate of
+98.1%. The framework has also exhibited superior performance compared to human
+recognition at ultra-range distances. With the framework, we analyze and
+demonstrate the performance of an autonomous quadruped robot directed by human
+gestures in complex ultra-range indoor and outdoor environments, achieving a
+96% recognition rate on average.
+
+
+ comment: Engineering Applications of Artificial Intelligence, In press +
+
+
+
+
+ + ♻ ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ The analysis of vision-based deep neural networks (DNNs) is highly desirable +but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map +between the internal representations of a given vision model and a VLM, leading +to an efficient verification procedure of natural-language properties for +vision models. We demonstrate our techniques on a ResNet-based classifier +trained on the RIVAL-10 dataset using CLIP as the multimodal model. + +
+
+
+
+
+ + ♻ ☆ Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body + with Unknown Mass Distribution + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies may be available when low-dimensional measurements are not. However, the +high-dimensionality of image data precludes the use of classical estimation +techniques to learn the dynamics. The usefulness of standard deep learning +methods is also limited, because an image of a rigid body reveals nothing about +the distribution of mass inside the body, which, together with initial angular +velocity, is what determines how the body will rotate. We present a +physics-based neural network model to estimate and predict 3D rotational +dynamics from image sequences. We achieve this using a multi-stage prediction +pipeline that maps individual images to a latent representation homeomorphic to +$\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts +future latent states using the Hamiltonian equations of motion. We demonstrate +the efficacy of our approach on new rotating rigid-body datasets of sequences +of synthetic images of rotating objects, including cubes, prisms and +satellites, with unknown uniform and non-uniform mass distributions. Our model +outperforms competing baselines on our datasets, producing better qualitative +predictions and reducing the error observed for the state-of-the-art +Hamiltonian Generative Network by a factor of 2. + +
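+
+ The dynamics the latent predictor must respect are classical. For a
+torque-free rigid body with a diagonal body-frame inertia tensor, one explicit
+step of Euler's equations looks like this (the integrator choice is
+illustrative; the paper evolves a learned latent state with the Hamiltonian
+equations of motion):
+
+import numpy as np
+
+def euler_rotation_step(omega, inertia_diag, dt):
+    """One explicit step of I * domega/dt = (I * omega) x omega.
+    omega, inertia_diag: length-3 arrays in the body frame."""
+    I = np.asarray(inertia_diag, dtype=float)
+    L = I * omega                      # body-frame angular momentum
+    domega = np.cross(L, omega) / I    # Euler's equations, component-wise
+    return omega + dt * domega
+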
+
+ comment: Previously appeared as arXiv:2209.11355v2, which was submitted as a + replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355 +
+
+
+
+
+ + ♻ ☆ Mask4Former: Mask Transformer for 4D Panoptic Segmentation ICRA 2024 + + +
+ Accurately perceiving and tracking instances over time is essential for the +decision-making processes of autonomous agents interacting safely in dynamic +environments. With this intention, we propose Mask4Former for the challenging +task of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the +first transformer-based approach unifying semantic instance segmentation and +tracking of sparse and irregular sequences of 3D point clouds into a single +joint model. Our model directly predicts semantic instances and their temporal +associations without relying on hand-crafted non-learned association strategies +such as probabilistic clustering or voting-based center prediction. Instead, +Mask4Former introduces spatio-temporal instance queries that encode the +semantic and geometric properties of each semantic tracklet in the sequence. In +an in-depth study, we find that promoting spatially compact instance +predictions is critical as spatio-temporal instance queries tend to merge +multiple semantically similar instances, even if they are spatially distant. To +this end, we regress 6-DOF bounding box parameters from spatio-temporal +instance queries, which are used as an auxiliary task to foster spatially +compact predictions. Mask4Former achieves a new state-of-the-art on the +SemanticKITTI test set with a score of 68.4 LSTQ. + +
+
+ comment: Renamed from MASK4D to Mask4Former. ICRA 2024. Project page: + https://vision.rwth-aachen.de/Mask4Former +
+
+
+
+
+ + ♻ ☆ Enhancing Hierarchical Transformers for Whole Brain Segmentation with + Intracranial Measurements Integration + + +
+ Whole brain segmentation with magnetic resonance imaging (MRI) enables the
+non-invasive measurement of brain regions, including total intracranial volume
+(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain
+segmentation methodology to incorporate intracranial measurements offers a
+heightened level of comprehensiveness in the analysis of brain structures.
+Despite its potential, the task of generalizing deep learning techniques for
+intracranial measurements faces data availability constraints due to limited
+manually annotated atlases encompassing whole brain and TICV/PFV labels. In
+this paper, we enhance the hierarchical transformer UNesT for whole brain
+segmentation to segment the whole brain with 133 classes and TICV/PFV
+simultaneously. To address the problem of data scarcity, the model is first
+pretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.
+These volumes are processed through a multi-atlas segmentation pipeline for
+label generation, while TICV/PFV labels are unavailable at this stage.
+Subsequently, the model is finetuned with 45 T1w 3D volumes from the Open
+Access Series of Imaging Studies (OASIS) where both the 133 whole brain classes
+and TICV/PFV labels are available. We evaluate our method with the Dice
+similarity coefficient (DSC). We show that our model is able to conduct precise
+TICV/PFV estimation while maintaining performance on the 132 brain regions at a
+comparable level. Code and trained model are available at:
+https://github.com/MASILab/UNesT/tree/main/wholebrainSeg.
+
+
+
+
+
+ + ♻ ☆ Detecting Image Attribution for Text-to-Image Diffusion Models in RGB + and Beyond + + +
+ Modern text-to-image (T2I) diffusion models can generate images with
+remarkable realism and creativity. These advancements have sparked research in
+fake image detection and attribution, yet prior studies have not fully explored
+the practical and scientific dimensions of this task. In addition to
+attributing images to 12 state-of-the-art T2I generators, we provide extensive
+analyses on which inference-stage hyperparameters and image modifications are
+discernible. Our experiments reveal that initialization seeds are highly
+detectable, along with other subtle variations in the image generation process,
+to some extent. We further investigate what visual traces are leveraged in
+image attribution by perturbing high-frequency details and employing mid-level
+representations of image style and structure. Notably, altering high-frequency
+information causes only slight reductions in accuracy, and training an
+attributor on style representations outperforms training on RGB images. Our
+analyses underscore that fake images are detectable and attributable at more
+levels of visual granularity than previously explored.
+
+
+ comment: Code available at https://github.com/k8xu/ImageAttribution +
+
+
+
+
+ + ♻ ☆ Hierarchical Augmentation and Distillation for Class Incremental + Audio-Visual Video Recognition + + +
+ Audio-visual video recognition (AVVR) aims to integrate audio and visual +clues to categorize videos accurately. While existing methods train AVVR models +using provided datasets and achieve satisfactory results, they struggle to +retain historical class knowledge when confronted with new classes in +real-world situations. Currently, there are no dedicated methods for addressing +this problem, so this paper concentrates on exploring Class Incremental +Audio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and +learned model of past classes contain historical knowledge, the core challenge +is how to capture past data knowledge and past model knowledge to prevent +catastrophic forgetting. We introduce Hierarchical Augmentation and +Distillation (HAD), which comprises the Hierarchical Augmentation Module (HAM) +and Hierarchical Distillation Module (HDM) to efficiently utilize the +hierarchical structure of data and models, respectively. Specifically, HAM +implements a novel augmentation strategy, segmental feature augmentation, to +preserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed +hierarchical (video-distribution) logical distillation and hierarchical +(snippet-video) correlative distillation to capture and maintain the +hierarchical intra-sample knowledge of each data and the hierarchical +inter-sample knowledge between data, respectively. Evaluations on four +benchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed +HAD effectively captures hierarchical information in both data and models, +resulting in better preservation of historical class knowledge and improved +performance. Furthermore, we provide a theoretical analysis to support the +necessity of the segmental feature augmentation strategy. + +
+
+ comment: Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models ICLR 2024 + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their \textit{exposure bias} problem, described as the input mismatch between +training and sampling, lacks in-depth exploration. In this paper, we +systematically investigate the exposure bias problem in diffusion models by +first analytically modelling the sampling distribution, based on which we then +attribute the prediction error at each sampling step as the root cause of the +exposure bias issue. Furthermore, we discuss potential solutions to this issue +and propose an intuitive metric for it. Along with the elucidation of exposure +bias, we propose a simple, yet effective, training-free method called Epsilon +Scaling to alleviate the exposure bias. We show that Epsilon Scaling explicitly +moves the sampling trajectory closer to the vector field learned in the +training phase by scaling down the network output, mitigating the input +mismatch between training and sampling. Experiments on various diffusion +frameworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our +method. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler, +obtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code +is available at \url{https://github.com/forever208/ADM-ES} and +\url{https://github.com/forever208/EDM-ES}. + +
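+
+ As an editorial illustration of the core idea above, the sketch below applies
+epsilon scaling inside a deterministic DDIM update: the predicted noise is
+simply divided by a factor slightly above one before being used. The stub noise
+predictor, the schedule, and the value 1.004 are illustrative assumptions, not
+the paper's tuned settings.
+
+import torch
+
+def ddim_step_with_epsilon_scaling(x_t, t, t_prev, alphas_cumprod, eps_model,
+                                   lambda_scale=1.004):
+    # Scale down the network output, then take a standard deterministic
+    # DDIM step using the rescaled noise prediction.
+    a_t, a_prev = alphas_cumprod[t], alphas_cumprod[t_prev]
+    eps = eps_model(x_t, t) / lambda_scale
+    x0_pred = (x_t - torch.sqrt(1 - a_t) * eps) / torch.sqrt(a_t)
+    return torch.sqrt(a_prev) * x0_pred + torch.sqrt(1 - a_prev) * eps
+
+# Toy usage with a stand-in for a trained noise-prediction UNet.
+alphas_cumprod = torch.linspace(0.999, 0.01, 1000)
+eps_model = lambda x, t: torch.zeros_like(x)
+x = torch.randn(1, 3, 8, 8)
+x = ddim_step_with_epsilon_scaling(x, 999, 998, alphas_cumprod, eps_model)
+print(x.shape)
+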
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Discovering Closed-Loop Failures of Vision-Based Controllers via + Reachability Analysis + + +
+ Machine learning driven image-based controllers allow robotic systems to take +intelligent actions based on the visual feedback from their environment. +Understanding when these controllers might lead to system safety violations is +important for their integration in safety-critical applications and engineering +corrective safety measures for the system. Existing methods leverage +simulation-based testing (or falsification) to find the failures of +vision-based controllers, i.e., the visual inputs that lead to closed-loop +safety violations. However, these techniques do not scale well to the scenarios +involving high-dimensional and complex visual inputs, such as RGB images. In +this work, we cast the problem of finding closed-loop vision failures as a +Hamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based +analysis with HJ reachability methods to compute an approximation of the +backward reachable tube (BRT) of the system, i.e., the set of unsafe states for +the system under vision-based controllers. Utilizing the BRT, we can tractably +and systematically find the system states and corresponding visual inputs that +lead to closed-loop failures. These visual inputs can be subsequently analyzed +to find the input characteristics that might have caused the failure. Besides +its scalability to high-dimensional visual inputs, an explicit computation of +BRT allows the proposed approach to capture non-trivial system failures that +are difficult to expose via random simulations. We demonstrate our framework on +two case studies involving an RGB image-based neural network controller for (a) +autonomous indoor navigation, and (b) autonomous aircraft taxiing. + +
+
+
+
+
+
+ ♻ ☆ nnMobileNet: Rethinking CNN for Retinopathy Research CVPR
+
+
+
+ Over the past few decades, convolutional neural networks (CNNs) have been at
+the forefront of the detection and tracking of various retinal diseases (RD).
+Despite their success, the emergence of vision transformers (ViT) in the 2020s
+has shifted the trajectory of RD model development. The leading-edge
+performance of ViT-based models in RD can be largely credited to their
+scalability, i.e., their ability to improve as more parameters are added. As a
+result, ViT-based models tend to outshine traditional CNNs in RD applications,
+albeit at the cost of increased data and computational demands. ViTs also
+differ from CNNs in their approach to processing images, working with patches
+rather than local regions, which can complicate the precise localization of
+small, variably presented lesions in RD. In our study, we revisited and updated
+the architecture of a CNN model, specifically MobileNet, to enhance its utility
+in RD diagnostics. We found that an optimized MobileNet, through selective
+modifications, can surpass ViT-based models in various RD benchmarks, including
+diabetic retinopathy grading, detection of multiple fundus diseases, and
+classification of diabetic macular edema. The code is available at
+https://github.com/Retinal-Research/NN-MOBILENET
+
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs
+have driven progress in various video understanding tasks. These models encode
+video representations through pooling or query aggregation over a vast number
+of visual tokens, making computational and memory costs affordable. Despite
+successfully providing an overall comprehension of video content, existing
+VideoLLMs still face challenges in achieving detailed understanding in videos
+due to overlooking local information in long-term videos. To tackle this
+challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for
+long video understanding, building upon the observation that long videos often
+consist of sequential key events, complex actions, and camera movements. Our
+approach proposes to decompose long videos into multiple short-term segments
+and encode local features for each local segment via a hierarchical token
+merging module. These features are concatenated in temporal order to maintain
+the storyline across sequential short-term segments. Additionally, we propose
+to integrate global semantics into each local feature to enhance context
+understanding. In this way, we encode video representations that incorporate
+both local and global information, enabling the LLM to generate comprehensive
+responses for long-term videos. Experimental results on the VideoChatGPT
+benchmark and zero-shot video question-answering datasets demonstrate the
+superior capabilities of our model over the previous state-of-the-art methods.
+Qualitative examples demonstrate that our model produces more precise responses
+for long video understanding. Code will be available at
+https://github.com/ziplab/LongVLM.
+
+
+
+
+
+ + ♻ ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV) +representation has emerged as a crucial aspect of 3D object detection in +autonomous driving. However, existing methods are susceptible to the inaccurate +calibration relationship between LiDAR and the camera sensor. Such inaccuracies +result in errors in depth estimation for the camera branch, ultimately causing +misalignment between LiDAR and camera BEV features. In this work, we propose a +robust fusion framework called Graph BEV. Addressing errors caused by +inaccurate point cloud projection, we introduce a Local Align module that +employs neighbor-aware depth features via Graph matching. Additionally, we +propose a Global Align module to rectify the misalignment between LiDAR and +camera BEV features. Our Graph BEV framework achieves state-of-the-art +performance, with an mAP of 70.1\%, surpassing BEV Fusion by 1.6\% on the +nuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by +8.3\% under conditions with misalignment noise. + +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ♻ ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung
+lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians
+in identifying lesion areas. During EBUS-transbronchial needle aspiration
+(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to
+determine the location of lesions. However, these images often contain
+significant noise and can be influenced by surrounding tissues or blood
+vessels, making interpretation challenging. Previous research has lacked the
+application of object detection models to EBUS-TBNA, and there has been no
+well-defined solution for annotating the EBUS-TBNA dataset. In related studies
+on ultrasound images, although models have been successful in capturing target
+regions for their respective tasks, their training and predictions have been
+based on two-dimensional images, limiting their ability to leverage temporal
+features for improved predictions. This study introduces a three-dimensional
+image-based object detection model. It utilizes an attention mechanism to
+capture temporal correlations and implements a filtering mechanism to select
+relevant information from previous frames. Subsequently, a teacher-student
+model training approach is employed to optimize the model further, leveraging
+unlabeled data. To mitigate the impact of poor-quality pseudo-labels on the
+student model, we add a Gaussian Mixture Model (GMM) to ensure the quality of
+pseudo-labels.
+
+
+
+
+
+ + ♻ ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for +endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary +diagnosis of metastatic cancer. This involves arranging immediate examinations +for other sites of metastatic cancer after EBUS surgery, eliminating the need +to wait for reports, thereby shortening the waiting time by more than half and +enabling patients to detect other cancers earlier, allowing for early planning +and implementation of treatment plans. Unlike previous studies on cell image +classification, which have abundant datasets for training, this study must also +be able to make effective classifications despite the limited amount of case +data for lung metastatic cancer. In the realm of small data set classification +methods, Few-shot learning (FSL) has become mainstream in recent years. Through +its ability to train on small datasets and its strong generalization +capabilities, FSL shows potential in this task of lung metastatic cell image +classification. This study will adopt the approach of Few-shot learning, +referencing existing proposed models, and designing a model architecture for +classifying lung metastases cell images. Batch Spectral Regularization (BSR) +will be incorporated as a loss update parameter, and the Finetune method of PMF +will be modified. In terms of test results, the addition of BSR and the +modified Finetune method further increases the accuracy by 8.89% to 65.60%, +outperforming other FSL methods. This study confirms that FSL is superior to +supervised and transfer learning in classifying metastatic cancer and +demonstrates that using BSR as a loss function and modifying Finetune can +enhance the model's capabilities. + +
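+
+ For readers unfamiliar with Batch Spectral Regularization, the sketch below
+shows one common way such a penalty is written: the singular values of the
+batch feature matrix are penalized so the embedding does not rely on a few
+dominant spectral components. The weight and the way the term would be combined
+with the few-shot loss are assumptions, not the paper's exact formulation.
+
+import torch
+
+def batch_spectral_regularization(features, weight=1e-3):
+    # features: (batch, dim) embeddings from the backbone.
+    # Penalize the squared singular values of the batch feature matrix.
+    sigma = torch.linalg.svdvals(features)
+    return weight * torch.sum(sigma ** 2)
+
+feats = torch.randn(16, 128, requires_grad=True)
+loss = batch_spectral_regularization(feats)   # added to the classification loss
+loss.backward()
+print(loss.item())
+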
+
+
+
+
+ + ♻ ☆ Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D + Images + + +
+ Accurately recovering the dense 3D mesh of both hands from monocular images
+poses considerable challenges due to occlusions and projection ambiguity. Most
+of the existing methods extract features from color images to estimate the
+root-aligned hand meshes, which neglect the crucial depth and scale information
+in the real world. Given the noisy sensor measurements with limited resolution,
+depth-based methods predict 3D keypoints rather than a dense mesh. These
+limitations motivate us to take advantage of these two complementary inputs to
+acquire dense hand meshes on a real-world scale. In this work, we propose an
+end-to-end framework for recovering dense meshes for both hands, which employs
+single-view RGB-D image pairs as input. The primary challenge lies in
+effectively utilizing two different input modalities to mitigate the blurring
+effects in RGB images and noises in depth images. Instead of directly treating
+depth maps as additional channels for RGB images, we encode the depth
+information into the unordered point cloud to preserve more geometric details.
+Specifically, our framework employs ResNet50 and PointNet++ to derive features
+from RGB and point cloud, respectively. Additionally, we introduce a novel
+pyramid deep fusion network (PDFNet) to aggregate features at different scales,
+which demonstrates superior efficacy compared to previous fusion strategies.
+Furthermore, we employ a GCN-based decoder to process the fused features and
+recover the corresponding 3D pose and dense mesh. Through comprehensive
+ablation experiments, we have not only demonstrated the effectiveness of our
+proposed fusion algorithm but also outperformed the state-of-the-art approaches
+on publicly available datasets. To reproduce the results, we will make our
+source code and models publicly available at
+{https://github.com/zijinxuxu/PDFNet}.
+
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection + + +
+ In this letter, we present a new dataset to advance the state of the art in
+detecting citrus fruit and accurately estimating yield on trees affected by the
+Huanglongbing (HLB) disease in orchard environments via imaging. Despite the
+fact that significant progress has been made in solving the fruit detection
+problem, the lack of publicly available datasets has complicated direct
+comparison of results. For instance, citrus detection has long been of interest
+to the agricultural research community, yet there is an absence of work,
+particularly involving public datasets of citrus affected by HLB. To address
+this issue, we enhance state-of-the-art object detection methods for use in
+typical orchard settings. Concretely, we provide high-resolution images of
+citrus trees located in an area known to be highly affected by HLB, along with
+high-quality bounding box annotations of citrus fruit. Fruit on both the trees
+and the ground are labeled to allow for identification of fruit location, which
+contributes to advancements in yield estimation and potential measure of HLB
+impact via fruit drop. The dataset consists of over 32,000 bounding box
+annotations for fruit instances contained in 579 high-resolution images. In
+summary, our contributions are the following: (i) we introduce a novel dataset
+along with baseline performance benchmarks on multiple contemporary object
+detection algorithms, (ii) we show the ability to accurately capture fruit
+location on tree or on ground, and finally (iii) we present a correlation of
+our results with yield estimations.
+
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ A Generic Shared Attention Mechanism for Various Backbone Neural + Networks + + +
+ The self-attention mechanism has emerged as a critical component for
+improving the performance of various backbone neural networks. However, current
+mainstream approaches individually incorporate newly designed self-attention
+modules (SAMs) into each layer of the network without fully exploiting their
+parameters' potential. This leads to suboptimal performance and increased
+parameter consumption as the network depth increases. To improve this paradigm,
+in this paper, we first present a counterintuitive but inherent phenomenon:
+SAMs tend to produce strongly correlated attention maps across different
+layers, with an average Pearson correlation coefficient of up to 0.85. Inspired
+by this observation, we propose Dense-and-Implicit Attention (DIA), which
+directly shares SAMs across layers and employs a long short-term memory module
+to calibrate and bridge the highly correlated attention maps of different
+layers, thus improving the parameter utilization efficiency of SAMs. This
+design of DIA is also consistent with the neural network's dynamical system
+perspective. Through extensive experiments, we demonstrate that our simple yet
+effective DIA can consistently enhance various network backbones, including
+ResNet, Transformer, and UNet, across tasks such as image classification,
+object detection, and image generation using diffusion models.
+
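+
+ The sketch below is a loose, editorial reading of the layer-sharing idea: a
+single channel-attention module is reused across several layers, and an LSTM
+cell carries the attention history so that each layer's map is calibrated
+against the previous ones. The module structure, reduction ratio, and gating
+are illustrative assumptions, not the DIA implementation.
+
+import torch
+import torch.nn as nn
+
+class SharedAttention(nn.Module):
+    # One instance is shared by all layers of a stage with the same channel
+    # count; the LSTM hidden state links the attention maps across layers.
+    def __init__(self, channels, reduction=4):
+        super().__init__()
+        self.squeeze = nn.AdaptiveAvgPool2d(1)
+        self.excite = nn.Sequential(
+            nn.Linear(channels, channels // reduction),
+            nn.ReLU(inplace=True),
+            nn.Linear(channels // reduction, channels),
+        )
+        self.lstm = nn.LSTMCell(channels, channels)
+        self.state = None  # reset before each forward pass of the backbone
+
+    def reset(self):
+        self.state = None
+
+    def forward(self, x):
+        b, c, _, _ = x.shape
+        a = self.excite(self.squeeze(x).view(b, c))   # raw attention logits
+        if self.state is None:
+            zeros = torch.zeros(b, c, device=x.device)
+            self.state = (zeros, zeros.clone())
+        h, cell = self.lstm(a, self.state)            # calibrate with history
+        self.state = (h, cell)
+        return x * torch.sigmoid(h).view(b, c, 1, 1)
+
+shared = SharedAttention(channels=64)
+feats = torch.randn(2, 64, 32, 32)
+shared.reset()
+for _ in range(3):          # pretend these are three backbone layers
+    feats = shared(feats)
+print(feats.shape)          # torch.Size([2, 64, 32, 32])
+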
+
+ comment: Work in progress. arXiv admin note: text overlap with + arXiv:1905.10671 +
+
+
+
+
+ + ♻ ☆ Flying with Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize
+videos of light propagating through a scene from novel, moving camera
+viewpoints. Our approach relies on a new ultrafast imaging setup to capture a
+first-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal
+resolution. Combined with this dataset, we introduce an efficient neural volume
+rendering framework based on the transient field. This field is defined as a
+mapping from a 3D point and 2D direction to a high-dimensional, discrete-time
+signal that represents time-varying radiance at ultrafast timescales. Rendering
+with transient fields naturally accounts for effects due to the finite speed of
+light, including viewpoint-dependent appearance changes caused by light
+propagation delays to the camera. We render a range of complex effects,
+including scattering, specular reflection, refraction, and diffraction.
+Additionally, we demonstrate removing viewpoint-dependent propagation delays
+using a time warping procedure, rendering of relativistic effects, and video
+synthesis of direct and global components of light transport.
+
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ♻ ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+ comment: Project page: https://janehwu.github.io/mcc-ho +
+
+
+
+
+ + ♻ ☆ Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging + + +
+ In 3D imaging, light field cameras typically require only a single shot;
+however, they suffer heavily from low spatial resolution and depth accuracy. In
+this paper, by employing an optical projector to project a single group of
+high-frequency phase-shifted sinusoid patterns, we propose a phase guided light
+field algorithm to significantly improve both the spatial and depth resolutions
+for off-the-shelf light field cameras. First, for correcting the axial
+aberrations caused by the main lens of our light field camera, we propose a
+deformed cone model to calibrate our structured light field system. Second,
+over wrapped phases computed from patterned images, we propose a stereo
+matching algorithm, i.e. phase guided sum of absolute difference, to robustly
+obtain the correspondence for each pair of neighboring lenslets. Finally, by
+introducing a virtual camera according to the basic geometrical optics of light
+field imaging, we propose a reorganization strategy to reconstruct 3D point
+clouds with spatial-depth high resolution. Experimental results show that,
+compared with the state-of-the-art active light field methods, the proposed
+method reconstructs 3D point clouds with a spatial resolution of
+1280$\times$720, a 10$\times$ increase, while maintaining the same high depth
+resolution and requiring merely a single group of high-frequency patterns.
+
+
+
+
+
+ + ♻ ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ♻ ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale
+labeled data by leveraging unlabeled data. Recent semi-supervised semantic
+segmentation approaches mainly resort to pseudo-labeling methods to exploit
+unlabeled data. However, unreliable pseudo-labeling can undermine the
+semi-supervision process. In this paper, we propose an algorithm called
+Multi-Level Label Correction (MLLC), which aims to use graph neural networks to
+capture structural relationships in Semantic-Level Graphs (SLGs) and
+Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,
+SLGs represent semantic affinities between pairs of pixel features, and CLGs
+describe classification consistencies between pairs of pixel labels. With the
+support of proximate pattern information from graphs, MLLC can rectify
+incorrectly predicted pseudo-labels and can facilitate discriminative feature
+representations. We design an end-to-end network to train and perform this
+effective label correction mechanism. Experiments demonstrate that MLLC can
+significantly improve supervised baselines and outperforms state-of-the-art
+approaches in different scenarios on the Cityscapes and PASCAL VOC 2012
+datasets. Specifically, MLLC improves the supervised baseline by at least 5%
+and 2% with DeepLabV2 and DeepLabV3+ respectively under different partition
+protocols.
+
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Diffusion For Strong and High Quality Face Morphing Attacks + + +
+ Face morphing attacks seek to deceive a Face Recognition (FR) system by +presenting a morphed image consisting of the biometric qualities from two +different identities with the aim of triggering a false acceptance with one of +the two identities, thereby presenting a significant threat to biometric +systems. The success of a morphing attack is dependent on the ability of the +morphed image to represent the biometric characteristics of both identities +that were used to create the image. We present a novel morphing attack that +uses a Diffusion-based architecture to improve the visual fidelity of the image +and the ability of the morphing attack to represent characteristics from both +identities. We demonstrate the effectiveness of the proposed attack by +evaluating its visual fidelity via the Frechet Inception Distance (FID). Also, +extensive experiments are conducted to measure the vulnerability of FR systems +to the proposed attack. The ability of a morphing attack detector to detect the +proposed attack is measured and compared against two state-of-the-art GAN-based +morphing attacks along with two Landmark-based attacks. Additionally, a novel +metric to measure the relative strength between different morphing attacks is +introduced and evaluated. + +
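+
+ Since visual fidelity is scored with the Frechet Inception Distance, the
+snippet below recalls the standard FID computation between two sets of
+Inception feature vectors; it is the generic definition, not a reproduction of
+the paper's evaluation pipeline.
+
+import numpy as np
+from scipy.linalg import sqrtm
+
+def frechet_inception_distance(feats_real, feats_gen):
+    # Fit Gaussians to the two feature sets and compare them.
+    mu_r, mu_g = feats_real.mean(0), feats_gen.mean(0)
+    cov_r = np.cov(feats_real, rowvar=False)
+    cov_g = np.cov(feats_gen, rowvar=False)
+    covmean = sqrtm(cov_r @ cov_g)
+    if np.iscomplexobj(covmean):   # discard tiny imaginary numerical noise
+        covmean = covmean.real
+    return float(np.sum((mu_r - mu_g) ** 2)
+                 + np.trace(cov_r + cov_g - 2 * covmean))
+
+rng = np.random.default_rng(0)
+print(frechet_inception_distance(rng.normal(size=(256, 64)),
+                                 rng.normal(loc=0.1, size=(256, 64))))
+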
+
+ comment: Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
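+
+ As a rough illustration of personalization with a Gaussian process fitted on a
+handful of labeled samples, the sketch below models the residual between a base
+model's gaze prediction and the subject's ground truth, then adds the predicted
+residual at test time. The feature dimension, kernel, and residual formulation
+are assumptions, not the paper's configuration.
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(3, 16))              # features of 3 labeled frames
+base_pred = rng.normal(size=(3, 2))           # base model gaze (yaw, pitch)
+true_gaze = base_pred + np.array([1.5, -0.7]) # subject-specific offset
+
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(1e-2),
+                              normalize_y=True)
+gp.fit(feats, true_gaze - base_pred)          # model only the residual
+
+new_feat = rng.normal(size=(1, 16))
+new_base = np.array([[10.0, -3.0]])
+print(new_base + gp.predict(new_feat))        # personalized prediction
+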
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 170 + +
+
+
+ + ☆ InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model + Handling Resolutions from 336 Pixels to 4K HD + + +
+ The Large Vision-Language Model (LVLM) field has seen significant
+advancements, yet its progression has been hindered by challenges in
+comprehending fine-grained visual content due to limited resolution. Recent
+efforts have aimed to enhance the high-resolution understanding capabilities of
+LVLMs, yet they remain capped at approximately 1500 x 1500 pixels and
+constrained to a relatively narrow resolution range. This paper presents
+InternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM
+resolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently,
+considering that ultra-high resolution may not be necessary in all scenarios,
+it supports a wide range of diverse resolutions from 336 pixels to 4K standard,
+significantly broadening its scope of applicability. Specifically, this
+research advances the patch division paradigm by introducing a novel extension:
+dynamic resolution with automatic patch configuration. It maintains the
+training image aspect ratios while automatically varying patch counts and
+configuring layouts based on a pre-trained Vision Transformer (ViT) (336 x
+336), leading to dynamic training resolution from 336 pixels to 4K standard.
+Our research demonstrates that scaling training resolution up to 4K HD leads to
+consistent performance enhancements without hitting the ceiling of potential
+improvements. InternLM-XComposer2-4KHD shows superb capability that matches or
+even surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The
+InternLM-XComposer2-4KHD model series with 7B parameters are publicly available
+at https://github.com/InternLM/InternLM-XComposer.
+
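+
+ As a toy, editorial sketch of what dynamic resolution with automatic patch
+configuration could look like, the function below keeps the input aspect ratio
+while choosing a rows x cols grid of 336-pixel patches under a patch budget.
+The budget of 55 patches and the rounding rule are assumptions, not the model's
+actual settings.
+
+def dynamic_patch_layout(width, height, patch=336, max_patches=55):
+    # Grow the grid while preserving the aspect ratio and the patch budget,
+    # then report the padded resolution the image would be resized to.
+    best = (1, 1)
+    for cols in range(1, max_patches + 1):
+        rows = max(1, round(cols * height / width))
+        if rows * cols > max_patches:
+            break
+        best = (rows, cols)
+    rows, cols = best
+    return rows, cols, (cols * patch, rows * patch)
+
+print(dynamic_patch_layout(3840, 1600))   # a grid close to the 4K aspect ratio
+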
+
+ comment: Code and models are publicly available at + https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ MoReVQA: Exploring Modular Reasoning Models for Video Question Answering CVPR 2024 + + +
+ This paper addresses the task of video question answering (videoQA) via a +decomposed multi-stage, modular reasoning framework. Previous modular methods +have shown promise with a single planning stage ungrounded in visual content. +However, through a simple and effective baseline, we find that such systems can +lead to brittle behavior in practice for challenging videoQA settings. Thus, +unlike traditional single-stage planning methods, we propose a multi-stage +system consisting of an event parser, a grounding stage, and a final reasoning +stage in conjunction with an external memory. All stages are training-free, and +performed using few-shot prompting of large models, creating interpretable +intermediate outputs at each stage. By decomposing the underlying planning and +task complexity, our method, MoReVQA, improves over prior work on standard +videoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with +state-of-the-art results, and extensions to related tasks (grounded videoQA, +paragraph captioning). + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Can Feedback Enhance Semantic Grounding in Large Vision-Language Models? + + +
+ Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often +involves collecting domain-specific training data, refining the network +architectures, or modifying the training recipes. In this work, we venture into +an orthogonal direction and explore whether VLMs can improve their semantic +grounding by "receiving" feedback, without requiring in-domain data, +fine-tuning, or modifications to the network architectures. We systematically +analyze this hypothesis using a feedback mechanism composed of a binary signal. +We find that if prompted appropriately, VLMs can utilize feedback both in a +single step and iteratively, showcasing the potential of feedback as an +alternative technique to improve grounding in internet-scale VLMs. Furthermore, +VLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we +find that this issue can be mitigated via a binary verification mechanism. +Finally, we explore the potential and limitations of amalgamating these +findings and applying them iteratively to automatically enhance VLMs' grounding +performance, showing grounding accuracy consistently improves using automated +feedback across all models in all settings investigated. Overall, our iterative +framework improves semantic grounding in VLMs by more than 15 accuracy points +under noise-free feedback and up to 5 accuracy points under a simple automated +binary verification mechanism. The project website is hosted at +https://andrewliao11.github.io/vlms_feedback + +
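+
+ The schematic loop below shows the shape of the iterative binary-feedback
+procedure described above. vlm_ground and binary_verifier are hypothetical
+stand-ins (stubbed here so the script runs), not APIs from the paper or any
+library.
+
+import random
+
+def vlm_ground(image, query, history):
+    # Hypothetical stand-in for prompting a VLM to ground `query` in `image`,
+    # optionally conditioned on previous attempts and their feedback.
+    return {"box": [random.random() for _ in range(4)], "tries": len(history)}
+
+def binary_verifier(image, query, answer):
+    # Hypothetical binary signal (e.g., an open-vocabulary detector checking
+    # the proposed box); here it simply accepts after two retries.
+    return answer["tries"] >= 2
+
+def iterative_grounding(image, query, max_rounds=5):
+    history = []
+    answer = vlm_ground(image, query, history)
+    for _ in range(max_rounds):
+        if binary_verifier(image, query, answer):   # accept or retry
+            break
+        history.append(answer)
+        answer = vlm_ground(image, query, history)  # re-prompt with feedback
+    return answer
+
+print(iterative_grounding("img.jpg", "the red mug"))
+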
+
+ comment: 31 pages, 15 figures +
+
+
+
+
+ + ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+
+
+
+ + ☆ Flying With Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize
+videos of light propagating through a scene from novel, moving camera
+viewpoints. Our approach relies on a new ultrafast imaging setup to capture a
+first-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal
+resolution. Combined with this dataset, we introduce an efficient neural volume
+rendering framework based on the transient field. This field is defined as a
+mapping from a 3D point and 2D direction to a high-dimensional, discrete-time
+signal that represents time-varying radiance at ultrafast timescales. Rendering
+with transient fields naturally accounts for effects due to the finite speed of
+light, including viewpoint-dependent appearance changes caused by light
+propagation delays to the camera. We render a range of complex effects,
+including scattering, specular reflection, refraction, and diffraction.
+Additionally, we demonstrate removing viewpoint-dependent propagation delays
+using a time warping procedure, rendering of relativistic effects, and video
+synthesis of direct and global components of light transport.
+
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ☆ RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length + Videos + + +
+ Remote photoplethysmography (rPPG) is a non-contact method for detecting +physiological signals from facial videos, holding great potential in various +applications such as healthcare, affective computing, and anti-spoofing. +Existing deep learning methods struggle to address two core issues of rPPG +simultaneously: extracting weak rPPG signals from video segments with large +spatiotemporal redundancy and understanding the periodic patterns of rPPG among +long contexts. This represents a trade-off between computational complexity and +the ability to capture long-range dependencies, posing a challenge for rPPG +that is suitable for deployment on mobile devices. Based on the in-depth +exploration of Mamba's comprehension of spatial and temporal information, this +paper introduces RhythmMamba, an end-to-end Mamba-based method that employs +multi-temporal Mamba to constrain both periodic patterns and short-term trends, +coupled with frequency domain feed-forward to enable Mamba to robustly +understand the quasi-periodic patterns of rPPG. Extensive experiments show that +RhythmMamba achieves state-of-the-art performance with reduced parameters and +lower computational complexity. The proposed RhythmMamba can be applied to +video segments of any length without performance degradation. The codes are +available at https://github.com/zizheng-guo/RhythmMamba. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.12788 +
+
+
+
+
+ + ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ☆ Learning State-Invariant Representations of Objects from Image + Collections with State, Pose, and Viewpoint Changes + + +
+ We add one more invariance - state invariance - to the more commonly used
+invariances for learning object representations for recognition and retrieval.
+By state invariance, we mean robustness with respect to changes in the
+structural form of the object, such as when an umbrella is folded, or when an
+item of clothing is tossed on the floor. Since humans generally have no
+difficulty in recognizing objects despite such state changes, we are naturally
+faced with the question of whether it is possible to devise a neural
+architecture with similar abilities. To that end, we present a novel dataset,
+ObjectsWithStateChange, that captures state and pose variations in the object
+images recorded from arbitrary viewpoints. We believe that this dataset will
+facilitate research in fine-grained object recognition and retrieval of objects
+that are capable of state changes. The goal of such research would be to train
+models capable of generating object embeddings that remain invariant to state
+changes while also staying invariant to transformations induced by changes in
+viewpoint, pose, illumination, etc. To demonstrate the usefulness of the
+ObjectsWithStateChange dataset, we also propose a curriculum learning strategy
+that uses the similarity relationships in the learned embedding space after
+each epoch to guide the training process. The model learns discriminative
+features by comparing visually similar objects within and across different
+categories, encouraging it to differentiate between objects that may be
+challenging to distinguish due to changes in their state. We believe that this
+strategy enhances the model's ability to capture discriminative features for
+fine-grained tasks that may involve objects with state changes, leading to
+performance improvements on object-level tasks not only on our new dataset, but
+also on two other challenging multi-view datasets, ModelNet40 and ObjectPI.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ A comparative analysis of deep learning models for lung segmentation on + X-ray images + + +
+ Robust and highly accurate lung segmentation in X-rays is crucial in medical +imaging. This study evaluates deep learning solutions for this task, ranking +existing methods and analyzing their performance under diverse image +modifications. Out of 61 analyzed papers, only nine offered implementation or +pre-trained models, enabling assessment of three prominent methods: Lung VAE, +TransResUNet, and CE-Net. The analysis revealed that CE-Net performs best, +demonstrating the highest values in dice similarity coefficient and +intersection over union metric. + +
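+
+ For reference, the two metrics used to rank the models above, the Dice
+similarity coefficient and intersection over union, are computed on binary
+masks as in the short sketch below.
+
+import numpy as np
+
+def dice_and_iou(pred, target, eps=1e-7):
+    # Dice = 2*|intersection| / (|pred| + |target|); IoU = |intersection| / |union|
+    pred, target = pred.astype(bool), target.astype(bool)
+    inter = np.logical_and(pred, target).sum()
+    dice = (2 * inter + eps) / (pred.sum() + target.sum() + eps)
+    iou = (inter + eps) / (np.logical_or(pred, target).sum() + eps)
+    return dice, iou
+
+pred = np.zeros((8, 8), dtype=np.uint8); pred[2:6, 2:6] = 1
+target = np.zeros((8, 8), dtype=np.uint8); target[3:7, 3:7] = 1
+print(dice_and_iou(pred, target))   # approximately (0.56, 0.39)
+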
+
+ comment: published at the Polish Conference on Artificial Intelligence + (PP-RAI), 2024 +
+
+
+
+
+ + ☆ PURE: Turning Polysemantic Neurons Into Pure Features by Identifying + Relevant Circuits + + +
+ The field of mechanistic interpretability aims to study the role of +individual neurons in Deep Neural Networks. Single neurons, however, have the +capability to act polysemantically and encode for multiple (unrelated) +features, which renders their interpretation difficult. We present a method for +disentangling polysemanticity of any Deep Neural Network by decomposing a +polysemantic neuron into multiple monosemantic "virtual" neurons. This is +achieved by identifying the relevant sub-graph ("circuit") for each "pure" +feature. We demonstrate how our approach allows us to find and disentangle +various polysemantic units of ResNet models trained on ImageNet. While +evaluating feature visualizations using CLIP, our method effectively +disentangles representations, improving upon methods based on neuron +activations. Our code is available at https://github.com/maxdreyer/PURE. + +
+
+ comment: 14 pages (4 pages manuscript, 2 pages references, 8 pages appendix) +
+
+
+
+
+ + ☆ SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions + + +
+ Human visual imagination usually begins with analogies or rough sketches. For
+example, given an image of a girl playing guitar in front of a building, one
+may analogously imagine how it would look if Iron Man were playing guitar in
+front of a pyramid in Egypt. Nonetheless, the visual condition may not be
+precisely aligned with the imaginary result indicated by the text prompt, and
+existing layout-controllable text-to-image (T2I) generation models are prone to
+producing degraded results with obvious artifacts. To address this issue, we
+present a novel T2I generation method dubbed SmartControl, which is designed to
+modify the rough visual conditions for adapting to text prompt. The key idea of
+our SmartControl is to relax the visual condition on the areas that are
+conflicted with text prompts. Specifically, a Control Scale Predictor (CSP) is
+designed to identify the conflict regions and predict the local control scales,
+while a dataset with text prompts and rough visual conditions is constructed
+for training CSP. It is worth noting that, even with a limited number (e.g.,
+1,000~2,000) of training samples, our SmartControl can generalize well to
+unseen objects. Extensive experiments on four typical visual condition types
+clearly show the efficacy of our SmartControl against the state of the art.
+Source code, pre-trained models, and datasets are available at
+https://github.com/liuxiaoyu1104/SmartControl.
+
+
+
+
+
+ + ☆ The Central Spanning Tree Problem + + +
+ Spanning trees are an important primitive in many data analysis tasks, when a +data set needs to be summarized in terms of its "skeleton", or when a +tree-shaped graph over all observations is required for downstream processing. +Popular definitions of spanning trees include the minimum spanning tree and the +optimum distance spanning tree, a.k.a. the minimum routing cost tree. When +searching for the shortest spanning tree but admitting additional branching +points, even shorter spanning trees can be realized: Steiner trees. +Unfortunately, both minimum spanning and Steiner trees are not robust with +respect to noise in the observations; that is, small perturbations of the +original data set often lead to drastic changes in the associated spanning +trees. In response, we make two contributions when the data lies in a Euclidean +space: on the theoretical side, we introduce a new optimization problem, the +"(branched) central spanning tree", which subsumes all previously mentioned +definitions as special cases. On the practical side, we show empirically that +the (branched) central spanning tree is more robust to noise in the data, and +as such is better suited to summarize a data set in terms of its skeleton. We +also propose a heuristic to address the NP-hard optimization problem, and +illustrate its use on single cell RNA expression data from biology and 3D point +clouds of plants. + +
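+
+ To make two of the special cases mentioned above concrete, the toy script
+below compares the minimum spanning tree (which minimizes total edge length)
+with a star tree, whose routing cost, i.e. the sum of pairwise path lengths,
+is often lower. It only illustrates the objectives the new problem relates to;
+it does not implement the (branched) central spanning tree itself.
+
+import itertools
+import networkx as nx
+import numpy as np
+
+def routing_cost(tree):
+    # Sum of pairwise shortest-path lengths inside the tree.
+    dists = dict(nx.all_pairs_dijkstra_path_length(tree, weight="weight"))
+    return sum(dists[u][v] for u, v in itertools.combinations(tree.nodes, 2))
+
+rng = np.random.default_rng(1)
+pts = rng.uniform(size=(6, 2))
+G = nx.complete_graph(len(pts))
+for i, j in G.edges:
+    G[i][j]["weight"] = float(np.linalg.norm(pts[i] - pts[j]))
+
+mst = nx.minimum_spanning_tree(G, weight="weight")
+star = nx.Graph()
+star.add_weighted_edges_from((0, j, G[0][j]["weight"])
+                             for j in range(1, len(pts)))
+
+for name, T in [("MST", mst), ("star at node 0", star)]:
+    print(name, "total edge length:", round(T.size(weight="weight"), 3),
+          "routing cost:", round(routing_cost(T), 3))
+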
+
+
+
+
+ + ☆ Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial + Action Units Recognition CVPR2024 + + +
+ Human facial action units (AUs) are mutually related in a hierarchical
+manner, as not only are they associated with each other in both spatial and
+temporal domains but also AUs located in the same/close facial regions show
+stronger relationships than those of different facial regions. While no
+existing approach thoroughly models such hierarchical inter-dependencies among
+AUs, this paper proposes to comprehensively model multi-scale, AU-related
+dynamic and hierarchical spatio-temporal relationships among AUs for AU
+occurrence recognition. Specifically, we first propose a novel multi-scale
+temporal differencing network with an adaptive weighting block to explicitly
+capture facial dynamics across frames at different spatial scales, which
+specifically considers the heterogeneity of range and magnitude in different
+AUs' activation. Then, a two-stage strategy is introduced to hierarchically
+model the relationship among AUs based on their spatial distribution (i.e.,
+local and cross-region AU relationship modelling). Experimental results
+achieved on BP4D and DISFA show that our approach is the new state-of-the-art
+in the field of AU occurrence recognition. Our code is publicly available at
+https://github.com/CVI-SZU/MDHR.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ QueSTMaps: Queryable Semantic Topological Maps for 3D Scene + Understanding + + +
+ Understanding the structural organisation of 3D indoor scenes in terms of +rooms is often accomplished via floorplan extraction. Robotic tasks such as +planning and navigation require a semantic understanding of the scene as well. +This is typically achieved via object-level semantic segmentation. However, +such methods struggle to segment out topological regions like "kitchen" in the +scene. In this work, we introduce a two-step pipeline. First, we extract a +topological map, i.e., floorplan of the indoor scene using a novel +multi-channel occupancy representation. Then, we generate CLIP-aligned features +and semantic labels for every room instance based on the objects it contains +using a self-attention transformer. Our language-topology alignment supports +natural language querying, e.g., a "place to cook" locates the "kitchen". We +outperform the current state-of-the-art on room segmentation by ~20% and room +classification by ~12%. Our detailed qualitative analysis and ablation studies +provide insights into the problem of joint structural and semantic 3D scene +understanding. + +
+
+
+
+
+ + ☆ Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks + + +
+ With climate change expected to exacerbate fire weather conditions, the +accurate anticipation of wildfires on a global scale becomes increasingly +crucial for disaster mitigation. In this study, we utilize SeasFire, a +comprehensive global wildfire dataset with climate, vegetation, oceanic +indices, and human-related variables, to enable seasonal wildfire forecasting +with machine learning. For the predictive analysis, we train deep learning +models with different architectures that capture the spatio-temporal context +leading to wildfires. Our investigation focuses on assessing the effectiveness +of these models in predicting the presence of burned areas at varying +forecasting time horizons globally, extending up to six months into the future, +and on how different spatial or/and temporal context affects the performance of +the models. Our findings demonstrate the great potential of deep learning +models in seasonal fire forecasting; longer input time-series leads to more +robust predictions across varying forecasting horizons, while integrating +spatial information to capture wildfire spatio-temporal dynamics boosts +performance. Finally, our results hint that in order to enhance performance at +longer forecasting horizons, a larger receptive field spatially needs to be +considered. + +
+
+
+
+
+ + ☆ pfl-research: simulation framework for accelerating research in Private + Federated Learning + + +
+ Federated learning (FL) is an emerging machine learning (ML) training +paradigm where clients own their data and collaborate to train a global model, +without revealing any data to the server and other participants. Researchers +commonly perform experiments in a simulation environment to quickly iterate on +ideas. However, existing open-source tools do not offer the efficiency required +to simulate FL on larger and more realistic FL datasets. We introduce +pfl-research, a fast, modular, and easy-to-use Python framework for simulating +FL. It supports TensorFlow, PyTorch, and non-neural network models, and is +tightly integrated with state-of-the-art privacy algorithms. We study the speed +of open-source FL frameworks and show that pfl-research is 7-72$\times$ faster +than alternative open-source frameworks on common cross-device setups. Such +speedup will significantly boost the productivity of the FL research community +and enable testing hypotheses on realistic FL datasets that were previously too +resource intensive. We release a suite of benchmarks that evaluates an +algorithm's overall performance on a diverse set of realistic scenarios. The +code is available on GitHub at https://github.com/apple/pfl-research. + +
+
+
+
+
+ + ☆ Magic-Boost: Boost 3D Generation with Multi-View Conditioned Diffusion + + +
+ Benefiting from the rapid development of 2D diffusion models, 3D content +creation has made significant progress recently. One promising solution +involves the fine-tuning of pre-trained 2D diffusion models to harness their +capacity for producing multi-view images, which are then lifted into accurate +3D models via methods like fast-NeRFs or large reconstruction models. However, +as inconsistencies remain and the generated resolution is limited, the results +of such methods still lack intricate textures and complex geometries. +To solve this problem, we propose Magic-Boost, a multi-view conditioned +diffusion model that significantly refines coarse generative results through a +brief period of SDS optimization ($\sim15$min). Compared to previous text- +or single-image-based diffusion models, Magic-Boost exhibits a robust +capability to generate images with high consistency from pseudo-synthesized +multi-view images. It provides precise SDS guidance that aligns well with the +identity of the input images, enriching the local detail in both geometry and +texture of the initial generative results. Extensive experiments show that +Magic-Boost greatly enhances the coarse inputs and generates high-quality 3D +assets with rich geometric and textural details. (Project Page: +https://magic-research.github.io/magic-boost/) +
+
+
+
+
+ + ☆ ZeST: Zero-Shot Material Transfer from a Single Image + + +
+ We propose ZeST, a method for zero-shot material transfer to an object in the +input image given a material exemplar image. ZeST leverages existing diffusion +adapters to extract an implicit material representation from the exemplar image. +This representation is used to transfer the material to the object in the input +image with a pre-trained inpainting diffusion model, using depth estimates as a +geometry cue and grayscale object shading as an illumination cue. +The method works on real images without any training, resulting in a zero-shot +approach. Both qualitative and quantitative results on real and synthetic +datasets demonstrate that ZeST outputs photorealistic images with transferred +materials. We also show the application of ZeST to perform multiple edits and +robust material assignment under different illuminations. Project Page: +https://ttchengab.github.io/zest +
+
+ comment: Project Page: https://ttchengab.github.io/zest +
+
+
+
+
+ + ☆ Emergent Dynamics in Neural Cellular Automata + + +
+ Neural Cellular Automata (NCA) models are trainable variations of traditional +Cellular Automata (CA). Emergent motion in the patterns created by NCA has been +successfully applied to synthesize dynamic textures. However, the conditions +required for an NCA to display dynamic patterns remain unexplored. Here, we +investigate the relationship between the NCA architecture and the emergent +dynamics of the trained models. Specifically, we vary the number of channels in +the cell state and the number of hidden neurons in the MultiLayer Perceptron +(MLP), and draw a relationship between the combination of these two variables +and the motion strength between successive frames. Our analysis reveals that +the disparity and proportionality between these two variables have a strong +correlation with the emergent dynamics in the NCA output. We thus propose a +design principle for creating dynamic NCA. + +
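+ As a concrete reading of the "motion strength between successive frames" mentioned above, a simple
+ proxy metric is the mean absolute change between consecutive frames of an NCA rollout. This is an
+ illustrative assumption; the paper's exact measure may differ.
+
+   import torch
+
+   def motion_strength(frames: torch.Tensor) -> float:
+       # frames: (T, C, H, W) rollout of a trained NCA.
+       diffs = frames[1:] - frames[:-1]      # change between successive frames
+       return float(diffs.abs().mean())
+
+   # e.g. motion_strength(torch.rand(64, 3, 128, 128))
+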
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ Raster Forge: Interactive Raster Manipulation Library and GUI for Python + + +
+ Raster Forge is a Python library and graphical user interface for raster data +manipulation and analysis. The tool is focused on remote sensing applications, +particularly in wildfire management. It allows users to import, visualize, and +process raster layers for tasks such as image compositing or topographical +analysis. For wildfire management, it generates fuel maps using predefined +models. Its impact extends from disaster management to hydrological modeling, +agriculture, and environmental monitoring. Raster Forge can be a valuable asset +for geoscientists and researchers who rely on raster data analysis, enhancing +geospatial data processing and visualization across various disciplines. + +
+
+
+
+
+ + ☆ VISION2UI: A Real-World Dataset with Layout for Code Generation from UI + Designs + + +
+ Automatically generating UI code from webpage design visions can +significantly alleviate the burden of developers, enabling beginner developers +or designers to directly generate Web pages from design diagrams. Prior +research has accomplished the objective of generating UI code from +rudimentary design visions or sketches through designing deep neural networks. +Inspired by the groundbreaking advancements achieved by Multimodal Large +Language Models (MLLMs), the automatic generation of UI code from high-fidelity +design images is now emerging as a viable possibility. Nevertheless, our +investigation reveals that existing MLLMs are hampered by the scarcity of +authentic, high-quality, and large-scale datasets, leading to unsatisfactory +performance in automated UI code generation. To mitigate this gap, we present a +novel dataset, termed VISION2UI, extracted from real-world scenarios, augmented +with comprehensive layout information, tailored specifically for finetuning +MLLMs in UI code generation. Specifically, this dataset is derived through a +series of operations, encompassing collecting, cleaning, and filtering of the +open-source Common Crawl dataset. In order to uphold its quality, a neural +scorer trained on labeled samples is utilized to refine the data, retaining +higher-quality instances. Ultimately, this process yields a dataset comprising +2,000 parallel samples of design visions and UI code, with many more to be +released soon. The dataset is available at +https://huggingface.co/datasets/xcodemind/vision2ui. +
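+ Since the dataset is hosted on the Hugging Face Hub at the URL above, a minimal way to inspect it is
+ via the `datasets` library. The split name and column names are assumptions; check the printed schema
+ rather than relying on them.
+
+   from datasets import load_dataset
+
+   # Pull the dataset from the Hub and inspect its schema before use.
+   ds = load_dataset("xcodemind/vision2ui", split="train")
+   print(ds)              # number of rows and column names
+   print(ds[0].keys())    # e.g. image / code / layout fields, depending on the release
+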
+
+
+
+
+ + ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
+
+
+
+
+ + ☆ Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot + Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) and CLIP are remarkable vision foundation +models (VFMs). SAM, a prompt-driven segmentation model, excels in segmentation +tasks across diverse domains, while CLIP is renowned for its zero-shot +recognition capabilities. However, their unified potential has not yet been +explored in medical image segmentation. To adapt SAM to medical imaging, +existing methods primarily rely on tuning strategies that require extensive +data or prior prompts tailored to the specific task, making it particularly +challenging when only a limited number of data samples are available. This work +presents an in-depth exploration of integrating SAM and CLIP into a unified +framework for medical image segmentation. Specifically, we propose a simple +unified framework, SaLIP, for organ segmentation. Initially, SAM is used for +part-based segmentation within the image, followed by CLIP to retrieve the mask +corresponding to the region of interest (ROI) from the pool of SAM-generated +masks. Finally, SAM is prompted by the retrieved ROI to segment a specific +organ. Thus, SaLIP is training- and fine-tuning-free and does not rely on domain +expertise or labeled data for prompt engineering. Our method shows substantial +enhancements in zero-shot segmentation, showcasing notable improvements in DICE +scores across diverse segmentation tasks like brain (63.46%), lung (50.11%), +and fetal head (30.82%), when compared to unprompted SAM. Code and text +prompts will be available online. +
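+ A hedged sketch of the three-stage cascade described above (SAM part masks, CLIP mask retrieval,
+ box-prompted SAM). The three callables passed in are hypothetical stand-ins for a SAM automatic mask
+ generator, a CLIP image-text scorer, and a box-prompted SAM call; they are not APIs from the paper.
+
+   import numpy as np
+
+   def salip_segment(image: np.ndarray, organ_prompt: str,
+                     sam_all_masks, clip_rank, sam_with_box) -> np.ndarray:
+       # 1) part-based segmentation: SAM proposes masks without any prompt
+       masks = sam_all_masks(image)                   # list of binary HxW arrays
+       # 2) CLIP retrieves the mask whose crop best matches the text prompt
+       crops = [image * m[..., None] for m in masks]
+       scores = clip_rank(crops, organ_prompt)        # one similarity score per crop
+       roi = masks[int(np.argmax(scores))]
+       # 3) SAM is re-prompted with the ROI's bounding box for the final organ mask
+       ys, xs = np.nonzero(roi)
+       box = (xs.min(), ys.min(), xs.max(), ys.max())
+       return sam_with_box(image, box)
+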
+
+
+
+
+ + ☆ High Noise Scheduling is a Must + + +
+ Consistency models possess strong capabilities for image generation, reducing +sampling to a single step through their advanced techniques. Recent +advancements push consistency training techniques one step further and +eliminate the limitation of distillation training. Even though the curriculum +and noise scheduling proposed in improved training techniques yield better +results than basic consistency models, they lack a well-balanced noise +distribution and consistency with the curriculum. In this study, we +investigate the balance between high and low noise levels in the noise +distribution and propose a polynomial noise distribution to maintain +stability. The proposed polynomial noise distribution is further supported with +predefined Karras noises to prevent the unique noise levels that arise with the +Karras noise generation algorithm. Furthermore, eliminating already-learned +noisy steps with a curriculum based on a sinusoidal function increases the +denoising performance of the model. To make a fair comparison with the latest +released consistency model training techniques, experiments are conducted with +the same hyper-parameters except for the curriculum and noise distribution. The +models used in the experiments have low depth to demonstrate the robustness of +our proposed technique. The results show that the polynomial noise distribution +outperforms the model trained with a log-normal noise distribution, yielding a +33.54 FID score after 100,000 training steps with constant discretization +steps. Additionally, a sinusoidal-based curriculum further enhances denoising +performance, resulting in an FID score of 30.48. +
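+ For context, the standard Karras noise levels referenced above can be precomputed as below, with a
+ polynomial weighting over levels as an illustrative stand-in for the proposed distribution; the
+ paper's exact polynomial and curriculum are not specified in the abstract.
+
+   import numpy as np
+
+   def karras_sigmas(n, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+       # Standard Karras et al. noise levels.
+       ramp = np.linspace(0, 1, n)
+       return (sigma_max ** (1 / rho)
+               + ramp * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
+
+   def polynomial_level_probs(n, degree=2.0):
+       # Illustrative polynomial weighting over the n discrete noise levels.
+       w = (np.arange(1, n + 1) / n) ** degree
+       return w / w.sum()
+
+   sigmas = karras_sigmas(40)
+   probs = polynomial_level_probs(40)
+   level = np.random.default_rng(0).choice(40, p=probs)   # sample a training noise level
+   print(sigmas[level])
+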
+
+
+
+
+ + ☆ DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View + Segmentation with Occlusion Reasoning + + +
+ Semantic segmentation is an effective way to perform scene understanding. +Recently, segmentation in 3D Bird's Eye View (BEV) space has become popular as +it is directly used by the driving policy. However, there is limited work on BEV +segmentation for surround-view fisheye cameras, commonly used in commercial +vehicles. As this task has no real-world public dataset and existing synthetic +datasets do not handle amodal regions due to occlusion, we create a synthetic +dataset using the Cognata simulator comprising diverse road types, weather, and +lighting conditions. We generalize the BEV segmentation to work with any camera +model; this is useful for mixing diverse cameras. We implement a baseline by +applying cylindrical rectification on the fisheye images and using a standard +LSS-based BEV segmentation model. We demonstrate that we can achieve better +performance without undistortion, which has the adverse effects of increased +runtime due to pre-processing, reduced field-of-view, and resampling artifacts. +Further, we introduce a distortion-aware learnable BEV pooling strategy that is +more effective for the fisheye cameras. We extend the model with an occlusion +reasoning module, which is critical for estimation in BEV space. Qualitative +performance of DaF-BEVSeg is showcased in the video at +https://streamable.com/ge4v51. +
+
+
+
+
+ + ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous +driving systems. The recent mainstream methods follow a static paradigm, which +predicts the future trajectory by using a fixed duration of historical frames. +These methods make the predictions independently even at adjacent time steps, +which leads to potential instability and temporal inconsistency. As successive +time steps have largely overlapping historical frames, their forecasts should +have an intrinsic correlation: for example, overlapping predicted trajectories should +be consistent, or be different but share the same motion goal depending on the +road situation. Motivated by this, in this work, we introduce HPNet, a novel +dynamic trajectory forecasting method. Aiming for stable and accurate +trajectory forecasting, our method leverages not only historical frames +including maps and agent states, but also historical predictions. Specifically, +we newly design a Historical Prediction Attention module to automatically +encode the dynamic relationship between successive predictions. Besides, it +also extends the attention range beyond the currently visible window, +benefitting from the use of historical predictions. The proposed Historical +Prediction Attention together with the Agent Attention and Mode Attention is +further formulated as the Triple Factorized Attention module, serving as the +core design of HPNet. Experiments on the Argoverse and INTERACTION datasets show +that HPNet achieves state-of-the-art performance, and generates accurate and +stable future trajectories. Our code is available at +https://github.com/XiaolongTang23/HPNet. +
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ☆ Rolling Shutter Correction with Intermediate Distortion Flow Estimation CVPR2024 + + +
+ This paper proposes to correct the rolling shutter (RS) distorted images by +estimating the distortion flow from the global shutter (GS) to RS directly. +Existing methods usually perform correction using the undistortion flow from +the RS to GS. They initially predict the flow from consecutive RS frames, +subsequently rescaling it as the displacement fields from the RS frame to the +underlying GS image using time-dependent scaling factors. Following this, +RS-aware forward warping is employed to convert the RS image into its GS +counterpart. Nevertheless, this strategy is prone to two shortcomings. First, +the undistortion flow estimation is rendered inaccurate by merely linear +scaling the flow, due to the complex non-linear motion nature. Second, RS-aware +forward warping often results in unavoidable artifacts. To address these +limitations, we introduce a new framework that directly estimates the +distortion flow and rectifies the RS image with the backward warping operation. +More specifically, we first propose a global correlation-based flow attention +mechanism to estimate the initial distortion flow and GS feature jointly, which +are then refined by the following coarse-to-fine decoder layers. Additionally, +a multi-distortion flow prediction strategy is integrated to mitigate the issue +of inaccurate flow estimation further. Experimental results validate the +effectiveness of the proposed method, which outperforms state-of-the-art +approaches on various benchmarks while maintaining high efficiency. The project +is available at \url{https://github.com/ljzycmd/DFRSC}. + +
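+ The core operation the abstract relies on, backward warping an RS image with an estimated GS-to-RS
+ distortion flow, can be sketched with `grid_sample` as below. The tensor shapes and the flow
+ convention (per-pixel offsets in pixels) are assumptions; the paper's pipeline also predicts and
+ refines the flow itself, which is not reproduced here.
+
+   import torch
+   import torch.nn.functional as F
+
+   def backward_warp(rs_image: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
+       # rs_image: (B, C, H, W); flow: (B, 2, H, W) GS->RS offsets in pixels.
+       b, _, h, w = rs_image.shape
+       ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+       base = torch.stack((xs, ys), dim=0).float().to(rs_image)   # (2, H, W) pixel grid
+       coords = base.unsqueeze(0) + flow                          # sampling positions
+       # normalize to [-1, 1]; grid_sample expects the grid as (B, H, W, 2) in (x, y) order
+       gx = 2 * coords[:, 0] / (w - 1) - 1
+       gy = 2 * coords[:, 1] / (h - 1) - 1
+       grid = torch.stack((gx, gy), dim=-1)
+       return F.grid_sample(rs_image, grid, align_corners=True)
+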
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Matching 2D Images in 3D: Metric Relative Pose from Metric + Correspondences + + +
+ Given two images, we can estimate the relative camera pose between them by +establishing image-to-image correspondences. Usually, correspondences are +2D-to-2D and the pose we estimate is defined only up to scale. Some +applications, aiming at instant augmented reality anywhere, require +scale-metric pose estimates, and hence, they rely on external depth estimators +to recover the scale. We present MicKey, a keypoint matching pipeline that is +able to predict metric correspondences in 3D camera space. By learning to match +3D coordinates across images, we are able to infer the metric relative pose +without depth measurements. Depth measurements are also not required for +training, nor are scene reconstructions or image overlap information. MicKey is +supervised only by pairs of images and their relative poses. MicKey achieves +state-of-the-art performance on the Map-Free Relocalisation benchmark while +requiring less supervision than competing approaches. + +
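+ Once metric 3D-3D correspondences are available, a relative pose with true scale can be recovered
+ with the classical Kabsch/Procrustes solver sketched below. This is a generic reference
+ implementation, not MicKey's own (differentiable) pose solver.
+
+   import numpy as np
+
+   def kabsch_pose(pts_a: np.ndarray, pts_b: np.ndarray):
+       # pts_a, pts_b: (N, 3) matched metric 3D points in each camera's space.
+       ca, cb = pts_a.mean(0), pts_b.mean(0)
+       H = (pts_a - ca).T @ (pts_b - cb)            # 3x3 cross-covariance
+       U, _, Vt = np.linalg.svd(H)
+       d = np.sign(np.linalg.det(Vt.T @ U.T))       # guard against reflections
+       R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+       t = cb - R @ ca                              # metric translation
+       return R, t
+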
+
+
+
+
+ + ☆ Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large + Multi-Modal Models CVPR + + +
+ Audio-visual zero-shot learning methods commonly build on features extracted +from pre-trained models, e.g. video or audio classification models. However, +existing benchmarks predate the popularization of large multi-modal models, +such as CLIP and CLAP. In this work, we explore such large pre-trained models +to obtain features, i.e. CLIP for visual features, and CLAP for audio features. +Furthermore, the CLIP and CLAP text encoders provide class label embeddings +which are combined to boost the performance of the system. We propose a simple +yet effective model that only relies on feed-forward neural networks, +exploiting the strong generalization capabilities of the new audio, visual and +textual features. Our framework achieves state-of-the-art performance on +VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and +data available at: https://github.com/dkurzend/ClipClap-GZSL. + +
+
+ comment: CVPRw 2024 (L3D-IVU) +
+
+
+
+
+ + ☆ Fortifying Fully Convolutional Generative Adversarial Networks for Image + Super-Resolution Using Divergence Measures + + +
+ Super-Resolution (SR) is a time-hallowed image processing problem that aims +to improve the quality of a Low-Resolution (LR) sample up to the standard of +its High-Resolution (HR) counterpart. We aim to address this by introducing +Super-Resolution Generator (SuRGe), a fully-convolutional Generative +Adversarial Network (GAN)-based architecture for SR. We show that distinct +convolutional features obtained at increasing depths of a GAN generator can be +optimally combined by a set of learnable convex weights to improve the quality +of generated SR samples. In the process, we employ the Jensen-Shannon and the +Gromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of +distributions to further aid the generator of SuRGe to better exploit the +available information in an attempt to improve SR. Moreover, we train the +discriminator of SuRGe with the Wasserstein loss with gradient penalty, to +primarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN +workflow tailor-made for super-resolution, offers improved performance while +maintaining low inference time. The efficacy of SuRGe is substantiated by its +superior performance compared to 18 state-of-the-art contenders on 10 benchmark +datasets. + +
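+ A minimal sketch of combining generator features from several depths with learnable convex weights,
+ the idea highlighted above; the module name, the shared feature shape, and the softmax
+ parameterization are illustrative assumptions, not SuRGe's exact design.
+
+   import torch
+   import torch.nn as nn
+
+   class ConvexFeatureMix(nn.Module):
+       """Mix same-shaped feature maps from several depths with learnable convex weights."""
+
+       def __init__(self, num_branches: int):
+           super().__init__()
+           self.logits = nn.Parameter(torch.zeros(num_branches))
+
+       def forward(self, feats):                     # feats: list of (B, C, H, W) tensors
+           w = torch.softmax(self.logits, dim=0)     # positive weights summing to 1
+           return sum(wi * f for wi, f in zip(w, feats))
+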
+
+
+
+
+ + ☆ Counterfactual Reasoning for Multi-Label Image Classification via + Patching-Based Training + + +
+ The key to multi-label image classification (MLC) is to improve model +performance by leveraging label correlations. Unfortunately, it has been shown +that overemphasizing co-occurrence relationships can cause the overfitting +issue of the model, ultimately leading to performance degradation. In this +paper, we provide a causal inference framework to show that the correlative +features caused by the target object and its co-occurring objects can be +regarded as a mediator, which has both positive and negative impacts on model +predictions. On the positive side, the mediator enhances the recognition +performance of the model by capturing co-occurrence relationships; on the +negative side, it has the harmful causal effect that causes the model to make +an incorrect prediction for the target object, even when only co-occurring +objects are present in an image. To address this problem, we propose a +counterfactual reasoning method to measure the total direct effect, achieved by +enhancing the direct effect caused only by the target object. Due to the +unknown location of the target object, we propose patching-based training and +inference to accomplish this goal, which divides an image into multiple patches +and identifies the pivot patch that contains the target object. Experimental +results on multiple benchmark datasets with diverse configurations validate +that the proposed method can achieve state-of-the-art performance. + +
+
+
+
+
+ + ☆ NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural + Cellular Automata + + +
+ Neural Cellular Automata (NCA) is a class of Cellular Automata where the +update rule is parameterized by a neural network that can be trained using +gradient descent. In this paper, we focus on NCA models used for texture +synthesis, where the update rule is inspired by partial differential equations +(PDEs) describing reaction-diffusion systems. To train the NCA model, the +spatio-temporal domain is discretized, and Euler integration is used to +numerically simulate the PDE. However, whether a trained NCA truly learns the +continuous dynamics described by the corresponding PDE or merely overfits the +discretization used in training remains an open question. We study NCA models +at the limit where space-time discretization approaches continuity. We find +that existing NCA models tend to overfit the training discretization, +especially in the proximity of the initial condition, also called "seed". To +address this, we propose a solution that utilizes uniform noise as the initial +condition. We demonstrate the effectiveness of our approach in preserving the +consistency of NCA dynamics across a wide range of spatio-temporal +granularities. Our improved NCA model enables two new test-time interactions by +allowing continuous control over the speed of pattern formation and the scale +of the synthesized patterns. We demonstrate this new NCA feature in our +interactive online demo. Our work reveals that NCA models can learn continuous +dynamics and opens new avenues for NCA research from a dynamical systems' +perspective. +
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ☆ Learning Embeddings with Centroid Triplet Loss for Object Identification + in Robotic Grasping + + +
+ Foundation models are a strong trend in deep learning and computer vision. +These models serve as a base for applications as they require minor or no +further fine-tuning by developers to integrate into their applications. +Foundation models for zero-shot object segmentation such as Segment Anything +(SAM) output segmentation masks from images without any further object +information. When they are followed in a pipeline by an object identification +model, they can perform object detection without training. Here, we focus on +training such an object identification model. A crucial practical aspect for an +object identification model is to be flexible in input size. As object +identification is an image retrieval problem, a suitable method should handle +multi-query multi-gallery situations without constraining the number of input +images (e.g. by having fixed-size aggregation layers). The key solution to +train such a model is the centroid triplet loss (CTL), which aggregates image +features to their centroids. CTL yields high accuracy, avoids misleading +training signals and keeps the model input size flexible. In our experiments, +we establish a new state of the art on the ArmBench object identification task, +which shows general applicability of our model. We furthermore demonstrate an +integrated unseen object detection pipeline on the challenging HOPE dataset, +which requires fine-grained detection. There, our pipeline matches and +surpasses related methods which have been trained on dataset-specific data. + +
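+ A hedged sketch of a centroid triplet loss of the kind named above: embeddings are pulled toward
+ their class centroid and pushed away from the nearest other-class centroid. The published CTL
+ formulation may differ in its exact mining and margin details.
+
+   import torch
+   import torch.nn.functional as F
+
+   def centroid_triplet_loss(emb: torch.Tensor, labels: torch.Tensor, margin: float = 0.3):
+       # emb: (N, D) embeddings; labels: (N,) integer object identities.
+       classes = labels.unique()
+       centroids = torch.stack([emb[labels == c].mean(0) for c in classes])   # (K, D)
+       d = torch.cdist(emb, centroids)                                        # (N, K)
+       pos = (labels.unsqueeze(1) == classes.unsqueeze(0)).float()
+       d_pos = (d * pos).sum(1)                        # distance to own centroid
+       d_neg = (d + pos * 1e6).min(1).values           # nearest other-class centroid
+       return F.relu(d_pos - d_neg + margin).mean()
+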
+
+
+
+
+ + ☆ Robust Confidence Intervals in Stereo Matching using Possibility Theory + + +
+ We propose a method for estimating disparity confidence intervals in stereo +matching problems. Confidence intervals provide complementary information to +usual confidence measures. To the best of our knowledge, this is the first +method creating disparity confidence intervals based on the cost volume. This +method relies on possibility distributions to interpret the epistemic +uncertainty of the cost volume. Our method has the benefit of having a +white-box nature, differing in this respect from current state-of-the-art deep +neural networks approaches. The accuracy and size of confidence intervals are +validated using the Middlebury stereo datasets as well as a dataset of +satellite images. This contribution is freely available on GitHub. + +
+
+
+
+
+ + ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting +method for dynamic view synthesis. Existing neural radiance fields (NeRF) based +solutions learn the deformation in an implicit manner, which cannot incorporate +3D scene geometry. Therefore, the learned deformation is not necessarily +geometrically coherent, which results in unsatisfactory dynamic view synthesis +and 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new +representation of the 3D scene, building upon which the 3D geometry could be +exploited in learning the complex 3D deformation. Specifically, the scenes are +represented as a collection of 3D Gaussians, where each 3D Gaussian is optimized +to move and rotate over time to model the deformation. To enforce the 3D scene +geometry constraint during deformation, we explicitly extract 3D geometry +features and integrate them in learning the 3D deformation. In this way, our +solution achieves 3D geometry-aware deformation modeling, which enables +improved dynamic view synthesis and 3D dynamic reconstruction. Extensive +experimental results on both synthetic and real datasets prove the superiority +of our solution, which achieves new state-of-the-art performance. + The project is available at https://npucvr.github.io/GaGS/ +
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ☆ Spatial-Temporal Multi-level Association for Video Object Segmentation + + +
+ Existing semi-supervised video object segmentation methods either focus on +temporal feature matching or spatial-temporal feature modeling. However, they +do not address the issues of sufficient target interaction and efficient +parallel processing simultaneously, thereby constraining the learning of +dynamic, target-aware features. To tackle these limitations, this paper +proposes a spatial-temporal multi-level association framework, which jointly +associates reference frame, test frame, and object features to achieve +sufficient interaction and parallel target ID association with a +spatial-temporal memory bank for efficient video object segmentation. +Specifically, we construct a spatial-temporal multi-level feature association +module to learn better target-aware features, which formulates feature +extraction and interaction as the efficient operations of object +self-attention, reference object enhancement, and test reference correlation. +In addition, we propose a spatial-temporal memory to assist feature association +and temporal ID assignment and correlation. We evaluate the proposed method by +conducting extensive experiments on numerous video object segmentation +datasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS +2018/2019 val. The favorable performance against the state-of-the-art methods +demonstrates the effectiveness of our approach. All source code and trained +models will be made publicly available. + +
+
+
+
+
+ + ☆ Playing to Vision Foundation Model's Strengths in Stereo Matching + + +
+ Stereo matching has become a key technique for 3D environment perception in +intelligent vehicles. For a considerable time, convolutional neural networks +(CNNs) have remained the mainstream choice for feature extraction in this +domain. Nonetheless, there is a growing consensus that the existing paradigm +should evolve towards vision foundation models (VFM), particularly those +developed based on vision Transformers (ViTs) and pre-trained through +self-supervision on extensive, unlabeled datasets. While VFMs are adept at +extracting informative, general-purpose visual features, specifically for dense +prediction tasks, their performance often falls short in geometric vision tasks. This +study serves as the first exploration of a viable approach for adapting VFMs to +stereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon +three types of modules: spatial differentiation, patch attention fusion, and +cross-attention. The first module initializes feature pyramids, while the +latter two aggregate stereo and multi-scale contextual information into +fine-grained features, respectively. ViTAStereo, which combines ViTAS with cost +volume-based stereo matching back-end processes, achieves the top rank on the +KITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by +approximately 7.9% in terms of the percentage of error pixels, with a tolerance +of 3 pixels. Additional experiments across diverse scenarios further +demonstrate its superior generalizability compared to all other +state-of-the-art approaches. We believe this new paradigm will pave the way for +the next generation of stereo matching networks. +
+
+
+
+
+ + ☆ Robust feature knowledge distillation for enhanced performance of + lightweight crack segmentation models + + +
+ Vision-based crack detection faces deployment challenges due to the size of +robust models and edge device limitations. These can be addressed with +lightweight models trained with knowledge distillation (KD). However, +state-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper +develops Robust Feature Knowledge Distillation (RFKD), a framework to improve +robustness while retaining the precision of light models for crack +segmentation. RFKD distils knowledge from a teacher model's logit layers and +intermediate feature maps while leveraging mixed clean and noisy images to +transfer robust patterns to the student model, improving its precision, +generalisation, and anti-noise performance. To validate the proposed RFKD, a +lightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M +parameters, is also designed and used as the student to run the framework. The +results show a significant enhancement in noisy images, with RFKD reaching a +62% enhanced mean Dice score (mDS) compared to SOTA KD methods. + +
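+ For orientation, a generic logit-plus-feature distillation objective of the kind RFKD builds on is
+ sketched below; the paper's actual framework adds mixed clean/noisy inputs and its own weighting
+ scheme, which are not reproduced here.
+
+   import torch
+   import torch.nn.functional as F
+
+   def distillation_loss(student_logits, teacher_logits,
+                         student_feats, teacher_feats, T: float = 4.0, alpha: float = 0.5):
+       # Soft-label KD on the logits plus an MSE term on matched intermediate feature maps.
+       kd = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                     F.softmax(teacher_logits / T, dim=1),
+                     reduction="batchmean") * T * T
+       feat = sum(F.mse_loss(s, t) for s, t in zip(student_feats, teacher_feats))
+       return alpha * kd + (1 - alpha) * feat
+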
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ Label-Efficient 3D Object Detection For Road-Side Units + + +
+ Occlusion presents a significant challenge for safety-critical applications +such as autonomous driving. Collaborative perception has recently attracted a +large research interest thanks to the ability to enhance the perception of +autonomous vehicles via deep information fusion with intelligent roadside units +(RSU), thus minimizing the impact of occlusion. While significant advancement +has been made, the data-hungry nature of these methods creates a major hurdle +for their real-world deployment, particularly due to the need for annotated RSU +data. Manually annotating the vast amount of RSU data required for training is +prohibitively expensive, given the sheer number of intersections and the effort +involved in annotating point clouds. We address this challenge by devising a +label-efficient object detection method for RSU based on unsupervised object +discovery. Our paper introduces two new modules: one for object discovery based +on a spatial-temporal aggregation of point clouds, and another for refinement. +Furthermore, we demonstrate that fine-tuning on a small portion of annotated +data allows our object discovery models to narrow the performance gap with, or +even surpass, fully supervised models. Extensive experiments are carried out in +simulated and real-world datasets to evaluate our method. + +
+
+ comment: IV 2024 +
+
+
+
+
+ + ☆ From Barlow Twins to Triplet Training: Differentiating Dementia with + Limited Data + + +
+ Differential diagnosis of dementia is challenging due to overlapping +symptoms, with structural magnetic resonance imaging (MRI) being the primary +method for diagnosis. Despite the clinical value of computer-aided differential +diagnosis, research has been limited, mainly due to the absence of public +datasets that contain diverse types of dementia. This leaves researchers with +small in-house datasets that are insufficient for training deep neural networks +(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI +scans in training, but small batch sizes for volumetric brain scans make its +application challenging. To address these issues, we propose Triplet Training +for differential diagnosis with limited target data. It consists of three key +stages: (i) self-supervised pre-training on unlabeled data with Barlow Twins, +(ii) self-distillation on task-related data, and (iii) fine-tuning on the +target dataset. Our approach significantly outperforms traditional training +strategies, achieving a balanced accuracy of 75.6%. We further provide insights +into the training process by visualizing changes in the latent space after each +step. Finally, we validate the robustness of Triplet Training in terms of its +individual components in a comprehensive ablation study. Our code is available +at https://github.com/ai-med/TripletTraining. + +
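+ Stage (i) of the recipe above uses Barlow Twins; its standard objective is sketched below for
+ reference (batch-normalized embeddings of two augmented views, cross-correlation pushed toward the
+ identity). The lambda value is illustrative, not the paper's setting.
+
+   import torch
+
+   def barlow_twins_loss(z1: torch.Tensor, z2: torch.Tensor, lam: float = 5e-3):
+       # z1, z2: (N, D) embeddings of two augmented views of the same batch.
+       n, d = z1.shape
+       z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
+       z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+       c = (z1.T @ z2) / n                                  # (D, D) cross-correlation
+       on_diag = (torch.diagonal(c) - 1).pow(2).sum()
+       off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
+       return on_diag + lam * off_diag
+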
+
+ comment: Accepted for presentation at MIDL 2024 +
+
+
+
+
+ + ☆ ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation + Network for Video Colorization + + +
+ How to effectively explore spatial-temporal features is important for video +colorization. Instead of stacking multiple frames along the temporal dimension +or recurrently propagating estimated features that will accumulate errors or +cannot explore information from far-apart frames, we develop a memory-based +feature propagation module that can establish reliable connections with +features from far-apart frames and alleviate the influence of inaccurately +estimated features. To extract better features from each frame for the +above-mentioned feature propagation, we explore the features from +large-pretrained visual models to guide the feature estimation of each frame so +that the estimated features can model complex scenarios. In addition, we note +that adjacent frames usually contain similar contents. To explore this property +for better spatial and temporal feature utilization, we develop a local +attention module to aggregate the features from adjacent frames in a +spatial-temporal neighborhood. We formulate our memory-based feature +propagation module, large-pretrained visual model guided feature estimation +module, and local attention module into an end-to-end trainable network (named +ColorMNet) and show that it performs favorably against state-of-the-art methods +on both the benchmark datasets and real-world scenarios. The source code and +pre-trained models will be available at +\url{https://github.com/yyang181/colormnet}. + +
+
+ comment: Project website: \url{https://github.com/yyang181/colormnet} +
+
+
+
+
+ + ☆ LRR: Language-Driven Resamplable Continuous Representation against + Adversarial Tracking Attacks + + +
+ Visual object tracking plays a critical role in visual-based autonomous +systems, as it aims to estimate the position and size of the object of interest +within a live video. Despite significant progress made in this field, +state-of-the-art (SOTA) trackers often fail when faced with adversarial +perturbations in the incoming frames. This can lead to significant robustness +and security issues when these trackers are deployed in the real world. To +achieve high accuracy on both clean and adversarial data, we propose building a +spatial-temporal continuous representation using the semantic text guidance of +the object of interest. This novel continuous representation enables us to +reconstruct incoming frames to maintain semantic and appearance consistency +with the object of interest and its clean counterparts. As a result, our +proposed method successfully defends against different SOTA adversarial +tracking attacks while maintaining high accuracy on clean data. In particular, +our method significantly increases tracking accuracy under adversarial attacks +with around 90% relative improvement on UAV123, which is even higher than the +accuracy on clean data. + +
+
+
+
+
+ + ☆ GHNeRF: Learning Generalizable Human Features with Efficient Neural + Radiance Fields + + +
+ Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising +results in 3D scene representations, including 3D human representations. +However, these representations often lack information on the underlying +human pose and structure, which is crucial for AR/VR applications and games. In +this paper, we introduce a novel approach, termed GHNeRF, designed to address +these limitations by learning 2D/3D joint locations of human subjects with NeRF +representation. GHNeRF uses a pre-trained 2D encoder streamlined to extract +essential human features from 2D images, which are then incorporated into the +NeRF framework in order to encode human biomechanical features. This allows our +network to simultaneously learn biomechanical features, such as joint locations, +along with human geometry and texture. To assess the effectiveness of our +method, we conduct a comprehensive comparison with state-of-the-art human NeRF +techniques and joint estimation algorithms. Our results show that GHNeRF can +achieve state-of-the-art results in near real-time. +
+
+
+
+
+ + ☆ Anchor-based Robust Finetuning of Vision-Language Models CVPR2024 + + +
+ We aim at finetuning a vision-language model without hurting its +out-of-distribution (OOD) generalization. We address two types of OOD +generalization, i.e., i) domain shift such as natural to sketch images, and ii) +zero-shot capability to recognize categories that were not contained in the +finetune data. Arguably, the diminished OOD generalization after finetuning +stems from the excessively simplified finetuning target, which only provides +the class information, such as ``a photo of a [CLASS]''. This is distinct from +the process by which CLIP was pretrained, where there is abundant text +supervision with rich semantic information. Therefore, we propose to compensate +for the finetune process using auxiliary supervision with rich semantic +information, which acts as anchors to preserve the OOD generalization. +Specifically, two types of anchors are elaborated in our method, including i) the +text-compensated anchor, which uses the images from the finetune set but +enriches the text supervision with a pretrained captioner, and ii) the image-text-pair +anchor, which is retrieved, according to the downstream task, from a dataset similar to the +pretraining data of CLIP and is associated with the original CLIP text carrying +rich semantics. Those anchors are utilized as auxiliary semantic information to +maintain the original feature space of CLIP, thereby preserving the OOD +generalization capabilities. Comprehensive experiments demonstrate that our +method achieves in-distribution performance akin to conventional finetuning +while attaining new state-of-the-art results on domain shift and zero-shot +learning benchmarks. +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised + Action Recognition in Videos + + +
+ Human action or activity recognition in videos is a fundamental task in +computer vision with applications in surveillance and monitoring, self-driving +cars, sports analytics, human-robot interaction and many more. Traditional +supervised methods require large annotated datasets for training, which are +expensive and time-consuming to acquire. This work proposes a novel approach +using Cross-Architecture Pseudo-Labeling with contrastive learning for +semi-supervised action recognition. Our framework leverages both labeled and +unlabelled data to robustly learn action representations in videos, combining +pseudo-labeling with contrastive learning for effective learning from both +types of samples. We introduce a novel cross-architecture approach where 3D +Convolutional Neural Networks (3D CNNs) and video transformers (VIT) are +utilised to capture different aspects of action representations; hence we call +it ActNetFormer. The 3D CNNs excel at capturing spatial features and local +dependencies in the temporal domain, while VIT excels at capturing long-range +dependencies across frames. By integrating these complementary architectures +within the ActNetFormer framework, our approach can effectively capture both +local and global contextual information of an action. This comprehensive +representation learning enables the model to achieve better performance in +semi-supervised action recognition tasks by leveraging the strengths of each of +these architectures. Experimental results on standard action recognition +datasets demonstrate that our approach performs better than the existing +methods, achieving state-of-the-art performance with only a fraction of labeled +data. The official website of this work is available at: +https://github.com/rana2149/ActNetFormer. + +
+
+ comment: Submitted for peer review +
+
+
+
+
+ + ☆ Hyperparameter-Free Medical Image Synthesis for Sharing Data and + Improving Site-Specific Segmentation + + +
+ Sharing synthetic medical images is a promising alternative to sharing real +images that can improve patient privacy and data security. To get good results, +existing methods for medical image synthesis must be manually adjusted when +they are applied to unseen data. To remove this manual burden, we introduce a +Hyperparameter-Free distributed learning method for automatic medical image +Synthesis, Sharing, and Segmentation called HyFree-S3. For three diverse +segmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of +HyFree-S3 results in improved performance over training only with site-specific +data (in the majority of cases). The hyperparameter-free nature of the method +should make data synthesis and sharing easier, potentially leading to an +increase in the quantity of available data and consequently the quality of the +models trained that may ultimately be applied in the clinic. Our code is +available at https://github.com/AwesomeLemon/HyFree-S3 + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Automatic Defect Detection in Sewer Network Using Deep Learning Based + Object Detector + + +
+ Maintaining sewer systems in large cities is important, but also time- and +effort-consuming, because visual inspections are currently done manually. To +reduce the amount of aforementioned manual work, defects within sewer pipes +should be located and classified automatically. In the past, multiple works +have attempted solving this problem using classical image processing, machine +learning, or a combination of those. However, each provided solution only focuses +on detecting a limited set of defect/structure types, such as fissure, root, +and/or connection. Furthermore, due to the use of hand-crafted features and +small training datasets, generalization is also problematic. In order to +overcome these deficits, a sizable dataset with 14.7 km of various sewer pipes +was annotated by sewer maintenance experts in the scope of this work. On top +of that, an object detector (EfficientDet-D0) was trained for automatic defect +detection. From the results of several experiments, peculiar natures of defects +in the context of object detection, which greatly affect the annotation and +training process, are found and discussed. In the end, the final detector was +able to detect 83% of defects in the test set; out of the missing 17%, only +0.77% are very severe defects. This work provides an example of applying deep +learning-based object detection to an important but quiet engineering field. +It also gives some practical pointers on how to annotate peculiar "objects", +such as defects. +
+
+
+
+
+ + ☆ OmniFusion Technical Report + + +
+ Last year, multimodal architectures served up a revolution in AI-based +approaches and solutions, extending the capabilities of large language models +(LLM). We propose an \textit{OmniFusion} model based on a pretrained LLM and +adapters for visual modality. We evaluated and compared several architecture +design principles for better text and visual data coupling: MLP and transformer +adapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their +fusing approach, image encoding method (whole image or tiles encoding) and two +7B LLMs (the proprietary one and open-source Mistral). Experiments on 8 +visual-language benchmarks show the top score for the best OmniFusion setup in +terms of different VQA tasks in comparison with open-source LLaVA-like +solutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We +also propose a variety of situations, where OmniFusion provides highly-detailed +answers in different domains: housekeeping, sightseeing, culture, medicine, +handwritten and scanned equations recognition, etc. Mistral-based OmniFusion +model is an open-source solution with weights, training and inference scripts +available at https://github.com/AIRI-Institute/OmniFusion. + +
+
+ comment: 17 pages, 4 figures, 9 tables, 2 appendices +
+
+
+
+
+ + ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR) +Systems. In real-world scenarios, FRs are confronted with both physical and +digital attacks. However, existing algorithms often address only one type of +attack at a time, which poses significant limitations in real-world scenarios +where FR systems face hybrid physical-digital threats. To facilitate the +research of Unified Attack Detection (UAD) algorithms, a large-scale +UniAttackData dataset has been collected. UniAttackData is the largest public +dataset for Unified Attack Detection, with a total of 28,706 videos, where each +unique identity encompasses all advanced attack types. Based on this dataset, +we organized a Unified Physical-Digital Face Attack Detection Challenge to +boost the research in Unified Attack Detections. It attracted 136 teams for the +development phase, with 13 qualifying for the final round. The results +re-verified by the organizing team were used for the final ranking. This paper +comprehensively reviews the challenge, detailing the dataset introduction, +protocol definition, evaluation criteria, and a summary of published results. +Finally, we focus on the detailed analysis of the highest-performing algorithms +and offer potential directions for unified physical-digital attack detection +inspired by this competition. Challenge Website: +https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Leveraging edge detection and neural networks for better UAV + localization + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
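+ The preprocessing advocated above can be sketched as follows: extract an edge map (here Canny, as
+ one reasonable choice) before encoding, then compare the resulting embedding against pre-computed
+ reference embeddings by cosine similarity. The `encoder` callable and the Canny thresholds are
+ assumptions, not the paper's released components.
+
+   import cv2
+   import numpy as np
+
+   def edge_embedding(image_bgr: np.ndarray, encoder) -> np.ndarray:
+       # Edge maps are more robust to seasonal and illumination changes than raw pixels.
+       gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+       edges = cv2.Canny(gray, 100, 200)
+       return encoder(edges)                           # 1D embedding of the edge view
+
+   def best_match(query_emb: np.ndarray, reference_embs: np.ndarray) -> int:
+       # Cosine similarity against pre-computed embeddings of geo-referenced tiles.
+       q = query_emb / np.linalg.norm(query_emb)
+       r = reference_embs / np.linalg.norm(reference_embs, axis=1, keepdims=True)
+       return int(np.argmax(r @ q))
+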
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Automated National Urban Map Extraction + + +
+ Developing countries usually lack the proper governance means to generate and +regularly update a national rooftop map. Using traditional photogrammetry and +surveying methods to produce a building map at the federal level is costly and +time-consuming. Using earth observation and deep learning methods, we can +bridge this gap and propose an automated pipeline to produce such national urban +maps. This paper aims to exploit the power of fully convolutional neural +networks for multi-class buildings' instance segmentation to leverage high +object-wise accuracy results. Buildings' instance segmentation from sub-meter +high-resolution satellite images can be achieved with relatively high +pixel-wise metric scores. We detail all engineering steps to replicate this +work and ensure highly accurate results in dense and slum areas witnessed in +regions that lack proper urban planning in the Global South. We applied a case +study of the proposed pipeline to Lebanon and successfully produced the first +comprehensive national building footprint map with approximately 1 million +units at 84% accuracy. The proposed architecture relies on advanced +augmentation techniques to overcome dataset scarcity, which is often the case +in developing countries. +
+
+
+
+
+ + ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ☆ EPL: Evidential Prototype Learning for Semi-supervised Medical Image + Segmentation + + +
+ Although current semi-supervised medical segmentation methods can achieve +decent performance, they are still affected by the uncertainty in unlabeled +data and model predictions, and there is currently a lack of effective +strategies that can explore the uncertain aspects of both simultaneously. To +address the aforementioned issues, we propose Evidential Prototype Learning +(EPL), which utilizes an extended probabilistic framework to effectively fuse +voxel probability predictions from different sources and achieves prototype +fusion utilization of labeled and unlabeled data under a generalized evidential +framework, leveraging voxel-level dual uncertainty masking. The uncertainty not +only enables the model to self-correct predictions but also improves the guided +learning process with pseudo-labels and is able to feed back into the +construction of hidden features. The method proposed in this paper has been +evaluated on the LA, Pancreas-CT and TBAD datasets, achieving +state-of-the-art performance under three different labeled ratios, which strongly +demonstrates the effectiveness of our strategy. +
+
+
+
+
+ + ☆ YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images + + +
+ Detecting objects from aerial images poses significant challenges due to the +following factors: 1) Aerial images typically have very large sizes, generally +with millions or even hundreds of millions of pixels, while computational +resources are limited. 2) Small object size leads to insufficient information +for effective detection. 3) Non-uniform object distribution leads to +computational resource wastage. To address these issues, we propose YOLC (You +Only Look Clusters), an efficient and effective framework that builds on an +anchor-free object detector, CenterNet. To overcome the challenges posed by +large-scale images and non-uniform object distribution, we introduce a Local +Scale Module (LSM) that adaptively searches cluster regions for zooming in for +accurate detection. Additionally, we modify the regression loss using Gaussian +Wasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable +convolution and refinement methods are employed in the detection head to +enhance the detection of small objects. We perform extensive experiments on two +aerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the +effectiveness and superiority of our proposed approach. + +
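+ The GWD-based regression mentioned above models each axis-aligned box as a 2D Gaussian; a minimal
+ sketch of the resulting squared 2-Wasserstein distance is below. The (cx, cy, w, h) convention and
+ the omission of the paper's final loss transform (e.g. a log or sqrt scaling) are assumptions.
+
+   import torch
+
+   def gwd_axis_aligned(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor:
+       # Boxes are (..., 4) as (cx, cy, w, h), each treated as a Gaussian with
+       # mean = center and std = half width/height along the axes.
+       dc = (box1[..., :2] - box2[..., :2]).pow(2).sum(-1)      # center term
+       ds = (box1[..., 2:] - box2[..., 2:]).pow(2).sum(-1) / 4  # shape/scale term
+       return dc + ds
+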
+
+ comment: accepted to TITS +
+
+
+
+
+ + ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although the existing uncertainty-based semi-supervised medical segmentation +methods have achieved excellent performance, they usually only consider a +single uncertainty evaluation, which often fails to solve the problem related +to credibility completely. Therefore, based on the framework of evidential deep +learning, this paper integrates the evidential predictive results in the +cross-region of mixed and original samples to reallocate the confidence degree +and uncertainty measure of each voxel, which is realized by emphasizing the +uncertain information in the probability-assignment fusion rule of traditional +evidence theory. Furthermore, we design a voxel-level asymptotic learning +strategy by introducing information entropy to combine with the fused +uncertainty measure to estimate voxel predictions more precisely. The model will +gradually pay attention to the prediction results with high uncertainty in the +learning process, to learn the features that are difficult to master. The +experimental results on the LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the +superior performance of our proposed method in comparison with existing +state-of-the-art methods. +
+
+
+
+
+ + ☆ Improving Interpretable Embeddings for Ad-hoc Video Search with + Generative Captions and Multi-word Concept Bank ICMR2024 + + +
+ Aligning a user query with video clips in a cross-modal latent space, and aligning both with semantic concepts, are the two mainstream approaches for ad-hoc video search (AVS). However, the effectiveness of existing approaches is bottlenecked by the small sizes of available video-text datasets and the low quality of concept banks, which results in failures on unseen queries and the out-of-vocabulary problem. This paper addresses these two problems by constructing a new dataset and developing a multi-word concept bank. Specifically, capitalizing on a generative model, we construct a new dataset consisting of 7 million generated text and video pairs for pre-training. To tackle the out-of-vocabulary problem, we develop a multi-word concept bank based on syntax analysis to enhance the capability of a state-of-the-art interpretable AVS method in modeling relationships between query words. We also study the impact of current advanced features on the method. Experimental results show that the integration of the proposed elements doubles the R@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP on the TRECVid AVS query sets for 2016-2023 (eight years) by margins ranging from 2% to 77%, with an average of about 20%.
+
+ comment: Accepted in ICMR2024 +
+
+
+
+
+ + ☆ Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data + for Sensor Fusion Applications + + +
+ Radar and camera fusion yields robustness in perception tasks by leveraging +the strength of both sensors. The typical extracted radar point cloud is 2D +without height information due to insufficient antennas along the elevation +axis, which challenges the network performance. This work introduces a +learning-based approach to infer the height of radar points associated with 3D +objects. A novel robust regression loss is introduced to address the sparse +target challenge. In addition, a multi-task training strategy is employed, +emphasizing important features. The average radar absolute height error +decreases from 1.69 to 0.25 meters compared to the state-of-the-art height +extension method. The estimated target height values are used to preprocess and +enrich radar data for downstream perception tasks. Integrating this refined +radar information further enhances the performance of existing radar camera +fusion models for object detection and depth estimation tasks. + +
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ☆ Efficient and Robust Point Cloud Registration via Heuristics-guided + Parameter Search + + +
+ Estimating the rigid transformation with 6 degrees of freedom based on a putative 3D correspondence set is a crucial procedure in point cloud registration. Existing correspondence identification methods usually lead to large outlier ratios (>95% is common), underscoring the significance of robust registration methods. Many researchers turn to parameter search-based strategies (e.g., Branch-and-Bound) for robust registration. Although related methods show high robustness, their efficiency is limited by the high-dimensional search space. This paper proposes a heuristics-guided parameter search strategy to accelerate the search while maintaining high robustness. We first sample some correspondences (i.e., heuristics) and then only need to sequentially search the feasible regions that make each sample an inlier. Our strategy largely reduces the search space and can guarantee accuracy with only a few inlier samples, therefore enjoying an excellent trade-off between efficiency and robustness. Since directly parameterizing the 6-dimensional nonlinear feasible region for efficient search is intractable, we construct a three-stage decomposition pipeline to reparameterize the feasible region, resulting in three lower-dimensional sub-problems that are easily solvable via our strategy. Besides reducing the search dimension, our decomposition enables the leverage of 1-dimensional interval stabbing at all three stages for search acceleration. Moreover, we propose a valid sampling strategy to guarantee sampling effectiveness, and a compatibility verification setup to further accelerate the search. Extensive experiments on both simulated and real-world datasets demonstrate that our approach exhibits comparable robustness with state-of-the-art methods while achieving a significant efficiency boost.
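+ For reference, the 1-D interval stabbing primitive mentioned above can be sketched in a few lines (this is the generic primitive, not the paper's full three-stage pipeline): each correspondence constrains a scalar parameter to an interval, and the parameter value contained in the most intervals is found by an endpoint sweep.
+
+    def max_stabbing(intervals):
+        # Return (best_value, count): a point contained in the largest number
+        # of closed 1-D intervals, via an O(n log n) endpoint sweep.
+        events = []
+        for lo, hi in intervals:
+            events.append((lo, 0))  # interval start (sorted before ends at ties)
+            events.append((hi, 1))  # interval end
+        events.sort()
+        best_x, best, cur = None, 0, 0
+        for x, kind in events:
+            if kind == 0:
+                cur += 1
+                if cur > best:
+                    best, best_x = cur, x
+            else:
+                cur -= 1
+        return best_x, best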
+
+ comment: 21 pages, 16 figures. Accepted to IEEE Transactions on Pattern + Analysis and Machine Intelligence, 2024 +
+
+
+
+
+ + ☆ Concise Plane Arrangements for Low-Poly Surface and Volume Modelling + + +
+ Plane arrangements are a useful tool for surface and volume modelling. +However, their main drawback is poor scalability. We introduce two key +novelties that enable the construction of plane arrangements for complex +objects and entire scenes: an ordering scheme for the plane insertion and the +direct use of input points during arrangement construction. Both ingredients +reduce the number of unwanted splits, resulting in improved scalability of the +construction mechanism by up to two orders of magnitude compared to existing +algorithms. We further introduce a remeshing and simplification technique that +allows us to extract low-polygon surface meshes and lightweight convex +decompositions of volumes from the arrangement. We show that our approach leads +to state-of-the-art results for the aforementioned tasks by comparing it to +learning-based and traditional approaches on various different datasets. Our +implementation is available at https://github.com/raphaelsulzer/compod . + +
+
+
+
+
+ + ☆ HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields + + +
+ In recent advancements in novel view synthesis, generalizable Neural Radiance Field (NeRF) methods applied to human subjects have shown remarkable results in generating novel views from few images. However, this generalization ability cannot capture the underlying structural features of the skeleton shared across all instances. Building upon this, we introduce HFNeRF: a novel generalizable human feature NeRF aimed at generating human biomechanic features using a pre-trained image encoder. While previous human NeRF methods have shown promising results in the generation of photorealistic virtual avatars, such methods lack underlying human structure or biomechanic features such as skeleton or joint information that are crucial for downstream applications including Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D pre-trained foundation models to learn human features in 3D using neural rendering, and then uses volume rendering to generate 2D feature maps. We evaluate HFNeRF on the skeleton estimation task by predicting heatmaps as features. The proposed method is fully differentiable, allowing it to learn color, geometry, and the human skeleton simultaneously. This paper presents preliminary results of HFNeRF, illustrating its potential in generating realistic virtual avatars with biomechanic features using NeRF.
+
+
+
+
+ + ☆ DiffHarmony: Latent Diffusion Model Meets Image Harmonization ICMR 2024 + + +
+ Image harmonization, which involves adjusting the foreground of a composite image to attain a unified visual consistency with the background, can be conceptualized as an image-to-image translation task. Diffusion models have recently promoted the rapid development of image-to-image translation tasks. However, training diffusion models from scratch is computationally intensive. Fine-tuning pre-trained latent diffusion models entails dealing with the reconstruction error induced by the image compression autoencoder, making it unsuitable for image generation tasks that involve pixel-level evaluation metrics. To deal with these issues, in this paper, we first adapt a pre-trained latent diffusion model to the image harmonization task to generate harmonious but potentially blurry initial images. Then we implement two strategies: utilizing higher-resolution images during inference and incorporating an additional refinement stage, to further enhance the clarity of the initially harmonized images. Extensive experiments on the iHarmony4 datasets demonstrate the superiority of our proposed method. The code and model will be made publicly available at https://github.com/nicecv/DiffHarmony .
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ☆ Mansformer: Efficient Transformer of Mixed Attention for Image + Deblurring and Beyond + + +
+ Transformer has achieved enormous success in natural language processing and high-level vision over the past few years. However, the complexity of self-attention is quadratic in the image size, which makes it infeasible for high-resolution vision tasks. In this paper, we propose Mansformer, a Transformer of mixed attention that combines multiple self-attentions, gating, and multi-layer perceptrons (MLPs), to explore and employ more possibilities of self-attention. Taking efficiency into account, we design four kinds of self-attention whose complexities are all linear. By elaborate adjustment of the tensor shapes and dimensions for the dot product, we split the typical self-attention of quadratic complexity into four operations of linear complexity. To adaptively merge these different kinds of self-attention, we take advantage of an architecture similar to Squeeze-and-Excitation Networks. Furthermore, we merge the two-stage Transformer design into one stage with the proposed gated-dconv MLP. Image deblurring is our main target, and extensive quantitative and qualitative evaluations show that this method performs favorably against state-of-the-art methods on far more than deblurring alone. The source code and trained models will be made available to the public.
+
+
+
+
+ + ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques face critical limitations, including a limited field of view and a lack of depth information, which can impede the detection of precancerous lesions. Current methods struggle to provide comprehensive and accurate 3D reconstructions of the colonic surface, which could help minimize missed regions and reinspection for pre-cancerous polyps. Addressing this, we introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting (3D GS) combined with a Recurrent Neural Network-based Simultaneous Localization and Mapping (RNNSLAM) system. By introducing geometric and depth regularization into the 3D GS framework, our approach ensures more accurate alignment of Gaussians with the colon surface, resulting in smoother 3D reconstructions with novel views of detailed textures and structures. Evaluations across three diverse datasets show that Gaussian Pancakes enhances novel view synthesis quality, surpassing current leading methods with an 18% boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster rendering and more than 10X shorter training times, making it a practical tool for real-time applications. Hence, this holds promise for achieving clinical translation for better detection and diagnosis of colorectal cancer.
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Hierarchical Insights: Exploiting Structural Similarities for Reliable + 3D Semantic Segmentation IROS 2024 + + +
+ Safety-critical applications like autonomous driving call for robust 3D environment perception algorithms which can withstand highly diverse and ambiguous surroundings. The predictive performance of any classification model strongly depends on the underlying dataset and the prior knowledge conveyed by the annotated labels. While the labels provide a basis for the learning process, they usually fail to represent inherent relations between the classes - relations which are a natural element of the human perception system. We propose a training strategy which enables a 3D LiDAR semantic segmentation model to learn structural relationships between the different classes through abstraction. We achieve this by implicitly modeling those relationships through a learning rule for hierarchical multi-label classification (HMC). With a detailed analysis we show how this training strategy not only improves the model's confidence calibration, but also preserves additional information for downstream tasks like fusion, prediction and planning.
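+ The abstract does not give the exact HMC learning rule, so the sketch below only illustrates the generic idea of hierarchical multi-label training: each sample is labeled with its leaf class and all of that class's ancestors in the hierarchy, and a binary cross-entropy is applied over every node. The toy ANCESTORS table is purely hypothetical.
+
+    import torch
+    import torch.nn.functional as F
+
+    # toy hierarchy: leaf class index -> indices of its ancestor nodes
+    ANCESTORS = {0: [4], 1: [4], 2: [5], 3: [5]}   # 6 nodes: 4 leaves + 2 parents
+
+    def hmc_targets(leaf_labels, num_nodes=6):
+        # Expand leaf labels (B,) into multi-hot targets over all hierarchy nodes.
+        t = torch.zeros(leaf_labels.size(0), num_nodes, device=leaf_labels.device)
+        for i, y in enumerate(leaf_labels.tolist()):
+            t[i, y] = 1.0
+            for a in ANCESTORS[y]:
+                t[i, a] = 1.0
+        return t
+
+    def hmc_loss(logits, leaf_labels):
+        # Binary cross-entropy over every node of the hierarchy.
+        return F.binary_cross_entropy_with_logits(logits, hmc_targets(leaf_labels))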
+
+ comment: submitted to IROS 2024 +
+
+
+
+
+ + ☆ DreamView: Injecting View-specific Text Guidance into Text-to-3D + Generation + + +
+ Text-to-3D generation, which synthesizes 3D assets according to an overall text description, has progressed significantly. However, a challenge arises when specific appearances need to be customized at designated viewpoints while only an overall description is available for generating the 3D object. For instance, ambiguity easily occurs when producing a T-shirt with distinct patterns on its front and back using a single overall text guidance. In this work, we propose DreamView, a text-to-image approach enabling multi-view customization while maintaining overall consistency by adaptively injecting the view-specific and overall text guidance through a collaborative text guidance injection module, which can also be lifted to 3D generation via score distillation sampling. DreamView is trained with large-scale rendered multi-view images and their corresponding view-specific texts to learn to balance the separate content manipulation in each view and the global consistency of the overall object, resulting in a dual achievement of customization and consistency. Consequently, DreamView empowers artists to design 3D objects creatively, fostering the creation of more innovative and diverse 3D assets. Code and model will be released at https://github.com/iSEE-Laboratory/DreamView.
+
+
+
+
+ + ☆ Revising Densification in Gaussian Splatting + + +
+ In this paper, we address the limitations of Adaptive Density Control (ADC) in 3D Gaussian Splatting (3DGS), a scene representation method achieving high-quality, photorealistic results for novel view synthesis. ADC has been introduced for automatic 3D point primitive management, controlling densification and pruning; however, it has certain limitations in its densification logic. Our main contribution is a more principled, pixel-error driven formulation for density control in 3DGS, leveraging an auxiliary, per-pixel error function as the criterion for densification. We further introduce a mechanism to control the total number of primitives generated per scene and correct a bias in the current opacity handling strategy of ADC during cloning operations. Our approach leads to consistent quality improvements across a variety of benchmark scenes, without sacrificing the method's efficiency.
+
+
+
+
+ + ☆ Hash3D: Training-free Acceleration for 3D Generation + + +
+ The evolution of 3D generative modeling has been notably propelled by the adoption of 2D diffusion models. Despite this progress, the cumbersome optimization process itself presents a critical hurdle to efficiency. In this paper, we introduce Hash3D, a universal acceleration for 3D generation without model training. Central to Hash3D is the insight that feature-map redundancy is prevalent in images rendered from camera positions and diffusion time-steps in close proximity. By effectively hashing and reusing these feature maps across neighboring timesteps and camera angles, Hash3D largely avoids redundant calculations, thus accelerating the diffusion model's inference in 3D generation tasks. We achieve this through an adaptive grid-based hashing. Surprisingly, this feature-sharing mechanism not only speeds up generation but also enhances the smoothness and view consistency of the synthesized 3D objects. Our experiments, covering 5 text-to-3D and 3 image-to-3D models, demonstrate Hash3D's versatility in speeding up optimization, enhancing efficiency by 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian splatting largely speeds up 3D model creation, reducing text-to-3D processing to about 10 minutes and image-to-3D conversion to roughly 30 seconds. The project page is at https://adamdad.github.io/hash3D/.
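+ A minimal sketch of the grid-based hashing idea (the actual Hash3D uses an adaptive grid and a more careful reuse scheme): feature maps are cached under a key obtained by quantizing the camera position and the diffusion timestep, so a query from a nearby view or timestep hits the cache instead of recomputing.
+
+    import numpy as np
+
+    class FeatureHash:
+        # Reuse a feature map computed for a nearby (camera position, timestep)
+        # key instead of recomputing it.
+        def __init__(self, cam_cell=0.1, t_cell=10):
+            self.cam_cell, self.t_cell, self.table = cam_cell, t_cell, {}
+
+        def _key(self, cam_pos, t):
+            cell = np.floor(np.asarray(cam_pos) / self.cam_cell).astype(int)
+            return (tuple(cell.tolist()), int(t // self.t_cell))
+
+        def get_or_compute(self, cam_pos, t, compute_fn):
+            k = self._key(cam_pos, t)
+            if k not in self.table:              # cache miss: run the expensive call
+                self.table[k] = compute_fn(cam_pos, t)
+            return self.table[k]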
+
+ comment: https://adamdad.github.io/hash3D/ +
+
+
+
+
+ + ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary diagnosis of metastatic cancer. This involves arranging immediate examinations of other metastatic cancer sites after EBUS surgery, eliminating the need to wait for reports, thereby shortening the waiting time by more than half and enabling patients to detect other cancers earlier, allowing for early planning and implementation of treatment plans. Unlike previous studies on cell image classification, which have abundant datasets for training, this study must be able to make effective classifications despite the limited amount of case data for lung metastatic cancer. Among small-dataset classification methods, few-shot learning (FSL) has become mainstream in recent years. Through its ability to train on small datasets and its strong generalization capabilities, FSL shows potential for lung metastatic cell image classification. This study adopts a few-shot learning approach, referencing existing models and designing a model architecture for classifying lung metastasis cell images. Batch Spectral Regularization (BSR) is incorporated as a regularization term in the loss, and the fine-tuning method of PMF is modified. In terms of test results, the addition of BSR and the modified fine-tuning method further increases the accuracy by 8.89%, to 65.60%, outperforming other FSL methods. This study confirms that FSL is superior to supervised and transfer learning in classifying metastatic cancer and demonstrates that using BSR in the loss and modifying the fine-tuning procedure can enhance the model's capabilities.
+
+
+
+
+ + ☆ LIPT: Latency-aware Image Processing Transformer + + +
+ Transformer is leading a trend in the field of image processing. Despite the great success that existing lightweight image processing transformers have achieved, they are tailored to FLOPs or parameter reduction rather than practical inference acceleration. In this paper, we present a latency-aware image processing transformer, termed LIPT. We devise the low-latency proportion LIPT block that substitutes memory-intensive operators with a combination of self-attention and convolutions to achieve practical speedup. Specifically, we propose a novel non-volatile sparse masking self-attention (NVSM-SA) that utilizes a pre-computed sparse mask to capture contextual information from a larger window with no extra computation overhead. Besides, a high-frequency reparameterization module (HRM) is proposed to make the LIPT block reparameterization friendly, which improves the model's detail reconstruction capability. Extensive experiments on multiple image processing tasks (e.g., image super-resolution (SR), JPEG artifact reduction, and image denoising) demonstrate the superiority of LIPT in both latency and PSNR. LIPT achieves real-time GPU inference with state-of-the-art performance on multiple image SR benchmarks.
+
+
+
+
+ + ☆ Unified Entropy Optimization for Open-Set Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims at adapting a model pre-trained on the labeled source domain to the unlabeled target domain. Existing methods usually focus on improving TTA performance under covariate shifts, while neglecting semantic shifts. In this paper, we delve into a realistic open-set TTA setting where the target domain may contain samples from unknown classes. Many state-of-the-art closed-set TTA methods perform poorly when applied to open-set scenarios, which can be attributed to the inaccurate estimation of data distribution and model confidence. To address these issues, we propose a simple but effective framework called unified entropy optimization (UniEnt), which is capable of simultaneously adapting to covariate-shifted in-distribution (csID) data and detecting covariate-shifted out-of-distribution (csOOD) data. Specifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test data, followed by entropy minimization on the pseudo-csID data and entropy maximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to alleviate the noise caused by the hard data partition by leveraging sample-level confidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show the superiority of our framework. The code is available at https://github.com/gaozhengqing/UniEnt
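+ A minimal sketch of the core objective as described above: entropy is minimized on pseudo-csID samples and maximized on pseudo-csOOD samples. How the pseudo-partition is mined and how UniEnt+ weights samples by confidence are not shown here.
+
+    import torch
+
+    def entropy(probs, eps=1e-8):
+        return -(probs * (probs + eps).log()).sum(dim=1)
+
+    def unient_objective(logits, is_pseudo_csid):
+        # logits: (B, C); is_pseudo_csid: (B,) boolean mask from the mining step.
+        h = entropy(logits.softmax(dim=1))
+        id_mask = is_pseudo_csid.float()
+        loss_id = (h * id_mask).sum() / id_mask.sum().clamp(min=1)                 # minimize entropy
+        loss_ood = -(h * (1 - id_mask)).sum() / (1 - id_mask).sum().clamp(min=1)   # maximize entropy
+        return loss_id + loss_ood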
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Unified Multi-modal Diagnostic Framework with Reconstruction + Pre-training and Heterogeneity-combat Tuning + + +
+ Medical multi-modal pre-training has revealed promise in computer-aided diagnosis by leveraging large-scale unlabeled datasets. However, existing methods based on masked autoencoders mainly rely on data-level reconstruction tasks, but lack high-level semantic information. Furthermore, two significant heterogeneity challenges hinder the transfer of pre-trained knowledge to downstream tasks, i.e., the distribution heterogeneity between pre-training data and downstream data, and the modality heterogeneity within downstream data. To address these challenges, we propose a Unified Medical Multi-modal Diagnostic (UMD) framework with tailored pre-training and downstream tuning strategies. Specifically, to enhance the representation abilities of vision and language encoders, we propose the Multi-level Reconstruction Pre-training (MR-Pretrain) strategy, including a feature-level and data-level reconstruction, which guides models to capture the semantic information from masked inputs of different modalities. Moreover, to tackle two kinds of heterogeneities during the downstream tuning, we present the heterogeneity-combat downstream tuning strategy, which consists of a Task-oriented Distribution Calibration (TD-Calib) and a Gradient-guided Modality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the pre-trained model regarding the distribution of downstream datasets, and GM-Coord adjusts the gradient weights according to the dynamic optimization status of different modalities. Extensive experiments on five public medical datasets demonstrate the effectiveness of our UMD framework, which remarkably outperforms existing approaches on three kinds of downstream tasks.
+
+ comment: to be published in IEEE JBHI; Code available at + https://github.com/helenypzhang/UMD +
+
+
+
+
+ + ☆ Incremental Joint Learning of Depth, Pose and Implicit Scene + Representation on Monocular Camera in Large-scale Scenes + + +
+ Dense scene reconstruction for photo-realistic view synthesis has various applications, such as VR/AR and autonomous vehicles. However, most existing methods have difficulties in large-scale scenes due to three core challenges: (a) inaccurate depth input: accurate depth input is impossible to obtain in real-world large-scale scenes; (b) inaccurate pose estimation: most existing approaches rely on accurate pre-estimated camera poses; (c) insufficient scene representation capability: a single global radiance field lacks the capacity to effectively scale to large-scale scenes. To this end, we propose an incremental joint learning framework, which can achieve accurate depth and pose estimation and large-scale scene reconstruction. A vision transformer-based network is adopted as the backbone to enhance performance in scale information estimation. For pose estimation, a feature-metric bundle adjustment (FBA) method is designed for accurate and robust camera tracking in large-scale scenes. In terms of implicit scene representation, we propose an incremental scene representation method to construct the entire large-scale scene as multiple local radiance fields to enhance the scalability of 3D scene representation. Extensive experiments demonstrate the effectiveness and accuracy of our method in depth estimation, pose estimation, and large-scale scene reconstruction.
+
+
+
+
+ + ☆ Object Dynamics Modeling with Hierarchical Point Cloud-based + Representations CVPR 2024 + + +
+ Modeling object dynamics with a neural network is an important problem with +numerous applications. Most recent work has been based on graph neural +networks. However, physics happens in 3D space, where geometric information +potentially plays an important role in modeling physical phenomena. In this +work, we propose a novel U-net architecture based on continuous point +convolution which naturally embeds information from 3D coordinates and allows +for multi-scale feature representations with established downsampling and +upsampling procedures. Bottleneck layers in the downsampled point clouds lead +to better long-range interaction modeling. Besides, the flexibility of point +convolutions allows our approach to generalize to sparsely sampled points from +mesh vertices and dynamically generate features on important interaction points +on mesh faces. Experimental results demonstrate that our approach significantly +improves the state-of-the-art, especially in scenarios that require accurate +gravity or collision reasoning. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Space-Time Video Super-resolution with Neural Operator + + +
+ This paper addresses the task of space-time video super-resolution (ST-VSR). +Existing methods generally suffer from inaccurate motion estimation and motion +compensation (MEMC) problems for large motions. Inspired by recent progress in +physics-informed neural networks, we model the challenges of MEMC in ST-VSR as +a mapping between two continuous function spaces. Specifically, our approach +transforms independent low-resolution representations in the coarse-grained +continuous function space into refined representations with enriched +spatiotemporal details in the fine-grained continuous function space. To +achieve efficient and accurate MEMC, we design a Galerkin-type attention +function to perform frame alignment and temporal interpolation. Due to the +linear complexity of the Galerkin-type attention mechanism, our model avoids +patch partitioning and offers global receptive fields, enabling precise +estimation of large motions. The experimental results show that the proposed +method surpasses state-of-the-art techniques in both fixed-size and continuous +space-time video super-resolution tasks. + +
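+ The abstract mentions a Galerkin-type attention with linear complexity; the sketch below follows the standard Galerkin-style attention recipe (softmax-free, normalized keys and values, Q(K^T V)/n), which may differ in detail from the paper's exact design.
+
+    import torch
+    import torch.nn.functional as F
+
+    def galerkin_attention(q, k, v):
+        # q, k, v: (B, n, d). Normalize K and V over the feature dimension and
+        # compute Q @ (K^T V) / n instead of softmax(Q K^T) V, which costs
+        # O(n d^2) rather than O(n^2 d) while keeping a global receptive field.
+        n = q.shape[-2]
+        k = F.layer_norm(k, k.shape[-1:])
+        v = F.layer_norm(v, v.shape[-1:])
+        return q @ (k.transpose(-2, -1) @ v) / n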
+
+
+
+
+ + ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the domain of multi-exposure image fusion. Nonetheless, prevailing approaches often involve directly feeding over-exposed and under-exposed images into the network, which leads to the under-utilization of inherent information present in the source images. Additionally, unsupervised techniques predominantly employ rudimentary weighted summation for color channel processing, culminating in an overall desaturated final image tone. To partially mitigate these issues, this study proposes a gamma correction module specifically designed to fully leverage latent information embedded within source images. Furthermore, a modified transformer block, embracing self-attention mechanisms, is introduced to optimize the fusion process. Ultimately, a novel color enhancement algorithm is presented to augment image saturation while preserving intricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.
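+ The paper's gamma correction module presumably learns or predicts its correction; the sketch below only shows the underlying pointwise operation, with purely illustrative gamma values for the two exposures.
+
+    import numpy as np
+
+    def gamma_correct(img, gamma):
+        # img in [0, 1]; gamma < 1 brightens (reveals detail in under-exposed
+        # regions), gamma > 1 darkens (recovers detail in over-exposed regions).
+        return np.clip(img, 0.0, 1.0) ** gamma
+
+    # e.g., expose latent detail in both source images before fusing them:
+    # over_adj  = gamma_correct(over_exposed, 2.2)    # darken the over-exposed input
+    # under_adj = gamma_correct(under_exposed, 0.45)  # brighten the under-exposed input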
+
+
+
+
+ + ☆ Improving Facial Landmark Detection Accuracy and Efficiency with + Knowledge Distillation ICME 2024 + + +
+ The domain of computer vision has experienced significant advancements in facial-landmark detection, which is becoming increasingly essential across various applications such as augmented reality, facial recognition, and emotion analysis. Unlike object detection or semantic segmentation, which focus on identifying objects and outlining boundaries, facial-landmark detection aims to precisely locate and track critical facial features. However, deploying deep learning-based facial-landmark detection models on embedded systems with limited computational resources poses challenges due to the complexity of facial features, especially in dynamic settings. Additionally, ensuring robustness across diverse ethnicities and expressions presents further obstacles. Existing datasets often lack comprehensive representation of facial nuances, particularly within populations like those in Taiwan. This paper introduces a novel approach to address these challenges through the development of a knowledge distillation method. By transferring knowledge from larger models to smaller ones, we aim to create lightweight yet powerful deep learning models tailored specifically for facial-landmark detection tasks. Our goal is to design models capable of accurately locating facial landmarks under varying conditions, including diverse expressions, orientations, and lighting environments. The ultimate objective is to achieve high accuracy and real-time performance suitable for deployment on embedded systems. This method was successfully implemented and achieved a 6th place finish out of 165 participants in the IEEE ICME 2024 PAIR competition.
+
+ comment: technical report. 6th/165 in IEEE ICME 2024 PAIR competition +
+
+
+
+
+ + ☆ Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs + + +
+ Morphing attacks, which aim to create a single image that contains the biometric information of multiple identities, are an emerging threat to state-of-the-art Face Recognition (FR) systems. Diffusion Morphs (DiM) are a recently proposed morphing attack that has achieved state-of-the-art performance for representation-based morphing attacks. However, none of the existing research on DiMs has leveraged the iterative nature of DiMs, instead leaving the DiM model as a black box and treating it no differently than one would a Generative Adversarial Network (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on the iterative sampling process of DiM models which searches for an optimal step guided by an identity-based heuristic function. We compare our proposed algorithm against ten other state-of-the-art morphing algorithms using the open-source SYN-MAD 2022 competition dataset. We find that our proposed algorithm is unreasonably effective, fooling all of the tested FR systems with an MMPMR of 100%, outperforming all other compared morphing algorithms.
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Band-Attention Modulated RetNet for Face Forgery Detection + + +
+ Transformer networks are extensively utilized in face forgery detection due to their scalability across large datasets. Despite their success, transformers face challenges in balancing the capture of global context, which is crucial for unveiling forgery clues, with computational complexity. To mitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a lightweight network designed to efficiently process extensive visual contexts while avoiding catastrophic forgetting. Our approach empowers the target token to perceive global information by assigning differential attention levels to tokens at varying distances. We implement self-attention along both spatial axes, thereby maintaining spatial priors and easing the computational burden. Moreover, we present the adaptive frequency Band-Attention Modulation mechanism, which treats the entire Discrete Cosine Transform spectrogram as a series of frequency bands with learnable weights. Together, these components allow BAR-Net to achieve favorable performance on several face forgery datasets, outperforming current state-of-the-art methods.
+
+
+
+
+ + ☆ Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data + + +
+ The millimeter-wave radar sensor maintains stable performance under adverse environmental conditions, making it a promising solution for all-weather perception tasks, such as outdoor mobile robotics. However, radar point clouds are relatively sparse and contain massive ghost points, which greatly limits the development of mmWave radar technology. In this paper, we propose a novel point cloud super-resolution approach for 3D mmWave radar data, named Radar-diffusion. Our approach employs a diffusion model defined by mean-reverting stochastic differential equations (SDEs). Using our proposed objective function with supervision from corresponding LiDAR point clouds, our approach efficiently handles radar ghost points and enhances sparse mmWave radar point clouds into dense LiDAR-like point clouds. We evaluate our approach on two different datasets, and the experimental results show that our method outperforms state-of-the-art baseline methods in 3D radar super-resolution tasks. Furthermore, we demonstrate that our enhanced radar point clouds are capable of supporting downstream radar point-based registration tasks.
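+ For intuition, here is a minimal Euler-Maruyama simulation of a mean-reverting SDE of the kind the abstract refers to, dx = theta * (mu - x) dt + sigma dW: the state drifts toward a mean while being perturbed by Gaussian noise. The paper's exact drift, noise schedule, and LiDAR-supervised objective are not reproduced here.
+
+    import numpy as np
+
+    def mean_reverting_sde(x0, mu, theta=1.0, sigma=0.2, dt=0.01, steps=500, rng=None):
+        # Simulate dx = theta * (mu - x) dt + sigma dW with the Euler-Maruyama scheme.
+        rng = rng or np.random.default_rng(0)
+        x = np.array(x0, dtype=float)
+        for _ in range(steps):
+            noise = rng.standard_normal(x.shape)
+            x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * noise
+        return x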
+
+
+
+
+ + ☆ Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis + + +
+ The black-box nature of deep learning models has raised concerns about their +interpretability for successful deployment in real-world clinical applications. +To address the concerns, eXplainable Artificial Intelligence (XAI) aims to +provide clear and understandable explanations of the decision-making process. +In the medical domain, concepts such as attributes of lesions or abnormalities +serve as key evidence for deriving diagnostic results. However, existing +concept-based models mainly depend on concepts that appear independently and +require fine-grained concept annotations such as bounding boxes. A medical +image usually contains multiple concepts and the fine-grained concept +annotations are difficult to acquire. In this paper, we propose a novel +Concept-Attention Whitening (CAW) framework for interpretable skin lesion +diagnosis. CAW is comprised of a disease diagnosis branch and a concept +alignment branch. In the former branch, we train the CNN with a CAW layer +inserted to perform skin lesion diagnosis. The CAW layer decorrelates features +and aligns image features to conceptual meanings via an orthogonal matrix. In +the latter branch, we calculate the orthogonal matrix under the guidance of the +concept attention mask. We particularly introduce a weakly-supervised concept +mask generator that only leverages coarse concept labels for filtering local +regions that are relevant to certain concepts, improving the optimization of +the orthogonal matrix. Extensive experiments on two public skin lesion +diagnosis datasets demonstrated that CAW not only enhanced interpretability but +also maintained a state-of-the-art diagnostic performance. + +
+
+
+
+
+ + ☆ A Lightweight Measure of Classification Difficulty from Application + Dataset Characteristics + + +
+ Despite accuracy and computation benchmarks being widely available to help choose among neural network models, these are usually trained on datasets with many classes, and do not give a precise idea of performance for applications with few (< 10) classes. The conventional procedure to predict performance is to train and test repeatedly on the different models and dataset variations of interest. However, this is computationally expensive. We propose an efficient classification difficulty measure that is calculated from the number of classes and the intra- and inter-class similarity metrics of the dataset. After a single stage of training and testing per model family, relative performance for different datasets and models of the same family can be predicted by comparing difficulty measures - without further training and testing. We show how this measure can help a practitioner select a computationally efficient model for a small dataset 6 to 29x faster than through repeated training and testing. We give an example of using the measure for an industrial application, in which options are identified to select a model 42% smaller than the baseline YOLOv5-nano model, or 85% smaller if merging from 3 to 2 classes meets requirements.
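+ The abstract does not give the exact formula, so the snippet below is only an illustrative stand-in for a similarity-based difficulty score: the ratio of intra-class dispersion to inter-class centroid separation computed on embedded features, where higher values indicate a harder dataset. The paper's actual measure also involves the number of classes and may be defined differently.
+
+    import numpy as np
+
+    def difficulty(features, labels):
+        # features: (N, D) embeddings; labels: (N,) integer class ids.
+        classes = np.unique(labels)
+        centroids = np.stack([features[labels == c].mean(axis=0) for c in classes])
+        intra = np.mean([np.linalg.norm(features[labels == c] - centroids[i], axis=1).mean()
+                         for i, c in enumerate(classes)])
+        pairwise = np.linalg.norm(centroids[:, None] - centroids[None, :], axis=-1)
+        inter = pairwise[np.triu_indices(len(classes), k=1)].mean()
+        return intra / inter   # higher ratio = more class overlap = harder dataset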
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image generation, yet they struggle with reconstructing out-of-distribution (OOD) images, such as unseen tumors in medical images, causing "image hallucination" and risking misdiagnosis. We hypothesize such hallucinations result from local OOD regions in the conditional images. We verify that partitioning the OOD region and conducting separate image generations alleviates hallucinations in several applications. From this, we propose a training-free diffusion framework that reduces hallucination with multiple Local Diffusion processes. Our approach involves OOD estimation followed by two modules: a "branching" module generates locally both within and outside OOD regions, and a "fusion" module integrates these predictions into one. Our evaluation shows our method mitigates hallucination over baseline models quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the real-world medical and natural image datasets, respectively. It also demonstrates compatibility with various pre-trained diffusion models.
+
+
+
+
+ + ☆ StoryImager: A Unified and Efficient Framework for Coherent Story + Visualization and Completion + + +
+ Story visualization aims to generate a series of realistic and coherent images based on a storyline. Current models adopt a frame-by-frame architecture by transforming a pre-trained text-to-image model into an auto-regressive generator. Although these models have shown notable progress, three flaws remain. 1) The unidirectional generation of the auto-regressive approach restricts usability in many scenarios. 2) The additionally introduced story history encoders bring an extremely high computational cost. 3) The story visualization and continuation models are trained and inferred independently, which is not user-friendly. To these ends, we propose a bidirectional, unified, and efficient framework, namely StoryImager. StoryImager enhances the storyboard generative ability inherited from the pre-trained text-to-image model for bidirectional generation. Specifically, we introduce a Target Frame Masking Strategy to extend and unify different story image generation tasks. Furthermore, we propose a Frame-Story Cross Attention Module that decomposes the cross attention for local fidelity and global coherence. Moreover, we design a Contextual Feature Extractor to extract contextual information from the whole storyline. The extensive experimental results demonstrate the excellent performance of our StoryImager. The code is available at https://github.com/tobran/StoryImager.
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ JSTR: Judgment Improves Scene Text Recognition + + +
+ In this paper, we present a method for enhancing the accuracy of scene text recognition tasks by judging whether the image and text match each other. While previous studies focused on generating recognition results from input images, our approach also considers the model's misrecognition results to understand its error tendencies, thus improving the text recognition pipeline. This method boosts text recognition accuracy by providing explicit feedback on the data that the model is likely to misrecognize, predicting whether an image-text pair is correct or incorrect. The experimental results on publicly available datasets demonstrate that our proposed method outperforms the baseline and state-of-the-art methods in scene text recognition.
+
+ comment: IntelliSys 2024 +
+
+
+
+
+ + ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) in point clouds follow the two-stream multi-stage 3D Siamese or motion tracking paradigms, which process the template and search area point clouds with two parallel branches built on supervised point cloud backbones. In this work, going beyond typical 3D Siamese or motion tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm from a novel perspective, termed EasyTrack, which consists of three special designs: 1) a 3D point cloud tracking feature pre-training module is developed to exploit masked autoencoding for learning 3D point cloud tracking representations; 2) a unified 3D tracking feature learning and fusion network is proposed to simultaneously learn target-aware 3D features and extensively capture mutual correlation through the flexible self-attention mechanism; 3) a target location network in the dense bird's eye view (BEV) feature space is constructed for target classification and regression. Moreover, we develop an enhanced version named EasyTrack++, which designs a center points interaction (CPI) strategy to reduce the ambiguous targets caused by noisy point cloud background information. The proposed EasyTrack and EasyTrack++ set a new state-of-the-art performance (18%, 40% and 3% success gains) on KITTI, NuScenes, and Waymo while running at 52.6 fps with few parameters (1.3M). The code will be available at https://github.com/KnightApple427/Easytrack.
+
+
+
+
+ + ☆ Prompt-driven Universal Model for View-Agnostic Echocardiography + Analysis + + +
+ Echocardiography segmentation for cardiac analysis is time-consuming and resource-intensive due to the variability in image quality and the necessity to process scans from various standard views. While current automated segmentation methods in echocardiography show promising performance, they are trained on specific scan views to analyze the corresponding data. However, this solution has a limitation, as the number of required models increases with the number of standard views. To address this, in this paper, we present a prompt-driven universal method for view-agnostic echocardiography analysis. Considering the domain shift between standard views, we first introduce a method called prompt matching, aimed at learning prompts specific to different views by matching prompts and querying input embeddings using a pre-trained vision model. Then, we utilize a pre-trained medical language model to align textual information with pixel data for accurate segmentation. Extensive experiments on three standard views show that our approach significantly outperforms state-of-the-art universal methods and achieves comparable or even better performance than segmentation models trained and tested on the same views.
+
+
+
+
+ + ☆ LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions + for Brain Tumor Segmentation + + +
+ Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) +scans is crucial for prompt and effective treatment. However, this process +faces the challenge of precise delineation due to the tumors' complex +heterogeneity. Moreover, energy sustainability targets and resource +limitations, especially in developing countries, require efficient and +accessible medical imaging solutions. The proposed architecture, a Lightweight +3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these +issues. It is specifically designed to reduce computational requirements +significantly while maintaining high segmentation performance. By incorporating +parallel convolutions, it enhances feature representation by capturing +multi-scale information. It further integrates an attention mechanism to refine +segmentation through selective feature recalibration. LATUP-Net achieves +promising segmentation performance: the average Dice scores for the whole +tumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%, +83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and +83.92%, respectively. Hausdorff distance metrics further indicate its improved +ability to delineate tumor boundaries. With its significantly reduced +computational demand using only 3.07 M parameters, about 59 times fewer than +other state-of-the-art models, and running on a single V100 GPU, LATUP-Net +stands out as a promising solution for real-world clinical applications, +particularly in settings with limited resources. Investigations into the +model's interpretability, utilizing gradient-weighted class activation mapping +and confusion matrices, reveal that while attention mechanisms enhance the +segmentation of small regions, their impact is nuanced. Achieving the most +accurate tumor delineation requires carefully balancing local and global +features. + +
+
+
+
+
+ + ☆ Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image + Reconstruction + + +
+ Conventional deep learning-based image reconstruction methods require a large +amount of training data which can be hard to obtain in practice. Untrained deep +learning methods overcome this limitation by training a network to invert a +physical model of the image formation process. Here we present a novel +untrained Res-U2Net model for phase retrieval. We use the extracted phase +information to determine changes in an object's surface and generate a mesh +representation of its 3D structure. We compare the performance of Res-U2Net +phase retrieval against UNet and U2Net using images from the GDXRAY dataset. + +
+
+ comment: 16 pages, 8 figures, 4 Tables +
+
+
+
+
+ + ☆ FlameFinder: Illuminating Obscured Fire through Smoke with Attentive + Deep Metric Learning + + +
+ FlameFinder is a deep metric learning (DML) framework designed to accurately detect flames, even when obscured by smoke, using thermal images from firefighter drones during wildfire monitoring. Traditional RGB cameras struggle in such conditions, but thermal cameras can capture smoke-obscured flame features. However, they lack absolute thermal reference points, leading to false positives. To address this issue, FlameFinder utilizes paired thermal-RGB images for training. By learning latent flame features from smoke-free samples, the model becomes less biased towards relative thermal gradients. In testing, it identifies flames in smoky patches by analyzing their equivalent thermal-domain distribution. This method improves performance using both supervised and distance-based clustering metrics. The framework incorporates a flame segmentation method and a DML-aided detection framework. This includes utilizing center loss (CL), triplet center loss (TCL), and triplet cosine center loss (TCCL) to identify optimal cluster representatives for classification. However, the dominance of center loss over the other losses leads to the model missing features sensitive to them. To address this limitation, an attention mechanism is proposed. This mechanism allows for non-uniform feature contribution, amplifying the critical role of the cosine and triplet losses in the DML framework. Additionally, it improves interpretability, class discrimination, and decreases intra-class variance. As a result, the proposed model surpasses the baseline by 4.4% on the FLAME2 dataset and 7% on the FLAME3 dataset for unobscured flame detection accuracy. Moreover, it demonstrates enhanced class separation in obscured scenarios compared to VGG19, ResNet18, and three backbone models tailored for flame detection.
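+ The standard definitions of center loss and triplet center loss referenced above can be sketched as follows (the paper's attention-weighted combination and the cosine variant are not shown).
+
+    import torch
+
+    def center_loss(features, labels, centers):
+        # Pull each embedding toward the learnable center of its own class.
+        return 0.5 * ((features - centers[labels]) ** 2).sum(dim=1).mean()
+
+    def triplet_center_loss(features, labels, centers, margin=1.0):
+        # Distance to the own-class center should be smaller than the distance
+        # to the nearest other-class center by at least a margin.
+        d = torch.cdist(features, centers)                      # (B, C)
+        pos = d.gather(1, labels.view(-1, 1)).squeeze(1)        # own-center distance
+        d_other = d.clone()
+        d_other.scatter_(1, labels.view(-1, 1), float("inf"))   # mask out own class
+        neg = d_other.min(dim=1).values                         # nearest other center
+        return torch.clamp(pos - neg + margin, min=0).mean()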
+
+ comment: Submitted as a Journal Paper to IEEE Transactions on Geoscience and + Remote Sensing +
+
+
+
+
+ + ☆ SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron + Micrograph Segmentation + + +
+ Image segmentation is a critical enabler for tasks ranging from medical +diagnostics to autonomous driving. However, the correct segmentation semantics +- where are boundaries located? what segments are logically similar? - change +depending on the domain, such that state-of-the-art foundation models can +generate meaningless and incorrect results. Moreover, in certain domains, +fine-tuning and retraining techniques are infeasible: obtaining labels is +costly and time-consuming; domain images (micrographs) can be exponentially +diverse; and data sharing (for third-party retraining) is restricted. To enable +rapid adaptation of the best segmentation technology, we propose the concept of +semantic boosting: given a zero-shot foundation model, guide its segmentation +and adjust results to match domain expectations. We apply semantic boosting to +the Segment Anything Model (SAM) to obtain microstructure segmentation for +transmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and +textural features of various intermediate masks to perform mask removal and +mask merging operations. We demonstrate a zero-shot performance increase of +(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06% +drop in mean false positive masks across images of three difficulty classes +over vanilla SAM (ViT-L). + +
+
+
+
+
+ + ☆ GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis + + +
+ We present GeoSynth, a model for synthesizing satellite images with global +style and image-driven layout control. The global style control is via textual +prompts or geographic location. These enable the specification of scene +semantics or regional appearance respectively, and can be used together. We +train our model on a large dataset of paired satellite imagery, with +automatically generated captions, and OpenStreetMap data. We evaluate various +combinations of control inputs, including different types of layout controls. +Results demonstrate that our model can generate diverse, high-quality images +and exhibits excellent zero-shot generalization. The code and model checkpoints +are available at https://github.com/mvrl/GeoSynth. + +
+
+
+
+
+ + ☆ Calibrating Higher-Order Statistics for Few-Shot Class-Incremental + Learning with Pre-trained Vision Transformers CVPR 2024 + + +
+ Few-shot class-incremental learning (FSCIL) aims to adapt the model to new +classes from very few data (5 samples) without forgetting the previously +learned classes. Recent works in many-shot CIL (MSCIL) (using all available +training data) exploited pre-trained models to reduce forgetting and achieve +better plasticity. In a similar fashion, we use ViT models pre-trained on +large-scale datasets for few-shot settings, which face the critical issue of +low plasticity. FSCIL methods start with a many-shot first task to learn a very +good feature extractor and then move to the few-shot setting from the second +task onwards. While the focus of most recent studies is on how to learn the +many-shot first task so that the model generalizes to all future few-shot +tasks, we explore in this work how to better model the few-shot data using +pre-trained models, irrespective of how the first task is trained. Inspired by +recent works in MSCIL, we explore how using higher-order feature statistics can +influence the classification of few-shot classes. We identify the main +challenge of obtaining a good covariance matrix from few-shot data and propose +to calibrate the covariance matrix for new classes based on semantic similarity +to the many-shot base classes. Using the calibrated feature statistics in +combination with existing methods significantly improves few-shot continual +classification on several FSCIL benchmarks. Code is available at +https://github.com/dipamgoswami/FSCIL-Calibration. + +
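+ A hedged sketch of the calibration idea described above: the unreliable covariance estimated from a handful of few-shot samples is mixed with base-class covariances weighted by the semantic similarity of class prototypes. The softmax weighting and the mixing coefficient alpha are illustrative assumptions, not the paper's exact recipe.
+
+    import numpy as np
+
+    def calibrate_covariance(few_shot_feats, proto_new, base_protos, base_covs, alpha=0.5):
+        # few_shot_feats: (n, d); proto_new: (d,); base_protos: (K, d); base_covs: (K, d, d)
+        sims = base_protos @ proto_new / (
+            np.linalg.norm(base_protos, axis=1) * np.linalg.norm(proto_new) + 1e-8)
+        w = np.exp(sims) / np.exp(sims).sum()              # similarity softmax over base classes
+        borrowed = np.einsum("k,kij->ij", w, base_covs)    # similarity-weighted base covariance
+        own = np.cov(few_shot_feats, rowvar=False) if len(few_shot_feats) > 1 else 0.0
+        return alpha * borrowed + (1 - alpha) * own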
+
+ comment: Accepted at CLVision workshop (CVPR 2024) +
+
+
+
+
+ + ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect the driving performance of autonomous vehicles. Vision-based online road reconstruction promisingly captures road information in advance. Existing solutions like monocular depth estimation and stereo matching suffer from modest performance. The recent technique of Bird's-Eye-View (BEV) perception provides immense potential for more reliable and accurate reconstruction. This paper proposes two simple yet effective models for road elevation reconstruction in BEV, named RoadBEV-mono and RoadBEV-stereo, which estimate road elevation with monocular and stereo images, respectively. The former directly fits elevation values based on voxel features queried from the image view, while the latter efficiently recognizes road elevation patterns based on a BEV volume representing the discrepancy between left and right voxel features. Insightful analyses reveal their consistency with, and differences from, the perspective view. Experiments on a real-world dataset verify the models' effectiveness and superiority. The elevation errors of RoadBEV-mono and RoadBEV-stereo reach 1.83 cm and 0.56 cm, respectively. Estimation performance in BEV improves by 50% based on monocular images. Our models are promising for practical applications, providing valuable references for vision-based BEV perception in autonomous driving. The code is released at https://github.com/ztsrxh/RoadBEV.
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ☆ Spatially Optimized Compact Deep Metric Learning Model for Similarity + Search + + +
+ Spatial optimization is often overlooked in many computer vision tasks. Filters should be able to recognize the features of an object regardless of where it is in the image. Similarity search is a crucial task where spatial features decide an important output. The capacity of convolution to capture visual patterns across various locations is limited. In contrast to convolution, the involution kernel is dynamically created at each pixel based on the pixel value and parameters that have been learned. This study demonstrates that utilizing a single involution feature extractor layer alongside a compact convolution model significantly enhances the performance of similarity search. Additionally, we improve predictions by using the GELU activation function rather than ReLU. The negligible number of weight parameters added by involution, together with the compact model's better performance, makes the model very useful in real-world implementations. Our proposed model is below 1 megabyte in size. We have experimented with our proposed methodology and other models on the CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method outperforms the others across all three datasets.
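+ For reference, a minimal involution layer in the spirit of the operator described above: a small kernel-generation branch predicts a K x K kernel at every pixel from that pixel's features, and the kernel is applied to the pixel's local neighborhood, shared across channels within each group. The reduction ratio and activation here are illustrative choices.
+
+    import torch.nn as nn
+
+    class Involution2d(nn.Module):
+        def __init__(self, channels, kernel_size=3, groups=1, reduction=4):
+            super().__init__()
+            self.k, self.groups = kernel_size, groups
+            self.gen = nn.Sequential(              # per-pixel kernel generation
+                nn.Conv2d(channels, channels // reduction, 1), nn.ReLU(inplace=True),
+                nn.Conv2d(channels // reduction, kernel_size * kernel_size * groups, 1))
+            self.unfold = nn.Unfold(kernel_size, padding=kernel_size // 2)
+
+        def forward(self, x):
+            b, c, h, w = x.shape
+            kernel = self.gen(x).view(b, self.groups, 1, self.k * self.k, h, w)
+            patches = self.unfold(x).view(b, self.groups, c // self.groups,
+                                          self.k * self.k, h, w)
+            return (kernel * patches).sum(dim=3).view(b, c, h, w)  # weighted neighborhood sum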
+
+ comment: 5 pages, 3 figures
+
+
+
+
+ + ☆ Leveraging Latents for Efficient Thermography Classification and + Segmentation + + +
+ Breast cancer is a prominent health concern worldwide, currently being the second most common and second-deadliest type of cancer in women. While current breast cancer diagnosis mainly relies on mammography imaging, in recent years the use of thermography for breast cancer imaging has been garnering growing popularity. Thermographic imaging relies on infrared cameras to capture body-emitted heat distributions. While these heat signatures have proven useful for computer-vision systems for accurate breast cancer segmentation and classification, prior work often relies on handcrafted feature engineering or complex architectures, potentially limiting the comparability and applicability of these methods. In this work, we present a novel algorithm for both breast cancer classification and segmentation. Rather than focusing efforts on manual feature and architecture engineering, our algorithm focuses on leveraging an informative, learned feature space, thus making our solution simpler to use and extend to other frameworks and downstream tasks, as well as more applicable to data-scarce settings. Our classification produces SOTA results, while we are the first work to produce segmentation for the regions studied in this paper.
+
+
+
+
+ + ☆ The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks present an emerging threat to face recognition systems. On top of that, printing and scanning the morphed images can obscure the artifacts generated during the morphing process, which makes morphed-image detection even harder. In this work, we investigate the impact that printing and scanning have on morphing attacks through a series of heterogeneous tests. Our experiments show that providing a printed-and-scanned image, regardless of whether it is morphed or bona fide, to a Face Recognition (FR) system can increase the possibility of a false match by up to 5.64% for DiM and 16.00% for StyleGAN2. Likewise, under the Frechet Inception Distance (FID) metric, print-scanned morphing attacks performed on average 9.185% stronger than non-print-scanned digital morphs.
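+ For readers unfamiliar with the FID comparison quoted above, the sketch below computes FID between two sets of image features; the random arrays stand in for Inception embeddings of print-scanned and digital morphs and are purely illustrative.
```python
import numpy as np
from scipy.linalg import sqrtm

def fid(feats_a: np.ndarray, feats_b: np.ndarray) -> float:
    """Frechet Inception Distance between feature sets of shape (N, D) and (M, D)."""
    mu_a, mu_b = feats_a.mean(axis=0), feats_b.mean(axis=0)
    cov_a = np.cov(feats_a, rowvar=False)
    cov_b = np.cov(feats_b, rowvar=False)
    cov_mean = sqrtm(cov_a @ cov_b)
    if np.iscomplexobj(cov_mean):  # numerical noise can leave tiny imaginary parts
        cov_mean = cov_mean.real
    diff = mu_a - mu_b
    return float(diff @ diff + np.trace(cov_a + cov_b - 2.0 * cov_mean))

# e.g. compare features of print-scanned morphs vs. digital morphs
rng = np.random.default_rng(0)
print(fid(rng.normal(size=(500, 64)), rng.normal(0.1, 1.0, size=(500, 64))))
```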
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Training-Free Open-Vocabulary Segmentation with Offline + Diffusion-Augmented Prototype Generation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation aims at segmenting arbitrary categories expressed in textual form. Previous works have trained on large amounts of image-caption pairs to enforce pixel-level multimodal alignments. However, captions provide global information about the semantics of a given image but lack direct localization of individual concepts. Further, training on large-scale datasets inevitably brings significant computational costs. In this paper, we propose FreeDA, a training-free diffusion-augmented method for open-vocabulary semantic segmentation, which leverages the ability of diffusion models to visually localize generated concepts, together with local-global similarities, to match class-agnostic regions with semantic classes. Our approach involves an offline stage in which textual-visual reference embeddings are collected, starting from a large set of captions and leveraging visual and semantic contexts. At test time, these are queried to support the visual matching process, which is carried out by jointly considering class-agnostic regions and global semantic similarities. Extensive analyses demonstrate that FreeDA achieves state-of-the-art performance on five datasets, surpassing previous methods by more than 7.0 average mIoU points, without requiring any training.
+
+ comment: CVPR 2024. Project page: https://aimagelab.github.io/freeda/ +
+
+
+
+
+ + ☆ GO4Align: Group Optimization for Multi-Task Alignment + + +
+ This paper proposes \textit{GO4Align}, a multi-task optimization approach that tackles task imbalance by explicitly aligning the optimization across tasks. To achieve this, we design an adaptive group risk minimization strategy comprising two crucial techniques: (i) dynamic group assignment, which clusters similar tasks based on task interactions, and (ii) risk-guided group indicators, which exploit consistent task correlations using risk information from previous iterations. Comprehensive experimental results on diverse benchmarks demonstrate the superior performance of our method at even lower computational cost.
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims at localizing bounding boxes in an image corresponding to provided textual prompts, which requires: (i) a fine-grained disentanglement of the complex visual scene and textual context, and (ii) the capacity to understand relationships among the disentangled entities. Unfortunately, existing large vision-language alignment (VLA) models, e.g., CLIP, struggle with both aspects and so cannot be directly used for this task. To mitigate this gap, we leverage large foundation models to disentangle both images and texts into triplets in the format of (subject, predicate, object). After that, grounding is accomplished by calculating the structural similarity matrix between visual and textual triplets with a VLA model and subsequently propagating it to an instance-level similarity matrix. Furthermore, to equip VLA models with the ability to understand relationships, we design a triplet-matching objective to fine-tune the VLA models on a curated dataset containing abundant entity relationships. Experiments demonstrate a visual grounding performance increase of up to 19.5% over the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo dataset, our zero-shot approach achieves accuracy comparable to the fully supervised model. Code is available at https://github.com/Show-han/Zeroshot_REC.
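+ The structural-similarity idea above can be sketched in a few lines: each part of a visual triplet is scored against the corresponding part of the textual triplet with a VLA-style text encoder, and the per-part similarities are aggregated per region. The toy encoder and the simple averaging below are illustrative assumptions, not the paper's implementation.
```python
import numpy as np

def embed(text: str) -> np.ndarray:
    """Placeholder text encoder standing in for a VLA model such as CLIP."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.normal(size=128)
    return v / np.linalg.norm(v)

def triplet_similarity(vis_triplet, txt_triplet) -> float:
    """Average cosine similarity between (subject, predicate, object) parts."""
    sims = [float(embed(v) @ embed(t)) for v, t in zip(vis_triplet, txt_triplet)]
    return sum(sims) / len(sims)

# visual triplets extracted per region proposal vs. the query triplet
proposals = [("dog", "sitting on", "sofa"), ("man", "holding", "umbrella")]
query = ("person", "holding", "umbrella")
scores = [triplet_similarity(p, query) for p in proposals]
print(int(np.argmax(scores)))  # index of the best-matching region
```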
+
+ comment: CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC +
+
+
+
+
+ + ♻ ☆ Multi-person 3D pose estimation from unlabelled data + + +
+ Its numerous applications make multi-human 3D pose estimation a remarkably +impactful area of research. Nevertheless, assuming a multiple-view system +composed of several regular RGB cameras, 3D multi-pose estimation presents +several challenges. First of all, each person must be uniquely identified in +the different views to separate the 2D information provided by the cameras. +Secondly, the 3D pose estimation process from the multi-view 2D information of +each person must be robust against noise and potential occlusions in the +scenario. In this work, we address these two challenges with the help of deep +learning. Specifically, we present a model based on Graph Neural Networks +capable of predicting the cross-view correspondence of the people in the +scenario along with a Multilayer Perceptron that takes the 2D points to yield +the 3D poses of each person. These two models are trained in a self-supervised +manner, thus avoiding the need for large datasets with 3D annotations. + +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training dataset of a deep neural network, the network can be induced to exhibit malicious behavior during inference, which poses potential threats to real-world applications. While backdoor attacks have been intensively studied in classification, they have been largely overlooked for semantic segmentation. Unlike classification, semantic segmentation aims to classify every pixel within a given image. In this work, we explore backdoor attacks on segmentation models that misclassify all pixels of a victim class by injecting a specific trigger on non-victim pixels during inference, which we dub the Influencer Backdoor Attack (IBA). IBA is expected to maintain the classification accuracy of non-victim pixels and mislead the classification of all victim pixels in every single inference, and it can be easily applied to real-world scenes. Based on the context aggregation ability of segmentation models, we propose a simple yet effective Nearest-Neighbor trigger injection strategy. We also introduce an innovative Pixel Random Labeling strategy which maintains optimal performance even when the trigger is placed far from the victim pixels. Our extensive experiments reveal that current segmentation models do suffer from backdoor attacks, demonstrate IBA's real-world applicability, and show that our proposed techniques can further increase attack performance.
+
+
+
+
+ + ♻ ☆ An Edit Friendly DDPM Noise Space: Inversion and Manipulations CVPR 2024 + + +
+ Denoising diffusion probabilistic models (DDPMs) employ a sequence of white +Gaussian noise samples to generate an image. In analogy with GANs, those noise +maps could be considered as the latent code associated with the generated +image. However, this native noise space does not possess a convenient +structure, and is thus challenging to work with in editing tasks. Here, we +propose an alternative latent noise space for DDPM that enables a wide range of +editing operations via simple means, and present an inversion method for +extracting these edit-friendly noise maps for any given image (real or +synthetically generated). As opposed to the native DDPM noise space, the +edit-friendly noise maps do not have a standard normal distribution and are not +statistically independent across timesteps. However, they allow perfect +reconstruction of any desired image, and simple transformations on them +translate into meaningful manipulations of the output image (e.g. shifting, +color edits). Moreover, in text-conditional models, fixing those noise maps +while changing the text prompt, modifies semantics while retaining structure. +We illustrate how this property enables text-based editing of real images via +the diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM +inversion). We also show how it can be used within existing diffusion-based +editing methods to improve their quality and diversity. Webpage: +https://inbarhub.github.io/DDPM_inversion + +
+
+ comment: CVPR 2024. Code and examples are available at + https://github.com/inbarhub/DDPM_inversion +
+
+
+
+
+ + ♻ ☆ Event Data Association via Robust Model Fitting for Event-based Object + Tracking + + +
+ Event-based approaches, which are based on bio-inspired asynchronous event cameras, have achieved promising performance on various computer vision tasks. However, the study of the fundamental event data association problem is still in its infancy. In this paper, we propose a novel Event Data Association (EDA) approach to explicitly address the event association and fusion problem. The proposed EDA seeks event trajectories that best fit the event data in order to perform unified data association and information fusion. In EDA, we first asynchronously fuse the event data based on its information entropy. Then, we introduce a deterministic model hypothesis generation strategy, which effectively generates model hypotheses from the fused events to represent the corresponding event trajectories. After that, we present a two-stage weighting algorithm, which robustly weighs and selects true models from the generated model hypotheses through multi-structural geometric model fitting. Meanwhile, we also propose an adaptive model selection strategy to automatically determine the number of true models. Finally, we use the selected true models to associate and fuse the event data without being affected by sensor noise and irrelevant structures. We evaluate the performance of the proposed EDA on the object tracking task. The experimental results show the effectiveness of EDA under challenging scenarios, such as high-speed, motion-blur, and high-dynamic-range conditions.
+
+ comment: 32 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement + + +
+ Distortions caused by low-light conditions are not only visually unpleasant but also degrade the performance of computer vision tasks. Restoration and enhancement have proven to be highly beneficial. However, there are only a limited number of enhancement methods explicitly designed for videos acquired in low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet) model that uses a Swin Transformer as a backbone to capture low-light video features and exploit their spatio-temporal correlations. The STA-SUNet model is trained on a novel, fully registered dataset (BVI), which comprises dynamic scenes captured under varying light conditions. It is further analysed comparatively against various other models over three test datasets. The model demonstrates superior adaptivity across all datasets, obtaining the highest PSNR and SSIM values. It is particularly effective in extreme low-light conditions, yielding fairly good visualisation results.
+
+
+
+
+ + ♻ ☆ DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion + Models ICLR 2024 + + +
+ Recent text-to-image diffusion models have shown surprising performance in generating high-quality images. However, concerns have arisen regarding unauthorized data usage during the training or fine-tuning process. One example is when a model trainer collects a set of images created by a particular artist and attempts to train a model capable of generating similar images without obtaining permission from or giving credit to the artist. To address this issue, we propose a method for detecting such unauthorized data usage by planting injected memorization into text-to-image diffusion models trained on the protected dataset. Specifically, we modify the protected images by adding unique content to them using stealthy image warping functions that are nearly imperceptible to humans but can be captured and memorized by diffusion models. By analyzing whether the model has memorized the injected content (i.e., whether the generated images are processed by the injected post-processing function), we can detect models that have illegally utilized the unauthorized data. Experiments on Stable Diffusion and VQ Diffusion with different model training or fine-tuning methods (i.e., LoRA, DreamBooth, and standard training) demonstrate the effectiveness of our proposed method in detecting unauthorized data usages. Code: https://github.com/ZhentingWang/DIAGNOSIS.
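+ To make the idea of a stealthy, memorizable coating concrete, the sketch below applies a subtle, fixed sinusoidal warp to a protected image; the specific warp, its amplitude, and its frequency are illustrative stand-ins, not the functions used in the paper.
```python
import numpy as np

def plant_warp_signature(img: np.ndarray, amp: float = 0.5, freq: float = 0.05) -> np.ndarray:
    """Apply a subtle, fixed geometric warp to an image (H, W, C) in [0, 255].

    The warp is small enough to be nearly invisible, but consistent across the
    protected dataset, so a model trained on it can memorize the distortion.
    """
    h, w = img.shape[:2]
    ys, xs = np.mgrid[0:h, 0:w].astype(np.float32)
    xs_src = np.clip(xs + amp * np.sin(2 * np.pi * freq * ys), 0, w - 1)
    ys_src = np.clip(ys + amp * np.cos(2 * np.pi * freq * xs), 0, h - 1)
    # nearest-neighbour resampling keeps the example dependency-free
    return img[ys_src.round().astype(int), xs_src.round().astype(int)]

protected = plant_warp_signature(np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8))
print(protected.shape)  # (64, 64, 3)
```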
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DiffusionLight: Light Probes for Free by Painting a Chrome Ball CVPR 2024 + + +
+ We present a simple yet effective technique to estimate lighting in a single +input image. Current techniques rely heavily on HDR panorama datasets to train +neural networks to regress an input with limited field-of-view to a full +environment map. However, these approaches often struggle with real-world, +uncontrolled settings due to the limited diversity and size of their datasets. +To address this problem, we leverage diffusion models trained on billions of +standard images to render a chrome ball into the input image. Despite its +simplicity, this task remains challenging: the diffusion models often insert +incorrect or inconsistent objects and cannot readily generate images in HDR +format. Our research uncovers a surprising relationship between the appearance +of chrome balls and the initial diffusion noise map, which we utilize to +consistently generate high-quality chrome balls. We further fine-tune an LDR +diffusion model (Stable Diffusion XL) with LoRA, enabling it to perform +exposure bracketing for HDR light estimation. Our method produces convincing +light estimates across diverse settings and demonstrates superior +generalization to in-the-wild scenarios. + +
+
+ comment: CVPR 2024 Oral. For more information and code, please visit our + website https://diffusionlight.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning Local and Global Temporal Contexts for Video Semantic + Segmentation CVPR + 2022 + + +
+ Contextual information plays a core role in video semantic segmentation (VSS). This paper summarizes contexts for VSS in two categories: local temporal contexts (LTC), which define the contexts from neighboring frames, and global temporal contexts (GTC), which represent the contexts from the whole video. LTC includes static and motional contexts, corresponding to static and moving content in neighboring frames, respectively. Previously, both static and motional contexts have been studied, but no research has learned them simultaneously, even though they are highly complementary. Hence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a unified representation of LTC. CFFM contains two parts: Coarse-to-Fine Feature Assembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static and motional contexts, and CFM mines useful information from nearby frames to enhance target features. To further exploit more temporal contexts, we propose CFFM++ by additionally learning GTC from the whole video. Specifically, we uniformly sample certain frames from the video and extract global contextual prototypes by k-means. The information within those prototypes is mined by CFM to refine target features. Experimental results on popular benchmarks demonstrate that CFFM and CFFM++ perform favorably against state-of-the-art methods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM
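+ The GTC step above reduces to a short recipe: uniformly sample frames, pool their pixel-level features, and cluster them with k-means to obtain global contextual prototypes. The sketch below illustrates that recipe with made-up feature shapes and cluster counts; it is not the CFFM++ code.
```python
import numpy as np
from sklearn.cluster import KMeans

def global_context_prototypes(video_feats: np.ndarray, stride: int = 8, k: int = 32) -> np.ndarray:
    """video_feats: (T, H, W, C) per-frame feature maps -> (k, C) prototypes."""
    sampled = video_feats[::stride]                  # uniformly sample frames
    pixels = sampled.reshape(-1, sampled.shape[-1])  # flatten to (N, C)
    km = KMeans(n_clusters=k, n_init=4, random_state=0).fit(pixels)
    return km.cluster_centers_                       # global temporal contexts

protos = global_context_prototypes(np.random.rand(64, 16, 16, 8))
print(protos.shape)  # (32, 8)
```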
+
+ comment: Accepted to TPAMI, an extended version of a paper published in CVPR + 2022 +
+
+
+
+
+ + ♻ ☆ SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D + Object Detection + + +
+ Roadside perception can greatly increase the safety of autonomous vehicles by extending their perception ability beyond the visual range and addressing blind spots. However, current state-of-the-art vision-based roadside detection methods achieve high accuracy on labeled scenes but perform poorly on new scenes. This is because roadside cameras remain stationary after installation and can only collect data from a single scene, so the algorithm overfits these roadside backgrounds and camera poses. To address this issue, we propose an innovative Scenario Generalization Framework for Vision-based Roadside 3D Object Detection, dubbed SGV3D. Specifically, we employ a Background-suppressed Module (BSM) to mitigate background overfitting in vision-centric pipelines by attenuating background features during the 2D to bird's-eye-view projection. Furthermore, by introducing a Semi-supervised Data Generation Pipeline (SSDG) that uses unlabeled images from new scenes, diverse instance foregrounds with varying camera poses are generated, addressing the risk of overfitting to specific camera poses. We evaluate our method on two large-scale roadside benchmarks. Our method surpasses all previous methods by a significant margin in new scenes, including +42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to BEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D heterologous benchmark, we achieve notable gains of 14.48% for car and 12.41% for large vehicle. We aspire to contribute insights on the exploration of roadside perception techniques, emphasizing their capability for scenario generalization. The code will be available at https://github.com/yanglei18/SGV3D
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, sparking numerous studies to evaluate their multi-modal capabilities. However, digging into current evaluation works, we identify two primary issues: 1) Visual content is unnecessary for many samples. The answers can be directly inferred from the questions and options, or from the world knowledge embedded in LLMs. This phenomenon is prevalent across current benchmarks. For instance, GeminiPro achieves 42.9% on the MMMU benchmark without any visual input and outperforms the random-choice baseline across six benchmarks by over 24% on average. 2) Unintentional data leakage exists in LLM and LVLM training. LLMs and LVLMs can still answer some vision-necessary questions without visual content, indicating that these samples were memorized from the large-scale training data. For example, Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM backbone by 17.9%. Both problems lead to misjudgments of actual multi-modal gains and potentially misguide the study of LVLMs. To this end, we present MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 samples meticulously selected by humans. MMStar benchmarks 6 core capabilities and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with carefully balanced and purified samples. These samples are first roughly selected from current benchmarks with an automated pipeline; human review is then involved to ensure each curated sample exhibits visual dependency, minimal data leakage, and the need for advanced multi-modal capabilities. Moreover, two metrics are developed to measure data leakage and actual performance gain in multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their multi-modal capabilities, and on 7 benchmarks with the proposed metrics to investigate their data leakage and actual multi-modal gain.
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ♻ ☆ CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors + Object Detection from Multi-view Images CVPR2024 + + +
+ This paper introduces CN-RMA, a novel approach for 3D indoor object detection from multi-view images. We identify the key challenge as the ambiguity of image-to-3D correspondence without explicit geometry to provide occlusion information. To address this issue, CN-RMA leverages the synergy of 3D reconstruction networks and 3D object detection networks, where the reconstruction network provides a rough Truncated Signed Distance Function (TSDF) and guides image features to vote into 3D space correctly in an end-to-end manner. Specifically, we associate weights with the sampled points of each ray through ray marching, representing the contribution of a pixel in an image to the corresponding 3D locations. These weights are determined by the predicted signed distances so that image features vote only to regions near the reconstructed surface. Our method achieves state-of-the-art performance in 3D object detection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the ScanNet and ARKitScenes datasets. The code and models are released at https://github.com/SerCharles/CN-RMA.
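+ The ray-marching weighting can be pictured with a tiny example: given TSDF values at the sampled points of a ray, assign each point a weight that peaks near the zero crossing, so a pixel's feature votes mostly to the reconstructed surface. The Gaussian-of-signed-distance weighting below is an illustrative choice, not the paper's exact formulation.
```python
import numpy as np

def ray_vote_weights(signed_dist: np.ndarray, sigma: float = 0.05) -> np.ndarray:
    """Weights for sampled points along a ray, peaked near the predicted surface.

    signed_dist: (N,) TSDF values at the sampled points (metres).
    Returns normalized weights (N,) used to scatter a pixel's feature into 3D.
    """
    w = np.exp(-0.5 * (signed_dist / sigma) ** 2)  # high weight where |sdf| ~ 0
    s = w.sum()
    return w / s if s > 0 else w

sdf_samples = np.linspace(0.8, -0.4, 25)          # ray crosses the surface once
weights = ray_vote_weights(sdf_samples)
print(weights.argmax(), round(weights.max(), 3))  # peak lands near the zero crossing
```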
+
+ comment: CVPR2024 poster paper, 8 pages of main part, and 4 pages of + supplementary material +
+
+
+
+
+ + ♻ ☆ MetaMix: Meta-state Precision Searcher for Mixed-precision Activation + Quantization AAAI + + +
+ Mixed-precision quantization of efficient networks often suffers from activation instability encountered during the exploration of bit selections. To address this problem, we propose a novel method called MetaMix, which consists of bit-selection and weight-training phases. The bit-selection phase iterates two steps, (1) the mixed-precision-aware weight update and (2) the bit-search training with the fixed mixed-precision-aware weights, which together reduce activation instability in mixed-precision quantization and contribute to fast and high-quality bit selection. The weight-training phase exploits the weights and step sizes trained in the bit-selection phase and fine-tunes them, thereby offering fast training. Our experiments with efficient and hard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet show that our proposed method pushes the boundary of mixed-precision quantization, in terms of accuracy vs. operations, by outperforming both mixed- and single-precision SOTA methods.
+
+ comment: Proc. The 38th Annual AAAI Conference on Artificial Intelligence + (AAAI) +
+
+
+
+
+ + ♻ ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, most approaches improve segmentation performance by adding more complex modules. This is not suitable for the medical field, especially for mobile medical devices, where computationally heavy models are impractical in real clinical environments due to resource constraints. Recently, state-space models (SSMs), represented by Mamba, have become strong competitors to traditional CNNs and Transformers. In this paper, we deeply explore the key elements of parameter influence in Mamba and, based on this, propose an UltraLight Vision Mamba UNet (UltraLight VM-UNet). Specifically, we propose a method for processing features in parallel Vision Mamba, named the PVM Layer, which achieves excellent performance with the lowest computational load while keeping the overall number of processing channels constant. We conducted comparison and ablation experiments against several state-of-the-art lightweight models on three public skin lesion datasets and demonstrated that the UltraLight VM-UNet exhibits equally strong performance with only 0.049M parameters and 0.060 GFLOPs. In addition, this study deeply explores the key elements of parameter influence in Mamba, which will lay a theoretical foundation for Mamba to possibly become a new mainstream lightweight module in the future. The code is available from https://github.com/wurenkai/UltraLight-VM-UNet .
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
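+ The reformulated setup above can be illustrated with a toy alignment step: each participant performs a gradient step on a weighted squared distance to its peers' parameter vectors, so models are pulled toward one another without collapsing into a single global model. The weights, learning rate, and convergence target below are illustrative, not the paper's exact procedure.
```python
import numpy as np

def alignment_step(own: np.ndarray, peers: list, weights: np.ndarray,
                   lr: float = 0.1) -> np.ndarray:
    """One gradient step on a weighted distance to peer parameter vectors.

    Minimizes 0.5 * sum_j w_j * ||theta - theta_j||^2 w.r.t. theta, which pulls
    each participant toward its peers without forcing a single global model.
    """
    grad = sum(w * (own - p) for w, p in zip(weights, peers))
    return own - lr * grad

theta = np.zeros(4)
peer_models = [np.ones(4), 2 * np.ones(4)]
w = np.array([0.7, 0.3])      # e.g. closer / more relevant peers weigh more
for _ in range(50):
    theta = alignment_step(theta, peer_models, w)
print(theta.round(2))         # converges toward the weighted peer average (1.3)
```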
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis CVPR 2024 + + +
+ Diffusion models are a promising approach to image generation and have been employed for Pose-Guided Person Image Synthesis (PGPIS) with competitive performance. While existing methods simply align the person appearance to the target pose, they are prone to overfitting due to the lack of a high-level semantic understanding of the source person image. In this paper, we propose a novel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence of image-caption pairs and textual prompts, we develop a novel training paradigm purely based on images to control the generation process of a pre-trained text-to-image diffusion model. A perception-refined decoder is designed to progressively refine a set of learnable queries and extract semantic understanding of person images as a coarse-grained prompt. This allows for the decoupling of fine-grained appearance and pose information controls at different stages, thus circumventing the potential overfitting problem. To generate more realistic texture details, a hybrid-granularity attention module is proposed to encode multi-scale fine-grained appearance features as bias terms to augment the coarse-grained prompt. Both quantitative and qualitative experimental results on the DeepFashion benchmark demonstrate the superiority of our method over the state of the art for PGPIS. Code is available at https://github.com/YanzuoLu/CFLD.
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ One-Step Late Fusion Multi-view Clustering with Compressed Subspace ICASSP2024 + + +
+ Late fusion multi-view clustering (LFMVC) has become a rapidly growing class +of methods in the multi-view clustering (MVC) field, owing to its excellent +computational speed and clustering performance. One bottleneck faced by +existing late fusion methods is that they are usually aligned to the average +kernel function, which makes the clustering performance highly dependent on the +quality of datasets. Another problem is that they require subsequent k-means +clustering after obtaining the consensus partition matrix to get the final +discrete labels, and the resulting separation of the label learning and cluster +structure optimization processes limits the integrity of these models. To +address the above issues, we propose an integrated framework named One-Step +Late Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS). +Specifically, we use the consensus subspace to align the partition matrix while +optimizing the partition fusion, and utilize the fused partition matrix to +guide the learning of discrete labels. A six-step iterative optimization +approach with verified convergence is proposed. Sufficient experiments on +multiple datasets validate the effectiveness and efficiency of our proposed +method. + +
+
+ comment: Accepted by ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding detection technologies need to continuously evolve to regulate the potential misuse of deepfakes, such as for privacy invasion and phishing attacks. This survey comprehensively reviews the latest developments in deepfake generation and detection, summarizing and analyzing the current state of the art in this rapidly evolving field. We first unify task definitions, comprehensively introduce datasets and metrics, and discuss the development of generation and detection technology frameworks. Then, we discuss the development of several related sub-fields and focus on four mainstream deepfake fields: popular face swap, face reenactment, talking face generation, and facial attribute editing, as well as forgery detection. Subsequently, we comprehensively benchmark representative methods on popular datasets for each field, fully evaluating the latest and most influential works published in top conferences/journals. Finally, we analyze the challenges and future research directions of the discussed fields. We closely follow the latest developments at https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.
+
+
+
+
+ + ♻ ☆ MultIOD: Rehearsal-free Multihead Incremental Object Detector CVPR 2024 + + +
+ Class-Incremental Learning (CIL) refers to the ability of artificial agents to integrate new classes as they appear in a stream. It is particularly interesting in evolving environments where agents have limited access to memory and computational resources. The main challenge of incremental learning is catastrophic forgetting, the inability of neural networks to retain past knowledge when learning new classes. Unfortunately, most existing class-incremental methods for object detection are applied to two-stage algorithms such as Faster-RCNN and rely on rehearsal memory to retain past knowledge. We argue that those are not suitable in resource-limited environments, and more effort should be dedicated to anchor-free and rehearsal-free object detection. In this paper, we propose MultIOD, a class-incremental object detector based on CenterNet. Our contributions are: (1) we propose a multihead feature pyramid and multihead detection architecture to efficiently separate class representations, (2) we employ transfer learning between classes learned initially and those learned incrementally to tackle catastrophic forgetting, and (3) we use class-wise non-max suppression as a post-processing technique to remove redundant boxes. Results show that our method outperforms state-of-the-art methods on two Pascal VOC datasets, while only saving the model in its current state, contrary to other distillation-based counterparts.
+
+ comment: Accepted at the archival track of the Workshop on Continual Learning + in Computer Vision (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane + Extrapolation + + +
+ We present BlockFusion, a diffusion-based model that generates 3D scenes as +unit blocks and seamlessly incorporates new blocks to extend the scene. +BlockFusion is trained using datasets of 3D blocks that are randomly cropped +from complete 3D scene meshes. Through per-block fitting, all training blocks +are converted into the hybrid neural fields: with a tri-plane containing the +geometry features, followed by a Multi-layer Perceptron (MLP) for decoding the +signed distance values. A variational auto-encoder is employed to compress the +tri-planes into the latent tri-plane space, on which the denoising diffusion +process is performed. Diffusion applied to the latent representations allows +for high-quality and diverse 3D scene generation. To expand a scene during +generation, one needs only to append empty blocks to overlap with the current +scene and extrapolate existing latent tri-planes to populate new blocks. The +extrapolation is done by conditioning the generation process with the feature +samples from the overlapping tri-planes during the denoising iterations. Latent +tri-plane extrapolation produces semantically and geometrically meaningful +transitions that harmoniously blend with the existing scene. A 2D layout +conditioning mechanism is used to control the placement and arrangement of +scene elements. Experimental results indicate that BlockFusion is capable of +generating diverse, geometrically consistent and unbounded large 3D scenes with +unprecedented high-quality shapes in both indoor and outdoor scenarios. + +
+
+ comment: Video: https://www.youtube.com/watch?v=PxIBtd6G0mA +
+
+
+
+
+ + ♻ ☆ Learning Zero-Shot Material States Segmentation, by Implanting Natural + Image Patterns in Synthetic Data + + +
+ Visual understanding and segmentation of materials and their states is fundamental to understanding the physical world. The myriad textures, shapes, and often blurry boundaries formed by materials make this task particularly hard to generalize. Whether it's identifying wet regions of a surface, minerals in rocks, infected regions in plants, or pollution in water, each material state has its own unique form. For neural nets to learn general class-agnostic material segmentation, it is necessary to first collect and annotate data that captures this complexity. Collecting and manually annotating real-world images is limited by the cost and precision of manual labor. In contrast, synthetic CGI data is highly accurate and almost cost-free, but fails to replicate the vast diversity of the material world. This work offers a method to bridge this crucial gap by implanting patterns extracted from real-world images in synthetic data. Hence, patterns automatically collected from natural images are used to map materials into synthetic scenes. This unsupervised approach allows the generated data to capture the vast complexity of the real world while maintaining the precision and scale of synthetic data. We also present the first general benchmark for zero-shot material state segmentation. The benchmark contains a wide range of real-world images of material states, like food, rocks, construction, plants, liquids, and many others, each in various states (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The annotation includes both partial similarity between regions with similar but not identical materials, and hard segmentation of only points in the exact same material state. We show that networks trained on MatSeg significantly outperform existing state-of-the-art methods on this task. The dataset, code, and trained model are available.
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, suffers from the inherent ambiguity arising from multiplicity and imperfect annotations. Deterministic functions are not sufficiently powerful to capture this ambiguity, prompting the exploration of probabilistic embeddings to tackle the challenge. However, the existing probabilistic ITM approach encounters two key shortcomings: the burden of heavy computations due to the Monte Carlo approximation, and the loss saturation issue in the face of abundant false negatives. To overcome these issues, this paper presents improved Probabilistic Cross-Modal Embeddings (PCME++) by introducing a new probabilistic distance with a closed-form solution. In addition, two optimization techniques are proposed to enhance PCME++ further: first, the incorporation of pseudo-positives to prevent the negative effect of massive false negatives; second, mixed sample data augmentation for probabilistic matching. Experimental results on MS-COCO Caption and two extended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is also evaluated under noisy image-text correspondences. In addition, the potential applicability of PCME++ in automatic prompt-filtering for zero-shot classification is shown. The code is available at https://github.com/naver-ai/pcmepp
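+ One natural closed-form probabilistic distance for diagonal-Gaussian embeddings is the expected squared Euclidean distance between samples, which needs no Monte Carlo approximation. The sketch below uses that form purely as an illustration of the idea; it is not claimed to be the exact distance adopted by PCME++.
```python
import numpy as np

def expected_sq_distance(mu_a, logvar_a, mu_b, logvar_b) -> float:
    """E||x - y||^2 for x ~ N(mu_a, diag(var_a)), y ~ N(mu_b, diag(var_b)).

    Closed form: ||mu_a - mu_b||^2 + sum(var_a) + sum(var_b), so no sampling is
    needed to compare probabilistic image and text embeddings.
    """
    diff = np.asarray(mu_a) - np.asarray(mu_b)
    return float(diff @ diff + np.exp(logvar_a).sum() + np.exp(logvar_b).sum())

mu_img, lv_img = np.zeros(8), np.full(8, -2.0)       # confident image embedding
mu_txt, lv_txt = np.ones(8) * 0.1, np.full(8, 0.0)   # more uncertain caption
print(round(expected_sq_distance(mu_img, lv_img, mu_txt, lv_txt), 3))
```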
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Industrial Application of 6D Pose Estimation for Robotic Manipulation in + Automotive Internal Logistics + + +
+ Despite advances in robotics, a large proportion of parts-handling tasks in the automotive industry's internal logistics are not automated and are still performed by humans. A key component for competitively automating these processes is a 6D pose estimation that can handle a large number of different parts, is adaptable to new parts with little manual effort, and is sufficiently accurate and robust with respect to industry requirements. In this context, the question arises as to the current status quo with respect to these measures. To address this, we built a representative 6D pose estimation pipeline with state-of-the-art components, from economically scalable real and synthetic data generation to pose estimators, and evaluated it on automotive parts with regard to a realistic sequencing process. We found that, using these data generation approaches, the performance of the trained 6D pose estimators is promising but does not meet industry requirements. We reveal that the reason for this is the estimators' inability to provide reliable uncertainties for their poses, rather than an inability to provide sufficiently accurate poses. In this context, we further analyze how RGB- and RGB-D-based approaches compare against this background and show that they are differently vulnerable to the domain gap induced by synthetic data.
+
+ comment: Accepted for publication at IEEE International Conference on + Automation Science and Engineering (CASE 2023) +
+
+
+
+
+ + ♻ ☆ Self-training via Metric Learning for Source-Free Domain Adaptation of + Semantic Segmentation + + +
+ Unsupervised source-free domain adaptation methods aim to train a model for +the target domain utilizing a pretrained source-domain model and unlabeled +target-domain data, particularly when accessibility to source data is +restricted due to intellectual property or privacy concerns. Traditional +methods usually use self-training with pseudo-labeling, which is often +subjected to thresholding based on prediction confidence. However, such +thresholding limits the effectiveness of self-training due to insufficient +supervision. This issue becomes more severe in a source-free setting, where +supervision comes solely from the predictions of the pre-trained source model. +In this study, we propose a novel approach by incorporating a mean-teacher +model, wherein the student network is trained using all predictions from the +teacher network. Instead of employing thresholding on predictions, we introduce +a method to weight the gradients calculated from pseudo-labels based on the +reliability of the teacher's predictions. To assess reliability, we introduce a +novel approach using proxy-based metric learning. Our method is evaluated in +synthetic-to-real and cross-city scenarios, demonstrating superior performance +compared to existing state-of-the-art methods. + +
+
+ comment: This paper is under consideration at Computer Vision and Image + Understanding +
+
+
+
+
+ + ♻ ☆ Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of + Figure Skating + + +
+ Fine-grained action analysis on existing action datasets is challenged by insufficient action categories, low fine granularity, limited modalities, and limited tasks. In this paper, we propose a Multi-modality and Multi-task dataset of Figure Skating (MMFS), collected from the World Figure Skating Championships. MMFS, which supports action recognition and action quality assessment, captures RGB and skeleton modalities and collects action scores from 11671 clips across 256 categories, including spatial and temporal labels. The key contributions of our dataset are threefold. (1) Independent spatial and temporal categories are proposed for the first time to further explore fine-grained action recognition and quality assessment. (2) MMFS first introduces the skeleton modality for complex fine-grained action quality assessment. (3) Our multi-modality and multi-task dataset encourages more action analysis models. To benchmark our dataset, we adopt RGB-based and skeleton-based baseline methods for action recognition and action quality assessment.
+
+
+
+
+ + ♻ ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous driving. Recent approaches have made great advances in 3D semantic occupancy prediction on a single modality. However, multi-modal semantic occupancy prediction approaches have encountered difficulties in dealing with the modality heterogeneity, modality misalignment, and insufficient modality interactions that arise during the fusion of data from different modalities, which may result in the loss of important geometric and semantic information. This letter presents a novel multi-modal, i.e., LiDAR-camera, 3D semantic occupancy prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera feature fusion with implicit volume rendering regularization. The key insight is that volume rendering in the feature space can proficiently bridge the gap between 3D LiDAR sweeps and 2D images while serving as a physical regularization to enhance the LiDAR-camera fused volumetric representation. Specifically, we first propose a Geometric- and Semantic-aware Fusion (GSFusion) module to explicitly enhance LiDAR features by incorporating neighboring camera features through a K-nearest neighbors (KNN) search. Then, we employ volume rendering to project the fused features back to the image planes to reconstruct color and depth maps. These maps are then supervised by input images from the camera and depth estimations derived from the LiDAR, respectively. Extensive experiments on the popular nuScenes and SemanticKITTI benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy prediction. The project page is available at https://rorisis.github.io/Co-Occ_project-page/.
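+ The GSFusion step above boils down to a KNN gather in 3D: each LiDAR point looks up its nearest camera-derived features and concatenates an aggregate of them. The sketch below shows that pattern with random data and a simple mean aggregation; the shapes, the aggregation rule, and k are illustrative assumptions rather than the paper's module.
```python
import numpy as np
from scipy.spatial import cKDTree

def knn_fuse(lidar_feats, lidar_xyz, cam_feats, cam_xyz, k: int = 4):
    """Augment each LiDAR feature with the mean of its k nearest camera features.

    lidar_feats: (N, C), lidar_xyz: (N, 3); cam_feats: (M, C), cam_xyz: (M, 3)
    Returns fused features of shape (N, 2 * C).
    """
    tree = cKDTree(cam_xyz)
    _, idx = tree.query(lidar_xyz, k=k)          # (N, k) neighbor indices
    neighbor_mean = cam_feats[idx].mean(axis=1)  # (N, C)
    return np.concatenate([lidar_feats, neighbor_mean], axis=1)

fused = knn_fuse(np.random.rand(100, 16), np.random.rand(100, 3),
                 np.random.rand(500, 16), np.random.rand(500, 3))
print(fused.shape)  # (100, 32)
```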
+
+
+
+
+ + ♻ ☆ Anchor-based Multi-view Subspace Clustering with Hierarchical Feature + Descent + + +
+ Multi-view clustering has attracted growing attention owing to its ability to aggregate information from various sources and its promising horizons in public affairs. Up to now, many advanced approaches have been proposed in the recent literature. However, several difficulties remain to be tackled. One common dilemma occurs while attempting to align the features of different views. Moreover, because many existing multi-view clustering algorithms stem from spectral clustering, they incur cubic time complexity w.r.t. the size of the dataset. We propose Anchor-based Multi-view Subspace Clustering with Hierarchical Feature Descent (MVSC-HFD) to tackle the discrepancy among views through hierarchical feature descent and projection to a common subspace (Stage 1), which reveals the dependency of different views. We further reduce the computational complexity to linear time through a unified sampling strategy in the common subspace (Stage 2), followed by anchor-based subspace clustering to learn the bipartite graph collectively (Stage 3). Extensive experimental results on public benchmark datasets demonstrate that our proposed model consistently outperforms state-of-the-art techniques.
+
+
+
+
+ + ♻ ☆ Simple Semantic-Aided Few-Shot Learning CVPR 2024 + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out as a challenging computer vision task. Several works exploit semantics and design complicated semantic fusion mechanisms to compensate for the rare representative features within restricted data. However, relying on naive semantics such as class names introduces biases due to their brevity, while acquiring extensive semantics from external knowledge requires substantial time and effort. This limitation severely constrains the potential of semantics in Few-Shot Learning. In this paper, we design an automatic way, called Semantic Evolution, to generate high-quality semantics. The incorporation of high-quality semantics alleviates the need for the complex network structures and learning algorithms used in previous works. Hence, we employ a simple two-layer network, termed the Semantic Alignment Network, to transform semantics and visual features into robust class prototypes with rich discriminative features for few-shot classification. The experimental results show that our framework outperforms all previous methods on six benchmarks, demonstrating that a simple network with high-quality semantics can beat intricate multi-modal modules on few-shot classification tasks. Code is available at https://github.com/zhangdoudou123/SemFew.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized for image processing tasks and is particularly suitable for deployment on neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for image processing, which combines the power of Spiking Neural Networks (SNNs) with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two primary challenges: ensuring high-fidelity information propagation through the network via spikes and formulating an effective training strategy. To address the issue of information loss, we introduce multi-threshold spiking neurons, which improve the efficiency of information transmission within the Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning pipeline that leverages pre-trained U-Net models. During the conversion process, significant variability in data distribution across different parts of the network is observed when utilizing skip connections. Therefore, we propose a connection-wise normalization method to prevent inaccurate firing rates. Furthermore, we adopt a flow-based training method to fine-tune the converted models, reducing the number of time steps while preserving performance. Experimental results show that, on image segmentation and denoising, our Spiking-UNet achieves performance comparable to its non-spiking counterpart, surpassing existing SNN methods. Compared with the converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference time by approximately 90%. This research broadens the application scope of SNNs in image processing and is expected to inspire further exploration in the field of neuromorphic engineering. The code for our Spiking-UNet implementation is available at https://github.com/SNNresearch/Spiking-UNet.
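+ The multi-threshold neuron can be pictured with a toy integrate-and-fire model that emits a graded spike whose amplitude equals the number of thresholds the membrane potential has crossed, letting a single time step carry more information than a binary spike. The threshold values and reset rule below are illustrative choices, not the paper's exact neuron model.
```python
import numpy as np

def multi_threshold_if(inputs: np.ndarray, thresholds=(1.0, 2.0, 3.0)) -> np.ndarray:
    """Integrate-and-fire neuron emitting graded spikes at multiple thresholds.

    inputs: (T,) input current per time step. Returns (T,) spike magnitudes,
    where crossing the n-th threshold emits a spike of amplitude n.
    """
    v, spikes = 0.0, np.zeros_like(inputs)
    for t, x in enumerate(inputs):
        v += x
        level = sum(v >= th for th in thresholds)  # how many thresholds crossed
        if level:
            spikes[t] = level
            v -= level * thresholds[0]             # soft reset by the fired amount
    return spikes

print(multi_threshold_if(np.array([0.6, 0.6, 2.5, 0.1, 0.9])))
```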
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively + Aggregated Spatio-Temporal Alignment + + +
+ Leveraging Transformer attention has led to great advancements in HDR deghosting. However, the intricate nature of self-attention introduces practical challenges, as existing state-of-the-art methods often demand high-end GPUs or exhibit slow inference speeds, especially for high-resolution images such as 2K. Striking an optimal balance between performance and latency remains a critical concern. In response, this work presents PASTA, a novel Progressively Aggregated Spatio-Temporal Alignment framework for HDR deghosting. Our approach achieves effectiveness and efficiency by harnessing hierarchical representations during feature disentanglement. Through the utilization of diverse granularities within the hierarchical structure, our method substantially boosts computational speed and optimizes the HDR imaging workflow. In addition, we explore within-scale feature modeling with local and global attention, gradually merging and refining the features in a coarse-to-fine fashion. Experimental results showcase PASTA's superiority over current SOTA methods in both visual quality and performance metrics, accompanied by a substantial threefold (x3) increase in inference speed.
+
+
+
+
+ + ♻ ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships among various class-specific predictions and the imbalance in label masks on long-tailed segmentation learning. To address these challenges, we propose an innovative Pixel-wise Adaptive Training (PAT) technique tailored for long-tailed segmentation. PAT has two key features: 1) class-wise gradient magnitude homogenization, and 2) pixel-wise class-specific loss adaptation (PCLA). First, the class-wise gradient magnitude homogenization helps alleviate the imbalance among label masks by ensuring equal consideration of the class-wise impact on model updates. Second, PCLA tackles the detrimental impact of both rare classes within the long-tailed distribution and inaccurate predictions from previous training stages by encouraging learning of classes with low prediction confidence and guarding against forgetting classes with high confidence. This combined approach fosters robust learning while preventing the model from forgetting previously learned knowledge. PAT exhibits significant performance improvements, surpassing the current state-of-the-art by 2.2% on the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and the intersection-over-union value by 2.07%, with a particularly notable decline of 0.39% in detecting rare classes compared to Balance Logits Variation, as demonstrated on three popular datasets, i.e., OxfordPetIII, CityScape, and NYU.
+
+
+
+
+ + ♻ ☆ Anomaly Score: Evaluating Generative Models and Individual Generated + Images based on Complexity and Vulnerability CVPR 2024 + + +
+ With the advancement of generative models, the assessment of generated images becomes more and more important. Previous methods measure distances between features of reference and generated images extracted from trained vision models. In this paper, we conduct an extensive investigation into the relationship between the representation space and the input space around generated images. We first propose two measures related to the presence of unnatural elements within images: complexity, which indicates how non-linear the representation space is, and vulnerability, which is related to how easily the extracted feature changes under adversarial input changes. Based on these, we introduce a new metric for evaluating image-generative models, called the anomaly score (AS). Moreover, we propose AS-i (anomaly score for individual images), which can effectively evaluate generated images individually. Experimental results demonstrate the validity of the proposed approach.
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
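+ A minimal sketch of the truncated entry-wise absolute residual loss over point pairs for a candidate rotation R and translation t is given below; the truncation threshold and synthetic data are illustrative, and the branch-and-bound search that minimizes this loss is not shown.
```python
import numpy as np

def tear_loss(R: np.ndarray, t: np.ndarray, P: np.ndarray, Q: np.ndarray,
              c: float = 0.1) -> float:
    """Truncated entry-wise absolute residual loss for 3D registration.

    P, Q: (N, 3) corresponding point pairs; residual entries larger than the
    truncation threshold c contribute only c, capping the influence of outliers.
    """
    residuals = np.abs(P @ R.T + t - Q)      # (N, 3) entry-wise absolute residuals
    return float(np.minimum(residuals, c).sum())

# inliers follow a known rigid motion, outliers are random
rng = np.random.default_rng(0)
P = rng.normal(size=(1000, 3))
R_true, t_true = np.eye(3), np.array([0.5, -0.2, 0.1])
Q = P @ R_true.T + t_true
Q[:900] += rng.normal(size=(900, 3)) * 5.0   # 90% gross outliers
print(tear_loss(R_true, t_true, P, Q) < tear_loss(np.eye(3), np.zeros(3), P, Q))
```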
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class knowledge for segmentation, namely image-level Weakly Supervised Semantic Segmentation (WSSS), still remains challenging. While Class Activation Maps (CAMs) using CNNs have steadily been contributing to the success of WSSS, the resulting activation maps often narrowly focus on class-specific parts (e.g., only the face of a person). On the other hand, recent works based on vision transformers (ViT) have shown promising results based on their self-attention mechanism for capturing semantic parts, but fail to capture complete class-specific details (e.g., the entire body of a person, but possibly also a nearby dog). In this work, we propose Complementary Branch (CoBra), a novel dual-branch framework consisting of two distinct architectures which provide valuable complementary knowledge of class (from the CNN) and semantics (from the ViT) to each branch. In particular, we learn a Class-Aware Projection (CAP) for the CNN branch and a Semantic-Aware Projection (SAP) for the ViT branch to explicitly fuse their complementary knowledge and facilitate a new type of extra patch-level supervision. Our model, through CoBra, fuses the CNN and ViT's complementary outputs to create robust pseudo masks that integrate both class and semantic information effectively. Extensive experiments qualitatively and quantitatively investigate how the CNN and ViT complement each other on the PASCAL VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not only the masks generated by our model, but also the segmentation results derived from utilizing these masks as pseudo labels.
+
+
+
+
+ + ♻ ☆ BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis + via Bridging Image and Video Diffusion Models CVPR 2024 + + +
+ Diffusion models have made tremendous progress in text-driven image and video generation. Now text-to-image foundation models are widely applied to various downstream image synthesis tasks, such as controllable image generation and image editing, while downstream video synthesis tasks are less explored for several reasons. First, it requires huge memory and computation overhead to train a video generation foundation model, and even with video foundation models, additional costly training is still required for downstream video synthesis tasks. Second, although some works extend image diffusion models into videos in a training-free manner, temporal consistency cannot be well preserved. Finally, these adaptation methods are specifically designed for one task and fail to generalize to different tasks. To mitigate these issues, we propose a training-free, general-purpose video synthesis framework, coined {\bf BIVDiff}, which bridges specific image diffusion models and general text-to-video foundation diffusion models. Specifically, we first use a specific image diffusion model (e.g., ControlNet or Instruct Pix2Pix) for frame-wise video generation, then perform Mixed Inversion on the generated video, and finally input the inverted latents into the video diffusion models (e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework enables flexible image model selection for different purposes with strong task generalization and high efficiency. To validate the effectiveness and general use of BIVDiff, we perform a wide range of video synthesis tasks, including controllable video generation, video editing, video inpainting, and outpainting.
+
+ comment: Accepted by CVPR 2024. Project page: https://bivdiff.github.io; + GitHub repository: https://github.com/MCG-NJU/BIVDiff +
+
+
+
+
+
+ ♻ ☆ Empowering Image Recovery: A Multi-Attention Approach
+
+
+
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
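The "windowed attention" mentioned above restricts self-attention to small non-overlapping spatial windows. A generic sketch of that mechanism (Swin-style partitioning), not DART's exact layer:

```python
import torch
import torch.nn as nn

class WindowAttention(nn.Module):
    """Self-attention computed independently inside s x s windows of a feature map."""
    def __init__(self, dim: int, window: int, heads: int = 4):
        super().__init__()
        self.window = window
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (B, H, W, C)
        b, h, w, c = x.shape
        s = self.window
        assert h % s == 0 and w % s == 0, "pad the feature map to a multiple of the window size"
        # partition into (B * num_windows, s*s, C) token groups
        x = x.view(b, h // s, s, w // s, s, c).permute(0, 1, 3, 2, 4, 5).reshape(-1, s * s, c)
        out, _ = self.attn(x, x, x)                      # attention restricted to each window
        # undo the partition
        out = out.view(b, h // s, w // s, s, s, c).permute(0, 1, 3, 2, 4, 5)
        return out.reshape(b, h, w, c)
```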
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation + in VEM images + + +
+ While imaging techniques at macro and mesoscales have garnered substantial +attention and resources, microscale VEM imaging, capable of revealing intricate +vascular details, has lacked the necessary benchmarking infrastructure. In this +paper, we address a significant gap in the field of neuroimaging by introducing +the largest-to-date public benchmark, \textbf{BvEM}, designed specifically for +cortical blood vessel segmentation in volume electron microscopy (VEM) images. +Our BvEM benchmark is based on VEM image volumes from three mammal species: +adult mouse, macaque, and human. We standardized the resolution, addressed +imaging variations, and meticulously annotated blood vessels through +semi-automatic, manual, and quality control processes, ensuring high-quality 3D +segmentation. Furthermore, we developed a zero-shot cortical blood vessel +segmentation method named TriSAM, which leverages the powerful segmentation +model SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation, +TriSAM employs a multi-seed tracking framework, leveraging the reliability of +certain image planes for tracking while using others to identify potential +turning points. This approach effectively achieves long-term 3D blood vessel +segmentation without model training or fine-tuning. Experimental results show +that TriSAM achieved superior performances on the BvEM benchmark across three +species. + +
+
+ comment: BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9 +
+
+
+
+
+ + ♻ ☆ GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped + Robot + + +
+ Multi-task robot learning holds significant importance in tackling diverse +and complex scenarios. However, current approaches are hindered by performance +issues and difficulties in collecting training datasets. In this paper, we +propose GeRM (Generalist Robotic Model). We utilize offline reinforcement +learning to optimize data utilization strategies to learn from both +demonstrations and sub-optimal data, thus surpassing the limitations of human +demonstrations. Thereafter, we employ a transformer-based VLA network to +process multi-modal inputs and output actions. By introducing the +Mixture-of-Experts structure, GeRM allows faster inference speed with higher +whole model capacity, and thus resolves the issue of limited RL parameters, +enhancing model performance in multi-task learning while controlling +computational costs. Through a series of experiments, we demonstrate that GeRM +outperforms other methods across all tasks, while also validating its +efficiency in both training and inference processes. Additionally, we uncover +its potential to acquire emergent skills. Additionally, we contribute the +QUARD-Auto dataset, collected automatically to support our training approach +and foster advancements in multi-task quadruped robot learning. This work +presents a new paradigm for reducing the cost of collecting robot data and +driving progress in the multi-task learning community. You can reach our +project and video through the link: https://songwxuan.github.io/GeRM/ . + +
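The Mixture-of-Experts structure referenced above grows total capacity while keeping per-token compute small, since only a few experts run for each input token. A minimal top-k-gated MoE layer in its generic form (not GeRM's exact block; a load-balancing loss is omitted):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoE(nn.Module):
    """Generic top-k gated mixture-of-experts feed-forward layer."""
    def __init__(self, dim: int, num_experts: int = 8, k: int = 2, hidden: int = 256):
        super().__init__()
        self.k = k
        self.gate = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(num_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (num_tokens, dim)
        weights = F.softmax(self.gate(x), dim=-1)         # (num_tokens, num_experts)
        topw, topi = weights.topk(self.k, dim=-1)         # route each token to k experts
        topw = topw / topw.sum(dim=-1, keepdim=True)      # renormalize the selected weights
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            hit = (topi == e).any(dim=-1)                 # tokens routed to expert e
            if hit.any():
                w = torch.where(topi[hit] == e, topw[hit],
                                torch.zeros_like(topw[hit])).sum(-1, keepdim=True)
                out[hit] += w * expert(x[hit])
        return out
```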
+
+
+
+
+ + ♻ ☆ Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D + Perception + + +
+ Long-term temporal fusion is a crucial but often overlooked technique in +camera-based Bird's-Eye-View (BEV) 3D perception. Existing methods are mostly +in a parallel manner. While parallel fusion can benefit from long-term +information, it suffers from increasing computational and memory overheads as +the fusion window size grows. Alternatively, BEVFormer adopts a recurrent +fusion pipeline so that history information can be efficiently integrated, yet +it fails to benefit from longer temporal frames. In this paper, we explore an +embarrassingly simple long-term recurrent fusion strategy built upon the +LSS-based methods and find it already able to enjoy the merits from both sides, +i.e., rich long-term information and efficient fusion pipeline. A temporal +embedding module is further proposed to improve the model's robustness against +occasionally missed frames in practical scenarios. We name this simple but +effective fusing pipeline VideoBEV. Experimental results on the nuScenes +benchmark show that VideoBEV obtains strong performance on various camera-based +3D perception tasks, including object detection (55.4\% mAP and 62.9\% NDS), +segmentation (48.6\% vehicle mIoU), tracking (54.8\% AMOTA), and motion +prediction (0.80m minADE and 0.463 EPA). + +
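The recurrent strategy above keeps a single running BEV map and merges each new frame into it, so memory and compute stay flat as the temporal window grows. A toy sketch; the fusion convolution is illustrative and the ego-motion warping of the history map is deliberately omitted:

```python
import torch
import torch.nn as nn

class RecurrentBEVFusion(nn.Module):
    """Fuse a growing history of BEV feature maps one frame at a time."""
    def __init__(self, channels: int):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, bev_frames):            # list of (B, C, H, W) BEV maps, oldest first
        history = bev_frames[0]
        for bev in bev_frames[1:]:
            # in practice the history map is first warped into the current ego frame
            history = self.fuse(torch.cat([history, bev], dim=1))
        return history
```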
+
+
+
+
+ + ♻ ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. Our project page is at +https://ranni-t2i.github.io/Ranni. + +
+
+
+
+
+ + ♻ ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
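A sketch of the core querying idea: embed a (start, end, modality) triple as a single query token and let it cross-attend over the long video's feature sequence. Module names and sizes are illustrative assumptions, not TIM's actual encoder.

```python
import torch
import torch.nn as nn

class IntervalQuery(nn.Module):
    """Classify the action inside a queried time interval of a long feature sequence."""
    def __init__(self, dim: int, num_classes: int, heads: int = 8):
        super().__init__()
        self.interval_embed = nn.Linear(3, dim)           # (start, end, modality flag)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.classifier = nn.Linear(dim, num_classes)

    def forward(self, feats: torch.Tensor, start: float, end: float, is_audio: bool):
        # feats: (B, T, dim) audio-visual features covering the long video
        b = feats.size(0)
        q = self.interval_embed(torch.tensor([[start, end, float(is_audio)]],
                                             device=feats.device))
        q = q.expand(b, 1, -1)                            # one query token per clip
        pooled, _ = self.cross_attn(q, feats, feats)      # attend to the interval and its context
        return self.classifier(pooled.squeeze(1))
```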
+
+ comment: Accepted to CVPR 2024. Project Webpage: + https://jacobchalk.github.io/TIM-Project +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics + + +
+ The recently emerging text-to-motion advances have inspired numerous attempts
+for convenient and interactive human motion generation. Yet, existing methods
+are largely limited to generating body motions only without considering the
+rich two-hand motions, let alone handling various conditions like body dynamics
+or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal
+dataset for two-hand motion generation. Our dataset includes accurate motion
+tracking for the human body and hands and provides pair-wise finger-level hand
+annotations and body descriptions. We further provide a strong baseline method,
+BOTH2Hands, for the novel task: generating vivid two-hand motions from both
+implicit body dynamics and explicit text prompts. We first warm up two parallel
+body-to-hand and text-to-hand diffusion models and then utilize the
+cross-attention transformer for motion blending. Extensive experiments and
+cross-validations demonstrate the effectiveness of our approach and dataset for
+generating convincing two-hand motions from the hybrid body-and-textual
+conditions. Our dataset and code will be disseminated to the community for
+future research.
+
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural +Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced +diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned +ResNet50 architecture, our investigation not only provides effective +differentiation of mammographic images into benign and malignant categories but +also addresses the opaque "black-box" nature of deep learning models by +employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN +decision-making processes for healthcare professionals. Our methodology +encompasses an elaborate data preprocessing pipeline and advanced data +augmentation techniques to counteract dataset limitations, and transfer +learning using pre-trained networks, such as VGG-16, DenseNet and ResNet was +employed. A focal point of our study is the evaluation of XAI's effectiveness +in interpreting model predictions, highlighted by utilising the Hausdorff +measure to assess the alignment between AI-generated explanations and expert +annotations quantitatively. This approach plays a critical role for XAI in +promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The +findings from our research illustrate the effective collaboration between CNNs +and XAI in advancing diagnostic methods for breast cancer, thereby facilitating +a more seamless integration of advanced AI technologies within clinical +settings. By enhancing the interpretability of AI-driven decisions, this work +lays the groundwork for improved collaboration between AI systems and medical +practitioners, ultimately enriching patient care. Furthermore, the implications +of our research extend well beyond the current methodologies, advocating for +subsequent inquiries into the integration of multimodal data and the refinement +of AI explanations to satisfy the needs of clinical practice. + +
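Of the XAI tools named above (Grad-CAM, LIME, SHAP), Grad-CAM is compact enough to sketch: weight a convolutional layer's activations by the gradient of the target class score. A minimal generic version on a torchvision ResNet-50, not the paper's fine-tuned model or preprocessing:

```python
import torch
import torch.nn.functional as F
from torchvision import models

def grad_cam(model, image, target_class, layer):
    """Return a (H, W) heatmap for `target_class`, upsampled to the input size."""
    acts, grads = [], []
    h1 = layer.register_forward_hook(lambda m, i, o: acts.append(o))
    h2 = layer.register_full_backward_hook(lambda m, gi, go: grads.append(go[0]))
    try:
        model.zero_grad()
        model(image)[0, target_class].backward()
        a, g = acts[0], grads[0]                      # both (1, C, h, w)
        weights = g.mean(dim=(2, 3), keepdim=True)    # channel importance
        cam = F.relu((weights * a).sum(dim=1))        # (1, h, w)
        cam = cam / (cam.max() + 1e-8)
        return F.interpolate(cam.unsqueeze(1), size=image.shape[-2:],
                             mode="bilinear", align_corners=False)[0, 0]
    finally:
        h1.remove()
        h2.remove()

model = models.resnet50(weights="IMAGENET1K_V2").eval()
heatmap = grad_cam(model, torch.randn(1, 3, 224, 224), target_class=0, layer=model.layer4)
```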
+
+
+
+
+ + ♻ ☆ Learning Invariant Inter-pixel Correlations for Superpixel Generation AAAI24 + + +
+ Deep superpixel algorithms have made remarkable strides by substituting
+hand-crafted features with learnable ones. Nevertheless, we observe that
+existing deep superpixel methods, serving as mid-level representation
+operations, remain sensitive to the statistical properties (e.g., color
+distribution, high-level semantics) embedded within the training dataset.
+Consequently, learnable features exhibit constrained discriminative capability,
+resulting in unsatisfactory pixel grouping performance, particularly in
+untrainable application scenarios. To address this issue, we propose the
+Content Disentangle Superpixel (CDS) algorithm to selectively separate the
+invariant inter-pixel correlations and statistical properties, i.e., style
+noise. Specifically, we first construct auxiliary modalities that are
+homologous to the original RGB image but have substantial stylistic variations.
+Then, driven by mutual information, we propose the local-grid correlation
+alignment across modalities to reduce the distribution discrepancy of
+adaptively selected features and learn invariant inter-pixel correlations.
+Afterwards, we perform global-style mutual information minimization to enforce
+the separation of invariant content and training data styles. The experimental
+results on four benchmark datasets demonstrate the superiority of our approach
+to existing state-of-the-art methods, regarding boundary adherence,
+generalization, and efficiency. Code and pre-trained model are available at
+https://github.com/rookiie/CDSpixel.
+
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet +and without individuals' consent, raising legal, ethical, and privacy concerns. +With the recent advances in generative models, recently several works proposed +generating synthetic face recognition datasets to mitigate concerns in +web-crawled face recognition datasets. This paper presents the summary of the +Synthetic Data for Face Recognition (SDFR) Competition held in conjunction with +the 18th IEEE International Conference on Automatic Face and Gesture +Recognition (FG 2024) and established to investigate the use of synthetic data +for training face recognition models. The SDFR competition was split into two +tasks, allowing participants to train face recognition systems using new +synthetic datasets and/or existing ones. In the first task, the face +recognition backbone was fixed and the dataset size was limited, while the +second task provided almost complete freedom on the model backbone, the +dataset, and the training pipeline. The submitted models were trained on +existing and also new synthetic datasets and used clever methods to improve +training with synthetic data. The submissions were evaluated and ranked on a +diverse set of seven benchmarking datasets. The paper gives an overview of the +submitted face recognition models and reports achieved performance compared to +baseline models trained on real and synthetic datasets. Furthermore, the +evaluation of submissions is extended to bias assessment across different +demography groups. Lastly, an outlook on the current state of the research in +training face recognition models using synthetic data is presented, and +existing problems as well as potential future directions are also discussed. + +
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ♻ ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Project Page: https://qingqing-zhao.github.io/PhysAvatar +
+
+
+
+
+ + ♻ ☆ Dense Video Object Captioning from Disjoint Supervision + + +
+ We propose a new task and model for dense video object captioning -- +detecting, tracking and captioning trajectories of objects in a video. This +task unifies spatial and temporal localization in video, whilst also requiring +fine-grained visual understanding that is best described by natural language. +We propose a unified model, and demonstrate how our end-to-end approach is more +accurate and temporally coherent than a multi-stage pipeline combining +state-of-the-art detection, tracking, and captioning models. Moreover, we +propose a training strategy based on a mixture of disjoint tasks, which allows +us to leverage diverse, large-scale datasets which supervise different parts of +our model. Although each pretraining task only provides weak supervision, they +are complementary and, when combined, result in noteworthy zero-shot ability +and serve as strong initialization for additional finetuning to further improve +accuracy. We carefully design new metrics capturing all components of our task, +and show how we can repurpose existing video grounding datasets (e.g. VidSTG +and VLN) for our new task. We show that our model improves upon a number of +strong baselines for this new task. Furthermore, we can apply our model to the +task of spatial grounding, outperforming prior state-of-the-art on VidSTG and +VLN, without explicitly training for it. Code is available at +https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc. + +
+
+ comment: Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc +
+
+
+
+
+ + ♻ ☆ Oriented Object Detection in Optical Remote Sensing Images using Deep + Learning: A Survey + + +
+ Oriented object detection is one of the most fundamental and challenging +tasks in remote sensing, aiming to locate and classify objects with arbitrary +orientations. Recent years have witnessed remarkable progress in oriented +object detection using deep learning techniques. Given the rapid development of +this field, this paper aims to provide a comprehensive survey of recent +advances in oriented object detection. To be specific, we first review the +technical evolution from horizontal object detection to oriented object +detection and summarize the specific challenges, including feature +misalignment, spatial misalignment, and periodicity of angle. Subsequently, we +further categorize existing methods into detection framework, oriented bounding +box (OBB) regression, and feature representations, and discuss how these +methods address the above challenges in detail. In addition, we cover several +publicly available datasets and performance evaluation protocols. Furthermore, +we provide a comprehensive comparison and analysis of state-of-the-art oriented +object detection methods. Toward the end of this paper, we discuss several +future directions for oriented object detection. + +
+
+
+
+
+ + ♻ ☆ PeerAiD: Improving Adversarial Distillation from a Specialized Peer + Tutor CVPR 2024 + + +
+ Adversarial robustness of the neural network is a significant concern when it
+is applied to security-critical domains. In this situation, adversarial
+distillation is a promising option which aims to distill the robustness of the
+teacher network to improve the robustness of a small student network. Previous
+works pretrain the teacher network to make it robust to the adversarial
+examples aimed at itself. However, the adversarial examples are dependent on
+the parameters of the target network. The fixed teacher network inevitably
+degrades its robustness against the unseen transferred adversarial examples
+which target the parameters of the student network in the adversarial
+distillation process. We propose PeerAiD to make a peer network learn the
+adversarial examples of the student network instead of adversarial examples
+aimed at itself. PeerAiD is an adversarial distillation method that trains the
+peer network and the student network simultaneously in order to make the peer
+network specialized for defending the student network. We observe that such
+peer networks surpass the robustness of the pretrained robust teacher network
+against student-attacked adversarial samples. With this peer network and
+adversarial distillation, PeerAiD achieves significantly higher robustness of
+the student network with AutoAttack (AA) accuracy up to 1.66%p and improves the
+natural accuracy of the student network up to 4.72%p with ResNet-18 and the
+TinyImageNet dataset.
+
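A rough sketch of one training step in the spirit of the description above: craft adversarial examples against the student, train the peer on them, and distill the peer's predictions into the student. The PGD recipe, temperature, and loss combination are assumptions for illustration, not PeerAiD's exact objective.

```python
import torch
import torch.nn.functional as F

def peer_distillation_step(student, peer, x, y, eps=8/255, alpha=2/255, steps=10, temp=4.0):
    """Return (peer_loss, student_loss) for one batch of clean images x with labels y."""
    # PGD attack targeting the *student* network
    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
    for _ in range(steps):
        x_adv = x_adv.detach().requires_grad_(True)
        grad = torch.autograd.grad(F.cross_entropy(student(x_adv), y), x_adv)[0]
        x_adv = (x_adv + alpha * grad.sign()).clamp(x - eps, x + eps).clamp(0, 1)
    x_adv = x_adv.detach()

    peer_logits = peer(x_adv)
    peer_loss = F.cross_entropy(peer_logits, y)          # peer learns to defend the student
    student_logits = student(x_adv)
    distill = F.kl_div(F.log_softmax(student_logits / temp, dim=1),
                       F.softmax(peer_logits.detach() / temp, dim=1),
                       reduction="batchmean") * temp * temp
    student_loss = distill + F.cross_entropy(student_logits, y)
    return peer_loss, student_loss
```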
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion + Models with RL Finetuning CVPR 2024 + + +
+ Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT) +to text-to-image diffusion models, have driven recent breakthroughs in +text-to-3D research. However, due to the limited size and quality of existing +3D datasets, they still suffer from multi-view inconsistencies and Neural +Radiance Field (NeRF) reconstruction artifacts. We argue that multi-view +diffusion models can benefit from further Reinforcement Learning Finetuning +(RLFT), which allows models to learn from the data generated by themselves and +improve beyond their dataset limitations during SFT. To this end, we introduce +Carve3D, an improved RLFT algorithm coupled with a novel Multi-view +Reconstruction Consistency (MRC) metric, to enhance the consistency of +multi-view diffusion models. To measure the MRC metric on a set of multi-view +images, we compare them with their corresponding NeRF renderings at the same +camera viewpoints. The resulting model, which we denote as Carve3DM, +demonstrates superior multi-view consistency and NeRF reconstruction quality +than existing models. Our results suggest that pairing SFT with Carve3D's RLFT +is essential for developing multi-view-consistent diffusion models, mirroring +the standard Large Language Model (LLM) alignment pipeline. Our code, training +and testing data, and video results are available at: +https://desaixie.github.io/carve-3d. + +
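The MRC loop ends by re-rendering the NeRF (fitted to the generated views) at the same camera poses and scoring how well the renderings match the generated images. A toy stand-in for that final comparison using PSNR; the paper defines MRC with its own image comparison, so treat this purely as an illustration:

```python
import torch

def view_consistency_score(generated: torch.Tensor, nerf_renders: torch.Tensor) -> torch.Tensor:
    """Mean PSNR between generated views and NeRF re-renderings, both (N, 3, H, W) in [0, 1]."""
    mse = ((generated - nerf_renders) ** 2).flatten(1).mean(dim=1).clamp_min(1e-10)
    psnr = 10.0 * torch.log10(1.0 / mse)
    return psnr.mean()          # higher = the views agree with a single 3D reconstruction
```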
+
+ comment: 22 pages, 16 figures. Our code, training and testing data, and video + results are available at: https://desaixie.github.io/carve-3d. This paper has + been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024 + camera-ready version +
+
+
+
+
+ + ♻ ☆ Surface Reconstruction from Point Clouds via Grid-based Intersection + Prediction + + +
+ Surface reconstruction from point clouds is a crucial task in the fields of +computer vision and computer graphics. SDF-based methods excel at +reconstructing smooth meshes with minimal error and artefacts but struggle with +representing open surfaces. On the other hand, UDF-based methods can +effectively represent open surfaces but often introduce noise, leading to +artefacts in the mesh. In this work, we propose a novel approach that directly +predicts the intersection points between line segment of point pairs and +implicit surfaces. To achieve it, we propose two modules named Relative +Intersection Module and Sign Module respectively with the feature of point pair +as input. To preserve the continuity of the surface, we also integrate symmetry +into the two modules, which means the position of predicted intersection will +not change even if the input order of the point pair changes. This method not +only preserves the ability to represent open surfaces but also eliminates most +artefacts on the mesh. Our approach demonstrates state-of-the-art performance +on three datasets: ShapeNet, MGN, and ScanNet. The code will be made available +upon acceptance. + +
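The symmetry requirement above means the predicted intersection must not change when the two points of a pair are swapped. One simple way to enforce that, shown only to illustrate the idea (not the paper's Relative Intersection and Sign modules): pool the pair feature over both orderings and predict an offset from the midpoint, which is itself symmetric.

```python
import torch
import torch.nn as nn

class SymmetricIntersectionHead(nn.Module):
    """Order-invariant prediction of the surface crossing on the segment p--q."""
    def __init__(self, feat_dim: int, hidden: int = 128):
        super().__init__()
        self.pair_mlp = nn.Sequential(nn.Linear(2 * feat_dim, hidden), nn.ReLU())
        self.offset_head = nn.Linear(hidden, 3)

    def forward(self, f_p, f_q, p, q):
        # averaging over both concatenation orders makes the embedding symmetric in (p, q)
        pair = 0.5 * (self.pair_mlp(torch.cat([f_p, f_q], dim=-1)) +
                      self.pair_mlp(torch.cat([f_q, f_p], dim=-1)))
        return 0.5 * (p + q) + self.offset_head(pair)   # predicted intersection point
```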
+
+
+
+
+ + ♻ ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive + Smoothing + + +
+ While prior research has proposed a plethora of methods that build neural
+classifiers robust against adversarial perturbations, practitioners are still
+reluctant to adopt them due to their unacceptably severe clean accuracy
+penalties. This paper significantly alleviates this accuracy-robustness
+trade-off by mixing the output probabilities of a standard classifier and a
+robust classifier, where the standard network is optimized for clean accuracy
+and is not robust in general. We show that the robust base classifier's
+confidence difference for correct and incorrect examples is the key to this
+improvement. In addition to providing intuitions and empirical evidence, we
+theoretically certify the robustness of the mixed classifier under realistic
+assumptions. Furthermore, we adapt an adversarial input detector into a mixing
+network that adaptively adjusts the mixture of the two base models, further
+reducing the accuracy penalty of achieving robustness. The proposed flexible
+method, termed "adaptive smoothing", can work in conjunction with existing or
+even future methods that improve clean accuracy, robustness, or adversary
+detection. Our empirical evaluation considers strong attack methods, including
+AutoAttack and adaptive attacks. On the CIFAR-100 dataset, our method achieves
+an 85.21% clean accuracy while maintaining a 38.72% $\ell_\infty$-AutoAttacked
+($\epsilon = 8/255$) accuracy, becoming the second most robust method on the
+RobustBench CIFAR-100 benchmark as of submission, while improving the clean
+accuracy by ten percentage points compared with all listed models. The code
+that implements our method is available at
+https://github.com/Bai-YT/AdaptiveSmoothing.
+
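The central mechanism, mixing the output probabilities of an accurate standard classifier and a robust classifier, is easy to sketch. Here the mixing weight is a fixed scalar; the paper's adaptive variant predicts it per input with a mixing network built from an adversarial-input detector.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MixedClassifier(nn.Module):
    """Convex combination of two classifiers' output probabilities."""
    def __init__(self, std_model: nn.Module, robust_model: nn.Module, alpha: float = 0.5):
        super().__init__()
        self.std_model = std_model        # optimized for clean accuracy
        self.robust_model = robust_model  # adversarially trained
        self.alpha = alpha                # weight on the robust branch

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        p_std = F.softmax(self.std_model(x), dim=1)
        p_rob = F.softmax(self.robust_model(x), dim=1)
        mixed = (1.0 - self.alpha) * p_std + self.alpha * p_rob
        return mixed.log()                # log-probabilities; argmax gives the prediction
```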
+
+
+
+
+ + ♻ ☆ SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor + Scenes + + +
+ We propose SIR, an efficient method to decompose differentiable shadows for +inverse rendering on indoor scenes using multi-view data, addressing the +challenges in accurately decomposing the materials and lighting conditions. +Unlike previous methods that struggle with shadow fidelity in complex lighting +environments, our approach explicitly learns shadows for enhanced realism in +material estimation under unknown light positions. Utilizing posed HDR images +as input, SIR employs an SDF-based neural radiance field for comprehensive +scene representation. Then, SIR integrates a shadow term with a three-stage +material estimation approach to improve SVBRDF quality. Specifically, SIR is +designed to learn a differentiable shadow, complemented by BRDF regularization, +to optimize inverse rendering accuracy. Extensive experiments on both synthetic +and real-world indoor scenes demonstrate the superior performance of SIR over +existing methods in both quantitative metrics and qualitative analysis. The +significant decomposing ability of SIR enables sophisticated editing +capabilities like free-view relighting, object insertion, and material +replacement. The code and data are available at +https://xiaokangwei.github.io/SIR/. + +
+
+
+
+
+ + ♻ ☆ Toward Tiny and High-quality Facial Makeup with Data Amplify Learning + + +
+ Contemporary makeup approaches primarily hinge on unpaired learning +paradigms, yet they grapple with the challenges of inaccurate supervision +(e.g., face misalignment) and sophisticated facial prompts (including face +parsing, and landmark detection). These challenges prohibit low-cost deployment +of facial makeup models, especially on mobile devices. To solve above problems, +we propose a brand-new learning paradigm, termed "Data Amplify Learning (DAL)," +alongside a compact makeup model named "TinyBeauty." The core idea of DAL lies +in employing a Diffusion-based Data Amplifier (DDA) to "amplify" limited images +for the model training, thereby enabling accurate pixel-to-pixel supervision +with merely a handful of annotations. Two pivotal innovations in DDA facilitate +the above training approach: (1) A Residual Diffusion Model (RDM) is designed +to generate high-fidelity detail and circumvent the detail vanishing problem in +the vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is +proposed to achieve precise makeup control and combination while retaining face +identity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to +achieve a state-of-the-art performance without intricate face prompts. +Meanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on +the iPhone 13. Extensive experiments show that DAL can produce highly +competitive makeup models using only 5 image pairs. + +
+
+
+
+
+ + ♻ ☆ Harnessing Meta-Learning for Improving Full-Frame Video Stabilization CVPR 2024 + + +
+ Video stabilization is a longstanding computer vision problem, particularly +pixel-level synthesis solutions for video stabilization which synthesize full +frames add to the complexity of this task. These techniques aim to stabilize +videos by synthesizing full frames while enhancing the stability of the +considered video. This intensifies the complexity of the task due to the +distinct mix of unique motion profiles and visual content present in each video +sequence, making robust generalization with fixed parameters difficult. In our +study, we introduce a novel approach to enhance the performance of pixel-level +synthesis solutions for video stabilization by adapting these models to +individual input video sequences. The proposed adaptation exploits low-level +visual cues accessible during test-time to improve both the stability and +quality of resulting videos. We highlight the efficacy of our methodology of +"test-time adaptation" through simple fine-tuning of one of these models, +followed by significant stability gain via the integration of meta-learning +techniques. Notably, significant improvement is achieved with only a single +adaptation step. The versatility of the proposed algorithm is demonstrated by +consistently improving the performance of various pixel-level synthesis models +for video stabilization in real-world scenarios. + +
+
+ comment: CVPR 2024, Code will be made availble on: + http://github.com/MKashifAli/MetaVideoStab +
+
+
+
+
+ + ♻ ☆ Detecting and Mitigating System-Level Anomalies of Vision-Based + Controllers + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade to catastrophic +system failures and compromise system safety. In this work, we introduce a +run-time anomaly monitor to detect and mitigate such closed-loop, system-level +failures. Specifically, we leverage a reachability-based framework to +stress-test the vision-based controller offline and mine its system-level +failures. This data is then used to train a classifier that is leveraged online +to flag inputs that might cause system breakdowns. The anomaly detector +highlights issues that transcend individual modules and pertain to the safety +of the overall system. We also design a fallback controller that robustly +handles these detected anomalies to preserve system safety. We validate the +proposed approach on an autonomous aircraft taxiing system that uses a +vision-based controller for taxiing. Our results show the efficacy of the +proposed approach in identifying and handling system-level anomalies, +outperforming methods such as prediction error-based detection, and ensembling, +thereby enhancing the overall safety and robustness of autonomous systems. + +
+
+
+
+
+ + ♻ ☆ Rich Human Feedback for Text-to-Image Generation CVPR'24 + + +
+ Recent Text-to-Image (T2I) generation models such as Stable Diffusion and +Imagen have made significant progress in generating high-resolution images +based on text descriptions. However, many generated images still suffer from +issues such as artifacts/implausibility, misalignment with text descriptions, +and low aesthetic quality. Inspired by the success of Reinforcement Learning +with Human Feedback (RLHF) for large language models, prior works collected +human-provided scores as feedback on generated images and trained a reward +model to improve the T2I generation. In this paper, we enrich the feedback +signal by (i) marking image regions that are implausible or misaligned with the +text, and (ii) annotating which words in the text prompt are misrepresented or +missing on the image. We collect such rich human feedback on 18K generated +images (RichHF-18K) and train a multimodal transformer to predict the rich +feedback automatically. We show that the predicted rich human feedback can be +leveraged to improve image generation, for example, by selecting high-quality +training data to finetune and improve the generative models, or by creating +masks with predicted heatmaps to inpaint the problematic regions. Notably, the +improvements generalize to models (Muse) beyond those used to generate the +images on which human feedback data were collected (Stable Diffusion variants). +The RichHF-18K data set will be released in our GitHub repository: +https://github.com/google-research/google-research/tree/master/richhf_18k. + +
+
+ comment: CVPR'24 +
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in +computational pathology, encompassing sub-typing, diagnosis, prognosis, and +more. However, the existing MIL paradigm typically requires an offline instance +feature extractor, such as a pre-trained ResNet or a foundation model. This +approach lacks the capability for feature fine-tuning within the specific +downstream tasks, limiting its adaptability and performance. To address this +issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding +the instance features online, which captures fine-grained local features and +establishes connections across different regions. Unlike existing works that +focus on pre-training powerful feature extractor or designing sophisticated +instance aggregator, R$^2$T is tailored to re-embed instance features online. +It serves as a portable module that can seamlessly integrate into mainstream +MIL models. Extensive experimental results on common computational pathology +tasks validate that: 1) feature re-embedding improves the performance of MIL +models based on ResNet-50 features to the level of foundation model features, +and further enhances the performance of foundation model features; 2) the +R$^2$T can introduce more significant performance improvements to various MIL +models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest +methods by a large margin.The code is available at: +https://github.com/DearCaat/RRT-MIL. + +
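A minimal sketch of the "re-embed online, then aggregate" idea: frozen offline patch features are passed through a small Transformer before a gated-attention MIL pooling head. The plain (non-regional) Transformer and the layer sizes are simplifying assumptions rather than R$^2$T's actual design.

```python
import torch
import torch.nn as nn

class ReembedMIL(nn.Module):
    """Slide-level classifier: online re-embedding + attention pooling over patch features."""
    def __init__(self, dim: int = 1024, num_classes: int = 2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
        self.reembed = nn.TransformerEncoder(layer, num_layers=2)
        self.attn = nn.Sequential(nn.Linear(dim, 128), nn.Tanh(), nn.Linear(128, 1))
        self.cls = nn.Linear(dim, num_classes)

    def forward(self, feats: torch.Tensor) -> torch.Tensor:  # feats: (1, num_patches, dim)
        h = self.reembed(feats)                    # re-embedded instance features
        a = torch.softmax(self.attn(h), dim=1)     # (1, num_patches, 1) attention weights
        bag = (a * h).sum(dim=1)                   # attention-pooled bag embedding
        return self.cls(bag)
```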
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used
+imaging modality in broad clinical applications. One of the most important
+tradeoffs in PET imaging is between image quality and radiation dose: high
+image quality comes with high radiation exposure. Improving image quality is
+desirable for all clinical applications while minimizing radiation exposure is
+needed to reduce risk to patients. Approach: We introduce PET Consistency Model
+(PET-CM), an efficient diffusion-based method for generating high-quality
+full-dose PET images from low-dose PET images. It employs a two-step process,
+adding Gaussian noise to full-dose PET images in the forward diffusion, and
+then denoising them using a PET Shifted-window Vision Transformer (PET-VIT)
+network in the reverse diffusion. The PET-VIT network learns a consistency
+function that enables direct denoising of Gaussian noise into clean full-dose
+PET images. PET-CM achieves state-of-the-art image quality while requiring
+significantly less computation time than other methods. Results: In experiments
+comparing eighth-dose to full-dose images, PET-CM demonstrated impressive
+performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of
+0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of
+0.255+/-0.318%, with an average generation time of 62 seconds per patient. This
+is a significant improvement compared to the state-of-the-art diffusion-based
+model, with PET-CM reaching this result 12x faster. Similarly, in the
+quarter-dose to full-dose image experiments, PET-CM delivered competitive
+outcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM
+of 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of
+0.151+/-0.192% using the same generation process, underlining its high
+quantitative and clinical precision in both denoising scenarios.
+
+
+
+
+
+ + ♻ ☆ SocialCounterfactuals: Probing and Mitigating Intersectional Social + Biases in Vision-Language Models with Counterfactual Examples CVPR 2024 + + +
+ While vision-language models (VLMs) have achieved remarkable performance
+improvements recently, there is growing evidence that these models also possess
+harmful biases with respect to social attributes such as gender and race. Prior
+studies have primarily focused on probing such bias attributes individually
+while ignoring biases associated with intersections between social attributes.
+This could be due to the difficulty of collecting an exhaustive set of
+image-text pairs for various combinations of social attributes. To address this
+challenge, we employ text-to-image diffusion models to produce counterfactual
+examples for probing intersectional social biases at scale. Our approach
+utilizes Stable Diffusion with cross attention control to produce sets of
+counterfactual image-text pairs that are highly similar in their depiction of a
+subject (e.g., a given occupation) while differing only in their depiction of
+intersectional social attributes (e.g., race & gender). Through our
+over-generate-then-filter methodology, we produce SocialCounterfactuals, a
+high-quality dataset containing 171k image-text pairs for probing
+intersectional biases related to gender, race, and physical characteristics. We
+conduct extensive experiments to demonstrate the usefulness of our generated
+dataset for probing and mitigating intersectional social biases in
+state-of-the-art VLMs.
+
+
+ comment: Accepted to CVPR 2024. arXiv admin note: text overlap with + arXiv:2310.02988 +
+
+
+
+
+ + ♻ ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2024 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +proposed a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gain (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. + +
+
+ comment: Accepted by ICRA 2024. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ♻ ☆ $λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion + Models by Leveraging CLIP Latent Space + + +
+ Despite the recent advances in personalized text-to-image (P-T2I) generative +models, it remains challenging to perform finetuning-free multi-subject-driven +T2I in a resource-efficient manner. Predominantly, contemporary approaches, +involving the training of Hypernetworks and Multimodal Large Language Models +(MLLMs), require heavy computing resources that range from 600 to 12300 GPU +hours of training. These subject-driven T2I methods hinge on Latent Diffusion +Models (LDMs), which facilitate T2I mapping through cross-attention layers. +While LDMs offer distinct advantages, P-T2I methods' reliance on the latent +space of these diffusion models significantly escalates resource demands, +leading to inconsistent results and necessitating numerous iterations for a +single desired image. In this paper, we present $\lambda$-ECLIPSE, an +alternative prior-training strategy that works in the latent space of a +pre-trained CLIP model without relying on the diffusion UNet models. +$\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast +and effective multi-subject-driven P-T2I. Through extensive experiments, we +establish that $\lambda$-ECLIPSE surpasses existing baselines in composition +alignment while preserving concept alignment performance, even with +significantly lower resource utilization. $\lambda$-ECLIPSE performs +multi-subject driven P-T2I with just 34M parameters and is trained on a mere 74 +GPU hours. Additionally, $\lambda$-ECLIPSE demonstrates the unique ability to +perform multi-concept interpolations. + +
+
+ comment: Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/ +
+
+
+
+
+ + ♻ ☆ Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized + Narratives from Open-Source Histopathology Videos + + +
+ Diagnosis in histopathology requires a global analysis of whole slide images
+(WSIs), requiring pathologists to compound evidence from different WSI
+patches. The gigapixel scale of WSIs poses a challenge for histopathology
+multi-modal models. Training multi-modal models for histopathology requires
+instruction tuning datasets, which currently contain information for individual
+image patches, without a spatial grounding of the concepts within each patch
+and without a wider view of the WSI. Therefore, they lack sufficient diagnostic
+capacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a
+large-scale dataset of 107,131 histopathology-specific instruction
+question/answer pairs, grounded within diagnostically relevant image patches
+that make up the WSI. Our dataset is collected by leveraging educational
+histopathology videos from YouTube, which provide spatial localization of
+narrations by automatically extracting the narrators' cursor positions.
+Quilt-Instruct supports contextual reasoning by extracting diagnosis and
+supporting facts from the entire WSI. Using Quilt-Instruct, we train
+Quilt-LLaVA, which can reason beyond the given single image patch, enabling
+diagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a
+comprehensive evaluation dataset created from 985 images and 1283
+human-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using
+public histopathology datasets, where Quilt-LLaVA significantly outperforms
+SOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set
+VQA. Our code, data, and model are publicly accessible at
+quilt-llava.github.io.
+
+
+
+
+
+ + ♻ ☆ Mitigating the Impact of Attribute Editing on Face Recognition + + +
+ Through a large-scale study over diverse face images, we show that facial +attribute editing using modern generative AI models can severely degrade +automated face recognition systems. This degradation persists even with +identity-preserving generative models. To mitigate this issue, we propose two +novel techniques for local and global attribute editing. We empirically ablate +twenty-six facial semantic, demographic and expression-based attributes that +have been edited using state-of-the-art generative models, and evaluate them +using ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets. +Finally, we use LLaVA, an emerging visual question-answering framework for +attribute prediction to validate our editing techniques. Our methods outperform +the current state-of-the-art at facial editing (BLIP, InstantID) while +improving identity retention by a significant extent. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using
+a method designed in the source modality. Deep generative models can translate
+the target modality images into the source modality, thus enabling
+cross-modality segmentation. However, a vast body of existing cross-modality
+image translation methods relies on supervised learning. In this work, we aim
+to address the challenge of zero-shot learning-based image translation tasks
+(extreme scenarios in which the target modality is unseen in the training
+phase). To leverage generative learning for zero-shot cross-modality image
+segmentation, we propose a novel unsupervised image translation method. The
+framework learns to translate the unseen source image to the target modality
+for image segmentation by leveraging the inherent statistical consistency
+between different modalities for diffusion guidance. Our framework captures
+identical cross-modality features in the statistical domain, offering diffusion
+guidance without relying on direct mappings between the source and target
+domains. This advantage allows our method to adapt to changing source domains
+without the need for retraining, making it highly practical when sufficient
+labeled source domain data is not available. The proposed framework is
+validated in zero-shot cross-modality image segmentation tasks through
+empirical comparisons with influential generative models, including
+adversarial-based and diffusion-based models.
+
+
+ comment: Neurips 2023 Diffusion Workshop +
+
+
+
+
+ + ♻ ☆ Local Neighborhood Features for 3D Classification + + +
+ With advances in deep learning model training strategies, the training of +Point cloud classification methods is significantly improving. For example, +PointNeXt, which adopts prominent training techniques and InvResNet layers into +PointNet++, achieves over 7% improvement on the real-world ScanObjectNN +dataset. However, most of these models use point coordinates features of +neighborhood points mapped to higher dimensional space while ignoring the +neighborhood point features computed before feeding to the network layers. In +this paper, we revisit the PointNeXt model to study the usage and benefit of +such neighborhood point features. We train and evaluate PointNeXt on ModelNet40 +(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world +grocery dataset, i.e., 3DGrocery100. In addition, we provide an additional +inference strategy of weight averaging the top two checkpoints of PointNeXt to +improve classification accuracy. Together with the abovementioned ideas, we +gain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model +with real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's +Apple10, Fruits, Vegetables, and Packages subsets, respectively. We also +achieve a comparable 0.2% accuracy gain on ModelNet40. + +
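The inference trick mentioned above, weight-averaging the top two checkpoints, is straightforward. A sketch assuming each checkpoint file stores a plain state_dict (the paths are placeholders):

```python
import torch

def average_checkpoints(path_a: str, path_b: str, out_path: str) -> None:
    """Save the element-wise mean of two checkpoints' weights."""
    sd_a = torch.load(path_a, map_location="cpu")
    sd_b = torch.load(path_b, map_location="cpu")
    averaged = {}
    for k in sd_a:
        if torch.is_floating_point(sd_a[k]):
            averaged[k] = 0.5 * (sd_a[k] + sd_b[k])   # mean of float tensors
        else:
            averaged[k] = sd_a[k]                     # e.g. integer buffers: keep as-is
    torch.save(averaged, out_path)

# usage (hypothetical file names):
# average_checkpoints("ckpt_best.pth", "ckpt_second_best.pth", "ckpt_avg.pth")
```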
+
+
+
+
+ + ♻ ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ♻ ☆ A dataset of over one thousand computed tomography scans of battery + cells + + +
+ Battery technology is increasingly important for global electrification +efforts. However, batteries are highly sensitive to small manufacturing +variations that can induce reliability or safety issues. An important +technology for battery quality control is computed tomography (CT) scanning, +which is widely used for non-destructive 3D inspection across a variety of +clinical and industrial applications. Historically, however, the utility of CT +scanning for high-volume manufacturing has been limited by its low throughput +as well as the difficulty of handling its large file sizes. In this work, we +present a dataset of over one thousand CT scans of as-produced commercially +available batteries. The dataset spans various chemistries (lithium-ion and +sodium-ion) as well as various battery form factors (cylindrical, pouch, and +prismatic). We evaluate seven different battery types in total. The +manufacturing variability and the presence of battery defects can be observed +via this dataset. This dataset may be of interest to scientists and engineers +working on battery technology, computer vision, or both. + +
+
+
+
+
+ + ♻ ☆ TAM-VT: Transformation-Aware Multi-scale Video Transformer for + Segmentation and Tracking + + +
+ Video Object Segmentation (VOS) has emerged as an increasingly important +problem with availability of larger datasets and more complex and realistic +settings, which involve long videos with global motion (e.g, in egocentric +settings), depicting small objects undergoing both rigid and non-rigid +(including state) deformations. While a number of recent approaches have been +explored for this task, these data characteristics still present challenges. In +this work we propose a novel, clip-based DETR-style encoder-decoder +architecture, which focuses on systematically analyzing and addressing +aforementioned challenges. Specifically, we propose a novel +transformation-aware loss that focuses learning on portions of the video where +an object undergoes significant deformations -- a form of "soft" hard examples +mining. Further, we propose a multiplicative time-coded memory, beyond vanilla +additive positional encoding, which helps propagate context across long videos. +Finally, we incorporate these in our proposed holistic multi-scale video +transformer for tracking via multi-scale memory matching and decoding to ensure +sensitivity and accuracy for long videos and small objects. Our model enables +on-line inference with long videos in a windowed fashion, by breaking the video +into clips and propagating context among them. We illustrate that short clip +length and longer memory with learned time-coding are important design choices +for improved performance. Collectively, these technical contributions enable +our model to achieve new state-of-the-art (SoTA) performance on two complex +egocentric datasets -- VISOR and VOST, while achieving comparable to SoTA +results on the conventional VOS benchmark, DAVIS'17. A series of detailed +ablations validate our design choices as well as provide insights into the +importance of parameter choices and their impact on performance. + +
+
+
+
+
+ + ♻ ☆ Lane Change Classification and Prediction with Action Recognition + Networks ECCV2022 + + +
+ Anticipating lane change intentions of surrounding vehicles is crucial for +efficient and safe driving decision making in an autonomous driving system. +Previous works often adopt physical variables such as driving speed, +acceleration and so forth for lane change classification. However, physical +variables do not contain semantic information. Although 3D CNNs have been +developing rapidly, the number of methods utilising action recognition models +and appearance feature for lane change recognition is low, and they all require +additional information to pre-process data. In this work, we propose an +end-to-end framework including two action recognition methods for lane change +recognition, using video data collected by cameras. Our method achieves the +best lane change classification results using only the RGB video data of the +PREVENTION dataset. Class activation maps demonstrate that action recognition +models can efficiently extract lane change motions. A method to better extract +motion clues is also proposed in this paper. + +
+
+ comment: Accepted to ECCV2022 AVVISION +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 167 + +
+
+
+ + ☆ Finding Visual Task Vectors + + +
+ Visual Prompting is a technique for teaching models to perform a visual task +via in-context examples, without any additional training. In this work, we +analyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find +task vectors, activations that encode task-specific information. Equipped with +this insight, we demonstrate that it is possible to identify the task vectors +and use them to guide the network towards performing different tasks without +providing any input-output examples. To find task vectors, we compute the +average intermediate activations per task and use the REINFORCE algorithm to +search for the subset of task vectors. The resulting task vectors guide the +model towards performing a task better than the original model without the need +for input-output examples. + +
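+
+ A minimal Python sketch (the hook-based averaging, layer names and data
+loader are illustrative assumptions, not the authors' released code) of
+collecting per-task mean intermediate activations, the quantities from which
+task vectors are then selected:
+
+     import torch
+
+     def mean_task_activations(model, layer_names, task_loader, device="cpu"):
+         """Average intermediate activations of selected layers over one task."""
+         sums, counts, handles = {}, {}, []
+
+         def make_hook(name):
+             def hook(_module, _inp, out):
+                 sums[name] = sums.get(name, 0) + out.detach().float().mean(dim=0)
+                 counts[name] = counts.get(name, 0) + 1
+             return hook
+
+         for name, module in model.named_modules():
+             if name in layer_names:
+                 handles.append(module.register_forward_hook(make_hook(name)))
+
+         model.eval()
+         with torch.no_grad():
+             for batch in task_loader:          # batch: input tensor for this task
+                 model(batch.to(device))
+
+         for h in handles:
+             h.remove()
+         return {name: sums[name] / counts[name] for name in sums}
+
+ The paper then searches with REINFORCE for the subset of these averaged
+activations that best guides the model; the sketch stops at the averaging step.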
+
+ comment: https://github.com/alhojel/visual_task_vectors +
+
+
+
+
+ + ☆ MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video + Understanding CVPR 2024 + + +
+ With the success of large language models (LLMs), integrating the vision +model into LLMs to build vision-language foundation models has gained much more +interest recently. However, existing LLM-based large multimodal models (e.g., +Video-LLaMA, VideoChat) can only take in a limited number of frames for short +video understanding. In this study, we mainly focus on designing an efficient +and effective model for long-term video understanding. Instead of trying to +process more frames simultaneously like most existing work, we propose to +process videos in an online manner and store past video information in a memory +bank. This allows our model to reference historical video content for long-term +analysis without exceeding LLMs' context length constraints or GPU memory +limits. Our memory bank can be seamlessly integrated into current multimodal +LLMs in an off-the-shelf manner. We conduct extensive experiments on various +video understanding tasks, such as long-video understanding, video question +answering, and video captioning, and our model can achieve state-of-the-art +performances across multiple datasets. Code available at +https://boheumd.github.io/MA-LMM/. + +
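+
+ A minimal Python sketch (the fixed-capacity FIFO policy and feature shapes
+are assumptions for illustration, not the paper's exact memory design) of an
+online frame-feature memory bank that bounds context length:
+
+     import collections
+     import torch
+
+     class FrameMemoryBank:
+         """Stores visual features of past frames up to a fixed capacity."""
+
+         def __init__(self, capacity: int = 256):
+             self.buffer = collections.deque(maxlen=capacity)
+
+         def add(self, frame_feature: torch.Tensor) -> None:
+             # frame_feature: (num_tokens, dim) for a single processed frame
+             self.buffer.append(frame_feature.detach())
+
+         def read(self) -> torch.Tensor:
+             # Concatenated history the language model can attend to as context.
+             return torch.cat(list(self.buffer), dim=0)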
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have been +noteworthy, yet, these general-domain MLLMs often fall short in their ability +to comprehend and interact effectively with user interface (UI) screens. In +this paper, we present Ferret-UI, a new MLLM tailored for enhanced +understanding of mobile UI screens, equipped with referring, grounding, and +reasoning capabilities. Given that UI screens typically exhibit a more +elongated aspect ratio and contain smaller objects of interest (e.g., icons, +texts) than natural images, we incorporate "any resolution" on top of Ferret to +magnify details and leverage enhanced visual features. Specifically, each +screen is divided into 2 sub-images based on the original aspect ratio (i.e., +horizontal division for portrait screens and vertical division for landscape +screens). Both sub-images are encoded separately before being sent to LLMs. We +meticulously gather training samples from an extensive range of elementary UI +tasks, such as icon recognition, find text, and widget listing. These samples +are formatted for instruction-following with region annotations to facilitate +precise referring and grounding. To augment the model's reasoning ability, we +further compile a dataset for advanced tasks, including detailed description, +perception/interaction conversations, and function inference. After training on +the curated datasets, Ferret-UI exhibits outstanding comprehension of UI +screens and the capability to execute open-ended instructions. For model +evaluation, we establish a comprehensive benchmark encompassing all the +aforementioned tasks. Ferret-UI excels not only beyond most open-source UI +MLLMs, but also surpasses GPT-4V on all the elementary UI tasks. + +
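+
+ A minimal Python sketch (using PIL; the half-and-half crop is an illustrative
+reading of the described aspect-ratio-based division, not the released code)
+of splitting a screen into two sub-images:
+
+     from PIL import Image
+
+     def split_screen(img: Image.Image):
+         """Portrait screens are cut horizontally, landscape screens vertically."""
+         w, h = img.size
+         if h >= w:   # portrait: top and bottom halves
+             return img.crop((0, 0, w, h // 2)), img.crop((0, h // 2, w, h))
+         return img.crop((0, 0, w // 2, h)), img.crop((w // 2, 0, w, h))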
+
+
+
+
+ + ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing + + +
+ Effective editing of personal content holds a pivotal role in enabling +individuals to express their creativity, weaving captivating narratives within +their visual stories, and elevate the overall quality and impact of their +visual content. Therefore, in this work, we introduce SwapAnything, a novel +framework that can swap any objects in an image with personalized concepts +given by the reference, while keeping the context unchanged. Compared with +existing methods for personalized subject swapping, SwapAnything has three +unique advantages: (1) precise control of arbitrary objects and parts rather +than the main subject, (2) more faithful preservation of context pixels, (3) +better adaptation of the personalized concept to the image. First, we propose +targeted variable swapping to apply region control over latent feature maps and +swap masked variables for faithful context preservation and initial semantic +concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt +the semantic concept into the original image in terms of target location, +shape, style, and content during the image generation process. Extensive +results on both human and automatic evaluation demonstrate significant +improvements of our approach over baseline methods on personalized swapping. +Furthermore, SwapAnything shows its precise and faithful swapping abilities +across single object, multiple objects, partial object, and cross-domain +swapping tasks. SwapAnything also achieves great performance on text-based +swapping and tasks beyond swapping such as object insertion. + +
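+
+ A minimal Python sketch (plain tensor arithmetic; the mask semantics are an
+illustrative reading of "targeted variable swapping", not the authors'
+implementation) of swapping masked latent variables while keeping context
+latents untouched:
+
+     import torch
+
+     def targeted_variable_swap(source_latent, concept_latent, region_mask):
+         """source_latent, concept_latent: (C, H, W); region_mask: (H, W) in {0, 1}.
+         Latents inside the mask come from the concept, the rest from the source."""
+         mask = region_mask.unsqueeze(0).to(source_latent.dtype)   # (1, H, W)
+         return mask * concept_latent + (1.0 - mask) * source_latent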
+
+ comment: 18 pages, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic + Segmentation for Satellite Imagery + + +
+ Satellite imagery is crucial for tasks like environmental monitoring and +urban planning. Typically, it relies on semantic segmentation or Land Use Land +Cover (LULC) classification to categorize each pixel. Despite the advancements +brought about by Deep Neural Networks (DNNs), their performance in segmentation +tasks is hindered by challenges such as limited availability of labeled data, +class imbalance and the inherent variability and complexity of satellite +images. In order to mitigate those issues, our study explores the effectiveness +of a Cut-and-Paste augmentation technique for semantic segmentation in +satellite images. We adapt this augmentation, which usually requires labeled +instances, to the case of semantic segmentation. By leveraging the connected +components in the semantic segmentation labels, we extract instances that are +then randomly pasted during training. Using the DynamicEarthNet dataset and a +U-Net model for evaluation, we found that this augmentation significantly +enhances the mIoU score on the test set from 37.9 to 44.1. This finding +highlights the potential of the Cut-and-Paste augmentation to improve the +generalization capabilities of semantic segmentation models in satellite +imagery. + +
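+
+ A minimal Python sketch (NumPy/SciPy; the random placement policy is an
+assumption) of extracting one connected component of a class from the label
+map and pasting it at a random location:
+
+     import numpy as np
+     from scipy import ndimage
+
+     def cut_and_paste(image, label, class_id, rng=np.random.default_rng()):
+         """image: (H, W[, 3]); label: (H, W) class indices."""
+         components, n = ndimage.label(label == class_id)
+         if n == 0:
+             return image, label
+         comp = components == rng.integers(1, n + 1)       # pick one instance
+         ys, xs = np.nonzero(comp)
+         h, w = ys.max() - ys.min() + 1, xs.max() - xs.min() + 1
+         top = rng.integers(0, label.shape[0] - h + 1)     # random paste corner
+         left = rng.integers(0, label.shape[1] - w + 1)
+
+         patch = comp[ys.min():ys.min() + h, xs.min():xs.min() + w]
+         img_patch = image[ys.min():ys.min() + h, xs.min():xs.min() + w]
+         image_out, label_out = image.copy(), label.copy()
+         image_out[top:top + h, left:left + w][patch] = img_patch[patch]
+         label_out[top:top + h, left:left + w][patch] = class_id
+         return image_out, label_out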
+
+ comment: Accepted for publication in IEEE 2024 International Geoscience & + Remote Sensing Symposium (IGARSS 2024) +
+
+
+
+
+ + ☆ Retrieval-Augmented Open-Vocabulary Object Detection CVPR 2024 + + +
+ Open-vocabulary object detection (OVD) has been studied with Vision-Language +Models (VLMs) to detect novel objects beyond the pre-trained categories. +Previous approaches improve the generalization ability to expand the knowledge +of the detector, using 'positive' pseudo-labels with additional 'class' names, +e.g., sock, iPod, and alligator. To extend the previous methods in two aspects, +we propose Retrieval-Augmented Losses and visual Features (RALF). Our method +retrieves related 'negative' classes and augments loss functions. Also, visual +features are augmented with 'verbalized concepts' of classes, e.g., worn on the +feet, handheld music player, and sharp teeth. Specifically, RALF consists of +two modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual +Features (RAF). RAL constitutes two losses reflecting the semantic similarity +with negative vocabularies. In addition, RAF augments visual features with the +verbalized concepts from a large language model (LLM). Our experiments +demonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We +achieve improvement up to 3.4 box AP$_{50}^{\text{N}}$ on novel categories of +the COCO dataset and 3.6 mask AP$_{\text{r}}$ gains on the LVIS dataset. Code +is available at https://github.com/mlvlab/RALF . + +
+
+ comment: Accepted paper at CVPR 2024 +
+
+
+
+
+ + ☆ SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane + Representation + + +
+ While recent advances in 3D-aware Generative Adversarial Networks (GANs) have +aided the development of near-frontal view human face synthesis, the challenge +of comprehensively synthesizing a full 3D head viewable from all angles still +persists. Although PanoHead proves the possibilities of using a large-scale +dataset with images of both frontal and back views for full-head synthesis, it +often causes artifacts for back views. Based on our in-depth analysis, we found +the reasons are mainly twofold. First, from network architecture perspective, +we found each plane in the utilized tri-plane/tri-grid representation space +tends to confuse the features from both sides, causing "mirroring" artifacts +(e.g., the glasses appear in the back). Second, from data supervision aspect, +we found that existing discriminator training in 3D GANs mainly focuses on the +quality of the rendered image itself, and does not care much about its +plausibility with the perspective from which it was rendered. This makes it +possible to generate "face" in non-frontal views, due to its easiness to fool +the discriminator. In response, we propose SphereHead, a novel tri-plane +representation in the spherical coordinate system that fits the human head's +geometric characteristics and efficiently mitigates many of the generated +artifacts. We further introduce a view-image consistency loss for the +discriminator to emphasize the correspondence of the camera parameters and the +images. The combination of these efforts results in visually superior outcomes +with significantly fewer artifacts. Our code and dataset are publicly available +at https://lhyfst.github.io/spherehead. + +
+
+ comment: project page: https://lhyfst.github.io/spherehead +
+
+
+
+
+ + ☆ Normalizing Flows on the Product Space of SO(3) Manifolds for + Probabilistic Human Pose Modeling CVPR 2024 + + +
+ Normalizing flows have proven their efficacy for density estimation in +Euclidean space, but their application to rotational representations, crucial +in various domains such as robotics or human pose modeling, remains +underexplored. Probabilistic models of the human pose can benefit from +approaches that rigorously consider the rotational nature of human joints. For +this purpose, we introduce HuProSO3, a normalizing flow model that operates on +a high-dimensional product space of SO(3) manifolds, modeling the joint +distribution for human joints with three degrees of freedom. HuProSO3's +advantage over state-of-the-art approaches is demonstrated through its superior +modeling accuracy in three different applications and its capability to +evaluate the exact likelihood. This work not only addresses the technical +challenge of learning densities on SO(3) manifolds, but it also has broader +implications for domains where the probabilistic regression of correlated 3D +rotations is of importance. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation + + +
+ In this paper, we present MoMA: an open-vocabulary, training-free +personalized image model that boasts flexible zero-shot capabilities. As +foundational text-to-image models rapidly evolve, the demand for robust +image-to-image translation grows. Addressing this need, MoMA specializes in +subject-driven personalized image generation. Utilizing an open-source, +Multimodal Large Language Model (MLLM), we train MoMA to serve a dual role as +both a feature extractor and a generator. This approach effectively synergizes +reference image and text prompt information to produce valuable image features, +facilitating an image diffusion model. To better leverage the generated +features, we further introduce a novel self-attention shortcut method that +efficiently transfers image features to an image diffusion model, improving the +resemblance of the target object in generated images. Remarkably, as a +tuning-free plug-and-play module, our model requires only a single reference +image and outperforms existing methods in generating images with high detail +fidelity, enhanced identity-preservation and prompt faithfulness. Our work is +open-source, thereby providing universal access to these advancements. + +
+
+
+
+
+ + ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. The code will be +released at https://github.com/baoxiaoyi/CoReS. + +
+
+
+
+
+ + ☆ NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for + Document Enhancement + + +
+ Real-world documents may suffer various forms of degradation, often resulting +in lower accuracy in optical character recognition (OCR) systems. Therefore, a +crucial preprocessing step is essential to eliminate noise while preserving +text and key features of documents. In this paper, we propose NAF-DPM, a novel +generative framework based on a diffusion probabilistic model (DPM) designed to +restore the original quality of degraded documents. While DPMs are recognized +for their high-quality generated images, they are also known for their large +inference time. To mitigate this problem we provide the DPM with an efficient +nonlinear activation-free (NAF) network and we employ as a sampler a fast +solver of ordinary differential equations, which can converge in a few +iterations. To better preserve text characters, we introduce an additional +differentiable module based on convolutional recurrent neural networks, +simulating the behavior of an OCR system during training. Experiments conducted +on various datasets showcase the superiority of our approach, achieving +state-of-the-art performance in terms of pixel-level and perceptual similarity +metrics. Furthermore, the results demonstrate a notable character error +reduction made by OCR systems when transcribing real-world document images +enhanced by our framework. Code and pre-trained models are available at +https://github.com/ispamm/NAF-DPM. + +
+
+ comment: Under review at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ☆ AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic + Segmentation + + +
+ A serious issue that harms the performance of zero-shot visual recognition is +named objective misalignment, i.e., the learning objective prioritizes +improving the recognition accuracy of seen classes rather than unseen classes, +while the latter is the true target to pursue. This issue becomes more +significant in zero-shot image segmentation because the stronger (i.e., +pixel-level) supervision brings a larger gap between seen and unseen classes. +To mitigate it, we propose a novel architecture named AlignZeg, which embodies +a comprehensive improvement of the segmentation pipeline, including proposal +extraction, classification, and correction, to better fit the goal of zero-shot +segmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a +mutual interaction between mask queries and visual features, facilitating +detailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced +Proposal Classification. AlignZeg introduces synthetic data and incorporates +multiple background prototypes to allocate a more generalizable feature space. +(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a +class indicator to find potential unseen class proposals followed by a +prediction postprocess to correct the prediction bias. Experiments demonstrate +that AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an +average 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in +identifying unseen classes, and we further validate that the improvement comes +from alleviating the objective misalignment issue. + +
+
+
+
+
+ + ☆ YaART: Yet Another ART Rendering Technology + + +
+ In the rapidly progressing field of generative models, the development of +efficient and high-fidelity text-to-image diffusion systems represents a +significant frontier. This study introduces YaART, a novel production-grade +text-to-image cascaded diffusion model aligned to human preferences using +Reinforcement Learning from Human Feedback (RLHF). During the development of +YaART, we especially focus on the choices of the model and training dataset +sizes, the aspects that were not systematically investigated for text-to-image +cascaded diffusion models before. In particular, we comprehensively analyze how +these choices affect both the efficiency of the training process and the +quality of the generated images, which are highly important in practice. +Furthermore, we demonstrate that models trained on smaller datasets of +higher-quality images can successfully compete with those trained on larger +datasets, establishing a more efficient scenario of diffusion models training. +From the quality perspective, YaART is consistently preferred by users over +many existing state-of-the-art models. + +
+
+ comment: Prompts and additional information are available on the project page, + see https://ya.ru/ai/art/paper-yaart-v1 +
+
+
+
+
+ + ☆ BinaryDM: Towards Accurate Binarization of Diffusion Model + + +
+ With the advancement of diffusion models (DMs) and the substantially +increased computational requirements, quantization emerges as a practical +solution to obtain compact and efficient low-bit DMs. However, the highly +discrete representation leads to severe accuracy degradation, hindering the +quantization of diffusion models to ultra-low bit-widths. In this paper, we +propose BinaryDM, a novel accurate quantization-aware training approach to push +the weights of diffusion models towards the limit of 1-bit. Firstly, we present +a Learnable Multi-basis Binarizer (LMB) to recover the representations +generated by the binarized DM, which improves the information in details of +representations crucial to the DM. Secondly, a Low-rank Representation +Mimicking (LRM) is applied to enhance the binarization-aware optimization of +the DM, alleviating the optimization direction ambiguity caused by fine-grained +alignment. Moreover, a progressive initialization strategy is applied to +training DMs to avoid convergence difficulties. Comprehensive experiments +demonstrate that BinaryDM achieves significant accuracy and efficiency gains +compared to SOTA quantization methods of DMs under ultra-low bit-widths. As the +first binarization method for diffusion models, BinaryDM achieves impressive +16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit +activation, showcasing its substantial advantages and potential for deploying +DMs on resource-limited scenarios. + +
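+
+ A minimal Python sketch (a two-basis residual binarizer with learnable
+scales, an illustrative reading of the described Learnable Multi-basis
+Binarizer; the straight-through estimator needed for training is omitted) of
+approximating full-precision weights with signed bases:
+
+     import torch
+     import torch.nn as nn
+
+     class TwoBasisBinarizer(nn.Module):
+         """Approximate w as alpha1 * sign(w) + alpha2 * sign(w - alpha1 * sign(w))."""
+
+         def __init__(self, weight: torch.Tensor):
+             super().__init__()
+             a1 = weight.abs().mean()
+             self.alpha1 = nn.Parameter(a1.clone())
+             self.alpha2 = nn.Parameter((weight - a1 * weight.sign()).abs().mean())
+
+         def forward(self, weight: torch.Tensor) -> torch.Tensor:
+             b1 = weight.sign()
+             b2 = (weight - self.alpha1 * b1).sign()
+             return self.alpha1 * b1 + self.alpha2 * b2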
+
+ comment: The code will soon be available at + https://github.com/Xingyu-Zheng/BinaryDM +
+
+
+
+
+ + ☆ Automatic Controllable Colorization via Imagination CVPR 2024 + + +
+ We propose a framework for automatic colorization that allows for iterative +editing and modifications. The core of our framework lies in an imagination +module: by understanding the content within a grayscale image, we utilize a +pre-trained image generation model to generate multiple images that contain the +same content. These images serve as references for coloring, mimicking the +process of human experts. As the synthesized images can be imperfect or +different from the original grayscale image, we propose a Reference Refinement +Module to select the optimal reference composition. Unlike most previous +end-to-end automatic colorization algorithms, our framework allows for +iterative and localized modifications of the colorization results because we +explicitly model the coloring samples. Extensive experiments demonstrate the +superiority of our framework over existing automatic colorization algorithms in +editability and flexibility. Project page: +https://xy-cong.github.io/imagine-colorization. + +
+
+ comment: CVPR 2024. Project page: + https://xy-cong.github.io/imagine-colorization +
+
+
+
+
+ + ☆ MLP Can Be A Good Transformer Learner + + +
+ The self-attention mechanism is the key component of the Transformer but is
+often criticized for its computational demands. Previous token pruning works
+motivate their methods from the view of computational redundancy but still
+need to load the full network and incur the same memory costs. This paper
+introduces a novel strategy that simplifies vision transformers and reduces
+computational load through the selective removal of non-essential attention
+layers, guided by entropy considerations. We identify that for the attention
+layers in the bottom blocks, their subsequent MLP layers, i.e., two
+feed-forward layers, can elicit the same entropy quantity. Meanwhile, these
+accompanying MLPs are under-exploited, as they exhibit smaller feature entropy
+than the MLPs in the top blocks. We therefore propose to integrate the
+uninformative attention layers into their subsequent counterparts by
+degenerating them into identity mappings, leaving only the MLP in certain
+transformer blocks. Experimental results on ImageNet-1k show that the proposed
+method can remove 40% of the attention layers of DeiT-B, improving throughput
+and reducing memory usage without compromising performance. Code is available
+at https://github.com/sihaoevery/lambda_vit.
+
+
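+
+ A minimal Python sketch (assuming a timm-style ViT block exposing `norm2` and
+`mlp`; the entropy-based selection of which blocks to simplify is taken as
+given) of degenerating the attention path of chosen blocks so that only the
+MLP remains:
+
+     import torch.nn as nn
+
+     class MLPOnlyBlock(nn.Module):
+         """Transformer block with the attention branch removed."""
+
+         def __init__(self, block):
+             super().__init__()
+             self.norm2, self.mlp = block.norm2, block.mlp
+
+         def forward(self, x):
+             return x + self.mlp(self.norm2(x))
+
+     def drop_attention(vit_model, block_indices):
+         for i in block_indices:
+             vit_model.blocks[i] = MLPOnlyBlock(vit_model.blocks[i])
+         return vit_model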
+
+ comment: efficient transformer +
+
+
+
+
+ + ☆ 3D-COCO: extension of MS-COCO dataset for image detection and 3D + reconstruction modules + + +
+ We introduce 3D-COCO, an extension of the original MS-COCO dataset providing
+3D models and 2D-3D alignment annotations. 3D-COCO was designed to support
+computer vision tasks such as 3D reconstruction or image detection
+configurable with textual, 2D image, and 3D CAD model queries. We complement
+the existing MS-COCO dataset with 28K 3D models collected from ShapeNet and
+Objaverse. Using an IoU-based method, we match each MS-COCO annotation with
+the best 3D models to provide a 2D-3D alignment. The open-source nature of
+3D-COCO is a first that should pave the way for new research on 3D-related
+topics. The dataset and its source code are available at
+https://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/
+
+
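+
+ A minimal Python sketch (boxes in [x1, y1, x2, y2] format; the matching
+policy is an illustrative reading of "IoU-based", not the dataset's build
+script) of choosing the best-overlapping 3D model candidate for an annotation:
+
+     def iou(a, b):
+         """a, b: boxes as (x1, y1, x2, y2)."""
+         x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+         x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+         inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+         area_a = (a[2] - a[0]) * (a[3] - a[1])
+         area_b = (b[2] - b[0]) * (b[3] - b[1])
+         return inter / (area_a + area_b - inter + 1e-9)
+
+     def best_candidate(annotation_box, candidate_boxes):
+         """Index of the candidate whose projected box overlaps the annotation most."""
+         scores = [iou(annotation_box, b) for b in candidate_boxes]
+         return max(range(len(scores)), key=scores.__getitem__)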
+
+
+
+
+ + ☆ Learning a Category-level Object Pose Estimator without Pose Annotations + + +
+ 3D object pose estimation is a challenging task. Previous works always +require thousands of object images with annotated poses for learning the 3D +pose correspondence, which is laborious and time-consuming for labeling. In +this paper, we propose to learn a category-level 3D object pose estimator +without pose annotations. Instead of using manually annotated images, we +leverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under +controlled pose differences and propose to learn our object pose estimator with +those images. Directly using the original diffusion model leads to images with +noisy poses and artifacts. To tackle this issue, firstly, we exploit an image +encoder, which is learned from a specially designed contrastive pose learning, +to filter the unreasonable details and extract image feature maps. +Additionally, we propose a novel learning strategy that allows the model to +learn object poses from those generated image sets without knowing the +alignment of their canonical poses. Experimental results show that our method +has the capability of category-level object pose estimation from a single shot +setting (as pose definition), while significantly outperforming other +state-of-the-art methods on the few-shot category-level object pose estimation +benchmarks. + +
+
+
+
+
+ + ☆ MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning CVPR 2024 + + +
+ While excellent in transfer learning, Vision-Language models (VLMs) come with +high computational costs due to their large number of parameters. To address +this issue, removing parameters via model pruning is a viable solution. +However, existing techniques for VLMs are task-specific, and thus require +pruning the network from scratch for each new task of interest. In this work, +we explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP). +Given a pretrained VLM, the goal is to find a unique pruned counterpart +transferable to multiple unknown downstream tasks. In this challenging setting, +the transferable representations already encoded in the pretrained model are a +key aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a +first, gradient-free, pruning framework for TA-VLP where: (i) the importance of +a parameter is expressed in terms of its magnitude and its information flow, by +incorporating the saliency of the neurons it connects; and (ii) pruning is +driven by the emergent (multimodal) distribution of the VLM parameters after +pretraining. We benchmark eight state-of-the-art pruning algorithms in the +context of TA-VLP, experimenting with two VLMs, three vision-language tasks, +and three pruning ratios. Our experimental results show that MULTIFLOW +outperforms recent sophisticated, combinatorial competitors in the vast +majority of the cases, paving the way towards addressing TA-VLP. The code is +publicly available at https://github.com/FarinaMatteo/multiflow. + +
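+
+ A minimal Python sketch (for a single linear layer; the neuron-saliency term
+is an illustrative stand-in for the paper's information-flow formulation) of
+scoring weights by magnitude combined with the saliency of the neurons they
+connect, then keeping the highest-scoring fraction:
+
+     import torch
+
+     def flow_scores(weight: torch.Tensor) -> torch.Tensor:
+         """weight: (out_features, in_features)."""
+         magnitude = weight.abs()
+         in_saliency = magnitude.mean(dim=0, keepdim=True)    # per input neuron
+         out_saliency = magnitude.mean(dim=1, keepdim=True)   # per output neuron
+         return magnitude * in_saliency * out_saliency
+
+     def prune_mask(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
+         scores = flow_scores(weight)
+         k = int(scores.numel() * sparsity)
+         if k == 0:
+             return torch.ones_like(scores, dtype=torch.bool)
+         threshold = scores.flatten().kthvalue(k).values
+         return scores > threshold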
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion + + +
+ Nowadays, the family of Stable Diffusion (SD) models has gained prominence +for its high quality outputs and scalability. This has also raised security +concerns on social media, as malicious users can create and disseminate harmful +content. Existing approaches involve training components or entire SDs to embed +a watermark in generated images for traceability and responsibility +attribution. However, in the era of AI-generated content (AIGC), the rapid +iteration of SDs renders retraining with watermark models costly. To address +this, we propose a training-free plug-and-play watermark framework for SDs. +Without modifying any components of SDs, we embed diverse watermarks in the +latent space, adapting to the denoising process. Our experimental findings +reveal that our method effectively harmonizes image quality and watermark +invisibility. Furthermore, it performs robustly under various attacks. We also +have validated that our method is generalized to multiple versions of SDs, even +without retraining the watermark model. + +
+
+
+
+
+ + ☆ Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view + Reconstruction + + +
+ Face meshes in consistent topology serve as the foundation for many
+face-related applications, such as 3DMM-constrained face reconstruction and
+expression retargeting. Traditional methods commonly acquire topology-uniformed
+face meshes in two separate steps: multi-view stereo (MVS) to reconstruct
+shapes, followed by non-rigid registration to align topology, but they
+struggle with handling noise and non-Lambertian surfaces. Recently, neural
+volume rendering techniques have rapidly evolved and shown great advantages in
+3D reconstruction and novel view synthesis. Our goal is to bring the strengths
+of neural volume rendering to multi-view reconstruction of face meshes with
+consistent topology. We propose a mesh volume rendering method that enables
+directly optimizing mesh geometry while preserving topology, and learning
+implicit features to model complex facial appearance from multi-view images.
+The key innovation lies in spreading sparse mesh features into the surrounding
+space to simulate the radiance field required for volume rendering, which
+facilitates backpropagation of gradients from images to mesh geometry and
+implicit appearance features. Our proposed feature spreading module exhibits
+deformation invariance, enabling photorealistic rendering seamlessly after
+mesh editing. We conduct experiments on a multi-view face image dataset to
+evaluate the reconstruction and implement an application for photorealistic
+rendering of animated face meshes.
+
+
+
+
+
+
+ + ☆ Self-Explainable Affordance Learning with Embodied Caption + + +
+ In the field of visual affordance learning, previous methods mainly used +abundant images or videos that delineate human behavior patterns to identify +action possibility regions for object manipulation, with a variety of +applications in robotic tasks. However, they encounter a main challenge of +action ambiguity, illustrated by the vagueness like whether to beat or carry a +drum, and the complexities involved in processing intricate scenes. Moreover, +it is important for human intervention to rectify robot errors in time. To +address these issues, we introduce Self-Explainable Affordance learning (SEA) +with embodied caption. This innovation enables robots to articulate their +intentions and bridge the gap between explainable vision-language caption and +visual affordance learning. Due to a lack of appropriate dataset, we unveil a +pioneering dataset and metrics tailored for this task, which integrates images, +heatmaps, and embodied captions. Furthermore, we propose a novel model to +effectively combine affordance grounding with self-explanation in a simple but +efficient manner. Extensive quantitative and qualitative experiments +demonstrate our method's effectiveness. + +
+
+
+
+
+ + ☆ UniFL: Improve Stable Diffusion via Unified Feedback Learning + + +
+ Diffusion models have revolutionized the field of image generation, leading +to the proliferation of high-quality models and diverse downstream +applications. However, despite these significant advancements, the current +competitive solutions still suffer from several limitations, including inferior +visual quality, a lack of aesthetic appeal, and inefficient inference, without +a comprehensive solution in sight. To address these challenges, we present +UniFL, a unified framework that leverages feedback learning to enhance +diffusion models comprehensively. UniFL stands out as a universal, effective, +and generalizable solution applicable to various diffusion models, such as +SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual +feedback learning, which enhances visual quality; decoupled feedback learning, +which improves aesthetic appeal; and adversarial feedback learning, which +optimizes inference speed. In-depth experiments and extensive user studies +validate the superior performance of our proposed method in enhancing both the +quality of generated models and their acceleration. For instance, UniFL +surpasses ImageReward by 17% user preference in terms of generation quality and +outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we +have verified the efficacy of our approach in downstream tasks, including Lora, +ControlNet, and AnimateDiff. + +
+
+
+
+
+ + ☆ Neural Cellular Automata for Lightweight, Robust and Explainable + Classification of White Blood Cell Images + + +
+ Diagnosis of hematological malignancies depends on accurate identification of +white blood cells in peripheral blood smears. Deep learning techniques are +emerging as a viable solution to scale and optimize this process by automatic +identification of cells in laboratories. However, these techniques face several +challenges such as limited generalizability, sensitivity to domain shifts and +lack of explainability. Here, we are introducing a novel approach based on +neural cellular automata (NCA) for white blood cell classification. We test our +approach on three datasets of white blood cell images and show that we achieve +competitive performance compared to conventional methods. Our NCA-based method +is significantly smaller in terms of parameters and exhibits robustness to +domain shifts. Furthermore, the architecture is inherently explainable, +providing insights into the decision process for each classification, helping +experts understand and validate model predictions. Results demonstrate that NCA +not only can be used for image classification, but also address key challenges +of conventional methods, indicating a high potential for applicability in +clinical practice. + +
+
+
+
+
+ + ☆ Towards More General Video-based Deepfake Detection through Facial + Feature Guided Adaptation for Foundation Model + + +
+ With the rise of deep learning, generative models have enabled the creation
+of highly realistic synthetic images, presenting challenges due to their
+potential misuse. While research in Deepfake detection has grown rapidly in
+response, many detection methods struggle with unseen Deepfakes generated by
+new synthesis techniques. To address this generalisation challenge, we propose
+a novel Deepfake detection approach that adapts the rich information encoded
+inside Foundation Models, specifically the image encoder of CLIP, which has
+demonstrated strong zero-shot capability on downstream tasks. Inspired by
+recent advances in parameter-efficient fine-tuning, we propose a novel
+side-network-based decoder that extracts spatial and temporal cues from a
+given video clip, with Facial Component Guidance (FCG) encouraging the spatial
+features to cover key facial parts for more robust and general Deepfake
+detection. Through extensive cross-dataset evaluations, our approach exhibits
+superior effectiveness in identifying unseen Deepfake samples, achieving
+notable performance improvements even with limited training samples and
+manipulation types. Our model secures an average improvement of 0.9% AUROC in
+cross-dataset assessments compared with state-of-the-art methods, including a
+significant lead of 4.4% on the challenging DFDC dataset.
+
+
+
+
+
+
+ + ☆ Responsible Visual Editing + + +
+ With recent advancements in visual synthesis, there is a growing risk of +encountering images with detrimental effects, such as hate, discrimination, or +privacy violations. The research on transforming harmful images into +responsible ones remains unexplored. In this paper, we formulate a new task, +responsible visual editing, which entails modifying specific concepts within an +image to render it more responsible while minimizing changes. However, the +concept that needs to be edited is often abstract, making it challenging to +locate what needs to be modified and plan how to modify it. To tackle these +challenges, we propose a Cognitive Editor (CoEditor) that harnesses the large +multimodal model through a two-stage cognitive process: (1) a perceptual +cognitive process to focus on what needs to be modified and (2) a behavioral +cognitive process to strategize how to modify. To mitigate the negative +implications of harmful images on research, we create a transparent and public +dataset, AltBear, which expresses harmful information using teddy bears instead +of humans. Experiments demonstrate that CoEditor can effectively comprehend +abstract concepts within complex scenes and significantly surpass the +performance of baseline models for responsible visual editing. We find that the +AltBear dataset corresponds well to the harmful content found in real images, +offering a consistent experimental evaluation, thereby providing a safer +benchmark for future research. Moreover, CoEditor also shows great results in +general editing. We release our code and dataset at +https://github.com/kodenii/Responsible-Visual-Editing. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ☆ Robust Data Pruning: Uncovering and Overcoming Implicit Bias + + +
+ In the era of exceptionally data-hungry models, careful selection of the +training data is essential to mitigate the extensive costs of deep learning. +Data pruning offers a solution by removing redundant or uninformative samples +from the dataset, which yields faster convergence and improved neural scaling +laws. However, little is known about its impact on classification bias of the +trained models. We conduct the first systematic study of this effect and reveal +that existing data pruning algorithms can produce highly biased classifiers. At +the same time, we argue that random data pruning with appropriate class ratios +has potential to improve the worst-class performance. We propose a +"fairness-aware" approach to pruning and empirically demonstrate its +performance on standard computer vision benchmarks. In sharp contrast to +existing algorithms, our proposed method continues improving robustness at a +tolerable drop of average performance as we prune more from the datasets. We +present theoretical analysis of the classification risk in a mixture of +Gaussians to further motivate our algorithm and support our findings. + +
+
+
+
+
+ + ☆ Social-MAE: Social Masked Autoencoder for Multi-person Motion + Representation Learning + + +
+ For a complete comprehension of multi-person scenes, it is essential to go +beyond basic tasks like detection and tracking. Higher-level tasks, such as +understanding the interactions and social activities among individuals, are +also crucial. Progress towards models that can fully understand scenes +involving multiple people is hindered by a lack of sufficient annotated data +for such high-level tasks. To address this challenge, we introduce Social-MAE, +a simple yet effective transformer-based masked autoencoder framework for +multi-person human motion data. The framework uses masked modeling to pre-train +the encoder to reconstruct masked human joint trajectories, enabling it to +learn generalizable and data efficient representations of motion in human +crowded scenes. Social-MAE comprises a transformer as the MAE encoder and a +lighter-weight transformer as the MAE decoder which operates on multi-person +joints' trajectory in the frequency domain. After the reconstruction task, the +MAE decoder is replaced with a task-specific decoder and the model is +fine-tuned end-to-end for a variety of high-level social tasks. Our proposed +model combined with our pre-training approach achieves the state-of-the-art +results on various high-level social tasks, including multi-person pose +forecasting, social grouping, and social action understanding. These +improvements are demonstrated across four popular multi-person datasets +encompassing both human 2D and 3D body pose. + +
+
+
+
+
+ + ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot + Editing of Text-to-Video Diffusion Models CVPR 2024 + + +
+ With recent advances in image and video diffusion models for content +creation, a plethora of techniques have been proposed for customizing their +generated content. In particular, manipulating the cross-attention layers of +Text-to-Image (T2I) diffusion models has shown great promise in controlling the +shape and location of objects in the scene. Transferring image-editing +techniques to the video domain, however, is extremely challenging as object +motion and temporal consistency are difficult to capture accurately. In this +work, we take a first look at the role of cross-attention in Text-to-Video +(T2V) diffusion models for zero-shot video editing. While one-shot models have +shown potential in controlling motion and camera movement, we demonstrate +zero-shot control over object shape, position and movement in T2V models. We +show that despite the limitations of current T2V models, cross-attention +guidance can be a promising approach for editing videos. + +
+
+ comment: Generative Models for Computer Vision CVPR 2024 Workshop
+
+
+
+
+
+ + ☆ DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker + + +
+ Accurately distinguishing each object is a fundamental goal of multi-object
+tracking (MOT) algorithms. However, achieving this goal remains challenging,
+primarily because: (i) in crowded scenes with occluded objects, the high
+overlap of object bounding boxes leads to confusion among closely located
+objects. Nevertheless, humans naturally perceive the depth of elements in a
+scene when observing 2D videos. Inspired by this, even though the bounding
+boxes of objects are close on the camera plane, we can differentiate them in
+the depth dimension, thereby establishing a 3D perception of the objects.
+(ii) For videos with rapid, irregular camera motion, abrupt changes in object
+positions can result in ID switches. However, if the camera pose is known, we
+can compensate for the errors of linear motion models. In this paper, we
+propose \textit{DepthMOT}, which (i) detects objects and estimates the scene
+depth map \textit{end-to-end}, and (ii) compensates for irregular camera
+motion via camera pose estimation. Extensive experiments demonstrate the
+superior performance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The
+code will be available at \url{https://github.com/JackWoo0831/DepthMOT}.
+
+
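+
+ A minimal Python sketch (the cost weighting and the use of raw depth
+differences are assumptions; the paper's actual association may differ) of
+folding estimated per-object depth into a Hungarian matching cost alongside
+box overlap:
+
+     import numpy as np
+     from scipy.optimize import linear_sum_assignment
+
+     def iou_xyxy(a, b):
+         x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+         x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+         inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+         union = ((a[2] - a[0]) * (a[3] - a[1])
+                  + (b[2] - b[0]) * (b[3] - b[1]) - inter)
+         return inter / (union + 1e-9)
+
+     def depth_aware_assignment(track_boxes, det_boxes, track_depths, det_depths,
+                                depth_weight=0.5):
+         """Boxes in xyxy; depths are scalars per box (assumed on a common scale)."""
+         cost = np.array([[1.0 - iou_xyxy(t, d) + depth_weight * abs(td - dd)
+                           for d, dd in zip(det_boxes, det_depths)]
+                          for t, td in zip(track_boxes, track_depths)])
+         rows, cols = linear_sum_assignment(cost)
+         return list(zip(rows.tolist(), cols.tolist()))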
+
+
+
+
+ + ☆ Impact of LiDAR visualisations on semantic segmentation of + archaeological objects + + +
+ Deep learning methods in LiDAR-based archaeological research often leverage +visualisation techniques derived from Digital Elevation Models to enhance +characteristics of archaeological objects present in the images. This paper +investigates the impact of visualisations on deep learning performance through +a comprehensive testing framework. The study involves the use of eight semantic +segmentation models to evaluate seven diverse visualisations across two study +areas, encompassing five archaeological classes. Experimental results reveal +that the choice of appropriate visualisations can influence performance by up +to 8%. Yet, pinpointing one visualisation that outperforms the others in +segmenting all archaeological classes proves challenging. The observed +performance variation, reaching up to 25% across different model +configurations, underscores the importance of thoughtfully selecting model +configurations and LiDAR visualisations for successfully segmenting +archaeological objects. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Taming Transformers for Realistic Lidar Point Cloud Generation + + +
+ Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the
+Lidar point cloud generation task, benefiting from their stable training and
+iterative refinement during sampling. However, DMs often fail to realistically
+model Lidar raydrop noise due to their inherent denoising process. To retain
+the strength of iterative sampling while enhancing the generation of raydrop
+noise, we introduce LidarGRIT, a generative model that uses auto-regressive
+transformers to iteratively sample the range images in the latent space rather
+than image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode
+range images and raydrop masks. Our results show that LidarGRIT achieves
+superior performance compared to SOTA models on the KITTI-360 and KITTI
+odometry datasets. Code available at:
+https://github.com/hamedhaghighi/LidarGRIT.
+
+
+
+
+
+
+ + ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected +during visual stimuli has made significant strides in the past decade, thanks +to the availability of extensive fMRI datasets and advancements in generative +models for image generation. However, the application of visual reconstruction +has remained limited. Reconstructing visual imagination presents a greater +challenge, with potentially revolutionary applications ranging from aiding +individuals with disabilities to verifying witness accounts in court. The +primary hurdles in this field are the absence of data collection protocols for +visual imagery and the lack of datasets on the subject. Traditionally, +fMRI-to-image relies on data collected from subjects exposed to visual stimuli, +which poses issues for generating visual imagery based on the difference of +brain activity between visual stimulation and visual imagery. For the first +time, we have compiled a substantial dataset (around 6h of scans) on visual +imagery along with a proposed data collection protocol. We then train a +modified version of an fMRI-to-image model and demonstrate the feasibility of +reconstructing images from two modes of imagination: from memory and from pure +imagination. This marks an important step towards creating a technology that +allow direct reconstruction of visual imagery. + +
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ☆ Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder ICME2024 + + +
+ Automatic lip-reading (ALR) aims to automatically transcribe spoken content +from a speaker's silent lip motion captured in video. Current mainstream +lip-reading approaches only use a single visual encoder to model input videos +of a single scale. In this paper, we propose to enhance lipreading by +incorporating multi-scale video data and multi-encoder. Specifically, we first +propose a novel multi-scale lip extraction algorithm based on the size of the +speaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip +features at different scales. For the multi-encoder, in addition to the +mainstream Transformer and Conformer, we also incorporate the recently proposed +Branchformer and EBranchformer as visual encoders. In the experiments, we +explore the influence of different video data scales and encoders on ALR system +performance and fuse the texts transcribed by all ALR systems using recognizer +output voting error reduction (ROVER). Finally, our proposed approach placed +second in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in +character error rate (CER) compared to the official baseline on the evaluation +set. + +
+
+ comment: 6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR +
+
+
+
+
+ + ☆ HAMMR: HierArchical MultiModal React agents for generic VQA + + +
+ Combining Large Language Models (LLMs) with external specialized tools +(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual +Question Answering (VQA). While this approach was demonstrated to work well +when optimized and evaluated for each individual benchmark, in practice it is +crucial for the next generation of real-world AI systems to handle a broad +range of multimodal problems. Therefore we pose the VQA problem from a unified +perspective and evaluate a single system on a varied suite of VQA tasks +including counting, spatial reasoning, OCR-based reasoning, visual pointing, +external knowledge, and more. In this setting, we demonstrate that naively +applying the LLM+tools approach using the combined set of all tools leads to +poor results. This motivates us to introduce HAMMR: HierArchical MultiModal +React. We start from a multimodal ReAct-based system and make it hierarchical +by enabling our HAMMR agents to call upon other specialized agents. This +enhances the compositionality of the LLM+tools approach, which we show to be +critical for obtaining high accuracy on generic VQA. Concretely, on our generic +VQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%. +Additionally, HAMMR achieves state-of-the-art results on this task, +outperforming the generic standalone PaLI-X VQA model by 5.0%. + +
+
+
+
+
+ + ☆ Pansharpening of PRISMA products for archaeological prospection + + +
+ Hyperspectral data recorded from satellite platforms are often ill-suited for +geo-archaeological prospection due to low spatial resolution. The established +potential of hyperspectral data from airborne sensors in identifying +archaeological features has, on the other side, generated increased interest in +enhancing hyperspectral data to achieve higher spatial resolution. This +improvement is crucial for detecting traces linked to sub-surface +geo-archaeological features and can make satellite hyperspectral acquisitions +more suitable for archaeological research. This research assesses the usability +of pansharpened PRISMA satellite products in geo-archaeological prospections. +Three pan-sharpening methods (GSA, MTF-GLP and HySure) are compared +quantitatively and qualitatively and tested over the archaeological landscape +of Aquileia (Italy). The results suggest that the application of pansharpening +techniques makes hyperspectral satellite imagery highly suitable, under certain +conditions, to the identification of sub-surface archaeological features of +small and large size. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Action-conditioned video data improves predictability + + +
+ Long-term video generation and prediction remain challenging tasks in +computer vision, particularly in partially observable scenarios where cameras +are mounted on moving platforms. The interaction between observed image frames +and the motion of the recording agent introduces additional complexities. To +address these issues, we introduce the Action-Conditioned Video Generation +(ACVG) framework, a novel approach that investigates the relationship between +actions and generated image frames through a deep dual Generator-Actor +architecture. ACVG generates video sequences conditioned on the actions of +robots, enabling exploration and analysis of how vision and action mutually +influence one another in dynamic environments. We evaluate the framework's +effectiveness on an indoor robot motion dataset which consists of sequences of +image frames along with the sequences of actions taken by the robotic agent, +conducting a comprehensive empirical study comparing ACVG to other +state-of-the-art frameworks along with a detailed ablation study. + +
+
+
+
+
+ + ☆ Test-Time Zero-Shot Temporal Action Localization + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+
+
+
+ + ☆ Two Hands Are Better Than One: Resolving Hand to Hand Intersections via + Occupancy Networks + + +
+ 3D hand pose estimation from images has seen considerable interest from the +literature, with new methods improving overall 3D accuracy. One current +challenge is to address hand-to-hand interaction where self-occlusions and +finger articulation pose a significant problem to estimation. Little work has +applied physical constraints that minimize the hand intersections that occur as +a result of noisy estimation. This work addresses the intersection of hands by +exploiting an occupancy network that represents the hand's volume as a +continuous manifold. This allows us to model the probability distribution of +points being inside a hand. We designed an intersection loss function to +minimize the likelihood of hand-to-point intersections. Moreover, we propose a +new hand mesh parameterization that is superior to the commonly used MANO model +in many respects including lower mesh complexity, underlying 3D skeleton +extraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the +models trained using our intersection loss achieve better results than the +state-of-the-art by significantly decreasing the number of hand intersections +while lowering the mean per-joint positional error. Additionally, we +demonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE +datasets and show reduced hand-to-hand intersections for complex domains such +as sign-language pose estimation. + +
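+
+ A schematic PyTorch version of an occupancy-based intersection penalty in this
+spirit: sample points from one hand, query the other hand's occupancy network,
+and penalise points predicted to lie inside. The tiny MLP and the sampled points
+are placeholders, not the authors' occupancy model or loss.
+
+import torch
+import torch.nn as nn
+
+class ToyOccupancy(nn.Module):
+    """Placeholder occupancy network: maps 3D points to an inside-probability."""
+    def __init__(self, hidden=64):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(3, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+    def forward(self, pts):                           # pts: (N, 3)
+        return torch.sigmoid(self.net(pts)).squeeze(-1)  # (N,) in [0, 1]
+
+def intersection_loss(occ_other, pts_self):
+    """Penalise points of one hand that fall inside the other hand's volume."""
+    inside_prob = occ_other(pts_self)                 # probability of being inside
+    return inside_prob.mean()
+
+if __name__ == "__main__":
+    occ_right = ToyOccupancy()
+    left_hand_pts = torch.randn(1024, 3)              # e.g. sampled left-hand surface points
+    loss = intersection_loss(occ_right, left_hand_pts)
+    loss.backward()
+    print(float(loss))
+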
+
+
+
+
+ + ☆ Anatomical Conditioning for Contrastive Unpaired Image-to-Image + Translation of Optical Coherence Tomography Images + + +
+ For a unified analysis of medical images from different modalities, data +harmonization using image-to-image (I2I) translation is desired. We study this +problem employing an optical coherence tomography (OCT) data set of +Spectralis-OCT and Home-OCT images. I2I translation is challenging because the +images are unpaired, and a bijective mapping does not exist due to the +information discrepancy between both domains. This problem has been addressed +by the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it +reduces semantic consistency. To restore the semantic consistency, we support +the style decoder using an additional segmentation decoder. Our approach +increases the similarity between the style-translated images and the target +distribution. Importantly, we improve the segmentation of biomarkers in +Home-OCT images in an unsupervised domain adaptation scenario. Our data +harmonization approach provides potential for the monitoring of diseases, e.g., +age related macular disease, using different OCT devices. + +
+
+ comment: Accepted at ISBI 2024 +
+
+
+
+
+ + ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships
+among various class-specific predictions and the imbalance in label masks on
+long-tailed segmentation learning. To address these challenges, we propose an
+innovative Pixel-wise Adaptive Training (PAT) technique tailored for
+long-tailed segmentation. PAT has two key features: 1) class-wise gradient
+magnitude homogenization, and 2) pixel-wise class-specific loss adaptation
+(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate
+the imbalance among label masks by ensuring equal consideration of the
+class-wise impact on model updates. Second, PCLA tackles the detrimental impact
+of both rare classes within the long-tailed distribution and inaccurate
+predictions from previous training stages by encouraging learning classes with
+low prediction confidence and guarding against forgetting classes with high
+confidence. This combined approach fosters robust learning while preventing the
+model from forgetting previously learned knowledge. PAT exhibits significant
+performance improvements, surpassing the current state-of-the-art by 2.2% on
+the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and
+intersection over union value by 2.07%, with a particularly notable decline
+of 0.39% in detecting rare classes compared to Balance Logits Variation, as
+demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and
+NYU.
+
+
+
+
+
+ + ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. + +
+
+
+
+
+ + ☆ Rethinking the Spatial Inconsistency in Classifier-Free Diffusion + Guidance CVPR-2024 + + +
+ Classifier-Free Guidance (CFG) has been widely used in text-to-image
+diffusion models, where the CFG scale is introduced to control the strength of
+text guidance on the whole image space. However, we argue that a global CFG
+scale results in spatial inconsistency on varying semantic strengths and
+suboptimal image quality. To address this problem, we present a novel approach,
+Semantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance
+degrees for different semantic units in text-to-image diffusion models.
+Specifically, we first design a training-free semantic segmentation method to
+partition the latent image into relatively independent semantic regions at each
+denoising step. In particular, the cross-attention map in the denoising U-net
+backbone is renormalized for assigning each patch to the corresponding token,
+while the self-attention map is used to complete the semantic regions. Then, to
+balance the amplification of diverse semantic units, we adaptively adjust the
+CFG scales across different semantic regions to rescale the text guidance
+degrees into a uniform level. Finally, extensive experiments demonstrate the
+superiority of S-CFG over the original CFG strategy on various text-to-image
+diffusion models, without requiring any extra training cost. Our code is
+available at https://github.com/SmilesDZgk/S-CFG.
+
+
+ comment: accepted by CVPR-2024 +
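+
+ A minimal sketch of the spatially varying guidance idea: instead of one global
+scale, a per-pixel scale map modulates the conditional/unconditional difference.
+The arbitrary scale map below is only for illustration; S-CFG derives its scales
+from cross- and self-attention maps, which is not reproduced here.
+
+import torch
+
+def spatial_cfg(eps_uncond, eps_cond, scale_map):
+    """Classifier-free guidance with a spatially varying scale.
+
+    eps_uncond, eps_cond : (B, C, H, W) predicted noise without/with the prompt.
+    scale_map            : (B, 1, H, W) per-pixel guidance scales.
+    """
+    return eps_uncond + scale_map * (eps_cond - eps_uncond)
+
+if __name__ == "__main__":
+    B, C, H, W = 1, 4, 64, 64
+    eps_u, eps_c = torch.randn(B, C, H, W), torch.randn(B, C, H, W)
+    # Arbitrary example: stronger guidance on the left half of the latent.
+    scale_map = torch.full((B, 1, H, W), 5.0)
+    scale_map[..., : W // 2] = 9.0
+    print(spatial_cfg(eps_u, eps_c, scale_map).shape)
+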
+
+
+
+
+ + ☆ CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery CVPR + + +
+ In Generalized Category Discovery (GCD), we cluster unlabeled samples of
+known and novel classes, leveraging a training dataset of known classes. A
+salient challenge arises due to domain shifts between these datasets. To
+address this, we present a novel setting: Across Domain Generalized Category
+Discovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)
+as a remedy. CDAD-NET is architected to synchronize potential known class
+samples across both the labeled (source) and unlabeled (target) datasets, while
+emphasizing the distinct categorization of the target data. To facilitate this,
+we propose an entropy-driven adversarial learning strategy that accounts for
+the distance distributions of target samples relative to source-domain class
+prototypes. In parallel, the discriminative nature of the shared space is upheld
+through a fusion of three metric learning objectives. In the source domain, our
+focus is on refining the proximity between samples and their affiliated class
+prototypes, while in the target domain, we integrate a neighborhood-centric
+contrastive learning mechanism, enriched with an adept neighbor-mining
+approach. To further accentuate the nuanced feature interrelation among
+semantically aligned images, we champion the concept of conditional image
+inpainting, underscoring the premise that semantically analogous images prove
+more efficacious to the task than their disjointed counterparts.
+Experimentally, CDAD-NET eclipses existing literature with a performance
+increment of 8-15% on three AD-GCD benchmarks we present.
+
+
+ comment: Accepted in L3D-IVU, CVPR Workshop, 2024 +
+
+
+
+
+ + ☆ Multi-head Attention-based Deep Multiple Instance Learning + + +
+ This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple +Instance Learning model, designed for weakly supervised Whole Slide Images +(WSIs) classification in digital pathology. Inspired by the multi-head +attention mechanism of the Transformer, MAD-MIL simplifies model complexity +while achieving competitive results against advanced models like CLAM and +DS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16, +TCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL. +This demonstrates enhanced information diversity, interpretability, and +efficiency in slide representation. The model's effectiveness, coupled with +fewer trainable parameters and lower computational complexity makes it a +promising solution for automated pathology workflows. Our code is available at +https://github.com/tueimage/MAD-MIL. + +
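+
+ For intuition, the following is a compact PyTorch sketch of multi-head
+attention-based MIL pooling over patch (instance) embeddings of one slide. The
+dimensions, number of heads and classifier head are illustrative assumptions,
+not the MAD-MIL architecture itself.
+
+import torch
+import torch.nn as nn
+
+class MultiHeadAttnMIL(nn.Module):
+    """Attention-based MIL pooling with several independent attention heads."""
+    def __init__(self, in_dim=512, attn_dim=128, heads=4, n_classes=2):
+        super().__init__()
+        self.heads = nn.ModuleList([
+            nn.Sequential(nn.Linear(in_dim, attn_dim), nn.Tanh(), nn.Linear(attn_dim, 1))
+            for _ in range(heads)
+        ])
+        self.classifier = nn.Linear(in_dim * heads, n_classes)
+
+    def forward(self, instances):                       # (N, in_dim) patches of one slide
+        pooled = []
+        for head in self.heads:
+            a = torch.softmax(head(instances), dim=0)   # (N, 1) attention weights
+            pooled.append((a * instances).sum(dim=0))   # (in_dim,) bag embedding
+        return self.classifier(torch.cat(pooled))       # (n_classes,) slide logits
+
+if __name__ == "__main__":
+    model = MultiHeadAttnMIL()
+    patches = torch.randn(1000, 512)     # e.g. features of 1000 WSI patches
+    print(model(patches).shape)          # torch.Size([2])
+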
+
+
+
+
+ + ☆ CNN-based Game State Detection for a Foosball Table + + +
+ The automation of games using Deep Reinforcement Learning Strategies (DRL) is
+a well-known challenge in AI research. While for feature extraction in a video
+game typically the whole image is used, this is hardly practical for many
+real-world games. Instead, using a smaller game state reducing the dimension of the
+parameter space to include essential parameters only seems to be a promising
+approach. In the game of Foosball, a compact and comprehensive game state
+description consists of the positional shifts and rotations of the figures and
+the position of the ball over time. In particular, velocities and accelerations
+can be derived from consecutive time samples of the game state. In this paper,
+a figure detection system to determine the game state in Foosball is presented.
+We capture a dataset containing the rotations of the rods, measured
+using accelerometers, and the positional shifts, derived using traditional
+Computer Vision techniques (in a laboratory setting). This dataset is utilized
+to train Convolutional Neural Network (CNN) based end-to-end regression models
+to predict the rotations and shifts of each rod. We present an evaluation of
+our system using different state-of-the-art CNNs as base architectures for the
+regression model. We show that our system is able to predict the game state
+with high accuracy. By providing data for both black and white teams, the
+presented system is intended to provide the required data for future
+developments of Imitation Learning techniques w.r.t. observing human
+players.
+
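+
+ A minimal sketch of such an end-to-end regression model: a torchvision ResNet18
+backbone predicting one rotation and one shift per rod from a table image. The
+eight-rod output layout, input size and MSE loss are assumptions for
+illustration, not the paper's exact training setup.
+
+import torch
+import torch.nn as nn
+from torchvision.models import resnet18
+
+class RodStateRegressor(nn.Module):
+    """ResNet18 backbone regressing rotation and shift for each rod."""
+    def __init__(self, n_rods=8):
+        super().__init__()
+        backbone = resnet18(weights=None)
+        backbone.fc = nn.Linear(backbone.fc.in_features, 2 * n_rods)
+        self.backbone = backbone
+
+    def forward(self, img):                     # (B, 3, H, W)
+        out = self.backbone(img)                # (B, 2 * n_rods)
+        return out.view(img.size(0), -1, 2)     # (B, n_rods, [rotation, shift])
+
+if __name__ == "__main__":
+    model = RodStateRegressor()
+    frame = torch.randn(4, 3, 224, 224)
+    target = torch.randn(4, 8, 2)
+    loss = nn.functional.mse_loss(model(frame), target)
+    loss.backward()
+    print(float(loss))
+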
+
+
+
+
+ + ☆ Iterative Refinement Strategy for Automated Data Labeling: Facial + Landmark Diagnosis in Medical Imaging + + +
+ Automated data labeling techniques are crucial for accelerating the +development of deep learning models, particularly in complex medical imaging +applications. However, ensuring accuracy and efficiency remains challenging. +This paper presents iterative refinement strategies for automated data labeling +in facial landmark diagnosis to enhance accuracy and efficiency for deep +learning models in medical applications, including dermatology, plastic +surgery, and ophthalmology. Leveraging feedback mechanisms and advanced +algorithms, our approach iteratively refines initial labels, reducing reliance +on manual intervention while improving label quality. Through empirical +evaluation and case studies, we demonstrate the effectiveness of our proposed +strategies in deep learning tasks across medical imaging domains. Our results +highlight the importance of iterative refinement in automated data labeling to +enhance the capabilities of deep learning systems in medical imaging +applications. + +
+
+
+
+
+ + ☆ Comparative Analysis of Image Enhancement Techniques for Brain Tumor + Segmentation: Contrast, Histogram, and Hybrid Approaches CCS + + +
+ This study systematically investigates the impact of image enhancement +techniques on Convolutional Neural Network (CNN)-based Brain Tumor +Segmentation, focusing on Histogram Equalization (HE), Contrast Limited +Adaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing +the U-Net architecture on a dataset of 3064 Brain MRI images, the research +delves into preprocessing steps, including resizing and enhancement, to +optimize segmentation accuracy. A detailed analysis of the CNN-based U-Net +architecture, training, and validation processes is provided. The comparative +analysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals +that the hybrid approach CLAHE-HE consistently outperforms others. Results +highlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing, +and validation, respectively) and robust segmentation overlap, with Jaccard +values of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and +0.9932 for the same phases, emphasizing its potential in neuro-oncological +applications. The study concludes with a call for refinement in segmentation +methodologies to further enhance diagnostic precision and treatment planning in +neuro-oncology. + +
+
+ comment: 9 Pages, & Figures, 2 Tables, International Conference on Computer + Science Electronics and Information (ICCSEI 2023) +
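+
+ A small OpenCV sketch of the hybrid preprocessing idea (CLAHE applied after
+global histogram equalization). The clip limit and tile size below are
+illustrative defaults, not the settings used in the paper.
+
+import cv2
+import numpy as np
+
+def clahe_he(gray, clip_limit=2.0, tile=(8, 8)):
+    """Hybrid enhancement: global HE followed by CLAHE (CLAHE-HE)."""
+    equalized = cv2.equalizeHist(gray)                    # global histogram equalization
+    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile)
+    return clahe.apply(equalized)                         # local contrast enhancement
+
+if __name__ == "__main__":
+    # Synthetic low-contrast image standing in for a real MRI slice.
+    img = (np.random.rand(256, 256) * 60 + 80).astype(np.uint8)
+    out = clahe_he(img)
+    print(out.dtype, out.shape, out.min(), out.max())
+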
+
+
+
+
+ + ☆ Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask + Prompt + + +
+ Text-to-image generation has witnessed great progress, especially with the +recent advancements in diffusion models. Since texts cannot provide detailed +conditions like object appearance, reference images are usually leveraged for +the control of objects in the generated images. However, existing methods still +suffer limited accuracy when the relationship between the foreground and +background is complicated. To address this issue, we develop a framework termed +Mask-ControlNet by introducing an additional mask prompt. Specifically, we +first employ large vision models to obtain masks to segment the objects of +interest in the reference image. Then, the object images are employed as +additional prompts to facilitate the diffusion model to better understand the +relationship between foreground and background regions during image generation. +Experiments show that the mask prompts enhance the controllability of the +diffusion model to maintain higher fidelity to the reference image while +achieving better image quality. Comparison with previous text-to-image +generation methods demonstrates our method's superior quantitative and +qualitative performance on the benchmark datasets. + +
+
+
+
+
+ + ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ CLIPping the Limits: Finding the Sweet Spot for Relevant Images in + Automated Driving Systems Perception Testing + + +
+ Perception systems, especially cameras, are the eyes of automated driving
+systems. Ensuring that they function reliably and robustly is therefore an
+important building block in the automation of vehicles. There are various
+approaches to test the perception of automated driving systems. Ultimately,
+however, it always comes down to the investigation of the behavior of
+perception systems under specific input data. Camera images are a crucial part
+of the input data. Image data sets are therefore collected for the testing of
+automated driving systems, but it is non-trivial to find specific images in
+these data sets. Thanks to recent developments in neural networks, there are
+now methods for sorting the images in a data set according to their similarity
+to a prompt in natural language. To further automate the provision of
+search results, we contribute a method that automates the threshold definition
+in these sorted results and returns only the images relevant to the prompt.
+Our focus is on preventing false positives and false negatives
+equally. It is also important that our method is robust; in case
+our assumptions are not fulfilled, we provide a fallback solution.
+
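+
+ One simple heuristic for turning a sorted similarity list into a
+relevant/irrelevant split is 1-D two-means clustering of the scores and
+thresholding between the two cluster centers; this is only an illustration of
+the thresholding problem, not the method or fallback proposed in the paper.
+
+import numpy as np
+
+def two_means_threshold(scores, iters=50):
+    """Split similarity scores into two clusters and threshold between them."""
+    lo, hi = float(scores.min()), float(scores.max())
+    for _ in range(iters):
+        mid = (lo + hi) / 2.0
+        low, high = scores[scores <= mid], scores[scores > mid]
+        if len(low) == 0 or len(high) == 0:
+            break
+        lo, hi = float(low.mean()), float(high.mean())
+    return (lo + hi) / 2.0
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    # Mixture standing in for image-prompt similarity scores.
+    scores = np.concatenate([rng.normal(0.18, 0.02, 950), rng.normal(0.31, 0.02, 50)])
+    t = two_means_threshold(scores)
+    print(f"threshold={t:.3f}, relevant images={int((scores > t).sum())}")
+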
+
+
+
+
+ + ☆ Human Detection from 4D Radar Data in Low-Visibility Field Conditions ICRA 2024 + + +
+ Autonomous driving technology is increasingly being used on public roads and
+in industrial settings such as mines. While it is essential to detect
+pedestrians, vehicles, or other obstacles, adverse field conditions negatively
+affect the performance of classical sensors such as cameras or lidars. Radar,
+on the other hand, is a promising modality that is less affected by, e.g.,
+dust, smoke, water mist or fog. In particular, modern 4D imaging radars provide
+target responses across the range, vertical angle, horizontal angle and Doppler
+velocity dimensions. We propose TMVA4D, a CNN architecture that leverages this
+4D radar modality for semantic segmentation. The CNN is trained to distinguish
+between the background and person classes based on a series of 2D projections
+of the 4D radar data that include the elevation, azimuth, range, and Doppler
+velocity dimensions. We also outline the process of compiling a novel dataset
+consisting of data collected in industrial settings with a car-mounted 4D radar
+and describe how the ground-truth labels were generated from reference thermal
+images. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an
+mDice score of 86.1%, evaluated on the two classes, background and person.
+
+
+ comment: Submitted to Radar in Robotics workshop at ICRA 2024 +
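+
+ For reference, a small sketch of the reported metrics (mean IoU and mean Dice
+over the background/person classes) computed from integer label maps; shapes and
+class ids here are illustrative toy data, not the paper's evaluation code.
+
+import numpy as np
+
+def miou_mdice(pred, gt, n_classes=2):
+    """Mean IoU and mean Dice over semantic classes for integer label maps."""
+    ious, dices = [], []
+    for c in range(n_classes):
+        p, g = pred == c, gt == c
+        inter = np.logical_and(p, g).sum()
+        union = np.logical_or(p, g).sum()
+        if union == 0:
+            continue                       # class absent in both maps
+        ious.append(inter / union)
+        dices.append(2 * inter / (p.sum() + g.sum()))
+    return float(np.mean(ious)), float(np.mean(dices))
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    gt = (rng.random((128, 128)) > 0.9).astype(int)     # sparse "person" pixels
+    pred = gt.copy()
+    pred[:4] = 0                                        # imperfect prediction
+    print(miou_mdice(pred, gt))
+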
+
+
+
+
+ + ☆ Texture Classification Network Integrating Adaptive Wavelet Transform + + +
+ Graves' disease is a common condition that is diagnosed clinically by
+determining the smoothness of the thyroid texture and its morphology in
+ultrasound images. Currently, the most widely used approach for the automated
+diagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for
+both feature extraction and classification. However, these methods demonstrate
+limited efficacy in capturing texture features. Given the high capacity of
+wavelets in describing texture features, this research integrates learnable
+wavelet modules utilizing the Lifting Scheme into CNNs and incorporates a
+parallel wavelet branch into the ResNet18 model to enhance texture feature
+extraction. Our model can analyze texture features in spatial and frequency
+domains simultaneously, leading to optimized classification accuracy. We
+conducted experiments on collected ultrasound datasets and publicly available
+natural image texture datasets. Our proposed network achieved 97.27% accuracy
+and 95.60% recall on the ultrasound datasets and 60.765% accuracy on the natural
+image texture datasets, surpassing the accuracy of ResNet and confirming the
+effectiveness of our approach.
+
+
+
+
+
+ + ☆ MindSet: Vision. A toolbox for testing DNNs on key psychological + experiments + + +
+ Multiple benchmarks have been developed to assess the alignment between deep +neural networks (DNNs) and human vision. In almost all cases these benchmarks +are observational in the sense they are composed of behavioural and brain +responses to naturalistic images that have not been manipulated to test +hypotheses regarding how DNNs or humans perceive and identify objects. Here we +introduce the toolbox MindSet: Vision, consisting of a collection of image +datasets and related scripts designed to test DNNs on 30 psychological +findings. In all experimental conditions, the stimuli are systematically +manipulated to test specific hypotheses regarding human visual perception and +object recognition. In addition to providing pre-generated datasets of images, +we provide code to regenerate these datasets, offering many configurable +parameters which greatly extend the dataset versatility for different research +contexts, and code to facilitate the testing of DNNs on these image datasets +using three different methods (similarity judgments, out-of-distribution +classification, and decoder method), accessible at +https://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of +these methods as an example of how the toolbox can be used. + +
+
+
+
+
+ + ☆ Detecting Every Object from Events + + +
+ Object detection is critical in autonomous driving, and it is more practical +yet challenging to localize objects of unknown categories: an endeavour known +as Class-Agnostic Object Detection (CAOD). Existing studies on CAOD +predominantly rely on ordinary cameras, but these frame-based sensors usually +have high latency and limited dynamic range, leading to safety risks in +real-world scenarios. In this study, we turn to a new modality enabled by the +so-called event camera, featured by its sub-millisecond latency and high +dynamic range, for robust CAOD. We propose Detecting Every Object in Events +(DEOE), an approach tailored for achieving high-speed, class-agnostic +open-world object detection in event-based vision. Built upon the fast +event-based backbone: recurrent vision transformer, we jointly consider the +spatial and temporal consistencies to identify potential objects. The +discovered potential objects are assimilated as soft positive samples to avoid +being suppressed as background. Moreover, we introduce a disentangled +objectness head to separate the foreground-background classification and novel +object discovery tasks, enhancing the model's generalization in localizing +novel objects while maintaining a strong ability to filter out the background. +Extensive experiments confirm the superiority of our proposed DEOE in +comparison with three strong baseline methods that integrate the +state-of-the-art event-based object detector with advancements in RGB-based +CAOD. Our code is available at https://github.com/Hatins/DEOE. + +
+
+
+
+
+ + ☆ MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues + + +
+ 3D object detection based on roadside cameras is an additional way for
+autonomous driving to alleviate the challenges of occlusion and short
+perception range from vehicle cameras. Previous methods for roadside 3D object
+detection mainly focus on modeling the depth or height of objects, neglecting
+the stationarity of cameras and the characteristic of inter-frame consistency. In
+this work, we propose a novel framework, namely MOSE, for MOnocular 3D object
+detection with Scene cuEs. The scene cues are the frame-invariant
+scene-specific features, which are crucial for object localization and can be
+intuitively regarded as the height between the surface of the real road and the
+virtual ground plane. In the proposed framework, a scene cue bank is designed
+to aggregate scene cues from multiple frames of the same scene with a carefully
+designed extrinsic augmentation strategy. Then, a transformer-based decoder
+lifts the aggregated scene cues as well as the 3D position embeddings for 3D
+object location, which boosts generalization ability in heterologous scenes.
+The extensive experiment results on two public benchmarks demonstrate the
+state-of-the-art performance of the proposed method, which surpasses the
+existing methods by a large margin.
+
+
+
+
+
+ + ☆ Deep Optics for Video Snapshot Compressive Imaging ICCV 2023 + + +
+ Video snapshot compressive imaging (SCI) aims to capture a sequence of video +frames with only a single shot of a 2D detector, whose backbones rest in +optical modulation patterns (also known as masks) and a computational +reconstruction algorithm. Advanced deep learning algorithms and mature hardware +are putting video SCI into practical applications. Yet, there are two clouds in +the sunshine of SCI: i) low dynamic range as a victim of high temporal +multiplexing, and ii) existing deep learning algorithms' degradation on real +system. To address these challenges, this paper presents a deep optics +framework to jointly optimize masks and a reconstruction network. Specifically, +we first propose a new type of structural mask to realize motion-aware and +full-dynamic-range measurement. Considering the motion awareness property in +measurement domain, we develop an efficient network for video SCI +reconstruction using Transformer to capture long-term temporal dependencies, +dubbed Res2former. Moreover, sensor response is introduced into the forward +model of video SCI to guarantee end-to-end model training close to real system. +Finally, we implement the learned structural masks on a digital micro-mirror +device. Experimental results on synthetic and real data validate the +effectiveness of the proposed framework. We believe this is a milestone for +real-world video SCI. The source code and data are available at +https://github.com/pwangcs/DeepOpticsSCI. + +
+
+ comment: Accepted at ICCV 2023 +
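+
+ A tiny sketch of the video SCI forward model such systems build on: a single 2D
+snapshot is the sum of mask-modulated frames. The random binary masks below are
+placeholders, not the learned structural masks proposed in the paper.
+
+import numpy as np
+
+def sci_measurement(frames, masks):
+    """Snapshot compressive measurement: y = sum_t (mask_t * frame_t).
+
+    frames, masks : (T, H, W) video frames and per-frame modulation masks.
+    """
+    return (masks * frames).sum(axis=0)      # single 2D detector readout
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    T, H, W = 8, 64, 64
+    frames = rng.random((T, H, W))
+    masks = (rng.random((T, H, W)) > 0.5).astype(float)   # random binary masks
+    y = sci_measurement(frames, masks)
+    print(y.shape)     # (64, 64): one snapshot encodes 8 frames
+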
+
+
+
+
+ + ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of
+user-specified concepts and has achieved unprecedented progress in handling
+individual concepts. However, when extending to multiple customized concepts,
+existing methods exhibit limitations in terms of flexibility and fidelity, only
+accommodating the combination of limited types of models and potentially
+resulting in a mix of characteristics from different concepts. In this paper,
+we introduce the Multi-concept guidance for Multi-concept customization, termed
+MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the
+requirements for model architecture via inference time optimization, allowing
+the integration of various heterogeneous single-concept customized models. It
+adaptively refines the attention weights between visual and textual tokens,
+directing image regions to focus on their associated words while diminishing
+the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$
+even surpasses previous methods that require additional training in terms of
+consistency with input prompt and reference images. Moreover, MC$^2$ can be
+extended to elevate the compositional capabilities of text-to-image generation,
+yielding appealing results. Code will be publicly available at
+https://github.com/JIANGJiaXiu/MC-2.
+
+
+
+
+
+ + ☆ Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in + Multimodal Large Language Model Security + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities
+that increasingly influence various aspects of our daily lives, constantly
+defining the new boundary of Artificial General Intelligence (AGI). Image
+modalities, enriched with profound semantic information and a more continuous
+mathematical nature compared to other modalities, greatly enhance the
+functionalities of MLLMs when integrated. However, this integration serves as a
+double-edged sword, providing attackers with expansive vulnerabilities to
+exploit for highly covert and harmful attacks. The pursuit of reliable AI
+systems like powerful MLLMs has emerged as a pivotal area of contemporary
+research. In this paper, we endeavor to demonstrate the multifaceted risks
+associated with the incorporation of image modalities into MLLMs. Initially, we
+delineate the foundational components and training processes of MLLMs.
+Subsequently, we construct a threat model, outlining the security
+vulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing
+scholarly discourses on MLLMs' attack and defense mechanisms, culminating in
+suggestions for future research on MLLM security. Through this
+comprehensive analysis, we aim to deepen the academic understanding of MLLM
+security challenges and propel forward the development of trustworthy MLLM
+systems.
+
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ Unsupervised Band Selection Using Fused HSI and LiDAR Attention + Integrating With Autoencoder + + +
+ Band selection in hyperspectral imaging (HSI) is critical for optimising data
+processing and enhancing analytical accuracy. Traditional approaches have
+predominantly concentrated on analysing spectral and pixel characteristics
+within individual bands independently. These approaches overlook the potential
+benefits of integrating multiple data sources, such as Light Detection and
+Ranging (LiDAR), and are further challenged by the limited availability of
+labeled data in HSI processing, which represents a significant obstacle. To
+address these challenges, this paper introduces a novel unsupervised band
+selection framework that incorporates attention mechanisms and an Autoencoder
+for reconstruction-based band selection. Our methodology distinctively
+integrates HSI with LiDAR data through an attention score, using a
+convolutional Autoencoder to process the combined feature mask. This fusion
+effectively captures essential spatial and spectral features and reduces
+redundancy in hyperspectral datasets. A comprehensive comparative analysis of
+our innovative fused band selection approach is performed against existing
+unsupervised band selection and fusion models. We used data sets such as
+Houston 2013, Trento, and MUUFLE for our experiments. The results demonstrate
+that our method achieves superior classification accuracy and significantly
+outperforms existing models. This enhancement in HSI band selection,
+facilitated by the incorporation of LiDAR features, underscores the
+considerable advantages of integrating features from different sources.
+
+
+ comment: 13 pages, 13 figures, 6 tables
+
+
+
+
+
+ + ☆ Text-to-Image Synthesis for Any Artistic Styles: Advancements in + Personalized Artistic Image Generation via Subdivision and Dual Binding + + +
+ Recent advancements in text-to-image models, such as Stable Diffusion, have
+demonstrated their ability to synthesize visual images through natural language
+prompts. One approach to personalizing text-to-image models, exemplified by
+DreamBooth, fine-tunes the pre-trained model by binding unique text identifiers
+with a few images of a specific subject. Although existing fine-tuning methods
+have demonstrated competence in rendering images according to the styles of
+famous painters, it is still challenging to learn to produce images
+encapsulating distinct art styles due to abstract and broad visual perceptions
+of stylistic attributes such as lines, shapes, textures, and colors. In this
+paper, we introduce a new method, Single-StyleForge, for personalization. It
+fine-tunes pre-trained text-to-image diffusion models to generate diverse
+images in specified styles from text prompts. By using around 15-20 images of
+the target style, the approach establishes a foundational binding of a unique
+token identifier with a broad range of the target style. It also utilizes
+auxiliary images to strengthen this binding, thereby offering specific
+guidance on representing elements such as persons in a target style-consistent
+manner. In addition, we present ways to improve the quality of style and
+text-image alignment through a method called Multi-StyleForge, which inherits
+the strategy used in StyleForge and learns tokens for multiple styles. Experimental
+evaluation conducted on six distinct artistic styles demonstrates substantial
+improvements in both the quality of generated images and the perceptual
+fidelity metrics, such as FID, KID, and CLIP scores.
+
+
+ comment: 20 pages, 12 figures
+
+
+
+
+
+ + ☆ CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) aims to improve low-illumination images.
+However, existing methods face two challenges: (1) uncertainty in restoration
+from diverse brightness degradations; (2) loss of texture and color information
+caused by noise suppression and light enhancement. In this paper, we propose a
+novel enhancement approach, CodeEnhance, by leveraging quantized priors and
+image refinement to address these challenges. In particular, we reframe LLIE as
+learning an image-to-code mapping from low-light images to a discrete codebook
+that has been learned from high-quality images. To enhance this process, a
+Semantic Embedding Module (SEM) is introduced to integrate semantic information
+with low-level features, and a Codebook Shift (CS) mechanism is designed to adapt
+the pre-learned codebook to better suit the distinct characteristics of our
+low-light dataset. Additionally, we present an Interactive Feature
+Transformation (IFT) module to refine texture and color information during
+image reconstruction, allowing for interactive enhancement based on user
+preferences. Extensive experiments on both real-world and synthetic benchmarks
+demonstrate that the incorporation of prior knowledge and controllable
+information transfer significantly enhances LLIE performance in terms of
+quality and fidelity. The proposed CodeEnhance exhibits superior robustness to
+various degradations, including uneven illumination, noise, and color
+distortion.
+
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ Allowing humans to interactively guide machines where to look does not + always improve a human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps
+\cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam}
+have been established as a common means for explaining the input features that
+are important to AI's decisions. It is an interesting but unexplored question
+whether allowing users to edit the importance scores of input features at test
+time would improve the human-AI team's accuracy on downstream tasks. In this
+paper, we address this question by taking CHM-Corr, a state-of-the-art,
+ante-hoc explanation method \cite{taesiri2022visual} that first predicts
+patch-wise correspondences between the input and the training-set images, and
+then uses them to make classification decisions. We build an interactive
+interface on top of CHM-Corr, enabling users to directly edit the initial
+feature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,
+users gain insights into if, when, and how the model changes its outputs,
+enhancing understanding beyond static explanations. Our user study with 18
+machine learning researchers who performed $\sim$1,400 decisions shows that our
+interactive approach does not improve user accuracy on CUB-200 bird image
+classification over static explanations. This challenges the belief that
+interactivity inherently boosts XAI
+effectiveness~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}
+and raises the need for future research. Our work contributes to the field by
+open-sourcing an interactive tool for manipulating model attention, and it lays
+the groundwork for future research to enable effective human-AI interaction in
+computer vision. We release code and data on
+\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our
+interface is available \href{http://137.184.82.109:7080/}{here}.
+
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ☆ Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation + + +
+ Recently, a surge of 3D style transfer methods has been proposed that +leverage the scene reconstruction power of a pre-trained neural radiance field +(NeRF). To successfully stylize a scene this way, one must first reconstruct a +photo-realistic radiance field from collected images of the scene. However, +when only sparse input views are available, pre-trained few-shot NeRFs often +suffer from high-frequency artifacts, which are generated as a by-product of +high-frequency details for improving reconstruction quality. Is it possible to +generate more faithful stylized scenes from sparse inputs by directly +optimizing encoding-based scene representation with target style? In this +paper, we consider the stylization of sparse-view scenes in terms of +disentangling content semantics and style textures. We propose a coarse-to-fine +sparse-view scene stylization framework, where a novel hierarchical +encoding-based neural representation is designed to generate high-quality +stylized scenes directly from implicit scene representations. We also propose a +new optimization strategy with content strength annealing to achieve realistic +stylization and better content preservation. Extensive experiments demonstrate +that our method can achieve high-quality stylization of sparse-view scenes and +outperforms fine-tuning-based baselines in terms of stylization quality and +efficiency. + +
+
+
+
+
+ + ☆ PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly + Detection CVPR2024 + + +
+ The vision-language model has brought great improvement to few-shot
+industrial anomaly detection, which usually requires designing hundreds of
+prompts through prompt engineering. For automated scenarios, we first use
+conventional prompt learning with the many-class paradigm as the baseline to
+automatically learn prompts, but find that it cannot work well in one-class
+anomaly detection. To address the above problem, this paper proposes a
+one-class prompt learning method for few-shot anomaly detection, termed
+PromptAD. First, we propose semantic concatenation which can transpose normal
+prompts into anomaly prompts by concatenating normal prompts with anomaly
+suffixes, thus constructing a large number of negative samples used to guide
+prompt learning in the one-class setting. Furthermore, to mitigate the training
+challenge caused by the absence of anomaly images, we introduce the concept of
+explicit anomaly margin, which is used to explicitly control the margin between
+normal prompt features and anomaly prompt features through a hyper-parameter.
+For image-level/pixel-level anomaly detection, PromptAD achieves first place in
+11/12 few-shot settings on MVTec and VisA.
+
+
+ comment: Accepted by CVPR2024 +
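+
+ A toy illustration of the semantic-concatenation idea: anomaly prompts are built
+by appending anomaly suffixes to normal prompts, yielding negative samples for
+one-class prompt learning. The templates and suffixes are made up for the
+example; the actual method learns prompt tokens rather than fixed strings.
+
+# Illustrative only: shows how normal prompts can be expanded into anomaly
+# prompts via suffixes, not the learned prompts used by PromptAD.
+NORMAL_TEMPLATES = [
+    "a photo of a flawless {}",
+    "a close-up photo of a {} in perfect condition",
+]
+ANOMALY_SUFFIXES = ["with a scratch", "with a crack", "with a hole", "with contamination"]
+
+def build_prompts(object_name):
+    normal = [t.format(object_name) for t in NORMAL_TEMPLATES]
+    anomalous = [f"{p} {s}" for p in normal for s in ANOMALY_SUFFIXES]
+    return normal, anomalous
+
+if __name__ == "__main__":
+    normal, anomalous = build_prompts("metal nut")
+    print(len(normal), "normal prompts,", len(anomalous), "anomaly prompts")
+    print(anomalous[0])
+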
+
+
+
+
+ + ☆ LayoutLLM: Layout Instruction Tuning with Large Language Models for + Document Understanding CVPR 2024 + + +
+ Recently, leveraging large language models (LLMs) or multimodal large +language models (MLLMs) for document understanding has been proven very +promising. However, previous works that employ LLMs/MLLMs for document +understanding have not fully explored and utilized the document layout +information, which is vital for precise document understanding. In this paper, +we propose LayoutLLM, an LLM/MLLM based method for document understanding. The +core of LayoutLLM is a layout instruction tuning strategy, which is specially +designed to enhance the comprehension and utilization of document layouts. The +proposed layout instruction tuning strategy consists of two components: +Layout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture +the characteristics of document layout in Layout-aware Pre-training, three +groups of pre-training tasks, corresponding to document-level, region-level and +segment-level information, are introduced. Furthermore, a novel module called +layout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on +regions relevant to the question and generate accurate answers. LayoutCoT is +effective for boosting the performance of document understanding. Meanwhile, it +brings a certain degree of interpretability, which could facilitate manual +inspection and correction. Experiments on standard benchmarks show that the +proposed LayoutLLM significantly outperforms existing methods that adopt +open-source 7B LLMs/MLLMs for document understanding. The training data of the +LayoutLLM is publicly available at +https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ StylizedGS: Controllable Stylization for 3D Gaussian Splatting + + +
+ With the rapid development of XR, 3D generation and editing are becoming more
+and more important, among which stylization is an important tool for 3D
+appearance editing. It can achieve consistent 3D artistic stylization given a
+single reference style image and is thus a user-friendly way of editing. However,
+recent NeRF-based 3D stylization methods face efficiency issues that affect the
+actual user experience, and their implicit nature limits their ability to transfer
+geometric pattern styles. Additionally, the ability for artists to exert
+flexible control over stylized scenes is considered highly desirable, fostering
+an environment conducive to creative exploration. In this paper, we introduce
+StylizedGS, a 3D neural style transfer framework with adaptable control over
+perceptual factors based on the 3D Gaussian Splatting (3DGS) representation. The
+3DGS brings the benefits of high efficiency. We propose a GS filter to
+eliminate floaters in the reconstruction before stylization, as they would
+otherwise affect the stylization results. Then the nearest neighbor-based style
+loss is introduced to
+achieve stylization by fine-tuning the geometry and color parameters of 3DGS,
+while a depth preservation loss with other regularizations is proposed to
+prevent the tampering of geometry content. Moreover, facilitated by specially
+designed losses, StylizedGS enables users to control color, stylization scale and
+regions during the stylization, providing customized capabilities. Our method
+can attain high-quality stylization results characterized by faithful
+brushstrokes and geometric consistency with flexible controls. Extensive
+experiments across various scenes and styles demonstrate the effectiveness and
+efficiency of our method concerning both stylization quality and inference FPS.
+
+
+
+
+
+ + ☆ Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware + Trajectory Conditioning CVPR + + +
+ Human pose forecasting garners attention for its diverse applications. +However, challenges in modeling the multi-modal nature of human motion and +intricate interactions among agents persist, particularly with longer +timescales and more agents. In this paper, we propose an interaction-aware +trajectory-conditioned long-term multi-agent human pose forecasting model, +utilizing a coarse-to-fine prediction approach: multi-modal global trajectories +are initially forecasted, followed by respective local pose forecasts +conditioned on each mode. In doing so, our Trajectory2Pose model introduces a +graph-based agent-wise interaction module for a reciprocal forecast of local +motion-conditioned global trajectory and trajectory-conditioned local pose. Our +model effectively handles the multi-modality of human motion and the complexity +of long-term multi-agent interactions, improving performance in complex +environments. Furthermore, we address the lack of long-term (6s+) multi-agent +(5+) datasets by constructing a new dataset from real-world images and 2D +annotations, enabling a comprehensive evaluation of our proposed model. +State-of-the-art prediction performance on both complex and simpler datasets +confirms the generalized effectiveness of our method. The code is available at +https://github.com/Jaewoo97/T2P. + +
+
+ comment: 2024 CVPR Highlight +
+
+
+
+
+ + ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+ + ☆ DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage + CJK Character Generation + + +
+ Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,
+have a profound influence on society and culture. The typesetting of CJK languages
+carries a wide range of requirements due to the complexity of their scripts and
+unique literary traditions. A critical aspect of this typesetting process is
+that CJK fonts need to provide a set of consistent-looking glyphs for
+approximately one hundred thousand characters. However, creating such a font is
+inherently labor-intensive and expensive, which significantly hampers the
+development of new CJK fonts for typesetting, historical, aesthetic, or
+artistic purposes.
+ To bridge this gap, we are motivated by recent advancements in
+diffusion-based generative models and propose a novel diffusion method for
+generating glyphs in a targeted style from a \emph{single} conditioned,
+standard glyph form. Our experiments show that our method is capable of
+generating fonts of both printed and hand-written styles, the latter of which
+presents a greater challenge. Moreover, our approach shows remarkable zero-shot
+generalization capabilities for non-CJK but Chinese-inspired scripts. We also
+show our method facilitates smooth style interpolation and generates bitmap
+images suitable for vectorization, which is crucial in the font creation
+process. In summary, our proposed method opens the door to high-quality,
+generative model-assisted font creation for CJK characters, for both
+typesetting and artistic endeavors.
+
+
+
+
+
+ + ☆ Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image + Clustering IJCNN 2024 + + +
+ Hyperspectral image (HSI) clustering is a challenging task due to its high
+complexity. Although subspace clustering shows impressive performance for HSI,
+traditional methods tend to ignore the global-local interaction in HSI data. In
+this study, we propose a multi-level graph subspace contrastive learning
+(MLGSC) method for HSI clustering. The model is divided into the following main parts.
+Graph convolution subspace construction: utilizing spectral and texture
+features to construct two graph convolution views. Local-global graph
+representation: local graph representations were obtained by step-by-step
+convolutions and a more representative global graph representation was obtained
+using an attention-based pooling strategy. Multi-level graph subspace
+contrastive learning: multi-level contrastive learning was conducted to obtain
+local-global joint graph representations, to improve the consistency of the
+positive samples between views, and to obtain more robust graph embeddings.
+Specifically, graph-level contrastive learning is used to better learn global
+representations of HSI data. Node-level intra-view and inter-view contrastive
+learning is designed to learn joint representations of local regions of HSI.
+The proposed model is evaluated on four popular HSI datasets: Indian Pines,
+Pavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,
+99.96%, 92.28%, and 95.73%, which significantly outperform the current
+state-of-the-art clustering methods.
+
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ Bidirectional Long-Range Parser for Sequential Data Understanding + + +
+ The transformer is a powerful data modelling framework responsible for
+remarkable performance on a wide range of tasks. However, it is limited in
+terms of scalability, as processing long-sequence data is suboptimal and
+inefficient. To this end, we introduce BLRP (Bidirectional Long-Range
+Parser), a novel and versatile attention mechanism designed to increase
+performance and efficiency on long-sequence tasks. It leverages short- and long-range
+heuristics in the form of a local sliding window approach combined with a
+global bidirectional latent space synthesis technique. We show the benefits and
+versatility of our approach on vision and language domains by demonstrating
+competitive results against state-of-the-art methods on the Long-Range-Arena
+and CIFAR benchmarks together with ablations demonstrating the computational
+efficiency.
+
+
+
+
+
+ + ☆ iVPT: Improving Task-relevant Information Sharing in Visual Prompt + Tuning by Cross-layer Dynamic Connection + + +
+ Recent progress has shown great potential of visual prompt tuning (VPT) when +adapting pre-trained vision transformers to various downstream tasks. However, +most existing solutions independently optimize prompts at each layer, thereby +neglecting the usage of task-relevant information encoded in prompt tokens +across layers. Additionally, existing prompt structures are prone to +interference from task-irrelevant noise in input images, which can do harm to +the sharing of task-relevant information. In this paper, we propose a novel VPT +approach, \textbf{iVPT}. It innovatively incorporates a cross-layer dynamic +connection (CDC) for input prompt tokens from adjacent layers, enabling +effective sharing of task-relevant information. Furthermore, we design a +dynamic aggregation (DA) module that facilitates selective sharing of +information between layers. The combination of CDC and DA enhances the +flexibility of the attention process within the VPT framework. Building upon +these foundations, iVPT introduces an attentive reinforcement (AR) mechanism, +by automatically identifying salient image tokens, which are further enhanced +by prompt tokens in an additive manner. Extensive experiments on 24 image +classification and semantic segmentation benchmarks clearly demonstrate the +advantage of the proposed iVPT, compared to the state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ SoundingActions: Learning How Actions Sound from Narrated Egocentric + Videos CVPR 2024 + + +
+ We propose a novel self-supervised embedding to learn how actions sound from +narrated in-the-wild egocentric videos. Whereas existing methods rely on +curated data with known audio-visual correspondence, our multimodal +contrastive-consensus coding (MC3) embedding reinforces the associations +between audio, language, and vision when all modality pairs agree, while +diminishing those associations when any one pair does not. We show our approach +can successfully discover how the long tail of human actions sound from +egocentric video, outperforming an array of recent multimodal embedding +techniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal +tasks. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://vision.cs.utexas.edu/projects/soundingactions +
+
+
+
+
+ + ☆ A secure and private ensemble matcher using multi-vault obfuscated + templates + + +
+ Given the irrevocability of biometric samples and mounting privacy concerns, +biometric template security and secure matching are among the essential +features of any well-designed modern biometric system. In this paper, we +propose an obfuscation method that hides the biometric template information +with just enough chaff. The main idea is to reduce the number of chaff points +to a practical level by creating n sub-templates from the original template and +hiding each sub-template with m chaff points. During verification, s closest +vectors to the biometric query are retrieved from each vault and then combined +to generate hash values that are compared with the stored hash value. We +demonstrate the effectiveness of synthetic facial images, generated by a +Generative Adversarial Network (GAN), as ``random chaff points'' within a +secure-vault authorization system. This approach safeguards user identities +during training and deployment. We tested our protocol using the AT&T, GT, and +LFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and +0.90, respectively. These numbers were close to those of the unprotected +templates, showing that our method does not adversely affect accuracy. + +
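+
+ A highly simplified sketch of the vault idea described above: split a template
+into sub-templates, hide each among chaff vectors, and verify by hashing the
+closest vault entries to a query. The split size, chaff count, distance metric
+and hash construction are all illustrative assumptions, not the paper's protocol,
+and random vectors stand in for face embeddings and GAN-generated chaff.
+
+import hashlib
+import numpy as np
+
+def make_vaults(template, n_sub=4, m_chaff=50, seed=0):
+    """Split a template into n_sub parts and hide each among m_chaff chaff vectors."""
+    rng = np.random.default_rng(seed)
+    vaults = []
+    for sub in np.array_split(template, n_sub):
+        chaff = rng.normal(size=(m_chaff, sub.shape[0]))
+        vault = np.vstack([sub[None, :], chaff])
+        vaults.append(vault[rng.permutation(len(vault))])   # shuffle genuine + chaff
+    return vaults
+
+def vault_hash(vaults, query, s=1):
+    """Hash the s nearest vault entries to each query sub-vector."""
+    digest = hashlib.sha256()
+    for vault, part in zip(vaults, np.array_split(query, len(vaults))):
+        d = np.linalg.norm(vault - part[None, :], axis=1)
+        nearest = vault[np.argsort(d)[:s]]
+        digest.update(nearest.tobytes())
+    return digest.hexdigest()
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(1)
+    enrol = rng.normal(size=128)                       # enrolment embedding (stand-in)
+    vaults = make_vaults(enrol)
+    stored = vault_hash(vaults, enrol)                 # stored at enrolment time
+    probe = enrol + rng.normal(scale=0.01, size=128)   # genuine, slightly noisy query
+    print(stored == vault_hash(vaults, probe))         # True: same entries retrieved
+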
+
+
+
+
+ + ☆ HSViT: Horizontally Scalable Vision Transformer + + +
+ While the Vision Transformer (ViT) architecture gains prominence in computer +vision and attracts significant attention from multimedia communities, its +deficiency in prior knowledge (inductive bias) regarding shift, scale, and +rotational invariance necessitates pre-training on large-scale datasets. +Furthermore, the growing layers and parameters in both ViT and convolutional +neural networks (CNNs) impede their applicability to mobile multimedia +services, primarily owing to the constrained computational resources on edge +devices. To mitigate the aforementioned challenges, this paper introduces a +novel horizontally scalable vision transformer (HSViT). Specifically, a novel +image-level feature embedding allows ViT to better leverage the inductive bias +inherent in the convolutional layers. Based on this, an innovative horizontally +scalable architecture is designed, which reduces the number of layers and +parameters of the models while facilitating collaborative training and +inference of ViT models across multiple nodes. The experimental results depict +that, without pre-training on large-scale datasets, HSViT achieves up to 10% +higher top-1 accuracy than state-of-the-art schemes, ascertaining its superior +preservation of inductive bias. The code is available at +https://github.com/xuchenhao001/HSViT. + +
+
+
+
+
+ + ☆ LGSDF: Continual Global Learning of Signed Distance Fields Aided by + Local Updating + + +
+ Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves +training a neural network to regress the signed distance from any point to the +nearest obstacle, which has the advantages of lightweight storage and +continuous querying. However, existing algorithms usually rely on conflicting +raw observations as training data, resulting in poor map performance. In this +paper, we propose LGSDF, an ESDF continual Global learning algorithm aided by +Local updating. At the front end, axis-aligned grids are dynamically updated by +pre-processed sensor observations, where incremental fusion alleviates +estimation error caused by limited viewing directions. At the back end, a +randomly initialized implicit ESDF neural network performs continual +self-supervised learning guided by these grids to generate smooth and +continuous maps. The results on multiple scenes show that LGSDF can construct +more accurate ESDF maps and meshes compared with SOTA (State Of The Art) +explicit and implicit mapping algorithms. The source code of LGSDF is publicly +available at https://github.com/BIT-DYN/LGSDF. + +
+
+
+
+
+ + ☆ Progressive Alignment with VLM-LLM Feature to Augment Defect + Classification for the ASE Dataset + + +
+ Traditional defect classification approaches face two barriers.
+(1) Insufficient training data and unstable data quality. Collecting sufficient
+defective samples is expensive and time-consuming, consequently leading to dataset
+variance. This introduces difficulties in recognition and learning. (2)
+Over-dependence on the visual modality. When the image pattern and texture are
+monotonic for all defect classes in a given dataset, the performance of a
+conventional AOI system cannot be guaranteed. In scenarios where image quality
+is compromised due to mechanical failures or when defect information is
+inherently difficult to discern, the performance of deep models cannot be
+guaranteed either. A main question is, "how to solve those two problems when they
+occur at the same time?" A feasible strategy is to explore another feature
+within the dataset and to combine an eminent vision-language model (VLM) and
+large language model (LLM) with their astonishing zero-shot capability. In this
+work, we first propose the special ASE dataset for defect classification, which
+includes rich data descriptions recorded on images but whose defect features are
+difficult to learn directly. Secondly, we present VLM-LLM prompting for
+defect classification on the proposed ASE dataset to activate extra-modality
+features from images and enhance performance. Then, we design a novel
+progressive feature alignment (PFA) block to refine image-text features and
+alleviate the difficulty of alignment under the few-shot scenario. Finally, the
+proposed Cross-modality attention fusion (CMAF) module can effectively fuse
+features from different modalities. Experimental results demonstrate our method's
+effectiveness over several defect classification methods on the ASE dataset.
+
+
+ comment: MULA 2024 +
+
+
+
+
+ + ☆ Adaptive Learning for Multi-view Stereo Reconstruction + + +
+ Deep learning has recently demonstrated its excellent performance on the task +of multi-view stereo (MVS). However, loss functions applied for deep MVS are +rarely studied. In this paper, we first analyze existing loss functions' +properties for deep depth based MVS approaches. Regression based loss leads to +inaccurate continuous results by computing mathematical expectation, while +classification based loss outputs discretized depth values. To this end, we +then propose a novel loss function, named adaptive Wasserstein loss, which is +able to narrow down the difference between the true and predicted probability +distributions of depth. Besides, a simple but effective offset module is +introduced to better achieve sub-pixel prediction accuracy. Extensive +experiments on different benchmarks, including DTU, Tanks and Temples and +BlendedMVS, show that the proposed method with the adaptive Wasserstein loss +and the offset module achieves state-of-the-art performance. + +
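+ As a rough illustration of the loss idea (a minimal sketch, not the paper's
+exact adaptive formulation), the Wasserstein-1 distance between a predicted
+depth distribution and a ground-truth distribution defined on the same
+discrete hypotheses can be computed from their cumulative distributions; the
+one-hot ground-truth encoding and the bin layout below are assumptions made
+only for this example.
+
+import numpy as np
+
+def wasserstein1_depth_loss(pred_probs, gt_depth, depth_bins):
+    # pred_probs: (D,) softmax over D discrete depth hypotheses
+    # depth_bins: (D,) increasing depth values of those hypotheses
+    # Illustrative choice: encode the ground truth as a one-hot at the nearest bin
+    gt_probs = np.zeros_like(pred_probs)
+    gt_probs[np.argmin(np.abs(depth_bins - gt_depth))] = 1.0
+    # For 1-D distributions, W1 is the area between the two cumulative distributions
+    cdf_gap = np.abs(np.cumsum(pred_probs) - np.cumsum(gt_probs))
+    widths = np.diff(depth_bins, append=depth_bins[-1])  # last gap contributes zero
+    return float(np.sum(cdf_gap * widths))
+
+bins = np.linspace(0.5, 10.0, 64)
+pred = np.exp(-(bins - 4.2) ** 2); pred /= pred.sum()
+print(wasserstein1_depth_loss(pred, gt_depth=4.0, depth_bins=bins))
+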
+
+
+
+
+ + ☆ GloSoFarID: Global multispectral dataset for Solar Farm IDentification + in satellite imagery + + +
+ Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal +solution in the global pursuit of clean and renewable energy. This technology +addresses the urgent need for sustainable energy alternatives by converting +solar power into electricity without greenhouse gas emissions. It not only +curtails global carbon emissions but also reduces reliance on finite, +non-renewable energy sources. In this context, monitoring solar panel farms +becomes essential for understanding and facilitating the worldwide shift toward +clean energy. This study contributes to this effort by developing the first +comprehensive global dataset of multispectral satellite imagery of solar panel +farms. This dataset is intended to form the basis for training robust machine +learning models, which can accurately map and analyze the expansion and +distribution of solar panel farms globally. The insights gained from this +endeavor will be instrumental in guiding informed decision-making for a +sustainable energy future. https://github.com/yzyly1992/GloSoFarID + +
+
+
+
+
+ + ☆ QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease + Diagnosis + + +
+ Due to the complexity of medical image acquisition and the difficulty of
+annotation, medical image datasets inevitably contain noise. Noisy data with
+wrong labels affects the robustness and generalization ability of deep neural
+networks. Previous noise learning methods mainly considered noise arising from
+images being mislabeled, i.e. label noise, assuming that all mislabeled images
+are of high image quality. However, medical images are prone to extreme
+quality issues, i.e. data noise, where discriminative visual features are
+missing for disease diagnosis. In this paper, we propose a noise learning
+framework, termed QMix, that learns a robust disease diagnosis model under
+mixed noise. QMix alternates between sample separation and quality-aware
+semi-supervised training in each training epoch. In the sample separation
+phase, we design a joint uncertainty-loss criterion to effectively separate
+(1) correctly labeled images, (2) mislabeled images with high quality, and (3)
+mislabeled images with low quality. In the semi-supervised training phase, we
+train a disease diagnosis model to learn robust feature representations from
+the separated samples. Specifically, we devise a sample-reweighting loss to
+mitigate the effect of mislabeled images with low quality during training.
+Meanwhile, a contrastive enhancement loss is proposed to further distinguish
+mislabeled images with low quality from correctly labeled images. QMix
+achieves state-of-the-art disease diagnosis performance on five public retinal
+image datasets and exhibits substantial improvement in robustness against
+mixed noise.
+
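+ The following minimal sketch shows one plausible reading of a joint
+uncertainty-loss separation rule; the fixed thresholds and the use of
+predictive entropy as the uncertainty measure are illustrative assumptions,
+not the paper's actual criterion.
+
+import numpy as np
+
+def separate_samples(probs, labels, loss_thresh=1.0, unc_thresh=0.8):
+    # probs: (n, c) softmax outputs, labels: (n,) integer class labels
+    n = probs.shape[0]
+    per_sample_loss = -np.log(probs[np.arange(n), labels] + 1e-8)
+    entropy = -(probs * np.log(probs + 1e-8)).sum(axis=1)  # predictive uncertainty
+    clean = per_sample_loss < loss_thresh
+    # High loss but confident prediction: likely mislabeled yet high-quality image
+    mislabeled_hq = (~clean) & (entropy < unc_thresh)
+    # High loss and high uncertainty: likely low-quality image (data noise)
+    mislabeled_lq = (~clean) & (entropy >= unc_thresh)
+    return clean, mislabeled_hq, mislabeled_lq
+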
+
+
+
+
+ + ☆ Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular + Videos ICLR 2024 + + +
+ In this work, we pioneer Semantic Flow, a neural semantic representation of
+dynamic scenes from monocular videos. In contrast to previous NeRF methods that
+reconstruct dynamic scenes from the colors and volume densities of individual
+points, Semantic Flow learns semantics from continuous flows that contain rich
+3D motion information. As there is a 2D-to-3D ambiguity problem in the viewing
+direction when extracting 3D flow features from 2D video frames, we consider
+the volume densities as opacity priors that describe the contributions of flow
+features to the semantics on the frames. More specifically, we first learn a
+flow network to predict flows in the dynamic scene, and propose a flow feature
+aggregation module to extract flow features from video frames. Then, we propose
+a flow attention module to extract motion information from flow features, which
+is followed by a semantic network to output semantic logits of flows. We
+integrate the logits with volume densities in the viewing direction to
+supervise the flow features with semantic labels on video frames. Experimental
+results show that our model is able to learn from multiple dynamic scenes and
+supports a series of new tasks such as instance-level scene editing, semantic
+completion, dynamic scene tracking and semantic adaptation on novel scenes.
+Codes are available at https://github.com/tianfr/Semantic-Flow/.
+
+
+ comment: Accepted by ICLR 2024, Codes are available at + https://github.com/tianfr/Semantic-Flow/ +
+
+
+
+
+ + ☆ UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic + Segmentation in Adverse Weather CVPR 2024 + + +
+ LiDAR semantic segmentation (LSS) is a critical task in autonomous driving
+and has achieved promising progress. However, prior LSS methods are
+conventionally investigated and evaluated on datasets within the same domain in
+clear weather. The robustness of LSS models in unseen scenes and all weather
+conditions is crucial for ensuring safety and reliability in real applications.
+To this end, we propose UniMix, a universal method that enhances the
+adaptability and generalizability of LSS models. UniMix first leverages
+physically valid adverse weather simulation to construct a Bridge Domain, which
+serves to bridge the domain gap between the clear weather scenes and the
+adverse weather scenes. Then, a Universal Mixing operator is defined regarding
+spatial, intensity, and semantic distributions to create the intermediate
+domain with mixed samples from given domains. Integrating the proposed two
+techniques into a teacher-student framework, UniMix efficiently mitigates the
+domain gap and enables LSS models to learn weather-robust and domain-invariant
+representations. We apply UniMix to two main setups: 1) unsupervised domain
+adaptation, adapting the model from the clear weather source domain to the
+adverse weather target domain; 2) domain generalization, learning a model that
+generalizes well to unseen scenes in adverse weather. Extensive experiments
+validate the effectiveness of UniMix across different tasks and datasets, all
+achieving superior performance over state-of-the-art methods. The code will be
+released.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Clinical Efficiency through LLM: Discharge Note Generation for + Cardiac Patients + + +
+ Medical documentation, including discharge notes, is crucial for ensuring
+patient care quality, continuity, and effective medical communication. However,
+the manual creation of these documents is not only time-consuming but also
+prone to inconsistencies and potential errors. The automation of this
+documentation process using artificial intelligence (AI) represents a promising
+area of innovation in healthcare. This study directly addresses the
+inefficiencies and inaccuracies in creating discharge notes manually,
+particularly for cardiac patients, by employing AI techniques, specifically
+large language models (LLMs). Utilizing a substantial dataset from a cardiology
+center, encompassing wide-ranging medical records and physician assessments,
+our research evaluates the capability of LLMs to enhance the documentation
+process. Among the various models assessed, Mistral-7B distinguished itself by
+accurately generating discharge notes that significantly improve both
+documentation efficiency and the continuity of care for patients. These notes
+underwent rigorous qualitative evaluation by medical experts, receiving high
+marks for their clinical relevance, completeness, readability, and contribution
+to informed decision-making and care planning. Coupled with quantitative
+analyses, these results confirm Mistral-7B's efficacy in distilling complex
+medical information into concise, coherent summaries. Overall, our findings
+illuminate the considerable promise of specialized LLMs, such as Mistral-7B, in
+refining healthcare documentation workflows and advancing patient care. This
+study lays the groundwork for further integrating advanced AI technologies in
+healthcare, demonstrating their potential to revolutionize patient
+documentation and support better care outcomes.
+
+
+ comment: 10 pages, 1 figure, 3 tables, conference +
+
+
+
+
+ + ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2022 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though
+LiDAR-based detectors have achieved impressive performance, the high cost of
+LiDAR sensors precludes their widespread adoption in affordable vehicles.
+Camera-based detectors are cheaper alternatives but often suffer inferior
+performance compared to their LiDAR-based counterparts due to inherent depth
+ambiguities in images. In this work, we seek to improve monocular 3D detectors
+by leveraging unlabeled historical LiDAR data. Specifically, at inference time,
+we assume that the camera-based detectors have access to multiple unlabeled
+LiDAR scans from past traversals at locations of interest (potentially from
+other high-end vehicles equipped with LiDAR sensors). Under this setup, we
+propose a novel, simple, and end-to-end trainable framework, termed
+AsyncDepth, to effectively extract relevant features from asynchronous LiDAR
+traversals of the same location for monocular 3D detectors. We show consistent
+and significant performance gains (up to 9 AP) across multiple state-of-the-art
+models and datasets with a negligible additional latency of 9.66 ms and a small
+storage cost.
+
+
+ comment: Accepted by ICRA 2022. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ☆ Self-Supervised Multi-Object Tracking with Path Consistency CVPR 2024 + + +
+ In this paper, we propose a novel concept of path consistency to learn robust
+object matching without using manual object identity supervision. Our key idea
+is that, to track an object through frames, we can obtain multiple different
+association results from a model by varying the frames it can observe, i.e.,
+skipping frames in observation. As the differences in observations do not alter
+the identities of objects, the obtained association results should be
+consistent. Based on this rationale, we generate multiple observation paths,
+each specifying a different set of frames to be skipped, and formulate the Path
+Consistency Loss, which enforces that the association results remain consistent
+across different observation paths. We use the proposed loss to train our
+object matching model with only self-supervision. Through extensive experiments
+on three tracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that
+our method outperforms existing unsupervised methods with consistent margins on
+various evaluation metrics, and even achieves performance close to supervised
+methods.
+
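+ A toy sketch of the path-consistency idea under simplifying assumptions (soft
+association via cosine similarity, two observation paths over three frames);
+the real model and loss differ, but the consistency constraint between a
+direct association and a chained association looks roughly like this.
+
+import numpy as np
+
+def soft_assign(feat_a, feat_b, tau=0.1):
+    # Row-softmax over cosine similarities: P(detection j in b | detection i in a)
+    a = feat_a / np.linalg.norm(feat_a, axis=1, keepdims=True)
+    b = feat_b / np.linalg.norm(feat_b, axis=1, keepdims=True)
+    logits = a @ b.T / tau
+    logits -= logits.max(axis=1, keepdims=True)
+    p = np.exp(logits)
+    return p / p.sum(axis=1, keepdims=True)
+
+def path_consistency_loss(f0, f1, f2):
+    # Path A: associate frame 0 -> frame 2 directly (frame 1 is skipped)
+    direct = soft_assign(f0, f2)
+    # Path B: associate 0 -> 1 -> 2 by chaining the soft assignments
+    chained = soft_assign(f0, f1) @ soft_assign(f1, f2)
+    # Penalize disagreement between the two association distributions (symmetric KL)
+    eps = 1e-8
+    kl = lambda p, q: np.sum(p * np.log((p + eps) / (q + eps)), axis=1)
+    return np.mean(kl(direct, chained) + kl(chained, direct))
+
+rng = np.random.default_rng(0)
+f0, f1, f2 = (rng.normal(size=(5, 32)) for _ in range(3))
+print(path_consistency_loss(f0, f1, f2))
+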
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Image-based Agarwood Resinous Area Segmentation using Deep Learning + + +
+ The manual extraction method of Agarwood resinous compound is laborious work, +requires skilled workers, and is subject to human errors. Commercial Agarwood +industries have been actively exploring using Computer Numerical Control (CNC) +machines to replace human effort for this particular task. The CNC machine +accepts a G-code script produced from a binary image in which the wood region +that needs to be chiselled off is marked with (0, 0, 0) as its RGB value. +Rather than requiring a human expert to perform the region marking, we propose +using a Deep learning image segmentation method instead. Our setup involves a +camera that captures the cross-section image and then passes the image file to +a computer. The computer performs the automated image segmentation and feeds +the CNC machine with a G-code script. In this article, we report the initial +segmentation results achieved using a state-of-the-art Deep learning +segmentation method and discuss potential improvements to refine the +segmentation accuracy. + +
+
+ comment: 15 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Improving Deep Learning Predictions with Simulated Images, and Vice + Versa + + +
+ Artificial neural networks are often used to identify features of crop +plants. However, training their models requires many annotated images, which +can be expensive and time-consuming to acquire. Procedural models of plants, +such as those developed with Lindenmayer-systems (L-systems) can be created to +produce visually realistic simulations, and hence images of plant simulations, +where annotations are implicitly known. These synthetic images can either +augment or completely replace real images in training neural networks for +phenotyping tasks. In this paper, we systematically vary amounts of real and +synthetic images used for training in both maize and canola to better +understand situations where synthetic images generated from L-systems can help +prediction on real images. This work also explores the degree to which realism +in the synthetic images improves prediction. Furthermore, we see how neural +network predictions can be used to help calibrate L-systems themselves, +creating a feedback loop. + +
+
+
+
+
+ + ☆ Class Similarity Transition: Decoupling Class Similarities and Imbalance + from Generalized Few-shot Segmentation + + +
+ In Generalized Few-shot Segmentation (GFSS), a model is trained with a large
+corpus of base class samples and then adapted to limited samples of novel
+classes. This paper focuses on the relevance between base and novel classes,
+and improves GFSS in two aspects: 1) mining the similarity between base and
+novel classes to promote the learning of novel classes, and 2) mitigating the
+class imbalance issue caused by the volume difference between the support set
+and the training set. Specifically, we first propose a similarity transition
+matrix to guide the learning of novel classes with base class knowledge. Then,
+we apply the Label-Distribution-Aware Margin (LDAM) loss and Transductive
+Inference to the GFSS task to address the problem of class imbalance as well as
+overfitting to the support set. In addition, by extending the probability
+transition matrix, the proposed method can mitigate the catastrophic forgetting
+of base classes when learning novel classes. With a simple training phase, our
+proposed method can be applied to any segmentation network trained on base
+classes. We validate our method on the adapted version of OpenEarthMap.
+Compared to existing GFSS baselines, our method outperforms them all by 3% to
+7% and ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge
+at the time of writing. Code:
+https://github.com/earth-insights/ClassTrans
+
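+ An illustrative sketch of how a base-to-novel similarity transition matrix
+could guide novel-class predictions; the matrix construction (cosine
+similarity of class prototypes) and the KL-style guidance loss below are
+assumptions made for illustration, not the paper's exact formulation.
+
+import numpy as np
+
+def softmax(z, axis=-1):
+    z = z - z.max(axis=axis, keepdims=True)
+    e = np.exp(z)
+    return e / e.sum(axis=axis, keepdims=True)
+
+def transition_guidance_loss(base_logits, novel_logits, base_protos, novel_protos):
+    # Similarity transition matrix T[b, n]: how much base class b resembles novel class n
+    b = base_protos / np.linalg.norm(base_protos, axis=1, keepdims=True)
+    n = novel_protos / np.linalg.norm(novel_protos, axis=1, keepdims=True)
+    T = softmax(b @ n.T / 0.1, axis=1)                 # (B, N), rows sum to 1
+    # Transfer base-class probability mass onto novel classes and use it as a soft target
+    soft_target = softmax(base_logits) @ T             # (batch, N)
+    pred = softmax(novel_logits)                       # (batch, N)
+    return -(soft_target * np.log(pred + 1e-8)).sum(axis=1).mean()
+
+rng = np.random.default_rng(0)
+loss = transition_guidance_loss(rng.normal(size=(6, 4)), rng.normal(size=(6, 3)),
+                                rng.normal(size=(4, 16)), rng.normal(size=(3, 16)))
+print(loss)
+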
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ TabConv: Low-Computation CNN Inference via Table Lookups + + +
+ Convolutional Neural Networks (CNNs) have demonstrated remarkable ability +throughout the field of computer vision. However, CNN inference requires a +large number of arithmetic operations, making them expensive to deploy in +hardware. Current approaches alleviate this issue by developing +hardware-supported, algorithmic processes to simplify spatial convolution +functions. However, these methods still heavily rely on matrix multiplication, +leading to significant computational overhead. To bridge the gap between +hardware, algorithmic acceleration, and approximate matrix multiplication, we +propose TabConv, a novel, table-based approximation for convolution to +significantly reduce arithmetic operations during inference. Additionally, we +introduce a priority masking technique based on cosine similarity to select +layers for table-based approximation, thereby maintaining the model +performance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34, +and NetworkInNetwork (NIN). TabConv preserves over 93% of the original model's +performance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for +ResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for +ResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving +low-computation inference. + +
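+ A highly simplified sketch of the table-lookup idea (whole-vector
+quantization against a random codebook, rather than the per-layer, subspace-
+wise tables and cosine-similarity masking described above): each input row is
+snapped to its nearest prototype, and the matrix product is then read from a
+precomputed table instead of being recomputed.
+
+import numpy as np
+
+def build_table(W, centroids):
+    # W: (d, out) weights; centroids: (k, d) prototype inputs.
+    # Precompute the dot product of every prototype with every output channel.
+    return centroids @ W                      # (k, out) lookup table
+
+def lut_matmul(X, centroids, table):
+    # Replace X @ W by a nearest-prototype lookup: quantize each input row to
+    # its closest centroid, then read the precomputed products from the table.
+    d2 = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(-1)   # (n, k)
+    idx = d2.argmin(axis=1)
+    return table[idx]                         # (n, out) approximate product
+
+rng = np.random.default_rng(0)
+X, W = rng.normal(size=(256, 32)), rng.normal(size=(32, 8))
+# Toy "codebook": a random subset of the inputs stands in for learned centroids.
+centroids = X[rng.choice(256, size=16, replace=False)]
+table = build_table(W, centroids)
+err = np.abs(lut_matmul(X, centroids, table) - X @ W).mean()
+print(f"mean abs error of table-lookup product: {err:.3f}")
+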
+
+ comment: 8 pages, Accepted at CF '24 +
+
+
+
+
+ + ☆ Towards Improved Semiconductor Defect Inspection for high-NA EUVL based + on SEMI-SuperYOLO-NAS + + +
+ Due to potential pitch reduction, the semiconductor industry is adopting
+High-NA EUVL technology. However, its low depth of focus presents challenges
+for High Volume Manufacturing. To address this, suppliers are exploring thinner
+photoresists and new underlayers/hardmasks. These may suffer from poor SNR,
+complicating defect detection. Vision-based ML algorithms offer a promising
+solution for semiconductor defect inspection. However, developing a robust ML
+model across various image resolutions without explicit training remains a
+challenge for nano-scale defect inspection. The goal of this research is to
+propose a scale-invariant ADCD framework capable of upscaling images,
+addressing this issue. We propose an improved ADCD framework,
+SEMI-SuperYOLO-NAS, which builds upon the baseline YOLO-NAS architecture. This
+framework integrates an SR-assisted branch that helps the defect detection
+backbone learn HR features, particularly for detecting nano-scale defect
+instances from LR images. Additionally, the SR-assisted branch can recursively
+generate upscaled images from their corresponding downscaled counterparts,
+enabling defect detection inference across various image resolutions without
+requiring explicit training. Moreover, we investigate an improved data
+augmentation strategy aimed at generating diverse and realistic training
+datasets to enhance model performance. We have evaluated our proposed approach
+using two original FAB datasets obtained from two distinct processes and
+captured using two different imaging tools. Finally, we demonstrate zero-shot
+inference for our model on a new dataset, originating from a process condition
+distinct from the training dataset and possessing different pitch
+characteristics. Experimental validation demonstrates that our proposed ADCD
+framework aids in increasing the throughput of imaging tools for defect
+inspection by reducing the required image pixel resolutions.
+
+
+
+
+
+ + ☆ Localizing Moments of Actions in Untrimmed Videos of Infants with Autism + Spectrum Disorder + + +
+ Autism Spectrum Disorder (ASD) presents significant challenges in early
+diagnosis and intervention, impacting children and their families. With
+prevalence rates rising, there is a critical need for accessible and efficient
+screening tools. Leveraging machine learning (ML) techniques, in particular
+Temporal Action Localization (TAL), holds promise for automating ASD screening.
+This paper introduces a self-attention based TAL model designed to identify
+ASD-related behaviors in infant videos. Unlike existing methods, our approach
+simplifies complex modeling and emphasizes efficiency, which is essential for
+practical deployment in real-world scenarios. Importantly, this work
+underscores the importance of developing computer vision methods capable of
+operating in naturalistic environments with little equipment control,
+addressing key challenges in ASD screening. This study is the first to conduct
+end-to-end temporal action localization in untrimmed videos of infants with
+ASD, offering promising avenues for early intervention and support. We report
+baseline results of behavior detection using our TAL model. We achieve 70%
+accuracy for look face, 79% accuracy for look object, 72% for smile and 65% for
+vocalization.
+
+
+ comment: 7 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Privacy-Preserving Deep Learning Using Deformable Operators for Secure + Task Learning + + +
+ In the era of cloud computing and data-driven applications, it is crucial to +protect sensitive information to maintain data privacy, ensuring truly reliable +systems. As a result, preserving privacy in deep learning systems has become a +critical concern. Existing methods for privacy preservation rely on image +encryption or perceptual transformation approaches. However, they often suffer +from reduced task performance and high computational costs. To address these +challenges, we propose a novel Privacy-Preserving framework that uses a set of +deformable operators for secure task learning. Our method involves shuffling +pixels during the analog-to-digital conversion process to generate visually +protected data. Those are then fed into a well-known network enhanced with +deformable operators. Using our approach, users can achieve equivalent +performance to original images without additional training using a secret key. +Moreover, our method enables access control against unauthorized users. +Experimental results demonstrate the efficacy of our approach, showcasing its +potential in cloud-based scenarios and privacy-sensitive applications. + +
+
+ comment: copyright 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ☆ Towards Explainable Automated Neuroanatomy + + +
+ We present a novel method for quantifying the microscopic structure of brain
+tissue. It is based on the automated recognition of interpretable features
+obtained by analyzing the shapes of cells. This contrasts with prevailing
+methods of brain anatomical analysis in two ways. First, contemporary methods
+use gray-scale values derived from smoothed versions of the anatomical images,
+which dissipates valuable information contained in the texture of the images.
+Second, contemporary analysis uses the output of black-box Convolutional Neural
+Networks, while our system makes decisions based on interpretable features
+obtained by analyzing the shapes of individual cells. An important benefit of
+this open-box approach is that the anatomist can understand and correct the
+decisions made by the computer. Our proposed system can accurately localize and
+identify existing brain structures. This can be used to align and coregister
+brains and will facilitate connectomic studies for reverse engineering of brain
+circuitry.
+
+
+
+
+
+ + ☆ BatSort: Enhanced Battery Classification with Transfer Learning for + Battery Sorting and Recycling + + +
+ Battery recycling is a critical process for minimizing the environmental harm
+and resource waste of used batteries. However, it is challenging, largely
+because sorting batteries by type is costly and difficult to automate. In this
+paper, we introduce a machine learning-based approach for battery-type
+classification and address the daunting problem of data scarcity for the
+application. We propose BatSort, which applies transfer learning to utilize
+existing knowledge optimized with large-scale datasets and customizes ResNet to
+be specialized for classifying battery types. As a case study, we collected a
+small-scale in-house battery-type dataset to guide the knowledge transfer and
+evaluate system performance. We conducted an experimental study, and the
+results show that BatSort achieves an outstanding average accuracy of 92.1%,
+and up to 96.2%, with stable performance for battery-type classification. Our
+solution helps realize fast and automated battery sorting with minimized cost
+and can be transferred to related industry applications with insufficient
+data.
+
+
+
+
+
+ + ☆ Responsible Generative AI: What to Generate and What Not + + +
+ In recent years, generative AI (GenAI), like large language models and
+text-to-image models, has received significant attention across various
+domains. However, ensuring the responsible generation of content by these
+models is crucial for their real-world applicability. This raises an
+interesting question: \textit{What should responsible GenAI generate, and what
+should it not?} To answer the question, this paper investigates the practical
+responsible requirements of both textual and visual generative models,
+outlining five key considerations: generating truthful content, avoiding toxic
+content, refusing harmful instructions, leaking no training-data-related
+content, and ensuring that generated content is identifiable. Specifically, we
+review recent advancements and challenges in addressing these requirements. In
+addition, we discuss and emphasize the importance of responsible GenAI across
+healthcare, education, finance, and artificial general intelligence domains.
+Through a unified perspective on both textual and visual generative models,
+this paper aims to provide insights into practical safety-related issues and
+further benefit the community in building responsible GenAI.
+
+
+ comment: 74 pages, 10 figures +
+
+
+
+
+ + ☆ Forecasting Electric Vehicle Battery Output Voltage: A Predictive + Modeling Approach + + +
+ The battery management system (BMS) plays a vital role in ensuring the safety
+and dependability of electric and hybrid vehicles. It is responsible for
+various functions, including state evaluation, monitoring, charge control, and
+cell balancing, all integrated within the BMS. Nonetheless, due to the
+uncertainties surrounding battery performance, implementing these
+functionalities poses significant challenges. In this study, we explore the
+latest approaches for assessing battery states, highlight notable advancements
+in BMS, address existing issues with current BMS technology, and put forth
+possible solutions for predicting battery charging voltage.
+
+
+
+
+
+ + ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a +conditional Energy-Based Model (EBM) for enhancing Variational Autoencoder +(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer +from blurry generated samples due to the lack of a tailored training on the +samples generated in the generative direction. On the other hand, EBMs can +generate high-quality samples but require expensive Markov Chain Monte Carlo +(MCMC) sampling. To address these issues, we introduce a conditional EBM for +calibrating the generative direction of VAE during training, without requiring +it for the generation at test time. In particular, we train EC-VAE upon both +the input data and the calibrated samples with adaptive weight to enhance +efficacy while avoiding MCMC sampling at test time. Furthermore, we extend the +calibration idea of EC-VAE to variational learning and normalizing flows, and +apply EC-VAE to an additional application of zero-shot image restoration via +neural transport prior and range-null theory. We evaluate the proposed method +with two applications, including image generation and zero-shot image +restoration, and the experimental results show that our method achieves +competitive performance over single-step non-adversarial generation. Our code +is available at https://github.com/DJ-LYH/EC-VAE. + +
+
+ comment: Revision. Code is available at https://github.com/DJ-LYH/EC-VAE +
+
+
+
+
+ + ♻ ☆ Deep Internal Learning: Deep Learning from a Single Input + + +
+ Deep learning, in general, focuses on training a neural network from large +labeled datasets. Yet, in many cases there is value in training a network just +from the input at hand. This is particularly relevant in many signal and image +processing problems where training data is scarce and diversity is large on the +one hand, and on the other, there is a lot of structure in the data that can be +exploited. Using this information is the key to deep internal-learning +strategies, which may involve training a network from scratch using a single +input or adapting an already trained network to a provided input example at +inference time. This survey paper aims at covering deep internal-learning +techniques that have been proposed in the past few years for these two +important directions. While our main focus will be on image processing +problems, most of the approaches that we survey are derived for general signals +(vectors with recurring patterns that can be distinguished from noise) and are +therefore applicable to other modalities. + +
+
+ comment: Accepted to IEEE Signal Processing Magazine +
+
+
+
+
+ + ♻ ☆ FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization CVPR 2024 + + +
+ 3D Gaussian splatting has achieved very impressive performance in real-time +novel view synthesis. However, it often suffers from over-reconstruction during +Gaussian densification where high-variance image regions are covered by a few +large Gaussians only, leading to blur and artifacts in the rendered images. We +design a progressive frequency regularization (FreGS) technique to tackle the +over-reconstruction issue within the frequency space. Specifically, FreGS +performs coarse-to-fine Gaussian densification by exploiting low-to-high +frequency components that can be easily extracted with low-pass and high-pass +filters in the Fourier space. By minimizing the discrepancy between the +frequency spectrum of the rendered image and the corresponding ground truth, it +achieves high-quality Gaussian densification and alleviates the +over-reconstruction of Gaussian splatting effectively. Experiments over +multiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and +Deep Blending) show that FreGS achieves superior novel view synthesis and +outperforms the state-of-the-art consistently. + +
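+ A minimal sketch of a progressive frequency-space discrepancy term of the
+kind described above (the linear schedule, circular mask shape, and use of
+amplitude-only spectra are illustrative assumptions): low frequencies are
+compared first, and the pass-band widens as training proceeds.
+
+import numpy as np
+
+def frequency_loss(rendered, target, step, total_steps, base_radius=4):
+    # 2-D FFT amplitude spectra (shifted so low frequencies sit at the centre)
+    fr = np.fft.fftshift(np.fft.fft2(rendered))
+    ft = np.fft.fftshift(np.fft.fft2(target))
+    h, w = rendered.shape
+    yy, xx = np.mgrid[:h, :w]
+    radius = np.hypot(yy - h / 2, xx - w / 2)
+    # Progressive schedule: start with a small low-pass radius, widen it over time
+    cutoff = base_radius + min(step / total_steps, 1.0) * (radius.max() - base_radius)
+    mask = radius <= cutoff
+    return np.abs(np.abs(fr) - np.abs(ft))[mask].mean()
+
+rng = np.random.default_rng(0)
+img_a, img_b = rng.random((64, 64)), rng.random((64, 64))
+print(frequency_loss(img_a, img_b, step=100, total_steps=30000))
+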
+
+ comment: Accepted by CVPR 2024. Project website: + https://rogeraigc.github.io/FreGS-Page/ +
+
+
+
+
+ + ♻ ☆ WEEP: A method for spatial interpretation of weakly supervised CNN + models in computational pathology + + +
+ Deep learning enables the modelling of high-resolution histopathology +whole-slide images (WSI). Weakly supervised learning of tile-level data is +typically applied for tasks where labels only exist on the patient or WSI level +(e.g. patient outcomes or histological grading). In this context, there is a +need for improved spatial interpretability of predictions from such models. We +propose a novel method, Wsi rEgion sElection aPproach (WEEP), for model +interpretation. It provides a principled yet straightforward way to establish +the spatial area of WSI required for assigning a particular prediction label. +We demonstrate WEEP on a binary classification task in the area of breast +cancer computational pathology. WEEP is easy to implement, is directly +connected to the model-based decision process, and offers information relevant +to both research and diagnostic applications. + +
+
+
+
+
+ + ♻ ☆ Deep Feature Statistics Mapping for Generalized Screen Content Image + Quality Assessment + + +
+ The statistical regularities of natural images, referred to as natural scene +statistics, play an important role in no-reference image quality assessment. +However, it has been widely acknowledged that screen content images (SCIs), +which are typically computer generated, do not hold such statistics. Here we +make the first attempt to learn the statistics of SCIs, based upon which the +quality of SCIs can be effectively determined. The underlying mechanism of the +proposed approach is based upon the mild assumption that the SCIs, which are +not physically acquired, still obey certain statistics that could be understood +in a learning fashion. We empirically show that the statistics deviation could +be effectively leveraged in quality assessment, and the proposed method is +superior when evaluated in different settings. Extensive experimental results +demonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA) +model delivers promising performance compared with existing NR-IQA models and +shows a high generalization capability in the cross-dataset settings. The +implementation of our method is publicly available at +https://github.com/Baoliang93/DFSS-IQA. + +
+
+
+
+
+ + ♻ ☆ Towards Domain-agnostic Depth Completion + + +
+ Existing depth completion methods are often targeted at a specific sparse +depth type and generalize poorly across task domains. We present a method to +complete sparse/semi-dense, noisy, and potentially low-resolution depth maps +obtained by various range sensors, including those in modern mobile phones, or +by multi-view reconstruction algorithms. Our method leverages a data-driven +prior in the form of a single image depth prediction network trained on +large-scale datasets, the output of which is used as an input to our model. We +propose an effective training scheme where we simulate various sparsity +patterns in typical task domains. In addition, we design two new benchmarks to +evaluate the generalizability and the robustness of depth completion methods. +Our simple method shows superior cross-domain generalization ability against +state-of-the-art depth completion methods, introducing a practical solution to +high-quality depth capture on a mobile device. The code is available at: +https://github.com/YvanYin/FillDepth. + +
+
+
+
+
+ + ♻ ☆ Intention-Conditioned Long-Term Human Egocentric Action Forecasting CVPR + + +
+ To anticipate how a human would act in the future, it is essential to
+understand the human intention since it guides the human towards a certain
+goal. In this paper, we propose a hierarchical architecture which assumes that
+a sequence of human actions (low-level) can be driven by the human intention
+(high-level). Based on this, we deal with the Long-Term Action Anticipation
+task in egocentric videos. Our framework first extracts two levels of human
+information from the N observed videos of human actions through a Hierarchical
+Multi-task MLP Mixer (H3M). Then, we condition the uncertainty of the future
+through an Intention-Conditioned Variational Auto-Encoder (I-CVAE) that
+generates K stable predictions of the next Z=20 actions that the observed human
+might perform. By leveraging human intention as high-level information, we
+claim that our model is able to anticipate more time-consistent actions in the
+long-term, thus improving the results over baseline methods in the EGO4D
+Challenge. This work ranked first in both the CVPR@2022 and ECCV@2022 EGO4D LTA
+Challenges by providing more plausible anticipated sequences, improving the
+anticipation of nouns and overall actions. Webpage:
+https://evm7.github.io/icvae-page/
+
+
+ comment: Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in + WACV2023. Webpage: https://evm7.github.io/icvae-page/ +
+
+
+
+
+ + ♻ ☆ Robust Human Motion Forecasting using Transformer-based Model IROS2022 + + +
+ Comprehending human motion is a fundamental challenge for developing
+Human-Robot Collaborative applications. Computer vision researchers have
+addressed this field by only focusing on reducing error in predictions, but not
+taking into account the requirements to facilitate its implementation in
+robots. In this paper, we propose a new Transformer-based model that
+simultaneously deals with real-time 3D human motion forecasting in the short
+and long term. Our 2-Channel Transformer (2CH-TR) is able to efficiently
+exploit the spatio-temporal information of a shortly observed sequence (400ms)
+and achieves competitive accuracy against the current state-of-the-art.
+2CH-TR stands out for the efficient performance of the Transformer, being
+lighter and faster than its competitors. In addition, our model is tested in
+conditions where the human motion is severely occluded, demonstrating its
+robustness in reconstructing and predicting 3D human motion in a highly noisy
+environment. Our experiment results show that the proposed 2CH-TR outperforms
+the ST-Transformer, which is another state-of-the-art model based on the
+Transformer, in terms of reconstruction and prediction under the same
+conditions of input prefix. Our model reduces the mean squared error of
+ST-Transformer by 8.89% in short-term prediction and by 2.57% in long-term
+prediction on the Human3.6M dataset with a 400ms input prefix. Webpage:
+https://evm7.github.io/2CHTR-page/
+
+
+ comment: Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/ +
+
+
+
+
+ + ♻ ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis AAAI2024 + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://evm7.github.io/UNIMASKM-page/ + +
+
+ comment: Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/ +
+
+
+
+
+ + ♻ ☆ HOI4ABOT: Human-Object Interaction Anticipation for Human Intention + Reading Collaborative roBOTs + + +
+ Robots are becoming increasingly integrated into our lives, assisting us in +various tasks. To ensure effective collaboration between humans and robots, it +is essential that they understand our intentions and anticipate our actions. In +this paper, we propose a Human-Object Interaction (HOI) anticipation framework +for collaborative robots. We propose an efficient and robust transformer-based +model to detect and anticipate HOIs from videos. This enhanced anticipation +empowers robots to proactively assist humans, resulting in more efficient and +intuitive collaborations. Our model outperforms state-of-the-art results in HOI +detection and anticipation in VidHOI dataset with an increase of 1.76% and +1.04% in mAP respectively while being 15.4 times faster. We showcase the +effectiveness of our approach through experimental results in a real robot, +demonstrating that the robot's ability to anticipate HOIs is key for better +Human-Robot Interaction. More information can be found on our project webpage: +https://evm7.github.io/HOI4ABOT_page/ + +
+
+ comment: Proceedings in Conference on Robot Learning 2023. Webpage: + https://evm7.github.io/HOI4ABOT_page/ +
+
+
+
+
+ + ♻ ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+ Integrating robots into populated environments is a complex challenge that +requires an understanding of human social dynamics. In this work, we propose to +model social motion forecasting in a shared human-robot representation space, +which facilitates us to synthesize robot motions that interact with humans in +social scenarios despite not observing any robot in the motion training. We +develop a transformer-based architecture called ECHO, which operates in the +aforementioned shared space to predict the future motions of the agents +encountered in social scenarios. Contrary to prior works, we reformulate the +social motion problem as the refinement of the predicted individual motions +based on the surrounding agents, which facilitates the training while allowing +for single-motion forecasting when only one human is in the scene. We evaluate +our model in multi-person and human-robot motion forecasting tasks and obtain +state-of-the-art performance by a large margin while being efficient and +performing in real-time. Additionally, our qualitative results showcase the +effectiveness of our approach in generating human-robot interaction behaviors +that can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/ + +
+
+ comment: Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/ +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing information from non-local areas. In the +domain of super-resolution, Swin-transformer-based approaches have become +mainstream due to their capacity to capture global spatial information and +their shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced image +quality and network efficiency by expanding the receptive field or designing +complex networks, yielding commendable results. However, we observed that +spatial information tends to diminish during the forward propagation process +due to increased depth, leading to a loss of spatial information and, +consequently, limiting the model's potential. To address this, we propose the +Dense-residual-connected Transformer (DRCT), aimed at mitigating the loss of +spatial information through dense-residual connections between layers, thereby +unleashing the model's potential and enhancing performance. Experiment results +indicate that our approach is not only straightforward but also achieves +remarkable efficiency, surpassing state-of-the-art methods and performing +commendably at NTIRE2024. + +
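+ The gist of a dense-residual connection pattern, sketched with plain matrices
+(the layer widths, activation, and identity-style fusion projection are
+arbitrary choices for illustration, not the DRCT design): every layer sees the
+concatenation of all earlier features, and the block output is added back to
+its input so spatial information from early layers is carried forward.
+
+import numpy as np
+
+def dense_residual_block(x, weights, act=np.tanh):
+    # x: (n, c) input features; weights[i]: (c + i*g, g) maps the concatenation
+    # of the input and all previously produced features to g new channels.
+    feats = [x]
+    for w in weights:
+        feats.append(act(np.concatenate(feats, axis=1) @ w))
+    # Residual connection: fuse the dense features back to c channels and add x.
+    fused = np.concatenate(feats[1:], axis=1)
+    proj = np.eye(fused.shape[1], x.shape[1])   # stand-in for a learned 1x1 fusion
+    return x + fused @ proj
+
+rng = np.random.default_rng(0)
+c, g, n_layers = 16, 8, 3
+ws = [rng.normal(scale=0.1, size=(c + i * g, g)) for i in range(n_layers)]
+y = dense_residual_block(rng.normal(size=(4, c)), ws)
+print(y.shape)   # (4, 16)
+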
+
+ comment: NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
+ + ♻ ☆ MESA: Matching Everything by Segmenting Anything CVPR24 + + +
+ Feature matching is a crucial task in the field of computer vision, which
+involves finding correspondences between images. Previous studies achieve
+remarkable performance using learning-based feature comparison. However, the
+pervasive presence of matching redundancy between images gives rise to
+unnecessary and error-prone computations in these methods, imposing limitations
+on their accuracy. To address this issue, we propose MESA, a novel approach to
+establish precise area (or region) matches for efficient matching redundancy
+reduction. MESA first leverages the advanced image understanding capability of
+SAM, a state-of-the-art foundation model for image segmentation, to obtain
+image areas with implicit semantics. Then, a multi-relational graph is proposed
+to model the spatial structure of these areas and construct their scale
+hierarchy. Based on graphical models derived from the graph, the area matching
+is reformulated as an energy minimization task and effectively resolved.
+Extensive experiments demonstrate that MESA yields substantial precision
+improvement for multiple point matchers in indoor and outdoor downstream tasks,
+e.g. +13.61% for DKM in indoor pose estimation.
+
+
+ comment: CVPR24 +
+
+
+
+
+ + ♻ ☆ DPHMs: Diffusion Parametric Head Models for Depth-based Tracking CVPR 2024 + + +
+ We introduce Diffusion Parametric Head Models (DPHMs), a generative model +that enables robust volumetric head reconstruction and tracking from monocular +depth sequences. While recent volumetric head models, such as NPHMs, can now +excel in representing high-fidelity head geometries, tracking and +reconstructing heads from real-world single-view depth sequences remains very +challenging, as the fitting to partial and noisy observations is +underconstrained. To tackle these challenges, we propose a latent +diffusion-based prior to regularize volumetric head reconstruction and +tracking. This prior-based regularizer effectively constrains the identity and +expression codes to lie on the underlying latent manifold which represents +plausible head shapes. To evaluate the effectiveness of the diffusion-based +prior, we collect a dataset of monocular Kinect sequences consisting of various +complex facial expression motions and rapid transitions. We compare our method +to state-of-the-art tracking methods and demonstrate improved head identity +reconstruction as well as robust expression tracking. + +
+
+ comment: CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/ +
+
+
+
+
+ + ♻ ☆ SepVAE: a contrastive VAE to separate pathological patterns from healthy + ones ICML + + +
+ Contrastive Analysis VAEs (CA-VAEs) are a family of variational auto-encoders
+(VAEs) that aim at separating the common factors of variation between a
+background dataset (BG) (i.e., healthy subjects) and a target dataset (TG)
+(i.e., patients) from the ones that only exist in the target dataset. To do so,
+these methods separate the latent space into a set of salient features (i.e.,
+specific to the target dataset) and a set of common features (i.e., existing in
+both datasets). Currently, all models fail to prevent the sharing of
+information between latent spaces effectively and to capture all salient
+factors of variation. To this end, we introduce two crucial regularization
+losses: a disentangling term between common and salient representations and a
+classification term between background and target samples in the salient space.
+We show better performance than previous CA-VAE methods on three medical
+applications and a natural image dataset (CelebA). Code and datasets are
+available on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae.
+
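+ A bare-bones sketch of the two extra regularizers in spirit (the squared
+cross-correlation penalty and the logistic classifier below are stand-ins; the
+actual terms used by the paper are more involved): salient codes should
+separate background from target samples, and common codes should carry no
+salient information.
+
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+def salient_classification_loss(z_salient, is_target, w, b):
+    # Binary cross-entropy: salient codes must separate background (0) from target (1)
+    p = sigmoid(z_salient @ w + b)
+    return -np.mean(is_target * np.log(p + 1e-8) + (1 - is_target) * np.log(1 - p + 1e-8))
+
+def disentangling_loss(z_common, z_salient):
+    # Penalize squared cross-correlation between common and salient codes
+    zc = (z_common - z_common.mean(0)) / (z_common.std(0) + 1e-8)
+    zs = (z_salient - z_salient.mean(0)) / (z_salient.std(0) + 1e-8)
+    cross_corr = zc.T @ zs / zc.shape[0]
+    return np.mean(cross_corr ** 2)
+
+rng = np.random.default_rng(0)
+zc, zs = rng.normal(size=(32, 8)), rng.normal(size=(32, 4))
+labels = rng.integers(0, 2, size=32)
+print(salient_classification_loss(zs, labels, rng.normal(size=4), 0.0),
+      disentangling_loss(zc, zs))
+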
+
+ comment: Workshop on Interpretable ML in Healthcare at International + Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ♻ ☆ SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for + Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in +skeleton-based action recognition. However, previous GCN-based methods rely on +elaborate human priors excessively and construct complex feature aggregation +mechanisms, which limits the generalizability and effectiveness of networks. To +solve these problems, we propose a novel Spatial Topology Gating Unit (STGU), +an MLP-based variant without extra priors, to capture the co-occurrence +topology features that encode the spatial dependency across all joints. In +STGU, to learn the point-wise topology features, a new gate-based feature +interaction mechanism is introduced to activate the features point-to-point by +the attention map generated from the input sample. Based on the STGU, we +propose the first MLP-based model, SiT-MLP, for skeleton-based action +recognition in this work. Compared with previous methods on three large-scale +datasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP +reduces the parameters significantly with favorable results. The code will be +available at https://github.com/BUPTSJZhang/SiT?MLP. + +
+
+ comment: Accepted by IEEE TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose + Estimation CVPR 2024 + + +
+ Real-time multi-person pose estimation presents significant challenges in +balancing speed and precision. While two-stage top-down methods slow down as +the number of people in the image increases, existing one-stage methods often +fail to simultaneously deliver high accuracy and real-time performance. This +paper introduces RTMO, a one-stage pose estimation framework that seamlessly +integrates coordinate classification by representing keypoints using dual 1-D +heatmaps within the YOLO architecture, achieving accuracy comparable to +top-down methods while maintaining high speed. We propose a dynamic coordinate +classifier and a tailored loss function for heatmap learning, specifically +designed to address the incompatibilities between coordinate classification and +dense prediction models. RTMO outperforms state-of-the-art one-stage pose +estimators, achieving 1.1% higher AP on COCO while operating about 9 times +faster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on +COCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and +accuracy. The code and models are available at +https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo. + +
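+ A small sketch of how a keypoint can be decoded from dual 1-D heatmaps via
+coordinate classification (the bin-to-pixel mapping and the soft-argmax
+readout here are generic illustrations, not RTMO's exact decoder).
+
+import numpy as np
+
+def decode_keypoint(hx, hy, img_w, img_h):
+    # hx: (Wbins,) logits over horizontal bins, hy: (Hbins,) logits over vertical bins
+    px = np.exp(hx - hx.max()); px /= px.sum()
+    py = np.exp(hy - hy.max()); py /= py.sum()
+    # Expected bin index -> pixel coordinate (soft-argmax over each 1-D heatmap)
+    x = (px * np.arange(px.size)).sum() * (img_w / px.size)
+    y = (py * np.arange(py.size)).sum() * (img_h / py.size)
+    return x, y
+
+hx, hy = np.zeros(48), np.zeros(64)
+hx[30], hy[10] = 5.0, 5.0          # a confident peak along each axis
+print(decode_keypoint(hx, hy, img_w=192, img_h=256))
+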
+
+ comment: Accepted at CVPR 2024. Project page: + https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo +
+
+
+
+
+ + ♻ ☆ Zero-Shot Segmentation of Eye Features Using the Segment Anything Model + (SAM) + + +
+ The advent of foundation models signals a new era in artificial intelligence. +The Segment Anything Model (SAM) is the first foundation model for image +segmentation. In this study, we evaluate SAM's ability to segment features from +eye images recorded in virtual reality setups. The increasing requirement for +annotated eye-image datasets presents a significant opportunity for SAM to +redefine the landscape of data annotation in gaze estimation. Our investigation +centers on SAM's zero-shot learning abilities and the effectiveness of prompts +like bounding boxes or point clicks. Our results are consistent with studies in +other domains, demonstrating that SAM's segmentation effectiveness can be +on-par with specialized models depending on the feature, with prompts improving +its performance, evidenced by an IoU of 93.34% for pupil segmentation in one +dataset. Foundation models like SAM could revolutionize gaze estimation by +enabling quick and easy image segmentation, reducing reliance on specialized +models and extensive manual annotation. + +
+
+ comment: 14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on + Eye Tracking Research & Applications +
+
+
+
+
+ + ♻ ☆ Photo-SLAM: Real-time Simultaneous Localization and Photorealistic + Mapping for Monocular, Stereo, and RGB-D Cameras CVPR 2024 + + +
+ The integration of neural rendering and the SLAM system recently showed +promising results in joint localization and photorealistic view reconstruction. +However, existing methods, fully relying on implicit representations, are so +resource-hungry that they cannot run on portable devices, which deviates from +the original intention of SLAM. In this paper, we present Photo-SLAM, a novel +SLAM framework with a hyper primitives map. Specifically, we simultaneously +exploit explicit geometric features for localization and learn implicit +photometric features to represent the texture information of the observed +environment. In addition to actively densifying hyper primitives based on +geometric features, we further introduce a Gaussian-Pyramid-based training +method to progressively learn multi-level features, enhancing photorealistic +mapping performance. The extensive experiments with monocular, stereo, and +RGB-D datasets prove that our proposed system Photo-SLAM significantly +outperforms current state-of-the-art SLAM systems for online photorealistic +mapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times +faster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time +speed using an embedded platform such as Jetson AGX Orin, showing the potential +of robotics applications. + +
+
+ comment: CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project + Page: https://huajianup.github.io/research/Photo-SLAM/ +
+
+
+
+
+ + ♻ ☆ 360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization + with Cross-device Queries CVPR 2024 + + +
+ Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to +establish large visual databases. By capturing omnidirectional views of a +scene, these cameras could expedite building environment models that are +essential for visual localization. However, such an advantage is often +overlooked due to the lack of valuable datasets. This paper introduces a new +benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth +poses for visual localization. We present a practical implementation of +360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate +the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that +explores the challenge of cross-device visual positioning, involving +360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV +fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to +generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair +comparison of performance among different query types in visual localization +tasks. We also extend this virtual camera approach to feature matching-based +and pose regression-based methods to alleviate the performance loss caused by +the cross-device domain gap, and evaluate its effectiveness against +state-of-the-art baselines. We demonstrate that omnidirectional visual +localization is more robust in challenging large-scale scenes with symmetries +and repetitive structures. These results provide new insights into 360-camera +mapping and omnidirectional visual localization with cross-device queries. + +
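+ The virtual-camera idea can be sketched as resampling a pinhole view from an
+equirectangular panorama (nearest-neighbour sampling and this particular
+yaw/pitch rotation parameterization are simplifying assumptions; the dataset's
+actual tooling may differ).
+
+import numpy as np
+
+def virtual_pinhole(equirect, fov_deg=90.0, out_size=256, yaw=0.0, pitch=0.0):
+    # Resample a perspective (pinhole) view of the given FoV from an equirectangular image.
+    H, W = equirect.shape[:2]
+    f = 0.5 * out_size / np.tan(np.radians(fov_deg) / 2)
+    u, v = np.meshgrid(np.arange(out_size) - out_size / 2,
+                       np.arange(out_size) - out_size / 2)
+    # Rays in the virtual camera frame, rotated by pitch (around x) then yaw (around y)
+    rays = np.stack([u, v, np.full_like(u, f, dtype=float)], axis=-1)
+    rays /= np.linalg.norm(rays, axis=-1, keepdims=True)
+    cy, sy, cp, sp = np.cos(yaw), np.sin(yaw), np.cos(pitch), np.sin(pitch)
+    Rx = np.array([[1, 0, 0], [0, cp, -sp], [0, sp, cp]])
+    Ry = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
+    rays = rays @ (Ry @ Rx).T
+    lon = np.arctan2(rays[..., 0], rays[..., 2])        # [-pi, pi]
+    lat = np.arcsin(np.clip(rays[..., 1], -1, 1))       # [-pi/2, pi/2]
+    # Map spherical coordinates to equirectangular pixel indices (nearest neighbour)
+    x = ((lon / (2 * np.pi) + 0.5) * (W - 1)).astype(int)
+    y = ((lat / np.pi + 0.5) * (H - 1)).astype(int)
+    return equirect[y, x]
+
+pano = np.random.default_rng(0).random((512, 1024, 3))
+view = virtual_pinhole(pano, fov_deg=90, out_size=256, yaw=np.pi / 4)
+print(view.shape)   # (256, 256, 3)
+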
+
+ comment: CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/ +
+
+
+
+
+ + ♻ ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training demonstrates its potential in the medical domain, +which learns medical visual representations from paired medical reports. +However, many pre-training tasks require extra annotations from clinicians, and +most of them fail to explicitly guide the model to learn the desired features +of different pathologies. To the best of our knowledge, we are the first to +utilize Visual Question Answering (VQA) for multimodal pre-training to guide +the framework focusing on targeted pathological features. In this work, we +leverage descriptions in medical reports to design multi-granular +question-answer pairs associated with different diseases, which assist the +framework in pre-training without requiring extra annotations from experts. We +also propose a novel pre-training framework with a quasi-textual feature +transformer, a module designed to transform visual features into a +quasi-textual space closer to the textual domain via a contrastive learning +strategy. This narrows the vision-language gap and facilitates modality +alignment. Our framework is applied to four downstream tasks: report +generation, classification, segmentation, and detection across five datasets. +Extensive experiments demonstrate the superiority of our framework compared to +other state-of-the-art methods. Our code will be released upon acceptance. + +
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only +beneficial to privacy protection but also can be used in covert surveillance +scenarios due to the small size and simple structure of this device. However, +this task presents significant challenges due to the inherent ambiguity of the +captured measurements and lacks effective methods for directly estimating human +pose and shape from lensless data. In this paper, we propose the first +end-to-end framework to recover 3D human poses and shapes from lensless +measurements to our knowledge. We specifically design a multi-scale lensless +feature decoder to decode the lensless measurements through the optically +encoded mask for efficient feature extraction. We also propose a double-head +auxiliary supervision mechanism to improve the estimation accuracy of human +limb ends. Besides, we establish a lensless imaging system and verify the +effectiveness of our method on various datasets acquired by our lensless +imaging system. + +
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ A ground-based dataset and a diffusion model for on-orbit low-light + image enhancement + + +
+ On-orbit service is important for maintaining the sustainability of the space
+environment. A space-based visible camera is an economical and lightweight
+sensor for situation awareness during on-orbit service. However, it can be
+easily affected by the low-illumination environment. Recently, deep learning
+has achieved remarkable success in image enhancement of natural images, but it
+is seldom applied in space due to the data bottleneck. In this article, we
+first propose a dataset of the Beidou Navigation Satellite for on-orbit
+low-light image enhancement (LLIE). In the automatic data collection scheme, we
+focus on reducing the domain gap and improving the diversity of the dataset. We
+collect hardware-in-the-loop images based on a robotic simulation testbed
+imitating space lighting conditions. To evenly sample poses of different
+orientations and distances without collision, a collision-free working space
+and pose-stratified sampling are proposed. Afterwards, a novel diffusion model
+is proposed. To enhance the image contrast without over-exposure or blurring
+details, we design a fused attention module to highlight the structure and dark
+regions. Finally, we compare our method with previous methods using our
+dataset, and the results indicate that our method has a better capacity for
+on-orbit LLIE.
+
+
+
+
+
+
+ + ♻ ☆ Representing Noisy Image Without Denoising + + +
+ A long-standing topic in artificial intelligence is the effective recognition +of patterns from noisy images. In this regard, the recent data-driven paradigm +considers 1) improving the representation robustness by adding noisy samples in +training phase (i.e., data augmentation) or 2) pre-processing the noisy image +by learning to solve the inverse problem (i.e., image denoising). However, such +methods generally exhibit inefficient process and unstable result, limiting +their practical applications. In this paper, we explore a non-learning paradigm +that aims to derive robust representation directly from noisy images, without +the denoising as pre-processing. Here, the noise-robust representation is +designed as Fractional-order Moments in Radon space (FMR), with also beneficial +properties of orthogonality and rotation invariance. Unlike earlier +integer-order methods, our work is a more generic design taking such classical +methods as special cases, and the introduced fractional-order parameter offers +time-frequency analysis capability that is not available in classical methods. +Formally, both implicit and explicit paths for constructing the FMR are +discussed in detail. Extensive simulation experiments and an image security +application are provided to demonstrate the uniqueness and usefulness of our +FMR, especially for noise robustness, rotation invariance, and time-frequency +discriminability. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck NAACL 2024 + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
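Editor's note: the matching step described above (scoring a class by comparing detected part embeddings against that class's textual part descriptors) can be written down compactly. The sketch below is a generic illustration of that idea, not PEEB's actual code; tensor shapes and the simple summed-cosine scoring rule are assumptions.

```python
import torch
import torch.nn.functional as F

def part_matching_logits(part_embeds, class_descriptor_embeds):
    """part_embeds: (P, D) visual embeddings of P detected parts (e.g., beak, wing).
    class_descriptor_embeds: (C, P, D) text embeddings of the P descriptors of each
    of C classes. Returns (C,) logits: the sum over parts of the cosine similarity
    between each detected part and the corresponding descriptor of that class."""
    v = F.normalize(part_embeds, dim=-1)                # (P, D)
    t = F.normalize(class_descriptor_embeds, dim=-1)    # (C, P, D)
    sims = torch.einsum("pd,cpd->cp", v, t)             # per-class, per-part cosine
    return sims.sum(dim=-1)                             # (C,)

# Toy example: 12 parts, 512-dim embeddings, 200 classes (e.g., CUB-200).
logits = part_matching_logits(torch.randn(12, 512), torch.randn(200, 12, 512))
pred = logits.argmax().item()
```

Editing the classifier then reduces to editing the text descriptors and re-encoding them, with no re-training of the visual side.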
+
+ comment: Findings of NAACL 2024 (long paper) +
+
+
+
+
+ + ♻ ☆ Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation + + +
+ With the explosive popularity of AI-generated content (AIGC), video
+generation has recently received a lot of attention. Generating videos guided
+by text instructions poses significant challenges, such as modeling the complex
+relationship between space and time, and the lack of large-scale text-video
+paired data. Existing text-video datasets suffer from limitations in both
+content quality and scale, or they are not open-source, rendering them
+inaccessible for study and use. For model design, previous approaches extend
+pretrained text-to-image generation models by adding temporal 1D
+convolution/attention modules for video generation. However, these approaches
+overlook the importance of jointly modeling space and time, inevitably leading
+to temporal distortions and misalignment between texts and videos. In this
+paper, we propose a novel approach that strengthens the interaction between
+spatial and temporal perceptions. In particular, we utilize a swapped
+cross-attention mechanism in 3D windows that alternates the ``query'' role
+between spatial and temporal blocks, enabling them to mutually reinforce each
+other. Moreover, to fully unlock model capabilities for high-quality video
+generation and promote the development of the field, we curate a large-scale
+and open-source video dataset called HD-VG-130M. This dataset comprises 130
+million text-video pairs from the open domain, ensuring high-definition,
+widescreen and watermark-free characteristics. A smaller-scale yet more
+meticulously cleaned subset further enhances the data quality, aiding models in
+achieving superior performance. Experimental quantitative and qualitative
+results demonstrate the superiority of our approach in terms of per-frame
+quality, temporal correlation, and text-video alignment, with clear margins.
+
+
+
+
+
+
+ + ♻ ☆ InstaGen: Enhancing Object Detection by Training on Synthetic Dataset CVPR2024 + + +
+ In this paper, we present a novel paradigm to enhance the ability of object
+detectors, e.g., expanding categories or improving detection performance, by
+training on a synthetic dataset generated from diffusion models. Specifically,
+we integrate an instance-level grounding head into a pre-trained, generative
+diffusion model, to augment it with the ability of localising instances in the
+generated images. The grounding head is trained to align the text embedding of
+category names with the regional visual feature of the diffusion model, using
+supervision from an off-the-shelf object detector, and a novel self-training
+scheme on (novel) categories not covered by the detector. We conduct thorough
+experiments to show that this enhanced version of the diffusion model, termed
+InstaGen, can serve as a data synthesizer, to enhance object detectors by
+training on its generated samples, demonstrating superior performance over
+existing state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse
+(+1.2 to 5.2 AP) scenarios. Project page with code:
+https://fcjian.github.io/InstaGen.
+
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ SIFU: Side-view Conditioned Implicit Function for Real-world Usable + Clothed Human Reconstruction CVPR 2024 + + +
+ Creating high-quality 3D models of clothed humans from single images for
+real-world applications is crucial. Despite recent advancements, accurately
+reconstructing humans in complex poses or with loose clothing from in-the-wild
+images, along with predicting textures for unseen areas, remains a significant
+challenge. A key limitation of previous methods is their insufficient prior
+guidance in transitioning from 2D to 3D and in texture prediction. In response,
+we introduce SIFU (Side-view Conditioned Implicit Function for Real-world
+Usable Clothed Human Reconstruction), a novel approach combining a Side-view
+Decoupling Transformer with a 3D Consistent Texture Refinement pipeline. SIFU
+employs a cross-attention mechanism within the transformer, using SMPL-X
+normals as queries to effectively decouple side-view features in the process of
+mapping 2D features to 3D. This method not only improves the precision of the
+3D models but also their robustness, especially when SMPL-X estimates are not
+perfect. Our texture refinement process leverages a text-to-image
+diffusion-based prior to generate realistic and consistent textures for
+invisible views. Through extensive experiments, SIFU surpasses SOTA methods in
+both geometry and texture reconstruction, showcasing enhanced robustness in
+complex scenarios and achieving unprecedented Chamfer and P2S measurements. Our
+approach extends to practical applications such as 3D printing and scene
+building, demonstrating its broad utility in real-world scenarios. Project page
+https://river-zhang.github.io/SIFU-projectpage/ .
+
+
+
+ comment: Accepted by CVPR 2024; Project page + https://river-zhang.github.io/SIFU-projectpage/ +
+
+
+
+
+ + ♻ ☆ SAOR: Single-View Articulated Object Reconstruction CVPR 2024 + + +
+ We introduce SAOR, a novel approach for estimating the 3D shape, texture, and +viewpoint of an articulated object from a single image captured in the wild. +Unlike prior approaches that rely on pre-defined category-specific 3D templates +or tailored 3D skeletons, SAOR learns to articulate shapes from single-view +image collections with a skeleton-free part-based model without requiring any +3D object shape priors. To prevent ill-posed solutions, we propose a +cross-instance consistency loss that exploits disentangled object shape +deformation and articulation. This is helped by a new silhouette-based sampling +mechanism to enhance viewpoint diversity during training. Our method only +requires estimated object silhouettes and relative depth maps from +off-the-shelf pre-trained networks during training. At inference time, given a +single-view image, it efficiently outputs an explicit mesh representation. We +obtain improved qualitative and quantitative results on challenging quadruped +animals compared to relevant existing work. + +
+
+ comment: Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor +
+
+
+
+
+ + ♻ ☆ CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification CVPR 2024 + + +
+ Person re-identification (re-ID) is a challenging task that aims to learn +discriminative features for person retrieval. In person re-ID, Jaccard distance +is a widely used distance metric, especially in re-ranking and clustering +scenarios. However, we discover that camera variation has a significant +negative impact on the reliability of Jaccard distance. In particular, Jaccard +distance calculates the distance based on the overlap of relevant neighbors. +Due to camera variation, intra-camera samples dominate the relevant neighbors, +which reduces the reliability of the neighbors by introducing intra-camera +negative samples and excluding inter-camera positive samples. To overcome this +problem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that +leverages camera information to enhance the reliability of Jaccard distance. +Specifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to +find k-reciprocal nearest neighbors on the intra-camera and inter-camera +ranking lists, which improves the reliability of relevant neighbors and +guarantees the contribution of inter-camera samples in the overlap. Moreover, +we propose a camera-aware local query expansion (CLQE) to mine reliable samples +in relevant neighbors by exploiting camera variation as a strong constraint and +assign these samples higher weights in overlap, further improving the +reliability. Our CA-Jaccard distance is simple yet effective and can serve as a +general distance metric for person re-ID methods with high reliability and low +computational cost. Extensive experiments demonstrate the effectiveness of our +method. + +
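Editor's note: for readers unfamiliar with the metric being modified above, the plain (camera-agnostic) Jaccard distance used in re-ID re-ranking is sketched below: it measures one minus the overlap of k-reciprocal neighbor sets. This is only the baseline quantity; CA-Jaccard's camera-aware variant, which builds neighbors from separate intra-camera and inter-camera ranking lists and reweights them, is not reproduced here, and `k` and the cosine distance are illustrative choices.

```python
import numpy as np

def k_reciprocal_neighbors(dist, i, k):
    """Indices j such that j is in i's top-k and i is in j's top-k."""
    forward = np.argsort(dist[i])[:k + 1]                 # top-k + the sample itself
    return {j for j in forward if i in np.argsort(dist[j])[:k + 1]}

def jaccard_distance(features, k=20):
    """Plain Jaccard distance from overlap of k-reciprocal neighbor sets.
    features: (N, D) array. Returns an (N, N) distance matrix in [0, 1]."""
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    dist = 1.0 - f @ f.T                                  # cosine distance
    neigh = [k_reciprocal_neighbors(dist, i, k) for i in range(len(f))]
    N = len(f)
    jac = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            inter = len(neigh[i] & neigh[j])
            union = len(neigh[i] | neigh[j])
            jac[i, j] = 1.0 - inter / union if union else 1.0
    return jac

jac = jaccard_distance(np.random.randn(50, 128))
```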
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the +architecture and function of in vivo tissues and organs. Quantification of +organoid morphology helps in studying organ development, drug discovery, and +toxicity assessment. Recent microscopy techniques provide a potent tool to +acquire organoid morphology features, but manual image analysis remains a labor +and time-intensive process. Thus, this paper proposes a comprehensive pipeline +for microscopy analysis that leverages the SegmentAnything to precisely +demarcate individual organoids. Additionally, we introduce a set of +morphological properties, including perimeter, area, radius, non-smoothness, +and non-circularity, allowing researchers to analyze the organoid structures +quantitatively and automatically. To validate the effectiveness of our +approach, we conducted tests on bright-field images of human induced +pluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. The +results obtained from our automatic pipeline closely align with manual organoid +detection and measurement, showcasing the capability of our proposed method in +accelerating organoids morphology analysis. + +
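Editor's note: once per-organoid masks are available (e.g., from SegmentAnything), the morphological properties listed above are standard region measurements. The sketch below uses scikit-image and common proxies; in particular, "non-circularity" is taken as 1 - 4*pi*area/perimeter^2, which may differ from the paper's exact definition.

```python
import numpy as np
from skimage import measure

def organoid_morphology(label_mask):
    """label_mask: (H, W) integer mask, one label per organoid (0 = background).
    Returns one dict of properties per organoid."""
    props = []
    for r in measure.regionprops(label_mask):
        area, perim = r.area, r.perimeter
        circularity = 4 * np.pi * area / (perim ** 2) if perim > 0 else 0.0
        props.append({
            "label": r.label,
            "area": area,
            "perimeter": perim,
            "radius": np.sqrt(area / np.pi),          # equivalent-circle radius
            "non_circularity": 1.0 - circularity,     # ~0 for a circle
        })
    return props

# Toy usage: a single filled disk should have non-circularity close to 0.
yy, xx = np.mgrid[:128, :128]
mask = ((yy - 64) ** 2 + (xx - 64) ** 2 < 40 ** 2).astype(int)
print(organoid_morphology(mask))
```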
+
+ comment: Replace Figure 4 with the correct version. The original version is + wrong due to a column name mismatch +
+
+
+
+
+ + ♻ ☆ Understanding normalization in contrastive representation learning and + out-of-distribution detection + + +
+ Contrastive representation learning has emerged as an outstanding approach +for anomaly detection. In this work, we explore the $\ell_2$-norm of +contrastive features and its applications in out-of-distribution detection. We +propose a simple method based on contrastive learning, which incorporates +out-of-distribution data by discriminating against normal samples in the +contrastive layer space. Our approach can be applied flexibly as an outlier +exposure (OE) approach, where the out-of-distribution data is a huge collective +of random images, or as a fully self-supervised learning approach, where the +out-of-distribution data is self-generated by applying distribution-shifting +transformations. The ability to incorporate additional out-of-distribution +samples enables a feasible solution for datasets where AD methods based on +contrastive learning generally underperform, such as aerial images or +microscopy images. Furthermore, the high-quality features learned through +contrastive learning consistently enhance performance in OE scenarios, even +when the available out-of-distribution dataset is not diverse enough. Our +extensive experiments demonstrate the superiority of our proposed method under +various scenarios, including unimodal and multimodal settings, with various +image datasets. + +
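Editor's note: the abstract centers on the l2-norm of contrastive features as a signal for out-of-distribution detection. The snippet below is only a minimal illustration of scoring samples by feature norm on top of a frozen encoder; the encoder, the sign convention, and the scoring rule are assumptions for illustration, not the paper's method.

```python
import torch

@torch.no_grad()
def norm_ood_scores(encoder, loader, device="cpu"):
    """Score each sample by the l2-norm of its (pre-normalization) feature from
    the contrastive layer; here lower norms are treated as more OOD."""
    scores = []
    encoder.eval().to(device)
    for x, _ in loader:
        z = encoder(x.to(device))            # (B, D) contrastive-layer features
        scores.append(z.norm(dim=1).cpu())
    return torch.cat(scores)

# Usage sketch: compute scores for an in-distribution loader and an OOD loader,
# then compare them, e.g., with sklearn.metrics.roc_auc_score.
```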
+
+
+
+
+ + ♻ ☆ Confronting Ambiguity in 6D Object Pose Estimation via Score-Based + Diffusion on SE(3) CVPR2024 + + +
+ Addressing pose ambiguity in 6D object pose estimation from single RGB images +presents a significant challenge, particularly due to object symmetries or +occlusions. In response, we introduce a novel score-based diffusion method +applied to the $SE(3)$ group, marking the first application of diffusion models +to $SE(3)$ within the image domain, specifically tailored for pose estimation +tasks. Extensive evaluations demonstrate the method's efficacy in handling pose +ambiguity, mitigating perspective-induced ambiguity, and showcasing the +robustness of our surrogate Stein score formulation on $SE(3)$. This +formulation not only improves the convergence of denoising process but also +enhances computational efficiency. Thus, we pioneer a promising strategy for 6D +object pose estimation. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Neural Implicit Morphing of Face Images CVPR 2024 + + +
+ Face morphing is a problem in computer graphics with numerous artistic and +forensic applications. It is challenging due to variations in pose, lighting, +gender, and ethnicity. This task consists of a warping for feature alignment +and a blending for a seamless transition between the warped images. We propose +to leverage coord-based neural networks to represent such warpings and +blendings of face images. During training, we exploit the smoothness and +flexibility of such networks by combining energy functionals employed in +classical approaches without discretizations. Additionally, our method is +time-dependent, allowing a continuous warping/blending of the images. During +morphing inference, we need both direct and inverse transformations of the +time-dependent warping. The first (second) is responsible for warping the +target (source) image into the source (target) image. Our neural warping stores +those maps in a single network dismissing the need for inverting them. The +results of our experiments indicate that our method is competitive with both +classical and generative models under the lens of image quality and +face-morphing detectors. Aesthetically, the resulting images present a seamless +blending of diverse faces not yet usual in the literature. + +
+
+ comment: 14 pages, 20 figures, accepted for CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation + + +
+ Aerial image segmentation is the basis for applications such as automatically
+creating maps or tracking deforestation. In true orthophotos, which are often
+used in these applications, many objects and regions can be approximated well
+by polygons. However, this fact is rarely exploited by state-of-the-art
+semantic segmentation models. Instead, most models allow unnecessary degrees of
+freedom in their predictions by allowing arbitrary region shapes. We therefore
+present a refinement of our deep learning model which predicts binary space
+partitioning trees, an efficient polygon representation. The refinements
+include a new feature decoder architecture and a new differentiable BSP tree
+renderer which both avoid vanishing gradients. Additionally, we introduce a
+novel loss function specifically designed to improve the spatial partitioning
+defined by the predicted trees. Furthermore, our expanded model can predict
+multiple trees at once and thus can predict class-specific segmentations. As an
+additional contribution, we investigate the impact of a non-optimal training
+process in comparison to an optimized training process. While model
+architectures optimized for aerial images, such as PFNet or our own model, show
+an advantage under non-optimal conditions, this advantage disappears under
+optimal training conditions. Despite this observation, our model still makes
+better predictions for small rectangular objects, e.g., cars.
+
+
+
+
+
+
+ + ♻ ☆ Synthetic data shuffling accelerates the convergence of federated + learning under data heterogeneity + + +
+ In federated learning, data heterogeneity is a critical challenge. A +straightforward solution is to shuffle the clients' data to homogenize the +distribution. However, this may violate data access rights, and how and when +shuffling can accelerate the convergence of a federated optimization algorithm +is not theoretically well understood. In this paper, we establish a precise and +quantifiable correspondence between data heterogeneity and parameters in the +convergence rate when a fraction of data is shuffled across clients. We prove +that shuffling can quadratically reduce the gradient dissimilarity with respect +to the shuffling percentage, accelerating convergence. Inspired by the theory, +we propose a practical approach that addresses the data access rights issue by +shuffling locally generated synthetic data. The experimental results show that +shuffling synthetic data improves the performance of multiple existing +federated learning algorithms by a large margin. + +
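Editor's note: the practical intervention described above, shuffling a fraction of locally generated synthetic data across clients, is easy to express directly. The sketch below is a generic illustration; the per-client data layout, the shuffled fraction, and the even redistribution are assumptions, not the paper's exact protocol.

```python
import numpy as np

def shuffle_synthetic_data(client_synthetic, fraction=0.5, seed=0):
    """client_synthetic: list of (X_i, y_i) numpy arrays of synthetic samples per
    client. A `fraction` of each client's synthetic samples is pooled, shuffled
    globally, and redistributed evenly, homogenizing the per-client distributions
    without sharing any real data."""
    rng = np.random.default_rng(seed)
    keep, pooled_X, pooled_y = [], [], []
    for X, y in client_synthetic:
        idx = rng.permutation(len(X))
        n_share = int(fraction * len(X))
        pooled_X.append(X[idx[:n_share]])
        pooled_y.append(y[idx[:n_share]])
        keep.append((X[idx[n_share:]], y[idx[n_share:]]))
    pooled_X, pooled_y = np.concatenate(pooled_X), np.concatenate(pooled_y)
    splits = np.array_split(rng.permutation(len(pooled_X)), len(client_synthetic))
    return [(np.concatenate([kx, pooled_X[s]]), np.concatenate([ky, pooled_y[s]]))
            for (kx, ky), s in zip(keep, splits)]
```

Each client then trains on its kept samples plus its share of the shuffled pool, which is what reduces the gradient dissimilarity in the analysis above.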
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ♻ ☆ Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR + Fusion + + +
+ In this paper, we study the problem of jointly estimating the optical flow +and scene flow from synchronized 2D and 3D data. Previous methods either employ +a complex pipeline that splits the joint task into independent stages, or fuse +2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such +one-size-fits-all approaches suffer from a dilemma of failing to fully utilize +the characteristic of each modality or to maximize the inter-modality +complementarity. To address the problem, we propose a novel end-to-end +framework, which consists of 2D and 3D branches with multiple bidirectional +fusion connections between them in specific layers. Different from previous +work, we apply a point-based 3D branch to extract the LiDAR features, as it +preserves the geometric structure of point clouds. To fuse dense image features +and sparse point features, we propose a learnable operator named bidirectional +camera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the +bidirectional fusion pipeline, one based on the pyramidal coarse-to-fine +architecture (dubbed CamLiPWC), and the other one based on the recurrent +all-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC +and CamLiRAFT surpass all existing methods and achieve up to a 47.9\% reduction +in 3D end-point-error from the best published result. Our best-performing +model, CamLiRAFT, achieves an error of 4.26\% on the KITTI Scene Flow +benchmark, ranking 1st among all submissions with much fewer parameters. +Besides, our methods have strong generalization performance and the ability to +handle non-rigid motion. Code is available at +https://github.com/MCG-NJU/CamLiFlow. + +
+
+ comment: Accepted to TPAMI 2023 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
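Editor's note: the key idea above, feeding an initial burst-SR estimate into an intermediate step of the diffusion process and running only the remaining reverse steps, resembles noising a given image to a mid timestep and denoising from there. The sketch below uses generic DDPM equations with a placeholder denoiser and a linear beta schedule; it is not the paper's network or schedule.

```python
import torch

def reverse_from_intermediate(denoiser, x0_init, t_mid=250, T=1000):
    """x0_init: (B, C, H, W) initial SR estimate reconstructed from burst LR
    features. denoiser(x_t, t) -> predicted noise. Hyperparameters illustrative."""
    betas = torch.linspace(1e-4, 2e-2, T)
    alphas = 1.0 - betas
    abar = torch.cumprod(alphas, dim=0)
    # Diffuse the initial estimate to timestep t_mid, i.e. sample q(x_t | x_0).
    noise = torch.randn_like(x0_init)
    x = abar[t_mid].sqrt() * x0_init + (1 - abar[t_mid]).sqrt() * noise
    # Run only the remaining reverse steps, which refine detailed textures
    # instead of re-generating the global structure from pure noise.
    for t in range(t_mid, 0, -1):
        t_batch = torch.full((x.shape[0],), t, dtype=torch.long)
        eps = denoiser(x, t_batch)
        mean = (x - betas[t] / (1 - abar[t]).sqrt() * eps) / alphas[t].sqrt()
        x = mean + betas[t].sqrt() * torch.randn_like(x) if t > 1 else mean
    return x
```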
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ♻ ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring +method that utilizes a gyro sensor to effectively resolve the ill-posedness of +image deblurring. The gyro sensor provides valuable information about camera +motion during exposure time that can significantly improve deblurring quality. +However, effectively exploiting real-world gyro data is challenging due to +significant errors from various sources including sensor noise, the disparity +between the positions of a camera module and a gyro sensor, the absence of +translational motion information, and moving objects whose motions cannot be +captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with +two novel neural network blocks: a gyro refinement block and a gyro deblurring +block. The gyro refinement block refines the error-ridden gyro data using the +blur information from the input image. On the other hand, the gyro deblurring +block removes blur from the input image using the refined gyro data and further +compensates for gyro error by leveraging the blur information from the input +image. For training a neural network with erroneous gyro data, we propose a +training strategy based on the curriculum learning. We also introduce a novel +gyro data embedding scheme to represent real-world intricate camera shakes. +Finally, we present a synthetic dataset and a real dataset for the training and +evaluation of gyro-based single image deblurring. Our experiments demonstrate +that our approach achieves state-of-the-art deblurring quality by effectively +utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
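Editor's note: since the review above surveys knowledge distillation, it may help to recall the canonical response-based distillation loss (softened teacher/student logits matched with a temperature-scaled KL term plus the usual cross-entropy). The temperature and weighting below are arbitrary illustrative defaults.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.7):
    """Classic response-based KD: soften both distributions with temperature T,
    match them with KL divergence (scaled by T^2), and mix with cross-entropy
    on the ground-truth labels."""
    soft_targets = F.softmax(teacher_logits / T, dim=1)
    log_student = F.log_softmax(student_logits / T, dim=1)
    kd = F.kl_div(log_student, soft_targets, reduction="batchmean") * (T * T)
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1.0 - alpha) * ce

loss = distillation_loss(torch.randn(8, 10), torch.randn(8, 10),
                         torch.randint(0, 10, (8,)))
```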
+
+ comment: 36 pages ,10 figures +
+
+
+
+
+ + ♻ ☆ Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree + Image Generation AAAI 24 + + +
+ A 360-degree (omni-directional) image provides an all-encompassing spherical
+view of a scene. Recently, there has been an increasing interest in
+synthesising 360-degree images from conventional narrow field of view (NFoV)
+images captured by digital cameras and smartphones, for providing immersive
+experiences in various scenarios such as virtual reality. Yet, existing methods
+typically fall short in synthesizing intricate visual details or ensuring that
+the generated images align consistently with user-provided prompts. In this
+study, an autoregressive omni-aware generative network (AOG-Net) is proposed
+for 360-degree image generation by out-painting an incomplete 360-degree image
+progressively with NFoV and text guidances jointly or individually. This
+autoregressive scheme not only allows for deriving finer-grained and
+text-consistent patterns by dynamically generating and adjusting the process
+but also offers users greater flexibility to edit their conditions throughout
+the generation process. A global-local conditioning mechanism is devised to
+comprehensively formulate the outpainting guidance in each autoregressive step.
+Text guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and
+further formulated with cross-attention based transformers into a global stream
+and a local stream that condition a generative backbone model. As AOG-Net is
+compatible with large-scale models for the conditional encoder and the
+generative prior, it enables the generation to use extensive open-vocabulary
+text guidances. Comprehensive experiments on two commonly used 360-degree image
+datasets for both indoor and outdoor settings demonstrate the state-of-the-art
+performance of our proposed method. Our code will be made publicly available.
+
+
+
+ comment: Accepted by AAAI 24 +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Unifying Correspondence, Pose and NeRF for Pose-Free Novel View + Synthesis from Stereo Pairs CVPR2024 + + +
+ This work delves into the task of pose-free novel view synthesis from stereo +pairs, a challenging and pioneering task in 3D vision. Our innovative +framework, unlike any before, seamlessly integrates 2D correspondence matching, +camera pose estimation, and NeRF rendering, fostering a synergistic enhancement +of these tasks. We achieve this through designing an architecture that utilizes +a shared representation, which serves as a foundation for enhanced 3D geometry +understanding. Capitalizing on the inherent interplay between the tasks, our +unified framework is trained end-to-end with the proposed training strategy to +improve overall model accuracy. Through extensive evaluations across diverse +indoor and outdoor scenes from two real-world datasets, we demonstrate that our +approach achieves substantial improvement over previous methodologies, +especially in scenarios characterized by extreme viewpoint changes and the +absence of accurate camera poses. + +
+
+ comment: Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera + ready version (Highlight) +
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct visibility of the
+background scene and degrade image quality. Despite recent progress in image
+deraining methods and datasets, there is a lack of focus on raindrop removal
+from UAV aerial imagery due to the unique challenges posed by varying angles
+and rapid movement during drone flight. To fill the gap in this research, we
+first construct a new benchmark dataset for removing raindrops from UAV images,
+called UAV-Rain1k. In this letter, we provide a dataset generation pipeline,
+which includes modeling raindrop shapes using Blender, collecting background
+images from various UAV angles, random sampling of rain masks, etc. Based on
+the proposed benchmark, we further present a comprehensive evaluation of
+existing representative image deraining algorithms, and reveal future research
+opportunities worth exploring. The proposed dataset is publicly available at
+https://github.com/cschenxiang/UAV-Rain1k.
+
+
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Fully Sparse 3D Occupancy Prediction + + +
+ Occupancy prediction plays a pivotal role in autonomous driving. Previous
+methods typically construct dense 3D volumes, neglecting the inherent sparsity
+of the scene and suffering high computational costs. To bridge the gap, we
+introduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc
+initially reconstructs a sparse 3D representation from visual inputs and
+subsequently predicts semantic/instance occupancy from the 3D sparse
+representation by sparse queries. A mask-guided sparse sampling is designed to
+enable sparse queries to interact with 2D features in a fully sparse manner,
+thereby circumventing costly dense features or global attention. Additionally,
+we design a thoughtful ray-based evaluation metric, namely RayIoU, to solve the
+inconsistency penalty along depths raised in traditional voxel-level mIoU
+criteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of
+34.0, while maintaining a real-time inference speed of 17.3 FPS, with 7 history
+frames as input. By incorporating more preceding frames, up to 15, SparseOcc
+continuously improves its performance to 35.1 RayIoU without bells and
+whistles. Code is available at https://github.com/MCG-NJU/SparseOcc.
+
+
+
+ comment: Add new metric: RayIoU +
+
+
+
+
+ + ♻ ☆ Enhancing Ship Classification in Optical Satellite Imagery: Integrating + Convolutional Block Attention Module with ResNet for Improved Performance + + +
+ This study presents an advanced Convolutional Neural Network (CNN) +architecture for ship classification from optical satellite imagery, +significantly enhancing performance through the integration of the +Convolutional Block Attention Module (CBAM) and additional architectural +innovations. Building upon the foundational ResNet50 model, we first +incorporated a standard CBAM to direct the model's focus towards more +informative features, achieving an accuracy of 87% compared to the baseline +ResNet50's 85%. Further augmentations involved multi-scale feature integration, +depthwise separable convolutions, and dilated convolutions, culminating in the +Enhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable +accuracy of 95%, with precision, recall, and f1-scores all witnessing +substantial improvements across various ship classes. The bulk carrier and oil +tanker classes, in particular, showcased nearly perfect precision and recall +rates, underscoring the model's enhanced capability in accurately identifying +and classifying ships. Attention heatmap analyses further validated the +improved model's efficacy, revealing a more focused attention on relevant ship +features, regardless of background complexities. These findings underscore the +potential of integrating attention mechanisms and architectural innovations in +CNNs for high-resolution satellite imagery classification. The study navigates +through the challenges of class imbalance and computational costs, proposing +future directions towards scalability and adaptability in new or rare ship type +recognition. This research lays a groundwork for the application of advanced +deep learning techniques in the domain of remote sensing, offering insights +into scalable and efficient satellite image classification. + +
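Editor's note: the Convolutional Block Attention Module used above follows a well-known design: channel attention computed from global average- and max-pooled descriptors, followed by a 7x7 spatial attention over channel-wise average and max maps. The PyTorch sketch below uses typical defaults (reduction ratio 16, kernel 7), which are assumptions rather than the authors' exact configuration of the enhanced ResNet.

```python
import torch
import torch.nn as nn

class CBAM(nn.Module):
    """Standard CBAM: channel attention followed by spatial attention."""
    def __init__(self, channels, reduction=16, spatial_kernel=7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels))
        self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2)

    def forward(self, x):
        b, c, _, _ = x.shape
        # Channel attention from global average- and max-pooled descriptors.
        avg = self.mlp(x.mean(dim=(2, 3)))
        mx = self.mlp(x.amax(dim=(2, 3)))
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        # Spatial attention from channel-wise average and max maps.
        s = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(s))

# E.g., applied to a ResNet50 stage output of shape (B, 1024, 14, 14).
y = CBAM(1024)(torch.randn(2, 1024, 14, 14))
```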
+
+
+
+
+ + ♻ ☆ Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning + + +
+ In this work, we use multi-view aerial images to reconstruct the geometry,
+lighting, and material of facades using neural signed distance fields (SDFs).
+Without the requirement of complex equipment, our method only takes simple RGB
+images captured by a drone as inputs to enable physically based and
+photorealistic novel-view rendering, relighting, and editing. However, a
+real-world facade usually has complex appearances ranging from diffuse rocks
+with subtle details to large-area glass windows with specular reflections,
+making it hard to attend to everything. As a result, previous methods can
+preserve the geometry details but fail to reconstruct smooth glass windows, or
+vice versa. In order to address this challenge, we introduce three spatial- and
+semantic-adaptive optimization strategies, including a semantic regularization
+approach based on zero-shot segmentation techniques to improve material
+consistency, a frequency-aware geometry regularization to balance surface
+smoothness and details in different surfaces, and a visibility probe-based
+scheme to enable efficient modeling of the local lighting in large-scale
+outdoor environments. In addition, we capture a real-world facade aerial 3D
+scanning image set and corresponding point clouds for training and
+benchmarking. The experiments demonstrate the superior quality of our method on
+facade holistic inverse rendering, novel view synthesis, and scene editing
+compared to state-of-the-art baselines.
+
+
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable + Trajectory Generation AAAI-2024 + + +
+ The learn-from-observation (LfO) paradigm is a human-inspired mode for a +robot to learn to perform a task simply by watching it being performed. LfO can +facilitate robot integration on factory floors by minimizing disruption and +reducing tedious programming. A key component of the LfO pipeline is a +transformation of the depth camera frames to the corresponding task state and +action pairs, which are then relayed to learning techniques such as imitation +or inverse reinforcement learning for understanding the task parameters. While +several existing computer vision models analyze videos for activity +recognition, SA-Net specifically targets robotic LfO from RGB-D data. However, +SA-Net and many other models analyze frame data captured from a single +viewpoint. Their analysis is therefore highly sensitive to occlusions of the +observed task, which are frequent in deployments. An obvious way of reducing +occlusions is to simultaneously observe the task from multiple viewpoints and +synchronously fuse the multiple streams in the model. Toward this, we present +multi-view SA-Net, which generalizes the SA-Net model to allow the perception +of multiple viewpoints of the task activity, integrate them, and better +recognize the state and action in each frame. Performance evaluations on two +distinct domains establish that MVSA-Net recognizes the state-action pairs +under occlusion more accurately compared to single-view MVSA-Net and other +baselines. Our ablation studies further evaluate its performance under +different ambient conditions and establish the contribution of the architecture +components. As such, MVSA-Net offers a significantly more robust and deployable +state-action trajectory generation compared to previous methods. + +
+
+ comment: Presented at Deployable AI Workshop at AAAI-2024 and 'Towards + Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023 +
+
+
+
+
+ + ♻ ☆ And Then the Hammer Broke: Reflections on Machine Ethics from Feminist + Philosophy of Science + + +
+ Vision is an important metaphor in ethical and political questions of +knowledge. The feminist philosopher Donna Haraway points out the ``perverse'' +nature of an intrusive, alienating, all-seeing vision (to which we might cry +out ``stop looking at me!''), but also encourages us to embrace the embodied +nature of sight and its promises for genuinely situated knowledge. Current +technologies of machine vision -- surveillance cameras, drones (for war or +recreation), iPhone cameras -- are usually construed as instances of the former +rather than the latter, and for good reasons. However, although in no way +attempting to diminish the real suffering these technologies have brought about +in the world, I make the case for understanding technologies of computer vision +as material instances of embodied seeing and situated knowing. Furthermore, +borrowing from Iris Murdoch's concept of moral vision, I suggest that these +technologies direct our labor towards self-reflection in ethically significant +ways. My approach draws upon paradigms in computer vision research, +phenomenology, and feminist epistemology. Ultimately, this essay is an argument +for directing more philosophical attention from merely criticizing technologies +of vision as ethically deficient towards embracing them as complex, +methodologically and epistemologically important objects. + +
+
+ comment: Pacific University Philosophy Conference +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous
+skills; however, learning complex skills robustly and generalizably usually
+consumes large amounts of human demonstrations. To tackle this challenging
+problem, we present 3D Diffusion Policy (DP3), a novel visual imitation
+learning approach that incorporates the power of 3D visual representations into
+diffusion policies, a class of conditional action generative models. The core
+design of DP3 is the utilization of a compact 3D visual representation,
+extracted from sparse point clouds with an efficient point encoder. In our
+experiments involving 72 simulation tasks, DP3 successfully handles most tasks
+with just 10 demonstrations and surpasses baselines with a 24.2% relative
+improvement. In 4 real robot tasks, DP3 demonstrates precise control with a
+high success rate of 85%, given only 40 demonstrations of each task, and shows
+excellent generalization abilities in diverse aspects, including space,
+viewpoint, appearance, and instance. Interestingly, in real robot experiments,
+DP3 rarely violates safety requirements, in contrast to baseline methods which
+frequently do, necessitating human intervention. Our extensive evaluation
+highlights the critical importance of 3D representations in real-world robot
+learning. Videos, code, and data are available on
+https://3d-diffusion-policy.github.io .
+
+
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset CVPR 2024 + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: CVPR 2024 (Oral Presentation), Project page: + https://x360dataset.github.io/ +
+
+
+
+
+ + ♻ ☆ A Benchmark Grocery Dataset of Realworld Point Clouds From Single View + + +
+ Fine-grained grocery object recognition is an important computer vision +problem with broad applications in automatic checkout, in-store robotic +navigation, and assistive technologies for the visually impaired. Existing +datasets on groceries are mainly 2D images. Models trained on these datasets +are limited to learning features from the regular 2D grids. While portable 3D +sensors such as Kinect were commonly available for mobile phones, sensors such +as LiDAR and TrueDepth, have recently been integrated into mobile phones. +Despite the availability of mobile 3D sensors, there are currently no dedicated +real-world large-scale benchmark 3D datasets for grocery. In addition, existing +3D datasets lack fine-grained grocery categories and have limited training +samples. Furthermore, collecting data by going around the object versus the +traditional photo capture makes data collection cumbersome. Thus, we introduce +a large-scale grocery dataset called 3DGrocery100. It constitutes 100 classes, +with a total of 87,898 3D point clouds created from 10,755 RGB-D single-view +images. We benchmark our dataset on six recent state-of-the-art 3D point cloud +classification models. Additionally, we also benchmark the dataset on few-shot +and continual learning point cloud classification tasks. Project Page: +https://bigdatavision.org/3DGrocery100/. + +
+
+
+
+
+ + ♻ ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular
+generative models with good generation quality on various tasks. When training
+DM and CM, intermediate weight checkpoints are not fully utilized and only the
+last converged checkpoint is used. In this work, we find that high-quality
+model weights often lie in a basin which cannot be reached by SGD but can be
+obtained by proper checkpoint averaging. Based on these observations, we
+propose LCSC, a simple but effective and efficient method to enhance the
+performance of DM and CM, by combining checkpoints along the training
+trajectory with coefficients deduced from evolutionary search. We demonstrate
+the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$
+With LCSC, we only need to train DM/CM with fewer iterations and/or smaller
+batch sizes to obtain comparable sample quality with the fully trained
+model. For example, LCSC achieves considerable training speedups for CM
+(23$\times$ on CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing
+pre-trained models.}$ Assuming full training is already done, LCSC can further
+improve the generation quality or speed of the final converged models. For
+example, LCSC achieves better performance using a single function evaluation
+(NFE) than the base model with 2 NFEs on consistency distillation, and
+decreases the NFE of DM from 15 to 9 while maintaining the generation quality
+on CIFAR-10. Our code is available at
+https://github.com/imagination-research/LCSC.
+
+
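Editor's note: the core operation in LCSC, loading a linear combination of saved checkpoints with searched coefficients, is compact to express. The sketch below only shows the combination step; the evolutionary search that produces the coefficients is omitted, and the state-dict handling is a generic assumption rather than the released code.

```python
import torch

def combine_checkpoints(model, checkpoint_paths, coeffs):
    """Load sum_i coeffs[i] * weights_i into `model`. The coefficients come from
    an external search (e.g., evolutionary search against a small validation
    metric) and need not be positive or sum to one."""
    assert len(checkpoint_paths) == len(coeffs)
    states = [torch.load(p, map_location="cpu") for p in checkpoint_paths]
    combined = {}
    for key in states[0]:
        mixed = sum(c * s[key].float() for c, s in zip(coeffs, states))
        combined[key] = mixed.to(states[0][key].dtype)
    model.load_state_dict(combined)
    return model
```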
+
+
+
+
+ + ♻ ☆ S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation + + +
+ Deep learning has become a powerful tool for Mars exploration. Mars terrain
+semantic segmentation is an important Martian vision task, which is the base of
+rover autonomous planning and safe driving. However, there is a lack of
+sufficient detailed and high-confidence data annotations, which are exactly
+required by most deep learning methods to obtain a good model. To address this
+problem, we propose our solution from the perspective of joint data and method
+design. We first present a new dataset, S5Mars, for Semi-SuperviSed learning on
+Mars Semantic Segmentation, which contains 6K high-resolution images and is
+sparsely annotated based on confidence, ensuring the high quality of labels.
+Then, to learn from this sparse data, we propose a semi-supervised learning
+(SSL) framework for Mars image semantic segmentation, to learn representations
+from limited labeled data. Different from the existing SSL methods which are
+mostly targeted at the Earth image data, our method takes into account Mars
+data characteristics. Specifically, we first investigate the impact of current
+widely used natural image augmentations on Mars images. Based on the analysis,
+we then propose two novel and effective augmentations for SSL of Mars
+segmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost
+the model performance. Meanwhile, to fully leverage the unlabeled data, we
+introduce a soft-to-hard consistency learning strategy, learning from different
+targets based on prediction confidence. Experimental results show that our
+method can outperform state-of-the-art SSL approaches remarkably. Our proposed
+dataset is available at https://jhang2020.github.io/S5Mars.github.io/.
+
+
+
+ comment: IEEE TGRS 2024 +
+
+
+
+
+ + ♻ ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Single Mesh Diffusion Models with Field Latents for Texture Generation CVPR 2024 + + +
+ We introduce a framework for intrinsic latent diffusion models operating
+directly on the surfaces of 3D shapes, with the goal of synthesizing
+high-quality textures. Our approach is underpinned by two contributions: field
+latents, a latent representation encoding textures as discrete vector fields on
+the mesh vertices, and field latent diffusion models, which learn to denoise a
+diffusion process in the learned latent space on the surface. We consider a
+single-textured-mesh paradigm, where our models are trained to generate
+variations of a given texture on a mesh. We show the synthesized textures are
+of superior fidelity compared to those from existing single-textured-mesh
+generative models. Our models can also be adapted for user-controlled editing
+tasks such as inpainting and label-guided generation. The efficacy of our
+approach is due in part to the equivariance of our proposed framework under
+isometries, allowing our models to seamlessly reproduce details across locally
+similar regions and opening the door to a notion of generative texture
+transfer.
+
+
+
+ comment: CVPR 2024. Code and additional visualizations available: + https://single-mesh-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ GMISeg: General Medical Image Segmentation without Re-Training + + +
+ Although deep learning models have become the main method for medical image
+segmentation, they often cannot be extended to unknown segmentation tasks
+involving new anatomical structures, image shapes, or labels. For new
+segmentation tasks, researchers often have to retrain or fine-tune the model,
+which is time-consuming and poses a significant obstacle to clinical
+researchers, who often lack the resources and professional knowledge to train
+neural networks. Therefore, we propose a general method that can solve unknown
+medical image segmentation tasks without requiring additional training. Given
+an example set of images and prompts defining the new segmentation task,
+GMISeg applies a novel low-rank fine-tuning strategy to the SAM (Segment
+Anything Model) image encoder, and works with the prompt encoder and mask
+decoder to fine-tune on the labeled dataset without the need for additional
+training. To achieve generalization to new tasks, we used medical image
+datasets with different imaging modes for different body parts. We trained and
+generalized GMISeg on different sets of anatomical structures and imaging
+modes, using cardiac images from other sites' datasets. We demonstrate that
+GMISeg outperforms the latest methods on unknown tasks and provide a
+comprehensive analysis and summary of the performance of the proposed method.
+
+
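Editor's note: the "low-rank fine-tuning strategy" applied to the SAM image encoder above is in the spirit of LoRA-style adapters on frozen linear layers. The wrapper below is a generic sketch; its attachment point (e.g., the attention projections of a SAM-style transformer block) and the rank/alpha values are assumptions, not the paper's specification.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wrap a frozen nn.Linear with a trainable low-rank update: W x + (B A) x * scale."""
    def __init__(self, base: nn.Linear, rank=4, alpha=4.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                       # keep pretrained weights frozen
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))  # starts as a no-op
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + (x @ self.A.t() @ self.B.t()) * self.scale

# Hypothetical usage: wrap the qkv projection of one transformer block in a
# SAM-style image encoder, then train only the A/B parameters on the example set.
# block.attn.qkv = LoRALinear(block.attn.qkv, rank=4)
```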
+
+
+
+
+ + ♻ ☆ i-MAE: Are Latent Representations in Masked Autoencoders Linearly + Separable? + + +
+ Masked image modeling (MIM) has been recognized as a strong self-supervised +pre-training approach in the vision domain. However, the mechanism and +properties of the learned representations by such a scheme, as well as how to +further enhance the representations are so far not well-explored. In this +paper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework +to enhance the representation capability from two aspects: (1) employing a +two-way image reconstruction and a latent feature reconstruction with +distillation loss to learn better features; (2) proposing a semantics-enhanced +sampling strategy to boost the learned semantics in MAE. Upon the proposed +i-MAE architecture, we can address two critical questions to explore the +behaviors of the learned representations in MAE: (1) Whether the separability +of latent representations in Masked Autoencoders is helpful for model +performance? We study it by forcing the input as a mixture of two images +instead of one. (2) Whether we can enhance the representations in the latent +feature space by controlling the degree of semantics during sampling on Masked +Autoencoders? To this end, we propose a sampling strategy within a mini-batch +based on the semantics of training samples to examine this aspect. Extensive +experiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to +verify the observations we discovered. Furthermore, in addition to +qualitatively analyzing the characteristics of the latent representations, we +examine the existence of linear separability and the degree of semantics in the +latent space by proposing two evaluation schemes. The surprising and consistent +results demonstrate that i-MAE is a superior framework design for understanding +MAE frameworks, as well as achieving better representational ability. Code is +available at https://github.com/vision-learning-acceleration-lab/i-mae. + +
+
+ comment: Project page: https://zhiqiangshen.com/projects/i-mae/ +
+
+
+
+
+ + ♻ ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
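+ The first trick is easy to picture: a coarse soft mask is edge-refined with a guided
+filter that uses the RGB image as guidance. The snippet below is a hedged sketch, assuming
+opencv-contrib-python (cv2.ximgproc) is available; the radius and eps values are arbitrary
+defaults, not the paper's settings.
+
+import cv2
+import numpy as np
+
+def refine_mask(image_bgr: np.ndarray, soft_mask: np.ndarray,
+                radius: int = 8, eps: float = 1e-3) -> np.ndarray:
+    guide = image_bgr.astype(np.float32) / 255.0
+    src = soft_mask.astype(np.float32)            # values in [0, 1], shape (H, W)
+    refined = cv2.ximgproc.guidedFilter(guide, src, radius, eps)
+    return np.clip(refined, 0.0, 1.0)
+
+# Example: a blocky low-resolution prediction, upsampled and then edge-refined.
+img = (np.random.rand(128, 128, 3) * 255).astype(np.uint8)
+coarse = cv2.resize(np.random.rand(16, 16).astype(np.float32), (128, 128))
+refined = refine_mask(img, coarse)
+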
+
+
+
+
+ + ♻ ☆ Divide and Conquer: High-Resolution Industrial Anomaly Detection via + Memory Efficient Tiled Ensemble CVPR 24 + + +
+ Industrial anomaly detection is an important task within computer vision with +a wide range of practical use cases. The small size of anomalous regions in +many real-world datasets necessitates processing the images at a high +resolution. This frequently poses significant challenges concerning memory +consumption during the model training and inference stages, leaving some +existing methods impractical for widespread adoption. To overcome this +challenge, we present the tiled ensemble approach, which reduces memory +consumption by dividing the input images into a grid of tiles and training a +dedicated model for each tile location. The tiled ensemble is compatible with +any existing anomaly detection model without the need for any modification of +the underlying architecture. By introducing overlapping tiles, we utilize the +benefits of traditional stacking ensembles, leading to further improvements in +anomaly detection capabilities beyond high resolution alone. We perform a +comprehensive analysis using diverse underlying architectures, including Padim, +PatchCore, FastFlow, and Reverse Distillation, on two standard anomaly +detection datasets: MVTec and VisA. Our method demonstrates a notable +improvement across setups while remaining within GPU memory constraints, +consuming only as much GPU memory as a single model needs to process a single +tile. + +
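+ The memory trick can be sketched independently of any particular detector: cut the
+high-resolution input into an overlapping grid of tiles and train one memory-bounded model
+per tile location. Tile size and stride below are illustrative assumptions.
+
+import numpy as np
+
+def tile_image(img: np.ndarray, tile: int = 256, stride: int = 192):
+    """Return {(row, col): tile_array} for an (H, W, C) image."""
+    h, w = img.shape[:2]
+    tiles = {}
+    for r, y in enumerate(range(0, max(h - tile, 0) + 1, stride)):
+        for c, x in enumerate(range(0, max(w - tile, 0) + 1, stride)):
+            tiles[(r, c)] = img[y:y + tile, x:x + tile]
+    return tiles
+
+tiles = tile_image(np.zeros((1024, 1024, 3), dtype=np.uint8))
+print(len(tiles))   # one anomaly model would be trained per (row, col) location
+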
+
+ comment: To appear at CVPR 24 Visual Anomaly Detection Workshop. Research + conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023 + page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl +
+
+
+
+
+ + ♻ ☆ TrailBlazer: Trajectory Control for Diffusion-Based Video Generation + + +
+ Within recent approaches to text-to-video (T2V) generation, achieving controllability in
+the synthesized video is often a challenge. Typically, this issue is addressed by providing
+low-level per-frame guidance in the form of edge maps, depth maps, or an existing video to
+be altered. However, the process of obtaining such guidance can be labor-intensive. This
+paper focuses on enhancing controllability in video synthesis by employing straightforward
+bounding boxes to guide the subject in various ways, all without the need for neural network
+training, finetuning, optimization at inference time, or the use of pre-existing videos. Our
+algorithm, TrailBlazer, is constructed upon a pre-trained T2V model and is easy to implement.
+The subject is directed by a bounding box through the proposed spatial and temporal attention
+map editing. Moreover, we introduce the concept of keyframing, allowing the subject
+trajectory and overall appearance to be guided by both a moving bounding box and
+corresponding prompts, without the need to provide a detailed mask. The method is efficient,
+with negligible additional computation relative to the underlying pre-trained model. Despite
+the simplicity of the bounding box guidance, the resulting motion is surprisingly natural,
+with emergent effects including perspective and movement toward the virtual camera as the box
+size increases.
+
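+ The keyframing idea can be pictured without the diffusion model itself: boxes given at a
+few keyframes are interpolated to every frame, and the per-frame box is what drives the
+attention-map editing. The sketch below assumes simple linear interpolation of normalized
+box coordinates, which is an illustrative choice rather than the paper's exact scheme.
+
+import numpy as np
+
+def interpolate_boxes(keyframes: dict, num_frames: int) -> np.ndarray:
+    """keyframes: {frame_index: (x0, y0, x1, y1)} in normalized coordinates."""
+    ks = sorted(keyframes)
+    boxes = np.stack([np.array(keyframes[k], dtype=np.float32) for k in ks])
+    out = np.empty((num_frames, 4), dtype=np.float32)
+    for f in range(num_frames):
+        out[f] = np.array([np.interp(f, ks, boxes[:, d]) for d in range(4)])
+    return out
+
+# A subject that drifts from the left to the right of the frame over 24 frames.
+boxes = interpolate_boxes({0: (0.1, 0.4, 0.3, 0.8), 23: (0.6, 0.4, 0.8, 0.8)}, 24)
+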
+
+ comment: 14 pages, 18 figures, Project Page: + https://hohonu-vicml.github.io/Trailblazer.Page/ +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation via the Target-aware Transformer CVPR2022 + + +
+ Knowledge distillation has become a de facto standard for improving the performance of
+small neural networks. Most previous works propose to regress the representational features
+from the teacher to the student in a one-to-one spatial matching fashion. However, this
+overlooks the fact that, due to architectural differences, the semantic information at the
+same spatial location usually varies. This greatly undermines the underlying assumption of
+the one-to-one distillation approach. To this end, we propose a novel one-to-all spatial
+matching knowledge distillation approach. Specifically, we allow each pixel of the teacher
+feature to be distilled to all spatial locations of the student features given its
+similarity, which is generated from a target-aware transformer. Our approach surpasses the
+state-of-the-art methods by a significant margin on various computer vision benchmarks, such
+as ImageNet, Pascal VOC and COCOStuff10k. Code is available at
+https://github.com/sihaoevery/TaT.
+
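+ A toy loss makes the one-to-all idea concrete: each teacher location attends over every
+student location and is regressed against the resulting aggregate. This is an illustrative
+stand-in, not the paper's target-aware transformer; the scaling and loss choice are
+assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def one_to_all_kd_loss(f_s: torch.Tensor, f_t: torch.Tensor) -> torch.Tensor:
+    """f_s, f_t: (B, C, H, W) student / teacher features of matching shape."""
+    b, c, h, w = f_t.shape
+    s = f_s.flatten(2).transpose(1, 2)                              # (B, HW, C)
+    t = f_t.flatten(2).transpose(1, 2)                              # (B, HW, C)
+    sim = torch.softmax(t @ s.transpose(1, 2) / c ** 0.5, dim=-1)   # (B, HW, HW)
+    t_hat = sim @ s          # each teacher pixel rebuilt from all student pixels
+    return F.mse_loss(t_hat, t)
+
+loss = one_to_all_kd_loss(torch.randn(2, 64, 14, 14), torch.randn(2, 64, 14, 14))
+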
+
+ comment: CVPR2022(Oral) +
+
+
+
+
+ + ♻ ☆ PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor CVPR 2024 + + +
+ Generative image editing has recently witnessed extremely fast-paced growth. +Some works use high-level conditioning such as text, while others use low-level +conditioning. Nevertheless, most of them lack fine-grained control over the +properties of the different objects present in the image, i.e. object-level +image editing. In this work, we tackle the task by perceiving the images as an +amalgamation of various objects and aim to control the properties of each +object in a fine-grained manner. Out of these properties, we identify structure +and appearance as the most intuitive to understand and useful for editing +purposes. We propose PAIR Diffusion, a generic framework that can enable a +diffusion model to control the structure and appearance properties of each +object in the image. We show that having control over the properties of each +object in an image leads to comprehensive editing capabilities. Our framework +allows for various object-level editing operations on real images such as +reference image-based appearance editing, free-form shape editing, adding +objects, and variations. Thanks to our design, we do not require any inversion +step. Additionally, we propose multimodal classifier-free guidance which +enables editing images using both reference images and text when using our +approach with foundational diffusion models. We validate the above claims by +extensively evaluating our framework on both unconditional and foundational +diffusion models. Please refer to +https://vidit98.github.io/publication/conference-paper/pair_diff.html for code +and model release. + +
+
+ comment: Accepted in CVPR 2024, Project page + https://vidit98.github.io/publication/conference-paper/pair_diff.html +
+
+
+
+
+ + ♻ ☆ DGInStyle: Domain-Generalizable Semantic Segmentation with Image + Diffusion Models and Stylized Semantic Control + + +
+ Large, pretrained latent diffusion models (LDMs) have demonstrated an +extraordinary ability to generate creative content, specialize to user data +through few-shot fine-tuning, and condition their output on other modalities, +such as semantic maps. However, are they usable as large-scale data generators, +e.g., to improve tasks in the perception stack, like semantic segmentation? We +investigate this question in the context of autonomous driving, and answer it +with a resounding "yes". We propose an efficient data generation pipeline +termed DGInStyle. First, we examine the problem of specializing a pretrained +LDM to semantically-controlled generation within a narrow domain. Second, we +propose a Style Swap technique to endow the rich generative prior with the +learned semantic control. Third, we design a Multi-resolution Latent Fusion +technique to overcome the bias of LDMs towards dominant objects. Using +DGInStyle, we generate a diverse dataset of street scenes, train a +domain-agnostic semantic segmentation model on it, and evaluate the model on +multiple popular autonomous driving datasets. Our approach consistently +increases the performance of several domain generalization methods compared to +the previous state-of-the-art methods. Source code and dataset are available at +https://dginstyle.github.io. + +
+
+
+
+
+ + ♻ ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot" evaluation
+performance of multimodal models, such as CLIP for classification/retrieval and
+Stable-Diffusion for image generation. However, it is unclear how meaningful the notion of
+"zero-shot" generalization is for such multimodal models, as it is not known to what extent
+their pretraining datasets encompass the downstream concepts targeted during "zero-shot"
+evaluation. In this work, we ask: How is the performance of multimodal models on downstream
+concepts influenced by the frequency of these concepts in their pretraining datasets? We
+comprehensively investigate this question across 34 models and five standard pretraining
+datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M, LAION-Aesthetics), generating over 300GB of
+data artifacts. We consistently find that, far from exhibiting "zero-shot" generalization,
+multimodal models require exponentially more data to achieve linear improvements in
+downstream "zero-shot" performance, following a sample-inefficient log-linear scaling trend.
+This trend persists even when controlling for sample-level similarity between pretraining
+and downstream datasets, and testing on purely synthetic data distributions. Furthermore,
+upon benchmarking models on long-tailed data sampled based on our analysis, we demonstrate
+that multimodal models across the board perform poorly. We contribute this long-tail test
+set as the "Let it Wag!" benchmark to further research in this direction. Taken together,
+our study reveals an exponential need for training data, which implies that the key to
+"zero-shot" generalization capabilities under large-scale training paradigms remains to be
+found.
+
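+ The reported log-linear trend has a simple operational shape: accuracy improves roughly
+linearly in the log of concept frequency. The snippet below fits such a trend on synthetic
+numbers generated on the spot purely for illustration; they are not the paper's
+measurements.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+freq = np.logspace(2, 6, 20)                                     # synthetic concept counts
+acc = 0.14 * np.log10(freq) + rng.normal(0, 0.02, freq.size)     # synthetic log-linear trend
+slope, intercept = np.polyfit(np.log10(freq), acc, deg=1)
+print(f"~{slope:.2f} accuracy gained per 10x more pretraining examples")
+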
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 121 + +
+
+
+ + ☆ Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by + Unsupervised Learning + + +
+ The reconstruction of human visual inputs from brain activity, particularly +through functional Magnetic Resonance Imaging (fMRI), holds promising avenues +for unraveling the mechanisms of the human visual system. Despite the +significant strides made by deep learning methods in improving the quality and +interpretability of visual reconstruction, there remains a substantial demand +for high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The +challenge arises in integrating diverse smaller 3-Tesla datasets or +accommodating new subjects with brief and low-quality fMRI scans. In response +to these constraints, we propose a novel framework that generates enhanced 3T +fMRI data through an unsupervised Generative Adversarial Network (GAN), +leveraging unpaired training across two distinct fMRI datasets in 7T and 3T, +respectively. This approach aims to overcome the limitations of the scarcity of +high-quality 7-Tesla data and the challenges associated with brief and +low-quality scans in 3-Tesla experiments. In this paper, we demonstrate the +reconstruction capabilities of the enhanced 3T fMRI data, highlighting its +proficiency in generating superior input visual images compared to +data-intensive methods trained and tested on a single subject. + +
+
+ comment: Accepted by ISBI 2024 +
+
+
+
+
+ + ☆ VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for + Deformable 3D Image Registration + + +
+ Image registration, a critical process in medical imaging, involves aligning +different sets of medical imaging data into a single unified coordinate system. +Deep learning networks, such as the Convolutional Neural Network (CNN)-based +VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model +(SSM)-based MambaMorph, have demonstrated effective performance in this domain. +The recent Visual State Space Model (VMamba), which incorporates a cross-scan +module with SSM, has exhibited promising improvements in modeling global-range +dependencies with efficient computational cost in computer vision tasks. This +paper hereby introduces an exploration of VMamba with image registration, named +VMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for +3D image registration. Utilizing a U-shaped network architecture, VMambaMorph +computes the deformation field based on target and source volumes. The +VMamba-based block with 2D cross-scan module is redesigned for 3D volumetric +feature processing, and a fine-grained feature extraction module is proposed +for high-dimensional feature learning. We validate VMambaMorph using a public +benchmark brain MR-CT registration dataset, comparing its performance against +current state-of-the-art methods. The results indicate that VMambaMorph +achieves competitive registration quality. The code for VMambaMorph is +available on GitHub. + +
+
+
+
+
+ + ☆ LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance + Volumetric Medical Image Segmentation + + +
+ As a result of the rise of Transformer architectures in medical image +analysis, specifically in the domain of medical image segmentation, a multitude +of hybrid models have been created that merge the advantages of Convolutional +Neural Networks (CNNs) and Transformers. These hybrid models have achieved +notable success by significantly improving segmentation accuracy. Yet, this +progress often comes at the cost of increased model complexity, both in terms +of parameters and computational demand. Moreover, many of these models fail to +consider the crucial interplay between spatial and channel features, which +could further refine and improve segmentation outcomes. To address this, we +introduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric +medical image segmentation. LHU-Net is meticulously designed to prioritize +spatial feature analysis in its initial layers before shifting focus to +channel-based features in its deeper layers, ensuring a comprehensive feature +extraction process. Rigorous evaluation across five benchmark datasets - +Synapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior +performance, showcasing its dual capacity for efficiency and accuracy. Notably, +LHU-Net sets new performance benchmarks, such as attaining a Dice score of +92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and +quartering the computational load compared to existing state-of-the-art models. +Achieved without any reliance on pre-training, additional data, or model +ensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art +performance across all evaluated datasets, utilizing fewer than 11 million +parameters. This achievement highlights that balancing computational efficiency +with high accuracy in medical image segmentation is feasible. Our +implementation of LHU-Net is freely accessible to the research community on +GitHub. + +
+
+
+
+
+ + ☆ HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large + Foundation Models + + +
+ While recent progress in video-text retrieval has been driven by the +exploration of powerful model architectures and training strategies, the +representation learning ability of video-text retrieval models is still limited +due to low-quality and scarce training data annotations. To address this issue, +we present a novel video-text learning paradigm, HaVTR, which augments video +and text data to learn more generalized features. Specifically, we first adopt +a simple augmentation method, which generates self-similar data by randomly +duplicating or dropping subwords and frames. In addition, inspired by the +recent advancement in visual and language generative models, we propose a more +powerful augmentation method through textual paraphrasing and video stylization +using large language models (LLMs) and visual generative models (VGMs). +Further, to bring richer information into video and text, we propose a +hallucination-based augmentation method, where we use LLMs and VGMs to generate +and add new relevant information to the original data. Benefiting from the +enriched data, extensive experiments on several video-text retrieval benchmarks +demonstrate the superiority of HaVTR over existing methods. + +
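+ The "simple augmentation" step can be sketched in a few lines: self-similar training pairs
+are produced by randomly dropping or duplicating subwords (and, analogously, frames). The
+probabilities are assumptions, and the LLM/VGM-based augmentations are not shown.
+
+import random
+
+def augment_tokens(tokens, p_drop=0.1, p_dup=0.1):
+    """Randomly drop or duplicate subwords to create a self-similar caption."""
+    out = []
+    for tok in tokens:
+        if random.random() < p_drop:
+            continue                    # drop this subword
+        out.append(tok)
+        if random.random() < p_dup:
+            out.append(tok)             # duplicate this subword
+    return out or tokens                # never return an empty caption
+
+print(augment_tokens("a man plays guitar on stage".split()))
+# The same drop/duplicate logic applies to a list of sampled video frames.
+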
+
+
+
+
+ + ☆ Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind + + +
+ As humans move around, performing their daily tasks, they are able to recall +where they have positioned objects in their environment, even if these objects +are currently out of sight. In this paper, we aim to mimic this spatial +cognition ability. We thus formulate the task of Out of Sight, Not Out of Mind +- 3D tracking active objects using observations captured through an egocentric +camera. We introduce Lift, Match and Keep (LMK), a method which lifts partial +2D observations to 3D world coordinates, matches them over time using visual +appearance, 3D location and interactions to form object tracks, and keeps these +object tracks even when they go out-of-view of the camera - hence keeping in +mind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS. +Our results demonstrate that spatial cognition is critical for correctly +locating objects over short and long time scales. E.g., for one long egocentric +video, we estimate the 3D location of 50 active objects. Of these, 60% can be +correctly positioned in 3D after 2 minutes of leaving the camera view. + +
+
+ comment: 21 pages including references and appendix. Project Webpage: + http://dimadamen.github.io/OSNOM/ +
+
+
+
+
+ + ☆ AirShot: Efficient Few-Shot Detection for Autonomous Exploration + + +
+ Few-shot object detection has drawn increasing attention in the field of robotic
+exploration, where robots are required to find unseen objects with a few online provided
+examples. Although recent efforts have been made to provide online processing capabilities,
+the slow inference speed of low-powered robots fails to meet the demands of real-time
+detection, making these methods impractical for autonomous exploration. Existing methods
+still face performance and efficiency challenges, mainly due to unreliable features and
+exhaustive class loops. In this work, we propose a new paradigm, AirShot, and discover that,
+by fully exploiting the valuable correlation map, AirShot can result in a more robust and
+faster few-shot object detection system, which is more applicable to the robotics community.
+The core module, Top Prediction Filter (TPF), can operate on multi-scale correlation maps in
+both the training and inference stages. During training, TPF supervises the generation of a
+more representative correlation map, while during inference, it reduces looping iterations
+by selecting top-ranked classes, thus cutting down on computational costs with better
+performance. Surprisingly, this dual functionality exhibits general effectiveness and
+efficiency on various off-the-shelf models. Exhaustive experiments on COCO2017, VOC2014, and
+SubT datasets demonstrate that TPF can significantly boost the efficacy and efficiency of
+most off-the-shelf models, achieving up to 36.4% precision improvements along with 56.3%
+faster inference speed. Code and Data are at: https://github.com/ImNotPrepared/AirShot.
+
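+ The inference-time benefit of such a filter is easy to illustrate: score every candidate
+class from its correlation map and only run the full detection head for the top-k classes.
+Scoring by maximum response is an assumption, not necessarily the TPF criterion.
+
+import torch
+
+def select_top_classes(corr_maps: torch.Tensor, k: int = 5):
+    """corr_maps: (num_classes, H, W) correlations between supports and the query."""
+    scores = corr_maps.flatten(1).max(dim=1).values   # one score per class
+    return torch.topk(scores, k).indices              # classes worth a full pass
+
+keep = select_top_classes(torch.rand(60, 32, 32), k=5)
+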
+
+
+
+
+ + ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying +fine-grained expression behaviors, which is an effective condition for facial +expression manipulation. However, publicly available datasets containing +intensity annotations for multiple AUs remain severely limited, often featuring +a restricted number of subjects. This limitation places challenges to the AU +intensity manipulation in images due to disentanglement issues, leading +researchers to resort to other large datasets with pretrained AU intensity +estimators for pseudo labels. In addressing this constraint and fully +leveraging manual annotations of AU intensities for precise manipulation, we +introduce AUEditNet. Our proposed model achieves impressive intensity +manipulation across 12 AUs, trained effectively with only 18 subjects. +Utilizing a dual-branch architecture, our approach achieves comprehensive +disentanglement of facial attributes and identity without necessitating +additional loss functions or implementing with large batch sizes. This approach +offers a potential solution to achieve desired facial attribute editing despite +the dataset's limited subject count. Our experiments demonstrate AUEditNet's +superior accuracy in editing AU intensities, affirming its capability in +disentangling facial attributes and identity within a limited subject pool. +AUEditNet allows conditioning by either intensity values or target images, +eliminating the need for constructing AU combinations for specific facial +expression synthesis. Moreover, AU intensity estimation, as a downstream task, +validates the consistency between real and edited images, confirming the +effectiveness of our proposed AU intensity manipulation method. + +
+
+
+
+
+ + ☆ Automated Prediction of Breast Cancer Response to Neoadjuvant + Chemotherapy from DWI Data + + +
+ Effective surgical planning for breast cancer hinges on accurately predicting +pathological complete response (pCR) to neoadjuvant chemotherapy (NAC). +Diffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach +for early pCR assessment. However, most machine-learning models require manual +tumor segmentation, a cumbersome and error-prone task. We propose a deep +learning model employing "Size-Adaptive Lesion Weighting" for automatic DWI +tumor segmentation to enhance pCR prediction accuracy. Despite +histopathological changes during NAC complicating DWI image segmentation, our +model demonstrates robust performance. Utilizing the BMMR2 challenge dataset, +it matches human experts in pCR prediction pre-NAC with an area under the curve +(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with +an AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant +advancement in automating breast cancer treatment planning, enabling more +reliable pCR predictions without manual segmentation. + +
+
+ comment: Accepted for presentation at the IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ Facial Affective Behavior Analysis with Instruction Tuning + + +
+ Facial affective behavior analysis (FABA) is crucial for understanding human mental states
+from images. However, traditional approaches primarily deploy models to discriminate among
+discrete emotion categories, and lack the fine granularity and reasoning capability for
+complex facial behaviors. The advent of Multi-modal Large Language Models (MLLMs) has proven
+successful in general visual understanding tasks. However, directly harnessing MLLMs for
+FABA is challenging due to the scarcity of datasets and benchmarks, the neglect of facial
+prior knowledge, and low training efficiency. To address these challenges, we introduce (i)
+an instruction-following dataset for two FABA tasks, i.e., emotion and action unit
+recognition, (ii) a benchmark, FABA-Bench, with a new metric considering both recognition
+and generation ability, and (iii) a new MLLM, "EmoLA", as a strong baseline for the
+community. Our initiative on the dataset and benchmarks reveals the nature and rationale of
+facial affective behaviors, i.e., fine-grained facial movement, interpretability, and
+reasoning. Moreover, to build an effective and efficient FABA MLLM, we introduce a facial
+prior expert module with face structure knowledge and a low-rank adaptation module into the
+pre-trained MLLM. We conduct extensive experiments on FABA-Bench and four commonly-used FABA
+datasets. The results demonstrate that the proposed facial prior expert can boost the
+performance and EmoLA achieves the best results on our FABA-Bench. On commonly-used FABA
+datasets, EmoLA is competitive with task-specific state-of-the-art models.
+
+
+ comment: V1.0 +
+
+
+
+
+ + ☆ PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated + Segmentation Learning + + +
+ Automatic License Plate Recognition (ALPR) is an integral component of an intelligent
+transport system with extensive applications in secure transportation, vehicle-to-vehicle
+communication, stolen vehicle detection, traffic violations, and traffic flow management.
+Existing license plate detection systems focus on one-shot learners or pre-trained models
+that operate with a geometric bounding box, limiting the model's performance. Furthermore,
+continuous video data streams uploaded to the central server result in network and
+complexity issues. To combat this, PlateSegFL was introduced, which implements U-Net-based
+segmentation along with Federated Learning (FL). U-Net is well-suited for multi-class image
+segmentation tasks because it can analyze a large number of classes and generate a
+pixel-level segmentation map for each class. Federated Learning is used to reduce the
+quantity of data required while safeguarding the user's privacy. Different computing
+platforms, such as mobile phones, are able to collaborate on the development of a common
+prediction model, which makes efficient use of users' time, incorporates more diverse data,
+delivers predictions in real time, and requires no physical effort from the user, resulting
+in an F1 score of around 95%.
+
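+ The federated step implied above boils down to aggregating locally trained weights so that
+raw plate images never leave the device. A minimal federated-averaging sketch with equal
+client weighting (an assumption) is shown below.
+
+import torch
+
+def fedavg(client_state_dicts):
+    """Average per-parameter tensors across clients (equal weighting assumed)."""
+    avg = {}
+    for name in client_state_dicts[0]:
+        avg[name] = torch.stack(
+            [sd[name].float() for sd in client_state_dicts]
+        ).mean(dim=0)
+    return avg
+
+# Example with two toy "clients" sharing the same architecture.
+clients = [torch.nn.Linear(4, 2).state_dict() for _ in range(2)]
+global_weights = fedavg(clients)
+# global_model.load_state_dict(global_weights)  # one communication round
+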
+
+
+
+
+ + ☆ FGAIF: Aligning Large Vision-Language Models with Fine-grained AI + Feedback + + +
+ Large Vision-Language Models (LVLMs) have demonstrated proficiency in tackling a variety
+of visual-language tasks. However, current LVLMs suffer from misalignment between the text
+and image modalities, which causes three kinds of hallucination problems, i.e., object
+existence, object attribute, and object relationship. To tackle this issue, existing methods
+mainly utilize Reinforcement Learning (RL) to align modalities in LVLMs. However, they still
+suffer from three main limitations: (1) general feedback cannot indicate the hallucination
+type contained in the response; (2) sparse rewards only give a sequence-level reward for the
+whole response; and (3) annotation is time-consuming and labor-intensive. To handle these
+limitations, we propose an innovative method to align modalities in LVLMs through
+Fine-Grained Artificial Intelligence Feedback (FGAIF), which mainly consists of three steps:
+AI-based Feedback Collection, Fine-grained Reward Model Training, and Reinforcement Learning
+with Fine-grained Reward. Specifically, we first utilize AI tools to predict the type of
+hallucination for each segment in the response and obtain a collection of fine-grained
+feedback. Then, based on the collected reward data, three specialized reward models are
+trained to produce dense rewards. Finally, a novel fine-grained feedback module is
+integrated into the Proximal Policy Optimization (PPO) algorithm. Extensive experiments are
+conducted on hallucination and general benchmarks, demonstrating the superior performance of
+our proposed method. Notably, compared with previous models trained with RL-based aligning
+methods, our proposed method is effective even with fewer parameters.
+
+
+
+
+
+ + ☆ LOGO: A Long-Form Video Dataset for Group Action Quality Assessment CVPR 2023 + + +
+ Action quality assessment (AQA) has become an emerging topic since it can be +extensively applied in numerous scenarios. However, most existing methods and +datasets focus on single-person short-sequence scenes, hindering the +application of AQA in more complex situations. To address this issue, we +construct a new multi-person long-form video dataset for action quality +assessment named LOGO. Distinguished in scenario complexity, our dataset +contains 200 videos from 26 artistic swimming events with 8 athletes in each +sample along with an average duration of 204.2 seconds. As for richness in +annotations, LOGO includes formation labels to depict group information of +multiple athletes and detailed annotations on action procedures. Furthermore, +we propose a simple yet effective method to model relations among athletes and +reason about the potential temporal logic in long-form videos. Specifically, we +design a group-aware attention module, which can be easily plugged into +existing AQA methods, to enrich the clip-wise representations based on +contextual group information. To benchmark LOGO, we systematically conduct +investigations on the performance of several popular methods in AQA and action +segmentation. The results reveal the challenges our dataset brings. Extensive +experiments also show that our approach achieves state-of-the-art on the LOGO +dataset. The dataset and code will be released at +\url{https://github.com/shiyi-zh0408/LOGO }. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ☆ PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a + Mobile Robot + + +
+ The study of non-line-of-sight (NLOS) imaging is growing due to its many +potential applications, including rescue operations and pedestrian detection by +self-driving cars. However, implementing NLOS imaging on a moving camera +remains an open area of research. Existing NLOS imaging methods rely on +time-resolved detectors and laser configurations that require precise optical +alignment, making it difficult to deploy them in dynamic environments. This +work proposes a data-driven approach to NLOS imaging, PathFinder, that can be +used with a standard RGB camera mounted on a small, power-constrained mobile +robot, such as an aerial drone. Our experimental pipeline is designed to +accurately estimate the 2D trajectory of a person who moves in a +Manhattan-world environment while remaining hidden from the camera's +field-of-view. We introduce a novel approach to process a sequence of dynamic +successive frames in a line-of-sight (LOS) video using an attention-based +neural network that performs inference in real-time. The method also includes a +preprocessing selection metric that analyzes images from a moving camera which +contain multiple vertical planar surfaces, such as walls and building facades, +and extracts planes that return maximum NLOS information. We validate the +approach on in-the-wild scenes using a drone for video capture, thus +demonstrating low-cost NLOS imaging in dynamic capture environments. + +
+
+ comment: First two authors have equal contribution +
+
+
+
+
+ + ☆ Scalable and Efficient Hierarchical Visual Topological Mapping + + +
+ Hierarchical topological representations can significantly reduce search +times within mapping and localization algorithms. Although recent research has +shown the potential for such approaches, limited consideration has been given +to the suitability and comparative performance of different global feature +representations within this context. In this work, we evaluate state-of-the-art +hand-crafted and learned global descriptors using a hierarchical topological +mapping technique on benchmark datasets and present results of a comprehensive +evaluation of the impact of the global descriptor used. Although learned +descriptors have been incorporated into place recognition methods to improve +retrieval accuracy and enhance overall recall, the problem of scalability and +efficiency when applied to longer trajectories has not been adequately +addressed in a majority of research studies. Based on our empirical analysis of +multiple runs, we identify that continuity and distinctiveness are crucial +characteristics for an optimal global descriptor that enable efficient and +scalable hierarchical mapping, and present a methodology for quantifying and +contrasting these characteristics across different global descriptors. Our +study demonstrates that the use of global descriptors based on an unsupervised +learned Variational Autoencoder (VAE) excels in these characteristics and +achieves significantly lower runtime. It runs on a consumer grade desktop, up +to 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x +faster than the hand-crafted descriptor, PHOG, on the longest track evaluated +(St Lucia, 17.6 km), without sacrificing overall recall performance. + +
+
+ comment: Published in the 21st International Conference on Advanced Robotics + (ICAR 2023) +
+
+
+
+
+ + ☆ DinoBloom: A Foundation Model for Generalizable Cell Embeddings in + Hematology + + +
+ In hematology, computational models offer significant potential to improve +diagnostic accuracy, streamline workflows, and reduce the tedious work of +analyzing single cells in peripheral blood or bone marrow smears. However, +clinical adoption of computational models has been hampered by the lack of +generalization due to large batch effects, small dataset sizes, and poor +performance in transfer learning from natural images. To address these +challenges, we introduce DinoBloom, the first foundation model for single cell +images in hematology, utilizing a tailored DINOv2 pipeline. Our model is built +upon an extensive collection of 13 diverse, publicly available datasets of +peripheral blood and bone marrow smears, the most substantial open-source +cohort in hematology so far, comprising over 380,000 white blood cell images. +To assess its generalization capability, we evaluate it on an external dataset +with a challenging domain shift. We show that our model outperforms existing +medical and non-medical vision models in (i) linear probing and k-nearest +neighbor evaluations for cell-type classification on blood and bone marrow +smears and (ii) weakly supervised multiple instance learning for acute myeloid +leukemia subtyping by a large margin. A family of four DinoBloom models (small, +base, large, and giant) can be adapted for a wide range of downstream +applications, be a strong baseline for classification problems, and facilitate +the assessment of batch effects in new datasets. All models are available at +github.com/marrlab/DinoBloom. + +
+
+
+
+
+ + ☆ Hyperbolic Learning with Synthetic Captions for Open-World Detection CVPR 2024 + + +
+ Open-world detection poses significant challenges, as it requires the +detection of any object using either object class labels or free-form texts. +Existing related works often use large-scale manual annotated caption datasets +for training, which are extremely expensive to collect. Instead, we propose to +transfer knowledge from vision-language models (VLMs) to enrich the +open-vocabulary descriptions automatically. Specifically, we bootstrap dense +synthetic captions using pre-trained VLMs to provide rich descriptions on +different regions in images, and incorporate these captions to train a novel +detector that generalizes to novel concepts. To mitigate the noise caused by +hallucination in synthetic captions, we also propose a novel hyperbolic +vision-language learning approach to impose a hierarchy between visual and +caption embeddings. We call our detector ``HyperLearner''. We conduct extensive +experiments on a wide variety of open-world detection benchmarks (COCO, LVIS, +Object Detection in the Wild, RefCOCO) and our results show that our model +consistently outperforms existing state-of-the-art methods, such as GLIP, +GLIPv2 and Grounding DINO, when using the same backbone. + +
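+ Hyperbolic vision-language losses of this kind are typically built on the Poincare-ball
+distance; only that metric is sketched below. The curvature (unit ball) and how the detector
+actually uses the distance are assumptions, not details taken from the paper.
+
+import torch
+
+def poincare_distance(x: torch.Tensor, y: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
+    """x, y: (..., D) points with norm < 1 inside the unit Poincare ball."""
+    x2 = (x * x).sum(-1).clamp(max=1 - eps)
+    y2 = (y * y).sum(-1).clamp(max=1 - eps)
+    diff2 = ((x - y) ** 2).sum(-1)
+    arg = 1 + 2 * diff2 / ((1 - x2) * (1 - y2))
+    return torch.acosh(arg.clamp(min=1 + eps))
+
+d = poincare_distance(torch.rand(4, 16) * 0.1, torch.rand(4, 16) * 0.1)
+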
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators + + +
+ Recent advances in Text-to-Video generation (T2V) have achieved remarkable +success in synthesizing high-quality general videos from textual descriptions. +A largely overlooked problem in T2V is that existing models have not adequately +encoded physical knowledge of the real world, thus generated videos tend to +have limited motion and poor variations. In this paper, we propose +\textbf{MagicTime}, a metamorphic time-lapse video generation model, which +learns real-world physics knowledge from time-lapse videos and implements +metamorphic generation. First, we design a MagicAdapter scheme to decouple +spatial and temporal training, encode more physical knowledge from metamorphic +videos, and transform pre-trained T2V models to generate metamorphic videos. +Second, we introduce a Dynamic Frames Extraction strategy to adapt to +metamorphic time-lapse videos, which have a wider variation range and cover +dramatic object metamorphic processes, thus embodying more physical knowledge +than general videos. Finally, we introduce a Magic Text-Encoder to improve the +understanding of metamorphic video prompts. Furthermore, we create a time-lapse +video-text dataset called \textbf{ChronoMagic}, specifically curated to unlock +the metamorphic video generation ability. Extensive experiments demonstrate the +superiority and effectiveness of MagicTime for generating high-quality and +dynamic metamorphic videos, suggesting time-lapse video generation is a +promising path toward building metamorphic simulators of the physical world. + +
+
+
+
+
+ + ☆ Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across + Skin Tones + + +
+ Remote photoplethysmography (rPPG) emerges as a promising method for non-invasive,
+convenient measurement of vital signs, utilizing the widespread presence of cameras. Despite
+advancements, existing datasets fall short in terms of size and diversity, limiting
+comprehensive evaluation under diverse conditions. This paper presents an in-depth analysis
+of the VitalVideo dataset, the largest real-world rPPG dataset to date, encompassing 893
+subjects and 6 Fitzpatrick skin tones. Our experimentation with six unsupervised methods and
+three supervised models demonstrates that datasets comprising a few hundred subjects (i.e.,
+300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are sufficient for effective rPPG
+model training. Our findings highlight the importance of diversity and consistency in skin
+tones for precise performance evaluation across different datasets.
+
+
+ comment: 11 pages, 5 figures, CHI24 Workshop PhysioCHI +
+
+
+
+
+ + ☆ Dual-Scale Transformer for Large-Scale Single-Pixel Imaging CVPR 2024 + + +
+ Single-pixel imaging (SPI) is a promising computational imaging technique which produces
+an image by solving an ill-posed reconstruction problem from few measurements captured by a
+single-pixel detector. Deep learning has achieved impressive success on SPI reconstruction.
+However, the poor reconstruction performance and impractical imaging models of previous work
+limit its real-world applications. In this paper, we propose a deep unfolding network with a
+hybrid-attention Transformer on a Kronecker SPI model, dubbed HATNet, to improve the imaging
+quality of real SPI cameras. Specifically, we unfold the computation graph of the iterative
+shrinkage-thresholding algorithm (ISTA) into two alternating modules: efficient tensor
+gradient descent and hybrid-attention multi-scale denoising. By virtue of Kronecker SPI, the
+gradient descent module can avoid the high computational overheads rooted in previous
+gradient descent modules based on vectorized SPI. The denoising module is an encoder-decoder
+architecture powered by dual-scale spatial attention for high- and low-frequency aggregation
+and channel attention for global information recalibration. Moreover, we build an SPI
+prototype to verify the effectiveness of the proposed method. Extensive experiments on
+synthetic and real data demonstrate that our method achieves state-of-the-art performance.
+The source code and pre-trained models are available at
+https://github.com/Gang-Qu/HATNet-SPI.
+
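+ The classical ISTA update that such unfolding networks are built on fits in a few lines: a
+gradient step on the data term followed by soft-thresholding. The sensing matrix, step size
+and threshold below are toy assumptions, and HATNet's learned modules are not reproduced.
+
+import numpy as np
+
+def soft_threshold(x, lam):
+    return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)
+
+def ista(y, A, lam=0.05, step=None, iters=100):
+    if step is None:
+        step = 1.0 / np.linalg.norm(A, 2) ** 2   # 1 / Lipschitz constant of the data term
+    x = np.zeros(A.shape[1])
+    for _ in range(iters):
+        x = soft_threshold(x + step * A.T @ (y - A @ x), step * lam)
+    return x
+
+# Toy compressed-sensing example: recover a sparse signal from 64 measurements.
+A = np.random.randn(64, 256)
+x_true = np.zeros(256); x_true[:5] = 1.0
+x_hat = ista(A @ x_true, A)
+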
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval AAAI 2021 + + +
+ Deep quantization methods have shown high efficiency on large-scale image retrieval.
+However, current models heavily rely on ground-truth information, hindering the application
+of quantization in label-hungry scenarios. A more realistic demand is to learn from
+inexhaustible uploaded images that are associated with informal tags provided by amateur
+users. Though such sketchy tags do not obviously reveal the labels, they actually contain
+useful semantic information for supervising deep quantization. To this end, we propose
+Weakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first work to learn
+deep quantization from weakly tagged images. Specifically, 1) we use word embeddings to
+represent the tags and enhance their semantic information based on a tag correlation graph;
+2) to better preserve semantic information in quantization codes and reduce quantization
+error, we jointly learn semantics-preserving embeddings and a supervised quantizer on the
+hypersphere by employing a well-designed fusion layer and tailor-made loss functions.
+Extensive experiments show that WSDHQ can achieve state-of-the-art performance on
+weakly-supervised compact coding. Code is available at
+https://github.com/gimpong/AAAI21-WSDHQ.
+
+
+ comment: In proceedings of AAAI 2021. Code and data are available +
+
+
+
+
+ + ☆ Fantastic Animals and Where to Find Them: Segment Any Marine Animal with + Dual SAM CVPR2024 + + +
+ As an important pillar of underwater intelligence, Marine Animal Segmentation (MAS)
+involves segmenting animals within marine environments. Previous methods struggle to extract
+long-range contextual features and overlook the connectivity between discrete pixels.
+Recently, the Segment Anything Model (SAM) has offered a universal framework for general
+segmentation tasks. Unfortunately, having been trained on natural images, SAM lacks prior
+knowledge of marine images. In addition, SAM's single-position prompt is insufficient for
+prior guidance. To address these issues, we propose a novel feature learning framework,
+named Dual-SAM, for high-performance MAS. To this end, we first introduce a dual structure
+with SAM's paradigm to enhance feature learning of marine images. Then, we propose a
+Multi-level Coupled Prompt (MCP) strategy to instruct comprehensive underwater prior
+information, and enhance the multi-level features of SAM's encoder with adapters.
+Subsequently, we design a Dilated Fusion Attention Module (DFAM) to progressively integrate
+multi-level features from SAM's encoder. Finally, instead of directly predicting the masks
+of marine animals, we propose a Criss-Cross Connectivity Prediction (C$^3$P) paradigm to
+capture the inter-connectivity between discrete pixels. With dual decoders, it generates
+pseudo-labels and achieves mutual supervision for complementary feature representations,
+resulting in considerable improvements over previous techniques. Extensive experiments
+verify that our proposed method achieves state-of-the-art performance on five widely-used
+MAS datasets. The code is available at https://github.com/Drchip61/Dual_SAM.
+
+
+ comment: Accepted by CVPR2024 as Poster(Highlight) +
+
+
+
+
+ + ☆ Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning + + +
+ Recognizing various surgical tools, actions and phases from surgery videos is an important
+problem in computer vision with exciting clinical applications. Existing deep-learning-based
+methods for this problem either process each surgical video as a series of independent
+images without considering their dependence, or rely on complicated deep learning models to
+account for the dependence between video frames. In this study, we reveal from exploratory
+data analysis that surgical videos enjoy a relatively simple semantic structure, where the
+presence of surgical phases and tools can be well modeled by a compact hidden Markov model
+(HMM). Based on this observation, we propose an HMM-stabilized deep learning method for tool
+presence detection. A wide range of experiments confirm that the proposed approaches achieve
+better performance with lower training and running costs, and support more flexible ways to
+construct and utilize training data in scenarios where not all surgery videos of interest
+are extensively labelled. These results suggest that popular deep learning approaches with
+over-complicated model structures may suffer from inefficient utilization of data, and that
+integrating ingredients of deep learning and statistical learning wisely may lead to more
+powerful algorithms that enjoy competitive performance, transparent interpretation and
+convenient model training simultaneously.
+
+
+
+
+
+ + ☆ Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video + Anomaly Detection CVPR2024 + + +
+ We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection, +a novel video anomaly detection methodology that combines pseudo-anomalies, +dynamic anomaly weighting, and a distinction loss function to improve detection +accuracy. By training on pseudo-anomalies, our approach adapts to the +variability of normal and anomalous behaviors without fixed anomaly thresholds. +Our model showcases superior performance on the Ped2, Avenue and ShanghaiTech +datasets, where individual models are tailored for each scene. These +achievements highlight DDL's effectiveness in advancing anomaly detection, +offering a scalable and adaptable solution for video surveillance challenges. + +
+
+ comment: To be published in the CVPR2024 Workshop +
+
+
+
+
+ + ☆ Primary liver cancer classification from routine tumour biopsy using + weakly supervised deep learning + + +
+ The diagnosis of primary liver cancers (PLCs) can be challenging, especially +on biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We +automatically classified PLCs on routine-stained biopsies using a weakly +supervised learning method. Weak tumour/non-tumour annotations served as labels +for training a Resnet18 neural network, and the network's last convolutional +layer was used to extract new tumour tile features. Without knowledge of the +precise labels of the malignancies, we then applied an unsupervised clustering +algorithm. Our model identified specific features of hepatocellular carcinoma +(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features +of cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a +slide could facilitate the diagnosis of primary liver cancers, particularly +cHCC-CCA. + Method and results: 166 PLC biopsies were divided into training, internal and +external validation sets: 90, 29 and 47 samples. Two liver pathologists +reviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI). +After annotating the tumour/non-tumour areas, 256x256 pixel tiles were +extracted from the WSIs and used to train a ResNet18. The network was used to +extract new tile features. An unsupervised clustering algorithm was then +applied to the new tile features. In a two-cluster model, Clusters 0 and 1 +contained mainly HCC and iCCA histological features. The diagnostic agreement +between the pathological diagnosis and the model predictions in the internal +and external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78% +(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a +highly variable proportion of tiles from each cluster (Cluster 0: 5-97%; +Cluster 1: 2-94%). + +
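+ The tile pipeline outlined above (ResNet18 features followed by unsupervised clustering)
+can be sketched as follows. The two-cluster setting follows the abstract; weights,
+preprocessing and the random tiles are illustrative placeholders, and torchvision plus
+scikit-learn are assumed to be installed.
+
+import torch
+import torchvision.models as models
+from sklearn.cluster import KMeans
+
+backbone = models.resnet18(weights=None)
+backbone.fc = torch.nn.Identity()          # keep the pooled convolutional features
+backbone.eval()
+
+tiles = torch.rand(32, 3, 256, 256)        # stand-in for HES-stained tumour tiles
+with torch.no_grad():
+    feats = backbone(tiles).numpy()        # (32, 512) tile features
+clusters = KMeans(n_clusters=2, n_init=10).fit_predict(feats)
+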
+
+ comment: https://www.sciencedirect.com/science/article/pii/S2589555924000090 +
+
+
+
+
+ + ☆ FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation + for 3D Medical Image Segmentation + + +
+ Adapting a medical image segmentation model to a new domain is important for +improving its cross-domain transferability, and due to the expensive annotation +process, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled +images are needed for the adaptation. Existing UDA methods are mainly based on +image or feature alignment with adversarial training for regularization, and +they are limited by insufficient supervision in the target domain. In this +paper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for +3D medical image segmentation. It first uses cross-domain data augmentation to +translate labeled images in the source domain to a dual-domain training set +consisting of a pseudo source-domain set and a pseudo target-domain set. To +leverage the dual-domain augmented images to train a pseudo label generator, +domain-specific batch normalization layers are used to deal with the domain +shift while learning the domain-invariant structure features, generating +high-quality pseudo labels for target-domain images. We then combine labeled +source-domain images and target-domain images with pseudo labels to train a +final segmentor, where image-level weighting based on uncertainty estimation +and pixel-level weighting based on dual-domain consensus are proposed to +mitigate the adverse effect of noisy pseudo labels. Experiments on three public +multi-modal datasets for Vestibular Schwannoma, brain tumor and whole heart +segmentation show that our method surpassed ten state-of-the-art UDA methods, +and it even achieved better results than fully supervised learning in the +target domain in some cases. + +
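+ One way to picture the pixel-level weighting is to keep pseudo-label pixels where two
+domain-specific predictions agree and down-weight the rest. The sketch below is an
+illustrative consensus weight, not the exact FPL+ formulation; the confidence term is an
+assumption.
+
+import torch
+
+def consensus_weights(prob_a: torch.Tensor, prob_b: torch.Tensor) -> torch.Tensor:
+    """prob_a, prob_b: (B, C, H, W) softmax outputs from two domain-specific branches."""
+    label_a, label_b = prob_a.argmax(1), prob_b.argmax(1)
+    agree = (label_a == label_b).float()
+    confidence = 0.5 * (prob_a.max(1).values + prob_b.max(1).values)
+    return agree * confidence          # per-pixel weight for the pseudo label
+
+w = consensus_weights(torch.softmax(torch.randn(1, 4, 64, 64), 1),
+                      torch.softmax(torch.randn(1, 4, 64, 64), 1))
+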
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ PairAug: What Can Augmented Image-Text Pairs Do for Radiology? CVPR2024 + + +
+ Current vision-language pre-training (VLP) methodologies predominantly depend +on paired image-text datasets, a resource that is challenging to acquire in +radiology due to privacy considerations and labelling complexities. Data +augmentation provides a practical solution to overcome the issue of data +scarcity, however, most augmentation methods exhibit a limited focus, +prioritising either image or text augmentation exclusively. Acknowledging this +limitation, our objective is to devise a framework capable of concurrently +augmenting medical image and text data. We design a Pairwise Augmentation +(PairAug) approach that contains an Inter-patient Augmentation (InterAug) +branch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the +InterAug branch of our approach generates radiology images using synthesised +yet plausible reports derived from a Large Language Model (LLM). The generated +pairs can be considered a collection of new patient cases since they are +artificially created and may not exist in the original dataset. In contrast, +the IntraAug branch uses newly generated reports to manipulate images. This +process allows us to create new paired data for each individual with diverse +medical conditions. Our extensive experiments on various downstream tasks +covering medical image classification zero-shot and fine-tuning analysis +demonstrate that our PairAug, concurrently expanding both image and text data, +substantially outperforms image-/text-only expansion baselines and advanced +medical VLP baselines. Our code is released at +\url{https://github.com/YtongXie/PairAug}. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Gaussian Shading: Provable Performance-Lossless Image Watermarking for + Diffusion Models CVPR 2024 + + +
+ Ethical concerns surrounding copyright protection and inappropriate content +generation pose challenges for the practical implementation of diffusion +models. One effective solution involves watermarking the generated images. +However, existing methods often compromise the model performance or require +additional training, which is undesirable for operators and users. To address +this issue, we propose Gaussian Shading, a diffusion model watermarking +technique that is both performance-lossless and training-free, while serving +the dual purpose of copyright protection and tracing of offending content. Our +watermark embedding is free of model parameter modifications and thus is +plug-and-play. We map the watermark to latent representations following a +standard Gaussian distribution, which is indistinguishable from latent +representations obtained from the non-watermarked diffusion model. Therefore we +can achieve watermark embedding with lossless performance, for which we also +provide theoretical proof. Furthermore, since the watermark is intricately +linked with image semantics, it exhibits resilience to lossy processing and +erasure attempts. The watermark can be extracted by Denoising Diffusion +Implicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian +Shading on multiple versions of Stable Diffusion, and the results demonstrate +that Gaussian Shading not only is performance-lossless but also outperforms +existing methods in terms of robustness. + +
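+ The embedding idea can be illustrated in a heavily simplified form: each watermark bit
+selects which half of a standard normal the corresponding latent entry is drawn from, so the
+latent's marginal distribution stays (approximately) N(0, 1) while the bit is recoverable
+from the sign after inversion. This is a toy sketch, not the paper's full scheme.
+
+import numpy as np
+from scipy.stats import norm
+
+rng = np.random.default_rng(0)
+bits = rng.integers(0, 2, size=4 * 64 * 64)          # watermark payload (toy)
+u = rng.uniform(1e-6, 0.5, size=bits.size)           # position inside the chosen half
+latent = norm.ppf(np.where(bits == 1, 0.5 + u, u))   # marginally ~N(0, 1)
+recovered = (latent >= 0).astype(int)                # extraction after DDIM inversion
+assert (recovered == bits).all()
+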
+
+ comment: 17 pages, 11 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ High-Discriminative Attribute Feature Learning for Generalized Zero-Shot + Learning + + +
+ Zero-shot learning (ZSL) aims to recognize new classes without prior exposure to their
+samples, relying on semantic knowledge from observed classes. However, current
+attention-based models may overlook the transferability of visual features and the
+distinctiveness of attribute localization when learning regional features in images.
+Additionally, they often ignore shared attributes among different objects. Highly
+discriminative attribute features are crucial for identifying and distinguishing unseen
+classes. To address these issues, we propose an innovative approach called
+High-Discriminative Attribute Feature Learning for Generalized Zero-Shot Learning (HDAFL).
+HDAFL optimizes visual features by learning attribute features to obtain discriminative
+visual embeddings. Specifically, HDAFL utilizes multiple convolutional kernels to
+automatically learn discriminative regions highly correlated with attributes in images,
+eliminating irrelevant interference in image features. Furthermore, we introduce a
+Transformer-based attribute discrimination encoder to enhance the discriminative capability
+among attributes. Simultaneously, the method employs contrastive loss to alleviate dataset
+biases and enhance the transferability of visual features, facilitating better semantic
+transfer between seen and unseen classes. Experimental results demonstrate the effectiveness
+of HDAFL across three widely used datasets.
+
+
+
+
+
+ + ☆ AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via + Subject Alignment + + +
+ Recent video editing advancements rely on accurate pose sequences to animate subjects.
+However, these efforts are not suitable for cross-species animation due to pose misalignment
+between species (for example, the poses of a cat differ greatly from those of a pig due to
+differences in body structure). In this paper, we present AnimateZoo, a zero-shot
+diffusion-based video generator to address this challenging cross-species animation issue,
+aiming to accurately produce animal animations while preserving the background. The key
+technique used in our AnimateZoo is subject alignment, which includes two steps. First, we
+improve appearance feature extraction by integrating a Laplacian detail booster and a
+prompt-tuning identity extractor. These components are specifically designed to capture
+essential appearance information, including identity and fine details. Second, we align
+shape features and address conflicts from differing subjects by introducing a
+scale-information remover. This ensures accurate cross-species animation. Moreover, we
+introduce two high-quality animal video datasets featuring a wide variety of species.
+Trained on these extensive datasets, our model is capable of generating videos characterized
+by accurate movements, consistent appearance, and high-fidelity frames, without the need for
+the pre-inference fine-tuning that prior works required. Extensive experiments showcase the
+outstanding performance of our method in cross-species action-following tasks, demonstrating
+exceptional shape adaptation capability. The project page is available at
+https://justinxu0.github.io/AnimateZoo/.
+
+
+ comment: Technical report,15 pages +
+
+
+
+
+ + ☆ Bootstrapping Chest CT Image Understanding by Distilling Knowledge from + X-ray Expert Models CVPR 2024 + + +
+ Radiologists highly desire fully automated, versatile AI for medical imaging interpretation. However, the lack of extensively annotated large-scale multi-disease datasets has hindered the achievement of this goal. In this paper, we explore the feasibility of leveraging language as a naturally high-quality form of supervision for chest CT imaging. In light of the limited availability of image-report pairs, we bootstrap the understanding of 3D chest CT images by distilling chest-related diagnostic knowledge from an extensively pre-trained 2D X-ray expert model. Specifically, we propose a language-guided retrieval method to match each 3D CT image with its semantically closest 2D X-ray image, and perform pair-wise and semantic relation knowledge distillation. Subsequently, we use contrastive learning to align images and reports within the same patient while distinguishing them from those of other patients. However, a challenge arises when patients have similar semantic diagnoses (e.g., healthy patients), whose pairs may be misleading if treated as negatives. We therefore introduce a robust contrastive learning scheme that identifies and corrects these false negatives. We train our model with over 12,000 pairs of chest CT images and radiology reports. Extensive experiments across multiple scenarios, including zero-shot learning, report generation, and fine-tuning processes, demonstrate the model's feasibility in interpreting chest CT images. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis + Through Self-Supervised Learning + + +
+ The electrocardiogram (ECG) is an essential tool for diagnosing heart +disease, with computer-aided systems improving diagnostic accuracy and reducing +healthcare costs. Despite advancements, existing systems often miss rare +cardiac anomalies that could be precursors to serious, life-threatening issues +or alterations in the cardiac macro/microstructure. We address this gap by +focusing on self-supervised anomaly detection (AD), training exclusively on +normal ECGs to recognize deviations indicating anomalies. We introduce a novel +self-supervised learning framework for ECG AD, utilizing a vast dataset of +normal ECGs to autonomously detect and localize cardiac anomalies. It proposes +a novel masking and restoration technique alongside a multi-scale +cross-attention module, enhancing the model's ability to integrate global and +local signal features. The framework emphasizes accurate localization of +anomalies within ECG signals, ensuring the method's clinical relevance and +reliability. To reduce the impact of individual variability, the approach +further incorporates crucial patient-specific information from ECG reports, +such as age and gender, thus enabling accurate identification of a broad +spectrum of cardiac anomalies, including rare ones. Utilizing an extensive +dataset of 478,803 ECG graphic reports from real-world clinical practice, our +method has demonstrated exceptional effectiveness in AD across all tested +conditions, regardless of their frequency of occurrence, significantly +outperforming existing models. It achieved superior performance metrics, +including an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%, +a specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of +90%. It has also demonstrated robust localization capabilities, with an AUROC +of 76.5% and a Dice coefficient of 65.3% for anomaly localization. + +
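+
+ A toy sketch of the mask-and-restore idea described above: portions of a 1D signal are hidden, a restorer fills them in, and large restoration error flags anomalous spans. The moving-average restorer and the injected synthetic anomaly are stand-ins for the paper's learned network and real ECG data.
```python
import numpy as np

# Toy sketch of mask-and-restore anomaly scoring for a 1D signal. A real system
# would use the paper's learned restoration network; here a moving-average
# "restorer" stands in so the scoring logic runs end to end.

def restore(signal, mask, window=5):
    """Fill masked samples with a local average of unmasked neighbours (0 if none)."""
    filled = signal.copy()
    for i in np.flatnonzero(mask):
        lo, hi = max(0, i - window), min(len(signal), i + window + 1)
        neighbours = signal[lo:hi][~mask[lo:hi]]
        filled[i] = neighbours.mean() if neighbours.size else 0.0
    return filled

def anomaly_score(signal, mask_len=20):
    """Slide a mask across the signal; high restoration error marks anomalous spans."""
    errors = np.zeros_like(signal)
    for start in range(0, len(signal) - mask_len + 1, mask_len):
        mask = np.zeros(len(signal), dtype=bool)
        mask[start:start + mask_len] = True
        errors += mask * (signal - restore(signal, mask)) ** 2
    return errors

t = np.linspace(0, 4 * np.pi, 800)
ecg_like = np.sin(t) + 0.05 * np.random.default_rng(1).standard_normal(t.shape)
ecg_like[400:420] += 2.0                    # injected synthetic "anomaly"
scores = anomaly_score(ecg_like)
print("peak anomaly-score index:", int(scores.argmax()))   # should fall inside 400..419
```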
+
+
+
+
+ + ☆ UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection + + +
+ Temporal Action Detection (TAD) focuses on detecting pre-defined actions, while Moment Retrieval (MR) aims to identify the events described by open-ended natural language within untrimmed videos. Although they focus on different types of events, we observe that the two tasks are closely connected. For instance, most descriptions in MR involve multiple actions from TAD. In this paper, we aim to investigate the potential synergy between TAD and MR. Firstly, we propose a unified architecture, termed Unified Moment Detection (UniMD), for both TAD and MR. It transforms the inputs of the two tasks, namely actions for TAD or events for MR, into a common embedding space, and utilizes two novel query-dependent decoders to generate a uniform output of classification scores and temporal segments. Secondly, we explore the efficacy of two task fusion learning approaches, pre-training and co-training, in order to enhance the mutual benefits between TAD and MR. Extensive experiments demonstrate that the proposed task fusion learning scheme enables the two tasks to help each other and outperform their separately trained counterparts. Impressively, UniMD achieves state-of-the-art results on three paired datasets: Ego4D, Charades-STA, and ActivityNet. Our code will be released at https://github.com/yingsen1/UniMD. +
+
+ comment: Tech report +
+
+
+
+
+ + ☆ GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing + Sparsity, Trained from Scratch on Small Datasets + + +
+ Vision Transformers (ViTs) have achieved impressive results in large-scale +image classification. However, when training from scratch on small datasets, +there is still a significant performance gap between ViTs and Convolutional +Neural Networks (CNNs), which is attributed to the lack of inductive bias. To +address this issue, we propose a Graph-based Vision Transformer (GvT) that +utilizes graph convolutional projection and graph-pooling. In each block, +queries and keys are calculated through graph convolutional projection based on +the spatial adjacency matrix, while dot-product attention is used in another +graph convolution to generate values. When using more attention heads, the +queries and keys become lower-dimensional, making their dot product an +uninformative matching function. To overcome this low-rank bottleneck in +attention heads, we employ talking-heads technology based on bilinear pooled +features and sparse selection of attention tensors. This allows interaction +among filtered attention scores and enables each attention mechanism to depend +on all queries and keys. Additionally, we apply graph-pooling between two +intermediate blocks to reduce the number of tokens and aggregate semantic +information more effectively. Our experimental results show that GvT produces +comparable or superior outcomes to deep convolutional networks and surpasses +vision transformers without pre-training on large datasets. The code for our +proposed model is publicly available on the website. + +
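+
+ A brief sketch of the talking-heads mechanism referenced above, in which attention logits and weights are mixed across heads by small learned matrices. The graph convolutional projections, bilinear pooling, and sparse selection that GvT adds on top are omitted; shapes and initialization are illustrative.
```python
import numpy as np

# Minimal talking-heads attention sketch: attention logits and weights are
# mixed across heads before and after the softmax so each head can depend on
# all queries and keys. GvT's graph convolutional projections and sparse
# selection of the attention tensor are omitted here.

rng = np.random.default_rng(0)

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def talking_heads_attention(q, k, v, w_logits, w_weights):
    """q, k, v: (heads, tokens, dim); w_logits, w_weights: (heads, heads)."""
    h, n, d = q.shape
    logits = q @ k.transpose(0, 2, 1) / np.sqrt(d)          # (h, n, n)
    logits = np.einsum("hij,hg->gij", logits, w_logits)     # mix logits across heads
    weights = softmax(logits, axis=-1)
    weights = np.einsum("hij,hg->gij", weights, w_weights)  # mix again after softmax
    return weights @ v                                      # (h, n, d)

heads, tokens, dim = 4, 16, 8
q, k, v = (rng.standard_normal((heads, tokens, dim)) for _ in range(3))
w_logits = rng.standard_normal((heads, heads)) / heads
w_weights = rng.standard_normal((heads, heads)) / heads
print(talking_heads_attention(q, k, v, w_logits, w_weights).shape)   # (4, 16, 8)
```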
+
+
+
+
+ + ☆ Efficient Learnable Collaborative Attention for Single Image + Super-Resolution + + +
+ Non-Local Attention (NLA) is a powerful technique for capturing long-range +feature correlations in deep single image super-resolution (SR). However, NLA +suffers from high computational complexity and memory consumption, as it +requires aggregating all non-local feature information for each query response +and recalculating the similarity weight distribution for different abstraction +levels of features. To address these challenges, we propose a novel Learnable +Collaborative Attention (LCoA) that introduces inductive bias into non-local +modeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP) +and Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to +dynamically adjust the sparse attention pattern of deep features, which reduces +the number of non-local modeling rounds compared with existing sparse +solutions. CoA leverages the sparse attention pattern and weights learned by +LSP, and co-optimizes the similarity matrix across different abstraction +levels, which avoids redundant similarity matrix calculations. The experimental +results show that our LCoA can reduce the non-local modeling time by about 83% +in the inference stage. In addition, we integrate our LCoA into a deep +Learnable Collaborative Attention Network (LCoAN), which achieves competitive +performance in terms of inference time, memory consumption, and reconstruction +quality compared with other state-of-the-art SR methods. + +
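+
+ A sketch of clustering-based sparse non-local attention in the spirit of the description above: tokens are grouped by a few k-means iterations and each query attends only within its cluster, so similarity matrices are computed per cluster rather than globally. This is not LCoA's exact LSP/CoA formulation; the cluster count and feature sizes are arbitrary.
```python
import numpy as np

# Sketch of clustering-based sparse non-local attention: tokens are grouped by
# a few k-means iterations and each query attends only to keys in its own
# cluster. LCoA's learnable sparse pattern and cross-level weight sharing are
# omitted.

rng = np.random.default_rng(0)

def kmeans(x, k, iters=10):
    centers = x[rng.choice(len(x), size=k, replace=False)]
    for _ in range(iters):
        assign = ((x[:, None, :] - centers[None]) ** 2).sum(-1).argmin(1)
        for c in range(k):
            members = x[assign == c]
            if len(members):
                centers[c] = members.mean(0)
    return assign

def clustered_attention(feats, k=4):
    """feats: (tokens, dim). Softmax attention restricted to each k-means cluster."""
    assign = kmeans(feats, k)
    out = np.zeros_like(feats)
    for c in range(k):
        idx = np.flatnonzero(assign == c)
        if idx.size == 0:
            continue
        qkv = feats[idx]
        logits = qkv @ qkv.T / np.sqrt(feats.shape[1])
        w = np.exp(logits - logits.max(axis=1, keepdims=True))
        w /= w.sum(axis=1, keepdims=True)
        out[idx] = w @ qkv
    return out

tokens = rng.standard_normal((256, 32))
print(clustered_attention(tokens).shape)   # (256, 32)
```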
+
+
+
+
+ + ☆ Correcting Diffusion-Based Perceptual Image Compression with Privileged + End-to-End Decoder + + +
+ The images produced by diffusion models can attain excellent perceptual quality. However, it is challenging for diffusion models to guarantee low distortion, so the integration of diffusion models and image compression models still requires more comprehensive exploration. This paper presents a diffusion-based image compression method that employs a privileged end-to-end decoder model as a correction, which achieves better perceptual quality while keeping the distortion bounded to an extent. We build a diffusion model and design a novel paradigm that combines the diffusion model and an end-to-end decoder, where the latter is responsible for transmitting the privileged information extracted at the encoder side. Specifically, we theoretically analyze the reconstruction process of the diffusion models at the encoder side with the original images being visible. Based on the analysis, we introduce an end-to-end convolutional decoder to provide a better approximation of the score function $\nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t)$ at the encoder side and effectively transmit the combination. Experiments demonstrate the superiority of our method in both distortion and perception compared with previous perceptual compression methods. +
+
+
+
+
+ + ☆ CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality + Novel-view Synthesis + + +
+ Neural Radiance Fields (NeRF) have achieved huge success in effectively +capturing and representing 3D objects and scenes. However, several factors have +impeded its further proliferation as next-generation 3D media. To establish a +ubiquitous presence in everyday media formats, such as images and videos, it is +imperative to devise a solution that effectively fulfills three key objectives: +fast encoding and decoding time, compact model sizes, and high-quality +renderings. Despite significant advancements, a comprehensive algorithm that +adequately addresses all objectives has yet to be fully realized. In this work, +we present CodecNeRF, a neural codec for NeRF representations, consisting of a +novel encoder and decoder architecture that can generate a NeRF representation +in a single forward pass. Furthermore, inspired by the recent +parameter-efficient finetuning approaches, we develop a novel finetuning method +to efficiently adapt the generated NeRF representations to a new test instance, +leading to high-quality image renderings and compact code sizes. The proposed +CodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF, +achieved unprecedented compression performance of more than 150x and 20x +reduction in encoding time while maintaining (or improving) the image quality +on widely used 3D object datasets, such as ShapeNet and Objaverse. + +
+
+ comment: 34 pages, 22 figures, Project page: + https://gynjn.github.io/Codec-NeRF/ +
+
+
+
+
+ + ☆ MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D + Object Detection + + +
+ Monocular 3D object detection (Mono3D) is an indispensable research topic in +autonomous driving, thanks to the cost-effective monocular camera sensors and +its wide range of applications. Since the image perspective has depth +ambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and +reconstructing 3D object information from a single image. Previous methods +attempted to transfer 3D information directly from the LiDAR-based teacher to +the camera-based student. However, a considerable gap in feature representation +makes direct cross-modal distillation inefficient, resulting in a significant +performance deterioration between the LiDAR-based teacher and the camera-based +student. To address this issue, we propose the Teaching Assistant Knowledge +Distillation (MonoTAKD) to break down the learning objective by integrating +intra-modal distillation with cross-modal residual distillation. In particular, +we employ a strong camera-based teaching assistant model to distill powerful +visual knowledge effectively through intra-modal distillation. Subsequently, we +introduce the cross-modal residual distillation to transfer the 3D spatial +cues. By acquiring both visual knowledge and 3D spatial cues, the predictions +of our approach are rigorously evaluated on the KITTI 3D object detection +benchmark and achieve state-of-the-art performance in Mono3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Dual-Camera Smooth Zoom on Mobile Phones + + +
+ When zooming between the dual cameras on a mobile phone, noticeable jumps in geometric content and image color occur in the preview, inevitably affecting the user's zoom experience. In this work, we introduce a new task, i.e., dual-camera smooth zoom (DCSZ), to achieve a smooth zoom preview. The frame interpolation (FI) technique is a potential solution but struggles with ground-truth collection. To address the issue, we suggest a data-factory solution in which continuous virtual cameras are assembled to generate DCSZ data by rendering reconstructed 3D models of the scene. In particular, we propose a novel dual-camera smooth zoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is introduced to construct a specific 3D model for each virtual camera. With the proposed data factory, we construct a synthetic dataset for DCSZ, and we utilize it to fine-tune FI models. In addition, we collect real-world dual-zoom images without ground truth for evaluation. Extensive experiments are conducted with multiple FI methods. The results show that the fine-tuned FI models achieve a significant performance improvement over the original ones on the DCSZ task. The datasets, codes, and pre-trained models will be publicly available. +
+
+ comment: 24 +
+
+
+
+
+ + ☆ DL-EWF: Deep Learning Empowering Women's Fashion with + Grounded-Segment-Anything Segmentation for Body Shape Classification + + +
+ The global fashion industry plays a pivotal role in the global economy, and addressing fundamental issues within the industry is crucial for developing innovative solutions. One of the most pressing challenges in the fashion industry is the mismatch between individuals' body shapes and the garments they purchase. This issue is particularly prevalent among individuals with non-ideal body shapes, exacerbating the challenges they face. Considering inter-individual variability in body shapes is essential for designing and producing garments that are widely accepted by consumers. Traditional methods for determining human body shape are limited due to their low accuracy, high costs, and time-consuming nature. New approaches, utilizing digital imaging and deep neural networks (DNN), have been introduced to identify human body shape. In this study, the Style4BodyShape dataset is used for classifying body shapes into five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and Apple. First, the body shape segmentation of a person is extracted from the image, disregarding the surroundings and background. Then, various pre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19, and Inception v3, are used to classify the segmentation results. Among these pre-trained models, the Inception v3 model demonstrates superior performance in terms of the F1-score metric and accuracy compared to the other models. +
+
+
+
+
+ + ☆ A Unified Diffusion Framework for Scene-aware Human Motion Estimation + from Sparse Signals + + +
+ Estimating full-body human motion via sparse tracking signals from head-mounted displays and hand controllers in 3D scenes is crucial to applications in AR/VR. One of the biggest challenges of this task is the one-to-many mapping from sparse observations to dense full-body motions, which entails inherent ambiguities. To help resolve this ambiguity, we introduce a new framework that combines rich contextual information provided by scenes to benefit full-body motion tracking from sparse observations. To estimate plausible human motions given sparse tracking signals and 3D scenes, we develop $\text{S}^2$Fusion, a unified framework fusing \underline{S}cene and sparse \underline{S}ignals with a conditional dif\underline{Fusion} model. $\text{S}^2$Fusion first extracts the spatial-temporal relations residing in the sparse signals via a periodic autoencoder, and then produces time-aligned feature embeddings as additional inputs. Subsequently, by drawing initial noisy motion from a pre-trained prior, $\text{S}^2$Fusion utilizes conditional diffusion to fuse scene geometry and sparse tracking signals to generate full-body scene-aware motions. The sampling procedure of $\text{S}^2$Fusion is further guided by a specially designed scene-penetration loss and phase-matching loss, which effectively regularize the motion of the lower body even in the absence of any tracking signals, making the generated motion much more plausible and coherent. Extensive experimental results demonstrate that our $\text{S}^2$Fusion outperforms the state-of-the-art in terms of estimation quality and smoothness. +
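+
+ A rough sketch of loss-guided sampling as described above: each (toy) denoising step is nudged by the gradient of a penalty, with a floor-penetration penalty standing in for the paper's scene-penetration and phase-matching losses. The shrink-towards-zero "denoiser" and all constants are placeholders, not the actual model.
```python
import numpy as np

# Loss-guided sampling sketch: each (toy) denoising step is nudged by the
# gradient of a penalty, in the spirit of guiding diffusion sampling with a
# scene-penetration loss. The shrink-towards-zero update standing in for the
# denoiser and all constants are placeholders.

rng = np.random.default_rng(0)
FLOOR = 0.0

def penetration_penalty_grad(x):
    """Gradient of sum(max(FLOOR - x, 0)^2): pushes values below the floor upward."""
    return -2.0 * np.maximum(FLOOR - x, 0.0)

def guided_sampling(shape, steps=50, guidance_scale=0.5):
    x = rng.standard_normal(shape)                  # initial noisy "motion" sample
    for _ in range(steps):
        x = 0.95 * x                                # stand-in for one denoising update
        x -= guidance_scale * penetration_penalty_grad(x)   # gradient-descent guidance
        x += 0.02 * rng.standard_normal(shape)      # residual noise
    return x

motion = guided_sampling((60, 22))                  # 60 frames x 22 joint heights
print("fraction below the floor:", float((motion < FLOOR).mean()))
```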
+
+
+
+
+ + ☆ A Clinical-oriented Multi-level Contrastive Learning Method for Disease + Diagnosis in Low-quality Medical Images + + +
+ Representation learning offers a conduit to elucidate distinctive features +within the latent space and interpret the deep models. However, the randomness +of lesion distribution and the complexity of low-quality factors in medical +images pose great challenges for models to extract key lesion features. Disease +diagnosis methods guided by contrastive learning (CL) have shown significant +advantages in lesion feature representation. Nevertheless, the effectiveness of +CL is highly dependent on the quality of the positive and negative sample +pairs. In this work, we propose a clinical-oriented multi-level CL framework +that aims to enhance the model's capacity to extract lesion features and +discriminate between lesion and low-quality factors, thereby enabling more +accurate disease diagnosis from low-quality medical images. Specifically, we +first construct multi-level positive and negative pairs to enhance the model's +comprehensive recognition capability of lesion features by integrating +information from different levels and qualities of medical images. Moreover, to +improve the quality of the learned lesion embeddings, we introduce a dynamic +hard sample mining method based on self-paced learning. The proposed CL +framework is validated on two public medical image datasets, EyeQ and Chest +X-ray, demonstrating superior performance compared to other state-of-the-art +disease diagnostic methods. + +
+
+
+
+
+ + ☆ LRNet: Change detection of high-resolution remote sensing imagery via + strategy of localization-then-refinement + + +
+ Change detection, as a research hotspot in the field of remote sensing, has witnessed continuous development and progress. However, the discrimination of boundary details remains a significant bottleneck due to the complexity of surrounding elements between change areas and backgrounds: the boundaries of large change areas are often misaligned, while the boundaries of small change targets tend to be erroneously connected. To address these issues, a novel network based on the localization-then-refinement strategy is proposed in this paper, namely LRNet. LRNet consists of two stages: localization and refinement. In the localization stage, a three-branch encoder simultaneously extracts original image features and their differential features for interactive localization of the position of each change area. To minimize information loss during feature extraction, learnable optimal pooling (LOP) is proposed to replace the widely used max-pooling. This pooling is trainable and contributes to the overall optimization of the network. To enable effective interaction among features from different branches and accurately locate change areas of various sizes, change alignment attention (C2A) and a hierarchical change alignment module (HCA) are proposed. In the refinement stage, the localization results from the localization stage are corrected by constraining the change areas and change edges through the edge-area alignment module (E2A). Subsequently, the decoder, combined with the difference features strengthened by C2A in the localization phase, refines change areas of different sizes, ultimately achieving accurate boundary discrimination of change areas. The proposed LRNet outperforms 13 other state-of-the-art methods in terms of comprehensive evaluation metrics and provides the most precise boundary discrimination results on the LEVIR-CD and WHU-CD datasets. +
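+
+ Since the abstract does not give LOP's exact form, the sketch below only illustrates why a trainable soft pooling can retain more information than hard max-pooling: a temperature-like parameter interpolates between average pooling and max pooling. All shapes and the interpolation scheme are assumptions for illustration.
```python
import numpy as np

# Learnable soft pooling sketch: a temperature-like beta interpolates between
# average pooling (beta = 0) and max pooling (beta -> infinity). This only
# motivates replacing hard max-pooling with a trainable pooling; it is not
# LOP's actual formulation.

def soft_pool2d(x, beta, size=2):
    """x: (H, W) -> (H//size, W//size) softmax-weighted pooling over each patch."""
    h, w = x.shape
    x = x[: h - h % size, : w - w % size]
    patches = (x.reshape(x.shape[0] // size, size, x.shape[1] // size, size)
                .transpose(0, 2, 1, 3)
                .reshape(x.shape[0] // size, x.shape[1] // size, -1))
    logits = beta * (patches - patches.max(-1, keepdims=True))   # stabilized
    weights = np.exp(logits)
    weights /= weights.sum(-1, keepdims=True)
    return (weights * patches).sum(-1)

feat = np.random.default_rng(0).standard_normal((8, 8))
avg = feat.reshape(4, 2, 4, 2).mean(axis=(1, 3))      # plain 2x2 average pooling
print(np.allclose(soft_pool2d(feat, beta=0.0), avg))  # True: beta = 0 is average pooling
print(soft_pool2d(feat, beta=50.0).round(2))          # large beta approaches 2x2 max pooling
```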
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Mixture of Low-rank Experts for Transferable AI-Generated Image + Detection + + +
+ Generative models have shown a giant leap in synthesizing photo-realistic +images with minimal expertise, sparking concerns about the authenticity of +online information. This study aims to develop a universal AI-generated image +detector capable of identifying images from diverse sources. Existing methods +struggle to generalize across unseen generative models when provided with +limited sample sources. Inspired by the zero-shot transferability of +pre-trained vision-language models, we seek to harness the nontrivial +visual-world knowledge and descriptive proficiency of CLIP-ViT to generalize +over unknown domains. This paper presents a novel parameter-efficient +fine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's +potential while preserving knowledge and expanding capacity for transferable +detection. We adapt only the MLP layers of deeper ViT blocks via an integration +of shared and separate LoRAs within an MoE-based structure. Extensive +experiments on public benchmarks show that our method achieves superiority over +state-of-the-art approaches in cross-generator generalization and robustness to +perturbations. Remarkably, our best-performing ViT-L/14 variant requires +training only 0.08% of its parameters to surpass the leading baseline by +3.64% +mAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. This +even outperforms the baseline with just 0.28% of the training data. Our code +and pre-trained models will be available at +https://github.com/zhliuworks/CLIPMoLE. + +
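+
+ A sketch of the "shared plus separate LoRA experts" idea described above: a frozen linear layer is augmented with one always-on low-rank adapter and a router-weighted mixture of per-expert low-rank adapters. Dimensions, rank, expert count, and the routing scheme are illustrative assumptions, not the paper's configuration.
```python
import numpy as np

# Sketch of "shared + separate LoRA experts" on top of a frozen linear layer:
# one always-on low-rank adapter plus a softmax-routed mixture of per-expert
# low-rank adapters. All sizes and the router are illustrative.

rng = np.random.default_rng(0)
d_in, d_out, rank, n_experts, n_tokens = 64, 64, 4, 3, 10

W_frozen = rng.standard_normal((d_in, d_out)) * 0.05         # frozen MLP weight
A_shared = rng.standard_normal((d_in, rank)) * 0.05          # shared LoRA (trainable)
B_shared = np.zeros((rank, d_out))                           # standard LoRA init: B = 0
A_sep = rng.standard_normal((n_experts, d_in, rank)) * 0.05  # separate LoRA experts
B_sep = np.zeros((n_experts, rank, d_out))
W_router = rng.standard_normal((d_in, n_experts)) * 0.05     # token-wise router

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def mole_linear(x):
    """x: (tokens, d_in) -> (tokens, d_out); frozen path + shared and routed LoRA deltas."""
    base = x @ W_frozen
    shared = (x @ A_shared) @ B_shared
    gate = softmax(x @ W_router)                      # (tokens, n_experts)
    low = np.einsum("ti,eir->ter", x, A_sep)          # per-expert low-rank codes
    deltas = np.einsum("ter,erd->ted", low, B_sep)    # per-expert updates
    routed = np.einsum("te,ted->td", gate, deltas)    # router-weighted mixture
    return base + shared + routed

x = rng.standard_normal((n_tokens, d_in))
print(mole_linear(x).shape)   # (10, 64); adapters are a no-op at init because B = 0
```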
+
+
+
+
+ + ☆ GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric + Reconstruction Using Gaussian Splatting and NeRF + + +
+ We introduce a novel large-scale scene reconstruction benchmark that evaluates newly developed 3D representation approaches, Gaussian Splatting and Neural Radiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2 encompasses over 6.5 square kilometers and features a comprehensive RGB dataset coupled with LiDAR ground truth. The dataset offers a unique blend of urban and academic environments for advanced spatial analysis, covering more than 6.5 km2. We also provide detailed supplementary information on the data collection protocols. Furthermore, we present an easy-to-follow pipeline to align the COLMAP sparse point cloud with the detailed LiDAR dataset. Our evaluation of U-Scene, which includes a detailed analysis across various novel viewpoints using image-based metrics such as SSIM, LPIPS, and PSNR, shows contradictory results when applying geometric-based metrics such as Chamfer distance. This raises doubts about the reliability of current image-based metrics and geometric extraction methods for Gaussian Splatting. We also make the dataset available on the following anonymous project page. +
+
+ comment: 8 pages (no references), 6 figures, 4 tables +
+
+
+
+
+ + ☆ CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale + Volumetric Super-Resolution of Medical Data CVPR + + +
+ In the realm of medical 3D data, such as CT and MRI images, prevalent +anisotropic resolution is characterized by high intra-slice but diminished +inter-slice resolution. The lowered resolution between adjacent slices poses +challenges, hindering optimal viewing experiences and impeding the development +of robust downstream analysis algorithms. Various volumetric super-resolution +algorithms aim to surmount these challenges, enhancing inter-slice resolution +and overall 3D medical imaging quality. However, existing approaches confront +inherent challenges: 1) often tailored to specific upsampling factors, lacking +flexibility for diverse clinical scenarios; 2) newly generated slices +frequently suffer from over-smoothing, degrading fine details, and leading to +inter-slice inconsistency. In response, this study presents CycleINR, a novel +enhanced Implicit Neural Representation model for 3D medical data volumetric +super-resolution. Leveraging the continuity of the learned implicit function, +the CycleINR model can achieve results with arbitrary up-sampling rates, +eliminating the need for separate training. Additionally, we enhance the grid +sampling in CycleINR with a local attention mechanism and mitigate +over-smoothing by integrating cycle-consistent loss. We introduce a new metric, +Slice-wise Noise Level Inconsistency (SNLI), to quantitatively assess +inter-slice noise level inconsistency. The effectiveness of our approach is +demonstrated through image quality evaluations on an in-house dataset and a +downstream task analysis on the Medical Segmentation Decathlon liver tumor +dataset. + +
+
+ comment: CVPR accepted paper +
+
+
+
+
+ + ☆ HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and + Low-Frequency Information of Parametric Models CVPR 2024 + + +
+ Reconstructing a 3D clothed human involves creating detailed geometry of individuals in clothing, with applications ranging from virtual try-on and movies to games. To enable practical and widespread applications, recent advances propose to generate a clothed human from an RGB image. However, existing methods struggle to reconstruct detailed and robust avatars simultaneously. We empirically find that the high-frequency (HF) and low-frequency (LF) information from a parametric model has the potential to enhance geometry details and improve robustness to noise, respectively. Based on this, we propose HiLo, namely clothed human reconstruction with high- and low-frequency information, which contains two components. 1) To recover detailed geometry using HF information, we propose a progressive HF Signed Distance Function to enhance the detailed 3D geometry of a clothed human. Our analysis shows that this progressive learning scheme alleviates the large gradients that hinder model convergence. 2) To achieve robust reconstruction against inaccurate estimation of the parametric model, we propose a spatial interaction implicit function that uses LF information. This function effectively exploits the complementary spatial information from a low-resolution voxel grid of the parametric model. Experimental results demonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and 9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets, respectively. Additionally, HiLo demonstrates robustness to noise from the parametric model, challenging poses, and various clothing styles. +
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ☆ NeRF2Points: Large-Scale Point Cloud Generation From Street Views' + Radiance Field Optimization + + +
+ Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology +for the photorealistic rendering of objects and environments, enabling the +synthesis of novel viewpoints with remarkable fidelity. This is accomplished +through the strategic utilization of object-centric camera poses characterized +by significant inter-frame overlap. This paper explores a compelling, +alternative utility of NeRF: the derivation of point clouds from aggregated +urban landscape imagery. The transmutation of street-view data into point +clouds is fraught with complexities, attributable to a nexus of interdependent +variables. First, high-quality point cloud generation hinges on precise camera +poses, yet many datasets suffer from inaccuracies in pose metadata. Also, the +standard approach of NeRF is ill-suited for the distinct characteristics of +street-view data from autonomous vehicles in vast, open settings. Autonomous +vehicle cameras often record with limited overlap, leading to blurring, +artifacts, and compromised pavement representation in NeRF-based point clouds. +In this paper, we present NeRF2Points, a tailored NeRF variant for urban point +cloud synthesis, notable for its high-quality output from RGB inputs alone. Our +paper is supported by a bespoke, high-resolution 20-kilometer urban street +dataset, designed for point cloud generation and evaluation. NeRF2Points +adeptly navigates the inherent challenges of NeRF-based point cloud synthesis +through the implementation of the following strategic innovations: (1) +Integration of Weighted Iterative Geometric Optimization (WIGO) and Structure +from Motion (SfM) for enhanced camera pose accuracy, elevating street-view data +precision. (2) Layered Perception and Integrated Modeling (LPiM) is designed +for distinct radiance field modeling in urban environments, resulting in +coherent point cloud representations. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels + + +
+ In the realm of continual learning, the presence of noisy labels within data streams represents a notable obstacle to model reliability and fairness. We focus on the data stream scenario outlined in the pertinent literature, characterized by fuzzy task boundaries and noisy labels. To address this challenge, we introduce a novel and intuitive sampling method called Noisy Test Debiasing (NTD) to mitigate noisy labels in evolving data streams and establish a fair and robust continual learning algorithm. NTD is straightforward to implement, making it feasible across various scenarios. Our experiments use four benchmark datasets, including two synthetic noise datasets (CIFAR10 and CIFAR100) and two real-world noise datasets (mini-WebVision and Food-101N). The results validate the efficacy of NTD for online continual learning in scenarios with noisy labels in data streams. Compared to the previous leading approach, NTD achieves a training speedup of more than two times while maintaining or surpassing accuracy levels. Moreover, NTD utilizes less than one-fifth of the GPU memory resources required by previous leading methods. +
+
+
+
+
+ + ☆ On the Learnability of Out-of-distribution Detection NeurIPS 2022 + + +
+ Supervised learning aims to train a classifier under the assumption that +training and test data are from the same distribution. To ease the above +assumption, researchers have studied a more realistic setting: +out-of-distribution (OOD) detection, where test data may come from classes that +are unknown during training (i.e., OOD data). Due to the unavailability and +diversity of OOD data, good generalization ability is crucial for effective OOD +detection algorithms, and corresponding learning theory is still an open +problem. To study the generalization of OOD detection, this paper investigates +the probably approximately correct (PAC) learning theory of OOD detection that +fits the commonly used evaluation metrics in the literature. First, we find a +necessary condition for the learnability of OOD detection. Then, using this +condition, we prove several impossibility theorems for the learnability of OOD +detection under some scenarios. Although the impossibility theorems are +frustrating, we find that some conditions of these impossibility theorems may +not hold in some practical scenarios. Based on this observation, we next give +several necessary and sufficient conditions to characterize the learnability of +OOD detection in some practical scenarios. Lastly, we offer theoretical support +for representative OOD detection works based on our OOD theory. + +
+
+ comment: Accepted by JMLR on 7 April 2024. This is a journal extension of the previous NeurIPS 2022 Outstanding Paper "Is Out-of-distribution Detection Learnable?" [arXiv:2210.14707] +
+
+
+
+
+ + ☆ ByteEdit: Boost, Comply and Accelerate Generative Image Editing + + +
+ Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instruction adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative image editing products, including Adobe, Canva, and MeiTu, in both generation quality and consistency. ByteEdit-Outpainting exhibits a remarkable enhancement of 388% and 135% in quality and consistency, respectively, when compared to the baseline model. Experiments also verified that our accelerated models maintain excellent performance in terms of quality and consistency. +
+
+
+
+
+ + ☆ Msmsfnet: a multi-stream and multi-scale fusion net for edge detection + + +
+ Edge detection is a long-standing problem in computer vision. Recent deep learning based algorithms achieve state-of-the-art performance on publicly available datasets. Despite the efficiency of these algorithms, their performance, however, relies heavily on the pretrained weights of the backbone network on the ImageNet dataset. This heavily limits the design space of deep learning based edge detectors. Whenever we want to devise a new model, we have to train this new model on the ImageNet dataset first, and then fine-tune the model using the edge detection datasets; the comparison would be unfair otherwise. However, it is usually not feasible for many researchers to train a model on the ImageNet dataset due to limited computation resources. In this work, we study the performance that can be achieved by state-of-the-art deep learning based edge detectors on publicly available datasets when they are trained from scratch, and devise a new network architecture, the multi-stream and multi-scale fusion net (msmsfnet), for edge detection. We show in our experiments that, by training all models from scratch to ensure the fairness of comparison, our model outperforms state-of-the-art deep learning based edge detectors on three publicly available datasets. +
+
+
+
+
+ + ☆ Task-Aware Encoder Control for Deep Video Compression CVPR 2024 + + +
+ Prior research on deep video compression (DVC) for machine tasks typically necessitates training a unique codec for each specific task, mandating a dedicated decoder per task. In contrast, traditional video codecs employ a flexible encoder controller, enabling the adaptation of a single codec to different tasks through mechanisms like mode prediction. Drawing inspiration from this, we introduce an innovative encoder controller for deep video compression for machines. This controller features a mode prediction and a Group of Pictures (GoP) selection module. Our approach centralizes control at the encoding stage, allowing for adaptable encoder adjustments across different tasks, such as detection and tracking, while maintaining compatibility with a standard pre-trained DVC decoder. Empirical evidence demonstrates that our method is applicable across multiple tasks with various existing pre-trained DVCs. Moreover, extensive experiments demonstrate that our method outperforms previous DVC approaches by saving about 25% of the bitrate on different tasks, with only one pre-trained decoder. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion + Model + + +
+ With the development of large-scale diffusion models, Artificial Intelligence Generated Content (AIGC) techniques have recently become popular. However, how to truly make them serve our daily lives remains an open question. To this end, in this paper, we focus on employing AIGC techniques in one field of e-commerce marketing, i.e., generating hyper-realistic advertising images that display user-specified shoes worn by humans. Specifically, we propose a shoe-wearing system, called ShoeModel, to generate plausible images of human legs interacting with the given shoes. It consists of three modules: (1) a shoe wearable-area detection module (WD), (2) a leg-pose synthesis module (LpS), and (3) a shoe-wearing image generation module (SW). The three modules are executed in sequential stages. Compared to baselines, our ShoeModel generalizes better to different types of shoes, is able to keep the ID-consistency of the given shoes, and automatically produces reasonable interactions with humans. Extensive experiments show the effectiveness of our proposed shoe-wearing system. Figure 1 shows input and output examples of our ShoeModel. +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Strictly-ID-Preserved and Controllable Accessory Advertising Image + Generation + + +
+ Customized generative text-to-image models have the ability to produce images +that closely resemble a given subject. However, in the context of generating +advertising images for e-commerce scenarios, it is crucial that the generated +subject's identity aligns perfectly with the product being advertised. In order +to address the need for strictly-ID preserved advertising image generation, we +have developed a Control-Net based customized image generation pipeline and +have taken earring model advertising as an example. Our approach facilitates a +seamless interaction between the earrings and the model's face, while ensuring +that the identity of the earrings remains intact. Furthermore, to achieve a +diverse and controllable display, we have proposed a multi-branch +cross-attention architecture, which allows for control over the scale, pose, +and appearance of the model, going beyond the limitations of text prompts. Our +method manages to achieve fine-grained control of the generated model's face, +resulting in controllable and captivating advertising effects. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ 3D Building Reconstruction from Monocular Remote Sensing Images with + Multi-level Supervisions CVPR 2024 + + +
+ 3D building reconstruction from monocular remote sensing images is an +important and challenging research problem that has received increasing +attention in recent years, owing to its low cost of data acquisition and +availability for large-scale applications. However, existing methods rely on +expensive 3D-annotated samples for fully-supervised training, restricting their +application to large-scale cross-city scenarios. In this work, we propose +MLS-BRN, a multi-level supervised building reconstruction network that can +flexibly utilize training samples with different annotation levels to achieve +better reconstruction results in an end-to-end manner. To alleviate the demand +on full 3D supervision, we design two new modules, Pseudo Building Bbox +Calculator and Roof-Offset guided Footprint Extractor, as well as new tasks and +training strategies for different types of samples. Experimental results on +several public and new datasets demonstrate that our proposed MLS-BRN achieves +competitive performance using much fewer 3D-annotated samples, and +significantly improves the footprint extraction and 3D reconstruction +performance compared with current state-of-the-art. The code and datasets of +this work will be released at https://github.com/opendatalab/MLS-BRN.git. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Joint Reconstruction of 3D Human and Object via Contact-Based Refinement + Transformer CVPR 2024 + + +
+ Human-object contact serves as a strong cue to understand how humans +physically interact with objects. Nevertheless, it is not widely explored to +utilize human-object contact information for the joint reconstruction of 3D +human and object from a single image. In this work, we present a novel joint 3D +human-object reconstruction method (CONTHO) that effectively exploits contact +information between humans and objects. There are two core designs in our +system: 1) 3D-guided contact estimation and 2) contact-based 3D human and +object refinement. First, for accurate human-object contact estimation, CONTHO +initially reconstructs 3D humans and objects and utilizes them as explicit 3D +guidance for contact estimation. Second, to refine the initial reconstructions +of 3D human and object, we propose a novel contact-based refinement Transformer +that effectively aggregates human features and object features based on the +estimated human-object contact. The proposed contact-based refinement prevents +the learning of erroneous correlation between human and object, which enables +accurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art +performance in both human-object contact estimation and joint reconstruction of +3D human and object. The code is publicly available at +https://github.com/dqj5182/CONTHO_RELEASE. + +
+
+ comment: Published at CVPR 2024, 19 pages including the supplementary material +
+
+
+
+
+ + ☆ DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking + + +
+ Multimodal entity linking (MEL) aims to utilize multimodal information (usually textual and visual information) to link ambiguous mentions to unambiguous entities in a knowledge base. Current methods face three main issues: (1) treating the entire image as input may include redundant information; (2) entity-related information, such as attributes in images, is insufficiently utilized; (3) there is semantic inconsistency between an entity in the knowledge base and its representation. To this end, we propose DWE+ for multimodal entity linking. DWE+ can capture finer semantics and dynamically maintain semantic consistency with entities. This is achieved in three ways: (a) we introduce a method for extracting fine-grained image features by partitioning the image into multiple local objects, and then use hierarchical contrastive learning to further align semantics between coarse-grained information (text and image) and fine-grained information (mention and visual objects); (b) we explore ways to extract visual attributes from images, such as facial features and identity, to enhance the fused features; (c) we leverage Wikipedia and ChatGPT to capture the entity representation, achieving semantic enrichment from both static and dynamic perspectives, which better reflects real-world entity semantics. Experiments on the Wikimel, Richpedia, and Wikidiverse datasets demonstrate the effectiveness of DWE+ in improving MEL performance. Specifically, we optimize these datasets and achieve state-of-the-art performance on the enhanced datasets. The code and enhanced datasets are released at https://github.com/season1blue/DWET +
+
+ comment: under review on TOIS +
+
+
+
+
+ + ☆ MemFlow: Optical Flow Estimation and Prediction with Memory CVPR 2024 + + +
+ Optical flow is a classical task that is important to the vision community. +Classical optical flow estimation uses two frames as input, whilst some recent +methods consider multiple frames to explicitly model long-range information. +The former ones limit their ability to fully leverage temporal coherence along +the video sequence; and the latter ones incur heavy computational overhead, +typically not possible for real-time flow estimation. Some multi-frame-based +approaches even necessitate unseen future frames for current estimation, +compromising real-time applicability in safety-critical scenarios. To this end, +we present MemFlow, a real-time method for optical flow estimation and +prediction with memory. Our method enables memory read-out and update modules +for aggregating historical motion information in real-time. Furthermore, we +integrate resolution-adaptive re-scaling to accommodate diverse video +resolutions. Besides, our approach seamlessly extends to the future prediction +of optical flow based on past observations. Leveraging effective historical +motion aggregation, our method outperforms VideoFlow with fewer parameters and +faster inference speed on Sintel and KITTI-15 datasets in terms of +generalization performance. At the time of submission, MemFlow also leads in +performance on the 1080p Spring dataset. Codes and models will be available at: +https://dqiaole.github.io/MemFlow/. + +
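+
+ A sketch of the memory read-out and update bookkeeping described above: a bounded buffer of past motion features is read with softmax attention and appended to on every frame. The real model operates on learned flow features inside a recurrent estimator; the buffer size, feature dimension, and update rule here are illustrative.
```python
import numpy as np

# Sketch of a bounded motion memory with attention read-out and update. Plain
# random vectors stand in for the learned flow features used by the actual
# estimator; only the memory bookkeeping is shown.

class MotionMemory:
    def __init__(self, dim, capacity=8):
        self.keys = np.zeros((0, dim))
        self.values = np.zeros((0, dim))
        self.capacity = capacity

    def read(self, query):
        """Softmax-attention read-out over stored historical motion features."""
        if len(self.keys) == 0:
            return np.zeros_like(query)
        logits = self.keys @ query / np.sqrt(query.shape[0])
        w = np.exp(logits - logits.max())
        w /= w.sum()
        return w @ self.values

    def update(self, key, value):
        """Append the newest feature, dropping the oldest beyond capacity."""
        self.keys = np.vstack([self.keys, key])[-self.capacity:]
        self.values = np.vstack([self.values, value])[-self.capacity:]

rng = np.random.default_rng(0)
memory = MotionMemory(dim=16)
for _ in range(12):                        # simulate a 12-frame stream
    feat = rng.standard_normal(16)
    context = memory.read(feat)            # aggregate history for the current frame
    memory.update(feat, feat + context)    # store an updated motion feature
print(len(memory.keys))                    # bounded at 8
```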
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive + Segmentation + + +
+ We investigated domain adaptive semantic segmentation in foggy weather +scenarios, which aims to enhance the utilization of unlabeled foggy data and +improve the model's adaptability to foggy conditions. Current methods rely on +clear images as references, jointly learning defogging and segmentation for +foggy images. Despite making some progress, there are still two main drawbacks: +(1) the coupling of segmentation and defogging feature representations, +resulting in a decrease in semantic representation capability, and (2) the +failure to leverage real fog priors in unlabeled foggy data, leading to +insufficient model generalization ability. To address these issues, we propose +a novel training framework, Decouple Defogging and Semantic learning, called +D2SL, aiming to alleviate the adverse impact of defogging tasks on the final +segmentation task. In this framework, we introduce a domain-consistent transfer +strategy to establish a connection between defogging and segmentation tasks. +Furthermore, we design a real fog transfer strategy to improve defogging +effects by fully leveraging the fog priors from real foggy images. Our approach +enhances the semantic representations required for segmentation during the +defogging learning process and maximizes the representation capability of fog +invariance by effectively utilizing real fog data. Comprehensive experiments +validate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Light the Night: A Multi-Condition Diffusion Framework for Unpaired + Low-Light Enhancement in Autonomous Driving CVPR 2024 + + +
+ Vision-centric perception systems for autonomous driving have gained +considerable attention recently due to their cost-effectiveness and +scalability, especially compared to LiDAR-based systems. However, these systems +often struggle in low-light conditions, potentially compromising their +performance and safety. To address this, our paper introduces LightDiff, a +domain-tailored framework designed to enhance the low-light image quality for +autonomous driving applications. Specifically, we employ a multi-condition +controlled diffusion model. LightDiff works without any human-collected paired +data, leveraging a dynamic data degradation process instead. It incorporates a +novel multi-condition adapter that adaptively controls the input weights from +different modalities, including depth maps, RGB images, and text captions, to +effectively illuminate dark scenes while maintaining context consistency. +Furthermore, to align the enhanced images with the detection model's knowledge, +LightDiff employs perception-specific scores as rewards to guide the diffusion +training process through reinforcement learning. Extensive experiments on the +nuScenes datasets demonstrate that LightDiff can significantly improve the +performance of several state-of-the-art 3D detectors in night-time conditions +while achieving high visual quality scores, highlighting its potential to +safeguard autonomous driving. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Coordinated Sparse Recovery of Label Noise + + +
+ Label noise is a common issue in real-world datasets that inevitably impacts +the generalization of models. This study focuses on robust classification tasks +where the label noise is instance-dependent. Estimating the transition matrix +accurately in this task is challenging, and methods based on sample selection +often exhibit confirmation bias to varying degrees. Sparse over-parameterized +training (SOP) has been theoretically effective in estimating and recovering +label noise, offering a novel solution for noise-label learning. However, this +study empirically observes and verifies a technical flaw of SOP: the lack of +coordination between model predictions and noise recovery leads to increased +generalization error. To address this, we propose a method called Coordinated +Sparse Recovery (CSR). CSR introduces a collaboration matrix and confidence +weights to coordinate model predictions and noise recovery, reducing error +leakage. Based on CSR, this study designs a joint sample selection strategy and +constructs a comprehensive and powerful learning framework called CSR+. CSR+ +significantly reduces confirmation bias, especially for datasets with more +classes and a high proportion of instance-specific noise. Experimental results +on simulated and real-world noisy datasets demonstrate that both CSR and CSR+ +achieve outstanding performance compared to methods at the same level. + +
+
+ comment: Pre-print prior to submission to journal +
+
+
+
+
+ + ☆ Few-Shot Object Detection: Research Advances and Challenges + + +
+ Object detection, as a subfield within computer vision, has achieved remarkable progress; it aims to accurately identify and locate specific objects in images or videos. Such methods rely on large-scale labeled training samples for each object category to ensure accurate detection, but obtaining extensive annotated data is a labor-intensive and expensive process in many real-world scenarios. To tackle this challenge, researchers have explored few-shot object detection (FSOD), which combines few-shot learning and object detection techniques to rapidly adapt to novel objects with limited annotated samples. This paper presents a comprehensive survey that reviews the significant advancements in the field of FSOD in recent years and summarizes the existing challenges and solutions. Specifically, we first introduce the background and definition of FSOD to emphasize its potential value in advancing the field of computer vision. We then propose a novel FSOD taxonomy and survey the many remarkable FSOD algorithms under it, providing a comprehensive overview that facilitates a deeper understanding of the FSOD problem and the development of innovative solutions. Finally, we discuss the advantages and limitations of these algorithms to summarize the challenges, potential research directions, and development trends of object detection in data-scarce scenarios. +
+
+
+
+
+ + ☆ Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution CVPR2024 + + +
+ Recently, diffusion models (DM) have been applied in magnetic resonance +imaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive +performance, especially with regard to detailed reconstruction. However, the +current DM-based SR reconstruction methods still face the following issues: (1) +They require a large number of iterations to reconstruct the final image, which +is inefficient and consumes a significant amount of computational resources. +(2) The results reconstructed by these methods are often misaligned with the +real high-resolution images, leading to remarkable distortion in the +reconstructed MR images. To address the aforementioned issues, we propose an +efficient diffusion model for multi-contrast MRI SR, named as DiffMSR. +Specifically, we apply DM in a highly compact low-dimensional latent space to +generate prior knowledge with high-frequency detail information. The highly +compact latent space ensures that DM requires only a few simple iterations to +produce accurate prior knowledge. In addition, we design the Prior-Guide Large +Window Transformer (PLWformer) as the decoder for DM, which can extend the +receptive field while fully utilizing the prior knowledge generated by DM to +ensure that the reconstructed MR image remains undistorted. Extensive +experiments on public and clinical datasets demonstrate that our DiffMSR +outperforms state-of-the-art methods. + +
+
+ comment: 14 pages, 12 figures, Accepted by CVPR2024 +
+
+
+
+
+ + ☆ GenEARL: A Training-Free Generative Framework for Multimodal Event + Argument Role Labeling + + +
+ Multimodal event argument role labeling (EARL), a task that assigns a role to each event participant (object) in an image, is a complex challenge. It requires reasoning over the entire image, the depicted event, and the interactions between the various objects participating in the event. Existing models heavily rely on high-quality event-annotated training data to understand event semantics and structures, and they fail to generalize to new event types and domains. In this paper, we propose GenEARL, a training-free generative framework that harnesses the power of modern generative models to understand event task descriptions given image contexts and perform the EARL task. Specifically, GenEARL comprises two stages of generative prompting with a frozen vision-language model (VLM) and a frozen large language model (LLM). First, the generative VLM learns the semantics of the event argument roles and generates event-centric object descriptions based on the image. Subsequently, an LLM is prompted with the generated object descriptions and a predefined template for EARL (i.e., assigning an event argument role to each object). We show that GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4% and 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets, respectively. In addition, we outperform CLIP-Event by 22% precision on the M2E2 dataset. The framework also allows flexible adaptation and generalization to unseen domains. +
+
+ comment: 20 pages, 15 Figures, 13 figures +
+
+
+
+
+ + ☆ X-VARS: Introducing Explainability in Football Refereeing with + Multi-Modal Large Language Model + + +
+ The rapid advancement of artificial intelligence has led to significant +improvements in automated decision-making. However, the increased performance +of models often comes at the cost of explainability and transparency of their +decision-making processes. In this paper, we investigate the capabilities of +large language models to explain decisions, using football refereeing as a +testing ground, given its decision complexity and subjectivity. We introduce +the Explainable Video Assistant Referee System, X-VARS, a multi-modal large +language model designed for understanding football videos from the point of +view of a referee. X-VARS can perform a multitude of tasks, including video +description, question answering, action recognition, and conducting meaningful +conversations based on video content and in accordance with the Laws of the +Game for football referees. We validate X-VARS on our novel dataset, +SoccerNet-XFoul, which consists of more than 22k video-question-answer triplets +annotated by over 70 experienced football referees. Our experiments and human +study illustrate the impressive capabilities of X-VARS in interpreting complex +football clips. Furthermore, we highlight the potential of X-VARS to reach +human performance and support football referees in the future. + +
+
+
+
+
+ + ☆ DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking + + +
+ Multimodal entity linking (MEL) aims to utilize multimodal information (usually textual and visual information) to link ambiguous mentions to unambiguous entities in a knowledge base. Current methods face three main issues: (1) treating the entire image as input may include redundant information; (2) entity-related information, such as attributes in images, is insufficiently utilized; (3) there is semantic inconsistency between an entity in the knowledge base and its representation. To this end, we propose DWE+ for multimodal entity linking. DWE+ can capture finer semantics and dynamically maintain semantic consistency with entities. This is achieved in three ways: (a) we introduce a method for extracting fine-grained image features by partitioning the image into multiple local objects, and then use hierarchical contrastive learning to further align semantics between coarse-grained information (text and image) and fine-grained information (mention and visual objects); (b) we explore ways to extract visual attributes from images, such as facial features and identity, to enhance the fused features; (c) we leverage Wikipedia and ChatGPT to capture the entity representation, achieving semantic enrichment from both static and dynamic perspectives, which better reflects real-world entity semantics. Experiments on the Wikimel, Richpedia, and Wikidiverse datasets demonstrate the effectiveness of DWE+ in improving MEL performance. Specifically, we optimize these datasets and achieve state-of-the-art performance on the enhanced datasets. The code and enhanced datasets are released at https://github.com/season1blue/DWET +
+
+ comment: under review on TOIS. arXiv admin note: substantial text overlap with + arXiv:2312.11816 +
+
+
+
+
+ + ♻ ☆ PIGEON: Predicting Image Geolocations + + +
+ Planet-scale image geolocalization remains a challenging problem due to the +diversity of images originating from anywhere in the world. Although approaches +based on vision transformers have made significant progress in geolocalization +accuracy, success in prior literature is constrained to narrow distributions of +images of landmarks, and performance has not generalized to unseen places. We +present a new geolocalization system that combines semantic geocell creation, +multi-task contrastive pretraining, and a novel loss function. Additionally, +our work is the first to perform retrieval over location clusters for guess +refinements. We train two models for evaluations on street-level data and +general-purpose image geolocalization; the first model, PIGEON, is trained on +data from the game of Geoguessr and is capable of placing over 40% of its +guesses within 25 kilometers of the target location globally. We also develop a +bot and deploy PIGEON in a blind experiment against humans, ranking in the top +0.01% of players. We further challenge one of the world's foremost professional +Geoguessr players to a series of six matches with millions of viewers, winning +all six games. Our second model, PIGEOTTO, differs in that it is trained on a +dataset of images from Flickr and Wikipedia, achieving state-of-the-art results +on a wide range of image geolocalization benchmarks, outperforming the previous +SOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8 +percentage points on the country level. Our findings suggest that PIGEOTTO is +the first image geolocalization model that effectively generalizes to unseen +places and that our approach can pave the way for highly accurate, planet-scale +image geolocalization systems. Our code is available on GitHub. + +
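+ The "over 40% of its guesses within 25 kilometers" figure above is the usual distance-threshold style of geolocalization metric. The snippet below is a generic sketch of such a metric using the standard haversine formula, not the paper's evaluation code; the function names and argument layout are assumptions.

```python
import math

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two (lat, lon) points in kilometers."""
    r = 6371.0  # mean Earth radius in km
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

def fraction_within(preds, targets, threshold_km=25.0):
    """Fraction of predicted (lat, lon) coordinates within `threshold_km` of the target."""
    hits = sum(haversine_km(*p, *t) <= threshold_km for p, t in zip(preds, targets))
    return hits / len(preds)
```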
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: Accepted by IEEE Open Journal of Signal Processing. 15 pages, 3 + figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AG-ReID.v2: Bridging Aerial and Ground Views for Person + Re-identification + + +
+ Aerial-ground person re-identification (Re-ID) presents unique challenges in +computer vision, stemming from the distinct differences in viewpoints, poses, +and resolutions between high-altitude aerial and ground-based cameras. Existing +research predominantly focuses on ground-to-ground matching, with aerial +matching less explored due to a dearth of comprehensive datasets. To address +this, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID +in mixed aerial and ground scenarios. This dataset comprises 100,502 images of +1,615 unique individuals, each annotated with matching IDs and 15 soft +attribute labels. Data were collected from diverse perspectives using a UAV, +stationary CCTV, and smart glasses-integrated camera, providing a rich variety +of intra-identity variations. Additionally, we have developed an explainable +attention network tailored for this dataset. This network features a +three-stream architecture that efficiently processes pairwise image distances, +emphasizes key top-down features, and adapts to variations in appearance due to +altitude differences. Comparative evaluations demonstrate the superiority of +our approach over existing baselines. We plan to release the dataset and +algorithm source code publicly, aiming to advance research in this specialized +field of computer vision. For access, please visit +https://github.com/huynguyen792/AG-ReID.v2. + +
+
+ comment: 13 pages, Accepted by TIFS 2023 +
+
+
+
+
+ + ♻ ☆ Relightful Harmonization: Lighting-aware Portrait Background Replacement CVPR 2024 + + +
+ Portrait harmonization aims to composite a subject into a new background, +adjusting its lighting and color to ensure harmony with the background scene. +Existing harmonization techniques often only focus on adjusting the global +color and brightness of the foreground and ignore crucial illumination cues +from the background such as apparent lighting direction, leading to unrealistic +compositions. We introduce Relightful Harmonization, a lighting-aware diffusion +model designed to seamlessly harmonize sophisticated lighting effect for the +foreground portrait using any background image. Our approach unfolds in three +stages. First, we introduce a lighting representation module that allows our +diffusion model to encode lighting information from target image background. +Second, we introduce an alignment network that aligns lighting features learned +from image background with lighting features learned from panorama environment +maps, which is a complete representation for scene illumination. Last, to +further boost the photorealism of the proposed method, we introduce a novel +data simulation pipeline that generates synthetic training pairs from a diverse +range of natural images, which are used to refine the model. Our method +outperforms existing benchmarks in visual fidelity and lighting coherence, +showing superior generalization in real-world testing scenarios, highlighting +its versatility and practicality. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention + Graph in Pre-Trained Transformers CVPR + + +
+ Deployment of Transformer models on edge devices is becoming increasingly challenging due to the rapidly growing inference cost, which scales quadratically with the number of tokens in the input sequence. Token pruning is an emerging solution to address this challenge due to its ease of deployment on various Transformer backbones. However, most token pruning methods require computationally expensive fine-tuning, which is undesirable in many edge deployment cases. In this work, we propose Zero-TPrune, the first zero-shot method that considers both the importance and similarity of tokens in performing token pruning. It leverages the attention graph of pre-trained Transformer models to produce an importance distribution for tokens via our proposed Weighted Page Rank (WPR) algorithm. This distribution further guides token partitioning for efficient similarity-based pruning. Due to the elimination of the fine-tuning overhead, Zero-TPrune can prune large models at negligible computational cost, switch between different pruning configurations at no computational cost, and perform hyperparameter tuning efficiently. We evaluate the performance of Zero-TPrune on vision tasks by applying it to various vision Transformer backbones and testing them on ImageNet. Without any fine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and improves its throughput by 45.3% with only 0.4% accuracy loss. Compared with state-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only eliminates the need for fine-tuning after pruning but also does so with only 0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning methods, Zero-TPrune reduces accuracy loss by up to 49% with similar FLOPs budgets. Project webpage: https://jha-lab.github.io/zerotprune.
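+ A minimal sketch of the underlying idea of ranking tokens with the attention graph via a PageRank-style power iteration is given below. It illustrates the concept only and is not the paper's Weighted Page Rank algorithm; the damping factor and the head-averaged attention input are assumptions.

```python
import numpy as np

def token_importance_from_attention(attn, num_iters=30, damping=0.85):
    """
    attn: (num_tokens, num_tokens) attention matrix averaged over heads,
          where attn[i, j] is how much token i attends to token j (rows sum to 1).
    Returns a per-token importance score via a PageRank-style power iteration.
    """
    n = attn.shape[0]
    transition = attn.T  # column-stochastic: importance flows toward attended tokens
    scores = np.full(n, 1.0 / n)
    for _ in range(num_iters):
        scores = damping * transition @ scores + (1.0 - damping) / n
        scores /= scores.sum()
    return scores

# Keeping the top-k tokens by score is then a simple argsort:
# keep_idx = np.argsort(-scores)[:k]
```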
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Turbulence Mitigation: A Translational Perspective CVPR 2024 + + +
+ Recovering images distorted by atmospheric turbulence is a challenging +inverse problem due to the stochastic nature of turbulence. Although numerous +turbulence mitigation (TM) algorithms have been proposed, their efficiency and +generalization to real-world dynamic scenarios remain severely limited. +Building upon the intuitions of classical TM algorithms, we present the Deep +Atmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major +challenges when transitioning from classical to deep learning approaches. By +carefully integrating the merits of classical multi-frame TM methods into a +deep network structure, we demonstrate that DATUM can efficiently perform +long-range temporal aggregation using a recurrent fashion, while deformable +attention and temporal-channel attention seamlessly facilitate pixel +registration and lucky imaging. With additional supervision, tilt and blur +degradation can be jointly mitigated. These inductive biases empower DATUM to +significantly outperform existing methods while delivering a tenfold increase +in processing speed. A large-scale training dataset, ATSyn, is presented as a +co-invention to enable generalization in real turbulence. Our code and datasets +are available at https://xg416.github.io/DATUM. + +
+
+ comment: Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/ +
+
+
+
+
+ + ♻ ☆ Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric + Videos + + +
+ We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the +reconstruction of frames during which the hand is stably holding the object. We +first develop the stable grasp definition based on the intuition that the +in-contact area between the hand and object should remain stable. By analysing +the 3D ARCTIC dataset, we identify stable grasp durations and showcase that +objects in stable grasps move within a single degree of freedom (1-DoF). We +thereby propose a method to jointly optimise all frames within a stable grasp, +minimising object motions to a latent 1-DoF. Finally, we extend the knowledge +to in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed +EPIC-Grasps dataset includes 390 object instances of 9 categories, featuring +stable grasps from videos of daily interactions in 141 environments. Without 3D +ground truth, we use stable contact areas and 2D projection masks to assess the +HO-SGR task in the wild. We evaluate relevant methods and our approach +preserves significantly higher stable contact area, on both EPIC-Grasps and +stable grasp sub-sequences from the ARCTIC dataset. + +
+
+ comment: webpage: https://zhifanzhu.github.io/getagrip +
+
+
+
+
+ + ♻ ☆ DragDiffusion: Harnessing Diffusion Models for Interactive Point-based + Image Editing + + +
+ Accurate and controllable image editing is a challenging task that has +attracted significant attention recently. Notably, DragGAN is an interactive +point-based image editing framework that achieves impressive editing results +with pixel-level precision. However, due to its reliance on generative +adversarial networks (GANs), its generality is limited by the capacity of +pretrained GAN models. In this work, we extend this editing framework to +diffusion models and propose a novel approach DragDiffusion. By harnessing +large-scale pretrained diffusion models, we greatly enhance the applicability +of interactive point-based editing on both real and diffusion-generated images. +Our approach involves optimizing the diffusion latents to achieve precise +spatial control. The supervision signal of this optimization process is from +the diffusion model's UNet features, which are known to contain rich semantic +and geometric information. Moreover, we introduce two additional techniques, +namely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity +of the original image. Lastly, we present a challenging benchmark dataset +called DragBench -- the first benchmark to evaluate the performance of +interactive point-based image editing methods. Experiments across a wide range +of challenging cases (e.g., images with multiple objects, diverse object +categories, various styles, etc.) demonstrate the versatility and generality of +DragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion. + +
+
+ comment: Code is released at https://github.com/Yujun-Shi/DragDiffusion +
+
+
+
+
+ + ♻ ☆ Demystifying CLIP Data + + +
+ Contrastive Language-Image Pre-training (CLIP) is an approach that has advanced research and applications in computer vision, fueling modern recognition systems and generative models. We believe that the main ingredient in the success of CLIP is its data and not the model architecture or pre-training objective. However, CLIP only provides very limited information about its data and how it has been collected, leading to works that aim to reproduce CLIP's data by filtering with its model parameters. In this work, we intend to reveal CLIP's data curation approach and, in our pursuit of making it open to the community, introduce Metadata-Curated Language-Image Pre-training (MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's concepts) and yields a balanced subset over the metadata distribution. Our experimental study rigorously isolates the model and training settings, concentrating solely on data. MetaCLIP applied to CommonCrawl with 400M image-text data pairs outperforms CLIP's data on multiple standard benchmarks. In zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy, surpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining the same training budget, attains 72.4%. Our observations hold across various model sizes, exemplified by ViT-H achieving 80.5%, without any bells-and-whistles. The curation code and the training data distribution over metadata are made available at https://github.com/facebookresearch/MetaCLIP.
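+ The balancing step can be pictured as capping how many image-text pairs any single metadata entry contributes, so that head entries are down-sampled while tail entries are kept whole. The sketch below illustrates that idea only; it is not MetaCLIP's released curation code, and the cap value, matching function, and sampling scheme are assumptions.

```python
import random
from collections import defaultdict

def balance_pool(pairs, entry_of, cap=20000, seed=0):
    """
    pairs:    list of (image_url, text) candidate pairs.
    entry_of: function mapping a text to the metadata entries it matches.
    cap:      maximum number of pairs kept per metadata entry (assumed value).
    Returns a subset whose distribution over metadata entries is flattened;
    pairs matching no metadata entry are dropped.
    """
    rng = random.Random(seed)
    buckets = defaultdict(list)
    for pair in pairs:
        for entry in entry_of(pair[1]):
            buckets[entry].append(pair)

    kept = set()
    for entry, bucket in buckets.items():
        rng.shuffle(bucket)
        kept.update(bucket[:cap])  # head entries are down-sampled, tails kept whole
    return list(kept)
```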
+
+ comment: 17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by + other authors +
+
+
+
+
+ + ♻ ☆ Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class + Incremental Learning CVPR 2022 + + +
+ Class Incremental Learning (CIL) aims at learning a multi-class classifier in +a phase-by-phase manner, in which only data of a subset of the classes are +provided at each phase. Previous works mainly focus on mitigating forgetting in +phases after the initial one. However, we find that improving CIL at its +initial phase is also a promising direction. Specifically, we experimentally +show that directly encouraging CIL Learner at the initial phase to output +similar representations as the model jointly trained on all classes can greatly +boost the CIL performance. Motivated by this, we study the difference between a +na\"ively-trained initial-phase model and the oracle model. Specifically, since +one major difference between these two models is the number of training +classes, we investigate how such difference affects the model representations. +We find that, with fewer training classes, the data representations of each +class lie in a long and narrow region; with more training classes, the +representations of each class scatter more uniformly. Inspired by this +observation, we propose Class-wise Decorrelation (CwD) that effectively +regularizes representations of each class to scatter more uniformly, thus +mimicking the model jointly trained with all classes (i.e., the oracle model). +Our CwD is simple to implement and easy to plug into existing methods. +Extensive experiments on various benchmark datasets show that CwD consistently +and significantly improves the performance of existing state-of-the-art methods +by around 1\% to 3\%. Code will be released. + +
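+ The exact regularizer used in the paper may differ; the following is a rough sketch of the stated idea of penalizing class-wise feature correlations so that each class's representations scatter more uniformly. The function name and the choice to penalize only off-diagonal correlation entries are assumptions.

```python
import torch

def classwise_decorrelation_loss(features, labels, eps=1e-5):
    """
    features: (batch, dim) tensor of representations.
    labels:   (batch,) tensor of class ids.
    Penalizes the squared off-diagonal entries of each class's feature
    correlation matrix, encouraging more uniform scatter per class.
    """
    loss, num_classes = features.new_zeros(()), 0
    for c in labels.unique():
        f = features[labels == c]
        if f.shape[0] < 2:
            continue
        f = (f - f.mean(dim=0)) / (f.std(dim=0) + eps)   # standardize each dimension
        corr = (f.T @ f) / (f.shape[0] - 1)              # (dim, dim) correlation matrix
        off_diag = corr - torch.diag(torch.diagonal(corr))
        loss = loss + off_diag.pow(2).mean()
        num_classes += 1
    return loss / max(num_classes, 1)
```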
+
+ comment: CVPR 2022 Camera-Ready Version +
+
+
+
+
+ + ♻ ☆ Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on + Vulnerable Patient Populations + + +
+ The proliferation of artificial intelligence (AI) in radiology has shed light on the risk of deep learning (DL) models exacerbating clinical biases towards vulnerable patient populations. While prior literature has focused on quantifying biases exhibited by trained DL models, demographically targeted adversarial bias attacks on DL models and their implications in the clinical environment remain an underexplored field of research in medical imaging. In this work, we demonstrate that demographically targeted label poisoning attacks can introduce undetectable underdiagnosis bias in DL models. Our results across multiple performance metrics and demographic groups, such as sex, age, and their intersectional subgroups, show that adversarial bias attacks exhibit high selectivity for the targeted group, degrading group model performance without impacting overall model performance. Furthermore, our results indicate that adversarial bias attacks result in biased DL models that propagate prediction bias even when evaluated with external datasets.
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative + Learning for Dynamic Driving Scenes + + +
+ In real-world environments, outdoor imaging systems are often affected by disturbances such as rain degradation. In nighttime driving scenes in particular, insufficient and uneven lighting shrouds the scene in darkness, resulting in degradation of both image quality and visibility. In the field of autonomous driving, the visual perception ability of RGB sensors experiences a sharp decline in such harsh scenarios. Additionally, driving assistance systems suffer from reduced capabilities in capturing and discerning the surrounding environment, posing a threat to driving safety. Single-view information captured by single-modal sensors cannot comprehensively depict the entire scene. To address these challenges, we developed an image de-raining framework tailored for rainy nighttime driving scenes. It aims to remove rain artifacts, enrich scene representation, and restore useful information. Specifically, we introduce cooperative learning between visible and infrared images captured by different sensors. By cross-view fusion of these multi-source data, the scene within the images gains richer texture details and enhanced contrast. We constructed an information cleaning module called CleanNet as the first stage of our framework. Moreover, we designed an information fusion module called FusionNet as the second stage to fuse the clean visible images with infrared images. Using this stage-by-stage learning strategy, we obtain de-rained fusion images with higher quality and better visual perception. Extensive experiments demonstrate the effectiveness of our proposed Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in low-light rainy environments. The proposed approach addresses the gap in the utilization of existing rain removal algorithms in specific low-light conditions.
+
+
+
+
+ + ♻ ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation CVPR 2024 + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz + continuity constrAIned Normalization CVPR2024 + + +
+ Generative Adversarial Networks (GANs) have significantly advanced image generation, but their performance heavily depends on abundant training data. In scenarios with limited data, GANs often struggle with discriminator overfitting and unstable training. Batch Normalization (BN), despite being known for enhancing generalization and training stability, has rarely been used in the discriminator of Data-Efficient GANs. Our work addresses this gap by identifying a critical flaw in BN: the tendency for gradient explosion during the centering and scaling steps. To tackle this issue, we present CHAIN (lipsCHitz continuity constrAIned Normalization), which replaces the conventional centering step with zero-mean regularization and integrates a Lipschitz continuity constraint in the scaling step. CHAIN further enhances GAN training by adaptively interpolating the normalized and unnormalized features, effectively avoiding discriminator overfitting. Our theoretical analyses firmly establish CHAIN's effectiveness in reducing gradients in latent features and weights, improving stability and generalization in GAN training. Empirical evidence supports our theory. CHAIN achieves state-of-the-art results in data-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven high-resolution few-shot image datasets. Code: https://github.com/MaxwellYaoNi/CHAIN
+
+ comment: Accepted by CVPR2024. 26 pages full version. Code: + https://github.com/MaxwellYaoNi/CHAIN +
+
+
+
+
+ + ♻ ☆ EVCap: Retrieval-Augmented Image Captioning with External Visual-Name + Memory for Open-World Comprehension CVPR 2024 + + +
+ Image captioning based on large language models (LLMs) can describe objects not explicitly observed in training data; yet novel objects occur frequently, necessitating up-to-date object knowledge for open-world comprehension. Instead of relying on large amounts of data and/or scaling up network parameters, we introduce a highly effective retrieval-augmented image captioning method that prompts LLMs with object names retrieved from an External Visual--name memory (EVCap). We build an ever-changing object knowledge memory using objects' visuals and names, enabling us to (i) update the memory at a minimal cost and (ii) effortlessly augment LLMs with retrieved object names by utilizing a lightweight and fast-to-train model. Our model, which was trained only on the COCO dataset, can adapt to out-of-domain data without requiring additional fine-tuning or retraining. Our experiments conducted on benchmarks and synthetic commonsense-violating data show that EVCap, with only 3.97M trainable parameters, exhibits superior performance compared to other methods based on frozen pre-trained LLMs. Its performance is also competitive with specialist SOTA models that require extensive training.
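+ The external visual-name memory boils down to nearest-neighbour retrieval of object names by visual similarity. The class below is an illustrative sketch, not the released EVCap code; the names, the cosine-similarity choice, and the prompt usage at the end are assumptions.

```python
import numpy as np

class VisualNameMemory:
    """Tiny external memory mapping visual features to object names (illustrative)."""

    def __init__(self, features, names):
        # features: (n, d) array of stored object visual embeddings; names: list of n strings.
        self.features = features / np.linalg.norm(features, axis=1, keepdims=True)
        self.names = names

    def retrieve(self, query, top_k=5):
        """Return the names of the top_k stored objects most similar to the query embedding."""
        q = query / np.linalg.norm(query)
        sims = self.features @ q
        idx = np.argsort(-sims)[:top_k]
        return [self.names[i] for i in idx]

# The retrieved names would then be inserted into the LLM prompt, e.g.
# "Objects possibly present: cat, sofa, lamp. Describe the image."
```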
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for + Audio-Visual Segmentation CVPR 2024 + + +
+ Recently, an audio-visual segmentation (AVS) task has been introduced, aiming to group pixels with sounding objects within a given video. This task necessitates a first-ever audio-driven pixel-level understanding of the scene, posing significant challenges. In this paper, we propose an innovative audio-visual transformer framework, termed COMBO, an acronym for COoperation of Multi-order Bilateral relatiOns. For the first time, our framework explores three types of bilateral entanglements within AVS: pixel entanglement, modality entanglement, and temporal entanglement. Regarding pixel entanglement, we employ a Siam-Encoder Module (SEM) that leverages prior knowledge to generate more precise visual features from the foundational model. For modality entanglement, we design a Bilateral-Fusion Module (BFM), enabling COMBO to align corresponding visual and auditory signals bi-directionally. As for temporal entanglement, we introduce an adaptive inter-frame consistency loss that follows the inherent rules of temporal coherence. Comprehensive experiments and ablation studies on the AVSBench-object (84.7 mIoU on S4, 59.2 mIoU on MS3) and AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that COMBO surpasses previous state-of-the-art methods. Code and more results will be publicly available at https://yannqi.github.io/AVS-COMBO/.
+
+ comment: CVPR 2024 Highlight. 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation : + A Unified Approach CVPR + + +
+ Video Anomaly Detection (VAD) is an open-set recognition task, which is +usually formulated as a one-class classification (OCC) problem, where training +data is comprised of videos with normal instances while test data contains both +normal and anomalous instances. Recent works have investigated the creation of +pseudo-anomalies (PAs) using only the normal data and making strong assumptions +about real-world anomalies with regards to abnormality of objects and speed of +motion to inject prior information about anomalies in an autoencoder (AE) based +reconstruction model during training. This work proposes a novel method for +generating generic spatio-temporal PAs by inpainting a masked out region of an +image using a pre-trained Latent Diffusion Model and further perturbing the +optical flow using mixup to emulate spatio-temporal distortions in the data. In +addition, we present a simple unified framework to detect real-world anomalies +under the OCC setting by learning three types of anomaly indicators, namely +reconstruction quality, temporal irregularity and semantic inconsistency. +Extensive experiments on four VAD benchmark datasets namely Ped2, Avenue, +ShanghaiTech and UBnormal demonstrate that our method performs on par with +other existing state-of-the-art PAs generation and reconstruction based methods +under the OCC setting. Our analysis also examines the transferability and +generalisation of PAs across these datasets, offering valuable insights by +identifying real-world anomalies through PAs. + +
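+ The mixup perturbation of optical flow mentioned above is a standard convex combination of two flow fields. A minimal sketch follows; the diffusion-based inpainting stage is omitted, and the Beta-distribution parameter is an assumption rather than the paper's setting.

```python
import numpy as np

def mixup_flow(flow_a, flow_b, alpha=0.4, rng=None):
    """
    flow_a, flow_b: (H, W, 2) optical-flow fields from two different clips.
    Returns a convex combination of the two fields, emulating a
    spatio-temporal distortion for pseudo-anomaly generation.
    """
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    return lam * flow_a + (1.0 - lam) * flow_b
```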
+
+ comment: Accepted in CVPRW 2024 - VAND Workshop +
+
+
+
+
+ + ♻ ☆ Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D + Gaussians + + +
+ Reconstructing and simulating elastic objects from visual observations is +crucial for applications in computer vision and robotics. Existing methods, +such as 3D Gaussians, model 3D appearance and geometry, but lack the ability to +estimate physical properties for objects and simulate them. The core challenge +lies in integrating an expressive yet efficient physical dynamics model. We +propose Spring-Gaus, a 3D physical object representation for reconstructing and +simulating elastic objects from videos of the object from multiple viewpoints. +In particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian +kernels, enabling the reconstruction of the visual appearance, shape, and +physical dynamics of the object. Our approach enables future prediction and +simulation under various initial states and environmental properties. We +evaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating +accurate reconstruction and simulation of elastic objects. Project page: +https://zlicheng.com/spring_gaus. + +
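+ The spring-mass dynamics that the paper couples to 3D Gaussian kernels can be illustrated with a generic Hookean spring-mass integrator. The semi-implicit Euler step below is a textbook sketch, not the authors' simulator, and all names and parameters are illustrative.

```python
import numpy as np

def spring_mass_step(pos, vel, springs, rest_len, stiffness, mass, dt,
                     gravity=(0.0, -9.81, 0.0)):
    """
    pos, vel:  (n, 3) particle positions and velocities.
    springs:   (m, 2) integer array of particle index pairs connected by springs.
    rest_len:  (m,) array of spring rest lengths; stiffness, mass, dt are scalars.
    One semi-implicit Euler step of a Hookean spring-mass system.
    """
    force = np.tile(np.asarray(gravity) * mass, (pos.shape[0], 1))
    i, j = springs[:, 0], springs[:, 1]
    delta = pos[j] - pos[i]
    length = np.linalg.norm(delta, axis=1, keepdims=True)
    direction = delta / np.maximum(length, 1e-8)
    f = stiffness * (length - rest_len[:, None]) * direction  # Hooke's law per spring
    np.add.at(force, i, f)
    np.add.at(force, j, -f)
    vel = vel + dt * force / mass
    pos = pos + dt * vel
    return pos, vel
```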
+
+
+
+
+ + ♻ ☆ A Survey on Transformer Compression + + +
+ Transformer plays a vital role in the realms of natural language processing (NLP) and computer vision (CV), especially for constructing large language models (LLMs) and large vision models (LVMs). Model compression methods reduce the memory and computational cost of Transformer, which is a necessary step to implement large language/vision models on practical devices. Given the unique architecture of Transformer, featuring alternating attention and feedforward neural network (FFN) modules, specific compression techniques are usually required. The efficiency of these compression methods is also paramount, as retraining large models on the entire training dataset is usually impractical. This survey provides a comprehensive review of recent compression methods, with a specific focus on their application to Transformer-based models. The compression methods are primarily categorized into pruning, quantization, knowledge distillation, and efficient architecture design (Mamba, RetNet, RWKV, etc.). In each category, we discuss compression methods for both language and vision tasks, highlighting common underlying principles. Finally, we delve into the relation between various compression methods, and discuss further directions in this domain.
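+ As a concrete instance of the first compression family listed (pruning), a minimal magnitude-pruning pass over a model's linear layers can be written in a few lines. This is a generic illustration rather than any specific method covered by the survey; the per-layer threshold is an assumed design choice.

```python
import torch
import torch.nn as nn

def magnitude_prune(model, sparsity=0.5):
    """Zero out the smallest-magnitude weights of every nn.Linear layer (per-layer threshold)."""
    for module in model.modules():
        if isinstance(module, nn.Linear):
            w = module.weight.data
            k = int(sparsity * w.numel())
            if k == 0:
                continue
            threshold = w.abs().flatten().kthvalue(k).values  # k-th smallest magnitude
            w.mul_((w.abs() > threshold).to(w.dtype))          # in-place masking
    return model
```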
+
+ comment: Model Compression, Transformer, Large Language Model, Large Vision + Model, LLM +
+
+
+
+
+ + ♻ ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ An accurate detection of the centerlines of linear objects is a challenging topic in many sensitive real-world applications such as X-ray imaging, remote sensing, and lane marking detection in road traffic. Model-based approaches using Hough and Radon transforms are often used but are not recommended for thick line detection, whereas approaches based on image derivatives need further step-by-step processing, making their efficiency dependent on each step's outcome. In this paper, we aim to detect linear structures found in images by considering the 3D representation of the image gray levels as a finite mixture of statistical distributions. The latter, which we name the linear anchored Gaussian distribution, is parametrized by a scale value ${\sigma}$ describing the linear structure thickness and by a line equation, parametrized in turn by a radius ${\rho}$ and an orientation angle ${\theta}$, describing the location of the linear structure centerline. The Expectation-Maximization (EM) algorithm is used for the mixture model parameter estimation, where a new paradigm, using background subtraction for the likelihood function computation, is proposed. For the EM algorithm, two ${\theta}$ parameter initialization schemes are used: the first is based on a random choice of the first component of the ${\theta}$ vector, whereas the second is based on the image Hessian with a simultaneous computation of the number of mixture model components. Experiments on real-world images and synthetic images corrupted by blur and additive noise show the good performance of the proposed methods, where the algorithm using background subtraction and Hessian-based ${\theta}$ initialization provides outstanding accuracy in detecting linear structures despite irregular image backgrounds and the presence of blur and noise.
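+ Concretely, the density described above depends on a pixel only through its signed distance to the line $(\rho, \theta)$, with $\sigma$ controlling the line thickness. The sketch below shows the (unnormalized) component density and the corresponding E-step responsibilities; it is an illustration under those assumptions, not the authors' EM implementation.

```python
import numpy as np

def linear_anchored_gaussian(x, y, rho, theta, sigma):
    """
    Unnormalized density of a line-anchored Gaussian at pixel coordinates (x, y):
    the line is x*cos(theta) + y*sin(theta) = rho, and sigma controls its thickness.
    """
    d = x * np.cos(theta) + y * np.sin(theta) - rho   # signed distance to the line
    return np.exp(-0.5 * (d / sigma) ** 2)

def responsibilities(x, y, params, weights):
    """E-step sketch: posterior of each pixel belonging to each line component.

    params: list of (rho, theta, sigma) tuples; weights: mixture weights.
    """
    dens = np.stack([w * linear_anchored_gaussian(x, y, *p)
                     for p, w in zip(params, weights)], axis=-1)
    return dens / np.maximum(dens.sum(axis=-1, keepdims=True), 1e-12)
```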
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ UPNet: Uncertainty-based Picking Deep Learning Network for Robust First + Break Picking + + +
+ In seismic exploration, first break (FB) picking is a crucial step in the determination of subsurface velocity models, significantly influencing the placement of wells. Many deep neural network (DNN)-based automatic picking methods have been proposed to accelerate this processing. Notably, segmentation-based DNN methods provide a segmentation map and then estimate the FB from the map using a picking threshold. However, the uncertainty of the results picked by DNNs still needs to be analyzed. Thus, automatic picking methods applied to field datasets cannot ensure robustness, especially in the case of a low signal-to-noise ratio (SNR). In this paper, we introduce uncertainty quantification into the FB picking task and propose a novel uncertainty-based picking deep learning network called UPNet. UPNet not only estimates the uncertainty of the network output but also can filter out pickings with low confidence. Extensive experiments show that UPNet exhibits higher accuracy and robustness than deterministic DNN-based models, achieving state-of-the-art (SOTA) performance in field surveys. In addition, we verify that the measured uncertainty is meaningful and can provide a reference for human decision-making.
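+ The abstract does not specify how the output uncertainty is estimated; one common recipe that matches the described behaviour (estimate uncertainty, then filter low-confidence picks) is Monte Carlo dropout, sketched below purely as an illustration. The model interface, output shape, and the `max_std` threshold are assumptions, not UPNet's design.

```python
import torch

@torch.no_grad()
def mc_dropout_picks(model, traces, num_samples=20, max_std=3.0):
    """
    traces: (batch, channels, time) seismic input; model(traces) is assumed to
    return per-time-step scores of shape (batch, time).
    Runs the model with dropout active, returning mean first-break picks, their
    standard deviation, and a mask keeping picks with std below `max_std` samples.
    """
    model.train()  # keep dropout layers stochastic
    picks = torch.stack([model(traces).argmax(dim=-1).float()
                         for _ in range(num_samples)])  # (num_samples, batch)
    mean_pick = picks.mean(dim=0)
    uncertainty = picks.std(dim=0)
    return mean_pick, uncertainty, uncertainty < max_std
```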
+
+
+
+
+ + ♻ ☆ UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance + Editing + + +
+ Recent advances in text-guided video editing have showcased promising results +in appearance editing (e.g., stylization). However, video motion editing in the +temporal dimension (e.g., from eating to waving), which distinguishes video +editing from image editing, is underexplored. In this work, we present UniEdit, +a tuning-free framework that supports both video motion and appearance editing +by harnessing the power of a pre-trained text-to-video generator within an +inversion-then-generation framework. To realize motion editing while preserving +source video content, based on the insights that temporal and spatial +self-attention layers encode inter-frame and intra-frame dependency +respectively, we introduce auxiliary motion-reference and reconstruction +branches to produce text-guided motion and source features respectively. The +obtained features are then injected into the main editing path via temporal and +spatial self-attention layers. Extensive experiments demonstrate that UniEdit +covers video motion editing and various appearance editing scenarios, and +surpasses the state-of-the-art methods. Our code will be publicly available. + +
+
+ comment: Project page: https://jianhongbai.github.io/UniEdit/ +
+
+
+
+
+ + ♻ ☆ SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person + Re-Identification with Clothes Change + + +
+ In this paper, we address a highly challenging yet critical task: unsupervised long-term person re-identification with clothes change. Existing unsupervised person re-id methods are mainly designed for short-term scenarios and usually rely on RGB cues, and thus fail to perceive feature patterns that are independent of clothing. To address this bottleneck, we propose a silhouette-driven contrastive learning (SiCL) method, which is designed to learn cross-clothes invariance by integrating both the RGB cues and the silhouette information within a contrastive learning framework. To our knowledge, this is the first tailor-made framework for unsupervised long-term clothes-change re-id, with superior performance on six benchmark datasets. We conduct extensive experiments to evaluate our proposed SiCL against the state-of-the-art unsupervised person re-id methods across all the representative datasets. Experimental results demonstrate that our proposed SiCL significantly outperforms other unsupervised re-id methods.
+
+
+
+
+ + ♻ ☆ DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of + MLLM + + +
+ We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot +object detection ability of multimodal large language models (MLLMs), such as +GPT-4V and Gemini. Our approach consists of a detection prompting toolkit +inspired by high-precision detection priors and a new Chain-of-Thought to +implement these prompts. Specifically, the prompts in the toolkit are designed +to guide the MLLM to focus on regional information (e.g., zooming in), read +coordinates according to measure standards (e.g., overlaying rulers and +compasses), and infer from the contextual information (e.g., overlaying scene +graphs). Building upon these tools, the new detection chain-of-thought can +automatically decompose the task into simple subtasks, diagnose the +predictions, and plan for progressive box refinements. The effectiveness of our +framework is demonstrated across a spectrum of detection tasks, especially hard +cases. Compared to existing state-of-the-art methods, GPT-4V with our +DetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS +COCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val +set for zero-shot referring expression comprehension, +14.5% AP on D-cube +describe object detection FULL setting. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Medical Image Data with Anatomy-Oriented + Imaging Planes + + +
+ Self-supervised learning has emerged as a powerful tool for pretraining deep +networks on unlabeled data, prior to transfer learning of target tasks with +limited annotation. The relevance between the pretraining pretext and target +tasks is crucial to the success of transfer learning. Various pretext tasks +have been proposed to utilize properties of medical image data (e.g., three +dimensionality), which are more relevant to medical image analysis than generic +ones for natural images. However, previous work rarely paid attention to data +with anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance +imaging views. As these imaging planes are defined according to the anatomy of +the imaged organ, pretext tasks effectively exploiting this information can +pretrain the networks to gain knowledge on the organ of interest. In this work, +we propose two complementary pretext tasks for this group of medical image data +based on the spatial relationship of the imaging planes. The first is to learn +the relative orientation between the imaging planes and implemented as +regressing their intersecting lines. The second exploits parallel imaging +planes to regress their relative slice locations within a stack. Both pretext +tasks are conceptually straightforward and easy to implement, and can be +combined in multitask learning for better representation learning. Thorough +experiments on two anatomical structures (heart and knee) and representative +target tasks (semantic segmentation and classification) demonstrate that the +proposed pretext tasks are effective in pretraining deep networks for +remarkably boosted performance on the target tasks, and superior to other +recent approaches. + +
+
+ comment: Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual + Prompt Learning and Knowledge Distillation + + +
+ Due to the complementary nature of visible light and thermal infrared +modalities, object tracking based on the fusion of visible light images and +thermal images (referred to as RGB-T tracking) has received increasing +attention from researchers in recent years. How to achieve more comprehensive +fusion of information from the two modalities at a lower cost has been an issue +that researchers have been exploring. Inspired by visual prompt learning, we +designed a novel two-stream RGB-T tracking architecture based on cross-modal +mutual prompt learning, and used this model as a teacher to guide a one-stream +student model for rapid learning through knowledge distillation techniques. +Extensive experiments have shown that, compared to similar RGB-T trackers, our +designed teacher model achieved the highest precision rate, while the student +model, with comparable precision rate to the teacher model, realized an +inference speed more than three times faster than the teacher model.(Codes will +be available if accepted.) + +
+
+
+
+
+ + ♻ ☆ GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting CVPR 2024 + + +
+ In this paper, we introduce \textbf{GS-SLAM} that first utilizes 3D Gaussian +representation in the Simultaneous Localization and Mapping (SLAM) system. It +facilitates a better balance between efficiency and accuracy. Compared to +recent SLAM methods employing neural implicit representations, our method +utilizes a real-time differentiable splatting rendering pipeline that offers +significant speedup to map optimization and RGB-D rendering. Specifically, we +propose an adaptive expansion strategy that adds new or deletes noisy 3D +Gaussians in order to efficiently reconstruct new observed scene geometry and +improve the mapping of previously observed areas. This strategy is essential to +extend 3D Gaussian representation to reconstruct the whole scene rather than +synthesize a static object in existing methods. Moreover, in the pose tracking +process, an effective coarse-to-fine technique is designed to select reliable +3D Gaussian representations to optimize camera pose, resulting in runtime +reduction and robust estimation. Our method achieves competitive performance +compared with existing state-of-the-art real-time methods on the Replica, +TUM-RGBD datasets. Project page: https://gs-slam.github.io/. + +
+
+ comment: Accepted to CVPR 2024(highlight). Project Page: + https://gs-slam.github.io/ +
+
+
+
+
+ + ♻ ☆ PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection + Features and Variable Receptive Field Voxel Features + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous driving. However, real-time inference from extremely sparse 3D data is a formidable challenge. To address this problem, a typical class of approaches casts the point cloud into a regular data representation (voxels or projection maps) and then performs feature extraction with convolutional neural networks. However, such methods often result in a certain degree of information loss due to down-sampling or over-compression of feature information. This paper proposes a multi-modal point cloud feature fusion method for projection features and variable receptive field voxel features (PV-SSD), based on projection and variable voxelization, to solve the information loss problem. We design a two-branch feature extraction structure with a 2D convolutional neural network to extract the point cloud's projection features in bird's-eye view, focusing on the correlation between local features. A voxel feature extraction branch is used to extract local fine-grained features. Meanwhile, we propose a voxel feature extraction method with variable receptive fields to reduce the information loss of the voxel branch due to downsampling. It avoids missing critical point information by selecting more useful feature points, based on feature point weights, for the detection task. In addition, we propose a multi-modal feature fusion module for point clouds. To validate the effectiveness of our method, we tested it on the KITTI and ONCE datasets.
+
+
+
+
+ + ♻ ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous +practical applications. Due to the expensive collection and labeling costs, +this community struggles with a major bottleneck that the species category of +its datasets is limited to a small number of object species. However, the +existing camouflaged generation methods require specifying the background +manually, thus failing to extend the camouflaged sample diversity in a low-cost +manner. In this paper, we propose a Latent Background Knowledge +Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To +our knowledge, our contributions mainly include: (1) For the first time, we +propose a camouflaged generation paradigm that does not need to receive any +background inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented +method with interpretability for camouflaged generation, in which we propose an +idea that knowledge retrieval and reasoning enhancement are separated +explicitly, to alleviate the task-specific challenges. Moreover, our method is +not restricted to specific foreground targets or backgrounds, offering a +potential for extending camouflaged vision perception to more diverse domains. +(3) Experimental results demonstrate that our method outperforms the existing +approaches, generating more realistic camouflage images. + +
+
+ comment: Accepted by CVPR 2024, Fig.3 revised +
+
+
+
+
+ + ♻ ☆ Extending CLIP's Image-Text Alignment to Referring Image Segmentation NAACL 2024 + + +
+ Referring Image Segmentation (RIS) is a cross-modal task that aims to segment +an instance described by a natural language expression. Recent methods leverage +large-scale pretrained unimodal models as backbones along with fusion +techniques for joint reasoning across modalities. However, the inherent +cross-modal nature of RIS raises questions about the effectiveness of unimodal +backbones. We propose RISCLIP, a novel framework that effectively leverages the +cross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between +image and text features, we capitalize on this starting point and introduce +simple but strong modules that enhance unimodal feature extraction and leverage +rich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP +exhibits outstanding results on all three major RIS benchmarks and also +outperforms previous CLIP-based methods, demonstrating the efficacy of our +strategy in extending CLIP's image-text alignment to RIS. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Human Mesh Recovery from Arbitrary Multi-view Images + + +
+ Human mesh recovery from arbitrary multi-view images involves two characteristics: arbitrary camera poses and an arbitrary number of camera views. Because of this variability, designing a unified framework to tackle the task is challenging. The challenge can be summarized as the dilemma of simultaneously estimating arbitrary camera poses and recovering the human mesh from arbitrary multi-view images while maintaining flexibility. To solve this dilemma, we propose a divide-and-conquer framework for Unified Human Mesh Recovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR consists of a decoupled structure, camera and body decoupling (CBD), together with two main components: camera pose estimation (CPE) and arbitrary view fusion (AVF). As camera poses and the human body mesh are independent of each other, CBD splits their estimation into two sub-tasks handled by two individual sub-networks (i.e., CPE and AVF); thus, the two sub-tasks are disentangled. In CPE, since each camera pose is unrelated to the others, we adopt a shared MLP to process all views in parallel. In AVF, in order to fuse multi-view information and make the fusion operation independent of the number of views, we introduce a transformer decoder with an SMPL parameter query token to extract cross-view features for mesh recovery. To demonstrate the efficacy and flexibility of the proposed framework and the effect of each component, we conduct extensive experiments on three public datasets: Human3.6M, MPI-INF-3DHP, and TotalCapture.
+
+
+
+
+ + ♻ ☆ GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene + Understanding CVPR 2024 + + +
+ Applying NeRF to downstream perception tasks for scene understanding and +representation is becoming increasingly popular. Most existing methods treat +semantic prediction as an additional rendering task, \textit{i.e.}, the "label +rendering" task, to build semantic NeRFs. However, by rendering +semantic/instance labels per pixel without considering the contextual +information of the rendered image, these methods usually suffer from unclear +boundary segmentation and abnormal segmentation of pixels within an object. To +solve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel +pipeline that makes the widely used segmentation model and NeRF work compatibly +under a unified framework, for facilitating context-aware 3D scene perception. +To accomplish this goal, we introduce transformers to aggregate radiance as +well as semantic embedding fields jointly for novel views and facilitate the +joint volumetric rendering of both fields. In addition, we propose two +self-distillation mechanisms, i.e., the Semantic Distill Loss and the +Depth-Guided Semantic Distill Loss, to enhance the discrimination and quality +of the semantic field and the maintenance of geometric consistency. In +evaluation, we conduct experimental comparisons under two perception tasks +(\textit{i.e.} semantic and instance segmentation) using both synthetic and +real-world datasets. Notably, our method outperforms SOTA approaches by 6.94\%, +11.76\%, and 8.47\% on generalized semantic segmentation, finetuning semantic +segmentation, and instance segmentation, respectively. + +
+
+ comment: CVPR 2024 (Highlight). Project Page: + https://lifuguan.github.io/gpnerf-pages/ +
+
+
+
+
+ + ♻ ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel +view synthesis and 3D reconstruction, but its performance is sensitive to input +image quality, which struggles to achieve high-fidelity rendering when provided +with low-quality sparse input viewpoints. Previous methods for NeRF restoration +are tailored for specific degradation type, ignoring the generality of +restoration. To overcome this limitation, we propose a generic radiance fields +restoration pipeline, named RaFE, which applies to various types of +degradations, such as low resolution, blurriness, noise, compression artifacts, +or their combinations. Our approach leverages the success of off-the-shelf 2D +restoration methods to recover the multi-view images individually. Instead of +reconstructing a blurred NeRF by averaging inconsistencies, we introduce a +novel approach using Generative Adversarial Networks (GANs) for NeRF generation +to better accommodate the geometric and appearance inconsistencies present in +the multi-view images. Specifically, we adopt a two-level tri-plane +architecture, where the coarse level remains fixed to represent the low-quality +NeRF, and a fine-level residual tri-plane to be added to the coarse level is +modeled as a distribution with GAN to capture potential variations in +restoration. We validate RaFE on both synthetic and real cases for various +restoration tasks, demonstrating superior performance in both quantitative and +qualitative evaluations, surpassing other 3D restoration methods specific to +single task. Please see our project website +https://zkaiwu.github.io/RaFE-Project/. + +
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE +
+
+
+
+
+ + ♻ ☆ Reduction of Class Activation Uncertainty with Background Information + + +
+ Multitask learning is a popular approach to training high-performing neural +networks with improved generalization. In this paper, we propose a background +class to achieve improved generalization at a lower computation compared to +multitask learning to help researchers and organizations with limited +computation power. We also present a methodology for selecting background +images and discuss potential future improvements. We apply our approach to +several datasets and achieve improved generalization with much lower +computation. Through the class activation mappings (CAMs) of the trained +models, we observed the tendency towards looking at a bigger picture with the +proposed model training methodology. Applying the vision transformer with the +proposed background class, we receive state-of-the-art (SOTA) performance on +STL-10, Caltech-101, and CINIC-10 datasets. Example scripts are available in +the 'CAM' folder of the following GitHub Repository: github.com/dipuk0506/UQ + +
+
+
+
+
+ + ♻ ☆ Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data CVPR 2024 + + +
+ This work presents Depth Anything, a highly practical solution for robust +monocular depth estimation. Without pursuing novel technical modules, we aim to +build a simple yet powerful foundation model dealing with any images under any +circumstances. To this end, we scale up the dataset by designing a data engine +to collect and automatically annotate large-scale unlabeled data (~62M), which +significantly enlarges the data coverage and thus is able to reduce the +generalization error. We investigate two simple yet effective strategies that +make data scaling-up promising. First, a more challenging optimization target +is created by leveraging data augmentation tools. It compels the model to +actively seek extra visual knowledge and acquire robust representations. +Second, an auxiliary supervision is developed to enforce the model to inherit +rich semantic priors from pre-trained encoders. We evaluate its zero-shot +capabilities extensively, including six public datasets and randomly captured +photos. It demonstrates impressive generalization ability. Further, through +fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs +are set. Our better depth model also results in a better depth-conditioned +ControlNet. Our models are released at +https://github.com/LiheYoung/Depth-Anything. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://depth-anything.github.io +
+
+
+
+
+ + ♻ ☆ StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language + Recognition + + +
+ The goal of sign language recognition (SLR) is to help those who are hard of +hearing or deaf overcome the communication barrier. Most existing approaches +can be typically divided into two lines, i.e., Skeleton-based and RGB-based +methods, but both the two lines of methods have their limitations. +Skeleton-based methods do not consider facial expressions, while RGB-based +approaches usually ignore the fine-grained hand structure. To overcome both +limitations, we propose a new framework called Spatial-temporal Part-aware +network~(StepNet), based on RGB parts. As its name suggests, it is made up of +two modules: Part-level Spatial Modeling and Part-level Temporal Modeling. +Part-level Spatial Modeling, in particular, automatically captures the +appearance-based properties, such as hands and faces, in the feature space +without the use of any keypoint-level annotations. On the other hand, +Part-level Temporal Modeling implicitly mines the long-short term context to +capture the relevant attributes over time. Extensive experiments demonstrate +that our StepNet, thanks to spatial-temporal modules, achieves competitive +Top-1 Per-instance accuracy on three commonly-used SLR benchmarks, i.e., 56.89% +on WLASL, 77.2% on NMFs-CSL, and 77.1% on BOBSL. Additionally, the proposed +method is compatible with the optical flow input and can produce superior +performance if fused. For those who are hard of hearing, we hope that our work +can act as a preliminary step. + +
+
+
+
+
+ + ♻ ☆ DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping + + +
+ Automated Aerial Triangulation (AAT), aiming to restore image pose and +reconstruct sparse points simultaneously, plays a pivotal role in earth +observation. With its rich research heritage spanning several decades in +photogrammetry, AAT has evolved into a fundamental process widely applied in +large-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its +advancements, classic AAT methods still face challenges like low efficiency and +limited robustness. This paper introduces DeepAAT, a deep learning network +designed specifically for AAT of UAV imagery. DeepAAT considers both spatial +and spectral characteristics of imagery, enhancing its capability to resolve +erroneous matching pairs and accurately predict image poses. DeepAAT marks a +significant leap in AAT's efficiency, ensuring thorough scene coverage and +precision. Its processing speed outpaces incremental AAT methods by hundreds of +times and global AAT methods by tens of times while maintaining a comparable +level of reconstruction accuracy. Additionally, DeepAAT's scene clustering and +merging strategy facilitate rapid localization and pose determination for +large-scale UAV images, even under constrained computing resources. The +experimental results demonstrate DeepAAT's substantial improvements over +conventional AAT methods, highlighting its potential in the efficiency and +accuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry +society, the code of DeepAAT will be released at: +https://github.com/WHU-USI3DV/DeepAAT. + +
+
+
+
+
+ + ♻ ☆ UniPAD: A Universal Pre-training Paradigm for Autonomous Driving CVPR2024 + + +
+ In the context of autonomous driving, the significance of effective feature +learning is widely acknowledged. While conventional 3D self-supervised +pre-training methods have shown widespread success, most methods follow the +ideas originally designed for 2D images. In this paper, we present UniPAD, a +novel self-supervised learning paradigm applying 3D volumetric differentiable +rendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction +of continuous 3D shape structures and the intricate appearance characteristics +of their 2D projections. The flexibility of our method enables seamless +integration into both 2D and 3D frameworks, enabling a more holistic +comprehension of the scenes. We manifest the feasibility and effectiveness of +UniPAD by conducting extensive experiments on various downstream 3D tasks. Our +method significantly improves lidar-, camera-, and lidar-camera-based baseline +by 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline +achieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic +segmentation on the nuScenes validation set, achieving state-of-the-art results +in comparison with previous methods. The code will be available at +https://github.com/Nightmare-n/UniPAD. + +
+
+ comment: CVPR2024 +
+
+
+
+
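The rendering-based pretext task in the UniPAD abstract above ultimately rests on standard differentiable volume rendering: sample points along camera rays, query a learned 3D feature volume for density and colour, and alpha-composite. The routine below is only that generic compositing step, written as a sketch; the voxel-feature query, masking, and reconstruction losses are assumed to exist elsewhere and this is not UniPAD's actual implementation.

import torch

def volume_render(density, color, deltas):
    # density: (R, S) per-sample densities along R rays with S samples each
    # color:   (R, S, 3) per-sample colours, deltas: (R, S) spacing between samples
    alpha = 1.0 - torch.exp(-density * deltas)
    trans = torch.cumprod(
        torch.cat([torch.ones_like(alpha[:, :1]), 1.0 - alpha + 1e-10], dim=1), dim=1
    )[:, :-1]                                           # accumulated transmittance
    weights = alpha * trans                             # (R, S) compositing weights
    rgb = (weights.unsqueeze(-1) * color).sum(dim=1)    # (R, 3) rendered colour
    depth = (weights * deltas.cumsum(dim=1)).sum(dim=1) # expected depth (cumulative distance proxy)
    return rgb, depth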
+ + ♻ ☆ CityGaussian: Real-time High-quality Large-Scale Scene Rendering with + Gaussians + + +
+ The advancement of real-time 3D scene reconstruction and novel view synthesis +has been significantly propelled by 3D Gaussian Splatting (3DGS). However, +effectively training large-scale 3DGS and rendering it in real-time across +various scales remains challenging. This paper introduces CityGaussian +(CityGS), which employs a novel divide-and-conquer training approach and +Level-of-Detail (LoD) strategy for efficient large-scale 3DGS training and +rendering. Specifically, the global scene prior and adaptive training data +selection enable efficient training and seamless fusion. Based on fused +Gaussian primitives, we generate different detail levels through compression, +and realize fast rendering across various scales through the proposed +block-wise detail level selection and aggregation strategy. Extensive +experimental results on large-scale scenes demonstrate that our approach +attains state-of-the-art rendering quality, enabling consistent real-time +rendering of large-scale scenes across vastly different scales. Our project page +is available at https://dekuliutesla.github.io/citygs/.
+
+ comment: Project Page: https://dekuliutesla.github.io/citygs/ +
+
+
+
+
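A toy version of the block-wise Level-of-Detail selection described in the CityGaussian abstract above: the scene is split into spatial blocks, each block stores several compressed detail levels, and the level used for rendering is chosen from the block's distance to the camera. The thresholds and the data layout here are invented for illustration only.

import numpy as np

def select_lod(block_centers, cam_pos, thresholds=(20.0, 60.0)):
    # block_centers: (B, 3), cam_pos: (3,) -> one LoD index per block
    d = np.linalg.norm(block_centers - cam_pos, axis=1)
    return np.digitize(d, thresholds)        # 0 = finest level, len(thresholds) = coarsest

def gather_gaussians(blocks, levels):
    # blocks: list of dicts mapping LoD index -> array of Gaussian parameters for that block
    return np.concatenate([blk[int(lvl)] for blk, lvl in zip(blocks, levels)], axis=0)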
+ + ♻ ☆ Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and + Collaborating Virtual Architecture by Deep Learning in the Metaverse + + +
+ 3D shape generation techniques leveraging deep learning have garnered +significant interest from both the computer vision and architectural design +communities, promising to enrich the content of the future metaverse. However, +research on virtual architectural design remains limited, particularly +regarding human-AI collaboration and deep learning-assisted design. We first +illuminate the principles, generation techniques, and current literature of +virtual architecture, focusing on challenges such as datasets, multimodality, +design intuition, and generative frameworks. In our survey, we reviewed 187 +related articles (80.7% of articles published between 2018 and 2022) covering +architectural research, virtual environments, and technical approaches. This +survey investigates the latest approaches to 3D object generation with deep +generative models (DGMs) and summarizes four characteristics of deep-learning +generation approaches for virtual architecture. According to our analysis of +the survey, we expound on four research agendas, including agency, +communication, user consideration, and integrating tools, and highlight three +important enablers of ubiquitous interaction with immersive systems in deep +learning-assisted architectural generation. Our work contributes to fostering +understanding between designers and deep learning techniques, broadening access +to human-AI collaboration. We advocate for interdisciplinary efforts to address +this timely research topic, facilitating content design and generation in +the metaverse.
+
+ comment: 37 pages, 9 figures, and 5 tables +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial + Oriented Object Detection + + +
+ Existing oriented object detection methods commonly use the metric AP$_{50}$ to +measure the performance of the model. We argue that AP$_{50}$ is inherently +unsuitable for oriented object detection due to its large tolerance in angle +deviation. Therefore, we advocate using a high-precision metric, e.g., AP$_{75}$, +to measure the performance of models. In this paper, we propose an Aspect Ratio +Sensitive Oriented Object Detector with Transformer, termed ARS-DETR, which +exhibits competitive performance in high-precision oriented object detection. +Specifically, a new angle classification method, called Aspect Ratio aware +Circle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more +reasonable way and discard the hyperparameter introduced by previous work +(e.g., CSL). Then, a rotated deformable attention module is designed to rotate +the sampling points with the corresponding angles and eliminate the +misalignment between region features and sampling points. Moreover, a dynamic +weight coefficient according to the aspect ratio is adopted to calculate the +angle loss. Comprehensive experiments on several challenging datasets show that +our method achieves competitive performance on the high-precision oriented +object detection task.
+
+ comment: 15 pages, 13 figures, 13 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
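The AR-CSL idea in the ARS-DETR abstract above can be illustrated with a small sketch. This is one plausible reading of the abstract, not the paper's formula: the angle is binned, the classification target is smoothed circularly around the ground-truth bin, and boxes with larger aspect ratios get a sharper window because their orientation error matters more.

import numpy as np

def ar_circular_smooth_label(angle_bin, num_bins=180, aspect_ratio=1.0):
    idx = np.arange(num_bins)
    # circular distance between each bin and the ground-truth angle bin
    circ_dist = np.minimum(np.abs(idx - angle_bin), num_bins - np.abs(idx - angle_bin))
    # near-square boxes tolerate larger angle deviation, so they get a wider window
    sigma = max(1.0, 6.0 / aspect_ratio)
    return np.exp(-0.5 * (circ_dist / sigma) ** 2)

# e.g. ar_circular_smooth_label(90, aspect_ratio=5.0) is far sharper than aspect_ratio=1.1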
+ + ♻ ☆ Bi-LORA: A Vision-Language Approach for Synthetic Image Detection + + +
+ Advancements in deep image synthesis techniques, such as generative +adversarial networks (GANs) and diffusion models (DMs), have ushered in an era +of generating highly realistic images. While this technological progress has +captured significant interest, it has also raised concerns about the potential +difficulty in distinguishing real images from their synthetic counterparts. +This paper takes inspiration from the potent convergence capabilities between +vision and language, coupled with the zero-shot nature of vision-language +models (VLMs). We introduce an innovative method called Bi-LORA that leverages +VLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance +the precision of synthetic image detection for unseen model-generated images. +The pivotal conceptual shift in our methodology revolves around reframing +binary classification as an image captioning task, leveraging the distinctive +capabilities of cutting-edge VLM, notably bootstrapping language image +pre-training (BLIP2). Rigorous and comprehensive experiments are conducted to +validate the effectiveness of our proposed approach, particularly in detecting +unseen diffusion-generated images from unknown diffusion-based generative +models during training, showcasing robustness to noise, and demonstrating +generalization capabilities to GANs. The obtained results showcase an +impressive average accuracy of 93.41% in synthetic image detection on unseen +generation models. The code and models associated with this research can be +publicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+
+
+
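The reframing described in the Bi-LORA abstract above (binary real/synthetic detection recast as image captioning with a LoRA-tuned BLIP-2) might look roughly like the sketch below using Hugging Face transformers and peft. The checkpoint name, LoRA target modules, caption vocabulary, and decision rule are assumptions; the paper's exact setup may differ, and the LoRA fine-tuning loop on real/fake captioned images is omitted.

import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from peft import LoraConfig, get_peft_model

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
# wrap the language model's attention projections with LoRA adapters; only these are tuned
model = get_peft_model(model, LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"]))

@torch.no_grad()
def predict(image):
    inputs = processor(images=image, return_tensors="pt").to(model.device)
    caption = processor.decode(model.generate(**inputs, max_new_tokens=3)[0], skip_special_tokens=True)
    # after LoRA tuning on images captioned "real" / "fake", the generated word serves as the label
    return "synthetic" if "fake" in caption.lower() else "real"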
+ + ♻ ☆ Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation + + +
+ Recently, image-to-3D approaches have achieved significant results with a +natural image as input. However, it is not always possible to access these +enriched color input samples in practical applications, where only sketches are +available. Existing sketch-to-3D research suffers from limited applicability +due to the lack of color information and multi-view content. To overcome these +limitations, this paper proposes a novel generation paradigm, Sketch3D, to +generate realistic 3D assets whose shape is aligned with the input sketch and +whose color matches the textual description. Concretely, Sketch3D first +instantiates the given sketch in a reference image through a +shape-preserving generation process. Second, the reference image is leveraged +to deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance +images are generated based on the renderings of the 3D Gaussians. Finally, +three strategies are designed to optimize the 3D Gaussians, i.e., structural +optimization via a distribution transfer mechanism, color optimization with a +straightforward MSE loss, and sketch similarity optimization with a CLIP-based +geometric similarity loss. Extensive visual comparisons and quantitative +analysis illustrate the advantage of our Sketch3D in generating realistic 3D +assets while preserving consistency with the input.
+
+
+
+
+ + ♻ ☆ Training Like a Medical Resident: Context-Prior Learning Toward + Universal Medical Image Segmentation CVPR 2024 + + +
+ A major focus of clinical imaging workflows is disease diagnosis and +management, leading to medical imaging datasets strongly tied to specific +clinical objectives. This scenario has led to the prevailing practice of +developing task-specific segmentation models, without gaining insights from +widespread imaging cohorts. Inspired by the training program of medical +radiology residents, we propose a shift towards universal medical image +segmentation, a paradigm aiming to build medical image understanding foundation +models by leveraging the diversity and commonality across clinical targets, +body regions, and imaging modalities. Towards this goal, we develop Hermes, a +novel context-prior learning approach that addresses the challenges of data +heterogeneity and annotation differences in medical image segmentation. In a +large collection of eleven diverse datasets (2,438 3D images) across five +modalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we +demonstrate the merit of the universal paradigm over the traditional paradigm +in addressing multiple tasks within a single model. By exploiting the synergy +across tasks, Hermes achieves state-of-the-art performance on all testing +datasets and shows superior model scalability. Results on two additional +datasets reveal Hermes' strong performance for transfer learning, incremental +learning, and generalization to downstream tasks. Hermes' learned priors +demonstrate an appealing ability to reflect the intricate relations among tasks +and modalities, which aligns with the established anatomical and imaging +principles in radiology. The code is available at: +https://github.com/yhygao/universal-medical-image-segmentation.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Tailored Visions: Enhancing Text-to-Image Generation with Personalized + Prompt Rewriting CVPR 2024 + + +
+ Despite significant progress in the field, it is still challenging to create +personalized visual representations that align closely with the desires and +preferences of individual users. This process requires users to articulate +their ideas in words that are both comprehensible to the models and accurately +capture their vision, posing difficulties for many users. In this paper, we +tackle this challenge by leveraging historical user interactions with the +system to enhance user prompts. We propose a novel approach that involves +rewriting user prompts based on a newly collected large-scale text-to-image +dataset with over 300k prompts from 3115 users. Our rewriting model enhances +the expressiveness and alignment of user prompts with their intended visual +outputs. Experimental results demonstrate the superiority of our methods over +baseline approaches, as evidenced in our new offline evaluation method and +online tests. Our code and dataset are available at +https://github.com/zzjchen/Tailored-Visions. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving + + +
+ Modeling complicated interactions among the ego-vehicle, road agents, and map +elements has been a crucial part of safety-critical autonomous driving. +Previous works on end-to-end autonomous driving rely on the attention mechanism +for handling heterogeneous interactions, which fails to capture geometric +priors and is also computationally intensive. In this paper, we propose the +Interaction Scene Graph (ISG) as a unified method to model the interactions +among the ego-vehicle, road agents, and map elements. With the representation +of the ISG, the driving agents aggregate essential information from the most +influential elements, including the road agents with potential collisions and +the map elements to follow. Since a large number of unnecessary interactions are +omitted, the more efficient scene-graph-based framework is able to focus on +indispensable connections and leads to better performance. We evaluate the +proposed method for end-to-end autonomous driving on the nuScenes dataset. +Compared with strong baselines, our method significantly outperforms them on +the full-stack driving tasks, including perception, prediction, and planning. Code +will be released at https://github.com/zhangyp15/GraphAD.
+
+ comment: project page: https://github.com/zhangyp15/GraphAD +
+
+
+
+
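One way to read the scene-graph aggregation in the GraphAD abstract above is that each node (ego vehicle, road agent, or map element) exchanges information only with its most relevant neighbours instead of attending to every element. The snippet below is a generic k-nearest-neighbour aggregation in that spirit; the actual ISG construction and edge semantics in the paper are more involved.

import torch

def knn_aggregate(node_feats, node_pos, k=8):
    # node_feats: (N, C) per-node features, node_pos: (N, 2) BEV positions
    dist = torch.cdist(node_pos, node_pos)                   # (N, N) pairwise distances
    idx = dist.topk(k + 1, largest=False).indices[:, 1:]     # k nearest neighbours, self dropped
    neigh = node_feats[idx]                                  # (N, k, C)
    w = torch.softmax(-dist.gather(1, idx), dim=-1).unsqueeze(-1)  # nearer nodes weigh more
    return node_feats + (w * neigh).sum(dim=1)               # residual aggregation over the sparse graph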
+ + ♻ ☆ Towards a Simultaneous and Granular Identity-Expression Control in + Personalized Face Generation + + +
+ In human-centric content generation, pre-trained text-to-image models +struggle to produce user-wanted portrait images that retain the identity of +individuals while exhibiting diverse expressions. This paper introduces our +efforts towards personalized face generation. To this end, we propose a novel +multi-modal face generation framework, capable of simultaneous +identity-expression control and more fine-grained expression synthesis. Our +expression control is so sophisticated that it can be specified by a +fine-grained emotional vocabulary. We devise a novel diffusion model that can +undertake the task of simultaneous face swapping and reenactment. Due to the +entanglement of identity and expression, separately and precisely controlling +them within one framework is nontrivial and thus has not been explored yet. To +overcome this, we propose several innovative designs in the conditional +diffusion model, including balancing the identity and expression encoders, +improved midpoint sampling, and explicit background conditioning. Extensive +experiments have demonstrated the controllability and scalability of the +proposed framework, in comparison with state-of-the-art text-to-image, face +swapping, and face reenactment methods.
+
+
+
+
+ + ♻ ☆ FineDiffusion: Scaling up Diffusion Models for Fine-grained Image + Generation with 10,000 Classes + + +
+ Class-conditional image generation based on diffusion models is renowned +for generating high-quality and diverse images. However, most prior efforts +focus on generating images for general categories, e.g., the 1000 classes in +ImageNet-1k. A more challenging task, large-scale fine-grained image +generation, remains a frontier to explore. In this work, we present a +parameter-efficient strategy, called FineDiffusion, to fine-tune large +pre-trained diffusion models, scaling them to large-scale fine-grained image +generation with 10,000 categories. FineDiffusion significantly accelerates +training and reduces storage overhead by fine-tuning only the tiered class +embedder, bias terms, and normalization layers' parameters. To further improve +the image generation quality of fine-grained categories, we propose a novel +sampling method for fine-grained image generation, which utilizes +superclass-conditioned guidance, specifically tailored for fine-grained +categories, to replace conventional classifier-free guidance sampling. +Compared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x +training speed-up and requires storing merely 1.77% of the total model +parameters, while achieving a state-of-the-art FID of 9.776 on image generation +of 10,000 classes. Extensive qualitative and quantitative experiments +demonstrate the superiority of our method compared to other parameter-efficient +fine-tuning methods. The code and more generated results are available at our +project website: https://finediffusion.github.io/.
+
+
+
+
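The parameter-selection rule and the modified guidance in the FineDiffusion abstract above are simple to sketch. In the snippet below, everything is frozen except normalization layers, bias terms, and any module whose name contains an assumed class-embedding pattern; the guidance function replaces the unconditional branch of classifier-free guidance with a superclass-conditioned prediction. The naming pattern and the guidance scale are assumptions, not values from the paper.

import torch.nn as nn

def mark_trainable(model: nn.Module, class_emb_pattern="class_emb"):
    for p in model.parameters():
        p.requires_grad = False                              # freeze everything by default
    for name, module in model.named_modules():
        if isinstance(module, (nn.LayerNorm, nn.GroupNorm)) or class_emb_pattern in name:
            for p in module.parameters():
                p.requires_grad = True                       # unfreeze norms and the class embedder
    for name, p in model.named_parameters():
        if name.endswith("bias"):
            p.requires_grad = True                           # unfreeze all bias terms
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def superclass_guidance(eps_fine, eps_super, w=4.0):
    # superclass-conditioned prediction replaces the unconditional branch of classifier-free guidance
    return eps_super + w * (eps_fine - eps_super)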
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in +computational pathology, encompassing sub-typing, diagnosis, prognosis, and +more. However, the existing MIL paradigm typically requires an offline instance +feature extractor, such as a pre-trained ResNet or a foundation model. This +approach lacks the capability for feature fine-tuning within specific +downstream tasks, limiting its adaptability and performance. To address this +issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding +the instance features online, which captures fine-grained local features and +establishes connections across different regions. Unlike existing works that +focus on pre-training a powerful feature extractor or designing a sophisticated +instance aggregator, R$^2$T is tailored to re-embed instance features online. +It serves as a portable module that can seamlessly integrate into mainstream +MIL models. Extensive experimental results on common computational pathology +tasks validate that: 1) feature re-embedding improves the performance of MIL +models based on ResNet-50 features to the level of foundation model features, +and further enhances the performance of foundation model features; 2) R$^2$T +can introduce more significant performance improvements to various MIL +models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other recent +methods by a large margin. The code is available at: +https://github.com/DearCaat/RRT-MIL.
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
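To make the "portable re-embedding module" idea in the abstract above concrete, the sketch below inserts a small Transformer encoder between the offline instance features and an attention-based MIL head. Dimensions, depth, and the plain (non-regional) Transformer are placeholders; the paper's R$^2$T uses a regional design rather than this vanilla encoder.

import torch
import torch.nn as nn

class ReEmbedMIL(nn.Module):
    def __init__(self, dim=1024, heads=8, num_classes=2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, batch_first=True)
        self.re_embed = nn.TransformerEncoder(layer, num_layers=2)      # online re-embedding of instances
        self.attn = nn.Sequential(nn.Linear(dim, 128), nn.Tanh(), nn.Linear(128, 1))
        self.head = nn.Linear(dim, num_classes)

    def forward(self, bag):                        # bag: (1, N, dim) offline instance features of one slide
        x = self.re_embed(bag)                     # let instances exchange information before pooling
        w = torch.softmax(self.attn(x), dim=1)     # attention-based MIL pooling weights
        return self.head((w * x).sum(dim=1))       # bag-level prediction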
+ + ♻ ☆ GenAD: Generative End-to-End Autonomous Driving + + +
+ Directly producing planning results from raw sensors has been a long-desired +solution for autonomous driving and has attracted increasing attention +recently. Most existing end-to-end autonomous driving methods factorize this +problem into perception, motion prediction, and planning. However, we argue +that the conventional progressive pipeline still cannot comprehensively model +the entire traffic evolution process, e.g., the future interaction between the +ego car and other traffic participants and the structural trajectory prior. In +this paper, we explore a new paradigm for end-to-end autonomous driving, where +the key is to predict how the ego car and the surroundings evolve given past +scenes. We propose GenAD, a generative framework that casts autonomous driving +into a generative modeling problem. We propose an instance-centric scene +tokenizer that first transforms the surrounding scenes into map-aware instance +tokens. We then employ a variational autoencoder to learn the future trajectory +distribution in a structural latent space for trajectory prior modeling. We +further adopt a temporal model to capture the agent and ego movements in the +latent space to generate more effective future trajectories. GenAD finally +simultaneously performs motion prediction and planning by sampling +distributions in the learned structural latent space conditioned on the +instance tokens and using the learned temporal model to generate futures. +Extensive experiments on the widely used nuScenes benchmark show that the +proposed GenAD achieves state-of-the-art performance on vision-centric +end-to-end autonomous driving with high efficiency. Code: +https://github.com/wzzheng/GenAD. + +
+
+ comment: Code is available at: https://github.com/wzzheng/GenAD +
+
+
+
+
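The trajectory-generation core described in the GenAD abstract above (a VAE over future trajectories in a latent space, decoded by a temporal model) can be sketched as below. The instance token is assumed to come from the scene tokenizer; the sizes, the GRU decoder, and the 2D-offset output are illustrative placeholders rather than the paper's architecture.

import torch
import torch.nn as nn

class TrajVAE(nn.Module):
    def __init__(self, token_dim=256, latent_dim=32, horizon=6):
        super().__init__()
        self.horizon = horizon
        self.enc = nn.Sequential(nn.Linear(token_dim + horizon * 2, 256), nn.ReLU(),
                                 nn.Linear(256, 2 * latent_dim))        # outputs (mu, logvar)
        self.gru = nn.GRUCell(latent_dim + token_dim, 256)              # temporal decoder
        self.out = nn.Linear(256, 2)                                    # (x, y) offset per future step

    def decode(self, z, token):
        h = torch.zeros(z.size(0), 256, device=z.device)
        steps = []
        for _ in range(self.horizon):
            h = self.gru(torch.cat([z, token], dim=-1), h)
            steps.append(self.out(h))
        return torch.stack(steps, dim=1)                                # (B, horizon, 2)

    def forward(self, token, future_xy):                                # token: (B, 256), future_xy: (B, horizon, 2)
        mu, logvar = self.enc(torch.cat([token, future_xy.flatten(1)], dim=-1)).chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()            # reparameterized latent
        kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(dim=-1).mean()
        return self.decode(z, token), kl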
+ + ♻ ☆ CCEdit: Creative and Controllable Video Editing via Diffusion Models + + +
+ In this paper, we present CCEdit, a versatile generative video editing +framework based on diffusion models. Our approach employs a novel trident +network structure that separates structure and appearance control, ensuring +precise and creative editing capabilities. Utilizing the foundational +ControlNet architecture, we maintain the structural integrity of the video +during editing. The incorporation of an additional appearance branch enables +users to exert fine-grained control over the edited key frame. These two side +branches seamlessly integrate into the main branch, which is constructed upon +existing text-to-image (T2I) generation models, through learnable temporal +layers. The versatility of our framework is demonstrated through a diverse +range of choices in both structure representations and personalized T2I models, +as well as the option to provide the edited key frame. To facilitate +comprehensive evaluation, we introduce the BalanceCC benchmark dataset, +comprising 100 videos and 4 target prompts for each video. Our extensive user +studies compare CCEdit with eight state-of-the-art video editing methods. The +outcomes demonstrate CCEdit's substantial superiority over all other methods. + +
+
+
+
+
+ + ♻ ☆ PrivImage: Differentially Private Synthetic Image Generation using + Diffusion Models with Semantic-Aware Pretraining USENIX Security 2024 + + +
+ Differential Privacy (DP) image data synthesis leverages DP techniques to +generate synthetic data that replace sensitive data, allowing +organizations to share and utilize synthetic images without privacy concerns. +Previous methods incorporate advanced techniques of generative models and +pre-training on a public dataset to produce exceptional DP image data, but +suffer from problems of unstable training and massive computational resource +demands. This paper proposes a novel DP image synthesis method, termed +PRIVIMAGE, which meticulously selects pre-training data, promoting the +efficient creation of DP datasets with high fidelity and utility. PRIVIMAGE +first establishes a semantic query function using a public dataset. Then, this +function assists in querying the semantic distribution of the sensitive +dataset, facilitating the selection of data from the public dataset with +analogous semantics for pre-training. Finally, we pre-train an image generative +model using the selected data and then fine-tune this model on the sensitive +dataset using Differentially Private Stochastic Gradient Descent (DP-SGD). +PRIVIMAGE allows us to train a lightly parameterized generative model, reducing +the noise in the gradient during DP-SGD training and enhancing training +stability. Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the +public dataset for pre-training and 7.6% of the parameters in the generative +model compared to the state-of-the-art method, while achieving superior +synthetic performance and conserving more computational resources. On average, +PRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy +than the state-of-the-art method. The replication package and datasets can be +accessed online.
+
+ comment: Accepted at USENIX Security 2024 +
+
+
+
+
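A simplified, non-private sketch of the semantic-aware selection step from the PrivImage abstract above: a classifier trained on the public dataset (the "semantic query function") labels the sensitive images, the label histogram approximates the sensitive data's semantic distribution, and public images from the most frequent classes are kept for pre-training. In the actual method this query must itself be privatized; that step is omitted here, and the class count and budget are arbitrary assumptions.

import numpy as np

def select_pretraining_data(public_labels, sensitive_labels, budget_fraction=0.01, top_k=50):
    # public_labels: labels of public images, sensitive_labels: public-classifier labels on sensitive images
    hist = np.bincount(sensitive_labels, minlength=public_labels.max() + 1)
    keep_classes = np.argsort(hist)[::-1][:top_k]                  # most frequent semantics (noise-free proxy)
    candidates = np.flatnonzero(np.isin(public_labels, keep_classes))
    budget = int(budget_fraction * len(public_labels))             # e.g. ~1% of the public dataset
    return np.random.choice(candidates, size=min(budget, len(candidates)), replace=False)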
+ + ♻ ☆ Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated + Objects + + +
+ We present Knowledge NeRF to synthesize novel views for dynamic scenes. +Reconstructing dynamic 3D scenes from few sparse views and rendering them from +arbitrary perspectives is a challenging problem with applications in various +domains. Previous dynamic NeRF methods learn the deformation of articulated +objects from monocular videos. However, the quality of their reconstructed +scenes is limited. To clearly reconstruct dynamic scenes, we propose a new +framework that considers two frames at a time. We pretrain a NeRF model for an +articulated object. When the articulated object moves, Knowledge NeRF learns to +generate novel views at the new state by incorporating past knowledge in the +pretrained NeRF model with minimal observations in the present state. We propose +a projection module to adapt NeRF to dynamic scenes, learning the correspondence +between the pretrained knowledge base and the current state. Experimental results +demonstrate the effectiveness of our method in reconstructing dynamic 3D scenes +with 5 input images in one state. Knowledge NeRF is a new pipeline and a +promising solution for novel view synthesis of dynamic articulated objects. The +data and implementation are publicly available at +https://github.com/RussRobin/Knowledge_NeRF.
+
+
+
+
+ + ♻ ☆ Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging + + +
+ High dynamic range (HDR) imaging involves capturing a series of frames of the +same scene, each with different exposure settings, to broaden the dynamic range +of light. This can be achieved through burst capturing or using staggered HDR +sensors that capture long and short exposures simultaneously in the camera +image signal processor (ISP). Within camera ISP pipeline, illuminant estimation +is a crucial step aiming to estimate the color of the global illuminant in the +scene. This estimation is used in camera ISP white-balance module to remove +undesirable color cast in the final image. Despite the multiple frames captured +in the HDR pipeline, conventional illuminant estimation methods often rely only +on a single frame of the scene. In this paper, we explore leveraging +information from frames captured with different exposure times. Specifically, +we introduce a simple feature extracted from dual-exposure images to guide +illuminant estimators, referred to as the dual-exposure feature (DEF). To +validate the efficiency of DEF, we employed two illuminant estimators using the +proposed DEF: 1) a multilayer perceptron network (MLP), referred to as +exposure-based MLP (EMLP), and 2) a modified version of the convolutional color +constancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC +achieve promising results, in some cases surpassing prior methods that require +hundreds of thousands or millions of parameters, with only a few hundred +parameters for EMLP and a few thousand parameters for ECCC. + +
+
+
+
+
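The pipeline in the abstract above (a compact dual-exposure feature feeding a very small illuminant estimator) can be sketched as follows. The feature definition below, normalized channel means of the two exposures plus their ratio, is an assumed stand-in for the paper's DEF, and the MLP is sized only to match the "few hundred parameters" scale the abstract mentions.

import numpy as np
import torch
import torch.nn as nn

def dual_exposure_feature(long_img, short_img, eps=1e-6):
    # long_img, short_img: (H, W, 3) linear-RGB frames of the same scene at two exposures
    mean_l = long_img.reshape(-1, 3).mean(axis=0)
    mean_s = short_img.reshape(-1, 3).mean(axis=0)
    return np.concatenate([mean_l / (mean_l.sum() + eps),
                           mean_s / (mean_s.sum() + eps),
                           mean_l / (mean_s + eps)])      # 9-dim descriptor

class TinyIlluminantMLP(nn.Module):                       # ~200 parameters, in the spirit of EMLP
    def __init__(self, in_dim=9, hidden=16):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, 3))

    def forward(self, f):                                 # f: (B, 9) dual-exposure features
        out = self.net(f)
        return out / out.norm(dim=-1, keepdim=True)       # unit-norm illuminant RGB estimate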
+ + ♻ ☆ UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale + Transformer IJCNN 2024 + + +
+ Underwater images often exhibit poor quality, distorted color balance, and low +contrast due to the complex and intricate interplay of light, water, and +objects. Despite the significant contributions of previous underwater +enhancement techniques, several problems remain that demand further +improvement: (i) Current deep learning methods rely on Convolutional Neural +Networks (CNNs) that lack multi-scale enhancement, and their global perception +field is also limited. (ii) The scarcity of paired real-world underwater +datasets poses a significant challenge, and the utilization of synthetic image +pairs could lead to overfitting. To address the aforementioned problems, this +paper introduces a Multi-scale Transformer-based Network called UWFormer for +enhancing images at multiple frequencies via semi-supervised learning, in which +we propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale +Fusion Feed-forward Network for low-frequency enhancement. Besides, we +introduce a special underwater semi-supervised training strategy, in which we +propose a Subaqueous Perceptual Loss function to generate reliable pseudo +labels. Experiments using full-reference and non-reference underwater +benchmarks demonstrate that our method outperforms state-of-the-art methods in +terms of both quantitative metrics and visual quality.
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
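The semi-supervised part of the UWFormer abstract above follows a teacher-student pattern: a slowly updated teacher produces pseudo labels for unpaired real underwater images, and the student is trained against them. The sketch below is a generic mean-teacher recipe, not the paper's exact strategy; the Subaqueous Perceptual Loss is stood in for by a caller-supplied perceptual_loss.

import torch

@torch.no_grad()
def ema_update(teacher, student, momentum=0.999):
    # exponential moving average of student weights into the teacher (assumes identical architectures)
    for t, s in zip(teacher.parameters(), student.parameters()):
        t.mul_(momentum).add_(s, alpha=1.0 - momentum)

def unlabeled_loss(student, teacher, raw_img, perceptual_loss):
    with torch.no_grad():
        pseudo = teacher(raw_img)              # pseudo label from the EMA teacher
    pred = student(raw_img)
    return perceptual_loss(pred, pseudo)       # stand-in for the paper's Subaqueous Perceptual Loss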
+
+
+ + + +
+
+ +
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all paper entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`